Code example #1: a Django data migration that deletes obsolete derived image files from every page of every available book.
import os

from tqdm import tqdm

# DatabaseBook is provided by the surrounding project (import path assumed).


def remove_invalid_files(apps, schema_editor):
    # Derived files that the current storage layout no longer uses.
    # The list is constant, so it is built once outside the loops.
    obsolete_files = [
        'annotation.json',
        'binary_cropped.png',
        'binary_cropped_preview.jpg',
        'binary_deskewed.png',
        'binary_deskewed_preview.jpg',
        'binary_original.png',
        'binary_original_preview.jpg',
        'color_cropped.jpg',
        'color_cropped_preview.jpg',
        'color_deskewed.jpg',
        'color_deskewed_preview.jpg',
        'gray_cropped.jpg',
        'gray_cropped_preview.jpg',
        'gray_deskewed.jpg',
        'gray_deskewed_preview.jpg',
        'gray_original.jpg',
        'gray_original_preview.jpg',
        'connected_components_deskewed.pkl',
    ]
    books = DatabaseBook.list_available()
    for book in tqdm(books, desc="Removing old files"):
        for page in book.pages():
            for name in obsolete_files:
                path = page.local_file_path(name)
                if os.path.exists(path):
                    os.remove(path)
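The (apps, schema_editor) signature marks this function, and the similar ones below, as a Django data-migration hook. As a hedged sketch, it would be registered in a migration module roughly like this; the app label and dependency are hypothetical:

from django.db import migrations


class Migration(migrations.Migration):
    # Hypothetical app label and predecessor migration.
    dependencies = [('main', '0001_initial')]

    operations = [
        # Run the cleanup on upgrade; do nothing on downgrade.
        migrations.RunPython(remove_invalid_files, migrations.RunPython.noop),
    ]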
Code example #2: collects the PcGts files of pages with the given lock states and splits them into training and validation sets.
import logging
import random
from typing import List, Optional, Tuple

# PcGts, DatabaseBook, LockState and EmptyDataSetException are provided by
# the surrounding project (import paths assumed).

logger = logging.getLogger(__name__)


def dataset_by_locked_pages(
        n_train: float,
        locks: List[LockState],
        shuffle: bool = True,
        datasets: Optional[List[DatabaseBook]] = None,
) -> Tuple[List[PcGts], List[PcGts]]:
    logger.info("Finding PcGts files with valid ground truth")
    pcgts = []
    # An empty or missing dataset list falls back to all available books.
    for dataset in (datasets if datasets else DatabaseBook.list_available()):
        logger.debug("Listing files of dataset '{}'".format(dataset.book))
        if not dataset.exists():
            raise ValueError("Dataset '{}' does not exist at '{}'".format(
                dataset.book, dataset.local_path()))

        for page in dataset.pages_with_lock(locks):
            pcgts.append(PcGts.from_file(page.file('pcgts')))

    if len(pcgts) == 0:
        raise EmptyDataSetException()

    if shuffle:
        random.shuffle(pcgts)

    # n_train is the fraction of files used for training; the remainder
    # becomes the validation split.
    train_pcgts = pcgts[:int(len(pcgts) * n_train)]
    val_pcgts = pcgts[len(train_pcgts):]

    # A fractional split must leave both partitions non-empty.
    if 0 < n_train < 1 and (len(train_pcgts) == 0 or len(val_pcgts) == 0):
        raise EmptyDataSetException()

    return train_pcgts, val_pcgts
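A minimal usage sketch: an 80/20 split over the locked pages of a single book. The book id and the LockState constructor arguments are hypothetical:

# Hypothetical call; LockState's constructor arguments are assumed.
train_pcgts, val_pcgts = dataset_by_locked_pages(
    n_train=0.8,
    locks=[LockState('StaffLines', True)],
    shuffle=True,
    datasets=[DatabaseBook('demo_book')],
)
print(len(train_pcgts), len(val_pcgts))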
Code example #3: a data migration that lifts word-level syllables up to their text line and flattens syllable/neume connectors.
import json
import logging

# DatabaseBook is provided by the surrounding project (import path assumed).

logger = logging.getLogger(__name__)


def remove_word_and_neume_connector_layer(apps, schema_editor):
    books = DatabaseBook.list_available()
    for book in books:
        for page in book.pages():
            pcgts_file = page.file('pcgts')
            try:
                if not pcgts_file.exists():
                    continue

                with open(pcgts_file.local_path(), 'r') as f:
                    pcgts = json.load(f)

                # Use a separate name so the DatabasePage loop variable
                # 'page' is not shadowed.
                page_dict = pcgts['page']
                if not page_dict:
                    continue

                # Lift the syllables of every word up to the text line.
                text_regions = page_dict.get('textRegions', [])
                for text_region in text_regions:
                    text_lines = text_region.get('textLines', [])
                    for text_line in text_lines:
                        words = text_line.get('words', [])
                        text_line['syllables'] = text_line.get('syllables', [])
                        if not words:
                            continue

                        for word in words:
                            text_line['syllables'] += word.get('syllables', [])

                # Rename refID -> syllableID and flatten the single neume
                # connector of each syllable connector into neumeID.
                annotations = page_dict.get('annotations', {})
                for connection in annotations.get('connections', []):
                    for syllable_connector in connection.get(
                            'syllableConnectors', []):
                        if 'refID' in syllable_connector:
                            syllable_connector[
                                'syllableID'] = syllable_connector['refID']

                        neume_connectors = syllable_connector.get(
                            'neumeConnectors', [])

                        if len(neume_connectors) == 0:
                            continue
                        elif len(neume_connectors) == 1:
                            syllable_connector['neumeID'] = neume_connectors[
                                0]['refID']
                        else:
                            raise ValueError(
                                "Cannot convert {}. Syllable connector has {} "
                                "neume connectors. You need to manually "
                                "convert this file.".format(
                                    pcgts_file.local_path(),
                                    len(neume_connectors)))

                with open(pcgts_file.local_path(), 'w') as f:
                    json.dump(pcgts, f)
            except Exception:
                logger.error(
                    "Exception occurred during processing of page {}".format(
                        pcgts_file.local_path()))
                raise
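To make the connector rewrite concrete, a hypothetical before/after of a single syllable connector (all IDs invented for illustration; note that the migration only adds the new keys and leaves the old ones in place):

Before:  {"refID": "syl_1", "neumeConnectors": [{"refID": "n_5"}]}
After:   {"refID": "syl_1", "syllableID": "syl_1", "neumeID": "n_5",
          "neumeConnectors": [{"refID": "n_5"}]}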
Code example #4: a thin wrapper that delegates to dataset_by_locked_pages, optionally over all available books.
    def to_train_val(
            self,
            locks: List[LockState],
            shuffle: bool = True,
            books: Optional[List[DatabaseBook]] = None,
    ) -> Tuple[List[PcGts], List[PcGts]]:
        # With includeAllTrainingData set, ignore the given books and use
        # every available one instead.
        if self.includeAllTrainingData:
            books = DatabaseBook.list_available()

        return dataset_by_locked_pages(self.nTrain, locks, shuffle, books)
Code example #5: a data migration that converts the coordinates in PcGts files to page-relative values.
import json

from PIL import Image
from tqdm import tqdm

# DatabaseBook and to_relative_coords are provided by the surrounding
# project (import paths assumed).


def pcgts_to_relative_coords(apps, schema_editor):
    books = DatabaseBook.list_available()
    for book in tqdm(books, desc="Converting to relative coords"):
        for page in book.pages():
            pcgts_file = page.file('pcgts')
            # Check for the PcGts file first, so the page image is only
            # opened for pages that actually have one.
            if not pcgts_file.exists():
                continue

            size = Image.open(page.file('color_original').local_path()).size

            with open(pcgts_file.local_path()) as f:
                j = json.load(f)
            # to_relative_coords presumably reports whether the file already
            # used relative coordinates; rewrite only when it changed.
            was_local = to_relative_coords(j, size)
            if not was_local:
                with open(pcgts_file.local_path(), 'w') as f:
                    json.dump(j, f)
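to_relative_coords itself is project code and not shown here. A minimal sketch of the kind of normalisation its call site implies, assuming coordinates are divided by the page dimensions (the helper name and convention are invented):

def point_to_relative(point, size):
    # Normalise an absolute pixel coordinate by the page dimensions,
    # yielding resolution-independent values in [0, 1].
    w, h = size
    x, y = point
    return x / w, y / h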
Code example #6: a data migration that upgrades PcGts files to a newer schema version.
import json

from tqdm import tqdm

# DatabaseBook and update_pcgts are provided by the surrounding project
# (import paths assumed).


def pcgts_update_version(apps, schema_editor):
    books = DatabaseBook.list_available()
    version = 1
    for book in tqdm(books,
                     desc="Converting to pcgts version {}".format(version)):
        for page in book.pages():
            pcgts_file = page.file('pcgts')
            if not pcgts_file.exists():
                continue

            with open(pcgts_file.local_path()) as f:
                j = json.load(f)

            upgraded = update_pcgts(j, target_version=version)

            # Only write the file back if the upgrade changed anything.
            if upgraded:
                with open(pcgts_file.local_path(), 'w') as f:
                    json.dump(j, f, indent=2)
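update_pcgts is likewise project code; a hedged sketch of the version-gate pattern its call site implies (the actual upgrade steps are omitted, and everything here is an assumption):

def update_pcgts_sketch(j: dict, target_version: int) -> bool:
    # Skip files that are already at or beyond the target version.
    current = j.get('version', 0)
    if current >= target_version:
        return False
    # ... per-version upgrade steps would run here ...
    j['version'] = target_version
    return True  # tells the caller to rewrite the file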
Code example #7: a data migration that rewrites dataset_params.json for both per-book and default models.
import os

# BASE_DIR, DatabaseBook and fix_file are provided by the surrounding
# project (import paths assumed).


def fix_dataset_params(apps, schema_editor):
    # Per-book models
    for book in DatabaseBook.list_available():
        if not os.path.exists(book.local_models_path()):
            continue

        for alg in os.listdir(book.local_models_path()):
            # local_models_path(alg) already returns a full path; the
            # original single-argument os.path.join around it was a no-op.
            alg_dir = book.local_models_path(alg)
            for model in os.listdir(alg_dir):
                path = os.path.join(alg_dir, model, 'dataset_params.json')
                fix_file(path)

    # Default models
    default_models = os.path.join(BASE_DIR, 'internal_storage',
                                  'default_models')
    if os.path.exists(default_models):
        for t in os.listdir(default_models):
            t_dir = os.path.join(default_models, t)
            for alg in os.listdir(t_dir):
                path = os.path.join(t_dir, alg, 'dataset_params.json')
                fix_file(path)
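Schematically, the two layouts being scanned are as follows (the exact roots depend on local_models_path and BASE_DIR):

<book models root>/<algorithm>/<model>/dataset_params.json
<BASE_DIR>/internal_storage/default_models/<type>/<algorithm>/dataset_params.json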
Code example #8: a script that extracts a hyphenation dictionary from the text lines of the given books.
from tqdm import tqdm

# PcGts, DatabaseBook, normalize and the argparse result 'args' (with
# .books and .output) come from the surrounding script (assumed).

# Maps each de-hyphenated word to its hyphenated form.
dictionary = {}


def extract_from_pcgts(pcgts: PcGts):
    for line in pcgts.page.all_text_lines():
        hyphenated = line.sentence.text()
        for word in hyphenated.split():
            word = normalize(word)
            dictionary[word.replace("-", "")] = word


def extract_from_book(book: DatabaseBook):
    for page in tqdm(book.pages(), desc="Processing {}".format(book.book)):
        extract_from_pcgts(page.pcgts())


if args.books is None:
    books = DatabaseBook.list_available()
else:
    books = [DatabaseBook(b) for b in args.books]

print("Processing {} books".format(len(books)))

for book in books:
    print("Processing book {}".format(book.book))
    extract_from_book(book)

print("Extracted {} words".format(len(dictionary)))

with open(args.output, 'w') as f:
    for word, hyphen in dictionary.items():
        f.write("{:20s} {}\n".format(word, hyphen))
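The resulting file is a two-column plain-text lexicon: the de-hyphenated word left-aligned in a 20-character column, followed by its hyphenated form. With hypothetical entries it would look like:

benedictus           be-ne-dic-tus
dominus              do-mi-nus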