Ejemplo n.º 1
0
def get_text_ids() -> List[TextId]:
    """Return the id of every text the corpus lists.

    A new application context is created on each call and a ``Corpus`` is
    assembled from that context's repositories before listing.
    """
    ctx = create_context()
    corpus_dependencies = (
        ctx.text_repository,
        ctx.get_bibliography(),
        ctx.changelog,
        ctx.sign_repository,
        ctx.parallel_line_injector,
    )
    corpus = Corpus(*corpus_dependencies)

    text_ids = []
    for text in corpus.list():
        text_ids.append(text.id)
    return text_ids
Ejemplo n.º 2
0
def update(number) -> State:
    """Update the text found under ``number`` and report the outcome.

    Builds a ``Corpus`` from a newly created context, looks the text up and
    runs ``update_text`` on it. The returned ``State`` records either one
    successful update or the raised error together with the text.
    """
    ctx = create_context()
    corpus_dependencies = (
        ctx.text_repository,
        ctx.get_bibliography(),
        ctx.changelog,
        ctx.sign_repository,
        ctx.parallel_line_injector,
    )
    corpus = Corpus(*corpus_dependencies)
    outcome = State()
    # The lookup sits outside the try block, so a failing lookup propagates
    # to the caller instead of being recorded in the state.
    text = corpus.find(number)

    try:
        update_text(corpus, text)
        outcome.add_updated()
    except Exception as failure:
        outcome.add_error(failure, text)

    return outcome
def align_fragment(
    number: MuseumNumber,
    chapters: Iterable[Tuple[Text, Chapter]],
    max_lines: int,
    min_score: int,
) -> List[dict]:
    """Align one fragment against the given chapters.

    Args:
        number: Museum number used to look the fragment up.
        chapters: ``(text, chapter)`` pairs to align the fragment with.
        max_lines: Fragments whose text has more lines than this are not
            aligned at all.
        min_score: Alignment results scoring below this are discarded.

    Returns:
        One dict (built by ``to_dict``) per retained alignment result; an
        empty list when the fragment exceeds ``max_lines``.
    """
    # Raised before any alignment work; presumably the alignment recurses
    # deeply - TODO confirm.
    sys.setrecursionlimit(50000)
    repository = create_context().fragment_repository
    fragment = repository.query_by_museum_number(number)

    if fragment.text.number_of_lines > max_lines:
        return []

    alignments = []
    for text, chapter in chapters:
        for result in align_fragment_and_chapter(fragment, chapter):
            if result.score >= min_score:
                alignments.append(to_dict(fragment, text, chapter, result))
    return alignments
    parser.add_argument(
        "-t",
        "--threads",
        action="store_true",
        help="Use threads instead of processes.",
    )

    return parser.parse_args()


if __name__ == "__main__":
    args = parse_arguments()
    start = args.skip
    end = args.skip + args.limit

    context = create_context()
    fragments = context.fragment_repository

    t0 = time.time()

    fragment_numbers = fragments.query_transliterated_numbers()[start:end]
    chapters = load_chapters(context)

    Executor = ThreadPoolExecutor if args.threads else ProcessPoolExecutor

    with Executor(max_workers=args.workers) as executor, open(
        args.output, "w", encoding="utf-8"
    ) as file:
        results = tqdm(
            executor.map(
                partial(
    def insert_into_db(self, ebl_lines, filename):
        """Import the transliteration and lemmatization of one converted file.

        The museum number is resolved in three steps: lookup by CDLI number,
        then parsing it from the file's control lines, and finally prompting
        the user until a valid number or ``skip`` is entered.

        Outcomes are appended to the ``success`` / ``failed`` collections,
        which are defined outside this method, and reported through
        ``self.logger``. Import errors are recorded, not raised.

        Args:
            ebl_lines: Mapping with the keys ``"control_lines"``,
                ``"transliteration"`` and ``"lemmatization"``.
            filename: Name of the source file, used in log and result messages.
        """
        context = create_context()
        transliteration_factory = context.get_transliteration_update_factory()
        updater = context.get_fragment_updater()

        cdli_number = self.get_cdli_number(ebl_lines["control_lines"])
        museum_number = self.get_museum_number_by_cdli_number(cdli_number)

        if museum_number is None:
            self.logger.warning(
                "No museum number to cdli number'"
                + cdli_number
                + "' found. Trying to parse from original file..."
            )
            try:
                # Keep the stripped value: it is the one that was validated,
                # so the same string is what gets passed on to the importer.
                candidate = self.get_museum_number(
                    ebl_lines["control_lines"]
                ).strip()
                parse_museum_number(candidate)
                museum_number = candidate
            except Exception:
                self.logger.error(
                    "Could not find valid museum number in '" + filename + "'"
                )

        skip = False
        while museum_number is None:
            # Surrounding whitespace is stripped so that " skip" or a trailing
            # space after the number is handled like the file-derived value.
            museum_number_input = input(
                "Please enter a valid museum number (enter 'skip' to skip this file): "
            ).strip()
            if museum_number_input == "skip":
                skip = True
                break
            try:
                parse_museum_number(museum_number_input)
            except Exception as error:
                # The exception type raised by the parser is not visible here,
                # so the broad catch stays; the user is told why the prompt
                # repeats instead of being re-asked silently.
                self.logger.warning(
                    "'"
                    + museum_number_input
                    + "' is not a valid museum number: "
                    + str(error)
                )
                continue
            museum_number = museum_number_input
            self.logger.info("Museum number '" + museum_number + "' is valid!")

        if skip:
            failed.append(filename + " could not be imported: Museum number not found")
            self.logger.error("Museum number not found")
            self.logger.info(
                Util.print_frame('Conversion of "' + filename + '.atf" failed')
            )
            return

        try:
            # insert transliteration
            self.insert_translitertions(
                transliteration_factory,
                updater,
                ebl_lines["transliteration"],
                museum_number,
            )
            # insert lemmatization
            self.insert_lemmatization(
                updater, ebl_lines["lemmatization"], museum_number
            )

            success.append(filename + " successfully imported")
            self.logger.info(
                Util.print_frame(
                    'Conversion of "'
                    + filename
                    + '.atf" finished (museum number "'
                    + museum_number
                    + '")'
                )
            )

        except Exception as e:
            # Best-effort import: one failing file is recorded and reported,
            # and the exception is not re-raised.
            self.logger.error(filename + " could not be imported: " + str(e))
            failed.append(filename + " could not be imported: " + str(e))
def create_context_() -> Context:
    """Create a context whose sign repository is wrapped for memoization.

    The context from ``create_context`` is copied with ``attr.evolve``; only
    ``sign_repository`` is replaced, by a ``MemoizingSignRepository`` around
    the original repository.
    """
    base_context = create_context()
    memoizing_signs = MemoizingSignRepository(base_context.sign_repository)
    return attr.evolve(base_context, sign_repository=memoizing_signs)