コード例 #1
0
def get_docbin(file_key=None, user_key=None, language='??'):
    """Return a ``(file_key, docbin)`` pair, loading or creating the DocBin.

    When ``file_key`` is truthy, a ``DocBin`` is read from the path that
    ``path_from_file_key`` yields for it. Otherwise ``make_docbin`` is asked
    for a new pair, given ``user_key`` and ``language``.
    """
    if not file_key:
        # Nothing to load: delegate creation, which also supplies the key.
        file_key, docbin = make_docbin(user_key, language=language)
        return file_key, docbin

    docbin = DocBin(store_user_data=True)
    docbin.from_disk(path_from_file_key(file_key))
    return file_key, docbin
コード例 #2
0
def get_principal_docbins(user_key=None, project_key=None):
    """Load every DocBin file matching a user or a project.

    Exactly one principal is used: ``user_key`` takes precedence when it is
    truthy, otherwise ``project_key`` is used.

    Args:
        user_key: key of the user principal (principal type ``'u'``).
        project_key: key of the project principal (principal type ``'p'``).

    Returns:
        A list of ``[file_key, docbin, time_stamp]`` lists, one per file
        matched under ``settings.TEMP_ROOT``; ``time_stamp`` is the file's
        modification time formatted by ``time.ctime``.

    Raises:
        ValueError: if neither ``user_key`` nor ``project_key`` is truthy.
    """
    if user_key:
        principal_key, principal_type = user_key, 'u'
    elif project_key:
        principal_key, principal_type = project_key, 'p'
    else:
        # Without this guard the pattern variable would be unbound below and
        # the caller would get an opaque UnboundLocalError.
        raise ValueError(
            'get_principal_docbins requires a user_key or a project_key')

    file_key_pattern = file_key_from_principal_key(
        principal_key=principal_key, principal_type=principal_type)
    # The pattern is handed to glob, so any wildcards in it are expanded.
    path_pattern = os.path.join(settings.TEMP_ROOT,
                                file_key_pattern) + '.spacy'

    principal_docbins = []
    for path in glob.glob(path_pattern):
        file_key = file_key_from_path(path)
        time_stamp = time.ctime(os.path.getmtime(path))
        docbin = DocBin(store_user_data=True)
        docbin.from_disk(path)
        principal_docbins.append([file_key, docbin, time_stamp])
    return principal_docbins
コード例 #3
0
def test_to_spacy_file_and_back(small_dataset):
    """Round-trip a dataset through a ``.spacy`` file and compare offsets.

    The dataset is serialized with ``InputSample.create_spacy_dataset``,
    read back through ``DocBin``, and each recovered entity's character
    offsets are compared with the corresponding input span.
    """
    # Function-scope imports keep this block self-contained.
    import os
    import tempfile

    spacy_pipeline = spacy.load("en_core_web_sm")

    # Write into a throwaway directory so the test leaves no artifact in the
    # working directory and cannot collide with a concurrent run.
    with tempfile.TemporaryDirectory() as tmp_dir:
        output_path = os.path.join(tmp_dir, "dataset.spacy")
        InputSample.create_spacy_dataset(
            small_dataset,
            output_path=output_path,
            translate_tags=False,
            spacy_pipeline=spacy_pipeline,
            alignment_mode="strict",
        )

        db = DocBin()
        db.from_disk(output_path)
        docs = list(db.get_docs(vocab=spacy_pipeline.vocab))

    # NOTE: zip stops at the shorter sequence, so a difference in the number
    # of docs or entities is not detected by this test.
    for doc, input_sample in zip(docs, small_dataset):
        input_ents = sorted(input_sample.spans, key=lambda x: x.start_position)
        spacy_ents = sorted(doc.ents, key=lambda x: x.start_char)
        for spacy_ent, input_span in zip(spacy_ents, input_ents):
            assert spacy_ent.start_char == input_span.start_position
            assert spacy_ent.end_char == input_span.end_position