def get_docbin(file_key=None, user_key=None, language='??'):
    """Return a ``(file_key, docbin)`` pair, loading or creating the DocBin.

    Args:
        file_key: Key identifying an existing stored DocBin. When truthy,
            the DocBin is read from the path that ``path_from_file_key``
            derives from it.
        user_key: Passed to ``make_docbin`` when no ``file_key`` is given.
        language: Passed to ``make_docbin`` as the ``language`` keyword when
            no ``file_key`` is given.

    Returns:
        A tuple ``(file_key, docbin)``. ``file_key`` is the one supplied by
        the caller, or the one returned by ``make_docbin``.
    """
    if not file_key:
        # No existing file requested: delegate to make_docbin, which
        # presumably creates a fresh DocBin for the user -- confirm there.
        new_key, new_docbin = make_docbin(user_key, language=language)
        return new_key, new_docbin

    # Existing file: load it with user data enabled.
    loaded = DocBin(store_user_data=True)
    loaded.from_disk(path_from_file_key(file_key))
    return file_key, loaded
def get_principal_docbins(user_key=None, project_key=None):
    """Load every stored DocBin matching a user or a project.

    A file-key pattern is built for the principal with
    ``file_key_from_principal_key``, expanded with ``glob`` under
    ``settings.TEMP_ROOT`` (``.spacy`` suffix), and each matching file is
    loaded into a ``DocBin``.

    Args:
        user_key: Key of the user principal. Takes precedence over
            ``project_key`` when both are truthy.
        project_key: Key of the project principal; used only when
            ``user_key`` is falsy.

    Returns:
        A list of ``[file_key, docbin, time_stamp]`` lists, one per matching
        file, in the order ``glob.glob`` yields them. ``time_stamp`` is the
        file's modification time formatted by ``time.ctime``.

    Raises:
        ValueError: If neither ``user_key`` nor ``project_key`` is truthy.
    """
    if user_key:
        file_key_pattern = file_key_from_principal_key(
            principal_key=user_key, principal_type='u')
    elif project_key:
        file_key_pattern = file_key_from_principal_key(
            principal_key=project_key, principal_type='p')
    else:
        # Previously this case fell through with file_key_pattern unbound and
        # failed below with an opaque UnboundLocalError; fail early instead.
        raise ValueError(
            'get_principal_docbins requires a user_key or a project_key')

    path_pattern = os.path.join(settings.TEMP_ROOT, file_key_pattern) + '.spacy'
    principal_docbins = []
    for path in glob.glob(path_pattern):
        file_key = file_key_from_path(path)
        # Human-readable modification time of the stored file.
        time_stamp = time.ctime(os.path.getmtime(path))
        docbin = DocBin(store_user_data=True)
        docbin.from_disk(path)
        principal_docbins.append([file_key, docbin, time_stamp])
    return principal_docbins
def test_to_spacy_file_and_back(small_dataset):
    """Round-trip a dataset through a ``.spacy`` file and compare offsets.

    The dataset is written with ``InputSample.create_spacy_dataset``, read
    back through ``DocBin``, and the entities of each restored doc are
    compared pairwise (after sorting by start offset) against the spans of
    the corresponding input sample.
    """

    def span_start(span):
        return span.start_position

    def ent_start(ent):
        return ent.start_char

    nlp = spacy.load("en_core_web_sm")
    spacy_path = "dataset.spacy"

    InputSample.create_spacy_dataset(
        small_dataset,
        output_path=spacy_path,
        translate_tags=False,
        spacy_pipeline=nlp,
        alignment_mode="strict",
    )

    doc_bin = DocBin()
    doc_bin.from_disk(spacy_path)
    restored_docs = doc_bin.get_docs(vocab=nlp.vocab)

    for restored_doc, sample in zip(restored_docs, small_dataset):
        expected_spans = sorted(sample.spans, key=span_start)
        restored_ents = sorted(restored_doc.ents, key=ent_start)
        # zip stops at the shorter sequence, so only paired items are checked.
        for ent, span in zip(restored_ents, expected_spans):
            assert ent.start_char == span.start_position
            assert ent.end_char == span.end_position