def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=4):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            doc_bin.add(doc)
    msg.good(f"Processed {len(doc_bin)} docs")
    doc_bin_bytes = doc_bin.to_bytes()
    output_file = output_path / f"{input_path.stem}.spacy"
    with output_file.open("wb") as f:
        f.write(doc_bin_bytes)
    msg.good("Saved parsed docs to file", output_file.resolve())
def write_docs_to_bin(docs: List[Doc], fname: str) -> None:
    # `attrs` is expected to be defined at module level.
    doc_bin = DocBin(attrs=attrs)
    for doc in docs:
        doc_bin.add(doc)
    with open(fname, 'wb') as fh:
        doc_bin_bytes = doc_bin.to_bytes()
        fh.write(doc_bin_bytes)
def load_or_create_spacy_doc(sents, do_preprocess, use_cache, verbose):
    """
    @sents list of strings to be tokenized.
    @use_cache if true, try to load from disk first. Otherwise, tokenize.
    @return DocBin object
    """
    if do_preprocess:
        sents = preprocess(sents)
    fname = SPACY_DOC_PATH + str(do_preprocess) + hash_sents(sents) + ".bin"
    if os.path.exists(fname) and use_cache:
        now = time.time()
        if verbose:
            print("Loading tokenized document from disk...")
        with open(fname, "rb") as f:
            doc_bin = DocBin(attrs=["POS", "ENT_TYPE"]).from_bytes(f.read())
        if verbose:
            print("Finished loading tokenized document in {:.2f}s!".format(time.time() - now))
        return doc_bin
    else:
        now = time.time()
        if verbose:
            print("Start tokenizing document...")
        doc_bin = DocBin(attrs=["POS", "ENT_TYPE"])
        for doc in nlp.pipe(sents, disable=["parser"]):
            doc_bin.add(doc)
        with open(fname, "wb") as f:
            f.write(doc_bin.to_bytes())
        if verbose:
            print("Finished tokenizing document and saved to disk in {:.2f}s!".format(time.time() - now))
        return doc_bin
def convert_file(
    input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
    output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    header = True
    with open(input_path, "r") as in_f:
        for line in tqdm(in_f):
            if header:
                header = False
                continue
            sentence, tokens = pd.read_csv(StringIO(line), header=None, usecols=[0, 1]).values[0]
            # The tokens column holds a Python literal list; eval() parses it
            # (ast.literal_eval would be a safer choice).
            tokens = eval(tokens)
            eg = line_to_dict(sentence, tokens)
            if eg["answer"] != "accept":
                continue
            tokens = [token["text"] for token in eg["tokens"]]
            words, spaces = get_words_and_spaces(tokens, eg["text"])
            doc = Doc(nlp.vocab, words=words, spaces=spaces)
            doc.ents = [
                doc.char_span(s["start"], s["end"], label=s["label"])
                for s in eg.get("spans", [])
            ]
            doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path}")
def build_vocab(water_bodies: Dict, nlp: Language):
    """Load new vocab and wikidata.

    Parameters
    ----------
    water_bodies : Dict
        Dictionary containing the list of new water bodies to be loaded.
        Format:
            {
                "LAKE": [(Name, Wiki_Id), ...],
                "RIVER": [(Name, Wiki_Id), ...],
                ...
            }
    nlp : Language
        spaCy nlp object
    """
    vocab = {}
    wikidata = {}
    doc_bins_bytes = {}
    stop_words = set(srsly.read_json(stop_words_file)['stop_words'])
    for key in water_bodies:
        doc_bin = DocBin()
        for wb, _ in tqdm(water_bodies[key], desc=f'Loading {key}(s)'):
            doc_bin.add(nlp(wb))
        doc_bins_bytes[key] = doc_bin.to_bytes()
        if key not in wikidata:
            wikidata[key] = {}
        for name, id in water_bodies[key]:
            wikidata[key][name.lower()] = id
        vocab[str(nlp.vocab.strings[key])] = key
    write_data_files(vocab, wikidata, stop_words, doc_bins_bytes)
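The per-label DocBin bytes written out above are presumably re-read somewhere else in the pipeline. The following is a minimal sketch of one plausible consumer, not part of the original code: each DocBin is restored and its Docs are registered as PhraseMatcher patterns under the corresponding label. The helper name build_matcher is hypothetical.

from spacy.matcher import PhraseMatcher
from spacy.tokens import DocBin

def build_matcher(nlp, doc_bins_bytes):
    # Hypothetical consumer of the bytes produced by build_vocab(): restore each
    # per-label DocBin and use its Docs as PhraseMatcher patterns for that label.
    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    for label, doc_bin_bytes in doc_bins_bytes.items():
        patterns = list(DocBin().from_bytes(doc_bin_bytes).get_docs(nlp.vocab))
        matcher.add(label, patterns)
    return matcher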
def convert(json_path, output):
    db = DocBin()
    for line in srsly.read_jsonl(json_path):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output)
def convert(lang: str, input_path: Path, output_path: Path):
    nlp = spacy.blank(lang)
    db = DocBin()
    for line in srsly.read_jsonl(input_path):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output_path)
def convert(output_path):
    global nlp
    db = DocBin()
    for line in srsly.read_jsonl("db.json"):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output_path)
def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Path to input file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    spacy_model: str = typer.Argument("en_core_web_sm", help="Name of spaCy model to use"),
    n_process: int = typer.Option(1, "--n-process", "-n", help="Number of processes (multiprocessing)"),
    max_docs: int = typer.Option(10**6, "--max-docs", "-m", help="Maximum docs per batch"),
    # fmt: on
):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    count = 0
    batch_num = 0
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            if count < max_docs:
                doc_bin.add(doc)
                count += 1
            else:
                # Current batch is full: write it out and start a new DocBin.
                batch_num += 1
                msg.good(f"Processed {len(doc_bin)} docs")
                doc_bin_bytes = doc_bin.to_bytes()
                output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
                with output_file.open("wb") as f:
                    f.write(doc_bin_bytes)
                msg.good("Saved parsed docs to file", output_file.resolve())
                doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
                # Keep the doc that triggered the rollover instead of dropping it.
                doc_bin.add(doc)
                count = 1
    batch_num += 1
    output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
    with output_file.open("wb") as f:
        doc_bin_bytes = doc_bin.to_bytes()
        f.write(doc_bin_bytes)
    msg.good("Complete. Saved final parsed docs to file", output_file.resolve())
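Both parsing scripts above write DocBin files that are meant to be read back in a later step. The following is a minimal round-trip sketch, assuming spaCy v3; load_parsed_docs and the glob pattern are illustrative names, not part of the original scripts.

from pathlib import Path

import spacy
from spacy.tokens import DocBin

def load_parsed_docs(out_dir, spacy_model="en_core_web_sm"):
    # Illustrative loader for the <stem>-<batch>.spacy files written by main():
    # restore each DocBin and yield Docs with the stored attributes.
    nlp = spacy.load(spacy_model)
    for path in sorted(Path(out_dir).glob("*.spacy")):
        doc_bin = DocBin().from_disk(path)
        yield from doc_bin.get_docs(nlp.vocab)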
def generate_corpus(nlp):
    directory_path = path.join('data')
    corpus_path = Path(path.join(directory_path, file_name) + ".spacy")
    raw_path = Path(path.join(directory_path, file_name) + ".jsonl")
    if exists(corpus_path):
        return Corpus(corpus_path)(nlp)
    vulnerabilities = []
    with open(raw_path) as file:
        for line in file.readlines():
            vulnerability = loads(line)
            vulnerabilities.append({
                'description': vulnerability['data'],
                'entities': vulnerability.get('label', []),
            })
    corpus = DocBin(attrs=["TAG", "ENT_IOB", "ENT_TYPE", "POS"])
    for vulnerability in vulnerabilities:
        document = nlp.make_doc(vulnerability['description'].lower())
        # doccano annotates labels at the character level, but nlp.make_doc produces
        # tokens, so convert the character offsets into token-aligned entity spans.
        tags = offsets_to_biluo_tags(document, vulnerability['entities'])
        entities = biluo_tags_to_spans(document, tags)
        document.set_ents(entities)
        corpus.add(document)
    corpus.to_disk(corpus_path)
    if exists(corpus_path):
        return Corpus(corpus_path)(nlp)
def bin_inscriptions(corpus):
    """Put the texts into the DocBin."""
    doc_bin = DocBin(attrs=["LEMMA", "TAG", "POS", "DEP", "HEAD"], store_user_data=True)
    for c in corpus:
        doc = nlp(c)
        doc_bin.add(doc)
    with open('dbg.bin', 'wb') as f:
        f.write(doc_bin.to_bytes())
def convert(lang: str, input_path: Path, output_path: Path):
    nlp = spacy.blank(lang)
    in_db = DocBin().from_disk(input_path)
    out_db = DocBin()
    logging.info(f"Read {len(in_db)} documents from {input_path}.")
    for doc in in_db.get_docs(nlp.vocab):
        new_doc = nlp.make_doc(doc.text)
        new_doc.user_data = doc.user_data
        new_doc.ents = doc.ents
        out_db.add(new_doc)
    out_db.to_disk(output_path)
def json_path_to_examples(data_path, NLP):
    data = srsly.read_json(data_path)
    # No good way to convert with a specified vocab, so convert, then reload
    # through DocBin with the right vocab.
    docs = json_to_docs(data)
    docbin = DocBin()
    for doc in docs:
        docbin.add(doc)
    docs = docbin.get_docs(NLP.vocab)
    examples = [Example(NLP.make_doc(doc.text), doc) for doc in docs]
    return examples
def test_serialize_doc_bin():
    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
    texts = ["Some text", "Lots of texts...", "..."]
    nlp = English()
    for doc in nlp.pipe(texts):
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()
    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(bytes_data)
    list(doc_bin.get_docs(nlp.vocab))
def write_docs(texts, attrs, lang, file, provider='spacy'):
    from tqdm import tqdm
    nlp = get_model(lang, provider=provider)
    doc_bin = DocBin(attrs=[a.upper() for a in attrs], store_user_data=True)
    # The tqdm library just wraps a loop; when you call it around nlp.pipe,
    # the loop you're wrapping yields the individual docs.
    for doc in tqdm(nlp.pipe(texts)):
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()
    write_doc_to(bytes_data, file)
def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value):
    """Test that custom extensions are correctly serialized in DocBin."""
    Doc.set_extension("foo", default="nothing")
    doc = Doc(en_vocab, words=["hello", "world"])
    doc._.foo = "bar"
    doc_bin_1 = DocBin(store_user_data=writer_flag)
    doc_bin_1.add(doc)
    doc_bin_bytes = doc_bin_1.to_bytes()
    doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes)
    doc_2 = list(doc_bin_2.get_docs(en_vocab))[0]
    assert doc_2._.foo == reader_value
    Underscore.doc_extensions = {}
def main(
    input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
    output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin()
    data_tuples = ((eg["text"], eg) for eg in srsly.read_jsonl(input_path))
    for doc, eg in nlp.pipe(data_tuples, as_tuples=True):
        # doc.cats = {category: 0 for category in CATEGORIES}
        doc.cats[eg["label"]] = 1
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
def test_issue4528(en_vocab):
    """Test that user_data is correctly serialized in DocBin."""
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    # This is how extension attribute values are stored in the user data
    doc.user_data[("._.", "foo", None, None)] = "bar"
    doc_bin = DocBin(store_user_data=True)
    doc_bin.add(doc)
    doc_bin_bytes = doc_bin.to_bytes()
    new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
    new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
    assert new_doc.user_data["foo"] == "bar"
    assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
def parse_dataset(self, in_dataset_path, out_dataset_path, save_as_str=False, condition_func=None):
    in_dataset = self.load_dataset(in_dataset_path, out_dataset_path, binary=False)
    if in_dataset is None:
        return None
    start_time = time.time()
    sents = []
    doc_bin = DocBin(
        attrs=["ID", "ORTH", "LEMMA", "TAG", "POS", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE"],
        store_user_data=True,
    )
    for sent in in_dataset:
        doc = self.parse_sentence(sent)
        if doc is None:
            continue
        if condition_func and condition_func(doc):
            continue
        doc_bin.add(doc)
        sents.append(sent)
        self.print_status(len(doc_bin), out_dataset_path, start_time)
        if not self.check_dataset_size(doc_bin):
            break
    print(f"The dataset {basename(out_dataset_path)} contains {len(doc_bin)} parsed sentences")
    os.makedirs(dirname(out_dataset_path), exist_ok=True)
    # Save the resulting dataset as plain text if needed
    if save_as_str:
        with open(out_dataset_path, "w") as target_file:
            target_file.writelines(sents)
    # Save the resulting parsed dataset
    with open(out_dataset_path, "wb") as parsed_file:
        parsed_file.write(doc_bin.to_bytes())
def main():
    keywords_df = pd.read_csv('data/keywords.csv')
    keywords_dic = dict(zip(keywords_df['keyword'], keywords_df['entity']))
    data = pd.read_csv('data/taged_all.csv')
    cd = CleanData()
    data_clean = cd.normalize_text(data.copy())
    data_clean['keywords'] = data_clean['clean_text'].str.split()
    doc_entity_df = doc_entity(data_clean, keywords_dic)
    doc_entity_df = doc_entity_df.merge(
        data_clean[['id', 'target', 'predict']], how='left', left_on='id', right_on='id')
    doc_entity_df.set_index('id', inplace=True)
    doc_entity_df.to_csv('data/doc_entity_df.csv', index=True, header=True)

    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
    texts = [
        "Disaster control teams are studying ways to evacuate the port area in response to tidal wave warnings.[900037]"
    ]
    nlp = spacy.load("en_core_web_md")
    for doc in nlp.pipe(texts):
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()

    # Write and then read back the binary file
    with open('data/sample', "wb") as out_file:
        out_file.write(bytes_data)
    with open('data/sample', "rb") as in_file:
        data = in_file.read()

    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(data)
    docs = list(doc_bin.get_docs(nlp.vocab))

    # ###################################################################################
    data = pd.read_csv('data/taged_all.csv')
    for row in tqdm(data['text'], total=data.shape[0]):
        doc = nlp(row)
        doc.to_disk('data/sample')
        print([(X.text, X.label_) for X in doc.ents])
def convert(input_path, output_path, lang='en'):
    nlp = spacy.blank(lang)
    db = DocBin()
    for text, annot in srsly.read_json(input_path):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    db.to_disk(output_path)
def convert(lang: str, input_path: Path, output_path: Path):
    nlp = spacy.blank(lang)
    db = DocBin()
    for text, annot in srsly.read_json(input_path):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is None:
                msg = (
                    f"Skipping entity [{start}, {end}, {label}] in the following text "
                    f"because the character span '{doc.text[start:end]}' does not align "
                    f"with token boundaries:\n\n{repr(text)}\n"
                )
                warnings.warn(msg)
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    db.to_disk(output_path)
def process_uploaded_file(f, title):
    doc_bin = DocBin(
        attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "POS", "TAG", "HEAD", "DEP"],
        store_user_data=True,
    )
    # Add newlines using spaCy's sentence detection; this assumes a text that
    # has sentences split onto new lines.
    f = lineizer(f)
    doclist = f
    for doc in nlp.pipe(doclist):
        print(doc)
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()
    with open(f"media/{title}", "wb") as binary_file:
        binary_file.write(bytes_data)
def main(
    input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
    output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    for eg in srsly.read_json(input_path):
        print(eg)
        doc = nlp(eg[0])
        doc.ents = [
            doc.char_span(s[0], s[1], label=s[2])
            for s in eg[1].get("entities", [])
        ]
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
def spacy_parse_store_from_dataframe(fname, df, nlp):
    chunk_size = 10000
    chunks = math.ceil(len(df) / chunk_size)
    start_time = time.time()
    for chunk in range(chunks):
        start = chunk * chunk_size
        end = start + chunk_size
        chunk_df = df.iloc[start:end, ]
        chunk_fname = fname + f'_{chunk}'
        doc_bin = DocBin(attrs=attrs)
        for ti, text in enumerate(get_dataframe_review_texts(chunk_df)):
            doc = nlp(text)
            doc_bin.add(doc)
            if (ti + 1) % 1000 == 0:
                print(ti + 1, 'reviews parsed in chunk', chunk, '\ttime:', time.time() - start_time)
        with open(chunk_fname, 'wb') as fh:
            fh.write(doc_bin.to_bytes())
def to_spacy(self, df, file_path=None):
    """
    Convert the dataframe returned by the annotator into a spaCy DocBin.

    Parameters
    ----------
    df (pandas DataFrame): Dataframe returned by the annotator (see annotate()).
    file_path (str): Filepath (including filename) to save the .spacy file to.

    Returns
    -------
    A spaCy DocBin, in case a user wants to combine additional training data.
    """
    if not isinstance(df, pd.DataFrame):
        raise TypeError("Pass the pandas dataframe returned by annotate()")
    if file_path and not isinstance(file_path, str):
        raise TypeError("The file_path must be a string or None")
    if file_path is None:
        file_path = os.path.join(os.getcwd(), 'annotations.spacy')
    db = DocBin()
    training_data = [ant for ant in df['annotations'].tolist() if ant]
    for text, annotations in training_data:
        ents = []
        doc = self.nlp(text)
        for start, end, label in annotations['entities']:
            span = doc.char_span(start, end, label=label)
            # char_span returns None if the offsets don't align with token boundaries.
            if span is not None:
                ents.append(span)
        # Drop overlapping spans. Note: when spans overlap, the (first) longest span
        # is preferred over shorter spans.
        # See: https://spacy.io/api/top-level#util.filter_spans
        # TODO: alert users that some spans have been dropped.
        doc.ents = filter_spans(ents)
        db.add(doc)
    db.to_disk(file_path)
    print(f"Spacy file saved to: {file_path}")
    return db
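The returned DocBin is meant to be combined with additional training data, as the docstring notes. A minimal sketch of that, assuming spaCy v3's DocBin.merge; annotator, df_first and df_second are hypothetical names, not part of the original code.

# Hypothetical usage: merge annotations from two annotation sessions into one corpus.
db_first = annotator.to_spacy(df_first, "first.spacy")
db_second = annotator.to_spacy(df_second, "second.spacy")
db_first.merge(db_second)          # DocBin.merge extends db_first in place
db_first.to_disk("train.spacy")    # single combined training file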
def test_serialize_doc_bin():
    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
    texts = ["Some text", "Lots of texts...", "..."]
    cats = {"A": 0.5}
    nlp = English()
    for doc in nlp.pipe(texts):
        doc.cats = cats
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()
    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(bytes_data)
    reloaded_docs = list(doc_bin.get_docs(nlp.vocab))
    for i, doc in enumerate(reloaded_docs):
        assert doc.text == texts[i]
        assert doc.cats == cats
def descrip_to_spacy(data: pd.DataFrame, output_path: str) -> None:
    """Take a dataframe with description and label columns and save a DocBin to disk."""
    tuples = data.apply(
        lambda row: (strip_html_tags(row["description"]), row["fraud"]), axis=1
    ).to_list()
    nlp = spacy.blank("en")
    db = DocBin()
    for doc, label in nlp.pipe(tuples, as_tuples=True):
        if label:
            doc.cats["FRAUD"] = 1
            doc.cats["NOTFRAUD"] = 0
        else:
            doc.cats["FRAUD"] = 0
            doc.cats["NOTFRAUD"] = 1
        db.add(doc)
    db.to_disk(output_path)
def convert_file(
    input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
    output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    for eg in tqdm(srsly.read_jsonl(input_path)):
        if eg["answer"] != "accept":
            continue
        tokens = [token["text"] for token in eg["tokens"]]
        words, spaces = get_words_and_spaces(tokens, eg["text"])
        doc = Doc(nlp.vocab, words=words, spaces=spaces)
        doc.ents = [
            doc.char_span(s["start"], s["end"], label=s["label"])
            for s in eg.get("spans", [])
        ]
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
def main(
    input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
    output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    for idx, eg in enumerate(srsly.read_jsonl(input_path)):
        if idx % 10000 == 0:
            print(f"converted {idx} sentences")
        doc = nlp(eg["text"])
        spans_from_json = eg.get("spans", [])
        spans_objects = [
            doc.char_span(s["start"], s["end"], label=s["label"])
            for s in spans_from_json
        ]
        spans_objects = filter_spans(spans_objects)
        doc.ents = spans_objects
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
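Several of the converters above rely on spacy.util.filter_spans to resolve overlapping character spans before assigning doc.ents. A minimal illustration of that behavior, using an assumed example sentence and offsets rather than data from the original code.

import spacy
from spacy.util import filter_spans

nlp = spacy.blank("en")
doc = nlp.make_doc("New York City is in New York State")
spans = [
    doc.char_span(0, 13, label="GPE"),  # "New York City"
    doc.char_span(0, 8, label="GPE"),   # "New York" (overlaps the span above)
]
# filter_spans keeps the longest of any overlapping spans and drops the rest,
# so the result can be assigned to doc.ents without overlap errors.
doc.ents = filter_spans(spans)
print([(ent.text, ent.label_) for ent in doc.ents])  # [('New York City', 'GPE')]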