def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Path to input file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    spacy_model: str = typer.Argument("en_core_web_sm", help="Name of spaCy model to use"),
    n_process: int = typer.Option(1, "--n-process", "-n", help="Number of processes (multiprocessing)"),
    max_docs: int = typer.Option(10**6, "--max-docs", "-m", help="Maximum docs per batch"),
    # fmt: on
):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    count = 0
    batch_num = 0
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            if count < max_docs:
                doc_bin.add(doc)
                count += 1
            else:
                batch_num += 1
                msg.good(f"Processed {len(doc_bin)} docs")
                doc_bin_bytes = doc_bin.to_bytes()
                output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
                with output_file.open("wb") as f:
                    f.write(doc_bin_bytes)
                msg.good("Saved parsed docs to file", output_file.resolve())
                # Start the next batch with the current doc so it isn't
                # silently dropped at the batch boundary.
                doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
                doc_bin.add(doc)
                count = 1
    batch_num += 1
    output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
    with output_file.open("wb") as f:
        doc_bin_bytes = doc_bin.to_bytes()
        f.write(doc_bin_bytes)
    msg.good("Complete. Saved final parsed docs to file", output_file.resolve())
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=1, max_docs=10**7):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    count = 0
    batch_num = 0
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            if count < max_docs:
                doc_bin.add(doc)
                count += 1
            else:
                batch_num += 1
                msg.good(f"Processed {len(doc_bin)} docs")
                doc_bin_bytes = doc_bin.to_bytes()
                output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
                with output_file.open("wb") as f:
                    f.write(doc_bin_bytes)
                msg.good("Saved parsed docs to file", output_file.resolve())
                # Start the next batch with the current doc so it isn't
                # silently dropped at the batch boundary.
                doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
                doc_bin.add(doc)
                count = 1
    # Compute the final file name *before* opening it, so the remaining docs
    # go to the file the success message reports.
    batch_num += 1
    output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
    with output_file.open("wb") as f:
        f.write(doc_bin.to_bytes())
    msg.good("Complete. Saved final parsed docs to file", output_file.resolve())
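# Both main() variants above implement the same fixed-size batching by hand.
# Below is a minimal sketch of that pattern as a reusable generator; the names
# `batched_docbins` and ATTRS are illustrative, not from the original scripts.
from typing import Iterable, Iterator

from spacy.tokens import Doc, DocBin

ATTRS = ["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"]


def batched_docbins(docs: Iterable[Doc], max_docs: int) -> Iterator[DocBin]:
    """Yield DocBin objects holding at most max_docs docs each."""
    doc_bin = DocBin(attrs=ATTRS)
    for doc in docs:
        doc_bin.add(doc)
        if len(doc_bin) >= max_docs:
            yield doc_bin
            doc_bin = DocBin(attrs=ATTRS)
    if len(doc_bin):
        yield doc_bin


# Usage sketch: write each batch to its own .spacy file.
# for batch_num, doc_bin in enumerate(batched_docbins(nlp.pipe(texts), 10**6), 1):
#     (output_path / f"{input_path.stem}-{batch_num}.spacy").write_bytes(doc_bin.to_bytes())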
def test_issue5141(en_vocab):
    """Ensure an empty DocBin does not crash on serialization."""
    doc_bin = DocBin(attrs=["DEP", "HEAD"])
    assert list(doc_bin.get_docs(en_vocab)) == []
    doc_bin_bytes = doc_bin.to_bytes()
    doc_bin_2 = DocBin().from_bytes(doc_bin_bytes)
    assert list(doc_bin_2.get_docs(en_vocab)) == []
def build_vocab(water_bodies: Dict, nlp: Language):
    """Load new vocab and wikidata.

    Parameters
    ----------
    water_bodies : Dict
        Dictionary containing the list of new water bodies to be loaded.
        Format:
        {
            "LAKE": [(Name, Wiki_Id), ...],
            "RIVER": [(Name, Wiki_Id), ...],
            ...
        }
    nlp : Language
        spaCy nlp object
    """
    vocab = {}
    wikidata = {}
    doc_bins_bytes = {}
    stop_words = set(srsly.read_json(stop_words_file)['stop_words'])
    for key in water_bodies:
        doc_bin = DocBin()
        for wb, _ in tqdm(water_bodies[key], desc=f'Loading {key}(s)'):
            doc_bin.add(nlp(wb))
        doc_bins_bytes[key] = doc_bin.to_bytes()
        if key not in wikidata:
            wikidata[key] = {}
        for name, id in water_bodies[key]:
            wikidata[key][name.lower()] = id
        vocab[str(nlp.vocab.strings[key])] = key
    write_data_files(vocab, wikidata, stop_words, doc_bins_bytes)
def load_or_create_spacy_doc(sents, do_preprocess, use_cache, verbose):
    """
    @sents      list of strings to be tokenized.
    @use_cache  if true, try to load from disk first; otherwise, tokenize.
    @return     DocBin object
    """
    if do_preprocess:
        sents = preprocess(sents)
    fname = SPACY_DOC_PATH + str(do_preprocess) + hash_sents(sents) + ".bin"
    if os.path.exists(fname) and use_cache:
        now = time.time()
        if verbose:
            print("Loading tokenized document from disk...")
        with open(fname, "rb") as f:
            doc_bin = DocBin(attrs=["POS", "ENT_TYPE"]).from_bytes(f.read())
        if verbose:
            print("Finished loading tokenized document in {:.2f}s!".format(time.time() - now))
        return doc_bin
    else:
        now = time.time()
        if verbose:
            print("Start tokenizing document...")
        doc_bin = DocBin(attrs=["POS", "ENT_TYPE"])
        for doc in nlp.pipe(sents, disable=["parser"]):
            doc_bin.add(doc)
        with open(fname, "wb") as f:
            f.write(doc_bin.to_bytes())
        if verbose:
            print("Finished tokenizing document and saved to disk in {:.2f}s!".format(time.time() - now))
        return doc_bin
def write_docs_to_bin(docs: List[Doc], fname: str) -> None:
    doc_bin = DocBin(attrs=attrs)
    for doc in docs:
        doc_bin.add(doc)
    with open(fname, 'wb') as fh:
        doc_bin_bytes = doc_bin.to_bytes()
        fh.write(doc_bin_bytes)
def bin_inscriptions(corpus):
    """Put the texts into the DocBin."""
    doc_bin = DocBin(attrs=["LEMMA", "TAG", "POS", "DEP", "HEAD"], store_user_data=True)
    for c in corpus:
        doc = nlp(c)
        doc_bin.add(doc)
    with open('dbg.bin', 'wb') as f:
        f.write(doc_bin.to_bytes())
def test_serialize_doc_bin():
    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
    texts = ["Some text", "Lots of texts...", "..."]
    nlp = English()
    for doc in nlp.pipe(texts):
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()
    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(bytes_data)
    list(doc_bin.get_docs(nlp.vocab))
def write_docs(texts, attrs, lang, file, provider='spacy'):
    from tqdm import tqdm

    nlp = get_model(lang, provider=provider)
    doc_bin = DocBin(attrs=[a.upper() for a in attrs], store_user_data=True)
    # doc_bin = DocBin(attrs=["DEP", "HEAD"])
    # tqdm just wraps a loop. When you call it around nlp.pipe, the loop
    # you're wrapping yields individual docs, which nlp.pipe produces
    # batch by batch internally.
    for doc in tqdm(nlp.pipe(texts)):
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()
    write_doc_to(bytes_data, file)
def test_issue4528(en_vocab):
    """Test that user_data is correctly serialized in DocBin."""
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    # This is how extension attribute values are stored in the user data
    doc.user_data[("._.", "foo", None, None)] = "bar"
    doc_bin = DocBin(store_user_data=True)
    doc_bin.add(doc)
    doc_bin_bytes = doc_bin.to_bytes()
    new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
    new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
    assert new_doc.user_data["foo"] == "bar"
    assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag, reader_value):
    """Test that custom extensions are correctly serialized in DocBin."""
    Doc.set_extension("foo", default="nothing")
    doc = Doc(en_vocab, words=["hello", "world"])
    doc._.foo = "bar"
    doc_bin_1 = DocBin(store_user_data=writer_flag)
    doc_bin_1.add(doc)
    doc_bin_bytes = doc_bin_1.to_bytes()
    doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes)
    doc_2 = list(doc_bin_2.get_docs(en_vocab))[0]
    assert doc_2._.foo == reader_value
    Underscore.doc_extensions = {}
def parse_dataset(self, in_dataset_path, out_dataset_path, save_as_str=False, condition_func=None):
    in_dataset = self.load_dataset(in_dataset_path, out_dataset_path, binary=False)
    if in_dataset is None:
        return None

    start_time = time.time()
    sents = []
    doc_bin = DocBin(
        attrs=["ID", "ORTH", "LEMMA", "TAG", "POS", "HEAD", "DEP", "ENT_IOB", "ENT_TYPE"],
        store_user_data=True,
    )
    for sent in in_dataset:
        doc = self.parse_sentence(sent)
        if doc is None:
            continue
        if condition_func and condition_func(doc):
            continue
        doc_bin.add(doc)
        sents.append(sent)
        self.print_status(len(doc_bin), out_dataset_path, start_time)
        if not self.check_dataset_size(doc_bin):
            break

    print(f"The dataset {basename(out_dataset_path)} contains {len(doc_bin)} parsed sentences")
    os.makedirs(dirname(out_dataset_path), exist_ok=True)

    # Save the resulting dataset as strings if needed
    if save_as_str:
        with open(out_dataset_path, "w") as target_file:
            target_file.writelines(sents)

    # Save the resulting parsed dataset
    with open(out_dataset_path, "wb") as parsed_file:
        parsed_file.write(doc_bin.to_bytes())
def main():
    keywords_df = pd.read_csv('data/keywords.csv')
    keywords_dic = dict(zip(keywords_df['keyword'], keywords_df['entity']))
    data = pd.read_csv('data/taged_all.csv')

    cd = CleanData()
    data_clean = cd.normalize_text(data.copy())
    data_clean['keywords'] = data_clean['clean_text'].str.split()

    doc_entity_df = doc_entity(data_clean, keywords_dic)
    doc_entity_df = doc_entity_df.merge(
        data_clean[['id', 'target', 'predict']], how='left', left_on='id', right_on='id')
    doc_entity_df.set_index('id', inplace=True)
    doc_entity_df.to_csv('data/doc_entity_df.csv', index=True, header=True)

    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
    texts = [
        "Disaster control teams are studying ways to evacuate the port area in response to tidal wave warnings.[900037]"
    ]
    nlp = spacy.load("en_core_web_md")
    for doc in nlp.pipe(texts):
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()

    # Read and write binary file
    with open('data/sample', "wb") as out_file:
        out_file.write(bytes_data)
    with open('data/sample', "rb") as in_file:
        data = in_file.read()

    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(data)
    docs = list(doc_bin.get_docs(nlp.vocab))

    # ###############################################################################
    data = pd.read_csv('data/taged_all.csv')
    for row in tqdm(data['text'], total=data.shape[0]):
        doc = nlp(row)
        doc.to_disk('data/sample')
        print([(X.text, X.label_) for X in doc.ents])
def main() -> None:
    # logger = mp.log_to_stderr(level=logging.INFO)
    args = parse_arguments()
    logger.info(f"Receive input from {args.input.name}")
    texts = list(args.input)
    logger.info(f"Received {len(texts)} lines")
    doc_bin = DocBin(attrs=["LEMMA", "IS_STOP"])
    logger.info(f"Initialize {args.workers} processes")
    with mp.Pool(args.workers, initializer=init_nlp, initargs=(args,)) as pool:
        pool_iter = pool.imap(process, texts, chunksize=args.batch_size)
        for doc in tqdm(pool_iter, total=len(texts)):
            # DocBin.merge() expects another DocBin; individual Doc objects
            # are added with DocBin.add().
            doc_bin.add(doc)
    args.output.write(doc_bin.to_bytes())
    logger.info(f"Saved at {args.output.name}")
def spacy_parse_store_from_dataframe(fname, df, nlp):
    chunk_size = 10000
    # Number of chunks, not rows: divide by the chunk size.
    chunks = math.ceil(len(df) / chunk_size)
    start_time = time.time()
    for chunk in range(chunks):
        start = chunk * chunk_size
        end = start + chunk_size
        chunk_df = df.iloc[start:end]
        chunk_fname = fname + f'_{chunk}'
        doc_bin = DocBin(attrs=attrs)
        for ti, text in enumerate(get_dataframe_review_texts(chunk_df)):
            doc = nlp(text)
            doc_bin.add(doc)
            if (ti + 1) % 1000 == 0:
                print(ti + 1, 'reviews parsed in chunk', chunk, '\ttime:', time.time() - start_time)
        with open(chunk_fname, 'wb') as fh:
            fh.write(doc_bin.to_bytes())
def process_uploaded_file(f, title):
    doc_bin = DocBin(
        attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "POS", "TAG", "HEAD", "DEP"],
        store_user_data=True)
    # add newlines using spacy's sentence detection
    f = lineizer(f)
    # this assumes a text that has sentences split into new lines
    doclist = f
    for doc in nlp.pipe(doclist):
        print(doc)
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()
    with open(f"media/{title}", "wb") as binary_file:
        binary_file.write(bytes_data)
def test_serialize_doc_bin():
    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
    texts = ["Some text", "Lots of texts...", "..."]
    cats = {"A": 0.5}
    nlp = English()
    for doc in nlp.pipe(texts):
        doc.cats = cats
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()
    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(bytes_data)
    reloaded_docs = list(doc_bin.get_docs(nlp.vocab))
    for i, doc in enumerate(reloaded_docs):
        assert doc.text == texts[i]
        assert doc.cats == cats
def cache_docbin(self, force=False):
    sp = self.sp
    refresh = force or not os.path.isfile(DOCBIN_CACHE) \
        or not os.path.isdir(VOCAB_CACHE)
    if refresh:
        paragraphs = self.load_paragraphs()
        doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
        for doc in sp.pipe(tqdm(paragraphs)):
            doc_bin.add(doc)
        with open(DOCBIN_CACHE, "wb") as f:
            f.write(doc_bin.to_bytes())
        sp.vocab.to_disk(VOCAB_CACHE)
    sp.vocab.from_disk(VOCAB_CACHE)
    with open(DOCBIN_CACHE, "rb") as f:
        bb = f.read()
    doc_bin = DocBin().from_bytes(bb)
    return list(doc_bin.get_docs(sp.vocab))
def save(
    self,
    filepath: Union[str, pathlib.Path],
    store_user_data: bool = True,
) -> None:
    """
    Save :class:`Corpus` to disk as binary data.

    Args:
        filepath: Full path to file on disk where :class:`Corpus` data
            will be saved as a binary file.
        store_user_data: If True, store user data and values of custom
            extension attributes along with core spaCy attributes.

    See Also:
        - :meth:`Corpus.load()`
        - :class:`spacy.tokens.DocBin`
    """
    attrs = [
        spacy.attrs.ORTH,
        spacy.attrs.SPACY,
    ]
    if self[0].is_tagged:
        attrs.append(spacy.attrs.TAG)
    if self[0].is_parsed:
        attrs.append(spacy.attrs.HEAD)
        attrs.append(spacy.attrs.DEP)
    # NOTE: HEAD sets sentence boundaries implicitly based on tree structure, so
    # also setting SENT_START would potentially conflict with existing annotations.
    elif self[0].is_sentenced:
        attrs.append(spacy.attrs.SENT_START)
    if self[0].is_nered:
        attrs.append(spacy.attrs.ENT_IOB)
        attrs.append(spacy.attrs.ENT_TYPE)
    doc_bin = DocBin(attrs=attrs, store_user_data=store_user_data)
    for doc in self:
        doc_bin.add(doc)
    with tio.open_sesame(filepath, mode="wb") as f:
        f.write(doc_bin.to_bytes())
def write_spacy_docs(docs, vocab, filepath, vocab_filepath):
    """
    Writes serialized spacy docs to file.

    Parameters
    ----------
    docs: list of spacy.tokens.doc.Doc
        List of spacy Docs to write to file
    vocab: spacy.vocab.Vocab
        Vocabulary the docs were created with
    filepath: str
        File path for the serialized spacy docs
    vocab_filepath: str
        File path for the serialized vocabulary
    """
    from spacy.attrs import IDS

    attr_exclude = ['SENT_START']
    attrs = [attr for attr in IDS.keys() if attr not in attr_exclude]

    doc_bin = DocBin(attrs=attrs)
    for doc in docs:
        doc_bin.add(doc)
    with open(filepath, 'wb') as f:
        f.write(doc_bin.to_bytes())
    with open(vocab_filepath, 'wb') as f:
        f.write(vocab.to_bytes())
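# A matching reader for write_spacy_docs() above would restore the vocab
# first, then rebuild the docs from the DocBin bytes. A minimal sketch; the
# name `read_spacy_docs` is illustrative, not from the original module.
from spacy.tokens import DocBin
from spacy.vocab import Vocab


def read_spacy_docs(filepath, vocab_filepath):
    """Load a serialized Vocab and DocBin from disk and rebuild the Docs."""
    vocab = Vocab()
    with open(vocab_filepath, 'rb') as f:
        vocab.from_bytes(f.read())
    with open(filepath, 'rb') as f:
        doc_bin = DocBin().from_bytes(f.read())
    return list(doc_bin.get_docs(vocab))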
def docbin_writer(docs: Iterable[Doc], docbin_output_path: str):
    """Writes a stream of spaCy Doc objects to a binary file in the DocBin format."""
    import spacy.attrs

    # Creating the DocBin object (with all attributes)
    attrs = [
        spacy.attrs.LEMMA,
        spacy.attrs.TAG,
        spacy.attrs.DEP,
        spacy.attrs.HEAD,
        spacy.attrs.ENT_IOB,
        spacy.attrs.ENT_TYPE,
    ]
    docbin = DocBin(attrs=attrs, store_user_data=True)

    # Storing the documents in the DocBin repository
    for doc in docs:
        doc.cats = {}
        docbin.add(doc)
    data = docbin.to_bytes()

    # And writing the content to the file
    print("Write to", docbin_output_path, end="...", flush=True)
    with open(docbin_output_path, "wb") as fd:
        fd.write(data)
    print("done")
def test_serialize_doc_bin():
    doc_bin = DocBin(
        attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"], store_user_data=True
    )
    texts = ["Some text", "Lots of texts...", "..."]
    cats = {"A": 0.5}
    nlp = English()
    for doc in nlp.pipe(texts):
        doc.cats = cats
        doc.spans["start"] = [doc[0:2]]
        doc[0].norm_ = "UNUSUAL_TOKEN_NORM"
        doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID"
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()
    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(bytes_data)
    reloaded_docs = list(doc_bin.get_docs(nlp.vocab))
    for i, doc in enumerate(reloaded_docs):
        assert doc.text == texts[i]
        assert doc.cats == cats
        assert len(doc.spans) == 1
        assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM"
        assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID"
        token_row = classifiers_neg_gpc[classifiers_neg_gpc.form == token.text]
        if not token_row.empty:
            negative_token.append(token.text)
        else:
            token_row = classifiers_pos_gpc[classifiers_pos_gpc.form == token.text]
            if not token_row.empty:
                positive_token.append(token.text)
    sentiment_tokens["positiv"] = positive_token
    sentiment_tokens["negativ"] = negative_token
    return sentiment_tokens


counter = 0
for f, protokoll in spacy_db.items():
    counter += 1
    print(counter)
    for rede in protokoll:
        rede.user_data["sentiws"] = sentiws_eval(rede)
    for rede in protokoll:
        rede.user_data["gpc"] = gpc_eval(rede)
    doc_bin = DocBin(
        attrs=["POS", "TAG", "LEMMA", "IS_STOP", "DEP", "SHAPE",
               "ENT_ID", "ENT_IOB", "ENT_KB_ID", "ENT_TYPE"],
        store_user_data=True)
    for doc in protokoll:
        doc_bin.add(doc)
    spacy_out = doc_bin.to_bytes()
    with open(file=("plenarprotokolle/pp19/" + f + ".sentiment"), mode="wb") as spacy_outfile:
        spacy_outfile.write(spacy_out)
text = "I love coffee" # Load base document nlp = spacy.load("en_core_web_sm") doc_base = nlp(text) print("") print_doc(doc_base) # Serialize document to disk and bytes doc_base.to_disk("doc.spacy") doc_base_bytes = doc_base.to_bytes() # Serialize using DocBin docbin_base = DocBin(attrs=["ENT_IOB", "POS", "HEAD", "DEP", "ENT_TYPE"], store_user_data=True) docbin_base.add(doc_base) docbin_base_bytes = docbin_base.to_bytes() # Restore document from disk doc = Doc(Vocab()) doc.from_disk("doc.spacy") print("") print_doc(doc) # Restore document from bytes doc = Doc(Vocab()) doc.from_bytes(doc_base_bytes) print("") print_doc(doc) # Restore using DocBin docbin = DocBin().from_bytes(docbin_base_bytes)
    config.CPU_CORES = args.cores
    if config.CPU_CORES < 0 or config.CPU_CORES > cpu_count():
        raise ValueError('Invalid core number specified.')
    print('CPU_CORES:', config.CPU_CORES)
    return config


if __name__ == '__main__':
    conf = parse_config()
    df = pd.read_csv(conf.INPUT_FILE, usecols=['sentence_id', 'text'])
    if spacy.__version__[0] == '3':
        # spaCy v3 saves all attributes by default.
        doc_bin = DocBin(store_user_data=True)
    else:
        doc_bin = DocBin(["LEMMA", "ENT_TYPE", "POS", "DEP"], store_user_data=True)
    for i, doc in enumerate(nlp.pipe(df['text'], n_process=conf.CPU_CORES)):
        if doc.is_parsed:
            doc.user_data['sentence_id'] = df.iloc[i, 0]
            doc_bin.add(doc)
    print('Writing file...')
    bytes_data = doc_bin.to_bytes()
    with open(conf.OUTPUT_FILE, 'wb+') as f:
        f.write(bytes_data)
    print('Exiting...')
def process(self):
    """Reads text and outputs entities per text body."""
    # prepare staging area
    staging_area = self.dataset.get_staging_area()

    self.dataset.update_status("Preparing data")

    # go through all archived token sets and vectorise them
    results = []

    # Load the spacy goods
    nlp = spacy.load("en_core_web_sm")
    nlp.tokenizer = self.custom_tokenizer(nlp)  # Keep words with a dash in between

    # Disable what has _not_ been selected
    options = ["parser", "tagger", "ner"]
    enable = self.parameters.get("enable", False)
    if not enable:
        self.dataset.update_status("Select at least one of the options.")
        self.dataset.finish(0)
        return
    disable = [option for option in options if option not in enable]

    with open(self.source_file, encoding="utf-8") as source:
        # Get all ze text first so we can process it in batches
        csv_reader = csv.DictReader(source)
        posts = [post["body"] if post["body"] else "" for post in csv_reader]

    # Process the text in batches
    if len(posts) < 100000:
        self.dataset.update_status("Extracting linguistic features")
    else:
        self.dataset.update_status(
            "Extracting linguistic features is currently only available for datasets with less than 100,000 items.")
        self.dataset.finish(0)
        return

    # Make sure only the needed information is extracted.
    attrs = []
    if "tagger" not in disable:
        attrs.append("POS")
    if "parser" not in disable:
        attrs.append("DEP")
    # The bare truthy check `if "ner":` would always pass; test membership instead.
    if "ner" not in disable:
        attrs.append("ENT_IOB")
        attrs.append("ENT_TYPE")
        attrs.append("ENT_ID")
        attrs.append("ENT_KB_ID")

    # DocBin for quick saving
    doc_bin = DocBin(attrs=attrs)

    # Start the processing!
    for i, doc in enumerate(nlp.pipe(posts, disable=disable)):
        try:
            doc_bin.add(doc)
        except MemoryError:
            self.dataset.update_status(
                "Out of memory while parsing data. Try again with a smaller dataset.", is_final=True)
            return

        # It's quite a heavy process, so make sure it can be interrupted
        if self.interrupted:
            raise ProcessorInterruptedException("Processor interrupted while iterating through CSV file")

        if i % 1000 == 0:
            self.dataset.update_status("Done with post %s out of %s" % (i, len(posts)))

    self.dataset.update_status("Serializing results - this will take a while")

    # Then serialize the NLP docs and the vocab
    doc_bytes = doc_bin.to_bytes()

    # Dump ze data in a temporary folder
    with staging_area.joinpath("spacy_docs.pb").open("wb") as outputfile:
        pickle.dump(doc_bytes, outputfile)

    # create zip of archive and delete temporary files and folder
    self.write_archive_and_finish(staging_area, compression=zipfile.ZIP_LZMA)
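# Reading the result back would mirror the serialization above: unpickle the
# bytes payload, then rebuild the DocBin. A minimal sketch; the file location
# and the vocab source are assumptions, since the archive layout isn't shown.
import pickle

import spacy
from spacy.tokens import DocBin

with open("spacy_docs.pb", "rb") as infile:  # assumed location of the dump
    doc_bytes = pickle.load(infile)
doc_bin = DocBin().from_bytes(doc_bytes)
docs = list(doc_bin.get_docs(spacy.load("en_core_web_sm").vocab))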