Code Example #1
def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Path to input file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    spacy_model: str = typer.Argument("en_core_web_sm",
                                      help="Name of spaCy model to use"),
    n_process: int = typer.Option(
        1, "--n-process", "-n", help="Number of processes (multiprocessing)"),
    max_docs: int = typer.Option(10**6,
                                 "--max-docs",
                                 "-m",
                                 help="Maximum docs per batch"),
    # fmt: on
):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and outputs one or more
    .spacy files containing the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    count = 0
    batch_num = 0
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            if count < max_docs:
                doc_bin.add(doc)
                count += 1
            else:
                batch_num += 1
                msg.good(f"Processed {len(doc_bin)} docs")
                doc_bin_bytes = doc_bin.to_bytes()
                output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
                with output_file.open("wb") as f:
                    f.write(doc_bin_bytes)
                msg.good("Saved parsed docs to file", output_file.resolve())
                doc_bin = DocBin(
                    attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
                # keep the doc that triggered the flush instead of dropping it
                doc_bin.add(doc)
                count = 1
        batch_num += 1
        output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
        with output_file.open("wb") as f:
            doc_bin_bytes = doc_bin.to_bytes()
            f.write(doc_bin_bytes)
            msg.good(f"Complete. Saved final parsed docs to file",
                     output_file.resolve())
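The .spacy batch files written above can be read back with the same DocBin API. A minimal sketch, assuming the parsing model and output directory used above (both names are assumptions here):

import spacy
from pathlib import Path
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_sm")  # assumption: the same model that produced the batches
docs = []
for spacy_file in sorted(Path("output").glob("*.spacy")):  # assumption: the out_dir passed to main()
    doc_bin = DocBin().from_bytes(spacy_file.read_bytes())
    docs.extend(doc_bin.get_docs(nlp.vocab))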
Code Example #2
def main(in_file,
         out_dir,
         spacy_model="en_core_web_sm",
         n_process=1,
         max_docs=10**7):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and outputs one or more
    .spacy files containing the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    count = 0
    batch_num = 0
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            if count < max_docs:
                doc_bin.add(doc)
                count += 1
            else:
                batch_num += 1
                msg.good(f"Processed {len(doc_bin)} docs")
                doc_bin_bytes = doc_bin.to_bytes()
                output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
                with output_file.open("wb") as f:
                    f.write(doc_bin_bytes)
                msg.good("Saved parsed docs to file", output_file.resolve())
                doc_bin = DocBin(
                    attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
                # keep the doc that triggered the flush instead of dropping it
                doc_bin.add(doc)
                count = 1
        batch_num += 1
        output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
        with output_file.open("wb") as f:
            doc_bin_bytes = doc_bin.to_bytes()
            f.write(doc_bin_bytes)
            msg.good("Complete. Saved final parsed docs to file",
                     output_file.resolve())
Code Example #3
def test_issue5141(en_vocab):
    """ Ensure an empty DocBin does not crash on serialization """
    doc_bin = DocBin(attrs=["DEP", "HEAD"])
    assert list(doc_bin.get_docs(en_vocab)) == []
    doc_bin_bytes = doc_bin.to_bytes()
    doc_bin_2 = DocBin().from_bytes(doc_bin_bytes)
    assert list(doc_bin_2.get_docs(en_vocab)) == []
Code Example #4
File: util.py Project: kevinxyc1/waterwheel
def build_vocab(water_bodies: Dict, nlp: Language):
    """Load new vocab and wikidata.
    
    Parameters
    ----------
    water_bodies : Dict
        Dictionary containing the list of new water bodies to be loaded.
        Format:
        {
            "LAKE": [(Name, Wiki_Id), ...],
            "RIVER": [(Name, Wiki_Id), ...],
            ...
        }
    nlp: Language
        spacy nlp object
    """

    vocab = {}
    wikidata = {}
    doc_bins_bytes = {}
    stop_words = set(srsly.read_json(stop_words_file)['stop_words'])

    for key in water_bodies:
        doc_bin = DocBin()
        for wb, _ in tqdm(water_bodies[key], desc=f'Loading {key}(s)'):
            doc_bin.add(nlp(wb))
        doc_bins_bytes[key] = doc_bin.to_bytes()

        if key not in wikidata:
            wikidata[key] = {}
        for name, id in water_bodies[key]:
            wikidata[key][name.lower()] = id

        vocab[str(nlp.vocab.strings[key])] = key
    write_data_files(vocab, wikidata, stop_words, doc_bins_bytes)
Code Example #5
def load_or_create_spacy_doc(sents, do_preprocess, use_cache, verbose):
    """
    @sents list of string to be tokenized.
    @use_cache if true, try load from disk first. Otherwise, tokenize.
    @return DocBin object
    """
    
    if do_preprocess:
        sents = preprocess(sents)
        
    fname = SPACY_DOC_PATH + str(do_preprocess) + hash_sents(sents) + ".bin"
    
    if os.path.exists(fname) and use_cache:
        now = time.time()
        if verbose:
            print("Loading tokenized document from disk...")
        with open(fname, "rb") as f:
            doc_bin = DocBin(attrs=["POS", "ENT_TYPE"]).from_bytes(f.read())
        if verbose:
            print("Finished loading tokenized document in {:.2f}s!".format(time.time() - now))
        return doc_bin
    else:
        now = time.time()
        if verbose:
            print("Start tokenizing document...")
        doc_bin = DocBin(attrs=["POS", "ENT_TYPE"])
        for doc in nlp.pipe(sents, disable=["parser"]):
            doc_bin.add(doc)
        with open(fname, "wb") as f:
            f.write(doc_bin.to_bytes())
        if verbose:
            print("Finish tokenizing document and save to disk in {:.2f}s!".format(time.time() - now))
        return doc_bin
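A minimal usage sketch for the helper above (nlp, SPACY_DOC_PATH, hash_sents and preprocess are module-level names the snippet assumes; the sentences are illustrative):

sents = ["The Nile flows north.", "Lake Victoria borders three countries."]
doc_bin = load_or_create_spacy_doc(sents, do_preprocess=False, use_cache=True, verbose=True)
docs = list(doc_bin.get_docs(nlp.vocab))  # the returned DocBin still needs a vocab to yield Docs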
Code Example #6
def write_docs_to_bin(docs: List[Doc], fname: str) -> None:
    doc_bin = DocBin(attrs=attrs)
    for doc in docs:
        doc_bin.add(doc)
    with open(fname, 'wb') as fh:
        doc_bin_bytes = doc_bin.to_bytes()
        fh.write(doc_bin_bytes)
Code Example #7
def bin_inscriptions(corpus):
    """ put the texts into the docbin """
    doc_bin = DocBin(attrs=["LEMMA", "TAG", "POS", "DEP", "HEAD"],
                     store_user_data=True)
    for c in corpus:
        doc = nlp(c)
        doc_bin.add(doc)

    with open('dbg.bin', 'wb') as f:
        f.write(doc_bin.to_bytes())
Code Example #8
def test_serialize_doc_bin():
    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
    texts = ["Some text", "Lots of texts...", "..."]
    nlp = English()
    for doc in nlp.pipe(texts):
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()

    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(bytes_data)
    list(doc_bin.get_docs(nlp.vocab))
Code Example #9
File: analspa.py Project: CoderOverflow/stack
def write_docs(texts, attrs, lang, file, provider='spacy'):
    from tqdm import tqdm
    nlp = get_model(lang, provider=provider)
    doc_bin = DocBin(attrs=[a.upper() for a in attrs], store_user_data=True)
    # doc_bin = DocBin(attrs=["DEP", "HEAD"])
    # for doc in nlp.pipe(texts):
    # the tqdm library just wraps a loop. When you call it around nlp.pipe,
    # the loop you're wrapping is over the individual batches.
    for doc in tqdm(nlp.pipe(texts)):
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()
    write_doc_to(bytes_data, file)
Code Example #10
def test_issue4528(en_vocab):
    """Test that user_data is correctly serialized in DocBin."""
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    # This is how extension attribute values are stored in the user data
    doc.user_data[("._.", "foo", None, None)] = "bar"
    doc_bin = DocBin(store_user_data=True)
    doc_bin.add(doc)
    doc_bin_bytes = doc_bin.to_bytes()
    new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
    new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
    assert new_doc.user_data["foo"] == "bar"
    assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
Code Example #11
def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag,
                                    reader_value):
    """Test that custom extensions are correctly serialized in DocBin."""
    Doc.set_extension("foo", default="nothing")
    doc = Doc(en_vocab, words=["hello", "world"])
    doc._.foo = "bar"
    doc_bin_1 = DocBin(store_user_data=writer_flag)
    doc_bin_1.add(doc)
    doc_bin_bytes = doc_bin_1.to_bytes()
    doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes)
    doc_2 = list(doc_bin_2.get_docs(en_vocab))[0]
    assert doc_2._.foo == reader_value
    Underscore.doc_extensions = {}
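The writer_flag / reader_flag / reader_value arguments are presumably supplied by a pytest parametrization not shown in this snippet. A plausible reconstruction (an assumption, not copied from the source; the mixed-flag cases are omitted because their expected value depends on whether the reading DocBin honours store_user_data in the installed spaCy version):

import pytest

@pytest.mark.parametrize(
    "writer_flag,reader_flag,reader_value",
    [
        (True, True, "bar"),        # user data stored and restored
        (False, False, "nothing"),  # never stored, so the extension default comes back
    ],
)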
Code Example #12
    def parse_dataset(self,
                      in_dataset_path,
                      out_dataset_path,
                      save_as_str=False,
                      condition_func=None):
        in_dataset = self.load_dataset(in_dataset_path,
                                       out_dataset_path,
                                       binary=False)

        if in_dataset is None:
            return None

        start_time = time.time()
        sents = []
        doc_bin = DocBin(attrs=[
            "ID", "ORTH", "LEMMA", "TAG", "POS", "HEAD", "DEP", "ENT_IOB",
            "ENT_TYPE"
        ],
                         store_user_data=True)

        for sent in in_dataset:
            doc = self.parse_sentence(sent)

            if doc is None:
                continue

            if condition_func and condition_func(doc):
                continue

            doc_bin.add(doc)
            sents.append(sent)

            self.print_status(len(doc_bin), out_dataset_path, start_time)
            if not self.check_dataset_size(doc_bin):
                break

        print(
            f"The dataset {basename(out_dataset_path)} contains {len(doc_bin)} parsed sentences"
        )

        os.makedirs(dirname(out_dataset_path), exist_ok=True)

        # Save the resulting dataset as plain text if needed
        if save_as_str:
            with open(out_dataset_path, "w") as target_file:
                target_file.writelines(sents)

        # Save the resulting parsed dataset
        with open(out_dataset_path, "wb") as parsed_file:
            parsed_file.write(doc_bin.to_bytes())
Code Example #13
def main():
    keywords_df = pd.read_csv('data/keywords.csv')
    keywords_dic = dict(zip(keywords_df['keyword'], keywords_df['entity']))
    data = pd.read_csv('data/taged_all.csv')

    cd = CleanData()
    data_clean = cd.normalize_text(data.copy())
    data_clean['keywords'] = data_clean['clean_text'].str.split()

    doc_entity_df = doc_entity(data_clean, keywords_dic)

    doc_entity_df = doc_entity_df.merge(data_clean[['id', 'target',
                                                    'predict']],
                                        how='left',
                                        left_on='id',
                                        right_on='id')
    doc_entity_df.set_index('id', inplace=True)
    doc_entity_df.to_csv('data/doc_entity_df.csv', index=True, header=True)

    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"],
                     store_user_data=True)
    texts = [
        "Disaster control teams are studying ways to evacuate the port area in response to tidal wave warnings.[900037]"
    ]
    nlp = spacy.load("en_core_web_md")
    for doc in nlp.pipe(texts):
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()

    # Read and write binary file
    with open('data/sample', "wb") as out_file:
        out_file.write(bytes_data)

    with open('data/sample', "rb") as in_file:
        data = in_file.read()
        in_file.close()

    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(data)
    docs = list(doc_bin.get_docs(nlp.vocab))

    # ###################################################################################
    data = pd.read_csv('data/taged_all.csv')

    for row in tqdm(data['text'], total=data.shape[0]):
        doc = nlp(row)
        doc.to_disk('data/sample')
    print([(X.text, X.label_) for X in doc.ents])
Code Example #14
File: spacify.py Project: binshengliu/irtools
def main() -> None:
    # logger = mp.log_to_stderr(level=logging.INFO)
    args = parse_arguments()
    logger.info(f"Receive input from {args.input.name}")
    texts = list(args.input)
    logger.info(f"Received {len(texts)} lines")
    doc_bin = DocBin(attrs=["LEMMA", "IS_STOP"])
    logger.info(f"Initialize {args.workers} processes")
    with mp.Pool(args.workers, initializer=init_nlp, initargs=(args,),) as pool:
        pool_iter = pool.imap(process, texts, chunksize=args.batch_size)
        for doc in tqdm(pool_iter, total=len(texts)):
            doc_bin.merge(doc)

    args.output.write(doc_bin.to_bytes())
    logger.info(f"Saved at {args.output.name}")
Code Example #15
def spacy_parse_store_from_dataframe(fname, df, nlp):
    chunks = math.ceil(len(df) / 10000)  # chunk size of 10,000 reviews, matching the slicing below
    start_time = time.time()
    for chunk in range(chunks):
        start = chunk * 10000
        end = start + 10000
        chunk_df = df.iloc[start:end, ]
        chunk_fname = fname + f'_{chunk}'
        doc_bin = DocBin(attrs=attrs)
        for ti, text in enumerate(get_dataframe_review_texts(chunk_df)):
            doc = nlp(text)
            doc_bin.add(doc)
            if (ti + 1) % 1000 == 0:
                print(ti + 1, 'reviews parsed in chunk', chunk, '\ttime:',
                      time.time() - start_time)
        with open(chunk_fname, 'wb') as fh:
            fh.write(doc_bin.to_bytes())
Code Example #16
File: nlp.py Project: jdbuysse/lgap
def process_uploaded_file(f, title):
    doc_bin = DocBin(
        attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "POS", "TAG", "HEAD", "DEP"],
        store_user_data=True)
    # add newlines using spacy's sentence detection
    f = lineizer(f)
    # this assumes a text that has sentences split into new lines
    doclist = f
    for doc in nlp.pipe(doclist):
        print(doc)
        doc_bin.add(doc)
    # for doc in nlp.pipe():
    #      print(doc)
    #      doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()
    with open(f"media/{title}", "wb") as binary_file:
        binary_file.write(bytes_data)
Code Example #17
def test_serialize_doc_bin():
    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"],
                     store_user_data=True)
    texts = ["Some text", "Lots of texts...", "..."]
    cats = {"A": 0.5}
    nlp = English()
    for doc in nlp.pipe(texts):
        doc.cats = cats
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()

    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(bytes_data)
    reloaded_docs = list(doc_bin.get_docs(nlp.vocab))
    for i, doc in enumerate(reloaded_docs):
        assert doc.text == texts[i]
        assert doc.cats == cats
Code Example #18
File: indexers.py Project: yashwanthv19/questionable
    def cache_docbin(self, force=False):
        sp = self.sp
        refresh = force or not os.path.isfile(DOCBIN_CACHE) \
                        or not os.path.isdir(VOCAB_CACHE)

        if refresh:
            paragraphs = self.load_paragraphs()
            doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"],
                             store_user_data=True)
            for doc in sp.pipe(tqdm(paragraphs)):
                doc_bin.add(doc)
            with open(DOCBIN_CACHE, "wb") as f:
                f.write(doc_bin.to_bytes())
            sp.vocab.to_disk(VOCAB_CACHE)

        sp.vocab.from_disk(VOCAB_CACHE)

        with open(DOCBIN_CACHE, "rb") as f:
            bb = f.read()
            doc_bin = DocBin().from_bytes(bb)
        return list(doc_bin.get_docs(sp.vocab))
Code Example #19
File: corpus.py Project: wayne9qiu/textacy
    def save(
        self,
        filepath: Union[str, pathlib.Path],
        store_user_data: bool = True,
    ) -> None:
        """
        Save :class:`Corpus` to disk as binary data.

        Args:
            filepath: Full path to file on disk where :class:`Corpus` data
                will be saved as a binary file.
            store_user_data: If True, store user data and values of
                custom extension attributes along with core spaCy attributes.

        See Also:
            - :meth:`Corpus.load()`
            - :class:`spacy.tokens.DocBin`
        """
        attrs = [
            spacy.attrs.ORTH,
            spacy.attrs.SPACY,
        ]
        if self[0].is_tagged:
            attrs.append(spacy.attrs.TAG)
        if self[0].is_parsed:
            attrs.append(spacy.attrs.HEAD)
            attrs.append(spacy.attrs.DEP)
        # NOTE: HEAD sets sentence boundaries implicitly based on tree structure, so
        # also setting SENT_START would potentially conflict with existing annotations.
        elif self[0].is_sentenced:
            attrs.append(spacy.attrs.SENT_START)
        if self[0].is_nered:
            attrs.append(spacy.attrs.ENT_IOB)
            attrs.append(spacy.attrs.ENT_TYPE)
        doc_bin = DocBin(attrs=attrs, store_user_data=store_user_data)
        for doc in self:
            doc_bin.add(doc)
        with tio.open_sesame(filepath, mode="wb") as f:
            f.write(doc_bin.to_bytes())
Code Example #20
def write_spacy_docs(docs, vocab, filepath, vocab_filepath):
    """ Writes serialized spacy docs to file.  
    
    Parameters
    ----------
    docs: list of spacy.tokens.doc.Doc
        List of spacy Docs to write to file 
    filepath: str
        File path to serialized spacy docs
    """
    from spacy.attrs import IDS
    attr_exclude = ['SENT_START']
    attrs = [attr for attr in IDS.keys() if attr not in attr_exclude]

    doc_bin = DocBin(attrs=attrs)
    for doc in docs:
        doc_bin.add(doc)

    with open(filepath, 'wb') as f:
        f.write(doc_bin.to_bytes())
    with open(vocab_filepath, 'wb') as f:
        f.write(vocab.to_bytes())
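Since DocBin.get_docs needs a vocab that knows the same strings, a reading counterpart would reload the vocab file first. A minimal sketch (read_spacy_docs is a hypothetical helper, not part of the original module):

from spacy.tokens import DocBin
from spacy.vocab import Vocab

def read_spacy_docs(filepath, vocab_filepath):
    """Hypothetical counterpart to write_spacy_docs: reload the vocab, then rebuild the Docs."""
    vocab = Vocab()
    with open(vocab_filepath, 'rb') as f:
        vocab.from_bytes(f.read())
    with open(filepath, 'rb') as f:
        doc_bin = DocBin().from_bytes(f.read())
    return list(doc_bin.get_docs(vocab))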
Code Example #21
def docbin_writer(docs: Iterable[Doc], docbin_output_path: str):
    """Writes a stream of Spacy Doc objects to a binary file in the DocBin format."""

    import spacy.attrs
    # Creating the DocBin object (with all attributes)
    attrs = [
        spacy.attrs.LEMMA, spacy.attrs.TAG, spacy.attrs.DEP, spacy.attrs.HEAD,
        spacy.attrs.ENT_IOB, spacy.attrs.ENT_TYPE
    ]
    docbin = DocBin(attrs=attrs, store_user_data=True)

    # Storing the documents in the DocBin repository
    for doc in docs:
        doc.cats = {}
        docbin.add(doc)
    data = docbin.to_bytes()

    # And writing the content to the file
    print("Write to", docbin_output_path, end="...", flush=True)
    with open(docbin_output_path, "wb") as fd:
        fd.write(data)
    print("done")
Code Example #22
def test_serialize_doc_bin():
    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"],
                     store_user_data=True)
    texts = ["Some text", "Lots of texts...", "..."]
    cats = {"A": 0.5}
    nlp = English()
    for doc in nlp.pipe(texts):
        doc.cats = cats
        doc.spans["start"] = [doc[0:2]]
        doc[0].norm_ = "UNUSUAL_TOKEN_NORM"
        doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID"
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()

    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(bytes_data)
    reloaded_docs = list(doc_bin.get_docs(nlp.vocab))
    for i, doc in enumerate(reloaded_docs):
        assert doc.text == texts[i]
        assert doc.cats == cats
        assert len(doc.spans) == 1
        assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM"
        assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID"
Code Example #23
            token_row = classifiers_neg_gpc[classifiers_neg_gpc.form == token.text]
            if not token_row.empty:
                negative_token.append(token.text)
            else:
                token_row = classifiers_pos_gpc[classifiers_pos_gpc.form == token.text]
                if not token_row.empty:
                    positive_token.append(token.text)
    sentiment_tokens["positiv"] = positive_token
    sentiment_tokens["negativ"] = negative_token
    return sentiment_tokens

counter = 0
for f, protokoll in spacy_db.items():
    counter += 1
    print(counter)
    for rede in protokoll:
        rede.user_data["sentiws"] = sentiws_eval(rede)
    for rede in protokoll:
        rede.user_data["gpc"] = gpc_eval(rede)
    doc_bin = DocBin(
        attrs=["POS", "TAG", "LEMMA", "IS_STOP", "DEP", "SHAPE", "ENT_ID", "ENT_IOB", "ENT_KB_ID", "ENT_TYPE"],
        store_user_data=True)
    for doc in protokoll:
        doc_bin.add(doc)
    spacy_out = doc_bin.to_bytes()
    with open(file=("plenarprotokolle/pp19/" + f + ".sentiment"), mode="wb") as spacy_outfile:
        spacy_outfile.write(spacy_out)
Code Example #24
text = "I love coffee"

# Load base document
nlp = spacy.load("en_core_web_sm")
doc_base = nlp(text)
print("")
print_doc(doc_base)

# Serialize document to disk and bytes
doc_base.to_disk("doc.spacy")
doc_base_bytes = doc_base.to_bytes()

# Serialize using DocBin
docbin_base = DocBin(attrs=["ENT_IOB", "POS", "HEAD", "DEP", "ENT_TYPE"], store_user_data=True)
docbin_base.add(doc_base)
docbin_base_bytes = docbin_base.to_bytes()

# Restore document from disk
doc = Doc(Vocab())
doc.from_disk("doc.spacy")
print("")
print_doc(doc)

# Restore document from bytes
doc = Doc(Vocab())
doc.from_bytes(doc_base_bytes)
print("")
print_doc(doc)

# Restore using DocBin
docbin = DocBin().from_bytes(docbin_base_bytes)
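The snippet stops after rebuilding the DocBin; getting the Doc back out would presumably continue as follows (print_doc is the helper already used above):

docs = list(docbin.get_docs(nlp.vocab))
print("")
print_doc(docs[0])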
Code Example #25
    config.CPU_CORES = args.cores
    if config.CPU_CORES < 0 or config.CPU_CORES > cpu_count():
        raise ValueError('Invalid core number specified.')

    print('CPU_CORES:', config.CPU_CORES)
    return config


if __name__ == '__main__':
    conf = parse_config()
    df = pd.read_csv(conf.INPUT_FILE, usecols=['sentence_id', 'text'])

    if spacy.__version__[0] == '3':
        # spaCy v3 saves all attributes by default.
        doc_bin = DocBin(store_user_data=True)
    else:
        doc_bin = DocBin(["LEMMA", "ENT_TYPE", "POS", "DEP"],
                         store_user_data=True)

    for i, doc in enumerate(nlp.pipe(df['text'], n_process=conf.CPU_CORES)):
        if doc.is_parsed:
            doc.user_data['sentence_id'] = df.iloc[i, 0]
            doc_bin.add(doc)

    print('Writing file...')
    bytes_data = doc_bin.to_bytes()
    with open(conf.OUTPUT_FILE, 'wb+') as f:
        f.write(bytes_data)
    print('Exiting...')
Code Example #26
	def process(self):
		"""
		Reads text and outputs entities per text body.
		"""

		# prepare staging area
		staging_area = self.dataset.get_staging_area()

		self.dataset.update_status("Preparing data")

		# go through all archived token sets and vectorise them
		results = []

		# Load the spacy goods
		nlp = spacy.load("en_core_web_sm")
		nlp.tokenizer = self.custom_tokenizer(nlp)  # Keep words with a dash in between

		# Disable what has _not_ been selected
		options = ["parser", "tagger", "ner"]
		enable = self.parameters.get("enable", False)

		if not enable:
			self.dataset.update_status("Select at least one of the options.")
			self.dataset.finish(0)
			return

		disable = [option for option in options if option not in enable]

		with open(self.source_file, encoding="utf-8") as source:

			# Get all ze text first so we can process it in batches
			csv_reader = csv.DictReader(source)
			posts = [post["body"] if post["body"] else "" for post in csv_reader]

			# Process the text in batches
			if len(posts) < 100000:
				self.dataset.update_status("Extracting linguistic features")
			else:
				self.dataset.update_status(
					"Extracting linguistic features is currently only available for datasets with less than 100.000 items.")
				self.dataset.finish(0)
				return

			# Make sure only the needed information is extracted.
			attrs = []
			if "tagger" not in disable:
				attrs.append("POS")
			if "parser" not in disable:
				attrs.append("DEP")
			if "ner":
				attrs.append("ENT_IOB")
				attrs.append("ENT_TYPE")
				attrs.append("ENT_ID")
				attrs.append("ENT_KB_ID")

			# DocBin for quick saving
			doc_bin = DocBin(attrs=attrs)

			# Start the processing!
			for i, doc in enumerate(nlp.pipe(posts, disable=disable)):
				try:
					doc_bin.add(doc)
				except MemoryError:
					self.dataset.update_status("Out of memory while parsing data. Try again with a smaller dataset.", is_final=True)
					return

				# It's quite a heavy process, so make sure it can be interrupted
				if self.interrupted:
					raise ProcessorInterruptedException("Processor interrupted while iterating through CSV file")

				if i % 1000 == 0:
					self.dataset.update_status("Done with post %s out of %s" % (i, len(posts)))

			self.dataset.update_status("Serializing results - this will take a while")

			# Then serialize the NLP docs and the vocab
			doc_bytes = doc_bin.to_bytes()

		# Dump ze data in a temporary folder
		with staging_area.joinpath("spacy_docs.pb").open("wb") as outputfile:
			pickle.dump(doc_bytes, outputfile)

		# create zip of archive and delete temporary files and folder
		self.write_archive_and_finish(staging_area, compression=zipfile.ZIP_LZMA)
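A downstream consumer of the staged file would presumably unpickle the DocBin bytes and rebuild the docs. A minimal sketch (the file path and model name here are assumptions):

import pickle

import spacy
from spacy.tokens import DocBin

with open("spacy_docs.pb", "rb") as infile:
    doc_bytes = pickle.load(infile)

nlp = spacy.load("en_core_web_sm")
docs = list(DocBin().from_bytes(doc_bytes).get_docs(nlp.vocab))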