Example #1
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=4):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            doc_bin.add(doc)
    msg.good(f"Processed {len(doc_bin)} docs")
    doc_bin_bytes = doc_bin.to_bytes()
    output_file = output_path / f"{input_path.stem}.spacy"
    with output_file.open("wb") as f:
        f.write(doc_bin_bytes)
    msg.good(f"Saved parsed docs to file", output_file.resolve())
Example #2
def write_docs_to_bin(docs: List[Doc], fname: str) -> None:
    doc_bin = DocBin(attrs=attrs)  # `attrs` is defined at module level in the source project
    for doc in docs:
        doc_bin.add(doc)
    with open(fname, 'wb') as fh:
        doc_bin_bytes = doc_bin.to_bytes()
        fh.write(doc_bin_bytes)
Example #3
def load_or_create_spacy_doc(sents, do_preprocess, use_cache, verbose):
    """
    @sents list of string to be tokenized.
    @use_cache if true, try load from disk first. Otherwise, tokenize.
    @return DocBin object
    """
    
    if do_preprocess:
        sents = preprocess(sents)
        
    fname = SPACY_DOC_PATH + str(do_preprocess) + hash_sents(sents) + ".bin"
    
    if os.path.exists(fname) and use_cache:
        now = time.time()
        if verbose:
            print("Loading tokenized document from disk...")
        with open(fname, "rb") as f:
            doc_bin = DocBin(attrs=["POS", "ENT_TYPE"]).from_bytes(f.read())
        if verbose:
            print("Finished loading tokenized document in {:.2f}s!".format(time.time() - now))
        return doc_bin
    else:
        now = time.time()
        if verbose:
            print("Start tokenizing document...")
        doc_bin = DocBin(attrs=["POS", "ENT_TYPE"])
        for doc in nlp.pipe(sents, disable=["parser"]):
            doc_bin.add(doc)
        with open(fname, "wb") as f:
            f.write(doc_bin.to_bytes())
        if verbose:
            print("Finish tokenizing document and save to disk in {:.2f}s!".format(time.time() - now))
        return doc_bin
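The function returns a DocBin rather than Doc objects; a short hedged sketch of pulling the Docs back out, which needs the vocab of the same global nlp pipeline that produced them:

doc_bin = load_or_create_spacy_doc(sents, do_preprocess=True, use_cache=True, verbose=True)
docs = list(doc_bin.get_docs(nlp.vocab))
pos_tags = [[token.pos_ for token in doc] for doc in docs]  # "POS" was included in attrs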
Example #4
def convert_file(
        input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
        output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])

    header = True
    with open(input_path, "r") as in_f, open(output_path, "w") as out_f:
        for line in tqdm(in_f):
            if header:
                header = False
                continue
            sentence, tokens = pd.read_csv(StringIO(line),
                                           header=None,
                                           usecols=[0, 1]).values[0]
            # ast.literal_eval (stdlib) is safer than eval for the serialized token list
            tokens = ast.literal_eval(tokens)
            eg = line_to_dict(sentence, tokens)

            if eg["answer"] != "accept":
                continue
            tokens = [token["text"] for token in eg["tokens"]]
            words, spaces = get_words_and_spaces(tokens, eg["text"])
            doc = Doc(nlp.vocab, words=words, spaces=spaces)
            doc.ents = [
                doc.char_span(s["start"], s["end"], label=s["label"])
                for s in eg.get("spans", [])
            ]
            doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path}")
Example #5
def build_vocab(water_bodies: Dict, nlp: Language):
    """Load new vocab and wikidata.
    
    Parameters
    ----------
    water_bodies : Dict
        Dictionary containing the list of new water bodies to be loaded.
        Format:
        {
            "LAKE": [(Name, Wiki_Id), ...],
            "RIVER": [(Name, Wiki_Id), ...],
            ...
        }
    nlp: Language
        spacy nlp object
    """

    vocab = {}
    wikidata = {}
    doc_bins_bytes = {}
    stop_words = set(srsly.read_json(stop_words_file)['stop_words'])

    for key in water_bodies:
        doc_bin = DocBin()
        for wb, _ in tqdm(water_bodies[key], desc=f'Loading {key}(s)'):
            doc_bin.add(nlp(wb))
        doc_bins_bytes[key] = doc_bin.to_bytes()

        if key not in wikidata:
            wikidata[key] = {}
        for name, wiki_id in water_bodies[key]:
            wikidata[key][name.lower()] = wiki_id

        vocab[str(nlp.vocab.strings[key])] = key
    write_data_files(vocab, wikidata, stop_words, doc_bins_bytes)
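A hedged usage sketch for build_vocab; the names and Wikidata IDs below are placeholders, and write_data_files plus stop_words_file come from the surrounding project:

import spacy

nlp = spacy.load("en_core_web_sm")
water_bodies = {
    "LAKE": [("Lake Example", "Q000001")],    # placeholder (Name, Wiki_Id) pairs
    "RIVER": [("Example River", "Q000002")],
}
build_vocab(water_bodies, nlp)  # serializes one DocBin per category and writes the data files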
Example #6
def convert(json_path, output):
    db = DocBin()
    for line in srsly.read_jsonl(json_path):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output)
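This converter, like the two variants that follow, assumes a JSONL file in which every line carries a text and a cats dict. A small sketch of producing such a file with srsly (texts, labels, and the filename are made up):

import srsly

records = [
    {"text": "Great product, would buy again.", "cats": {"POSITIVE": 1.0, "NEGATIVE": 0.0}},
    {"text": "Arrived broken and late.", "cats": {"POSITIVE": 0.0, "NEGATIVE": 1.0}},
]
srsly.write_jsonl("textcat_train.jsonl", records)  # one JSON object per line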
Example #7
def convert(lang: str, input_path: Path, output_path: Path):
    nlp = spacy.blank(lang)
    db = DocBin()
    for line in srsly.read_jsonl(input_path):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output_path)
Example #8
def convert(output_path):
    global nlp
    db = DocBin()
    for line in srsly.read_jsonl("db.json"):
        doc = nlp.make_doc(line["text"])
        doc.cats = line["cats"]
        db.add(doc)
    db.to_disk(output_path)
Example #9
def main(
    # fmt: off
    in_file: str = typer.Argument(..., help="Path to input file"),
    out_dir: str = typer.Argument(..., help="Path to output directory"),
    spacy_model: str = typer.Argument("en_core_web_sm",
                                      help="Name of spaCy model to use"),
    n_process: int = typer.Option(
        1, "--n-process", "-n", help="Number of processes (multiprocessing)"),
    max_docs: int = typer.Option(10**6,
                                 "--max-docs",
                                 "-m",
                                 help="Maximum docs per batch"),
    # fmt: on
):
    """
    Step 1: Parse raw text with spaCy

    Expects an input file with one sentence per line and will output a .spacy
    file of the parsed collection of Doc objects (DocBin).
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    doc_bin = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
    msg.text("Preprocessing text...")
    count = 0
    batch_num = 0
    with input_path.open("r", encoding="utf8") as texts:
        docs = nlp.pipe(texts, n_process=n_process)
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            if count < max_docs:
                doc_bin.add(doc)
                count += 1
            else:
                batch_num += 1
                msg.good(f"Processed {len(doc_bin)} docs")
                doc_bin_bytes = doc_bin.to_bytes()
                output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
                with output_file.open("wb") as f:
                    f.write(doc_bin_bytes)
                msg.good("Saved parsed docs to file", output_file.resolve())
                doc_bin = DocBin(
                    attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
                # start the new batch with the current doc so it is not dropped
                doc_bin.add(doc)
                count = 1
        batch_num += 1
        output_file = output_path / f"{input_path.stem}-{batch_num}.spacy"
        with output_file.open("wb") as f:
            doc_bin_bytes = doc_bin.to_bytes()
            f.write(doc_bin_bytes)
            msg.good(f"Complete. Saved final parsed docs to file",
                     output_file.resolve())
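Because this variant writes one .spacy file per batch, the batches may need to be recombined later. A hedged sketch using DocBin.merge, which appends another DocBin's docs in place (the output directory name is illustrative):

from pathlib import Path
from spacy.tokens import DocBin

merged = DocBin(attrs=["POS", "TAG", "DEP", "ENT_TYPE", "ENT_IOB"])
for batch_file in sorted(Path("out_dir").glob("*.spacy")):  # hypothetical output directory
    merged.merge(DocBin().from_disk(batch_file))
print(f"Merged {len(merged)} docs")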
Example #10
    def generate_corpus(nlp):
        directory_path = path.join('data')
        
        corpus_path = Path(path.join(directory_path, file_name) + ".spacy")
        raw_path = Path(path.join(directory_path, file_name) + ".jsonl")

        if exists(corpus_path):
            return Corpus(corpus_path)(nlp)

        vulnerabilities = []
        with open(raw_path) as file:
            for line in file.readlines():
                vulnerability = loads(line)

                vulnerabilities.append({'description': vulnerability['data'], 'entities': vulnerability.get('label', [])})
                

        corpus = DocBin(attrs=["TAG", "ENT_IOB", "ENT_TYPE", "POS"])

        for vulnerability in vulnerabilities:
            document = nlp.make_doc(vulnerability['description'].lower())
            #print(vulnerability)
            #print(len(document))
            #iob =  [f"{token.ent_iob_}-{token.ent_type_}" if token.ent_iob_ != "O" else "O" for token in doc]
            #biluo = iob_to_biluo(iob)
            #print(biluo)
            
            
            #document.set_ents([Span(document, entity[0], entity[1], entity[2]) for entity in vulnerability['entities']])
            #document.set_ents(list(document.ents))

            tags = offsets_to_biluo_tags(document, vulnerability['entities'])
            entities = biluo_tags_to_spans(document, tags)
            document.set_ents(entities)
            '''
            Problem: doccano annotates labels at the character level, but
            nlp.make_doc produces tokens, so the character offsets have to be
            aligned to token boundaries first.
            '''
            #print(document.has_annotation(1)) #ID of "SOFTWARE"

            # all good!
            ents = list(document.ents)
            for i, _ in enumerate(ents):
                print(ents[i].label_)
                print(ents[i].text)
                print('\n')


            print('\nOK\n')   
            #exit()
            corpus.add(document)
            
        print(len(corpus))
        print(list(corpus.get_docs(nlp.vocab)))
        corpus.to_disk(corpus_path)
    
        if exists(corpus_path):
            return Corpus(corpus_path)(nlp)
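The translated comment above names the real pitfall: doccano stores character offsets, while nlp.make_doc only tokenizes, so offsets that do not line up with token boundaries cannot become entity spans. A small sketch of how the mismatch surfaces in spaCy's converters (text and offsets are made up):

import spacy
from spacy.training import offsets_to_biluo_tags

nlp = spacy.blank("en")
doc = nlp.make_doc("OpenSSL 1.0.2 is affected")
# the offset ends mid-token, so that token's tag comes back as "-" (unalignable)
print(offsets_to_biluo_tags(doc, [(0, 4, "SOFTWARE")]))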
Example #11
def bin_inscriptions(corpus):
    """ put the texts into the docbin """
    doc_bin = DocBin(attrs=["LEMMA", "TAG", "POS", "DEP", "HEAD"],
                     store_user_data=True)
    for c in corpus:
        doc = nlp(c)
        doc_bin.add(doc)

    with open('dbg.bin', 'wb') as f:
        f.write(doc_bin.to_bytes())
Example #12
def convert(lang: str, input_path: Path, output_path: Path):
    nlp = spacy.blank(lang)
    in_db = DocBin().from_disk(input_path)
    out_db = DocBin()
    logging.info(f"Read {len(in_db)} documents from {input_path}.")
    for doc in in_db.get_docs(nlp.vocab):
        new_doc = nlp.make_doc(doc.text)
        new_doc.user_data = doc.user_data
        new_doc.ents = doc.ents
        out_db.add(new_doc)
    out_db.to_disk(output_path)
Example #13
def json_path_to_examples(data_path, NLP):
    data = srsly.read_json(data_path)
    # no good way to convert with a specified vocab, so convert, then reload
    # through DocBin with the right vocab
    docs = json_to_docs(data)
    docbin = DocBin()
    for doc in docs:
        docbin.add(doc)
    docs = docbin.get_docs(NLP.vocab)
    examples = [Example(NLP.make_doc(doc.text), doc) for doc in docs]
    return examples
Example #14
def test_serialize_doc_bin():
    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
    texts = ["Some text", "Lots of texts...", "..."]
    nlp = English()
    for doc in nlp.pipe(texts):
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()

    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(bytes_data)
    list(doc_bin.get_docs(nlp.vocab))
Example #15
def write_docs(texts, attrs, lang, file, provider='spacy'):
    from tqdm import tqdm
    nlp = get_model(lang, provider=provider)
    doc_bin = DocBin(attrs=[a.upper() for a in attrs], store_user_data=True)
    # doc_bin = DocBin(attrs=["DEP", "HEAD"])
    # for doc in nlp.pipe(texts):
    # tqdm just wraps a loop; wrapping nlp.pipe means the progress bar advances
    # as each processed doc is yielded (spaCy still batches internally)
    for doc in tqdm(nlp.pipe(texts)):
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()
    write_doc_to(bytes_data, file)
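As the comment notes, tqdm only wraps the generator returned by nlp.pipe, so it cannot know the total number of docs up front. If texts is a list (an assumption), passing total gives a complete progress bar:

for doc in tqdm(nlp.pipe(texts), total=len(texts)):
    doc_bin.add(doc)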
Example #16
def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag,
                                    reader_value):
    """Test that custom extensions are correctly serialized in DocBin."""
    Doc.set_extension("foo", default="nothing")
    doc = Doc(en_vocab, words=["hello", "world"])
    doc._.foo = "bar"
    doc_bin_1 = DocBin(store_user_data=writer_flag)
    doc_bin_1.add(doc)
    doc_bin_bytes = doc_bin_1.to_bytes()
    doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes)
    doc_2 = list(doc_bin_2.get_docs(en_vocab))[0]
    assert doc_2._.foo == reader_value
    Underscore.doc_extensions = {}
Example #17
def main(
        input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
        output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin()
    data_tuples = ((eg["text"], eg) for eg in srsly.read_jsonl(input_path))
    for doc, eg in nlp.pipe(data_tuples, as_tuples=True):
        # doc.cats = {category: 0 for category in CATEGORIES}
        doc.cats[eg["label"]] = 1
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
Example #18
def test_issue4528(en_vocab):
    """Test that user_data is correctly serialized in DocBin."""
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    # This is how extension attribute values are stored in the user data
    doc.user_data[("._.", "foo", None, None)] = "bar"
    doc_bin = DocBin(store_user_data=True)
    doc_bin.add(doc)
    doc_bin_bytes = doc_bin.to_bytes()
    new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
    new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
    assert new_doc.user_data["foo"] == "bar"
    assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
Example #19
    def parse_dataset(self,
                      in_dataset_path,
                      out_dataset_path,
                      save_as_str=False,
                      condition_func=None):
        in_dataset = self.load_dataset(in_dataset_path,
                                       out_dataset_path,
                                       binary=False)

        if in_dataset is None:
            return None

        start_time = time.time()
        sents = []
        doc_bin = DocBin(attrs=[
            "ID", "ORTH", "LEMMA", "TAG", "POS", "HEAD", "DEP", "ENT_IOB",
            "ENT_TYPE"
        ],
                         store_user_data=True)

        for sent in in_dataset:
            doc = self.parse_sentence(sent)

            if doc is None:
                continue

            if condition_func and condition_func(doc):
                continue

            doc_bin.add(doc)
            sents.append(sent)

            self.print_status(len(doc_bin), out_dataset_path, start_time)
            if not self.check_dataset_size(doc_bin):
                break

        print(
            f"The dataset {basename(out_dataset_path)} contains {len(doc_bin)} parsed sentences"
        )

        os.makedirs(dirname(out_dataset_path), exist_ok=True)

        # Save the resulting dataset as plain text if requested
        if save_as_str:
            with open(out_dataset_path, "w") as target_file:
                target_file.writelines(sents)

        # Save the parsed dataset (note: this writes to the same out_dataset_path,
        # overwriting the text version written above when save_as_str is True)
        with open(out_dataset_path, "wb") as parsed_file:
            parsed_file.write(doc_bin.to_bytes())
Example #20
def main():
    keywords_df = pd.read_csv('data/keywords.csv')
    keywords_dic = dict(zip(keywords_df['keyword'], keywords_df['entity']))
    data = pd.read_csv('data/taged_all.csv')

    cd = CleanData()
    data_clean = cd.normalize_text(data.copy())
    data_clean['keywords'] = data_clean['clean_text'].str.split()

    doc_entity_df = doc_entity(data_clean, keywords_dic)

    doc_entity_df = doc_entity_df.merge(data_clean[['id', 'target',
                                                    'predict']],
                                        how='left',
                                        left_on='id',
                                        right_on='id')
    doc_entity_df.set_index('id', inplace=True)
    doc_entity_df.to_csv('data/doc_entity_df.csv', index=True, header=True)

    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"],
                     store_user_data=True)
    texts = [
        "Disaster control teams are studying ways to evacuate the port area in response to tidal wave warnings.[900037]"
    ]
    nlp = spacy.load("en_core_web_md")
    for doc in nlp.pipe(texts):
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()

    # Read and write binary file
    with open('data/sample', "wb") as out_file:
        out_file.write(bytes_data)

    with open('data/sample', "rb") as in_file:
        data = in_file.read()
        in_file.close()

    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(data)
    docs = list(doc_bin.get_docs(nlp.vocab))

    # ###################################################################################
    data = pd.read_csv('data/taged_all.csv')

    # note: nlp was re-assigned to the blank pipeline above, so these docs have no
    # entities and the final print will show an empty list
    for row in tqdm(data['text'], total=data.shape[0]):
        doc = nlp(row)
        doc.to_disk('data/sample')
    print([(X.text, X.label_) for X in doc.ents])
Example #21
def convert(input_path, output_path, lang='en'):
    nlp = spacy.blank(lang)
    db = DocBin()
    for text, annot in srsly.read_json(input_path):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is None:
                print("Skipping entity")
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    db.to_disk(output_path)
Example #22
def convert(lang: str, input_path: Path, output_path: Path):
    nlp = spacy.blank(lang)
    db = DocBin()
    for text, annot in srsly.read_json(input_path):
        doc = nlp.make_doc(text)
        ents = []
        for start, end, label in annot["entities"]:
            span = doc.char_span(start, end, label=label)
            if span is None:
                msg = f"Skipping entity [{start}, {end}, {label}] in the following text because the character span '{doc.text[start:end]}' does not align with token boundaries:\n\n{repr(text)}\n"
                warnings.warn(msg)
            else:
                ents.append(span)
        doc.ents = ents
        db.add(doc)
    db.to_disk(output_path)
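When char_span returns None here, the entity is simply dropped. char_span also accepts an alignment_mode argument ("contract" or "expand") that snaps slightly misaligned offsets to token boundaries; a hedged drop-in variant of the inner loop body:

span = doc.char_span(start, end, label=label, alignment_mode="contract")
if span is None:
    warnings.warn(f"Still unalignable after snapping inward, skipping [{start}, {end}, {label}]")
else:
    ents.append(span)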
Example #23
File: nlp.py Project: jdbuysse/lgap
def process_uploaded_file(f, title):
    doc_bin = DocBin(
        attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "POS", "TAG", "HEAD", "DEP"],
        store_user_data=True)
    # add newlines using spacy's sentence detection
    f = lineizer(f)
    # this assumes a text whose sentences are split onto separate lines
    doclist = f
    for doc in nlp.pipe(doclist):
        print(doc)
        doc_bin.add(doc)
    # for doc in nlp.pipe():
    #      print(doc)
    #      doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()
    with open(f"media/{title}", "wb") as binary_file:
        binary_file.write(bytes_data)
Example #24
def main(
        input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
        output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    for eg in srsly.read_json(input_path):
        print(eg)

        doc = nlp(eg[0])
        doc.ents = [
            doc.char_span(s[0], s[1], label=s[2])
            for s in eg[1].get("entities", [])
        ]
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
Example #25
def spacy_parse_store_from_dataframe(fname, df, nlp):
    chunk_size = 10000
    chunks = math.ceil(len(df) / chunk_size)
    start_time = time.time()
    for chunk in range(chunks):
        start = chunk * chunk_size
        end = start + chunk_size
        chunk_df = df.iloc[start:end]
        chunk_fname = fname + f'_{chunk}'
        doc_bin = DocBin(attrs=attrs)
        for ti, text in enumerate(get_dataframe_review_texts(chunk_df)):
            doc = nlp(text)
            doc_bin.add(doc)
            if (ti + 1) % 1000 == 0:
                print(ti + 1, 'reviews parsed in chunk', chunk, '\ttime:',
                      time.time() - start_time)
        with open(chunk_fname, 'wb') as fh:
            fh.write(doc_bin.to_bytes())
Example #26
    def to_spacy(self, df, file_path=None):
        """
        Function to convert dataframe returned by annotator into spacy .

        Parameters
        ----------
        df (pandas DataFrame): Dataframe returned by the annotator (see Annotate()).
        file_path (str): Filepath (including filename) to save the .spacy file to.
        
        Returns
        -------
        Spacy docbin if a user wants to combine additional training data
        """

        if (not isinstance(df, pd.DataFrame)):
            raise TypeError("Pass the pandas dataframe returned by annotate()")

        if file_path and (not isinstance(file_path, str)):
            raise TypeError("The file_path must be a string or None")

        if file_path is None:
            file_path = os.path.join(os.getcwd(), 'annotations.spacy')

        db = DocBin()
        training_data = [ant for ant in df['annotations'].tolist() if ant]
        for text, annotations in training_data:

            ents = []
            doc = self.nlp(text)
            for start, end, label in annotations['entities']:
                span = doc.char_span(start, end, label=label)
                # char_span returns None when offsets don't align with token
                # boundaries; skip those instead of passing None to filter_spans
                if span is not None:
                    ents.append(span)

            # Drop overlapping spans. Note: when spans overlap, the (first) longest span is preferred over shorter spans.
            # See: https://spacy.io/api/top-level#util.filter_spans
            # TODO: alert users that some spans have been dropped.
            doc.ents = filter_spans(ents)

            db.add(doc)

        db.to_disk(file_path)
        print(f"Spacy file saved to: {file_path}")

        return db
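The comment inside to_spacy describes how filter_spans resolves overlaps; a tiny self-contained check of the "longest span wins" rule (the sentence is illustrative):

import spacy
from spacy.util import filter_spans

nlp = spacy.blank("en")
doc = nlp("the New York City subway")
overlapping = [doc[1:4], doc[1:3]]  # "New York City" vs "New York"
print(filter_spans(overlapping))    # keeps only the longer span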
Example #27
def test_serialize_doc_bin():
    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"],
                     store_user_data=True)
    texts = ["Some text", "Lots of texts...", "..."]
    cats = {"A": 0.5}
    nlp = English()
    for doc in nlp.pipe(texts):
        doc.cats = cats
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()

    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(bytes_data)
    reloaded_docs = list(doc_bin.get_docs(nlp.vocab))
    for i, doc in enumerate(reloaded_docs):
        assert doc.text == texts[i]
        assert doc.cats == cats
Example #28
def descrip_to_spacy(data: pd.DataFrame, output_path: str) -> None:
    "Takes in dataframe with description and label to save DocBin to disk"
    tuples = data.apply(lambda row:
                        (strip_html_tags(row["description"]), row["fraud"]),
                        axis=1).to_list()
    nlp = spacy.blank("en")
    db = DocBin()

    for doc, label in nlp.pipe(tuples, as_tuples=True):
        if label:
            doc.cats["FRAUD"] = 1
            doc.cats["NOTFRAUD"] = 0
        else:
            doc.cats["FRAUD"] = 0
            doc.cats["NOTFRAUD"] = 1

        db.add(doc)

    db.to_disk(output_path)
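A hedged usage sketch for descrip_to_spacy with a toy DataFrame; the column names follow the function body, and strip_html_tags is the project's own helper:

import pandas as pd

toy = pd.DataFrame({
    "description": ["<p>Send us a wire transfer today!</p>", "<p>Monthly newsletter</p>"],
    "fraud": [True, False],
})
descrip_to_spacy(toy, "descriptions.spacy")  # writes a DocBin with FRAUD/NOTFRAUD cats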
Example #29
def convert_file(
        input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
        output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    for eg in tqdm(srsly.read_jsonl(input_path)):
        if eg["answer"] != "accept":
            continue
        tokens = [token["text"] for token in eg["tokens"]]
        words, spaces = get_words_and_spaces(tokens, eg["text"])
        doc = Doc(nlp.vocab, words=words, spaces=spaces)
        doc.ents = [
            doc.char_span(s["start"], s["end"], label=s["label"])
            for s in eg.get("spans", [])
        ]
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")
Example #30
def main(
        input_path: Path = typer.Argument(..., exists=True, dir_okay=False),
        output_path: Path = typer.Argument(..., dir_okay=False),
):
    nlp = spacy.blank("en")
    doc_bin = DocBin(attrs=["ENT_IOB", "ENT_TYPE"])
    for idx, eg in enumerate(srsly.read_jsonl(input_path)):
        if idx % 10000 == 0:
            print(f"converted {idx} sentences")
        doc = nlp(eg["text"])
        spans_from_json = eg.get("spans", [])
        spans_objects = [
            doc.char_span(s["start"], s["end"], label=s["label"])
            for s in spans_from_json
        ]
        spans_objects = filter_spans(spans_objects)
        doc.ents = spans_objects
        doc_bin.add(doc)
    doc_bin.to_disk(output_path)
    print(f"Processed {len(doc_bin)} documents: {output_path.name}")