Example #1
    def generate_corpus(nlp):
        directory_path = path.join('app', 'static', 'pickleFiles', 'training_testing')

        corpus_path = Path(path.join(directory_path, file_name) + ".spacy")
        raw_path = Path(path.join(directory_path, file_name) + ".jsonl")

        if exists(corpus_path):
            return Corpus(corpus_path)(nlp)

        # Read the doccano JSONL export: one JSON object per line, with the raw
        # text under 'data' and character-offset annotations under 'label'.
        vulnerabilities = []
        with open(raw_path) as file:
            for line in file:
                vulnerability = loads(line)
                vulnerabilities.append({
                    'description': vulnerability['data'],
                    'entities': vulnerability.get('label', []),
                })

        corpus = DocBin(attrs=["TAG", "ENT_IOB", "ENT_TYPE", "POS"])

        for vulnerability in vulnerabilities:
            document = nlp.make_doc(vulnerability['description'].lower())

            # doccano annotates labels at the character level, while nlp.make_doc
            # produces tokens, so convert the character offsets to token-level
            # BILUO tags and then to entity spans.
            tags = offsets_to_biluo_tags(document, vulnerability['entities'])
            entities = biluo_tags_to_spans(document, tags)
            document.set_ents(entities)

            # Debug output: the converted entities line up with the annotations.
            for entity in document.ents:
                print(entity.label_)
                print(entity.text)
                print('\n')
            print('\nOK\n')

            corpus.add(document)

        print(len(corpus))
        print(list(corpus.get_docs(nlp.vocab)))
        corpus.to_disk(corpus_path)

        if exists(corpus_path):
            return Corpus(corpus_path)(nlp)
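The translated note in Example #1 points at the central mismatch: doccano exports character offsets, while nlp.make_doc produces tokens. The following minimal sketch (invented text and label, not taken from the vulnerability data) shows how offsets_to_biluo_tags and biluo_tags_to_spans bridge the two before the Doc goes into a DocBin; it assumes the offsets align with token boundaries, since misaligned offsets produce "-" tags.

import spacy
from spacy.tokens import DocBin
from spacy.training import offsets_to_biluo_tags, biluo_tags_to_spans

nlp = spacy.blank("en")
doc = nlp.make_doc("openssl allows remote attackers to cause a crash")

# Character-level annotation, e.g. as exported by doccano: (start, end, label)
char_entities = [(0, 7, "SOFTWARE")]

tags = offsets_to_biluo_tags(doc, char_entities)   # ['U-SOFTWARE', 'O', ...]
doc.set_ents(biluo_tags_to_spans(doc, tags))

doc_bin = DocBin(attrs=["TAG", "ENT_IOB", "ENT_TYPE", "POS"])
doc_bin.add(doc)
doc_bin.to_disk("training.spacy")  # hypothetical output path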
Example #2
def docbin2docs(docbin_bytes: bytes,
                language: t.Union[str, Language],
                similarity_method: int = 0) -> t.Tuple[Doc, ...]:
    if isinstance(language, str):
        language = blank(language, similarity_method)

    docbin = DocBin().from_bytes(docbin_bytes)

    return tuple(docbin.get_docs(language.vocab))
Example #3
def convert(lang: str, input_path: Path, output_path: Path):
    nlp = spacy.blank(lang)
    in_db = DocBin().from_disk(input_path)
    out_db = DocBin()
    logging.info(f"Read {len(in_db)} documents from {input_path}.")
    for doc in in_db.get_docs(nlp.vocab):
        new_doc = nlp.make_doc(doc.text)
        new_doc.user_data = doc.user_data
        new_doc.ents = doc.ents
        out_db.add(new_doc)
    out_db.to_disk(output_path)
Example #4
def json_path_to_examples(data_path, NLP):
    data = srsly.read_json(data_path)
    # no good way to convert with a specified vocab, so convert, then reload
    # through DocBin with the right vocab
    docs = json_to_docs(data)
    docbin = DocBin()
    for doc in docs:
        docbin.add(doc)
    docs = docbin.get_docs(NLP.vocab)
    examples = [Example(NLP.make_doc(doc.text), doc) for doc in docs]
    return examples
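The comment in json_path_to_examples describes a vocab-rebinding trick: docs created under one Vocab are pushed through a DocBin and read back with the target pipeline's Vocab. A minimal, self-contained sketch of just that step (illustrative names and text, not part of the original module):

import spacy
from spacy.tokens import DocBin

source_nlp = spacy.blank("en")
target_nlp = spacy.blank("en")

# Doc created under one vocab...
doc = source_nlp("The quick brown fox")

# ...re-materialized under another vocab via DocBin
rebound = list(DocBin(docs=[doc]).get_docs(target_nlp.vocab))[0]

assert rebound.text == doc.text
assert rebound.vocab is target_nlp.vocab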
Example #5
def read_spacy_docs(
    filepath: types.PathLike,
    *,
    format: str = "binary",
    lang: Optional[types.LangLike] = None,
) -> Iterable[Doc]:
    """
    Read the contents of a file at ``filepath``, written in binary or pickle format.

    Args:
        filepath: Path to file on disk from which data will be read.
        format ({"binary", "pickle"}): Format of the data that was written to disk.
            If "binary", uses :class:`spacy.tokens.DocBin` to deserialie data;
            if "pickle", uses python's stdlib ``pickle``.

            .. warning:: Docs written in pickle format were saved all together
               as a list, which means they're all loaded into memory at once
               before streaming one by one. Mind your RAM usage, especially when
               reading many docs!

        lang: Language with which spaCy originally processed docs, represented as
            the full name of or path on disk to the pipeline, or an already instantiated
            pipeline instance.
            Note that this is only required when ``format`` is "binary".

    Yields:
        Next deserialized document.

    Raises:
        ValueError: if format is not "binary" or "pickle", or if ``lang`` is None
            when ``format="binary"``
    """
    if format == "binary":
        if lang is None:
            raise ValueError(
                "lang=None is invalid. When format='binary', a `spacy.Language` "
                "(well, its associated `spacy.Vocab`) is required to deserialize "
                "the binary data. Note that this should be the same language pipeline "
                "used when processing the original docs!")
        else:
            lang = spacier.utils.resolve_langlike(lang)
        docbin = DocBin().from_disk(filepath)
        for doc in docbin.get_docs(lang.vocab):
            yield doc

    elif format == "pickle":
        with io_utils.open_sesame(filepath, mode="rb") as f:
            for spacy_doc in pickle.load(f):
                yield spacy_doc

    else:
        raise ValueError(
            errors.value_invalid_msg("format", format, {"binary", "pickle"}))
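For context, a writer that produces files this reader can consume might look like the sketch below (a hypothetical helper, not part of the original module): binary data goes through DocBin.to_disk, while pickle stores all docs together as one list, which is exactly why the docstring above warns about memory usage.

import pickle
from spacy.tokens import DocBin

def write_spacy_docs_sketch(docs, filepath, format="binary"):
    # Hypothetical counterpart to read_spacy_docs above.
    if format == "binary":
        DocBin(docs=list(docs)).to_disk(filepath)
    elif format == "pickle":
        with open(filepath, "wb") as f:
            pickle.dump(list(docs), f)  # all docs pickled as a single list
    else:
        raise ValueError(f"invalid format: {format!r}")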
Example #6
def test_serialize_doc_bin():
    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"], store_user_data=True)
    texts = ["Some text", "Lots of texts...", "..."]
    nlp = English()
    for doc in nlp.pipe(texts):
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()

    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(bytes_data)
    list(doc_bin.get_docs(nlp.vocab))
Example #7
def test_serialize_custom_extension(en_vocab, writer_flag, reader_flag,
                                    reader_value):
    """Test that custom extensions are correctly serialized in DocBin."""
    Doc.set_extension("foo", default="nothing")
    doc = Doc(en_vocab, words=["hello", "world"])
    doc._.foo = "bar"
    doc_bin_1 = DocBin(store_user_data=writer_flag)
    doc_bin_1.add(doc)
    doc_bin_bytes = doc_bin_1.to_bytes()
    doc_bin_2 = DocBin(store_user_data=reader_flag).from_bytes(doc_bin_bytes)
    doc_2 = list(doc_bin_2.get_docs(en_vocab))[0]
    assert doc_2._.foo == reader_value
    Underscore.doc_extensions = {}
Example #8
def read_files(file: Path, nlp: "Language") -> Iterable[Example]:
    """Custom reader that keeps the tokenization of the gold data,
    and also adds the gold GGP annotations as we do not attempt to predict these."""
    doc_bin = DocBin().from_disk(file)
    docs = doc_bin.get_docs(nlp.vocab)
    for gold in docs:
        pred = Doc(
            nlp.vocab,
            words=[t.text for t in gold],
            spaces=[t.whitespace_ for t in gold],
        )
        pred.ents = gold.ents
        yield Example(pred, gold)
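A reader like read_files above is typically wired into a spaCy v3 training config through the readers registry. The sketch below uses a hypothetical registry name and assumes read_files is importable from the project's code; the commented config excerpt shows how it would be referenced.

from pathlib import Path
from typing import Callable, Iterable

import spacy
from spacy.language import Language
from spacy.training import Example


@spacy.registry.readers("docbin_gold_reader.v1")  # hypothetical registry name
def create_docbin_reader(file: Path) -> Callable[[Language], Iterable[Example]]:
    # The training loop calls the returned function with the nlp object.
    return lambda nlp: read_files(file, nlp)

# config.cfg excerpt (illustrative):
# [corpora.train]
# @readers = "docbin_gold_reader.v1"
# file = "train.spacy"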
Example #9
def test_issue4528(en_vocab):
    """Test that user_data is correctly serialized in DocBin."""
    doc = Doc(en_vocab, words=["hello", "world"])
    doc.user_data["foo"] = "bar"
    # This is how extension attribute values are stored in the user data
    doc.user_data[("._.", "foo", None, None)] = "bar"
    doc_bin = DocBin(store_user_data=True)
    doc_bin.add(doc)
    doc_bin_bytes = doc_bin.to_bytes()
    new_doc_bin = DocBin(store_user_data=True).from_bytes(doc_bin_bytes)
    new_doc = list(new_doc_bin.get_docs(en_vocab))[0]
    assert new_doc.user_data["foo"] == "bar"
    assert new_doc.user_data[("._.", "foo", None, None)] == "bar"
Example #10
def prepare_data(
    params: Params,
    verbose: bool = True,
) -> Dict[str, Doc]:
    """
    return a single spacy doc for each age.

    warning: if corpus binary is not on disk already, it will be saved to disk.
    this means the corpus should never be modified - else, the binary will also contain unexpected modifications
    """

    # try loading transcripts from disk
    fn = params.corpus_name + '.spacy'
    bin_path = configs.Dirs.corpora / fn
    if bin_path.exists():
        doc_bin = DocBin().from_disk(bin_path)
        docs = list(doc_bin.get_docs(nlp.vocab))
    # load raw transcripts + process them
    else:
        print(
            f'WARNING: Did not find binary file associated with {params.corpus_name}. Preprocessing corpus...'
        )
        transcripts = load_transcripts(params)
        docs: List[Doc] = [doc for doc in nlp.pipe(transcripts)]
        # WARNING: only save to disk if we know that corpus has not been modified
        doc_bin = DocBin(docs=docs)
        doc_bin.to_disk(bin_path)

    # group docs by age
    ages = load_ages(params)
    if len(ages) != len(docs):
        raise RuntimeError(f'Num docs={len(docs)} does not match num ages={len(ages)}')
    age2docs = {}
    for age in SortedSet(ages):
        if age == EXCLUDED_AGE:
            continue
        docs_at_age = [docs[n] for n, ai in enumerate(ages) if ai == age]
        age2docs[age] = docs_at_age
        if verbose:
            print(
                f'Processed {len(age2docs[age]):>6} transcripts for age={age}')

    # combine all documents at same age
    age2doc = {}
    for age, docs in age2docs.items():

        doc_combined = Doc.from_docs(docs)
        age2doc[age] = doc_combined
        print(f'Num tokens at age={age} is {len(doc_combined):,}')

    return age2doc
Example #11
def test_serialize_doc_bin_unknown_spaces(en_vocab):
    doc1 = Doc(en_vocab, words=["that", "'s"])
    assert doc1.has_unknown_spaces
    assert doc1.text == "that 's "
    doc2 = Doc(en_vocab, words=["that", "'s"], spaces=[False, False])
    assert not doc2.has_unknown_spaces
    assert doc2.text == "that's"

    doc_bin = DocBin().from_bytes(DocBin(docs=[doc1, doc2]).to_bytes())
    re_doc1, re_doc2 = doc_bin.get_docs(en_vocab)
    assert re_doc1.has_unknown_spaces
    assert re_doc1.text == "that 's "
    assert not re_doc2.has_unknown_spaces
    assert re_doc2.text == "that's"
Example #12
def main():
    keywords_df = pd.read_csv('data/keywords.csv')
    keywords_dic = dict(zip(keywords_df['keyword'], keywords_df['entity']))
    data = pd.read_csv('data/taged_all.csv')

    cd = CleanData()
    data_clean = cd.normalize_text(data.copy())
    data_clean['keywords'] = data_clean['clean_text'].str.split()

    doc_entity_df = doc_entity(data_clean, keywords_dic)

    doc_entity_df = doc_entity_df.merge(data_clean[['id', 'target',
                                                    'predict']],
                                        how='left',
                                        left_on='id',
                                        right_on='id')
    doc_entity_df.set_index('id', inplace=True)
    doc_entity_df.to_csv('data/doc_entity_df.csv', index=True, header=True)

    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"],
                     store_user_data=True)
    texts = [
        "Disaster control teams are studying ways to evacuate the port area in response to tidal wave warnings.[900037]"
    ]
    nlp = spacy.load("en_core_web_md")
    for doc in nlp.pipe(texts):
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()

    # Read and write binary file
    with open('data/sample', "wb") as out_file:
        out_file.write(bytes_data)

    with open('data/sample', "rb") as in_file:
        data = in_file.read()
        in_file.close()

    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(data)
    docs = list(doc_bin.get_docs(nlp.vocab))

    # ###################################################################################
    data = pd.read_csv('data/taged_all.csv')

    for row in tqdm(data['text'], total=data.shape[0]):
        doc = nlp(row)
        doc.to_disk('data/sample')
    print([(X.text, X.label_) for X in doc.ents])
Example #13
def main(in_file, out_dir, spacy_model="en_core_web_sm", n_process=4):
    """
    Step 2: Preprocess text in sense2vec's format

    Expects a binary .spacy input file consisting of the parsed Docs (DocBin)
    and outputs a text file with one sentence per line in the expected sense2vec
    format (merged noun phrases, concatenated phrases with underscores and
    added "senses").

    Example input:
    Rats, mould and broken furniture: the scandal of the UK's refugee housing

    Example output:
    Rats|NOUN ,|PUNCT mould|NOUN and|CCONJ broken_furniture|NOUN :|PUNCT
    the|DET scandal|NOUN of|ADP the|DET UK|GPE 's|PART refugee_housing|NOUN
    """
    input_path = Path(in_file)
    output_path = Path(out_dir)
    if not input_path.exists():
        msg.fail("Can't find input file", in_file, exits=1)
    if not output_path.exists():
        output_path.mkdir(parents=True)
        msg.good(f"Created output directory {out_dir}")
    nlp = spacy.load(spacy_model)
    msg.info(f"Using spaCy model {spacy_model}")
    with input_path.open("rb") as f:
        doc_bin_bytes = f.read()
    doc_bin = DocBin().from_bytes(doc_bin_bytes)
    msg.good(f"Loaded {len(doc_bin)} parsed docs")
    docs = doc_bin.get_docs(nlp.vocab)
    output_file = output_path / f"{input_path.stem}.s2v"
    lines_count = 0
    words_count = 0
    with output_file.open("w", encoding="utf8") as f:
        for doc in tqdm.tqdm(docs, desc="Docs", unit=""):
            doc = merge_phrases(doc)
            words = []
            for token in doc:
                if not token.is_space:
                    word, sense = make_spacy_key(token, prefer_ents=True)
                    words.append(make_key(word, sense))
            f.write(" ".join(words) + "\n")
            lines_count += 1
            words_count += len(words)
    msg.good(
        f"Successfully preprocessed {lines_count} docs ({words_count} words)",
        output_file.resolve(),
    )
Example #14
def test_serialize_doc_bin():
    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"],
                     store_user_data=True)
    texts = ["Some text", "Lots of texts...", "..."]
    cats = {"A": 0.5}
    nlp = English()
    for doc in nlp.pipe(texts):
        doc.cats = cats
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()

    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(bytes_data)
    reloaded_docs = list(doc_bin.get_docs(nlp.vocab))
    for i, doc in enumerate(reloaded_docs):
        assert doc.text == texts[i]
        assert doc.cats == cats
Example #15
def main(model="en_core_web_lg", docbin_path=EXAMPLE_PARSES_PATH):
    nlp = spacy.load(model)
    print("Reading data from {}".format(docbin_path))
    with open(docbin_path, "rb") as file_:
        bytes_data = file_.read()
    nr_word = 0
    start_time = timer()
    entities = Counter()
    docbin = DocBin().from_bytes(bytes_data)
    for doc in docbin.get_docs(nlp.vocab):
        nr_word += len(doc)
        entities.update((e.label_, e.text) for e in doc.ents)
    end_time = timer()
    msg = "Loaded {nr_word} words in {seconds} seconds ({wps} words per second)"
    wps = nr_word / (end_time - start_time)
    print(msg.format(nr_word=nr_word, seconds=end_time - start_time, wps=wps))
    print("Most common entities:")
    for (label, entity), freq in entities.most_common(30):
        print(freq, entity, label)
Example #16
def test_to_spacy_file_and_back(small_dataset):
    spacy_pipeline = spacy.load("en_core_web_sm")
    InputSample.create_spacy_dataset(
        small_dataset,
        output_path="dataset.spacy",
        translate_tags=False,
        spacy_pipeline=spacy_pipeline,
        alignment_mode="strict",
    )

    db = DocBin()
    db.from_disk("dataset.spacy")
    docs = db.get_docs(vocab=spacy_pipeline.vocab)
    for doc, input_sample in zip(docs, small_dataset):
        input_ents = sorted(input_sample.spans, key=lambda x: x.start_position)
        spacy_ents = sorted(doc.ents, key=lambda x: x.start_char)
        for spacy_ent, input_span in zip(spacy_ents, input_ents):
            assert spacy_ent.start_char == input_span.start_position
            assert spacy_ent.end_char == input_span.end_position
Example #17
    def load_dataset(input_path, output_path, binary=False):
        # Load the dataset at the given path; skip it if the given output already exists
        dataset = None

        if config.IGNORE_PROCESSED_DATASET and isfile(output_path):
            return None

        try:
            if not binary:
                return open(input_path, "r")

            with open(input_path, "rb") as parsed_dataset_file:
                dataset_bytes = parsed_dataset_file.read()
                doc_bin = DocBin().from_bytes(dataset_bytes)
                dataset = doc_bin.get_docs(ud_parser.vocab)
        except zlib.error:
            pass

        return dataset
Example #18
    def extract_docs(self):
        """
		Extracts serialised SpaCy docs from a zipped archive.

		:returns: SpaCy docs.

		"""

        nlp = en_core_web_sm.load()  # Load model

        with zipfile.ZipFile(str(self.source_file), "r") as archive:
            file_name = archive.namelist()[0]  # always just one pickle file

            with archive.open(file_name, "r") as pickle_file:
                # Load DocBin
                file = pickle.load(pickle_file)
                doc_bin = DocBin().from_bytes(file)
                docs = list(doc_bin.get_docs(nlp.vocab))

        return docs
Example #19
    def cache_docbin(self, force=False):
        sp = self.sp
        refresh = force or not os.path.isfile(DOCBIN_CACHE) \
                        or not os.path.isdir(VOCAB_CACHE)

        if refresh:
            paragraphs = self.load_paragraphs()
            doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE"],
                             store_user_data=True)
            for doc in sp.pipe(tqdm(paragraphs)):
                doc_bin.add(doc)
            with open(DOCBIN_CACHE, "wb") as f:
                f.write(doc_bin.to_bytes())
            sp.vocab.to_disk(VOCAB_CACHE)

        sp.vocab.from_disk(VOCAB_CACHE)

        with open(DOCBIN_CACHE, "rb") as f:
            bb = f.read()
            doc_bin = DocBin().from_bytes(bb)
        return list(doc_bin.get_docs(sp.vocab))
Example #20
def docbin_reader(docbin_file_path: str,
                  spacy_model_name: str = "en_core_web_md",
                  cutoff: Optional[int] = None,
                  nb_to_skip: int = 0):
    """Read a binary file containing a DocBin repository of spacy documents.
    In addition to the file path, we also need to provide the name of the spacy
    model (which is necessary to load the vocabulary), such as "en_core_web_md".

    If cutoff is specified, the method will stop after generating the given
    number of documents. If nb_to_skip is > 0, the method will skip the given
    number of documents before starting the generation.
    """

    import spacy

    # Read the binary data from the file
    with open(docbin_file_path, "rb") as fd:
        data = fd.read()
    docbin = DocBin(store_user_data=True)
    docbin.from_bytes(data)
    del data
    #    print("Total number of documents in docbin:", len(docbin))

    # Skip a number of documents
    if nb_to_skip:
        docbin.tokens = docbin.tokens[nb_to_skip:]
        docbin.spaces = docbin.spaces[nb_to_skip:]
        docbin.user_data = docbin.user_data[nb_to_skip:]

    # Retrieves the vocabulary
    vocab = get_spacy_model(spacy_model_name).vocab

    # We finally generate the documents one by one
    reader = docbin.get_docs(vocab)
    for i, doc in enumerate(reader):
        yield doc
        if cutoff is not None and (i + 1) >= cutoff:
            return
Example #21
def read_spacy_docs(filepath, vocab_filepath):
    """ Reads serialized spacy docs from a file into memory.
    
    Parameters
    ----------
    filepath: str
        File path to serialized spacy docs
    vocab_filepath: str
        File path to the serialized spacy Vocab the docs were created with

    Returns
    -------
    list of spacy.tokens.doc.Doc
        List of spacy Docs loaded from file
    """
    from spacy.vocab import Vocab
    with open(vocab_filepath, 'rb') as f:
        vocab = Vocab().from_bytes(f.read())

    with open(filepath, 'rb') as f:
        data = f.read()

    doc_bin = DocBin().from_bytes(data)
    docs = list(doc_bin.get_docs(vocab))
    return docs
Example #22
def test_serialize_doc_bin():
    doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "NORM", "ENT_ID"],
                     store_user_data=True)
    texts = ["Some text", "Lots of texts...", "..."]
    cats = {"A": 0.5}
    nlp = English()
    for doc in nlp.pipe(texts):
        doc.cats = cats
        doc.spans["start"] = [doc[0:2]]
        doc[0].norm_ = "UNUSUAL_TOKEN_NORM"
        doc[0].ent_id_ = "UNUSUAL_TOKEN_ENT_ID"
        doc_bin.add(doc)
    bytes_data = doc_bin.to_bytes()

    # Deserialize later, e.g. in a new process
    nlp = spacy.blank("en")
    doc_bin = DocBin().from_bytes(bytes_data)
    reloaded_docs = list(doc_bin.get_docs(nlp.vocab))
    for i, doc in enumerate(reloaded_docs):
        assert doc.text == texts[i]
        assert doc.cats == cats
        assert len(doc.spans) == 1
        assert doc[0].norm_ == "UNUSUAL_TOKEN_NORM"
        assert doc[0].ent_id_ == "UNUSUAL_TOKEN_ENT_ID"
Example #23
from pandas import read_csv
from glob import glob
import spacy
from spacy.tokens import DocBin

nlp = spacy.load("de_core_news_sm")

daten = glob("plenarprotokolle/pp19/*.xml.spacy")


spacy_db = {}
for datei in daten:
    protokoll = DocBin(store_user_data=True).from_bytes(open(datei, "rb").read())
    protokoll = list(protokoll.get_docs(nlp.vocab))
    datei = datei.split("plenarprotokolle/pp19/")[1]
    spacy_db[datei] = protokoll

for f, protokoll in spacy_db.items():
    for rede in protokoll:
        rede.user_data["entitaeten"] = [x.text for x in rede.ents]
        rede.user_data["entitaeten"] = [x for x in rede.user_data["entitaeten"] if not x == "||"]


def collect_classifiers_sentiws(sourcefile):
    with open(sourcefile) as csv_file:
        classifiers = read_csv(csv_file, sep="\t", header=None, names=["lemma", "wert", "formen"])
        classifiers["formen"] = classifiers["formen"].astype(str)
        classifiers["formen"] = classifiers["formen"].apply(lambda x: x.split(","))
        classifiers[["lemma", "pos"]] = classifiers["lemma"].str.split("|", expand=True)
        classifiers["lemma"] = classifiers["lemma"].astype(str)
        for formen, lemma in zip(classifiers.formen, classifiers.lemma):
Example #24
    # Ask for the password used to send email responses
    email_address = input("Enter your e-mail address: ")
    password = getpass("Password for sending emails: ")

    # Create the arguments extractor
    test_extractor = ArgumentsExtractor("NOMLEX-plus.1.0.txt")

    # Create the UD parser, which produces an Odin-formatted representation
    nlp = spacy.load("en_ud_model_lg")
    nlp.tokenizer = _custom_tokenizer(nlp)
    converter = Converter(False, False, False, 0, False, False, False, False,
                          False, ConvsCanceler())
    nlp.add_pipe(converter, name="BART")
    tagger = nlp.get_pipe('tagger')
    parser = nlp.get_pipe('parser')

    # Load the example sentences
    DATA_PATH = "data/too_clean_wiki/example.txt"
    with open(DATA_PATH, "r") as example_sentence_file:
        example_sentences = example_sentence_file.readlines()

    # Load the parsed example sentences
    with open(DATA_PATH.replace(".txt", ".parsed"),
              "rb") as parsed_dataset_file:
        dataset_bytes = parsed_dataset_file.read()
        doc_bin = DocBin().from_bytes(dataset_bytes)
        parsed_example_sentences = list(doc_bin.get_docs(nlp.vocab))

    # Start the server
    run(host='0.0.0.0', reloader=False, port=5000, server='paste')
Example #25
    def process(self):
        """
		Opens the SpaCy output and gets ze nouns.

		"""
        noun_type = self.parameters["type"]

        # Validate whether the user enabled the right parameters.
        # Check part of speech tagging
        if "tagger" not in self.source_dataset.parameters["enable"]:
            self.dataset.update_status(
                "Enable \"Part-of-speech tagging\" in previous module")
            self.dataset.finish(0)

        # Check dependency parsing if compound nouns or noun chunks are selected
        elif (noun_type == "nouns_and_compounds" or noun_type == "noun_chunks"
              ) and "parser" not in self.source_dataset.parameters["enable"]:
            self.dataset.update_status(
                "Enable \"Part-of-speech tagging\" and \"Dependency parsing\" for compound nouns/noun chunks in previous module"
            )
            self.dataset.finish(0)

        # Valid parameters
        else:

            # Extract the SpaCy docs first
            self.dataset.update_status("Unzipping SpaCy docs")
            self.dataset.update_status("Extracting nouns")

            # Store all the nouns in this list
            li_nouns = []
            nlp = spacy.load("en_core_web_sm")  # Load model
            spacy.load("en_core_web_sm")

            for doc_file in self.iterate_archive_contents(self.source_file):
                with doc_file.open("rb") as pickle_file:
                    # Load DocBin
                    file = pickle.load(pickle_file)
                    doc_bin = DocBin().from_bytes(file)
                    docs = list(doc_bin.get_docs(nlp.vocab))

            # Simply add each word if its POS is "NOUN"
            if noun_type == "nouns":
                for doc in docs:
                    post_nouns = []
                    post_nouns += [
                        token.text for token in doc if token.pos_ == "NOUN"
                    ]
                    li_nouns.append(post_nouns)

            # Use SpaCy's noun chunk detection
            elif noun_type == "noun_chunks":

                for doc in docs:

                    # Note: this is a workaround for now.
                    # Serialization of the SpaCy docs does not
                    # work well with dependency parsing after
                    # loading. Quick fix: parse again.

                    new_doc = nlp(doc.text)
                    post_nouns = []
                    for chunk in new_doc.noun_chunks:
                        post_nouns.append(chunk.text)

                    li_nouns.append(post_nouns)

            # Use a custom script to get single nouns and compound nouns
            elif noun_type == "nouns_and_compounds":
                for doc in docs:
                    post_nouns = []
                    noun = ""

                    for i, token in enumerate(doc):

                        # Check for common nouns (general, e.g. "people")
                        # and proper nouns (specific, e.g. "London")
                        if token.pos_ == "NOUN" or token.pos_ == "PROPN":
                            # Check if the token is part of a noun chunk
                            if token.dep_ == "compound":  # Check for a compound relation
                                noun = token.text
                            else:
                                if noun:
                                    noun += " " + token.text
                                    post_nouns.append(noun)
                                    noun = ""
                                else:
                                    post_nouns.append(token.text)
                    li_nouns.append(post_nouns)

            results = []

            if li_nouns:

                # Also add the data to the original csv file, if indicated.
                if self.parameters.get("overwrite"):
                    self.update_parent(li_nouns, noun_type)

                # convert to lower and filter out one-letter words
                all_nouns = []
                for post_n in li_nouns:
                    all_nouns += [
                        str(cap_noun).lower() for cap_noun in post_n
                        if len(cap_noun) > 1
                    ]

                # Group and rank
                count_nouns = Counter(all_nouns).most_common()
                results = [{
                    "word": tpl[0],
                    "count": tpl[1]
                } for tpl in count_nouns]

            # done!
            if results:
                self.dataset.update_status("Finished")
                self.write_csv_items_and_finish(results)
            else:
                self.dataset.update_status(
                    "Finished, but no nouns were extracted.")
                self.dataset.finish(0)
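One note on the workaround flagged in the noun-chunk branch above: doc.noun_chunks needs the dependency parse, and the parse only survives a DocBin round trip when HEAD and DEP are included in attrs. A minimal sketch (illustrative sentence, assuming en_core_web_sm is installed):

import spacy
from spacy.tokens import DocBin

nlp = spacy.load("en_core_web_sm")
doc = nlp("Rats and broken furniture plague the refugee housing.")

# Restricting attrs to LEMMA/ENT_* (as in several examples here) drops the parse;
# keeping HEAD and DEP preserves it across serialization.
doc_bin = DocBin(attrs=["LEMMA", "ENT_IOB", "ENT_TYPE", "POS", "HEAD", "DEP"])
doc_bin.add(doc)

reloaded = list(DocBin().from_bytes(doc_bin.to_bytes()).get_docs(nlp.vocab))[0]
print([chunk.text for chunk in reloaded.noun_chunks])  # noun chunks without re-parsing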
Example #26
def main(trained_pipeline: Path, test_data: Path, print_details: bool):
    nlp = spacy.load(trained_pipeline)

    doc_bin = DocBin(store_user_data=True).from_disk(test_data)
    docs = doc_bin.get_docs(nlp.vocab)
    examples = []
    for gold in docs:
        pred = Doc(
            nlp.vocab,
            words=[t.text for t in gold],
            spaces=[t.whitespace_ for t in gold],
        )
        pred.ents = gold.ents
        for name, proc in nlp.pipeline:
            pred = proc(pred)
        examples.append(Example(pred, gold))

        # Print the gold and prediction, if gold label is not 0
        if print_details:
            print()
            print(f"Text: {gold.text}")
            print(f"spans: {[(e.start, e.text, e.label_) for e in pred.ents]}")
            for value, rel_dict in pred._.rel.items():
                gold_labels = [
                    k for (k, v) in gold._.rel[value].items() if v == 1.0
                ]
                if gold_labels:
                    print(
                        f" pair: {value} --> gold labels: {gold_labels} --> predicted values: {rel_dict}"
                    )
            print()

    random_examples = []
    docs = doc_bin.get_docs(nlp.vocab)
    for gold in docs:
        pred = Doc(
            nlp.vocab,
            words=[t.text for t in gold],
            spaces=[t.whitespace_ for t in gold],
        )
        pred.ents = gold.ents
        relation_extractor = nlp.get_pipe("relation_extractor")
        get_instances = relation_extractor.model.attrs["get_instances"]
        for (e1, e2) in get_instances(pred):
            offset = (e1.start, e2.start)
            if offset not in pred._.rel:
                pred._.rel[offset] = {}
            for label in relation_extractor.labels:
                pred._.rel[offset][label] = random.uniform(0, 1)
        random_examples.append(Example(pred, gold))

    thresholds = [
        0.000, 0.050, 0.100, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9, 0.99,
        0.999
    ]
    print()
    print("Random baseline:")
    _score_and_format(random_examples, thresholds)

    print()
    print("Results of the trained model:")
    _score_and_format(examples, thresholds)
Example #27
nlp = spacy.load("en_core_web_sm")
doc_base = nlp(text)
print("")
print_doc(doc_base)

# Serialize document to disk and bytes
doc_base.to_disk("doc.spacy")
doc_base_bytes = doc_base.to_bytes()

# Serialize using DocBin
docbin_base = DocBin(attrs=["ENT_IOB", "POS", "HEAD", "DEP", "ENT_TYPE"], store_user_data=True)
docbin_base.add(doc_base)
docbin_base_bytes = docbin_base.to_bytes()

# Restore document from disk
doc = Doc(Vocab())
doc.from_disk("doc.spacy")
print("")
print_doc(doc)

# Restore document from bytes
doc = Doc(Vocab())
doc.from_bytes(doc_base_bytes)
print("")
print_doc(doc)

# Restore using DocBin
docbin = DocBin().from_bytes(docbin_base_bytes)
docs = list(docbin.get_docs(nlp.vocab))
print("")
print_doc(docs[0])
Example #28
    def process(self):
        """
		Opens the SpaCy output and gets ze entities.

		"""

        # Validate whether the user enabled the right parameters.
        if "ner" not in self.source_dataset.parameters["enable"]:
            self.dataset.update_status(
                "Enable \"Named entity recognition\" in previous module")
            self.dataset.finish(0)
            return

        if self.source_dataset.num_rows > 25000:
            self.dataset.update_status(
                "Named entity recognition is only available for datasets smaller than 25.000 items."
            )
            self.dataset.finish(0)
            return

        else:
            # Extract the SpaCy docs first
            self.dataset.update_status("Unzipping SpaCy docs")

            # Store all the entities in this list
            li_entities = []
            nlp = spacy.load("en_core_web_sm")  # Load model

            for doc_file in self.iterate_archive_contents(self.source_file):
                with doc_file.open("rb") as pickle_file:
                    # Load DocBin
                    file = pickle.load(pickle_file)
                    doc_bin = DocBin().from_bytes(file)
                    docs = list(doc_bin.get_docs(nlp.vocab))

                for doc in docs:
                    post_entities = []

                    # stop processing if worker has been asked to stop
                    if self.interrupted:
                        raise ProcessorInterruptedException(
                            "Interrupted while processing documents")

                    for ent in doc.ents:
                        if ent.label_ in self.parameters["entities"]:
                            post_entities.append(
                                (ent.text, ent.label_))  # Add a tuple

                    li_entities.append(post_entities)

            results = []

            if li_entities:

                # Also add the data to the original csv file, if indicated.
                if self.parameters.get("overwrite"):
                    self.update_parent(li_entities)

                all_entities = []
                # Convert to lower and filter out one-letter words. Join the words with the entities so we can group easily.
                for post_ents in li_entities:
                    for pair in post_ents:
                        if pair and len(pair[0]) > 1:
                            pair = pair[0].lower() + " |#| " + pair[1]
                            all_entities.append(pair)

                # Group and rank
                count_nouns = Counter(all_entities).most_common()
                # Unsplit and list the count.
                results = [{
                    "word": tpl[0].split(" |#| ")[0],
                    "entity": tpl[0].split(" |#| ")[1],
                    "count": tpl[1]
                } for tpl in count_nouns]

            # done!
            if results:
                self.dataset.update_status("Finished")
                self.write_csv_items_and_finish(results)
            else:
                self.dataset.update_status(
                    "Finished, but no entities were extracted.")
                self.dataset.finish(0)