Example #1
def test_issue3289():
    """Test that Language.to_bytes handles serializing a pipeline component
    with an uninitialized model."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("textcat"))
    bytes_data = nlp.to_bytes()
    new_nlp = English()
    new_nlp.add_pipe(nlp.create_pipe("textcat"))
    new_nlp.from_bytes(bytes_data)
Example #2
def test_issue3449():
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    text1 = "He gave the ball to I. Do you want to go to the movies with I?"
    text2 = "He gave the ball to I.  Do you want to go to the movies with I?"
    text3 = "He gave the ball to I.\nDo you want to go to the movies with I?"
    t1 = nlp(text1)
    t2 = nlp(text2)
    t3 = nlp(text3)
    assert t1[5].text == "I"
    assert t2[5].text == "I"
    assert t3[5].text == "I"
Example #3
def main():
    # For simplicity, we start off with only the blank English Language class
    # and no model or pre-defined pipeline loaded.
    nlp = English()
    rest_countries = RESTCountriesComponent(nlp)  # initialise component
    nlp.add_pipe(rest_countries) # add it to the pipeline
    doc = nlp(u"Some text about Colombia and the Czech Republic")
    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
    print('Doc has countries', doc._.has_country)  # Doc contains countries
    for token in doc:
        if token._.is_country:
            print(token.text, token._.country_capital, token._.country_latlng,
                token._.country_flag)  # country data
    print('Entities', [(e.text, e.label_) for e in doc.ents])  # entities
Example #4
def test_issue3468():
    """Test that sentence boundaries are set correctly so Doc.is_sentenced can
    be restored after serialization."""
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("sentencizer"))
    doc = nlp("Hello world")
    assert doc[0].is_sent_start
    assert doc.is_sentenced
    assert len(list(doc.sents)) == 1
    doc_bytes = doc.to_bytes()
    new_doc = Doc(nlp.vocab).from_bytes(doc_bytes)
    assert new_doc[0].is_sent_start
    assert new_doc.is_sentenced
    assert len(list(new_doc.sents)) == 1
Example #5
def main(text="Alphabet Inc. is the company behind Google.", *companies):
    # For simplicity, we start off with only the blank English Language class
    # and no model or pre-defined pipeline loaded.
    nlp = English()
    if not companies:  # set default companies if none are set via args
        companies = ['Alphabet Inc.', 'Google', 'Netflix', 'Apple']  # etc.
    component = TechCompanyRecognizer(nlp, companies)  # initialise component
    nlp.add_pipe(component, last=True)  # add last to the pipeline

    doc = nlp(text)
    print('Pipeline', nlp.pipe_names)  # pipeline contains component name
    print('Tokens', [t.text for t in doc])  # company names from the list are merged
    print('Doc has_tech_org', doc._.has_tech_org)  # Doc contains tech orgs
    print('Token 0 is_tech_org', doc[0]._.is_tech_org)  # "Alphabet Inc." is a tech org
    print('Token 1 is_tech_org', doc[1]._.is_tech_org)  # "is" is not
    print('Entities', [(e.text, e.label_) for e in doc.ents])  # all orgs are entities
Example #6
def test_issue3209():
    """Test issue that occurred in spaCy nightly where NER labels were being
    mapped to classes incorrectly after loading the model, when the labels
    were added using ner.add_label().
    """
    nlp = English()
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)

    ner.add_label("ANIMAL")
    nlp.begin_training()
    move_names = ["O", "B-ANIMAL", "I-ANIMAL", "L-ANIMAL", "U-ANIMAL"]
    assert ner.move_names == move_names
    nlp2 = English()
    nlp2.add_pipe(nlp2.create_pipe("ner"))
    nlp2.from_bytes(nlp.to_bytes())
    assert nlp2.get_pipe("ner").move_names == move_names
Example #7
def test_issue3456():
    # this crashed because of a padding error in layer.ops.unflatten in thinc
    nlp = English()
    nlp.add_pipe(nlp.create_pipe("tagger"))
    nlp.begin_training()
    list(nlp.pipe(["hi", ""]))
Example #8
    similarity_scores = []
    for article in tokenized:
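        # each article is a (filename, tokenized_sentences) pair; every sentence
        # is scored by cosine distance to cc_embedding, capped at 1.0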
        current_article = []
        file = article[0]
        for tokenized_sentence in article[1]:
            sentence_embedding = get_vec(tokenized_sentence)
            score = distance.cosine(cc_embedding, sentence_embedding)
            if score > 1.0 :
                score = 1.0
            current_article.append(score)
        similarity_scores.append((file,current_article))
    return similarity_scores

# spaCy stuff
nlp = English()
nlp.add_pipe(nlp.create_pipe('sentencizer'))
tokenizer = Tokenizer(nlp.vocab)

# Split the corpus into sentences
articles = []
journals = ["ScienceOCR", "NatureOCR"]
for journal in journals:
    for article in corpus[journal]:
        file = journal + "/" + article[0]
        text = article[1]
        sentences = []
        doc = nlp(text)
        for sent in doc.sents:
            if sent.orth_ != "\n":
                s = sent.orth_.replace("\n", "")
                sentences.append(s)
Example #9
def test_overfitting_IO():
    # Simple test to try and quickly overfit the NEL component - ensuring the ML models work correctly
    nlp = English()
    vector_length = 3
    assert "Q2146908" not in nlp.vocab.strings

    # Convert the texts to docs to make sure we have doc.ents set for the training examples
    train_examples = []
    for text, annotation in TRAIN_DATA:
        doc = nlp(text)
        train_examples.append(Example.from_dict(doc, annotation))

    def create_kb(vocab):
        # create an artificial KB - assign the same prior weight to the two Russ Cochran entities
        # Q2146908 (Russ Cochran): American golfer
        # Q7381115 (Russ Cochran): publisher
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        mykb.add_entity(entity="Q7381115", freq=12, entity_vector=[9, 1, -7])
        mykb.add_alias(
            alias="Russ Cochran",
            entities=["Q2146908", "Q7381115"],
            probabilities=[0.5, 0.5],
        )
        return mykb

    # Create the Entity Linker component and add it to the pipeline
    entity_linker = nlp.add_pipe("entity_linker", last=True)
    entity_linker.set_kb(create_kb)
    assert "Q2146908" in entity_linker.vocab.strings
    assert "Q2146908" in entity_linker.kb.vocab.strings

    # train the NEL pipe
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    assert entity_linker.model.get_dim("nO") == vector_length
    assert entity_linker.model.get_dim(
        "nO") == entity_linker.kb.entity_vector_length

    for i in range(50):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)
    assert losses["entity_linker"] < 0.001

    # adding additional components that are required for the entity_linker
    nlp.add_pipe("sentencizer", first=True)

    # Add a custom component to recognize "Russ Cochran" as an entity for the example training data
    patterns = [{
        "label": "PERSON",
        "pattern": [{
            "LOWER": "russ"
        }, {
            "LOWER": "cochran"
        }]
    }]
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns(patterns)

    # test the trained model
    predictions = []
    for text, annotation in TRAIN_DATA:
        doc = nlp(text)
        for ent in doc.ents:
            predictions.append(ent.kb_id_)
    assert predictions == GOLD_entities

    # Also test the results are still the same after IO
    with make_tempdir() as tmp_dir:
        nlp.to_disk(tmp_dir)
        nlp2 = util.load_model_from_path(tmp_dir)
        assert nlp2.pipe_names == nlp.pipe_names
        assert "Q2146908" in nlp2.vocab.strings
        entity_linker2 = nlp2.get_pipe("entity_linker")
        assert "Q2146908" in entity_linker2.vocab.strings
        assert "Q2146908" in entity_linker2.kb.vocab.strings
        predictions = []
        for text, annotation in TRAIN_DATA:
            doc2 = nlp2(text)
            for ent in doc2.ents:
                predictions.append(ent.kb_id_)
        assert predictions == GOLD_entities

    # Make sure that running pipe twice, or comparing batched and single-doc calls, always amounts to the same predictions
    texts = [
        "Russ Cochran captured his first major title with his son as caddie.",
        "Russ Cochran his reprints include EC Comics.",
        "Russ Cochran has been publishing comic art.",
        "Russ Cochran was a member of University of Kentucky's golf team.",
    ]
    batch_deps_1 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)]
    batch_deps_2 = [doc.to_array([ENT_KB_ID]) for doc in nlp.pipe(texts)]
    no_batch_deps = [
        doc.to_array([ENT_KB_ID]) for doc in [nlp(text) for text in texts]
    ]
    assert_equal(batch_deps_1, batch_deps_2)
    assert_equal(batch_deps_1, no_batch_deps)
Example #10
def convert_to_extractive_driver(args):
    """
    Driver function to convert an abstractive summarization dataset to an extractive dataset.
    The abstractive dataset must be formatted with two files for each split: a source and target file.
    Example file list for two splits: ``["train.source", "train.target", "val.source", "val.target"]``
    """
    # default is to output to input data directory if no output directory specified
    if not args.base_output_path:
        args.base_output_path = args.base_path

    # load spacy english small model with the "tagger" and "ner" disabled since
    # we only need the "tokenizer" and "parser"
    # more info: https://spacy.io/usage/processing-pipelines
    if args.sentencizer:
        nlp = English()
        sentencizer = nlp.create_pipe("sentencizer")
        nlp.add_pipe(sentencizer)
    else:
        nlp = spacy.load("en_core_web_sm", disable=["tagger", "ner"])

    if args.dataset:
        dataset = hf_nlp.load_dataset(args.dataset, args.dataset_version)

    # for each split
    for name in tqdm(
        args.split_names, total=len(args.split_names), desc="Dataset Split"
    ):
        if args.dataset:  # if loading using the `nlp` library
            current_dataset = dataset[name]
            source_file = current_dataset[args.data_example_column]
            target_file = current_dataset[args.data_summarized_column]
        else:
            # get the source and target paths
            source_file_path = os.path.join(
                args.base_path, (name + "." + args.source_ext)
            )
            target_file_path = os.path.join(
                args.base_path, (name + "." + args.target_ext)
            )
            logger.info("Opening source and target %s files", name)
            source_file = open(source_file_path, "r")
            target_file = open(target_file_path, "r")

        if args.shard_interval:  # if sharding is enabled
            # get number of examples to process
            if args.dataset:
                target_file_len = len(current_dataset)
            else:
                target_file_len = sum([1 for line in target_file])
                # reset pointer back to beginning after getting length
                target_file.seek(0)

            # find how long the loop will run, round up because any extra examples
            # will form a chunk of size less than `args.shard_interval`
            tot_num_interations = math.ceil(target_file_len / args.shard_interval)

            # default is that there was no previous shard (aka not resuming)
            last_shard = 0
            if args.resume:
                assert (
                    not args.dataset
                ), "Cannot resume when using data loaded from the `nlp` library."
                num_lines_read, last_shard = resume(
                    args.base_output_path, name, args.shard_interval
                )

                # if lines have been read and shards have been written to disk
                if num_lines_read:
                    logger.info("Resuming to line %i", num_lines_read - 1)
                    # seek both the source and target to the next line
                    seek_files([source_file, target_file], num_lines_read - 1)

                    # checks to make sure the last documents match
                    # this moves the file pointer in source_file forward 1...
                    resume_success = check_resume_success(
                        nlp,
                        args,
                        source_file,
                        last_shard,
                        args.base_output_path,
                        name,
                        args.compression,
                    )
                    # ...so move the target_file pointer forward 1 as well
                    target_file.readline()

                    if not resume_success:
                        logger.error("Exiting...")
                        sys.exit(-1)

                    # subtract the number of shards already created
                    tot_num_interations -= int(last_shard)
                else:  # no shards on disk
                    logger.warning("Tried to resume but no shards found on disk")

            for piece_idx, (source_docs, target_docs) in tqdm(
                enumerate(
                    zip(
                        read_in_chunks(source_file, args.shard_interval),
                        read_in_chunks(target_file, args.shard_interval),
                    )
                ),
                total=tot_num_interations,
                desc="Shards",
            ):
                piece_idx += last_shard  # effective if resuming (offsets the index)
                convert_to_extractive_process(
                    args, nlp, source_docs, target_docs, name, piece_idx
                )
        else:
            # only `str.strip()` the lines if loading from an actual file, not
            # the `nlp` library
            if args.dataset:
                source_docs = source_file
                target_docs = target_file
            else:
                source_docs = [line.strip() for line in source_file]
                target_docs = [line.strip() for line in target_file]
            convert_to_extractive_process(args, nlp, source_docs, target_docs, name)

        # If not processing data from the `nlp` library then close the loaded files
        if not args.dataset:
            source_file.close()
            target_file.close()
Example #11
import os
import zipfile
import tempfile
from spacy.lang.en import English
from dstc_utilities import *

nlp = English()
sentencizer = nlp.create_pipe("sentencizer")
nlp.add_pipe(sentencizer)

# DSTC3 archive directory
archive_dir = 'dstc_archive'

# Processed data directory
data_dir = os.path.join('dstc_data', 'json')

remove_words = ['sil', 'unitelligible', 'unintelligible', 'background', 'noise', 'nosie', 'cough', 'coughing',
                'laughing', 'breathing', 'click', 'clicking', 'knock', 'knocking', 'system', 'dog', 'barking',
                'whisper', 'throat', 'clear', 'clearing', 'throatnoise', 'whisperingunintelligible']
sets = ['train', 'test']
# Create a temporary directory and unzip the archived data
with tempfile.TemporaryDirectory(dir=archive_dir) as tmp_dir:

    # Load into temp directory
    zip_file = zipfile.ZipFile(os.path.join(archive_dir, 'dstc3_archive.zip'), 'r')
    zip_file.extractall(tmp_dir)
    zip_file.close()

    for dataset_name in sets:
        # Get a list of all the dialogues
        set_list = os.listdir(os.path.join(tmp_dir, dataset_name))
Example #12
def main():

    # Setup Cuda if available, otherwise use the CPU
    device = -1

    if torch.cuda.is_available():
        device = torch.cuda.current_device()

    # Put data path here
    data_path = "/Users/bencullen/Projects/StoryGrapher/text_data/test.txt"
    save_path = "/Users/bencullen/Projects/StoryGrapher/output/triples/"
    data_name = data_path.split('/')[-1]

    # Generate Models
    print("Generating models...")
    openie_model_url = "https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz"
    openie_predictor = Predictor.from_path(openie_model_url,
                                           cuda_device=device)
    print("Generated openie predictor")

    spacy_sent = English()
    spacy_sent = spacy.load('en_core_web_sm')
    spacy_sent.add_pipe(spacy_sent.create_pipe('sentencizer'))
    print("Generated Spacy Sentencizer")

    print("Finished generating models")

    sentences = []
    all_triples = []
    openie_raw_json = []
    sent_parts_of_speech = []
    sent_structure = []
    sent_roots = []
    selected_triples = []
    verb_tenses = []
    trimmed_triples = []

    # Split text data into sentences
    all_sentences = get_all_sentences(spacy_sent, data_path)

    t = time.localtime()
    timestamp = time.strftime('%b-%d-%y_%H:%M', t)

    remove_bad_triples = True
    good_triples = 0
    total_triples = len(all_sentences)

    # print("Doing co-reference analysis")
    # coref_data = get_coref_prediction(coref_predictor, text_data)

    for sent in all_sentences:
        print('Processing sentence:', sent.text)
        sentences.append(sent)

        sent_pos = get_sent_pos_string(sent)
        print(sent_pos)
        sent_parts_of_speech.append(sent_pos)

        sent_dep = get_sent_dep(sent)
        print(sent_dep)
        sent_structure.append(sent_dep)

        sent_root = get_root_verb(sent)
        # print("Root Verb:", sent_root)
        sent_roots.append(sent_root)

        # Extract a triple using OpenIE
        openie_result = create_openie_triple(openie_predictor,
                                             sent.text.strip())
        openie_raw_json.append(openie_result)

        openie_str = get_triple_string_from_json(openie_result)
        all_triples.append(openie_str)
        # print("Openie triples:", openie_str)

        relevant_triple = get_relevant_triple(openie_result, sent_root)
        selected_triples.append(relevant_triple)
        # print("Selected Triple", str(relevant_triple))

        verb_tense = get_verb_tense(sent, relevant_triple)
        # print("Triple's Tense:", verb_tense)
        verb_tenses.append(verb_tense)

        trimmed = trim_triple(spacy_sent, relevant_triple)
        # print("Trimmed Triple:", trimmed, "\n")
        trimmed_triples.append(trimmed)
        print("\n")

        if remove_bad_triples == True:
            if None in trimmed:
                sentences.pop()
                sent_parts_of_speech.pop()
                sent_structure.pop()
                sent_roots.pop()
                openie_raw_json.pop()
                all_triples.pop()
                selected_triples.pop()
                verb_tenses.pop()
                trimmed_triples.pop()
            else:
                good_triples += 1

    # Put sentence and triple data into a pandas dataframe for processing
    triples_data = pd.DataFrame({
        'Sentences': sentences,
        'Sentence Parts of speech': sent_parts_of_speech,
        'Sentence Dependencies': sent_structure,
        'Extracted Triples': all_triples,
        'Extraction JSON': openie_raw_json,
        'Root Verb': sent_roots,
        'Selected Triple': selected_triples,
        'Triple\'s Verb Tense': verb_tenses,
        'Trimmed Triple': trimmed_triples
    })

    print(good_triples, " triples of total triples ", total_triples,
          " were good")

    # Store the DataFrame into a csv file for examination
    triples_data.to_csv(
        os.path.join(save_path + data_name + '_triples_ ' + timestamp +
                     '.csv'))
Example #13
def test_positive_class_not_present():
    nlp = English()
    textcat = nlp.add_pipe("textcat")
    get_examples = make_get_examples_single_label(nlp)
    with pytest.raises(ValueError):
        textcat.initialize(get_examples, labels=["SOME", "THING"], positive_label="POS")
Example #14
import subprocess
import spacy
from spacy.lang.en import English
import os
import json
import fire

# Spacy model for sentence segmentation (among other things, but we only do segmentation)
nlp_lg = spacy.load("en_core_web_lg")
# nlp_lg.max_length = 1500000

# Also spacy, but using the rule-based sentencizer
# Note: we use this for bookcorpus, since the dependency-parser-based sentencizer
# tends to put quotation marks as their own sentences, and bookcorpus has a lot
# of quotations.
nlp = English()
nlp.add_pipe(nlp.create_pipe("sentencizer"))
# Increase the max length of documents from 1M to 14M characters for this
# sentencizer. Books are long. The longest book in bookcorpus is:
# 13961563 out_txts/682810__debunkanji-chinese-glyphs-used-in-japanese.txt
# We can do this because we're using the rule-based sentencizer, not the
# parser, so we don't need as much memory.
nlp.max_length = 14000000


def write_doc(doc):
    """From the BERT readme:
    The input is a plain text file, with one sentence per line. (It is
    important that these be actual sentences for the "next sentence prediction"
    task). Documents are delimited by empty lines.
    """
    for sent in doc.sents:
Example #15
def generate_rules(patterns):
    nlp = English()
    ruler = EntityRuler(nlp)
    ruler.add_patterns(patterns)
    nlp.add_pipe(ruler)
    nlp.to_disk("hp_ner")
Example #16
def main():

    parser = argparse.ArgumentParser()
    add_arg = parser.add_argument

    add_arg('-i',
            dest='iterations',
            type=int,
            default=ITER,
            help='Number of iteration wanted, default value %d' % ITER)
    add_arg('-o',
            dest='output',
            type=str,
            default=OUTPUT_DIR,
            help='Output directory path, default %s' % OUTPUT_DIR)
    add_arg('-l',
            dest='load',
            type=bool,
            default=False,
            help='Load pretrained Spacy model for English, default False')

    args = parser.parse_args()

    print("Load term's list")
    terms_corpus = pd.read_excel('astronomy.xls')

    # the list containing the phrases to be matched
    terminology_list = []
    for term in terms_corpus['key']:
        terminology_list.append(term[term.find(':') + 2:])

    print("Read the corpus files...")
    read_files = glob.glob("corpus/Astromony_*.txt")
    with open("corpus/result.txt", "wb") as outfile:
        for f in progressbar(read_files):
            with open(f, "rb") as infile:
                outfile.write(infile.read())

    # the input text string is converted to a Document object

    file = open('corpus/result.txt')
    text = file.read()

    nlp_rule_based = English()
    ruler = EntityRuler(nlp_rule_based)

    # create patterns
    patterns = []
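    # single-word terms become plain string patterns; multi-word terms become
    # lists of per-token specs matched on their lowercase form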

    for term in terminology_list:
        dct = {}
        temp = term.split()
        if len(temp) == 1:
            dct["label"] = "AstroTerm"
            dct["pattern"] = temp[0]
            patterns.append(dct)
        else:
            lst = []
            for item in temp:
                dct_temp = {}
                dct_temp["lower"] = item

                lst.append(dct_temp)

            dct["label"] = "AstroTerm"
            dct["pattern"] = lst
            patterns.append(dct)

    # add patterns and pipe
    ruler.add_patterns(patterns)
    nlp_rule_based.add_pipe(ruler)

    # generate annotated data
    print("Generate annotated data...")
    train_data = []
    for doc in progressbar(nltk.tokenize.sent_tokenize(text)):
        doc = nlp_rule_based(doc)
        train_data.append(extract_entities(doc))

    # to train the model set 'train' to true

    train_spacy(train_data, args.iterations, args.load, args.output)
Example #17
def split_spacy_en(text):
    nlp_e = English()
    nlp_e.add_pipe(nlp_e.create_pipe('sentencizer'))
    return prepare_spacy(text, nlp_e)
Example #18
def test_lemmatizer_requires_labels():
    nlp = English()
    nlp.add_pipe("trainable_lemmatizer")
    with pytest.raises(ValueError):
        nlp.initialize()
Example #19
def generate_rules(patterns):
    nlp = English()
    ruler = nlp.add_pipe("entity_ruler")
    ruler.add_patterns(patterns)
    nlp.to_disk("hp_ner")
Example #20
def main():

    # Setup Cuda if available, otherwise use the CPU
    device = -1

    if torch.cuda.is_available():
        device = torch.cuda.current_device()

    # Put data path here
    data_path = "data/raw/anne_bonnie.txt"
    save_path = "data/triples/"
    data_name = data_path.split('/')[-1]

    # Generate Models
    print("Generating models...")
    openie_model_url = "https://storage.googleapis.com/allennlp-public-models/openie-model.2020.03.26.tar.gz"
    openie_predictor = Predictor.from_path(openie_model_url,
                                           cuda_device=device)
    print("Generated openie predictor")

    spacy_sent = English()
    spacy_sent = spacy.load('en_core_web_sm')
    spacy_sent.add_pipe(spacy_sent.create_pipe('sentencizer'))
    print("Generated Spacy Sentencizer")

    print("Finished generating models")

    sentences = []
    trimmed_triples = []

    # Split text data into sentences
    all_sentences = get_all_sentences(spacy_sent, data_path)

    t = time.localtime()
    timestamp = time.strftime('%b-%d-%y_%H:%M', t)

    remove_bad_triples = True
    good_triples = 0
    total_triples = len(all_sentences)

    # print("Doing co-reference analysis")
    # coref_data = get_coref_prediction(coref_predictor, text_data)

    for sent in all_sentences:
        print('Processing sentence:', sent.text)
        sentences.append(sent)

        # Get the root of the sentence
        sent_root = get_root_verb(sent)
        # print("Root Verb:", sent_root)

        # Extract a triple using OpenIE
        openie_result = create_openie_triple(openie_predictor,
                                             sent.text.strip())

        # Get the relevant triple
        relevant_triple = get_relevant_triple(openie_result, sent_root)
        # print("Selected Triple", str(relevant_triple))

        # Trim the triple
        trimmed = trim_triple(spacy_sent, relevant_triple)
        trimmed_triples.append(trimmed)
        print("Trimmed Triple:", trimmed, "\n")

        if remove_bad_triples == True:
            if None in trimmed:
                sentences.pop()
                trimmed_triples.pop()
            else:
                good_triples += 1

    # Put sentence and triple data into a pandas dataframe for exporting
    triples_data = pd.DataFrame({
        'Sentence': sentences,
        'Trimmed Triple': trimmed_triples
    })

    print(good_triples, " triples of total ", total_triples,
          " triples were extracted")

    # Store the DataFrame into a csv file for examination
    triples_data.to_csv(
        os.path.join(save_path + data_name + '_triples_ ' + timestamp +
                     '.csv'))

    # Create graph object
    G = nx.Graph()

    file_name = data_name + ' Graph ' + timestamp
    # Add nodes to the graph and connect them with edges
    for triple in trimmed_triples:
        G.add_edge(triple[0], triple[1])
        G.add_edge(triple[1], triple[2])

    # Create graph picture
    pos = nx.spring_layout(G)
    fig = plt.figure(figsize=(45, 45))
    fig.suptitle(file_name)
    nx.draw(G,
            pos,
            edge_color='black',
            width=1,
            linewidths=1,
            node_size=1000,
            node_color='seagreen',
            alpha=0.9,
            labels={node: node
                    for node in G.nodes()})

    # Save the graph as a picture
    plt.savefig(
        os.path.join(save_path + data_name + '_graph_' + timestamp + '.png'))
Example #21
# external libraries
import numpy as np
import math
import torch
import torch.nn.functional as F
from spacy.lang.en import English

# internal utilities
import config

tokenizer = English()
tokenizer.add_pipe(tokenizer.create_pipe("sentencizer"))
device = torch.device("cuda" if config.cuda else "cpu")


def clean_text(text):
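    # add spacing around brackets and normalize newlines and quote styles before tokenization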
    text = text.replace("]", " ] ")
    text = text.replace("[", " [ ")
    text = text.replace("\n", " ")
    text = text.replace("''", '" ').replace("``", '" ')

    return text


def word_tokenize(text):
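    # tokenize with the blank English pipeline and drop whitespace-only tokens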
    tokens = [token.text for token in tokenizer(text) if token.text]
    tokens = [t for t in tokens if t.strip("\n").strip()]
    return tokens


def sent_tokenize(text):
Example #22
def spacy_sbd(text):
    nlp = English()
    nlp.add_pipe("sentencizer")
    doc = nlp(text)
    test = list(doc.sents)
    return [str(i) for i in test]
Example #23
    parser = argparse.ArgumentParser(
        'Generate raw sentences file for consumption by open IE 6.')
    parser.add_argument('--dataset',
                        default='squad',
                        help='trivia_qa or hotpot_qa')
    parser.add_argument('-debug',
                        default=False,
                        action='store_true',
                        help='If true, run on tiny portion of train dataset')
    args = parser.parse_args()

    update_incr = 10 if args.debug else 10000
    print('Loading Spacy...')
    spacy_nlp = English()  # just the language with no model
    sentencizer = spacy_nlp.create_pipe('sentencizer')
    spacy_nlp.add_pipe(sentencizer)
    print('Done Loading Spacy...')

    data_dir = os.path.join('..', 'data', args.dataset)
    if not os.path.exists(data_dir):
        os.mkdir(data_dir)

    if args.dataset == 'squad':
        dtypes = ['mini'] if args.debug else ['train', 'validation']
    else:
        dtypes = ['mini'] if args.debug else ['train', 'test', 'validation']

    for dtype in dtypes:
        start_time = time()
        generate_ie_input(dtype, data_dir)
        duration(start_time)
Example #24
class CrazyTokenizer(object):
    """
    Tokenizer with Reddit- and Twitter-specific options

    Parameters
    ----------
    lowercase : bool, optional
        If True, lowercase all tokens. Defaults to True.

    keepcaps: bool, optional
        If True, keep ALL CAPS WORDS uppercased. Defaults to False.

    normalize: int or bool, optional
        If not False, perform normalization of repeated characters
        ("awesoooooome" -> "awesooome"). The value of the parameter
        determines the number of occurrences to keep. Defaults to 3.

    ignore_quotes: bool, optional
        If True, ignore tokens contained within double quotes.
        Defaults to False.

    ignore_reddit_quotes: bool, optional
        If True, remove quotes from the Reddit comments. Defaults to False.

    ignore_stopwords: str, list, or boolean, optional
        Whether to ignore stopwords

        - str: language to get a list of stopwords for from NLTK package
        - list: list of stopwords to remove
        - True: use the built-in list of English stop words
        - False: keep all tokens

        Defaults to False

    stem: {False, 'stem', 'lemm'}, optional
        Whether to perform word stemming

        - False: do not perform word stemming
        - 'stem': use PorterStemmer from NLTK package
        - 'lemm': use WordNetLemmatizer from NLTK package

    remove_punct: bool, optional
        If True, remove punctuation tokens. Defaults to True.

    remove_breaks: bool, optional
        If True, remove linebreak tokens. Defaults to True.

    decontract: bool, optional
        If True, attempt to expand certain contractions. Defaults to False.
        Example: "'ll" -> " will"

    numbers, subreddits, reddit_usernames, emails:
    False or str, optional
        Replacement of the different types of tokens

        - False: leaves these tokens intact
        - str: replacement token
        - '': removes all occurrences of these tokens

    twitter_handles: False, 'realname' or str, optional
        Processing of twitter handles

        - False: do nothing
        - str: replacement token
        - 'realname': replace with the real screen name of Twitter account
        - 'split': split handles using Viterbi algorithm

        Example: "#vladimirputinisthebest" -> "vladimir putin is the best"

    hashtags: False or str, optional
        Processing of hashtags

        - False: do nothing
        - str: replacement token
        - 'split': split hashtags using the Viterbi algorithm

    urls: False or str, optional
        Replacement of parsed URLs

        - False: leave URL intact
        - str: replacement token
        - dict: replace all URLs stored in keys with the corresponding values
        - '': removes all occurrences of these tokens
        - 'domain': extract domain ("http://cnn.com" -> "cnn")
        - 'domain_unwrap_fast': extract domain after unwrapping links
        for a list of URL shorteners (goo.gl, t.co, bit.ly, tinyurl.com)
        - 'domain_unwrap': extract domain after unwrapping all links
        - 'title': extract and tokenize the title of each link after unwrapping it

        Defaults to False.

    extra_patterns: None or list of tuples, optional
        Replacement of any user-supplied extra patterns.
        Tuples must have the following form: (name, re_pattern, replacement_token):

        - name (str): name of the pattern
        - re_pattern (_sre.SRE_Pattern): compiled re pattern
        - replacement_token (str): replacement token

        Defaults to None

    keep_untokenized: None or list, optional
        List of expressions to keep untokenized

        Example: ["New York", "Los Angeles", "San Francisco"]

    whitespaces_to_underscores: boolean, optional
        If True, replace all whitespace characters with
        underscores in the final tokens. Defaults to True.

    remove_nonunicode: boolean, optional
        If True, remove all non-unicode characters. Defaults to False.

    pos_emojis, neg_emojis, neutral_emojis: None, True, or list, optional
        Replace positive, negative, and neutral emojis with the special tokens

        - None: do not perform replacement
        - True: perform replacement of the default lists of emojis
        - list: list of emojis to replace

    print_url_warnings: bool, optional
        If True, print URL-related warnings. Defaults to False.

    latin_chars_fix: bool, optional
        Try applying this fix if you have a lot of \\xe2\\x80\\x99-like
        or U+1F601-like strings in your data. Defaults to False.

    ngrams: int, optional
        Add ngrams of tokens after tokenizing
    """
    def __init__(self,
                 lowercase=True,
                 keepcaps=False,
                 normalize=3,
                 ignore_quotes=False,
                 ignore_reddit_quotes=False,
                 ignore_stopwords=False,
                 stem=False,
                 remove_punct=True,
                 remove_breaks=True,
                 decontract=False,
                 twitter_handles=False,
                 urls=False,
                 hashtags=False,
                 numbers=False,
                 subreddits=False,
                 reddit_usernames=False,
                 emails=False,
                 extra_patterns=None,
                 keep_untokenized=None,
                 whitespaces_to_underscores=True,
                 remove_nonunicode=False,
                 remove_numbers=False,
                 pos_emojis=None,
                 neg_emojis=None,
                 neutral_emojis=None,
                 print_url_warnings=False,
                 latin_chars_fix=False,
                 ngrams=1,
                 wordnet=None,
                 porterstem=None):
        self.params = locals()

        self._nlp = English()
        self._merging_matcher = Matcher(self._nlp.vocab)
        self._matcher = Matcher(self._nlp.vocab)

        self._replacements = {}
        self._domains = {}
        self._realnames = {}
        self._stopwords = None
        self.wordnet = wordnet

        alpha_digits_flag = self._nlp.vocab.add_flag(alpha_digits_check)
        hashtag_flag = self._nlp.vocab.add_flag(hashtag_check)
        twitter_handle_flag = self._nlp.vocab.add_flag(twitter_handle_check)

        self._merging_matcher.add('HASHTAG', None, [{
            'ORTH': '#'
        }, {
            'IS_ASCII': True
        }])
        self._merging_matcher.add('SUBREDDIT', None, [{
            'ORTH': '/r'
        }, {
            'ORTH': '/'
        }, {
            alpha_digits_flag: True
        }], [{
            'ORTH': 'r'
        }, {
            'ORTH': '/'
        }, {
            alpha_digits_flag: True
        }])
        self._merging_matcher.add('REDDIT_USERNAME', None,
                                  [{
                                      'ORTH': '/u'
                                  }, {
                                      'ORTH': '/'
                                  }, {
                                      alpha_digits_flag: True
                                  }], [{
                                      'ORTH': 'u'
                                  }, {
                                      'ORTH': '/'
                                  }, {
                                      alpha_digits_flag: True
                                  }])

        if isinstance(ignore_stopwords, str) and ('nltk' in sys.modules):
            try:
                self._stopwords = stopwords.words(ignore_stopwords)
            except OSError:
                raise ValueError('Language {} was not found by NLTK'.format(
                    ignore_stopwords))
        elif ignore_stopwords is True:
            self._matcher.add('STOPWORDS', self._remove_token, [{
                'IS_STOP': True
            }])
        elif isinstance(ignore_stopwords, list):
            self._stopwords = [word.lower() for word in ignore_stopwords]
        elif ignore_stopwords is not False:
            raise TypeError(
                'Type {} is not supported by ignore_stopwords parameter or NLTK is not installed'
                .format(type(ignore_stopwords)))

        if lowercase and (not keepcaps):
            self._matcher.add('LOWERCASE', self._lowercase, [{
                'IS_LOWER': False
            }])
        elif lowercase and keepcaps:
            self._matcher.add('LOWERCASE', self._lowercase, [{
                'IS_LOWER': False,
                'IS_UPPER': False
            }])

        if remove_punct:
            self._matcher.add('PUNCTUATION', self._remove_token,
                              [{
                                  'IS_PUNCT': True
                              }])

        if remove_numbers:
            self._matcher.add('NUMBERS', self._remove_token, [{
                'LIKE_NUM': True
            }])

        if remove_breaks:

            def break_check(text):
                return bool(BREAKS_RE.fullmatch(text))

            break_flag = self._nlp.vocab.add_flag(break_check)
            self._matcher.add('BREAK', self._remove_token, [{
                break_flag: True
            }])

        if normalize:

            def normalize_check(text):
                return bool(NORMALIZE_RE.search(text))

            normalize_flag = self._nlp.vocab.add_flag(normalize_check)
            self._matcher.add('NORMALIZE', self._normalize,
                              [{
                                  normalize_flag: True
                              }])

        if numbers is not False:
            self._matcher.add('NUMBER', self._replace_token, [{
                'LIKE_NUM': True
            }])
            self._replacements['NUMBER'] = numbers

        if urls is not False:
            if urls in [
                    'domain', 'domain_unwrap_fast', 'domain_unwrap', 'title'
            ]:
                self._urls = urls
                self._matcher.add('URL', self._process_url, [{
                    'LIKE_URL': True
                }])
            elif isinstance(urls, dict):
                self._domains = urls
                self._urls = 'domain_unwrap_fast'
                self._matcher.add('URL', self._process_url, [{
                    'LIKE_URL': True
                }])
            else:
                self._matcher.add('URL', self._replace_token, [{
                    'LIKE_URL': True
                }])
                self._replacements['URL'] = urls

        if emails is not False:
            self._matcher.add('EMAIL', self._replace_token, [{
                'LIKE_EMAIL': True
            }])
            self._replacements['EMAIL'] = emails

        if reddit_usernames is not False:

            def reddit_username_check(text):
                return bool(REDDITORS_RE.fullmatch(text))

            reddit_username_flag = self._nlp.vocab.add_flag(
                reddit_username_check)
            self._matcher.add('REDDIT_USERNAME', self._replace_token,
                              [{
                                  reddit_username_flag: True
                              }])
            self._replacements['REDDIT_USERNAME'] = reddit_usernames

        if subreddits is not False:

            def subreddit_check(text):
                return bool(SUBREDDITS_RE.fullmatch(text))

            subreddit_flag = self._nlp.vocab.add_flag(subreddit_check)
            self._matcher.add('SUBREDDIT', self._replace_token,
                              [{
                                  subreddit_flag: True
                              }])
            self._replacements['SUBREDDIT'] = subreddits

        if twitter_handles is not False:
            self._matcher.add('TWITTER_HANDLE', self._handles_postprocess,
                              [{
                                  twitter_handle_flag: True
                              }])

        if hashtags is not False:
            self._matcher.add('HASHTAG', self._hashtag_postprocess,
                              [{
                                  hashtag_flag: True
                              }])

        if hashtags == 'split' or twitter_handles == 'split':
            file = os.path.join(DATA_PATH, 'wordsfreq_wiki2.txt')
            with open(file) as f:
                self._words = f.read().split()
            self._wordcost = dict((k, log((i + 1) * log(len(self._words))))
                                  for i, k in enumerate(self._words))
            self._maxword = max(len(x) for x in self._words)

        if twitter_handles == 'realname':
            with open(os.path.join(DATA_PATH, 'realnames.json')) as f:
                self._realnames = json.load(f)

        if ignore_quotes:
            self._merging_matcher.add('QUOTE', None, [{
                'ORTH': '"'
            }, {
                'OP': '*',
                'IS_ASCII': True
            }, {
                'ORTH': '"'
            }])

            def doublequote_check(text):
                return bool(QUOTES_RE.fullmatch(text))

            doublequote_flag = self._nlp.vocab.add_flag(doublequote_check)
            self._matcher.add('DOUBLE_QUOTES', self._remove_token,
                              [{
                                  doublequote_flag: True
                              }])

        if self._stopwords:

            def stopword_check(text):
                return bool(text.lower() in self._stopwords)

            stopword_flag = self._nlp.vocab.add_flag(stopword_check)
            self._matcher.add('STOPWORD', self._remove_token,
                              [{
                                  stopword_flag: True
                              }])

        if keep_untokenized is not None:
            if not isinstance(keep_untokenized, list):
                raise ValueError(
                    "keep_untokenized has to be either None or a list")
            for i, phrase in enumerate(keep_untokenized):
                phrase_tokens = phrase.split(' ')
                rule = []
                for token in phrase_tokens:
                    rule.append({'LOWER': token.lower()})
                self._merging_matcher.add('RULE_' + str(i), None, rule)

        if pos_emojis:
            if not isinstance(pos_emojis, list):
                pos_emojis = POS_EMOJIS
            pos_patterns = [[{'ORTH': emoji}] for emoji in pos_emojis]
            self._matcher.add('HAPPY', self._replace_token, *pos_patterns)
            self._replacements['HAPPY'] = 'POS_EMOJI'

        if neg_emojis:
            if not isinstance(neg_emojis, list):
                neg_emojis = NEG_EMOJIS
            neg_patterns = [[{'ORTH': emoji}] for emoji in neg_emojis]
            self._matcher.add('SAD', self._replace_token, *neg_patterns)
            self._replacements['SAD'] = 'NEG_EMOJI'

        if neutral_emojis:
            if not isinstance(neutral_emojis, list):
                neutral_emojis = NEUTRAL_EMOJIS
            neutral_patterns = [[{'ORTH': emoji}] for emoji in neutral_emojis]
            self._matcher.add('NEUTRAL', self._replace_token,
                              *neutral_patterns)
            self._replacements['NEUTRAL'] = 'NEUTRAL_EMOJI'

        if isinstance(extra_patterns, list):
            self._flags = {}
            for name, re_pattern, replacement_token in extra_patterns:

                def flag(text):
                    return bool(re_pattern.search(text))

                self._flags[name] = self._nlp.vocab.add_flag(flag)
                self._matcher.add(name, self._replace_token,
                                  [{
                                      self._flags[name]: True
                                  }])
                self._replacements[name] = replacement_token

        if stem and ('nltk' in sys.modules):
            if stem == 'stem':
                self._stemmer = porterstem
            elif stem == 'lemm':
                # self._stemmer = wordnetlem
                pass
            else:
                raise ValueError(
                    'Stemming method {} is not supported'.format(stem))
            self._matcher.add('WORD_TO_STEM', self._stem_word,
                              [{
                                  'IS_ALPHA': True
                              }])

        retokenize_flag = self._nlp.vocab.add_flag(retokenize_check)
        self._matcher.add('RETOKENIZE', self._retokenize,
                          [{
                              retokenize_flag: True,
                              'IS_PUNCT': False,
                              'LIKE_URL': False,
                              'LIKE_EMAIL': False,
                              'LIKE_NUM': False,
                              hashtag_flag: False,
                              twitter_handle_flag: False
                          }])

        self._nlp.add_pipe(self._merge_doc, name='merge_doc', last=True)
        self._nlp.add_pipe(self._match_doc, name='match_doc', last=True)
        self._nlp.add_pipe(self._postproc_doc, name='postproc_doc', last=True)

    @staticmethod
    def _lowercase(__, doc, i, matches):
        # Lowercase tokens
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = tok._.transformed_text.lower()

    def _stem_word(self, __, doc, i, matches):
        # Stem tokens
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['stem'] == 'stem':
                tok._.transformed_text = self._stemmer.stem(
                    tok._.transformed_text)
            elif self.params['stem'] == 'lemm':
                tok._.transformed_text = self.lemmatize(tok._.transformed_text)

    def lemmatize(self, word, pos=NOUN):
        lemmas = self.wordnet._morphy(word, pos)
        return min(lemmas, key=len) if lemmas else word

    def _normalize(self, __, doc, i, matches):
        # Normalize repeating symbols
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = NORMALIZE_RE.sub(
                r"\1" * self.params['normalize'], tok._.transformed_text)

    def _process_url(self, __, doc, i, matches):
        # Process found URLs
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            found_urls = URLS_RE.findall(tok.text)
            if found_urls:
                if found_urls[0] in self._domains:
                    tok._.transformed_text = self._domains[found_urls[0]]
                elif self._urls == 'domain':
                    tok._.transformed_text = tldextract.extract(
                        found_urls[0]).domain
                elif self._urls != 'title':
                    if self._urls == 'domain_unwrap':
                        domain = unshorten_url(
                            found_urls[0], None,
                            self.params['print_url_warnings'])
                    else:
                        domain = unshorten_url(
                            found_urls[0], URL_SHORTENERS,
                            self.params['print_url_warnings'])
                    self._domains[found_urls[0]] = domain
                    tok._.transformed_text = domain
                elif self._urls == 'title':
                    domain = unshorten_url(found_urls[0], URL_SHORTENERS)
                    if domain != 'twitter':
                        title = get_url_title(
                            found_urls[0], self.params['print_url_warnings'])
                        title = self.tokenize(URLS_RE.sub('', title))
                    else:
                        title = ''
                    tok._.transformed_text = title
                    self._domains[found_urls[0]] = title

    def _replace_token(self, __, doc, i, matches):
        # Replace tokens with something else

        match_id, start, end = matches[i]
        span = doc[start:end]
        replacement_token = self._replacements[doc.vocab.strings[match_id]]
        for tok in span:
            tok._.transformed_text = replacement_token

    @staticmethod
    def _remove_token(__, doc, i, matches):
        # Remove tokens
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            tok._.transformed_text = ''

    def _retokenize(self, __, doc, i, matches):
        # Retokenize
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            text = tok.text
            text = re.sub(r'([#@])', r' \1', text)
            text = re.sub(r'\s{2,}', ' ', text).strip()
            tok._.transformed_text = self.tokenize(text)

    def _infer_spaces(self, text):
        # Infer location of spaces in hashtags
        text = text.lower()
        text = re.sub(r'[^\w\s]', '', text)

        def best_match(i):
            # Find the best match for the first i characters
            # assuming costs has been built for the first (i-1) characters
            candidates = enumerate(reversed(cost[max(0, i - self._maxword):i]))
            return min(
                (c + self._wordcost.get(text[i - k - 1:i], 9e999), k + 1)
                for k, c in candidates)

        cost = [0]
        for i in range(1, len(text) + 1):
            cur_cost, k = best_match(i)
            cost.append(cur_cost)

        out = []
        i = len(text)
        while i > 0:
            cur_cost, k = best_match(i)
            assert cur_cost == cost[i]
            out.append(text[i - k:i])
            i -= k

        return list(reversed(out))

    def _handles_postprocess(self, __, doc, i, matches):
        # Process twitter handles
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['twitter_handles'] == 'realname':
                if tok.text in self._realnames:
                    tok._.transformed_text = self._realnames[tok.text]
                else:
                    handle = get_twitter_realname(tok.text)
                    realname = self.tokenize(TWITTER_HANDLES_RE.sub(
                        '', handle))
                    tok._.transformed_text = realname
                    self._realnames[tok.text] = realname
            elif self.params['twitter_handles'] == 'split':
                poss = self._infer_spaces(tok._.transformed_text[1:])
                if poss:
                    tok._.transformed_text = poss
            else:
                tok._.transformed_text = self.params['twitter_handles']

    def _hashtag_postprocess(self, __, doc, i, matches):
        # Process hashtags
        __, start, end = matches[i]
        span = doc[start:end]
        for tok in span:
            if self.params['hashtags'] == 'split':
                poss = self._infer_spaces(tok._.transformed_text[1:])
                if poss:
                    tok._.transformed_text = poss
            else:
                tok._.transformed_text = self.params['hashtags']

    @staticmethod
    def _decontract(text):
        # Expand contractions
        for contraction, decontraction in DECONTRACTIONS.items():
            text = re.sub(contraction, decontraction, text)
        return text

    def _preprocess_text(self, text):
        # Normalize quotes, contractions, HTML entities, and encoding artifacts before tokenization
        text = re.sub("’", "'", text)
        if self.params['remove_nonunicode']:
            try:
                text = text.encode('utf-8').decode('unicode-escape')
                text = ''.join(filter(lambda x: x in string.printable,
                                      text)).strip()
            except UnicodeDecodeError:
                warnings.warn(
                    'UnicodeDecodeError while trying to remove non-unicode characters'
                )
        if self.params['decontract']:
            text = self._decontract(text)
        text = html.unescape(text)

        if self.params['latin_chars_fix']:
            if EMOJIS_UTF_RE.findall(text):
                text = EMOJIS_UTF_NOSPACE_RE.sub(r' \1', text)
                for utf_code, emoji in EMOJIS_UTF.items():
                    text = EMOJIS_UTF_PATS[utf_code].sub(emoji, text)

            if EMOJIS_UNICODE_RE.findall(text):
                text = EMOJIS_UNICODE_NOSPACE_RE.sub(r'\1 \2', text)
                for utf_code, emoji in EMOJIS_UNICODE.items():
                    text = EMOJIS_UNICODE_PATS[utf_code].sub(emoji, text)

            if LATIN_CHARS_RE.findall(text):
                for _hex, _char in LATIN_CHARS.items():
                    text = LATIN_CHARS_PATS[_hex].sub(_char, text)

        if self.params['ignore_reddit_quotes']:
            text = REDDIT_QUOTES_RE.sub(' ', text)

        text = text.replace('.@', '. @')
        text = re.sub(r'([*;,!?\(\)\[\]])', r' \1', text)
        text = re.sub(r'\s{2,}', ' ', text)

        return text.strip()

    def _merge_doc(self, doc):
        # Perform merging for certain types of tokens
        matches = self._merging_matcher(doc)
        spans = []
        for __, start, end in matches:
            spans.append(doc[start:end])
        for span in spans:
            span.merge()
        for tok in doc:
            tok._.transformed_text = tok.text

        return doc

    def _match_doc(self, doc):
        # Perform all additional processing
        self._matcher(doc)
        return doc

    def _postproc_doc(self, doc):
        # Perform postprocessing
        doc._.tokens = []
        for tok in doc:
            if isinstance(tok._.transformed_text, list):
                doc._.tokens.extend(tok._.transformed_text)
            elif tok._.transformed_text.strip() != '':
                if self.params['whitespaces_to_underscores']:
                    tok._.transformed_text = "_".join(
                        tok._.transformed_text.split())
                doc._.tokens.append(tok._.transformed_text.strip())
        return doc

    def tokenize(self, text):
        """
        Tokenize document

        Parameters
        ----------
        text : str
            Document to tokenize

        Returns
        -------
        list
            List of tokens

        Examples
        --------
        >>> from redditscore.tokenizer import CrazyTokenizer
        >>> tokenizer = CrazyTokenizer(hashtags='split')
        >>> tokenizer.tokenize("#makeamericagreatagain")
        ["make", "america", "great", "again"]
        """
        if not isinstance(text, str):
            warnings.warn('Document {} is not a string'.format(text))
            return []
        text = self._preprocess_text(text)
        doc = self._nlp(text)
        tokens = doc._.tokens
        if self.params['ngrams'] > 1:
            if self.params['whitespaces_to_underscores']:
                tokens = word_ngrams(tokens, (1, self.params['ngrams']),
                                     separator='_')
            else:
                tokens = word_ngrams(tokens, (1, self.params['ngrams']))
        return tokens
Example #25
with open('countries.json') as f:
    COUNTRIES = json.loads(f.read())

nlp = English()

matcher = PhraseMatcher(nlp.vocab)
matcher.add('COUNTRY', None, *list(nlp.pipe(COUNTRIES)))


def countries_component(doc):
    #Create an entity Span with the label "GPE" for all matches
    matches = matcher(doc)
    doc.ents = [
        Span(doc, start, end, label="GPE") for match_id, start, end in matches
    ]
    return doc


#Add component to the pipeline
nlp.add_pipe(countries_component)
print(nlp.pipe_names)

#Getter that looks up the span text in the dictionary of country capitals
get_capital = lambda span: CAPITALS.get(span.text)

#Register the Span extension attribute 'capital' with the getter get_capital
Span.set_extension('capital', getter=get_capital)

#Process the text and print the entity text, label and capital attributes
doc = nlp("Czech Republic may help Slovakia protext its airspace")
print([(ent.text, ent.label_, ent._.capital) for ent in doc.ents])
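Assuming countries.json contains both country names and CAPITALS maps them to their capitals (e.g. 'Czech Republic' -> 'Prague', 'Slovakia' -> 'Bratislava'), the two print statements above should produce output roughly like this:

# ['countries_component']
# [('Czech Republic', 'GPE', 'Prague'), ('Slovakia', 'GPE', 'Bratislava')]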
Exemple #26
0
def getSentences(text):
    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    document = nlp(text)
    return [sent.string.strip() for sent in document.sents]
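A quick usage sketch for the helper above (the input string is made up):

sentences = getSentences("spaCy is fast. It is also easy to use!")
print(sentences)
# -> ['spaCy is fast.', 'It is also easy to use!']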
    def prepare_data(self):
        """
        Create the data using the ``huggingface/nlp`` library. This function handles
        downloading, preprocessing, tokenization, and feature extraction.
        """
        all_tokenized_files_present = all(
            os.path.isfile(path)
            for path in self.tokenized_data_file_paths.values())
        if self.hparams.no_prepare_data or all_tokenized_files_present:
            logger.info(
                "Skipping data preparation because `--no_prepare_data` was specified or all the final tokenized data files are present."
            )
            if self.hparams.only_preprocess:
                logger.info(
                    "Exiting because both `--no_prepare_data` and `--only_preprocess` set."
                )
                sys.exit(0)
            return

        def convert_to_features(example_batch):
            max_length = self.tokenizer.model_max_length

            articles = example_batch[self.hparams.data_example_column]

            articles_encoded_step = []
            for idx, article in enumerate(articles):
                article = article.strip()
                try:
                    article_encoded = self.tokenizer(
                        article,
                        padding="max_length",
                        truncation=True,
                    )
                    articles_encoded_step.append(article_encoded)
                except:  # skipcq: FLK-E722
                    print("Failed to tokenize article: {}".format(article))
                    sys.exit(1)

                if idx != 0:
                    current_length = len(article_encoded["input_ids"])
                    first_length = len(articles_encoded_step[0]["input_ids"])
                    assert (
                        current_length == first_length
                    ), "The length of the current input, {}, does not match the length of the first input, {}.".format(
                        current_length, first_length)

            articles_encoded = {
                "input_ids": [i["input_ids"] for i in articles_encoded_step],
                "attention_mask":
                [i["attention_mask"] for i in articles_encoded_step],
            }

            # articles_encoded = self.tokenizer.batch_encode_plus(
            #     articles, pad_to_max_length=True, truncation=True,
            # )

            highlights = example_batch[self.hparams.data_summarized_column]

            # Tokenize highlights using spacy to split them into sentences if they were not
            # already split in the dataset (use `hparams.split_char` to specify the sentence
            # boundary character)
            if not self.hparams.split_char:
                highlights = tokenize(spacy_nlp,
                                      highlights,
                                      disable_progress_bar=True)

            sep_token = self.tokenizer.sep_token
            highlights_input_ids = []
            highlights_attention_masks = []

            # For each ground-truth summary
            for highlight in highlights:
                if self.hparams.split_char:
                    # simply split into sentences if `hparams.split_char` is specified
                    sents = highlight.split(self.hparams.split_char)
                else:
                    # `highlight` is a list of sentences where each sentence is a list of tokens
                    # Combine those tokens to create a list of sentences.
                    sents = [
                        " ".join(list_of_ids) for list_of_ids in highlight
                    ]

                assert type(sents) is list
                assert len(sents) > 0

                # Tokenize each sentence and append the `sep_token`
                sents_tokenized = []
                for sent in sents:
                    assert type(sent) is str
                    assert len(sent) > 0
                    sent = self.tokenizer.tokenize(sent)
                    sent.append(sep_token)
                    sents_tokenized.append(sent)

                # Delete the last `sep_token` from the last sentence
                assert type(sents_tokenized[-1][-1]) is str
                del sents_tokenized[-1][-1]
                # Flatten `sents_tokenized` (a list of sentences where each sentence is a list
                # of tokens) to a list of tokens
                sents_tokenized_flat = list(
                    itertools.chain.from_iterable(sents_tokenized))
                assert type(sents_tokenized_flat[0]) is str
                assert len(sents_tokenized_flat) > 0

                # Convert the tokens to `input_ids`
                # `max_length` is the max length minus 2 because we need to add the
                # beginning and ending tokens to the target
                sents_input_ids = self.tokenizer.encode_plus(
                    sents_tokenized_flat,
                    truncation=True,
                    is_pretokenized=True,
                    add_special_tokens=False,
                    max_length=(max_length - 2),
                    return_attention_mask=False,
                    return_token_type_ids=False,
                )["input_ids"]

                # Insert beginning of sequence token and append end of sequence token.
                sents_input_ids.insert(0, self.target_boseq_token_id)
                sents_input_ids.append(self.target_eoseq_token_id)

                # Create attention mask
                attention_mask = [1] * len(sents_input_ids)

                # Append the `input_ids` and `attention_mask`
                highlights_input_ids.append(sents_input_ids)
                highlights_attention_masks.append(attention_mask)

            # Pad the highlight input ids and attention masks to `tokenizer.max_len`.
            # The articles have already been padded because they do not need the extra
            # `boseq` and `eoseq` tokens.
            highlights_input_ids = pad(
                highlights_input_ids,
                self.tokenizer.pad_token_id,
                width=max_length,
            )
            highlights_attention_masks = pad(highlights_attention_masks,
                                             0,
                                             width=max_length)

            return {
                "source": articles_encoded["input_ids"],
                "target": highlights_input_ids,
                "source_mask": articles_encoded["attention_mask"],
                "target_mask": highlights_attention_masks,
            }

        def remove_empty(batch_item):
            article = batch_item[self.hparams.data_example_column]
            article = article.strip()
            highlight = batch_item[self.hparams.data_summarized_column]
            highlight = highlight.strip()
            # keep_article = article and article != "\n" and article != ""
            # keep_highlight = highlight and highlight != "\n" and highlight != ""
            if self.hparams.use_percentage_of_data:
                keep_example = (
                    article and highlight
                    and random.random() < self.hparams.use_percentage_of_data)
            else:
                keep_example = bool(article and highlight)

            return keep_example

        # Load spacy if the summary column does not contain separated sentences
        if not self.hparams.split_char:
            # load spacy english small model with the "tagger" and "ner" disabled since
            # we only need the "tokenizer" and "parser"
            # more info: https://spacy.io/usage/processing-pipelines
            if self.hparams.sentencizer:
                spacy_nlp = English()
                sentencizer = spacy_nlp.create_pipe("sentencizer")
                spacy_nlp.add_pipe(sentencizer)
            else:
                spacy_nlp = spacy.load("en_core_web_sm",
                                       disable=["tagger", "ner"])

        # Combine the two sections of `scientific_papers` if it is chosen as the dataset
        if self.hparams.dataset == "scientific_papers":
            self.hparams.data_example_column = "article"
            self.hparams.data_summarized_column = "abstract"

            dataset_pubmed = nlp.load_dataset(
                "scientific_papers",
                "pubmed",
                cache_dir=self.hparams.nlp_cache_dir)
            dataset_arxiv = nlp.load_dataset(
                "scientific_papers",
                "arxiv",
                cache_dir=self.hparams.nlp_cache_dir)

            combined_dataset = {}
            for (
                    split,
                    save_path_final_tokenized,
            ) in self.tokenized_data_file_paths.items():
                save_path = os.path.join(
                    self.hparams.cache_file_path,
                    ("arxiv_pubmed_combined_" + split + ".arrow"),
                )
                # If the file has not been saved to disk then combine arXiv and PubMed
                # and write to file. Don't process if the final tokenized version is
                # present and can be loaded.
                if (not os.path.exists(save_path)) and (
                        not os.path.exists(save_path_final_tokenized)):
                    logger.info("Joining split %s", split)
                    new = pyarrow.concat_tables([
                        dataset_pubmed[split].data, dataset_arxiv[split].data
                    ])

                    writer = nlp.arrow_writer.ArrowWriter(path=save_path)
                    writer.write_table(new)
                else:
                    logger.info(
                        "Skipping joining split %s because it already exists",
                        split)

                if not os.path.exists(save_path_final_tokenized):
                    # Load combined dataset from file if the final tokenized version
                    # does not exist.
                    logger.info("Loading split %s", save_path)
                    combined_dataset[split] = nlp.Dataset.from_file(save_path)
                else:
                    # If the tokenized split already exists then just store the pubmed
                    # section as a placeholder so `nlp` does not complain.
                    logger.info(
                        "NOT loading split %s because the final tokenized version already exists.",
                        save_path,
                    )
                    combined_dataset[split] = dataset_pubmed[split]

            self.dataset = combined_dataset

        else:
            if type(self.hparams.dataset
                    ) is list and "/" in self.hparams.dataset[0]:
                for (split, _), dataset_path in zip(
                        self.tokenized_data_file_paths.items(),
                        self.hparams.dataset):
                    self.dataset[split] = nlp.Dataset.from_file(dataset_path)
            else:
                self.dataset = nlp.load_dataset(
                    self.hparams.dataset,
                    self.hparams.dataset_version,
                    cache_dir=self.hparams.nlp_cache_dir,
                )

        for split, features_cache_file in self.tokenized_data_file_paths.items(
        ):
            # If the tokenized version has not been created yet, then do the initial
            # filtering so it can be created
            if not os.path.isfile(features_cache_file):
                logger.info("Removing empty examples from %s dataset", split)
                start_num_examples = len(self.dataset[split])
                self.dataset[split] = self.dataset[split].filter(
                    remove_empty,
                    cache_file_name=os.path.join(self.hparams.cache_file_path,
                                                 (split + "_filtered")),
                )
                end_num_examples = len(self.dataset[split])
                logger.info(
                    "Removed %i (%.2f%%) examples from the dataset.",
                    start_num_examples - end_num_examples,
                    (1 - end_num_examples / start_num_examples) * 100,
                )

            logger.info("Converting %s dataset to features", split)
            self.dataset[split] = self.dataset[split].map(
                convert_to_features,
                batched=True,
                remove_columns=self.dataset[split].data.column_names,
                cache_file_name=features_cache_file,
            )

        # Exit if set to only preprocess the data
        if self.hparams.only_preprocess:
            logger.info(
                "Exiting because data has been pre-processed and the `--only_preprocess` option is enabled."
            )
            sys.exit(0)
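The target construction inside `convert_to_features` above can be hard to follow, so here is a toy, framework-free sketch of the bos/eos insertion, attention-mask creation, and padding, using made-up token ids (the real code relies on the HuggingFace tokenizer and the project's `pad` helper):

# made-up ids standing in for the tokenized summary sentences joined by `sep_token`
sents_input_ids = [101, 102, 103]
target_boseq_token_id, target_eoseq_token_id, pad_token_id = 0, 1, 9
max_length = 8

# insert the beginning-of-sequence id and append the end-of-sequence id
sents_input_ids = [target_boseq_token_id] + sents_input_ids + [target_eoseq_token_id]
# the attention mask marks the real tokens with 1
attention_mask = [1] * len(sents_input_ids)

# pad both lists out to `max_length` (mirrors what the `pad` helper is assumed to do)
sents_input_ids += [pad_token_id] * (max_length - len(sents_input_ids))
attention_mask += [0] * (max_length - len(attention_mask))

print(sents_input_ids)  # [0, 101, 102, 103, 1, 9, 9, 9]
print(attention_mask)   # [1, 1, 1, 1, 1, 0, 0, 0]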
Exemple #28
0
def get_sentencizer() -> spacy.language.Language:
    nlp = English()
    nlp.add_pipe(nlp.create_pipe('sentencizer'))
    return nlp
Exemple #29
0
    def transcribe(
        self,
        file_uri: Union[str, Path],
        phrases: Optional[List[str]] = None,
        **kwargs: Any,
    ) -> transcript_model.Transcript:
        """
        Transcribe audio from GCS file and return a Transcript model.

        Parameters
        ----------
        file_uri: Union[str, Path]
            The GCS file uri to the audio file or caption file to transcribe.
            It should be in format 'gs://...'.
        phrases: Optional[List[str]] = None
            A list of strings to feed as targets to the model.

        Returns
        -------
        outputs: transcript_model.Transcript
            The transcript model for the supplied media file.
        """
        # Create client
        client = speech.SpeechClient.from_service_account_file(
            filename=str(self.credentials_file))

        # Create basic metadata
        metadata = speech.RecognitionMetadata()
        metadata.interaction_type = (
            speech.RecognitionMetadata.InteractionType.PHONE_CALL)
        metadata.original_media_type = (
            speech.RecognitionMetadata.OriginalMediaType.VIDEO)

        # Add phrases
        event_metadata_speech_context = speech.SpeechContext(
            phrases=self._clean_phrases(phrases))

        # Prepare for transcription
        config = speech.RecognitionConfig(
            encoding=speech.RecognitionConfig.AudioEncoding.LINEAR16,
            sample_rate_hertz=16000,
            language_code="en-US",
            enable_automatic_punctuation=True,
            enable_word_time_offsets=True,
            enable_spoken_punctuation=True,
            speech_contexts=[
                GOOGLE_SPEECH_ADAPTION_CLASSES,
                event_metadata_speech_context,
            ],
            metadata=metadata,
            model="video",
            use_enhanced=True,
        )
        audio = speech.RecognitionAudio(uri=file_uri)

        # Begin transcription
        log.debug(f"Beginning transcription for: {file_uri}")
        operation = client.long_running_recognize(request={
            "config": config,
            "audio": audio
        })

        # Wait for complete
        response = operation.result(timeout=10800)

        # Select highest confidence transcripts
        confidence_sum = 0
        segments = 0

        # Create timestamped sentences
        timestamped_sentences: List[transcript_model.Sentence] = []
        transcript_sentence_index = 0

        # Create sentence boundary pipeline
        nlp = English()
        nlp.add_pipe("sentencizer")

        for result in response.results:
            # Some portions of audio may not have text
            if len(result.alternatives) > 0:
                # Split transcript into sentences
                doc = nlp(result.alternatives[0].transcript)

                # Convert generator to list
                sentences = [str(sent) for sent in doc.sents]

                # Index holder for word results of response
                w_marker = 0
                for s_ind, _ in enumerate(sentences):
                    # Sentence text
                    s_text = sentences[s_ind]

                    num_words = len(s_text.split())

                    # Initialize sentence model
                    timestamped_sentence = transcript_model.Sentence(
                        index=transcript_sentence_index,
                        confidence=result.alternatives[0].confidence,
                        # Start and end time are placeholder values
                        start_time=0.0,
                        end_time=0.0,
                        words=[],
                        text=s_text,
                    )

                    for w_ind in range(w_marker, w_marker + num_words):
                        # Extract word from response
                        word = result.alternatives[0].words[w_ind]

                        # Nanos no longer supported, use microseconds instead
                        # https://github.com/googleapis/python-speech/issues/71
                        start_time = (word.start_time.seconds +
                                      word.start_time.microseconds * 1e-6)

                        end_time = (word.end_time.seconds +
                                    word.end_time.microseconds * 1e-6)

                        # Add start_time to Sentence if first word
                        if w_ind - w_marker == 0:
                            timestamped_sentence.start_time = start_time

                        # Add end_time to Sentence if last word
                        if (w_ind - w_marker) == (num_words - 1):
                            timestamped_sentence.end_time = end_time

                        # Create Word model
                        timestamped_word = transcript_model.Word(
                            index=w_ind - w_marker,
                            start_time=start_time,
                            end_time=end_time,
                            text=self._clean_word(word.word),
                        )

                        timestamped_sentence.words.append(timestamped_word)

                    # Increment word marker
                    w_marker += num_words

                    # Add Sentence to sentence list
                    timestamped_sentences.append(timestamped_sentence)

                    # Increment transcript sentence index
                    transcript_sentence_index += 1

                # Update confidence stats
                confidence_sum += result.alternatives[0].confidence
                segments += 1

        # Compute mean confidence
        if segments > 0:
            confidence = confidence_sum / segments
        else:
            confidence = 0.0
        log.info(
            f"Completed transcription for: {file_uri}. Confidence: {confidence}"
        )

        # Create transcript model
        transcript = transcript_model.Transcript(
            generator=f"Google Speech-to-Text -- CDP v{__version__}",
            confidence=confidence,
            session_datetime=None,
            created_datetime=datetime.utcnow().isoformat(),
            sentences=timestamped_sentences,
        )

        return transcript
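The sentence/word alignment above relies on a running word marker into the flat list of recognized words. Here is a toy illustration of that bookkeeping with made-up words and sentences:

words = ["Hello", "there", "everyone", "Welcome", "back"]
sentences = ["Hello there everyone", "Welcome back"]

w_marker = 0
for s_text in sentences:
    num_words = len(s_text.split())
    # the words belonging to this sentence are the next `num_words` entries
    print(s_text, "->", words[w_marker:w_marker + num_words])
    w_marker += num_words
# Hello there everyone -> ['Hello', 'there', 'everyone']
# Welcome back -> ['Welcome', 'back']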
Exemple #30
0
def test_issue7065_b():
    # Test that the NEL doesn't crash when an entity crosses a sentence boundary
    nlp = English()
    vector_length = 3
    nlp.add_pipe("sentencizer")
    text = "Mahler 's Symphony No. 8 was beautiful."
    entities = [(0, 6, "PERSON"), (10, 24, "WORK")]
    links = {
        (0, 6): {
            "Q7304": 1.0,
            "Q270853": 0.0
        },
        (10, 24): {
            "Q7304": 0.0,
            "Q270853": 1.0
        },
    }
    sent_starts = [1, -1, 0, 0, 0, 0, 0, 0, 0]
    doc = nlp(text)
    example = Example.from_dict(doc, {
        "entities": entities,
        "links": links,
        "sent_starts": sent_starts
    })
    train_examples = [example]

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q270853", freq=12, entity_vector=[9, 1, -7])
        mykb.add_alias(
            alias="No. 8",
            entities=["Q270853"],
            probabilities=[1.0],
        )
        mykb.add_entity(entity="Q7304", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias(
            alias="Mahler",
            entities=["Q7304"],
            probabilities=[1.0],
        )
        return mykb

    # Create the Entity Linker component and add it to the pipeline
    entity_linker = nlp.add_pipe("entity_linker", last=True)
    entity_linker.set_kb(create_kb)
    # train the NEL pipe
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    # Add a custom rule-based component to mimic NER
    patterns = [
        {
            "label": "PERSON",
            "pattern": [{
                "LOWER": "mahler"
            }]
        },
        {
            "label":
            "WORK",
            "pattern": [
                {
                    "LOWER": "symphony"
                },
                {
                    "LOWER": "no"
                },
                {
                    "LOWER": "."
                },
                {
                    "LOWER": "8"
                },
            ],
        },
    ]
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns(patterns)
    # test the trained model - this should not throw E148
    doc = nlp(text)
    assert doc
Exemple #31
0
# sentence tokenization
from spacy.lang.en import English

# Load English tokenizer, tagger, parser, NER and word vectors
nlp = English()

# Create the pipeline 'sentencizer' component
sbd = nlp.create_pipe('sentencizer')

# Add the component to the pipeline
nlp.add_pipe(sbd)

text = """When learning Artificial Intelligence, you shouldn't get discouraged!
Challenges and setbacks aren't failures, they're just part of the journey. You've got this!"""

#  "nlp" Object is used to create documents with linguistic annotations.
doc = nlp(text)

# create list of sentence tokens
sents_list = []
for sent in doc.sents:
    sents_list.append(sent.text)
print(sents_list)
Exemple #32
0
 def find_usage_examples_from_summary(
     self,
     form: Form = None,
 ) -> List[UsageExample]:
     """This tries to find and clean sentences and return the shortest one"""
     if form is None:
         raise ValueError("form was None")
     logger = logging.getLogger(__name__)
     # find sentences
     # order in a list by length
     # pick the shortest one where the form representation appears
     if self.language_code == WikimediaLanguageCode.ENGLISH:
         logger.info("using the English spaCy pipeline")
         nlp = English()
         nlp.add_pipe('sentencizer')
     elif self.language_code == WikimediaLanguageCode.SWEDISH:
         nlp = Swedish()
         nlp.add_pipe('sentencizer')
     elif (self.language_code == WikimediaLanguageCode.FRENCH
           or self.language_code == WikimediaLanguageCode.GERMAN
           or self.language_code == WikimediaLanguageCode.BOKMÅL
           or self.language_code == WikimediaLanguageCode.DANISH):
         logger.info(
             f"using the {self.language_code.name.title()} spaCy pipeline")
         try:
             nlp = spacy.load(f'{self.language_code.value}_core_news_sm')
         except OSError:
             raise ModuleNotFoundError(
                 f"Please install the spacy model for "
                 f"{self.language_code.name.title()} by running: "
                 f"'python -m spacy download "
                 f"{self.language_code.value}_core_news_sm' "
                 f"in the terminal/cmd/powershell")
     else:
         raise NotImplementedError(
             f"Sentence extraction for {self.language_code.name} "
             f"is not supported yet, feel free to open an issue at "
             f"https://github.com/dpriskorn/LexUtils/issues")
     doc = nlp(self.text)
     sentences = set()
     for sentence in doc.sents:
         # logger.info(sentence.text)
         # This is a very crude test for relevancy, we lower first to improve matching
         cleaned_sentence = sentence.text.lower()
         punctations = [".", ",", "!", "?", "„", "“", "»"]
         for punctation in punctations:
             if punctation in cleaned_sentence:
                 cleaned_sentence = cleaned_sentence.replace(
                     punctation, " ")
         cleaned_sentence = cleaned_sentence.strip()
         logger.debug(f"cleaned sentence:{cleaned_sentence}")
         if f" {form.representation.lower()} " in cleaned_sentence:
             # Add to the set first to avoid duplicates
             sentences.add(sentence.text)
     examples = []
     for sentence in sentences:
         sentence_length = len(sentence.split(" "))
         if (sentence_length > config.min_word_count
                 and sentence_length < config.max_word_count):
             # Clean the sentence so it looks better
             punctations = ["„", "“", "»"]
             for punctation in punctations:
                 if punctation in sentence:
                     sentence = sentence.replace(punctation, " ")
             sentence = sentence.strip()
             examples.append(UsageExample(sentence=sentence, record=self))
     # print("debug exit")
     # exit(0)
     return examples
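A standalone sketch of the crude relevancy test used above (the sample strings are made up):

form_representation = "run"
sentence_text = "They run, every day."

cleaned_sentence = sentence_text.lower()
for punctuation in [".", ",", "!", "?", "„", "“", "»"]:
    cleaned_sentence = cleaned_sentence.replace(punctuation, " ")
cleaned_sentence = cleaned_sentence.strip()

# the surrounding spaces make sure only whole-word matches count
print(f" {form_representation.lower()} " in cleaned_sentence)  # True -> sentence is kept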
Exemple #33
0
def test_partial_links():
    # Test that having some entities on the doc without gold links, doesn't crash
    TRAIN_DATA = [(
        "Russ Cochran his reprints include EC Comics.",
        {
            "links": {
                (0, 12): {
                    "Q2146908": 1.0
                }
            },
            "entities": [(0, 12, "PERSON")],
            "sent_starts": [1, -1, 0, 0, 0, 0, 0, 0],
        },
    )]
    nlp = English()
    vector_length = 3
    train_examples = []
    for text, annotation in TRAIN_DATA:
        doc = nlp(text)
        train_examples.append(Example.from_dict(doc, annotation))

    def create_kb(vocab):
        # create artificial KB
        mykb = KnowledgeBase(vocab, entity_vector_length=vector_length)
        mykb.add_entity(entity="Q2146908", freq=12, entity_vector=[6, -4, 3])
        mykb.add_alias("Russ Cochran", ["Q2146908"], [0.9])
        return mykb

    # Create and train the Entity Linker
    entity_linker = nlp.add_pipe("entity_linker", last=True)
    entity_linker.set_kb(create_kb)
    optimizer = nlp.initialize(get_examples=lambda: train_examples)
    for i in range(2):
        losses = {}
        nlp.update(train_examples, sgd=optimizer, losses=losses)

    # adding additional components that are required for the entity_linker
    nlp.add_pipe("sentencizer", first=True)
    patterns = [
        {
            "label": "PERSON",
            "pattern": [{
                "LOWER": "russ"
            }, {
                "LOWER": "cochran"
            }]
        },
        {
            "label": "ORG",
            "pattern": [{
                "LOWER": "ec"
            }, {
                "LOWER": "comics"
            }]
        },
    ]
    ruler = nlp.add_pipe("entity_ruler", before="entity_linker")
    ruler.add_patterns(patterns)

    # this will run the pipeline on the examples and shouldn't crash
    results = nlp.evaluate(train_examples)
    assert "PERSON" in results["ents_per_type"]
    assert "PERSON" in results["nel_f_per_type"]
    assert "ORG" in results["ents_per_type"]
    assert "ORG" not in results["nel_f_per_type"]
Exemple #34
0
def main():
    # get just the language with no model
    nlp = English()
    # nlp = spacy.load('en_core_web_sm')

    # add the sentencizer component to the pipeline
    # rem this component  splits sentences on punctuation such as . !  ?
    # plugging it into pipeline to get just the sentence boundaries
    # without the dependency parse.
    sentencizer = nlp.create_pipe("sentencizer")
    nlp.add_pipe(sentencizer)
    '''
    Model for component 'ner' not initialized.
    Did you forget to load a model, or forget to call begin_training()?
    ner = nlp.create_pipe("ner")
    nlp.add_pipe(ner)
    '''

    # get product group data file and feed the info into arrays
    # that will be used later to create custom tags for the nlp object
    txt_obj = ''
    with open('../../store/model/erp10/pumps/prod_pumps_erp10.csv') as data:
        data = csv.reader(data, delimiter='|')
        headers = []
        productIDs = []
        products = []
        suppliers = []
        mpns = []

        # TEST print    -----------------------
        #print('contents of arrays for tagging:\n')

        testList = [headers, productIDs, products, suppliers, mpns]
        i = 0
        for row in data:
            if i == 0:
                headers.append(row)
            else:
                productID = row[0]
                product = row[1]
                supplier = row[2]
                mpn = row[3]

                productIDs.append(productID)
                products.append(product)
                suppliers.append(supplier)
                mpns.append(mpn)

            # create text object
            # rem add a period at the end so that the spacy sentencizer
            # knows how to detect the end of each record
            # and add all rows to text object except for header row
            if i != 0:
                txt_obj = txt_obj + ' '.join(row) + '.\n'
            i += 1

    # TEST print  -----------------------
    print('testList items:\n')
    for item in testList:
        print(item)

    # clean the text object
    txt_obj = preprocessor.string_cleaner(txt_obj)

    # TEST PRINT  -----------------------
    print('\n\ntxt_obj after cleaning:\n', txt_obj)

    # create the nlp object:
    pumps_erp10 = nlp(txt_obj)

    # TEST print  -----------------------
    print('\n\npumps_erp10 after sentencizer:\n')
    for sent in pumps_erp10.sents:
        print(sent.text, '**end row**', end='')

    # TEST print  -----------------------
    print('\n\ntoken.like_num in nlp obj:\n')
    for token in pumps_erp10:
        print(token.like_num, ',', end='')

    # stuff we get:
    # token, .text, .i, .idx, .tag_, .lemma_
    # .is_punct, .is_space, .like_num
    print('\nDone.')

    # stuff we don't get:
    # pos, ent, chunking,

    # LU
    # textcat (TextCategorizer, Doc.cats)
    # custom components (Doc._.xxx, Token._.xxx, Span._.xxx)
    # create_pipe, add_pipe

    # TEST print  -----------------------
    print('\n', nlp.pipeline)
    print('\n', nlp.pipe_names)
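As a quick aside, here is a tiny self-contained illustration of how `txt_obj` is assembled from the CSV rows above (the rows are made up; the header row is skipped and each record ends with '.\n' so the sentencizer can find record boundaries):

rows = [["ID", "Product", "Supplier", "MPN"],       # header row, skipped
        ["100", "Booster Pump", "Acme", "BP-1"],
        ["101", "Vacuum Pump", "Globex", "VP-2"]]

txt_obj = ''
for i, row in enumerate(rows):
    if i != 0:
        txt_obj = txt_obj + ' '.join(row) + '.\n'

print(txt_obj)
# 100 Booster Pump Acme BP-1.
# 101 Vacuum Pump Globex VP-2.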
Exemple #35
0
#!/usr/bin/env python
# coding: utf-8

# ### Importing Dependencies

import pandas as pd
import spacy
from spacy.pipeline import EntityRuler
from spacy.lang.en import English

## Initializing the blank spaCy English pipeline
nlp = English()
## Instantiating the EntityRuler and adding it to the pipeline
ruler = EntityRuler(nlp)
nlp.add_pipe(ruler)


##Creating Entity pattern
### To retrieve patterns from the text for NER
class EntityPattern():
    def dash_split(self, sentence):
        ##If pattern list has no value then split directly else loop the list and split each value
        if len(self.split_pattern_list) == 0:
            self.split_pattern_list = sentence.split('-')
        else:
            dash_list = []
            for word in self.split_pattern_list:
                word_list = word.split('-')
                for split_word in word_list:
                    dash_list.append(split_word)
Exemple #36
0
class Segmentation:
    def __init__(self,
                 dataset=None,
                 entity_labels=None,
                 no_rel_label=None,
                 no_rel_multiple=False,
                 sentence_align=False,
                 test=False,
                 same_entity_relation=False,
                 write_Entites=False,
                 generalize=False,
                 parallelize=False,
                 no_of_cores=64,
                 predictions_folder=None,
                 de_sample=None):
        """
        Data files are read in and the sentence containing each entity pair is segmented into 5 parts,
        along with the label and the track information (file number, entity1 and entity2) that is used to
        write predictions back to file.

        :param dataset: path to the dataset
        :param predictions_folder: path to the predictions (output) folder
        :param entity_labels: labels of the entities that form the relations
        :param no_rel_label: label(s) assigned to entity pairs in a sentence that have no annotated relation
        :param no_rel_multiple: flag indicating whether multiple labels are possible for No-relation
        :param sentence_align: option controlling how sentences are split
        :param test: flag to run the test-segmentation options
        :param same_entity_relation: flag set when relations exist between entities of the same type
        :param de_sample: flag to reduce the number of samples
        :param generalize: flag set when relations are not dependent on the first given relation label
        :param parallelize: flag to parallelize the segmentation
        :param no_of_cores: number of cores used for the parallelized segmentation
        :param write_Entites: write entities and predictions to file
        """
        self.predictions_folder = predictions_folder
        self.dataset = dataset
        self.entity_labels = entity_labels
        self.test = test
        self.same_entity_relation = same_entity_relation
        self.generalize = generalize
        self.parallelize = parallelize
        self.write_Entites = write_Entites
        self.nlp_model = English()
        self.nlp_model.max_length = 2000000
        if no_rel_label:
            self.no_rel_label = no_rel_label
        else:
            self.no_rel_label = False
        self.no_rel_multiple = no_rel_multiple

        if de_sample:
            self.de_sample = de_sample
        else:
            self.de_sample = False

        if sentence_align:
            sentencizer = Sentencizer(punct_chars=["\n"])
        else:
            sentencizer = Sentencizer(punct_chars=["\n", ".", "?"])

        if self.write_Entites and self.predictions_folder is not None:
            ext = ".ann"
            file.delete_all_files(predictions_folder, ext)

        self.nlp_model.add_pipe(sentencizer)

        # self.nlp_model = spacy.load('en_core_web_sm')

        # global segmentation object that returns all segments and the label
        self.segments = {
            'seg_preceding': [],
            'seg_concept1': [],
            'seg_concept2': [],
            'seg_concept1_label': [],
            'seg_concept2_label': [],
            'seg_middle': [],
            'seg_succeeding': [],
            'sentence': [],
            'label': [],
            'track': []
        }

        #if parallelize flag is true
        if self.parallelize:
            # Pool object which offers a convenient means of parallelizing the execution of a function
            # across multiple input values, distributing the input data across processes
            pool = Pool(no_of_cores)
            all_args = []
            for datafile, txt_path, ann_path in self.dataset:
                all_args.append([datafile, txt_path, ann_path])
            segments_file = pool.map(self.process_file_parallel, all_args)
            pool.close()
            pool.join()

            # count = 0
            # for i in range(len(segments_file)):
            #     count  = count + len(segments_file[i]['label'])
            # print(count)

            for segment in segments_file:
                # Add lists of segments to the segments object for the dataset
                self.segments['seg_preceding'].extend(segment['preceding'])
                self.segments['seg_concept1'].extend(segment['concept1'])
                self.segments['seg_middle'].extend(segment['middle'])
                self.segments['seg_concept2'].extend(segment['concept2'])
                self.segments['seg_succeeding'].extend(segment['succeeding'])
                self.segments['sentence'].extend(segment['sentence'])
                self.segments['track'].extend(segment['track'])
                # if not self.test:
                self.segments['label'].extend(segment['label'])
                # self.segments['seg_concept1_label'].extend(segment['concept1_label'])
                # self.segments['seg_concept2_label'].extend(segment['concept2_label'])
        else:
            segment = self.process_file_serial(dataset)

            # Add lists of segments to the segments object for the dataset
            self.segments['seg_preceding'].extend(segment['preceding'])
            self.segments['seg_concept1'].extend(segment['concept1'])
            self.segments['seg_middle'].extend(segment['middle'])
            self.segments['seg_concept2'].extend(segment['concept2'])
            self.segments['seg_succeeding'].extend(segment['succeeding'])
            self.segments['sentence'].extend(segment['sentence'])
            self.segments['track'].extend(segment['track'])
            # if not self.test:
            self.segments['label'].extend(segment['label'])
            self.segments['seg_concept1_label'].extend(
                segment['concept1_label'])
            self.segments['seg_concept2_label'].extend(
                segment['concept2_label'])

        if not self.test:
            # print(set(self.segments['label']))
            # print the number of instances of each relation classes
            print([(i, self.segments['label'].count(i))
                   for i in set(self.segments['label'])])

        # write the segments to a file
        file.list_to_file('sentence_test', self.segments['sentence'])
        file.list_to_file('preceding_seg', self.segments['seg_preceding'])
        file.list_to_file('concept1_seg', self.segments['seg_concept1'])
        file.list_to_file('middle_seg', self.segments['seg_middle'])
        file.list_to_file('concept2_seg', self.segments['seg_concept2'])
        file.list_to_file('succeeding_seg', self.segments['seg_succeeding'])
        file.list_to_file('track_test', self.segments['track'])
        # if not self.test:
        # file.list_to_file('labels_test', self.segments['label'])
        # file.list_to_file('concept1_seg_label', self.segments['seg_concept1_label'])
        # file.list_to_file('concept2_seg_label', self.segments['seg_concept2_label'])

    def process_file_parallel(self, dataset):
        """
        Parallelizing the execution of segmentation across multiple input files, distributing the input data across processes
        :param dataset: dataset
        :return: segments
        """
        self.file = dataset[0]
        self.ann_path = dataset[2]
        self.txt_path = dataset[1]
        self.ann_obj = Annotation(self.ann_path)

        print("File", self.file)
        content = open(self.txt_path).read()
        # content_text = normalization.replace_Punctuation(content)

        self.doc = self.nlp_model(content)

        file_name = str(self.file) + ".ann"
        if self.write_Entites and self.predictions_folder is not None:
            write_entities_to_file(self.ann_obj, file_name,
                                   self.predictions_folder)
        # else:
        #     print("Define the path to the folder to save predictions ")

        segment = self.get_Segments_from_sentence(self.ann_obj)
        return segment

    def process_file_serial(self, dataset):
        """
        Runs the segmentation serially over multiple input files.
        :param dataset: dataset
        :return: segments
        """
        for datafile, txt_path, ann_path in dataset:
            print("File", datafile)

            self.file = datafile
            self.ann_path = ann_path
            self.txt_path = txt_path
            self.ann_obj = Annotation(self.ann_path)

            content = open(self.txt_path).read()
            # content_text = normalization.replace_Punctuation(content)

            self.doc = self.nlp_model(content)

            file_name = str(datafile) + ".ann"
            if self.write_Entites and self.predictions_folder is not None:
                write_entities_to_file(self.ann_obj, file_name,
                                       self.predictions_folder)
            # else:
            #     print("Define the path to the folder to save predictions ")

            segment = self.get_Segments_from_sentence(self.ann_obj)
        return segment

    def get_Segments_from_relations(self, ann):
        """
        For each relation object, it identifies the label and the entities first, then extracts the span of the
        entities from the text file using the start and end character span of the entities. Then it finds the
        sentence the entities are located in and passes the sentence and the spans of the entities to the function
        that extracts the following segments:

        Preceding - (tokenize words before the first concept)
        concept 1 - (tokenize words in the first concept)
        Middle - (tokenize words between 2 concepts)
        concept 2 - (tokenize words in the second concept)
        Succeeding - (tokenize words after the second concept)

        :param ann: annotation object
        :return: segments and label
        """

        # object to store the segments of a relation object
        segment = {
            'preceding': [],
            'concept1': [],
            'concept2': [],
            'middle': [],
            'succeeding': [],
            'sentence': [],
            'label': []
        }

        for label_rel, entity1, entity2 in ann.annotations['relations']:

            start_C1 = ann.annotations['entities'][entity1][1]
            end_C1 = ann.annotations['entities'][entity1][2]

            start_C2 = ann.annotations['entities'][entity2][1]
            end_C2 = ann.annotations['entities'][entity2][2]

            # arrange the entities in the order they appear in the sentence
            if start_C1 < start_C2:
                concept_1 = self.doc.char_span(start_C1, end_C1)
                concept_2 = self.doc.char_span(start_C2, end_C2)
            else:
                concept_1 = self.doc.char_span(start_C2, end_C2)
                concept_2 = self.doc.char_span(start_C1, end_C1)

            if concept_1 is not None and concept_2 is not None:
                # get the sentence where the entity is located
                sentence_C1 = str(concept_1.sent)
                sentence_C2 = str(concept_2.sent)
            else:
                break

            # if both entities are located in the same sentence, return that sentence;
            # otherwise concatenate the two sentences the entities are located in

            if sentence_C1 == sentence_C2:
                sentence = sentence_C1
            else:
                sentence = sentence_C1 + " " + sentence_C2

            sentence = normalization.remove_Punctuation(str(sentence).strip())
            concept_1 = normalization.remove_Punctuation(
                str(concept_1).strip())
            concept_2 = normalization.remove_Punctuation(
                str(concept_2).strip())
            segment['concept1'].append(concept_1)
            segment['concept2'].append(concept_2)
            segment['sentence'].append(sentence.replace('\n', ' '))

            preceding, middle, succeeding = extract_Segments(
                sentence, concept_1, concept_2)
            segment['preceding'].append(preceding.replace('\n', ' '))
            segment['middle'].append(middle.replace('\n', ' '))
            segment['succeeding'].append(succeeding.replace('\n', ' '))
            segment['label'].append(label_rel)

        return segment
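    # A toy, self-contained sketch of the 5-way split described in the docstrings
    # above (plain string operations stand in for the project's extract_Segments
    # helper; the sentence and concepts are made up):
    #
    #     sentence = "the patient was given aspirin to treat the headache"
    #     concept_1, concept_2 = "aspirin", "headache"
    #     preceding, rest = sentence.split(concept_1, 1)
    #     middle, succeeding = rest.split(concept_2, 1)
    #     # preceding  -> 'the patient was given '
    #     # concept_1  -> 'aspirin'
    #     # middle     -> ' to treat the '
    #     # concept_2  -> 'headache'
    #     # succeeding -> ''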

    def get_Segments_from_sentence(self, ann):
        """
        For each problem entity in the annotation object, this identifies the sentence the entity is located in
        and tries to determine the relations between it and the other problem entities and entity types in the
        same sentence. When a pair of entities is identified, it first checks whether an annotated relation type
        exists; if so, the pair is labelled with the given annotated label, otherwise it is labelled as a
        No-relation pair. Finally, it passes the sentence and the spans of the entities to the function that
        extracts the following segments:

        Preceding - (tokenize words before the first concept)
        concept 1 - (tokenize words in the first concept)
        Middle - (tokenize words between 2 concepts)
        concept 2 - (tokenize words in the second concept)
        Succeeding - (tokenize words after the second concept)

        :param ann: annotation object
        :return: segments and label: preceding, concept_1, middle, concept_2, succeeding, label
        """
        # object to store the segments of a relation object for a file
        doc_segments = {
            'preceding': [],
            'concept1': [],
            'concept2': [],
            'concept1_label': [],
            'concept2_label': [],
            'middle': [],
            'succeeding': [],
            'sentence': [],
            'label': [],
            'track': []
        }

        # list to store the identified relation pair when both entities are same
        self.entity_holder = []

        for key1, value1 in ann.annotations['entities'].items():
            label1, start1, end1, mention1 = value1
            # when relations are dependent on one entity (dominant)
            if not self.generalize:
                # dominant label
                if label1 == self.entity_labels[0]:
                    # if label1 == self.rel_labels[0] or label1 == self.rel_labels[1]:

                    for key2, value2 in ann.annotations['entities'].items():
                        label2, start2, end2, mention2 = value2
                        token = True
                        # if relation exists between the same entities
                        if self.same_entity_relation and label2 == self.entity_labels[
                                0] and key1 != key2:
                            #needs checking
                            if self.test:
                                label_rel = "No Label"
                                segment = self.extract_sentences(
                                    ann, key2, key1, label_rel)
                                if segment is not None:
                                    doc_segments = add_file_segments(
                                        doc_segments, segment)
                            else:
                                for label_rel, entity1, entity2 in ann.annotations[
                                        'relations']:
                                    if key2 == entity1 and key1 == entity2:
                                        segment = self.extract_sentences(
                                            ann, entity1, entity2, label_rel,
                                            True)
                                        doc_segments = add_file_segments(
                                            doc_segments, segment)
                                        token = False
                                        break

                                # No relations for the same entity
                                if token and self.no_rel_label:
                                    if self.no_rel_multiple:
                                        label_rel = self.no_rel_label
                                    else:
                                        label_rel = self.no_rel_label[0]
                                    segment = self.extract_sentences(
                                        ann, key2, key1, label_rel)
                                    if segment is not None:
                                        doc_segments = add_file_segments(
                                            doc_segments, segment)

                        # when the entity pair do not contain entities of the same type
                        for i in range(len(self.entity_labels) - 1):
                            # match the dominant entity with other entities
                            if not self.same_entity_relation and label2 == self.entity_labels[
                                    i + 1]:  #label2 - second entity label
                                if self.test:
                                    label_rel = "No Label"
                                    segment = self.extract_sentences(
                                        ann, key2, key1, label_rel)
                                    if segment is not None:
                                        doc_segments = add_file_segments(
                                            doc_segments, segment)
                                else:
                                    # for the relations that exist in the ann files
                                    for label_rel, entity1, entity2 in ann.annotations[
                                            'relations']:
                                        # if key2 == entity2 and key1 == entity1:
                                        if key2 == entity1 and key1 == entity2:
                                            # when a match with an existing relation is found
                                            segment = self.extract_sentences(
                                                ann, entity1, entity2,
                                                label_rel, True)
                                            doc_segments = add_file_segments(
                                                doc_segments, segment)
                                            token = False
                                            break

                                    # No relations for the different entities
                                    if token and self.no_rel_label:
                                        if self.no_rel_multiple:
                                            label_rel = self.no_rel_label
                                        else:
                                            label_rel = self.no_rel_label[0]
                                        segment = self.extract_sentences(
                                            ann, key2, key1, label_rel)
                                        if segment is not None:
                                            doc_segments = add_file_segments(
                                                doc_segments, segment)

            else:
                # when relation exists between all entity pairs
                for key2, value2 in ann.annotations['entities'].items():
                    label2, start2, end2, mention2 = value2
                    token = True

                    # for the same entity
                    if self.same_entity_relation and label2 == self.entity_labels[
                            0] and key1 != key2:
                        if self.test:
                            label_rel = "No Label"
                            segment = self.extract_sentences(
                                ann, key2, key1, label_rel)
                            if segment is not None:
                                doc_segments = add_file_segments(
                                    doc_segments, segment)
                        else:
                            #when relation exists in the ann file
                            for label_rel, entity1, entity2 in ann.annotations[
                                    'relations']:
                                if key2 == entity1 and key1 == entity2:
                                    segment = self.extract_sentences(
                                        ann, entity1, entity2, label_rel, True)
                                    doc_segments = add_file_segments(
                                        doc_segments, segment)
                                    token = False
                                    break

                            # No relations for the same entity
                            if token and self.no_rel_label:
                                if self.no_rel_multiple:
                                    label_rel = self.no_rel_label
                                else:
                                    label_rel = self.no_rel_label[0]
                                segment = self.extract_sentences(
                                    ann, key2, key1, label_rel)
                                if segment is not None:
                                    doc_segments = add_file_segments(
                                        doc_segments, segment)

                    for i in range(len(self.entity_labels) - 1):
                        # for the different entities
                        if not self.same_entity_relation and label2 == self.entity_labels[
                                i + 1]:
                            if self.test:
                                label_rel = "No Label"
                                segment = self.extract_sentences(
                                    ann, key2, key1, label_rel)
                                if segment is not None:
                                    doc_segments = add_file_segments(
                                        doc_segments, segment)
                            else:
                                # when relation exists in the ann file
                                for label_rel, entity1, entity2 in ann.annotations[
                                        'relations']:
                                    if key2 == entity1 and key1 == entity2:
                                        segment = self.extract_sentences(
                                            ann, entity1, entity2, label_rel,
                                            True)
                                        doc_segments = add_file_segments(
                                            doc_segments, segment)
                                        token = False
                                        break

                                # No relations for the different entities
                                if token and self.no_rel_label:
                                    if self.no_rel_multiple:
                                        label_rel = self.no_rel_label
                                    else:
                                        label_rel = self.no_rel_label[0]
                                    segment = self.extract_sentences(
                                        ann, key2, key1, label_rel)
                                    if segment is not None:
                                        doc_segments = add_file_segments(
                                            doc_segments, segment)

        return doc_segments

    def extract_sentences(self,
                          ann,
                          entity1,
                          entity2,
                          label_rel=None,
                          join_sentences=False):
        """
        Given the two entities as input, this identifies the sentences they are located in and determines
        whether the entity pair lies in the same sentence. If not, the sentences are combined when an annotated
        relation exists between the entities; otherwise None is returned.
        :param ann: annotation object
        :param label_rel: relation type
        :param entity1: first entity in the considered pair
        :param entity2: second entity in the considered pair
        :param join_sentences: check for annotated relation in the data
        :return: segments and sentences and label
        """
        segment = {
            'preceding': [],
            'concept1': [],
            'concept2': [],
            'concept1_label': [],
            'concept2_label': [],
            'middle': [],
            'succeeding': [],
            'sentence': [],
            'label': [],
            'track': []
        }

        start_C1 = ann.annotations['entities'][entity1][1]
        end_C1 = ann.annotations['entities'][entity1][2]

        start_C2 = ann.annotations['entities'][entity2][1]
        end_C2 = ann.annotations['entities'][entity2][2]

        label_C1 = ann.annotations['entities'][entity1][0]
        label_C2 = ann.annotations['entities'][entity2][0]

        # arrange the entities in the order in which they appear in the text
        if start_C1 < start_C2:
            concept_1 = self.doc.char_span(start_C1, end_C1)
            concept_2 = self.doc.char_span(start_C2, end_C2)
        else:
            concept_1 = self.doc.char_span(start_C2, end_C2)
            concept_2 = self.doc.char_span(start_C1, end_C1)

        if concept_1 is not None and concept_2 is not None:
            # get the sentence each entity is located in
            sentence_C1 = str(concept_1.sent.text)
            sentence_C2 = str(concept_2.sent.text)

            # if both entities are located in the same sentence, return that
            # sentence; otherwise concatenate the two sentences containing the
            # entities into one
            if join_sentences:
                if sentence_C1 == sentence_C2:
                    sentence = sentence_C1
                else:
                    sentence = sentence_C1 + " " + sentence_C2
            else:
                # if the entity pair does not come from an annotated relation,
                # strictly restrict the pair to a single sentence
                if sentence_C1 == sentence_C2:
                    sentence = sentence_C1
                    entity_pair = entity1 + '-' + entity2
                    # make sure the same entity pair is not considered twice:
                    # record the reversed pair so that when (entity2, entity1)
                    # is visited later it is found here and skipped
                    if entity_pair not in self.entity_holder:
                        self.entity_holder.append(entity2 + '-' + entity1)
                    else:
                        sentence = None
                else:
                    sentence = None
        else:
            sentence = None

        if sentence is not None:
            sentence = normalization.remove_Punctuation(str(sentence).strip())
            concept_1 = normalization.remove_Punctuation(
                str(concept_1).strip())
            concept_2 = normalization.remove_Punctuation(
                str(concept_2).strip())
            preceding, middle, succeeding = extract_Segments(
                sentence, concept_1, concept_2)

            # remove the newline characters in the extracted segments by replacing '\n' with ' '
            segment['concept1'].append(concept_1.replace('\n', ' '))
            segment['concept2'].append(concept_2.replace('\n', ' '))
            segment['sentence'].append(sentence.replace('\n', ' '))
            segment['preceding'].append(preceding.replace('\n', ' '))
            segment['middle'].append(middle.replace('\n', ' '))
            segment['succeeding'].append(succeeding.replace('\n', ' '))
            segment['label'].append(label_rel)
            # add the track information: file id and the numeric ids of the two entities
            segment['track'].append(int(self.file))
            segment['track'].append(int(entity1[1:]))
            segment['track'].append(int(entity2[1:]))

            segment['concept1_label'].append(label_C1)
            segment['concept2_label'].append(label_C2)
        return segment
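
    # --- Illustrative sketch, not part of the original code ----------------
    # Example of the dictionary returned above for a pair found in the same
    # sentence (the values are invented): for the sentence
    # "aspirin reduces the risk of stroke" with concept_1 = "aspirin" and
    # concept_2 = "stroke", preceding would be "", middle would be
    # "reduces the risk of" and succeeding would be "", assuming that is how
    # extract_Segments splits the sentence. A small helper to flatten such a
    # one-element segment dictionary into a single flat record might look like:
    @staticmethod
    def _segment_to_row(segment):
        return {
            'sentence': segment['sentence'][0],
            'concept1': segment['concept1'][0],
            'concept1_label': segment['concept1_label'][0],
            'concept2': segment['concept2'][0],
            'concept2_label': segment['concept2_label'][0],
            'preceding': segment['preceding'][0],
            'middle': segment['middle'][0],
            'succeeding': segment['succeeding'][0],
            'label': segment['label'][0],
            # track holds [file id, entity1 id, entity2 id]
            'track': tuple(segment['track']),
        }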