Example #1
def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
    tagger1 = taggers[0]
    tagger1_b = tagger1.to_bytes()
    tagger1 = tagger1.from_bytes(tagger1_b)
    assert tagger1.to_bytes() == tagger1_b
    new_tagger1 = Tagger(en_vocab).from_bytes(tagger1_b)
    assert new_tagger1.to_bytes() == tagger1_b
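Examples #1-#4, #7, and #8 are lifted from spaCy's own serialization test suite. They rely on pytest fixtures (`en_vocab`, `de_vocab`, `taggers`) and a `make_tempdir` helper; the `taggers` fixture itself appears as Examples #12 and #14 below. A minimal sketch of the remaining scaffolding, assuming spaCy v3 (the fixture bodies are approximations, not the verbatim conftest):

import tempfile
from contextlib import contextmanager
from pathlib import Path

import pytest
from spacy.lang.de import German
from spacy.lang.en import English


@pytest.fixture
def en_vocab():
    return English().vocab


@pytest.fixture
def de_vocab():
    return German().vocab


@contextmanager
def make_tempdir():
    # sketch of the test helper: yields a pathlib.Path to a temporary directory
    with tempfile.TemporaryDirectory() as d:
        yield Path(d)

Note that Example #1 constructs `Tagger(en_vocab)` with no model argument, which only works in pre-v3 releases; the v3 variants (Examples #3, #7, #8) first resolve an explicit model from `DEFAULT_TAGGER_MODEL`.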
Example #2
def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
    tagger1, tagger2 = taggers
    with make_tempdir() as d:
        file_path1 = d / "tagger1"
        file_path2 = d / "tagger2"
        tagger1.to_disk(file_path1)
        tagger2.to_disk(file_path2)
        tagger1_d = Tagger(en_vocab).from_disk(file_path1)
        tagger2_d = Tagger(en_vocab).from_disk(file_path2)
        assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
Example #3
def test_serialize_tagger_roundtrip_bytes(en_vocab, taggers):
    tagger1 = taggers[0]
    tagger1_b = tagger1.to_bytes()
    tagger1 = tagger1.from_bytes(tagger1_b)
    assert tagger1.to_bytes() == tagger1_b
    cfg = {"model": DEFAULT_TAGGER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    new_tagger1 = Tagger(en_vocab, model).from_bytes(tagger1_b)
    new_tagger1_b = new_tagger1.to_bytes()
    assert len(new_tagger1_b) == len(tagger1_b)
    assert new_tagger1_b == tagger1_b
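`DEFAULT_TAGGER_MODEL` is a config block describing spaCy's default tagger architecture, and `registry.resolve` turns it into a concrete, still uninitialized Thinc model. The same resolution outside a test, as a sketch (spaCy v3 assumed):

from thinc.api import Model
from spacy import registry
from spacy.pipeline.tagger import DEFAULT_TAGGER_MODEL

cfg = {"model": DEFAULT_TAGGER_MODEL}
model = registry.resolve(cfg, validate=True)["model"]
assert isinstance(model, Model)  # weights are allocated later, at initialize time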
Example #4
def test_issue1727():
    """Test that models with no pretrained vectors can be deserialized
    correctly after vectors are added."""
    data = numpy.ones((3, 300), dtype="f")
    vectors = Vectors(data=data, keys=["I", "am", "Matt"])
    tagger = Tagger(Vocab())
    tagger.add_label("PRP")
    tagger.begin_training()
    assert tagger.cfg.get("pretrained_dims", 0) == 0
    tagger.vocab.vectors = vectors
    with make_tempdir() as path:
        tagger.to_disk(path)
        tagger = Tagger(Vocab()).from_disk(path)
        assert tagger.cfg.get("pretrained_dims", 0) == 0
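`begin_training` is the spaCy v2 spelling; v3 renamed it to `initialize`. The `pretrained_dims` entry checked here was part of how v2 recorded whether a component's model had been built against pretrained vectors.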
Example #5
def part_of_speech(texts: Iterable[str],
                   nlp: Optional[Language] = None,
                   model: str = 'en_core_web_sm') -> Counter:
    """Part-Of-Speech Tags from an Iterable of String Sequences."""
    added_tagger: Optional[str] = None
    if nlp is None:
        nlp = spacy.load(model)
    elif isinstance(nlp, Language) and 'tagger' not in nlp.pipe_names:
        # spaCy v2 API: add_pipe() accepts a component instance; note that a
        # freshly constructed Tagger is untrained
        nlp.add_pipe(Tagger(nlp.vocab))
        added_tagger = 'tagger'

    pos_tags: Counter = Counter()
    for text in texts:
        if not text:
            continue
        for tok in nlp(text):
            if tok.is_space or not tok.is_alpha or not tok.pos_:
                continue
            pos_tags[tok.pos_] += 1

    if added_tagger:
        nlp.remove_pipe(added_tagger)
    return pos_tags
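A usage sketch (assumes the small English model is installed via `python -m spacy download en_core_web_sm`; exact counts vary by model version):

counts = part_of_speech(["The quick brown fox jumps over the lazy dog."])
print(counts.most_common(3))  # e.g. [('ADJ', 3), ('NOUN', 2), ('DET', 2)]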
Example #6
import pickle
import random

import spacy
from spacy.util import minibatch, compounding


def train_spacy(data, iterations, model=None):
    TRAIN_DATA = pickle.load(open(data, 'rb'))
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    if model is None:
        optimizer = nlp.begin_training()
    else:
        print("Existing entities in the model are:", ner.move_names)
        optimizer = ner.create_optimizer()

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                text, annotations = zip(*batch)
                nlp.update(
                    text,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)

    # Reuse the *trained* tagger and parser from a core model; constructing
    # Tagger(vocab) / DependencyParser(vocab) from scratch would give
    # untrained components.
    nlp_core_model = spacy.load("en_core_web_lg")
    nlp.add_pipe(nlp_core_model.get_pipe("tagger"), before="ner")
    nlp.add_pipe(nlp_core_model.get_pipe("parser"), before="ner")
    # Do not call nlp.begin_training() again here: it would re-initialize
    # the NER weights that were just trained.
    return TRAIN_DATA, nlp
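A hypothetical invocation (the pickle path, iteration count, and output directory are placeholders):

TRAIN_DATA, nlp = train_spacy("train_data.pickle", iterations=20)
nlp.to_disk("custom_ner_model")  # persist the trained pipeline for spacy.load()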
Example #7
def test_serialize_tagger_strings(en_vocab, de_vocab, taggers):
    label = "SomeWeirdLabel"
    assert label not in en_vocab.strings
    assert label not in de_vocab.strings
    tagger = taggers[0]
    assert label not in tagger.vocab.strings
    with make_tempdir() as d:
        # check that custom labels are serialized as part of the component's strings.jsonl
        tagger.add_label(label)
        assert label in tagger.vocab.strings
        file_path = d / "tagger1"
        tagger.to_disk(file_path)
        # ensure that the custom strings are loaded back in when using the tagger in another pipeline
        cfg = {"model": DEFAULT_TAGGER_MODEL}
        model = registry.resolve(cfg, validate=True)["model"]
        tagger2 = Tagger(de_vocab, model).from_disk(file_path)
        assert label in tagger2.vocab.strings
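The machinery this test exercises is the vocab's `StringStore`, which interns label strings so their hashes can be resolved back to text after deserialization. In isolation, as a sketch:

from spacy.strings import StringStore

ss = StringStore()
key = ss.add("SomeWeirdLabel")      # interning returns a 64-bit hash
assert ss[key] == "SomeWeirdLabel"  # the hash resolves back to the string
assert "SomeWeirdLabel" in ss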
Example #8
def test_serialize_tagger_roundtrip_disk(en_vocab, taggers):
    tagger1, tagger2 = taggers
    with make_tempdir() as d:
        file_path1 = d / "tagger1"
        file_path2 = d / "tagger2"
        tagger1.to_disk(file_path1)
        tagger2.to_disk(file_path2)
        cfg = {"model": DEFAULT_TAGGER_MODEL}
        model = registry.resolve(cfg, validate=True)["model"]
        tagger1_d = Tagger(en_vocab, model).from_disk(file_path1)
        tagger2_d = Tagger(en_vocab, model).from_disk(file_path2)
        assert tagger1_d.to_bytes() == tagger2_d.to_bytes()
Example #9
def main(data_dir, model_dir=None, exclude_normalize_tags=None, keys=None):
    '''
    data_dir -> path to brat annotation data. searches recursively
    model_dir -> path to save spacy training model
    exclude_normalize_tags -> list of tags to exclude from normalization. If None, no normalization is performed.
    keys -> dict translating brat tags to training tags. keys not in dict will be preserved
    '''
    keys = keys or {}

    r = RepoModel(data_dir, recursive=True, cached=False)

    nlp = spacy.load('en_default')

    # v1.1.2 onwards
    if nlp.tagger is None:
        print('---- WARNING ----')
        print('Data directory not found')
        print(
            'please run: `python -m spacy.en.download --force all` for better performance'
        )
        print('Using feature templates for tagging')
        print('-----------------')
        nlp.tagger = Tagger(nlp.vocab, features=Tagger.feature_templates)

    normalized_train_data = []
    excludes = exclude_normalize_tags  # we have manually tagged all instances of these

    for key, data in r.documents.items():
        if exclude_normalize_tags:
            normalized_train_data.extend(
                normalize_tags(nlp, get_annotated_sents(data, keys), excludes))
        else:
            normalized_train_data.extend(get_annotated_sents(data, keys))

    # print(normalized_train_data)

    nlp = train_ner(nlp, normalized_train_data, keys.values())

    doc = nlp(
        u"Hi Adam,\nSounds great to me. I'll send through the QA department. In the invite you through Skype, and we can discuss if Applause is right for you.\nI look forward to it!\nRegards,\nAndrew"
    )
    for word in doc:
        print(word.text, word.tag_, word.ent_type_)

    if model_dir is not None:
        save_model(nlp, model_dir)
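A hypothetical call (the paths and the brat-to-training tag mapping are placeholders):

main("path/to/brat/annotations",
     model_dir="models/custom_ner",
     exclude_normalize_tags=["PERSON"],
     keys={"Per": "PERSON", "Org": "ORG"})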
Example #10
def top_60_feature_extraction(inputs):
    nlp = spacy.load('en')
    desired_labels = [
        'PERSON', 'NORP', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT',
        'WORK_OF_ART', 'LAW', 'LANGUAGE'
    ]
    tagger = Tagger(nlp.vocab, model=True)  # spaCy v2 API; constructed here but never used below

    features = []
    for inp in tqdm(inputs):
        postStr = ' '.join(inp['postText'])

        parsed_post = nlp(postStr)
        parsed_title = nlp(inp['targetTitle'])
        keywords = [
            kw.strip().lower() for kw in inp['targetKeywords'].split(',')
        ]

        if len(parsed_post) == 0:
            parsed_post = nlp(inp['targetDescription'])

        # tokenize (also by punctuation)
        tokens_by_punc = word_tokenize(postStr)

        # get parts of speech
        TAG = [token.tag_ for token in parsed_post]

        # get word lengths in post
        lens = [len(token.text) for token in parsed_post]

        features.append([
            # 1 number of proper nouns
            match_tags(parsed_post, ['NNP']),
            # 2 readability of target paragraphs

            # 3 number of tokens
            len(tokens_by_punc),
            # 4 word length of post text
            len(parsed_post),
            # 5 POS 2-gram NNP NNP
            match_tags(parsed_post, ['NNP', 'NNP']),
            # 6 Whether the post starts with number
            1 if is_first_word_number(parsed_post) else 0,
            # 7 Average length of words in post
            np.mean(lens),
            # 8 Number of Prepositions / Subordinating Conjunction
            match_tags(parsed_post, ['IN']),
            # 9 POS 2-gram NNP 3rd person singular present Verb
            match_tags(parsed_post, ['NNP', 'VBZ']),
            # 10 POS 2-gram IN NNP
            match_tags(parsed_post, ['IN', 'NNP']),
            # 11 length of the longest word in post text
            max(lens),
            # 12 number of wh-adverb
            match_tags(parsed_post, ['WRB']),
            # 13 count POS pattern WRB

            # 14 number of single/mass nouns
            match_tags(parsed_post, ['NN']),
            # 15 count POS pattern NN

            # 16 whether the post text starts with 5W1H
            1 if parsed_post[0].tag_ in ['WDT', 'WP', 'WP$', 'WRB'] else 0,
            # 17 Whether exist Question Mark
            1 if '?' in postStr else 0,
            # 18 similarity between post and target title
            parsed_post.similarity(parsed_title),
            # 19 Count POS pattern this/these NN
            match_this_NN(parsed_post),
            # 20 Count POS pattern PRP

            # 21 Number of PRP
            match_tags(parsed_post, ['PRP']),
            # 22 Number of VBZ
            match_tags(parsed_post, ['VBZ']),
            # 23 POS 3-gram NNP NNP VBZ
            match_tags(parsed_post, ['NNP', 'NNP', 'VBZ']),
            # 24 POS 2-gram NN IN
            match_tags(parsed_post, ['NN', 'IN']),
            # 25 POS 3-gram NN IN NNP
            match_tags(parsed_post, ['NN', 'IN', 'NNP']),
            # 26 ratio of stop words in post text
            sum(1 for tok in parsed_post if tok.is_stop) / len(parsed_post),
            # 27 POS 2-gram NNP .
            match_NNP_period(parsed_post),
            # 28 POS 2-gram PRP VBP
            match_tags(parsed_post, ['PRP', 'VBP']),
            # 29 Count POS pattern WP

            # 30 Number of WP
            match_tags(parsed_post, ['WP']),
            # 31 Count POS pattern DT

            # 32 Number of DT
            match_tags(parsed_post, ['DT']),
            # 33 POS 2-gram NNP IN
            match_tags(parsed_post, ['NNP', 'IN']),
            # 34 POS 3-gram IN NNP NNP
            match_tags(parsed_post, ['IN', 'NNP', 'NNP']),
            # 35 Number of POS
            match_tags(parsed_post, ['POS']),
            # 36 POS 2-gram IN IN
            match_tags(parsed_post, ['IN', 'IN']),
            # 37 Match between keywords and post
            sum(1 for kw in keywords if kw in postStr.lower()),
            # 38 Number of ','
            postStr.count(','),
            # 39 POS 2-gram NNP NNS
            match_tags(parsed_post, ['NNP', 'NNS']),
            # 40 POS 2-gram IN JJ
            match_tags(parsed_post, ['IN', 'JJ']),
            # 41 POS 2-gram NNP POS
            match_tags(parsed_post, ['NNP', 'POS']),
            # 42 WDT
            match_tags(parsed_post, ['WDT']),
            # 43 Count POS pattern WDT

            # 44 POS 2-gram NN NN
            match_tags(parsed_post, ['NN', 'NN']),
            # 45 POS 2-gram NN NNP
            match_tags(parsed_post, ['NN', 'NNP']),
            # 46 POS 2-gram NNP VBD
            match_tags(parsed_post, ['NNP', 'VBD']),
            # 47 Similarity between post and target paragraphs

            # 48 POS pattern RB
            match_tags(parsed_post, ['RB']),
            # 49 Number of RB

            # 50 POS 3-gram NNP NNP NNP
            match_tags(parsed_post, ['NNP', 'NNP', 'NNP']),
            # 51 POS 3-gram NNP NNP NN
            match_tags(parsed_post, ['NNP', 'NNP', 'NN']),
            # 52 Readability of target paragraphs

            # 53 Number of RBS
            match_tags(parsed_post, ['RBS']),
            # 54 Number of VBN
            match_tags(parsed_post, ['VBN']),
            # 55 POS 2-gram VBN IN
            match_tags(parsed_post, ['VBN', 'IN']),
            # 56 whether exist NUMBER NP VB
            match_NUM_NP_VB(parsed_post),
            # 57 POS 2-gram JJ NNP
            match_tags(parsed_post, ['JJ', 'NNP']),
            # 58 POS 3-gram NNP NN NN
            match_tags(parsed_post, ['NNP', 'NN', 'NN']),
            # 59 POS 2-gram DT NN
            match_tags(parsed_post, ['DT', 'NN']),
            # 60 whether exist EX
            1 if match_tags(parsed_post, ['EX']) > 1 else 0
        ])
    return features
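`match_tags` (like `match_this_NN`, `match_NNP_period`, and `match_NUM_NP_VB`) is defined elsewhere in the source project. A plausible implementation of `match_tags`, counting occurrences of a consecutive fine-grained POS pattern (an assumption, not the original helper):

def match_tags(doc, pattern):
    # Count how often `pattern` occurs as a consecutive run of token.tag_ values.
    tags = [tok.tag_ for tok in doc]
    n = len(pattern)
    return sum(1 for i in range(len(tags) - n + 1) if tags[i:i + n] == pattern)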
Example #11
            ])
            counter = end
        parts.extend(sentences[i][counter:])
        sentences_marked.append(''.join(parts))
    else:
        # the target is a single token
        start = target_indices[i][0]
        end = target_indices[i][1]
        sentences_marked.append(''.join([
            sentences[i][0:start], '[TARGET_START] ', sentences[i][start:end],
            ' [TARGET_END]', sentences[i][end:]
        ]))

# spaCy NLP tools
from spacy.attrs import ORTH  # needed for the special-case rules below

nlp = en_core_web_md.load()
tagger = Tagger(nlp.vocab)  # note: unused below; the loaded pipeline already has a trained tagger

# Add special case rule
special_case1 = [{ORTH: "[TARGET_START]"}]
special_case2 = [{ORTH: "[TARGET_END]"}]
nlp.tokenizer.add_special_case("[TARGET_START]", special_case1)
nlp.tokenizer.add_special_case("[TARGET_END]", special_case2)

# The universal POS tags used by spaCy, plus two custom tags: 'SPECIAL' is
# used only for the markers '[CLS]' and '[SEP]', and 'PAD' for padded tokens.
pos_tags = [
    'ADJ', 'ADP', 'ADV', 'AUX', 'CONJ', 'CCONJ', 'DET', 'INTJ', 'NOUN', 'NUM',
    'PART', 'PRON', 'PROPN', 'PUNCT', 'SCONJ', 'SYM', 'VERB', 'X', 'SPACE',
    'SPECIAL', 'PAD'
]
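A quick check that the special-case rules keep the markers as single tokens (a sketch; actual POS output depends on the loaded model):

doc = nlp("I saw [TARGET_START] the dog [TARGET_END] yesterday.")
print([t.text for t in doc])
# ['I', 'saw', '[TARGET_START]', 'the', 'dog', '[TARGET_END]', 'yesterday', '.']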
Example #12
def taggers(en_vocab):
    cfg = {"model": DEFAULT_TAGGER_MODEL}
    model = registry.resolve(cfg, validate=True)["model"]
    tagger1 = Tagger(en_vocab, model)
    tagger2 = Tagger(en_vocab, model)
    return tagger1, tagger2
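Because both taggers wrap the same resolved `model` instance, they serialize to identical bytes before any training, which is exactly what the disk round-trip in Example #8 asserts.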
Example #13
def __init__(self, model="fr_core_news_md"):
    self.nlp = spacy.load(model)
    self.tagger = Tagger(self.nlp.vocab)
    self.parsed_text = OrderedDict()
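Note that `spacy.load(model)` already returns a pipeline containing a trained tagger, retrievable with `self.nlp.get_pipe("tagger")`; constructing `Tagger(self.nlp.vocab)` directly creates a fresh, untrained component.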
Example #14
def taggers(en_vocab):
    tagger1 = Tagger(en_vocab)
    tagger2 = Tagger(en_vocab)
    tagger1.model = tagger1.Model(8)
    tagger2.model = tagger1.model
    return (tagger1, tagger2)
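`tagger1.Model(8)` is the pre-v3 API, in which a component class built its own Thinc model (here, presumably with 8 tag classes); the registry-based construction in Example #12 replaces it. Assigning `tagger1.model` to `tagger2` makes the two taggers share weights, so they serialize identically.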