def test_matcher_remove_zero_operator(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"OP": "!"}]
    matcher.add("Rule", [pattern])
    doc = Doc(en_vocab, words=["This", "is", "a", "test", "."])
    matches = matcher(doc)
    assert len(matches) == 0
    assert "Rule" in matcher
    matcher.remove("Rule")
    assert "Rule" not in matcher
Example #2
def test_matcher_from_api_docs(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": "test"}]
    assert len(matcher) == 0
    matcher.add("Rule", None, pattern)
    assert len(matcher) == 1
    matcher.remove("Rule")
    assert "Rule" not in matcher
    matcher.add("Rule", None, pattern)
    assert "Rule" in matcher
    on_match, patterns = matcher.get("Rule")
    assert len(patterns[0])
Example #4
    def get_single_match(self, doc, pattern):
        matcher = Matcher(self.nlp.vocab)
        if "newMatch" in matcher:
            matcher.remove("newMatch")
        matcher.add("newMatch", None, pattern)
        matches = matcher(doc)
        try:
            if len(matches) > 0:
                for match_id, start, end in matches:
                    return doc[start:end]
        except Exception as e:
            return e
        return ""
Example #5
def test_matcher_remove(en_vocab):
    matcher = Matcher(en_vocab)
    pattern = [{"ORTH": "test"}, {"OP": "?"}]
    assert len(matcher) == 0
    matcher.add("Rule", None, pattern)
    assert "Rule" in matcher

    # removing once should work
    matcher.remove("Rule")

    # removing again should throw an error
    with pytest.raises(ValueError):
        matcher.remove("Rule")
Example #6
def getPhrases(file, context_pattern):
    new_phrases = set()
    with open(file, 'r') as f:
        t = f.read().lower()
        matcher = Matcher(nlp.vocab)
        doc = nlp(t)
        for cp in context_pattern:
            matcher.add("extraction", None, cp)
            matches = matcher(doc)
            for match_id, start, end in matches:
                span = doc[start + 2:end].text
                if span not in new_phrases:
                    new_phrases.add(span)
            matcher.remove("extraction")
    return new_phrases
Example #7
    def match_sentence(self, question):
        matcher = Matcher(self.nlp.vocab)
        sent_tokens = [self.nlp(i) for i in self.textsplit]
        question_nlp = self.nlp(question)
        # for t in q_token_set:
        #     question_pattern.extend([{'LEMMA': t, 'OP': '?'}])
        matched_scores = {}

        for sent_token in sent_tokens:
            sent_pattern = []
            this_length = 0
            sent_token_set = set([tok.lemma_.lower() for tok in sent_token])
            for t in sent_token_set:
                sent_pattern.extend([{'LEMMA': t, 'OP': '?'}])
            matcher.add("sent_pattern", None, sent_pattern)
            matches = matcher(question_nlp)
            for match_id, start, end in matches:
                # string_id = nlp.vocab.strings[match_id]  # Get string representation
                # span = question_nlp[start:end]  # The matched span
                # this_length += len(span.text.split(" "))
                this_length += end - start
            matched_scores[sent_token] = this_length
            matcher.remove("sent_pattern")

        matched_scores = {k: v/len(question_nlp) for k, v in matched_scores.items()}

        # from the default similarity function in spacy, find the similarity score
        # for the sentence, and then weight it with the matching score.
        final_scores = {}

        if question_nlp[0].text in ["Did", "Do", "Does", "Is", "Are", "Were", "Was", "Had", "Has", "Have"]:
            for sent in matched_scores.keys():
                if len(sent) <= 2: continue
                similarity_score = sent.similarity(question_nlp)
                # print(sent)
                # print("match_score: %.3f" % matched_scores[sent])
                # print("similarity score: %.3f" % similarity_score)
                final_scores[sent.text] = 0.7*matched_scores[sent] + 0.3*similarity_score
        else:
            for sent in matched_scores:
                if len(sent) <= 2: continue
                similarity_score = sent.similarity(question_nlp)
                # print(sent)
                # print(matched_scores[sent])
                final_scores[sent.text] = 0.5*matched_scores[sent] + 0.5*similarity_score
        return final_scores
Example #8
def getPhrases(file, context_pattern):
    new_phrases = set()
    with open(file, 'r') as f:
        matcher = Matcher(nlp.vocab)
        file_chunk = partition(f)
        for t in file_chunk:
            doc = nlp(t)
            for cp in context_pattern:
                pos_indices = [i for i in range(len(cp)) if 'POS' in cp[i]]
                start_offset = min(pos_indices)
                end_offset = max(pos_indices) + 1
                matcher.add("extraction", None, cp)
                matches = matcher(doc)
                for match_id, start, end in matches:
                    span = doc[start+start_offset:start+end_offset].text
                    if span not in new_phrases:
                        new_phrases.add(span)
                matcher.remove("extraction")
    return new_phrases
Example #9
def getPhrases(file, context_pattern):
    new_phrases = set()
    with open(file, 'r') as f:
        matcher = Matcher(nlp.vocab)
        file_chunk = partition(f)
        for t in file_chunk:
            doc = nlp(t)
            for cp in context_pattern:
                offset = 0
                for i in range(len(cp)):
                    if 'POS' in cp[i]:
                        break
                    offset += 1
                matcher.add("extraction", None, cp)
                matches = matcher(doc)
                for match_id, start, end in matches:
                    span = doc[start + offset:end].text
                    if span not in new_phrases:
                        new_phrases.add(span)
    #                     print(span)
                matcher.remove("extraction")
    return new_phrases
Example #10
def is_valid_drivenRequirements(nlp_doc):
    pattern = [{'TEXT': {'REGEX': '(?i)^(when|if|while)$'}}]
    pattern1 = ''
    matcher = Matcher(nlp.vocab)
    matcher.add('validdriven', None, pattern)
    matches = matcher(nlp_doc)
    first_word = str(nlp_doc[0]).lower()
    if matches:
        if first_word == 'when':
            pattern1 = [{'TAG': 'NNP'}, {'TAG': ','}]
        if first_word == 'if':
            pattern1 = [{'TAG': 'VBN'}, {'TAG': ','}]
        if first_word == 'while':
            pattern1 = [{'TAG': 'NN'}, {'TAG': ','}]
        matcher.remove('validdriven')
        matcher.add("commapostion", None, pattern1)
        matches = matcher(nlp_doc)
        if matches:
            return True
        else:
            return False
    else:
        return False
Example #11
def test_matcher_remove():
    nlp = English()
    matcher = Matcher(nlp.vocab)
    text = "This is a test case."

    pattern = [{"ORTH": "test"}, {"OP": "?"}]
    assert len(matcher) == 0
    matcher.add("Rule", None, pattern)
    assert "Rule" in matcher

    # should give two matches
    results1 = matcher(nlp(text))
    assert len(results1) == 2

    # removing once should work
    matcher.remove("Rule")

    # should not return any matches anymore
    results2 = matcher(nlp(text))
    assert len(results2) == 0

    # removing again should throw an error
    with pytest.raises(ValueError):
        matcher.remove("Rule")
Example #12
def run_prdualrank(T_0, unranked_patterns, unranked_phrases, file):

    phrase2id = {}
    for i in range(len(unranked_phrases)):
        phrase2id[unranked_phrases[i]] = i

    id2phrase = {}
    for i in range(len(unranked_phrases)):
        id2phrase[i] = unranked_phrases[i]

    id2pattern = {}
    for i in range(len(unranked_patterns)):
        id2pattern[i] = unranked_patterns[i]

    seedIdwConfidence = {}
    for key, val in phrase2id.items():
        if key in T_0:
            seedIdwConfidence[val] = 0.0

    id2patterns = defaultdict(set)
    pattern2ids = defaultdict(set)

    context_matrix = np.zeros((len(unranked_phrases), len(unranked_patterns)))
    # find c (t, p)
    with open(file, 'r') as f:
        file_chunk = partition(f)
        matcher = Matcher(nlp.vocab)
        for t in file_chunk:
            doc = nlp(t)
            for i in range(len(unranked_patterns)):
                offset = 0
                for pattern_dict in unranked_patterns[i]:
                    if 'POS' in pattern_dict:
                        break
                    offset += 1
                matcher.add("extraction", None, unranked_patterns[i])
                matches = matcher(doc)
                for match_id, start, end in matches:
                    span = doc[start + offset:end].text
                    j = unranked_phrases.index(span)
                    context_matrix[j, i] += 1
                    id2patterns[j].add(i)
                    pattern2ids[i].add(j)
                matcher.remove("extraction")

    id2sup = {}
    pattern2sup = {}

    for id in id2patterns.keys():
        sum = 0
        for col in range(len(unranked_patterns)):
            sum += context_matrix[id, col]
        id2sup[id] = sum

    for pattern in pattern2ids.keys():
        sum = 0
        for row in range(len(unranked_phrases)):
            sum += context_matrix[row, pattern]
        pattern2sup[pattern] = sum

    l1, l2, l3, l4, m1, m2, m3, m4 = prDualRank(seedIdwConfidence, [],
                                                id2patterns,
                                                pattern2ids, {}, {}, {}, {},
                                                id2phrase,
                                                context_matrix.tolist(),
                                                id2sup,
                                                pattern2sup,
                                                FLAGS_VERBOSE=False,
                                                FLAGS_DEBUG=False)

    return l1, l2, l3, l4, m1, m2, m3, m4
Example #13
def manage_speaker(doc_clean, pattern_1, pattern_2, president_1, president_2):

    matcher = Matcher(nlp.vocab)
    matcher.add("speaker", None, pattern_1, pattern_2)
    matches = matcher(doc_clean)

    speaker = []
    i = 0
    for match_id, start, end in matches:
        i += 1
        speaker.append(
            (start + 1, end - 1))  #append span of speaker as list of tuples

    matcher.remove("speaker")

    # find speaker using party matching, vom redner nicht autorisiert, punctuation

    matcher = PhraseMatcher(nlp.vocab, attr="LOWER")
    terms = [
        "(von der rednerin nicht autorisiert):",
        "(vom redner nicht autorisiert):"
    ]

    patterns = [nlp.make_doc(text) for text in terms]
    matcher.add("nicht_autorisiert", None, *patterns)
    matches = matcher(doc_clean)

    for match_id, start, end in matches:
        for i in range(10):
            if doc_clean[start - i].is_sent_start and doc_clean[start - i].pos_ != 'PUNCT':
                #print(doc_clean[start-i:end-1])
                speaker.append((start - i, end - 1))
                break

    matcher.remove("nicht_autorisiert")

    # matcher for (vice)president; 'OP' operator does not behave greedily, thus yielding redundant results
    matcher = Matcher(nlp.vocab)
    matcher.add("president", None, president_1, president_2)
    matches = matcher(doc_clean)

    # filter redundant results so that only longest span is kept; exploit the fact that first result is always the longest
    seen = set()
    keep = []
    for match_id, start, end in matches:
        if end - 1 in seen:
            continue
        else:
            seen.add(end - 1)
            keep.append((start, end - 1))

    pres = keep

    #for start, end in pres:
    #span = doc_clean[start:end]
    #print(span.text, start, end)

    matcher.remove("president")

    # append presidents
    for i in pres:
        speaker.append(i)

    #sort speaker list by first element of the tuples
    speaker.sort(key=operator.itemgetter(0))
    ##all speakers should be here!!
    #for start, end in speaker:
    #print(doc_clean[start:end+4])

    print('there are', len(speaker), 'speakers in this session')

    return speaker
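
The comments above note that the 'OP' quantifier does not match greedily, so the overlapping, redundant president matches are filtered by hand, keeping only the longest span. As an aside (not part of the original code), newer spaCy releases ship spacy.util.filter_spans, which performs this longest-span filtering directly. A minimal sketch, assuming spaCy >= 2.1.4 and an illustrative helper name; it returns plain (start, end) token offsets rather than the (start, end - 1) bookkeeping used above:

from spacy.util import filter_spans

def longest_president_spans(doc_clean, matches):
    # Turn the (match_id, start, end) triples into Span objects, then keep
    # only the longest span from each group of overlapping matches.
    spans = [doc_clean[start:end] for _, start, end in matches]
    return [(span.start, span.end) for span in filter_spans(spans)]
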
Example #14
def manage_interruptions(doc, party, exception):
    matcher = Matcher(nlp.vocab)
    # 'regular' speaker (e.g. Muller (spd):)
    party_parenth = [{"TEXT": "("}, {"LOWER": {"IN": party}}, {"TEXT": ")"}]

    matcher.add("party_parenthesis", None, party_parenth)
    matches = matcher(doc)

    end_p = []
    for match_id, start, end in matches:
        end_p.append(end)

    # identify interruptions in doc and print them
    tmp = []

    for token in doc:
        if (token.text == '(' and doc[token.i + 1].lower_ not in party
                and doc[token.i + 1].lower_ not in exception
                and token.i + 80 < len(doc)):
            for i in range(1, 80):
                # find the closing parenthesis that does not belong to a party match
                if doc[token.i + i].text == ')' and token.i + i + 1 not in end_p:
                    # print(doc[token.i:token.i+i+1], token.i, token.i+i, token.i+i+1)  # interruption, its index, and the next token
                    # store the interruption's span as a (start, end) tuple
                    tmp.append((token.i, token.i + i + 1))

                    break  # stop here to avoid capturing subsequent interruptions, since i runs up to 80 tokens ahead

    matcher.remove("party_parenthesis")

    ##check long (> 50 tokens) interruptions among those above
    for i in range(len(tmp)):
        if (tmp[i][1] - tmp[i][0]) > 50:
            print('wow! This is a very long interruption: -->',
                  doc[tmp[i][0]:tmp[i][1]], '\n')

    # create variable that contains index of each token that is within interruptions
    seen = set()
    t = []
    for i in tmp:
        #print(doc[i[0]: i[1]])  #check it is printing all interruptions
        for token in doc[i[0]:i[1]]:
            if token.i in seen:  # avoid adding parts of already identified interruptions
                continue
            else:
                seen.add(token.i)
                t.append(token.i)

    # define getter function that returns True if a token is part of interruptions
    def is_in_interruption(token):
        in_int = token.i in t
        return in_int

    # set a token custom extension to check whether token is in interruption
    Token.set_extension('is_in_interruption', getter=is_in_interruption)

    #store tokens that are not within interruptions in clean_doc
    clean_doc = []
    for token in doc:
        if not token._.is_in_interruption:
            clean_doc.append(token)

    Token.remove_extension('is_in_interruption')

    # create a new doc object that does not contain interruptions
    doc_clean = nlp(''.join(map(lambda x: x.text_with_ws, clean_doc)))

    return doc_clean
Example #15
pattern_3 = [ { 'LOWER' : 'solar' }, { 'LOWER' : 'power' } ]

matcher.add( 'SolarPower', None, pattern_1, pattern_2, pattern_3 )

doc = nlp( u'The Solar Power industry continues to grow as solarpower increases. Solar-Power is great.' )

def format_matcher( doc, matcher ):
    for match_id, start, end in matcher:
        string_id = nlp.vocab.strings[ match_id ]
        span      = doc[ start : end ]
        print( match_id, string_id, start, end, span.text )

format_matcher( doc, matcher( doc ) )

# Remove Pattern
matcher.remove( 'SolarPower' )

# Compress pattern_2 and pattern_3
pattern_1 = [ { 'LOWER' : 'solarpower' } ]
# '*' means match 0 or more times
pattern_2 = [ { 'LOWER' : 'solar' }, { 'IS_PUNCT' : True, 'OP' : '*' }, { 'LOWER' : 'power' } ]
matcher.add( 'SolarPower', None, pattern_1, pattern_2 )

doc = nlp( u'The Solar Power industry continues to grow as solarpower increases. Solar--Power is great.' )

format_matcher( doc, matcher( doc ) )

from spacy.matcher import PhraseMatcher

matcher = PhraseMatcher( nlp.vocab )
Example #16
def create_dataset_input(rule_based_candidates,
                         mention_context_cache_path,
                         data_folder_path,
                         overall_output_path=None,
                         is_test=False,
                         output_path=None,
                         overwrite_dataset=False):
    """Function to take in the rule based candidates and create
       the input format for the SGTB model. This function is intended
       to be used for processing test data, as the main function in
       this file will convert and save train, dev, and test output.

       @param rule_based_candidates: a list of candidates from the rule based model
       @param mention_context_cache_path: path to a dictionary mapping <pub_id>:<mention_text> pairs to all contexts
       @param data_folder_path: path to the data folder
       @param overall_output_path: path to the overall output folder (optional, used for SGTB training)
       @param is_test: parameter indicating whether or not the data being processed is test data
       @param output_path: the path to write the output to (if not processing test data)
       @param overwrite_dataset: whether or not to overwrite the existing dataset (will be true for train
                                 and false for dev and test)
    """
    scispacy_parser = scispacy_util.SciSpaCyParser()
    prior_entity_probs = compute_entity_probabilities()
    prior_entity_given_mention_probs = compute_entity_given_mention_probs()
    prior_mention_given_entity_probs = compute_mention_given_entity_probs()

    glove_path = os.path.abspath(
        os.path.join("project", "data", "glove", "glove.6B.50d.txt"))
    with open(glove_path, "r") as lines:
        glove = {
            line.split()[0]:
            np.array([float(value) for value in line.split()[1:]])
            for line in lines
        }

    # I haven't run the experiments to tell if having a cache actually helps or not, it takes a while to load
    # the cache when it is used
    # if is_test:
    #     mention_context_cache = {}
    # else:
    #     try:
    #         print("Loading cache...")
    #         mention_context_cache = joblib.load(mention_context_cache_path)["cache"]
    #         print("Cache loaded...")
    #     except:
    #         mention_context_cache = {}
    mention_context_cache = {}

    kb_path = os.path.abspath(os.path.join("project", "data",
                                           "data_sets.json"))
    with open(kb_path) as kb_file:
        kb_json = json.load(kb_file)

    dataset_id_to_kb_entry = {}
    for dataset in kb_json:
        dataset_id_to_kb_entry[dataset["data_set_id"]] = dataset

    matcher = Matcher(scispacy_parser.nlp.vocab)
    section_matcher = Matcher(scispacy_parser.nlp.vocab)
    for section_name in SECTION_STRINGS:
        section_matcher.add(section_name, None,
                            [{"LOWER": section_name}, {"ORTH": "\n"}],
                            [{"LOWER": section_name}, {"ORTH": ":"}],
                            [{"ORTH": "\n"}, {"LOWER": section_name}, {"ORTH": "."}])

    output_docs = []
    pub_ids = []
    # we will write a new file on the first document, and amend to it afterwards
    first_doc = True
    cache_changed = False
    for pub_id in tqdm(rule_based_candidates,
                       desc='create dataset in create_sgtb_dataset.py'):
        spacy_doc = get_scispacy_doc(data_folder_path, pub_id, scispacy_parser)

        pub_ids.append(pub_id)
        doc_candidates = rule_based_candidates[pub_id]
        output_doc = []

        dataset_id_to_longest_mention_text = {}
        for row in doc_candidates:
            mention_text = row["mention"]
            dataset_id = row["candidate_dataset_ids"][0]
            if dataset_id in dataset_id_to_longest_mention_text:
                if len(mention_text) > len(
                        dataset_id_to_longest_mention_text[dataset_id]):
                    dataset_id_to_longest_mention_text[
                        dataset_id] = mention_text
            else:
                dataset_id_to_longest_mention_text[dataset_id] = mention_text

        for row in doc_candidates:
            mention_text = row["mention"]
            dataset_id = row["candidate_dataset_ids"][0]
            # if mention_text != dataset_id_to_longest_mention_text[dataset_id]:
            #     continue

            mention_context_cache_key = str(pub_id) + "_" + mention_text
            if mention_context_cache_key in mention_context_cache:
                mention_contexts = mention_context_cache[
                    mention_context_cache_key]
            else:
                # search for the mention text in the doc
                spacy_mention_text = scispacy_parser.scispacy_create_doc(
                    mention_text)

                pattern = []
                for token in spacy_mention_text:
                    pattern.append({"ORTH": token.text})
                try:
                    matcher.add("MENTION", None, pattern)
                    matches = list(matcher(spacy_doc))
                except ValueError:
                    continue

                # build and save a mapping of <pub_id>_<mention_text> to all contexts the mention
                # is found in
                cache_changed = True
                mention_contexts = []
                token_idx_to_sent_idx = {}
                sentences_list = list(spacy_doc.sents)
                context_size = 3
                for sent_idx, sent in enumerate(sentences_list):
                    for token in sent:
                        token_idx = token.i
                        token_idx_to_sent_idx[token_idx] = sent_idx

                for match_id, start, end in matches:
                    sentence_idx = token_idx_to_sent_idx[start]
                    start_context_sent_idx = max(0,
                                                 sentence_idx - context_size)
                    if start_context_sent_idx == 0:
                        match_sentence_idx = sentence_idx
                    else:
                        match_sentence_idx = context_size
                    end_context_sent_idx = min(len(sentences_list),
                                               sentence_idx + context_size)
                    mention_context = sentences_list[
                        start_context_sent_idx:end_context_sent_idx + 1]
                    sentences_as_docs = []
                    for sentence in mention_context:
                        sentences_as_docs.append(sentence.as_doc())

                    start_context_token_idx = sentences_list[
                        start_context_sent_idx].start
                    end_context_token_idx = sentences_list[end_context_sent_idx
                                                           - 1].end
                    context_with_offsets = (sentences_as_docs,
                                            (start_context_token_idx,
                                             end_context_token_idx),
                                            (start, end), match_sentence_idx)
                    mention_contexts.append(context_with_offsets)

                # limit featurizing to first 3 contexts in order of appearance
                mention_contexts = mention_contexts[:3]
                mention_context_cache[
                    mention_context_cache_key] = mention_contexts

                matcher.remove("MENTION")

            if mention_contexts != []:
                output_mention = create_output_mention(
                    is_test, row, prior_entity_probs,
                    prior_entity_given_mention_probs, mention_text,
                    prior_mention_given_entity_probs, dataset_id_to_kb_entry,
                    mention_contexts, scispacy_parser, glove, spacy_doc,
                    section_matcher)
                output_doc.append(output_mention)

        # only write output to file if not processing test data
        if not is_test:
            if first_doc:
                with open(output_path, "w") as output_file:
                    json.dump(output_doc, output_file)
                    output_file.write("\n")
                first_doc = False

                if overwrite_dataset:
                    with open(overall_output_path, "w") as overall_output_file:
                        json.dump(output_doc, overall_output_file)
                        overall_output_file.write("\n")
            else:
                with open(output_path, "a") as output_file:
                    json.dump(output_doc, output_file)
                    output_file.write("\n")

                with open(overall_output_path, "a") as overall_output_file:
                    json.dump(output_doc, overall_output_file)
                    overall_output_file.write("\n")

        output_docs.append(json.loads(json.dumps(output_doc)))

    # if cache_changed and not is_test:
    #     joblib.dump({"cache": mention_context_cache}, mention_context_cache_path)
    return output_docs, pub_ids
Example #17
def patternSearch(T_0, file):
    phrase_patterns = set()
    seed_pattern = [nlp(x) for x in T_0]
    phrase_matcher = PhraseMatcher(nlp.vocab)
    phrase_matcher.add('pattern search', None, *seed_pattern)
    # find occurrences of seed phrases
    with open(file, "r") as f:
        document = nlp(f.read().lower())
        matches = phrase_matcher(document)
        for match_id, start, end in matches:
            p = tuple((start, end))
            if p not in phrase_patterns:
                phrase_patterns.add(p)
    # find patterns around seed phrases
    unranked_patterns = []
    with open(file, "r") as f:
        text = nlp(f.read().lower())
        for phrase_pattern in phrase_patterns:
            start = phrase_pattern[0]
            end = phrase_pattern[1]
            if (text[start - 1].text == '\n'):
                continue
            # add context pattern
            tmp = []
            for i in range(2, 0, -1):
                tmp.append({"TEXT": text[start - i].text})
            # add content pattern
            span = text[start:end]
            for token in span:
                tmp.append({"POS": token.pos_})
            if tmp not in unranked_patterns:
                unranked_patterns.append(tmp)
                print(tmp)
    unranked_phrases = list(getPhrases(file, unranked_patterns))
    # build context graph
    context_graph = nx.Graph()
    # add tuples and patterns into graph
    for i in range(len(unranked_phrases)):
        node = 't' + str(i)
        context_graph.add_node(node, pos=(0, i))
    for i in range(len(unranked_patterns)):
        node = 'p' + str(i)
        context_graph.add_node(node, pos=(2, i))
    context_matrix = np.zeros((len(unranked_phrases), len(unranked_patterns)))
    # find c (t, p)
    with open(file, 'r') as f:
        t = f.read().lower()
        matcher = Matcher(nlp.vocab)
        doc = nlp(t)
        for i in range(len(unranked_patterns)):
            matcher.add("extraction", None, unranked_patterns[i])
            matches = matcher(doc)
            for match_id, start, end in matches:
                span = doc[start + 2:end].text
                j = unranked_phrases.index(span)
                context_matrix[j, i] += 1
            matcher.remove("extraction")
    # add context nodes into graph
    c_count = 0
    for i in range(context_matrix.shape[0]):
        for j in range(context_matrix.shape[1]):
            if context_matrix[i, j] != 0:
                occur = context_matrix[i, j]
                node_t = 't' + str(i)
                node_p = 'p' + str(j)
                node_c = 'c' + str(c_count)
                c_count += 1
                context_graph.add_node(node_c, pos=(1, c_count))
                context_graph.add_edge(node_t, node_c, weight=occur)
                context_graph.add_edge(node_c, node_p, weight=occur)
    # draw context graph
    plt.figure()
    pos = nx.get_node_attributes(context_graph, 'pos')
    nx.draw(context_graph, pos, with_labels=True)
    labels = nx.get_edge_attributes(context_graph, 'weight')
    nx.draw_networkx_edge_labels(context_graph, pos, edge_labels=labels)
    # return patterns
    return unranked_phrases
Example #18
for match_id, start, end in found_matches:
    string_id = nlp.vocab.strings[match_id]  # get the string representation
    span = doc[start:end]  # get where the identified word starts and ends
    print(match_id,string_id, start, end, span.text)


matcher.remove('ArtificialIntelligence')


#artificialintelligence ArtificialIntelligence
pattern1 = [{'LOWER': 'artificialintelligence'}]
# Artificial.Intelligence (punctuation allowed between the two words)
pattern2 = [{'LOWER': 'artificial'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'intelligence'}]
Example #19
class RefMatcher:
    def __init__(self, nlp):
        self.nlp = nlp
        self.matcher = Matcher(nlp.vocab)

    def clean_matcher(self):
        # no native method to clean spaCy matcher
        # or retrieve pattern names
        # so always add ints, starting from zero
        # and clean ints from 0 till not found
        i = 0
        while len(self.matcher) > 0 and i < 100:
            if i in self.matcher:
                self.matcher.remove(i)
            i += 1

    @staticmethod
    def is_negative(p):
        if "OP" in p and p["OP"] == "!":
            return True
        return False

    @staticmethod
    def is_droppable(p):
        if "OP" in p and p["OP"] in ["*", "?"]:
            return True
        return False

    @staticmethod
    def is_multitoken(p):
        if "OP" in p and p["OP"] in ["*", "+"]:
            return True
        return False

    def remove_skipped_ops(self, span, pattern):
        skipped_idx = []

        op_tokens = [
            i for (i, p) in enumerate(pattern) if RefMatcher.is_droppable(p)
        ]

        for op in op_tokens:
            op_pattern = copy.deepcopy(pattern)
            # remove "?" to require 1 instead of 0
            if op_pattern[op]["OP"] == "?":
                if len(op_pattern[op]) == 1:
                    # if no more props,
                    # add dummy string that will never match
                    # since its not 1 token :)
                    op_pattern[op]["TEXT"] = "alice and bob"
                    op_pattern[op]["OP"] = "!"
                del op_pattern[op]["OP"]
            # change "*" to "+", to require 1+ instead of 0+
            elif op_pattern[op]["OP"] == "*":
                op_pattern[op]["OP"] = "+"
            self.matcher.add(op, None, op_pattern)

        # check whether it still matches
        matches = self.matcher(span.as_doc())
        max_matches = [
            m for (m, s, e) in matches if (s == 0) and (e == len(span))
        ]

        # clean the matcher
        self.clean_matcher()

        non_op_pattern = []
        for i, p in enumerate(pattern):
            # is optional
            if "OP" in p:
                # but not found
                if not i in max_matches and not RefMatcher.is_negative(p):
                    # => to do marked non matched, skip
                    skipped_idx.append(i)
                    continue
                else:
                    if p["OP"] == "+":
                        if len(p) == 1:
                            # if no more props,
                            # add dummy string that will never match
                            # since its not 1 token :)
                            p["TEXT"] = "alice and bob"
                            p["OP"] = "!"
                        else:
                            del p["OP"]
                    elif p["OP"] == "*":
                        p["OP"] = "+"
            non_op_pattern.append(p)

        return non_op_pattern, skipped_idx

    def insert_empty_idx(self, pattern_ref, idx):
        pattern_ref_insert = {}
        for p, v in pattern_ref.items():
            if p >= idx:
                pattern_ref_insert[p + 1] = v
            else:
                pattern_ref_insert[p] = v
        pattern_ref_insert[idx] = []
        return pattern_ref_insert

    def shift_pattern_ref(self, pattern_ref, skipped_idx):
        for idx in skipped_idx:
            pattern_ref = self.insert_empty_idx(pattern_ref, idx)
        return pattern_ref

    def __call__(self, span, orig_pattern):

        pattern = copy.deepcopy(orig_pattern)

        # remove props not supported by SpaCy matcher:
        for p in pattern:
            if "TEMPLATE_ID" in p:
                del p["TEMPLATE_ID"]

        # case I: tokens <-> patterns
        # if lengths match
        # if no OP
        # => everything has been matched
        if len(span) == len(pattern) and not any(["OP" in p for p in pattern]):
            return {k: [k] for k in range(len(pattern))}

        # check which tokens are matched, remove non matched
        non_op_pattern, skipped_idx = self.remove_skipped_ops(span, pattern)

        # case II:
        # if lengths match
        # if no multitoken OPs
        # => everything has been matched
        if len(span) == len(non_op_pattern) and not any(
            [RefMatcher.is_multitoken(p) for p in non_op_pattern]):
            pattern_ref = {k: [k] for k in range(len(non_op_pattern))}
            return self.shift_pattern_ref(pattern_ref, skipped_idx)

        # case III:
        # worst case
        # get shifts for multitokens
        # ie rematching cropped spans and patterns

        # A. get cropped patterns
        for i in range(len(non_op_pattern)):
            self.matcher.add(i, None, non_op_pattern[i:])

        # B. get cropped spans
        docs = [span[i:].as_doc() for i in range(len(span))]

        # C. rematch
        matches = self.matcher.pipe(docs,
                                    batch_size=len(span),
                                    return_matches=True)

        # D. get pattern_ref
        pattern_ref = {}

        for i, (d, m) in enumerate(matches):
            # take max span match for doc
            if len(m):
                # len 0 shouldn't happen except weird white spaces
                m_id, m_start, m_end = max(m, key=lambda x: x[2] - x[1])

                # if cropped span matches cropped pattern
                # 1st token of cropped span belongs to 1st cropped pattern item
                if not m_id in pattern_ref:
                    pattern_ref[m_id] = [i]
                else:
                    # no changes in pattern
                    # pattern item had more tokens matched
                    # ex. "very fast ..." & "fast ... "
                    # matched with {"POS": "ADJ", "OP": "+"} ...
                    pattern_ref[m_id].append(i)

        # clean
        self.clean_matcher()

        # shift by skipped ops
        pattern_ref = self.shift_pattern_ref(pattern_ref, skipped_idx)
        return pattern_ref
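
The clean_matcher comments above describe a workaround: this spaCy version has no call to list a matcher's rule names or clear it, so rules are registered under consecutive integer keys and later removed blindly. A minimal standalone sketch of that bookkeeping, assuming the same spaCy 2.x Matcher API used throughout these examples (the demo_patterns name and patterns are purely illustrative):

import spacy
from spacy.matcher import Matcher

nlp = spacy.blank("en")
matcher = Matcher(nlp.vocab)

# Register each rule under its list index instead of a string name.
demo_patterns = [[{"LOWER": "alice"}], [{"LOWER": "bob"}]]
for i, pattern in enumerate(demo_patterns):
    matcher.add(i, None, pattern)

# Later, sweep the integer keys until the matcher is empty,
# without having to remember which rules were added.
i = 0
while len(matcher) > 0 and i < 100:
    if i in matcher:
        matcher.remove(i)
    i += 1
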
Example #20
class AbbreviationDetector:
    """
    Detects abbreviations using the algorithm in "A simple algorithm for identifying
    abbreviation definitions in biomedical text.", (Schwartz & Hearst, 2003).

    This class sets the `._.abbreviations` attribute on spaCy Doc.

    The abbreviations attribute is a `List[Span]` where each Span has the `Span._.long_form`
    attribute set to the long form definition of the abbreviation.

    Note that this class does not replace the spans, or merge them.
    """
    def __init__(self, nlp) -> None:
        Doc.set_extension("abbreviations", default=[], force=True)
        Span.set_extension("long_form", default=None, force=True)

        self.matcher = Matcher(nlp.vocab)
        self.matcher.add("parenthesis", None, [{
            "ORTH": "("
        }, {
            "OP": "+"
        }, {
            "ORTH": ")"
        }])
        self.global_matcher = Matcher(nlp.vocab)

    def find(self, span: Span, doc: Doc) -> Tuple[Span, Set[Span]]:
        """
        Functional version of calling the matcher for a single span.
        This method is helpful if you already have an abbreviation which
        you want to find a definition for.
        """
        dummy_matches = [(-1, int(span.start), int(span.end))]
        filtered = filter_matches(dummy_matches, doc)
        abbreviations = self.find_matches_for(filtered, doc)

        if not abbreviations:
            return span, set()
        else:
            return abbreviations[0]

    def __call__(self, doc: Doc) -> Doc:
        matches = self.matcher(doc)
        matches_no_brackets = [(x[0], x[1] + 1, x[2] - 1) for x in matches]
        filtered = filter_matches(matches_no_brackets, doc)
        occurences = self.find_matches_for(filtered, doc)

        for (long_form, short_forms) in occurences:
            for short in short_forms:
                short._.long_form = long_form
                doc._.abbreviations.append(short)
        return doc

    def find_matches_for(self, filtered: List[Tuple[Span, Span]],
                         doc: Doc) -> List[Tuple[Span, Set[Span]]]:
        rules = {}
        all_occurences: Dict[Span, Set[Span]] = defaultdict(set)
        already_seen_long: Set[str] = set()
        already_seen_short: Set[str] = set()
        for (long_candidate, short_candidate) in filtered:
            short, long = find_abbreviation(long_candidate, short_candidate)
            # We need the long and short form definitions to be unique, because we need
            # to store them so we can look them up later. This is a bit of a
            # pathological case also, as it would mean an abbreviation had been
            # defined twice in a document. There's not much we can do about this,
            # but at least the case which is discarded will be picked up below by
            # the global matcher. So it's likely that things will work out ok most of the time.
            new_long = long.string not in already_seen_long if long else False
            new_short = short.string not in already_seen_short
            if long is not None and new_long and new_short:
                already_seen_long.add(long.string)
                already_seen_short.add(short.string)
                all_occurences[long].add(short)
                rules[long.string] = long
                # Add a rule to a matcher to find exactly this substring.
                self.global_matcher.add(long.string, None, [{
                    "ORTH": x.text
                } for x in short])
        to_remove = set()
        global_matches = self.global_matcher(doc)
        for match, start, end in global_matches:
            string_key = self.global_matcher.vocab.strings[match]
            to_remove.add(string_key)
            all_occurences[rules[string_key]].add(doc[start:end])
        for key in to_remove:
            # Clean up the global matcher.
            self.global_matcher.remove(key)

        return list((k, v) for k, v in all_occurences.items())
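
A minimal usage sketch for the AbbreviationDetector above. It assumes the scispacy helpers it calls (find_abbreviation, filter_matches) are importable alongside the class and that a blank English pipeline suffices for tokenization; the sample sentence is only illustrative:

import spacy

nlp = spacy.blank("en")
detector = AbbreviationDetector(nlp)

doc = detector(nlp("Spinal and bulbar muscular atrophy (SBMA) is an inherited disease."))
for short in doc._.abbreviations:
    # Each abbreviation span carries its long form via the custom extension.
    print(short.text, "->", short._.long_form.text)
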
Example #21
import spacy
from spacy.matcher import Matcher
import os.path
import io

data_folder = os.path.join(
    "/Users/juangarciaberdoy/Documents/GitHub/philhistcomp/projects",
    "piraha_language")
file_to_open = os.path.join(data_folder, "corpus.txt")
ff = io.open(file_to_open, 'r', encoding='utf-8')

nlp = spacy.load("en_core_web_sm")
matcher = Matcher(nlp.vocab)

patterns = [[{"LOWER": "one"}], [{"LOWER": "is"}]]

doc = nlp(ff.read())
for pattern in patterns:
    print(pattern)
    matcher.add("tempId", None, pattern)
    matches = matcher(doc)
    print(len(matches))
    matcher.remove("tempId")
Example #22
class NlpService(nlp_pb2_grpc.NlpServicer):
    def __init__(self):
        self.modelName = None
        self.nlp = None
        self.matcher = None

    def LoadModel(self, request, context):
        self.modelName = request.text
        self.nlp = spacy.load(request.text)
        response = nlp_pb2.TextResponse()
        response.message = "Model loaded '{}'".format(request.text)
        return response

    def NlpProcess(self, request, context):
        doc = self.nlp(request.text)
        response = utils.doc2proto(doc, self.modelName)
        return response

    def DocSimilarity(self, request, context):
        docA = self.nlp(request.texta)
        docB = self.nlp(request.textb)
        response = nlp_pb2.TextSimilarity()
        response.similarity = docA.similarity(docB)
        return response

    def AddRule(self, request, context):
        if self.matcher == None:
            self.matcher = Matcher(self.nlp.vocab)
        matcher_id = request.id
        patterns = [{pat.key: pat.value} for pat in request.patterns]
        self.matcher.add(matcher_id, None, patterns)
        response = nlp_pb2.TextResponse()
        response.message = "Rule with id '{}' added to matcher.".format(
            matcher_id)
        return response

    def RemoveRule(self, request, context):
        if self.matcher == None:
            return nlp_pb2.TextResponse(message="No rules exists with matcher")
        self.matcher.remove(request.text)
        return nlp_pb2.TextResponse(
            message="Rule with id '{}' removed from matcher.".format(
                request.text))

    def GetRule(self, request, context):
        if self.matcher == None:
            return nlp_pb2.TextResponse(message="No rules exists with matcher")
        _, patterns = self.matcher.get(request.text)
        return nlp_pb2.Rule(
            id=request.text,
            patterns=[
                nlp_pb2.Pattern(key=list(pat.keys())[0],
                                value=list(pat.values())[0])
                for pat in patterns[0]
            ],
        )

    def GetMatches(self, request, context):
        doc = self.nlp(request.text)
        matches = self.matcher(doc)
        reponse = nlp_pb2.Matches(matches=[
            nlp_pb2.Match(id=str(i[0]), start=i[1], end=i[2]) for i in matches
        ])
        return reponse

    def ResetMatcher(self, request, context):
        self.matcher = None
        return nlp_pb2.TextResponse(message="Matcher object reset successful.")
Example #23
class Classifier():
    def __init__(self, inferenceEngine, colorFile="corpora/colors.csv", sizeFile="corpora/sizes.txt", shapeFile="corpora/shapes.txt", nerModel="models/nerModel"):
        self.query = ""
        self.nlp = spacy.load('en')
        ner = spacy.load(nerModel).pipeline[0][1]
        self.nlp.replace_pipe("ner", ner)

        self.inferenceEngine = inferenceEngine

        self.matcher = Matcher(self.nlp.vocab)
        self.lemmatizer = Lemmatizer(LEMMA_INDEX, LEMMA_EXC, LEMMA_RULES)
        self.scene = {
            "objects": [],
            "backgrounds": []
        }
        self.subjects = {}
        self.referenceWords = ["the", "it", "that", "his", "hers", "theirs"]
        self.colors = {}
        with open(colorFile, "r") as colorReader:
            for line in colorReader:
                colorValue = line.split(",")
                self.colors[colorValue[0].lower()] = colorValue[1].strip("\n")

        self.sizes = {}
        with open(sizeFile, "r") as sizeReader:
            for line in sizeReader:
                line = line.strip().lower()
                sizeValue = line.split(",")
                self.sizes[sizeValue[0]] = sizeValue[1].strip("\n")

        self.shapes = []
        with open(shapeFile, "r")  as shapeReader:
            self.shapes = [shape.strip().lower() for shape in shapeReader]

    def getBlankObject(self):
        identifiedObject = {}
        identifiedObject["subject"] = None
        identifiedObject["modifiers"] = {}
        identifiedObject["modifiers"]["color"] = None
        identifiedObject["modifiers"]["shape"] = None
        identifiedObject["modifiers"]["size"] = None
        identifiedObject["modifiers"]["quantity"] = 1
        return identifiedObject

    def classifyDescriptors(self, descriptors):
        classifiedDescriptors = {}
        pastRef = False
        classifiedDescriptors["color"] = set()
        classifiedDescriptors["size"] = set()
        classifiedDescriptors["shape"] = set()
        classifiedDescriptors["quantity"] = 1
        classifiedDescriptors["entity"] = None
        for descriptor in descriptors:
            lemma = descriptor.lemma_.lower()
            if lemma in self.referenceWords:
                pastRef = True
            elif descriptor.text.lower() in self.colors:
                classifiedDescriptors["color"].add(self.colors[descriptor.text.lower()])
            elif lemma in self.sizes:
                classifiedDescriptors["size"].add(float(self.sizes[lemma]))
            elif lemma in self.shapes:
                classifiedDescriptors["shape"].add(lemma)
            elif descriptor.pos_ == "NUM":
                classifiedDescriptors["quantity"] = descriptor.lemma_
        return (classifiedDescriptors, pastRef)

    def addSubjectDescriptors(self, subject, descriptors, subjectEntType=None, pronoun=False):
        subject = self.lemmatizer(subject, "NOUN")[0]
        descriptors, pastRef = self.classifyDescriptors(descriptors)
        if subject not in self.subjects:
            self.subjects[subject] = [descriptors]
        else:
            # TODO: If past ref and referring to multiple quantities, then get lemma of subject and modify all subjects being referred to
            # TODO: Compare descriptors to existing descriptors and choose the one that best fits, preferring the most recent
            if pastRef or pronoun:
                for propertyName, props in self.subjects[subject][-1].items():
                    if isinstance(props, set):
                        self.subjects[subject][-1][propertyName] = self.subjects[subject][-1][propertyName].union(descriptors[propertyName])
            else:
                self.subjects[subject].append(descriptors)
        if subjectEntType:
            for individual in self.subjects[subject]:
                if "entity" not in individual or not individual["entity"]:
                    individual["entity"] = subjectEntType

    def detectBackground(self, match):
        return "entity" in match and match["entity"] in ["GPE", "LOC", "EVENT", "FAC"]

    def addSubjectsToScene(self):
        for subject, matches in self.subjects.items():
            for match in matches:
                appendTo = "objects"
                if self.detectBackground(match):
                    appendTo = "backgrounds"
                match.pop("entity", None)
                self.scene[appendTo].append({
                    "subject": subject,
                    "modifiers": match
                })

    def inferContext(self):
        for object in self.scene["objects"]:
            descriptiveWords = self.inferenceEngine.getDescriptiveWords(object["subject"])
            matchingColors = []
            matchingSizes = []
            for word in descriptiveWords:
                word = word.lower()
                lemma = self.lemmatizer(word, "ADJ")[0]
                if word in self.colors:
                    matchingColors.append(self.colors[word])
                if not object["modifiers"]["size"] and lemma in self.sizes:
                    matchingSizes.append(float(self.sizes[lemma]))
            if matchingColors and not object["modifiers"]["color"]:
                object["modifiers"]["color"] = {random.choice(matchingColors)}
            if matchingSizes and not object["modifiers"]["size"]:
                object["modifiers"]["size"] = {random.choice(matchingSizes)}

    def addUniqueMatches(self, doc, subject, pronoun=False):
        matchedRanges = []
        for match_id, start, end in self.matcher(doc):
            skipMatch = False
            for prevStart, prevEnd in matchedRanges:
                if start >= prevStart and end <= prevEnd:
                    skipMatch = True
                    break
            if skipMatch:
                continue
            matchedRanges.append((start, end))
            print("match", doc[start:end])
            self.addSubjectDescriptors(subject, [token for token in doc[start:end] if token.text != subject], pronoun=pronoun)

    def matchPattern(self, doc, pattern, subject, pronoun=False):
        self.matcher.add(subject, None, pattern)
        self.addUniqueMatches(doc, subject, pronoun=pronoun)
        self.matcher.remove(subject)

    def matchPatterns(self, sentence):
        doc = self.nlp(sentence.text)
        for subject in self.subjects:
            pattern = [{'POS': 'DET', 'OP': '?'}, {'POS': 'ADJ', 'OP': '*'}, {'LOWER': subject}, {'LEMMA': 'be'}, {'POS': 'ADJ'}]
            self.matchPattern(doc, pattern, subject)

        pattern = [{'LEMMA': '-PRON-'}, {'LEMMA': 'be'}, {'POS': 'ADJ'}]

        subject = ""
        for subject in list(self.subjects)[::-1]:
            if not self.detectBackground(self.subjects[subject][-1]):
                self.matchPattern(doc, pattern, subject, pronoun=True)
                break

    def classify(self, query):
        self.scene = {
            "objects": [],
            "backgrounds": []
        }
        self.subjects = {}
        doc = self.nlp(query)
        for i, sentence in enumerate(doc.sents):
            for chunk in sentence.noun_chunks:
                subject = chunk.root.text
                if chunk.root.lemma_ == "-PRON-":
                    continue
                descriptors = [word for word in chunk if word.text != subject]
                self.addSubjectDescriptors(subject, descriptors, chunk.root.ent_type_)

            self.matchPatterns(sentence)

        self.addSubjectsToScene()
        self.inferContext()
        return self.scene
Example #24
matcher.add('Solar', None, pattern1, pattern2, pattern3)

doc = nlp(
    u'The solar-power industry is growing up now-a-days. Solar Power distribution is handled by govt and now solarpower is getting popular day by day'
)

find = matcher(doc)

#print(find)

for matched_id, start, end in find:
    string_id = nlp.vocab.strings[matched_id]  # the name of the matched pattern
    span = doc[start:end]  # the matched span of text
    print(matched_id, string_id, start, end, span)

# Remove the saved pattern from the matcher object

matcher.remove('Solar')

pattern1 = [{'LOWER': 'solarpower'}]
# the '*' means the punctuation token can appear zero or more times
pattern2 = [{'LOWER': 'solar'}, {'IS_PUNCT': True, 'OP': '*'}, {'LOWER': 'power'}]