def run_prdualrank(T_0, unranked_patterns, unranked_phrases, file):
    """Count phrase/pattern co-occurrences in a corpus and run prDualRank.

    Every chunk produced by ``partition`` over the opened corpus is parsed
    with ``nlp`` and matched against each pattern in turn.  Each match
    increments ``context_matrix[phrase_id, pattern_id]``, where the phrase
    id is the position of the matched text in ``unranked_phrases``.

    Args:
        T_0: Container of seed phrases.  Every phrase of
            ``unranked_phrases`` that is in it is seeded with confidence
            ``0.0``.
        unranked_patterns: Sequence of token patterns, each a sequence of
            token-spec dicts handed to the matcher.  The position in the
            sequence is the pattern id.
        unranked_phrases: Sequence of candidate phrase strings.  The
            position in the sequence is the phrase id.
        file: Path of the text corpus, opened in text read mode.

    Returns:
        The eight values returned by ``prDualRank``, as a tuple, in the
        order they were returned.

    Raises:
        ValueError: If the text of a matched span is not an entry of
            ``unranked_phrases``.
    """
    id2phrase = dict(enumerate(unranked_phrases))

    # For a repeated phrase the last position wins here, so that is the id
    # that gets seeded.
    phrase2id = {phrase: idx for idx, phrase in enumerate(unranked_phrases)}
    seedIdwConfidence = {
        idx: 0.0 for phrase, idx in phrase2id.items() if phrase in T_0
    }

    # First position of each phrase: a constant-time replacement for
    # ``unranked_phrases.index(span)`` that yields the same index.
    first_position = {}
    for idx, phrase in enumerate(unranked_phrases):
        first_position.setdefault(phrase, idx)

    # Per pattern: number of leading token specs before the first one that
    # has a 'POS' key.  Those leading tokens are cut off the matched span.
    # This depends only on the pattern, so it is computed once up front.
    span_offsets = []
    for pattern in unranked_patterns:
        offset = 0
        for token_spec in pattern:
            if 'POS' in token_spec:
                break
            offset += 1
        span_offsets.append(offset)

    id2patterns = defaultdict(set)
    pattern2ids = defaultdict(set)

    context_matrix = np.zeros((len(unranked_phrases), len(unranked_patterns)))
    # find c (t, p)
    with open(file, 'r') as f:
        file_chunk = partition(f)
        matcher = Matcher(nlp.vocab)
        for text in file_chunk:
            doc = nlp(text)
            for pattern_id, pattern in enumerate(unranked_patterns):
                offset = span_offsets[pattern_id]
                # NOTE(review): add(key, None, pattern) looks like the
                # spaCy v2 call form (on_match callback in second
                # position) - confirm against the pinned spaCy version.
                matcher.add("extraction", None, pattern)
                for _match_id, start, end in matcher(doc):
                    span = doc[start + offset:end].text
                    phrase_id = first_position.get(span)
                    if phrase_id is None:
                        raise ValueError(
                            "%r is not in unranked_phrases" % (span,))
                    context_matrix[phrase_id, pattern_id] += 1
                    id2patterns[phrase_id].add(pattern_id)
                    pattern2ids[pattern_id].add(phrase_id)
                # Only one pattern is registered at a time so that every
                # match can be attributed to pattern_id.
                matcher.remove("extraction")

    # Support = total number of matches per phrase (row) / pattern (column).
    # Only phrases and patterns that matched at least once get an entry.
    phrase_totals = context_matrix.sum(axis=1)
    pattern_totals = context_matrix.sum(axis=0)
    id2sup = {phrase_id: phrase_totals[phrase_id] for phrase_id in id2patterns}
    pattern2sup = {
        pattern_id: pattern_totals[pattern_id] for pattern_id in pattern2ids
    }

    l1, l2, l3, l4, m1, m2, m3, m4 = prDualRank(seedIdwConfidence, [],
                                                id2patterns,
                                                pattern2ids, {}, {}, {}, {},
                                                id2phrase,
                                                context_matrix.tolist(),
                                                id2sup,
                                                pattern2sup,
                                                FLAGS_VERBOSE=False,
                                                FLAGS_DEBUG=False)

    return l1, l2, l3, l4, m1, m2, m3, m4
# Example no. 2
# 0
def run_prdualrank(T_0, unranked_patterns, unranked_phrases, file):
    """Run prDualRank on a context matrix loaded from a pickle file.

    The phrase/pattern matrix is read from
    ``../development_ipynbs/context_matrix.pickle`` and indexed as
    ``context_matrix[phrase_id, pattern_id]``.  A phrase and a pattern are
    linked whenever their cell is greater than zero.

    Args:
        T_0: Container of seed phrases.  Every phrase of
            ``unranked_phrases`` that is in it is seeded with confidence
            ``0.0``.
        unranked_patterns: Sequence of patterns; only its length is used
            here.  The position in the sequence is the pattern id.
        unranked_phrases: Sequence of candidate phrase strings.  The
            position in the sequence is the phrase id.
        file: Not used by this function; kept in the signature for callers.

    Returns:
        The eight values returned by ``prDualRank``, as a tuple, in the
        order they were returned.

    Raises:
        IndexError: If the loaded matrix is not two-dimensional or has
            fewer rows than phrases or fewer columns than patterns.
    """
    # Local import: this function needs numpy only for the vectorised
    # scans below.
    import numpy as np

    num_phrases = len(unranked_phrases)
    num_patterns = len(unranked_patterns)

    id2phrase = dict(enumerate(unranked_phrases))

    # For a repeated phrase the last position wins here, so that is the id
    # that gets seeded.
    phrase2id = {phrase: idx for idx, phrase in enumerate(unranked_phrases)}
    seedIdwConfidence = {
        idx: 0.0 for phrase, idx in phrase2id.items() if phrase in T_0
    }

    # NOTE(security): pickle.load can execute arbitrary code contained in
    # the file.  Only load a matrix that was produced locally and is trusted.
    with open('../development_ipynbs/context_matrix.pickle', 'rb') as f:
        context_matrix = pickle.load(f)
        print("[LOG] Loaded the context matrix. Shape: " +
              str(context_matrix.shape))

    counts = np.asarray(context_matrix)
    if (counts.ndim != 2 or counts.shape[0] < num_phrases
            or counts.shape[1] < num_patterns):
        raise IndexError(
            "context matrix of shape %s does not cover %d phrases x %d "
            "patterns" % (counts.shape, num_phrases, num_patterns))
    # Restrict to the part addressed by the given phrases and patterns.
    counts = counts[:num_phrases, :num_patterns]

    id2patterns = defaultdict(set)
    pattern2ids = defaultdict(set)
    # Scanning the transpose visits the positive cells pattern by pattern
    # and, within a pattern, by ascending phrase id.  .tolist() turns the
    # indices into plain ints before they become dict keys.
    pattern_ids, phrase_ids = np.nonzero(counts.T > 0)
    for pattern_id, phrase_id in zip(pattern_ids.tolist(),
                                     phrase_ids.tolist()):
        id2patterns[phrase_id].add(pattern_id)
        pattern2ids[pattern_id].add(phrase_id)

    # Support = row total per phrase / column total per pattern.  Entries
    # without any positive cell keep a support of 0.
    phrase_totals = counts.sum(axis=1)
    pattern_totals = counts.sum(axis=0)

    id2sup = dict.fromkeys(range(num_phrases), 0)
    for phrase_id in id2patterns:
        id2sup[phrase_id] = phrase_totals[phrase_id]

    pattern2sup = dict.fromkeys(range(num_patterns), 0)
    for pattern_id in pattern2ids:
        pattern2sup[pattern_id] = pattern_totals[pattern_id]

    print("[LOG] Initiating PR Dual Rank inference.")
    l1, l2, l3, l4, m1, m2, m3, m4 = prDualRank(seedIdwConfidence, [],
                                                id2patterns,
                                                pattern2ids, {}, {}, {}, {},
                                                id2phrase,
                                                context_matrix.tolist(),
                                                id2sup,
                                                pattern2sup,
                                                FLAGS_VERBOSE=True,
                                                FLAGS_DEBUG=True)
    print("[LOG] Ended PR Dual Rank inference.")

    return l1, l2, l3, l4, m1, m2, m3, m4