Example #1
def getIsDominatedScore(sequence, superspan_sequence, model=model):
    """Count how many textual units in each superspan are covered by
    close neighbours of the aligned tagged concept."""
    score = 0
    for concept, superspan in zip(sequence, superspan_sequence):
        if re_concept_tagged.match(concept):
            concept = concept.lower()
            covered_concepts = set()
            try:
                # Neighbours of the concept whose similarity clears the
                # threshold and whose normalized form contains the
                # normalized concept.
                covered_neighbor_word2sim = {
                    covered_concept.lower(): sim
                    for covered_concept, sim in model.most_similar(
                        model.wv.index2word[vocab_lower[concept].index],
                        topn=TOPN,
                        restrict_vocab=restrict_vocab,
                        partition_only=True)
                    if sim > BASIC_THRESHOLD and normalize_concept(concept) in
                    normalize_concept(covered_concept)
                }
                for other_concept in getNormalizedTextualUnits(superspan):
                    if other_concept.lower() in covered_neighbor_word2sim:
                        covered_concepts.add(other_concept)

                score += len(covered_concepts)
            except Exception:
                # Concept missing from the vocabulary (or any other lookup
                # error): skip it and leave the score unchanged.
                continue

    return score
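A minimal call sketch, assuming the module-level names the function relies on (model, vocab_lower, TOPN, BASIC_THRESHOLD, restrict_vocab, normalize_concept, getNormalizedTextualUnits, re_concept_tagged) are already defined; the input sequences below are made up for illustration only.

# Hypothetical aligned inputs: each position pairs a token (possibly a
# tagged concept) with the superspan text it appears in.
sequence = ['<c>positive_definite_matrix</c>', 'is', 'symmetric']
superspan_sequence = ['positive definite matrix', 'is', 'symmetric']
print(getIsDominatedScore(sequence, superspan_sequence))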
def process():
    # Write one "<concept>\t<features>" line per tagged concept in the
    # vocabulary, and keep the features in concept_feature_dict as well.
    with open(concept_feature_path, 'w') as f_out:
        for i, w in enumerate(model.wv.index2word):
            if i % 1000 == 0:
                logging.debug('%sth concept' % i)
            if re_concept_tagged.match(w):
                concept_feature_dict[w] = computeFeatures(w, model)
                f_out.write('%s\t%s\n' %
                            (display_concept(w), concept_feature_dict[w]))

    # Persist the whole feature dict; open in binary mode for pickling.
    with open('data/%s/concept_feature_dict.bin' % dataset, 'wb') as f_bin:
        cPickle.dump(concept_feature_dict, f_bin)
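A short sketch of reloading the dumped features later; the path and the dataset variable are taken from the snippet above, and the binary mode matches the dump.

# Sketch: read the pickled feature dict back for downstream use.
with open('data/%s/concept_feature_dict.bin' % dataset, 'rb') as f_bin:
    concept_feature_dict = cPickle.load(f_bin)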
Example #3
import re
import sys
import cPickle
from itertools import groupby

from gensim.models import Word2Vec

concept_representation_path = './data/%s/concept_representation.txt' % dataset

if len(sys.argv) > 3:
    concept_representation_path = sys.argv[3]

model_save_path = './data/%s/embedding_tmp.bin' % dataset
model = Word2Vec.load(model_save_path)

model_for_score = Word2Vec.load(model_save_path)

concept_score_path = './data/%s/score_list.bin' % dataset
with open(concept_score_path, 'rb') as f_score:
    concept_score_list = cPickle.load(f_score)

re_concept_tagged = re.compile(r"<c>(?P<phrase>[^<]*)</c>")

# Tagged concepts in vocabulary order; the scores were computed in the
# same order, so zip aligns concept and score.
concept_list = [w for w in model.wv.index2word if re_concept_tagged.match(w)]
concept2score = dict(
    zip(concept_list[:len(concept_score_list)], concept_score_list))

# Map each lowercased concept to the best score among its case variants;
# sort and group by the lowercased form so variants fall into one group.
concept_lowered2score = {
    lowered: max(s for _, s in group)
    for lowered, group in groupby(
        sorted(concept2score.items(), key=lambda t: t[0].lower()),
        key=lambda t: t[0].lower())
}

vocab_lower = {k.lower(): v for k, v in model.wv.vocab.items()}
# Map each lowercased form back to its original-cased vocabulary entry.
concept_lower2Concept = {
    w: model.wv.index2word[vocab_lower[w].index]
    for w in vocab_lower
}
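For reference, a small sketch of how the tagging regex and the lowercased lookup tables defined above might be used together; the tagged token string here is hypothetical.

# Hypothetical tagged token; the regex captures the phrase between <c>...</c>.
token = '<c>positive_definite_matrix</c>'
m = re_concept_tagged.match(token)
if m:
    phrase = m.group('phrase')  # 'positive_definite_matrix'
    # Recover the original-cased vocabulary entry and its best score, if any.
    original = concept_lower2Concept.get(token.lower())
    score = concept_lowered2score.get(token.lower(), 0.0)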