Example 1
    def ngram_match_remove_stopwords(sa, sb, n):
        # Build n-grams for both sentences, drop stopword tokens inside
        # each n-gram, then score the remaining n-grams with overlap F1.
        nga = utils.make_ngram(sa, n)
        ngb = utils.make_ngram(sb, n)

        stopwords = dict_utils.DictLoader().load_dict('stopwords')

        # Lowercase before the stopword check so both sentences are
        # filtered consistently.
        new_nga = []
        for ng in nga:
            new_ng = tuple(x for x in ng if x.lower() not in stopwords)
            if new_ng != ():
                new_nga.append(new_ng)

        new_ngb = []
        for ng in ngb:
            new_ng = tuple(x for x in ng if x.lower() not in stopwords)
            if new_ng != ():
                new_ngb.append(new_ng)

        f1 = utils.overlap_f1(new_nga, new_ngb)
        info = [new_nga, new_ngb]
        return f1, info
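
A minimal self-contained sketch of the same technique, with hypothetical stand-ins for utils.make_ngram and utils.overlap_f1 (the project's real definitions are not shown in this listing, so the F1 below is one plausible variant):

    def make_ngram(tokens, n):
        # Sliding window of n consecutive tokens.
        return [tuple(tokens[i:i + n]) for i in range(len(tokens) - n + 1)]

    def overlap_f1(nga, ngb):
        # Harmonic mean of n-gram precision and recall over shared n-grams.
        shared = set(nga) & set(ngb)
        if not shared:
            return 0.0
        p = sum(1 for ng in nga if ng in shared) / float(len(nga))
        r = sum(1 for ng in ngb if ng in shared) / float(len(ngb))
        return 2 * p * r / (p + r)

    stopwords = {'the', 'a', 'on'}
    sa = ['the', 'cat', 'sat', 'on', 'the', 'mat']
    sb = ['a', 'cat', 'sat', 'on', 'a', 'rug']
    nga = [ng for ng in (tuple(x for x in g if x not in stopwords)
                         for g in make_ngram(sa, 2)) if ng]
    ngb = [ng for ng in (tuple(x for x in g if x not in stopwords)
                         for g in make_ngram(sb, 2)) if ng]
    print(overlap_f1(nga, ngb))  # 0.75
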
Example 2
    def extract(self, train_instance):
        negation = dict_utils.DictLoader().load_dict('negation_terms')
        lemma_sa, lemma_sb = train_instance.get_word(type='lemma', lower=True)
        # Count negation terms in each sentence.
        na = sum(1 for w in lemma_sa if w in negation)
        nb = sum(1 for w in lemma_sb if w in negation)

        # 1 if the sentences differ by an odd number of negations
        # (a likely polarity flip), 0 otherwise.
        features = [(na - nb) % 2]
        infos = [na, nb]
        return features, infos
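
Why parity rather than the raw difference: an odd difference in negation counts usually flips the polarity of one sentence relative to the other, while an even difference tends to cancel out. A worked example with a toy negation list (the real list comes from dict_utils):

    negation = {'not', 'no', 'never', 'neither'}
    lemma_sa = ['the', 'service', 'be', 'not', 'good']
    lemma_sb = ['the', 'service', 'be', 'good']
    na = sum(1 for w in lemma_sa if w in negation)  # 1
    nb = sum(1 for w in lemma_sb if w in negation)  # 0
    print((na - nb) % 2)  # 1: odd difference, probable polarity flip
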
Example 3

    def extract(self, train_instance):
        idf_weight = dict_utils.DictLoader().load_dict('global_idf')
        vocab = utils.word2index(idf_weight)
        sa, sb = train_instance.get_word(type='lemma',
                                         stopwords=True,
                                         lower=True)
        features, infos = utils.sentence_vectorize_features(sa,
                                                            sb,
                                                            idf_weight,
                                                            vocab,
                                                            convey='idf')
        return features, infos
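
A rough sketch of what an IDF-weighted bag-of-words comparison like this computes; vectorize and the cosine comparison below are assumptions, since sentence_vectorize_features and word2index are defined elsewhere in the project:

    import numpy as np

    def vectorize(tokens, idf_weight, vocab):
        # IDF-weighted bag-of-words vector over a shared vocabulary.
        vec = np.zeros(len(vocab))
        for w in tokens:
            if w in vocab:
                vec[vocab[w]] += idf_weight[w]
        return vec

    idf_weight = {'cat': 3.2, 'sat': 2.5, 'mat': 4.1, 'rug': 4.8}
    vocab = {w: i for i, w in enumerate(idf_weight)}  # word2index
    va = vectorize(['cat', 'sat', 'mat'], idf_weight, vocab)
    vb = vectorize(['cat', 'sat', 'rug'], idf_weight, vocab)
    print(va @ vb / (np.linalg.norm(va) * np.linalg.norm(vb)))
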
Example 4

    def extract_instances(self, train_instances):
        model = dict_utils.DictLoader().load_doc2vec()
        file_name = self.train_file.split('/')[-1]
        features = []
        infos = []
        for idx in range(len(train_instances)):
            # Look up the pre-trained paragraph vectors for the two
            # sentences of this pair; keys follow the
            # '<train_file>_<idx>_sa' naming used when the model was built.
            vec_a = model.docvecs['%s_%d_sa' % (file_name, idx)]
            vec_b = model.docvecs['%s_%d_sb' % (file_name, idx)]
            # Alternative: infer the vectors on the fly instead of
            # looking them up:
            # train_instance = train_instances[idx]
            # sa, sb = train_instance.get_word(type='lemma', stopwords=True, lower=True)
            # vec_a = model.infer_vector(sa)
            # vec_b = model.infer_vector(sb)

            feature, info = vk.get_all_kernel(vec_a, vec_b)
            features.append(feature)
            infos.append(info)

        return features, infos
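
The exact kernels behind vk.get_all_kernel are not shown in this listing; below is a plausible sketch of the kind of vector-pair similarities such a helper returns (the function name reuse and the kernel set are assumptions):

    import numpy as np

    def get_all_kernel(va, vb):
        va, vb = np.asarray(va, dtype=float), np.asarray(vb, dtype=float)
        # A few standard similarity/distance measures between the two vectors.
        cosine = va @ vb / (np.linalg.norm(va) * np.linalg.norm(vb) + 1e-9)
        euclidean = np.linalg.norm(va - vb)
        manhattan = np.abs(va - vb).sum()
        sigmoid = np.tanh(va @ vb)
        features = [cosine, euclidean, manhattan, sigmoid]
        info = ['cosine', 'euclidean', 'manhattan', 'sigmoid']
        return features, info
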
Example 5
import nltk
import numpy as np

def pooling(word_sa, emb_type, dim, pooling_types='avg', convey='idf'):
    idf_weight = dict_utils.DictLoader().load_dict('idf')
    embedding = Embedding()

    # Within-sentence term frequencies, used by the 'tfidf' weighting.
    vdist = nltk.FreqDist(word_sa)
    length = float(len(word_sa))

    if pooling_types == 'avg':
        function = np.average
    elif pooling_types == 'min':
        function = np.amin
    elif pooling_types == 'max':
        function = np.amax
    else:
        raise NotImplementedError(pooling_types)

    vec = []
    for word in word_sa:
        # st flags whether the word was found in the embedding vocabulary.
        if emb_type == 'word2vec':
            st, w2v = embedding.get_word2vec(word)
        elif emb_type == 'glove':
            st, w2v = embedding.get_glove(word)
        elif emb_type == 'paragram':
            st, w2v = embedding.get_paragram(word)
        elif emb_type == 'glove300':
            st, w2v = embedding.get_glove300(word)
        else:
            # Guard against an unknown emb_type leaving w2v undefined.
            raise NotImplementedError(emb_type)

        # Weight each word vector by IDF (or TF-IDF); out-of-vocabulary
        # words fall back to a high default IDF of 10.0.
        if convey == 'idf':
            w = idf_weight.get(word, 10.0)
        elif convey == 'tfidf':
            w = vdist[word] * idf_weight.get(word, 10.0)
        else:
            raise NotImplementedError(convey)

        vec.append(w * np.array(w2v))

    # Pool the weighted word vectors; an empty sentence yields zeros.
    if len(vec) == 0:
        vec = np.zeros((dim, ))
    else:
        vec = function(vec, axis=0)

    return vec
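
A self-contained view of the 'avg' + 'idf' path, with toy embeddings and IDF weights standing in for the Embedding and DictLoader lookups (all names below are hypothetical):

    import numpy as np

    toy_emb = {'cat': [1.0, 0.0], 'sat': [0.0, 1.0]}
    toy_idf = {'cat': 3.0, 'sat': 1.5}

    def avg_idf_pool(words, dim=2):
        # IDF-weighted word vectors, averaged into one sentence vector.
        vecs = [toy_idf.get(w, 10.0) * np.array(toy_emb[w])
                for w in words if w in toy_emb]
        return np.average(vecs, axis=0) if vecs else np.zeros((dim,))

    print(avg_idf_pool(['cat', 'sat']))  # [1.5  0.75]
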