# Module-level imports used by the extractors below; utils, dict_utils, vk
# (vector kernels) and Embedding are project-local helpers assumed to be on
# the import path.
import nltk
import numpy as np


def ngram_match_remove_stopwords(sa, sb, n):
    """Overlap F1 of word n-grams after stripping stopwords from each n-gram."""
    nga = utils.make_ngram(sa, n)
    ngb = utils.make_ngram(sb, n)
    stopwords = dict_utils.DictLoader().load_dict('stopwords')

    # Drop stopword tokens inside each n-gram and discard n-grams that become
    # empty. Tokens are lowercased before the lookup so both sentences are
    # filtered consistently.
    new_nga = []
    for ng in nga:
        new_ng = tuple(x for x in ng if x.lower() not in stopwords)
        if new_ng != ():
            new_nga.append(new_ng)

    new_ngb = []
    for ng in ngb:
        new_ng = tuple(x for x in ng if x.lower() not in stopwords)
        if new_ng != ():
            new_ngb.append(new_ng)

    f1 = utils.overlap_f1(new_nga, new_ngb)
    info = [new_nga, new_ngb]
    return f1, info
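
# Illustrative sketch (an assumption about utils.overlap_f1, not the original
# helper): overlap F1 is taken here to be the harmonic mean of n-gram overlap
# precision and recall, computed over multiset counts.
def _overlap_f1_sketch(nga, ngb):
    from collections import Counter
    ca, cb = Counter(nga), Counter(ngb)
    matched = sum((ca & cb).values())  # multiset intersection size
    if len(nga) + len(ngb) == 0:
        return 0.0
    # 2PR / (P + R) with P = matched/|A|, R = matched/|B| simplifies to:
    return 2.0 * matched / (len(nga) + len(ngb))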
def extract(self, train_instance):
    """Parity of the difference in negation-term counts between the sentences."""
    negation = dict_utils.DictLoader().load_dict('negation_terms')
    lemma_sa, lemma_sb = train_instance.get_word(type='lemma', lower=True)
    na = sum(1 for w in lemma_sa if w in negation)
    nb = sum(1 for w in lemma_sb if w in negation)
    # An odd count difference hints at a polarity flip between the sentences;
    # an even difference means the negations cancel out.
    features = [(na - nb) % 2]
    infos = [na, nb]
    return features, infos
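
# Worked example of the parity feature (hypothetical sentences): "it is not
# working" vs. "it is working" gives na = 1, nb = 0, so (na - nb) % 2 == 1,
# flagging a likely contradiction; "not bad" vs. "not good" gives 1 - 1 = 0.
# Python's % keeps the result non-negative even when na < nb.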
def extract(self, train_instance):
    """Sentence-vector features built from IDF-weighted lemmatized tokens."""
    idf_weight = dict_utils.DictLoader().load_dict('global_idf')
    vocab = utils.word2index(idf_weight)
    sa, sb = train_instance.get_word(type='lemma', stopwords=True, lower=True)
    features, infos = utils.sentence_vectorize_features(
        sa, sb, idf_weight, vocab, convey='idf')
    return features, infos
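
# A minimal sketch of what sentence_vectorize_features is assumed to do (the
# real helper lives in utils): map each sentence onto a |vocab|-dimensional
# bag-of-words vector whose entries carry IDF weights, from which pairwise
# similarity features can then be derived. The name below is illustrative.
def _idf_vectorize_sketch(words, idf_weight, vocab):
    vec = np.zeros(len(vocab))
    for w in words:
        if w in vocab:
            vec[vocab[w]] += idf_weight.get(w, 0.0)
    return vec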
def extract_instances(self, train_instances):
    """Vector-kernel features between pre-trained doc2vec vectors of each pair."""
    model = dict_utils.DictLoader().load_doc2vec()
    file_name = self.train_file.split('/')[-1]
    features = []
    infos = []
    for idx in range(len(train_instances)):
        # Look up the stored sentence vectors by the tags assigned when the
        # doc2vec model was trained, one tag per sentence of each pair.
        vec_a = model.docvecs['%s_%d_sa' % (file_name, idx)]
        vec_b = model.docvecs['%s_%d_sb' % (file_name, idx)]
        # Alternative: infer vectors on the fly instead of using stored tags.
        # train_instance = train_instances[idx]
        # sa, sb = train_instance.get_word(type='lemma', stopwords=True, lower=True)
        # vec_a = model.infer_vector(sa)
        # vec_b = model.infer_vector(sb)
        feature, info = vk.get_all_kernel(vec_a, vec_b)
        features.append(feature)
        infos.append(info)
    return features, infos
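
# The '%s_%d_sa' / '%s_%d_sb' tags above must match the tags used when the
# doc2vec model was built. A hedged sketch of that training step with gensim
# (the tag scheme is inferred from the lookup code; everything else is an
# assumption, not the original training script):
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

def _train_doc2vec_sketch(train_instances, file_name):
    docs = []
    for idx, inst in enumerate(train_instances):
        sa, sb = inst.get_word(type='lemma', stopwords=True, lower=True)
        docs.append(TaggedDocument(words=sa, tags=['%s_%d_sa' % (file_name, idx)]))
        docs.append(TaggedDocument(words=sb, tags=['%s_%d_sb' % (file_name, idx)]))
    # Hyperparameters are placeholders; vector_size/epochs would need tuning.
    return Doc2Vec(docs, vector_size=100, min_count=1, epochs=20)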
def pooling(word_sa, emb_type, dim, pooling_types='avg', convey='idf'):
    """Pool per-word embeddings into a single sentence vector, weighting each
    word by IDF ('idf') or term frequency times IDF ('tfidf') before pooling."""
    idf_weight = dict_utils.DictLoader().load_dict('idf')
    embedding = Embedding()
    vdist = nltk.FreqDist(word_sa)  # term frequencies, used by 'tfidf'

    if pooling_types == 'avg':
        function = np.average
    elif pooling_types == 'min':
        function = np.amin
    elif pooling_types == 'max':
        function = np.amax
    else:
        raise NotImplementedError(pooling_types)

    vec = []
    for word in word_sa:
        # st is the lookup status flag returned alongside the vector.
        if emb_type == 'word2vec':
            st, w2v = embedding.get_word2vec(word)
        elif emb_type == 'glove':
            st, w2v = embedding.get_glove(word)
        elif emb_type == 'paragram':
            st, w2v = embedding.get_paragram(word)
        elif emb_type == 'glove300':
            st, w2v = embedding.get_glove300(word)
        else:
            raise NotImplementedError(emb_type)

        # Words missing from the IDF table default to a high weight of 10.0.
        if convey == 'idf':
            w = idf_weight.get(word, 10.0)
        elif convey == 'tfidf':
            w = vdist[word] * idf_weight.get(word, 10.0)
        else:
            raise NotImplementedError(convey)

        vec.append(w * np.array(w2v))

    if len(vec) == 0:
        # Empty input: fall back to a zero vector of the embedding dimension.
        vec = np.zeros((dim,))
    else:
        vec = function(vec, axis=0)
    return vec
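
# Example use (assumes a 300-d GloVe table is reachable through Embedding;
# the token list here is illustrative):
#   sent_vec = pooling(['dog', 'chase', 'cat'], 'glove300', 300,
#                      pooling_types='avg', convey='idf')
#   sent_vec.shape  # -> (300,)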