# Imports required by the expander methods below (`utils` is the project's
# own helper module providing get_tokenized_query and valid):
import gensim
import requests
import scipy.spatial
from nltk.corpus import wordnet
from nltk.stem import PorterStemmer

import utils


def expand_query_term_cluster(self, q, G, cluster_dict, k_relevant_words):
    """Expand q with the top-k highest-weight graph neighbors drawn from the
    cluster that contains each query term (or its stem)."""
    upd_query = utils.get_tokenized_query(q)
    porter = PorterStemmer()
    res = [w for w in upd_query]
    for qw in upd_query:
        # Find the cluster containing the query word or its stem.
        list_neighbors = []
        for cluster in cluster_dict.values():
            if qw in cluster or porter.stem(qw) in cluster:
                list_neighbors = [i for i in cluster
                                  if i != qw and i != porter.stem(qw)]
                break
        if not list_neighbors:
            continue
        # Rank the neighbors by the weight of their edge to the query word,
        # falling back to the stemmed form when the surface form has no edge.
        weight_list = []
        for i in list_neighbors:
            if (qw, i) in G.edges:
                weight_list.append((i, G.edges[(qw, i)]['weight']))
            elif (porter.stem(qw), i) in G.edges:
                weight_list.append((i, G.edges[(porter.stem(qw), i)]['weight']))
        final_res = sorted(weight_list, key=lambda x: x[1],
                           reverse=True)[:k_relevant_words]
        for u, v in final_res:
            res.append(u)
    return ' '.join(res)
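# A minimal usage sketch of the cluster-based expansion above (assumptions:
# G is a networkx graph carrying a 'weight' attribute on its edges,
# cluster_dict maps a cluster id to its member terms, and
# utils.get_tokenized_query splits the query into lowercase tokens; the toy
# values are illustrative only):
#
#   import networkx as nx
#   G = nx.Graph()
#   G.add_edge('solar', 'renewable', weight=0.9)
#   G.add_edge('solar', 'panel', weight=0.7)
#   G.add_edge('solar', 'wind', weight=0.4)
#   cluster_dict = {0: ['solar', 'renewable', 'panel', 'wind']}
#   expander.expand_query_term_cluster('solar energy', G, cluster_dict, 2)
#   # -> 'solar energy renewable panel'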
def get_expanded_query(self, q, args=None):
    # Lazily load the GloVe vectors on first use.
    if not Glove.glove:
        print('INFO: Glove: Loading word vectors in {} ...'.format(
            Glove.vectorfile))
        Glove.glove = load_glove_model(Glove.vectorfile)
    upd_query = utils.get_tokenized_query(q)
    res = []
    if not self.replace:
        res = [w for w in upd_query]
    ps = PorterStemmer()
    for qw in upd_query:
        found_flag = False
        qw_stem = ps.stem(qw)
        if qw.lower() in Glove.glove:
            # Nearest vocabulary words by Euclidean distance; the first hit
            # is qw itself (distance 0) and is dropped by the stem check.
            w = sorted(Glove.glove.keys(),
                       key=lambda word: scipy.spatial.distance.euclidean(
                           Glove.glove[word], Glove.glove[qw.lower()]))
            w = w[:self.topn]
            for u in w:
                if ps.stem(u) != qw_stem:
                    found_flag = True
                    res.append(u)
        if not found_flag and self.replace:
            res.append(qw)
    return ' '.join(res)
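# `load_glove_model` is referenced above but not defined here. A minimal
# sketch under the usual assumption that the GloVe text file stores one
# whitespace-separated "word v1 ... vd" entry per line:
def load_glove_model(glove_file):
    """Read a GloVe text file into a dict mapping word -> numpy vector."""
    import numpy as np
    model = {}
    with open(glove_file, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            model[parts[0]] = np.asarray(parts[1:], dtype='float32')
    return model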
def get_expanded_query(self, q, args=None):
    # Lazily load the word2vec vectors on first use.
    if not Word2Vec.word2vec:
        print('INFO: Word2Vec: Loading word vectors in {} ...'.format(
            Word2Vec.vectorfile))
        Word2Vec.word2vec = gensim.models.KeyedVectors.load_word2vec_format(
            Word2Vec.vectorfile)
    upd_query = utils.get_tokenized_query(q)
    res = []
    if not self.replace:
        res = [w for w in upd_query]
    ps = PorterStemmer()
    for qw in upd_query:
        found_flag = False
        qw_stem = ps.stem(qw)
        # .vocab is the gensim<4 attribute; gensim>=4 uses key_to_index.
        if qw in Word2Vec.word2vec.vocab:
            for u, v in Word2Vec.word2vec.most_similar(positive=[qw],
                                                       topn=self.topn):
                if ps.stem(u) != qw_stem:
                    found_flag = True
                    res.append(u)
        if not found_flag and self.replace:
            res.append(qw)
    return ' '.join(res)
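# The membership test above relies on the gensim<4 `.vocab` attribute, which
# gensim 4 replaced with `.key_to_index`. A small version-agnostic helper,
# in case both gensim generations must be supported:
def in_vocab(kv, word):
    """True if `word` is in the KeyedVectors vocabulary, on gensim 3 or 4."""
    mapping = getattr(kv, 'key_to_index', None)  # gensim >= 4
    if mapping is None:
        mapping = kv.vocab  # gensim < 4
    return word in mapping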
def get_expanded_query(self, q, args=None):
    # Lazily load the wiki/entity embeddings on first use.
    if not Word2Vec.word2vec:
        print('INFO: Word2Vec: Loading word vectors in {} ...'.format(
            Word2Vec.vectorfile))
        Word2Vec.word2vec = gensim.models.KeyedVectors.load(
            Word2Vec.vectorfile)
    query_concepts = self.get_concepts(q, 0.1)
    upd_query = utils.get_tokenized_query(q)
    res = []
    if not self.replace:
        res = [w for w in upd_query]
    for c in query_concepts:
        # Entity vectors are keyed like "e_new_york"; concepts like "c_...".
        c_lower_e = 'e_' + c.replace(' ', '_').lower()
        if c_lower_e in Word2Vec.word2vec.vocab:
            for u, v in Word2Vec.word2vec.most_similar(positive=[c_lower_e],
                                                       topn=self.topn):
                # Strip only the leading prefix; str.replace() would also
                # delete occurrences inside the token.
                if u.startswith('e_') or u.startswith('c_'):
                    u = u[2:]
                res.append(u.replace('_', ' '))
        res.append(c)
    return ' '.join(res)
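# The embedding vocabulary is assumed to key entities as "e_..." and concepts
# as "c_...", with underscores for spaces (e.g. "e_new_york"). A standalone
# sketch of the normalization performed inline above:
def strip_concept_prefix(token):
    """'e_new_york' -> 'new york'; plain tokens pass through unchanged."""
    for prefix in ('e_', 'c_'):
        if token.startswith(prefix):
            token = token[len(prefix):]
            break
    return token.replace('_', ' ')

assert strip_concept_prefix('e_new_york') == 'new york'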
def get_expanded_query(self, q, args=None):
    upd_query = utils.get_tokenized_query(q)
    ps = PorterStemmer()
    synonyms = []
    res = []
    if not self.replace:
        res = [w for w in upd_query]
    for w in upd_query:
        found_flag = False
        w_stem = ps.stem(w)
        # Collect the lemma names of every WordNet synset of w.
        for syn in wordnet.synsets(w):
            for l in syn.lemmas():
                synonyms.append(l.name())
        synonyms = list(set(synonyms))[:self.topn]
        for s in synonyms:
            if ps.stem(s) != w_stem:
                found_flag = True
                res.append(s)
        synonyms = []
        if not found_flag and self.replace:
            res.append(w)
    return ' '.join(res)
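# Quick sanity check of the WordNet lookup used above (requires the NLTK
# corpus: nltk.download('wordnet')):
#
#   >>> sorted({l.name() for s in wordnet.synsets('car') for l in s.lemmas()})
#   ['auto', 'automobile', 'cable_car', 'car', ...]
#
# Note that multiword lemmas come back underscore-joined ('cable_car'), so a
# downstream retrieval pipeline may want to replace '_' with ' '.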
def get_expanded_query(self, q, args=None):
    query_concepts = self.get_concepts(q, 0.1)
    upd_query = utils.get_tokenized_query(q)
    res = []
    if not self.replace:
        res = [w for w in upd_query]
    for c in query_concepts:
        res.append(c)
    return ' '.join(res)
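# `get_concepts` is not shown in this file. A hypothetical sketch of what it
# could look like against the TagMe annotation API (the endpoint, the
# `gcube-token` parameter, and the rho-threshold filtering are assumptions
# about the real implementation; the 0.1 passed above would be the rho
# cut-off):
def get_concepts_sketch(text, threshold, token):
    resp = requests.get('https://tagme.d4science.org/tagme/tag',
                        params={'lang': 'en', 'gcube-token': token,
                                'text': text})
    annotations = resp.json().get('annotations', [])
    return [a['title'] for a in annotations
            if a.get('rho', 0) >= threshold and 'title' in a]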
def get_expanded_query(self, q, args=None):
    upd_query = utils.get_tokenized_query(q)
    res = []
    if not self.replace:
        res = [w for w in upd_query]
    ps = PorterStemmer()
    for qw in upd_query:  # renamed from q to avoid shadowing the query string
        qw_stem = ps.stem(qw)
        found_flag = False
        try:
            obj = requests.get('http://api.conceptnet.io/c/en/' + qw).json()
        except (requests.exceptions.RequestException, ValueError):
            if self.replace:
                res.append(qw)
            continue
        x = min(len(obj['edges']), self.topn)
        for i in range(x):
            edge = obj['edges'][i]
            try:
                start_lan = edge['start']['language']
                end_lan = edge['end']['language']
            except KeyError:
                continue
            if start_lan != 'en' or end_lan != 'en':
                continue
            # Take the label on the other side of the edge from qw.
            if edge['start']['label'].lower() == qw:
                label = edge['end']['label']
            elif edge['end']['label'].lower() == qw:
                label = edge['start']['label']
            else:
                continue
            if label not in res and qw_stem != ps.stem(label):
                found_flag = True
                res.append(label)
        if not found_flag and self.replace:
            res.append(qw)
    return ' '.join(res)
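# Shape of the ConceptNet response the method above depends on, for
# reference (network access required; the labels and relation shown are
# examples, not guaranteed values):
#
#   obj = requests.get('http://api.conceptnet.io/c/en/dog').json()
#   edge = obj['edges'][0]
#   edge['start']  # {'label': 'dog', 'language': 'en', ...}
#   edge['end']    # {'label': 'canine', 'language': 'en', ...}
#   edge['rel']    # {'label': 'IsA', ...}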
def get_expanded_query(self, q, args=None):
    # Map WordNet POS tags to the names the thesaurus lookup expects.
    pos_dict = {'n': 'noun', 'v': 'verb', 'a': 'adjective',
                's': 'satellite adj', 'r': 'adverb'}
    upd_query = utils.get_tokenized_query(q)
    q_ = []
    if not self.replace:
        q_ = [w for w in upd_query]
    for w in upd_query:
        if utils.valid(w):
            synsets = wordnet.synsets(w)
            pos = synsets[0].pos() if synsets else 'n'
            syn = self.get_synonym(w, pos_dict[pos])
            if syn:
                q_.append(' '.join(syn))
            elif self.replace:
                # Keep the original word rather than an empty string.
                q_.append(w)
    return ' '.join(q_)
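# `get_synonym` is not shown here; a minimal WordNet-based stand-in with the
# same shape (assumption: the real implementation may query an external
# thesaurus instead), mapping the long POS names back to WordNet tags:
def get_synonym_sketch(word, pos_name, topn=3):
    tags = {'noun': 'n', 'verb': 'v', 'adjective': 'a',
            'satellite adj': 'a', 'adverb': 'r'}  # 's' folded into 'a'
    out, seen = [], set()
    for syn in wordnet.synsets(word, pos=tags[pos_name]):
        for l in syn.lemmas():
            name = l.name().replace('_', ' ')
            if name != word and name not in seen:
                seen.add(name)
                out.append(name)
    return out[:topn]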