def findFeature(i, j):
    dice_score = []
    cosine_score = []
    jaccard_score = []
    fuzzy_score = []
    sequence_matcher = []
    fmeasure = []
    word2vec_scores = []
    wordnet_score = []
    tfidf_cosine_score = []

    # find features
    dice_score.append(dice_recur(i, j))
    cosine_score.append(get_cosine(i, j))
    jaccard_score.append(is_ci_token_stopword_set_match(i, j))
    fuzzy_score.append(get_string_similarity(i, j))
    sequence_matcher.append(similar(i, j))
    # tf-idf cosine similarity
    tfidf_cosine_score.append(tfidf_cosine(i, j))
    rouge_score = calc_Rouge(i, j)
    surf_features = get_surface_features(i, j)
    fmeasure.append(FMeasure(j))
    word2vec_scores.append(word2vec_score(i, j))
    ner_overlap_score = get_NER_overlap_score(i, j)
    num_overlap_score = get_number_similarity(i, j)
    # no. of significant words + summation of significance values
    significant_score = getsignificance(j)
    # best similarity between words based on WordNet
    wordnet_score.append(get_wordnet_based_similarity(i, j))

    # concatenate all single- and multi-valued features into one flat tuple
    features = (tuple(dice_score) + tuple(cosine_score) + tuple(jaccard_score)
                + tuple(fuzzy_score) + tuple(sequence_matcher)
                + tuple(tfidf_cosine_score) + rouge_score + tuple(fmeasure)
                + surf_features + tuple(word2vec_scores) + tuple(wordnet_score)
                + ner_overlap_score + num_overlap_score + significant_score)

    # index every feature value by its position
    featureVector = dict(enumerate(features))
    return featureVector
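# A minimal usage sketch of findFeature (not part of the original code): it
# assumes i is a source/query sentence and j a candidate sentence, and that
# the helper functions above are importable from this module. The names
# build_feature_matrix, query_sentences and candidate_sentences are
# illustrative only.
def build_feature_matrix(query_sentences, candidate_sentences):
    rows = []
    for q in query_sentences:
        for c in candidate_sentences:
            fv = findFeature(q, c)              # dict: feature index -> value
            rows.append([fv[k] for k in sorted(fv)])
    return rows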
# Fragment: db (sqlite3 connection), c (cursor), select_cmd, jieba and
# stopwords are assumed to be set up earlier in the script.
c.execute(select_cmd)
ans = c.fetchall()
ans = [''.join(x) for x in ans]
print len(set(ans))

result = [jieba.lcut(each) for each in ans]
print len(result)
result = result[:1000]
# drop whitespace tokens and stop words
for j in range(len(result)):
    result[j] = [word for word in result[j] if word not in [u' '] + stopwords]
print 'select finished...'

G = nx.Graph()
print len(result)
for x in range(len(result)):
    for y in range(x, len(result)):
        # edge weight = cosine similarity of the two token bags
        weight = get_cosine(Counter(result[x]), Counter(result[y]))
        if weight != 0.0:
            G.add_weighted_edges_from([(x, y, weight)])
print 'graph edges added...'

pr = nx.pagerank(G)
print 'pagerank calculated...'
print pr
print 'Done!'

c.close()
db.commit()
db.close()
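# get_cosine is called throughout these snippets but never shown. This is a
# minimal sketch of what such a helper typically looks like, assuming it takes
# two collections.Counter term-frequency vectors (as the calls above suggest)
# and returns their cosine similarity; the real helper may differ.
import math
from collections import Counter

def get_cosine(vec1, vec2):
    # sketch only: cosine similarity between two Counter term-frequency vectors
    common = set(vec1) & set(vec2)
    numerator = sum(vec1[t] * vec2[t] for t in common)
    norm1 = math.sqrt(sum(v * v for v in vec1.values()))
    norm2 = math.sqrt(sum(v * v for v in vec2.values()))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return numerator / (norm1 * norm2)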
def __init__(self, str_q, str_a):
    # Clean the input (strip surrounding whitespace)
    self.str_q = str_q.strip()
    self.str_a = str_a.strip()

    # Tokenize
    self.bag_q = word_tokenize(self.str_q)
    self.bag_a = word_tokenize(self.str_a)

    # POS tagging (as feature)
    self.bag_q_pos = nltk.pos_tag(self.bag_q)  # tuples (word, POS)
    self.bag_a_pos = nltk.pos_tag(self.bag_a)  # tuples (word, POS)

    # Remove stop words
    self.bag_q_sw_removed = [w for w in self.bag_q if w.lower() not in stop_words]
    self.bag_a_sw_removed = [w for w in self.bag_a if w.lower() not in stop_words]

    # Stem (as feature)
    self.bag_q_stemmed = [stemmer.stem(word) for word in self.bag_q]
    self.bag_a_stemmed = [stemmer.stem(word) for word in self.bag_a]

    # Lemmatize (as feature)
    self.bag_q_lemmatized = [lemmatizer.lemmatize(word) for word in self.bag_q]
    self.bag_a_lemmatized = [lemmatizer.lemmatize(word) for word in self.bag_a]

    # Parse trees (as feature)
    self.sent_q = sent_tokenize(self.str_q)
    self.sent_a = sent_tokenize(self.str_a)
    self.parse_tree_q = parser.raw_parse_sents(self.sent_q)
    self.parse_tree_a = parser.raw_parse_sents(self.sent_a)

    # WordNet hypernyms, hyponyms, meronyms, and holonyms (as feature).
    # For each word, pick the synset whose gloss is most similar to the
    # question + answer bag of words (simple Lesk-style disambiguation).
    self.hypernymns = []
    self.hyponyms = []
    self.meronyms = []
    self.holonyms = []
    self.bag_counter = Counter(self.bag_q_sw_removed) + Counter(self.bag_a_sw_removed)
    for word in self.bag_counter.keys():
        synsets = wn.synsets(word)
        if not synsets:
            continue
        max_cos = 0.0
        target_synset = None
        for synset in synsets:
            # tokenize the gloss so both vectors count words, not characters
            definition = word_tokenize(synset.definition())
            cos = get_cosine(Counter(self.bag_q + self.bag_a), Counter(definition))
            if cos > max_cos:
                max_cos = cos
                target_synset = synset
        if target_synset is None:
            target_synset = synsets[0]
        self.hypernymns += target_synset.hypernyms()
        self.hyponyms += target_synset.hyponyms()
        self.meronyms += target_synset.part_meronyms()
        self.holonyms += target_synset.part_holonyms()
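# A minimal usage sketch for the constructor above (not in the original).
# The enclosing class name is not shown in the snippet, so "QAPair" is a
# placeholder; it also assumes the module-level stemmer, lemmatizer, parser
# and stop_words objects the constructor relies on are already set up.
pair = QAPair("Who wrote Hamlet?", "Hamlet was written by William Shakespeare.")
print pair.bag_q_pos        # list of (word, POS) tuples for the question
print pair.hypernymns[:5]   # WordNet hypernym synsets collected above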
# Fragment of a larger loop: `ans` (the raw answers for the current key `d`)
# and the accumulator `count` are defined by the enclosing code.
ans_dict = OrderedDict()
for i in ans:
    ans_dict[i] = ans_dict.get(i, 0) + 1        # deduplicate, keep frequency

result = [jieba.lcut(each) for each in ans_dict.keys()]
print len(result)
if not result:
    continue
# drop whitespace tokens and stop words
for j in range(len(result)):
    result[j] = [word for word in result[j] if word not in [u' '] + stopwords]

G = nx.Graph()
freq = ans_dict.values()                        # Python 2: a plain list
for x in range(len(result)):
    for y in range(x, len(result)):
        sim = get_cosine(Counter(result[x]), Counter(result[y]))
        if sim != 0.0:
            # weight = frequency of both answers times their cosine similarity
            G.add_weighted_edges_from([(x, y, freq[x] * freq[y] * sim)])

pr = nx.pagerank(G)
# normalise the PageRank scores to [0, 1]
pr_max = max(pr.values())
for k in pr.keys():
    pr[k] = pr[k] / pr_max

# accumulate the per-key score: frequency-weighted PageRank of every answer
for k in range(len(result)):
    count[d] += freq[k] * pr.get(k, 0.0)
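# Hypothetical scaffolding for the fragment above -- the real enclosing loop is
# not part of the snippet, so every name below (fetch_answers_for, date, count)
# is illustrative only.
from collections import defaultdict

count = defaultdict(float)            # per-key score accumulator
for d in date:                        # e.g. iterate over the dates being analysed
    ans = fetch_answers_for(d)        # placeholder for the actual SQL query
    # ... the fragment above runs here, updating count[d] ...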
input_tree = parser.raw_parse_sents(input_sents)

# WordNet hypernyms, hyponyms, meronyms, and holonyms (as feature) --
# same Lesk-style synset selection as in the QA-pair constructor above.
input_hypernymns = []
input_hyponyms = []
input_meronyms = []
input_holonyms = []
input_bag_counter = Counter(input_sw_removed)
for word in input_bag_counter.keys():
    synsets = wn.synsets(word)
    if not synsets:
        continue
    max_cos = 0.0
    target_synset = None
    for synset in synsets:
        # tokenize the gloss so both vectors count words, not characters
        definition = word_tokenize(synset.definition())
        cos = get_cosine(Counter(input_bag), Counter(definition))
        if cos > max_cos:
            max_cos = cos
            target_synset = synset
    if target_synset is None:
        target_synset = synsets[0]
    input_hypernymns += target_synset.hypernyms()
    input_hyponyms += target_synset.hyponyms()
    input_meronyms += target_synset.part_meronyms()
    input_holonyms += target_synset.part_holonyms()

# Task 2 - use a statistical method (cosine similarity) to score the user
# input against every FAQ entry (see the sketch below)
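# Illustrative sketch of the step the "Task 2" comment announces; it is not
# taken from the original code. It assumes the FAQs are available as the
# QA-pair objects built earlier (faq_pairs and rank_faqs are assumed names)
# and reuses the same get_cosine helper as the rest of these scripts.
from collections import Counter

def rank_faqs(input_bag, faq_pairs):
    scores = []
    for idx, faq in enumerate(faq_pairs):
        sim = get_cosine(Counter(input_bag), Counter(faq.bag_q + faq.bag_a))
        scores.append((sim, idx))
    scores.sort(reverse=True)
    return scores               # highest-similarity FAQ first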
def text_rank(dateJson, source, model, wiki_model):
    # collect the records for the requested day
    data = [item for item in source if item['create_time'][0:8] == dateJson]
    if len(data) == 0:
        return
    print '# of records: ' + str(len(data))
    print data[0]['create_time']

    # word frequencies for the day
    dict_day = collections.OrderedDict()
    Graph = networkx.Graph()
    for item in data:
        for word in item['cut_text']:
            if word not in dict_day:
                dict_day[word] = 1
            else:
                dict_day[word] += 1
    dict_day = sorted(dict_day.iteritems(), key=lambda d: d[1], reverse=True)
    print dict_day[0]

    # dump the raw frequency ranking
    fre_files = open(path.hotwords_path_fre.format(dateJson), 'w+')
    for keyword, num in dict_day:
        line = {'keyword': keyword.encode('utf-8'), 'num': str(num)}
        fre_files.write('{}\n'.format(json.dumps(line, ensure_ascii=False)))
    fre_files.close()

    # keep only frequent words; connect them with a weighted mix of
    # character-level cosine similarity and two word2vec similarities
    dict_day = [item for item in dict_day if item[1] > 100]
    print 'adding edges...'
    scale = len(dict_day)
    for x in range(scale):
        for y in range(x + 1, scale):
            sim = 0.1 * cosine.get_cosine(
                collections.Counter(dict_day[x][0].encode('utf-8')),
                collections.Counter(dict_day[y][0].encode('utf-8')))
            if dict_day[x][0] in model and dict_day[y][0] in model:
                sim += 0.2 * model.similarity(dict_day[x][0], dict_day[y][0])
            if dict_day[x][0] in wiki_model and dict_day[y][0] in wiki_model:
                sim += 0.7 * wiki_model.similarity(dict_day[x][0], dict_day[y][0])
            if sim > 0:
                Graph.add_edge(x, y, weight=sim)
    print '# of nodes: ' + str(Graph.number_of_nodes())
    print '# of edges: ' + str(Graph.number_of_edges())

    print 'pageranking...'
    pr = networkx.pagerank(Graph)
    # [keyword, frequency, PageRank factor] for every node in the graph
    result = [[dict_day[k][0], dict_day[k][1], score] for k, score in pr.items()]
    sum_of_fre = sum([item[1] for item in result])  # currently unused

    # ranking by PageRank factor alone
    result = sorted(result, key=lambda d: d[2], reverse=True)
    pr_file = open(path.hotwords_path_pr.format(dateJson), 'w+')
    for keyword, num, factor in result:
        line = {'keyword': keyword.encode('utf-8'), 'num': str(num), 'factor': str(factor)}
        pr_file.write('{}\n'.format(json.dumps(line, ensure_ascii=False)))
    pr_file.close()

    # ranking by PageRank factor weighted by frequency
    result = sorted(result, key=lambda d: d[2] * d[1], reverse=True)
    hot_words_file = open(path.hotwords_path_frepr.format(dateJson), 'w+')
    for keyword, num, factor in result:
        line = {'keyword': keyword.encode('utf-8'), 'num': str(num), 'factor': str(factor)}
        hot_words_file.write('{}\n'.format(json.dumps(line, ensure_ascii=False)))
    hot_words_file.close()
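# Illustrative driver for text_rank (not in the original). The membership
# tests and .similarity() calls above suggest gensim-style word2vec models;
# the model paths, the source file layout and the example dates below are all
# assumptions.
import json
import gensim

model = gensim.models.Word2Vec.load('models/query_word2vec.model')      # assumed path
wiki_model = gensim.models.Word2Vec.load('models/wiki_word2vec.model')  # assumed path
with open('data/queries_cut.json') as f:                                # one JSON record per line
    source = [json.loads(line) for line in f]

for day in ['20160101', '20160102']:   # compared against create_time[0:8]
    text_rank(day, source, model, wiki_model)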
# Fragment: the surrounding setup (reading `stopwords`, building `date`,
# defining db_path) is not shown. The stop-word list is decoded to unicode
# first (only the loop body appeared in the original fragment).
for i in range(len(stopwords)):
    stopwords[i] = stopwords[i].decode('utf8')

jieba.load_userdict("dict/event.txt")

# one SELECT per day
select_cmd = [None] * len(date)
for i in range(len(date)):
    select_cmd[i] = "select text from query where create_time like '%d%%'" % date[i]

db = sqlite3.connect(db_path)
c = db.cursor()
pr = [None] * len(date)   # assumed initialisation; pr is not defined in the fragment
for i in range(len(date)):
    c.execute(select_cmd[i])
    ans = c.fetchall()
    ans = [''.join(x) for x in ans]
    result = [jieba.lcut(each) for each in ans]
    # drop whitespace tokens and stop words
    for j in range(len(result)):
        result[j] = [word for word in result[j] if word not in [u' '] + stopwords]

    # fully connected graph weighted by cosine similarity, then PageRank
    G = nx.Graph()
    for k in range(len(result)):
        G.add_node(k)
    for x in range(len(result)):
        for y in range(x, len(result)):
            G.add_weighted_edges_from(
                [(x, y, get_cosine(Counter(result[x]), Counter(result[y])))])
    pr[i] = nx.pagerank(G)

print pr[0]
c.close()
db.commit()
db.close()
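# Hypothetical follow-up (not in the original): instead of printing the raw
# PageRank dictionary for the first day, report the index and score of the
# top-ranked query for each day straight from pr.
for i in range(len(date)):
    if pr[i]:
        top = max(pr[i], key=pr[i].get)
        print date[i], top, pr[i][top]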