def multilayer_noun_based_network(self):
    print('MLN-Noun')
    network_size = len(self.document_data[0])
    document_sentences = self.document_data[0]
    only_auxiliar = Graph.Full(network_size)
    all_edges = only_auxiliar.get_edgelist()
    network_edges = []
    auxiliar_list = []
    # one weight list per inter-document scaling factor, e.g. self.inter_edge = [1.7, 1.9]
    weight_list = [[] for _ in self.inter_edge]
    for index1, index2 in all_edges:
        similarity = cosineSimilarity(document_sentences[index1][0],
                                      document_sentences[index2][0])
        belong_same_document = document_sentences[index1][1] == document_sentences[index2][1]
        if similarity > 0:
            network_edges.append((index1, index2))
            auxiliar_list.append(similarity)
            if belong_same_document:
                # intra-document edges keep the raw similarity in every layer
                for index, _ in enumerate(self.inter_edge):
                    weight_list[index].append(similarity)
            else:
                # inter-document edges are scaled by each layer's factor
                for index, factor in enumerate(self.inter_edge):
                    weight_list[index].append(similarity * factor)
    # build one (network, thresholded network) pair per weight list and
    # per MLN threshold, e.g. self.limiar_mln = [0.1, 0.15, 0.2]
    networks = []
    for weights in weight_list:
        for limiar in self.limiar_mln:
            network = Graph()
            network.add_vertices(network_size)
            network.add_edges(network_edges)
            network.es['weight'] = weights
            auxiliar_network = self.remove_edges_for_mln(network, limiar)
            networks.append((network, auxiliar_network))
    threshold = (max(auxiliar_list) + min(auxiliar_list)) / 2
    return (networks, threshold)
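# The functions in this file assume a cosineSimilarity helper that is not
# defined here. A minimal sketch of one plausible implementation for
# bag-of-words token lists (the project's actual helper may differ):
from collections import Counter
import math

def cosineSimilarity(tokens1, tokens2):
    """Cosine similarity between two token lists, treated as term-count vectors."""
    counts1, counts2 = Counter(tokens1), Counter(tokens2)
    shared = set(counts1) & set(counts2)
    dot = sum(counts1[t] * counts2[t] for t in shared)
    norm1 = math.sqrt(sum(c * c for c in counts1.values()))
    norm2 = math.sqrt(sum(c * c for c in counts2.values()))
    if norm1 == 0 or norm2 == 0:
        return 0.0
    return dot / (norm1 * norm2)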
def getSupFeats(self):
    sup_feature_dict = {}
    sup1, sup2 = [ev.supArgs for ev in self.events]
    for c1, c2 in itertools.product(sup1.keys(), sup2.keys()):
        align = "%s-%s" % (c1, c2)
        sim = round(utils.cosineSimilarity(sup1[c1], sup2[c2]), 3)
        if sim:
            sup_feature_dict[align] = sim
    return sup_feature_dict
def getCfsimFeats(self, cf1, cf2, get_contributors=False, cf_threshold=0):
    cfsim_feature_dict, cfsim_contributors = defaultdict(float), {}
    for c1, c2 in itertools.product(cf1.args.keys(), cf2.args.keys()):
        align = "%s-%s" % (c1, c2)
        if get_contributors:
            align_sim, contributors = utils.cosineSimilarity(
                cf1.args[c1], cf2.args[c2],
                get_contributors=True, threshold=cf_threshold)
        else:
            align_sim = utils.cosineSimilarity(
                cf1.args[c1], cf2.args[c2], threshold=cf_threshold)
        align_sim = round(align_sim, 3)
        if align_sim:
            cfsim_feature_dict[align] = align_sim
            # accumulate marginal scores per argument on each side
            cfsim_feature_dict["%s-_" % c1] += align_sim
            cfsim_feature_dict["_-%s" % c2] += align_sim
            if get_contributors:
                cfsim_contributors[align] = contributors
    cfsim_scores = {align: round(cfsim, 3)
                    for align, cfsim in cfsim_feature_dict.items()}
    return cfsim_scores, cfsim_contributors
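# getCfsimFeats passes get_contributors and threshold through to
# utils.cosineSimilarity, which suggests the arguments are sparse
# {feature: weight} dicts. A hedged sketch of what such a helper might look
# like; the names and threshold semantics are assumptions, not the actual
# utils implementation:
import math

def cosineSimilarity(feats1, feats2, get_contributors=False, threshold=0):
    """Cosine similarity between two {feature: weight} dicts; optionally
    also return the shared features whose weight product exceeds threshold."""
    shared = set(feats1) & set(feats2)
    dot = sum(feats1[f] * feats2[f] for f in shared)
    norm1 = math.sqrt(sum(w * w for w in feats1.values()))
    norm2 = math.sqrt(sum(w * w for w in feats2.values()))
    sim = dot / (norm1 * norm2) if norm1 and norm2 else 0.0
    if get_contributors:
        contributors = [f for f in shared if feats1[f] * feats2[f] > threshold]
        return sim, contributors
    return sim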
def noun_based_network(self):
    # build the noun network: edges connect sentences that share nouns
    network_size = len(self.document_data[0])
    document_sentences = self.document_data[0]
    only_auxiliar = Graph.Full(network_size)
    all_edges = only_auxiliar.get_edgelist()
    network = Graph()
    network.add_vertices(network_size)
    network_edges = []
    weight_list = []
    cosine_sim_list = []
    for index1, index2 in all_edges:
        common_elements = has_common_elements(document_sentences[index1][0],
                                              document_sentences[index2][0])
        if common_elements > 0:
            network_edges.append((index1, index2))
            weight_list.append(common_elements)
        # cosine similarities are collected to derive the MLN threshold below
        cosine = cosineSimilarity(document_sentences[index1][0],
                                  document_sentences[index2][0])
        cosine_sim_list.append(cosine)
    network.add_edges(network_edges)
    network.es['weight'] = weight_list
    # note: this midpoint threshold is problematic for English SDS
    threshold = (max(cosine_sim_list) + min(cosine_sim_list)) / 2
    # the threshold is None for MDS; for nouns it must be computed here,
    # in the same generation stage
    return ([network], threshold)
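# noun_based_network also relies on a has_common_elements helper that is not
# defined in this file. A minimal sketch, assuming it counts the tokens two
# sentences share:
def has_common_elements(tokens1, tokens2):
    """Return the number of distinct tokens the two token lists share."""
    return len(set(tokens1) & set(tokens2))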
def search(query):
    tfidf = [0] * len(inverdIndex)
    cos_sim = {}
    doc_listwords = []
    for word in utils.removeSymbols(query.lower()).split():
        if word not in stopwords and utils.isNotEmpty(word):
            doc_listwords.append(word)
    dataset = Counter(doc_listwords)
    # build the query's tf-idf vector over the inverted-index vocabulary
    for id, word in enumerate(inverdIndex.keys()):
        if word in dataset:
            tfidf[id] = dataset.get(word, 0) * inv_frec_vector[id]
    # score every document that contains at least one query word
    for word in dataset.keys():
        if word in inverdIndex:  # the word is in the inverted index
            for key in inverdIndex.get(word):
                if key not in cos_sim:
                    cos_sim[key] = utils.cosineSimilarity(tfidf, allTfidf[key])
    return cos_sim
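# A hypothetical call to search(), assuming the module-level structures
# (inverdIndex, inv_frec_vector, allTfidf, stopwords) were built by this
# module's indexing step:
results = search("how to compute tf idf")
for doc_id, score in sorted(results.items(), key=lambda kv: kv[1], reverse=True)[:5]:
    print(doc_id, score)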
with open("data.csv", "r") as ins: for line in ins: arr = line.split(",") data[arr[0].split('\'')[1]] = arr[1].split('\'')[1] questions = list(data.keys()) docTFIDFs = json.load(open("stack-tfidf.json")) while (1): query = raw_input("Please enter question: ") print("\n") maxSimilarity = -1 bestQuestion = "" print("-----------------------") print( "Calculating tfidf for the query with all questions as reference ...") queryTFIDF = utils.getTFIDF(query, questions) print( "Calling cosine similarity between all the questions to find best match ..." ) for i in range(len(questions)): question = questions[i] similarity = utils.cosineSimilarity(queryTFIDF, docTFIDFs[question]) if similarity > maxSimilarity: print(similarity) maxSimilarity = similarity bestQuestion = question print("Best question match : " + bestQuestion) print("Max similarity score : " + str(maxSimilarity)) print("Best answer : " + data[bestQuestion])
def get_max_similarity(sentence, extractos):
    similarities = []
    for extracto in extractos:
        similarities.append(cosineSimilarity(sentence.split(), extracto.split()))
    return max(similarities)
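# Quick sanity check with hypothetical extracts:
if __name__ == '__main__':
    sentence = "the cat sat on the mat"
    extractos = ["a cat on a mat", "stock prices fell sharply"]
    print(get_max_similarity(sentence, extractos))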
    return res

feature = getFeatureVector(processed)
author = ['Bryant Zhou']
recommended = []
with open('processed.json') as f:
    paper2author = json.load(f)
# register the new paper and size its similarity row for all papers plus itself
length = len(paper2author)
newIdx = length
paper2author[validateTitle] = {}
paper2author[validateTitle]['author'] = author
paper2author[validateTitle]['feature'] = feature
paper2author[validateTitle]['processed'] = processed
paper2author[validateTitle]['similarity'] = [float('inf')] * (length + 1)
paper2author[validateTitle]['index'] = newIdx
for key in paper2author:
    sim = cosineSimilarity(paper2author[key]['feature'], feature)
    paper2author[key]['similarity'].append(sim)
    idx = paper2author[key]['index']
    paper2author[validateTitle]['similarity'][idx] = sim
    if sim < 1:
        recommended.append((key, paper2author[key]['author']))
print(recommended)
def computeScores(data, tag, vectors, ferr, params):
    queryId2Mention = {}
    mention2QueryId = {}
    qid = 1
    isWeighted = params['weight']
    isMeanCentered = params['meanCenter']
    embeddingType = params['embeddingType']
    num_cands = params['numCands']
    num_components = params['ncomp']
    tpca = 0.0
    twpca = 0.0
    trueCandMentions = False
    key = []; cand_names = []; hard2beat_baseline = []; avg_baseline = []
    wavg_baseline = []; agw_pca = []; agw_wpca = []; labels = []
    for doc_name in data:
        doc_candidates = []
        doc_weight_array = []
        doc_entity_candidates = []
        for mention_dict in data[doc_name]:
            mention_name = mention_dict["mention"]
            if 'tabel' in tag:
                position = str(mention_dict["row"]) + str(mention_dict["col"])
            else:
                position = mention_dict["posI"]
            true_entity_id = mention_dict["wikidata_id"]
            isDifficult = mention_dict["difficulty"]
            if str(true_entity_id) == '-1':
                ferr.write("[Wikipedia Page for True-Entity has no Wikidata Mapping]: Skip this mention: " + doc_name + " " + mention_name + "\n")
                continue
            if "candidates" in mention_dict:
                candidate_tuples = []
                temp_candidates = []
                weight_array = []
                flag = -1
                cand_pos = 1
                for cand in mention_dict["candidates"]:
                    cand_name = cand[0]
                    prominence_score = 1 / float(cand_pos)
                    try:
                        entity_vector = vectors[cand_name]
                    except KeyError:
                        ferr.write("[Missing Embedding] Skipping candidates that do not have pre-trained entity embeddings: " + doc_name + " " + mention_name + " " + cand_name + "\n")
                        continue
                    # candidates used for constructing the grassmannian subspace
                    candidate_tuples.append((cand_name, prominence_score))
                    temp_candidates.append(cand_name)
                    weight_array.append(prominence_score)
                    # check if the true entity was found in the candidates
                    if cand_name == true_entity_id:
                        flag = 0
                    cand_pos += 1
                    # restrict the data to only the top-num_cands candidates per mention
                    if num_cands != -1 and len(temp_candidates) >= num_cands:
                        break
                # if the true entity is not present in the candidates, ignore this mention
                if trueCandMentions and flag == -1:
                    ferr.write("[Missing True Entity] Skipping mentions without a true entity in the candidates: " + doc_name + " " + mention_name + "\n")
                    mention2QueryId[(doc_name, mention_name, position)] = (-1, -1)
                    continue
                else:
                    if flag == -1:
                        mention2QueryId[(doc_name, mention_name, position)] = (-1, -1)
                    elif (doc_name, mention_name, position) not in mention2QueryId:
                        mention2QueryId[(doc_name, mention_name, position)] = (qid, int(isDifficult))
                        queryId2Mention[qid] = (doc_name, mention_name, position)
                        qid += 1
                doc_candidates += temp_candidates
                doc_weight_array += weight_array
                doc_entity_candidates.append((true_entity_id, mention_name, position, candidate_tuples))
            else:
                # if there are no candidates, ignore this mention
                ferr.write("[Missing Candidates] Skipping mentions with no candidates: " + doc_name + " " + mention_name + "\n")
                mention2QueryId[(doc_name, mention_name, position)] = (-1, -1)
        if len(doc_entity_candidates) == 0:
            ferr.write("[Skip Document] No true entity in the document " + doc_name + "\n")
            continue
        uniform_weights = list(np.ones(len(doc_candidates)))
        tpca_start = time.perf_counter()
        subspace, sinV, _ = utils.constructRepresentation(doc_candidates, uniform_weights, vectors, 'pca', isMeanCentered, num_components, (doc_name,))
        tpca_end = time.perf_counter()
        tpca += tpca_end - tpca_start
        avgSubspace = utils.constructRepresentation(doc_candidates, uniform_weights, vectors, 'avg', debugInfo=(doc_name,))
        twpca_start = time.perf_counter()
        subspace_weighted, sinV_weighted, _ = utils.constructRepresentation(doc_candidates, doc_weight_array, vectors, 'wpca', isMeanCentered, num_components, (doc_name,))
        twpca_end = time.perf_counter()
        twpca += twpca_end - twpca_start
        weighted_avgSubspace = utils.constructRepresentation(doc_candidates, doc_weight_array, vectors, 'avg', debugInfo=(doc_name,))
        for (true_entity, mention, position, candidates) in doc_entity_candidates:
            queryId, isDifficult = mention2QueryId[(doc_name, mention, position)]
            if queryId != -1:
                for candidate in candidates:
                    candidate_id = candidate[0]
                    simProminence = float(candidate[1])
                    entity_vector = vectors[candidate_id] / np.linalg.norm(vectors[candidate_id])
                    tpca_start = time.perf_counter()
                    if isMeanCentered:
                        simPCA = utils.computeVecSubspaceSimilarity(entity_vector - avgSubspace, subspace, sinV, isWeighted)
                    else:
                        simPCA = utils.computeVecSubspaceSimilarity(entity_vector, subspace, sinV, isWeighted)
                    tpca_end = time.perf_counter()
                    tpca += tpca_end - tpca_start
                    simAvg = utils.cosineSimilarity(entity_vector, avgSubspace)
                    twpca_start = time.perf_counter()
                    if isMeanCentered:
                        simWPCA = utils.computeVecSubspaceSimilarity(entity_vector - avgSubspace, subspace_weighted, sinV_weighted, isWeighted)
                    else:
                        simWPCA = utils.computeVecSubspaceSimilarity(entity_vector, subspace_weighted, sinV_weighted, isWeighted)
                    twpca_end = time.perf_counter()
                    twpca += twpca_end - twpca_start
                    simWAvg = utils.cosineSimilarity(entity_vector, weighted_avgSubspace)
                    label = 1 if candidate_id == true_entity else 0
                    key.append("qid:" + str(queryId))
                    cand_names.append(candidate_id)
                    hard2beat_baseline.append(simProminence)
                    avg_baseline.append(simAvg)
                    wavg_baseline.append(simWAvg)
                    agw_pca.append(simPCA)
                    agw_wpca.append(simWPCA)
                    labels.append(label)
    return key, cand_names, hard2beat_baseline, avg_baseline, wavg_baseline, agw_pca, agw_wpca, labels, mention2QueryId, queryId2Mention, tpca, twpca
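# computeScores calls utils.cosineSimilarity on dense numpy vectors. A
# minimal sketch of such a helper, assuming both inputs are 1-D arrays
# (the real utils implementation may differ):
import numpy as np

def cosineSimilarity(vec1, vec2):
    """Cosine similarity between two dense 1-D vectors."""
    denom = np.linalg.norm(vec1) * np.linalg.norm(vec2)
    if denom == 0:
        return 0.0
    return float(np.dot(vec1, vec2) / denom)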
# train the model and get the feature vector for each paper
model = models.Word2Vec(sentences, min_count=1, size=7, window=2)
for key in paper2author:
    vector = []
    for stemmedWord in paper2author[key]['processed']:
        vector.append(model[stemmedWord])
    paper2author[key]['feature'] = vector
print('trained done')

# get the cosine similarity for each pair of papers
for key1 in paper2author:
    for key2 in paper2author:
        paper1, paper2 = paper2author[key1], paper2author[key2]
        index1, index2 = paper1['index'], paper2['index']
        if index1 >= index2:
            continue  # only compare when index1 < index2 to avoid duplicate comparisons
        try:
            cosSim = cosineSimilarity(paper1['feature'], paper2['feature'])
            paper1['similarity'][index2] = paper2['similarity'][index1] = cosSim
        except Exception:
            # some edge cases have a research-paper title consisting only of numbers
            continue
print('loop done')

model.save('model.bin')
# convert numpy arrays to lists so the dict is JSON-serializable
for key in paper2author:
    for i, arr in enumerate(paper2author[key]['feature']):
        paper2author[key]['feature'][i] = arr.tolist()
print(paper2author)
with open('processed.json', 'w') as f:
    json.dump(paper2author, f, indent=4)

# Example paper2author entry:
# { 'Machine Translation Demonstration': {'author': ['Ulrike Schwall'],
#   'processed': ['lmt', '-', 'machin', 'translat', 'demonstr'],
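# Note: the 'feature' stored above is a list of per-word vectors, so the
# cosineSimilarity call must accept two such variable-length lists. A common
# alternative (an assumption here, not necessarily this project's approach)
# is to average the word vectors into one fixed-size document vector first:
import numpy as np

def document_vector(word_vectors):
    """Average a list of word vectors into a single document vector."""
    return np.mean(np.asarray(word_vectors), axis=0)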