def extract_daily_facts(user_1_facts, user_2_facts):
    """Score how similar two users' per-day facts are on the days they share.

    For every day key present in both mappings, two ``matutils.jaccard``
    scores are computed: one on the ``.items()`` of the two per-day values
    and one on the per-day values themselves.

    :param user_1_facts: mapping of day -> facts for the first user; each
        per-day value must provide ``.items()``.
    :param user_2_facts: mapping of day -> facts for the second user, with
        the same layout as ``user_1_facts``.
    :return: 5-tuple ``(f1, f2, f3, f4, day_overlap)`` where

        * ``f1`` / ``f2`` are the mean ``.items()``-based / value-based
          scores over the shared days,
        * ``f3`` / ``f4`` are the smallest ``.items()``-based / value-based
          scores seen, never larger than the 1.0 starting value,
        * ``day_overlap`` is the number of shared days divided by
          ``len(user_1_facts) + len(user_2_facts) - shared``.

        When the users share no day, ``f1``..``f4`` are all ``-999``. When
        both mappings are empty, ``day_overlap`` is ``0.0``.
    """
    undefined = -999  # sentinel for features that cannot be computed

    shared_days = [day for day in user_1_facts if day in user_2_facts]
    num_days = len(shared_days)

    items_total = 0
    facts_total = 0
    items_min = 1.0
    facts_min = 1.0
    for day in shared_days:
        facts1 = user_1_facts[day]
        facts2 = user_2_facts[day]
        items_score = matutils.jaccard(facts1.items(), facts2.items())
        facts_score = matutils.jaccard(facts1, facts2)
        items_total += items_score
        facts_total += facts_score
        items_min = min(items_min, items_score)
        facts_min = min(facts_min, facts_score)

    if num_days > 0:
        f1 = float(items_total) / num_days
        f2 = float(facts_total) / num_days
        f3 = items_min
        f4 = facts_min
    else:
        f1 = f2 = f3 = f4 = undefined

    # The denominator is zero only when both inputs are empty; report no
    # overlap instead of raising ZeroDivisionError.
    total_days = len(user_1_facts) + len(user_2_facts) - num_days
    day_overlap = float(num_days) / total_days if total_days else 0.0

    return f1, f2, f3, f4, day_overlap
vect1 = user_to_topic[u1] vect2 = user_to_topic[u2] topic_sim = 1 - spatial.distance.cosine(vect1, vect2) vec1 = doc2vecmodel.docvecs[u1] vec2 = doc2vecmodel.docvecs[u2] doc2vec_sim = 1 - spatial.distance.cosine(vec1, vec2) bow1 = user_to_words[u1] bow2 = user_to_words[u2] lda_bow1 = user_to_lda_bow[u1] lda_bow2 = user_to_lda_bow[u2] hellinger_score = matutils.hellinger( lda_bow1, lda_bow2) # hellinger(lda_bow2, lda_bow1) cosine_score = matutils.cossim(lda_bow1, lda_bow2) jaccard_word_score = matutils.jaccard(bow1, bow2) jaccard_lda_score = matutils.jaccard(lda_bow1, lda_bow2) outfile.write("{} {} {} {} {} {} {} {} {} {}\n".format( line.strip(), share_facts, float(share_facts) / union_facts, tfidf_sim, topic_sim, hellinger_score, cosine_score, jaccard_word_score, jaccard_lda_score, doc2vec_sim)) outfile.close() i += 2
i = 4 while i < len(sys.argv): outfile = open(sys.argv[i + 1], 'w') with open(sys.argv[i], 'r') as infile: for line in tqdm(infile): splits = line.strip().split() u1, u2 = splits[0].split(",") vect1 = tfidf_days[user_to_time_index[u1]].toarray()[0] vect2 = tfidf_days[user_to_time_index[u2]].toarray()[0] f1 = 1 - spatial.distance.cosine(vect1, vect2) f2 = matutils.jaccard(user_to_time[u1], user_to_time[u2]) f3 = matutils.jaccard(user_to_time_id[u1], user_to_time_id[u2]) f4 = compute_euclidean_distance(user_to_hour[u1], user_to_hour[u2]) f5 = compute_euclidean_datetime_distance(user_to_datetime[u1], user_to_datetime[u2]) f6 = compute_cosine_distance(user_to_hour[u1], user_to_hour[u2]) f7 = compute_cosine_datetime_distance(user_to_datetime[u1], user_to_datetime[u2]) dist_list, share_time = compute_dist(user_to_hour[u1], user_to_hour[u2])