from typing import AnyStr, Iterable, Optional, Tuple

def worker(word_tuple: Tuple[AnyStr, Iterable[float]]) -> Tuple[Optional[AnyStr], Optional[float]]:
    """Worker function for the pool; cannot be an inner function because it could not be pickled that way."""
    # Punctuation-bearing words are skipped upstream using `string.punctuation` minus `-`,
    # since the hyphen can occur in actual words.
    word, vec = word_tuple
    word_vector = list(vec)
    if len(word_vector) != 300:
        return None, None
    diff_man = cosine_similarity(word_vector, current_man_vec)
    diff_woman = cosine_similarity(word_vector, current_woman_vec)
    diff = diff_man - diff_woman
    return word, diff
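# Illustrative sketch (an assumption, not part of the original module): a 1-D
# cosine_similarity helper consistent with how worker() calls it above. The actual
# project may instead import this from scipy, scikit-learn, or a local util module.
import numpy as np

def cosine_similarity(vec_a, vec_b):
    """Cosine similarity of two 1-D vectors; returns 0.0 if either vector is all zeros."""
    a = np.asarray(vec_a, dtype=np.float64)
    b = np.asarray(vec_b, dtype=np.float64)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        return 0.0
    return float(np.dot(a, b) / denom)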
def reorder_list_like(to_reorder, ref_summs, ordered_ref_summs):
    # if len(to_reorder) != len(ref_summs) or len(to_reorder) != len(ordered_ref_summs):
    #     raise Exception('lens of lists are not equal. %d %d %d' % (len(to_reorder), len(ref_summs), len(ordered_ref_summs)))
    print('Fitting and transforming vecs')
    vec = CountVectorizer(input='content', decode_error='ignore')
    all_vecs = vec.fit_transform(ref_summs + ordered_ref_summs)
    unordered_vecs = all_vecs[:len(ref_summs)]
    ordered_vecs = all_vecs[len(ref_summs):]
    print('Cosine similarity')
    similarities = util.cosine_similarity(ordered_vecs, unordered_vecs)
    argmaxes = np.argmax(similarities, axis=1)
    indices_found = [False] * len(to_reorder)
    reordered_summaries = []
    for i in tqdm(range(len(argmaxes))):
        argmax_val = argmaxes[i]
        max_val = similarities[i, argmax_val]
        if max_val < 0.7:
            a = 0
            # raise Exception('Best result does not match well. \nSystem ref summ: %s\n\n Ordered ref summ: %s' % (ref_summs[argmax_val], ordered_ref_summs[i]))
        # if indices_found[argmax_val]:
        #     raise Exception('Best result was already matched with another ordered ref summ')
        indices_found[argmax_val] = True
        reordered_summaries.append(to_reorder[argmax_val])
    if len(reordered_summaries) != len(to_reorder):
        a = 0
        # raise Exception('reordered summaries len (%d) is not equal to original length (%d)' % (len(reordered_summaries), len(to_reorder)))
    return reordered_summaries
def get_single_sent_features(similar_source_indices, sent_term_matrix, doc_vector, article_sent_tokens):
    sent_idx = similar_source_indices[0]
    doc_similarity = util.cosine_similarity(sent_term_matrix[sent_idx], doc_vector)
    sent_len = len(article_sent_tokens[sent_idx])
    return sent_idx, doc_similarity, sent_len
def clustering(self):
    # Calculate the similarity matrix
    X = self.create_tfidf_vector()
    X = X.toarray()
    pca = PCA(n_components=300, copy=False)
    X = pca.fit(X).transform(X)
    S = cosine_similarity(X, X)
    # Run affinity propagation
    af = AffinityPropagation()
    af.fit(S)
    # Formulate the result
    tmp_clusters = defaultdict(list)
    goal_clusters = defaultdict(list)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    count = 0
    for label in labels:
        tmp_clusters[self.goal_list[cluster_centers_indices[label]]].append(self.goal_list[count])
        count += 1
    # Second-layer clustering of each cluster
    for goal, item_list in tmp_clusters.items():
        subclusters = self.subcluster_by_editdistance(goal, item_list)
        for subgoal, items in subclusters.items():
            goal_clusters[subgoal] = items
    return goal_clusters
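# Standalone sketch mirroring the TF-IDF -> cosine similarity -> AffinityPropagation
# pipeline in clustering() above, run on a hypothetical toy corpus. Note that when fitting
# on a precomputed similarity matrix, scikit-learn expects affinity='precomputed'.
from sklearn.cluster import AffinityPropagation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine_similarity

toy_goals = ["buy milk", "buy bread and milk", "fix the car", "repair the car engine"]
tfidf = TfidfVectorizer().fit_transform(toy_goals)
sim_matrix = sk_cosine_similarity(tfidf, tfidf)
ap = AffinityPropagation(affinity='precomputed', random_state=0).fit(sim_matrix)
for goal, label in zip(toy_goals, ap.labels_):
    print(label, goal)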
def main():
    args = ArgumentParser()
    args.add_argument('-c', '--camera_url', default=0, type=str, help='0 - local camera')
    args.add_argument('-dt', '--detect_threshold', default=0.975, type=float, help="Threshold of face detection")
    args.add_argument('-rf', '--recognized_threshold', default=0.8, type=float, help="Threshold of face recognition")
    args.add_argument('--device', default='cuda:0', type=str, help="Device to run the model on. `cuda:<id>` or `cpu`")
    args.add_argument('--detect_face_model', default='data/pretrained/mobilenet_header.pth', type=str, help="Face detector model path")
    args.add_argument('--detect_face_backbone', default='data/pretrained/mobile_backbone.tar', type=str, help="Face detector backbone path")
    args.add_argument('--recognized_model', default='data/pretrained/embedder_resnet50_asia.pth', type=str, help="Face embedding model path")
    args.add_argument('--model_registered', default='model_faces.npy', type=str, help="Model containing face vectors")
    args.add_argument('--model_ids', default='model_face_ids.npy', type=str, help="Model containing face ids")
    args = args.parse_args()

    try:
        args.camera_url = int(args.camera_url)
    except ValueError:
        pass

    if not (os.path.isfile(args.model_registered) and os.path.isfile(args.model_ids)):
        face_model = numpy.zeros((0, 512), dtype=numpy.float32)
        ids_model = []
    else:
        face_model = numpy.load(args.model_registered, allow_pickle=True)
        ids_model = numpy.load(args.model_ids, allow_pickle=True).tolist()

    detector = FaceDetection(args.detect_face_model, args.detect_face_backbone, scale_size=480, device=args.device)
    embedder = FaceEmbedding(args.recognized_model, device=args.device)

    # Recognize faces frame by frame
    video = VideoCapture(args.camera_url)
    for frame in video:
        faces = detector(frame)
        faces = embedder(faces)
        for face in faces:
            txt = "None"
            color = RED
            scores = cosine_similarity(face.embedding.reshape(1, 512), face_model, skip_normalize=True).ravel()
            args_idx = numpy.argmax(scores)
            if scores[args_idx] >= args.recognized_threshold:
                txt = ids_model[args_idx]
                color = GREEN
            frame = draw_square(frame, face.box.astype(numpy.int32), color=color)
            frame = cv2.putText(frame, f"EID: {txt}", (int(face.box[0]), int(face.box[1] - 20)),
                                cv2.FONT_HERSHEY_PLAIN, 1, GREEN)
        if not show_image(frame, 'Face Recognition', windows_size=(1920, 1080)):
            break
    video.stop()
def cluster_concepts(context="location"):
    """
    Cluster related concepts of a specific type into different categories
    """
    db = Database()
    concept_category = ConceptCategory()
    cmd = "SELECT * FROM %s" % (context)
    context_res = db.query_db(cmd)
    for item in context_res:
        concept_list = []
        concept_matrix = []
        if context == "action":
            context_id, context_chinese, context_name = item[:3]
        elif context == "location":
            context_id, context_name, context_chinese = item
        cmd = (
            "SELECT b.name, b.id FROM %s_concept AS a, concept AS b \
            WHERE a.%s_id = %s AND a.concept_id = b.id"
            % (context, context, context_id)
        )
        concept_res = db.query_db(cmd)
        if len(concept_res) == 0:
            continue
        for concept_item in concept_res:
            concept, concept_id = concept_item
            concept_vector = concept_category.concept_axes.row_named(concept)
            concept_list.append((concept_id, concept))
            concept_matrix.append(concept_vector)
        # Run affinity propagation
        S = cosine_similarity(concept_matrix, concept_matrix)
        af = AffinityPropagation()
        af.fit(S)
        cluster_centers_indices = af.cluster_centers_indices_
        labels = af.labels_
        count = 0
        clusters = defaultdict(list)
        for label in labels:
            clusters[concept_list[cluster_centers_indices[label]][1]].append(concept_list[count])
            count += 1
        category_num = 0
        for key, value in clusters.items():
            category_num += 1
            for concept in value:
                cmd = (
                    "UPDATE %s_concept SET category = %d WHERE \
                    %s_id = %s AND concept_id = %s"
                    % (context, category_num, context, context_id, concept[0])
                )
                db.query_db(cmd)
                print(concept[1] + " ", end="")
            print("")
        print("----------" + context_chinese + "----------")
def verify(self, img1_bgr, img2_bgr):
    result = []
    bboxes1 = self.detect_all_faces(img1_bgr)
    bboxes2 = self.detect_all_faces(img2_bgr)
    for bbox1 in bboxes1:
        dist = []
        _, feat1 = self.__get_face_feature(img1_bgr, bbox1)
        for bbox2 in bboxes2:
            _, feat2 = self.__get_face_feature(img2_bgr, bbox2)
            dist.append(util.cosine_similarity(feat1, feat2))
        result.append(dist)
    return result
def search_db(self, feature, max_num, threshold):
    score_list = []
    for item in self.__db:
        similarity = util.cosine_similarity(feature, item.feature)
        if similarity > threshold:
            score_list.append((item, similarity))
    if len(score_list) == 0:
        return 0, []
    sorted_score_list = sorted(score_list, key=lambda p: p[1], reverse=True)
    if max_num > len(sorted_score_list):
        max_num = len(sorted_score_list)
    return max_num, sorted_score_list[:max_num]
def get_predict_file(args, feature_extractor):
    assert os.path.exists(args.lfw_align)
    pairs = load_pairs(args.pairs)
    with open(args.predict_file, 'w') as f:
        for pair in pairs:
            name1, name2, same = pairs_info(pair, args.suffix)
            logging.info("processing name1:{} <---> name2:{}".format(name1, name2))
            img1_bgr, img2_bgr = read2img(args.lfw_align, name1, name2)
            feat1 = feature_extractor.extract_feature(img1_bgr)
            feat2 = feature_extractor.extract_feature(img2_bgr)
            dis = util.cosine_similarity(feat1, feat2)
            f.write(name1 + '\t' + name2 + '\t' + str(dis) + '\t' + str(same) + '\n')
def compute_similarity_by_avg(self, sents_1, sents_2):
    if len(sents_1) == 0 or len(sents_2) == 0:
        return 0.0
    # Sum the word vectors of all words in each sentence
    vec1 = self.__word2vec[sents_1[0]]
    for word1 in sents_1[1:]:
        vec1 = vec1 + self.__word2vec[word1]
    vec2 = self.__word2vec[sents_2[0]]
    for word2 in sents_2[1:]:
        vec2 = vec2 + self.__word2vec[word2]
    # Compare the averaged sentence vectors
    similarity = util.cosine_similarity(vec1 / len(sents_1), vec2 / len(sents_2))
    return similarity
def get_arg_max_label(ph, child_labels, embeddings, label_embeddings, threshold=0.0):
    filtered_phrase = ph.translate(translator)
    sims = []
    for ch in child_labels:
        try:
            child_label_str = " ".join([t for t in ch.split("_") if t not in stop_words]).strip()
            sims.append(cosine_similarity(embeddings[filtered_phrase], label_embeddings[child_label_str]))
        except Exception as e:
            print("Error while computing cosine sim", e)
            return None, None
    sim_softmax = softmax(np.array(sims))
    if max(sim_softmax) >= threshold:
        max_ind = np.argmax(sim_softmax)
        return child_labels[max_ind], max(sim_softmax)
    else:
        return None, None
def word_sent_similarity(word_vec, sent_vecs):
    """
    sent_vecs: a list of the vectors of the sentence's words
    """
    sent_len = len(sent_vecs)
    similarities = np.zeros(sent_len)
    for vec_i, vec in enumerate(sent_vecs):
        cosine_sim = cosine_similarity(vec, word_vec)
        # obtain a positive similarity
        sim = np.exp(cosine_sim)
        similarities[vec_i] = sim
    max_sim = np.max(similarities)
    return max_sim
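# Hypothetical usage sketch for word_sent_similarity(); the random vectors below stand in
# for real word embeddings, and cosine_similarity is assumed to be a 1-D helper as used
# elsewhere in these snippets.
if __name__ == "__main__":
    _rng = np.random.RandomState(0)
    _word_vec = _rng.normal(size=300)
    _sent_vecs = [_rng.normal(size=300) for _ in range(12)]
    print(word_sent_similarity(_word_vec, _sent_vecs))  # exp(cosine) of the best-matching word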
def test_undistort_image():
    image = cv2.imread("test/lena.png").astype(np.float32)
    H, W, _ = image.shape
    K = np.array([[[1.0, 0.0, W / 2],
                   [0.0, 1.0, H / 2],
                   [0.0, 0.0, 1.0]]], np.float32)
    dist = np.array([[1e-6, 0.0, 1e-5, 0.0]], np.float32)
    ref_image = cv2.undistort(image, K[0], dist[0])
    image = np.expand_dims(image.transpose((2, 0, 1)), axis=0)
    warped_image = C.undistort_image(K, dist, image)[0].data
    warped_image = warped_image.transpose((1, 2, 0))
    # cv2.imwrite('ref.png', ref_image)
    # cv2.imwrite('warped.png', warped_image)
    assert 1 - cosine_similarity(ref_image, warped_image) < eps
def test_warp_affine():
    identity = np.array([[1., 0., 0.],
                         [0., 1., 0.]])
    mat = identity + np.random.randn(2, 3) / 10
    image = cv2.imread("test/lena.png").astype(float)
    H, W, _ = image.shape
    ref_image = cv2.warpAffine(image, mat, (W, H))
    image = np.expand_dims(image.transpose((2, 0, 1)), axis=0)
    mat = np.expand_dims(mat, axis=0)
    warped_image = I.warp_affine(chainer.Variable(image), chainer.Variable(mat)).data
    warped_image = warped_image.transpose((0, 2, 3, 1)).reshape((H, W, 3))
    # cv2.imwrite('ref.png', ref_image)
    # cv2.imwrite('warped.png', warped_image)
    assert 1 - cosine_similarity(ref_image, warped_image) < eps
def test_reversibility():
    image = cv2.imread("test/lena.png").astype(np.float32)
    H, W, _ = image.shape
    K = np.array([[[1.0, 0.0, W / 2],
                   [0.0, 1.0, H / 2],
                   [0.0, 0.0, 1.0]]], np.float32)
    dist = np.array([[1e-6, 0.0, 1e-5, 0.0]], np.float32)
    image0 = np.expand_dims(image.transpose((2, 0, 1)), axis=0)
    distorted_image = C.undistort_image(K, dist, image0).data
    image1 = C.distort_image(K, dist, distorted_image).data
    distorted_image = distorted_image[0].transpose((1, 2, 0))
    image0 = image0[0].transpose((1, 2, 0))
    image1 = image1[0].transpose((1, 2, 0))
    # cv2.imwrite('distorted.png', distorted_image)
    # cv2.imwrite('image0.png', image0)
    # cv2.imwrite('image1.png', image1)
    assert 1 - cosine_similarity(image0, image1) < eps
def sent_sent_similarity(self, sent1_vecs, sent2_vecs):
    """
    sent1_vecs: a list of the word vectors of the 1st sentence
    sent2_vecs: same, for the 2nd sentence
    """
    if self.cfg.sent_sent_similarity_wordwise:
        similarities = []
        for word_vec in sent1_vecs:
            similarities.append(self.word_sent_similarity(word_vec, sent2_vecs))
        max_sim = max(similarities)
        return max_sim
    else:
        # cosine similarity between the mean vectors
        sent1_mean_w2v = np.mean(sent1_vecs, 0)
        sent2_mean_w2v = np.mean(sent2_vecs, 0)
        cosine_sim = cosine_similarity(sent1_mean_w2v, sent2_mean_w2v)
        sim = np.exp(cosine_sim)
        return sim
def get_pair_sent_features(similar_source_indices, sent_term_matrix, article_sent_tokens, mmr):
    features = []
    # features.append(1)  # is_sent_pair
    sent_idx1, sent_idx2 = similar_source_indices[0], similar_source_indices[1]
    sent1_features = get_single_sent_features(sent_idx1, sent_term_matrix, article_sent_tokens, mmr)
    features.extend(sent1_features[1:])  # sent_idx, doc_similarity, sent_len
    sent2_features = get_single_sent_features(sent_idx2, sent_term_matrix, article_sent_tokens, mmr)
    features.extend(sent2_features[1:])  # sent_idx, doc_similarity, sent_len
    average_mmr = (mmr[sent_idx1] + mmr[sent_idx2]) / 2
    sents_similarity = util.cosine_similarity(sent_term_matrix[sent_idx1], sent_term_matrix[sent_idx2])[0][0]
    sents_dist = abs(sent_idx1 - sent_idx2)
    if real_values:
        features.extend([average_mmr, sents_similarity])
        if include_sents_dist:
            features.append(sents_dist)
    else:
        features.extend(convert_to_one_hot(average_mmr, 5, (0, 1)))
        features.extend(convert_to_one_hot(sents_similarity, 5, (0, 1)))  # sents_similarity
        if include_sents_dist:
            features.extend(convert_to_one_hot(min(sents_dist, max_num_sents), 10, (0, max_num_sents)))  # sents_dist
    return features
np.save(plasmid_host_dist_path, plasmid_host)
util.save_obj(t, plasmid_host_class_path)

# %% Load related distances
# Load the calculated plasmid-host distance
plasmid_host = np.load(plasmid_host_dist_path)
plasmid_host[plasmid_host > 1000] = 1000

# Normalize the plasmid-host distance
plasmid_host_normalized = (plasmid_host - plasmid_host.min(axis=0)) / (
    plasmid_host.max(axis=0) - plasmid_host.min(axis=0)
)

# Calculate the plasmid-wise distance
plasmid_plasmid = util.cosine_similarity(plasmid_host, plasmid_host)

# %% Construct plasmid interaction table
host_list = list(set(metadata.Assembly_chainid))
host_list.sort()
host_to_idx_dict = {host: i for i, host in enumerate(host_list)}
idx_to_host_dict = {i: host for i, host in enumerate(host_list)}

# plasmid-strain indicator
interaction_table = np.zeros((len(metadata), len(set(host_list))))
for i in range(len(metadata)):
    interaction_table[i, host_to_idx_dict[metadata.Assembly_chainid[i]]] = 1

# %% Construct plasmid interaction table based on species
host_to_speciesid = {}
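# Illustrative sketch (an assumption, not the project's util module) of a row-wise pairwise
# cosine similarity consistent with util.cosine_similarity(plasmid_host, plasmid_host) above:
# rows of A are compared against rows of B, giving an (A.shape[0], B.shape[0]) matrix.
import numpy as np

def pairwise_cosine_similarity(A, B, eps=1e-12):
    A = np.asarray(A, dtype=np.float64)
    B = np.asarray(B, dtype=np.float64)
    A_normed = A / np.maximum(np.linalg.norm(A, axis=1, keepdims=True), eps)
    B_normed = B / np.maximum(np.linalg.norm(B, axis=1, keepdims=True), eps)
    return A_normed @ B_normed.T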
blast_results = {}
for key in blast_results_dict:
    blast_results[key.split(".")[0]] = blast_results_dict[key]

blast_results_mat = np.zeros((len(query_list), len(set(host_list))))
for i in range(blast_results_mat.shape[0]):
    success, series = blast_results[query_list[i]]
    if not success:
        continue
    else:
        for key in series.keys():
            idx = host_to_idx_dict[int(key[4:])]
            blast_results_mat[i, idx] = series[key]

# Calculate test-training plasmid distance and svpos
# test_plasmid_to_train_plasmid = util.cosine_similarity(plasmid_host_normalized, training_plasmid_host[:, :6])
test_plasmid_to_train_plasmid = util.cosine_similarity(plasmid_host_normalized, training_plasmid_host)
svpos = calc_svpos(test_plasmid_to_train_plasmid, training_interaction_indicator)

model_path = "data/model.pkl"
model = util.load_obj(model_path)

idx = np.arange(plasmid_host.shape[0])
# features = [plasmid_host_normalized, blast_results_mat[:, :6], svpos[:, :6]]
features = [plasmid_host_normalized, blast_results_mat, svpos]
combined_features = [feature.flatten()[:, None] for feature in features]
combined_features = np.hstack(combined_features)
prediction = model.predict_proba(combined_features)
prediction = prediction[:, 1].reshape((-1, features[0].shape[1]))
    return feature.copy()


if __name__ == '__main__':
    import sys
    if len(sys.argv) != 3:
        img1 = 'test1.jpg'
        img2 = 'test2.jpg'
    else:
        img1 = sys.argv[1].strip()
        img2 = sys.argv[2].strip()

    import cv2
    import util

    config.channel_num = 3
    config.face_size = 224
    config.feature_size = 4096
    config.extractor = 'vgg_face'
    extractor = VggFeatureExtractor()
    # img_bgr = cv2.imread('../model/vgg_face_caffe/ak.png')
    # feature = extractor.extract_feature(img_bgr)
    img1_bgr = cv2.imread(img1)
    img2_bgr = cv2.imread(img2)
    # img1_bgr = cv2.imread('../../../data/lfw-align/Shane_Loux/Shane_Loux_0001.png')
    # img2_bgr = cv2.imread('../../../data/lfw-align/Val_Ackerman/Val_Ackerman_0001.png')
    feat1 = extractor.extract_feature(img1_bgr)
    feat2 = extractor.extract_feature(img2_bgr)
    print(util.cosine_similarity(feat1, feat2))
def body(j, result):
    h_j = H_q[j, :, :]  # hidden_size x batch
    alpha = cosine_similarity(h_i, h_j)
    result = tf.concat([result, alpha], axis=1)
    return [j + 1, result]
def get_pseudo_label_surface_name(child_label_str, texts, embeddings, probability, parent, parent_labels, thresh=0.8):
    candidate_words = set()
    for sent in texts:
        tokens = set(sent.strip().split())
        if child_label_str in tokens:
            candidate_words.update(tokens)
    candidate_words = candidate_words - {child_label_str}
    filter_words = set([])
    for w in candidate_words:
        try:
            if cosine_similarity(embeddings[w], embeddings[child_label_str]) < thresh:
                filter_words.add(w)
        except Exception as e:
            print(e)
    candidate_words = candidate_words - filter_words
    candidate_words = list(candidate_words)
    scores = []
    kept_words = []
    try:
        child_label_thresh = probability[parent][child_label_str]
        den = 0
        for l in parent_labels:
            if l == parent:
                continue
            if child_label_str in probability[l] and probability[l][child_label_str] != -math.inf:
                den += probability[l][child_label_str]
        if den != 0:
            child_label_thresh = child_label_thresh / den
    except Exception as e:
        print(decipher_phrase(child_label_str, id_phrase_map), e)
        child_label_thresh = 0
    # child_label_thresh = 0
    for c in candidate_words:
        cos_sim = cosine_similarity(embeddings[c], embeddings[child_label_str])
        num = probability[parent][c]
        den = 0
        for l in parent_labels:
            if l == parent:
                continue
            if c in probability[l] and probability[l][c] != -math.inf:
                den += probability[l][c]
        if den != 0:
            val = cos_sim * (num / den)
        else:
            val = cos_sim * num
        if val > child_label_thresh:
            scores.append(val)
            kept_words.append(c)
    # Index into the kept candidates (not the full candidate list) so that scores and
    # words stay aligned when some candidates are filtered out above.
    inds = sorted(range(len(scores)), key=lambda i: scores[i])[-10:]
    words = []
    for i in inds:
        words.append(kept_words[i])
    return words
label_embeddings = pickle.load(open(pkl_dump_dir + "label_bert_word_phrase_embeddings.pkl", "rb"))

stop_words = set(stopwords.words('english'))
stop_words.add('would')
translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

for p in parent_to_child:
    embeddings = label_embeddings[p]
    for ch in parent_to_child[p]:
        all_sims[ch] = {}
        mean_sim[ch] = 0
        child_label_str = " ".join([t for t in ch.split("_") if t not in stop_words]).strip()
        for w in embeddings:
            sim = cosine_similarity(embeddings[child_label_str], embeddings[w])
            all_sims[ch][w] = sim
            mean_sim[ch] += sim
        mean_sim[ch] = mean_sim[ch] / len(embeddings)
        all_sims[ch] = {
            k: v
            for k, v in sorted(all_sims[ch].items(), key=lambda item: -item[1])[:1000]
        }

print(mean_sim)
json.dump(all_sims, open(pkl_dump_dir + "all_sims_label_specific.json", "w"))
json.dump(mean_sim, open(pkl_dump_dir + "mean_sim_label_specific.json", "w"))
def main(unused_argv):
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    if FLAGS.dataset_name == 'all':
        dataset_names = ['cnn_dm', 'xsum', 'duc_2004']
    else:
        dataset_names = [FLAGS.dataset_name]

    if not os.path.exists(plot_data_file):
        all_lists_of_histogram_pairs = []
        for dataset_name in dataset_names:
            FLAGS.dataset_name = dataset_name
            if dataset_name == 'duc_2004':
                dataset_splits = ['test']
            elif FLAGS.dataset_split == 'all':
                dataset_splits = ['test', 'val', 'train']
            else:
                dataset_splits = [FLAGS.dataset_split]

            ssi_list = []
            for dataset_split in dataset_splits:
                ssi_path = os.path.join(ssi_dir, FLAGS.dataset_name, dataset_split + '_ssi.pkl')
                with open(ssi_path) as f:
                    ssi_list.extend(pickle.load(f))
                if FLAGS.dataset_name == 'duc_2004':
                    for abstract_idx in [1, 2, 3]:
                        ssi_path = os.path.join(ssi_dir, FLAGS.dataset_name,
                                                dataset_split + '_ssi_' + str(abstract_idx) + '.pkl')
                        with open(ssi_path) as f:
                            temp_ssi_list = pickle.load(f)
                        ssi_list.extend(temp_ssi_list)

            ssi_2d = util.flatten_list_of_lists(ssi_list)

            num_extracted = [len(ssi) for ssi in util.flatten_list_of_lists(ssi_list)]
            hist_num_extracted = np.histogram(num_extracted, bins=6, range=(0, 5))
            print(hist_num_extracted)
            print('Histogram of number of sentences merged: ' + util.hist_as_pdf_str(hist_num_extracted))

            distances = [abs(ssi[0] - ssi[1]) for ssi in ssi_2d if len(ssi) >= 2]
            print('Distance between sentences (mean, median): ', np.mean(distances), np.median(distances))
            hist_dist = np.histogram(distances, bins=max(distances))
            print('Histogram of distances: ' + util.hist_as_pdf_str(hist_dist))

            summ_sent_idx_to_number_of_source_sents = [[], [], [], [], [], [], [], [], [], []]
            for ssi in ssi_list:
                for summ_sent_idx, source_indices in enumerate(ssi):
                    if len(source_indices) == 0 or summ_sent_idx >= len(summ_sent_idx_to_number_of_source_sents):
                        continue
                    num_sents = len(source_indices)
                    if num_sents > 2:
                        num_sents = 2
                    summ_sent_idx_to_number_of_source_sents[summ_sent_idx].append(num_sents)
            print("Number of source sents for summary sentence indices (Is the first summary sent more likely to match with a singleton or a pair?):")
            for summ_sent_idx, list_of_numbers_of_source_sents in enumerate(summ_sent_idx_to_number_of_source_sents):
                if len(list_of_numbers_of_source_sents) == 0:
                    percent_singleton = 0.
                else:
                    percent_singleton = list_of_numbers_of_source_sents.count(1) * 1. / len(list_of_numbers_of_source_sents)
                    percent_pair = list_of_numbers_of_source_sents.count(2) * 1. / len(list_of_numbers_of_source_sents)
                print(str(percent_singleton) + '\t', end='')
            print('')
            for summ_sent_idx, list_of_numbers_of_source_sents in enumerate(summ_sent_idx_to_number_of_source_sents):
                if len(list_of_numbers_of_source_sents) == 0:
                    percent_pair = 0.
                else:
                    percent_singleton = list_of_numbers_of_source_sents.count(1) * 1. / len(list_of_numbers_of_source_sents)
                    percent_pair = list_of_numbers_of_source_sents.count(2) * 1. / len(list_of_numbers_of_source_sents)
                print(str(percent_pair) + '\t', end='')
            print('')

            primary_pos = [ssi[0] for ssi in ssi_2d if len(ssi) >= 1]
            secondary_pos = [ssi[1] for ssi in ssi_2d if len(ssi) >= 2]
            all_pos = [max(ssi) for ssi in ssi_2d if len(ssi) >= 1]
            # if FLAGS.dataset_name != 'duc_2004':
            #     plot_positions(primary_pos, secondary_pos, all_pos)

            if FLAGS.dataset_split == 'all':
                glob_string = '*.bin'
            else:
                glob_string = dataset_splits[0]

            print('Loading TFIDF vectorizer')
            with open(tfidf_vec_path, 'rb') as f:
                tfidf_vectorizer = pickle.load(f)

            source_dir = os.path.join(data_dir, FLAGS.dataset_name)
            source_files = sorted(glob.glob(source_dir + '/' + glob_string + '*'))
            total = len(source_files) * 1000 if ('cnn' in FLAGS.dataset_name or 'newsroom' in FLAGS.dataset_name
                                                 or 'xsum' in FLAGS.dataset_name) else len(source_files)
            example_generator = data.example_generator(source_dir + '/' + glob_string + '*',
                                                       True, False, should_check_valid=False)

            all_possible_singles = 0
            all_possible_pairs = [0]
            all_filtered_pairs = 0
            all_all_combinations = 0
            all_ssi_pairs = [0]
            ssi_pairs_with_shared_coref = [0]
            ssi_pairs_with_shared_word = [0]
            ssi_pairs_with_either_coref_or_word = [0]
            all_pairs_with_shared_coref = [0]
            all_pairs_with_shared_word = [0]
            all_pairs_with_either_coref_or_word = [0]
            actual_total = [0]
            rel_positions_primary = []
            rel_positions_secondary = []
            rel_positions_all = []
            sent_lens = []
            all_sent_lens = []
            all_pos = []
            y = []
            normalized_positions_primary = []
            normalized_positions_secondary = []
            all_normalized_positions_primary = []
            all_normalized_positions_secondary = []
            normalized_positions_singles = []
            normalized_positions_pairs_first = []
            normalized_positions_pairs_second = []
            primary_pos_duc = []
            secondary_pos_duc = []
            all_pos_duc = []
            all_distances = []
            distances_duc = []
            tfidf_similarities = []
            all_tfidf_similarities = []
            average_mmrs = []
            all_average_mmrs = []

            for example_idx, example in enumerate(tqdm(example_generator, total=total)):
                # def process(example_idx_example):
                #     # print '0'
                #     example = example_idx_example
                if FLAGS.num_instances != -1 and example_idx >= FLAGS.num_instances:
                    break
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example(
                    example, names_to_types)
                article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents]
                article_text = ' '.join(raw_article_sents)
                groundtruth_summ_sents = [[sent.strip() for sent in groundtruth_summary_text.strip().split('\n')]]
                if doc_indices is None:
                    doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens))
                doc_indices = [int(doc_idx) for doc_idx in doc_indices]
                rel_sent_indices, doc_sent_indices, doc_sent_lens = preprocess_for_lambdamart_no_flags.get_rel_sent_indices(
                    doc_indices, article_sent_tokens)
                groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
                    groundtruth_similar_source_indices_list, FLAGS.sentence_limit)

                sent_term_matrix = util.get_doc_substituted_tfidf_matrix(
                    tfidf_vectorizer, raw_article_sents, article_text)
                sents_similarities = util.cosine_similarity(sent_term_matrix, sent_term_matrix)
                importances = util.special_squash(util.get_tfidf_importances(tfidf_vectorizer, raw_article_sents))

                if FLAGS.dataset_name == 'duc_2004':
                    first_k_indices = lambdamart_scores_to_summaries.get_indices_of_first_k_sents_of_each_article(
                        rel_sent_indices, FLAGS.first_k)
                else:
                    first_k_indices = [idx for idx in range(len(raw_article_sents))]

                article_indices = list(range(len(raw_article_sents)))
                possible_pairs = [x for x in list(itertools.combinations(article_indices, 2))]  # all pairs
                # # filtered_possible_pairs = preprocess_for_lambdamart_no_flags.filter_pairs_by_criteria(raw_article_sents, possible_pairs, corefs)
                # if FLAGS.dataset_name == 'duc_2004':
                #     filtered_possible_pairs = [x for x in list(itertools.combinations(first_k_indices, 2))]  # all pairs
                # else:
                #     filtered_possible_pairs = preprocess_for_lambdamart_no_flags.filter_pairs_by_sent_position(possible_pairs)
                # # removed_pairs = list(set(possible_pairs) - set(filtered_possible_pairs))
                # possible_singles = [(i,) for i in range(len(raw_article_sents))]
                # all_combinations = filtered_possible_pairs + possible_singles
                #
                # all_possible_singles += len(possible_singles)
                # all_possible_pairs[0] += len(possible_pairs)
                # all_filtered_pairs += len(filtered_possible_pairs)
                # all_all_combinations += len(all_combinations)
                # for ssi in groundtruth_similar_source_indices_list:
                #     if len(ssi) > 0:
                #         idx = rel_sent_indices[ssi[0]]
                #         rel_positions_primary.append(idx)
                #         rel_positions_all.append(idx)
                #     if len(ssi) > 1:
                #         idx = rel_sent_indices[ssi[1]]
                #         rel_positions_secondary.append(idx)
                #         rel_positions_all.append(idx)
                #
                # # coref_pairs = preprocess_for_lambdamart_no_flags.get_coref_pairs(corefs)
                # # DO OVERLAP PAIRS BETTER
                # overlap_pairs = preprocess_for_lambdamart_no_flags.filter_by_overlap(article_sent_tokens, possible_pairs)
                # either_coref_or_word = list(set(list(coref_pairs) + overlap_pairs))
                #
                # for ssi in groundtruth_similar_source_indices_list:
                #     if len(ssi) == 2:
                #         all_ssi_pairs[0] += 1
                #         do_share_coref = ssi in coref_pairs
                #         do_share_words = ssi in overlap_pairs
                #         if do_share_coref:
                #             ssi_pairs_with_shared_coref[0] += 1
                #         if do_share_words:
                #             ssi_pairs_with_shared_word[0] += 1
                #         if do_share_coref or do_share_words:
                #             ssi_pairs_with_either_coref_or_word[0] += 1
                # all_pairs_with_shared_coref[0] += len(coref_pairs)
                # all_pairs_with_shared_word[0] += len(overlap_pairs)
                # all_pairs_with_either_coref_or_word[0] += len(either_coref_or_word)

                if FLAGS.dataset_name == 'duc_2004':
                    primary_pos_duc.extend([
                        rel_sent_indices[ssi[0]]
                        for ssi in groundtruth_similar_source_indices_list if len(ssi) >= 1
                    ])
                    secondary_pos_duc.extend([
                        rel_sent_indices[ssi[1]]
                        for ssi in groundtruth_similar_source_indices_list if len(ssi) >= 2
                    ])
                    all_pos_duc.extend([
                        max([rel_sent_indices[sent_idx] for sent_idx in ssi])
                        for ssi in groundtruth_similar_source_indices_list if len(ssi) >= 1
                    ])

                for ssi in groundtruth_similar_source_indices_list:
                    for sent_idx in ssi:
                        sent_lens.append(len(article_sent_tokens[sent_idx]))
                    if len(ssi) >= 1:
                        orig_val = ssi[0]
                        vals_to_add = get_integral_values_for_histogram(
                            orig_val, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents)
                        normalized_positions_primary.extend(vals_to_add)
                    if len(ssi) >= 2:
                        orig_val = ssi[1]
                        vals_to_add = get_integral_values_for_histogram(
                            orig_val, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents)
                        normalized_positions_secondary.extend(vals_to_add)
                        if FLAGS.dataset_name == 'duc_2004':
                            distances_duc.append(abs(rel_sent_indices[ssi[1]] - rel_sent_indices[ssi[0]]))
                        tfidf_similarities.append(sents_similarities[ssi[0], ssi[1]])
                        average_mmrs.append((importances[ssi[0]] + importances[ssi[1]]) / 2)

                for ssi in groundtruth_similar_source_indices_list:
                    if len(ssi) == 1:
                        orig_val = ssi[0]
                        vals_to_add = get_integral_values_for_histogram(
                            orig_val, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents)
                        normalized_positions_singles.extend(vals_to_add)
                    if len(ssi) >= 2:
                        if doc_sent_indices[ssi[0]] != doc_sent_indices[ssi[1]]:
                            continue
                        orig_val_first = min(ssi[0], ssi[1])
                        vals_to_add = get_integral_values_for_histogram(
                            orig_val_first, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents)
                        normalized_positions_pairs_first.extend(vals_to_add)
                        orig_val_second = max(ssi[0], ssi[1])
                        vals_to_add = get_integral_values_for_histogram(
                            orig_val_second, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents)
                        normalized_positions_pairs_second.extend(vals_to_add)

                # all_normalized_positions_primary.extend(util.flatten_list_of_lists([get_integral_values_for_histogram(single[0], rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents) for single in possible_singles]))
                # all_normalized_positions_secondary.extend(util.flatten_list_of_lists([get_integral_values_for_histogram(pair[1], rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents) for pair in possible_pairs]))
                all_sent_lens.extend([len(sent) for sent in article_sent_tokens])
                all_distances.extend([abs(rel_sent_indices[pair[1]] - rel_sent_indices[pair[0]])
                                      for pair in possible_pairs])
                all_tfidf_similarities.extend([sents_similarities[pair[0], pair[1]] for pair in possible_pairs])
                all_average_mmrs.extend([(importances[pair[0]] + importances[pair[1]]) / 2
                                         for pair in possible_pairs])

                # if FLAGS.dataset_name == 'duc_2004':
                #     rel_pos_single = [rel_sent_indices[single[0]] for single in possible_singles]
                #     rel_pos_pair = [[rel_sent_indices[pair[0]], rel_sent_indices[pair[1]]] for pair in possible_pairs]
                #     all_pos.extend(rel_pos_single)
                #     all_pos.extend([max(pair) for pair in rel_pos_pair])
                # else:
                #     all_pos.extend(util.flatten_list_of_lists(possible_singles))
                #     all_pos.extend([max(pair) for pair in possible_pairs])
                # y.extend([1 if single in groundtruth_similar_source_indices_list else 0 for single in possible_singles])
                # y.extend([1 if pair in groundtruth_similar_source_indices_list else 0 for pair in possible_pairs])
                # actual_total[0] += 1

            # # p = Pool(144)
            # # list(tqdm(p.imap(process, example_generator), total=total))
            #
            # # print 'Possible_singles\tPossible_pairs\tFiltered_pairs\tAll_combinations: \n%.2f\t%.2f\t%.2f\t%.2f' % (all_possible_singles*1./actual_total, \
            # #     all_possible_pairs*1./actual_total, all_filtered_pairs*1./actual_total, all_all_combinations*1./actual_total)
            #
            # # print 'Relative positions of groundtruth source sentences in document:\nPrimary\tSecondary\tBoth\n%.2f\t%.2f\t%.2f' % (np.mean(rel_positions_primary), np.mean(rel_positions_secondary), np.mean(rel_positions_all))
            #
            # # print 'SSI Pair statistics:\nShare_coref\tShare_word\tShare_either\n%.2f\t%.2f\t%.2f' \
            # #     % (ssi_pairs_with_shared_coref[0]*100./all_ssi_pairs[0], ssi_pairs_with_shared_word[0]*100./all_ssi_pairs[0], ssi_pairs_with_either_coref_or_word[0]*100./all_ssi_pairs[0])
            # # print 'All Pair statistics:\nShare_coref\tShare_word\tShare_either\n%.2f\t%.2f\t%.2f' \
            # #     % (all_pairs_with_shared_coref[0]*100./all_possible_pairs[0], all_pairs_with_shared_word[0]*100./all_possible_pairs[0], all_pairs_with_either_coref_or_word[0]*100./all_possible_pairs[0])
            #
            # # hist_all_pos = np.histogram(all_pos, bins=max(all_pos)+1)
            # # print 'Histogram of all sent positions: ', util.hist_as_pdf_str(hist_all_pos)
            # # min_sent_len = min(sent_lens)
            # # hist_sent_lens = np.histogram(sent_lens, bins=max(sent_lens)-min_sent_len+1)
            # # print 'min, max sent lens:', min_sent_len, max(sent_lens)
            # # print 'Histogram of sent lens: ', util.hist_as_pdf_str(hist_sent_lens)
            # # min_all_sent_len = min(all_sent_lens)
            # # hist_all_sent_lens = np.histogram(all_sent_lens, bins=max(all_sent_lens)-min_all_sent_len+1)
            # # print 'min, max all sent lens:', min_all_sent_len, max(all_sent_lens)
            # # print 'Histogram of all sent lens: ', util.hist_as_pdf_str(hist_all_sent_lens)
            #
            # # print 'Pearsons r, p value', pearsonr(all_pos, y)
            # # fig, ax1 = plt.subplots(nrows=1)
            # # plt.scatter(all_pos, y)
            # # pp = PdfPages(os.path.join('stuff/plots', FLAGS.dataset_name + '_position_scatter.pdf'))
            # # plt.savefig(pp, format='pdf',bbox_inches='tight')
            # # plt.show()
            # # pp.close()
            #
            # # if FLAGS.dataset_name == 'duc_2004':
            # #     plot_positions(primary_pos_duc, secondary_pos_duc, all_pos_duc)
            # # normalized_positions_all = normalized_positions_primary + normalized_positions_secondary
            # # plot_histogram(normalized_positions_primary, num_bins=100)
            # # plot_histogram(normalized_positions_secondary, num_bins=100)
            # # plot_histogram(normalized_positions_all, num_bins=100)
            # # sent_lens_together = [sent_lens, all_sent_lens]
            # # plot_histogram(sent_lens_together, pdf=True, start_at_0=True, max_val=70)
            #
            # if FLAGS.dataset_name == 'duc_2004':
            #     distances = distances_duc
            # sent_distances_together = [distances, all_distances]
            # # plot_histogram(sent_distances_together, pdf=True, start_at_0=True, max_val=100)
            #
            # tfidf_similarities_together = [tfidf_similarities, all_tfidf_similarities]
            # # plot_histogram(tfidf_similarities_together, pdf=True, num_bins=100)
            #
            # average_mmrs_together = [average_mmrs, all_average_mmrs]
            # # plot_histogram(average_mmrs_together, pdf=True, num_bins=100)
            #
            # normalized_positions_primary_together = [normalized_positions_primary, bin_values]
            # normalized_positions_secondary_together = [normalized_positions_secondary, bin_values]
            # # plot_histogram(normalized_positions_primary_together, pdf=True, num_bins=100)
            # # plot_histogram(normalized_positions_secondary_together, pdf=True, num_bins=100)
            #
            # list_of_hist_pairs = [
            #     {
            #         'lst': normalized_positions_primary_together,
            #         'pdf': True,
            #         'num_bins': 100,
            #         'y_lim': 3.9,
            #         'y_label': FLAGS.dataset_name,
            #         'x_label': 'Sent position (primary)'
            #     },
            #     {
            #         'lst': normalized_positions_secondary_together,
            #         'pdf': True,
            #         'num_bins': 100,
            #         'y_lim': 3.9,
            #         'x_label': 'Sent position (secondary)'
            #     },
            #     {
            #         'lst': sent_distances_together,
            #         'pdf': True,
            #         'start_at_0': True,
            #         'max_val': 100,
            #         'x_label': 'Sent distance'
            #     },
            #     {
            #         'lst': sent_lens_together,
            #         'pdf': True,
            #         'start_at_0': True,
            #         'max_val': 70,
            #         'x_label': 'Sent length'
            #     },
            #     {
            #         'lst': average_mmrs_together,
            #         'pdf': True,
            #         'num_bins': 100,
            #         'x_label': 'Average TF-IDF importance'
            #     }
            # ]

            normalized_positions_pairs_together = [normalized_positions_pairs_first, normalized_positions_pairs_second]
            list_of_hist_pairs = [
                {
                    'lst': [normalized_positions_singles],
                    'pdf': True,
                    'num_bins': 100,
                    # 'y_lim': 3.9,
                    'x_lim': 1.0,
                    'y_label': FLAGS.dataset_name,
                    'x_label': 'Sent Position (Singles)',
                    'legend_labels': ['Primary']
                },
                {
                    'lst': normalized_positions_pairs_together,
                    'pdf': True,
                    'num_bins': 100,
                    # 'y_lim': 3.9,
                    'x_lim': 1.0,
                    'x_label': 'Sent Position (Pairs)',
                    'legend_labels': ['Primary', 'Secondary']
                }
            ]
            all_lists_of_histogram_pairs.append(list_of_hist_pairs)

        with open(plot_data_file, 'w') as f:
            cPickle.dump(all_lists_of_histogram_pairs, f)
    else:
        with open(plot_data_file) as f:
            all_lists_of_histogram_pairs = cPickle.load(f)

    plot_histograms(all_lists_of_histogram_pairs)