from typing import AnyStr, Iterable, Optional, Tuple

def worker(word_tuple: Tuple[AnyStr, Iterable[float]]) -> Tuple[Optional[AnyStr], Optional[float]]:
    """Worker function for the pool; cannot be an inner function because it could not be pickled that way."""
    # Punctuation-bearing words are skipped upstream using `string.punctuation` minus `-`,
    # since the hyphen can occur in actual words.
    word, vec = word_tuple
    word_vector = list(vec)
    if len(word_vector) != 300:
        return None, None
    diff_man = cosine_similarity(word_vector, current_man_vec)
    diff_woman = cosine_similarity(word_vector, current_woman_vec)
    diff = diff_man - diff_woman
    return word, diff
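# Illustrative sketch (an assumption, not part of the original module): a 1-D
# cosine_similarity helper consistent with how worker() calls it above. The actual
# project may instead import this from scipy, scikit-learn, or a local util module.
import numpy as np

def cosine_similarity(vec_a, vec_b):
    """Cosine similarity of two 1-D vectors; returns 0.0 if either vector is all zeros."""
    a = np.asarray(vec_a, dtype=np.float64)
    b = np.asarray(vec_b, dtype=np.float64)
    denom = np.linalg.norm(a) * np.linalg.norm(b)
    if denom == 0.0:
        return 0.0
    return float(np.dot(a, b) / denom)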
def reorder_list_like(to_reorder, ref_summs, ordered_ref_summs):
    # if len(to_reorder) != len(ref_summs) or len(to_reorder) != len(ordered_ref_summs):
    #     raise Exception('lens of lists are not equal. %d %d %d' % (len(to_reorder), len(ref_summs), len(ordered_ref_summs)))
    print('Fitting and transforming vecs')
    vec = CountVectorizer(input='content', decode_error='ignore')
    all_vecs = vec.fit_transform(ref_summs + ordered_ref_summs)
    unordered_vecs = all_vecs[:len(ref_summs)]
    ordered_vecs = all_vecs[len(ref_summs):]
    print('Cosine similarity')
    similarities = util.cosine_similarity(ordered_vecs, unordered_vecs)
    argmaxes = np.argmax(similarities, axis=1)
    indices_found = [False] * len(to_reorder)
    reordered_summaries = []
    for i in tqdm(range(len(argmaxes))):
        argmax_val = argmaxes[i]
        max_val = similarities[i, argmax_val]
        if max_val < 0.7:
            a = 0
            # raise Exception('Best result does not match well. \nSystem ref summ: %s\n\n Ordered ref summ: %s' % (ref_summs[argmax_val], ordered_ref_summs[i]))
        # if indices_found[argmax_val]:
        #     raise Exception('Best result was already matched with another ordered ref summ')
        indices_found[argmax_val] = True
        reordered_summaries.append(to_reorder[argmax_val])
    if len(reordered_summaries) != len(to_reorder):
        a = 0
        # raise Exception('reordered summaries len (%d) is not equal to original length (%d)' % (len(reordered_summaries), len(to_reorder)))
    return reordered_summaries
def get_single_sent_features(similar_source_indices, sent_term_matrix, doc_vector, article_sent_tokens):
    sent_idx = similar_source_indices[0]
    doc_similarity = util.cosine_similarity(sent_term_matrix[sent_idx], doc_vector)
    sent_len = len(article_sent_tokens[sent_idx])
    return sent_idx, doc_similarity, sent_len
def clustering(self):
    # Calculate the similarity matrix
    X = self.create_tfidf_vector()
    X = X.toarray()
    pca = PCA(n_components=300, copy=False)
    X = pca.fit(X).transform(X)
    S = cosine_similarity(X, X)
    # Run affinity propagation
    af = AffinityPropagation()
    af.fit(S)
    # Formulate the result
    tmp_clusters = defaultdict(list)
    goal_clusters = defaultdict(list)
    cluster_centers_indices = af.cluster_centers_indices_
    labels = af.labels_
    count = 0
    for label in labels:
        tmp_clusters[self.goal_list[cluster_centers_indices[label]]].append(self.goal_list[count])
        count += 1
    # Second-layer clustering of each cluster
    for goal, item_list in tmp_clusters.items():
        subclusters = self.subcluster_by_editdistance(goal, item_list)
        for subgoal, items in subclusters.items():
            goal_clusters[subgoal] = items
    return goal_clusters
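# Standalone sketch mirroring the TF-IDF -> cosine similarity -> AffinityPropagation
# pipeline in clustering() above, run on a hypothetical toy corpus. Note that when fitting
# on a precomputed similarity matrix, scikit-learn expects affinity='precomputed'.
from sklearn.cluster import AffinityPropagation
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity as sk_cosine_similarity

toy_goals = ["buy milk", "buy bread and milk", "fix the car", "repair the car engine"]
tfidf = TfidfVectorizer().fit_transform(toy_goals)
sim_matrix = sk_cosine_similarity(tfidf, tfidf)
ap = AffinityPropagation(affinity='precomputed', random_state=0).fit(sim_matrix)
for goal, label in zip(toy_goals, ap.labels_):
    print(label, goal)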
def main():
    args = ArgumentParser()
    args.add_argument('-c', '--camera_url', default=0, type=str, help='0 - local camera')
    args.add_argument('-dt', '--detect_threshold', default=0.975, type=float, help="Threshold of face detection")
    args.add_argument('-rf', '--recognized_threshold', default=0.8, type=float, help="Threshold of face recognition")
    args.add_argument('--device', default='cuda:0', type=str, help="Device to run the model on. `cuda:<id>` or `cpu`")
    args.add_argument('--detect_face_model', default='data/pretrained/mobilenet_header.pth', type=str, help="Face detector model path")
    args.add_argument('--detect_face_backbone', default='data/pretrained/mobile_backbone.tar', type=str, help="Face detector backbone path")
    args.add_argument('--recognized_model', default='data/pretrained/embedder_resnet50_asia.pth', type=str, help="Face embedding model path")
    args.add_argument('--model_registered', default='model_faces.npy', type=str, help="Model containing face vectors")
    args.add_argument('--model_ids', default='model_face_ids.npy', type=str, help="Model containing face ids")
    args = args.parse_args()

    try:
        args.camera_url = int(args.camera_url)
    except ValueError:
        pass

    if not (os.path.isfile(args.model_registered) and os.path.isfile(args.model_ids)):
        face_model = numpy.zeros((0, 512), dtype=numpy.float32)
        ids_model = []
    else:
        face_model = numpy.load(args.model_registered, allow_pickle=True)
        ids_model = numpy.load(args.model_ids, allow_pickle=True).tolist()

    detector = FaceDetection(args.detect_face_model, args.detect_face_backbone, scale_size=480, device=args.device)
    embedder = FaceEmbedding(args.recognized_model, device=args.device)

    # Recognize faces frame by frame
    video = VideoCapture(args.camera_url)
    for frame in video:
        faces = detector(frame)
        faces = embedder(faces)
        for face in faces:
            txt = "None"
            color = RED
            scores = cosine_similarity(face.embedding.reshape(1, 512), face_model, skip_normalize=True).ravel()
            args_idx = numpy.argmax(scores)
            if scores[args_idx] >= args.recognized_threshold:
                txt = ids_model[args_idx]
                color = GREEN
            frame = draw_square(frame, face.box.astype(numpy.int32), color=color)
            frame = cv2.putText(frame, f"EID: {txt}", (int(face.box[0]), int(face.box[1] - 20)),
                                cv2.FONT_HERSHEY_PLAIN, 1, GREEN)
        if not show_image(frame, 'Face Recognition', windows_size=(1920, 1080)):
            break
    video.stop()
def cluster_concepts(context="location"):
    """
    Cluster related concepts of a specific type into different categories
    """
    db = Database()
    concept_category = ConceptCategory()
    cmd = "SELECT * FROM %s" % (context)
    context_res = db.query_db(cmd)
    for item in context_res:
        concept_list = []
        concept_matrix = []
        if context == "action":
            context_id, context_chinese, context_name = item[:3]
        elif context == "location":
            context_id, context_name, context_chinese = item
        cmd = (
            "SELECT b.name, b.id FROM %s_concept AS a, concept AS b \
            WHERE a.%s_id = %s AND a.concept_id = b.id"
            % (context, context, context_id)
        )
        concept_res = db.query_db(cmd)
        if len(concept_res) == 0:
            continue
        for concept_item in concept_res:
            concept, concept_id = concept_item
            concept_vector = concept_category.concept_axes.row_named(concept)
            concept_list.append((concept_id, concept))
            concept_matrix.append(concept_vector)
        # Run affinity propagation
        S = cosine_similarity(concept_matrix, concept_matrix)
        af = AffinityPropagation()
        af.fit(S)
        cluster_centers_indices = af.cluster_centers_indices_
        labels = af.labels_
        count = 0
        clusters = defaultdict(list)
        for label in labels:
            clusters[concept_list[cluster_centers_indices[label]][1]].append(concept_list[count])
            count += 1
        category_num = 0
        for key, value in clusters.items():
            category_num += 1
            for concept in value:
                cmd = (
                    "UPDATE %s_concept SET category = %d WHERE \
                    %s_id = %s AND concept_id = %s"
                    % (context, category_num, context, context_id, concept[0])
                )
                db.query_db(cmd)
                print(concept[1] + " ", end="")
            print("")
        print("----------" + context_chinese + "----------")
def verify(self, img1_bgr, img2_bgr):
    result = []
    bboxes1 = self.detect_all_faces(img1_bgr)
    bboxes2 = self.detect_all_faces(img2_bgr)
    for bbox1 in bboxes1:
        dist = []
        _, feat1 = self.__get_face_feature(img1_bgr, bbox1)
        for bbox2 in bboxes2:
            _, feat2 = self.__get_face_feature(img2_bgr, bbox2)
            dist.append(util.cosine_similarity(feat1, feat2))
        result.append(dist)
    return result
def search_db(self, feature, max_num, threshold):
    score_list = []
    for item in self.__db:
        similarity = util.cosine_similarity(feature, item.feature)
        if similarity > threshold:
            score_list.append((item, similarity))
    if len(score_list) == 0:
        return 0, []
    sorted_score_list = sorted(score_list, key=lambda p: p[1], reverse=True)
    if max_num > len(sorted_score_list):
        max_num = len(sorted_score_list)
    return max_num, sorted_score_list[:max_num]
def get_predict_file(args, feature_extractor):
    assert os.path.exists(args.lfw_align)
    pairs = load_pairs(args.pairs)
    with open(args.predict_file, 'w') as f:
        for pair in pairs:
            name1, name2, same = pairs_info(pair, args.suffix)
            logging.info("processing name1:{} <---> name2:{}".format(name1, name2))
            img1_bgr, img2_bgr = read2img(args.lfw_align, name1, name2)
            feat1 = feature_extractor.extract_feature(img1_bgr)
            feat2 = feature_extractor.extract_feature(img2_bgr)
            dis = util.cosine_similarity(feat1, feat2)
            f.write(name1 + '\t' + name2 + '\t' + str(dis) + '\t' + str(same) + '\n')
def compute_similarity_by_avg(self, sents_1, sents_2):
    if len(sents_1) == 0 or len(sents_2) == 0:
        return 0.0
    # Sum the word vectors of all words in each sentence
    vec1 = self.__word2vec[sents_1[0]]
    for word1 in sents_1[1:]:
        vec1 = vec1 + self.__word2vec[word1]
    vec2 = self.__word2vec[sents_2[0]]
    for word2 in sents_2[1:]:
        vec2 = vec2 + self.__word2vec[word2]
    # Compare the averaged sentence vectors
    similarity = util.cosine_similarity(vec1 / len(sents_1), vec2 / len(sents_2))
    return similarity
def get_arg_max_label(ph, child_labels, embeddings, label_embeddings, threshold=0.0):
    filtered_phrase = ph.translate(translator)
    sims = []
    for ch in child_labels:
        try:
            child_label_str = " ".join([t for t in ch.split("_") if t not in stop_words]).strip()
            sims.append(cosine_similarity(embeddings[filtered_phrase], label_embeddings[child_label_str]))
        except Exception as e:
            print("Error while computing cosine sim", e)
            return None, None
    sim_softmax = softmax(np.array(sims))
    if max(sim_softmax) >= threshold:
        max_ind = np.argmax(sim_softmax)
        return child_labels[max_ind], max(sim_softmax)
    else:
        return None, None
def word_sent_similarity(word_vec, sent_vecs):
    """
    sent_vecs: a list of the vectors of the sentence's words
    """
    sent_len = len(sent_vecs)
    similarities = np.zeros(sent_len)
    for vec_i, vec in enumerate(sent_vecs):
        cosine_sim = cosine_similarity(vec, word_vec)
        # obtain a positive similarity
        sim = np.exp(cosine_sim)
        similarities[vec_i] = sim
    max_sim = np.max(similarities)
    return max_sim
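# Hypothetical usage sketch for word_sent_similarity(); the random vectors below stand in
# for real word embeddings, and cosine_similarity is assumed to be a 1-D helper as used
# elsewhere in these snippets.
if __name__ == "__main__":
    _rng = np.random.RandomState(0)
    _word_vec = _rng.normal(size=300)
    _sent_vecs = [_rng.normal(size=300) for _ in range(12)]
    print(word_sent_similarity(_word_vec, _sent_vecs))  # exp(cosine) of the best-matching word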
def test_undistort_image():
    image = cv2.imread("test/lena.png").astype(np.float32)
    H, W, _ = image.shape
    K = np.array([[[1.0, 0.0, W / 2],
                   [0.0, 1.0, H / 2],
                   [0.0, 0.0, 1.0]]], np.float32)
    dist = np.array([[1e-6, 0.0, 1e-5, 0.0]], np.float32)
    ref_image = cv2.undistort(image, K[0], dist[0])
    image = np.expand_dims(image.transpose((2, 0, 1)), axis=0)
    warped_image = C.undistort_image(K, dist, image)[0].data
    warped_image = warped_image.transpose((1, 2, 0))
    # cv2.imwrite('ref.png', ref_image)
    # cv2.imwrite('warped.png', warped_image)
    assert 1 - cosine_similarity(ref_image, warped_image) < eps
def test_warp_affine():
    identity = np.array([[1., 0., 0.],
                         [0., 1., 0.]])
    mat = identity + np.random.randn(2, 3) / 10
    image = cv2.imread("test/lena.png").astype(float)
    H, W, _ = image.shape
    ref_image = cv2.warpAffine(image, mat, (W, H))
    image = np.expand_dims(image.transpose((2, 0, 1)), axis=0)
    mat = np.expand_dims(mat, axis=0)
    warped_image = I.warp_affine(chainer.Variable(image), chainer.Variable(mat)).data
    warped_image = warped_image.transpose((0, 2, 3, 1)).reshape((H, W, 3))
    # cv2.imwrite('ref.png', ref_image)
    # cv2.imwrite('warped.png', warped_image)
    assert 1 - cosine_similarity(ref_image, warped_image) < eps
def test_reversibility():
    image = cv2.imread("test/lena.png").astype(np.float32)
    H, W, _ = image.shape
    K = np.array([[[1.0, 0.0, W / 2],
                   [0.0, 1.0, H / 2],
                   [0.0, 0.0, 1.0]]], np.float32)
    dist = np.array([[1e-6, 0.0, 1e-5, 0.0]], np.float32)
    image0 = np.expand_dims(image.transpose((2, 0, 1)), axis=0)
    distorted_image = C.undistort_image(K, dist, image0).data
    image1 = C.distort_image(K, dist, distorted_image).data
    distorted_image = distorted_image[0].transpose((1, 2, 0))
    image0 = image0[0].transpose((1, 2, 0))
    image1 = image1[0].transpose((1, 2, 0))
    # cv2.imwrite('distorted.png', distorted_image)
    # cv2.imwrite('image0.png', image0)
    # cv2.imwrite('image1.png', image1)
    assert 1 - cosine_similarity(image0, image1) < eps
def sent_sent_similarity(self, sent1_vecs, sent2_vecs):
    """
    sent1_vecs: a list of the word vectors of the 1st sentence
    sent2_vecs: same, for the 2nd sentence
    """
    if self.cfg.sent_sent_similarity_wordwise:
        similarities = []
        for word_vec in sent1_vecs:
            similarities.append(self.word_sent_similarity(word_vec, sent2_vecs))
        max_sim = max(similarities)
        return max_sim
    else:
        # cosine similarity between the mean vectors
        sent1_mean_w2v = np.mean(sent1_vecs, 0)
        sent2_mean_w2v = np.mean(sent2_vecs, 0)
        cosine_sim = cosine_similarity(sent1_mean_w2v, sent2_mean_w2v)
        sim = np.exp(cosine_sim)
        return sim
def get_pair_sent_features(similar_source_indices, sent_term_matrix, article_sent_tokens, mmr):
    features = []
    # features.append(1)  # is_sent_pair
    sent_idx1, sent_idx2 = similar_source_indices[0], similar_source_indices[1]
    sent1_features = get_single_sent_features(sent_idx1, sent_term_matrix, article_sent_tokens, mmr)
    features.extend(sent1_features[1:])  # sent_idx, doc_similarity, sent_len
    sent2_features = get_single_sent_features(sent_idx2, sent_term_matrix, article_sent_tokens, mmr)
    features.extend(sent2_features[1:])  # sent_idx, doc_similarity, sent_len
    average_mmr = (mmr[sent_idx1] + mmr[sent_idx2]) / 2
    sents_similarity = util.cosine_similarity(sent_term_matrix[sent_idx1], sent_term_matrix[sent_idx2])[0][0]
    sents_dist = abs(sent_idx1 - sent_idx2)
    if real_values:
        features.extend([average_mmr, sents_similarity])
        if include_sents_dist:
            features.append(sents_dist)
    else:
        features.extend(convert_to_one_hot(average_mmr, 5, (0, 1)))
        features.extend(convert_to_one_hot(sents_similarity, 5, (0, 1)))  # sents_similarity
        if include_sents_dist:
            features.extend(convert_to_one_hot(min(sents_dist, max_num_sents), 10, (0, max_num_sents)))  # sents_dist
    return features
np.save(plasmid_host_dist_path, plasmid_host)
util.save_obj(t, plasmid_host_class_path)

# %% Load related distances
# Load the calculated plasmid-host distance
plasmid_host = np.load(plasmid_host_dist_path)
plasmid_host[plasmid_host > 1000] = 1000

# Normalize the plasmid-host distance
plasmid_host_normalized = (plasmid_host - plasmid_host.min(axis=0)) / (
    plasmid_host.max(axis=0) - plasmid_host.min(axis=0)
)

# Calculate the plasmid-wise distance
plasmid_plasmid = util.cosine_similarity(plasmid_host, plasmid_host)

# %% Construct plasmid interaction table
host_list = list(set(metadata.Assembly_chainid))
host_list.sort()
host_to_idx_dict = {host: i for i, host in enumerate(host_list)}
idx_to_host_dict = {i: host for i, host in enumerate(host_list)}

# plasmid-strain indicator
interaction_table = np.zeros((len(metadata), len(set(host_list))))
for i in range(len(metadata)):
    interaction_table[i, host_to_idx_dict[metadata.Assembly_chainid[i]]] = 1

# %% Construct plasmid interaction table based on species
host_to_speciesid = {}
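# Illustrative sketch (an assumption, not the project's util module) of a row-wise pairwise
# cosine similarity consistent with util.cosine_similarity(plasmid_host, plasmid_host) above:
# rows of A are compared against rows of B, giving an (A.shape[0], B.shape[0]) matrix.
import numpy as np

def pairwise_cosine_similarity(A, B, eps=1e-12):
    A = np.asarray(A, dtype=np.float64)
    B = np.asarray(B, dtype=np.float64)
    A_normed = A / np.maximum(np.linalg.norm(A, axis=1, keepdims=True), eps)
    B_normed = B / np.maximum(np.linalg.norm(B, axis=1, keepdims=True), eps)
    return A_normed @ B_normed.T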
blast_results = {}
for key in blast_results_dict:
    blast_results[key.split(".")[0]] = blast_results_dict[key]

blast_results_mat = np.zeros((len(query_list), len(set(host_list))))
for i in range(blast_results_mat.shape[0]):
    success, series = blast_results[query_list[i]]
    if not success:
        continue
    else:
        for key in series.keys():
            idx = host_to_idx_dict[int(key[4:])]
            blast_results_mat[i, idx] = series[key]

# Calculate test-training plasmid distance and svpos
# test_plasmid_to_train_plasmid = util.cosine_similarity(plasmid_host_normalized, training_plasmid_host[:, :6])
test_plasmid_to_train_plasmid = util.cosine_similarity(plasmid_host_normalized, training_plasmid_host)
svpos = calc_svpos(test_plasmid_to_train_plasmid, training_interaction_indicator)

model_path = "data/model.pkl"
model = util.load_obj(model_path)

idx = np.arange(plasmid_host.shape[0])
# features = [plasmid_host_normalized, blast_results_mat[:, :6], svpos[:, :6]]
features = [plasmid_host_normalized, blast_results_mat, svpos]
combined_features = [feature.flatten()[:, None] for feature in features]
combined_features = np.hstack(combined_features)
prediction = model.predict_proba(combined_features)
prediction = prediction[:, 1].reshape((-1, features[0].shape[1]))
    return feature.copy()


if __name__ == '__main__':
    import sys
    if len(sys.argv) != 3:
        img1 = 'test1.jpg'
        img2 = 'test2.jpg'
    else:
        img1 = sys.argv[1].strip()
        img2 = sys.argv[2].strip()

    import cv2
    import util

    config.channel_num = 3
    config.face_size = 224
    config.feature_size = 4096
    config.extractor = 'vgg_face'
    extractor = VggFeatureExtractor()
    # img_bgr = cv2.imread('../model/vgg_face_caffe/ak.png')
    # feature = extractor.extract_feature(img_bgr)
    img1_bgr = cv2.imread(img1)
    img2_bgr = cv2.imread(img2)
    # img1_bgr = cv2.imread('../../../data/lfw-align/Shane_Loux/Shane_Loux_0001.png')
    # img2_bgr = cv2.imread('../../../data/lfw-align/Val_Ackerman/Val_Ackerman_0001.png')
    feat1 = extractor.extract_feature(img1_bgr)
    feat2 = extractor.extract_feature(img2_bgr)
    print(util.cosine_similarity(feat1, feat2))
def body(j, result):
    h_j = H_q[j, :, :]  # hidden_size x batch
    alpha = cosine_similarity(h_i, h_j)
    result = tf.concat([result, alpha], axis=1)
    return [j + 1, result]
def get_pseudo_label_surface_name(child_label_str, texts, embeddings, probability, parent, parent_labels, thresh=0.8):
    candidate_words = set()
    for sent in texts:
        tokens = set(sent.strip().split())
        if child_label_str in tokens:
            candidate_words.update(tokens)
    candidate_words = candidate_words - {child_label_str}
    filter_words = set([])
    for w in candidate_words:
        try:
            if cosine_similarity(embeddings[w], embeddings[child_label_str]) < thresh:
                filter_words.add(w)
        except Exception as e:
            print(e)
    candidate_words = candidate_words - filter_words
    candidate_words = list(candidate_words)
    scores = []
    kept_words = []
    try:
        child_label_thresh = probability[parent][child_label_str]
        den = 0
        for l in parent_labels:
            if l == parent:
                continue
            if child_label_str in probability[l] and probability[l][child_label_str] != -math.inf:
                den += probability[l][child_label_str]
        if den != 0:
            child_label_thresh = child_label_thresh / den
    except Exception as e:
        print(decipher_phrase(child_label_str, id_phrase_map), e)
        child_label_thresh = 0
    # child_label_thresh = 0
    for c in candidate_words:
        cos_sim = cosine_similarity(embeddings[c], embeddings[child_label_str])
        num = probability[parent][c]
        den = 0
        for l in parent_labels:
            if l == parent:
                continue
            if c in probability[l] and probability[l][c] != -math.inf:
                den += probability[l][c]
        if den != 0:
            val = cos_sim * (num / den)
        else:
            val = cos_sim * num
        if val > child_label_thresh:
            scores.append(val)
            kept_words.append(c)
    # Index into the kept candidates (not the full candidate list) so that scores and
    # words stay aligned when some candidates are filtered out above.
    inds = sorted(range(len(scores)), key=lambda i: scores[i])[-10:]
    words = []
    for i in inds:
        words.append(kept_words[i])
    return words
label_embeddings = pickle.load(open(pkl_dump_dir + "label_bert_word_phrase_embeddings.pkl", "rb"))

stop_words = set(stopwords.words('english'))
stop_words.add('would')
translator = str.maketrans(string.punctuation, ' ' * len(string.punctuation))

for p in parent_to_child:
    embeddings = label_embeddings[p]
    for ch in parent_to_child[p]:
        all_sims[ch] = {}
        mean_sim[ch] = 0
        child_label_str = " ".join([t for t in ch.split("_") if t not in stop_words]).strip()
        for w in embeddings:
            sim = cosine_similarity(embeddings[child_label_str], embeddings[w])
            all_sims[ch][w] = sim
            mean_sim[ch] += sim
        mean_sim[ch] = mean_sim[ch] / len(embeddings)
        all_sims[ch] = {
            k: v
            for k, v in sorted(all_sims[ch].items(), key=lambda item: -item[1])[:1000]
        }

print(mean_sim)
json.dump(all_sims, open(pkl_dump_dir + "all_sims_label_specific.json", "w"))
json.dump(mean_sim, open(pkl_dump_dir + "mean_sim_label_specific.json", "w"))
def main(unused_argv):
    print('Running statistics on %s' % FLAGS.dataset_name)
    if len(unused_argv) != 1:  # prints a message if you've entered flags incorrectly
        raise Exception("Problem with flags: %s" % unused_argv)
    if FLAGS.dataset_name == 'all':
        dataset_names = ['cnn_dm', 'xsum', 'duc_2004']
    else:
        dataset_names = [FLAGS.dataset_name]

    if not os.path.exists(plot_data_file):
        all_lists_of_histogram_pairs = []
        for dataset_name in dataset_names:
            FLAGS.dataset_name = dataset_name
            if dataset_name == 'duc_2004':
                dataset_splits = ['test']
            elif FLAGS.dataset_split == 'all':
                dataset_splits = ['test', 'val', 'train']
            else:
                dataset_splits = [FLAGS.dataset_split]

            ssi_list = []
            for dataset_split in dataset_splits:
                ssi_path = os.path.join(ssi_dir, FLAGS.dataset_name, dataset_split + '_ssi.pkl')
                with open(ssi_path) as f:
                    ssi_list.extend(pickle.load(f))
                if FLAGS.dataset_name == 'duc_2004':
                    for abstract_idx in [1, 2, 3]:
                        ssi_path = os.path.join(ssi_dir, FLAGS.dataset_name,
                                                dataset_split + '_ssi_' + str(abstract_idx) + '.pkl')
                        with open(ssi_path) as f:
                            temp_ssi_list = pickle.load(f)
                        ssi_list.extend(temp_ssi_list)

            ssi_2d = util.flatten_list_of_lists(ssi_list)

            num_extracted = [len(ssi) for ssi in util.flatten_list_of_lists(ssi_list)]
            hist_num_extracted = np.histogram(num_extracted, bins=6, range=(0, 5))
            print(hist_num_extracted)
            print('Histogram of number of sentences merged: ' + util.hist_as_pdf_str(hist_num_extracted))

            distances = [abs(ssi[0] - ssi[1]) for ssi in ssi_2d if len(ssi) >= 2]
            print('Distance between sentences (mean, median): ', np.mean(distances), np.median(distances))
            hist_dist = np.histogram(distances, bins=max(distances))
            print('Histogram of distances: ' + util.hist_as_pdf_str(hist_dist))

            summ_sent_idx_to_number_of_source_sents = [[], [], [], [], [], [], [], [], [], []]
            for ssi in ssi_list:
                for summ_sent_idx, source_indices in enumerate(ssi):
                    if len(source_indices) == 0 or summ_sent_idx >= len(summ_sent_idx_to_number_of_source_sents):
                        continue
                    num_sents = len(source_indices)
                    if num_sents > 2:
                        num_sents = 2
                    summ_sent_idx_to_number_of_source_sents[summ_sent_idx].append(num_sents)
            print("Number of source sents for summary sentence indices (Is the first summary sent more likely to match with a singleton or a pair?):")
            for summ_sent_idx, list_of_numbers_of_source_sents in enumerate(summ_sent_idx_to_number_of_source_sents):
                if len(list_of_numbers_of_source_sents) == 0:
                    percent_singleton = 0.
                else:
                    percent_singleton = list_of_numbers_of_source_sents.count(1) * 1. / len(list_of_numbers_of_source_sents)
                    percent_pair = list_of_numbers_of_source_sents.count(2) * 1. / len(list_of_numbers_of_source_sents)
                print(str(percent_singleton) + '\t', end='')
            print('')
            for summ_sent_idx, list_of_numbers_of_source_sents in enumerate(summ_sent_idx_to_number_of_source_sents):
                if len(list_of_numbers_of_source_sents) == 0:
                    percent_pair = 0.
                else:
                    percent_singleton = list_of_numbers_of_source_sents.count(1) * 1. / len(list_of_numbers_of_source_sents)
                    percent_pair = list_of_numbers_of_source_sents.count(2) * 1. / len(list_of_numbers_of_source_sents)
                print(str(percent_pair) + '\t', end='')
            print('')

            primary_pos = [ssi[0] for ssi in ssi_2d if len(ssi) >= 1]
            secondary_pos = [ssi[1] for ssi in ssi_2d if len(ssi) >= 2]
            all_pos = [max(ssi) for ssi in ssi_2d if len(ssi) >= 1]
            # if FLAGS.dataset_name != 'duc_2004':
            #     plot_positions(primary_pos, secondary_pos, all_pos)

            if FLAGS.dataset_split == 'all':
                glob_string = '*.bin'
            else:
                glob_string = dataset_splits[0]

            print('Loading TFIDF vectorizer')
            with open(tfidf_vec_path, 'rb') as f:
                tfidf_vectorizer = pickle.load(f)

            source_dir = os.path.join(data_dir, FLAGS.dataset_name)
            source_files = sorted(glob.glob(source_dir + '/' + glob_string + '*'))
            total = len(source_files) * 1000 if ('cnn' in FLAGS.dataset_name or 'newsroom' in FLAGS.dataset_name
                                                 or 'xsum' in FLAGS.dataset_name) else len(source_files)
            example_generator = data.example_generator(source_dir + '/' + glob_string + '*',
                                                       True, False, should_check_valid=False)

            all_possible_singles = 0
            all_possible_pairs = [0]
            all_filtered_pairs = 0
            all_all_combinations = 0
            all_ssi_pairs = [0]
            ssi_pairs_with_shared_coref = [0]
            ssi_pairs_with_shared_word = [0]
            ssi_pairs_with_either_coref_or_word = [0]
            all_pairs_with_shared_coref = [0]
            all_pairs_with_shared_word = [0]
            all_pairs_with_either_coref_or_word = [0]
            actual_total = [0]
            rel_positions_primary = []
            rel_positions_secondary = []
            rel_positions_all = []
            sent_lens = []
            all_sent_lens = []
            all_pos = []
            y = []
            normalized_positions_primary = []
            normalized_positions_secondary = []
            all_normalized_positions_primary = []
            all_normalized_positions_secondary = []
            normalized_positions_singles = []
            normalized_positions_pairs_first = []
            normalized_positions_pairs_second = []
            primary_pos_duc = []
            secondary_pos_duc = []
            all_pos_duc = []
            all_distances = []
            distances_duc = []
            tfidf_similarities = []
            all_tfidf_similarities = []
            average_mmrs = []
            all_average_mmrs = []

            for example_idx, example in enumerate(tqdm(example_generator, total=total)):
                # def process(example_idx_example):
                #     # print '0'
                #     example = example_idx_example
                if FLAGS.num_instances != -1 and example_idx >= FLAGS.num_instances:
                    break
                raw_article_sents, groundtruth_similar_source_indices_list, groundtruth_summary_text, corefs, doc_indices = util.unpack_tf_example(
                    example, names_to_types)
                article_sent_tokens = [util.process_sent(sent) for sent in raw_article_sents]
                article_text = ' '.join(raw_article_sents)
                groundtruth_summ_sents = [[sent.strip() for sent in groundtruth_summary_text.strip().split('\n')]]
                if doc_indices is None:
                    doc_indices = [0] * len(util.flatten_list_of_lists(article_sent_tokens))
                doc_indices = [int(doc_idx) for doc_idx in doc_indices]
                rel_sent_indices, doc_sent_indices, doc_sent_lens = preprocess_for_lambdamart_no_flags.get_rel_sent_indices(
                    doc_indices, article_sent_tokens)
                groundtruth_similar_source_indices_list = util.enforce_sentence_limit(
                    groundtruth_similar_source_indices_list, FLAGS.sentence_limit)

                sent_term_matrix = util.get_doc_substituted_tfidf_matrix(
                    tfidf_vectorizer, raw_article_sents, article_text)
                sents_similarities = util.cosine_similarity(sent_term_matrix, sent_term_matrix)
                importances = util.special_squash(util.get_tfidf_importances(tfidf_vectorizer, raw_article_sents))

                if FLAGS.dataset_name == 'duc_2004':
                    first_k_indices = lambdamart_scores_to_summaries.get_indices_of_first_k_sents_of_each_article(
                        rel_sent_indices, FLAGS.first_k)
                else:
                    first_k_indices = [idx for idx in range(len(raw_article_sents))]

                article_indices = list(range(len(raw_article_sents)))
                possible_pairs = [x for x in list(itertools.combinations(article_indices, 2))]  # all pairs
                # # filtered_possible_pairs = preprocess_for_lambdamart_no_flags.filter_pairs_by_criteria(raw_article_sents, possible_pairs, corefs)
                # if FLAGS.dataset_name == 'duc_2004':
                #     filtered_possible_pairs = [x for x in list(itertools.combinations(first_k_indices, 2))]  # all pairs
                # else:
                #     filtered_possible_pairs = preprocess_for_lambdamart_no_flags.filter_pairs_by_sent_position(possible_pairs)
                # # removed_pairs = list(set(possible_pairs) - set(filtered_possible_pairs))
                # possible_singles = [(i,) for i in range(len(raw_article_sents))]
                # all_combinations = filtered_possible_pairs + possible_singles
                #
                # all_possible_singles += len(possible_singles)
                # all_possible_pairs[0] += len(possible_pairs)
                # all_filtered_pairs += len(filtered_possible_pairs)
                # all_all_combinations += len(all_combinations)
                # for ssi in groundtruth_similar_source_indices_list:
                #     if len(ssi) > 0:
                #         idx = rel_sent_indices[ssi[0]]
                #         rel_positions_primary.append(idx)
                #         rel_positions_all.append(idx)
                #     if len(ssi) > 1:
                #         idx = rel_sent_indices[ssi[1]]
                #         rel_positions_secondary.append(idx)
                #         rel_positions_all.append(idx)
                #
                # # coref_pairs = preprocess_for_lambdamart_no_flags.get_coref_pairs(corefs)
                # # DO OVERLAP PAIRS BETTER
                # overlap_pairs = preprocess_for_lambdamart_no_flags.filter_by_overlap(article_sent_tokens, possible_pairs)
                # either_coref_or_word = list(set(list(coref_pairs) + overlap_pairs))
                #
                # for ssi in groundtruth_similar_source_indices_list:
                #     if len(ssi) == 2:
                #         all_ssi_pairs[0] += 1
                #         do_share_coref = ssi in coref_pairs
                #         do_share_words = ssi in overlap_pairs
                #         if do_share_coref:
                #             ssi_pairs_with_shared_coref[0] += 1
                #         if do_share_words:
                #             ssi_pairs_with_shared_word[0] += 1
                #         if do_share_coref or do_share_words:
                #             ssi_pairs_with_either_coref_or_word[0] += 1
                # all_pairs_with_shared_coref[0] += len(coref_pairs)
                # all_pairs_with_shared_word[0] += len(overlap_pairs)
                # all_pairs_with_either_coref_or_word[0] += len(either_coref_or_word)

                if FLAGS.dataset_name == 'duc_2004':
                    primary_pos_duc.extend([
                        rel_sent_indices[ssi[0]]
                        for ssi in groundtruth_similar_source_indices_list if len(ssi) >= 1
                    ])
                    secondary_pos_duc.extend([
                        rel_sent_indices[ssi[1]]
                        for ssi in groundtruth_similar_source_indices_list if len(ssi) >= 2
                    ])
                    all_pos_duc.extend([
                        max([rel_sent_indices[sent_idx] for sent_idx in ssi])
                        for ssi in groundtruth_similar_source_indices_list if len(ssi) >= 1
                    ])

                for ssi in groundtruth_similar_source_indices_list:
                    for sent_idx in ssi:
                        sent_lens.append(len(article_sent_tokens[sent_idx]))
                    if len(ssi) >= 1:
                        orig_val = ssi[0]
                        vals_to_add = get_integral_values_for_histogram(
                            orig_val, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents)
                        normalized_positions_primary.extend(vals_to_add)
                    if len(ssi) >= 2:
                        orig_val = ssi[1]
                        vals_to_add = get_integral_values_for_histogram(
                            orig_val, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents)
                        normalized_positions_secondary.extend(vals_to_add)
                        if FLAGS.dataset_name == 'duc_2004':
                            distances_duc.append(abs(rel_sent_indices[ssi[1]] - rel_sent_indices[ssi[0]]))
                        tfidf_similarities.append(sents_similarities[ssi[0], ssi[1]])
                        average_mmrs.append((importances[ssi[0]] + importances[ssi[1]]) / 2)

                for ssi in groundtruth_similar_source_indices_list:
                    if len(ssi) == 1:
                        orig_val = ssi[0]
                        vals_to_add = get_integral_values_for_histogram(
                            orig_val, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents)
                        normalized_positions_singles.extend(vals_to_add)
                    if len(ssi) >= 2:
                        if doc_sent_indices[ssi[0]] != doc_sent_indices[ssi[1]]:
                            continue
                        orig_val_first = min(ssi[0], ssi[1])
                        vals_to_add = get_integral_values_for_histogram(
                            orig_val_first, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents)
                        normalized_positions_pairs_first.extend(vals_to_add)
                        orig_val_second = max(ssi[0], ssi[1])
                        vals_to_add = get_integral_values_for_histogram(
                            orig_val_second, rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents)
                        normalized_positions_pairs_second.extend(vals_to_add)

                # all_normalized_positions_primary.extend(util.flatten_list_of_lists([get_integral_values_for_histogram(single[0], rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents) for single in possible_singles]))
                # all_normalized_positions_secondary.extend(util.flatten_list_of_lists([get_integral_values_for_histogram(pair[1], rel_sent_indices, doc_sent_indices, doc_sent_lens, raw_article_sents) for pair in possible_pairs]))
                all_sent_lens.extend([len(sent) for sent in article_sent_tokens])
                all_distances.extend([abs(rel_sent_indices[pair[1]] - rel_sent_indices[pair[0]])
                                      for pair in possible_pairs])
                all_tfidf_similarities.extend([sents_similarities[pair[0], pair[1]] for pair in possible_pairs])
                all_average_mmrs.extend([(importances[pair[0]] + importances[pair[1]]) / 2
                                         for pair in possible_pairs])

                # if FLAGS.dataset_name == 'duc_2004':
                #     rel_pos_single = [rel_sent_indices[single[0]] for single in possible_singles]
                #     rel_pos_pair = [[rel_sent_indices[pair[0]], rel_sent_indices[pair[1]]] for pair in possible_pairs]
                #     all_pos.extend(rel_pos_single)
                #     all_pos.extend([max(pair) for pair in rel_pos_pair])
                # else:
                #     all_pos.extend(util.flatten_list_of_lists(possible_singles))
                #     all_pos.extend([max(pair) for pair in possible_pairs])
                # y.extend([1 if single in groundtruth_similar_source_indices_list else 0 for single in possible_singles])
                # y.extend([1 if pair in groundtruth_similar_source_indices_list else 0 for pair in possible_pairs])
                # actual_total[0] += 1

            # # p = Pool(144)
            # # list(tqdm(p.imap(process, example_generator), total=total))
            #
            # # print 'Possible_singles\tPossible_pairs\tFiltered_pairs\tAll_combinations: \n%.2f\t%.2f\t%.2f\t%.2f' % (all_possible_singles*1./actual_total, \
            # #     all_possible_pairs*1./actual_total, all_filtered_pairs*1./actual_total, all_all_combinations*1./actual_total)
            #
            # # print 'Relative positions of groundtruth source sentences in document:\nPrimary\tSecondary\tBoth\n%.2f\t%.2f\t%.2f' % (np.mean(rel_positions_primary), np.mean(rel_positions_secondary), np.mean(rel_positions_all))
            #
            # # print 'SSI Pair statistics:\nShare_coref\tShare_word\tShare_either\n%.2f\t%.2f\t%.2f' \
            # #     % (ssi_pairs_with_shared_coref[0]*100./all_ssi_pairs[0], ssi_pairs_with_shared_word[0]*100./all_ssi_pairs[0], ssi_pairs_with_either_coref_or_word[0]*100./all_ssi_pairs[0])
            # # print 'All Pair statistics:\nShare_coref\tShare_word\tShare_either\n%.2f\t%.2f\t%.2f' \
            # #     % (all_pairs_with_shared_coref[0]*100./all_possible_pairs[0], all_pairs_with_shared_word[0]*100./all_possible_pairs[0], all_pairs_with_either_coref_or_word[0]*100./all_possible_pairs[0])
            #
            # # hist_all_pos = np.histogram(all_pos, bins=max(all_pos)+1)
            # # print 'Histogram of all sent positions: ', util.hist_as_pdf_str(hist_all_pos)
            # # min_sent_len = min(sent_lens)
            # # hist_sent_lens = np.histogram(sent_lens, bins=max(sent_lens)-min_sent_len+1)
            # # print 'min, max sent lens:', min_sent_len, max(sent_lens)
            # # print 'Histogram of sent lens: ', util.hist_as_pdf_str(hist_sent_lens)
            # # min_all_sent_len = min(all_sent_lens)
            # # hist_all_sent_lens = np.histogram(all_sent_lens, bins=max(all_sent_lens)-min_all_sent_len+1)
            # # print 'min, max all sent lens:', min_all_sent_len, max(all_sent_lens)
            # # print 'Histogram of all sent lens: ', util.hist_as_pdf_str(hist_all_sent_lens)
            #
            # # print 'Pearsons r, p value', pearsonr(all_pos, y)
            # # fig, ax1 = plt.subplots(nrows=1)
            # # plt.scatter(all_pos, y)
            # # pp = PdfPages(os.path.join('stuff/plots', FLAGS.dataset_name + '_position_scatter.pdf'))
            # # plt.savefig(pp, format='pdf',bbox_inches='tight')
            # # plt.show()
            # # pp.close()
            #
            # # if FLAGS.dataset_name == 'duc_2004':
            # #     plot_positions(primary_pos_duc, secondary_pos_duc, all_pos_duc)
            # # normalized_positions_all = normalized_positions_primary + normalized_positions_secondary
            # # plot_histogram(normalized_positions_primary, num_bins=100)
            # # plot_histogram(normalized_positions_secondary, num_bins=100)
            # # plot_histogram(normalized_positions_all, num_bins=100)
            # # sent_lens_together = [sent_lens, all_sent_lens]
            # # plot_histogram(sent_lens_together, pdf=True, start_at_0=True, max_val=70)
            #
            # if FLAGS.dataset_name == 'duc_2004':
            #     distances = distances_duc
            # sent_distances_together = [distances, all_distances]
            # # plot_histogram(sent_distances_together, pdf=True, start_at_0=True, max_val=100)
            #
            # tfidf_similarities_together = [tfidf_similarities, all_tfidf_similarities]
            # # plot_histogram(tfidf_similarities_together, pdf=True, num_bins=100)
            #
            # average_mmrs_together = [average_mmrs, all_average_mmrs]
            # # plot_histogram(average_mmrs_together, pdf=True, num_bins=100)
            #
            # normalized_positions_primary_together = [normalized_positions_primary, bin_values]
            # normalized_positions_secondary_together = [normalized_positions_secondary, bin_values]
            # # plot_histogram(normalized_positions_primary_together, pdf=True, num_bins=100)
            # # plot_histogram(normalized_positions_secondary_together, pdf=True, num_bins=100)
            #
            # list_of_hist_pairs = [
            #     {
            #         'lst': normalized_positions_primary_together,
            #         'pdf': True,
            #         'num_bins': 100,
            #         'y_lim': 3.9,
            #         'y_label': FLAGS.dataset_name,
            #         'x_label': 'Sent position (primary)'
            #     },
            #     {
            #         'lst': normalized_positions_secondary_together,
            #         'pdf': True,
            #         'num_bins': 100,
            #         'y_lim': 3.9,
            #         'x_label': 'Sent position (secondary)'
            #     },
            #     {
            #         'lst': sent_distances_together,
            #         'pdf': True,
            #         'start_at_0': True,
            #         'max_val': 100,
            #         'x_label': 'Sent distance'
            #     },
            #     {
            #         'lst': sent_lens_together,
            #         'pdf': True,
            #         'start_at_0': True,
            #         'max_val': 70,
            #         'x_label': 'Sent length'
            #     },
            #     {
            #         'lst': average_mmrs_together,
            #         'pdf': True,
            #         'num_bins': 100,
            #         'x_label': 'Average TF-IDF importance'
            #     }
            # ]

            normalized_positions_pairs_together = [normalized_positions_pairs_first, normalized_positions_pairs_second]
            list_of_hist_pairs = [
                {
                    'lst': [normalized_positions_singles],
                    'pdf': True,
                    'num_bins': 100,
                    # 'y_lim': 3.9,
                    'x_lim': 1.0,
                    'y_label': FLAGS.dataset_name,
                    'x_label': 'Sent Position (Singles)',
                    'legend_labels': ['Primary']
                },
                {
                    'lst': normalized_positions_pairs_together,
                    'pdf': True,
                    'num_bins': 100,
                    # 'y_lim': 3.9,
                    'x_lim': 1.0,
                    'x_label': 'Sent Position (Pairs)',
                    'legend_labels': ['Primary', 'Secondary']
                }
            ]
            all_lists_of_histogram_pairs.append(list_of_hist_pairs)

        with open(plot_data_file, 'w') as f:
            cPickle.dump(all_lists_of_histogram_pairs, f)
    else:
        with open(plot_data_file) as f:
            all_lists_of_histogram_pairs = cPickle.load(f)

    plot_histograms(all_lists_of_histogram_pairs)