def feature_test(formula, feature_mapping):

    actuals = []
    confidences = []

    files = training_files + test_files

    for doc_num, doc in enumerate(get_cached_reduced_docs("paragraph", files)):
        print doc_num  # progress indicator
        for span in doc.get_spans():
            actuals.append(1 if doc.span_is_plagiarized(span) else 0)

        computed_feature_vector = []
        feature_vectors = doc.get_feature_vectors(features, session)
        for feature_tuple in feature_vectors:
            # Map each feature value to its feature name for formula evaluation
            ith_feature_slice = {features[j]: value for j, value in enumerate(feature_tuple)}
            computed_feature = test_evaluate(formula, ith_feature_slice, feature_mapping)
            computed_feature_vector.append([computed_feature])

        #print "clustering with", computed_feature_vector
        confidences += cluster("kmeans", 2, computed_feature_vector)

    #print "Conf, Actual", confidences, actuals
    # For each document, add (1 - AUC) for our ROC calculation
    return (1 - BaseUtility.draw_roc(actuals, confidences, save_figure=False)[1])
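
# A minimal sketch (not from the original source) of the per-passage slice built
# above: each feature tuple is turned into a {feature_name: value} dict before
# the formula is evaluated. The feature names and values here are illustrative.
demo_features = ["average_word_length", "stopword_percentage"]
demo_tuple = (4.2, 0.31)
print {demo_features[j]: value for j, value in enumerate(demo_tuple)}
# e.g. {'average_word_length': 4.2, 'stopword_percentage': 0.31}
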
    def fitness(self, training=True):
        badness = 0.0
        actuals = []
        confidences = []

        files = training_files if training else test_files

        try:
            for doc in get_cached_reduced_docs(atom_type, files):
                for span in doc.get_spans():
                    actuals.append(1 if doc.span_is_plagiarized(span) else 0)

                computed_feature_vector = []
                feature_vectors = doc.get_feature_vectors(features, session)
                for feature_tuple in feature_vectors:
                    ith_feature_slice = {chr(ord("A") + j):value for j,value in enumerate(feature_tuple)}
                    #print ith_feature_slice
                    computed_feature = self.calc(**ith_feature_slice)
                    computed_feature_vector.append([computed_feature])

                #print "clustering with", computed_feature_vector
                confidences += cluster(cluster_type, 2, computed_feature_vector)

            #print "Conf, Actual", confidences, actuals
            # For each document, add (1 - AUC) for our ROC calculation
            badness += (1 - BaseUtility.draw_roc(actuals, confidences, save_figure=False)[1])

            return badness
        except OverflowError:
            return 1.0e+255 # infinitely bad
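
# A minimal sketch (assumed, not from the source) of the keyword expansion used
# by fitness(): feature j becomes the single-letter keyword chr(ord("A") + j),
# so a 3-feature tuple is passed to calc as A, B and C. Values are made up.
def demo_calc(A, B, C):
    # Stand-in for self.calc: any expression over the single-letter features
    return A * B - C

demo_slice = {chr(ord("A") + j): v for j, v in enumerate((0.12, 3.4, 0.56))}
print demo_calc(**demo_slice)  # prints -0.152
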
Example #4
def get_confidences_actuals(session, features, cluster_type, k, atom_type, docs,
                            corpus='intrinsic', save_roc_figure=True, reduced_docs=None,
                            feature_vector_weights=None, metadata={}, cheating=False,
                            cheating_min_len=5000, **clusterargs):
    '''
    Return the confidences and actuals for the given list of documents parsed
    by atom_type, using the given features, cluster_type, and number of clusters k.

    features is a list of strings where each string is the name of a StylometricFeatureEvaluator method.
    cluster_type is "kmeans", "hmm", or "agglom".
    k is an integer.
    atom_type is "word", "sentence", or "paragraph".
    docs should be a list of full path strings.
    '''
    # TODO: Return more statistics, not just roc curve things.
    # If previous call cached <reduced_docs>, don't re-query the DB
    if not reduced_docs:
        reduced_docs = _get_reduced_docs(atom_type, docs, session, corpus=corpus)
    plag_likelihoods = []
    actuals = []
    
    count = 0
    valid_reduced_docs = []
    for d in reduced_docs:
        count += 1
        if DEBUG:
            print "On document %s (%d of %d)." % (d, count, len(reduced_docs))

        feature_vecs = d.get_feature_vectors(features, session, cheating=cheating, cheating_min_len=cheating_min_len)
        # skip if there are no feature_vectors
        if cheating and len(feature_vecs) < 7: # 7, because that's what Benno did
            continue
        valid_reduced_docs.append(d)

        # Record the ground truth for each span
        for span in d.get_spans():
            actuals.append(1 if d.span_is_plagiarized(span) else 0)

        if feature_vector_weights:
            # Scale each feature dimension by its corresponding weight
            feature_vecs = [[v * w for v, w in zip(vec, feature_vector_weights)]
                            for vec in feature_vecs]

        likelihood = cluster(cluster_type, k, feature_vecs, **clusterargs)
        plag_likelihoods.append(likelihood)
    
    session.close()

    all_confidences = []
    for likelihood_list in plag_likelihoods:
        all_confidences += likelihood_list

    return all_confidences, actuals
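
# Toy illustration (invented numbers) of the flattening above: per-document
# likelihood lists are concatenated into one confidence list, in document
# order, matching the order in which actuals were appended.
demo_likelihoods = [[0.9, 0.1], [0.4]]
all_conf = []
for likelihood_list in demo_likelihoods:
    all_conf += likelihood_list
print all_conf  # [0.9, 0.1, 0.4]
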
Example #5
def extract_and_serialize(txt_file, xml_file, out_file, atom_type='paragraph',
                          cluster_method='kmeans', k=2):
    '''
    Performs all of intrinsic (feature extraction, clustering etc.) and creates
    Passage objects for each passage in <txt_file>. Writes a CSV file out
    to <out_file> containing all the features of <txt_file>

    The CSV files can be read easily by R in order to create plots
    '''
    with open(txt_file, 'r') as f:
        text = f.read()

    util = IntrinsicUtility() 

    feature_names = [
        'average_word_length',
        'average_sentence_length',
        'stopword_percentage',
        'punctuation_percentage',
        'syntactic_complexity',
        'avg_internal_word_freq_class',
        'avg_external_word_freq_class'
    ]

    ext = FeatureExtractor(text)
    print 'Initialized extractor'
    # Note that passages don't know their ground truths yet
    passages = ext.get_passages(feature_names, atom_type)
    print 'Extracted passages'
    util.add_ground_truth_to_passages(passages, xml_file)

    feature_vecs = [p.features.values() for p in passages]

    # If just testing feature extraction, don't cluster passages
    if cluster_method != 'none':
        # Cluster the passages and set their confidences
        confidences = cluster(cluster_method, k, feature_vecs)
        for psg, conf in zip(passages, confidences):
            psg.set_plag_confidence(conf)

    with open(out_file, 'wb') as f:
        csv_writer = csv.writer(f)
        # Write the CSV header, then one row of features per passage
        csv_writer.writerow(IntrinsicPassage.serialization_header(feature_names))
        for p in passages:
            csv_writer.writerow(p.to_list(feature_names))
    print 'Finished writing', out_file
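
# Hypothetical invocation; these file names are placeholders, not paths that
# exist in this project.
# extract_and_serialize('suspicious-document00001.txt',
#                       'suspicious-document00001.xml',
#                       'suspicious-document00001.csv',
#                       atom_type='paragraph', cluster_method='kmeans', k=2)
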
Example #6
def evaluate(reduced_docs, session, features, cluster_type, k, atom_type, docs,
             corpus='intrinsic', save_roc_figure=True, feature_vector_weights=None,
             metadata=None, cheating=False, cheating_min_len=5000, **clusterargs):
    '''
    Return the roc curve path and area under the roc curve for the given list of documents parsed
    by atom_type, using the given features, cluster_type, and number of clusters k.
    
    features is a list of strings where each string is the name of a StylometricFeatureEvaluator method.
    cluster_type is "kmeans", "hmm", or "agglom".
    k is an integer.
    atom_type is "word", "sentence", or "paragraph".
    docs should be a list of full path strings.
    '''
    # <reduced_docs> is passed in (rather than queried here) so that repeated
    # calls can reuse cached documents without re-querying the DB
    plag_likelihoods = []
    doc_plag_assignments = {}
    
    count = 0
    valid_reduced_docs = []
    for d in reduced_docs:
        count += 1
        if DEBUG:
            print "On document %s (%d of %d)." % (d, count, len(reduced_docs))

        feature_vecs = d.get_feature_vectors(features, session, cheating=cheating, cheating_min_len=cheating_min_len)
        # skip if there are no feature_vectors
        if cheating and len(feature_vecs) < 7: # 7, because that's what Benno did
            continue
        valid_reduced_docs.append(d)

        if feature_vector_weights:
            # Scale each feature dimension by its corresponding weight
            feature_vecs = [[v * w for v, w in zip(vec, feature_vector_weights)]
                            for vec in feature_vecs]

        likelihood = cluster(cluster_type, k, feature_vecs, **clusterargs)
        doc_plag_assignments[d] = likelihood
        plag_likelihoods.append(likelihood)
    
    # Avoid mutating a shared default dict across calls
    if metadata is None:
        metadata = {}
    metadata['features'] = features
    metadata['cluster_type'] = cluster_type
    metadata['k'] = k
    metadata['atom_type'] = atom_type
    metadata['n'] = len(reduced_docs)
    roc_path, roc_auc = _roc(valid_reduced_docs, plag_likelihoods, save_roc_figure=save_roc_figure, cheating=cheating, cheating_min_len=cheating_min_len, **metadata)

    return roc_path, roc_auc
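
# Hedged usage sketch: query reduced docs once, then reuse them across several
# evaluate() calls. <docs>, <features> and <session> are assumed to be defined
# as they are elsewhere in this module.
# reduced = _get_reduced_docs('paragraph', docs, session)
# kmeans_path, kmeans_auc = evaluate(reduced, session, features, 'kmeans', 2, 'paragraph', docs)
# agglom_path, agglom_auc = evaluate(reduced, session, features, 'agglom', 2, 'paragraph', docs)
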
def get_cached_confidences(doc):
    if doc in cached_confidences:
        return cached_confidences[doc]

    confidence_vectors = [[] for _ in range(len(doc.get_spans()))]
    feature_vectors = doc.get_feature_vectors(features, session)
    num_passages = len(feature_vectors)
    num_features = len(features)
    # Cluster each feature independently, then collect per-passage confidences
    for feature_index in range(num_features):
        one_feature_vector = [passage_features[feature_index]
                              for passage_features in feature_vectors]
        one_feature_confidences = cluster(cluster_type, 2,
                                          [[value] for value in one_feature_vector])
        for passage_index in range(num_passages):
            confidence_vectors[passage_index].append(one_feature_confidences[passage_index])

    cached_confidences[doc] = confidence_vectors
    return confidence_vectors
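
# Toy illustration (fake numbers) of the assembly above: one confidence list
# per feature is folded into one confidence vector per passage, i.e. a transpose.
per_feature = [[0.9, 0.1, 0.4],   # feature 0: one confidence per passage
               [0.8, 0.2, 0.3]]   # feature 1
print [[col[i] for col in per_feature] for i in range(3)]
# [[0.9, 0.8], [0.1, 0.2], [0.4, 0.3]]
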
def _get_feature_conf_and_actuals(features, cluster_type, atom_type, start_doc, n,
                                  pct_plag=None, **cluster_args):
    '''
    Returns a matrix of dimension <num_passages> x <num_features> where each row
    holds the confidence that the corresponding passage was plagiarized according
    to each feature. In other words, mat[passage_num][feat_num] is the plag.
    confidence of <passage_num> according to <feat_num>.

    Note that the transpose of this matrix is built below, and transposed before
    returning.
    '''

    first_training_files = IntrinsicUtility().get_n_training_files(n, first_doc_num=start_doc, pct_plag=pct_plag)
    session = Session()
    reduced_docs = _get_reduced_docs(atom_type, first_training_files, session)

    actuals = []
    
    # feature_conf_matrix[feat][span_index] == Conf. that <span_index>
    # was plag. according to <feat>
    # NOTE that we're ignoring document boundaries in the storage of this 
    # matrix. So <span_index> is not relative to any document
    feature_conf_matrix = [[] for i in xrange(len(features))]
    
    for doc_index, doc in enumerate(reduced_docs):
        if doc_index % 10 == 0:
            print 'Working on doc number (in training corpus)', start_doc + doc_index
        spans = doc.get_spans()

        for feat_num, feat in enumerate(features):
            feature_vecs = doc.get_feature_vectors([feat], session)
            # One column, i.e. confidence values for <feat> over all passages
            # in <doc>
            confidences = cluster(cluster_type, 2, feature_vecs, **cluster_args)
            # Use .append instead of .extend if we care about document_num
            feature_conf_matrix[feat_num].extend(confidences)
            
        for span in spans:
            actuals.append(1 if doc.span_is_plagiarized(span) else 0)

    rotated = np.matrix(feature_conf_matrix).T

    return rotated, actuals
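
# A minimal check (invented numbers) of the final transpose: the matrix is
# built features x spans, so .T yields spans x features as documented.
import numpy as np
demo_matrix = [[0.9, 0.1], [0.8, 0.2]]  # 2 features x 2 spans
print np.matrix(demo_matrix).T
# [[ 0.9  0.8]
#  [ 0.1  0.2]]
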
Example #10
def _cluster_auc_test(num_plag, num_noplag, mean_diff, std, dimensions=1, repetitions=1):
    '''
    roc area under curve evaluation of various clustering techniques
    creates two peaks based on normal distributions and tries to cluster them
    prints out AUC stat for each cluster type
    '''
    print "running cluster auc test with", num_plag, num_noplag, mean_diff, std, dimensions, repetitions
    averages = {}

    for rep in range(repetitions):

        noplag_features = [[scipy.random.normal(0, std) for _ in range(dimensions)]
                           for _ in range(num_noplag)]

        plag_features = [[scipy.random.normal(mean_diff, std) for _ in range(dimensions)]
                         for _ in range(num_plag)]

        features = noplag_features + plag_features
        actuals = [0] * num_noplag + [1] * num_plag

        for clus_type in ["kmeans", "agglom", "hmm"]:
            confidences = cluster(clus_type, 2, features)
            fpr, tpr, thresholds = sklearn.metrics.roc_curve(actuals, confidences, pos_label=1)
            roc_auc = sklearn.metrics.auc(fpr, tpr)
            if repetitions == 1:
                print clus_type, roc_auc
            else:
                averages[clus_type] = averages.get(clus_type, []) + [roc_auc]

    if repetitions > 1:
        for key in averages:
            print key, sum(averages[key])/float(max(1, len(averages[key])))
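
# A self-contained version (fake confidences, no clustering) of the AUC
# bookkeeping used above.
import sklearn.metrics
demo_actuals = [0, 0, 1, 1]
demo_confidences = [0.1, 0.4, 0.35, 0.8]
fpr, tpr, thresholds = sklearn.metrics.roc_curve(demo_actuals, demo_confidences, pos_label=1)
print sklearn.metrics.auc(fpr, tpr)  # 0.75
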
Example #13
def prec_recall_evaluate(reduced_docs, session, features, cluster_type, k, atom_type, corpus='intrinsic', feature_vector_weights=None, 
            metadata={}, cheating=False, cheating_min_len=5000, **clusterargs):
    '''
    Compute precision, recall, F-measure, granularity and overall score for
    <reduced_docs>, averaged per confidence threshold in [0, 0.95].
    '''
    thresholds = [.05 * i for i in range(20)]

    thresh_to_prec = {}
    thresh_to_recall = {}
    thresh_to_fmeasure = {}
    thresh_to_granularity = {}
    thresh_to_overall = {}

    # doc_to_thresh_to_result[i] = {thresh -> (prec, rec, fmeasure, granularity, overall)}
    doc_to_thresh_to_result = []
    
    count = 0
    valid_reduced_docs = []

    for i, d in enumerate(reduced_docs):
        doc_to_thresh_to_result.append({})
        count += 1

        print "On document", d, ". The", count, "th document."

        feature_vecs = d.get_feature_vectors(features, session, cheating=cheating, cheating_min_len=cheating_min_len)
        # skip if there are no feature_vectors
        if cheating and len(feature_vecs) < 7: # 7, because that's what Benno did
            continue
        valid_reduced_docs.append(d)

        if feature_vector_weights:
            # Scale each feature dimension by its weight. A zip-based
            # comprehension also avoids clobbering the outer loop index <i>,
            # which is used below to index doc_to_thresh_to_result.
            feature_vecs = [[v * w for v, w in zip(vec, feature_vector_weights)]
                            for vec in feature_vecs]

        # Grab the plagiarized spans
        spans = d.get_spans(cheating, cheating_min_len)
        actual_plag_spans = d.get_plag_spans()
        # Cluster to get plag probs
        plag_likelihoods = cluster(cluster_type, k, feature_vecs, **clusterargs)

        # Make sure we have a confidence level for every span
        assert len(spans) == len(plag_likelihoods)

        # thresh => detected_spans
        all_detected_spans = {}
        for thresh in thresholds:
            prec, rec, fmeasure, granularity, overall, plag_spans, detected_spans = \
                get_all_measures(actual_plag_spans, spans, plag_likelihoods, thresh, cheating=cheating, cheating_min_len=cheating_min_len)
            all_detected_spans[thresh] = detected_spans

            # If a measure wasn't well defined, None is returned. NOTE (nj), a
            # sneaky bug: the construct
            #   if prec:
            #       <add it to the dict>
            # would never add prec or recall when they're 0, so we explicitly
            # check for None-ness
            if prec is not None:
                thresh_to_prec.setdefault(thresh, []).append(prec)
            if rec is not None:
                thresh_to_recall.setdefault(thresh, []).append(rec)
            if fmeasure is not None:
                thresh_to_fmeasure.setdefault(thresh, []).append(fmeasure)
            if granularity is not None:
                thresh_to_granularity.setdefault(thresh, []).append(granularity)
            if overall is not None:
                thresh_to_overall.setdefault(thresh, []).append(overall)

            doc_to_thresh_to_result[i][thresh] = (prec, rec, fmeasure, granularity, overall)

        # Pass relevant data to plotting function
        doc_name = os.path.basename(d._short_name).replace('.txt', '')
        #visualize_overlaps(plag_spans, all_detected_spans, doc_name=doc_name)

    # For a given threshold, how many documents had valid precisions?
    print 'Valid precision:', sorted([(th, len(l)) for th, l in thresh_to_prec.iteritems()])
    # For a given threshold, how many documents had valid recall?
    print 'Valid recall (this number should not change):', sorted([(th, len(l)) for th, l in thresh_to_recall.iteritems()])

    thresh_prec_avgs = {t: sum(l) / len(l) for t, l in thresh_to_prec.iteritems()}
    thresh_recall_avgs = {t: sum(l) / len(l) for t, l in thresh_to_recall.iteritems()}
    thresh_fmeasure_avgs = {t: sum(l) / len(l) for t, l in thresh_to_fmeasure.iteritems()}
    thresh_granularity_avgs = {t: sum(l) / len(l) for t, l in thresh_to_granularity.iteritems()}
    thresh_overall_avgs = {t: sum(l) / len(l) for t, l in thresh_to_overall.iteritems()}

    if DEBUG:
        for thresh in sorted(thresh_prec_avgs.keys()):
            print thresh
            print 'Prec:', thresh_prec_avgs[thresh]
            print 'Recall:', thresh_recall_avgs[thresh]
            print 'F-Measure:', thresh_fmeasure_avgs[thresh]
            print 'Granularity:', thresh_granularity_avgs[thresh]
            print 'Overall:', thresh_overall_avgs[thresh]
            print '-'*40

    return thresh_prec_avgs, thresh_recall_avgs, thresh_fmeasure_avgs, thresh_granularity_avgs, thresh_overall_avgs
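
# Toy version (invented values) of the per-threshold bookkeeping above: collect
# well-defined measures per threshold with setdefault, then average them.
demo = {}
for thresh, prec in [(0.05, 0.5), (0.05, 0.7), (0.10, 0.9)]:
    demo.setdefault(thresh, []).append(prec)
print {t: sum(l) / len(l) for t, l in demo.iteritems()}
# e.g. {0.05: 0.6, 0.1: 0.9}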