Example #1
def readDatasets(datasets, samples=9999):
    ### read documents and ref. summaries
    summaries = []
    targets = []
    groups = []
    sentences_of_topics = {}
    sorted_idxs_list = []
    models_list = []
    docs_list = []
    topic_list = []

    for dataset in datasets:
        reader = CorpusReader(PROCESSED_PATH)
        data = reader.get_data(dataset)
        sample_num = samples

        ml = []
        dl = []
        tl = []

        ### read data
        for topic, docs, models in data:

            print('read DATA {}, TOPIC {}'.format(dataset, topic))
            summs, ref_values_dic = readSummaries(dataset, topic,
                                                  'ground_truth', sample_num)
            ref_rewards = aggregateScores(ref_values_dic)
            summs = summs[:len(ref_rewards)]
            print('num of summaries read: {}'.format(len(summs)))

            sentences = [
                sent2tokens(sentence, 'english') for _, doc in docs
                for sentence in doc
            ]
            sentences_of_topics[topic] = sentences
            ml.append(models)
            dl.append(docs)
            tl.append(topic)

            sorted_idxs = np.argsort(np.array(ref_rewards))
            sorted_idxs_list.extend(sorted_idxs)

            summaries.extend(summs)
            targets.extend(ref_rewards)
            groups.extend(['{}-{}'.format(dataset, topic)] * len(summs))

        models_list.append(ml)
        docs_list.append(dl)
        topic_list.append(tl)

    summaries = np.array(summaries)
    targets = np.array(targets)
    groups = np.array(groups)

    return summaries, targets, groups, models_list, docs_list, topic_list, sentences_of_topics, sorted_idxs_list
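
A minimal usage sketch for readDatasets above. It assumes the surrounding project supplies CorpusReader, PROCESSED_PATH, readSummaries, aggregateScores and sent2tokens (as in the snippet itself); the dataset names mirror Example #2, and numpy is imported only to inspect the returned arrays.

import numpy as np

datasets = ['DUC2001', 'DUC2002']

# readDatasets returns flat arrays of sampled summaries, their aggregated
# reference rewards and a 'dataset-topic' group label per summary, plus
# per-dataset lists of models, documents and topics.
(summaries, targets, groups, models_list, docs_list,
 topic_list, sentences_of_topics, sorted_idxs_list) = readDatasets(datasets)

print(summaries.shape, targets.shape)
print(np.unique(groups)[:5])
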
Example #2
def build_duc_vocabulary():
    datasets = ['DUC2001', 'DUC2002', 'DUC2004']
    sample_num = 9999
    cv_fold_num = 10
    validation_size = 0.1

    ### read documents and ref. summaries
    reader = CorpusReader(PROCESSED_PATH)
    vocabulary = set()
    for dataset in datasets:
        data = reader.get_data(dataset)
        ### read data
        for topic, docs, models in data:
            print('read DATA {}, TOPIC {}'.format(dataset, topic))
            summs, ref_values_dic = readSummaries(dataset, topic, 'rouge',
                                                  sample_num)
            sentences = [
                sent2tokens(sentence, 'english') for _, doc in docs
                for sentence in doc
            ]
            vocabulary.update(
                [token for sentence in sentences for token in sentence])
    return vocabulary
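
A short usage sketch for build_duc_vocabulary: the function returns a plain set of tokens, and mapping it to integer indices below is only an illustration of how a caller might consume it, not part of the original code.

vocabulary = build_duc_vocabulary()
print('vocabulary size:', len(vocabulary))

# Illustrative only: turn the token set into a word-to-index mapping,
# reserving index 0 (e.g. for padding).
word2idx = {word: idx + 1 for idx, word in enumerate(sorted(vocabulary))}
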
Example #3
    all_test_reward_dic = OrderedDict()
    topic_cnt = 0

    summaries = []
    targets = []
    groups = []
    models_list = []
    docs_list = []

    ### read data
    infersent = InfersentRewardGenerator()
    for topic, docs, models in data:
        if topic != 'd112h': continue

        print('read DATA {}, TOPIC {}'.format(dataset, topic))
        summs, ref_values_dic = readSummaries(dataset, topic, 'rouge',
                                              sample_num)
        values = infersent(summs, docs)
        assert len(values) == len(summs)

        out_str = ''
        for ii, vv in enumerate(values):
            out_str += '{}\t{}\n'.format(summs[ii], vv)

        if not os.path.exists(os.path.join(out_base, topic)):
            os.makedirs(os.path.join(out_base, topic))

        fpath = os.path.join(out_base, topic, 'infersent_heuristic')
        with open(fpath, 'w') as ff:
            ff.write(out_str)
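
The loop above writes one 'summary<TAB>reward' line per sampled summary into out_base/topic/infersent_heuristic. A small reader for files in that layout might look as follows; the helper name and the layout assumption are illustrative, not part of the project.

import os

def read_reward_file(out_base, topic, name='infersent_heuristic'):
    # Each non-empty line holds a summary string and its reward value,
    # separated by the last tab, mirroring the writer loop above.
    rewards = {}
    with open(os.path.join(out_base, topic, name)) as fh:
        for line in fh:
            if not line.strip():
                continue
            summ, value = line.rstrip('\n').rsplit('\t', 1)
            rewards[summ] = float(value)
    return rewards
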
Example #4
                cos_list.append(
                    cosine_similarity(ntf.reshape(1, -1),
                                      tf_idf_matrix[jj, :]))
            summary_features[ii].append(np.min(cos_list))
            summary_features[ii].append(np.max(cos_list))
            summary_features[ii].append(np.mean(cos_list))
            summary_features[ii].append(np.std(cos_list))

        return np.array(summary_features)


if __name__ == '__main__':
    dataset = 'DUC2001'
    summary_len = 100

    reader = CorpusReader(PROCESSED_PATH)
    data = reader.get_data(dataset, summary_len)

    topic_cnt = 0

    for topic, docs, models in data:
        topic_cnt += 1
        summaries, heuristic_values_list = readSummaries(
            dataset, topic, 'heuristic')
        print('num of summaries read: {}'.format(len(summaries)))

        vec = CrossTopicNgramVectoriser(docs)
        features = vec(summaries)

        print('features.shape {}'.format(features.shape))
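
The indented fragment at the start of Example #4 turns the cosine similarities between a summary's term vector and each row of a document tf-idf matrix into min/max/mean/std features. Below is a self-contained sketch of that idea using scikit-learn; the function name and the way texts are passed in are assumptions, not the project's CrossTopicNgramVectoriser.

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def cosine_stat_features(summary_texts, doc_texts):
    # Fit tf-idf on the documents, then describe each summary by the
    # min/max/mean/std of its cosine similarity to every document row.
    vec = TfidfVectorizer()
    tf_idf_matrix = vec.fit_transform(doc_texts)
    feats = []
    for summ in summary_texts:
        cos = cosine_similarity(vec.transform([summ]), tf_idf_matrix).ravel()
        feats.append([cos.min(), cos.max(), cos.mean(), cos.std()])
    return np.array(feats)
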
Example #5
def correlation(features, sizes):
    names = []
    for f, s in zip(features, sizes):
        if s > 1:
            for i in range(s):
                names.append(f + str(i))
        else:
            names.append(f)
    names.append('rouge_reward')

    dataset = 'DUC2001'  ## DUC2001, DUC2002, DUC2004
    sample_num = 9999
    bin_num = 20
    cv_fold_num = 10

    ### read documents and ref. summaries
    reader = CorpusReader(PROCESSED_PATH)
    data = reader.get_data(dataset)

    topic_cnt = 0

    summaries = []
    groups = []
    models_list = []
    docs_list = []
    targets = []

    ### read data
    for topic, docs, models in data:

        print('read DATA {}, TOPIC {}'.format(dataset, topic))
        summs, ref_values_dic = readSummaries(dataset, topic, 'rouge',
                                              sample_num)
        print('num of summaries read: {}'.format(len(summs)))
        ref_rewards = aggregateScores(ref_values_dic)
        models_list.append(models)
        docs_list.append(docs)
        summaries.extend(summs)
        groups.extend([topic] * len(summs))
        targets.extend(ref_rewards)
        topic_cnt += 1

    allFeatures = readFeatures(features, dataset, np.array(summaries), groups,
                               set(groups))
    allFeatures = np.c_[allFeatures, np.array(targets)]
    correlations = {}
    threshold_correlation = {}
    for col1, col2 in itertools.combinations(range(len(names)), 2):
        pcc = pearsonr(allFeatures[:, col1], allFeatures[:, col2])[0]
        correlations[names[col1] + ' ' + names[col2] + ': pcc = '] = pcc
        # other way for ease of reading
        correlations[names[col2] + ' ' + names[col1] + ': pcc = '] = pcc
        if pcc < -0.8:
            threshold_correlation[names[col1] + ' ' + names[col2] +
                                  ': pcc = '] = pcc
            threshold_correlation[names[col2] + ' ' + names[col1] +
                                  ': pcc = '] = pcc
    #for key in sorted(correlations.keys()):
    #    print(key+str(correlations[key]))
    print("Pairs with pcc >.9")
    for key in sorted(threshold_correlation.keys()):
        print(key + str(threshold_correlation[key]))
    dataset = 'DUC2001'  ## DUC2001, DUC2002, DUC2004
    sample_num = 9999
    out_base = os.path.join(FEATURE_DIR, dataset)
    if not os.path.exists(out_base):
        os.makedirs(out_base)

    ### read documents and ref. summaries
    reader = CorpusReader(PROCESSED_PATH)
    data = reader.get_data(dataset)

    ### read data
    rewarder = WordEmbeddingRewarder()
    for topic, docs, models in data:

        print('read DATA {}, TOPIC {}'.format(dataset, topic))
        summs, _ = readSummaries(dataset, topic, 'rouge', sample_num)
        values = rewarder(docs, summs)
        assert len(values) == len(summs)

        out_str = ''
        for ii, vv in enumerate(values):
            out_str += '{}\t{}\n'.format(summs[ii], vv)

        if not os.path.exists(os.path.join(out_base, topic)):
            os.makedirs(os.path.join(out_base, topic))

        fpath = os.path.join(out_base, topic, 'dss')
        with open(fpath, 'w') as ff:
            ff.write(out_str)
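
Example #5 flags feature pairs whose Pearson correlation passes a threshold. A compact, self-contained version of that pairwise check is sketched below; the 0.9 cut-off on the absolute correlation is an illustrative choice, not the exact threshold used above.

import itertools
from scipy.stats import pearsonr

def highly_correlated_pairs(matrix, names, threshold=0.9):
    # matrix is a 2-D numpy array with one column per named feature.
    # Return feature-name pairs whose absolute Pearson correlation exceeds
    # the threshold, mirroring the pairwise loop in Example #5.
    pairs = {}
    for c1, c2 in itertools.combinations(range(matrix.shape[1]), 2):
        pcc = pearsonr(matrix[:, c1], matrix[:, c2])[0]
        if abs(pcc) > threshold:
            pairs[(names[c1], names[c2])] = pcc
    return pairs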