Example #1
def readDatasets(datasets, samples=9999):
    ### read documents and ref. summaries
    summaries = []
    targets = []
    groups = []
    sentences_of_topics = {}
    sorted_idxs_list = []
    models_list = []
    docs_list = []
    topic_list = []

    for dataset in datasets:
        reader = CorpusReader(PROCESSED_PATH)
        data = reader.get_data(dataset)
        sample_num = samples

        ml = []
        dl = []
        tl = []

        ### read data
        for topic, docs, models in data:

            print('read DATA {}, TOPIC {}'.format(dataset, topic))
            summs, ref_values_dic = readSummaries(dataset, topic,
                                                  'ground_truth', sample_num)
            ref_rewards = aggregateScores(ref_values_dic)
            summs = summs[:len(ref_rewards)]
            print('num of summaries read: {}'.format(len(summs)))

            sentences = [
                sent2tokens(sentence, 'english') for _, doc in docs
                for sentence in doc
            ]
            sentences_of_topics[topic] = sentences
            ml.append(models)
            dl.append(docs)
            tl.append(topic)

            sorted_idxs = np.argsort(np.array(ref_rewards))
            sorted_idxs_list.extend(sorted_idxs)

            summaries.extend(summs)
            targets.extend(ref_rewards)
            groups.extend(['{}-{}'.format(dataset, topic)] * len(summs))

        models_list.append(ml)
        docs_list.append(dl)
        topic_list.append(tl)

    summaries = np.array(summaries)
    targets = np.array(targets)
    groups = np.array(groups)

    return summaries, targets, groups, models_list, docs_list, topic_list, sentences_of_topics, sorted_idxs_list
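A minimal usage sketch (hypothetical; assumes PROCESSED_PATH and the project's helper functions are importable as in the snippet above):

# Hypothetical call: read two DUC corpora and report how much data came back.
summaries, targets, groups, models_list, docs_list, topic_list, \
    sentences_of_topics, sorted_idxs_list = readDatasets(['DUC2001', 'DUC2002'])
print('{} summaries across {} dataset-topic groups'.format(
    len(summaries), len(set(groups))))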
Example #2
def build_duc_vocabulary():
    datasets = ['DUC2001', 'DUC2002', 'DUC2004']
    sample_num = 9999
    cv_fold_num = 10
    validation_size = 0.1

    ### read documents and ref. summaries
    reader = CorpusReader(PROCESSED_PATH)
    vocabulary = set()
    for dataset in datasets:
        data = reader.get_data(dataset)
        ### read data
        for topic, docs, models in data:
            print('read DATA {}, TOPIC {}'.format(dataset, topic))
            summs, ref_values_dic = readSummaries(dataset, topic, 'rouge',
                                                  sample_num)
            sentences = [
                sent2tokens(sentence, 'english') for _, doc in docs
                for sentence in doc
            ]
            vocabulary.update(
                [token for sentence in sentences for token in sentence])
    return vocabulary
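A short usage sketch (hypothetical; assumes the processed DUC corpora are available under PROCESSED_PATH):

# Hypothetical call: build the token vocabulary over DUC2001/2002/2004.
vocabulary = build_duc_vocabulary()
print('vocabulary size: {}'.format(len(vocabulary)))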
Example #3
def writeNER(dataset):
    summary_len = 100
    reader = CorpusReader(PROCESSED_PATH)
    data = reader.get_data(dataset, summary_len)
    nlp = StanfordCoreNLP(CORENLP_PATH)
    base_path = os.path.join(PROCESSED_PATH, dataset)

    topic_cnt = 0

    for topic, docs, models in data:
        topic_cnt += 1
        topic_path = os.path.join(base_path, topic, 'docs.ner')
        if not os.path.exists(topic_path):
            os.mkdir(topic_path)
        for dd in docs:
            dname = dd[0].split('/')[-1].strip()
            print('{} topic {}, doc {}'.format(topic_cnt, topic, dname))
            output = ''
            for sen in dd[1]:
                ner = nlp.ner(sen)
                output += repr(ner) + '\n'
            write_to_file(output, os.path.join(topic_path, dname))

    nlp.close()
Example #4
                    #print('sentences: {}\n{}'.format(self.sentences[sum_idx[i]], self.sentences[sum_idx[j]]))
                    #print('vec1 : {}\n vec2: {}'.format(self.sent_vecs[sum_idx[i]], self.sent_vecs[sum_idx[j]]))
                    red_scores[-1] = 0.

        return np.mean(red_scores)


if __name__ == '__main__':
    dataset = 'DUC2002'  ## DUC2001, DUC2002, DUC2004
    sample_num = 9999
    out_base = os.path.join(FEATURE_DIR, dataset)
    if not os.path.exists(out_base):
        os.makedirs(out_base)

    ### read documents and ref. summaries
    reader = CorpusReader(PROCESSED_PATH)
    data = reader.get_data(dataset)

    ### store all results
    all_test_reward_dic = OrderedDict()
    topic_cnt = 0

    summaries = []
    targets = []
    groups = []
    models_list = []
    docs_list = []

    ### read data
    infersent = InfersentRewardGenerator()
    for topic, docs, models in data:
Example #5
    if len(sys.argv) == 4:
        dataset = sys.argv[1]
        start = int(sys.argv[2])
        end = int(sys.argv[3])
    else:
        dataset = 'DUC2001'  #DUC2001, DUC2002, DUC2004
        start = 0
        end = 10000

    language = 'english'
    summary_len = 100

    summary_num = 10001
    base_dir = os.path.join(SUMMARY_DB_DIR, dataset)

    reader = CorpusReader(PROCESSED_PATH)
    data = reader.get_data(dataset, summary_len)

    topic_cnt = 0

    for topic, docs, models in data:
        topic_cnt += 1
        if not (topic_cnt > start and topic_cnt <= end):
            continue

        dir_path = os.path.join(base_dir, topic)
        if not os.path.exists(dir_path):
            os.makedirs(dir_path)
        vec = Vectoriser(docs, summary_len)
        print('-----Generate samples for topic {}: {}-----'.format(
            topic_cnt, topic))
Example #6
def correlation(features, sizes):
    names = []
    for f, s in zip(features, sizes):
        if s > 1:
            for i in range(s):
                names.append(f + str(i))
        else:
            names.append(f)
    names.append('rouge_reward')

    dataset = 'DUC2001'  ## DUC2001, DUC2002, DUC2004
    sample_num = 9999
    bin_num = 20
    cv_fold_num = 10

    ### read documents and ref. summaries
    reader = CorpusReader(PROCESSED_PATH)
    data = reader.get_data(dataset)

    topic_cnt = 0

    summaries = []
    groups = []
    models_list = []
    docs_list = []
    targets = []

    ### read data
    for topic, docs, models in data:

        print('read DATA {}, TOPIC {}'.format(dataset, topic))
        summs, ref_values_dic = readSummaries(dataset, topic, 'rouge',
                                              sample_num)
        print('num of summaries read: {}'.format(len(summs)))
        ref_rewards = aggregateScores(ref_values_dic)
        models_list.append(models)
        docs_list.append(docs)
        summaries.extend(summs)
        groups.extend([topic] * len(summs))
        targets.extend(ref_rewards)
        topic_cnt += 1

    allFeatures = readFeatures(features, dataset, np.array(summaries), groups,
                               set(groups))
    allFeatures = np.c_[allFeatures, np.array(targets)]
    correlations = {}
    threshold_correlation = {}
    for col1, col2 in itertools.combinations(range(len(names)), 2):
        pcc = pearsonr(allFeatures[:, col1], allFeatures[:, col2])[0]
        correlations[names[col1] + ' ' + names[col2] + ': pcc = '] = pcc
        # other way for ease of reading
        correlations[names[col2] + ' ' + names[col1] + ': pcc = '] = pcc
        if pcc < -0.8:
            threshold_correlation[names[col1] + ' ' + names[col2] +
                                  ': pcc = '] = pcc
            threshold_correlation[names[col2] + ' ' + names[col1] +
                                  ': pcc = '] = pcc
    #for key in sorted(correlations.keys()):
    #    print(key+str(correlations[key]))
    print("Pairs with pcc >.9")
    for key in sorted(threshold_correlation.keys()):
        print(key + str(threshold_correlation[key]))
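A usage sketch for the function above; the feature names and per-feature sizes are placeholders and must match whatever readFeatures expects in this project:

# Hypothetical call: correlate two single-column features (placeholder names)
# against the aggregated ROUGE reward on DUC2001.
correlation(['js_divergence', 'tfidf_cosine'], [1, 1])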