Code example #1
def load_sentences_for_topic_model(opt):
    '''
    Loads the dataset specified in opt and returns a flat list of sentences
    for topic model training.
    :param opt: option dictionary defining which dataset and subsets to load
    :return: list of sentences
    '''
    # for s in opt['subsets']:
    #     assert ('train' in s)  # only use for training data
    # load dataset
    from src.loaders.load_data import load_data
    data_dict = load_data(opt, numerical=False)
    R1 = data_dict['T1']
    R2 = data_dict['T2']

    # select sentences from dataset (avoid duplication in Semeval)
    if opt['dataset'] == 'Semeval':
        assert opt['tasks'] == ['A', 'B', 'C']
        # combine data from all subtasks (A,B,C)
        Asent_1 = [sent for i, sent in enumerate(R1[0])
                   if i % 10 == 0]  # each question repeats 10x across pairs; keep one copy
        Asent_2 = R2[0]
        Bsent_1 = [sent for i, sent in enumerate(R1[1])
                   if i % 10 == 0]  # each question repeats 10x across pairs; keep one copy
        Bsent_2 = R2[1]
        # Csent_1 = [sent for i,sent in enumerate(dataset[12]) if i%100==0]  # same as Bsent_1
        Csent_2 = R2[2]
        sentences = Asent_1 + Asent_2 + Bsent_1 + Bsent_2 + Csent_2
        print(len(sentences))
    else:
        sentences = list(R1[0]) + list(R2[0])
    return sentences
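
A minimal usage sketch for this example; the option values below are illustrative assumptions (the keys mirror those used in the later examples), not settings confirmed by the project:

# hypothetical options; Semeval requires tasks ['A', 'B', 'C'] per the assertion above
opt = {
    'dataset': 'Semeval',
    'datapath': 'data/',
    'subsets': ['train'],
    'tasks': ['A', 'B', 'C'],
}
sentences = load_sentences_for_topic_model(opt)
print(len(sentences))  # flat, deduplicated sentence list for topic model training
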
Code example #2
import numpy as np
import matplotlib.pyplot as plt

from src.loaders.load_data import load_data


def calculate_word_overlap_ratio(opt):
    '''
    Loads the dataset defined by opt, calculates a word overlap ratio for each
    sentence pair, and plots a histogram of the ratios per subset.
    :param opt: option dictionary to load the dataset
    :return: nested list with overlap ratios, one inner list per subset
    '''
    data_dict = load_data(opt, numerical=True)
    E1 = data_dict['E1']
    E2 = data_dict['E2']
    t = opt['tasks'][0]
    d = opt['dataset']
    colors = ['g', 'b', 'r']  # one histogram color per subset
    subset_overlap = []
    for n, s in enumerate(opt['subsets']):
        identical_per_pair = []
        for i in range(len(E1[n])):
            # count non-padding tokens (assumes id 1 is the padding symbol)
            len_1 = (E1[n][i] != 1).sum()
            len_2 = (E2[n][i] != 1).sum()
            total_len = len_1 + len_2
            # ids occurring in both sequences, excluding the padding id
            identical_ids = np.intersect1d(E1[n][i], E2[n][i])
            identical_ids = identical_ids[identical_ids != 1]
            left_overlap = np.isin(E1[n][i], identical_ids).sum()
            right_overlap = np.isin(E2[n][i], identical_ids).sum()
            overlap_ratio = (left_overlap + right_overlap) / total_len
            identical_per_pair.append(overlap_ratio)
        plt.hist(identical_per_pair,
                 color=colors[n],
                 alpha=0.5,
                 bins=25,
                 label=s,
                 range=[0, 1])
        subset_overlap.append(identical_per_pair)
    plt.legend(loc='upper right')
    plt.ylabel('Number of document pairs')
    plt.xlabel('Lexical overlap ratio')
    plt.title('Lexical overlap in {} task {}'.format(d, t))
    plt.show()
    plt.close()
    return subset_overlap
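
To make the ratio concrete, here is the same computation on a single toy pair of index sequences, assuming (as the code above does) that id 1 is the padding symbol:

import numpy as np

e1 = np.array([5, 7, 9, 1, 1])   # 3 real tokens, padded with 1
e2 = np.array([5, 9, 4, 8, 1])   # 4 real tokens
len_1 = (e1 != 1).sum()          # 3
len_2 = (e2 != 1).sum()          # 4
shared = np.intersect1d(e1, e2)  # ids in both sequences: [1, 5, 9]
shared = shared[shared != 1]     # drop the padding id -> [5, 9]
overlap = np.isin(e1, shared).sum() + np.isin(e2, shared).sum()  # 2 + 2
print(overlap / (len_1 + len_2))  # 4/7 ≈ 0.571
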
Code example #3
File: evaluate.py Project: wuningxi/LexSim
import pandas as pd

from src.loaders.load_data import load_data


def read_original_data(opt, subset='dev'):
    '''
    Reads the original labelled file for the given subset from the data directory
    and extracts pair_id, gold_label and the two sentences.
    :param opt: option dictionary
    :param subset: one of ['train', 'dev', 'test']
    :return: pandas dataframe
    '''
    # adjust filenames in case of increased training data
    if 'train_large' in opt['subsets']:
        print('adjusting names')
        if subset == 'dev':
            subset = 'test2016'
        elif subset == 'test':
            subset = 'test2017'
    # adjust loading options:
    opt['subsets'] = [subset]  # only the requested subset
    opt['load_ids'] = True  # load pair ids and labels
    data_dict = load_data(opt, numerical=False)
    ID1 = data_dict['ID1'][0] # unlist, as we are only dealing with one subset
    ID2 = data_dict['ID2'][0]
    R1 = data_dict['R1'][0]
    R2 = data_dict['R2'][0]
    L = data_dict['L'][0]
    # extract pair_id, gold_label and sentences
    labeled_data = []
    for i in range(len(L)):
        pair_id = ID1[i] + '-' + ID2[i]
        gold_label = L[i]
        s1 = R1[i]
        s2 = R2[i]
        labeled_data.append([pair_id, gold_label, s1, s2])
    # turn into pandas dataframe
    cols = ['pair_id', 'gold_label', 's1', 's2']
    label_df = pd.DataFrame.from_records(labeled_data, columns=cols)
    return label_df
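
A usage sketch with assumed option values; note that read_original_data overrides 'subsets' and 'load_ids' itself:

opt = {
    'dataset': 'Semeval',  # hypothetical choice
    'datapath': 'data/',
    'subsets': ['train'],
    'tasks': ['B'],
}
dev_df = read_original_data(opt, subset='dev')
print(dev_df.columns.tolist())  # ['pair_id', 'gold_label', 's1', 's2']
print(dev_df.head())
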
Code example #4
def get_metric(self, distance_metric, dataset, task='', subset=None):
    '''
    Load cached metric scores if they exist, otherwise calculate them.
    :param distance_metric: metric name, must be in self.get_accepted_metrics()
    :param dataset: dataset name, e.g. 'Semeval', 'Quora' or 'MSRP'
    :param task: subtask identifier (required for Semeval)
    :param subset: optional subset name; if None, scores for all subsets are returned
    :return: nested list with distance/similarity scores depending on metric, with
    outer length of subsets and inner length of example numbers
    '''
    assert distance_metric in self.get_accepted_metrics()
    subsets = self.get_subsets(dataset)  # always load all 3 subsets
    opt = {
        'dataset': dataset,
        'datapath': 'data/',
        'subsets': subsets,
        'tasks': [task],
        'n_gram_embd': False,
        'cache': True
    }
    if dataset == 'Semeval':
        dataset = '{}_{}'.format(dataset, task)  # cache Semeval scores per subtask
    if distance_metric == 'jaccard':
        if dataset not in self.jaccard:
            if dataset not in self.sentence1:
                data_dict = load_data(opt, numerical=False)
                R1 = data_dict['R1']
                R2 = data_dict['R2']
                pair_ids = []
                for s, _ in enumerate(subsets):
                    pair_ids.append([
                        i1 + '-' + i2 for i1, i2 in zip(
                            data_dict['ID1'][s], data_dict['ID2'][s])
                    ])
                L = data_dict['L']
                self.sentence1[dataset] = R1
                self.sentence2[dataset] = R2
                self.pair_ids[dataset] = pair_ids
                self.labels[dataset] = L
            else:
                R1 = self.sentence1[dataset]
                R2 = self.sentence2[dataset]
            self.jaccard[dataset] = calculate_jaccard_index(R1, R2)
        overlapping = self.jaccard[dataset]
    elif distance_metric == 'dice':
        if dataset not in self.dice:
            if dataset not in self.sentence1:
                data_dict = load_data(opt, numerical=False)
                R1 = data_dict['R1']
                R2 = data_dict['R2']
                pair_ids = []
                for s, _ in enumerate(subsets):
                    pair_ids.append([
                        i1 + '-' + i2 for i1, i2 in zip(
                            data_dict['ID1'][s], data_dict['ID2'][s])
                    ])
                L = data_dict['L']
                self.sentence1[dataset] = R1
                self.sentence2[dataset] = R2
                self.pair_ids[dataset] = pair_ids
                self.labels[dataset] = L
            else:
                R1 = self.sentence1[dataset]
                R2 = self.sentence2[dataset]
            self.dice[dataset] = calculate_dice_sim(R1, R2)
        overlapping = self.dice[dataset]
    elif distance_metric == 'js-div':
        if dataset not in self.js_div:
            if dataset not in self.sentence1:
                data_dict = load_data(opt, numerical=False)
                # js-div compares topic distributions (T1/T2) rather than raw
                # sentences; note they share the sentence1/sentence2 cache slots
                R1 = data_dict['T1']
                R2 = data_dict['T2']
                pair_ids = []
                for s, _ in enumerate(subsets):
                    pair_ids.append([
                        i1 + '-' + i2 for i1, i2 in zip(
                            data_dict['ID1'][s], data_dict['ID2'][s])
                    ])
                L = data_dict['L']
                self.sentence1[dataset] = R1
                self.sentence2[dataset] = R2
                self.pair_ids[dataset] = pair_ids
                self.labels[dataset] = L
            else:
                R1 = self.sentence1[dataset]
                R2 = self.sentence2[dataset]
            self.js_div[dataset] = calculate_js_div(R1, R2)
        overlapping = self.js_div[dataset]
    if subset is None:
        return overlapping
    elif subset in ['train', 'p_train', 'train_large']:
        return overlapping[0]
    elif subset in ['dev', 'test2016']:
        return overlapping[1]
    elif subset in ['test', 'p_test', 'test2017']:
        return overlapping[-1]
    else:
        raise ValueError('{} not accepted value for subset'.format(subset))
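
calculate_jaccard_index, calculate_dice_sim and calculate_js_div are project helpers not shown in this listing. As a rough illustration only, a Jaccard computation over whitespace-tokenised sentence pairs could look like the sketch below; this is an assumption about the helper's behaviour, not its actual implementation:

def jaccard_index_sketch(R1, R2):
    # R1, R2: nested lists of sentences, outer index = subset
    scores = []
    for r1_subset, r2_subset in zip(R1, R2):
        subset_scores = []
        for s1, s2 in zip(r1_subset, r2_subset):
            t1, t2 = set(s1.split()), set(s2.split())
            union = t1 | t2
            subset_scores.append(len(t1 & t2) / len(union) if union else 0.0)
        scores.append(subset_scores)
    return scores
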
Code example #5
            examples = [
                441,   # Po
                89,    # Pn
                5874,  # No
                396    # Nn
            ]
        elif opt['dataset'] == 'Semeval':
            examples = [441, 89, 396]
        elif opt['dataset'] == 'MSRP':
            examples = [256]  # No
        else:
            raise ValueError('Example ids for {} not defined'.format(opt['dataset']))
        opt['max_m'] = max(examples) + 1

        #  prepare input to be passed through network
        data_dict = load_data(opt)
        print(data_dict['embd'].shape)
        subset = 1  # 0 for train, 1 for dev, 2 for test
        ID1 = data_dict['ID1'][subset]
        ID2 = data_dict['ID2'][subset]
        R1 = data_dict['R1'][subset]
        R2 = data_dict['R2'][subset]
        E1 = data_dict['E1'][subset]
        E2 = data_dict['E2'][subset]
        W_T1 = data_dict['W_T1']
        W_T2 = data_dict['W_T2']
        D_T1 = data_dict['D_T1']
        D_T2 = data_dict['D_T2']
        T1 = data_dict['T1'][subset]
        T2 = data_dict['T2'][subset]
        L = data_dict['L'][subset]
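
Continuing the fragment, the hand-picked examples could be inspected with the variables just unpacked (a sketch; it assumes the ids index into the chosen dev subset):

        for idx in examples:
            print('pair:', ID1[idx] + '-' + ID2[idx])
            print('s1:', R1[idx])
            print('s2:', R2[idx])
            print('label:', L[idx])
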
Code example #6
File: topic_baseline.py Project: zhongyunuestc/tBERT
            # 'subsets': ['train_large', 'test2016', 'test2017'],
            'topic': 'word',
            'topic_type': 'ldamallet',
            'padding': False,
            'simple_padding': True,
            'max_length': 'minimum',
            'unk_sub': False,
            'lemmatize': False,
            'datapath': 'data/',
            'dataset': 'Quora',
            'tasks': ['B'],
            'topic_alpha': 1,
            'num_topics': 90,
            'threshold': 0.090,
            'unk_topic': unk_topic,
            'stem': stem
        }

        data_dict = load_data(opt, cache=True, write_vocab=False)
        opt = model(data_dict,
                    opt,
                    logfile='specific_topic_settings.json',
                    print_dim=True)

    # todo: zeros - no stem
    # todo: zeros - stem
    # todo: uniform - stem

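
The todo notes above list the unk_topic/stem combinations still to be run; a hypothetical sweep reusing the fragment's settings could look like this (the loop and the dict rebuild are assumptions, not project code):

for unk_topic in ['zeros', 'uniform']:
    for stem in [True, False]:
        run_opt = dict(opt, unk_topic=unk_topic, stem=stem)  # copy opt with overrides
        data_dict = load_data(run_opt, cache=True, write_vocab=False)
        model(data_dict, run_opt,
              logfile='specific_topic_settings.json',
              print_dim=True)
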