Code example #1
    def load_dict(self, dict_name, dict_file_path, sep='\t'):
        """
        manage the dict from list or dict
        list index start from 1
        """
        if dict_name not in self.dict_manager:

            dict_object = {}
            ''' load dict from file '''
            print('load dict from file {}'.format(dict_file_path))

            f_dict = utils.create_read_file(dict_file_path)

            for idx, line in enumerate(f_dict):
                line = line.strip()
                # skip section headers like "[name]"; strip first, since the
                # raw line still ends with a newline
                if line.startswith('[') and line.endswith(']'):
                    continue
                line = line.split(sep)
                if len(line) == 1:
                    dict_object[line[0]] = idx + 1
                elif len(line) == 2:
                    # ast.literal_eval, not eval: safe, and the value may be
                    # a quoted str rather than a number
                    dict_object[line[0]] = ast.literal_eval(line[1])
                else:
                    raise NotImplementedError

            self.dict_manager[dict_name] = dict_object

        return self.dict_manager[dict_name]
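A minimal standalone sketch of the file format load_dict expects: one key per line, optionally followed by a tab-separated value (the sample content below is hypothetical):

import ast

sample = "alpha\nbeta\t2.5\ngamma\t'text'"  # hypothetical dict file content
dict_object = {}
for idx, line in enumerate(sample.splitlines()):
    fields = line.strip().split('\t')
    if len(fields) == 1:
        dict_object[fields[0]] = idx + 1  # value defaults to the 1-based line index
    else:
        dict_object[fields[0]] = ast.literal_eval(fields[1])
print(dict_object)  # {'alpha': 1, 'beta': 2.5, 'gamma': 'text'}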
Code example #2
    def train(self, train_instances, train_file, out_list=None):
        """
        out_list is used to sub train_instances from all train_instances
        if only happen when all faetures have been made.
        """
        ''' 1. Extract Features '''
        self.make_feature_file(train_instances, train_file)

        if out_list:
            lines = utils.create_read_file(self.train_feature_file).readlines()
            lines = [
                lines[idx].strip() for idx in range(len(lines))
                if idx not in out_list
            ]
            f_train = utils.create_write_file(self.train_feature_file)
            print('\n'.join(lines), file=f_train)
            f_train.close()
            print('finish filter, train examples %d' % len(lines))
        ''' 2. Train Classifier '''
        self.classifier.train_model(self.train_feature_file, self.model_file)
        ''' 3. Predict Answers '''
        self.output_file = self.get_output_file(train_file)
        predict_label = self.classifier.test_model(self.train_feature_file,
                                                   self.model_file,
                                                   self.output_file)

        f_out = utils.create_write_file(self.output_file)
        for label, train_instance in zip(predict_label, train_instances):
            print('%.2f\t#\t%s' %
                  (label, train_instance.get_instance_string()),
                  file=f_out)

        return self.classifier
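The out_list filtering step boils down to dropping feature lines by position. A self-contained sketch with hypothetical data:

lines = ['feat0', 'feat1', 'feat2', 'feat3']
out_list = {1, 3}  # indices to drop
kept = [line for idx, line in enumerate(lines) if idx not in out_list]
print(kept)  # ['feat0', 'feat2']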
Code example #3
 def extract_information(self, train_instances):
     if self.is_training:
         sents = []
         for train_instance in train_instances:
             warrant0, warrant1, reason, claim, title, info = train_instance.get_six(
                 type='word')
             sents.append(warrant0)
             sents.append(warrant1)
             sents.append(reason)
             sents.append(claim)
         idf_dict = utils.idf_calculator(sents)
         # idf_dict = sorted(idf_dict.items(), key=lambda x: x[1], reverse=True)
         with utils.create_write_file(config.RESOURCE_DIR +
                                      '/idf_dict.txt') as fw:
             for key in idf_dict:
                 print('{}\t{}'.format(key, idf_dict[key]), file=fw)
         print(len(idf_dict))
     else:
         with utils.create_read_file(config.RESOURCE_DIR +
                                     '/idf_dict.txt') as fr:
             idf_dict = {}
             for line in fr:
                 line = line.strip().split('\t')
                 idf_dict[line[0]] = float(line[1])
     self.unigram_dict = idf_dict
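utils.idf_calculator is not shown here; a plausible reading, consistent with how the weights are used downstream, is the conventional idf(w) = log(N / df(w)) over document frequencies. A hypothetical sketch:

import math

sents = [['a', 'man', 'runs'], ['a', 'dog', 'runs'], ['a', 'cat']]  # one token list per document
df = {}
for sent in sents:
    for word in set(sent):
        df[word] = df.get(word, 0) + 1
idf_dict = {w: math.log(len(sents) / d) for w, d in df.items()}
print(idf_dict['a'], idf_dict['cat'])  # 0.0 and log(3) ~= 1.0986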
Code example #4
    def load_dict(self, dict_name, path=config.DICT_DIR):
        """
        path: config.DICT_DIR
              config.DICT_EX_DIR
        """
        if dict_name not in self.dict_manager:

            dict_object = {}

            # NOTE: overrides the ``path`` argument with the package-local
            # resources directory
            cur_dir = os.path.dirname(__file__)
            path = os.path.join(cur_dir, '../resources')
            ''' load dict from file '''
            file_name = path + '/dict_%s.txt' % dict_name
            print('load dict from file %s \n' % file_name)

            f_dict = utils.create_read_file(file_name)

            for idx, line in enumerate(f_dict):
                line = line.strip().split('\t')
                if len(line) == 1:
                    dict_object[line[0]] = idx + 1
                elif len(line) == 2:
                    # ast.literal_eval rather than eval, as in load_dict above
                    dict_object[line[0]] = ast.literal_eval(line[1])
                else:
                    raise NotImplementedError

            self.dict_manager[dict_name] = dict_object

        return self.dict_manager[dict_name]
Code example #5
def load_STS(train_file):
    with utils.create_read_file(train_file) as f:
        data = []
        for line in f:
            line = line.strip().split('\t')
            score = float(line[4])
            sa, sb = line[5], line[6]
            data.append((sa, sb, score))
    return data
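For reference, one line in the STS format this loader assumes: tab-separated, gold score in field 4, the sentence pair in fields 5 and 6 (the metadata fields below are hypothetical):

line = "genre\tfile\tyear\tid\t4.60\tA man is running.\tA person runs."
fields = line.strip().split('\t')
score, sa, sb = float(fields[4]), fields[5], fields[6]
print(score, sa, sb)  # 4.6 A man is running. A person runs.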
Code example #6
File: model.py Project: rgtjf/classification_task
    def load_model_score(self, train_file):
        self.output_file = self.get_output_file(train_file)

        y_pred = utils.create_read_file(self.output_file).readlines()
        y_pred = [
            '1:{}'.format(DICT_LABEL_TO_INDEX[x.strip().split("\t#\t")[0]])
            for x in y_pred
        ]
        return y_pred
Code example #7
    def extract_instances(self, train_instances):
        """ extract features from train instances """
        self.extract_information(train_instances)

        features = []
        infos = []
        process_bar = pyprind.ProgPercent(len(train_instances))

        ''' get features from train instances'''

        alignment_feature_file = self.feature_file.replace('IdfAlignmentFeature', 'AlignmentFeature')
        alignment_features = utils.create_read_file(alignment_feature_file).readlines()

        idf_weight = self.idf_weight
        default_idf_weight = min(idf_weight.values())

        for train_instance, alignment_feature in zip(train_instances, alignment_features[1:]):
            process_bar.update()

            alignment_feature = alignment_feature.split('\t#\t')[1]
            myWordAlignments = json.loads(alignment_feature)[0]  # list of [sa_idx, sb_idx]; indices start from 1

            word_sa, word_sb = train_instance.get_word(type='lemma', lower=True)

            sa_aligned = [sa_idx - 1 for sa_idx, sb_idx in myWordAlignments]
            sb_aligned = [sb_idx - 1 for sa_idx, sb_idx in myWordAlignments]

            sent1_aligned = [0] * len(word_sa)
            sent2_aligned = [0] * len(word_sb)

            for sa_index in sa_aligned:
                sent1_aligned[sa_index] = 1

            for sb_index in sb_aligned:
                sent2_aligned[sb_index] = 1

            # calc all and aligned except stopwords
            sent1_sum = 0
            sent2_sum = 0
            sent1_ali = 0
            sent2_ali = 0
            for idx, word in enumerate(word_sa):
                weight = idf_weight.get(word, default_idf_weight)
                sent1_ali += sent1_aligned[idx] * weight
                sent1_sum += weight

            for idx, word in enumerate(word_sb):
                weight = idf_weight.get(word, default_idf_weight)
                sent2_ali += sent2_aligned[idx] * weight
                sent2_sum += weight
            feature = [1.0 * (sent1_ali + sent2_ali) / (sent1_sum + sent2_sum + 1e-6)]
            info = [sent1_ali, sent2_ali, sent1_sum, sent2_sum]
            features.append(feature)
            infos.append(info)

        return features, infos
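The single feature produced above is the idf-weighted fraction of aligned words across both sentences. A tiny numeric sketch of the formula with hypothetical weights:

sent1_ali, sent2_ali = 2.0, 2.0  # aligned idf mass per sentence
sent1_sum, sent2_sum = 3.0, 3.0  # total idf mass per sentence
score = 1.0 * (sent1_ali + sent2_ali) / (sent1_sum + sent2_sum + 1e-6)
print(round(score, 4))  # 0.6667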
Code example #8
def load_parse_data(train_file, parser=None, flag=False):
    """
    Load data after Parse, like POS, NER, etc.
    Value: [ SentPair:class, ... ]
    Parameter:
        flag: False(Default), Load from file (resources....)
              True, Parse and Write to file, and then load from file
    """
    ''' Pre-Define Write File '''

    # parse_train_file = config.PARSE_DIR + '/' + \
    #                    utils.FileManager.get_file(train_file)

    parse_train_file = train_file.replace('./data', './generate/parse')

    if flag or not os.path.isfile(parse_train_file):

        print(train_file)
        if parser is None:
            raise RuntimeError(
                "parser should be init by ``nlp = stst.StanfordNLP('http://localhost:9000')``"
            )
        ''' Parse Data '''
        data = load_STS(train_file)

        print('*' * 50)
        print("Parse Data, train_file=%s, n_train=%d\n" %
              (train_file, len(data)))

        parse_data = []
        process_bar = pyprind.ProgPercent(len(data))
        for (sa, sb, score) in data:
            process_bar.update()
            parse_sa = parser.parse(sa)
            parse_sb = parser.parse(sb)
            parse_data.append((parse_sa, parse_sb, score))
        ''' Write Data to File '''
        with utils.create_write_file(parse_train_file) as f_parse:
            for parse_instance in parse_data:
                line = json.dumps(parse_instance)
                print(line, file=f_parse)
    ''' Load Data from File '''
    print('*' * 50)
    parse_data = []
    with utils.create_read_file(parse_train_file) as f:
        for line in f:
            parse_json = json.loads(line)
            sentpair_instance = SentPair(parse_json)
            parse_data.append(sentpair_instance)

    print("Load Data, train_file=%s, n_train=%d\n" %
          (train_file, len(parse_data)))
    return parse_data
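The caching pattern here is a JSON-lines round trip: parse once, dump one JSON record per line, reload line by line. A self-contained sketch using an in-memory buffer in place of the parse file:

import io
import json

records = [("A man runs.", "A person is running.", 4.6)]
buf = io.StringIO()  # stands in for the parse file on disk
for record in records:
    print(json.dumps(record), file=buf)
buf.seek(0)
reloaded = [tuple(json.loads(line)) for line in buf]
print(reloaded == records)  # True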
Code example #9
    def extract_information(self, train_instances):
        if self.is_training:
            sents, labels = [], []
            for train_instance in train_instances:
                sent = train_instance.get_word()
                label = train_instance.get_label()
                sents.append(sent)
                labels.append(label)

            rf_dict = utils.rf_calculator(sents, labels, max_cnt=1000)
            with utils.create_write_file(config.DICTIONARY_DIR + '/rf_dict.txt', 'w') as fw:
                json.dump(rf_dict, fw, ensure_ascii=False)

        with utils.create_read_file(config.DICTIONARY_DIR + '/rf_dict.txt', 'rb') as fr:
            rf_dict = json.load(fr)

        with utils.create_read_file(config.DICTIONARY_DIR + '/vocab.txt') as fr:
            vocab_dict = {}
            for line in fr:
                line = line.strip().split('\t')
                vocab_dict[line[0]] = int(line[1])

        self.rf_dict = rf_dict
        self.vocab_dict = vocab_dict
Code example #10
def load_parse_data(train_file, nlp=None, flag=False):
    """
    Load data after Parse, like POS, NER, etc.
    Value: [ SentPair:class, ... ]
    Parameter:
        flag: False(Default), Load from file (resources....)
              True, Parse and Write to file, and then load from file
    """
    ''' Pre-Define Write File '''

    # parse_train_file = config.PARSE_DIR + '/' + \
    #                    utils.FileManager.get_file(train_file)

    parse_train_file = train_file.replace('./data', './generate/parse')

    if flag or not os.path.isfile(parse_train_file):

        print(train_file)
        ''' Parse Data '''
        data = load_data(train_file)

        print('*' * 50)
        print("Parse Data, train_file=%s, n_train=%d\n" %
              (train_file, len(data)))

        parse_data = []
        process_bar = pyprind.ProgPercent(len(data))
        for (sent, label) in data:
            process_bar.update()
            sent = preprocess(sent)
            parse_data.append((sent, label))
        ''' Write Data to File '''
        with utils.create_write_file(parse_train_file) as f_parse:
            for parse_instance in parse_data:
                line = json.dumps(parse_instance, ensure_ascii=False)
                print(line, file=f_parse)
    ''' Load Data from File '''
    print('*' * 50)
    parse_data = []
    with utils.create_read_file(parse_train_file) as f:
        for line in f:
            sent, label = json.loads(line)
            sent_instance = Sent(sent, label)
            parse_data.append(sent_instance)

    print("Load Data, train_file=%s, n_train=%d\n" %
          (train_file, len(parse_data)))
    return parse_data
Code example #11
    def load_feature_from_file(feature_file):
        """
        load features from file
        """
        f_feature = utils.create_read_file(feature_file)

        feature_information = f_feature.readline()
        n_instance, n_dim = feature_information.strip().split()
        n_instance, n_dim = int(n_instance), int(n_dim)

        features = []
        for feature in f_feature:
            feature_string, instance_string = feature.split("\t#\t")
            features.append(feature_string)

        return features, n_dim, n_instance
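The feature-file layout implied above: a header line 'n_instance n_dim', then one 'feature<TAB>#<TAB>instance' line per example. Parsing a hypothetical two-line file standalone:

content = '2 3\n1:0.5 2:0.1 3:0.0\t#\tsent pair A\n1:0.9 2:0.2 3:0.4\t#\tsent pair B\n'
lines = content.splitlines()
n_instance, n_dim = (int(x) for x in lines[0].split())
features = [line.split('\t#\t')[0] for line in lines[1:]]
print(n_instance, n_dim, features[0])  # 2 3 1:0.5 2:0.1 3:0.0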
Code example #12
    def extract_instances(self, train_instances):
        features = []
        infos = []

        input_file = self.feature_file.split('/')[-2] + '.txt'
        f_in = utils.create_read_file(config.NN_FEATURE_PATH + '/' + self.nntype + '/' + input_file)
        for line in f_in:
            line = line.strip()
            obj = json.loads(line)
            sc = obj[0] / 5.0
            features.append([sc])
            infos.append([])

        print(len(features), features[0])

        return features, infos
Code example #13
    def extract_instances(self, train_instances):
        features = []
        infos = []
        input_file = self.feature_file.split('/')[-2] + '.txt'
        f_in = utils.create_read_file(config.NN_FEATURE_PATH + '/' + self.nntype + '/' + input_file)
        for line in f_in:
            line = line.strip()
            obj = json.loads(line)
            emb1 = obj[1]
            emb2 = obj[2]
            emb1 = vk.normalize(emb1)
            emb2 = vk.normalize(emb2)
            feats, info = vk.get_all_kernel(emb1, emb2)
            features.append(feats)
            infos.append(info)

        print(len(features), features[0], infos[0])

        return features, infos
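vk is not defined in this snippet; a plausible reading is that it length-normalizes the two embeddings and scores them with several vector kernels. A minimal illustrative sketch of one such kernel (cosine similarity), not the actual vk API:

import math

def normalize(vec):
    norm = math.sqrt(sum(x * x for x in vec)) or 1.0
    return [x / norm for x in vec]

emb1 = normalize([1.0, 2.0, 3.0])
emb2 = normalize([2.0, 4.0, 6.0])
cosine = sum(a * b for a, b in zip(emb1, emb2))
print(round(cosine, 4))  # 1.0 (parallel vectors)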
Code example #14
 def extract_information(self, train_instances):
     if self.is_training:
         sents = []
         for train_instance in train_instances:
             sent = train_instance.get_sent(self.type)
             sents.append(sent)
         idf_dict = utils.idf_calculator(sents)
         with utils.create_write_file(config.DICTIONARY_DIR + '/{}_idf_dict.txt'.format(self.type)) as fw:
             idf_dict_tuple = sorted(idf_dict.items(), key=lambda x: x[1], reverse=True)
             for key, value in idf_dict_tuple:
                 print('{}\t{}'.format(key, value), file=fw)
     else:
         with utils.create_read_file(config.DICTIONARY_DIR + '/{}_idf_dict.txt'.format(self.type)) as fr:
             idf_dict = {}
             for line in fr:
                 line = line.strip().split('\t')
                 idf_dict[line[0]] = float(line[1])
     self.unigram_dict = idf_dict
     word_keys = sorted(idf_dict.keys(), reverse=True)
     self.word2index = {word: i for i, word in enumerate(word_keys)}
Code example #15
    def extract_information(self, train_instances):
        if self.is_training:
            sents = []
            for train_instance in train_instances:
                sent = train_instance.get_word()
                sents.append(sent)
            idf_dict = utils.idf_calculator(sents)

            #idf_dict = sorted(idf_dict.iteritems(), key=lambda x: x[1], reverse=True)

            with utils.create_write_file(config.DICTIONARY_DIR + '/idf_dict.txt') as fw:
                for key in idf_dict:
                    print('{}\t{}'.format(key, idf_dict[key]), file=fw)

            print(len(idf_dict))
        else:
            with utils.create_read_file(config.DICTIONARY_DIR + '/idf_dict.txt') as fr:
                idf_dict = {}
                for line in fr:
                    line = line.strip().split('\t')
                    idf_dict[line[0]] = float(line[1])

        self.unigram_dict = idf_dict
Code example #16
    def load_idf_dict(self, dict_name='idf_dict'):

        if dict_name not in self.dict_manager:

            word_frequencies = {}

            file_name = config.EX_DICT_DIR + '/word-frequencies.txt'
            print('load dict from file %s \n' % file_name)

            f_dict = utils.create_read_file(file_name)

            for idx, line in enumerate(f_dict):
                if idx == 0:
                    totfreq = int(line)
                else:
                    w, freq = line.strip().split()
                    freq = float(freq)
                    if freq < 10:
                        continue
                    word_frequencies[w] = math.log(totfreq / freq) / math.log(2)
            self.dict_manager[dict_name] = word_frequencies

        return self.dict_manager[dict_name]
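The weight above is log2(total_frequency / word_frequency), a corpus-frequency flavor of IDF. The same computation spelled out with hypothetical numbers:

import math

totfreq, freq = 1000000, 2500.0
weight = math.log(totfreq / freq) / math.log(2)  # equivalent to math.log2(totfreq / freq)
print(round(weight, 3))  # 8.644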
Code example #17
    def load_model_score(self, train_file):
        self.output_file = self.get_output_file(train_file)

        y_pred = utils.create_read_file(self.output_file).readlines()
        y_pred = ['1:' + x.strip().split("\t#\t")[0] for x in y_pred]
        return y_pred
Code example #18
    def cross_validation(self,
                         data_instances,
                         data_file,
                         k_fold=5,
                         shuffle=False):

        self.make_feature_file(data_instances, data_file)

        n_data = len(data_instances)
        n_batch = n_data // k_fold
        data_instances = list(zip(range(n_data), data_instances))

        id_map = list(range(n_data))  # list() so random.shuffle works in Python 3
        if shuffle is True:
            random.shuffle(id_map)

        preds = [None] * n_data
        for fold in range(k_fold):
            st = fold * n_batch
            ed = (fold + 1) * n_batch
            if ed > n_data:
                ed = n_data

            data = utils.create_read_file(self.dev_feature_file).readlines()

            # make train data
            train = [
                data[id_map[idx]].strip() for idx in range(len(data))
                if idx not in range(st, ed)
            ]
            dev_feature_file_train = self.dev_feature_file.replace(
                'txt', 'train')
            f_train = utils.create_write_file(dev_feature_file_train)
            print('\n'.join(train), file=f_train)
            f_train.close()

            # make dev data
            dev = [data[id_map[idx]].strip() for idx in range(st, ed)]
            dev_feature_file_dev = self.dev_feature_file.replace('txt', 'dev')
            f_dev = utils.create_write_file(dev_feature_file_dev)
            print('\n'.join(dev), file=f_dev)
            f_dev.close()
            ''' Train Classifier '''
            self.classifier.train_model(
                dev_feature_file_train,
                self.model_file)  # note: the per-fold train split, not self.dev_feature_file
            ''' Predict Labels '''
            self.output_file = self.get_output_file(data_file)

            predict_label = self.classifier.test_model(dev_feature_file_dev,
                                                       self.model_file,
                                                       self.output_file)

            for idx in range(st, ed):
                idy = idx - st
                preds[id_map[idx]] = predict_label[idy]
        ''' Write to File '''
        self.output_file = self.get_output_file(data_file)

        f_out = utils.create_write_file(self.output_file)
        for label, train_instance in zip(preds, data_instances):
            print('%.2f\t#\t%s' %
                  (label, train_instance[1].get_instance_string()),
                  file=f_out)
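The fold arithmetic above: fold f takes indices [f * n_batch, (f + 1) * n_batch) as dev and trains on the rest. A standalone sketch; note that when n_data is not divisible by k_fold, the trailing n_data % k_fold instances never appear in any dev fold:

n_data, k_fold = 10, 5
n_batch = n_data // k_fold
for fold in range(k_fold):
    st, ed = fold * n_batch, min((fold + 1) * n_batch, n_data)
    print(fold, list(range(st, ed)))  # fold 0 -> [0, 1], fold 1 -> [2, 3], ...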
Code example #19
    def extract_instances(self, train_instances):
        """ extract features to features """

        self.extract_information(train_instances)
        idf_weight = self.idf_weight
        default_idf_weight = min(idf_weight.values())

        features = []
        infos = []
        process_bar = pyprind.ProgPercent(len(train_instances))
        ''' get features from train instances'''

        alignment_feature_file = self.feature_file.replace(
            'PosAlignmentFeature', 'AlignmentFeature')
        alignment_features = utils.create_read_file(
            alignment_feature_file).readlines()

        for train_instance, alignment_feature in zip(train_instances,
                                                     alignment_features[1:]):
            process_bar.update()

            alignment_feature = alignment_feature.split('\t#\t')[1]
            myWordAlignments = json.loads(alignment_feature)[
                0]  # list of [sa_idx, sb_idx]; indices start from 1
            pos_sa, pos_sb = train_instance.get_pos_tag(stopwords=False)
            ner_sa, ner_sb = train_instance.get_word(type='ner',
                                                     stopwords=False)
            word_sa, word_sb = train_instance.get_word(type='lemma',
                                                       lower=True)

            feature, info = [], []
            sa_aligned = [sa_idx - 1 for sa_idx, sb_idx in myWordAlignments]
            sb_aligned = [sb_idx - 1 for sa_idx, sb_idx in myWordAlignments]

            sent1_aligned = [0] * len(word_sa)
            sent2_aligned = [0] * len(word_sb)

            for sa_index in sa_aligned:
                sent1_aligned[sa_index] = 1

            for sb_index in sb_aligned:
                sent2_aligned[sb_index] = 1

            sent1_sum = {'n': 0., 'v': 0., 'a': 0., 'r': 0., '#': 0.}
            sent2_sum = {'n': 0., 'v': 0., 'a': 0., 'r': 0., '#': 0.}
            sent1_ali = {'n': 0., 'v': 0., 'a': 0., 'r': 0., '#': 0.}
            sent2_ali = {'n': 0., 'v': 0., 'a': 0., 'r': 0., '#': 0.}
            for idx, word in enumerate(word_sa):
                pos = pos_sa[idx][1]
                weight = idf_weight.get(word, default_idf_weight)
                sent1_ali[pos] += sent1_aligned[idx] * weight  # += (accumulate), matching the sent2 loop below
                sent1_sum[pos] += weight

            for idx, word in enumerate(word_sb):
                pos = pos_sb[idx][1]
                weight = idf_weight.get(word, default_idf_weight)
                sent2_ali[pos] += sent2_aligned[idx] * weight
                sent2_sum[pos] += weight

            for pos in ['n', 'v', 'a', 'r', '#']:
                score = 1.0 * (sent1_ali[pos] + sent2_ali[pos]) / (sent1_sum[pos] + sent2_sum[pos] + 1e-6) \
                    if sent1_sum[pos] + sent2_sum[pos] > 1e-6 else 0.0
                feature.append(score)

            info = [sent1_sum, sent2_sum, sent1_ali, sent2_ali]

            features.append(feature)
            infos.append(info)

        return features, infos