Example 1
    def test(self, dev_instances, dev_file):
        """
        """
        ''' 1. Extract Features '''
        self.make_feature_file(dev_instances, dev_file, dev=True)

        self.output_file = self.get_output_file(dev_file)
        print(self.output_file)
        ''' 2. Predict Answers '''
        predict_label = self.classifier.test_model(self.dev_feature_file,
                                                   self.model_file,
                                                   self.output_file)

        f_out = utils.create_write_file(self.output_file)
        for label, dev_instance in zip(predict_label, dev_instances):
            print('{:d}\t#\t{}'.format(label,
                                       dev_instance.get_instance_string()),
                  file=f_out)

        submit_file = self.output_file.replace('.txt', '.submit')
        f_out = utils.create_write_file(submit_file)
        print('#id\tcorrectLabelW0orW1', file=f_out)
        for label, dev_instance in zip(predict_label, dev_instances):
            print('{}\t{}'.format(dev_instance.get_id(), label), file=f_out)

        return predict_label
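Every example on this page writes through utils.create_write_file, which is never shown. A minimal sketch of what such a helper plausibly looks like, assuming it creates missing parent directories and opens the file as UTF-8 text (both are assumptions, not the library's confirmed behavior):

import codecs
import os

def create_write_file(file_path, mode='w'):
    # Assumed helper: ensure the parent directory exists, then open the
    # file as UTF-8 text so print(..., file=f) works in Python 3.
    dir_name = os.path.dirname(file_path)
    if dir_name and not os.path.exists(dir_name):
        os.makedirs(dir_name)
    return codecs.open(file_path, mode, encoding='utf-8')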
Example 2
    def train(self, train_instances, train_file, out_list=None):
        """
        out_list is used to select a subset of train_instances from the full
        set; it only applies once all features have already been made.
        """
        ''' 1. Extract Features '''
        self.make_feature_file(train_instances, train_file)

        if out_list:
            dev = utils.create_read_file(self.train_feature_file).readlines()
            dev = [
                dev[idx].strip() for idx in range(len(dev))
                if idx not in out_list
            ]
            f_dev = utils.create_write_file(self.train_feature_file)
            print('\n'.join(dev), file=f_dev)
            f_dev.close()
            print('finish filter, train examples %d' % len(dev))
        ''' 2. Train Classifier '''
        self.classifier.train_model(self.train_feature_file, self.model_file)
        ''' 3. Predict Answers '''
        self.output_file = self.get_output_file(train_file)
        predict_label = self.classifier.test_model(self.train_feature_file,
                                                   self.model_file,
                                                   self.output_file)

        f_out = utils.create_write_file(self.output_file)
        for label, train_instance in zip(predict_label, train_instances):
            print('%.2f\t#\t%s' %
                  (label, train_instance.get_instance_string()),
                  file=f_out)

        return self.classifier
Example 3
    def extract_information(self, train_instances):
        if self.is_training:
            sents = []
            for train_instance in train_instances:
                warrant0, warrant1, reason, claim, title, info = train_instance.get_six(
                    type='word')
                sents.append(warrant0)
                sents.append(warrant1)
                sents.append(reason)
                sents.append(claim)
            idf_dict = utils.idf_calculator(sents)
            # idf_dict = sorted(idf_dict.items(), key=lambda x: x[1], reverse=True)
            with utils.create_write_file(config.RESOURCE_DIR +
                                         '/idf_dict.txt') as fw:
                for key in idf_dict:
                    print('{}\t{}'.format(key, idf_dict[key]), file=fw)
            print(len(idf_dict))
        else:
            with utils.create_read_file(config.RESOURCE_DIR +
                                        '/idf_dict.txt') as fr:
                idf_dict = {}
                for line in fr:
                    line = line.strip().split('\t')
                    idf_dict[line[0]] = float(line[1])
        self.unigram_dict = idf_dict
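Examples 3, 19, and 20 all assume utils.idf_calculator, which maps each token to an inverse-document-frequency weight. A minimal sketch of one common definition, assuming the input is a list of tokenized sentences; the +1 smoothing and the natural log are assumptions:

import math

def idf_calculator(sents):
    # sents: list of tokenized sentences (each a list of words)
    n_docs = len(sents)
    doc_freq = {}
    for sent in sents:
        for word in set(sent):  # count each word at most once per sentence
            doc_freq[word] = doc_freq.get(word, 0) + 1
    return {word: math.log(n_docs / (1.0 + df))
            for word, df in doc_freq.items()}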
Example 4
    def write_feature_to_file(feature_file, features, infos):
        """
        write features string to file
        """
        if type(features[0]) is list:
            dim = len(features[0])
        else:
            dim = infos[0][0]

        f_feature = utils.create_write_file(feature_file)
        ''' write features information to file '''
        print(len(features), dim, file=f_feature)
        ''' write features string to file '''
        for feature, info in zip(features, infos):
            ''' type(feature) is list '''
            if type(feature) is list:
                feature_string = Feature._feat_list_to_string(feature)
            elif type(feature) is str:
                feature_string = feature
            else:
                raise NotImplementedError

            info_string = Feature._info_list_to_string(info)
            print(feature_string + '\t#\t' + info_string, file=f_feature)

        f_feature.close()
Example 5
        def __create_dict(*args, **kwargs):
            print("====> create dict for function [{}]".format(func.__name__))

            ret = func(*args, **kwargs)
            ''' remove item whose frequency is less than threshold '''
            if 'threshold' in kwargs:
                threshold = kwargs['threshold']
                for key in list(ret.keys()):  # copy keys: the dict shrinks inside the loop
                    if ret[key] < threshold:
                        ret.pop(key)
            ''' write dict to file '''
            file_name = 'dict_{}.txt'.format(func.__name__)
            f_dict = utils.create_write_file(file_name)

            if type(ret) == list:
                # ensure it is set
                ret = list(set(ret))
                ret = sorted(ret)
                for idx, item in enumerate(ret):
                    print(str(item), file=f_dict)

            elif type(ret) == dict:
                # order the dict
                for item in sorted(ret.keys()):
                    print('%s\t%s' % (item, ret[item]), file=f_dict)
            else:
                raise NotImplementedError

            f_dict.close()

            print("====> write file {}, {:d}    instances".format(
                file_name, len(ret)))
            return ret
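__create_dict closes over func, so it only makes sense as the inner wrapper of a decorator. A hedged sketch of how the assumed outer decorator and a decorated create_* function would fit together (create_dict and create_word_dict are illustrative names, not confirmed by the source):

def create_dict(func):
    def __create_dict(*args, **kwargs):
        # ... threshold filtering and file writing as in Example 5 ...
        return func(*args, **kwargs)
    return __create_dict

@create_dict
def create_word_dict(sents, threshold=2):
    # hypothetical target function: returns {word: frequency}
    counts = {}
    for sent in sents:
        for word in sent:
            counts[word] = counts.get(word, 0) + 1
    return counts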
Example 6
def record(record_file, dev_pearsonr, test_pearsonr, model):
    with utils.create_write_file(record_file, 'a') as f:  # text mode: csv.writer expects str rows in Python 3
        writer = csv.writer(f, delimiter=',')
        features = [feature.feature_name for feature in model.feature_list]
        writer.writerow([
            model.model_name, dev_pearsonr, test_pearsonr,
            model.classifier.strategy.trainer, features
        ])
Example 7
def load_parse_data(train_file, parser=None, flag=False):
    """
    Load data after Parse, like POS, NER, etc.
    Value: [ SentPair:class, ... ]
    Parameters:
        flag: False (default), load from the parsed file (resources....)
              True, parse and write to file, then load from it
    """
    ''' Pre-Define Write File '''

    # parse_train_file = config.PARSE_DIR + '/' + \
    #                    utils.FileManager.get_file(train_file)

    parse_train_file = train_file.replace('./data', './generate/parse')

    if flag or not os.path.isfile(parse_train_file):

        print(train_file)
        if parser is None:
            raise RuntimeError(
                "parser should be init by ``nlp = stst.StanfordNLP('http://localhost:9000')``"
            )
        ''' Parse Data '''
        data = load_STS(train_file)

        print('*' * 50)
        print("Parse Data, train_file=%s, n_train=%d\n" %
              (train_file, len(data)))

        parse_data = []
        process_bar = pyprind.ProgPercent(len(data))
        for (sa, sb, score) in data:
            process_bar.update()
            parse_sa = parser.parse(sa)
            parse_sb = parser.parse(sb)
            parse_data.append((parse_sa, parse_sb, score))
        ''' Write Data to File '''
        with utils.create_write_file(parse_train_file) as f_parse:
            for parse_instance in parse_data:
                line = json.dumps(parse_instance)
                print(line, file=f_parse)
    ''' Load Data from File '''
    print('*' * 50)
    parse_data = []
    with utils.create_read_file(parse_train_file) as f:
        for line in f:
            parse_json = json.loads(line)
            sentpair_instance = SentPair(parse_json)
            parse_data.append(sentpair_instance)

    print("Load Data, train_file=%s, n_train=%d\n" %
          (train_file, len(parse_data)))
    return parse_data
Example 8
    def extract_instances(self, train_instances):
        asiya = AsiyaDriver()

        n_lines = 250
        features = []
        infos = []

        idx_list = range(0, len(train_instances), n_lines)

        for idx in idx_list:
            st, ed = idx, idx + n_lines
            if ed > len(train_instances):
                ed = len(train_instances)
            print("\rAsiya MT Featyre index = %d, st = %d, ed = %d" %
                  (idx, st, ed),
                  end=' ')

            while True:
                ''' sa -> sb '''
                f_sa = utils.create_write_file(config.EX_DICT_DIR + '/sa.txt')
                f_sb = utils.create_write_file(config.EX_DICT_DIR + '/sb.txt')
                for id in range(st, ed):
                    lemma_sa, lemma_sb = train_instances[id].get_word(
                        type='lemma')
                    lemma_sa = ' '.join(lemma_sa)
                    lemma_sb = ' '.join(lemma_sb)
                    print(lemma_sa, file=f_sa)
                    print(lemma_sb, file=f_sb)
                f_sa.close()
                f_sb.close()
                page = asiya.run_file()
                if page != ' ':
                    features_sa = asiya.extract_table(page)
                    break
                else:
                    asiya.reload()
            features += features_sa
            infos += [['0']] * len(features_sa)  # one info entry per new feature row
        print(features[:10])
        return features, infos
Example 9
    def test_model(self, test_file_path, model_path, result_file_path):
        print("==> Load the data ...")
        test_X, test_y = self.load_file(test_file_path)

        print("==> Load the model ...")
        clf = pickle.load(open(model_path, 'rb'))

        print("==> Test the model ...")
        y_pred = clf.predict(test_X.toarray())

        print("==> Save the result ...")
        with utils.create_write_file(result_file_path) as f:
            for y in y_pred:
                print(y, file=f)
        return y_pred
Example 10
    def test_model(self, test_file_path, model_path, result_file_path):
        print("==> Load the model ...")
        # bst = pickle.load(open(model_path, 'rb'))

        bst = xgb.Booster(self.param)
        bst.load_model(model_path)

        print("==> Test the model ...")
        dtest = xgb.DMatrix(test_file_path)
        y_pred = bst.predict(dtest)
        print("==> Save the result ...")
        with utils.create_write_file(result_file_path) as f:
            for y in y_pred:
                print(y, file=f)
        return y_pred
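xgb.DMatrix(test_file_path) builds the test matrix directly from the feature file, so the file must be in a format XGBoost can parse on its own (typically svmlight/libsvm "label index:value" rows). A minimal usage sketch under that assumption; the file and model paths are hypothetical:

import xgboost as xgb

dtest = xgb.DMatrix('generate/dev.feature.txt')  # hypothetical libsvm-format file
bst = xgb.Booster()
bst.load_model('generate/model.bin')             # hypothetical saved model
y_pred = bst.predict(dtest)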
Example 11
def load_parse_data(train_file, nlp=None, flag=False):
    """
    Load data after Parse, like POS, NER, etc.
    Value: [ SentPair:class, ... ]
    Parameters:
        flag: False (default), load from the parsed file (resources....)
              True, parse and write to file, then load from it
    """
    ''' Pre-Define Write File '''

    # parse_train_file = config.PARSE_DIR + '/' + \
    #                    utils.FileManager.get_file(train_file)

    parse_train_file = train_file.replace('./data', './generate/parse')

    if flag or not os.path.isfile(parse_train_file):

        print(train_file)
        ''' Parse Data '''
        data = load_data(train_file)

        print('*' * 50)
        print("Parse Data, train_file=%s, n_train=%d\n" %
              (train_file, len(data)))

        parse_data = []
        process_bar = pyprind.ProgPercent(len(data))
        for (sent, label) in data:
            process_bar.update()
            sent = preprocess(sent)
            parse_data.append((sent, label))
        ''' Write Data to File '''
        with utils.create_write_file(parse_train_file) as f_parse:
            for parse_instance in parse_data:
                line = json.dumps(parse_instance, ensure_ascii=False)
                print(line, file=f_parse)
    ''' Load Data from File '''
    print('*' * 50)
    parse_data = []
    with utils.create_read_file(parse_train_file) as f:
        for line in f:
            sent, label = json.loads(line)
            sentpair_instance = Sent(sent, label)
            parse_data.append(sentpair_instance)

    print("Load Data, train_file=%s, n_train=%d\n" %
          (train_file, len(parse_data)))
    return parse_data
Example 12
    def write_feature_to_file(feature_file, features, infos):
        """
        write features string to file
        """

        dim = len(features[0])
        f_feature = utils.create_write_file(feature_file)
        ''' write features information to file '''
        print(len(features), dim, file=f_feature)
        ''' write features string to file '''
        for feature, info in zip(features, infos):
            ''' type(feature) is list '''
            feature_string = Feature._feat_list_to_string(feature)
            info_string = Feature._info_list_to_string(info)
            print(feature_string + '\t#\t' + info_string, file=f_feature)

        f_feature.close()
Example 13
    def test_model(self, test_file_path, model_path, result_file_path):
        print("==> Load the model ...")
        # bst = pickle.load(open(model_path, 'rb'))
        bst = xgb.Booster(self.param)
        bst.load_model(model_path)

        print("==> Test the model ...")
        dtest = xgb.DMatrix(test_file_path)
        y_probs = bst.predict(dtest).reshape(-1, self.num_class)
        with open(result_file_path + '.pkl', 'wb') as f:
            pickle.dump(y_probs, f)
        y_pred = np.argmax(y_probs, axis=1)

        print("==> Save the result ...")
        with utils.create_write_file(result_file_path) as f:
            for y in y_pred:
                print(y, file=f)
        return y_pred
Example 14
def load(train_file, train_gs=None, dev_flag=False):
    # train_file = config.TRAIN_FILE
    # train_gs = config.TRAIN_GS_FILE
    train_parse_data = data_utils.load_parse_data(train_file,
                                                  train_gs,
                                                  flag=dev_flag)
    datas = []
    for train_instance in train_parse_data:
        data = make(train_instance)
        datas.append(data)

    file_name = train_file.split('/')[-1]
    path_dir = '../iclr2016-test/data/eval/'
    f = utils.create_write_file(path_dir + file_name)
    for sa, sb, sc in datas:
        f.write('%s\t%s\t%.4f\n' % (' '.join(sa), ' '.join(sb), sc * 5))
    f.close()

    return datas
Example 15
    def test_model(self, test_file_path, model_path, result_file_path):
        print("==> Load the data ...")
        X_test, Y_test = self.load_file(test_file_path)
        print(test_file_path, shape(X_test))

        print("==> Load the model ...")
        clf = pickle.load(open(model_path, 'rb'))
        scaler_path = model_path.replace('.pkl', '.scaler.pkl')
        min_max_scaler = pickle.load(open(scaler_path, 'rb'))

        print("==> Test the model ...")
        X_test_minmax = min_max_scaler.transform(X_test)
        y_pred = clf.predict(X_test_minmax.toarray())

        print("==> Save the result ...")
        with utils.create_write_file(result_file_path) as f:
            for y in y_pred:
                print(y, file=f)
        return y_pred
Example 16
    def test(self, dev_instances, dev_file):
        """
        """
        ''' 1. Extract Features '''
        self.make_feature_file(dev_instances, dev_file, dev=True)

        self.output_file = self.get_output_file(dev_file)
        print(self.output_file)
        ''' 2. Predict Answers '''
        predict_label = self.classifier.test_model(self.dev_feature_file,
                                                   self.model_file,
                                                   self.output_file)

        f_out = utils.create_write_file(self.output_file)
        for label, dev_instance in zip(predict_label, dev_instances):
            print('%.2f\t#\t%s' % (label, dev_instance.get_instance_string()),
                  file=f_out)

        return predict_label
Example 17
    def test_model(self, test_file_path, model_path, result_file_path):
        print("==> Load the data ...")
        X_test, Y_test = self.load_file(test_file_path)
        print(test_file_path, shape(X_test))
        X_test = X_test.toarray()
        for x in X_test[:10]:
            print(x)

        print("==> Test the model ...")
        y_pred = []
        for x in X_test:
            x = sum(x) / len(x)
            y_pred.append(x)

        print("==> Save the result ...")
        with utils.create_write_file(result_file_path) as f:
            for y in y_pred:
                print(y, file=f)
        return y_pred
Example 18
    def test_model(self, test_file_path, model_path, result_file_path):
        print("==> Load the data ...")
        X_test, Y_test = self.load_file(test_file_path)
        print(test_file_path, shape(X_test))
        X_test = X_test.toarray()
        X_test = np.array(X_test, dtype=np.int32)

        print("==> Test the model ...")
        y_pred = []
        for x in X_test:
            counter = Counter(x)
            topk = counter.most_common(1)  # [(value, count)]
            y_pred.append(topk[0][0])

        print("==> Save the result ...")
        with utils.create_write_file(result_file_path) as f:
            for y in y_pred:
                print(y, file=f)
        return y_pred
Example 19
    def extract_information(self, train_instances):
        if self.is_training:
            sents = []
            for train_instance in train_instances:
                sent = train_instance.get_sent(self.type)
                sents.append(sent)
            idf_dict = utils.idf_calculator(sents)
            with utils.create_write_file(config.DICTIONARY_DIR +
                                         '/{}_idf_dict.txt'.format(self.type)) as fw:
                idf_dict_tuple = sorted(idf_dict.items(),
                                        key=lambda x: x[1],
                                        reverse=True)
                for key, value in idf_dict_tuple:
                    print('{}\t{}'.format(key, value), file=fw)
        else:
            with utils.create_read_file(config.DICTIONARY_DIR +
                                        '/{}_idf_dict.txt'.format(self.type)) as fr:
                idf_dict = {}
                for line in fr:
                    line = line.strip().split('\t')
                    idf_dict[line[0]] = float(line[1])
        self.unigram_dict = idf_dict
        word_keys = sorted(idf_dict.keys(), reverse=True)
        self.word2index = {word: i for i, word in enumerate(word_keys)}
Example 20
    def extract_information(self, train_instances):
        if self.is_training:
            sents = []
            for train_instance in train_instances:
                sent = train_instance.get_word()
                sents.append(sent)
            idf_dict = utils.idf_calculator(sents)

            #idf_dict = sorted(idf_dict.iteritems(), key=lambda x: x[1], reverse=True)

            with utils.create_write_file(config.DICTIONARY_DIR + '/idf_dict.txt') as fw:
                for key in idf_dict:
                    print('{}\t{}'.format(key, idf_dict[key]), file=fw)

            print(len(idf_dict))
        else:
            with utils.create_read_file(config.DICTIONARY_DIR + '/idf_dict.txt') as fr:
                idf_dict = {}
                for line in fr:
                    line = line.strip().split('\t')
                    idf_dict[line[0]] = float(line[1])

        self.unigram_dict = idf_dict
Example 21
        def __create_dict(*args, **kwargs):
            print("==" * 40)
            print("Create dict for %s ...  " % (func.__name__.replace("create_", "")))
            print("==" * 40)
            ret = func(*args, **kwargs)

            ''' remove item whose frequency is less than threshold '''
            if 'threshold' in kwargs:
                threshold = kwargs['threshold']
                for key in list(ret.keys()):  # copy keys: the dict shrinks inside the loop
                    if ret[key] < threshold:
                        ret.pop(key)

            ''' write dict to file '''

            file_name = 'dict_' + func.__name__.replace("create_", "") + '.txt'
            f_dict = utils.create_write_file(config.DICT_DIR + '/' + file_name)

            if type(ret) == list:
                # ensure it is set
                ret = list(set(ret))
                ret = sorted(ret)
                for idx, item in enumerate(ret):
                    print(str(item), file=f_dict)

            elif type(ret) == dict:
                # order the dict
                for item in sorted(ret.keys()):
                    print('%s\t%s' % (item, ret[item]), file=f_dict)
            else:
                raise NotImplementedError

            f_dict.close()

            print("Write file: %s, %d instances" % (file_name, len(ret)))
            return ret
Example 22
    def extract_information(self, train_instances):
        if self.is_training:
            sents, labels = [], []
            for train_instance in train_instances:
                sent = train_instance.get_word()
                label = train_instance.get_label()
                sents.append(sent)
                labels.append(label)

            rf_dict = utils.rf_calculator(sents, labels, max_cnt=1000)
            with utils.create_write_file(config.DICTIONARY_DIR + '/rf_dict.txt', 'w') as fw:
                json.dump(rf_dict, fw, ensure_ascii=False)

        with utils.create_read_file(config.DICTIONARY_DIR + '/rf_dict.txt', 'rb') as fr:
            rf_dict = json.load(fr)

        with utils.create_read_file(config.DICTIONARY_DIR + '/vocab.txt') as fr:
            vocab_dict = {}
            for line in fr:
                line = line.strip().split('\t')
                vocab_dict[line[0]] = int(line[1])

        self.rf_dict = rf_dict
        self.vocab_dict = vocab_dict
Example 23
    def make_feature_file(self, train_instances, train_file, dev=False):
        """

        :param train_instances:
        :param train_file:
        :param dev:
        :return:
        TODO. similar to feature, write to file
        """

        print("-" * 120)
        print("\n".join([f.feature_name for f in self.feature_list]))
        print("-" * 120)
        ''' Extract Features '''
        feature_strings = []
        feature_dimensions = []
        sum_feature_dimension = 0
        for feature_class in self.feature_list:
            if isinstance(feature_class, Feature):
                feature_string, feature_dimension, n_instance = \
                    feature_class.extract_dataset_instances(train_instances, train_file)
                feature_strings.append(feature_string)
                feature_dimensions.append(feature_dimension)
                sum_feature_dimension += feature_dimension
                print('[Extract Features]', 'Feature',
                      feature_class.feature_name, feature_dimension,
                      sum_feature_dimension)

            elif isinstance(feature_class, Model):
                if dev:
                    feature_class.test(train_instances, train_file)
                    feature_string = feature_class.load_model_score(train_file)
                else:
                    ''' separated out into train() to speed things up '''
                    # feature_class.train(train_instances, train_file)
                    feature_string = feature_class.load_model_score(train_file)
                feature_strings.append(feature_string)
                feature_dimensions.append(1)

                sum_feature_dimension += 1
                print('[Extract Features]', 'Model',
                      feature_class.feature_name, 1, sum_feature_dimension)
        ''' Merge Features'''
        merged_feature_string_list = []
        for instance_strings in zip(*feature_strings):
            merged_feature_string = ""
            dimension = 0
            for feature_dimension, feature_string in zip(
                    feature_dimensions, instance_strings):
                if dimension == 0:  # the first feature block
                    merged_feature_string = feature_string
                else:
                    if feature_string != "":
                        # shift this feature's indices by the accumulated dimension
                        temp = ""
                        for item in feature_string.split(" "):
                            if len(item.split(":")) == 1:
                                print(item)
                            index, value = item.split(":")
                            temp += " %d:%s" % (int(index) + dimension, value)
                        merged_feature_string += temp
                dimension += feature_dimension
            merged_feature_string_list.append(merged_feature_string)

        merged_feature_dimension = sum(feature_dimensions)
        ''' Write to feature file '''
        if dev:
            f_feature = utils.create_write_file(self.dev_feature_file)
        else:
            f_feature = utils.create_write_file(self.train_feature_file)

        for idx, feature_string in enumerate(merged_feature_string_list):
            train_instance = train_instances[idx]
            print(str(train_instance.get_score()),
                  feature_string,
                  file=f_feature)

        return merged_feature_string_list, merged_feature_dimension, len(
            merged_feature_string_list)
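The merge loop above stores each row as svmlight-style "index:value" pairs and shifts every later feature's indices by the running dimension so the blocks do not collide. A tiny standalone illustration of that index shifting (shift_indices is an illustrative helper, not part of the class):

def shift_indices(feature_string, offset):
    # rewrite each "index:value" pair with index + offset
    parts = []
    for item in feature_string.split():
        index, value = item.split(':')
        parts.append('%d:%s' % (int(index) + offset, value))
    return ' '.join(parts)

# feature A has dim 2, so feature B's indices move up by 2:
print(shift_indices('1:1 3:0.7', 2))  # -> '3:1 5:0.7'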
Example 24
def load_parse_data(file_path, init=False):
    """
    Load data after Parse, like POS, NER, etc.
    Args:
        file_path:
        init: False, load from the parsed file;
            True, re-parse with CoreNLP and rewrite the files

    Returns:
        parse_data: List of Example:class
    """
    ''' Pre-Define Write File '''
    parse_train_file = file_path.replace('data/', 'generate/parse/')
    parse_word_file = file_path.replace('data/', 'generate/word/')
    parse_lemma_file = file_path.replace('data/', 'generate/lemma/')
    parse_stopwords_lemma_file = file_path.replace(
        'data/', 'generate/stopwords/lemma/')

    if init or not os.path.exists(parse_train_file):
        ''' Define CoreNLP'''
        nlp = corenlp_utils.StanfordNLP(server_url='http://localhost:9000')
        ''' Read data '''
        print("Read Data from file: %s" % file_path)
        examples = load_data(file_path)
        ''' Parse data '''
        print('*' * 50)
        print("Parse Data to file: %s, n_line: %d\n" %
              (parse_train_file, len(examples)))
        parse_data = []
        process_bar = pyprind.ProgPercent(len(examples))
        for example in examples:
            process_bar.update()
            id = example['id']
            label = example['label']
            parse_lst = [id, label]
            try:
                # warrant0 / warrant1 / reason / claim / debate / negclaim
                example_lst = [
                    example['warrant0'], example['warrant1'],
                    example['reason'], example['claim'], example['debate'],
                    example['negclaim'], example['title'], example['info']
                ]
                for sent in example_lst:
                    parse_sent = nlp.parse(sent)
                    parse_lst.append(sent)
                    parse_lst.append(parse_sent)

            except Exception:
                print(example['id'])  # example is a dict here, not a class
                traceback.print_exc()
                parse_lst = (
                    "id label warrant0 warrant1 reason claim title info".split(
                    ))

            parse_data.append(parse_lst)
        ''' Write Data to File '''
        f_parse = utils.create_write_file(parse_train_file)
        f_word = utils.create_write_file(parse_word_file)
        # id warrant0 warrant1 label reason claim title info
        f_lemma = utils.create_write_file(parse_lemma_file)
        f_stopwords_lemma = utils.create_write_file(parse_stopwords_lemma_file)

        for parse_example in parse_data:
            parse_sent = json.dumps(parse_example)  # list -> str
            parse_example = ParseExample(parse_example)  # list -> class

            id = parse_example.get_id()
            label = parse_example.get_label()

            for word_type, fw in zip(['word', 'lemma'], [f_word, f_lemma]):
                warrant0, warrant1, reason, claim, debate, negclaim = parse_example.get_six(
                    return_str=True, type=word_type)
                sent = '%s\t%s\t%s\t%d\t%s\t%s\t%s\t%s' % (
                    id, warrant0, warrant1, label, reason, claim, debate,
                    negclaim)
                print(sent, file=fw)

            warrant0, warrant1, reason, claim, debate, negclaim = parse_example.get_six(
                return_str=True, type='lemma', stopwords=True)
            sent = '%s\t%s\t%s\t%d\t%s\t%s\t%s\t%s' % (
                id, warrant0, warrant1, label, reason, claim, debate, negclaim)
            print(sent, file=f_stopwords_lemma)

            print(parse_sent, file=f_parse)

        f_parse.close()
        f_word.close()
        f_lemma.close()
        f_stopwords_lemma.close()
    ''' Load Data from File '''
    print('*' * 50)
    parse_data = []
    with codecs.open(parse_train_file, 'r', encoding='utf8') as f:
        for line in f:
            # obtain the json object
            parse_sent = json.loads(line)
            # obtain the class
            parse_example = ParseExample(parse_sent)
            parse_data.append(parse_example)

    print("Load Data, file_path=%s  n_line=%d\n" %
          (file_path, len(parse_data)))
    return parse_data
Example 25
    def cross_validation(self,
                         data_instances,
                         data_file,
                         k_fold=5,
                         shuffle=False):

        self.make_feature_file(data_instances, data_file)

        n_data = len(data_instances)
        n_batch = n_data // k_fold
        data_instances = list(zip(range(n_data), data_instances))

        id_map = list(range(n_data))  # list(): random.shuffle cannot shuffle a range
        if shuffle is True:
            random.shuffle(id_map)

        preds = [None] * n_data
        for fold in range(k_fold):
            st = fold * n_batch
            ed = (fold + 1) * n_batch
            if ed > n_data:
                ed = n_data

            data = utils.create_read_file(self.dev_feature_file).readlines()

            # make train data
            train = [
                data[id_map[idx]].strip() for idx in range(len(data))
                if idx not in range(st, ed)
            ]
            dev_feature_file_train = self.dev_feature_file.replace(
                'txt', 'train')
            f_train = utils.create_write_file(dev_feature_file_train)
            print('\n'.join(train), file=f_train)
            f_train.close()

            # make dev data
            dev = [data[id_map[idx]].strip() for idx in range(st, ed)]
            dev_feature_file_dev = self.dev_feature_file.replace('txt', 'dev')
            f_dev = utils.create_write_file(dev_feature_file_dev)
            print('\n'.join(dev), file=f_dev)
            f_dev.close()
            ''' Train Classifier '''
            self.classifier.train_model(
                dev_feature_file_train,
                self.model_file)  # Attention! self.dev_feature_file
            ''' Predict Labels '''
            self.output_file = self.get_output_file(data_file)

            predict_label = self.classifier.test_model(dev_feature_file_dev,
                                                       self.model_file,
                                                       self.output_file)

            for idx in range(st, ed):
                idy = idx - st
                preds[id_map[idx]] = predict_label[idy]
        ''' Write to File '''
        self.output_file = self.get_output_file(data_file)

        f_out = utils.create_write_file(self.output_file)
        for label, train_instance in zip(preds, data_instances):
            print('%.2f\t#\t%s' %
                  (label, train_instance[1].get_instance_string()),
                  file=f_out)
Example 26
    def extract_instances(self, train_instances):
        asiya = AsiyaDriver()

        n_lines = 250
        features = []
        infos = []

        idx_list = range(0, len(train_instances), n_lines)

        for idx in idx_list:
            st, ed = idx, idx + n_lines
            if ed > len(train_instances):
                ed = len(train_instances)
            print("\rAsiya MT Featyre index = %d, st = %d, ed = %d" %
                  (idx, st, ed),
                  end=' ')

            while True:
                ''' sa -> sb '''
                f_sa = utils.create_write_file(config.TMP_DIR + '/sa.txt')
                f_sb = utils.create_write_file(config.TMP_DIR + '/sb.txt')
                for id in range(st, ed):
                    lemma_sa, lemma_sb = train_instances[id].get_word(
                        type='lemma')
                    lemma_sa = ' '.join(lemma_sa)
                    lemma_sb = ' '.join(lemma_sb)
                    print(lemma_sa, file=f_sa)
                    print(lemma_sb, file=f_sb)
                f_sa.close()
                f_sb.close()
                page = asiya.run_file()
                if page != ' ':
                    features_sa = asiya.extract_table(page)
                    break
                else:
                    asiya.reload()

            while True:
                ''' sb -> sa '''
                f_sa = utils.create_write_file(config.TMP_DIR + '/sb.txt')
                f_sb = utils.create_write_file(config.TMP_DIR + '/sa.txt')
                # "F:\PyCharmWorkSpace\SemEval17_T1_System\resources\external_dict\sa.txt"
                for id in range(st, ed):
                    lemma_sa, lemma_sb = train_instances[id].get_word(
                        type='lemma')
                    lemma_sa = ' '.join(lemma_sa)
                    lemma_sb = ' '.join(lemma_sb)
                    print(lemma_sa, file=f_sa)
                    print(lemma_sb, file=f_sb)
                f_sa.close()
                f_sb.close()
                page = asiya.run_file()
                if page != ' ':
                    features_sb = asiya.extract_table(page)
                    break
                else:
                    asiya.reload()
            ''' Merge feature '''
            for a, b in zip(features_sa, features_sb):
                features.append(a + b)
                infos.append([])
        print(features[:10])
        return features, infos