def load_data(self):
    """Load the raw training set and cache it on the instance.

    Reads ``raw_trainingset`` line by line, builds a corpus object per
    line, extracts its feature vector and label, and stores the results
    as ``self.features`` and ``self.targets`` (both numpy arrays).
    """
    features = []
    targets = []
    # `with` guarantees the file is closed even if gen_training_corpus
    # or feature() raises — the original leaked the handle on error.
    with open(raw_trainingset, 'r') as f_train:
        for line in f_train:
            corpus = gen_training_corpus(line)
            feat = feature(corpus.title, corpus.person1, corpus.person2)
            features.append(np.array(feat))
            targets.append(corpus.label)
    self.features = np.array(features)
    self.targets = np.array(targets)
def predict_main(): fi_test = open(test_file, 'r') for line in fi_test: corpus = gen_training_corpus(line) test_corpora = gen_test_corpora(corpus.title) if test_corpora == None: continue print '--------------' max_proba = 0.0 pred_label = -1 for corpus in test_corpora: feats = feature(corpus.title, corpus.person1, corpus.person2) cls = classfier.predict_proba(feats) for i in range(len(relations)): if cls[0][i] > max_proba: max_proba = cls[0][i] pred_label = i print pred_label, max_proba print '--------------' fi_test.close()
trainset_prefix = "project2_TrainingSet7000_" # 将每一种关系实例归类 if __name__ == '__main__': fi_train = open(raw_trainingset, 'r') # 为每一种关系建立一个预处理文件 file_out_list = [] fo_relation_list = [] for i in range(len(relations)): filename = trainset_prefix + str(i) filename = os.path.join(data_dir, filename) file_out_list.append(filename) fo_relation_list.append(open(filename.decode('utf-8'), 'w')) # 读取训练集 for line in fi_train: corpus = gen_training_corpus(line) sp_list = divide_line(corpus.title, corpus.person1, corpus.person2)# 将新闻标题以人名为分隔符划分成3部分 string = "" idx = 0 for sp in sp_list: if sp != '': for t in Seg(sp): # 调用中科院的分词 s = '%s:%s;' % (t[0],t[1]) string += s idx += 1 if idx < len(sp_list): string += '||' string += '\n' fo_relation_list[corpus.label].write(string) fi_train.close() for fo in fo_relation_list: