def proc_right_segments(relation, segments):
    """Parse the right-context segment string for *relation*.

    *segments* is a ';'-separated list of "word:pos" pairs.  Each word and
    POS tag is added to the global ``total_featureset_dict[relation]`` sets
    ('right_words' / 'p2_right_pos'), and the per-line word/POS lists are
    appended to the corresponding ``featureset_dict[relation]`` lists.

    Args:
        relation: key into the module-level feature dictionaries.
        segments: raw right-context string, e.g. "w1:p1;w2:p2".
    """
    seg_list = proc_line(segments, ';')
    word_list = []
    pos_list = []
    for seg in seg_list:
        if seg == '':
            continue
        sp_list = proc_line(seg, ':')
        if len(sp_list) < 2:
            # A segment without a ':' separator previously crashed with
            # IndexError on sp_list[1]; skip such malformed segments instead.
            continue
        word = sp_list[0]
        pos = sp_list[1]
        word_list.append(word)
        pos_list.append(pos)
        total_featureset_dict[relation]['right_words'].add(word)
        total_featureset_dict[relation]['p2_right_pos'].add(pos)
    featureset_dict[relation]['right_words'].append(word_list)
    featureset_dict[relation]['p2_right_pos'].append(pos_list)
def gen_total_featureset(relation):
    """Populate the global feature dictionaries from *relation*'s training file.

    Opens ``trainset_prefix + relation`` inside ``data_dir``.  Each line is
    split on '||' into three parts (left / middle / right context) and
    dispatched to the matching ``proc_*_segments`` helper.

    Args:
        relation: relation name; selects the training file and the dict keys.
    """
    filename = trainset_prefix + relation
    file_in = os.path.join(data_dir, filename)
    # NOTE(review): ``file_in.decode('utf-8')`` only works on Python 2 byte
    # strings (Python 3 ``str`` has no .decode()); kept as-is -- confirm the
    # target interpreter before porting.
    # ``with`` guarantees the file is closed even if a proc_*_segments call
    # raises; the original leaked the handle on any exception.
    with open(file_in.decode('utf-8'), 'r') as fi:
        for line in fi:
            seg_list = proc_line(line, '||')
            proc_left_segments(relation, seg_list[0])
            proc_middle_segments(relation, seg_list[1])
            proc_right_segments(relation, seg_list[2])
def gen_training_corpus(text):
    """Build a corpus entry from one tab-separated training line.

    Expected fields (in order): relation, person1, person2, title, has_rel.
    When the has_rel flag is "0" the pair has no actual relation, so the
    relation is forced to "null" before the label lookup.

    Args:
        text: one raw training line, fields separated by tabs.

    Returns:
        A ``corpus`` object for (title, person1, person2, label).
    """
    parts = proc_line(text, "\t")
    relation = parts[0].strip()
    person1 = parts[1].strip()
    person2 = parts[2].strip()
    title = parts[3].strip()
    has_rel = parts[4].strip()  # whether the relation actually holds
    if has_rel == "0":
        relation = "null"
    return corpus(title, person1, person2, fetch_label(relation))
def read_extracted_featureset():
    """Load every extracted feature-set file into ``extracted_featureset_dict``.

    For each name in ``feature_name_list``, reads the FIRST line of
    ``extracted_featset_prefix + name`` inside ``featureset_dir``, splits it
    on ';', drops empty segments, and stores the result as a list keyed by
    the feature name.
    """
    for feature_name in feature_name_list:
        filename = extracted_featset_prefix + feature_name
        file_in = os.path.join(featureset_dir, filename)
        # ``with`` ensures the handle is closed even if a later call raises;
        # the original leaked the file on any exception before fi.close().
        with open(file_in, 'r') as fi:
            line = fi.readline()
        seg_list = proc_line(line, ';')
        # Drop empty segments produced by trailing or doubled ';'.
        extracted_featureset_dict[feature_name] = [seg for seg in seg_list if seg != '']