def load_integer_features(self, data):
    """Gives each POS tag in data a number."""
    integer_features = []
    pos_feature = np.array([])
    for txt in data.textfiles:
        if self.annotations == "union":
            txt.compute_union_relations()
        elif self.annotations == "intersected":
            txt.compute_intersection_relations()
        for rel in txt.relations:
            f = Feature(rel)
            # Build arrays of integers with which we can fit the encoder.
            # Standardize because f.get_pos_$x() doesn't have to be of
            # length self.number_tags_per_feature/2
            standardized_pos_target = self.standardize_sub_pos_feature(f.get_pos_target())
            standardized_pos_source = self.standardize_sub_pos_feature(f.get_pos_source())
            # Concatenate the two plain POS tag arrays from target and source event
            pos_feature = np.concatenate((standardized_pos_target, standardized_pos_source))
            # Transform this array into the corresponding array of integers
            integer_feature = self.pos_tags_to_integers(pos_feature)
            integer_features.append(integer_feature)
    return integer_features
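# A minimal sketch of the two helpers used above, assuming self.pos_tags holds
# the tag vocabulary from load_pos_tags() (including the 'BL' blank tag) and
# that self.number_tags_per_feature is even; the project's real
# implementations may differ.
def standardize_sub_pos_feature(self, sub_pos_feature):
    """Pad with 'BL' (or truncate) so every sub-feature has exactly
    self.number_tags_per_feature / 2 entries."""
    length = self.number_tags_per_feature // 2
    sub = list(sub_pos_feature)[:length]
    sub += ['BL'] * (length - len(sub))
    return np.array(sub)

def pos_tags_to_integers(self, pos_feature):
    """Map every tag to its index in the known tag vocabulary."""
    tag_to_int = {tag: i for i, tag in enumerate(self.pos_tags)}
    return np.array([tag_to_int[tag] for tag in pos_feature])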
def split_paragraph(self, par):
    sens = []
    try:
        paragraph, number, url, url2, email, datetime, hard_rules, non_vnese, mark, mark3, mark4 = \
            self.regex_rule.run_regex_predict(par)
        features, _ = self.make_feature(paragraph)
        if not features:
            sens.append(par)
            return sens
        labels = self.classifier.predict(features)
        idx = 0
        pos_start = 0
        pos_end = 0
        for c in paragraph:
            if Feature.is_splitter_candidate(c):
                if idx < len(labels) and labels[idx] == 1:
                    sens.append(paragraph[pos_start:pos_end + 1].strip())
                    pos_start = pos_end + 1
                idx += 1
            pos_end += 1
        if pos_start < len(paragraph):
            sens.append(paragraph[pos_start:].strip())
        paragraph = '\n'.join(sens)
        paragraph = self.regex_rule.restore_info(paragraph, number, url, url2, email,
                                                 datetime, hard_rules, non_vnese, mark,
                                                 mark3, mark4)
        sens = paragraph.split('\n')
        return sens
    except Exception:
        # Fall back to the unsplit paragraph on any error
        return [par]
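# The scan above keeps two cursors: pos_end walks every character while idx
# advances only on splitter candidates, so labels[idx] always lines up with
# the idx-th candidate. A self-contained sketch of the same walk with a toy
# label list (the candidate test and labels are stand-ins):
def toy_split(paragraph, labels, is_splitter_candidate=lambda c: c in u'.!?'):
    sens, idx, pos_start, pos_end = [], 0, 0, 0
    for c in paragraph:
        if is_splitter_candidate(c):
            if idx < len(labels) and labels[idx] == 1:
                sens.append(paragraph[pos_start:pos_end + 1].strip())
                pos_start = pos_end + 1
            idx += 1
        pos_end += 1
    if pos_start < len(paragraph):
        sens.append(paragraph[pos_start:].strip())
    return sens

# toy_split(u"One. Two? Three.", [1, 0]) -> [u'One.', u'Two? Three.']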
def load_stems(self, data):
    """Returns all word stems used in the parsed XML data."""
    # Get all word stems
    stems = np.array([])
    for txt in data.textfiles:
        if self.annotations == "union":
            txt.compute_union_relations()
        elif self.annotations == "intersected":
            txt.compute_intersection_relations()
        for rel in txt.relations:
            f = Feature(rel)
            stems = np.append(stems, [f.get_stem_target()])
            stems = np.append(stems, [f.get_stem_source()])
    stems = np.unique(stems)
    return stems
def load_pos_tags(self, data):
    """Loads all POS tags used in the pos_surrounding area around an event."""
    pos_tags = np.array([])
    for txt in data.textfiles:
        if self.annotations == "union":
            txt.compute_union_relations()
        elif self.annotations == "intersected":
            txt.compute_intersection_relations()
        for rel in txt.relations:
            f = Feature(rel)
            # Collect all POS tags from the data
            pos_tags = np.concatenate((pos_tags, f.get_pos_target()))
            pos_tags = np.concatenate((pos_tags, f.get_pos_source()))
    pos_tags = np.unique(pos_tags)
    # Append a blank tag which will be used for filling up features
    # which don't have enough elements
    pos_tags = np.append(pos_tags, 'BL')
    return pos_tags
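# The tag vocabulary built above is typically what an integer or one-hot
# encoder is fitted on. A hedged sketch with scikit-learn's LabelEncoder (an
# assumption; the project may map tags to integers itself):
from sklearn.preprocessing import LabelEncoder

encoder = LabelEncoder()
encoder.fit(np.array(['BL', 'DT', 'NN', 'VB']))  # toy tag vocabulary
print(encoder.transform(['NN', 'BL']))           # -> [2 0]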
def __init__(self, K, filepath, resultpath):
    self.K = K
    self.feature = Feature(filepath, resultpath)
    self.pat_edu = dict()
    # p-value: first, median, last
    self.p_value = list()  # feature name, p-value tuple (first, median, last, diff)
    self.mean = dict()  # feature name : list(cluster1_mean, cluster1_std, ...)
                        # mean of the last state for temporal features
    self.mean_first = dict()  # feature name : list(cluster1_mean, cluster1_std, ...)
                              # mean of the first state for temporal features
    self.mean_median = dict()
    self.mean_total = dict()  # feature name : (total mean, total std)
    self.mean_follow_up = dict()  # feature name : mean follow-up time in this feature
mode = "train" if data_loader.is_train else "valid" print( f"epoch {epoch_idx:02} {mode} score > {score:.4} ({int(timer() - epoch_start)}s)" ) total_loss /= len(data_loader.dataset) return score, total_loss if __name__ == "__main__": config = get_config() # Vocab load sp = spm.SentencePieceProcessor() feature = Feature() if config.mode == "train": data_dir = os.path.join(nsml.DATASET_PATH, "train", "train_data") build_vocab(os.path.join(data_dir, config.train_file_name)) sp.load('vocab.model') feature.init_idf(os.path.join(data_dir, config.train_file_name)) # random seed random.seed(config.seed) np.random.seed(config.seed) torch.random.manual_seed(config.seed) if torch.cuda.is_available(): torch.cuda.manual_seed(config.seed)
class DataIO(object):
    def __init__(self, filepath, resultpath):
        self.filepath = filepath
        self.resultpath = resultpath
        self.patient_id = list()  # pat_id
        self.feature = Feature(filepath, resultpath)  # feature_info and other statistics
        self.patient_info = list()  # list of Patient()

    def load_patient_id(self):
        f = codecs.open(self.filepath + 'patient_id.csv', 'r', 'utf-8')
        reader = csv.reader(f)
        line_ctr = 0
        for row in reader:
            # table title
            if line_ctr < 1:
                table_ttl = dict(zip(row, range(len(row))))
                line_ctr += 1
                continue
            pid = row[table_ttl['PATNO']]
            self.patient_id.append(pid)
            line_ctr += 1
        f.close()

    def load_demographics(self):
        f = codecs.open(self.filepath + 'demographics/' + 'patient_demo.csv', 'r', 'utf-8')
        reader = csv.reader(f)
        line_ctr = 0
        for row in reader:
            # table title
            if line_ctr < 1:
                table_ttl = dict(zip(row, range(len(row))))
                line_ctr += 1
                continue
            if len(row) == 0:
                continue
            pval = Patient()
            pval.id = row[table_ttl['ID']]
            pval.age = row[table_ttl['AGE']]
            pval.gender = row[table_ttl['GENDER']]
            pval.edu_year = row[table_ttl['EDUCATION YEAR']]
            pval.duration = row[table_ttl['DURATION(MONTH)']]
            pval.diagnosis = row[table_ttl['DIAGNOSIS']]
            self.patient_info.append(pval)
            line_ctr += 1
        f.close()

    def load_feature(self, ftype=None, fname=None, featname=None):
        self.feature.load_feature(ftype, fname, featname)

    def read_data(self):
        self.load_patient_id()
        self.load_demographics()
        self.load_feature('Motor', 'MDS UPDRS PartI')
        self.load_feature('Motor', 'MDS UPDRS PartII')
        self.load_feature('Motor', 'MDS UPDRS PartIII')
        self.load_feature('Motor', 'MDS UPDRS PartIV')
        self.load_feature('Non-Motor', 'BJLO')
        self.load_feature('Non-Motor', 'ESS')
        self.load_feature('Non-Motor', 'GDS')
        self.load_feature('Non-Motor', 'HVLT')
        self.load_feature('Non-Motor', 'LNS')
        self.load_feature('Non-Motor', 'MoCA')
        self.load_feature('Non-Motor', 'QUIP')
        self.load_feature('Non-Motor', 'RBD')
        self.load_feature('Non-Motor', 'SCOPA-AUT')
        self.load_feature('Non-Motor', 'SF')
        self.load_feature('Non-Motor', 'STAI')
        self.load_feature('Non-Motor', 'SDM')
        self.load_feature('Non-Motor', 'MCI')
        self.load_feature('Biospecimen', 'DNA')
        self.load_feature('Biospecimen', 'CSF', 'Total tau')
        self.load_feature('Biospecimen', 'CSF', 'Abeta 42')
        self.load_feature('Biospecimen', 'CSF', 'p-Tau181P')
        self.load_feature('Biospecimen', 'CSF', 'CSF Alpha-synuclein')
        self.load_feature('Image', 'DaTScan SBR')
        self.load_feature('Image', 'MRI')
        self.load_feature('Medication', 'MED USE')
        return self.feature.get_feature_name()
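# A minimal usage sketch (the paths are placeholders):
io = DataIO('data/', 'results/')
feature_names = io.read_data()  # loads ids, demographics and every feature table
print(feature_names)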
class SentenceSpliter():
    def __init__(self, path="models/model.dump", is_training=False, new_rule_path=None):
        self.classifier = None
        self.feature_model = None
        self.multi_newline_regex = re.compile("\n+")
        self.c_dir = os.path.abspath(os.path.dirname(__file__))
        self.model_path = "%s/%s" % (self.c_dir, path)
        if not is_training:
            if os.path.exists(self.model_path):
                #print "Loading model..."
                model = utils.pickle_load(self.model_path)
                self.classifier = model.classifier
                self.feature_model = model.feature_model
                if new_rule_path is not None:
                    self.load_custom_hard_rule(new_rule_path)
            else:
                print "Unable to load the spliter model: %s" % path
                exit(-1)

    def load_normal_data(self, feature_list, label_list, sen_list=None):
        sens = loading_data.load_sentence()
        num_sen = len(sens)
        print "Loading a total of %s normal sentences." % num_sen
        for i in xrange(num_sen - 1):
            sen = sens[i]
            spliter_id = len(sen) - 1
            # Positive example: a single sentence ending at its last character
            feature, _ = self.feature_model.gen_feature_vector(sen, spliter_id)
            feature_list.append(feature)
            label_list.append(1)
            if sen_list is not None:
                sen_list.append(sen)
            # Positive example: the same split point inside a merged sentence pair
            sen_merge = " ".join([sens[i], sens[i + 1]])
            feature, _ = self.feature_model.gen_feature_vector(sen_merge, spliter_id)
            feature_list.append(feature)
            label_list.append(1)
            if sen_list is not None:
                sen_list.append(sen_merge)
            # Negative examples: every other splitter candidate inside the sentence
            idx = 0
            for c in sen[:-1]:
                if Feature.is_spliter_candidate(c):
                    feature, _ = self.feature_model.gen_feature_vector(sen, idx)
                    feature_list.append(feature)
                    label_list.append(0)
                    if sen_list is not None:
                        sen_list.append(sen)
                idx += 1

    def load_custom_hard_rule(self, path):
        rules = loading_data.load_spliter_rules(path)
        for rule in rules:
            if rule[0] == "#":
                continue
            elif rule[0] == "h":
                rule = rule[1:]
                print "Add a hard rule regex: %s" % rule
                self.feature_model.add_none_spliter_regrex(rule, True)
                continue

    def loading_forcing_spliter_rule(self):
        rules = loading_data.load_spliter_rules(loading_data.raw_forcing_spliter_path)
        for rule in rules:
            if rule[0] == "#":
                continue
            elif rule[0] == "h":
                rule = rule[1:]
                print "Add a hard forcing rule regex: %s" % rule
                self.feature_model.add_forcing_splitter_regrex(rule)

    def loading_none_spliter_rule(self, feature_list, label_list, sen_list=None):
        rules = loading_data.load_spliter_rules()
        print "Loading rules."
        for rule in rules:
            if rule[0] == "#":
                continue
            if rule[0] == "r":
                rule = rule[1:]
                print "Add a soft regex: %s" % rule
                self.feature_model.add_none_spliter_regrex(rule)
                continue
            elif rule[0] == "h":
                rule = rule[1:]
                print "Add a hard rule regex: %s" % rule
                self.feature_model.add_none_spliter_regrex(rule, True)
                continue
            idx = 0
            for c in rule:
                if Feature.is_spliter_candidate(c):
                    feature, _ = self.feature_model.gen_feature_vector(rule, idx, is_forced=True)
                    feature_list.append(feature)
                    label_list.append(0)
                    if sen_list is not None:
                        sen_list.append(rule)
                idx += 1
        #print Feature.NONE_SPLITER_DICT

    def train(self):
        self.feature_model = Feature()
        feature_list = []
        label_list = []
        sen_list = []
        self.loading_none_spliter_rule(feature_list, label_list, sen_list)
        self.loading_forcing_spliter_rule()
        self.load_normal_data(feature_list, label_list, sen_list)
        self.classifier = LogisticRegression(verbose=False)
        print "Learning..."
        self.classifier.fit(feature_list, label_list)
        print "Saving..."
        utils.pickle_save(self, self.model_path)
        print "Done"
        print "Test..."
        #f = open("wrong.dat", "w")
        predicted_labels = self.classifier.predict(feature_list)
        ll = len(predicted_labels)
        cc = 0
        for i in xrange(ll):
            if label_list[i] == 0 and predicted_labels[i] == 1:
                cc += 1
                #print sen_list[i]
                #f.write("%s\n" % sen_list[i])
        #f.close()
        print cc, ll, cc * 1.0 / ll

    def __split_par(self, par, is_debug=False):
        list_sens = []
        list_features = []
        list_candidates = []
        list_hard_rule_none_spliter_idx = []
        list_hard_rule_forcing_spliter_idx = []
        idx = 0
        for c in par:
            if Feature.is_spliter_candidate(c):
                list_candidates.append(idx)
                feature, is_hard = self.feature_model.gen_feature_vector(par, idx)
                if is_hard > 0:
                    list_hard_rule_none_spliter_idx.append(len(list_candidates) - 1)
                elif is_hard < 0:
                    list_hard_rule_forcing_spliter_idx.append(len(list_candidates) - 1)
                if is_debug:
                    print feature
                list_features.append(feature)
            idx += 1
        if is_debug:
            print list_candidates
        if len(list_candidates) == 0:
            list_sens.append(par)
            return list_sens
        #print list_features
        #list_features = np.array(list_features)
        #print "Shape: ", list_features.shape
        labels = self.classifier.predict(list_features)
        # Hard rules override the classifier's decisions
        for l in list_hard_rule_none_spliter_idx:
            labels[l] = 0
        for l in list_hard_rule_forcing_spliter_idx:
            labels[l] = 1
        list_true_spliters = [-1]
        for i in xrange(len(labels)):
            if labels[i] == 1:
                list_true_spliters.append(list_candidates[i])
        # Keep any trailing text after the last accepted splitter
        if list_true_spliters[-1] != len(par) - 1:
            list_true_spliters.append(len(par) - 1)
        if is_debug:
            print list_true_spliters
        if len(list_true_spliters) > 1:
            for i in xrange(len(list_true_spliters) - 1):
                list_sens.append(par[list_true_spliters[i] + 1:list_true_spliters[i + 1] + 1].strip())
        else:
            list_sens.append(par)
        return list_sens

    def split(self, doc, is_debug=False):
        doc = doc.replace("\r", "")
        doc = self.multi_newline_regex.sub("\n", doc)
        paragraphs = doc.split("\n")
        sens = []
        for par in paragraphs:
            if len(par) < 1:
                continue
            par_sens = self.__split_par(par, is_debug)
            for sen in par_sens:
                sens.append(sen)
        return sens
def parse_Features(data, new=False, annotations="union",
                   features=["pos", "stem", "aspect", "tense", "distance",
                             "similarity", "polarity", "modality"],
                   distance=False):
    """Extracts the features out of the dataset and returns a list of
    features with the corresponding classes.

    Args:
        data (list): The parsed data from fables-100-temporal-dependency.xml.
        new (bool): With new=True a new calculation of Pos() and Stem() can be
            enforced. Otherwise they will be loaded from a file.
        annotations (str): Look at all relations ("union") or only at the
            relations the annotators have in common ("intersected").
        features (list): Determines which features should be activated.
            Possible values: "pos", "stem", "aspect", "tense", "distance",
            "similarity", "polarity", "modality".
        distance (bool): If set to True, parse_Features() will also return
            distance information for the data (needed for evaluation).
    """
    # Only compute Pos() and Stem() when the new flag is set or no cached
    # file exists; otherwise load them from the pickle
    if "pos" in features or "stem" in features:
        if new or not os.path.isfile("save.p"):
            pos = Pos(data, 6, annotations)
            stem = Stem(data, annotations)
            pickle.dump((pos, stem), open("save.p", "wb"))
        else:
            pos, stem = pickle.load(open("save.p", "rb"))
    if distance:
        distance_diff = []
    X = []
    y = np.array([], dtype=int)
    for txt in data.textfiles:
        # Union or intersected relations?
        if annotations == "union":
            txt.compute_union_relations()
        elif annotations == "intersected":
            txt.compute_intersection_relations()
        for rel in txt.relations:
            f = Feature(rel)
            feature = []
            # Make polarity feature
            if "polarity" in features:
                feature = np.concatenate((feature, [f.get_polarity()]))
            # Make distance feature
            if "distance" in features:
                feature = np.concatenate((feature, f.get_distance()))
            # Make POS feature
            if "pos" in features:
                pos_feature = pos.transform(f.get_pos_target(), f.get_pos_source())
                pos_feature = pos_feature.toarray()[0]
                feature = np.concatenate((feature, pos_feature))
            # Make stem feature
            if "stem" in features:
                stem_feature = stem.transform(f.get_stem_source(), f.get_stem_target())
                stem_feature = stem_feature[0]
                feature = np.concatenate((feature, stem_feature))
            # Make similarity feature
            if "similarity" in features:
                feature = np.concatenate((feature, [f.get_similarity_of_words()]))
            # Make modality feature
            if "modality" in features:
                feature = np.concatenate((feature, [f.get_modality()]))
            # Make aspect feature
            if "aspect" in features:
                feature = np.concatenate((feature, f.get_aspect()))
            # Make tense feature
            if "tense" in features:
                feature = np.concatenate((feature, f.get_tense()))
            # Append feature to X
            X.append(feature)
            y = np.append(y, [f.get_class()])
            # Append distance information if needed
            if distance:
                distance_diff.append(f.get_distance_diff())
    if distance:
        return (X, y, distance_diff)
    return (X, y)
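# A sketch of how parse_Features() is typically driven; parse_XML and the
# file paths come from this project, while the classifier choice is an
# assumption for illustration.
from sklearn.linear_model import LogisticRegression

data = parse_XML("fables-100-temporal-dependency.xml",
                 "McIntyreLapata09Resources/fables")
X, y = parse_Features(data, annotations="union",
                      features=["pos", "stem", "tense", "aspect"])
clf = LogisticRegression()
clf.fit(X, y)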
def get_sentences(number, class_id, annotations="intersected"):
    """Prints up to number sentences which have the relation type class_id.

    Useful if you need to get an overview over sentences with a certain
    temporal relation.
    """
    data = parse_XML("fables-100-temporal-dependency.xml",
                     "McIntyreLapata09Resources/fables")
    i = 0
    go_to_next_textfile = False
    for txt in data.textfiles:
        go_to_next_textfile = False
        if annotations == "union":
            txt.compute_union_relations()
        elif annotations == "intersected":
            txt.compute_intersection_relations()
        for rel in txt.relations:
            f = Feature(rel)
            if f.get_class() == class_id and not go_to_next_textfile:
                # Stop if number relations are reached
                if i >= number:
                    break
                i += 1
                if rel.target.sentence == rel.source.sentence:
                    print "---------------"
                    print "Source event: " + rel.source.content
                    print "Target event: " + rel.target.content
                    print rel.target.sentence
                    print
                    print "Source Surrounding: " + rel.source.surrounding
                    print "Target Surrounding: " + rel.target.surrounding
                else:
                    print "---------------"
                    print "Source event: " + rel.source.content
                    print "Whole sentence: " + rel.source.sentence
                    print "Surrounding: " + rel.source.surrounding
                    print
                    print "Target event: " + rel.target.content
                    print "Whole sentence: " + rel.target.sentence
                    print "Surrounding: " + rel.target.surrounding
                tense_source = f.get_tense_source()
                tense_target = f.get_tense_target()
                if tense_source == 0:
                    print "Estimated tense for source event: None"
                elif tense_source == 1:
                    print "Estimated tense for source event: Present"
                elif tense_source == 2:
                    print "Estimated tense for source event: Past"
                elif tense_source == 3:
                    print "Estimated tense for source event: Future"
                if tense_target == 0:
                    print "Estimated tense for target event: None"
                elif tense_target == 1:
                    print "Estimated tense for target event: Present"
                elif tense_target == 2:
                    print "Estimated tense for target event: Past"
                elif tense_target == 3:
                    print "Estimated tense for target event: Future"
                aspect_source = f.get_aspect_source()
                aspect_target = f.get_aspect_target()
                if aspect_source == 0:
                    print "Estimated aspect for source event: None"
                elif aspect_source == 1:
                    print "Estimated aspect for source event: Progressive"
                elif aspect_source == 2:
                    print "Estimated aspect for source event: Perfect"
                elif aspect_source == 3:
                    print "Estimated aspect for source event: Perfect Progressive"
                if aspect_target == 0:
                    print "Estimated aspect for target event: None"
                elif aspect_target == 1:
                    print "Estimated aspect for target event: Progressive"
                elif aspect_target == 2:
                    print "Estimated aspect for target event: Perfect"
                elif aspect_target == 3:
                    print "Estimated aspect for target event: Perfect Progressive"
                print "Distance between events: " + str(f.get_distance())
                print "---------------"
                print
                # Get next sentence from the next text
                go_to_next_textfile = True
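# Example call: print up to five intersected-annotation sentence pairs whose
# relation class is 1 (the class id encoding is project-specific):
get_sentences(5, 1)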
class SentenceSpliter():
    def __init__(self, is_training=False):
        self.classifier = None
        self.feature_model = None
        self.regex_rule = Regex()
        if not is_training:
            self.classifier = utils.load(os.path.join('vnspliter/model', 'model.pkl'))
            if self.classifier is None:
                print "Unable to load model!"
                exit(-1)

    def make_feature(self, file=None):
        features_list = []
        label_list = []
        self.feature_model = Feature()
        if file is None:
            return features_list, label_list
        features_list, label_list = self.feature_model.gen_feature_matrix(file)
        return features_list, label_list

    def split_paragraph(self, par):
        sens = []
        try:
            new_par = self.regex_rule.fn_normalize_special_mark(par)
            paragraph, number, url, url2, email, datetime, hard_rules, non_vnese, mark, mark3, mark4 = \
                self.regex_rule.run_regex_predict(new_par)
            features, _ = self.make_feature(paragraph)
            if not features:
                sens.append(par)
                return sens
            labels = self.classifier.predict(features)
            idx = 0
            pos_start = 0
            pos_end = 0
            for c in paragraph:
                if Feature.is_splitter_candidate(c):
                    if idx < len(labels) and labels[idx] == 1:
                        sens.append(paragraph[pos_start:pos_end + 1].strip())
                        pos_start = pos_end + 1
                    idx += 1
                pos_end += 1
            if pos_start < len(paragraph):
                sens.append(paragraph[pos_start:].strip())
            paragraph = '\n'.join(sens)
            paragraph = self.regex_rule.restore_info(paragraph, number, url, url2, email,
                                                     datetime, hard_rules, non_vnese, mark,
                                                     mark3, mark4)
            # paragraph = self.regex_rule.normalize_special_mark.sub(u' \g<special_mark> ', paragraph)
            # paragraph = self.regex_rule.normalize_space.sub(u' ', paragraph)
            sens = paragraph.split('\n')
            return sens
        except Exception:
            print(traceback.format_exc())
            # Fall back to the unsplit paragraph on any error
            return [par]

    def split(self, pars):
        sens = []
        try:
            pars = pars.replace(u'\r', u'\n')
            pars = re.compile(u'\n+').sub(u'\n', pars)
            pars = pars.split('\n')
            for par in pars:
                if par.strip():
                    s = self.split_paragraph(par)
                    sens += s
            return sens
        except Exception:
            sens.append(pars)
            return sens
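# Hedged usage sketch for this variant, which loads a ready-made model.pkl
# instead of training (the Vietnamese sample text is made up):
if __name__ == "__main__":
    spliter = SentenceSpliter()
    for sen in spliter.split(u"Hom nay troi dep. Chung ta di choi!"):
        print sen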