def split_paragraph(self, par):
    """Split a paragraph into sentences using the trained boundary classifier.

    The input is first normalized (special marks) and entity-masked via the
    regex rules (numbers, URLs, emails, dates, ...), then every
    splitter-candidate character is classified as a sentence boundary or not.
    Masked entities are restored in the result.

    :param par: raw input paragraph (str)
    :return: list of sentence strings; on any failure, ``[par]`` unchanged
    """
    try:
        # Normalize special punctuation, then mask entities so their internal
        # punctuation is not mistaken for sentence boundaries.
        new_par = self.regex_rule.fn_normalize_special_mark(par)
        (paragraph, number, url, url2, email, datetime, hard_rules,
         non_vnese, mark, mark3, mark4) = self.regex_rule.run_regex_predict(new_par)

        features, _ = self.make_feature(paragraph)
        if not features:
            # No splitter candidates: treat the whole input as one sentence.
            return [par]

        labels = self.classifier.predict(features)

        sens = []
        idx = 0        # index into labels: one label per splitter candidate
        pos_start = 0  # start offset of the sentence being accumulated
        for pos_end, c in enumerate(paragraph):
            if Feature.is_splitter_candidate(c):
                if idx < len(labels) and labels[idx] == 1:
                    # Classifier marks this candidate as a sentence boundary.
                    sens.append(paragraph[pos_start:pos_end + 1].strip())
                    pos_start = pos_end + 1
                idx += 1
        if pos_start < len(paragraph):
            sens.append(paragraph[pos_start:].strip())

        # Restore masked entities in one pass over the joined sentences.
        paragraph = '\n'.join(sens)
        paragraph = self.regex_rule.restore_info(
            paragraph, number, url, url2, email, datetime, hard_rules,
            non_vnese, mark, mark3, mark4)
        return paragraph.split('\n')
    except Exception:
        # Best-effort fallback: log the failure and return the untouched
        # input as a single "sentence".  Return a fresh list — the original
        # appended par to a possibly partially-filled `sens`, leaking
        # partial results to the caller on mid-loop failure.
        print(traceback.format_exc())
        return [par]
def split_paragraph(self, par):
    """Split *par* into sentences with the boundary classifier.

    Unlike the normalizing variant, the paragraph goes straight into the
    regex entity-masking step (no special-mark normalization first).

    :param par: raw input paragraph (str)
    :return: list of sentence strings; ``[par]`` when classification fails
    """
    try:
        # Mask entities (numbers, URLs, emails, dates, ...) so their internal
        # punctuation is not mistaken for sentence boundaries.
        (paragraph, number, url, url2, email, datetime, hard_rules,
         non_vnese, mark, mark3, mark4) = self.regex_rule.run_regex_predict(par)

        features, _ = self.make_feature(paragraph)
        if not features:
            return [par]  # nothing to split — whole input is one sentence

        labels = self.classifier.predict(features)

        sens = []
        idx = 0        # one predicted label per splitter candidate
        pos_start = 0  # start offset of the sentence being accumulated
        for pos_end, c in enumerate(paragraph):
            if Feature.is_splitter_candidate(c):
                if idx < len(labels) and labels[idx] == 1:
                    sens.append(paragraph[pos_start:pos_end + 1].strip())
                    pos_start = pos_end + 1
                idx += 1
        if pos_start < len(paragraph):
            sens.append(paragraph[pos_start:].strip())

        # Restore masked entities in one pass over the joined sentences.
        paragraph = '\n'.join(sens)
        paragraph = self.regex_rule.restore_info(
            paragraph, number, url, url2, email, datetime, hard_rules,
            non_vnese, mark, mark3, mark4)
        return paragraph.split('\n')
    except Exception:
        # The original used a bare ``except:`` that silently swallowed every
        # exception (including KeyboardInterrupt/SystemExit) and appended par
        # to a possibly partially-filled list.  Catch only Exception, log the
        # traceback, and return a fresh single-element fallback.
        import traceback
        print(traceback.format_exc())
        return [par]