Esempio n. 1
0
 def split_paragraph(self, par):
     """Split ``par`` into a list of sentence strings.

     The paragraph is first passed through
     ``regex_rule.fn_normalize_special_mark`` and then
     ``regex_rule.run_regex_predict``; the latter returns the text to segment
     plus ten extracted groups that are handed back unchanged to
     ``regex_rule.restore_info`` once the sentence boundaries are known
     (presumably placeholders for numbers, URLs, e-mails, etc. -- confirm
     against ``regex_rule``).  Every character accepted by
     ``Feature.is_splitter_candidate`` consumes one classifier label, and a
     label equal to ``1`` ends the current sentence right after that
     character.

     Args:
         par: The raw paragraph text.

     Returns:
         The list of sentences.  ``[par]`` (the untouched input as a single
         sentence) is returned when no features can be built or when any
         step raises.
     """
     try:
         new_par = self.regex_rule.fn_normalize_special_mark(par)
         (paragraph, number, url, url2, email, date_time, hard_rules,
          non_vnese, mark, mark3, mark4) = \
             self.regex_rule.run_regex_predict(new_par)
         features, _ = self.make_feature(paragraph)
         if not features:
             return [par]
         labels = self.classifier.predict(features)

         sens = []
         idx = 0  # index of the label belonging to the next candidate
         pos_start = 0  # first character of the sentence being built
         for pos_end, c in enumerate(paragraph):
             if not Feature.is_splitter_candidate(c):
                 continue
             # Guard against fewer labels than candidates in the text.
             if idx < len(labels) and labels[idx] == 1:
                 sens.append(paragraph[pos_start:pos_end + 1].strip())
                 pos_start = pos_end + 1
             idx += 1

         # Keep the remainder only if it carries text; a whitespace-only
         # tail would otherwise show up as an empty sentence.
         tail = paragraph[pos_start:].strip()
         if tail:
             sens.append(tail)

         # Restore the extracted groups on the joined text, then split on
         # the newlines that mark the sentence boundaries.
         paragraph = self.regex_rule.restore_info(
             '\n'.join(sens), number, url, url2, email, date_time,
             hard_rules, non_vnese, mark, mark3, mark4)
         return paragraph.split('\n')
     except Exception:
         print(traceback.format_exc())
         # Best-effort fallback: return only the original paragraph, never
         # a mix of partially split sentences and the whole paragraph.
         return [par]
 def split_paragraph(self, par):
     """Split ``par`` into a list of sentence strings.

     ``regex_rule.run_regex_predict`` returns the text to segment plus ten
     extracted groups that are handed back unchanged to
     ``regex_rule.restore_info`` once the sentence boundaries are known
     (presumably placeholders for numbers, URLs, e-mails, etc. -- confirm
     against ``regex_rule``).  Every character accepted by
     ``Feature.is_splitter_candidate`` consumes one classifier label, and a
     label equal to ``1`` ends the current sentence right after that
     character.

     Args:
         par: The raw paragraph text.

     Returns:
         The list of sentences.  ``[par]`` (the untouched input as a single
         sentence) is returned when no features can be built or when any
         step raises an ``Exception``; ``KeyboardInterrupt`` and
         ``SystemExit`` are allowed to propagate.
     """
     try:
         (paragraph, number, url, url2, email, date_time, hard_rules,
          non_vnese, mark, mark3, mark4) = \
             self.regex_rule.run_regex_predict(par)
         features, _ = self.make_feature(paragraph)
         if not features:
             return [par]
         labels = self.classifier.predict(features)

         sens = []
         idx = 0  # index of the label belonging to the next candidate
         pos_start = 0  # first character of the sentence being built
         for pos_end, c in enumerate(paragraph):
             if not Feature.is_splitter_candidate(c):
                 continue
             # Guard against fewer labels than candidates in the text.
             if idx < len(labels) and labels[idx] == 1:
                 sens.append(paragraph[pos_start:pos_end + 1].strip())
                 pos_start = pos_end + 1
             idx += 1

         # Keep the remainder only if it carries text; a whitespace-only
         # tail would otherwise show up as an empty sentence.
         tail = paragraph[pos_start:].strip()
         if tail:
             sens.append(tail)

         # Restore the extracted groups on the joined text, then split on
         # the newlines that mark the sentence boundaries.
         paragraph = self.regex_rule.restore_info(
             '\n'.join(sens), number, url, url2, email, date_time,
             hard_rules, non_vnese, mark, mark3, mark4)
         return paragraph.split('\n')
     except Exception:
         # Deliberate best-effort fallback, but no longer silent and no
         # longer catching KeyboardInterrupt/SystemExit.
         import logging
         logging.getLogger(__name__).exception(
             "split_paragraph failed; returning the paragraph unsplit")
         # Return only the original paragraph, never a mix of partially
         # split sentences and the whole paragraph.
         return [par]