def analyse_header(self, s, sentences, index):
    """Process one header sentence and report whether the header has ended.

    The checks run in a fixed priority order and only the first one that
    matches is acted upon.

    Args:
        s: The sentence currently being examined.
        sentences: The full list of sentences; handed to the helpers that
            consume several consecutive sentences.
        index: Position of ``s`` inside ``sentences``.

    Returns:
        A ``(is_end, index)`` tuple. ``is_end`` is True only when
        ``is_end_header`` accepts the sentence. ``index`` is the incoming
        index unless a multi-sentence helper returned a different one.
    """
    if self.set_contract_date(s, index):
        # The sentence that supplied the date is additionally checked for
        # the combined society/employer pattern.
        if self._is_society_employer_sentence(s):
            self._add_society_employer(s)
        return False, index

    if not self._title and self._is_title(s):
        self._title = s
        # add to extraction
        crf_utils.add_clause_string(self.CONTRACT_NAME, self._title, self._results)
        return False, index

    if self._is_society_employer_sentence(s):
        return False, self._add_society_employer_sentences(sentences, index)

    if self._is_society(s):
        return False, self._add_society_info(sentences, index)

    if self._is_employer(s):
        return False, self._add_employer_info(sentences, index)

    if not self._has_table_contents and self.is_table_contents(s):
        return False, self.pass_table_contents(sentences, index + 1)

    # Last resort: ask whether this sentence closes the header.
    reached_end = bool(self.is_end_header(s, sentences, index + 1))
    return reached_end, index
def _parse_output1(self, output, results): sentence = '' for s in output: s = s.decode('utf-8') clause_name = '' sentences = {} for line in s.splitlines(): if line.strip(): pieces = line.split('\t') key = pieces[-1] n = 1 clause_name = crf_utils.get_tagging_name( key, self.keywords, self._focus) if (clause_name in sentences.keys()): n += sentences[clause_name] sentences[clause_name] = n sentence += pieces[0] else: clause_name = self._get_clause_name_for_line(sentences) crf_utils.add_clause_string(clause_name, sentence, results) sentences = {} sentence = '' if len(sentence) > 0: clause_name = self._get_clause_name_for_line(sentences) crf_utils.add_clause_string(clause_name, sentence, results)
def set_contract_date(self, s, index):
    """Try to capture the contract date from sentence ``s``.

    Nothing happens when a date has already been stored. Otherwise the
    result of ``self.get_contract_time(s, index)`` is stored in
    ``self._contract_date`` and, when truthy, also added to the extraction
    results under ``self.CONTRACT_DATE``.

    Returns:
        True only when this call found and recorded a date, else False.
    """
    if self._contract_date:
        return False

    self._contract_date = self.get_contract_time(s, index)
    if not self._contract_date:
        return False

    # add to extraction
    crf_utils.add_clause_string(self.CONTRACT_DATE, self._contract_date, self._results)
    return True
def _add_employer_info(self, sentences, index):
    """Collect the employer sentence at ``index`` and its follow-up lines.

    The sentence at ``index`` always starts ``self.employer``. Following
    sentences are appended for as long as ``self._is_employer_info``
    accepts them. Every collected sentence is also added to the extraction
    results under ``self.CONTRACT_AB``.

    Args:
        sentences: List of all sentences.
        index: Position of the first employer sentence.

    Returns:
        The index of the last collected sentence when a non-matching
        sentence stopped the scan. When the scan stops because the last
        sentence read was empty or the list is exhausted, the index just
        past the last sentence examined.
    """
    s = sentences[index]
    self.employer = [s]
    crf_utils.add_clause_string(self.CONTRACT_AB, s, self._results)
    index += 1
    # The bounds check prevents an IndexError when the employer lines run
    # up to the very end of the sentence list.
    while s and index < len(sentences):
        s = sentences[index]
        if not self._is_employer_info(s):
            # Step back so the returned index names the last line consumed.
            index -= 1
            break
        self.employer.append(s)
        crf_utils.add_clause_string(self.CONTRACT_AB, s, self._results)
        index += 1
    return index
def _add_assignee(self, s, sentences, index): if len(s) < 6 : d = 1 else : d = 0 ss = sentences[index+d] self.assignee = [ss] crf_utils.add_clause_string(self.CONTRACT_AB, ss, self._results) d += 1 for i in range(d, 5) : ss = sentences[index + i] if self._is_transfor_info(ss) : self.assignee.append(ss) else: break
def _add_society_employer_sentences(self, sentences, index): self.employer = [] self.society = [] index += 1 while True: s = sentences[index] if self._is_employer_sentence(s): self.employer.append(s) crf_utils.add_clause_string(self.CONTRACT_AB, s, self._results) elif self._is_society_sentence(s): self.society.append(s) crf_utils.add_clause_string(self.CONTRACT_AB, s, self._results) else: break index += 1 return index
def analyse_header(self, s, sentences, index):
    """Process one header sentence and report whether the header has ended.

    The checks run in a fixed priority order and only the first one that
    matches is acted upon: contract date, title, transferor, assignee,
    table of contents, end of header.

    Args:
        s: The sentence currently being examined.
        sentences: The full list of sentences.
        index: Position of ``s`` inside ``sentences``.

    Returns:
        A ``(is_end, index)`` tuple. ``is_end`` is True only when
        ``is_end_header`` accepts the sentence. ``index`` changes only when
        the table of contents is skipped.
    """
    if self.set_contract_date(s, index):
        # The date was recorded by set_contract_date; nothing more to do.
        return False, index

    if not self._title and self._find_in_info(s, self.TITLE_NAME):
        self._title = s
        crf_utils.add_clause_string(self.CONTRACT_NAME, self._title, self._results)
        return False, index

    if not self.transfor and self._is_transfor(s):
        self._add_transfor(s, sentences, index)
        return False, index

    if not self.assignee and self._is_assignee(s):
        self._add_assignee(s, sentences, index)
        return False, index

    if not self._has_table_contents and self.is_table_contents(s):
        return False, self.pass_table_contents(sentences, index + 1)

    reached_end = bool(self.is_end_header(s))
    return reached_end, index
def _parse_output(self, document, results): sentence = '' clause_name = '' sentences = {} for line in document.splitlines(): if line.strip(): n = 1 pieces = line.split('\t') key = pieces[-1] clause_name = crf_utils.get_tagging_name( key, self.dtn_doc._keywords, self.dtn_doc._focus_points) if (clause_name in sentences.keys()): n += sentences[clause_name] sentences[clause_name] = n sentence += pieces[0] else: clause_name = self._get_clause_name_for_line(sentences) crf_utils.add_clause_string(clause_name, sentence, results) sentences = {} sentence = '' if len(sentence) > 0: clause_name = self._get_clause_name_for_line(sentences) crf_utils.add_clause_string(clause_name, sentence, results)
def _add_society_employer(self, sentence): start = sentence.find(self.SOC_START) if start < 0: start = 0 end = sentence.find(self.EMP_START) if end < 0: end = len(sentence) - 1 nb = sentence.find(self.TIME_START) if nb < end: nb = len(sentence) - 1 soc = sentence[start + 1:end] tab = soc.split(',') self.society = [] if tab and len(tab): for s in tab: s = s.strip() if len(s) > 0: self.society.append(s) crf_utils.add_clause_string(self.CONTRACT_AB, s, self._results) else: self.society.append(soc) crf_utils.add_clause_string(self.CONTRACT_AB, soc, self._results) emp = sentence[end + 1:nb] tab = emp.split(',') self.employer = [] if tab and len(tab): for s in tab: s = s.strip() if len(s) > 0: self.employer.append(s) crf_utils.add_clause_string(self.CONTRACT_AB, s, self._results) else: self.employer.append(emp) crf_utils.add_clause_string(self.CONTRACT_AB, emp, self._results)
def _add_society_info(self, sentences, index):
    """Collect the society sentence at ``index`` and its follow-up lines.

    The sentence at ``index`` always starts ``self.society``. A following
    sentence is appended when ``self._is_society_info`` accepts it, or when
    the previously collected sentence ends with a colon. Every collected
    sentence is also added to the extraction results under
    ``self.CONTRACT_AB``.

    Args:
        sentences: List of all sentences.
        index: Position of the first society sentence.

    Returns:
        The index of the last collected sentence when a non-matching
        sentence stopped the scan. When the scan stops because the last
        sentence read was empty or the list is exhausted, the index just
        past the last sentence examined.
    """
    s = sentences[index]
    self.society = [s]
    crf_utils.add_clause_string(self.CONTRACT_AB, s, self._results)
    index += 1
    prev = s
    # The bounds check prevents an IndexError when the society lines run
    # up to the very end of the sentence list.
    while s and index < len(sentences):
        s = sentences[index]
        if not self._is_society_info(s):
            # NOTE(review): both colon checks are identical as written;
            # presumably one was meant to be a different colon character -
            # confirm against the original file.
            after_colon = prev.endswith(':') or prev.endswith(':')
            if not after_colon:
                # Step back so the returned index names the last line
                # consumed.
                index -= 1
                break
        self.society.append(s)
        crf_utils.add_clause_string(self.CONTRACT_AB, s, self._results)
        prev = s
        index += 1
    return index