Example #1
0
    def tagging(self, filename, outputfile):
        """Read *filename* and run ``file_tagging`` on its documents.

        A name that does not end in ``.txt`` is first handed to
        ``docutonelocate.convert_file`` and the path it returns is read
        instead. ``outputfile`` is stored on ``self.outputfile`` after the
        input has been read and before tagging starts.
        """
        source_path = filename
        if not source_path.endswith(".txt"):
            # Not a text file: convert it and read the converted file.
            source_path = docutonelocate.convert_file(filename)

        loaded = self._read_file(source_path)
        self.outputfile = outputfile
        self.file_tagging(loaded)
Example #2
0
    def file_clean(self, filename):
        """Print each sentence of the fusion document built from *filename*.

        A name that does not end in ``.txt`` is converted with
        ``docutonelocate.convert_file`` first. Every sentence yielded by
        ``LawDocument.get_fusion_document`` is printed on its own line with
        its items joined by single spaces.
        """
        from docutone.core.document import LawDocument

        text_path = (filename if filename.endswith(".txt")
                     else docutonelocate.convert_file(filename))
        sentences = LawDocument().get_fusion_document(text_path)

        for tokens in sentences:
            print(' '.join(tokens))
Example #3
0
    def file_named_tag(self, filename):
        """Collect named tags for every sentence of *filename* and write them.

        The input is converted with ``docutonelocate.convert_file`` unless
        its name ends in ``.txt``. ``self.new_ner`` is reset to an empty dict,
        each sentence of the fusion document is passed to
        ``get_sentence_named_tag``, and ``write_ner`` is called at the end.
        """
        from docutone.core.document import LawDocument

        text_path = filename
        if not filename.endswith(".txt"):
            text_path = docutonelocate.convert_file(filename)

        reader = LawDocument()
        sentences = reader.get_fusion_document(text_path)

        # Start from a clean table; filled while the sentences are visited.
        self.new_ner = {}
        for tokens in sentences:
            self.get_sentence_named_tag(tokens)

        self.write_ner()
Example #4
0
    def create_test_tagging(self, filename, ftype):
        """Read one document with its category's reader and run the tagging test.

        Args:
            filename: Path of the document. A name that does not end in
                ``.txt`` is passed through ``docutonelocate.convert_file``
                and the returned path is read instead.
            ftype: Either ``"<parent>;<category>"`` or a bare category name.
                A bare name may carry a parenthesised suffix, which is
                dropped when the opening parenthesis sits past index 3.

        Returns:
            The value returned by ``self.tagging.tagging_test`` for the
            documents that were read.
        """
        self.fullname = filename
        # Base name up to (not including) its first dot.
        self.filename = os.path.basename(filename).split('.')[0]

        if ";" in ftype:
            parent, cat = ftype.split(";", 1)
            self.categorie = cat.strip()
            self.ftype = parent.strip()
        else:
            # Find the opening parenthesis of an optional suffix. The ASCII
            # '(' takes precedence; the full-width form (U+FF08) is the
            # fallback. It is written as an escape so that character
            # normalisation of this file cannot turn it into a second,
            # unreachable ASCII check.
            index = ftype.find('(')
            if index < 0:
                index = ftype.find('\uff08')
            # Only cut when the parenthesis is past index 3; otherwise the
            # whole string is kept as the category.
            if index > 3:
                self.categorie = ftype[0:index].strip()
            else:
                self.categorie = ftype.strip()
            self.ftype = None

        # If this is not a text file, convert it to one.
        if filename.endswith(".txt"):
            inputfilename = filename
        else:
            inputfilename = docutonelocate.convert_file(filename)

        # Pick the reader for the category; anything unrecognised falls back
        # to the generic document reader.
        self.dtn_doc = None
        if self.categorie == dtn_document.LABOR_CONTRACT:
            self.dtn_doc = labor_contract.LaborContract()
        elif self.categorie == dtn_document.LOAN_AGREEMENT:
            self.dtn_doc = loan_agreement.LoanAgreement()
        elif self.categorie == dtn_document.TRANSFER_AGREEMENT:
            self.dtn_doc = transfer_agreement.TransferAgreement()
        else:
            self.dtn_doc = other_document.OtherDocument()

        documents = self.dtn_doc.read(inputfilename)

        self._add_new_clauses(documents, self.dtn_doc._results)

        return self.tagging.tagging_test(documents)
Example #5
0
    def _load_document_clauses(self, filename, label):
        """Append the clause sections of one document to the training buffers.

        For every section produced by ``self._clause.create_clauses`` the
        section title is used as the label name. A new title gets the next
        free id in ``self.labels_index``. The name, its id and the section's
        words are appended to ``self.label_name``, ``self.labels`` and
        ``self.texts`` respectively.

        Args:
            filename: Path of the document. A name that does not end in
                ``.txt`` is passed through ``docutonelocate.convert_file``
                and the returned path is used instead.
            label: Not used; label names are the section titles alone. Kept
                so existing callers continue to work.
        """
        if filename.endswith(".txt"):
            ofile = filename
        else:
            ofile = docutonelocate.convert_file(filename)

        self._clause.create_clauses(ofile)

        for section in self._clause.sections:
            # Compute the words before touching any buffer so that a failure
            # here cannot leave the three parallel lists with different
            # lengths.
            words = section.toWords()

            name = section.title
            # The default is evaluated before insertion, so a new name
            # receives the current size of the index as its id.
            label_id = self.labels_index.setdefault(
                name, len(self.labels_index))

            self.label_name.append(name)
            self.labels.append(label_id)
            self.texts.append(words)
Example #6
0
    def load_predict_document(self, filename):
        """Return the word lists of *filename* used as prediction input.

        The file is converted with ``docutonelocate.convert_file`` unless its
        name ends in ``.txt``, then read with ``self._document.read_section``.

        When the document has sections, each section with at least one
        sentence contributes one entry, built by ``doc.sentencesTowords``
        from the first element of every sentence record. When there are no
        sections, each item of ``document_header`` contributes one entry
        instead.
        """
        text_path = (filename if filename.endswith(".txt")
                     else docutonelocate.convert_file(filename))
        self._document.read_section(text_path)

        sections = self._document.sections
        if len(sections) == 0:
            # No sections were found: fall back to the header, one item at a time.
            return [doc.sentencesTowords([line])
                    for line in self._document.document_header]

        texts = []
        for section in sections:
            first_items = [record[0] for record in section.sentences]
            if first_items:
                texts.append(doc.sentencesTowords(first_items))
        return texts
Example #7
0
    def get_terms(self, filename, filetype):
        """Read the sections of *filename* and return their sentence lists.

        The file is converted with ``docutonelocate.convert_file`` unless its
        name ends in ``.txt``. While reading, values that can be tied to a
        term are stored in ``self.verified_terms``:

        * the document name and date, under the first candidate keyword that
          is present in ``self.keywords``;
        * the sentences of any section whose title resolves to a term through
          ``dtn_sentence.get_keywords_by_name``, together with the sentences
          of the deeper-level sections that directly follow it.

        ``filetype`` is not used.

        Returns:
            A list holding the ``sentences`` of every visited section that
            has at least one sentence. Deeper sections absorbed under a
            matched title are not listed, and header sentences are not
            included.
        """
        if filename.endswith(".txt"):
            text_path = filename
        else:
            text_path = docutonelocate.convert_file(filename)

        self.document.read_section(text_path)

        self._title = self.document.document_name
        self._contract_date = self.document.document_date

        def store_under_first_keyword(value, candidates):
            # Record the value under the first candidate that is a known keyword.
            for key in candidates:
                if key in self.keywords:
                    self.verified_terms[key].add_value(value, 1)
                    break

        def record_sentences(termname, sentences):
            # Plain strings are stored directly; other records are forwarded
            # as (second element, last item of the first element).
            for entry in sentences:
                if isinstance(entry, str):
                    self.verified_terms[termname].add_value(entry, 1)
                else:
                    head = entry[0]
                    self._add_verified_sentences(
                        termname, entry[1], head[-1], 1)

        if self._title:
            store_under_first_keyword(self._title, ('文件名称', '合同名称'))
        if self._contract_date:
            store_under_first_keyword(
                self._contract_date, ('签约日期', '签发日期', '合同日期'))

        terms = []
        sections = self.document.sections
        total = len(sections)
        cursor = 0
        while cursor < total:
            current = sections[cursor]
            cursor += 1

            # If the section title is a term name, add it to the verified table.
            termname = None
            if current.title:
                termname = dtn_sentence.get_keywords_by_name(
                    current.title, self.keywords)
            if termname:
                record_sentences(termname, current.sentences)
                # Absorb the run of deeper sections that follows; stop at the
                # first one that is not deeper and leave it for the outer loop.
                while cursor < total and sections[cursor].level > current.level:
                    record_sentences(termname, sections[cursor].sentences)
                    cursor += 1

            if len(current.sentences) > 0:
                terms.append(current.sentences)

        return terms
Example #8
0
        commd += " -i " + infile
    if o_file:
        commd += " -o " + o_file
    if options.doc_type:
        commd += " -t " + options.doc_type

    dtn_logger.logger_info("MAIN", commd)

    if options.action == 'convert':
        '''
        conv = Convert(verbose=verbose, restart=options.restart)   
        o_file = conv.open_output(infile, o_file)
        conv.files_to_text(infile, o_file)    
        conv.close_output()  
        '''
        ofile = docutonelocate.convert_file(infile, True)

    elif options.action == 'testfile':

        conv = Convert(verbose=verbose, restart=options.restart)

        conv.test_files_in_directory(infile, o_file)

    elif options.action == 'changebad':

        conv = Convert(verbose=verbose, restart=options.restart)

        conv.change_root_bad_files(infile, o_file)

    elif options.action == 'text4sentences':