Code Example #1
File: contract.py  Project: minlogiciel/docutone
import os
import codecs

from six import string_types           # assumed origin of string_types
from gensim import corpora, models
from gensim.models.doc2vec import TaggedDocument

# Project-internal docutone modules (exact import paths not shown in the source):
# variables, Segmentation, LawDocument, Clause, Convert, CRF, Datasets

class Contract(object):
    
    '''
    Legal terms classifier.

    input model  : data/terms/template
    output model : data/models
    '''

    def __init__(self, debug=0, crf_model=True):
        
        self.texts = []        # list of segmented legal term texts
        self.terms_index = {}  # legal term name -> numeric id
        self.terms_name = {}   # numeric id -> legal term name
        self.terms_label = []  # "name:document type" label per term
        self.labels = []       # list of legal term label ids
        self._debug = debug
        self.seg = Segmentation()
        self.seg.load_suggest_words()
        self.lawdocument = LawDocument()
        self.clause = Clause()
        self.doc_type = None
        self.doc_path = None
        self.labor_model = True
        self.crf_model = crf_model
    
    def get_data_file_name(self, dataname, categorie='models') :
        path = variables.get_data_file_name(self.doc_path, categorie=categorie)
        if not os.path.exists(path) :
            os.mkdir(path)
        return os.path.join(path, dataname)
 
    def get_term_model_name(self) :
        return self.get_data_file_name(variables.TERM_DOC_MODEL)
        

    # term vector [0, 0, 1, 1, ...]
    def load_term_set(self) :
        fname = self.get_data_file_name(variables.TERM_VECT)
        with codecs.open(fname, 'r', 'utf-8') as f :
            termSet = [int(line) for line in f if len(line.strip()) > 0]
        return termSet

    def save_term_set(self) :
        fname = self.get_data_file_name(variables.TERM_VECT)
        with codecs.open(fname, 'w', 'utf-8') as f :
            for v in self.labels :
                f.write("%s\n" % (v))

    # term name [termname=termid]
    def load_term_label(self) :
        fname = self.get_data_file_name(variables.TERM_LABEL)
        with codecs.open(fname, 'r', 'utf-8') as f :
            labelSet = [line.split('=')[0] for line in f if len(line.strip()) > 0]
        return labelSet

    def save_term_label(self) :
        fname = self.get_data_file_name(variables.TERM_LABEL)
        with codecs.open(fname, 'w', 'utf-8') as f :
            # terms_name maps numeric id -> name; write one "name=id" line each
            for label_id, name in self.terms_name.items() :
                f.write("%s=%d\n" % (name, label_id))

    # term list
    def load_term_list(self) :
        fname = self.get_data_file_name(variables.TERM_LIST)
        with codecs.open(fname, 'r', 'utf-8') as f :
            termList = [line.split('=')[0] for line in f if len(line.strip()) > 0]
        return termList

    def save_term_list(self) :
        fname = self.get_data_file_name(variables.TERM_LIST)
        with codecs.open(fname, 'w', 'utf-8') as f :
            for name, label_id in zip(self.terms_label, self.labels) :
                f.write("%s=%d\n" % (name, label_id))
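
    # On-disk formats written by the three save methods above (example values
    # are illustrative, not taken from the source):
    #   TERM_VECT  : one label id per line            e.g. "1", "1", "2", ...
    #   TERM_LABEL : one "name=id" pair per line      e.g. "保密条款=3"
    #   TERM_LIST  : one "name:doctype=id" per line   e.g. "保密条款:劳动合同=3"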
 
    def _convert(self, text_path, convert=False) :

        path = text_path
        doc_path = None
        if text_path.endswith("doc") :
            doc_path = text_path
            path = text_path[0:-3] + "TEXT"

        if convert and doc_path and os.path.exists(doc_path) :
            conv = Convert(verbose=0)
            o_file = conv.open_output(doc_path, path)
            conv.files_to_text(doc_path, o_file)
            conv.close_output()
        return path

    
    def get_term_words(self, text) :

        if isinstance(text, string_types) :
            sentences = [text]
        else :
            sentences = text

        words_all_filter = self.seg.segment(sentences)[2]

        words = []
        for sentence_words in words_all_filter :
            for w in sentence_words :
                if len(w.strip()) > 0 :
                    words.append(w.strip())
        return words
 


    
    
    def segment_terms(self, term_sentences):
        """
        Arguments:

        term_sentences : sentences of one legal term

        Return: fully filtered segmented words
        """
        words_all_filter = self.seg.segment(term_sentences)[2]

        return words_all_filter
       

    
    def get_terms(self, filename, encoding="utf-8"):

        terms = []

        self.lawdocument.create_document(filename, encoding)

        if len(self.lawdocument.sections) > 0 :
            for p in self.lawdocument.sections :

                term_sentences = []
                term_sentences.append(p.title)
                for s in p.sentences :
                    term_sentences.append(s[0])  # document sentence is [text, num, type]
                terms.append(term_sentences)
        else :
            # the document is not a law document: use its header lines instead
            for p in self.lawdocument.document_header :
                terms.append([p])
        return terms
             
    
    def load_file(self, filename, encoding="utf-8"):
        
        # directory name is document type 
        ftype = os.path.basename(os.path.dirname(filename))
        
        self.clause.create_clauses(filename, encoding=encoding)

        for p in self.clause.sections :
            name = p.title
            term_sentences = []
            term_sentences.append(name)
            for s in p.sentences :
                term_sentences.append(s)
            
            # add the segmented words of this term
            self.texts.append(self.segment_terms(term_sentences))
                     
            if name in self.terms_index :
                label_id = self.terms_index[name]
            else :
                label_id = len(self.terms_index)+1
                self.terms_index[name] = label_id
                self.terms_name[label_id] = name
                        
            self.labels.append(label_id)
            
            self.terms_label.append(name+":"+ftype)
  


    def load_directory(self, path) :

        for fname in sorted(os.listdir(path)):
            fpath = os.path.join(path, fname)
            if os.path.isdir(fpath):
                if variables.noloaddir(fname) :
                    continue
                self.load_directory(fpath)

            elif fname.endswith(".txt"):
                self.load_file(fpath)
            # anything else is not a text file; skip it
    
    
    def load_terms(self, text_path) :

        self.load_directory(text_path)

        termdocs = []   # TaggedDocument list for doc2vec
        allterms = []   # term word lists

        for index in range(len(self.texts)) :
            term = self.texts[index]
            if len(term) > 0 :
                words = []
                for sentence in term :
                    for word in sentence :
                        words.append(word)

                termdocs.append(TaggedDocument(words, tags=[index]))
                allterms.append(words)
        return allterms, termdocs
                 

        
    def _create_terms(self, text_path, doctype=None, min_count=2, sg=0, workers=1, size=256, window=5) :
        """
        min_count : ignore all words with total frequency lower than this.
        sg : sg=0 CBOW, sg=1 skip-gram.
        workers : number of worker threads.
        size : dimensionality of the feature vectors.
        window : maximum distance between the current and predicted word within a sentence.
        """
        
        self.texts = []        # list of segmented legal term texts
        self.terms_index = {}  # legal term name -> numeric id
        self.terms_name = {}   # numeric id -> legal term name
        self.terms_label = []  # "name:document type" label per term
        self.labels = []       # list of legal term label ids
        self.doc_type = doctype
        self.doc_path = doctype
        
        allterms, termdocs = self.load_terms(text_path)
        
        # if there are too few clauses, do nothing
        if len(allterms) < 10 :
            return
                
        dictionary = corpora.Dictionary(allterms)
        corpus = [dictionary.doc2bow(text) for text in allterms]
        
        # save corpus
        corpusfname = self.get_data_file_name(variables.TERM_MODEL_MM)
        corpora.MmCorpus.serialize(corpusfname, corpus) 
    
        
        # save dictionary
        dictfname = self.get_data_file_name(variables.TERM_MODEL_DICT)
        dictionary.save(dictfname)
        
    
    
        # initialize a tf-idf model
        tfidf = models.TfidfModel(corpus, normalize=True)

        # use the model to transform vectors
        corpus_tfidf = tfidf[corpus]

        # initialize an LSI transformation with 300 topics
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)
        lsifname = self.get_data_file_name(variables.TERM_MODEL_LSI)
        lsi.save(lsifname)  # the same pattern works for tfidf, lda, ...
    
    
        # training doc2vec
        datasets = Datasets()
        model = datasets.TrainingDoc2Vec(termdocs, size, window, 16)
        # save the document vectors
        vectfname = self.get_term_model_name()
        model.save(vectfname)

        '''
        # word to vector 
        model = word2vec.Word2Vec(allterms, min_count=min_count, sg=sg, workers=workers, size=size, window=window)
        # save words vector
        vectfname = self.get_data_file_name(variables.TERM_WORD_MODEL)
        model.wv.save_word2vec_format(vectfname, binary=False)
        '''
        # save term list 
        self.save_term_list()
        
        # save term vector  
        self.save_term_set()
    
        # save term name 
        self.save_term_label()


    def create_crf(self, path) :

        crf = CRF()
        if self.labor_model :
            # "劳动合同" ("labor contract") is the category directory name
            fpath = os.path.join(path, "劳动合同")
            ftype = "劳动合同"
            crf.create_categorie_tagging(fpath, ftype)
        else :
            crf.create_crf_model()
            
    def create_terms(self, text_path, convert=False) :   
        
        path = self._convert(text_path, convert)
        
        if self.crf_model :
            self.create_crf(path)
        else :
            self._create_terms(path, doctype=None)
            for doctype in sorted(os.listdir(path)):
                fpath = os.path.join(path, doctype)
                if os.path.isdir(fpath):
                    self._create_terms(fpath, doctype=doctype)
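
A minimal usage sketch for the class above, assuming the docutone modules are importable; the import path and the data layout below are guesses, not shown in the source:

# Hypothetical driver; module path "docutone.core.contract" is an assumption.
from docutone.core.contract import Contract

contract = Contract(debug=1, crf_model=False)  # build doc2vec/LSI models instead of CRF
contract.create_terms("data/terms/TEXT")       # walks the tree; one model set per document type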
Code Example #2
# Project-internal docutone modules (exact import paths not shown in the source):
# util, Segmentation, LawDocument

class Text4Sentences(object):
    def __init__(self, stopwords_file=None):
        """
        Keyword arguments:
        stopwords_file :    stopwords file name
        """

        self.pagerank_config = {
            'alpha': 0.85,
        }

        self.seg = Segmentation(stopwords_file=stopwords_file)
        self.law_document = LawDocument()
        self.sentences = None
        self.words_no_filter = None       # 2-dimensional list: words per sentence
        self.words_no_stop_words = None   # same, with stop words removed
        self.words_all_filters = None     # same, with all filters applied

        self.key_sentences = None

    def create_segment_sentences(self,
                                 sentences,
                                 sim_func=util.get_similarity):
        """
        Keyword arguments:
        
        sentences : sentences of document
        
        sim_func 指定计算句子相似度的函数。
        
        """

        self.words_no_filter, self.words_no_stop_words, self.words_all_filters = self.seg.segment(
            sentences)
        self.sentences = sentences

        self.key_sentences = util.sort_sentences(
            sentences=self.sentences,
            words=self.words_no_filter,
            sim_func=sim_func,
            pagerank_config=self.pagerank_config)

    def analyze_file(self, filename, encoding='utf-8'):
        """
        Keyword arguments:

        filename : input file name
        encoding : input file encoding
        """

        self.law_document.create_document(filename, encoding)

        self.create_segment_sentences(
            self.law_document.get_segmented_document())

    def get_key_sentences(self, num=6):
        """
        num : number of sentences used to build the summary.

        Return: the most important sentences.
        """
        return self.key_sentences[:num]

    def show_key_sentences(self):

        for item in self.get_key_sentences(2):
            # each ranked item carries its [text, sentence number, type] triple
            [sentence, idx, stype] = item['sentence']
            print(sentence)
            print("=" * 20)
            print(self.law_document.get_document_chapiter(idx, chapiter=True))
            print("--" * 20)