Code Example #1
    def __init__(self, modelname=None):

        self._document = LawDocument()
        self.clause_model = clause_training.ClauseTraining()
        self.clause_model.load_model_label()

        pass
Code Example #2
 def __init__(self, filename=None):
     """
     """
     self.law_document = LawDocument()
     self.important_word = []
     self.top_n_scored = []
     self.mean_scored = []
Code Example #3
File: searchtext.py Project: minlogiciel/docutone
 def get_document_type(self, dictname) :
     
     textfname = "../dictionary/text/" + dictname  + ".txt"
     law_document = LawDocument()
     law_document.analyze(filename=textfname)
     
     text = law_document.document_type
     
     return text
Code Example #4
File: searchtext.py Project: minlogiciel/docutone
 def get_document(self, dictname) :
     
     textfname = "../dictionary/text/" + dictname  + ".txt"
     law_document = LawDocument()
     law_document.analyze(filename=textfname)
     
     text = "\n".join(law_document.document_title)
     
     return text
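
Both helpers above follow the same pattern: instantiate LawDocument, run analyze() on a dictionary text file, then read one of its attributes. A minimal standalone sketch of that pattern, using the import path shown in Code Example #5; the dictionary file name here is hypothetical:

# Minimal usage sketch, assuming analyze() populates document_type and
# document_title as in the two snippets above.
from docutone.core.document import LawDocument

law_document = LawDocument()
law_document.analyze(filename="../dictionary/text/labor_contract.txt")  # hypothetical dictionary file
print(law_document.document_type)
print("\n".join(law_document.document_title))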
Code Example #5
    def file_clean(self, filename):
        from docutone.core.document import LawDocument
        if (filename.endswith(".txt")):
            ofile = filename
        else:
            ofile = docutonelocate.convert_file(filename)
        lawdoc = LawDocument()
        document = lawdoc.get_fusion_document(ofile)

        for sentence in document:
            print(' '.join(sentence))
Code Example #6
File: datasets.py Project: minlogiciel/docutone
    def __init__(self):

        self.texts = []  # list of text samples
        self.labels_index = {}  # dictionary mapping label name to numeric id
        self.labels_files = {}  # dictionary mapping label name to number of files
        self.labels_name = {}  # dictionary mapping numeric id to label name
        self.file_label = []  # file label id
        self.labels = []  # list of label ids
        self.classifiers = []  # list of classifier
        self.law_doc = LawDocument()
        self.folder_structure = {}
        self.folder_order = []

        pass
Code Example #7
    def file_named_tag(self, filename):
        from docutone.core.document import LawDocument
        if (filename.endswith(".txt")):
            ofile = filename
        else:
            ofile = docutonelocate.convert_file(filename)
        lawdoc = LawDocument()
        document = lawdoc.get_fusion_document(ofile)

        self.new_ner = {}
        for sentence in document:
            self.get_sentence_named_tag(sentence)

        self.write_ner()
Code Example #8
    def __init__(self):

        self.contract = Contract(0)

        self.verified_terms = {}
        self._filetime = None
        self.fullname = None
        self.filename = None
        self._title = None
        self._contract_date = None
        self.keywords = []

        self.segment = Segmentation()
        self.document = LawDocument()
Code Example #9
File: contract.py Project: minlogiciel/docutone
 def __init__(self, debug=0, crf_model=True):
     
     self.texts = []         # list of legal terms texts
     self.terms_index = {}  #  mapping legal term name to numeric id
     self.terms_name = {}   #  legal term name 
     self.terms_label = []  #  mapping legal term name to label
     self.labels = []        # list of legal term label ids
     self._debug = debug
     self.seg = Segmentation()
     self.seg.load_suggest_words()
     self.lawdocument = LawDocument()
     self.clause = Clause()
     self.doc_type = None
     self.doc_path = None
     self.labor_model = True
     self.crf_model = crf_model
Code Example #10
    def __init__(self, modelname=None):

        self.MAX_SEQUENCE_LENGTH = 1000
        self.MAX_NB_WORDS = 20000
        self.EMBEDDING_DIM = 100
        self.VALIDATION_SPLIT = 0.25
        self.EPOCHS = 64
        self.BATCH_SIZE = 32
        self.POOL_SIZE = 5
        self.FILTERS = 64
        self.LSTM_OUTPUT_SIZE = 70

        if modelname == None:
            self.MODEL_NAME = "clause_model"
        else:
            self.MODEL_NAME = modelname

        self._document = LawDocument()
        self._clause = Clause()

        self.texts = []  # list of text samples
        self.labels_index = {}  # dictionary mapping label name to numeric id
        self.labels = []  # list of label ids
        self.label_name = []
        self._debug = 1
        self._save_model = False
Code Example #11
    def __init__(self, stopwords_file=None):
        """
        Keyword arguments:
        stopwords_file :    stopwords file name
        """

        self.pagerank_config = {
            'alpha': 0.85,
        }

        self.seg = Segmentation(stopwords_file=stopwords_file)
        self.law_document = LawDocument()
        self.sentences = None
        self.words_no_filter = None  # two-dimensional list (sentences x words)
        self.words_no_stop_words = None
        self.words_all_filters = None

        self.key_sentences = None
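
The pagerank_config above sets the damping factor (alpha = 0.85) for the PageRank run that ranks sentences, the usual TextRank setup. A toy sketch of that ranking step, assuming a networkx backend (the project's own graph code is not shown in these examples):

# Toy sentence-ranking sketch; alpha matches pagerank_config above and the
# similarity weights between the three "sentences" are made up.
import networkx as nx

graph = nx.Graph()
graph.add_weighted_edges_from([(0, 1, 0.6), (1, 2, 0.3), (0, 2, 0.1)])
scores = nx.pagerank(graph, alpha=0.85)
print(sorted(scores.items(), key=lambda kv: kv[1], reverse=True))  # sentence indices by importance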
Code Example #12
    def __init__(self, filename=None):

        self.MAX_SEQUENCE_LENGTH = 1000
        self.MAX_NB_WORDS = 20000
        self.EMBEDDING_DIM = 100
        self.VALIDATION_SPLIT = 0.25

        self.embeddings_index = self.load_embedding_base()

        self._document = LawDocument()

        self.label_name = []
        self.texts = []  # list of text samples
        self.labels_index = {}  # dictionary mapping label name to numeric id
        self.labels = []  # list of label ids
        self._debug = 1

        pass
Code Example #13
File: searchtext.py Project: minlogiciel/docutone
 def get_document_chapiter(self, sims, dictname) :
     
     textfname = "../dictionary/text/" + dictname  + ".txt"
     law_document = LawDocument()
     law_document.analyze(filename=textfname)
     text = ""
     n_line = 1
     for sim in sims :
         doc_no, simil = sim[0], sim[1]
         if (simil > 0.4) :
             text +=  "******** " + str(n_line) + "  ********\n"
             text += law_document.get_document_chapiter(doc_no) + "\n"
             n_line += 1
             if n_line > 2:
                 break
         else :
             break
     return text
Code Example #14
File: searchtext.py Project: minlogiciel/docutone
    def search_document(self, textpath, filename) :
        ld = LawDocument()
        ld.analyze(filename=filename)
        
        
        doc_tab = []
        for dictfile in os.listdir("../dictionary/dict") :
            if dictfile.endswith(".dict") :
                dictname = dictfile.replace('.dict', '')
                total = 0.0
                for sentence in ld.table_contents :
                    if len(sentence) > 1 :
                        sims = self.text_search_lsi(textpath, sentence[1])
                        total += self.get_similarity_value(sims)

                doc_tab.append([dictname, total])
        doc_tab = sorted(doc_tab, key=lambda item: item[1], reverse=True)

        return self.get_document_type(doc_tab[0][0])
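
search_document() above relies on text_search_lsi() and get_similarity_value(), which are not shown in these examples. A hedged sketch of what such an LSI similarity query typically looks like with gensim; the paths, query terms, and overall shape are assumptions, not the project's actual code:

# Hypothetical LSI query that yields (doc_no, similarity) pairs of the kind
# consumed by get_document_chapiter() in Code Example #13.
from gensim import corpora, models, similarities

dictionary = corpora.Dictionary.load("../dictionary/dict/sample.dict")   # assumed path
corpus = corpora.MmCorpus("../dictionary/dict/sample.mm")                # assumed path
lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=300)
index = similarities.MatrixSimilarity(lsi[corpus])

query_bow = dictionary.doc2bow("雇用 期間 賃金".split())                  # assumed query terms
sims = sorted(enumerate(index[lsi[query_bow]]), key=lambda item: -item[1])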
Code Example #15
    def __init__(self):
        
        self.law_doc =  LawDocument()


        self.file_index = 1
        self.folder_structure = {}
        self.folder_order = []
        
        self.corpus_document = []
        
        instance = Terms()
        self.categories = instance.get_all_term_items() 
        
        pass
Code Example #16
class EnbedTraining(object):
    def __init__(self, filename=None):

        self.MAX_SEQUENCE_LENGTH = 1000
        self.MAX_NB_WORDS = 20000
        self.EMBEDDING_DIM = 100
        self.VALIDATION_SPLIT = 0.25

        self.embeddings_index = self.load_embedding_base()

        self._document = LawDocument()

        self.label_name = []
        self.texts = []  # list of text samples
        self.labels_index = {}  # dictionary mapping label name to numeric id
        self.labels = []  # list of label ids
        self._debug = 1

        pass

    def load_embedding_base(self):

        embeddings_index = {}

        f = codecs.open(
            os.path.join(variables.BASE_DIR,
                         'data/document_classification.txt'), 'r', 'utf-8')
        for line in f:
            values = line.split()
            if len(values) > 2:
                word = values[0]
                coefs = np.asarray(values[1:], dtype='float32')
                embeddings_index[word] = coefs
        f.close()

        return embeddings_index

    def _load_directory(self, path, label, label_id):

        for fname in sorted(os.listdir(path)):
            fpath = os.path.join(path, fname)
            if os.path.isdir(fpath):
                self._load_directory(fpath, label, label_id)
            elif fname.endswith(".txt"):
                words = self._document.get_normalize_document(fpath, outtype=0)
                if len(words) > 0:
                    self.texts.append(words)
                    self.classifiers.append(label)
                    self.labels.append(label_id)
                    self.file_label.append(fname)

    def load_data(self, path):

        self.label_name = []
        self.texts = []  # list of text samples
        self.labels_index = {}  # dictionary mapping label name to numeric id
        self.labels = []  # list of label ids
        self.classifiers = []
        self.file_label = []
        for fname in os.listdir(path):
            fpath = os.path.join(path, fname)
            if os.path.isdir(fpath):
                label_id = len(self.labels_index)
                self.labels_index[fname] = label_id
                self._load_directory(fpath, fname, label_id)

        # tokenizer
        tokenizer = Tokenizer(num_words=self.MAX_NB_WORDS)
        tokenizer.fit_on_texts(self.texts)
        self.sequences = tokenizer.texts_to_sequences(self.texts)
        self.word_index = tokenizer.word_index

    def create_traning_data(self):

        labels = to_categorical(np.asarray(self.labels))

        # create data
        data = pad_sequences(self.sequences, maxlen=self.MAX_SEQUENCE_LENGTH)
        indices = np.arange(data.shape[0])
        np.random.shuffle(indices)
        data = data[indices]
        labels = labels[indices]
        nb_validation_samples = int(self.VALIDATION_SPLIT * data.shape[0])

        # training size and values
        x_train = data[:-nb_validation_samples]
        y_train = labels[:-nb_validation_samples]
        x_val = data[-nb_validation_samples:]
        y_val = labels[-nb_validation_samples:]

        return x_train, y_train, x_val, y_val

    def preparing_matrix(self):

        # prepare embedding matrix
        nb_words = min(self.MAX_NB_WORDS, len(self.word_index))
        embedding_matrix = np.zeros((nb_words + 1, self.EMBEDDING_DIM))
        for word, i in self.word_index.items():
            if i > self.MAX_NB_WORDS:
                continue
            embedding_vector = self.embeddings_index.get(word)
            if embedding_vector is not None:
                # words not found in embedding index will be all-zeros.
                embedding_matrix[i] = embedding_vector

        # load pre-trained word embeddings into an Embedding layer
        # note that we set trainable = False so as to keep the embeddings fixed
        embedding_layer = Embedding(nb_words + 1,
                                    self.EMBEDDING_DIM,
                                    weights=[embedding_matrix],
                                    input_length=self.MAX_SEQUENCE_LENGTH,
                                    trainable=False)

        return embedding_layer

    def LSTMTraining(self, x_train, y_train, x_val, y_val):

        model = Sequential()

        # embedding layer; inputs: vocabulary size, embedding size, text length
        model.add(
            Embedding(self.MAX_NB_WORDS,
                      100,
                      input_length=self.MAX_SEQUENCE_LENGTH))
        #model.add(Dropout(0.25))

        # the Conv1D layer needs a 3D sequence input, so it is applied to the
        # embedded sequence before the LSTM collapses it to a single vector
        model.add(Convolution1D(128, 5, border_mode="valid",
                                activation="relu"))

        model.add(LSTM(100))

        # fully connected layers; the output layer is sized to the number of labels
        model.add(Dense(128))
        model.add(Dropout(0.25))
        model.add(Activation('relu'))
        model.add(Dense(len(self.labels_index)))
        model.add(Activation('softmax'))
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])

        result = model.fit(x_train,
                           y_train,
                           validation_data=(x_val, y_val),
                           nb_epoch=2,
                           batch_size=128,
                           verbose=1)

    def test(self):

        # input: meant to receive sequences of 100 integers, between 1 and 10000.

        main_input = Input(shape=(100, ), dtype='int32', name='main_input')

        # this embedding layer will encode the input sequence
        # into a sequence of dense 512-dimensional vectors.
        x = Embedding(output_dim=512, input_dim=10000,
                      input_length=100)(main_input)

        # LSTM will transform the vector sequence into a single vector,
        # containing information about the entire sequence
        lstm_out = LSTM(32)(x)

        #insert the auxiliary loss, allowing the LSTM and Embedding layer to be trained
        #smoothly even though the main loss will be much higher in the model.
        auxiliary_output = Dense(1, activation='sigmoid',
                                 name='aux_output')(lstm_out)

        # we feed the auxiliary input into the model by concatenating it
        # with the LSTM output
        auxiliary_input = Input(shape=(5, ), name='aux_input')
        x = merge([lstm_out, auxiliary_input], mode='concat')

        # we stack a deep fully-connected network on top
        x = Dense(64, activation='relu')(x)
        x = Dense(64, activation='relu')(x)
        x = Dense(64, activation='relu')(x)

        # and finally we add the main logistic regression layer
        main_output = Dense(1, activation='sigmoid', name='main_output')(x)

        #This defines a model with two inputs and two outputs:
        model = Model(input=[main_input, auxiliary_input],
                      output=[main_output, auxiliary_output])

        model.compile(optimizer='rmsprop',
                      loss='binary_crossentropy',
                      loss_weights=[1., 0.2])

        #We can train the model by passing it lists of input arrays and target arrays:
        '''
        model.fit([headline_data, additional_data], [labels, labels], nb_epoch=50, batch_size=32)


        model.compile(optimizer='rmsprop',
              loss={'main_output': 'binary_crossentropy', 'aux_output': 'binary_crossentropy'},
              loss_weights={'main_output': 1., 'aux_output': 0.2})

        # and trained it via:
        model.fit({'main_input': headline_data, 'aux_input': additional_data},
            {'main_output': labels, 'aux_output': labels},
            nb_epoch=50, batch_size=32)

        '''

    def clause_training(self, path):
        #from keras.utils.vis_utils import plot_model

        # 1, loading text samples and their labels
        self.load_data(path)

        # 2. split the data into a training set and a validation set
        x_train, y_train, x_val, y_val = self.create_traning_data()

        # 3. create the embedding layer
        embedding_layer = self.preparing_matrix()

        # 4. train a 1D convnet with global max pooling
        sequence_input = Input(shape=(self.MAX_SEQUENCE_LENGTH, ),
                               dtype='int32',
                               name='main_input')
        embedded_sequences = embedding_layer(sequence_input)

        #lstm_out = LSTM(32)(embedded_sequences)
        #x = MaxPooling1D(5)(lstm_out)

        x = Conv1D(128, 5, activation='relu')(embedded_sequences)
        x = MaxPooling1D(5)(x)

        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(5)(x)

        x = Conv1D(128, 5, activation='relu')(x)
        x = MaxPooling1D(35)(x)
        x = Flatten()(x)

        x = Dense(128, activation='relu')(x)
        preds = Dense(len(self.labels_index), activation='softmax')(x)

        model = Model(sequence_input, preds)
        model.compile(loss='categorical_crossentropy',
                      optimizer='rmsprop',
                      metrics=['acc'])

        #plot_model(model, to_file='model_plot.png', show_shapes=True, show_layer_names=True)
        '''
        history = model.fit(x_train, y_train, validation_data=(x_val, y_val), nb_epoch=2, batch_size=128)
        
        # Estimate model performance
        #trainScore = model.evaluate(x_train, y_train, verbose=0)
        #print('Train Score: %.2f MSE (%.2f RMSE)' % (trainScore, math.sqrt(trainScore)))
        #testScore = model.evaluate(x_val, y_val, verbose=0)
        #print('Test Score: %.2f MSE (%.2f RMSE)' % (testScore, math.sqrt(testScore)))

        # generate predictions for training
        
        trainPredict = model.predict(x_train)
        testPredict = model.predict(x_val)
        '''

        md.save_json_model(model, "clause_model")
        #md.save_yaml_model(model, "embedded_model")
        return model

    def loading(self):
        from keras.datasets import imdb
        from keras.preprocessing import sequence
        max_features = 5000
        maxlen = 400

        (x_train, y_train), (x_test,
                             y_test) = imdb.load_data(num_words=max_features)
        if self._debug:
            print(len(x_train), 'train sequences')
            print(len(x_test), 'test sequences')

            print('Pad sequences (samples x time)')
        x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
        x_test = sequence.pad_sequences(x_test, maxlen=maxlen)

        if self._debug:
            print('x_train shape:', x_train.shape)
            print('x_test shape:', x_test.shape)

        return x_train, y_train, x_test, y_test

    def training_model(self):
        max_features = 5000
        maxlen = 400
        batch_size = 32
        embedding_dims = 50
        filters = 250
        kernel_size = 3
        hidden_dims = 250
        epochs = 2

        model = Sequential()
        '''we start off with an efficient embedding layer which maps our vocab indices into embedding_dims dimensions'''
        model.add(Embedding(max_features, embedding_dims, input_length=maxlen))
        model.add(Dropout(0.2))
        '''we add a Convolution1D, which will learn word group filters of size kernel_size: '''
        model.add(
            Conv1D(filters,
                   kernel_size,
                   padding='valid',
                   activation='relu',
                   strides=1))
        # we use max pooling:
        model.add(GlobalMaxPooling1D())

        # We add a vanilla hidden layer:
        model.add(Dense(hidden_dims))
        model.add(Dropout(0.2))
        model.add(Activation('relu'))

        # We project onto a single unit output layer, and squash it with a sigmoid:
        model.add(Dense(1))
        model.add(Activation('sigmoid'))

        model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])

        x_train, y_train, x_test, y_test = self.loading()
        model.fit(x_train,
                  y_train,
                  batch_size=batch_size,
                  epochs=epochs,
                  validation_data=(x_test, y_test))

        md.save_json_model(model, "json_model")
        md.save_yaml_model(model, "yaml_model")

        del model

    def test_imdb(self):

        from keras.callbacks import ModelCheckpoint
        from keras.utils import np_utils

        law_document = LawDocument()

        fname = os.path.join(
            variables.BASE_DIR,
            'data/Corpus/TEXT/合同、协议/劳动合同/1. 劳动合同- 最终版.DOC.txt')
        sentences = law_document.get_sentences(fname)

        # tokenizer
        tokenizer = Tokenizer(nb_words=self.MAX_NB_WORDS)
        tokenizer.fit_on_texts([sentences])
        self.sequences = tokenizer.texts_to_sequences([sentences])
        self.word_index = tokenizer.word_index

        seq_length = 10
        data = [m for m in self.word_index.values()]

        index_word = {}
        for w, id in list(self.word_index.items()):
            index_word[id] = w

        dataX = []
        dataY = []
        length = len(data) - seq_length
        for i in range(0, length, seq_length):
            seq_in = data[i:i + seq_length - 1]
            seq_out = data[i + seq_length]
            dataX.append(seq_in)
            dataY.append(seq_out)
        """
        raw_text = sentences
        
        chars = sorted(list(set("word telphone main")))
        
        # create mapping of unique chars to integers
        chars = sorted(list(set(raw_text)))
        char_to_int = dict((c, i) for i, c in enumerate(chars))
        int_to_char = dict((i, c) for i, c in enumerate(chars))
        # summarize the loaded data
        n_chars = len(raw_text)
        n_vocab = len(chars)
        print ("Total Characters: ", n_chars)
        print ("Total Vocab: ", n_vocab)
        # prepare the dataset of input to output pairs encoded as integers
        seq_length = 100
        dataX = []
        dataY = []
        for i in range(0, n_chars - seq_length, 1):
            seq_in = raw_text[i:i + seq_length]
            seq_out = raw_text[i + seq_length]
            dataX.append([char_to_int[char] for char in seq_in])
            dataY.append(char_to_int[seq_out])
        """

        n_patterns = len(dataX)
        print("Total Patterns: ", n_patterns)

        # reshape X to be [samples, time steps, features]
        X = np.reshape(dataX, (n_patterns, seq_length - 1, 1))
        n_vocab = len(index_word)
        # normalize
        X = X / float(n_vocab)

        # one hot encode the output variable
        y = np_utils.to_categorical(dataY)
        # define the LSTM model
        model = Sequential()
        model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
        model.add(Dropout(0.2))
        model.add(Dense(y.shape[1], activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        # define the checkpoint
        filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='loss',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min')
        callbacks_list = [checkpoint]
        # fit the model
        #model.fit(X, y, nb_epoch=20, batch_size=128, callbacks=callbacks_list)

        start = np.random.randint(0, len(dataX) - 1)
        pattern = dataX[start]
        print("Seed:")
        print("\"", ''.join([index_word[value] for value in pattern]), "\"")
        # generate characters
        for i in range(1000):
            x = np.reshape(pattern, (1, len(pattern), 1))
            x = x / float(n_vocab)
            prediction = model.predict(x, verbose=0)
            index = np.argmax(prediction)
            result = index_word[index]
            seq_in = [index_word[value] for value in pattern]
            sys.stdout.write(result)
            pattern.append(index)
            pattern = pattern[1:len(pattern)]
        print("\nDone.")

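load_embedding_base() above parses data/document_classification.txt as whitespace-separated lines of a word followed by its embedding coefficients, i.e. the word2vec text format. A minimal sketch of producing a compatible file with gensim; the toy corpus is made up, only the file name comes from the snippet:

# Write a word2vec text-format file that load_embedding_base() can parse.
# The one-line "vocab_size dim" header is skipped by its len(values) > 2 check.
from gensim.models import word2vec

sentences = [["劳动", "合同", "期限"], ["工资", "支付", "期限"]]   # toy corpus
model = word2vec.Word2Vec(sentences, min_count=1, size=100, window=5)
model.wv.save_word2vec_format("data/document_classification.txt", binary=False)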
Code Example #17
class ClauseVerifying(object):
    def __init__(self, modelname=None):

        self._document = LawDocument()
        self.clause_model = clause_training.ClauseTraining()
        self.clause_model.load_model_label()

        pass

    def load_predict_document(self, filename):
        if (filename.endswith(".txt")):
            ofile = filename
        else:
            ofile = docutonelocate.convert_file(filename)
        self._document.read_section(ofile)

        texts = []
        if len(self._document.sections) > 0:
            for section in self._document.sections:
                ss = []
                if section.title:
                    pass
                if len(section.sentences) > 0:
                    ss = [p[0] for p in section.sentences]
                    if len(ss) > 0:
                        texts.append(doc.sentencesTowords(ss))

        else:
            for s in self._document.document_header:
                texts.append(doc.sentencesTowords([s]))

        return texts

    def predict(self, filename):

        texts = self.load_predict_document(filename)

        tokenizer = Tokenizer(num_words=self.clause_model.MAX_NB_WORDS)
        tokenizer.fit_on_texts(texts)
        sequences = tokenizer.texts_to_sequences(texts)

        # create data
        data = pad_sequences(sequences,
                             maxlen=self.clause_model.MAX_SEQUENCE_LENGTH)

        dtn_logger.logger_info("PREDICT",
                               "Verification document : " + filename)
        dtn_logger.logger_info("PREDICT", "Predict Data : " + str(data.shape))

        model = md.load_json_model(self.clause_model.MODEL_NAME)
        #model.compile(loss='binary_crossentropy', optimizer='rmsprop', metrics=['accuracy'])
        model.compile(loss='binary_crossentropy',
                      optimizer=md.OPTIMIZER_ADAM,
                      metrics=['accuracy'])

        for i, s in enumerate(data):
            s = data[np.array([i])]
            preds = model.predict(s)

            n = self.sample(preds[0])
            print("*** " + self.clause_model.label_name[n] + "***")
            n = self.sample(preds[0], 0.8)
            print("*** " + self.clause_model.label_name[n] + "***")
            n = self.sample(preds[0], 0.2)
            print("*** " + self.clause_model.label_name[n] + "***")

            print(texts[i])
            if i > 5:
                break

    def sample(self, p, temperature=1.0):
        # helper function to sample an index from a probability array
        preds = np.asarray(p).astype('float64')
        preds = np.log(preds) / temperature
        exp_preds = np.exp(preds)
        preds = exp_preds / np.sum(exp_preds)
        probas = np.random.multinomial(1, preds, 1)

        mmm = np.argmax(probas)
        print(mmm)
        return mmm
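
sample() above is the usual temperature-sampling helper: log-probabilities are divided by a temperature before re-normalizing, so low temperatures sharpen the distribution toward the argmax and high temperatures flatten it. A toy illustration with made-up probabilities:

# Demonstrates the temperature effect used by sample() above.
import numpy as np

def softmax_with_temperature(p, temperature=1.0):
    preds = np.log(np.asarray(p, dtype='float64')) / temperature
    exp_preds = np.exp(preds)
    return exp_preds / np.sum(exp_preds)

probs = [0.5, 0.3, 0.2]
print(softmax_with_temperature(probs, 0.2))  # ~[0.92, 0.07, 0.01] : sharper
print(softmax_with_temperature(probs, 1.0))  # [0.5, 0.3, 0.2]     : unchanged
print(softmax_with_temperature(probs, 2.0))  # ~[0.42, 0.32, 0.26] : flatter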
Code Example #18
File: datasets.py Project: minlogiciel/docutone
class Datasets(object):
    '''
    Create data sets for all document types.

    doc2vec documents are labelled by document type.

    Results are saved to the data/dataset directory.
   
    '''
    NB_LOOP = 7
    V_ALPHA = 0.2
    V_MIN_ALPHA = 0.05
    V_ALPHA_RATE = 0.005

    def __init__(self):

        self.texts = []  # list of text samples
        self.labels_index = {}  # dictionary mapping label name to numeric id
        self.labels_files = {}  # dictionary mapping label name to number of files
        self.labels_name = {}  # dictionary mapping numeric id to label name
        self.file_label = []  # file label id
        self.labels = []  # list of label ids
        self.classifiers = []  # list of classifier
        self.law_doc = LawDocument()
        self.folder_structure = {}
        self.folder_order = []

        pass

    def get_data_file_name(self, fname, isdataset=True):
        path = os.path.join(variables.BASE_DIR, variables.MODEL_DATA_DIR)
        if not os.path.exists(path):
            os.mkdir(path)
        if isdataset:
            path = os.path.join(path, 'datasets')
            if not os.path.exists(path):
                os.mkdir(path)
        return os.path.join(path, fname)

    def get_svc_file_name(self):
        return self.get_data_file_name(variables.SVC_MODEL)

    def get_model_file_name(self):
        return self.get_data_file_name(variables.DOC_MODEL)

    def get_word_model_name(self):
        return self.get_data_file_name(variables.WORD_MODEL)

    def get_dict_file_name(self):
        return self.get_data_file_name(variables.MODEL_DICT)

    def get_mm_file_name(self):
        return self.get_data_file_name(variables.MODEL_MM)

    def get_model_list_name(self):
        return self.get_data_file_name(variables.MODEL_LSI)

    """
    All files under a given directory belong to the same classifier (document type).
    """

    def load_directory_for_document(self, path, label, label_id):

        nb_files = 0

        for fname in sorted(os.listdir(path)):

            fpath = os.path.join(path, fname)
            if os.path.isdir(fpath):

                if fname in self.folder_structure.keys():
                    sublabel = self.folder_structure[fname][0]
                    if sublabel == None:
                        sublabel = label
                else:
                    sublabel = label

                nb_files += self.load_directory_for_document(
                    fpath, sublabel, label_id)

            elif fname.lower().endswith(".txt"):
                words = document.get_document_words(fpath)
                if len(words) > 10:
                    self.texts.append(words)

                    self.classifiers.append(label)
                    self.labels.append(label_id)
                    self.file_label.append(fname)
                    nb_files += 1
            else:
                #is not text file
                pass
        return nb_files

    '''
    
    '''

    def load_document_directorie(self, fpath, label):

        if label in self.labels_index.keys():
            label_id = self.labels_index[label]
            nb = self.labels_files[label]
        else:
            label_id = len(self.labels_index) + 1
            nb = 0

        n = self.load_directory_for_document(fpath, label, label_id)
        if n > 5:
            self.labels_index[label] = label_id
            self.labels_name[label_id - 1] = label
            self.labels_files[label] = nb + n
            print(" === " + label + " : " + str(label_id) + "   === ")

    def load_directories(self, path):
        for name in sorted(os.listdir(path)):
            fpath = os.path.join(path, name)
            if os.path.isdir(fpath):
                if name in self.folder_structure.keys():
                    label, level = self.folder_structure[name]
                    if level == 0:
                        continue
                    elif not label:
                        self.load_directories(fpath)
                        continue
                else:
                    label = name

                self.load_document_directorie(fpath, label)

    def load_text_files(self, text_path):

        self.texts = []
        self.labels_index = {}
        self.labels_name = {}
        self.labels = []
        self.file_label = []

        # load defined directory type
        folder = Folder()

        # folder structure define classifier document type
        self.folder_structure = folder.load_folder_structure(text_path)

        for name in sorted(os.listdir(text_path)):
            path = os.path.join(text_path, name)
            if name != variables.TEMP_DIR and name != variables.DATA_DIR and os.path.isdir(
                    path):
                if name in self.folder_structure.keys():
                    label, level = self.folder_structure[name]
                    if level == 0:
                        continue
                    else:
                        if label:
                            self.load_document_directorie(path, label)
                        else:
                            self.load_directories(path)

    '''
    load all file from merged files
    
    '''

    def load_data_files(self, text_path):

        self.texts = []
        self.labels_index = {}
        self.labels_name = {}
        self.labels = []
        self.file_label = []

        # load defined directory type
        folder = Folder()

        # folder structure define classifier document type
        self.folder_structure = folder.load_folder_structure(text_path)

        folder_order = folder.folder_order

        prevlabel = None
        for fname in sorted(os.listdir(text_path)):
            if (folder.CORPUS_FILE_NAME in fname):
                filename = os.path.join(text_path, fname)
                docs = folder.load_corpus_file(filename)

                for doc in docs:
                    fname = doc[0][0]  # file name
                    name = doc[0][1]  # type of file
                    label = doc[0][2]  # file label
                    level = doc[0][3]  # file level in directory

                    categorie = None
                    if name in self.folder_structure.keys():
                        categorie, level = self.folder_structure[name]
                        if categorie and ';' in categorie:
                            label = categorie.split(';')[0]
                        else:
                            label = categorie

                    if not categorie:
                        savecat = None

                        if name in self.folder_structure.keys():

                            for fn in folder_order:
                                if fn == name and savecat != None:
                                    categorie = savecat
                                    if ';' in categorie:
                                        label = categorie.split(';')[0]
                                    else:
                                        label = categorie
                                    break
                                else:
                                    categorie, level = self.folder_structure[
                                        fn]
                                    if categorie:
                                        savecat = categorie

                        else:
                            categorie = doc[0][2]
                            label = categorie

                    sentences = doc[1:]
                    norm_sentences = [
                        util.normalize_sentence(s) for s in sentences
                    ]
                    words = self.law_doc.get_normalize_document_from_sentences(
                        norm_sentences, outtype=2)
                    if len(words) > 0:

                        # find same classifier
                        if label in self.labels_index.keys():
                            label_id = self.labels_index[label]
                        else:
                            label_id = len(self.labels_index) + 1

                            self.labels_index[label] = label_id
                            self.labels_name[label_id] = label

                        # add document to text
                        self.texts.append(words)
                        # add label
                        self.classifiers.append(categorie)
                        # add label id
                        self.labels.append(label_id)
                        # add file name
                        self.file_label.append(fname)

                        if label in self.labels_files.keys():
                            self.labels_files[label] += 1
                        else:
                            self.labels_files[label] = 1

                        if (prevlabel != label):
                            print(" === " + label + " : " + str(label_id) +
                                  "   === ")
                            prevlabel = label

    def load_documents(self, text_path):

        if (text_path.endswith("training_data")):
            self.load_data_files(text_path)
        else:
            self.load_text_files(text_path)

        alldocs = []
        doclists = []
        for index in range(len(self.texts)):
            words = self.texts[index]
            if len(words) > 10:

                #string = 'doc_' + str(index+1)
                docs = TaggedDocument(words, tags=[index])
                doclists.append(docs)
                alldocs.append(words)
        return alldocs, doclists

    def get_document_words(self, filename, f=None):

        f = File(filename, None, verbose=0)
        words = f.get_document_words()
        if len(words) > 10:
            return words
        else:
            return None

    def getTaggedDocuments(self, filename, index):

        words = self.get_document_words(filename)
        if words:
            return TaggedDocument(words, tags=[index])
        else:
            return None

    def TrainingDoc2Vec(self, documents, size, window, nb_loop=NB_LOOP):

        #doc to vector
        #model = doc2vec.Doc2Vec(documents, size=size, window=window)
        alpha = self.V_ALPHA
        min_alpha = self.V_MIN_ALPHA
        model = doc2vec.Doc2Vec(documents,
                                size=size,
                                window=window,
                                alpha=alpha,
                                min_alpha=min_alpha)
        #model.sort_vocab()
        #model.build_vocab(documents)

        for epoch in range(nb_loop):
            random.shuffle(documents)
            model.train(documents,
                        total_examples=len(documents),
                        epochs=1,
                        start_alpha=alpha,
                        end_alpha=min_alpha)
            alpha -= self.V_ALPHA_RATE
            min_alpha = alpha
            # decrease the learning rate
            model.alpha -= self.V_ALPHA_RATE  # decrease the learning rate
            model.min_alpha = model.alpha  # fix the learning rate, no decay
            #err, err_count, test_count, predictor = error_rate_for_model(model, documents, test_docs)
            print("epoch = %d alpha = %f\n" % (epoch, model.alpha))

        return model

    def create_dataset(self,
                       text_path,
                       min_count=2,
                       sg=0,
                       workers=1,
                       size=256,
                       window=5):
        """
            
        min_count : ignore all words with total frequency lower than this.
        sg : sg=0 CBOW, sg=1 skip-gram
        workers : number of worker threads
        size : dimensionality of the feature vectors.
        window : maximum distance between the current and predicted word within a sentence.
    
        """
        dtn_logger.logger_info("DATASET", "create dataset " + text_path)

        lists, doclists = self.load_documents(text_path)

        dictionary = corpora.Dictionary(lists)
        corpus = [dictionary.doc2bow(text) for text in lists]

        # save corpus
        corpusfname = self.get_mm_file_name()
        corpora.MmCorpus.serialize(corpusfname, corpus)

        # save dictionary
        dictfname = self.get_dict_file_name()
        dictionary.save(dictfname)

        dictfname = self.get_model_list_name()
        # initialize a model
        tfidf = models.TfidfModel(corpus, normalize=True)

        # use the model to transform vectors
        corpus_tfidf = tfidf[corpus]

        # initialize an LSI transformation, LSI 2-D space
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300)
        lsi.save(dictfname)  # same for tfidf, lda, ...

        #training doc2vec
        model = self.TrainingDoc2Vec(doclists,
                                     size=size,
                                     window=window,
                                     nb_loop=32)
        # save doc vector
        vectfname = self.get_model_file_name()
        model.save(vectfname)

        # word to vector
        model = word2vec.Word2Vec(lists,
                                  min_count=min_count,
                                  sg=sg,
                                  workers=workers,
                                  size=size,
                                  window=window)
        # save words vector
        vectfname = self.get_word_model_name()
        model.wv.save_word2vec_format(vectfname, binary=False)
        #model.sort_vocab()
        #model.build_vocab(sentences, update=False)

        # save file label
        self.save_filelabel()

        # save doc label
        self.save_doclabel()

        # save vector labels
        self.save_labelset()

        # save classifier labels
        self.save_classifierlabel()

    def load_wordvect(self):
        fname = self.get_word_model_name()
        f = codecs.open(fname, 'r', 'utf-8')
        sentences = f.read()
        sentences = sentences.split('\n')
        sentences = [s.split()[1:] for s in sentences]
        word2vec = [' '.join(s) for s in sentences if len(s) > 0]
        f.close()

        return word2vec

    def load_docvect(self):

        fname = self.get_model_file_name()
        model = doc2vec.Doc2Vec.load(fname)

        nbdocs = len(model.docvecs)
        resultlist = []
        for i in range(nbdocs):
            #string = 'doc_' + str(i+1)
            #resultlist.append(model.docvecs[string])
            vv = model.docvecs[i]
            vv = [v for v in vv]
            resultlist.append(vv)

        return resultlist

    def load_labelset(self):
        fname = self.get_data_file_name(variables.VECT_LABEL, True)
        f = codecs.open(fname, 'r', 'utf-8')

        labelSet = [int(line) for line in f if len(line.strip()) > 0]
        f.close()

        return labelSet

    def save_labelset(self):
        fname = self.get_data_file_name(variables.VECT_LABEL, True)
        f = codecs.open(fname, 'w', 'utf-8')

        for v in self.labels:
            f.write("%s\n" % (v))
        f.close()

    def load_doclabel(self):
        fname = self.get_data_file_name(variables.DOC_LABEL, True)
        f = codecs.open(fname, 'r', 'utf-8')

        labelSet = [line.split('=')[0] for line in f if len(line.strip()) > 0]
        f.close()

        return labelSet

    def save_doclabel(self):
        # save doc label
        fname = self.get_data_file_name(variables.DOC_LABEL, True)
        f = codecs.open(fname, 'w', "utf-8")
        #for k, v in labels_index.items():
        for v, k in self.labels_name.items():
            f.write("%s=%d=%d\n" % (k, v, self.labels_files[k]))
        f.close()

    def load_filelabel(self):
        fname = self.get_data_file_name(variables.FILE_LABEL, True)
        f = codecs.open(fname, 'r', 'utf-8')

        labelSet = [line.split('=')[0] for line in f if len(line.strip()) > 0]
        f.close()

        return labelSet

    def save_filelabel(self):
        # save file label
        fname = self.get_data_file_name(variables.FILE_LABEL, True)
        f = codecs.open(fname, 'w', "utf-8")
        for index in range(len(self.file_label)):
            k = self.file_label[index]
            v = self.labels[index]
            f.write("%s=%d\n" % (k, v))
        f.close()

    def load_classifierlabel(self):
        fname = self.get_data_file_name(variables.CLASSIFY_LABEL, True)
        f = codecs.open(fname, 'r', 'utf-8')

        labelSet = [line.strip() for line in f if len(line.strip()) > 0]
        f.close()

        return labelSet

    def save_classifierlabel(self):
        # save classifier label
        fname = self.get_data_file_name(variables.CLASSIFY_LABEL, True)
        f = codecs.open(fname, 'w', "utf-8")
        for v in self.classifiers:
            f.write("%s\n" % (v))
        f.close()

    def test_corpus_dictionary(self):

        dictfname = self.get_dict_file_name()
        if (os.path.exists(dictfname)):
            dictionary = corpora.Dictionary.load(dictfname)
            corpusfname = self.get_mm_file_name()
            corpus = corpora.MmCorpus(corpusfname)
            print(corpus)
        else:
            print("corpus dictionary does not exist")

    def test_word_vector_model(self):

        vectfname = self.get_word_model_name()

        #sentences = LineSentence(vectfname)
        #sentences = Text8Corpus(vectfname)
        #sentences = LineSentence('compressed_text.txt.bz2')
        #sentences = LineSentence('compressed_text.txt.gz')

        model = word2vec.Word2Vec.load_word2vec_format(vectfname, binary=False)

        print("Test word2vec most similar for 驾驶, 机动车, 交通运输")
        print(model.most_similar(positive=['驾驶']))
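
The Datasets class above is the end-to-end pipeline: it loads the text corpus, builds a gensim dictionary plus TF-IDF and LSI models, trains Doc2Vec and Word2Vec, and writes the label files next to them. A minimal driver sketch; the corpus path is an assumption based on the data/Corpus/TEXT layout used elsewhere in these examples:

# Hypothetical driver for the Datasets class above.
datasets = Datasets()
datasets.create_dataset("data/Corpus/TEXT", min_count=2, sg=0, workers=2, size=256, window=5)

# The saved models and label files can then be reloaded:
doc_vectors = datasets.load_docvect()
labels = datasets.load_labelset()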
Code Example #19
File: contract.py Project: minlogiciel/docutone
class Contract(object):
    
    '''
    create legal terms classifier
    
    input model : data/terms/template
    
    output model : data/models
    '''
       
    def __init__(self, debug=0, crf_model=True):
        
        self.texts = []         # list of legal terms texts
        self.terms_index = {}  #  mapping legal term name to numeric id
        self.terms_name = {}   #  legal term name 
        self.terms_label = []  #  mapping legal term name to label
        self.labels = []        # list of legal term label ids
        self._debug = debug
        self.seg = Segmentation()
        self.seg.load_suggest_words()
        self.lawdocument = LawDocument()
        self.clause = Clause()
        self.doc_type = None
        self.doc_path = None
        self.labor_model = True
        self.crf_model = crf_model
    
    def get_data_file_name(self, dataname, categorie='models') :
        path = variables.get_data_file_name(self.doc_path, categorie=categorie)
        if not os.path.exists(path) :
            os.mkdir(path)
        return os.path.join(path, dataname)
 
    def get_term_model_name(self) :
        return self.get_data_file_name(variables.TERM_DOC_MODEL)
        

    # term vector [0, 0, 1, 1, ...]  
    def load_term_set(self) :
        fname = self.get_data_file_name(variables.TERM_VECT)
        f = codecs.open(fname, 'r', 'utf-8')
        
        termSet = [int(line) for line in f if len(line.strip()) > 0]
        f.close()
    
        return termSet

    def save_term_set(self) :
        fname = self.get_data_file_name(variables.TERM_VECT)
        f = codecs.open(fname, 'w', 'utf-8')

        for v in self.labels :
            f.write("%s\n" % (v))
        f.close()
         
    
    
    # term name [termname=termid]
    def load_term_label(self) :
        fname = self.get_data_file_name(variables.TERM_LABEL)
        f = codecs.open(fname, 'r', 'utf-8')
        
        labelSet = [line.split('=')[0] for line in f if len(line.strip()) > 0]
        f.close()
    
        return labelSet


    def save_term_label(self) :
        fname = self.get_data_file_name(variables.TERM_LABEL)
        f = codecs.open(fname, 'w', 'utf-8')
        
        for v, k in self.terms_name.items():
            f.write("%s=%d\n" % (k, v))
        f.close()
          
    
    # term list 
    def load_term_list(self) :
        fname = self.get_data_file_name(variables.TERM_LIST)
        f = codecs.open(fname, 'r', 'utf-8')
        
        termList = [line.split('=')[0] for line in f if len(line.strip()) > 0]
        f.close()
    
        return termList
 

    def save_term_list(self) :
        fname = self.get_data_file_name(variables.TERM_LIST)
        f = codecs.open(fname, 'w', 'utf-8')
        for index in range(len (self.terms_label)):
            k = self.terms_label[index]
            v = self.labels[index]
            f.write("%s=%d\n" % (k, v))
        f.close()
 
    def _convert(self, text_path, convert=False) :   
        
        doc_path = None
        path = text_path
        if text_path.endswith("doc") :
            doc_path = text_path
            path = text_path[0:-3] + "TEXT"

        if (convert and doc_path and os.path.exists(doc_path)) :
            conv = Convert(verbose=0)
            o_file = conv.open_output(doc_path, path)
            conv.files_to_text(doc_path, o_file)    
            conv.close_output()
        return path

    
    def get_term_words(self, text) :
        
        if isinstance(text, string_types) :
            sentences = [text]
        else :
            sentences = text

        words_all_filter = self.seg.segment(sentences)[2]

        words = []
        for sentences in words_all_filter :
            for w in sentences :
                if len(w.strip()) > 0 :
                    words.append(w.strip()) 
        return words
 


    
    
    def segment_terms(self, term_sentences):
        """
        Arguments :
        
        term_sentences : term sentences (text) to segment

        returns the segmented words
        
        """
        words_all_filter = self.seg.segment(term_sentences)[2]
            
        return words_all_filter
       

    
    def get_terms(self, filename, encoding="utf-8"):
                        
        terms = [] 
         
        self.lawdocument.create_document(filename, encoding)
        
        if len(self.lawdocument.sections) > 0 :         
            for p in self.lawdocument.sections :
                
                term_sentences = []
                term_sentences.append(p.title)
                for s in p.sentences :
                    term_sentences.append(s[0]) # document sentence [s, num, type]
                terms.append(term_sentences)
        # if doc is not law document
        else :
            for p in self.lawdocument.document_header :
                terms.append([p])
            pass
        return terms
             
    
    def load_file(self, filename, encoding="utf-8"):
        
        # directory name is document type 
        ftype = os.path.basename(os.path.dirname(filename))
        
        self.clause.create_clauses(filename, encoding=encoding)

        for p in self.clause.sections :
            name = p.title
            term_sentences = []
            term_sentences.append(name)
            for s in p.sentences :
                term_sentences.append(s)
            
            # add term vector 
            self.texts.append(self.segment_terms(term_sentences))
                     
            if name in self.terms_index :
                label_id = self.terms_index[name]
            else :
                label_id = len(self.terms_index)+1
                self.terms_index[name] = label_id
                self.terms_name[label_id] = name
                        
            self.labels.append(label_id)
            
            self.terms_label.append(name+":"+ftype)
  


    def load_directory(self, path) :

        for fname in sorted(os.listdir(path)):
            fpath = os.path.join(path, fname)
            if os.path.isdir(fpath):
                if variables.noloaddir(fname) :
                    continue
                self.load_directory(fpath)
    
            elif fname.endswith(".txt"):
                self.load_file(fpath)
            else :
                #is not text file
                pass
    
    
    def load_terms(self, text_path) :
 
        self.load_directory(text_path) 

        termdocs = []   # term doc2vec
        allterms = []   # term contents

        for index in range(len(self.texts)) :
            term = self.texts[index]
            if len(term) > 0 :
                s = []
                for sentences in term :                 
                    for word in sentences :
                        s.append(word)
                    
                #string = 'doc_' + str(index+1)
                docs = TaggedDocument(s, tags = [index])         
                termdocs.append(docs)
                allterms.append(s)
        return allterms, termdocs
                 

        
    def _create_terms(self, text_path, doctype=None, min_count=2, sg=0, workers=1, size=256, window=5) :   
        """
        min_count : ignore all words with total frequency lower than this.
        sg : sg=0 CBOW, sg=1 skip-gram
        workers : number of worker threads
        size : dimensionality of the feature vectors.
        window : maximum distance between the current and predicted word within a sentence.
    
        """
        
        self.texts = []         # list of legal terms texts
        self.terms_index = {}  #  mapping legal term name to numeric id
        self.terms_name = {}   #  legal term name 
        self.terms_label = []  #  mapping legal term name to label
        self.labels = []        # list of legal term label ids
        self.doc_type = doctype
        self.doc_path = doctype
        
        path = text_path
            
    
        allterms, termdocs = self.load_terms(path)
        
        # if there is no more clauses, do nothing
        if  len(allterms) < 10 :       
            return
                
        dictionary = corpora.Dictionary(allterms)
        corpus = [dictionary.doc2bow(text) for text in allterms]
        
        # save corpus
        corpusfname = self.get_data_file_name(variables.TERM_MODEL_MM)
        corpora.MmCorpus.serialize(corpusfname, corpus) 
    
        
        # save dictionary
        dictfname = self.get_data_file_name(variables.TERM_MODEL_DICT)
        dictionary.save(dictfname)
        
    
    
        dictfname = self.get_data_file_name(variables.TERM_MODEL_LSI)        
        # initialize a model
        tfidf = models.TfidfModel(corpus, normalize=True)
            
        # use the model to transform vectors        
        corpus_tfidf = tfidf[corpus]
            
        # initialize an LSI transformation, LSI 2-D space
        lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=300) 
        lsi.save(dictfname) # same for tfidf, lda, ...
    
    
        # training doc2vec
        datasets = Datasets()
        model = datasets.TrainingDoc2Vec(termdocs, size, window, 16) 
        # save doc vector
        vectfname = self.get_term_model_name()
        model.save(vectfname)

        '''
        # word to vector 
        model = word2vec.Word2Vec(allterms, min_count=min_count, sg=sg, workers=workers, size=size, window=window)
        # save words vector
        vectfname = self.get_data_file_name(variables.TERM_WORD_MODEL)
        model.wv.save_word2vec_format(vectfname, binary=False)
        '''
        # save term list 
        self.save_term_list()
        
        # save term vector  
        self.save_term_set()
    
        # save term name 
        self.save_term_label()


    def create_crf(self, path) :   

        crf = CRF()
        if self.labor_model :
            fpath = path + "/劳动合同" 
            ftype = "劳动合同"
            crf.create_categorie_tagging(fpath, ftype)
        else :
            crf.create_crf_model()
            
    def create_terms(self, text_path, convert=False) :   
        
        path = self._convert(text_path, convert)
        
        if self.crf_model :
            self.create_crf(path)
        else :
            self._create_terms(path, doctype=None)
            for doctype in sorted(os.listdir(path)):
                fpath = os.path.join(path, doctype)
                if os.path.isdir(fpath):
                    self._create_terms(fpath, doctype=doctype)
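
create_terms() above either trains a CRF tagging model (crf_model=True) or builds per-document-type term models from a clause-segmented text corpus. A minimal call sketch; the input path is an assumption based on the "input model : data/terms/template" note in the class docstring:

# Hypothetical usage of the Contract class above.
contract = Contract(debug=1, crf_model=False)
contract.create_terms("data/terms/template/TEXT", convert=False)   # assumed directory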
Code Example #20
File: extraction2.py Project: minlogiciel/docutone
 def __init__(self):
     """
     """
     self.law_document = LawDocument()        
     self.all_keywords = util.load_legalterm_type()
Code Example #21
class Summarize(object):

    CLUSTER_THRESHOLD = 5  # Distance between words to consider

    def __init__(self, filename=None):
        """
        """
        self.law_document = LawDocument()
        self.important_word = []
        self.top_n_scored = []
        self.mean_scored = []

    def load_keywords(self):

        filename = self.law_document.get_keywords_file_name()
        f = codecs.open(filename, 'r', 'utf-8')

        self.important_word = []
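        # expected line format: keyword [flag [flag]]; a first flag of 1 tells
        # _cluster_sentences() to also count the word that follows the match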
        for line in f:
            if line.strip():
                tokens = line.strip().split(" ")
                if tokens[0].strip():
                    word = [tokens[0].strip()]

                    if len(tokens) > 1 and tokens[1].strip():
                        word.append(int(tokens[1].strip()))
                    else:
                        word.append(0)

                    if len(tokens) > 2 and tokens[2].strip():
                        word.append(int(tokens[2].strip()))

                    self.important_word.append(word)
        f.close()
        return self.important_word

    def _cluster_sentences(self, s, important_word):

        word_idx = []
        clusters = []
        # For each word in the keyword list
        for word_info in important_word:
            word = word_info[0].strip()
            n = word_info[1]
            if word:
                try:
                    index = s.index(word)
                    word_idx.append(index)
                    if n == 1:
                        index = index + 1
                        word_idx.append(index)
                except ValueError:  # w not in this particular sentence
                    pass

        # Using the word index, compute clusters by using a max distance threshold,
        # for any two consecutive words
        if len(word_idx) > 0:
            word_idx.sort()
            cluster = [word_idx[0]]
            i = 1
            while i < len(word_idx):
                if word_idx[i] - word_idx[i - 1] < self.CLUSTER_THRESHOLD:
                    cluster.append(word_idx[i])
                else:
                    clusters.append(cluster[:])
                    cluster = [word_idx[i]]
                i += 1
            clusters.append(cluster)

        return clusters

    def _score_sentences(self, sentences, important_word):
        scores = []
        sentence_idx = -1

        for [s, idx, type] in sentences:
            sentence_idx += 1
            clusters = self._cluster_sentences(s, important_word)

            if len(clusters) == 0:
                continue
            # Score each cluster. The max score for any given cluster is the score
            # for the sentence

            max_cluster_score = 0
            for c in clusters:
                significant_words_in_cluster = len(c)
                total_words_in_cluster = c[-1] - c[0] + 1
                score = 1.0 * significant_words_in_cluster \
                    * significant_words_in_cluster / total_words_in_cluster
                if score > max_cluster_score:
                    max_cluster_score = score

            scores.append((sentence_idx, max_cluster_score))

        return scores

    def analyze(self, filename, withWeight=True, encoding="utf-8"):

        self.law_document.analyze_file(filename)

        self.load_keywords()

        scored_sentences = self._score_sentences(self.law_document.sentences,
                                                 self.important_word)

        # Summarization Approach 1:
        # Filter out non-significant sentences by using the average score plus a
        # fraction of the std dev as a filter

        avg = numpy.mean([s[1] for s in scored_sentences])
        std = numpy.std([s[1] for s in scored_sentences])

        ff = avg + 0.5 * std
        self.mean_scored = []
        for (sent_idx, score) in scored_sentences:
            if score > ff:
                self.mean_scored.append((sent_idx, score))

        # Summarization Approach 2:
        # Keep only the top N ranked sentences: sort by score, best first,
        # so write_top_summarize can simply take the first N entries

        self.top_n_scored = sorted(scored_sentences, key=lambda s: s[1], reverse=True)

    def write_top_summarize(self, show_nb=5, outputfile=None, mode="a+"):
        if outputfile != None:
            f = codecs.open(outputfile, mode, 'utf-8')
            f.write(' '.join(self.law_document.document_title) + "\n")
            f.write('\n'.join(self.law_document.table_contents))
            f.write("\n\n摘要 : \n")

        else:
            f = None
            print('摘要 : ' + ' '.join(self.law_document.document_title) + "\n")

        n_sentence = 0
        for (idx, score) in self.top_n_scored:
            if n_sentence < show_nb:
                sentence = self.law_document.get_document_chapiter(idx)
                if sentence:
                    if f != None:
                        f.write(sentence + "\n\n")
                    else:
                        print(sentence)
                        print("=" * 20)

                    n_sentence += 1
            else:
                if f != None:
                    f.write("\n" + "*" * 30 + "\n\n")
                break

    def write_summarize(self, show_nb=5, outputfile=None, mode="a+"):

        if outputfile != None:
            f = codecs.open(outputfile, mode, 'utf-8')
            f.write('摘要 : \n' + ' '.join(self.law_document.document_title) +
                    "\n")
        else:
            f = None
            print('摘要 : ' + ' '.join(self.law_document.document_title) + "\n")

        self.law_document.init_sentence_index()
        n_sentence = 0
        for (idx, score) in self.mean_scored:
            if n_sentence < show_nb:
                sentence = self.law_document.get_document_chapiter(idx)
                if sentence:
                    if f != None:
                        f.write(sentence)
                    else:
                        print(sentence)
                        print(" " * 20)
                    n_sentence += 1

            else:
                if f != None:
                    f.write("*" * 30)
                break
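The sentence score computed in _score_sentences above is Luhn's significance factor: keyword hits are grouped into clusters (gaps smaller than CLUSTER_THRESHOLD), each cluster scores the square of its hit count divided by its span in words, and the sentence keeps its best cluster score; analyze then keeps sentences scoring above mean + 0.5 * std. A small standalone illustration with hypothetical hit positions:

# Worked example of the Luhn-style cluster score used in _score_sentences.
word_idx = [3, 5, 6, 14]          # hypothetical positions of significant words in one sentence
CLUSTER_THRESHOLD = 5             # same default as Summarize

# group hits that are fewer than CLUSTER_THRESHOLD words apart
clusters, cluster = [], [word_idx[0]]
for pos in word_idx[1:]:
    if pos - cluster[-1] < CLUSTER_THRESHOLD:
        cluster.append(pos)
    else:
        clusters.append(cluster)
        cluster = [pos]
clusters.append(cluster)           # -> [[3, 5, 6], [14]]

# score = (significant words in cluster)^2 / words spanned by the cluster
scores = [len(c) ** 2 / (c[-1] - c[0] + 1) for c in clusters]
print(scores)                      # [2.25, 1.0]; the sentence score is max(scores) = 2.25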
コード例 #22
0
class TermsVerification(object):

    SIMU_SEUIL = 0.6

    def __init__(self):

        self.contract = Contract(0)

        self.verified_terms = {}
        self._filetime = None
        self.fullname = None
        self.filename = None
        self._title = None
        self._contract_date = None
        self.keywords = []

        self.segment = Segmentation()
        self.document = LawDocument()

    def _init_terms_table(self, filename, termtype):
        self.categorie = termtype
        # get file name

        self.fullname = filename
        self.filename = os.path.basename(filename).split('.')[0]
        # get file created date
        self._filetime = util.get_creation_file_date(filename)

        # init the verified-terms table
        self.verified_terms = {}
        self.keywords = dtn_sentence.get_document_categorie(termtype)
        for key in self.keywords:
            self.verified_terms[key] = ExtractData(key, termtype)

        dtn_logger.logger_info("VERIFY", "%s (%s)" % (filename, termtype))

    def _load_terms_model(self, doctype=None):

        self.contract.doc_path = doctype

        self.term_names = self.contract.load_term_label()
        self.term_set = self.contract.load_term_set()
        self.term_list = self.contract.load_term_list()

        fname = self.contract.get_term_model_name()
        self.model = doc2vec.Doc2Vec.load(fname)

    def similar_term(self, term_words, termtype):

        tname = None
        ttype = None
        simu = 0.0
        docvec = self.model.infer_vector(doc_words=term_words)
        sims = self.model.docvecs.most_similar(positive=[docvec], topn=5)

        for i in range(len(sims)):
            n_term = int(sims[i][0])
            f_simu = sims[i][1]
            if f_simu > self.SIMU_SEUIL:
                if (n_term >= len(self.term_list)):
                    continue
                '''term = self.term_names[self.term_set[n_term]-1]'''
                term_name = self.term_list[n_term]
                if ':' in term_name:
                    tab = term_name.split(':', 1)
                    if tab[1] == termtype:
                        if tname == None:
                            tname = tab[0]
                            ttype = tab[1]
                            simu = f_simu
                            break

                elif term_name == termtype:
                    tname = term_name
                    ttype = term_name
                    simu = f_simu
                    break

            else:
                break

        return tname, ttype, simu

    def verify_term(self, text, termtype):

        term_words = self.contract.get_term_words(text)

        return self.similar_term(term_words, termtype)

    def _add_verified_sentences(self, termname, n_start, end_char, simu):

        nl = n_start
        st = self.document.norm_sentences[nl]
        ps = self.document.parser_sentence(st)
        if ps:
            st = ps[1]
            if ps[1][-1] != ' ' and ps[2][0] != ' ':
                st += ' '
            st += ps[2]

        self.verified_terms[termname].add_value(st, simu)
        while len(st) == 0 or st[-1] != end_char:
            nl += 1
            st = self.document.norm_sentences[nl]
            self.verified_terms[termname].add_value(st, 1)

    def get_terms(self, filename, filetype):
        ''' get the document terms, section by section '''

        if (filename.endswith(".txt")):
            ofile = filename
        else:
            ofile = docutonelocate.convert_file(filename)

        #lawdocument.create_document(ofile, filetype)
        self.document.read_section(ofile)

        self._title = self.document.document_name
        self._contract_date = self.document.document_date
        if self._title:
            if '文件名称' in self.keywords:
                self.verified_terms['文件名称'].add_value(self._title, 1)
            elif '合同名称' in self.keywords:
                self.verified_terms['合同名称'].add_value(self._title, 1)
        if self._contract_date:
            if '签约日期' in self.keywords:
                self.verified_terms['签约日期'].add_value(self._contract_date, 1)
            elif '签发日期' in self.keywords:
                self.verified_terms['签发日期'].add_value(self._contract_date, 1)
            elif '合同日期' in self.keywords:
                self.verified_terms['合同日期'].add_value(self._contract_date, 1)

        terms = []
        '''
        prev_sentence = ''
        for s in ld.document_header :
            prev_sentence += s
            if ld._is_sentence_end(s) :
                terms.append([prev_sentence])
                prev_sentence = ''
        if prev_sentence :
            terms.append([prev_sentence])
        '''
        nb = len(self.document.sections)
        if nb > 0:
            index = 0
            while index < nb:
                p = self.document.sections[index]
                index += 1
                ''' if the section title matches a term name, add its sentences to the verified table '''
                if p.title:
                    termname = dtn_sentence.get_keywords_by_name(
                        p.title, self.keywords)
                    if termname:
                        if len(p.sentences) > 0:
                            for s in p.sentences:
                                if isinstance(s, str):
                                    self.verified_terms[termname].add_value(
                                        s, 1)
                                else:
                                    s_line = s[0]
                                    self._add_verified_sentences(
                                        termname, s[1], s_line[-1], 1)

                        while index < nb:
                            sp = self.document.sections[index]
                            index += 1
                            if sp.level > p.level:
                                for s in sp.sentences:
                                    if isinstance(s, str):
                                        self.verified_terms[
                                            termname].add_value(s, 1)
                                    else:
                                        s_line = s[0]
                                        self._add_verified_sentences(
                                            termname, s[1], s_line[-1], 1)
                            else:
                                ''' back to prev section '''
                                index -= 1
                                break

                if len(p.sentences) > 0:
                    terms.append(p.sentences)

        return terms

    def _verified_clauses(self, filename, termtype):

        terms = self.get_terms(filename, termtype)
        for term in terms:
            sentences = [s[0] for s in term]

            n_start = term[0][1]
            end_char = sentences[-1][-1]

            term_words = self.contract.get_term_words(sentences)

            tname, ttype, simu = self.similar_term(term_words, termtype)
            if ttype != None and tname != None:
                if ttype == termtype:
                    if tname in self.verified_terms.keys():
                        '''
                        for s in sentences :
                            self.verified_terms[tname].add_value(s, simu)
                        '''
                        self._add_verified_sentences(tname, n_start, end_char,
                                                     simu)

    def create_contract_model(self, fpath):

        self.contract.create_terms(fpath)

    def get_contract_date(self):

        return time.strftime("%Y-%m-%d   %H:%M:%S",
                             time.gmtime(self._filetime))

    def verify_document(self, filename, doctype, termtype):

        # init clause table
        self._init_terms_table(filename, termtype)

        # load the legal terms training model
        self._load_terms_model(doctype)

        self._verified_clauses(filename, termtype)

        sorted_list = []

        for key in self.keywords:
            if key in self.verified_terms.keys():
                term = self.verified_terms[key].term_value
                if (len(term) > 0):
                    sorted_list.append((key, 1, term))
                else:
                    sorted_list.append((key, 0))

        return sorted_list

    def _to_html_text(self, term_list):

        lists = []
        for elem in term_list:
            if len(elem) == 3:
                name, _, data = elem
            else:
                continue

            text = ""
            if len(data) > 0:
                for v, s_simu in data:
                    if s_simu > 0:  # is term name and find term string
                        s = dtn_sentence.get_sentence(v)
                        ss = dtn_document.law_document.parser_sentence(s)
                        text += '<p>'
                        if ss:
                            text += '<b>' + ss[1] + ' ' + ss[2] + '</b></p>'
                            text += '<p>'  # empty line
                        else:
                            text += s
                        text += '</p>'

                lists.append([name, text])
            else:
                lists.append([name, text])

        return lists

    def to_json(self, term_list):
        result = {}
        '''
        result["FILE"] = [self.fullname]
        result["TEMPS"] = [str(self._filetime)]
        result["TTILE"] = [self._title]
        '''

        result["filename"] = [self.filename, self.fullname, self.categorie]
        result["result"] = self._to_html_text(term_list)
        #result["result"] = self._to_list(lists)

        docutonejson.print_json(result)

    def example0(self):

        fname = config.TEST_PATH + "/劳动合同/Chanel劳动合同.docx.txt"
        ftype = "劳动合同"
        term_list = self.verify_document(fname, None, ftype)
        self.to_json(term_list)

    def example1(self):

        fname = config.TEST_PATH + "/章程/华能国际电力股份有限公司章程.pdf.txt"
        ftype = "有限责任公司章程"
        term_list = self.verify_document(fname, None, ftype)
        self.to_json(term_list)

    def example2(self):

        fname = config.TEST_PATH + "/章程/华能国际电力股份有限公司章程.docx.txt"
        ftype = "有限责任公司章程"
        term_list = self.verify_document(fname, None, ftype)
        self.to_json(term_list)
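The core of similar_term above is a doc2vec lookup: the clause is inferred into a vector, the most similar training paragraphs are retrieved, and only hits above SIMU_SEUIL (0.6) whose stored "term name:document type" label matches the requested type are accepted. The sketch below mirrors that API flow with a toy corpus and hypothetical labels; with so little data the similarities are not meaningful, it only shows the call pattern.

# Minimal doc2vec similarity sketch (toy data; mirrors the docvecs accessor used above).
from gensim.models import doc2vec

corpus = [
    doc2vec.TaggedDocument(words=["工资", "按月", "支付"], tags=["0"]),   # tag "0" -> term_list[0]
    doc2vec.TaggedDocument(words=["合同", "期限", "三年"], tags=["1"]),   # tag "1" -> term_list[1]
]
term_list = ["劳动报酬:劳动合同", "合同期限:劳动合同"]    # index -> "term name:document type"

model = doc2vec.Doc2Vec(corpus, vector_size=50, min_count=1, epochs=40)

SIMU_SEUIL = 0.6
clause_words = ["每月", "支付", "工资"]
docvec = model.infer_vector(doc_words=clause_words)
for tag, simu in model.docvecs.most_similar(positive=[docvec], topn=5):
    name, _, doctype = term_list[int(tag)].partition(":")
    if simu > SIMU_SEUIL and doctype == "劳动合同":
        print(name, simu)          # first acceptable hit is taken as the verified term
        break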
コード例 #23
0
ファイル: extraction2.py プロジェクト: minlogiciel/docutone
class Extraction(object):
    

    def __init__(self):
        """
        """
        self.law_document = LawDocument()        
        self.all_keywords = util.load_legalterm_type()
       

    
    def score_sentences(self, sentences, important_word):

        scores = {}

        for s in sentences:
            for word in important_word:
                word = word[0]
                if word in s:
                    index = s.index(word)
                    index = index + len(word)

                    # keep the text that follows the keyword
                    sentence = s[index:]
                    # if the keyword was already seen, keep the earliest occurrence
                    if word in scores:
                        n = scores[word][1]
                        if index < n:
                            scores[word] = [sentence, index]
                    else:
                        scores[word] = [sentence, index]
                    # only match the first keyword found in this sentence
                    break

        return scores



    

    def extraction(self, filename, doctype='营业执照', encoding="utf-8"):
    
        document = self.law_document.get_segment_document(filename)[0]
        sentences = []
        for sentence in document :
            s = ""
            for word in sentence :
                if word in util.sentence_delimiters :
                    s += word + ' '
                else :
                    s += word
            sentences.append(s)
        
                
        important_words = self.all_keywords[doctype]
        scored_sentences = self.score_sentences(sentences, important_words)
        
        return scored_sentences
                

    def extraction_documents(self, fpath, doctype='营业执照') :
    
        data = {}
        for name in sorted(os.listdir(fpath)):
            if name.endswith('.txt') :
                fname = os.path.join(fpath, name)
                data[fname] = self.extraction(fname, doctype) 

        return data
 
    def write_result(self, scored_sentences, important_word, outputfile=None):
        # write to the output file when one is given, otherwise print to stdout
        f = open(outputfile, 'a+', encoding='utf-8') if outputfile else None

        for word in important_word:
            word = word[0]
            if word in scored_sentences:
                sentence = scored_sentences[word][0]

                if f is not None:
                    f.write(word + " : " + sentence + "\n")
                else:
                    print(word + " : " + sentence)

        if f is not None:
            f.close()

        print("=" * 40)
 
    def write_documents_info(self, data, doctype) :
    
        important_words = self.all_keywords[doctype]
        for fname, scored_sentences in data.items() :
            self.write_result(scored_sentences, important_words)

    
    
        
    def extraction_ner(self, st, filename, doctype='营业执照') :
    
        document = self.law_document.get_segment_document(filename)[0]

        prevtype = 'O'
        string = ""
        for sentence in document:
            sttag = st.tag(sentence)
            for word, tag in sttag:
                if tag == 'GPE' or tag == 'ORG' or tag == 'PERSON':
                    if prevtype == 'O' or prevtype == tag:
                        string += word
                        prevtype = tag
                    else:
                        # entity type changed: flush the previous entity, start a new one
                        print('%s %s ' % (string, prevtype))
                        string = word
                        prevtype = tag
                else:
                    if len(string) > 0:
                        print('%s %s ' % (string, prevtype))
                    string = ""
                    prevtype = 'O'


    def test_ner(self, fpath, doctype='营业执照') :
        from nltk.tag.stanford import StanfordNERTagger
        st = StanfordNERTagger('D:/WORK/docutone/java/classifiers/chinese.misc.distsim.crf.ser.gz', 'D:/WORK/docutone/java/lib/stanford-ner-3.7.0.jar')
     
        data = {}
        for name in sorted(os.listdir(fpath)):
            if name.endswith('.txt') :
                fname = os.path.join(fpath, name)
                data[fname] = self.extraction_ner(st, fname, doctype) 

        return data

    def test_polyglot1(self) :
        import polyglot
        from polyglot.text import Text, Word
     
        text = Text("Bonjour, Mesdames.")
        print("Language Detected: Code={}, Name={}\n".format(text.language.code, text.language.name))

        text = Text("第一条  机动车第三者责任保险合同(以下简称本保险合同)由保险条款、投保单、保险单、批单和特别约定共同组成。 "
                    "本保险合同争议处理适用中华人民共和国法律。")
        #print(text.entities)
        """
        print("{:<16}{}".format("Word", "POS Tag")+"\n"+"-"*30)
        for word, tag in text.pos_tags:
            print(u"{:<16}{:>2}".format(word, tag))
        """
        word = Word("Obama", language="en")
        word = Word("中华人民共和国", language="zh")
        print("Neighbors (Synonyms) of {}".format(word) + "\n" + "-" * 30)
        for w in word.neighbors:
            print("{:<16}".format(w))

        print("\n\nThe first 10 dimensions out of the {} dimensions\n".format(word.vector.shape[0]))
        print(word.vector[:10])
            
    def test_polyglot(self) :
        from polyglot.mapping import Embedding
        embeddings = Embedding.load("/home/rmyeid/polyglot_data/embeddings2/en/embeddings_pkl.tar.bz2")
        
        neighbors = embeddings.nearest_neighbors("green")
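For reference, the keyword-anchored extraction above can be driven as in the following sketch; the input path is a placeholder, and the returned dictionary maps each keyword to the text that follows its left-most occurrence together with its offset.

# Hypothetical usage of Extraction (the input path is a placeholder).
ex = Extraction()
scored = ex.extraction("../data/licenses/sample_license.txt", doctype='营业执照')
for keyword, (value, offset) in scored.items():
    # value is the text following the keyword; offset is its end position in the sentence
    print(keyword, ":", value)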
コード例 #24
0
class Text4Sentences(object):
    def __init__(self, stopwords_file=None):
        """
        Keyword arguments:
        stopwords_file :    stopwords file name
        """

        self.pagerank_config = {
            'alpha': 0.85,
        }

        self.seg = Segmentation(stopwords_file=stopwords_file)
        self.law_document = LawDocument()
        self.sentences = None
        self.words_no_filter = None  # 2-D list: one list of words per sentence
        self.words_no_stop_words = None
        self.words_all_filters = None

        self.key_sentences = None

    def create_segment_sentences(self,
                                 sentences,
                                 sim_func=util.get_similarity):
        """
        Keyword arguments:

        sentences : sentences of the document

        sim_func :  function used to compute the similarity between two sentences
        """

        self.words_no_filter, self.words_no_stop_words, self.words_all_filters = self.seg.segment(
            sentences)
        self.sentences = sentences

        self.key_sentences = util.sort_sentences(
            sentences=self.sentences,
            words=self.words_no_filter,
            sim_func=sim_func,
            pagerank_config=self.pagerank_config)

    def analyze_file(self, filename, encoding='utf-8'):
        """
        Keyword arguments:

        filename : input file name
        """

        f = self.law_document.create_document(filename=filename)

        self.create_segment_sentences(
            self.law_document.get_segmented_document())

    def get_key_sentences(self, num=6):
        """
        num : number of sentences to keep for the summary.

        Return: the most important sentences.
        """

        result = []
        count = 0
        for item in self.key_sentences:
            if count >= num:
                break
            result.append(item)
            count += 1
        return result

    def show_key_sentences(self):

        for item in self.get_key_sentences(2):
            [sentence, idx, stype] = item['sentence']
            print(sentence)
            print("=" * 20)
            print(self.law_document.get_document_chapiter(idx, chapiter=True))
            print("--" * 20)
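Usage of Text4Sentences is a two-step call: analyze a file (segmentation plus PageRank over the sentence-similarity graph), then ask for the top-ranked sentences, as show_key_sentences does above. A short sketch with a placeholder input path:

# Hypothetical usage (the input path is a placeholder).
t4s = Text4Sentences(stopwords_file=None)
t4s.analyze_file("../data/Corpus/TEXT/劳动合同/sample.txt")
for item in t4s.get_key_sentences(num=3):
    sentence, idx, stype = item['sentence']
    print(sentence)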
コード例 #25
0
    def test_imdb(self):

        from keras.callbacks import ModelCheckpoint
        from keras.utils import np_utils

        law_document = LawDocument()

        fname = os.path.join(
            variables.BASE_DIR,
            'data/Corpus/TEXT/合同、协议/劳动合同/1. 劳动合同- 最终版.DOC.txt')
        sentences = law_document.get_sentences(fname)

        # tokenizer
        tokenizer = Tokenizer(nb_words=self.MAX_NB_WORDS)
        tokenizer.fit_on_texts([sentences])
        self.sequences = tokenizer.texts_to_sequences([sentences])
        self.word_index = tokenizer.word_index

        seq_length = 10
        # use the tokenized document (the word-id sequence), not the vocabulary ids
        data = self.sequences[0]

        index_word = {}
        for w, id in list(self.word_index.items()):
            index_word[id] = w

        dataX = []
        dataY = []
        length = len(data) - seq_length
        for i in range(0, length, seq_length):
            seq_in = data[i:i + seq_length - 1]
            seq_out = data[i + seq_length - 1]  # the id that follows the input window
            dataX.append(seq_in)
            dataY.append(seq_out)
        """
        raw_text = sentences
        
        chars = sorted(list(set("word telphone main")))
        
        # create mapping of unique chars to integers
        chars = sorted(list(set(raw_text)))
        char_to_int = dict((c, i) for i, c in enumerate(chars))
        int_to_char = dict((i, c) for i, c in enumerate(chars))
        # summarize the loaded data
        n_chars = len(raw_text)
        n_vocab = len(chars)
        print ("Total Characters: ", n_chars)
        print ("Total Vocab: ", n_vocab)
        # prepare the dataset of input to output pairs encoded as integers
        seq_length = 100
        dataX = []
        dataY = []
        for i in range(0, n_chars - seq_length, 1):
            seq_in = raw_text[i:i + seq_length]
            seq_out = raw_text[i + seq_length]
            dataX.append([char_to_int[char] for char in seq_in])
            dataY.append(char_to_int[seq_out])
        """

        n_patterns = len(dataX)
        print("Total Patterns: ", n_patterns)

        # reshape X to be [samples, time steps, features]
        X = np.reshape(dataX, (n_patterns, seq_length - 1, 1))
        n_vocab = len(index_word)
        # normalize
        X = X / float(n_vocab)

        # one hot encode the output variable
        y = np_utils.to_categorical(dataY)
        # define the LSTM model
        model = Sequential()
        model.add(LSTM(256, input_shape=(X.shape[1], X.shape[2])))
        model.add(Dropout(0.2))
        model.add(Dense(y.shape[1], activation='softmax'))
        model.compile(loss='categorical_crossentropy', optimizer='adam')
        # define the checkpoint
        filepath = "weights-improvement-{epoch:02d}-{loss:.4f}.hdf5"
        checkpoint = ModelCheckpoint(filepath,
                                     monitor='loss',
                                     verbose=1,
                                     save_best_only=True,
                                     mode='min')
        callbacks_list = [checkpoint]
        # fit the model
        #model.fit(X, y, nb_epoch=20, batch_size=128, callbacks=callbacks_list)

        start = np.random.randint(0, len(dataX) - 1)
        pattern = dataX[start]
        print("Seed:")
        print("\"", ''.join([index_word[value] for value in pattern]), "\"")
        # generate characters
        for i in range(1000):
            x = np.reshape(pattern, (1, len(pattern), 1))
            x = x / float(n_vocab)
            prediction = model.predict(x, verbose=0)
            index = np.argmax(prediction)
            result = index_word[index]
            seq_in = [index_word[value] for value in pattern]
            sys.stdout.write(result)
            pattern.append(index)
            pattern = pattern[1:len(pattern)]
        print("\nDone.")
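The data preparation in test_imdb is a plain sliding-window next-word setup: each window of seq_length - 1 token ids is an input, the id that follows is the target, and the inputs are reshaped to [samples, time steps, 1] and scaled. A toy illustration of that shaping (NumPy only, stride 1 for clarity instead of the stride of seq_length used above):

# Toy illustration of the sliding-window next-word shaping used in test_imdb.
import numpy as np

ids = list(range(1, 21))                      # stand-in for a tokenized document
seq_length = 5

dataX, dataY = [], []
for i in range(0, len(ids) - seq_length):
    dataX.append(ids[i:i + seq_length - 1])   # seq_length - 1 input ids
    dataY.append(ids[i + seq_length - 1])     # the next id is the target

X = np.reshape(dataX, (len(dataX), seq_length - 1, 1)).astype(float)
X /= float(max(ids))                          # normalize, as above, by the vocabulary size
print(X.shape)                                # (15, 4, 1)
print(dataX[0], "->", dataY[0])               # [1, 2, 3, 4] -> 5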