Example #1
def get_model():
    print("Reading text file...")
    dir_path = os.getcwd() + text_file
    txt_file = UFile(dir_path)
    structure_obj = structure.Structure(txt_file.text)
    word_list = structure_obj.prepare_pure_list_of_words()
    word_list.append(unknown)
    vocabulary = sorted(list(set(word_list)))
    word_to_int, int_to_word = equivalent_word_to_int(vocabulary)
    structure_obj.generate_tags_dict()
    tags_dict = collections.OrderedDict(sorted(structure_obj.tags.items()))
    tag_to_int, int_to_tag = equivalent_tag_to_int(tags_dict)
    semantic_vector_obj = sv.SemanticVector(structure_obj)
    word2vec = prepare_word_2_vec(semantic_vector_obj)
    print "Start Modeling..."
    embedding_matrix = prepare_embedding(word_list, word2vec, word_to_int)

    nb_classes = len(vocabulary)
    model = modeling(embedding_matrix, len(word_list), 0.05, nb_classes)

    print(len(word2vec.wv.vocab))
    train_X, train_y = prepare_multi_layer_train_sequence(tag_to_int, word_to_int, word_list,
                                                          structure_obj.sentences_obj, len(vocabulary), is_sparse=True)
    test_X = prepare_test_sequences(tag_to_int, word_to_int)
    train_model(model, train_X, train_y, 1, 128, int_to_word, test_X, structure_obj, tag_to_int)
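
Note: equivalent_word_to_int and equivalent_tag_to_int are not defined on this page. Judging by the inline word_to_int / int_to_word dictionaries built in tags_model (Example #8 below), a minimal sketch would be:

def equivalent_word_to_int(vocabulary):
    # map each word to its index in the sorted vocabulary, and back
    word_to_int = dict((w, i) for i, w in enumerate(vocabulary))
    int_to_word = dict((i, w) for i, w in enumerate(vocabulary))
    return word_to_int, int_to_word

def equivalent_tag_to_int(tags_dict):
    # same idea for the tag dictionary; iterating a dict yields its keys
    tag_to_int = dict((t, i) for i, t in enumerate(tags_dict))
    int_to_tag = dict((i, t) for i, t in enumerate(tags_dict))
    return tag_to_int, int_to_tag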
Example #2
    def model(self):
        struct = structure.Structure(self.file.text)
        seq_length = 15
        word_list = struct.prepare_pure_list_of_words()
        # compute the vocabulary size
        vocabulary = sorted(list(set(word_list)))
        vocab_length = len(vocabulary)
        struct.generate_tags_dict()
        # semantic modeling
        semantic = StructureModel.semantic_model(struct)

        # tags modeling
        tag_dict, tag_model = StructureModel.tags_model(struct, seq_length)


        # data preparation
        # tags data
        tag_list = struct.tagged_text.split()
        tag_set = sorted(list(set(tag_list)))
        tags_array, tags_to_int, int_to_tags, tagsX, tagsY = StructureModel.data_preparation(tag_set, seq_length, tag_list)
        # words data

        words_array, words_to_int, int_to_words, wordsX, wordsY = StructureModel.data_preparation(vocabulary, seq_length, word_list)

        nb_patterns = len(wordsX)
        print('nb_patt', nb_patterns)
        # word modeling
        word_model = StructureModel.word_model(struct, seq_length, vocab_length, nb_patterns)

        model = StructureModel.combine_model(struct, tag_model, word_model, seq_length, vocab_length)
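
StructureModel.data_preparation is not shown either. Going by its call sites above and the windowing loop in tags_model (Example #8), a plausible reconstruction of its five return values is:

import numpy

def data_preparation(symbol_set, seq_length, symbol_list):
    # hypothetical reconstruction: index maps plus sliding-window sequences;
    # the first return value is assumed to be the symbol set as an array
    to_int = dict((s, i) for i, s in enumerate(symbol_set))
    int_to = dict((i, s) for i, s in enumerate(symbol_set))
    dataX, dataY = [], []
    for i in range(0, len(symbol_list) - seq_length):
        dataX.append([to_int[s] for s in symbol_list[i:i + seq_length]])
        dataY.append(to_int[symbol_list[i + seq_length]])
    return numpy.asarray(symbol_set), to_int, int_to, dataX, dataY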
Example #3
def get_model():
    print("Reading text file...")
    txt_file = UFile(text_file)
    chars, char_to_int, int_to_char = discover_characters(txt_file.text)
    structure_obj = structure.Structure(txt_file.text)
    word_list = structure_obj.prepare_pure_list_of_words()
    vocabulary = sorted(list(set(word_list)))
    word_to_int, int_to_word = equivalent_word_to_int(vocabulary)
    semantic_vector_obj = sv.SemanticVector(structure_obj)
    word2vec = prepare_word_2_vec(semantic_vector_obj)
    print "Start Modeling..."
    embedding_matrix = prepare_embedding(word_list, word2vec, word_to_int)

    nb_classes = len(vocabulary)

    model = word2vec_model(embedding_matrix, len(word_list), 0.05, nb_classes,
                           len(chars))

    print(len(word2vec.wv.vocab))
    train_X, train_y = prepare_train_sequences_for_sparse(
        word_to_int, word_list, structure_obj.sentences_obj)
    # replace the word-level targets with character-level targets
    train_y = generate_sequence_character(chars, char_to_int,
                                          structure_obj.sentences_obj,
                                          len(train_X))
    train_model(model, train_X, train_y, 1, 128, int_to_word, word2vec,
                word_to_int)
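
discover_characters (used in the example above) is likewise not included. By analogy with the word-level mappings, it presumably builds a sorted character vocabulary plus index maps, something like:

def discover_characters(text):
    # hypothetical reconstruction, mirroring the word-level index maps
    chars = sorted(list(set(text)))
    char_to_int = dict((c, i) for i, c in enumerate(chars))
    int_to_char = dict((i, c) for i, c in enumerate(chars))
    return chars, char_to_int, int_to_char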
Example #4
    def model(self):
        struct = structure.Structure(self.file.text)
        seq_length = 7
        word_list = struct.prepare_pure_list_of_words()
        # compute the vocabulary size
        vocabulary = sorted(list(set(word_list)))
        vocab_length = len(vocabulary)
        struct.generate_tags_dict()
        # semantic modeling
        semantic = StructureModel.semantic_model(struct, seq_length, w2v_size)
        StructureModel.word_model(struct, seq_length, semantic.model, word_list, vocabulary, vocab_length)
Example #5
    def load_test_data(cls, seq_length, word_to_int):
        test_data_file = UFile('test_hafez.txt')
        test_data_structure = structure.Structure(test_data_file.text)
        test_data_word_list = test_data_structure.prepare_pure_list_of_words()

        dataX = []
        for i in range(0, len(test_data_word_list) - seq_length):
            seq_in = test_data_word_list[i:i + seq_length]
            tempX = []
            for word in seq_in:
                if word in word_to_int:
                    tempX.append(word_to_int[word])
                else:
                    # unknown words fall back to index 0
                    tempX.append(0)
            dataX.append(tempX)
        return dataX
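
A hypothetical use of load_test_data, reusing the reshape-and-normalize steps from tags_model (Example #8). It assumes load_test_data is a classmethod of StructureModel, and that model, word_to_int, and seq_length come from the surrounding training code:

import numpy

dataX = StructureModel.load_test_data(seq_length, word_to_int)
x = numpy.reshape(dataX, (len(dataX), seq_length, 1))
x = x / float(len(word_to_int))
predictions = model.predict(x, verbose=0)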
Example #6
def prepare_test_sequences(tag2int, word2int):
    print "preparing test sequences"
    txt_file = UFile(os.getcwd() + test_text_file)
    structure_obj = structure.Structure(txt_file.text)
    word_list = structure_obj.prepare_pure_list_of_words()
    structure_obj.generate_tags_dict()
    vocabulary = sorted(list(set(word_list)))

    test_tagX = prepare_tag_train_sequences(tag2int,
                                            structure_obj.sentences_obj)
    test_word_X, test_word_Y = prepare_train_sequences(
        word2int,
        word_list,
        structure_obj.sentences_obj,
        len(vocabulary),
        is_test=True)

    return [test_tagX, test_word_X]
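
prepare_test_sequences returns a two-element list, which matches a model with separate tag and word input branches (as combine_model in Example #2 suggests). A hypothetical call against such a trained model:

test_tagX, test_word_X = prepare_test_sequences(tag_to_int, word_to_int)
predictions = model.predict([test_tagX, test_word_X], verbose=0)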
Example #7
def get_model():
    print("Reading text file...")
    dir_path = os.getcwd() + text_file
    txt_file = UFile(dir_path)
    structure_obj = structure.Structure(txt_file.text)
    word_list = structure_obj.prepare_pure_list_of_words()
    vocabulary = sorted(list(set(word_list)))
    word_to_int, int_to_word = equivalent_word_to_int(vocabulary)
    semantic_vector_obj = sv.SemanticVector(structure_obj)
    word2vec = prepare_word_2_vec(semantic_vector_obj)
    print "Start Modeling..."
    embedding_matrix = prepare_embedding(word_list, word2vec, word_to_int)

    nb_classes = len(vocabulary)
    model = word2vec_model(embedding_matrix, len(word_list), 0.05, nb_classes)

    print(len(word2vec.wv.vocab))
    train_X, train_y = prepare_train_sequences_for_sparse(
        word_to_int, word_list, structure_obj.sentences_obj)

    train_model(model, train_X, train_y, 1, 128, int_to_word, word2vec,
                word_to_int)
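
prepare_embedding is also undefined here. The usual pattern for turning a trained gensim Word2Vec model into a Keras embedding matrix, which is presumably what it does, looks roughly like this (the zero rows for out-of-vocabulary words are an assumption):

import numpy

def prepare_embedding(word_list, word2vec, word_to_int):
    # hypothetical sketch: one row per vocabulary entry, filled with the
    # word2vec vector when the word is known, left as zeros otherwise
    # (word_list matches the call sites above but is not needed here)
    matrix = numpy.zeros((len(word_to_int), word2vec.wv.vector_size))
    for word, idx in word_to_int.items():
        if word in word2vec.wv.vocab:
            matrix[idx] = word2vec.wv[word]
    return matrix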
Example #8
    def tags_model(cls, structure, seq_length, word2vec):
        total = 0
        for t in structure.sentences_obj:
            total += t.sentence_len

        avg = total / float(len(structure.sentences_obj))
        print("average length of sentence", avg)

        tags_dict = collections.OrderedDict(sorted(structure.tags.items()))
        tags_len = len(tags_dict)

        word_list = structure.prepare_pure_list_of_words()
        vocabulary = sorted(list(set(word_list)))

        word_to_int = dict((c, i) for i, c in enumerate(vocabulary))
        int_to_word = dict((i, c) for i, c in enumerate(vocabulary))
        tag_to_int = dict((c, i) for i, c in enumerate(tags_dict))
        int_to_tag = dict((i, c) for i, c in enumerate(tags_dict))

        dataX = []
        wordsX = []
        dataY = []
        tagged_text = structure.tagged_text.split()
        n_tags_in_text = len(tagged_text)

        for i in range(0, n_tags_in_text - seq_length, 1):
            seq_in = tagged_text[i:i + seq_length]
            word_in = word_list[i:i + seq_length]
            seq_out = tagged_text[i + seq_length]
            dataX.append([tag_to_int[char] for char in seq_in])
            wordsX.append([word_to_int[word] for word in word_in])
            dataY.append(tag_to_int[seq_out])
        n_patterns = len(dataX)

        # reshape X to be [samples, time steps, features]
        X = numpy.reshape(dataX, (n_patterns, seq_length, 1))
        # normalize
        X = X / float(tags_len)
        # one hot encode the output variable
        y = np_utils.to_categorical(dataY)

        print(y.shape)
        # define the LSTM model
        tag_model = Sequential()
        nn = 16

        tag_model.add(
            GRU(nn * 4,
                return_sequences=True,
                input_shape=(X.shape[1], X.shape[2])))
        tag_model.add(Dropout(0.02))

        tag_model.add(GRU(nn * 3, return_sequences=True))
        tag_model.add(Dropout(0.02))

        tag_model.add(GRU(nn * 2, return_sequences=True))
        tag_model.add(Dropout(0.02))

        tag_model.add(GRU(nn * 1, return_sequences=False))
        tag_model.add(Dropout(0.02))

        # softmax output over the tag classes
        tag_model.add(Dense(y.shape[1], activation='softmax'))

        tag_model.compile(loss='categorical_crossentropy', optimizer='rmsprop')
        # testing
        for rn in range(1):
            print(rn)
            tag_model.fit(X, y, nb_epoch=1,
                          batch_size=512)  # , callbacks=callbacks_list)
            # pick a random seed
            start = numpy.random.randint(0, len(dataX) - 1)
            pattern = dataX[start]
            word_pattern = wordsX[start]
            print "Seed:"
            # print "\"", ' '.join([int_to_tag[value] for value in pattern]), "\""
            print "\"", ' '.join(
                [int_to_word[value] for value in word_pattern]), "\""
            rs = []
            for i in range(1):
                x = numpy.reshape(pattern, (1, len(pattern), 1))
                x = x / float(tags_len)
                prediction = tag_model.predict(x, verbose=0)

                # index = numpy.argmax(prediction[0])
                print(prediction)
                print('size: ', len(prediction))
                index = StructureModel.sample(prediction[0], 2.0)

                result = int_to_tag[index]
                # use the last five words of the seed window as context
                word_window = [int_to_word[value] for value in word_pattern[-5:]]
                StructureModel.find_nearest_word(word2vec, word_window, result,
                                                 structure)
                # sys.stdout.write(result)
                # sys.stdout.write(" ")
                rs.append(index)
                # slide the window: drop the oldest tag, append the new one
                pattern.append(index)
                pattern = pattern[1:]
            print("\nDone.")