Example No. 1
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = os.path.join(args.save_dir, args.model)  # base directory for saved weights
    if args.load_model is not None:
        load_path = os.path.join(args.save_dir, args.load_model)
        print('load_path:', load_path)

    ##### read data #####
    dm = DataManager()
    w2v_path = os.path.join(args.save_dir, 'word2vec')
    print(w2v_path)

    if args.action == 'train':
        print('Loading data...')
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
        dm.add_test_data('test_data', args.test_path)

        test_data = dm.get_test_data('test_data')
        train_data = dm.get_data('train_data')
        semi_data = dm.get_data('semi_data')

        all_text = np.concatenate((train_data[0], semi_data[0], test_data),
                                  axis=0)
        print('Number of all_text:', all_text.shape[0])
        #print('Text sample:',all_text[0])

        print('Converting texts to word sequences...')
        text2word = []

        with_filter = False
        if with_filter:
            for text in all_text:
                text2word.append(
                    text_to_word_sequence(
                        text,
                        filters='!"#$%&()*+,-./:;<=>?@[\\]^_`{|}~\t\n',
                        lower=True,
                        split=" "))
        else:
            for text in all_text:
                text2word.append(
                    text_to_word_sequence(text,
                                          filters='',
                                          lower=True,
                                          split=" "))

        print('Word sequence sample:', text2word[0])

        if os.path.exists(w2v_path):
            print('Loading w2v_model from %s' % w2v_path)
            word_vec = gensim.models.Word2Vec.load(w2v_path)
            print('Vocabulary size:', len(word_vec.wv.vocab))
        else:
            print('Building word2vec model...')
            word_vec = gensim.models.Word2Vec(text2word,
                                              size=128,
                                              min_count=15)
            print('Vocabulary size:', len(word_vec.wv.vocab))
            if not os.path.isdir(args.save_dir):
                os.makedirs(args.save_dir)
            if not os.path.exists(w2v_path):
                word_vec.save(w2v_path)

        print('Converting train_data to vectors of word indices...')
        index_data = []
        i = 0
        for line in train_data[0]:
            index_data.append([])
            for word in line.split():
                if word in word_vec.wv:
                    #print(word ,word_vec.wv.vocab[word].index)
                    index_data[i].append(word_vec.wv.vocab[word].index)
            i += 1

        embedding_matrix = np.zeros((len(word_vec.wv.vocab), 128))

        for i in range(len(word_vec.wv.vocab)):
            embedding_vector = word_vec.wv[word_vec.wv.index2word[i]]
            if embedding_vector is not None:
                embedding_matrix[i] = embedding_vector

        index_data = pad_sequences(index_data, args.max_length)
    else:
        if os.path.exists(w2v_path):
            print('Loading w2v_model from %s' % w2v_path)
            word_vec = gensim.models.Word2Vec.load(w2v_path)
            print('Vocabulary size:', len(word_vec.wv.vocab))
            embedding_matrix = np.zeros((len(word_vec.wv.vocab), 128))

            for i in range(len(word_vec.wv.vocab)):
                embedding_vector = word_vec.wv[word_vec.wv.index2word[i]]
                if embedding_vector is not None:
                    embedding_matrix[i] = embedding_vector
        else:
            raise FileNotFoundError(
                'Cannot load w2v model from %s; please train a w2v model first.'
                % w2v_path)

    #print ('get Tokenizer...')
    #if args.load_model is not None:
    #    # read exist tokenizer
    #    dm.load_tokenizer(os.path.join(load_path,'token.pk'))
    #else:
    #    # create tokenizer on new data
    #    dm.tokenize(args.vocab_size)
    #
    #if not os.path.isdir(save_path):
    #    os.makedirs(save_path)
    #if not os.path.exists(os.path.join(save_path,'token.pk')):
    #    dm.save_tokenizer(os.path.join(save_path,'token.pk'))
#
# mat_train_data = dm.tokenizer.texts_to_matrix(train_data[0], mode='count')
# mat_test_data = dm.tokenizer.texts_to_matrix(test_data, mode='count')

# convert to sequences
#dm.to_sequence(args.max_length)

# initialize model
    print('Initializing model...')
    #model = bow_model(args,mat_train_data)
    model = simpleRNN(args, embedding_matrix)
    print(model.summary())

    if args.load_model is not None:
        if args.action == 'train':
            print('Warning: loading an existing model and continuing training')
        #path = os.path.join(load_path,'model.h5')
        if os.path.exists(load_path):
            print('load model from %s' % load_path)
            model.load_weights(load_path)
        else:
            raise ValueError("Can't find the file %s" % load_path)
    elif args.action == 'test':
        print('Warning: testing without loading any model')

# training
    if args.action == 'train':
        #(X,Y),(X_val,Y_val) = dm.split_data('train_data', args.val_ratio)
        X, X_val, Y, Y_val = train_test_split(index_data,
                                              train_data[1],
                                              test_size=0.33,
                                              random_state=42)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X,
                            Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

        print(history.history.keys())
        print('Val_acc:', history.history['val_acc'])
        print('Train_acc:', history.history['acc'])

# testing
    elif args.action == 'test':
        dm.add_test_data('test_data', args.test_path)
        test_data = dm.get_test_data('test_data')

        # Convert test texts to vectors of word indices
        index_test_data = []
        i = 0
        for line in test_data:
            index_test_data.append([])
            for word in line.split():
                if word in word_vec.wv:
                    #print(word ,word_vec.wv.vocab[word].index)
                    index_test_data[i].append(word_vec.wv.vocab[word].index)
            i += 1

        index_test_data = pad_sequences(index_test_data, args.max_length)

        if not os.path.exists(args.result_path):
            os.makedirs(args.result_path)

        csv_path = os.path.join(args.result_path, 'prediction.csv')

        print("Predicting testing data...")
        Y_pred = model.predict(index_test_data)
        Y_pred = np.round(Y_pred)
        print('Saving result csv to', csv_path)
        with open(csv_path, 'w') as f:
            f.write('id,label\n')
            for i, v in enumerate(Y_pred):
                f.write('%d,%d\n' % (i, int(v[0])))

# semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')

        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # self-training: repeat 5 iterations
        for i in range(5):
            # label the semi-data
            semi_pred = model.predict(semi_all_X,
                                      batch_size=2048,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X,
                                semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=256,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % path)
Example No. 2
def main():
    # limit gpu memory usage
    def get_session(gpu_fraction):
        gpu_options = tf.GPUOptions(
            per_process_gpu_memory_fraction=gpu_fraction)
        return tf.Session(config=tf.ConfigProto(gpu_options=gpu_options))

    K.set_session(get_session(args.gpu_fraction))

    save_path = args.save_dir
    if args.load_model is not None:
        load_path = args.save_dir

    #####read data#####
    dm = DataManager()
    print('Loading data...')
    if args.action == 'train':
        dm.add_data('train_data', train_path, True)
    elif args.action == 'semi':
        dm.add_data('train_data', train_path, True)
        dm.add_data('semi_data', semi_path, False)
        dm.add_test_data('test_data', test_path)
    else:
        dm.add_test_data('test_data', test_path)

    # prepare tokenizer
    print('get Tokenizer...')
    if args.load_model is not None:
        # read exist tokenizer
        dm.load_tokenizer(os.path.join(load_path, 'token.pk'))
    else:
        # create tokenizer on new data
        dm.tokenize(args.vocab_size)

    if not os.path.isdir(save_path):
        os.makedirs(save_path)
    if not os.path.exists(os.path.join(save_path, 'token.pk')):
        dm.save_tokenizer(os.path.join(save_path, 'token.pk'))

    # convert to sequences
    dm.to_sequence(args.max_length)

    # initialize model
    print('Initializing model...')
    model = simpleRNN(args)
    print(model.summary())

    if args.load_model is not None:
        path = os.path.join(load_path, 'model.h5')
        if os.path.exists(path):
            print('load model from %s' % path)
            model.load_weights(path)
        else:
            raise ValueError("Can't find the file %s" % path)

    # training
    if args.action == 'train':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        history = model.fit(X,
                            Y,
                            validation_data=(X_val, Y_val),
                            epochs=args.nb_epoch,
                            batch_size=args.batch_size,
                            callbacks=[checkpoint, earlystopping])

    # testing
    elif args.action == 'test':
        print(model.summary())
        [test_x] = dm.get_data('test_data')
        classes = model.predict(test_x, batch_size=32)
        with open(args.output_path, "w", encoding='utf-8') as f:
            spamwriter = csv.writer(f, delimiter=',')
            spamwriter.writerow(['id', 'label'])
            for i in range(len(classes)):
                if classes[i][0] < 0.5:
                    result = 0
                else:
                    result = 1
                spamwriter.writerow([str(i), str(result)])

    # semi-supervised training
    elif args.action == 'semi':
        (X, Y), (X_val, Y_val) = dm.split_data('train_data', args.val_ratio)

        [semi_all_X] = dm.get_data('semi_data')
        [test_x] = dm.get_data('test_data')
        semi_all_X = np.concatenate((semi_all_X, test_x), axis=0)
        earlystopping = EarlyStopping(monitor='val_acc',
                                      patience=3,
                                      verbose=1,
                                      mode='max')

        save_path = os.path.join(save_path, 'model.h5')
        checkpoint = ModelCheckpoint(filepath=save_path,
                                     verbose=1,
                                     save_best_only=True,
                                     save_weights_only=True,
                                     monitor='val_acc',
                                     mode='max')
        # self-training: repeat 16 iterations
        for i in range(16):
            # label the semi-data
            semi_pred = model.predict(semi_all_X,
                                      batch_size=1024,
                                      verbose=True)
            semi_X, semi_Y = dm.get_semi_data('semi_data', semi_pred,
                                              args.threshold,
                                              args.loss_function)
            semi_X = np.concatenate((semi_X, X))
            semi_Y = np.concatenate((semi_Y, Y))
            print('-- iteration %d  semi_data size: %d' % (i + 1, len(semi_X)))
            # train
            history = model.fit(semi_X,
                                semi_Y,
                                validation_data=(X_val, Y_val),
                                epochs=2,
                                batch_size=args.batch_size,
                                callbacks=[checkpoint, earlystopping])

            if os.path.exists(save_path):
                print('load model from %s' % save_path)
                model.load_weights(save_path)
            else:
                raise ValueError("Can't find the file %s" % path)