def main(unused_argv):
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model
        shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data
    df = pd.read_csv(DATA_SET_FILE, header=None)
    # random shuffle (sample returns a copy, so reassign and reset the index)
    df = df.sample(frac=1).reset_index(drop=True)
    train_df = df[0:TRAINING_DATA_SIZE]
    test_df = df.drop(train_df.index)

    # column 2 - news description, column 0 - class
    x_train = train_df[2]
    x_test = test_df[2]
    y_train = train_df[0]
    y_test = test_df[0]

    # tokenize sentences
    x_train = [word_tokenize(s) for s in x_train.tolist()]
    x_test = [word_tokenize(s) for s in x_test.tolist()]
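    # (word_tokenize is NLTK's tokenizer; it requires the 'punkt' tokenizer
    # data, installed once via nltk.download('punkt'))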

    # stemming words
    x_train = stemWords(x_train)
    x_test = stemWords(x_test)
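    # stemWords is defined elsewhere in this repo; a minimal sketch of what it
    # is assumed to do (stem each token and re-join into one string), using
    # NLTK's PorterStemmer (the names below are assumptions, not this repo's):
    #   from nltk.stem import PorterStemmer
    #   stemmer = PorterStemmer()
    #   def stemWords(tokenized_docs):
    #       return [' '.join(stemmer.stem(w.lower()) for w in tokens)
    #               for tokens in tokenized_docs]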

    # process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    LOGGER.debug('Total words: %d', n_words)

    # saving n_words and vocab_processor:
    # later stages must reuse the same vocabulary processor so that each word
    # maps to the same index, and news_cnn_model needs n_words itself
    with open(VARS_FILE, 'wb') as f:  # needs to be opened in binary mode
        pickle.dump(n_words, f)
    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)
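    # A sketch of the assumed loading-side counterpart (not shown here), so a
    # serving process maps words to the same ids:
    #   with open(VARS_FILE, 'rb') as f:
    #       n_words = pickle.load(f)
    #   vocab_processor = learn.preprocessing.VocabularyProcessor.restore(
    #       VOCAB_PROCESSOR_SAVE_FILE)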

    # build model
    classifier = learn.Estimator(model_fn=news_cnn_model.generate_cnn_model(
        N_CLASSES, n_words),
                                 model_dir=MODEL_OUTPUT_DIR)

    # train and predict
    classifier.fit(x_train, y_train, steps=STEPS)

    # evaluate model
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]

    score = metrics.accuracy_score(y_test, y_predicted)
    LOGGER.info('Accuracy: {0:f}'.format(score))
Example No. 2
def main(unused_argv):
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model
        shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Load the training and testing data sets.
    # Each row has the format: topic,title,description,source
    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:400]
    test_df = df.drop(train_df.index)

    # x is the news title, y is the news topic (class)
    x_train = train_df[1]
    y_train = train_df[0]
    x_test = test_df[1]
    y_test = test_df[0]

    # Process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    # After fit_transform, x_train is converted to the following form:
    # array([[   1,    2,    3, ...,    0,    0,    0],
    #       [   1,    8,    9, ...,    0,    0,    0],
    #       [  17,    1,   18, ...,    0,    0,    0],
    #       ...,
    #       [2112, 2113, 1417, ...,    0,    0,    0],
    #       [2120,   49, 2121, ...,    0,    0,    0],
    #       [2123, 1895, 2124, ...,    0,    0,    0]])
    # Each news title is converted from a string into a sequence of word ids,
    # so the 400 titles become 400 integer sequences.
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    # There are 2127 unique words in total
    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Save the total number of unique words and the vocab_processor to files
    with open(VARS_FILE, 'wb') as f:  # needs to be opened in binary mode
        pickle.dump(n_words, f)

    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build the model
    classifier = learn.Estimator(model_fn=news_cnn_model.generate_cnn_model(
        N_CLASSES, n_words),
                                 model_dir=MODEL_OUTPUT_DIR)

    # Train
    classifier.fit(x_train, y_train, steps=STEPS)

    # Predict on the test set to evaluate generalization performance
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]

    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
Example No. 3
def main(unused_argv):
    if REMOVE_PREVIOUS_MODEL:
        if os.path.exists(MODEL_OUTPUT_DIR):
            # Remove old model
            shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data
    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:700]
    # x - news title, y - class
    x_train = train_df[2]
    # y_train has shape [number of entries in x_train]
    y_train = train_df[0]

    test_df = df.drop(train_df.index)
    x_test = test_df[2]
    y_test = test_df[0]

    # Process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    # fit_transform: vocab_processor learns the word-to-id vocabulary from x_train;
    # result has shape [number of entries in x_train, MAX_DOCUMENT_LENGTH]
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    # transform: only words that already appeared in x_train keep their ids;
    # result has shape [number of entries in x_test, MAX_DOCUMENT_LENGTH]
    x_test = np.array(list(vocab_processor.transform(x_test)))
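    # For example (hypothetical input), after the fit above:
    #   list(vocab_processor.transform(['words never seen during fit']))
    # yields one row of length MAX_DOCUMENT_LENGTH in which every word that
    # did not appear in x_train is mapped to id 0.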

    # number of unique words in x_train
    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Saving n_words and vocab_processor:
    with open(VARS_FILE, 'wb') as f:  # needs to be opened in binary mode
        pickle.dump(n_words, f)

    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model
    classifier = learn.Estimator(model_fn=news_cnn_model.generate_cnn_model(
        N_CLASSES, n_words),
                                 model_dir=MODEL_OUTPUT_DIR)

    # Train and predict
    classifier.fit(x_train, y_train, steps=STEPS)

    # Evaluate model
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]

    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
Example No. 4
def main(unused_argv):
    print('hello')
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model
        shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data
    df = pd.read_csv(DATA_SET_FILE)
    df['class'] = df['class'].map(CLASS_ENCODING)
    df['title'].fillna('Untitled', inplace=True)
    num_train = int(len(df) * TRAIN_TEST_SPLIT)
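    # CLASS_ENCODING and TRAIN_TEST_SPLIT are constants defined elsewhere in
    # this module; assumed shapes, with hypothetical values:
    #   CLASS_ENCODING = {'Politics': 0, 'Technology': 1, 'Sports': 2}
    #   TRAIN_TEST_SPLIT = 0.8  # fraction of rows used for training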
    train_df = df[0:num_train]
    test_df = df.drop(train_df.index)

    # x - news title, y - class
    # use only title to train the topic model
    x_train = train_df['title']
    y_train = train_df['class']
    x_test = test_df['title']
    y_test = test_df['class']

    # Process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Saving n_words and vocab_processor:
    with open(VARS_FILE, 'wb') as f:  # needs to be opened in binary mode.
        pickle.dump(n_words, f)

    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model
    # This is to create customized estimator
    classifier = learn.Estimator(model_fn=news_cnn_model.generate_cnn_model(
        N_CLASSES, n_words),
                                 model_dir=MODEL_OUTPUT_DIR)

    # Train and predict
    classifier.fit(x_train, y_train, steps=STEPS)

    # Evaluate model
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]

    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
Example No. 5
def main(unused_argv):
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model
        shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data
    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:400]
    test_df = df.drop(train_df.index)

    # x - news title, y - class
    x_train = train_df[1]
    y_train = train_df[0]
    x_test = test_df[1]
    y_test = test_df[0]
    # x_train = train_df[1] + ". " + train_df[2]
    # x_train = x_train.astype(str)
    # y_train = train_df[0]
    # x_test = test_df[1] + ". " + test_df[2]
    # x_test = x_test.astype(str)
    # y_test = test_df[0]

    # Process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Saving n_words and vocab_processor:
    with open(VARS_FILE, 'wb') as f:  # needs to be opened in binary mode.
        pickle.dump(n_words, f)

    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model
    classifier = learn.Estimator(model_fn=news_cnn_model.generate_cnn_model(
        N_CLASSES, n_words),
                                 model_dir=MODEL_OUTPUT_DIR)

    # Train and predict
    classifier.fit(x_train, y_train, steps=STEPS)

    # Evaluate model
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]

    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
Example No. 6
def loadModel():
    global classifier
    classifier = learn.Estimator(model_fn=news_cnn_model.generate_cnn_model(
        N_CLASSES, n_words),
                                 model_dir=MODEL_DIR)
    df = pd.read_csv('../data/labeled_news.csv', header=None)

    train_df = df[0:400]
    x_train = train_df[1]
    x_train = np.array(list(vocab_processor.transform(x_train)))
    y_train = train_df[0]
    classifier.evaluate(x_train, y_train)

    print('Model updated.')
Example No. 7
def loadModel():
    global classifier
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_DIR)

    df = pd.read_csv('../data/labeled_news.csv', header=None)
    # We have to call evaluate or predict at least once to make the restored Estimator work.
    train_df = df[0:400]
    x_train = train_df[1]
    x_train = np.array(list(vocab_processor.transform(x_train)))
    y_train = train_df[0]
    classifier.evaluate(x_train, y_train)

    print("Model updated!")
Example No. 8
def loadModel():
    global classifier
    classifier = estimator.SKCompat(estimator.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_DIR
    ))
    df = pd.read_csv(CSV_FILE, header=None)

    train_df = df[0:1]
    x_train = train_df[1]
    x_train = np.array(list(vocab_processor.transform(x_train)), dtype=int)
    y_train = np.array(train_df[0], dtype=int)
    classifier.score(x_train, y_train)

    print('Model updated')
Example No. 9
def loadModel():
    global classifier
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_DIR)
    # Prepare training and testing
    df = pd.read_csv(os.path.join(os.path.dirname(__file__), '..', 'labeled_news.csv'), header=None)

    train_df = df[0:1]
    x_train = train_df[1]
    x_train = np.array(list(vocab_processor.transform(x_train)))
    y_train = train_df[0]
    classifier.evaluate(x_train, y_train)

    print "Model updated."
Example No. 10
def loadModel():
    global classifier
    classifier = learn.Estimator(model_fn=news_cnn_model.generate_cnn_model(
        number_classes, n_words, learning_rate),
                                 model_dir=MODEL_DIR)
    # Prepare training and testing
    df = pd.read_csv(DATA_FILE, header=None)
    # TODO: fix this until https://github.com/tensorflow/tensorflow/issues/5548 is solved.
    # We have to call evaluate or predict at least once to make the restored Estimator work.
    test_df = df[0:testing_index_end]
    x_test = test_df[1]
    x_test = np.array(list(vocab_processor.transform(x_test)))
    y_test = test_df[0]
    classifier.evaluate(x_test, y_test)

    print "Model updated."
    logging.info('news_topic_modeling: model updated')
Example No. 11
def loadModel():
    global classifier
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(number_classes, n_words, learning_rate),
        model_dir=MODEL_DIR)
    # Prepare training and testing
    df = pd.read_csv(DATA_FILE, header=None)
    # TODO: fix this until https://github.com/tensorflow/tensorflow/issues/5548 is solved.
    # We have to call evaluate or predict at least once to make the restored Estimator work.
    test_df = df[0:testing_index_end]
    x_test = test_df[1]
    x_test = np.array(list(vocab_processor.transform(x_test)))
    y_test = test_df[0]
    classifier.evaluate(x_test, y_test)

    print "Model updated."
    logging.info('news_topic_modeling: model updated')
def loadModel():
    global classifier
    classifier = learn.Estimator(model_fn=news_cnn_model.generate_cnn_model(
        N_CLASSES, n_words),
                                 model_dir=MODEL_DIR)
    # Prepare training and testing
    df = pd.read_csv('../data/labeled_news.csv', header=None)

    # TODO: fix this until https://github.com/tensorflow/tensorflow/issues/5548 is solved.
    # We have to call evaluate or predict at least once to make the restored Estimator work.
    train_df = df[0:400]
    x_train = train_df[1]
    x_train = np.array(list(vocab_processor.transform(x_train)))
    y_train = train_df[0]
    classifier.evaluate(x_train, y_train)

    print("Model update.")
Example No. 13
def main(unused_argv):
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model
        shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data
    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:400]
    test_df = df.drop(train_df.index)

    # x - news title, y - class
    x_train = train_df[1]
    y_train = train_df[0]
    x_test = test_df[1]
    y_test = test_df[0]

    # Process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Saving n_words and vocab_processor:
    with open(VARS_FILE, 'wb') as f:  # needs to be opened in binary mode
        pickle.dump(n_words, f)

    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_OUTPUT_DIR)

    # Train and predict
    classifier.fit(x_train, y_train, steps=STEPS)

    # Evaluate model
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]

    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
Example No. 14
def loadModel():
    global classifier
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_DIR)
    # Prepare training and testing
    df = pd.read_csv('../data/labeled_news.csv', header=None)

    # TODO: fix this until https://github.com/tensorflow/tensorflow/issues/5548 is solved.
    # We have to call evaluate or predict at least once to make the restored Estimator work.
    train_df = df[0:400]
    x_train = train_df[1]
    x_train = np.array(list(vocab_processor.transform(x_train)))
    y_train = train_df[0]
    classifier.evaluate(x_train, y_train)

    print "Model update."
def main(unused_argv):
    if REMOVE_PREVIOUS_MODEL:
        print("Removing previous model...")
        shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:400]
    test_df = df.drop(train_df.index)

    # x - title
    x_train = train_df[1]
    x_test = test_df[1]
    # y - classes
    y_train = train_df[0]
    y_test = test_df[0]

    vocab = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
    # print(x_train)
    x_train = np.array(list(vocab.fit_transform(x_train)))
    x_test = np.array(list(vocab.transform(x_test)))

    n_words = len(vocab.vocabulary_)
    print("Total words: %d" % n_words)
    # save the vocabulary
    with open(VARS_FILE, 'wb') as f:
        pickle.dump(n_words, f)
    vocab.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model
    model = learn.Estimator(model_fn=news_cnn_model.generate_cnn_model(
        N_CLASSES, n_words),
                            model_dir=MODEL_OUTPUT_DIR)

    # train model
    model.fit(x_train, y_train, steps=STEPS)

    # evaluate model
    y_predict = [p['class'] for p in model.predict(x_test, as_iterable=True)]
    model_score = metrics.accuracy_score(y_test, y_predict)
    print("Accuracy of the model: {0:f}".format(model_score))
Example No. 16
def loadModel():
    global classifier
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(N_CLASSES, n_words),
        model_dir=MODEL_DIR)

    # TODO: fix this until https://github.com/tensorflow/tensorflow/issues/5548 is solved.
    # We have to call evaluate or predict at least once to make the restored Estimator work.
    # the below is just a dummy evaluate
    df = pd.read_csv('../data/labeled_news1.csv')
    df['class'] = df['class'].map(CLASS_ENCODING)
    df['title'].fillna('Untitled', inplace=True)
    num_train = int(len(df) * TRAIN_TEST_SPLIT)
    train_df = df[0:num_train]
    x_train = train_df['title']
    y_train = train_df['class']
    # Process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    classifier.evaluate(x_train, y_train)
    print("Model update.")
Example No. 17
def main(unused_argv):
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model
        shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data
    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:400]
    test_df = df.drop(train_df.index)
    # x - news title, y - class
    x_train = train_df[1]
    y_train = train_df[0]
    x_test = test_df[1]
    y_test = test_df[0]
    '''
    ### brian 07/11/2017
    news = pd.read_csv("../training_data/labeled_news_title2.csv")

    news['TITLE'] = news['TITLE'].apply(nltk.word_tokenize)

    news['DESCRIPTION'] = news['DESCRIPTION'].apply(nltk.word_tokenize)

    news['DESCRIPTION'] = news['DESCRIPTION'] + news['TITLE']

    news['TEXT'] = [normalize_text(s) for s in news['DESCRIPTION']]

    # pull the data into vectors
    vectorizer = CountVectorizer()
    x = vectorizer.fit_transform(news['TEXT'])

    encoder = LabelEncoder()
    y = encoder.fit_transform(news['CATEGORY'])

    # split into train and test sets
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2)

    nb = MultinomialNB()
    nb.fit(x_train, y_train)

    with open(MODEL_OUTPUT_FOLDER, 'wb') as f:
        pickle.dump(nb, f)
        print "save NB model"
    with open(MODEL_FEATURE_FOLDER, 'wb') as f1:
        pickle.dump(vectorizer.vocabulary_, f1)
        print "save Vocabulary feature"
    print "nb score: ", nb.score(x_test, y_test)

    #loaded_model = pickle.load(open(MODEL_OUTPUT_FOLDER, 'rb'))
    #print "load model score ",loaded_model.score(x_test, y_test)
    '''

    # Process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Saving n_words and vocab_processor:
    with open(VARS_FILE, 'wb') as f:  # needs to be opened in binary mode
        pickle.dump(n_words, f)

    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model
    classifier = learn.Estimator(model_fn=news_cnn_model.generate_cnn_model(
        N_CLASSES, n_words),
                                 model_dir=MODEL_OUTPUT_DIR)
    # Train and predict
    classifier.fit(x_train, y_train, steps=STEPS)
    # Evaluate model
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]
    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
Example No. 18
def main(unused_argv):
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model
        shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data
    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:3300]
    test_df = df.drop(train_df.index)

    # x - news title, y - class
    x_train = train_df[1]
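    # Strip non-ASCII characters (anything outside \x00-\x7F) from the titles.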
    x_train = x_train.str.replace('[^\x00-\x7F]', '')

    #####################################
    '''
    x_train = train_df[2]
    x_train = x_train.str.replace('[^\x00-\x7F]','')
    tokenizer = RegexpTokenizer(r"\w+")
    stemmer = PorterStemmer()
    #wnl = WordNetLemmatizer()

    for i in range(0, 3000):
        x_train[i] = str(x_train[i])
        x_train[i] = tokenizer.tokenize(x_train[i])
        x_train[i] = list(word for word in x_train[i] if word not in stopwords.words('english'))
        x_train[i] = [stemmer.stem(word) for word in x_train[i]]
        #x_train[i] = [wnl.lemmatize(word) for word in x_train[i]]
        x_train[i] = " ".join(str(word) for word in x_train[i])
    '''
    ###########################################################

    y_train = np.array(train_df[0], dtype=int)
    x_test = test_df[1]
    y_test = np.array(test_df[0], dtype=int)

    # Process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Saving n_words and vocab_processor:
    with open(VARS_FILE, 'wb') as f:  # needs to be opened in binary mode
        pickle.dump(n_words, f)
    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model
    classifier = estimator.SKCompat(
        estimator.Estimator(model_fn=news_cnn_model.generate_cnn_model(
            N_CLASSES, n_words),
                            model_dir=MODEL_OUTPUT_DIR,
                            config=learn.RunConfig(save_checkpoints_secs=10,
                                                   save_summary_steps=10)))
    # Set up logging for predictions
    tensors_to_log = {"prob": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=100)

    # Train and predict
    classifier.fit(x_train,
                   y_train,
                   batch_size=BATCH,
                   steps=STEPS,
                   monitors=[logging_hook])

    # Configure the accuracy metric (named so the dict does not shadow the
    # sklearn `metrics` module used elsewhere in these examples)
    eval_metrics = {
        "accuracy":
        learn.MetricSpec(metric_fn=tf.metrics.accuracy, prediction_key="class")
    }

    # Evaluate the model
    eval_results = classifier.score(x=x_test, y=y_test, metrics=eval_metrics)
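    # Report the evaluation results.
    print(eval_results)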
Example No. 19
def main(unused_argv):
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model
        shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data
    df = pd.read_csv(DATA_SET_FILE, header=None)
    df = df.sample(frac=1).reset_index(drop=True)
    stemmer = SnowballStemmer('english')
    for i in range(0, len(df)):
        print(i)
        df_line = str(df.loc[i, 1]) + ' ' + str(df.loc[i, 2]) + ' ' + str(
            df.loc[i, 3])
        df_line = df_line.translate(str.maketrans('', '', string.punctuation))
        df_line = df_line.lower()
        df_tokens = word_tokenize(df_line)
        df_filtered_tokens = []
        for word in df_tokens:
            if word not in stopwords.words('english'):
                df_filtered_tokens.append(stemmer.stem(word))
        df_filtered_line = ' '.join(df_filtered_tokens)
        #print df_filtered_line
        df.loc[i, 1] = df_filtered_line
        #df.loc[i, 2] = df.loc[i, 3]

    train_df = df[0:training_index_end]
    test_df = df.drop(train_df.index)

    # x - news title, y - class
    x_train = train_df[1]
    y_train = train_df[0]
    x_test = test_df[1]
    y_test = test_df[0]

    # Process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        max_document_length, min_frequency, None, None)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Saving n_words and vocab_processor:
    with open(VARS_FILE, 'wb') as f:  # needs to be opened in binary mode
        pickle.dump(n_words, f)

    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model
    classifier = learn.Estimator(model_fn=news_cnn_model.generate_cnn_model(
        number_classes, n_words, learning_rate),
                                 model_dir=MODEL_OUTPUT_DIR)

    # Train and predict
    classifier.fit(x_train, y_train, steps=steps)

    # Evaluate model
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]

    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
Example No. 20
def main(unused_argv):
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model
        shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data
    df = pd.read_csv(DATA_SET_FILE, header=None)
    train_df = df[0:400]
    test_df = df.drop(train_df.index)

    # x - news description, y - class
    x_train = train_df[2]
    y_train = train_df[0]
    x_test = test_df[2]
    y_test = test_df[0]

    # tokenize sentences
    x_train = [word_tokenize(sentence) for sentence in x_train.tolist()]
    x_test = [word_tokenize(sentence) for sentence in x_test.tolist()]

    # Stemming words.
    norm_x_train = []
    norm_x_test = []
    for tokens in x_train:
        stemmed_tokens = [
            stemmer.stem(w.lower()) for w in tokens if w not in stop_words
        ]
        norm_sentence = ' '.join(stemmed_tokens)
        norm_x_train.append(norm_sentence)

    for tokens in x_test:
        stemmed_tokens = [
            stemmer.stem(w.lower()) for w in tokens if w not in stop_words
        ]
        norm_sentence = ' '.join(stemmed_tokens)
        norm_x_test.append(norm_sentence)

    x_train = norm_x_train
    x_test = norm_x_test

    # Process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    LOGGER.debug('Total words: %d', n_words)

    # Saving n_words and vocab_processor:
    with open(VARS_FILE, 'wb') as f:  # needs to be opened in binary mode.
        pickle.dump(n_words, f)

    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model
    classifier = learn.Estimator(model_fn=news_cnn_model.generate_cnn_model(
        N_CLASSES, n_words),
                                 model_dir=MODEL_OUTPUT_DIR)

    # Train and predict
    classifier.fit(x_train, y_train, steps=STEPS)

    # Evaluate model
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]

    score = metrics.accuracy_score(y_test, y_predicted)
    LOGGER.info('Accuracy: {0:f}'.format(score))
Example No. 21
def main(unused_argv):
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model
        shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data
    df = pd.read_csv(DATA_SET_FILE, header=None)
    df = df.sample(frac=1).reset_index(drop=True)
    stemmer = SnowballStemmer('english')
    for i in range(0, len(df)):
        print(i)
        df_line = str(df.loc[i, 1]) + ' ' + str(df.loc[i, 2]) + ' ' + str(df.loc[i, 3])
        df_line = df_line.translate(str.maketrans('', '', string.punctuation))
        df_line = df_line.lower()
        df_tokens = word_tokenize(df_line)
        df_filtered_tokens = []
        for word in df_tokens:
            if word not in stopwords.words('english'):
                df_filtered_tokens.append(stemmer.stem(word))
        df_filtered_line = ' '.join(df_filtered_tokens)
        #print df_filtered_line
        df.loc[i, 1] = df_filtered_line
        #df.loc[i, 2] = df.loc[i, 3]

    train_df = df[0:training_index_end]
    test_df = df.drop(train_df.index)

    # x - news title, y - class
    x_train = train_df[1]
    y_train = train_df[0]
    x_test = test_df[1]
    y_test = test_df[0]

    # Process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(max_document_length, min_frequency, None, None)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Saving n_words and vocab_processor:
    with open(VARS_FILE, 'wb') as f:  # needs to be opened in binary mode
        pickle.dump(n_words, f)

    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Build model
    classifier = learn.Estimator(
        model_fn=news_cnn_model.generate_cnn_model(number_classes, n_words, learning_rate),
        model_dir=MODEL_OUTPUT_DIR)

    # Train and predict
    classifier.fit(x_train, y_train, steps=steps)

    # Evaluate model
    y_predicted = [
        p['class'] for p in classifier.predict(x_test, as_iterable=True)
    ]

    score = metrics.accuracy_score(y_test, y_predicted)
    print('Accuracy: {0:f}'.format(score))
Example No. 22
def loopFunction(steps, docLength, iteration):
    if REMOVE_PREVIOUS_MODEL:
        if os.path.exists(MODEL_OUTPUT_DIR):
            # Remove old model
            shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data
    df = pd.read_csv(DATA_SET_FILE, header=None)
    # x - news title, y - class
    # training data
    train_df = df[0:1100]
    x_train = train_df[1]
    # y_train has shape [number of entries in x_train]
    y_train = train_df[0]

    # testing data
    test_df = df.drop(train_df.index)
    x_test = test_df[1]
    y_test = test_df[0]

    # Process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(docLength)
    # fit_transform: vocab_processor learns the word-to-id vocabulary from x_train;
    # result has shape [number of entries in x_train, MAX_DOCUMENT_LENGTH]
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    # transform: only words that already appeared in x_train keep their ids;
    # result has shape [number of entries in x_test, MAX_DOCUMENT_LENGTH]
    x_test = np.array(list(vocab_processor.transform(x_test)))

    # number of unique words in x_train
    n_words = len(vocab_processor.vocabulary_)
    # print('Total words: %d' % n_words)

    # Saving n_words and vocab_processor:
    with open(VARS_FILE, 'wb') as f:  # needs to be opened in binary mode
        pickle.dump(n_words, f)

    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)

    # Set up logging for predictions
    tensors_to_log = {"opt": "softmax"}
    logging_hook = tf.train.LoggingTensorHook(tensors=tensors_to_log,
                                              every_n_iter=20)

    # validation monitor, log the metrics
    # https://www.tensorflow.org/get_started/monitors
    validation_metrics = {
        "accuracy":
        MetricSpec(metric_fn=tf.contrib.metrics.streaming_accuracy,
                   prediction_key=learn.PredictionKey.CLASSES),
        "precision":
        MetricSpec(metric_fn=tf.contrib.metrics.streaming_precision,
                   prediction_key=learn.PredictionKey.CLASSES)
    }

    validation_monitor = tf.contrib.learn.monitors.ValidationMonitor(
        x_test, y_test, every_n_steps=11, metrics=validation_metrics)

    # Build model
    classifier = learn.SKCompat(
        learn.Estimator(model_fn=news_cnn_model.generate_cnn_model(
            N_CLASSES, n_words),
                        model_dir=MODEL_OUTPUT_DIR,
                        config=learn.RunConfig(save_checkpoints_secs=None,
                                               save_checkpoints_steps=10)))

    # Train
    classifier.fit(x_train,
                   y_train,
                   steps=steps,
                   monitors=[validation_monitor])

    # Evaluate model
    prediction = classifier.predict(x_test)
    y_predicted = prediction['classes']

    score = metrics.accuracy_score(y_test, y_predicted)
    # with open('night_test.csv','ab') as f:
    #     writer=csv.writer(f, delimiter=',')
    #     writer.writerow([iteration, steps, docLength, score])
    print('Accuracy: {0:f}'.format(score))
Example No. 23
def main(unused_argv):
    if REMOVE_PREVIOUS_MODEL:
        # Remove old model
        shutil.rmtree(MODEL_OUTPUT_DIR)
        os.mkdir(MODEL_OUTPUT_DIR)

    # Prepare training and testing data
    df = pd.read_csv(DATA_SET_FILE, header=None)
    # df = df.sample(frac=1).reset_index(drop=True)
    train_df = df[0:1800]
    test_df = df.drop(train_df.index)

    # x - news title + source, y - class
    x_train = train_df[1] + ' ' + train_df[3].astype(str).map(str.strip)
    y_train = train_df[0]
    x_test = test_df[1] + ' ' + test_df[3].astype(str).map(str.strip)
    y_test = list(test_df[0])

    # Process vocabulary
    vocab_processor = learn.preprocessing.VocabularyProcessor(
        MAX_DOCUMENT_LENGTH)
    x_train = np.array(list(vocab_processor.fit_transform(x_train)))
    x_test = np.array(list(vocab_processor.transform(x_test)))

    n_words = len(vocab_processor.vocabulary_)
    print('Total words: %d' % n_words)

    # Saving n_words and vocab_processor for serving:
    with open(VARS_FILE, 'wb') as f:  # needs to be opened in binary mode.
        pickle.dump(n_words, f)

    vocab_processor.save(VOCAB_PROCESSOR_SAVE_FILE)
    '''
    Version 2
    '''

    # Build model
    model_fn = news_cnn_model.generate_cnn_model(N_CLASSES, n_words)
    classifier = tf.estimator.Estimator(model_fn=model_fn)
    # Train
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={WORDS_FEATURE: x_train},
        y=y_train,
        batch_size=BATCH_SIZE,
        num_epochs=None,
        shuffle=True)
    classifier.train(input_fn=train_input_fn, steps=STEPS)
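    # Note: num_epochs=None makes train_input_fn cycle through the data
    # indefinitely, so the amount of training is controlled by steps=STEPS.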

    # Predict
    predict_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={WORDS_FEATURE: x_test}, num_epochs=1, shuffle=False)
    y_predicted = list(classifier.predict(input_fn=predict_input_fn))
    predicted_classes = [p['class'] for p in y_predicted]

    # Evaluate accuracy excluding the 'sports' class (label 6)
    for i in range(30):
        print(predicted_classes[i])
    count = 0
    sports = 0
    for i in range(len(predicted_classes)):
        if (y_test[i] == 6):
            sports += 1
            continue
        if (predicted_classes[i] == y_test[i]):
            count += 1
    total = len(predicted_classes) - sports
    print('Augmented Accuracy: {0:f}'.format(float(count) / total))

    # Evaluate.
    test_input_fn = tf.estimator.inputs.numpy_input_fn(
        x={WORDS_FEATURE: x_test},
        y=np.array(y_test),
        num_epochs=1,
        shuffle=False)
    scores = classifier.evaluate(input_fn=test_input_fn)
    print('Overall Accuracy: {0:f}'.format(scores['accuracy']))

    # Export
    export_dir = classifier.export_savedmodel(MODEL_OUTPUT_DIR,
                                              serving_input_receiver_fn)

    print('Model exported to %s' % export_dir)
    with open(EXPORT_DIR_FILE,
              'wb') as f:  # needs to be opened in binary mode.
        pickle.dump(export_dir, f)
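    # serving_input_receiver_fn is defined elsewhere in this module; a minimal
    # sketch of its assumed form (a guess, not this repo's code), receiving
    # the padded word-id matrix at serving time:
    #   def serving_input_receiver_fn():
    #       inputs = {WORDS_FEATURE: tf.placeholder(
    #           tf.int64, shape=[None, MAX_DOCUMENT_LENGTH])}
    #       return tf.estimator.export.ServingInputReceiver(inputs, inputs)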
    '''
    Version 1
    '''
    '''