Example #1
def main():
    global config
    USE_CLASSIFIER = 'lstm'

    weight_file_path = Config.getPath('models') + '/' + USE_CLASSIFIER + '-weights.h5'

    load_config(USE_CLASSIFIER)

    classifier = ClassifierFactory.getLSTM(**{'config': config})
    # Classifier(model_name=USE_CLASSIFIER, config=config)

    classifier.load_weights(weight_file_path)

    df = pd.read_csv(Config.getPath('data') + '/' + TESTING_DATA)

    Xtest = df['question_text']
    Ytest = df['target']

    print('extract configuration from input texts ...')

    print('testing size: ', len(Xtest))

    print('start predicting ...')
    pred = classifier.predict(Xtest)
    print(pred)
    score = metrics.accuracy_score(Ytest, pred)
    print("accuracy:   %0.3f" % score)
    cm = metrics.confusion_matrix(Ytest, pred, labels=[0, 1])
    plot_confusion_matrix(cm, classes=[0, 1])
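The plot_confusion_matrix helper called here (and again in Example #5) is not defined in these snippets. A minimal sketch of what such a helper might look like, assuming matplotlib and NumPy are available; the (cm, classes) signature matches the calls above, while the title, colormap, and cell annotations are illustrative choices, not the original implementation:

import matplotlib.pyplot as plt
import numpy as np

def plot_confusion_matrix(cm, classes, title='Confusion matrix', cmap=plt.cm.Blues):
    # draw the matrix as an image and label both axes with the class names
    plt.imshow(cm, interpolation='nearest', cmap=cmap)
    plt.title(title)
    plt.colorbar()
    tick_marks = np.arange(len(classes))
    plt.xticks(tick_marks, classes)
    plt.yticks(tick_marks, classes)

    # annotate each cell with its count, switching text colour for readability
    thresh = cm.max() / 2.0
    for i in range(cm.shape[0]):
        for j in range(cm.shape[1]):
            plt.text(j, i, format(cm[i, j], 'd'),
                     horizontalalignment='center',
                     color='white' if cm[i, j] > thresh else 'black')

    plt.ylabel('True label')
    plt.xlabel('Predicted label')
    plt.tight_layout()
    plt.show()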
Example #2
def load_config(model_name):
    print('loading config ...')
    global config

    config_file_path = Config.getPath('models') + '/' + model_name + '-config.joblib'

    config = ClassifierFactory.getConfig(joblib_file=config_file_path)

    # Two classes - Fake=0, Reliable=1
    config.set('num_target_tokens', 2)
Example #3
def main():
    load_config(
        **{
            'embedding': 'glove',
            'max_input_seq_length': 100,
            'max_vocab_size': 8000,
            'epochs': 20
        })
    c = ClassifierFactory.getLSTM(**{'lstm_units': 64})
    # c = ClassifierFactory.getLSTM(**{'lstm_units': 64, 'dropout': 0.2, 'epochs': 15})
    train_vanilla(c)
Example #4
def load_config(**kwargs):
    print('loading csv file ...')
    global config

    df = pd.read_csv(Config.getPath('data') + '/' + TRAINING_DATA)
    df = df.sample(50000)
    X = df['question_text']
    Y = df['target']

    print('preparing configuration...')

    config = ClassifierFactory.getConfig(X, Y, json_file=None, **kwargs)

    # Two classes - Fake=0, Reliable=1
    config.set('num_target_tokens', 2)
Example #5
def predict_svm():
    global config
    load_config('svm')

    print('loading data...')
    df = pd.read_csv(Config.getPath('data') + '/' + TRAINING_DATA)

    df2 = df.sample(50000)

    X = df2['question_text']
    Y = df2['target']

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)
    # Two classes - Fake=0, Reliable=1
    config.set('num_target_tokens', 2)

    classifier = ClassifierFactory.getSVM()

    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')

    classifier.fit(Xtrain, Ytrain, Xtest, Ytest)

    df = pd.read_csv(Config.getPath('data') + '/' + TESTING_DATA)

    df = df.sample(100000)
    X = df['question_text']
    Y = df['target']

    pred = classifier.predict(X)

    score = metrics.accuracy_score(Y, pred)
    f1score = metrics.f1_score(Y, pred)
    print("accuracy:   %0.3f" % score)
    print("f1 score:   %0.3f" % f1score)

    # compare against the labels of the sampled test set that was actually predicted
    cm = metrics.confusion_matrix(Y, pred, labels=[0, 1])
    plot_confusion_matrix(cm, classes=[0, 1])
Example #6
def train_experiment(classifier):
    print('loading csv file ...')
    global config

    df = pd.read_csv(Config.getPath('data') + '/' + TRAINING_DATA)

    df = df.sample(20000)

    X = df['question_text']
    Y = df['target']

    print('splitting data...')

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)

    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')

    # max_sequence, vocab_size,  lstm_units, dropout
    experiment = [
        [20, 5000, 64, 0.2],   # 0
        [35, 5000, 64, 0.2],   # 1
        [50, 5000, 64, 0.2],   # 2
        [100, 5000, 64, 0.2],  # 3
        [50, 5000, 128, 0.2],  # 4
        [50, 5000, 256, 0.2],  # 5
        [50, 5000, 512, 0.2],  # 6
        [50, 2000, 64, 0.2],   # 7
        [50, 3000, 64, 0.2],   # 8
        [50, 4000, 64, 0.2],   # 9
        [50, 5000, 64, 0.2],   # 10
        [50, 6000, 64, 0.2],   # 11
        [50, 7000, 64, 0.2],   # 12
        [50, 8000, 64, 0.2],   # 13
        [50, 9000, 64, 0.2],   # 14
        [50, 5000, 64, 0.1],   # 15
        [50, 5000, 64, 0.2],   # 16
        [50, 5000, 64, 0.3],   # 17
        [50, 5000, 64, 0.4],   # 18
    ]
    i = 0
    for max_seq, vocab_siz, lstm_u, drop in experiment:

        config = Config(X, Y, max_seq, vocab_siz, 2, 'glove')

        print("%s starting experiment ... %d" % (datetime.datetime.now(), i))
        #model=ClassifierFactory.getLSTM(**{'config':config,'lstm_units':lstm_u,'dropout':drop})
        model = ClassifierFactory.getLSTM(**{
            'config': config,
            'lstm_units': lstm_u,
            'dropout': drop
        })

        history = model.fit(Xtrain,
                            Ytrain,
                            Xtest,
                            Ytest,
                            epochs=10,
                            file_prefix='experiment-%i' % i)

        history_plot_file_path = (Config.getPath('reports') + '/' +
                                  model.model_name +
                                  '_experiment_%d-history.png' % i)
        plot_and_save_history(history, model.model_name,
                              history_plot_file_path)
        i += 1
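plot_and_save_history, used at the end of each experiment run, is likewise not shown in these snippets. A minimal sketch under the assumption that history is a Keras History object and matplotlib is available; the exact metrics plotted depend on how the model was compiled:

import matplotlib.pyplot as plt

def plot_and_save_history(history, model_name, file_path):
    # history.history maps metric names (e.g. 'loss', 'val_loss') to per-epoch values
    plt.figure()
    for metric, values in history.history.items():
        plt.plot(values, label=metric)
    plt.title(model_name)
    plt.xlabel('epoch')
    plt.legend()
    plt.savefig(file_path)
    plt.close()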
Example #7
def main_svm():
    load_config()
    c = ClassifierFactory.getSVM()
    train_vanilla(c)
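train_vanilla, called in Examples #3 and #7, is not included in these snippets either. A plausible sketch that mirrors the split-and-fit pattern of predict_svm and train_experiment above; it assumes the same module-level context (pd, Config, TRAINING_DATA, train_test_split, and the global config set by load_config), and the 50000-row sample and 80/20 split are assumptions rather than the original values:

def train_vanilla(classifier):
    # assumes load_config() has already populated the global config
    print('loading csv file ...')
    df = pd.read_csv(Config.getPath('data') + '/' + TRAINING_DATA)
    df = df.sample(50000)

    X = df['question_text']
    Y = df['target']

    Xtrain, Xtest, Ytrain, Ytest = train_test_split(X,
                                                    Y,
                                                    test_size=0.2,
                                                    random_state=42)

    print('training size: ', len(Xtrain))
    print('testing size: ', len(Xtest))

    print('start fitting ...')
    classifier.fit(Xtrain, Ytrain, Xtest, Ytest)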