Example #1
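# Assumed context for this excerpt (not shown in the original): module-level
# imports of json, numpy as np, and flask.render_template, plus the project
# modules dbw, mm, cld, cb and a fitted `vectorizer` and trained `model`.
# The @app.route decorator presumably sits directly above index().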
def index():
    # set up the database connection (MongoDB or DynamoDB)
    dbconn = dbw.DbWork()
    mymod = mm.MyModel()
    clean = cld.CleanData()
    return_code = 0
    filename = '/Users/janehillyard/capstone/hate-speech/src/output.json'

    text, return_code = clean.convert_tweets(filename)
    # return_code == 0 means the file exists and there are tweets to process
    if return_code == 0:
        text = np.array(clean.clean_data(text))
        vec = vectorizer.transform(text)
        words = vectorizer.get_feature_names()  # get_feature_names_out() on newer scikit-learn
        top_f = cb.top_features(vectorizer, words, 10)  # ten highest-idf terms (cf. Example #4); not used in the response below
        pos, neg = mymod.pred(model, vec)  # assumed to return (pos, neg) counts
        dbconn.load_results(pos, neg)
        pred = mymod.predict(model, vec)  # per-tweet labels; unused below

        ### get most frequent neg tweets
        probs = model.predict_proba(vec)
        top_neg_tweets = mymod.get_doc_frequencies(text, probs)
    else:
        top_f = 'There are no new Tweets to Process.'
        top_neg_tweets = ['There are no new Tweets to Process.', 0.000]

    # fetch stored results and JSON-encode them for the chart in the template
    a = dbconn.get_results()
    lst = json.dumps(a)
    lst_top_neg = json.dumps(top_neg_tweets)

    return render_template('welcome.html', vars=lst, top_tweets=lst_top_neg)
Example #2
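# Assumed context for this excerpt: `import torch` and `import torch.nn as nn`
# at module level, the project module `mymodel`, and `weights` (per-class loss
# weights) and `gpu` (a device index) defined earlier in the script.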
class_weights = torch.FloatTensor(weights).cuda()

optimizer_name = 'ADAM'
if optimizer_name == 'ADAM':
    print("Using ADAM optimizer")
    lr = 1e-6
    cnn_lr = 1e-7
else:
    print("Using SGD optimizer")
    lr = 1e-3  # initial learning rate; 0.1 is a common default, but 0.01 and 0.001 are often reported to work better
    cnn_lr = 1e-4  # initial learning rate for the pretrained CNN layers
    decay_every = 30  # Decay lr by a factor of 10 every decay_every epochs
    momentum = 0.9
    weight_decay = 1e-4

model = mymodel.MyModel(gpu=gpu)

# define loss function (criterion) and optimizer
criterion = nn.CrossEntropyLoss(weight=class_weights).cuda(gpu)

# OPTIMIZER
# ADAM
if optimizer_name == 'ADAM':
    print("Using ADAM optimizer with: CNN lr: " + str(cnn_lr) + " , mm_lr: " +
          str(lr))
    optimizer = torch.optim.Adam(
        [{'params': model.mm.parameters()},  # falls back to the base lr
         {'params': model.cnn.parameters(), 'lr': cnn_lr}],
        lr=lr)  # assumed completion of the truncated call: base lr for the mm group
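else:
    # Sketch, not in the original excerpt: the matching SGD construction,
    # using the hyperparameters defined in the SGD branch above.
    optimizer = torch.optim.SGD(
        [{'params': model.mm.parameters()},
         {'params': model.cnn.parameters(), 'lr': cnn_lr}],
        lr=lr, momentum=momentum, weight_decay=weight_decay)
    # decay every parameter group's lr by 10x every `decay_every` epochs
    scheduler = torch.optim.lr_scheduler.StepLR(
        optimizer, step_size=decay_every, gamma=0.1)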
Example #3
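# Assumed context: `import torch`, the project modules mymodel and
# customDatasetTest, plus dataset, model_name, gpus, gpu, batch_size, workers
# and split defined earlier. The excerpt opens midway through a torch.load()
# call; the map_location dict remaps tensors saved on cuda:1-3 onto cuda:0.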
                            '.pth.tar',
                            map_location={
                                'cuda:1': 'cuda:0',
                                'cuda:2': 'cuda:0',
                                'cuda:3': 'cuda:0'
                            })
else:
    state_dict = torch.load(dataset + '/MMCNN_models_loss/' + model_name +
                            '.pth.tar',
                            map_location={
                                'cuda:1': 'cuda:0',
                                'cuda:2': 'cuda:0',
                                'cuda:3': 'cuda:0'
                            })

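# Checkpoints saved from a DataParallel model prefix every state_dict key with
# 'module.', so the model is wrapped in DataParallel before loading to make
# the keys match.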
model = mymodel.MyModel()
model = torch.nn.DataParallel(model, device_ids=gpus).cuda(gpu)
model.load_state_dict(state_dict)

test_dataset = customDatasetTest.customDatasetTest(dataset, split, Rescale=299)
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False,
                                          num_workers=workers,
                                          pin_memory=True,
                                          sampler=None)

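# Inference only: no_grad() disables gradient tracking and eval() switches
# layers such as dropout and batch norm to inference behavior.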
with torch.no_grad():
    model.eval()
    for i, (tweet_id, image, image_text, tweet,
            target) in enumerate(test_loader):
Example #4
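    # Excerpt begins inside a helper, presumably the top_features(vec, words, n)
    # called in Example #1 as cb.top_features(vectorizer, words, 10).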
    # n = number of top words to report
    # output: list of words
    # highest idf_ values first, i.e. the rarest terms in the vocabulary
    indices = np.argsort(vec.idf_)[::-1]
    return [words[i] for i in indices[:n]]


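# Assumed context: imports of numpy as np, nltk's SnowballStemmer and
# WordNetLemmatizer, the project modules mm and cld, and the local helpers
# process_file, train_split, make_vecs, and clf_model defined above.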
if __name__ == '__main__':
    mymod = mm.MyModel()
    clean = cld.CleanData()
    stemmer = SnowballStemmer('english')
    lemma = WordNetLemmatizer()

    # ROC scores from earlier input files: old_tweets.txt ~.86, sancsv2 ~.99,
    # sentiment ~0
    X, y = process_file('data/gold_tweets.txt')
    X = clean.clean_data(X)  # this is taking a bit of time
    X_train, X_test, y_train, y_test = train_split(X, y)
    train_vec = make_vecs(X_train)  # fit a vectorizer on the training split only
    mymodel = clf_model(train_vec.transform(X_train), y_train, 1.5)
    test_vectors = train_vec.transform(X_test)  # reuse the training-fitted vectorizer

    ### Model Scoring
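    # The scoring section is cut off in this excerpt; a minimal sketch of what
    # might follow, assuming clf_model returns a fitted scikit-learn classifier:
    from sklearn.metrics import roc_auc_score

    probs = mymodel.predict_proba(test_vectors)[:, 1]
    print('ROC AUC: {:.3f}'.format(roc_auc_score(y_test, probs)))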