def index():
    """Flask index view: process newly collected tweets, score them with the
    trained model, persist the aggregate results, and render the dashboard.

    Returns:
        The rendered ``welcome.html`` template with two JSON payloads:
        ``vars`` (historical pos/neg results from the database) and
        ``top_tweets`` (most-negative tweets, or a placeholder message when
        there is nothing new to process).
    """
    # Set up the database connection (Mongo/Dynamo behind DbWork) plus the
    # model wrapper and text-cleaning helpers.
    dbconn = dbw.DbWork()
    mymod = mm.MyModel()
    clean = cld.CleanData()

    # TODO(review): hard-coded absolute user path — move to config/env var.
    filename = '/Users/janehillyard/capstone/hate-speech/src/output.json'
    text, return_code = clean.convert_tweets(filename)

    if return_code == 0:
        # File exists and holds tweets ready to process.
        text = np.array(clean.clean_data(text))
        vec = vectorizer.transform(text)
        words = vectorizer.get_feature_names()
        # NOTE(review): top_f is computed on this branch but never passed to
        # the template below — confirm whether welcome.html should receive it.
        top_f = cb.top_features(vectorizer, words, 10)

        # Positive/negative prediction counts, persisted for the graph.
        # (An unused `pred = mymod.predict(model, vec)` call was removed —
        # its result was never read.)
        pos, neg = np.array(mymod.pred(model, vec))
        dbconn.load_results(pos, neg)

        # Most confidently negative tweets, ranked by predicted probability.
        probs = model.predict_proba(vec)
        top_neg_tweets = mymod.get_doc_frequencies(text, probs)
    else:
        top_f = 'There are no new Tweets to Process.'
        top_neg_tweets = ['There are no new Tweets to Process.', 0.000]

    # JSON-encode the DB history and top tweets for the template's graph.
    a = dbconn.get_results()
    lst = json.dumps(a)
    lst_top_neg = json.dumps(top_neg_tweets)
    return render_template('welcome.html', vars=lst, top_tweets=lst_top_neg)
# NOTE(review): whitespace-mangled fragment of a PyTorch training-setup script,
# collapsed onto one physical line (the first inline `#` comments out the rest
# of the line, so this is not runnable as-is). The trailing
# torch.optim.Adam([...],` call is truncated mid-argument-list, so the code
# cannot be reformatted or rewritten without guessing the missing remainder —
# restore it from the original multi-line file.
# NOTE(review): the ADAM branch sets `cnn_lr` while the SGD branch sets
# `lr_cnn` — presumably the same knob; if the SGD path is taken, `cnn_lr`
# is undefined when the later ADAM print/optimizer code references it.
# TODO confirm and unify the name in the original source.
class_weights = torch.FloatTensor(weights).cuda() optimizer_name = 'ADAM' if optimizer_name == 'ADAM': print("Using ADAM optimizer") lr = 1e-6 cnn_lr = 1e-7 else: print("Using SGD optimizer") lr = 1e-3 #0.01 Initial learning rate # Default 0.1, but people report better performance with 0.01 and 0.001 lr_cnn = 1e-4 # Initial learning rate for pretrained CNN layers decay_every = 30 # Decay lr by a factor of 10 every decay_every epochs momentum = 0.9 weight_decay = 1e-4 model = mymodel.MyModel(gpu=gpu) # define loss function (criterion) and optimizer criterion = nn.CrossEntropyLoss(weight=class_weights).cuda(gpu) # OPTIMIZER # ADAM if optimizer_name == 'ADAM': print("Using ADAM optimizer with: CNN lr: " + str(cnn_lr) + " , mm_lr: " + str(lr)) optimizer = torch.optim.Adam([{ 'params': model.mm.parameters() }, { 'params': model.cnn.parameters(), 'lr': cnn_lr }],
# NOTE(review): whitespace-mangled fragment of a PyTorch evaluation script,
# collapsed onto one physical line. It begins mid-call (the tail of a
# torch.load(dataset + '/.../' + model_name + '.pth.tar', ...) expression —
# the map_location dict remaps checkpoints saved on cuda:1/2/3 onto cuda:0)
# and ends mid-`for` loop over the test DataLoader, so neither edge of the
# statement structure is visible here. It cannot be safely reformatted or
# rewritten without the surrounding lines — restore from the original file.
'.pth.tar', map_location={ 'cuda:1': 'cuda:0', 'cuda:2': 'cuda:0', 'cuda:3': 'cuda:0' }) else: state_dict = torch.load(dataset + '/MMCNN_models_loss/' + model_name + '.pth.tar', map_location={ 'cuda:1': 'cuda:0', 'cuda:2': 'cuda:0', 'cuda:3': 'cuda:0' }) model = mymodel.MyModel() model = torch.nn.DataParallel(model, device_ids=gpus).cuda(gpu) model.load_state_dict(state_dict) test_dataset = customDatasetTest.customDatasetTest(dataset, split, Rescale=299) test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False, num_workers=workers, pin_memory=True, sampler=None) with torch.no_grad(): model.eval() for i, (tweet_id, image, image_text, tweet, target) in enumerate(test_loader):
# NOTE(review): whitespace-mangled fragment of an sklearn/NLTK classifier
# script, collapsed onto one physical line. It begins mid-function — the tail
# of a top-features helper (`indices = np.argsort(vec.idf_)[::-1]` then
# `return [words[i] for i in indices[:n]]`, i.e. the n words with the highest
# TF-IDF idf_ weights) whose `def` line is outside this view — and the
# `if __name__ == '__main__':` block is cut off after the `### Model Scoring`
# marker. With both edges missing, the fragment cannot be safely reformatted
# or rewritten here — restore from the original multi-line file.
#n = number of top words to report #output: list of words indices = np.argsort(vec.idf_)[::-1] return [words[i] for i in indices[:n]] # print "Top words are:" # for i in indices[:10]: # print words[i] # examples = ['hate hate hate', "great fun awsome", "sucks"] #plot_roc(v_probs, y_test,"ROC plot of churn data","False Positive Rate (1 - Specificity)", "True Positive Rate (Sensitivity, Recall)") #plot_datatwin(pos,neg,len(neg)) if __name__ == '__main__': mymod = mm.MyModel() clean = cld.CleanData() stemmer = SnowballStemmer('english') lemma = WordNetLemmatizer() #split file]#old_tweets.txt .86 roc # sancsv2 roc.99 # sentiment rox 0 X, y = process_file('data/gold_tweets.txt') X = clean.clean_data(X) # this is taking a bit of time X_train, X_test, y_train, y_test = train_split(X, y) train_vec = make_vecs(X_train) # from test mymodel = clf_model(train_vec.transform(X_train), y_train, 1.5) test_vectors = train_vec.transform(X_test) #vectorize_trans(tfidf,X_test) ### Model Scoring