def __getitem__(self, item):
    data = process_data(self.tweet[item], self.selected_text[item], self.sentiment[item])
    return {
        'ids': torch.tensor(data["ids"], dtype=torch.long),
        'mask': torch.tensor(data["mask"], dtype=torch.long),
        'token_type_ids': torch.tensor(data["token_type_ids"], dtype=torch.long),
        'targets_start': torch.tensor(data["targets_start"], dtype=torch.long),
        'targets_end': torch.tensor(data["targets_end"], dtype=torch.long),
        'orig_tweet': data["orig_tweet"],
        'orig_selected': data["orig_selected"],
        'sentiment': data["sentiment"],
        'offsets': torch.tensor(data["offsets"], dtype=torch.long),
        'padding_len': data["padding_len"],
    }
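# Hypothetical usage sketch for the __getitem__ above: the owning Dataset
# class (here called TweetDataset) and the df columns are assumptions, not
# shown in the snippet. Default collation batches the tensor fields and
# keeps the string fields as lists.
from torch.utils.data import DataLoader

loader = DataLoader(
    TweetDataset(df.text.values, df.selected_text.values, df.sentiment.values),
    batch_size=32,
    shuffle=True,
)
batch = next(iter(loader))
print(batch['ids'].shape)  # (batch_size, max_len)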
def getInfoFromParameters(input_file, parameters, estimator):
    Corpus = preprocessing.process_data(input_file,
                                        to_lower_case=parameters.lowerCaseFlag,
                                        remove_stop_words=parameters.removeStopWordsFlag,
                                        stem=parameters.stemFlag)
    pipeline = preprocessing.vectorize(estimator,
                                       max_features=parameters.maxFeatures,
                                       ngram_range=parameters.ngramRange,
                                       tf=parameters.tfidfFlags[0],
                                       tfidf=parameters.tfidfFlags[1])
    return Corpus, pipeline
def getInfoFromParameters(input_file, parameters):
    Corpus = preprocessing.process_data(input_file,
                                        to_lower_case=parameters.lowerCaseFlag,
                                        remove_stop_words=parameters.removeStopWordsFlag,
                                        stem=parameters.stemFlag)
    counts_by_comment, names = preprocessing.vectorize(Corpus,
                                                       max_features=parameters.maxFeatures,
                                                       ngram_range=parameters.ngramRange,
                                                       tf=parameters.tfidfFlags[0],
                                                       tfidf=parameters.tfidfFlags[1])
    return Corpus, counts_by_comment, names
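# Hypothetical caller for the variant above: Parameters is an assumed
# container mirroring the attribute names the function reads, and the
# input file name is a placeholder.
from collections import namedtuple

Parameters = namedtuple('Parameters', ['lowerCaseFlag', 'removeStopWordsFlag', 'stemFlag',
                                       'maxFeatures', 'ngramRange', 'tfidfFlags'])
params = Parameters(lowerCaseFlag=True, removeStopWordsFlag=True, stemFlag=False,
                    maxFeatures=5000, ngramRange=(1, 2), tfidfFlags=(False, True))
Corpus, counts_by_comment, names = getInfoFromParameters('comments.csv', params)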
import numpy as np

def process_and_send_flows(combined_flows, attributes, normalization_values):
    normalization_values = np.asarray(normalization_values).astype(np.float32)
    flow_list = []
    flow_names = [flow_attribute[0] for flow_attribute in flow.get_flow_attributes()]
    indexes = []
    for name in flow_names:
        indexes.append([i for i in range(len(attributes)) if attributes[i][0] == name][0])
    temp_list = ["None" if isinstance(attribute[1], list) else 0 for attribute in attributes]
    for key in combined_flows:
        # Append a copy: appending temp_list itself would alias every row to
        # the same list object, so each iteration would overwrite all rows.
        flow_list.append(temp_list.copy())
        new_values = combined_flows[key].get_flow_as_list()
        for i in range(len(new_values)):
            flow_list[-1][indexes[i]] = new_values[i]
    # process_data returns predictions too; they are not needed online
    data, _ = process_data(flow_list, attributes)
    data = np.asarray(data).astype(np.float32)
    data = (data - normalization_values[0]) / normalization_values[1]
    data = np.nan_to_num(data)
    send_data(data)
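# Minimal standalone demonstration of the aliasing pitfall handled above:
# appending the same list object N times leaves N references to one list,
# so a mutation through any reference shows up in every "row".
template = [0, 0, 0]
rows = []
for value in (1, 2, 3):
    rows.append(template)          # buggy: every entry is the same object
    rows[-1][0] = value
print(rows)                        # [[3, 0, 0], [3, 0, 0], [3, 0, 0]]

rows = []
for value in (1, 2, 3):
    rows.append(template.copy())   # fixed: each row is an independent copy
    rows[-1][0] = value
print(rows)                        # [[1, 0, 0], [2, 0, 0], [3, 0, 0]]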
"keep_prob": keep_prob,\ "batch_size": batch_size,\ "epochs": epochs,\ "max_to_keep": max_to_keep,\ "no_imprv_tolerance": no_imprv_tolerance,\ "checkpoint_path": checkpoint_path,\ "summary_path": summary_path,\ "model_name": model_name} # alpha & gamma for focal loss (tune hyperparameter) alpha = 0.1 gamma = 0.5 import os if not os.path.exists(config["save_path"]): os.mkdir(config["save_path"]) process_data(config) print("Load datasets...") # used for training train_set = batchnize_dataset(config["train_set"], config["batch_size"], shuffle=True) # used for computing validate loss valid_set = batchnize_dataset(config["dev_set"], batch_size=100, shuffle=False) import tensorflow as tf tf.reset_default_graph() print("Build models...") model = BiLSTM_Attention_model(config, alpha, gamma) model.train(train_set, valid_set) # used for computing test precision, recall and F1 scores
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

from preprocessing import process_data

TWEETS = 'tweet'
LABEL = 'user'

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

# Read data
df = pd.read_csv('../train_data.csv', index_col=0)
df.reset_index(inplace=True)
dataset, num_features = process_data(df[TWEETS])
y = df[LABEL]

# Split data
X_train, X_test, y_train, y_test = train_test_split(dataset, y, train_size=0.8)

# Random Forest
estimator = RandomForestClassifier(n_estimators=200, n_jobs=4, verbose=True, random_state=0)
estimator.fit(X_train, y_train)
y_hat = estimator.predict(X_test)
print("Accuracy of model: " + str(sum(y_hat == y_test) / len(y_test)))
import numpy as np
import pandas as pd
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

from preprocessing import process_data

TWEETS = 'tweet'
LABEL = 'user'

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)

# Read data
df = pd.read_csv('../train_data.csv', index_col=0)
df.reset_index(inplace=True)
dataset = process_data(df[TWEETS])
y = df[LABEL]

# Split data
X_train, X_test, y_train, y_test = train_test_split(dataset, y, train_size=0.8)

# Linear classifier trained with SGD (hinge loss by default; pass
# loss='log' for an actual logistic regression)
estimator = SGDClassifier(verbose=True)
estimator.fit(X_train, y_train)
y_hat = estimator.predict(X_test)
print("Accuracy of model: " + str(sum(y_hat == y_test) / len(y_test)))
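# Side note on the two scripts above: np.random.seed alone does not make
# train_test_split deterministic; passing random_state does. sklearn's
# accuracy_score is also equivalent to the manual sum/len computation.
from sklearn.metrics import accuracy_score

X_train, X_test, y_train, y_test = train_test_split(dataset, y, train_size=0.8,
                                                    random_state=seed)
estimator.fit(X_train, y_train)
print("Accuracy of model:", accuracy_score(y_test, estimator.predict(X_test)))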
import pickle

import numpy as np

import build
import config
import model_functions as modf
import preprocessing
import visualisation

np.random.seed(8888)

model_type = config.model

if __name__ == '__main__':
    df = preprocessing.download_data()
    df = preprocessing.process_data(df)
    model = build.create_model()
    build.build_model(model, df)
    'stars': 1
}).limit(500000)
reviews_4 = db.reviews.find({
    'stars': 4
}, {
    'text': 1,
    'stars': 1
}).limit(500000)
reviews_5 = db.reviews.find({
    'stars': 5
}, {
    'text': 1,
    'stars': 1
}).limit(500000)

reviews = chain(reviews_1, reviews_2, reviews_3, reviews_4, reviews_5)
result, stars = preprocessing.process_data(reviews, lexicon='save')

svm.train_model(result, stars)
# yelp_neural_networks.train_model('lstm', result, stars)
# yelp_neural_networks.train_model('cnn', result, stars)
# svm.evaluate_model(result, stars)
# yelp_neural_networks.evaluate_model('lstm', result, stars)
# yelp_neural_networks.evaluate_model('cnn', result, stars)
# predictions = yelp_neural_networks.predict_model('lstm', result)
# print(error_cost(predictions, stars))
# print(confusion_matrix(stars, predictions, labels=[1, 2, 3, 4, 5]))
if __name__ == '__main__':
    client = MongoClient('localhost', 27017)
    db = client.Yelp

    reviews_1 = db.reviews.find({'stars': 1}, {'text': 1, 'stars': 1}).skip(100000).limit(10000)
    reviews_2 = db.reviews.find({'stars': 2}, {'text': 1, 'stars': 1}).skip(50000).limit(10000)
    reviews_3 = db.reviews.find({'stars': 3}, {'text': 1, 'stars': 1}).skip(100000).limit(10000)
    reviews_4 = db.reviews.find({'stars': 4}, {'text': 1, 'stars': 1}).skip(100000).limit(10000)
    reviews_5 = db.reviews.find({'stars': 5}, {'text': 1, 'stars': 1}).skip(100000).limit(10000)
    reviews = chain(reviews_1, reviews_2, reviews_3, reviews_4, reviews_5)

    result, stars = preprocessing.process_data(reviews, lexicon='load', using='tokenizer')
    # result_svm, stars = preprocessing.process_data(reviews, lexicon='load', using='tf-idf')

    # svm.train_model(result, stars)
    # yelp_neural_networks.train_model('lstm', result, stars)
    # yelp_neural_networks.train_model('cnn', result, stars)
    # svm.evaluate_model(result, stars)
    # yelp_neural_networks.evaluate_model('lstm', result, stars)
    # yelp_neural_networks.evaluate_model('cnn', result, stars)

    predictions_cnn = yelp_neural_networks.predict_model('cnn', result)
    predictions_lstm = yelp_neural_networks.predict_model('lstm', result)
    # predictions_svm = svm.predict_model(result_svm)
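    # Hypothetical follow-up, mirroring the commented-out evaluation calls
    # above: score both prediction sets against the star labels.
    from sklearn.metrics import accuracy_score, confusion_matrix

    for name, preds in (('cnn', predictions_cnn), ('lstm', predictions_lstm)):
        print(name, 'accuracy:', accuracy_score(stars, preds))
        print(confusion_matrix(stars, preds, labels=[1, 2, 3, 4, 5]))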
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.utils.data as data_utils

from preprocessing import process_data

# Seed for reproducibility
torch.manual_seed(12)

# Parameters
epochs = 100
batch_size = 20
learn_rate = .01
input_size = 36
num_classes = 3

# Data
X, Y, _ = process_data()

# Convert to tensors
X = torch.Tensor(X).float()
Y = torch.Tensor(Y).long()

# Initialize data loader
dataset = data_utils.TensorDataset(X, Y)
data_loader = data_utils.DataLoader(dataset, batch_size=batch_size)

# Network
class Classifier(nn.Module):
    def __init__(self, input_size, num_classes):
        super().__init__()
        self.h1 = nn.Linear(input_size, 100)
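        # --- Hypothetical continuation: everything from here on is an
        # assumption, since the source cuts off after h1. ---
        self.h2 = nn.Linear(100, num_classes)

    def forward(self, x):
        return self.h2(F.relu(self.h1(x)))

# Assumed training loop using the parameters defined above
model = Classifier(input_size, num_classes)
optimizer = torch.optim.SGD(model.parameters(), lr=learn_rate)
criterion = nn.CrossEntropyLoss()
for epoch in range(epochs):
    for xb, yb in data_loader:
        optimizer.zero_grad()
        loss = criterion(model(xb), yb)
        loss.backward()
        optimizer.step()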