import pandas as pd
from sklearn.model_selection import train_test_split
# preprocessor, encoder and model_classifier are the project's own helper classes


def main():
    # load dataset, keep only the needed columns, drop rows with missing values
    df = pd.read_csv('movies_metadata.csv')
    df = df.loc[:, ['title', 'genres', 'overview']]
    df = df[pd.notnull(df.overview)]
    df = df[pd.notnull(df.title)]
    df = df[pd.notnull(df.genres)]

    # Training parameters
    max_len_desc = 300
    max_len_title = 50
    max_input_len = max_len_title + max_len_desc
    genres_to_be_predicted = [
        'Drama', 'Comedy', 'Documentary', 'Science Fiction', 'Romance'
    ]
    num_classes = len(genres_to_be_predicted)
    params = {
        'GENRES': genres_to_be_predicted,
        'VOCABULARY_SIZE': 20000,
        'EMBEDDING_DIM': 100,
        'MAX_LEN_DESC': max_len_desc,
        'MAX_LEN_TITLE': max_len_title,
        'INPUT_LEN': max_input_len,
        'NUM_DENSE_1': 512,
        'NUM_CLASSES': num_classes,
        'NUM_EPOCHS': 4,
        'BATCH_DIM': 64
    }

    # init custom classes
    p = preprocessor(genres=params['GENRES'])
    e = encoder(max_words=params['VOCABULARY_SIZE'],
                maxlen_desc=params['MAX_LEN_DESC'],
                maxlen_title=params['MAX_LEN_TITLE'])
    m = model_classifier()

    # prepare data for training
    df = p.preprocess(df)
    X, y = e.encode(df)
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.30, random_state=1000)
    e.save()

    # create and train model
    model = m.define_model(params)
    history = m.train_model(X_train, X_test, y_train, y_test)

    # save model and parameters
    m.save_model()
    m.save_params()
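
# The architecture behind define_model(params) lives in the project's own
# model_classifier class and is not shown in this excerpt. Purely as an
# illustration, a minimal multi-label model consistent with params could look
# like the Keras sketch below; the layer choices are assumptions, not the
# repository's actual architecture.
from tensorflow.keras import layers, models

def define_model_sketch(params):
    model = models.Sequential([
        # embed the padded title+overview token sequence
        layers.Embedding(params['VOCABULARY_SIZE'],
                         params['EMBEDDING_DIM'],
                         input_length=params['INPUT_LEN']),
        layers.GlobalAveragePooling1D(),
        layers.Dense(params['NUM_DENSE_1'], activation='relu'),
        # sigmoid outputs: a movie can belong to several genres at once
        layers.Dense(params['NUM_CLASSES'], activation='sigmoid'),
    ])
    # binary cross-entropy is the usual loss for multi-label targets
    model.compile(optimizer='adam', loss='binary_crossentropy',
                  metrics=['accuracy'])
    return model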
def reliability_label_predictor(attributes):
    source_name = attributes[0]
    topic_name = attributes[1]
    url = attributes[2]
    print(source_name, topic_name, url)

    # fetch and clean the article text
    article = parse_url(url)
    cleanedArticle = preprocess.preprocessor(article)

    # assign the article to a cluster within its topic, then score its neutrality
    cluster_number, flag = Clustering.cluster_new_article(
        cleanedArticle, topic_name)
    neutrality_score = Neutrality.neutrality_score_finder(
        cluster_number, flag, cleanedArticle)
    print(neutrality_score)

    # look up the precomputed per-source score
    source_score = df.loc[df['Source'] == source_name, 'SourceScore'].iloc[0]
    print(source_score)

    # score the article's use of strong/loaded words
    score_label = strong_words_score(cleanedArticle)
    print(score_label)

    # combine the three signals into a single reliability label
    reliability_label = reliability_finder(source_score, neutrality_score,
                                           score_label)
    print(reliability_label)
    return reliability_label
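
# A hypothetical invocation; the source name, topic and URL below are made-up
# placeholders, and the attribute order is (source, topic, url).
label = reliability_label_predictor([
    "Example News",                      # source_name (placeholder)
    "politics",                          # topic_name (placeholder)
    "https://example.com/article.html",  # url (placeholder)
])
print("Predicted reliability label:", label)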
# softmax for computing the perplexity later on, not used elsewhere (no gradient computation)
softmax = tf.nn.softmax(predictions)

# cross-entropy is kept as a separate tensor so it can be reused at test time;
# the training loss sums it over the batch
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    logits=predictions, labels=labels)
loss = tf.reduce_sum(cross_entropy)

# training: Adam with gradients clipped to a global norm of 10
adam = tf.train.AdamOptimizer(conf.lr)
gradients, variables = zip(*adam.compute_gradients(loss))
gradients, _ = tf.clip_by_global_norm(gradients, 10.0)
train_step = adam.apply_gradients(zip(gradients, variables))

# preprocessing
print("Starting preprocessing")
preproc = preprocessor()
preproc.preprocess("../data/sentences.train")

# training
print("Start training")
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
if not os.path.exists(conf.ckpt_dir):
    os.makedirs(conf.ckpt_dir)
saver = tf.train.Saver()
with tf.Session(config=config) as sess:
    if conf.mode == "TRAIN":
        print("Mode set to TRAIN")
        sess.run(tf.global_variables_initializer())
        for i in range(conf.num_epochs):
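
# The softmax op defined earlier is kept only so perplexity can be computed.
# A sketch of that computation, assuming the standard definition
# perplexity = exp(mean per-token cross-entropy) and ignoring padding masks:
mean_ce = tf.reduce_mean(cross_entropy)  # average negative log-likelihood per token
perplexity = tf.exp(mean_ce)
# later, inside the session: ppl = sess.run(perplexity, feed_dict={...})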
def get_input_image(imagename, train_mode):
    pp = preprocess.preprocessor()
    image = pp._parse_function(imagename)
    # tf.cond selects the training or validation preprocessing pipeline
    # when the graph is executed
    cimage = tf.cond(train_mode,
                     lambda: pp.training_preprocess(image),
                     lambda: pp.val_preprocess(image))
    return cimage
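
# tf.cond requires train_mode to be a boolean tensor rather than a Python bool.
# A hypothetical TF1-style call (the image path is a placeholder):
is_training = tf.placeholder(tf.bool, shape=[], name="train_mode")
img = get_input_image("images/sample.jpg", is_training)
with tf.Session() as sess:
    train_img = sess.run(img, feed_dict={is_training: True})
    val_img = sess.run(img, feed_dict={is_training: False})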
import numpy as np
import sklearn.datasets
from sklearn.model_selection import KFold
from preprocess import preprocessor
import pandas as pd
import optuna.integration.lightgbm as lgb

if __name__ == "__main__":
    ### Load the data ###
    train = pd.read_csv("train.tsv", sep='\t')
    test = pd.read_csv("test.tsv", sep='\t')

    ### preprocess the data ###
    prep = preprocessor()
    train = prep.fit_transform(train)
    test = prep.transform(test)

    data_test = test.drop(['revenue'], axis=1)
    logtarget_test = np.log1p(test.revenue)
    data = train.drop(['revenue'], axis=1)
    target = train.revenue
    logtarget = np.log1p(target)

    dtrain = lgb.Dataset(data, label=logtarget)

    ### set the parameters and optimize the hyper-parameters ###
    params = {
        "objective": "rmse",
        "metric": "rmse",
        "verbosity": -1,
        "boosting_type": "gbdt",
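
    # A sketch of the tuning run that would typically follow, assuming
    # Optuna's stepwise LightGBMTunerCV is used with the imported KFold
    # (fold count and num_boost_round are assumptions):
    tuner = lgb.LightGBMTunerCV(
        params,
        dtrain,
        folds=KFold(n_splits=5, shuffle=True, random_state=0),
        num_boost_round=1000,
    )
    tuner.run()
    print("Best RMSE:", tuner.best_score)
    print("Best params:", tuner.best_params)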
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created on 26 Jul 2019

@author: Ajay
'''
from preprocess import preprocessor
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import time

pp = preprocessor(1500, "sentiment", "mysentiment")
X_train = pp.X_train
X_test = pp.X_test
y_train = pp.y_train
y_test = pp.y_test

clf = MultinomialNB(alpha=1.1)

start = time.time()
model = clf.fit(X_train, y_train)
stop = time.time()

predicted_y = model.predict(X_test)

# expected results vs predicted results
# print(y_test, predicted_y)
# print("Predict: ", model.predict_proba(X_test))

print("Accuracy: ", accuracy_score(y_test, predicted_y))
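
# start/stop bracket the fit call in each of these sibling scripts, but the
# elapsed time is not printed within the excerpt; presumably a reporting line
# like this (an assumption, shown once here) appears further down:
print("Training time: %.3fs" % (stop - start))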
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created on 28 Jul 2019

@author: Ajay
'''
from preprocess import preprocessor
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
from sklearn import tree
import time

# DT uses max_features=200 in addition to the normal CountVectorizer arguments
pp = preprocessor(1500, "topic", "dt")
X_train = pp.X_train
X_test = pp.X_test
y_train = pp.y_train
y_test = pp.y_test

# if random_state is not set, the feature order is randomised,
# so the fitted tree may differ between runs
clf = tree.DecisionTreeClassifier(criterion='entropy', random_state=0,
                                  min_samples_leaf=20)

start = time.time()
model = clf.fit(X_train, y_train)
stop = time.time()

predicted_y = model.predict(X_test)

# expected results vs predicted results
# print(y_test, predicted_y)
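
# Since the comment above notes the tree can vary between runs, a quick way to
# inspect what was actually learned is sklearn's export_text. This inspection
# step is an optional addition, not part of the original script.
from sklearn.tree import export_text
print(export_text(model, max_depth=3))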
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created on 26 Jul 2019

@author: Ajay
'''
from preprocess import preprocessor
from sklearn.naive_bayes import BernoulliNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import time

divider = 1500
pp = preprocessor(divider, "sentiment", "bnb")
X_train = pp.X_train
X_test = pp.X_test
y_train = pp.y_train
y_test = pp.y_test

clf = BernoulliNB()

start = time.time()
model = clf.fit(X_train, y_train)
stop = time.time()

predicted_y = model.predict(X_test)

# expected results vs predicted results
# print(y_test, predicted_y)
# print("Predict: ", model.predict_proba(X_test))

print("Accuracy: ", accuracy_score(y_test, predicted_y))
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created on 26 Jul 2019

@author: Ajay
'''
from preprocess import preprocessor
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import time

pp = preprocessor(1500, "topic", "mytopic")
X_train = pp.X_train
X_test = pp.X_test
y_train = pp.y_train
y_test = pp.y_test

clf = MultinomialNB(alpha=0.77)

start = time.time()
model = clf.fit(X_train, y_train)
stop = time.time()

predicted_y = model.predict(X_test)

# expected results vs predicted results
# print(y_test, predicted_y)
# print("Predict: ", model.predict_proba(X_test))

print("Accuracy: ", accuracy_score(y_test, predicted_y))
print("Precision (array): ", precision_score(y_test, predicted_y,
                                             average=None))
    return out


if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Enhanced Neural Network')
    parser.add_argument("--hidden", type=int,
                        help="number of hidden neurons", default=5)
    parser.add_argument("--activation",
                        help="activation for neural network", default="sigmoid")
    args = parser.parse_args()

    h = int(args.hidden)
    activation = args.activation

    dataset_url = "https://raw.githubusercontent.com/ronakHegde98/CS-4372-Computational-Methods-for-Data-Scientists/master/data/diabetic_data.csv"
    df = pd.read_csv(dataset_url)

    X_train, X_test, y_train, y_test = preprocessor(df)

    # reshape the train and test labels into column vectors
    y_train = y_train.values.reshape(y_train.shape[0], 1)
    y_test = y_test.values.reshape(y_test.shape[0], 1)

    nn_model = NeuralNet(X_train.T, y_train.T, h)
    nn_model.train(activation)
    predictions = nn_model.predict(X_test.T, y_test.T, activation)
    # predictions = np.around(predictions, 0).astype(np.int32)
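
    # The commented-out rounding line above hints at how the raw network
    # outputs become class labels; a hypothetical evaluation step, assuming
    # sigmoid outputs, a 0.5 threshold, and numpy imported as np:
    labels = (predictions >= 0.5).astype(np.int32)
    accuracy = np.mean(labels.flatten() == y_test.flatten())
    print("Test accuracy: %.4f" % accuracy)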
#!/usr/bin/env python
# -*- coding: utf-8 -*-
'''
Created on 26 Jul 2019

@author: Ajay
'''
from preprocess import preprocessor
from sklearn.naive_bayes import MultinomialNB
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, classification_report
import time

pp = preprocessor(1500, "topic", "mnb")
X_train = pp.X_train
X_test = pp.X_test
y_train = pp.y_train
y_test = pp.y_test

clf = MultinomialNB()

start = time.time()
model = clf.fit(X_train, y_train)
stop = time.time()

predicted_y = model.predict(X_test)

# expected results vs predicted results
# print(y_test, predicted_y)
# print("Predict: ", model.predict_proba(X_test))

print("Accuracy: ", accuracy_score(y_test, predicted_y))
print("Precision (array): ", precision_score(y_test, predicted_y,
                                             average=None))