def run(args):
    """Run the rating algorithm selected by ``args`` and report results.

    Args:
        args: parsed command-line arguments. Must provide rating_method,
            weighting_method, metric, k, training_file, test_file, and
            (depending on the method) ucluster_file / mcluster_file, plus
            the optional output_file and ground_truth paths.

    Raises:
        ValueError: if ``args.rating_method`` is not a recognized method.
    """
    train = data.read_train(args.training_file, impute=False)
    test = data.read_test(args.test_file)
    start_time = time.time()
    # Initialize the model for the requested rating method.
    if args.rating_method == 'user-user':
        model = rating.user_user(train, args.weighting_method, args.metric, args.k)
    elif args.rating_method == 'movie-movie':
        model = rating.item_item(train, args.weighting_method, args.metric, args.k)
    elif args.rating_method == 'pcc':
        # Pearson-correlation variant of the user-user model (last arg True).
        model = rating.user_user(train, args.weighting_method, args.metric, args.k, True)
    elif args.rating_method == 'bipartite-user':
        clu_user = data.read_cluster(args.ucluster_file)
        model = rating.bipartite_user(train, clu_user, args.weighting_method, args.metric, args.k)
    elif args.rating_method == 'bipartite-movie':
        clu_movie = data.read_cluster(args.mcluster_file)
        model = rating.bipartite_item(train, clu_movie, args.weighting_method, args.metric, args.k)
    else:
        # Fail loudly instead of hitting an UnboundLocalError on `model` below.
        raise ValueError('unknown rating method: {}'.format(args.rating_method))
    # Predict the ratings; all models expose the same query(user, item) API.
    py = []
    for i, query in enumerate(test):
        if i % 1000 == 0:
            logging.info('{} out of {}'.format(i, len(test)))
        py.append(model.query(query[0], query[1]))
    # Time the program. Single-string print() behaves identically on Py2/Py3.
    end_time = time.time()
    print('running time = {}'.format(end_time - start_time))
    # Output the predictions to a file (used for test).
    if args.output_file:
        # `with` guarantees the handle is flushed/closed even on error.
        with open(args.output_file, 'w') as fout:
            for ppy in py:
                fout.write('{}\n'.format(ppy))
    # Evaluate the results against the gold ratings (used for validation).
    if args.ground_truth:
        gold = data.read_gold(args.ground_truth)
        print('RMSE = {}'.format(math.sqrt(((gold - np.array(py)) ** 2).mean())))
if __name__ == '__main__':
    # Cluster a ratings matrix into movie and user clusters and write
    # one cluster id per line to the two output files.
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)
    random.seed(0)  # deterministic clustering across runs
    parser = argparse.ArgumentParser()
    parser.add_argument('training_file', help='the file name of the training data')
    parser.add_argument('output_movie', help='the output file for movie clusters')
    parser.add_argument('output_user', help='the output file for user clusters')
    parser.add_argument('--k_movie', help='the number of movie clusters', type=int, default=200)
    parser.add_argument('--k_user', help='the number of user clusters', type=int, default=400)
    parser.add_argument('--iter', help='the number of iterations', type=int, default=10)
    args = parser.parse_args()
    mat = data.read_train(args.training_file, impute=True)
    start_time = time.time()
    res_row, res_col = bipartite(args.k_movie, args.k_user, args.iter, mat)
    end_time = time.time()
    # `with` guarantees both handles are flushed and closed; the original
    # never closed the user-cluster file, risking truncated output.
    with open(args.output_movie, 'w') as fout:
        for e in res_row:
            fout.write('{}\n'.format(e))
    with open(args.output_user, 'w') as fout:
        for e in res_col:
            fout.write('{}\n'.format(e))
# -*- coding: utf-8 -*-
"""
Created on Sun Jun 10 15:37:16 2018

@author: ashima.garg
"""

import config
import data
import model

if __name__ == "__main__":
    # Use distinct local names so the `data` and `model` modules are not
    # shadowed by their own instances (the original rebinding made the
    # modules unreachable after the first assignment).
    dataset = data.Data()
    dataset.read_train(config.TRAIN_X_PATH, config.TRAIN_Y_PATH)
    dataset.preprocess()
    dataset.split()
    print("data read")
    net = model.Model()
    net.build()
    print("model build")
    net.train(dataset)
    print("model trained")
    net.test(dataset)
    print("model tested")
    # TODO(review): test-set prediction was left as an unterminated
    # triple-quoted block (read_test(config.TEST_X_PATH) + preprocess),
    # which was a syntax hazard; reinstate it properly when needed.
from nltk.corpus import stopwords
from sklearn.metrics import accuracy_score, confusion_matrix
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Embedding
from tensorflow.keras.layers import LSTM
from tensorflow.keras.utils import to_categorical
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences

# Train dataset: load from a hard-coded local path and do a quick
# head/shape sanity check.
# NOTE(review): read_train/read_test are defined elsewhere in this file —
# presumably thin wrappers over pandas.read_csv; confirm before reuse.
train_df = read_train(
    "/Users/Ricou/Desktop/ANDRE/machine_learning/tweet_sentiment_extraction/data/train.csv"
)
print(train_df.head())
print(train_df.shape)

# Test dataset: same hard-coded directory.
test_df = read_test(
    "/Users/Ricou/Desktop/ANDRE/machine_learning/tweet_sentiment_extraction/data/test.csv"
)
#print(test_df.head())
print(test_df.shape)

print("Text preprocessing")

# Check for missing values in the train and test datasets.
print(f'Training null Values:\n{train_df.isnull().sum()}\n')
print(f'Test null Values:\n{test_df.isnull().sum()}')
def train_ld():
    """Build and return a batch-size-1 data.Loader over the training set."""
    dataset = data.Dataset()
    dataset.datas = data.read_train()
    return data.Loader(dataset, 1)