def run_demo():
    # Predict scores for some example movies.
    with open(EXAMPLE_FILE, "r") as example_file:
        raw_data = [json.loads(line) for line in example_file.read().splitlines()]
    raw_data, _ = preprocess.split_data(raw_data, 1.0)
    _, __, ___, X_example, y_example, names_example = preprocess.preprocess(
        data_train, raw_data, False)
    _, __, ___, X_example2, y_example2, names_example2 = preprocess.preprocess(
        data_train, raw_data, True)
    print("Prediction of movies:", names_example, "using KNN.")
    print("Real IMDB score:", divided_by_ten(y_example))
    # print("and rounded score:", divided_by_ten(y_example2))
    # print("svr error margin:", svm_reg(X_train, y_train, X_example, y_example)[1])
    # print("svr error margin (rounded):", svm_reg(X_train2, y_train2, X_test2, y_test2))
    # print("svc error margin:", svm_clf(X_train, y_train, X_test, y_test))
    # print("svc error margin (rounded):", svm_clf(X_train2, y_train2, X_test2, y_test2))
    # print("random forest error margin:", random_forest_reg(X_train, y_train, X_test, y_test))
    # print("random forest error margin (rounded):", random_forest_reg(X_train2, y_train2, X_test2, y_test2))
    print("KNN prediction score:",
          divided_by_ten(knn(X_train, y_train, X_example, y_example)[1]))

def split_file(file_value, option):
    if option.split is not None:
        file_value, file_value_test = preprocess.split_data(file_value, option.split)
    else:
        file_value_test = None
    return file_value, file_value_test

def main(in_file):
    df = load_data(in_file)
    features, Y_train, Y_test = split_data(df)
    X_train, X_test = preprocess(features)

    # Round-trip the splits through CSV so they are persisted on disk.
    X_train.to_csv('X_train.csv')
    X_test.to_csv('X_test.csv')
    Y_train.to_csv('Y_train.csv')
    Y_test.to_csv('Y_test.csv')
    X_train = pd.read_csv('X_train.csv')
    X_test = pd.read_csv('X_test.csv')
    Y_train = pd.read_csv('Y_train.csv')
    Y_test = pd.read_csv('Y_test.csv')

    cosineSim(X_train)
    classProbs, condProbs, vocabSize, proVocabSize, conVocabSize = trainSentimentAnalysis(X_train)

    # Count correct class predictions over the held-out pro/con reviews.
    correct = 0
    for review in X_test['pros']:
        result = testSentimentAnalysis(str(review).split(), classProbs, condProbs,
                                       vocabSize, proVocabSize, conVocabSize)
        if result == 'pro':
            correct += 1
    for review in X_test['cons']:
        result = testSentimentAnalysis(str(review).split(), classProbs, condProbs,
                                       vocabSize, proVocabSize, conVocabSize)
        if result == 'con':
            correct += 1
    # '/' already yields a float in Python 3, so no explicit cast is needed.
    accuracy = correct / (len(X_test['pros']) + len(X_test['cons']))
    print('Accuracy for Naive Bayes Sentiment Analysis:', accuracy)

    trainModels(X_train, X_test, Y_train, Y_test)
    return 'done'

def setup(self):
    sentences, labels = pp.split_data(self.raw_data, test_split=p.split)
    sub = 5 if self.use_subwords else 0
    sequences, tokens = pp.get_input_sequences(*sentences,
                                               vocab_size=p.vocab_size,
                                               maxlen=p.max_length,
                                               focus=p.focus,
                                               subwords=sub)
    self.tokens = tokens
    return sequences, labels

def test_split_data(self):
    df = read_data(self.data_dir, self.bodies_file, self.stances_file)
    train_data, dev_data, test_data = split_data(df)
    for data in (train_data, dev_data, test_data):
        self.assert_valid_df(data)
    self.assertLess(len(train_data), 0.9 * len(df))
    self.assertLess(len(dev_data), 0.1 * len(df))
    self.assertAlmostEqual(len(test_data), 0.1 * len(df), delta=100)

def data_prep(seed):
    # The original bound each result to the module's own name
    # (e.g. `profile = profile.Profile()`), which raises UnboundLocalError
    # inside a function; use distinct local names instead.
    prof = profile.Profile()
    inter = interest.Interest()
    prep = preprocess.Preprocessor()
    profile_raw = prof.get_profile()
    interest_raw, ids = inter.data_merge()
    data = prep.finalize_data(profile_raw, interest_raw)
    X, y, X_train, y_train, X_test, y_test = prep.split_data(data, seed=seed, re=False)
    return X, y, X_train, y_train, X_test, y_test, ids

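# A minimal repro of the shadowing bug fixed above (hypothetical, using the
# stdlib `profile` module): assigning to a name anywhere in a function makes
# that name local for the whole function body, so the right-hand-side lookup
# fails before the assignment ever runs. The same statement at module level
# would work, which is why the bug is easy to miss.
import profile


def broken():
    profile = profile.Profile()  # calling broken() raises UnboundLocalError
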
def train():
    raw_data = pd.read_csv(train_data_path)
    data, labels = preprocess(raw_data)
    train_data, test_data, train_labels, test_labels = split_data(data, labels)
    trained_model = train_model(digit_recognition_model(), train_data, train_labels, 15)
    # `eval` here is the project's evaluation helper, shadowing the builtin.
    result = eval(trained_model, test_data, test_labels)
    print('model accuracy:', result)
    save_model(model_path, trained_model)

def classifier(self, rep_id, method, model):
    print('\n\n\nrep: {}\nmethod: {}\nmodel: {}\n\n'.format(
        str(rep_id + 1), method, model))
    # data import
    X, y = import_wm_data(mode=method, dim=model)
    # data split
    X_split, y_split = split_data(X, y, RAND_NUM=rep_id)
    # build the requested model
    if method == 'MFE':
        return self.MFE(X_split, y_split, model)
    if method == 'CNN':
        return self.CNN(X_split, y_split, model)

def main():
    NAME_IDX = 5
    data_file = '../data/the_office_scripts.csv'
    length = get_num_lines(data_file)
    data_names = get_data(data_file, NAME_IDX, length)
    vocab = build_vocab(data_names)
    data_ids = convert_to_id(vocab, data_names)
    train_data, test_data = split_data(data_ids, length)
    num_tokens = len(vocab)
    model = RNN_WORD_Model(num_tokens)
    for i in range(40):  # 40 training epochs
        train(model, train_data)
    test(model, test_data)

import csv
import heapq
# Assumed imports for the aliases used below: dc = deepcopy, pp = the
# project's preprocess module, date for the parsed date values.
from copy import deepcopy as dc
from datetime import date

import preprocess as pp

# Read the CSV, parse dates, and collect rows into `data`.
# The Python 2 `file(..., 'rb')` call is replaced with `open(...)`.
with open('t_alibaba_data.csv', 'r', newline='') as csvfile:
    reader = csv.reader(csvfile)
    data = []
    for line in reader:
        if reader.line_num == 1:  # skip the header row
            continue
        line[3] = date(*pp.parse_date(line[3]))
        line[2] = int(line[2])
        data.append(line)

# Split the data into four monthly parts.
mon4, mon5, mon6, mon7 = pp.split_data(data)

# Training set: train_d aka x (mon4, mon5); train_t aka y (mon6).
# Testing set:  test_d aka x (mon4, mon5, mon6); test_t aka y (mon7).
train_d = dc(mon4)
train_d.extend(mon5)
test_d = dc(train_d)
test_d.extend(mon6)
train_t = dc(mon6)
test_t = dc(mon7)

# Process the data.
train_d = pp.process_activity(train_d)
train_t = pp.process_activity(mon6)
train_d = pp.normalization(train_d)

from SVM import support_vector_machine

path = "./data"

if __name__ == "__main__":
    # PROCESS THE DATA
    sentences, labels = read_data(path)
    TFIDF = compute_TFIDF(sentences, path)  # calculate TF-IDF values
    word_list, TFIDF = read_TFIDF(path)  # read unique word list and TF-IDF values
    create_vectors(word_list, TFIDF, path)  # vectorize data
    NROWS = len(TFIDF)
    NCOLS = len(word_list)
    # vectorized corpus (premise + hypothesis)
    data = get_vectors(path, NROWS, NCOLS)  # get vectorized data
    y = convert_list_to_nd_array("y", labels)
    train_x, train_y, test_x, test_y = split_data(
        data, y, path)  # split 80% for training, 20% for test

    # Flatten the 2D label arrays into the 1D shape the SVM expects.
    train_y_1 = train_y.flatten()
    test_y_1 = test_y.flatten()

    accuracy_lg = logistic_reg(train_x, train_y, test_x, test_y)
    accuracy_svm = support_vector_machine(train_x, train_y_1, test_x, test_y_1)

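# A minimal sketch of the flatten step above, assuming the labels arrive as an
# (n, 1) column vector: scikit-learn-style estimators expect a 1-D target array.
import numpy as np

y_col = np.array([[0], [1], [1], [0]])
print(y_col.shape)            # (4, 1)
print(y_col.flatten().shape)  # (4,)
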
import numpy as np
import keras
from keras.models import Sequential
from keras.layers import Dense, Conv2D, MaxPooling2D, Dropout, Flatten
from keras import optimizers
from sklearn.model_selection import train_test_split

import preprocess

X, y = preprocess.read_data('../../smiles')
X = preprocess.normalize(X)
# Add a trailing channel axis so the images match Conv2D's expected input shape.
X = X.reshape(X.shape[0], X.shape[1], X.shape[2], 1)
X_train, X_valid, X_test, y_train, y_valid, y_test = preprocess.split_data(X, y)
# from keras.utils import to_categorical
# y_train = to_categorical(y_train)
# y_test = to_categorical(y_test)
# y_valid = to_categorical(y_valid)
print(y_train.shape)

input_shape = X_train.shape[1:]


def createModel():
    model = Sequential()
    model.add(
def start():
    if len(sys.argv) == 1:
        print("========Training a new model========")
        # scenario_name = sys.argv[1]
        dataset = [get_data(i) for i in range(1, 8)]
        fer_cols = ['R%d_FER' % i for i in range(1, 97)]
        rssi_min, rssi_max = preprocess.get_rssi_min_max(dataset)
        for data in dataset:
            data[rssi_cols] = encode_rssi(data[rssi_cols], rssi_min, rssi_max).fillna(1)
        (data_train, data_test, data24_train, data5_train, data24_test,
         data5_test, data24_sim, data5_sim) = preprocess.split_data(dataset)
        train_all_models(data24_train, fer_cols)
        print('Trained all models...')
        exit()  # NOTE: everything below this call in this branch is unreachable
        time_dataset = []
        time_dataset.append(data24_test[0])
        agency = test_lstm(time_dataset, fer_cols)
        sim_dataset = []
        sim_dataset.append(data24_sim[0])
        full_best_dict = start_sim(sim_dataset, agency)
        util_file.best_dict_to_csv(full_best_dict)
    elif len(sys.argv) == 2:
        if sys.argv[1] == 'run':
            print("========Using Existing Models========")
            # scenario_name = sys.argv[1]
            dataset = [get_data(i) for i in range(1, 8)]
            fer_cols = ['R%d_FER' % i for i in range(1, 97)]
            rssi_min, rssi_max = preprocess.get_rssi_min_max(dataset)
            for data in dataset:
                data[rssi_cols] = encode_rssi(data[rssi_cols], rssi_min, rssi_max).fillna(1)
            (data_train, data_test, data24_train, data5_train, data24_test,
             data5_test, data24_sim, data5_sim) = preprocess.split_data(dataset)
            time_dataset = []
            time_dataset.append(data24_test[2])
            agency = test_lstm(time_dataset, fer_cols)
            sim_dataset = []
            sim_dataset.append(data24_sim[2])
            full_best_dict = start_sim(sim_dataset, agency)
            util_file.best_dict_to_csv(full_best_dict)
        if sys.argv[1] == 'c':
            print("========Using The C Code For Prediction========")
            points = _est.python_init(
                b"data/1-sec-average-bugFix/twoHourStableS8/s8.cfg",
                b"s8-1-rates.sets",
                b"data/1-sec-average-bugFix/twoHourStableS8-a",
                b"data/1-sec-average-bugFix/twoHourStableS8-b")
            RATES = _est.get_rates()
            rates_array_type = ctypes.c_double * (RATES + 1)
            knownfers = rates_array_type()
            for x in range(1, RATES + 1):
                knownfers[x] = FER_INVALID  # -1.0 means not used
            agency = C_test_lstm(RATES, knownfers, points)
            full_best_dict = C_start_sim(agency, points, RATES, knownfers)
            util_file.best_dict_to_csv(full_best_dict)

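# A standalone sketch of the ctypes array idiom used above: multiplying a
# ctype by n creates an n-element array type whose instances are
# zero-initialized by default.
import ctypes

DoubleArray4 = ctypes.c_double * 4
arr = DoubleArray4()
arr[1] = -1.0
print(list(arr))  # [0.0, -1.0, 0.0, 0.0]
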
def run_experiment():
    scores = {
        "svr1": [], "svr2": [], "svc1": [], "svc2": [],
        "rf1": [], "rf2": [], "knn1": [], "knn2": [],
        "nb1": [], "nb2": [], "log1": [], "log2": [],
        "ridge_reg1": [], "ridge_reg2": [], "ridge_clf1": [], "ridge_clf2": []
    }
    for i in range(ITER):
        print("Iter:", i + 1)
        random.shuffle(lines)
        raw_data = [json.loads(line) for line in lines]
        data_train, data_test = preprocess.split_data(raw_data, TRAIN_TEST_RATIO)
        # IMDB rating as float:
        X_train, y_train, names_train, X_test, y_test, names_test = preprocess.preprocess(
            data_train, data_test, False)
        # rounded IMDB rating:
        X_train2, y_train2, names_train2, X_test2, y_test2, names_test2 = preprocess.preprocess(
            data_train, data_test, True)
        scores["svr1"].append(svm_reg(X_train, y_train, X_test, y_test))
        scores["svr2"].append(svm_reg(X_train2, y_train2, X_test2, y_test2))
        scores["svc1"].append(svm_clf(X_train, y_train, X_test, y_test))
        scores["svc2"].append(svm_clf(X_train2, y_train2, X_test2, y_test2))
        scores["rf1"].append(random_forest_reg(X_train, y_train, X_test, y_test))
        scores["rf2"].append(random_forest_reg(X_train2, y_train2, X_test2, y_test2))
        scores["knn1"].append(knn(X_train, y_train, X_test, y_test))
        scores["knn2"].append(knn(X_train2, y_train2, X_test2, y_test2))
        scores["nb1"].append(naive_bayes(X_train, y_train, X_test, y_test))
        scores["nb2"].append(naive_bayes(X_train2, y_train2, X_test2, y_test2))
        scores["log1"].append(logistic_reg(X_train, y_train, X_test, y_test))
        scores["log2"].append(logistic_reg(X_train2, y_train2, X_test2, y_test2))
        scores["ridge_reg1"].append(ridge_reg(X_train, y_train, X_test, y_test))
        scores["ridge_reg2"].append(ridge_reg(X_train2, y_train2, X_test2, y_test2))
        scores["ridge_clf1"].append(ridge_clf(X_train, y_train, X_test, y_test))
        scores["ridge_clf2"].append(ridge_clf(X_train2, y_train2, X_test2, y_test2))
    # Append the per-model mean as the last entry of each score list.
    for key in scores:
        mean = sum(scores[key]) / len(scores[key])
        scores[key].append(mean)
        print(key, mean)
    print(scores)
    return scores

import json
import random

import preprocess
from models import *

TRAIN_FILE = "crawler/data3.json"
EXAMPLE_FILE = "example.json"
PREDICTION_FILE = "prediction.txt"
TRAIN_TEST_RATIO = 0.8
ITER = 20

with open(TRAIN_FILE, "r") as train_file:
    lines = train_file.read().splitlines()
random.shuffle(lines)
raw_data = [json.loads(line) for line in lines]
data_train, data_test = preprocess.split_data(raw_data, TRAIN_TEST_RATIO)
# IMDB rating as float:
X_train, y_train, names_train, X_test, y_test, names_test = preprocess.preprocess(
    data_train, data_test, False)
# rounded IMDB rating:
X_train2, y_train2, names_train2, X_test2, y_test2, names_test2 = preprocess.preprocess(
    data_train, data_test, True)


def divided_by_ten(x):
    return [i / 10 for i in x]


def run_models():
    print("svr error margin:", svm_reg(X_train, y_train, X_test, y_test)[0])
    print("svr error margin (rounded):",
        # Tail of an accuracy helper whose definition precedes this excerpt.
        true_pred = 0
        for i in range(len(predictions)):
            if np.argmax(predictions[i]) == np.argmax(true_labels[i]):
                # the 1 is at the same index as the ground truth
                true_pred += 1
        return true_pred / len(predictions)


if __name__ == "__main__":
    # PROCESS THE DATA
    words, labels = read_data(path)
    sentences = create_samples(words, labels)
    train_x, train_y, test_x, test_y = split_data(sentences)

    # Create the one-hot vector notation of the labels (labels are given as
    # numeric indices):
    #   [0, 1] is PERSON
    #   [1, 0] is not PERSON
    new_train_y = np.zeros(shape=(len(train_y), output_size))
    new_test_y = np.zeros(shape=(len(test_y), output_size))
    for i in range(len(train_y)):
        new_train_y[i][int(train_y[i])] = 1
    for i in range(len(test_y)):
        new_test_y[i][int(test_y[i])] = 1
    train_y = new_train_y
    test_y = new_test_y

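# A vectorized alternative to the one-hot loops above (a sketch assuming the
# same integer-label arrays): indexing an identity matrix by the label indices
# produces all one-hot rows in a single step.
import numpy as np

labels = np.array([0, 1, 1, 0])
output_size = 2
one_hot = np.eye(output_size)[labels.astype(int)]
print(one_hot)  # [[1. 0.] [0. 1.] [0. 1.] [1. 0.]]
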
def run_bert(device, results_file):
    # prepare the dataset
    logging.info("READING AND PARSING THE DATA...........")
    ground_truth = read_ground_truth_files(args.data_dir)
    if args.mode == "concreteness":
        data_path = os.path.join(args.data_dir, "absconc_data_raw.csv")
        data, labels = read_data(data_path, ground_truth, args.feature)
        train_data, dev_data, test_data, train_labels, dev_labels, test_labels = split_data(
            data, labels, dev_size=0.1, test_size=0.2)
    elif args.mode == "wiki":
        wiki_data_path = os.path.join(args.data_dir, "articles.csv")
        data, labels = read_wikidata(wiki_data_path, ground_truth)
        train_data, dev_data, test_data, train_labels, dev_labels, test_labels = split_data(
            data, labels, dev_size=0.1, test_size=0.2)
    train_data, dev_data, test_data = tokenize_data(args, train_data, train_labels,
                                                    dev_data, dev_labels,
                                                    test_data, test_labels)

    # prepare the models
    if args.classifier == 'bert':
        model = BertForSequenceClassification.from_pretrained(
            "bert-base-uncased",
            num_labels=args.num_label,
            output_attentions=False,
            output_hidden_states=False)
    elif args.classifier == 'distilbert':
        # The original used BertForSequenceClassification here, which expects
        # a BERT checkpoint; the DistilBERT class matches this checkpoint.
        model = DistilBertForSequenceClassification.from_pretrained(
            "distilbert-base-uncased",
            num_labels=args.num_label,
            output_attentions=False,
            output_hidden_states=False)
    optimizer = AdamW(model.parameters(), lr=args.lr, eps=1e-8)
    epoch = args.epochs
    train_iter = DataLoader(train_data,
                            sampler=RandomSampler(train_data),
                            batch_size=args.batch_size)
    dev_iter = DataLoader(dev_data,
                          sampler=SequentialSampler(dev_data),
                          batch_size=args.batch_size)
    test_iter = DataLoader(test_data,
                           sampler=SequentialSampler(test_data),
                           batch_size=args.batch_size)

    # create the model save directory
    checkpoint_dir = os.path.join(args.checkpoint_dir, args.model_name)
    if not os.path.exists(checkpoint_dir):
        os.makedirs(checkpoint_dir)

    # run the tests
    logging.info(
        "Number of training samples {train}, number of dev samples {dev}, number of test samples {test}"
        .format(train=len(train_data), dev=len(dev_data), test=len(test_data)))
    train_bert(epoch, model, train_iter, dev_iter, optimizer, device,
               checkpoint_dir, results_file)
    model = load_model(checkpoint_dir)
    acc, f1, recall, prec, f1_ave, recall_ave, prec_ave = test_bert(
        test_iter, model, device)
    del model
    return acc, f1, recall, prec, f1_ave, recall_ave, prec_ave

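# A self-contained sketch of the sampler choice above: RandomSampler reshuffles
# the training data each epoch, while SequentialSampler keeps evaluation order
# deterministic. (Toy dataset; the real code wraps tokenized text.)
import torch
from torch.utils.data import DataLoader, SequentialSampler, TensorDataset

ds = TensorDataset(torch.arange(6))
loader = DataLoader(ds, sampler=SequentialSampler(ds), batch_size=2)
print([batch[0].tolist() for batch in loader])  # [[0, 1], [2, 3], [4, 5]]
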
    'train_dir': train_dir,
    'valid_dir': valid_dir,
    'output': output_dir,
    'input_size': (224, 224),
    'batch': 16,
    'epoch': 15,
    'lr': 0.001,
    'momentum': 0.9,
    'log_interval': 2,
    'valid_interval': 2,
    'n_cpu': 16,
    'augment': True,
    'ver': 1.1
}

cv = split_data(option['ver'], p=0.1)
option['train_dir'] = cv[0]
option['test_dir'] = cv[1]

use_cuda = torch.cuda.is_available()
device = torch.device("cuda" if use_cuda else "cpu")
kwargs = {
    'num_workers': option['n_cpu'],
    'pin_memory': True
} if use_cuda else {}

print('option:', option)
print('use cuda:', use_cuda)

if __name__ == '__main__':