def run_predict_review(review_text):
    if review_text is None or review_text == "":
        return
    utils.data_preprocessing()
    df = pd.read_csv("Data/preprocessed_reviews_file.csv",
                     names=['reviewContent', 'rating'])
    stopset = set(stopwords.words('english'))
    vectorizer = TfidfVectorizer(use_idf=True,
                                 lowercase=True,
                                 strip_accents='ascii',
                                 stop_words=stopset)
    y = df['rating']
    X = vectorizer.fit_transform(df.reviewContent)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    clf = naive_bayes.MultinomialNB()
    print("Fitting Multinomial Naive Bayes Classifier")
    clf.fit(X_train, y_train)
    # Optional evaluation on the held-out split:
    # y_pred_class = clf.predict(X_test)
    # print(metrics.accuracy_score(y_test, y_pred_class))
    # print(metrics.confusion_matrix(y_test, y_pred_class))
    # print(classification_report(y_test, y_pred_class))

    Review_input = np.array([review_text])
    Review_input_vector = vectorizer.transform(Review_input)
    print("Predicted review rating: " + str(clf.predict(Review_input_vector)[0]))
    print("Review Content: " + review_text)
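# Hypothetical usage sketch for run_predict_review above: the review string is a
# placeholder, and the surrounding script is assumed to have already imported
# utils, pandas, numpy, the NLTK stopwords, and the scikit-learn pieces used above.
if __name__ == "__main__":
    run_predict_review("The pasta was delicious and the staff were friendly.")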
def load_train_data(data_dir, input_shape):
    val_filename = os.listdir(os.path.join(data_dir, 'validation'))
    train_filename = sorted(
        os.listdir(os.path.join(data_dir, 'training')) +
        [name[:-3] + '_.jpg' for name in val_filename])
    X = np.array([
        cv2.resize(
            cv2.imread(
                os.path.join(data_dir, 'training', n) if n[-5] != '_' else
                os.path.join(data_dir, 'validation', n[:-5] + 'jpg')),
            input_shape) for n in tqdm(train_filename)
    ])
    Y = np.array([int(n.split('_')[0]) for n in train_filename])
    X = X.astype(np.float32)
    utils.data_preprocessing(X)
    Y = to_categorical(Y)
    return X, Y
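# Hypothetical usage sketch for load_train_data above: "data" and (224, 224) are
# placeholder values; the directory is assumed to contain 'training' and
# 'validation' subfolders of images named "<label>_*.jpg", as the loader expects.
X, Y = load_train_data("data", (224, 224))
print(X.shape, Y.shape)  # e.g. (N, 224, 224, 3) and (N, n_classes)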
def trainer(dataset):
    model = DAEGC(num_features=args.input_dim,
                  hidden_size=args.hidden_size,
                  embedding_size=args.embedding_size,
                  alpha=args.alpha,
                  num_clusters=args.n_clusters).to(device)
    print(model)
    optimizer = Adam(model.parameters(),
                     lr=args.lr,
                     weight_decay=args.weight_decay)

    # data process
    dataset = utils.data_preprocessing(dataset)
    adj = dataset.adj.to(device)
    adj_label = dataset.adj_label.to(device)
    M = utils.get_M(adj).to(device)

    # data and label
    data = torch.Tensor(dataset.x).to(device)
    y = dataset.y.cpu().numpy()

    with torch.no_grad():
        _, z = model.gat(data, adj, M)

    # get kmeans and pretrain cluster result
    kmeans = KMeans(n_clusters=args.n_clusters, n_init=20)
    y_pred = kmeans.fit_predict(z.data.cpu().numpy())
    model.cluster_layer.data = torch.tensor(kmeans.cluster_centers_).to(device)
    eva(y, y_pred, 'pretrain')

    for epoch in range(args.max_epoch):
        model.train()
        if epoch % args.update_interval == 0:
            # update_interval
            A_pred, z, Q = model(data, adj, M)
            q = Q.detach().data.cpu().numpy().argmax(1)  # Q
            eva(y, q, epoch)

        A_pred, z, q = model(data, adj, M)
        p = target_distribution(Q.detach())

        kl_loss = F.kl_div(q.log(), p, reduction='batchmean')
        re_loss = F.binary_cross_entropy(A_pred.view(-1), adj_label.view(-1))

        loss = 10 * kl_loss + re_loss

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
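# A minimal sketch of the target_distribution helper used in trainer above,
# assuming the standard DEC-style sharpened target p = q^2 / f with per-row
# renormalization; the project's own implementation may differ.
def target_distribution(q):
    # q: soft cluster assignments of shape (n_nodes, n_clusters)
    weight = q ** 2 / q.sum(0)               # sharpen and weight by cluster frequency
    return (weight.t() / weight.sum(1)).t()  # renormalize each row to sum to 1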
def run_restaurants_sentiments():
    utils.data_preprocessing()
    restaurants_file_pd = pd.read_csv("Data/restaurants_60601-60606.csv")
    reviews_file_pd = pd.read_csv("Data/clean_reviews_60601-60606.csv")
    combined_file = pd.merge(restaurants_file_pd,
                             reviews_file_pd,
                             left_on="restaurantID",
                             right_on="restaurantID")
    combined_file.to_csv("Data/restaurants_reviews_combined_file.csv",
                         sep=',',
                         encoding='utf-8')
    combined_file_pd = pd.read_csv(
        "Data/restaurants_reviews_combined_file.csv")
    review_sentiment = []
    end = len(combined_file_pd.index)
    for i in range(end):
        temp = combined_file_pd.loc[i, "reviewContent"]
        blob = TextBlob(temp)
        if blob.sentiment.polarity > 0.175:
            predict_rating = "positive"
        else:
            predict_rating = "negative"
        review_sentiment.append(predict_rating)
        utils.progress(i, end - 1, status='Calculating polarity for each review')
    combined_file_pd['Review Sentiment'] = review_sentiment
    result_pd = combined_file_pd[[
        'restaurantID', 'name', 'reviewID', 'Review Sentiment', 'rating_y'
    ]]
    result_pd.to_csv("Results/query_5_result.csv", index=False)
    print("The output is generated in Results/query_5_result.csv")
def main(configs):
    # read mat data from file
    input_data = utl.read_mat(configs['DATA_PATH'])

    # data preprocessing
    input_data, proc_mask = utl.data_preprocessing(input_data,
                                                   configs['MONTH_SELECTION'])

    # generate feature vectors
    feats, labels = generate_features(input_data)

    # backup feats and labels
    feats_backup = feats
    labels_backup = labels

    # weather classification
    feats, labels, masks = weather_classification(feats, configs['MODE'], labels)

    if configs['MODE'] == 'grid search':
        grid_search_wrapper(feats, labels, configs)
    elif configs['MODE'] == 'holdout training':
        holdout_train_wrapper(feats, labels, configs, masks)
    elif configs['MODE'] == 'weather prediction':
        preds = weather_prediction(feats, labels, configs, masks)
        # compare predicted irradiance drop
        utl.plot_irradiance_drop(feats_backup[:, 5] - preds,
                                 feats_backup[:, 5] - labels_backup)
        utl.plot_irradiance_drop(preds, labels_backup)
        ''' regroup the data '''
        preds_cube, labels_cube = utl.regroup_data(preds, labels_backup,
                                                   proc_mask)
        utl.compare_daily_mean(preds_cube, labels_cube, sensor_selection=24)
        plt.show()
def calc_normal(points, tree, radius=0.1):
    """Calculate unoriented normals from a 2D point set

    Params:
        points (np.array): the point cloud
        tree (KDTree)    : the kdtree to find neighbors
        radius (float)   : the radius to collect neighbors
    Returns:
        normals (np.array): the estimated unoriented normals
    """
    nbs_list = tree.query_radius(points, radius)
    normals = np.zeros((len(points), 2))
    for i, nbs in enumerate(nbs_list):
        my_nbs = points[nbs]
        eigen_vectors, _ = standard_pca(data_preprocessing(my_nbs)[0])
        normals[i] = eigen_vectors[:, -1]
    normals = normals / np.linalg.norm(normals, axis=1, keepdims=True)
    return normals
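# Usage sketch for calc_normal above: points is a random placeholder cloud, and
# standard_pca / data_preprocessing are assumed to be this project's own helpers.
import numpy as np
from sklearn.neighbors import KDTree

points = np.random.rand(100, 2)              # placeholder 2D point cloud
tree = KDTree(points)                        # neighbor index over the same points
normals = calc_normal(points, tree, radius=0.1)
print(normals.shape)                         # (100, 2): one unit normal per point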
def pretrain(dataset):
    model = GAT(
        num_features=args.input_dim,
        hidden_size=args.hidden_size,
        embedding_size=args.embedding_size,
        alpha=args.alpha,
    ).to(device)
    print(model)
    optimizer = Adam(model.parameters(),
                     lr=args.lr,
                     weight_decay=args.weight_decay)

    # data process
    dataset = utils.data_preprocessing(dataset)
    adj = dataset.adj.to(device)
    adj_label = dataset.adj_label.to(device)
    M = utils.get_M(adj).to(device)

    # data and label
    x = torch.Tensor(dataset.x).to(device)
    y = dataset.y.cpu().numpy()

    for epoch in range(args.max_epoch):
        model.train()
        A_pred, z = model(x, adj, M)
        loss = F.binary_cross_entropy(A_pred.view(-1), adj_label.view(-1))
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        with torch.no_grad():
            _, z = model(x, adj, M)
            kmeans = KMeans(n_clusters=args.n_clusters,
                            n_init=20).fit(z.data.cpu().numpy())
            acc, nmi, ari, f1 = eva(y, kmeans.labels_, epoch)

        if epoch % 5 == 0:
            torch.save(model.state_dict(),
                       f"./pretrain/predaegc_{args.name}_{epoch}.pkl")
input_size = len(sel_variables)
# num_output = len(sel_variables)  # if you use a shared neural network, please remove

conf_int = linear_model.conf_int()
coef = linear_model.params
std = linear_model.bse
conf_int[0] = coef
conf_int[1] = std
print(data_name, len(sel_variables))
exit('bye')  # debug stop: execution ends here until this line is removed

train_set, val_set, test_set = data_preprocessing(train=train,
                                                  test=test,
                                                  variables=sel_variables,
                                                  conf_int=conf_int,
                                                  dep_var=dep_var)

meta_reg = MetaRegression(input_size=input_size,
                          hidden_size=hidden_size,
                          output_size=num_output)
# print(summary(meta_reg, [(1, 12), (1, 12), (1, 12), (1, 12)]))

if os.path.isfile('models/checkpoint_%s_%s.pt' % (data_name, k)):
    meta_reg = training(meta_reg,
                        train_set,
                        val_set,
                        epochs=num_epochs,
                        batch_size=batch_size,
                        lr=learning_rate,
    )
    checkpoint = ModelCheckpoint(model_path,
                                 'val_loss',
                                 verbose=1,
                                 save_best_only=True,
                                 save_weights_only=True)
    reduce_lr = ReduceLROnPlateau('val_loss', 0.8, 2, verbose=1, min_lr=1e-5)
    logger = CSVLogger(model_path + '.csv', append=True)
    #tensorboard = TensorBoard(model_path[:model_path.rfind('.')] + '_logs', histogram_freq=1, batch_size=1024, write_grads=True, write_images=True, update_freq=512)
    model.fit(trainX,
              trainY,
              validation_data=(validX, validY),
              batch_size=256,
              epochs=10,
              callbacks=[checkpoint, reduce_lr, logger])
else:
    print('\033[32;1mLoading Model\033[0m')
    model.load_weights(model_path)

sentences = [
    text_to_word_sequence('today is a good day, but it is hot'),
    text_to_word_sequence('today is hot, but it is a good day')
]
sentences = utils.data_preprocessing(sentences, word2idx, max_seq_len)
print(model.predict(sentences))
## Training params from paper
BATCH_SIZE = 32
LEARNING_RATE = 0.001
WEIGHT_DECAY = 0.0001
EARLY_STOP_EPOCHS = 10  # stop training after this many epochs without improvement
EARLY_STOP_MONITOR = "loss"  # monitor early stopping on the validation loss or accuracy
MODEL_SAVE_PATH = "/tmp/text-level-gnn-{}.pt".format(experiment_dataset)
SAVE_ACC_THRES = 0.8 if experiment_dataset != "Ohsumed" else 0.5
###

# Read dataset
data_pd = utils.read_data(experiment_dataset)

# Data preprocessing
data_pd = utils.data_preprocessing(data_pd)

# Feature Extraction
data_pd, word2idx = utils.features_extracting(
    data_pd,
    minimum_word_count=MIN_WORD_COUNT,
    neighbor_distance=NEIGHBOR_DISTANCE)

print("\n", data_pd.head())

# # Model
NUM_CLASSES = len(set(data_pd['y'].values))

# Construct model
text_level_gnn = model.TextLevelGNN_Model(word2idx,
                                          NUM_CLASSES,
                                          WORD_EMBED_DIM,
                                          PRETRAIN_EMBEDDING,
For the description of the goal, see
https://www.kaggle.com/c/ieee-fraud-detection/overview
"""
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from scipy import stats
import sklearn
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, roc_auc_score, f1_score, roc_curve, auc, precision_recall_curve
from utils import PCA_change, resumetable, reduce_mem_usage, make_day_feature, make_hour_feature, fit_categorical_feature, data_preprocessing, get_input_features
from nn_model import construct_model
import pickle

df_train, df_test, Y_train = data_preprocessing("nn", pre_loaded=True)

categorical = ["ProductCD", "card1", "card2", "card3", "card4", "card5", "card6",
               "addr1", "addr2", "P_emaildomain", "R_emaildomain"] + \
              ["M" + str(i) for i in range(1, 10)] + \
              ['DeviceType', 'DeviceInfo', 'Weekday', 'Hour', 'P_emaildomain_pre',
               'P_emaildomain_suffix', 'R_emaildomain_pre', 'R_emaildomain_suffix'] + \
              ["id_" + str(i) for i in range(12, 38) if "id_" + str(i) in df_train.columns]

category_counts = {
    'ProductCD': 6,
    'card1': 17092,
    'card2': 503,
    'card3': 135,
    'card4': 6,
    'card5': 140,
    'card6': 6,
    'addr1': 443,
    'addr2': 95,
import run
import tensorflow as tf

batch_size = 128
embed_dim = 100
maxlen = 200
rnn_hiden_size = 128
epochs = 3

if __name__ == '__main__':
    # load_data
    train = utils.load_data('data/china-people-daily-ner-corpus/example.train')
    dev = utils.load_data('data/china-people-daily-ner-corpus/example.dev')
    test = utils.load_data('data/china-people-daily-ner-corpus/example.test')

    token2idx, idx2token, tag2idx, idx2tag, real_maxlen = utils.data_preprocessing(
        train)  # len(token2idx) = 4314
    print(f'real_maxlen = {real_maxlen}')

    # padding
    train = utils.padding(train, token2idx, tag2idx, maxlen)
    dev = utils.padding(dev, token2idx, tag2idx, maxlen)
    test = utils.padding(test, token2idx, tag2idx, maxlen)

    train = utils._to_tensor(train, tf.int32)
    dev = utils._to_tensor(dev, tf.int32)
    test = utils._to_tensor(test, tf.int32)

    # to batch
    train_ds = tf.data.Dataset.from_tensor_slices(train).shuffle(10000).batch(
        batch_size)
    dev_ds = tf.data.Dataset.from_tensor_slices(dev).shuffle(2000).batch(
def load_normalize_data(path):
    data, label = load_data(path)
    processed_data = data_preprocessing(data)
    return processed_data, label
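# Hypothetical usage sketch for load_normalize_data above: the path is a
# placeholder, and load_data / data_preprocessing are assumed to come from the
# project's own utilities.
X, y = load_normalize_data("data/train_set")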
import numpy as np
import matplotlib.pyplot as plt
import scipy as sp
from scipy import stats
import sklearn
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_score, recall_score, confusion_matrix, accuracy_score, roc_auc_score, f1_score, roc_curve, auc, precision_recall_curve
from utils import resumetable, reduce_mem_usage, fit_categorical_feature, data_preprocessing
import pickle
from sklearn.preprocessing import StandardScaler
import lightgbm as lgb
from bayes_opt import BayesianOptimization

df_train, df_test, Y_train = data_preprocessing("bayesopt", pre_loaded=True)

categorical = ["ProductCD", "card1", "card2", "card3", "card4", "card5", "card6",
               "addr1", "addr2", "P_emaildomain", "R_emaildomain"] + \
              ["M" + str(i) for i in range(1, 10)] + \
              ['DeviceType', 'DeviceInfo', 'Weekday', 'Hour', 'P_emaildomain_pre',
               'P_emaildomain_suffix', 'R_emaildomain_pre', 'R_emaildomain_suffix'] + \
              ["id_" + str(i) for i in range(12, 38) if "id_" + str(i) in df_train.columns]

category_counts = {'ProductCD': 6, 'card1': 17092, 'card2': 503, 'card3': 135,
                   'card4': 6, 'card5': 140, 'card6': 6, 'addr1': 443, 'addr2': 95,
                   'P_emaildomain': 62, 'R_emaildomain': 62, 'M1': 4, 'M2': 4, 'M3': 4,
                   'M4': 5, 'M5': 4, 'M6': 4, 'M7': 4, 'M8': 4, 'M9': 4,
                   'DeviceType': 4, 'DeviceInfo': 2801, 'Weekday': 8, 'Hour': 25,
                   'P_emaildomain_pre': 10, 'P_emaildomain_suffix': 10,
                   'R_emaildomain_pre': 10, 'R_emaildomain_suffix': 10,
                   'id_12': 4, 'id_13': 57, 'id_14': 30, 'id_15': 5, 'id_16': 4,
                   'id_17': 129, 'id_19': 570, 'id_20': 549, 'id_28': 4, 'id_29': 4,
                   'id_30': 88, 'id_31': 173, 'id_32': 8, 'id_33': 463, 'id_34': 6,
                   'id_35': 4, 'id_36': 4, 'id_37': 4, 'id_38': 4}

numerical = ["TransactionAmt", "dist1"] + \
            ["C" + str(i) for i in range(1, 15) if "C" + str(i) in df_train.columns] + \
            ["D" + str(i) for i in range(1, 16) if "D" + str(i) in df_train.columns] + \
            ["PCA_V_" + str(i) for i in range(20)] + \
            ['id_01', 'id_02', 'id_03', 'id_04', 'id_05', 'id_06', 'id_09', 'id_10', 'id_11']
def main(_):
    # Load the configuration file.
    with open(FLAGS.config, 'r') as f:
        config = yaml.safe_load(f)
    print('**********', config['experiment_name'], '**********')

    """ Cuda Check """
    if torch.cuda.is_available():
        print('Using GPU!')
    else:
        print('No GPU!')

    """ Data Preprocessing """
    if config['data_preprocessing']:
        print('Pre-processing Original Data ...')
        data_preprocessing()
        print('Data Pre-processing Done!')

    """ Read Data & Get Embedding """
    train_data = pd.read_csv('input/cleaned_train.csv')
    test_data = pd.read_csv('input/cleaned_test.csv')

    # split dataset
    msk = np.random.rand(len(train_data)) < 0.8
    train = train_data[msk]
    valid = train_data[~msk]
    all_sents = train_data['s1'].tolist() + train_data['s2'].tolist() + \
        test_data['s1'].tolist() + test_data['s2'].tolist()

    # dataset
    trainDS = myDS(train, all_sents)
    validDS = myDS(valid, all_sents)
    print('Data size:', train_data.shape[0], test_data.shape[0])

    full_embed_path = config['embedding']['full_embedding_path']
    cur_embed_path = config['embedding']['cur_embedding_path']

    if os.path.exists(cur_embed_path) and not config['make_dict']:
        embed_dict = load_embed(cur_embed_path)
        print('Loaded existing embedding.')
    else:
        print('Making embedding...')
        embed_dict = get_embedding(trainDS.vocab._id2word, full_embed_path)
        save_embed(embed_dict, cur_embed_path)
        print('Saved generated embedding.')

    vocab_size = len(embed_dict)

    # initialize nn embedding
    embedding = nn.Embedding(vocab_size, config['model']['embed_size'])
    embed_list = []
    for word in trainDS.vocab._id2word:
        embed_list.append(embed_dict[word])
    weight_matrix = np.array(embed_list)

    # pass weights to nn embedding
    embedding.weight = nn.Parameter(
        torch.from_numpy(weight_matrix).type(torch.FloatTensor),
        requires_grad=False)

    """ Model Preparation """
    # embedding
    config['embedding_matrix'] = embedding
    config['vocab_size'] = len(embed_dict)

    # model
    siamese = Siamese_lstm(config)
    print(siamese)

    # loss func
    loss_weights = Variable(torch.FloatTensor([1, 3]))
    if torch.cuda.is_available():
        loss_weights = loss_weights.cuda()
    criterion = torch.nn.CrossEntropyLoss(loss_weights)

    # optimizer
    learning_rate = config['training']['learning_rate']
    if config['training']['optimizer'] == 'sgd':
        optimizer = torch.optim.SGD(
            filter(lambda x: x.requires_grad, siamese.parameters()),
            lr=learning_rate)
    elif config['training']['optimizer'] == 'adam':
        optimizer = torch.optim.Adam(
            filter(lambda x: x.requires_grad, siamese.parameters()),
            lr=learning_rate)
    elif config['training']['optimizer'] == 'adadelta':
        optimizer = torch.optim.Adadelta(
            filter(lambda x: x.requires_grad, siamese.parameters()),
            lr=learning_rate)
    elif config['training']['optimizer'] == 'rmsprop':
        optimizer = torch.optim.RMSprop(
            filter(lambda x: x.requires_grad, siamese.parameters()),
            lr=learning_rate)
    print('Optimizer:', config['training']['optimizer'])
    print('Learning rate:', config['training']['learning_rate'])

    # log info
    train_log_string = '%s :: Epoch %i :: Iter %i / %i :: train loss: %0.4f'
    valid_log_string = '%s :: Epoch %i :: valid loss: %0.4f\n'

    # Restore saved model (if one exists).
    ckpt_path = os.path.join(config['ckpt_dir'],
                             config['experiment_name'] + '.pt')
    if os.path.exists(ckpt_path):
        print('Loading checkpoint: %s' % ckpt_path)
        ckpt = torch.load(ckpt_path)
        epoch = ckpt['epoch']
        siamese.load_state_dict(ckpt['siamese'])
        optimizer.load_state_dict(ckpt['optimizer'])
    else:
        epoch = 1
        print('Fresh start!\n')

    if torch.cuda.is_available():
        criterion = criterion.cuda()
        siamese = siamese.cuda()

    """ Train """
    if config['task'] == 'train':
        # save every epoch for visualization
        train_loss_record = []
        valid_loss_record = []
        best_record = 10.0

        # training
        print('Experiment: {}\n'.format(config['experiment_name']))
        while epoch < config['training']['num_epochs']:
            print('Start Epoch {} Training...'.format(epoch))

            # loss
            train_loss = []
            train_loss_sum = []

            # dataloader
            train_dataloader = DataLoader(dataset=trainDS,
                                          shuffle=True,
                                          num_workers=2,
                                          batch_size=1)
            for idx, data in enumerate(train_dataloader, 0):
                # get data
                s1, s2, label = data
                # clear gradients
                optimizer.zero_grad()
                # input
                output = siamese(s1, s2)
                output = output.squeeze(0)
                # label cuda
                label = Variable(label)
                if torch.cuda.is_available():
                    label = label.cuda()
                # loss backward
                loss = criterion(output, label)
                loss.backward()
                optimizer.step()
                train_loss.append(loss.data.cpu())
                train_loss_sum.append(loss.data.cpu())

                # Every once in a while, check on the loss
                if ((idx + 1) % 5000) == 0:
                    print(train_log_string % (datetime.now(), epoch, idx + 1,
                                              len(train), np.mean(train_loss)))
                    train_loss = []

            # Record at every epoch
            print('Train Loss at epoch {}: {}\n'.format(
                epoch, np.mean(train_loss_sum)))
            train_loss_record.append(np.mean(train_loss_sum))

            # Valid
            print('Epoch {} Validating...'.format(epoch))
            # loss
            valid_loss = []
            # dataloader
            valid_dataloader = DataLoader(dataset=validDS,
                                          shuffle=True,
                                          num_workers=2,
                                          batch_size=1)
            for idx, data in enumerate(valid_dataloader, 0):
                # get data
                s1, s2, label = data
                # input
                output = siamese(s1, s2)
                output = output.squeeze(0)
                # label cuda
                label = Variable(label)
                if torch.cuda.is_available():
                    label = label.cuda()
                # loss
                loss = criterion(output, label)
                valid_loss.append(loss.data.cpu())

            print(valid_log_string % (datetime.now(), epoch, np.mean(valid_loss)))
            # Record
            valid_loss_record.append(np.mean(valid_loss))

            epoch += 1
            if np.mean(valid_loss) - np.mean(train_loss_sum) > 0.02:
                print("Early Stopping!")
                break

            # Keep track of best record
            if np.mean(valid_loss) < best_record:
                best_record = np.mean(valid_loss)
                # save the best model
                state_dict = {
                    'epoch': epoch,
                    'siamese': siamese.state_dict(),
                    'optimizer': optimizer.state_dict(),
                }
                torch.save(state_dict, ckpt_path)
                print('Model saved!\n')

    """ Inference """
    if config['task'] == 'inference':
        testDS = mytestDS(test_data, all_sents)
        # Do not shuffle here
        test_dataloader = DataLoader(dataset=testDS, num_workers=2, batch_size=1)

        result = []
        for idx, data in enumerate(test_dataloader, 0):
            # get data
            s1, s2 = data
            # input
            output = siamese(s1, s2)
            output = output.squeeze(0)
            # feed output into softmax to get probability prediction
            sm = nn.Softmax(dim=1)
            res = sm(output.data)[:, 1]
            result += res.data.tolist()

        result = pd.DataFrame(result)
        print(result.shape)
        print('Inference Done.')
        res_path = os.path.join(config['result']['filepath'],
                                config['result']['filename'])
        result.to_csv(res_path, header=False, index=False)
        print('Result has been written to', res_path, ', Good Luck!')