def load_data():
    '''Preprocess the datasets and split the data arrays into X's and y's.'''
    file_name1 = "yeast.dat"
    data1 = preprocessing(file_name1)
    Xdat1 = data1[:, :103]
    ydat1 = data1[:, 103:]
    n_dat1, _ = Xdat1.shape

    file_name2 = "scene.dat"
    data2 = preprocessing(file_name2)
    Xdat2 = data2[:, :294]
    ydat2 = data2[:, 294:]
    n_dat2, _ = Xdat2.shape

    # split the yeast data into training and test sets
    proportion = 0.8
    n1 = math.floor(proportion * n_dat1)
    Xtrain1, ytrain1 = Xdat1[:n1, :], ydat1[:n1, :]
    Xtest1, ytest1 = Xdat1[n1:, :], ydat1[n1:, :]

    # split the scene data into training and test sets
    n2 = math.floor(proportion * n_dat2)
    Xtrain2, ytrain2 = Xdat2[:n2, :], ydat2[:n2, :]
    Xtest2, ytest2 = Xdat2[n2:, :], ydat2[n2:, :]

    return (normalize(Xtrain1), ytrain1, normalize(Xtest1), ytest1,
            normalize(Xtrain2), ytrain2, normalize(Xtest2), ytest2)
def getInputData():
    # load from the pickle cache if it already exists
    fileName = 'emnist.p'
    if exists(fileName):
        trainData, trainLabel, one_hot_trainLabel, testData = pickle.load(
            open(fileName, mode='rb'))
        return trainData, trainLabel, one_hot_trainLabel, testData

    emnist = spio.loadmat("data/emnist-digits.mat")

    # load training dataset
    x_train = emnist["dataset"][0][0][0][0][0][0]
    x_train = x_train.astype(np.float32)
    # load training labels
    y_train = emnist["dataset"][0][0][0][0][0][1]

    # load test dataset
    x_test = emnist["dataset"][0][0][1][0][0][0]
    x_test = x_test.astype(np.float32)
    # load test labels
    y_test = emnist["dataset"][0][0][1][0][0][1]

    train_labels = y_train
    test_labels = y_test

    # reorder the flattened pixels, then flatten back to 784-length vectors
    x_train = x_train.reshape(x_train.shape[0], 1, 28, 28, order="A")
    x_test = x_test.reshape(x_test.shape[0], 1, 28, 28, order="A")
    x_train = x_train.reshape(x_train.shape[0], 28 * 28)
    x_test = x_test.reshape(x_test.shape[0], 28 * 28)

    train = pd.DataFrame(y_train, columns=["label"]).join(pd.DataFrame(x_train))
    test = pd.DataFrame(y_test, columns=["label"]).join(pd.DataFrame(x_test))

    # cast to numpy arrays
    trainData = train.values[:, 1:]
    trainLabel = train.values[:, 0]
    testData = x_test

    processedTrainData = preprocessing(trainData)
    processedTestData = preprocessing(testData)
    one_hot_trainLabel = one_hot_encoding(trainLabel, 10)

    # save the processed data to the pickle cache
    if not isfile(fileName):
        pickle.dump((processedTrainData, trainLabel, one_hot_trainLabel,
                     processedTestData), open(fileName, 'wb'))

    return processedTrainData, trainLabel, one_hot_trainLabel, processedTestData
def get_tfidf_train_test(csv_filename, threshold=4.0):
    restaurants = pd.read_csv(csv_filename, encoding='utf-8', low_memory=False)
    review_list = restaurants['text']
    cleaned_list = [x for x in review_list if str(x) != 'nan']
    cleaned_df = pd.DataFrame(cleaned_list, columns=['review'])

    # update after preprocessing
    cleaned_df = preprocessing(cleaned_df, 'review')

    # set the targets based on the star-rating threshold (default 4.0 stars)
    y = restaurants['stars'].apply(lambda u: 1 if u >= threshold else -1)

    # get the training and test data ready
    # TODO: k-fold CV for the train/test split
    y = np.array(y)
    X_train, X_test, y_train, y_test = train_test_split(
        cleaned_df['review'], y, shuffle=True, random_state=123)

    tfidf = TfidfVectorizer(strip_accents='ascii', stop_words='english')
    tfidf.fit(X_train)
    X_train = tfidf.transform(X_train).toarray()
    X_test = tfidf.transform(X_test).toarray()
    return X_train, y_train, X_test, y_test
def intent_classification(test_text):
    # load pickle file for the label encoder
    label_dict = pickle.load(open("label_encoder.pkl", "rb"))

    # clean customer input
    clean_test_text = preprocessing(test_text)

    # tokenize customer input
    tokenizer = pickle.load(open('tokenizer.pkl', 'rb'))
    test_sequences = tokenizer.texts_to_sequences([clean_test_text])
    test_input = pad_sequences(test_sequences, maxlen=MAX_SEQUENCE_LENGTH)

    # load pickle file for the model
    model = pickle.load(open('model.pkl', 'rb'))

    # use the intent classification model to classify the user input
    test_predictions_probas = model.predict([test_input])
    test_predictions = test_predictions_probas.argmax(axis=-1)

    # map the predicted class index back to its intent label
    intent = None
    for index, item in label_dict.items():
        if item == test_predictions[0]:
            intent = index
            break

    K.clear_session()
    return intent
def get_predict(model, img):
    processed_img = preprocessing(img)
    processed_img = np.expand_dims(processed_img, axis=0)
    out_put = model.predict(processed_img)
    out_put = out_put[..., -1]   # keep the last channel
    out_put = out_put[0]         # drop the batch dimension
    out_put = sklearn.preprocessing.binarize(out_put, threshold=0.5)
    out_put = out_put * 255.     # scale the binary mask to 0/255
    return out_put
def btnOnClick(self):
    self.hide()
    link = self.lineEdit.text()
    print(link)
    data = parser(link)
    print(data)
    new_df = preprocessing(data)
    salary = predict(new_df, self.randomForest)
    self.secondMenu = SecondMenu(salary)
    self.secondMenu.show()
def predict(currentClassifier, filename_predict):
    print("Making Predictions...")
    results = preprocessing(filename_predict, False)
    model_file = 'scam_' + currentClassifier + '.pkl'
    clf_predict = joblib.load(model_file)
    y_true, y_pred = results['labels'], clf_predict.predict(results['features'])
    print("Accuracy Score: ", currentClassifier, accuracy_score(y_true, y_pred))
    print(classification_report(y_true, y_pred))
    print("Completed!")
def clean_comments(data_set):
    """Run the preprocessing function on each comment in the data set."""
    cleaned_data_set = []
    for ds in data_set:
        ds.comment = preprocessing(ds.comment)
        cleaned_data_set.append(ds)
    return cleaned_data_set
def linearSVM(data):
    # get scaled training and testing sets alongside their labels
    trainSet, testSet, trainLabels, testLabels = preprocessing(data)

    # initialize a linear SVM using sklearn
    SVClassifier = SVC(kernel='linear')

    # train the model
    SVClassifier.fit(trainSet, trainLabels)

    # predict using the test set
    predictions = SVClassifier.predict(testSet)

    # compute accuracy and the confusion matrix
    total = len(testSet)
    correct = 0
    confusionMatrix = confusion_matrix(testLabels, predictions)

    # loop through the predictions and compute the total accuracy
    for i in range(len(predictions)):
        if testLabels[i] == predictions[i]:
            correct += 1

    # print total accuracy
    print(correct / total)

    # print confusion matrix
    print(confusionMatrix)

    # print the classification report provided by sklearn
    print(classification_report(testLabels, predictions))

    # compute the ROC curve from the decision-function scores
    # (the model is already fitted, so it does not need to be refit)
    fpr = dict()
    tpr = dict()
    roc_auc = dict()
    score = SVClassifier.decision_function(testSet)
    for i in range(2):
        fpr[i], tpr[i], _ = roc_curve(testLabels, score)
        roc_auc[i] = auc(fpr[i], tpr[i])

    # generate the ROC curve using matplotlib
    plt.figure()
    plt.plot(fpr[1], tpr[1])
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.05])
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('ROC Curve')
    plt.show()
def main():
    # Read the probeA csv file into a pandas dataframe
    probeAData = pd.read_csv('../probeA.csv')

    # This chunk of code calls orderData, which goes through the given headers,
    # swapping values so that the first header holds the smallest value, the
    # middle header the middle value and the last header the largest value.
    # Inspecting the data showed this was a common trend, with values
    # occasionally being out of order. After reordering, an improvement was
    # seen in the AUC score, so the step was kept.
    firstChemOrdered = orderData(probeAData, 'c1', 'c2', 'c3')
    secondChemOrdered = orderData(firstChemOrdered, 'm1', 'm2', 'm3')
    thirdChemOrdered = orderData(secondChemOrdered, 'n1', 'n2', 'n3')
    fourthChemOrdered = orderData(thirdChemOrdered, 'p1', 'p2', 'p3')
    probeAData = fourthChemOrdered.copy()

    # After reordering, the data is passed to preprocessing, which returns a
    # dataframe containing the preprocessed data. 'tna' is passed in so that it
    # is excluded from preprocessing: testing showed that preprocessing tna led
    # to a decrease in accuracy and AUC.
    probeAPreprocessed = preprocessing(probeAData, 'tna')

    # Read the classA csv file into a pandas dataframe
    probeAResults = pd.read_csv('../classA.csv')

    # The tna column (dropped in the preprocessing call) is concatenated with
    # the preprocessed data and the contents of the classA csv file (which
    # contains the classification assignments)
    probeAConcatenated = pd.concat(
        [probeAData['tna'], probeAPreprocessed, probeAResults], axis=1)
    # probeAConcatenated = pd.concat([probeAScaled, probeAResults], axis=1)

    bestModel = [0, 0, 0]
    tLabel = 'class'

    # Number of splits used on the training data: 10 folds of
    # 900 training / 100 validation samples
    splits = 10

    # Run cross-validation for depths 1 to 19
    for depth in range(1, 20):
        # cross-validation for the given depth with a fixed 200 estimators
        result = crossValidation(probeAConcatenated, depth, 200, splits, tLabel)
        if result[0] > bestModel[0]:
            bestModel = result

    model = bestModel[2]

    # Print the overall best model's AUC and the depth it used
    print("BEST MODEL PREDICTED: ")
    print(str(bestModel[0]) + "______" + str(bestModel[1]) + "______" +
          str(bestModel[3]))
    print(model.feature_importances_)
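The comments above describe orderData as reordering three columns so that, within each row, the first named column holds the smallest value and the last the largest. The actual implementation is not shown here; a minimal sketch under that assumption (the function name, signature, and body below are hypothetical) could look like this:

```python
import numpy as np
import pandas as pd

def orderData(df, col_small, col_mid, col_large):
    """Hypothetical helper: sort three columns row-wise so that
    col_small <= col_mid <= col_large holds in every row."""
    ordered = df.copy()
    # sort the three values within each row, then write them back in order
    sorted_vals = np.sort(df[[col_small, col_mid, col_large]].values, axis=1)
    ordered[col_small] = sorted_vals[:, 0]
    ordered[col_mid] = sorted_vals[:, 1]
    ordered[col_large] = sorted_vals[:, 2]
    return ordered
```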
def combineImage(randImage):
    # combine 10 images into 1 image

    # preprocessing: rescale each image from min to max
    def preprocessing(image):
        return min_max_scaler.fit_transform(image)

    randImage = restoreImg(randImage)
    randImage = [pad(preprocessing(img)) for img in randImage]

    # top row: images 0-4 side by side
    img_top = randImage[0]
    for i in range(1, 5):
        img_top = np.append(img_top, randImage[i], axis=1)

    # bottom row: images 5-9 side by side
    img_bottom = randImage[5]
    for i in range(6, 10):
        img_bottom = np.append(img_bottom, randImage[i], axis=1)

    return np.append(img_top, img_bottom, axis=0)
def testPreprocessing():
    fichier_data = "public_data/perso_train.data"
    # load the training data; each value is delimited by " "
    datas = np.loadtxt(fichier_data, delimiter=" ")

    prepro = preprocessing()
    Tab = prepro.fit(datas)              # fit the preprocessing on the data
    transf_Tab = prepro.transform(Tab)   # apply the transformation

    # check that the preprocessing acted on the data,
    # i.e. whether any entries are flagged as censored
    censured_data = 0
    for i in range(len(transf_Tab)):
        if transf_Tab[i] == 1:
            print(i, " is censored")
            censured_data += 1

    if censured_data != 0:
        return True
    return False
def main():
    X, y = load_data("spambase.data")
    X = preprocessing(X)
    k = 10

    # choose the model to evaluate (the neural network is used here;
    # swap in svm_model() to evaluate the SVM instead)
    # model = svm_model()
    model = NN_model()

    from sklearn.model_selection import cross_validate
    from sklearn.metrics import make_scorer, accuracy_score
    from sklearn.metrics import confusion_matrix

    # confusion-matrix entries as individual scorers
    def tn(y_true, y_pred):
        return confusion_matrix(y_true, y_pred)[0, 0]

    def fp(y_true, y_pred):
        return confusion_matrix(y_true, y_pred)[0, 1]

    def fn(y_true, y_pred):
        return confusion_matrix(y_true, y_pred)[1, 0]

    def tp(y_true, y_pred):
        return confusion_matrix(y_true, y_pred)[1, 1]

    scoring = {
        'Accuracy': 'accuracy',
        'Precision': 'precision',
        'Recall': 'recall',
        'tp': make_scorer(tp),
        'tn': make_scorer(tn),
        'fp': make_scorer(fp),
        'fn': make_scorer(fn)
    }

    cv_results = cross_validate(model, X, y, scoring=scoring, cv=k,
                                n_jobs=-1, return_train_score=False)
    table(cv_results, k, 5)
def gradDscent(xArr, yArr):
    if scale == True:
        xArr = preprocessing(xArr)
    xMat = np.mat(xArr)
    yMat = np.mat(yArr)
    lr = 0.03
    epochs = 50000
    costList = []

    # the number of columns determines the number of weights
    m, n = np.shape(xMat)
    # initialize the weights
    ws = np.mat(np.ones((n, 1)))

    for i in range(epochs + 1):
        # multiply xMat by the weights and apply the sigmoid
        h = sigmoid(xMat * ws)
        # compute the gradient of the error
        ws_grad = xMat.T * (h - yMat) / m
        ws = ws - lr * ws_grad

        if i % 50 == 0:
            costList.append(cost(xMat, yMat, ws))
    return ws, costList
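gradDscent relies on sigmoid and cost helpers that are not shown in this snippet. A minimal sketch, assuming the usual logistic-regression definitions (element-wise logistic function and average cross-entropy cost over the m samples), might be:

```python
import numpy as np

def sigmoid(z):
    # logistic function applied element-wise
    return 1.0 / (1.0 + np.exp(-z))

def cost(xMat, yMat, ws):
    # average cross-entropy cost for logistic regression
    h = sigmoid(xMat * ws)
    left = np.multiply(yMat, np.log(h))
    right = np.multiply(1 - yMat, np.log(1 - h))
    return -np.sum(left + right) / len(xMat)
```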
def predict_note_authentication(sentence):
    w1 = preprocessing(sentence)
    list1 = [w1]

    # rebuild the TF-IDF representation using the saved vocabulary
    transformer = TfidfTransformer()
    loaded_vec = CountVectorizer(decode_error="replace",
                                 vocabulary=pickle.load(open("feature.pkl", "rb")))
    tfidf = transformer.fit_transform(loaded_vec.fit_transform(list1)).toarray()

    prediction = classifier.predict(tfidf)
    if prediction == 0:
        return "not a Greeting"
    elif prediction == 1:
        return "a Greeting"
def logisticRegression(data):
    # get training and testing sets alongside their labels
    trainSet, testSet, trainLabels, testLabels = preprocessing(data)

    # initialize logistic regression using sklearn
    logisticRegr = LogisticRegression()

    # train the model
    logisticRegr.fit(trainSet, trainLabels)

    # predict using the test set
    predictions = logisticRegr.predict(testSet)

    # print the classification report provided by sklearn
    print(classification_report(testLabels, predictions))

    # compute accuracy
    score = logisticRegr.score(testSet, testLabels)
    print(score)

    # compute the confusion matrix
    confusionMatrix = confusion_matrix(testLabels, predictions)
    print(confusionMatrix)
'''
Reading the data
'''
print("----- Reading data ...")
# the .csv files were prepared beforehand, replacing ",," and "Not known"
# with NaN (missing values)
data_x = pd.read_csv('../data/nepal_earthquake_tra.csv')
data_y = pd.read_csv('../data/nepal_earthquake_labels.csv')
data_x_tst = pd.read_csv('../data/nepal_earthquake_tst.csv')
df_submission = pd.read_csv('../data/nepal_earthquake_submission_format.csv')

# drop the columns that are not used
data_x.drop(labels=['building_id'], axis=1, inplace=True)
data_x_tst.drop(labels=['building_id'], axis=1, inplace=True)
data_y.drop(labels=['building_id'], axis=1, inplace=True)

X, X_tst = preprocessing(data_x, data_x_tst)
y = np.ravel(data_y.values)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123456)

print("------ Stacking...")
estimators = [('lgbm', lgb.LGBMClassifier(objective='regression_l1', n_jobs=-1,
                                           n_estimators=1000, num_leaves=80,
                                           scale_pos_weight=0.05, verbose=2)),
              ('rf', RandomForestClassifier(random_state=123456,
lr_scores = cross_val_score(linear_model.LogisticRegression(), X, y,
                            scoring='accuracy', cv=10)
rf_scores = cross_val_score(RandomForestClassifier(), X.toarray(), y,
                            scoring='accuracy', cv=10)
logging.info("CV: Accuracy of Logistic Regression is %.4f\n, and Accuracy of "
             "Random Forest is %.4f\n." % (lr_scores.mean(), rf_scores.mean()))


if __name__ == '__main__':
    # arguments: set ndays and sector
    ndays = 1
    sector = 'Financials'
    filenameX = '%s/stock_%s_X.txt' % (DATA_DIR, sector)
    filenameY = '%s/stock_%s_Y_%sdays.txt' % (DATA_DIR, sector, ndays)
    X = openfiles(filenameX, 100)
    y = openfiles(filenameY, 1)

    # arg = 1: SNP500
    X, y = preprocessing(X, y, arg=1)
    ids = X['id']
    numX = X.ix[:, 1:6].copy()
    numX.index = ids

    docs = tokenizing(list(X['text']), mode='tf')  # term-document matrix
    logging.info(docs.shape)

    # docX = pd.DataFrame(docs, index=ids).to_sparse().sort_index()
    docX = pd.SparseDataFrame([pd.SparseSeries(docs[i].toarray().ravel())
                               for i in np.arange(docs.shape[0])],
                              index=ids).sort_index()

    X = concat([numX.sort_index(), docX], axis=1)
    # print(X[0])
    # raise
    # X = numX.sort_index()
    y = y.sort_index()
# plt.scatter(X[:, 0], X[:, 1], c=y, s=50, cmap='autumn')
# plot_svc_decision_function(model)

# check that the solver reaches enough numerical precision
from sklearn.metrics import zero_one_loss

X, y = preprocessing(train_data, 4.0)
E_in = []
for c in C:
    clf = svm.SVC(C=c, kernel='poly', gamma=1, degree=2)
    clf.fit(X, y)
    s = zero_one_loss(y, clf.predict(X), normalize=False)
    print(s)
    E_in.append(s)

plt.plot(np.log10(C), E_in)
plt.scatter(np.log10(C), E_in)
plt.show()

# Try reducing C (e.g. C = 0.001, 0.01, 0.1). C is the penalty parameter:
# as C gets bigger, the model tries harder to reduce the penalty,
# and so takes more time to train.
'''
Reading the data
'''
print("----- Reading data ...")
# the .csv files were prepared beforehand, replacing ",," and "Not known"
# with NaN (missing values)
data_x = pd.read_csv('../data/nepal_earthquake_tra.csv')
data_y = pd.read_csv('../data/nepal_earthquake_labels.csv')
data_x_tst = pd.read_csv('../data/nepal_earthquake_tst.csv')
df_submission = pd.read_csv('../data/nepal_earthquake_submission_format.csv')

# drop the columns that are not used
data_x.drop(labels=['building_id'], axis=1, inplace=True)
data_x_tst.drop(labels=['building_id'], axis=1, inplace=True)
data_y.drop(labels=['building_id'], axis=1, inplace=True)

X, X_tst, selec = preprocessing(data_x, data_x_tst)
y = np.ravel(data_y.values)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123456)

print("------ XGB...")
xgboost = xgb.XGBClassifier(predictor='cpu_predictor', n_gpus=0,
                            n_estimators=700, eta=0.1, max_depth=10, verbose=2)
xgboost, y_test_lgbm = validacion_cruzada(xgboost, X, y, skf)

# Retrain on the full data set.
# The score reported here is on training data, so it will be better than on test.
clf = xgboost
clf = clf.fit(X, y)
plotImp(clf, selec, X.shape[1])
y_pred_tra = clf.predict(X)
def main():
    # Filename in which the model's weights/parameters are stored once computed
    filename = "predictTNAModel.sav"
    model = None
    try:
        # If the model has been precomputed this will not throw an exception
        model = pickle.load(open(filename, 'rb'))
    except Exception as e:
        # If the model has not been precomputed, the file will not exist and
        # cannot be opened, so an exception lands here. The model is then
        # calculated and stored under the given filename, meaning it need not
        # be computed again in the future.
        tLabel = 'tna'

        # Read the probeA csv file into a pandas dataframe
        probeAData = pd.read_csv('../probeA.csv')

        # This chunk of code calls orderData, which goes through the given
        # headers, swapping values so that the first header holds the smallest
        # value, the middle header the middle value and the last header the
        # largest value. Inspecting the data showed this was a common trend,
        # with values occasionally being out of order. After reordering, an
        # improvement was seen in the R2 score, so the step was kept.
        firstChemOrdered = orderData(probeAData, 'c1', 'c2', 'c3')
        secondChemOrdered = orderData(firstChemOrdered, 'm1', 'm2', 'm3')
        thirdChemOrdered = orderData(secondChemOrdered, 'n1', 'n2', 'n3')
        fourthChemOrdered = orderData(thirdChemOrdered, 'p1', 'p2', 'p3')
        probeAData = fourthChemOrdered.copy()

        # After reordering, the data is passed to preprocessing, which returns
        # a dataframe containing the preprocessed data. 'tna' is passed in so
        # that it is excluded from preprocessing, as testing showed that
        # preprocessing tna led to a decrease in R2.
        probeAScaled = preprocessing(probeAData, tLabel)

        # 'probeAConcatenated' contains the tna column and the preprocessed
        # data, and is used for training
        probeAConcatenated = pd.concat([probeAData['tna'], probeAScaled], axis=1)
        probeATraining = probeAScaled.copy()

        # Number of splits/folds to use
        splits = 10

        # An array of 800 alpha values between 0.00000001 and 2,
        # spaced non-linearly
        alphas = np.geomspace(0.00000001, 2, 800)

        # The best model is retrieved by running crossValidation on the data,
        # with the defined number of splits, the given target label and the
        # defined set of alphas
        bestModelResult = crossValidation(probeAConcatenated, splits, tLabel, alphas)
        model = bestModelResult[0]
        result = bestModelResult[1]
        bestAlpha = bestModelResult[2]
        print("BEST MODEL - " + str(result) + "__" + str(bestAlpha))

        # Can use tna in the prediction of t1, but need to check the effect of
        # this. On average the prediction is 0.14 out, so a Gaussian
        # distribution could model noise of 0.14, applied to the task 1 tna
        # column to check the results.
        pickle.dump(model, open(filename, 'wb'))

    # Now that the model has either been loaded or computed, load the data
    # needed for predictions. The probeB csv file is read into a pandas dataframe
    probeBData = pd.read_csv('../probeB.csv')

    # Shuffle the data in the columns to ensure the values appear in the order
    # col1val <= col2val <= col3val
    BfirstChemOrdered = orderData(probeBData, 'c1', 'c2', 'c3')
    BsecondChemOrdered = orderData(BfirstChemOrdered, 'm1', 'm2', 'm3')
    BthirdChemOrdered = orderData(BsecondChemOrdered, 'n1', 'n2', 'n3')
    BfourthChemOrdered = orderData(BthirdChemOrdered, 'p1', 'p2', 'p3')
    probeBData = BfourthChemOrdered.copy()

    # Perform preprocessing on probeBData; 'none' is passed as no column needs
    # to be dropped (it doesn't contain a tna column)
    probeBScaled = preprocessing(probeBData, 'none')

    # The model predicts values for the given data
    predictions = model.predict(probeBScaled)

    # The predictions are written to a tnaB csv file with a "tna" header
    dataFramePredictions = pd.DataFrame(predictions, columns=["tna"])
    dataFramePredictions.to_csv('tnaB.csv', index=False)
    output: scaled numpy array
    '''
    minV = 0
    maxV = 255
    data = (data - minV) / (maxV - minV)
    return data


def one_hot_encoding(data, numberOfClass):
    from sklearn import preprocessing
    lb = preprocessing.LabelBinarizer()
    lb.fit(range(numberOfClass))
    return lb.transform(data)


processedTrainData = preprocessing(trainData)
processedTestData = preprocessing(testData)
one_hot_trainLabel = one_hot_encoding(trainLabel, 10)

# save in pickle
fileName = 'mnist.p'
if not isfile(fileName):
    pickle.dump((processedTrainData, trainLabel, one_hot_trainLabel,
                 processedTestData), open(fileName, 'wb'))

# load pickle file
fileName = 'mnist.p'


def getInputTensor(features, numberOfClass):
    '''
    Create tf.placeholder for input & label
        img = make_img(drawing[draw_num])
        img = np.array(img.resize((32, 32))).reshape(32, 32, 1)
        X.append(img)
        Y.append(Y_num)
    Y_num += 1

    tmpx = np.array(X)
    Y = np.array([[i] for i in Y])
    enc = OneHotEncoder(categories='auto')
    enc.fit(Y)
    tmpy = enc.transform(Y).toarray()
    return tmpx, tmpy, class_label


X_train, Y_train, class_label = preprocessing(filenames)
print('\n', X_train.shape, Y_train.shape, '\n', class_label)

# df.head()
# print(drawing[0])
# img = make_img(drawing[1])
# plt.imshow(img)

import tensorflow.compat.v1 as tf
tf.disable_eager_execution()  # behave like TensorFlow v1

learning_rate = 0.001
training_epochs = 30
batch_size = 100

X = tf.placeholder(tf.float32, [None, 32, 32, 1], name='input')
Y = tf.placeholder(tf.float32, [None, 340], name='output')
def main():
    import matplotlib.pyplot as plt

    splitVal = 0.99
    TicksIntoFuture = 1
    TicksIntoPast = 54  # 8 days => 8[day]*24[std/day] = 192[std]
    # the present is not included in this value, hence TicksIntoPast can be 0
    # and the batch size is TicksIntoPast+1
    batch_sizes = 128
    epochs = 45

    print("Data will be shaped according to tensorflow's [batch, time, features] ... windows")

    featureListRaw = ["Date", "Open", "High", "Low", "Close"]
    # labelListTimeShifted = ["High", "Low", "Close"]
    labelListTimeShifted = ["Open", "High", "Low", "Close"]
    featListUnScale = ["Date"]
    featListScale = ["Open", "High", "Low", "Close"]
    featureList = ["DaySin", "DayCos", "Open", "High", "Low", "Close"]
    # featList.append("Day")
    # featList.append("Hour")

    pp = preprocessing(ticksIntoPast=TicksIntoPast, ticksIntoFuture=TicksIntoFuture)
    data = pp.pullData('BTC-h.csv', 0, featureListRaw, 0)
    # data = pp.pullData('dbg.csv', 0, featureListRaw, 0)

    print("\n\nShrink Data \n==============")
    UnscaleData = data[featListUnScale]
    data = data[featListScale]
    print(data)
    data = pp.scaleData(data, 'standardize')

    print("\n\nTODO: ADD additional features that are not scaled \n==============")
    data[featListUnScale] = UnscaleData
    print(data)

    print("\n\nAdding Date\n==============")
    databuffer = data["Date"]
    # s[0] = '2017-07-29 03-PM'
    # s[1] = '2017-07-30 01-PM'
    try:
        databuffer = pd.to_datetime(databuffer.values).to_series()
    except:
        try:
            databuffer = pd.to_datetime(databuffer.values,
                                        format='%Y-%m-%d %I-%p').to_series()
        except:
            print("timestamp not known")
    data["Day"] = databuffer.dt.dayofweek.values
    data["Hour"] = databuffer.dt.hour.values
    print("==============")

    # timestamp_s = date_time.map(datetime.datetime.timestamp)
    print("timestamp_s defines how many ticks per day\nBTC-h.csv has 1h tick\n==============")
    # timestamp_s = np.linspace(0, 1, num=24)
    # date_time = pd.to_datetime(df.pop('Date Time'), format='%d.%m.%Y %H:%M:%S')
    databuffer = data["Date"].values
    try:
        databuffer = pd.to_datetime(databuffer)
    except:
        try:
            databuffer = pd.to_datetime(databuffer, format='%Y-%m-%d %I-%p')
        except:
            print("timestamp not known")
            databuffer = pd.to_datetime(databuffer, format='%d.%m.%Y %H:%M:%S')

    print("\n\nDataBuffer\n==============")
    print(databuffer)
    timestamp_s = databuffer.map(datetime.datetime.timestamp)
    print(timestamp_s)

    # encode the time of day / week as sine-cosine pairs
    day = 24 * 60 * 60
    week = 7 * day
    year = (365.2425) * day
    data['DaySin'] = np.sin(timestamp_s * (2 * np.pi / day))
    data['DayCos'] = np.cos(timestamp_s * (2 * np.pi / day))
    data['WeekSin'] = np.sin(timestamp_s * (2 * np.pi / week))
    data['WeekCos'] = np.cos(timestamp_s * (2 * np.pi / week))
    print("==============")
    print(data['WeekSin'][0:25])
    # print(data['Date'][0:25])
    print("==============\n")
    # print(data)

    [data, y] = pp.genForcastY(data, LabelList=labelListTimeShifted,
                               featureList=featureList,
                               includeAllFuturDays=False)
    [data, y] = pp.genTimeSeries(data, y)
    print(y)

    print("\n\nAmount of outputs \n==============")
    N_outPutFeatures = y.shape[1]
    y = np.expand_dims(y, -1)
    print(y.shape)

    # split into training and evaluation sets
    dataSize = int(data.shape[0] * splitVal)
    x1_train = data[:dataSize]
    x1_eval = data[dataSize:]
    y1_train = y[:dataSize]
    y1_eval = y[dataSize:]
    print(data.shape)
    print(y.shape)
    print(x1_train.shape)
    print(y1_train.shape)
    print(x1_eval.shape)
    print(y1_eval.shape)

    print("\n\nHook up Models \n==============")
    m1 = model1(x1_train.shape, y1_train.shape[1])
    # m1 = m.call(x1_train, y1_train)  # batch_sizes, epochs
    m1.compile(optimizer='adam', loss='mse')
    m1.fit(x=x1_train, y=y1_train, batch_size=batch_sizes, epochs=epochs,
           shuffle=True, validation_split=0.1)

    scores = m1.evaluate(x1_eval, y1_eval)
    print(scores)

    print("\n\nPlot it\n==============")
    fig, axs = plt.subplots(N_outPutFeatures, 3, figsize=(25, 14))
    y1_predict = m1.predict(x1_eval)
    y1_predict2 = m1.predict(x1_train)
    plotX = np.linspace(0, 10, y1_eval.shape[0])
    plotX2 = np.linspace(0, 10, y1_train.shape[0])

    print("\n\ndebug \n==============")
    print(y1_eval.shape)
    print(y1_predict.shape)
    print(y1_train.shape)
    print(y1_predict2.shape)

    # mean absolute error per output feature
    Variance = np.array([[]])
    for n in range(N_outPutFeatures):
        yDiv = y1_eval[:, n, 0] - y1_predict[:, n]
        yDiv = np.abs(yDiv)
        # ydum = np.where(y1_eval[:, n, 0] == 0, 10000, y1_eval[:, n, 0])
        # ydum = y1_eval[:, n, 0] + 1
        # ydum2 = yDiv + 1
        # ERROR_ = (100 / ydum) * ydum2
        ERROR_ = yDiv
        yDiv = np.sum(yDiv) / len(y1_predict[:, n])
        Variance = np.append(Variance, np.array([yDiv, yDiv]))

        axs[n, 0].plot(plotX, y1_eval[:, n], 'k')
        axs[n, 0].plot(plotX, y1_predict[:, n], 'g')
        axs[n, 1].plot(plotX, ERROR_, 'r')
        axs[n, 2].plot(plotX2, y1_train[:, n], 'k')
        axs[n, 2].plot(plotX2, y1_predict2[:, n], 'g')
    plt.show()

    fig2 = go.Figure(data=[go.Candlestick(x=plotX,
                                          open=y1_eval[:, 0],
                                          high=y1_eval[:, 1],
                                          low=y1_eval[:, 2],
                                          close=y1_eval[:, 3])])
    fig2.show()

    fig3 = go.Figure(data=[go.Candlestick(x=plotX,
                                          open=y1_predict[:, 0],
                                          high=y1_predict[:, 1],
                                          low=y1_predict[:, 2],
                                          close=y1_predict[:, 3])])
    fig3.show()

    print("\n\ndebug \n==============")
    print("Variance list {}".format(Variance))

'''
'''
Reading the data
'''
print("----- Reading data ...")
# the .csv files were prepared beforehand, replacing ",," and "Not known"
# with NaN (missing values)
data_x = pd.read_csv('../data/nepal_earthquake_tra.csv')
data_y = pd.read_csv('../data/nepal_earthquake_labels.csv')
data_x_tst = pd.read_csv('../data/nepal_earthquake_tst.csv')
df_submission = pd.read_csv('../data/nepal_earthquake_submission_format.csv')

# drop the columns that are not used
data_x.drop(labels=['building_id'], axis=1, inplace=True)
data_x_tst.drop(labels=['building_id'], axis=1, inplace=True)
data_y.drop(labels=['building_id'], axis=1, inplace=True)

X, X_tst, y = preprocessing(data_x, data_x_tst, data_y)
# y = np.ravel(data_y.values)

skf = StratifiedKFold(n_splits=5, shuffle=True, random_state=123456)

print("------ LightGBM...")
lgbm = lgb.LGBMClassifier(objective='regression_l1', n_estimators=200,
                          n_jobs=2, num_leaves=45, scale_pos_weight=0.1)
lgbm, y_test_lgbm = validacion_cruzada(lgbm, X, y, skf)

# Retrain on the full data set.
# The score reported here is on training data, so it will be better than on test.
clf = lgbm
def allinone():
    train_data = valuetoint(load_df('http://test.blueathiean.me/train.csv'))
    test_data = valuetoint(load_df('http://test.blueathiean.me/test.csv'))

    train_data['transactionRevenue'].fillna(value='0', inplace=True)
    train_data['transactionRevenue'] = train_data['transactionRevenue'].astype(int)

    train_data, test_data = preprocessing(train_data, test_data)
    train_data = encoder(train_data)
    test_data = encoder(test_data)
    train_data, test_data = one_hot_encode(train_data, test_data)
    train_data = datetimeconvert(train_data)
    test_data = datetimeconvert(test_data)

    train_staging, test_staging = delcols(train_data, test_data)
    train_staging, test_staging = fillnans(train_staging, test_staging)
    train_staging, test_staging = train_staging.align(test_staging,
                                                      join='inner', axis=1)
    train_staging['transactionRevenue'] = train_data['transactionRevenue']
    test_staging['fullVisitorId'] = test_data['fullVisitorId']
    train_staging['fullVisitorId'] = train_data['fullVisitorId']

    # aggregate per visitor
    train_agg = train_staging \
        .groupby(['fullVisitorId']) \
        .agg(['count', 'mean', 'min', 'max', 'sum']) \
        .reset_index()
    test_agg = test_staging \
        .groupby(['fullVisitorId']) \
        .agg(['count', 'mean', 'min', 'max', 'sum']) \
        .reset_index()

    # flatten the MultiIndex column names
    columns_train = ['fullVisitorId']
    for var in train_agg.columns.levels[0]:
        if var != 'fullVisitorId':
            for stat in train_agg.columns.levels[1][:-1]:
                columns_train.append('%s_%s' % (var, stat))
    train_agg.columns = columns_train

    columns_test = ['fullVisitorId']
    for var in test_agg.columns.levels[0]:
        if var != 'fullVisitorId':
            for stat in test_agg.columns.levels[1][:-1]:
                columns_test.append('%s_%s' % (var, stat))
    test_agg.columns = columns_test

    del train_staging
    del train_data
    del test_staging
    del test_data

    train_agg['TARGET'] = train_agg['transactionRevenue_sum'].apply(create_target)
    train_agg = train_agg.drop([
        'transactionRevenue_count', 'transactionRevenue_mean',
        'transactionRevenue_min', 'transactionRevenue_max',
        'transactionRevenue_sum'
    ], axis=1)

    # CORRELATION CHECK:
    # if visitors click around the site more, it's more likely the visit ends
    # in a transaction
    train_agg_corr = train_agg.corr()
    print(train_agg_corr['TARGET'].sort_values(ascending=False))

    train_agg.to_csv('application.csv')
    test_agg.to_csv('test_agg.csv')
def preprocessing(data):
    # scale pixel values from [0, 255] to [0, 1]
    minV = 0
    maxV = 255
    data = (data - minV) / (maxV - minV)
    return data


def one_hot_encoding(data, numberOfClass):
    from sklearn import preprocessing
    lb = preprocessing.LabelBinarizer()
    lb.fit(range(numberOfClass))
    return lb.transform(data)


processedTrainData = preprocessing(trainData)
processedTestData = preprocessing(testData)
one_hot_trainLabel = one_hot_encoding(trainLabel, 10)

# cache the processed data in a pickle file
fileName = 'mnist.p'
if not isfile(fileName):
    pickle.dump((processedTrainData, trainLabel, one_hot_trainLabel,
                 processedTestData), open(fileName, 'wb'))

trainData, trainLabel, one_hot_trainLabel, testData = pickle.load(
    open(fileName, mode='rb'))
    # remove tokens shorter than 3 characters
    tokens = [word for word in tokens if len(word) >= 3]
    pre_proc_text = " ".join(tokens)
    return pre_proc_text


lines = []
fin = open(data_path + "Smart_Bomb_with_Language_parser.txt", "rb")
# fin = open(data_path + "alice_in_wonderland.txt", "rb")
for line in fin:
    line = line.strip().decode("ascii", "ignore").encode("utf-8")
    if len(line) == 0:
        continue
    lines.append(preprocessing(line))
fin.close()

import collections
counter = collections.Counter()

for line in lines:
    for word in nltk.word_tokenize(line):
        counter[word.lower()] += 1

word2idx = {w: (i + 1) for i, (w, _) in enumerate(counter.most_common())}
idx2word = {v: k for k, v in word2idx.items()}

# Window size to control the nearest words vicinity
output.write("RF, N-Gram Vectors: ") output.write(str(accuracy13)) output.write("\n") output.write("XGB, N-Gram Vectors: ") output.write(str(accuracy14)) output.close() #---------------------------------------------------------------------------------------------------------------------------------------------------------------------------------- if __name__ == "__main__": corpusMake(1) #corpus = loadData() data, target_data = loadData() data = preprocessing(data) #trainDF is the issue here, gotta figure out why we are getting out of index trainDF = dataset_prep() trainDF = nlp_features(trainDF) train_x, valid_x, train_y, valid_y = split_section(trainDF, 0) accuracy, accuracy1, accuracy2, accuracy3, accuracy4 = count_vector( train_x, valid_x, train_y, valid_y) accuracy5, accuracy6, accuracy7, accuracy8, accuracy9, accuracy10, accuracy11, accuracy12, accuracy13, accuracy14 = tf_idf_vectors( train_x, valid_x, train_y, valid_y) output_file('test_results/10author_result1.txt', accuracy, accuracy1, accuracy2, accuracy3, accuracy4, accuracy5, accuracy6, accuracy7, accuracy8, accuracy9, accuracy10, accuracy11, accuracy12, accuracy13, accuracy14)
    # Create a new column with the mean sqft_above for each (zipcode, grade) group
    df['mean_sqft_sqft_above'] = df.groupby(
        by=['zipcode', 'grade'])['sqft_above'].transform('mean')

    # Create a new column with the mean grade for each zipcode group
    df['mean_grade'] = df.groupby(by=['zipcode'])['grade'].transform('mean')

    df_updated = df
    return df_updated


# In[ ]:

# Preprocess the train dataset
df_train = preprocessing('train.csv')

# Log-transform the skewed columns (to meet the assumptions of inferential statistics)
trans_columns = ['sqft_living', 'sqft_living15', 'sqft_lot', 'sqft_lot15']
df_train = log_transform(df_train, trans_columns + ['price'])

# Generate additional features for the train data
df_train = feature_generator(df_train, 47.36217, -122.20069)

# Additional preprocessing step for the train data:
# delete outliers in the number of bedrooms
df_train = df_train[df_train['bedrooms'] < 11]


# In[ ]:

df_train.head()
def preprocessing(notes):
    # encode note names as integers, then one-hot encode them
    namecat = sklearn.preprocessing.LabelEncoder().fit_transform(
        [n["name"] for n in notes])
    onehotencoded = sklearn.preprocessing.OneHotEncoder().fit_transform(
        [[n] for n in namecat]).todense()

    # append the numerator and denominator of each note length as extra features
    numerator = [n["length"].numerator for n in notes]
    denominator = [n["length"].denominator for n in notes]
    return numpy.append(numpy.append(onehotencoded,
                                     [[n] for n in numerator], axis=1),
                        [[d] for d in denominator], axis=1)


data = preprocessing(notes).getA()
scaler = sklearn.preprocessing.MinMaxScaler()
scaled = scaler.fit_transform(data)
print(scaled.shape)

# build overlapping sequences of 20 notes as inputs, with the following note as the output
sequenceLength = 20
n_features = len(data[0])
input = numpy.zeros(
    (len(scaled) - sequenceLength, sequenceLength, len(scaled[0])))
output = numpy.zeros((len(scaled) - sequenceLength, len(scaled[0])))
for i in range(0, len(scaled) - sequenceLength):
    for j in range(sequenceLength):
        input[i, j] = scaled[i + j]
    output[i] = scaled[i + sequenceLength]
print(input.shape)
scaler = MinMaxScaler()
gafeat = GenAlFeaturesSelector(n_pop=n_population, max_gen=max_generation,
                               desired_fit=desired_fitness,
                               scaler=scaler, clf=clf)

# for each time window, perform the analysis
for t_window in range(1, 5):
    time_window = t_window
    tmp_results = list()
    bs = BakSys(threeclass=False, seconds=time_window)

    subj1 = preprocessing(subj1_raw, 'subject 1 data', 256, bs, n_class=2,
                          time_window=time_window)
    subj2 = preprocessing(subj2_raw, 'subject 2 data', 256, bs, n_class=2,
                          time_window=time_window)
    subj3 = preprocessing(subj3_raw, 'subject 3 data', 256, bs,
                          time_window=time_window)
    subj4 = preprocessing(subj4_raw, 'subject 4 data', 256, bs,
                          time_window=time_window)

    # stack all subjects into one "overall" dataset
    overall = (np.vstack([subj1[0], subj2[0], subj3[0], subj4[0]]),
               np.hstack([subj1[1], subj2[1], subj3[1], subj4[1]]),
               subj1[2], 'overall data')

    for n in [subj1, subj2, subj3, subj4, overall]:
        # for n in [subj1, subj2]:
        tmp = subroutine(n, time_window, gafeat)
        tmp_results.append(tmp)
    return ficm


# concatenate the feature matrices and run classification on each combination
def concat(f1, f2, f3, f4, f5):
    f12 = np.concatenate((f1, f2), axis=1)
    f123 = np.concatenate((f1, f2, f3), axis=1)
    f1234 = np.concatenate((f1, f2, f3, f4), axis=1)
    f12345 = np.concatenate((f1, f2, f3, f4, f5), axis=1)
    print(f12.shape)
    print(f123.shape)
    print(f1234.shape)
    print(f12345.shape)
    classification(f12)
    classification(f123)
    classification(f1234)
    classification(f12345)


file_names = [
    "amazon_cells_labelled.txt", "imdb_labelled.txt", "yelp_labelled.txt"
]
for fname in file_names:
    sentences, labels = preprocessing(fname)
    f1 = onegrams(sentences, labels)
    f2 = bigrams(sentences, labels)
    f3 = trigrams(sentences, labels)
    f4 = fgrams(sentences, labels)
    f5 = fivegrams(sentences, labels)
    concat(f1, f2, f3, f4, f5)