def main(num_clusters=6, dfFileName="../data/EFDataFrame_sample=0.01.pk"):
    logger.info('Start')
    currentFileDir = os.path.abspath(os.path.dirname(__file__))
    dfFilePath = os.path.join(currentFileDir, dfFileName)
    label = 'group'

    # Load the data stored at the given file path
    logger.info('load data')
    efdata = loadData(dfFilePath)
    logger.info('Number of elements %d', efdata.shape[0])

    # Extract features
    logger.info('extract features')
    (similarity_matrix, tfidf_matrix) = get_similarity_matrix(efdata['text'])

    # ______________________________KMEANS_________________________________
    # Perform K-means clustering
    logger.info('------ K-means : %d--------', num_clusters)
    titles = efdata[label]
    km_clusters = get_cluster_kmeans(tfidf_matrix, num_clusters, titles)

    # Dimensionality reduction for plotting
    logger.info('------ Dimensions reduction --------')
    x_pos, y_pos = pca_reduction(similarity_matrix, 10)

    # Scatter plot of the K-means clusters in the reduced (PCA) space
    logger.info('plot')
    figName = '../figure/clustering_experiment_%s_isSample=True.pdf' % (label)
    figFilePath = os.path.join(currentFileDir, figName)
    scatter_clusters(x_pos, y_pos, km_clusters, titles, figFilePath)
    logger.info('End')
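# Hedged sketch (an assumption, not the project's actual helper): one way
# get_similarity_matrix() used above could be implemented with scikit-learn,
# returning a cosine-similarity matrix together with the TF-IDF matrix it is
# built from. The function name carries a "_sketch" suffix to mark it as
# illustrative.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity


def get_similarity_matrix_sketch(texts):
    # texts: iterable of raw documents, e.g. efdata['text']
    vectorizer = TfidfVectorizer(stop_words='english')
    tfidf_matrix = vectorizer.fit_transform(texts)
    similarity_matrix = cosine_similarity(tfidf_matrix)
    return similarity_matrix, tfidf_matrix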
stocks = ['FB', 'TSLA', 'BAC']

# To download new data:
# data = yp.securityData(stocks, end='2010-01-01', save=True, epoch=False)

"""
2) Next we want to turn that data into our desired format.
There are many different ways to do this, to each their own.
Here we will be using basic Python data structures (pandas is best practice
and runs faster, but can be kind of confusing if you are not in the
dataframe headspace).
The end goal is chunks of 10-day increments of closing stock price, used to
predict whether tomorrow will go up or down.
"""

# To read in existing data - can also use pd.read_csv() for each individual csv.
# data = yp.readExisting(stocks, end='2011-01-01')
data = bk.loadData(stocks)


def visualize_data(data):
    # data is a dictionary keyed by ticker, e.g. ['BAC', 'FB', 'TSLA']
    for stock in data.keys():
        # each value is a list of daily rows; each row is a list of fields
        print(data[stock][0])
        close = data[stock][0][4]  # closing price is the 5th field
        print(stock, "close:", close)

# visualize_data(data)
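# Hedged sketch (an assumption, not the project's code): one way to build the
# 10-day windows of closing prices described above, labelling each window 1 if
# the following day's close is higher than the window's last close, else 0.
# make_windows is a hypothetical helper name.
def make_windows(rows, window=10):
    # rows: list of daily records where index 4 is the closing price
    closes = [row[4] for row in rows]
    X, y = [], []
    for start in range(len(closes) - window):
        chunk = closes[start:start + window]
        X.append(chunk)
        y.append(1 if closes[start + window] > chunk[-1] else 0)
    return X, y

# Example: windows, labels = make_windows(data['BAC'])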
def crossValidationExperimentWithDF(classLabel, isSample, sampleSize, cvType,
                                    cvFold, classifierName, featureVecName,
                                    dfFileName="../data/EFDataFrame.pk"):
    """
    Perform cross-validation for a classification experiment, given a
    dataframe representing the dataset to classify.
    Two options are provided: cross-validation to find the right features,
    and cross-validation to find the right classifier.

    Parameters :
    classLabel : string
        The class label, group_id or level_id
    isSample : boolean
        Indicates whether a subsample of the dataset should be used for the
        classification
    sampleSize : float
        Size of the sample dataset, if it applies
    cvType : string
        Cross-validation can be performed either for feature-selection
        development ('on-feature') or for classification-algorithm
        development ('on-classifier').
    cvFold : int
        Number of folds for cross-validation
    classifierName : string
        Name of the classifier to use (naive bayes, logistic regression,
        knn, ...). If performing 'on-feature' cross-validation, one needs to
        provide a classifier to cross-validate with.
    featureVecName : string
        The type of feature vector to use: tfidfVect, countVec, or
        customedfeatureVec. If performing 'on-classifier' cross-validation,
        one needs to provide a type of feature vector to cross-validate with.
    dfFileName : string
        Relative path to the stored dataframe representing the whole dataset.

    Returns : None
    """
    figName = '../figure/%d-FoldcvExperiment_%s_%s_isSample=%r.pdf' % (
        cvFold, cvType, classLabel, isSample)
    figFilePath = os.path.join(currentFileDir, figName)
    logger.info('%s - load data...', classLabel)

    # Sampling
    logger.info('Data Sampling - %.2f percent of data', sampleSize * 100)
    if isSample:
        dfFilePath = os.path.join(currentFileDir, dfFileName)
        sampleDfFileName = renameFileName(dfFileName, 'sample=%.2f' % sampleSize)
        sampleDfFilePath = os.path.join(currentFileDir, sampleDfFileName)
        efdata = sampling(sampleSize, dfFilePath, sampleDfFilePath)
        logger.info('Sampled Data file is at the location - %s', sampleDfFilePath)
    else:
        dfFilePath = os.path.join(currentFileDir, dfFileName)
        efdata = loadData('text', classLabel, dfFilePath)
    logger.info('Number of writings in working data : %r', efdata.shape[0])

    # Train-test split
    logger.info('Train-test split : 80-20')
    xtrain_df, xtest_df, ytrain_df, ytest_df = train_test_split(
        efdata['text'], efdata[classLabel], random_state=0, test_size=0.2)

    # Features
    logger.info('Feature - %s ', featureVecName)
    featureVecFunction = createFeaturesVec[featureVecName]
    xtrain_vec, featureVec = featureVecFunction(efdata['text'], xtrain_df)
    xtest_vec = featureVec.transform(xtest_df)

    logger.info('Cross Validation - %s ...', cvType)
    if cvType == 'on-feature':
        # Cross-validation for feature selection
        logger.info('Comparing Different Feature-Vectors')
        compareFeatures(classifierName, efdata, xtrain_df, ytrain_df,
                        xtest_df, cvFold)
    else:
        # Cross-validation for algorithm selection
        logger.info('Comparing Different Classifiers')
        scoring = 'all'
        compareAlgo(xtrain_vec, ytrain_df, cvFold, scoring, figFilePath)

    logger.info('End-Processing')
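# Hedged usage sketch: how a 5-fold, classifier-side cross-validation run on a
# 10% sample might be launched. The argument values are illustrative
# assumptions, not taken from the source.
# crossValidationExperimentWithDF(classLabel='group_id', isSample=True,
#                                 sampleSize=0.1, cvType='on-classifier',
#                                 cvFold=5, classifierName='naive bayes',
#                                 featureVecName='tfidfVect')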
def experimentWithDF(classLabel, classifierName, featureVecName, isSample,
                     sampleSize=0.1, dfFileName="../data/EFDataFrame.pk"):
    """
    Perform a classification experiment for a given dataframe representing
    the dataset to classify.

    Parameters :
    classLabel : string
        The class label, group_id or level_id
    classifierName : string
        Name of the classifier to use (naive bayes, logistic regression,
        knn, ...)
    featureVecName : string
        The type of feature vector to use: tfidfVect, countVec, or
        customedfeatureVec.
    isSample : boolean
        Indicates whether a subsample of the dataset should be used for the
        classification
    sampleSize : float
        Size of the sample dataset, if it applies
    dfFileName : string
        Relative path to the stored dataframe representing the whole dataset.

    Returns : None
    """
    logger.info("Start: Logger file info here : %r", logFile)
    figName = '../figure/experiment_%s_isSample=%r.pdf' % (classLabel, isSample)
    figFilePath = os.path.join(currentFileDir, figName)
    logger.info('%s - load data...', classLabel)

    # Sampling
    logger.info('Data Sampling - %.2f percent of data', sampleSize * 100)
    if isSample:
        dfFilePath = os.path.join(currentFileDir, dfFileName)
        sampleDfFileName = renameFileName(dfFileName, 'sample=%.2f' % sampleSize)
        sampleDfFilePath = os.path.join(currentFileDir, sampleDfFileName)
        efdata = sampling(sampleSize, dfFilePath, sampleDfFilePath)
        logger.info('Sampled Data file is at the location - %s', sampleDfFilePath)
    else:
        dfFilePath = os.path.join(currentFileDir, dfFileName)
        efdata = loadData('text', classLabel, dfFilePath)
    logger.info('Number of writings in working data : %r', efdata.shape[0])

    # Train-test split
    logger.info('train-test split...')
    xtrain_df, xtest_df, ytrain_df, ytest_df = train_test_split(
        efdata['text'], efdata[classLabel], random_state=0, test_size=0.2)

    # Features
    logger.info('features computation with %s ...', featureVecName)
    featureVecFunction = createFeaturesVec[featureVecName]
    xtrain_vec, featureVec = featureVecFunction(efdata['text'], xtrain_df)
    xtest_vec = featureVec.transform(xtest_df)

    # Test the classifier on the held-out split
    logger.info('Test\n___________________________')
    testAlgo(classifierName, xtrain_vec, ytrain_df, xtest_vec, ytest_df,
             figFilePath)
    logger.info('END processing on dataframe')
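# Hedged usage sketch: a single classification experiment on the full dataset
# with a TF-IDF feature vector. The argument values are illustrative
# assumptions, not taken from the source.
# experimentWithDF(classLabel='level_id', classifierName='logistic regression',
#                  featureVecName='tfidfVect', isSample=False)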
# Assemble the per-column preprocessing steps into a single transformer
preprocessor = ColumnTransformer(transformers=[
    ('num', numeric_transformer, numeric_features),
    ('feed_col', feedback_feature_transformer, feedback_features),
    ('other_cat_col', other_cat_transformer, other_cat_cols)])

from sklearn.ensemble import RandomForestClassifier

# Add preprocessing and the classifier into a Pipeline
clf = Pipeline(steps=[('preprocessor', preprocessor),
                      ('classifier', RandomForestClassifier(bootstrap=True,
                                                            max_depth=30,
                                                            max_features='auto',
                                                            min_samples_leaf=1,
                                                            n_estimators=100))])

data = pre.loadData()

# Getting X and y
X1 = data.drop(['Satisfaction', 'ArrivalDelayin_Mins'], axis=1)
y1 = pd.get_dummies(data['Satisfaction'])

# Data split
from sklearn.model_selection import train_test_split

# Splitting the data for training and testing our model
X_trains, X_tests, y_trains, y_tests = train_test_split(X1, y1,
                                                        random_state=1,
                                                        stratify=y1)

# Fitting the pipeline
clf.fit(X_trains, y_trains)
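# Hedged follow-up sketch (an assumption, not part of the original script):
# once the pipeline is fitted, the held-out split can be scored like this.
from sklearn.metrics import accuracy_score

y_preds = clf.predict(X_tests)
print("Hold-out accuracy:", accuracy_score(y_tests, y_preds))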
print("\n\nCurrent configuration: Task " + task + "; Model " + model_name + "; Subject", str(subject)) print("Cascade: detecion + classification") # get test set if subject == 23: X_test = preprocessing.loadDataMultiple(label=label, folder=data_folder, window_size=window_size, stride=stride, make_binary=False, null_class=True, print_info=print_info)[2] else: X_test = preprocessing.loadData(subject=subject, label=label, folder=data_folder, window_size=window_size, stride=stride, make_binary=False, null_class=True, print_info=print_info)[2] # mask mask = (Y_pred_ad == 1) activity_windows = X_test[mask, :, :] if model_name == "Convolutional2DRecurrent": activity_windows = activity_windows.reshape(activity_windows.shape[0], window_size, X_test.shape[2], 1) Y_casc_ac = model.predict_classes(activity_windows) + 1 # last model saved is "activity classification" Y_casc = Y_pred_ad Y_casc[mask] = Y_casc_ac score_casc = f1_score(Y_true, Y_casc, average='weighted') print("Two-Steps results:\n", classification_report(Y_true, Y_casc)) # store results as text
    # Tail of the one-hot encoding helper: map each coin denomination to its
    # index in the one-hot vector (equivalent to the original chain of
    # conditional assignments).
    categories = ['Quinarius', 'Denarius', 'As', 'Aureus', 'Quinarius aureus',
                  'Dupondius', 'Quadrans', 'Sestertius', 'Semis',
                  'Cistophorus', 'Drachma', 'Didrachm', 'Hemidrachm']
    if cat in categories:
        onehot[categories.index(cat)] = 1
    return onehot


if __name__ == '__main__':
    cnn = CNN()
    props, imgs = loadData('dataframe.csv', './images')
    cnn.setMintModel()
    # NOTE: input_x and input_y are not defined in this fragment; see the
    # sketch below for how they are presumably built from imgs and props.
    train(cnn, input_x, input_y, 10000, 100, './')
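# Hedged sketch (an assumption, not the original code): the train() call above
# expects input_x and input_y, which do not appear in this fragment. They would
# presumably be built from the loaded images and one-hot encoded coin
# properties, roughly as below; encode_denomination is a hypothetical name for
# the helper whose tail is shown above.
# import numpy as np
# input_x = np.asarray(imgs)                                       # image tensors
# input_y = np.asarray([encode_denomination(p) for p in props])    # one-hot labels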
def cascade_classification(subject, task, model_name, data_folder,
                           window_size=15, stride=5, epochs=15, batch_size=32,
                           balance_classes=False, GPU=False, print_info=False):
    # preprocessing
    if task == "A":
        label = 0
    elif task == "B":
        label = 6
    else:
        print("Error: invalid task.")

    if subject == 23:
        X_train, Y_train, X_test, Y_test, n_features, n_classes = preprocessing.loadDataMultiple(
            label=label, folder=data_folder, window_size=window_size,
            stride=stride, make_binary=False, null_class=False,
            print_info=print_info)
    else:
        X_train, Y_train, X_test, Y_test, n_features, n_classes = preprocessing.loadData(
            subject=subject, label=label, folder=data_folder,
            window_size=window_size, stride=stride, make_binary=False,
            null_class=False, print_info=print_info)

    # model
    if model_name == "Convolutional":
        model = models.Convolutional((window_size, n_features), n_classes,
                                     print_info=print_info)
    elif model_name == "Convolutional1DRecurrent":
        model = models.Convolutional1DRecurrent((window_size, n_features),
                                                n_classes, GPU=GPU,
                                                print_info=print_info)
    elif model_name == "Convolutional2DRecurrent":
        model = models.Convolutional2DRecurrent((window_size, n_features, 1),
                                                n_classes, GPU=GPU,
                                                print_info=print_info)
        # reshaping for the 2D convolutional model
        X_train = X_train.reshape(X_train.shape[0], window_size, n_features, 1)
        X_test = X_test.reshape(X_test.shape[0], window_size, n_features, 1)
    elif model_name == "ConvolutionalDeepRecurrent":
        model = models.ConvolutionalDeepRecurrent((window_size, n_features),
                                                  n_classes, GPU=GPU,
                                                  print_info=print_info)
    else:
        print("Model not found.")

    model.compile(optimizer=Adam(lr=0.001), loss="categorical_crossentropy",
                  metrics=["accuracy"])
    save_model_name = task + "_" + model_name + "_TSC_" + str(subject)
    filepath = './data/models/' + save_model_name + '.hdf5'
    print("Model:", save_model_name, "\nLocation:", filepath, "\n")

    # training with checkpointing and learning-rate reduction on plateau
    checkpointer = ModelCheckpoint(filepath=filepath, verbose=1,
                                   save_best_only=True)
    lr_reducer = ReduceLROnPlateau(factor=0.1, patience=5, min_lr=0.00001,
                                   verbose=1)
    model.fit(x=X_train, y=to_categorical(Y_train), epochs=epochs,
              batch_size=batch_size, verbose=1,
              validation_data=(X_test, to_categorical(Y_test)),
              callbacks=[checkpointer, lr_reducer])

    return model, X_test, Y_test, filepath, save_model_name
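# Hedged usage sketch: training the activity-classification stage for one
# subject and model; the returned objects feed the cascade evaluation shown
# earlier. The data_folder value and other argument values are illustrative
# assumptions, not taken from the source.
# model, X_test, Y_test, filepath, save_model_name = cascade_classification(
#     subject=23, task="A", model_name="Convolutional",
#     data_folder="./data/", window_size=15, stride=5,
#     epochs=15, batch_size=32, GPU=False, print_info=True)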