def main():
    """Fit a simple linear regression on typed-in or CSV data and plot it.

    The user is prompted on stdin. If the answer is exactly "Y", the sample
    size is read first, followed by that many integer X values and then that
    many integer Y values (one per input line). Any other answer loads
    'test_file.csv' (comma separated) and uses column 0 as X and column 1
    as Y.

    The coefficients returned by estimate_coefficients are printed and then
    handed to graph_of_regression together with the data.
    """
    c = input('Enter manually or read from a csv file?')
    if c == "Y":
        size = int(input("Enter the size of X attribute:"))
        print("Enter the X Elements:")
        x = np.array([int(input()) for _ in range(size)])
        print("Enter the Y Elements:")
        y = np.array([int(input()) for _ in range(size)])
    else:
        data = gft('test_file.csv', delimiter=',')
        x = np.array([row[0] for row in data])
        y = np.array([row[1] for row in data])

    print("Estimated coefficients of linear regression are: ")
    # Fit once and reuse the result for both the printout and the plot
    # instead of running the estimation twice on the same data.
    coefficients = estimate_coefficients(x, y)
    print(coefficients)
    print("Graph:")
    graph_of_regression(x, y, coefficients)
def retValid():
    """Write every misclassified validation image to wrongPred.csv.

    Loads the full image list and label matrix via read_image_list() and the
    space-delimited prediction matrix from valid_results.csv. The last
    ``predLab.shape[0]`` images/labels are taken as the validation set. For
    each row whose arg-max predicted class differs from the arg-max true
    class, one CSV line "image,correct label,predicted label" is written.

    Returns
    -------
    None. The only output is the CSV file written to disk.
    """
    image_list, Y_all = read_image_list()
    label = ["MEL", "NV", "BCC", "AKIEC", "BKL", "DF", "VASC"]
    TASK3_PRED_FILE = '/home/grads/k/kaihe/Documents/ISIC/models/deepModels/resnet_50/valid_results.csv'
    WRONG_PRED_FILE = '/home/grads/k/kaihe/Documents/ISIC/models/deepModels/resnet_50/wrongPred.csv'

    predLab = gft(TASK3_PRED_FILE, delimiter=' ')

    # The predictions line up with the trailing rows of the full data set.
    first_valid = Y_all.shape[0] - predLab.shape[0]
    valid_list = image_list[first_valid:]
    Y_valid = Y_all[first_valid:, :]

    # The context manager guarantees the report is flushed and closed, even
    # if an index error occurs while writing.
    with open(WRONG_PRED_FILE, 'w') as wrongPred:
        wrongPred.write("Image,Correct Label,Predicted Label\n")
        # Visit every prediction row; the previous upper bound of
        # shape[0] - 1 silently skipped the last validation sample.
        for i in range(predLab.shape[0]):
            predicted = np.argmax(predLab[i, :])
            actual = np.argmax(Y_valid[i, :])
            if predicted != actual:
                wrongPred.write("%s," % valid_list[i])
                wrongPred.write("%s," % label[actual])
                wrongPred.write("%s\n" % label[predicted])
def read_image_list():
    """Load image paths and the label matrix from the Task 3 ground truth CSV.

    Returns
    -------
    image_files : list of str
        One entry per data row of the ground-truth CSV (header row skipped),
        built as TASK3_DATA_DIR + <first column> + '.jpg'.
    labels : numpy.ndarray
        The ground-truth file parsed with genfromtxt, with the first row
        (header) and first column (image name) removed.
    """
    DATA_DIR = '/home/grads/k/kaihe/Documents/ISIC/Task3/'
    TASK3_DATA_DIR = DATA_DIR + 'ISIC2018_Task3_Training_Input/Image/'
    TASK3_TRUTH_FILE = DATA_DIR + 'ISIC2018_Task3_Training_GroundTruth/ISIC2018_Task3_Training_GroundTruth.csv'

    # newline='' is what the csv module expects for files it parses; the
    # context manager closes the handle that was previously leaked.
    with open(TASK3_TRUTH_FILE, newline='') as truth_file:
        csv_reader = csv.reader(truth_file)
        next(csv_reader, None)  # skip the header row
        image_files = [TASK3_DATA_DIR + row[0] + '.jpg' for row in csv_reader]

    # Second pass over the same file for the numeric part; the header row and
    # the image-name column are sliced away.
    labels = gft(TASK3_TRUTH_FILE, delimiter=',')
    labels = labels[1:, 1:]
    return image_files, labels
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 11 08:33:48 2017

PSU-Phys_296: Independent Study
Purpose: Read and convert data set
         Plot data
@author: Aardvark
"""

## Reading data as .dat
from numpy import genfromtxt as gft  # text file -> numpy array
from pandas import DataFrame as df  # tabular container used for the CSV export

# Read all lines with surrounding whitespace stripped. The context manager
# closes the file handle, which the bare open(...).readlines() never did.
with open('all_hminus_compass2007.dat') as dat_file:
    L = [l.strip() for l in dat_file.readlines()]

# Column names are the whitespace-separated tokens of the FIRST line.
# NOTE(review): an earlier comment said the leading hashtag is deleted, but
# nothing strips it here; if the header starts with '#', that character stays
# in the first column name (or becomes its own token) - confirm against the
# data file.
H = L[0].split()

dat = gft('all_hminus_compass2007.dat')  # numeric data from the working dir
data = df(dat, columns=H)  # pandas data frame with the header names

# Save as CSV without the index column.
data.to_csv("all_hminus_compass2007.csv", index=False)
def run_Cell_BLAST(DataPath,
                   LabelsPath,
                   CV_RDataPath,
                   OutputDir,
                   GeneOrderPath="",
                   NumGenes=0,
                   aligned="F"):
    '''
    run Cell_BLAST
    Wrapper script to run Cell_BLAST on a benchmark dataset with cross
    validation, outputs lists of true and predicted cell labels as csv files,
    as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique
    barcodes as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from
    Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature
    selection, defining the genes order for each cross validation fold.
    The default is an empty string, but this variant always reads the file,
    so a valid path has to be supplied. If the path contains "seurat_gene",
    "_seurat" is appended to the output file prefix.
    NumGenes : Number of top genes taken from each fold's column of the gene
    order file (integer). With the default 0 the whole column is used.
    aligned : "T" to call blast.align(test) before querying each test set
    (and to append "_aligned" to the output file prefix); any other value
    queries the unaligned index. Default is "F".

    Outputs
    -------
    Writes <prefix>_true.csv, <prefix>_pred.csv, <prefix>_training_time.csv
    and <prefix>_test_time.csv into OutputDir, where <prefix> starts with
    "Cell_BLAST". Returns None.
    '''

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype='int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    col = np.array(robjects.r['col_Index'], dtype='int')
    col = col - 1  # index comes from R; shift down by one for Python
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the feature file (needed for every run: the selected genes are
    # passed to fit_DIRECTi below regardless of NumGenes)
    features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')

    # read the data and keep only the cells flagged in the RData file
    data_old = cb.data.ExprDataSet.read_table(DataPath,
                                              orientation="cg",
                                              sep=",",
                                              index_col=0,
                                              header=0,
                                              sparsify=True)
    data = cb.data.ExprDataSet(data_old.exprs[tokeep],
                               data_old.obs.iloc[tokeep], data_old.var,
                               data_old.uns)

    # read the labels once, as strings (the former pandas read of the same
    # file was overwritten immediately and has been dropped)
    labels = gft(LabelsPath,
                 dtype="str",
                 skip_header=1,
                 delimiter=",",
                 usecols=col)
    labels = labels[tokeep]

    truelab = []
    pred = []
    tr_time = []
    ts_time = []

    for i in range(np.squeeze(nfolds)):
        train_ind_i = np.array(train_ind[i], dtype='int') - 1

        train = data[train_ind_i, :]
        y_train = labels[train_ind_i]

        # genes for this fold: column i of the gene order file, optionally
        # truncated to the first NumGenes entries, missing values removed
        if (NumGenes > 0):
            feat_to_use = features.iloc[0:NumGenes, i].dropna()
        else:
            feat_to_use = features.iloc[:, i].dropna()

        train.obs['cell_type'] = y_train

        start = tm.time()

        # reduce dimensions: four DIRECTi models that differ only in seed
        models = [
            cb.directi.fit_DIRECTi(train,
                                   feat_to_use,
                                   cat_dim=20,
                                   epoch=500,
                                   patience=20,
                                   random_seed=seed) for seed in range(4)
        ]

        # train model
        blast = cb.blast.BLAST(models, train)
        tr_time.append(tm.time() - start)

        if test_ind.shape[0] != train_ind.shape[0]:
            # Make Inter-dataset work correctly: a single training fold is
            # evaluated against every test fold.
            assert train_ind.shape[0] == np.squeeze(
                nfolds) == 1 and test_ind.shape[0] > train_ind.shape[0]
            test_folds = list(range(test_ind.shape[0]))
        else:
            test_folds = [i]

        for j in test_folds:
            test_ind_i = np.array(test_ind[j], dtype='int') - 1
            test = data[test_ind_i, :]
            y_test = labels[test_ind_i]

            # predict labels
            start = tm.time()
            blast_use = blast.align(test) if aligned == "T" else blast
            test_hits = blast_use.query(test)
            test_pred = test_hits.reconcile_models().filter().annotate(
                'cell_type')
            ts_time.append(tm.time() - start)

            truelab.extend(y_test)
            pred.extend(test_pred.values)

    # write results
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)

    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    method_name = "Cell_BLAST"
    if "seurat_gene" in GeneOrderPath:
        method_name += "_seurat"
    if aligned == "T":
        method_name += "_aligned"

    truelab.to_csv(str(Path(OutputDir + f"/{method_name}_true.csv")),
                   index=False)
    pred.to_csv(str(Path(OutputDir + f"/{method_name}_pred.csv")),
                index=False)
    tr_time.to_csv(str(Path(OutputDir + f"/{method_name}_training_time.csv")),
                   index=False)
    ts_time.to_csv(str(Path(OutputDir + f"/{method_name}_test_time.csv")),
                   index=False)
def run_Cell_BLAST(DataPath,
                   LabelsPath,
                   CV_RDataPath,
                   OutputDir,
                   GeneOrderPath="",
                   NumGenes=0):
    '''
    run Cell_BLAST
    Wrapper script to run Cell_BLAST on a benchmark dataset with cross
    validation, outputs lists of true and predicted cell labels as csv files,
    as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique
    barcodes as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from
    Cross_Validation.R function.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature
    selection, defining the genes order for each cross validation fold.
    Only read when NumGenes > 0; default is an empty string.
    NumGenes : Number of genes used in case of feature selection (integer),
    default is 0 (no feature selection).

    Outputs
    -------
    Writes Cell_BLAST_true.csv, Cell_BLAST_pred.csv,
    Cell_BLAST_training_time.csv and Cell_BLAST_test_time.csv into OutputDir.
    Returns None.
    '''

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype='int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    col = np.array(robjects.r['col_Index'], dtype='int')
    col = col - 1  # index comes from R; shift down by one for Python
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,
                               header=0,
                               index_col=None,
                               sep=',')

    # read the data (normalized on load) and keep only the flagged cells
    data_old = cb.data.ExprDataSet.read_table(DataPath,
                                              orientation="cg",
                                              sep=",",
                                              index_col=0,
                                              header=0,
                                              sparsify=True).normalize()
    data = cb.data.ExprDataSet(data_old.exprs[tokeep],
                               data_old.obs.iloc[tokeep], data_old.var,
                               data_old.uns)

    # read the labels once, as strings (the former pandas read of the same
    # file was overwritten immediately and has been dropped)
    labels = gft(LabelsPath,
                 dtype="str",
                 skip_header=1,
                 delimiter=",",
                 usecols=col)
    labels = labels[tokeep]

    truelab = []
    pred = []
    tr_time = []
    ts_time = []

    for i in range(np.squeeze(nfolds)):
        test_ind_i = np.array(test_ind[i], dtype='int') - 1
        train_ind_i = np.array(train_ind[i], dtype='int') - 1

        train = data[train_ind_i, :]
        test = data[test_ind_i, :]
        y_train = labels[train_ind_i]
        y_test = labels[test_ind_i]

        # restrict both splits to the first NumGenes genes of fold i
        if (NumGenes > 0):
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train[:, feat_to_use]
            test = test[:, feat_to_use]

        train.obs['cell_type'] = y_train

        start = tm.time()

        # reduce dimensions: four DIRECTi models that differ in seed and in
        # the path ("0".."3") handed to fit_DIRECTi
        num_epoch = 50
        models = [
            cb.directi.fit_DIRECTi(train,
                                   epoch=num_epoch,
                                   patience=10,
                                   random_seed=seed,
                                   path="%d" % seed) for seed in range(4)
        ]

        # train model
        blast = cb.blast.BLAST(models, train).build_empirical()
        tr_time.append(tm.time() - start)

        # predict labels
        start = tm.time()
        test_pred = blast.query(test).annotate('cell_type')
        ts_time.append(tm.time() - start)

        truelab.extend(y_test)
        pred.extend(test_pred.values)

    # write results
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)

    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    truelab.to_csv(str(Path(OutputDir + "/Cell_BLAST_true.csv")), index=False)
    pred.to_csv(str(Path(OutputDir + "/Cell_BLAST_pred.csv")), index=False)
    tr_time.to_csv(str(Path(OutputDir + "/Cell_BLAST_training_time.csv")),
                   index=False)
    ts_time.to_csv(str(Path(OutputDir + "/Cell_BLAST_test_time.csv")),
                   index=False)
import numpy as np
from numpy import genfromtxt as gft
import matplotlib.pyplot as plt
import pickle
from sklearn.cluster import KMeans
from sklearn.svm import SVC

# Load the previously trained classifier.
# NOTE(review): unpickling runs arbitrary code stored in the file - only load
# classifier.pkl from a trusted source.
with open("classifier.pkl", "rb") as file:
    clf = pickle.load(file)

data = gft("./Dataset.csv", delimiter=',')
# Drop the first two rows and keep columns 1..8.
mydata = data[2:, 1:9]

time = []
value1 = []
value2 = []
value3 = []
value4 = []
value5 = []
value6 = []
value7 = []
value8 = []

fig = plt.figure()

# Iterate over the rows of mydata, not data: mydata is two rows shorter, so
# looping to len(data) ran past its end and raised IndexError.
for f in range(len(mydata)):
    time.append(f)
    value1.append(mydata[f, 0])
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 11 08:33:48 2017

PSU-Phys_296: Independent Study
Purpose: Read and convert data set
         Plot data
@author: Aardvark
"""

import matplotlib.pyplot as plt
from numpy import genfromtxt as gft
from pandas import DataFrame as df

# --- Load -------------------------------------------------------------------
# Parse the .dat file from the working directory into a numpy array.
dat = gft('clas_data.dat')

# --- Export -----------------------------------------------------------------
# Wrap the array in a data frame and write it out without the index column.
data = df(dat)
data.to_csv("clas_data.csv", index=False)

# --- Plot -------------------------------------------------------------------
# Scatter of column 1 against column 0, with column 2 as the vertical and
# column 3 as the horizontal error bar.
# NOTE(review): if columns 2 and 3 are the random and systematic errors of the
# same quantity, column 3 probably should not be the x error - confirm.
plt.figure()
plt.errorbar(dat[:, 0], dat[:, 1], yerr=dat[:, 2], xerr=dat[:, 3], fmt='o')
plt.title('Figure 1')
plt.show()
plt.clf()  # reset the current figure after it has been shown
# Notebook cell(s) exported from Google Colab: mounts Google Drive and loads
# the train/test CSV files for "Prob2" into numpy arrays.
import numpy as np
# NOTE(review): the wildcard import puts every public numpy name into this
# namespace; later cells may rely on that, so it is left untouched.
from numpy import *
from google.colab import drive
# Make the user's Drive available under /content/gdrive.
drive.mount('/content/gdrive')
from numpy import genfromtxt as gft
# IPython shell escape - only valid inside a notebook / IPython session.
!pip install scipy
#https://riptutorial.com/numpy/example/22990/reading-csv-files
# Training features and targets, parsed as comma-separated numeric text.
X_train= gft("/content/gdrive/My Drive/P2/Prob2_Xtrain.csv", delimiter=",")
Y_train= gft("/content/gdrive/My Drive/P2/Prob2_ytrain.csv", delimiter=",")
# Bare expressions: in a notebook these display the value when they are the
# last statement of a cell; as plain Python they have no effect.
X_train
Y_train
# Held-out features and targets, read the same way.
X_test = gft("/content/gdrive/My Drive/P2/Prob2_Xtest.csv", delimiter=",")
Y_test = gft("/content/gdrive/My Drive/P2/Prob2_ytest.csv", delimiter=",")
#Splitting X_train into its two associated output classes based on Y_train
# Accumulator for the class-0 rows; it is filled by code outside this excerpt.
X_train_class0=[]
def run_Cell_BLAST(input_dir, output_dir, datafile, labfile, Rfile):
    '''
    Run CellBlast

    Parameters
    ----------
    input_dir : directory of the input files
    output_dir : directory of the output files
    datafile : name of the data file
    labfile : name of the label file
    Rfile : file to read the cross validation indices from

    Outputs
    -------
    Writes Cell_BLAST_<col>_true.csv, Cell_BLAST_<col>_pred.csv,
    Cell_BLAST_<col>_training_time.csv and Cell_BLAST_<col>_test_time.csv
    into output_dir, where <col> is str() of the label column index array.
    Returns None.

    Side effects
    ------------
    Changes the process working directory: first to input_dir (Rfile is
    loaded relative to it), then to output_dir, where it is left on return.
    fit_DIRECTi is given the paths "0".."3" while output_dir is the
    working directory.
    '''
    # Resolve both directories before the first chdir so that relative
    # arguments keep pointing at the intended locations afterwards.
    input_dir = os.path.abspath(input_dir)
    output_dir = os.path.abspath(output_dir)

    os.chdir(input_dir)

    # read the Rdata file
    robjects.r['load'](Rfile)

    nfolds = np.array(robjects.r['n_folds'], dtype='int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    col = np.array(robjects.r['col_Index'], dtype='int')
    col = col - 1  # index comes from R; shift down by one for Python
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data and keep only the flagged cells; os.path.join works with
    # or without a trailing separator on input_dir
    data_old = cb.data.ExprDataSet.read_table(os.path.join(input_dir, datafile),
                                              orientation="cg",
                                              sep=",",
                                              index_col=0,
                                              header=0)
    data = cb.data.ExprDataSet(data_old.exprs[tokeep],
                               data_old.obs.iloc[tokeep], data_old.var,
                               data_old.uns)

    # read the labels once, as strings (the former pandas read of the same
    # file was overwritten immediately and has been dropped)
    labels = gft(os.path.join(input_dir, labfile),
                 dtype="str",
                 skip_header=1,
                 delimiter=",",
                 usecols=col)
    labels = labels[tokeep]

    # everything written from here on is relative to the output directory
    os.chdir(output_dir)

    truelab = []
    pred = []
    tr_time = []
    ts_time = []

    for i in range(np.squeeze(nfolds)):
        test_ind_i = np.array(test_ind[i], dtype='int') - 1
        train_ind_i = np.array(train_ind[i], dtype='int') - 1

        train = data[train_ind_i, :]
        test = data[test_ind_i, :]
        y_train = labels[train_ind_i]
        y_test = labels[test_ind_i]

        train.obs['cell_type'] = y_train

        # normalization of the training split is counted as training time
        start = tm.time()
        train = train.normalize()

        # reduce dimensions: four DIRECTi models that differ in seed and in
        # the path ("0".."3") handed to fit_DIRECTi
        num_epoch = 50
        models = [
            cb.directi.fit_DIRECTi(train,
                                   latent_dim=10,
                                   cat_dim=20,
                                   epoch=num_epoch,
                                   patience=10,
                                   random_seed=seed,
                                   path="%d" % seed) for seed in range(4)
        ]

        # train model
        blast = cb.blast.BLAST(models, train).build_empirical()
        tr_time.append(tm.time() - start)

        # predict labels
        start = tm.time()
        test_pred = blast.query(test).annotate('cell_type')
        ts_time.append(tm.time() - start)

        truelab.extend(y_test)
        pred.extend(test_pred.values)

    # write results (working directory is already output_dir)
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)

    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    truelab.to_csv("Cell_BLAST_" + str(col) + "_true.csv", index=False)
    pred.to_csv("Cell_BLAST_" + str(col) + "_pred.csv", index=False)

    tr_time.to_csv("Cell_BLAST_" + str(col) + "_training_time.csv",
                   index=False)
    ts_time.to_csv("Cell_BLAST_" + str(col) + "_test_time.csv", index=False)
def run_Cell_BLAST(DataPath,
                   LabelsPath,
                   CV_RDataPath,
                   OutputDir,
                   GeneOrderPath="",
                   NumGenes=0):
    '''
    run Cell_BLAST
    Runs Cell_BLAST on one dataset using the cross validation folds stored
    in an RData file and writes the true labels, predicted labels and
    training / test times as csv files.

    Parameters
    ----------
    DataPath : Data file path (.csv), read with orientation "cg", first
    column as index and first row as header.
    LabelsPath : Label file path (.csv); one header line is skipped.
    CV_RDataPath : RData file providing n_folds, Cells_to_Keep, col_Index,
    Test_Idx and Train_Idx.
    OutputDir : Output directory; created (including missing parents) if it
    does not exist.
    GeneOrderPath : Gene order file path (.csv), one column per fold. Only
    read when NumGenes > 0; default is an empty string.
    NumGenes : Number of leading genes taken from the fold's column of the
    gene order file (integer), default is 0 (no feature selection).

    Outputs
    -------
    Writes Cell_BLAST_true.csv, Cell_BLAST_pred.csv,
    Cell_BLAST_training_time.csv and Cell_BLAST_test_time.csv into OutputDir.
    Returns None.
    '''

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype='int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    col = np.array(robjects.r['col_Index'], dtype='int')
    col = col - 1  # index comes from R; shift down by one for Python
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the feature file
    if (NumGenes > 0):
        features = pd.read_csv(GeneOrderPath,
                               header=0,
                               index_col=None,
                               sep=',')

    # read the data (normalized on load) and keep only the flagged cells
    data_old = cb.data.ExprDataSet.read_table(DataPath,
                                              orientation="cg",
                                              sep=",",
                                              index_col=0,
                                              header=0,
                                              sparsify=True).normalize()
    data = cb.data.ExprDataSet(data_old.exprs[tokeep],
                               data_old.obs.iloc[tokeep], data_old.var,
                               data_old.uns)

    # read the labels once, as strings (the former pandas read of the same
    # file was overwritten immediately and has been dropped)
    labels = gft(LabelsPath,
                 dtype="str",
                 skip_header=1,
                 delimiter=",",
                 usecols=col)
    labels = labels[tokeep]

    truelab = []
    pred = []
    tr_time = []
    ts_time = []

    for i in range(np.squeeze(nfolds)):
        test_ind_i = np.array(test_ind[i], dtype='int') - 1
        train_ind_i = np.array(train_ind[i], dtype='int') - 1

        train = data[train_ind_i, :]
        test = data[test_ind_i, :]
        y_train = labels[train_ind_i]
        y_test = labels[test_ind_i]

        # restrict both splits to the first NumGenes genes of fold i
        if (NumGenes > 0):
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train[:, feat_to_use]
            test = test[:, feat_to_use]

        train.obs['cell_type'] = y_train

        start = tm.time()

        # reduce dimensions: four DIRECTi models that differ in seed and in
        # the path ("0".."3") handed to fit_DIRECTi
        num_epoch = 50
        models = [
            cb.directi.fit_DIRECTi(train,
                                   epoch=num_epoch,
                                   patience=10,
                                   random_seed=seed,
                                   path="%d" % seed) for seed in range(4)
        ]

        # train model
        # NOTE: build_empirical() is not called in this variant (the call was
        # commented out in the original code).
        blast = cb.blast.BLAST(models, train)
        tr_time.append(tm.time() - start)

        # predict labels
        start = tm.time()
        test_pred = blast.query(test).annotate('cell_type')
        ts_time.append(tm.time() - start)

        truelab.extend(y_test)
        pred.extend(test_pred.values)

    # write results
    truelab = pd.DataFrame(truelab)
    pred = pd.DataFrame(pred)

    tr_time = pd.DataFrame(tr_time)
    ts_time = pd.DataFrame(ts_time)

    # makedirs with exist_ok avoids the check-then-create race of
    # os.path.exists + os.mkdir and also creates missing parent directories.
    os.makedirs(OutputDir, exist_ok=True)

    truelab.to_csv(str(Path(OutputDir + "/Cell_BLAST_true.csv")), index=False)
    pred.to_csv(str(Path(OutputDir + "/Cell_BLAST_pred.csv")), index=False)
    tr_time.to_csv(str(Path(OutputDir + "/Cell_BLAST_training_time.csv")),
                   index=False)
    ts_time.to_csv(str(Path(OutputDir + "/Cell_BLAST_test_time.csv")),
                   index=False)