Beispiel #1
0
def main():
    """Fit and plot a simple linear regression on typed-in or CSV data.

    The user is asked how to supply the data. Answering exactly ``"Y"``
    prompts for the number of points, then that many integer X values and
    that many integer Y values (one per line). Any other answer loads the
    first two columns of ``test_file.csv`` instead.

    The coefficients returned by ``estimate_coefficients`` are printed and
    then handed to ``graph_of_regression`` together with the data.
    """
    c = input('Enter manually or read from a csv file?')
    if c == "Y":
        size = int(input("Enter the size of X attribute:"))
        print("Enter the X Elements:")
        x = np.array([int(input()) for _ in range(size)])
        print("Enter the Y Elements:")
        y = np.array([int(input()) for _ in range(size)])
    else:
        data = gft('test_file.csv', delimiter=',')
        # Column 0 holds the X values, column 1 the Y values.
        x = np.array([row[0] for row in data])
        y = np.array([row[1] for row in data])

    print("Estimated coefficients of linear regression are: ")
    # Fit once and reuse the result for both the printout and the plot
    # instead of running the estimation twice.
    coefficients = estimate_coefficients(x, y)
    print(coefficients)
    print("Graph:")
    graph_of_regression(x, y, coefficients)
Beispiel #2
0
def retValid():
    """Write every misclassified validation image to ``wrongPred.csv``.

    The predictions are read from ``valid_results.csv`` (space-delimited,
    one row of class scores per image). The validation set is taken to be
    the last ``predLab.shape[0]`` entries returned by ``read_image_list``.
    For each image whose arg-max predicted class differs from the arg-max
    of its ground-truth row, one line ``image,correct label,predicted
    label`` is written to the output file.
    """
    image_list, Y_all = read_image_list()
    # NOTE(review): assumes the score columns follow this class order -
    # confirm against the file that produced valid_results.csv.
    label = ["MEL", "NV", "BCC", "AKIEC", "BKL", "DF", "VASC"]
    TASK3_PRED_FILE = '/home/grads/k/kaihe/Documents/ISIC/models/deepModels/resnet_50/valid_results.csv'
    predLab = gft(TASK3_PRED_FILE, delimiter=' ')

    n_valid = predLab.shape[0]
    first_valid = Y_all.shape[0] - n_valid
    valid_list = image_list[first_valid:]
    Y_valid = Y_all[first_valid:, :]

    # The context manager guarantees the report is flushed and closed,
    # even if an exception is raised part-way through.
    with open(
            '/home/grads/k/kaihe/Documents/ISIC/models/deepModels/resnet_50/wrongPred.csv',
            'w') as wrongPred:
        wrongPred.write("Image,Correct Label,Predicted Label\n")
        # Iterate over every prediction row; the previous upper bound of
        # ``n_valid - 1`` silently skipped the last validation image.
        for i in range(n_valid):
            predicted = np.argmax(predLab[i, :])
            actual = np.argmax(Y_valid[i, :])
            if predicted != actual:
                wrongPred.write("%s," % valid_list[i])
                wrongPred.write("%s," % label[actual])
                wrongPred.write("%s\n" % label[predicted])
Beispiel #3
0
def read_image_list():
    """Return the image paths and label matrix from the Task 3 ground truth.

    Returns
    -------
    image_files : list of str
        One path per data row of the ground-truth csv, built as
        ``TASK3_DATA_DIR + <first column> + '.jpg'``. The first (header)
        row is skipped.
    labels : numpy.ndarray
        The ground-truth file parsed with ``genfromtxt`` with its first row
        and first column removed (presumably the header row and the image
        name column, which do not parse as numbers - confirm against the
        file).
    """
    DATA_DIR = '/home/grads/k/kaihe/Documents/ISIC/Task3/'
    TASK3_DATA_DIR = DATA_DIR + 'ISIC2018_Task3_Training_Input/Image/'

    TASK3_TRUTH_FILE = DATA_DIR + 'ISIC2018_Task3_Training_GroundTruth/ISIC2018_Task3_Training_GroundTruth.csv'

    # Close the file as soon as the names are collected; ``newline=''`` is
    # what the csv module expects for files it reads.
    with open(TASK3_TRUTH_FILE, newline='') as truth_file:
        csv_reader = csv.reader(truth_file)
        next(csv_reader, None)  # skip the header row
        image_files = [TASK3_DATA_DIR + row[0] + '.jpg' for row in csv_reader]

    labels = gft(TASK3_TRUTH_FILE, delimiter=',')
    labels = labels[1:, 1:]

    return image_files, labels
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 11 08:33:48 2017
PSU-Phys_296: Independent Study
Purpose:
    Read and convert data set
    Plot data
@author: Aardvark
"""
## Reading data as .dat 
from numpy import genfromtxt as gft # Import package
from pandas import DataFrame as df # Import package

# Read the raw text once to recover the column headers. The context manager
# closes the file handle instead of leaving it to the garbage collector.
with open('all_hminus_compass2007.dat') as dat_file:
    L = dat_file.readlines()            # read each line
L = [l.strip() for l in L]              # strip leading/trailing whitespace from each line
# Column headers are the whitespace-separated tokens of the FIRST line.
# NOTE(review): a leading '#' token is not removed here; the number of
# tokens must match the number of columns parsed below - confirm against
# the data file.
H = L[0].split()
dat = gft('all_hminus_compass2007.dat')  # read file from working dir as numpy array
data = df(dat, columns=H)                # wrap the array in a pandas data frame with headers

# saving as csv
data.to_csv("all_hminus_compass2007.csv", index=False)    # save as csv without index values


def run_Cell_BLAST(DataPath,
                   LabelsPath,
                   CV_RDataPath,
                   OutputDir,
                   GeneOrderPath="",
                   NumGenes=0,
                   aligned="F"):
    '''
    run Cell_BLAST
    Wrapper script to run Cell_BLAST on a benchmark dataset using the cross
    validation folds stored in an RData file. Outputs lists of true and
    predicted cell labels as csv files, as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    It must define n_folds, Cells_to_Keep, col_Index, Test_Idx and Train_Idx.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold (one column per fold).
    Although it defaults to an empty string, this function always reads it,
    so a valid path is required. If the path contains "seurat_gene", the
    output file names get a "_seurat" suffix.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0.
    When 0, every non-missing entry of the fold's column is used.
    aligned : "T" to call blast.align(test) and query the aligned object,
    any other value to query the trained BLAST object directly. "T" also
    adds an "_aligned" suffix to the output file names. Default is "F".

    Outputs
    -------
    Writes <method>_true.csv, <method>_pred.csv, <method>_training_time.csv
    and <method>_test_time.csv under OutputDir.
    '''

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype='int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    col = np.array(robjects.r['col_Index'], dtype='int')
    col = col - 1  # R indices are 1-based, numpy indices are 0-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])
    n_folds = int(np.squeeze(nfolds))

    # read the feature file (needed for every fold, whatever NumGenes is)
    features = pd.read_csv(GeneOrderPath, header=0, index_col=None, sep=',')

    # read the data
    data_old = cb.data.ExprDataSet.read_table(DataPath,
                                              orientation="cg",
                                              sep=",",
                                              index_col=0,
                                              header=0,
                                              sparsify=True)

    # keep only the cells selected by the cross validation script
    data = cb.data.ExprDataSet(data_old.exprs[tokeep],
                               data_old.obs.iloc[tokeep], data_old.var,
                               data_old.uns)

    # read the labels once, as a string array (the former pandas read of the
    # same file was discarded immediately and has been removed)
    labels = gft(LabelsPath,
                 dtype="str",
                 skip_header=1,
                 delimiter=",",
                 usecols=col)
    labels = labels[tokeep]

    truelab = []
    pred = []
    tr_time = []
    ts_time = []

    for i in range(n_folds):
        train_ind_i = np.array(train_ind[i], dtype='int') - 1

        train = data[train_ind_i, :]
        y_train = labels[train_ind_i]

        # The selected features are passed to fit_DIRECTi below rather than
        # being used to slice the expression matrix.
        if NumGenes > 0:
            feat_to_use = features.iloc[0:NumGenes, i].dropna()
        else:
            feat_to_use = features.iloc[:, i].dropna()

        train.obs['cell_type'] = y_train

        start = tm.time()

        # reduce dimensions: four models that differ only in their random seed
        models = [
            cb.directi.fit_DIRECTi(train,
                                   feat_to_use,
                                   cat_dim=20,
                                   epoch=500,
                                   patience=20,
                                   random_seed=j) for j in range(4)
        ]

        # train model
        blast = cb.blast.BLAST(models, train)
        tr_time.append(tm.time() - start)

        if test_ind.shape[0] != train_ind.shape[0]:
            # Make Inter-dataset work correctly: a single training fold is
            # evaluated against every test fold.
            assert train_ind.shape[0] == n_folds == 1 and test_ind.shape[
                0] > train_ind.shape[0]
            test_folds = list(range(test_ind.shape[0]))
        else:
            test_folds = [i]

        for j in test_folds:
            test_ind_i = np.array(test_ind[j], dtype='int') - 1
            test = data[test_ind_i, :]
            y_test = labels[test_ind_i]

            # predict labels
            start = tm.time()
            blast_use = blast.align(test) if aligned == "T" else blast
            test_hits = blast_use.query(test)
            test_pred = test_hits.reconcile_models().filter().annotate(
                'cell_type')
            ts_time.append(tm.time() - start)

            truelab.extend(y_test)
            pred.extend(test_pred.values)

    # write results
    method_name = "Cell_BLAST"
    if "seurat_gene" in GeneOrderPath:
        method_name += "_seurat"
    if aligned == "T":
        method_name += "_aligned"

    results = (
        (pd.DataFrame(truelab), "true"),
        (pd.DataFrame(pred), "pred"),
        (pd.DataFrame(tr_time), "training_time"),
        (pd.DataFrame(ts_time), "test_time"),
    )
    for frame, suffix in results:
        frame.to_csv(str(Path(OutputDir + f"/{method_name}_{suffix}.csv")),
                     index=False)
def run_Cell_BLAST(DataPath,
                   LabelsPath,
                   CV_RDataPath,
                   OutputDir,
                   GeneOrderPath="",
                   NumGenes=0):
    '''
    run Cell_BLAST
    Wrapper script to run Cell_BLAST on a benchmark dataset using the cross
    validation folds stored in an RData file. Outputs lists of true and
    predicted cell labels as csv files, as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), cells-genes matrix with cell unique barcodes
    as row names and gene names as column names.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData), obtained from Cross_Validation.R function.
    It must define n_folds, Cells_to_Keep, col_Index, Test_Idx and Train_Idx.
    OutputDir : Output directory defining the path of the exported file.
    GeneOrderPath : Gene order file path (.csv) obtained from feature selection,
    defining the genes order for each cross validation fold (one column per
    fold). Only read when NumGenes > 0; default is an empty string.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0
    (no feature selection).

    Outputs
    -------
    Writes Cell_BLAST_true.csv, Cell_BLAST_pred.csv,
    Cell_BLAST_training_time.csv and Cell_BLAST_test_time.csv under OutputDir.
    '''

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype='int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype='bool')
    col = np.array(robjects.r['col_Index'], dtype='int')
    col = col - 1  # R indices are 1-based, numpy indices are 0-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the feature file
    if NumGenes > 0:
        features = pd.read_csv(GeneOrderPath,
                               header=0,
                               index_col=None,
                               sep=',')

    # read the data (normalized right after loading)
    data_old = cb.data.ExprDataSet.read_table(DataPath,
                                              orientation="cg",
                                              sep=",",
                                              index_col=0,
                                              header=0,
                                              sparsify=True).normalize()

    # keep only the cells selected by the cross validation script
    data = cb.data.ExprDataSet(data_old.exprs[tokeep],
                               data_old.obs.iloc[tokeep], data_old.var,
                               data_old.uns)

    # read the labels once, as a string array (the former pandas read of the
    # same file was discarded immediately and has been removed)
    labels = gft(LabelsPath,
                 dtype="str",
                 skip_header=1,
                 delimiter=",",
                 usecols=col)
    labels = labels[tokeep]

    truelab = []
    pred = []
    tr_time = []
    ts_time = []

    for i in range(int(np.squeeze(nfolds))):
        test_ind_i = np.array(test_ind[i], dtype='int') - 1
        train_ind_i = np.array(train_ind[i], dtype='int') - 1

        train = data[train_ind_i, :]
        test = data[test_ind_i, :]
        y_train = labels[train_ind_i]
        y_test = labels[test_ind_i]

        if NumGenes > 0:
            # restrict both sets to the first NumGenes entries of this
            # fold's column in the gene order file
            feat_to_use = features.iloc[0:NumGenes, i]
            train = train[:, feat_to_use]
            test = test[:, feat_to_use]

        train.obs['cell_type'] = y_train

        start = tm.time()

        # reduce dimensions: four models that differ only in their random
        # seed; each one is given its own path named after the seed
        num_epoch = 50
        models = [
            cb.directi.fit_DIRECTi(train,
                                   epoch=num_epoch,
                                   patience=10,
                                   random_seed=j,
                                   path="%d" % j) for j in range(4)
        ]

        # train model
        blast = cb.blast.BLAST(models, train).build_empirical()
        tr_time.append(tm.time() - start)

        # predict labels
        start = tm.time()
        test_pred = blast.query(test).annotate('cell_type')
        ts_time.append(tm.time() - start)

        truelab.extend(y_test)
        pred.extend(test_pred.values)

    # write results
    results = (
        (pd.DataFrame(truelab), "true"),
        (pd.DataFrame(pred), "pred"),
        (pd.DataFrame(tr_time), "training_time"),
        (pd.DataFrame(ts_time), "test_time"),
    )
    for frame, suffix in results:
        frame.to_csv(str(Path(OutputDir + "/Cell_BLAST_" + suffix + ".csv")),
                     index=False)
Beispiel #7
0
import numpy as np
from numpy import genfromtxt as gft 
import matplotlib.pyplot as plt 
import pickle
from sklearn.cluster import KMeans
from sklearn.svm import SVC


# Load the previously trained classifier.
# NOTE(review): pickle.load can execute arbitrary code contained in the
# file - only load a classifier.pkl that comes from a trusted source.
with open("classifier.pkl", "rb") as file:
    clf = pickle.load(file)

# Parse the whole csv, then keep rows from index 2 onwards and columns 1-8.
data = gft("./Dataset.csv", delimiter=',')
mydata = data[2:, 1:9]


# Sample index and one list per selected data column.
time = []
value1 = []
value2 = []
value3 = []
value4 = []
value5 = []
value6 = []
value7 = []
value8 = []

fig = plt.figure()

# Iterate over the rows of ``mydata`` (not ``data``): ``mydata`` is two rows
# shorter, so looping up to len(data) raised an IndexError at the end.
for f in range(len(mydata)):
    time.append(f)
    value1.append(mydata[f, 0])
# -*- coding: utf-8 -*-
"""
Created on Mon Sep 11 08:33:48 2017
PSU-Phys_296: Independent Study
Purpose:
    Read and convert data set
    Plot data
@author: Aardvark
"""
# Reading data as .dat
from numpy import genfromtxt as gft  # Import package

dat = gft('clas_data.dat')  # load the data file from the working directory

# Save the same table as csv.
from pandas import DataFrame as df  # Import package

data = df(dat)  # wrap the array in a pandas data frame
data.to_csv("clas_data.csv", index=False)  # write csv without the index column

# Scatter plot of column 1 against column 0 with error bars:
# column 2 is used as the y error and column 3 as the x error.
import matplotlib.pyplot as plt  # Import package

plt.figure()  # open a new figure
plt.errorbar(x=dat[:, 0],
             y=dat[:, 1],
             yerr=dat[:, 2],
             xerr=dat[:, 3],
             fmt='o')  # markers only, no connecting line
plt.title('Figure 1')  # add title
plt.show()  # display the figure
plt.clf()  # clear the current figure afterwards
# Notebook export: loads the Prob2 train/test csv files from a mounted
# Google Drive. The `!pip` line below is an IPython shell escape, so this
# file only runs inside a notebook (e.g. Colab), not as a plain Python script.
import numpy as np

# NOTE(review): the star import puts every numpy name into this namespace
# and can shadow builtins such as sum/min/max - confirm it is still needed.
from numpy import *

# Mount Google Drive so the csv files under "My Drive" are reachable.
from google.colab import drive
drive.mount('/content/gdrive')

from numpy import genfromtxt as gft



# IPython shell escape: installs scipy into the notebook environment.
!pip install scipy

#https://riptutorial.com/numpy/example/22990/reading-csv-files

# Training inputs and training targets, parsed as comma-delimited arrays.
X_train= gft("/content/gdrive/My Drive/P2/Prob2_Xtrain.csv", delimiter=",")

Y_train= gft("/content/gdrive/My Drive/P2/Prob2_ytrain.csv", delimiter=",")

# Bare expressions: in a notebook these display the arrays as cell output.
X_train

Y_train

# Test inputs and test targets, parsed the same way.
X_test = gft("/content/gdrive/My Drive/P2/Prob2_Xtest.csv", delimiter=",")

Y_test = gft("/content/gdrive/My Drive/P2/Prob2_ytest.csv", delimiter=",")

#Splitting X_train into its two associated output classes based on Y_train

# Starts out empty; the code that fills it is not part of this excerpt.
X_train_class0=[]
Beispiel #10
0
def run_Cell_BLAST(input_dir,output_dir,datafile,labfile,Rfile):
    '''
    Run CellBlast

    Trains Cell_BLAST on every cross validation fold defined in Rfile and
    writes the true labels, predicted labels, training times and test times
    as csv files into output_dir.

    Parameters
    ----------
    input_dir : directory of the input files. It is concatenated directly
    with the file names, so it must end with a path separator, and it should
    be absolute because the working directory is changed to it first.
    output_dir : directory of the output files
    datafile : name of the data file
    labfile : name of the label file
    Rfile : file to read the cross validation indices from (resolved
    relative to input_dir). It must define n_folds, Cells_to_Keep,
    col_Index, Test_Idx and Train_Idx.

    Notes
    -----
    The process working directory is changed to input_dir and then to
    output_dir, and it is left at output_dir when the function returns.
    The output file names embed str(col), the zero-based label column index
    array (for a one-element array this looks like "[0]").
    '''

    os.chdir(input_dir)

    # read the Rdata file
    robjects.r['load'](Rfile)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R indices are 1-based, numpy indices are 0-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the data and keep only the cells selected by the cross validation script
    data_old = cb.data.ExprDataSet.read_table(input_dir + datafile,orientation="cg", sep=",", index_col = 0, header = 0)
    data = cb.data.ExprDataSet(data_old.exprs[tokeep],data_old.obs.iloc[tokeep],data_old.var,data_old.uns)

    # read the labels once, as a string array (the former pandas read of the
    # label file was discarded immediately and has been removed)
    labels = gft(input_dir + labfile, dtype = "str", skip_header = 1, delimiter = ",", usecols = col)
    labels = labels[tokeep]

    # from here on relative paths (including the per-seed model paths
    # passed to fit_DIRECTi) resolve inside output_dir
    os.chdir(output_dir)

    truelab = []
    pred = []
    tr_time = []
    ts_time = []

    for i in range(int(np.squeeze(nfolds))):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        train = data[train_ind_i,:]
        test = data[test_ind_i,:]
        y_train = labels[train_ind_i]
        y_test = labels[test_ind_i]

        train.obs['cell_type'] = y_train

        # normalization of the training set is counted as training time
        start = tm.time()
        train = train.normalize()

        # reduce dimensions: four models that differ only in their random seed
        num_epoch = 50
        models = [
            cb.directi.fit_DIRECTi(train, latent_dim = 10, cat_dim=20, epoch=num_epoch, patience=10, random_seed = j, path="%d" % j)
            for j in range(4)
        ]

        # train model
        blast = cb.blast.BLAST(models, train).build_empirical()
        tr_time.append(tm.time()-start)

        # predict labels
        start = tm.time()
        test_pred = blast.query(test).annotate('cell_type')
        ts_time.append(tm.time()-start)

        truelab.extend(y_test)
        pred.extend(test_pred.values)

    #write results
    # NOTE(review): presumably a defensive re-chdir in case the library
    # changed the working directory during training - confirm.
    os.chdir(output_dir)

    prefix = "Cell_BLAST_" + str(col)
    results = (
        (pd.DataFrame(truelab), "_true.csv"),
        (pd.DataFrame(pred), "_pred.csv"),
        (pd.DataFrame(tr_time), "_training_time.csv"),
        (pd.DataFrame(ts_time), "_test_time.csv"),
    )
    for frame, suffix in results:
        frame.to_csv(prefix + suffix, index = False)

        
Beispiel #11
0
def run_Cell_BLAST(DataPath, LabelsPath, CV_RDataPath, OutputDir, GeneOrderPath = "", NumGenes = 0):
    '''
    run Cell_BLAST
    Wrapper script to run Cell_BLAST on a benchmark dataset using the cross
    validation folds stored in an RData file. Outputs lists of true and
    predicted cell labels as csv files, as well as computation time.

    Parameters
    ----------
    DataPath : Data file path (.csv), read as a comma separated table with
    the first column as index and the first row as header.
    LabelsPath : Cell population annotations file path (.csv).
    CV_RDataPath : Cross validation RData file path (.RData). It must define
    n_folds, Cells_to_Keep, col_Index, Test_Idx and Train_Idx.
    OutputDir : Output directory for the exported files; created (including
    missing parent directories) if it does not exist.
    GeneOrderPath : Gene order file path (.csv) with one column per cross
    validation fold. Only read when NumGenes > 0; default is an empty string.
    NumGenes : Number of genes used in case of feature selection (integer), default is 0
    (no feature selection).

    Outputs
    -------
    Writes Cell_BLAST_true.csv, Cell_BLAST_pred.csv,
    Cell_BLAST_training_time.csv and Cell_BLAST_test_time.csv under OutputDir.
    '''

    # read the Rdata file
    robjects.r['load'](CV_RDataPath)

    nfolds = np.array(robjects.r['n_folds'], dtype = 'int')
    tokeep = np.array(robjects.r['Cells_to_Keep'], dtype = 'bool')
    col = np.array(robjects.r['col_Index'], dtype = 'int')
    col = col - 1  # R indices are 1-based, numpy indices are 0-based
    test_ind = np.array(robjects.r['Test_Idx'])
    train_ind = np.array(robjects.r['Train_Idx'])

    # read the feature file
    if NumGenes > 0:
        features = pd.read_csv(GeneOrderPath,header=0,index_col=None, sep=',')

    # read the data (normalized right after loading) and keep only the
    # cells selected by the cross validation script
    data_old = cb.data.ExprDataSet.read_table(DataPath,orientation="cg", sep=",", index_col = 0, header = 0, sparsify = True).normalize()
    data = cb.data.ExprDataSet(data_old.exprs[tokeep],data_old.obs.iloc[tokeep],data_old.var,data_old.uns)

    # read the labels once, as a string array (the former pandas read of the
    # same file was discarded immediately and has been removed)
    labels = gft(LabelsPath, dtype = "str", skip_header = 1, delimiter = ",", usecols = col)
    labels = labels[tokeep]

    truelab = []
    pred = []
    tr_time = []
    ts_time = []

    for i in range(int(np.squeeze(nfolds))):
        test_ind_i = np.array(test_ind[i], dtype = 'int') - 1
        train_ind_i = np.array(train_ind[i], dtype = 'int') - 1

        train = data[train_ind_i,:]
        test = data[test_ind_i,:]
        y_train = labels[train_ind_i]
        y_test = labels[test_ind_i]

        if NumGenes > 0:
            # restrict both sets to the first NumGenes entries of this
            # fold's column in the gene order file
            feat_to_use = features.iloc[0:NumGenes,i]
            train = train[:,feat_to_use]
            test = test[:,feat_to_use]

        train.obs['cell_type'] = y_train

        start = tm.time()

        # reduce dimensions: four models that differ only in their random
        # seed; each one is given its own path named after the seed
        num_epoch = 50
        models = [
            cb.directi.fit_DIRECTi(train, epoch=num_epoch, patience=10, random_seed = j, path="%d" % j)
            for j in range(4)
        ]

        # train model; build_empirical() is not called on the BLAST object
        # here (the call was commented out)
        blast = cb.blast.BLAST(models, train)
        tr_time.append(tm.time()-start)

        # predict labels
        start = tm.time()
        test_pred = blast.query(test).annotate('cell_type')
        ts_time.append(tm.time()-start)

        truelab.extend(y_test)
        pred.extend(test_pred.values)

    #write results
    # makedirs with exist_ok avoids the check-then-create race and also
    # works when parent directories are missing.
    os.makedirs(OutputDir, exist_ok=True)

    results = (
        (pd.DataFrame(truelab), "true"),
        (pd.DataFrame(pred), "pred"),
        (pd.DataFrame(tr_time), "training_time"),
        (pd.DataFrame(ts_time), "test_time"),
    )
    for frame, suffix in results:
        frame.to_csv(str(Path(OutputDir+"/Cell_BLAST_" + suffix + ".csv")), index = False)