Esempio n. 1
0
def org_classification():
    '''
    Cross-validate an MLP on the restricted 6-mer "org" dataset.

    The sparse 6-mer matrix is loaded and densified, the labels are read
    from a text file, and a DNNMutliclass16S model is cross-validated with
    10 folds, 30 epochs and a batch size of 100 on GPU device '2'.
    All paths are hard-coded relative to the working directory.
    '''
    kmer_matrix_path = '../../datasets/processed_data/org/K/6-mer_org_restrictedkmer.npz'
    label_path = '../../datasets/processed_data/org/K/org_label_restrictedkmer.txt'
    # presumably alternating layer widths and dropout rates -- confirm
    # against DNNMutliclass16S
    layer_spec = [1024, 0.2, 256, 0.1, 256, 0.1, 128, 0.1, 64]

    features = FileUtility.load_sparse_csr(kmer_matrix_path).toarray()
    labels = FileUtility.load_list(label_path)

    network = DNNMutliclass16S(features, labels, model_arch=layer_spec)
    network.cross_validation(
        '../../datasets/results/org/classifier/nn',
        gpu_dev='2',
        n_fold=10,
        epochs=30,
        batch_size=100,
        model_strct='mlp',
    )
 def __init__(self,
              fasta_file,
              matrix_path,
              feature_file_path,
              phenotypes,
              phenotype_mapping,
              selected_samples,
              p_value_threshold=0.01,
              remove_redundants=False,
              num_p=4,
              blastn_path=''):
     '''
     Load the marker matrix, its feature list and the taxonomy index, then
     run marker alignment and redundant-column identification.

     :param fasta_file: FASTA file whose sequence ids are read into ``self.seq_IDS``
     :param matrix_path: path of the sparse matrix loaded into ``self.mat``
     :param feature_file_path: path of the list loaded into ``self.features``
     :param phenotypes: stored as ``self.phenotypes``
     :param phenotype_mapping: stored as ``self.phenotype_mapping``
     :param selected_samples: row selector applied to the loaded matrix
     :param p_value_threshold: forwarded to ``align_markers_parallel``
     :param remove_redundants: stored as ``self.remove_redundants``
     :param num_p: stored as ``self.num_p`` (presumably a worker count -- confirm)
     :param blastn_path: optional directory appended to the ``PATH``
         environment variable; skipped when empty
     '''
     if blastn_path:
         # os.pathsep keeps the PATH entry valid on platforms where the
         # separator is not ':'.
         os.environ['PATH'] += os.pathsep + blastn_path
     self.num_p = num_p
     self.seq_IDS = FileUtility.read_fasta_sequences_ids(fasta_file)
     self.remove_redundants = remove_redundants

     # Each line is "<id> <rank;rank;...>": the first whitespace-separated
     # token is the key, the second is split on ';'.
     taxa_by_id = {}
     for entry in FileUtility.load_list('db/ez_idx_taxonomy.txt'):
         fields = entry.split()
         taxa_by_id[fields[0]] = fields[1].split(';')
     self.ez_taxa_dict = taxa_by_id

     # Select the rows on the sparse matrix directly instead of
     # materialising the whole matrix as a dense array first.
     self.mat = csr_matrix(
         FileUtility.load_sparse_csr(matrix_path))[selected_samples, :]
     # Canonicalise so the result matches what a dense round-trip produced:
     # merged duplicates, sorted indices, no explicitly stored zeros.
     self.mat.sum_duplicates()
     self.mat.eliminate_zeros()

     self.features = FileUtility.load_list(feature_file_path)
     self.align_markers_parallel(p_value_threshold)
     self.redundant_columns_indentification()
     # NOTE(review): the phenotype attributes are only set after the two
     # calls above; that ordering is kept from the original.
     self.phenotype_mapping = phenotype_mapping
     self.phenotypes = phenotypes
Esempio n. 3
0
def eco_all_classification():
    '''
    Cross-validate an MLP on the restricted 6-mer "eco all classes" dataset.

    Loads the densified 6-mer matrix and its labels from hard-coded paths
    and runs DNNMutliclass16S.cross_validation with 10 folds, 20 epochs and
    a batch size of 10 on GPU device '1'.
    '''
    data_root = '../../datasets/processed_data/eco_all_classes/'
    # Alternative layer spec kept for reference:
    # [1024,0.2,256,0.1,256,0.1,128,0.1,64]
    layer_spec = [1024, 0.2, 512, 0.2, 512, 0.1, 256]

    features = FileUtility.load_sparse_csr(
        data_root + '6-mer_eco_restrictedmer_all.npz').toarray()
    labels = FileUtility.load_list(
        data_root + 'eco_label_restrictedkmer_all.txt')

    network = DNNMutliclass16S(features, labels, model_arch=layer_spec)
    network.cross_validation(
        '../../datasets/results/eco_all/nn',
        gpu_dev='1',
        n_fold=10,
        epochs=20,
        batch_size=10,
        model_strct='mlp',
    )
Esempio n. 4
0
def eco_all_classification_transfer_learning():
    '''
    Cross-validate on the "eco all classes" dataset starting from a
    pickled model.

    Same input data as the plain eco-all run, but cross_validation is called
    with pretrained_model=True, trainable=False and model_strct pointing at
    a pickle file; 5 folds, 10 epochs, batch size 10, GPU device '6'.
    NOTE(review): presumably the pickled layers are loaded and frozen --
    confirm against DNNMutliclass16S.cross_validation.
    '''
    data_root = '../../datasets/processed_data/eco_all_classes/'
    pretrained_layers = ('../../datasets/results/eco_10000/classifiers/'
                         'nn_layers_mlp_1024-0.2-512-0.2-512_0.88.pickle')
    # Alternative layer spec kept for reference:
    # [1024,0.2,256,0.1,256,0.1,128,0.1,64]
    layer_spec = [512, 0.1, 256, 0.1, 128]

    features = FileUtility.load_sparse_csr(
        data_root + '6-mer_eco_restrictedmer_all.npz').toarray()
    labels = FileUtility.load_list(
        data_root + 'eco_label_restrictedkmer_all.txt')

    network = DNNMutliclass16S(features, labels, model_arch=layer_spec)
    network.cross_validation(
        '../../datasets/results/eco_all/nn',
        gpu_dev='6',
        pretrained_model=True,
        trainable=False,
        n_fold=5,
        epochs=10,
        batch_size=10,
        model_strct=pretrained_layers,
    )
Esempio n. 5
0
def crohns_disease():
    '''
    Cross-validate an MLP on the Crohn's disease 6-mer dataset.

    Loads the densified 6-mer matrix and the disease labels from hard-coded
    paths and runs DNNMutliclass16S.cross_validation with 3 folds,
    25 epochs and a batch size of 10 on GPU device '2'.
    '''
    crohn_root = '../../datasets/processed_data/crohn/'
    # Alternative layer spec kept for reference:
    # [1024,0.2,256,0.1,256,0.1,128,0.1,64]
    layer_spec = [512, 0.2, 256, 0.2, 128, 0.1, 64, 16]

    features = FileUtility.load_sparse_csr(
        crohn_root + 'sample-size/6-mers_rate_complete1359_seq_5000.npz'
    ).toarray()
    labels = FileUtility.load_list(
        crohn_root + 'data_config/labels_disease_complete1359.txt')

    network = DNNMutliclass16S(features, labels, model_arch=layer_spec)
    network.cross_validation(
        '../../datasets/results/crohn/classifier/nn',
        gpu_dev='2',
        n_fold=3,
        epochs=25,
        batch_size=10,
        model_strct='mlp',
    )
Esempio n. 6
0
def test():
    '''
    Cross-validate an MLP on the body-sites NPE representation.

    Loads the densified matrix and phenotype labels from hard-coded paths
    under ``../body-sites`` and runs DNNMutliclass16S.cross_validation with
    3 folds, 300 epochs and a batch size of 10 on GPU device '2'.
    '''
    matrix_path = '../body-sites/npe_rate_5000.npz'
    label_path = '../body-sites/npe_representations_labels/labels_phen.txt'
    layer_spec = [512, 0.2, 256, 0.2, 128, 0.1, 64]

    features = FileUtility.load_sparse_csr(matrix_path).toarray()
    labels = FileUtility.load_list(label_path)

    network = DNNMutliclass16S(features, labels, model_arch=layer_spec)
    cv_settings = dict(gpu_dev='2',
                       n_fold=3,
                       epochs=300,
                       batch_size=10,
                       model_strct='mlp')
    network.cross_validation('../body-sites/nn', **cv_settings)
Esempio n. 7
0
 def DNN_classifier(out_dir, X_file, Y_file, arch, gpu_id, epochs,
                    batch_size):
     '''
     Run 10-fold cross-validation of an MLP on a k-mer matrix.

     :param out_dir: first argument handed to ``cross_validation``
         (presumably where results are written -- confirm)
     :param X_file: path of the sparse feature matrix; it is densified
     :param Y_file: path of the label list; every entry is cast to int
     :param arch: forwarded as ``model_arch``
     :param gpu_id: forwarded as ``gpu_dev``
     :param epochs: forwarded as ``epochs``
     :param batch_size: forwarded as ``batch_size``
     '''
     features = FileUtility.load_sparse_csr(X_file).toarray()
     labels = list(map(int, FileUtility.load_list(Y_file)))

     network = DNN(features, labels, model_arch=arch)
     cv_settings = dict(gpu_dev=gpu_id,
                        n_fold=10,
                        epochs=epochs,
                        batch_size=batch_size,
                        model_strct='mlp')
     network.cross_validation(out_dir, **cv_settings)
 def __init__(self, X_file, Y_file, features_file, path, selected_samples):
     '''
     Load the feature matrix, labels and feature names.

     :param X_file: path of the sparse matrix loaded into ``self.X``
     :param Y_file: path of the label list; entries are cast to int
     :param features_file: path of the list loaded into ``self.features``
     :param path: stored unchanged as ``self.path``
     :param selected_samples: row selector applied to the loaded matrix
     '''
     # Select the rows on the sparse matrix directly instead of
     # materialising the whole matrix as a dense array first.
     self.X = csr_matrix(
         FileUtility.load_sparse_csr(X_file))[selected_samples, :]
     # Canonicalise so the result matches what a dense round-trip produced:
     # merged duplicates, sorted indices, no explicitly stored zeros.
     self.X.sum_duplicates()
     self.X.eliminate_zeros()

     # NOTE(review): the labels are not filtered by selected_samples;
     # presumably Y_file already matches the selected rows -- confirm.
     self.Y = [int(label) for label in FileUtility.load_list(Y_file)]
     self.features = FileUtility.load_list(features_file)
     self.path = path
Esempio n. 9
0
 def DNN_classifier(X_file, Y_file, arch, out_dir, dataset_name, gpu_id,
                    epochs, batch_size):
     '''
     Run 10-fold cross-validation of an MLP on a k-mer matrix.

     :param X_file: path of the sparse feature matrix; it is densified
     :param Y_file: path of the label list (used as loaded)
     :param arch: forwarded as ``model_arch``
     :param out_dir: prefix of the path handed to ``cross_validation``;
         it is concatenated as-is, without adding a separator
     :param dataset_name: suffix appended after
         ``'nn_classification_results_'``
     :param gpu_id: forwarded as ``gpu_dev``
     :param epochs: forwarded as ``epochs``
     :param batch_size: forwarded as ``batch_size``
     '''
     features = FileUtility.load_sparse_csr(X_file).toarray()
     labels = FileUtility.load_list(Y_file)

     network = DNNMutliclass16S(features, labels, model_arch=arch)
     results_prefix = out_dir + 'nn_classification_results_' + dataset_name
     network.cross_validation(results_prefix,
                              gpu_dev=gpu_id,
                              n_fold=10,
                              epochs=epochs,
                              batch_size=batch_size,
                              model_strct='mlp')
Esempio n. 10
0
 def load_data(self, prefix_list=None):
     '''
     Load the feature matrix, feature names and strain list for each prefix.

     For every prefix the files ``<representation_path><prefix>_feature_vect.npz``,
     ``..._feature_list.txt`` and ``..._strains_list.txt`` are read and
     stored in ``self.X``, ``self.feature_names`` and ``self.strains``
     under that prefix.

     :param prefix_list: iterable of prefixes; ``None`` loads nothing
     :return: None
     '''
     # None is the advertised default, so treat it as "nothing to load"
     # instead of failing when it is iterated.
     if prefix_list is None:
         return
     for save_pref in prefix_list:
         base = self.representation_path + save_pref
         vect_path = '_'.join([base, 'feature', 'vect.npz'])
         print('@@@' + vect_path)
         self.X[save_pref] = FileUtility.load_sparse_csr(vect_path)
         self.feature_names[save_pref] = FileUtility.load_list(
             '_'.join([base, 'feature', 'list.txt']))
         self.strains[save_pref] = FileUtility.load_list(
             '_'.join([base, 'strains', 'list.txt']))
Esempio n. 11
0
 def load_data(self, dir, prefix_list):
     '''
     Load the feature matrix, feature names and isolate list for each prefix.

     For every prefix the files ``<dir><prefix>_feature_vect.npz``,
     ``..._feature_list.txt`` and ``..._isolates_list.txt`` are read and
     stored in ``self.X``, ``self.feature_names`` and ``self.isolates``
     under that prefix.

     :param dir: path prefix, concatenated as-is with each entry of
         ``prefix_list`` (no separator is added)
     :param prefix_list: iterable of prefixes to load
     :return: None
     '''
     for save_pref in prefix_list:

         def artifact(kind, file_name):
             # <dir><prefix>_<kind>_<file_name>
             return '_'.join([dir + save_pref, kind, file_name])

         print('@@@' + artifact('feature', 'vect.npz'))
         matrix = FileUtility.load_sparse_csr(artifact('feature', 'vect.npz'))
         self.X[save_pref] = matrix
         names = FileUtility.load_list(artifact('feature', 'list.txt'))
         self.feature_names[save_pref] = names
         isolates = FileUtility.load_list(artifact('isolates', 'list.txt'))
         self.isolates[save_pref] = isolates
Esempio n. 12
0
    def classical_classifier(out_dir, X_file, Y_file, model, cores):
        '''
        Tune and evaluate a classical classifier on a sparse feature matrix.

        :param out_dir: handed to ``tune_and_eval``; per the original notes,
            the best parameters, confusion matrix, best estimator and
            per-fold results are stored at this address
        :param X_file: path of the sparse feature matrix (kept sparse)
        :param Y_file: path of the label list; every entry is cast to int
        :param model: 'RF' (random forest), 'SVM' (support vector machine)
            or 'LR' (logistic regression); any other value does nothing
            after the data has been loaded
        :param cores: forwarded as ``njobs``
        '''
        features = FileUtility.load_sparse_csr(X_file)
        labels = list(map(int, FileUtility.load_list(Y_file)))

        # Resolve the classifier class first; unknown model names fall
        # through without training anything.
        if model == 'RF':
            classifier_cls = RFClassifier
        elif model == 'SVM':
            classifier_cls = SVM
        elif model == 'LR':
            classifier_cls = LogRegression
        else:
            return

        classifier = classifier_cls(features, labels)
        classifier.tune_and_eval(out_dir, njobs=cores)
Esempio n. 13
0
    def classical_classifier(X_file, Y_file, model, out_dir, dataset_name,
                             cores):
        '''
        Tune and evaluate a random forest or an SVM on a sparse feature matrix.

        :param X_file: path of the sparse feature matrix (kept sparse)
        :param Y_file: path of the label list (used as loaded)
        :param model: 'RF' selects the random forest; every other value
            selects the support vector machine
        :param out_dir: directory part of the results address
        :param dataset_name: appended after ``'/classification_results_'``
        :param cores: forwarded as ``n_jobs``
        '''
        features = FileUtility.load_sparse_csr(X_file)
        labels = FileUtility.load_list(Y_file)

        use_random_forest = model == 'RF'
        classifier = (RFClassifier if use_random_forest else SVM)(features,
                                                                  labels)
        # Per the original notes, the best parameters, confusion matrix,
        # best estimator and per-fold results are stored at this address.
        classifier.tune_and_eval(
            out_dir + '/classification_results_' + dataset_name,
            n_jobs=cores)
Esempio n. 14
0
    def plot_res(file_address,
                 X_addr,
                 features_addr,
                 selected_addr,
                 label_addr,
                 labels=['Negative', 'Positive']):
        '''
        Save a two-panel t-SNE figure: all features vs. the selected markers.

        The left panel embeds the full matrix, the right panel embeds only
        the columns whose feature name appears in the selected list. Both
        embeddings come from ``DiTaxaWorkflow.get_tsne`` and are drawn with
        ``DiTaxaWorkflow.plot_scatter``.

        Side effect: (re)binds the module-level ``color_schemes`` list.

        :param file_address: path passed to ``plt.savefig``
        :param X_addr: path of the sparse feature matrix
        :param features_addr: path of the list of feature names; positions
            in this list are used as column indices of the matrix
        :param selected_addr: path of the list of selected feature names;
            names missing from the feature list are ignored
        :param label_addr: path of the per-sample label list
        :param labels: display names for the label keys '0' and '1'
            (the default list is only read, never mutated)
        '''
        global color_schemes
        hex_palette = [
            '#ff0505', '#f2a041', '#cdff05', '#04d9cb', '#45a8ff',
            '#8503a6', '#590202', '#734d02', '#4ab304', '#025359',
            '#0454cc', '#ff45da', '#993829', '#ffda45', '#1c661c',
            '#05cdff', '#1c2f66', '#731f57', '#b24a04', '#778003',
            '#0e3322', '#024566', '#0404d9', '#e5057d', '#66391c',
            '#31330e', '#3ee697', '#2d7da6', '#20024d', '#33011c'
        ]
        # Named colours, in the same order the original name->hex table
        # listed them; only the names were ever used.
        named_colors = [
            'aliceblue', 'antiquewhite', 'aqua', 'aquamarine', 'azure',
            'beige', 'bisque', 'black', 'blanchedalmond', 'blue',
            'blueviolet', 'brown', 'burlywood', 'cadetblue', 'chartreuse',
            'chocolate', 'coral', 'cornflowerblue', 'cornsilk', 'crimson',
            'cyan', 'darkblue', 'darkcyan', 'darkgoldenrod', 'darkgray',
            'darkgreen', 'darkkhaki', 'darkmagenta', 'darkolivegreen',
            'darkorange', 'darkorchid', 'darkred', 'darksalmon',
            'darkseagreen', 'darkslateblue', 'darkslategray',
            'darkturquoise', 'darkviolet', 'deeppink', 'deepskyblue',
            'dimgray', 'dodgerblue', 'firebrick', 'floralwhite',
            'forestgreen', 'fuchsia', 'gainsboro', 'ghostwhite', 'gold',
            'goldenrod', 'gray', 'green', 'greenyellow', 'honeydew',
            'hotpink', 'indianred', 'indigo', 'ivory', 'khaki', 'lavender',
            'lavenderblush', 'lawngreen', 'lemonchiffon', 'lightblue',
            'lightcoral', 'lightcyan', 'lightgoldenrodyellow', 'lightgreen',
            'lightgray', 'lightpink', 'lightsalmon', 'lightseagreen',
            'lightskyblue', 'lightslategray', 'lightsteelblue',
            'lightyellow', 'lime', 'limegreen', 'linen', 'magenta',
            'maroon', 'mediumaquamarine', 'mediumblue', 'mediumorchid',
            'mediumpurple', 'mediumseagreen', 'mediumslateblue',
            'mediumspringgreen', 'mediumturquoise', 'mediumvioletred',
            'midnightblue', 'mintcream', 'mistyrose', 'moccasin',
            'navajowhite', 'navy', 'oldlace', 'olive', 'olivedrab',
            'orange', 'orangered', 'orchid', 'palegoldenrod', 'palegreen',
            'paleturquoise', 'palevioletred', 'papayawhip', 'peachpuff',
            'peru', 'pink', 'plum', 'powderblue', 'purple', 'red',
            'rosybrown', 'royalblue', 'saddlebrown', 'salmon', 'sandybrown',
            'seagreen', 'seashell', 'sienna', 'silver', 'skyblue',
            'slateblue', 'slategray', 'snow', 'springgreen', 'steelblue',
            'tan', 'teal', 'thistle', 'tomato', 'turquoise', 'violet',
            'wheat', 'white', 'whitesmoke', 'yellow', 'yellowgreen'
        ]
        color_schemes = [
            ['green', 'blue', 'red', 'gold', 'cyan'],
            hex_palette + named_colors,
            # separate list object, as in the original literal
            list(hex_palette),
        ]

        X = FileUtility.load_sparse_csr(X_addr)
        features = FileUtility.load_list(features_addr)
        features_selected = FileUtility.load_list(selected_addr)

        # Map each feature name to its first position once, instead of a
        # linear `in` test plus a linear `.index()` per selected feature.
        first_position = {}
        for position, feature in enumerate(features):
            first_position.setdefault(feature, position)
        idx = [
            first_position[name] for name in features_selected
            if name in first_position
        ]

        X_selected = X[:, idx]
        Y = FileUtility.load_list(label_addr)
        X_tsne = DiTaxaWorkflow.get_tsne(X)
        X_red_tsne = DiTaxaWorkflow.get_tsne(X_selected)

        f = plt.figure(figsize=(16, 8))
        try:
            panels = [
                (f.add_subplot(121), X_tsne,
                 '(i) t-SNE over NPE representations'),
                (f.add_subplot(122), X_red_tsne,
                 '(ii) t-SNE over selected markers'),
            ]
            for axis, embedding, title in panels:
                # 't-SNE 1' / 't-SNE 0' are passed in the original order;
                # presumably they are the axis labels -- confirm against
                # plot_scatter.
                DiTaxaWorkflow.plot_scatter(axis,
                                            embedding,
                                            Y,
                                            't-SNE 1',
                                            't-SNE 0',
                                            title,
                                            legend_hide=False,
                                            legend_loc=9,
                                            legend_size=10,
                                            label_dict={
                                                '0': labels[0],
                                                '1': labels[1]
                                            },
                                            color_schemes_idx=0)
            plt.savefig(file_address)
        finally:
            # Release the figure even when plotting or saving fails.
            plt.close(f)