def __init__(self,
             bindres_file,
             pssms_file,
             log_file,
             method,
             fold=5,
             undersampling=True,
             shuffle=True,
             maxEpochs_for_trainer=10,
             geneScale=(0, 10)):
    """Validate the settings, load the record files and store the ranges.

    Args:
        bindres_file: Binding-residue record file, handed to
            ``feature.parse_record_files``.
        pssms_file: PSSM record file, handed to
            ``feature.parse_record_files``.
        log_file: Stored as ``self.log_file``.
        method: Learner name; must be ``"neuralNetwork"``,
            ``"randomForest"`` or ``"SVM"``.
        fold: Stored as ``self.fold`` (presumably the number of
            cross-validation folds -- confirm against the caller).
        undersampling: Stored as ``self.undersampling``.
        shuffle: Stored as ``self.shuffle``.
        maxEpochs_for_trainer: Stored as ``self.maxEpochs_for_trainer``.
        geneScale: Pair ``(0, n)`` with ``n > 0``.

    Raises:
        ValueError: If ``geneScale`` is not ``(0, n)`` with ``n > 0``, or
            if ``method`` is not one of the supported learner names.
    """
    # Indexing is kept lazy (second element only read when the first is 0)
    # so malformed inputs fail exactly as they always did.
    if geneScale[0] != 0 or geneScale[1] <= geneScale[0]:
        raise ValueError(
            "Gene Scale in GA has to be (0, n). n is greater than 0.")

    supported_methods = ("neuralNetwork", "randomForest", "SVM")
    if method not in supported_methods:
        raise ValueError(
            "method must be neuralNetwork or randomForest or SVM [{}]".format(
                method))

    # Both record files are parsed eagerly; the results live on the instance.
    self.bindingResidueData, self.pssmData = feature.parse_record_files(
        bindres_file, pssms_file)

    self.log_file = log_file
    self.method = method
    self.fold = fold
    self.undersampling = undersampling
    self.shuffle = shuffle
    self.maxEpochs_for_trainer = maxEpochs_for_trainer

    # (low, high) range per hyper-parameter, one table per learner.
    # NOTE(review): presumably gene values in geneScale are mapped onto
    # these ranges elsewhere, and the SVM ranges look like exponents rather
    # than raw values -- confirm against the decoding code.
    self.SVMParamScales = {"cost": (-10, 10), "gamma": (-10, 5)}
    self.NNParamScales = {"node_num": (5, 50), "learning_rate": (0.01, 0.1)}
    self.RFParamScales = {"n_estimators": (101, 1001), "max_features": (2, 30)}
    self.windowSizeScales = (1, 19)
    self.geneScale = geneScale
    self.log = {}
def create_positive_and_negative_dataset(window_size, sequence_length):
    """Write three fixture proteins to /tmp and build datasets from them.

    Two files are (re)written on every call: ``/tmp/bindingData.txt`` with
    one line per protein (URI followed by space-separated integers), and
    ``/tmp/pssms.txt`` with a ``>URI`` header followed by
    ``sequence_length`` tab-separated rows per protein. The files are then
    parsed with ``feature.parse_record_files`` and turned into datasets with
    ``feature.create_dataset``.

    Args:
        window_size: Forwarded as the third argument of
            ``feature.create_dataset``.
        sequence_length: Number of PSSM rows written for each protein.

    Returns:
        The ``(positive_dataset, negative_dataset)`` pair produced by
        ``feature.create_dataset``.
    """
    bindres_file = "/tmp/bindingData.txt"
    pssms_file = "/tmp/pssms.txt"
    uri_prefix = "http://purl.uniprot.org/uniprot/"

    # (accession, integers written after the URI, diagonal score).
    # NOTE(review): the integers are presumably binding-residue indices --
    # confirm against feature.parse_record_files.
    records = (
        ("AAAAAA", "0 1 2", 1),
        ("BBBBBB", "9", 2),
        ("CCCCCC", "7 2", 3),
    )

    with open(bindres_file, "w") as fp:
        for accession, residues, _score in records:
            fp.write("{}{} {}\n".format(uri_prefix, accession, residues))

    with open(pssms_file, "w") as fp:
        for accession, _residues, score in records:
            fp.write(">{}{}\n".format(uri_prefix, accession))
            fp.write(_fixture_pssm_text(score, sequence_length) + "\n")

    bindingResidueData, pssmData = feature.parse_record_files(
        bindres_file, pssms_file)
    positive_dataset, negative_dataset = feature.create_dataset(
        bindingResidueData, pssmData, window_size)
    return positive_dataset, negative_dataset


def _fixture_pssm_text(score, sequence_length):
    """Return the PSSM body for one fixture protein.

    Each of the ``sequence_length`` rows has 40 tab-separated fields: the
    first 20 hold ``score`` where the column index equals the row index and
    ``-score`` elsewhere, the last 20 are all ``5``. Rows are joined with
    newlines, without a trailing newline.
    """
    on_diagonal, off_diagonal = str(score), str(-score)
    constant_tail = ['5'] * 20
    # range (not xrange) keeps this runnable on both Python 2 and Python 3.
    rows = [
        [on_diagonal if col == row else off_diagonal for col in range(20)]
        + constant_tail
        for row in range(sequence_length)
    ]
    return '\n'.join('\t'.join(fields) for fields in rows)
def __init__(self,
             bindres_file,
             pssms_file,
             log_file,
             method,
             fold=5,
             undersampling=True,
             shuffle=True,
             maxEpochs_for_trainer=10,
             geneScale=(0, 10)):
    """Check the arguments, parse the record files and record the settings.

    Args:
        bindres_file: Binding-residue record file, passed to
            ``feature.parse_record_files``.
        pssms_file: PSSM record file, passed to
            ``feature.parse_record_files``.
        log_file: Kept as ``self.log_file``.
        method: ``"neuralNetwork"``, ``"randomForest"`` or ``"SVM"``.
        fold: Kept as ``self.fold`` (presumably the cross-validation fold
            count -- confirm against the caller).
        undersampling: Kept as ``self.undersampling``.
        shuffle: Kept as ``self.shuffle``.
        maxEpochs_for_trainer: Kept as ``self.maxEpochs_for_trainer``.
        geneScale: ``(0, n)`` with ``n`` strictly greater than 0.

    Raises:
        ValueError: For a ``geneScale`` that does not start at 0 or whose
            upper bound is not above the lower one, and for an unsupported
            ``method``.
    """
    # The second element is only read when the first is 0, as before, so
    # the failure mode for malformed sequences is unchanged.
    if geneScale[0] != 0 or geneScale[1] <= geneScale[0]:
        raise ValueError(
            "Gene Scale in GA has to be (0, n). n is greater than 0.")

    known_methods = ("neuralNetwork", "randomForest", "SVM")
    if method not in known_methods:
        raise ValueError(
            "method must be neuralNetwork or randomForest or SVM [{}]".format(
                method))

    # Parsing happens up front; both results are stored on the instance.
    self.bindingResidueData, self.pssmData = feature.parse_record_files(
        bindres_file, pssms_file)

    self.log_file = log_file
    self.method = method
    self.fold = fold
    self.undersampling = undersampling
    self.shuffle = shuffle
    self.maxEpochs_for_trainer = maxEpochs_for_trainer

    # One (low, high) range per tunable parameter, grouped by learner.
    # NOTE(review): how genes are decoded into these ranges is not visible
    # here; the SVM bounds look like exponents -- confirm.
    self.SVMParamScales = {"cost": (-10, 10), "gamma": (-10, 5)}
    self.NNParamScales = {"node_num": (5, 50), "learning_rate": (0.01, 0.1)}
    self.RFParamScales = {"n_estimators": (101, 1001), "max_features": (2, 30)}
    self.windowSizeScales = (1, 19)
    self.geneScale = geneScale
    self.log = {}
# NOTE(review): the four statements below are the tail of a function whose
# `def` line lies outside this excerpt (presumably the SVM builder that
# create_classifier dispatches to -- confirm). They are kept unchanged.
    # Derived from window_size; not used in the visible remainder.
    indim = 21 * (2 * window_size + 1)
    # NOTE(review): assuming `svm` is scikit-learn's module, newer releases
    # spell this option "balanced" instead of 'auto' -- confirm against the
    # installed version.
    clf = svm.SVC(C=cost, gamma=gamma, class_weight='auto')
    clf.fit(train_dataset, train_labels)
    return clf


def create_classifier(method_and_genes, positive_dataset, negative_dataset):
    """Build a model with the builder that matches the requested method.

    Args:
        method_and_genes: Sequence whose first element is the method name
            ("neuralNetwork", "randomForest" or "SVM"); the remaining
            elements are passed to the builder as ``genes``.
        positive_dataset: Forwarded unchanged to the builder.
        negative_dataset: Forwarded unchanged to the builder.

    Returns:
        Whatever the selected ``create_*_classifier`` function returns.

    Raises:
        ValueError: If the method name is not one of the three above.
    """
    method, genes = method_and_genes[0], method_and_genes[1:]
    if method == "neuralNetwork":
        return create_NN_classifier(genes, positive_dataset, negative_dataset)
    elif method == "randomForest":
        return create_RF_classifier(genes, positive_dataset, negative_dataset)
    elif method == "SVM":
        return create_SVM_classifier(genes, positive_dataset, negative_dataset)
    else:
        raise ValueError("method must be neuralNetwork or randomForest or SVM [{}]".format(method))


if __name__ == "__main__":
    # Command-line arguments are parsed from the module docstring by docopt.
    arguments = docopt(__doc__)
    best_chromosome_file = arguments['<best_chromosome_file>']
    bindres_file = arguments['<binding_residue_file>']
    pssms_file = arguments['<pssms_file>']
    output_pickled_model_file = arguments['<output_pickled_model_file>']

    method_and_genes = common.get_method_and_genes(best_chromosome_file)
    bindingResidueData, pssmData = feature.parse_record_files(bindres_file, pssms_file)
    # Element 3 is converted to int and passed as the third argument of
    # feature.create_dataset (presumably the window size -- confirm).
    positive_dataset, negative_dataset = feature.create_dataset(bindingResidueData, pssmData, int(method_and_genes[3]))
    clf_or_net = create_classifier(method_and_genes, positive_dataset, negative_dataset)
    # Persist the trained model in binary pickle form.
    with open(output_pickled_model_file, 'wb') as fp:
        pickle.dump(clf_or_net, fp)
def create_classifier(method_and_genes, positive_dataset, negative_dataset):
    """Train a model using the builder named by the first chromosome entry.

    Args:
        method_and_genes: Sequence starting with the method name
            ("neuralNetwork", "randomForest" or "SVM"); everything after
            it is handed to the builder as the gene values.
        positive_dataset: Passed through to the builder.
        negative_dataset: Passed through to the builder.

    Returns:
        The object returned by the chosen ``create_*_classifier`` function.

    Raises:
        ValueError: When the method name matches none of the three builders.
    """
    method = method_and_genes[0]
    genes = method_and_genes[1:]

    # Pick the builder first, then call it once; each builder name is only
    # looked up on the branch that needs it.
    if method == "neuralNetwork":
        build = create_NN_classifier
    elif method == "randomForest":
        build = create_RF_classifier
    elif method == "SVM":
        build = create_SVM_classifier
    else:
        raise ValueError(
            "method must be neuralNetwork or randomForest or SVM [{}]".format(
                method))
    return build(genes, positive_dataset, negative_dataset)


if __name__ == "__main__":
    # Script entry: read the CLI arguments described by the module docstring.
    arguments = docopt(__doc__)
    best_chromosome_file = arguments['<best_chromosome_file>']
    bindres_file = arguments['<binding_residue_file>']
    pssms_file = arguments['<pssms_file>']
    output_pickled_model_file = arguments['<output_pickled_model_file>']

    method_and_genes = common.get_method_and_genes(best_chromosome_file)

    # Parse the records, then build the datasets with the value stored at
    # index 3 of the chromosome as create_dataset's third argument.
    parsed_records = feature.parse_record_files(bindres_file, pssms_file)
    bindingResidueData, pssmData = parsed_records
    window_size = int(method_and_genes[3])
    datasets = feature.create_dataset(bindingResidueData, pssmData,
                                      window_size)
    positive_dataset, negative_dataset = datasets

    clf_or_net = create_classifier(method_and_genes, positive_dataset,
                                   negative_dataset)

    # Store the trained model as a binary pickle.
    with open(output_pickled_model_file, 'wb') as fp:
        pickle.dump(clf_or_net, fp)