def svm_cv(argv):
    """Parse the arguments of the ``cv`` sub-command and run cross-validation.

    Expected command line (as printed by the usage message)::

        prog cv repeat C kernelname [kernelparameters] [arff|fasta] inputfiles
             outputfile [dna|protein] non(nucleotide|amino)converter

    ``argv[2]`` is converted with ``int`` and ``argv[3]`` with ``float``.
    The kernel description and the input files are consumed by
    ``parse.parse_kernel_param`` and ``parse.parse_input_file_train``; what
    they leave over must be:

    * for the 'spec' and 'wd' kernels: output file, sequence type and
      converter (exactly three arguments);
    * for the 'linear', 'gauss' and 'poly' kernels: the output file only.

    The outputs returned by ``crossvalidation`` are written to the output
    file as tab-separated rows (example index, output, split) below a
    ``#example output split`` header line.

    Any argument problem, or a failure to open the output file, is reported
    on stderr and terminates the process through ``sys.exit(-1)``.
    """
    assert argv[1] == 'cv'
    if len(argv) < 5:
        sys.stderr.write(
            "usage: %s cv repeat C kernelname [kernelparameters] [arff|fasta] "
            "inputfiles outputfile [dna|protein] "
            "non(nucleotide|amino)converter \n" % argv[0])
        sys.exit(-1)

    # parse input parameters
    cv = int(argv[2])
    C = float(argv[3])
    (kernelname, kparam, argv_rest) = parse.parse_kernel_param(argv[4:], False)
    (examples, labels, argv_rest) = parse.parse_input_file_train(kernelname, argv_rest)

    (seq_source, nuc_con) = ('', '')
    if kernelname == 'spec' or kernelname == 'wd':
        # Sequence kernels need: outputfile, sequence type, converter.
        if len(argv_rest) < 1:
            sys.stderr.write("outputfile [dna|protein] non(nucleotide|amino)converter are missing\n")
            sys.exit(-1)
        if len(argv_rest) < 2:
            sys.stderr.write("[dna|protein] non(nucleotide|amino)converter are missing\n")
            sys.exit(-1)
        if len(argv_rest) < 3:
            # Exactly two arguments: the last one is the sequence type, so
            # the message can name the kind of converter that is missing.
            if argv_rest[-1] == 'dna':
                sys.stderr.write("non-nucleotide converter like [A|T|C|G|R|Y|N] is missing. Cannot continue.\n")
                sys.exit(-1)
            elif argv_rest[-1] == 'protein':
                sys.stderr.write("non-amino acid converter like [G|P|A|V|L|I|M|C|F|Y|W|H|K|R|Q|N|E|D|S|T|random] is missing. Cannot continue.\n")
                sys.exit(-1)
            else:
                sys.stderr.write("Here expect FASTA sequence type as [dna|protein] instead of -" + argv_rest[-1] + "- Cannot continue.\n")
                sys.exit(-1)
        if len(argv_rest) > 3:
            sys.stderr.write("Too many arguments\n")
            sys.exit(-1)
        seq_source = argv_rest[1]
        nuc_con = argv_rest[2]

    if kernelname in ('linear', 'gauss', 'poly') and len(argv_rest) > 1:
        sys.stderr.write("Too many arguments\n")
        sys.exit(-1)

    # Whatever the kernel, an output file is required; checking here keeps a
    # kernel name not covered above from failing with an IndexError below.
    if len(argv_rest) < 1:
        sys.stderr.write("outputfile missing\n")
        sys.exit(-1)
    outfilename = argv_rest[0]

    utils.check_params(kparam, C, len(examples[0]))

    # run cross-validation
    (all_outputs, all_split) = crossvalidation(cv, kernelname, kparam, C, examples, labels, seq_source, nuc_con)

    # Only I/O failures are turned into the friendly message; a bare except
    # would also intercept SystemExit and KeyboardInterrupt.
    try:
        f = open(outfilename, 'w+')
    except IOError:
        sys.stderr.write('Fails to open the outputfile at ' + outfilename + ' Cannot continue.\n')
        sys.exit(-1)

    # Close the file even when one of the writes fails.
    try:
        f.write('#example\toutput\tsplit\n')
        for ix, output in enumerate(all_outputs):
            f.write('%d\t%2.7f\t%d\n' % (ix, output, all_split[ix]))
    finally:
        f.close()
def svm_eval(argv):
    """Parse the arguments of the ``eval`` sub-command and evaluate predictions.

    Expected command line (as printed by the usage message)::

        prog eval predictionfile [arff|fasta] inputfiles outputfile
             [roc|prc figure.png]

    The prediction file ``argv[2]`` is read with
    ``parse.parse_prediction_file`` and the labelled input with
    ``parse.parse_input_file_train``. The first left-over argument is the
    output file; when three arguments are left over, the second selects the
    plot type ('roc' or 'prc') and the third is the figure file name handed
    to ``evaluate``. The report string returned by ``evaluate`` is written to
    the output file.

    Any argument problem, or a failure to open the output file, is reported
    on stderr and terminates the process through ``sys.exit(-1)``.
    """
    # NOTE(review): svm_eval is defined a second time further down in this
    # file; if both live at module level, the later definition is the one
    # that stays bound to the name.
    assert argv[1] == 'eval'
    if len(argv) < 6:
        sys.stderr.write(
            "usage: %s eval predictionfile [arff|fasta] inputfiles outputfile [roc|prc figure.png]\n" % argv[0])
        sys.exit(-1)

    # parse input parameters
    (predictions, splitassignments) = parse.parse_prediction_file(argv[2])
    # Only the labels are needed for evaluation; the examples are discarded.
    (_trainex, trainlab, argv_rest) = parse.parse_input_file_train(None, argv[3:])
    if len(argv_rest) < 1:
        sys.stderr.write("Output file missing\n")
        sys.exit(-1)
    if len(argv_rest) > 3:
        sys.stderr.write("Too many arguments\n")
        sys.exit(-1)
    outfilename = argv_rest[0]

    roc_fname = None
    prc_fname = None
    if len(argv_rest) > 2:
        if argv_rest[1] == 'roc':
            roc_fname = argv_rest[2]
        elif argv_rest[1] == 'prc':
            prc_fname = argv_rest[2]
        else:
            sys.stderr.write('Usage: [roc|prc]\n')
            sys.exit(-1)

    # run the evaluation; only the textual report is written out
    (res_str, _mean_roc, _mean_prc, _mean_acc) = evaluate(
        predictions, splitassignments, trainlab, roc_fname, prc_fname)

    # write output file
    # Only I/O failures are turned into the friendly message; a bare except
    # would also intercept SystemExit and KeyboardInterrupt.
    try:
        f = open(outfilename, 'w')
    except IOError:
        sys.stderr.write('Fails to open the outputfile at ' + outfilename + ' Cannot continue.\n')
        sys.exit(-1)

    # Close the file even when the write fails.
    try:
        f.write(res_str)
    finally:
        f.close()
def svm_poim(argv):
    """Entry point of the ``poim`` sub-command: parse, train, plot POIMs.

    Expected command line (as printed by the usage message)::

        prog poim C poimdegree wd [kernelparameters] [arff|fasta] inputfiles
             poim.png [dna|protein] non(nucleotide|amino)converter

    After the kernel parameters and input files have been consumed by the
    ``parse`` helpers, exactly three arguments must remain: the figure file
    name, the sequence type and the converter. Argument problems are
    reported on stderr and end the process with ``sys.exit(-1)``.
    """
    def _abort(message):
        # Report a command-line problem on stderr and stop with status -1.
        sys.stderr.write(message)
        sys.exit(-1)

    assert argv[1] == 'poim'
    if len(argv) < 7:
        _abort("usage: %s poim C poimdegree wd [kernelparameters] [arff|fasta] inputfiles poim.png [dna|protein] non(nucleotide|amino)converter\n" % argv[0])

    # parse input parameters
    C = float(argv[2])
    poimdegree = int(argv[3])
    kernel_name, kernel_params, remaining = parse.parse_kernel_param(argv[4:], False)
    examples, labels, remaining = parse.parse_input_file_train(kernel_name, remaining)

    n_remaining = len(remaining)
    if n_remaining < 1:
        _abort("poim.png [dna|protein] non(nucleotide|amino)converter are missing\n")
    if n_remaining < 2:
        _abort("[dna|protein] non(nucleotide|amino)converter are missing\n")
    if n_remaining < 3:
        # Two arguments left: the last one is the sequence type, which
        # decides which kind of converter is reported as missing.
        seq_type = remaining[-1]
        if seq_type == 'dna':
            _abort("non-nucleotide converter like [A|T|C|G|R|Y|N] is missing. Cannot continue.\n")
        elif seq_type == 'protein':
            _abort("non-amino acid converter like [G|P|A|V|L|I|M|C|F|Y|W|H|K|R|Q|N|E|D|S|T|random] is missing. Cannot continue.\n")
        else:
            _abort("Here expect FASTA sequence type as [dna|protein] instead of -" + seq_type + "- Cannot continue.\n")
    if n_remaining > 3:
        _abort("Too many arguments\n")

    poimfilename = remaining[0]
    seq_source = remaining[1]
    nuc_con = remaining[2]

    utils.check_params(kernel_params, C, len(examples[0]))

    # train svm and compute POIMs
    svm, kernel, _feats_train, _preproc = train(
        examples, labels, C, kernel_name, kernel_params, seq_source, nuc_con)
    print("done with training ")
    seq_len = len(examples[0])
    poim, max_poim, diff_poim, poim_totalmass = compute_poims(
        svm, kernel, poimdegree, seq_len)

    # plot poims
    plots.plot_poims(poimfilename, poim, max_poim, diff_poim,
                     poim_totalmass, poimdegree, seq_len)
def svm_poim(argv):
    """Entry point of the ``poim`` sub-command: parse, train, plot POIMs.

    Expected command line (as printed by the usage message)::

        prog poim C poimdegree wd [kernelparameters] [arff|fasta] inputfiles
             poim.png [dna|protein] non(nucleotide|amino)converter

    After the kernel parameters and input files have been consumed by the
    ``parse`` helpers, exactly three arguments must remain: the figure file
    name, the sequence type and the converter. Argument problems are
    reported on stderr and end the process with ``sys.exit(-1)``.
    """
    def _abort(message):
        # Report a command-line problem on stderr and stop with status -1.
        sys.stderr.write(message)
        sys.exit(-1)

    assert argv[1] == 'poim'
    if len(argv) < 7:
        _abort("usage: %s poim C poimdegree wd [kernelparameters] [arff|fasta] inputfiles poim.png [dna|protein] non(nucleotide|amino)converter\n" % argv[0])

    # parse input parameters
    C = float(argv[2])
    poimdegree = int(argv[3])
    kernel_name, kernel_params, remaining = parse.parse_kernel_param(argv[4:], False)
    examples, labels, remaining = parse.parse_input_file_train(kernel_name, remaining)

    n_remaining = len(remaining)
    if n_remaining < 1:
        _abort("poim.png [dna|protein] non(nucleotide|amino)converter are missing\n")
    if n_remaining < 2:
        _abort("[dna|protein] non(nucleotide|amino)converter are missing\n")
    if n_remaining < 3:
        # Two arguments left: the last one is the sequence type, which
        # decides which kind of converter is reported as missing.
        seq_type = remaining[-1]
        if seq_type == 'dna':
            _abort("non-nucleotide converter like [A|T|C|G|R|Y|N] is missing. Cannot continue.\n")
        elif seq_type == 'protein':
            _abort("non-amino acid converter like [G|P|A|V|L|I|M|C|F|Y|W|H|K|R|Q|N|E|D|S|T|random] is missing. Cannot continue.\n")
        else:
            _abort("Here expect FASTA sequence type as [dna|protein] instead of -" + seq_type + "- Cannot continue.\n")
    if n_remaining > 3:
        _abort("Too many arguments\n")

    poimfilename = remaining[0]
    seq_source = remaining[1]
    nuc_con = remaining[2]

    utils.check_params(kernel_params, C, len(examples[0]))

    # train svm and compute POIMs
    svm, kernel, _feats_train, _preproc = train(
        examples, labels, C, kernel_name, kernel_params, seq_source, nuc_con)
    seq_len = len(examples[0])
    poim, max_poim, diff_poim, poim_totalmass = compute_poims(
        svm, kernel, poimdegree, seq_len)

    # plot poims
    plots.plot_poims(poimfilename, poim, max_poim, diff_poim,
                     poim_totalmass, poimdegree, seq_len)
def svm_eval(argv):
    """Parse the arguments of the ``eval`` sub-command and evaluate predictions.

    Expected command line (as printed by the usage message)::

        prog eval predictionfile [arff|fasta] inputfiles outputfile
             [roc|prc figure.png]

    The prediction file ``argv[2]`` is read with
    ``parse.parse_prediction_file`` and the labelled input with
    ``parse.parse_input_file_train``. The first left-over argument is the
    output file; when three arguments are left over, the second selects the
    plot type ('roc' or 'prc') and the third is the figure file name handed
    to ``evaluate``. The report string returned by ``evaluate`` is written to
    the output file.

    Any argument problem, or a failure to open the output file, is reported
    on stderr and terminates the process through ``sys.exit(-1)``.
    """
    # NOTE(review): an earlier definition of svm_eval exists in this file;
    # if both live at module level, this later one is the definition that
    # stays bound to the name.
    assert argv[1] == 'eval'
    if len(argv) < 6:
        sys.stderr.write(
            "usage: %s eval predictionfile [arff|fasta] inputfiles outputfile [roc|prc figure.png]\n" % argv[0])
        sys.exit(-1)

    # parse input parameters
    (predictions, splitassignments) = parse.parse_prediction_file(argv[2])
    # Only the labels are needed for evaluation; the examples are discarded.
    (_trainex, trainlab, argv_rest) = parse.parse_input_file_train(None, argv[3:])
    if len(argv_rest) < 1:
        sys.stderr.write("Output file missing\n")
        sys.exit(-1)
    if len(argv_rest) > 3:
        sys.stderr.write("Too many arguments\n")
        sys.exit(-1)
    outfilename = argv_rest[0]

    roc_fname = None
    prc_fname = None
    if len(argv_rest) > 2:
        if argv_rest[1] == 'roc':
            roc_fname = argv_rest[2]
        elif argv_rest[1] == 'prc':
            prc_fname = argv_rest[2]
        else:
            sys.stderr.write('Usage: [roc|prc]\n')
            sys.exit(-1)

    # run the evaluation; only the textual report is written out
    (res_str, _mean_roc, _mean_prc, _mean_acc) = evaluate(
        predictions, splitassignments, trainlab, roc_fname, prc_fname)

    # write output file
    # Only I/O failures are turned into the friendly message; a bare except
    # would also intercept SystemExit and KeyboardInterrupt.
    try:
        f = open(outfilename, 'w')
    except IOError:
        sys.stderr.write('Fails to open the outputfile at ' + outfilename + ' Cannot continue.\n')
        sys.exit(-1)

    # Close the file even when the write fails.
    try:
        f.write(res_str)
    finally:
        f.close()
def svm_modelsel(argv):
    """Parse the arguments of the ``modelsel`` sub-command and run model selection.

    Expected command line (as printed by the usage message)::

        prog modelsel repeat Cs kernelname [kernelparameters] [arff|fasta]
             inputfiles outputfile [dna|protein] non(nucleotide|amino)converter

    ``argv[2]`` is converted with ``int`` and ``argv[3]`` is turned into the
    list of C values by ``parse.parse_float_list``. Cross-validation followed
    by ``evaluate`` is run for every C and, when ``kparam["modelsel_name"]``
    is not None, for every value in ``kparam["modelsel_params"]`` of that
    kernel parameter (``kparam`` is updated in place with the value under
    test).

    The output file lists the best model(s) according to the ROC, PRC and
    accuracy measures, followed by a detailed table in which every score
    equal to the maximum of its column is prefixed with '+'.

    Any argument problem, an empty parameter grid, or a failure to open the
    output file is reported on stderr and terminates the process through
    ``sys.exit(-1)``.
    """
    assert argv[1] == 'modelsel'
    if len(argv) < 5:
        sys.stderr.write(
            "usage: %s modelsel repeat Cs kernelname [kernelparameters] "
            "[arff|fasta] inputfiles outputfile [dna|protein] "
            "non(nucleotide|amino)converter\n" % argv[0])
        sys.exit(-1)

    # parse input parameters
    cv = int(argv[2])
    Cs = parse.parse_float_list(argv[3])
    (kernelname, kparam, argv_rest) = parse.parse_kernel_param(argv[4:], True)
    (examples, labels, argv_rest) = parse.parse_input_file_train(kernelname, argv_rest)

    (seq_source, nuc_con) = ('', '')
    if kernelname == 'spec' or kernelname == 'wd':
        # Sequence kernels need: outputfile, sequence type, converter.
        if len(argv_rest) < 1:
            sys.stderr.write("outputfile [dna|protein] non(nucleotide|amino)converter are missing\n")
            sys.exit(-1)
        if len(argv_rest) < 2:
            sys.stderr.write("[dna|protein] non(nucleotide|amino)converter are missing\n")
            sys.exit(-1)
        if len(argv_rest) < 3:
            # Exactly two arguments: the last one is the sequence type, so
            # the message can name the kind of converter that is missing.
            if argv_rest[-1] == 'dna':
                sys.stderr.write("non-nucleotide converter like [A|T|C|G|R|Y|N] is missing. Cannot continue.\n")
                sys.exit(-1)
            elif argv_rest[-1] == 'protein':
                sys.stderr.write("non-amino acid converter like [G|P|A|V|L|I|M|C|F|Y|W|H|K|R|Q|N|E|D|S|T|random] is missing. Cannot continue.\n")
                sys.exit(-1)
            else:
                sys.stderr.write("Here expect FASTA sequence type as [dna|protein] instead of -" + argv_rest[-1] + "- Cannot continue.\n")
                sys.exit(-1)
        if len(argv_rest) > 3:
            sys.stderr.write("Too many arguments\n")
            sys.exit(-1)
        seq_source = argv_rest[1]
        nuc_con = argv_rest[2]

    if kernelname in ('linear', 'gauss', 'poly') and len(argv_rest) > 1:
        sys.stderr.write("Too many arguments\n")
        sys.exit(-1)

    # Whatever the kernel, an output file is required; checking here keeps a
    # kernel name not covered above from failing with an IndexError below.
    if len(argv_rest) < 1:
        sys.stderr.write("outputfile missing\n")
        sys.exit(-1)
    outfilename = argv_rest[0]

    # Grid of kernel-parameter values: a single None placeholder when only C
    # is tuned, otherwise the candidate values of the selected parameter.
    modelsel_name = kparam["modelsel_name"]
    if modelsel_name is None:
        kp_candidates = [None]
    else:
        kp_candidates = kparam["modelsel_params"]

    # run cross-validation for every parameter combination
    mean_rocs = []
    mean_prcs = []
    mean_accs = []
    all_Cs = []
    all_kparam = []
    for C in Cs:
        for kp in kp_candidates:
            if modelsel_name is not None:
                # also optimize one kernel parameter
                kparam[modelsel_name] = kp
            utils.check_params(kparam, C, len(examples[0]))
            (all_outputs, all_split) = crossvalidation(cv, kernelname, kparam, C, examples, labels, seq_source, nuc_con)
            (res_str, mean_roc, mean_prc, mean_acc) = evaluate(all_outputs, all_split, labels)
            mean_rocs.append(mean_roc)
            mean_prcs.append(mean_prc)
            mean_accs.append(mean_acc)
            all_Cs.append(C)
            all_kparam.append(kp)

    # numpy.max raises an opaque ValueError on an empty array; report the
    # actual problem instead.
    if not all_Cs:
        sys.stderr.write("No parameter combinations to evaluate. Cannot continue.\n")
        sys.exit(-1)

    max_roc = numpy.max(numpy.array(mean_rocs))
    max_prc = numpy.max(numpy.array(mean_prcs))
    max_acc = numpy.max(numpy.array(mean_accs))

    # The table header gets a column for the kernel parameter only when more
    # than one value of it was tried.
    if modelsel_name is None or len(kparam["modelsel_params"]) == 1:
        header = "\tC\tROC\tPRC\tAccuracy (at threshold 0)\n"
    else:
        header = "\tC\t%s\tROC\tPRC\tAccuracy (at threshold 0)\n" % modelsel_name

    # Build the whole report before touching the output file, so a failure
    # here does not leave a truncated file behind.
    detail_parts = [header]
    best_roc = []
    best_prc = []
    best_acc = []
    for i, C in enumerate(all_Cs):
        kp = all_kparam[i]
        # determine the best parameter combinations; '+' marks a score that
        # equals the maximum of its measure
        if mean_rocs[i] == max_roc:
            rocsym = '+'
            best_roc.append(model2str(kparam, C, kp) + '\n')
        else:
            rocsym = ' '
        if mean_prcs[i] == max_prc:
            prcsym = '+'
            best_prc.append(model2str(kparam, C, kp) + '\n')
        else:
            prcsym = ' '
        if mean_accs[i] == max_acc:
            accsym = '+'
            best_acc.append(model2str(kparam, C, kp) + '\n')
        else:
            accsym = ' '

        detail_parts.append(model2str(kparam, C, kp, False) + '\t')
        detail_parts.append('%c%2.1f%%\t%c%2.1f%%\t%c%2.1f%%\n' % (
            rocsym, 100 * mean_rocs[i],
            prcsym, 100 * mean_prcs[i],
            accsym, 100 * mean_accs[i]))

    # Only I/O failures are turned into the friendly message; a bare except
    # would also intercept SystemExit and KeyboardInterrupt.
    try:
        f = open(outfilename, 'w+')
    except IOError:
        sys.stderr.write('Fails to open the outputfile at ' + outfilename + ' Cannot continue.\n')
        sys.exit(-1)

    # Close the file even when one of the writes fails.
    try:
        f.write('Best model(s) according to ROC measure:\n%s' % ''.join(best_roc))
        f.write('\nBest model(s) according to PRC measure:\n%s' % ''.join(best_prc))
        f.write('\nBest model(s) according to accuracy measure:\n%s' % ''.join(best_acc))
        f.write('\nDetailed results:\n')
        f.write(''.join(detail_parts))
    finally:
        f.close()