Esempio n. 1
0
def svm_pred(argv):
    """Parse the 'pred' command line, train an SVM and write test predictions.

    Expected layout of ``argv``::

        prog pred C kernelname kernelparameters [arff|fasta] inputfiles
            outputfile [dna|protein] non(nucleotide|amino)converter

    ``argv[1]`` must be ``'pred'`` (asserted).  For the 'spec' and 'wd'
    kernels the output file must be followed by the sequence type and the
    converter; for 'linear', 'poly' and 'gauss' only the output file may
    follow the input files.

    The output file is tab-separated: a header line, then one line per
    element of the result of ``train_and_test`` holding its 0-based index
    and its value.

    Usage problems and a failure to open the output file are reported on
    stderr and terminate the process through ``sys.exit(-1)``.
    """

    assert(argv[1] == 'pred')
    if len(argv) < 6:
        sys.stderr.write("usage: %s pred C kernelname kernelparameters [arff|fasta] inputfiles  outputfile [dna|protein] non(nucleotide|amino)converter\n" % argv[0])
        sys.exit(-1)

    # parse input parameters
    C = float(argv[2])
    (kernelname, kparam, argv_rest) = parse.parse_kernel_param(argv[3:], False)
    (trainex, trainlab, testex, argv_rest) = parse.parse_input_file_train_test(kernelname, argv_rest)

    # Validate what is left after the input files; the first problem found
    # is stored in `error` and reported once below.
    (seq_source, nuc_con) = ('', '')
    error = None
    n_rest = len(argv_rest)
    if kernelname in ('spec', 'wd'):
        if n_rest < 1:
            error = "outputfile [dna|protein] non(nucleotide|amino)converter are missing\n"
        elif n_rest < 2:
            error = "[dna|protein] non(nucleotide|amino)converter are missing\n"
        elif n_rest < 3:
            # Exactly two arguments: the last one should be the sequence
            # type, so the converter is what is missing (or the type is bad).
            if argv_rest[-1] == 'dna':
                error = "non-nucleotide converter like [A|T|C|G|R|Y|N] is missing. Cannot continue.\n"
            elif argv_rest[-1] == 'protein':
                error = "non-amino acid converter like [G|P|A|V|L|I|M|C|F|Y|W|H|K|R|Q|N|E|D|S|T|random] is missing. Cannot continue.\n"
            else:
                error = "Here expect FASTA sequence type as [dna|protein] instead of -" + argv_rest[-1] + "- Cannot continue.\n"
        elif n_rest > 3:
            error = "Too many arguments\n"
        else:
            seq_source = argv_rest[1]
            nuc_con = argv_rest[2]
    elif kernelname in ('linear', 'poly', 'gauss'):
        if n_rest < 1:
            error = "outputfile missing\n"
        elif n_rest > 1:
            error = "Too many arguments\n"

    if error is not None:
        sys.stderr.write(error)
        sys.exit(-1)

    outfilename = argv_rest[0]

    utils.check_params(kparam, C, len(trainex[0]))

    # run training and testing
    svmout = train_and_test(trainex, trainlab, testex, C, kernelname, kparam, seq_source, nuc_con)

    # write output file
    try:
        f = open(outfilename, 'w')
    except (IOError, OSError):
        # Only I/O failures are handled here; a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        sys.stderr.write('Fails to open the outputfile at ' + outfilename + ' Cannot continue.\n')
        sys.exit(-1)

    # The with-block closes the file even if a write fails part-way.
    with f:
        f.write('#example\toutput\n')
        for ix, output in enumerate(svmout):
            f.write(str(ix) + '\t' + str(output) + '\n')
Esempio n. 2
0
def svm_cv(argv):
    """Parse the 'cv' command line, run cross-validation and write the outputs.

    Expected layout of ``argv``::

        prog cv repeat C kernelname [kernelparameters] [arff|fasta] inputfiles
            outputfile [dna|protein] non(nucleotide|amino)converter

    ``argv[1]`` must be ``'cv'`` (asserted).  For the 'spec' and 'wd'
    kernels the output file must be followed by the sequence type and the
    converter; for 'linear', 'gauss' and 'poly' only the output file may
    follow the input files.

    The output file is tab-separated: a header line, then one line per
    cross-validation output holding its 0-based index, the output value
    and the matching entry of the split list returned by
    ``crossvalidation``.

    Usage problems and a failure to open the output file are reported on
    stderr and terminate the process through ``sys.exit(-1)``.
    """

    assert(argv[1] == 'cv')
    if len(argv) < 5:
        sys.stderr.write("usage: %s cv repeat C kernelname [kernelparameters] [arff|fasta] inputfiles outputfile [dna|protein] non(nucleotide|amino)converter \n" % argv[0])
        sys.exit(-1)

    # parse input parameters
    cv = int(argv[2])
    C = float(argv[3])
    (kernelname, kparam, argv_rest) = parse.parse_kernel_param(argv[4:], False)
    (examples, labels, argv_rest) = parse.parse_input_file_train(kernelname, argv_rest)

    # Validate what is left after the input files; the first problem found
    # is stored in `error` and reported once below.
    (seq_source, nuc_con) = ('', '')
    error = None
    n_rest = len(argv_rest)
    if kernelname in ('spec', 'wd'):
        if n_rest < 1:
            error = "outputfile [dna|protein] non(nucleotide|amino)converter are missing\n"
        elif n_rest < 2:
            error = "[dna|protein] non(nucleotide|amino)converter are missing\n"
        elif n_rest < 3:
            # Exactly two arguments: the last one should be the sequence
            # type, so the converter is what is missing (or the type is bad).
            if argv_rest[-1] == 'dna':
                error = "non-nucleotide converter like [A|T|C|G|R|Y|N] is missing. Cannot continue.\n"
            elif argv_rest[-1] == 'protein':
                error = "non-amino acid converter like [G|P|A|V|L|I|M|C|F|Y|W|H|K|R|Q|N|E|D|S|T|random] is missing. Cannot continue.\n"
            else:
                error = "Here expect FASTA sequence type as [dna|protein] instead of -" + argv_rest[-1] + "- Cannot continue.\n"
        elif n_rest > 3:
            error = "Too many arguments\n"
        else:
            seq_source = argv_rest[1]
            nuc_con = argv_rest[2]
    elif kernelname in ('linear', 'gauss', 'poly'):
        if n_rest < 1:
            error = "outputfile missing\n"
        elif n_rest > 1:
            error = "Too many arguments\n"

    if error is not None:
        sys.stderr.write(error)
        sys.exit(-1)

    outfilename = argv_rest[0]

    utils.check_params(kparam, C, len(examples[0]))

    # run cross-validation
    (all_outputs, all_split) = crossvalidation(cv, kernelname, kparam, C, examples, labels, seq_source, nuc_con)

    try:
        f = open(outfilename, 'w+')
    except (IOError, OSError):
        # Only I/O failures are handled here; a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        sys.stderr.write('Fails to open the outputfile at ' + outfilename + ' Cannot continue.\n')
        sys.exit(-1)

    # The with-block closes the file even if a write fails part-way.
    with f:
        f.write('#example\toutput\tsplit\n')
        for ix, output in enumerate(all_outputs):
            # all_split is indexed (not zipped) so that a length mismatch
            # still raises instead of silently truncating the output.
            f.write('%d\t%2.7f\t%d\n' % (ix, output, all_split[ix]))
Esempio n. 3
0
def svm_poim(argv):
    """Parse the 'poim' command line, train an SVM and plot its POIMs.

    Expected layout of ``argv``::

        prog poim C poimdegree wd [kernelparameters] [arff|fasta] inputfiles
            poim.png [dna|protein] non(nucleotide|amino)converter

    ``argv[1]`` must be ``'poim'`` (asserted).  Exactly three arguments must
    follow the input files: the plot file name, the sequence type and the
    converter.

    After training, the POIMs are computed with ``compute_poims`` and handed
    to ``plots.plot_poims`` together with the plot file name.

    Usage problems are reported on stderr and terminate the process through
    ``sys.exit(-1)``.
    """

    assert(argv[1] == 'poim')
    if len(argv) < 7:
        sys.stderr.write("usage: %s poim C poimdegree wd [kernelparameters] [arff|fasta] inputfiles  poim.png [dna|protein] non(nucleotide|amino)converter\n" % argv[0])
        sys.exit(-1)

    # parse input parameters
    C = float(argv[2])
    poimdegree = int(argv[3])
    (kernelname, kparam, argv_rest) = parse.parse_kernel_param(argv[4:], False)
    (examples, labels, argv_rest) = parse.parse_input_file_train(kernelname, argv_rest)

    # Validate what is left after the input files; the first problem found
    # is stored in `error` and reported once below.
    error = None
    n_rest = len(argv_rest)
    if n_rest < 1:
        error = "poim.png [dna|protein] non(nucleotide|amino)converter are missing\n"
    elif n_rest < 2:
        error = "[dna|protein] non(nucleotide|amino)converter are missing\n"
    elif n_rest < 3:
        # Exactly two arguments: the last one should be the sequence type,
        # so the converter is what is missing (or the type is bad).
        if argv_rest[-1] == 'dna':
            error = "non-nucleotide converter like [A|T|C|G|R|Y|N] is missing. Cannot continue.\n"
        elif argv_rest[-1] == 'protein':
            error = "non-amino acid converter like [G|P|A|V|L|I|M|C|F|Y|W|H|K|R|Q|N|E|D|S|T|random] is missing. Cannot continue.\n"
        else:
            error = "Here expect FASTA sequence type as [dna|protein] instead of -" + argv_rest[-1] + "- Cannot continue.\n"
    elif n_rest > 3:
        error = "Too many arguments\n"

    if error is not None:
        sys.stderr.write(error)
        sys.exit(-1)

    poimfilename = argv_rest[0]
    seq_source = argv_rest[1]
    nuc_con = argv_rest[2]

    utils.check_params(kparam, C, len(examples[0]))

    # train svm and compute POIMs; the last two values returned by train()
    # are not needed here
    (svm, kernel, _feats_train, _preproc) = train(examples, labels, C, kernelname, kparam, seq_source, nuc_con)
    # Call form with a single string prints the same text on Python 2 and 3.
    print("done with training ")
    (poim, max_poim, diff_poim, poim_totalmass) = compute_poims(svm, kernel, poimdegree, len(examples[0]))

    # plot poims
    plots.plot_poims(poimfilename, poim, max_poim, diff_poim, poim_totalmass, poimdegree, len(examples[0]))
Esempio n. 4
0
def svm_poim(argv):
    """Entry point for the 'poim' sub-command: parse arguments, train, plot POIMs.

    ``argv`` is expected to look like::

        prog poim C poimdegree wd [kernelparameters] [arff|fasta] inputfiles
            poim.png [dna|protein] non(nucleotide|amino)converter

    ``argv[1]`` must be ``'poim'`` (asserted).  The three arguments left
    after the input files are the plot file name, the sequence type and the
    converter.  Every usage problem is written to stderr and followed by
    ``sys.exit(-1)``.
    """

    def bail(message):
        # Report a usage problem on stderr and stop the process.
        sys.stderr.write(message)
        sys.exit(-1)

    assert argv[1] == 'poim'
    if len(argv) < 7:
        bail("usage: %s poim C poimdegree wd [kernelparameters] [arff|fasta] inputfiles  poim.png [dna|protein] non(nucleotide|amino)converter\n" % argv[0])

    # numeric settings first, then kernel description, then the input data
    C = float(argv[2])
    poimdegree = int(argv[3])
    kernelname, kparam, remaining = parse.parse_kernel_param(argv[4:], False)
    examples, labels, remaining = parse.parse_input_file_train(kernelname, remaining)

    # what is left must be: plot file, sequence type, converter
    n_remaining = len(remaining)
    if n_remaining < 1:
        bail("poim.png [dna|protein] non(nucleotide|amino)converter are missing\n")
    if n_remaining < 2:
        bail("[dna|protein] non(nucleotide|amino)converter are missing\n")
    if n_remaining < 3:
        # two arguments given: the last one is taken as the sequence type
        last = remaining[-1]
        if last == 'dna':
            bail("non-nucleotide converter like [A|T|C|G|R|Y|N] is missing. Cannot continue.\n")
        elif last == 'protein':
            bail("non-amino acid converter like [G|P|A|V|L|I|M|C|F|Y|W|H|K|R|Q|N|E|D|S|T|random] is missing. Cannot continue.\n")
        else:
            bail("Here expect FASTA sequence type as [dna|protein] instead of -" + last + "- Cannot continue.\n")
    if n_remaining > 3:
        bail("Too many arguments\n")

    poimfilename = remaining[0]
    seq_source = remaining[1]
    nuc_con = remaining[2]

    utils.check_params(kparam, C, len(examples[0]))

    # fit the model, then derive the POIMs from it
    trained = train(examples, labels, C, kernelname, kparam, seq_source, nuc_con)
    (svm, kernel, _feats_train, _preproc) = trained
    poim_results = compute_poims(svm, kernel, poimdegree, len(examples[0]))
    (poim, max_poim, diff_poim, poim_totalmass) = poim_results

    # render the figure
    plots.plot_poims(poimfilename, poim, max_poim, diff_poim, poim_totalmass, poimdegree, len(examples[0]))
Esempio n. 5
0
def svm_modelsel(argv):
    """Parse the 'modelsel' command line, run model selection and write a report.

    Expected layout of ``argv``::

        prog modelsel repeat Cs kernelname [kernelparameters] [arff|fasta]
            inputfiles outputfile [dna|protein] non(nucleotide|amino)converter

    ``argv[1]`` must be ``'modelsel'`` (asserted).  For the 'spec' and 'wd'
    kernels the output file must be followed by the sequence type and the
    converter; for 'linear', 'gauss' and 'poly' only the output file may
    follow the input files.

    Cross-validation is run for every C in ``Cs``.  When
    ``kparam["modelsel_name"]`` is not None, it is additionally run for
    every value in ``kparam["modelsel_params"]``, which is stored into
    ``kparam`` under that name before each run.  The report lists the
    model(s) reaching the maximum mean ROC, PRC and accuracy, followed by a
    table of all evaluated combinations where the best values carry a '+'.

    Usage problems and a failure to open the output file are reported on
    stderr and terminate the process through ``sys.exit(-1)``.
    """

    assert(argv[1] == 'modelsel')
    if len(argv) < 5:
        sys.stderr.write("usage: %s modelsel repeat Cs kernelname [kernelparameters] [arff|fasta] inputfiles  outputfile [dna|protein] non(nucleotide|amino)converter\n" % argv[0])
        sys.exit(-1)

    # parse input parameters
    cv = int(argv[2])
    Cs = parse.parse_float_list(argv[3])
    (kernelname, kparam, argv_rest) = parse.parse_kernel_param(argv[4:], True)
    (examples, labels, argv_rest) = parse.parse_input_file_train(kernelname, argv_rest)

    # Validate what is left after the input files; the first problem found
    # is stored in `error` and reported once below.
    (seq_source, nuc_con) = ('', '')
    error = None
    n_rest = len(argv_rest)
    if kernelname in ('spec', 'wd'):
        if n_rest < 1:
            error = "outputfile [dna|protein] non(nucleotide|amino)converter are missing\n"
        elif n_rest < 2:
            error = "[dna|protein] non(nucleotide|amino)converter are missing\n"
        elif n_rest < 3:
            # Exactly two arguments: the last one should be the sequence
            # type, so the converter is what is missing (or the type is bad).
            if argv_rest[-1] == 'dna':
                error = "non-nucleotide converter like [A|T|C|G|R|Y|N] is missing. Cannot continue.\n"
            elif argv_rest[-1] == 'protein':
                error = "non-amino acid converter like [G|P|A|V|L|I|M|C|F|Y|W|H|K|R|Q|N|E|D|S|T|random] is missing. Cannot continue.\n"
            else:
                error = "Here expect FASTA sequence type as [dna|protein] instead of -" + argv_rest[-1] + "- Cannot continue.\n"
        elif n_rest > 3:
            error = "Too many arguments\n"
        else:
            seq_source = argv_rest[1]
            nuc_con = argv_rest[2]
    elif kernelname in ('linear', 'gauss', 'poly'):
        if n_rest < 1:
            error = "outputfile missing\n"
        elif n_rest > 1:
            error = "Too many arguments\n"

    if error is not None:
        sys.stderr.write(error)
        sys.exit(-1)

    outfilename = argv_rest[0]

    # run cross-validation
    modelsel_name = kparam["modelsel_name"]
    if modelsel_name is None:
        # Only C is tuned; None is recorded as the kernel parameter value.
        kparam_values = [None]
    else:
        # also optimize one kernel parameter
        kparam_values = kparam["modelsel_params"]

    # One (C, kernel parameter, mean ROC, mean PRC, mean accuracy) tuple per
    # evaluated combination, in evaluation order.
    results = []
    for C in Cs:
        for kp in kparam_values:
            if modelsel_name is not None:
                kparam[modelsel_name] = kp
            utils.check_params(kparam, C, len(examples[0]))

            (all_outputs, all_split) = crossvalidation(cv, kernelname, kparam, C, examples, labels, seq_source, nuc_con)
            # The first value returned by evaluate() is not used here.
            (_res_str, mean_roc, mean_prc, mean_acc) = evaluate(all_outputs, all_split, labels)
            results.append((C, kp, mean_roc, mean_prc, mean_acc))

    max_roc = numpy.max(numpy.array([r[2] for r in results]))
    max_prc = numpy.max(numpy.array([r[3] for r in results]))
    max_acc = numpy.max(numpy.array([r[4] for r in results]))

    # The kernel-parameter column is only shown when more than one value of
    # that parameter was tried.
    if modelsel_name is None or len(kparam["modelsel_params"]) == 1:
        detail_parts = ["\tC\tROC\tPRC\tAccuracy (at threshold 0)\n"]
    else:
        detail_parts = ["\tC\t%s\tROC\tPRC\tAccuracy (at threshold 0)\n" % modelsel_name]

    best_roc_parts = []
    best_prc_parts = []
    best_acc_parts = []
    for (C, kp, mean_roc, mean_prc, mean_acc) in results:
        # determine the best parameter combinations
        rocsym = prcsym = accsym = ' '
        if mean_roc == max_roc:
            rocsym = '+'
            best_roc_parts.append(model2str(kparam, C, kp) + '\n')
        if mean_prc == max_prc:
            prcsym = '+'
            best_prc_parts.append(model2str(kparam, C, kp) + '\n')
        if mean_acc == max_acc:
            accsym = '+'
            best_acc_parts.append(model2str(kparam, C, kp) + '\n')

        detail_parts.append(model2str(kparam, C, kp, False) + '\t')
        detail_parts.append('%c%2.1f%%\t%c%2.1f%%\t%c%2.1f%%\n' % (rocsym, 100*mean_roc, prcsym, 100*mean_prc, accsym, 100*mean_acc))

    try:
        f = open(outfilename, 'w+')
    except (IOError, OSError):
        # Only I/O failures are handled here; a bare except would also
        # swallow KeyboardInterrupt and SystemExit.
        sys.stderr.write('Fails to open the outputfile at ' + outfilename + ' Cannot continue.\n')
        sys.exit(-1)

    # The with-block closes the file even if a write fails part-way.
    with f:
        f.write('Best model(s) according to ROC measure:\n%s' % ''.join(best_roc_parts))
        f.write('\nBest model(s) according to PRC measure:\n%s' % ''.join(best_prc_parts))
        f.write('\nBest model(s) according to accuracy measure:\n%s' % ''.join(best_acc_parts))

        f.write('\nDetailed results:\n')
        f.write(''.join(detail_parts))