def arffread(kernelname,datafilename):
    """Read a dataset file in the format that matches the given kernel.

    kernelname selects the file type handed to init_datasetfile:
      - None, 'gauss', 'linear', 'poly'                  -> 'vec'
      - 'wd', 'localalign', 'localimprove', 'spec',
        'cumspec'                                        -> 'seq'
      - 'spec2', 'cumspec2'                              -> 'mseq'

    Returns whatever readlines() of the opened dataset file returns (other
    code in this file unpacks it as an (examples, labels) pair).

    Raises ValueError for any other kernel name.  Previously this case only
    printed a message and then failed with an UnboundLocalError on 'fp'.
    """
    vector_kernels = ('gauss', 'linear', 'poly')
    sequence_kernels = ('wd', 'localalign', 'localimprove', 'spec', 'cumspec')
    multi_sequence_kernels = ('spec2', 'cumspec2')

    if kernelname is None or kernelname in vector_kernels:
        filetype = 'vec'
    elif kernelname in sequence_kernels:
        filetype = 'seq'
    elif kernelname in multi_sequence_kernels:
        filetype = 'mseq'
    else:
        # Fail before touching the file so the caller sees the real cause.
        raise ValueError('Unknown kernel in arffread: %r' % (kernelname,))

    fp = init_datasetfile(datafilename, filetype)
    return fp.readlines()
Exemple #2
0
def arffread(kernelname, datafilename):
    """Read a dataset file in the format that matches the given kernel.

    The kernel name decides which file type is passed to init_datasetfile:
    vectorial kernels ('gauss', 'linear', 'poly' or None) use 'vec',
    single-sequence kernels ('wd', 'localalign', 'localimprove', 'spec',
    'cumspec') use 'seq', and the two-sequence kernels ('spec2', 'cumspec2')
    use 'mseq'.

    Returns the result of readlines() on the opened dataset file.

    Raises ValueError when the kernel name is not one of the above.  The
    earlier code printed a message and then crashed with an
    UnboundLocalError because 'fp' was never assigned.
    """
    if kernelname is None or kernelname in ('gauss', 'linear', 'poly'):
        filetype = 'vec'
    elif kernelname in ('wd', 'localalign', 'localimprove', 'spec',
                        'cumspec'):
        filetype = 'seq'
    elif kernelname in ('spec2', 'cumspec2'):
        filetype = 'mseq'
    else:
        # Report the offending name instead of failing later on 'fp'.
        raise ValueError('Unknown kernel in arffread: %r' % (kernelname,))

    return init_datasetfile(datafilename, filetype).readlines()
Exemple #3
0
def fastaread(fnamepos, fnameneg=None):
    """Load sequences from one or two 'seq' files and label them +1 / -1.

    fnamepos -- file whose sequences all receive the label +1
    fnameneg -- optional file whose sequences all receive the label -1

    The labels stored in the files themselves are read but ignored.
    Returns (examples, labels); with both files given, the positive
    examples come first, followed by the negative ones.
    """
    pos_seqs, _ = init_datasetfile(fnamepos, 'seq').readlines()

    # Only a positive file: everything is labelled +1.
    if fnameneg is None:
        return pos_seqs, ones(len(pos_seqs))

    neg_seqs, _ = init_datasetfile(fnameneg, 'seq').readlines()
    print('positive: %d, negative %d' % (len(pos_seqs), len(neg_seqs)))

    label_vector = concatenate((ones(len(pos_seqs)), -ones(len(neg_seqs))))
    return pos_seqs + neg_seqs, label_vector
def fastaread(fnamepos,fnameneg=None):
    """Read a positive and, optionally, a negative sequence file.

    Sequences from fnamepos are labelled +1; sequences from fnameneg (when
    given) are labelled -1 and placed after the positive ones.  Labels
    stored in the files are discarded.

    Returns the pair (all_examples, all_labels).
    """
    positives = init_datasetfile(fnamepos, 'seq').readlines()[0:2]
    (pos_examples, _unused_pos_labels) = init_datasetfile_result_check(positives) \
        if False else positives

    have_negatives = fnameneg is not None
    if have_negatives:
        (neg_examples, _unused_neg_labels) = \
            init_datasetfile(fnameneg, 'seq').readlines()
        print('positive: %d, negative %d'
              % (len(pos_examples), len(neg_examples)))
        all_labels = concatenate((ones(len(pos_examples)),
                                  -ones(len(neg_examples))))
        all_examples = pos_examples + neg_examples
    else:
        all_examples = pos_examples
        all_labels = ones(len(pos_examples))

    return all_examples, all_labels
Exemple #5
0
def fastawrite_sequence(filename,p):
    """Write a FASTA file containing a generated sequence dataset.

    filename -- path handed to init_datasetfile (opened as type 'seq')
    p        -- parameter object; its motif, numseq, seqlenmin, seqlenmax,
                posstart, posend and mutrate attributes are passed to
                motifgen

    Every generated sequence is written with a label produced by ones(),
    i.e. all examples share the same label.  The metadata returned by
    motifgen is not written.
    """
    # The former function-level 'import arff' was unused here (the sibling
    # ARFF writers already have it commented out), so it has been removed.
    (_, seqlist) = motifgen(p.motif, p.numseq, p.seqlenmin, p.seqlenmax,
                            p.posstart, p.posend, p.mutrate)
    labels = ones(len(seqlist))

    fp = init_datasetfile(filename, 'seq')
    fp.writelines(seqlist, labels)
Exemple #6
0
def arffwrite_real(filename, numpoint, numfeat, fracpos=0.5, width=1.0):
    """Generate a point cloud with cloudgen and store it as an ARFF file.

    numpoint, numfeat, fracpos and width are forwarded unchanged to
    cloudgen.  The metadata it returns becomes the file comment and the
    dataset is named 'pointcloud'.
    """
    generated = cloudgen(numpoint, numfeat, fracpos, width)
    (cloud_info, cloud_points, cloud_labels) = generated

    out = init_datasetfile(filename, 'vec')
    # Header fields are set before the data is written.
    out.comment = cloud_info
    out.dataname = 'pointcloud'
    out.writelines(cloud_points, cloud_labels)
Exemple #7
0
def arffwrite_sequence(filename,p, n):
    """Generate positive and negative motif sequences and store them as ARFF.

    p and n are parameter objects for the positive and the negative class;
    each supplies motif, numseq, seqlenmin, seqlenmax, posstart, posend and
    mutrate to motifgen.  Positive sequences are written first with label
    +1, negative ones follow with label -1.  The file comment is the two
    metadata strings joined by a single space, and the dataset is named
    'motif'.
    """
    def _generate(spec):
        # One motifgen call per class, driven by that class's parameters.
        return motifgen(spec.motif, spec.numseq, spec.seqlenmin,
                        spec.seqlenmax, spec.posstart, spec.posend,
                        spec.mutrate)

    pos_meta, pos_seqs = _generate(p)
    neg_meta, neg_seqs = _generate(n)

    all_labels = concatenate((ones(len(pos_seqs)), -ones(len(neg_seqs))))
    all_seqs = pos_seqs + neg_seqs

    out = init_datasetfile(filename, 'seq')
    out.comment = pos_meta + ' ' + neg_meta
    out.dataname = 'motif'
    out.writelines(all_seqs, all_labels)
Exemple #8
0
def test_gc(gcfilename):
    """
    Report conflicting labels in a GC content file.

    Reads the 'vec' dataset, prints the class counts, then counts every
    pair of distinct examples whose label product is negative (opposite
    signs) and whose sqr_dist value is below 0.01.  Nothing is returned;
    all results are printed.
    """
    (examples, labels) = init_datasetfile(gcfilename, 'vec').readlines()
    num_pos = sum(labels > 0.0)
    num_neg = sum(labels < 0.0)
    print('%d positive and %d negative examples' % (num_pos, num_neg))

    # sqr_dist presumably yields pairwise squared distances -- verify.
    distance = sqr_dist(numpy.matrix(examples), numpy.matrix(examples))
    # Product of the labels for every pair of examples
    # (assumes labels is one-dimensional -- TODO confirm).
    label_row = numpy.matrix(labels)
    labdist = label_row.T * label_row

    # Visit each unordered pair once (upper triangle, diagonal excluded).
    num_examples = len(labels)
    conflicts = [
        (ix, iy)
        for ix in xrange(num_examples)
        for iy in xrange(ix + 1, num_examples)
        if labdist[ix, iy] < 0 and distance[ix, iy] < 0.01
    ]

    print('%s %s' % (distance.shape, labdist.shape))
    print('%d identical examples with opposing labels' % len(conflicts))
Exemple #9
0
def test_gc(gcfilename):
    """
    Check a GC content file for near-identical examples with opposite labels.

    Prints the number of positive and negative examples, the shapes of the
    two pairwise matrices, and the number of example pairs whose label
    product is negative while their sqr_dist value is below 0.01.
    """
    dataset = init_datasetfile(gcfilename, 'vec')
    (examples, labels) = dataset.readlines()
    class_counts = (sum(labels > 0.0), sum(labels < 0.0))
    print('%d positive and %d negative examples' % class_counts)

    distance = sqr_dist(numpy.matrix(examples), numpy.matrix(examples))
    labdist = numpy.matrix(labels).T * numpy.matrix(labels)

    total = len(labels)
    contracount = 0
    for ix in xrange(total):
        # Only pairs with iy > ix, so each pair is examined once.
        for iy in xrange(ix + 1, total):
            if not labdist[ix, iy] < 0:
                continue
            if distance[ix, iy] < 0.01:
                contracount += 1

    print('%s %s' % (distance.shape, labdist.shape))
    print('%d identical examples with opposing labels' % contracount)
Exemple #10
0
            (n.seqlenmin, n.seqlenmax) = esvm.parse.parse_range(sys.argv[10])
            (n.posstart, n.posend) = esvm.parse.parse_range(sys.argv[11])
            n.mutrate = float(sys.argv[12])

            filename = sys.argv[13]
            arffwrite_sequence(filename, p, n)

    elif sys.argv[1] == 'cloud':
        # generate a data cloud in ARFF format
        numpoint = int(sys.argv[2])
        numfeat = int(sys.argv[3])
        fracpos = float(sys.argv[4])
        width = float(sys.argv[5])

        filename = sys.argv[6]
        arffwrite_real(filename, numpoint, numfeat, fracpos, width)
        if len(sys.argv) >= 8:
            fp = init_datasetfile(filename, 'vec')
            (examples, labels) = fp.readlines()
            pointcloud = []
            for ix in xrange(numpoint):
                pointcloud.append(
                    array([labels[ix], examples[0, ix], examples[1, ix]]))
            esvm.plots.plotcloud(pointcloud, sys.argv[7], 'Pointcloud')

#(examples,labels,metadata)=arffwrite_real(filename, numpoint, numfeat, fracpos, width)
#if len(sys.argv)>=8:
#	plots.plotcloud(pointcloud,sys.argv[7],metadata)
    else:
        print 'Unknown option %s\n' % sys.argv[1]
def splice_example(Cs, gcfilename,seqfilename,seq2filename, plot=False):
    """
    Run a fixed set of kernel experiments on the splice data files.

    Cs           -- None to try every experiment with each C in
                    (0.01, 0.1, 1, 2, 5, 10); otherwise an indexable
                    holding one C per experiment, in the order listed below
    gcfilename   -- 'vec' file with the GC content features
    seqfilename  -- 'seq' file with the DNA sequences
    seq2filename -- 'mseq' file with the two-sequence examples
    plot         -- when true, show a scatter plot of the first two rows of
                    the normalised GC features

    Experiment order (15 in total): linear; poly degree 3, 5; gauss width
    100, 1, 0.01; spec2 degree 1, 3, 5; cumspec2 degree 1, 3, 5;
    wd degree 1, 3, 5.

    Returns the 'results' list that is handed to every
    run_single_experiment call (presumably filled in there -- verify).
    """
    # hyperparameters
    num_fold_cv = 5

    # The area under the receiver operating characteristic
    results = []

    # GC features
    fp = init_datasetfile(gcfilename, 'vec')
    (gc_examples, gc_labels) = fp.readlines()
    gc_examples = normalize(gc_examples, subtract_mean=True)

    if plot:
        from pylab import scatter, show
        color = ['b', 'r']
        # Floor division keeps the list index an integer even when true
        # division is in effect; int(label) -1 -> 'b', +1 -> 'r'.
        point_colors = ''.join([color[(int(i) + 1) // 2] for i in gc_labels])
        scatter(gc_examples[0, ], gc_examples[1, ], s=400 * (gc_labels + 2),
                c=point_colors, alpha=0.1)
        show()

    # 2 sequence features
    fp = init_datasetfile(seq2filename, 'mseq')
    (dna2_examples, dna2_labels) = fp.readlines()

    # DNA sequences
    fp = init_datasetfile(seqfilename, 'seq')
    (dna_examples, dna_labels) = fp.readlines()

    gc_data = (gc_examples, gc_labels)
    dna2_data = (dna2_examples, dna2_labels)
    dna_data = (dna_examples, dna_labels)

    # Define experiments to carry out as (kernel, parameters, data).
    # The order matters: Cs is indexed by experiment position.
    experiments = []

    # Linear kernel on GC content
    experiments.append(('linear', {'scale': 1.0, 'name': 'scale'}, gc_data))

    # Polynomial kernel on GC content
    for degree in (3, 5):
        experiments.append(('poly',
                            {'degree': degree, 'name': 'degree',
                             'inhomogene': True, 'normal': True},
                            gc_data))

    # Gaussian kernel on GC content
    for width in (100.0, 1.0, 0.01):
        experiments.append(('gauss', {'width': width, 'name': 'width'},
                            gc_data))

    # Spectrum and cumulative spectrum kernels on 2 dna sequences
    for kernel in ('spec2', 'cumspec2'):
        for degree in (1, 3, 5):
            experiments.append((kernel,
                                {'degree': degree, 'name': 'degree'},
                                dna2_data))

    # Weighted degree kernel on dna sequences
    for degree in (1, 3, 5):
        experiments.append(('wd',
                            {'degree': degree, 'shift': 0, 'name': 'degree'},
                            dna_data))

    if Cs is None:
        for C in (0.01, 0.1, 1, 2, 5, 10):
            for kernel, params, (examples, labels) in experiments:
                run_single_experiment(results, num_fold_cv, kernel, params,
                                      C, examples, labels)
    else:
        for i, (kernel, params, (examples, labels)) in enumerate(experiments):
            run_single_experiment(results, num_fold_cv, kernel, params,
                                  Cs[i], examples, labels)

    return results
Exemple #12
0
def splice_example(Cs, gcfilename, seqfilename, seq2filename, plot=False):
    """
    For the data files, apply the set of kernels.

    Cs is either None, in which case every experiment is run once for each
    C in (0.01, 0.1, 1, 2, 5, 10), or an indexable with one C value per
    experiment in table order (15 entries are read).

    gcfilename is read as a 'vec' file, seqfilename as a 'seq' file and
    seq2filename as an 'mseq' file.  With plot set, the first two rows of
    the normalised GC features are shown as a scatter plot.

    Returns the 'results' list passed to each run_single_experiment call
    (presumably populated by it -- verify).
    """
    # hyperparameters
    num_fold_cv = 5

    # The area under the receiver operating characteristic
    results = []

    # GC features
    fp = init_datasetfile(gcfilename, 'vec')
    (gc_examples, gc_labels) = fp.readlines()
    gc_examples = normalize(gc_examples, subtract_mean=True)

    if plot:
        from pylab import scatter, show
        color = ['b', 'r']
        # '//' guarantees an integer list index regardless of the division
        # mode; int(label) -1 selects 'b' and +1 selects 'r'.
        scatter(gc_examples[0, ],
                gc_examples[1, ],
                s=400 * (gc_labels + 2),
                c=''.join([color[(int(i) + 1) // 2] for i in gc_labels]),
                alpha=0.1)
        show()

    # 2 sequence features
    fp = init_datasetfile(seq2filename, 'mseq')
    (dna2_examples, dna2_labels) = fp.readlines()

    # DNA sequences
    fp = init_datasetfile(seqfilename, 'seq')
    (dna_examples, dna_labels) = fp.readlines()

    gc = (gc_examples, gc_labels)
    dna2 = (dna2_examples, dna2_labels)
    dna = (dna_examples, dna_labels)
    poly_flags = {'name': 'degree', 'inhomogene': True, 'normal': True}

    # Experiments as (kernel, parameters, data); Cs is indexed by row.
    experiments = (
        # Linear kernel on GC content
        ('linear', {'scale': 1.0, 'name': 'scale'}, gc),
        # Polynomial kernel on GC content
        ('poly', dict(poly_flags, degree=3), gc),
        ('poly', dict(poly_flags, degree=5), gc),
        # Gaussian kernel on GC content
        ('gauss', {'width': 100.0, 'name': 'width'}, gc),
        ('gauss', {'width': 1.0, 'name': 'width'}, gc),
        ('gauss', {'width': 0.01, 'name': 'width'}, gc),
        # Spectrum kernel on 2 dna sequences
        ('spec2', {'degree': 1, 'name': 'degree'}, dna2),
        ('spec2', {'degree': 3, 'name': 'degree'}, dna2),
        ('spec2', {'degree': 5, 'name': 'degree'}, dna2),
        # Cumulative Spectrum kernel on 2 dna sequences
        ('cumspec2', {'degree': 1, 'name': 'degree'}, dna2),
        ('cumspec2', {'degree': 3, 'name': 'degree'}, dna2),
        ('cumspec2', {'degree': 5, 'name': 'degree'}, dna2),
        # Weighted degree kernel on dna sequences
        ('wd', {'degree': 1, 'shift': 0, 'name': 'degree'}, dna),
        ('wd', {'degree': 3, 'shift': 0, 'name': 'degree'}, dna),
        ('wd', {'degree': 5, 'shift': 0, 'name': 'degree'}, dna),
    )

    if Cs is None:
        for C in (0.01, 0.1, 1, 2, 5, 10):
            for kernel, params, (examples, labels) in experiments:
                run_single_experiment(results, num_fold_cv, kernel, params,
                                      C, examples, labels)
    else:
        for i, (kernel, params, (examples, labels)) in enumerate(experiments):
            run_single_experiment(results, num_fold_cv, kernel, params,
                                  Cs[i], examples, labels)

    return results
Exemple #13
0
            n.numseq = int(sys.argv[9])
            (n.seqlenmin,n.seqlenmax) = esvm.parse.parse_range(sys.argv[10])
            (n.posstart,n.posend) = esvm.parse.parse_range(sys.argv[11])
            n.mutrate = float(sys.argv[12])

            filename = sys.argv[13]
            arffwrite_sequence(filename, p, n)

    elif sys.argv[1] == 'cloud':
        # generate a data cloud in ARFF format
        numpoint = int(sys.argv[2])
        numfeat = int(sys.argv[3])
        fracpos = float(sys.argv[4])
        width = float(sys.argv[5])

        filename = sys.argv[6]
        arffwrite_real(filename, numpoint, numfeat, fracpos, width)
        if len(sys.argv)>=8:
            fp = init_datasetfile(filename,'vec')
            (examples,labels) = fp.readlines()
            pointcloud = []
            for ix in xrange(numpoint):
                pointcloud.append(array([labels[ix],examples[0,ix],examples[1,ix]]))
            esvm.plots.plotcloud(pointcloud,sys.argv[7],'Pointcloud')

	#(examples,labels,metadata)=arffwrite_real(filename, numpoint, numfeat, fracpos, width)
	#if len(sys.argv)>=8:
	#	plots.plotcloud(pointcloud,sys.argv[7],metadata)
    else:
        print 'Unknown option %s\n' % sys.argv[1]