Ejemplo n.º 1
0
    def writelines(self, examples, labels, idx=None):
        """Merge examples and labels into rows and write them to an ARFF file.

        The attribute list and row layout depend on ``self.extype``:
        - 'vec':  numeric rows [label, att0, att1, ...]
        - 'seq':  rows of (label, sequence)
        - 'mseq': rows of [label, upstream sequence, downstream sequence]

        Parameters
        ----------
        examples : array-like
            Feature matrix ('vec': features along axis 0, examples along
            axis 1 — TODO confirm against callers) or sequence container.
        labels : array-like
            One label per example.
        idx : optional index/slice
            When given, only the selected subset is written.
        """
        alist = [('label', 1, [])]

        if idx is not None:
            # Restrict output to the requested subset of examples.
            examples = examples[idx]
            labels = labels[idx]

        if self.extype == 'vec':
            # Prepend the label column to the transposed feature matrix so
            # each output row is [label, feat0, feat1, ...].
            data = list(concatenate((labels.reshape(len(labels), 1),
                                     examples.T), axis=1))
            # examples.shape[0] is the feature count (axis 0 = features).
            for ix in range(examples.shape[0]):
                alist.append(('att%d' % ix, 1, []))
        elif self.extype == 'seq':
            # list(...) keeps this correct whether zip yields a list (Py2)
            # or an iterator (Py3).
            data = list(zip(labels, examples))
            alist.append(('sequence', 0, []))
        elif self.extype == 'mseq':
            data = [[curlab] + list(examples[ix])
                    for ix, curlab in enumerate(labels)]
            alist.append(('upstream sequence', 0, []))
            alist.append(('downstream sequence', 0, []))

        # 'with' guarantees the handle is closed even if arffwrite raises
        # (the original leaked the handle on error).
        with open(self.filename, 'w') as fp:
            arff.arffwrite(fp, alist, data,
                           name=self.dataname, comment=self.comment)
Ejemplo n.º 2
0
def arffwrite_sequence(filename, p, n):
    """Write an ARFF file containing a sequence dataset.

    Positive sequences (label +1) are generated from parameter object *p*,
    negatives (label -1) from *n*; each must expose motif, numseq,
    seqlenmin, seqlenmax, posstart, posend and mutrate.
    """
    import arff

    (metadatapos, seqlistpos) = motifgen(p.motif, p.numseq, p.seqlenmin,
                                         p.seqlenmax, p.posstart, p.posend,
                                         p.mutrate)
    (metadataneg, seqlistneg) = motifgen(n.motif, n.numseq, n.seqlenmin,
                                         n.seqlenmax, n.posstart, n.posend,
                                         n.mutrate)

    # list(...) around each zip keeps the '+' concatenation working whether
    # zip returns a list (Py2) or an iterator (Py3).
    seqlist = (list(zip(ones(len(seqlistpos)), seqlistpos)) +
               list(zip(-ones(len(seqlistneg)), seqlistneg)))
    alist = [('label', 1, []), ('sequence', 0, [])]
    # 'with' closes the file even if arffwrite raises.
    with open(filename, 'w') as f:
        arff.arffwrite(f, alist, seqlist, name='motif',
                       comment=metadatapos + ' ' + metadataneg)
Ejemplo n.º 3
0
def arffwrite_real(filename, numpoint, numfeat, fracpos=0.5, width=1.0):
    """Write an ARFF file containing a vectorial dataset.

    Generates a labelled point cloud via cloudgen(...) and writes it as a
    numeric ARFF with one 'att%d' attribute per feature plus a label.

    Returns
    -------
    (pointcloud, metadata) as produced by cloudgen.
    """
    import arff

    (metadata, pointcloud) = cloudgen(numpoint, numfeat, fracpos, width)
    # Numeric label first, then one numeric attribute per feature.
    alist = [('label', 1, [])]
    for ix in range(numfeat):
        alist.append(('att%d' % ix, 1, []))

    # 'with' closes the file even if arffwrite raises (the original leaked
    # the handle on error).
    with open(filename, 'w') as f:
        arff.arffwrite(f, alist, pointcloud,
                       name='pointcloud', comment=metadata)
    return (pointcloud, metadata)
Ejemplo n.º 4
0
def arffwrite_real(filename, numpoint, numfeat, fracpos=0.5, width=1.0):
    """Write an ARFF file containing a vectorial dataset.

    Builds a point cloud with cloudgen(...), writes it to *filename* with
    a numeric 'label' attribute followed by numfeat numeric attributes,
    and returns (pointcloud, metadata).
    """
    import arff

    (metadata, pointcloud) = cloudgen(numpoint, numfeat, fracpos, width)
    # Numeric label followed by one numeric attribute per feature.
    alist = ([('label', 1, [])] +
             [('att%d' % ix, 1, []) for ix in range(numfeat)])

    # Context manager ensures the handle is closed on error paths too
    # (the original leaked it if arffwrite raised).
    with open(filename, 'w') as f:
        arff.arffwrite(f, alist, pointcloud,
                       name='pointcloud', comment=metadata)
    return (pointcloud, metadata)
Ejemplo n.º 5
0
def arffwrite_sequence(filename, p, n):
    """Write an ARFF file containing a sequence dataset.

    *p* parameterises the positive motif sequences (label +1) and *n* the
    negatives (label -1); both must provide motif, numseq, seqlenmin,
    seqlenmax, posstart, posend and mutrate.
    """
    import arff

    (metadatapos, seqlistpos) = motifgen(p.motif, p.numseq, p.seqlenmin,
                                         p.seqlenmax, p.posstart, p.posend,
                                         p.mutrate)
    (metadataneg, seqlistneg) = motifgen(n.motif, n.numseq, n.seqlenmin,
                                         n.seqlenmax, n.posstart, n.posend,
                                         n.mutrate)

    # Materialize both zips before concatenating: zip is an iterator on
    # Py3, where iterator + iterator raises TypeError.
    positives = list(zip(ones(len(seqlistpos)), seqlistpos))
    negatives = list(zip(-ones(len(seqlistneg)), seqlistneg))
    seqlist = positives + negatives

    alist = [('label', 1, []), ('sequence', 0, [])]
    # Context manager closes the file even when arffwrite raises.
    with open(filename, 'w') as f:
        arff.arffwrite(f,
                       alist,
                       seqlist,
                       name='motif',
                       comment=metadatapos + ' ' + metadataneg)
Ejemplo n.º 6
0
	output_rows.append([full_name] +
						[vowels, consonants] +
						letter_positions[0:6] +
						letter_parities[0:6] +
						letter_types[0:6] +
						[first_name_length_parity, last_name_length_parity] +
						[classification])

# Report the shortest name seen (single-argument print form works on both
# Python 2 and 3).
print("shortest name length:")
print(min(name_sizes))

# Attribute schema: name string, vowel/consonant counts, six per-letter
# position/parity/type features, two name-length parities, and the class.
attributes = []

attributes.append(('person_name', 0, []))

attributes.append(('total_vowels', 1, []))
attributes.append(('total_consonants', 1, []))

attributes += [("pos_%d" % i, 1, []) for i in range(6)]
attributes += [("par_%d" % i, 0, ['True', 'False']) for i in range(6)]
attributes += [("type_%d" % i, 0, ['v', 'c']) for i in range(6)]

attributes.append(('first_name_length_parity', 0, ['True', 'False']))
attributes.append(('last_name_length_parity', 0, ['True', 'False']))
attributes.append(('class', 0, ['+', '-']))

# Open in text mode ('w', not 'wb') since the rows are text, and use
# 'with' so the handle is flushed and closed (the original never closed it).
with open('winners_losers_augmented.arff', 'w') as out:
    arff.arffwrite(out,
                   attributes,
                   output_rows,
                   'winners_losers')
Ejemplo n.º 7
0
def gen_arff(fastafilename,gcfilename,seqfilename,seq2filename,specfilename,
             num_seqs=100000,subset=False,max_pos=200,max_neg=2000,
             overwrite=False,normalise=True):
    """If data not yet created, generate ARFF files from a FASTA dump.

    Writes:
    - seqfilename:  label + full window sequence
    - seq2filename: label + upstream / downstream sequences
    - gcfilename:   label + two-dimensional GC content before/after site
    - specfilename: label + per-nucleotide frequencies (only if non-empty)

    Raises ValueError when the filename identifies neither an acceptor
    ('acc') nor a donor ('don') site.
    """
    # Skip regeneration when the main outputs already exist.
    if (exists(gcfilename) and exists(seqfilename)) and not overwrite:
        return

    print('Creating %s and %s from %s' % (gcfilename, seqfilename,
                                          fastafilename))

    if fastafilename.find('acc') != -1:
        # acceptor, AG at [40:42]
        window = (-40, 197, 42)
    elif fastafilename.find('don') != -1:
        # donor, GT or GC at [40:42]
        window = (-40, 200, 42)
    else:
        # The original only printed a message and then crashed later on an
        # unbound 'window'; fail fast with a clear error instead.
        raise ValueError('Cannot determine whether donor or acceptor: %s'
                         % fastafilename)

    [strings, lab] = read_data(bz2.BZ2File(fastafilename), num_seqs, window)
    # Only a subset of the examples are used.
    if subset:
        [strings, lab] = take_subset(strings, lab, max_pos, max_neg)

    # Upstream spans [0, -window[0]); downstream starts 2 nt past the
    # splice-site dinucleotide and runs for window[2] characters.
    up_end = -window[0]
    down_start = up_end + 2
    down_end = down_start + window[2]

    gcs = count_gs_and_cs(strings, (0, up_end), (down_start, down_end))

    seq_upstream = array([curstr[0:up_end] for curstr in strings])
    seq_downstream = array([curstr[down_start:down_end]
                            for curstr in strings])

    spec_up = count_nt_freq(seq_upstream)
    spec_down = count_nt_freq(seq_downstream)

    if normalise:
        gcs = normalise_features(gcs)
        spec_up = normalise_features(spec_up)
        spec_down = normalise_features(spec_down)

    comment = 'Converted from ' + fastafilename

    def _write(outname, alist, data):
        # Shared writer: one ARFF file per feature representation; 'with'
        # closes the handle even when arffwrite raises.
        with open(outname, 'w') as f:
            arff.arffwrite(f, alist, data,
                           name=fastafilename, comment=comment)

    # sequence file
    _write(seqfilename,
           [('label', 1, []), ('sequence', 0, [])],
           list(zip(lab, strings)))

    # 2 sequence file
    _write(seq2filename,
           [('label', 1, []), ('upstream sequence', 0, []),
            ('downstream sequence', 0, [])],
           list(zip(lab, seq_upstream, seq_downstream)))

    # gc contents
    _write(gcfilename,
           [('label', 1, []), ('upstream', 1, []), ('downstream', 1, [])],
           [(curlab, gcs[0, ix], gcs[1, ix])
            for ix, curlab in enumerate(lab)])

    # spectrum (only built and written when a filename was supplied)
    if len(specfilename) > 0:
        alist = [('label', 1, []),
                 ('upA', 1, []), ('upC', 1, []),
                 ('upG', 1, []), ('upT', 1, []),
                 ('downA', 1, []), ('downC', 1, []),
                 ('downG', 1, []), ('downT', 1, [])]
        data = [(curlab,
                 spec_up[0, ix], spec_up[1, ix],
                 spec_up[2, ix], spec_up[3, ix],
                 spec_down[0, ix], spec_down[1, ix],
                 spec_down[2, ix], spec_down[3, ix])
                for ix, curlab in enumerate(lab)]
        _write(specfilename, alist, data)
Ejemplo n.º 8
0
def gen_arff(fastafilename, gcfilename, seqfilename, seq2filename,
             specfilename, num_seqs=100000, subset=False, max_pos=200,
             max_neg=2000, overwrite=False, normalise=True):
    """If data not yet created, generate ARFF files from a FASTA dump.

    Produces a sequence file, a two-sequence (upstream/downstream) file,
    a 2-d GC-content file, and optionally a nucleotide-spectrum file.

    Raises ValueError when the filename identifies neither an acceptor
    ('acc') nor a donor ('don') splice site.
    """
    # Nothing to do if the main outputs exist and overwrite is off.
    if (exists(gcfilename) and exists(seqfilename)) and not overwrite:
        return

    print('Creating %s and %s from %s' % (gcfilename, seqfilename,
                                          fastafilename))

    if fastafilename.find('acc') != -1:
        # acceptor, AG at [40:42]
        window = (-40, 197, 42)
    elif fastafilename.find('don') != -1:
        # donor, GT or GC at [40:42]
        window = (-40, 200, 42)
    else:
        # Originally only printed and later hit a NameError on the unbound
        # 'window'; raise an explicit error instead.
        raise ValueError('Cannot determine whether donor or acceptor '
                         'from filename: %s' % fastafilename)

    [strings, lab] = read_data(bz2.BZ2File(fastafilename), num_seqs, window)
    # Only a subset of the examples are used.
    if subset:
        [strings, lab] = take_subset(strings, lab, max_pos, max_neg)

    gcs = count_gs_and_cs(strings, (0, -window[0]),
                          (-window[0] + 2, -window[0] + 2 + window[2]))

    # Split each window into the part before the splice site and the part
    # starting 2 nt after it.
    seq_upstream = []
    seq_downstream = []
    for curstr in strings:
        seq_upstream.append(curstr[0:-window[0]])
        seq_downstream.append(curstr[(-window[0] + 2):(-window[0] + 2 +
                                                       window[2])])
    seq_upstream = array(seq_upstream)
    seq_downstream = array(seq_downstream)

    spec_up = count_nt_freq(seq_upstream)
    spec_down = count_nt_freq(seq_downstream)

    if normalise:
        gcs = normalise_features(gcs)
        spec_up = normalise_features(spec_up)
        spec_down = normalise_features(spec_down)

    comment = 'Converted from ' + fastafilename

    # sequence file ('with' closes handles on all paths; the original
    # leaked them when arffwrite raised)
    with open(seqfilename, 'w') as f:
        arff.arffwrite(f,
                       [('label', 1, []), ('sequence', 0, [])],
                       list(zip(lab, strings)),
                       name=fastafilename,
                       comment=comment)

    # 2 sequence file
    with open(seq2filename, 'w') as f:
        arff.arffwrite(f,
                       [('label', 1, []),
                        ('upstream sequence', 0, []),
                        ('downstream sequence', 0, [])],
                       list(zip(lab, seq_upstream, seq_downstream)),
                       name=fastafilename,
                       comment=comment)

    # gc contents
    gc_rows = [(curlab, gcs[0, ix], gcs[1, ix])
               for ix, curlab in enumerate(lab)]
    with open(gcfilename, 'w') as f:
        arff.arffwrite(f,
                       [('label', 1, []), ('upstream', 1, []),
                        ('downstream', 1, [])],
                       gc_rows,
                       name=fastafilename,
                       comment=comment)

    # spectrum (only when a filename was supplied)
    if len(specfilename) > 0:
        alist = [('label', 1, []),
                 ('upA', 1, []), ('upC', 1, []),
                 ('upG', 1, []), ('upT', 1, []),
                 ('downA', 1, []), ('downC', 1, []),
                 ('downG', 1, []), ('downT', 1, [])]
        rows = [(curlab,
                 spec_up[0, ix], spec_up[1, ix],
                 spec_up[2, ix], spec_up[3, ix],
                 spec_down[0, ix], spec_down[1, ix],
                 spec_down[2, ix], spec_down[3, ix])
                for ix, curlab in enumerate(lab)]
        with open(specfilename, 'w') as f:
            arff.arffwrite(f, alist, rows,
                           name=fastafilename, comment=comment)
Ejemplo n.º 9
0
def get_features(relations_file,
                 pmi_file,
                 cooccurrence_counts_file,
                 feature_file,
                 truth_file=None,
                 truth_function=None):
    """Write an arff file with the correct features from past experiments.

    Reads three (optionally four) read-only Berkeley DB hash files keyed by
    'disease1,disease2' and emits one ARFF instance per relation. All DB
    handles and the cursor are closed on every exit path (the original
    leaked them).
    """

    relations_DB = db.DB()
    relations_DB.open(relations_file, None, db.DB_HASH, db.DB_RDONLY)
    pmi_DB = db.DB()
    pmi_DB.open(pmi_file, None, db.DB_HASH, db.DB_RDONLY)
    cooccurrence_counts_DB = db.DB()
    cooccurrence_counts_DB.open(cooccurrence_counts_file, None, db.DB_HASH,
                                db.DB_RDONLY)

    attribute_list = [
        ("context similarity", 1, []),
        ("normal similarity", 1, []),
        ("pmi between diseases", 1, []),
        ("times diseases cooccurred", 1, []),
        ("disease names", 0, []),
    ]
    truth_DB = None
    if truth_file and truth_function:
        truth_DB = db.DB()
        truth_DB.open(truth_file, None, db.DB_HASH, db.DB_RDONLY)
        attribute_list.append(
            ("truth value", 0, get_truth_nominals(truth_function)))
        # Replace the spec with the actual callable used per instance.
        truth_function = get_truth_function(truth_function)

    try:
        instances = []
        cursor = relations_DB.cursor()
        try:
            record = cursor.first()
            while record:
                instance = []
                key, value = record
                target_1, target_2 = key.split(',')
                context_sim, norm = value.split(',')

                instance.append(context_sim)
                instance.append(norm)

                key_1 = "%s,%s" % (target_1, target_2)
                key_2 = "%s,%s" % (target_2, target_1)
                ## it's possible the two diseases never cooccurr with each
                ## other, so the cooccurrence count is 0 and the PMI is
                ## uncalculatable (can't divide by infinity) so we'll
                ## mark it as a missing feature (a '?')
                instance.append(pmi_DB.get(key_1) or pmi_DB.get(key_2) or '?')
                instance.append(
                    cooccurrence_counts_DB.get(key_1)
                    or cooccurrence_counts_DB.get(key_2) or 0)
                instance.append("%s-%s" % (target_1, target_2))

                if truth_file and truth_function:
                    pearson = truth_DB.get(key_1) or truth_DB.get(key_2)
                    if pearson:
                        instance.append(truth_function(float(pearson)))
                    else:
                        instance.append('?')

                instances.append(instance)
                # DBCursor.next() is the Berkeley-DB cursor API, not the
                # Python iterator protocol.
                record = cursor.next()
        finally:
            cursor.close()

        with open(feature_file, 'w') as arff_file:
            arff.arffwrite(arff_file,
                           attribute_list,
                           instances,
                           name='comorbidity')
    finally:
        relations_DB.close()
        pmi_DB.close()
        cooccurrence_counts_DB.close()
        if truth_DB is not None:
            truth_DB.close()
Ejemplo n.º 10
0
def get_features(relations_file, pmi_file, cooccurrence_counts_file, feature_file,
        truth_file=None, truth_function=None):
    """Write an arff file with the correct features from past experiments.

    Iterates every relation in the relations DB, looks up PMI, cooccurrence
    and (optionally) truth values in sibling Berkeley DB hash files, and
    writes the resulting instances as ARFF. All DB handles are closed on
    every exit path (the original leaked them).
    """
    opened = []

    def _open_db(path):
        # Open a read-only Berkeley DB hash file and register it for cleanup.
        handle = db.DB()
        handle.open(path, None, db.DB_HASH, db.DB_RDONLY)
        opened.append(handle)
        return handle

    relations_DB = _open_db(relations_file)
    pmi_DB = _open_db(pmi_file)
    cooccurrence_counts_DB = _open_db(cooccurrence_counts_file)

    attribute_list = [("context similarity", 1, []),
            ("normal similarity", 1, []),
            ("pmi between diseases", 1, []),
            ("times diseases cooccurred", 1, []),
            ("disease names", 0, []),
            ]
    if truth_file and truth_function:
        truth_DB = _open_db(truth_file)
        attribute_list.append(("truth value", 0, get_truth_nominals(truth_function)))
        # Swap the spec for the callable applied to each pearson value.
        truth_function = get_truth_function(truth_function)

    try:
        instances = []
        cursor = relations_DB.cursor()
        try:
            record = cursor.first()
            while record:
                key, value = record
                target_1, target_2 = key.split(',')
                context_sim, norm = value.split(',')

                instance = [context_sim, norm]

                key_1 = "%s,%s" % (target_1, target_2)
                key_2 = "%s,%s" % (target_2, target_1)
                ## it's possible the two diseases never cooccurr with each
                ## other, so the cooccurrence count is 0 and the PMI is
                ## uncalculatable (can't divide by infinity) so we'll
                ## mark it as a missing feature (a '?')
                instance.append(pmi_DB.get(key_1) or pmi_DB.get(key_2) or '?')
                instance.append(cooccurrence_counts_DB.get(key_1) or
                        cooccurrence_counts_DB.get(key_2) or 0)
                instance.append("%s-%s" % (target_1, target_2))

                if truth_file and truth_function:
                    pearson = truth_DB.get(key_1) or truth_DB.get(key_2)
                    if pearson:
                        instance.append(truth_function(float(pearson)))
                    else:
                        instance.append('?')

                instances.append(instance)
                # Berkeley-DB cursor API method, not the iterator protocol.
                record = cursor.next()
        finally:
            cursor.close()

        with open(feature_file, 'w') as arff_file:
            arff.arffwrite(arff_file, attribute_list, instances, name='comorbidity')
    finally:
        for handle in opened:
            handle.close()
Ejemplo n.º 11
0
def numpy2arff(filename, matrix):
    """Dump a 2-d matrix to an ARFF file.

    Every column but the last becomes a numeric attribute named 'aNNN';
    the last column is re-typed as a nominal attribute whose value set is
    the distinct values observed in that column.
    """
    alist = [("a%03d" % (col,), 1, []) for col in range(matrix.shape[1])]
    # Re-type the final attribute as nominal with its observed values.
    alist[-1] = (alist[-1][0], 0, list(set(matrix[:, -1])))
    # Debug trace of what is about to be written.
    print(filename, alist, matrix)
    # open() replaces the Py2-only file() builtin; text mode replaces 'wb'
    # since ARFF output is text, and 'with' closes the leaked handle.
    with open(filename, "w") as f:
        arff.arffwrite(f, alist, matrix)