def writelines(self, examples, labels, idx=None):
    """Merge the examples and labels and write them to ``self.filename`` as ARFF.

    The layout depends on ``self.extype``:

    * ``'vec'``  -- ``examples`` is a 2-D array with one *column* per example
      (it is transposed before being joined with the label column), and one
      ``att<i>`` attribute is declared per row.
    * ``'seq'``  -- every example is paired with its label as a single
      ``sequence`` attribute.
    * ``'mseq'`` -- every example is an iterable whose items follow the
      label; two attributes (upstream / downstream sequence) are declared.

    :param examples: the examples, laid out as described above.
    :param labels: one label per example (an array for ``'vec'``).
    :param idx: optional index selecting a subset of the examples.
    :raises ValueError: if ``self.extype`` is not one of the known types.
    """
    kind = self.extype
    # Validate before touching the file so a bad type cannot truncate it.
    if kind not in ('vec', 'seq', 'mseq'):
        raise ValueError('Unknown example type: %r' % (kind,))

    if idx is not None:
        labels = labels[idx]
        # Vectorial examples are stored column-wise, so the subset has to be
        # taken along the second axis to stay aligned with the labels.
        examples = examples[:, idx] if kind == 'vec' else examples[idx]

    alist = [('label', 1, [])]
    if kind == 'vec':
        label_column = labels.reshape(len(labels), 1)
        data = list(concatenate((label_column, examples.T), axis=1))
        alist.extend(('att%d' % ix, 1, []) for ix in range(examples.shape[0]))
    elif kind == 'seq':
        data = zip(labels, examples)
        alist.append(('sequence', 0, []))
    else:  # 'mseq'
        data = [[curlab] + list(examples[ix])
                for ix, curlab in enumerate(labels)]
        alist.append(('upstream sequence', 0, []))
        alist.append(('downstream sequence', 0, []))

    # The context manager closes the file even if arffwrite raises.
    with open(self.filename, 'w') as fp:
        arff.arffwrite(fp, alist, data, name=self.dataname,
                       comment=self.comment)
def arffwrite_sequence(filename, p, n):
    """Write an ARFF file containing a sequence dataset.

    Sequences are generated with ``motifgen`` twice: once from the settings
    object ``p`` (labelled +1) and once from ``n`` (labelled -1).  Both
    settings objects must provide the attributes ``motif``, ``numseq``,
    ``seqlenmin``, ``seqlenmax``, ``posstart``, ``posend`` and ``mutrate``.

    :param filename: path of the ARFF file to create (overwritten).
    :param p: settings for the positive class.
    :param n: settings for the negative class.
    """
    import arff

    (metadatapos, seqlistpos) = motifgen(p.motif, p.numseq, p.seqlenmin,
                                         p.seqlenmax, p.posstart, p.posend,
                                         p.mutrate)
    (metadataneg, seqlistneg) = motifgen(n.motif, n.numseq, n.seqlenmin,
                                         n.seqlenmax, n.posstart, n.posend,
                                         n.mutrate)

    # list() keeps the concatenation valid whether zip returns a list or an
    # iterator.
    seqlist = (list(zip(ones(len(seqlistpos)), seqlistpos))
               + list(zip(-ones(len(seqlistneg)), seqlistneg)))
    alist = [('label', 1, []), ('sequence', 0, [])]

    # The context manager closes the file even if arffwrite raises.
    with open(filename, 'w') as f:
        arff.arffwrite(f, alist, seqlist, name='motif',
                       comment=metadatapos + ' ' + metadataneg)
def arffwrite_real(filename, numpoint, numfeat, fracpos=0.5, width=1.0):
    """Write an ARFF file containing a vectorial dataset.

    The data comes from ``cloudgen(numpoint, numfeat, fracpos, width)``; the
    file declares a ``label`` attribute followed by ``att0`` ...
    ``att<numfeat-1>``.

    :param filename: path of the ARFF file to create (overwritten).
    :param numpoint: forwarded to ``cloudgen``.
    :param numfeat: forwarded to ``cloudgen``; also the number of ``att``
        attributes declared.
    :param fracpos: forwarded to ``cloudgen``.
    :param width: forwarded to ``cloudgen``.
    :returns: the tuple ``(pointcloud, metadata)`` produced by ``cloudgen``
        (note the order is swapped relative to what ``cloudgen`` returns).
    """
    import arff

    (metadata, pointcloud) = cloudgen(numpoint, numfeat, fracpos, width)
    alist = [('label', 1, [])] + [('att%d' % ix, 1, [])
                                  for ix in range(numfeat)]

    # The context manager closes the file even if arffwrite raises.
    with open(filename, 'w') as f:
        arff.arffwrite(f, alist, pointcloud, name='pointcloud',
                       comment=metadata)
    return (pointcloud, metadata)
def arffwrite_real(filename, numpoint, numfeat, fracpos=0.5, width=1.0):
    """Generate a point cloud with ``cloudgen`` and store it as an ARFF file.

    :param filename: destination path; an existing file is overwritten.
    :param numpoint: passed through to ``cloudgen``.
    :param numfeat: passed through to ``cloudgen``; one ``att<i>`` attribute
        is declared for each ``i`` in ``range(numfeat)``.
    :param fracpos: passed through to ``cloudgen``.
    :param width: passed through to ``cloudgen``.
    :returns: ``(pointcloud, metadata)``.
    """
    import arff

    generated = cloudgen(numpoint, numfeat, fracpos, width)
    metadata, pointcloud = generated

    # First attribute is the label, then one attribute per feature.
    attributes = [('label', 1, [])]
    attributes.extend(('att%d' % feat, 1, []) for feat in range(numfeat))

    # Using ``with`` guarantees the handle is released if writing fails.
    with open(filename, 'w') as out:
        arff.arffwrite(out, attributes, pointcloud, name='pointcloud',
                       comment=metadata)

    return (pointcloud, metadata)
def arffwrite_sequence(filename, p, n):
    """Generate labelled motif sequences and store them as an ARFF file.

    ``motifgen`` is called first with the settings in ``p`` and then with
    the settings in ``n``.  Sequences from ``p`` get the label +1 and those
    from ``n`` the label -1; the two metadata strings are joined with a
    space and used as the file comment.

    :param filename: destination path; an existing file is overwritten.
    :param p: settings object for the positive class (needs ``motif``,
        ``numseq``, ``seqlenmin``, ``seqlenmax``, ``posstart``, ``posend``,
        ``mutrate``).
    :param n: settings object for the negative class (same attributes).
    """
    import arff

    labelled = []
    comments = []
    # Positive class first, then negative: same call order as before, which
    # may matter if motifgen draws from a shared random state.
    for sign, cfg in ((1, p), (-1, n)):
        metadata, sequences = motifgen(cfg.motif, cfg.numseq, cfg.seqlenmin,
                                       cfg.seqlenmax, cfg.posstart,
                                       cfg.posend, cfg.mutrate)
        comments.append(metadata)
        # extend() accepts any iterable, so this does not depend on zip
        # returning a list.
        labelled.extend(zip(sign * ones(len(sequences)), sequences))

    attributes = [('label', 1, []), ('sequence', 0, [])]

    # Using ``with`` guarantees the handle is released if writing fails.
    with open(filename, 'w') as out:
        arff.arffwrite(out, attributes, labelled, name='motif',
                       comment=comments[0] + ' ' + comments[1])
# NOTE(review): this append presumably belongs to a per-name loop whose
# header is outside the visible part of the file -- confirm its nesting.
output_rows.append(
    [full_name]
    + [vowels, consonants]
    + letter_positions[0:6]
    + letter_parities[0:6]
    + letter_types[0:6]
    + [first_name_length_parity, last_name_length_parity]
    + [classification]
)

print("shortest name length:")
print(min(name_sizes))

# Attribute declarations, in the same order as the columns of each row
# appended to output_rows above.
attributes = [
    ('person_name', 0, []),
    ('total_vowels', 1, []),
    ('total_consonants', 1, []),
]
attributes += [("pos_%d" % i, 1, []) for i in range(6)]
attributes += [("par_%d" % i, 0, ['True', 'False']) for i in range(6)]
attributes += [("type_%d" % i, 0, ['v', 'c']) for i in range(6)]
attributes.append(('first_name_length_parity', 0, ['True', 'False']))
attributes.append(('last_name_length_parity', 0, ['True', 'False']))
attributes.append(('class', 0, ['+', '-']))

# Close the output explicitly so buffered data is flushed to disk.
with open('winners_losers_augmented.arff', 'wb') as arff_out:
    arff.arffwrite(arff_out, attributes, output_rows, 'winners_losers')
def gen_arff(fastafilename, gcfilename, seqfilename, seq2filename, specfilename,
             num_seqs=100000, subset=False, max_pos=200, max_neg=2000,
             overwrite=False, normalise=True):
    """Generate ARFF files from a bz2-compressed splice-site sequence file.

    Nothing is done when both ``gcfilename`` and ``seqfilename`` already
    exist and ``overwrite`` is false.  Otherwise the following files are
    written, in this order:

    - ``seqfilename``:  label and the sequence around the splice site
    - ``seq2filename``: label, upstream sequence and downstream sequence
    - ``gcfilename``:   label and the two-dimensional GC content before and
      after the splice site
    - ``specfilename``: label and four upstream plus four downstream values
      from ``count_nt_freq`` (only written when ``specfilename`` is
      non-empty)

    :param fastafilename: input file; must contain ``'acc'`` (acceptor) or
        ``'don'`` (donor) in its name, which selects the read window.
    :param num_seqs: forwarded to ``read_data``.
    :param subset: if true, reduce the data with ``take_subset`` using
        ``max_pos`` and ``max_neg``.
    :param overwrite: regenerate even if the output files exist.
    :param normalise: if true, pass the numeric features through
        ``normalise_features``.
    :raises ValueError: if the file name contains neither ``'acc'`` nor
        ``'don'``.
    """
    if (exists(gcfilename) and exists(seqfilename)) and not overwrite:
        return

    print('Creating %s and %s from %s' % (gcfilename, seqfilename, fastafilename))

    if fastafilename.find('acc') != -1:
        # acceptor, AG at [40:42]
        window = (-40, 197, 42)
    elif fastafilename.find('don') != -1:
        # donor, GT or GC at [40:42]
        window = (-40, 200, 42)
    else:
        # Fail here with a clear error; continuing would only hit an
        # unbound ``window`` a few lines further down.
        message = "Error: Cannot determine whether donor or acceptor"
        print(message)
        raise ValueError(message)

    fasta = bz2.BZ2File(fastafilename)
    try:
        [strings, lab] = read_data(fasta, num_seqs, window)
    finally:
        fasta.close()

    # Only a subset of the examples are used.
    if subset:
        [strings, lab] = take_subset(strings, lab, max_pos, max_neg)

    # Slice boundaries: upstream is everything before the two-letter
    # consensus, downstream is the window[2] letters following it.
    up_end = -window[0]
    down_start = up_end + 2
    down_end = down_start + window[2]

    gcs = count_gs_and_cs(strings, (0, up_end), (down_start, down_end))

    seq_upstream = array([curstr[0:up_end] for curstr in strings])
    seq_downstream = array([curstr[down_start:down_end] for curstr in strings])

    spec_up = count_nt_freq(seq_upstream)
    spec_down = count_nt_freq(seq_downstream)

    if normalise:
        gcs = normalise_features(gcs)
        spec_up = normalise_features(spec_up)
        spec_down = normalise_features(spec_down)

    def write(filename, alist, data):
        # One place for the shared name/comment; ``with`` closes the file
        # even when arffwrite raises.
        with open(filename, 'w') as f:
            arff.arffwrite(f, alist, data, name=fastafilename,
                           comment='Converted from ' + fastafilename)

    # sequence file
    write(seqfilename,
          [('label', 1, []), ('sequence', 0, [])],
          zip(lab, strings))

    # 2 sequence file
    write(seq2filename,
          [('label', 1, []), ('upstream sequence', 0, []),
           ('downstream sequence', 0, [])],
          zip(lab, seq_upstream, seq_downstream))

    # gc contents
    write(gcfilename,
          [('label', 1, []), ('upstream', 1, []), ('downstream', 1, [])],
          [(curlab, gcs[0, ix], gcs[1, ix]) for ix, curlab in enumerate(lab)])

    # spectrum (optional: skipped for an empty or missing file name)
    if specfilename:
        alist = [('label', 1, []),
                 ('upA', 1, []), ('upC', 1, []), ('upG', 1, []), ('upT', 1, []),
                 ('downA', 1, []), ('downC', 1, []), ('downG', 1, []),
                 ('downT', 1, [])]
        data = [(curlab,
                 spec_up[0, ix], spec_up[1, ix], spec_up[2, ix], spec_up[3, ix],
                 spec_down[0, ix], spec_down[1, ix], spec_down[2, ix],
                 spec_down[3, ix])
                for ix, curlab in enumerate(lab)]
        write(specfilename, alist, data)
def gen_arff(fastafilename, gcfilename, seqfilename, seq2filename, specfilename,
             num_seqs=100000, subset=False, max_pos=200, max_neg=2000,
             overwrite=False, normalise=True):
    """Create the ARFF data files for a splice-site data set if needed.

    The work is skipped when ``gcfilename`` and ``seqfilename`` both exist
    and ``overwrite`` is false.  Otherwise up to four files are produced:

    - ``seqfilename``  -- label and the sequence around the splice site
    - ``seq2filename`` -- label, upstream sequence, downstream sequence
    - ``gcfilename``   -- label and the GC content before and after the
      splice site
    - ``specfilename`` -- label plus the ``count_nt_freq`` values for the
      upstream and downstream parts; written only if a name is given

    :param fastafilename: bz2-compressed input; its name must contain
        ``'acc'`` or ``'don'`` so that the read window can be chosen.
    :param num_seqs: passed to ``read_data``.
    :param subset: when true, ``take_subset`` limits the data using
        ``max_pos`` / ``max_neg``.
    :param overwrite: force regeneration of existing files.
    :param normalise: when true, apply ``normalise_features`` to the GC and
        spectrum features.
    :raises ValueError: if neither ``'acc'`` nor ``'don'`` occurs in
        ``fastafilename``.
    """
    if not overwrite and exists(gcfilename) and exists(seqfilename):
        return

    print('Creating %s and %s from %s' % (gcfilename, seqfilename, fastafilename))

    if 'acc' in fastafilename:
        # acceptor, AG at [40:42]
        window = (-40, 197, 42)
    elif 'don' in fastafilename:
        # donor, GT or GC at [40:42]
        window = (-40, 200, 42)
    else:
        # Previously execution continued and failed on the undefined
        # ``window``; raise a meaningful error instead.
        print("Error: Cannot determine whether donor or acceptor")
        raise ValueError("Error: Cannot determine whether donor or acceptor")

    source = bz2.BZ2File(fastafilename)
    try:
        strings, lab = read_data(source, num_seqs, window)
    finally:
        source.close()

    # Only a subset of the examples are used.
    if subset:
        strings, lab = take_subset(strings, lab, max_pos, max_neg)

    upstream_range = (0, -window[0])
    downstream_range = (-window[0] + 2, -window[0] + 2 + window[2])

    gcs = count_gs_and_cs(strings, upstream_range, downstream_range)

    seq_upstream = []
    seq_downstream = []
    for sequence in strings:
        seq_upstream.append(sequence[upstream_range[0]:upstream_range[1]])
        seq_downstream.append(sequence[downstream_range[0]:downstream_range[1]])
    seq_upstream = array(seq_upstream)
    seq_downstream = array(seq_downstream)

    spec_up = count_nt_freq(seq_upstream)
    spec_down = count_nt_freq(seq_downstream)

    if normalise:
        gcs = normalise_features(gcs)
        spec_up = normalise_features(spec_up)
        spec_down = normalise_features(spec_down)

    note = 'Converted from ' + fastafilename

    # Every output goes through ``with`` so the handle is released even if
    # arffwrite fails part-way.

    # sequence file
    attributes = [('label', 1, []), ('sequence', 0, [])]
    with open(seqfilename, 'w') as out:
        arff.arffwrite(out, attributes, zip(lab, strings),
                       name=fastafilename, comment=note)

    # 2 sequence file
    attributes = [('label', 1, []),
                  ('upstream sequence', 0, []),
                  ('downstream sequence', 0, [])]
    with open(seq2filename, 'w') as out:
        arff.arffwrite(out, attributes, zip(lab, seq_upstream, seq_downstream),
                       name=fastafilename, comment=note)

    # gc contents
    attributes = [('label', 1, []), ('upstream', 1, []), ('downstream', 1, [])]
    rows = []
    for ix, curlab in enumerate(lab):
        rows.append((curlab, gcs[0, ix], gcs[1, ix]))
    with open(gcfilename, 'w') as out:
        arff.arffwrite(out, attributes, rows,
                       name=fastafilename, comment=note)

    # spectrum: truthiness also tolerates ``None`` as "no file requested"
    if not specfilename:
        return
    attributes = [('label', 1, [])]
    attributes += [('up' + nt, 1, []) for nt in 'ACGT']
    attributes += [('down' + nt, 1, []) for nt in 'ACGT']
    rows = []
    for ix, curlab in enumerate(lab):
        rows.append((curlab,)
                    + tuple(spec_up[k, ix] for k in range(4))
                    + tuple(spec_down[k, ix] for k in range(4)))
    with open(specfilename, 'w') as out:
        arff.arffwrite(out, attributes, rows,
                       name=fastafilename, comment=note)
def get_features(relations_file, pmi_file, cooccurrence_counts_file,
                 feature_file, truth_file=None, truth_function=None):
    """Write an arff file with the correct features from past experiments.

    Every record of the relations database has a key ``"a,b"`` and a value
    ``"context_sim,norm"``.  For each record one instance is produced with
    the two similarity values, the PMI and co-occurrence count looked up
    under ``"a,b"`` or ``"b,a"``, and the string ``"a-b"``.  When both
    ``truth_file`` and ``truth_function`` are given, an extra nominal
    ``truth value`` attribute is added.

    :param relations_file: hash database of pairwise similarities.
    :param pmi_file: hash database of PMI values per pair.
    :param cooccurrence_counts_file: hash database of co-occurrence counts.
    :param feature_file: path of the ARFF file to write (overwritten).
    :param truth_file: optional hash database of truth scores per pair.
    :param truth_function: optional identifier that is handed to
        ``get_truth_nominals`` and ``get_truth_function``.
    """
    opened = []

    def open_readonly(path):
        handle = db.DB()
        # Register before opening: a handle has to be closed even when
        # its open() call fails.
        opened.append(handle)
        handle.open(path, None, db.DB_HASH, db.DB_RDONLY)
        return handle

    try:
        relations_DB = open_readonly(relations_file)
        pmi_DB = open_readonly(pmi_file)
        cooccurrence_counts_DB = open_readonly(cooccurrence_counts_file)

        attribute_list = [
            ("context similarity", 1, []),
            ("normal similarity", 1, []),
            ("pmi between diseases", 1, []),
            ("times diseases cooccurred", 1, []),
            ("disease names", 0, []),
        ]

        if truth_file and truth_function:
            truth_DB = open_readonly(truth_file)
            attribute_list.append(
                ("truth value", 0, get_truth_nominals(truth_function)))
            truth_function = get_truth_function(truth_function)

        instances = []
        cursor = relations_DB.cursor()
        try:
            record = cursor.first()
            while record:
                key, value = record
                target_1, target_2 = key.split(',')
                context_sim, norm = value.split(',')

                key_1 = "%s,%s" % (target_1, target_2)
                key_2 = "%s,%s" % (target_2, target_1)

                instance = [context_sim, norm]
                ## it's possible the two diseases never cooccurr with each
                ## other, so the cooccurrence count is 0 and the PMI is
                ## uncalculatable (can't divide by infinity) so we'll
                ## mark it as a missing feature (a '?')
                instance.append(
                    pmi_DB.get(key_1) or pmi_DB.get(key_2) or '?')
                instance.append(
                    cooccurrence_counts_DB.get(key_1) or
                    cooccurrence_counts_DB.get(key_2) or 0)
                instance.append("%s-%s" % (target_1, target_2))

                if truth_file and truth_function:
                    pearson = truth_DB.get(key_1) or truth_DB.get(key_2)
                    if pearson:
                        instance.append(truth_function(float(pearson)))
                    else:
                        instance.append('?')

                instances.append(instance)
                record = cursor.next()
        finally:
            # A cursor must be closed before its database.
            cursor.close()
    finally:
        for handle in reversed(opened):
            handle.close()

    with open(feature_file, 'w') as arff_file:
        arff.arffwrite(arff_file, attribute_list, instances,
                       name='comorbidity')
def get_features(relations_file, pmi_file, cooccurrence_counts_file,
                 feature_file, truth_file=None, truth_function=None):
    """Collect per-pair features from the databases and write an arff file.

    The relations database is walked record by record.  Each key is a pair
    ``"a,b"`` and each value ``"context_sim,norm"``.  The resulting instance
    holds both similarities, the PMI (``'?'`` if missing), the co-occurrence
    count (``0`` if missing) and the name ``"a-b"``; PMI and count are
    looked up under either ordering of the pair.  If ``truth_file`` and
    ``truth_function`` are both supplied, a nominal ``truth value`` is
    appended (``'?'`` when no truth score is stored for the pair).

    :param relations_file: hash database with the pairwise similarities.
    :param pmi_file: hash database with PMI values.
    :param cooccurrence_counts_file: hash database with co-occurrence counts.
    :param feature_file: destination ARFF path (overwritten).
    :param truth_file: optional hash database with truth scores.
    :param truth_function: optional identifier passed to
        ``get_truth_nominals`` / ``get_truth_function``.
    """
    def either_order(database, forward, backward):
        # Pairs may be stored under "a,b" or "b,a".
        return database.get(forward) or database.get(backward)

    relations_DB = db.DB()
    pmi_DB = db.DB()
    cooccurrence_counts_DB = db.DB()
    truth_DB = None
    cursor = None

    try:
        relations_DB.open(relations_file, None, db.DB_HASH, db.DB_RDONLY)
        pmi_DB.open(pmi_file, None, db.DB_HASH, db.DB_RDONLY)
        cooccurrence_counts_DB.open(cooccurrence_counts_file, None,
                                    db.DB_HASH, db.DB_RDONLY)

        attribute_list = [("context similarity", 1, []),
                          ("normal similarity", 1, []),
                          ("pmi between diseases", 1, []),
                          ("times diseases cooccurred", 1, []),
                          ("disease names", 0, [])]

        if truth_file and truth_function:
            truth_DB = db.DB()
            truth_DB.open(truth_file, None, db.DB_HASH, db.DB_RDONLY)
            attribute_list.append(
                ("truth value", 0, get_truth_nominals(truth_function)))
            truth_function = get_truth_function(truth_function)

        instances = []
        cursor = relations_DB.cursor()
        record = cursor.first()
        while record:
            key, value = record
            target_1, target_2 = key.split(',')
            context_sim, norm = value.split(',')
            key_1 = "%s,%s" % (target_1, target_2)
            key_2 = "%s,%s" % (target_2, target_1)

            ## it's possible the two diseases never cooccurr with each
            ## other, so the cooccurrence count is 0 and the PMI is
            ## uncalculatable (can't divide by infinity) so we'll
            ## mark it as a missing feature (a '?')
            instance = [
                context_sim,
                norm,
                either_order(pmi_DB, key_1, key_2) or '?',
                either_order(cooccurrence_counts_DB, key_1, key_2) or 0,
                "%s-%s" % (target_1, target_2),
            ]

            if truth_file and truth_function:
                pearson = either_order(truth_DB, key_1, key_2)
                if pearson:
                    instance.append(truth_function(float(pearson)))
                else:
                    instance.append('?')

            instances.append(instance)
            record = cursor.next()
    finally:
        # Release the cursor first, then the databases in reverse order of
        # opening; previously none of these were ever closed.
        if cursor is not None:
            cursor.close()
        for handle in (truth_DB, cooccurrence_counts_DB, pmi_DB,
                       relations_DB):
            if handle is not None:
                handle.close()

    with open(feature_file, 'w') as arff_file:
        arff.arffwrite(arff_file, attribute_list, instances,
                       name='comorbidity')
def numpy2arff(filename, matrix):
    """Write a 2-D matrix to ``filename`` as an ARFF file.

    One attribute named ``a000``, ``a001``, ... is declared per column.  The
    last column is declared with type code 0 and the distinct values found
    in it as its value list (presumably the nominal class column -- confirm
    against ``arff.arffwrite``); all other columns use type code 1.

    :param filename: path of the ARFF file to create (overwritten).
    :param matrix: 2-D array supporting ``shape`` and ``[:, -1]`` indexing.
    """
    alist = [("a%03d" % (col,), 1, []) for col in range(matrix.shape[1])]
    alist[-1] = (alist[-1][0], 0, list(set(matrix[:, -1])))

    # Same diagnostic output as before: the three values separated by spaces.
    print("%s %s %s" % (filename, alist, matrix))

    # Close the file explicitly so the written data is flushed to disk.
    with open(filename, "wb") as out:
        arff.arffwrite(out, alist, matrix)