def read_self_mate_map_data_and_trim(cursor, conn, sample, species, DB_NAME, tablename, table_raw):
    """Bulk-load the mapped reads of one sample into ``tablename``.

    ``samtools view -F 4`` is run on the sample's ``self_refseq``,
    ``mate_refseq`` and ``mate_genome`` files (paths resolved through
    ``d00.get_sample_file``) and one row per output record is inserted
    through a private connection to ``DB_NAME``.  Refseq records are stored
    with the fixed values ``'T'``, ``1000`` and ``'101M'``; genome records
    keep the fields at index 2, 3 and 5, the last one cut to 20 characters.

    Args:
        cursor: Cursor passed to ``d00.get_sample_file``.
        conn: Not used by this function.
        sample: Sample whose files are read.
        species: ``'mm10'`` makes the field at index 2 get a ``'chr'`` prefix.
        DB_NAME: Database the private connection is opened on.
        tablename: Target table; concatenated into the SQL text unescaped.
        table_raw: Not used by this function.
    """

    def mapped_records(filename):
        """Yield the whitespace-split fields of each samtools output line."""
        cmd = 'samtools view -F 4 ' + d00.get_sample_file(cursor, sample, filename)
        proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
        try:
            for line in proc.stdout:
                yield line.split()
        finally:
            # Reap the child so finished samtools processes do not pile up.
            proc.stdout.close()
            proc.wait()

    # NOTE(review): connection credentials are hard-coded; consider moving
    # them to configuration.
    conn1 = mb.connect(host="localhost", user="******", passwd="123456", db=DB_NAME)
    try:
        cursor1 = conn1.cursor()
        for label, filename in (('S_R', 'self_refseq'), ('M_R', 'mate_refseq')):
            values = [(t[0], label, 'T', '101M') for t in mapped_records(filename)]
            cursor1.executemany(
                "insert into " + tablename + " values(%s,%s,%s,1000,%s) ", values)
            conn1.commit()

        values = []
        for t in mapped_records('mate_genome'):
            chrom = 'chr' + t[2] if species == 'mm10' else t[2]
            values.append((t[0], 'M_G', chrom, t[3], t[5][0:20]))
        cursor1.executemany(
            "insert into " + tablename + " values(%s,%s,%s,%s,%s) ", values)
        conn1.commit()
    finally:
        # The private connection used to be left open on every call.
        conn1.close()
def FPKM_DB(cursor, conn, genes_table, samples, intype, insert, tablename):
    """Copy ``genes_table`` into ``tablename`` and add one FPKM column per sample.

    The table is created as a copy of ``genes_table`` (with an index on
    ``gene``); any failure of that step is reported as "exists" and ignored.
    For each sample a float column named ``<exp><insert>`` is added and
    filled from the sample's ``intype`` file: the first line is skipped,
    column 0 is taken as the gene and column 9 as the value.

    Only strictly positive values of known genes are written.  The original
    test compared the raw text with ``0``, which is always true on Python 2,
    so zero values were written as well; rows with a zero value now keep
    whatever the column already holds (``0`` for a freshly added column).

    Args:
        cursor: Database cursor, also passed to the ``d00`` helpers.
        conn: Connection used for the commits.
        genes_table: Source table; must expose a ``gene`` column.
        samples: Iterable of sample names.
        intype: File type looked up through ``d00.get_sample_file``.
        insert: Suffix appended to the sample's ``exp`` value to form the
            column name.
        tablename: Table to create/update (interpolated into SQL unescaped).

    Raises:
        ValueError: If column 9 of a data line is not numeric.
    """
    sql = 'create table %s select * from %s' % (tablename, genes_table)
    genes = d00.table_2_dict(cursor, genes_table, ['gene', 'gene'])
    try:
        cursor.execute(sql)
        conn.commit()
        cursor.execute("create index reci on " + tablename + "(gene);")
        conn.commit()
    except Exception:
        # Best effort: the table/index is assumed to be there already.
        print("exists")

    for sample in samples:
        exp = d00.get_sample_info(cursor, sample, 'exp') + insert
        try:
            cursor.execute("alter table " + tablename + " add " + exp + " float DEFAULT '0'")
            conn.commit()
        except Exception:
            # Trailing comma: Python 2 keeps the following path on this line.
            print("EXISTS"),

        path = d00.get_sample_file(cursor, sample, intype)
        fpkm = []
        with open(path) as handle:
            handle.readline()  # skip the header line
            print(path)
            for line in handle:
                fields = re.split(r'\s+', line)
                if len(fields) < 10:
                    # Blank or truncated line: nothing to read in column 9.
                    continue
                if float(fields[9]) > 0 and fields[0] in genes:
                    fpkm.append([fields[9], fields[0]])
        cursor.executemany(
            "update " + tablename + " set " + exp + "=%s where gene = %s", fpkm)
        conn.commit()
def FPKM_DB(cursor, conn, genes_table, samples, intype, insert, tablename):
    """Copy ``genes_table`` into ``tablename`` and add one FPKM column per sample.

    The table is created as a copy of ``genes_table`` (with an index on
    ``gene``); any failure of that step is reported as "exists" and ignored.
    For each sample a float column named ``<exp><insert>`` is added and
    filled from the sample's ``intype`` file: the first line is skipped,
    column 0 is taken as the gene and column 9 as the value.

    Only strictly positive values of known genes are written.  The original
    test compared the raw text with ``0``, which is always true on Python 2,
    so zero values were written as well; rows with a zero value now keep
    whatever the column already holds (``0`` for a freshly added column).

    Args:
        cursor: Database cursor, also passed to the ``d00`` helpers.
        conn: Connection used for the commits.
        genes_table: Source table; must expose a ``gene`` column.
        samples: Iterable of sample names.
        intype: File type looked up through ``d00.get_sample_file``.
        insert: Suffix appended to the sample's ``exp`` value to form the
            column name.
        tablename: Table to create/update (interpolated into SQL unescaped).

    Raises:
        ValueError: If column 9 of a data line is not numeric.
    """
    sql = 'create table %s select * from %s' % (tablename, genes_table)
    genes = d00.table_2_dict(cursor, genes_table, ['gene', 'gene'])
    try:
        cursor.execute(sql)
        conn.commit()
        cursor.execute("create index reci on " + tablename + "(gene);")
        conn.commit()
    except Exception:
        # Best effort: the table/index is assumed to be there already.
        print("exists")

    for sample in samples:
        exp = d00.get_sample_info(cursor, sample, 'exp') + insert
        try:
            cursor.execute("alter table " + tablename + " add " + exp + " float DEFAULT '0'")
            conn.commit()
        except Exception:
            # Trailing comma: Python 2 keeps the following path on this line.
            print("EXISTS"),

        path = d00.get_sample_file(cursor, sample, intype)
        fpkm = []
        with open(path) as handle:
            handle.readline()  # skip the header line
            print(path)
            for line in handle:
                fields = re.split(r'\s+', line)
                if len(fields) < 10:
                    # Blank or truncated line: nothing to read in column 9.
                    continue
                if float(fields[9]) > 0 and fields[0] in genes:
                    fpkm.append([fields[9], fields[0]])
        cursor.executemany(
            "update " + tablename + " set " + exp + "=%s where gene = %s", fpkm)
        conn.commit()
def BOWTIE_alignment(cursor, conn, samples, species, ref, ins, outdir, rec):
    """Register a BOWTIE.pair.sh job per sample and return the pending ones.

    The expected BAM path and the command line of every sample are stored in
    the ``files`` table (``replace into``) and committed immediately.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed after each ``replace``.
        samples: Iterable of sample names.
        species: First-level key into the module-level ``refall`` mapping.
        ref: Key selecting the entry under ``refall[species]['bowtie2']``.
        ins: Two file types; ``ins[0]`` and ``ins[1]`` give the two inputs.
        outdir: Directory the BAM path is built under.
        rec: Suffix of the BAM name and ``type`` value stored in ``files``.

    Returns:
        Commands of the samples whose BAM file does not exist yet.
    """
    template = 'bash /data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/scripts/BOWTIE.pair.sh %s %s %s %s'
    register = "replace into files (sample,type,path,method)values(%s,%s,%s,%s)"
    pending = []
    for sample in samples:
        bam = ''.join([outdir, '/BAM.anchor_genome.', sample, '_', rec, '.bam'])
        mate1 = d00.get_sample_file(cursor, sample, ins[0])
        mate2 = d00.get_sample_file(cursor, sample, ins[1])
        # bam[:-4] drops the ".bam" extension to get the output prefix.
        command = template % (mate1, mate2, bam[:-4], refall[species]['bowtie2'][ref])
        cursor.execute(register, [sample, rec, bam, command])
        conn.commit()
        if os.path.exists(bam):
            continue
        pending.append(command)
    return pending
def BOWTIE_alignment(cursor, conn, samples, species, in1, in2, rec, para, outdir):
    """Register a cRNA BOWTIE.pair.sh job per sample and return the pending ones.

    The expected BAM path and the command line of every sample are stored in
    the ``files`` table (``replace into``) and committed immediately.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed after each ``replace``.
        samples: Iterable of sample names.
        species: Key into the module-level ``Ref`` mapping; the entry
            ``Ref[species]['bowtie2']['genome']`` is put on the command line.
        in1: File type of the first input.
        in2: File type of the second input.
        rec: Suffix of the SAM/BAM names and ``type`` stored in ``files``.
        para: Extra value appended as the last command argument.
        outdir: Directory the SAM and BAM paths are built under.

    Returns:
        Commands of the samples whose BAM file does not exist yet.
    """
    template = 'bash /data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/cRNA/BOWTIE.pair.sh %s %s %s %s %s %s'
    register = "replace into files (sample,type,path,method)values(%s,%s,%s,%s)"
    pending = []
    for sample in samples:
        tag = sample + '_' + rec
        sam = outdir + '/SAM.anchor.' + tag + '.sam'
        bam = outdir + '/BAM.anchor_genome.' + tag + '.bam'
        mate1 = d00.get_sample_file(cursor, sample, in1)
        mate2 = d00.get_sample_file(cursor, sample, in2)
        # bam[:-4] drops the ".bam" extension to get the output prefix.
        command = template % (
            mate1, mate2, sam, bam[:-4], Ref[species]['bowtie2']['genome'], para)
        cursor.execute(register, [sample, rec, bam, command])
        conn.commit()
        if os.path.exists(bam):
            continue
        pending.append(command)
    return pending
def BWA_PAIRED(cursor, conn, specise, ref, samples, intype, folder, rec):
    """Register a BWA.pair.sh job per sample in the ``files`` table.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed once all samples are registered.
        specise: First-level key into the module-level ``refall`` mapping.
        ref: Key selecting the entry under ``refall[specise]['bwa']``.
        samples: Iterable of sample names.
        intype: Two file types; ``intype[0]`` and ``intype[1]`` give the inputs.
        folder: Output directory, also the first script argument.
        rec: Prefix of the job tag and ``type`` value stored in ``files``.

    Returns:
        One entry per sample: the value returned by ``add_files`` for the
        command.  NOTE(review): sibling builders return the bare command and
        keep the ``add_files`` result only as the stored method - confirm
        which one callers expect here.
    """
    script = "/data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/scripts/BWA.pair.sh"
    recorded = []
    for sample in samples:
        tag = rec + '_' + sample
        mate1 = d00.get_sample_file(cursor, sample, intype[0])
        mate2 = d00.get_sample_file(cursor, sample, intype[1])
        bam = folder + '/BAM' + tag + '.bam'
        arguments = [
            'bash ' + script,
            folder,
            mate1,
            mate2,
            refall[specise]['bwa'][ref],
            bam[:-4],  # output prefix: path without ".bam"
            tag,
        ]
        command = add_files(' '.join(arguments), script)
        cursor.execute(
            "replace into files (sample,type,path,method)values(%s,%s,%s,%s) ",
            [sample, rec, bam, command])
        recorded.append(command)
    conn.commit()
    return recorded
def BOWTIE_PAIRED(cursor, conn, samples, species, ref, ins, outdir, usage, rec, server="TANG"):
    """Register a ``bowtie_pair`` script job per sample and return the pending ones.

    Reference and script locations come from ``d00.get_ref`` and
    ``d00.get_script``.  The stored method is the command followed by the
    script text returned by ``d00.get_script``.

    Args:
        cursor: Database cursor, also passed to the ``d00`` helpers.
        conn: Connection committed after each ``replace``.
        samples: Iterable of sample names.
        species: Species passed to ``d00.get_ref``.
        ref: Reference name passed to ``d00.get_ref``.
        ins: Two file types; ``ins[0]`` and ``ins[1]`` give the two inputs.
        outdir: Directory the BAM path is built under.
        usage: Last argument of the command line.
        rec: Part of the BAM name and ``type`` value stored in ``files``.
        server: Server label passed to the ``d00`` lookups and stored with
            the row.

    Returns:
        Commands of the samples whose BAM file does not exist yet.
    """
    register = "replace into files (sample,type,path,method,server)values(%s,%s,%s,%s,%s)"
    pending = []
    for sample in samples:
        bam = ''.join([outdir, '/BAM.', rec, '_', sample, '.bam'])
        mate1 = d00.get_sample_file(cursor, sample, ins[0])
        mate2 = d00.get_sample_file(cursor, sample, ins[1])
        reference = d00.get_ref(cursor, species, 'bowtie2', ref, server)
        script, script_text = d00.get_script(cursor, 'bowtie_pair', server)
        command = 'bash %s %s %s %s %s %s' % (
            script, mate1, mate2, bam[:-4], reference, usage)
        cursor.execute(
            register, [sample, rec, bam, command + " \n" + script_text, server])
        conn.commit()
        if os.path.exists(bam):
            continue
        pending.append(command)
    return pending
def TOPHAT_PAIRED(cursor, conn, specise, ref, samples, intype, report, folder, rec, server="TANG"):
    """Register a TOPHAT.pair.sh job per sample and return all commands.

    The script is looked up under the module-level ``dirname`` prefix; the
    stored method is whatever ``add_files`` returns for the command.

    Args:
        cursor: Database cursor, also passed to the ``d00`` helpers.
        conn: Connection committed once all samples are registered.
        specise: Species passed to ``d00.get_ref``.
        ref: Reference name passed to ``d00.get_ref``.
        samples: Iterable of sample names.
        intype: Two file types; ``intype[0]`` and ``intype[1]`` give the inputs.
        report: Last argument of the command line.
        folder: Parent directory of the per-sample output directory.
        rec: Prefix of the job tag and ``type`` value stored in ``files``.
        server: Server label passed to ``d00.get_ref``.

    Returns:
        The command of every sample, in input order.
    """
    script = dirname + "scripts/TOPHAT.pair.sh"
    commands = []
    for sample in samples:
        tag = rec + '_' + sample
        outdir = folder + '/' + tag
        mate1 = d00.get_sample_file(cursor, sample, intype[0])
        mate2 = d00.get_sample_file(cursor, sample, intype[1])
        bam = outdir + '/BAM' + tag + '.bam'
        reference = d00.get_ref(cursor, specise, 'bowtie2', ref, server)
        command = "bash %sscripts/TOPHAT.pair.sh %s %s %s %s %s %s %s" % (
            dirname, outdir, mate1, mate2, reference, bam[:-4], tag, report)
        cursor.execute(
            "replace into files (sample,type,path,method)values(%s,%s,%s,%s)",
            [sample, rec, bam, add_files(command, script)])
        commands.append(command)
    conn.commit()
    return commands
def BWA_PAIRED(cursor, conn, specise, ref, samples, intype, folder, rec, server="TANG"):
    """Register a ``bwa_pair`` script job per sample and return all commands.

    Script and reference locations come from ``d00.get_script`` and
    ``d00.get_ref``.  The stored method is the command followed by the
    script text returned by ``d00.get_script``.

    Args:
        cursor: Database cursor, also passed to the ``d00`` helpers.
        conn: Connection committed once all samples are registered.
        specise: Species passed to ``d00.get_ref``.
        ref: Reference name passed to ``d00.get_ref``.
        samples: Iterable of sample names.
        intype: Two file types; ``intype[0]`` and ``intype[1]`` give the inputs.
        folder: Output directory, also the first script argument.
        rec: Prefix of the job tag and ``type`` value stored in ``files``.
        server: Server label passed to the ``d00`` lookups and stored with
            the row.

    Returns:
        The command of every sample, in input order.
    """
    register = "replace into files (sample,type,path,method,server) values(%s,%s,%s,%s,%s) "
    commands = []
    for sample in samples:
        tag = rec + '_' + sample
        mate1 = d00.get_sample_file(cursor, sample, intype[0])
        mate2 = d00.get_sample_file(cursor, sample, intype[1])
        bam = folder + '/BAM' + tag + '.bam'
        script, script_text = d00.get_script(cursor, 'bwa_pair', server)
        reference = d00.get_ref(cursor, specise, 'bwa', ref, server)
        command = "bash %s %s %s %s %s %s %s" % (
            script, folder, mate1, mate2, reference, bam[:-4], tag)
        cursor.execute(
            register, [sample, rec, bam, command + " \n" + script_text, server])
        commands.append(command)
    conn.commit()
    return commands
def BOWTIE_alignment(cursor, conn, samples, species, ref, ins, outdir, rec):
    """Register a BOWTIE.pair.sh job per sample and return the pending ones.

    The expected BAM path and the command line of every sample are stored in
    the ``files`` table (``replace into``) and committed immediately.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed after each ``replace``.
        samples: Iterable of sample names.
        species: First-level key into the module-level ``refall`` mapping.
        ref: Key selecting the entry under ``refall[species]['bowtie2']``.
        ins: Two file types; ``ins[0]`` and ``ins[1]`` give the two inputs.
        outdir: Directory the BAM path is built under.
        rec: Suffix of the BAM name and ``type`` value stored in ``files``.

    Returns:
        Commands of the samples whose BAM file does not exist yet.
    """
    template = 'bash /data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/scripts/BOWTIE.pair.sh %s %s %s %s'
    register = "replace into files (sample,type,path,method)values(%s,%s,%s,%s)"
    pending = []
    for sample in samples:
        bam = ''.join([outdir, '/BAM.anchor_genome.', sample, '_', rec, '.bam'])
        mate1 = d00.get_sample_file(cursor, sample, ins[0])
        mate2 = d00.get_sample_file(cursor, sample, ins[1])
        # bam[:-4] drops the ".bam" extension to get the output prefix.
        command = template % (mate1, mate2, bam[:-4], refall[species]['bowtie2'][ref])
        cursor.execute(register, [sample, rec, bam, command])
        conn.commit()
        if os.path.exists(bam):
            continue
        pending.append(command)
    return pending
def circularRNA_tophat_pair(cursor, conn, samples, species, ref, insert, out, in1, in2):
    """Prepare the cRNA_TOPHAT.pair.sh job of every sample.

    A per-sample directory ``<out>/<sample>`` is created when missing, and
    the BAM path plus command are recorded in ``files`` under the fixed type
    ``'BAM_PE_CRNA_SORT'`` (``insert ignore``).

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed once all samples are registered.
        samples: Iterable of sample names.
        species: First-level key into the module-level ``refall`` mapping.
        ref: Key selecting the entry under ``refall[species]['bowtie2']``.
        insert: Label used in the BAM name and passed to the script.
        out: Parent directory of the per-sample output directories.
        in1: File type of the first input.
        in2: File type of the second input.

    Returns:
        Commands of the samples whose BAM file does not exist yet.
    """
    # The trailing blank is part of the original command text.
    launcher = 'bash /data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/scripts/cRNA_TOPHAT.pair.sh '
    pending = []
    for sample in samples:
        outdir = out + '/' + sample
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        bam = ''.join([outdir, '/BAM_PE_', sample, '_', insert, '.bam'])
        mate1 = d00.get_sample_file(cursor, sample, in1)
        mate2 = d00.get_sample_file(cursor, sample, in2)
        command = "%s %s %s %s %s %s %s" % (
            launcher, outdir, mate1, mate2, bam[:-4], insert,
            refall[species]['bowtie2'][ref])
        if not os.path.exists(bam):
            pending.append(command)
        cursor.execute(
            "insert ignore into files values(%s,%s,%s,NULL,%s)",
            [sample, 'BAM_PE_CRNA_SORT', bam, command])
    conn.commit()
    return pending
def GET_BINNED_COUNTS_ALONG_GENOME_Multiple(cursor, conn, samples, bamfile, binsize, chrom_size, tablename):
    """Count reads per genomic bin for several samples and store them.

    A table with columns ``chr``, ``bin_id`` and one ``int`` column per
    sample is created through ``in05.CREATE_TABLE``.  Chromosome names and
    lengths are read from ``chrom_size``; bins start at position 1 and
    advance by ``binsize``, and ``bin_id`` is ``pos // binsize``.  All rows
    are collected in memory and written with one ``insert ignore``.

    NOTE(review): windows start at 1 while pysam regions are usually
    0-based - confirm the intended coordinate convention.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed after the bulk insert.
        samples: List of sample names (concatenated to the fixed columns).
        bamfile: File type of the BAM looked up for every sample.
        binsize: Integer bin width.
        chrom_size: Table providing ``chr`` and ``length`` columns.
        tablename: Table to create and fill (interpolated into SQL unescaped).
    """
    # Trailing comma: Python 2 keeps the following output on the same line.
    print(tablename),
    colnames = ['chr', 'bin_id'] + samples
    types = ['varchar(20)', 'int'] + ['int'] * len(samples)
    print("%s %s" % (colnames, types))
    in05.CREATE_TABLE(cursor, tablename, colnames, types)

    cursor.execute("select chr,`length` from " + chrom_size)
    chromosomes = cursor.fetchall()

    handles = []
    counts = []
    try:
        for sample in samples:
            bamname = d00.get_sample_file(cursor, sample, bamfile)
            handles.append(pysam.Samfile(bamname, "rb"))
        for row in chromosomes:
            chrom = row[0]
            length = int(row[1])
            for pos in range(1, length, binsize):
                end = min(pos + binsize, length)
                bin_info = [chrom, pos // binsize]
                for handle in handles:
                    bin_info.append(handle.count(chrom, pos, end))
                counts.append(bin_info)
    finally:
        # The BAM handles used to stay open after the function returned.
        for handle in handles:
            handle.close()

    placeholders = ",".join([" %s"] * len(colnames))
    up_cmd = "insert ignore into " + tablename + " values(" + placeholders + ")"
    print(up_cmd)
    cursor.executemany(up_cmd, counts)
    conn.commit()
def summarize_pair_map_conditions_db(cursor, samples, dbname, tablename, in1, dist, anchor_length):
    """Build one ``summarize_reads_db.py`` command per sample.

    Nothing is written to the database; the cursor is only handed to
    ``d00.get_sample_file`` to resolve each sample's ``in1`` file.

    Args:
        cursor: Cursor passed to ``d00.get_sample_file``.
        samples: Iterable of sample names.
        dbname: First script argument.
        tablename: Second script argument.
        in1: File type of the BAM passed as third argument.
        dist: Fourth script argument.
        anchor_length: Sixth script argument (the sample name is the fifth).

    Returns:
        The command strings, in sample order.
    """
    template = 'python /data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/cRNA/summarize_reads_db.py %s %s %s %s %s %s'
    return [
        template % (
            dbname, tablename, d00.get_sample_file(cursor, sample, in1),
            dist, sample, anchor_length)
        for sample in samples
    ]
def RPKM_DB(cursor, conn, g_t, gene_length, totalreads, samples, intype, insert, tablename):
    """Copy ``gene_length`` into ``tablename`` and add one RPKM column per sample.

    The table is created as a copy of ``gene_length`` (with an index on
    ``gene``); any failure of that step is reported as "exists" and ignored.
    For each sample the counts in its ``intype`` file (column 0: transcript,
    column 1: count) are summed per gene using the ``g_t`` mapping, and
    ``count * 1e9 / (total * length)`` is written to the column
    ``<exp><insert>``.

    Args:
        cursor: Database cursor, also passed to the ``d00`` helpers.
        conn: Connection used for the commits.
        g_t: Table mapping ``transc`` to ``gene``.
        gene_length: Table mapping ``gene`` to ``length``.
        totalreads: Sample-info field holding the sample's total read count.
        samples: Iterable of sample names.
        intype: File type looked up through ``d00.get_sample_file``.
        insert: Suffix appended to the sample's ``exp`` value to form the
            column name.
        tablename: Table to create/update (interpolated into SQL unescaped).

    Raises:
        ZeroDivisionError: If a sample's total or a gene length is zero.
    """
    sql = 'create table %s select * from %s' % (tablename, gene_length)
    try:
        cursor.execute(sql)
        conn.commit()
        cursor.execute("create index reci on " + tablename + "(gene);")
        conn.commit()
    except Exception:
        # Best effort: the table/index is assumed to be there already.
        print("exists")

    gt = d00.table_2_dict(cursor, g_t, ['transc', 'gene'])
    gl = d00.table_2_dict(cursor, gene_length, ['gene', 'length'])

    for sample in samples:
        total = int(d00.get_sample_info(cursor, sample, totalreads))
        exp = d00.get_sample_info(cursor, sample, 'exp') + insert
        try:
            cursor.execute("alter table " + tablename + " add " + exp + " float DEFAULT '0'")
        except Exception:
            print("EXISTS_colume")

        count = {}
        with open(d00.get_sample_file(cursor, sample, intype)) as handle:
            for line in handle:
                fields = re.split(r'\s+', line)
                if fields[0] not in gt:
                    continue
                gene = gt[fields[0]]
                if gene in gl:
                    count[gene] = count.get(gene, 0) + int(fields[1])

        values = [
            [float(count[gene] * 1000000 * 1000) / (total * int(gl[gene])), gene]
            for gene in count
        ]
        # Progress output; the second row is shown only when there is one
        # (the unguarded lookup raised IndexError for fewer than two genes).
        if len(values) > 1:
            print("%s %s" % (len(values), values[1]))
        else:
            print(len(values))
        cursor.executemany(
            "update " + tablename + " set " + exp + "=%s where gene = %s", values)
        conn.commit()
def FILES_GROUPER(cursor, conn, samples, newsName, intype, sep):
    """Store the joined ``intype`` paths of several samples under one name.

    Every path is followed by ``sep`` (including the last one) and the
    resulting string is written to ``files`` for the pseudo-sample
    ``newsName``.  The printed value drops the final character.

    NOTE(review): the stored path keeps the trailing separator while the
    printed one does not - confirm that consumers expect it.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed after the ``replace``.
        samples: Iterable of sample names.
        newsName: Value stored in the ``sample`` column.
        intype: File type to look up; also stored as the row's ``type``.
        sep: Separator appended after each path.
    """
    grouped = "".join(
        [d00.get_sample_file(cursor, sample, intype) + sep for sample in samples])
    print(grouped[:-1])
    method = "FILES_GROUPER # " + intype
    cursor.execute(
        "replace into files (sample,type,path,method)values(%s,%s,%s,%s)",
        [newsName, intype, grouped, method])
    conn.commit()
def circularRNA_tophat_pair(cursor, conn, samples, species, ref, out, in1, in2, rec, server="TANG"):
    """Prepare the cRNA_TOPHAT.pair.sh job of every sample.

    A per-sample directory ``<out>/<rec>_<sample>`` is created when missing,
    and the BAM path plus command are recorded in ``files`` under type
    ``rec`` (``insert ignore``).

    Args:
        cursor: Database cursor, also passed to the ``d00`` helpers.
        conn: Connection committed once all samples are registered.
        samples: Iterable of sample names.
        species: Species passed to ``d00.get_ref``.
        ref: Reference name passed to ``d00.get_ref``.
        out: Parent directory of the per-sample output directories.
        in1: File type of the first input.
        in2: File type of the second input.
        rec: Prefix of directory/BAM names, script argument and stored type.
        server: Server label passed to ``d00.get_ref``.

    Returns:
        Commands of the samples whose BAM file does not exist yet.
    """
    # The trailing blank is part of the original command text.
    launcher = 'bash /data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/scripts/cRNA_TOPHAT.pair.sh '
    pending = []
    for sample in samples:
        tag = rec + "_" + sample
        outdir = out + '/' + tag
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        bam = outdir + '/' + tag + '.bam'
        mate1 = d00.get_sample_file(cursor, sample, in1)
        mate2 = d00.get_sample_file(cursor, sample, in2)
        reference = d00.get_ref(cursor, species, 'bowtie2', ref, server)
        command = "%s %s %s %s %s %s %s" % (
            launcher, outdir, mate1, mate2, bam[:-4], rec, reference)
        if not os.path.exists(bam):
            pending.append(command)
        cursor.execute(
            "insert ignore into files (sample,type,path,method)values(%s,%s,%s,%s)",
            [sample, rec, bam, command])
    conn.commit()
    return pending
def BOWTIE_alignment(cursor, conn, samples, species, in1, in2, rec, para, outdir):
    """Register a cRNA BOWTIE.pair.sh job per sample and return the pending ones.

    The expected BAM path and the command line of every sample are stored in
    the ``files`` table (``replace into``) and committed immediately.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed after each ``replace``.
        samples: Iterable of sample names.
        species: Key into the module-level ``Ref`` mapping; the entry
            ``Ref[species]['bowtie2']['genome']`` is put on the command line.
        in1: File type of the first input.
        in2: File type of the second input.
        rec: Suffix of the SAM/BAM names and ``type`` stored in ``files``.
        para: Extra value appended as the last command argument.
        outdir: Directory the SAM and BAM paths are built under.

    Returns:
        Commands of the samples whose BAM file does not exist yet.
    """
    template = 'bash /data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/cRNA/BOWTIE.pair.sh %s %s %s %s %s %s'
    register = "replace into files (sample,type,path,method)values(%s,%s,%s,%s)"
    pending = []
    for sample in samples:
        tag = sample + '_' + rec
        sam = outdir + '/SAM.anchor.' + tag + '.sam'
        bam = outdir + '/BAM.anchor_genome.' + tag + '.bam'
        mate1 = d00.get_sample_file(cursor, sample, in1)
        mate2 = d00.get_sample_file(cursor, sample, in2)
        # bam[:-4] drops the ".bam" extension to get the output prefix.
        command = template % (
            mate1, mate2, sam, bam[:-4], Ref[species]['bowtie2']['genome'], para)
        cursor.execute(register, [sample, rec, bam, command])
        conn.commit()
        if os.path.exists(bam):
            continue
        pending.append(command)
    return pending
def summarize_pair_map_conditions_db(cursor, samples, dbname, tablename, in1, dist, anchor_length):
    """Build one ``summarize_reads_db.py`` command per sample.

    Nothing is written to the database; the cursor is only handed to
    ``d00.get_sample_file`` to resolve each sample's ``in1`` file.

    Args:
        cursor: Cursor passed to ``d00.get_sample_file``.
        samples: Iterable of sample names.
        dbname: First script argument.
        tablename: Second script argument.
        in1: File type of the BAM passed as third argument.
        dist: Fourth script argument.
        anchor_length: Sixth script argument (the sample name is the fifth).

    Returns:
        The command strings, in sample order.
    """
    template = 'python /data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/cRNA/summarize_reads_db.py %s %s %s %s %s %s'
    return [
        template % (
            dbname, tablename, d00.get_sample_file(cursor, sample, in1),
            dist, sample, anchor_length)
        for sample in samples
    ]
def BAM_FLAGSTAT(cursor, conn, samples, intype, folder, rec):
    """Register a ``samtools flagstat`` job per sample and return all commands.

    The report path is ``<folder>/flagstat.<sample>_<intype>.txt``; it is
    stored in ``files`` under type ``rec`` together with the command.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed once all samples are registered.
        samples: Iterable of sample names.
        intype: File type of the input; also part of the report name.
        folder: Directory the report is written to.
        rec: ``type`` value stored in ``files``.

    Returns:
        The command of every sample, in input order.
    """
    register = "replace into files (sample,type,path,method)values(%s,%s,%s,%s)"
    commands = []
    for sample in samples:
        report = ''.join([folder, '/flagstat.', sample, "_", intype, '.txt'])
        source = d00.get_sample_file(cursor, sample, intype)
        command = ''.join(['samtools flagstat ', source, ' > ', report])
        cursor.execute(register, [sample, rec, report, command])
        commands.append(command)
    conn.commit()
    return commands
def circularRNA_tophat_pair(cursor, conn, samples, species, ref, insert, out, in1, in2):
    """Prepare the cRNA_TOPHAT.pair.sh job of every sample.

    A per-sample directory ``<out>/<sample>`` is created when missing, and
    the BAM path plus command are recorded in ``files`` under the fixed type
    ``'BAM_PE_CRNA_SORT'`` (``insert ignore``).

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed once all samples are registered.
        samples: Iterable of sample names.
        species: First-level key into the module-level ``refall`` mapping.
        ref: Key selecting the entry under ``refall[species]['bowtie2']``.
        insert: Label used in the BAM name and passed to the script.
        out: Parent directory of the per-sample output directories.
        in1: File type of the first input.
        in2: File type of the second input.

    Returns:
        Commands of the samples whose BAM file does not exist yet.
    """
    # The trailing blank is part of the original command text.
    launcher = 'bash /data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/scripts/cRNA_TOPHAT.pair.sh '
    pending = []
    for sample in samples:
        outdir = out + '/' + sample
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        bam = ''.join([outdir, '/BAM_PE_', sample, '_', insert, '.bam'])
        mate1 = d00.get_sample_file(cursor, sample, in1)
        mate2 = d00.get_sample_file(cursor, sample, in2)
        command = "%s %s %s %s %s %s %s" % (
            launcher, outdir, mate1, mate2, bam[:-4], insert,
            refall[species]['bowtie2'][ref])
        if not os.path.exists(bam):
            pending.append(command)
        cursor.execute(
            "insert ignore into files values(%s,%s,%s,NULL,%s)",
            [sample, 'BAM_PE_CRNA_SORT', bam, command])
    conn.commit()
    return pending
def BWA_PAIRED(cursor, conn, specise, ref, samples, intype, folder, rec):
    """Register a BWA.pair.sh job per sample in the ``files`` table.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed once all samples are registered.
        specise: First-level key into the module-level ``refall`` mapping.
        ref: Key selecting the entry under ``refall[specise]['bwa']``.
        samples: Iterable of sample names.
        intype: Two file types; ``intype[0]`` and ``intype[1]`` give the inputs.
        folder: Output directory, also the first script argument.
        rec: Prefix of the job tag and ``type`` value stored in ``files``.

    Returns:
        One entry per sample: the value returned by ``add_files`` for the
        command.  NOTE(review): sibling builders return the bare command and
        keep the ``add_files`` result only as the stored method - confirm
        which one callers expect here.
    """
    script = "/data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/scripts/BWA.pair.sh"
    recorded = []
    for sample in samples:
        tag = rec + '_' + sample
        mate1 = d00.get_sample_file(cursor, sample, intype[0])
        mate2 = d00.get_sample_file(cursor, sample, intype[1])
        bam = folder + '/BAM' + tag + '.bam'
        arguments = [
            'bash ' + script,
            folder,
            mate1,
            mate2,
            refall[specise]['bwa'][ref],
            bam[:-4],  # output prefix: path without ".bam"
            tag,
        ]
        command = add_files(' '.join(arguments), script)
        cursor.execute(
            "replace into files (sample,type,path,method)values(%s,%s,%s,%s) ",
            [sample, rec, bam, command])
        recorded.append(command)
    conn.commit()
    return recorded
def SUMMARIZE(cursor, conn, samples, intype, folder, para, rec):
    """Register a SUMMARIZE.sh job per sample and return all commands.

    The report path is ``<folder>/summarize.<rec>_<sample>.txt``; it is
    stored in ``files`` under type ``rec`` together with the command.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed once all samples are registered.
        samples: Iterable of sample names.
        intype: File type of the input passed to the script.
        folder: Directory the report is written to.
        para: String appended as the last script argument.
        rec: Prefix of the report tag and ``type`` value stored in ``files``.

    Returns:
        The command of every sample, in input order.
    """
    launcher = 'bash /data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/scripts/SUMMARIZE.sh'
    register = "replace into files (sample,type,path,method)values(%s,%s,%s,%s)"
    commands = []
    for sample in samples:
        tag = rec + '_' + sample
        source = d00.get_sample_file(cursor, sample, intype)
        report = folder + '/summarize.' + tag + '.txt'
        command = ' '.join([launcher, source, report, para])
        cursor.execute(register, [sample, rec, report, command])
        commands.append(command)
    conn.commit()
    return commands
def MAPPED_SINGLE(cursor, conn, samples, bamtype, folder, rec):
    """Register a ``mapped_reads_single.py`` job per sample and return all commands.

    The output path is ``<folder>/<rec>_<sample>.fa.gz``; the method stored
    in ``files`` is whatever ``add_files`` returns for the command.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed once all samples are registered.
        samples: Iterable of sample names.
        bamtype: File type of the input passed to the script.
        folder: Directory the output is written to.
        rec: Prefix of the output name and ``type`` value stored in ``files``.

    Returns:
        The command of every sample, in input order.
    """
    script = "/data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/assembly/mapped_reads_single.py"
    register = "replace into files (sample,type,path,method)values(%s,%s,%s,%s)"
    commands = []
    for sample in samples:
        source = d00.get_sample_file(cursor, sample, bamtype)
        target = ''.join([folder, '/', rec, '_', sample, '.fa.gz'])
        command = ' '.join(['python ' + script, source, target, sample])
        cursor.execute(register, [sample, rec, target, add_files(command, script)])
        commands.append(command)
    conn.commit()
    return commands
def BAM_FLAGSTAT(cursor, conn, samples, intype, folder, rec):
    """Register a ``samtools flagstat`` job per sample and return all commands.

    The report path is ``<folder>/flagstat.<sample>_<intype>.txt``; it is
    stored in ``files`` under type ``rec`` together with the command.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed once all samples are registered.
        samples: Iterable of sample names.
        intype: File type of the input; also part of the report name.
        folder: Directory the report is written to.
        rec: ``type`` value stored in ``files``.

    Returns:
        The command of every sample, in input order.
    """
    register = "replace into files (sample,type,path,method)values(%s,%s,%s,%s)"
    commands = []
    for sample in samples:
        report = ''.join([folder, '/flagstat.', sample, "_", intype, '.txt'])
        source = d00.get_sample_file(cursor, sample, intype)
        command = ''.join(['samtools flagstat ', source, ' > ', report])
        cursor.execute(register, [sample, rec, report, command])
        commands.append(command)
    conn.commit()
    return commands
def SUMMARIZE(cursor, conn, samples, intype, folder, para, rec, server="TANG"):
    """Register a ``summarize`` script job per sample and return all commands.

    The script path comes from ``d00.get_script``; the report path is
    ``<folder>/summarize.<rec>_<sample>.txt``.

    NOTE(review): the script text returned by ``d00.get_script`` is not
    appended to the stored method here, unlike the bowtie/bwa builders -
    confirm whether that is intended.

    Args:
        cursor: Database cursor, also passed to the ``d00`` helpers.
        conn: Connection committed once all samples are registered.
        samples: Iterable of sample names.
        intype: File type of the input passed to the script.
        folder: Directory the report is written to.
        para: Last script argument.
        rec: Prefix of the report tag and ``type`` value stored in ``files``.
        server: Server label passed to ``d00.get_script`` and stored with
            the row.

    Returns:
        The command of every sample, in input order.
    """
    register = "replace into files (sample,type,path,method,server)values(%s,%s,%s,%s,%s)"
    commands = []
    for sample in samples:
        tag = rec + '_' + sample
        source = d00.get_sample_file(cursor, sample, intype)
        report = folder + '/summarize.' + tag + '.txt'
        script, _script_text = d00.get_script(cursor, 'summarize', server)
        command = 'bash %s %s %s %s' % (script, source, report, para)
        cursor.execute(register, [sample, rec, report, command, server])
        commands.append(command)
    conn.commit()
    return commands
def Read_BWA_flagstat(cursor, conn, samples, intype, recs):
    """Copy two numbers from each sample's flagstat file into ``samples``.

    The first whitespace-delimited token of line 1 is stored in column
    ``recs[0]`` and that of line 3 in column ``recs[1]``; both columns are
    ensured to exist as ``INT`` through ``d02.check_table_colume``.

    Args:
        cursor: Database cursor, also passed to the ``d00``/``d02`` helpers.
        conn: Connection passed to ``d02`` and committed at the end.
        samples: Iterable of sample names.
        intype: File type of the flagstat report.
        recs: Two column names of the ``samples`` table.
    """
    d02.check_table_colume(cursor, conn, 'samples', recs[0], 'INT')
    d02.check_table_colume(cursor, conn, 'samples', recs[1], 'INT')
    for sample in samples:
        # The report used to be opened without ever being closed.
        with open(d00.get_sample_file(cursor, sample, intype)) as report:
            first = re.split(r'\s+', report.readline())[0]
            report.readline()  # line 2 is not used
            third = re.split(r'\s+', report.readline())[0]
        # NOTE(review): values are interpolated into the SQL text; they come
        # from pipeline files, not from user input, as far as visible here.
        sql = "update %s set %s=%s,%s=%s where sample='%s'" % (
            'samples', recs[0], first, recs[1], third, sample)
        print(sql)
        cursor.execute(sql)
    conn.commit()
def read_VCF_file_single(cursor, conn, DB_NAME, tablename, samples, type):
    """Load per-sample depth and alt fraction from VCF files into one table.

    ``tablename`` is created with ``chr``/``pos``/``Ref``/``Alt`` plus a
    ``<sample>_DP`` and ``<sample>_alt`` column per sample (creation errors
    are reported as "EXISTS" and ignored).  For every record carrying a
    ``DP4`` INFO entry the row is upserted with ``DP`` and
    ``(DP4[2] + DP4[3]) / DP``.

    Records are skipped when the line contains ``#``, has fewer than eight
    columns, lacks ``DP4``/``DP``, has ``DP`` equal to zero, or has a REF or
    ALT allele longer than 30 characters.  The function closes ``cursor``
    and ``conn`` before returning.

    NOTE(review): any line containing ``#`` is skipped, not only header
    lines - confirm that this is intended.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection used for the commits; closed at the end.
        DB_NAME: Not used by this function.
        tablename: Table to create and fill (interpolated into SQL unescaped).
        samples: List of sample names; they become column name prefixes.
        type: File type of the VCF looked up for every sample.
    """
    limit = 30
    sample_infos = ''.join(
        " %s_DP varchar(5) DEFAULT '0',%s_alt float DEFAULT '0'," % (sample, sample)
        for sample in samples)
    sql = ("CREATE TABLE %s ( `chr` varchar(20) NOT NULL DEFAULT '', "
           "`pos` int(11) NOT NULL DEFAULT '0', "
           "`Ref` varchar(30) DEFAULT NULL, "
           "`Alt` varchar(30) NOT NULL DEFAULT '', "
           "%s PRIMARY KEY (`chr`,`pos`,`Alt`) ) "
           "ENGINE=InnoDB DEFAULT CHARSET=latin1") % (tablename, sample_infos)
    print(sql)
    try:
        cursor.execute(sql)
    except Exception:
        # Best effort: the table is assumed to be there already.
        print("EXISTS")

    for sample in samples:
        path = d00.get_sample_file(cursor, sample, type)
        values = []
        with open(path) as vcf:
            for line in vcf:
                if '#' in line:
                    continue
                # r'\s+' splits like the former '\s*' did on Python 2 and
                # does not depend on how empty matches are treated.
                t = re.split(r'\s+', line)
                if len(t) < 8:
                    continue
                info = {}
                for entry in t[7].split(';'):
                    pair = entry.split('=')
                    if len(pair) > 1:
                        info[pair[0]] = pair[1]
                if 'DP4' not in info or 'DP' not in info:
                    continue
                depth = int(info['DP'])
                if depth == 0:
                    continue
                if len(t[3]) > limit or len(t[4]) > limit:
                    # Alleles that do not fit the varchar(30) columns.
                    continue
                DP4 = info['DP4'].split(',')
                values.append((t[0], t[1], t[3], t[4], info['DP'],
                               float(int(DP4[2]) + int(DP4[3])) / depth))

        dp_col = sample + '_DP'
        alt_col = sample + '_alt'
        cmd = "insert into %s (chr,pos,Ref,Alt,%s,%s)values(%%s,%%s,%%s,%%s,%%s,%%s) on duplicate key update %s=values(%s),%s=values(%s)" % (
            tablename, dp_col, alt_col, dp_col, dp_col, alt_col, alt_col)
        if values:
            print("%s %s" % (cmd, values[0]))
            cursor.executemany(cmd, values)
            conn.commit()
    cursor.close()
    conn.close()
def Read_BWA_flagstat(cursor, conn, samples, intype, recs):
    """Copy two numbers from each sample's flagstat file into ``samples``.

    The first whitespace-delimited token of line 1 is stored in column
    ``recs[0]`` and that of line 3 in column ``recs[1]``; both columns are
    ensured to exist as ``INT`` through ``d02.check_table_colume``.

    Args:
        cursor: Database cursor, also passed to the ``d00``/``d02`` helpers.
        conn: Connection passed to ``d02`` and committed at the end.
        samples: Iterable of sample names.
        intype: File type of the flagstat report.
        recs: Two column names of the ``samples`` table.
    """
    d02.check_table_colume(cursor, conn, 'samples', recs[0], 'INT')
    d02.check_table_colume(cursor, conn, 'samples', recs[1], 'INT')
    for sample in samples:
        # The report used to be opened without ever being closed.
        with open(d00.get_sample_file(cursor, sample, intype)) as report:
            first = re.split(r'\s+', report.readline())[0]
            report.readline()  # line 2 is not used
            third = re.split(r'\s+', report.readline())[0]
        # NOTE(review): values are interpolated into the SQL text; they come
        # from pipeline files, not from user input, as far as visible here.
        sql = "update %s set %s=%s,%s=%s where sample='%s'" % (
            'samples', recs[0], first, recs[1], third, sample)
        print(sql)
        cursor.execute(sql)
    conn.commit()
def SUMMARIZE(cursor, conn, samples, intype, folder, para, rec):
    """Register a SUMMARIZE.sh job per sample and return all commands.

    The report path is ``<folder>/summarize.<rec>_<sample>.txt``; it is
    stored in ``files`` under type ``rec`` together with the command.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed once all samples are registered.
        samples: Iterable of sample names.
        intype: File type of the input passed to the script.
        folder: Directory the report is written to.
        para: String appended as the last script argument.
        rec: Prefix of the report tag and ``type`` value stored in ``files``.

    Returns:
        The command of every sample, in input order.
    """
    launcher = 'bash /data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/scripts/SUMMARIZE.sh'
    register = "replace into files (sample,type,path,method)values(%s,%s,%s,%s)"
    commands = []
    for sample in samples:
        tag = rec + '_' + sample
        source = d00.get_sample_file(cursor, sample, intype)
        report = folder + '/summarize.' + tag + '.txt'
        command = ' '.join([launcher, source, report, para])
        cursor.execute(register, [sample, rec, report, command])
        commands.append(command)
    conn.commit()
    return commands
def BWA_SINGLE(cursor, conn, specise, ref, samples, intype, folder, rec, server="TANG"):
    """Register a single-end BWA job per sample and return all commands.

    Two defects of the previous version are fixed: ``server`` was read
    without being defined in the function (it is now an optional parameter
    with the same default as the sibling builders), and the command invoked
    ``BWA.pair.sh`` with single-end arguments while the recorded method
    referred to ``BWA.single.sh``; both now use ``BWA.single.sh``.

    Args:
        cursor: Database cursor, also passed to the ``d00`` helpers.
        conn: Connection committed once all samples are registered.
        specise: Species passed to ``d00.get_ref``.
        ref: Reference name passed to ``d00.get_ref``.
        samples: Iterable of sample names.
        intype: File type of the single input file.
        folder: Output directory, also the first script argument.
        rec: Prefix of the job tag and ``type`` value stored in ``files``.
        server: Server label passed to ``d00.get_ref``.

    Returns:
        The command of every sample, in input order.
    """
    cmds = []
    for sample in samples:
        insert = rec + '_' + sample
        fq = d00.get_sample_file(cursor, sample, intype)
        path = folder + '/BAM' + insert + '.bam'
        refpath = d00.get_ref(cursor, specise, 'bwa', ref, server)
        # ``dirname`` is the module-level script root used by the siblings.
        script = dirname + "scripts/BWA.single.sh"
        cmd = "bash %s %s %s %s %s %s" % (
            script, folder, fq, refpath, path[:-4], insert)
        method = add_files(cmd, script)
        cursor.execute(
            "insert ignore into files (sample,type,path,method)values(%s,%s,%s,%s) ",
            [sample, rec, path, method])
        cmds.append(cmd)
    conn.commit()
    return cmds
def CUFFLINKS(cursor, conn, species, ref, samples, intype, folder, rec):
    """Register a ``cufflinks`` job per sample and return all commands.

    A per-sample directory ``<folder>/<rec>_<sample>`` is created when
    missing; the path stored in ``files`` is its ``genes.fpkm_tracking``.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed once all samples are registered.
        species: First-level key into the module-level ``refall`` mapping.
        ref: Key selecting the entry under ``refall[species]['gtf']``.
        samples: Iterable of sample names.
        intype: File type of the input BAM.
        folder: Parent directory of the per-sample output directories.
        rec: Prefix of the job tag and ``type`` value stored in ``files``.

    Returns:
        The command of every sample, in input order.
    """
    register = "replace into files (sample,type,path,method)values(%s,%s,%s,%s)"
    commands = []
    for sample in samples:
        tag = rec + '_' + sample
        outdir = folder + '/' + tag
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        source = d00.get_sample_file(cursor, sample, intype)
        tracking = outdir + '/genes.fpkm_tracking'
        annotation = refall[species]['gtf'][ref]
        command = 'cufflinks -o %s -p 6 -G %s %s' % (outdir, annotation, source)
        cursor.execute(register, [sample, rec, tracking, command])
        commands.append(command)
    conn.commit()
    return commands
def TOPHAT_SINGLE(cursor, conn, specise, ref, samples, intype, folder, report, rec):
    """Register a TOPHAT.single.sh job per sample and return all commands.

    The method stored in ``files`` is whatever ``add_files`` returns for the
    command.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed once all samples are registered.
        specise: First-level key into the module-level ``refall`` mapping.
        ref: Key selecting the entry under ``refall[specise]['bowtie2']``.
        samples: Iterable of sample names.
        intype: File type of the single input file.
        folder: Parent directory of the per-sample output directory.
        report: String appended as the last script argument.
        rec: Prefix of the job tag and ``type`` value stored in ``files``.

    Returns:
        The command of every sample, in input order.
    """
    script = "/data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/scripts/TOPHAT.single.sh"
    register = "replace into files (sample,type,path,method)values(%s,%s,%s,%s)"
    commands = []
    for sample in samples:
        tag = rec + '_' + sample
        outdir = folder + '/' + tag
        source = d00.get_sample_file(cursor, sample, intype)
        bam = outdir + '/BAM' + tag + '.bam'
        arguments = [
            'bash ' + script,
            outdir,
            source,
            refall[specise]['bowtie2'][ref],
            bam[:-4],  # output prefix: path without ".bam"
            tag,
            report,
        ]
        command = ' '.join(arguments)
        cursor.execute(register, [sample, rec, bam, add_files(command, script)])
        commands.append(command)
    conn.commit()
    return commands
def read_self_mate_map_data_and_trim(cursor, conn, sample, species, DB_NAME, tablename, table_raw):
    """Bulk-load the mapped reads of one sample into ``tablename``.

    ``samtools view -F 4`` is run on the sample's ``self_refseq``,
    ``mate_refseq`` and ``mate_genome`` files (paths resolved through
    ``d00.get_sample_file``) and one row per output record is inserted
    through a private connection to ``DB_NAME``.  Refseq records are stored
    with the fixed values ``'T'``, ``1000`` and ``'101M'``; genome records
    keep the fields at index 2, 3 and 5, the last one cut to 20 characters.

    Args:
        cursor: Cursor passed to ``d00.get_sample_file``.
        conn: Not used by this function.
        sample: Sample whose files are read.
        species: ``'mm10'`` makes the field at index 2 get a ``'chr'`` prefix.
        DB_NAME: Database the private connection is opened on.
        tablename: Target table; concatenated into the SQL text unescaped.
        table_raw: Not used by this function.
    """

    def mapped_records(filename):
        """Yield the whitespace-split fields of each samtools output line."""
        cmd = 'samtools view -F 4 ' + d00.get_sample_file(cursor, sample, filename)
        proc = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE)
        try:
            for line in proc.stdout:
                yield line.split()
        finally:
            # Reap the child so finished samtools processes do not pile up.
            proc.stdout.close()
            proc.wait()

    # NOTE(review): connection credentials are hard-coded; consider moving
    # them to configuration.
    conn1 = mb.connect(host="localhost", user="******", passwd="123456", db=DB_NAME)
    try:
        cursor1 = conn1.cursor()
        for label, filename in (('S_R', 'self_refseq'), ('M_R', 'mate_refseq')):
            values = [(t[0], label, 'T', '101M') for t in mapped_records(filename)]
            cursor1.executemany(
                "insert into " + tablename + " values(%s,%s,%s,1000,%s) ", values)
            conn1.commit()

        values = []
        for t in mapped_records('mate_genome'):
            chrom = 'chr' + t[2] if species == 'mm10' else t[2]
            values.append((t[0], 'M_G', chrom, t[3], t[5][0:20]))
        cursor1.executemany(
            "insert into " + tablename + " values(%s,%s,%s,%s,%s) ", values)
        conn1.commit()
    finally:
        # The private connection used to be left open on every call.
        conn1.close()
def read_VCF_file_single(cursor, conn, DB_NAME, tablename, samples, type):
    """Load per-sample depth and alt fraction from VCF files into one table.

    ``tablename`` is created with ``chr``/``pos``/``Ref``/``Alt`` plus a
    ``<sample>_DP`` and ``<sample>_alt`` column per sample (creation errors
    are reported as "EXISTS" and ignored).  For every record carrying a
    ``DP4`` INFO entry the row is upserted with ``DP`` and
    ``(DP4[2] + DP4[3]) / DP``.

    Records are skipped when the line contains ``#``, has fewer than eight
    columns, lacks ``DP4``/``DP``, has ``DP`` equal to zero, or has a REF or
    ALT allele longer than 30 characters.  The function closes ``cursor``
    and ``conn`` before returning.

    NOTE(review): any line containing ``#`` is skipped, not only header
    lines - confirm that this is intended.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection used for the commits; closed at the end.
        DB_NAME: Not used by this function.
        tablename: Table to create and fill (interpolated into SQL unescaped).
        samples: List of sample names; they become column name prefixes.
        type: File type of the VCF looked up for every sample.
    """
    limit = 30
    sample_infos = ''.join(
        " %s_DP varchar(5) DEFAULT '0',%s_alt float DEFAULT '0'," % (sample, sample)
        for sample in samples)
    sql = ("CREATE TABLE %s ( `chr` varchar(20) NOT NULL DEFAULT '', "
           "`pos` int(11) NOT NULL DEFAULT '0', "
           "`Ref` varchar(30) DEFAULT NULL, "
           "`Alt` varchar(30) NOT NULL DEFAULT '', "
           "%s PRIMARY KEY (`chr`,`pos`,`Alt`) ) "
           "ENGINE=InnoDB DEFAULT CHARSET=latin1") % (tablename, sample_infos)
    print(sql)
    try:
        cursor.execute(sql)
    except Exception:
        # Best effort: the table is assumed to be there already.
        print("EXISTS")

    for sample in samples:
        path = d00.get_sample_file(cursor, sample, type)
        values = []
        with open(path) as vcf:
            for line in vcf:
                if '#' in line:
                    continue
                # r'\s+' splits like the former '\s*' did on Python 2 and
                # does not depend on how empty matches are treated.
                t = re.split(r'\s+', line)
                if len(t) < 8:
                    continue
                info = {}
                for entry in t[7].split(';'):
                    pair = entry.split('=')
                    if len(pair) > 1:
                        info[pair[0]] = pair[1]
                if 'DP4' not in info or 'DP' not in info:
                    continue
                depth = int(info['DP'])
                if depth == 0:
                    continue
                if len(t[3]) > limit or len(t[4]) > limit:
                    # Alleles that do not fit the varchar(30) columns.
                    continue
                DP4 = info['DP4'].split(',')
                values.append((t[0], t[1], t[3], t[4], info['DP'],
                               float(int(DP4[2]) + int(DP4[3])) / depth))

        dp_col = sample + '_DP'
        alt_col = sample + '_alt'
        cmd = "insert into %s (chr,pos,Ref,Alt,%s,%s)values(%%s,%%s,%%s,%%s,%%s,%%s) on duplicate key update %s=values(%s),%s=values(%s)" % (
            tablename, dp_col, alt_col, dp_col, dp_col, alt_col, alt_col)
        if values:
            print("%s %s" % (cmd, values[0]))
            cursor.executemany(cmd, values)
            conn.commit()
    cursor.close()
    conn.close()
def MAPPED_SINGLE(cursor, conn, samples, bamtype, folder, rec):
    """Register a ``mapped_reads_single.py`` job per sample and return all commands.

    The output path is ``<folder>/<rec>_<sample>.fa.gz``; the method stored
    in ``files`` is whatever ``add_files`` returns for the command.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed once all samples are registered.
        samples: Iterable of sample names.
        bamtype: File type of the input passed to the script.
        folder: Directory the output is written to.
        rec: Prefix of the output name and ``type`` value stored in ``files``.

    Returns:
        The command of every sample, in input order.
    """
    script = "/data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/assembly/mapped_reads_single.py"
    register = "replace into files (sample,type,path,method)values(%s,%s,%s,%s)"
    commands = []
    for sample in samples:
        source = d00.get_sample_file(cursor, sample, bamtype)
        target = ''.join([folder, '/', rec, '_', sample, '.fa.gz'])
        command = ' '.join(['python ' + script, source, target, sample])
        cursor.execute(register, [sample, rec, target, add_files(command, script)])
        commands.append(command)
    conn.commit()
    return commands
def makes_anchors_fq(cursor, conn, samples, bamfile, outdir, length, insert, rec):
    """Queue ``make_anchors.py`` for the samples without anchor files.

    Samples whose first anchor file already exists are reported and skipped
    entirely.  For the others both anchor paths are recorded in ``files``
    (types ``rec[0]`` and ``rec[1]``) with a single bulk ``replace``.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed after the bulk ``replace``.
        samples: Iterable of sample names.
        bamfile: File type of the input BAM.
        outdir: Directory the anchor files are written to.
        length: Last argument of the script.
        insert: Label used in the anchor file names.
        rec: Two ``type`` values, one per anchor file.

    Returns:
        The commands of the samples that still need anchors.
    """
    template = 'python /data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/cRNA/make_anchors.py %s %s %s %s'
    rows = []
    pending = []
    for sample in samples:
        stem = outdir + '/Anchor.' + insert + '_' + sample
        first = stem + '.1.fq.gz'
        second = stem + '.2.fq.gz'
        if os.path.exists(first):
            print("ALREADY EXISTS")
            continue
        source = d00.get_sample_file(cursor, sample, bamfile)
        command = template % (source, first, second, length)
        pending.append(command)
        rows.extend([[sample, rec[0], first, command],
                     [sample, rec[1], second, command]])
    cursor.executemany(
        "replace into files (sample,type,path,method)values(%s,%s,%s,%s)", rows)
    conn.commit()
    return pending
def CUFFLINKS(cursor, conn, species, ref, samples, intype, folder, rec):
    """Register a ``cufflinks`` job per sample and return all commands.

    A per-sample directory ``<folder>/<rec>_<sample>`` is created when
    missing; the path stored in ``files`` is its ``genes.fpkm_tracking``.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed once all samples are registered.
        species: First-level key into the module-level ``refall`` mapping.
        ref: Key selecting the entry under ``refall[species]['gtf']``.
        samples: Iterable of sample names.
        intype: File type of the input BAM.
        folder: Parent directory of the per-sample output directories.
        rec: Prefix of the job tag and ``type`` value stored in ``files``.

    Returns:
        The command of every sample, in input order.
    """
    register = "replace into files (sample,type,path,method)values(%s,%s,%s,%s)"
    commands = []
    for sample in samples:
        tag = rec + '_' + sample
        outdir = folder + '/' + tag
        if not os.path.exists(outdir):
            os.mkdir(outdir)
        source = d00.get_sample_file(cursor, sample, intype)
        tracking = outdir + '/genes.fpkm_tracking'
        annotation = refall[species]['gtf'][ref]
        command = 'cufflinks -o %s -p 6 -G %s %s' % (outdir, annotation, source)
        cursor.execute(register, [sample, rec, tracking, command])
        commands.append(command)
    conn.commit()
    return commands
def RPKM_DB(cursor, conn, g_t, gene_length, totalreads, samples, intype, insert, tablename):
    """Copy ``gene_length`` into ``tablename`` and add one RPKM column per sample.

    The table is created as a copy of ``gene_length`` (with an index on
    ``gene``); any failure of that step is reported as "exists" and ignored.
    For each sample the counts in its ``intype`` file (column 0: transcript,
    column 1: count) are summed per gene using the ``g_t`` mapping, and
    ``count * 1e9 / (total * length)`` is written to the column
    ``<exp><insert>``.

    Args:
        cursor: Database cursor, also passed to the ``d00`` helpers.
        conn: Connection used for the commits.
        g_t: Table mapping ``transc`` to ``gene``.
        gene_length: Table mapping ``gene`` to ``length``.
        totalreads: Sample-info field holding the sample's total read count.
        samples: Iterable of sample names.
        intype: File type looked up through ``d00.get_sample_file``.
        insert: Suffix appended to the sample's ``exp`` value to form the
            column name.
        tablename: Table to create/update (interpolated into SQL unescaped).

    Raises:
        ZeroDivisionError: If a sample's total or a gene length is zero.
    """
    sql = 'create table %s select * from %s' % (tablename, gene_length)
    try:
        cursor.execute(sql)
        conn.commit()
        cursor.execute("create index reci on " + tablename + "(gene);")
        conn.commit()
    except Exception:
        # Best effort: the table/index is assumed to be there already.
        print("exists")

    gt = d00.table_2_dict(cursor, g_t, ['transc', 'gene'])
    gl = d00.table_2_dict(cursor, gene_length, ['gene', 'length'])

    for sample in samples:
        total = int(d00.get_sample_info(cursor, sample, totalreads))
        exp = d00.get_sample_info(cursor, sample, 'exp') + insert
        try:
            cursor.execute("alter table " + tablename + " add " + exp + " float DEFAULT '0'")
        except Exception:
            print("EXISTS_colume")

        count = {}
        with open(d00.get_sample_file(cursor, sample, intype)) as handle:
            for line in handle:
                fields = re.split(r'\s+', line)
                if fields[0] not in gt:
                    continue
                gene = gt[fields[0]]
                if gene in gl:
                    count[gene] = count.get(gene, 0) + int(fields[1])

        values = [
            [float(count[gene] * 1000000 * 1000) / (total * int(gl[gene])), gene]
            for gene in count
        ]
        # Progress output; the second row is shown only when there is one
        # (the unguarded lookup raised IndexError for fewer than two genes).
        if len(values) > 1:
            print("%s %s" % (len(values), values[1]))
        else:
            print(len(values))
        cursor.executemany(
            "update " + tablename + " set " + exp + "=%s where gene = %s", values)
        conn.commit()
def TOPHAT_SINGLE(cursor, conn, specise, ref, samples, intype, folder, report, rec):
    """Register a TOPHAT.single.sh job per sample and return all commands.

    The method stored in ``files`` is whatever ``add_files`` returns for the
    command.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed once all samples are registered.
        specise: First-level key into the module-level ``refall`` mapping.
        ref: Key selecting the entry under ``refall[specise]['bowtie2']``.
        samples: Iterable of sample names.
        intype: File type of the single input file.
        folder: Parent directory of the per-sample output directory.
        report: String appended as the last script argument.
        rec: Prefix of the job tag and ``type`` value stored in ``files``.

    Returns:
        The command of every sample, in input order.
    """
    script = "/data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/scripts/TOPHAT.single.sh"
    register = "replace into files (sample,type,path,method)values(%s,%s,%s,%s)"
    commands = []
    for sample in samples:
        tag = rec + '_' + sample
        outdir = folder + '/' + tag
        source = d00.get_sample_file(cursor, sample, intype)
        bam = outdir + '/BAM' + tag + '.bam'
        arguments = [
            'bash ' + script,
            outdir,
            source,
            refall[specise]['bowtie2'][ref],
            bam[:-4],  # output prefix: path without ".bam"
            tag,
            report,
        ]
        command = ' '.join(arguments)
        cursor.execute(register, [sample, rec, bam, add_files(command, script)])
        commands.append(command)
    conn.commit()
    return commands
def makes_anchors_fq(cursor, conn, samples, bamfile, outdir, length, insert, rec):
    """Queue ``make_anchors.py`` for the samples without anchor files.

    Samples whose first anchor file already exists are reported and skipped
    entirely.  For the others both anchor paths are recorded in ``files``
    (types ``rec[0]`` and ``rec[1]``) with a single bulk ``replace``.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed after the bulk ``replace``.
        samples: Iterable of sample names.
        bamfile: File type of the input BAM.
        outdir: Directory the anchor files are written to.
        length: Last argument of the script.
        insert: Label used in the anchor file names.
        rec: Two ``type`` values, one per anchor file.

    Returns:
        The commands of the samples that still need anchors.
    """
    template = 'python /data/Analysis/fanxiaoying/project/project01_polyA-RNAseq/modules/cRNA/make_anchors.py %s %s %s %s'
    rows = []
    pending = []
    for sample in samples:
        stem = outdir + '/Anchor.' + insert + '_' + sample
        first = stem + '.1.fq.gz'
        second = stem + '.2.fq.gz'
        if os.path.exists(first):
            print("ALREADY EXISTS")
            continue
        source = d00.get_sample_file(cursor, sample, bamfile)
        command = template % (source, first, second, length)
        pending.append(command)
        rows.extend([[sample, rec[0], first, command],
                     [sample, rec[1], second, command]])
    cursor.executemany(
        "replace into files (sample,type,path,method)values(%s,%s,%s,%s)", rows)
    conn.commit()
    return pending
def COMMAND_generator(cursor, conn, samples, template, infiles, folder, suffix, rec):
    """Expand a command template for every sample.

    Placeholders ``#0``, ``#1``, ... are replaced by the sample's files of
    the types in ``infiles`` (in order); the placeholder after the last
    input stands for the output path ``<folder>/<rec>_<sample><suffix>``,
    and ``#sample`` for the sample name.  When ``suffix`` is ``'/'`` the
    output is a directory: the trailing slash is dropped and the directory
    is created if missing.  Unless ``rec`` is ``''`` or ``'NA'`` the output
    path and command are stored in ``files``.

    Placeholders are substituted from the highest index down so that ``#1``
    cannot clobber ``#10`` and above when more than ten values are used.

    Args:
        cursor: Database cursor, also passed to ``d00.get_sample_file``.
        conn: Connection committed once all samples are processed.
        samples: Iterable of sample names.
        template: Command text containing the placeholders.
        infiles: File types to resolve for each sample.
        folder: Directory of the output path.
        suffix: Ending of the output path; ``'/'`` means a directory.
        rec: Output name prefix and ``type`` value stored in ``files``.

    Returns:
        The expanded command of every sample, in input order.
    """
    cmds = []
    for sample in samples:
        lists = [d00.get_sample_file(cursor, sample, kind) for kind in infiles]
        path = folder + '/' + rec + '_' + sample + suffix
        if suffix == '/':
            path = path[:-1]
            if not os.path.exists(path):
                os.mkdir(path)
        lists.append(path)

        cmd = template
        for index in reversed(range(len(lists))):
            cmd = cmd.replace("#" + str(index), lists[index])
        cmd = cmd.replace("#sample", sample)

        if not (rec == '' or rec == 'NA'):
            cursor.execute(
                "replace into files (sample,type,path,method)values(%s,%s,%s,%s)",
                [sample, rec, path, cmd])
        cmds.append(cmd)
    conn.commit()
    return cmds
def Samples_Bam_Handles(cursor, conn, samples, bamtype):
    """Open the ``bamtype`` file of every sample with pysam.

    Args:
        cursor: Cursor passed to ``d00.get_sample_file``.
        conn: Not used by this function.
        samples: Iterable of sample names.
        bamtype: File type of the BAM to open.

    Returns:
        One ``pysam.Samfile`` opened in ``"rb"`` mode per sample, in input
        order.  The handles are not closed here.
    """
    return [
        pysam.Samfile(d00.get_sample_file(cursor, sample, bamtype), "rb")
        for sample in samples
    ]