def main(argv=None):
    """Map every library with bwa and write its coverage and counts.

    Each fastq file of ``settings.fastq_1`` is mapped with
    ``RILseq.run_bwa``, paired by position with the matching entry of
    ``settings.fastq_2`` when there is one. If the annotation file was
    read, the reads are counted per feature with ``RILseq.count_features``.
    If ``settings.create_wig`` is set, the coverage of the library is
    written to ``<dirout>/<prefix>_coverage.wig``, where ``<prefix>`` is
    the fastq name up to ``_cutadapt``. Finally one tab-delimited table per
    fastq file is written to ``<dirout>/<prefix>_counts.txt``; every table
    holds the counts of all the libraries.

    Arguments:
    - `argv`: Command line arguments, passed to process_command_line

    Returns 0 on success.
    """
    settings = process_command_line(argv)
    if not os.path.exists(settings.dirout):
        os.makedirs(settings.dirout)
    if settings.genes_gff:
        try:
            with open(settings.genes_gff) as gff_in:
                pos_feat_list, all_features = RILseq.read_gtf(
                    gff_in, settings.feature, settings.identifier)
        except IOError:
            # An unreadable annotation file only disables the counting step
            settings.genes_gff = None
    gcounts = {}
    lib_order = []
    fastq_1_list = list(RILseq.flat_list(settings.fastq_1))
    fastq_2_list = list(RILseq.flat_list(settings.fastq_2))
    for i, r1_name in enumerate(fastq_1_list):
        # A library without a second fastq file is mapped as single-end
        try:
            r2_name = fastq_2_list[i]
        except IndexError:
            r2_name = None
        libname = r1_name.rsplit('.', 1)[0].rsplit('/', 1)[-1]
        outhead = '%s_bwa' % libname
        bamname = RILseq.run_bwa(
            settings.bwa_exec, r1_name, r2_name,
            settings.dirout, outhead, settings.allowed_mismatches,
            settings.genome_fasta, settings.params_aln,
            settings.sampe_params, settings.samse_params,
            settings.samtools_cmd)
        samfile = pysam.Samfile(bamname)
        if settings.genes_gff:
            lib_order.append(libname)
            gcounts[libname] = RILseq.count_features(
                pos_feat_list, samfile, settings.overlap,
                rev=settings.reverse_complement)
        if settings.create_wig:
            coverage = RILseq.generate_wig(
                samfile, rev=settings.reverse_complement, first_pos=False)
            # Open only the wig file of this library. Opening the files of
            # all the libraries in 'w' mode on every iteration truncates the
            # tracks that were already written.
            wig_name = "%s/%s_coverage.wig" % (
                settings.dirout, r1_name.split("_cutadapt")[0])
            with open(wig_name, 'w') as outwig:
                RILseq.print_wiggle(
                    coverage, "%s_single_fragments_coverage" % libname,
                    "%s single fragments coverage" % libname, outwig)
    # Print the table of counts
    if settings.genes_gff:
        gene_names = sorted(all_features)
        header = ['Gene name'] + lib_order
        for r1_name in fastq_1_list:
            table_name = "%s/%s_counts.txt" % (
                settings.dirout, r1_name.split("_cutadapt")[0])
            with open(table_name, 'w') as outtable:
                outt = csv.writer(outtable, delimiter='\t')
                outt.writerow(header)
                for g in gene_names:
                    outt.writerow(
                        [g] + [gcounts[libn][g] for libn in lib_order])
    return 0        # success
def main(argv=None):
    """Map every library with bwa and write one coverage file and one table.

    Each fastq file of ``settings.fastq_1`` is mapped with
    ``RILseq.run_bwa``, paired by position with the matching entry of
    ``settings.fastq_2`` when there is one. The coverage of every library
    is written to ``<dirout>/<outhead>_coverage.wig``. If the annotation
    file was read, the reads are counted per feature and a tab-delimited
    table with one column per library is written to
    ``<dirout>/<outhead>_counts.txt``.

    Arguments:
    - `argv`: Command line arguments, passed to process_command_line

    Returns 0 on success.
    """
    settings = process_command_line(argv)
    if not os.path.exists(settings.dirout):
        os.makedirs(settings.dirout)
    if settings.genes_gff:
        try:
            with open(settings.genes_gff) as gff_in:
                pos_feat_list, all_features = RILseq.read_gtf(
                    gff_in, settings.feature, settings.identifier)
        except IOError:
            # An unreadable annotation file only disables the counting step
            settings.genes_gff = None
    gcounts = {}
    lib_order = []
    fastq_2_list = list(RILseq.flat_list(settings.fastq_2))
    wig_name = "%s/%s_coverage.wig" % (settings.dirout, settings.outhead)
    # The context manager closes (and flushes) the wig file even on errors
    with open(wig_name, 'w') as outwig:
        for i, r1_name in enumerate(RILseq.flat_list(settings.fastq_1)):
            # A library without a second fastq file is mapped as single-end
            try:
                r2_name = fastq_2_list[i]
            except IndexError:
                r2_name = None
            libname = r1_name.rsplit('.', 1)[0].rsplit('/', 1)[-1]
            outhead = '%s_bwa' % libname
            bamname = RILseq.run_bwa(
                settings.bwa_exec, r1_name, r2_name,
                settings.dirout, outhead, settings.allowed_mismatches,
                settings.genome_fasta, settings.params_aln,
                settings.sampe_params, settings.samse_params,
                settings.samtools_cmd, processors=settings.processors)
            samfile = pysam.Samfile(bamname)
            if settings.genes_gff:
                lib_order.append(libname)
                gcounts[libname] = RILseq.count_features(
                    pos_feat_list, samfile, settings.overlap,
                    rev=settings.reverse_complement)
            coverage = RILseq.generate_wig(
                samfile, rev=settings.reverse_complement, first_pos=False)
            RILseq.print_wiggle(
                coverage, "%s_single_fragments_coverage" % libname,
                "%s single fragments coverage" % libname, outwig)
    # Print the table of counts
    if settings.genes_gff:
        table_name = "%s/%s_counts.txt" % (settings.dirout, settings.outhead)
        with open(table_name, 'w') as outtable:
            outt = csv.writer(outtable, delimiter='\t')
            outt.writerow(['Gene name'] + lib_order)
            for g in sorted(all_features):
                outt.writerow([g] + [gcounts[libn][g] for libn in lib_order])
    return 0        # success
def main(argv=None):
    """Count the reads of every reads file per feature and print a table.

    The features are read from ``settings.genes_gff``. Each file of
    ``settings.reads_files`` is passed to ``count_features`` and the counts
    are printed to the standard output as a tab-delimited table with one
    row per feature and one column per reads file. With ``settings.quiet``
    the header row and the features whose name starts with ``~`` are left
    out.

    Arguments:
    - `argv`: Command line arguments, passed to process_command_line

    Returns 0 on success, 1 if the annotation file can't be read.
    """
    settings = process_command_line(argv)
    try:
        with open(settings.genes_gff) as gff_in:
            pos_feat_list, all_features = RILseq.read_gtf(
                gff_in, settings.feature, settings.identifier)
    except IOError as err:
        # Report the reason instead of failing silently
        sys.stderr.write(
            "Can't read the annotation file %s: %s\n" % (
                settings.genes_gff, err))
        return 1
    lib_order = []
    all_counts = {}
    if settings.singles:
        # Counting singles forces the "only first" mode
        settings.only_first = True
        settings.only_second = False
    for r1_name in RILseq.flat_list(settings.reads_files):
        sys.stderr.write('%s\n' % str(r1_name))
        lib_order.append(r1_name)
        with open(r1_name) as reads_in:
            # "only second" means ignoring the first and vice versa
            all_counts[r1_name] = count_features(
                pos_feat_list, reads_in, settings.overlap, length=25,
                ignore_first=settings.only_second,
                ignore_second=settings.only_first,
                count_singles=settings.singles)
    outt = csv.writer(sys.stdout, delimiter='\t')
    if not settings.quiet:
        outt.writerow(['Gene name'] + lib_order)
    for g in sorted(all_features):
        if settings.quiet and g.startswith('~'):
            continue
        outt.writerow([g] + [all_counts[libn][g] for libn in lib_order])
    return 0        # success
def main(argv=None):
    """Map the ends of the reads of each bam file and report the reads table.

    For every bam file of ``settings.bamfiles`` the read ends are written
    by ``RILseq.get_unmapped_reads`` to ``<dirout>/<lib>_ends_1.fastq`` and
    ``<dirout>/<lib>_ends_2.fastq``, and each fastq file is mapped with
    ``RILseq.run_bwa``. With ``settings.skip_mapping`` the ends are written
    to ``os.devnull`` and the existing ``<dirout>/<lib>_ends_N.bam`` files
    are read instead. The two bam files are read with
    ``RILseq.read_bam_file`` and the table is written to the standard
    output with ``RILseq.write_reads_table``. Single reads are written to
    ``settings.all_reads`` if given, or to the standard output if
    ``settings.add_all_reads`` is set.

    Arguments:
    - `argv`: Command line arguments, passed to process_command_line

    Returns 0 on success.
    """
    settings = process_command_line(argv)
    # Read the transcripts if given
    if settings.transcripts:
        trans_dict = RILseq.read_transcripts(settings.transcripts)
    else:
        trans_dict = None
    # Choose where the single reads are written to
    outall = None
    close_outall = False
    if settings.all_reads:
        try:
            outall = open(settings.all_reads, 'w')
            close_outall = True
        except IOError as err:
            # Carry on without the single reads report, but say so
            sys.stderr.write(
                "Can't write to %s, single reads won't be reported: %s\n" % (
                    settings.all_reads, err))
    elif settings.add_all_reads:
        outall = sys.stdout
    try:
        for bf in RILseq.flat_list(settings.bamfiles):
            bfin = pysam.Samfile(bf)
            libname = bf.rsplit('.', 1)[0].rsplit('/', 1)[-1]
            fsq1name = "%s/%s_ends_1.fastq" % (settings.dirout, libname)
            fsq2name = "%s/%s_ends_2.fastq" % (settings.dirout, libname)
            if settings.skip_mapping:
                fsq1 = open(os.devnull, 'w')
                fsq2 = fsq1
            else:
                fsq1 = open(fsq1name, 'w')
                fsq2 = open(fsq2name, 'w')
            try:
                single_mapped = RILseq.get_unmapped_reads(
                    bfin, fsq1, fsq2, settings.length, settings.maxG,
                    rev=settings.reverse_complement, all_reads=True,
                    dust_thr=settings.dust_thr)
            finally:
                # bwa reads the fastq files by name, so they must be flushed
                # to disk first. Closing twice (same handle when the mapping
                # is skipped) is harmless.
                fsq1.close()
                fsq2.close()
            reads_in = []
            # Map the fastq files to the genome
            for fqname in (fsq1name, fsq2name):
                bamheadname = fqname.rsplit('.', 1)[0].rsplit('/', 1)[-1]
                if settings.skip_mapping:
                    bamname = "%s/%s.bam" % (settings.dirout, bamheadname)
                else:
                    bamname = RILseq.run_bwa(
                        settings.bwa_exec, fqname, None,
                        settings.dirout, bamheadname,
                        settings.max_mismatches, settings.genome_fasta,
                        settings.params_aln, '', settings.samse_params,
                        settings.samtools_cmd,
                        processors=settings.processors)
                bamin = pysam.Samfile(bamname)
                reads_in.append(RILseq.read_bam_file(
                    bamin, bamin.references, settings.allowed_mismatches))
            RILseq.write_reads_table(
                sys.stdout, reads_in[0], reads_in[1], bfin.references,
                settings.distance, not settings.keep_circular,
                trans_dict, write_single=outall,
                single_mapped=single_mapped,
                max_NM=settings.allowed_mismatches)
    finally:
        # Close only the file opened here, never the standard output
        if close_outall:
            outall.close()
    return 0        # success
def main(argv=None):
    """Print a table of the number of reads per feature in each reads file.

    The features come from ``settings.genes_gff``. Every file of
    ``settings.reads_files`` is counted with ``count_features`` and the
    result is written to the standard output, tab-delimited, one row per
    feature and one column per reads file. When ``settings.quiet`` is set
    the header row is skipped, and so are the features whose name starts
    with ``~``.

    Arguments:
    - `argv`: Command line arguments, passed to process_command_line

    Returns 0 on success, 1 if the annotation file can't be read.
    """
    settings = process_command_line(argv)
    try:
        with open(settings.genes_gff) as gff_file:
            pos_feat_list, all_features = RILseq.read_gtf(
                gff_file, settings.feature, settings.identifier)
    except IOError as err:
        # Tell the user why the script stops instead of exiting silently
        sys.stderr.write(
            "Can't read the annotation file %s: %s\n" % (
                settings.genes_gff, err))
        return 1
    if settings.singles:
        # Counting singles forces the "only first" mode
        settings.only_first = True
        settings.only_second = False
    lib_order = []
    all_counts = {}
    for r1_name in RILseq.flat_list(settings.reads_files):
        sys.stderr.write('%s\n' % str(r1_name))
        lib_order.append(r1_name)
        # Close each reads file as soon as it has been counted
        with open(r1_name) as reads_file:
            # "only second" means ignoring the first and vice versa
            all_counts[r1_name] = count_features(
                pos_feat_list, reads_file, settings.overlap, length=25,
                ignore_first=settings.only_second,
                ignore_second=settings.only_first,
                count_singles=settings.singles)
    outt = csv.writer(sys.stdout, delimiter='\t')
    if not settings.quiet:
        outt.writerow(['Gene name'] + lib_order)
    for g in sorted(all_features):
        if settings.quiet and g.startswith('~'):
            continue
        row_out = [g]
        row_out.extend(all_counts[libn][g] for libn in lib_order)
        outt.writerow(row_out)
    return 0        # success
def main(argv=None):
    """Write a track with the two sides of every chimeric read.

    The read names and the positions of their two ends are taken from
    ``settings.list_reads``; rows marked ``single`` are skipped. If
    ``settings.summary`` is given, only reads found by
    ``RILseq.read_significant_reads`` are kept. The sequences of the reads
    are collected from ``settings.bamfiles`` with ``get_reads_seqs``, the
    mates are merged with ``find_overlap``, and the length of each side is
    computed with ``extend_alignment`` against the genome of
    ``settings.genome``. A track header line and one tab-delimited row per
    side are written to the standard output.

    Arguments:
    - `argv`: Command line arguments, passed to process_command_line

    Returns 0 on success.
    """
    settings = process_command_line(argv)
    # Read the genome sequences and their sizes
    genome = {}
    gsize = {}
    for sr in SeqIO.parse(settings.genome, 'fasta'):
        genome[sr.id] = sr.seq
        gsize[sr.id] = len(sr.seq)
    # The list holds comma separated pairs: the even items are mapped to
    # the odd items that follow them
    if len(settings.EC_chrlist) >= 2:
        chr_names = settings.EC_chrlist.split(',')
        chr_dict = dict(zip(chr_names[0::2], chr_names[1::2]))
    else:
        chr_dict = {}
    if settings.summary:
        sig_reads = RILseq.read_significant_reads(
            settings.summary, chr_dict, gname=settings.gene_name)
    # Read the read names and positions, as [0-based coord, strand, chrom]
    read_5ps = {}
    read_3ps = {}
    with open(settings.list_reads) as reads_list:
        for line in csv.reader(reads_list, delimiter='\t'):
            # skip single
            if len(line) > 7 and line[7] == "single":
                continue
            first_end = (int(line[1])-1, line[2], line[0])
            second_end = (int(line[4])-1, line[5], line[3])
            if settings.summary and second_end not in sig_reads[first_end]:
                continue
            read_5ps[line[6]] = list(first_end)
            read_3ps[line[6]] = list(second_end)
    # Read the bam files and return the long sequences
    r1_seqs = {}
    r2_seqs = {}
    for bamfile in RILseq.flat_list(settings.bamfiles):
        r1s, r2s = get_reads_seqs(
            pysam.Samfile(bamfile), read_5ps.keys(), rev=settings.reverse)
        r1_seqs.update(r1s)
        r2_seqs.update(r2s)
    # For each read find the overlap, if exists and find the fusion point
    outer = csv.writer(sys.stdout, delimiter='\t')
    # A parenthesized single expression prints the same in Python 2 and 3
    print('track name="%s" description="%s" visibility=4 itemRgb="On" useScore=0' % (
        settings.track_name, settings.track_desc))
    # According to the original author, r1 is the 3' end of the fragment
    for rname in set(r2_seqs):
        r2seq = r2_seqs[rname]
        # Single-end reads have no mate sequence
        r1seq = r1_seqs.get(rname, '')
        s1, overlap, s2 = find_overlap(r2seq, r1seq)
        fragment = s1 + overlap + s2
        pos_5p, strand_5p, chrom_5p = read_5ps[rname]
        pos_3p, strand_3p, chrom_3p = read_3ps[rname]
        side_5p_len = extend_alignment(
            fragment, pos_5p, 0, False, strand_5p, genome[chrom_5p])
        side_3p_len = extend_alignment(
            fragment, 0, pos_3p, True, strand_3p, genome[chrom_3p])
        # Write each of the sides to the output file
        score = 0
        if settings.rand_score:
            score = random.randint(0, 1000)
        # The 5' side starts at its position and extends downstream on its
        # strand; the coordinates are clipped to the chromosome boundaries
        if strand_5p == '+':
            gfrom = max(0, pos_5p)
            gto = min(gsize[chrom_5p], pos_5p+side_5p_len)
            outer.writerow([
                chrom_5p, gfrom, gto, "%s_5p" % rname, score, '+',
                gfrom, gto, settings.pos_first])
        elif strand_5p == '-':
            gfrom = max(0, pos_5p-side_5p_len+1)
            gto = min(gsize[chrom_5p], pos_5p+1)
            outer.writerow([
                chrom_5p, gfrom, gto, "%s_5p" % rname, score, '-',
                gfrom, gto, settings.rev_first])
        # The 3' side ends at its position, so it extends the other way
        if strand_3p == '+':
            gfrom = max(0, pos_3p-side_3p_len+1)
            gto = min(gsize[chrom_3p], pos_3p+1)
            outer.writerow([
                chrom_3p, gfrom, gto, "%s_3p" % rname, score, '+',
                gfrom, gto, settings.pos_second])
        elif strand_3p == '-':
            gfrom = max(0, pos_3p)
            gto = min(gsize[chrom_3p], pos_3p+side_3p_len)
            outer.writerow([
                chrom_3p, gfrom, gto, "%s_3p" % rname, score, '-',
                gfrom, gto, settings.rev_second])
    return 0        # success
def main(argv=None):
    """Write a track with the two sides of every chimeric read.

    The read names and the positions of their two ends are taken from
    ``settings.list_reads``; rows marked ``single`` are skipped. If
    ``settings.summary`` is given, only reads found by
    ``RILseq.read_significant_reads`` are kept. The sequences of the reads
    are collected from ``settings.bamfiles`` with ``get_reads_seqs``, the
    mates are merged with ``find_overlap``, and the length of each side is
    computed with ``extend_alignment`` against the genome of
    ``settings.genome``. A track header line and one tab-delimited row per
    side are written to the standard output; the merged sequence and the
    two positions of every read are reported on the standard error.

    Arguments:
    - `argv`: Command line arguments, passed to process_command_line

    Returns 0 on success.
    """
    settings = process_command_line(argv)
    # Read the genome sequences and the genome size dictionary {chr: size}
    genome = {}
    gsize = {}
    for sr in SeqIO.parse(settings.genome, 'fasta'):
        genome[sr.id] = sr.seq
        gsize[sr.id] = len(sr.seq)
    # The list holds comma separated pairs: the even items are mapped to
    # the odd items that follow them, e.g. {'COLI-K12': 'chr'}
    if len(settings.BC_chrlist) >= 2:
        chr_names = settings.BC_chrlist.split(',')
        chr_dict = dict(zip(chr_names[0::2], chr_names[1::2]))
    else:
        chr_dict = {}
    if settings.summary:
        # Only reads of the significant interactions are reported, possibly
        # limited to one gene
        sig_reads = RILseq.read_significant_reads(
            settings.summary, chr_dict, gname=settings.gene_name)
    # Read the read names and positions:
    # {read_id: [coord (0-based), strand, chrom]} for each of the two ends
    read_5ps = {}
    read_3ps = {}
    with open(settings.list_reads) as reads_list:
        for line in csv.reader(reads_list, delimiter='\t'):
            # skip single
            if len(line) > 7 and line[7] == "single":
                continue
            first_end = (int(line[1])-1, line[2], line[0])
            second_end = (int(line[4])-1, line[5], line[3])
            # Skip the read if its second end is not among the significant
            # partners of its first end
            if settings.summary and second_end not in sig_reads[first_end]:
                continue
            read_5ps[line[6]] = list(first_end)
            read_3ps[line[6]] = list(second_end)
    # Read the bam files and return the long sequences
    r1_seqs = {}
    r2_seqs = {}
    for bamfile in RILseq.flat_list(settings.bamfiles):
        r1s, r2s = get_reads_seqs(
            pysam.AlignmentFile(bamfile, 'rb'), read_5ps.keys(),
            rev=settings.reverse)
        r1_seqs.update(r1s)
        r2_seqs.update(r2s)
    # For each read find the overlap, if exists and find the fusion point
    outer = csv.writer(sys.stdout, delimiter='\t')
    print('track name="%s" description="%s" visibility=4 itemRgb="On" useScore=0' % (
        settings.track_name, settings.track_desc))
    # According to the original author, r1 is the 3' end of the fragment
    for rname in set(r2_seqs):
        # r2seq, r1seq are the sequences from the bam files for paired end
        r2seq = r2_seqs[rname]
        # Single-end reads have no mate sequence
        r1seq = r1_seqs.get(rname, '')
        s1, overlap, s2 = find_overlap(r2seq, r1seq)
        fragment = s1 + overlap + s2
        pos_5p, strand_5p, chrom_5p = read_5ps[rname]
        pos_3p, strand_3p, chrom_3p = read_3ps[rname]
        # Debugging information goes to the standard error, the standard
        # output must hold only the track lines
        sys.stderr.write("%s\n%s\n%s\n" % (fragment, pos_5p, pos_3p))
        side_5p_len = extend_alignment(
            fragment, pos_5p, 0, False, strand_5p, genome[chrom_5p])
        side_3p_len = extend_alignment(
            fragment, 0, pos_3p, True, strand_3p, genome[chrom_3p])
        # Write each of the sides to the output file
        score = 0
        if settings.rand_score:
            score = random.randint(0, 1000)
        # The 5' side starts at its position and extends downstream on its
        # strand; the coordinates are clipped to the chromosome boundaries
        if strand_5p == '+':
            gfrom = max(0, pos_5p)
            gto = min(gsize[chrom_5p], pos_5p+side_5p_len)
            outer.writerow([
                chrom_5p, gfrom, gto, "%s_5p" % rname, score, '+',
                gfrom, gto, settings.pos_first])
        elif strand_5p == '-':
            gfrom = max(0, pos_5p-side_5p_len+1)
            gto = min(gsize[chrom_5p], pos_5p+1)
            outer.writerow([
                chrom_5p, gfrom, gto, "%s_5p" % rname, score, '-',
                gfrom, gto, settings.rev_first])
        # The 3' side ends at its position, so it extends the other way
        if strand_3p == '+':
            gfrom = max(0, pos_3p-side_3p_len+1)
            gto = min(gsize[chrom_3p], pos_3p+1)
            outer.writerow([
                chrom_3p, gfrom, gto, "%s_3p" % rname, score, '+',
                gfrom, gto, settings.pos_second])
        elif strand_3p == '-':
            gfrom = max(0, pos_3p)
            gto = min(gsize[chrom_3p], pos_3p+side_3p_len)
            outer.writerow([
                chrom_3p, gfrom, gto, "%s_3p" % rname, score, '-',
                gfrom, gto, settings.rev_second])
    return 0        # success
def main(argv=None):
    """Map the read ends of every bam file and print the table of reads.

    For each bam file of ``settings.bamfiles``,
    ``RILseq.get_unmapped_reads`` writes the read ends to
    ``<dirout>/<lib>_ends_1.fastq`` and ``<dirout>/<lib>_ends_2.fastq``,
    which are then mapped with ``RILseq.run_bwa``. If
    ``settings.skip_mapping`` is set the ends go to ``os.devnull`` and the
    existing ``<dirout>/<lib>_ends_N.bam`` files are used. Both bam files
    are read with ``RILseq.read_bam_file`` and
    ``RILseq.write_reads_table`` writes the table to the standard output.
    The single reads go to ``settings.all_reads`` when it is given, or to
    the standard output when ``settings.add_all_reads`` is set.

    Arguments:
    - `argv`: Command line arguments, passed to process_command_line

    Returns 0 on success.
    """
    settings = process_command_line(argv)
    # Read the transcripts if given
    trans_dict = None
    if settings.transcripts:
        trans_dict = RILseq.read_transcripts(settings.transcripts)
    # Choose where the single reads are written to
    outall = None
    own_outall = False
    if settings.all_reads:
        try:
            outall = open(settings.all_reads, 'w')
            own_outall = True
        except IOError as err:
            # Keep going without the single reads report, but warn about it
            sys.stderr.write(
                "Can't write to %s, single reads won't be reported: %s\n" % (
                    settings.all_reads, err))
    elif settings.add_all_reads:
        outall = sys.stdout
    try:
        for bf in RILseq.flat_list(settings.bamfiles):
            bfin = pysam.Samfile(bf)
            outhead = bf.rsplit('.', 1)[0]
            libname = outhead.rsplit('/', 1)[-1]
            fsq1name = "%s/%s_ends_1.fastq" % (settings.dirout, libname)
            fsq2name = "%s/%s_ends_2.fastq" % (settings.dirout, libname)
            if settings.skip_mapping:
                fsq1 = open(os.devnull, 'w')
                fsq2 = fsq1
            else:
                fsq1 = open(fsq1name, 'w')
                fsq2 = open(fsq2name, 'w')
            try:
                single_mapped = RILseq.get_unmapped_reads(
                    bfin, fsq1, fsq2, settings.length, settings.maxG,
                    rev=settings.reverse_complement, all_reads=True,
                    dust_thr=settings.dust_thr)
            finally:
                # The fastq files are read by bwa through their names, so
                # their content must reach the disk before the mapping.
                # Closing the shared devnull handle twice is harmless.
                fsq1.close()
                fsq2.close()
            reads_in = []
            # Map the fastq files to the genome
            for fqname in (fsq1name, fsq2name):
                bamheadname = fqname.rsplit('.', 1)[0].rsplit('/', 1)[-1]
                if settings.skip_mapping:
                    bamname = "%s/%s.bam" % (settings.dirout, bamheadname)
                else:
                    bamname = RILseq.run_bwa(
                        settings.bwa_exec, fqname, None,
                        settings.dirout, bamheadname,
                        settings.max_mismatches, settings.genome_fasta,
                        settings.params_aln, '', settings.samse_params,
                        settings.samtools_cmd,
                        processors=settings.processors)
                bamin = pysam.Samfile(bamname)
                reads_in.append(RILseq.read_bam_file(
                    bamin, bamin.references, settings.allowed_mismatches))
            RILseq.write_reads_table(
                sys.stdout, reads_in[0], reads_in[1], bfin.references,
                settings.distance, not settings.keep_circular,
                trans_dict, write_single=outall,
                single_mapped=single_mapped,
                max_NM=settings.allowed_mismatches)
    finally:
        # Only the file opened by this function is closed, not the stdout
        if own_outall:
            outall.close()
    return 0        # success