def main(argv=None):
    """Count reads per annotated feature in each reads file and print a table.

    The annotation file named by ``settings.genes_gff`` is parsed with
    ``RILseq.read_gtf``; every file yielded by
    ``RILseq.flat_list(settings.reads_files)`` is passed to ``count_features``.
    A tab-delimited table (one row per feature, one column per reads file)
    is written to stdout.

    Args:
        argv: Command-line arguments forwarded to ``process_command_line``.

    Returns:
        0 on success, 1 when the annotation file cannot be read.
    """
    settings = process_command_line(argv)
    try:
        # Close the annotation file as soon as it has been parsed.
        with open(settings.genes_gff) as gff_in:
            pos_feat_list, all_features = RILseq.read_gtf(
                gff_in, settings.feature, settings.identifier)
    except IOError as err:
        # Report the reason instead of failing silently.
        sys.stderr.write(
            'Could not read %s: %s\n' % (settings.genes_gff, err))
        return 1
    lib_order = []
    all_counts = {}
    if settings.singles:
        # Counting singles forces "first only" mode.
        settings.only_first = True
        settings.only_second = False
    for r1_name in RILseq.flat_list(settings.reads_files):
        sys.stderr.write('%s\n' % str(r1_name))
        lib_order.append(r1_name)
        with open(r1_name) as reads_in:
            # Note the crossed flags: "only second" means ignore the first
            # and vice versa.
            all_counts[r1_name] = count_features(
                pos_feat_list, reads_in, settings.overlap, length=25,
                ignore_first=settings.only_second,
                ignore_second=settings.only_first,
                count_singles=settings.singles)
    outt = csv.writer(sys.stdout, delimiter='\t')
    if not settings.quiet:
        outt.writerow(['Gene name'] + lib_order)
    for g in sorted(all_features):
        # In quiet mode features whose name starts with '~' are not printed.
        if settings.quiet and g.startswith('~'):
            continue
        outt.writerow([g] + [all_counts[libn][g] for libn in lib_order])
    return 0  # success
def main(argv=None):
    """Map each library with bwa and write per-library coverage and counts.

    For every first-mate fastq file (paired by position with the second-mate
    list, when present) the reads are mapped with ``RILseq.run_bwa``. When an
    annotation file is readable, per-feature counts are collected; when
    ``settings.create_wig`` is set, a coverage wiggle file is written for the
    library. Finally the full counts table (all libraries as columns) is
    written once per input fastq file.

    Args:
        argv: Command-line arguments forwarded to ``process_command_line``.

    Returns:
        0 on success.
    """
    settings = process_command_line(argv)
    if not os.path.exists(settings.dirout):
        os.makedirs(settings.dirout)
    if settings.genes_gff:
        try:
            with open(settings.genes_gff) as gff_in:
                pos_feat_list, all_features = RILseq.read_gtf(
                    gff_in, settings.feature, settings.identifier)
        except IOError:
            # Best effort: without a readable annotation, skip the counting.
            settings.genes_gff = None
    gcounts = {}
    lib_order = []
    fastq_1_list = list(RILseq.flat_list(settings.fastq_1))
    fastq_2_list = list(RILseq.flat_list(settings.fastq_2))
    for i, r1_name in enumerate(fastq_1_list):
        # A missing mate means the library is mapped as single-end.
        try:
            r2_name = fastq_2_list[i]
        except IndexError:
            r2_name = None
        libname = r1_name.rsplit('.', 1)[0].rsplit('/', 1)[-1]
        outhead = '%s_bwa' % libname
        bamname = RILseq.run_bwa(
            settings.bwa_exec, r1_name, r2_name,
            settings.dirout, outhead, settings.allowed_mismatches,
            settings.genome_fasta, settings.params_aln, settings.sampe_params,
            settings.samse_params, settings.samtools_cmd)
        samfile = pysam.Samfile(bamname)
        if settings.genes_gff:
            lib_order.append(libname)
            gcounts[libname] = RILseq.count_features(
                pos_feat_list, samfile, settings.overlap,
                rev=settings.reverse_complement)
        if settings.create_wig:
            coverage = RILseq.generate_wig(
                samfile, rev=settings.reverse_complement, first_pos=False)
            # Open only this library's wig file. Re-opening every wig file
            # in 'w' mode on each iteration (as was done before) truncated
            # the coverage already written for earlier libraries.
            wig_name = "%s/%s_coverage.wig" % (
                settings.dirout, r1_name.split("_cutadapt")[0])
            with open(wig_name, 'w') as outwig:
                RILseq.print_wiggle(
                    coverage, "%s_single_fragments_coverage" % libname,
                    "%s single fragments coverage" % libname, outwig)
    # Print the table of counts (the same table goes to every output file)
    if settings.genes_gff:
        feature_names = sorted(all_features)
        for r1_name in fastq_1_list:
            table_name = "%s/%s_counts.txt" % (
                settings.dirout, r1_name.split("_cutadapt")[0])
            with open(table_name, 'w') as outtable:
                outt = csv.writer(outtable, delimiter='\t')
                outt.writerow(['Gene name'] + lib_order)
                for g in feature_names:
                    outt.writerow(
                        [g] + [gcounts[libn][g] for libn in lib_order])
    return 0  # success
def main(argv=None):
    """Extract read ends from bam files, map them and write the reads table.

    For every input bam file the read ends are written to two fastq files
    with ``RILseq.get_unmapped_reads``. Each fastq is mapped with
    ``RILseq.run_bwa`` (or the expected bam name is reused when
    ``settings.skip_mapping`` is set), and the resulting pairs are written to
    stdout with ``RILseq.write_reads_table``.

    Args:
        argv: Command-line arguments forwarded to ``process_command_line``.

    Returns:
        0 on success.
    """
    settings = process_command_line(argv)
    # Read the transcripts if given
    if settings.transcripts:
        trans_dict = RILseq.read_transcripts(settings.transcripts)
    else:
        trans_dict = None
    # Decide where the single (non-chimeric) reads go, if anywhere.
    outall = None
    owns_outall = False
    if settings.all_reads:
        try:
            outall = open(settings.all_reads, 'w')
            owns_outall = True
        except IOError as err:
            # Best effort: continue without the singles file, but say so.
            sys.stderr.write(
                'Could not open %s: %s\n' % (settings.all_reads, err))
    elif settings.add_all_reads:
        outall = sys.stdout
    try:
        for bf in RILseq.flat_list(settings.bamfiles):
            bfin = pysam.Samfile(bf)
            libname = bf.rsplit('.', 1)[0].rsplit('/', 1)[-1]
            fsq1name = "%s/%s_ends_1.fastq" % (settings.dirout, libname)
            fsq2name = "%s/%s_ends_2.fastq" % (settings.dirout, libname)
            if settings.skip_mapping:
                # The fastq output is not needed; discard it.
                fsq1 = open(os.devnull, 'w')
                fsq2 = fsq1
            else:
                fsq1 = open(fsq1name, 'w')
                fsq2 = open(fsq2name, 'w')
            try:
                single_mapped = RILseq.get_unmapped_reads(
                    bfin, fsq1, fsq2, settings.length, settings.maxG,
                    rev=settings.reverse_complement, all_reads=True,
                    dust_thr=settings.dust_thr)
            finally:
                # Close (and therefore flush) the fastq files before bwa
                # reads them by name; otherwise buffered reads can be lost.
                fsq1.close()
                if fsq2 is not fsq1:
                    fsq2.close()
            reads_in = []
            # Map the fastq files to the genome
            for fqname in (fsq1name, fsq2name):
                bamheadname = fqname.rsplit('.', 1)[0].rsplit('/', 1)[-1]
                if settings.skip_mapping:
                    bamname = "%s/%s.bam" % (settings.dirout, bamheadname)
                else:
                    bamname = RILseq.run_bwa(
                        settings.bwa_exec, fqname, None,
                        settings.dirout, bamheadname, settings.max_mismatches,
                        settings.genome_fasta, settings.params_aln,
                        '', settings.samse_params,
                        settings.samtools_cmd,
                        processors=settings.processors)
                bamin = pysam.Samfile(bamname)
                reads_in.append(RILseq.read_bam_file(
                    bamin, bamin.references, settings.allowed_mismatches))
            RILseq.write_reads_table(
                sys.stdout, reads_in[0], reads_in[1], bfin.references,
                settings.distance, not settings.keep_circular,
                trans_dict, write_single=outall, single_mapped=single_mapped,
                max_NM=settings.allowed_mismatches)
    finally:
        # Only close the handle we opened ourselves, never sys.stdout.
        if owns_outall:
            outall.close()
    return 0  # success
def main(argv=None):
    """Map each library with bwa and write one coverage file and counts table.

    Every first-mate fastq file (paired by position with the second-mate
    list, when present) is mapped with ``RILseq.run_bwa``. The coverage of
    every library is appended as a track to a single wiggle file, and, when
    an annotation file is readable, a single per-feature counts table is
    written with one column per library.

    Args:
        argv: Command-line arguments forwarded to ``process_command_line``.

    Returns:
        0 on success.
    """
    settings = process_command_line(argv)
    if not os.path.exists(settings.dirout):
        os.makedirs(settings.dirout)
    wig_name = "%s/%s_coverage.wig" % (settings.dirout, settings.outhead)
    # The context manager guarantees the wiggle file is flushed and closed.
    with open(wig_name, 'w') as outwig:
        if settings.genes_gff:
            try:
                with open(settings.genes_gff) as gff_in:
                    pos_feat_list, all_features = RILseq.read_gtf(
                        gff_in, settings.feature, settings.identifier)
            except IOError:
                # Best effort: without a readable annotation, skip counting.
                settings.genes_gff = None
        gcounts = {}
        lib_order = []
        fastq_2_list = list(RILseq.flat_list(settings.fastq_2))
        for i, r1_name in enumerate(RILseq.flat_list(settings.fastq_1)):
            # A missing mate means the library is mapped as single-end.
            try:
                r2_name = fastq_2_list[i]
            except IndexError:
                r2_name = None
            libname = r1_name.rsplit('.', 1)[0].rsplit('/', 1)[-1]
            outhead = '%s_bwa' % libname
            bamname = RILseq.run_bwa(
                settings.bwa_exec, r1_name, r2_name,
                settings.dirout, outhead, settings.allowed_mismatches,
                settings.genome_fasta, settings.params_aln,
                settings.sampe_params, settings.samse_params,
                settings.samtools_cmd, processors=settings.processors)
            samfile = pysam.Samfile(bamname)
            if settings.genes_gff:
                lib_order.append(libname)
                gcounts[libname] = RILseq.count_features(
                    pos_feat_list, samfile, settings.overlap,
                    rev=settings.reverse_complement)
            coverage = RILseq.generate_wig(
                samfile, rev=settings.reverse_complement, first_pos=False)
            RILseq.print_wiggle(
                coverage, "%s_single_fragments_coverage" % libname,
                "%s single fragments coverage" % libname, outwig)
    # Print the table of counts
    if settings.genes_gff:
        table_name = "%s/%s_counts.txt" % (settings.dirout, settings.outhead)
        with open(table_name, 'w') as outtable:
            outt = csv.writer(outtable, delimiter='\t')
            outt.writerow(['Gene name'] + lib_order)
            for g in sorted(all_features):
                outt.writerow([g] + [gcounts[libn][g] for libn in lib_order])
    return 0  # success
def main(argv=None):
    """Print a per-feature read-count table for a set of reads files.

    Features come from ``RILseq.read_gtf`` applied to ``settings.genes_gff``.
    Each file from ``RILseq.flat_list(settings.reads_files)`` is counted with
    ``count_features`` and becomes one column of the tab-delimited table
    written to stdout.

    Args:
        argv: Command-line arguments forwarded to ``process_command_line``.

    Returns:
        0 on success, 1 when the annotation file cannot be read.
    """
    settings = process_command_line(argv)
    try:
        # Use a context manager so the annotation handle is not leaked.
        with open(settings.genes_gff) as annotation:
            pos_feat_list, all_features = RILseq.read_gtf(
                annotation, settings.feature, settings.identifier)
    except IOError as err:
        # Explain the failure rather than exiting silently.
        sys.stderr.write(
            'Could not read %s: %s\n' % (settings.genes_gff, err))
        return 1

    if settings.singles:
        # Counting singles forces "first only" mode.
        settings.only_first = True
        settings.only_second = False

    lib_order = []
    all_counts = {}
    for r1_name in RILseq.flat_list(settings.reads_files):
        sys.stderr.write('%s\n' % str(r1_name))
        lib_order.append(r1_name)
        with open(r1_name) as reads_fh:
            # "only second" is expressed as "ignore first" and vice versa.
            all_counts[r1_name] = count_features(
                pos_feat_list, reads_fh, settings.overlap, length=25,
                ignore_first=settings.only_second,
                ignore_second=settings.only_first,
                count_singles=settings.singles)

    outt = csv.writer(sys.stdout, delimiter='\t')
    if not settings.quiet:
        outt.writerow(['Gene name'] + lib_order)
    for g in sorted(all_features):
        # Quiet mode hides features whose name starts with '~'.
        if settings.quiet and g.startswith('~'):
            continue
        row_out = [g]
        row_out.extend(all_counts[libn][g] for libn in lib_order)
        outt.writerow(row_out)
    return 0  # success
def main(argv=None):
    """Test region pairs for significant interaction and report the results.

    Reads the chimeric reads table, optionally masking padded rRNA gene
    positions, then either reports every interacting pair
    (``settings.all_interactions``) or runs ``RILseq.minpv_regions`` on the
    pairs with at least ``settings.min_int`` interactions, multiplying each
    p-value by the number of tested pairs. When total-RNA bam files are
    given, per-region total counts and a normalisation ratio are computed.
    Everything is passed to ``RILseq.report_interactions``, which writes to
    stdout.

    Args:
        argv: Command-line arguments forwarded to ``process_command_line``.

    Returns:
        0 on success.

    Raises:
        IOError: propagated from ``read_genes_data`` or from opening the
            reads table.
    """
    settings = process_command_line(argv)
    if settings.ribozero and settings.bc_dir:
        # An IOError from the annotation reader simply propagates.
        uid_pos, _, _, _, _, rRNAs = RILseq.ecocyc_parser.read_genes_data(
            settings.bc_dir, rRNA_prod=settings.rrna_list)
        chr_fields = settings.BC_chrlist.split(',')
        # Odd-position entries are mapped to the even-position entries.
        chr_dict = dict(zip(chr_fields[1::2], chr_fields[0::2]))
        rr_pos = []
        for rrgene in rRNAs:
            gene_pos = uid_pos[rrgene]
            # Pad the position of the rRNA gene with the alignment length
            rr_pos.append([
                chr_dict[gene_pos[0]],
                gene_pos[1] - settings.length,
                gene_pos[2] + settings.length,
                gene_pos[3]])
    else:
        rr_pos = None
    with open(settings.reads_in) as reads_fh:
        (region_interactions, region_ints_as1, region_ints_as2,
         total_interactions) = RILseq.read_reads_table(
            reads_fh, settings.seglen, rr_pos, settings.only_singles)

    interacting_regions = []
    if settings.all_interactions:
        # If all interactions are desired, skip the tests and report all
        for reg1, r1data in region_interactions.items():
            for reg2, clist in r1data.items():
                interacting_regions.append(
                    (1, len(clist), 0, reg1[0], reg1[0]+settings.seglen,
                     reg1[1], reg1[2], reg2[0], reg2[0]+settings.seglen,
                     reg2[1], reg2[2], 0, 0, 0))
    else:
        # Now run the test for each pair of interacting regions
        found_in_interaction = defaultdict(bool)
        # Collect the pairs that pass the minimal number of interactions
        pairs_num = {}
        for reg1 in list(region_interactions.keys()):
            if region_ints_as1[reg1] < settings.min_int:
                continue
            for reg2 in list(region_interactions[reg1].keys()):
                n_pair = len(region_interactions[reg1][reg2])
                if n_pair >= settings.min_int:
                    pairs_num[(reg1, reg2)] = n_pair
        n_tests = len(pairs_num)
        # Iterate the pairs from the ones with many interactions down
        for (reg1, reg2) in sorted(
                pairs_num, key=pairs_num.get, reverse=True):
            (pv, ints, odds, r1_from, r1_to, r2_from, r2_to,
             mat_b, mat_c, mat_d) = RILseq.minpv_regions(
                reg1, reg2, region_interactions, region_ints_as1,
                region_ints_as2, total_interactions, found_in_interaction,
                settings.seglen, settings.maxsegs, settings.min_odds_ratio)
            # Scale the p-value by the number of tested pairs.
            pv *= n_tests
            if pv <= settings.max_pv:
                # Mark as participating
                for r1 in range(r1_from, r1_to, settings.seglen):
                    for r2 in range(r2_from, r2_to, settings.seglen):
                        found_in_interaction[
                            (r1, reg1[1], reg1[2],
                             r2, reg2[1], reg2[2])] = True
                # Report this interaction
                interacting_regions.append(
                    (pv, ints, odds,
                     r1_from, r1_to, reg1[1], reg1[2],
                     r2_from, r2_to, reg2[1], reg2[2],
                     mat_b, mat_c, mat_d))

    # Read the number of total RNAs in each region if the bam file is given
    sum_reads = 0
    totRNA_counts = defaultdict(int)
    max_IP_div_total = 0
    if settings.total_RNA:
        # Prepare, per chromosome+strand key, the regions covering each
        # position.
        feat_dict = defaultdict(lambda: defaultdict(set))
        for region_set in (region_ints_as1, region_ints_as2):
            for region in region_set:
                for i in range(settings.seglen):
                    feat_dict[region[2]+region[1]][region[0]+i].add(region)
        feat_list = {}
        for chrom, data in feat_dict.items():
            maxpos = max(data.keys())
            feat_list[chrom] = [list(data[k]) for k in range(maxpos+1)]
        for bamfile in settings.total_RNA.split(','):
            saminf = pysam.Samfile(bamfile)
            totcounts, sum_of_counts_lib = RILseq.count_features(
                feat_list, saminf, 5, rev=settings.total_reverse,
                get_sum=True)
            for k, v in totcounts.items():
                totRNA_counts[k] += v
            sum_reads += sum_of_counts_lib
        # Collect all the ratios between IP and total, then choose the
        # configured percentile to avoid outliers
        max_IP_div_total_as1 = []
        max_IP_div_total_as2 = []
        for reg, counts in totRNA_counts.items():
            if counts > settings.min_total_counts:
                counts = float(counts+1)
                max_IP_div_total_as1.append(region_ints_as1[reg]/counts)
                max_IP_div_total_as2.append(region_ints_as2[reg]/counts)
        mm_sorted = sorted(max_IP_div_total_as2+max_IP_div_total_as1)
        if mm_sorted:
            # Clamp the index so a percentile of 1.0 selects the last
            # element instead of raising IndexError.
            pct_index = min(
                int(len(mm_sorted)*settings.norm_percentile),
                len(mm_sorted)-1)
            max_IP_div_total = mm_sorted[pct_index]
        # With no region above the count threshold the ratio stays 0, the
        # same value used when no total-RNA data is given.
        sys.stderr.write("%f\n" % (max_IP_div_total))

    if settings.shuffles == 0 and settings.run_RNAup:
        settings.shuffles = -1
    # Read the additional data to decorate the results with
    RILseq.report_interactions(
        region_interactions, sys.stdout, interacting_regions, settings.seglen,
        settings.bc_dir, settings.genome, settings.BC_chrlist,
        settings.refseq_dir, settings.targets_file, settings.rep_table,
        settings.single_counts, settings.shuffles, settings.RNAup_cmd,
        settings.servers, settings.length, settings.est_utr_lens,
        settings.pad_seqs, totRNA_counts, max_IP_div_total,
        total_interactions, sum_reads, settings.linear_chromosome_list)
    return 0  # success
def main(argv=None):
    """Write a track of the two sides of each chimeric read to stdout.

    Read positions are taken from the reads list (optionally restricted to
    the significant pairs returned by ``RILseq.read_significant_reads``), the
    read sequences are fetched from the bam files, and for each read the
    length of each side is determined with ``extend_alignment``. One
    tab-delimited row is written per side, after a ``track`` header line.

    Args:
        argv: Command-line arguments forwarded to ``process_command_line``.

    Returns:
        0 on success.
    """
    settings = process_command_line(argv)
    genome = {}
    gsize = {}
    for sr in SeqIO.parse(settings.genome, 'fasta'):
        genome[sr.id] = sr.seq
        gsize[sr.id] = len(sr.seq)
    if len(settings.EC_chrlist) >= 2:
        chr_fields = settings.EC_chrlist.split(',')
        chr_dict = dict(zip(chr_fields[0::2], chr_fields[1::2]))
    else:
        chr_dict = {}
    if settings.summary:
        sig_reads = RILseq.read_significant_reads(
            settings.summary, chr_dict, gname=settings.gene_name)
    # Read the read names and positions
    read_5ps = {}
    read_3ps = {}
    with open(settings.list_reads) as reads_fh:
        for line in csv.reader(reads_fh, delimiter='\t'):
            # skip single
            if len(line) > 7 and line[7] == "single":
                continue
            # Positions are stored 0-based: (coordinate, strand, chromosome)
            end_5p = (int(line[1])-1, line[2], line[0])
            end_3p = (int(line[4])-1, line[5], line[3])
            if settings.summary and end_3p not in sig_reads[end_5p]:
                continue
            read_5ps[line[6]] = list(end_5p)
            read_3ps[line[6]] = list(end_3p)
    # Read the bam files and return the long sequences
    r1_seqs = {}
    r2_seqs = {}
    for bamfile in list(RILseq.flat_list(settings.bamfiles)):
        r1s, r2s = get_reads_seqs(
            pysam.Samfile(bamfile), read_5ps.keys(), rev=settings.reverse)
        r1_seqs.update(r1s)
        r2_seqs.update(r2s)
    # For each read find the overlap, if exists and find the fusion point
    outer = csv.writer(sys.stdout, delimiter='\t')
    # sys.stdout.write produces the same line as the former Python 2 print
    # statement and is also valid on Python 3.
    sys.stdout.write(
        'track name="%s" description="%s" visibility=4 itemRgb="On" useScore=0'
        % (settings.track_name, settings.track_desc) + '\n')
    # NOTE: the code is written so r1 is the 3' end of the fragment
    for rname in set(r2_seqs.keys()):
        r2seq = r2_seqs[rname]
        # Single-end reads have no first mate sequence.
        r1seq = r1_seqs[rname] if rname in r1_seqs else ''
        s1, overlap, s2 = find_overlap(r2seq, r1seq)
        fragment = s1+overlap+s2
        pos5, strand5, chrom5 = read_5ps[rname]
        pos3, strand3, chrom3 = read_3ps[rname]
        side_5p_len = extend_alignment(
            fragment, pos5, 0, False, strand5, genome[chrom5])
        side_3p_len = extend_alignment(
            fragment, 0, pos3, True, strand3, genome[chrom3])
        # Write each of the sides to the output file
        score = 0
        if settings.rand_score:
            score = random.randint(0, 1000)
        # Coordinates are clipped to [0, chromosome length].
        if strand5 == '+':
            gfrom = max(0, pos5)
            gto = min(gsize[chrom5], pos5+side_5p_len)
            outer.writerow([
                chrom5, gfrom, gto, "%s_5p" % rname, score, '+',
                gfrom, gto, settings.pos_first])
        elif strand5 == '-':
            gfrom = max(0, pos5-side_5p_len+1)
            gto = min(gsize[chrom5], pos5+1)
            outer.writerow([
                chrom5, gfrom, gto, "%s_5p" % rname, score, '-',
                gfrom, gto, settings.rev_first])
        if strand3 == '+':
            gfrom = max(0, pos3-side_3p_len+1)
            gto = min(gsize[chrom3], pos3+1)
            outer.writerow([
                chrom3, gfrom, gto, "%s_3p" % rname, score, '+',
                gfrom, gto, settings.pos_second])
        elif strand3 == '-':
            gfrom = max(0, pos3)
            gto = min(gsize[chrom3], pos3+side_3p_len)
            outer.writerow([
                chrom3, gfrom, gto, "%s_3p" % rname, score, '-',
                gfrom, gto, settings.rev_second])
    return 0  # success
def main(argv=None):
    """Test region pairs for significant interaction and report the results.

    Reads the chimeric reads table, optionally masking padded rRNA gene
    positions, then either reports every interacting pair
    (``settings.all_interactions``) or runs ``RILseq.minpv_regions`` on the
    pairs with at least ``settings.min_int`` interactions, multiplying each
    p-value by the number of tested pairs. When total-RNA bam files are
    given, per-region total counts and a normalisation ratio are computed.
    Everything is passed to ``RILseq.report_interactions``, which writes to
    stdout.

    Args:
        argv: Command-line arguments forwarded to ``process_command_line``.

    Returns:
        0 on success.

    Raises:
        IOError: propagated from ``read_genes_data`` or from opening the
            reads table.
    """
    settings = process_command_line(argv)
    if settings.ribozero:
        # An IOError from the annotation reader simply propagates.
        uid_pos, _, _, _, _, rRNAs = RILseq.ecocyc_parser.read_genes_data(
            settings.ec_dir)
        chr_fields = settings.EC_chrlist.split(',')
        # Odd-position entries are mapped to the even-position entries.
        chr_dict = dict(zip(chr_fields[1::2], chr_fields[0::2]))
        rr_pos = []
        for rrgene in rRNAs:
            gene_pos = uid_pos[rrgene]
            # Pad the position of the rRNA gene with the alignment length
            rr_pos.append([
                chr_dict[gene_pos[0]],
                gene_pos[1] - settings.length,
                gene_pos[2] + settings.length,
                gene_pos[3]])
    else:
        rr_pos = None
    with open(settings.reads_in) as reads_fh:
        (region_interactions, region_ints_as1, region_ints_as2,
         total_interactions) = RILseq.read_reads_table(
            reads_fh, settings.seglen, rr_pos, settings.only_singles)

    interacting_regions = []
    if settings.all_interactions:
        # If all interactions are desired, skip the tests and report all
        for reg1, r1data in region_interactions.items():
            for reg2, clist in r1data.items():
                interacting_regions.append(
                    (1, len(clist), 0, reg1[0], reg1[0]+settings.seglen,
                     reg1[1], reg1[2], reg2[0], reg2[0]+settings.seglen,
                     reg2[1], reg2[2], 0, 0, 0))
    else:
        # Now run the test for each pair of interacting regions
        found_in_interaction = defaultdict(bool)
        # Collect the pairs that pass the minimal number of interactions
        pairs_num = {}
        for reg1 in list(region_interactions.keys()):
            if region_ints_as1[reg1] < settings.min_int:
                continue
            for reg2 in list(region_interactions[reg1].keys()):
                n_pair = len(region_interactions[reg1][reg2])
                if n_pair >= settings.min_int:
                    pairs_num[(reg1, reg2)] = n_pair
        n_tests = len(pairs_num)
        # Iterate the pairs from the ones with many interactions down
        for (reg1, reg2) in sorted(
                pairs_num, key=pairs_num.get, reverse=True):
            (pv, ints, odds, r1_from, r1_to, r2_from, r2_to,
             mat_b, mat_c, mat_d) = RILseq.minpv_regions(
                reg1, reg2, region_interactions, region_ints_as1,
                region_ints_as2, total_interactions, found_in_interaction,
                settings.seglen, settings.maxsegs, settings.min_odds_ratio)
            # Scale the p-value by the number of tested pairs.
            pv *= n_tests
            if pv <= settings.max_pv:
                # Mark as participating
                for r1 in range(r1_from, r1_to, settings.seglen):
                    for r2 in range(r2_from, r2_to, settings.seglen):
                        found_in_interaction[
                            (r1, reg1[1], reg1[2],
                             r2, reg2[1], reg2[2])] = True
                # Report this interaction
                interacting_regions.append(
                    (pv, ints, odds,
                     r1_from, r1_to, reg1[1], reg1[2],
                     r2_from, r2_to, reg2[1], reg2[2],
                     mat_b, mat_c, mat_d))

    # Read the number of total RNAs in each region if the bam file is given
    totRNA_counts = defaultdict(int)
    max_IP_div_total = 0
    if settings.total_RNA:
        # Prepare, per chromosome+strand key, the regions covering each
        # position.
        feat_dict = defaultdict(lambda: defaultdict(set))
        for region_set in (region_ints_as1, region_ints_as2):
            for region in region_set:
                for i in range(settings.seglen):
                    feat_dict[region[2]+region[1]][region[0]+i].add(region)
        feat_list = {}
        for chrom, data in feat_dict.items():
            maxpos = max(data.keys())
            feat_list[chrom] = [list(data[k]) for k in range(maxpos+1)]
        for bamfile in settings.total_RNA.split(','):
            totcounts = RILseq.count_features(
                feat_list, pysam.Samfile(bamfile), 5,
                rev=settings.total_reverse)
            for k, v in totcounts.items():
                totRNA_counts[k] += v
        # Collect all the ratios between IP and total, then choose the
        # configured percentile to avoid outliers
        max_IP_div_total_as1 = []
        max_IP_div_total_as2 = []
        for reg, counts in totRNA_counts.items():
            if counts > settings.min_total_counts:
                counts = float(counts+1)
                max_IP_div_total_as1.append(region_ints_as1[reg]/counts)
                max_IP_div_total_as2.append(region_ints_as2[reg]/counts)
        mm_sorted = sorted(max_IP_div_total_as2+max_IP_div_total_as1)
        if mm_sorted:
            # Clamp the index so a percentile of 1.0 selects the last
            # element instead of raising IndexError.
            pct_index = min(
                int(len(mm_sorted)*settings.norm_percentile),
                len(mm_sorted)-1)
            max_IP_div_total = mm_sorted[pct_index]
        # With no region above the count threshold the ratio stays 0, the
        # same value used when no total-RNA data is given.
        sys.stderr.write("%f\n" % (max_IP_div_total))

    # Read the additional data to decorate the results with
    RILseq.report_interactions(
        region_interactions, sys.stdout, interacting_regions, settings.seglen,
        settings.ec_dir, settings.EC_chrlist, settings.refseq_dir,
        settings.targets_file, settings.rep_table, settings.single_counts,
        settings.shuffles, settings.RNAup_cmd, settings.servers,
        settings.length, settings.est_utr_lens, settings.pad_seqs,
        totRNA_counts, max_IP_div_total)
    return 0  # success
def main(argv=None):
    """Write a track of the two sides of each chimeric read to stdout.

    Read positions are taken from the reads list (optionally restricted to
    the significant pairs returned by ``RILseq.read_significant_reads``), the
    read sequences are fetched from the bam files, and for each read the
    length of each side is determined with ``extend_alignment``. One
    tab-delimited row is written per side, after a ``track`` header line.
    Per-read diagnostics go to stderr so that stdout holds only the track.

    Args:
        argv: Command-line arguments forwarded to ``process_command_line``.

    Returns:
        0 on success.
    """
    settings = process_command_line(argv)
    genome = {}
    gsize = {}  # genome size dictionary - {chr: size}
    for sr in SeqIO.parse(settings.genome, 'fasta'):
        genome[sr.id] = sr.seq
        gsize[sr.id] = len(sr.seq)
    if len(settings.BC_chrlist) >= 2:
        # Even-position entries are mapped to the odd-position entries.
        chr_fields = settings.BC_chrlist.split(',')
        chr_dict = dict(zip(chr_fields[0::2], chr_fields[1::2]))
    else:
        chr_dict = {}
    if settings.summary:
        # Restrict to reads of the significant interactions in the summary
        # file; settings.gene_name is forwarded as gname.
        sig_reads = RILseq.read_significant_reads(
            settings.summary, chr_dict, gname=settings.gene_name)
    # Read the read names and positions
    read_5ps = {}
    read_3ps = {}
    with open(settings.list_reads) as reads_fh:
        for line in csv.reader(reads_fh, delimiter='\t'):
            # skip single
            if len(line) > 7 and line[7] == "single":
                continue
            # Positions are stored 0-based: (coordinate, strand, chromosome)
            end_5p = (int(line[1])-1, line[2], line[0])
            end_3p = (int(line[4])-1, line[5], line[3])
            # Skip when (coord_2, strand_2, chrom_2) is not listed among the
            # significant partners of (coord_1, strand_1, chrom_1).
            if settings.summary and end_3p not in sig_reads[end_5p]:
                continue
            read_5ps[line[6]] = list(end_5p)
            read_3ps[line[6]] = list(end_3p)
    # Read the bam files and return the long sequences
    r1_seqs = {}
    r2_seqs = {}
    for bamfile in list(RILseq.flat_list(settings.bamfiles)):
        r1s, r2s = get_reads_seqs(
            pysam.AlignmentFile(bamfile, 'rb'), read_5ps.keys(),
            rev=settings.reverse)
        r1_seqs.update(r1s)
        r2_seqs.update(r2s)
    # For each read find the overlap, if exists and find the fusion point
    outer = csv.writer(sys.stdout, delimiter='\t')
    print('track name="%s" description="%s" visibility=4 itemRgb="On" useScore=0' % (
        settings.track_name, settings.track_desc))
    # NOTE: the code is written so r1 is the 3' end of the fragment
    for rname in set(r2_seqs.keys()):
        # r2seq, r1seq are the sequences from the bam files for paired end
        r2seq = r2_seqs[rname]
        # Single-end reads have no first mate sequence.
        r1seq = r1_seqs[rname] if rname in r1_seqs else ''
        s1, overlap, s2 = find_overlap(r2seq, r1seq)
        fragment = s1+overlap+s2
        pos5, strand5, chrom5 = read_5ps[rname]
        pos3, strand3, chrom3 = read_3ps[rname]
        # Diagnostics: written to stderr (previously stdout, where they were
        # interleaved with the track rows and corrupted the output).
        sys.stderr.write("%s\n%s\n%s\n" % (fragment, pos5, pos3))
        side_5p_len = extend_alignment(
            fragment, pos5, 0, False, strand5, genome[chrom5])
        side_3p_len = extend_alignment(
            fragment, 0, pos3, True, strand3, genome[chrom3])
        # Write each of the sides to the output file
        score = 0
        if settings.rand_score:
            score = random.randint(0, 1000)
        # Coordinates are clipped to [0, chromosome length].
        if strand5 == '+':
            gfrom = max(0, pos5)
            gto = min(gsize[chrom5], pos5+side_5p_len)
            outer.writerow([
                chrom5, gfrom, gto, "%s_5p" % rname, score, '+',
                gfrom, gto, settings.pos_first])
        elif strand5 == '-':
            gfrom = max(0, pos5-side_5p_len+1)
            gto = min(gsize[chrom5], pos5+1)
            outer.writerow([
                chrom5, gfrom, gto, "%s_5p" % rname, score, '-',
                gfrom, gto, settings.rev_first])
        if strand3 == '+':
            gfrom = max(0, pos3-side_3p_len+1)
            gto = min(gsize[chrom3], pos3+1)
            outer.writerow([
                chrom3, gfrom, gto, "%s_3p" % rname, score, '+',
                gfrom, gto, settings.pos_second])
        elif strand3 == '-':
            gfrom = max(0, pos3)
            gto = min(gsize[chrom3], pos3+side_3p_len)
            outer.writerow([
                chrom3, gfrom, gto, "%s_3p" % rname, score, '-',
                gfrom, gto, settings.rev_second])
    return 0  # success
def main(argv=None):
    """Write one link line per pair of interacting regions to stdout.

    Interactions between regions on chromosome ``settings.chrn`` are summed
    per pair of region start positions (optionally counting only reads
    listed as significant). Pairs with more than
    ``settings.min_interactions`` interactions are printed with a thickness
    derived from the base-10 logarithm of the count, recoloured orange when
    either region overlaps an sRNA and red when the pair matches a known
    interaction.

    Args:
        argv: Command-line arguments forwarded to ``process_command_line``.

    Returns:
        0 on success.
    """
    settings = process_command_line(argv)
    if len(settings.EC_chrlist) >= 2:
        chr_fields = settings.EC_chrlist.split(',')
        chr_dict = dict(zip(chr_fields[0::2], chr_fields[1::2]))
    else:
        chr_dict = {}
    with open(settings.reads_in) as reads_fh:
        region_interactions, _, _, _ = RILseq.read_reads_table(
            reads_fh, settings.region)
    both_strs = defaultdict(lambda: defaultdict(int))
    if settings.summary:
        sig_reads = RILseq.read_significant_reads(
            settings.summary, chr_dict)
    if settings.known:
        # Map every position of a known partner to all positions of the
        # other partner, in both directions.
        known_reads = defaultdict(list)
        ptt_c, _ = get_coords("%s.ptt.gz" % settings.refseq_dir)
        rnt_c, _ = get_coords("%s.rnt.gz" % settings.refseq_dir)
        with open(settings.known) as known_fh:
            for line in known_fh:
                spl = line.strip().split()
                if len(spl) < 2:
                    # Blank or incomplete lines used to raise IndexError.
                    continue
                try:
                    scoor = rnt_c[spl[0]]
                    rcoor = ptt_c[spl[1]]
                except KeyError:
                    # Pairs with a name missing from the annotation are
                    # skipped.
                    continue
                for i in range(scoor[0], scoor[1]):
                    for j in range(rcoor[0], rcoor[1]):
                        known_reads[i].append(j)
                        known_reads[j].append(i)
    for reg1 in region_interactions:
        if reg1[2] != settings.chrn:
            continue
        for reg2 in region_interactions[reg1]:
            if reg2[2] != settings.chrn:
                continue
            if settings.summary:
                # Count only the read pairs listed as significant.
                nsigs = 0
                for r1, r2 in region_interactions[reg1][reg2]:
                    nsigs += int(
                        (r2, reg2[1], reg2[2]) in
                        sig_reads[(r1, reg1[1], reg1[2])])
            else:
                nsigs = len(region_interactions[reg1][reg2])
            both_strs[reg1[0]][reg2[0]] += nsigs
    if settings.sRNAs:
        from RILseq.ecocyc_parser import read_genes_data
        uid_pos, _, _, sRNAs_list, _, _ = read_genes_data(settings.ec_dir)
        # All positions covered by an sRNA gene
        sposs = set()
        for g in sRNAs_list:
            sposs.update(range(uid_pos[g][1], uid_pos[g][2]))
    for r1 in both_strs:
        for r2 in both_strs[r1]:
            if both_strs[r1][r2] <= settings.min_interactions:
                continue
            # The log ratio is kept as written: math.log10 may round
            # differently at exact powers of ten.
            color = 'thickness=%dp' % max(
                int(log(both_strs[r1][r2])/log(10)), 1)
            r1_range = set(range(r1, r1+settings.region))
            r2_range = set(range(r2, r2+settings.region))
            if settings.sRNAs:
                if (r1_range | r2_range) & sposs:
                    color = 'color=orange'
            if settings.known:
                # A known interaction takes precedence over the sRNA colour.
                for k in r1_range & set(known_reads.keys()):
                    if r2_range & set(known_reads[k]):
                        color = 'color=red'
            sys.stdout.write('%s %d %d %s %d %d %s\n' % (
                settings.print_chr, r1+1, r1+settings.region,
                settings.print_chr, r2+1, r2+settings.region, color))
    return 0  # success
def main(argv=None):
    """Extract read ends from bam files, map them and write the reads table.

    For every input bam file the read ends are written to two fastq files
    with ``RILseq.get_unmapped_reads``. Each fastq is mapped with
    ``RILseq.run_bwa`` (or the expected bam name is reused when
    ``settings.skip_mapping`` is set), and the resulting pairs are written to
    stdout with ``RILseq.write_reads_table``.

    Args:
        argv: Command-line arguments forwarded to ``process_command_line``.

    Returns:
        0 on success.
    """
    settings = process_command_line(argv)
    # Read the transcripts if given
    trans_dict = None
    if settings.transcripts:
        trans_dict = RILseq.read_transcripts(settings.transcripts)
    # Decide where the single (non-chimeric) reads go, if anywhere.
    outall = None
    opened_here = False
    if settings.all_reads:
        try:
            outall = open(settings.all_reads, 'w')
        except IOError as err:
            # Best effort: continue without the singles file, but say so.
            sys.stderr.write(
                'Could not open %s: %s\n' % (settings.all_reads, err))
        else:
            opened_here = True
    elif settings.add_all_reads:
        outall = sys.stdout
    try:
        for bf in RILseq.flat_list(settings.bamfiles):
            bfin = pysam.Samfile(bf)
            outhead = bf.rsplit('.', 1)[0]
            libname = outhead.rsplit('/', 1)[-1]
            fsq1name = "%s/%s_ends_1.fastq" % (settings.dirout, libname)
            fsq2name = "%s/%s_ends_2.fastq" % (settings.dirout, libname)
            if settings.skip_mapping:
                # The fastq output is not needed; discard it.
                fsq1 = fsq2 = open(os.devnull, 'w')
            else:
                fsq1 = open(fsq1name, 'w')
                fsq2 = open(fsq2name, 'w')
            try:
                single_mapped = RILseq.get_unmapped_reads(
                    bfin, fsq1, fsq2, settings.length, settings.maxG,
                    rev=settings.reverse_complement, all_reads=True,
                    dust_thr=settings.dust_thr)
            finally:
                # The fastq files must be closed (flushed) before bwa opens
                # them by name, or the tail of the data may be missing.
                fsq1.close()
                if fsq2 is not fsq1:
                    fsq2.close()
            # Map the fastq files to the genome
            reads_in = []
            for fqname in (fsq1name, fsq2name):
                bamheadname = fqname.rsplit('.', 1)[0].rsplit('/', 1)[-1]
                if settings.skip_mapping:
                    bamname = "%s/%s.bam" % (settings.dirout, bamheadname)
                else:
                    bamname = RILseq.run_bwa(
                        settings.bwa_exec, fqname, None, settings.dirout,
                        bamheadname, settings.max_mismatches,
                        settings.genome_fasta, settings.params_aln, '',
                        settings.samse_params, settings.samtools_cmd,
                        processors=settings.processors)
                bamin = pysam.Samfile(bamname)
                reads_in.append(
                    RILseq.read_bam_file(
                        bamin, bamin.references,
                        settings.allowed_mismatches))
            RILseq.write_reads_table(
                sys.stdout, reads_in[0], reads_in[1], bfin.references,
                settings.distance, not settings.keep_circular, trans_dict,
                write_single=outall, single_mapped=single_mapped,
                max_NM=settings.allowed_mismatches)
    finally:
        # Close only the file opened here, never sys.stdout.
        if opened_here:
            outall.close()
    return 0  # success