def main(argv=None):
    """Map every single-fragment library with BWA and write its outputs.

    Each first-mate fastq file in ``settings.fastq_1`` is paired by position
    with the file at the same index of ``settings.fastq_2`` (``None`` when
    there is no mate) and mapped with ``RILseq.run_bwa``.  If a genes
    annotation file could be read, reads are counted per feature for each
    library.  If ``settings.create_wig`` is set, a coverage wiggle file is
    written for each library.  Finally a tab-delimited counts table is
    written once per input fastq file.

    Args:
        argv: command-line arguments, forwarded to ``process_command_line``.

    Returns:
        0 on success.
    """
    settings = process_command_line(argv)
    if not os.path.exists(settings.dirout):
        os.makedirs(settings.dirout)

    if settings.genes_gff:
        try:
            with open(settings.genes_gff) as gff_file:
                pos_feat_list, all_features = RILseq.read_gtf(
                    gff_file, settings.feature, settings.identifier)
        except IOError:
            # An unreadable annotation file is not fatal: feature counting
            # and the counts tables are simply switched off.
            settings.genes_gff = None

    gcounts = {}
    lib_order = []
    fastq_1_list = list(RILseq.flat_list(settings.fastq_1))
    fastq_2_list = list(RILseq.flat_list(settings.fastq_2))
    for i, r1_name in enumerate(fastq_1_list):
        # A library without a second-mate file is mapped as single-end.
        r2_name = fastq_2_list[i] if i < len(fastq_2_list) else None
        # Library name: the fastq file name without directory and extension.
        libname = r1_name.rsplit('.', 1)[0].rsplit('/', 1)[-1]
        outhead = '%s_bwa' % libname
        bamname = RILseq.run_bwa(
            settings.bwa_exec, r1_name, r2_name,
            settings.dirout, outhead, settings.allowed_mismatches,
            settings.genome_fasta, settings.params_aln, settings.sampe_params,
            settings.samse_params, settings.samtools_cmd)
        samfile = pysam.Samfile(bamname)
        if settings.genes_gff:
            lib_order.append(libname)
            gcounts[libname] = RILseq.count_features(
                pos_feat_list, samfile, settings.overlap,
                rev=settings.reverse_complement)
        if settings.create_wig:
            coverage = RILseq.generate_wig(
                samfile, rev=settings.reverse_complement, first_pos=False)
            # Open only this library's wig file, and only once.  Re-opening
            # every library's file in 'w' mode on each iteration would
            # truncate the coverage already written for earlier libraries.
            wig_path = "%s/%s_coverage.wig" % (
                settings.dirout, r1_name.split("_cutadapt")[0])
            with open(wig_path, 'w') as outwig:
                RILseq.print_wiggle(
                    coverage, "%s_single_fragments_coverage" % libname,
                    "%s single fragments coverage" % libname, outwig)

    # Print the table of counts
    if settings.genes_gff:
        gene_names = sorted(all_features)
        # NOTE(review): every per-fastq table holds the counts of ALL
        # libraries (one column per library), so the files are identical.
        # Kept as-is; confirm whether one column per file was intended.
        for r1_name in fastq_1_list:
            table_path = "%s/%s_counts.txt" % (
                settings.dirout, r1_name.split("_cutadapt")[0])
            with open(table_path, 'w') as outtable:
                outt = csv.writer(outtable, delimiter='\t')
                outt.writerow(['Gene name'] + lib_order)
                for g in gene_names:
                    outt.writerow(
                        [g] + [gcounts[libn][g] for libn in lib_order])
    return 0        # success
def main(argv=None):
    """Map single-fragment libraries with BWA into one wig and one table.

    Each first-mate fastq file in ``settings.fastq_1`` is paired by position
    with the file at the same index of ``settings.fastq_2`` (``None`` when
    there is no mate) and mapped with ``RILseq.run_bwa``.  The coverage of
    every library is appended to a single wiggle file named after
    ``settings.outhead``.  If a genes annotation file could be read, reads
    are counted per feature and one tab-delimited counts table is written
    with a column per library.

    Args:
        argv: command-line arguments, forwarded to ``process_command_line``.

    Returns:
        0 on success.
    """
    settings = process_command_line(argv)
    if not os.path.exists(settings.dirout):
        os.makedirs(settings.dirout)

    if settings.genes_gff:
        try:
            with open(settings.genes_gff) as gff_file:
                pos_feat_list, all_features = RILseq.read_gtf(
                    gff_file, settings.feature, settings.identifier)
        except IOError:
            # An unreadable annotation file is not fatal: feature counting
            # and the counts table are simply switched off.
            settings.genes_gff = None

    gcounts = {}
    lib_order = []
    fastq_2_list = list(RILseq.flat_list(settings.fastq_2))
    wig_path = "%s/%s_coverage.wig" % (settings.dirout, settings.outhead)
    # The context manager guarantees the shared wig file is flushed and
    # closed even if mapping one of the libraries fails.
    with open(wig_path, 'w') as outwig:
        for i, r1_name in enumerate(RILseq.flat_list(settings.fastq_1)):
            # A library without a second-mate file is mapped as single-end.
            r2_name = fastq_2_list[i] if i < len(fastq_2_list) else None
            # Library name: the fastq file name without directory and
            # extension.
            libname = r1_name.rsplit('.', 1)[0].rsplit('/', 1)[-1]
            outhead = '%s_bwa' % libname
            bamname = RILseq.run_bwa(
                settings.bwa_exec, r1_name, r2_name,
                settings.dirout, outhead, settings.allowed_mismatches,
                settings.genome_fasta, settings.params_aln,
                settings.sampe_params, settings.samse_params,
                settings.samtools_cmd, processors=settings.processors)
            samfile = pysam.Samfile(bamname)
            if settings.genes_gff:
                lib_order.append(libname)
                gcounts[libname] = RILseq.count_features(
                    pos_feat_list, samfile, settings.overlap,
                    rev=settings.reverse_complement)
            coverage = RILseq.generate_wig(
                samfile, rev=settings.reverse_complement, first_pos=False)
            RILseq.print_wiggle(
                coverage, "%s_single_fragments_coverage" % libname,
                "%s single fragments coverage" % libname, outwig)

    # Print the table of counts
    if settings.genes_gff:
        table_path = "%s/%s_counts.txt" % (settings.dirout, settings.outhead)
        with open(table_path, 'w') as outtable:
            outt = csv.writer(outtable, delimiter='\t')
            outt.writerow(['Gene name'] + lib_order)
            for g in sorted(all_features):
                outt.writerow([g] + [gcounts[libn][g] for libn in lib_order])
    return 0        # success
def main(argv=None):
    """Test region pairs for interaction enrichment and report the results.

    Steps, in order:

    1. If ``settings.ribozero`` and ``settings.bc_dir`` are both set, collect
       the positions of the rRNA genes, padded by ``settings.length``, and
       pass them to ``RILseq.read_reads_table``.
    2. Read the reads table into per-region-pair interaction lists.
    3. Either report every pair (``settings.all_interactions``) or test each
       pair that has at least ``settings.min_int`` interactions with
       ``RILseq.minpv_regions``, keeping those whose corrected p-value is at
       most ``settings.max_pv``.
    4. If total-RNA BAM files are given, count total-RNA reads per region
       and derive the IP/total ratio at ``settings.norm_percentile``.
    5. Write the report to stdout with ``RILseq.report_interactions``.

    Args:
        argv: command-line arguments, forwarded to ``process_command_line``.

    Returns:
        0 on success.

    Raises:
        IndexError: total-RNA files were given but no region has more than
            ``settings.min_total_counts`` total-RNA reads.
    """
    settings = process_command_line(argv)

    if settings.ribozero and settings.bc_dir:
        uid_pos, _, _, _, _, rRNAs = RILseq.ecocyc_parser.read_genes_data(
            settings.bc_dir, rRNA_prod=settings.rrna_list)
        # BC_chrlist is a flat comma-separated list of pairs; the second
        # item of each pair is mapped to the first.
        chr_fields = settings.BC_chrlist.split(',')
        chr_dict = dict(zip(chr_fields[1::2], chr_fields[0::2]))
        rr_pos = []
        for rrgene in rRNAs:
            gene_pos = uid_pos[rrgene]
            # Pad the position of the rRNA gene with the alignment length
            rr_pos.append([
                chr_dict[gene_pos[0]],
                gene_pos[1] - settings.length,
                gene_pos[2] + settings.length,
                gene_pos[3]])
    else:
        rr_pos = None

    with open(settings.reads_in) as reads_file:
        (region_interactions, region_ints_as1, region_ints_as2,
         total_interactions) = RILseq.read_reads_table(
            reads_file, settings.seglen, rr_pos, settings.only_singles)

    interacting_regions = []
    if settings.all_interactions:
        # If all interactions are desired, skip the tests and report all
        for reg1, r1data in region_interactions.items():
            for reg2, clist in r1data.items():
                interacting_regions.append(
                    (1, len(clist), 0,
                     reg1[0], reg1[0] + settings.seglen, reg1[1], reg1[2],
                     reg2[0], reg2[0] + settings.seglen, reg2[1], reg2[2],
                     0, 0, 0))
    else:
        # Now run the test for each pair of interacting regions
        found_in_interaction = defaultdict(bool)
        # Candidate pairs: both the first region overall and the pair itself
        # must reach the minimal number of interactions.
        pairs_num = {}
        for reg1, r1data in region_interactions.items():
            if region_ints_as1[reg1] < settings.min_int:
                continue
            for reg2, clist in r1data.items():
                if len(clist) >= settings.min_int:
                    pairs_num[(reg1, reg2)] = len(clist)
        # Iterate the pairs from the one with the most interactions down
        for (reg1, reg2) in sorted(pairs_num, key=pairs_num.get, reverse=True):
            (pv, ints, odds, r1_from, r1_to, r2_from, r2_to,
             mat_b, mat_c, mat_d) = RILseq.minpv_regions(
                reg1, reg2, region_interactions, region_ints_as1,
                region_ints_as2, total_interactions, found_in_interaction,
                settings.seglen, settings.maxsegs, settings.min_odds_ratio)
            # Multiple-testing correction: scale by the number of pairs.
            pv *= len(pairs_num)
            if pv <= settings.max_pv:
                # Mark as participating
                for r1 in range(r1_from, r1_to, settings.seglen):
                    for r2 in range(r2_from, r2_to, settings.seglen):
                        found_in_interaction[
                            (r1, reg1[1], reg1[2],
                             r2, reg2[1], reg2[2])] = True
                # Report this interaction
                interacting_regions.append(
                    (pv, ints, odds, r1_from, r1_to, reg1[1], reg1[2],
                     r2_from, r2_to, reg2[1], reg2[2], mat_b, mat_c, mat_d))

    # Read the number of total RNAs in each region if the bam file is given
    sum_reads = 0
    totRNA_counts = defaultdict(int)
    max_IP_div_total = 0
    if settings.total_RNA:
        # Map every position covered by a region to the regions covering it,
        # keyed by the concatenation of region[2] and region[1].
        feat_dict = defaultdict(lambda: defaultdict(set))
        for regions in (region_ints_as1, region_ints_as2):
            for region in regions:
                for i in range(settings.seglen):
                    feat_dict[region[2] + region[1]][region[0] + i].add(region)
        feat_list = {}
        for chrom, by_pos in feat_dict.items():
            # .get() avoids inserting an empty set into the defaultdict for
            # every position that no region covers.
            feat_list[chrom] = [
                list(by_pos.get(pos, ())) for pos in range(max(by_pos) + 1)]

        for bamfile in settings.total_RNA.split(','):
            saminf = pysam.Samfile(bamfile)
            totcounts, sum_of_counts_lib = RILseq.count_features(
                feat_list, saminf, 5, rev=settings.total_reverse,
                get_sum=True)
            for k, v in totcounts.items():
                totRNA_counts[k] += v
            sum_reads += sum_of_counts_lib

        # Collect all the ratios between IP and total, then choose the
        # requested percentile to avoid outliers.
        ip_div_total = []
        for reg, counts in totRNA_counts.items():
            if counts > settings.min_total_counts:
                counts = float(counts + 1)
                ip_div_total.append(region_ints_as1[reg] / counts)
                ip_div_total.append(region_ints_as2[reg] / counts)
        if not ip_div_total:
            # Same exception type as the bare indexing failure this replaces,
            # but with a message that tells the user what went wrong.
            raise IndexError(
                "no region has more than %s total-RNA reads; cannot compute "
                "the IP/total normalization" % settings.min_total_counts)
        ip_div_total.sort()
        # Clamp to the last element so a percentile of 1.0 selects the
        # maximum instead of indexing one past the end of the list.
        pct_index = min(
            int(len(ip_div_total) * settings.norm_percentile),
            len(ip_div_total) - 1)
        max_IP_div_total = ip_div_total[pct_index]
        sys.stderr.write("%f\n" % (max_IP_div_total))

    # NOTE(review): -1 presumably asks report_interactions to run RNAup
    # without shuffled controls -- confirm against RILseq.
    if settings.shuffles == 0 and settings.run_RNAup:
        settings.shuffles = -1

    # Read the additional data to decorate the results with
    RILseq.report_interactions(
        region_interactions, sys.stdout, interacting_regions, settings.seglen,
        settings.bc_dir, settings.genome, settings.BC_chrlist,
        settings.refseq_dir, settings.targets_file, settings.rep_table,
        settings.single_counts, settings.shuffles, settings.RNAup_cmd,
        settings.servers, settings.length, settings.est_utr_lens,
        settings.pad_seqs, totRNA_counts, max_IP_div_total,
        total_interactions, sum_reads, settings.linear_chromosome_list)
    return 0        # success
def main(argv=None):
    """Test region pairs for interaction enrichment and report the results.

    Steps, in order:

    1. If ``settings.ribozero`` is set, collect the positions of the rRNA
       genes from ``settings.ec_dir``, padded by ``settings.length``, and
       pass them to ``RILseq.read_reads_table``.
    2. Read the reads table into per-region-pair interaction lists.
    3. Either report every pair (``settings.all_interactions``) or test each
       pair that has at least ``settings.min_int`` interactions with
       ``RILseq.minpv_regions``, keeping those whose corrected p-value is at
       most ``settings.max_pv``.
    4. If total-RNA BAM files are given, count total-RNA reads per region
       and derive the IP/total ratio at ``settings.norm_percentile``.
    5. Write the report to stdout with ``RILseq.report_interactions``.

    Args:
        argv: command-line arguments, forwarded to ``process_command_line``.

    Returns:
        0 on success.

    Raises:
        IndexError: total-RNA files were given but no region has more than
            ``settings.min_total_counts`` total-RNA reads.
    """
    settings = process_command_line(argv)

    if settings.ribozero:
        uid_pos, _, _, _, _, rRNAs = RILseq.ecocyc_parser.read_genes_data(
            settings.ec_dir)
        # EC_chrlist is a flat comma-separated list of pairs; the second
        # item of each pair is mapped to the first.
        chr_fields = settings.EC_chrlist.split(',')
        chr_dict = dict(zip(chr_fields[1::2], chr_fields[0::2]))
        rr_pos = []
        for rrgene in rRNAs:
            gene_pos = uid_pos[rrgene]
            # Pad the position of the rRNA gene with the alignment length
            rr_pos.append([
                chr_dict[gene_pos[0]],
                gene_pos[1] - settings.length,
                gene_pos[2] + settings.length,
                gene_pos[3]])
    else:
        rr_pos = None

    with open(settings.reads_in) as reads_file:
        (region_interactions, region_ints_as1, region_ints_as2,
         total_interactions) = RILseq.read_reads_table(
            reads_file, settings.seglen, rr_pos, settings.only_singles)

    interacting_regions = []
    if settings.all_interactions:
        # If all interactions are desired, skip the tests and report all
        for reg1, r1data in region_interactions.items():
            for reg2, clist in r1data.items():
                interacting_regions.append(
                    (1, len(clist), 0,
                     reg1[0], reg1[0] + settings.seglen, reg1[1], reg1[2],
                     reg2[0], reg2[0] + settings.seglen, reg2[1], reg2[2],
                     0, 0, 0))
    else:
        # Now run the test for each pair of interacting regions
        found_in_interaction = defaultdict(bool)
        # Candidate pairs: both the first region overall and the pair itself
        # must reach the minimal number of interactions.
        pairs_num = {}
        for reg1, r1data in region_interactions.items():
            if region_ints_as1[reg1] < settings.min_int:
                continue
            for reg2, clist in r1data.items():
                if len(clist) >= settings.min_int:
                    pairs_num[(reg1, reg2)] = len(clist)
        # Iterate the pairs from the one with the most interactions down
        for (reg1, reg2) in sorted(pairs_num, key=pairs_num.get, reverse=True):
            (pv, ints, odds, r1_from, r1_to, r2_from, r2_to,
             mat_b, mat_c, mat_d) = RILseq.minpv_regions(
                reg1, reg2, region_interactions, region_ints_as1,
                region_ints_as2, total_interactions, found_in_interaction,
                settings.seglen, settings.maxsegs, settings.min_odds_ratio)
            # Multiple-testing correction: scale by the number of pairs.
            pv *= len(pairs_num)
            if pv <= settings.max_pv:
                # Mark as participating
                for r1 in range(r1_from, r1_to, settings.seglen):
                    for r2 in range(r2_from, r2_to, settings.seglen):
                        found_in_interaction[
                            (r1, reg1[1], reg1[2],
                             r2, reg2[1], reg2[2])] = True
                # Report this interaction
                interacting_regions.append(
                    (pv, ints, odds, r1_from, r1_to, reg1[1], reg1[2],
                     r2_from, r2_to, reg2[1], reg2[2], mat_b, mat_c, mat_d))

    # Read the number of total RNAs in each region if the bam file is given
    totRNA_counts = defaultdict(int)
    max_IP_div_total = 0
    if settings.total_RNA:
        # Map every position covered by a region to the regions covering it,
        # keyed by the concatenation of region[2] and region[1].
        feat_dict = defaultdict(lambda: defaultdict(set))
        for regions in (region_ints_as1, region_ints_as2):
            for region in regions:
                for i in range(settings.seglen):
                    feat_dict[region[2] + region[1]][region[0] + i].add(region)
        feat_list = {}
        for chrom, by_pos in feat_dict.items():
            # .get() avoids inserting an empty set into the defaultdict for
            # every position that no region covers.
            feat_list[chrom] = [
                list(by_pos.get(pos, ())) for pos in range(max(by_pos) + 1)]

        for bamfile in settings.total_RNA.split(','):
            totcounts = RILseq.count_features(
                feat_list, pysam.Samfile(bamfile), 5,
                rev=settings.total_reverse)
            for k, v in totcounts.items():
                totRNA_counts[k] += v

        # Collect all the ratios between IP and total, then choose the
        # requested percentile to avoid outliers.
        ip_div_total = []
        for reg, counts in totRNA_counts.items():
            if counts > settings.min_total_counts:
                counts = float(counts + 1)
                ip_div_total.append(region_ints_as1[reg] / counts)
                ip_div_total.append(region_ints_as2[reg] / counts)
        if not ip_div_total:
            # Same exception type as the bare indexing failure this replaces,
            # but with a message that tells the user what went wrong.
            raise IndexError(
                "no region has more than %s total-RNA reads; cannot compute "
                "the IP/total normalization" % settings.min_total_counts)
        ip_div_total.sort()
        # Clamp to the last element so a percentile of 1.0 selects the
        # maximum instead of indexing one past the end of the list.
        pct_index = min(
            int(len(ip_div_total) * settings.norm_percentile),
            len(ip_div_total) - 1)
        max_IP_div_total = ip_div_total[pct_index]
        sys.stderr.write("%f\n" % (max_IP_div_total))

    # Read the additional data to decorate the results with
    RILseq.report_interactions(
        region_interactions, sys.stdout, interacting_regions, settings.seglen,
        settings.ec_dir, settings.EC_chrlist, settings.refseq_dir,
        settings.targets_file, settings.rep_table, settings.single_counts,
        settings.shuffles, settings.RNAup_cmd, settings.servers,
        settings.length, settings.est_utr_lens, settings.pad_seqs,
        totRNA_counts, max_IP_div_total)
    return 0        # success