Example #1
0
def main(argv=None):
    """Map every FASTQ library with bwa and write per-library outputs.

    Each first-mate file listed in ``settings.fastq_1`` is paired by position
    with the entry of ``settings.fastq_2`` (or with ``None`` when there is no
    mate) and handed to ``RILseq.run_bwa``.  The alignment file it returns is
    opened with pysam and, depending on the settings:

    * features from ``settings.genes_gff`` are counted for the library, and
    * a coverage wiggle file is written for the library.

    After all libraries are processed, the table of counts (one column per
    library) is written once for every input FASTQ file.

    Args:
        argv: Command-line arguments, passed to ``process_command_line``.

    Returns:
        0 on success.
    """
    settings = process_command_line(argv)
    if not os.path.exists(settings.dirout):
        os.makedirs(settings.dirout)

    if settings.genes_gff:
        try:
            # The context manager closes the annotation file once parsed.
            with open(settings.genes_gff) as gff_in:
                pos_feat_list, all_features = RILseq.read_gtf(
                    gff_in, settings.feature, settings.identifier)
        except IOError:
            # Best effort: an unreadable annotation file disables counting.
            settings.genes_gff = None
        gcounts = {}
        lib_order = []

    # Materialize both lists once so they can be indexed and re-used below.
    fastq_1_list = list(RILseq.flat_list(settings.fastq_1))
    fastq_2_list = list(RILseq.flat_list(settings.fastq_2))
    for i, r1_name in enumerate(fastq_1_list):
        # Libraries without a second-mate file are mapped as single-end.
        r2_name = fastq_2_list[i] if i < len(fastq_2_list) else None
        # Library name: the file name without directory and last extension.
        libname = r1_name.rsplit('.', 1)[0].rsplit('/', 1)[-1]
        outhead = '%s_bwa' % libname
        bamname = RILseq.run_bwa(
            settings.bwa_exec, r1_name, r2_name,
            settings.dirout, outhead, settings.allowed_mismatches,
            settings.genome_fasta, settings.params_aln, settings.sampe_params,
            settings.samse_params, settings.samtools_cmd)
        samfile = pysam.Samfile(bamname)
        try:
            if settings.genes_gff:
                lib_order.append(libname)
                gcounts[libname] = RILseq.count_features(
                    pos_feat_list, samfile, settings.overlap,
                    rev=settings.reverse_complement)
            if settings.create_wig:
                coverage = RILseq.generate_wig(
                    samfile, rev=settings.reverse_complement, first_pos=False)
                # Open only this library's wiggle file.  Re-opening every
                # library's file in 'w' mode on each pass would truncate the
                # coverage already written for the earlier libraries.
                wig_path = "%s/%s_coverage.wig" % (
                    settings.dirout, r1_name.split("_cutadapt")[0])
                with open(wig_path, 'w') as outwig:
                    RILseq.print_wiggle(
                        coverage, "%s_single_fragments_coverage" % libname,
                        "%s single fragments coverage" % libname, outwig)
        finally:
            samfile.close()

    # Print the table of counts
    if settings.genes_gff:
        header = ['Gene name'] + lib_order
        rows = [[g] + [gcounts[libn][g] for libn in lib_order]
                for g in sorted(all_features)]
        # The same full table is written next to every input FASTQ name.
        for fastq in fastq_1_list:
            table_path = "%s/%s_counts.txt" % (
                settings.dirout, fastq.split("_cutadapt")[0])
            with open(table_path, 'w') as outtable:
                outt = csv.writer(outtable, delimiter='\t')
                outt.writerow(header)
                outt.writerows(rows)
    return 0        # success
Example #2
0
def main(argv=None):
    """Map every FASTQ library with bwa into one wiggle file and one table.

    Each first-mate file listed in ``settings.fastq_1`` is paired by position
    with the entry of ``settings.fastq_2`` (or with ``None`` when there is no
    mate) and handed to ``RILseq.run_bwa``.  Coverage for every library is
    appended to a single ``<outhead>_coverage.wig`` file, and when a gene
    annotation file is readable the per-library feature counts are written
    to a single ``<outhead>_counts.txt`` table.

    Args:
        argv: Command-line arguments, passed to ``process_command_line``.

    Returns:
        0 on success.
    """
    settings = process_command_line(argv)
    if not os.path.exists(settings.dirout):
        os.makedirs(settings.dirout)

    if settings.genes_gff:
        try:
            # The context manager closes the annotation file once parsed.
            with open(settings.genes_gff) as gff_in:
                pos_feat_list, all_features = RILseq.read_gtf(
                    gff_in, settings.feature, settings.identifier)
        except IOError:
            # Best effort: an unreadable annotation file disables counting.
            settings.genes_gff = None
        gcounts = {}
        lib_order = []

    fastq_2_list = list(RILseq.flat_list(settings.fastq_2))
    wig_path = "%s/%s_coverage.wig" % (settings.dirout, settings.outhead)
    # One wiggle file is shared by all libraries; the context manager makes
    # sure it is flushed and closed even if mapping a library fails.
    with open(wig_path, 'w') as outwig:
        for i, r1_name in enumerate(RILseq.flat_list(settings.fastq_1)):
            # Libraries without a second-mate file are mapped as single-end.
            r2_name = fastq_2_list[i] if i < len(fastq_2_list) else None
            # Library name: file name without directory and last extension.
            libname = r1_name.rsplit('.', 1)[0].rsplit('/', 1)[-1]
            outhead = '%s_bwa' % libname
            bamname = RILseq.run_bwa(
                settings.bwa_exec, r1_name, r2_name,
                settings.dirout, outhead, settings.allowed_mismatches,
                settings.genome_fasta, settings.params_aln,
                settings.sampe_params, settings.samse_params,
                settings.samtools_cmd, processors=settings.processors)
            samfile = pysam.Samfile(bamname)
            try:
                if settings.genes_gff:
                    lib_order.append(libname)
                    gcounts[libname] = RILseq.count_features(
                        pos_feat_list, samfile, settings.overlap,
                        rev=settings.reverse_complement)
                coverage = RILseq.generate_wig(
                    samfile, rev=settings.reverse_complement, first_pos=False)
                RILseq.print_wiggle(
                    coverage, "%s_single_fragments_coverage" % libname,
                    "%s single fragments coverage" % libname, outwig)
            finally:
                samfile.close()

    # Print the table of counts
    if settings.genes_gff:
        table_path = "%s/%s_counts.txt" % (settings.dirout, settings.outhead)
        with open(table_path, 'w') as outtable:
            outt = csv.writer(outtable, delimiter='\t')
            outt.writerow(['Gene name'] + lib_order)
            for g in sorted(all_features):
                outt.writerow([g] + [gcounts[libn][g] for libn in lib_order])
    return 0        # success
def main(argv=None):
    """Select interacting region pairs and report them on standard output.

    Steps, in order:

    1. When ``settings.ribozero`` and ``settings.bc_dir`` are both set, build
       a list of rRNA gene positions padded by ``settings.length`` and pass
       it to ``RILseq.read_reads_table`` along with the reads file.
    2. Either report every pair (``settings.all_interactions``) or keep the
       pairs whose corrected value from ``RILseq.minpv_regions`` does not
       exceed ``settings.max_pv``.
    3. When ``settings.total_RNA`` lists alignment files, count total-RNA
       reads per region and derive ``max_IP_div_total`` as the
       ``settings.norm_percentile`` quantile of the IP/total ratios.
    4. Hand everything to ``RILseq.report_interactions``.

    Args:
        argv: Command-line arguments, passed to ``process_command_line``.

    Returns:
        0 on success.

    Raises:
        IOError: propagated unchanged when the genes data cannot be read.
        IndexError: when total RNA counting is requested but no region has
            more than ``settings.min_total_counts`` reads.
    """
    settings = process_command_line(argv)
    if settings.ribozero and settings.bc_dir:
        # An IOError raised here is deliberately left to propagate.
        uid_pos, _, _, _, _, rRNAs = RILseq.ecocyc_parser.read_genes_data(
            settings.bc_dir, rRNA_prod=settings.rrna_list)
        # BC_chrlist is a flat comma-separated list of pairs; the second
        # member of each pair is mapped to the first.
        chr_fields = settings.BC_chrlist.split(',')
        chr_dict = dict(zip(chr_fields[1::2], chr_fields[0::2]))
        rr_pos = []
        for rrgene in rRNAs:
            gene_pos = uid_pos[rrgene]
            # Pad the position of the rRNA gene with the alignment length
            rr_pos.append([
                chr_dict[gene_pos[0]],
                gene_pos[1] - settings.length,
                gene_pos[2] + settings.length,
                gene_pos[3]])
    else:
        rr_pos = None

    # Close the reads table as soon as it has been parsed.
    with open(settings.reads_in) as reads_in:
        (region_interactions, region_ints_as1, region_ints_as2,
         total_interactions) = RILseq.read_reads_table(
            reads_in, settings.seglen, rr_pos, settings.only_singles)

    interacting_regions = []
    # If all interactions are desired, skip the tests and report all
    if settings.all_interactions:
        for reg1, r1data in region_interactions.items():
            for reg2, clist in r1data.items():
                interacting_regions.append(
                    (1, len(clist), 0, reg1[0], reg1[0]+settings.seglen,
                     reg1[1], reg1[2], reg2[0], reg2[0]+settings.seglen,
                     reg2[1], reg2[2], 0, 0, 0))
    else:
        # Now run the test for each pair of interacting regions
        found_in_interaction = defaultdict(bool)
        # Candidate pairs: both the first region and the pair itself must
        # have at least min_int interactions.
        pairs_num = {}
        for reg1, partners in region_interactions.items():
            if region_ints_as1[reg1] < settings.min_int:
                continue
            for reg2, reads in partners.items():
                if len(reads) >= settings.min_int:
                    pairs_num[(reg1, reg2)] = len(reads)
        # Iterate the list of regions from the pairs with many interactions down
        for (reg1, reg2) in sorted(pairs_num, key=pairs_num.get, reverse=True):
            (pv, ints, odds, r1_from, r1_to, r2_from, r2_to,
             mat_b, mat_c, mat_d) = RILseq.minpv_regions(
                reg1, reg2, region_interactions, region_ints_as1,
                region_ints_as2, total_interactions, found_in_interaction,
                settings.seglen, settings.maxsegs, settings.min_odds_ratio)
            # Multiple-testing correction: scale by the number of candidate
            # pairs (presumably a Bonferroni correction of a p-value).
            pv *= len(pairs_num)
            if pv <= settings.max_pv:
                # Mark as participating
                for r1 in range(r1_from, r1_to, settings.seglen):
                    for r2 in range(r2_from, r2_to, settings.seglen):
                        found_in_interaction[
                            (r1, reg1[1], reg1[2], r2, reg2[1], reg2[2])] = True

                # Report this interaction
                interacting_regions.append(
                    (pv, ints, odds, r1_from, r1_to, reg1[1], reg1[2], r2_from,
                     r2_to, reg2[1], reg2[2], mat_b, mat_c, mat_d))

    # Read the number of total RNAs in each region if the bam file is given
    sum_reads = 0
    totRNA_counts = defaultdict(int)
    max_IP_div_total = 0
    if settings.total_RNA:
        # prepare a dictionary of features: for every position covered by a
        # region, the set of regions covering it.
        feat_dict = defaultdict(lambda: defaultdict(set))
        for regions in (region_ints_as1, region_ints_as2):
            for region in regions:
                positions = feat_dict[region[2]+region[1]]
                for i in range(settings.seglen):
                    positions[region[0]+i].add(region)
        feat_list = {}
        for chrom, data in feat_dict.items():
            maxpos = max(data)
            # .get() avoids inserting an empty set for every unused position.
            feat_list[chrom] = [
                list(data.get(k, ())) for k in range(maxpos+1)]
        for bamfile in settings.total_RNA.split(','):
            saminf = pysam.Samfile(bamfile)
            try:
                totcounts, sum_of_counts_lib = RILseq.count_features(
                    feat_list, saminf, 5,
                    rev=settings.total_reverse, get_sum=True)
                for k, v in totcounts.items():
                    totRNA_counts[k] += v
                sum_reads += sum_of_counts_lib
            finally:
                saminf.close()
        # Collect all the ratios between IP and total, then choose the
        # configured percentile to avoid outliers
        ratios = []
        for reg, counts in totRNA_counts.items():
            if counts > settings.min_total_counts:
                denom = float(counts+1)
                ratios.append(region_ints_as1[reg]/denom)
                ratios.append(region_ints_as2[reg]/denom)
        if not ratios:
            raise IndexError(
                "no region has more than min_total_counts total RNA reads; "
                "cannot compute the normalization percentile")
        ratios.sort()
        # Clamp so that a percentile of 1.0 selects the largest ratio
        # instead of indexing one past the end of the list.
        pct_index = min(
            int(len(ratios)*settings.norm_percentile), len(ratios)-1)
        max_IP_div_total = ratios[pct_index]
        sys.stderr.write("%f\n" % (max_IP_div_total))

    if settings.shuffles == 0 and settings.run_RNAup:
        settings.shuffles = -1
    # Read the additional data to decorate the results with
    RILseq.report_interactions(
        region_interactions, sys.stdout, interacting_regions, settings.seglen,
        settings.bc_dir, settings.genome, settings.BC_chrlist, settings.refseq_dir,
        settings.targets_file, settings.rep_table, settings.single_counts,
        settings.shuffles, settings.RNAup_cmd, settings.servers,
        settings.length, settings.est_utr_lens, settings.pad_seqs,
        totRNA_counts, max_IP_div_total, total_interactions, sum_reads,
        settings.linear_chromosome_list)

    return 0        # success
def main(argv=None):
    """Select interacting region pairs and report them on standard output.

    Steps, in order:

    1. When ``settings.ribozero`` is set, build a list of rRNA gene
       positions padded by ``settings.length`` and pass it to
       ``RILseq.read_reads_table`` along with the reads file.
    2. Either report every pair (``settings.all_interactions``) or keep the
       pairs whose corrected value from ``RILseq.minpv_regions`` does not
       exceed ``settings.max_pv``.
    3. When ``settings.total_RNA`` lists alignment files, count total-RNA
       reads per region and derive ``max_IP_div_total`` as the
       ``settings.norm_percentile`` quantile of the IP/total ratios.
    4. Hand everything to ``RILseq.report_interactions``.

    Args:
        argv: Command-line arguments, passed to ``process_command_line``.

    Returns:
        0 on success.

    Raises:
        IOError: propagated unchanged when the genes data cannot be read.
        IndexError: when total RNA counting is requested but no region has
            more than ``settings.min_total_counts`` reads.
    """
    settings = process_command_line(argv)
    if settings.ribozero:
        # An IOError raised here is deliberately left to propagate.
        uid_pos, _, _, _, _, rRNAs = RILseq.ecocyc_parser.read_genes_data(
            settings.ec_dir)
        # EC_chrlist is a flat comma-separated list of pairs; the second
        # member of each pair is mapped to the first.
        chr_fields = settings.EC_chrlist.split(',')
        chr_dict = dict(zip(chr_fields[1::2], chr_fields[0::2]))
        rr_pos = []
        for rrgene in rRNAs:
            gene_pos = uid_pos[rrgene]
            # Pad the position of the rRNA gene with the alignment length
            rr_pos.append([
                chr_dict[gene_pos[0]],
                gene_pos[1] - settings.length,
                gene_pos[2] + settings.length,
                gene_pos[3]])
    else:
        rr_pos = None

    # Close the reads table as soon as it has been parsed.
    with open(settings.reads_in) as reads_in:
        (region_interactions, region_ints_as1, region_ints_as2,
         total_interactions) = RILseq.read_reads_table(
            reads_in, settings.seglen, rr_pos, settings.only_singles)

    interacting_regions = []
    # If all interactions are desired, skip the tests and report all
    if settings.all_interactions:
        for reg1, r1data in region_interactions.items():
            for reg2, clist in r1data.items():
                interacting_regions.append(
                    (1, len(clist), 0, reg1[0], reg1[0]+settings.seglen,
                     reg1[1], reg1[2], reg2[0], reg2[0]+settings.seglen,
                     reg2[1], reg2[2], 0, 0, 0))
    else:
        # Now run the test for each pair of interacting regions
        found_in_interaction = defaultdict(bool)
        # Candidate pairs: both the first region and the pair itself must
        # have at least min_int interactions.
        pairs_num = {}
        for reg1, partners in region_interactions.items():
            if region_ints_as1[reg1] < settings.min_int:
                continue
            for reg2, reads in partners.items():
                if len(reads) >= settings.min_int:
                    pairs_num[(reg1, reg2)] = len(reads)
        # Iterate the list of regions from the pairs with many interactions down
        for (reg1, reg2) in sorted(pairs_num, key=pairs_num.get, reverse=True):
            (pv, ints, odds, r1_from, r1_to, r2_from, r2_to,
             mat_b, mat_c, mat_d) = RILseq.minpv_regions(
                reg1, reg2, region_interactions, region_ints_as1,
                region_ints_as2, total_interactions, found_in_interaction,
                settings.seglen, settings.maxsegs, settings.min_odds_ratio)
            # Multiple-testing correction: scale by the number of candidate
            # pairs (presumably a Bonferroni correction of a p-value).
            pv *= len(pairs_num)
            if pv <= settings.max_pv:
                # Mark as participating
                for r1 in range(r1_from, r1_to, settings.seglen):
                    for r2 in range(r2_from, r2_to, settings.seglen):
                        found_in_interaction[
                            (r1, reg1[1], reg1[2], r2, reg2[1], reg2[2])] = True

                # Report this interaction
                interacting_regions.append(
                    (pv, ints, odds, r1_from, r1_to, reg1[1], reg1[2], r2_from,
                     r2_to, reg2[1], reg2[2], mat_b, mat_c, mat_d))

    # Read the number of total RNAs in each region if the bam file is given
    totRNA_counts = defaultdict(int)
    max_IP_div_total = 0
    if settings.total_RNA:
        # prepare a dictionary of features: for every position covered by a
        # region, the set of regions covering it.
        feat_dict = defaultdict(lambda: defaultdict(set))
        for regions in (region_ints_as1, region_ints_as2):
            for region in regions:
                positions = feat_dict[region[2]+region[1]]
                for i in range(settings.seglen):
                    positions[region[0]+i].add(region)
        feat_list = {}
        for chrom, data in feat_dict.items():
            maxpos = max(data)
            # .get() avoids inserting an empty set for every unused position.
            feat_list[chrom] = [
                list(data.get(k, ())) for k in range(maxpos+1)]
        for bamfile in settings.total_RNA.split(','):
            saminf = pysam.Samfile(bamfile)
            try:
                totcounts = RILseq.count_features(
                    feat_list, saminf, 5,
                    rev=settings.total_reverse)
                for k, v in totcounts.items():
                    totRNA_counts[k] += v
            finally:
                saminf.close()
        # Collect all the ratios between IP and total, then choose the
        # configured percentile to avoid outliers
        ratios = []
        for reg, counts in totRNA_counts.items():
            if counts > settings.min_total_counts:
                denom = float(counts+1)
                ratios.append(region_ints_as1[reg]/denom)
                ratios.append(region_ints_as2[reg]/denom)
        if not ratios:
            raise IndexError(
                "no region has more than min_total_counts total RNA reads; "
                "cannot compute the normalization percentile")
        ratios.sort()
        # Clamp so that a percentile of 1.0 selects the largest ratio
        # instead of indexing one past the end of the list.
        pct_index = min(
            int(len(ratios)*settings.norm_percentile), len(ratios)-1)
        max_IP_div_total = ratios[pct_index]
        sys.stderr.write("%f\n" % (max_IP_div_total))

    # Read the additional data to decorate the results with
    RILseq.report_interactions(
        region_interactions, sys.stdout, interacting_regions, settings.seglen,
        settings.ec_dir, settings.EC_chrlist, settings.refseq_dir,
        settings.targets_file, settings.rep_table, settings.single_counts,
        settings.shuffles, settings.RNAup_cmd, settings.servers,
        settings.length, settings.est_utr_lens, settings.pad_seqs,
        totRNA_counts, max_IP_div_total)

    return 0        # success