def main(argv=None):
    """Count reads per annotated feature for each reads file; print a table.

    The annotation named by ``settings.genes_gff`` is parsed with
    ``RILseq.read_gtf``; every file yielded by
    ``RILseq.flat_list(settings.reads_files)`` is passed to ``count_features``.
    A tab-delimited table (one row per feature, one column per reads file) is
    written to stdout.

    Arguments:
    - `argv`: command line arguments, forwarded to ``process_command_line``

    Returns 0 on success, 1 if the annotation file cannot be read.
    """
    settings = process_command_line(argv)
    try:
        # Close the annotation file as soon as it has been parsed
        with open(settings.genes_gff) as gff_in:
            pos_feat_list, all_features = RILseq.read_gtf(
                gff_in, settings.feature, settings.identifier)
    except IOError as err:
        # Report the reason instead of failing silently
        sys.stderr.write(
            'Cannot read annotation file %s: %s\n' % (settings.genes_gff, err))
        return 1
    lib_order = []
    all_counts = {}
    if settings.singles:
        # Counting singles forces only_first on and only_second off
        settings.only_first = True
        settings.only_second = False
    for r1_name in RILseq.flat_list(settings.reads_files):
        sys.stderr.write('%s\n' % str(r1_name))
        lib_order.append(r1_name)
        with open(r1_name) as reads_in:
            # "only second" means ignore the first mate and vice versa
            all_counts[r1_name] = count_features(
                pos_feat_list, reads_in, settings.overlap, length=25,
                ignore_first=settings.only_second,
                ignore_second=settings.only_first,
                count_singles=settings.singles)
    outt = csv.writer(sys.stdout, delimiter='\t')
    if not settings.quiet:
        outt.writerow(['Gene name'] + lib_order)
    for g in sorted(all_features):
        # In quiet mode features whose name starts with '~' are not printed
        if settings.quiet and g.startswith('~'):
            continue
        outt.writerow([g] + [all_counts[libn][g] for libn in lib_order])
    return 0        # success
Esempio n. 2
0
def main(argv=None):
    """Map single-fragment FASTQ files with bwa; write per-library outputs.

    For every first-mate FASTQ file (paired with the file at the same index
    in ``settings.fastq_2`` when one exists) the reads are mapped with
    ``RILseq.run_bwa``. If an annotation is given the reads are counted per
    feature, and if ``settings.create_wig`` is set a coverage wiggle file is
    written for that library. Finally a counts table holding all libraries is
    written once per input FASTQ file.

    Arguments:
    - `argv`: command line arguments, forwarded to ``process_command_line``

    Returns 0 on success.
    """
    settings = process_command_line(argv)
    if not os.path.exists(settings.dirout):
        os.makedirs(settings.dirout)

    if settings.genes_gff:
        try:
            with open(settings.genes_gff) as gff_in:
                pos_feat_list, all_features = RILseq.read_gtf(
                    gff_in, settings.feature, settings.identifier)
        except IOError:
            # Annotation unreadable: keep mapping but skip the counting
            settings.genes_gff = None
        gcounts = {}
        lib_order = []

    fastq_1_list = list(RILseq.flat_list(settings.fastq_1))
    fastq_2_list = list(RILseq.flat_list(settings.fastq_2))
    for i, r1_name in enumerate(fastq_1_list):
        # The second mate is optional (single-end libraries)
        r2_name = fastq_2_list[i] if i < len(fastq_2_list) else None
        libname = r1_name.rsplit('.', 1)[0].rsplit('/', 1)[-1]
        outhead = '%s_bwa' % libname
        bamname = RILseq.run_bwa(
            settings.bwa_exec, r1_name, r2_name,
            settings.dirout, outhead, settings.allowed_mismatches,
            settings.genome_fasta, settings.params_aln, settings.sampe_params,
            settings.samse_params, settings.samtools_cmd)
        samfile = pysam.Samfile(bamname)
        if settings.genes_gff:
            lib_order.append(libname)
            gcounts[libname] = RILseq.count_features(
                pos_feat_list, samfile, settings.overlap,
                rev=settings.reverse_complement)
        if settings.create_wig:
            # Open only this library's wiggle file. Re-opening every wiggle
            # file in 'w' mode on each iteration would truncate the coverage
            # already written for the previous libraries.
            wig_name = "%s/%s_coverage.wig" % (
                settings.dirout, r1_name.split("_cutadapt")[0])
            coverage = RILseq.generate_wig(
                samfile, rev=settings.reverse_complement, first_pos=False)
            with open(wig_name, 'w') as outwig:
                RILseq.print_wiggle(
                    coverage, "%s_single_fragments_coverage" % libname,
                    "%s single fragments coverage" % libname, outwig)
    # Print the table of counts
    if settings.genes_gff:
        header = ['Gene name'] + lib_order
        genes = sorted(all_features)
        for r1_name in fastq_1_list:
            table_name = "%s/%s_counts.txt" % (
                settings.dirout, r1_name.split("_cutadapt")[0])
            with open(table_name, 'w') as outtable:
                outt = csv.writer(outtable, delimiter='\t')
                outt.writerow(header)
                for g in genes:
                    outt.writerow(
                        [g] + [gcounts[libn][g] for libn in lib_order])
    return 0        # success
Esempio n. 3
0
def main(argv=None):
    """Extract read ends from BAM files, map them and write the reads table.

    For every BAM file yielded by ``RILseq.flat_list(settings.bamfiles)``:
    the read ends are written to two FASTQ files with
    ``RILseq.get_unmapped_reads``, each FASTQ file is mapped with
    ``RILseq.run_bwa`` (unless ``settings.skip_mapping`` is set, in which
    case the BAM files are expected to exist already in ``settings.dirout``),
    and the mapped ends are passed to ``RILseq.write_reads_table``, which
    writes to stdout.

    Arguments:
    - `argv`: command line arguments, forwarded to ``process_command_line``

    Returns 0 on success.
    """
    settings = process_command_line(argv)
    # Read the transcripts if given
    if settings.transcripts:
        trans_dict = RILseq.read_transcripts(settings.transcripts)
    else:
        trans_dict = None
    # Choose where the single (non-chimeric) reads are written, if anywhere
    if settings.all_reads:
        try:
            outall = open(settings.all_reads, 'w')
        except IOError:
            # Best effort: carry on without the singles output
            outall = None
    elif settings.add_all_reads:
        outall = sys.stdout
    else:
        outall = None
    try:
        for bf in RILseq.flat_list(settings.bamfiles):
            bfin = pysam.Samfile(bf)
            libname = bf.rsplit('.', 1)[0].rsplit('/', 1)[-1]
            fsq1name = "%s/%s_ends_1.fastq" % (settings.dirout, libname)
            fsq2name = "%s/%s_ends_2.fastq" % (settings.dirout, libname)
            if settings.skip_mapping:
                # Nothing will be mapped, so discard the FASTQ output
                fsq1 = open(os.devnull, 'w')
                fsq2 = fsq1
            else:
                fsq1 = open(fsq1name, 'w')
                fsq2 = open(fsq2name, 'w')
            try:
                single_mapped = RILseq.get_unmapped_reads(
                    bfin, fsq1, fsq2, settings.length, settings.maxG,
                    rev=settings.reverse_complement, all_reads=True,
                    dust_thr=settings.dust_thr)
            finally:
                # bwa reads these files by name below, so they must be
                # flushed and closed first (closing twice is harmless).
                fsq1.close()
                fsq2.close()
            reads_in = []
            # Map the fastq files to the genome
            for fqname in (fsq1name, fsq2name):
                bamheadname = fqname.rsplit('.', 1)[0].rsplit('/', 1)[-1]
                if settings.skip_mapping:
                    bamname = "%s/%s.bam" % (settings.dirout, bamheadname)
                else:
                    bamname = RILseq.run_bwa(
                        settings.bwa_exec, fqname, None,
                        settings.dirout, bamheadname, settings.max_mismatches,
                        settings.genome_fasta, settings.params_aln,
                        '', settings.samse_params,
                        settings.samtools_cmd, processors=settings.processors)
                bamin = pysam.Samfile(bamname)
                reads_in.append(RILseq.read_bam_file(
                        bamin, bamin.references, settings.allowed_mismatches))
            RILseq.write_reads_table(
                sys.stdout, reads_in[0], reads_in[1], bfin.references,
                settings.distance, not settings.keep_circular,
                trans_dict, write_single=outall, single_mapped=single_mapped,
                max_NM=settings.allowed_mismatches)
    finally:
        # Close the singles file we opened ourselves; never close stdout
        if outall is not None and outall is not sys.stdout:
            outall.close()
    return 0        # success
Esempio n. 4
0
def main(argv=None):
    """Map single-fragment FASTQ files with bwa into one combined output.

    Every first-mate FASTQ file (paired with the file at the same index in
    ``settings.fastq_2`` when one exists) is mapped with ``RILseq.run_bwa``.
    The coverage of all libraries is written to a single wiggle file named
    after ``settings.outhead`` and, if an annotation is given, a single
    tab-delimited counts table is written next to it.

    Arguments:
    - `argv`: command line arguments, forwarded to ``process_command_line``

    Returns 0 on success.
    """
    settings = process_command_line(argv)
    if not os.path.exists(settings.dirout):
        os.makedirs(settings.dirout)
    wig_name = "%s/%s_coverage.wig" % (settings.dirout, settings.outhead)
    # The with-block guarantees the wiggle file is flushed and closed, even
    # when mapping fails part-way through.
    with open(wig_name, 'w') as outwig:
        if settings.genes_gff:
            try:
                with open(settings.genes_gff) as gff_in:
                    pos_feat_list, all_features = RILseq.read_gtf(
                        gff_in, settings.feature, settings.identifier)
            except IOError:
                # Annotation unreadable: keep mapping but skip the counting
                settings.genes_gff = None
            gcounts = {}
            lib_order = []
        fastq_2_list = list(RILseq.flat_list(settings.fastq_2))
        for i, r1_name in enumerate(RILseq.flat_list(settings.fastq_1)):
            # The second mate is optional (single-end libraries)
            r2_name = fastq_2_list[i] if i < len(fastq_2_list) else None
            libname = r1_name.rsplit('.', 1)[0].rsplit('/', 1)[-1]
            outhead = '%s_bwa' % libname
            bamname = RILseq.run_bwa(
                settings.bwa_exec, r1_name, r2_name,
                settings.dirout, outhead, settings.allowed_mismatches,
                settings.genome_fasta, settings.params_aln,
                settings.sampe_params, settings.samse_params,
                settings.samtools_cmd, processors=settings.processors)
            samfile = pysam.Samfile(bamname)
            if settings.genes_gff:
                lib_order.append(libname)
                gcounts[libname] = RILseq.count_features(
                    pos_feat_list, samfile, settings.overlap,
                    rev=settings.reverse_complement)
            coverage = RILseq.generate_wig(
                samfile, rev=settings.reverse_complement, first_pos=False)
            RILseq.print_wiggle(
                coverage, "%s_single_fragments_coverage" % libname,
                "%s single fragments coverage" % libname, outwig)
    # Print the table of counts
    if settings.genes_gff:
        table_name = "%s/%s_counts.txt" % (settings.dirout, settings.outhead)
        with open(table_name, 'w') as outtable:
            outt = csv.writer(outtable, delimiter='\t')
            outt.writerow(['Gene name'] + lib_order)
            for g in sorted(all_features):
                outt.writerow([g] + [gcounts[libn][g] for libn in lib_order])
    return 0        # success
Esempio n. 5
0
def main(argv=None):
    """Print a per-feature read count table for a set of reads files.

    The annotation in ``settings.genes_gff`` is parsed with
    ``RILseq.read_gtf`` and each file from
    ``RILseq.flat_list(settings.reads_files)`` is counted with
    ``count_features``. The table is tab-delimited and written to stdout,
    one row per feature and one column per reads file.

    Arguments:
    - `argv`: command line arguments, forwarded to ``process_command_line``

    Returns 0 on success, 1 if the annotation file cannot be read.
    """
    settings = process_command_line(argv)
    try:
        # The annotation file is closed as soon as parsing is done
        with open(settings.genes_gff) as gff_in:
            pos_feat_list, all_features = RILseq.read_gtf(gff_in,
                                                          settings.feature,
                                                          settings.identifier)
    except IOError as err:
        # Say why we are giving up rather than exiting silently
        sys.stderr.write(
            'Cannot read annotation file %s: %s\n' % (settings.genes_gff, err))
        return 1
    lib_order = []
    all_counts = {}
    if settings.singles:
        # Counting singles forces only_first on and only_second off
        settings.only_first = True
        settings.only_second = False
    for r1_name in RILseq.flat_list(settings.reads_files):
        sys.stderr.write('%s\n' % str(r1_name))
        lib_order.append(r1_name)
        with open(r1_name) as reads_in:
            # "only second" means ignore the first mate and vice versa
            all_counts[r1_name] = count_features(
                pos_feat_list,
                reads_in,
                settings.overlap,
                length=25,
                ignore_first=settings.only_second,
                ignore_second=settings.only_first,
                count_singles=settings.singles)
    outt = csv.writer(sys.stdout, delimiter='\t')
    if not settings.quiet:
        outt.writerow(['Gene name'] + lib_order)
    for g in sorted(all_features):
        # In quiet mode features whose name starts with '~' are not printed
        if settings.quiet and g.startswith('~'):
            continue
        outt.writerow([g] + [all_counts[libn][g] for libn in lib_order])
    return 0  # success
def main(argv=None):
    """Select interacting region pairs from a reads table and report them.

    The reads table named by ``settings.reads_in`` is loaded with
    ``RILseq.read_reads_table``. Either every pair is reported
    (``settings.all_interactions``) or each candidate pair is tested with
    ``RILseq.minpv_regions`` and kept when its p-value, multiplied by the
    number of candidate pairs, does not exceed ``settings.max_pv``. When
    total-RNA BAM files are given, reads are counted per region and a
    normalisation ratio is derived. The result is written to stdout by
    ``RILseq.report_interactions``.

    Arguments:
    - `argv`: command line arguments, forwarded to ``process_command_line``

    Returns 0 on success.
    """
    settings = process_command_line(argv)
    if settings.ribozero and settings.bc_dir:
        # An IOError raised by the parser is left to propagate to the caller
        uid_pos, _, _, _, _, rRNAs = RILseq.ecocyc_parser.read_genes_data(
            settings.bc_dir, rRNA_prod=settings.rrna_list)
        # BC_chrlist alternates two kinds of names; map odd items to even
        chr_names = settings.BC_chrlist.split(',')
        chr_dict = dict(zip(chr_names[1::2], chr_names[0::2]))
        rr_pos = []
        for rrgene in rRNAs:
            gene_pos = uid_pos[rrgene]
            # Pad the position of the rRNA gene with the alignment length
            rr_pos.append([
                chr_dict[gene_pos[0]],
                gene_pos[1] - settings.length,
                gene_pos[2] + settings.length,
                gene_pos[3]])
    else:
        rr_pos = None

    with open(settings.reads_in) as reads_in:
        (region_interactions, region_ints_as1, region_ints_as2,
         total_interactions) = RILseq.read_reads_table(
            reads_in, settings.seglen, rr_pos, settings.only_singles)

    # If all interactions are desired, skip the tests and report all
    if settings.all_interactions:
        interacting_regions = []
        for reg1, r1data in region_interactions.items():
            for reg2, clist in r1data.items():
                interacting_regions.append(
                    (1, len(clist), 0, reg1[0], reg1[0]+settings.seglen,
                     reg1[1], reg1[2], reg2[0], reg2[0]+settings.seglen,
                     reg2[1], reg2[2], 0, 0, 0))
    else:
        # Now run the test for each pair of interacting regions
        found_in_interaction = defaultdict(bool)
        interacting_regions = []
        # Collect the candidate pairs that have enough interactions
        pairs_num = {}
        for reg1 in list(region_interactions.keys()):
            if region_ints_as1[reg1] < settings.min_int:
                continue
            for reg2 in list(region_interactions[reg1].keys()):
                n_ints = len(region_interactions[reg1][reg2])
                if n_ints >= settings.min_int:
                    pairs_num[(reg1, reg2)] = n_ints
        # Iterate the list of regions from the pairs with many interactions down
        for (reg1, reg2) in sorted(pairs_num, key=pairs_num.get, reverse=True):
            (pv, ints, odds, r1_from, r1_to, r2_from, r2_to,
             mat_b, mat_c, mat_d) = RILseq.minpv_regions(
                reg1, reg2, region_interactions, region_ints_as1,
                region_ints_as2, total_interactions, found_in_interaction,
                settings.seglen, settings.maxsegs, settings.min_odds_ratio)
            # Correct for the number of candidate pairs tested
            pv *= len(pairs_num)
            if pv <= settings.max_pv:
                # Mark as participating
                for r1 in range(r1_from, r1_to, settings.seglen):
                    for r2 in range(r2_from, r2_to, settings.seglen):
                        found_in_interaction[
                            (r1, reg1[1], reg1[2], r2, reg2[1], reg2[2])] = True

                # Report this interaction
                interacting_regions.append(
                    (pv, ints, odds, r1_from, r1_to, reg1[1], reg1[2],  r2_from,
                     r2_to, reg2[1], reg2[2],  mat_b, mat_c, mat_d))
    # Read the number of total RNAs in each region if the bam file is given
    sum_reads = 0
    if settings.total_RNA:
        # prepare a dictionary of features
        feat_dict = defaultdict(lambda: defaultdict(set))
        for region1 in region_ints_as1:
            for i in range(settings.seglen):
                feat_dict[region1[2]+region1[1]][region1[0]+i].add(region1)
        for region2 in region_ints_as2:
            for i in range(settings.seglen):
                feat_dict[region2[2]+region2[1]][region2[0]+i].add(region2)
        # Turn the per-position sets into a dense list indexed by position
        feat_list = {}
        for chrom, data in feat_dict.items():
            maxpos = max(data.keys())
            feat_list[chrom] = [list(data[k]) for k in range(maxpos+1)]
        totRNA_counts = defaultdict(int)
        for bamfile in settings.total_RNA.split(','):
            saminf = pysam.Samfile(bamfile)
            totcounts, sum_of_counts_lib = RILseq.count_features(
                feat_list, saminf, 5,
                rev=settings.total_reverse, get_sum=True)
            for k, v in totcounts.items():
                totRNA_counts[k] += v
            sum_reads += sum_of_counts_lib
        # Collect all the ratios between IP and total, then pick the requested
        # percentile to avoid outliers
        max_IP_div_total_as1 = []
        max_IP_div_total_as2 = []
        for reg, counts in totRNA_counts.items():
            if counts > settings.min_total_counts:
                counts = float(counts+1)
                max_IP_div_total_as1.append(region_ints_as1[reg]/counts)
                max_IP_div_total_as2.append(region_ints_as2[reg]/counts)
        mm_sorted = sorted(max_IP_div_total_as2+max_IP_div_total_as1)
        if mm_sorted:
            # Clamp the index so a percentile of 1.0 selects the last element
            pct_idx = min(int(len(mm_sorted)*settings.norm_percentile),
                          len(mm_sorted)-1)
            max_IP_div_total = mm_sorted[pct_idx]
        else:
            # No region passed min_total_counts: same default as when no
            # total RNA is given
            max_IP_div_total = 0
        sys.stderr.write("%f\n"%(max_IP_div_total))

    else:
        totRNA_counts = defaultdict(int)
        max_IP_div_total = 0
    if (settings.shuffles ==0 and settings.run_RNAup):
        settings.shuffles=-1
    # Read the additional data to decorate the results with
    RILseq.report_interactions(
        region_interactions, sys.stdout, interacting_regions, settings.seglen,
        settings.bc_dir, settings.genome, settings.BC_chrlist, settings.refseq_dir,
        settings.targets_file, settings.rep_table, settings.single_counts,
        settings.shuffles, settings.RNAup_cmd, settings.servers,
        settings.length, settings.est_utr_lens, settings.pad_seqs,
        totRNA_counts, max_IP_div_total, total_interactions, sum_reads, settings.linear_chromosome_list)

    return 0        # success
def main(argv=None):
    """Write a BED track of both sides of every chimeric read to stdout.

    Read positions are taken from ``settings.list_reads`` (optionally
    restricted to the significant interactions in ``settings.summary``), the
    read sequences are fetched from the BAM files with ``get_reads_seqs``,
    and each side is extended along the genome with ``extend_alignment``.
    One BED row is written for the 5' side and one for the 3' side.

    Arguments:
    - `argv`: command line arguments, forwarded to ``process_command_line``

    Returns 0 on success.
    """
    settings = process_command_line(argv)
    # Read the read names and positions
    read_5ps = {}
    read_3ps = {}
    genome = {}
    gsize = {}
    for sr in SeqIO.parse(settings.genome, 'fasta'):
        genome[sr.id] = sr.seq
        gsize[sr.id] = len(sr.seq)
    if len(settings.EC_chrlist) >= 2:
        chr_names = settings.EC_chrlist.split(',')
        chr_dict = dict(zip(chr_names[0::2], chr_names[1::2]))
    else:
        chr_dict = {}
    if settings.summary:
        sig_reads = RILseq.read_significant_reads(
            settings.summary, chr_dict, gname=settings.gene_name)

    with open(settings.list_reads) as list_in:
        for line in csv.reader(list_in, delimiter='\t'):
            # skip single
            if len(line) > 7 and line[7]=="single":
                continue
            # Positions are stored 0-based: [coordinate, strand, chromosome]
            side_5p = [int(line[1])-1, line[2], line[0]]
            side_3p = [int(line[4])-1, line[5], line[3]]
            if settings.summary:
                # Keep only reads that belong to a significant interaction
                if tuple(side_3p) not in sig_reads[tuple(side_5p)]:
                    continue
            read_5ps[line[6]] = side_5p
            read_3ps[line[6]] = side_3p
    # Read the bam files and return the long sequences
    r1_seqs = {}
    r2_seqs = {}
    for bamfile in list(RILseq.flat_list(settings.bamfiles)):
        r1s, r2s = get_reads_seqs(
            pysam.Samfile(bamfile), read_5ps.keys(), rev=settings.reverse)
        r1_seqs.update(r1s)
        r2_seqs.update(r2s)
    # For each read find the overlap, if exists and find the fusion point
    outer = csv.writer(sys.stdout, delimiter='\t')
    # Parenthesised single argument: valid as a statement on Python 2 and as
    # a function call on Python 3
    print('track name="%s" description="%s" visibility=4 itemRgb="On" useScore=0'%(
        settings.track_name, settings.track_desc))
    # The code is written so r1 is the 3' end of the fragment
    for rname in set(r2_seqs.keys()):
        r2seq = r2_seqs[rname]
        # Single-end reads have no first mate
        r1seq = r1_seqs.get(rname, '')
        s1, overlap, s2 = find_overlap(r2seq, r1seq)
        pos5, strand5, chrom5 = read_5ps[rname]
        pos3, strand3, chrom3 = read_3ps[rname]
        side_5p_len = extend_alignment(
            s1+overlap+s2, pos5, 0, False, strand5, genome[chrom5])
        side_3p_len = extend_alignment(
            s1+overlap+s2, 0, pos3, True, strand3, genome[chrom3])
        # Write each of the sides to the output file
        score = 0
        if settings.rand_score:
            score = random.randint(0, 1000)
        # Intervals are clipped to [0, chromosome length]
        if strand5 == '+':
            gfrom = max(0, pos5)
            gto = min(gsize[chrom5], pos5+side_5p_len)
            outer.writerow([
                    chrom5, gfrom, gto, "%s_5p"%rname, score, '+',
                    gfrom, gto, settings.pos_first])
        elif strand5 == '-':
            gfrom = max(0, pos5-side_5p_len+1)
            gto = min(gsize[chrom5], pos5+1)
            outer.writerow([
                    chrom5, gfrom, gto, "%s_5p"%rname, score, '-',
                    gfrom, gto, settings.rev_first])
        if strand3 == '+':
            gfrom = max(0, pos3-side_3p_len+1)
            gto = min(gsize[chrom3], pos3+1)
            outer.writerow([
                    chrom3, gfrom, gto, "%s_3p"%rname, score, '+',
                    gfrom, gto, settings.pos_second])
        elif strand3 == '-':
            gfrom = max(0, pos3)
            gto = min(gsize[chrom3], pos3+side_3p_len)
            outer.writerow([
                    chrom3, gfrom, gto, "%s_3p"%rname, score, '-',
                    gfrom, gto, settings.rev_second])
    return 0        # success
def main(argv=None):
    """Select interacting region pairs from a reads table and report them.

    The reads table named by ``settings.reads_in`` is loaded with
    ``RILseq.read_reads_table``. Either every pair is reported
    (``settings.all_interactions``) or each candidate pair is tested with
    ``RILseq.minpv_regions`` and kept when its p-value, multiplied by the
    number of candidate pairs, does not exceed ``settings.max_pv``. When
    total-RNA BAM files are given, reads are counted per region and a
    normalisation ratio is derived. The result is written to stdout by
    ``RILseq.report_interactions``.

    Arguments:
    - `argv`: command line arguments, forwarded to ``process_command_line``

    Returns 0 on success.
    """
    settings = process_command_line(argv)
    if settings.ribozero:
        # An IOError raised by the parser is left to propagate to the caller
        uid_pos, _, _, _, _, rRNAs = RILseq.ecocyc_parser.read_genes_data(
            settings.ec_dir)
        # EC_chrlist alternates two kinds of names; map odd items to even
        chr_names = settings.EC_chrlist.split(',')
        chr_dict = dict(zip(chr_names[1::2], chr_names[0::2]))
        rr_pos = []
        for rrgene in rRNAs:
            gene_pos = uid_pos[rrgene]
            # Pad the position of the rRNA gene with the alignment length
            rr_pos.append([
                chr_dict[gene_pos[0]],
                gene_pos[1] - settings.length,
                gene_pos[2] + settings.length,
                gene_pos[3]])
    else:
        rr_pos = None
    with open(settings.reads_in) as reads_in:
        (region_interactions, region_ints_as1, region_ints_as2,
         total_interactions) = RILseq.read_reads_table(
            reads_in, settings.seglen, rr_pos, settings.only_singles)

    # If all interactions are desired, skip the tests and report all
    if settings.all_interactions:
        interacting_regions = []
        for reg1, r1data in region_interactions.items():
            for reg2, clist in r1data.items():
                interacting_regions.append(
                    (1, len(clist), 0, reg1[0], reg1[0]+settings.seglen,
                     reg1[1], reg1[2], reg2[0], reg2[0]+settings.seglen,
                     reg2[1], reg2[2], 0, 0, 0))
    else:
        # Now run the test for each pair of interacting regions
        found_in_interaction = defaultdict(bool)
        interacting_regions = []
        # Collect the candidate pairs that have enough interactions
        pairs_num = {}
        for reg1 in list(region_interactions.keys()):
            if region_ints_as1[reg1] < settings.min_int:
                continue
            for reg2 in list(region_interactions[reg1].keys()):
                n_ints = len(region_interactions[reg1][reg2])
                if n_ints >= settings.min_int:
                    pairs_num[(reg1, reg2)] = n_ints
        # Iterate the list of regions from the pairs with many interactions down
        for (reg1, reg2) in sorted(pairs_num, key=pairs_num.get, reverse=True):
            (pv, ints, odds, r1_from, r1_to, r2_from, r2_to,
             mat_b, mat_c, mat_d) = RILseq.minpv_regions(
                reg1, reg2, region_interactions, region_ints_as1,
                region_ints_as2, total_interactions, found_in_interaction,
                settings.seglen, settings.maxsegs, settings.min_odds_ratio)
            # Correct for the number of candidate pairs tested
            pv *= len(pairs_num)
            if pv <= settings.max_pv:
                # Mark as participating
                for r1 in range(r1_from, r1_to, settings.seglen):
                    for r2 in range(r2_from, r2_to, settings.seglen):
                        found_in_interaction[
                            (r1, reg1[1], reg1[2], r2, reg2[1], reg2[2])] = True

                # Report this interaction
                interacting_regions.append(
                    (pv, ints, odds, r1_from, r1_to, reg1[1], reg1[2],  r2_from,
                     r2_to, reg2[1], reg2[2],  mat_b, mat_c, mat_d))
    # Read the number of total RNAs in each region if the bam file is given
    if settings.total_RNA:
        # prepare a dictionary of features
        feat_dict = defaultdict(lambda: defaultdict(set))
        for region1 in region_ints_as1:
            for i in range(settings.seglen):
                feat_dict[region1[2]+region1[1]][region1[0]+i].add(region1)
        for region2 in region_ints_as2:
            for i in range(settings.seglen):
                feat_dict[region2[2]+region2[1]][region2[0]+i].add(region2)
        # Turn the per-position sets into a dense list indexed by position
        feat_list = {}
        for chrom, data in feat_dict.items():
            maxpos = max(data.keys())
            feat_list[chrom] = [list(data[k]) for k in range(maxpos+1)]
        totRNA_counts = defaultdict(int)
        for bamfile in settings.total_RNA.split(','):
            totcounts = RILseq.count_features(
                feat_list, pysam.Samfile(bamfile), 5,
                rev=settings.total_reverse)
            for k, v in totcounts.items():
                totRNA_counts[k] += v
        # Collect all the ratios between IP and total, then pick the requested
        # percentile to avoid outliers
        max_IP_div_total_as1 = []
        max_IP_div_total_as2 = []
        for reg, counts in totRNA_counts.items():
            if counts > settings.min_total_counts:
                counts = float(counts+1)
                max_IP_div_total_as1.append(region_ints_as1[reg]/counts)
                max_IP_div_total_as2.append(region_ints_as2[reg]/counts)
        mm_sorted = sorted(max_IP_div_total_as2+max_IP_div_total_as1)
        if mm_sorted:
            # Clamp the index so a percentile of 1.0 selects the last element
            pct_idx = min(int(len(mm_sorted)*settings.norm_percentile),
                          len(mm_sorted)-1)
            max_IP_div_total = mm_sorted[pct_idx]
        else:
            # No region passed min_total_counts: same default as when no
            # total RNA is given
            max_IP_div_total = 0
        sys.stderr.write("%f\n"%(max_IP_div_total))

    else:
        totRNA_counts = defaultdict(int)
        max_IP_div_total = 0
    # Read the additional data to decorate the results with
    RILseq.report_interactions(
        region_interactions, sys.stdout, interacting_regions, settings.seglen,
        settings.ec_dir, settings.EC_chrlist, settings.refseq_dir,
        settings.targets_file, settings.rep_table, settings.single_counts,
        settings.shuffles, settings.RNAup_cmd, settings.servers,
        settings.length, settings.est_utr_lens, settings.pad_seqs,
        totRNA_counts, max_IP_div_total)

    return 0        # success
def main(argv=None):
    """Write a BED track of both sides of every chimeric read to stdout.

    Read positions are taken from ``settings.list_reads`` (optionally
    restricted to the significant interactions in ``settings.summary``), the
    read sequences are fetched from the BAM files with ``get_reads_seqs``,
    and each side is extended along the genome with ``extend_alignment``.
    One BED row is written for the 5' side and one for the 3' side.
    Per-read diagnostics go to stderr so that stdout holds only the track.

    Arguments:
    - `argv`: command line arguments, forwarded to ``process_command_line``

    Returns 0 on success.
    """
    settings = process_command_line(argv)
    # Read the read names and positions
    read_5ps = {}
    read_3ps = {}
    genome = {}
    gsize = {}  # genome size dictionary - {chr: size}
    for sr in SeqIO.parse(settings.genome, 'fasta'):
        genome[sr.id] = sr.seq
        gsize[sr.id] = len(sr.seq)
    if len(settings.BC_chrlist) >= 2:
        # Pair up the alternating names of the comma-separated list
        chr_names = settings.BC_chrlist.split(',')
        chr_dict = dict(zip(chr_names[0::2], chr_names[1::2]))
    else:
        chr_dict = {}
    if settings.summary:
        # Restrict the output to reads of the significant interactions
        sig_reads = RILseq.read_significant_reads(
            settings.summary, chr_dict, gname=settings.gene_name)

    with open(settings.list_reads) as list_in:
        for line in csv.reader(list_in, delimiter='\t'):
            # skip single
            if len(line) > 7 and line[7]=="single":
                continue
            # Positions are stored 0-based: [coordinate, strand, chromosome]
            side_5p = [int(line[1])-1, line[2], line[0]]
            side_3p = [int(line[4])-1, line[5], line[3]]
            if settings.summary:
                # Skip reads whose second side is not listed for the first
                # side among the significant reads
                if tuple(side_3p) not in sig_reads[tuple(side_5p)]:
                    continue
            read_5ps[line[6]] = side_5p
            read_3ps[line[6]] = side_3p
    # Read the bam files and return the long sequences
    r1_seqs = {}
    r2_seqs = {}
    for bamfile in list(RILseq.flat_list(settings.bamfiles)):
        r1s, r2s = get_reads_seqs(
            pysam.AlignmentFile(bamfile, 'rb'), read_5ps.keys(), rev=settings.reverse)
        r1_seqs.update(r1s)
        r2_seqs.update(r2s)
    # For each read find the overlap, if exists and find the fusion point
    outer = csv.writer(sys.stdout, delimiter='\t')
    print ('track name="%s" description="%s" visibility=4 itemRgb="On" useScore=0'%(
        settings.track_name, settings.track_desc))
    # The code is written so r1 is the 3' end of the fragment
    for rname in set(r2_seqs.keys()):
        r2seq = r2_seqs[rname]
        # Single-end reads have no first mate
        r1seq = r1_seqs.get(rname, '')
        # r2seq, r1seq are the sequences from the bam files for paired end
        s1, overlap, s2 = find_overlap(r2seq, r1seq)
        pos5, strand5, chrom5 = read_5ps[rname]
        pos3, strand3, chrom3 = read_3ps[rname]
        # Diagnostics are sent to stderr: on stdout they would be interleaved
        # with the BED rows and break the track format
        sys.stderr.write('%s\n' % (s1+overlap+s2))
        sys.stderr.write('%s\n' % pos5)
        sys.stderr.write('%s\n' % pos3)
        side_5p_len = extend_alignment(
            s1+overlap+s2, pos5, 0, False, strand5, genome[chrom5])
        side_3p_len = extend_alignment(
            s1+overlap+s2, 0, pos3, True, strand3, genome[chrom3])
        # Write each of the sides to the output file
        score = 0
        if settings.rand_score:
            score = random.randint(0, 1000)
        # Intervals are clipped to [0, chromosome length]
        if strand5 == '+':
            gfrom = max(0, pos5)
            gto = min(gsize[chrom5], pos5+side_5p_len)
            outer.writerow([
                    chrom5, gfrom, gto, "%s_5p"%rname, score, '+',
                    gfrom, gto, settings.pos_first])
        elif strand5 == '-':
            gfrom = max(0, pos5-side_5p_len+1)
            gto = min(gsize[chrom5], pos5+1)
            outer.writerow([
                    chrom5, gfrom, gto, "%s_5p"%rname, score, '-',
                    gfrom, gto, settings.rev_first])
        if strand3 == '+':
            gfrom = max(0, pos3-side_3p_len+1)
            gto = min(gsize[chrom3], pos3+1)
            outer.writerow([
                    chrom3, gfrom, gto, "%s_3p"%rname, score, '+',
                    gfrom, gto, settings.pos_second])
        elif strand3 == '-':
            gfrom = max(0, pos3)
            gto = min(gsize[chrom3], pos3+side_3p_len)
            outer.writerow([
                    chrom3, gfrom, gto, "%s_3p"%rname, score, '-',
                    gfrom, gto, settings.rev_second])
    return 0        # success
Esempio n. 10
0
def main(argv=None):
    """Print link lines between interacting regions of one chromosome.

    Interactions are read from ``settings.reads_in`` in bins of
    ``settings.region``; only pairs where both regions lie on
    ``settings.chrn`` are kept. For each pair with more than
    ``settings.min_interactions`` interactions a line
    ``chr start end chr start end attribute`` is written to stdout. The
    attribute is a thickness derived from the log10 of the count, orange when
    the pair overlaps an sRNA (``settings.sRNAs``) and red when it matches a
    known pair (``settings.known``).

    Arguments:
    - `argv`: command line arguments, forwarded to ``process_command_line``

    Returns 0 on success.
    """
    settings = process_command_line(argv)
    if len(settings.EC_chrlist) >= 2:
        chr_names = settings.EC_chrlist.split(',')
        chr_dict = dict(zip(chr_names[0::2], chr_names[1::2]))
    else:
        chr_dict = {}
    with open(settings.reads_in) as reads_in:
        region_interactions, _, _, _ = RILseq.read_reads_table(
            reads_in, settings.region)
    both_strs = defaultdict(lambda: defaultdict(int))
    if settings.summary:
        sig_reads = RILseq.read_significant_reads(
            settings.summary, chr_dict)
    if settings.known:
        known_reads = defaultdict(list)
        ptt_c, ptt_str = get_coords("%s.ptt.gz"%settings.refseq_dir)
        rnt_c, rnt_str = get_coords("%s.rnt.gz"%settings.refseq_dir)
        with open(settings.known) as known_in:
            for line in known_in:
                spl = line.strip().split()
                if len(spl) < 2:
                    # Blank or incomplete line: no pair to look up
                    continue
                try:
                    scoor = rnt_c[spl[0]]
                    rcoor = ptt_c[spl[1]]
                except KeyError:
                    # Names without coordinates are ignored
                    continue
                # Register every position pair of the two genes, both ways
                for i in range(scoor[0], scoor[1]):
                    for j in range(rcoor[0], rcoor[1]):
                        known_reads[i].append(j)
                        known_reads[j].append(i)

    for reg1 in region_interactions:
        if reg1[2] != settings.chrn:
            continue
        for reg2 in region_interactions[reg1]:
            if reg2[2] != settings.chrn:
                continue
            if settings.summary:
                # Count only the reads that are part of a significant pair
                nsigs = 0
                for r1, r2, in region_interactions[reg1][reg2]:
                    nsigs += int((r2, reg2[1], reg2[2]) in \
                        sig_reads[(r1, reg1[1], reg1[2])])
            else:
                nsigs = len(region_interactions[reg1][reg2])
            # Counts are merged over strands: only the start position is used
            both_strs[reg1[0]][reg2[0]] += nsigs
    if settings.sRNAs:
        from RILseq.ecocyc_parser import read_genes_data
        uid_pos, uid_names, uid_tudata, sRNAs_list, other_RNAs_list, rRNAs = \
            read_genes_data(settings.ec_dir)
        # All positions covered by an sRNA gene
        sposs = set()
        for g in sRNAs_list:
            sposs.update(range(uid_pos[g][1], uid_pos[g][2]))
    if settings.known:
        known_positions = set(known_reads.keys())
    for r1 in both_strs:
        r1_span = set(range(r1, r1+settings.region))
        for r2 in both_strs[r1]:
            if both_strs[r1][r2] > settings.min_interactions:
                color = 'thickness=%dp'%max(
                    int(log(both_strs[r1][r2])/log(10)),1)
                r2_span = set(range(r2, r2+settings.region))
                if settings.sRNAs:
                    if (r1_span | r2_span) & sposs:
                        color = 'color=orange'
                if settings.known:
                    # Red takes precedence over orange for known pairs
                    if any(r2_span & set(known_reads[k])
                           for k in r1_span & known_positions):
                        color = 'color=red'

                sys.stdout.write('%s %d %d %s %d %d %s\n'%(
                        settings.print_chr, r1+1, r1+settings.region,
                        settings.print_chr, r2+1, r2+settings.region,
                        color))

    return 0        # success
Esempio n. 11
0
def main(argv=None):
    """Extract read ends from BAM files, map them and print the reads table.

    For every BAM file named in ``settings.bamfiles`` the ends of the reads
    are written to two FASTQ files under ``settings.dirout``, each FASTQ
    file is mapped with bwa (unless ``settings.skip_mapping`` is set, in
    which case BAM files from an earlier run are expected in
    ``settings.dirout``), and the resulting table is written to stdout by
    ``RILseq.write_reads_table``.

    Args:
        argv: command-line arguments handed to ``process_command_line``;
            ``None`` leaves the choice of arguments to that function.

    Returns:
        0 on success.
    """
    settings = process_command_line(argv)
    # Read the transcripts if given
    if settings.transcripts:
        trans_dict = RILseq.read_transcripts(settings.transcripts)
    else:
        trans_dict = None

    # Destination passed as ``write_single`` to write_reads_table: a
    # dedicated file, stdout, or nothing at all.
    outall = None
    owns_outall = False  # only close the handle if it was opened here
    if settings.all_reads:
        try:
            outall = open(settings.all_reads, 'w')
            owns_outall = True
        except IOError as err:
            # Keep the original best-effort behaviour (carry on without the
            # extra output) but tell the user instead of failing silently.
            sys.stderr.write(
                'Warning: cannot open %s for writing (%s); '
                'the all-reads output is disabled\n' % (
                    settings.all_reads, err))
    elif settings.add_all_reads:
        outall = sys.stdout

    # Get the ends of the reads from the bam files
    try:
        for bf in RILseq.flat_list(settings.bamfiles):
            _map_library_ends(bf, settings, trans_dict, outall)
    finally:
        if owns_outall:
            outall.close()
    return 0  # success


def _map_library_ends(bf, settings, trans_dict, outall):
    """Process one input BAM file: dump read ends, map them, print the table.

    Args:
        bf: path of the input BAM file.
        settings: parsed command-line settings (as used by ``main``).
        trans_dict: value returned by ``RILseq.read_transcripts`` or None.
        outall: open handle passed as ``write_single`` to
            ``RILseq.write_reads_table``, or None.
    """
    bfin = pysam.Samfile(bf)
    # Every BAM handle opened for this library; all are closed on exit.
    opened_bams = [bfin]
    try:
        outhead = bf.rsplit('.', 1)[0]
        libname = outhead.rsplit('/', 1)[-1]
        fsq1name = "%s/%s_ends_1.fastq" % (settings.dirout, libname)
        fsq2name = "%s/%s_ends_2.fastq" % (settings.dirout, libname)
        if settings.skip_mapping:
            # The FASTQ output is not needed; get_unmapped_reads is still
            # called because its return value is used below.
            fsq1 = fsq2 = open(os.devnull, 'w')
        else:
            fsq1 = open(fsq1name, 'w')
            fsq2 = open(fsq2name, 'w')
        try:
            single_mapped = RILseq.get_unmapped_reads(
                bfin,
                fsq1,
                fsq2,
                settings.length,
                settings.maxG,
                rev=settings.reverse_complement,
                all_reads=True,
                dust_thr=settings.dust_thr)
        finally:
            # Close (and thereby flush) the FASTQ files before bwa reads
            # them by name. close() on an already-closed file is a no-op,
            # so this is safe when both names refer to the same handle.
            fsq1.close()
            fsq2.close()

        reads_in = []
        # Map the fastq files to the genome
        for fqname in (fsq1name, fsq2name):
            bamheadname = fqname.rsplit('.', 1)[0].rsplit('/', 1)[-1]
            if settings.skip_mapping:
                # Reuse the BAM file expected to exist from a previous run
                bamname = "%s/%s.bam" % (settings.dirout, bamheadname)
            else:
                bamname = RILseq.run_bwa(settings.bwa_exec,
                                         fqname,
                                         None,
                                         settings.dirout,
                                         bamheadname,
                                         settings.max_mismatches,
                                         settings.genome_fasta,
                                         settings.params_aln,
                                         '',
                                         settings.samse_params,
                                         settings.samtools_cmd,
                                         processors=settings.processors)
            bamin = pysam.Samfile(bamname)
            opened_bams.append(bamin)
            reads_in.append(
                RILseq.read_bam_file(bamin, bamin.references,
                                     settings.allowed_mismatches))
        RILseq.write_reads_table(sys.stdout,
                                 reads_in[0],
                                 reads_in[1],
                                 bfin.references,
                                 settings.distance,
                                 not settings.keep_circular,
                                 trans_dict,
                                 write_single=outall,
                                 single_mapped=single_mapped,
                                 max_NM=settings.allowed_mismatches)
    finally:
        # BAM handles stay open until the table is written, in case the
        # values returned by read_bam_file still depend on them.
        for handle in opened_bams:
            handle.close()