Example #1
0
def main(argv=None):
    """Map every FASTQ library with BWA and write its coverage and counts.

    Each read-1 file in ``settings.fastq_1`` is paired by position with the
    file at the same index of ``settings.fastq_2`` (``None`` when there is no
    mate) and handed to ``RILseq.run_bwa``.  When a GFF file is configured
    and readable, features are counted per library; when
    ``settings.create_wig`` is set, a coverage wiggle file is written per
    library.  Finally the table of counts (all libraries as columns) is
    written once per read-1 file.

    Output file names are built from ``settings.dirout`` and the part of the
    read-1 file name that precedes ``"_cutadapt"``.

    Args:
        argv: command-line arguments, forwarded to ``process_command_line``.

    Returns:
        0 on success.
    """
    import sys  # local import: only needed for the warning below

    settings = process_command_line(argv)
    if not os.path.exists(settings.dirout):
        os.makedirs(settings.dirout)

    gcounts = {}
    lib_order = []
    if settings.genes_gff:
        try:
            with open(settings.genes_gff) as gff_in:
                pos_feat_list, all_features = RILseq.read_gtf(
                    gff_in, settings.feature, settings.identifier)
        except IOError as err:
            # Counting is optional: report the problem and carry on without it
            sys.stderr.write(
                'Cannot read %s (%s), gene counts will not be generated\n' % (
                    settings.genes_gff, err))
            settings.genes_gff = None

    fastq_1_list = list(RILseq.flat_list(settings.fastq_1))
    fastq_2_list = list(RILseq.flat_list(settings.fastq_2))
    for i, r1_name in enumerate(fastq_1_list):
        # Single-end library when there is no matching read-2 file
        r2_name = fastq_2_list[i] if i < len(fastq_2_list) else None
        libname = r1_name.rsplit('.', 1)[0].rsplit('/', 1)[-1]
        bamname = RILseq.run_bwa(
            settings.bwa_exec, r1_name, r2_name,
            settings.dirout, '%s_bwa' % libname, settings.allowed_mismatches,
            settings.genome_fasta, settings.params_aln, settings.sampe_params,
            settings.samse_params, settings.samtools_cmd)
        samfile = pysam.Samfile(bamname)
        try:
            if settings.genes_gff:
                lib_order.append(libname)
                gcounts[libname] = RILseq.count_features(
                    pos_feat_list, samfile, settings.overlap,
                    rev=settings.reverse_complement)
            if settings.create_wig:
                coverage = RILseq.generate_wig(
                    samfile, rev=settings.reverse_complement, first_pos=False)
                # Open only this library's wig file.  Re-opening every wig
                # file in 'w' mode on each iteration would truncate the
                # coverage already written for the previous libraries.
                wig_name = "%s/%s_coverage.wig" % (
                    settings.dirout, r1_name.split("_cutadapt")[0])
                with open(wig_name, 'w') as outwig:
                    RILseq.print_wiggle(
                        coverage, "%s_single_fragments_coverage" % libname,
                        "%s single fragments coverage" % libname, outwig)
        finally:
            samfile.close()

    # Print the table of counts
    if settings.genes_gff:
        header = ['Gene name'] + lib_order
        rows = [[g] + [gcounts[libn][g] for libn in lib_order]
                for g in sorted(all_features)]
        # The same table is written next to every read-1 file
        for fastq in fastq_1_list:
            table_name = "%s/%s_counts.txt" % (
                settings.dirout, fastq.split("_cutadapt")[0])
            with open(table_name, 'w') as table_out:
                outt = csv.writer(table_out, delimiter='\t')
                outt.writerow(header)
                outt.writerows(rows)
    return 0        # success
Example #2
0
def main(argv=None):
    """Map every FASTQ library with BWA into one wiggle and one count table.

    Each read-1 file in ``settings.fastq_1`` is paired by position with the
    file at the same index of ``settings.fastq_2`` (``None`` when there is no
    mate) and handed to ``RILseq.run_bwa``.  The coverage of all the
    libraries is written to a single ``<dirout>/<outhead>_coverage.wig``
    file and, when a GFF file is configured and readable, the feature counts
    of all the libraries are written to ``<dirout>/<outhead>_counts.txt``.

    Args:
        argv: command-line arguments, forwarded to ``process_command_line``.

    Returns:
        0 on success.
    """
    import sys  # local import: only needed for the warning below

    settings = process_command_line(argv)
    if not os.path.exists(settings.dirout):
        os.makedirs(settings.dirout)
    wig_name = "%s/%s_coverage.wig" % (settings.dirout, settings.outhead)
    # The context manager guarantees the wig file is flushed and closed
    with open(wig_name, 'w') as outwig:
        gcounts = {}
        lib_order = []
        if settings.genes_gff:
            try:
                with open(settings.genes_gff) as gff_in:
                    pos_feat_list, all_features = RILseq.read_gtf(
                        gff_in, settings.feature, settings.identifier)
            except IOError as err:
                # Counting is optional: report the problem and carry on
                sys.stderr.write(
                    'Cannot read %s (%s), gene counts will not be '
                    'generated\n' % (settings.genes_gff, err))
                settings.genes_gff = None
        fastq_2_list = list(RILseq.flat_list(settings.fastq_2))
        for i, r1_name in enumerate(RILseq.flat_list(settings.fastq_1)):
            # Single-end library when there is no matching read-2 file
            r2_name = fastq_2_list[i] if i < len(fastq_2_list) else None
            libname = r1_name.rsplit('.', 1)[0].rsplit('/', 1)[-1]
            bamname = RILseq.run_bwa(
                settings.bwa_exec, r1_name, r2_name,
                settings.dirout, '%s_bwa' % libname,
                settings.allowed_mismatches,
                settings.genome_fasta, settings.params_aln,
                settings.sampe_params,
                settings.samse_params, settings.samtools_cmd,
                processors=settings.processors)
            samfile = pysam.Samfile(bamname)
            try:
                if settings.genes_gff:
                    lib_order.append(libname)
                    gcounts[libname] = RILseq.count_features(
                        pos_feat_list, samfile, settings.overlap,
                        rev=settings.reverse_complement)
                coverage = RILseq.generate_wig(
                    samfile, rev=settings.reverse_complement, first_pos=False)
                RILseq.print_wiggle(
                    coverage, "%s_single_fragments_coverage" % libname,
                    "%s single fragments coverage" % libname, outwig)
            finally:
                samfile.close()
    # Print the table of counts
    if settings.genes_gff:
        table_name = "%s/%s_counts.txt" % (settings.dirout, settings.outhead)
        with open(table_name, 'w') as outtable:
            outt = csv.writer(outtable, delimiter='\t')
            outt.writerow(['Gene name'] + lib_order)
            for g in sorted(all_features):
                outt.writerow([g] + [gcounts[libn][g] for libn in lib_order])
    return 0        # success
def main(argv=None):
    """Count features in every reads file and print a tab-delimited table.

    The features are read from ``settings.genes_gff``.  Every file in
    ``settings.reads_files`` becomes one column of the table written to
    stdout; the name of each file is echoed to stderr as it is processed.
    With ``settings.quiet`` the header row and the features whose name
    starts with ``'~'`` are omitted.

    Args:
        argv: command-line arguments, forwarded to ``process_command_line``.

    Returns:
        0 on success, 1 when the annotation file cannot be read.
    """
    settings = process_command_line(argv)
    try:
        with open(settings.genes_gff) as gff_in:
            pos_feat_list, all_features = RILseq.read_gtf(
                gff_in, settings.feature, settings.identifier)
    except IOError as err:
        # Tell the user why the exit status is non-zero
        sys.stderr.write(
            'Cannot read %s: %s\n' % (settings.genes_gff, err))
        return 1
    lib_order = []
    all_counts = {}
    if settings.singles:
        # Counting singles overrides the first/second selection
        settings.only_first = True
        settings.only_second = False
    for r1_name in RILseq.flat_list(settings.reads_files):
        sys.stderr.write('%s\n' % str(r1_name))
        lib_order.append(r1_name)
        # Close each reads file as soon as it has been counted
        with open(r1_name) as reads_in:
            all_counts[r1_name] = count_features(
                pos_feat_list, reads_in, settings.overlap, length=25,
                ignore_first=settings.only_second,
                ignore_second=settings.only_first,
                count_singles=settings.singles)
    outt = csv.writer(sys.stdout, delimiter='\t')
    if not settings.quiet:
        outt.writerow(['Gene name'] + lib_order)
    for g in sorted(all_features):
        if settings.quiet and g.startswith('~'):
            continue
        outt.writerow([g] + [all_counts[libn][g] for libn in lib_order])
    return 0        # success
def main(argv=None):
    """Re-map the ends of the reads of each BAM file and print a reads table.

    For every BAM file in ``settings.bamfiles`` the read ends are written by
    ``RILseq.get_unmapped_reads`` to two FASTQ files in ``settings.dirout``,
    each FASTQ file is mapped with ``RILseq.run_bwa`` and the two resulting
    BAM files are combined by ``RILseq.write_reads_table`` into a table that
    is written to stdout.  With ``settings.skip_mapping`` the FASTQ output is
    discarded and BAM files with the expected names are read instead.

    Single reads are additionally written to ``settings.all_reads`` when
    given, or to stdout when ``settings.add_all_reads`` is set.

    Args:
        argv: command-line arguments, forwarded to ``process_command_line``.

    Returns:
        0 on success.
    """
    settings = process_command_line(argv)
    # Read the transcripts if given
    if settings.transcripts:
        trans_dict = RILseq.read_transcripts(settings.transcripts)
    else:
        trans_dict = None
    # Choose where the single reads go; remember whether we own the handle
    outall = None
    close_outall = False
    if settings.all_reads:
        try:
            outall = open(settings.all_reads, 'w')
            close_outall = True
        except IOError as err:
            sys.stderr.write(
                'Cannot write %s (%s), single reads will not be saved\n' % (
                    settings.all_reads, err))
    elif settings.add_all_reads:
        outall = sys.stdout
    try:
        for bf in RILseq.flat_list(settings.bamfiles):
            bfin = pysam.Samfile(bf)
            libname = bf.rsplit('.', 1)[0].rsplit('/', 1)[-1]
            fsq1name = "%s/%s_ends_1.fastq" % (settings.dirout, libname)
            fsq2name = "%s/%s_ends_2.fastq" % (settings.dirout, libname)
            if settings.skip_mapping:
                fsq1 = open(os.devnull, 'w')
                fsq2 = fsq1
            else:
                fsq1 = open(fsq1name, 'w')
                fsq2 = open(fsq2name, 'w')
            # Get the ends of the reads from the bam file
            try:
                single_mapped = RILseq.get_unmapped_reads(
                    bfin, fsq1, fsq2, settings.length, settings.maxG,
                    rev=settings.reverse_complement, all_reads=True,
                    dust_thr=settings.dust_thr)
            finally:
                # run_bwa below is given the file *names*, so everything
                # must be flushed to disk first.  Closing a file that is
                # already closed is a no-op, so this is safe for the shared
                # devnull handle and if the helper closed the files itself.
                fsq1.close()
                fsq2.close()
            reads_in = []
            # Map the fastq files to the genome
            for fqname in (fsq1name, fsq2name):
                bamheadname = fqname.rsplit('.', 1)[0].rsplit('/', 1)[-1]
                if settings.skip_mapping:
                    bamname = "%s/%s.bam" % (settings.dirout, bamheadname)
                else:
                    bamname = RILseq.run_bwa(
                        settings.bwa_exec, fqname, None,
                        settings.dirout, bamheadname, settings.max_mismatches,
                        settings.genome_fasta, settings.params_aln,
                        '', settings.samse_params,
                        settings.samtools_cmd, processors=settings.processors)
                bamin = pysam.Samfile(bamname)
                reads_in.append(RILseq.read_bam_file(
                    bamin, bamin.references, settings.allowed_mismatches))
            RILseq.write_reads_table(
                sys.stdout, reads_in[0], reads_in[1], bfin.references,
                settings.distance, not settings.keep_circular,
                trans_dict, write_single=outall, single_mapped=single_mapped,
                max_NM=settings.allowed_mismatches)
    finally:
        # Never close sys.stdout, only a file opened here
        if close_outall:
            outall.close()
    return 0        # success
Example #5
0
def main(argv=None):
    """Print a tab-delimited table of feature counts, one column per library.

    Features come from the annotation file ``settings.genes_gff``; the
    libraries are the files listed in ``settings.reads_files``.  The table
    goes to stdout and the name of every library is reported on stderr when
    its processing starts.  When ``settings.quiet`` is set, neither the
    header row nor the features whose name begins with ``'~'`` are printed.

    Args:
        argv: command-line arguments, forwarded to ``process_command_line``.

    Returns:
        0 on success, 1 when the annotation file cannot be read.
    """
    settings = process_command_line(argv)
    try:
        with open(settings.genes_gff) as gff_handle:
            pos_feat_list, all_features = RILseq.read_gtf(gff_handle,
                                                          settings.feature,
                                                          settings.identifier)
    except IOError as err:
        # Explain the failure instead of exiting silently
        sys.stderr.write('Cannot read %s: %s\n' % (settings.genes_gff, err))
        return 1
    lib_order = []
    all_counts = {}
    if settings.singles:
        # Counting singles overrides the first/second selection
        settings.only_first = True
        settings.only_second = False
    for r1_name in RILseq.flat_list(settings.reads_files):
        sys.stderr.write('%s\n' % str(r1_name))
        lib_order.append(r1_name)
        # The reads file is closed once its counts have been collected
        with open(r1_name) as reads_handle:
            all_counts[r1_name] = count_features(
                pos_feat_list,
                reads_handle,
                settings.overlap,
                length=25,
                ignore_first=settings.only_second,
                ignore_second=settings.only_first,
                count_singles=settings.singles)
    outt = csv.writer(sys.stdout, delimiter='\t')
    if not settings.quiet:
        outt.writerow(['Gene name'] + lib_order)
    for g in sorted(all_features):
        if settings.quiet and g.startswith('~'):
            continue
        outt.writerow([g] + [all_counts[libn][g] for libn in lib_order])
    return 0  # success
def main(argv=None):
    """Write a BED track with the two sides of every chimeric fragment.

    The reads listed in ``settings.list_reads`` (rows marked ``single`` are
    skipped; with ``settings.summary`` only the reads returned by
    ``RILseq.read_significant_reads`` are kept) are looked up in the BAM
    files.  For each read the two mates are merged with ``find_overlap``,
    the length of each side is computed with ``extend_alignment`` against
    the genome, and one row per side is written to stdout after a ``track``
    header line.  Coordinates are clipped to ``[0, chromosome length]``.

    Args:
        argv: command-line arguments, forwarded to ``process_command_line``.

    Returns:
        0 on success.
    """
    settings = process_command_line(argv)
    # Sequence and length of every record of the genome FASTA, keyed by id
    genome = {}
    gsize = {}
    for sr in SeqIO.parse(settings.genome, 'fasta'):
        genome[sr.id] = sr.seq
        gsize[sr.id] = len(sr.seq)
    # Comma separated list: items at even positions are the keys, the items
    # following them are the values
    if len(settings.EC_chrlist) >= 2:
        chr_fields = settings.EC_chrlist.split(',')
        chr_dict = dict(zip(chr_fields[0::2], chr_fields[1::2]))
    else:
        chr_dict = {}
    sig_reads = None
    if settings.summary:
        sig_reads = RILseq.read_significant_reads(
            settings.summary, chr_dict, gname=settings.gene_name)

    # Read the read names and positions: {name: [0-based pos, strand, chrom]}
    read_5ps = {}
    read_3ps = {}
    with open(settings.list_reads) as reads_in:
        for line in csv.reader(reads_in, delimiter='\t'):
            # skip single
            if len(line) > 7 and line[7] == "single":
                continue
            end_5p = (int(line[1])-1, line[2], line[0])
            end_3p = (int(line[4])-1, line[5], line[3])
            if settings.summary and end_3p not in sig_reads[end_5p]:
                continue
            read_5ps[line[6]] = list(end_5p)
            read_3ps[line[6]] = list(end_3p)
    # Read the bam files and return the long sequences
    r1_seqs = {}
    r2_seqs = {}
    for bamfile in RILseq.flat_list(settings.bamfiles):
        r1s, r2s = get_reads_seqs(
            pysam.Samfile(bamfile), read_5ps.keys(), rev=settings.reverse)
        r1_seqs.update(r1s)
        r2_seqs.update(r2s)
    # For each read find the overlap, if exists and find the fusion point
    outer = csv.writer(sys.stdout, delimiter='\t')
    # A single parenthesised argument prints identically on Python 2 and 3
    print('track name="%s" description="%s" visibility=4 itemRgb="On" useScore=0' % (
        settings.track_name, settings.track_desc))
    # The code is written so r1 is the 3' end of the fragment
    for rname in set(r2_seqs.keys()):
        r2seq = r2_seqs[rname]
        r1seq = r1_seqs.get(rname, '')  # '' for single-end reads
        s1, overlap, s2 = find_overlap(r2seq, r1seq)
        fragment = s1+overlap+s2
        pos_5p, strand_5p, chrom_5p = read_5ps[rname]
        pos_3p, strand_3p, chrom_3p = read_3ps[rname]
        side_5p_len = extend_alignment(
            fragment, pos_5p, 0, False, strand_5p, genome[chrom_5p])
        side_3p_len = extend_alignment(
            fragment, 0, pos_3p, True, strand_3p, genome[chrom_3p])
        # Write each of the sides to the output file
        score = random.randint(0, 1000) if settings.rand_score else 0
        if strand_5p == '+':
            gfrom = max(0, pos_5p)
            gto = min(gsize[chrom_5p], pos_5p+side_5p_len)
            outer.writerow([
                chrom_5p, gfrom, gto, "%s_5p" % rname, score, '+',
                gfrom, gto, settings.pos_first])
        elif strand_5p == '-':
            gfrom = max(0, pos_5p-side_5p_len+1)
            gto = min(gsize[chrom_5p], pos_5p+1)
            outer.writerow([
                chrom_5p, gfrom, gto, "%s_5p" % rname, score, '-',
                gfrom, gto, settings.rev_first])
        if strand_3p == '+':
            gfrom = max(0, pos_3p-side_3p_len+1)
            gto = min(gsize[chrom_3p], pos_3p+1)
            outer.writerow([
                chrom_3p, gfrom, gto, "%s_3p" % rname, score, '+',
                gfrom, gto, settings.pos_second])
        elif strand_3p == '-':
            gfrom = max(0, pos_3p)
            gto = min(gsize[chrom_3p], pos_3p+side_3p_len)
            outer.writerow([
                chrom_3p, gfrom, gto, "%s_3p" % rname, score, '-',
                gfrom, gto, settings.rev_second])
    return 0        # success
def main(argv=None):
    """Write a BED track with the two sides of every chimeric fragment.

    The reads listed in ``settings.list_reads`` (rows marked ``single`` are
    skipped; with ``settings.summary`` only the reads returned by
    ``RILseq.read_significant_reads`` are kept) are looked up in the BAM
    files.  For each read the two mates are merged with ``find_overlap``,
    the length of each side is computed with ``extend_alignment`` against
    the genome, and one row per side is written to stdout after a ``track``
    header line.  Coordinates are clipped to ``[0, chromosome length]``.

    The merged sequence and the two positions of every read are reported on
    stderr, so that the track written to stdout stays a valid file.

    Args:
        argv: command-line arguments, forwarded to ``process_command_line``.

    Returns:
        0 on success.
    """
    settings = process_command_line(argv)
    # Sequence and length ({chr: size}) of every record of the genome FASTA
    genome = {}
    gsize = {}
    for sr in SeqIO.parse(settings.genome, 'fasta'):
        genome[sr.id] = sr.seq
        gsize[sr.id] = len(sr.seq)
    # Comma separated list: items at even positions are the keys, the items
    # following them are the values, e.g. {'COLI-K12': 'chr'}
    if len(settings.BC_chrlist) >= 2:
        chr_fields = settings.BC_chrlist.split(',')
        chr_dict = dict(zip(chr_fields[0::2], chr_fields[1::2]))
    else:
        chr_dict = {}
    sig_reads = None
    if settings.summary:
        # Keep only the reads of the significant interactions (optionally
        # restricted with settings.gene_name)
        sig_reads = RILseq.read_significant_reads(
            settings.summary, chr_dict, gname=settings.gene_name)

    # Read the read names and positions:
    #   {read_id: [coord (0-based), strand, chrom]} for each of the two ends
    read_5ps = {}
    read_3ps = {}
    with open(settings.list_reads) as reads_in:
        for line in csv.reader(reads_in, delimiter='\t'):
            # skip single
            if len(line) > 7 and line[7] == "single":
                continue
            end_5p = (int(line[1])-1, line[2], line[0])
            end_3p = (int(line[4])-1, line[5], line[3])
            # skip if the second end is not listed for this first end among
            # the significant reads
            if settings.summary and end_3p not in sig_reads[end_5p]:
                continue
            read_5ps[line[6]] = list(end_5p)
            read_3ps[line[6]] = list(end_3p)
    # Read the bam files and return the long sequences
    r1_seqs = {}
    r2_seqs = {}
    for bamfile in RILseq.flat_list(settings.bamfiles):
        r1s, r2s = get_reads_seqs(
            pysam.AlignmentFile(bamfile, 'rb'), read_5ps.keys(),
            rev=settings.reverse)
        r1_seqs.update(r1s)
        r2_seqs.update(r2s)
    # For each read find the overlap, if exists and find the fusion point
    outer = csv.writer(sys.stdout, delimiter='\t')
    print('track name="%s" description="%s" visibility=4 itemRgb="On" useScore=0' % (
        settings.track_name, settings.track_desc))
    # The code is written so r1 is the 3' end of the fragment
    for rname in set(r2_seqs.keys()):
        r2seq = r2_seqs[rname]
        r1seq = r1_seqs.get(rname, '')  # '' for single-end reads
        # r2seq, r1seq are the sequences from the bam files for paired end
        s1, overlap, s2 = find_overlap(r2seq, r1seq)
        fragment = s1+overlap+s2
        pos_5p, strand_5p, chrom_5p = read_5ps[rname]
        pos_3p, strand_3p, chrom_3p = read_3ps[rname]
        # Diagnostics go to stderr: printing them to stdout would interleave
        # them with the rows of the track and corrupt it
        sys.stderr.write('%s\n%s\n%s\n' % (fragment, pos_5p, pos_3p))
        side_5p_len = extend_alignment(
            fragment, pos_5p, 0, False, strand_5p, genome[chrom_5p])
        side_3p_len = extend_alignment(
            fragment, 0, pos_3p, True, strand_3p, genome[chrom_3p])
        # Write each of the sides to the output file
        score = random.randint(0, 1000) if settings.rand_score else 0
        if strand_5p == '+':
            gfrom = max(0, pos_5p)
            gto = min(gsize[chrom_5p], pos_5p+side_5p_len)
            outer.writerow([
                chrom_5p, gfrom, gto, "%s_5p" % rname, score, '+',
                gfrom, gto, settings.pos_first])
        elif strand_5p == '-':
            gfrom = max(0, pos_5p-side_5p_len+1)
            gto = min(gsize[chrom_5p], pos_5p+1)
            outer.writerow([
                chrom_5p, gfrom, gto, "%s_5p" % rname, score, '-',
                gfrom, gto, settings.rev_first])
        if strand_3p == '+':
            gfrom = max(0, pos_3p-side_3p_len+1)
            gto = min(gsize[chrom_3p], pos_3p+1)
            outer.writerow([
                chrom_3p, gfrom, gto, "%s_3p" % rname, score, '+',
                gfrom, gto, settings.pos_second])
        elif strand_3p == '-':
            gfrom = max(0, pos_3p)
            gto = min(gsize[chrom_3p], pos_3p+side_3p_len)
            outer.writerow([
                chrom_3p, gfrom, gto, "%s_3p" % rname, score, '-',
                gfrom, gto, settings.rev_second])
    return 0        # success
Example #8
0
def main(argv=None):
    """Map the ends of the reads of every BAM file and print a reads table.

    The ends of the reads of each BAM file in ``settings.bamfiles`` are
    extracted by ``RILseq.get_unmapped_reads`` into a pair of FASTQ files
    under ``settings.dirout``.  Both FASTQ files are aligned with
    ``RILseq.run_bwa`` and the alignments are merged by
    ``RILseq.write_reads_table`` into a table printed on stdout.  When
    ``settings.skip_mapping`` is set the FASTQ output is sent to the null
    device and previously generated BAM files are read instead.

    Single reads are also written to ``settings.all_reads`` when given, or
    to stdout when ``settings.add_all_reads`` is set.

    Args:
        argv: command-line arguments, forwarded to ``process_command_line``.

    Returns:
        0 on success.
    """
    settings = process_command_line(argv)
    # Read the transcripts if given
    if settings.transcripts:
        trans_dict = RILseq.read_transcripts(settings.transcripts)
    else:
        trans_dict = None

    # Destination of the single reads; own_outall marks a file opened here
    outall = None
    own_outall = False
    if settings.all_reads:
        try:
            outall = open(settings.all_reads, 'w')
            own_outall = True
        except IOError as err:
            sys.stderr.write(
                'Cannot write %s (%s), single reads will not be saved\n' %
                (settings.all_reads, err))
    elif settings.add_all_reads:
        outall = sys.stdout

    try:
        for bf in RILseq.flat_list(settings.bamfiles):
            bfin = pysam.Samfile(bf)
            libname = bf.rsplit('.', 1)[0].rsplit('/', 1)[-1]
            fsq1name = "%s/%s_ends_1.fastq" % (settings.dirout, libname)
            fsq2name = "%s/%s_ends_2.fastq" % (settings.dirout, libname)
            if settings.skip_mapping:
                fsq1 = open(os.devnull, 'w')
                fsq2 = fsq1
            else:
                fsq1 = open(fsq1name, 'w')
                fsq2 = open(fsq2name, 'w')
            # Get the ends of the reads from the bam file
            try:
                single_mapped = RILseq.get_unmapped_reads(
                    bfin,
                    fsq1,
                    fsq2,
                    settings.length,
                    settings.maxG,
                    rev=settings.reverse_complement,
                    all_reads=True,
                    dust_thr=settings.dust_thr)
            finally:
                # The mapping step below receives the FASTQ file *names*, so
                # the buffered output has to reach the disk first.  close()
                # on a closed file does nothing, which covers the shared
                # devnull handle and a helper that closes the files itself.
                fsq1.close()
                fsq2.close()
            reads_in = []
            # Map the fastq files to the genome
            for fqname in (fsq1name, fsq2name):
                bamheadname = fqname.rsplit('.', 1)[0].rsplit('/', 1)[-1]
                if settings.skip_mapping:
                    bamname = "%s/%s.bam" % (settings.dirout, bamheadname)
                else:
                    bamname = RILseq.run_bwa(settings.bwa_exec,
                                             fqname,
                                             None,
                                             settings.dirout,
                                             bamheadname,
                                             settings.max_mismatches,
                                             settings.genome_fasta,
                                             settings.params_aln,
                                             '',
                                             settings.samse_params,
                                             settings.samtools_cmd,
                                             processors=settings.processors)
                bamin = pysam.Samfile(bamname)
                reads_in.append(
                    RILseq.read_bam_file(bamin, bamin.references,
                                         settings.allowed_mismatches))
            RILseq.write_reads_table(sys.stdout,
                                     reads_in[0],
                                     reads_in[1],
                                     bfin.references,
                                     settings.distance,
                                     not settings.keep_circular,
                                     trans_dict,
                                     write_single=outall,
                                     single_mapped=single_mapped,
                                     max_NM=settings.allowed_mismatches)
    finally:
        # sys.stdout must stay open; only close a file opened by this script
        if own_outall:
            outall.close()
    return 0  # success