Code example #1
def loadFilteredContigLengths(infile, outfile):
    '''
    load contig lengths
    '''
    outname = P.snip(os.path.dirname(infile), ".dir") + "_" + P.snip(os.path.basename(infile), ".tsv") + ".load"
    P.load(infile, outname)
    P.touch(outfile)
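Most of these examples lean on a handful of CGAT pipeline helpers (P.snip, P.load, P.touch, P.run). As a rough orientation only, here is a minimal sketch of what P.snip appears to do in the snippets above: strip a known suffix from a filename. The function name and error behaviour below are assumptions for illustration, not the CGAT implementation.

import os

def snip(filename, suffix):
    # Minimal sketch (assumption): return filename with suffix removed,
    # complaining if the suffix is not actually present.
    if not filename.endswith(suffix):
        raise ValueError("'%s' does not end with '%s'" % (filename, suffix))
    return filename[:-len(suffix)]

# Reproduces the outname construction from the example above (path is a placeholder).
infile = "filtered_contigs.dir/sample1.lengths.tsv"
outname = snip(os.path.dirname(infile), ".dir") + "_" + \
    snip(os.path.basename(infile), ".tsv") + ".load"
print(outname)  # filtered_contigs_sample1.lengths.load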
Code example #2
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = open('dupstats.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [
            x for x in open(statfile, "r").readlines()
            if not x.startswith("#") and x.strip()
        ]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --index=track
                      --table=%(tablename)s 
                > %(outfile)s '''
    P.run()
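A recurring idiom above is building a shell pipeline as a statement string with %(name)s placeholders and then calling P.run() with no arguments; P.run() appears to fill the placeholders from the caller's local variables and the pipeline PARAMS before executing the command, possibly via cluster submission. A toy stand-in that takes the context explicitly instead of inspecting the caller (the names here are illustrative, not the CGAT API):

import subprocess

def run_statement(statement, **context):
    # Toy stand-in for P.run(): fill %(name)s placeholders from an explicit
    # context dict and execute the result through the shell. The real helper
    # additionally pulls in PARAMS and handles cluster submission and logging.
    cmd = statement % context
    subprocess.check_call(cmd, shell=True)

# Usage mirroring the csv2db call above (file names are placeholders):
run_statement("cat %(tmpfilename)s | head -n 1 > %(outfile)s",
              tmpfilename="dupstats.txt", outfile="dupstats.header.txt")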
Code example #3
def buildUniformityOfCoverage(infiles, outfile):
    '''
    build matrix of coverage over contigs
    '''
    bam = infiles[0]
    track = P.snip(os.path.basename(bam), ".bam")
    tmp_bed = P.getTempFilename(".") + ".bed"
    tmp_bam = P.getTempFilename(".") + ".bam"
    
    # filter for mapped reads
    statement = '''cat %(bam)s | python %(scriptsdir)s/bam2bam.py --filter=mapped --log=/dev/null > %(tmp_bam)s
                   ; samtools index %(tmp_bam)s'''
    P.run()

    for infs in infiles[1:]:
        for inf in infs:
            if P.snip(inf, ".lengths.tsv") == track:
                length_file = inf
                

    statement = '''cat %(length_file)s | awk 'NR>1 {printf("%%s\\t0\\t%%s\\n", $1, $2)}' > %(tmp_bed)s'''
    P.run()

    statement = '''python %(scriptsdir)s/bam2peakshape.py 
                   --only-interval %(tmp_bam)s %(tmp_bed)s 
                   --log=%(outfile)s.log 
                   --output-filename-pattern=%(track)s.%%s'''
    P.run()
    os.unlink(tmp_bed)
    os.unlink(tmp_bam)
Code example #4
def plotFalsePositiveRates(infile, outfile):
    '''
    barplot the false positive rates across
    taxonomic levels
    '''
    R('''library(ggplot2)''')
    R('''dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")'''
      % infile)
    for i in [0, 1]:
        # specificity
        outf = P.snip(outfile, ".pdf") + ".%i.specificity.pdf" % i
        R('''plot1 <- ggplot(dat[dat$cutoff == %i,], aes(x=reorder(level, fp_rate), y = fp_rate, fill = track, stat = "identity"))'''
          % i)
        R('''plot2 <- plot1 + geom_bar(position = "dodge", stat="identity")''')
        R('''plot2 + scale_fill_manual(values = c("cadetblue", "slategray", "lightblue"))'''
          )
        R('''ggsave("%s")''' % outf)

        # sensitivity
        outf = P.snip(outfile, ".pdf") + ".%i.sensitivity.pdf" % i
        R('''plot1 <- ggplot(dat[dat$cutoff == %i,], aes(x=reorder(level, fp_rate), y = tp_rate, fill = track, stat = "identity"))'''
          % i)
        R('''plot2 <- plot1 + geom_bar(position = "dodge", stat="identity")''')
        R('''plot2 + scale_fill_manual(values = c("cadetblue", "slategray", "lightblue"))'''
          )
        R('''ggsave("%s")''' % outf)

    P.touch(outfile)
Code example #5
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = open('dupstats.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [x for x in open(
            statfile, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --index=track
                      --table=%(tablename)s 
                > %(outfile)s '''
    P.run()
Code example #6
def plotFalsePositiveRates(infile, outfile):
    '''
    barplot the false positive rates across
    taxonomic levels
    '''
    R('''library(ggplot2)''')
    R('''dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")''' %
      infile)
    for i in [0, 1]:
        # specificity
        outf = P.snip(outfile, ".pdf") + ".%i.specificity.pdf" % i
        R('''plot1 <- ggplot(dat[dat$cutoff == %i,], aes(x=reorder(level, fp_rate), y = fp_rate, fill = track, stat = "identity"))''' %
          i)
        R('''plot2 <- plot1 + geom_bar(position = "dodge", stat="identity")''')
        R('''plot2 + scale_fill_manual(values = c("cadetblue", "slategray", "lightblue"))''')
        R('''ggsave("%s")''' % outf)

        # sensitivity
        outf = P.snip(outfile, ".pdf") + ".%i.sensitivity.pdf" % i
        R('''plot1 <- ggplot(dat[dat$cutoff == %i,], aes(x=reorder(level, fp_rate), y = tp_rate, fill = track, stat = "identity"))''' %
          i)
        R('''plot2 <- plot1 + geom_bar(position = "dodge", stat="identity")''')
        R('''plot2 + scale_fill_manual(values = c("cadetblue", "slategray", "lightblue"))''')
        R('''ggsave("%s")''' % outf)

    P.touch(outfile)
Code example #7
def createConfigFiles(infile, outfile):
    '''
    create all of the relevant .ini files in each working
    directory in order to execute the transcript building
    '''
    # test options for cufflinks
    cuff_opts = P.snip(infile, ".log").split("_")
    cuff_options = []
    for opt in cuff_opts:
        if len(opt)>6: # not ideal to do by length but all I can think of at the moment
            cuff_options.append("--" + opt)
        else:
            cuff_options.append(opt)
    cuff_options = " ".join(cuff_options)
    
    options = PARAMS["cufflinks_options"]
    # directory for output config
    outdir = P.snip(infile, ".log")

    outf = open(os.path.join(outdir, "pipeline.ini"), "w")
    config_headers = []
    lines = []
    for line in open("pipeline.ini").readlines():
        lines.append(line)
        if line.find("[cufflinks]") != -1:
            outf.write( "[cufflinks]\n\n# general cufflinks options\n\noptions=%s %s   \n" % (options,cuff_options) )
        elif "[cufflinks]\n" in lines and "[cuffdiff\n]" not in lines:
            if line.find("options=") != -1:
                continue
            else:
                outf.write(line)
        else:
            outf.write(line)
    outf.close()
Code example #8
def runFeatureCounts(annotations_file,
                     bamfile,
                     outfile,
                     nthreads=4,
                     strand=2,
                     options=""):
    '''run feature counts on *annotations_file* with
    *bamfile*.

    If the bam-file is paired, paired-end counting
    is enabled and the bam file automatically sorted.
    '''

    # featureCounts cannot handle gzipped in or out files
    outfile = P.snip(outfile, ".gz")
    tmpdir = P.getTempDir()
    annotations_tmp = os.path.join(tmpdir, 'geneset.gtf')
    bam_tmp = os.path.join(tmpdir, os.path.basename(bamfile))

    # -p -B specifies count fragments rather than reads, and both
    # reads must map to the feature
    # for legacy reasons look at feature_counts_paired
    if BamTools.isPaired(bamfile):
        # select paired end mode, additional options
        paired_options = "-p -B"
        # remove .bam extension
        bam_prefix = P.snip(bam_tmp, ".bam")
        # sort by read name
        paired_processing = \
            """samtools 
                sort -@ %(nthreads)i -n %(bamfile)s %(bam_prefix)s; 
            checkpoint; """ % locals()
        bamfile = bam_tmp
    else:
        paired_options = ""
        paired_processing = ""

    job_threads = nthreads

    # AH: what is the -b option doing?
    statement = '''mkdir %(tmpdir)s;
                   zcat %(annotations_file)s > %(annotations_tmp)s;
                   checkpoint;
                   %(paired_processing)s
                   featureCounts %(options)s
                                 -T %(nthreads)i
                                 -s %(strand)s
                                 -b
                                 -a %(annotations_tmp)s
                                 %(paired_options)s
                                 -o %(outfile)s
                                 %(bamfile)s
                    >& %(outfile)s.log;
                    checkpoint;
                    gzip -f %(outfile)s;
                    checkpoint;
                    rm -rf %(tmpdir)s
    '''

    P.run()
Code example #9
def splitMultiAndSingleExonLincRna(infile, outfiles):
    '''
    pulls out the multi-exonic and the single exonic lincRNA transcripts
    from the lincrna.gtf.gz
    '''

    inf = gzip.open(infile)
    multi = gzip.open(P.snip(infile, ".gtf.gz") + ".multi_exon.gtf.gz", "w")
    single = gzip.open(P.snip(infile, ".gtf.gz") + ".single_exon.gtf.gz", "w")

    for entry in GTF.transcript_iterator(GTF.iterator(inf)):
        if len(entry) > 1:
            for exon in entry:
                multi.write("\t".join(map(str, [exon.contig, exon.source, exon.feature, exon.start, exon.end, ".", exon.strand, "."]))
                            + "\t" + exon.attributes + "\n")
        elif len(entry) == 1:
            for exon in entry:
                single.write("\t".join(map(str, [exon.contig, exon.source, exon.feature, exon.start, exon.end, ".", exon.strand, "."]))
                             + "\t" + exon.attributes + "\n")

    for outfile in outfiles:
        outf = P.snip(outfile, ".gz")
        if not os.path.exists(outfile):
            statement = '''gzip %(outf)s'''
            P.run()
Code example #10
def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"], dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile), ".coverage.load"):
            statement = """SELECT contig_id ave FROM
                           (SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print contigs
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
Code example #11
def splitMultiAndSingleExonLincRna(infile, outfiles):
    '''
    pulls out the multi-exonic and the single exonic lincRNA transcripts
    from the lincrna.gtf.gz
    '''

    inf = gzip.open(infile)
    multi = gzip.open(P.snip(infile, ".gtf.gz") + ".multi_exon.gtf.gz", "w")
    single = gzip.open(P.snip(infile, ".gtf.gz") + ".single_exon.gtf.gz", "w")

    for entry in GTF.transcript_iterator(GTF.iterator(inf)):
        if len(entry) > 1:
            for exon in entry:
                multi.write("\t".join(
                    map(str, [
                        exon.contig, exon.source, exon.feature, exon.start,
                        exon.end, ".", exon.strand, "."
                    ])) + "\t" + exon.attributes + "\n")
        elif len(entry) == 1:
            for exon in entry:
                single.write("\t".join(
                    map(str, [
                        exon.contig, exon.source, exon.feature, exon.start,
                        exon.end, ".", exon.strand, "."
                    ])) + "\t" + exon.attributes + "\n")

    for outfile in outfiles:
        outf = P.snip(outfile, ".gz")
        if not os.path.exists(outfile):
            statement = '''gzip %(outf)s'''
            P.run()
Code example #12
def buildUniformityOfCoverage(infiles, outfile):
    '''
    build matrix of coverage over contigs
    '''
    bam = infiles[0]
    track = P.snip(os.path.basename(bam), ".bam")
    tmp_bed = P.getTempFilename(".") + ".bed"
    tmp_bam = P.getTempFilename(".") + ".bam"

    # filter for mapped reads
    statement = '''cat %(bam)s | python %(scriptsdir)s/bam2bam.py --filter=mapped --log=/dev/null > %(tmp_bam)s
                   ; samtools index %(tmp_bam)s'''
    P.run()

    for infs in infiles[1:]:
        for inf in infs:
            if P.snip(inf, ".lengths.tsv") == track:
                length_file = inf

    statement = '''cat %(length_file)s | awk 'NR>1 {printf("%%s\\t0\\t%%s\\n", $1, $2)}' > %(tmp_bed)s'''
    P.run()

    statement = '''python %(scriptsdir)s/bam2peakshape.py 
                   --only-interval %(tmp_bam)s %(tmp_bed)s 
                   --log=%(outfile)s.log 
                   --output-filename-pattern=%(track)s.%%s'''
    P.run()
    os.unlink(tmp_bed)
    os.unlink(tmp_bam)
Code example #13
def createMAFAlignment(infiles, outfile):
    """
    Takes all .axt files in the input directory, filters them to remove
    files based on supplied regular expressions, converts to a single maf file
    using axtToMaf, filters maf alignments under a specified length.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_annotations_dir"],
                                  PARAMS_ANNOTATIONS["interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS_ANNOTATIONS["interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # concatenate axt files, then remove headers
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 "  -tPrefix=%(target_genome)s."
                 "  -qPrefix=%(query_genome)s."
                 "  %(tmpf1)s"
                 "  %(target_contigs)s"
                 "  %(query_contigs)s"
                 "  %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    filtered = PipelineLncRNA.filterMAF(tmpf2,
                                        outfile,
                                        removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    os.unlink(tmpf1)
    os.unlink(tmpf2)
    to_cluster = False
    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()
Code example #14
def loadPicardHistogram( infiles, outfile, suffix, column, pipeline_suffix = ".picard_stats" ):
    '''extract a histogram from a picard output file and load it into database.'''

    tablename = P.toTable( outfile )
    tname = "%s_%s" % (tablename, suffix)
    
    tname = P.snip( tname, "_metrics") + "_histogram"

    # some files might be missing
    xfiles = [ x for x in infiles if os.path.exists( "%s.%s" % (x, suffix) ) ]

    if len(xfiles) == 0: 
        E.warn ( "no files for %s" % tname )
        return
    
    header = ",".join( [P.snip( os.path.basename(x), pipeline_suffix) for x in xfiles ] )        
    filenames = " ".join( [ "%s.%s" % (x, suffix) for x in xfiles ] )

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest
    statement = """python %(scriptsdir)s/combine_tables.py
                      --regex-start="## HISTOGRAM"
                      --missing=0
                      --take=2
                   %(filenames)s
                | python %(scriptsdir)s/csv2db.py
                      --header=%(column)s,%(header)s
                      --replace-header
                      --index=track
                      --table=%(tname)s 
                >> %(outfile)s
                """
    
    P.run()
Code example #15
    def postprocess( self, infiles, outfile ):
        '''collect output data and postprocess.'''

        track = P.snip( os.path.basename(outfile), ".bam" )
        outf = P.snip( outfile, ".bam" )
        tmpdir = self.tmpdir_fastq

        strip_cmd, unique_cmd = "", ""

        if self.remove_non_unique:
            unique_cmd = '| python %%(scriptsdir)s/bam2bam.py --filter=unique --log=%(outfile)s.log' % locals()
            
        if self.strip_sequence:
            strip_cmd = '| python %%(scriptsdir)s/bam2bam.py --strip=sequence --log=%(outfile)s.log' % locals()

        statement = '''
                cp %(tmpdir)s/Log.std.out %(outfile)s.std.log;
                cp %(tmpdir)s/Log.final.out %(outfile)s.final.log;
                cp %(tmpdir)s/SJ.out.tab %(outfile)s.junctions;
                cat %(tmpdir)s/Log.out >> %(outfile)s.log;
                cp %(tmpdir)s/Log.progress.out %(outfile)s.progress;
                samtools view -uS %(tmpdir)s/%(track)s.sam
                %(unique_cmd)s
                %(strip_cmd)s
                | samtools sort - %(outf)s 2>>%(outfile)s.log; 
                samtools index %(outfile)s;''' % locals()

        return statement
Code example #16
def filterByCoverage(infiles, outfile):

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile),
                                                ".coverage.load"):
            statement = """SELECT contig_id ave FROM
                           (SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename,
                                                PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print contigs
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
Code example #17
    def build(self, infile):

        track = self.getTrack(infile)

        format = self.getFormat(infile)
        if format.endswith(".gz"):
            format = P.snip(format, ".gz")
        format = format.upper()

        # cortex_var only uses paired end information to
        # remove pcr duplicates
        if not self.checkPairs(infile):
            paired = "--se_list"
            reads = os.path.join(os.getcwd(), infile)

        elif len(self.checkPairs(infile)) > 1:
            paired = "--pe_list"
            read1 = infile
            format = P.snip(format, ".1")
            read2 = self.checkPairs(infile)[1]

        elif self.checkPairs(infile) == "interleaved":
            raise ValueError, "pipeline does not support file of type 'interleaved'"

        temp = P.getTempDir()
        read1_new = os.path.join(temp, P.snip(read1, ".1.gz"))
        read2_new = os.path.join(temp, P.snip(read2, ".2.gz"))

        # paired end list
        list1 = open("cortex_var.dir/read1.txt", "w")
        list2 = open("cortex_var.dir/read2.txt", "w")
        list1.write(read1_new + "\n")
        list2.write(read2_new + "\n")
        list1.close()
        list2.close()

        list1 = os.path.abspath("cortex_var.dir/read1.txt")
        list2 = os.path.abspath("cortex_var.dir/read2.txt")

        reads = ",".join([os.path.join(os.getcwd(), x) for x in [read1_new, read2_new]])
        statement = (
            """  gunzip -c %(read1)s > %(read1_new)s
                       ; gunzip -c %(read2)s > %(read2_new)s  
                       ; cd cortex_var.dir
                       ; %%(cortex_var_executable)s %(paired)s %(list1)s,%(list2)s 
                       --format %(format)s
                       --mem_height 15
                       --quality_score_threshold %%(cortex_var_qual_threshold)i 
                       --remove_pcr_duplicates 
                       --remove_low_coverage_supernodes %%(cortex_var_rm_low_coverage_supernodes)i
                       --sample_id %(track)s
                       --kmer_size %%(kmer)s
                       --dump_binary dump_binary.ctx
                       ; rm -rf %(temp)s
                    """
            % locals()
        )

        return statement
Code example #18
def loadContigLengths(infile, outfile):
    '''
    load contig lengths
    '''
    outname = P.snip(os.path.dirname(infile), ".dir") + "_" + P.snip(
        os.path.basename(infile), ".tsv") + ".load"
    P.load(infile, outname)
    P.touch(outfile)
Code example #19
def loadContigGCContent(infile, outfile):
    '''
    load contig GC content
    '''
    outname = P.snip(os.path.dirname(infile), ".dir") + \
        "_" + P.snip(os.path.basename(infile), ".tsv") + ".load"
    P.load(infile, outname, "--index=id")
    P.touch(outfile)
Code example #20
def loadContigLengths(infile, outfile):
    '''
    load contig lengths
    '''
    outname = P.snip(os.path.dirname(infile), ".dir") + \
        "_" + P.snip(os.path.basename(infile), ".tsv") + ".load"
    P.load(infile, outname, "--index=scaffold_name")
    P.touch(outfile)
Code example #21
def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [
            x for x in open(fn, "r").readlines()
            if not x.startswith("#") and x.strip()
        ]
        if first: outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):

        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

        tname = "%s_%s" % (tablename, suffix)

        statement = """python %(scriptsdir)s/combine_tables.py
                      --missing=0
                   %(filenames)s
                | python %(scriptsdir)s/csv2db.py
                      --header=%(column)s,%(header)s
                      --replace-header
                      --index=track
                      --table=%(tname)s 
                >> %(outfile)s
                """

        P.run()

    os.unlink(tmpfilename)
Code example #22
def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [
            x for x in open(fn, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):

        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

        tname = "%s_%s" % (tablename, suffix)

        statement = """python %(scriptsdir)s/combine_tables.py
                      --missing=0
                   %(filenames)s
                | python %(scriptsdir)s/csv2db.py
                      --header=%(column)s,%(header)s
                      --replace-header
                      --index=track
                      --table=%(tname)s 
                >> %(outfile)s
                """

        P.run()

    os.unlink(tmpfilename)
Code example #23
File: pipeline_rnaseqlncrna.py  Project: lesheng/cgat
def createMAFAlignment(infiles, outfile):
    """
    Takes all .axt files in the input directory, filters them to remove
    files based on supplied regular expressions, converts to a single maf file
    using axtToMaf, filters maf alignments under a specified length.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_annotations_dir"],
                                  PARAMS_ANNOTATIONS["interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS_ANNOTATIONS["interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # concatenate axt files, then remove headers
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 "  -tPrefix=%(target_genome)s."
                 "  -qPrefix=%(query_genome)s."
                 "  %(tmpf1)s"
                 "  %(target_contigs)s"
                 "  %(query_contigs)s"
                 "  %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    filtered = PipelineLncRNA.filterMAF(tmpf2, outfile, removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    os.unlink(tmpf1)
    os.unlink(tmpf2)
    to_cluster = False
    statement = ("gzip %(outfile)s;" " gzip %(removed)s")
    P.run()
Code example #24
    def preprocess(self, infile):
        '''
        fastq files need to be converted to fasta
        and pairs need to be merged
        '''

        mtype = None

        # check for paired end data either in the same file or in a separate file
        # for each read - will need to be gunzipped
        # check compression status
        if infile.endswith(".gz"):
            if len(self.checkPairs(
                    infile)) > 1:  # check for paired data in separate files
                read1 = infile
                read2 = self.checkPairs(infile)[1]
                temp = P.getTempDir()
            elif self.checkPairs == "interleaved":
                infile_new = os.path.join(temp, P.snip(infile, ".gz"))
                zippy = """gunzip -c %(infile)s > %(infile_new)s; """ % locals(
                )
        else:
            zippy = ""

        # only need to convert if the data are in fastq format
        if self.getFormat(infile).find("fastq") != -1 and len(
                self.checkPairs(infile)
        ) > 1:  # reads are fastq and paired in separate files
            mtype = "--merge"  # argument for conversion tool
        elif self.getFormat(infile).find("fastq") != -1 and self.checkPairs(
                infile
        ) == "interleaved":  # reads are fastq and in the same file
            mtype = "--paired"  # argument for conversion tool

        # requires a merge of the fastq files in to fasta format
        if mtype:  # the reads are paired end
            if mtype == "--merge":
                outf = P.snip(os.path.basename(read1), ".fastq.1.gz") + ".fa"

                # check if file exists - metaphlan also performs this preprocessing step
                if not os.path.exists(outf):
                    statement = '''python %%(scriptsdir)s/fastqs2fasta.py -a %(read1)s -b %(read2)s --log=%(read1)s.log > %(outf)s
                                ''' % locals()
                    P.run()
                else:
                    E.info("no need to create file %s - exists" % outf)

            elif mtype == "--paired":
                outf = P.snip(os.path.basename(infile_new), ".fastq") + ".fa"
                statement = '''%(zippy)s'''
                P.run()
                statement = '''fq2fa %(mtype)s %(infile_new)s %(outf)s
                             rm -rf %(temp)s''' % locals()
                P.run()
        else:
            statement = None
        return statement
Code example #25
File: pipeline_idr.py  Project: jmadzo/cgat
def reMergeBamfiles(infiles, sentinal):
    infiles = [P.snip(x, ".sentinal") + ".bam" for x in infiles]
    outfile = P.snip(sentinal, ".sentinal") + ".bam"
    bad_samples = PARAMS["options_to_remove"].split(",")

    to_merge = IDR.filterBadLibraries(infiles, bad_samples)

    IDR.mergeBams(to_merge, outfile)
    P.touch(sentinal)
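Several of the IDR pipeline tasks in this listing use empty ".sentinal" flag files as lightweight task targets: the real BAM filenames are derived by swapping the suffix, the merge is run on those, and the sentinel is touched on completion so the workflow can track task state without depending on the large merged output directly. A minimal sketch of that idiom, assuming only the suffix convention visible in the examples (helper names here are illustrative):

import os

def touch(path):
    # Rough equivalent of P.touch(): create or refresh an empty flag file.
    with open(path, "a"):
        os.utime(path, None)

def pool_bamfiles(infiles, sentinel, merge=lambda bams, out: None):
    # Sketch of the sentinel idiom: derive the real inputs/outputs from the
    # flag filenames, run the merge, then flag completion.
    bams = [f[:-len(".sentinal")] + ".bam" for f in infiles]
    outfile = sentinel[:-len(".sentinal")] + ".bam"
    merge(bams, outfile)   # stand-in for IDR.mergeBams
    touch(sentinel)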
Code example #26
def reMergeBamfiles(infiles, sentinal):
    infiles = [P.snip(x, ".sentinal") + ".bam" for x in infiles]
    outfile = P.snip(sentinal, ".sentinal") + ".bam"
    bad_samples = PARAMS["options_to_remove"].split(",")

    to_merge = IDR.filterBadLibraries(infiles, bad_samples)

    IDR.mergeBams(to_merge, outfile)
    P.touch(sentinal)
Code example #27
    def build(self, infile):

        track = self.getTrack(infile)

        format = self.getFormat(infile)
        if format.endswith(".gz"):
            format = P.snip(format, ".gz")
        format = format.upper()

        # cortex_var only uses paired end information to
        # remove pcr duplicates
        if not self.checkPairs(infile):
            paired = "--se_list"
            reads = os.path.join(os.getcwd(), infile)

        elif len(self.checkPairs(infile)) > 1:
            paired = "--pe_list"
            read1 = infile
            format = P.snip(format, ".1")
            read2 = self.checkPairs(infile)[1]

        elif self.checkPairs(infile) == "interleaved":
            raise ValueError, "pipeline does not support file of type 'interleaved'"

        temp = P.getTempDir()
        read1_new = os.path.join(temp, P.snip(read1, ".1.gz"))
        read2_new = os.path.join(temp, P.snip(read2, ".2.gz"))

        # paired end list
        list1 = open("cortex_var.dir/read1.txt", "w")
        list2 = open("cortex_var.dir/read2.txt", "w")
        list1.write(read1_new + "\n")
        list2.write(read2_new + "\n")
        list1.close()
        list2.close()

        list1 = os.path.abspath("cortex_var.dir/read1.txt")
        list2 = os.path.abspath("cortex_var.dir/read2.txt")

        reads = ",".join(
            [os.path.join(os.getcwd(), x) for x in [read1_new, read2_new]])
        statement = '''  gunzip -c %(read1)s > %(read1_new)s
                       ; gunzip -c %(read2)s > %(read2_new)s  
                       ; cd cortex_var.dir
                       ; %%(cortex_var_executable)s %(paired)s %(list1)s,%(list2)s 
                       --format %(format)s
                       --mem_height 15
                       --quality_score_threshold %%(cortex_var_qual_threshold)i 
                       --remove_pcr_duplicates 
                       --remove_low_coverage_supernodes %%(cortex_var_rm_low_coverage_supernodes)i
                       --sample_id %(track)s
                       --kmer_size %%(kmer)s
                       --dump_binary dump_binary.ctx
                       ; rm -rf %(temp)s
                    ''' % locals()

        return statement
Code example #28
def poolSampleBamfiles(infiles, sentinal):
    """
    Merge filtered sample files for each tissue
    """
    infiles = [P.snip(x, ".sentinal") + ".bam" for x in infiles]
    outfile = P.snip(sentinal, ".sentinal") + ".bam"

    IDR.mergeBams(infiles, outfile)

    P.touch(sentinal)
Code example #29
def buildSpeciesMap(infiles, outfile):
    '''
    build species map file for input into
    contigs2random_samples.py
    '''
    to_cluster = True
    bam = infiles[0]
    contig = [x for x in infiles[1] if P.snip(x, ".fa") == P.snip(bam, ".bam")][0]
    statement = ''' cat %(contig)s | python %(scriptsdir)s/bam2species_map.py -b %(bam)s --log=%(outfile)s.log > %(outfile)s'''
    P.run()
Code example #30
File: pipeline_idr.py  Project: jmadzo/cgat
def poolSampleBamfiles(infiles, sentinal):
    """
    Merge filtered sample files for each tissue
    """
    infiles = [P.snip(x, ".sentinal") + ".bam" for x in infiles]
    outfile = P.snip(sentinal, ".sentinal") + ".bam"

    IDR.mergeBams(infiles, outfile)

    P.touch(sentinal)
Code example #31
def mergeEffects(infiles, outfile):
    '''load transcript effects into single table.'''

    tablename = P.toTable(outfile)
    outf = open('effects.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".effects.gz")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in gzip.open(f, "r").readlines()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    statement = '''cat effects.txt |
                   python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
                       --index=transcript_id \
                       --table=%(tablename)s \
                   > %(outfile)s'''
    P.run()

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):

        outf = open('effects.' + suffix + '.txt', 'w')
        first = True
        for f in infiles:
            track = P.snip(os.path.basename(f), ".effects.gz")
            statfile = f + "." + suffix + ".gz"
            print(statfile)
            if not os.path.exists(statfile):
                E.warn("File %s missing" % statfile)
                continue
            lines = [x for x in gzip.open(statfile, "r").readlines()]
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
            first = False
            for i in range(1, len(lines)):
                outf.write("%s\t%s" % (track, lines[i]))
        outf.close()
        tmpfilename = outf.name

        statement = '''cat %(tmpfilename)s |
                       python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
                           --allow-empty
                           --index=transcript_id 
                           --table=%(tablename)s_%(suffix)s 
                           --ignore-column=seq_na
                           --ignore-column=seq_aa
                       >> %(outfile)s'''
        P.run()
Code example #32
    def preprocess(self, infile):
        '''
        fastq files need to be converted to fasta
        and pairs need to be merged
        '''

        mtype = None

        # check for paired end data either in the same file or in a separate file
        # for each read - will need to be gunzipped
        # check compression status
        if infile.endswith(".gz"):
            # check for paired data in separate files
            if len(self.checkPairs(infile)) > 1:
                read1 = infile
                read2 = self.checkPairs(infile)[1]
                temp = P.getTempDir()
            elif self.checkPairs == "interleaved":
                infile_new = os.path.join(temp, P.snip(infile, ".gz"))
                zippy = """gunzip -c %(infile)s > %(infile_new)s; """ % locals()
        else:
            zippy = ""

        # only need to convert if the data are in fastq format
        # reads are fastq and paired in separate files
        if self.getFormat(infile).find("fastq") != -1 and len(self.checkPairs(infile)) > 1:
            mtype = "--merge"  # argument for conversion tool
        # reads are fastq and in the same file
        elif self.getFormat(infile).find("fastq") != -1 and self.checkPairs(infile) == "interleaved":
            mtype = "--paired"  # argument for conversion tool

        # requires a merge of the fastq files in to fasta format
        if mtype:  # the reads are paired end
            if mtype == "--merge":
                outf = P.snip(os.path.basename(read1), ".fastq.1.gz") + ".fa"

                # check if file exists - metaphlan also performs this
                # preprocessing step
                if not os.path.exists(outf):
                    statement = '''python %%(scriptsdir)s/fastqs2fasta.py -a %(read1)s -b %(read2)s --log=%(read1)s.log > %(outf)s
                                ''' % locals()
                    P.run()
                else:
                    E.info("no need to create file %s - exists" % outf)

            elif mtype == "--paired":
                outf = P.snip(os.path.basename(infile_new), ".fastq") + ".fa"
                statement = '''%(zippy)s'''
                P.run()
                statement = '''fq2fa %(mtype)s %(infile_new)s %(outf)s
                             rm -rf %(temp)s''' % locals()
                P.run()
        else:
            statement = None
        return statement
Code example #33
def mergeEffects(infiles, outfile):
    '''load transcript effects into single table.'''

    tablename = P.toTable(outfile)
    outf = open('effects.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".effects.gz")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in gzip.open(f, "r").readlines()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    statement = '''cat effects.txt |
                   python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
                       --index=transcript_id \
                       --table=%(tablename)s \
                   > %(outfile)s'''
    P.run()

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):

        outf = open('effects.' + suffix + '.txt', 'w')
        first = True
        for f in infiles:
            track = P.snip(os.path.basename(f), ".effects.gz")
            statfile = f + "." + suffix + ".gz"
            print(statfile)
            if not os.path.exists(statfile):
                E.warn("File %s missing" % statfile)
                continue
            lines = [x for x in gzip.open(statfile, "r").readlines()]
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
            first = False
            for i in range(1, len(lines)):
                outf.write("%s\t%s" % (track, lines[i]))
        outf.close()
        tmpfilename = outf.name

        statement = '''cat %(tmpfilename)s |
                       python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
                           --allow-empty
                           --index=transcript_id 
                           --table=%(tablename)s_%(suffix)s 
                           --ignore-column=seq_na
                           --ignore-column=seq_aa
                       >> %(outfile)s'''
        P.run()
Code example #34
    def preprocess(self, infile):
        '''
        fastq files need to be converted to fasta
        and pairs need to be merged
        '''

        mtype = None

        # check for paired end data either in the same file or in a separate file
        # for each read - will need to be gunzipped
        # check compression status
        if infile.endswith(".gz"):
            if len(self.checkPairs(
                    infile)) > 1:  # check for paired data in separate files
                read1 = infile
                read2 = self.checkPairs(infile)[1]
                temp = P.getTempDir()
                read1_new = os.path.join(temp, P.snip(infile, ".gz"))
                read2_new = os.path.join(
                    temp, P.snip(self.checkPairs(infile)[1], ".gz"))
                zippy = """gunzip -c %(read1)s > %(read1_new)s
                       ; gunzip -c %(read2)s > %(read2_new)s; """ % locals()
            elif self.checkPairs == "interleaved":
                infile_new = os.path.join(temp, P.snip(infile, ".gz"))
                zippy = """gunzip -c %(infile)s > %(infile_new)s; """ % locals(
                )
        else:
            zippy = ""

        # only need to convert if the data are in fastq format
        if self.getFormat(infile).find("fastq") != -1 and len(
                self.checkPairs(infile)
        ) > 1:  # reads are fastq and paired in separate files
            mtype = "--merge"  # argument for conversion tool
        elif self.getFormat(infile).find("fastq") != -1 and self.checkPairs(
                infile
        ) == "interleaved":  # reads are fastq and in the same file
            mtype = "--paired"  # argument for conversion tool

        # build statement
        if mtype:  # the reads are paired end
            if mtype == "--merge":
                outf = P.snip(os.path.basename(read1_new), ".fastq.1") + ".fa"
                statement = '''%(zippy)s
                             fq2fa %(mtype)s %(read1_new)s %(read2_new)s %(outf)s
                             ''' % locals()
            elif mtype == "--paired":
                outf = P.snip(os.path.basename(infile_new), ".fastq") + ".fa"
                statement = '''%(zippy)s
                             fq2fa %(mtype)s %(infile_new)s %(outf)s
                             rm -rf %(temp)s''' % locals()
        else:
            statement = None
        return statement
Code example #35
def plotCoverageHistogram(infile, outfile):
    '''
    plot the coverage over kmers
    '''
    inf = P.snip(infile, ".contigs.fa") + ".stats.txt"
    outf = P.snip(inf, ".txt") + ".pdf"
    R('''library(plotrix)''')
    R('''data = read.table("%s", header=TRUE)''' % inf)
    R('''pdf("%s", height = 7, width = 7 )''' % outf)
    R('''weighted.hist(data$short1_cov, data$lgth, breaks=seq(0, 200, by=1))''')
    R["dev.off"]()
Code example #36
def alignmentTargets(genome_files, contig_files):
    '''
    generator object to produce filenames for 
    aligning contigs to known ncbi genomes
    '''
    parameters = []
    for genome, contig in itertools.product(genome_files, contig_files):
        outfile = os.path.join("alignment.dir", P.snip(
            contig, ".contigs.fa") + "_vs_" + P.snip(os.path.basename(genome), ".fna")) + ".delta"
        parameters.append([genome, outfile, contig])
    return parameters
Code example #37
def summarizeProcessing(infile, outfile):
    '''build processing summary.'''
    def _parseLog(inf, step):

        inputs, outputs = [], []
        if step == "reconcile":
            for line in inf:
                x = re.search(
                    "first pair: (\d+) reads, second pair: (\d+) reads, shared: (\d+) reads",
                    line)
                if x:
                    i1, i2, o = map(int, x.groups())
                    inputs = [i1, i2]
                    outputs = [o, o]
                    break
        elif step == "contaminants":
            lines = inf.readlines()
            assert lines[0].startswith("cutadapt")
            lines = "@@@".join(lines)
            for part in lines.split("cutadapt")[1:]:
                results, adapters = parseCutadapt(
                    ("cutadapt" + part).split("@@@"))
                inputs.append(results["processed_reads"])
                outputs.append(results["unchanged_reads"])
        else:
            for line in inf:
                if line.startswith("Input:"):
                    inputs.append(
                        int(re.match("Input: (\d+) reads.", line).groups()[0]))
                elif line.startswith("Output:"):
                    outputs.append(
                        int(
                            re.match("Output: (\d+) reads.",
                                     line).groups()[0]))

        return zip(inputs, outputs)

    infile2 = checkPairs(infile)
    if infile2:
        track = P.snip(infile, ".fastq.1.gz")
    else:
        track = P.snip(infile, ".fastq.gz")

    outf = IOTools.openFile(outfile, "w")
    outf.write("track\tstep\tpair\tinput\toutput\n")

    for step in "contaminants", "artifacts", "trim", "filter", "reconcile":
        fn = infile + "_%s.log" % step
        if not os.path.exists(fn):
            continue
        for x, v in enumerate(_parseLog(IOTools.openFile(fn), step)):
            outf.write("%s\t%s\t%i\t%i\t%i\n" % (track, step, x, v[0], v[1]))

    outf.close()
Code example #38
def splitPooledBamfiles(infile, sentinel):
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    params = '2'
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module, "splitBam", params, infile, outfile)

    P.touch(sentinel)
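The try/except around P.snip(IDR.__file__, ".py") handles the fact that a module's __file__ may end in either ".py" or ".pyc"; the resulting prefix is what P.submit() is given so that the named function ("splitBam", "findNPeaks") can be run from that module as a separate job. A small sketch of the same suffix handling, using a standard-library module purely for illustration (the helper name is assumed, not part of the CGAT API):

import json  # any imported module works for the demonstration

def module_prefix(mod):
    # Strip whichever source/bytecode suffix __file__ happens to carry.
    path = mod.__file__
    for suffix in (".py", ".pyc"):
        if path.endswith(suffix):
            return path[:-len(suffix)]
    raise ValueError("unexpected module file name: %s" % path)

print(module_prefix(json))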
Code example #39
File: pipeline_idr.py  Project: Charlie-George/cgat
def findNPeaksForIndividualReplicates(infiles, outfile):
    idr_thresh = PARAMS["idr_options_inter_replicate_threshold"]
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)
Code example #40
def buildSpeciesMap(infiles, outfile):
    '''
    build species map file for input into
    contigs2random_samples.py
    '''
    to_cluster = True
    bam = infiles[0]
    contig = [
        x for x in infiles[1] if P.snip(x, ".fa") == P.snip(bam, ".bam")
    ][0]
    statement = ''' cat %(contig)s | python %(scriptsdir)s/bam2species_map.py -b %(bam)s --log=%(outfile)s.log > %(outfile)s'''
    P.run()
Code example #41
File: pipeline_idr.py  Project: Charlie-George/cgat
def findNPeaksForPooledPseudoreplicates(infiles, outfile):
    idr_thresh = PARAMS["idr_options_pooled_consistency_threshold"]
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)
Code example #42
def summarizeProcessing(infile, outfile):
    '''build processing summary.'''

    def _parseLog(inf, step):

        inputs, outputs = [], []
        if step == "reconcile":
            for line in inf:
                x = re.search(
                    "first pair: (\d+) reads, second pair: (\d+) reads, shared: (\d+) reads", line)
                if x:
                    i1, i2, o = map(int, x.groups())
                    inputs = [i1, i2]
                    outputs = [o, o]
                    break
        elif step == "contaminants":
            lines = inf.readlines()
            assert lines[0].startswith("cutadapt")
            lines = "@@@".join(lines)
            for part in lines.split("cutadapt")[1:]:
                results, adapters = parseCutadapt(
                    ("cutadapt" + part).split("@@@"))
                inputs.append(results["processed_reads"])
                outputs.append(results["unchanged_reads"])
        else:
            for line in inf:
                if line.startswith("Input:"):
                    inputs.append(
                        int(re.match("Input: (\d+) reads.", line).groups()[0]))
                elif line.startswith("Output:"):
                    outputs.append(
                        int(re.match("Output: (\d+) reads.", line).groups()[0]))

        return zip(inputs, outputs)

    infile2 = checkPairs(infile)
    if infile2:
        track = P.snip(infile, ".fastq.1.gz")
    else:
        track = P.snip(infile, ".fastq.gz")

    outf = IOTools.openFile(outfile, "w")
    outf.write("track\tstep\tpair\tinput\toutput\n")

    for step in "contaminants", "artifacts", "trim", "filter", "reconcile":
        fn = infile + "_%s.log" % step
        if not os.path.exists(fn):
            continue
        for x, v in enumerate(_parseLog(IOTools.openFile(fn), step)):
            outf.write("%s\t%s\t%i\t%i\t%i\n" % (track, step, x, v[0], v[1]))

    outf.close()
Code example #43
def poolInputBamfiles(infiles, sentinal):
    """
    Merge filtered input files for each tissue, with the option of excluding
    undesirable libraries.
    """
    infiles = [P.snip(x, ".sentinal") + ".bam" for x in infiles]
    outfile = P.snip(sentinal, ".sentinal") + ".bam"
    bad_samples = PARAMS["filter_remove_inputs"].split(",")

    to_merge = IDR.filterBadLibraries(infiles, bad_samples)

    IDR.mergeBams(to_merge, outfile)
    P.touch(sentinal)
Code example #44
def loadLCA(infile, outfile):
    '''
    load LCA results
    '''
    tablename = P.snip(os.path.dirname(infile), ".dir") + \
        "_" + os.path.basename(P.snip(infile, ".gz"))
    tablename = P.toTable(tablename + ".load")
    statement = '''zcat %(infile)s | python %(scriptsdir)s/csv2db.py
                  -t %(tablename)s
                  --index=id
                  --log=%(outfile)s.log
                  > %(outfile)s'''
    P.run()
Code example #45
def alignmentTargets(genome_files, contig_files):
    '''
    generator object to produce filenames for 
    aligning contigs to known ncbi genomes
    '''
    parameters = []
    for genome, contig in itertools.product(genome_files, contig_files):
        outfile = os.path.join(
            "alignment.dir",
            P.snip(contig, ".contigs.fa") + "_vs_" +
            P.snip(os.path.basename(genome), ".fna")) + ".delta"
        parameters.append([genome, outfile, contig])
    return parameters
Code example #46
File: pipeline_idr.py  Project: jmadzo/cgat
def splitPooledBamfiles(infile, sentinal):
    infile = P.snip(infile, ".sentinal") + ".bam"
    outfile = P.snip(sentinal, ".sentinal")
    params = '2'
    module = P.snip(IDR.__file__, ".py")

    P.submit(module,
             "splitBam",
             params,
             infile,
             outfile)

    P.touch(sentinal)
Code example #47
def splitPooledBamfiles(infile, sentinal):
    infile = P.snip(infile, ".sentinal") + ".bam"
    outfile = P.snip(sentinal, ".sentinal")
    params = '2'
    module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "splitBam",
             params,
             infile,
             outfile)

    P.touch(sentinal)
Code example #48
def findNPeaksForPooledPseudoreplicates(infiles, outfile):
    idr_thresh = PARAMS["idr_options_pooled_consistency_threshold"]
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[
                 str(idr_thresh),
             ],
             infiles=infiles,
             outfiles=outfile)
Code example #49
def linkBamToWorkingDirs(infiles, outfile):
    '''
    symlink the bam file and index to the working directories
    for execution of the transcript building pipeline
    '''

    bamfile = P.snip(infiles[0], ".bai")
    indexfile = infiles[0]
    directories = [P.snip(logfile, ".log") for logfile in infiles[1]]

    for directory in directories:
        os.symlink(os.path.abspath(bamfile), os.path.join(directory, bamfile))
        os.symlink(os.path.abspath(indexfile), os.path.join(directory, indexfile))
    updateFile(outfile)
Code example #50
File: pipeline_windows.py  Project: pombredanne/cgat
def loadEdgeR( infile, outfile ):
    '''load EdgeR per-chunk summary stats.'''

    prefix = P.snip( outfile, ".load" )

    for fn in glob.glob( infile + "*_summary.tsv" ):
        prefix = P.snip(fn[len(infile)+1:], "_summary.tsv")

        P.load( fn, 
                prefix + ".deseq_summary.load", 
                collapse = 0,
                transpose = "sample")

    P.touch( outfile )
Code example #51
def findNPeaksForIndividualReplicates(infiles, outfile):
    idr_thresh = PARAMS["idr_options_inter_replicate_threshold"]
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[
                 str(idr_thresh),
             ],
             infiles=infiles,
             outfiles=outfile)
Code example #52
def loadCufflinks( infile, outfile ):
    '''load expression level measurements.'''

    track = P.snip( outfile, ".load" )
    P.load( infile + ".genes_tracking.gz",
            outfile = track + "_genefpkm.load",
            options = "--index=gene_id --ignore-column=tracking_id --ignore-column=class_code --ignore-column=nearest_ref_id" )

    track = P.snip( outfile, ".load" )
    P.load( infile + ".fpkm_tracking.gz",
            outfile = track + "_fpkm.load",
            options = "--index=tracking_id --ignore-column=nearest_ref_id --rename-column=tracking_id:transcript_id" )

    P.touch( outfile )
Code example #53
def compareAbundanceOfFalsePositiveSpecies(infiles, outfile):
    '''
    boxplot the relative abundance of false positive
    species compared to true positives
    '''
    tablename_estimate = P.toTable(infiles[0])

    track = P.snip(
        os.path.basename(infiles[0]).replace("metaphlan_", ""), ".load")
    tablename_true = [
        P.toTable(x) for x in infiles[1:]
        if P.snip(os.path.basename(x), ".load") == track
    ][0]
    dbh = sqlite3.connect("csvdb")
    cc = dbh.cursor()
    tmp = P.getTempFile(".")
    tmp.write("taxa\tabundance\tstatus\n")
    estimate = {}
    true = set()
    for data in cc.execute(
            """SELECT taxon, rel_abundance FROM %s WHERE taxon_level == 'species'"""
            % tablename_estimate).fetchall():
        estimate[data[0]] = data[1]
    for data in cc.execute("""SELECT taxa FROM %s WHERE level == 'species'""" %
                           tablename_true).fetchall():
        true.add(data[0])

    for taxa, abundance in estimate.iteritems():
        if taxa in true:
            tmp.write("%s\t%f\ttp\n" % (taxa, abundance))
        else:
            tmp.write("%s\t%f\tfp\n" % (taxa, abundance))
    tmp.close()

    inf = tmp.name
    if track.find("15M") != -1:
        col = "cadetblue"
    elif track.find("30M") != -1:
        col = "lightblue"
    elif track.find("50M") != -1:
        col = "slategray"

    R('''dat <- read.csv("%s", header = T, stringsAsFactors = F, sep = "\t")'''
      % inf)
    R('''library(ggplot2)''')
    R('''ggplot(dat, aes(x = status, y = log2(abundance))) + geom_boxplot(colour = "%s") + geom_hline(yintercept=0, linetype="dashed")'''
      % col)
    R('''ggsave("%s")''' % outfile)
    os.unlink(inf)
Code example #54
def linkBamToWorkingDirs(infiles, outfile):
    '''
    symlink the bam file and index to the working directories
    for execution of the transcript building pipeline
    '''

    bamfile = P.snip(infiles[0], ".bai")
    indexfile = infiles[0]
    directories = [P.snip(logfile, ".log") for logfile in infiles[1]]

    for directory in directories:
        os.symlink(os.path.abspath(bamfile), os.path.join(directory, bamfile))
        os.symlink(os.path.abspath(indexfile),
                   os.path.join(directory, indexfile))
    updateFile(outfile)
Code example #55
def splitBamfiles(infile, sentinel):
    """
    For all tracks, split the filtered bamfile in two using pysam
    """
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    params = '2'
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module, "splitBam", params, infile, outfile)

    P.touch(sentinel)