Code example #1
def runMutectReverse(infiles, outfile):
    '''Use control as tumor and vice versa to estimate the false positive rate'''
    infile, normal_panel = infiles
    infile_tumour = infile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])

    basename = P.snip(outfile, ".mutect.reverse.snp.vcf")
    call_stats_out = basename + "_call_stats.reverse.out"
    coverage_wig_out = basename + "_coverage.reverse.wig"
    mutect_log = basename + ".reverse.log"

    (cosmic, dbsnp, quality, max_alt_qual, max_alt,
     max_fraction, tumor_LOD) = (
         PARAMS["mutect_cosmic"], PARAMS["gatk_dbsnp"],
         PARAMS["mutect_quality"], PARAMS["mutect_max_alt_qual"],
         PARAMS["mutect_max_alt"], PARAMS["mutect_max_fraction"],
         PARAMS["mutect_LOD"])

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"],
                           PARAMS["genome"])

    PipelineExome.mutectSNPCaller(infile, outfile, mutect_log, genome,
                                  cosmic, dbsnp, call_stats_out,
                                  PARAMS['mutect_memory'],
                                  PARAMS['mutect_threads'],
                                  quality, max_alt_qual,
                                  max_alt, max_fraction, tumor_LOD,
                                  normal_panel, infile_tumour)
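
Many of the examples in this collection rely on P.snip (suffix removal) and on swapping the control/tumour sample label inside file paths. A minimal stand-alone sketch of that convention; the sample labels and file names below are hypothetical:

# Hypothetical sample labels and file names, purely for illustration.
example_params = {"sample_control": "Control", "sample_tumour": "Tumour"}

infile = "patient1-Control.realigned.bam"
infile_tumour = infile.replace(example_params["sample_control"],
                               example_params["sample_tumour"])
# -> "patient1-Tumour.realigned.bam"

outfile = "patient1.mutect.reverse.snp.vcf"
basename = outfile[:-len(".mutect.reverse.snp.vcf")]  # what P.snip() does here
call_stats_out = basename + "_call_stats.reverse.out"
mutect_log = basename + ".reverse.log"
print(infile_tumour, call_stats_out, mutect_log)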
Code example #2
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)

    outf = open('dupstats.txt', 'w')

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [x for x in open(
            statfile, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()
Code example #3
def flashReads(infiles, outfile, overlap):
    ''' Flashes read pairs'''
    
    job_memory = "2G"
    
    # Use one thread to maintain read order
    job_threads = 1
    
    # Retrieve the input file from [generateReadSamplesProduct, relocateReads]
    read1 = infiles[0]
    read2 = P.snip(read1, ".fastq.1.gz") + ".fastq.2.gz"

    outfilebase = P.snip(outfile, ".extendedFrags.fastq.gz")

    statement = '''flash -m %(overlap)s --interleaved-output %(read1)s
    %(read2)s -o %(outfilebase)s -z ;
    checkpoint;
    touch %(outfile)s'''
    # touch in case there are no flashed reads 
    # but the process still succeeds with no errors

    P.run()
Code example #4
def createMAFAlignment(infiles, outfile):
    """
    Takes all .axt files in the input directory, filters them to remove
    files based on supplied regular expressions, converts to a single maf file
    using axtToMaf, filters maf alignments under a specified length.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_dir"],
                                  PARAMS["annotations_interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS["annotations_interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # concatenate axt files, then remove headers
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 "  -tPrefix=%(target_genome)s."
                 "  -qPrefix=%(query_genome)s."
                 "  %(tmpf1)s"
                 "  %(target_contigs)s"
                 "  %(query_contigs)s"
                 "  %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    filtered = PipelineLncRNA.filterMAF(tmpf2,
                                        outfile,
                                        removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    os.unlink(tmpf1)
    os.unlink(tmpf2)
    to_cluster = False
    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()
Code example #5
def splitMultiAndSingleExonLincRna(infile, outfiles):
    '''
    pulls out the multi-exonic and the single exonic lincRNA transcripts
    from the lincrna.gtf.gz
    '''

    inf = gzip.open(infile)
    multi = gzip.open(P.snip(infile, ".gtf.gz") + ".multi_exon.gtf.gz", "w")
    single = gzip.open(P.snip(infile, ".gtf.gz") + ".single_exon.gtf.gz", "w")

    for entry in GTF.transcript_iterator(GTF.iterator(inf)):
        if len(entry) > 1:
            for exon in entry:
                multi.write(
                    "\t".join(map(str, [exon.contig, exon.source, exon.feature,
                                        exon.start, exon.end, ".", exon.strand,
                                        "."])) +
                    "\t" + exon.attributes + "\n")

        elif len(entry) == 1:
            for exon in entry:
                single.write(
                    "\t".join(map(str, [exon.contig, exon.source, exon.feature,
                                        exon.start, exon.end, ".",
                                        exon.strand, "."])) +
                    "\t" + exon.attributes + "\n")

    inf.close()
    multi.close()
    single.close()

    for outfile in outfiles:
        outf = P.snip(outfile, ".gz")
        if not os.path.exists(outfile):
            statement = '''gzip %(outf)s'''
            P.run()
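
The write calls above assemble standard 9-column GTF lines by hand. A small self-contained illustration of the same join, using a made-up exon record:

# Made-up exon record, mirroring the fields used in the writes above.
contig, source, feature = "chr1", "cufflinks", "exon"
start, end, strand = 11869, 12227, "+"
attributes = 'gene_id "LINC00001"; transcript_id "LINC00001.1";'

line = "\t".join(map(str, [contig, source, feature, start, end, ".",
                           strand, "."])) + "\t" + attributes + "\n"
print(line)  # nine tab-separated GTF columns, attributes in the last field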
Code example #6
def filterByCoverage(infiles, outfile):
    '''filter contigs from a fasta file, keeping only those whose
    average coverage exceeds the coverage_filter threshold.'''

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"], dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile), ".coverage.load"):
            statement = """SELECT contig_id, ave FROM
                           (SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename, PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    print(contigs)
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
Code example #7
def loadAlignmentStats(infiles, outfile):
    '''merge alignment stats into single tables.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [
            x for x in open(fn, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):

        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

        tname = "%s_%s" % (tablename, suffix)

        statement = """python %(scriptsdir)s/combine_tables.py
                      --missing-value=0
                   %(filenames)s
                | python %(scriptsdir)s/csv2db.py
                      --header-names=%(column)s,%(header)s
                      --replace-header
                      --add-index=track
                      --table=%(tname)s 
                >> %(outfile)s
                """

        P.run()

    os.unlink(tmpfilename)
Code example #8
File: pipeline_idr.py Project: gjaime/CGATPipelines
def reMergeBamfiles(infiles, sentinel):
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"
    bad_samples = PARAMS["options_to_remove"].split(",")

    to_merge = IDR.filterBadLibraries(infiles, bad_samples)

    IDR.mergeBams(to_merge, outfile)
    P.touch(sentinel)
Code example #9
File: pipeline_idr.py Project: gjaime/CGATPipelines
def poolSampleBamfiles(infiles, sentinel):
    """
    Merge filtered sample files for each tissue
    """
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    IDR.mergeBams(infiles, outfile)

    P.touch(sentinel)
Code example #10
File: pipeline_idr.py Project: gjaime/CGATPipelines
def findNPeaksForPooledPseudoreplicates(infiles, outfile):
    idr_thresh = PARAMS["idr_options_pooled_consistency_threshold"]
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)
Code example #11
File: pipeline_idr.py Project: gjaime/CGATPipelines
def findNPeaksForIndividualReplicates(infiles, outfile):
    idr_thresh = PARAMS["idr_options_inter_replicate_threshold"]
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[str(idr_thresh), ],
             infiles=infiles,
             outfiles=outfile)
Code example #12
def mergeEffects(infiles, outfile):
    '''load transcript effects into single table.'''

    tablename = P.toTable(outfile)
    outf = open('effects.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".effects.gz")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in gzip.open(f, "r").readlines()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    P.load("effects.txt",
           outfile,
           options="--add-index=transcript_id")

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):

        outf = open('effects.' + suffix + '.txt', 'w')
        first = True
        for f in infiles:
            track = P.snip(os.path.basename(f), ".effects.gz")
            statfile = f + "." + suffix + ".gz"
            print(statfile)
            if not os.path.exists(statfile):
                E.warn("File %s missing" % statfile)
                continue
            lines = [x for x in gzip.open(statfile, "r").readlines()]
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
            first = False
            for i in range(1, len(lines)):
                outf.write("%s\t%s" % (track, lines[i]))
        outf.close()
        tmpfilename = outf.name

        P.load(outf.name,
               outfile,
               tablename=tablename + "_" + suffix,
               options="--add-index=transcript_id "
               "--allow-empty-file "
               "--ignore-column=seq_na "
               "--ignore-column=seq_aa")
Code example #13
def linkBamToWorkingDirs(infiles, outfile):
    '''
    symlink the bam file and index to the working directories
    for execution of the transcript building pipeline
    '''

    bamfile = P.snip(infiles[0], ".bai")
    indexfile = infiles[0]
    directories = [P.snip(logfile, ".log") for logfile in infiles[1]]

    for directory in directories:
        os.symlink(os.path.abspath(bamfile), os.path.join(directory, bamfile))
        os.symlink(
            os.path.abspath(indexfile), os.path.join(directory, indexfile))
    updateFile(outfile)
Code example #14
def runMutectOnDownsampled(infiles, outfile):
    '''call somatic SNPs using MuTect on downsampled bams'''
    infile, normal_panel = infiles
    infile_tumour = infile.replace(
        PARAMS["sample_control"], PARAMS["sample_tumour"])
    basename = P.snip(outfile, "_normal_mutect.vcf")

    call_stats_out = basename + "_call_stats.out"
    mutect_log = basename + ".log"

    (cosmic, dbsnp, quality, max_alt_qual, max_alt,
     max_fraction, tumor_LOD) = (
         PARAMS["mutect_cosmic"], PARAMS["gatk_dbsnp"],
         PARAMS["mutect_quality"], PARAMS["mutect_max_alt_qual"],
         PARAMS["mutect_max_alt"], PARAMS["mutect_max_fraction"],
         PARAMS["mutect_LOD"])

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"],
                           PARAMS["genome"])

    PipelineExome.mutectSNPCaller(infile_tumour, outfile, mutect_log, genome,
                                  cosmic, dbsnp, call_stats_out,
                                  PARAMS['mutect_memory'], PARAMS['mutect_threads'],
                                  quality, max_alt_qual,
                                  max_alt, max_fraction, tumor_LOD,
                                  normal_panel, infile)
Code example #15
def mergeAndLoad(infiles, outfile, suffix):
    """load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise.

    """
    header = ",".join([P.tablequote(P.snip(x, suffix)) for x in infiles])
    if suffix.endswith(".gz"):
        filenames = " ".join(["<( zcat %s | cut -f 1,2 )" % x for x in infiles])
    else:
        filenames = " ".join(["<( cat %s | cut -f 1,2 )" % x for x in infiles])

    tablename = P.toTable(outfile)

    statement = """python %(scriptsdir)s/combine_tables.py
                      --header-names=%(header)s
                      --missing-value=0
                      --ignore-empty
                   %(filenames)s
                | perl -p -e "s/bin/track/" 
                | python %(scriptsdir)s/table2table.py --transpose
                | python %(scriptsdir)s/csv2db.py
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
            """
    P.run()
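
A minimal pure-Python sketch of the shape the merge-and-transpose statement above ends up producing, one row per input track; the tracks, bins and counts are invented:

# Two hypothetical two-column tables keyed by "bin"; after merging on the
# first column and transposing, each input file becomes one "track" row.
tables = {
    "sampleA": {"exon": 10, "intron": 4},
    "sampleB": {"exon": 7, "intron": 9},
}

bins = sorted({b for counts in tables.values() for b in counts})
print("track\t" + "\t".join(bins))
for track, counts in sorted(tables.items()):
    print(track + "\t" + "\t".join(str(counts.get(b, 0)) for b in bins))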
Code example #16
def runPCA(infile, outfile, rownames=1):
    '''
    run principal components analysis on a
    normalised matrix
    '''
#    ncol = len(open(infile).readline().strip("\n").split("\t"))
    # read in and format data
    R('''dat <- read.csv("%s",
                          header=T,
                          stringsAsFactors=F,
                          sep="\t",
                          row.names=%i)''' % (infile, rownames))
    # run PCA
    R('''pc.dat <- prcomp(as.matrix(t(dat)))''')

    # get scores
    R('''pc.dat.scores <- data.frame(pc.dat$x)''')
    R('''pc.dat.scores$sample <- rownames(pc.dat.scores)''')
    R('''pc.dat.scores <- pc.dat.scores[, c("sample", 
                                          colnames(pc.dat.scores)[1:ncol(pc.dat.scores)-1])]''')
    R('''write.table(pc.dat.scores,
                     file="%s",
                     sep="\t",
                     quote=F,
                     row.names=F)''' % outfile)

    # get the variance explained
    outf_ve = P.snip(outfile, ".tsv") + ".ve.tsv"
    R('''ve <- data.frame(summary(pc.dat)$importance)''')
    R('''ve <- ve[2,]''')
    R('''write.table(ve,
                     file="%s",
                     sep="\t",
                     quote=F,
                     row.names=F)''' % outf_ve)
Code example #17
def buildPicardAlignStats(infile, outfile):
    '''Gather BAM file alignment statistics using Picard '''
    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    statement = '''CollectAlignmentSummaryMetrics INPUT=%(infile)s REFERENCE_SEQUENCE=%%(samtools_genome)s ASSUME_SORTED=true OUTPUT=%(outfile)s VALIDATION_STRINGENCY=SILENT ''' % locals(
    )
    P.run()
Code example #18
def plotMDS(infile, outfile):
    '''
    perform multidimensional scaling of normalised
    counts
    '''
    outname_matrix = P.snip(outfile, ".pdf") + ".tsv"
    R('''library(gtools)''')
    R('''library(ggplot2)''')
    R('''dat <- read.csv("%s",
                         header = T,
                         stringsAsFactors = F,
                         sep = "\t")''' % infile)
    R('''rownames(dat) <- dat$taxa
         dat <- dat[,1:ncol(dat)-1]
         dat <- dat[, mixedsort(colnames(dat))]
         conds <- unlist(strsplit(colnames(dat),
                         ".R[0-9].*"))[seq(1, ncol(dat)*2, 2)]
         conds <- unlist(strsplit(conds, ".",
                         fixed = T))[seq(2, length(conds)*2, 2)]
         dat <- as.matrix(t(dat))
         dist <- dist(dat)
         ord1 <- cmdscale(dist)
         ord2 <- as.data.frame(ord1)
         ord2$cond <- conds
         plot1 <- ggplot(ord2, aes(x = V1, y = V2, colour = cond))
         plot2 <- plot1 + geom_point(size = 3)
         cols <- rainbow(length(unique(conds)))
         plot3 <- plot2 + scale_colour_manual(values = c(cols))
         ggsave("%s")''' % outfile)
Code example #19
def generateCoordinates(infile, outfile):
    fragments, probes = infile
    lookup = P.snip(outfile, "bed.gz") + "lookup.tsv"
    pipelineCaptCPerl.getProbeFragments(probes,
                                        fragments,
                                        outfile, 
                                        lookup)
Code example #20
def buildCheckSums(infile, outfile):
    '''build checksums for files in the build directory.

    Files are uncompressed before computing the checksum
    as gzip stores meta information such as the time stamp.
    '''

    track = P.snip(infile, ".log")

    suffixes = P.asList(PARAMS.get(
        '%s_suffixes' % track,
        PARAMS["suffixes"]))

    if len(suffixes) == 0:
        raise ValueError('no file types defined for test')

    regex_pattern = ".*\(%s\)" % "\|".join(suffixes)
    regex_pattern = pipes.quote(regex_pattern)

    # ignore log files as time stamps will
    # be different
    statement = '''find %(track)s.dir
    -type f
    -not -regex ".*.log"
    -regex %(regex_pattern)s
    -exec %(pipeline_scriptsdir)s/cgat_file_apply.sh {} md5sum \;
    | perl -p -e "s/ +/\\t/g"
    | sort -k1,1
    > %(outfile)s'''
    P.run()
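
A quick illustration of the find-style (emacs syntax) regular expression that buildCheckSums assembles; the suffix list is hypothetical and shlex.quote stands in for the pipes.quote call used above:

import shlex

# Hypothetical suffix list. find's default regex syntax needs the escaped
# \( \| \) grouping that the pipeline constructs here.
suffixes = ["gtf.gz", "bed.gz", "tsv"]
regex_pattern = ".*\\(%s\\)" % "\\|".join(suffixes)
print(regex_pattern)               # .*\(gtf.gz\|bed.gz\|tsv\)
print(shlex.quote(regex_pattern))  # quoted safely for the shell statement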
Code example #21
def collectFastQCSections(infiles, section, datadir):
    '''iterate over all fastqc files and extract a particular section.

    Arguments
    ---------
    infiles : list
        List of filenames with fastqc output (logging information). The
        track name is derived from that.
    section : string
        Section name to extract
    datadir : string
        Location of actual Fastqc output to be parsed.

    Returns
    -------
    results : list
        List of tuples, one tuple per input file. Each tuple contains
        track, status, header and data of `section`.

    '''
    results = []
    for infile in infiles:
        track = P.snip(os.path.basename(infile), ".fastqc")
        filename = os.path.join(datadir, track + "*_fastqc", "fastqc_data.txt")
        for fn in glob.glob(filename):
            for name, status, header, data in FastqcSectionIterator(
                    IOTools.openFile(fn)):
                if name == section:
                    results.append((track, status, header, data))
    return results
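
A hedged usage sketch for the return value: writing one extracted section out as a TSV keyed by track. The tracks and values are made up; only the tuple layout follows the docstring above:

# Assumes the (track, status, header, data) tuples documented above.
results = [
    ("sample1", "pass", "Base\tMean", "1\t32.1\n2\t33.0"),
    ("sample2", "warn", "Base\tMean", "1\t30.5\n2\t31.2"),
]

with open("per_base_quality.tsv", "w") as outf:
    outf.write("track\tstatus\t%s\n" % results[0][2])
    for track, status, header, data in results:
        for row in data.rstrip("\n").split("\n"):
            outf.write("%s\t%s\t%s\n" % (track, status, row))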
Code example #22
def buildLineCounts(infile, outfile):
    '''compute line counts.

    Files are uncompressed before computing the number of lines.
    '''

    track = P.snip(infile, ".log")

    suffixes = P.asList(PARAMS.get(
        '%s_suffixes' % track,
        PARAMS["suffixes"]))

    if len(suffixes) == 0:
        raise ValueError('no file types defined for test')

    regex_pattern = ".*\(%s\)" % "\|".join(suffixes)

    regex_pattern = pipes.quote(regex_pattern)

    # ignore log files as time stamps will
    # be different
    statement = '''find %(track)s.dir
    -type f
    -not -regex ".*.log"
    -regex %(regex_pattern)s
    -exec %(pipeline_scriptsdir)s/cgat_file_apply.sh {} wc -l \;
    | sort -k1,1
    > %(outfile)s'''
    P.run()
Code example #23
def GATKBaseRecal(infile, outfile, genome, dbsnp, solid_options=""):
    '''Recalibrates base quality scores using GATK'''

    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''GenomeAnalysisTK
                    -T BaseRecalibrator
                    --out %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -I %(infile)s
                    --knownSites %(dbsnp)s %(solid_options)s ;
                    checkpoint ;''' % locals()

    statement += '''GenomeAnalysisTK
                    -T PrintReads -o %(outfile)s
                    -BQSR %(tmpdir_gatk)s/%(track)s.recal.grp
                    -R %(genome)s
                    -I %(infile)s ;
                    checkpoint ;''' % locals()

    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()
    P.run()
Code example #24
def runTest(infile, outfile):
    '''run a test.

    Multiple targets are run iteratively.
    '''

    track = P.snip(outfile, ".log")

    pipeline_name = PARAMS.get(
        "%s_pipeline" % track,
        "pipeline_" + track[len("test_"):])

    pipeline_targets = P.asList(
        PARAMS.get("%s_target" % track,
                   "full"))

    # do not run on cluster, mirror
    # that a pipeline is started from
    # the head node
    to_cluster = False

    template_statement = '''
    (cd %%(track)s.dir;
    python %%(pipelinedir)s/%%(pipeline_name)s.py
    %%(pipeline_options)s make %s) >& %%(outfile)s
    '''
    if len(pipeline_targets) == 1:
        statement = template_statement % pipeline_targets[0]
        P.run(ignore_errors=True)
    else:
        statements = []
        for pipeline_target in pipeline_targets:
            statements.append(template_statement % pipeline_target)
        P.run(ignore_errors=True)
Code example #25
def mergeMeanTables(infiles, outfile):
    '''
    Collate and merge all separate tables into a single
    large table for all MZ and DZ twins
    '''

    job_memory = "300G"
    panel = outfile.split("/")[-1].split("-")[1]
    cell_type = outfile.split("/")[-1].split("mean_")[-1]
    cell_type = P.snip(cell_type, ".tsv")
    table_name = "_".join([cell_type, "mean"])
    out_dir = "/".join(outfile.split("/")[:-1])
    
    twin_id = "twin.id"

    statement = '''
    python /ifs/devel/projects/proj052/flow_pipeline/scripts/flow2twins.py
    --task=merge_flow
    --twin-id-column=%(twin_id)s
    --demographics-file=%(twins_demographics)s
    --demo-id-column=%(twins_demo_header)s
    --database=%(database)s
    --tablename=%(table_name)s
    --filter-gates="(F|S)SC-(A|H)"
    --filter-zero-arrays    
    --log=%(outfile)s.log
    --output-directory=%(out_dir)s
    --output-file-pattern=%(table_name)s
    '''

    P.run()
    P.touch(outfile)
Code example #26
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()

    os.unlink(tmpfilename)
Code example #27
def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    Transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis. Renaming is skipped
    if they already carry an Ensembl id; in that case the file is only
    sorted by gene.
    '''
    inf = IOTools.openFile(infile)
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            statement = '''zcat %(infile)s | grep -v "#"
                        | python %(scriptsdir)s/gtf2gtf.py
                        --method=sort --sort-order=gene
                        --log=%(outfile)s.log
                        | gzip > %(outfile)s'''
        else:
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN",
                                                      "TRAN")
            statement = '''
            zcat %(infile)s | python %(scriptsdir)s/gtf2gtf.py
            --method=renumber-genes
            --pattern-identifier=%(gene_pattern)s%%i
            | python %(scriptsdir)s/gtf2gtf.py
            --method=renumber-transcripts
            --pattern-identifier=%(transcript_pattern)s%%i
            | python %(scriptsdir)s/gtf2gtf.py
            --method=sort --sort-order=gene
            --log=%(outfile)s.log
            | gzip > %(outfile)s'''

    P.run()
Code example #28
def GATKReadGroups(infile, outfile, genome,
                   library="unknown", platform="Illumina",
                   platform_unit="1", track="unknown"):
    '''Reorders BAM according to reference fasta and adds read groups'''

    if track == 'unknown':
        track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.getTempDir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''ReorderSam
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    REFERENCE=%(genome)s
                    ALLOW_INCOMPLETE_DICT_CONCORDANCE=true
                    VALIDATION_STRINGENCY=SILENT ; checkpoint ;''' % locals()

    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ;
                    checkpoint ;''' % locals()

    statement += '''AddOrReplaceReadGroups
                    INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    OUTPUT=%(outfile)s
                    RGLB=%(library)s
                    RGPL=%(platform)s
                    RGPU=%(platform_unit)s
                    RGSM=%(track)s
                    VALIDATION_STRINGENCY=SILENT ; checkpoint ;''' % locals()

    statement += '''samtools index %(outfile)s ;
                    checkpoint ;''' % locals()
    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()

    P.run()
Code example #29
def calculateFalsePositiveRate(infiles, outfile):
    '''
    Calculate taxonomy false positive and true positive rates for each
    taxonomic level and abundance cutoff.
    '''
    # connect to database
    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()

    levels = ["phylum", "class", "order", "family", "genus", "species"]
    tablename_true = P.toTable(infiles[0])

    # get corresponding estimate file
    tablename_estimate = P.toTable(os.path.basename([inf for inf in infiles[
                                   1:] if os.path.basename(inf)[len("metaphlan_"):] == os.path.basename(infiles[0])][0]))

    outf = open(outfile, "w")
    track = P.snip(os.path.basename(infiles[0]), ".taxonomy.relab.load")
    for level in levels:
        for cutoff in [0, 1]:
            true_set = set()
            estimate_set = set()
            for taxa in cc.execute("""SELECT taxa FROM %s WHERE level == '%s' AND relab > %f""" % (tablename_true, level, float(cutoff) / 100)):
                true_set.add(taxa[0])
            for taxa in cc.execute("""SELECT taxon FROM %s WHERE taxon_level == '%s' AND rel_abundance > %f""" % (tablename_estimate, level, float(cutoff))):
                estimate_set.add(taxa[0])
            total_true = len(true_set)
            total_estimate = len(estimate_set)
            tp = true_set.intersection(estimate_set)
            fp = estimate_set.difference(true_set)

            fp_rate = float(len(fp)) / total_estimate
            tp_rate = float(len(tp)) / total_true
            outf.write("%s\t%f\t%f\t%s\t%s\n" %
                       (level, fp_rate, tp_rate, track, str(cutoff)))
    outf.close()
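
The rate calculation above boils down to two set operations; a tiny self-contained illustration with invented taxa:

# Made-up taxa, purely to illustrate the fp/tp rate arithmetic above.
true_set = {"Bacteroides", "Prevotella", "Escherichia"}
estimate_set = {"Bacteroides", "Prevotella", "Clostridium"}

tp = true_set.intersection(estimate_set)      # correctly recovered taxa
fp = estimate_set.difference(true_set)        # called but not truly present
fp_rate = float(len(fp)) / len(estimate_set)  # 1/3
tp_rate = float(len(tp)) / len(true_set)      # 2/3
print(fp_rate, tp_rate)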
Code example #30
def mapReadsWithBowtie(infiles, outfile, mismatches):
    ''' Aligns the digested reads to the genome '''
    
    # If mapping analysis is not defined, use user-specified parameters
    if (PARAMS["addtests_mapping"] == 0):
        align_command = PARAMS["bowtie_options"]
    else:
        align_command = "-v " + str(mismatches) + \
            " -p 1 -m 2 --best --strata --sam --chunkmb 256"

    # Obtain the reads
    reads = infiles[0]

    genome = PARAMS["environment_bowtiegenome"]

    job_memory = "4G"

    log_file = P.snip(outfile, ".sam") + ".log"

    statement = '''bowtie %(align_command)s
    --sam %(genome)s
    %(reads)s
    %(outfile)s 2> %(log_file)s'''
 
    P.run()
Code example #31
def loadBigWigStats(infiles, outfile):
    '''merge and load bigwig summary for all wiggle files.

    Summarise and merge bigwig files for all samples and load into a
    table called bigwig_stats

    Parameters
    ----------
    infiles : list
       Input filenames in :term:`bigwig` format
    outfile : string
        Output filename, the table name is derived from `outfile`.
    '''

    data = " ".join(
        ['<( bigWigInfo %s | perl -p -e "s/:/\\t/; s/ //g; s/,//g")' %
         x for x in infiles])
    headers = ",".join([P.snip(os.path.basename(x), ".bw")
                        for x in infiles])

    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=track")

    statement = '''cgat combine_tables
    --header-names=%(headers)s
    --skip-titles
    --missing-value=0
    --ignore-empty
    %(data)s
    | perl -p -e "s/bin/track/"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s
    '''

    P.run()
Code example #32
def exportSequencesFromBedFile(infile, outfile, masker=None, mode="intervals"):
    '''export sequences for intervals in :term:`bed`-formatted *infile* 
    to :term:`fasta` formatted *outfile*
    '''

    track = P.snip(infile, ".bed.gz")

    fasta = IndexedFasta.IndexedFasta(
        os.path.join(PARAMS["genome_dir"], PARAMS["genome"]))
    outs = IOTools.openFile(outfile, "w")

    ids, seqs = [], []
    for bed in Bed.setName(Bed.iterator(IOTools.openFile(infile))):
        lcontig = fasta.getLength(bed.contig)

        if mode == "intervals":
            seqs.append(fasta.getSequence(bed.contig, "+", bed.start, bed.end))
            ids.append("%s_%s %s:%i..%i" %
                       (track, bed.name, bed.contig, bed.start, bed.end))

        elif mode == "leftright":
            l = bed.end - bed.start

            start, end = max(0, bed.start - l), bed.end - l
            ids.append("%s_%s_l %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

            start, end = bed.start + l, min(lcontig, bed.end + l)
            ids.append("%s_%s_r %s:%i..%i" %
                       (track, bed.name, bed.contig, start, end))
            seqs.append(fasta.getSequence(bed.contig, "+", start, end))

    masked = maskSequences(seqs, masker)
    outs.write("\n".join([">%s\n%s" % (x, y) for x, y in zip(ids, masked)]))

    outs.close()
Code example #33
def mergeSummarizedContextStats(infiles, outfile, samples_in_columns=False):
    """combine output from :func:`summarizeTagsWithinContext`.

    Arguments
    ---------
    infiles : list
        List of filenames in :term:`tsv` format
    outfile : string
        Output filename in :term:`tsv` format.
    samples_in_columns :
        If True, put samples in columns. The default is to put them
        in rows.
    """

    header = ",".join([P.snip(os.path.basename(x), ".contextstats.tsv.gz")
                      for x in infiles])
    filenames = " ".join(infiles)

    if not samples_in_columns:
        transpose_cmd = \
            """| python %(scriptsdir)s/table2table.py
            --transpose""" % P.getParams()
    else:
        transpose_cmd = ""

    statement = """python %(scriptsdir)s/combine_tables.py
    --header-names=%(header)s
    --missing-value=0
    --skip-titles
    %(filenames)s
    | perl -p -e "s/bin/track/; s/\?/Q/g"
    %(transpose_cmd)s
    | gzip
    > %(outfile)s
    """

    P.run()
Code example #34
def downloadSequinsNeatData(outfile):
    ''' Download the neat Sequins data from NCBI'''

    address_base = 'ftp://ftp-trace.ncbi.nlm.nih.gov/sra/sra-instant/reads/ByExp/sra/SRX/SRX189'

    outfile2srr = {
        'neat-A.fastq.1.gz': 'SRR3743147',
        'neat-B.fastq.1.gz': 'SRR3743148'
    }

    srr2srx = {'SRR3743147': 'SRX1897294', 'SRR3743148': 'SRX1897295'}

    outfile_base = os.path.basename(outfile)

    srr = outfile2srr[outfile_base]
    srx = srr2srx[srr]

    outfile_name = P.snip(outfile_base, '.fastq.1.gz')

    statement = '''
    wget %(address_base)s/%(srx)s/%(srr)s/%(srr)s.sra
    -O %(outfile_name)s.sra
    '''
    P.run()

    outdir = os.path.dirname(outfile)
    statement = Sra.extract(outfile_name + '.sra', outdir)
    P.run()

    statement = '''
    mv %(outdir)s/%(outfile_name)s_1.fastq.gz
    %(outdir)s/%(outfile_name)s.fastq.1.gz; checkpoint;
    mv %(outdir)s/%(outfile_name)s_2.fastq.gz
    %(outdir)s/%(outfile_name)s.fastq.2.gz'''
    P.run()

    os.unlink(outfile_name + '.sra')
Code example #35
def GATKReadGroups(infile, outfile, genome,
                   library="unknown", platform="Illumina",
                   platform_unit="1", track="unknown"):
    '''Reorders BAM according to reference fasta and adds read groups.

    To see the read group information for a BAM file, use:
    samtools view -H sample.bam | grep '@RG' (no RG currently available).
    '''

    if track == 'unknown':
        track = P.snip(os.path.basename(infile), ".dedup.bam")
    tmpdir_gatk = P.getTempDir('.')
    job_options = getGATKOptions()
    job_threads = 3

    statement = '''picard ReorderSam
                    INPUT=%(infile)s
                    OUTPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    REFERENCE=%(genome)s
                    ALLOW_INCOMPLETE_DICT_CONCORDANCE=true
                    VALIDATION_STRINGENCY=SILENT ; checkpoint ;''' % locals()

    statement += '''samtools index %(tmpdir_gatk)s/%(track)s.reordered.bam ;
                    checkpoint ;''' % locals()

    statement += '''picard AddOrReplaceReadGroups
                    INPUT=%(tmpdir_gatk)s/%(track)s.reordered.bam
                    OUTPUT=%(outfile)s
                    RGLB=%(library)s
                    RGPL=%(platform)s
                    RGPU=%(platform_unit)s
                    RGSM=%(track)s
                    VALIDATION_STRINGENCY=SILENT ; checkpoint ;''' % locals()

    statement += '''samtools index %(outfile)s ;
                    checkpoint ;''' % locals()
    statement += '''rm -rf %(tmpdir_gatk)s ;''' % locals()

    P.run()
Code example #36
def variantRecalibrator(infile, outfile, genome, mode, dbsnp=None,
                        kgenomes=None, hapmap=None, omni=None, mills=None, track=None):
    '''Create variant recalibration file'''
    job_options = getGATKOptions()
    job_threads = 3

    track = P.snip(outfile, ".recal")
    if mode == 'SNP':
        statement = '''GenomeAnalysisTK -T VariantRecalibrator
        -R %(genome)s
        -input %(infile)s
        -resource:hapmap,known=false,training=true,truth=true,prior=15.0 %(hapmap)s
        -resource:omni,known=false,training=true,truth=true,prior=12.0 %(omni)s
        -resource:dbsnp,known=true,training=false,truth=false,prior=2.0 %(dbsnp)s
        -resource:1000G,known=false,training=true,truth=false,prior=10.0 %(kgenomes)s
        -an QD -an SOR -an MQRankSum
        -an ReadPosRankSum -an FS -an MQ
        --maxGaussians 4
        -mode %(mode)s
        -recalFile %(outfile)s
        -tranchesFile %(track)s.tranches
        -rscriptFile %(track)s.plots.R ''' % locals()
        P.run()
    elif mode == 'INDEL':
        statement = '''GenomeAnalysisTK -T VariantRecalibrator
        -R %(genome)s
        -input %(infile)s
        -resource:mills,known=true,training=true,truth=true,prior=12.0 %(mills)s
        -an QD -an MQRankSum
        -an ReadPosRankSum -an FS -an MQ
        --maxGaussians 4
        --minNumBadVariants 5000
        -mode %(mode)s
        -recalFile %(outfile)s
        -tranchesFile %(track)s.tranches
        -rscriptFile %(track)s.plots.R ''' % locals()
        P.run()
Code example #37
def buildLineCounts(infile, outfile):
    '''compute line counts.

    Files are uncompressed before computing the number of lines.
    '''

    track = P.snip(infile, ".log")

    suffixes = P.asList(PARAMS.get('%s_suffixes' % track, PARAMS["suffixes"]))

    if len(suffixes) == 0:
        raise ValueError('no file types defined for test')

    regex_pattern = ".*\(%s\)" % "\|".join(suffixes)

    regex_pattern = pipes.quote(regex_pattern)

    statement = '''find %(track)s.dir
    -type f
    -regex %(regex_pattern)s
    -exec %(pipeline_scriptsdir)s/cgat_file_apply.sh {} wc -l \;
    | sort -k1,1
    > %(outfile)s'''
    P.run()
Code example #38
def runMutectOnDownsampled(infiles, outfile):
    '''call somatic SNPs using MuTect on downsampled bams'''
    infile, normal_panel = infiles
    infile_tumour = infile.replace(PARAMS["sample_control"],
                                   PARAMS["sample_tumour"])
    basename = P.snip(outfile, "_normal_mutect.vcf")

    call_stats_out = basename + "_call_stats.out"
    mutect_log = basename + ".log"

    (cosmic, dbsnp, quality, max_alt_qual, max_alt, max_fraction,
     tumor_LOD) = (PARAMS["mutect_cosmic"], PARAMS["gatk_dbsnp"],
                   PARAMS["mutect_quality"], PARAMS["mutect_max_alt_qual"],
                   PARAMS["mutect_max_alt"], PARAMS["mutect_max_fraction"],
                   PARAMS["mutect_LOD"])

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    PipelineExome.mutectSNPCaller(infile_tumour, outfile, mutect_log, genome,
                                  cosmic, dbsnp, call_stats_out,
                                  PARAMS['mutect_memory'],
                                  PARAMS['mutect_threads'], quality,
                                  max_alt_qual, max_alt, max_fraction,
                                  tumor_LOD, normal_panel, infile)
Code example #39
def loadMissedReadCounts(infiles, outfile):
    '''load summary table of numbers of missed reads.'''

    def _getlines(inf):
        return len(IOTools.openFile(inf).readlines()) - 1

    tmpfile = P.getTempFile()

    infiles = sorted(infiles)

    tmpfile.write(
        "track\tmapped_genome\tmissed_junctions\tmissed_transcriptome\n")

    for x in range(0, len(infiles), 2):
        junctions, transcriptome = infiles[x], infiles[x + 1]
        track = P.snip(junctions, ".missed_junctions.gz")
        mapped_genome = _getlines(track + ".mapped_reads.gz")
        tmpfile.write("%s\t%i\t%i\t%i\n" % (track,
                                            mapped_genome,
                                            _getlines(junctions),
                                            _getlines(transcriptome)))
    tmpfile.close()
    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)
Code example #40
def collateTaxonBioProjectNumbers(infiles, outfile):
    '''Parse the XML files, and output the project IDs'''

    outf = IOTools.openFile(outfile, 'w')
    outf.write('bioproject\ttaxon_id\ttaxon_name\n')
    outf_failed = IOTools.openFile(P.snip(outfile, '.tsv') + '_discarded.tsv', 'w')
    
    for infile in infiles:
        inf = os.path.basename(infile)[:-len('.xml')]
        tax_id = inf.split('_').pop()

        inf = IOTools.openFile(infile).readlines()
        if len(inf) == 0:
            L.warn('No genbank accession for taxon: %s' % infile)
            outf_failed.write(infile + '\n')
            continue

        name = None
        project = None
        for line in inf:
            line = line.lstrip().strip()
            if line.startswith('DBLINK'):
                assert line.split()[1] == 'BioProject:'
                project = line.split()[2]
            if line.startswith('ORGANISM'):
                name = ' '.join(line.split()[1:])

        if not project:
            L.warn('No genbank accession for taxon: %s' % infile)
            outf_failed.write(infile + '\n')
            continue

        outf.write('\t'.join([project, tax_id, name]) + '\n')

    outf.close()
    outf_failed.close()
Code example #41
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    # Load into database
    P.load(outf.name,
           outfile,
           options="--add-index=track")
    os.unlink(outf.name)
Code example #42
def buildContigBed(infile, outfile):
    '''
    Gets the contig sizes and co-ordinates from an indexed genome :term:`fasta`
    file and outputs them to :term:`BED` format

    Parameters
    ----------
    infile : str
      infile is constructed from `PARAMS` variable to retrieve
      the `genome` :term:`fasta` file

    Returns
    -------
    outfile : str
      :term:`BED` format file containing contig name, value (0) and contig size
      in nucleotides.  The output file name is defined in
      `PARAMS: interface_contigs_bed`
    '''
    prefix = P.snip(infile, ".fasta")
    fasta = IndexedFasta.IndexedFasta(prefix)
    outs = IOTools.openFile(outfile, "w")

    for contig, size in fasta.getContigSizes(with_synonyms=False).items():
        outs.write("%s\t%i\t%i\n" % (contig, 0, size))

    outs.close()
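
As an alternative sketch, the same three-column BED can be derived from a samtools .fai index, whose first two columns are contig name and length; the index content below is made up:

# Made-up .fai lines (name, length, offset, linebases, linewidth).
fai_lines = [
    "chr1\t248956422\t112\t70\t71",
    "chr2\t242193529\t252513167\t70\t71",
]

with open("contigs.bed", "w") as outs:
    for line in fai_lines:
        contig, size = line.split("\t")[:2]
        outs.write("%s\t0\t%s\n" % (contig, size))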
Code example #43
def loadSummarizedContextStats(infiles,
                               outfile,
                               suffix=".contextstats.tsv.gz"):
    """merge output from :func:`summarizeTagsWithinContext` and load into database.

    Arguments
    ---------
    infiles : list
        List of filenames in :term:`tsv` format. The files should end
        in suffix.
    outfile : string
        Output filename, the table name is derived from `outfile`.
    suffix : string
        Suffix to remove from filename for track name.

    """

    header = ",".join([P.snip(os.path.basename(x), suffix)
                       for x in infiles])
    filenames = " ".join(infiles)

    load_statement = P.build_load_statement(
        P.toTable(outfile),
        options="--add-index=track")

    statement = """cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --skip-titles
    %(filenames)s
    | perl -p -e "s/bin/track/; s/\?/Q/g"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s
    """
    P.run()
Code example #44
def runPCA(infile, outfile, rownames=1):
    '''
    run principal components analysis on a
    normalised matrix
    '''
    #    ncol = len(open(infile).readline().strip("\n").split("\t"))
    # read in and format data
    R('''dat <- read.csv("%s",
                          header=T,
                          stringsAsFactors=F,
                          sep="\t",
                          row.names=%i)''' % (infile, rownames))
    # run PCA
    R('''pc.dat <- prcomp(as.matrix(t(dat)))''')

    # get scores
    R('''pc.dat.scores <- data.frame(pc.dat$x)''')
    R('''pc.dat.scores$sample <- rownames(pc.dat.scores)''')
    R('''pc.dat.scores <- pc.dat.scores[, c("sample", 
                                          colnames(pc.dat.scores)[1:ncol(pc.dat.scores)-1])]'''
      )
    R('''write.table(pc.dat.scores,
                     file="%s",
                     sep="\t",
                     quote=F,
                     row.names=F)''' % outfile)

    # get the variance explained
    outf_ve = P.snip(outfile, ".tsv") + ".ve.tsv"
    R('''ve <- data.frame(summary(pc.dat)$importance)''')
    R('''ve <- ve[2,]''')
    R('''write.table(ve,
                     file="%s",
                     sep="\t",
                     quote=F,
                     row.names=F)''' % outf_ve)
Code example #45
def STARmap(infile, outfile):
    ''' maps non-repetitive elements to genome '''
    outprefix = P.snip(outfile, ".bam")
    job_threads = PARAMS["STARmap_threads"]
    statement = '''
    STAR  --runMode alignReads
    --runThreadN %(job_threads)i
    --genomeDir %(STARmap_genome)s
    --readFilesIn %(infile)s
    --outSAMunmapped Within
    --outFilterMultimapNmax 1
    --outFilterMultimapScoreRange 1
    --outFileNamePrefix %(outprefix)s
    --outSAMattributes All
    --outStd BAM_Unsorted
    --outSAMtype BAM Unsorted
    --outFilterType BySJout
    --outReadsUnmapped Fastx
    --outFilterScoreMin 10
    --outSAMattrRGline ID:foo
    --alignEndsType Local
    > %(outfile)s 
    '''
    P.run()
Code example #46
def loadPicardHistogram(infiles,
                        outfile,
                        suffix,
                        column,
                        pipeline_suffix=".picard_stats",
                        tablename=False):
    '''extract a histogram from a picard output file and load
    it into database.

    Arguments
    ---------
    infiles : string
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    column : string
        Column name to take from the histogram.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)
        tablename = tablename.replace("_metrics", "_histogram")

    # some files might be missing
    xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

    if len(xfiles) == 0:
        E.warn("no files for %s" % tablename)
        return

    header = ",".join(
        [P.snip(os.path.basename(x), pipeline_suffix) for x in xfiles])
    filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

    # there might be a variable number of columns in the tables
    # only take the first ignoring the rest

    load_statement = P.build_load_statement(tablename,
                                            options="--add-index=track "
                                            " --header-names=%s,%s"
                                            " --allow-empty-file"
                                            " --replace-header" %
                                            (column, header))

    statement = """cgat combine_tables
    --regex-start="## HISTOGRAM"
    --missing-value=0
    --take=2
    %(filenames)s
    | %(load_statement)s
    >> %(outfile)s
    """

    P.run()
Code example #47
def buildCodingPotential(infile, outfile):
    '''run CPC analysis as in the cpc script.

    This module runs framefinder and blastx on both strands.
    It seems to work, but I have not thoroughly tested it.
    I expect that the false positive rate increases (i.e.,
    predicting non-coding as coding) in cases where the best
    framefinder match and the best blast match are on opposite
    strands. In the original CPC, these would be separated.
    '''

    try:
        cpc_dir = os.environ["CPC_HOME"]
    except KeyError:
        raise ValueError("CPC_HOME environment variable is not set. ")

    tmpdir = P.getTempDir(".")
    track = P.snip(outfile, ".coding.gz")

    # extract features for frame finder
    # replaces extract_framefinder_feats.pl to parse both strands
    with open(os.path.join(tmpdir, "ff.feat"), "w") as outf:
        outf.write("\t".join(("QueryID", "CDSLength", "Score", "Used",
                              "Strict")) + "\n")
        for line in IOTools.openFile("%s.frame.gz" % track):
            if line.startswith(">"):
                try:
                    (id, start, end, score, used, mode, tpe) = \
                        re.match(
                            ">(\S+).*framefinder \((\d+),(\d+)\) score=(\S+) used=(\S+)% \{(\S+),(\w+)\}", line).groups()
                except AttributeError:
                    raise ValueError("parsing error in line %s" % line)
                length = int(end) - int(start) + 1
                strict = int(tpe == "strict")
                outf.write("\t".join((id, str(length), score, used,
                                      str(strict))) + "\n")

    to_cluster = USECLUSTER

    # extract features and prepare svm data
    s = []

    s.append('''
    zcat %(infile)s
    | perl %(cpc_dir)s/libs/blast2table.pl
    | tee %(tmpdir)s/blastx.table
    | perl %(cpc_dir)s/bin/extract_blastx_features.pl
    > %(tmpdir)s/blastx.feat1;
    ''')

    s.append('''
    cat %(track)s_norepeats.fasta
    | perl %(cpc_dir)s/bin/add_missing_entries.pl
       %(tmpdir)s/blastx.feat1
    > %(tmpdir)s/blastx.feat;
    ''')

    # step 2 - prepare data
    s.append('''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,4,6 NA NA %(tmpdir)s/blastx.feat
    > %(tmpdir)s/blastx.lsv;
    ''')

    s.append('''
    perl %(cpc_dir)s/bin/feat2libsvm.pl -c 2,3,4,5 NA NA %(tmpdir)s/ff.feat
    > %(tmpdir)s/ff.lsv;
    ''')

    s.append('''
    perl -w %(cpc_dir)s/bin/lsv_cbind.pl %(tmpdir)s/blastx.lsv %(tmpdir)s/ff.lsv
    > %(tmpdir)s/test.lsv;
    ''')

    s.append('''
    %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-scale
               -r %(cpc_dir)s/data/libsvm.range
               %(tmpdir)s/test.lsv
    > %(tmpdir)s/test.lsv.scaled;
    ''')

    # step 3: prediction
    m_libsvm_model0 = os.path.join(cpc_dir, "data/libsvm.model0")  # standard
    m_libsvm_model = os.path.join(cpc_dir, "data/libsvm.model")  # Prob
    m_libsvm_model2 = os.path.join(
        cpc_dir, "data/libsvm.model2")  # Prob + weighted version
    m_libsvm_range = os.path.join(cpc_dir, "data/libsvm.range")

    s.append('''
               %(cpc_dir)s/libs/libsvm/libsvm-2.81/svm-predict2
               %(tmpdir)s/test.lsv.scaled
               %(m_libsvm_model0)s
               %(tmpdir)s/test.svm0.predict
    > %(tmpdir)s/test.svm0.stdout 2> %(tmpdir)s/test.svm0.stderr;
    ''')

    s.append('''
    printf "gene_id\\tlength\\tresult\\tvalue\\n"
    | gzip > %(outfile)s;
    cat %(tmpdir)s/test.svm0.predict
    | perl -w %(cpc_dir)s/bin/predict.pl %(track)s_norepeats.fasta
    | gzip >> %(outfile)s;
    ''')

    # generate reports
    s.append('''cat %(tmpdir)s/blastx.feat
    | perl -w %(cpc_dir)s/bin/generate_plot_features.pl %(tmpdir)s/blastx.table <( zcat %(track)s.frame.gz)
    | perl -w %(cpc_dir)s/bin/split_plot_features_by_type.pl %(outfile)s.homology %(outfile)s.orf;
    gzip %(outfile)s.orf %(outfile)s.homology;
    ''')

    # now run it all
    statement = " checkpoint; ".join(s)
    P.run()

    # clean up
    shutil.rmtree(tmpdir)
Code example #48
def loadBAMStats(infiles, outfile):
    '''load output of :func:`buildBAMStats` into database.

    Arguments
    ---------
    infiles : string
        Input files, output from :func:`buildBAMStats`.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''

    header = ",".join([P.snip(os.path.basename(x), ".readstats")
                       for x in infiles])
    filenames = " ".join(["<( cut -f 1,2 < %s)" % x for x in infiles])
    tablename = P.toTable(outfile)

    load_statement = P.build_load_statement(
        tablename,
        options="--add-index=track "
        " --allow-empty-file")

    E.info("loading bam stats - summary")
    statement = """cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --ignore-empty
    %(filenames)s
    | perl -p -e "s/bin/track/"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s"""
    P.run()

    for suffix in ("nm", "nh"):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])

        load_statement = P.build_load_statement(
            "%s_%s" % (tablename, suffix),
            options="--allow-empty-file")

        statement = """cgat combine_tables
        --header-names=%(header)s
        --skip-titles
        --missing-value=0
        --ignore-empty
        %(filenames)s
        | perl -p -e "s/bin/%(suffix)s/"
        | %(load_statement)s
        >> %(outfile)s """
        P.run()

    # load mapping qualities, there are two columns per row
    # 'all_reads' and 'filtered_reads'
    # Here, only filtered_reads are used (--take=3)
    for suffix in ("mapq",):
        E.info("loading bam stats - %s" % suffix)
        filenames = " ".join(["%s.%s" % (x, suffix) for x in infiles])

        load_statement = P.build_load_statement(
            "%s_%s" % (tablename, suffix),
            options=" --allow-empty-file")

        statement = """cgat combine_tables
        --header-names=%(header)s
        --skip-titles
        --missing-value=0
        --ignore-empty
        --take=3
        %(filenames)s
        | perl -p -e "s/bin/%(suffix)s/"
        | %(load_statement)s
        >> %(outfile)s """
        P.run()
Code example #49
# load options from the config file
P.get_parameters(["pipeline.yml"])

PARAMS = P.PARAMS

###################################################
###################################################
###################################################

INFILES = glob.glob("*.fastq.1.gz")
OUTFILES = ["filtered.dir/" + x for x in INFILES]

# check the existence of paired files
if PARAMS["paired"] == 1:
    for infile in INFILES:
        second_in_pair = P.snip(infile, ".fastq.1.gz") + ".fastq.2.gz"
        assert os.path.exists(
            second_in_pair
        ), f"no second read for file {infile}: should be {second_in_pair}"
        trunclen = PARAMS["trim_trunclen"].split(",")
        maxee = PARAMS["trim_maxee"].split(",")
        assert len(
            trunclen) == 2, "specify 2 values only for paired data (truncLen)"
        assert len(maxee) == 2, "specify 2 values only for paired data (maxee)"

###################################################
###################################################
###################################################


@follows(mkdir("filtered.dir"))
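
A hedged sketch of the configuration keys this block reads from pipeline.yml; the values are illustrative only, not the pipeline's defaults:

# Illustrative values only (hypothetical); the real keys live in pipeline.yml
# and are loaded into PARAMS via P.get_parameters() above.
example_params = {
    "paired": 1,                 # expect a *.fastq.2.gz mate for every *.fastq.1.gz
    "trim_trunclen": "240,160",  # one truncation length per read in the pair
    "trim_maxee": "2,2",         # one maxEE threshold per read in the pair
}

trunclen = example_params["trim_trunclen"].split(",")
maxee = example_params["trim_maxee"].split(",")
assert len(trunclen) == 2 and len(maxee) == 2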
コード例 #50
0
def buildPicardAlignStats(infile, outfile):
    '''Gather BAM file alignment statistics using Picard '''
    track = P.snip(os.path.basename(infile), ".bam")
    statement = '''CollectAlignmentSummaryMetrics
                      INPUT=%(infile)s
                      REFERENCE_SEQUENCE=%%(samtools_genome)s
                      ASSUME_SORTED=true
                      OUTPUT=%(outfile)s
                      VALIDATION_STRINGENCY=SILENT''' % locals()
    P.run()
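# Note (illustration only): the doubled %% above survives the explicit
# "% locals()" substitution, leaving a %(samtools_genome)s placeholder for
# P.run() to fill from the pipeline PARAMS. A minimal sketch of the mechanism:
example = "REFERENCE_SEQUENCE=%%(genome)s OUTPUT=%(outfile)s"
print(example % {"outfile": "sampleA.alignstats"})
# -> REFERENCE_SEQUENCE=%(genome)s OUTPUT=sampleA.alignstats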
コード例 #51
0
def loadPicardMetrics(infiles, outfile, suffix,
                      pipeline_suffix=".picard_stats",
                      tablename=None):
    '''load picard metrics.
    Arguments
    ---------
    infiles : list
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``toTable(outfile) + "_" +
        suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.toTable(outfile), suffix)

    outf = P.getTempFile(".")

    filenames = ["%s.%s" % (x, suffix) for x in infiles]

    first = True
    for filename in filenames:
        track = P.snip(os.path.basename(filename), "%s.%s" %
                       (pipeline_suffix, suffix))

        if not os.path.exists(filename):
            E.warn("File %s missing" % filename)
            continue

        lines = IOTools.openFile(filename, "r").readlines()

        # extract metrics part
        rx_start = re.compile("## METRICS CLASS")
        for n, line in enumerate(lines):
            if rx_start.search(line):
                lines = lines[n + 1:]
                break

        for n, line in enumerate(lines):
            if not line.strip():
                lines = lines[:n]
                break

        if len(lines) == 0:
            E.warn("no lines in %s: %s" % (track, filename))
            continue

        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            fields = lines[0][:-1].split("\t")
        else:
            f = lines[0][:-1].split("\t")
            if f != fields:
                raise ValueError(
                    "file %s has different fields: expected %s, got %s" %
                    (filename, fields, f))

        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))

    outf.close()

    P.load(outf.name,
           outfile,
           tablename=tablename,
           options="--add-index=track --allow-empty-file")

    os.unlink(outf.name)
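# Illustration only (hypothetical names): how the track and table names above
# are derived. With outfile = "picard_stats.load",
# suffix = "alignment_summary_metrics" and pipeline_suffix = ".picard_stats",
# an input file "sampleA.picard_stats" gives
#   filename  = "sampleA.picard_stats.alignment_summary_metrics"
#   track     = "sampleA"
#   tablename = "picard_stats_alignment_summary_metrics"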
コード例 #52
0
def sortByName(infile, outfile):
    '''Sort BAM file by read name (e.g. in preparation for adding
    number-of-hits tags).'''
    to_cluster = USECLUSTER
    track = P.snip(outfile, ".bam")
    statement = '''samtools sort -n %(infile)s %(track)s;'''
    P.run()
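# Note (illustration only): the statement above uses the legacy samtools
# (pre-1.0) calling convention, where the last argument is an output prefix.
# With samtools >= 1.3 the equivalent sketch would be:
#   samtools sort -n -o %(track)s.bam %(infile)s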
コード例 #53
0
def compareCheckSums(infiles, outfile):
    '''compare checksum files against existing reference data.
    '''

    outf = IOTools.openFile(outfile, "w")
    outf.write("\t".join((
        ("track", "status",
         "nfiles", "nref",
         "missing", "extra",
         "different",
         "different_md5", "different_lines",
         "files_missing", "files_extra",
         "files_different_md5",
         "files_different_lines"))) + "\n")

    for infile in infiles:
        track = P.snip(infile, ".stats")
        reffile = track + ".ref"

        # regular expression of files to test only for existence
        regex_exist = PARAMS.get('%s_regex_exist' % track, None)
        if regex_exist:
            regex_exist = re.compile(regex_exist)

        regex_linecount = PARAMS.get('%s_regex_linecount' % track, None)
        if regex_linecount:
            regex_linecount = re.compile(regex_linecount)

        if not os.path.exists(reffile):
            raise ValueError('no reference data defined for %s' % track)

        cmp_data = pandas.read_csv(IOTools.openFile(infile),
                                   sep="\t",
                                   index_col=0)

        ref_data = pandas.read_csv(IOTools.openFile(reffile),
                                   sep="\t",
                                   index_col=0)

        shared_files = set(cmp_data.index).intersection(ref_data.index)
        missing = set(ref_data.index).difference(cmp_data.index)
        extra = set(cmp_data.index).difference(ref_data.index)

        different = set(shared_files)

        # remove those for which only check for existence
        if regex_exist:
            different = set([x for x in different
                             if not regex_exist.search(x)])

        # select those for which only check for number of lines
        if regex_linecount:
            check_lines = [x for x in different
                           if regex_linecount.search(x)]

            dd = (cmp_data['nlines'][check_lines] !=
                  ref_data['nlines'][check_lines])

            different_lines = set(dd.index[dd])
            different = different.difference(check_lines)
        else:
            different_lines = set()

        # remainder - check md5
        dd = cmp_data['md5'][different] != ref_data['md5'][different]
        different_md5 = set(dd.index[dd])

        if len(missing) + len(extra) + \
           len(different_md5) + len(different_lines) == 0:
            status = "OK"
        else:
            status = "FAIL"

        outf.write("\t".join(map(str, (
            track,
            status,
            len(cmp_data),
            len(ref_data),
            len(missing),
            len(extra),
            len(different_md5) + len(different_lines),
            len(different_md5),
            len(different_lines),
            ",".join(missing),
            ",".join(extra),
            ",".join(different_md5),
            ",".join(different_lines),
        ))) + "\n")

    outf.close()
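# Illustration only (hypothetical content): the .stats and .ref files compared
# above are tab-separated tables indexed by file name, with at least the
# columns used here, e.g.
#
#   filename      nlines  md5
#   genes.gtf.gz  1200    5d41402abc4b2a76b9719d911017c592
#   counts.tsv    300     7d793037a0760186574b0282f2f435e7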
コード例 #54
0
def intersectionHeatmap(infiles, outfile):
    ''' calculate the intersection between the infiles and plot'''

    pandas2ri.activate()

    name2genes = {}
    df = pd.DataFrame(columns=["id_1", "id_2", "intersection", "perc"])

    ix = 0
    for inf in infiles:

        name = P.snip(os.path.basename(inf)).split(".")[0]
        name = name.replace(".", "_")

        with IOTools.openFile(inf, "r") as f:
            genes = set()

            for line in f:
                if line[0] == "#":
                    continue

                values = line.strip().split("\t")
                info = values[7].split(";")

                # reset so that a gene name from a previous line cannot
                # be carried over
                gene_name = None
                for x in info:
                    if x.split("=")[0] == "SNPEFF_GENE_NAME":
                        gene_name = x.split("=")[1]
                        break

                # if no gene name was found, the line is skipped
                if gene_name:
                    genes.update((gene_name, ))

        name2genes[name] = genes
        df.loc[ix] = [name, name, len(genes), 1.0]
        ix += 1

    for pair in itertools.permutations(list(name2genes.keys()), 2):
        id_1, id_2 = pair
        intersection = len(name2genes[id_1].intersection(name2genes[id_2]))
        not_intersecting = len(name2genes[id_1].symmetric_difference(
            name2genes[id_2]))
        intersection_perc = float(intersection) / (intersection +
                                                   not_intersecting)

        df.loc[ix] = [id_1, id_2, intersection, intersection_perc]
        ix += 1

    variant = os.path.basename(outfile).replace("overlap_", "").replace(
        "_heatmap.png", "")

    plotIntersectionHeatmap = R('''
    function(df){
    library(ggplot2)
    m_txt = element_text(size=15)
    m_txt_90 = element_text(size=15, angle=90, vjust=0.5, hjust=1)
    l_txt = element_text(size=20)

    p = ggplot(df, aes(id_1, id_2, fill=100*perc)) +
    geom_tile() +
    geom_text(aes(label=intersection), size=3) +
    scale_fill_gradient(name="Intersection (%%)", limits=c(0,100),
                       low="yellow", high="dodgerblue4") +
    theme(axis.text.x = m_txt_90, axis.text.y = m_txt,
          legend.text = m_txt, legend.title = m_txt,
          aspect.ratio=1) +
    xlab("") + ylab("") +
    ggtitle("%(variant)s")

    ggsave("%(outfile)s", width=10, height=10)
    }''' % locals())

    plotIntersectionHeatmap(df)
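# Worked example (hypothetical gene sets): the percentage plotted above is a
# Jaccard index, since intersection + symmetric_difference equals the union.
set_a = {"TP53", "BRCA1", "EGFR"}
set_b = {"TP53", "EGFR", "MYC", "KRAS"}
shared = len(set_a & set_b)          # 2
union = shared + len(set_a ^ set_b)  # 2 + 3 = 5
print(float(shared) / union)         # 0.4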
コード例 #55
0
def splitLine(line):
    '''
    generic split by newline and tab for reading tsv files
    '''
    # NB: the snippet was missing its "def" line; the name and signature here
    # are inferred from the function body.
    return line[:-1].split("\t")
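# e.g. (using the inferred name above):
#   splitLine("geneA\t12\t0.05\n") -> ['geneA', '12', '0.05']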

#########################################################################
#########################################################################
#########################################################################


@follows(mkdir("gtfs"))
@merge([PARAMS["genesets_abinitio_coding"],
        PARAMS["genesets_reference"]],
       os.path.join(
           "gtfs",
           P.snip(PARAMS["genesets_abinitio_coding"],
                  ".gtf.gz") + "_coding.gtf.gz"))
def buildCodingGeneSet(infiles, outfile):
    '''
    takes the output from cuffcompare of a transcript
    assembly and filters for annotated protein coding
    genes.

    NB "pruned" refers to nomenclature in the transcript
    building pipeline - transcripts that appear in at least
    two samples.

    Because an abinitio assembly will often contain
    fragments of known transcripts and describe them as
    novel, the default behaviour is to produce a set that
    is composed of 'complete' or 'contained' transcripts
    i.e. nothing novel. This may underestimate the number
コード例 #56
0
def getID(infile):
    return P.snip(os.path.basename(infile),
                  ".mutect.snp.annotated.filtered.vcf")
コード例 #57
0
              --output-filename-pattern=%%DIR%%/
              --deseq-fit-type=%(deseq_fit_type)s
              --deseq-dispersion-method=%(deseq_dispersion_method)s
              --log=%(outfile)s.log
              --fdr=%(edger_fdr)f"
              | grep -v "warnings"
              | gzip
              > %(outfile)s '''

    P.run()


@follows(aggregateTiledReadCounts,
         mkdir(os.path.join(PARAMS["exportdir"], "diff_methylation")))
@files([((data, design), "diff_methylation/%s_%s.deseq.gz" %
         (P.snip(os.path.basename(data),
                 ".counts.tsv.gz"), P.snip(os.path.basename(design), ".tsv")))
        for data, design in itertools.product(
            glob.glob("diff_methylation/*.counts.tsv.gz"),
            P.asList(PARAMS["deseq_designs"]))])
def runDESeq(infiles, outfile):
    '''estimate differential expression using DESeq.

    The final output is a table, lightly edited so that its columns and
    FDR values are comparable to cuffdiff output.
    '''

    runDE(infiles, outfile, "deseq")


#########################################################################
#########################################################################
コード例 #58
0
def loadIntervals(infile, outfile):
    '''load intervals from :term:`bed` formatted files into
    the database.

    If a :term:`bam` file is associated with a :term:`bed`
    file, re-evaluate the intervals by counting reads within
    the interval. In contrast to the initial pipeline, the
    genome is not binned.

       nprobes: number of reads in interval
       peakcenter: position with maximum number of reads in interval
       avgval: average coverage within interval
    '''

    tmpfile = P.getTempFile(".")

    headers = ("avgval", "disttostart", "genelist", "length", "peakcenter",
               "peakval", "position", "interval_id", "npeaks", "nprobes",
               "contig", "start", "end", "score", "strand")

    tmpfile.write("\t".join(headers) + "\n")

    (avgval, contig, disttostart, end, genelist,
     length, peakcenter, peakval, position,
     start, interval_id, npeaks, nprobes) = \
        0, "", 0, 0, "", 0, 0, 0, 0, 0, 0, 0, 0

    track = Sample(filename=P.snip(infile, ".bed.gz"))

    bamfiles, offsets = getAssociatedBAMFiles(track)

    if bamfiles:
        E.info("%s: associated bamfiles = %s" % (track, bamfiles))
    else:
        E.info("%s: no bamfiles associated" % (track))

    # open all bamfiles
    samfiles = [pysam.Samfile(fn, "rb") for fn in bamfiles]

    c = E.Counter()

    # count tags
    for bed in Bed.iterator(IOTools.openFile(infile, "r")):

        c.input += 1

        if "name" not in bed:
            bed.name = c.input

        try:
            strand = bed["strand"]
        except IndexError:
            strand = "."

        # The fifth field of a bed file can be used to supply a
        # score. Our iterator returns the optional fields as a "fields"
        # array. The first of these is the interval name, and the
        # second is the score. Depending on the peak caller, either a
        # higher or a lower score may be better.
        if len(bed.fields) > 1:
            value = bed.fields[1]
            if value != "":
                score = value
            else:
                score = 1
        else:
            score = 1

        if samfiles:
            npeaks, peakcenter, length, avgval, peakval, nprobes = \
                PipelinePeakcalling.countPeaks(
                    bed.contig,
                    bed.start,
                    bed.end,
                    samfiles,
                    offsets)
            if nprobes == 0:
                c.skipped_reads += 1

        else:
            # deal with bed12
            bed_intervals = bed.toIntervals()
            length = sum([e - s for s, e in bed_intervals])
            mid_point = length / 2
            for s, e in bed_intervals:
                peakcenter = s + mid_point
                if peakcenter >= e:
                    mid_point = peakcenter - e
                else:
                    break

            npeaks, avgval, peakval, nprobes = 1, 1, 1, 1

        c.output += 1
        tmpfile.write("\t".join(
            map(str, (avgval, disttostart, genelist, length, peakcenter,
                      peakval, position, bed.name, npeaks, nprobes, bed.contig,
                      bed.start, bed.end, score, strand))) + "\n")

    if c.output == 0:
        E.warn("%s - no aggregate intervals")

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           tablename=os.path.basename("%s_intervals" % track.asTable()),
           options="--allow-empty-file "
           "--add-index=interval_id")

    os.unlink(tmpfile.name)

    E.info("%s\n" % str(c))
コード例 #59
0
def exportMemeCHiPIntervalSequences(infile, outfile):
    '''export interval sequences for motif discovery with MEME-ChIP.'''

    track = os.path.basename(P.snip(infile, "_intervals.load"))

    exportIntervalSequences(infile, outfile, track, "memechip")
コード例 #60
0
def exportDremeIntervalSequences(infile, outfile):
    '''export interval sequences for motif discovery with DREME.'''

    track = os.path.basename(P.snip(infile, "_intervals.load"))

    exportIntervalSequences(infile, outfile, track, "dreme")