def filterByCoverage(infiles, outfile):
    '''Filter a fasta file of contigs, keeping only contigs whose average
    coverage exceeds the configured coverage_filter threshold.'''

    fcoverage = PARAMS["coverage_filter"]
    contig_file = infiles[0]
    dbh = sqlite3.connect(
        os.path.join(PARAMS["results_resultsdir"], PARAMS["database_name"]))
    cc = dbh.cursor()
    contigs = set()
    for infile in infiles[1:]:
        dirsplit = infile.split("/")
        infile = os.path.join(
            PARAMS["results_resultsdir"],
            dirsplit[-2].split(".dir")[0] + "-" + dirsplit[-1])
        tablename = P.toTable(os.path.basename(infile))
        if P.snip(contig_file, ".fa") == P.snip(os.path.basename(infile),
                                                ".coverage.load"):
            statement = """SELECT contig_id ave FROM
                           (SELECT contig_id, AVG(coverage) as ave FROM %s GROUP BY contig_id)
                           WHERE ave > %i""" % (tablename,
                                                PARAMS["coverage_filter"])
            for data in cc.execute(statement).fetchall():
                contigs.add(data[0])
    outf = open(outfile, "w")
    E.info("%i contigs passed the coverage filter" % len(contigs))
    for fasta in FastaIterator.iterate(IOTools.openFile(contig_file)):
        identifier = fasta.title.split(" ")[0]
        if identifier in contigs:
            outf.write(">%s\n%s\n" % (identifier, fasta.sequence))
    outf.close()
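
# For reference, a minimal standalone sketch of the coverage query used
# above, assuming a hypothetical `contig_coverage` table with `contig_id`
# and `coverage` columns and an example threshold of 30:
def _example_coverage_filter():
    import sqlite3
    dbh = sqlite3.connect("csvdb")  # placeholder database name
    query = """SELECT contig_id FROM
               (SELECT contig_id, AVG(coverage) AS ave
                FROM contig_coverage GROUP BY contig_id)
               WHERE ave > ?"""
    return {row[0] for row in dbh.execute(query, (30,))}
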
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)

    outf = open('dupstats.txt', 'w')

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [
            x for x in open(statfile, "r").readlines()
            if not x.startswith("#") and x.strip()
        ]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()
def dedup_bams(infile, outfile):
    '''Use MarkDuplicates to mark duplicate reads'''
    tempfile = P.snip(outfile, ".bam") + ".temp.bam"
    metrics = P.snip(outfile, ".bam") + ".metrics.tsv"
    temporary = PARAMS["tmpdir"]
    statement = '''MarkDuplicates I=%(infile)s
                                  O=%(tempfile)s
                                  M=%(metrics)s
                                  TMP_DIR=%(temporary)s > %(outfile)s.log;

                   checkpoint;

                   samtools view
                       -F 1024
                       -b
                       %(tempfile)s
                       > %(outfile)s;

                   checkpoint;

                   rm %(tempfile)s;

                   checkpoint;

                   samtools index %(outfile)s'''

    job_memory = "15G"
    P.run()
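
# Note: flag 1024 (0x400) is the SAM "PCR or optical duplicate" bit, so the
# `samtools view -F 1024` step above drops duplicate-marked reads. A rough
# pysam equivalent, as a sketch only ("example.temp.bam" is a placeholder):
def _example_count_duplicates():
    import pysam
    kept = dropped = 0
    with pysam.AlignmentFile("example.temp.bam", "rb") as bam:
        for read in bam:
            if read.is_duplicate:  # FLAG bit 0x400
                dropped += 1
            else:
                kept += 1
    return kept, dropped
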
def runMutectReverse(infiles, outfile):
    '''Use the control as the tumour and vice versa to estimate the false positive rate'''
    infile, normal_panel = infiles
    infile_tumour = infile.replace(PARAMS["sample_control"],
                                   PARAMS["sample_tumour"])

    basename = P.snip(outfile, "_normal_mutect.vcf")
    call_stats_out = basename + "_call_stats.out"
    mutect_log = basename + ".log"

    basename = P.snip(outfile, ".mutect.reverse.snp.vcf")
    call_stats_out = basename + "_call_stats.reverse.out"
    coverage_wig_out = basename + "_coverage.reverse.wig"
    mutect_log = basename + ".reverse.log"

    (cosmic, dbsnp, quality, max_alt_qual, max_alt, max_fraction,
     tumor_LOD) = (PARAMS["mutect_cosmic"], PARAMS["gatk_dbsnp"],
                   PARAMS["mutect_quality"], PARAMS["mutect_max_alt_qual"],
                   PARAMS["mutect_max_alt"], PARAMS["mutect_max_fraction"],
                   PARAMS["mutect_LOD"])

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    PipelineExome.mutectSNPCaller(infile, outfile, mutect_log, genome, cosmic,
                                  dbsnp, call_stats_out,
                                  PARAMS['mutect_memory'],
                                  PARAMS['mutect_threads'], quality,
                                  max_alt_qual, max_alt, max_fraction,
                                  tumor_LOD, normal_panel, infile_tumour)
def splitMultiAndSingleExonLincRna(infile, outfiles):
    '''
    Pull the multi-exon and single-exon lincRNA transcripts out of
    lincrna.gtf.gz into separate gtf files.
    '''

    inf = gzip.open(infile, "rt")
    multi = gzip.open(P.snip(infile, ".gtf.gz") + ".multi_exon.gtf.gz", "wt")
    single = gzip.open(P.snip(infile, ".gtf.gz") + ".single_exon.gtf.gz", "wt")

    for entry in GTF.transcript_iterator(GTF.iterator(inf)):
        dest = multi if len(entry) > 1 else single
        for exon in entry:
            dest.write(
                "\t".join(map(str, [exon.contig, exon.source, exon.feature,
                                    exon.start, exon.end, ".", exon.strand,
                                    "."])) +
                "\t" + exon.attributes + "\n")

    multi.close()
    single.close()

    # compress any output that was written uncompressed
    for outfile in outfiles:
        outf = P.snip(outfile, ".gz")
        if not os.path.exists(outfile):
            statement = '''gzip %(outf)s'''
            P.run()
def createMAFAlignment(infiles, outfile):
    """
    Takes all .axt files in the input directory, filters them to remove
    files based on supplied regular expressions, converts to a single maf file
    using axtToMaf, filters maf alignments under a specified length.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_dir"],
                                  PARAMS["annotations_interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS["annotations_interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # concatenate the axt files, then convert them to a single maf file
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 "  -tPrefix=%(target_genome)s."
                 "  -qPrefix=%(query_genome)s."
                 "  %(tmpf1)s"
                 "  %(target_contigs)s"
                 "  %(query_contigs)s"
                 "  %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    filtered = PipelineLncRNA.filterMAF(tmpf2, outfile, removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    os.unlink(tmpf1)
    os.unlink(tmpf2)
    to_cluster = False
    statement = ("gzip %(outfile)s;" " gzip %(removed)s")
    P.run()
def loadAlignmentStats(infiles, outfile):
    '''Merge alignment stats into single tables and load them into the database.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(f, ".bam.stats")
        fn = f + ".alignment_summary_metrics"
        if not os.path.exists(fn):
            E.warn("file %s missing" % fn)
            continue
        lines = [
            x for x in open(fn, "r").readlines()
            if not x.startswith("#") and x.strip()
        ]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s
                > %(outfile)s
               '''
    P.run()

    for suffix, column in (("quality_by_cycle_metrics", "cycle"),
                           ("quality_distribution_metrics", "quality")):

        # some files might be missing - bugs in Picard
        xfiles = [x for x in infiles if os.path.exists("%s.%s" % (x, suffix))]

        header = ",".join([P.snip(x, ".bam.stats") for x in xfiles])
        filenames = " ".join(["%s.%s" % (x, suffix) for x in xfiles])

        tname = "%s_%s" % (tablename, suffix)

        statement = """cgat combine_tables
                      --missing-value=0
                   %(filenames)s
                | cgat csv2db
                      --header-names=%(column)s,%(header)s
                      --replace-header
                      --add-index=track
                      --table=%(tname)s
                >> %(outfile)s
                """

        P.run()

    os.unlink(tmpfilename)
def reMergeBamfiles(infiles, sentinel):
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"
    bad_samples = PARAMS["options_to_remove"].split(",")

    to_merge = IDR.filterBadLibraries(infiles, bad_samples)

    IDR.mergeBams(to_merge, outfile)
    P.touch(sentinel)
def quantifyWithSalmon(infiles, outfile):
    '''Quantify existing samples against genesets'''
    job_threads = 2
    job_memory = "16G"

    infile, gtffile = infiles
    basefile = os.path.basename(infile)
    sample_name = basefile.split(os.extsep, 1)
    sorted_bam = "sorted_bams/" + sample_name[0] + "_sorted.bam"
    gtfbase = P.snip(os.path.basename(gtffile), ".gz")
    salmonIndex = "salmon_index/" + gtfbase + ".salmon.index"
    fastq1 = P.snip(outfile, "_agg-agg-agg") + ".1.fastq"
    fastq2 = P.snip(outfile, "_agg-agg-agg") + ".2.fastq"
    salmon_options = PARAMS["salmon_quantoptions"]

    statement = '''
    samtools sort -n %(infile)s -o %(sorted_bam)s;
    samtools fastq
         -1 %(fastq1)s
         -2 %(fastq2)s
         -0 /dev/null -s /dev/null -n -F 0x900
         %(sorted_bam)s; 
    salmon quant -i %(salmonIndex)s
        --libType IU
        -1 %(fastq1)s
        -2 %(fastq2)s
        -o %(outfile)s
        %(salmon_options)s; 
    mv %(outfile)s/quant.sf %(outfile)s.sf; 
    rm %(fastq1)s; rm %(fastq2)s; rm %(sorted_bam)s 
    '''

    if infile.endswith(".remote"):
        token = glob.glob("gdc-user-token*")
        filename = "temp_bams/%s" % basefile
        tmpfilename = P.get_temp_filename()
        if os.path.exists(tmpfilename):
            os.unlink(tmpfilename)

        if len(token) > 0:
            token = token[0]
        else:
            token = None

        s, infile = Sra.process_remote_BAM(
            infile,
            token,
            filename,
            filter_bed=os.path.join(
                PARAMS["annotations_dir"],
                PARAMS["annotations_interface_contigs_bed"]))

        infile = " ".join(infile)
        statement = "; ".join(
            ["mkdir %(filename)s", s, statement, "rm -r %(filename)s"])

    P.run(statement)
def poolSampleBamfiles(infiles, sentinel):
    """
    Merge filtered sample files for each tissue
    """
    infiles = [P.snip(x, ".sentinel") + ".bam" for x in infiles]
    outfile = P.snip(sentinel, ".sentinel") + ".bam"

    IDR.mergeBams(infiles, outfile)

    P.touch(sentinel)
def splitPooledBamfiles(infile, sentinel):
    """
    Split the pooled filtered bamfile in two using pysam
    """
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    params = '2'
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module, "splitBam", params, infile, outfile)

    P.touch(sentinel)
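
# The .py/.pyc fallback above recurs in several tasks in this pipeline; a
# small hypothetical helper along these lines could replace it (assuming the
# same IDR import):
def _idr_module_path():
    path = IDR.__file__
    for suffix in (".py", ".pyc"):
        if path.endswith(suffix):
            return path[:-len(suffix)]
    return path
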
def runFIMO(motifs, database, outfile, exportdir, options={}):
    '''run fimo to look for occurrences of motifs supplied in a sequence database.

    :param:`motifs` is the path to a MEME formatted motif file.
    :param:`database` is a fasta file.
    :param:`outfile` is the text output from fimo.
    :param:`exportdir` specifies the directory to put exported files (html, gff).
    :param:`options` is a dictionary: {'option': 'value'} will be passed as
                     --option=value and will overwrite options specified in
                     PARAMS.'''

    # if the motifs file is empty, fimo will return an error;
    # this isn't very useful behaviour.

    inlines = IOTools.open_file(motifs).read()
    if not re.search("MOTIF", inlines):
        E.warning("No motifs found in %s" % motifs)
        P.touch(outfile)
        return
    else:
        E.debug("%s: %i motifs found" %
                (motifs, len(re.findall("MOTIF", inlines))))

    fimo_options = PARAMS.get("fimo_options", "")
    for option, value in options.items():
        fimo_options = re.sub(r"--%s=\S+" % option, "", fimo_options)
        if value is None:
            fimo_options += " --%s" % option
        else:
            fimo_options += " --%s=%s" % (option, value)

    tmpout = P.get_temp_filename()

    track = os.path.basename(outfile)
    exportdir = os.path.abspath(exportdir)

    xmlout = P.snip(outfile, ".txt") + ".xml"
    logfile = P.snip(outfile, ".txt") + ".log"
    gffout = os.path.join(exportdir, track + ".gff")
    htmlout = os.path.join(exportdir, track + ".html")

    statement = ''' fimo --oc %(tmpout)s
                         %(fimo_options)s
                         %(motifs)s
                         %(database)s &> %(logfile)s;
                     mv %(tmpout)s/fimo.txt %(outfile)s;
                     mv %(tmpout)s/fimo.xml %(xmlout)s;
                     mv %(tmpout)s/fimo.gff %(gffout)s;
                     mv %(tmpout)s/fimo.html %(htmlout)s;
                     rm -r %(tmpout)s '''

    P.run(statement)
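
# The option handling in runFIMO appends `--option=value` pairs, first
# removing any conflicting setting from the defaults. A standalone sketch of
# that merge, with made-up option names and values:
def _example_merge_fimo_options():
    import re
    fimo_options = "--max-stored-scores=100000 --thresh=1e-4"
    overrides = {"thresh": "1e-6", "norc": None}
    for option, value in overrides.items():
        # drop any existing --option=value token from the defaults
        fimo_options = re.sub(r"--%s=\S+" % option, "", fimo_options)
        if value is None:
            fimo_options += " --%s" % option  # boolean flag
        else:
            fimo_options += " --%s=%s" % (option, value)
    return fimo_options.strip()
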
def findNPeaksForPooledPseudoreplicates(infiles, outfile):
    idr_thresh = PARAMS["idr_options_pooled_consistency_threshold"]
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[
                 str(idr_thresh),
             ],
             infiles=infiles,
             outfiles=outfile)
def findNPeaksForIndividualReplicates(infiles, outfile):
    idr_thresh = PARAMS["idr_options_inter_replicate_threshold"]
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module,
             "findNPeaks",
             params=[
                 str(idr_thresh),
             ],
             infiles=infiles,
             outfiles=outfile)
def linkBamToWorkingDirs(infiles, outfile):
    '''
    Symlink the bam file and its index into the working directories
    used to execute the transcript building pipeline.
    '''

    bamfile = P.snip(infiles[0], ".bai")
    indexfile = infiles[0]
    directories = [P.snip(logfile, ".log") for logfile in infiles[1]]

    for directory in directories:
        os.symlink(os.path.abspath(bamfile), os.path.join(directory, bamfile))
        os.symlink(
            os.path.abspath(indexfile), os.path.join(directory, indexfile))
    updateFile(outfile)
def splitBamfiles(infile, sentinel):
    """
    For all tracks, split the filtered bamfile in two using pysam
    """
    infile = P.snip(infile, ".sentinel") + ".bam"
    outfile = P.snip(sentinel, ".sentinel")
    params = '2'
    try:
        module = P.snip(IDR.__file__, ".py")
    except ValueError:
        module = P.snip(IDR.__file__, ".pyc")

    P.submit(module, "splitBam", params, infile, outfile)

    P.touch(sentinel)
def mergeEffects(infiles, outfile):
    '''Merge transcript effects into single tables and load them into the database.'''

    tablename = P.toTable(outfile)
    outf = open('effects.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".effects.gz")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in gzip.open(f, "r").readlines()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    P.load("effect.txt", outfile, options="--add-index=transcript_id")

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):

        outf = open('effects.' + suffix + '.txt', 'w')
        first = True
        for f in infiles:
            track = P.snip(os.path.basename(f), ".effects.gz")
            statfile = f + "." + suffix + ".gz"
            print(statfile)
            if not os.path.exists(statfile):
                E.warn("File %s missing" % statfile)
                continue
            lines = [x for x in gzip.open(statfile, "r").readlines()]
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
            first = False
            for i in range(1, len(lines)):
                outf.write("%s\t%s" % (track, lines[i]))
        outf.close()

        P.load(outf.name,
               outfile,
               tablename=tablename + "_" + suffix,
               options="--add-index=transcript_id "
               "--allow-empty-file "
               "--ignore-column=seq_na "
               "--ignore-column=seq_aa")
def loadSummarizedContextStats(infiles,
                               outfile,
                               suffix=".contextstats.tsv.gz"):
    """merge output from :func:`summarizeTagsWithinContex` and load into database.

    Arguments
    ---------
    infiles : list
        List of filenames in :term:`tsv` format. The files should end
        in suffix.
    outfile : string
        Output filename, the table name is derived from `outfile`.
    suffix : string
        Suffix to remove from filename for track name.

    """

    header = ",".join([P.snip(os.path.basename(x), suffix) for x in infiles])
    filenames = " ".join(infiles)

    load_statement = P.build_load_statement(P.to_table(outfile),
                                            options="--add-index=track")

    statement = """cgat combine_tables
    --header-names=%(header)s
    --missing-value=0
    --skip-titles
    %(filenames)s
    | perl -p -e "s/bin/track/; s/\?/Q/g"
    | cgat table2table --transpose
    | %(load_statement)s
    > %(outfile)s
    """
    P.run(statement)
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''

    tablename = P.toTable(outfile)

    outf = P.getTempFile()

    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines()
            if not x.startswith("#") and x.strip()
        ]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))

    outf.close()
    tmpfilename = outf.name

    statement = '''cat %(tmpfilename)s
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s 
                > %(outfile)s
               '''
    P.run()

    os.unlink(tmpfilename)
def mergeAndLoad(infiles, outfile, suffix):
    '''load categorical tables (two columns) into a database.

    The tables are merged and entered row-wise.

    '''
    header = ",".join([P.tablequote(P.snip(x, suffix)) for x in infiles])
    if suffix.endswith(".gz"):
        filenames = " ".join(
            ["<( zcat %s | cut -f 1,2 )" % x for x in infiles])
    else:
        filenames = " ".join(["<( cat %s | cut -f 1,2 )" % x for x in infiles])

    tablename = P.toTable(outfile)

    statement = """cgat combine_tables
                      --header-names=%(header)s
                      --missing-value=0
                      --ignore-empty
                   %(filenames)s
                | perl -p -e "s/bin/track/"
                | cgat table2table --transpose
                | cgat csv2db
                      --add-index=track
                      --table=%(tablename)s
                > %(outfile)s
            """
    P.run()
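
# For concreteness: each input to mergeAndLoad is a two-column categorical
# table, combine_tables joins them on the first column, and the transpose
# turns tracks into rows. A toy illustration of the resulting shape, with
# made-up data:
def _example_merge_shape():
    track_a = {"exon": 10, "intron": 5}
    track_b = {"exon": 7, "intron": 9}
    categories = sorted(set(track_a) | set(track_b))
    rows = ["track\t" + "\t".join(categories)]
    for name, table in (("a", track_a), ("b", track_b)):
        rows.append(name + "\t" +
                    "\t".join(str(table.get(c, 0)) for c in categories))
    return "\n".join(rows)
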
def GATKpreprocessing(infile, outfile):
    '''Reorders BAM according to the reference fasta and adds read groups
       using SAMtools, realigns around indels and recalibrates base quality
       scores using GATK'''

    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    tmpdir_gatk = P.get_temp_dir()
    job_memory = PARAMS["gatk_memory"]

    genome = "%s/%s.fa" % (PARAMS["bwa_index_dir"], PARAMS["genome"])

    outfile1 = outfile.replace(".bqsr", ".readgroups.bqsr")
    outfile2 = outfile.replace(".bqsr", ".realign.bqsr")

    PipelineExome.GATKReadGroups(infile, outfile1, genome,
                                 PARAMS["readgroup_library"],
                                 PARAMS["readgroup_platform"],
                                 PARAMS["readgroup_platform_unit"])

    PipelineExome.GATKIndelRealign(outfile1, outfile2, genome,
                                   PARAMS["gatk_threads"])

    IOTools.zap_file(outfile1)

    PipelineExome.GATKBaseRecal(outfile2, outfile, genome,
                                PARAMS["gatk_dbsnp"],
                                PARAMS["gatk_solid_options"])
    IOTools.zap_file(outfile2)
def loadPicardCoverageStats(infiles, outfile):
    '''import coverage statistics into database.

    Arguments
    ---------
    infiles : list
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''

    outf = P.get_temp_file(".")
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".cov")
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()
    P.load(outf.name,
           outfile,
           options="--ignore-empty --add-index=track")
    os.unlink(outf.name)
def find_utrons(infiles, outfiles):
    '''Sort the input geneset and run find_utrons.py to produce the
    all/partnered/novel utrons bed files.'''

    infile, reference, classfile = infiles
    job_threads = 2
    job_memory = "16G"

    all_out, part_out, novel_out = outfiles

    track = P.snip(all_out, ".all_utrons.bed.gz")
    current_file = __file__
    pipeline_path = os.path.abspath(current_file)
    pipeline_directory = os.path.dirname(pipeline_path)
    script_path = "pipeline_utrons/find_utrons.py"
    find_utrons_path = os.path.join(pipeline_directory, script_path)

    statement = '''cgat gtf2gtf -I %(infile)s
                             --method=sort
                             --sort-order=gene+transcript
                              -L %(track)s.log
                 | python %(find_utrons_path)s
                             --reffile=%(reference)s
                             --class-file=%(classfile)s
                             --outfile %(all_out)s
                             --partfile=%(part_out)s
                             --novel-file=%(novel_out)s
                              -L %(track)s.log'''

    P.run(statement)
def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    Transcripts need to be renamed because they may use the same
    cufflinks identifiers as the current analysis. Renaming is skipped
    when they carry an Ensembl id; in that case the transcripts are
    only sorted.
    '''
    inf = IOTools.openFile(infile)
    # only the first record is needed to decide which statement to run
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            statement = '''zcat %(infile)s | grep -v "#"
                        | cgat gtf2gtf
                        --method=sort --sort-order=gene
                        --log=%(outfile)s.log
                        | gzip > %(outfile)s'''
        else:
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN", "TRAN")
            statement = '''
            zcat %(infile)s | cgat gtf2gtf
            --method=renumber-genes
            --pattern-identifier=%(gene_pattern)s%%i
            | cgat gtf2gtf
            --method=renumber-transcripts
            --pattern-identifier=%(transcript_pattern)s%%i
            | cgat gtf2gtf
            --method=sort --sort-order=gene
            --log=%(outfile)s.log
            | gzip > %(outfile)s'''
        break

    P.run()
def run_test(infile, outfile):
    '''run a test.

    Multiple targets are run iteratively.
    '''

    track = P.snip(outfile, ".log")
    pipeline_name = PARAMS.get("%s_pipeline" % track, track[len("test_"):])

    pipeline_targets = P.as_list(PARAMS.get("%s_target" % track, "full"))

    # do not run on cluster, mirror
    # that a pipeline is started from
    # the head node
    #to_cluster = False

    template_statement = ("cd %%(track)s.dir; "
                          "xvfb-run -d cgatflow %%(pipeline_name)s "
                          "%%(pipeline_options)s "
                          "%%(workflow_options)s make %s "
                          "-L ../%%(outfile)s "
                          "-S ../%%(outfile)s.stdout "
                          "-E ../%%(outfile)s.stderr")

    if len(pipeline_targets) == 1:
        statement = template_statement % pipeline_targets[0]
        P.run(statement, ignore_errors=True, job_memory="unlimited")
    else:
        statements = []
        for pipeline_target in pipeline_targets:
            statements.append(template_statement % pipeline_target)
        P.run(statements, ignore_errors=True, job_memory="unlimited")
def buildPicardAlignStats(infile, outfile):
    '''Gather BAM file alignment statistics using Picard '''
    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    statement = '''CollectAlignmentSummaryMetrics
                       INPUT=%(infile)s
                       REFERENCE_SEQUENCE=%%(samtools_genome)s
                       ASSUME_SORTED=true
                       OUTPUT=%(outfile)s
                       VALIDATION_STRINGENCY=SILENT''' % locals()
    P.run()
def makeRates(infiles, outfile):
    '''compute nucleotide substitution rates for transcripts from a gtf file -
    this applies only for transcripts mapped onto the reference genome.

    Sequences from the transcripts are mapped onto the rate genome.

    Softmasked sequence will be ignored unless the track is in TRACKS_CONTROL.

    The longest contiguous block is selected ignoring matches to other parts of the genome.
    '''

    infile_sequences, infile_gtf, alignment = infiles

    track = P.snip(infile_sequences, ".fasta")

    if track in TRACKS_CONTROL:
        # when aligning repeats, do not mask lower case characters
        mask = ""
    else:
        mask = "--mask-lowercase"

    # locate target genome from ancestral repeats ini file
    target_genome = os.path.join(PARAMS["genome_dir"],
                                 PARAMS_ANCESTRAL_REPEATS["target"])

    statement = '''gunzip
    < %(infile_gtf)s
    | cgat gtf2gtf --method=sort --sort-order=gene
    | cgat gff2psl
    --is-gtf
    --genome-file=%(genome_dir)s/%(genome)s
    --log=%(outfile)s.log
    | pslMap stdin <(gunzip < %(alignment)s ) stdout
    | sort -k10,10 -k14,14 -k9,9 -k12,12n
    | cgat psl2psl
    --method=merge
    --log=%(outfile)s.log
    | cgat psl2psl
    --method=select-query
    --select=most-nmatches
    --log=%(outfile)s.log
    | cgat psl2psl
    --method=add-sequence
    --target-psl-file=%(target_genome)s
    --queries-tsv-file=%(infile_sequences)s
    --log=%(outfile)s.log
    | %(cmd-farm)s
    --split-at-lines=10000
    --output-header
    --log=%(outfile)s.log
    "cgat psl2table
    %(mask)s
    --method=counts
    --method=baseml
    --baseml-model=REV"
    | gzip
    > %(outfile)s
    '''

    P.run()
def exportMotifLocations(infiles, outfile):
    '''export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.
    '''

    dbh = connect()
    cc = dbh.cursor()

    motifs = [
        x[0] for x in cc.execute("SELECT motif FROM motif_info").fetchall()
    ]

    for motif in motifs:

        tmpf = P.get_temp_file(".")

        for infile in infiles:
            table = P.to_table(infile)
            track = P.snip(table, "_mast")
            for x in cc.execute(
                    """SELECT contig, start, end, '%(track)s', evalue
                    FROM %(table)s WHERE motif = '%(motif)s' AND
                    start IS NOT NULL""" % locals()):
                tmpf.write("\t".join(map(str, x)) + "\n")
        tmpf.close()

        outfile = os.path.join(PARAMS["exportdir"], "motifs",
                               "%s.bed.gz" % motif)
        tmpfname = tmpf.name

        statement = '''mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s'''
        P.run(statement)

        os.unlink(tmpf.name)
def exportMotifDiscoverySequences(infile, outfile):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio
    are supplied.

    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak
    3. At least *motifs_min_sequences*. If there are not enough sequences
          to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.

    '''
    track = P.snip(infile, "_intervals.load")
    dbhandle = connect()

    p = P.substitute_parameters(**locals())
    nseq = PipelineMotifs.writeSequencesForIntervals(
        track,
        outfile,
        dbhandle,
        full=False,
        masker=P.as_list(p['motifs_masker']),
        halfwidth=int(p["motifs_halfwidth"]),
        maxsize=int(p["motifs_max_size"]),
        proportion=p["motifs_proportion"],
        min_sequences=p["motifs_min_sequences"],
        num_sequences=p["motifs_num_sequences"],
        order=p['motifs_score'])

    if nseq == 0:
        E.warn("%s: no sequences - meme skipped" % outfile)
        IOTools.touch_file(outfile)
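
# A simplified sketch of the interval selection described in the docstring
# above, assuming a hypothetical `<track>_intervals` table with `contig`,
# `peakcenter` and `peakval` columns:
def _example_select_intervals(cc, track, proportion=0.1,
                              min_sequences=100, max_size=1000):
    rows = cc.execute(
        "SELECT contig, peakcenter, peakval FROM %s_intervals "
        "ORDER BY peakval DESC" % track).fetchall()
    n = max(int(len(rows) * proportion), min_sequences)
    return rows[:min(n, max_size)]
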
def runSleuth(design,
              base_dir,
              model,
              contrasts,
              outfile,
              counts,
              tpm,
              fdr,
              lrt=False,
              reduced_model=None):
    ''' run sleuth. Note: all samples in the design table must also
    have a directory with the same name in `base_dir` with kallisto
    results in a file called abundance.h5'''

    outfile_prefix = P.snip(outfile, ".tsv")

    Design = Expression.ExperimentalDesign(design)
    exp = Expression.DEExperiment_Sleuth()

    res = exp.run(Design, base_dir, model, contrasts, outfile_prefix, counts,
                  tpm, fdr, lrt, reduced_model)

    res.getResults(fdr)
    for contrast in set(res.table['contrast']):
        res.plotMA(contrast, outfile_prefix)
        res.plotVolcano(contrast, outfile_prefix)

    res.table.to_csv(outfile, sep="\t", index=False)