Python Pipeline.run Examples, CGAT.Pipeline.run Python Examples

Example #1

0

Show file

File: pipeline_hvc.py Project: BioinformaticsArchive/cgat

def computeOverlapCoding( infile, outfile ):
    '''Compute overlap between coding markers and windows.

    This is done by setting the gene_id and transcript_id of markers to the ENSEMBL gene id
    and transcript_id that it overlaps with. Markers not overlapping an ENSEMBL gene id
    are removed.

    Two commands are run one after the other:

    1. gtf2gtf.py reads *infile* and writes a temporary file created in the
       current directory.
    2. gff2table.py receives that temporary file as ``--filename-data`` and
       writes *outfile*; its log goes to ``<outfile>.log``.

    The temporary file is deleted after the second command has returned.
    '''
    
    to_cluster = True
    tmpfilename = P.getTempFilename( dir = "." )
    
    # A backslash at the end of a line inside the literal joins it with the
    # next line; lines without one keep their newline in the command text.
    statement = '''python %(scriptsdir)s/gtf2gtf.py
    --rename=gene \
    --apply=ensembl.diff.genes_ovl \
    < %(infile)s > %(tmpfilename)s
    '''
    
    # Local variables and PARAMS are merged into one keyword dictionary.
    # Adding the two items() results relies on Python 2, where items()
    # returns lists.
    P.run( **dict( locals().items() + PARAMS.items() ) )

    # The windows are read from a fixed file name, windows.bed, which is
    # converted on the fly through a process substitution.
    statement = '''python %(scriptsdir)s/gff2table.py 
    --filename-windows=<(python %(scriptsdir)s/bed2gff.py < windows.bed) 
    --decorator=counts
    --filename-data=%(tmpfilename)s \
    --skip-empty \
    --is-gtf \
    --log=%(outfile)s.log \
    < %(genome)s.fasta > %(outfile)s'''

    P.run( **dict( locals().items() + PARAMS.items() ) )

    os.unlink( tmpfilename )

Example #2

0

Show file

File: pipeline_rnaseq_geneset.py Project: jmadzo/cgat

def buildDownstreamFlankBed(infile, outfile):
    """Build a flanking interval for each entry in a bed file.

    flankBed is called with ``-l 0``, ``-r <geneset_flank>`` and ``-s``,
    using the genome index given by ``PARAMS["faidx"]``. Its output is piped
    through ``bed2bed.py --method=filter-genome`` and written to *outfile*;
    the log goes to ``<outfile>.log``.
    """
    window = PARAMS["geneset_flank"]
    faidx = PARAMS["faidx"]
    statement = """flankBed -i %(infile)s -g %(faidx)s -l 0 -r %(window)s -s 
                   | python %(scriptsdir)s/bed2bed.py --method=filter-genome --genome-file=%(genome_dir)s/%(genome)s --log %(outfile)s.log > %(outfile)s"""
    # NOTE(review): P.run() takes no arguments; presumably it reads
    # `statement` and the placeholder values from this frame - confirm.
    P.run()

Example #3

0

Show file

File: PipelineWindows.py Project: lesheng/cgat

def countReadsWithinWindows(bedfile,
                            windowfile,
                            outfile,
                            counting_method="midpoint"):
    '''Count the reads of *bedfile* within the intervals of *windowfile*.

    Both files need to be :term:`bed` formatted; *bedfile* is read
    through ``zcat``.

    Counting is done with bedtools (``coverageBed``). *counting_method*
    is either ``"midpoint"``, in which case each read is first replaced
    by a one-base interval at its midpoint, or ``"nucleotide"``, in
    which case the reads are passed on unchanged. Any other value raises
    a ValueError.

    The sorted, gzip-compressed result is written to *outfile*.
    '''
    job_options = "-l mem_free=4G"

    # Reject unsupported methods before any command text is assembled.
    if counting_method not in ("midpoint", "nucleotide"):
        raise ValueError("unknown counting method: %s" % counting_method)

    # `f` is an optional extra pipeline stage; empty means "no extra stage".
    f = ""
    if counting_method == "midpoint":
        f = '''| awk '{a = $2+($3-$2)/2; printf("%s\\t%i\\t%i\\n", $1, a, a+1)}' '''

    statement = '''
    zcat %(bedfile)s
    %(f)s
    | coverageBed -a stdin -b %(windowfile)s -split
    | sort -k1,1 -k2,2n
    | gzip
    > %(outfile)s
    '''

    P.run()

Example #4

0

Show file

File: pipeline_rnaseq_geneset.py Project: jmadzo/cgat

def ExtendRegion(infile, outfile):
    """Extend the intervals of a gzipped bed file on both sides.

    *infile* is decompressed, passed through ``slopBed -b 1000`` with the
    genome index ``%(faidx)s``, and the result is gzip-compressed into
    *outfile*.
    """
    statement = """gunzip < %(infile)s 
                   | slopBed -i stdin -g %(faidx)s -b 1000  
                   | gzip
                   > %(outfile)s """
    P.run()

Example #5

0

Show file

File: pipeline_rnaseq_geneset.py Project: jmadzo/cgat

def getNoncodingGeneset(infile, outfile):
    """Assume that all transcripts that do not overlap the ensembl coding geneset are noncoding.

    Entries of *infile* with no same-strand overlap (``intersectBed -v -s``)
    in ``PARAMS["ensembl_transcripts"]`` are written to *outfile*. A label
    and the line count of *outfile* are written to ``<outfile>.count``.
    """
    ensembl_transcripts = PARAMS["ensembl_transcripts"]
    statement = """cat %(infile)s | intersectBed -a stdin -b %(ensembl_transcripts)s -v -s > %(outfile)s;
                   echo "transcripts without ensembl coding overlap: " > %(outfile)s.count; 
                   cat %(outfile)s | wc -l >> %(outfile)s.count;"""
    P.run()

Example #6

0

Show file

File: pipeline_rnaseq_geneset.py Project: jmadzo/cgat

def addMissingNoncodingTranscripts(infile, outfile):
    """Add ensembl noncoding entries that are missing from *infile*.

    Entries of ``PARAMS["ensembl_noncoding_gtf"]`` that have no match in
    *infile* under ``intersectBed -v -s -f 1 -r`` are written to the fixed
    path ``transcripts/missing_ensembl_noncoding_transcripts.gtf``. That
    file is then concatenated with *infile*, sorted by columns 1 and 4,
    and written to *outfile*.
    """
    ensembl_noncoding = PARAMS["ensembl_noncoding_gtf"]
    statement = """intersectBed -a %(ensembl_noncoding)s -b %(infile)s  -v -s -f 1 -r > transcripts/missing_ensembl_noncoding_transcripts.gtf;
                   cat %(infile)s transcripts/missing_ensembl_noncoding_transcripts.gtf | sort -k1,1 -k4,4n
                   > %(outfile)s;"""
    P.run()

Example #7

0

Show file

File: mapping_titration.py Project: BioinformaticsArchive/cgat

def buildBAMStats( infile, outfile ):
    '''Count number of reads mapped, duplicates, etc.

    Runs bam2stats.py with *infile* on stdin and *outfile* on stdout.
    Additional output files are named ``<outfile>.%s``; the doubled ``%%``
    keeps a literal ``%s`` in the command after string interpolation.
    '''
    to_cluster = USECLUSTER
    scriptsdir = PARAMS["general_scriptsdir"]
    statement = '''python %(scriptsdir)s/bam2stats.py --force 
                   --output-filename-pattern=%(outfile)s.%%s < %(infile)s > %(outfile)s'''
    P.run()

Example #8

0

Show file

File: pipeline_variant_annotation.py Project: jmadzo/cgat

def loadEffects(infile, outfile):
    '''Load transcript effects into tables.

    *infile* itself is loaded into the table ``<root>_effects``, where
    *root* is *infile* with the last ``len(".effects.gz")`` characters cut
    off. After that, the companion files ``<infile>.<suffix>.gz`` for the
    suffixes cds, intron, splicing and translation are loaded into
    ``<root>_effects_<suffix>``.

    The first command overwrites *outfile*; the per-suffix commands append
    to it.
    '''

    # Sliced by length only: the suffix itself is not checked.
    root = infile[:-len(".effects.gz")]

    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
              --from-zipped \
              --index=transcript_id \
              --table=%(root)s_effects \
    < %(infile)s > %(outfile)s
    '''
    P.run()

    for suffix in ("cds", "intron", "splicing", "translation"):

        # One load per companion file; output is appended (>>) to outfile.
        statement = '''
        gunzip < %(infile)s.%(suffix)s.gz
        | python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
        --allow-empty
        --index=transcript_id 
        --table=%(root)s_effects_%(suffix)s 
        --ignore-column=seq_na
        --ignore-column=seq_aa
        >> %(outfile)s
        '''
        P.run()

Example #9

0

Show file

File: pipeline_fastqToBigWig.py Project: Charlie-George/cgat

def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.

    For every file in *infiles* the companion statistics file is looked
    up by replacing the ``.bam`` suffix with ``.dupstats``. Missing
    statistics files are reported with a warning and skipped. Comment
    lines (starting with ``#``) and blank lines are ignored; the first
    remaining line is taken as the header and the second as the data row.
    A ``track`` column, derived from the file name without ``.dedup.bam``,
    is prepended to each row.

    The merged table is written to a temporary file, loaded with
    csv2db.py into the table derived from *outfile*, and the temporary
    file is removed afterwards.
    '''
    # Join data for all tracks into single file. A temporary file is used
    # instead of a fixed name in the working directory so that concurrent
    # or repeated runs cannot overwrite each other's data.
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        # Close the statistics file as soon as it has been read.
        with open(statfile, "r") as statf:
            lines = [x for x in statf.readlines()
                     if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --index=track
                      --table=%(tablename)s 
                > %(outfile)s '''
    P.run()
    os.unlink(tmpfilename)

Example #10

0

Show file

File: pipeline_fastqToBigWig.py Project: Charlie-George/cgat

def loadPicardAlignStats(infiles, outfile):
    '''Combine Picard alignment statistics of all tracks and load them.

    Each existing file in *infiles* contributes all of its data rows;
    files that do not exist are reported with a warning and skipped.
    Lines starting with ``#`` and blank lines are dropped. The header
    (first remaining line) is written once, taken from the first file
    that is processed. Every row is prefixed with a ``track`` column
    holding the file's base name without ``.alignstats``.

    The combined table goes to a temporary file, is loaded through
    csv2db.py into the table derived from *outfile*, and the temporary
    file is deleted at the end.
    '''
    # Collect the rows of all tracks in one temporary file.
    outf = P.getTempFile()
    header_pending = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        raw = open(f, "r").readlines()
        lines = [x for x in raw if x.strip() and not x.startswith("#")]
        if header_pending:
            outf.write("%s\t%s" % ("track", lines[0]))
        header_pending = False
        # Everything after the header line is a data row.
        for row in lines[1:]:
            outf.write("%s\t%s" % (track, row))
    outf.close()
    tmpfilename = outf.name

    # Load the combined table into the database.
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --index=track
                      --table=%(tablename)s 
                > %(outfile)s'''
    P.run()
    os.unlink(tmpfilename)

Example #11

0

Show file

File: pipeline_rnaseqlncrna.py Project: Charlie-George/cgat

def createMAFAlignment(infiles, outfile):
    """
    Takes all .axt files in the input directory, filters them to remove
    files based on supplied regular expressions, converts to a single maf file
    using axtToMaf, filters maf alignments under a specified length.

    The input directory is ``PARAMS["phyloCSF_location_axt"]``; only files
    ending in ``net.axt.gz`` whose name does not match
    ``PARAMS["phyloCSF_ignore"]`` are used. *infiles* is not read by this
    function. *outfile* is expected to end in ``.gz``: the uncompressed
    name is used while filtering and both the filtered and the "removed"
    maf files are gzipped at the end.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    # Collect the full paths of all axt files that are not excluded.
    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    # From here on axt_files is a single space-separated string.
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_annotations_dir"],
                                  PARAMS_ANNOTATIONS["interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS_ANNOTATIONS["interface_contigs"])

    # tmpf1 holds the concatenated axt data, tmpf2 the axtToMaf output.
    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # concatenate the decompressed axt files, then convert them to maf
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 "  -tPrefix=%(target_genome)s."
                 "  -qPrefix=%(query_genome)s."
                 "  %(tmpf1)s"
                 "  %(target_contigs)s"
                 "  %(query_contigs)s"
                 "  %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    # The result is indexed below as (blocks ignored, blocks written).
    filtered = PipelineLncRNA.filterMAF(tmpf2,
                                        outfile,
                                        removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    os.unlink(tmpf1)
    os.unlink(tmpf2)
    to_cluster = False
    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()

Example #12

0

Show file

File: PipelineMotifs.py Project: lesheng/cgat

def runGLAM2SCAN(infiles, outfile):
    '''Run glam2scan on all intervals and motifs.

    *infiles* is a triple ``(controlfile, dbfile, motiffiles)``. The first
    element is ignored: the control file name is rebuilt from *dbfile* by
    cutting off ``len(".fasta")`` characters and appending
    ``.controlfasta``. A P.PipelineError is raised if that file is missing.

    An existing *outfile* is removed first. For every motif file a marker
    line ``:: motif = <name> ::`` is appended to *outfile*, followed by the
    output of glam2scan run on the concatenation of *dbfile* and the
    control file.
    '''

    to_cluster = True
    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles
    # The unpacked controlfile is overwritten with a name derived from dbfile.
    controlfile = dbfile[:-len(".fasta")] + ".controlfasta"
    if not os.path.exists(controlfile):
        raise P.PipelineError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    # Start from a clean file because everything below appends to it.
    if os.path.exists(outfile):
        os.remove(outfile)

    for motiffile in motiffiles:
        # Write the marker and close the handle before the command appends.
        of = IOTools.openFile(outfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s ::\n" % motif)
        of.close()

        statement = '''
        cat %(dbfile)s %(controlfile)s | %(execglam2scan)s -2 -n %(glam2scan_results)i n %(motiffile)s - >> %(outfile)s
        '''
        P.run()

Example #13

0

Show file

File: pipeline_rnaseqlncrna.py Project: Charlie-George/cgat

def extractLncRNAFastaAlignments(infiles, outfile):
    """
    Receives a MAF file containing pairwise alignments and a gtf12 file
    containing intervals. Outputs a single fasta file containing aligned
    sequence for each interval.

    *infiles* is a pair ``(bed_file, maf_file)``. The MAF file is
    decompressed into a temporary file, which is handed to
    PipelineLncRNA.extractMAFGeneBlocks together with the genome file and
    the target/query genome names, and deleted afterwards. The value
    returned by that call is logged as the number of gene models.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)
    os.unlink(maf_tmp)

Example #14

0

Show file

File: pipeline_rnaseqlncrna.py Project: Charlie-George/cgat

    def buildFilteredLncRNAGeneSet(infile, outfile):
        '''
        Filter single-exon lncRNA from a gzipped gene set.

        Depending on PARAMS["filtering_remove_single_exon"] this will:
        i) "transcripts": remove all single exon transcripts from all
        lncrna models
        ii) "loci": remove lncrna loci that only contain single exon
        transcripts
        iii) any false value (e.g. None): leave all single-exon and
        multi-exon loci in outfile (the file is copied unchanged)

        Any other value raises a ValueError.
        '''

        mode = PARAMS["filtering_remove_single_exon"]

        if not mode:
            E.info("Both multi-exon and single-exon lncRNA are retained!")
            statement = ("cp %(infile)s %(outfile)s")
        elif mode == "loci":
            E.info("Warning: removing loci with only single-exon transcripts")
            # -v drops the records flagged as single-exon loci; the pipe
            # before gzip is required, otherwise grep treats "gzip" as a
            # file name to search.
            statement = ("zcat %(infile)s |"
                         " grep -v 'exon_status_locus \"s\"' |"
                         " gzip > %(outfile)s")
        elif mode == "transcripts":
            E.info("Warning: removing all single-exon"
                   " transcripts from lncRNA set")
            # The pattern requires a blank after exon_status, so it does
            # not match the exon_status_locus attribute.
            statement = ("zcat %(infile)s |"
                         " grep -v 'exon_status \"s\"' |"
                         " gzip > %(outfile)s")
        else:
            raise ValueError("Unrecognised parameter %s" % mode)
        P.run()

Example #15

0

Show file

File: pipeline_motifs.py Project: BioinformaticsArchive/cgat

def exportMotifLocations( infiles, outfile ):
    '''Export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.

    The list of motifs is read from the table ``motif_info``. For each
    motif, the matching rows of every table derived from *infiles* are
    written to a temporary file, which is merged with ``mergeBed -nms``
    and gzipped to ``<exportdir>/motifs/<motif>.bed.gz``.

    The *outfile* argument is not used as a destination: the name is
    reassigned inside the loop for every motif.
    '''

    dbh = connect()
    cc = dbh.cursor()

    motifs = [ x[0] for x in cc.execute( "SELECT motif FROM motif_info" ).fetchall()]

    
    for motif in motifs:

        tmpf = P.getTempFile(".")
        
        for infile in infiles:
            table = P.toTable(infile) 
            track = P.snip( table, "_mast" )
            # The query text is built by string interpolation of the local
            # names track, table and motif.
            for x in cc.execute( """SELECT contig, start, end, '%(track)s', evalue
                                   FROM %(table)s WHERE motif = '%(motif)s' AND start IS NOT NULL""" % locals() ):
                tmpf.write( "\t".join( map(str, x) ) + "\n" )
        tmpf.close()

        # Overwrites the outfile parameter with the per-motif destination.
        outfile = os.path.join( PARAMS["exportdir"], "motifs", "%s.bed.gz" % motif )
        tmpfname = tmpf.name 

        statement = '''mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s'''
        P.run()

        os.unlink( tmpf.name )

Example #16

0

Show file

File: PipelineMotifs.py Project: lesheng/cgat

def collectMEMEResults(tmpdir, target_path, outfile):
    '''Move the results of a MEME run from *tmpdir* to *target_path*.

    An existing *target_path* is deleted before *tmpdir* is moved
    there. The file ``meme.txt`` inside the moved directory is copied
    to *outfile*, and every ``.eps`` image in the directory is
    converted to a ``.png`` file of the same base name.'''

    # Make sure the parent directory exists; an OSError from makedirs
    # (for example "file exists") is ignored.
    parent = os.path.dirname(target_path)
    try:
        os.makedirs(parent)
    except OSError:
        pass

    # Replace whatever is currently stored at the destination.
    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "meme.txt"), outfile)

    # One conversion command per eps image.
    for epsfile in glob.glob(os.path.join(target_path, "*.eps")):
        pngfile = os.path.splitext(epsfile)[0] + ".png"
        statement = '''convert %(epsfile)s %(pngfile)s '''
        P.run()

Example #17

0

Show file

File: PipelineMotifs.py Project: lesheng/cgat

def runTomTom(infile, outfile):
    '''Compare ab-initio motifs against tomtom.

    tomtom is run on *infile* against the databases listed in
    ``PARAMS["tomtom_databases"]``, writing into a temporary directory;
    its standard output goes to ``<outfile>.log``. The temporary directory
    is then moved to ``<exportdir>/tomtom/<outfile>`` (replacing anything
    already there) and ``tomtom.txt`` from it is copied to *outfile*.

    If *infile* is empty, nothing is computed and *outfile* is only
    touched.
    '''

    to_cluster = True
    databases = " ".join(P.asList(PARAMS["tomtom_databases"]))

    target_path = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", outfile)

    if IOTools.isEmpty(infile):
        E.warn("input is empty - no computation performed")
        P.touch(outfile)
        return

    # Create the scratch directory only once it is certain that tomtom
    # will run; creating it before the check above would leave an unused
    # directory behind for every empty input.
    tmpdir = P.getTempDir(".")

    statement = '''
           tomtom %(tomtom_options)s -oc %(tmpdir)s %(infile)s %(databases)s > %(outfile)s.log
    '''

    P.run()

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "tomtom.txt"), outfile)

Example #18

0

Show file

File: pipeline_transcriptome.py Project: BioinformaticsArchive/cgat

def makeSegments( infile, outfile ):
    '''Compute value histograms for the entries of a gzipped gff/gtf file.

    NOTE(review): the original summary read "compute intron overrun",
    which does not match the commands below - confirm the intended purpose.

    Two gff2histogram.py runs (``--method=values``) are performed on
    *infile*: the first on the position-sorted input, overwriting
    *outfile*; the second after merging transcripts with gtf2gtf.py,
    appending to *outfile*. Side files follow the patterns
    ``<outfile>.%s`` and ``<outfile>_genes.%s``; both runs log to
    ``<outfile>.log``.
    '''

    to_cluster = True

    statement = '''gunzip < %(infile)s 
    | %(scriptsdir)s/gff_sort pos 
    | python %(scriptsdir)s/gff2histogram.py 
		--method=values 
		--output-filename-pattern="%(outfile)s.%%s"
		--force 
		--log=%(outfile)s.log 
    > %(outfile)s 
    '''
    P.run()

    # Second pass on merged transcripts; the result is appended (>>).
    statement = '''gunzip 
    < %(infile)s 
    | python %(scriptsdir)s/gtf2gtf.py --sort=position+gene
    | python %(scriptsdir)s/gtf2gtf.py --merge-transcripts 
    | python %(scriptsdir)s/gtf2gtf.py --sort=gene
    | python %(scriptsdir)s/gff2histogram.py 
		--method=values 
		--force 
		--output-filename-pattern="%(outfile)s_genes.%%s" 
		--log=%(outfile)s.log
    >> %(outfile)s'''
    P.run()

Example #19

0

Show file

File: pipeline_transcriptome.py Project: BioinformaticsArchive/cgat

def loadRepeatInformation( infiles, outfile ):
    '''Load repeat coverage per contig into a database table.

    *infiles* is a pair ``(repeatsfile, indexfile)``. Columns 1 and 4 of
    *indexfile* are turned into bed intervals ``<col1> 0 <col4>`` in a
    temporary file. The gzipped *repeatsfile* is converted to bed,
    intersected with those intervals by coverageBed, given a header line
    and loaded with csv2db.py. The table name is *outfile* with the last
    ``len(".load")`` characters removed. The temporary file is deleted at
    the end.
    '''
    
    to_cluster = True

    table = outfile[:-len(".load")]

    repeatsfile, indexfile = infiles

    tmpfilename = P.getTempFilename( "." )

    # %% and \\ are escapes: the command receives %s, %i, \t and \n.
    statement = '''awk '{printf("%%s\\t0\\t%%i\\n", $1, $4)}' < %(indexfile)s > %(tmpfilename)s'''
    P.run()

    statement = '''
        gunzip < %(repeatsfile)s 
        | python %(scriptsdir)s/gff2bed.py -v 0 
        | coverageBed -a stdin -b %(tmpfilename)s
        | awk 'BEGIN { printf("contig\\tstart\\tend\\tnover_entries\\tnover_bases\\tlength\\tpover\\n" );} {print;}'
        |python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              --table=%(table)s 
        > %(outfile)s
    '''
    P.run()

    os.unlink( tmpfilename )

Example #20

0

Show file

File: pipeline_rnaseqdiffexpression.py Project: jmadzo/cgat

def buildTranscriptLevelReadCounts(infiles, outfile):
    '''Count reads falling into transcripts of protein coding gene models.

    *infiles* is a pair ``(bamfile, geneset)``. The gzipped gene set is
    piped into gtf2table.py with ``--reporter=transcripts`` and three
    counters: ``length``, a read counter and ``read-coverage``. The read
    counter is ``readpair-counts`` when BamTools.isPaired(bamfile) is true
    and ``read-counts`` otherwise. The gzipped table is written to
    *outfile*, the log to ``<outfile>.log``.

    .. note::
       In paired-end data sets each mate will be counted. Thus
       the actual read counts are approximately twice the fragment
       counts.

    '''
    bamfile, geneset = infiles

    if BamTools.isPaired(bamfile):
        counter = 'readpair-counts'
    else:
        counter = 'read-counts'

    statement = '''
    zcat %(geneset)s
    | python %(scriptsdir)s/gtf2table.py
          --reporter=transcripts
          --bam-file=%(bamfile)s
          --counter=length
          --prefix="exons_"
          --counter=%(counter)s
          --prefix=""
          --counter=read-coverage
          --prefix=coverage_
          --min-mapping-quality=%(counting_min_mapping_quality)i
          --multi-mapping=ignore
          --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''

    P.run()

Example #21

0

Show file

File: pipeline_mapping_benchmark.py Project: lesheng/cgat

def buildPicardAlignStats(infile, outfile):
    '''Gather BAM file alignment statistics using Picard.

    Builds a CollectAlignmentSummaryMetrics command with *infile* as INPUT
    and *outfile* as OUTPUT.
    '''
    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    # The template is interpolated with locals() right here. The doubled
    # %% leaves %(samtools_genome)s in the text for a later substitution.
    statement = '''CollectAlignmentSummaryMetrics INPUT=%(infile)s REFERENCE_SEQUENCE=%%(samtools_genome)s ASSUME_SORTED=true OUTPUT=%(outfile)s VALIDATION_STRINGENCY=SILENT ''' % locals(
    )
    P.run()

Example #22

0

Show file

File: PipelineMotifs.py Project: lesheng/cgat

def runBioProspector(infiles, outfile, dbhandle):
    '''Run bioprospector for motif discovery.

    The sequences are written to a temporary fasta file by
    writeSequencesForIntervals, using the proportion configured in
    ``PARAMS["bioprospector_proportion"]`` and the "dust" masker. The
    track name is *outfile* with the last ``len(".bioprospector")``
    characters removed; *infiles* is not read here.

    If no sequences were written, BioProspector is skipped and *outfile*
    is only touched. The temporary fasta file is removed in both cases.
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.getTempFilename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(track,
                                      tmpfasta,
                                      dbhandle,
                                      full=True,
                                      masker="dust",
                                      proportion=PARAMS["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        P.touch(outfile)
    else:
        statement = '''
    BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log
    '''
        P.run()

    os.unlink(tmpfasta)

Example #23

0

Show file

File: pipeline_rnaseqdiffexpression.py Project: jmadzo/cgat

def buildGeneLevelReadCounts(infiles, outfile):
    '''Compute read counts and coverage of exons with reads.

    *infiles* is a pair ``(bamfile, exons)``. The gzipped exon file is
    piped into gtf2table.py with ``--reporter=genes`` and three counters:
    ``length``, a read counter and ``read-coverage``. The read counter is
    ``readpair-counts`` when BamTools.isPaired(bamfile) is true and
    ``read-counts`` otherwise. The gzipped table is written to *outfile*,
    the log to ``<outfile>.log``.
    '''

    bamfile, exons = infiles

    if BamTools.isPaired(bamfile):
        counter = 'readpair-counts'
    else:
        counter = 'read-counts'

    # ignore multi-mapping reads
    statement = '''
    zcat %(exons)s
    | python %(scriptsdir)s/gtf2table.py
          --reporter=genes
          --bam-file=%(bamfile)s
          --counter=length
          --prefix="exons_"
          --counter=%(counter)s
          --prefix=""
          --counter=read-coverage
          --prefix=coverage_
          --min-mapping-quality=%(counting_min_mapping_quality)i
          --multi-mapping=ignore
          --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''

    P.run()

Example #24

0

Show file

File: pipeline_rnaseqlncrna.py Project: Charlie-George/cgat

def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis - don't do if they
    have an ensembl id - sort by transcript

    The gene ids in *infile* are scanned for the substring ``ENSG``; the
    last record read decides which command is run. With an ensembl id
    the file is only stripped of comment lines and sorted by gene.
    Otherwise genes and transcripts are renumbered with the patterns
    ``GEN<prefix>%i`` and ``TRAN<prefix>%i``, where ``<prefix>`` is
    *outfile* without ``.gtf.gz``, and then sorted by gene.

    Raises ValueError if *infile* contains no GTF records.
    '''
    # Scan the records once and remember only the decision, so the
    # command is built a single time and the handle can be closed.
    has_ensembl_id = None
    inf = IOTools.openFile(infile)
    try:
        for gtf in GTF.iterator(inf):
            has_ensembl_id = "ENSG" in gtf.gene_id
    finally:
        inf.close()

    # Without any record no command can be chosen; fail with a clear
    # message instead of running with an undefined statement.
    if has_ensembl_id is None:
        raise ValueError("no GTF entries found in %s" % infile)

    if has_ensembl_id:
        statement = '''zcat %(infile)s | grep -v "#"
                        | python %(scriptsdir)s/gtf2gtf.py 
                        --sort=gene
                        --log=%(outfile)s.log
                        | gzip > %(outfile)s'''
    else:
        gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
        transcript_pattern = gene_pattern.replace("GEN", "TRAN")
        # %%i leaves a literal %i in the command for gtf2gtf.py.
        statement = '''zcat %(infile)s | python %(scriptsdir)s/gtf2gtf.py 
                           --renumber-genes=%(gene_pattern)s%%i 
                           | python %(scriptsdir)s/gtf2gtf.py
                           --renumber-transcripts=%(transcript_pattern)s%%i 
                           | python %(scriptsdir)s/gtf2gtf.py
                           --sort=gene 
                           --log=%(outfile)s.log
                          | gzip > %(outfile)s'''

    P.run()

Example #25

0

Show file

File: pipeline_metagenomeassembly.py Project: Charlie-George/cgat

def runSpades(infile, outfile):
    '''
    run spades on each track

    The command is produced by
    PipelineMetagenomeAssembly.Spades().build(infile); *outfile* is not
    referenced in this function.
    '''
    # Job option string requesting mem_free=30G.
    job_options = " -l mem_free=30G"
    statement = PipelineMetagenomeAssembly.Spades().build(infile)
    P.run()

Example #26

0

Show file

File: pipeline_rnaseqlncrna.py Project: Charlie-George/cgat

def loadLncRNAClass(infile, outfile):
    '''
    load the lncRNA classifications

    For every transcript in *infile* the transcript id, gene id and
    source of its first record are written to a temporary tab-separated
    file, which is loaded with csv2db.py under the column names
    transcript_id, gene_id and class. The table name is derived from
    the base name of *infile* without ``.gtf.gz``. The temporary file is
    removed after loading.
    '''
    tablename = os.path.basename(
        filenameToTablename(P.snip(infile, ".gtf.gz")))

    to_cluster = False
    # just load each transcript with its classification
    temp = P.getTempFile()
    inf = IOTools.openFile(infile)
    for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
        temp.write("%s\t%s\t%s\n" % (
            transcript[0].transcript_id,
            transcript[0].gene_id,
            transcript[0].source))
    # Release the input handle and flush the table before it is loaded.
    inf.close()
    temp.close()

    inf_1 = temp.name
    statement = ("python %(scriptsdir)s/csv2db.py"
                 "  -t %(tablename)s"
                 "  --log=%(outfile)s.log"
                 "  --header=transcript_id,gene_id,class"
                 " < %(inf_1)s > %(outfile)s")
    P.run()

    # The temporary table is no longer needed once it has been loaded.
    os.unlink(inf_1)

Example #27

0

Show file

File: pipeline_metagenomeassembly.py Project: Charlie-George/cgat

def runSoapdenovo(infile, outfile):
    '''
    run soapdenovo

    The command is produced by
    PipelineMetagenomeAssembly.SoapDenovo2().build(infile); *outfile* is
    not referenced in this function.
    '''
    # Job option string requesting mem_free=30G.
    job_options = "-l mem_free=30G"
    statement = PipelineMetagenomeAssembly.SoapDenovo2().build(infile)
    P.run()

Example #28

0

Show file

File: pipeline_metagenomeassembly.py Project: Charlie-George/cgat

def runIdba(infile, outfile):
    '''
    run idba on each track

    The command is produced by
    PipelineMetagenomeAssembly.Idba().build(infile); *outfile* is not
    referenced in this function.
    '''
    # Job option string requesting mem_free=30G.
    job_options = " -l mem_free=30G"
    statement = PipelineMetagenomeAssembly.Idba().build(infile)
    P.run()

Example #29

0

Show file

File: pipeline_transcriptome.py Project: BioinformaticsArchive/cgat

def buildAnnotations( infiles, outfile ):
    '''Annotate transcripts by location (intergenic, intronic, ...).

    *infiles* is a pair ``(infile, annotation)``. The gzipped *infile* is
    sorted by gene and handed to the command in ``%(cmd-farm)s``, which is
    told to split at column 1 and to run the quoted gtf2table.py command.
    *annotation* is passed to gtf2table.py as ``--filename-gff``. The
    result is gzipped into *outfile*; logs go to ``<outfile>.log``.
    '''
    
    infile, annotation = infiles

    statement = '''gunzip 
    < %(infile)s 
    | python %(scriptsdir)s/gtf2gtf.py --sort=gene
    | %(cmd-farm)s --split-at-column=1 --output-header --log=%(outfile)s.log --max-files=60 
	"python %(scriptsdir)s/gtf2table.py 
		--counter=position 
		--counter=classifier 
		--section=exons 
		--section=introns 
		--counter=length 
		--counter=splice 
		--counter=composition-na 
		--counter=splice-comparison 
		--log=%(outfile)s.log 
                --filename-format=gff
		--filename-gff=%(annotation)s 
		--genome-file=%(genome_dir)s/%(genome)s"
    | gzip
    > %(outfile)s
    '''
    P.run()

Example #30

0

Show file

File: pipeline_polyphen.py Project: jmadzo/cgat

def buildBenchmarkInput(infile, outfile):
    '''Build the benchmark input by extracting sequences and renaming ids.

    The distinct ``(transcript_id, protein_id)`` pairs of the table
    ``peptide_info`` in ``PARAMS["database"]`` are written, with a header
    line, to a temporary tab-separated file. The command then runs
    extract_fasta.pl with *infile* on ``cds.fasta``, pipes the result
    through fasta2variants.py and substitute_tokens.py (which receives the
    temporary file via ``--apply``) and writes *outfile*. The temporary
    file is removed afterwards.
    '''

    tmpfile = P.getTempFile()

    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()
    statement = '''
    SELECT DISTINCT transcript_id, protein_id FROM peptide_info
    '''
    cc.execute(statement)
    tmpfile.write("transcript_id\tprotein_id\n")
    tmpfile.write("\n".join(["\t".join(x) for x in cc]))
    tmpfile.write("\n")
    # Close the file so that the mapping is flushed to disk before the
    # external command reads it.
    tmpfile.close()
    dbhandle.close()
    tmpfilename = tmpfile.name

    # Every stage is connected by a pipe, and both python stages use the
    # same scriptsdir placeholder.
    statement = '''
    perl %(scriptsdir)s/extract_fasta.pl %(infile)s
    < cds.fasta 
    | python %(scriptsdir)s/fasta2variants.py --is-cds  
    | python %(scriptsdir)s/substitute_tokens.py 
             --apply=%(tmpfilename)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)

Example #31

0

Show file

File: pipeline_rnaseqlncrna.py Project: santayana/cgat

def buildFullGeneSet(infiles, outfile):
    '''
    produces a final gene set that can be used for 
    differential expression analysis and comparisons
    between protein coding and lncRNA transcripts

    All files in *infiles* are decompressed and concatenated, every
    occurrence of the text ``Cufflinks`` is replaced by
    ``protein_coding``, and the result is sorted by gene with gtf2gtf.py
    and gzipped into *outfile*. The log goes to ``<outfile>.log``.
    '''
    # change the source to be in keeping with classification
    # of transcripts - f coming from cufflinks assembly
    # The file names are joined with blanks for use on the command line.
    infs = " ".join(infiles)
    statement = ("zcat %(infs)s |"
                 " sed 's/Cufflinks/protein_coding/g' |"
                 " python %(scriptsdir)s/gtf2gtf.py"
                 "  --sort=gene"
                 "  --log=%(outfile)s.log |"
                 " gzip  > %(outfile)s")
    P.run()

Example #32

0

Show file

def lowerStringencyDeNovos(infiles, outfile):
    '''Filter lower stringency de novo variants based on provided jexl expression.

    *infiles* is a pair ``(infile, pedfile)``. *pedfile* is read as a
    tab-separated file with the columns family, sample, father, mother,
    sex and status. The row whose status is ``'2'`` supplies the child,
    father and mother sample names used in the select expression; if
    several rows qualify, the last one wins. If no row qualifies, the
    string interpolation below fails with a KeyError.
    '''
    to_cluster = USECLUSTER
    infile, pedfile = infiles
    # The reader is lazy, so the rows are consumed while the file is
    # still open; the handle is closed when the block is left.
    with open(pedfile) as pedigree_file:
        pedigree = csv.DictReader(
            pedigree_file,
            delimiter='\t',
            fieldnames=['family', 'sample', 'father', 'mother', 'sex', 'status'])
        for row in pedigree:
            if row['status'] == '2':
                father = row['father']
                mother = row['mother']
                child = row['sample']
    # Interpolated with locals() here; the doubled %% leaves
    # %(bwa_index_dir)s and %(genome)s in place for a later substitution.
    statement = '''GenomeAnalysisTK -T SelectVariants -R %%(bwa_index_dir)s/%%(genome)s.fa --variant %(infile)s -select 'vc.getGenotype("%(child)s").getPL().1==0&&vc.getGenotype("%(father)s").getPL().0==0&&vc.getGenotype("%(mother)s").getPL().0==0&&(SNPEFF_IMPACT=="HIGH"||SNPEFF_IMPACT=="MODERATE")' > %(outfile)s''' % locals(
    )
    P.run()

Example #33

0

Show file

File: pipeline_metagenomebenchmark.py Project: lesheng/cgat

def alignContigsToReference(infile, outfile, param):
    '''
    align the contigs to the reference genomes
    using nucmer

    *infile* is used as the reference file and *param* as the contig
    file. nucmer is run with the output prefix taken from the base name
    of *outfile* without ``.delta``; afterwards the file with the base
    name of *outfile* is moved into the directory ``alignment.dir``.
    '''
    # Python 2 print statement; writes both arguments to stdout.
    print infile, param

    to_cluster = True

    reffile, contigfile = infile, param
    pattern = P.snip(os.path.basename(outfile), ".delta")
    statement = '''nucmer -p %(pattern)s %(reffile)s %(contigfile)s'''
    P.run()
    # Only the base name is moved, i.e. the file is looked up in the
    # current working directory.
    outf = os.path.basename(outfile)
    statement = '''mv %(outf)s alignment.dir'''
    P.run()

Example #34

0

Show file

File: pipeline_exome.py Project: yangjl/cgat

def filterVariants(infiles, outfile):
    '''Filter variants based on provided jexl expression.

    *infiles* is a pair ``(infile, pedfile)``. *pedfile* is read as a
    tab-separated file with the columns family, sample, father, mother,
    sex and status. The row whose status is ``'2'`` supplies the child,
    father and mother sample names used in the select expression; if
    several rows qualify, the last one wins. If no row qualifies, the
    string interpolation below fails with a KeyError.
    '''
    to_cluster = USECLUSTER
    infile, pedfile = infiles
    # Open the pedigree file named by the variable; a quoted placeholder
    # would be taken as a literal file name. The reader is lazy, so the
    # rows are consumed while the file is still open.
    with open(pedfile) as pedigree_file:
        pedigree = csv.DictReader(
            pedigree_file,
            delimiter='\t',
            fieldnames=['family', 'sample', 'father', 'mother', 'sex', 'status'])
        for row in pedigree:
            if row['status'] == '2':
                father = row['father']
                mother = row['mother']
                child = row['sample']
    # Interpolated with locals() here; the doubled %% leaves
    # %(bwa_index_dir)s and %(genome)s in place for a later substitution.
    statement = '''GenomeAnalysisTK -T SelectVariants -R %%(bwa_index_dir)s/%%(genome)s.fa --variant %(infile)s -select 'vc.getGenotype("%(father)s").getDP()>=10&&vc.getGenotype("%(mother)s").getDP()>=10&&vc.getGenotype("%(father)s").getAB()<0.05&&vc.getGenotype("%(mother)s").getAB()<0.05&&vc.getGenotype("%(child)s").getAB()>=0.25&&vc.getGenotype("%(child)s").getPL().0>20&&vc.getGenotype("%(child)s").getPL().1==0&&vc.getGenotype("%(child)s").getPL().2>0&&vc.getGenotype("%(father)s").getPL().0==0&&vc.getGenotype("%(father)s").getPL().1>20&&vc.getGenotype("%(father)s").getPL().2>20&&vc.getGenotype("%(mother)s").getPL().0==0&&vc.getGenotype("%(mother)s").getPL().1>20&&vc.getGenotype("%(mother)s").getPL().2>20&&vc.getGenotype("%(child)s").getAD().1>=3' > %(outfile)s''' % locals(
    )
    P.run()

Example #35

0

Show file

File: pipeline_transcriptome.py Project: lesheng/cgat

def loadOverlap(infile, outfile):
    '''Load results of overlap computation.

    Lines of *infile* containing the whole word ``na`` are dropped; the
    rest is loaded with csv2db.py, with the columns set1 and set2 mapped
    to strings and indexed. The table name is *outfile* with the last
    ``len("_table.load")`` characters removed.
    '''

    tablename = outfile[:-len("_table.load")]
    # "\\b" reaches grep as \b, a word-boundary anchor.
    statement = '''
	grep -v "\\bna\\b" 
        < %(infile)s 
        |python %(scriptsdir)s/csv2db.py %(csv2db_options)s
             --map set1:str 
             --map set2:str 
             --index=set1 
             --index=set2 
             --table=%(tablename)s
        > %(outfile)s
    '''
    P.run()

Example #36

0

Show file

File: pipeline_transcriptome.py Project: lesheng/cgat

def runFrameFinder(infile, outfile):
    '''Run FrameFinder.

    search on both strands (-r TRUE). Note that CPC default is: only forward strand.

    *infile* is fed to framefinder through /dev/stdin and the gzipped
    output is written to *outfile*. The binary and its model file are
    taken from a hard-coded CPC installation directory.
    '''
    # Fixed installation path; not configurable from this function.
    cpc_dir = "/ifs/apps/bio/cpc-0.9-r2"
    statement = '''
    cat %(infile)s |
    %(cpc_dir)s/libs/estate/bin/framefinder
    -r TRUE -w %(cpc_dir)s/data/framefinder.model /dev/stdin
    | gzip
     > %(outfile)s
    '''

    P.run()

Example #37

0

Show file

def buildCodingExons( infile, outfile ):
    '''Build a collection of transcripts from the protein-coding portion of the ENSEMBL gene set.

    All exons are kept

    The gzipped *infile* is filtered to lines whose second column is
    ``protein_coding`` and whose third column is ``exon``, passed through
    ``gtf2gtf.py --remove-duplicates=gene`` and gzipped into *outfile*.
    The log goes to ``<outfile>.log``.
    '''

    to_cluster = True

    statement = '''
    gunzip < %(infile)s 
    | awk '$2 == "protein_coding"' 
    | awk '$3 == "exon"' 
    | python %(scriptsdir)s/gtf2gtf.py --remove-duplicates=gene --log=%(outfile)s.log 
    | gzip > %(outfile)s
    '''
    P.run()

Example #38

0

Show file

File: pipeline_transcriptome.py Project: lesheng/cgat

def makeDistances(infiles, outfile):
    '''Run the ``distance-genes`` counter of ``gtf2table.py``.

    *infiles* is unpacked into exactly two names, ``(infile, annotation)``;
    both are read through ``gunzip``.  *infile* is sorted by gene with
    ``gtf2gtf.py`` and handed to the ``%(cmd-farm)s`` command (options
    ``--split-at-column=1 --max-files=60``), which is given the
    ``gtf2table.py --counter=distance-genes`` command line with
    *annotation* as ``--filename-gff``.  Standard output goes to *outfile*
    and both tools log to ``<outfile>.log``.

    NOTE(review): the previous docstring read "compute intron overrun",
    which does not match the ``distance-genes`` counter used below.
    '''

    infile, annotation = infiles

    # Placeholders are left unexpanded; P.run() is called without arguments
    # and presumably fills them from this function's locals and PARAMS
    # (``cmd-farm`` and ``scriptsdir`` are not defined here).
    statement = '''gunzip
    < %(infile)s 
    | python %(scriptsdir)s/gtf2gtf.py --sort=gene
    | %(cmd-farm)s --split-at-column=1 --output-header --log=%(outfile)s.log --max-files=60 
	"python %(scriptsdir)s/gtf2table.py 
		--counter=distance-genes 
		--log=%(outfile)s.log 
		--filename-gff=<( gunzip < %(annotation)s ) " 
    > %(outfile)s 
    '''
    P.run()

Example #39

0

Show file

File: pipeline_benchmark_rnaseqmappers.py Project: lesheng/cgat

def buildCodingGeneSet(infile, outfile):
    '''Build a gene set with only protein coding transcripts.

    *infile* is read with ``zcat``; lines whose second column equals
    ``protein_coding`` are kept and written gzip-compressed to *outfile*.

    Genes are selected via their gene biotype in the GTF file.
    Note that this set will contain all transcripts of protein
    coding genes, including processed transcripts.

    This set includes UTR and CDS.
    '''

    # NOTE(review): not referenced again here; presumably read by P.run()
    # to decide whether the job is sent to the cluster -- confirm.
    to_cluster = True
    # Placeholders are left unexpanded; P.run() is called without arguments
    # and presumably fills them from this function's locals.
    statement = '''
    zcat %(infile)s | awk '$2 == "protein_coding"' | gzip > %(outfile)s
    '''
    P.run()

Example #40

0

Show file

def convertBed2Psl(infile, outfile):
    """Convert a gzipped bed file to a gzipped psl file.

    The track name is *outfile* without its last ``len(".bed.gz")``
    characters, and the genome is looked up as
    ``PARAMS["<track>_genome"]`` below ``PARAMS["genome_dir"]``.  The
    conversion itself is done by ``bed2psl.py``, logging to
    ``<outfile>.log``.

    Raises:
        IOError: if ``<genomefile>.fasta`` does not exist.
        KeyError: if ``genome_dir`` or ``<track>_genome`` is missing from
            PARAMS (plain dictionary-style lookups are used).
    """

    # NOTE(review): the slice removes 7 characters whatever the real suffix
    # of outfile is; for a psl output the suffix is presumably not
    # ".bed.gz" -- confirm against the task decorator.
    track = outfile[:-len(".bed.gz")]
    genomefile = os.path.join(PARAMS["genome_dir"],
                              PARAMS["%s_genome" % track])
    # The file tested carries a ".fasta" extension, but the message (and the
    # --genome option below) use the extension-less name.
    if not os.path.exists(genomefile + ".fasta"):
        raise IOError("genome %s does not exist" % genomefile)

    # Placeholders are left unexpanded; P.run() is called without arguments
    # and presumably fills them from this function's locals and PARAMS.
    statement = """gunzip < %(infile)s 
    | python %(scriptsdir)s/bed2psl.py 
         --genome=%(genomefile)s
         --log=%(outfile)s.log 
    | gzip > %(outfile)s
    """
    P.run()

Example #41

0

Show file

File: pipeline_vitaminD_annotator.py Project: logust79/cgat-apps

def buildAnnotatorSegments(tmpdir, infile, outfile):
    '''Convert segments in bed format to annotator format.

    *infile* is converted with ``bed2gff.py`` and then
    ``gff2annotator.py --section=segments``; the result is written to
    ``<tmpdir>/segments`` and the log to ``<outfile>.log``.  *outfile*
    itself is only used to name the log file.

    Returns:
        The path of the segments file, ``os.path.join(tmpdir, "segments")``.
    '''

    tmpsegments = os.path.join(tmpdir, "segments")
    to_cluster = True

    statement = '''
        python %(scriptsdir)s/bed2gff.py < %(infile)s |\
	python %(scriptsdir)s/gff2annotator.py --log=%(outfile)s.log --section=segments > %(tmpsegments)s \
    '''

    # Merge locals and PARAMS without relying on Python 2 list
    # concatenation of .items().  PARAMS is applied last, so it still wins
    # on a key clash, exactly as in dict(a.items() + b.items()).
    options = dict(locals())
    options.update(PARAMS.items())
    P.run(**options)

    return tmpsegments

Example #42

0

Show file

File: pipeline_medip.py Project: lesheng/cgat

def mergeDMRWindows(infile, outfile):
    '''Merge overlapping windows.

    *infile* is read with ``zcat`` and piped through
    ``medip_merge_intervals.py --invert``; standard output is
    gzip-compressed into *outfile* and the log goes to ``<outfile>.log``.
    '''

    # NOTE(review): not referenced again here; presumably read by P.run()
    # to decide whether the job is sent to the cluster -- confirm.
    to_cluster = True

    # Placeholders are left unexpanded; P.run() is called without arguments
    # and presumably fills them from this function's locals and PARAMS.
    # "%%s" is an escaped percent sign, so a literal "%s" survives one
    # round of %-interpolation inside --output-filename-pattern.
    statement = '''
    zcat %(infile)s
    | python %(scriptsdir)s/medip_merge_intervals.py
          --log=%(outfile)s.log
          --invert
          --output-filename-pattern=%(outfile)s.%%s.bed.gz
    | gzip
    > %(outfile)s
    '''

    P.run()

Example #43

0

Show file

def reportTotalRNAFunctions(infiles, outfiles):
    '''Report total RNA functions.

    *infiles* is unpacked into ``(rpkm_filename, annotations_filename)`` and
    *outfiles* into ``(expression_filename, diff_filename)``; each must
    therefore hold exactly two entries.  The four names are passed, in that
    order, as positional arguments to ``report_totalRNA_annotations.py``.
    '''

    # USECLUSTER is defined outside this function.
    to_cluster = USECLUSTER

    rpkm_filename, annotations_filename = infiles
    expression_filename, diff_filename = outfiles
    # Placeholders are left unexpanded; P.run() is called without arguments
    # and presumably fills them from this function's locals and PARAMS
    # (``rmaadir`` is not defined here).
    statement = '''
    python %(rmaadir)s/report_totalRNA_annotations.py 
           %(rpkm_filename)s 
           %(annotations_filename)s 
           %(expression_filename)s 
           %(diff_filename)s
    '''

    P.run()

Example #44

0

Show file

def copyEnsemblDb(infile, outfile):
    '''Copy tables from the ENSEMBL database into the rnaseq database.

    The database ``PARAMS["ensembl_db"]`` is attached as ``ensembl`` to
    ``PARAMS["database"]`` and every table listed in
    ``PARAMS["ensembl_tables"]`` is copied with ``CREATE TABLE ... AS
    SELECT``.  Each query is printed before it is executed.  *infile* is
    not used; *outfile* is touched once all tables have been copied.

    The connection is closed even when a copy fails; sqlite3 errors are
    propagated to the caller.
    '''
    table_list = P.asList(PARAMS["ensembl_tables"])
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()
        # Bind the file name instead of splicing it into the SQL text so
        # that quotes in the path cannot break the statement.
        cc.execute("ATTACH DATABASE ? AS ensembl", (PARAMS["ensembl_db"],))
        for table in table_list:
            # Table names cannot be bound as parameters; they come from the
            # pipeline configuration.
            query = """CREATE TABLE %s AS SELECT * FROM ensembl.%s;""" % (
                table, table)
            print(query)
            cc.execute(query)
        cc.close()
        dbhandle.commit()
    finally:
        dbhandle.close()
    # Placeholder is left unexpanded; P.run() is called without arguments
    # and presumably fills it from this function's locals -- confirm.
    statement = """touch %(outfile)s;"""
    P.run()

Example #45

0

Show file

def buildGeneTables(infile, outfile):
    '''Build a gzipped gene table from *infile*.

    If *infile* ends with ``.gff.gz`` its lines are copied unchanged into
    *outfile* behind a tab-separated header line naming the nine GFF
    columns.  Any other input is converted with
    ``fasta2table.py -s sequence`` (log in ``<outfile>.log``).

    Both gzip streams are handled as bytes and closed by ``with`` blocks,
    and the input is streamed line by line instead of being loaded into
    memory as a whole.
    '''
    if infile.endswith(".gff.gz"):
        header = (b"chr\tsource\tfeature\tstart\tend\tscore\tstrand\tframe"
                  b"\tattributes\n")
        # Open the input first so that a missing input does not leave an
        # empty output file behind.
        with gzip.open(infile, "rb") as inf, gzip.open(outfile, "wb") as outf:
            outf.write(header)
            for line in inf:
                outf.write(line)
    else:
        # Placeholders are left unexpanded; P.run() is called without
        # arguments and presumably fills them from this function's locals
        # and PARAMS -- confirm.
        statement = '''zcat %(infile)s | python %(scriptsdir)s/fasta2table.py
        -s sequence
        --log=%(outfile)s.log | gzip > %(outfile)s'''
        P.run()

Example #46

0

Show file

File: pipeline_vitaminDMHC.py Project: logust79/cgat-apps

def runMACS(infile, outfile):
    '''Run ``macs`` on *infile* against its control sample.

    The control is ``pipeline_vitaminD.getControl(track) + ".bam"``.  If
    ``getControl`` raises ``AssertionError`` the function returns without
    running anything, so *outfile* is not created in that case.

    Standard output and standard error of macs are both redirected to
    *outfile* (``>&``), which is also used as the ``--name`` prefix.
    '''

    to_cluster = False

    # NOTE(review): removes the last 7 characters of infile; if infile ends
    # in ".normbam" the track keeps its trailing dot -- confirm this is
    # what getControl() expects.
    track = infile[:-len("normbam")]
    try:
        control = pipeline_vitaminD.getControl(track) + ".bam"
    except AssertionError:
        return

    statement = '''
    macs -t %(infile)s -c %(control)s \
          --name=%(outfile)s \
          --format=bam --tsize=35 --bw=110 --mfold=8 --gsize=6000000 >& %(outfile)s'''

    # Merge locals and PARAMS without relying on Python 2 list
    # concatenation of .items().  PARAMS is applied last, so it still wins
    # on a key clash, exactly as in dict(a.items() + b.items()).
    options = dict(locals())
    options.update(PARAMS.items())
    P.run(**options)

Example #47

0

Show file

def assignEssentialGenesToContigs(infile, outfile):
    '''Assign essential genes to contigs.

    *infile* is decompressed into ``orfs.fa`` inside a temporary directory
    and searched with ``hmmsearch --cut_tc`` against the HMM collection
    named by ``PARAMS["hmmer_hmm"]``.  From the tabular output the first
    three lines are skipped, runs of spaces are squeezed to one and
    fields 1 and 4 are written gzip-compressed to *outfile*.

    The temporary directory is removed whether or not the search succeeds.
    '''
    essential = PARAMS["hmmer_hmm"]
    tempdir = P.getTempDir(".")

    try:
        # Placeholders are left unexpanded; P.run() is called without
        # arguments and presumably fills them from this function's locals.
        statement = '''zcat %(infile)s > %(tempdir)s/orfs.fa;
    hmmsearch --tblout %(tempdir)s/hmm.out --cut_tc
    --notextw  %(essential)s %(tempdir)s/orfs.fa;
    tail -n+4 %(tempdir)s/hmm.out | sed 's/ * / /g' | cut -f 1,4 -d " "
    | gzip > %(outfile)s'''
        P.run()
    finally:
        # Clean up even when the search failed, so that aborted runs do not
        # leave temporary directories behind.
        statement = '''rm -rf %(tempdir)s'''
        P.run()

Example #48

0

Show file

File: pipeline_ancestral_repeats.py Project: santayana/cgat

    def buildGenomeAlignment(infile, outfile):
        '''Remove non-unique alignments in genomic infile.

        The gzipped psl *infile* is sorted on columns 10 and 12 and
        filtered with ``psl2psl.py --method=remove-overlapping-query``,
        then sorted on columns 14 and 16 and filtered with
        ``--method=remove-overlapping-target``.  Both filter steps log to
        ``<outfile>.log``.

        NOTE(review): the gzip stream is *appended* to *outfile* (``>>``),
        so an existing *outfile* is extended rather than replaced --
        confirm this is intended.
        '''

        # Placeholders are left unexpanded; P.run() is called without
        # arguments and presumably fills them from this function's locals
        # and PARAMS.
        statement = '''gunzip < %(infile)s 
        | sort -k10,10 -k12,12n
        | python %(scriptsdir)s/psl2psl.py
        --method=remove-overlapping-query
        --log=%(outfile)s.log
        | sort -k14,14 -k16,16n
        | python %(scriptsdir)s/psl2psl.py
        --method=remove-overlapping-target
        --log=%(outfile)s.log
        | gzip
        >> %(outfile)s
        '''
        P.run()

Example #49

0

Show file

def loadPolyphenMap(infile, outfile):
    '''Load polyphen input data.

    The file read is ``<infile>.map``, not *infile* itself.  It is fed to
    ``csv2db.py``, which is asked to index ``snp_id``,
    ``track,transcript_id``, ``contig,pos``, ``protein_id`` and
    ``transcript_id``.  The table name is ``P.toTable(outfile)`` and the
    output of ``csv2db.py`` is written to *outfile*.
    '''

    table = P.toTable(outfile)
    # Placeholders are left unexpanded; P.run() is called without arguments
    # and presumably fills them from this function's locals and PARAMS.
    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s
              --index=snp_id 
              --index=track,transcript_id
              --index=contig,pos
              --index=protein_id
              --index=transcript_id
              --table=%(table)s 
    < %(infile)s.map
    > %(outfile)s
    '''
    P.run()

Example #50

0

Show file

File: pipeline_cancer_variant_calling.py Project: TomSmithCGAT/Project38

def createRealignIntervals(infiles, outfile):
    '''Run GATK ``RealignerTargetCreator`` and write the result to *outfile*.

    *infiles* is unpacked into exactly two names, ``(infile, reference)``:
    *infile* is passed with ``-I`` and *reference* with ``-R``.  The GATK
    jar is addressed by a hard-coded absolute path and run with a 4 GB
    Java heap (``-Xmx4g``).
    '''

    infile, reference = infiles

    # need to unload java before running GATK as it now runs on java version 7

    # Unlike most tasks, the placeholders are expanded right here with
    # "% locals()" before P.run() is called.
    # NOTE(review): if P.run() interpolates the statement again, a literal
    # "%" in one of the paths would be re-interpreted -- confirm.
    statement = '''module unload apps/java/jre1.6.0_26;
    java -Xmx4g -jar
    /ifs/apps/bio/GATK-2.7-2/GenomeAnalysisTK.jar
    -T RealignerTargetCreator
    -R %(reference)s
    -I %(infile)s
    -o %(outfile)s
    ''' % locals()

    P.run()

Example #51

0

Show file

File: pipeline_benchmark_rnaseqmappers.py Project: santayana/cgat

def mapReadsWithBowtieAgainstTranscriptome(infiles, outfile):
    '''Map reads with bowtie (colour space) against transcriptome data.

    *infiles* is unpacked into exactly three names,
    ``(infile, reffile, contigs)``.  *infile* is decompressed into a
    temporary file and mapped against the index ``<prefix>_cs``, where
    ``prefix`` is *reffile* without ``.fa``.  The SAM stream is passed
    through ``bam2bam.py --set-nh``, its ``@HD`` line is rewritten to
    ``SO:coordinate``, and it is then imported, sorted and indexed with
    samtools.  The sorted output prefix is *outfile* without ``.bam``.

    The temporary file is removed by the last command of the statement,
    i.e. only when the preceding commands have been executed.
    '''

    # Mapping will permit up to one mismatches. This is sufficient
    # as the downstream filter in bams2bam requires the
    # number of mismatches less than the genomic number of mismatches.
    # Change this, if the number of permitted mismatches for the genome
    # increases.

    # Output all valid matches in the best stratum. This will
    # inflate the file sizes due to matches to alternative transcripts
    # but otherwise matches to paralogs will be missed (and such
    # reads would be filtered out).
    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    infile, reffile, contigs = infiles
    track = P.snip(outfile, ".bam")
    prefix = P.snip(reffile, ".fa")

    # Two fixes relative to the previous command line:
    # * "samtools index" is now terminated with ";" -- before, the
    #   following "checkpoint" was part of the same command.
    # * the replacement text of the perl substitution no longer starts
    #   with "\b": in a replacement that escape is a backspace character,
    #   not a word boundary.
    # Placeholders are left unexpanded; P.run() is called without arguments
    # and presumably fills them from this function's locals and PARAMS.
    statement = '''
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam
           -C
           --un /dev/null
           --threads %(bowtie_threads)s
           %(transcriptome_options)s
           --best --strata -a
           %(prefix)s_cs
           %(tmpfile)s
    | python %(scriptsdir)s/bam2bam.py --sam --set-nh --log=%(outfile)s.log
    | perl -p -e "if (/^\\@HD/) { s/\\bSO:\\S+/SO:coordinate/}"
    | samtools import %(contigs)s - -
    | samtools sort - %(track)s;
    checkpoint;
    samtools index %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s
    '''

    P.run()

Example #52

0

Show file

def buildCDNAFasta(infile, outfile):
    '''Load an ENSEMBL cdna FASTA file into an indexed fasta database.

    *infile* is an ENSEMBL cdna file.  It is decompressed, every header
    line is truncated at its first space, and the result is indexed with
    ``index_fasta.py --force`` under the name *outfile* minus its last
    ``len(".fasta")`` characters.  Standard output of the indexer goes to
    ``<dbname>.log``.
    '''
    dbname = outfile[:-len(".fasta")]

    # The perl condition is the regex match /^>/ so that only header lines
    # are truncated.  The previous test, if ("^>"), evaluated a constant
    # non-empty string and was therefore true for every line.
    # Placeholders are left unexpanded; P.run() is called without arguments
    # and presumably fills them from this function's locals and PARAMS.
    statement = '''gunzip
    < %(infile)s
    | perl -p -e 'if (/^>/) { s/ .*//};'
    | python %(scriptsdir)s/index_fasta.py
       --force
    %(dbname)s -
    > %(dbname)s.log
    '''

    P.run()

Example #53

0

Show file

File: pipeline_medip.py Project: lesheng/cgat

def buildTileStats(infile, outfile):
    '''Compute tiling window size statistics from bed file.

    *infile* is read with ``zcat`` and piped into ``gff2histogram.py``
    with ``--format=bed --data=size`` and the methods ``hist`` and
    ``stats``; standard output is written to *outfile*.
    '''

    # NOTE(review): other tasks in this listing set ``to_cluster``; whether
    # P.run() recognises ``use_cluster`` cannot be told from here --
    # confirm, it may be a typo.
    use_cluster = True

    # Placeholders are left unexpanded; P.run() is called without arguments
    # and presumably fills them from this function's locals and PARAMS.
    # "%%s" is an escaped percent sign, so a literal "%s" survives one
    # round of %-interpolation inside --output-filename-pattern.
    statement = '''
    zcat %(infile)s
    | python %(scriptsdir)s/gff2histogram.py 
                   --force
                   --format=bed 
                   --data=size
                   --method=hist
                   --method=stats
                   --output-filename-pattern=%(outfile)s.%%s.tsv
    > %(outfile)s
    '''
    P.run()

Example #54

0

Show file

File: pipeline_transcriptome.py Project: lesheng/cgat

def exportSequences(infile, outfile):
    '''Collect sequences from a gtf file.

    The gzipped *infile* is sorted by gene with ``gtf2gtf.py``, converted
    to fasta with ``gff2fasta.py --is-gtf`` against the genome
    ``%(genome_dir)s/%(genome)s`` and indexed with
    ``index_fasta.py --force`` under the name *outfile* minus its last
    ``len(".fasta")`` characters.

    *outfile* is not written by the shell redirection: standard output of
    the indexer goes to ``<outfile>.log``, the same file that
    ``gff2fasta.py`` is given as ``--log``.
    '''

    prefix = outfile[:-len(".fasta")]

    # NOTE(review): not referenced again here; presumably read by P.run()
    # to decide whether the job is sent to the cluster -- confirm.
    to_cluster = True
    # NOTE(review): index_fasta.py is taken from %(toolsdir)s while the
    # other scripts come from %(scriptsdir)s -- confirm both are configured.
    # Placeholders are left unexpanded; P.run() is called without arguments
    # and presumably fills them from this function's locals and PARAMS.
    statement = '''gunzip 
        < %(infile)s
        | python %(scriptsdir)s/gtf2gtf.py --sort=gene
	| python %(scriptsdir)s/gff2fasta.py 
		--is-gtf 
		--genome-file=%(genome_dir)s/%(genome)s
		--log=%(outfile)s.log 
	| python %(toolsdir)s/index_fasta.py --force %(prefix)s - 
        > %(outfile)s.log'''

    P.run()

Example #55

0

Show file

def buildPeptideFasta(infile, outfile):
    '''Create an indexed fasta database from an ENSEMBL peptide file.

    *infile* is an ENSEMBL .pep.all.fa.gz file.  It is decompressed, every
    header line is truncated at its first space, and the result is indexed
    with ``index_fasta.py --force`` under the name *outfile* minus its
    last ``len(".fasta")`` characters.  Standard output of the indexer
    goes to ``<dbname>.log``.
    '''
    dbname = outfile[:-len(".fasta")]

    # The perl condition is the regex match /^>/ so that only header lines
    # are truncated.  The previous test, if ("^>"), evaluated a constant
    # non-empty string and was therefore true for every line.
    # Placeholders are left unexpanded; P.run() is called without arguments
    # and presumably fills them from this function's locals and PARAMS.
    statement = '''gunzip
    < %(infile)s
    | perl -p -e 'if (/^>/) { s/ .*//};'
    | python %(scriptsdir)s/index_fasta.py
       --force
    %(dbname)s -
    > %(dbname)s.log
    '''

    P.run()

Example #56

0

Show file

def annotateVariantsSNPsift(infile, outfile):
    '''Add annotations using SNPsift.

    Two commands are chained in one statement:

    1. ``SnpSift.sh dbnsfp`` annotates *infile* with the database named by
       ``PARAMS["annotation_snpsift_dbnsfp"]`` and writes
       ``variants/<track>_temp1.vcf``.
    2. ``SnpSift.sh annotate`` adds annotations from a hard-coded
       1000 Genomes VCF and writes *outfile*.

    ``track`` is the base name of *infile* with ``.vqsr.vcf`` snipped off.
    The intermediate ``_temp1.vcf`` file is not removed (the clean-up
    command below is commented out).
    '''
    # USECLUSTER is defined outside this function.
    to_cluster = USECLUSTER
    # NOTE(review): not referenced again here; presumably read by P.run()
    # as scheduler options for the job -- confirm.
    job_options = "-pe dedicated 4 -R y -l mem_free=6G"
    track = P.snip(os.path.basename(infile), ".vqsr.vcf")
    dbNSFP = PARAMS["annotation_snpsift_dbnsfp"]
    # The following statement is not fully implemented yet
    #    statement = '''SnpSift.sh geneSets -v /ifs/projects/proj016/data/1000Genomes/msigdb.v4.0.symbols.gmt %(infile)s > variants/%(track)s_temp1.vcf; checkpoint;''' % locals()

    # Both parts are expanded here with "% locals()" before P.run() is
    # called.
    statement = '''SnpSift.sh dbnsfp -v %(dbNSFP)s %(infile)s
    > variants/%(track)s_temp1.vcf; checkpoint;''' % locals()

    statement += '''SnpSift.sh annotate /ifs/projects/proj016/data/1000Genomes/00-All.vcf
    variants/%(track)s_temp1.vcf > %(outfile)s ;''' % locals()
    #    statement += '''rm -f variants/*temp*vcf;'''

    P.run()

Example #57

0

Show file

def loadPolyphen(infile, outfile):
    '''Load polyphen results.

    The gzipped *infile* is cleaned up with a perl one-liner before being
    loaded with ``csv2db.py``: the first ``o_acc`` on each line becomes
    ``protein_id``, all runs of spaces are deleted and a leading ``#`` is
    removed.  ``snp_id`` and ``protein_id`` are indexed, ``effect`` is
    mapped to a string, and the table name is ``P.toTable(outfile)``.
    '''

    table = P.toTable(outfile)

    # Placeholders are left unexpanded; P.run() is called without arguments
    # and presumably fills them from this function's locals and PARAMS.
    statement = '''
    gunzip 
    < %(infile)s
    | perl -p -e "s/o_acc/protein_id/; s/ +//g; s/^#//;"
    |python %(scriptsdir)s/csv2db.py %(csv2db_options)s
              --index=snp_id 
              --index=protein_id
              --table=%(table)s 
              --map=effect:str
    > %(outfile)s
    '''
    P.run()

Example #58

0

Show file

def mergeGeneLists(infiles, outfile):
    '''Merge gene lists into single table and load into SQLite.

    ``PARAMS["species"]`` and ``PARAMS["annotations_db"]`` are paired up
    and every annotation database is attached to ``PARAMS["database"]``
    under its species name.  For every file in *infiles* a ``SELECT`` over
    ``<track>_genelist`` joined with ``<species>.transcript_info`` is
    built, restricted to protein coding genes; the selects are combined
    with ``UNION`` into the table ``P.toTable(outfile)``, which is dropped
    first if it exists and indexed on ``gene_id`` and ``species``.
    *outfile* is touched at the end.

    The track is the base name of the input file with ``.genelist.load``
    snipped off and ``-`` / ``.`` replaced by ``_``; its first two
    characters are used as the species (schema) name.

    Every SQL statement built from the inputs is printed before it is
    executed.  The connection is closed even when a statement fails;
    sqlite3 errors are propagated to the caller.
    '''

    tablename = P.toTable(outfile)
    species_list = P.asList(PARAMS["species"])
    anno_list = P.asList(PARAMS["annotations_db"])
    species_lookup = dict(zip(species_list, anno_list))
    genelist_id = PARAMS["genelist_id"]

    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()

        # Attach one annotation database per species.
        for species, species_db in species_lookup.items():
            statement = '''ATTACH DATABASE '%(species_db)s' as %(species)s''' % {
                "species_db": species_db, "species": species}
            print(statement)
            cc.execute(statement)

        # Build one SELECT per gene list and glue them with UNION.
        selects = []
        for f in infiles:
            track = P.snip(os.path.basename(f),
                           ".genelist.load").replace("-", "_").replace(".", "_")
            species = track[:2]
            selects.append('''SELECT distinct t.gene_id, t.gene_name, "%(species)s" AS species
                       FROM %(track)s_genelist g, %(species)s.transcript_info t
                       WHERE g.gene_id=t.%(genelist_id)s and t.gene_biotype='protein_coding' ''' % {
                "species": species, "track": track, "genelist_id": genelist_id})
        statement = "CREATE TABLE %s AS " % tablename + " UNION ".join(selects)

        print(statement)
        cc.execute("DROP TABLE IF EXISTS %s" % tablename)
        cc.execute(statement)
        cc.execute('''CREATE INDEX "glm_idx1" ON "%s" ("gene_id" ASC) ''' %
                   tablename)
        cc.execute('''CREATE INDEX "glm_idx2" ON "%s" ("species" ASC) ''' %
                   tablename)
        cc.close()
        dbhandle.commit()
    finally:
        dbhandle.close()

    statement = "touch %s" % outfile
    P.run()

Example #59

0

Show file

def loadRepeatsRates(infile, outfile):
    '''Load repeat overlap.

    The gzipped *infile* is filtered with ``awk '$4 > 0'``, the columns
    ``exons_lengths`` and ``exons_values`` are removed with ``csv_cut.py``
    and the remainder is loaded with ``csv2db.py`` (``gene_id`` indexed and
    mapped to a string, ``--allow-empty`` set).  The table name is
    *outfile* without its last ``len(".load")`` characters.
    '''

    table = outfile[:-len(".load")]

    # Placeholders are left unexpanded; P.run() is called without arguments
    # and presumably fills them from this function's locals and PARAMS.
    statement = '''gunzip 
    < %(infile)s 
    | awk '$4 > 0'
    | python %(toolsdir)s/csv_cut.py --remove exons_lengths exons_values
    |python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              --index=gene_id 
              --map=gene_id:str 
              --table=%(table)s 
              --allow-empty
    > %(outfile)s'''

    P.run()

Example #60

0

Show file

def loadSegments(infile, outfile):
    '''Load segments.

    Six companion files of *infile* are loaded, one ``csv2db.py`` call
    each: ``<infile>.distances``, ``.sizes`` and ``.overlaps`` plus the
    same three with a ``_genes`` prefix.  The table for each file is named
    ``<table><suffix>`` with every ``.`` of the suffix turned into ``_``,
    where ``table`` is *outfile* without its last ``len(".load")``
    characters.  The output of every call is appended to *outfile*.
    '''

    table = outfile[:-len(".load")]

    for x in (".distances", ".sizes", ".overlaps", "_genes.distances",
              "_genes.sizes", "_genes.overlaps"):
        # Plain string replacement; avoids the regex and its invalid "\."
        # escape in a non-raw string literal.
        y = x.replace(".", "_")
        # x and y are referenced by name in the statement below, so they
        # must keep these names.  Placeholders are left unexpanded;
        # P.run() is called without arguments and presumably fills them
        # from this function's locals and PARAMS.
        statement = '''
        python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
        --index=gene_id 
        --map=gene_id:str 
        --table=%(table)s%(y)s 
        < %(infile)s%(x)s
        >> %(outfile)s'''

        P.run()