def buildUniformityOfCoverage(infiles, outfile):
    '''
    build matrix of coverage over contigs
    '''
    bam = infiles[0]
    track = P.snip(os.path.basename(bam), ".bam")
    tmp_bed = P.getTempFilename(".") + ".bed"
    tmp_bam = P.getTempFilename(".") + ".bam"
    
    # filter for mapped reads
    statement = '''cat %(bam)s | python %(scriptsdir)s/bam2bam.py --filter=mapped --log=/dev/null > %(tmp_bam)s
                   ; samtools index %(tmp_bam)s'''
    P.run()

    for infs in infiles[1:]:
        for inf in infs:
            if P.snip(inf, ".lengths.tsv") == track:
                length_file = inf

    statement = '''cat %(length_file)s | awk 'NR>1 {printf("%%s\\t0\\t%%s\\n", $1, $2)}' > %(tmp_bed)s'''
    P.run()

    statement = '''python %(scriptsdir)s/bam2peakshape.py 
                   --only-interval %(tmp_bam)s %(tmp_bed)s 
                   --log=%(outfile)s.log 
                   --output-filename-pattern=%(track)s.%%s'''
    P.run()
    os.unlink(tmp_bed)
    os.unlink(tmp_bam)
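The awk one-liner above turns the contig-lengths table into full-length BED intervals (contig, 0, length). A minimal pure-Python equivalent, assuming the same .lengths.tsv layout (one header line, then contig and length in the first two tab-separated columns):

def lengths_to_bed(length_file, bed_file):
    # sketch of the awk step: NR>1 {printf("%s\t0\t%s\n", $1, $2)}
    with open(length_file) as inf, open(bed_file, "w") as outf:
        next(inf)  # skip the header line (awk's NR>1)
        for line in inf:
            fields = line.rstrip("\n").split("\t")
            contig, length = fields[0], fields[1]
            # one full-length interval per contig
            outf.write("%s\t0\t%s\n" % (contig, length))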
def createMAFAlignment(infiles, outfile):
    """
    Takes all .axt files in the input directory, filters them to remove
    files based on supplied regular expressions, converts to a single maf file
    using axtToMaf, filters maf alignments under a specified length.
    """
    outfile = P.snip(outfile, ".gz")
    axt_dir = PARAMS["phyloCSF_location_axt"]
    to_ignore = re.compile(PARAMS["phyloCSF_ignore"])

    axt_files = []
    for axt_file in os.listdir(axt_dir):
        if axt_file.endswith("net.axt.gz") and not to_ignore.search(axt_file):
            axt_files.append(os.path.join(axt_dir, axt_file))
    axt_files = (" ").join(sorted(axt_files))

    E.info("axt files from which MAF alignment will be created: %s" %
           axt_files)

    target_genome = PARAMS["phyloCSF_target_genome"]
    target_contigs = os.path.join(PARAMS["annotations_annotations_dir"],
                                  PARAMS_ANNOTATIONS["interface_contigs"])
    query_genome = PARAMS["phyloCSF_query_genome"]
    query_contigs = os.path.join(PARAMS["phyloCSF_query_assembly"],
                                 PARAMS_ANNOTATIONS["interface_contigs"])

    tmpf1 = P.getTempFilename("./phyloCSF")
    tmpf2 = P.getTempFilename("./phyloCSF")
    to_cluster = False
    # concatenate axt files, then remove headers
    statement = ("zcat %(axt_files)s"
                 " > %(tmpf1)s;"
                 " axtToMaf "
                 "  -tPrefix=%(target_genome)s."
                 "  -qPrefix=%(query_genome)s."
                 "  %(tmpf1)s"
                 "  %(target_contigs)s"
                 "  %(query_contigs)s"
                 "  %(tmpf2)s")
    P.run()

    E.info("Temporary axt file created %s" % os.path.abspath(tmpf1))
    E.info("Temporary maf file created %s" % os.path.abspath(tmpf2))

    removed = P.snip(outfile, ".maf") + "_removed.maf"
    to_cluster = False
    filtered = PipelineLncRNA.filterMAF(tmpf2,
                                        outfile,
                                        removed,
                                        PARAMS["phyloCSF_filter_alignments"])
    E.info("%s blocks were ignored in MAF alignment"
           " because length of target alignment was too short" % filtered[0])
    E.info("%s blocks were output to filtered MAF alignment" % filtered[1])

    os.unlink(tmpf1)
    os.unlink(tmpf2)
    to_cluster = False
    statement = ("gzip %(outfile)s;"
                 " gzip %(removed)s")
    P.run()
Example #4
def convertPslToChain(infile, outfile):
    '''convert a psl to a chain file.

    see http://genomewiki.ucsc.edu/index.php/Minimal_Steps_For_LiftOver
    '''

    to_cluster = True

    target, query = extractGenomes(infile)

    tmpfilename1 = P.getTempFilename(".")
    tmpfilename2 = P.getTempFilename(".")

    writeContigSizes(target, tmpfilename1)
    writeContigSizes(query, tmpfilename2)

    statement = '''gunzip
    < %(infile)s
    | pslSwap stdin stdout
    | python %(scriptsdir)s/psl2chain.py --log=%(outfile)s.log
    | chainSort stdin stdout
    | gzip
    > %(outfile)s.sorted.chain.gz;
    checkpoint; 
    gunzip < %(outfile)s.sorted.chain.gz 
    | chainNet stdin %(tmpfilename1)s %(tmpfilename2)s stdout /dev/null
    | netChainSubset stdin <( zcat %(outfile)s.sorted.chain.gz ) stdout
    | gzip
    > %(outfile)s'''
    P.run()

    os.unlink(tmpfilename1)
    os.unlink(tmpfilename2)
def remapWithBowtie( infiles, outfile ):
    '''re-map unaligned reads.

    Select reads that were not mapped (FLAG value 4) from a bam file and
    map them again with Bowtie.
    '''

    to_cluster = True

    tmpfilename = P.getTempFilename()

    prefix = outfile[:-len(".bam")]

    infile, subsequence = infiles
    start = START 
    statement = '''
    samtools view %(infile)s |\
    awk '$2 == 4 {printf("@%%s\\n%%s\\n+\\n%%s\\n", $1,$10,$11);}' |\
    bowtie --sam -n 3 %(subsequence)s - 2>%(outfile)s.log |\
    awk -v OFS="\\t" '/^@/ {print;next;} {if ($4 > 0) { $4 += %(start)s } print; }' |\
    samtools import %(genome)s - %(tmpfilename)s >& %(outfile)s.log;
    samtools sort %(tmpfilename)s %(prefix)s;
    samtools index %(outfile)s;
    rm -f %(tmpfilename)s
    '''
    P.run( **dict( locals().items() + PARAMS.items() ) )

    if os.path.exists( tmpfilename ):
        os.unlink( tmpfilename )
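Note that the awk test $2 == 4 above matches only reads whose FLAG is exactly 4; unmapped reads in paired-end data usually carry additional flag bits and would be missed. An alternative sketch (not the pipeline's own statement) that keeps any read with the unmapped bit set:

# sketch: select every read with the "read unmapped" bit (0x4) set,
# regardless of other flag bits, and emit FASTQ for re-mapping
statement = '''
samtools view -f 4 %(infile)s
| awk '{printf("@%%s\\n%%s\\n+\\n%%s\\n", $1,$10,$11);}'
> %(tmpfilename)s.fq
'''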
Example #8
def computeOverlapCoding(infile, outfile):
    '''compute overlap between coding markers and windows.

    This is done by setting the gene_id and transcript_id of each marker
    to those of the ENSEMBL gene it overlaps with. Markers not
    overlapping an ENSEMBL gene are removed.
    '''

    to_cluster = True
    tmpfilename = P.getTempFilename(dir=".")

    statement = '''python %(scriptsdir)s/gtf2gtf.py
    --rename=gene \
    --apply=ensembl.diff.genes_ovl \
    < %(infile)s > %(tmpfilename)s
    '''

    P.run(**dict(locals().items() + PARAMS.items()))

    statement = '''python %(scriptsdir)s/gff2table.py 
    --filename-windows=<(python %(scriptsdir)s/bed2gff.py < windows.bed) 
    --decorator=counts
    --filename-data=%(tmpfilename)s \
    --skip-empty \
    --is-gtf \
    --log=%(outfile)s.log \
    < %(genome)s.fasta > %(outfile)s'''

    P.run(**dict(locals().items() + PARAMS.items()))

    os.unlink(tmpfilename)
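The file passed via --apply here (ensembl.diff.genes_ovl) is a two-column, tab-separated id map; computeOverlapGO further down reads it line by line as old-id/new-id pairs. A minimal sketch of writing such a map (hypothetical ids, for illustration only):

# hypothetical marker-to-ENSEMBL id pairs, for illustration only
id_map = {"marker_0001": "ENSG00000000001",
          "marker_0002": "ENSG00000000002"}
with open("ensembl.diff.genes_ovl", "w") as outf:
    for old_id, new_id in sorted(id_map.items()):
        # one tab-separated pair per line
        outf.write("%s\t%s\n" % (old_id, new_id))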
Example #9
def exportEnsembl( infile, outfile ):
    '''export gtf file with ensembl transcripts.
    '''
    tmpfile = P.getTempFilename()

    statement = '''
         perl %(scriptsdir)s/ensembl2gtf.pl 
            -dbname %(mysql_database_ensembl)s
            -host %(mysql_host)s
            -user %(mysql_user)s
            -dbpass %(mysql_pass)s
            -dnadbname %(mysql_database_ensembl)s
            -dnahost %(mysql_host)s
            -dnauser %(mysql_user)s
            -dnapass %(mysql_pass)s
            -gtffile %(tmpfile)s
            -schema '%(ensembl_schema)s' 
            -coordsystem %(ensembl_coordsystem)s
            -genetypes %(ensembl_genetypes)s > %(outfile)s.log'''

    P.run()

    statement = 'gzip < %(tmpfile)s > %(outfile)s'
    P.run()

    os.unlink( tmpfile )
Example #10
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on only the top 10% of peaks.
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.getTempFilename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(
        track,
        tmpfasta,
        dbhandle,
        full=True,
        masker="dust",
        proportion=PARAMS["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        P.touch(outfile)
    else:
        statement = '''
        BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log
        '''
        P.run()

    os.unlink(tmpfasta)
def mapReadsWithBowtie(infiles, outfile):
    '''map reads with bowtie'''

    inifile, infile = infiles

    to_cluster = USECLUSTER
    job_options = "-pe dedicated %i -R y -l mem_free=16G" % PARAMS[
        "bowtie_threads"]

    tmpfile = P.getTempFilename()

    statement = '''
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam 
           -C
           --threads %(bowtie_threads)s
           %(bowtie_options)s
           %(bowtie_genome_dir)s/%(genome)s_cs
           %(tmpfile)s
    | python %(scriptsdir)s/bam2bam.py --sam --set-nh --log=%(outfile)s.log
    | gzip
    > %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s
    '''

    P.run()
Example #12
def aggregateWindowsReadCounts(infiles, outfile, regex="(.*)\..*"):
    '''aggregate several results from coverageBed
    into a single file.

    *regex* is used to extract the track name from the filename.
    The default removes any suffix.

    coverageBed outputs the following columns:
    1 Contig
    2 Start
    3 Stop
    4 Name
    5 The number of features in A that overlapped (by at least one
      base pair) the B interval.
    6 The number of bases in B that had non-zero coverage from features in A.
    7 The length of the entry in B.
    8 The fraction of bases in B that had non-zero coverage from
      features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    Windows without any counts will not be output.
    '''

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    src = " ".join([
        '''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}' ) '''
        % (x, column) for x in infiles
    ])
    tmpfile = P.getTempFilename(".")
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run()

    # build track names
    tracks = [
        re.search(regex, os.path.basename(x)).groups()[0] for x in infiles
    ]

    outf = IOTools.openFile(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    for line in open(tmpfile, "r"):
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]
        if sum(values) == 0:
            continue
        assert len(genes) == 1, \
            "paste command failed, wrong number of genes per line: '%s'" % line
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))

    outf.close()

    os.unlink(tmpfile)
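The parsing loop above relies on paste interleaving the per-file columns: even positions hold the interval id (repeated once per input file), odd positions the counts. A worked micro-example with two hypothetical tracks:

# one line of the paste output for two tracks (hypothetical values)
line = "chr1:0-1000\t12\tchr1:0-1000\t7\n"
data = line[:-1].split("\t")
genes = list(set([data[x] for x in range(0, len(data), 2)]))
values = [int(data[x]) for x in range(1, len(data), 2)]
# a single interval id, one count per track
assert genes == ["chr1:0-1000"] and values == [12, 7]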
Example #15
def extractLncRNAFastaAlignments(infiles, outfile):
    """
    Receives a MAF file containing pairwise alignments and a gtf12 file
    containing intervals. Outputs a single fasta file containing aligned
    sequence for each interval.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)
    os.unlink(maf_tmp)
Example #17
def buildGeneModels(infile, outfile):
    '''build transcript models - run cufflinks on each region separately'''

    to_cluster = USECLUSTER

    track = os.path.basename(outfile[:-len(".gtf")])
    ins_size, std_dev = getInsertSizes("reads/%s" % track)

    tmpfilename = P.getTempFilename()
    nslots = 4

    if os.path.exists(tmpfilename):
        os.unlink(tmpfilename)

    infile = os.path.abspath(infile)
    outfile = os.path.abspath(outfile)

    statement = '''mkdir %(tmpfilename)s; 
    samtools view %(infile)s | sort -k3,3 -k4,4n 2> %(outfile)s.log1 > %(tmpfilename)s/temp.sam;
    cd %(tmpfilename)s; 
    cufflinks --inner-dist-mean %(ins_size)i
              --inner-dist-stddev %(std_dev)i
              --label %(track)s           
              --num-threads %(nslots)i 
              --min-isoform-fraction %(cuff_min_isoform)f
              --pre-mrna-fraction %(cuff_pre_mrna)f 
               %(tmpfilename)s/temp.sam >& %(outfile)s.log2;
    mv transcripts.gtf %(outfile)s >& %(outfile)s.log3;
    rm -rf %(tmpfilename)s >& %(outfile)s.log4 
    '''

    P.run()
Example #20
def loadRepeatInformation(infiles, outfile):
    '''compute repeat coverage per contig and load it into the database.'''

    to_cluster = True

    table = outfile[:-len(".load")]

    repeatsfile, indexfile = infiles

    tmpfilename = P.getTempFilename(".")

    statement = '''awk '{printf("%%s\\t0\\t%%i\\n", $1, $4)}' < %(indexfile)s > %(tmpfilename)s'''
    P.run()

    statement = '''
    gunzip < %(repeatsfile)s 
    | python %(scriptsdir)s/gff2bed.py -v 0 
    | coverageBed -a stdin -b %(tmpfilename)s
    | awk 'BEGIN { printf("contig\\tstart\\tend\\tnover_entries\\tnover_bases\\tlength\\tpover\\n" );} {print;}'
    |python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
    --table=%(table)s 
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
def mapReadsWithBowtie(infiles, outfile):
    '''map reads with bowtie'''

    inifile, infile = infiles

    job_options = "-l mem_free=16G"
    job_threads = PARAMS["bowtie_threads"]

    tmpfile = P.getTempFilename()

    statement = '''
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam 
           -C
           --threads %(bowtie_threads)s
           %(bowtie_options)s
           %(bowtie_genome_dir)s/%(genome)s_cs
           %(tmpfile)s
    | python %(scriptsdir)s/bam2bam.py --sam --set-nh --log=%(outfile)s.log
    | gzip
    > %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s
    '''

    P.run()
Example #24
def prepareBAMs(infile, outfile):
    '''filter bam files for medip-seq analysis.

    Optional steps include:

    * deduplication - remove duplicate reads
    * quality score filtering - remove reads below a certain quality score.

    '''
    to_cluster = True
    track = P.snip(outfile, ".bam")

    tmpdir = P.getTempFilename()

    current_file = infile

    nfiles = 0
    statement = ["mkdir %(tmpdir)s"]

    if "filtering_quality" in PARAMS and PARAMS["filtering_quality"] > 0:
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()
        statement.append('''samtools view -q %%(filtering_quality)i -b 
                             %(current_file)s 
                             2>> %%(outfile)s.log 
                             > %(next_file)s ''' % locals())
        nfiles += 1
        current_file = next_file

    if "filtering_dedup" in PARAMS and PARAMS["filtering_dedup"]:
        # Picard's MarkDuplicates requires an explicit bam file.
        next_file = "%(tmpdir)s/bam_%(nfiles)i.bam" % locals()

        dedup_method = PARAMS["filtering_dedup_method"]

        if dedup_method == 'samtools':
            # the statements are joined with ';' rather than piped, so
            # rmdup needs explicit input/output files
            statement.append('''samtools rmdup %(current_file)s %(next_file)s
                             2>> %%(outfile)s.log ''' % locals())

        elif dedup_method == 'picard':
            statement.append('''MarkDuplicates INPUT=%(current_file)s
                                               OUTPUT=%(next_file)s
                                               ASSUME_SORTED=true 
                                               METRICS_FILE=%(outfile)s.duplicate_metrics
                                               REMOVE_DUPLICATES=TRUE 
                                               VALIDATION_STRINGENCY=SILENT
                                               2>> %%(outfile)s.log ''' %
                             locals())
        nfiles += 1
        current_file = next_file

    statement.append("mv %%(current_file)s %(outfile)s" % locals())
    statement.append("rm -rf %(tmpdir)s")
    statement.append("samtools index %(outfile)s")

    statement = " ; ".join(statement)

    P.run()

    os.unlink(tmpdir)
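The function assembles the pipeline as a list of shell fragments joined with ';'. A runnable sketch of how the pieces compose for a configuration with only quality filtering enabled (all paths and values hypothetical):

# sketch of the composed statement (hypothetical values)
statements = ["mkdir tmp_x",
              "samtools view -q 10 -b input.bam 2>> out.bam.log > tmp_x/bam_0.bam",
              "mv tmp_x/bam_0.bam out.bam",
              "rm -rf tmp_x",
              "samtools index out.bam"]
print(" ; ".join(statements))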
Example #27
def computeOverlapGO(infile, outfile):
    '''compute overlap between coding markers and windows.
    Only markers of certain GO categories are counted.

    This is done by setting the gene_id and transcript_id of each marker
    to those of the ENSEMBL gene it overlaps with. The id list is
    filtered first to keep only those ids with valid GO associations.
    '''

    to_cluster = False

    filter_goid = set(IOTools.readList(open(PARAMS["filename_gofilter"])))
    filter_genes = set()

    E.info("number of goids: %i" % len(filter_goid))

    for l in open(PARAMS["filename_go"]):
        f, id, goid, desc, evd = l[:-1].split("\t")[:5]
        if goid in filter_goid:
            filter_genes.add(id)

    tmpfile1 = P.getTempFile(dir=".")

    for line in open("ensembl.diff.genes_ovl"):

        a, b = line[:-1].split("\t")
        if b not in filter_genes:
            continue
        tmpfile1.write(line)

    E.info("number of genes taken: %i" % len(filter_genes))

    tmpfile1.close()
    tmpfilename1 = tmpfile1.name

    tmpfilename = P.getTempFilename(dir=".")

    statement = '''python %(scriptsdir)s/gtf2gtf.py
    --rename=gene \
    --apply=%(tmpfilename1)s \
    < %(infile)s > %(tmpfilename)s
    '''

    P.run(**dict(locals().items() + PARAMS.items()))

    statement = '''python %(scriptsdir)s/gff2table.py 
    --filename-windows=<(python %(scriptsdir)s/bed2gff.py < windows.bed) 
    --decorator=counts
    --filename-data=%(tmpfilename)s \
    --skip-empty \
    --is-gtf \
    --log=%(outfile)s.log \
    < %(genome)s.fasta > %(outfile)s'''

    P.run(**dict(locals().items() + PARAMS.items()))

    os.unlink(tmpfilename)
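The GO association file read above is expected to carry at least five tab-separated columns, unpacked as f, id, goid, desc, evd; only the gene id and GO id are used. A runnable micro-example with a hypothetical line:

# hypothetical GO association line, for illustration only
l = "ensembl\tENSG00000000001\tGO:0005515\tprotein binding\tIEA\n"
f, id, goid, desc, evd = l[:-1].split("\t")[:5]
assert (id, goid) == ("ENSG00000000001", "GO:0005515")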
Example #28
def aggregateWindowsReadCounts(infiles, outfile):
    '''aggregate tag counts for each window.

    coverageBed outputs the following columns:
    1) Contig
    2) Start
    3) Stop
    4) Name
    5) The number of features in A that overlapped (by at least one base pair) the B interval.
    6) The number of bases in B that had non-zero coverage from features in A.
    7) The length of the entry in B.
    8) The fraction of bases in B that had non-zero coverage from features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    Tiles with no counts will not be output.
    '''

    to_cluster = True

    # get bed format
    bed_columns = Bed.getNumColumns(infiles[0])
    # +1 as awk is 1-based
    column = bed_columns - 4 + 1

    src = " ".join([
        '''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$%s );}' ) '''
        % (x, column) for x in infiles
    ])
    tmpfile = P.getTempFilename(".")
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run()

    tracks = [re.sub("\..*", '', os.path.basename(x)) for x in infiles]

    outf = IOTools.openFile(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    for line in open(tmpfile, "r"):
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]
        if sum(values) == 0:
            continue
        assert len(genes) == 1, \
            "paste command failed, wrong number of genes per line: '%s'" % line
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))

    outf.close()

    os.unlink(tmpfile)
Example #29
def mapReads(infiles, outfile):
    '''map reads using all known junctions and all junctions found before.

    This method requires the explicit genome in bowtiedir together with the
    samtools index. Using a flattened genome file will not work due to 
    the limit of a line length of 65536 in samtools.
    '''

    if not os.path.exists("%(bowtiedir)s/%(genome)s.fa" % PARAMS):
        raise ValueError(
            "genome %(bowtiedir)s/%(genome)s.fa does not exist - create with bowtie-inspect first"
            % PARAMS)

    ins_size, std_dev = getInsertSizes(os.path.dirname(outfile[:-len(".bam")]))

    nslots = 4
    fastq1, fastq2 = infiles[0]

    tmpfilename = P.getTempFilename()

    if os.path.exists(tmpfilename):
        os.unlink(tmpfilename)

    job_options = "-pe dedicated 4-8 -l mem_free=3G -R y"

    to_cluster = USECLUSTER

    junctions_file = "reads/all.junctions"

    # WARNING: contents of tmpfile can get large (20Gb or more)

    statement = '''
    gunzip < %(fastq1)s > %(tmpfilename)s.1.fq;
    gunzip < %(fastq2)s > %(tmpfilename)s.2.fq;
    tophat --output-dir %(tmpfilename)s
           --min-isoform-fraction 0.0 
           --mate-inner-dist %(ins_size)i 
           --mate-std-dev %(std_dev)i 
           --raw-juncs %(junctions_file)s 
           -p %(nslots)i 
           %(bowtiedir)s/%(genome)s
           %(tmpfilename)s.1.fq
           %(tmpfilename)s.2.fq
    >& %(outfile)s.log;
    mv %(tmpfilename)s/accepted_hits.bam %(outfile)s 2>> %(outfile)s.log; 
    rm -rf %(tmpfilename)s 2>> %(outfile)s.log;
    rm -f %(tmpfilename)s.1.fq %(tmpfilename)s.2.fq 2>> %(outfile)s.log
    '''

    P.run()
Example #30
def indexForSailfish(infile, outfile):
    '''create a sailfish index'''

    outdir = P.snip(outfile, "/transcriptome.sfi")
    kmer = int(PARAMS["sailfish_kmer_size"])
    tmp = P.getTempFilename()

    statement = '''gunzip -c %(infile)s > %(tmp)s;
                   module load bio/sailfish;
                   sailfish index -t %(tmp)s
                   -k %(kmer)i -o %(outdir)s;
                   rm -f %(tmp)s'''

    P.run()
def mapReadsWithTophat(infiles, outfile):
    '''map reads with tophat'''
    inifile, infile = infiles

    local_params = P.loadParameters(inifile)

    to_cluster = USECLUSTER
    job_options = "-pe dedicated %i -R y -l mem_free=16G" % PARAMS[
        "tophat_threads"]

    tmpfile = P.getTempFilename(".")

    # qualfile = P.snip(infile, "csfasta.gz") + "qual.gz"
    # disabled: unpack csfasta/qual inputs directly
    # gunzip < %(infile)s > %(tmpfile)s.csfasta;
    # checkpoint;
    # gunzip < %(qualfile)s > %(tmpfile)s.qual;
    # checkpoint;

    statement = '''
    zcat %(infile)s 
    | python %(scriptsdir)s/fastq2solid.py 
           --change-format=integer
           --pattern="%(tmpfile)s.%%s" >& %(outfile)s.log;
    checkpoint;
    tophat --output-dir %(outfile)s.dir                    
           --num-threads %(tophat_threads)s  
           --library-type %(tophat_library_type)s
           --color
           --quals
           --integer-quals
           %(tophat_options)s
           %(tophat_genome_dir)s/%(genome)s_cs
           %(tmpfile)s.csfasta %(tmpfile)s.qual
           >> %(outfile)s.log;
    checkpoint;
    mv %(outfile)s.dir/accepted_hits.bam %(outfile)s;
    checkpoint;
    samtools index %(outfile)s;
    checkpoint;
    rm -f %(tmpfile)s.csfasta %(tmpfile)s.qual
    '''

    # use local parameters to overwrite default ones.
    P.run(**local_params)

    os.unlink(tmpfile)
Example #33
def makeCodingPotential( infile, outfile ):
    '''run CPC to predict coding potential.'''

    statement = '''
	cpc.sh %(infile)s 
               %(outfile)s.forward.table 
               %(outfile)s.tmp.dir 
               %(outfile)s.forward.evidence 
               %(codingpotential_database)s > %(outfile)s.log'''
    P.run()

    tmpfilename = P.getTempFilename( "." )
    statement = '''python %(toolsdir)s/fasta2fasta.py 
                          --method=reverse-complement -v 0
                  < %(infile)s
                  > %(tmpfilename)s'''
    P.run()
    
    statement = '''
	cpc.sh %(tmpfilename)s 
               %(outfile)s.reverse.table 
               %(outfile)s.tmp.dir 
               %(outfile)s.reverse.evidence 
               %(codingpotential_database)s >> %(outfile)s.log'''
    P.run()
    
    outf = open(outfile, "w")
    outf.write( "gene_id\tlength\tf_iscoding\tf_value\tf_orfstart\tf_orfend\tf_orfval1\tf_orfval2\tf_orf\tr_iscoding\tr_value\tr_orfstart\tr_orfend\tr_orfval1\tr_orfval2\tr_orf\n")
    outf.close()

    to_cluster = True
    
    statement = '''
	python %(toolsdir)s/combine_tables.py -v 0 
            %(outfile)s.forward.table 
            %(outfile)s.forward.evidence.orf 
            %(outfile)s.reverse.table 
            %(outfile)s.reverse.evidence.orf |\
	cut -f 1,2,3,4,6,7,8,9,10,12,13,15- 
        >> %(outfile)s
    '''
    P.run()

    # save space by compressing the result of the homology searches
    E.info( "compressing CPC output" )
    statement = '''rm -f %(outfile)s.*homo.gz; gzip %(outfile)s.*homo'''
    P.run()

    os.unlink( tmpfilename )
Example #34
def aggregateTiledReadCounts(infiles, outfile):
    '''aggregate tag counts for each window.

    coverageBed outputs the following columns:
    1) Contig
    2) Start
    3) Stop
    4) Name
    5) The number of features in A that overlapped (by at least one base pair) the B interval.
    6) The number of bases in B that had non-zero coverage from features in A.
    7) The length of the entry in B.
    8) The fraction of bases in B that had non-zero coverage from features in A.

    For bed: use column 5
    For bed6: use column 7
    For bed12: use column 13

    This method uses the maximum number of reads found in any interval as the tag count.

    Tiles with no counts will not be output.
    '''

    to_cluster = True

    src = " ".join(
        [ '''<( zcat %s | awk '{printf("%%s:%%i-%%i\\t%%i\\n", $1,$2,$3,$4 );}' ) ''' % x for x in infiles] )
    tmpfile = P.getTempFilename(".")
    statement = '''paste %(src)s > %(tmpfile)s'''
    P.run()

    tracks = [re.sub("\..*", '', os.path.basename(x)) for x in infiles]

    outf = IOTools.openFile(outfile, "w")
    outf.write("interval_id\t%s\n" % "\t".join(tracks))

    for line in open(tmpfile, "r"):
        data = line[:-1].split("\t")
        genes = list(set([data[x] for x in range(0, len(data), 2)]))
        values = [int(data[x]) for x in range(1, len(data), 2)]
        if sum(values) == 0:
            continue
        assert len(genes) == 1, \
            "paste command failed, wrong number of genes per line: '%s'" % line
        outf.write("%s\t%s\n" % (genes[0], "\t".join(map(str, values))))

    outf.close()

    os.unlink(tmpfile)
def mapReadsWithBowtieAgainstTranscriptome(infiles, outfile):
    '''map reads from short read archive sequence using bowtie against
    transcriptome data.
    '''

    # Mapping will permit up to one mismatches. This is sufficient
    # as the downstream filter in bams2bam requires the
    # number of mismatches less than the genomic number of mismatches.
    # Change this, if the number of permitted mismatches for the genome
    # increases.

    # Output all valid matches in the best stratum. This will
    # inflate the file sizes due to matches to alternative transcripts
    # but otherwise matches to paralogs will be missed (and such
    # reads would be filtered out).
    to_cluster = USECLUSTER
    job_options = "-pe dedicated %i -R y -l mem_free=16G" % PARAMS[
        "bowtie_threads"]

    tmpfile = P.getTempFilename()

    infile, reffile, contigs = infiles
    track = P.snip(outfile, ".bam")
    prefix = P.snip(reffile, ".fa")

    statement = '''
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam 
           -C
           --un /dev/null
           --threads %(bowtie_threads)s
           %(transcriptome_options)s 
           --best --strata -a
           %(prefix)s_cs
           %(tmpfile)s
    | python %(scriptsdir)s/bam2bam.py --sam --set-nh --log=%(outfile)s.log
    | perl -p -e "if (/^\\@HD/) { s/\\bSO:\S+/\\bSO:coordinate/}"  
    | samtools import %(contigs)s - -
    | samtools sort - %(track)s;
    checkpoint;
    samtools index %(outfile)s
    checkpoint;
    rm -f %(tmpfile)s
    '''

    P.run()
Example #37
def runCufflinks(infiles, outfile):
    '''estimate expression levels in each set.
    '''

    gtffile, bamfile = infiles

    job_threads = PARAMS["cufflinks_threads"]

    track = os.path.basename(P.snip(gtffile, ".gtf.gz"))

    tmpfilename = P.getTempFilename(".")
    if os.path.exists(tmpfilename):
        os.unlink(tmpfilename)

    gtffile = os.path.abspath(gtffile)
    bamfile = os.path.abspath(bamfile)
    outfile = os.path.abspath(outfile)

    # note: cufflinks adds \0 bytes to gtf file - replace with '.'
    # increase max-bundle-length to 4.5Mb due to Galnt-2 in mm9 with a 4.3Mb
    # intron.

    # AH: removed log messages about BAM record error
    # These cause logfiles to grow several Gigs and are
    # frequent for BAM files not created by tophat.
    # Error is:
    # BAM record error: found spliced alignment without XS attribute
    statement = '''mkdir %(tmpfilename)s;
    cd %(tmpfilename)s;
    cufflinks --label %(track)s
              --GTF <(gunzip < %(gtffile)s)
              --num-threads %(cufflinks_threads)i
              --frag-bias-correct %(bowtie_index_dir)s/%(genome)s.fa
              --library-type %(cufflinks_library_type)s
              %(cufflinks_options)s
              %(bamfile)s
    | grep -v 'BAM record error'
    >& %(outfile)s;
    perl -p -e "s/\\0/./g" < transcripts.gtf | gzip > %(outfile)s.gtf.gz;
    gzip < isoforms.fpkm_tracking > %(outfile)s.fpkm_tracking.gz;
    gzip < genes.fpkm_tracking > %(outfile)s.genes_tracking.gz;
    '''

    P.run()

    shutil.rmtree(tmpfilename)
Example #38
def indexForSailfish(infile, outfile):
    '''create a sailfish index'''

    outdir = P.snip(outfile, "/transcriptome.sfi")
    kmer = int(PARAMS["sailfish_kmer_size"])
    tmp = P.getTempFilename()

    # `zipped` is assumed to come from the enclosing scope and flags
    # whether the input fasta is gzip-compressed
    if zipped:
        statement = '''gunzip -c %(infile)s > %(tmp)s;
                       checkpoint; sailfish index -t %(tmp)s'''
    else:
        statement = '''sailfish index -t %(infile)s'''

    # leading space needed: the fragments are concatenated directly
    statement += ''' -k %(kmer)i -o %(outdir)s;
                    checkpoint; rm -f %(tmp)s'''

    P.run()
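The leading space on the appended fragment matters: without it the concatenation would fuse into "-t transcripts.fa-k 31". A quick check (hypothetical values):

statement = '''sailfish index -t %(tmp)s'''
statement += ''' -k %(kmer)i -o %(outdir)s'''
print(statement % {"tmp": "transcripts.fa", "kmer": 31, "outdir": "sailfish.dir"})
# sailfish index -t transcripts.fa -k 31 -o sailfish.dir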
Example #39
def runCufflinks(infiles, outfile):
    '''estimate expression levels in each set.
    '''

    gtffile, bamfile = infiles

    job_options = "-pe dedicated %i -R y" % PARAMS["cufflinks_threads"]

    track = os.path.basename(P.snip(gtffile, ".gtf.gz"))

    tmpfilename = P.getTempFilename(".")
    if os.path.exists(tmpfilename):
        os.unlink(tmpfilename)

    gtffile = os.path.abspath(gtffile)
    bamfile = os.path.abspath(bamfile)
    outfile = os.path.abspath(outfile)

    # note: cufflinks adds \0 bytes to gtf file - replace with '.'
    # increase max-bundle-length to 4.5Mb due to Galnt-2 in mm9 with a 4.3Mb
    # intron.

    # AH: removed log messages about BAM record error
    # These cause logfiles to grow several Gigs and are
    # frequent for BAM files not created by tophat.
    # Error is:
    # BAM record error: found spliced alignment without XS attribute
    statement = '''mkdir %(tmpfilename)s;
    cd %(tmpfilename)s;
    cufflinks --label %(track)s
              --GTF <(gunzip < %(gtffile)s)
              --num-threads %(cufflinks_threads)i
              --frag-bias-correct %(bowtie_index_dir)s/%(genome)s.fa
              --library-type %(cufflinks_library_type)s
              %(cufflinks_options)s
              %(bamfile)s
    | grep -v 'BAM record error'
    >& %(outfile)s;
    perl -p -e "s/\\0/./g" < transcripts.gtf | gzip > %(outfile)s.gtf.gz;
    gzip < isoforms.fpkm_tracking > %(outfile)s.fpkm_tracking.gz;
    gzip < genes.fpkm_tracking > %(outfile)s.genes_tracking.gz;
    '''

    P.run()

    shutil.rmtree(tmpfilename)
Example #40
def findJunctions(infiles, outfile):
    '''map reads using all known junctions in order to identify possible
    new junctions; cat the junctions together and delete the tophat
    output directories.
    '''

    ins_size, std_dev = getInsertSizes(
        os.path.dirname(outfile[:-len(".junctions")]))

    nslots = 4
    fastq1, fastq2 = infiles[0]

    tmpfilename = P.getTempFilename()

    if os.path.exists(tmpfilename):
        os.unlink(tmpfilename)

    job_options = "-pe dedicated 4-8 -l mem_free=3G -R y"

    to_cluster = USECLUSTER

    # tophat does a seek operation on the fq files, hence they
    # need to unpacked into real files
    statement = '''
    gunzip < %(fastq1)s > %(tmpfilename)s.1.fq;
    gunzip < %(fastq2)s > %(tmpfilename)s.2.fq;
    tophat --output-dir %(tmpfilename)s
           --butterfly-search 
           --min-anchor-length 5 
           --closure-search 
           --microexon-search 
           --min-isoform-fraction 0.0 
           --mate-inner-dist %(ins_size)i 
           --mate-std-dev %(std_dev)i 
           --max-intron-length %(max_intron)i 
           --raw-juncs %(junctions_file)s 
           -p %(nslots)i 
           %(bowtiedir)s/%(genome)s
           %(tmpfilename)s.1.fq
           %(tmpfilename)s.2.fq
    >& %(outfile)s.log;
    mv %(tmpfilename)s/junctions.bed %(outfile)s >& %(outfile)s.log2;
    mv %(tmpfilename)s/logs %(outfile)s.logs >& %(outfile)s.log3;
    rm -rf %(tmpfilename)s %(tmpfilename)s.1.fq %(tmpfilename)s.2.fq >& %(outfile)s.log4
    '''
    P.run()
Example #42
def buildRefcodingGeneSetStats(infile, outfile):
    '''
    counts:
    no. of transcripts
    no. genes
    average number of exons per transcript
    average number of exons per gene
    no. multi-exon transcripts
    no. single exon transcripts
    no. multi-exon genes
    no. single exon genes

    in the coding and lncRNA genesets
    '''

    # calculate exon status for refcoding genes. 
    tmpf = P.getTempFilename(".") + ".gz"
    PipelineLncRNA.flagExonStatus(infile, tmpf)

    outf = open(outfile, "w")
    outf.write("\t".join(["no_transcripts", 
                          "no_genes", 
                          "no_exons_per_transcript", 
                          "no_exons_per_gene",
                          "no_single_exon_transcripts", 
                          "no_multi_exon_transcripts", 
                          "no_single_exon_genes", 
                          "no_multi_exon_genes"]) + "\n")
    outf.write("\t".join(map(str, [PipelineLncRNA.CounterTranscripts(tmpf).count(), 
                                   PipelineLncRNA.CounterGenes(tmpf).count(), 
                                   PipelineLncRNA.CounterExonsPerTranscript(tmpf).count(), 
                                   PipelineLncRNA.CounterExonsPerGene(tmpf).count(), 
                                   PipelineLncRNA.CounterSingleExonTranscripts(tmpf).count(), 
                                   PipelineLncRNA.CounterMultiExonTranscripts(tmpf).count(), 
                                   PipelineLncRNA.CounterSingleExonGenes(tmpf).count(), 
                                   PipelineLncRNA.CounterMultiExonGenes(tmpf).count()])))

    os.unlink(tmpf)
    os.unlink(tmpf + ".log")
    os.unlink(P.snip(tmpf, ".gz"))
Example #43
def buildCoverageOverContigs(infiles, outfile):
    '''
    build histograms of the coverage over each of the contigs
    '''
    bam = infiles[0]
    # genomecoveragebed does not like some of the 
    # output from bwa. bwa outputs some reads
    # that map off the end of contigs
    # as having a leftmost position of 0. This is
    # not ideal. Need to use temporary bam
    # files with only mapped reads - this is 
    # nasty and needs changing
    tempdir = P.getTempDir(".")
    tempname = P.getTempFilename(tempdir) + ".bam"
    P.submit("CGATPipelines.PipelineMetagenomeAssembly", 
             "filterBamOnPos", 
             infiles = bam, 
             outfiles = tempname)

    # tablename where alignment stats live
    tablename = (os.path.dirname(bam)[:-len(".dir")] + "_" +
                 P.snip(os.path.basename(bam), ".bam") + "_alignment_stats")

    # hack to convert to table - add .load
    tablename = P.toTable(tablename + ".load")
    
    # connect to database
    dbh = connect()
    cc = dbh.cursor()

    # get number of reads aligned from bam2stats
    if PARAMS.get("coverage_scale"):
        scale_factor = cc.execute("""SELECT counts FROM %s
                                     WHERE category == 'reads_mapped'""" % tablename).fetchone()[0]
        scale_factor = 1 / (float(scale_factor) / 1000000)
        scale_options = "-scale %(scale_factor)f"
    else:
        scale_options = ""

    statement = '''genomeCoverageBed -ibam %(tempname)s %(scale_options)s -d | gzip > %(outfile)s;
                   rm -rf %(tempdir)s'''
    P.run()
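The scaling above converts raw depth to reads-per-million: scale_factor = 1 / (reads_mapped / 1e6), so with a hypothetical 5,000,000 mapped reads each position's depth is multiplied by 0.2:

# hypothetical mapped-read count, for illustration
reads_mapped = 5000000
scale_factor = 1 / (float(reads_mapped) / 1000000)
assert scale_factor == 0.2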
Example #44
def buildBigBed( infile, outfile ):
    '''bed file with intervals that are covered by reads in any of the experiments.
    '''

    to_cluster = False

    tmpfile = P.getTempFilename()

    contig_sizes = os.path.join( PARAMS["annotations_dir"], PARAMS_ANNOTATIONS["interface_contigs"] )

    statement = '''
    zcat %(infile)s > %(tmpfile)s;
    bedToBigBed %(tmpfile)s %(contig_sizes)s %(outfile)s;
    rm -f %(tmpfile)s
    '''
    P.run()

    try:
        os.unlink(tmpfile)
    except OSError:
        pass
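bedToBigBed expects its input sorted by contig then start (sort -k1,1 -k2,2n); if %(infile)s is not guaranteed to be sorted, a variant of the statement with an explicit sort step would be:

# sketch: sort the bed before conversion
statement = '''
zcat %(infile)s | sort -k1,1 -k2,2n > %(tmpfile)s;
bedToBigBed %(tmpfile)s %(contig_sizes)s %(outfile)s;
rm -f %(tmpfile)s
'''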
def genericImportAnnotator(infiles, outfile, table, workspace, slice, subset,
                           fdr_method):
    '''generic import of annotator results.

    Assumes that the suffix of all infiles is the same.
    '''

    infile = " ".join(infiles)
    x, suffix = os.path.splitext(infiles[0])

    tmpfilename = P.getTempFilename()

    statement = '''
    python %(scriptsdir)s/annotator.py
        --method=fdr-table
        --fdr-method=%(fdr_method)s
        --log=%(outfile)s.log
        --regex-id="(.*)%(suffix)s"
        %(infile)s > %(tmpfilename)s
    '''
    P.run()

    tmpfile = P.getTempFile()

    for line in open(tmpfilename, "r"):
        if line.startswith("id"):
            line = "subset\tworkspace\tslice\t" + re.sub("^id", "track", line)
        else:
            line = "%s\t%s\t%s\t%s" % (subset, workspace, slice, line)
        tmpfile.write(line)
    tmpfile.close()
    tmpfilename2 = tmpfile.name

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s
        --table=%(table)s
    < %(tmpfilename2)s > %(outfile)s'''

    P.run()
    os.unlink(tmpfilename)
    os.unlink(tmpfilename2)
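To make the reformatting loop concrete: the header line has its "id"
column renamed to "track" and is prefixed with three new column names,
while each data line is prefixed with the actual subset, workspace and
slice values. With hypothetical annotator columns, a header "id\tfdr"
becomes "subset\tworkspace\tslice\ttrack\tfdr", and a data line
"input1\t0.01" becomes e.g. "motifs\tgenomic\tall\tinput1\t0.01"
(values illustrative).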
Example #46
def buildRefcodingGeneSetStats(infile, outfile):
    '''
    counts:
    no. of transcripts
    no. of genes
    average number of exons per transcript
    average number of exons per gene
    no. multi-exon transcripts
    no. single exon transcripts
    no. multi-exon genes
    no. single exon genes

    in the coding and lncRNA genesets
    '''

    # calculate exon status for refcoding genes.
    tmpf = P.getTempFilename(".") + ".gz"
    PipelineLncRNA.flagExonStatus(infile, tmpf)

    outf = open(outfile, "w")
    outf.write("\t".join([
        "no_transcripts", "no_genes", "no_exons_per_transcript",
        "no_exons_per_gene", "no_single_exon_transcripts",
        "no_multi_exon_transcripts", "no_single_exon_genes",
        "no_multi_exon_genes"
    ]) + "\n")
    outf.write("\t".join(
        map(str, [
            PipelineLncRNA.CounterTranscripts(tmpf).count(),
            PipelineLncRNA.CounterGenes(tmpf).count(),
            PipelineLncRNA.CounterExonsPerTranscript(tmpf).count(),
            PipelineLncRNA.CounterExonsPerGene(tmpf).count(),
            PipelineLncRNA.CounterSingleExonTranscripts(tmpf).count(),
            PipelineLncRNA.CounterMultiExonTranscripts(tmpf).count(),
            PipelineLncRNA.CounterSingleExonGenes(tmpf).count(),
            PipelineLncRNA.CounterMultiExonGenes(tmpf).count()
        ])) + "\n")
    outf.close()

    os.unlink(tmpf)
    os.unlink(tmpf + ".log")
    os.unlink(P.snip(tmpf, ".gz"))
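The resulting file is a two-line tab-separated table: the header row
written first, then a single row with the eight counts in the same order
(values below purely illustrative):

no_transcripts  no_genes  no_exons_per_transcript  ...  no_multi_exon_genes
35812           21436     8.2                      ...  15204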
def mapReadsWithBowtieAgainstJunctions(infiles, outfile):
    '''map reads from short read archive sequence using bowtie against
    splice junctions.

    The reads are converted to genomic coordinates.
    '''

    job_options = "-pe dedicated %i -R y -l mem_free=16G" % PARAMS[
        "bowtie_threads"]

    tmpfile = P.getTempFilename()

    infile, reffile, contigs = infiles
    track = P.snip(outfile, ".bam")
    prefix = P.snip(reffile, ".fa")

    to_cluster = USECLUSTER
    statement = '''
    gunzip < %(infile)s > %(tmpfile)s;
    checkpoint;
    bowtie -q
           --sam 
           -C
           --un /dev/null
           --threads %(bowtie_threads)s
           %(transcriptome_options)s 
           --best --strata -a
           %(prefix)s_cs
           %(tmpfile)s
    | python %(scriptsdir)s/bam2bam.py --set-nh --log=%(outfile)s.log
    | python %(scriptsdir)s/rnaseq_junction_bam2bam.py --contig-sizes=%(contigs)s --log=%(outfile)s.log
    | samtools sort - %(track)s;
    checkpoint;
    samtools index %(outfile)s
    checkpoint;
    rm -f %(tmpfile)s
    '''

    P.run()

    # the statement already removed tmpfile; guard the second unlink
    try:
        os.unlink(tmpfile)
    except OSError:
        pass
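The --set-nh step fills in the NH tag (number of reported alignments for
a read), which bowtie itself does not emit but downstream tools expect.
A minimal sketch of the idea, assuming alignments arrive grouped by query
name - the actual bam2bam.py may work differently:

import itertools
import pysam

def set_nh(inname, outname):
    inbam = pysam.AlignmentFile(inname, "rb")
    outbam = pysam.AlignmentFile(outname, "wb", template=inbam)
    # group consecutive alignments of the same read, count them,
    # and write NH on each record
    for qname, group in itertools.groupby(inbam, key=lambda r: r.query_name):
        reads = list(group)
        for read in reads:
            read.set_tag("NH", len(reads))
            outbam.write(read)
    outbam.close()
    inbam.close()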
Example #49
def extractControllLncRNAFastaAlignments(infiles, outfile):
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("/ifs/scratch")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)
    os.unlink(maf_tmp)
def genericImportAnnotator(infiles, outfile, table, workspace, slice, subset, fdr_method):
    '''generic import of annotator results.

    Assumes that the suffix of all infiles is the same.
    '''

    infile = " ".join(infiles)
    x, suffix = os.path.splitext(infiles[0])

    tmpfilename = P.getTempFilename()

    statement = '''
    python %(scriptsdir)s/annotator2tsv.py
        --method=fdr-table
        --fdr-method=%(fdr_method)s
        --log=%(outfile)s.log
        --regex-id="(.*)%(suffix)s"
        %(infile)s > %(tmpfilename)s
    '''
    P.run(**dict(locals().items() + PARAMS.items()))

    tmpfile = P.getTempFile()

    for line in open(tmpfilename, "r"):
        if line.startswith("id"):
            line = "subset\tworkspace\tslice\t" + re.sub("^id", "track", line)
        else:
            line = "%s\t%s\t%s\t%s" % (subset, workspace, slice, line)
        tmpfile.write(line)
    tmpfile.close()
    tmpfilename2 = tmpfile.name

    statement = '''
    python %(scriptsdir)s/csv2db.py %(csv2db_options)s
        --table=%(table)s
    < %(tmpfilename2)s > %(outfile)s'''

    P.run(**dict(locals().items() + PARAMS.items()))
    os.unlink(tmpfilename)
    os.unlink(tmpfilename2)
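Note that dict(locals().items() + PARAMS.items()) is a Python 2 idiom:
there dict.items() returns lists that can be concatenated. Under
Python 3 the equivalent merge (later keys winning on clashes) would be:

    P.run(**{**locals(), **PARAMS})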
Example #51
def runCufflinks(infiles, outfile):
    '''estimate expression levels in each set.
    '''

    gtffile, bamfile = infiles
    to_cluster = True

    job_options = "-pe dedicated %i -R y" % PARAMS["cufflinks_threads"]

    track = os.path.basename(P.snip(gtffile, ".gtf.gz"))

    tmpfilename = P.getTempFilename(".")
    if os.path.exists(tmpfilename):
        os.unlink(tmpfilename)

    gtffile = os.path.abspath(gtffile)
    bamfile = os.path.abspath(bamfile)
    outfile = os.path.abspath(outfile)

    # note: cufflinks adds \0 bytes to gtf file - replace with '.'
    # increase max-bundle-length to 4.5Mb due to Galnt-2 in mm9
    # with a 4.3Mb intron.
    statement = '''mkdir %(tmpfilename)s; 
    cd %(tmpfilename)s; 
    cufflinks --label %(track)s      
              --GTF <(gunzip < %(gtffile)s)
              --num-threads %(cufflinks_threads)i
              --frag-bias-correct %(bowtie_index_dir)s/%(genome)s.fa
              --library-type %(cufflinks_library_type)s
              %(cufflinks_options)s
              %(bamfile)s 
    >& %(outfile)s;
    perl -p -e "s/\\0/./g" < transcripts.gtf | gzip > %(outfile)s.gtf.gz;
    gzip < isoforms.fpkm_tracking > %(outfile)s.fpkm_tracking.gz;
    gzip < genes.fpkm_tracking > %(outfile)s.genes_tracking.gz;
    '''

    P.run()

    shutil.rmtree(tmpfilename)
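Cufflinks writes fixed output names (transcripts.gtf, isoforms.fpkm_tracking,
genes.fpkm_tracking) into its working directory, which is why the statement
cd's into a throwaway directory first. The gzipped tracking tables are plain
tab-separated files and can be inspected directly, e.g. with pandas (a
sketch; the filename is hypothetical, tracking_id and FPKM are standard
cufflinks columns):

import pandas as pd

fpkm = pd.read_csv("out.fpkm_tracking.gz", sep="\t", compression="gzip")
print(fpkm[["tracking_id", "FPKM"]].head())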
Example #52
def buildBigBed(infile, outfile):
    '''bed file with intervals that are covered by reads in any of the experiments.
    '''

    to_cluster = False

    tmpfile = P.getTempFilename()

    contig_sizes = os.path.join(
        PARAMS["annotations_dir"], PARAMS_ANNOTATIONS["interface_contigs"])

    statement = '''
    zcat %(infile)s > %(tmpfile)s;
    bedToBigBed %(tmpfile)s %(contig_sizes)s %(outfile)s;
    rm -f %(tmpfile)s
    '''
    P.run()

    try:
        os.unlink(tmpfile)
    except OSError:
        pass
def difference_to2(infile, outfile):
    '''compare bed files from several versions against version2.'''
    track = re.match(r"version\d+_(.*)\.bed", infile).groups()[0]

    tmpfile = P.getTempFilename()

    for version in VERSIONS:
        t = tmpfile + "%s" % version
        if version == "version2":
            statement = '''cut -f 5 < %(version)s_%(track)s.bed |\
                    python %(toolsdir)s/data2histogram.py --headers=%(version)s --bin-size=1 --min-value=1 > %(t)s
                    '''
        else:
            statement = '''
            intersectBed -v -a version2_%(track)s.bed -b %(version)s_%(track)s.bed | cut -f 5 |\
            python %(toolsdir)s/data2histogram.py --headers=%(version)s --bin-size=1 --min-value=1 > %(t)s
            '''
        P.run(**dict(locals().items() + PARAMS.items()))

    statement = '''
    python %(toolsdir)s/combine_tables.py --sort-keys=numeric %(tmpfile)s* > %(outfile)s
    '''
    P.run(**dict(locals().items() + PARAMS.items()))
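For reference, intersectBed -v reports entries of -a that have no overlap
in -b, so the non-version2 branch histograms only those version2 intervals
missing from each other version. A tiny illustration with hypothetical
coordinates:

# version2_track.bed:            other version's track.bed:
# chr1  100  200  n1  5          chr1  150  250  m1  3
# chr1  500  600  n2  7
#
# intersectBed -v -a version2_track.bed -b otherversion_track.bed
# -> chr1  500  600  n2  7       (the only -a interval with no -b overlap)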