def runSoapdenovo(infile, outfile):
    '''
    run soapdenovo
    '''
    job_options = "-l mem_free=30G"
    statement = PipelineMetagenomeAssembly.SoapDenovo2().build(infile)
    P.run()
def countReadsWithinWindows(bedfile,
                            windowfile,
                            outfile,
                            counting_method="midpoint"):
    '''count reads given in *bedfile* within intervals in
    *windowfile*.

    Both files need to be :term:`bed` formatted.

    Counting is done using bedtools. The counting method
    can be 'midpoint' or 'nucleotide'.
    '''
    job_options = "-l mem_free=4G"

    if counting_method == "midpoint":
        f = '''| awk '{a = $2+($3-$2)/2; printf("%s\\t%i\\t%i\\n", $1, a, a+1)}' '''
    elif counting_method == "nucleotide":
        f = ""
    else:
        raise ValueError("unknown counting method: %s" % counting_method)

    statement = '''
    zcat %(bedfile)s
    %(f)s
    | coverageBed -a stdin -b %(windowfile)s -split
    | sort -k1,1 -k2,2n
    | gzip
    > %(outfile)s
    '''

    P.run()
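# A minimal sketch of the midpoint reduction performed by the awk snippet in
# countReadsWithinWindows above: each bed interval is collapsed to a 1-bp
# interval at its midpoint, so that every read is assigned to exactly one
# window. Illustrative only, not part of the pipeline.
def bed_to_midpoints(lines):
    for line in lines:
        contig, start, end = line.rstrip("\n").split("\t")[:3]
        start, end = int(start), int(end)
        midpoint = start + (end - start) // 2
        yield "%s\t%i\t%i" % (contig, midpoint, midpoint + 1)

# a 100 bp read starting at 1000 is reduced to the interval 1050-1051
# print(list(bed_to_midpoints(["chr1\t1000\t1100\tread1\t0\t+"])))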
def collectMEMEResults(tmpdir, target_path, outfile):
    '''collect output from a MEME run in tmpdir
    and copy all over to target_path

    convert images output by MEME (.eps files) to 
    .png files.'''

    # copy over results
    try:
        os.makedirs(os.path.dirname(target_path))
    except OSError:
        # ignore "file exists" exception
        pass

    if os.path.exists(target_path):
        shutil.rmtree(target_path)
    shutil.move(tmpdir, target_path)

    shutil.copyfile(os.path.join(target_path, "meme.txt"), outfile)

    # convert images to png
    epsfiles = glob.glob(os.path.join(target_path, "*.eps"))

    for epsfile in epsfiles:
        b, ext = os.path.splitext(epsfile)
        pngfile = b + ".png"
        statement = '''convert %(epsfile)s %(pngfile)s '''
        P.run()
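# A more explicit variant of the "ignore file exists" idiom used in
# collectMEMEResults above: only the EEXIST error is swallowed and any other
# OSError still propagates. This is a sketch, not part of the pipeline code.
import errno
import os

def makedirs_exist_ok(path):
    try:
        os.makedirs(path)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise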
def runSpades(infile, outfile):
    '''
    run spades on each track
    '''
    job_options = " -l mem_free=30G"
    statement = PipelineMetagenomeAssembly.Spades().build(infile)
    P.run()
def computeOverlapCoding( infile, outfile ):
    '''compute overlap between coding markers and windows.

    This is done by setting the gene_id and transcript_id of each marker to the
    ENSEMBL gene_id and transcript_id that it overlaps. Markers not overlapping
    an ENSEMBL gene are removed.
    '''
    
    to_cluster = True
    tmpfilename = P.getTempFilename( dir = "." )
    
    statement = '''python %(scriptsdir)s/gtf2gtf.py
    --rename=gene \
    --apply=ensembl.diff.genes_ovl \
    < %(infile)s > %(tmpfilename)s
    '''
    
    P.run( **dict( locals().items() + PARAMS.items() ) )

    statement = '''python %(scriptsdir)s/gff2table.py 
    --filename-windows=<(python %(scriptsdir)s/bed2gff.py < windows.bed) 
    --decorator=counts
    --filename-data=%(tmpfilename)s \
    --skip-empty \
    --is-gtf \
    --log=%(outfile)s.log \
    < %(genome)s.fasta > %(outfile)s'''

    P.run( **dict( locals().items() + PARAMS.items() ) )

    os.unlink( tmpfilename )
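# computeOverlapCoding builds the interpolation dictionary for P.run()
# explicitly from locals() and PARAMS. A toy sketch of that %(name)s
# substitution (an assumption about what P.run() does with the merged
# dictionary; note that with dict(a.items() + b.items()) the entries of the
# second dictionary win on duplicate keys).
def interpolate(statement, params, **kwargs):
    subst = dict(list(kwargs.items()) + list(params.items()))
    return statement % subst

# print(interpolate("python %(scriptsdir)s/gtf2gtf.py < %(infile)s > %(tmpfilename)s",
#                   {"scriptsdir": "/path/to/scripts"},
#                   infile="markers.gtf", tmpfilename="tmp_markers.gtf"))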
def runGLAM2SCAN(infiles, outfile):
    '''run glam2scan on all intervals and motifs.
    '''

    to_cluster = True
    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    controlfile, dbfile, motiffiles = infiles
    controlfile = dbfile[:-len(".fasta")] + ".controlfasta"
    if not os.path.exists(controlfile):
        raise P.PipelineError(
            "control file %s for %s does not exist" % (controlfile, dbfile))

    if os.path.exists(outfile):
        os.remove(outfile)

    for motiffile in motiffiles:
        of = IOTools.openFile(outfile, "a")
        motif, x = os.path.splitext(motiffile)
        of.write(":: motif = %s ::\n" % motif)
        of.close()

        statement = '''
        cat %(dbfile)s %(controlfile)s | %(execglam2scan)s -2 -n %(glam2scan_results)i n %(motiffile)s - >> %(outfile)s
        '''
        P.run()
def runBioProspector(infiles, outfile, dbhandle):
    '''run bioprospector for motif discovery.

    Bioprospector is run on only the top 10% of peaks.
    '''

    # bioprospector currently not working on the nodes
    to_cluster = False

    # only use new nodes, as /bin/csh is not installed
    # on the old ones.
    # job_options = "-l mem_free=8000M"

    tmpfasta = P.getTempFilename(".")
    track = outfile[:-len(".bioprospector")]
    nseq = writeSequencesForIntervals(track,
                                      tmpfasta,
                                      dbhandle,
                                      full=True,
                                      masker="dust",
                                      proportion=PARAMS["bioprospector_proportion"])

    if nseq == 0:
        E.warn("%s: no sequences - bioprospector skipped" % track)
        P.touch(outfile)
    else:
        statement = '''
    BioProspector -i %(tmpfasta)s %(bioprospector_options)s -o %(outfile)s > %(outfile)s.log
    '''
        P.run()

    os.unlink(tmpfasta)
def buildTranscriptLevelReadCounts(infiles, outfile):
    '''count reads falling into transcripts of protein coding gene models.

    .. note::
       In paired-end data sets each mate will be counted. Thus
       the actual read counts are approximately twice the fragment
       counts.

    '''
    bamfile, geneset = infiles

    if BamTools.isPaired(bamfile):
        counter = 'readpair-counts'
    else:
        counter = 'read-counts'

    statement = '''
    zcat %(geneset)s
    | python %(scriptsdir)s/gtf2table.py
          --reporter=transcripts
          --bam-file=%(bamfile)s
          --counter=length
          --prefix="exons_"
          --counter=%(counter)s
          --prefix=""
          --counter=read-coverage
          --prefix=coverage_
          --min-mapping-quality=%(counting_min_mapping_quality)i
          --multi-mapping=ignore
          --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''

    P.run()
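# BamTools.isPaired decides between the 'readpair-counts' and 'read-counts'
# counters above. A minimal pysam-based sketch of such a check (an assumption;
# the actual BamTools implementation may differ): inspect the first few
# alignments and report whether any of them is flagged as paired.
import pysam

def is_paired(bamfile, nreads=100):
    samfile = pysam.AlignmentFile(bamfile, "rb")
    try:
        for i, read in enumerate(samfile):
            if read.is_paired:
                return True
            if i >= nreads:
                break
    finally:
        samfile.close()
    return False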
def buildGeneLevelReadCounts(infiles, outfile):
    '''compute read counts and coverage of exons with reads.
    '''

    bamfile, exons = infiles

    if BamTools.isPaired(bamfile):
        counter = 'readpair-counts'
    else:
        counter = 'read-counts'

    # ignore multi-mapping reads
    statement = '''
    zcat %(exons)s
    | python %(scriptsdir)s/gtf2table.py
          --reporter=genes
          --bam-file=%(bamfile)s
          --counter=length
          --prefix="exons_"
          --counter=%(counter)s
          --prefix=""
          --counter=read-coverage
          --prefix=coverage_
          --min-mapping-quality=%(counting_min_mapping_quality)i
          --multi-mapping=ignore
          --log=%(outfile)s.log
    | gzip
    > %(outfile)s
    '''

    P.run()
def loadPicardAlignStats(infiles, outfile):
    '''Merge Picard alignment stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = P.getTempFile()
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".alignstats")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [
            x for x in open(f, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --index=track
                      --table=%(tablename)s 
                > %(outfile)s'''
    P.run()
    os.unlink(tmpfilename)
def loadPicardDuplicateStats(infiles, outfile):
    '''Merge Picard duplicate stats into single table and load into SQLite.'''
    # Join data for all tracks into single file
    outf = open('dupstats.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".dedup.bam")
        statfile = P.snip(f, ".bam") + ".dupstats"
        if not os.path.exists(statfile):
            E.warn("File %s missing" % statfile)
            continue
        lines = [x for x in open(
            statfile, "r").readlines() if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
        first = False
        outf.write("%s\t%s" % (track, lines[1]))

    outf.close()
    tmpfilename = outf.name

    # Load into database
    tablename = P.toTable(outfile)
    statement = '''cat %(tmpfilename)s
                | python %(scriptsdir)s/csv2db.py
                      --index=track
                      --table=%(tablename)s 
                > %(outfile)s '''
    P.run()
def buildUniformityOfCoverage(infiles, outfile):
    '''
    build matrix of coverage over contigs
    '''
    bam = infiles[0]
    track = P.snip(os.path.basename(bam), ".bam")
    tmp_bed = P.getTempFilename(".") + ".bed"
    tmp_bam = P.getTempFilename(".") + ".bam"
    
    # filter for mapped reads
    statement = '''cat %(bam)s | python %(scriptsdir)s/bam2bam.py --filter=mapped --log=/dev/null > %(tmp_bam)s
                   ; samtools index %(tmp_bam)s'''
    P.run()

    for infs in infiles[1:]:
        for inf in infs:
            if P.snip(inf, ".lengths.tsv") == track:
                length_file = inf
                

    statement = '''cat %(length_file)s | awk 'NR>1 {printf("%%s\\t0\\t%%s\\n", $1, $2)}' > %(tmp_bed)s'''
    P.run()

    statement = '''python %(scriptsdir)s/bam2peakshape.py 
                   --only-interval %(tmp_bam)s %(tmp_bed)s 
                   --log=%(outfile)s.log 
                   --output-filename-pattern=%(track)s.%%s'''
    P.run()
    os.unlink(tmp_bed)
    os.unlink(tmp_bam)
def buildBAMStats( infile, outfile ):
    '''Count number of reads mapped, duplicates, etc. '''
    to_cluster = USECLUSTER
    scriptsdir = PARAMS["general_scriptsdir"]
    statement = '''python %(scriptsdir)s/bam2stats.py --force 
                   --output-filename-pattern=%(outfile)s.%%s < %(infile)s > %(outfile)s'''
    P.run()
def buildDownstreamFlankBed(infile, outfile):
    """ build interval downstream of gene start for each entry in bed file"""
    window = PARAMS["geneset_flank"]
    faidx = PARAMS["faidx"]
    statement = """flankBed -i %(infile)s -g %(faidx)s -l 0 -r %(window)s -s 
                   | python %(scriptsdir)s/bed2bed.py --method=filter-genome --genome-file=%(genome_dir)s/%(genome)s --log %(outfile)s.log > %(outfile)s"""
    P.run()
def ExtendRegion(infile, outfile):
    """convert bed to gtf"""
    statement = """gunzip < %(infile)s 
                   | slopBed -i stdin -g %(faidx)s -b 1000  
                   | gzip
                   > %(outfile)s """
    P.run()
def getNoncodingGeneset(infile, outfile):
    """Assume that all transcripts the do not overlap with ensembl coding geneset are noncoding """
    ensembl_transcripts = PARAMS["ensembl_transcripts"]
    statement = """cat %(infile)s | intersectBed -a stdin -b %(ensembl_transcripts)s -v -s > %(outfile)s;
                   echo "transcripts without ensembl coding overlap: " > %(outfile)s.count; 
                   cat %(outfile)s | wc -l >> %(outfile)s.count;"""
    P.run()
def addMissingNoncodingTranscripts(infile, outfile):
    """ Add ensembl gene id to GTF file"""
    ensembl_noncoding = PARAMS["ensembl_noncoding_gtf"]
    statement = """intersectBed -a %(ensembl_noncoding)s -b %(infile)s  -v -s -f 1 -r > transcripts/missing_ensembl_noncoding_transcripts.gtf;
                   cat %(infile)s transcripts/missing_ensembl_noncoding_transcripts.gtf | sort -k1,1 -k4,4n
                   > %(outfile)s;"""
    P.run()
def extractLncRNAFastaAlignments(infiles, outfile):
    """
    Receives a MAF file containing pairwise alignments and a bed12 file
    containing intervals. Outputs a single fasta file containing aligned
    sequence for each interval.
    """
    bed_file, maf_file = infiles
    maf_tmp = P.getTempFilename("./phyloCSF")
    to_cluster = False
    statement = ("gunzip -c %(maf_file)s > %(maf_tmp)s")
    P.run()

    target_genome = PARAMS["genome"]
    query_genome = PARAMS["phyloCSF_query_genome"]

    genome_file = os.path.join(PARAMS["genomedir"], PARAMS["genome"])

    gene_models = PipelineLncRNA.extractMAFGeneBlocks(bed_file,
                                                      maf_tmp,
                                                      genome_file,
                                                      outfile,
                                                      target_genome,
                                                      query_genome,
                                                      keep_gaps=False)
    E.info("%i gene_models extracted" % gene_models)
    os.unlink(maf_tmp)
def loadEffects(infile, outfile):
    '''load transcript effects into tables.'''

    root = infile[:-len(".effects.gz")]

    statement = '''
   python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
              --from-zipped \
              --index=transcript_id \
              --table=%(root)s_effects \
    < %(infile)s > %(outfile)s
    '''
    P.run()

    for suffix in ("cds", "intron", "splicing", "translation"):

        statement = '''
        gunzip < %(infile)s.%(suffix)s.gz
        | python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
        --allow-empty
        --index=transcript_id 
        --table=%(root)s_effects_%(suffix)s 
        --ignore-column=seq_na
        --ignore-column=seq_aa
        >> %(outfile)s
        '''
        P.run()
def buildFilteredLncRNAGeneSet(infile, outfile):
    '''
    Depending on filtering_remove_single_exon will:
    i) remove all single-exon transcripts from all lncRNA models
    (transcripts)
    ii) remove lncRNA loci that only contain single-exon transcripts
    (loci)
    iii) leave all single-exon and multi-exon loci in outfile
    (None)
    '''

    if not PARAMS["filtering_remove_single_exon"]:
        E.info("Both multi-exon and single-exon lncRNA are retained!")
        statement = ("cp %(infile)s %(outfile)s")
    elif PARAMS["filtering_remove_single_exon"] == "loci":
        E.info("Warning: removing loci with only single-exon transcripts")
        statement = ("zcat %(infile)s |"
                     " grep -v 'exon_status_locus \"s\"' |"
                     " gzip > %(outfile)s")
    elif PARAMS["filtering_remove_single_exon"] == "transcripts":
        E.info("Warning: removing all single-exon"
               " transcripts from lncRNA set")
        statement = ("zcat %(infile)s |"
                     " grep -v 'exon_status \"s\"' |"
                     " gzip > %(outfile)s")
    else:
        raise ValueError("Unrecognised parameter %s"
                         % PARAMS["filtering_remove_single_exon"])
    P.run()
def loadLncRNAClass(infile, outfile):
    '''
    load the lncRNA classifications
    '''
    tablename = os.path.basename(
        filenameToTablename(P.snip(infile, ".gtf.gz")))

    to_cluster = False
    # just load each transcript with its classification
    temp = P.getTempFile()
    inf = IOTools.openFile(infile)
    for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
        temp.write("%s\t%s\t%s\n" % (
            transcript[0].transcript_id, 
            transcript[0].gene_id, 
            transcript[0].source))
    temp.close()

    inf_1 = temp.name
    statement = ("python %(scriptsdir)s/csv2db.py"
                 "  -t %(tablename)s"
                 "  --log=%(outfile)s.log"
                 "  --header=transcript_id,gene_id,class"
                 " < %(inf_1)s > %(outfile)s")
    P.run()
def makeSegments( infile, outfile ):
    '''compute the size distribution of segments and of merged gene models.'''

    to_cluster = True

    statement = '''gunzip < %(infile)s 
    | %(scriptsdir)s/gff_sort pos 
    | python %(scriptsdir)s/gff2histogram.py 
		--method=values 
		--output-filename-pattern="%(outfile)s.%%s"
		--force 
		--log=%(outfile)s.log 
    > %(outfile)s 
    '''
    P.run()

    statement = '''gunzip 
    < %(infile)s 
    | python %(scriptsdir)s/gtf2gtf.py --sort=position+gene
    | python %(scriptsdir)s/gtf2gtf.py --merge-transcripts 
    | python %(scriptsdir)s/gtf2gtf.py --sort=gene
    | python %(scriptsdir)s/gff2histogram.py 
		--method=values 
		--force 
		--output-filename-pattern="%(outfile)s_genes.%%s" 
		--log=%(outfile)s.log
    >> %(outfile)s'''
    P.run()
def loadRepeatInformation( infiles, outfile ):
    '''load repeat coverage per contig into the database.'''
    
    to_cluster = True

    table = outfile[:-len(".load")]

    repeatsfile, indexfile = infiles

    tmpfilename = P.getTempFilename( "." )

    statement = '''awk '{printf("%%s\\t0\\t%%i\\n", $1, $4)}' < %(indexfile)s > %(tmpfilename)s'''
    P.run()

    statement = '''
        gunzip < %(repeatsfile)s 
        | python %(scriptsdir)s/gff2bed.py -v 0 
        | coverageBed -a stdin -b %(tmpfilename)s
        | awk 'BEGIN { printf("contig\\tstart\\tend\\tnover_entries\\tnover_bases\\tlength\\tpover\\n" );} {print;}'
        |python %(scriptsdir)s/csv2db.py %(csv2db_options)s 
              --table=%(table)s 
        > %(outfile)s
    '''
    P.run()

    os.unlink( tmpfilename )
def buildBenchmarkInput(infile, outfile):

    tmpfile = P.getTempFile()

    dbhandle = sqlite3.connect(PARAMS["database"])
    cc = dbhandle.cursor()
    statement = '''
    SELECT DISTINCT transcript_id, protein_id FROM peptide_info
    '''
    cc.execute(statement)
    tmpfile.write("transcript_id\tprotein_id\n")
    tmpfile.write("\n".join(["\t".join(x) for x in cc]))
    tmpfile.write("\n")
    tmpfile.close()
    tmpfilename = tmpfile.name

    statement = '''
    perl %(scriptsdir)s/extract_fasta.pl %(infile)s
    < cds.fasta
    | python %(scriptsdir)s/fasta2variants.py --is-cds
    | python %(scriptsdir)s/substitute_tokens.py 
             --apply=%(tmpfilename)s
    > %(outfile)s
    '''
    P.run()

    os.unlink(tmpfilename)
def buildPicardAlignStats(infile, outfile):
    '''Gather BAM file alignment statistics using Picard '''
    to_cluster = USECLUSTER
    track = P.snip(os.path.basename(infile), ".bam")
    statement = '''CollectAlignmentSummaryMetrics INPUT=%(infile)s REFERENCE_SEQUENCE=%%(samtools_genome)s ASSUME_SORTED=true OUTPUT=%(outfile)s VALIDATION_STRINGENCY=SILENT ''' % locals(
    )
    P.run()
def exportMotifDiscoverySequences( infile, outfile ):
    '''export sequences for motif discovery.

    This method requires the _interval tables.

    For motif discovery, only the sequences with the highest S/N ratio are supplied.
    
    1. The top *motifs_proportion* intervals sorted by peakval
    2. Only a region +/- *motifs_halfwidth* around the peak 
    3. At least *motifs_min_sequences*. If there are not enough sequences
          to start with, all will be used.
    4. At most *motifs_max_size* sequences will be output.
    '''
    track = P.snip( infile, "_intervals.load" )
    dbhandle = connect()
        
    p = P.substituteParameters( **locals() )
    nseq = PipelineMotifs.writeSequencesForIntervals( track, 
                                                      outfile,
                                                      dbhandle,
                                                      full = False,
                                                      masker = P.asList(p['motifs_masker']),
                                                      halfwidth = int(p["motifs_halfwidth"]),
                                                      maxsize = int(p["motifs_max_size"]),
                                                      proportion = p["motifs_proportion"],
                                                      min_sequences = p["motifs_min_sequences"],
                                                      num_sequences = p["motifs_num_sequences"],
                                                      order = p['motifs_score'])

    if nseq == 0:
        E.warn( "%s: no sequences - meme skipped" % outfile)
        P.touch( outfile )
def exportMotifLocations( infiles, outfile ):
    '''export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.
    '''

    dbh = connect()
    cc = dbh.cursor()

    motifs = [ x[0] for x in cc.execute( "SELECT motif FROM motif_info" ).fetchall()]

    
    for motif in motifs:

        tmpf = P.getTempFile(".")
        
        for infile in infiles:
            table = P.toTable(infile) 
            track = P.snip( table, "_mast" )
            for x in cc.execute( """SELECT contig, start, end, '%(track)s', evalue
                                   FROM %(table)s WHERE motif = '%(motif)s' AND start IS NOT NULL""" % locals() ):
                tmpf.write( "\t".join( map(str, x) ) + "\n" )
        tmpf.close()

        outfile = os.path.join( PARAMS["exportdir"], "motifs", "%s.bed.gz" % motif )
        tmpfname = tmpf.name 

        statement = '''mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s'''
        P.run()

        os.unlink( tmpf.name )
def buildAnnotations( infiles, outfile ):
    '''annotate transcripts by location (intergenic, intronic, ...)'''
    
    infile, annotation = infiles

    statement = '''gunzip 
    < %(infile)s 
    | python %(scriptsdir)s/gtf2gtf.py --sort=gene
    | %(cmd-farm)s --split-at-column=1 --output-header --log=%(outfile)s.log --max-files=60 
	"python %(scriptsdir)s/gtf2table.py 
		--counter=position 
		--counter=classifier 
		--section=exons 
		--section=introns 
		--counter=length 
		--counter=splice 
		--counter=composition-na 
		--counter=splice-comparison 
		--log=%(outfile)s.log 
                --filename-format=gff
		--filename-gff=%(annotation)s 
		--genome-file=%(genome_dir)s/%(genome)s"
    | gzip
    > %(outfile)s
    '''
    P.run()
def buildContigSummary(infiles, outfile):
    '''
    merge the contig summary statistics
    '''
    stats = collections.defaultdict(list)
    for filepath in infiles:
        dirname = os.path.dirname(filepath)
        stats[dirname].append(os.path.basename(filepath))

    N = PARAMS["scaffold_n"]

    # connect to database
    dbh = connect()
    cc = dbh.cursor()
    for dirname in stats.keys():
        outfname = os.path.join(dirname, "contig.summary.tsv")
        outf = open(outfname, "w")
        outf.write(
            "track\tnscaffolds\tscaffold_length\tN%i\tmean_length\tmedian_length\tmax_length\n" % N)
        for infile in stats[dirname]:
            track = P.snip(
                infile.split(dirname.split(".dir")[0])[1][1:], ".summary.load")
            table = P.toTable(infile)
            data = cc.execute("""SELECT nscaffolds
                                 , scaffold_length
                                 , N50
                                 , mean_length
                                 , median_length
                                 , max_length FROM %s""" % table).fetchone()
            outf.write("\t".join(
                map(str, [track, data[0], data[1], data[2],
                          data[3], data[4], data[5]])) + "\n")
        outf.close()
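# buildContigSummary reports an N50 column taken from the per-track tables.
# For reference, a small sketch of how an N-statistic is typically computed
# from a list of scaffold lengths (the pipeline itself reads the precomputed
# value from the database).
def n_statistic(lengths, n=50):
    '''return the length L such that scaffolds of length >= L
    cover at least n% of the total assembly length.'''
    total = sum(lengths)
    running = 0
    for length in sorted(lengths, reverse=True):
        running += length
        if running >= total * n / 100.0:
            return length

# N50 of these scaffolds is 50: 70 + 50 = 120 covers at least half of 190
# print(n_statistic([70, 50, 30, 20, 10, 10]))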
def renameTranscriptsInPreviousSets(infile, outfile):
    '''
    transcripts need to be renamed because they may use the same
    cufflinks identifiers as we use in the analysis - don't do if they
    have an ensembl id - sort by transcript
    '''
    inf = IOTools.openFile(infile)
    for gtf in GTF.iterator(inf):
        if gtf.gene_id.find("ENSG") != -1:
            statement = '''zcat %(infile)s | grep -v "#"
                        | python %(scriptsdir)s/gtf2gtf.py 
                        --sort=gene
                        --log=%(outfile)s.log
                        | gzip > %(outfile)s'''
        else:
            gene_pattern = "GEN" + P.snip(outfile, ".gtf.gz")
            transcript_pattern = gene_pattern.replace("GEN", "TRAN")
            statement = '''zcat %(infile)s | python %(scriptsdir)s/gtf2gtf.py 
                           --renumber-genes=%(gene_pattern)s%%i 
                           | python %(scriptsdir)s/gtf2gtf.py
                           --renumber-transcripts=%(transcript_pattern)s%%i 
                           | python %(scriptsdir)s/gtf2gtf.py
                           --sort=gene 
                           --log=%(outfile)s.log
                          | gzip > %(outfile)s'''

    P.run()
def loadFilteringSummary(infile, outfile):
    '''load filtering summary.'''
    P.load(infile, outfile)
def update_report():
    '''update report.'''

    E.info( "updating documentation" )
    P.run_report( clean = False )
def build_report():
    '''build report from scratch.'''

    E.info( "starting documentation build process from scratch" )
    P.run_report( clean = True )
def buildPicardAlignStats( infile, outfile ):
    '''Gather BAM file alignment statistics using Picard '''
    to_cluster = USECLUSTER
    track = P.snip( os.path.basename(infile), ".bam" )
    statement = '''CollectAlignmentSummaryMetrics INPUT=%(infile)s REFERENCE_SEQUENCE=%%(samtools_genome)s ASSUME_SORTED=true OUTPUT=%(outfile)s VALIDATION_STRINGENCY=SILENT ''' % locals()
    P.run()
def sortByPosition( infile, outfile ):
    '''sort BAM file by position'''
    to_cluster = USECLUSTER
    track = P.snip(outfile, ".bam")
    statement = '''samtools sort %(infile)s %(track)s;'''
    P.run()
import os
import pysam
import numpy
import gzip
import fileinput
import CGATPipelines.PipelineTracks as PipelineTracks
import CGATPipelines.PipelineMapping as PipelineMapping

USECLUSTER = True

###################################################
###################################################
###################################################
## Pipeline configuration
###################################################
import CGAT.Pipeline as P
P.getParameters(  ["%s/pipeline.ini" % os.path.splitext(__file__)[0],  "../pipeline.ini", "pipeline.ini" ] )
PARAMS = P.PARAMS

bowtie_options = {'n0m1':"-n 0 -a --best --strata -m 1 -3 1",'n1m1':"-n 1 -a --best --strata -m 1 -3 1",'n2m1':"-n 2 -a --best --strata -m 1 -3 1",'n3m1':"-n 3 -a --best --strata -m 1 -3 1",
                  'n0m2':"-n 0 -a --best --strata -m 2 -3 1",'n1m2':"-n 1 -a --best --strata -m 2 -3 1",'n2m2':"-n 2 -a --best --strata -m 2 -3 1",'n3m2':"-n 3 -a --best --strata -m 2 -3 1",
                  'n0m3':"-n 0 -a --best --strata -m 3 -3 1",'n1m3':"-n 1 -a --best --strata -m 3 -3 1",'n2m3':"-n 2 -a --best --strata -m 3 -3 1",'n3m3':"-n 3 -a --best --strata -m 3 -3 1",
                  'n0m4':"-n 0 -a --best --strata -m 4 -3 1",'n1m4':"-n 1 -a --best --strata -m 4 -3 1",'n2m4':"-n 2 -a --best --strata -m 4 -3 1",'n3m4':"-n 3 -a --best --strata -m 4 -3 1",
                  'n0m5':"-n 0 -a --best --strata -m 5 -3 1",'n1m5':"-n 1 -a --best --strata -m 5 -3 1",'n2m5':"-n 2 -a --best --strata -m 5 -3 1",'n3m5':"-n 3 -a --best --strata -m 5 -3 1",
                  'v0m1':"-v 0 -a --best --strata -m 1 -3 1",'v1m1':"-v 1 -a --best --strata -m 1 -3 1",'v2m1':"-v 2 -a --best --strata -m 1 -3 1",'v3m1':"-v 3 -a --best --strata -m 1 -3 1",
                  'v0m2':"-v 0 -a --best --strata -m 2 -3 1",'v1m2':"-v 1 -a --best --strata -m 2 -3 1",'v2m2':"-v 2 -a --best --strata -m 2 -3 1",'v3m2':"-v 3 -a --best --strata -m 2 -3 1",
                  'v0m3':"-v 0 -a --best --strata -m 3 -3 1",'v1m3':"-v 1 -a --best --strata -m 3 -3 1",'v2m3':"-v 2 -a --best --strata -m 3 -3 1",'v3m3':"-v 3 -a --best --strata -m 3 -3 1",
                  'v0m4':"-v 0 -a --best --strata -m 4 -3 1",'v1m4':"-v 1 -a --best --strata -m 4 -3 1",'v2m4':"-v 2 -a --best --strata -m 4 -3 1",'v3m4':"-v 3 -a --best --strata -m 4 -3 1",
                  'v0m5':"-v 0 -a --best --strata -m 5 -3 1",'v1m5':"-v 1 -a --best --strata -m 5 -3 1",'v2m5':"-v 2 -a --best --strata -m 5 -3 1",'v3m5':"-v 3 -a --best --strata -m 5 -3 1"}
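# The bowtie_options table above varies only in the mismatch mode (-n or -v),
# the number of mismatches (0-3) and the -m reporting threshold (1-5). The
# same mapping can be generated programmatically; this sketch produces
# identical strings and is shown only to make the pattern explicit.
bowtie_options_generated = dict(
    ("%s%im%i" % (mode, mismatches, m),
     "-%s %i -a --best --strata -m %i -3 1" % (mode, mismatches, m))
    for mode in ("n", "v")
    for mismatches in range(4)
    for m in range(1, 6))

# sanity check: the generated table matches the hand-written one above
assert bowtie_options_generated == bowtie_options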

###################################################################
###################################################################
def replaceBaseWithN(infile, outfile):
    '''replaces the specified base with N'''

    to_cluster = True
    statement = '''python %(scriptsdir)s/fastq2N.py -i %(infile)s %(replace_options)s'''
    P.run()
def processReads(infiles, outfile):
    '''process reads.'''

    infile, contaminant_file = infiles

    do_sth = False
    to_cluster = True

    infile2 = checkPairs(infile)

    if infile2:
        track = P.snip(outfile, ".fastq.1.gz")
        outfile2 = P.snip(outfile, ".fastq.1.gz") + ".fastq.2.gz"
    else:
        track = P.snip(outfile, ".fastq.gz")

    if PARAMS["process_combine_reads"]:
        E.warn(
            "combining reads cannot be combined with other processing"
            " for paired-end reads"
        )
        if not infile2: raise IOError("must have paired data to combine reads")

        read_len, frag_len, frag_stdev = PARAMS["combine_reads_read_length"], \
            PARAMS["combine_reads_fragment_length"], \
            PARAMS["combine_reads_fragment_length_stdev"]

        fragment_options = " ".join(map(str, [read_len, frag_len, frag_stdev]))

        if PARAMS["combine_reads_max_overlap"]:
            E.warn(
                "if specifying --max-overlap read and fragment length options will be ignored"
            )
            max_overlap = "--max-overlap=%i" % PARAMS[
                "combine_reads_max_overlap"]
            fragment_options = ""

        elif not PARAMS["combine_reads_max_overlap"] and len(
                fragment_options.strip().split(" ")) < 3:
            E.warn(
                "have not specified --read-len, --frag-len, --frag-len-stddev: default --max-overlap used"
            )
            max_overlap = ""
            fragment_options = ""

        elif PARAMS["combine_reads_read_length"] and PARAMS[
                "combine_reads_fragment_length"] and PARAMS[
                    "combine_reads_fragment_length_stdev"]:
            if PARAMS["combine_reads_max_overlap"]:
                E.warn(
                    "--max-overlap will override the specified read and fragment length options"
                )
            max_overlap = ""
            fragment_options = """--read-len=%(read_len)i
                                  --fragment-len=%(frag_len)i
                                  --fragment-len-stddev=%(frag_stdev)i""" % locals(
            )
        else:
            max_overlap = ""
            fragment_options = ""

        if not PARAMS["combine_reads_min_overlap"]:
            min_overlap = ""
        else:
            min_overlap = "--min-overlap=%i" % PARAMS[
                "combine_reads_min_overlap"]
        if not PARAMS["combine_reads_threads"]:
            threads = ""
        else:
            threads = "--threads=%i" % PARAMS["combine_reads_threads"]
        if not PARAMS["combine_reads_phred_offset"]:
            phred_offset = ""
        else:
            phred_offset = "--phred-offset=%i" % PARAMS[
                "combine_reads_phred_offset"]
        if not PARAMS["combine_reads_max_mismatch_density"]:
            max_mismatch_density = ""
        else:
            max_mismatch_density = "--max-mismatch-density=%f" % PARAMS[
                "combine_reads_max_mismatch_density"]

        statement = '''flash 
                     %(min_overlap)s
                     %(max_overlap)s
                     %(max_mismatch_density)s
                     %(phred_offset)s
                     %(fragment_options)s
                     --output-prefix=%(track)s
                     %(threads)s
                     --compress
                     %(infile)s %(infile2)s >> %(outfile)s.log
                     '''
        P.run()
        if PARAMS["combine_reads_concatenate"]:
            infiles = " ".join([
                track + x for x in [
                    ".notCombined_1.fastq.gz", ".notCombined_2.fastq.gz",
                    ".extendedFrags.fastq.gz"
                ]
            ])
            statement = '''zcat %(infiles)s | gzip > %(outfile)s; rm -rf %(infiles)s'''
        else:
            statement = '''mv %(track)s.extendedFrags.fastq.gz %(outfile)s'''
        P.run()
        return

    if PARAMS["process_sample"] and infile2:
        E.warn(
            "sampling cannot be combined with other processing"
            " for paired-end reads"
        )
        statement = '''zcat %(infile)s
        | python %(scriptsdir)s/fastq2fastq.py 
                                   --sample=%(sample_proportion)f 
                                   --pair=%(infile2)s 
                                   --outfile-pair=%(outfile2)s 
                                   --log=%(outfile)s_sample.log
        | gzip 
        > %(outfile)s
        '''

        P.run()
        return

    # fastx does not like quality scores below 64 (Illumina 1.3 format)
    # need to detect the scores and convert
    format = Fastq.guessFormat(IOTools.openFile(infile), raises=False)
    E.info("%s: format guess: %s" % (infile, format))
    offset = Fastq.getOffset(format, raises=False)

    if PARAMS["process_remove_contaminants"]:
        adaptors = listAdaptors(contaminant_file)
        #              %(contamination_trim_type)s
        s = [
            '''
        cutadapt 
              %(adaptors)s
              --overlap=%(contamination_min_overlap_length)i
              --format=fastq
              %(contamination_options)s
              <( zcat < %(infile)s )
              2>> %(outfile)s_contaminants.log
        '''
        ]
        do_sth = True
    else:
        s = ['zcat %(infile)s']

    if PARAMS["process_artifacts"]:
        s.append(
            'fastx_artifacts_filter -Q %(offset)i -v %(artifacts_options)s 2>> %(outfile)s_artifacts.log'
        )
        do_sth = True

    if PARAMS["process_trim"]:
        s.append(
            'fastx_trimmer -Q %(offset)i -v %(trim_options)s 2>> %(outfile)s_trim.log'
        )
        do_sth = True

    # NICK - may replace fastx trimmer
    if PARAMS["process_trim_quality"]:
        s.append(
            'fastq_quality_trimmer -Q %(offset)i  -v %(trim_quality_options)s 2>> %(outfile)s_trim.log'
        )
        do_sth = True

    if PARAMS["process_filter"]:
        s.append(
            'fastq_quality_filter -Q %(offset)i -v %(filter_options)s 2>> %(outfile)s_filter.log'
        )
        do_sth = True

    if PARAMS["process_sample"]:
        s.append(
            'python %(scriptsdir)s/fastq2fastq.py --sample=%(sample_proportion)f --log=%(outfile)s_sample.log'
        )

    if not do_sth:
        E.warn("no filtering specified for %s - nothing done" % infile)
        return

    s.append("gzip")
    if not infile2:
        statement = " | ".join(s) + " > %(outfile)s"
        P.run()
    else:
        tmpfile = P.getTempFilename(".")
        tmpfile1 = tmpfile + ".fastq.1.gz"
        tmpfile2 = tmpfile + ".fastq.2.gz"

        E.warn("processing first of pair")
        # first read pair
        statement = " | ".join(s) + " > %(tmpfile1)s"
        P.run()

        # second read pair
        E.warn("processing second of pair")
        infile = infile2
        statement = " | ".join(s) + " > %(tmpfile2)s"
        P.run()

        # reconcile
        E.info("starting reconciliation")
        statement = """python %(scriptsdir)s/fastqs2fastqs.py
                           --method=reconcile
                           --output-pattern=%(track)s.fastq.%%s.gz
                           %(tmpfile1)s %(tmpfile2)s
                     > %(outfile)s_reconcile.log"""

        P.run()

        os.unlink(tmpfile1)
        os.unlink(tmpfile2)
        os.unlink(tmpfile)
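# Fastq.guessFormat / Fastq.getOffset above determine the quality-score offset
# so that the fastx tools can be given the right -Q value. A rough sketch of
# the underlying idea (an assumption -- the real CGAT implementation is more
# careful about ambiguous ranges): quality characters below ';' (ASCII 59) can
# only occur with the Sanger/Illumina-1.8 offset of 33, while a minimum of '@'
# (ASCII 64) or above suggests the old Illumina offset of 64.
def guess_phred_offset(quality_strings):
    observed = set()
    for qual in quality_strings:
        observed.update(ord(c) for c in qual)
    if not observed:
        return None
    if min(observed) < 59:
        return 33
    if min(observed) >= 64:
        return 64
    return None   # ambiguous - cannot decide from these reads alone

# print(guess_phred_offset(["IIIIHHGG!"]))   # -> 33
# print(guess_phred_offset(["hhhggfba"]))    # -> 64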
def publish():
    '''publish files.'''
    P.publish_report()
def loadAllProcessingSummary(infile, outfile):
    P.load(infile, outfile)
def loadCountKeggAssociations(infile, outfile):
    '''
    load counts of KO associations
    '''
    P.load(infile, outfile, "--header=pathway,p_annotated_reads")
def loadFastqcSummary(infile, outfile):
    P.load(infile, outfile, options="--index=track")
def loadCountContributingReads(infile, outfile):
    '''
    load contributing read counts
    '''
    P.load(infile, outfile)
def loadCountKeggGenes(infile, outfile):
    '''
    load counts of KEGG genes (KO)
    '''
    P.load(infile, outfile, "--header=KO,p_annotated_reads")
def loadMetaphlanRelativeAbundances(infile, outfile):
    '''
    load the metaphlan relative abundances
    '''
    P.load(infile, outfile)
def loadKeggTable(infile, outfile):
    '''
    load KEGG table
    '''
    P.load(infile, outfile)
def countReads(infile, outfile):
    '''count number of reads in input files.'''
    to_cluster = True
    m = PipelineMapping.Counter()
    statement = m.build((infile, ), outfile)
    P.run()
def loadCountLcaTaxa(infile, outfile):
    '''
    load taxa level counts
    '''
    P.load(infile, outfile)
def convertToGTF( infile, outfile ):
    '''convert bed to gtf'''
    statement = """gunzip < %(infile)s 
                   | python %(scriptsdir)s/bed2gff.py --as-gtf  --log=%(outfile)s.log 
                   > %(outfile)s """
    P.run()
def loadMetaphlanReadmaps(infile, outfile):
    '''
    load the metaphlan read maps
    '''
    P.load(infile, outfile)
def convertStrandedTranscriptsToBed( infile, outfile ):
    '''Convert GTF to compressed BED file'''
    track = P.snip( os.path.basename(infile), ".gtf" )
    statement = '''cat %(infile)s | python %(scriptsdir)s/gff2bed.py --is-gtf --log=%(outfile)s.log | sort -k1,1 -k2,2n | gzip > %(outfile)s'''
    P.run()
import glob
import CGAT.Metaphlan as Metaphlan
import CGATPipelines.PipelineMapping as PipelineMapping
import CGATPipelines.PipelineMappingQC as PipelineMappingQC
import pysam
import CGAT.Fastq as Fastq

###################################################
###################################################
###################################################
# Pipeline configuration
###################################################

# load options from the config file
import CGAT.Pipeline as P

P.getParameters(["pipeline.ini"])

PARAMS = P.PARAMS

###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
###################################################################
import PipelineTracks

# collect fastq.gz tracks
TRACKS = PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
    glob.glob( "*.fastq.gz" ), "(\S+).fastq.gz" ) +\
    PipelineTracks.Tracks(PipelineTracks.Sample3).loadFromDirectory(
        glob.glob("*.fastq.1.gz"), "(\S+).fastq.1.gz")
def convertGffToGtf( infile, outfile ):
    '''Convert GFF to GTF'''
    track = P.snip( os.path.basename(infile), ".gff" )
    statement = '''cat %(infile)s | awk 'OFS="\\t" {print $1,$2,$3,$4,$5,$6,$7,$8,"transcript_id \\""$9"\\"; gene_id \\""$9"\\";"}' > %(outfile)s'''
    P.run()
def renameTranscripts( infile, outfile ):
    '''systematically rename transcripts '''
    statement = '''cat %(infile)s | awk 'OFS="\\t" {print $1,$2,$3,$4,$5,$6,$7,$8,"transcript_id \\"rnaseq_es_novel_transcript_"NR"\\"; gene_id \\"rnaseq_es_novel_gene_"NR"\\"; "}' > %(outfile)s;'''
    P.run()
def buildPolyphenInput(infiles, outfile):
    '''build polyphen input file.

    SNPs across all species are aggregated into a single
    file to avoid multiple submissions for the same variant.

    Mapping to Uniprot ids was not successful - 40% of the
    SNPs would have been lost. Hence I map to ensembl protein
    identifiers. Note that the sequence file is then to be 
    submitted to POLYPHEN as well.

    Note that this method outputs 1-based coordinates for polyphen,
    while the coordinates in the .map file are still 0-based.

    SNPs are assigned a snp_id and a locus_id. The snp_id refers
    to the SNP within a peptide sequence while the locus_id refers
    to the genomic location. If there are alternative
    transcripts overlapping a SNP, the same SNP will get two
    snp_ids, but the same locus_id. As the peptide background might
    be different for the same SNP depending on the transcript,
    its effect needs to be predicted twice.
    '''

    statement = '''SELECT
        transcript_id,
        cds_start,
        cds_end,
        orig_codons,
        variant_codons,
        orig_na,
        variant_na,
        contig,
        snp_position
    FROM %(table)s_cds
    WHERE variant_code = '=' AND code = 'N'
    '''

    dbhandle = connect()
    cc = dbhandle.cursor()

    infiles.sort()

    # ensembl mapping
    map_transcript2id = dict(
        cc.execute(
            "SELECT transcript_id, protein_id FROM annotations.transcript_info WHERE protein_id IS NOT NULL"
        ).fetchall())

    total_counts = E.Counter()
    notfound, found = set(), set()

    outf_map = open(outfile + ".map", "w")
    outf_map.write(
        "snp_id\ttrack\ttranscript_id\tprotein_id\tprotein_pos\tlocus_id\tcontig\tpos\tphase\n"
    )

    outf = open(outfile, "w")

    snps = {}
    locus_ids = {}

    for infile in infiles:

        table = P.toTable(infile)
        track = table[:-len("_effects")]
        print statement % locals()
        cc.execute(statement % locals())

        counts = E.Counter()

        snp_id = 0
        for transcript_id, cds_start, cds_end, orig_codons, variant_codons, orig_na, variant_na, contig, pos in cc:

            counts.input += 1

            if transcript_id not in map_transcript2id:
                notfound.add(transcript_id)
                counts.not_found += 1
                continue

            if "," in variant_codons:
                counts.heterozygous += 1
                continue

            for phase in range(0, 3):
                if orig_na[phase].lower() != variant_na[phase].lower():
                    break

            pid = map_transcript2id[transcript_id]
            # one-based coordinates
            peptide_pos = int(math.floor(cds_start / 3.0)) + 1
            key = "%s-%i-%s" % (pid, peptide_pos, variant_codons)

            if key in snps:
                snp_id = snps[key]
            else:
                snp_id = len(snps)
                snps[key] = snp_id
                outf.write("snp%010i\t%s\t%i\t%s\t%s\n" % (
                    snp_id,
                    pid,
                    peptide_pos,
                    orig_codons,
                    variant_codons,
                ))
                counts.output += 1

            locus_key = "%s-%i-%s" % (contig, pos, variant_codons)
            if locus_key not in locus_ids:
                locus_ids[locus_key] = len(locus_ids)

            # use 0-based coordinates throughout, including peptide pos
            outf_map.write("snp%010i\t%s\t%s\t%s\t%i\tloc%010i\t%s\t%i\t%i\n" %
                           (snp_id, track, transcript_id, pid, peptide_pos - 1,
                            locus_ids[locus_key], contig, pos, phase))

            found.add(transcript_id)

        total_counts += counts

        E.info("%s: %s" % (table, str(counts)))

    outf.close()
    outf_map.close()

    E.info("%s: transcripts: %s found, %i not found" %
           (table, len(found), len(notfound)))

    E.info("total=%s, snp_ids=%i, locus_ids=%i" %
           (str(total_counts), len(snps), len(locus_ids)))
    if notfound:
        E.warn(
            "%i transcripts had SNPs that were ignored because there was no ensembl protein id"
            % len(notfound))
        E.warn("notfound: %s" % ",".join(notfound))

    statement = '''sort -k2,2 -k3,3n %(outfile)s > %(outfile)s.tmp; mv %(outfile)s.tmp %(outfile)s'''

    P.run()
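# A small sketch of the snp_id / locus_id keying described in the
# buildPolyphenInput docstring: the same genomic variant seen through two
# overlapping transcripts yields two snp_ids (the peptide context differs) but
# a single locus_id (same genomic position). Identifiers below are made up.
def register_snp(snps, locus_ids, protein_id, peptide_pos, contig, pos,
                 variant_codons):
    snp_key = "%s-%i-%s" % (protein_id, peptide_pos, variant_codons)
    locus_key = "%s-%i-%s" % (contig, pos, variant_codons)
    snp_id = snps.setdefault(snp_key, len(snps))
    locus_id = locus_ids.setdefault(locus_key, len(locus_ids))
    return snp_id, locus_id

# snps, locus_ids = {}, {}
# register_snp(snps, locus_ids, "ENSP01", 10, "chr1", 5000, "GAT")  # (0, 0)
# register_snp(snps, locus_ids, "ENSP02", 42, "chr1", 5000, "GAT")  # (1, 0)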
def getGtfStrandedTranscripts( infile, outfile ):
    '''join exons to get transcripts from GTF file'''
    track = P.snip( os.path.basename(infile), ".gtf" )
    statement = '''cat %(infile)s | python %(scriptsdir)s/gtf2gtf.py --join-exons --log=%(outfile)s.log | sort -k1,1 -k4,4n > %(outfile)s'''
    P.run()
import os
import CGAT.Stats as Stats
import pysam

# only update R if called as pipeline
# otherwise - failure with sphinx
from rpy2.robjects import r as R
import rpy2.robjects as ro
import rpy2.robjects.numpy2ri

###################################################################
###################################################################
###################################################################
# Pipeline configuration
import CGAT.Pipeline as P
P.getParameters([
    "%s/pipeline.ini" % os.path.splitext(__file__)[0], "../pipeline.ini",
    "pipeline.ini"
])

PARAMS = P.PARAMS
PARAMS_ANNOTATIONS = P.peekParameters(PARAMS["annotations_dir"],
                                      "pipeline_annotations.py")

SEPARATOR = "|"

###################################################################
###################################################################
###################################################################
# Helper functions mapping tracks to conditions, etc
import CGATPipelines.PipelineTracks as PipelineTracks

def analysePolyphen(infile, outfile):
    '''compute enrichment of SNPs within genes
    and deleterious SNPs within SNPs within genes.

    del: enrichment of deleterious snps within snps per gene
    len: enrichment of snps within genes
    com: enrichment of deleterious snps within gene
    '''

    table = P.toTable(infile)
    tablename_map = "polyphen_map"

    dbhandle = connect()
    cc = dbhandle.cursor()

    statement = '''
        SELECT i.gene_id,
               COUNT(DISTINCT map.locus_id) as nsnps, 
               COUNT(DISTINCT case t.prediction when 'possiblydamaging' then map.locus_id when 'probablydamaging' then map.locus_id else NULL end) AS ndeleterious,
               MAX(s.length)
               FROM %(table)s as t, 
                    %(tablename_map)s as map, 
                    annotations.protein_stats as s,
                    annotations.transcript_info as i 
        WHERE map.snp_id = t.snp_id AND 
              i.transcript_id = map.transcript_id AND
              s.protein_id = map.protein_id
        GROUP BY i.gene_id
     ''' % locals()

    data = cc.execute(statement).fetchall()

    statement = '''SELECT DISTINCT i.gene_id, MAX(s.length) 
                   FROM annotations.transcript_info AS i, annotations.protein_stats AS s 
                   WHERE s.protein_id = i.protein_id 
                   GROUP BY i.gene_id'''
    gene_ids = cc.execute(statement).fetchall()

    total_nsnps = sum([x[1] for x in data])
    total_ndel = sum([x[2] for x in data])
    total_length = sum([x[1] for x in gene_ids])
    del_p = float(total_ndel) / total_nsnps
    len_p = float(total_nsnps) / total_length
    com_p = float(total_ndel) / total_length

    E.info("del: background probability: %i/%i = %f" %
           (total_ndel, total_nsnps, del_p))
    E.info("len: background probability: %i/%i = %f" %
           (total_nsnps, total_length, len_p))
    E.info("com: background probability: %i/%i = %f" %
           (total_ndel, total_length, com_p))

    outf = open(outfile, "w")
    outf.write("\t".join((
        "gene_id",
        "code",
        "length",
        "nsnps",
        "ndel",
        "del_p",
        "del_pvalue",
        "del_qvalue",
        "len_p",
        "len_pvalue",
        "len_qvalue",
        "com_p",
        "com_pvalue",
        "com_qvalue",
    )) + "\n")

    del_pvalues, len_pvalues, com_pvalues = [], [], []
    for gene_id, nsnps, ndel, length in data:

        # use -1, because I need P( x >= X)
        # sf = 1 - cdf and cdf = P( x <= X ), thus sf = 1 - P( x <= X ) = P (x
        # > X ).
        del_pvalues.append(scipy.stats.binom.sf(ndel - 1, nsnps, del_p))
        len_pvalues.append(
            scipy.stats.binom.sf(nsnps - 1, int(round(length)), len_p))
        com_pvalues.append(
            scipy.stats.binom.sf(ndel - 1, int(round(length)), com_p))

    if len(del_pvalues) > 10:
        del_qvalues = Stats.doFDR(del_pvalues).mQValues
    else:
        E.warn("no FDR computed for del")
        del_qvalues = del_pvalues

    if len(len_pvalues) > 10:
        len_qvalues = Stats.doFDR(len_pvalues).mQValues
    else:
        E.warn("no FDR computed for len")
        len_qvalues = len_pvalues

    if len(com_pvalues) > 10:
        com_qvalues = Stats.doFDR(com_pvalues).mQValues
    else:
        E.warn("no FDR computed for com")
        com_qvalues = com_pvalues

    fdr = PARAMS["polyphen_fdr"]

    found = set()

    for a, del_pvalue, del_qvalue, len_pvalue, len_qvalue, com_pvalue, com_qvalue in \
            zip(data,
                del_pvalues, del_qvalues,
                len_pvalues, len_qvalues,
                com_pvalues, com_qvalues,
                ):
        gene_id, nsnps, ndel, length = a
        found.add(gene_id)

        del_p = float(ndel) / nsnps
        len_p = float(nsnps) / length

        code = "".join(
            [str(int(x < fdr)) for x in (del_qvalue, len_qvalue, com_qvalue)])

        outf.write("\t".join((
            gene_id,
            code,
            "%i" % int(round(length)),
            "%i" % int(nsnps),
            "%i" % int(ndel),
            "%6.4f" % del_p,
            "%6.4g" % del_pvalue,
            "%6.4g" % del_qvalue,
            "%6.4f" % len_p,
            "%6.4g" % len_pvalue,
            "%6.4g" % len_qvalue,
            "%6.4f" % com_p,
            "%6.4g" % com_pvalue,
            "%6.4g" % com_qvalue,
        )) + "\n")

    # add missing genes:
    code = "---"
    for gene_id, length in gene_ids:
        if gene_id in found:
            continue
        outf.write("\t".join((
            gene_id,
            code,
            "%i" % int(round(length)),
            "%i" % 0,
            "%i" % 0,
            "%6.4f" % 0,
            "%6.4g" % 1,
            "%6.4g" % 1,
            "%6.4f" % 0,
            "%6.4g" % 1,
            "%6.4g" % 1,
            "%6.4f" % 0,
            "%6.4g" % 1,
            "%6.4g" % 1,
        )) + "\n")

    outf.close()
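# The "- 1" in the scipy.stats.binom.sf calls above gives an upper tail that
# includes the observed count: sf(k) = 1 - cdf(k) = P(X > k), and therefore
# sf(k - 1) = P(X >= k). A quick numeric check of that identity:
import scipy.stats

def binom_upper_tail(k, n, p):
    '''P(X >= k) computed by summing the probability mass function.'''
    return sum(scipy.stats.binom.pmf(x, n, p) for x in range(k, n + 1))

# scipy.stats.binom.sf(3 - 1, 10, 0.2) and binom_upper_tail(3, 10, 0.2)
# both evaluate to ~0.3222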
def filterAndMergeGTF(infile, outfile, remove_genes, merge=False):
    '''filter gtf file infile with gene ids in remove_genes
    and write to outfile.

    If *merge* is set, the resultant transcript models are merged by overlap.

    A summary file "<outfile>.summary.tsv.gz" contains the number of transcripts
    that failed various filters.

    A file "<outfile>.removed.tsv.gz" contains the filters that a transcript failed.
    '''

    counter = E.Counter()

    # write summary table
    outf = IOTools.openFile(outfile + ".removed.tsv.gz", "w")
    outf.write("gene_id\tnoverlap\tsection\n")
    for gene_id, r in remove_genes.iteritems():
        for s in r:
            counter[s] += 1
        outf.write("%s\t%i\t%s\n" % (gene_id, len(r), ",".join(r)))
    outf.close()

    # filter gtf file
    tmpfile = P.getTempFile(".")
    inf = GTF.iterator(IOTools.openFile(infile))

    genes_input, genes_output = set(), set()

    for gtf in inf:
        genes_input.add(gtf.gene_id)
        if gtf.gene_id in remove_genes: continue
        genes_output.add(gtf.gene_id)
        tmpfile.write("%s\n" % str(gtf))

    tmpfile.close()
    tmpfilename = tmpfile.name

    outf = IOTools.openFile(outfile + ".summary.tsv.gz", "w")
    outf.write("category\ttranscripts\n")
    for x, y in counter.iteritems():
        outf.write("%s\t%i\n" % (x, y))
    outf.write("input\t%i\n" % len(genes_input))
    outf.write("output\t%i\n" % len(genes_output))
    outf.write("removed\t%i\n" % (len(genes_input) - len(genes_output)))

    outf.close()

    # close-by exons need to be merged, otherwise
    # cuffdiff fails for those on "." strand

    if merge:
        statement = '''
        %(scriptsdir)s/gff_sort pos < %(tmpfilename)s
        | python %(scriptsdir)s/gtf2gtf.py
            --unset-genes="NONC%%06i"
            --log=%(outfile)s.log
        | python %(scriptsdir)s/gtf2gtf.py
            --merge-genes
            --log=%(outfile)s.log
        | python %(scriptsdir)s/gtf2gtf.py
            --merge-exons
            --merge-exons-distance=5
            --log=%(outfile)s.log
        | python %(scriptsdir)s/gtf2gtf.py
            --renumber-genes="NONC%%06i"
            --log=%(outfile)s.log
        | python %(scriptsdir)s/gtf2gtf.py
            --renumber-transcripts="NONC%%06i"
            --log=%(outfile)s.log
        | %(scriptsdir)s/gff_sort genepos 
        | gzip > %(outfile)s
        '''
    else:
        statement = '''
        %(scriptsdir)s/gff_sort pos < %(tmpfilename)s
        | gzip > %(outfile)s
        '''

    P.run()

    os.unlink(tmpfilename)
def buildSharedSNPMatrix(infiles, outfiles):
    '''build matrix of shared coding nonsynonymous SNPs.

    Counts are per locus id.

    Percent identities are only within coding segregating loci
    and thus do not reflect the real divergence.

    '''

    dbhandle = connect()
    cc = dbhandle.cursor()

    segregating_sites = cc.execute(
        'SELECT COUNT( DISTINCT locus_id) FROM polyphen_map').fetchone()[0]

    statement = '''SELECT DISTINCT locus_id, track FROM polyphen_map ORDER BY locus_id'''
    cc.execute(statement)

    matrix = collections.defaultdict(int)
    for k, vals in itertools.groupby(cc, key=lambda x: x[0]):
        tracks = [x[1] for x in list(vals)]
        for t1 in tracks:
            matrix[(t1, t1)] += 1
        if len(tracks) > 1:
            for t1, t2 in itertools.combinations(tracks, 2):
                matrix[(t1, t2)] += 1
                matrix[(t2, t1)] += 1

    all_tracks = set([x[0]
                      for x in matrix.keys()] + [x[1] for x in matrix.keys()])

    # output matrix with shared SNPs.
    outf = open(outfiles[0], "w")
    outf.write("track\t%s\n" % "\t".join(all_tracks))
    for track1 in all_tracks:
        outf.write("%s" % track1)
        for track2 in all_tracks:
            outf.write("\t%i" % matrix[(track1, track2)])
        outf.write("\n")
    outf.close()

    # output matrix with shared segregating sites as
    # distance matrix
    outf = open(outfiles[1], "w")
    outf.write("track\t%s\n" % "\t".join(all_tracks))
    for track1 in all_tracks:
        outf.write("%s" % track1)
        for track2 in all_tracks:
            if track1 == track2:
                outf.write("\t%i" % 0)
            else:
                outf.write("\t%i" %
                           (segregating_sites - matrix[(track1, track2)]))
        outf.write("\n")
    outf.close()

    # output matrix as percent identity matrix
    # percent identity is given as
    # segregating sites - sites where strains differ
    #   = segregating_sites - (matrix[i,i] + matrix[j,j] - 2 * matrix[i,j])
    # which simplifies to:
    #   segsites - matrix[i,i] - matrix[j,j] + 2 * matrix[i,j]
    # divided by the total number of segregating sites
    outf = open(outfiles[2], "w")
    outf.write("track\t%s\n" % "\t".join(all_tracks))
    pids = {}
    for track1 in all_tracks:
        outf.write("%s" % track1)
        for track2 in all_tracks:
            a = segregating_sites - \
                (matrix[(track1, track1)] + matrix[(track2, track2)] -
                 2 * matrix[(track1, track2)])
            pid = 100.0 * a / segregating_sites
            outf.write("\t%6.4f" % pid)
            pids[(track1, track2)] = pid
        outf.write("\n")
    outf.close()

    # distance matrix
    outf = open(outfiles[3], "w")
    outf.write("track\t%s\n" % "\t".join(all_tracks))
    for track1 in all_tracks:
        outf.write("%s" % track1)
        for track2 in all_tracks:
            val = 100.0 - pids[(track1, track2)]
            outf.write("\t%6.4f" % val)
        outf.write("\n")
    outf.close()

    outfile_distance, outfile_tree = outfiles[3], outfiles[4]

    # build tree
    statement = '''python %(scriptsdir)s/matrix2matrix.py
       --output-format=phylip
    < %(outfile_distance)s
    | python %(scriptsdir)s/matrix2tree.py
       --method=nj
    > %(outfile_tree)s
    '''
    P.run()
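# A worked example of the percent-identity conversion used in
# buildSharedSNPMatrix, with made-up counts: matrix[i,i] is the number of
# segregating loci at which strain i carries a SNP, matrix[i,j] the number
# shared between strains i and j.
def percent_identity(segregating_sites, m_ii, m_jj, m_ij):
    # sites at which the two strains differ
    differing = m_ii + m_jj - 2 * m_ij
    return 100.0 * (segregating_sites - differing) / segregating_sites

# percent_identity(1000, 400, 350, 150) -> 55.0, i.e. a distance of 45.0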