Code example #1
File: pairsdb.py  Project: AndreasHeger/adda
def buildPFAMDomains( infiles, outfile ):
    '''map PFAM domains onto current sequence collection. 
    The mapping is done by ID lookup.'''
    
    infile = infiles[0]
    with IOTools.openFile( "nrdb50.fasta.tsv") as inf:

        reader = csv.DictReader( inf, dialect='excel-tab' )
        map_id2nid = {}
        for row in reader:
            map_id2nid[row['repid']] = row['nid']
    
    rx = re.compile( "(\S+)\/(\d+)-(\d+)\s+(\S+);(.*);" )

    c = E.Counter()
    outf = IOTools.openFile( outfile, "w" )
    with IOTools.openFile( infile ) as inf:
        for entry in FastaIterator.iterate( inf ):
            c.input += 1
            pid, start, end, pfam_id, description = rx.match( entry.title ).groups()
            try:
                outf.write( "%s\t%i\t%i\t%s\n" % (map_id2nid[pid], int(start)-1, int(end), pfam_id ) )
            except KeyError:
                c.missed += 1
                continue
            c.output += 1

    outf.close()
    E.info( c )
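For reference, the regex above expects Pfam-style FASTA titles of the form "<id>/<start>-<end> <pfam_id>;<description>;". A quick check with an invented title (the accession and description are made up for illustration):

import re

rx = re.compile(r"(\S+)/(\d+)-(\d+)\s+(\S+);(.*);")
m = rx.match("Q9XYZ1_HUMAN/23-120 PF00069;Protein kinase domain;")
print(m.groups())
# -> ('Q9XYZ1_HUMAN', '23', '120', 'PF00069', 'Protein kinase domain')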
Code example #2
File: CPC.py  Project: pombredanne/cgat
    def __call__(self, track, slice = None):
        
        c_transcript = []
        c_gene = []
        for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            c_transcript.append(len(transcript))
        for gene in GTF.flat_gene_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
            c_gene.append(len(gene))

        return odict( ( ("transcript", np.mean(c_transcript)), ("gene",np.mean(c_gene) )) )
Code example #3
File: LncRNACounts.py  Project: pombredanne/cgat
    def __call__(self, track, slice = None):

        if slice == "transcript":
            lengths_transcripts = []
            for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
                length = sum([gtf.end - gtf.start for gtf in transcript])
                lengths_transcripts.append(length)
            return np.mean(lengths_transcripts)

        elif slice == "gene":
            lengths_genes = []
            for gene in GTF.flat_gene_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
                length = sum([gtf.end - gtf.start for gtf in gene])
                lengths_genes.append(length)
            return np.mean(lengths_genes)
Code example #4
File: CPC.py  Project: pombredanne/cgat
    def __call__(self, track, slice = None):

        classes = ["antisense"
              , "antisense_upstream"
              , "antisense_downstream"
              , "sense_upstream"
              , "sense_downstream"
              , "intergenic" 
              , "sense_intronic" 
              , "antisense_intronic"]

        coding_set = {}
        for gtf in GTF.iterator(IOTools.openFile("gtfs/lncrna_filtered.class.gtf.gz")):
            coding_set[gtf.transcript_id] = gtf.source

        result = {"noncoding": {}, "coding":collections.defaultdict(int)}
        total_nc = float(self.getValue("SELECT COUNT(*) FROM %(track)s_cpc_result WHERE C_NC = 'noncoding'"))
        for c in classes:
            result["noncoding"][c] = (float(self.getValue("""SELECT COUNT(*) FROM lncrna_final_class as a, %s_cpc_result as b WHERE a.class = '%s' 
                                                              AND b.C_NC = 'noncoding' 
                                                              AND a.transcript_id = b.transcript_id""" % (track,c)))/total_nc)*100

        
        total_c = len(coding_set)
        # the query does not depend on the class, so run it once
        ids = self.getValues("SELECT transcript_id FROM %(track)s_cpc_result WHERE C_NC = 'coding'")
        for c in classes:
            for i in ids:
                if coding_set.get(i) == c:
                    result["coding"][c] += 1
            
        for x, y in result["coding"].iteritems():
            result["coding"][x] = (float(y)/total_c)*100
            
        return result
Code example #5
File: metapipeline_medip.py  Project: yangjl/cgat
def buildSummaryCpGCoverage( infiles, outfile ):
    '''build summary of CpG coverage across tracks.'''
    
    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile( outfile, "w" )
    outf.write("metatrack\ttrack\tcoverage\tncovered\tpcovered\n" )

    for track in TRACKS:

        tables = [x[0] for x in cc.execute( """SELECT name FROM medip_%s.sqlite_master 
            WHERE type='table' and name LIKE '%%coveredpos%%' """ % track
                                            ).fetchall()]
        

        for table in tables:
            
            statement = """SELECT '%(track)s' as metatrack,
                         '%(table)s' as track,
                         coverage, ncovered, pcovered FROM medip_%(track)s.%(table)s"""

            for x in cc.execute(statement % locals()):
                outf.write( "\t".join(map(str,x))+ "\n" )

    outf.close()
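connect() is not shown here. Judging from the medip_%(track)s.%(table)s qualifiers, it presumably returns an sqlite3 connection with one database attached per track; a minimal sketch, assuming per-track database files named medip_<track>.sqlite (the filenames are guesses):

import sqlite3

def connect():
    '''sketch: open the main database and ATTACH one database per track,
    so that tables can be addressed as medip_<track>.<table>.'''
    dbh = sqlite3.connect("csvdb")
    for track in TRACKS:
        dbh.execute("ATTACH DATABASE 'medip_%s.sqlite' AS medip_%s" % (track, track))
    return dbh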
Code example #6
File: metapipeline_medip.py  Project: yangjl/cgat
def buildSummaryCalledDMRs( infiles, outfile ):
    '''build summary of differentially methylated regions.'''
    
    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile( outfile, "w" )
    outf.write( "metatrack\ttest\tntested\tnok\tnsignificant\tn2fold\n" )

    for track in TRACKS:
        tables = [x[0] for x in cc.execute( """SELECT name FROM medip_%s.sqlite_master 
            WHERE type='table' and sql LIKE '%%control_mean%%' and sql LIKE '%%treatment_mean%%'""" % track
                                            ).fetchall()]

        for table in tables:

            statement = """SELECT 
                         COUNT(*) as ntested, 
                         SUM(CASE WHEN status='OK' THEN 1 ELSE 0 END) AS nok, 
                         SUM(CASE WHEN significant THEN 1 ELSE 0 END) AS nsignificant, 
                         SUM(CASE WHEN significant AND (l2fold < -1 OR l2fold > 1) THEN 1 ELSE 0 END) as n2fold 
                         FROM medip_%(track)s.%(table)s"""

            ntested, nok, nsignificant, n2fold = cc.execute( statement % locals() ).fetchone()

            outf.write( "\t".join( map(str, (track, table, ntested, nok, nsignificant, n2fold )))+ "\n" )

    outf.close()
Code example #7
File: metapipeline_medip.py  Project: yangjl/cgat
def buildSummaryMapping( infiles, outfile ):
    '''build summary of mapping statistics (bam_stats) across tracks.'''
    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile( outfile, "w" )
    
    table = "bam_stats"

    colnames = None
    for track in TRACKS:
        
        statement = """SELECT * 
                         FROM medip_%(track)s.%(table)s"""
        
        data = cc.execute( statement % locals() ).fetchall()
        _colnames = [x[0] for x in cc.description]
        if not colnames:
            colnames = _colnames
            outf.write( "\t".join( ["metatrack"] + colnames,) + "\n"  )

        assert colnames == _colnames

        for row in data:
            outf.write( "\t".join( map(str, (track,) + row))+ "\n" )

    outf.close()
Code example #8
File: metapipeline_medip.py  Project: jmadzo/cgat
def buildSummaryCpGCoverage(infiles, outfile):
    '''build summary of CpG coverage across tracks.'''

    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")
    outf.write("metatrack\ttrack\tcoverage\tncovered\tpcovered\n")

    for track in TRACKS:

        tables = [x[0] for x in cc.execute( """SELECT name FROM medip_%s.sqlite_master 
            WHERE type='table' and name LIKE '%%coveredpos%%' """ % track
                                            ).fetchall()]

        for table in tables:

            statement = """SELECT '%(track)s' as metatrack,
                         '%(table)s' as track,
                         coverage, ncovered, pcovered FROM medip_%(track)s.%(table)s"""

            for x in cc.execute(statement % locals()):
                outf.write("\t".join(map(str, x)) + "\n")

    outf.close()
Code example #9
File: metapipeline_medip.py  Project: jmadzo/cgat
def buildSummaryCalledDMRs(infiles, outfile):
    '''build summary of differentially methylated regions.'''

    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")
    outf.write("metatrack\ttest\tntested\tnok\tnsignificant\tn2fold\n")

    for track in TRACKS:
        tables = [x[0] for x in cc.execute( """SELECT name FROM medip_%s.sqlite_master 
            WHERE type='table' and sql LIKE '%%control_mean%%' and sql LIKE '%%treatment_mean%%'""" % track
                                            ).fetchall()]

        for table in tables:

            statement = """SELECT 
                         COUNT(*) as ntested, 
                         SUM(CASE WHEN status='OK' THEN 1 ELSE 0 END) AS nok, 
                         SUM(CASE WHEN significant THEN 1 ELSE 0 END) AS nsignificant, 
                         SUM(CASE WHEN significant AND (l2fold < -1 OR l2fold > 1) THEN 1 ELSE 0 END) as n2fold 
                         FROM medip_%(track)s.%(table)s"""

            ntested, nok, nsignificant, n2fold = cc.execute(
                statement % locals()).fetchone()

            outf.write(
                "\t".join(map(str, (track, table, ntested, nok, nsignificant, n2fold))) + "\n")

    outf.close()
Code example #10
File: metapipeline_medip.py  Project: jmadzo/cgat
def buildSummaryMapping(infiles, outfile):
    '''build summary of mapping statistics (bam_stats) across tracks.'''
    dbh = connect()
    cc = dbh.cursor()

    outf = IOTools.openFile(outfile, "w")

    table = "bam_stats"

    colnames = None
    for track in TRACKS:

        statement = """SELECT * 
                         FROM medip_%(track)s.%(table)s"""

        data = cc.execute(statement % locals()).fetchall()
        _colnames = [x[0] for x in cc.description]
        if not colnames:
            colnames = _colnames
            outf.write("\t".join(["metatrack"] + colnames,) + "\n")

        assert colnames == _colnames

        for row in data:
            outf.write("\t".join(map(str, (track,) + row)) + "\n")

    outf.close()
Code example #11
    def __call__(self, track, slice=None):

        fn = os.path.join(
            DATADIR,
            "replicated_intervals/%(track)s.peakshape.gz.matrix_%(slice)s.gz" %
            locals())
        if not os.path.exists(fn):
            return

        x = IOTools.openFile(fn)
        matrix, rownames, colnames = IOTools.readMatrix(x)

        nrows = len(rownames)
        if nrows == 0:
            return
        if nrows > self.scale:
            take = numpy.array(numpy.floor(
                numpy.arange(0, nrows,
                             float(nrows + 1) / self.scale)),
                               dtype=int)
            rownames = [rownames[x] for x in take]
            matrix = matrix[take]

        return odict(
            (('matrix', matrix), ('rows', rownames), ('columns', colnames)))
Code example #12
    def getReferenceLincRNA(self, reference_gtf):

        lincs = set()
        for entry in GTF.iterator(IOTools.openFile(reference_gtf)):
            if entry.source == "lincRNA":
                lincs.add(entry.gene_id)
        return len(lincs)
Code example #13
File: orthology.py  Project: nishantthakur/cgat
    def __call__(self, track, slice=None):
        fn = "ortholog_pairs_with_feature.matrix2"
        if not os.path.exists(fn):
            return

        x = IOTools.openFile(fn)
        matrix, rownames, colnames = IOTools.readMatrix(x)
        return odict((("matrix", matrix), ("rows", rownames), ("columns", colnames)))
Code example #14
    def getReferenceLincRNA(self, reference_gtf):

        lincs = set()
        for entry in GTF.iterator(IOTools.openFile(reference_gtf)):
            if entry.source == "lincRNA":
                lincs.add(entry.gene_id)
        return len(lincs)
Code example #15
    def __call__(self, track, slice=None):
        fn = "ortholog_pairs_with_feature.matrix2"
        if not os.path.exists(fn):
            return

        x = IOTools.openFile(fn)
        matrix, rownames, colnames = IOTools.readMatrix(x)
        return odict(
            (('matrix', matrix), ('rows', rownames), ('columns', colnames)))
Code example #16
File: LncRNACounts.py  Project: pombredanne/cgat
    def __call__(self, track, slice = None):

        transcript_counts = collections.defaultdict( set )
        counts = []
        for gtf in GTF.iterator(IOTools.openFile(self.getFilename(track))):
            transcript_counts[gtf.gene_id].add(gtf.transcript_id)
        for gene, transcripts in transcript_counts.iteritems():
            counts.append(len(transcripts))
        return counts
Code example #17
File: pairsdb.py  Project: AndreasHeger/adda
def checkBlastRuns( infiles, outfile ):
    '''check if output files are complete.
    '''
    
    outf = IOTools.openFile( outfile, "w" )

    outf.write( "chunkid\tquery_first\tquery_last\tfound_first\tfound_last\tfound_total\tfound_results\thas_finished\tattempts\t%s\n" %\
                    "\t".join(Logfile.RuntimeInformation._fields))

    for infile in infiles:
        E.debug( "processing %s" % infile)
        chunkid = P.snip( os.path.basename( infile ), ".blast.gz" )
        logfile = infile + ".log"
        chunkfile = P.snip( infile, ".blast.gz" ) + ".fasta"

        with IOTools.openFile( infile ) as inf:
            l = inf.readline()
            ids = set()
            total_results = 0
            for l in inf:
                if l.startswith("#//"): continue
                ids.add( int(l.split("\t")[0] ) )
                total_results += 1
            found_first = min(ids)
            found_last = max(ids)
            found_total = len(ids)

        l = IOTools.getFirstLine( chunkfile )
        query_first = l[1:-1]
        l2 = IOTools.getLastLine( chunkfile, nlines = 2).split("\n")
        query_last = l2[0][1:]

        logresults = Logfile.parse( logfile )
        
        outf.write( "\t".join( map(str, (\
                        chunkid, query_first, query_last,
                        found_first, found_last,
                        found_total, total_results,
                        logresults[-1].has_finished,
                        len(logresults),
                        "\t".join( map(str, logresults[-1]) ) ) ) ) + "\n" )
        
    outf.close()
Code example #18
File: LncRNACounts.py  Project: pombredanne/cgat
    def __call__(self, track, slice = None):
        
        if slice == "transcript":
            lengths_transcripts = []
            for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
                length = sum([gtf.end - gtf.start for gtf in transcript])
                lengths_transcripts.append(length)
            counts, lower, dx, _ = scipy.stats.cumfreq(lengths_transcripts, numbins=40, defaultreallimits=(0,20000))
            x = np.arange(counts.size) * dx + lower
            return odict( (("length", x), ("cumulative frequency", counts/len(lengths_transcripts))) )

        
        elif slice == "gene":
            lengths_genes = []
            for gene in GTF.flat_gene_iterator(GTF.iterator(IOTools.openFile(self.getFilename(track)))):
                length = sum([gtf.end - gtf.start for gtf in gene])
                lengths_genes.append(length)
            counts, lower, dx, _ = scipy.stats.cumfreq(lengths_genes, numbins=40, defaultreallimits=(0,20000))
            x = np.arange(counts.size) * dx + lower
            return odict( (("length", x), ("cumulative frequency", counts/len(lengths_genes))) )
Code example #19
File: LncRNACounts.py  Project: pombredanne/cgat
    def __call__(self, track, slice = None):

        transcript_counts = collections.defaultdict( set )
        counts = []
        for gtf in GTF.iterator(IOTools.openFile(self.getFilename(track))):
            transcript_counts[gtf.gene_id].add(gtf.transcript_id)
        for gene, transcripts in transcript_counts.iteritems():
            counts.append(len(transcripts))
        count, lower, dx, _ = scipy.stats.cumfreq(counts, numbins=40, defaultreallimits=(1,15))
        x = np.arange(count.size) * dx + lower
        return odict( (("transcript number", x), ("cumulative frequency", count/len(counts))) )
Code example #20
File: pairsdb.py  Project: AndreasHeger/adda
def buildNrdb50( infile, outfile ):
    '''build nrdb50
    
    Renumber sequences.'''
    
    outf_fasta = IOTools.openFile( outfile, "w" )
    outf_table = IOTools.openFile( outfile + ".tsv", "w" )
    outf_table.write("nid\tpid\thid\tdescription\tcluster_size\ttaxon\trepid\n" )

    rx = re.compile( "(\S+) (.*) n=(\d+) Tax=(.*) RepID=(\S+)" )

    nid = 1
    for entry in FastaIterator.iterate( IOTools.openFile( infile )):
        outf_fasta.write(">%i\n%s\n" % (nid, entry.sequence ) )
        cluster_name, description, cluster_size, taxon, repid = rx.match( entry.title ).groups()
        hid = computeHID( entry.sequence )
        outf_table.write( "\t".join( (str(nid), cluster_name, hid, description, cluster_size, taxon, repid)) + "\n" )
        nid += 1

    outf_fasta.close()
    outf_table.close()
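computeHID() is defined elsewhere in pairsdb. A minimal sketch of one plausible implementation, assuming the "hid" is a hash identifier derived from the sequence (here an MD5 digest, base64-encoded without padding; the real code may differ):

import base64
import hashlib

def computeHID(sequence):
    '''sketch: compute a hash identifier (hid) for a protein sequence.'''
    digest = hashlib.md5(sequence.upper().encode("ascii")).digest()
    return base64.b64encode(digest).decode("ascii").rstrip("=")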
Code example #21
File: Bed.py  Project: BioinformaticsArchive/cgat
def getNumColumns( filename ):
    '''return number of fields in bed-file by looking at the first 
    entry.
    
    Returns 0 if file is empty.
    '''
    with IOTools.openFile( filename ) as inf:
        for line in inf:
            if line.startswith("#"): continue
            if line.startswith("track"): continue
            return len(line[:-1].split("\t"))
    return 0
Code example #22
File: Bed.py  Project: yangjl/cgat
def getNumColumns(filename):
    '''return number of fields in bed-file by looking at the first 
    entry.
    
    Returns 0 if file is empty.
    '''
    with IOTools.openFile(filename) as inf:
        for line in inf:
            if line.startswith("#"): continue
            if line.startswith("track"): continue
            return len(line[:-1].split("\t"))
    return 0
Code example #23
File: pairsdb.py  Project: AndreasHeger/adda
def checkBlastRun( infiles, outfile ):
    '''build summary stats on file.'''

    pairsdbfile, seqfile = infiles
    
    nids = set()
    with IOTools.openFile( seqfile ) as inf:
        for r in FastaIterator.iterate( inf ):
            nids.add( int(r.title) )

    with IOTools.openFile( pairsdbfile ) as inf:
        query_ids, sbjct_ids = set(), set()
        total_results, self_links = 0, 0
        for l in inf:
            if l.startswith("#//"): continue
            query_id, sbjct_id = l.split("\t")[:2]
            query_ids.add( int(query_id) )
            sbjct_ids.add( int(sbjct_id) )
            if query_id == sbjct_id: self_links += 1
            total_results += 1

    outf = IOTools.openFile( outfile, "w" )
    outf.write( "category\tcounts\n")
    outf.write( "\t".join( map(str, ('nids', len(nids)))) + "\n" )
    outf.write( "\t".join( map(str, ('links', total_results))) + "\n" )
    outf.write( "\t".join( map(str, ('self', self_links))) + "\n" )
    outf.write( "\t".join( map(str, ('queries', len(query_ids)))) + "\n" )
    outf.write( "\t".join( map(str, ('sbjcts', len(sbjct_ids)))) + "\n" )
    outf.close()

    outf = IOTools.openFile( outfile + '.missing_queries.gz', 'w' )
    outf.write( 'nid\n' )
    outf.write( "\n".join( map(str, sorted( list( nids.difference( query_ids )) ) )) + "\n" )
    outf.close()

    outf = IOTools.openFile( outfile + '.missing_sbjcts.gz', 'w' )
    outf.write( 'nid\n' )
    outf.write( "\n".join( map(str, sorted( list( nids.difference( sbjct_ids )) ) )) + "\n" )
    outf.close()
Code example #24
def collectGenomeSizes(infile, outfile):
    '''
    output the genome sizes for each genome
    '''
    to_cluster = True
    outf = open(outfile, "w")
    outf.write("genome\tlength\n")
    # assume single fasta entry
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        name = P.snip(os.path.basename(infile), ".fna")
        length = len(fasta.sequence)
        outf.write("%s\t%s\n" % (name, str(length)))
    outf.close()
Code example #25
def collectGenomeSizes(infile, outfile):
    '''
    output the genome sizes for each genome
    '''
    to_cluster = True
    outf = open(outfile, "w")
    outf.write("genome\tlength\n")
    # assume single fasta entry
    for fasta in FastaIterator.iterate(IOTools.openFile(infile)):
        name = P.snip(os.path.basename(infile), ".fna")
        length = len(fasta.sequence)
        outf.write("%s\t%s\n" % (name, str(length)))
    outf.close()
Code example #26
File: pairsdb.py  Project: AndreasHeger/adda
def buildPFAMFamilies( infiles, outfile ):

    outf = IOTools.openFile( outfile, "w" )
    outf.write( "family\tshort\tdescription\n" )
    
    infile = infiles[1]
    family, description, short = None, None, None
    c = E.Counter()
    with IOTools.openFile( infile ) as inf:
        for line in inf:
            if line.startswith( "#=GF AC"):
                if family:
                    outf.write( "%s\n" % "\t".join( (family,description,short)))
                    c.output += 1
                family = re.match(r"#=GF AC\s+(\S+)", line[:-1]).groups()[0]
            elif line.startswith( "#=GF DE"):
                description = re.match(r"#=GF DE\s+(.+)",line[:-1]).groups()[0]
            elif line.startswith( "#=GF ID"):
                short = re.match(r"#=GF ID\s+(.+)",line[:-1]).groups()[0]
    outf.write( "%s\n" % "\t".join( (family,description,short)))
    c.outptut += 1
    outf.close()
    E.info(c)
Code example #27
def buildAlignmentSizes(infiles, outfile):
    '''
    use bed files to sum the total number of bases
    that are aligned to the genomes
    '''
    outf = open(outfile, "w")
    outf.write("genome\tsize\n")
    for infile in infiles:
        genome = P.snip(os.path.basename(infile), ".bed.gz")
        c = 0
        inf = IOTools.openFile(infile)
        for bed in Bed.iterator(inf):
            c += bed.end - bed.start
        outf.write("%s\t%s\n" % (genome, str(c)))
    outf.close()
Code example #28
def buildAlignmentSizes(infiles, outfile):
    '''
    use bed files to sum the total number of bases
    that are aligned to the genomes
    '''
    outf = open(outfile, "w")
    outf.write("genome\tsize\n")
    for infile in infiles:
        genome = P.snip(os.path.basename(infile), ".bed.gz")
        c = 0
        inf = IOTools.openFile(infile)
        for bed in Bed.iterator(inf):
            c += bed.end - bed.start
        outf.write("%s\t%s\n" % (genome, str(c)))
    outf.close()
Code example #29
def buildMatrixFromTables( infiles, column, column_header = 0, dtype = numpy.float, default = None ):
    '''build a matrix from a column called *column* in a series of input files.
   
    Row names are taken from the column *column_header* (by default the first column).

    The columns are given by order of the input files.

    returns matrix, row_headers
    '''
    
    lists = []
    for infile in infiles:
        data = pandas.read_table( IOTools.openFile(infile) )
        lists.append( zip( list( data[column_header] ), list(data[column]) ) )
        
    return buildMatrixFromLists( lists, dtype = dtype, default = default )
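buildMatrixFromLists() is not shown. A minimal sketch consistent with the call above, assuming it aligns rows by name across the per-file lists and fills gaps with *default* (numpy.nan when *default* is None):

import numpy

def buildMatrixFromLists(lists, dtype=float, default=None):
    '''sketch: build a matrix from one [(row_name, value), ...] list per column.'''
    if default is None:
        default = numpy.nan
    rownames = []
    for l in lists:
        for rowname, value in l:
            if rowname not in rownames:
                rownames.append(rowname)
    row_index = dict((r, i) for i, r in enumerate(rownames))
    matrix = numpy.full((len(rownames), len(lists)), default, dtype=dtype)
    for col, l in enumerate(lists):
        for rowname, value in l:
            matrix[row_index[rowname], col] = value
    return matrix, rownames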
Code example #30
def buildTrueTaxonomicRelativeAbundances(infile, outfile):
    '''
    get species-level relative abundances for the simulated
    data. This involves creating maps between different identifiers
    from the NCBI taxonomy, so that the results are comparable
    to the species-level analysis from metaphlan.
    gi_taxid_nucl is a huge table, so this function takes a long
    time to run; it could be optimised.
    '''
    to_cluster = True

    total = 0
    rel_abundance = collections.defaultdict(int)
    for fastq in Fastq.iterate(IOTools.openFile(infile)):
        total += 1
        gi = fastq.identifier.split("|")[1]
        rel_abundance[gi] += 1
    for gi, ab in rel_abundance.items():
        rel_abundance[gi] = float(ab) / total

    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()
    result = collections.defaultdict(float)
    for gi in list(rel_abundance.keys()):
        E.info("processing gi %s" % gi)
        taxid = cc.execute(
            """SELECT taxid FROM gi_taxid_nucl WHERE gi == '%s'""" %
            gi).fetchone()[0]
        species_id = cc.execute(
            """SELECT species_id FROM categories WHERE taxid == '%s'""" %
            taxid).fetchone()[0]
        species_name = cc.execute(
            """SELECT taxname FROM names WHERE taxid == '%s' AND description == 'scientific name'"""
            % species_id).fetchone()[0]
        abundance = rel_abundance[gi]
        E.info("mapped gi %s to taxid: %s, species_id: %s, species_name: %s" %
               (str(gi), str(taxid), str(species_id), species_name))
        result[species_name] += abundance

    outf = open(outfile, "w")
    outf.write("species_name\trelab\n")
    for species_name, abundance in result.items():
        # create names consistent with metaphlan
        species_name = species_name.replace(" ", "_")
        outf.write("%s\t%f\n" % (species_name, abundance))
    outf.close()
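The per-gi lookups above interpolate values directly into the SQL. With sqlite3 the same queries can use ? placeholders instead, which avoids quoting problems and lets SQLite reuse the prepared statement; a sketch of the first lookup:

taxid = cc.execute(
    "SELECT taxid FROM gi_taxid_nucl WHERE gi = ?", (gi,)).fetchone()[0]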
Code example #31
File: CPC.py  Project: pombredanne/cgat
    def __call__(self, track):

        length = {}
        for transcript in GTF.transcript_iterator(GTF.iterator(IOTools.openFile("gtfs/lncrna_filtered.gtf.gz"))):
            length[transcript[0].transcript_id] = sum([gtf.end - gtf.start for gtf in transcript])

        score = {}
        dbh = sqlite3.connect("csvdb")
        cc = dbh.cursor()
        for data in cc.execute("SELECT transcript_id, CP_score FROM lncrna_filtered_cpc_result"):
            score[data[0]] = data[1]

        result = {"length": [], "score": []}
        for transcript, value in length.iteritems():
            result["length"].append(np.log10(value))
            result["score"].append(score[transcript])
        return result
Code example #32
File: Intervals.py  Project: siping/cgat
    def __call__(self, track, slice = None):
        fn = os.path.join( DATADIR, "%(track)s.peakshape.tsv.gz.matrix_%(slice)s.gz" % locals() )
        if not os.path.exists( fn ): 
            return
        
        matrix, rownames, colnames = IOTools.readMatrix( IOTools.openFile( fn ))
        nrows = len(rownames)
        if nrows == 0: return

        if nrows > 1000:
            take = numpy.array( numpy.floor( numpy.arange( 0, nrows, nrows / 1000 ) ), dtype = int )
            rownames = [ rownames[x] for x in take ]
            matrix = matrix[ take ]
            
        return odict( (('matrix', matrix),
                       ('rows', rownames),
                       ('columns', colnames)) )
Code example #33
    def __call__(self, track, slice = None):
        pattern = self.pattern
        fn = os.path.join( DATADIR, "liver_vs_testes/%(track)s%(pattern)s.matrix_%(slice)s.gz" % locals() )
        if not os.path.exists( fn ):
            return

        x = IOTools.openFile( fn )
        matrix, rownames, colnames = IOTools.readMatrix( x )

        nrows = len(rownames)
        if nrows == 0: return
        if nrows > self.scale:
            take = numpy.array( numpy.floor( numpy.arange( 0, nrows, float(nrows + 1) / self.scale ) ), dtype = int )
            rownames = [ rownames[x] for x in take ]
            matrix = matrix[ take ]

        return odict( (('matrix', matrix),
                       ('rows', rownames),
                       ('columns', colnames)) )
Code example #34
File: MatrixTools.py  Project: yangjl/cgat
def buildMatrixFromTables(infiles,
                          column,
                          column_header=0,
                          dtype=numpy.float,
                          default=None):
    '''build a matrix from a column called *column* in a series of input files.
   
    Row names are taken from the column *column_header* (by default the first column).

    The columns are given by order of the input files.

    returns matrix, row_headers
    '''

    lists = []
    for infile in infiles:
        data = pandas.read_table(IOTools.openFile(infile))
        lists.append(zip(list(data[column_header]), list(data[column])))

    return buildMatrixFromLists(lists, dtype=dtype, default=default)
Code example #35
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = optparse.OptionParser( version = "%prog version: $Id: script_template.py 2871 2010-03-03 10:20:44Z andreas $", 
                                    usage = globals()["__doc__"] )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )

    coords_file=args[0]

    bamfile=pysam.Samfile( args[1], 'rb' )  # bamfile

    options.stdout.write( "gene_id\tcounts\tlength\n" )

    bed_iterator = Bed.iterator( IOTools.openFile( coords_file ) )
    for gene_id, exons in itertools.groupby( bed_iterator, lambda x: x.name ):

        num_reads=0
        
        anames=set([])
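        # collecting (qname, is_read1) pairs means a read that overlaps
        # several exons of the same gene is still counted only once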
        lgene = 0

        for bed in exons:
            lgene += bed.end - bed.start
            for alignedread in bamfile.fetch(bed.contig, bed.start, bed.end):
                anames.add((alignedread.qname, alignedread.is_read1))

        num_reads = len(anames)
        options.stdout.write( "\t".join( (gene_id,
                                          str(num_reads),
                                          str(lgene ) )) + "\n" )

    ## write footer and output benchmark information.
    E.Stop()
Code example #36
def buildTrueTaxonomicRelativeAbundances(infile, outfile):
    '''
    get species-level relative abundances for the simulated
    data. This involves creating maps between different identifiers
    from the NCBI taxonomy, so that the results are comparable
    to the species-level analysis from metaphlan.
    gi_taxid_nucl is a huge table, so this function takes a long
    time to run; it could be optimised.
    '''
    to_cluster = True

    total = 0
    rel_abundance = collections.defaultdict(int)
    for fastq in Fastq.iterate(IOTools.openFile(infile)):
        total += 1
        gi = fastq.identifier.split("|")[1]
        rel_abundance[gi] += 1
    for gi, ab in rel_abundance.iteritems():
        rel_abundance[gi] = float(ab)/total

    dbh = sqlite3.connect(PARAMS["database"])
    cc = dbh.cursor()
    result = collections.defaultdict(float)
    for gi in rel_abundance.keys():
        E.info("processing gi %s" % gi)
        taxid = cc.execute("""SELECT taxid FROM gi_taxid_nucl WHERE gi == '%s'""" % gi).fetchone()[0]
        species_id = cc.execute("""SELECT species_id FROM categories WHERE taxid == '%s'""" % taxid).fetchone()[0]
        species_name = cc.execute("""SELECT taxname FROM names WHERE taxid == '%s' AND description == 'scientific name'""" % species_id).fetchone()[0]
        abundance = rel_abundance[gi]
        E.info("mapped gi %s to taxid: %s, species_id: %s, species_name: %s" % (str(gi), str(taxid), str(species_id), species_name))
        result[species_name] += abundance

    outf = open(outfile, "w")
    outf.write("species_name\trelab\n")
    for species_name, abundance in result.iteritems():
        # create names consistent with metaphlan
        species_name = species_name.replace(" ", "_")
        outf.write("%s\t%f\n" % (species_name, abundance))
    outf.close()
Code example #37
def main(argv = None):

    parser = E.OptionParser(version = "%prog version: $Id: CBioPortal.py 2888 2012-06-07 15:52:00Z ians $", usage = globals()["__doc__"])

    parser.add_option("-o","--output_file", type="string", default = None,
                      help="[Optional] Filename to output results to. [default=STDOUT]" )
    parser.add_option("-u","--url",type="string",default="http://www.cbioportal.org/public-portal/webservice.do",
                      help="[Optional] Url to the cBioPortal webservice [default=%default]" )

    cqueryopts = optparse.OptionGroup(parser,"Common parameters","Common arguments to the query")
    cqueryopts.add_option("-s", "--study_id", dest="study_id", type="string", default = None,
                      help="[Required/OPtional]  cBioPortal ID for study [default=%default].\n This or study_name required for: getGeneticProfiles, getCaseLists, getProteinArrayInfo, getLink,getOncoprintHTML, getPercentAltered, getTotalAltered"  )
    cqueryopts.add_option("-n", "--study_name",dest = "study_name", type = "string", default = None,
                      help="[Required/Optional] cBioPortal Name for study [defualt=%default].\n See above for which commands require this.")
    cqueryopts.add_option("-c", "--case_set_id", dest="case_set_id", type="string", default = None,
                      help="[Required for some] cBioPortal case_set_id specifying the case list to use.\nRequired for getProfileData, getMutationData, getClincalData, getProteinArrayData, getPercentAltered, getTotalAltered. Default is case_set_id for case list 'All Tumours' ")
    cqueryopts.add_option("-g", "--gene_list", dest = "gene_list", type = "string", default = None,
                      help="[Required for some] Comma seperated list of HUGO gene symbols or Entrez gene IDs.\nRequired for getProfileData, getMutationData, getLink, getOncoprintHTML" )
    cqueryopts.add_option("-f","--gene_list_file", dest = "gene_list_file", type="string", default = None,
                          help="[Optional] Filename to read in gene_list from" )
    cqueryopts.add_option("-p", "--profile_id", dest = "profile_id", type = "string",
                      help="[Optional] Comma seperated list of cBioPortal genetic_profile_ids. If none are specified then the list of profiles for the study where display in analysis is True is used." )
    


    squeryopts = optparse.OptionGroup(parser,"Query specific parameters", "Arguments specific to a particular query")
    squeryopts.add_option("--protein_array_type", dest="protein_array_type", type="string", default = "protein_level",
                      help="[Optional] Either protein_level or phosphorylation [default=%default]" )
    squeryopts.add_option("--protein_array_id", dest = "protein_array_id", type = "string",
                      help="[Required for some] comma seperated list of one or more protein array IDs" )
    squeryopts.add_option("--array_info", dest ="protein_array_info", type ="int",  default = 0,  
                      help="[Optional] If 1, antibody infomation will also be exported in a getProteinArrayData query [default=%default]" )
    squeryopts.add_option("--report", dest = "report", type = "string", default = "full",
                      help = "[Optional] Report type to display for getLink. Either full or oncoprint_html [default=%default] " )
    squeryopts.add_option("--threshold", dest = "threshold", type="int", default = 2, 
                      help = "[Optional] Threshold for deciding if an alteration is significant for continuous metrics [default=%default]" )
    
    parser.add_option_group(cqueryopts)
    parser.add_option_group(squeryopts)

    (options,args) = E.Start(parser, add_pipe_options = False, add_output_options = False, argv = argv)

    portal = CBioPortal(url = options.url, study = options.study_id, study_name = options.study_name, case_list_id = options.case_set_id)

    results = []

    if options.gene_list_file:
        infile = IOTools.openFile(options.gene_list_file)
        gene_list = [x.strip() for x in infile]
    elif options.gene_list:
        gene_list = options.gene_list.split(",")

    if options.profile_id:
        profile_id = options.profile_id.split(",")
    else:
        profile_id = None

    if "getCancerStudies" in args:
        results.append(portal.getCancerStudies())
    
    if "getGeneticProfiles" in args:
        results.append(portal.getGeneticProfiles())

    if "getCaseLists" in args:
        results.append(portal.getCaseLists())

    if "getProfileData" in args:
        results.append(portal.getProfileData(gene_list = gene_list, genetic_profile_id = profile_id))

    if "getMutationData" in args:
        results.append(portal.getMutationData(gene_list = gene_list, genetic_profile_id = profile_id))

    if "getClinicalData" in args:
        results.append(portal.getClinicalData())

    if "getProteinArrayInfo" in args:
        results.append(portal.getProteinArrayInfo(gene_list = gene_list, protein_array_type = options.protein_array_type))

    if "getProteinArrayData" in args:
        results.append(portal.getProteinArrayData(protein_array_id = options.protein_array_id, array_info =  options.array_info))

    if "getPercentAltered" in args:
        results.append(portal.getPercentAltered(gene_list = gene_list, genetic_profile_id = profile_id, threshold = options.threshold))

    if "getLink" in args:
        results.append(portal.getLink(gene_list = gene_list, report = options.report))

    if "getOncoprintHTML" in args:
        results.append(portal.getOncoprintHTML(gene_list = gene_list))
    
    if len(results) == 0:
        sys.stderr.write( "No recognised query commands provided")
        sys.exit()
    
    if options.output_file:
        outf = IOTools.openFile(options.output_file, "w")
    else:
        outf = sys.stdout

    
    for result in results:
        try:
            outf.write(tableToString(result))
        except Exception:
            outf.write(result)


    E.Stop()
Code example #38
File: WrapperMEDIPS.py  Project: yangjl/cgat
def main(argv=None):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser(
        version=
        "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $",
        usage=globals()["__doc__"])

    parser.add_option("-f",
                      "--input-format",
                      dest="input_format",
                      type="choice",
                      choices=("bed", "bam"),
                      help="input file format [default=%default].")

    parser.add_option("-u",
                      "--ucsc-genome",
                      dest="ucsc_genome",
                      type="string",
                      help="UCSC genome identifier [default=%default].")

    parser.add_option("-g",
                      "--genome-file",
                      dest="genome_file",
                      type="string",
                      help="filename with genome [default=%default].")

    parser.add_option("-e",
                      "--extension",
                      dest="extension",
                      type="int",
                      help="extension size [default=%default].")

    parser.add_option("-b",
                      "--bin-size",
                      dest="bin_size",
                      type="int",
                      help="bin size of genome vector [default=%default].")

    parser.add_option("-l",
                      "--fragment-length",
                      dest="fragment_length",
                      type="int",
                      help="bin size of genome vector [default=%default].")

    parser.add_option(
        "-s",
        "--saturation-iterations",
        dest="saturation_iterations",
        type="int",
        help="iterations for saturation analysis [default=%default].")

    parser.add_option("-t",
                      "--toolset",
                      dest="toolset",
                      type="choice",
                      action="append",
                      choices=("saturation", "coverage", "rms", "rpm", "all"),
                      help="actions to perform [default=%default].")

    parser.add_option(
        "-w",
        "--bigwig",
        dest="bigwig",
        action="store_true",
        help=
        "store wig files as bigwig files - requires a genome file [default=%default]"
    )

    parser.set_defaults(
        input_format="bam",
        ucsc_genome="hg19",
        genome_file=None,
        extension=400,
        bin_size=50,
        saturation_iterations=10,
        fragment_length=700,
        toolset=[],
        bigwig=False,
    )

    ## add common options (-h/--help, ...) and parse command line
    (options, args) = E.Start(parser, argv=argv, add_output_options=True)

    if len(args) != 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta(options.genome_file)
        contig_sizes = fasta.getContigSizes()

    filename_sample = args[0]

    if len(options.toolset) == 0: options.toolset = ["all"]

    do_all = "all" in options.toolset

    # load MEDIPS
    R.library('MEDIPS')
    genome_file = 'BSgenome.Hsapiens.UCSC.%s' % options.ucsc_genome
    R.library(genome_file)

    tmpdir = tempfile.mkdtemp()

    E.debug("temporary files are in %s" % tmpdir)

    bin_size = options.bin_size
    extension = options.extension
    fragment_length = options.fragment_length
    saturation_iterations = options.saturation_iterations

    if options.input_format == "bam":
        E.info("converting bam files")
        filename_sample = bamToMEDIPS(filename_sample,
                                      os.path.join(tmpdir, "sample.medips"))
    elif options.input_format == "bed":
        E.info("converting bed files")
        filename_sample = bedToMEDIPS(filename_sample,
                                      os.path.join(tmpdir, "sample.medips"))

    E.info("loading data")
    R('''CONTROL.SET = MEDIPS.readAlignedSequences(
                       BSgenome = "%(genome_file)s", 
                       file = "%(filename_sample)s" ) ''' % locals())
    slotnames = (("extend", "extend",
                  "%i"), ("distFunction", "distance_function",
                          "%s"), ("slope", "slope", "%f"),
                 ("fragmentLength", "fragment_length",
                  "%i"), ("bin_size", "bin_size",
                          "%i"), ("seq_pattern", "pattern",
                                  "%s"), ("number_regions", "nregions", "%i"),
                 ("number_pattern", "npatterns",
                  "%i"), ("cali_chr", "calibration_contig",
                          "%s"), ("genome_name", "genome", "%s"))

    E.info("computing genome vector")
    R('''CONTROL.SET = MEDIPS.genomeVector(data = CONTROL.SET, 
                       bin_size = %(bin_size)i, 
                       extend=%(extension)i )''' % locals())

    E.info("computing CpG positions")
    R('''CONTROL.SET = MEDIPS.getPositions(data = CONTROL.SET, pattern = "CG")'''
      )

    E.info("compute coupling vector")
    R('''CONTROL.SET = MEDIPS.couplingVector(data = CONTROL.SET, 
                       fragmentLength = %(fragment_length)i, 
                       func = "count")''' % locals())

    E.info("compute calibration curve")
    R('''CONTROL.SET = MEDIPS.calibrationCurve(data = CONTROL.SET)''')

    E.info("normalizing")
    R('''CONTROL.SET = MEDIPS.normalize(data = CONTROL.SET)''')

    outfile = IOTools.openFile(E.getOutputFile("summary.tsv.gz"), "w")
    outfile.write("category\tvalue\n")

    if "saturation" in options.toolset or do_all:
        E.info("saturation analysis")
        R('''sr.control = MEDIPS.saturationAnalysis(data = CONTROL.SET, 
                            bin_size = %(bin_size)i, 
                            extend = %(extension)i, 
                            no_iterations = %(saturation_iterations)i, 
                            no_random_iterations = 1)''' % locals())

        R.png(E.getOutputFile("saturation.png"))
        R('''MEDIPS.plotSaturation(sr.control)''')
        R('''dev.off()''')

        R('''write.csv( sr.control$estimation, file ='%s' )''' %
          E.getOutputFile("saturation_estimation.csv"))
        outfile.write("estimated_correlation\t%f\n" %
                      R('''sr.control$maxEstCor''')[1])
        outfile.write("true_correlation\t%f\n" %
                      R('''sr.control$maxTruCor''')[1])

    if "coverage" in options.toolset or do_all:
        E.info("CpG coverage analysis")
        R('''cr.control = MEDIPS.coverageAnalysis(data = CONTROL.SET, 
                                extend = %(extension)i, 
                                no_iterations = 10)''' % locals())

        R.png(E.getOutputFile("cpg_coverage.png"))
        R('''MEDIPS.plotCoverage(cr.control)''')
        R('''dev.off()''')

        # three rows
        R('''write.csv( cr.control$coveredPos, file ='%s' )''' %
          E.getOutputFile("saturation_coveredpos.csv"))
        # coverage threshold
        # number of CpG covered
        # percentage of CpG covered

        R('''write.csv( cr.control$matrix, file ='%s' )''' %
          E.getOutputFile("saturation_matrix.csv"))

        # R('''er.control = MEDIPS.CpGenrich(data = CONTROL.SET)''')

    if "calibration" in options.toolset or do_all:
        E.info("plotting calibration")
        R.png(E.getOutputFile("calibration.png"))
        R('''MEDIPS.plotCalibrationPlot(data = CONTROL.SET, linearFit = T, xrange=250)'''
          )
        R('''dev.off()''')

    for slotname, label, pattern in slotnames:
        value = tuple(R('''CONTROL.SET@%s''' % slotname))
        if len(value) == 0: continue
        outfile.write(
            "%s\t%s\n" %
            (label, pattern % tuple(R('''CONTROL.SET@%s''' % slotname))[0]))

    outfile.close()

    if "rpm" in options.toolset or do_all:
        outputfile = E.getOutputFile("rpm.wig")
        R('''MEDIPS.exportWIG(file = '%(outputfile)s', data = CONTROL.SET, raw = T, descr = "rpm")'''
          % locals())
        if options.bigwig:
            bigwig(outputfile, contig_sizes)
        else:
            compress(outputfile)

    if "rms" in options.toolset or do_all:
        outputfile = E.getOutputFile("rms.wig")
        R('''MEDIPS.exportWIG(file = '%(outputfile)s', data = CONTROL.SET, raw = F, descr = "rms")'''
          % locals())
        if options.bigwig:
            bigwig(outputfile, contig_sizes)
        else:
            compress(outputfile)

    shutil.rmtree(tmpdir)

    ## write footer and output benchmark information.
    E.Stop()
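The helpers bamToMEDIPS(), bedToMEDIPS(), bigwig() and compress() are not shown. A minimal sketch of the last two, assuming gzip and the UCSC wigToBigWig tool are available on PATH:

import subprocess

def compress(infile):
    '''sketch: gzip a wig file in place.'''
    subprocess.check_call(["gzip", "-f", infile])

def bigwig(infile, contig_sizes):
    '''sketch: convert a wig file to bigWig via UCSC wigToBigWig.'''
    sizes_file = infile + ".sizes"
    with IOTools.openFile(sizes_file, "w") as outf:
        for contig, size in contig_sizes.items():
            outf.write("%s\t%i\n" % (contig, size))
    subprocess.check_call(["wigToBigWig", infile, sizes_file, infile + ".bw"])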
Code example #39
File: pairsdb.py  Project: AndreasHeger/adda
def buildSCOPDomains( infiles, outfile ):
    '''reconcile mapped domains into a single domain file.

    * fragments are removed - a domain must map at least 90%
      of its length.

    * domains overlapping on the same sequence with the same
      superfamily classification are merged.
    '''
    
    linksfile, fastafile = infiles

    # filtering criteria
    min_coverage = 0.9
    # only take first four fold classes
    classes = 'abcd'

    rx = re.compile(r'(\S+)\s(\S+)\s(.*)')
    id2class = {}
    with IOTools.openFile( fastafile ) as inf:
        for x in FastaIterator.iterate( inf ):
            pid, cls, description = rx.match(x.title).groups()
            id2class[pid] = (cls, len(x.sequence) )
            
    E.info('read mappings for %i sequences' % len(id2class))
    counter = E.Counter()

    with IOTools.openFile( linksfile ) as inf:
        nid2domains = collections.defaultdict( list )
        ndomains = 0
        for line in inf:
            if line.startswith('query_nid'): continue
            if line.startswith('#'): continue
            counter.links += 1
            
            domain_id, nid, evalue, domain_start, domain_end, sbjct_start, sbjct_end, \
                block_sizes, domain_starts, sbjct_starts, \
                bitscore, pid = line[:-1].split()
            
            nid, domain_start, domain_end, sbjct_start, sbjct_end = map(
                int, (nid, domain_start, domain_end, sbjct_start, sbjct_end))

            family, length = id2class[domain_id]

            cls, fold, superfamily, family = family.split('.')
            if cls not in classes: continue
            if float(domain_end - domain_start) / length < min_coverage: continue
            counter.unmerged_domains += 1
            superfamily = '00%c%03i%03i' % (cls, int(fold), int(superfamily))

            nid2domains[nid].append( (superfamily, sbjct_start, sbjct_end ) )

        counter.sequences = len(nid2domains)

    E.info( 'merging %i domains in %i sequences' % (counter.unmerged_domains, counter.sequences))

    outf = IOTools.openFile( outfile, 'w' )
    outf.write('nid\tstart\tend\tfamily\n')
    for nid, dd in sorted(nid2domains.iteritems()):
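        # note: itertools.groupby only groups *consecutive* entries, so this
        # assumes the per-sequence domain list is ordered by superfamily
        # (e.g. sorted beforehand)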
        for family, domains in itertools.groupby( dd, key = lambda x: x[0] ):
            unmerged_domains = [ (x[1],x[2]) for x in domains ]
            merged_domains = Intervals.combine( unmerged_domains )
            for start, end in merged_domains:
                counter.domains += 1
                outf.write( '%i\t%i\t%i\t%s\n' % (nid, start, end, family ) )
    outf.close()

    E.info( counter )
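Intervals.combine() merges overlapping intervals into their union. A minimal sketch of the behaviour relied on above (the real CGAT implementation may differ in detail):

def combine(intervals):
    '''sketch: merge overlapping (start, end) intervals.'''
    if not intervals:
        return []
    merged = []
    start, end = None, None
    for s, e in sorted(intervals):
        if start is None:
            start, end = s, e
        elif s > end:
            merged.append((start, end))
            start, end = s, e
        else:
            end = max(end, e)
    merged.append((start, end))
    return merged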
Code example #40
def main(argv=None):

    parser = E.OptionParser(
        version=
        "%prog version: $Id: CBioPortal.py 2888 2012-06-07 15:52:00Z ians $",
        usage=globals()["__doc__"])

    parser.add_option(
        "-o",
        "--output_file",
        type="string",
        default=None,
        help="[Optional] Filename to output results to. [default=STDOUT]")
    parser.add_option(
        "-u",
        "--url",
        type="string",
        default="http://www.cbioportal.org/public-portal/webservice.do",
        help="[Optional] Url to the cBioPortal webservice [default=%default]")

    cqueryopts = optparse.OptionGroup(parser, "Common parameters",
                                      "Common arguments to the query")
    cqueryopts.add_option(
        "-s",
        "--study_id",
        dest="study_id",
        type="string",
        default=None,
        help=
        "[Required/OPtional]  cBioPortal ID for study [default=%default].\n This or study_name required for: getGeneticProfiles, getCaseLists, getProteinArrayInfo, getLink,getOncoprintHTML, getPercentAltered, getTotalAltered"
    )
    cqueryopts.add_option(
        "-n",
        "--study_name",
        dest="study_name",
        type="string",
        default=None,
        help=
        "[Required/Optional] cBioPortal Name for study [defualt=%default].\n See above for which commands require this."
    )
    cqueryopts.add_option(
        "-c",
        "--case_set_id",
        dest="case_set_id",
        type="string",
        default=None,
        help=
        "[Required for some] cBioPortal case_set_id specifying the case list to use.\nRequired for getProfileData, getMutationData, getClincalData, getProteinArrayData, getPercentAltered, getTotalAltered. Default is case_set_id for case list 'All Tumours' "
    )
    cqueryopts.add_option(
        "-g",
        "--gene_list",
        dest="gene_list",
        type="string",
        default=None,
        help=
        "[Required for some] Comma seperated list of HUGO gene symbols or Entrez gene IDs.\nRequired for getProfileData, getMutationData, getLink, getOncoprintHTML"
    )
    cqueryopts.add_option("-f",
                          "--gene_list_file",
                          dest="gene_list_file",
                          type="string",
                          default=None,
                          help="[Optional] Filename to read in gene_list from")
    cqueryopts.add_option(
        "-p",
        "--profile_id",
        dest="profile_id",
        type="string",
        help=
        "[Optional] Comma seperated list of cBioPortal genetic_profile_ids. If none are specified then the list of profiles for the study where display in analysis is True is used."
    )

    squeryopts = optparse.OptionGroup(
        parser, "Query specific parameters",
        "Arguments specific to a particular query")
    squeryopts.add_option(
        "--protein_array_type",
        dest="protein_array_type",
        type="string",
        default="protein_level",
        help=
        "[Optional] Either protein_level or phosphorylation [default=%default]"
    )
    squeryopts.add_option(
        "--protein_array_id",
        dest="protein_array_id",
        type="string",
        help=
        "[Required for some] comma seperated list of one or more protein array IDs"
    )
    squeryopts.add_option(
        "--array_info",
        dest="protein_array_info",
        type="int",
        default=0,
        help=
        "[Optional] If 1, antibody infomation will also be exported in a getProteinArrayData query [default=%default]"
    )
    squeryopts.add_option(
        "--report",
        dest="report",
        type="string",
        default="full",
        help=
        "[Optional] Report type to display for getLink. Either full or oncoprint_html [default=%default] "
    )
    squeryopts.add_option(
        "--threshold",
        dest="threshold",
        type="int",
        default=2,
        help=
        "[Optional] Threshold for deciding if an alteration is significant for continuous metrics [default=%default]"
    )

    parser.add_option_group(cqueryopts)
    parser.add_option_group(squeryopts)

    (options, args) = E.Start(parser,
                              add_pipe_options=False,
                              add_output_options=False,
                              argv=argv)

    portal = CBioPortal(url=options.url,
                        study=options.study_id,
                        study_name=options.study_name,
                        case_list_id=options.case_set_id)

    results = []

    if options.gene_list_file:
        infile = IOTools.openFile(options.gene_list_file)
        gene_list = [x.strip() for x in infile]
    elif options.gene_list:
        gene_list = options.gene_list.split(",")

    if options.profile_id:
        profile_id = options.profile_id.split(",")
    else:
        profile_id = None

    if "getCancerStudies" in args:
        results.append(portal.getCancerStudies())

    if "getGeneticProfiles" in args:
        results.append(portal.getGeneticProfiles())

    if "getCaseLists" in args:
        results.append(portal.getCaseLists())

    if "getProfileData" in args:
        results.append(
            portal.getProfileData(gene_list=gene_list,
                                  genetic_profile_id=profile_id))

    if "getMutationData" in args:
        results.append(
            portal.getMutationData(gene_list=gene_list,
                                   genetic_profile_id=profile_id))

    if "getClinicalData" in args:
        results.append(portal.getClinicalData())

    if "getProteinArrayInfo" in args:
        results.append(
            portal.getProteinArrayInfo(
                gene_list=gene_list,
                protein_array_type=options.protein_array_type))

    if "getProteinArrayData" in args:
        results.append(
            portal.getProteinArrayData(
                protein_array_id=options.protein_array_id,
                array_info=options.array_info))

    if "getPercentAltered" in args:
        results.append(
            portal.getPercentAltered(gene_list=gene_list,
                                     genetic_profile_id=profile_id,
                                     threshold=options.threshold))

    if "getLink" in args:
        results.append(
            portal.getLink(gene_list=gene_list, report=options.report))

    if "getOncoprintHTML" in args:
        results.append(portal.getOncoprintHTML(gene_list=gene_list))

    if len(results) == 0:
        sys.stderr.write("No recognised query commands provided")
        sys.exit()

    if options.output_file:
        outf = IOTools.openFile(options.output_file, "w")
    else:
        outf = sys.stdout

    for result in results:
        try:
            outf.write(tableToString(result))
        except Exception:
            outf.write(result)

    E.Stop()
Code example #41
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $", 
                                    usage = globals()["__doc__"] )

    parser.add_option("-f", "--input-format", dest="input_format", type="choice",
                      choices = ("bed", "bam"),
                      help="input file format [default=%default]."  )
    
    parser.add_option("-u", "--ucsc-genome", dest="ucsc_genome", type="string",
                      help="UCSC genome identifier [default=%default]."  )

    parser.add_option("-g", "--genome-file", dest="genome_file", type="string",
                      help="filename with genome [default=%default]."  )

    parser.add_option("-e", "--extension", dest="extension", type="int",
                      help="extension size [default=%default]."  )

    parser.add_option("-b", "--bin-size", dest="bin_size", type="int",
                      help="bin size of genome vector [default=%default]."  )

    parser.add_option("-l", "--fragment-length", dest="fragment_length", type="int",
                      help="bin size of genome vector [default=%default]."  )

    parser.add_option("-s", "--saturation-iterations", dest="saturation_iterations", type="int",
                      help = "iterations for saturation analysis [default=%default]."  )
    
    parser.add_option( "-t", "--toolset", dest="toolset", type="choice", action="append",
                       choices = ("saturation", "coverage", "rms", "rpm", "all"),
                       help = "actions to perform [default=%default]." )
    
    parser.add_option( "-w", "--bigwig", dest="bigwig", action = "store_true",
                       help = "store wig files as bigwig files - requires a genome file [default=%default]" )

    parser.set_defaults(
        input_format = "bam",
        ucsc_genome = "hg19",
        genome_file = None,
        extension = 400,
        bin_size = 50,
        saturation_iterations = 10,
        fragment_length = 700,
        toolset = [],
        bigwig = False,
        )

    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv, add_output_options = True )

    if len(args) != 1:
        raise ValueError("please specify a filename with sample data")

    if options.bigwig and not options.genome_file:
        raise ValueError("please provide a genome file when outputting bigwig")

    if options.genome_file:
        fasta = IndexedFasta.IndexedFasta( options.genome_file )
        contig_sizes = fasta.getContigSizes()
        
    filename_sample = args[0]

    if len(options.toolset) == 0: options.toolset = ["all"]

    do_all = "all" in options.toolset
    
    # load MEDIPS
    R.library( 'MEDIPS' )
    genome_file = 'BSgenome.Hsapiens.UCSC.%s' % options.ucsc_genome 
    R.library( genome_file )
    
    tmpdir = tempfile.mkdtemp( )

    E.debug( "temporary files are in %s" % tmpdir )

    bin_size = options.bin_size
    extension = options.extension
    fragment_length = options.fragment_length
    saturation_iterations = options.saturation_iterations

    if options.input_format == "bam":
        E.info( "converting bam files" )
        filename_sample = bamToMEDIPS( filename_sample, os.path.join( tmpdir, "sample.medips" ) )
    elif options.input_format == "bed":
        E.info( "converting bed files" )
        filename_sample = bedToMEDIPS( filename_sample, os.path.join( tmpdir, "sample.medips" ) )

    E.info( "loading data" )
    R('''CONTROL.SET = MEDIPS.readAlignedSequences(
                       BSgenome = "%(genome_file)s", 
                       file = "%(filename_sample)s" ) ''' % locals() )
    slotnames = ( ( "extend", "extend", "%i"),
                  ( "distFunction", "distance_function", "%s"),
                  ( "slope", "slope", "%f"),
                  ( "fragmentLength", "fragment_length", "%i" ),
                  ( "bin_size", "bin_size", "%i"),
                  ( "seq_pattern", "pattern", "%s" ),
                  ( "number_regions", "nregions", "%i"),
                  ( "number_pattern", "npatterns", "%i" ),
                  ( "cali_chr", "calibration_contig", "%s"),
                  ( "genome_name", "genome", "%s") )


    E.info( "computing genome vector" )
    R('''CONTROL.SET = MEDIPS.genomeVector(data = CONTROL.SET, 
                       bin_size = %(bin_size)i, 
                       extend=%(extension)i )''' % locals())

    E.info( "computing CpG positions" )
    R('''CONTROL.SET = MEDIPS.getPositions(data = CONTROL.SET, pattern = "CG")''' )

    E.info( "compute coupling vector" )
    R('''CONTROL.SET = MEDIPS.couplingVector(data = CONTROL.SET, 
                       fragmentLength = %(fragment_length)i, 
                       func = "count")''' % locals() )
    
    E.info( "compute calibration curve" )
    R('''CONTROL.SET = MEDIPS.calibrationCurve(data = CONTROL.SET)''')

    E.info( "normalizing" )
    R('''CONTROL.SET = MEDIPS.normalize(data = CONTROL.SET)''')

    outfile = IOTools.openFile( E.getOutputFile( "summary.tsv.gz" ), "w" )
    outfile.write( "category\tvalue\n" )

    if "saturation" in options.toolset or do_all:
        E.info( "saturation analysis" )
        R('''sr.control = MEDIPS.saturationAnalysis(data = CONTROL.SET, 
                            bin_size = %(bin_size)i, 
                            extend = %(extension)i, 
                            no_iterations = %(saturation_iterations)i, 
                            no_random_iterations = 1)''' % locals() )

        R.png( E.getOutputFile( "saturation.png" ) )
        R('''MEDIPS.plotSaturation(sr.control)''')
        R('''dev.off()''')

        R('''write.csv( sr.control$estimation, file ='%s' )'''% E.getOutputFile( "saturation_estimation.csv" ) )
        outfile.write( "estimated_correlation\t%f\n" % R('''sr.control$maxEstCor''')[1] )
        outfile.write( "true_correlation\t%f\n" % R('''sr.control$maxTruCor''')[1] )

    if "coverage" in options.toolset or do_all:
        E.info( "CpG coverage analysis" )
        R('''cr.control = MEDIPS.coverageAnalysis(data = CONTROL.SET, 
                                extend = %(extension)i, 
                                no_iterations = 10)''' % locals())

        R.png( E.getOutputFile( "cpg_coverage.png" ) )
        R('''MEDIPS.plotCoverage(cr.control)''')
        R('''dev.off()''')

        # cr.control$coveredPos holds three rows per entry: the coverage
        # threshold, the number of CpGs covered and the percentage of
        # CpGs covered
        R('''write.csv( cr.control$coveredPos, file ='%s' )'''% E.getOutputFile( "cpg_coverage_coveredpos.csv" ) )

        R('''write.csv( cr.control$matrix, file ='%s' )'''% E.getOutputFile( "cpg_coverage_matrix.csv" ) )

        # R('''er.control = MEDIPS.CpGenrich(data = CONTROL.SET)''')

    if "calibration" in options.toolset or do_all:
        E.info( "plotting calibration" )
        R.png( E.getOutputFile( "calibration.png" ) )
        R('''MEDIPS.plotCalibrationPlot(data = CONTROL.SET, linearFit = T, xrange=250)''')
        R('''dev.off()''')

    
    for slotname, label, pattern in slotnames:
        value = tuple(R('''CONTROL.SET@%s''' % slotname ))
        if len(value) == 0: continue
        outfile.write( "%s\t%s\n" % (label, pattern % value[0] ) )
        
    outfile.close()
        
    if "rpm" in options.toolset or do_all:
        outputfile = E.getOutputFile( "rpm.wig" )
        R('''MEDIPS.exportWIG(file = '%(outputfile)s', data = CONTROL.SET, raw = T, descr = "rpm")''' % locals())
        if options.bigwig:
            bigwig( outputfile, contig_sizes )
        else:
            compress( outputfile )
    
    if "rms" in options.toolset or do_all:
        outputfile = E.getOutputFile( "rms.wig" )
        R('''MEDIPS.exportWIG(file = '%(outputfile)s', data = CONTROL.SET, raw = F, descr = "rms")''' % locals())
        if options.bigwig:
            bigwig( outputfile, contig_sizes )
        else:
            compress( outputfile )

    shutil.rmtree( tmpdir )

    ## write footer and output benchmark information.
    E.Stop()
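The rpm/rms export steps call `bigwig()` and `compress()` helpers defined elsewhere in this script. A minimal sketch of what they could look like, assuming gzip and UCSC's wigToBigWig are on the PATH (names and signatures follow the calls above; everything else is an assumption):

import os
import subprocess

def compress(infile):
    '''gzip a wig file in place (sketch).'''
    subprocess.check_call(["gzip", "-f", infile])

def bigwig(infile, contig_sizes):
    '''convert a wig file to bigwig and remove the wig (sketch).'''
    chromfile = infile + ".chrom.sizes"
    with open(chromfile, "w") as outf:
        for contig, size in contig_sizes.items():
            outf.write("%s\t%i\n" % (contig, size))
    # usage: wigToBigWig in.wig chrom.sizes out.bw
    subprocess.check_call(["wigToBigWig", infile, chromfile, infile + ".bw"])
    os.unlink(chromfile)
    os.unlink(infile)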
Code example #42
File: WrapperZinba.py Project: siping/cgat
def main( argv = None ):
    """script main.

    parses command line options in sys.argv, unless *argv* is given.
    """

    if not argv: argv = sys.argv

    # setup command line parser
    parser = E.OptionParser( version = "%prog version: $Id: cgat_script_template.py 2871 2010-03-03 10:20:44Z andreas $", 
                                    usage = globals()["__doc__"] )

    parser.add_option("-f", "--input-format", dest="input_format", type="choice",
                      choices = ("bed", "bam"),
                      help="input file format [default=%default]."  )
    
    parser.add_option("-s", "--fragment-size", dest="fragment_size", type="int",
                      help="fragment size [default=%default]."  )

    parser.add_option("-m", "--mappability-dir", dest="mappability_dir", type="string",
                      help="mappability_dir [default=%default]."  )

    parser.add_option("-b", "--bit-filename", dest="bit_filename", type="string",
                      help="2bit genome filename [default=%default]."  )

    parser.add_option("-c", "--control-filename", dest="control_filename", type="string",
                      help="filename of input/control data in bed format [default=%default]."  )

    parser.add_option("-i", "--index-dir", dest="index_dir", type="string",
                      help="index directory [default=%default]."  )

    parser.add_option("-t", "--threads", dest="threads", type="int",
                      help="number of threads to use [default=%default]."  )

    parser.add_option("-q", "--fdr-threshold", dest="fdr_threshold", type="float",
                      help="fdr threshold [default=%default]."  )

    parser.add_option("-a", "--alignability-threshold", dest="alignability_threshold", type="int",
                      help="alignability threshold [default=%default]."  )

    parser.add_option("-p", "--per-contig", dest="per_contig", action = "store_true",
                      help="run analysis per chromosome [default=%default]")

    parser.add_option("-w", "--temp-dir", dest="tempdir", type="string",
                      help="use existing directory as temporary directory [default=%default]."  )

    parser.add_option( "--keep-temp", dest="keep_temp", action = "store_true",
                      help="keep temporary directory [default=%default]")

    parser.add_option( "--action", dest="action", type="choice",
                       choices=("full", "count", "predict", "model"),
                       help="action to perform [default=%default]")

    parser.add_option( "--improvement", dest="improvement", type="float",
                       help="relative improvement of likelihood until convergence [default=%default]")
    
    parser.set_defaults(
        input_format = "bed",
        fragment_size = 200,
        mappability_dir = None,
        threads = 1,
        alignability_threshold = 1,
        bit_filename = None,
        fdr_threshold = 0.05,
        tempdir = None,
        winsize = 250,
        offset = 125,
        cnvWinSize = 100000,
        cnvOffset = 0,
        per_contig = False,
        keep_temp = False,
        filelist = "files.list",
        action = "full",
        improvement = 0.00001,
        )


    ## add common options (-h/--help, ...) and parse command line 
    (options, args) = E.Start( parser, argv = argv )

    if len(args) != 2:
        raise ValueError("please specify a filename with sample data and an output file")

    filename_sample, filename_output = args[0], args[1]
    filename_control = options.control_filename

    # -p/--per-contig is a shortcut for --action=per_contig
    if options.per_contig:
        options.action = "per_contig"
    
    # load Zinba
    R.library( 'zinba' )

    if not options.tempdir:
        tmpdir = tempfile.mkdtemp( )
    else:
        tmpdir = options.tempdir

    E.debug( "temporary files are in %s" % tmpdir )

    if options.input_format == "bam":
        E.info( "converting bam files to bed" )
        if not os.path.exists( os.path.join( tmpdir, "sample.bed")):
            filename_sample = bamToBed( filename_sample, os.path.join( tmpdir, "sample.bed" ) )
        else:
            E.info("using existing file %(tmpdir)s/sample.bed" % locals() )
            filename_sample = os.path.join( tmpdir, "sample.bed")
        if filename_control:
            if not os.path.exists( os.path.join( tmpdir, "control.bed")):
                filename_control = bamToBed( filename_control, os.path.join( tmpdir, "control.bed" ) )
            else:
                E.info("using existing file %(tmpdir)s/control.bed" % locals() )
                filename_control = os.path.join( tmpdir, "control.bed")

    fragment_size = options.fragment_size
    threads = options.threads
    bit_filename = options.bit_filename
    mappability_dir = options.mappability_dir
    fdr_threshold = options.fdr_threshold
    tol = options.improvement

    E.run( "twoBitInfo %(bit_filename)s %(tmpdir)s/contig_sizes" % locals() )
    contig2size = dict( [x.split() for x in IOTools.openFile( os.path.join( tmpdir, "contig_sizes")) ] )

    outdir = filename_output + "_files" 
    if not os.path.exists( outdir ):
        os.mkdir( outdir )
        
    filelist = os.path.join( outdir, filename_output + ".list")
    modelfile = os.path.join( outdir, filename_output + ".model")
    winfile = os.path.join( outdir, filename_output + ".wins")
    # window and CNV scan parameters
    winSize = options.winsize
    offset = options.offset
    cnvWinSize = options.cnvWinSize
    cnvOffset = options.cnvOffset
    winGap = 0
    peakconfidence = 1.0 - fdr_threshold

    if not os.path.exists( os.path.join( tmpdir, "basecount")):
        E.info( "computing counts" )

        R( '''basealigncount( inputfile='%(filename_sample)s',
                          outputfile='%(tmpdir)s/basecount',
                          extension=%(fragment_size)i,
                          filetype='bed',
                          twoBitFile='%(bit_filename)s' )
                          '''  % locals() )
    else:
        E.info( "using existing counts" )

    # tried incremental updates
    # for contig, size in contig2size.iteritems():
    #     for size in 
    #     fn = os.path.join( tmpdir, "sample_%(contig)s_win%(size)ibp_offset(offset)ibp.txt" % locals() )
    if options.action == "count":

        E.info("computing window counts only - saving results in %s" % outdir )
        R('''buildwindowdata(
                     seq='%(filename_sample)s', 
                     align='%(mappability_dir)s',
                     input='%(filename_control)s', 
                     twoBit='%(bit_filename)s', 
                     winSize=%(winSize)i,
                     offset=%(offset)i,
                     cnvWinSize=%(cnvWinSize)i,
                     cnvOffset=%(cnvOffset)i,
                     filelist='%(filelist)s',
                     filetype='bed',  
                     extension=%(fragment_size)s,
                     outdir='%(outdir)s/') ''' % locals() )

    elif options.action == "model":

        # The important option is buildwin = 0
        # parameterized for broad == FALSE and input present
        # see zinba.R
        # model selection only on chr19.
        R('''run.zinba( 
                filelist='%(filelist)s',
                formula=NULL,formulaE=NULL,formulaZ=NULL,
                outfile='%(filename_output)s',
                seq='%(filename_sample)s', 
                input='%(filename_control)s', 
                filetype='bed',  
                align='%(mappability_dir)s',
                twoBit='%(bit_filename)s', 
                extension=%(fragment_size)s, 
                winSize=%(winSize)i,
                offset=%(offset)i,
                cnvWinSize=%(cnvWinSize)i,
                cnvOffset=%(cnvOffset)i,
                basecountfile='%(tmpdir)s/basecount',
                buildwin=0,
                threshold=%(fdr_threshold)f,
                pquant=1,
                peakconfidence=%(peakconfidence)f,
                winGap=%(winGap)i,
                tol=%(tol)f,
                initmethod="count",
                method="mixture",
                numProc=%(threads)i,
                printFullOut=1,
                interaction=FALSE,
                selectmodel=TRUE,
                selectchr='chr19',
                selectcovs=c("input_count"),
                selecttype="complete",
                FDR=TRUE)''' % locals())

    elif options.action == "predict":
    
        # The important option is buildwin = 0 and selectmodel = FALSE
        # parameterized for broad == FALSE and input present
        # see zinba.R
        # model selection only on chr19.
        if not os.path.exists( modelfile ):
            raise OSError( "model file %s does not exist" % modelfile )

        E.info( "reading model from %s" % modelfile )

        R('''
        final=read.table('%(modelfile)s', header=T, sep="\t")
        final=final[final$fail==0,]
        bestBIC=which.min(final$BIC)
        formula=as.formula(paste("exp_count~",final$formula[bestBIC]))
        formulaE=as.formula(paste("exp_count~",final$formulaE[bestBIC]))
        formulaZ=as.formula(paste("exp_count~",final$formulaZ[bestBIC]))
        cat("Background formula is:\n\t")
        print(formula)
        cat("Enrichment formula is:\n\t")
        print(formulaE)
        cat("Zero-inflated formula is:\n\t")
        print(formulaZ)
        ''' % locals() )

        E.info( "predicting peaks" )

        R('''run.zinba(
                filelist='%(filelist)s',
                outfile='%(filename_output)s',
                seq='%(filename_sample)s',
                input='%(filename_control)s',
                filetype='bed',
                align='%(mappability_dir)s',
                twoBit='%(bit_filename)s',
                extension=%(fragment_size)s,
                winSize=%(winSize)i,
                offset=%(offset)i,
                cnvWinSize=%(cnvWinSize)i,
                cnvOffset=%(cnvOffset)i,
                basecountfile='%(tmpdir)s/basecount',
                buildwin=0,
                threshold=%(fdr_threshold)f,
                pquant=1,
                winGap=%(winGap)i,
                initmethod="count",
                tol=%(tol)f,
                method="mixture",
                numProc=%(threads)i,
                printFullOut=1,
                interaction=FALSE,
                selectmodel=FALSE,
                formula=formula,
                formulaE=formulaE,
                formulaZ=formulaZ,
                peakconfidence=%(peakconfidence)f,
                FDR=TRUE)''' % locals())

    elif options.action == "per_contig":

        E.info("processing per chromosome" )
        for contig, size in contig2size.iteritems():

            E.info("processing contig %s" % contig)
            filename_sample_contig = filename_sample + "_%s" % contig
            filename_control_contig = filename_control + "_%s" % contig
            if not os.path.exists( filename_output + "_files" ):
                os.mkdir( filename_output + "_files" )
            filename_output_contig = os.path.join( filename_output + "_files", contig )
            filename_basecounts_contig = os.path.join( tmpdir, "basecount_%s" % contig)

            E.run( "grep %(contig)s < %(filename_sample)s > %(filename_sample_contig)s" % locals() )
            E.run( "grep %(contig)s < %(filename_control)s > %(filename_control_contig)s" % locals() )

            if not os.path.exists( filename_basecounts_contig ):
                E.info( "computing counts" )

                R( '''basealigncount( inputfile='%(filename_sample_contig)s',
                                  outputfile='%(filename_basecounts_contig)s',
                                  extension=%(fragment_size)i,
                                  filetype='bed',
                                  twoBitFile='%(bit_filename)s' )
                                  '''  % locals() )
            else:
                E.info( "using existing counts" )

            # run zinba, do not build window data
            R( '''zinba( refinepeaks=1,
                     seq='%(filename_sample_contig)s',
                     input='%(filename_control_contig)s',
                     filetype='bed',
                     align='%(mappability_dir)s',
                     twoBit='%(bit_filename)s',
                     outfile='%(filename_output_contig)s',
                     extension=%(fragment_size)s,
                     basecountfile='%(filename_basecounts_contig)s',
                     numProc=%(threads)i,
                     threshold=%(fdr_threshold)f,
                     broad=FALSE,
                     printFullOut=0,
                     interaction=FALSE,
                     mode='peaks',
                     FDR=TRUE) '''  % locals() )
    elif options.action == "full":

        # run zinba, do not build window data
        R( '''zinba( refinepeaks=1,
                     seq='%(filename_sample)s',
                     input='%(filename_control)s',
                     filetype='bed',
                     align='%(mappability_dir)s',
                     twoBit='%(bit_filename)s',
                     outfile='%(filename_output)s',
                     extension=%(fragment_size)s,
                     basecountfile='%(tmpdir)s/basecount',
                     numProc=%(threads)i,
                     threshold=%(fdr_threshold)f,
                     broad=FALSE,
                     printFullOut=0,
                     interaction=FALSE,
                     mode='peaks',
                     FDR=TRUE) '''  % locals() )

    if not (options.tempdir or options.keep_temp):
        shutil.rmtree( tmpdir )

    ## write footer and output benchmark information.
    E.Stop()
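The bed conversion step near the top of this wrapper relies on a `bamToBed` helper defined elsewhere. A minimal sketch, assuming bedtools is installed (only the name and call signature come from the code above):

import subprocess

def bamToBed(infile, outfile):
    '''convert BAM to BED with bedtools and return the output filename,
    matching how the wrapper rebinds its variables (sketch).'''
    with open(outfile, "w") as outf:
        subprocess.check_call(["bedtools", "bamtobed", "-i", infile], stdout=outf)
    return outfile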