import csv
import glob
import os
import random
import shutil
import sqlite3

import pysam

# CGAT pipeline utilities; the exact module paths may differ between CGAT
# versions (e.g. Experiment vs CGAT.Experiment).
import Experiment as E
import IOTools
import Pipeline as P
import Stats

# PARAMS is assumed to be provided by the surrounding pipeline configuration,
# e.g. PARAMS = P.getParameters().


def buildNormalizedBAM( infiles, outfile ):
    '''build a normalized BAM file by subsampling reads down to the size
    of the smallest library (taken from the *.readstats files).'''
    
    min_reads = getMinimumMappedReads( glob.glob("*.readstats") )
    infile, statsfile = infiles
    num_reads = getMappedReads( statsfile )
    
    pysam_in = pysam.Samfile( infile, "rb" )
    pysam_out = pysam.Samfile( outfile, "wb", template = pysam_in )

    ninput, noutput = 0, 0

    take = [1] * min_reads + [0] * (num_reads-min_reads)
    random.shuffle( take )
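    # only reads flagged in 'take' are written below; after shuffling this
    # yields a uniform random subsample of exactly min_reads reads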

    # iterate over mapped reads
    for read in pysam_in.fetch():
        if take[ninput]:
            pysam_out.write( read )
            noutput += 1
        ninput += 1

    pysam_in.close()
    pysam_out.close()

    P.info( "buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" % (ninput, noutput, 100.0*noutput/ninput, min_reads ))
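
# The exact-count subsampling used above, shown in isolation: a minimal
# sketch of the take-vector trick, independent of pysam. Build a 0/1 mask
# with exactly n ones, shuffle it, and keep the marked items.
def _subsample_exact(items, n):
    '''return a uniform random subsample of exactly n items, preserving order.'''
    take = [1] * n + [0] * (len(items) - n)
    random.shuffle(take)
    return [item for item, keep in zip(items, take) if keep]
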
def buildExpressionTracks(infile, outfiles, map_exp2columns, suffix):
    '''build expression tracks.

    read the analysis from FILENAME_EXPRESSION
    
    .. note::
       The file A589_Data_RMA.csv does NOT always contain the probeset_id
       in the first column; instead it may contain the transcript_cluster_id.
       A possible explanation is that if several probesets map to the same
       transcript cluster, the transcript cluster is normalized.

       The sets of cluster_ids and probeset_ids are completely non-overlapping.

    Hence, the :term:`cluster_id` will be used.
    '''

    E.info("importing expression data from %s" % infile)

    dbhandle = sqlite3.connect(PARAMS["database"])

    cc = dbhandle.cursor()
    statement = "SELECT DISTINCT probeset, cluster_id, transcript_id FROM probeset2transcript"
    cc.execute(statement)
    map_cluster2transcript, map_probeset2cluster = {}, {}
    for probeset, cluster, transcript_id in cc.fetchall():
        map_probeset2cluster[probeset] = cluster
        map_cluster2transcript[cluster] = transcript_id
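    # map_probeset2cluster collapses probesets onto their transcript cluster;
    # map_cluster2transcript is used below to drop rows without an annotated
    # transcript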

    reader = csv.reader(open(infile, "rU"))

    first = True
    # do not delete old files as this function is called several times
    output_files = IOTools.FilePool(output_pattern="exp%s.data", force=False)

    headers = (
        ("Probe Set ID", "cluster_id"),
        ("Gene Symbol", "genesymbol"),
        ("mRna - Description", "description"),
        ('mRNA Accession', 'mrna_id'),
        ('mRNA  Source', 'source'),
        ('mRNA - xhyb', 'xhyb'),
        ('GO Biological Process ID', 'go_biol_id'),
        ('GO Biological Process Term', 'go_biol_term'),
        ('GO Cellular Component ID', 'go_cell_id'),
        ('GO Cellular Component Term', 'go_cell_term'),
        ('GO Molecular Function ID', 'go_mol_id'),
        ('GO Molecular Function Term', 'go_mol_term'),
        ('Pathway Source', 'pw_source'),
        ('Pathway Name', 'pw_name'))

    old_headers = set([x[0] for x in headers])
    new_headers = [x[1] for x in headers]
    take = []
    index_source, index_accession, index_probeset = None, None, None
    counts = E.Counter()
    found = set()

    outf = open(outfiles[0] + suffix, "w")
    outf.write("# %s\n" % infile)
    outs = open(outfiles[1] + suffix, "w")
    outs.write("# %s\n" % infile)

    writer = csv.writer(outf)

    for row in reader:
        if first:
            first = False
            writer.writerow(row)

            for x, old_header in enumerate(row):
                if old_header == "mRNA  Source": index_source = len(take)
                if old_header == "mRNA Accession": index_accession = len(take)
                if old_header == "Probe Set ID": index_probeset = len(take)
                if old_header in old_headers: take.append(x)

            # write headers to all files
            outs.write("\t".join(new_headers) + "\n")

            for exp, columns in map_exp2columns.items():
                output_files.write(
                    exp,
                    "\t".join(("cluster_id",
                               Stats.Summary().getHeader(),
                               "\t".join(["R%i" % i for i in range(len(columns))]))) + "\n")
        else:
            new_row = []
            for x in take:
                if row[x].strip() != "---":
                    new_row.append(row[x].strip())
                else:
                    new_row.append("")

            probeset = new_row[index_probeset].strip()
            if probeset in map_probeset2cluster:
                probeset = map_probeset2cluster[probeset]
                counts.mapped_to_cluster += 1

            if probeset not in map_cluster2transcript:
                writer.writerow(row)
                counts.skipped += 1
                continue
            else:
                if probeset in found:
                    counts.duplicates += 1
                counts.output += 1
                found.add(probeset)

            outs.write("\t".join(new_row) + "\n")

            for exp, cols in map_exp2columns.items():
                data = [row[x] for x in cols]
                output_files.write(
                    exp,
                    "\t".join((probeset,
                               str(Stats.Summary([float(x) for x in data])),
                               "\t".join(data))) + "\n")

    outf.close()
    outs.close()
    if counts.duplicates > 0:
        E.warn("duplicate probeset/clusters")

    E.info("probeset source information: %s" % str(counts))
    output_files.close()
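
# A hypothetical invocation of buildExpressionTracks (file names, experiment
# names and column indices below are illustrative, not taken from the
# pipeline): outfiles[0] + suffix receives the rows that could not be mapped
# to an annotated transcript, outfiles[1] + suffix the reduced annotation
# table, and per-experiment expression values are appended to exp<name>.data
# files via the FilePool.
#
#   buildExpressionTracks("A589_Data_RMA.csv",
#                         ("expression_full", "expression_norm"),
#                         {"treatment": [14, 15, 16], "control": [17, 18, 19]},
#                         ".tsv")
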
def buildNormalizedBAM( infiles, outfile, normalize = True ):
    '''build a normalized BAM file.

    Infiles are merged and duplicated reads are removed. 
    If *normalize* is set, reads are removed such that all 
    files will have approximately the same number of reads.
    '''

    min_reads = getMinimumMappedReads( glob.glob("*.readstats") )
    
    samfiles = []
    num_reads = 0
    for infile, statsfile in infiles:
        samfiles.append( pysam.Samfile( infile, "rb" ) )
        num_reads += getMappedReads( statsfile )

    threshold = float(min_reads) / num_reads 
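    # each read is kept with probability threshold = min_reads / num_reads,
    # so the expected number of reads written is approximately min_reads
    # (Bernoulli subsampling, not an exact count as in the take-vector
    # variant above)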

    pysam_out = pysam.Samfile( outfile, "wb", template = samfiles[0] )

    ninput, noutput, nduplicates = 0, 0, 0

    # iterate over mapped reads
    last_contig, last_pos = None, None
    for pysam_in in samfiles:
        for read in pysam_in.fetch():

            ninput += 1
            if read.rname == last_contig and read.pos == last_pos:
                nduplicates += 1
                continue

            # keep every non-duplicate read when not normalizing; otherwise
            # keep each read with probability *threshold*
            if not normalize or random.random() <= threshold:
                pysam_out.write( read )
                noutput += 1

            last_contig, last_pos = read.rname, read.pos

        pysam_in.close()

    pysam_out.close()

    logs = open( outfile + ".log", "w")
    logs.write("# min_reads=%i, threshold= %5.2f\n" % \
                   (min_reads, threshold))
    logs.write("set\tcounts\tpercent\n")
    logs.write("ninput\t%i\t%5.2f%%\n" % (ninput, 100.0) )
    nwithout_dups = ninput - nduplicates
    logs.write("duplicates\t%i\t%5.2f%%\n" % (nduplicates,100.0*nduplicates/ninput))
    logs.write("without duplicates\t%i\t%5.2f%%\n" % (nwithout_dups,100.0*nwithout_dups/ninput))
    logs.write("target\t%i\t%5.2f%%\n" %   (min_reads,100.0*min_reads/nwithout_dups))
    logs.write("noutput\t%i\t%5.2f%%\n" % (noutput,100.0*noutput/nwithout_dups))
    
    logs.close()
    
    # if more than one samfile: sort
    if len(samfiles) > 1:
        tmpfilename = P.getTempFilename()
        pysam.sort( outfile, tmpfilename )
        shutil.move( tmpfilename + ".bam", outfile )
        os.unlink( tmpfilename )

    pysam.index( outfile )

    P.info( "buildNormalizedBam: %i input, %i output (%5.2f%%), should be %i" % (ninput, noutput, 100.0*noutput/ninput, min_reads ))
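
# A minimal usage sketch with hypothetical file names: merge two replicates,
# drop reads duplicated at the same position, and downsample towards the
# smallest library recorded in the *.readstats files.
#
#   buildNormalizedBAM([("rep1.bam", "rep1.readstats"),
#                       ("rep2.bam", "rep2.readstats")],
#                      "merged.normalized.bam",
#                      normalize=True)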