def loadMAST(infile, outfile):
    '''parse mast file and load into database.

    Parse several motif runs and add them to the same table.

    Add columns for the control data as well.

    *infile* is split into sections at lines starting with ``::``.
    Section headers must look like ``:: motif = <motif> - <part> ::``
    and sections must alternate between ``foreground`` and
    ``background`` for the same motif.  For each foreground match one
    row is written, extended by the motif, the contig and the
    statistics of the left (``l``) and right (``r``) control matches
    sharing the same id.  Missing controls are filled in with the
    configured ``mast_evalue`` threshold.

    The table is loaded through ``csv2db.py``, whose output is
    redirected to *outfile*.

    Raises P.PipelineError if a section header can not be parsed.
    '''

    tablename = P.toTable(outfile)

    with IOTools.openFile(infile) as inf:
        lines = inf.readlines()

    # start lines of all sections plus a sentinel for the end of the file
    chunks = [x for x, line in enumerate(lines) if line.startswith("::")]
    chunks.append(len(lines))

    def readChunk(lines, chunk):
        '''parse section *chunk*; return (motif, part, parsed MAST result).'''
        header = lines[chunks[chunk]]
        try:
            motif, part = re.match(
                r":: motif = (\S+) - (\S+) ::", header).groups()
        except AttributeError:
            raise P.PipelineError(
                "parsing error in line '%s'" % header)

        E.info("reading %s - %s" % (motif, part))

        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.getTempFile(".")
        try:
            tmpfile2.write(
                "".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
            tmpfile2.close()
            # NOTE(review): the handle given to the parser is left open
            # as before - it is not visible here whether MAST.parse
            # reads eagerly.
            mast = MAST.parse(IOTools.openFile(tmpfile2.name, "r"))
        finally:
            # always remove the temporary file, also on errors
            tmpfile2.close()
            os.unlink(tmpfile2.name)

        return motif, part, mast

    def splitId(s, mode):
        '''split a match id into its components.

        A background ("bg") id has three parts: track _ id _ pos,
        a foreground ("fg") id has two: track _ id.

        track might contain '_'.
        '''
        d = s.split("_")
        if mode == "bg":
            return "_".join(d[:-2]), d[-2], d[-1]
        elif mode == "fg":
            return "_".join(d[:-1]), d[-1]

    tmpfile = P.getTempFile(".")
    # P.run() below interpolates local variables into ``statement``
    tmpfilename = tmpfile.name

    try:
        tmpfile.write(
            MAST.Match().header +
            "\tmotif\tcontig"
            "\tl_evalue\tl_pvalue\tl_nmatches\tl_length\tl_start\tl_end"
            "\tr_evalue\tr_pvalue\tr_nmatches\tr_length\tr_start\tr_end"
            "\tmin_evalue\tmin_pvalue\tmax_nmatches" + "\n")

        # sections come in pairs: foreground followed by background
        for chunk in range(0, len(chunks) - 1, 2):

            motif_fg, part, mast_fg = readChunk(lines, chunk)
            assert part == "foreground"
            motif_bg, part, mast_bg = readChunk(lines, chunk + 1)
            assert part == "background"
            assert motif_fg == motif_bg

            # index control data by id and position
            controls = collections.defaultdict(dict)
            for match in mast_bg.matches:
                _, match_id, pos = splitId(match.id, "bg")
                controls[match_id][pos] = (
                    match.evalue, match.pvalue, match.nmotifs,
                    match.length, match.start, match.end)

            for match in mast_fg.matches:
                # remove track and pos
                _, match.id = splitId(match.id, "fg")

                # move to genomic coordinates
                contig, start, end = re.match(
                    r"(\S+):(\d+)..(\d+)", match.description).groups()
                if match.nmotifs > 0:
                    start = int(start)
                    match.start += start
                    match.end += start
                    match.positions = [x + start for x in match.positions]

                match_id = match.id
                if match_id not in controls:
                    P.warn("no controls for %s - increase MAST evalue" %
                           match_id)

                # substitute the evalue threshold for missing controls
                for side in ("l", "r"):
                    if side not in controls[match_id]:
                        controls[match_id][side] = (
                            float(PARAMS["mast_evalue"]), 1, 0, 0, 0, 0)

                left = controls[match_id]["l"]
                right = controls[match_id]["r"]
                min_evalue = min(left[0], right[0])
                min_pvalue = min(left[1], right[1])
                max_nmatches = max(left[2], right[2])

                tmpfile.write("\t".join(map(str, (
                    match,
                    motif_fg,
                    contig,
                    "\t".join(map(str, left)),
                    "\t".join(map(str, right)),
                    min_evalue,
                    min_pvalue,
                    max_nmatches))) + "\n")

        tmpfile.close()

        statement = '''
        python %(scriptsdir)s/csv2db.py %(csv2db_options)s
                  -b sqlite
                  --index=id
                  --index=motif
                  --index=id,motif
                  --table=%(tablename)s
                  --allow-empty
                  --map=base_qualities:text
        < %(tmpfilename)s > %(outfile)s
        '''

        P.run()
    finally:
        # do not leave the temporary table behind if anything fails
        tmpfile.close()
        os.unlink(tmpfilename)
def loadMAST(infile, outfile):
    '''parse mast file and load into database.

    Parse several motif runs and add them to the same table.

    Add columns for the control data as well.

    *infile* is split into sections at lines starting with ``::``.
    Section headers must look like ``:: motif = <motif> - <part> ::``
    and sections must alternate between ``foreground`` and
    ``background`` for the same motif.  For each foreground match one
    row is written, extended by the motif, the contig and the
    statistics of the left (``l``) and right (``r``) control matches
    sharing the same id.  Missing controls are filled in with the
    configured ``mast_evalue`` threshold.

    The table is loaded through ``csv2db.py``, whose output is
    redirected to *outfile*.

    Raises P.PipelineError if a section header can not be parsed.
    '''

    tablename = P.toTable(outfile)

    with IOTools.openFile(infile) as inf:
        lines = inf.readlines()

    # start lines of all sections plus a sentinel for the end of the file
    chunks = [x for x, line in enumerate(lines) if line.startswith("::")]
    chunks.append(len(lines))

    def readChunk(lines, chunk):
        '''parse section *chunk*; return (motif, part, parsed MAST result).'''
        header = lines[chunks[chunk]]
        try:
            motif, part = re.match(
                r":: motif = (\S+) - (\S+) ::", header).groups()
        except AttributeError:
            raise P.PipelineError(
                "parsing error in line '%s'" % header)

        E.info("reading %s - %s" % (motif, part))

        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.getTempFile(".")
        try:
            tmpfile2.write(
                "".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
            tmpfile2.close()
            # NOTE(review): the handle given to the parser is left open
            # as before - it is not visible here whether MAST.parse
            # reads eagerly.
            mast = MAST.parse(IOTools.openFile(tmpfile2.name, "r"))
        finally:
            # always remove the temporary file, also on errors
            tmpfile2.close()
            os.unlink(tmpfile2.name)

        return motif, part, mast

    def splitId(s, mode):
        '''split a match id into its components.

        A background ("bg") id has three parts: track _ id _ pos,
        a foreground ("fg") id has two: track _ id.

        track might contain '_'.
        '''
        d = s.split("_")
        if mode == "bg":
            return "_".join(d[:-2]), d[-2], d[-1]
        elif mode == "fg":
            return "_".join(d[:-1]), d[-1]

    tmpfile = P.getTempFile(".")
    # P.run() below interpolates local variables into ``statement``
    tmpfilename = tmpfile.name

    try:
        tmpfile.write(
            MAST.Match().header +
            "\tmotif\tcontig"
            "\tl_evalue\tl_pvalue\tl_nmatches\tl_length\tl_start\tl_end"
            "\tr_evalue\tr_pvalue\tr_nmatches\tr_length\tr_start\tr_end"
            "\tmin_evalue\tmin_pvalue\tmax_nmatches" + "\n")

        # sections come in pairs: foreground followed by background
        for chunk in range(0, len(chunks) - 1, 2):

            motif_fg, part, mast_fg = readChunk(lines, chunk)
            assert part == "foreground"
            motif_bg, part, mast_bg = readChunk(lines, chunk + 1)
            assert part == "background"
            assert motif_fg == motif_bg

            # index control data by id and position
            controls = collections.defaultdict(dict)
            for match in mast_bg.matches:
                _, match_id, pos = splitId(match.id, "bg")
                controls[match_id][pos] = (
                    match.evalue, match.pvalue, match.nmotifs,
                    match.length, match.start, match.end)

            for match in mast_fg.matches:
                # remove track and pos
                _, match.id = splitId(match.id, "fg")

                # move to genomic coordinates
                contig, start, end = re.match(
                    r"(\S+):(\d+)..(\d+)", match.description).groups()
                if match.nmotifs > 0:
                    start = int(start)
                    match.start += start
                    match.end += start
                    match.positions = [x + start for x in match.positions]

                match_id = match.id
                if match_id not in controls:
                    P.warn("no controls for %s - increase MAST evalue" %
                           match_id)

                # substitute the evalue threshold for missing controls
                for side in ("l", "r"):
                    if side not in controls[match_id]:
                        controls[match_id][side] = (
                            float(PARAMS["mast_evalue"]), 1, 0, 0, 0, 0)

                left = controls[match_id]["l"]
                right = controls[match_id]["r"]
                min_evalue = min(left[0], right[0])
                min_pvalue = min(left[1], right[1])
                max_nmatches = max(left[2], right[2])

                tmpfile.write("\t".join(map(str, (
                    match,
                    motif_fg,
                    contig,
                    "\t".join(map(str, left)),
                    "\t".join(map(str, right)),
                    min_evalue,
                    min_pvalue,
                    max_nmatches))) + "\n")

        tmpfile.close()

        statement = '''
        python %(scriptsdir)s/csv2db.py %(csv2db_options)s
                  -b sqlite
                  --index=id
                  --index=motif
                  --index=id,motif
                  --table=%(tablename)s
                  --allow-empty
                  --map=base_qualities:text
        < %(tmpfilename)s > %(outfile)s
        '''

        P.run()
    finally:
        # do not leave the temporary table behind if anything fails
        tmpfile.close()
        os.unlink(tmpfilename)
def loadGLAM2SCAN(infile, outfile):
    '''parse glam2scan output and load into database.

    Parse several motif runs and add them to the same table.

    *infile* is split into sections at lines starting with ``::``.
    Section headers must look like ``:: motif = <motif> ::``; sections
    without any content are skipped with a warning.  Match ids are
    split at ``_``: ids with exactly two fields are treated as full
    matches, all others as controls for the id in the second field.

    For each id with full matches one row is written with the number
    of matches, the best score, all scores, the number of control
    matches and the best control score (empty if there are none).
    Match coordinates are not converted to genomic coordinates.

    The table is loaded through ``csv2db.py``, whose output is
    redirected to *outfile*.

    Raises P.PipelineError if a section header can not be parsed.
    '''
    tablename = outfile[:-len(".load")]

    with IOTools.openFile(infile) as inf:
        lines = inf.readlines()

    # start lines of all sections plus a sentinel for the end of the file
    chunks = [x for x, line in enumerate(lines) if line.startswith("::")]
    chunks.append(len(lines))

    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    # P.run() below interpolates local variables into ``statement``
    tmpfilename = tmpfile.name

    try:
        tmpfile.write(
            "motif\tid\tnmatches\tscore\tscores\tncontrols\tmax_controls\n")

        for chunk in range(len(chunks) - 1):

            header = lines[chunks[chunk]]
            try:
                motif = re.match(
                    r":: motif = (\S+) ::", header).groups()[0]
            except AttributeError:
                raise P.PipelineError(
                    "parsing error in line '%s'" % header)

            if chunks[chunk] + 1 == chunks[chunk + 1]:
                L.warn("no results for motif %s - ignored" % motif)
                continue

            # use real file, as parser can not deal with a
            # list of lines
            tmpfile2 = tempfile.NamedTemporaryFile(delete=False)
            try:
                tmpfile2.write(
                    "".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
                tmpfile2.close()
                # NOTE(review): the handle given to the parser is left
                # open as before - it is not visible here whether
                # Glam2Scan.parse reads eagerly.
                glam = Glam2Scan.parse(IOTools.openFile(tmpfile2.name, "r"))
            finally:
                # always remove the temporary file, also on errors
                tmpfile2.close()
                os.unlink(tmpfile2.name)

            # collect control data
            full_matches = collections.defaultdict(list)
            controls = collections.defaultdict(list)
            for match in glam.matches:
                fields = match.id.split("_")
                _, match_id = fields[:2]
                if len(fields) == 2:
                    full_matches[match_id].append(match)
                else:
                    controls[match_id].append(match.score)

            for match_id, matches in full_matches.items():

                nmatches = len(matches)
                scores = [x.score for x in matches]
                score = max(scores)

                if match_id not in controls:
                    P.warn("no controls for %s - increase evalue?" %
                           match_id)

                c = controls[match_id]
                mmax = max(c) if c else ""

                tmpfile.write("\t".join(map(str, (
                    motif, match_id,
                    nmatches,
                    score,
                    ",".join(map(str, scores)),
                    len(c),
                    mmax))) + "\n")

        tmpfile.close()

        statement = '''
        python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
                  -b sqlite \
                  --index=id \
                  --index=motif \
                  --index=id,motif \
                  --table=%(tablename)s \
                  --map=base_qualities:text \
        < %(tmpfilename)s > %(outfile)s
        '''

        P.run()
    finally:
        # do not leave the temporary table behind if anything fails
        tmpfile.close()
        os.unlink(tmpfilename)
def loadGLAM2SCAN(infile, outfile):
    '''parse glam2scan output and load into database.

    Parse several motif runs and add them to the same table.

    *infile* is split into sections at lines starting with ``::``.
    Section headers must look like ``:: motif = <motif> ::``; sections
    without any content are skipped with a warning.  Match ids are
    split at ``_``: ids with exactly two fields are treated as full
    matches, all others as controls for the id in the second field.

    For each id with full matches one row is written with the number
    of matches, the best score, all scores, the number of control
    matches and the best control score (empty if there are none).
    Match coordinates are not converted to genomic coordinates.

    The table is loaded through ``csv2db.py``, whose output is
    redirected to *outfile*.

    Raises P.PipelineError if a section header can not be parsed.
    '''
    tablename = outfile[:-len(".load")]

    with IOTools.openFile(infile) as inf:
        lines = inf.readlines()

    # start lines of all sections plus a sentinel for the end of the file
    chunks = [x for x, line in enumerate(lines) if line.startswith("::")]
    chunks.append(len(lines))

    tmpfile = tempfile.NamedTemporaryFile(delete=False)
    # P.run() below interpolates local variables into ``statement``
    tmpfilename = tmpfile.name

    try:
        tmpfile.write(
            "motif\tid\tnmatches\tscore\tscores\tncontrols\tmax_controls\n")

        for chunk in range(len(chunks) - 1):

            header = lines[chunks[chunk]]
            try:
                motif = re.match(
                    r":: motif = (\S+) ::", header).groups()[0]
            except AttributeError:
                raise P.PipelineError(
                    "parsing error in line '%s'" % header)

            if chunks[chunk] + 1 == chunks[chunk + 1]:
                L.warn("no results for motif %s - ignored" % motif)
                continue

            # use real file, as parser can not deal with a
            # list of lines
            tmpfile2 = tempfile.NamedTemporaryFile(delete=False)
            try:
                tmpfile2.write(
                    "".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
                tmpfile2.close()
                # NOTE(review): the handle given to the parser is left
                # open as before - it is not visible here whether
                # Glam2Scan.parse reads eagerly.
                glam = Glam2Scan.parse(IOTools.openFile(tmpfile2.name, "r"))
            finally:
                # always remove the temporary file, also on errors
                tmpfile2.close()
                os.unlink(tmpfile2.name)

            # collect control data
            full_matches = collections.defaultdict(list)
            controls = collections.defaultdict(list)
            for match in glam.matches:
                fields = match.id.split("_")
                _, match_id = fields[:2]
                if len(fields) == 2:
                    full_matches[match_id].append(match)
                else:
                    controls[match_id].append(match.score)

            for match_id, matches in full_matches.items():

                nmatches = len(matches)
                scores = [x.score for x in matches]
                score = max(scores)

                if match_id not in controls:
                    P.warn("no controls for %s - increase evalue?" %
                           match_id)

                c = controls[match_id]
                mmax = max(c) if c else ""

                tmpfile.write("\t".join(map(str, (
                    motif, match_id,
                    nmatches,
                    score,
                    ",".join(map(str, scores)),
                    len(c),
                    mmax))) + "\n")

        tmpfile.close()

        statement = '''
        python %(scriptsdir)s/csv2db.py %(csv2db_options)s \
                  -b sqlite \
                  --index=id \
                  --index=motif \
                  --index=id,motif \
                  --table=%(tablename)s \
                  --map=base_qualities:text \
        < %(tmpfilename)s > %(outfile)s
        '''

        P.run()
    finally:
        # do not leave the temporary table behind if anything fails
        tmpfile.close()
        os.unlink(tmpfilename)
def printTracks(infile, outfile):
    '''print all tracks in EXPERIMENTS to stdout.

    A warning announcing the listing is issued first.  Each track is
    preceded by a line containing a single tab.  *infile* and
    *outfile* are not used.
    '''
    P.warn("\n\n\n\nprinting tracks:")
    for experiment in EXPERIMENTS:
        # a single parenthesised argument prints the same with the
        # print statement and the print function
        print("\t")
        print(experiment)
def buildExpressionTracks(infile, outfiles, map_exp2columns, suffix):
    '''build expression tracks.

    read the analysis from FILENAME_EXPRESSION

    ..note::
       The file A589_Data_RMA.csv does NOT always contain the probeset_id
       in the first column, but instead it might be the transcript_cluster_id.
       A possible explanation is that if several probesets map to the same
       transcript cluster, the transcript cluster is normalized.

       The set of cluster_id and probeset ids are completely non-overlapping.

    Hence, the :term:`cluster_id` will be used.

    Arguments:

    infile
       csv file; the first row holds the column names.
    outfiles
       sequence of at least two filename prefixes.  ``outfiles[0] + suffix``
       receives, in csv format, the header row and all rows that can
       not be mapped to a transcript.  ``outfiles[1] + suffix`` receives
       the selected annotation columns, tab-separated.
    map_exp2columns
       mapping of experiment to the column indices holding its values.
       For each experiment a line with the cluster id, a summary of
       the values and the values themselves is written to the file
       pool (pattern ``exp%s.data``).
    suffix
       suffix appended to the names in *outfiles*.

    Raises ValueError if *infile* has no column "Probe Set ID".
    '''

    E.info("importing expression data from %s" % infile)

    map_cluster2transcript, map_probeset2cluster = {}, {}
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()
        statement = "SELECT DISTINCT probeset, cluster_id, transcript_id FROM probeset2transcript"
        cc.execute(statement)
        for probeset, cluster, transcript_id in cc.fetchall():
            map_probeset2cluster[probeset] = cluster
            map_cluster2transcript[cluster] = transcript_id
    finally:
        dbhandle.close()

    # (column name in infile, column name in output)
    headers = (
        ("Probe Set ID", "cluster_id"),
        ("Gene Symbol", "genesymbol"),
        ("mRna - Description", "description"),
        ('mRNA Accession', 'mrna_id'),
        ('mRNA Source', 'source'),
        ('mRNA - xhyb', 'xhyb'),
        ('GO Biological Process ID', 'go_biol_id'),
        ('GO Biological Process Term', 'go_biol_term'),
        ('GO Cellular Component ID', 'go_cell_id'),
        ('GO Cellular Component Term', 'go_cell_term'),
        ('GO Molecular Function ID', 'go_mol_id'),
        ('GO Molecular Function Term', 'go_mol_term'),
        ('Pathway Source', 'pw_source'),
        ('Pathway Name', 'pw_name'))

    old_headers = set(x[0] for x in headers)
    new_headers = [x[1] for x in headers]

    # indices of the columns of infile that are copied to the annotation
    take = []
    # position of the probeset id within the copied columns
    index_probeset = None
    counts = E.Counter()
    found = set()

    # do not delete old files as this function is called several times
    output_files = IOTools.FilePool(output_pattern="exp%s.data", force=False)

    try:
        with open(infile, "rU") as inf, \
                open(outfiles[0] + suffix, "w") as outf, \
                open(outfiles[1] + suffix, "w") as outs:

            outf.write("# %s\n" % infile)
            outs.write("# %s\n" % infile)

            reader = csv.reader(inf)
            writer = csv.writer(outf)

            header = next(reader, None)
            if header is not None:
                writer.writerow(header)
                for x, old_header in enumerate(header):
                    if old_header == "Probe Set ID":
                        index_probeset = len(take)
                    if old_header in old_headers:
                        take.append(x)

                if index_probeset is None:
                    raise ValueError(
                        "no column 'Probe Set ID' in %s" % infile)

                # write headers to all files
                outs.write("\t".join(new_headers) + "\n")

                for exp, columns in map_exp2columns.items():
                    output_files.write(
                        exp,
                        "\t".join(
                            ("cluster_id",
                             Stats.Summary().getHeader(),
                             "\t".join(
                                 ["R%i" % i
                                  for i in range(len(columns))]))) + "\n")

            for row in reader:
                # "---" marks a missing value
                new_row = []
                for x in take:
                    value = row[x].strip()
                    new_row.append(value if value != "---" else "")

                probeset = new_row[index_probeset].strip()
                if probeset in map_probeset2cluster:
                    probeset = map_probeset2cluster[probeset]
                    counts.mapped_to_cluster += 1

                if probeset not in map_cluster2transcript:
                    # keep unmapped rows for inspection
                    writer.writerow(row)
                    counts.skipped += 1
                    continue

                if probeset in found:
                    counts.duplicates += 1
                counts.output += 1
                found.add(probeset)

                outs.write("\t".join(new_row) + "\n")

                for exp, cols in map_exp2columns.items():
                    data = [row[x] for x in cols]
                    output_files.write(
                        exp,
                        "\t".join(
                            (probeset,
                             str(Stats.Summary([float(x) for x in data])),
                             "\t".join(data))) + "\n")

        if counts.duplicates > 0:
            P.warn("duplicate probeset/clusters")

        P.info("probeset source information: %s" % str(counts))
    finally:
        output_files.close()
def buildExpressionTracks(infile, outfiles, map_exp2columns, suffix):
    '''build expression tracks.

    read the analysis from FILENAME_EXPRESSION

    ..note::
       The file A589_Data_RMA.csv does NOT always contain the probeset_id
       in the first column, but instead it might be the transcript_cluster_id.
       A possible explanation is that if several probesets map to the same
       transcript cluster, the transcript cluster is normalized.

       The set of cluster_id and probeset ids are completely non-overlapping.

    Hence, the :term:`cluster_id` will be used.

    Arguments:

    infile
       csv file; the first row holds the column names.
    outfiles
       sequence of at least two filename prefixes.  ``outfiles[0] + suffix``
       receives, in csv format, the header row and all rows that can
       not be mapped to a transcript.  ``outfiles[1] + suffix`` receives
       the selected annotation columns, tab-separated.
    map_exp2columns
       mapping of experiment to the column indices holding its values.
       For each experiment a line with the cluster id, a summary of
       the values and the values themselves is written to the file
       pool (pattern ``exp%s.data``).
    suffix
       suffix appended to the names in *outfiles*.

    Raises ValueError if *infile* has no column "Probe Set ID".
    '''

    E.info("importing expression data from %s" % infile)

    map_cluster2transcript, map_probeset2cluster = {}, {}
    dbhandle = sqlite3.connect(PARAMS["database"])
    try:
        cc = dbhandle.cursor()
        statement = "SELECT DISTINCT probeset, cluster_id, transcript_id FROM probeset2transcript"
        cc.execute(statement)
        for probeset, cluster, transcript_id in cc.fetchall():
            map_probeset2cluster[probeset] = cluster
            map_cluster2transcript[cluster] = transcript_id
    finally:
        dbhandle.close()

    # (column name in infile, column name in output)
    headers = (
        ("Probe Set ID", "cluster_id"),
        ("Gene Symbol", "genesymbol"),
        ("mRna - Description", "description"),
        ('mRNA Accession', 'mrna_id'),
        ('mRNA Source', 'source'),
        ('mRNA - xhyb', 'xhyb'),
        ('GO Biological Process ID', 'go_biol_id'),
        ('GO Biological Process Term', 'go_biol_term'),
        ('GO Cellular Component ID', 'go_cell_id'),
        ('GO Cellular Component Term', 'go_cell_term'),
        ('GO Molecular Function ID', 'go_mol_id'),
        ('GO Molecular Function Term', 'go_mol_term'),
        ('Pathway Source', 'pw_source'),
        ('Pathway Name', 'pw_name'))

    old_headers = set(x[0] for x in headers)
    new_headers = [x[1] for x in headers]

    # indices of the columns of infile that are copied to the annotation
    take = []
    # position of the probeset id within the copied columns
    index_probeset = None
    counts = E.Counter()
    found = set()

    # do not delete old files as this function is called several times
    output_files = IOTools.FilePool(output_pattern="exp%s.data", force=False)

    try:
        with open(infile, "rU") as inf, \
                open(outfiles[0] + suffix, "w") as outf, \
                open(outfiles[1] + suffix, "w") as outs:

            outf.write("# %s\n" % infile)
            outs.write("# %s\n" % infile)

            reader = csv.reader(inf)
            writer = csv.writer(outf)

            header = next(reader, None)
            if header is not None:
                writer.writerow(header)
                for x, old_header in enumerate(header):
                    if old_header == "Probe Set ID":
                        index_probeset = len(take)
                    if old_header in old_headers:
                        take.append(x)

                if index_probeset is None:
                    raise ValueError(
                        "no column 'Probe Set ID' in %s" % infile)

                # write headers to all files
                outs.write("\t".join(new_headers) + "\n")

                for exp, columns in map_exp2columns.items():
                    output_files.write(
                        exp,
                        "\t".join(
                            ("cluster_id",
                             Stats.Summary().getHeader(),
                             "\t".join(
                                 ["R%i" % i
                                  for i in range(len(columns))]))) + "\n")

            for row in reader:
                # "---" marks a missing value
                new_row = []
                for x in take:
                    value = row[x].strip()
                    new_row.append(value if value != "---" else "")

                probeset = new_row[index_probeset].strip()
                if probeset in map_probeset2cluster:
                    probeset = map_probeset2cluster[probeset]
                    counts.mapped_to_cluster += 1

                if probeset not in map_cluster2transcript:
                    # keep unmapped rows for inspection
                    writer.writerow(row)
                    counts.skipped += 1
                    continue

                if probeset in found:
                    counts.duplicates += 1
                counts.output += 1
                found.add(probeset)

                outs.write("\t".join(new_row) + "\n")

                for exp, cols in map_exp2columns.items():
                    data = [row[x] for x in cols]
                    output_files.write(
                        exp,
                        "\t".join(
                            (probeset,
                             str(Stats.Summary([float(x) for x in data])),
                             "\t".join(data))) + "\n")

        if counts.duplicates > 0:
            P.warn("duplicate probeset/clusters")

        P.info("probeset source information: %s" % str(counts))
    finally:
        output_files.close()