def loadPicardCoverageStats(infiles, outfile):
    '''import coverage statistics into database.

    Arguments
    ---------
    infiles : string
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''

    outf = P.get_temp_file(".")
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".cov")
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()

    P.load(outf.name,
           outfile,
           options="--ignore-empty --add-index=track")

    os.unlink(outf.name)

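# Usage sketch (not part of the original module): loadPicardCoverageStats
# expects one "<track>.cov" file per sample, each containing a commented
# header plus a two-line picard metrics table. A hypothetical ruffus task
# wiring it up might look like the commented example below; the "*.cov"
# glob and the target name are assumptions for illustration only.
#
#   @merge("*.cov", "picard_coverage_stats.load")
#   def mergePicardCoverageStats(infiles, outfile):
#       loadPicardCoverageStats(infiles, outfile)
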
def exportMotifLocations(infiles, outfile):
    '''export motif locations. There will be a bed-file per motif.

    Overlapping motif matches in different tracks will be merged.
    '''

    dbh = connect()
    cc = dbh.cursor()

    motifs = [x[0]
              for x in cc.execute("SELECT motif FROM motif_info").fetchall()]

    for motif in motifs:

        tmpf = P.get_temp_file(".")

        for infile in infiles:
            table = P.to_table(infile)
            track = P.snip(table, "_mast")
            for x in cc.execute(
                    """SELECT contig, start, end, '%(track)s', evalue
                    FROM %(table)s
                    WHERE motif = '%(motif)s' AND start IS NOT NULL""" %
                    locals()):
                tmpf.write("\t".join(map(str, x)) + "\n")
        tmpf.close()

        outfile = os.path.join(
            PARAMS["exportdir"], "motifs", "%s.bed.gz" % motif)
        tmpfname = tmpf.name

        statement = '''mergeBed -i %(tmpfname)s -nms | gzip > %(outfile)s'''
        P.run(statement)

        os.unlink(tmpf.name)

def loadIdxstats(infiles, outfile):
    '''take list of file paths to samtools idxstats output files and merge
    to create single dataframe containing mapped reads per contig for
    each track. This dataframe is then loaded into database.

    Loads tables into the database
        * idxstats_reads_per_chromosome

    Arguments
    ---------
    infiles : list
        list where each element is a string of the filename containing
        samtools idxstats output. Filename format is expected to be
        'sample.idxstats'
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''

    outf = P.get_temp_file(".")
    dfs = []
    for f in infiles:
        track = P.snip(f, ".idxstats").split('/')[-1]

        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue

        # reformat idx stats
        df = pandas.read_csv(f, sep='\t', header=None)
        df.columns = ['region', 'length', 'mapped', 'unmapped']

        # calc total reads mapped & unmapped
        total_reads = df.unmapped.sum() + df.mapped.sum()
        total_mapped_reads = df.mapped.sum()

        reformatted_df = pandas.DataFrame(
            [['total_mapped_reads', total_mapped_reads],
             ['total_reads', total_reads],
             ['track', track]],
            columns=(['region', 'mapped']))

        # reformat the df
        df = df.append(reformatted_df, ignore_index=True)
        df.set_index('region', inplace=True)
        df1 = df[['mapped']].T
        # set track as index
        df1.set_index('track', inplace=True)
        dfs.append(df1)

    # merge dataframes into single table
    master_df = pandas.concat(dfs)
    master_df.drop('*', axis=1, inplace=True)
    # transform dataframe to avoid reaching column limit
    master_df = master_df.T
    master_df.to_csv(outf, sep='\t', index=True)
    outf.close()

    P.load(outf.name,
           outfile,
           options="--ignore-empty --add-index=track")

    os.unlink(outf.name)

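# Illustrative sketch (not part of the original pipeline): the reshaping step
# above turns the four-column idxstats output into a single row per track,
# with one column per contig. The helper below shows the same idea for a
# single file; the column names mirror the assumptions made in loadIdxstats
# and the default filename is hypothetical.
def _example_idxstats_row(path="sample1.idxstats"):
    """Return a one-row DataFrame of mapped read counts per contig (sketch)."""
    df = pandas.read_csv(path, sep="\t", header=None,
                         names=["region", "length", "mapped", "unmapped"])
    # keep only the mapped counts, indexed by contig, then transpose to a row
    row = df.set_index("region")[["mapped"]].T
    # label the row with the track name derived from the filename
    row.index = [os.path.basename(path).replace(".idxstats", "")]
    return row
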
def loadTranscriptProfile(infiles, outfile,
                          suffix="transcript_profile",
                          tablename=None):
    '''load transcript profiles into one table.

    Arguments
    ---------
    infiles : string
        Filenames of files with matrix from bam2geneprofile. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to use as the table name.
    tablename : string
        Tablename to use. If unset, the table name will be set to
        `suffix`.
    '''

    if not tablename:
        tablename = "%s" % (suffix)

    outf = P.get_temp_file(".")

    table_count = 0
    table_join = None

    for infile in infiles:
        matrix_file = str(infile) + ".geneprofileabsolutedistancefromthreeprimeend.matrix.tsv.gz"
        name = P.snip(os.path.basename(infile), ".transcriptprofile.gz")

        table = pd.read_csv(matrix_file, sep="\t")
        table.rename(columns={'none': name}, inplace=True)
        table.drop(["area", "counts", "background"], axis=1, inplace=True)

        if table_count == 0:
            table_join = table
            table_count += 1
        else:
            table_join = table.merge(table_join,
                                     on=["bin", "region", "region_bin"],
                                     how="left")

    table_join.to_csv(outf, sep="\t", index=False)
    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=bin")

    os.unlink(outf.name)

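# Sketch (illustrative only): each per-track matrix contributes one column
# named after the track, keyed on (bin, region, region_bin), so the merged
# table loaded above looks roughly like
#
#   bin  region     region_bin  trackB  trackA
#   0    upstream   0           12      7
#   1    upstream   1           15      9
#   ...
#
# Track names and counts here are invented for illustration.
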
def loadBioProspector(infile, outfile):
    '''load results from bioprospector.'''

    target_path = os.path.join(
        os.path.abspath(P.get_params()["exportdir"]), "bioprospector")

    try:
        os.makedirs(target_path)
    except OSError:
        pass

    track = infile[:-len(".bioprospector")]

    results = Bioprospector.parse(IOTools.open_file(infile, "r"))

    tmpfile = P.get_temp_file()
    tmpfile.write("id\tmotif\tstart\tend\tstrand\tarrangement\n")

    for x, motifs in enumerate(results):
        outname = os.path.join(target_path, "%s_%02i.png" % (track, x))
        Bioprospector.build_logo([y.sequence for y in motifs.matches],
                                 outname)

        for match in motifs.matches:

            distance = abs(match.start + match.width1 -
                           (match.end - match.width2))

            if match.strand in ("+-", "-+"):
                arrangement = "ER"
            elif match.strand in ("++", "--"):
                arrangement = "DR"
            else:
                arrangement = "SM"
                distance = 0

            arrangement += "%i" % distance
            strand = match.strand[0]

            id = re.sub(".*_", "", match.id)
            tmpfile.write("%s\t%i\t%i\t%i\t%s\t%s\n" %
                          (id, x, match.start, match.end, strand,
                           arrangement))

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")

    os.unlink(tmpfile.name)

def loadCountReads(infiles, outfile,
                   suffix="nreads",
                   pipeline_suffix=".nreads",
                   tablename=None):
    '''load read counts.

    Arguments
    ---------
    infiles : string
        Filenames of files with number of reads per sample. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``to_table(outfile) + "_" + suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.to_table(outfile), suffix)

    outf = P.get_temp_file(".")
    outf.write("%s\t%s\n" % ("track", "nreads"))

    for filename in infiles:
        track = P.snip(os.path.basename(filename), pipeline_suffix)

        if not os.path.exists(filename):
            E.warn("File %s missing" % filename)
            continue

        lines = IOTools.open_file(filename, "r").readlines()
        for line in lines:
            count = line.split("\t")[1]
            outf.write("%s\t%s\n" % (track, count))

    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=track")

    os.unlink(outf.name)

def buildPicardDuplicationStats(infile, outfile):
    '''run picard:MarkDuplicates

    Record duplicate metrics using Picard, the marked records
    are discarded.

    Arguments
    ---------
    infile : string
        Input filename in :term:`BAM` format.
    outfile : string
        Output filename with picard output.
    '''

    job_memory = PICARD_MEMORY
    picard_opts = '-Xmx%(job_memory)s -XX:+UseParNewGC -XX:+UseConcMarkSweepGC' % locals()
    job_threads = 3

    if BamTools.getNumReads(infile) == 0:
        E.warn("no reads in %s - no metrics" % infile)
        IOTools.touch_file(outfile)
        return

    # currently, MarkDuplicates cannot handle split alignments from gsnap
    # these can be identified by the custom XT tag.
    if ".gsnap.bam" in infile:
        tmpf = P.get_temp_file(".")
        tmpfile_name = tmpf.name
        statement = '''samtools view -h %(infile)s
        | awk "!/\\tXT:/"
        | samtools view /dev/stdin -S -b > %(tmpfile_name)s;
        ''' % locals()
        data_source = tmpfile_name
    else:
        statement = ""
        data_source = infile

    statement += '''picard %(picard_opts)s MarkDuplicates
    INPUT=%(data_source)s
    ASSUME_SORTED=true
    METRICS_FILE=%(outfile)s
    OUTPUT=/dev/null
    VALIDATION_STRINGENCY=SILENT
    '''
    P.run(statement)

    if ".gsnap.bam" in infile:
        os.unlink(tmpfile_name)

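# Example (illustrative only): for a plain, non-gsnap BAM the statement
# assembled above renders to a single MarkDuplicates call along these lines.
# The -Xmx value comes from PICARD_MEMORY and the file names are placeholders.
#
#   picard -Xmx4G -XX:+UseParNewGC -XX:+UseConcMarkSweepGC MarkDuplicates
#       INPUT=sample.bam ASSUME_SORTED=true
#       METRICS_FILE=sample.duplication_metrics
#       OUTPUT=/dev/null VALIDATION_STRINGENCY=SILENT
#
# For gsnap BAMs, reads carrying the custom XT tag are first filtered out
# into a temporary BAM, which then serves as INPUT.
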
def loadMotifInformation(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.get_temp_file(".")

    outf.write("motif\n")

    for infile in infiles:
        if IOTools.is_empty(infile):
            continue
        motif = P.snip(infile, ".motif")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile, "--allow-empty-file")

    os.unlink(outf.name)

def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.get_temp_file(".")

    outf.write("track\n")

    for infile in infiles:
        if IOTools.is_empty(infile):
            continue
        motif = P.snip(infile, ".meme")
        outf.write("%s\n" % motif)

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)

def loadTomTom(infile, outfile):
    '''load tomtom results'''

    tablename = P.to_table(outfile)

    resultsdir = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading ")
        P.touch(outfile)
        return

    # get the motif name from the xml file
    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    name2alt = {}
    for motif in motifs.getiterator("motif"):
        name = motif.get("id")
        alt = motif.get("alt")
        name2alt[name] = alt

    tmpfile = P.get_temp_file(".")

    # parse the text file
    for line in IOTools.open_file(infile):
        if line.startswith("#Query"):
            tmpfile.write('\t'.join(
                ("target_name", "query_id", "target_id",
                 "optimal_offset", "pvalue", "evalue", "qvalue",
                 "Overlap", "query_consensus",
                 "target_consensus", "orientation")) + "\n")
            continue

        data = line[:-1].split("\t")
        target_name = name2alt[data[1]]
        tmpfile.write("%s\t%s" % (target_name, line))

    tmpfile.close()

    P.load(tmpfile.name, outfile)

    os.unlink(tmpfile.name)

def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.get_temp_file(".")

    outf.write("method\ttrack\n")

    for infile in infiles:
        if IOTools.is_empty(infile):
            continue
        method = re.match("(.+).dir/", infile).groups()[0]
        track = os.path.basename(".".join(infile.split(".")[:-1]))
        outf.write("%s\t%s\n" % (method, track))

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)

def genericImportAnnotator(infiles, outfile, table, workspace, slice,
                           subset, fdr_method):
    '''generic import of annotator results.

    Assumes that the suffix of all infiles is the same.
    '''

    infile = " ".join(infiles)
    x, suffix = os.path.splitext(infiles[0])

    tmpfilename = P.get_temp_filename()

    statement = '''
    cgat annotator2tsv \
    --method=fdr-table \
    --fdr-method=%(fdr_method)s \
    --log=%(outfile)s.log \
    --regex-identifier="(.*)%(suffix)s" \
    %(infile)s > %(tmpfilename)s
    '''
    P.run(statement)

    tmpfile = P.get_temp_file()

    for line in open(tmpfilename, "r"):
        if line.startswith("id"):
            line = "subset\tworkspace\tslice\t" + re.sub("^id", "track", line)
        else:
            line = "%s\t%s\t%s\t%s" % (subset, workspace, slice, line)
        tmpfile.write(line)
    tmpfile.close()

    tmpfilename2 = tmpfile.name

    statement = '''
    cgat csv2db %(csv2db_options)s \
    --table=%(table)s \
    < %(tmpfilename2)s > %(outfile)s'''
    P.run(**dict(list(locals().items()) + list(P.get_params().items())))

    os.unlink(tmpfilename)
    os.unlink(tmpfilename2)

def readChunk(lines, chunk):
    # use real file, as MAST parser can not deal with a
    # list of lines
    tmpfile2 = P.get_temp_file(".")
    try:
        motif, part = re.match(
            ":: motif = (\S+) - (\S+) ::", lines[chunks[chunk]]).groups()
    except AttributeError:
        raise ValueError(
            "parsing error in line '%s'" % lines[chunks[chunk]])

    E.info("reading %s - %s" % (motif, part))

    tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
    tmpfile2.close()

    mast = MAST.parse(IOTools.open_file(tmpfile2.name, "r"))

    os.unlink(tmpfile2.name)

    return motif, part, mast

def loadMemeChipSummary(infiles, outfile):
    '''load information about motifs into database.'''

    outf = P.get_temp_file(".")

    outf.write("track\tnpeaks\twidth\tmasking\tpath\n")

    for infile in infiles:
        if IOTools.is_empty(infile):
            continue
        fn = P.snip(os.path.basename(infile), ".memechip")

        track, npeaks, width, masking = fn.split(".")
        outf.write("\t".join(
            map(str, (track, npeaks, width, masking, fn))) + "\n")

    outf.close()

    P.load(outf.name, outfile)

    os.unlink(outf.name)

def loadStrandSpecificity(infiles, outfile,
                          suffix="strand",
                          tablename=None):
    '''load strand specificity estimates into a single table.

    Each infile is a tab-separated ``.strand`` file; the track name is
    derived from the filename and added as a column.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.to_table(outfile), suffix)

    outf = P.get_temp_file(".")

    table_count = 0
    table_join = None

    for infile in infiles:
        name = P.snip(os.path.basename(infile), ".strand")
        table = pd.read_csv(infile, sep="\t", comment="#")
        table["track"] = name

        if table_count == 0:
            table_join = table
            table_count += 1
        else:
            table_join = table.merge(table_join,
                                     on=["MSR", "ISR", "OSR", "ISF",
                                         "MSF", "OSF", "SF", "SR",
                                         "track"],
                                     how="outer")

    table_join.to_csv(outf, sep="\t", index=False)
    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=track")

    os.unlink(outf.name)

def loadMAST(infile, outfile):
    '''parse mast file and load into database.

    Parse several motif runs and add them to the same table.

    Add columns for the control data as well.
    '''

    tablename = P.to_table(outfile)

    tmpfile = P.get_temp_file(".")

    tmpfile.write(MAST.Match().header +
                  "\tmotif\tcontig"
                  "\tl_evalue\tl_pvalue\tl_nmatches\tl_length\tl_start\tl_end"
                  "\tr_evalue\tr_pvalue\tr_nmatches\tr_length\tr_start\tr_end"
                  "\tmin_evalue\tmin_pvalue\tmax_nmatches" + "\n")

    lines = IOTools.open_file(infile).readlines()
    chunks = [x for x in range(len(lines)) if lines[x].startswith("::")]
    chunks.append(len(lines))

    def readChunk(lines, chunk):
        # use real file, as MAST parser can not deal with a
        # list of lines
        tmpfile2 = P.get_temp_file(".")
        try:
            motif, part = re.match(
                ":: motif = (\S+) - (\S+) ::", lines[chunks[chunk]]).groups()
        except AttributeError:
            raise ValueError(
                "parsing error in line '%s'" % lines[chunks[chunk]])

        E.info("reading %s - %s" % (motif, part))

        tmpfile2.write("".join(lines[chunks[chunk] + 1:chunks[chunk + 1]]))
        tmpfile2.close()

        mast = MAST.parse(IOTools.open_file(tmpfile2.name, "r"))

        os.unlink(tmpfile2.name)

        return motif, part, mast

    def splitId(s, mode):
        '''split background match id

        has three parts: track _ id _ pos

        track might contain '_'.
        '''
        d = s.split("_")
        if mode == "bg":
            return "_".join(d[:-2]), d[-2], d[-1]
        elif mode == "fg":
            return "_".join(d[:-1]), d[-1]

    for chunk in range(0, len(chunks) - 1, 2):

        motif_fg, part, mast_fg = readChunk(lines, chunk)
        assert part == "foreground"
        motif_bg, part, mast_bg = readChunk(lines, chunk + 1)
        assert part == "background"
        assert motif_fg == motif_bg

        # index control data
        controls = collections.defaultdict(dict)
        for match in mast_bg.matches:
            track, id, pos = splitId(match.id, "bg")
            controls[id][pos] = (match.evalue, match.pvalue, match.nmotifs,
                                 match.length, match.start, match.end)

        for match in mast_fg.matches:
            # remove track and pos
            track, match.id = splitId(match.id, "fg")
            # move to genomic coordinates
            contig, start, end = re.match(
                "(\S+):(\d+)..(\d+)", match.description).groups()
            if match.nmotifs > 0:
                start, end = int(start), int(end)
                match.start += start
                match.end += start
                match.positions = [x + start for x in match.positions]

            id = match.id

            if id not in controls:
                E.warn("no controls for %s - increase MAST evalue" % id)

            if "l" not in controls[id]:
                controls[id]["l"] = (float(PARAMS["mast_evalue"]),
                                     1, 0, 0, 0, 0)
            if "r" not in controls[id]:
                controls[id]["r"] = (float(PARAMS["mast_evalue"]),
                                     1, 0, 0, 0, 0)

            min_evalue = min(controls[id]["l"][0], controls[id]["r"][0])
            min_pvalue = min(controls[id]["l"][1], controls[id]["r"][1])
            max_nmatches = max(controls[id]["l"][2], controls[id]["r"][2])

            tmpfile.write(str(match) +
                          "\t%s\t%s\t%s\t%s\t%s\t%s\t%s" % (
                              motif_fg,
                              contig,
                              "\t".join(map(str, controls[id]["l"])),
                              "\t".join(map(str, controls[id]["r"])),
                              str(min_evalue),
                              str(min_pvalue),
                              str(max_nmatches)) + "\n")

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")

    os.unlink(tmpfile.name)

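# Sketch (illustrative only): MAST sequence ids in this function are assumed
# to look like "<track>_<id>" for foreground and "<track>_<id>_<pos>" for
# background matches, where the track itself may contain underscores, so
# splitId splits from the right. With invented track names:
#
#   splitId("run1_liver_12", "fg")    ->  ("run1_liver", "12")
#   splitId("run1_liver_12_l", "bg")  ->  ("run1_liver", "12", "l")
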
def loadGOs(infiles, outfile, tablename):
    '''import GO results into a single table.

    This method also computes a global QValue over all tracks, genesets
    and annotation sets.

    Arguments
    ---------
    infiles : string
       Output files of several runGO analyses
    outfile : string
       Output filename, contains log information
    tablename : string
       Table name for storing results.
    '''

    header = False

    tempf1 = P.get_temp_file()

    pvalues = []

    for infile in infiles:
        indir = infile + ".dir"

        if not os.path.exists(indir):
            continue

        track, geneset, annotationset = re.search(
            "^(\S+)_vs_(\S+)\.(\S+)", infile).groups()

        for filename in glob.glob(os.path.join(indir, "*.overall")):
            for line in open(filename, "r"):
                if line.startswith("#"):
                    continue

                data = line[:-1].split("\t")

                if line.startswith("code"):
                    if header:
                        continue
                    tempf1.write("track\tgeneset\tannotationset\t%s" % line)
                    header = True
                    assert data[10] == "pover" and data[11] == "punder", \
                        "format error, expected pover-punder, got %s-%s" % \
                        (data[10], data[11])
                    continue

                tempf1.write("%s\t%s\t%s\t%s" %
                             (track, geneset, annotationset, line))
                pvalues.append(min(float(data[10]), float(data[11])))

    tempf1.close()

    E.info("analysing %i pvalues" % len(pvalues))
    fdr = Stats.doFDR(pvalues)
    E.info("got %i qvalues" % len(fdr.mQValues))
    qvalues = ["global_qvalue"] + fdr.mQValues

    tempf2 = P.get_temp_file()

    for line, qvalue in zip(open(tempf1.name, "r"), qvalues):
        tempf2.write("%s\t%s\n" % (line[:-1], str(qvalue)))
    tempf2.close()

    P.load(tempf2.name,
           outfile,
           tablename=tablename,
           options="--allow-empty-file "
           "--add-index=category "
           "--add-index=track,geneset,annotationset "
           "--add-index=geneset "
           "--add-index=annotationset "
           "--add-index=goid ")

    os.unlink(tempf1.name)
    os.unlink(tempf2.name)

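# Example (illustrative only): the regular expression above expects GO result
# files named "<track>_vs_<geneset>.<annotationset>". For a hypothetical
# "KO_vs_WT.goslim" it yields
#
#   re.search("^(\S+)_vs_(\S+)\.(\S+)", "KO_vs_WT.goslim").groups()
#   # -> ('KO', 'WT', 'goslim')
#
# The per-line pvalue is then the smaller of pover and punder, and all
# collected pvalues are converted to a single set of global qvalues.
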
def loadPicardMetrics(infiles, outfile, suffix,
                      pipeline_suffix=".picard_stats",
                      tablename=None):
    '''load picard metrics.

    Arguments
    ---------
    infiles : string
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``to_table(outfile) + "_" + suffix``.
    '''

    if not tablename:
        tablename = "%s_%s" % (P.to_table(outfile), suffix)

    outf = P.get_temp_file(".")

    filenames = ["%s.%s" % (x, suffix) for x in infiles]

    first = True
    for filename in filenames:
        track = P.snip(os.path.basename(filename),
                       "%s.%s" % (pipeline_suffix, suffix))

        if not os.path.exists(filename):
            E.warn("File %s missing" % filename)
            continue

        lines = IOTools.open_file(filename, "r").readlines()

        # extract metrics part
        rx_start = re.compile("## METRICS CLASS")
        for n, line in enumerate(lines):
            if rx_start.search(line):
                lines = lines[n + 1:]
                break

        for n, line in enumerate(lines):
            if not line.strip():
                lines = lines[:n]
                break

        if len(lines) == 0:
            E.warn("no lines in %s: %s" % (track, filename))
            continue

        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            fields = lines[0][:-1].split("\t")
        else:
            f = lines[0][:-1].split("\t")
            if f != fields:
                raise ValueError(
                    "file %s has different fields: expected %s, got %s" %
                    (filename, fields, f))

        first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))

    outf.close()

    P.load(outf.name,
           outfile,
           tablename=tablename,
           options="--add-index=track --allow-empty-file")

    os.unlink(outf.name)

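# Sketch (not part of the original module): the parsing loop above keeps only
# the block between the "## METRICS CLASS" marker and the first blank line of
# a picard stats file. A minimal standalone version of that step, under the
# same assumptions about the file layout:
def _example_extract_picard_metrics(lines):
    """Return header + data lines of the METRICS CLASS section (sketch)."""
    rx_start = re.compile("## METRICS CLASS")
    # drop everything up to and including the marker line
    for n, line in enumerate(lines):
        if rx_start.search(line):
            lines = lines[n + 1:]
            break
    # truncate at the first blank line
    for n, line in enumerate(lines):
        if not line.strip():
            return lines[:n]
    return lines
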
def loadIntervals(infile, outfile):
    '''load intervals from :term:`bed` formatted files into the database.

    If a :term:`bam` file is associated with a :term:`bed` file,
    re-evaluate the intervals by counting reads within the interval.
    In contrast to the initial pipeline, the genome is not binned.

       nprobes: number of reads in interval
       peakcenter: position with maximum number of reads in interval
       avgval: average coverage within interval
    '''

    tmpfile = P.get_temp_file(".")

    headers = ("avgval", "disttostart", "genelist", "length",
               "peakcenter", "peakval", "position", "interval_id",
               "npeaks", "nprobes",
               "contig", "start", "end", "score", "strand")

    tmpfile.write("\t".join(headers) + "\n")

    (avgval, contig, disttostart, end, genelist,
     length, peakcenter, peakval, position,
     start, interval_id, npeaks, nprobes) = \
        0, "", 0, 0, "", 0, 0, 0, 0, 0, 0, 0, 0

    track = Sample(filename=P.snip(infile, ".bed.gz"))

    bamfiles, offsets = getAssociatedBAMFiles(track)

    if bamfiles:
        E.info("%s: associated bamfiles = %s" % (track, bamfiles))
    else:
        E.info("%s: no bamfiles associated" % (track))

    # open all bamfiles
    samfiles = [pysam.Samfile(fn, "rb") for fn in bamfiles]

    c = E.Counter()

    # count tags
    for bed in Bed.iterator(IOTools.open_file(infile, "r")):

        c.input += 1

        if "name" not in bed:
            bed.name = c.input

        try:
            strand = bed["strand"]
        except IndexError:
            strand = "."

        # The fifth field of a bed file can be used to supply a
        # score. Our iterator returns the optional fields as a "fields
        # array". The first of these is the interval name, and the
        # second the score. The score may be more is better or less is
        # better.
        if len(bed.fields) > 1:
            value = bed.fields[1]
            if value != "":
                score = value
            else:
                score = 1
        else:
            score = 1

        if samfiles:
            npeaks, peakcenter, length, avgval, peakval, nprobes = \
                PipelinePeakcalling.countPeaks(
                    bed.contig, bed.start, bed.end, samfiles, offsets)
            if nprobes == 0:
                c.skipped_reads += 1

        else:
            # deal with bed12
            bed_intervals = bed.toIntervals()
            length = sum([e - s for s, e in bed_intervals])
            mid_point = length / 2
            for s, e in bed_intervals:
                peakcenter = s + mid_point
                if peakcenter >= e:
                    mid_point = peakcenter - e
                else:
                    break

            npeaks, avgval, peakval, nprobes = \
                (1, 1, 1, 1)

        c.output += 1
        tmpfile.write("\t".join(map(
            str,
            (avgval, disttostart, genelist, length,
             peakcenter, peakval, position, bed.name,
             npeaks, nprobes,
             bed.contig, bed.start, bed.end, score, strand))) + "\n")

    if c.output == 0:
        E.warn("%s - no aggregate intervals" % track)

    tmpfile.close()

    P.load(tmpfile.name,
           outfile,
           tablename=os.path.basename("%s_intervals" % track.asTable()),
           options="--allow-empty-file "
           "--add-index=interval_id")

    os.unlink(tmpfile.name)

    E.info("%s\n" % str(c))

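# Worked example (illustrative): in the bed12 branch above, for blocks
# [(100, 150), (200, 230)] the total length is 80, so the midpoint lies 40
# bases in. The loop first tries 100 + 40 = 140, which falls inside the first
# block, so peakcenter is 140. Had the midpoint overshot the first block, the
# remaining offset would be carried into the next block before settling on a
# peakcenter.
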
def getRepeatsFromUCSC(dbhandle, repclasses, outfile,
                       remove_contigs_regex=None):
    '''download repeats from UCSC database and write to `outfile` in
    :term:`gff` format.

    This method downloads repeats from the repeatmasker track at
    the UCSC.

    Arguments
    ---------
    dbhandle : object
       Database handle to UCSC mysql database
    repclasses : list
       List of repeat classes to select. If empty, all repeat classes
       will be collected.
    outfile : string
       Filename of output file in :term:`gff` format.
    remove_contigs_regex : list
       If given, remove repeats on contigs matching the regular
       expression given.
    '''

    # Repeats are either stored in a single ``rmsk`` table (hg19) or in
    # individual ``rmsk`` tables (mm9) like chr1_rmsk, chr2_rmsk, ....
    # In order to do a single statement, the ucsc mysql database is
    # queried for tables that end in rmsk.
    cc = dbhandle.execute("SHOW TABLES LIKE '%%rmsk'")
    tables = [x[0] for x in cc.fetchall()]
    if len(tables) == 0:
        raise ValueError("could not find any `rmsk` tables")

    # now collect repeats
    tmpfile = P.get_temp_file(".")

    for table in tables:

        sql = """SELECT genoName, 'repeat', 'exon', genoStart+1, genoEnd,
        '.', strand, '.',
        CONCAT('class \\"', repClass, '\\"; family \\"',
        repFamily, '\\"; repName \\"', repName, '\\";')
        FROM %(table)s"""

        if repclasses:
            repclasses_str = ",".join(
                ["'" + x.strip() + "'" for x in repclasses])
            sql += ''' WHERE repClass in (%(repclasses_str)s) ''' % locals()

        sql = sql % locals()

        E.debug("executing sql statement: %s" % sql)
        cc = dbhandle.execute(sql)
        for data in cc.fetchall():
            tmpfile.write("\t".join(map(str, data)) + "\n")

    tmpfile.close()

    # sort gff and make sure that names are correct
    tmpfilename = tmpfile.name

    statement = ['''cat %(tmpfilename)s
    | sort -t$'\\t' -k1,1 -k4,4n
    | cgat gff2gff
    --method=sanitize
    --sanitize-method=genome
    --skip-missing
    --genome-file=%(genome_dir)s/%(genome)s
    --log=%(outfile)s.log ''']

    if remove_contigs_regex:
        statement.append('--contig-pattern="{}"'.format(
            ",".join(remove_contigs_regex)))

    statement.append('| gzip > %(outfile)s')

    statement = " ".join(statement)

    P.run(statement)

    os.unlink(tmpfilename)

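# Example (illustrative only): for a single "rmsk" table and
# repclasses = ["SINE", "LINE"], the statement assembled above renders
# roughly to
#
#   SELECT genoName, 'repeat', 'exon', genoStart+1, genoEnd, '.', strand, '.',
#          CONCAT('class "', repClass, '"; family "', repFamily,
#                 '"; repName "', repName, '";')
#   FROM rmsk WHERE repClass in ('SINE','LINE')
#
# i.e. one GFF-style row per repeat, with class, family and name packed into
# the attribute column.
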
def buildGenomicFunctionalAnnotation(gtffile, dbh, outfiles,
                                     job_memory="4G"):
    '''output a bed file with functional annotations.

    The genomic region a gene covers is taken from the `gtffile`.
    There should only be one entry per gene, i.e. exons should have
    been combined into a gene territory.

    Each entry in the output bed file is a gene territory. Bed entries
    are labeled by functional annotations associated by that gene.

    Ambiguities in territories are resolved by outputting annotations
    for all genes within a territory.

    The output file contains annotations for both GO and GOSlim. These
    are prefixed by ``go:`` and ``goslim:``.

    Arguments
    ---------
    gtffile : string
       ENSEMBL geneset in :term:`gtf` format.
    dbh : object
       Database handle to retrieve GO assignments for each gene
    outfiles : list
       Output filenames. The first is a :term:`bed` formatted file
       of gene territories. The second is a :term:`tsv` formatted
       table mapping GO terms to their description.
    '''
    outfile_bed, outfile_tsv = outfiles

    gene2region = {}
    for gtf in GTF.iterator(IOTools.open_file(gtffile, "r")):
        gid = gtf.gene_id.split(":")
        for g in gid:
            gene2region[g] = (gtf.contig, gtf.start, gtf.end, gtf.strand)

    cc = dbh.cursor()

    outf = P.get_temp_file(".")
    c = E.Counter()
    term2description = {}
    for db in ('go', 'goslim'):
        for gene_id, go_id, description in cc.execute(
                "SELECT gene_id, go_id, description FROM %s_assignments" % db):
            try:
                contig, start, end, strand = gene2region[gene_id]
            except KeyError:
                c.notfound += 1
                continue
            outf.write(
                "\t".join(map(str, (
                    contig, start, end,
                    "%s:%s" % (db, go_id), 1, strand))) + "\n")
            term2description["%s:%s" % (db, go_id)] = description
    outf.close()
    tmpfname = outf.name

    statement = '''sort -k1,1 -k2,2n < %(tmpfname)s | uniq
    | gzip > %(outfile_bed)s'''

    P.run(statement, job_memory=job_memory)

    outf = IOTools.open_file(outfile_tsv, "w")
    outf.write("term\tdescription\n")
    for term, description in term2description.items():
        outf.write("%s\t%s\n" % (term, description))
    outf.close()

    os.unlink(tmpfname)

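# Example (illustrative only): a gene territory on chr1:1000-5000 (+)
# annotated with GO:0008150 would be written as the bed line
#
#   chr1    1000    5000    go:GO:0008150   1       +
#
# with one line per (gene, GO term) pair; the sort | uniq step in the
# function then removes duplicate territory/term combinations. The
# coordinates and GO id here are invented for illustration.
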
def buildCDSFasta(infiles, outfile):
    '''output CDS sequences.

    This method works by taking the CDNA and peptide sequence of a
    particular transcript and aligning them in order to remove any
    frameshifts.

    .. note::
        This method is untested.

    Arguments
    ---------
    infiles : list
       Filenames of the cDNA sequences and the peptide sequences in
       :term:`fasta` format.
    outfile : string
       indexed file in :term:`fasta` format with CDS sequences.
    '''

    infile_cdnas, infile_peptides_fasta = infiles

    dbname = outfile[:-len(".fasta")]

    statement = '''gunzip < %(infile_cdnas)s
    | cgat gff2fasta
    --is-gtf
    --genome=%(genome_dir)s/%(genome)s
    | cgat index_fasta %(dbname)s --force-output -
    > %(dbname)s.log
    '''
    P.run(statement)

    tmpfile = P.get_temp_file(".")

    dbhandle = sqlite3.connect(PARAMS["database_name"])
    cc = dbhandle.cursor()
    tmpfile.write("protein_id\ttranscript_id\n")
    tmpfile.write("\n".join(
        ["%s\t%s" % x for x in
         cc.execute("SELECT DISTINCT protein_id, transcript_id "
                    "FROM transcript_info")]))
    tmpfile.write("\n")
    tmpfile.close()

    tmpfilename = tmpfile.name

    statement = '''
    cgat peptides2cds
    --peptides-fasta-file=%(infile_peptides_fasta)s
    --cdnas=%(infile_cdnas)s
    --map=%(tmpfilename)s
    --output-format=fasta
    --log=%(outfile)s.log
    | cgat index_fasta %(dbname)s --force-output -
    > %(dbname)s.log
    '''
    P.run(statement)

    os.unlink(tmpfilename)
