def loadQcMeasures(infile, outfile):
    '''load QC measures into CSVDB.'''
    P.load(infile, outfile, options="--add-index=track")


def loadExonValidation(infiles, outfile):
    '''load individual and merged exon validation stats.

    For each sample, the exon validation stats are loaded into a table
    named by sample and mapper: [sample]_[mapper]_overrun.

    The alignment stats for all samples are merged and loaded into a
    single table called exon_validation.

    Parameters
    ----------
    infiles : list
        Input filenames with exon validation stats
    outfile : str
        Output filename
    '''
    suffix = ".exon.validation.tsv.gz"

    P.merge_and_load(infiles, outfile, suffix=suffix)
    for infile in infiles:
        track = P.snip(infile, suffix)
        o = "%s_overrun.load" % track
        P.load(infile + ".overrun.gz", o)


def loadNCG(outfile):
    '''load NCG into database.'''
    infile = PARAMS["cancergenes_table"]
    # infile = "/ifs/projects/proj053/backup/NCG/cancergenes2016.tsv"
    P.load(infile, outfile, options="--add-index=symbol")


def loadSleuthTableGenes(infile, outfile, gene_info, gene_biotypes,
                         database, annotations_database):
    '''load sleuth gene-level results into the database, annotated
    with gene names from the annotations database.'''
    tmpfile = P.getTempFilename("/ifs/scratch/")

    table = os.path.basename(gene_info)

    if gene_biotypes:
        where_cmd = "WHERE " + " OR ".join(
            ["gene_biotype = '%s'" % x
             for x in gene_biotypes.split(",")])
    else:
        where_cmd = ""

    select = """SELECT DISTINCT gene_id, gene_name
    FROM annotations.%(table)s %(where_cmd)s""" % locals()

    df1 = pd.read_table(infile, sep="\t")
    df1.set_index("test_id", drop=False, inplace=True)

    df2 = pd.read_sql(select, connect(database, annotations_database))
    df2.set_index("gene_id", drop=False, inplace=True)

    df = df1.join(df2)
    df.to_csv(tmpfile, sep="\t", index=True)

    options = "--add-index=gene_id"
    P.load(tmpfile, outfile, options=options)
    os.unlink(tmpfile)


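# A minimal, self-contained sketch (illustration only, not part of the
# pipeline) of the join performed in loadSleuthTableGenes above: sleuth
# results indexed on test_id are joined against gene annotations indexed
# on gene_id. The function name and toy values are invented.
def _sketch_sleuth_gene_join():
    import pandas as pd
    sleuth = pd.DataFrame(
        {"test_id": ["ENSG01", "ENSG02"], "pval": [0.01, 0.5]})
    sleuth.set_index("test_id", drop=False, inplace=True)
    anno = pd.DataFrame(
        {"gene_id": ["ENSG01", "ENSG02"], "gene_name": ["ABC1", "DEF2"]})
    anno.set_index("gene_id", drop=False, inplace=True)
    # join aligns on the indices, i.e. test_id against gene_id
    return sleuth.join(anno)
# e.g.: print(_sketch_sleuth_gene_join())

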
def loadMutectExtendedOutput(infile, outfile):
    '''load mutect extended output into database.'''
    infile = infile.replace(".mutect.snp.vcf", "_call_stats.out")

    indices = "contig,position"
    P.load(infile, outfile, options="--add-index=%(indices)s" % locals())


def loadPicardCoverageStats(infiles, outfile):
    '''import coverage statistics into database.

    Arguments
    ---------
    infiles : string
        Filenames of files with picard metric information. Each file
        corresponds to a different track.
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''
    outf = P.get_temp_file(".")
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".cov")
        lines = [x for x in open(f, "r").readlines()
                 if not x.startswith("#") and x.strip()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        outf.write("%s\t%s" % (track, lines[1]))
    outf.close()

    P.load(outf.name, outfile,
           options="--ignore-empty --add-index=track")
    os.unlink(outf.name)


def loadSailfish(infile, outfile):
    '''load Sailfish gene counts data into CSVDB.'''
    P.load(infile, outfile)


def loadExonValidation(infiles, outfile):
    '''merge alignment stats into a single table.'''
    suffix = ".exon.validation.tsv.gz"
    P.merge_and_load(infiles, outfile, suffix=suffix)
    for infile in infiles:
        track = P.snip(infile, suffix)
        o = "%s_overrun.load" % track
        P.load(infile + ".overrun.gz", o)


def loadAnnotations(infile, outfile):
    '''load variant annotations into database.'''
    P.load(infile, outfile,
           options="--map=gene_id:str "
           "--add-index=gene_id "
           "--map=base_qualities:text ")


def loadIdxstats(infiles, outfile):
    '''take a list of file paths to samtools idxstats output files and
    merge them into a single dataframe containing mapped reads per
    contig for each track. This dataframe is then loaded into the
    database.

    Loads tables into the database
        * idxstats_reads_per_chromosome

    Arguments
    ---------
    infiles : list
        list where each element is a string of the filename containing
        samtools idxstats output. Filename format is expected to be
        'sample.idxstats'
    outfile : string
        Logfile. The table name will be derived from `outfile`.
    '''
    outf = P.get_temp_file(".")
    dfs = []
    for f in infiles:
        track = P.snip(f, ".idxstats").split('/')[-1]

        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue

        # reformat idxstats output
        df = pandas.read_csv(f, sep='\t', header=None)
        df.columns = ['region', 'length', 'mapped', 'unmapped']

        # calculate total reads mapped & unmapped
        total_reads = df.unmapped.sum() + df.mapped.sum()
        total_mapped_reads = df.mapped.sum()

        reformatted_df = pandas.DataFrame(
            [['total_mapped_reads', total_mapped_reads],
             ['total_reads', total_reads],
             ['track', track]],
            columns=(['region', 'mapped']))

        # reformat the df
        df = df.append(reformatted_df, ignore_index=True)
        df.set_index('region', inplace=True)
        df1 = df[['mapped']].T
        # set track as index
        df1.set_index('track', inplace=True)
        dfs.append(df1)

    # merge dataframes into a single table
    master_df = pandas.concat(dfs)
    master_df.drop('*', axis=1, inplace=True)
    # transpose dataframe to avoid reaching the column limit
    master_df = master_df.T
    master_df.to_csv(outf, sep='\t', index=True)
    outf.close()

    P.load(outf.name, outfile,
           options="--ignore-empty --add-index=track")
    os.unlink(outf.name)


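# A minimal, self-contained sketch (illustration only, not part of the
# pipeline) of the per-track reshaping in loadIdxstats above, run on a
# toy samtools idxstats table (columns: contig, length, mapped,
# unmapped). The function name and numbers are invented; pandas.concat
# is used here in place of the deprecated DataFrame.append.
def _sketch_idxstats_reshape(track="sample1"):
    import io
    import pandas
    toy = "chr1\t1000\t120\t3\nchr2\t800\t90\t2\n*\t0\t0\t5\n"
    df = pandas.read_csv(io.StringIO(toy), sep="\t", header=None,
                         names=["region", "length", "mapped", "unmapped"])
    totals = pandas.DataFrame(
        [["total_mapped_reads", df.mapped.sum()],
         ["total_reads", df.mapped.sum() + df.unmapped.sum()],
         ["track", track]],
        columns=["region", "mapped"])
    df = pandas.concat([df, totals], ignore_index=True)
    df.set_index("region", inplace=True)
    # one row per track, one column per contig (plus the totals)
    return df[["mapped"]].T.set_index("track")
# e.g.: print(_sketch_idxstats_reshape())

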
def loadVariantAnnotation(infile, outfile):
    '''load VCF annotations into database.'''
    if infile.endswith("indels.annotated.filtered.tsv") or \
            infile.endswith("mutect.snp.annotated.filtered.tsv"):
        indices = "CHROM,POS,SNPEFF_GENE_NAME"

    P.load(infile, outfile, options="--add-index=%(indices)s" % locals())


def loadCPCResults(infile, outfile):
    '''load the results of the CPC analysis.'''
    P.load(infile, outfile,
           options="--header-names=transcript_id,feature,C_NC,CP_score "
           "--add-index=transcript_id")


def loadPolyphenMap(infile, outfile):
    '''load polyphen input data.'''
    P.load(infile + ".map", outfile,
           options="--add-index=snp_id "
           "--add-index=track,transcript_id "
           "--add-index=contig,pos "
           "--add-index=protein_id "
           "--add-index=transcript_id ")


def loadTranscriptProfile(infiles, outfile,
                          suffix="transcript_profile",
                          tablename=None):
    '''load transcript profiles into one table.

    Arguments
    ---------
    infiles : string
        Filenames of files with the matrix from bam2geneprofile. Each
        file corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to use as the table name.
    tablename : string
        Tablename to use. If unset, `suffix` is used as the table name.
    '''
    if not tablename:
        tablename = "%s" % (suffix)

    outf = P.get_temp_file(".")

    table_count = 0
    table_join = None

    for infile in infiles:
        matrix_file = str(infile) + \
            ".geneprofileabsolutedistancefromthreeprimeend.matrix.tsv.gz"
        name = P.snip(os.path.basename(infile), ".transcriptprofile.gz")

        table = pd.read_csv(matrix_file, sep="\t")
        table.rename(columns={'none': name}, inplace=True)
        table.drop(["area", "counts", "background"], axis=1, inplace=True)

        if table_count == 0:
            table_join = table
            table_count += 1
        else:
            table_join = table.merge(table_join,
                                     on=["bin", "region", "region_bin"],
                                     how="left")

    table_join.to_csv(outf, sep="\t", index=False)
    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=bin")

    os.unlink(outf.name)


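# A minimal sketch (illustration only, not part of the pipeline) of the
# merge performed in loadTranscriptProfile above: each sample's "none"
# column is renamed to the sample name and the tables are merged on
# bin/region/region_bin. Sample names and values are invented.
def _sketch_profile_merge():
    import pandas as pd
    a = pd.DataFrame({"bin": [0, 1], "region": ["exon", "exon"],
                      "region_bin": [0, 1], "none": [1.0, 2.0]})
    b = a.copy()
    b["none"] = [3.0, 4.0]
    a = a.rename(columns={"none": "sampleA"})
    b = b.rename(columns={"none": "sampleB"})
    # one column per sample, aligned on the shared bin coordinates
    return b.merge(a, on=["bin", "region", "region_bin"], how="left")
# e.g.: print(_sketch_profile_merge())

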
def loadBioProspector(infile, outfile):
    '''load results from bioprospector.'''
    target_path = os.path.join(
        os.path.abspath(P.get_params()["exportdir"]), "bioprospector")
    try:
        os.makedirs(target_path)
    except OSError:
        pass

    track = infile[:-len(".bioprospector")]

    results = Bioprospector.parse(IOTools.open_file(infile, "r"))

    tmpfile = P.get_temp_file()
    tmpfile.write("id\tmotif\tstart\tend\tstrand\tarrangement\n")

    for x, motifs in enumerate(results):
        outname = os.path.join(target_path, "%s_%02i.png" % (track, x))
        Bioprospector.build_logo([y.sequence for y in motifs.matches],
                                 outname)

        for match in motifs.matches:
            distance = abs(match.start + match.width1 -
                           (match.end - match.width2))

            if match.strand in ("+-", "-+"):
                arrangement = "ER"
            elif match.strand in ("++", "--"):
                arrangement = "DR"
            else:
                arrangement = "SM"
                distance = 0

            arrangement += "%i" % distance
            strand = match.strand[0]

            id = re.sub(".*_", "", match.id)
            tmpfile.write("%s\t%i\t%i\t%i\t%s\t%s\n" %
                          (id, x, match.start, match.end, strand,
                           arrangement))

    tmpfile.close()

    P.load(tmpfile.name, outfile,
           options="--add-index=id "
           "--add-index=motif "
           "--add-index=id,motif "
           "--allow-empty-file "
           "--map=base_qualities:text")

    os.unlink(tmpfile.name)


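# A minimal sketch (illustration only, not part of the pipeline) of the
# motif-pair classification used in loadBioProspector above: "+-"/"-+"
# strand pairs are coded ER, "++"/"--" pairs DR, anything else SM, and
# the code is suffixed with the spacing between the two motif copies.
# The function name is invented.
def _sketch_arrangement(strand, start, width1, end, width2):
    distance = abs(start + width1 - (end - width2))
    if strand in ("+-", "-+"):
        code = "ER"
    elif strand in ("++", "--"):
        code = "DR"
    else:
        code = "SM"
        distance = 0
    return "%s%i" % (code, distance)
# e.g.: _sketch_arrangement("+-", 10, 8, 30, 8) == "ER4"

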
def loadCountReads(infiles, outfile,
                   suffix="nreads",
                   pipeline_suffix=".nreads",
                   tablename=None):
    '''load read counts.

    Arguments
    ---------
    infiles : string
        Filenames of files with the number of reads per sample. Each
        file corresponds to a different track.
    outfile : string
        Logfile.
    suffix : string
        Suffix to append to table name.
    pipeline_suffix : string
        Suffix to remove from track name.
    tablename : string
        Tablename to use. If unset, the table name will be derived
        from `outfile` and suffix as ``to_table(outfile) + "_" + suffix``.
    '''
    if not tablename:
        tablename = "%s_%s" % (P.to_table(outfile), suffix)

    outf = P.get_temp_file(".")
    outf.write("%s\t%s\n" % ("track", "nreads"))

    for filename in infiles:
        track = P.snip(os.path.basename(filename), pipeline_suffix)

        if not os.path.exists(filename):
            E.warn("File %s missing" % filename)
            continue

        lines = IOTools.open_file(filename, "r").readlines()

        for line in lines:
            count = line.split("\t")[1]
            outf.write("%s\t%s\n" % (track, count))

    outf.close()

    P.load(infile=outf.name,
           outfile=outfile,
           tablename=tablename,
           options="--add-index=track")

    os.unlink(outf.name)


def loadManualAnnotations(infile, outfile):
    '''load manual annotations, tagging each gene with the annotation
    name derived from the input filename.'''
    tmp = P.get_temp_filename(".")

    annotation = P.snip(infile, "_annotations.tsv")

    with IOTools.open_file(tmp, "w") as outf:
        outf.write("%s\tgene_id\n" % annotation)
        with IOTools.open_file(infile, "r") as inf:
            for line in inf:
                outf.write("%s\t%s" % (annotation, line))

    P.load(tmp, outfile, options="--add-index=gene_id")
    os.unlink(tmp)


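# A minimal sketch (illustration only, not part of the pipeline) of the
# table that loadManualAnnotations builds: the first column header names
# the annotation set and each input gene id becomes one row tagged with
# that annotation. The function name is invented.
def _sketch_manual_annotation_table(annotation, gene_ids):
    lines = ["%s\tgene_id" % annotation]
    lines.extend("%s\t%s" % (annotation, gene_id)
                 for gene_id in gene_ids)
    return "\n".join(lines) + "\n"
# e.g.: print(_sketch_manual_annotation_table("tf", ["ENSG01", "ENSG02"]))

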
def mergeEffects(infiles, outfile):
    '''load transcript effects into a single table.'''
    tablename = P.toTable(outfile)
    outf = open('effects.txt', 'w')
    first = True
    for f in infiles:
        track = P.snip(os.path.basename(f), ".effects.gz")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in gzip.open(f, "rt").readlines()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    P.load(outf.name, outfile, options="--add-index=transcript_id")

    for suffix in ("cds", "intron", "splicing", "translation", "genes"):

        outf = open('effects.' + suffix + '.txt', 'w')
        first = True
        for f in infiles:
            track = P.snip(os.path.basename(f), ".effects.gz")
            statfile = f + "." + suffix + ".gz"
            if not os.path.exists(statfile):
                E.warn("File %s missing" % statfile)
                continue
            lines = [x for x in gzip.open(statfile, "rt").readlines()]
            if first:
                outf.write("%s\t%s" % ("track", lines[0]))
                first = False
            for i in range(1, len(lines)):
                outf.write("%s\t%s" % (track, lines[i]))
        outf.close()

        P.load(outf.name, outfile,
               tablename=tablename + "_" + suffix,
               options="--add-index=transcript_id "
               "--allow-empty-file "
               "--ignore-column=seq_na "
               "--ignore-column=seq_aa")


def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''
    outf = P.get_temp_file(".")
    outf.write("track\n")
    for infile in infiles:
        if IOTools.is_empty(infile):
            continue
        motif = P.snip(infile, ".meme")
        outf.write("%s\n" % motif)
    outf.close()
    P.load(outf.name, outfile)
    os.unlink(outf.name)


def loadExpression(infile, outfile):
    '''load expression estimates unless they are already in the
    database, then dump the utrons_expression table to a flat file.'''
    if not os.path.isfile(outfile):
        P.load(infile, outfile,
               options="-i Sample -i gene_id -i transcript_id",
               job_memory="16G")

    if not os.path.isfile("expression.dir/utrons_expression.txt"):
        subprocess.call([
            "sqlite3", PARAMS["database_name"],
            ".headers on", ".mode tab",
            ".output expression.dir/utrons_expression.txt",
            "select * from utrons_expression"])


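# A minimal sketch (illustration only, not part of the pipeline) of the
# export step above done with Python's sqlite3 module instead of the
# sqlite3 command line. The function name is invented; pass a real
# database path, table name and output path.
def _sketch_export_table(database, table, outpath):
    import csv
    import sqlite3
    conn = sqlite3.connect(database)
    cursor = conn.execute("SELECT * FROM %s" % table)
    with open(outpath, "w", newline="") as outf:
        writer = csv.writer(outf, delimiter="\t")
        # header row, equivalent to ".headers on"
        writer.writerow([col[0] for col in cursor.description])
        writer.writerows(cursor)
    conn.close()
# e.g.: _sketch_export_table("csvdb", "utrons_expression", "dump.txt")

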
def loadMotifInformation(infiles, outfile):
    '''load information about motifs into database.'''
    outf = P.get_temp_file(".")
    outf.write("motif\n")
    for infile in infiles:
        if IOTools.is_empty(infile):
            continue
        motif = P.snip(infile, ".motif")
        outf.write("%s\n" % motif)
    outf.close()
    P.load(outf.name, outfile, "--allow-empty-file")
    os.unlink(outf.name)


def loadMatchResults(infile, outfile):
    '''load the results of the match analysis into an sqlite database.'''
    temp = P.getTempFile("./match.dir")
    temp.write("seq_id\tmatrix_id\tposition\tstrand\t"
               "core_score\tmatrix_score\tsequence\n")
    for details in PipelineTFM.match_iterator(infile):
        temp.write("\t".join(map(str, [
            details.seq_id,
            details.matrix_id,
            details.position,
            details.strand,
            details.core_score,
            details.matrix_score,
            details.sequence])) + "\n")
    temp.close()

    P.load(temp.name, outfile, options="--add-index=seq_id")
    os.unlink(temp.name)


def loadTomTom(infile, outfile):
    '''load tomtom results.'''
    tablename = P.to_table(outfile)

    resultsdir = os.path.join(
        os.path.abspath(PARAMS["exportdir"]), "tomtom", infile)
    xml_file = os.path.join(resultsdir, "tomtom.xml")

    if not os.path.exists(xml_file):
        E.warn("no tomtom output - skipped loading")
        P.touch(outfile)
        return

    # get the motif names from the xml file
    tree = xml.etree.ElementTree.ElementTree()
    tree.parse(xml_file)
    motifs = tree.find("targets")
    name2alt = {}
    for motif in motifs.getiterator("motif"):
        name = motif.get("id")
        alt = motif.get("alt")
        name2alt[name] = alt

    tmpfile = P.get_temp_file(".")

    # parse the text file
    for line in IOTools.open_file(infile):
        if line.startswith("#Query"):
            tmpfile.write('\t'.join(
                ("target_name", "query_id", "target_id",
                 "optimal_offset", "pvalue", "evalue", "qvalue",
                 "Overlap", "query_consensus", "target_consensus",
                 "orientation")) + "\n")
            continue
        data = line[:-1].split("\t")
        target_name = name2alt[data[1]]
        tmpfile.write("%s\t%s" % (target_name, line))
    tmpfile.close()

    P.load(tmpfile.name, outfile)
    os.unlink(tmpfile.name)


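# A minimal, self-contained sketch (illustration only, not part of the
# pipeline) of the id -> alt lookup built from the tomtom XML above. The
# XML snippet is invented but mirrors the structure the parser expects:
# motif elements with id/alt attributes under a targets element.
def _sketch_name2alt():
    import xml.etree.ElementTree as ET
    xml_text = (
        '<tomtom><targets>'
        '<motif id="MA0001.1" alt="AGL3"/>'
        '<motif id="MA0002.1" alt="RUNX1"/>'
        '</targets></tomtom>')
    root = ET.fromstring(xml_text)
    return {m.get("id"): m.get("alt")
            for m in root.find("targets").iter("motif")}
# e.g.: _sketch_name2alt() == {"MA0001.1": "AGL3", "MA0002.1": "RUNX1"}

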
def identify_splice_sites(infiles, outfiles):
    '''identify splice sites and load a unique, header-annotated table
    of splice-site coordinates and utron sizes.'''
    infile = infiles
    outfile, outfile_load = outfiles

    # locate the helper script relative to this pipeline file
    pipeline_directory = os.path.dirname(os.path.abspath(__file__))
    script_path = "pipeline_utrons/splicesites_start_end_sizes.py"
    ss_path = os.path.join(pipeline_directory, script_path)

    statement = '''
    python %(ss_path)s %(infile)s %(outfile)s;
    sort -u %(outfile)s > %(outfile)s_2.txt;
    rm %(outfile)s;
    mv %(outfile)s_2.txt %(outfile)s;
    sed -i $'1i transcript_id\\tstrand\\tss5\\tss3\\tcontig\\tsplice_site_start\\tsplice_site_end\\tutron_size' %(outfile)s
    '''
    P.run(statement)

    P.load(outfile, outfile_load,
           options="-i transcript_id -i ss5 -i ss3 "
           "-i splice_site_start -i splice_site_end -i utron_size",
           job_memory="16G")


def loadEffects(infile, outfile):
    '''load transcript effects into tables.'''
    root = infile[:-len(".effects.gz")]

    P.load(infile, outfile,
           tablename=root + "_effects",
           options="--add-index=transcript_id")

    for suffix in ("cds", "intron", "splicing", "translation"):
        P.load(infile + "." + suffix + ".gz", outfile,
               tablename=root + "_effects_" + suffix,
               options="--add-index=transcript_id "
               "--allow-empty-file "
               "--ignore-column=seq_na "
               "--ignore-column=seq_aa")


def loadMemeSummary(infiles, outfile):
    '''load information about motifs into database.'''
    outf = P.get_temp_file(".")
    outf.write("method\ttrack\n")
    for infile in infiles:
        if IOTools.is_empty(infile):
            continue
        method = re.match("(.+).dir/", infile).groups()[0]
        track = os.path.basename(".".join(infile.split(".")[:-1]))
        outf.write("%s\t%s\n" % (method, track))
    outf.close()
    P.load(outf.name, outfile)
    os.unlink(outf.name)


def mergeAnnotations(infiles, outfile):
    '''load variant annotations into a single database table.'''
    tablename = P.toTable(outfile)
    outf = open('anno.txt', 'w')
    first = True

    for f in infiles:
        track = P.snip(os.path.basename(f), ".annotations.gz")
        if not os.path.exists(f):
            E.warn("File %s missing" % f)
            continue
        lines = [x for x in gzip.open(f, "rt").readlines()]
        if first:
            outf.write("%s\t%s" % ("track", lines[0]))
            first = False
        for i in range(1, len(lines)):
            outf.write("%s\t%s" % (track, lines[i]))
    outf.close()

    P.load('anno.txt', outfile, tablename=tablename)


def loadTranscriptSummary(infile, outfile):
    '''summarize binding information per transcript.'''
    dbh = connect()

    table = P.toTable(outfile)

    cc = dbh.cursor()
    # sqlite can not do full outer joins
    cc.execute("""DROP TABLE IF EXISTS %(table)s""" % locals())

    transcripts = [x[0] for x in cc.execute(
        "SELECT DISTINCT(transcript_id) "
        "FROM annotations.transcript_info").fetchall()]

    tmpf = P.getTempFile()

    tables = ("tata", "cpg")
    titles = tables

    vals = []
    for table in tables:
        t = set([x[0] for x in cc.execute(
            "SELECT DISTINCT(transcript_id) FROM %(table)s" %
            locals()).fetchall()])
        vals.append(t)

    tmpf.write("transcript_id\t%s\n" % "\t".join(titles))

    for transcript_id in transcripts:
        tmpf.write("%s\t%s\n" % (transcript_id, "\t".join(
            [str(int(transcript_id in v)) for v in vals])))

    tmpf.close()

    P.load(tmpf.name, outfile)
    os.unlink(tmpf.name)


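# A minimal, self-contained sketch (illustration only, not part of the
# pipeline) of the presence/absence summary built in loadTranscriptSummary
# above, using an in-memory database. Table and transcript names are
# invented.
def _sketch_binding_summary():
    import sqlite3
    db = sqlite3.connect(":memory:")
    db.execute("CREATE TABLE tata (transcript_id TEXT)")
    db.execute("CREATE TABLE cpg (transcript_id TEXT)")
    db.executemany("INSERT INTO tata VALUES (?)", [("t1",), ("t2",)])
    db.executemany("INSERT INTO cpg VALUES (?)", [("t2",)])
    transcripts = ["t1", "t2", "t3"]
    vals = []
    for table in ("tata", "cpg"):
        vals.append(set(x[0] for x in db.execute(
            "SELECT DISTINCT transcript_id FROM %s" % table)))
    # one 0/1 flag per table for each transcript
    return [(t, [int(t in v) for v in vals]) for t in transcripts]
# e.g.: _sketch_binding_summary()[1] == ("t2", [1, 1])

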
def loadMemeChipSummary(infiles, outfile):
    '''load information about motifs into database.'''
    outf = P.get_temp_file(".")
    outf.write("track\tnpeaks\twidth\tmasking\tpath\n")
    for infile in infiles:
        if IOTools.is_empty(infile):
            continue
        fn = P.snip(os.path.basename(infile), ".memechip")
        track, npeaks, width, masking = fn.split(".")
        outf.write("\t".join(map(str, (track, npeaks, width, masking,
                                       fn))) + "\n")
    outf.close()
    P.load(outf.name, outfile)
    os.unlink(outf.name)


def loadLncRNAClass(infile, outfile):
    '''load the lncRNA classifications.'''
    # just load each transcript with its classification
    temp = P.getTempFile(".")
    inf = IOTools.openFile(infile)
    for transcript in GTF.transcript_iterator(GTF.iterator(inf)):
        temp.write("%s\t%s\t%s\n" % (
            transcript[0].transcript_id,
            transcript[0].gene_id,
            transcript[0].source))
    temp.close()

    P.load(temp.name, outfile,
           options="--header-names=transcript_id,gene_id,class "
           "--add-index=transcript_id "
           "--add-index=gene_id")
    os.unlink(temp.name)