def estimateEnrichmentOfTFBS(infiles, outfile):
    '''
    Estimate the significance of transcription factors that are
    associated with a foreground set of intervals vs a background set.

    infiles: file names; one must match "background" and one
        "foreground" (IndexError is raised otherwise).
    outfile: handed to PipelineTFM.testSignificanceOfMatrices.
    '''
    # the same label is reported in the start and completion messages
    file_label = " & ".join(os.path.basename(path) for path in infiles)
    E.info("Running Fisher's exact test for TF enrichment between %s"
           % file_label)

    # required files
    match_table = "match_result"

    # the order of foreground and background in infiles is not known,
    # so each is selected by name
    background = [path for path in infiles
                  if re.search("background", path)][0]
    foreground = [path for path in infiles
                  if re.search("foreground", path)][0]

    # run significance testing
    PipelineTFM.testSignificanceOfMatrices(
        background, foreground, PARAMS["database"], match_table, outfile)

    E.info("Completed Fisher's exact test for TF enrichment between %s"
           % file_label)
def matchBackgroundForSequenceComposition(infiles, outfile):
    '''
    Take the background set and subset it for intervals with the same
    sequence composition distribution as the foreground set (for the
    composition statistic specified in the config file). This requires
    that the background set is sufficiently large.

    infiles: load files; the first must be named
        GC_content.dir/<track>.(background|foreground).gc.load,
        otherwise re.match returns None and AttributeError is raised.
    outfile: handed to PipelineTFM.matchBgSequenceComposition.
    '''
    # get gene set name; the pattern is a raw string so that "\." is a
    # regex escape rather than an invalid string escape
    track = re.match(
        r"GC_content.dir/(.+)\.(?:background|foreground)\.gc\.load",
        infiles[0]).groups()[0]

    # get list of background genes
    input_background = "%s.background.tsv" % track

    # get list of foreground genes
    input_foreground = "%s.foreground.tsv" % track

    # get name of fasta file containing intervals: the basename of
    # INPUT_FILE with its trailing len(".gtf.gz") characters removed
    fasta_file = os.path.basename(INPUT_FILE)[:-len(".gtf.gz")]
    fasta_file = os.path.join("fasta.dir", fasta_file)

    PipelineTFM.matchBgSequenceComposition(infiles,
                                           input_background,
                                           input_foreground,
                                           fasta_file,
                                           outfile,
                                           PARAMS["database"],
                                           PARAMS["genesets_header"],
                                           PARAMS["background_match_stat"])
def calculateCpGcomposition(infiles, outfile):
    '''
    Calculate the GC content for the CpG matched data.

    Should be the same as the CpG content of the foreground set.
    The first two entries of infiles and outfile are forwarded to
    PipelineTransfacMatch.calculateCpGComposition.
    '''
    first_input = infiles[0]
    second_input = infiles[1]
    PipelineTransfacMatch.calculateCpGComposition(
        first_input, second_input, outfile)
def calculateGCContent(infiles, outfile):
    '''
    Calculate the GC content across foreground and background sets.

    Only infiles[0] and infiles[1] are used; they are forwarded with
    outfile to PipelineTransfacMatch.calculateCpGComposition.
    '''
    first_input = infiles[0]
    second_input = infiles[1]
    PipelineTransfacMatch.calculateCpGComposition(
        first_input, second_input, outfile)
def estimateEnrichmentOfTFBS(infiles, outfile):
    '''
    Estimate the significance of transcription factors that are
    associated with a foreground set of intervals vs a background set.

    infiles: file names; one must match "background" and one
        "foreground" (IndexError is raised otherwise).
    outfile: handed to PipelineTFM.testSignificanceOfMatrices.
    '''
    def first_matching(keyword):
        # first entry of infiles whose name matches the keyword
        return [name for name in infiles if re.search(keyword, name)][0]

    file_label = " & ".join(os.path.basename(name) for name in infiles)
    E.info("Running Fisher's exact test for TF enrichment between %s"
           % file_label)

    # required files
    match_table = "match_result"

    # foreground and background may arrive in either order
    background = first_matching("background")
    foreground = first_matching("foreground")

    # run significance testing
    PipelineTFM.testSignificanceOfMatrices(
        background, foreground, PARAMS["database"], match_table, outfile)

    E.info("Completed Fisher's exact test for TF enrichment between %s"
           % file_label)
def calculateGCContent(infiles, outfile):
    '''
    Calculate the GC content across foreground and background sets.

    Forwards infiles[0], infiles[1] and outfile, in that order, to
    PipelineTransfacMatch.calculateCpGComposition.
    '''
    call_args = (infiles[0], infiles[1], outfile)
    PipelineTransfacMatch.calculateCpGComposition(*call_args)
def estimateEnrichmentOfTFBS(infiles, outfile):
    '''
    Estimate the significance of transcription factors that are
    associated with a foreground set of intervals vs a background set
    matched for sequence composition.

    infiles: file names; one must match "background" and one
        "foreground" (IndexError otherwise). Foreground names must
        look like <dir>/<track>.foreground.gc.tsv, otherwise re.match
        returns None and AttributeError is raised.
    outfile: handed to PipelineTFM.testSignificanceOfMatrices.
    '''
    file_label = " & ".join([os.path.basename(x) for x in infiles])
    E.info("Running Fisher's exact test for TF enrichment between %s"
           % file_label)

    # required files
    match_table = "match_result"

    # we don't know which order the foreground and background will
    # come in
    background = [infile for infile in infiles
                  if re.search("background", infile)][0]

    # the test is given "<track>.foreground.tsv" (no directory part),
    # derived from the composition file name; the pattern is a raw
    # string so "\." is a regex escape, not an invalid string escape
    foreground = [
        "%s.foreground.tsv" % re.match(
            r".+/(.+)\.foreground\.gc\.tsv", infile).groups()[0]
        for infile in infiles if re.search("foreground", infile)][0]

    # run significance testing
    PipelineTFM.testSignificanceOfMatrices(background,
                                           foreground,
                                           PARAMS["database"],
                                           match_table,
                                           outfile,
                                           PARAMS["genesets_header"])

    E.info("Completed Fisher's exact test for TF enrichment between %s"
           % file_label)
def matchBackgroundForSequenceComposition(infiles, outfile):
    '''
    Take the background set and subset it for intervals with the same
    sequence composition distribution as the foreground set (for the
    composition statistic specified in the config file). This requires
    that the background set is sufficiently large.

    infiles: load files; the first must be named
        GC_content.dir/<track>.(background|foreground).gc.load,
        otherwise re.match returns None and AttributeError is raised.
    outfile: handed to PipelineTFM.matchBgSequenceComposition.
    '''
    # get gene set name; raw string keeps "\." as a regex escape
    # instead of an invalid string escape sequence
    track_pattern = (r"GC_content.dir/"
                     r"(.+)\.(?:background|foreground)\.gc\.load")
    track = re.match(track_pattern, infiles[0]).groups()[0]

    # get list of background genes
    input_background = "%s.background.tsv" % track

    # get list of foreground genes
    input_foreground = "%s.foreground.tsv" % track

    # get name of fasta file containing intervals: the basename of
    # INPUT_FILE with its trailing len(".gtf.gz") characters removed
    fasta_file = os.path.basename(INPUT_FILE)[:-len(".gtf.gz")]
    fasta_file = os.path.join("fasta.dir", fasta_file)

    PipelineTFM.matchBgSequenceComposition(infiles,
                                           input_background,
                                           input_foreground,
                                           fasta_file,
                                           outfile,
                                           PARAMS["database"],
                                           PARAMS["genesets_header"],
                                           PARAMS["background_match_stat"])
def calculateGCContent(infiles, outfile):
    '''
    Calculate the GC content across foreground and background sets.

    Forwards infiles[0], infiles[1], outfile and the configured
    "genesets_header" parameter to
    PipelineTFM.calculateSequenceComposition.
    '''
    first_input = infiles[0]
    second_input = infiles[1]
    PipelineTFM.calculateSequenceComposition(
        first_input,
        second_input,
        outfile,
        PARAMS["genesets_header"])
def calculateMatchedGCComposition(infiles, outfile):
    '''
    Calculate the GC content for the CpG matched data.

    Should be the same as the CpG content of the foreground set.
    Forwards infiles[0], infiles[1] and outfile to
    PipelineTFM.calculateSequenceComposition.
    '''
    first_input = infiles[0]
    second_input = infiles[1]
    PipelineTFM.calculateSequenceComposition(
        first_input, second_input, outfile)
def buildMatchMetrics(infile, outfile):
    '''
    match outputs transcription factors that are found in the supplied
    sequences. We are interested in the following metrics:

    * No. unique transcription factors found per sequence
    * Maximal number of TF motifs found per sequence

    The table queried is named after infile (".load" suffix snipped,
    directory removed) with "_result" appended.
    '''
    load_stem = os.path.basename(P.snip(infile, ".load"))
    tablename = "%s_result" % filenameToTablename(load_stem)
    PipelineTransfacMatch.frequencyMetrics(
        PARAMS["database"], tablename, outfile)
def buildMatchMetrics(infile, outfile):
    '''
    match outputs transcription factors that are found in the supplied
    sequences. We are interested in the following metrics:

    * No. unique transcription factors found per sequence
    * Maximal number of TF motifs found per sequence

    The table queried is named after infile (".load" suffix snipped,
    directory removed) with "_result" appended.
    '''
    snipped = P.snip(infile, ".load")
    table_prefix = filenameToTablename(os.path.basename(snipped))
    tablename = table_prefix + "_result"
    PipelineTFM.frequencyMetrics(PARAMS["database"], tablename, outfile)
def matchBackgroundForCpGComposition(infiles, outfile):
    '''
    Take the background set and subset it for intervals with a CpG
    distribution that is the same as the foreground set - this
    requires that the background set is sufficiently large.

    infiles: load files; the first must be named
        GC_content.dir/<track>.(background|foreground).gc.load,
        otherwise re.match returns None and AttributeError is raised.
    outfile: handed to
        PipelineTransfacMatch.matchBackgroundForCpGComposition.
    '''
    # raw string: "\." is meant as a regex escape, not a string escape
    track = re.match(
        r"GC_content.dir/(.+)\.(?:background|foreground)\.gc\.load",
        infiles[0]).groups()[0]

    # gene lists for the track, as file names without a directory part
    input_background = "%s.background.tsv" % track
    input_foreground = "%s.foreground.tsv" % track

    PipelineTransfacMatch.matchBackgroundForCpGComposition(
        infiles,
        input_background,
        input_foreground,
        PARAMS["database"],
        outfile)
def loadMatchResults(infile, outfile):
    '''
    Load the results of the match analysis into the sqlite database.

    The match output is rewritten as a tab-separated temporary file
    under ./match.dir, loaded with csv2db.py, and the temporary file
    is then deleted.
    '''
    fields = ("seq_id", "matrix_id", "position", "strand",
              "core_score", "matrix_score", "sequence")

    temp = P.getTempFile("./match.dir")
    temp.write("\t".join(fields) + "\n")
    for details in PipelineTFM.match_iterator(infile):
        row = [str(getattr(details, field)) for field in fields]
        temp.write("\t".join(row) + "\n")
    temp.close()

    # NOTE(review): the names below are never passed anywhere
    # explicitly; presumably P.run() reads them (and the %(...)s
    # placeholders) from this function's local scope, so they keep
    # their original names - confirm against P.run
    to_cluster = True
    job_options = "-l mem_free=64G"

    inf = temp.name
    tablename = filenameToTablename(os.path.basename(infile))
    statement = " ".join(("python %(scriptsdir)s/csv2db.py",
                          "-t %(tablename)s",
                          "--log=%(outfile)s.log",
                          "--index=seq_id",
                          "%(csv2db_options)s",
                          "< %(inf)s > %(outfile)s"))
    P.run()

    os.unlink(temp.name)
def loadMatchResults(infile, outfile):
    '''
    Load the results of the match analysis into the sqlite database.

    The match output is rewritten as a tab-separated temporary file,
    loaded with csv2db.py, and the temporary file is then deleted.
    '''
    temp = P.getTempFile()
    temp.write(
        "seq_id\tmatrix_id\tposition\tstrand\t"
        "core_score\tmatrix_score\tsequence\n")
    for details in PipelineTransfacMatch.match_iterator(infile):
        temp.write("\t".join(
            map(str, [details.seq_id,
                      details.matrix_id,
                      details.position,
                      details.strand,
                      details.core_score,
                      details.matrix_score,
                      details.sequence])) + "\n")

    # close the file so all buffered rows are on disk before the
    # statement below reads it through shell redirection
    temp.close()

    # NOTE(review): inf, tablename and statement are not passed to
    # P.run() explicitly; presumably it reads them from this
    # function's local scope - keep these names
    inf = temp.name
    tablename = filenameToTablename(os.path.basename(infile))
    statement = ("python %(scriptsdir)s/csv2db.py"
                 " -t %(tablename)s"
                 " --log=%(outfile)s.log"
                 " --index=seq_id"
                 " %(csv2db_options)s"
                 " < %(inf)s > %(outfile)s")
    P.run()

    os.remove(inf)
def estimateEnrichmentOfTFBS(infiles, outfile):
    '''
    Estimate the significance of transcription factors that are
    associated with a foreground set of intervals vs a background set
    matched for CpG content.

    infiles: file names; one must match "background" and one
        "foreground" (IndexError otherwise). Foreground names must
        look like <dir>/<track>.foreground.gc.tsv, otherwise re.match
        returns None and AttributeError is raised.
    outfile: handed to
        PipelineTransfacMatch.testSignificanceOfMatrices.
    '''
    # required files
    match_table = "match_result"

    # we don't know which order the foreground and background will
    # come in
    background = [infile for infile in infiles
                  if re.search("background", infile)][0]

    # the test is given "<track>.foreground.tsv" (no directory part);
    # raw string keeps "\." as a regex escape rather than an invalid
    # string escape sequence
    foreground = [
        "%s.foreground.tsv" % re.match(
            r".+/(.+)\.foreground\.gc\.tsv", infile).groups()[0]
        for infile in infiles if re.search("foreground", infile)][0]

    # run significance testing
    PipelineTransfacMatch.testSignificanceOfMatrices(
        background,
        foreground,
        PARAMS["database"],
        match_table,
        outfile)
def loadMatchResults(infile, outfile):
    '''
    Load the results of the match analysis into the sqlite database.

    The match output is rewritten as a tab-separated temporary file
    under ./match.dir, loaded with csv2db.py, and the temporary file
    is then deleted.
    '''
    column_names = ["seq_id", "matrix_id", "position", "strand",
                    "core_score", "matrix_score", "sequence"]

    temp = P.getTempFile("./match.dir")
    temp.write("%s\n" % "\t".join(column_names))
    for details in PipelineTFM.match_iterator(infile):
        values = [str(getattr(details, column)) for column in column_names]
        temp.write("%s\n" % "\t".join(values))
    temp.close()

    # NOTE(review): these locals are not passed to P.run() explicitly;
    # presumably it picks them up (together with the %(...)s
    # placeholders) from this function's local scope, so the names
    # are kept as they were - confirm against P.run
    to_cluster = True
    job_options = "-l mem_free=64G"

    inf = temp.name
    tablename = filenameToTablename(os.path.basename(infile))
    statement = " ".join(["python %(scriptsdir)s/csv2db.py",
                          "-t %(tablename)s",
                          "--log=%(outfile)s.log",
                          "--index=seq_id",
                          "%(csv2db_options)s",
                          "< %(inf)s > %(outfile)s"])
    P.run()

    os.unlink(temp.name)
def estimateEnrichmentOfTFBS(infiles, outfile):
    '''
    Estimate the significance of transcription factors that are
    associated with a foreground set of intervals vs a background set.

    infiles: file names; one must match "background" and one
        "foreground" (IndexError is raised otherwise).
    outfile: handed to
        PipelineTransfacMatch.testSignificanceOfMatrices.
    '''
    def first_matching(keyword):
        # first entry of infiles whose name matches the keyword
        return [name for name in infiles if re.search(keyword, name)][0]

    # required files
    match_table = "match_result"

    # foreground and background may arrive in either order
    background = first_matching("background")
    foreground = first_matching("foreground")

    # run significance testing
    PipelineTransfacMatch.testSignificanceOfMatrices(
        background,
        foreground,
        PARAMS["database"],
        match_table,
        outfile)
def estimateEnrichmentOfTFBS(infiles, outfile):
    '''
    Estimate the significance of transcription factors that are
    associated with a foreground set of intervals vs a background set
    matched for sequence composition.

    infiles: file names; one must match "background" and one
        "foreground" (IndexError otherwise). Foreground names must
        look like <dir>/<track>.foreground.gc.tsv, otherwise re.match
        returns None and AttributeError is raised.
    outfile: handed to PipelineTFM.testSignificanceOfMatrices.
    '''
    file_label = " & ".join([os.path.basename(x) for x in infiles])
    E.info("Running Fisher's exact test for TF enrichment between %s"
           % file_label)

    # required files
    match_table = "match_result"

    # we don't know which order the foreground and background
    # will come in
    background = [infile for infile in infiles
                  if re.search("background", infile)][0]

    # the test is given "<track>.foreground.tsv" (no directory part);
    # raw string keeps "\." as a regex escape rather than an invalid
    # string escape sequence
    foreground = [
        "%s.foreground.tsv" % re.match(
            r".+/(.+)\.foreground\.gc\.tsv", infile).groups()[0]
        for infile in infiles if re.search("foreground", infile)][0]

    # run significance testing
    # MM: added in directionality into FET - might only be looking for
    # enrichment OR depletion so don't want to hammer those p-values
    # too hard
    pval_direct = PARAMS['fisher_direction']
    PipelineTFM.testSignificanceOfMatrices(background,
                                           foreground,
                                           PARAMS["database"],
                                           match_table,
                                           outfile,
                                           PARAMS["genesets_header"],
                                           pval_direct)

    E.info("Completed Fisher's exact test for "
           "TF enrichment between %s" % file_label)
def estimateEnrichmentOfTFBS(infiles, outfile):
    '''
    Estimate the significance of transcription factors that are
    associated with a foreground set of intervals vs a background set
    matched for sequence composition.

    infiles: file names; one must match "background" and one
        "foreground" (IndexError otherwise). Foreground names must
        look like <dir>/<track>.foreground.gc.tsv, otherwise re.match
        returns None and AttributeError is raised.
    outfile: handed to PipelineTFM.testSignificanceOfMatrices.
    '''
    file_label = " & ".join([os.path.basename(x) for x in infiles])
    E.info("Running Fisher's exact test for TF enrichment between %s"
           % file_label)

    # required files
    match_table = "match_result"

    # we don't know which order the foreground and background
    # will come in
    background = [infile for infile in infiles
                  if re.search("background", infile)][0]

    # raw string: "\." is a regex escape, not a string escape; the
    # captured track name is turned into "<track>.foreground.tsv"
    foreground_pattern = r".+/(.+)\.foreground\.gc\.tsv"
    foreground = [
        "%s.foreground.tsv" % re.match(
            foreground_pattern, infile).groups()[0]
        for infile in infiles if re.search("foreground", infile)][0]

    # run significance testing
    # MM: added in directionality into FET - might only be looking for
    # enrichment OR depletion so don't want to hammer those p-values
    # too hard
    pval_direct = PARAMS['fisher_direction']
    PipelineTFM.testSignificanceOfMatrices(background,
                                           foreground,
                                           PARAMS["database"],
                                           match_table,
                                           outfile,
                                           PARAMS["genesets_header"],
                                           pval_direct)

    E.info("Completed Fisher's exact test for "
           "TF enrichment between %s" % file_label)
def estimateEnrichmentOfTFBS(infiles, outfile):
    '''
    Test for enrichment of TFBS within a gene set by permutation.

    infiles: file names; one must match "background" and one
        "foreground" (IndexError otherwise). Both are read as
        tab-separated tables with a header row and the first column
        as index.
    outfile: tab-separated output, one row per matrix_id.
    '''
    def load_gc_table(path):
        # read a composition table and re-key it by the first
        # space-delimited token of its original index
        gc_table = pandas.read_table(path,
                                     sep="\t",
                                     index_col=0,
                                     header=0)
        gene_ids = [x.split(" ")[0] for x in gc_table.index.tolist()]
        gc_table['gene_id'] = gene_ids
        gc_table.index = gene_ids
        return gc_table

    E.info("Running permutation testing for TFBS enrichment between %s"
           % " & ".join([os.path.basename(x) for x in infiles]))

    # table from sql db
    match_table = "match_result"
    tfbs_state = '''SELECT seq_id, matrix_id FROM %s;''' % match_table

    # the query result is fully loaded into a DataFrame, so the
    # connection can be released as soon as the read is done
    dbh = sqlite3.connect(PARAMS['database'])
    try:
        tfbs_table = pdsql.read_sql(tfbs_state, dbh, index_col='matrix_id')
    finally:
        dbh.close()

    # get foreground and background gene files
    background = [inf for inf in infiles if re.search("background", inf)][0]
    foreground = [inf for inf in infiles if re.search("foreground", inf)][0]

    # setup gc content dataframes
    back_gc = load_gc_table(background)
    fore_gc = load_gc_table(foreground)

    # run permutation significance testing
    perms = int(PARAMS['sig_testing_nperms'])
    out_dict = PipelineTFM.permuteTFBSEnrich(tfbs_table=tfbs_table,
                                             fg_gc=fore_gc,
                                             bg_gc=back_gc,
                                             nPerms=perms)

    # transposed so that the keys of out_dict become the rows
    out_frame = pandas.DataFrame(out_dict).T
    out_frame.to_csv(outfile, sep="\t", index_label='matrix_id')
def loadMatchResults(infile, outfile):
    '''
    Load the results of the match analysis into the sqlite database.

    The match output is rewritten as a tab-separated temporary file
    under ./match.dir, loaded via P.load with an index on seq_id, and
    the temporary file is then deleted.
    '''
    fields = ("seq_id", "matrix_id", "position", "strand",
              "core_score", "matrix_score", "sequence")

    temp = P.getTempFile("./match.dir")
    temp.write("\t".join(fields) + "\n")
    for details in PipelineTFM.match_iterator(infile):
        row = [str(getattr(details, field)) for field in fields]
        temp.write("\t".join(row) + "\n")
    temp.close()

    P.load(temp.name, outfile, options="--add-index=seq_id")

    os.unlink(temp.name)
def loadMatchResults(infile, outfile):
    '''
    Load the results of the match analysis into the sqlite database.

    The match output is rewritten as a tab-separated temporary file,
    loaded with csv2db.py, and the temporary file is then deleted.
    '''
    temp = P.getTempFile()
    temp.write(
        "seq_id\tmatrix_id\tposition\tstrand\t"
        "core_score\tmatrix_score\tsequence\n")
    for details in PipelineTransfacMatch.match_iterator(infile):
        temp.write("\t".join(map(str, [details.seq_id,
                                       details.matrix_id,
                                       details.position,
                                       details.strand,
                                       details.core_score,
                                       details.matrix_score,
                                       details.sequence])) + "\n")

    # without closing, buffered rows may not have reached disk when
    # the statement below reads the file through shell redirection
    temp.close()

    # NOTE(review): inf, tablename and statement are not passed to
    # P.run() explicitly; presumably it reads them from this
    # function's local scope - keep these names
    inf = temp.name
    tablename = filenameToTablename(os.path.basename(infile))
    statement = ("python %(scriptsdir)s/csv2db.py"
                 " -t %(tablename)s"
                 " --log=%(outfile)s.log"
                 " --index=seq_id"
                 " %(csv2db_options)s"
                 " < %(inf)s > %(outfile)s")
    P.run()

    os.remove(inf)
def loadMatchResults(infile, outfile):
    '''
    Load the results of the match analysis into the sqlite database.

    The match output is rewritten as a tab-separated temporary file
    under ./match.dir, loaded via P.load with an index on seq_id, and
    the temporary file is then deleted.
    '''
    column_names = ["seq_id", "matrix_id", "position", "strand",
                    "core_score", "matrix_score", "sequence"]

    temp = P.getTempFile("./match.dir")
    temp.write("%s\n" % "\t".join(column_names))
    for details in PipelineTFM.match_iterator(infile):
        values = [str(getattr(details, column)) for column in column_names]
        temp.write("%s\n" % "\t".join(values))
    temp.close()

    P.load(temp.name, outfile, options="--add-index=seq_id")

    os.unlink(temp.name)
def estimateEnrichmentOfTFBS(infiles, outfile):
    '''
    Test for enrichment of TFBS within a gene set by permutation.

    infiles: file names; one must match "background" and one
        "foreground" (IndexError otherwise). Both are read as
        tab-separated tables with a header row and the first column
        as index.
    outfile: tab-separated output, one row per matrix_id, with a
        "qvalue" column computed from the "pvalue" column.

    Raises ValueError when fewer than 1000 distinct permutations are
    possible with the given background and foreground sizes.
    '''
    def load_gc_table(path):
        # read a composition table and re-key it by the first
        # space-delimited token of its original index
        gc_table = pandas.read_table(path,
                                     sep="\t",
                                     index_col=0,
                                     header=0)
        gene_ids = [x.split(" ")[0] for x in gc_table.index.tolist()]
        gc_table['gene_id'] = gene_ids
        gc_table.index = gene_ids
        return gc_table

    E.info("Running permutation testing for TFBS enrichment between %s"
           % " & ".join([os.path.basename(x) for x in infiles]))

    # table from sql db
    match_table = "match_result"
    tfbs_state = '''SELECT matrix_id, seq_id FROM %s;''' % match_table

    # the query result is fully loaded into a DataFrame, so the
    # connection can be released as soon as the read is done
    dbh = sqlite3.connect(PARAMS['database'])
    try:
        tfbs_table = pdsql.read_sql(sql=tfbs_state,
                                    con=dbh,
                                    index_col='matrix_id')
    finally:
        dbh.close()

    # get foreground and background gene files
    background = [inf for inf in infiles if re.search("background", inf)][0]
    foreground = [inf for inf in infiles if re.search("foreground", inf)][0]

    # setup gc content dataframes
    back_gc = load_gc_table(background)
    fore_gc = load_gc_table(foreground)

    # run permutation significance testing
    # check if there are sufficient genes in the background
    # to do a permutation test. If not, fail and point the user to
    # Fisher's exact test. If yes, but fewer than the requested
    # number of permutations are possible, limit to that number.
    perms = int(PARAMS['sig_testing_nperms'])
    poss_perms = PipelineTFM.nCr(n=len(back_gc.index),
                                 r=len(fore_gc.index))
    if poss_perms < 1000:
        E.warn("Insufficient background genes to perform "
               "permutations. Please use Fisher's Exact test")
        raise ValueError("Insufficient background size. "
                         "Cannot use permutation test")

    if poss_perms < perms:
        # also covers poss_perms == 1000, which previously slipped
        # through to the unrestricted branch
        E.info("Maximum possible permutations with this background"
               " set is %i. Running %i permutations only"
               % (poss_perms, poss_perms))
        perms = poss_perms

    # both the limited and the full run use the configured
    # background matching statistic
    bg_stat = PARAMS["background_match_stat"]
    out_dict = PipelineTFM.permuteTFBSEnrich(tfbs_table=tfbs_table,
                                             fg_gc=fore_gc,
                                             bg_gc=back_gc,
                                             nPerms=perms,
                                             bg_stat=bg_stat)

    # transposed so that the keys of out_dict become the rows
    out_frame = pandas.DataFrame(out_dict).T

    # NOTE(review): p.adjust is called without a method argument, so
    # R's default correction applies (Holm, per the R documentation) -
    # confirm this is what is intended for a column named "qvalue"
    pyadjust = R['p.adjust']
    pvs = robjects.FloatVector([p for p in out_frame['pvalue']])
    out_frame['qvalue'] = pyadjust(pvs)
    out_frame.to_csv(outfile, sep="\t", index_label='matrix_id')