def main(use_config=True, outputdir=None, results=None, md_results=None,
         mdd_results=None, motif_distances=None, md=False, mdd=False,
         debug=False, jobid=None, output_type=False, p_cutoff=None,
         plot_format=None, figuredir=None, padj_cutoff=None, label1=None,
         label2=None, plotall=False, singlemotif=False):
    '''Write the TFEA result tables, the global summary plots and, when
    requested, the html report.

    Parameters
    ----------
    use_config : boolean
        Whether to use a config module to assign variables. When True every
        other argument is overwritten with the value stored in config.vars.
    outputdir : path
        Directory passed to txt_output and the html writers
    results : list of lists
        TFEA results; columns are expected to follow TFEA_header below
    md_results : list of lists
        MD-Score results, only used if md is set
    mdd_results : list of lists
        MDD-Score results, only used if mdd is set
    motif_distances : list of lists
        Unused by this function; kept so existing callers keep working
    md : boolean
        Whether to write the MD-Score table and plots
    mdd : boolean
        Whether to write the MDD-Score table and plots
    debug : boolean
        Whether to report memory usage through the multiprocess module
    jobid : int or str
        Passed to multiprocess.current_mem_usage when debug is set
    output_type : str
        The html report is only produced when this equals 'html'
    p_cutoff : float
        Significance cutoff handed to the plotting functions. With
        use_config this is np.log(PADJCUTOFF).
    plot_format : str
        File extension used for every saved figure
    figuredir : path
        Directory figures are saved in. Only read when use_config is False.
    padj_cutoff : float
        Cutoff handed to the html writers. Only read when use_config is
        False; defaults to p_cutoff.
    label1, label2 : str
        Condition labels forwarded to html_output. Only read when
        use_config is False.
    plotall : boolean
        Forwarded to the html writers. Only read when use_config is False.
    singlemotif : str or boolean
        Forwarded to create_motif_result_htmls. Only read when use_config
        is False.

    Returns
    -------
    None
        When use_config is True the elapsed time is stored in
        config.vars['OUTPUTtime'].
    '''
    from pathlib import Path

    start_time = time.time()
    if use_config:
        from TFEA import config
        outputdir = config.vars['OUTPUT']
        figuredir = config.vars['FIGUREDIR']
        results = config.vars['RESULTS']
        md_results = config.vars['MD_RESULTS']
        mdd_results = config.vars['MDD_RESULTS']
        md = config.vars['MD']
        mdd = config.vars['MDD']
        debug = config.vars['DEBUG']
        jobid = config.vars['JOBID']
        output_type = config.vars['OUTPUT_TYPE']
        # Both cutoffs come from the same (log-transformed) config value
        p_cutoff = padj_cutoff = np.log(config.vars['PADJCUTOFF'])
        label1 = config.vars['LABEL1']
        label2 = config.vars['LABEL2']
        plotall = config.vars['PLOTALL']
        singlemotif = config.vars['SINGLEMOTIF']
        plot_format = config.vars['PLOT_FORMAT']
    else:
        # These names used to be bound only in the config branch, so calling
        # main(use_config=False) raised NameError at the first plot call.
        if figuredir is None:
            # NOTE(review): assumes figures live in <outputdir>/plots as in a
            # regular TFEA run - confirm against process_inputs.
            figuredir = Path(outputdir) / 'plots'
        else:
            figuredir = Path(figuredir)
        if padj_cutoff is None:
            padj_cutoff = p_cutoff

    print("Creating output...", end=' ', flush=True, file=sys.stderr)

    TFEA_header = ['#TF', 'E-Score', 'Corrected E-Score', 'Events', 'GC',
                   'FPKM', 'P-adj', 'Corrected P-adj']
    description = [
        'Motif Name',
        'Enrichment Score',
        'Enrichment Score following GC correction',
        'Number of motif instances within analyzed regions',
        'GC-content of motif',
        'FPKM of the gene associated with the motif if an annotation is provided',
        'Adjusted P-value (Bonferroni)',
        'Adjusted P-value (Bonferroni) after GC correction',
    ]
    sort_index = [5, 3, 2, -1]

    #Main TFEA table and MA plot
    txt_output(outputdir=outputdir, results=results, outname='results.txt',
               sortindex=sort_index, header=TFEA_header)
    plot.plot_global_MA(results, p_cutoff=p_cutoff, title='TFEA MA-Plot',
                        xlabel='$Log_{10}$(Events)', ylabel='E-Score',
                        savepath=figuredir / f'TFEA_MA.{plot_format}',
                        plot_format=plot_format, c_index=1, x_index=3,
                        y_index=2, p_index=-1, ylimits=[-1, 1])

    #MD and MDD outputs only differ in their labels and file names
    md_like_outputs = [
        (md, md_results, 'md_results.txt', 'MD', 'MD-Score',
         'MD-Score Difference'),
        (mdd, mdd_results, 'mdd_results.txt', 'MDD', 'MDD-Score',
         'Differential MD-Score Difference'),
    ]
    for enabled, table, outname, tag, score_name, diff_label in md_like_outputs:
        if not enabled:
            continue
        header = ['#TF', score_name, 'Events', 'p-val']
        txt_output(outputdir=outputdir, results=table, outname=outname,
                   header=header, sortindex=[-1], log=False)
        plot.plot_global_MA(table, p_cutoff=p_cutoff,
                            title=f'{tag} MA-Plot',
                            xlabel='Log10(Motif Hits)', ylabel=diff_label,
                            savepath=figuredir / f'{tag}_MA.{plot_format}',
                            plot_format=plot_format, x_index=2, y_index=1,
                            p_index=-1, ylimits=[-1, 1])
        plot.plot_global_volcano(table, p_cutoff=p_cutoff,
                                 title=f'{tag} Volcano Plot',
                                 xlabel=diff_label, ylabel='-log10(P-val)',
                                 savepath=figuredir / f'{tag}_volcano.{plot_format}',
                                 plot_format=plot_format)

    total_time = time.time() - start_time
    if use_config:
        config.vars['OUTPUTtime'] = total_time

    if output_type == 'html':
        if use_config:
            #(module name, module setting, module run time) for the summary
            module_list = [
                ('COMBINE', config.vars['COMBINE'], config.vars['COMBINEtime']),
                ('RANK', config.vars['RANK'], config.vars['RANKtime']),
                ('SCANNER', config.vars['SCANNER'], config.vars['SCANNERtime']),
                ('ENRICHMENT', config.vars['ENRICHMENT'], config.vars['ENRICHMENTtime']),
                ('OUTPUT', config.vars['OUTPUT_TYPE'], config.vars['OUTPUTtime']),
            ]
        else:
            module_list = []
        create_motif_result_htmls(results=results, results_header=TFEA_header,
                                  outputdir=outputdir,
                                  padj_cutoff=padj_cutoff,
                                  singlemotif=singlemotif, plotall=plotall,
                                  auc_index=2, padj_index=-1,
                                  plot_format=plot_format)
        html_output(results=results, results_header=TFEA_header,
                    description=description, module_list=module_list,
                    outputdir=outputdir, label1=label1, label2=label2,
                    padj_cutoff=padj_cutoff, plotall=plotall, auc_index=2,
                    padj_index=-1, sortindex=sort_index,
                    plot_format=plot_format)

    print("done in: " + str(datetime.timedelta(seconds=int(total_time))),
          file=sys.stderr)

    if debug:
        multiprocess.current_mem_usage(jobid)
def main(use_config=True, fasta_file=False, md_fasta1=False, md_fasta2=False,
         ranked_file=None, md_bedfile1=None, md_bedfile2=None, scanner=None,
         md=None, largewindow=None, smallwindow=None, genomehits=None,
         fimo_background=None, genomefasta=None, tempdir=None,
         fimo_motifs=None, singlemotif=None, fimo_thresh=None, debug=None,
         mdd=None, jobid=None, cpus=None, mdd_fasta1=False, mdd_fasta2=False,
         mdd_bedfile1=None, mdd_bedfile2=None):
    '''This is the main script of the SCANNER module. It returns motif
    distances to regions of interest by either scanning fasta files on the
    fly using fimo or by using bedtools closest on a center bed file and a
    database of bed files corresponding to motif hits across the genome.
    Homer scanning is recognized but raises an InputError.

    Parameters
    ----------
    use_config : boolean
        Whether to use a config module to assign variables. When True every
        other argument is overwritten with the value stored in config.vars.
    fasta_file : str
        Full path to a fasta file. Generated from ranked_file when not given
        and the scanner is not 'genome hits'.
    md_fasta1, md_fasta2 : str
        Full path to a fasta file corresponding to a single condition. Only
        required if md score analysis desired; generated from md_bedfile1/2
        when not given.
    ranked_file : str
        Full path to a ranked bed file
    md_bedfile1, md_bedfile2 : str
        Full path to the bed file of a single condition used for md score
        analysis
    scanner : str
        Scanning method desired ('fimo', 'homer' or 'genome hits')
    md : boolean
        Whether md score analysis is desired
    largewindow : int
        Half-length of total window size to use when defining cutoffs for
        how far out to measure motif distances
    smallwindow : int
        Half-length of window size to use when defining cutoffs for
        significant motif hits
    genomehits : str
        Full path to a folder containing bed files of motif hits across the
        genome
    fimo_background : int, str, or boolean
        Defines whether to use a background file when performing fimo motif
        scanning
    genomefasta : str
        Full path to a fasta file for desired genome
    tempdir : str
        Full path to a directory where files will be saved
    fimo_motifs : str
        Full path to a .meme formatted motif database
    singlemotif : str or boolean
        Whether to perform scanning on only a subset of motifs. A user can
        specify a single motif or a ',' separated list of motifs. Any falsy
        value (False, None) scans every motif.
    fimo_thresh : str
        A float formatted as a string to be used when calling fimo to
        specify the p-value cutoff threshold
    debug : boolean
        Whether to print debug statements specifically within the
        multiprocess module
    mdd : boolean
        Whether mdd score analysis is desired
    jobid : int or str
        Passed through to the multiprocess module
    cpus : int
        Passed through to the multiprocess module
    mdd_fasta1, mdd_fasta2 : str
        Same as md_fasta1/2 but for mdd score analysis
    mdd_bedfile1, mdd_bedfile2 : str
        Same as md_bedfile1/2 but for mdd score analysis

    Returns
    -------
    motif_distances : list of lists
        Result of scanning the ranked regions, one entry per motif
    md_distances1, md_distances2 : list of lists or None
        Result of scanning the md regions of each condition (None unless md)
    mdd_distances1, mdd_distances2 : list of lists or None
        Result of scanning the mdd regions of each condition (None unless
        mdd)

    Raises
    ------
    InputError
        If an unknown or unsupported scanner option is specified
    FileEmptyError
        If converting a bed file to fasta produced an empty file
    '''
    start_time = time.time()
    if use_config:
        from TFEA import config
        fasta_file = config.vars['FASTA_FILE']
        md_fasta1 = config.vars['MD_FASTA1']
        md_fasta2 = config.vars['MD_FASTA2']
        mdd_fasta1 = config.vars['MDD_FASTA1']
        mdd_fasta2 = config.vars['MDD_FASTA2']
        ranked_file = config.vars['RANKED_FILE']
        md_bedfile1 = config.vars['MD_BEDFILE1']
        md_bedfile2 = config.vars['MD_BEDFILE2']
        mdd_bedfile1 = config.vars['MDD_BEDFILE1']
        mdd_bedfile2 = config.vars['MDD_BEDFILE2']
        scanner = config.vars['SCANNER']
        md = config.vars['MD']
        largewindow = config.vars['LARGEWINDOW']
        smallwindow = config.vars['SMALLWINDOW']
        genomehits = config.vars['GENOMEHITS']
        fimo_background = config.vars['FIMO_BACKGROUND']
        genomefasta = config.vars['GENOMEFASTA']
        tempdir = config.vars['TEMPDIR']
        fimo_motifs = config.vars['FIMO_MOTIFS']
        singlemotif = config.vars['SINGLEMOTIF']
        fimo_thresh = config.vars['FIMO_THRESH']
        debug = config.vars['DEBUG']
        mdd = config.vars['MDD']
        cpus = config.vars['CPUS']
        jobid = config.vars['JOBID']

    print("Scanning regions using " + scanner + "...", flush=True,
          file=sys.stderr)

    motif_distances = None
    md_distances1 = None
    md_distances2 = None
    mdd_distances1 = None
    mdd_distances2 = None

    #Convert bed files to fasta wherever a fasta file was not supplied
    if not fasta_file and scanner != 'genome hits':
        fasta_file = getfasta(bedfile=ranked_file, genomefasta=genomefasta,
                              tempdir=tempdir, outname='ranked_file.fa')
        if os.stat(fasta_file).st_size == 0:
            raise exceptions.FileEmptyError(
                "Error in SCANNER module. Converting RANKED_FILE to fasta failed."
            )
    # NOTE(review): the md/mdd conversions also run for 'genome hits', where
    # the fasta files are never read - confirm whether that is intended.
    if md:
        if not md_fasta1:
            md_fasta1 = getfasta(bedfile=md_bedfile1, genomefasta=genomefasta,
                                 tempdir=tempdir, outname='md1_fasta.fa')
        if not md_fasta2:
            md_fasta2 = getfasta(bedfile=md_bedfile2, genomefasta=genomefasta,
                                 tempdir=tempdir, outname='md2_fasta.fa')
        if os.stat(md_fasta1).st_size == 0 or os.stat(md_fasta2).st_size == 0:
            raise exceptions.FileEmptyError(
                "Error in SCANNER module. Converting MD bedfiles to fasta failed."
            )
    if mdd:
        if not mdd_fasta1:
            mdd_fasta1 = getfasta(bedfile=mdd_bedfile1,
                                  genomefasta=genomefasta, tempdir=tempdir,
                                  outname='mdd1_fasta.fa')
        if not mdd_fasta2:
            mdd_fasta2 = getfasta(bedfile=mdd_bedfile2,
                                  genomefasta=genomefasta, tempdir=tempdir,
                                  outname='mdd2_fasta.fa')
        if (os.stat(mdd_fasta1).st_size == 0
                or os.stat(mdd_fasta2).st_size == 0):
            raise exceptions.FileEmptyError(
                "Error in SCANNER module. Converting MDD bedfiles to fasta failed."
            )

    #FIMO
    if scanner == 'fimo':
        #Get background file, if none desired set to 'None'
        # NOTE(review): fasta_file is always set at this point, so any truthy
        # fimo_background takes the first branch and the window/int/str
        # branches are only reachable for falsy values (0, ''). Left as is
        # because changing it alters the background model - confirm intent.
        if fasta_file and fimo_background:
            background_file = fasta_markov(tempdir=tempdir,
                                           fastafile=fasta_file, order='1')
        elif fimo_background == 'largewindow':
            background_file = fimo_background_file(window=int(largewindow),
                                                   tempdir=tempdir,
                                                   bedfile=ranked_file,
                                                   genomefasta=genomefasta,
                                                   order='1')
        elif fimo_background == 'smallwindow':
            background_file = fimo_background_file(window=int(smallwindow),
                                                   tempdir=tempdir,
                                                   bedfile=ranked_file,
                                                   genomefasta=genomefasta,
                                                   order='1')
        elif (isinstance(fimo_background, int)
              and not isinstance(fimo_background, bool)):
            background_file = fimo_background_file(window=fimo_background,
                                                   tempdir=tempdir,
                                                   bedfile=ranked_file,
                                                   genomefasta=genomefasta,
                                                   order='1')
        elif isinstance(fimo_background, str):
            background_file = fimo_background
        else:
            background_file = None

        #Get motifs to scan through (None/False both mean "all motifs")
        if singlemotif:
            motif_list = singlemotif.split(',')
        else:
            motif_list = fimo_motif_names(motifdatabase=fimo_motifs)

        def fimo_scan(fasta):
            '''Run fimo for every motif in motif_list on one fasta file.'''
            fimo_keywords = dict(bg_file=background_file, fasta_file=fasta,
                                 tempdir=tempdir, motifdatabase=fimo_motifs,
                                 thresh=fimo_thresh, largewindow=largewindow)
            return multiprocess.main(function=fimo, args=motif_list,
                                     kwargs=fimo_keywords, debug=debug,
                                     jobid=jobid, cpus=cpus)

        #Perform fimo on desired motifs
        print("\tTFEA:", file=sys.stderr)
        motif_distances = fimo_scan(fasta_file)

        #FIMO for md score fasta files
        if md:
            print("\tMD:", file=sys.stderr)
            md_distances1 = fimo_scan(md_fasta1)
            md_distances2 = fimo_scan(md_fasta2)
            if use_config:
                config.vars['MD_DISTANCES1'] = md_distances1
                config.vars['MD_DISTANCES2'] = md_distances2

        if mdd:
            print("\tMDD:", file=sys.stderr)
            print(f'\t Completed: 0/{len(motif_distances)} ', end=' ',
                  file=sys.stderr)
            mdd_distances1 = fimo_scan(mdd_fasta1)
            mdd_distances2 = fimo_scan(mdd_fasta2)
            if use_config:
                config.vars['MDD_DISTANCES1'] = mdd_distances1
                config.vars['MDD_DISTANCES2'] = mdd_distances2

    #HOMER
    elif scanner == 'homer':
        raise exceptions.InputError(
            "Homer scanning is not supported at this time.")

    #GENOME HITS
    elif scanner == 'genome hits':
        #Get motifs to analyze (None/False both mean "all motifs")
        if singlemotif:
            motif_list = [os.path.join(genomehits, motif)
                          for motif in singlemotif.split(',')]
        else:
            motif_list = os.listdir(genomehits)

        def closest_scan(center_file, **extra_keywords):
            '''Run bedtools closest for every motif on one centered bed file.'''
            bedtools_distance_keywords = dict(genomehits=genomehits,
                                              ranked_center_file=center_file,
                                              tempdir=tempdir,
                                              distance_cutoff=largewindow,
                                              **extra_keywords)
            return multiprocess.main(function=bedtools_closest,
                                     args=motif_list,
                                     kwargs=bedtools_distance_keywords,
                                     debug=debug, jobid=jobid, cpus=cpus)

        #Perform bedtools closest to get distances
        ranked_file = get_center(bedfile=ranked_file, outname=ranked_file)
        print("\tTFEA:", file=sys.stderr)
        #Only the ranked scan passes rank_index
        motif_distances = closest_scan(ranked_file, rank_index=3)

        #GENOME HITS for md score bed files
        if md:
            print("\tMD:", file=sys.stderr)
            md_bedfile1 = get_center(bedfile=md_bedfile1, outname=md_bedfile1)
            md_distances1 = closest_scan(md_bedfile1)
            md_bedfile2 = get_center(bedfile=md_bedfile2, outname=md_bedfile2)
            md_distances2 = closest_scan(md_bedfile2)
            if use_config:
                config.vars['MD_DISTANCES1'] = md_distances1
                config.vars['MD_DISTANCES2'] = md_distances2

        if mdd:
            print("\tMDD:", file=sys.stderr)
            print(f'\t Completed: 0/{len(motif_distances)} ', end=' ',
                  file=sys.stderr)
            mdd_bedfile1 = get_center(bedfile=mdd_bedfile1,
                                      outname=mdd_bedfile1)
            mdd_distances1 = closest_scan(mdd_bedfile1)
            mdd_bedfile2 = get_center(bedfile=mdd_bedfile2,
                                      outname=mdd_bedfile2)
            mdd_distances2 = closest_scan(mdd_bedfile2)
            if use_config:
                config.vars['MDD_DISTANCES1'] = mdd_distances1
                config.vars['MDD_DISTANCES2'] = mdd_distances2

    else:
        raise exceptions.InputError("SCANNER option not recognized.")

    if use_config:
        config.vars['MOTIF_DISTANCES'] = motif_distances

    total_time = time.time() - start_time
    if use_config:
        config.vars['SCANNERtime'] = total_time

    print("done in: " + str(datetime.timedelta(seconds=int(total_time))),
          file=sys.stderr)

    if debug:
        multiprocess.current_mem_usage(jobid)

    return (motif_distances, md_distances1, md_distances2, mdd_distances1,
            mdd_distances2)
def run():
    """Command line entry point: parse arguments, handle the test/rerun
    flags, then run the COMBINE, RANK, SCANNER, ENRICHMENT and OUTPUT
    modules in that order.

    Every module is called without arguments; settings are read from
    ``config.vars`` inside each module. The function exits through
    ``sys.exit`` when no arguments are given or when a test/rerun flag is
    handled. The temporary directory is removed at the end unless DEBUG is
    set.
    """
    #Imports
    #==============================================================================
    import sys
    import subprocess
    import shutil
    from pathlib import Path
    #Add TFEA srcdirectory into path
    srcdirectory = Path(__file__).absolute().parent
    # NOTE(review): srcdirectory is a Path object; the Python docs state that
    # only strings should be added to sys.path - consider str(srcdirectory).
    sys.path.insert(0, srcdirectory)

    from TFEA import process_inputs

    #ARGUMENT PARSING
    #==============================================================================
    '''We begin by parsing user arguments. TFEA can be run in two ways and these are not mutually exclusive. TFEA has traditional command line flags that a user may specify. Additionally, a user may provide a configuration file (.ini) with all necessary inputs. Finally, a user may provide both a configuration file and command line flags. In this case, the command line flags will overwrite any redundant options in the configuration file. '''
    #Process user inputs in a separate module
    parser = process_inputs.read_arguments()

    #Display help message when no args are passed.
    if len(sys.argv) == 1:
        parser.print_help()
        sys.exit(1)

    #TEST module
    #==============================================================================
    '''If test flag specified, run unittests and exit. '''
    # parse_args() is called again for every flag read below
    test_install = parser.parse_args().TEST_INSTALL
    if test_install:
        subprocess.call(["python3", srcdirectory / 'test' / 'test_install.py'])
        sys.exit()

    test_full = parser.parse_args().TEST_FULL
    if test_full:
        sbatch = parser.parse_args().SBATCH
        if not sbatch:
            # Run the full test suite locally
            subprocess.call(
                ["python3", srcdirectory / 'test' / 'test_full.py'])
            sys.exit()
        else:
            # Submit the full test suite as a slurm job, logging under test_files
            error_file = str(srcdirectory / 'test' / 'test_files' /
                             'TFEA_test.err')
            output_file = str(srcdirectory / 'test' / 'test_files' /
                              'TFEA_test.out')
            # NOTE(review): the text following "--mail-user=" is masked
            # ("******") in the reviewed copy, so the end of this sbatch call
            # and the start of the print call that follows it are missing.
            # Restore this statement from version control before relying on it.
            subprocess.call([
                "sbatch", "--error=" + error_file, "--output=" + output_file,
                "--mail-user="******"TFEA tests submitted as sbatch job. It can be "
                "monitored using:\ntail -f " + error_file))
            sys.exit()

    #Rerun module
    #==============================================================================
    '''If rerun flag specified, rerun all rerun.sh files in specified directory '''
    rerun = parser.parse_args().RERUN
    if rerun:
        for path in rerun:
            # Recursive search: every rerun.sh below each given path is executed
            for rerun_script in Path(path).glob('**/rerun.sh'):
                subprocess.call(["sh", rerun_script])
        sys.exit()

    #VERIFICATION OF USER INPUTS
    #==============================================================================
    '''This section of the code reads config file and user specified flags, makes sure these are complete and not conflicting and writes them to config.py within TFEA for global use across modules '''
    process_inputs.verify_arguments(parser=parser)

    #CREATING DIRECTORIES
    #==============================================================================
    '''TFEA creates the specified output directory if it doesn't exist. Within the output directory, 3 directories are created: 'temp_files', 'e_and_o', and 'plots'. These contain temporary files, stderr and stdout files, and figures generated by TFEA. This is also the module where the special --sbatch flag is handled '''
    process_inputs.create_directories(srcdirectory=srcdirectory)

    #==============================================================================
    #MAIN SCRIPT
    #==============================================================================

    #SECONDARY IMPORTS
    #==============================================================================
    # Imported here rather than at the top, i.e. only after verify_arguments
    # and create_directories have run
    import multiprocessing as mp
    from TFEA import config
    from TFEA import multiprocess

    #Print starting statements
    #==============================================================================
    print("TFEA start: ", file=sys.stderr)

    #Print multiprocessing information to stderr
    if config.vars['DEBUG']:
        mp.log_to_stderr()
        multiprocess.current_mem_usage(config.vars['JOBID'])

    #COMBINE module
    #==============================================================================
    '''This module is a pre-processing step where a user may specify how to handle multiple bed file inputs. The goal is to arrive at a single bed file to input into subsequent modules. '''
    if config.vars['COMBINE'] != False:
        from TFEA import combine
        combine.main()

    #RANK module
    #==============================================================================
    '''This module decides how to rank regions within the bed files. If genome hits specified then the ranked output will only contain the center of each region (since we will perform bedtools closest later) '''
    if config.vars['RANK'] != False:
        from TFEA import rank
        rank.main()

    #SCANNER module
    #==============================================================================
    '''This module returns motif distances to regions of interest. This is accomplished either by scanning regions on the fly using fimo or homer, or by running bedtools closest on region centers compared to a database of motif hits across the genome. '''
    from TFEA import scanner
    scanner.main()

    #ENRICHMENT module
    #==============================================================================
    '''Where the bulk of TFEA analysis occurs. Some components of plotting module are contained within this enrichment module '''
    from TFEA import enrichment
    enrichment.main()

    #OUTPUT module
    #==============================================================================
    '''A module to write output to either a txt or html file '''
    from TFEA import output
    output.main()

    print("TFEA done. Output in:", config.vars['OUTPUT'], file=sys.stderr)

    #Delete temp_files directory
    #==============================================================================
    # The temporary directory is kept when DEBUG is set
    if not config.vars['DEBUG']:
        shutil.rmtree(config.vars['TEMPDIR'])
def main(use_config=True, motif_distances=None, md_distances1=None,
         md_distances2=None, mdd_distances1=None, mdd_distances2=None,
         enrichment=None, output_type=None, permutations=None, debug=None,
         largewindow=None, smallwindow=None, md=None, mdd=None, cpus=None,
         jobid=None, pvals=None, fcs=None, p_cutoff=None, figuredir=None,
         plotall=False, fimo_motifs=None, meta_profile_dict=None,
         label1=None, label2=None, dpi=None, motif_fpkm=None,
         bootstrap=False, gc=None, plot_format=None):
    '''This is the main script of the ENRICHMENT module. It takes as input a
    list of distances outputted from the SCANNER module and calculates
    enrichment statistics for each motif, optionally followed by MD-Score
    and MDD-Score statistics.

    Parameters
    ----------
    use_config : boolean
        Whether to use a config module to assign variables. When True the
        other arguments are overwritten with the values in config.vars.
    motif_distances : list of lists
        A list containing a list for each motif scanned, as returned by the
        SCANNER module
    md_distances1, md_distances2 : list of lists
        Per-condition motif distances used for the MD-Score (only if md)
    mdd_distances1, mdd_distances2 : list of lists
        Per-condition motif distances used for the MDD-Score (only if mdd)
    enrichment : str
        The type of enrichment analysis to perform. Only 'auc' is supported.
    output_type : str
        Forwarded to auc_simulate_and_plot
    permutations : int
        Number of random shuffling permutations, forwarded to
        auc_simulate_and_plot
    debug : boolean
        Whether to print debug statements specifically within the
        multiprocess module
    largewindow : int
        Forwarded to auc_simulate_and_plot
    smallwindow : int
        A distance cutoff value used within the md score analysis
    md, mdd : boolean
        Whether to calculate MD-Score / MDD-Score statistics
    cpus, jobid
        Passed through to the multiprocess module
    pvals, fcs, label1, label2, bootstrap, plotall, fimo_motifs,
    meta_profile_dict
        Forwarded unchanged to auc_simulate_and_plot. If meta_profile_dict
        is a pathlib.Path it is removed with shutil.rmtree afterwards.
    p_cutoff : float
        Significance cutoff; np.log(PADJCUTOFF) when use_config is True
    figuredir : path
        Directory the global GC plot is written to
    dpi
        Unused by this function; kept for backward compatibility
    motif_fpkm : dict
        Forwarded to auc_simulate_and_plot. Defaults to an empty dict.
    gc : boolean
        Whether to fit a linear regression of the values returned by
        get_auc_gc and pass the per-motif offsets on as gc_correct
    plot_format : str
        File extension used for saved figures

    Returns
    -------
    results : list of lists
        Enrichment statistics for each motif
    md_results : list of lists or None
        MD-Score statistics for each motif (None unless md)
    mdd_results : list of lists or None
        MDD-Score statistics for each motif (None unless mdd)

    Raises
    ------
    InputError
        If the enrichment option is not 'auc'
    '''
    start_time = time.time()
    if motif_fpkm is None:
        # A fresh dict per call; a literal {} default would be shared
        motif_fpkm = {}
    if use_config:
        motif_distances = config.vars['MOTIF_DISTANCES']
        md_distances1 = config.vars['MD_DISTANCES1']
        md_distances2 = config.vars['MD_DISTANCES2']
        mdd_distances1 = config.vars['MDD_DISTANCES1']
        mdd_distances2 = config.vars['MDD_DISTANCES2']
        enrichment = config.vars['ENRICHMENT']
        permutations = config.vars['PERMUTATIONS']
        debug = config.vars['DEBUG']
        largewindow = config.vars['LARGEWINDOW']
        smallwindow = config.vars['SMALLWINDOW']
        pvals = config.vars['PVALS']
        fcs = config.vars['FCS']
        md = config.vars['MD']
        mdd = config.vars['MDD']
        cpus = config.vars['CPUS']
        jobid = config.vars['JOBID']
        p_cutoff = np.log(config.vars['PADJCUTOFF'])
        figuredir = config.vars['FIGUREDIR']
        plotall = config.vars['PLOTALL']
        fimo_motifs = config.vars['FIMO_MOTIFS']
        meta_profile_dict = config.vars['META_PROFILE']
        label1 = config.vars['LABEL1']
        label2 = config.vars['LABEL2']
        output_type = config.vars['OUTPUT_TYPE']
        bootstrap = config.vars['BOOTSTRAP']
        gc = config.vars['GC']
        plot_format = config.vars['PLOT_FORMAT']
        # MOTIF_FPKM is optional in the config
        try:
            motif_fpkm = config.vars['MOTIF_FPKM']
        except KeyError:
            motif_fpkm = {}

    print("Calculating enrichment...", flush=True, file=sys.stderr)

    results = None
    md_results = None
    mdd_results = None
    if enrichment != 'auc':
        raise exceptions.InputError(
            "Enrichment option not recognized or supported.")

    gc_correct = {}
    linear_regression = None
    if gc:
        print('\tCorrecting GC:', file=sys.stderr)
        auc_keywords = dict(fimo_motifs=fimo_motifs)
        motif_gc_auc = multiprocess.main(function=get_auc_gc,
                                         args=motif_distances,
                                         kwargs=auc_keywords, debug=debug,
                                         jobid=jobid, cpus=cpus)

        #Calculate linear regression based on AUC and GC content of motifs
        varx = np.array([entry[2] for entry in motif_gc_auc])
        vary = np.array([entry[1] for entry in motif_gc_auc])
        #Motifs with a NaN in either column are left out of the fit
        mask = ~np.isnan(varx) & ~np.isnan(vary)
        linear_regression = list(stats.linregress(varx[mask], vary[mask]))
        slope, intercept, *_ = linear_regression
        #Loop variable renamed so it no longer overwrites the gc flag
        for key, _, gc_content in motif_gc_auc:
            gc_correct[key] = slope * gc_content + intercept

    print('\tCalculating E-Score:', file=sys.stderr)
    auc_keywords = dict(permutations=permutations, use_config=use_config,
                        output_type=output_type, pvals=pvals,
                        plotall=plotall, p_cutoff=p_cutoff,
                        figuredir=figuredir, largewindow=largewindow,
                        fimo_motifs=fimo_motifs,
                        meta_profile_dict=meta_profile_dict, label1=label1,
                        label2=label2, fcs=fcs, motif_fpkm=motif_fpkm,
                        tests=len(motif_distances), bootstrap=bootstrap,
                        gc_correct=gc_correct, plot_format=plot_format)
    results = multiprocess.main(function=auc_simulate_and_plot,
                                args=motif_distances, kwargs=auc_keywords,
                                debug=debug, jobid=jobid, cpus=cpus)

    plot.plot_global_gc(results, p_cutoff=p_cutoff, title='TFEA GC-Plot',
                        xlabel='Motif GC-content',
                        ylabel='Non-corrected E-Score',
                        savepath=figuredir / ('TFEA_GC.' + plot_format),
                        linear_regression=linear_regression,
                        plot_format=plot_format, x_index=4, y_index=1,
                        c_index=2, p_index=-1, ylimits=[-1, 1])

    if md:
        print('\tMD:', file=sys.stderr)
        md_results = calculate_md(md_distances1=md_distances1,
                                  md_distances2=md_distances2,
                                  smallwindow=smallwindow, jobid=jobid,
                                  cpus=cpus, debug=debug)
        if use_config:
            config.vars['MD_RESULTS'] = md_results

    if mdd:
        print('\tMDD:', file=sys.stderr)
        #MDD reuses the MD calculation on the mdd distance lists
        mdd_results = calculate_md(md_distances1=mdd_distances1,
                                   md_distances2=mdd_distances2,
                                   smallwindow=smallwindow, jobid=jobid,
                                   cpus=cpus, debug=debug)
        if use_config:
            config.vars['MDD_RESULTS'] = mdd_results

    if use_config:
        config.vars['RESULTS'] = results

    total_time = time.time() - start_time
    if use_config:
        config.vars['ENRICHMENTtime'] = total_time

    #Remove large meta profile directory (any Path flavour, not only Posix)
    if isinstance(meta_profile_dict, pathlib.Path):
        shutil.rmtree(meta_profile_dict, ignore_errors=True)

    print("done in: " + str(datetime.timedelta(seconds=int(total_time))),
          file=sys.stderr)

    if debug:
        multiprocess.current_mem_usage(jobid)

    return results, md_results, mdd_results
def main(use_config=True, bed1=None, bed2=None, method=None, tempdir=None,
         md=None, largewindow=None, scanner=None, debug=False, label1=None,
         label2=None, jobid=None):
    '''Combine replicate bed files into a single region file for TFEA.

    This is the main script of the combine function that is called within
    TFEA. When use_config is True every argument is overwritten by the
    corresponding entry in config.vars.

    Parameters
    ----------
    use_config : boolean
        Whether to use a config module to assign variables.
    bed1 : list
        A list of strings specifying full paths to bed files corresponding to
        a single condition (replicates)
    bed2 : list
        A list of strings specifying full paths to bed files corresponding to
        a single condition (replicates)
    method : str
        Method for combining input bed files into a single bed file. One of
        'mumerge', 'mergeall', 'tfitclean', 'intersectall', 'tfitremovesmall'
        or 'intersect/merge'.
    tempdir : path object
        Directory where files will be saved. It is joined to file names with
        the ``/`` operator, so it must be a pathlib.Path-like object.
    md : boolean
        Whether md-score bed files are generated. Forced off when config
        already provides both MD_BEDFILE1 and MD_BEDFILE2.
    largewindow : int
        Half-length of window size to use when generating md-score related
        bed files
    scanner : str
        Scanner method to use in SCANNER module. It is read from config but
        is not otherwise used by the body of this function.
    debug : boolean
        Whether to report memory usage once the module is finished
    label1 : str
        Label of the first condition, forwarded to mumerge
    label2 : str
        Label of the second condition, forwarded to mumerge
    jobid : int or str
        Identifier forwarded to multiprocess.current_mem_usage when debug is
        set

    Returns
    -------
    None - Assigns variables within config if use_config set to True

    Raises
    ------
    InputError
        If method is not one of the recognized COMBINE options
    FileEmptyError
        If any resulting file is empty
    '''
    start_time = time.time()

    # Pre-made MD bed files can only come from config. Bind the names up
    # front so that use_config=False does not fail with a NameError below.
    md_bedfile1 = None
    md_bedfile2 = None
    if use_config:
        bed1 = config.vars['BED1']
        bed2 = config.vars['BED2']
        method = config.vars['COMBINE']
        tempdir = config.vars['TEMPDIR']
        md = config.vars['MD']
        md_bedfile1 = config.vars['MD_BEDFILE1']
        md_bedfile2 = config.vars['MD_BEDFILE2']
        largewindow = config.vars['LARGEWINDOW']
        scanner = config.vars['SCANNER']
        label1 = config.vars['LABEL1']
        label2 = config.vars['LABEL2']
        debug = config.vars['DEBUG']
        jobid = config.vars['JOBID']

    print("Combining Regions...", end=' ', flush=True, file=sys.stderr)

    def save_centered(bedtool, outpath):
        # Center every region, extend it by largewindow, drop invalid
        # intervals and write the result to outpath.
        bedtool.each(center_feature).each(
            extend_feature,
            size=largewindow).remove_invalid().saveas(outpath)

    if md_bedfile1 and md_bedfile2:
        # Both MD bed files were supplied: only re-center them and skip
        # generating MD bed files from bed1/bed2 further down.
        centered_md_bedfile1 = tempdir / 'md_bedfile1.centered.bed'
        centered_md_bedfile2 = tempdir / 'md_bedfile2.centered.bed'
        md = False
        save_centered(BedTool(str(md_bedfile1)), centered_md_bedfile1)
        save_centered(BedTool(str(md_bedfile2)), centered_md_bedfile2)
        if use_config:
            config.vars['MD_BEDFILE1'] = centered_md_bedfile1
            config.vars['MD_BEDFILE2'] = centered_md_bedfile2

    #Use MuMerge to merge bed files
    if method == 'mumerge':
        mumerge_input = tempdir / 'mumerge_input.txt'
        combined_file = tempdir / 'combined_file.mumerge'
        #mumerge returns the path of the bed file it produced
        combined_file = mumerge(mumerge_input, combined_file, bed1=bed1,
                                bed2=bed2, label1=label1, label2=label2)
        clean_combined_file = tempdir / 'combined_file.mumerge.clean.bed'
        BedTool(str(combined_file)).remove_invalid().saveas(
            clean_combined_file)
        combined_file = clean_combined_file

        #For MD, run MuMerge separately on each condition's replicates
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.mumerge"
            md_mumerge_input1 = tempdir / "md_mumerge_input1.txt"
            md_bedfile1 = mumerge(md_mumerge_input1, md_bedfile1, bed1=bed1,
                                  label1=label1, label2=label2)
            md_pybedtool1 = BedTool(str(md_bedfile1))
            md_bedfile1 = tempdir / "md_bedfile1.mumerge.final.bed"
            save_centered(md_pybedtool1, md_bedfile1)

            md_bedfile2 = tempdir / "md_bedfile2.mumerge"
            md_mumerge_input2 = tempdir / "md_mumerge_input2.txt"
            md_bedfile2 = mumerge(md_mumerge_input2, md_bedfile2, bed2=bed2,
                                  label1=label1, label2=label2)
            md_pybedtool2 = BedTool(str(md_bedfile2))
            md_bedfile2 = tempdir / "md_bedfile2.mumerge.final.bed"
            save_centered(md_pybedtool2, md_bedfile2)

    #Merge all bed regions, for MD merge condition replicates
    elif method == 'mergeall':
        combined_file = tempdir / "combined_file.mergeall.bed"
        merged_bed = merge_bed(beds=bed1 + bed2)
        merged_bed.remove_invalid().saveas(combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.merge.bed"
            md_bedfile2 = tempdir / "md_bedfile2.merge.bed"
            md_merged_bed1 = merge_bed(beds=bed1).each(
                featurefuncs.extend_fields, 4)
            md_merged_bed2 = merge_bed(beds=bed2).each(
                featurefuncs.extend_fields, 4)
            save_centered(md_merged_bed1, md_bedfile1)
            save_centered(md_merged_bed2, md_bedfile2)

    elif method == 'tfitclean':
        combined_file = tempdir / "combined_file.tfitclean.bed"
        size_cut = 200
        cleaned_bed = clean_bed(beds=bed1 + bed2, size_cut=size_cut)
        cleaned_bed.remove_invalid().saveas(combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.clean.bed"
            md_bedfile2 = tempdir / "md_bedfile2.clean.bed"
            md_cleaned_bed1 = clean_bed(beds=bed1)
            md_cleaned_bed2 = clean_bed(beds=bed2)
            # Write each condition to its own MD file. Saving to
            # combined_file clobbered the combined regions and left the MD
            # files missing. Centered like the other COMBINE methods.
            save_centered(md_cleaned_bed1, md_bedfile1)
            save_centered(md_cleaned_bed2, md_bedfile2)

    #Intersect all bed regions, for MD intersect condition replicates
    elif method == 'intersectall':
        combined_file = tempdir / 'combined_file.intersectall.bed'
        intersected_bed = intersect_bed(beds=bed1 + bed2)
        intersected_bed.remove_invalid().saveas(combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.intersect.bed"
            md_bedfile2 = tempdir / "md_bedfile2.intersect.bed"
            md_intersected_bed1 = intersect_bed(beds=bed1)
            md_intersected_bed2 = intersect_bed(beds=bed2)
            save_centered(md_intersected_bed1, md_bedfile1)
            save_centered(md_intersected_bed2, md_bedfile2)

    #Merge all regions, filter small regions. For MD perform this for each
    #condition
    elif method == 'tfitremovesmall':
        size_cut = 200
        combined_file = tempdir / "combined_file.mergeallnosmall.bed"
        merged_bed = merge_bed(beds=bed1 + bed2)
        merged_bed.filter(lambda b: b.stop - b.start > size_cut).saveas(
            combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.merge.bed"
            md_bedfile2 = tempdir / "md_bedfile2.merge.bed"
            md_merged_bed1 = merge_bed(beds=bed1)
            md_merged_bed2 = merge_bed(beds=bed2)
            # Write each condition to its own MD file. Saving to
            # combined_file clobbered the combined regions and left the MD
            # files missing. Centered like the other COMBINE methods.
            save_centered(
                md_merged_bed1.filter(
                    lambda b: b.stop - b.start > size_cut), md_bedfile1)
            save_centered(
                md_merged_bed2.filter(
                    lambda b: b.stop - b.start > size_cut), md_bedfile2)

    #Intersect replicates, merge conditions. For MD intersect condition
    #replicates
    elif method == 'intersect/merge':
        combined_file = tempdir / 'combined_file.intermerge.bed'
        intersected_bed1 = intersect_bed(beds=bed1)
        intersected_bed2 = intersect_bed(beds=bed2)
        merged_bed = intersected_bed1.cat(intersected_bed2).merge().sort()
        merged_bed.remove_invalid().saveas(combined_file)
        if md:
            md_bedfile1 = tempdir / "md_bedfile1.intersect.bed"
            md_bedfile2 = tempdir / "md_bedfile2.intersect.bed"
            md_intersected_bed1 = intersect_bed(beds=bed1).each(
                featurefuncs.extend_fields, 4).each(featurefuncs.rename, '1')
            md_intersected_bed2 = intersect_bed(beds=bed2).each(
                featurefuncs.extend_fields, 4).each(featurefuncs.rename, '1')
            save_centered(md_intersected_bed1, md_bedfile1)
            save_centered(md_intersected_bed2, md_bedfile2)

    else:
        raise exceptions.InputError("Error: COMBINE option not recognized.")

    #Check to make sure no files are empty
    if os.stat(combined_file).st_size == 0:
        raise exceptions.FileEmptyError(
            "Error in COMBINE module. Resulting bed file is empty.")

    if md:
        if (os.stat(md_bedfile1).st_size == 0
                or os.stat(md_bedfile2).st_size == 0):
            raise exceptions.FileEmptyError(
                "Error in COMBINE module. Resulting md bed file is empty.")
        if use_config:
            #Assign MD_BEDFILE variables in config
            config.vars['MD_BEDFILE1'] = md_bedfile1
            config.vars['MD_BEDFILE2'] = md_bedfile2

    #Assign COMBINED_FILE variable in config
    if use_config:
        config.vars['COMBINED_FILE'] = combined_file

    #Record time, print
    total_time = time.time() - start_time
    if use_config:
        config.vars['COMBINEtime'] = total_time

    # splitlines() avoids counting the empty string that split('\n') yields
    # after the file's trailing newline.
    region_count = len(combined_file.read_text().splitlines())
    print("done in: " + str(datetime.timedelta(seconds=int(total_time))),
          ". Processing", region_count, "regions", file=sys.stderr)

    if debug:
        multiprocess.current_mem_usage(jobid)