def calculate_md(md_distances1=None, md_distances2=None, smallwindow=None,
                 jobid=None, cpus=None, debug=None):
    '''Score paired per-motif distance lists and post-process the scores.

    Both inputs are sorted independently and then paired positionally, so
    the i-th entry of one sorted list is scored against the i-th entry of
    the other (presumably this lines entries up by motif name, their first
    element -- confirm against the caller).

    Parameters
    ----------
    md_distances1 : list of lists
        Per-motif distance lists for the first condition
    md_distances2 : list of lists
        Per-motif distance lists for the second condition
    smallwindow : int
        Forwarded to md_score as the 'smallwindow' keyword
    jobid, cpus, debug
        Forwarded unchanged to multiprocess.main

    Returns
    -------
    md_results
        Whatever md_score_p returns for the collected md_score results
    '''
    ordered_first = sorted(md_distances1)
    ordered_second = sorted(md_distances2)
    # zip stops at the shorter input, exactly as the pairing always did
    paired_distances = list(zip(ordered_first, ordered_second))

    raw_scores = multiprocess.main(function=md_score,
                                   args=paired_distances,
                                   kwargs={'smallwindow': smallwindow},
                                   debug=debug, jobid=jobid, cpus=cpus)
    return md_score_p(raw_scores)
# Scratch experiments for measuring memory usage (kept disabled):
#
# a = []
# for j in range(20000):
#     a.append(['motif'] + [j for j in range(3000)])
#
# current_mem_usage(0)
#
# b = np.array([[j for j in range(3000)] for i in range(20000)])
# current_mem_usage(0)
# b = []
# for j in range(3000):
#     c = np.zeros(20000)
#     for i in range(20000):
#         c[i] = j
#     b.append(('motif', c))
# current_mem_usage(0)
# from multiprocessing import Manager
# manager = Manager()
# b = manager.list(b)

# Report memory usage before the plotting workers run
current_mem_usage(0)

import matplotlib.pyplot as plt
from TFEA import multiprocess


def plot_scatter(arg):
    '''Draw 24 large scatter panels on one figure, then close the figure.

    The argument is ignored; it only exists so the function can be mapped
    over a list of inputs by multiprocess.main.
    '''
    figure = plt.figure(figsize=(15, 15))
    for panel in range(1, 25):
        axis = figure.add_subplot(5, 5, panel)
        # Fresh 20000-element lists are built for every panel on purpose:
        # this script exists to observe memory behaviour
        axis.scatter(list(range(20000)), list(range(20000)))
        axis.set_ylabel('hello')
        axis.set_xlabel('hello')
    plt.close()


multiprocess.main(function=plot_scatter, args=list(range(5)), cpus=5,
                  debug=True, kwargs={}, jobid=0)

# Report memory usage again once the workers have finished
current_mem_usage(0)
def main(use_config=True, fasta_file=False, md_fasta1=False, md_fasta2=False,
         ranked_file=None, md_bedfile1=None, md_bedfile2=None,
         scanner=None, md=None, largewindow=None, smallwindow=None,
         genomehits=None, fimo_background=None, genomefasta=None,
         tempdir=None, fimo_motifs=None, singlemotif=None, fimo_thresh=None,
         debug=None, mdd=None, jobid=None, cpus=None,
         mdd_fasta1=False, mdd_fasta2=False,
         mdd_bedfile1=None, mdd_bedfile2=None):
    '''This is the main script of the SCANNER module. It returns motif
    distances to regions of interest by either scanning fasta files on the
    fly using fimo or by using bedtools closest on a center bed file and a
    database of bed files corresponding to motif hits across the genome

    Parameters
    ----------
    use_config : boolean
        Whether to use a config module to assign variables. When True, the
        other arguments are overwritten with values from config.vars and
        the results are also stored back into config.vars.
    fasta_file : str
        Full path to a fasta file
    md_fasta1 : str
        Full path to a fasta file corresponding to a single condition. Only
        required if md score analysis desired
    md_fasta2 : str
        Full path to a fasta file corresponding to a single condition. Only
        required if md score analysis desired
    ranked_file : str
        Full path to a ranked bed file used in calculating background for
        fimo scanning. Only necessary if fimo scanning desired
    md_bedfile1 : str
        Bed file converted to fasta when md is set and md_fasta1 is not
        given; also the region file for the 'genome hits' scanner
    md_bedfile2 : str
        Same as md_bedfile1, for the second condition
    scanner : str
        Scanning method desired ('fimo' or 'genome hits')
    md : boolean
        Whether md score analysis is desired. If True, requires bed files
        for each condition. These can be generated in the COMBINE module.
    largewindow : int
        Half-length of total window size to use when defining cutoffs for
        how far out to measure motif distances
    smallwindow : int
        Half-length of window size to use when defining cutoffs for
        significant motif hits
    genomehits : str
        Full path to a folder containing bed files of motif hits across the
        genome
    fimo_background : int, str, or boolean
        Defines whether to use a background file when performing fimo motif
        scanning. A user can specify any int for window size, smallwindow,
        largewindow, or False if not desired.
    genomefasta : str
        Full path to a fasta file for desired genome
    tempdir : str
        Full path to a directory where files will be saved
    fimo_motifs : str
        Full path to a .meme formatted motif database
    singlemotif : str or boolean
        Whether to perform scanning on only a subset of motifs. A user can
        specify a single motif or a ',' separated list of motifs. False or
        None scans every available motif.
    fimo_thresh : str
        A float formatted as a string to be used when calling fimo to
        specify the p-value cutoff threshold
    debug : boolean
        Whether to print debug statements specifically within the
        multiprocess module
    mdd : boolean
        Whether the mdd scan (same procedure as md, on the mdd inputs) is
        desired
    jobid, cpus
        Forwarded unchanged to multiprocess.main
    mdd_fasta1, mdd_fasta2 : str
        Fasta files for the mdd scan; generated from the mdd bed files when
        not given
    mdd_bedfile1, mdd_bedfile2 : str
        Bed files for the mdd scan

    Returns
    -------
    motif_distances : list of lists
        A list containing a list for each motif scanned. For each motif,
        the list begins with the motif name as a string and is followed by
        int values corresponding to the motif distance for each region
        (ranked). A '.' value means the motif was not within the given
        region
    md_distances1 : list of lists
        Same layout as motif_distances for the first md input. None unless
        md is set.
    md_distances2 : list of lists
        Same layout as motif_distances for the second md input. None unless
        md is set.
    mdd_distances1 : list of lists
        Same layout as motif_distances for the first mdd input. None unless
        mdd is set.
    mdd_distances2 : list of lists
        Same layout as motif_distances for the second mdd input. None
        unless mdd is set.

    Raises
    ------
    InputError
        If an unknown or unsupported scanner option is specified
    FileEmptyError
        If converting a bed file to fasta produced an empty file
    '''
    start_time = time.time()
    if use_config:
        from TFEA import config
        fasta_file = config.vars['FASTA_FILE']
        md_fasta1 = config.vars['MD_FASTA1']
        md_fasta2 = config.vars['MD_FASTA2']
        mdd_fasta1 = config.vars['MDD_FASTA1']
        mdd_fasta2 = config.vars['MDD_FASTA2']
        ranked_file = config.vars['RANKED_FILE']
        md_bedfile1 = config.vars['MD_BEDFILE1']
        md_bedfile2 = config.vars['MD_BEDFILE2']
        mdd_bedfile1 = config.vars['MDD_BEDFILE1']
        mdd_bedfile2 = config.vars['MDD_BEDFILE2']
        scanner = config.vars['SCANNER']
        md = config.vars['MD']
        largewindow = config.vars['LARGEWINDOW']
        smallwindow = config.vars['SMALLWINDOW']
        genomehits = config.vars['GENOMEHITS']
        fimo_background = config.vars['FIMO_BACKGROUND']
        genomefasta = config.vars['GENOMEFASTA']
        tempdir = config.vars['TEMPDIR']
        fimo_motifs = config.vars['FIMO_MOTIFS']
        singlemotif = config.vars['SINGLEMOTIF']
        fimo_thresh = config.vars['FIMO_THRESH']
        debug = config.vars['DEBUG']
        mdd = config.vars['MDD']
        cpus = config.vars['CPUS']
        jobid = config.vars['JOBID']

    print(f"Scanning regions using {scanner}...", flush=True,
          file=sys.stderr)

    motif_distances = None
    md_distances1 = None
    md_distances2 = None
    mdd_distances1 = None
    mdd_distances2 = None

    # False and None both mean "no motif subset requested". The equality
    # (not identity) test keeps the original `== False` semantics.
    scan_all_motifs = singlemotif in (None, False)

    def run_fimo(fasta, motif_list, background_file):
        '''Run fimo on one fasta file for every motif in motif_list.'''
        fimo_keywords = dict(bg_file=background_file, fasta_file=fasta,
                             tempdir=tempdir, motifdatabase=fimo_motifs,
                             thresh=fimo_thresh, largewindow=largewindow)
        return multiprocess.main(function=fimo, args=motif_list,
                                 kwargs=fimo_keywords, debug=debug,
                                 jobid=jobid, cpus=cpus)

    def run_bedtools(center_file, motif_list, **extra_keywords):
        '''Run bedtools_closest on one centered bed file for every motif.'''
        bedtools_distance_keywords = dict(genomehits=genomehits,
                                          ranked_center_file=center_file,
                                          tempdir=tempdir,
                                          distance_cutoff=largewindow,
                                          **extra_keywords)
        return multiprocess.main(function=bedtools_closest, args=motif_list,
                                 kwargs=bedtools_distance_keywords,
                                 debug=debug, jobid=jobid, cpus=cpus)

    # Convert bed regions to fasta wherever a fasta file was not supplied
    if not fasta_file and scanner != 'genome hits':
        fasta_file = getfasta(bedfile=ranked_file, genomefasta=genomefasta,
                              tempdir=tempdir, outname='ranked_file.fa')
        if os.stat(fasta_file).st_size == 0:
            raise exceptions.FileEmptyError(
                "Error in SCANNER module. Converting RANKED_FILE to fasta failed."
            )
    if md:
        if not md_fasta1:
            md_fasta1 = getfasta(bedfile=md_bedfile1,
                                 genomefasta=genomefasta,
                                 tempdir=tempdir, outname='md1_fasta.fa')
        if not md_fasta2:
            md_fasta2 = getfasta(bedfile=md_bedfile2,
                                 genomefasta=genomefasta,
                                 tempdir=tempdir, outname='md2_fasta.fa')
        if (os.stat(md_fasta1).st_size == 0
                or os.stat(md_fasta2).st_size == 0):
            raise exceptions.FileEmptyError(
                "Error in SCANNER module. Converting MD bedfiles to fasta failed."
            )
    if mdd:
        if not mdd_fasta1:
            mdd_fasta1 = getfasta(bedfile=mdd_bedfile1,
                                  genomefasta=genomefasta,
                                  tempdir=tempdir, outname='mdd1_fasta.fa')
        if not mdd_fasta2:
            mdd_fasta2 = getfasta(bedfile=mdd_bedfile2,
                                  genomefasta=genomefasta,
                                  tempdir=tempdir, outname='mdd2_fasta.fa')
        if (os.stat(mdd_fasta1).st_size == 0
                or os.stat(mdd_fasta2).st_size == 0):
            raise exceptions.FileEmptyError(
                "Error in SCANNER module. Converting MDD bedfiles to fasta failed."
            )

    #FIMO
    if scanner == 'fimo':
        #Get background file, if none desired set to 'None'
        # NOTE(review): fasta_file is always set by this point for the fimo
        # scanner, so any truthy fimo_background takes the first branch and
        # the 'largewindow'/'smallwindow'/int branches look unreachable for
        # those values -- confirm the intended precedence before changing.
        if fasta_file and fimo_background:
            background_file = fasta_markov(tempdir=tempdir,
                                           fastafile=fasta_file, order='1')
        elif fimo_background == 'largewindow':
            background_file = fimo_background_file(
                window=int(largewindow), tempdir=tempdir,
                bedfile=ranked_file, genomefasta=genomefasta, order='1')
        elif fimo_background == 'smallwindow':
            background_file = fimo_background_file(
                window=int(smallwindow), tempdir=tempdir,
                bedfile=ranked_file, genomefasta=genomefasta, order='1')
        # Exact type checks are deliberate: isinstance would treat the
        # boolean False as an int
        elif type(fimo_background) is int:
            background_file = fimo_background_file(
                window=fimo_background, tempdir=tempdir,
                bedfile=ranked_file, genomefasta=genomefasta, order='1')
        elif type(fimo_background) is str:
            background_file = fimo_background
        else:
            background_file = None

        #Get motifs to scan through
        if scan_all_motifs:
            motif_list = fimo_motif_names(motifdatabase=fimo_motifs)
        else:
            motif_list = singlemotif.split(',')

        #Perform fimo on desired motifs
        print("\tTFEA:", file=sys.stderr)
        motif_distances = run_fimo(fasta_file, motif_list, background_file)

        #FIMO for md score fasta files
        if md:
            print("\tMD:", file=sys.stderr)
            md_distances1 = run_fimo(md_fasta1, motif_list, background_file)
            md_distances2 = run_fimo(md_fasta2, motif_list, background_file)
            if use_config:
                config.vars['MD_DISTANCES1'] = md_distances1
                config.vars['MD_DISTANCES2'] = md_distances2

        if mdd:
            print("\tMDD:", file=sys.stderr)
            print(f'\t Completed: 0/{len(motif_distances)} ', end=' ',
                  file=sys.stderr)
            mdd_distances1 = run_fimo(mdd_fasta1, motif_list,
                                      background_file)
            mdd_distances2 = run_fimo(mdd_fasta2, motif_list,
                                      background_file)
            if use_config:
                config.vars['MDD_DISTANCES1'] = mdd_distances1
                config.vars['MDD_DISTANCES2'] = mdd_distances2

    #HOMER
    elif scanner == 'homer':
        raise exceptions.InputError(
            "Homer scanning is not supported at this time.")

    #GENOME HITS
    elif scanner == 'genome hits':
        #Get motifs to analyze
        if scan_all_motifs:
            motif_list = os.listdir(genomehits)
        else:
            motif_list = [os.path.join(genomehits, motif)
                          for motif in singlemotif.split(',')]

        #Perform bedtools closest to get distances
        # get_center writes to the same path it reads from
        ranked_file = get_center(bedfile=ranked_file, outname=ranked_file)
        print("\tTFEA:", file=sys.stderr)
        motif_distances = run_bedtools(ranked_file, motif_list, rank_index=3)

        #GENOME HITS for md score bed files
        if md:
            print("\tMD:", file=sys.stderr)
            md_bedfile1 = get_center(bedfile=md_bedfile1,
                                     outname=md_bedfile1)
            md_distances1 = run_bedtools(md_bedfile1, motif_list)
            md_bedfile2 = get_center(bedfile=md_bedfile2,
                                     outname=md_bedfile2)
            md_distances2 = run_bedtools(md_bedfile2, motif_list)
            if use_config:
                config.vars['MD_DISTANCES1'] = md_distances1
                config.vars['MD_DISTANCES2'] = md_distances2

        if mdd:
            print("\tMDD:", file=sys.stderr)
            print(f'\t Completed: 0/{len(motif_distances)} ', end=' ',
                  file=sys.stderr)
            mdd_bedfile1 = get_center(bedfile=mdd_bedfile1,
                                      outname=mdd_bedfile1)
            mdd_distances1 = run_bedtools(mdd_bedfile1, motif_list)
            mdd_bedfile2 = get_center(bedfile=mdd_bedfile2,
                                      outname=mdd_bedfile2)
            mdd_distances2 = run_bedtools(mdd_bedfile2, motif_list)
            if use_config:
                config.vars['MDD_DISTANCES1'] = mdd_distances1
                config.vars['MDD_DISTANCES2'] = mdd_distances2
    else:
        raise exceptions.InputError("SCANNER option not recognized.")

    if use_config:
        config.vars['MOTIF_DISTANCES'] = motif_distances

    total_time = time.time() - start_time
    if use_config:
        config.vars['SCANNERtime'] = total_time

    print("done in: " + str(datetime.timedelta(seconds=int(total_time))),
          file=sys.stderr)

    if debug:
        multiprocess.current_mem_usage(jobid)

    return (motif_distances, md_distances1, md_distances2, mdd_distances1,
            mdd_distances2)
def main(use_config=True, motif_distances=None, md_distances1=None,
         md_distances2=None, mdd_distances1=None, mdd_distances2=None,
         enrichment=None, output_type=None, permutations=None, debug=None,
         largewindow=None, smallwindow=None, md=None, mdd=None, cpus=None,
         jobid=None, pvals=None, fcs=None, p_cutoff=None, figuredir=None,
         plotall=False, fimo_motifs=None, meta_profile_dict=None,
         label1=None, label2=None, dpi=None, motif_fpkm=None,
         bootstrap=False, gc=None, plot_format=None):
    '''This is the main script of the ENRICHMENT module. It takes as input
    a list of distances outputted from the SCANNER module and calculates an
    enrichment score, a p-value, and in some instances an adjusted p-value
    for each motif.

    Parameters
    ----------
    use_config : boolean
        Whether to use a config module to assign variables. When True, the
        other arguments are overwritten with values from config.vars and
        the results are also stored back into config.vars.
    motif_distances : list of lists
        A list containing a list for each motif scanned. For each motif,
        the list begins with the motif name as a string and is followed by
        int values corresponding to the motif distance for each region
        (ranked). A '.' value means the motif was not within the given
        region
    md_distances1 : list of lists
        Same layout as motif_distances, for the first md condition
    md_distances2 : list of lists
        Same layout as motif_distances, for the second md condition
    mdd_distances1 : list of lists
        Same layout as motif_distances, for the first mdd input
    mdd_distances2 : list of lists
        Same layout as motif_distances, for the second mdd input
    enrichment : str
        The type of enrichment analysis to perform. Only 'auc' is accepted.
    output_type : str
        Determines what some functions will output. At this point, this is
        mostly intended for debug purposes.
    permutations : int
        Number of random shuffling permutations to perform to calculate a
        p-value
    debug : boolean
        Whether to print debug statements specifically within the
        multiprocess module
    largewindow : int
        A distance cutoff value forwarded to the E-Score calculation
    smallwindow : int
        A distance cutoff value used within the md score analysis
    md : boolean
        Whether to calculate md-score results from the md distances
    mdd : boolean
        Whether to calculate md-score results from the mdd distances
    cpus, jobid
        Forwarded unchanged to multiprocess.main
    p_cutoff : float
        Cutoff forwarded to the E-Score calculation and the GC plot. With
        use_config it is the natural log of config.vars['PADJCUTOFF'].
    figuredir : path-like
        Directory for the GC plot; joined to the file name with '/', so it
        must support that operator
    gc : boolean
        Whether to fit a linear regression between two per-motif values
        returned by get_auc_gc (read here as GC content and AUC) and pass
        the resulting per-motif offsets to the E-Score calculation
    meta_profile_dict
        Forwarded to the E-Score calculation. If it is a pathlib path, that
        directory tree is deleted before returning.
    motif_fpkm : dict
        Forwarded to the E-Score calculation. Defaults to an empty dict.
    dpi
        Accepted for interface compatibility; not used by this function
    pvals, fcs, plotall, fimo_motifs, label1, label2, bootstrap, plot_format
        Forwarded unchanged to the E-Score calculation and/or the GC plot

    Returns
    -------
    results : list of lists
        A list of lists corresponding to enrichment statistics for each
        motif
    md_results : list of lists
        A list of lists corresponding to md-score statistics for each
        motif. None unless md is set.
    mdd_results : list of lists
        Same as md_results, computed from the mdd distances. None unless
        mdd is set.

    Raises
    ------
    InputError
        If the enrichment option is anything other than 'auc'
    '''
    start_time = time.time()
    if use_config:
        motif_distances = config.vars['MOTIF_DISTANCES']
        md_distances1 = config.vars['MD_DISTANCES1']
        md_distances2 = config.vars['MD_DISTANCES2']
        mdd_distances1 = config.vars['MDD_DISTANCES1']
        mdd_distances2 = config.vars['MDD_DISTANCES2']
        enrichment = config.vars['ENRICHMENT']
        permutations = config.vars['PERMUTATIONS']
        debug = config.vars['DEBUG']
        largewindow = config.vars['LARGEWINDOW']
        smallwindow = config.vars['SMALLWINDOW']
        pvals = config.vars['PVALS']
        fcs = config.vars['FCS']
        md = config.vars['MD']
        mdd = config.vars['MDD']
        cpus = config.vars['CPUS']
        jobid = config.vars['JOBID']
        p_cutoff = np.log(config.vars['PADJCUTOFF'])
        figuredir = config.vars['FIGUREDIR']
        plotall = config.vars['PLOTALL']
        fimo_motifs = config.vars['FIMO_MOTIFS']
        meta_profile_dict = config.vars['META_PROFILE']
        label1 = config.vars['LABEL1']
        label2 = config.vars['LABEL2']
        output_type = config.vars['OUTPUT_TYPE']
        bootstrap = config.vars['BOOTSTRAP']
        gc = config.vars['GC']
        plot_format = config.vars['PLOT_FORMAT']
        # MOTIF_FPKM is optional in the config
        try:
            motif_fpkm = config.vars['MOTIF_FPKM']
        except KeyError:
            motif_fpkm = {}

    # A fresh dict per call avoids sharing one mutable default object
    if motif_fpkm is None:
        motif_fpkm = {}

    print("Calculating enrichment...", flush=True, file=sys.stderr)

    results = None
    md_results = None
    mdd_results = None

    if enrichment == 'auc':
        # Both stay at these neutral values when GC correction is off
        gc_correct = {}
        linear_regression = None
        if gc:
            print('\tCorrecting GC:', file=sys.stderr)
            auc_keywords = dict(fimo_motifs=fimo_motifs)
            motif_gc_auc = multiprocess.main(function=get_auc_gc,
                                             args=motif_distances,
                                             kwargs=auc_keywords,
                                             debug=debug, jobid=jobid,
                                             cpus=cpus)

            #Calculate linear regression based on AUC and GC content of motifs
            varx = np.array([entry[2] for entry in motif_gc_auc])
            vary = np.array([entry[1] for entry in motif_gc_auc])
            # Motifs with a NaN in either value are left out of the fit
            mask = ~np.isnan(varx) & ~np.isnan(vary)
            linear_regression = list(stats.linregress(varx[mask],
                                                      vary[mask]))
            slope, intercept = linear_regression[:2]
            # The loop variable is deliberately not called `gc`, which
            # would overwrite the function argument
            for key, _, motif_gc in motif_gc_auc:
                gc_correct[key] = slope * motif_gc + intercept

        print('\tCalculating E-Score:', file=sys.stderr)
        auc_keywords = dict(permutations=permutations, use_config=use_config,
                            output_type=output_type, pvals=pvals,
                            plotall=plotall, p_cutoff=p_cutoff,
                            figuredir=figuredir, largewindow=largewindow,
                            fimo_motifs=fimo_motifs,
                            meta_profile_dict=meta_profile_dict,
                            label1=label1, label2=label2, fcs=fcs,
                            motif_fpkm=motif_fpkm,
                            tests=len(motif_distances),
                            bootstrap=bootstrap, gc_correct=gc_correct,
                            plot_format=plot_format)
        results = multiprocess.main(function=auc_simulate_and_plot,
                                    args=motif_distances,
                                    kwargs=auc_keywords,
                                    debug=debug, jobid=jobid, cpus=cpus)

        plot.plot_global_gc(results, p_cutoff=p_cutoff,
                            title='TFEA GC-Plot',
                            xlabel='Motif GC-content',
                            ylabel='Non-corrected E-Score',
                            savepath=figuredir / ('TFEA_GC.' + plot_format),
                            linear_regression=linear_regression,
                            plot_format=plot_format,
                            x_index=4,
                            y_index=1,
                            c_index=2,
                            p_index=-1,
                            ylimits=[-1, 1])
    else:
        raise exceptions.InputError(
            "Enrichment option not recognized or supported.")

    if md:
        print('\tMD:', file=sys.stderr)
        md_results = calculate_md(md_distances1=md_distances1,
                                  md_distances2=md_distances2,
                                  smallwindow=smallwindow,
                                  jobid=jobid, cpus=cpus, debug=debug)
        if use_config:
            config.vars['MD_RESULTS'] = md_results
    if mdd:
        print('\tMDD:', file=sys.stderr)
        mdd_results = calculate_md(md_distances1=mdd_distances1,
                                   md_distances2=mdd_distances2,
                                   smallwindow=smallwindow,
                                   jobid=jobid, cpus=cpus, debug=debug)
        if use_config:
            config.vars['MDD_RESULTS'] = mdd_results

    if use_config:
        config.vars['RESULTS'] = results

    total_time = time.time() - start_time
    if use_config:
        config.vars['ENRICHMENTtime'] = total_time

    #Remove large meta profile file
    if isinstance(meta_profile_dict, pathlib.Path):
        shutil.rmtree(meta_profile_dict, ignore_errors=True)

    print("done in: " + str(datetime.timedelta(seconds=int(total_time))),
          file=sys.stderr)

    if debug:
        multiprocess.current_mem_usage(jobid)

    return results, md_results, mdd_results