def set_peak_size(self, peak_bed, seqlen=200):
    """Resize all input peaks to a fixed width (default 200 bp).

    Arguments:
        peak_bed {str} -- input peak BED file

    Keyword Arguments:
        seqlen {int} -- desired peak width in bp (default: {200})

    Returns:
        str -- path to the BED file with resized peaks
    """
    gsizedic = Genome(self.genome).sizes
    peaks = BedTool(peak_bed)
    fl2 = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)

    for peak in peaks:
        if peak.length != seqlen:
            # recenter the peak on its midpoint and extend seqlen // 2 to each side
            summit = (peak.start + peak.end) // 2
            start, end = summit - seqlen // 2, summit + seqlen // 2
        else:
            start, end = peak.start, peak.end
        # skip peaks that would extend beyond the chromosome boundaries
        if start > 0 and end < int(gsizedic[peak.chrom]):
            fl2.write(f"{peak.chrom}\t{start}\t{end}\n")

    return fl2.name
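# A minimal, dependency-free sketch of the resizing rule above (chromosome
# bounds check omitted); resize_region is a hypothetical helper, not part of
# this class: recenter each peak on its midpoint, extend seqlen // 2 per side.
def resize_region(start, end, seqlen=200):
    summit = (start + end) // 2
    return summit - seqlen // 2, summit + seqlen // 2

# e.g. a 500 bp peak at chr1:1000-1500 becomes the 200 bp region chr1:1150-1350
assert resize_region(1000, 1500) == (1150, 1350)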
def mk_peak(self, epeak):
    """Convert a narrowPeak file to fixed 200 bp regions centered on each summit.

    Column 10 of a narrowPeak file is the summit offset relative to the peak
    start; the summit is clamped to 100 so the region start cannot go negative.
    """
    epeak200 = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
    with open(epeak) as peakfile, open(epeak200.name, "w") as npeakfile:
        for line in peakfile:
            a = line.split()
            chrm = a[0]
            start = int(a[1])
            summit = int(a[9])
            nsummit = start + summit
            if nsummit < 100:
                nsummit = 100
            npeakfile.write(f"{chrm}\t{nsummit - 100}\t{nsummit + 100}\n")
    return epeak200.name
def get_PWMScore(self, fin_regions_fa):
    """Scan every peak for all motifs in the PFM file.

    Arguments:
        fin_regions_fa {str} -- input FASTA (or region) file

    Returns:
        str -- path to the motif score table
    """
    pfmscorefile = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
    seqs = [s.split(" ")[0] for s in as_fasta(fin_regions_fa, genome=self.genome).ids]

    s = Scanner(ncpus=self.ncore)
    s.set_motifs(self.pfmfile)
    s.set_threshold(threshold=0.0)
    s.set_genome(self.genome)

    with open(self.pfmfile) as f:
        motifs = read_motifs(f)

    chunksize = 10000  # scan 10k peaks at a time
    with tqdm(total=len(seqs)) as pbar:
        for chunk in range(0, len(seqs), chunksize):
            chunk_seqs = seqs[chunk : chunk + chunksize]
            pfm_score = []
            # GC-normalized z-scores are used because many sequences are GC-enriched
            it = s.best_score(chunk_seqs, zscore=True, gc=True)
            for seq, scores in zip(chunk_seqs, it):
                for motif, score in zip(motifs, scores):
                    pfm_score.append([motif.id, seq, score])
                pbar.update(1)

            pfm_score = pd.DataFrame(pfm_score, columns=["motif", "enhancer", "zscore"])
            pfm_score = pfm_score.set_index("motif")
            # rank followed by min-max normalization, matching how the model was trained
            pfm_score["zscoreRank"] = minmax_scale(rankdata(pfm_score["zscore"]))

            # only write the header for the first chunk
            cols = ["enhancer", "zscore", "zscoreRank"]
            pfm_score[cols].to_csv(pfmscorefile, sep="\t", header=(chunk == 0))

    return pfmscorefile.name
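# A minimal sketch of the GimmeMotifs scanning pattern used above; "hg38" and
# "example.pfm" are placeholder names, not files shipped with this code.
# best_score yields, per region, one best-match score per motif; zscore=True
# together with gc=True gives GC%-normalized z-scores.
from gimmemotifs.motif import read_motifs
from gimmemotifs.scanner import Scanner

scanner = Scanner(ncpus=4)
scanner.set_motifs("example.pfm")
scanner.set_genome("hg38")
scanner.set_threshold(threshold=0.0)

with open("example.pfm") as f:
    example_motifs = read_motifs(f)

regions = ["chr1:10000-10200", "chr1:20000-20200"]
for region, scores in zip(regions, scanner.best_score(regions, zscore=True, gc=True)):
    for motif, score in zip(example_motifs, scores):
        print(region, motif.id, score)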
def get_correlation(self, corrfiles, features):
    """Join one or more correlation tables onto the source_target index."""
    df = pd.read_hdf(features)
    df = df[["source_target"]]
    df["source_target"] = df["source_target"].str.upper()
    df = df.set_index("source_target")

    for i, corrfile in enumerate(corrfiles):
        corr = pd.read_table(corrfile, sep="\t", index_col=0)
        corr = corr.rename(columns={corr.columns[0]: f"corr_file{i + 1}"})
        df = df.join(corr)

    corr_file = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
    df.to_csv(corr_file, sep="\t")
    return corr_file.name
def quantileNormalize(self, bed_input):
    """Quantile-normalize peak intensities against a reference distribution.

    self.peak_rank is expected to hold the reference values, one per line,
    sorted ascending; each peak's intensity (column 4) is replaced by the
    reference value at the same rank. The 2 kb regions are also shrunk back
    by 900 bp on each side.
    """
    rank = []
    with open(self.peak_rank) as p:
        for line in p:
            rank.append(float(line.strip()))

    bed = pd.read_csv(bed_input, header=None, sep="\t")
    # map each intensity to its rank within the column, then to the
    # reference value at that rank
    t = np.searchsorted(np.sort(bed[3]), bed[3])
    bed[3] = [rank[i] for i in t]
    bed[1] = [int(i) + 900 for i in bed[1].tolist()]
    bed[2] = [int(i) - 900 for i in bed[2].tolist()]

    quantile_bed = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
    bed.to_csv(quantile_bed, sep="\t", header=False, index=False)
    return quantile_bed.name
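# Worked example (toy values) of the searchsorted-based mapping above,
# assuming the reference distribution is sorted ascending: each value is
# replaced by the reference value at its own rank.
import numpy as np

reference = np.array([1.0, 4.0, 9.0, 16.0])     # reference distribution (sorted)
values = np.array([0.2, 0.9, 0.5, 0.1])         # column 4 of the BED file
idx = np.searchsorted(np.sort(values), values)  # [1, 3, 2, 0]: each value's rank
normalized = reference[idx]                     # [4.0, 16.0, 9.0, 1.0]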
def get_peakRPKM(self, fin_rpkm):
    """Transform peak RPKM values the same way they were transformed when the
    model was trained: log10 with a pseudocount, then min-max scaling of both
    the values and their ranks.
    """
    peaks = pd.read_table(fin_rpkm, names=["chrom", "start", "end", "peakRPKM"])
    peaks["peak"] = (
        peaks["chrom"]
        + ":"
        + peaks["start"].astype(str)
        + "-"
        + peaks["end"].astype(str)
    )
    # use the smallest positive RPKM as a pseudocount so log10 is defined at 0
    add = peaks["peakRPKM"][peaks["peakRPKM"] > 0].min()
    peaks["log10_peakRPKM"] = np.log10(peaks["peakRPKM"] + add)
    peaks["peakRPKMScale"] = minmax_scale(peaks["log10_peakRPKM"])
    peaks["peakRPKMRank"] = minmax_scale(rankdata(peaks["log10_peakRPKM"]))

    peakrpkmfile = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
    cols = ["peak", "peakRPKM", "log10_peakRPKM", "peakRPKMScale", "peakRPKMRank"]
    peaks[cols].to_csv(peakrpkmfile, sep="\t", index=False)
    return peakrpkmfile.name
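# Worked example (toy values) of the transform above: pseudocount, log10,
# then [0, 1] scaling of both the values and their ranks.
import numpy as np
from scipy.stats import rankdata
from sklearn.preprocessing import minmax_scale

rpkm = np.array([0.0, 0.5, 2.0, 8.0])
add = rpkm[rpkm > 0].min()                  # 0.5
log_rpkm = np.log10(rpkm + add)             # log10([0.5, 1.0, 2.5, 8.5])
scaled = minmax_scale(log_rpkm)             # 0.0 ... 1.0, spacing preserved
ranked = minmax_scale(rankdata(log_rpkm))   # [0.0, 1/3, 2/3, 1.0]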
def clear_peak_df(self, ddf):
    """Collect the set of TFs in the binding table and write the enhancer
    coordinates to a BED file, used downstream to filter out enhancer peaks
    in promoter ranges.
    """
    global alltfs
    alltfs = list(set(ddf.factor))

    enhancerbed = pd.DataFrame(set(ddf.enhancer))
    # split "chrom:start-end" strings into separate BED columns
    enhancerbed[["chr", "site"]] = enhancerbed[0].str.split(":", expand=True)
    enhancerbed[["start", "end"]] = enhancerbed.site.str.split("-", expand=True)
    enhancerbed.drop(columns=[0, "site"], inplace=True)

    enhancerfile = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
    enhancerbed.to_csv(enhancerfile, sep="\t", header=False, index=False)
    return alltfs, enhancerfile.name
def runCov(self, bam_input):
    """Count reads per peak with bedtools multiBamCov."""
    covfile = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
    covcmd = f"multiBamCov -bams {bam_input} -bed {self.peak_2k} > {covfile.name}"
    subprocess.run(covcmd, shell=True, check=True)
    return covfile.name
def get_factorExpression(self, fin_expression):
    """Average the log10 expression of every TF over all expression files.

    TFs absent from the expression files get the floor value
    log10(1e-10) = -10.
    """
    factorsExpression = {tf: [] for tf in alltfs}
    for f in fin_expression:
        with open(f) as fa:
            for line in fa:
                if not line.startswith("target_id"):
                    gene = line.split("\t")[0].upper()
                    expression = float(line.split("\t")[1])
                    if gene in factorsExpression:
                        # floor tiny values so log10 stays finite
                        if expression < 1e-10:
                            expression = 1e-10
                        factorsExpression[gene].append(np.log10(expression))

    factors_expression_file = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
    factors_expression_file.write("#factor\tfactorExpression\n")
    for factor, values in factorsExpression.items():
        if len(values) == 0:
            factors_expression_file.write(f"{factor}\t{np.log10(1e-10)}\n")
        else:
            factors_expression_file.write(f"{factor}\t{np.mean(values)}\n")
    # close before re-reading, otherwise the last lines may still be buffered
    factors_expression_file.close()

    scores_df = pd.read_table(factors_expression_file.name, sep="\t", index_col=0)
    scores_df["factorExpressionRank"] = minmax_scale(
        rankdata(scores_df["factorExpression"], method="average")
    )
    scores_df.to_csv(factors_expression_file.name, sep="\t")
    return factors_expression_file.name
def get_expression(self, fin_expression, features, min_tpm=1e-10, column="tpm"):
    """Join factor and target expression (mean log2 TPM over all expression
    files) onto the source_target feature table.
    """
    df = pd.read_hdf(features, key="/features", columns=["source_target", "factor", "gene"])
    df["source_target"] = df["source_target"].str.upper()
    df["gene"] = df["gene"].str.upper()
    df = df.set_index("source_target")

    # take the mean TPM over all expression files
    expression = pd.DataFrame(
        pd.concat(
            [pd.read_table(f, index_col=0)[[column]] for f in fin_expression],
            axis=1,
        ).mean(1),
        columns=[column],
    )
    expression.index = expression.index.str.upper()
    expression[column] = np.log2(expression[column] + 1e-5)

    df = df.join(expression, on="factor")
    df = df.rename(columns={column: "factor_expression"})
    df = df.join(expression, on="gene")
    df = df.rename(columns={column: "target_expression"})
    df = df.dropna()

    for col in ["factor_expression", "target_expression"]:
        df[col + ".scale"] = minmax_scale(df[col])
        df[col + ".rank.scale"] = minmax_scale(rankdata(df[col]))

    expression_file = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
    df.to_csv(expression_file, sep="\t")
    return expression_file.name
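# Toy illustration of the expression transform above, assuming two sample TPM
# tables already loaded as DataFrames (tpm1/tpm2 are hypothetical data):
# average the TPMs across samples, then log2 with a pseudocount.
import numpy as np
import pandas as pd

tpm1 = pd.DataFrame({"tpm": [10.0, 0.0]}, index=["TP53", "GATA3"])
tpm2 = pd.DataFrame({"tpm": [30.0, 2.0]}, index=["TP53", "GATA3"])
mean_tpm = pd.concat([tpm1, tpm2], axis=1).mean(1).to_frame("tpm")  # 20.0, 1.0
mean_tpm["tpm"] = np.log2(mean_tpm["tpm"] + 1e-5)  # pseudocount avoids log2(0)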
def aggregate_binding(self, ddf, prom, p, weight):
    """Aggregate per-enhancer binding scores into per factor-gene features:
    enhancer counts, sums, maxima and distance-weighted sums of the binding
    signal, plus the maximum binding score within the promoter.
    """
    # maximum binding score within the promoter of each gene
    prom_table = ddf.merge(prom, left_on="enhancer", right_on="loc")
    prom_table = prom_table.groupby(["factor", "gene"])[["binding"]].max()
    prom_table = prom_table.rename(columns={"binding": "max_binding_in_promoter"})
    prom_table = prom_table.reset_index()
    prom_table["source_target"] = (
        prom_table["factor"].map(str) + "_" + prom_table["gene"].map(str)
    )

    f_table = ddf.merge(p, left_on="enhancer", right_on="loc")
    # number of enhancers per factor-gene pair
    sum_enh = f_table.groupby(["factor", "gene"])[["binding"]].count()
    f_table["sum_weighted_logodds"] = (
        f_table["binding"]
        .div(f_table["binding"].mean())
        .apply(np.log, meta=("binding", np.float64))
        .rmul(50000)
        .div(f_table["dist"])
    )
    f_table["sum_logodds"] = (
        f_table["binding"]
        .div(f_table["binding"].mean())
        .apply(np.log, meta=("binding", np.float64))
    )
    weight = dd.read_csv(weight)
    f_table = f_table.merge(weight, how="left", on="dist")
    f_table["sum_dist_weight"] = f_table["binding"] * f_table["weight"]

    f_table_sum = f_table.groupby(["factor", "gene"]).sum()[
        ["sum_weighted_logodds", "sum_logodds", "binding", "sum_dist_weight"]
    ]
    f_table_max = f_table.groupby(["factor", "gene"])[["binding", "sum_dist_weight"]].max()
    f_table_sum = f_table_sum.rename(columns={"binding": "sum_binding"})
    f_table_max = f_table_max.rename(
        columns={"binding": "max_binding", "sum_dist_weight": "max_sum_dist_weight"}
    )
    sum_enh = sum_enh.rename(columns={"binding": "enhancers"})

    f_table_sum = f_table_sum.reset_index()
    f_table_max = f_table_max.reset_index()
    sum_enh = sum_enh.reset_index()
    f_table_sum["source_target"] = f_table_sum["factor"] + "_" + f_table_sum["gene"]
    f_table_max["source_target"] = f_table_max["factor"] + "_" + f_table_max["gene"]
    sum_enh["source_target"] = sum_enh["factor"] + "_" + sum_enh["gene"]

    # avoid duplicate column names in the outer merges below
    f_table_max = f_table_max.rename(columns={"factor": "factor2", "gene": "gene2"})
    f_table = f_table_sum.merge(f_table_max, on="source_target", how="outer")
    f_table = f_table.merge(sum_enh, on="source_target", how="outer")
    f_table = f_table.merge(prom_table, on="source_target", how="outer")
    f_table = f_table[
        [
            "source_target",
            "factor",
            "gene",
            "sum_weighted_logodds",
            "sum_dist_weight",
            "sum_logodds",
            "sum_binding",
            "enhancers",
            "max_binding_in_promoter",
            "max_binding",
            "max_sum_dist_weight",
        ]
    ]
    f_table["log_sum_binding"] = (
        f_table["sum_binding"].add(1e-5).apply(np.log, meta=("sum_binding", np.float64))
    )
    f_table["log_enhancers"] = (
        f_table["enhancers"].add(1).apply(np.log, meta=("enhancers", np.float64))
    )
    f_table["factor"] = f_table["source_target"].str.replace("_.*", "", regex=True)
    f_table["gene"] = f_table["source_target"].str.replace(".*_", "", regex=True)
    f_table["max_binding_in_promoter"] = f_table["max_binding_in_promoter"].fillna(0)

    features_file = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
    # compute once and keep the result; discarding it would trigger a second
    # full computation in to_hdf below
    with ProgressBar():
        f_table = f_table.compute(num_workers=self.ncore)
    f_table.to_hdf(features_file.name, key="/features")
    return features_file.name
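# Minimal pandas sketch (toy data) of the core aggregation above: for each
# factor-gene pair, count the enhancers and take the sum and max of binding.
import pandas as pd

toy = pd.DataFrame({
    "factor": ["TF1", "TF1", "TF2"],
    "gene": ["G1", "G1", "G1"],
    "binding": [0.2, 0.8, 0.5],
})
agg = toy.groupby(["factor", "gene"])["binding"].agg(
    enhancers="count", sum_binding="sum", max_binding="max"
).reset_index()
# TF1_G1: 2 enhancers, sum 1.0, max 0.8; TF2_G1: 1 enhancer, sum/max 0.5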
def distance_weight(self, include_promoter=False, include_enhancer=True,
                    alpha=1e4, padding=100000, keep1=5000, remove=2000):
    """Build the distance-based weight distribution relative to the TSS.

    alpha   -- decay parameter: the weight drops to 0.5 at alpha bp past
               the keep1 boundary
    padding -- maximum distance from the TSS that is considered
    remove  -- promoter range (dist 1..remove)
    keep1   -- range that keeps the full binding score (dist remove+1..keep1)
    """
    u = -math.log(1.0 / 3.0) * 1e5 / alpha
    promoter_weight = 1 if include_promoter else 0
    enhancer_weight = 1 if include_enhancer else 0

    # promoter range: constant 0 or 1
    weight1 = pd.DataFrame({
        "weight": [promoter_weight for z in range(1, remove + 1)],
        "dist": range(1, remove + 1),
    })
    # full-score range: constant 0 or 1
    weight2 = pd.DataFrame({
        "weight": [enhancer_weight for z in range(remove + 1, keep1 + 1)],
        "dist": range(remove + 1, keep1 + 1),
    })
    # beyond keep1: logistic-style decay from 1 towards 0
    weight3 = pd.DataFrame({
        "weight": [
            enhancer_weight
            * 2.0
            * math.exp(-u * math.fabs(z) / 1e5)
            / (1.0 + math.exp(-u * math.fabs(z) / 1e5))
            for z in range(1, padding - keep1 + 1)
        ],
        "dist": range(keep1 + 1, padding + 1),
    })

    weight = pd.concat([weight1, weight2, weight3])
    weightfile = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
    weight.to_csv(weightfile)
    return weightfile.name
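# Quick check of the decay used above: with u = -ln(1/3) * 1e5 / alpha, the
# weight 2*exp(-u*z/1e5) / (1 + exp(-u*z/1e5)) equals 1 at z = 0 and 0.5 at
# z = alpha (i.e. alpha bp past the keep1 boundary).
import math

alpha = 1e4
u = -math.log(1.0 / 3.0) * 1e5 / alpha

def decay(z):
    e = math.exp(-u * abs(z) / 1e5)
    return 2.0 * e / (1.0 + e)

assert abs(decay(0) - 1.0) < 1e-9
assert abs(decay(alpha) - 0.5) < 1e-9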