Example #1
    def set_peak_size(self, peak_bed, seqlen=200):
        """Resize all input peaks to a fixed width (200 bp by default).

        Arguments:
            peak_bed {str} -- input peak BED file

        Keyword Arguments:
            seqlen {int} -- peak width (default: {200})

        Returns:
            str -- path to the resized peak BED file
        """
        gsizedic = Genome(self.genome).sizes

        peaks = BedTool(peak_bed)
        fl2 = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)

        for peak in peaks:
            if peak.length != seqlen:
                # center a seqlen window on the peak summit (midpoint)
                summit = (peak.start + peak.end) // 2
                start, end = summit - seqlen // 2, summit + seqlen // 2
            else:
                start, end = peak.start, peak.end

            # skip resized peaks that would extend past the chromosome ends
            if start > 0 and end < int(gsizedic[peak.chrom]):
                fl2.write(f"{peak.chrom}\t{start}\t{end}\n")

        fl2.close()  # flush the buffer so the file is complete on disk
        return fl2.name
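
The core of this helper is centering a fixed-width window on the peak midpoint. A self-contained sketch of that arithmetic, using plain integers instead of pybedtools intervals (the coordinates and chromosome size are made up):

def resize_peak(start, end, chrom_size, seqlen=200):
    # center a seqlen window on the midpoint, as set_peak_size does
    summit = (start + end) // 2
    new_start, new_end = summit - seqlen // 2, summit + seqlen // 2
    # drop windows that would run off the chromosome
    if new_start > 0 and new_end < chrom_size:
        return new_start, new_end
    return None

print(resize_peak(1000, 1600, 50000))  # (1200, 1400)
print(resize_peak(50, 150, 50000))     # None: window would start below 0
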
Example #2
    def mk_peak(self, epeak):
        """Convert a narrowPeak file to 200 bp peaks centered on the summit."""
        epeak200 = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
        with open(epeak) as peakfile, open(epeak200.name, "w") as npeakfile:
            for line in peakfile:
                a = line.split()
                chrm = a[0]
                start = int(a[1])
                # narrowPeak column 10 is the summit offset from the peak start
                summit = int(a[9])
                nsummit = start + summit
                if nsummit < 100:
                    # clamp so the 200 bp window cannot start below 0
                    nsummit = 100
                npeakfile.write(f"{chrm}\t{nsummit-100}\t{nsummit+100}\n")
        return epeak200.name
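
For context, column 10 of a narrowPeak file is the summit offset relative to the peak start, so start + summit is the absolute summit position. A quick sanity check of that arithmetic on an invented narrowPeak line:

line = "chr1\t1000\t1500\tpeak1\t0\t.\t5.0\t-1\t-1\t250\n"
a = line.split()
nsummit = int(a[1]) + int(a[9])      # 1000 + 250 = 1250
print(nsummit - 100, nsummit + 100)  # 1150 1350: the 200 bp window
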
Example #3
    def get_PWMScore(self, fin_regions_fa):
        """Scan for motifs in every peak.

        Arguments:
            fin_regions_fa {str} -- input FASTA file

        Returns:
            str -- path to the motif score file
        """
        pfmscorefile = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
        seqs = [s.split(" ")[0] for s in as_fasta(fin_regions_fa, genome=self.genome).ids]

        s = Scanner(ncpus=self.ncore)
        s.set_motifs(self.pfmfile)
        s.set_threshold(threshold=0.0)
        s.set_genome(self.genome)

        with open(self.pfmfile) as f:
            motifs = read_motifs(f)

        chunksize = 10000  # scan the peaks in chunks of 10k

        with tqdm(total=len(seqs)) as pbar:
            for chunk in range(0, len(seqs), chunksize):
                chunk_seqs = seqs[chunk : chunk + chunksize]
                pfm_score = []
                # use GC-normalized z-scores, since many sequences are GC-enriched;
                # the GimmeMotifs develop branch includes this GC normalization option
                it = s.best_score(chunk_seqs, zscore=True, gc=True)
                for seq, scores in zip(chunk_seqs, it):
                    for motif, score in zip(motifs, scores):
                        pfm_score.append([motif.id, seq, score])
                    pbar.update(1)
                pfm_score = pd.DataFrame(pfm_score, columns=["motif", "enhancer", "zscore"])
                pfm_score = pfm_score.set_index("motif")

                # print("\tCombine")
                pfm_score["zscoreRank"] = minmax_scale(rankdata(pfm_score["zscore"]))
                # When we built model, rank and minmax normalization was used.
                cols = ["enhancer", "zscore", "zscoreRank"]
                write_header = False
                if chunk == 0:
                    write_header = True
                pfm_score[cols].to_csv(pfmscorefile, sep="\t", header=write_header)
                # pbar.update(chunk + chunksize)

        pfmscorefile.close()
        return pfmscorefile.name
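
The per-chunk zscoreRank column is just a rank transform scaled to [0, 1]. A minimal reproduction with toy z-scores (note that ties share a rank):

import numpy as np
from scipy.stats import rankdata
from sklearn.preprocessing import minmax_scale

zscores = np.array([0.5, -1.2, 3.1, 0.5])
print(rankdata(zscores))                # [2.5 1.  4.  2.5]
print(minmax_scale(rankdata(zscores)))  # [0.5 0.  1.  0.5]
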
Example #4
    def get_correlation(self, corrfiles, features):
        df = pd.read_hdf(features)
        df = df[["source_target"]]
        df.source_target = [i.upper() for i in list(df.source_target)]
        df = df.set_index("source_target")

        for i, corrfile in enumerate(corrfiles):
            corr = pd.read_table(corrfile, sep="\t", index_col=0)
            # number the correlation columns corr_file1, corr_file2, ...
            corr = corr.rename(
                columns={corr.columns[0]: "corr_file{}".format(i + 1)})
            df = df.join(corr)

        corr_file = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
        df.to_csv(corr_file, sep="\t")
        corr_file.close()
        return corr_file.name
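
The join pattern here is: anchor a frame on the source_target index, then left-join each correlation table under a numbered column. A self-contained pandas sketch with toy data (the gene pairs are invented):

import pandas as pd

df = pd.DataFrame({"source_target": ["TP53_MDM2", "MYC_CDK4"]})
df = df.set_index("source_target")
corr = pd.DataFrame({"corr": [0.8]}, index=["TP53_MDM2"])
corr = corr.rename(columns={corr.columns[0]: "corr_file1"})
print(df.join(corr))  # MYC_CDK4 gets NaN: absent from this corr file
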
Example #5
    def quantileNormalize(self, bed_input):
        # reference distribution: one sorted value per peak rank
        rank = []
        with open(self.peak_rank) as p:
            for i in p:
                rank.append(float(i[:-1]))

        bed = pd.read_csv(bed_input, header=None, sep="\t")
        # map each score in column 3 to its rank, then to the reference value
        t = np.searchsorted(np.sort(bed[3]), bed[3])
        bed[3] = [rank[i] for i in t]
        # shrink each window by 900 bp on each side (2 kb peaks become 200 bp)
        bed[1] = [int(i) + 900 for i in bed[1].tolist()]
        bed[2] = [int(i) - 900 for i in bed[2].tolist()]

        quantile_bed = NamedTemporaryFile(mode="w",
                                          dir=mytmpdir(),
                                          delete=False)
        bed.to_csv(quantile_bed, sep="\t", header=False, index=False)
        quantile_bed.close()
        return quantile_bed.name
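
The np.searchsorted(np.sort(x), x) idiom maps each value to its position in the sorted array, i.e. its rank, which is then used to look up the reference distribution from peak_rank. A toy demonstration:

import numpy as np

values = np.array([7.0, 1.0, 5.0])
rank = [10.0, 20.0, 30.0]  # reference distribution, sorted ascending
idx = np.searchsorted(np.sort(values), values)
print(idx)                     # [2 0 1]
print([rank[i] for i in idx])  # [30.0, 10.0, 20.0]
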
Example #6
    def get_peakRPKM(self, fin_rpkm):
        # the model was built on ranked and min-max scaled peak intensities
        peaks = pd.read_table(fin_rpkm, names=["chrom", "start", "end", "peakRPKM"])
        peaks["peak"] = (
            peaks["chrom"]
            + ":"
            + peaks["start"].astype(str)
            + "-"
            + peaks["end"].astype(str)
        )
        # use the smallest positive RPKM as a pseudocount to avoid log10(0)
        add = peaks["peakRPKM"][peaks["peakRPKM"] > 0].min()
        peaks["log10_peakRPKM"] = np.log10(peaks["peakRPKM"] + add)
        peaks["peakRPKMScale"] = minmax_scale(peaks["log10_peakRPKM"])
        peaks["peakRPKMRank"] = minmax_scale(rankdata(peaks["log10_peakRPKM"]))

        peakrpkmfile = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
        cols = ["peak", "peakRPKM", "log10_peakRPKM", "peakRPKMScale", "peakRPKMRank"]
        peaks[cols].to_csv(peakrpkmfile, sep="\t", index=False)
        peakrpkmfile.close()

        return peakrpkmfile.name
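
The pseudocount here is data-driven: the smallest positive RPKM is added before the log so zero-signal peaks stay finite. A toy check:

import numpy as np

rpkm = np.array([0.0, 0.5, 2.0])
add = rpkm[rpkm > 0].min()   # 0.5, the smallest positive value
print(np.log10(rpkm + add))  # log10([0.5, 1.0, 2.5]), no -inf
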
Example #7
    def clear_peak_df(self, ddf):
        """
        Collect the unique TFs and enhancers; the enhancer BED file is used
        downstream to filter out peaks in the promoter range.
        """
        # alltfs is made global because get_factorExpression reads it later
        global alltfs
        alltfs = list(set(ddf.factor))

        enhancerbed = pd.DataFrame(set(ddf.enhancer))
        enhancerbed[["chr", "site"]] = enhancerbed[0].str.split(":",
                                                                expand=True)
        enhancerbed[["start", "end"]] = enhancerbed.site.str.split("-",
                                                                   expand=True)
        enhancerbed.drop(columns=[0, "site"], inplace=True)

        enhancerfile = NamedTemporaryFile(mode="w",
                                          dir=mytmpdir(),
                                          delete=False)
        enhancerbed.to_csv(enhancerfile, sep="\t", header=False, index=False)
        enhancerfile.close()
        return alltfs, enhancerfile.name
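
The enhancer IDs are chrom:start-end strings, and the two str.split calls turn them back into BED columns. The same transformation in isolation:

import pandas as pd

bed = pd.DataFrame({0: ["chr1:100-300", "chr2:50-250"]})
bed[["chr", "site"]] = bed[0].str.split(":", expand=True)
bed[["start", "end"]] = bed["site"].str.split("-", expand=True)
print(bed.drop(columns=[0, "site"]))  # chr / start / end columns
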
Example #8
    def runCov(self, bam_input):
        covfile = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
        covcmd = f"multiBamCov -bams {bam_input} -bed {self.peak_2k} > {covfile.name}"
        # shell=True is required for the output redirection; check=True raises
        # if multiBamCov fails instead of leaving an empty coverage file
        subprocess.run(covcmd, shell=True, check=True)
        return covfile.name
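
Since the shell is only needed for the > redirection, the same call also works without shell=True by handing the output file to subprocess.run directly. A sketch, assuming bedtools' multiBamCov is on the PATH; the file names are placeholders:

import subprocess

with open("coverage.bed", "w") as out:
    subprocess.run(
        ["multiBamCov", "-bams", "sample.bam", "-bed", "peaks_2k.bed"],
        stdout=out,
        check=True,  # raise CalledProcessError instead of writing nothing
    )
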
Example #9
    def get_factorExpression(self, fin_expression):
        import numpy as np
        import pandas as pd
        from scipy.stats import rankdata
        from sklearn import preprocessing
        import warnings

        warnings.filterwarnings("ignore")
        factorsExpression = {}

        # alltfs is set globally by clear_peak_df
        for tf in alltfs:
            factorsExpression[tf] = []

        for f in fin_expression:
            with open(f) as fa:
                for line in fa:
                    if not line.startswith("target_id"):
                        gene = line.split("\t")[0].upper()
                        expression = float(line.split("\t")[1])
                        if gene in factorsExpression:
                            # floor tiny values so log10 stays finite
                            if expression < 1e-10:
                                expression = 1e-10
                            factorsExpression[gene].append(
                                np.log10(expression))

        factors_expression_file = NamedTemporaryFile(mode="w",
                                                     dir=mytmpdir(),
                                                     delete=False)
        factors_expression_file.write("#factor\tfactorExpression\n")
        for factor in factorsExpression:
            if len(factorsExpression[factor]) == 0:
                # factors with no expression data default to log10(1e-10)
                factors_expression_file.write("{}\t{}\n".format(
                    factor, np.log10(1e-10)))
            else:
                factors_expression_file.write("{}\t{}\n".format(
                    factor, np.mean(factorsExpression[factor])))
        # close before re-reading, so the buffer is flushed to disk
        factors_expression_file.close()

        scores_df = pd.read_table(factors_expression_file.name,
                                  sep="\t",
                                  index_col=0)
        scaler = preprocessing.MinMaxScaler()
        scores_df["factorExpressionRank"] = scaler.fit_transform(
            rankdata(scores_df["factorExpression"], method="average").reshape(-1, 1))

        scores_df.to_csv(factors_expression_file.name, sep="\t")
        return factors_expression_file.name
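
Per factor, values below 1e-10 are floored before the log, and factors never seen in the expression files default to log10(1e-10) = -10. A toy reproduction of that aggregation:

import numpy as np

values = [0.0, 2.0]  # TPMs for one factor across two expression files
logs = [np.log10(max(v, 1e-10)) for v in values]
print(np.mean(logs))    # mean of [-10.0, log10(2)] ~= -4.85
print(np.log10(1e-10))  # -10.0, the default for unseen factors
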
Example #10
    def get_expression(self,
                       fin_expression,
                       features,
                       min_tpm=1e-10,
                       column="tpm"):
        # NOTE: min_tpm is currently unused; a fixed 1e-5 pseudocount is
        # added before the log2 transform below
        df = pd.read_hdf(features,
                         key="/features",
                         columns=["source_target", "factor", "gene"])
        df.source_target = [i.upper() for i in list(df.source_target)]
        df.gene = [i.upper() for i in list(df.gene)]
        df = df.set_index("source_target")
        # Take mean of all TPMs
        expression = pd.DataFrame(
            pd.concat(
                [
                    pd.read_table(f, index_col=0)[[column]]
                    for f in fin_expression
                ],
                axis=1,
            ).mean(1),
            columns=[column],
        )
        expression.index = [i.upper() for i in list(expression.index)]
        expression[column] = np.log2(expression[column] + 1e-5)
        df = df.join(expression, on="factor")
        df = df.rename(columns={column: "factor_expression"})
        df = df.join(expression, on="gene")
        df = df.rename(columns={column: "target_expression"})

        df = df.dropna()

        for col in ["factor_expression", "target_expression"]:
            df[col + ".scale"] = minmax_scale(df[col])
            df[col + ".rank.scale"] = minmax_scale(rankdata(df[col]))

        expression_file = NamedTemporaryFile(mode="w",
                                             dir=mytmpdir(),
                                             delete=False)
        df.to_csv(expression_file, sep="\t")
        expression_file.close()
        return expression_file.name
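
The double join is worth seeing in isolation: the same expression table is joined once on factor and once on gene, with a rename in between so the two columns stay distinct. Toy data:

import pandas as pd

df = pd.DataFrame({"factor": ["TP53"], "gene": ["MDM2"]})
expr = pd.DataFrame({"tpm": [5.0, 2.0]}, index=["TP53", "MDM2"])
df = df.join(expr, on="factor").rename(columns={"tpm": "factor_expression"})
df = df.join(expr, on="gene").rename(columns={"tpm": "target_expression"})
print(df)  # one row with factor_expression=5.0, target_expression=2.0
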
Example #11
    def aggregate_binding(self, ddf, prom, p, weight):
        # max binding score among enhancers that overlap the promoter
        prom_table = ddf.merge(prom, left_on="enhancer", right_on="loc")
        prom_table = prom_table.groupby(["factor", "gene"])[["binding"]].max()
        prom_table = prom_table.rename(
            columns={"binding": "max_binding_in_promoter"})
        prom_table = prom_table.reset_index()
        prom_table["source_target"] = (prom_table["factor"].map(str) + "_" +
                                       prom_table["gene"].map(str))

        f_table = ddf.merge(p, left_on="enhancer", right_on="loc")
        # count enhancers per factor-gene pair
        sum_enh = f_table.groupby(["factor", "gene"])[["binding"]].count()
        # log-odds vs. mean binding, weighted by 50 kb / distance to the TSS
        f_table["sum_weighted_logodds"] = (f_table["binding"].div(
            f_table["binding"].mean()).apply(
                np.log,
                meta=("binding", np.float64)).rmul(50000).div(f_table["dist"]))
        f_table["sum_logodds"] = (f_table["binding"].div(
            f_table["binding"].mean()).apply(np.log,
                                             meta=("binding", np.float64)))

        # join the per-distance weights produced by distance_weight
        weight = dd.read_csv(weight)
        f_table = f_table.merge(weight, how="left", on="dist")
        f_table["sum_dist_weight"] = f_table["binding"] * f_table["weight"]

        f_table_sum = f_table.groupby(["factor", "gene"]).sum()[[
            "sum_weighted_logodds", "sum_logodds", "binding", "sum_dist_weight"
        ]]
        f_table_max = f_table.groupby(
            ["factor", "gene"])[["binding", "sum_dist_weight"]].max()

        f_table_sum = f_table_sum.rename(columns={"binding": "sum_binding"})
        f_table_max = f_table_max.rename(columns={"binding": "max_binding"})
        f_table_max = f_table_max.rename(
            columns={"sum_dist_weight": "max_sum_dist_weight"})

        sum_enh = sum_enh.rename(columns={"binding": "enhancers"})
        f_table_sum = f_table_sum.reset_index()
        f_table_max = f_table_max.reset_index()

        f_table = f_table.reset_index()
        sum_enh = sum_enh.reset_index()

        f_table_sum["source_target"] = f_table_sum["factor"] + "_" + f_table_sum["gene"]
        f_table_max["source_target"] = f_table_max["factor"] + "_" + f_table_max["gene"]

        f_table["source_target"] = f_table["factor"] + "_" + f_table["gene"]
        sum_enh["source_target"] = sum_enh["factor"] + "_" + sum_enh["gene"]
        # avoid duplicate column names in the outer merges below
        f_table_max = f_table_max.rename(columns={"factor": "factor2"})
        f_table_max = f_table_max.rename(columns={"gene": "gene2"})

        f_table = f_table_sum.merge(f_table_max,
                                    left_on="source_target",
                                    right_on="source_target",
                                    how="outer")
        f_table = f_table.merge(sum_enh,
                                left_on="source_target",
                                right_on="source_target",
                                how="outer")
        f_table = f_table.merge(prom_table,
                                left_on="source_target",
                                right_on="source_target",
                                how="outer")
        f_table = f_table[[
            "source_target",
            "factor",
            "gene",
            "sum_weighted_logodds",
            "sum_dist_weight",
            "sum_logodds",
            "sum_binding",
            "enhancers",
            "max_binding_in_promoter",
            "max_binding",
            "max_sum_dist_weight",
        ]]
        f_table["log_sum_binding"] = (f_table["sum_binding"].add(1e-5).apply(
            np.log, meta=("sum_binding", np.float64)))
        f_table["log_enhancers"] = (f_table["enhancers"].add(1).apply(
            np.log, meta=("enhancers", np.float64)))
        f_table["factor"] = f_table["source_target"].str.replace("_.*", "")
        f_table["gene"] = f_table["source_target"].str.replace(".*_", "")
        f_table["max_binding_in_promoter"] = f_table[
            "max_binding_in_promoter"].fillna(0)

        features_file = NamedTemporaryFile(mode="w",
                                           dir=mytmpdir(),
                                           delete=False)
        # materialize the dask graph once, then write the result to HDF5
        with ProgressBar():
            f_table = f_table.compute(num_workers=self.ncore)
        f_table.to_hdf(features_file.name, key="/features")

        return features_file.name
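
The distance-weighted log-odds per enhancer is log(binding / mean_binding) * 50000 / dist, summed per factor-gene pair. A plain-pandas sketch of that aggregation with toy numbers (the dask version above only adds meta= hints):

import numpy as np
import pandas as pd

t = pd.DataFrame({"factor": ["A", "A"], "gene": ["g1", "g1"],
                  "binding": [2.0, 4.0], "dist": [10000, 50000]})
t["weighted_logodds"] = (
    np.log(t["binding"] / t["binding"].mean()) * 50000 / t["dist"])
print(t.groupby(["factor", "gene"])["weighted_logodds"].sum())
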
Example #12
    def distance_weight(self,
                        include_promoter=False,
                        include_enhancer=True,
                        alpha=1e4,
                        padding=100000,
                        keep1=5000,
                        remove=2000):
        """
        Build a distance weight distribution relative to the TSS.

        alpha is the half-weight distance: at alpha bp the weight is 0.5
        (default 1e4). padding is the full range considered, remove is the
        promoter range that is removed, and keep1 is the range that keeps
        the full binding score.
        """
        u = -math.log(1.0 / 3.0) * 1e5 / alpha

        # the four include_promoter/include_enhancer combinations only
        # change which ranges get weight 1, weight 0, or the sigmoid decay
        promoter_weight = 1 if include_promoter else 0
        enhancer_weight = 1 if include_enhancer else 0

        # promoter range: 1..remove bp from the TSS
        weight1 = pd.DataFrame({
            "weight": [promoter_weight for z in range(1, remove + 1)],
            "dist": range(1, remove + 1),
        })
        # full-score range: remove+1..keep1
        weight2 = pd.DataFrame({
            "weight": [enhancer_weight for z in range(remove + 1, keep1 + 1)],
            "dist": range(remove + 1, keep1 + 1),
        })
        # distal range: keep1+1..padding, sigmoid decay from 1 toward 0
        if include_enhancer:
            decay = [
                2.0 * math.exp(-u * math.fabs(z) / 1e5) /
                (1.0 + math.exp(-u * math.fabs(z) / 1e5))
                for z in range(1, padding - keep1 + 1)
            ]
        else:
            decay = [0 for z in range(1, padding - keep1 + 1)]
        weight3 = pd.DataFrame({
            "weight": decay,
            "dist": range(keep1 + 1, padding + 1),
        })

        weight = pd.concat([weight1, weight2, weight3])

        weightfile = NamedTemporaryFile(mode="w", dir=mytmpdir(), delete=False)
        weight.to_csv(weightfile)
        weightfile.close()
        return weightfile.name
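
A quick check of the decay used in the distal range: with u = -ln(1/3) * 1e5 / alpha, the weight 2*exp(-u*z/1e5) / (1 + exp(-u*z/1e5)) is ~1 near the TSS, exactly 0.5 at z = alpha, and approaches 0 at the far end:

import math

alpha = 1e4
u = -math.log(1.0 / 3.0) * 1e5 / alpha

def w(z):
    return 2.0 * math.exp(-u * z / 1e5) / (1.0 + math.exp(-u * z / 1e5))

print(w(1))      # ~1.0 close to the TSS
print(w(alpha))  # 0.5 at the half-weight distance
print(w(1e5))    # ~0 at 100 kb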