def get_df(self, window=100):
    """Build a per-sequence summary table and cache a copy on ``self._df``.

    For every name in ``self.fasta.names`` the table holds the NaN-ignoring
    mean of the windowed GC values returned by ``tools._base_content``,
    the sequence length, the name, and two columns (``nread`` and
    ``covStat``) that stay at 0 unless ``self.mode == "canu"``.

    :param int window: window size forwarded to ``tools._base_content``.
    :return: :class:`pandas.DataFrame` with columns ``GC``, ``length``,
        ``name``, ``nread``, ``covStat`` and, when ``self.bam`` is truthy,
        ``chromosome``.
    :raises IndexError: in "canu" mode, when a FASTA comment has no token
        starting with ``reads`` or ``covStat``, or that token has no ``=``.
    """
    print("building GC content")
    gc_by_name = tools._base_content(self.filename, window, "GC")
    names = self.fasta.names
    lengths = self.fasta.lengths
    mean_gc = [np.nanmean(gc_by_name[name]) for name in names]

    n_seq = len(mean_gc)
    nreads = [0] * n_seq
    covStats = [0] * n_seq

    def _value_of(tokens, prefix):
        # First token starting with *prefix*; keep the piece that follows
        # its first "=" (second element of the "="-split).
        matches = [tok for tok in tokens if tok.startswith(prefix)]
        return matches[0].split("=")[1]

    if self.mode == "canu":
        # The per-contig values are read from "key=value" tokens found in
        # each FASTA comment.
        for idx, comment in enumerate(self.fasta.comments):
            tokens = comment.split()
            reads_text = _value_of(tokens, "reads")
            cov_text = _value_of(tokens, "covStat")
            nreads[idx] = int(reads_text)
            covStats[idx] = float(cov_text)

    columns = {
        "GC": list(mean_gc),
        "length": lengths,
        "name": names,
        "nread": nreads,
        "covStat": covStats,
    }
    df = pd.DataFrame(columns)

    # Deal with the BAM file: keep only rows whose flag is 0 or 16, index
    # them by query name, and look up the reference name of each sequence.
    if self.bam:
        alignments = (
            self.bam.get_df().query("flag in [0,16]").set_index("qname")
        )
        rnames = alignments.loc[self.fasta.names]["rname"]
        df["chromosome"] = list(rnames)

    # The cache gets its own copy; the caller receives the original frame.
    self._df = df.copy()
    return df
def get_gc(self, window=100):
    """Return the mean windowed GC value of each sequence, times 100.

    The per-window values come from ``tools._base_content`` and are
    averaged with :func:`numpy.nanmean`, so NaN windows are ignored.
    NOTE(review): presumably the helper returns fractions in [0, 1], which
    would make the result a percentage -- confirm against
    ``tools._base_content``.

    :param int window: window size forwarded to ``tools._base_content``.
    :return: list with one value per entry of ``self.fasta.names``, in the
        same order.
    """
    data = tools._base_content(self.filename, window, "GC")
    return [100 * np.nanmean(data[name]) for name in self.fasta.names]