def fetch_intervals(self, gr: GenomeRange):
    """
    Fetch the annotation records overlapping a genome range.

    The range is looked up in ``self.bgz_file`` with ``tabix_query``.  When
    the first query yields nothing, ``gr.change_chrom_names()`` is called and
    the query is repeated once with the updated ``gr.chrom``.

    Parameters
    ----------
    gr : GenomeRange
        Range to query; its ``chrom``, ``start`` and ``end`` attributes are
        read.  ``gr`` is modified in place by ``change_chrom_names()`` when
        the first query returns no rows.

    Returns
    -------
    intervals : pandas.core.frame.DataFrame
        Annotation interval table with the columns ``seqname``, ``source``,
        ``feature``, ``start``, ``end``, ``score``, ``strand``, ``frame``,
        ``attribute`` and ``gene_name``.  ``start`` and ``end`` are cast to
        int; ``gene_name`` is the empty string where no name was found.
    """
    rows = list(tabix_query(self.bgz_file, gr.chrom, gr.start, gr.end))
    if not rows:
        # Retry once after letting the range rewrite its chromosome name.
        gr.change_chrom_names()
        rows = list(tabix_query(self.bgz_file, gr.chrom, gr.start, gr.end))

    columns = [
        'seqname', 'source', 'feature', 'start', 'end',
        'score', 'strand', 'frame', 'attribute',
    ]
    df = pd.DataFrame(rows, columns=columns)
    df['start'] = df['start'].astype(int)
    df['end'] = df['end'].astype(int)

    # The pattern needs a space after the value, then quotes/semicolons are
    # stripped from both ends of the captured text.
    gene_name = df['attribute'].str.extract(
        ".*gene_name (.*?) ").iloc[:, 0].str.strip('\";')
    # Assign the filled Series back instead of calling
    # ``df['gene_name'].fillna(..., inplace=True)``: that form acts on a
    # temporary under pandas Copy-on-Write and leaves the NaNs in ``df``.
    df['gene_name'] = gene_name.fillna("")
    return df
def fetch_intervals(self, gr: GenomeRange):
    """
    Fetch the annotation records overlapping a genome range.

    The range is looked up in ``self.bgz_file`` with ``tabix_query``.  When
    the first query yields nothing, ``gr.change_chrom_names()`` is called and
    the query is repeated once with the updated ``gr.chrom``.

    The ``feature_name`` column is derived from the ``attribute`` column
    according to ``self.properties["name_attr"]`` (default ``"auto"``):

    * ``"auto"`` -- use ``gene_name``; rows without it fall back to
      ``gene_id``, and rows still missing a value fall back to the string
      ``"<seqname>:<start>-<end>"``.
    * anything else -- use the attribute with that literal name; rows
      without it are left as NaN.

    Parameters
    ----------
    gr : GenomeRange
        Range to query; its ``chrom``, ``start`` and ``end`` attributes are
        read.  ``gr`` is modified in place by ``change_chrom_names()`` when
        the first query returns no rows.

    Returns
    -------
    intervals : pandas.core.frame.DataFrame
        Annotation interval table with the columns ``seqname``, ``source``,
        ``feature``, ``start``, ``end``, ``score``, ``strand``, ``frame``,
        ``attribute`` and ``feature_name``.  ``start`` and ``end`` are cast
        to int.
    """
    # Function-scope import keeps this block self-contained.
    import re

    rows = list(tabix_query(self.bgz_file, gr.chrom, gr.start, gr.end))
    if not rows:
        # Retry once after letting the range rewrite its chromosome name.
        gr.change_chrom_names()
        rows = list(tabix_query(self.bgz_file, gr.chrom, gr.start, gr.end))

    columns = [
        'seqname', 'source', 'feature', 'start', 'end',
        'score', 'strand', 'frame', 'attribute',
    ]
    df = pd.DataFrame(rows, columns=columns)
    df['start'] = df['start'].astype(int)
    df['end'] = df['end'].astype(int)

    attributes = df['attribute']
    name_attr = self.properties.get("name_attr", "auto")
    if name_attr == "auto":
        # Each pattern needs a space after the value; surrounding quotes and
        # semicolons are stripped from the captured text.
        feature_name = attributes.str.extract(
            ".*gene_name (.*?) ").iloc[:, 0].str.strip('\";')
        if feature_name.hasnans:
            gene_id = attributes.str.extract(
                ".*gene_id (.*?) ").iloc[:, 0].str.strip('\";')
            feature_name = feature_name.fillna(gene_id)
        if feature_name.hasnans:
            # Last resort: label the record by its coordinates.
            pos_str = (
                df['seqname'].astype(str) + ":"
                + df['start'].astype(str) + "-"
                + df['end'].astype(str)
            )
            feature_name = feature_name.fillna(pos_str)
        df['feature_name'] = feature_name
    else:
        # Escape the configured name so regex metacharacters in it are
        # matched literally instead of altering (or breaking) the pattern.
        pattern = f".*{re.escape(str(name_attr))} (.*?) "
        df['feature_name'] = attributes.str.extract(
            pattern).iloc[:, 0].str.strip('\";')
    return df
def __load(self, gr):
    """
    Return the records overlapping ``gr`` with typed position and p-value.

    Each record yielded by ``tabix_query`` for ``self.bgz_file`` is updated
    in place: the field at index ``self.properties['col_pos']`` is converted
    to ``int`` and the field at index ``self.properties['col_pval']`` to
    ``float``.  The records are returned as a list in query order.
    """
    pos_col = self.properties['col_pos']
    pval_col = self.properties['col_pval']

    def _convert(fields):
        # Mutates the record itself, then hands it back for collection.
        fields[pos_col] = int(fields[pos_col])
        fields[pval_col] = float(fields[pval_col])
        return fields

    hits = tabix_query(self.bgz_file, gr.chrom, gr.start, gr.end)
    return [_convert(fields) for fields in hits]
def __load(self, genome_range):
    """
    Return the records overlapping ``genome_range`` as typed lists.

    ``tabix_query`` is called on ``self.bgz_file`` with ``split=True``.  For
    every record the first field is kept as is, the second and third are
    converted to ``int`` and the fourth to ``float``; any further fields are
    dropped.
    NOTE(review): this looks like a (chrom, start, end, value) layout --
    confirm against the file format.
    """
    query = tabix_query(
        self.bgz_file,
        genome_range.chrom,
        genome_range.start,
        genome_range.end,
        split=True,
    )
    return [
        [fields[0], int(fields[1]), int(fields[2]), float(fields[3])]
        for fields in query
    ]
def load(self, genome_range):
    """
    Return the records overlapping ``genome_range`` as typed lists.

    ``tabix_query`` is called on ``self.bgz_file`` with ``split=True``.  For
    every record the first field is kept as is, the second and third are
    converted to ``int`` and the fourth to ``float``; any further fields are
    dropped.
    NOTE(review): this looks like a (chrom, start, end, value) layout --
    confirm against the file format.
    """
    records = []
    query = tabix_query(
        self.bgz_file,
        genome_range.chrom,
        genome_range.start,
        genome_range.end,
        split=True,
    )
    for fields in query:
        records.append(
            [fields[0], int(fields[1]), int(fields[2]), float(fields[3])]
        )
    return records