Beispiel #1
0
    def fetch_intervals(self, gr: GenomeRange):
        """
        Query the tabix-indexed file for the records inside ``gr``.

        The region is looked up with ``tabix_query``. When that yields
        nothing, ``gr.change_chrom_names()`` is called and the query is
        repeated once with the resulting ``gr.chrom``.

        Parameters
        ----------
        gr : GenomeRange
            Region to query. Its ``chrom``, ``start`` and ``end`` attributes
            are passed to ``tabix_query``. ``gr.change_chrom_names()`` is
            called on it when the first query is empty (presumably this
            switches the chromosome naming style in place -- confirm
            against ``GenomeRange``).

        Returns
        -------
        intervals : pandas.core.frame.DataFrame
            Annotation interval table with the columns ``seqname``,
            ``source``, ``feature``, ``start``, ``end``, ``score``,
            ``strand``, ``frame`` and ``attribute``, where ``start`` and
            ``end`` are cast to ``int``. An extra ``gene_name`` column holds
            the value extracted from ``attribute`` (surrounding ``"`` and
            ``;`` stripped), or ``""`` when no name could be extracted.
        """
        rows = list(tabix_query(self.bgz_file, gr.chrom, gr.start, gr.end))
        if not rows:
            # Nothing found under the current chromosome name: let the range
            # change its chromosome name and try exactly once more.
            gr.change_chrom_names()
            rows = list(
                tabix_query(self.bgz_file, gr.chrom, gr.start, gr.end))

        columns = [
            'seqname', 'source', 'feature', 'start', 'end', 'score', 'strand',
            'frame', 'attribute'
        ]
        df = pd.DataFrame(rows, columns=columns)
        df['start'] = df['start'].astype(int)
        df['end'] = df['end'].astype(int)
        # The leading greedy ".*" makes the pattern pick the last
        # "gene_name " occurrence; the value runs up to the next space.
        gene_name = df['attribute'].str.extract(
            ".*gene_name (.*?) ").iloc[:, 0].str.strip('\";')
        # Assign the filled Series back instead of calling
        # ``df['gene_name'].fillna(..., inplace=True)``: an in-place call on
        # a chained column selection operates on a temporary object under
        # pandas Copy-on-Write and would leave the NaNs in ``df``.
        df['gene_name'] = gene_name.fillna("")
        return df
Beispiel #2
0
    def fetch_intervals(self, gr: GenomeRange):
        """
        Collect the tabix records inside ``gr`` into an annotation table.

        If the first ``tabix_query`` call yields nothing,
        ``gr.change_chrom_names()`` is called and the query is issued a
        second time.

        Parameters
        ----------
        gr : GenomeRange
            Region to query; ``chrom``, ``start`` and ``end`` are read from
            it, and ``change_chrom_names()`` is called on it when the first
            query is empty.

        Returns
        -------
        intervals : pandas.core.frame.DataFrame
            Annotation interval table with integer ``start`` / ``end`` and a
            ``feature_name`` column. With the ``name_attr`` property left at
            ``"auto"`` (the default) the name is taken from ``gene_name``,
            then ``gene_id``, then a ``seqname:start-end`` string. Otherwise
            it is the value of the configured attribute, left as NaN where
            the attribute is not found.
        """
        records = list(tabix_query(self.bgz_file, gr.chrom, gr.start, gr.end))
        if len(records) == 0:
            # Empty result: retry once after the range changed its
            # chromosome name.
            gr.change_chrom_names()
            records.extend(
                tabix_query(self.bgz_file, gr.chrom, gr.start, gr.end))

        table = pd.DataFrame(
            records,
            columns=[
                'seqname', 'source', 'feature', 'start', 'end', 'score',
                'strand', 'frame', 'attribute'
            ])
        for coord in ('start', 'end'):
            table[coord] = table[coord].astype(int)

        def first_capture(pattern):
            # First capture group of ``pattern`` per row, with surrounding
            # quotes and semicolons removed (NaN where it does not match).
            captured = table['attribute'].str.extract(pattern)
            return captured.iloc[:, 0].str.strip('\";')

        name_attr = self.properties.get("name_attr", "auto")
        if name_attr != "auto":
            # Explicit attribute requested: no fallbacks are applied.
            table['feature_name'] = first_capture(f".*{name_attr} (.*?) ")
            return table

        labels = first_capture(".*gene_name (.*?) ")
        if labels.hasnans:
            # First fallback: the gene_id attribute.
            labels = labels.fillna(first_capture(".*gene_id (.*?) "))
        if labels.hasnans:
            # Last fallback: a "seqname:start-end" position string.
            position = (table['seqname'].astype(str) + ":"
                        + table['start'].astype(str) + "-"
                        + table['end'].astype(str))
            labels = labels.fillna(position)
        table['feature_name'] = labels
        return table
Beispiel #3
0
 def __load(self, gr):
     """
     Return the tabix records inside ``gr`` with two columns type-cast.

     The column indexed by ``self.properties['col_pos']`` is converted to
     ``int`` and the one indexed by ``self.properties['col_pval']`` to
     ``float``; each record is modified in place and collected in order.

     Parameters
     ----------
     gr : object
         Region whose ``chrom``, ``start`` and ``end`` attributes are passed
         to ``tabix_query``.

     Returns
     -------
     rows : list
         The records yielded by ``tabix_query``, after conversion.
     """
     pos_col = self.properties['col_pos']
     pval_col = self.properties['col_pval']

     def typed(fields):
         # Cast in place (position first, then p-value) and hand back the
         # very same record object.
         fields[pos_col] = int(fields[pos_col])
         fields[pval_col] = float(fields[pval_col])
         return fields

     hits = tabix_query(self.bgz_file, gr.chrom, gr.start, gr.end)
     return [typed(fields) for fields in hits]
Beispiel #4
0
 def __load(self, genome_range):
     """
     Return the tabix records inside ``genome_range`` as typed rows.

     Parameters
     ----------
     genome_range : object
         Region whose ``chrom``, ``start`` and ``end`` attributes are passed
         to ``tabix_query`` (called with ``split=True``).

     Returns
     -------
     rows : list of list
         One four-element list per record: the first field unchanged, the
         second and third converted to ``int`` and the fourth to ``float``.
     """
     hits = tabix_query(
         self.bgz_file,
         genome_range.chrom,
         genome_range.start,
         genome_range.end,
         split=True,
     )
     return [
         [fields[0], int(fields[1]), int(fields[2]), float(fields[3])]
         for fields in hits
     ]
Beispiel #5
0
 def load(self, genome_range):
     """
     Return the tabix records inside ``genome_range`` as typed rows.

     Parameters
     ----------
     genome_range : object
         Region whose ``chrom``, ``start`` and ``end`` attributes are passed
         to ``tabix_query`` (called with ``split=True``).

     Returns
     -------
     rows : list of list
         One four-element list per record: the first field unchanged, the
         second and third converted to ``int`` and the fourth to ``float``.
     """
     region = genome_range
     matches = tabix_query(
         self.bgz_file, region.chrom, region.start, region.end, split=True)

     parsed = []
     for fields in matches:
         # Only the first four fields are kept.
         parsed.append(
             [fields[0], int(fields[1]), int(fields[2]), float(fields[3])])
     return parsed