コード例 #1
0
def fetch_gene_gtf(gtf_fname: str, gene_ids_fname: str):
    """Load the wormbase_gene table.

    Parses the canonical geneset GTF and yields one dict per row,
    annotated with its locus name, numeric chromosome, integer midpoint
    position, and arm/center classification.
    """
    df = read_gtf_as_dataframe(gtf_fname)
    locus_by_id = get_gene_ids(gene_ids_fname)

    # Normalize the column name: 'seqname' -> 'chrom'.
    df = df.rename(columns={'seqname': 'chrom'})
    # Attach the locus for each gene id (None when no mapping exists).
    df = df.assign(locus=[locus_by_id.get(g) for g in df.gene_id])
    # Numeric chromosome identifier (raises KeyError on unknown chrom).
    df = df.assign(chrom_num=[CHROM_NUMERIC[c] for c in df.chrom])
    # Midpoint of the feature, truncated to an int.
    df = df.assign(pos=(((df.end - df.start) / 2) + df.start).map(int))
    # Replace GTF placeholder values with None.
    df.frame = df.frame.apply(lambda v: None if v == "." else v)
    df.exon_number = df.exon_number.apply(lambda v: None if v == "" else v)
    # Classify each feature as chromosome arm or center by its midpoint.
    df['arm_or_center'] = df.apply(
        lambda r: arm_or_center(r['chrom'], r['pos']), axis=1)
    yield from df.to_dict('records')
コード例 #2
0
def test_ensembl_gtf_gene_names():
    """Gene names parsed from the Ensembl GTF match the expected set."""
    dataframe = read_gtf_as_dataframe(ENSEMBL_GTF_PATH)
    observed = set(dataframe["gene_name"])
    assert observed == EXPECTED_GENE_NAMES, (
        "Wrong gene names: %s, missing %s and unexpected %s" % (
            observed,
            EXPECTED_GENE_NAMES.difference(observed),
            observed.difference(EXPECTED_GENE_NAMES),
        ))
コード例 #3
0
def test_ensembl_gtf_gene_names():
    """The Ensembl GTF's gene_name column equals EXPECTED_GENE_NAMES."""
    frame = read_gtf_as_dataframe(ENSEMBL_GTF_PATH)
    found = set(frame["gene_name"])
    assert found == EXPECTED_GENE_NAMES, (
        "Wrong gene names: %s, missing %s and unexpected %s"
        % (found,
           EXPECTED_GENE_NAMES.difference(found),
           found.difference(EXPECTED_GENE_NAMES)))
コード例 #4
0
    def _load_full_dataframe_from_gtf(self):
        """
        Parse this genome source's GTF file and load it as a Pandas DataFrame
        """
        logger.info("Reading GTF from %s", self.gtf_path)
        dataframe = read_gtf_as_dataframe(
            self.gtf_path,
            column_converters={
                "seqname": normalize_chromosome,
                "strand": normalize_strand,
            },
            infer_biotype_column=True)

        present_features = set(dataframe["feature"])
        available_columns = set(dataframe.keys())

        # Older Ensembl releases lack explicit "gene" and "transcript"
        # feature rows; reconstruct whichever ones are missing.
        if "gene" not in present_features:
            logger.info("Creating missing gene features...")
            # Carry over 'gene_name'/'gene_biotype' only when the GTF
            # actually provides those columns.
            gene_extras = {"gene_name", "gene_biotype"} & available_columns
            dataframe = create_missing_features(
                dataframe=dataframe,
                unique_keys={"gene": "gene_id"},
                extra_columns={"gene": gene_extras},
                missing_value="")
            logger.info("Done.")

        if "transcript" not in present_features:
            logger.info("Creating missing transcript features...")
            transcript_extras = {
                "gene_id",
                "gene_name",
                "gene_biotype",
                "transcript_name",
                "transcript_biotype",
                "protein_id",
            } & available_columns
            dataframe = create_missing_features(
                dataframe=dataframe,
                unique_keys={"transcript": "transcript_id"},
                extra_columns={"transcript": transcript_extras},
                missing_value="")
            logger.info("Done.")

        return dataframe
コード例 #5
0
ファイル: gtf.py プロジェクト: vreuter/pyensembl
    def _load_full_dataframe_from_gtf(self):
        """
        Read this genome's GTF file into a Pandas DataFrame, synthesizing
        "gene" and "transcript" feature rows when the annotation lacks them.
        """
        logger.info("Reading GTF from %s", self.gtf_path)
        converters = {
            "seqname": normalize_chromosome,
            "strand": normalize_strand,
        }
        df = read_gtf_as_dataframe(
            self.gtf_path,
            column_converters=converters,
            infer_biotype_column=True)

        feature_set = set(df["feature"])
        columns = set(df.keys())

        # Older Ensembl releases don't include "gene" feature rows;
        # rebuild them, filling 'gene_name'/'gene_biotype' only if the
        # GTF actually carries those columns.
        if "gene" not in feature_set:
            logger.info("Creating missing gene features...")
            df = create_missing_features(
                dataframe=df,
                unique_keys={"gene": "gene_id"},
                extra_columns={
                    "gene": columns.intersection(
                        {"gene_name", "gene_biotype"}),
                },
                missing_value="")
            logger.info("Done.")

        # Same reconstruction for "transcript" rows.
        if "transcript" not in feature_set:
            logger.info("Creating missing transcript features...")
            wanted = {
                "gene_id",
                "gene_name",
                "gene_biotype",
                "transcript_name",
                "transcript_biotype",
                "protein_id",
            }
            df = create_missing_features(
                dataframe=df,
                unique_keys={"transcript": "transcript_id"},
                extra_columns={"transcript": columns.intersection(wanted)},
                missing_value="")
            logger.info("Done.")

        return df
コード例 #6
0
def mouse_gene_intervals():
    """Build per-(chromosome, strand) interval trees of protein-coding genes.

    Parses the GENCODE mm10 GTF, keeps protein-coding 'gene' rows and
    returns a dict mapping a chromosome+strand string (e.g. '1+') to an
    IntervalTree whose intervals [start-1, end) carry the gene_id.
    """
    df = read_gtf_as_dataframe(GENCODE_MM10_FILE)
    # BUGFIX: each comparison must be parenthesized — `&` binds tighter
    # than `==`, so the unparenthesized original evaluated
    # `'gene' & df.feature_type` first and raised a TypeError.
    # NOTE(review): GENCODE GTFs usually name this column 'gene_type';
    # confirm 'feature_type' exists in this parse.
    df = df[(df.feature == 'gene') & (df.feature_type == 'protein_coding')]
    print(len(df))
    # BUGFIX: keys must be concatenated strings — lookups below use
    # row['seqname'] + row['strand'], but the original keyed the dict by
    # the (chromosome, strand) tuples from product(), guaranteeing a
    # KeyError on the first insertion.
    trees = {chromosome + strand: IntervalTree()
             for chromosome, strand in product(MOUSE_CHROMOSOMES,
                                               ['+', '-'])}
    for _, row in df.iterrows():
        # IntervalTree rejects empty intervals, so skip degenerate rows.
        if row['end'] > row['start']:
            # end is included, start count at 0 instead of 1
            key = row['seqname'] + row['strand']
            trees[key][row['start'] - 1:row['end']] = row['gene_id']

    logging.info('Built mouse exon tree with {} nodes'
                 .format(sum(len(tree) for tree in trees.values())))

    return trees
コード例 #7
0
def tr_gene_map(gtf):
    """Map transcript IDs to gene IDs from a GTF file.

    Returns a DataFrame indexed by 'transcript_id' with a single
    'gene_id' column, one row per transcript feature.
    """
    annotations = read_gtf_as_dataframe(gtf)
    transcripts = annotations[annotations.feature == 'transcript']
    mapping = transcripts.loc[:, ['transcript_id', 'gene_id']]
    return mapping.set_index('transcript_id')
コード例 #8
0
ファイル: test_refseq_gtf.py プロジェクト: hammerlab/gtfparse
def test_read_refseq_gtf_as_dataframe():
    """RefSeq GTF parses into a dataframe with all required columns."""
    _check_required_columns(read_gtf_as_dataframe(REFSEQ_GTF_PATH))
コード例 #9
0
ファイル: data.py プロジェクト: moritzschaefer/pavooc
def read_gencode(genome=GENOME):
    '''
    Buffered gencode read with HAVANA/ENSEMBL merged.

    Swissprot IDs are merged in, start indexing is adjusted (1-based GTF
    starts become 0-based) and only the relevant columns are returned.

    :param genome: genome build, one of 'hg19', 'hg38' or 'mm10'
    :returns: the gencode dataframe with havana and ensembl merged
    :raises ValueError: if `genome` is not a recognized build
    '''
    if genome == 'hg19':
        df = read_gtf_as_dataframe(GENCODE_HG19_FILE)
    elif genome == 'hg38':
        df = read_gtf_as_dataframe(GENCODE_HG38_FILE)
    elif genome == 'mm10':
        df = read_gtf_as_dataframe(GENCODE_MM10_FILE)
    else:
        # Previously an unknown build fell through and crashed later
        # with a NameError on `df`; fail loudly and early instead.
        raise ValueError('unsupported genome: %r' % (genome,))

    df.exon_number = df.exon_number.apply(pd.to_numeric, errors='coerce')
    # Strip trailing version suffixes ('ENSG00000123456.7' -> 'ENSG00000123456').
    # split() is a no-op when no dot is present, unlike the previous
    # v[:v.find('.')] which silently dropped the last character then.
    df.protein_id = df.protein_id.map(lambda v: v.split('.')[0])
    df.exon_id = df.exon_id.map(lambda v: v.split('.')[0])
    df.gene_id = df.gene_id.map(lambda v: v.split('.')[0])
    df.transcript_id = df.transcript_id.map(lambda v: v.split('.')[0])

    # only take protein_coding genes/transcripts/exons
    df = df[
        (df['gene_type'] == 'protein_coding') &
        (df['feature'].isin(['gene', 'transcript', 'exon', 'UTR'])) &
        (df['seqname'].isin(CHROMOSOMES))]
    # drop all transcripts and exons that have no protein_id
    df.drop(df.index[(df.protein_id == '') & (
        df.feature.isin(['exon', 'transcript', 'UTR']))], inplace=True)

    # only take exons and transcripts which contain a basic-tag
    non_basic_transcripts = (df['feature'].isin(['transcript', 'exon', 'UTR'])) & \
        ~(df['tag'].str.contains('basic'))
    df.drop(df.index[non_basic_transcripts], inplace=True)

    # add swissprot id mappings
    protein_id_mapping = load_protein_mapping()
    protein_id_mapping = protein_id_mapping[
        protein_id_mapping.ID_NAME == 'Ensembl_PRO'][
        ['swissprot_id', 'protein_id']]

    df = df.merge(protein_id_mapping, how='left', on='protein_id')

    # delete ENSEMBL entries which come from both, HAVANA and ENSEMBL
    mixed_ids = df[['gene_id', 'source']].drop_duplicates()

    counts = mixed_ids.gene_id.value_counts()
    duplicate_ids = counts.index[counts == 2]
    df.drop(df.index[
        df.gene_id.isin(duplicate_ids) &
        (df.source == 'ENSEMBL')], inplace=True)

    # fix indexing: GTF starts are 1-based inclusive, we use 0-based
    df.start -= 1

    # drop alternative_3or5_UTR transcripts
    # df = df.drop(df.index[df.tag.str.contains('alternative_')])

    # drop all genes which have no transcripts
    valid_genes = df[df['feature'] == 'transcript'].gene_id.drop_duplicates()
    # double check, there are no orphan-exons or so
    assert set(valid_genes) == \
        set(df[df['feature'] == 'exon'].gene_id.drop_duplicates())
    df.drop(df.index[~df.gene_id.isin(valid_genes)], inplace=True)

    # select best transcript
    df = df.groupby('gene_id').apply(_filter_best_transcript)
    df.reset_index(level=0, drop=True, inplace=True)

    return df[[
        'feature', 'gene_id', 'transcript_id',
        'start', 'end', 'exon_id', 'exon_number',
        'gene_name', 'transcript_type', 'strand',
        'gene_type', 'tag', 'protein_id', 'swissprot_id',
        'score', 'seqname', 'source']]
コード例 #10
0
def test_ensembl_gtf_columns():
    """Feature types in the Ensembl GTF match the expected set."""
    parsed = read_gtf_as_dataframe(ENSEMBL_GTF_PATH)
    eq_(set(parsed["feature"]), EXPECTED_FEATURES)
コード例 #11
0
def test_read_stringtie_gtf_as_dataframe():
    """StringTie GTF parses with required columns and string cov/FPKM."""
    parsed = read_gtf_as_dataframe(B16_GTF_PATH)
    _check_required_columns(parsed)
    _check_string_cov_and_FPKM(parsed)
コード例 #12
0
    fig = self.plt.figure()

    cmap = self.plt.cm.tab20
    color = iter(cmap(np.linspace(0, 1, len(data_map))))
    for label, data_arrays in data_map.items():
        self.plt.plot(data_arrays[0], data_arrays[1], marker, label=label, color=next(color))

    if legend:
        self.plt.legend(loc=legend_loc)
    self._set_properties_and_close(fig, title, xlab, ylab)


if __name__ == '__main__':
    args = parser.parse_args()

    # Keep only exon rows; each carries a per-exon coverage value in the
    # 'cov' attribute (StringTie-style GTF).
    df = read_gtf_as_dataframe(args.gtf)
    df_trs = df[df["feature"] == "exon"]

    # et[gene_id][transcript_id][exon_number] -> coverage
    et = defaultdict(lambda: defaultdict(dict))

    for row in df_trs.itertuples():
        et[row.gene_id][row.transcript_id][int(row.exon_number)] = float(row.cov)

    # etp[gene][transcript] -> [exon-number array, log-coverage array]
    etp = defaultdict(lambda: defaultdict(list))
    for gene, trs_info in sorted(et.items(), key=lambda x: x[0]):
        for trs, exon_info in trs_info.items():
            exon_numbers = np.array(list(exon_info.keys()), dtype=int)
            exon_cov = np.array(list(exon_info.values()), dtype=float)
            # exon_cov = exon_cov / np.sum(exon_cov)
            # log(x + 1) compresses the coverage dynamic range for plotting
            exon_cov = np.log(exon_cov + 1)
            etp[gene][trs].extend([exon_numbers, exon_cov])
コード例 #13
0
def load_dataset(drop_locus=True):
    '''
    Load and prepare the Achilles dataset to be processed by azimuth
    feature extraction.

    :param drop_locus: if True, drop the 'Locus' column from the result
    :returns: Xdf, Y, gene_position, target_genes as in azimuth.load_dataset
    '''
    # Join the guide->gene mapping with per-guide activity scores on 'Guide'.
    activity_scores = pd.read_csv(ACHILLES_GUIDE_ACTIVITY_SCORES_FILE,
                                  sep='\t')
    guide_map = pd.read_csv(ACHILLES_GUIDE_MAPPING, sep='\t')
    guide_map.dropna(inplace=True)
    guide_map.rename(index=str, columns={'Gene': 'Target'}, inplace=True)
    # A guide may map to several genes; keep only the first mapping.
    guide_map = guide_map.groupby('Guide').first()
    activity_scores.dropna(inplace=True)
    activity_scores.set_index('Guide', inplace=True)
    df = guide_map.join(activity_scores)

    # TODO why hg38 and not 37
    hg38 = read_gtf_as_dataframe(GENCODE_HG38_FILE)
    hg38 = hg38.loc[(hg38.feature == 'gene')]
    # remove duplicate gene names (chrX, chrY) by using first one
    # this might be inaccurate but shouldn't have a big impact.
    # It affects around 60 datapoints only (out of 70000)
    hg38 = hg38.groupby('gene_name').first().reset_index()

    # fix wrong gene names
    # Truncate versioned IDs to bare Ensembl gene IDs (15 chars), then map
    # legacy symbols to current gene names via GENE_ID_MAPPING. Targets
    # that still can't be resolved become None and are dropped below.
    hg38['gene_id'] = hg38['gene_id'].apply(lambda v: v[:15])
    merged_mapping = GENE_ID_MAPPING.merge(hg38[['gene_id', 'gene_name']],
                                           how='inner',
                                           on='gene_id')
    df.Target = df.Target.apply(lambda gene: gene if (gene == hg38.gene_name).
                                any() else _first_or_none(merged_mapping.loc[
                                    merged_mapping.symbol == gene].gene_name))
    df.dropna(inplace=True)
    #

    # Resolve the 30mer sequence context and strand for each guide from
    # its locus ('<chrom>_<position>_...').
    contexts = df.apply(lambda row: _find_context(row.name,
                                                  *row.Locus.split('_')[:2]),
                        axis=1)

    df['30mer'] = [c[0] for c in contexts]
    df['Strand'] = [c[1] for c in contexts]
    df.dropna(inplace=True)
    # Target labels for azimuth: the per-guide activity score.
    Y = pd.DataFrame({'score_drug_gene_rank': df['Activity']}, index=df.index)

    # calculate percent peptide and 'Amino Acid Cut position'
    df_positions = df.merge(hg38[['gene_name', 'start', 'end']],
                            left_on='Target',
                            right_on='gene_name')
    # Nucleotide cut position is the second '_'-separated locus field.
    nt_cut_position = df_positions.Locus.map(lambda v: int(v.split('_')[1]))
    # Percent peptide: relative position of the cut within the gene body.
    pp = (100.0 * (nt_cut_position - df_positions.start) /
          (df_positions.end - df_positions.start))
    # 'Amino Acid Cut position' is just a very stupid heuristic because I am
    # too lazy to calculate the real value
    aacp = (pp / 100.0) * ((df_positions.end - df_positions.start) / 100)

    df.drop(['Activity'], axis=1, inplace=True)
    if drop_locus:
        df.drop(['Locus'], axis=1, inplace=True)
    gene_position = pd.DataFrame({
        'Percent Peptide': pp,
        'Amino Acid Cut position': aacp
    })
    gene_position.set_index(df.index, inplace=True)
    # Shape the frame to azimuth's expected multi-index:
    # (Sequence, Target, drug); 'drug' is a constant placeholder.
    df.index.name = 'Sequence'
    df['drug'] = 'nodrug'
    target_genes = df['Target'].drop_duplicates()
    df.reset_index(inplace=True)
    df.set_index(['Sequence', 'Target', 'drug'], inplace=True)

    return df, Y, gene_position, target_genes
コード例 #14
0
ファイル: test_refseq_gtf.py プロジェクト: noahpieta/gtfparse
def test_read_refseq_gtf_as_dataframe():
    """Parsing the RefSeq GTF produces all required columns."""
    dataframe = read_gtf_as_dataframe(REFSEQ_GTF_PATH)
    _check_required_columns(dataframe)
コード例 #15
0
def test_ensembl_gtf_columns():
    df = read_gtf_as_dataframe(ENSEMBL_GTF_PATH)
    features = set(df["feature"])
    eq_(features, EXPECTED_FEATURES)
コード例 #16
0
def read_hg38():
    """Read the GENCODE hg38 GTF with gene IDs truncated to 15 chars.

    Truncation strips the version suffix: bare Ensembl gene IDs are
    exactly 15 characters ('ENSG' + 11 digits).
    """
    print('read gencode')
    dataframe = read_gtf_as_dataframe(GENCODE_HG38_FILE)
    dataframe.gene_id = dataframe.gene_id.map(lambda gid: gid[:15])
    return dataframe
コード例 #17
0
def test_read_stringtie_gtf_as_dataframe_float_values():
    """cov/FPKM columns are converted to floats via column_converters."""
    converters = {"cov": float, "FPKM": float}
    parsed = read_gtf_as_dataframe(B16_GTF_PATH,
                                   column_converters=converters)
    _check_required_columns(parsed)
    _check_float_cov_and_FPKM(parsed)