import pandas as pd
import numpy as np
import subprocess
import sys


def ac_to_name(chromosome_dic, chromosome):
    """Return the name stored for ``chromosome``, guaranteed to start with 'chr'.

    Args:
        chromosome_dic: Mapping that is indexed with ``chromosome``; the
            stored values must be strings.
        chromosome: Key to look up in ``chromosome_dic``.

    Returns:
        The looked-up value unchanged when it already begins with 'chr',
        otherwise the value with 'chr' prepended.

    Raises:
        Whatever the mapping raises for a missing key (``KeyError`` for a
        plain ``dict``).
    """
    label = chromosome_dic[chromosome]
    return label if label.startswith('chr') else 'chr' + label


# Command-line inputs: argv[1] is the annotation file handed to zcat below,
# argv[2] is the value passed to make_ac_name_map.
gff = sys.argv[1]
genome = sys.argv[2]
# NOTE(review): make_ac_name_map is neither defined nor imported in the
# visible part of this file -- confirm where it comes from. Its result is
# used as a mutable mapping keyed by accession string (see next line).
chrome_dic = make_ac_name_map(genome)
# Add (or override) the entry for accession NC_012920.1.
# NOTE(review): presumably the mitochondrial sequence, going by the 'chrM'
# label -- confirm.
chrome_dic['NC_012920.1'] = 'chrM_NC_012920.1'
# read and extract info
# The file is decompressed with zcat and every line that contains '#'
# anywhere (not only at the start) is dropped by `grep -v`.
# NOTE(review): the path is concatenated into a shell command unquoted, so a
# path with spaces or shell metacharacters will break or be interpreted by the
# shell. subprocess.getoutput also returns error text as part of the output,
# so a failing zcat would feed its message to the parser -- verify inputs.
annotation = subprocess.getoutput('zcat ' + gff + ' | grep -v "#"')
# Parse as header-less TSV; columns are therefore labelled 0, 1, 2, ...
# NOTE(review): StringIO is not imported in the visible part of this file --
# confirm that `from io import StringIO` exists elsewhere.
annotation_df = pd.read_csv(StringIO(annotation), sep='\t', header=None)
# Pull individual key=value attributes out of column 8 (the ninth field).
# The first three patterns capture lazily up to the next ';', so an attribute
# that is last on the line without a trailing ';' will not match.
annotation_df['gene'] = annotation_df[8].str.extract('gene=(.*?);')
annotation_df['ID'] = annotation_df[8].str.extract('ID=(.*?);')
annotation_df['tag'] = annotation_df[8].str.extract('tag=(.*?);')
# This pattern captures everything from 'transcript_id=' to the end of the
# field, so it only yields a clean ID when transcript_id is the last attribute.
annotation_df['transcript_id'] = annotation_df[8].str.extract(
    'transcript_id=(.*?)$')
# Replace every missing value in the frame (including attributes that did not
# match above) with the placeholder '.'.
annotation_df.fillna('.', inplace=True)
# filter RefSeq Select exon info
# Keep rows where column 2 is exactly 'exon', column 1 contains 'RefSeq' and
# column 0 contains 'NC_'; .copy() detaches the result from the original frame.
annotation_df = annotation_df[annotation_df[2].isin(['exon'])
                              & annotation_df[1].str.contains('RefSeq')
                              & annotation_df[0].str.contains('NC_')].copy()
annotation_df_select = annotation_df[annotation_df['tag'] ==
Esempio n. 2
0
    def get_assembly_map(self, assembly_name):
        """Delegate to ``make_ac_name_map`` for the given assembly name.

        :param assembly_name: assembly identifier (e.g., GRCh38.p5)
        :returns: the result of ``make_ac_name_map(assembly_name)``, passed
            through unchanged.  NOTE(review): presumably an accession-to-name
            mapping rather than a list -- confirm against the helper.
        """
        assembly_map = make_ac_name_map(assembly_name)
        return assembly_map