import pandas as pd import numpy as np import subprocess import sys def ac_to_name(chromosome_dic, chromosome): name = chromosome_dic[chromosome] if not name.startswith('chr'): name = 'chr' + name return name gff = sys.argv[1] genome = sys.argv[2] chrome_dic = make_ac_name_map(genome) chrome_dic['NC_012920.1'] = 'chrM_NC_012920.1' # read and extract info annotation = subprocess.getoutput('zcat ' + gff + ' | grep -v "#"') annotation_df = pd.read_csv(StringIO(annotation), sep='\t', header=None) annotation_df['gene'] = annotation_df[8].str.extract('gene=(.*?);') annotation_df['ID'] = annotation_df[8].str.extract('ID=(.*?);') annotation_df['tag'] = annotation_df[8].str.extract('tag=(.*?);') annotation_df['transcript_id'] = annotation_df[8].str.extract( 'transcript_id=(.*?)$') annotation_df.fillna('.', inplace=True) # filter RefSeq Select exon info annotation_df = annotation_df[annotation_df[2].isin(['exon']) & annotation_df[1].str.contains('RefSeq') & annotation_df[0].str.contains('NC_')].copy() annotation_df_select = annotation_df[annotation_df['tag'] ==
def get_assembly_map(self, assembly_name):
    """Look up the accession map for a named assembly.

    Thin pass-through: ``assembly_name`` (e.g., GRCh38.p5) is forwarded to
    ``make_ac_name_map`` and that function's result is returned unchanged.
    """
    ac_name_map = make_ac_name_map(assembly_name)
    return ac_name_map