def split_gtf(gtf, outdir, novel=False):
    gtf_df = gtfparse.read_gtf(gtf)
    # Harmonise the biotype column name: GENCODE GTFs use 'gene_type',
    # Ensembl GTFs use 'gene_biotype'.
    if 'gene_type' in gtf_df.columns:
        gtf_df.loc[:, 'gene_biotype'] = gtf_df.gene_type
        gtf_df.drop('gene_type', axis=1, inplace=True)
    elif 'gene_biotype' in gtf_df.columns:
        pass
    else:
        gtf_df.loc[:, 'gene_biotype'] = 'protein_coding'
    type_label = 'gene_biotype'
    if novel:
        gtf_df.loc[:, type_label] = gtf_df.loc[:, type_label].map(
            GENCODE_CATEGORY_MAP)
    else:
        gtf_df.loc[:, type_label] = gtf_df.loc[:, type_label].map(
            simplify_gene_type)
    outdir = Path(outdir)
    outdir.mkdir(parents=True, exist_ok=True)
    # Write one GTF file per (simplified) biotype.
    for gt, grp in gtf_df.groupby(type_label):
        gt_file = outdir / f'{gt}.gtf'
        with open(gt_file, 'w') as gt_inf:
            for idx in grp.index:
                outline = dfline2gtfline(grp.loc[idx])
                gt_inf.write(outline)
def ensp_to_hugo_map(datastore="./data"):
    """
    You should download the file Homo_sapiens.GRCh38.95.gtf from:
    ftp://ftp.ensembl.org/pub/release-95/gtf/homo_sapiens/Homo_sapiens.GRCh38.95.gtf.gz
    Store the file in datastore
    """
    savefile = datastore + "/datastore/ensp_ensg_df.pkl"
    # If the dataframe is already stored, load it instead of re-parsing the GTF
    if os.path.isfile(savefile):
        with open(savefile, 'rb') as f:
            df = pickle.load(f)
    else:
        df = read_gtf(datastore + "/datastore/Homo_sapiens.GRCh38.95.gtf")
        df = df[df['protein_id'] != ''][['gene_id', 'protein_id']].drop_duplicates()
        df.to_pickle(savefile)

    # ENSG to HUGO map
    with open(datastore + "/datastore/ensembl_map.txt") as csv_file:
        next(csv_file)  # Skip the header line
        csv_reader = csv.reader(csv_file, delimiter='\t')
        ensg_map = {row[1]: row[0] for row in csv_reader if row[0] != ""}

    # ENSP to HUGO map
    ensmap = {}
    for index, row in df.iterrows():
        if row['gene_id'] in ensg_map:
            ensmap[row['protein_id']] = ensg_map[row['gene_id']]

    return ensmap
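# Illustration only (toy identifiers, not real Ensembl accessions) of the
# two-step lookup built by ensp_to_hugo_map above: ENSP -> ENSG via the GTF,
# then ENSG -> HUGO via the mapping file, composed into one dictionary.
ensg_to_hugo = {"ENSG01": "TP53"}
ensp_to_ensg = {"ENSP01": "ENSG01", "ENSP02": "ENSG02"}
ensp_to_hugo = {p: ensg_to_hugo[g] for p, g in ensp_to_ensg.items()
                if g in ensg_to_hugo}
print(ensp_to_hugo)  # {'ENSP01': 'TP53'}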
def process_gene_annot(fpath, outPath):
    """
    :param fpath: string representing path to file
    :param outPath: string representing path for output file
    :return output: df containing headers=[chr, gene_id, genename, start, end]

    Note:
    -----
    use gtfparse to load gtf file.
    https://github.com/openvax/gtfparse
    """
    # load data
    geneDf = read_gtf(fpath)
    # retrieve genes only
    df_genes = geneDf[geneDf["feature"] == "gene"]
    # select wanted columns (copy to avoid SettingWithCopyWarning below)
    cols = ['seqname', 'gene_id', 'transcript_name', 'start', 'end']
    subdf_genes = df_genes[cols].copy()
    # retrieve chromosome string (strip the leading 'chr')
    chrStr = subdf_genes['seqname'].str.split('chr', n=1, expand=True)
    subdf_genes['chr'] = chrStr[1]
    # drop seqname and keep chr column
    sub_gene = subdf_genes.drop(['seqname'], axis=1)
    # reorder and rename columns
    sub_gene = sub_gene[['chr', 'gene_id', 'transcript_name', 'start', 'end']]
    sub_gene.columns = ['chr', 'gene_id', 'genename', 'start', 'end']
    return sub_gene
def find_bio_type(filePath):
    # open the file
    df = read_gtf(filePath)
    # extract the 'gene_biotype' column as an array and return the distinct values
    source_array = df["gene_biotype"].to_numpy()
    return set(source_array)
def main():
    parser = argparse.ArgumentParser(description="""
    python ExonUnion.py

    Calculate the union of the exons of a list of transcripts.

    chr10 27035524 27150016 ABI1 76 - NM_001178120 10006 protein-coding abl-interactor 1 27037498 27149792 10 27035524,27040526,27047990,27054146,27057780,27059173,27060003,27065993,27112066,27149675, 27037674,27040712,27048164,27054247,27057921,27059274,27060018,27066170,27112234,27150016,
    """)
    parser.add_argument("--gtf_file")
    # parser.add_argument("--transcript_bed")
    # parser.add_argument('-o', '--options', default='yo',
    #                     help="Some option", type='str')
    # parser.add_argument('-u', '--useless', action='store_true',
    #                     help='Another useless option')
    args = parser.parse_args()

    # read_gtf returns a DataFrame with essential columns such as "feature",
    # "seqname", "start", "end", alongside the names of any optional keys
    # which appeared in the attribute column
    df = read_gtf(args.gtf_file)

    # filter DataFrame to the entries of a single gene
    # (alternative filters kept below for reference)
    # df_transcripts = df[df["feature"] == "transcript"]
    # df_transcripts = df_transcripts[df_transcripts['gene_type'] == 'protein_coding']
    # df_transcripts = df[df["transcript_name"] == "SAMD11-201"]
    # df_transcripts = df[df["gene_name"] == "AC114490.2-201"]
    df_transcripts = df[df["gene_id"] == "ENSG00000163867.17"]
    # df_transcripts = df.head()
    # gene_id = "ENST00000445297"
    # df_transcripts = df[df["transcript_id"].str.contains(gene_id)]
    # df_genes_chrY = df_genes[df_genes["seqname"] == "Y"]
    print(df_transcripts.to_string())
    print("--")
def load_transcript_fpkm_dict_from_gtf(
        gtf_path,
        transcript_id_column_name="reference_id",
        fpkm_column_name="FPKM",
        feature_column_name="feature"):
    """
    Load a GTF file generated by StringTie which contains transcript-level
    quantification of abundance. Returns a dictionary mapping Ensembl IDs of
    transcripts to FPKM values.
    """
    df = gtfparse.read_gtf(
        gtf_path,
        column_converters={fpkm_column_name: float})
    transcript_ids = _get_gtf_column(transcript_id_column_name, gtf_path, df)
    fpkm_values = _get_gtf_column(fpkm_column_name, gtf_path, df)
    features = _get_gtf_column(feature_column_name, gtf_path, df)
    logging.info("Loaded %d rows from %s" % (len(transcript_ids), gtf_path))
    logging.info("Found %s transcript entries" % sum(
        feature == "transcript" for feature in features))
    result = {
        transcript_id: float(fpkm)
        for (transcript_id, fpkm, feature)
        in zip(transcript_ids, fpkm_values, features)
        if ((transcript_id is not None) and
            (len(transcript_id) > 0) and
            (feature == "transcript"))
    }
    logging.info("Keeping %d transcript rows with reference IDs" % (len(result),))
    return result
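# A self-contained sketch (not part of the original project) of the
# column_converters behaviour the loader above relies on: gtfparse parses the
# FPKM attribute as float rather than string. Assumes the pandas-based
# gtfparse 1.x API used throughout these snippets; the file contents and IDs
# below are made up for illustration.
import tempfile
import gtfparse

_demo_gtf = (
    'chr1\tStringTie\ttranscript\t100\t900\t.\t+\t.\t'
    'gene_id "G1"; transcript_id "T1"; reference_id "ENST0000001"; FPKM "3.5";\n'
    'chr1\tStringTie\ttranscript\t2000\t2900\t.\t-\t.\t'
    'gene_id "G2"; transcript_id "T2"; reference_id "ENST0000002"; FPKM "0.7";\n'
)
with tempfile.NamedTemporaryFile("w", suffix=".gtf", delete=False) as tmp:
    tmp.write(_demo_gtf)

demo_df = gtfparse.read_gtf(tmp.name, column_converters={"FPKM": float})
print(demo_df[["feature", "reference_id", "FPKM"]])  # FPKM column holds floats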
def parse_expression_file(args, vcf_reader, vcf_writer):
    if args.format == 'stringtie' and args.mode == 'transcript':
        df_all = read_gtf(args.expression_file)
        df = df_all[df_all["feature"] == "transcript"]
        id_column = resolve_stringtie_id_column(args, df.columns.values)
    else:
        id_column = resolve_id_column(args)
        df = pd.read_csv(args.expression_file, sep='\t')
    if args.ignore_ensembl_id_version:
        df['transcript_without_version'] = df[id_column].apply(
            lambda x: re.sub(r'\.[0-9]+$', '', x))
    expression_column = resolve_expression_column(args)
    if expression_column not in df.columns.values:
        vcf_reader.close()
        vcf_writer.close()
        raise Exception(
            "ERROR: expression_column header {} does not exist in expression_file {}"
            .format(expression_column, args.expression_file))
    if id_column not in df.columns.values:
        vcf_reader.close()
        vcf_writer.close()
        raise Exception(
            "ERROR: id_column header {} does not exist in expression_file {}"
            .format(id_column, args.expression_file))
    return df, id_column, expression_column
def find_features(filePath):
    # open the file
    df = read_gtf(filePath)
    # extract the 'feature' column as an array and return the distinct values
    feature = df["feature"].to_numpy()
    return set(feature)
def load_annotation(self, annotation_path):
    """A method for loading annotation using the gtfparse library"""
    annotation_type = path_features(annotation_path)["extension"]
    try:
        log.debug("Loading " + annotation_path)
        if annotation_type == "gtf":
            # disable root log due to gtfparse logging interference
            log_root = logging.getLogger("")
            log_root.disabled = True
            # read gtf
            annotation_df = gtfparse.read_gtf(annotation_path)
            # enable root log
            log_root.disabled = False
            # limit only to the feature of interest
            annotation_df = annotation_df[annotation_df["feature"] == "exon"]
            # trim to required cols only
            annotation_df = annotation_df[["seqname", "start", "end", "strand",
                                           "gene_id", "transcript_id"]]
    except Exception as e:
        raise ValueError("An error occurred while loading annotation: " + str(e))
    # convert 1-based GTF start coordinates to 0-based
    annotation_df["start"] -= 1
    self.annotation_df = {"whole": annotation_df}
def display_gtf_geneids(gtffile: str, feature_type: Optional[List[str]] = None):
    """Display the gene ids present in a GTF/GFF

    Parameters
    ----------
    gtffile : `str`
        GTF/GFF file that matches the input FASTA file. Preferably one from
        Ensembl/GENCODE. Gzipped GTF/GFF files are acceptable, though their
        use may impose a performance penalty.
    feature_type : `list` of `str`, optional
        If given, only consider entries with these feature types.

    Returns
    -------
    `None`
    """
    gtf = read_gtf(gtffile)
    if feature_type is not None:
        gtf = gtf[gtf.feature.isin(feature_type)]
    gene_set = gtf["gene_id"].unique()
    print(f"{len(gene_set)} genes found. These include:")
    for gene_id in gene_set:
        print(gene_id)
def main(gtf, out_dir, linc):
    gtf_df = read_gtf(gtf)
    linc_genes = gtf_df[gtf_df.transcript_biotype == 'lincRNA'].gene_id.unique()
    gtf_df.gene_biotype.replace(gtf_tools['dict_GENCODE_CATEGORY_MAP'],
                                inplace=True)
    gtf_df.transcript_biotype.replace(gtf_tools['dict_GENCODE_CATEGORY_MAP'],
                                      inplace=True)
    if 'gene_name' in gtf_df.columns:
        mask = (gtf_df.gene_name == "")
        gtf_df.loc[mask, 'gene_name'] = gtf_df.loc[mask, 'gene_id']
    else:
        gtf_df.loc[:, 'gene_name'] = gtf_df.loc[:, 'gene_id']
    if linc:
        gtf_df = gtf_df.set_index('gene_id')
        gtf_df.loc[linc_genes, 'gene_biotype'] = 'lincRNA'
        gtf_df.gene_biotype.replace({'lncRNA': 'genic_lncRNA'}, inplace=True)
        gtf_df = gtf_df.reset_index()
    gene_df = gtf_df[gtf_df.gene_id != ""]
    gene_type_df = gene_df.loc[:, ['gene_id', 'gene_name',
                                   'gene_biotype']].drop_duplicates()
    gene_type_file = os.path.join(out_dir, 'gene_type.txt')
    gene_type_df.to_csv(gene_type_file, sep='\t', index=False)
    tr_df = gtf_df[gtf_df.transcript_id != ""]
    tr_type_df = tr_df.loc[:, ['transcript_id', 'gene_id', 'gene_name',
                               'transcript_biotype',
                               'gene_biotype']].drop_duplicates()
    tr_type_file = os.path.join(out_dir, 'transcript_type.txt')
    tr_type_df.to_csv(tr_type_file, sep='\t', index=False)
def find_gene(filePath):
    # open the file
    df = read_gtf(filePath)
    # keep only rows with feature 'gene' and translate the DataFrame
    # into an array for manipulation
    source_array = df[df["feature"] == 'gene'].to_numpy()
    return source_array
def find_chromossome(filePath):
    # open the file
    df = read_gtf(filePath)
    # extract the 'seqname' column as an array and return the distinct chromosomes
    source_array = df["seqname"].to_numpy()
    return set(source_array)
def main(gtf_file, compare_table, cov_file, species):
    gtf_df = gtfparse.read_gtf(gtf_file)
    gene_df = gtf_df[gtf_df.feature == 'gene']
    gene_type_df = gene_df.loc[:, ['gene_id', 'gene_biotype']].drop_duplicates()
    gene_type_df = gene_type_df.set_index('gene_id')
    gene_type_df.gene_biotype.replace(gtf_tools['dict_GENCODE_CATEGORY_MAP'],
                                      inplace=True)
    gene_type_counts = gene_type_df.gene_biotype.value_counts()
    compare_table_df = pd.read_table(compare_table)
    assembly_genes = list()
    ref_assembly_df = compare_table_df[
        compare_table_df.category_relative == 'exonic_overlap']
    for each in ref_assembly_df.ref_gene_id:
        assembly_genes.extend(each.split(','))
    assembly_genes = list(set(assembly_genes))
    assembly_gene_type_df = gene_type_df.loc[assembly_genes]
    assembly_gene_type_counts = assembly_gene_type_df.gene_biotype.value_counts()
    merged_df = pd.concat([gene_type_counts, assembly_gene_type_counts], axis=1)
    merged_df.columns = ['reference', 'assembly']
    merged_df.loc[:, 'coverage'] = merged_df.assembly / merged_df.reference
    merged_df.loc[:, 'species'] = species
    merged_df.to_csv(cov_file, sep='\t', header=False)
def extract_landmarks(gtf, landmarks=ALL_LANDMARKS):
    """Given a gene annotation GFF/GTF file, extract landmark positions.

    # Arguments
        gtf: File path or a loaded `pd.DataFrame` with columns:
            seqname, feature, start, end, strand
        landmarks: list or a dictionary of landmark extractors (function or name)

    # Note
        When landmark extractor names are used, they have to be implemented in
        the module `concise.preprocessing.position`

    # Returns
        Dictionary of pd.DataFrames with landmark positions
        (columns: seqname, position, strand)
    """
    if isinstance(gtf, str):
        _logger.info("Reading gtf file..")
        gtf = read_gtf(gtf)
        _logger.info("Done")
    _logger.info("Running landmark extractors..")
    # normalize landmarks to a dictionary of name -> function
    assert isinstance(landmarks, (list, tuple, set, dict))
    if isinstance(landmarks, dict):
        landmarks = {k: _get_fun(v) for k, v in landmarks.items()}
    else:
        landmarks = {_to_string(fn_str): _get_fun(fn_str)
                     for fn_str in landmarks}
    r = {k: _validate_pos(v(gtf)) for k, v in landmarks.items()}
    _logger.info("Done!")
    return r
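# A minimal, self-contained illustration (not the concise library's own code)
# of the kind of extractor extract_landmarks expects: a function mapping the
# GTF DataFrame to rows of (seqname, position, strand). Here the TSS is taken
# as the start of '+' strand transcripts and the end of '-' strand transcripts.
import pandas as pd

def tss_extractor(gtf_df):
    tx = gtf_df[gtf_df.feature == "transcript"]
    position = tx.start.where(tx.strand == "+", tx.end)
    return pd.DataFrame({"seqname": tx.seqname,
                         "position": position,
                         "strand": tx.strand})

demo = pd.DataFrame({"seqname": ["chr1", "chr1"],
                     "feature": ["transcript", "transcript"],
                     "start": [100, 500],
                     "end": [400, 900],
                     "strand": ["+", "-"]})
print(tss_extractor(demo))  # positions 100 (+) and 900 (-)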
def bed12_process():
    # file_path = "/home/zyang/Project/CRC/step49_track/gencode.vM25.basic.annotation.gtf"
    file_path = "/data3/zhaochen/project/colon_cancer/colon_chip/genomeTrack/gencode.vM25.basic.annotation.gtf"
    df = read_gtf(file_path)
    print(df.columns)
    print(df[0:10])

    df_genes = df[df["feature"] == "transcript"]
    df_sub = df_genes[['gene_name', 'transcript_id']]
    df_sub.to_csv("gene_name_and_transcriptID.txt", sep="\t", index=False)

    # bed_file = "/home/zyang/Project/CRC/step49_track/gencode.vM25.basic.annotation.sort.bed12"
    bed_file = "/data3/zhaochen/project/colon_cancer/colon_chip/genomeTrack/gencode.vM25.basic.annotation.sort.bed12"
    bed_df = pd.read_csv(bed_file, sep="\t",
                         names=["chr", "start", "end", "name", "score", "strand",
                                "thick_start", "thick_end", "rgb", "block_count",
                                "block_size", "block_start"])
    new_bed = bed_df.merge(df_sub, left_on="name", right_on="transcript_id",
                           how="left")
    new_bed = new_bed[["chr", "start", "end", "gene_name", "score", "strand",
                       "thick_start", "thick_end", "rgb", "block_count",
                       "block_size", "block_start"]]
    new_bed.to_csv("sorted.changeName.bed", sep="\t", index=False, header=False)
def parse_gtf(fileParse, feature, fileDest):
    df = read_gtf(fileParse)
    # e.g. df = read_gtf('C:/Users/breno/Desktop/Homo_sapiens.GRCh38.91.gtf')

    # filter rows based on the requested feature, e.g.
    # df_genes = df[df["feature"] == "gene"]
    # df_exons = df[df["feature"] == "exon"]
    # df_introns = df[df["feature"] == "intron"]
    # df_cds = df[df["feature"] == "CDS"]

    # translate the filtered DataFrame into an array for manipulation
    str_genes = df[df["feature"] == feature].to_numpy()
    # str_genes = df[df["feature"] == "gene"].to_numpy()

    # number of records found for this feature
    tam = len(str_genes)

    # open the output file for writing
    files_gene = open(fileDest, 'w')
    # e.g. files_gene = open('C:/Users/breno/Desktop/gene.txt', 'w')

    # iterate over the array and write each record to the file,
    # separating the first 25 fields with '|'
    for i in range(0, tam):
        for j in range(0, 25):
            files_gene.write(str(str_genes[i][j]))
            files_gene.write('|')
        # newline after each record
        files_gene.write('\n')
    files_gene.close()
def load_dataframe(self, file_resources):
    # Parse lncRNA gtf
    df = read_gtf(file_resources["long_noncoding_RNAs.gtf"])
    # Remove the ".#" ENSG/ENST version suffix at the end of the IDs
    df['gene_id'] = df['gene_id'].str.replace("[.].*", "")
    df['transcript_id'] = df['transcript_id'].str.replace("[.].*", "")
    return df
def pickle_annotations(gtf_path, columns: list, features=['exon']):
    annotations = (read_gtf(gtf_path, features=features)
                   .filter(columns)
                   .sort_values(by=['seqname', 'transcript_id', 'exon_number']))
    annotations.to_pickle("annotations.pkl")
    sys.exit()
def load_exons(model_gtf):
    """load GTF exons into a list of Series objects of exons"""
    gtf_df = read_gtf(model_gtf)
    gtf_df = gtf_df.loc[gtf_df.feature == 'exon']
    if len(gtf_df) == 0:
        raise GtfException("no exon records found")
    fixup_gtf_attrs(gtf_df)
    return [gtf_df.iloc[i] for i in range(len(gtf_df))]
def _load_gtf_as_dataframe(self, usecols=None, features=None):
    """
    Parse this genome source's GTF file and load it as a Pandas DataFrame
    """
    logger.info("Reading GTF from %s", self.gtf_path)
    df = read_gtf(
        self.gtf_path,
        column_converters={
            "seqname": normalize_chromosome,
            "strand": normalize_strand,
        },
        infer_biotype_column=True,
        usecols=usecols,
        features=features)

    column_names = set(df.keys())
    expect_gene_feature = features is None or "gene" in features
    expect_transcript_feature = features is None or "transcript" in features
    observed_features = set(df["feature"])

    # older Ensembl releases don't have "gene" or "transcript"
    # features, so fill in those rows if they're missing
    if expect_gene_feature and "gene" not in observed_features:
        # if we have to reconstruct gene feature rows then
        # fill in values for 'gene_name' and 'gene_biotype'
        # but only if they're actually present in the GTF
        logger.info("Creating missing gene features...")
        df = create_missing_features(
            dataframe=df,
            unique_keys={"gene": "gene_id"},
            extra_columns={
                "gene": {
                    "gene_name",
                    "gene_biotype"
                }.intersection(column_names),
            },
            missing_value="")
        logger.info("Done.")

    if expect_transcript_feature and "transcript" not in observed_features:
        logger.info("Creating missing transcript features...")
        df = create_missing_features(
            dataframe=df,
            unique_keys={"transcript": "transcript_id"},
            extra_columns={
                "transcript": {
                    "gene_id",
                    "gene_name",
                    "gene_biotype",
                    "transcript_name",
                    "transcript_biotype",
                    "protein_id",
                }.intersection(column_names)
            },
            missing_value="")
        logger.info("Done.")

    return df
def gtf():
    # save a smaller version of the annotation
    # from concise.preprocessing.landmarks import read_gtf
    # gtf_path = "/s/genomes/human/hg38/GRCh38.p7/gencode.v25.annotation.gtf"
    # gtf = read_gtf(gtf_path)
    # gtf_small = gtf[gtf.seqnames == "chr22"]
    # gtf_small.to_pickle("data/gencode_v25_chr22.gtf.pkl.gz")  # 116k
    return read_gtf("data/gencode.v24.annotation_chr22.gtf.gz")
def read_gene_gtf_file(self):
    # ['start', 'end', 'gene_name', 'gene_id', 'seqname', 'exon_number', 'feature']
    data_frame = read_gtf(
        self.gene_gtf_file_name,
        usecols=['start', 'end', 'seqname', 'feature', 'gene_biotype'])
    data = data_frame.query("feature == 'gene'")
    data = data.reset_index(drop=True)
    data.to_csv("gene_file.csv")
def test_ensembl_gtf_gene_names_with_usecols_gzip():
    df = read_gtf(ENSEMBL_GTF_PATH + ".gz", usecols=["gene_name"])
    gene_names = set(df["gene_name"])
    assert gene_names == EXPECTED_GENE_NAMES, \
        "Wrong gene names: %s, missing %s and unexpected %s" % (
            gene_names,
            EXPECTED_GENE_NAMES.difference(gene_names),
            gene_names.difference(EXPECTED_GENE_NAMES))
def identify_transcripts(gtf_file, regtools_file, sample_name):
    """Filter GTF tsv file to find tumor junction coordinates from regtools

    Args:
        gtf_file (string): path to gtf file
        regtools_file (string): path to (filtered) regtools output excel file

    Returns:
        junctions.gtf (file): filtered gtf only containing transcripts that
            correspond to regtools junctions
        transcripts.fa (file): fasta file with coding transcript sequences
            corresponding to junctions.gtf file
        gtf_transcripts (df): list of altered transcripts
    """
    # convert gtf to pd df
    gtf_data = read_gtf(gtf_file)
    # read in regtools significant junctions as pd df
    junctions = pd.read_excel(regtools_file, sheet_name='Sheet1')
    junctions = junctions.loc[junctions['Sample'] == sample_name]
    print(junctions)
    total_transcripts = {}
    for row_index, row in junctions.iterrows():
        start_exons = gtf_data.loc[(gtf_data['end'] == row["start"])
                                   & (gtf_data["seqname"] == row["chrom"])]
        stop_exons = gtf_data.loc[(gtf_data['start'] == row["end"])
                                  & (gtf_data["seqname"] == row["chrom"])]
        print(start_exons, stop_exons)
        transcript_dict = {
            t: row["gene_names"]
            for t in list(start_exons['transcript_id'])
            if t in list(stop_exons['transcript_id'])
        }
        total_transcripts.update(transcript_dict)
    # copy to avoid SettingWithCopyWarning when adding the 'gene' column
    gtf_transcripts = gtf_data.loc[
        (gtf_data["transcript_id"].isin(total_transcripts.keys()))
        & (gtf_data["feature"] == "transcript")].copy()
    gtf_transcripts["gene"] = [
        total_transcripts[x] for x in gtf_transcripts["transcript_id"]
    ]
    # keep protein-coding transcripts only
    if "transcript_type" in gtf_transcripts.columns:
        gtf_transcripts = gtf_transcripts.loc[
            gtf_transcripts["transcript_type"] == "protein_coding"]
    # write subsetted gtf file
    write_file = open("junctions.gtf", "w")
    for item, gene in zip(list(gtf_transcripts["transcript_id"]),
                          list(gtf_transcripts["gene"])):
        for line in open(gtf_file).readlines():
            if re.search(item, line):
                new_line = line.strip() + f' gene_name "{gene}";\n'
                write_file.write(new_line)
    write_file.close()
    # create BED12 file from junctions.gtf
    # create fasta file with transcript (exon-only) sequences
    subprocess.Popen(
        "gtfToGenePred junctions.gtf test.genePhred && "
        "genePredToBed test.genePhred results.bed && "
        "bedtools getfasta -fi ~/Documents/ref_fasta/GRCh38.d1.vd1.fa "
        "-fo transcripts.fa -bed results.bed -split -name -s && "
        "sed -i.bak 's/(-)//;s/(+)//' transcripts.fa",
        shell=True)
    return gtf_transcripts
def gene_loc_inf(gtf_file, outfile):
    gtf_df = gtfparse.read_gtf(gtf_file)
    gene_chrom = gtf_df.groupby(['gene_id'])['seqname'].first()
    gene_chrom.name = 'chrom'
    gene_start = gtf_df.groupby(['gene_id'])['start'].min()
    gene_end = gtf_df.groupby(['gene_id'])['end'].max()
    gene_strand = gtf_df.groupby(['gene_id'])['strand'].first()
    gene_loc_df = pd.concat([gene_chrom, gene_start, gene_end, gene_strand],
                            axis=1)
    gene_loc_df.to_csv(outfile, sep='\t')
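# A small, self-contained sketch (illustration only, not project code) of the
# aggregation pattern used by gene_loc_inf above: a gene's span is the minimum
# start and maximum end over all of its GTF rows, with chromosome and strand
# taken from the first row of each group.
import pandas as pd

demo_gtf = pd.DataFrame({
    "gene_id": ["G1", "G1", "G2"],
    "seqname": ["chr1", "chr1", "chr2"],
    "start": [100, 300, 50],
    "end": [200, 450, 80],
    "strand": ["+", "+", "-"],
})
grouped = demo_gtf.groupby("gene_id")
spans = pd.concat([grouped["seqname"].first().rename("chrom"),
                   grouped["start"].min(),
                   grouped["end"].max(),
                   grouped["strand"].first()], axis=1)
print(spans)  # G1 -> chr1:100-450 (+), G2 -> chr2:50-80 (-)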
def main(gtf, output):
    gtf_df = gtfparse.read_gtf(gtf)
    gtf_exon_df = gtf_df[gtf_df.feature == 'exon'].copy()
    # exon length from 1-based, inclusive GTF coordinates
    gtf_exon_df.loc[:, 'exon_len'] = gtf_exon_df.end - gtf_exon_df.start + 1
    # transcript length = sum of its exon lengths
    tr_len = gtf_exon_df.groupby(['transcript_id'])['exon_len'].sum()
    tr_gene = gtf_exon_df.loc[:, ['transcript_id', 'gene_id']].drop_duplicates()
    tr_gene = tr_gene.set_index('transcript_id')
    tr_gene_len = pd.concat([tr_len, tr_gene], axis=1)
    # gene length = median of its transcript lengths
    gene_len = tr_gene_len.groupby(['gene_id'])['exon_len'].median()
    gene_len.to_csv(output, header=False, sep='\t')
def main(input_gtf, in_silico_circ, real_circ_table, gene_exp):
    real_circ_df = pd.read_table(real_circ_table)
    real_circ_df.loc[:, 'chrom'] = real_circ_df.chrom.astype(str)
    real_circ_df = real_circ_df.set_index(['chrom', 'start', 'end'])
    circ_num = len(real_circ_df)
    is_circ_dict = OrderedDict()
    gtf_df = gtfparse.read_gtf(input_gtf)
    exon_df = gtf_df[gtf_df.feature == 'exon']
    exon_df = exon_df.set_index('transcript_id')
    exon_df = exon_df.sort_values(['seqname', 'start'])
    exon_df.loc[:, 'start_0base'] = exon_df.start - 1
    # gene exp cat df
    if gene_exp is not None:
        gene_exp_df = pd.read_table(gene_exp)
        exp_genes_df = gene_exp_df[gene_exp_df.tpm >= 10]
        exon_df = exon_df[exon_df.gene_id.isin(exp_genes_df.Gene_id)]
    for each_tr in exon_df.index.unique():
        # in silico circRNA from same transcript set
        if each_tr not in real_circ_df.isoformName.unique():
            continue
        each_tr_exons = exon_df.loc[each_tr]
        each_tr_introns = get_introns(each_tr_exons)
        if each_tr_introns is None:
            continue
        exon_num = len(each_tr_exons)
        chrom = str(each_tr_exons.iloc[0].seqname)
        strand = each_tr_exons.iloc[0].strand
        gene = each_tr_exons.iloc[0].gene_id
        for each_com in itertools.combinations_with_replacement(
                range(exon_num), 2):
            start = each_tr_exons.iloc[each_com[0]].start_0base
            end = each_tr_exons.iloc[each_com[1]].end
            # filter real circRNA from in silico circRNA
            if (chrom, start, end) in real_circ_df.index:
                continue
            flank_intron = get_flank_intron(chrom, each_com, each_tr_introns)
            is_circ_dict.setdefault('chrom', []).append(chrom)
            is_circ_dict.setdefault('start', []).append(start)
            is_circ_dict.setdefault('end', []).append(end)
            is_circ_dict.setdefault('strand', []).append(strand)
            is_circ_dict.setdefault('flankIntron', []).append(flank_intron)
            is_circ_dict.setdefault('isoformName', []).append(each_tr)
            is_circ_dict.setdefault('geneID', []).append(gene)
    is_circ_df = pd.DataFrame(is_circ_dict)
    np.random.seed(0)
    selected_circ = np.random.choice(is_circ_df.index.values, circ_num)
    is_circ_df = is_circ_df.loc[selected_circ]
    is_circ_df.loc[:, 'circRNAID'] = [
        'in_silico_circ_{num:0>10}'.format(num=each + 1)
        for each in range(len(is_circ_df))
    ]
    is_circ_df = is_circ_df.set_index('circRNAID')
    is_circ_df.to_csv(in_silico_circ, sep='\t')
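# A hypothetical stand-in (not the original helper) for the get_introns
# function referenced above, to make the exon/intron bookkeeping concrete:
# given one transcript's exon rows, introns are the gaps between consecutive
# exons; single-exon transcripts have no introns, hence None.
def get_introns_sketch(tr_exons):
    exons = tr_exons.sort_values("start")
    if len(exons) < 2:
        return None
    intron_starts = exons.end.values[:-1] + 1
    intron_ends = exons.start.values[1:] - 1
    return list(zip(intron_starts, intron_ends))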
def load_annotations(gtf_path, columns: list, features=['exon']):
    if not __debug__:
        # load from pickled object
        annotations = pd.read_pickle(gtf_path)
        return annotations
    # NOTE: GENCODE data format;
    # https://www.gencodegenes.org/pages/data_format.html
    annotations = (read_gtf(gtf_path)
                   .query("feature == {}".format(features))
                   .filter(columns)
                   .sort_values(by=['seqname', 'transcript_id', 'exon_number']))
    return annotations
def __init__(self, intervals_file, fasta_file, gtf_file,
             filter_protein_coding=True, target_file=None, use_linecache=True):
    if sys.version_info[0] != 3:
        warnings.warn(
            "Only Python 3 is supported. You are using Python {0}".format(
                sys.version_info[0]))
    self.gtf = read_gtf(gtf_file)
    self.filter_protein_coding = filter_protein_coding

    if self.filter_protein_coding:
        if "gene_type" in self.gtf:
            self.gtf = self.gtf[self.gtf["gene_type"] == "protein_coding"]
        elif "gene_biotype" in self.gtf:
            self.gtf = self.gtf[self.gtf["gene_biotype"] == "protein_coding"]
        else:
            warnings.warn(
                "Gtf doesn't have the field 'gene_type' or 'gene_biotype'. "
                "Considering genomic landmarks of all genes, not just protein_coding.")

    if not np.any(self.gtf.seqname.str.contains("chr")):
        self.gtf["seqname"] = "chr" + self.gtf["seqname"]

    # intervals
    if use_linecache:
        self.bt = BedToolLinecache(intervals_file)
    else:
        self.bt = BedTool(intervals_file)

    # extractors
    self.fasta_file = fasta_file
    self.seq_extractor = None
    self.dist_extractor = None

    # here the DATALOADER_DIR contains the path to the current directory
    self.dist_transformer = DistanceTransformer(
        ALL_LANDMARKS,
        DATALOADER_DIR + "/dataloader_files/position_transformer.pkl")

    # target
    if target_file:
        self.target_dataset = TxtDataset(target_file)
        assert len(self.target_dataset) == len(self.bt)
    else:
        self.target_dataset = None