def get_ftp_file(ftp, binary_loc, path=None, test=False):
    """Download a GTF over HTTP and parse it with ``pr.read_gtf``.

    Parameters
    ----------
    ftp : str
        Host part of the address; the request goes to
        ``"http://" + ftp + "/" + binary_loc``.
    binary_loc : str
        Location of the file below ``ftp``.
    path : str, optional
        Destination for the downloaded bytes. Missing parent directories are
        created. When omitted, the download is parsed from a temporary file.
        Ignored when ``test`` is true.
    test : bool
        When true, only the first ~1 MB is requested, gunzipped, cut back to
        the last complete line and parsed from a temporary ``.gtf`` file.

    Returns
    -------
    The object returned by ``pr.read_gtf``.
    """
    url = "http://" + ftp + "/" + binary_loc

    if test:
        # Test mode: fetch and parse only a small prefix of the file.
        from io import BytesIO
        import gzip

        headers = {"Range": "bytes=0-1000000"}
        c = requests.get(url, headers=headers).content
        s = gzip.GzipFile(fileobj=BytesIO(c)).read(1000000)
        # The prefix usually ends mid-line; drop the trailing partial record.
        f = s.decode().rsplit("\n", 1)[0]
        with tempfile.NamedTemporaryFile(suffix=".gtf", mode="w+") as t:
            t.write(f)
            # The file is re-opened by name below, so buffered data must be
            # on disk first.
            t.flush()
            gr = pr.read_gtf(t.name)
        return gr

    parent_dir = os.path.dirname(path) if path else ""
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    c = requests.get(url).content

    if not path:
        with tempfile.NamedTemporaryFile(suffix=".gtf.gz", mode="wb+") as t:
            t.write(c)
            t.flush()  # see above: read_gtf opens the file by name
            gr = pr.read_gtf(t.name)
    else:
        # Context manager closes the handle even if the write fails.
        with open(path, "wb+") as fh:
            fh.write(c)
        gr = pr.read_gtf(path)

    return gr
def process_polya_bed(polyA_bed_path=None, atlas_version=2, outfile=None):
    """Annotate a poly(A) site BED file and write it in PAQR BED layout.

    The sites are joined with the last exons of the transcript GTF, given
    PAQR name columns and per-exon ordering columns, and written with
    ``write_to_paqr_bed``.

    Parameters
    ----------
    polyA_bed_path :
        Path to the poly(A) site BED file.
    atlas_version : int
        ``1``: the BED is read as is and its ``Name``/``Score`` columns are
        reused. ``2``: the chromosome column is tidied first (to add the
        'chr' prefix) and a short PAQR name is generated; ``ThickEnd`` is
        written in place of ``Score``.
    outfile :
        Destination handed to ``write_to_paqr_bed``.

    Raises
    ------
    ValueError
        If ``atlas_version`` is neither 1 nor 2.

    Notes
    -----
    ``tr_gtf_path`` is not a parameter of this function; it has to be
    resolvable at module scope -- NOTE(review): confirm where it is defined.
    """
    if atlas_version not in (1, 2):
        raise ValueError(
            f"Unsupported atlas_version {atlas_version!r}; expected 1 or 2")

    is_v1 = atlas_version == 1

    if is_v1:
        # No need to add 'chr' prefix
        polya_bed = pyr.readers.read_bed(f=polyA_bed_path)
    else:
        # need to add chr prefix before joining
        polya_bed = tidy_chromosome_column(polya_bed_path=polyA_bed_path)

    # read in GTF with multi-overlap transcripts
    tr_gtf = pyr.read_gtf(f=tr_gtf_path)
    tr_gtf = get_last_exons(ranges_obj=tr_gtf)

    polya_bed = join_by_intersect(pyranges1=polya_bed, pyranges2=tr_gtf)

    if not is_v1:
        # version 2 doesn't have a name column in the formatting required
        # for PAQR, so the short name is generated as well
        polya_bed = add_paqr_name(pyranges=polya_bed)
    polya_bed = add_paqr_long_name(pyranges=polya_bed)

    # add columns with order along exon & number of exons on transcript
    polya_bed = add_n_along_exon(pyranges=polya_bed)
    polya_bed = get_total_n_on_exon(pyranges=polya_bed)

    # Version 1 already provides the first six BED columns; v2.0 has a
    # custom format, so the generated name and ThickEnd are used instead.
    name_column, score_column = (
        ('Name', 'Score') if is_v1 else ('paqr_name', 'ThickEnd'))
    column_order = [
        'Chromosome',
        'Start',
        'End',
        name_column,
        score_column,
        'Strand',
        'n_along_exon',
        'total_n_on_exon',
        'paqr_long_name',
        'gene_id',
    ]

    write_to_paqr_bed(pyranges=polya_bed,
                      outfile=outfile,
                      col_order=column_order)
def gtfToBed(gtf, output):
    """Write the gene and transcript records of a GTF as a tab-separated table.

    The table has the columns chromosome, start, end, coordinate,
    product_accession, strand, symbol and TSS. Start and end are the values
    parsed by ``pr.read_gtf`` minus one; TSS is the start on the '+' strand
    and the end on the '-' strand.

    :param gtf: GTF file handed to ``pr.read_gtf``.
    :param output: destination handed to ``DataFrame.to_csv``.
    """
    annotation = pr.read_gtf(gtf).df

    feature = annotation["Feature"]
    wanted_rows = (feature == "gene") | (feature == "transcript")
    source_columns = [
        "Chromosome", "Start", "End", "gene_name", "Strand", "gene_id"
    ]
    bed = annotation[wanted_rows].loc[:, source_columns].drop_duplicates()

    column_names = {
        'Chromosome': 'chromosome',
        'Start': 'start',
        'End': 'end',
        'gene_name': 'symbol',
        'Strand': 'strand',
        'gene_id': 'product_accession'
    }
    bed = bed.rename(columns=column_names)

    # Shift both interval borders down by one.
    for border in ("start", "end"):
        bed.loc[:, border] = bed.loc[:, border] - 1

    # The TSS sits at the start of '+' features and at the end of '-' ones.
    for strand_symbol, border in (('+', 'start'), ('-', 'end')):
        bed.loc[bed.strand == strand_symbol, 'TSS'] = bed[border]
    bed.TSS = bed.TSS.astype(int)

    bed["coordinate"] = [
        chrom + ':' + str(start) + '-' + str(end)
        for chrom, start, end in zip(bed["chromosome"].tolist(),
                                     bed["start"].tolist(),
                                     bed["end"].tolist())
    ]

    output_columns = [
        'chromosome', 'start', 'end', 'coordinate', 'product_accession',
        'strand', 'symbol', 'TSS'
    ]
    bed[output_columns].to_csv(output, index=None, sep='\t')
def _read_utr(
        gtf_file,
        feature_type="5UTR",
        infer_from_cds=False,
        on_error_warn=True,
) -> pd.DataFrame:
    """
    Load a GTF file and return its UTR records indexed by transcript id.

    :param gtf_file: path to the GTF file
    :param feature_type: feature type to extract, in general '5UTR' or '3UTR'.
    :param infer_from_cds: subtract the CDS from the exon regions to infer
        the UTR regions; 'feature_type' then decides whether the '5UTR' or
        the '3UTR' is returned.
    :param on_error_warn: do not break on error; emit a warning instead.
    :return: result of `UTRFetcher.get_utr_from_gtf`, indexed by
        "transcript_id".
    """
    import pyranges

    annotation = pyranges.read_gtf(gtf_file, as_df=True)
    fetcher_options = dict(
        feature_type=feature_type,
        infer_from_cds=infer_from_cds,
        on_error_warn=on_error_warn,
    )
    return UTRFetcher.get_utr_from_gtf(
        annotation, **fetcher_options).set_index("transcript_id")
def _read_cds(
        gtf_file,
        filter_valid_transcripts=False,
        filter_biotype=False,
        filter_tag=False,
        duplicate_attr=None,
        on_error_warn=True,
):
    """
    Read, extract and filter valid cds from the given gtf_file

    :param gtf_file: path to the GTF file
    :param filter_valid_transcripts: forwarded to `CDSFetcher.get_cds_from_gtf`.
    :param filter_biotype: forwarded to `CDSFetcher.get_cds_from_gtf`.
    :param filter_tag: forwarded to `CDSFetcher.get_cds_from_gtf`; also the
        fallback for `duplicate_attr`.
    :param duplicate_attr: passed to `pyranges.read_gtf`. When None, the
        value of `filter_tag` is used.
    :param on_error_warn: forwarded to `CDSFetcher.get_cds_from_gtf`.
    :return: the fetched CDS table indexed by "transcript_id".
    """
    import pyranges

    # Identity check: `== None` would call a custom `__eq__` on the argument.
    if duplicate_attr is None:
        # One row in the GTF file can have multiple tags;
        # therefore, to filter them we have to allow duplicate attrs.
        duplicate_attr = filter_tag

    df = pyranges.read_gtf(gtf_file, as_df=True,
                           duplicate_attr=duplicate_attr)

    cds = CDSFetcher.get_cds_from_gtf(
        df,
        filter_valid_transcripts=filter_valid_transcripts,
        filter_biotype=filter_biotype,
        filter_tag=filter_tag,
        on_error_warn=on_error_warn)

    cds = cds.set_index("transcript_id")
    return cds
def _read_intervals(gtf_path=None, bed_path=None, pranges=None, intervals=None, interval_attrs=None, duplicate_attr=False): alternatives = [bed_path, pranges, intervals, gtf_path] if sum(i is not None for i in alternatives) != 1: raise ValueError('only one of `gth_path`, `bed_path`, `pranges`,' '`intervals` or should given as input.') if gtf_path: import pyranges pranges = pyranges.read_gtf(gtf_path, duplicate_attr=duplicate_attr) elif bed_path: import pyranges pranges = pyranges.read_bed(bed_path) elif intervals: if interval_attrs is not None: raise ValueError( '`interval_attrs` is not valid with `intervals`') pranges = intervals_to_pyranges(intervals) return pranges
def ensembl_gtf():
    """Read the bundled example file ``ensembl_human.gtf.gz`` with ``pr.read_gtf``.

    Preview of the result:

    >>> # +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------+
    >>> # | Chromosome   | Source     | Feature      | Start     | End       | Score      | Strand       | Frame      | gene_biotype                       | +19   |
    >>> # | (category)   | (object)   | (category)   | (int32)   | (int32)   | (object)   | (category)   | (object)   | (object)                           | ...   |
    >>> # |--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------|
    >>> # | 1            | havana     | gene         | 11868     | 14409     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
    >>> # | 1            | havana     | transcript   | 11868     | 14409     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
    >>> # | 1            | havana     | exon         | 11868     | 12227     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
    >>> # | 1            | havana     | exon         | 12612     | 12721     | .          | +            | .          | transcribed_unprocessed_pseudogene | ...   |
    >>> # | ...          | ...        | ...          | ...       | ...       | ...        | ...          | ...        | ...                                | ...   |
    >>> # | 1            | havana     | gene         | 1173055   | 1179555   | .          | -            | .          | lncRNA                             | ...   |
    >>> # | 1            | havana     | transcript   | 1173055   | 1179555   | .          | -            | .          | lncRNA                             | ...   |
    >>> # | 1            | havana     | exon         | 1179364   | 1179555   | .          | -            | .          | lncRNA                             | ...   |
    >>> # | 1            | havana     | exon         | 1173055   | 1176396   | .          | -            | .          | lncRNA                             | ...   |
    >>> # +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+------------------------------------+-------+
    >>> # Stranded PyRanges object has 2,446 rows and 28 columns from 1 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    >>> # 19 hidden columns: gene_id, gene_name, gene_source, gene_version, tag, transcript_biotype, transcript_id, transcript_name, transcript_source, transcript_support_level, ... (+ 9 more.)
    """
    return pr.read_gtf(get_example_path("ensembl_human.gtf.gz"))
def read_exon_pyranges(gtf_file, overhang=(100, 100), first_last=True):
    '''
    Read exon as pyranges from gtf_file

    Args:
      gtf_file: gtf file from ensembl/gencode.
      overhang: (left, right) padding of exon to match variants.
      first_last: set overhang of first and last exon of the gene to zero
        so seq intergenic region will not be processed.

    Returns:
      `pyranges.PyRanges` built from the exon records. When `first_last`
      is true it also carries `left_overhang` and `right_overhang` columns
      and `Start`/`End` are already widened by them.

    NOTE(review): with `first_last=False` no overhang is applied and the
    overhang columns are absent -- confirm this is what callers expect.
    '''
    df_gtf = pyranges.read_gtf(gtf_file).df

    exon_columns = [
        'Chromosome', 'Start', 'End', 'Strand', 'exon_id', 'gene_id',
        'gene_name', 'transcript_id'
    ]
    # Explicit copy: the frame is modified below, and writing into a slice
    # of `df_gtf` triggers pandas' SettingWithCopy machinery.
    df_exons = df_gtf.loc[df_gtf['Feature'] == 'exon', exon_columns].copy()

    if first_last:
        # One transcript row per exon, aligned on the exon index, so exon
        # and transcript borders can be compared element-wise.
        df_genes = df_gtf[df_gtf['Feature'] == 'transcript']
        df_genes = df_genes.set_index('transcript_id')
        df_genes = df_genes.loc[df_exons['transcript_id']]
        df_genes = df_genes.set_index(df_exons.index)

        # An exon border that coincides with the transcript border gets
        # no overhang on that side.
        starting = df_exons['Start'] == df_genes['Start']
        ending = df_exons['End'] == df_genes['End']

        df_exons.loc[:, 'left_overhang'] = ~starting * overhang[0]
        df_exons.loc[:, 'right_overhang'] = ~ending * overhang[1]

        df_exons.loc[:, 'Start'] -= df_exons['left_overhang']
        df_exons.loc[:, 'End'] += df_exons['right_overhang']

    return pyranges.PyRanges(df_exons)
def test_BaseVariantMatcher__read_intervals():
    """`_read_intervals` takes exactly one source and loads each kind."""
    gtf_ranges = pyranges.read_gtf(gtf_file)

    # Two sources at once are rejected.
    with pytest.raises(ValueError):
        BaseVariantMatcher._read_intervals(pranges=gtf_ranges,
                                           gtf_path=gtf_file)

    # `interval_attrs` cannot be combined with `intervals`.
    with pytest.raises(ValueError):
        BaseVariantMatcher._read_intervals(intervals=intervals,
                                           interval_attrs=['gene_id'])

    gtf_starts = [200, 200, 200, 1049, 3029]
    gtf_ends = [4230, 4230, 402, 1340, 4230]

    from_gtf = BaseVariantMatcher._read_intervals(gtf_path=gtf_file)
    assert from_gtf.Chromosome.tolist() == ['chr1'] * 5
    assert from_gtf.Start.tolist() == gtf_starts
    assert from_gtf.End.tolist() == gtf_ends
    # (disabled) assert len(from_gtf.intervals.tolist()) == 5

    from_bed = BaseVariantMatcher._read_intervals(
        bed_path=example_intervals_bed)
    assert from_bed.Chromosome.tolist() == ['chr1'] * 4
    assert from_bed.Start.tolist() == [2, 2, 2, 602]
    assert from_bed.End.tolist() == [1000, 5000, 1002, 604]
    # (disabled) assert len(from_bed.intervals.tolist()) == 4

    from_pranges = BaseVariantMatcher._read_intervals(pranges=gtf_ranges)
    assert from_pranges.Chromosome.tolist() == ['chr1'] * 5
    assert from_pranges.Start.tolist() == gtf_starts
    assert from_pranges.End.tolist() == gtf_ends
    # (disabled) assert len(from_pranges.intervals.tolist()) == 5

    from_intervals = BaseVariantMatcher._read_intervals(intervals=intervals)
    converted = from_intervals.df
    assert converted.Chromosome.tolist() == ['chr1', 'chr1']
    assert converted.Start.tolist() == [1, 23]
    assert converted.End.tolist() == [10, 30]
    assert converted.Strand.tolist() == ['+', '-']
    assert len(from_intervals.intervals.tolist()) == 2
def gencode_gtf():
    """Read the bundled example file ``gencode_human.gtf.gz`` with ``pr.read_gtf``.

    Preview of the result:

    >>> # +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+-------------------+-------+
    >>> # | Chromosome   | Source     | Feature      | Start     | End       | Score      | Strand       | Frame      | gene_id           | +15   |
    >>> # | (category)   | (object)   | (category)   | (int32)   | (int32)   | (object)   | (category)   | (object)   | (object)          | ...   |
    >>> # |--------------+------------+--------------+-----------+-----------+------------+--------------+------------+-------------------+-------|
    >>> # | chr1         | HAVANA     | gene         | 11868     | 14409     | .          | +            | .          | ENSG00000223972.5 | ...   |
    >>> # | chr1         | HAVANA     | transcript   | 11868     | 14409     | .          | +            | .          | ENSG00000223972.5 | ...   |
    >>> # | chr1         | HAVANA     | exon         | 11868     | 12227     | .          | +            | .          | ENSG00000223972.5 | ...   |
    >>> # | chr1         | HAVANA     | exon         | 12612     | 12721     | .          | +            | .          | ENSG00000223972.5 | ...   |
    >>> # | ...          | ...        | ...          | ...       | ...       | ...        | ...          | ...        | ...               | ...   |
    >>> # | chr1         | HAVANA     | exon         | 1430549   | 1430662   | .          | -            | .          | ENSG00000225285.1 | ...   |
    >>> # | chr1         | HAVANA     | transcript   | 1430663   | 1434520   | .          | -            | .          | ENSG00000225285.1 | ...   |
    >>> # | chr1         | HAVANA     | exon         | 1434177   | 1434520   | .          | -            | .          | ENSG00000225285.1 | ...   |
    >>> # | chr1         | HAVANA     | exon         | 1430663   | 1430954   | .          | -            | .          | ENSG00000225285.1 | ...   |
    >>> # +--------------+------------+--------------+-----------+-----------+------------+--------------+------------+-------------------+-------+
    >>> # Stranded PyRanges object has 4,995 rows and 24 columns from 1 chromosomes.
    >>> # For printing, the PyRanges was sorted on Chromosome and Strand.
    >>> # 15 hidden columns: gene_type, gene_name, level, havana_gene, transcript_id, transcript_type, transcript_name, transcript_support_level, tag, ... (+ 6 more.)
    """
    return pr.read_gtf(get_example_path("gencode_human.gtf.gz"))
def check(path_gtf):
    """Print the value counts of the biotypes found in a GTF as a psql table.

    :param path_gtf: GTF file handed to ``pr.read_gtf``.
    """
    biotype_counts = get_biotypes(pr.read_gtf(path_gtf)).value_counts()
    table = biotype_counts.to_markdown(tablefmt="psql", floatfmt=",.0f")
    print(table)
def __init__(self, fasta_file, gtf_file):
    """Set up the extractor from a reference FASTA and a GTF annotation.

    The regions of interest are derived from the annotation with
    `get_roi_from_genome_annotation` and handed to the parent class
    together with a `FastaStringExtractor` over `fasta_file`.
    """
    annotation_df = pr.read_gtf(gtf_file, as_df=True)
    regions = pr.PyRanges(get_roi_from_genome_annotation(annotation_df))

    super().__init__(
        regions_of_interest=regions,
        reference_sequence=FastaStringExtractor(fasta_file),
    )
def test_read_gtf():
    """Read the Ensembl test GTF with and without `duplicate_attr`.

    Without it the exon carries a single tag; with it the tags are
    joined by commas.
    """
    cases = (
        ({}, 'basic'),
        ({'duplicate_attr': True}, 'CCDS,basic'),
    )
    for extra_options, expected_exon_tag in cases:
        gr = pr.read_gtf("tests/test_data/ensembl.gtf", full=True,
                         **extra_options)
        assert len(gr.columns) == 28

        records = gr.df

        # Second row of the table is the transcript record.
        assert records.iloc[1]['tag'] == 'basic'

        exon_rows = records[records['exon_id'] == 'ENSE00003812156']
        assert exon_rows.iloc[0]['tag'] == expected_exon_tag
def edit_gtf(input_gtf, chr_no):
    """Split the protein-coding gene records of a GTF by numbered chromosome.

    Parameters
    ----------
    input_gtf :
        GTF file handed to ``pr.read_gtf``.
    chr_no : int
        Highest chromosome number to return.

    Returns
    -------
    list of pandas.DataFrame
        One frame per chromosome ``1 .. chr_no`` (index reset), holding the
        gene records with ``gene_biotype == 'protein_coding'``. Records on
        'Mt' and 'Pt' are dropped; ``Chromosome`` is converted to int32.
    """
    columns = ['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score',
               'Strand', 'gene_id', 'gene_biotype']
    df = pr.read_gtf(input_gtf).df[columns]

    keep = (
        (df['Feature'] == 'gene')
        & (df['gene_biotype'] == 'protein_coding')
        & ~df['Chromosome'].isin(['Mt', 'Pt'])
    )
    # Work on an explicit copy instead of editing slices in place.
    df = df[keep].copy()

    # Go through `str` first: casting a categorical column directly also
    # casts its unused categories ('Mt', 'Pt'), which cannot become ints.
    df['Chromosome'] = df['Chromosome'].astype(str).astype('int32')

    return [
        df[df['Chromosome'] == chrom].reset_index(drop=True)
        for chrom in range(1, chr_no + 1)
    ]
def file_to_grange(f, dtype=np.int32, filetype="reads"):
    """Load a reads or annotation file into a PyRanges object.

    Parameters
    ----------
    f :
        File to load.
    dtype :
        Passed to ``read_file`` for reads; ``np.int64`` additionally turns
        on ``extended=True`` when the PyRanges is built.
    filetype : {"reads", "annotation"}
        "reads" loads ``f`` with ``read_file``; "annotation" loads it with
        ``read_gtf(f, annotation="ensembl")``.

    Raises
    ------
    ValueError
        If ``filetype`` is not one of the two supported values.
    """
    # Reject unknown file types up front; previously this fell through to
    # an UnboundLocalError at the return statement.
    if filetype not in ("reads", "annotation"):
        raise ValueError(
            f"Unknown filetype {filetype!r}; expected 'reads' or 'annotation'")

    from pyranges import PyRanges, read_gtf

    extended = bool(dtype == np.int64)

    if filetype == "reads":
        df = read_file(f, dtype)
        gr = PyRanges(df, extended=extended)
    else:
        gr = read_gtf(f, annotation="ensembl")

    return gr
def __init__(
        self,
        fasta_file,
        gtf_file,
        vcf_file,
        vcf_file_tbi=None,
        vcf_lazy=True,
):
    """Set up the extractor from a reference FASTA, a GTF and a VCF.

    The regions of interest are derived from the annotation with
    `get_roi_from_genome_annotation`; the variants come from a
    `MultiSampleVCF` opened with `lazy=vcf_lazy`. `vcf_file_tbi` is
    accepted but not used here.
    """
    annotation_df = pr.read_gtf(gtf_file, as_df=True)
    regions = pr.PyRanges(get_roi_from_genome_annotation(annotation_df))

    from kipoiseq.extractors import MultiSampleVCF

    parent_options = dict(
        regions_of_interest=regions,
        reference_sequence=FastaStringExtractor(fasta_file),
        variants=MultiSampleVCF(vcf_file, lazy=vcf_lazy),
    )
    super().__init__(**parent_options)
def main(fastafile: str = typer.Option(..., help="fasta file"),
         gtffile: str = typer.Option(..., help="gtf file"),
         outfile: str = typer.Option(..., help="output mRNA fasta file")):
    """Extract the mRNA sequence of every gene according to a GTF.

    Different transcripts of the same gene are merged, so each gene yields
    exactly one merged mRNA sequence in the output.
    """
    gr = pr.read_gtf(gtffile)
    df = gr.merge(by=["Feature", "gene_id"], strand=False).as_df()
    # Was `Fasta(fasta)`, an undefined name; the parameter is `fastafile`.
    seq = Fasta(fastafile)
    exons = df.loc[df['Feature'] == 'exon', :]

    with open(outfile, 'w') as f:
        for gene, gdf in exons.groupby('gene_id'):
            f.write(f'>{gene}\n')
            # Concatenate the merged exons in ascending start order.
            ordered = gdf.sort_values('Start')[['Chromosome', 'Start', 'End']]
            content = [
                seq[chrom][start:end].seq
                for chrom, start, end in ordered.values
            ]
            f.write(''.join(content) + '\n')
def read_ensemble_genes_gtf(gtf_filename) -> PyRanges:
    """
    Read an ensembl gtf and extract gene start end

    Parameters
    ----------
    gtf_filename : str
        GTF filename

    Returns
    -------
    PyRanges
        Genes bounds: one row per (Chromosome, gene_id, gene_name) with the
        smallest Start and the largest End of its records.
    """
    genes = pr.read_gtf(gtf_filename, as_df=True)
    # String names select pandas' own groupby reductions; passing the
    # builtins `min`/`max` is deprecated in recent pandas releases.
    genes = genes.groupby(['Chromosome', 'gene_id', 'gene_name'],
                          observed=True).agg({
                              'Start': 'min',
                              'End': 'max'
                          }).reset_index()
    return pr.PyRanges(genes)
def __init__(self, gtf_file, fasta_file, num_upstream, num_downstream,
             gtf_filter='gene_type == "protein_coding"', anchor='tss',
             transform=one_hot_dna, interval_attrs=["gene_id", "Strand"],
             use_strand=True):
    """Read and filter a GTF, resolve the anchor and store the settings.

    Args:
      gtf_file: GTF file read with `pr.read_gtf`.
      fasta_file: FASTA file wrapped in a `FastaStringExtractor`.
      num_upstream: stored as `_num_upstream`.
      num_downstream: stored as `_num_downstream`.
      gtf_filter: a query string for `DataFrame.query`, a callable that
        takes and returns the GTF frame, or a falsy value for no filter.
      anchor: key into `self._function_mapping` (case-insensitive) or a
        callable applied to the filtered GTF frame.
      transform: applied later to extracted sequences; None means identity.
      interval_attrs: attribute names, stored as `_interval_attrs`.
      use_strand: forwarded to `FastaStringExtractor`.

    Raises:
      Exception: if `anchor` is a string missing from `_function_mapping`.
    """
    # Read and filter gtf
    gtf = pr.read_gtf(gtf_file).df
    if gtf_filter:
        if isinstance(gtf_filter, str):
            gtf = gtf.query(gtf_filter)
        else:
            gtf = gtf_filter(gtf)

    # Extract anchor
    if isinstance(anchor, str):
        anchor = anchor.lower()
        if anchor in self._function_mapping:
            anchor = self._function_mapping[anchor]
        else:
            raise Exception("No valid anchorpoint was chosen")
    self._gtf_anchor = anchor(gtf)

    # Other parameters
    self._use_strand = use_strand
    self._fa = FastaStringExtractor(fasta_file, use_strand=self._use_strand)

    self._transform = transform
    if self._transform is None:
        self._transform = lambda x: x

    self._num_upstream = num_upstream
    self._num_downstream = num_downstream
    # Store a private copy: the default is a single list object shared by
    # every call, so mutating the stored value would leak across instances.
    self._interval_attrs = (
        list(interval_attrs) if interval_attrs is not None else None)
def test_pyranges_to_intervals():
    """Convert GTF- and BED-derived ranges to intervals and check them."""
    gtf_ranges = pyranges.read_gtf(gtf_file)
    attr_names = ['gene_id', 'gene_name', 'transcript_id', 'exon_id']
    gtf_intervals = list(
        pyranges_to_intervals(gtf_ranges, interval_attrs=attr_names))

    assert len(gtf_intervals) == 5

    # The requested attributes are attached to each interval.
    expected_attrs = {
        'gene_id': 'ENSG00000012048',
        'gene_name': 'BRCA1',
        'transcript_id': 'ENST00000357654',
        'exon_id': 'ENSE00003510592',
    }
    fifth = gtf_intervals[4]
    for attr_name in attr_names:
        assert fifth.attrs[attr_name] == expected_attrs[attr_name]

    bed_ranges = pyranges.read_bed(example_intervals_bed)
    bed_intervals = list(pyranges_to_intervals(bed_ranges))

    assert len(bed_intervals) == 4
    assert bed_intervals[0].start == 2

    assert bed_ranges.Chromosome.tolist() == ['chr1'] * 4
    assert bed_ranges.Start.tolist() == [2, 2, 2, 602]
    assert bed_ranges.End.tolist() == [1000, 5000, 1002, 604]
def adjust_gtf(file, vcf_file, new_file):
    """Shift gene coordinates of a GTF according to the shifts in a VCF table.

    Only protein-coding gene records on chromosomes '1' to '5' are kept.
    For every VCF row with a non-zero ``SHIFT``, genes on the same
    chromosome are updated:

    * position before the gene (``POS`` below both Start and End):
      Start and End are both moved by the shift;
    * position strictly inside the gene (Start < ``POS`` < End):
      only End is moved.

    Parameters
    ----------
    file :
        GTF file handed to ``pr.read_gtf``.
    vcf_file :
        Table indexable by the columns 'SHIFT', 'POS' and '#CHROM'.
    new_file :
        Destination of the tab-separated output, written without header
        and index.
    """
    vcf = vcf_file
    annotation = pr.read_gtf(file).df

    keep = (
        (annotation['Feature'] == 'gene')
        & (annotation['gene_biotype'] == 'protein_coding')
        # To ensure we use just chromosome 1-5, excluding Mt and Pt
        & annotation['Chromosome'].isin(['1', '2', '3', '4', '5'])
    )
    columns = ['Chromosome', 'Source', 'Feature', 'Start', 'End', 'Score',
               'Strand', 'gene_id']
    # `.loc[mask, columns]` gives an independent frame, so the in-place
    # coordinate updates below do not write into a slice.
    df = annotation.loc[keep, columns].reset_index(drop=True)

    print('processing gtf')
    for idx, shift in enumerate(vcf['SHIFT']):
        if shift == 0:
            continue
        position = int(vcf['POS'][idx])
        chrom = vcf['#CHROM'][idx]

        # Both masks are built before any write, so each gene is judged on
        # its coordinates from before this variant was applied.
        on_chrom = df['Chromosome'] == chrom
        before_gene = (on_chrom & (position < df['Start'])
                       & (position < df['End']))
        inside_gene = (on_chrom & (df['Start'] < position)
                       & (position < df['End']))

        df.loc[before_gene, ['Start', 'End']] += shift
        df.loc[inside_gene, 'End'] += shift

    df.to_csv(new_file, header=False, index=False, sep='\t')
def gencode_gtf():
    """Read the bundled example file ``gencode_human.gtf.gz`` with ``pr.read_gtf``."""
    return pr.read_gtf(get_example_path("gencode_human.gtf.gz"))
def ensembl_gtf():
    """Read the bundled example file ``ensembl_human.gtf.gz`` with ``pr.read_gtf``."""
    return pr.read_gtf(get_example_path("ensembl_human.gtf.gz"))
def test_read_gtf():
    """With ``full=False`` the frame starts with the four core columns."""
    expected_columns = "Chromosome Start End Strand".split()
    frame = pr.read_gtf("tests/test_data/ensembl.gtf", full=False).df
    leading_columns = list(frame.columns[:4])
    assert leading_columns == expected_columns
def test_read_gff3():
    """A GFF3 read with ``full=False`` starts with the four core columns."""
    expected_columns = "Chromosome Start End Strand".split()
    frame = pr.read_gtf("tests/test_data/gencode.gff3", full=False).df
    leading_columns = list(frame.columns[:4])
    assert leading_columns == expected_columns
def gtf():
    """Parse the Ensembl test GTF with ``pr.read_gtf`` and return the result."""
    test_file = "tests/test_data/ensembl.gtf"
    return pr.read_gtf(test_file)
#!/usr/bin/env python
# coding: utf-8

import pyranges
import os
import numpy as np
import logging

# Snakemake script: selects the protein-coding genes of a GTF that overlap
# the target regions and builds "<gene_id>_<gene_name>" labels for them.

# gtf file with gene annotations
gtf_path = snakemake.input.gtf
target_regions_path = snakemake.input.target_regions

# The root logger defaults to WARNING, which would drop the info message
# below; set the level explicitly so it reaches the log file.
logging.basicConfig(filename=snakemake.log[0], level=logging.INFO)

anno = pyranges.read_gtf(gtf_path)

# filter: keep gene records annotated as protein coding
protein_coding_genes = anno[(anno.Feature == 'gene')
                            & (anno.gene_biotype == 'protein_coding')]

# restrict to genes overlapping the target regions
target_regions = pyranges.read_bed(target_regions_path)
protein_coding_genes = protein_coding_genes.overlap(target_regions)

logging.info('found {} protein coding genes.'.format(
    len(protein_coding_genes)))

# one "<gene_id>_<gene_name>" label per remaining gene
id_name = np.array([
    '_'.join([i, n]) for i, n in zip(protein_coding_genes.gene_id,
                                     protein_coding_genes.gene_name)
])
def test_read_gtf():
    """Reading the Ensembl test GTF with ``full=True`` yields 28 columns."""
    column_count = len(
        pr.read_gtf("tests/test_data/ensembl.gtf", full=True).columns)
    assert column_count == 28
def generateIlluminaWindowFromKb(t2gPath, ecPath, splicePath, unsplicePath,
                                 gtfPath, illuminaWindowDir, windowSize):
    """
    Generate illumina windows from kb_python results (workflow: nuclei).

    Every barcode/UMI assigned to at least one gene is written as a FASTA
    record into each genomic window covered by that gene.

    t2gPath:
        index file
    ecPath:
        matrix ec
    splicePath:
        filtered spliced bus
    unsplicePath:
        filtered unspliced bus (the earlier docstring said "spliced" here,
        presumably a typo)
    gtfPath:
        gtf anno file, used to create kb ref
    illuminaWindowDir:
        dir to store illumina reads, must end with '/'
    windowSize:
        window size; gene borders are integer-divided by it
    """
    kbParseTools.mkdir(illuminaWindowDir)

    logger.info("start parse gff file")
    gtfDf = pr.read_gtf(gtfPath, as_df=True)
    gtfDf = gtfDf.query("Feature == 'exon'").reindex(
        ['Chromosome', 'Start', 'End', 'gene_id'], axis=1)
    # Collapse the exons to one row per gene: chromosome plus outer borders.
    gtfDf = gtfDf.assign(Gene=lambda x: x["gene_id"]).groupby("Gene").agg({
        "Chromosome": lambda x: x.iloc[0],
        "Start": 'min',
        "End": 'max'
    })
    # Window range per gene, padded by one window on both sides.
    gtfDf = gtfDf.assign(
        StartWin=lambda x: x["Start"] // windowSize - 1,
        EndWin=lambda x: x["End"] // windowSize + 1,
    )
    gtfDf = gtfDf.to_dict('index')

    logger.info("start parse bus file")
    kbUmiSpliceMappingInfoDf = kbParseTools.getBustoolsMappingResult(
        t2gPath, ecPath, splicePath)
    kbUmiUnspliceMappingInfoDf = kbParseTools.getBustoolsMappingResult(
        t2gPath, ecPath, unsplicePath)
    kbUmiMappingInfoDf = pd.concat(
        [kbUmiUnspliceMappingInfoDf, kbUmiSpliceMappingInfoDf])

    # Combine the gene sets seen for the same barcode/UMI and keep the
    # entries that still carry at least one gene.
    kbUmiMappingInfoDf = kbUmiMappingInfoDf.groupby('barcodeUmi').agg(
        {'geneLs': lambda x: __getSetOutersect(*x)})
    kbUmiMappingInfoDf = kbUmiMappingInfoDf.assign(
        geneCounts=lambda df: df['geneLs'].map(len)).query("geneCounts >= 1")
    kbUmiMappingInfoDf = kbUmiMappingInfoDf.reset_index().assign(
        barcode=lambda df: df["barcodeUmi"].str.split("_").str[0],
        umi=lambda df: df["barcodeUmi"].str.split("_").str[1],
    ).assign(seq=lambda df: df["barcode"] + df["umi"])

    # chromosome -> window number -> FASTA records
    illuminaWindowContentDt = defaultdict(lambda: defaultdict(list))
    for oneUmiNt in kbUmiMappingInfoDf.itertuples():
        fastaRecord = f'>{oneUmiNt.barcodeUmi}\n{oneUmiNt.seq}'
        for gene in oneUmiNt.geneLs:
            geneGtfDt = gtfDf[gene]
            geneChr = geneGtfDt['Chromosome']
            geneStartWin = geneGtfDt['StartWin']
            geneEndWin = geneGtfDt['EndWin']
            for singleWin in range(geneStartWin, geneEndWin + 1):
                illuminaWindowContentDt[geneChr][singleWin].append(
                    fastaRecord)

    i = 0
    totalCounts = sum(len(x) for x in illuminaWindowContentDt.values())
    pendingWrites = []
    with ThreadPoolExecutor(24) as mtT:
        for chromNum, chromDt in illuminaWindowContentDt.items():
            chromFastaDir = f'{illuminaWindowDir}{chromNum}/'
            kbParseTools.mkdir(chromFastaDir)
            for windowNum, windowLs in chromDt.items():
                i += 1
                pendingWrites.append(
                    mtT.submit(writeWindowFasta, chromFastaDir, windowNum,
                               windowLs, i, totalCounts))
        # A discarded future hides any exception raised in the worker;
        # collecting the results re-raises it here.
        for pendingWrite in pendingWrites:
            pendingWrite.result()
def load_gff_cds(fp: str) -> Tuple[PyRanges, PyRanges]:
    """Load the CDS and UTR features of a GTF/GFF2 file as two PyRanges.

    The file is read with ``read_gtf``, reduced to the columns in
    ``GFF_FIELDS`` and to the feature types in ``GFF_FEATURES``. Rows with
    feature 'UTR' are split off; rows with feature 'stop_codon' are removed
    from the remaining table. Each remaining row gets an ``exon_index`` and
    the last CDS of each strand is extended by 3 bases (``End`` on '+',
    ``Start`` on '-').

    :param fp: path of the GTF/GFF2 file.
    :return: ``(cds_ranges, utr_ranges)``.
    :raises ValueError: if no gene or transcript identifier is found, if an
        identifier is missing, or if there are more transcripts than genes.
    """
    # Load necessary fields from GTF/GFF2 file
    ranges: pd.DataFrame = read_gtf(
        fp, as_df=True)[GFF_FIELDS].rename(columns={'Frame': 'frame'})
    logging.debug("GTF/GFF2 file: %d features found." % ranges.shape[0])

    # Drop unnecessary features
    ranges = ranges[ranges.Feature.isin(GFF_FEATURES)]
    # NOTE(review): at this point the count still includes the UTR and
    # stop_codon rows that are separated out further down.
    logging.debug("GTF/GFF2 file: %d CDS features found." % ranges.shape[0])

    # Compress identifiers
    ranges.transcript_id = ranges.transcript_id.astype('category')
    ranges.gene_id = ranges.gene_id.astype('category')

    # Extract UTR features (returned separately, without the Feature column)
    utr_mask: pd.Series = ranges.Feature == 'UTR'
    utr_ranges: PyRanges = PyRanges(
        df=ranges[utr_mask].drop('Feature', axis=1))
    ranges = ranges[~utr_mask]
    del utr_mask

    # Transform frames; the helper is expected to return an int8 column
    ranges.frame = _get_frames(ranges.frame)
    assert ranges.frame.dtype == 'int8'

    # Extract stop codon features: per transcript, End on '+' and Start
    # otherwise. The mapping is not used yet (see the TODO near the end).
    stop_mask: pd.Series = ranges.Feature == 'stop_codon'
    stop_codons: Dict[str, int] = {
        r.transcript_id: r.End if r.Strand == '+' else r.Start
        for r in ranges[stop_mask].itertuples()
    }
    ranges = ranges[~stop_mask]
    del stop_mask

    # Sort CDS features by genomic coordinates
    ranges = ranges.drop('Feature', axis=1).sort_values(by=[
        'gene_id', 'transcript_id', 'Chromosome', 'Strand', 'Start', 'End'
    ], ignore_index=True)

    # Validate number of gene and transcript identifiers
    # (counted from the category levels of the two id columns)
    gene_n: int = ranges.gene_id.cat.categories.size
    transcript_n: int = ranges.transcript_id.cat.categories.size
    logging.debug("GTF/GFF2 file: %d genes found." % gene_n)
    logging.debug("GTF/GFF2 file: %d transcripts found." % transcript_n)
    if gene_n == 0 or transcript_n == 0:
        raise ValueError("No gene or transcript ID found in GTF/GFF file!")
    if transcript_n > gene_n:
        raise ValueError(
            "Multiple transcripts per gene in GTF/GFF file are not supported!")

    # Check for missing identifiers
    if ranges.gene_id.isnull().values.any():
        raise ValueError("Missing gene ID in GTF/GFF2 file!")
    if ranges.transcript_id.isnull().values.any():
        raise ValueError("Missing transcript ID in GTF/GFF2 file!")

    # Check only one transcript per gene is present: count the
    # (gene, transcript) pairs that actually occur in the table
    gene_transcript_counts: pd.DataFrame = ranges.groupby(
        ['gene_id', 'transcript_id'],
        sort=False).size().reset_index(name='counts')
    gene_transcript_counts = gene_transcript_counts[
        gene_transcript_counts.counts > 0]
    if gene_transcript_counts.shape[0] > gene_n:
        raise ValueError(
            "Multiple transcripts per gene in GTF/GFF file are not supported!")

    # Assign a sequential index to each CDS feature (5' to 3')
    ranges['exon_index'] = ranges.groupby(['transcript_id'],
                                          sort=False).pipe(get_exon_indices)

    # Append the stop codons to the last CDS features: 3 bases are added
    # at End on the '+' strand and subtracted from Start on the '-' strand
    strands: Set[str] = set(ranges.Strand.cat.categories.values)
    if '+' in strands:
        last_cds_plus: np.ndarray = _get_last_cds_indices(ranges, '+')
        ranges.loc[last_cds_plus, 'End'] += 3
        del last_cds_plus
    if '-' in strands:
        last_cds_minus: np.ndarray = _get_last_cds_indices(ranges, '-')
        ranges.loc[last_cds_minus, 'Start'] -= 3
        del last_cds_minus

    # TODO: validate with stop codon features
    del stop_codons

    # Convert to PyRanges
    cds_ranges: PyRanges = PyRanges(df=ranges)

    return cds_ranges, utr_ranges