def parse_matchAnnot(fa_or_fq, filename, not_pbid=False, parse_FL_coverage=False):
    """
    Summarise a matchAnnot report into ``<filename>.parsed.txt``.

    :param fa_or_fq: FASTA/FASTQ file listing the sequences to report on; the
        format is chosen by ``type_fa_or_fq(fa_or_fq)``.
    :param filename: matchAnnot output; only lines containing ``result:`` are used.
    :param not_pbid: if True, use sequence IDs verbatim; otherwise keep only the
        part before the first ``|``.
    :param parse_FL_coverage: if True, also extract ``full_length_coverage=`` from
        each sequence description and write it as an extra ``count_fl`` column.

    The output is tab-delimited with columns
    ``pbid, pbgene, refisoform, refgene, score`` (plus ``count_fl`` when requested).
    Sequences without a positive-scoring match are written with ``NA`` fields.
    """
    pbids = []
    fl_cov = {}  # only used if parse_FL_coverage is True
    with open(fa_or_fq) as seq_handle:
        for r in SeqIO.parse(seq_handle, type_fa_or_fq(fa_or_fq)):
            _id = r.id if not_pbid else r.id.split("|")[0]
            pbids.append(_id)
            if not parse_FL_coverage:
                continue
            try:
                fl_cov[_id] = int(
                    r.description.split("full_length_coverage=")[1].split(";")[0]
                )
            except (IndexError, ValueError):
                # tag missing (IndexError) or not an integer (ValueError)
                logger.error(
                    f"WARNING: Unable to extract `full_length_coverage=` from {r.description}. Mark as NA."
                )
                fl_cov[_id] = "NA"

    # best hit per ID, ex: PB.1.1 -> (NOC2L, NOC2L-001, 5)
    no_hit = (None, None, 0)
    match = {}
    with open(filename) as annot_handle:
        for line in annot_handle:
            i = line.find("result:")
            if i < 0:
                continue
            raw = line[i:].strip().split()
            # raw[7] is read below, so at least 8 fields are required
            if len(raw) < 8:
                continue
            pbid = raw[1] if not_pbid else raw[1].split("|")[0]
            gene = raw[2]
            isoform = raw[3]
            score = int(raw[7])
            # compare against the stored *score* (index 2), not the isoform name
            if score > match.get(pbid, no_hit)[2]:
                match[pbid] = (gene, isoform, score)

    out_filename = f"{filename}.parsed.txt"
    with open(out_filename, "w") as f:
        f.write("pbid\tpbgene\trefisoform\trefgene\tscore")
        if parse_FL_coverage:
            f.write("\tcount_fl")
        f.write("\n")
        for pbid in pbids:
            pbpre = pbid if not_pbid else pbid.split(".")[1]
            _cov_text = f"\t{fl_cov[pbid]}" if parse_FL_coverage else ""
            gene, isoform, score = match.get(pbid, no_hit)
            if gene is None:
                f.write(f"{pbid}\t{pbpre}\tNA\tNA\tNA{_cov_text}\n")
            else:
                f.write(
                    f"{pbid}\t{pbpre}\t{isoform}\t{gene}\t{score}{_cov_text}\n"
                )
    logger.info(f"Output written to: {out_filename}")
def get_abundance_post_collapse(
    group_file: Path,
    cluster_report_csv: Path,
    output_prefix: str,
    restricted_movies: Optional[List[str]] = None,
):
    """
    Produce the read-stat and abundance files for a collapsed result.

    :param group_file: group file, read with ``read_group_filename(..., is_cid=True)``
    :param cluster_report_csv: cluster report passed to ``output_read_count_IsoSeq_csv``
    :param output_prefix: outputs are ``<prefix>.read_stat.txt`` and ``<prefix>.abundance.txt``
    :param restricted_movies: forwarded unchanged to ``make_abundance_file``
    :return: None; exits with status -1 if either input file is missing
    """
    for required in (group_file, cluster_report_csv):
        if not required.exists():
            logger.error(f"File {required.name} does not exist. Abort!")
            sys.exit(-1)

    read_stat_filename = f"{output_prefix}.read_stat.txt"
    abundance_filename = f"{output_prefix}.abundance.txt"

    cid_info = read_group_filename(group_file, is_cid=True)
    output_read_count_IsoSeq_csv(cid_info, cluster_report_csv, read_stat_filename)
    logger.info(f"Read stat file written to {read_stat_filename}")

    make_abundance_file(
        read_stat_filename,
        abundance_filename,
        restricted_movies=restricted_movies,
    )
    logger.info(f"Abundance file written to {abundance_filename}")
def match_fusion_record(
        self, records: List[GFF.gmapRecord]) -> Optional[GFF.gmapRecord]:
    """
    Find the known fusion whose full record set matches ``records``.

    records --- in order, the records of a single fusion.

    Candidates come from matching the first record against the tree (3' end
    checked, 5' end not). Each candidate ID is reduced to the first group of
    ``seqid_rex`` and its stored records in ``self.record_d_fusion`` are
    compared with ``records``.

    Returns None when nothing matches and the matched fusion ID when exactly
    one does; logs and exits with status -1 when several match.
    NOTE(review): the value returned is the ID taken from the regex group, not
    a record object as the annotation suggests -- confirm against callers.
    """
    # match the first record, requiring additionally that the precise 3' end matches
    candidates = self.match_record_to_tree(
        records[0], check_5_dist=False, check_3_dist=True)

    # for each candidate (ex: PB.8.1), fetch the full record set and compare
    matched_ids = []
    for candidate in candidates:
        fusion_id = seqid_rex.match(candidate).group(1)
        if self.check_records_match(records, self.record_d_fusion[fusion_id]):
            matched_ids.append(fusion_id)

    if not matched_ids:
        return None
    if len(matched_ids) == 1:
        return matched_ids[0]

    logger.error(
        "ERROR! more than one possible candidate in match_fusion_record! DEBUG."
    )
    logger.error(f"MATCHED: {matched_ids}")
    sys.exit(-1)
def iter_gmap_sam_for_fusion(gmap_sam_filename, fusion_candidates, transfrag_len_dict):
    """
    Walk a sorted GMAP SAM file and yield groups of overlapping fusion records.

    Only records whose ``qID`` is in ``fusion_candidates`` are collected. A
    group is closed when the next record is on a different ``sID`` or starts
    past the ``sEnd`` of the most recently collected record. Every group is
    passed through ``sep_by_strand`` before being yielded.

    Exits with status -1 if a record starts before the previously collected
    record on the same ``sID`` (file not sorted).
    """
    reader = BioReaders.GMAPSAMReader(gmap_sam_filename,
                                      True,
                                      query_len_dict=transfrag_len_dict)

    # advance to the first fusion candidate; it seeds the first group
    group = []
    for rec in reader:
        if rec.qID in fusion_candidates:
            group.append(rec)
            break

    # continue from wherever the first pass stopped
    for rec in reader:
        if group:
            last = group[-1]
            if rec.sID == last.sID and rec.sStart < last.sStart:
                logger.error("ERROR: SAM file is NOT sorted. ABORT!")
                sys.exit(-1)
            if rec.sID != group[0].sID or rec.sStart > last.sEnd:
                yield (sep_by_strand(group))
                group = []
        if rec.qID in fusion_candidates:
            group.append(rec)

    if group:
        yield (sep_by_strand(group))
def sanity_check_seqids(seqids: List[str]):
    """
    Verify that every ID in ``seqids`` matches ``seqid_rex``.

    Logs an error and exits with status -1 at the first ID that does not
    match; returns None when all IDs pass (or the list is empty).
    """
    for seqid in seqids:
        if seqid_rex.match(seqid) is not None:
            continue
        logger.error(
            f"Expected ID format (ex: PB.1.2) not followed by {seqid}! Abort!"
        )
        sys.exit(-1)
def brangus(vcf_filename, out_filename, unzip_snps=None):
    """
    Evaluate every ``by_loci/*size*/`` directory against a set of known SNPs.

    :param vcf_filename: path of the VCF to load when ``unzip_snps`` is not given
    :param out_filename: tab-delimited report to write
    :param unzip_snps: optional pre-loaded ``chrom -> pos -> record`` mapping;
        when None it is built from ``vcf_filename``

    For each locus directory that contains ``phased.partial.vcf`` the strand is
    read from ``config`` (``ref_strand=`` line, ``"NA"`` if absent), recoverable
    positions are computed with ``get_positions_to_recover(..., min_cov=30)``
    and the rows are written by ``eval_isophase``. Directories without the VCF
    are skipped.
    """
    if unzip_snps is None:
        unzip_snps = defaultdict(dict)
        # vcfpy readers are created via from_path/from_stream; the bare
        # constructor expects an open stream, not a filename.
        vcf_reader = vcfpy.Reader.from_path(vcf_filename)
        try:
            for r in vcf_reader:
                unzip_snps[r.CHROM][r.POS] = r
        finally:
            vcf_reader.close()
    logger.info(f"Finished reading {vcf_filename}")

    FIELDS = [
        "dir",
        "chrom",
        "pos",
        "strand",
        "ref",
        "alt_Short",
        "alt_PB",
        "in_Short",
        "in_PB",
        "cov_Short",
        "cov_PB",
        "genomic_HP",
    ]
    with open(out_filename, "w") as out_f:
        writer = DictWriter(out_f, FIELDS, delimiter="\t")
        writer.writeheader()
        for d1 in glob.glob("by_loci/*size*/"):
            mpileup = Path(d1, "ccs.mpileup")
            mapfile = Path(d1, "fake.mapping.txt")
            vcffile = Path(d1, "phased.partial.vcf")
            config = Path(d1, "config")
            nosnp = Path(d1, "phased.partial.NO_SNPS_FOUND")

            if not vcffile.exists():
                if nosnp.exists():
                    # expected case: phasing ran and reported no SNPs
                    logger.info(f"Skipping {d1} because no SNPs found.")
                else:
                    # neither output is present
                    logger.error(
                        f"Skipping {d1}: neither {vcffile.name} nor {nosnp.name} exists."
                    )
                continue

            logger.info(f"Evaluating {d1}.")
            strand = "NA"
            if config.exists():
                # find the strand this gene family is on
                with open(config) as config_f:
                    for line in config_f:
                        if line.startswith("ref_strand="):
                            strand = line.strip().split("=")[1]
            good_positions, cov_at_pos = get_positions_to_recover(
                mapfile, mpileup, unzip_snps, min_cov=30)
            name = d1.split("/")[1]
            eval_isophase(
                vcffile,
                unzip_snps,
                good_positions,
                cov_at_pos,
                {},
                {},
                writer,
                name,
                strand,
            )
def main(
    input_csv: str = typer.Argument(..., help="Input CSV"),
    output_csv: str = typer.Argument(..., help="Output CSV"),
    bc_rank_file: str = typer.Argument(
        ..., help="Cell barcode rank file from short read data"
    ),
    only_top_ranked: bool = typer.Option(
        False,
        help="Only output those that are top-ranked. Must have --bc_rank_file.",
    ),
    dropseq_clean_report: str = typer.Option(
        ...,
        help="Output from running DetectBeadSubstitutionErrors in DropSeq cookbook (ex: star_gene_exon_tagged_clean_substitution.bam_report.txt)",
    ),
    dropseq_synthesis_report: str = typer.Option(
        ...,
        help="Output from running DetectBeadSynthesisErrors in DropSeq cookbook (ex: star_gene_exon_tagged_clean_substitution_clean2.bam_report.txt)",
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    # CLI entry point: load the short-read barcode ranks and the DropSeq repair
    # reports, then run umi_bc_error_correct() on input_csv -> output_csv.
    # (Kept as comments on purpose: Typer shows a function docstring as the
    # command's help text, which would change the CLI output.)
    shortread_bc = {}  # dict of cell barcode -> "Y" for top ranked
    if bc_rank_file is not None:
        # close the rank file as soon as it has been read
        with open(bc_rank_file) as rank_handle:
            for r in DictReader(rank_handle, delimiter="\t"):
                shortread_bc[r["cell_barcode"]] = r["top_ranked"]
    elif only_top_ranked:
        logger.error("--bc_rank_file must be given if using --only_top_ranked!")
        sys.exit(-1)

    # the synthesis report is layered on top of whatever the clean report gave
    bc_repair_dict = None
    if dropseq_clean_report is not None:
        bc_repair_dict = read_dropseq_clean_report(dropseq_clean_report)
    if dropseq_synthesis_report is not None:
        bc_repair_dict = read_dropseq_synthesis_report(
            dropseq_synthesis_report, bc_repair_dict
        )

    umi_bc_error_correct(
        input_csv,
        output_csv,
        shortread_bc,
        only_top_ranked,
        bc_repair_dict,
    )
def main(
    fasta_filename: str = typer.Argument(...),
    output_prefix: str = typer.Argument(...),
    copy: int = typer.Option(
        1,
        help="Number of copies to simulate per input sequence (default: 1)",
    ),
    ins: float = typer.Option(
        0,
        "--ins",
        "-i",
        help="Insert error rate [0-1] (default: 0)",
    ),
    dele: float = typer.Option(
        0,
        "--dele",
        "-d",
        help="Deletion error rate [0-1] (default: 0)",
    ),
    sub: float = typer.Option(
        0,
        "--sub",
        "-s",
        help="Substitution error rate [0-1] (default: 0)",
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    # CLI entry point: print `copy` simulated versions of every FASTA record to
    # stdout, each passed through sim_seq() with the requested error rates.
    # (Comments rather than a docstring: Typer would surface a docstring as
    # the command's help text.)
    if sub < 0 or sub > 1:
        logger.error("Substitution error must be between 0-1!")
        sys.exit(-1)
    if ins < 0 or ins > 1:
        logger.error("Insertion error must be between 0-1!")
        sys.exit(-1)
    if dele < 0 or dele > 1:
        logger.error("Deletion error must be between 0-1!")
        sys.exit(-1)
    if sub + ins + dele > 1:
        logger.error("Total sub+ins+del error cannot exceed 1!")
        sys.exit(-1)

    # Running totals of the three error rates, closed by 1.0. The third entry
    # previously left out `sub`, breaking the cumulative pattern of the others.
    # NOTE(review): assumes sim_seq() reads these as cumulative thresholds -- confirm.
    profile = [sub, sub + ins, sub + ins + dele, 1.0]

    fasta_filename = Path(fasta_filename)
    idpre = output_prefix
    ith = 0
    with open(fasta_filename) as fasta_handle:
        for r in SeqIO.parse(fasta_handle, "fasta"):
            # part of the ID before the first '|'; the whole ID when there is none
            # (slicing with str.find() would drop the last character in that case)
            seq_name = r.id.split("|")[0]
            # str() replaces Seq.tostring(), which newer Biopython no longer provides
            template = str(r.seq)
            for _ in range(copy):
                ith += 1
                print(f">{idpre}_{ith}_{seq_name}\n{sim_seq(template, profile)}")
def get_roi_len(seqid: str):
    """
    Return the read length encoded in a CCS sequence ID.

    Accepted formats:
      before isoseq3: <movie>/<zmw>/<start>_<end>_CCS  -> abs(start - end)
      for isoseq3:    <movie>/<zmw>/ccs                -> "NA" (no coordinates)

    Any other format is logged as an error and the process exits with -1.
    """
    if seqid.endswith("/ccs"):
        logger.info(
            "WARNING: isoseq3 format detected. Output `length` column will be `NA`."
        )
        return "NA"

    if not seqid.endswith("_CCS"):
        logger.error(
            "Sequence ID format must be <movie>/<zmw>/<start>_<end>_CCS or <movie>/<zmw>/ccs! Abort!"
        )
        sys.exit(-1)

    # third path component is "<start>_<end>_CCS"
    start_txt, end_txt, _suffix = seqid.split("/")[2].split("_")
    return abs(int(start_txt) - int(end_txt))
def main(
    genome_fasta: str = typer.Argument(..., help="Reference genome fasta"),
    flnc_filename: str = typer.Argument(..., help="FLNC fastq file"),
    gff_filename: str = typer.Argument(
        ..., help="GFF file of transcripts, IDs must be PB.X.Y"),
    stat_filename: str = typer.Argument(
        ..., help="Tab-delimited read stat file linking FLNC to PB.X.Y"),
    coverage: int = typer.Option(
        40,
        "--coverage",
        "-c",
        help="Minimum FLNC coverage required (default: 40)",
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    # CLI entry point: validate the inputs, load the genome into memory and
    # hand everything to select_loci_to_phase().
    # (Comments rather than a docstring: Typer would surface a docstring as
    # the command's help text.)

    # is_dir() is already False for a missing path, so no separate exists() test
    if Path("by_loci").is_dir():
        logger.error(
            "Directory by_loci/ already exists. Delete before running!")
        sys.exit(-1)

    required_inputs = (
        ("genome FASTA", genome_fasta),
        ("FLNC file", flnc_filename),
        ("GFF file", gff_filename),
        ("Stat file", stat_filename),
    )
    for label, path in required_inputs:
        if not Path(path).exists():
            logger.error(f"Cannot find {label} {path}. Abort!")
            sys.exit(-1)

    logger.info(f"Reading genome fasta {genome_fasta}...")
    # to_dict() consumes the parser completely, so the handle can be closed
    # as soon as the dictionary is built
    with open(genome_fasta) as genome_handle:
        genome_d = SeqIO.to_dict(SeqIO.parse(genome_handle, "fasta"))

    select_loci_to_phase(genome_d, gff_filename, stat_filename, flnc_filename,
                         coverage)
def main(
    config: Union[str, Path] = typer.Argument(..., help="Config filename"),
    output_prefix: str = typer.Argument(..., help="Output prefix"),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
):
    # CLI entry point: read the sample config, validate the referenced files,
    # load the optional genome / junction annotation and call
    # summarize_junctions().
    # (Comments rather than a docstring: Typer would surface a docstring as
    # the command's help text.)
    try:
        # the config arrives from the command line as a plain string; wrap it
        # so read_config() can use Path methods on it
        (
            sample_dirs,
            gff_filename,
            genome_filename,
            junction_filename,
        ) = read_config(Path(config))
    except FileNotFoundError as error:
        logger.error(error)
        # nothing below can run without the config values
        sys.exit(-1)

    sanity_check(sample_dirs, gff_filename, genome_filename, junction_filename)

    if genome_filename is not None:
        logger.info(f"Reading genome file {genome_filename}...")
        with open(genome_filename) as genome_handle:
            genome_d = SeqIO.to_dict(SeqIO.parse(genome_handle, "fasta"))
    else:
        logger.info("No genome file given. Ignore.")
        genome_d = None

    if junction_filename is not None:
        logger.info(f"Reading junction file {junction_filename}....")
        junction_bed = read_annotation_junction_bed(junction_filename)
    else:
        logger.info("No junction file given. Ignore.")
        junction_bed = None

    summarize_junctions(
        sample_dirs,
        gff_filename,
        output_prefix,
        genome_d,
        junction_bed,
    )
def read_config(
    filename: Union[str, Path]
) -> Tuple[Dict[str, Path], Path, Optional[Path], Optional[Path]]:
    """
    Parse a sample config file for junction reports.

    Recognised lines:
        SAMPLE=<name>;<path>
    must also have
        GFF_FILENAME=
    optional:
        GENOME_FILENAME=
        JUNCTION_FILENAME=

    Everything else will be ignored (so you can re-use sample.config for chain_samples.py)

    :param filename: config file, given as ``str`` or ``Path``
    :return: ``(sample_dirs, gff_filename, genome_filename, junction_filename)``
        where ``sample_dirs`` maps sample name to its resolved directory and
        the two optional filenames are None when not configured
    :raises FileNotFoundError: the config file does not exist
    :raises Exception: no ``GFF_FILENAME=`` line was found

    Exits with status -1 on a ``tmpSAMPLE=`` line, on a sample name starting
    with ``tmp_``, or when no sample is listed.
    """
    # accept plain strings as well; .exists() below needs a Path
    filename = Path(filename)

    sample_dirs: Dict[str, Path] = {}
    gff_filename: Optional[Path] = None
    genome_filename: Optional[Path] = None
    junction_filename: Optional[Path] = None

    if not filename.exists():
        raise FileNotFoundError(
            f"The config file {filename} could not be found!")

    with open(filename) as f:
        for line in f:
            if line.startswith("tmpSAMPLE="):
                logger.error(
                    "Please only use SAMPLE=, not tmpSAMPLE= for junction reports!"
                )
                sys.exit(-1)
            elif line.startswith("SAMPLE="):
                name, path = line.strip()[len("SAMPLE="):].split(";")
                if name.startswith("tmp_"):
                    logger.error(
                        f"Sample names are not allowed to start with tmp_! Please change {name} to something else."
                    )
                    sys.exit(-1)
                sample_dirs[name] = Path(path).resolve()
            elif line.startswith("GFF_FILENAME="):
                gff_filename = Path(line.strip()[len("GFF_FILENAME="):])
            elif line.startswith("GENOME_FILENAME="):
                genome_filename = Path(line.strip()[len("GENOME_FILENAME="):])
            elif line.startswith("JUNCTION_FILENAME="):
                junction_filename = Path(
                    line.strip()[len("JUNCTION_FILENAME="):])

    if gff_filename is None:
        raise Exception(
            f"Expected GFF_FILENAME= but not in config file {filename}! Abort."
        )
    if not sample_dirs:
        logger.error("No samples given. Exit.")
        sys.exit(-1)

    return sample_dirs, gff_filename, genome_filename, junction_filename
def main(
    bam_filename: str = typer.Argument(
        ..., help="CCS BAM with cDNA primer removed (post LIMA)"),
    output_prefix: str = typer.Argument(..., help="Output prefix"),
    umi_len: int = typer.Option(..., "-u", "--umi_len", help="Length of UMI"),
    bc_len: int = typer.Option(...,
                               "-b",
                               "--bc_len",
                               help="Length of cell barcode"),
    tso_len: int = typer.Option(0,
                                "-t",
                                "--tso_len",
                                help="Length of TSO (for G5-10X only)"),
    umi_type: umi_types = typer.Option(..., help="Location of the UMI"),
    g5_clip_seq: Optional[str] = typer.Option(
        None, help="Sequence before UMI for G5-clip (for G5-clip only)"),
    bc_rank_file: Optional[str] = typer.Option(
        None, help="(Optional) cell barcode rank file from short read data"),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
):
    # CLI entry point: validate the UMI / barcode lengths, optionally load the
    # short-read barcode ranks, then call clip_out().
    # (Comments rather than a docstring: Typer would surface a docstring as
    # the command's help text.)
    if bc_len < 0:
        logger.error("bc_len can't be a negative number!")
        sys.exit(-1)
    if umi_len < 0:
        logger.error("umi_len can't be a negative number!")
        sys.exit(-1)
    if umi_len + bc_len <= 0:
        logger.error("umi_len + bc_len must be at least 1 bp long!")
        sys.exit(-1)

    # ToDo: figure out later how to do top ranked barcodes for 10X data
    shortread_bc = {}  # dict of cell barcode -> "Y" for top ranked
    if bc_rank_file is not None:
        # close the rank file as soon as it has been read
        with open(bc_rank_file) as rank_handle:
            for r in DictReader(rank_handle, delimiter="\t"):
                shortread_bc[r["cell_barcode"]] = r["top_ranked"]

    clip_out(
        bam_filename,
        umi_len,
        bc_len,
        output_prefix,
        umi_type,
        shortread_bc,
        tso_len,
        g5_clip_seq,
    )
def sanity_check_collapse_input(count_filename: Path, gff_filename: Path,
                                rep_filename: Path,
                                sample_directory: Path) -> None:
    """
    Check that
    1. the count, gff, rep files and the sample directory exist
    2. the number of distinct IDs agrees among the three files

    :raises RuntimeError: one of the four paths does not exist

    Exits with status -1 (after logging the three counts) when the ID counts
    differ. The rep file is parsed as FASTQ if its suffix is ``.fq`` or
    ``.fastq`` and as FASTA otherwise.
    """
    if not rep_filename.exists():
        raise RuntimeError(
            f"Input sequence file {rep_filename.name} not found. Abort!")
    if not count_filename.exists():
        raise RuntimeError(f"File {count_filename.name} not found. Abort!")
    if not gff_filename.exists():
        raise RuntimeError(f"File {gff_filename.name} not found. Abort!")
    if not sample_directory.exists():
        raise RuntimeError(
            f"The directory {sample_directory.name} not found. Abort!")

    rep_type = "fastq" if rep_filename.suffix in (".fq", ".fastq") else "fasta"

    # the handle is only needed while the IDs are collected
    with open(rep_filename) as rep_handle:
        pbids1 = {r.id for r in SeqIO.parse(rep_handle, rep_type)}
    pbids2 = {r.seqid for r in GFF.collapseGFFReader(gff_filename)}
    pbids3 = set(read_count_file(count_filename)[0].keys())

    # all three sizes must be equal
    if not (len(pbids1) == len(pbids2) == len(pbids3)):
        logger.error(
            "The number of PBID records in the files disagree! Sanity check failed."
        )
        logger.error(f"# of PBIDs in {rep_filename}: {len(pbids1)}")
        logger.error(f"# of PBIDs in {gff_filename}: {len(pbids2)}")
        logger.error(f"# of PBIDs in {count_filename}: {len(pbids3)}")
        sys.exit(-1)
def sanity_check(
    sample_dirs: List[Dict[str, Union[str, Path]]],
    gff_filename: Union[str, Path],
    genome_filename: Optional[Union[str, Path]] = None,
    junction_filename: Optional[Union[str, Path]] = None,
) -> None:
    """
    Confirm that every file the junction summary needs is present.

    ``gff_filename`` must exist inside each directory returned by
    ``sample_dirs.values()`` (so ``sample_dirs`` is used as a mapping here).
    The genome and junction files are only checked when given. The first
    missing file is logged and the process exits with status -1.
    """
    for sample_dir in sample_dirs.values():
        expected_gff = Path(sample_dir, gff_filename)
        if expected_gff.exists():
            continue
        logger.error(f"Expected GFF file {expected_gff} does not exist. Abort!")
        sys.exit(-1)

    # optional inputs, checked in this order
    optional_inputs = (
        ("Genome", genome_filename),
        ("Junction", junction_filename),
    )
    for label, optional_path in optional_inputs:
        if optional_path is None or Path(optional_path).exists():
            continue
        logger.error(
            f"{label} file {optional_path} given but does not exist. Abort!")
        sys.exit(-1)
def sanity_check_collapse_input(input_prefix: str) -> Tuple[Path, Path, Path]:
    """
    Check that
    1. the count, gff, rep files exist
    2. the number of distinct IDs agrees among the three

    :param input_prefix: files checked are ``<prefix>.abundance.txt``,
        ``<prefix>.gff`` and ``<prefix>.rep.fq``
    :return: ``(count_filename, gff_filename, rep_filename)`` as ``Path`` objects

    Exits with status -1 when a file is missing or the ID counts differ.
    """
    count_filename = Path(f"{input_prefix}.abundance.txt")
    gff_filename = Path(f"{input_prefix}.gff")
    rep_filename = Path(f"{input_prefix}.rep.fq")

    for required in (count_filename, gff_filename, rep_filename):
        if not required.exists():
            logger.error(f"File {required} does not exist. Abort!")
            sys.exit(-1)

    # Collect the IDs as real sets. Wrapping a list (or a dict view) in `{...}`
    # tries to hash the container itself and raises TypeError.
    with open(rep_filename, "r") as rep_handle:
        pbids1 = {r.id for r in SeqIO.parse(rep_handle, "fastq")}
    pbids2 = {r.seqid for r in GFF.collapseGFFReader(gff_filename)}
    pbids3 = set(read_count_file(count_filename)[0].keys())

    if not (len(pbids1) == len(pbids2) == len(pbids3)):
        logger.error(
            "The number of PBID records in the files disagree! Sanity check failed."
        )
        logger.error(f"# of PBIDs in {rep_filename}: {len(pbids1)}")
        logger.error(f"# of PBIDs in {gff_filename}: {len(pbids2)}")
        logger.error(f"# of PBIDs in {count_filename}: {len(pbids3)}")
        sys.exit(-1)

    return count_filename, gff_filename, rep_filename
def iter_gmap_sam(self, gmap_sam_filename: Union[str, Path],
                  ignored_fout: TextIOWrapper):
    """
    Iterate over a SORTED GMAP SAM file.

    Alignments come from ``self.get_quality_alignments`` and are grouped while
    they stay on the same ``sID`` and start no later than the furthest ``sEnd``
    seen in the group. Each finished group is yielded as a dict with keys
    ``"+"`` and ``"-"``, whose values are lists of record lists produced by a
    per-strand ``ClusterTree`` pass.

    If there are no alignments an error is logged and nothing is yielded. The
    process exits with status -1 if a record starts before the previous one on
    the same ``sID``.
    """

    def sep_by_clustertree(
        records: List[BioReaders.GMAPSAMRecord],
    ) -> List[List[BioReaders.GMAPSAMRecord]]:
        # index the records by position, then read back one list per region
        tree = ClusterTree(0, 0)
        for idx, rec in enumerate(records):
            tree.insert(rec.sStart, rec.sEnd, idx)
        return [[records[i] for i in indices]
                for *_, indices in tree.getregions()]

    def sep_by_strand(
        records: List[BioReaders.GMAPSAMRecord],
    ) -> Dict[str, List[List[BioReaders.GMAPSAMRecord]]]:
        """
        Note! Must further separate again within each strand, because the
        strands were grouped together up to this point.
        """
        by_strand = {"+": [], "-": []}
        for rec in records:
            by_strand[rec.flag.strand].append(rec)
        return {
            strand: sep_by_clustertree(members)
            for strand, members in by_strand.items()
        }

    reader = BioReaders.GMAPSAMReader(
        filename=gmap_sam_filename,
        has_header=True,
        query_len_dict=self.transfrag_len_dict,
    )
    alignments = self.get_quality_alignments(gmap_sam_reader=reader,
                                             ignored_fout=ignored_fout)

    # the first acceptable alignment seeds the first group
    try:
        first = next(alignments)
    except StopIteration:
        logger.error(f"No valid records from {gmap_sam_filename}!")
        return
    records: List[BioReaders.GMAPSAMRecord] = [first]
    max_end = first.sEnd

    # go through the remaining alignments and group by subject ID / overlap
    for r in alignments:
        same_subject = r.sID == records[0].sID
        if same_subject and r.sStart < records[-1].sStart:
            logger.error("SAM file is NOT sorted. ABORT!")
            sys.exit(-1)
        if same_subject and r.sStart <= max_end:
            records.append(r)
            max_end = max(max_end, r.sEnd)
        else:
            yield sep_by_strand(records)
            records = [r]
            max_end = r.sEnd

    yield sep_by_strand(records)
def read(self):
    """
    Read one GMAP alignment from ``self.f`` and return it as a ``gmapRecord``.

    GFF columns:
    (0) seqname
    (1) annotation source
    (2) feature: gene|transcript|CDS|exon|UTR
    (3) 1-based start # MUST CONVERT TO 0-based!!!
    (4) 1-based end
    (5) score (I think it's similarity for GMAP)
    (6) strand: +|-
    (7) phase
    (8) extra stuff (gene ID, transcript ID...)

    For gmap output, a series is delimited by '###' line.

    Expected layout per record: a ``gene`` line, an ``mRNA`` line (whose
    attributes supply ``coverage=``, ``identity=`` and ``Name=``), then
    ``exon`` / ``CDS`` lines up to the next line starting with ``##``.

    Raises StopIteration when nothing more can be read, and Exception on a
    feature type other than ``exon`` or ``CDS`` inside a record.
    """
    cur = self.f.tell()
    line = self.f.readline().strip()
    # position did not advance -> readline() consumed nothing
    if self.f.tell() == cur:
        raise StopIteration("EOF reached!!")
    raw = line.strip().split("\t")
    # skip lines starting with '#'
    while raw[0].startswith("#"):
        line = self.f.readline().strip()
        raw = line.strip().split("\t")
    # an empty line at this point is treated as end of input
    if len(raw) == 0 or raw[0] == "":
        raise StopIteration("EOF reached!!")
    assert raw[2] == "gene"
    raw = self.f.readline().strip().split("\t")
    assert raw[2] == "mRNA"
    seqname = raw[0]
    strand = raw[6]
    # NOTE(review): coverage / identity / seqid stay unbound if the mRNA line
    # lacks the corresponding attribute -- confirm GMAP always emits all three.
    for blob in raw[8].split(";"):
        if blob.startswith("coverage="):
            coverage = float(blob[9:])
        elif blob.startswith("identity="):
            identity = float(blob[9:])
        elif blob.startswith("Name="):
            seqid = blob[5:]
    rec = gmapRecord(seqname, coverage, identity, strand, seqid)
    cds_exons = []
    cds_seq_start = None
    cds_seq_end = None
    while True:
        line = self.f.readline().strip()
        # '##' closes the record: attach the collected CDS data and return
        if line.startswith("##"):
            rec.cds_exons = cds_exons
            rec.cds_seq_start = cds_seq_start
            rec.cds_seq_end = cds_seq_end
            return rec
        raw = line.split("\t")
        feature = raw[2]
        if feature == "exon":
            rstart1, rend1 = int(raw[3]), int(raw[4])
            score = float(raw[5])
            rstrand = raw[6]  # strand on the reference genome
            for blob in raw[8].split(";"):
                if blob.startswith("Target="):
                    # sstrand is the strand on the query sequence
                    _, sstart1, send1, sstrand = blob.split()
                    sstart1 = int(sstart1)
                    send1 = int(send1)
                    rec.sstrand = sstrand
            try:
                # starts are converted from 1-based to 0-based
                rec.add_exon(rstart1 - 1, rend1, sstart1 - 1, send1,
                             rstrand, score)
            except AssertionError:
                logger.error(f"{rec.seqid} has non-colinear exons!")
                # Discard the rest of this record and return it as-is
                # (the cds_* values collected so far are not attached).
                # NOTE(review): if the input ends before another '##' line,
                # this loop does not appear to terminate -- confirm.
                while True:
                    line = self.f.readline().strip()
                    if line.startswith("##"):
                        return rec
            rec.strand = rstrand
        elif feature == "CDS":
            rstart1, rend1 = int(raw[3]), int(raw[4])
            cds_exons.append(Interval(rstart1 - 1, rend1))
            for blob in raw[8].split(";"):
                if blob.startswith("Target="):
                    junk, sstart1, send1, sstrand = blob.split()
                    sstart1 = int(sstart1)
                    send1 = int(send1)
                    # keep the first CDS start seen; the end follows the latest CDS line
                    cds_seq_start = (sstart1 - 1 if cds_seq_start is None
                                     else cds_seq_start)
                    cds_seq_end = send1
        else:
            raise Exception(f"Not supposed to see type {feature} here!!")
def read_config(
    filename: Union[str, Path]
) -> Tuple[Dict[str, Path], List[str], str, str, str, str]:
    """
    Parse a chaining config file.

    Recognised lines:
        tmpSAMPLE=<name>;<path>
        SAMPLE=<name>;<path>
    must also have
        GROUP_FILENAME=
        GFF_FILENAME=
        COUNT_FILENAME=
    optional:
        FASTQ_FILENAME=

    Returns, in order:
        sample_dirs     Dict[sample_name, Path(sample_path)] (paths resolved)
        sample_names    List[sample_name]; tmpSAMPLE entries are listed as "tmp_<name>"
        group_filename  str
        gff_filename    str
        count_filename  str
        fastq_filename  str, or None when not configured

    For example, with samples A and B the result could look like:
        sample_dirs = {'A': Path('tests/test_data/chaining/A'),
                       'B': Path('tests/test_data/chaining/B')}
        sample_names = ["A", "B"]
        group_filename = touse.group.txt
        gff_filename = touse.gff
        count_filename = touse.count.txt
        fastq_filename = touse.rep.fq

    Raises FileNotFoundError when a mandatory *_FILENAME= line is missing.
    Exits with status -1 when a tmpSAMPLE follows a SAMPLE, when a sample name
    starts with "tmp_", or when no sample is listed.
    """
    sample_dirs = {}
    sample_names = []
    settings = {
        "GROUP_FILENAME=": None,
        "GFF_FILENAME=": None,
        "COUNT_FILENAME=": None,
        "FASTQ_FILENAME=": None,
    }
    seen_regular_sample = False

    def _split_sample(text, prefix):
        # "<prefix><name>;<path>" -> (name, resolved path)
        name, path = text[len(prefix):].split(";")
        if name.startswith("tmp_"):
            logger.error(
                f"Sample names are not allowed to start with tmp_! "
                f"Please change {name} to something else.")
            sys.exit(-1)
        return name, Path(path).resolve()

    with open(filename) as f:
        for line in f:
            stripped = line.strip()
            if line.startswith("tmpSAMPLE="):
                if seen_regular_sample:
                    logger.error(
                        "Cannot have tmp_ samples after non-tmp_ samples! Abort!"
                    )
                    sys.exit(-1)
                name, sample_path = _split_sample(stripped, "tmpSAMPLE=")
                sample_dirs[name] = sample_path
                sample_names.append(f"tmp_{name}")
            elif line.startswith("SAMPLE="):
                seen_regular_sample = True
                name, sample_path = _split_sample(stripped, "SAMPLE=")
                sample_dirs[name] = sample_path
                sample_names.append(name)
            else:
                for key in settings:
                    if line.startswith(key):
                        settings[key] = stripped[len(key):]
                        break

    # mandatory settings, reported in this order
    for key in ("GROUP_FILENAME=", "COUNT_FILENAME=", "GFF_FILENAME="):
        if settings[key] is None:
            raise FileNotFoundError(
                f"Expected {key} but not in config file {filename}! Abort."
            )

    if not sample_names:
        logger.error("No samples given. Exit.")
        sys.exit(-1)

    return (
        sample_dirs,
        sample_names,
        settings["GROUP_FILENAME="],
        settings["GFF_FILENAME="],
        settings["COUNT_FILENAME="],
        settings["FASTQ_FILENAME="],
    )
def collate_info(
    fusion_prefix: str,
    class_filename: str,
    genepred_filename: str,
    total_fl_count: Optional[int] = None,
    config_filename: Optional[str] = None,
    genome_dict: Optional[dict] = None,
    cds_gff_filename: Optional[str] = None,
    min_fl_count: int = 2,
    min_breakpoint_dist_kb: int = 10,
    include_Mt_genes: bool = False,
) -> None:
    """
    Collate per-fusion information into annotated report files.

    :param fusion_prefix: prefix of the fusion outputs; reads
        ``<prefix>.rep.fa``, ``<prefix>.abundance.txt`` (if present) and
        ``<prefix>.gff`` (unless ``cds_gff_filename`` is given); writes
        ``<prefix>.annotated.txt`` and ``<prefix>.annotated_ignored.txt``
    :param class_filename: tab-delimited classification file with at least
        ``isoform``, ``ORF_seq`` and ``associated_gene`` columns
    :param genepred_filename: whitespace-delimited file; column 12 is used as
        the gene name and column 1 as its ID
    :param total_fl_count: denominator for ``ReadCountScore``; when None the
        sum of the counts in the abundance file is used
    :param config_filename: optional ``key=value`` file whose entries are added
        to every output row as extra leading columns
    :param genome_dict: passed to ``get_breakpoint_n_seq``
    :param cds_gff_filename: GFF to use instead of ``<prefix>.gff``; when given,
        CDS exon counts are reported as well
    :param min_fl_count: rows with fewer spanning reads fail ``TooFewFLReads``
    :param min_breakpoint_dist_kb: same-chromosome breakpoints at or below this
        distance (kb) fail ``BreakpointTooClose``
    :param include_Mt_genes: when False, genes starting with ``MT-`` fail ``MtGenes``

    Rows whose ``Comments`` start with ``FAIL:`` go to the ignored file, all
    others to the annotated file. Exits with status -1 on IDs that do not
    match ``fusion_pbid`` or on GFF records without a ``+``/``-`` strand.
    """
    global_info = {}  # extra key/value columns shared by every row
    if config_filename is not None:
        logger.info(f"Reading config file {config_filename}...")
        with open(config_filename) as config_f:
            for line in config_f:
                line = line.strip()
                if not line:
                    continue  # tolerate blank lines
                # split on the first '=' only so values may contain '='
                k, v = line.split("=", 1)
                global_info[k] = v

    gene_to_id = {}  # gene name --> ensembl ID
    with open(genepred_filename) as genepred_f:
        for line in genepred_f:
            raw = line.strip().split()
            gene_to_id[raw[11]] = raw[0]

    d = defaultdict(dict)  # PBfusion.X --> isoform index -> sqanti3 record
    orf_dict = {}
    # read SQANTI3 classification file
    with open(class_filename) as class_f:
        for r in DictReader(class_f, delimiter="\t"):
            m = fusion_pbid.match(r["isoform"])
            if m is None:
                logger.error(
                    "ERROR: fusion pbid must follow format `PBfusion.X.Y`. Abort!")
                sys.exit(-1)
            # Integer isoform index, as for the GFF records below: string keys
            # would sort "10" before "2" and put the two orderings out of step.
            gene_index, isoform_index = m.group(1), int(m.group(2))
            d[gene_index][isoform_index] = r
            orf_dict[r["isoform"]] = r["ORF_seq"]

    # get sequences
    with open(f"{fusion_prefix}.rep.fa") as rep_f:
        seq_dict = {
            r.id.split("|")[0]: r.seq
            for r in SeqIO.parse(rep_f, "fasta")
        }

    # get count information
    count_d = defaultdict(lambda: "NA")
    count_filename = f"{fusion_prefix}.abundance.txt"
    if Path(count_filename).exists():
        with open(count_filename) as count_f:
            for r in DictReader(count_f, delimiter="\t"):
                count_d[r["pbid"]] = int(r["count_fl"])

    if total_fl_count is None:
        logger.info(
            "Total FL count not given --- using the sum FL count from fusions only instead."
        )
        total_fl_count = sum(count_d.values())

    # get breakpoint information
    gff_d = defaultdict(dict)  # PBfusion.X --> isoform index -> GFF record
    gff_filename = (f"{fusion_prefix}.gff"
                    if cds_gff_filename is None else cds_gff_filename)
    for r in collapseGFFReader(gff_filename):
        m = fusion_pbid.match(r.seqid)
        if m is None:
            logger.error(
                f"ERROR: fusion pbid in {gff_filename} must follow format `PBfusion.X.Y`. Abort!"
            )
            sys.exit(-1)
        gene_index, isoform_index = m.group(1), int(m.group(2))
        gff_d[gene_index][isoform_index] = r
        if r.strand not in ("+", "-"):
            logger.error(
                f"ERROR: fusion {r.seqid} did not specify strand in {gff_filename}! Abort!"
            )
            sys.exit(-1)

    fields2 = list(global_info.keys()) + FIELDS
    with open(f"{fusion_prefix}.annotated.txt", "w") as f, open(
            f"{fusion_prefix}.annotated_ignored.txt", "w") as f_bad:
        writer = DictWriter(f, fields2, delimiter=",")
        writer.writeheader()
        writer_bad = DictWriter(f_bad, fields2, delimiter=",")
        writer_bad.writeheader()

        for gene_index, iso_dict in d.items():
            # (isoform index, classification record), ordered by index
            iso_list = sorted(iso_dict.items(), key=lambda x: x[0])
            has_novel = any(
                r["associated_gene"].startswith("novelGene")
                or r["associated_gene"] == "" for _index, r in iso_list)
            pbid = f"PBfusion.{str(gene_index)}"

            # first and last GFF records give the left / right side
            gff_info = sorted(gff_d[gene_index].items(), key=lambda x: x[0])
            rec1 = gff_info[0][1]
            rec2 = gff_info[-1][1]
            (
                left_breakpoint,
                left_seq,
                right_breakpoint,
                right_seq,
            ) = get_breakpoint_n_seq(rec1, rec2, genome_dict)
            left_exon_count = len(rec1.ref_exons)
            right_exon_count = len(rec2.ref_exons)
            gene1 = iso_list[0][1]["associated_gene"]
            gene2 = iso_list[-1][1]["associated_gene"]

            if cds_gff_filename is not None:
                left_cds_exon_count = len(rec1.cds_exons)
                right_cds_exon_count = len(rec2.cds_exons)
            else:
                left_cds_exon_count = "NA"
                right_cds_exon_count = "NA"

            left_orf = orf_dict[rec1.seqid]
            right_orf = orf_dict[rec2.seqid]

            spanning_reads = count_d[pbid]
            info = {
                "UniqueID": pbid,
                "FusionName": "--".join(
                    [_r["associated_gene"] for (_index, _r) in iso_list]),
                "LeftGeneName": gene1,
                "LeftGeneID": gene_to_id.get(gene1, "NA"),
                "LeftBreakpoint": left_breakpoint,
                "LeftFlankingSequence": left_seq,
                "RightGeneName": gene2,
                "RightGeneID": gene_to_id.get(gene2, "NA"),
                "RightBreakpoint": right_breakpoint,
                "RightFlankingSequence": right_seq,
                "JunctionSupport": "NA",
                "SpanningReads": spanning_reads,
                "ReadCountScore": (spanning_reads * (10**6) / total_fl_count)
                if spanning_reads != "NA" else "NA",
                "Sequence": seq_dict[pbid],
                "LeftORF": left_orf,
                "RightORF": right_orf,
                "LeftExonCount": left_exon_count,
                "RightExonCount": right_exon_count,
                "LeftCDSExonCount": left_cds_exon_count,
                "RightCDSExonCount": right_cds_exon_count,
                "Comments": "PASS",
            }
            info.update(global_info)

            left_chr, left_break, left_strand = left_breakpoint.split(":")
            right_chr, right_break, right_strand = right_breakpoint.split(":")

            # first failing rule wins
            if has_novel:
                info["Comments"] = "FAIL:NovelGene"
            elif gene1 == gene2:
                info["Comments"] = "FAIL:SameGene"
            elif (info["SpanningReads"] != "NA"
                  and info["SpanningReads"] < min_fl_count):
                info["Comments"] = "FAIL:TooFewFLReads"
            elif not include_Mt_genes and (gene1.startswith("MT-")
                                           or gene2.startswith("MT-")):
                info["Comments"] = "FAIL:MtGenes"
            elif (left_chr == right_chr
                  and abs(int(left_break) - int(right_break)) / 1000 <=
                  min_breakpoint_dist_kb):
                info["Comments"] = "FAIL:BreakpointTooClose"

            if info["Comments"].startswith("FAIL:"):
                writer_bad.writerow(info)
            else:
                writer.writerow(info)
def _clip_out_trim_tags(tags, trim_per_base):
    """Return *tags* without the defunct ``zs:B`` tag, trimming per-base tags.

    :param tags: list of tag strings taken from the ``"tags"`` entry of a read dict
    :param trim_per_base: callable applied to every tag that starts with
        ``dq:i:``, ``iq:i:`` or ``sq:i:``; every other tag is kept unchanged
    :return: new list of tag strings
    """
    kept = []
    for tag in tags:
        if tag.startswith("zs:B"):  # defunct CCS tag, don't use
            continue
        if tag.startswith(("dq:i:", "iq:i:", "sq:i:")):
            tag = trim_per_base(tag)
        kept.append(tag)
    return kept


def clip_out(
    bam_filename: str,
    umi_len: int,
    bc_len: int,
    output_prefix: str,
    UMI_type: umi_types,
    shortread_bc: Optional[Dict[str, str]] = None,
    tso_len: int = 0,
    g5_clip_seq: Optional[str] = None,
) -> None:
    """
    Clip the UMI (and cell barcode, where the design has one) off each read.

    Two files are written: ``<output_prefix>.trimmed.csv`` (tab-delimited, one
    row per clipped read describing what was clipped) and
    ``<output_prefix>.trimmed.bam`` (the clipped reads). A read for which the
    mode-specific search does not succeed is written to neither file.

    :param bam_filename: BAM of post-LIMA (primer-trimmed) CCS sequences
    :param umi_len: expected UMI length
    :param bc_len: expected cell barcode length
    :param output_prefix: prefix of the two output files
    :param UMI_type: one of 'A3', 'G5', 'G5-10X', 'G5-clip'
    :param shortread_bc: a dict of barcode -> "Y|N" for top-ranked. If given, came from short read data.
    :param tso_len: expected TSO length; when > 0 a ``TSO`` column is added to the CSV
    :param g5_clip_seq: sequence aligned against the start of each read in
        'G5-clip' mode; required for that mode
    :raises ValueError: if ``UMI_type`` is not one of the supported values, or
        if 'G5-clip' is requested without ``g5_clip_seq``

    Exits the process with status -1 if 'G5-clip' is requested and the
    ``parasail`` library cannot be imported.

    --------
    G5-10X
    --------
    5' primer -- BC --- UMI -- TSO --- GGG --- transcript --- polyA

    --------
    G5-clip
    assumes input is like below, where the 5'/3' primer already removed by lima
    Here, we will only clip out the UMI, and write out the rest of the sequence, keeping the RT + transcript
    There is no assumption about the polyA tail existing or not
    --------
    5' primer -- UMI -- [RT primer] --- transcript --- 3' primer
    """
    if shortread_bc is None:
        shortread_bc = dict()

    if UMI_type not in ("A3", "G5", "G5-10X", "G5-clip"):
        raise ValueError(
            f"UMI is of the wrong type. Got {UMI_type} Must be one of 'A3', 'G5', 'G5-10X', 'G5-clip'"
        )

    umi_bc_len = umi_len + bc_len

    if UMI_type == "G5-clip":
        if g5_clip_seq is None:
            # previously this surfaced as an opaque TypeError from len(None)
            raise ValueError("g5_clip_seq must be given when UMI_type is 'G5-clip'")
        try:
            import parasail
        except ImportError:
            logger.error("need parasail library for G5-clip mode! Abort!")
            sys.exit(-1)
        para_mat = parasail.matrix_create("ACGT", 2, -5)
        # only the start of each read is aligned against the clip sequence
        para_search_len = umi_len + len(g5_clip_seq) + 10

    FIELDS = [
        "id",
        "clip_len",
        "extra",
        "UMI",
        "BC",
        "BC_rev",
        "BC_match",
        "BC_top_rank",
    ]
    if tso_len > 0:
        FIELDS += ["TSO"]

    with pysam.AlignmentFile(bam_filename, "rb", check_sq=False) as reader:
        with open(f"{output_prefix}.trimmed.csv", "w") as f1, pysam.AlignmentFile(
            f"{output_prefix}.trimmed.bam", "wb", header=reader.header
        ) as f2:
            writer1 = DictWriter(
                f1,
                FIELDS,
                delimiter="\t",
                dialect="unix",
            )
            writer1.writeheader()
            for r in reader:
                d = r.to_dict()

                # is_rev_strand = r.flag >> 4 & 1
                if r.flag >> 4 & 1:
                    d["seq"] = str(Seq(r.seq).reverse_complement())
                    d["qual"] = r.qual[::-1]
                    new_tags = []
                    for tag in d["tags"]:
                        if tag.startswith(("dq:i:", "iq:i:", "sq:i:")):
                            tag = tag[:5] + tag[::-1][:-5]
                        new_tags.append(tag)
                    d["tags"] = new_tags
                    d["flag"] = "4"  # convert it back to not being rev complemented

                if UMI_type == "A3":
                    A_start, A_end = find_Aend(d["seq"])
                    if A_end > 0:
                        # should be just UMI + BC, unless UMI started with 'A's
                        seq2 = d["seq"][A_end:]

                        diff = len(seq2) - umi_bc_len
                        if diff < 0:  # UMI may have started with 'A's
                            seq2 = d["seq"][A_end + diff:]

                        seq_extra = "NA"
                        if diff > 0:
                            seq_extra = seq2[:diff]

                        if bc_len == 0:
                            seq_bc = ""
                        else:
                            seq_bc = seq2[-bc_len:]

                        if umi_len == 0:
                            seq_umi = ""
                        elif bc_len == 0:
                            seq_umi = seq2[-umi_len:]
                        else:
                            seq_umi = seq2[-(bc_len + umi_len):-bc_len]

                        # reverse complement BC because it's always listed in rev comp in short read data
                        seq_bc_rev = str(Seq(seq_bc).reverse_complement())
                        match = "Y" if seq_bc_rev in shortread_bc else "N"
                        match_top = (
                            "Y"
                            if (match == "Y" and shortread_bc[seq_bc_rev] == "Y")
                            else "N"
                        )

                        rec = {
                            "id": r.qname,
                            "clip_len": len(seq2),
                            "extra": seq_extra,
                            "UMI": seq_umi,
                            "BC": seq_bc,
                            "BC_rev": seq_bc_rev,
                            "BC_match": match,
                            "BC_top_rank": match_top,
                        }
                        writer1.writerow(rec)

                        # subset the sequence to include only the polyAs
                        d["seq"] = d["seq"][:A_end]
                        d["qual"] = d["qual"][:A_end]
                        assert len(d["seq"]) == len(d["qual"])
                        d["tags"] = _clip_out_trim_tags(
                            d["tags"], lambda t: t[:A_end + 5]
                        )
                        x = pysam.AlignedSegment.from_dict(d, r.header)
                        f2.write(x)

                elif UMI_type == "G5":
                    G_start, G_end = find_Gstart(d["seq"])
                    if G_start > 0:
                        seq2 = d["seq"][:G_start]  # should be just UMI

                        diff = len(seq2) - umi_len
                        if diff < 0:  # UMI may have ended with Gs
                            seq2 = d["seq"][:G_start - diff]

                        seq_extra = "NA"
                        if diff > 0:
                            seq_extra = seq2[:diff]
                            seq2 = seq2[diff:]

                        rec = {
                            "id": r.qname,
                            "clip_len": len(seq2),
                            "extra": seq_extra,
                            "UMI": seq2,
                            "BC": "NA",  # Brendan's current design has only UMI, no BC
                            "BC_rev": "NA",
                            "BC_match": "NA",
                            "BC_top_rank": "NA",
                        }
                        writer1.writerow(rec)

                        # subset the sequence to remove the UMIs and "G"s
                        d["seq"] = d["seq"][G_end:]
                        d["qual"] = d["qual"][G_end:]
                        assert len(d["seq"]) == len(d["qual"])
                        d["tags"] = _clip_out_trim_tags(
                            d["tags"], lambda t: t[:5] + t[5 + G_end:]
                        )
                        x = pysam.AlignedSegment.from_dict(d, r.header)
                        f2.write(x)

                elif UMI_type == "G5-clip":
                    o1 = parasail.sg_qx_trace(
                        d["seq"][:para_search_len], g5_clip_seq, 10, 3, para_mat
                    )
                    #  'tags': ['bx:B:i,22,20',
                    #   ...
                    #   'qe:i:2835',
                    #   'bc:B:S,0,1',
                    #   'bl:Z:CCCGCGTGGCCTCCTGAATTAT',
                    #   'bt:Z:CATTGCCACTGTCTTCTGCT',
                    #   'RG:Z:70de1488/0--1']}
                    c_num, c_type = next(
                        iter_cigar_string(str(o1.cigar.decode, "utf-8"))
                    )
                    if c_type == "I":
                        # this is the (extra) + UMI
                        seq2 = d["seq"][:c_num]
                        seq_extra = "NA"

                        diff = len(seq2) - umi_len
                        if diff < 0:
                            # we need to get a few more bases from the primers
                            tag_dict = dict(t.split(":", 1) for t in d["tags"])
                            # Reset per read so an unrecognised 'bc' value can neither
                            # hit an unbound name nor reuse the previous read's primer.
                            Fseq = None
                            try:
                                if tag_dict["bc"] == "B:S,0,1":  # + strand
                                    assert tag_dict["bl"].startswith("Z:")
                                    Fseq = tag_dict["bl"][2:]  # trimming away the Z:
                                elif tag_dict["bc"] == "B:S,1,0":  # - strand
                                    assert tag_dict["bt"].startswith("Z:")
                                    Fseq = str(
                                        Seq(tag_dict["bt"][2:]).reverse_complement()
                                    )
                            except KeyError:
                                # tags missing (e.g. older lima output):
                                # deliberately output the shorter UMI
                                pass
                            if Fseq is not None:
                                # rescue bases from the trimmed F primer
                                seq2 = Fseq[diff:] + seq2
                        elif diff > 0:
                            # there's extras
                            seq_extra = seq2[:diff]
                            seq2 = seq2[diff:]

                        rec = {
                            "id": r.qname,
                            "clip_len": len(seq2),
                            "extra": seq_extra,
                            "UMI": seq2,
                            "BC": "NA",  # Brendan's current design has only UMI, no BC
                            "BC_rev": "NA",
                            "BC_match": "NA",
                            "BC_top_rank": "NA",
                        }
                        writer1.writerow(rec)

                        # subset the sequence to remove the UMI (but keep the G5 clip seq)
                        d["seq"] = d["seq"][c_num:]
                        d["qual"] = d["qual"][c_num:]
                        assert len(d["seq"]) == len(d["qual"])
                        d["tags"] = _clip_out_trim_tags(
                            d["tags"], lambda t: t[:5] + t[5 + c_num:]
                        )
                        x = pysam.AlignedSegment.from_dict(d, r.header)
                        f2.write(x)

                elif UMI_type == "G5-10X":
                    # need to first invert the sequence so polyA is at the end
                    d["seq"] = str(Seq(d["seq"]).reverse_complement())
                    d["qual"] = d["qual"][::-1]
                    # now it is BC -- UMI -- TSO -- GGG -- transcript -- polyA
                    umi_bc_tso_len = bc_len + umi_len + tso_len
                    G_start, G_end = find_Gstart(
                        d["seq"][umi_bc_tso_len:umi_bc_tso_len + 10]
                    )
                    if G_start >= 0:
                        # positions were relative to the 10-base search window
                        G_start += umi_bc_tso_len
                        G_end += umi_bc_tso_len

                        seq2 = d["seq"][:G_start]  # this is BC - UMI - TSO
                        # The TSO column only exists when tso_len > 0; emitting the key
                        # otherwise makes DictWriter reject the row.
                        seq_tso = None
                        if tso_len > 0:
                            seq_tso = seq2[-tso_len:] + d["seq"][G_start:G_end]

                        diff = len(seq2) - umi_bc_tso_len
                        if diff > 0:
                            # beginning may have included untrimmed primers
                            seq_extra = seq2[:diff]
                            seq2 = seq2[diff:]
                            seq_bc = seq2[:bc_len]
                            seq_umi = seq2[bc_len:umi_bc_len]
                        elif diff == 0:
                            seq_extra = "NA"
                            seq_bc = seq2[:bc_len]
                            seq_umi = seq2[bc_len:umi_bc_len]
                        else:
                            # we may have accidentally trimmed away some bases for BC, can't do anything
                            seq_extra = "NA"
                            seq_bc = seq2[:bc_len + diff]
                            seq_umi = seq2[bc_len + diff:umi_bc_len + diff]

                        # reverse complement BC because it's always listed in rev comp in short read data
                        seq_bc_rev = str(Seq(seq_bc).reverse_complement())
                        match = "Y" if seq_bc_rev in shortread_bc else "N"
                        match_top = (
                            "Y"
                            if (match == "Y" and shortread_bc[seq_bc_rev] == "Y")
                            else "N"
                        )

                        rec = {
                            "id": r.qname,
                            "clip_len": len(seq2) + (G_end - G_start),
                            "extra": seq_extra,
                            "UMI": seq_umi,
                            "BC": seq_bc,
                            "BC_rev": seq_bc_rev,
                            "BC_match": match,
                            "BC_top_rank": match_top,
                        }
                        if seq_tso is not None:
                            rec["TSO"] = seq_tso
                        writer1.writerow(rec)

                        # subset the sequence to remove the UMIs and "G"s
                        d["seq"] = d["seq"][G_end:]
                        d["qual"] = d["qual"][G_end:]
                        assert len(d["seq"]) == len(d["qual"])
                        d["tags"] = _clip_out_trim_tags(
                            d["tags"], lambda t: t[:5] + t[5 + G_end:]
                        )
                        x = pysam.AlignedSegment.from_dict(d, r.header)
                        f2.write(x)
def filter_by_count(
    input_prefix: str,
    output_prefix: str,
    min_count: int,
    dun_use_group_count: bool = False,
) -> None:
    """
    Keep only the records whose full-length count reaches ``min_count``.

    Reads ``<input_prefix>.abundance.txt``, ``<input_prefix>.gff``, one of
    ``<input_prefix>.rep.{fq,fastq,fa,fasta}`` and (unless
    ``dun_use_group_count``) ``<input_prefix>.group.txt``; writes
    ``<output_prefix>.gff``, ``<output_prefix>.rep.fq`` or ``.rep.fa``, and
    ``<output_prefix>.abundance.txt``.

    :param input_prefix: prefix shared by the input files
    :param output_prefix: prefix of the output files
    :param min_count: minimum ``count_fl`` a record needs to be kept
    :param dun_use_group_count: if False, a record must additionally have at
        least one group member whose ``f<N>`` count is >= ``min_count``

    Exits the process with status -1 if no rep fasta/fastq file is found.
    """
    group_filename = f"{input_prefix}.group.txt"
    count_filename = f"{input_prefix}.abundance.txt"
    gff_filename = f"{input_prefix}.gff"
    rep_filenames = [
        (f"{input_prefix}.rep.fq", "fastq"),
        (f"{input_prefix}.rep.fastq", "fastq"),
        (f"{input_prefix}.rep.fa", "fasta"),
        (f"{input_prefix}.rep.fasta", "fasta"),
    ]

    rep_filename = None
    rep_type = None
    # no break on purpose: when several candidates exist the last one wins
    for x, feature in rep_filenames:
        if os.path.exists(x):
            rep_filename = x
            rep_type = feature

    if rep_filename is None:
        logger.error(
            f"Expected to find input fasta or fastq files {input_prefix}.rep.fa or {input_prefix}.rep.fq. Not found. Abort!"
        )
        sys.exit(-1)

    if not dun_use_group_count:
        # read group: highest per-member FL count seen for each pbid
        group_max_count_fl = {}
        with open(group_filename) as group_handle:
            for line in group_handle:
                # ex: PB.1.1  i0HQ_54b0ca|c58773/f30p16/700
                pbid, members = line.strip().split("\t")
                group_max_count_fl[pbid] = 0
                for m in members.split(","):
                    if m.find("|") > 0:
                        tmp = m.split("|")[1].split("/")[1]  # ex: tmp = f30p16
                    else:
                        tmp = m.split("/")[1]
                    # the p-count half is not used for filtering
                    fl_count, _p_count = tmp.split("p")
                    fl_count = int(fl_count[1:])  # drop the leading 'f'
                    group_max_count_fl[pbid] = max(group_max_count_fl[pbid], fl_count)

    # read abundance first
    with open(count_filename) as f:
        # collect the leading '#' comment lines so they can be copied to the output
        count_header = ""
        while True:
            cur_pos = f.tell()
            line = f.readline()
            if not line.startswith("#"):
                f.seek(cur_pos)  # rewind so DictReader sees the column header
                break
            count_header += line
        d = {r["pbid"]: r for r in DictReader(f, delimiter="\t")}

    # `good` keeps the abundance-file order for writing; `good_set` gives O(1) lookup
    good = [
        x
        for x in d
        if int(d[x]["count_fl"]) >= min_count
        and (dun_use_group_count or group_max_count_fl[x] >= min_count)
    ]
    good_set = set(good)

    # write output GFF
    with open(f"{output_prefix}.gff", "w") as f:
        for r in GFF.collapseGFFReader(gff_filename):
            if r.seqid in good_set:
                GFF.write_collapseGFF_format(f, r)

    # write output rep.fq
    out_rep_filename = f"{output_prefix}.rep.{('fq' if rep_type == 'fastq' else 'fa')}"
    with open(out_rep_filename, "w") as f, open(rep_filename) as rep_handle:
        for r in SeqIO.parse(rep_handle, rep_type):
            if r.name.split("|")[0] in good_set:
                SeqIO.write(r, f, rep_type)

    # write output to .abundance.txt
    with open(f"{output_prefix}.abundance.txt", "w") as f:
        f.write(count_header)
        writer = DictWriter(
            f,
            fieldnames=[
                "pbid",
                "count_fl",
                "count_nfl",
                "count_nfl_amb",
                "norm_fl",
                "norm_nfl",
                "norm_nfl_amb",
            ],
            delimiter="\t",
            lineterminator="\n",
        )
        writer.writeheader()
        for k in good:
            writer.writerow(d[k])

    logger.info(
        f"Output written to: {output_prefix}.gff\n"
        f"Output written to: {out_rep_filename}\n"
        f"Output written to: {output_prefix}.abundance.txt"
    )
def demux_for_subsamping(
    class_filename,
    fasta_filename,
    demux_count_file,
    output_prefix,
    out_group_dict,
    ignore_novel,
):
    """
    Write one "for_subsampling" table per output group from demuxed counts.

    :param class_filename: tab-delimited classification file with ``isoform``,
        ``associated_gene`` and ``associated_transcript`` columns
    :param fasta_filename: fasta file; sequence lengths are looked up by record id
    :param demux_count_file: comma-delimited count file with an ``id`` column
        plus one column per key of ``out_group_dict``
    :param output_prefix: prefix of the output files, named
        ``<output_prefix>_<group>_only.<ignore_novel|use_novel>.for_subsampling.txt``
    :param out_group_dict: maps a count-file column name to the output group
        its counts are summed into
    :param ignore_novel: if true, gene/transcript names starting with "novel"
        are written as "NA"; otherwise a novel transcript name gets the record
        id appended so it is unique

    Exits the process with status -1 if an id does not match ``pbid_rex``.
    """
    # read SQANTI classification to get known gene/transcript name
    with open(class_filename) as class_handle:
        d = {r["isoform"]: r for r in DictReader(class_handle, delimiter="\t")}  # pbid --> record

    # get read lengths
    with open(fasta_filename) as fasta_handle:
        lens = {r.id: len(r.seq) for r in SeqIO.parse(fasta_handle, "fasta")}  # pbid -> length

    writers = {}
    handles = {}
    out_groups = set(out_group_dict.values())
    try:
        for g in out_groups:
            handles[g] = open(
                f"{output_prefix}_{g}_only.{'ignore_novel' if ignore_novel else 'use_novel'}.for_subsampling.txt",
                "w",
            )
            writers[g] = DictWriter(handles[g], FIELDNAMES, delimiter="\t")
            writers[g].writeheader()

        with open(demux_count_file) as demux_handle:
            for r in DictReader(demux_handle, delimiter=","):
                if r["id"] not in d:
                    logger.info(
                        f"WARNING: skipping {r['id']} because not in {class_filename}",
                    )
                    continue

                m = pbid_rex.match(r["id"])
                if m is None:
                    logger.error(
                        f"ERROR: unable to parse ID {r['id']}. Expected format PB.X.Y!",
                    )
                    sys.exit(-1)

                newrec = {
                    "pbid": r["id"],
                    "pbgene": m.group(1),
                    "length": lens[r["id"]],
                }

                gene = d[r["id"]]["associated_gene"]
                trans = d[r["id"]]["associated_transcript"]
                if gene.startswith("novel") and ignore_novel:
                    gene = "NA"
                if trans.startswith("novel"):
                    if ignore_novel:
                        trans = "NA"
                    else:
                        # add a unique identifier to make this "novel" refisoform unique
                        trans += r["id"]
                newrec["refgene"] = gene
                newrec["refisoform"] = trans

                # sum the per-column counts into their output groups
                group_counts = Counter()
                for b, g in out_group_dict.items():
                    group_counts[g] += int(r[b])

                # same record goes to every group's file; only fl_count differs
                for g in out_groups:
                    newrec["fl_count"] = group_counts[g]
                    writers[g].writerow(newrec)
    finally:
        # close (and flush) the outputs even when aborting part-way
        for h in handles.values():
            h.close()
def make_file_for_subsample(
    input_prefix: str,
    output_prefix: str,
    demux_file=None,
    matchAnnot_parsed=None,
    sqanti_class=None,
    include_single_exons=False,
) -> None:
    """
    Write one tab-delimited table per sample, for use in subsampling.

    Two files must exist: .abundance.txt and .rep.fq so we can make the length
    (the abundance file is only required when ``demux_file`` is not given).

    :param input_prefix: prefix of the ``.abundance.txt``, ``.rep.*`` and ``.gff`` inputs
    :param output_prefix: outputs are written to ``<output_prefix>.<sample>.txt``;
        the sample is "all" when ``demux_file`` is not given
    :param demux_file: optional demux count file, read with ``read_demux_fl_count_file``
    :param matchAnnot_parsed: optional tab-delimited file with ``pbid``,
        ``refisoform``, ``refgene`` and ``score`` columns; takes precedence
        over ``sqanti_class``. ``score`` is reported as the category.
    :param sqanti_class: optional tab-delimited classification file with
        ``isoform``, ``associated_gene``, ``associated_transcript`` and
        ``structural_category`` columns
    :param include_single_exons: if False, records with fewer than two exons
        in ``<input_prefix>.gff`` are left out

    Exits the process with status -1 if a required input file is missing.
    An existing output file is overwritten.
    """
    count_filename = f"{input_prefix}.abundance.txt"

    rep_filenames = [
        (f"{input_prefix}.rep.fq", "fastq"),
        (f"{input_prefix}.rep.fastq", "fastq"),
        (f"{input_prefix}.rep.fa", "fasta"),
        (f"{input_prefix}.rep.fasta", "fasta"),
    ]

    rep_filename = None
    rep_type = None
    # no break on purpose: when several candidates exist the last one wins
    for x, feature in rep_filenames:
        if Path(x).exists():
            rep_filename = x
            rep_type = feature

    if rep_filename is None:
        logger.error(
            f"Expected to find input fasta or fastq files {input_prefix}.rep.fa or {input_prefix}.rep.fq. Not found. Abort!"
        )
        sys.exit(-1)

    if not include_single_exons:
        from cupcake.sequence.GFF import collapseGFFReader

        gff_filename = f"{input_prefix}.gff"
        logger.info(f"Reading {gff_filename} to exclude single exons...")
        # a set, because membership is tested once per abundance record below
        good_ids = {
            r.seqid for r in collapseGFFReader(gff_filename) if len(r.ref_exons) >= 2
        }
    else:
        good_ids = set()

    if demux_file is None and not Path(count_filename).exists():
        logger.error(f"Cannot find {count_filename}. Abort!")
        sys.exit(-1)

    if matchAnnot_parsed is not None and not Path(matchAnnot_parsed).exists():
        logger.error(f"Cannot find {matchAnnot_parsed}. Abort!")
        sys.exit(-1)

    if sqanti_class is not None and not Path(sqanti_class).exists():
        logger.error(f"Cannot find {sqanti_class}. Abort!")
        sys.exit(-1)

    if matchAnnot_parsed is not None:
        with open(matchAnnot_parsed) as ma:
            match_dict = {r["pbid"]: r for r in DictReader(ma, delimiter="\t")}
        for k in match_dict:
            match_dict[k]["category"] = match_dict[k]["score"]
    elif sqanti_class is not None:
        logger.info(f"Reading {sqanti_class} to get gene/isoform assignment...")
        match_dict = {}
        with open(sqanti_class) as sc:
            for r in DictReader(sc, delimiter="\t"):
                if r["associated_transcript"] == "novel":
                    # make each novel transcript name unique
                    refisoform = f"novel_{r['isoform']}"
                else:
                    refisoform = r["associated_transcript"]
                match_dict[r["isoform"]] = {
                    "refgene": r["associated_gene"],
                    "refisoform": refisoform,
                    "category": r["structural_category"],
                }
    else:
        match_dict = None

    with open(rep_filename) as rf:
        seqlen_dict = {
            r.id.split("|")[0]: len(r.seq) for r in SeqIO.parse(rf, rep_type)
        }

    to_write = {}  # sample -> {pbid -> fl count}
    if demux_file is None:
        to_write["all"] = {}
        with open(count_filename) as f:
            # skip the leading '#' comment lines
            while True:
                cur = f.tell()
                if not f.readline().startswith("#"):
                    f.seek(cur)  # rewind so DictReader sees the column header
                    break
            for r in DictReader(f, delimiter="\t"):
                if r["pbid"] in good_ids or include_single_exons:
                    to_write["all"][r["pbid"]] = r["count_fl"]
    else:
        d, samples = read_demux_fl_count_file(demux_file)
        for s in samples:
            to_write[s] = {}
        for pbid, d2 in d.items():
            for s in samples:
                if pbid in good_ids or include_single_exons:
                    to_write[s][pbid] = d2[s]

    has_annotation = matchAnnot_parsed is not None or sqanti_class is not None
    for sample in to_write:
        out_path = Path(f"{output_prefix}.{sample}.txt")
        # "w", not "a+": appending to an old file would repeat the header mid-file
        with out_path.open("w") as h:
            if has_annotation:
                h.write(
                    "pbid\tpbgene\tlength\trefisoform\trefgene\tcategory\tfl_count\n"
                )
            else:
                h.write("pbid\tpbgene\tlength\tfl_count\n")

            for pbid, fl_count in to_write[sample].items():
                if has_annotation:
                    if pbid not in match_dict:
                        logger.warning(
                            f"Ignoring {pbid} because not in annotation (SQANTI/MatchAnnot) file."
                        )
                        continue
                    m = match_dict[pbid]
                    h.write(f"{pbid}\t{pbid.split('.')[1]}\t{seqlen_dict[pbid]}\t")
                    h.write(f'{m["refisoform"]}\t{m["refgene"]}\t{m["category"]}\t')
                else:
                    h.write(f'{pbid}\t{pbid.split(".")[1]}\t{seqlen_dict[pbid]}\t')
                h.write(f"{fl_count}\n")
        logger.info(
            f"Output written to {Path(f'{output_prefix}.{sample}.txt').resolve()}."
        )
def collate_gene_info(
    group_filename,
    csv_filename,
    class_filename,
    output_filename,
    ontarget_filename=None,
    dedup_ORF_prefix=None,
    no_extra_base=False,
    is_clustered=False,
):
    """
    Write one tab-delimited row per read, combining group, UMI/BC and
    classification information:

    <id>, <pbid>, <length>, <transcript>, <gene>, <category>, <ontarget Y|N|NA>,
    <ORFgroup NA|NoORF|groupID>, <UMI>, <UMIrev>, <BC>, <BCrev>

    :param group_filename: group file, read with ``read_group_info`` into a
        read id -> pbid mapping
    :param csv_filename: tab-delimited file with ``id``, ``extra``, ``UMI`` and
        ``BC`` columns
    :param class_filename: tab-delimited classification file with ``isoform``,
        ``length``, ``structural_category``, ``associated_transcript`` and
        ``associated_gene`` columns
    :param output_filename: file to write
    :param ontarget_filename: optional tab-delimited file with ``read_id`` and
        ``genes`` columns, looked up by pbid; a non-empty ``genes`` gives "Y"
    :param dedup_ORF_prefix: optional prefix; ``<prefix>.group.txt`` maps an
        ORF group id to its comma-separated member pbids
    :param no_extra_base: if true (and not ``is_clustered``), skip reads whose
        ``extra`` column is not "NA"
    :param is_clustered: if true, read ids have the form
        ``<index>-<UMI>-<BC>-<ccs id>`` and UMI/BC are taken from the id
        instead of from ``csv_filename``
    """
    FIELDS = [
        "id",
        "pbid",
        "length",
        "transcript",
        "gene",
        "category",
        "ontarget",
        "ORFgroup",
        "UMI",
        "UMIrev",
        "BC",
        "BCrev",
    ]

    group_info = read_group_info(group_filename)
    with open(csv_filename) as csv_handle:
        umi_bc_info = {r["id"]: r for r in DictReader(csv_handle, delimiter="\t")}
    with open(class_filename) as class_handle:
        sqanti_info = {
            r["isoform"]: r for r in DictReader(class_handle, delimiter="\t")
        }
    if ontarget_filename is not None:
        with open(ontarget_filename) as ontarget_handle:
            ontarget_info = {
                r["read_id"]: r for r in DictReader(ontarget_handle, delimiter="\t")
            }

    if dedup_ORF_prefix is not None:
        # seqid --> which group they belong to (ex: PB.1.2 --> ORFgroup_PB.1_1)
        dedup_ORF_info = {}
        with open(f"{dedup_ORF_prefix}.group.txt") as orf_handle:
            for line in orf_handle:
                group_id, members = line.strip().split("\t")
                for pbid in members.split(","):
                    dedup_ORF_info[pbid] = group_id

    with open(output_filename, "w") as f:
        writer = DictWriter(f, FIELDS, delimiter="\t")
        writer.writeheader()

        for ccs_id, pbid in group_info.items():
            if pbid not in sqanti_info:
                logger.error(f"ignoring ID {pbid} cuz not in classification file.")
                continue

            if is_clustered:
                # id: 1-ATCGAATGT-GCTTCTTTCACCTATCGATGATGGCTCAT-m64015_200531_015713/110297924/ccs
                _index, _umi, _bc, _ccs_id = ccs_id.split("-")
                ccs_id = _ccs_id

            if (
                no_extra_base
                and not is_clustered
                and umi_bc_info[ccs_id]["extra"] != "NA"
            ):
                logger.info(f"ignoring ID {pbid} cuz extra bases.")
                continue

            info = sqanti_info[pbid]
            rec = {
                "id": ccs_id,
                "pbid": pbid,
                "length": info["length"],
                "category": info["structural_category"],
                "transcript": info["associated_transcript"],
                "gene": info["associated_gene"],
            }

            if is_clustered:
                rec["UMI"] = _umi
                rec["BC"] = _bc
            else:
                rec["UMI"] = umi_bc_info[ccs_id]["UMI"]
                rec["BC"] = umi_bc_info[ccs_id]["BC"]
            rec["UMIrev"] = Seq(rec["UMI"]).reverse_complement()
            rec["BCrev"] = Seq(rec["BC"]).reverse_complement()

            if ontarget_filename is None:
                rec["ontarget"] = "NA"
            else:
                rec["ontarget"] = "Y" if ontarget_info[pbid]["genes"] != "" else "N"

            if dedup_ORF_prefix is None:
                rec["ORFgroup"] = "NA"
            elif pbid not in dedup_ORF_info:
                rec["ORFgroup"] = "NoORF"
            else:
                rec["ORFgroup"] = dedup_ORF_info[pbid]

            writer.writerow(rec)