Example 1
def make_fake_genome(genome_d, gff_info, locus, output_prefix, output_name):

    chrom = gff_info[locus].chrom
    regions = gff_info[locus].regions

    with open(f"{output_prefix}.fasta", "w") as f:
        f.write(">" + output_name + "\n")
        for s, e in regions:
            f.write(str(genome_d[chrom][s:e].seq))
        f.write("\n")

    # for mapping, write <0-based index on fake genome>, <ref chrom>, <0-based index on ref genome>
    with open(f"{output_prefix}.mapping.txt", "w") as f:
        i = 0
        for s, e in regions:
            for j in range(s, e):
                f.write(f"{i},{chrom},{j}\n")
                i += 1

    with open(f"{output_prefix}.pbids.txt", "w") as f:
        f.write("\n".join(gff_info[locus].isoforms) + "\n")

    logger.info(
        f"Output written to {output_prefix}.fasta, {output_prefix}.mapping.txt, {output_prefix}.pbids.txt.",
    )
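
A minimal usage sketch for the function above; it assumes make_fake_genome and its logger are in scope, and the file name, locus ID, coordinates, and LocusInfo stand-in are all hypothetical (the real gff_info records only need chrom, regions, and isoforms attributes).

from collections import namedtuple

from Bio import SeqIO

# Hypothetical stand-in for the real gff_info record type.
LocusInfo = namedtuple("LocusInfo", ["chrom", "regions", "isoforms"])

genome_d = SeqIO.to_dict(SeqIO.parse(open("genome.fasta"), "fasta"))
gff_info = {
    "PB.1": LocusInfo(
        chrom="chr1",
        regions=[(1000, 1200), (5000, 5350)],  # 0-based, end-exclusive
        isoforms=["PB.1.1", "PB.1.2"],
    ),
}
make_fake_genome(genome_d, gff_info, "PB.1", "fake", "fake_PB.1")
# writes fake.fasta, fake.mapping.txt, fake.pbids.txt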
Example 2
def get_seq_stats(filename, binwidth):
    print("file type is:", type_fa_or_fq(filename))

    with open(f"{filename.name}.seqlengths.txt", "w") as f:
        lens = []
        for r in SeqIO.parse(open(filename), type_fa_or_fq(filename)):
            f.write(f"{r.id}	{str(len(r.seq))}\n")
            lens.append(len(r.seq))

    logger.info(f"{len(lens)} sequences")
    logger.info(f"min: {min(lens)}")
    logger.info(f"max: {max(lens)}")
    logger.info(f"avg: {sum(lens) * 1.0 / len(lens)}")

    # print by 1 kb bins
    logger.info("Length Breakdown by kb range:")

    _max = (max(lens) // binwidth) + 1
    bins = [0] * _max
    for x in lens:
        bins[x // binwidth] += 1

    for i in range(0, _max):
        if binwidth == 1000:
            print(f"{i}-{i + 1} kb: {bins[i]}")
        else:
            print(f"{i * binwidth}-{(i + 1) * binwidth}: {bins[i]}")

    print("5-95% percentile:", np.percentile(lens, 5), np.percentile(lens, 95))
Example 3
def error_correct_haplotypes(hap_obj, isoform_tally, diff_arr,
                             hap_count_ordered):

    # create new hap_obj and old_to_new_map dict
    new_hap_obj = Haplotypes(hap_obj.hap_var_positions, hap_obj.ref_at_pos,
                             hap_obj.count_of_vars_by_pos)
    old_to_new_map = {}
    for i, j in enumerate(diff_arr.argmin(axis=0)):
        # haplotype i maps to haplotype hap_count_ordered[j][0]
        k = hap_count_ordered[j][0]
        new_hap_index, msg = new_hap_obj.match_or_add_haplotype(
            hap_obj.haplotypes[k])
        old_to_new_map[i] = new_hap_index

    # now create a new isoform_tally
    new_isoform_tally = {}
    for k, v in isoform_tally.items():
        new_isoform_tally[k] = Counter()
        for old_hap_index, count in v.items():
            if old_hap_index not in old_to_new_map:
                logger.info(f"Discarding: {hap_obj.haplotypes[old_hap_index]}")
                continue
            new_hap_index = old_to_new_map[old_hap_index]
            new_isoform_tally[k][new_hap_index] += count
    return old_to_new_map, new_hap_obj, new_isoform_tally
Example 4
def main(
    sam_filename: str = typer.Argument(...),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    sam_filename = Path(sam_filename)
    if sam_filename.suffix != ".sam":
        raise RuntimeError("Only accepts files ending in .sam. Abort!")

    prefix = sam_filename.stem
    output_gff = f"{prefix}.collapsed.gff"

    with open(output_gff, "w") as f:
        reader = GMAPSAMReader(sam_filename, True)
        for r in reader:
            if r.sID == "*":
                continue
            r.strand = r.flag.strand
            r.geneid = r.qID
            r.seqid = r.qID
            r.chr = r.sID
            r.ref_exons = r.segments
            r.start = r.sStart
            r.end = r.sEnd
            r.cds_exons = None
            write_collapseGFF_format(f, r)

    logger.info(f"Output written to {output_gff}.")
Example 5
    def readGTF(self, filename):
        """
        .coords files
        (0) gene name
        (1) chr
        (2) number of exons
        (3) strand
        (4) list of space-separated 1-based start, 1-based end
        """
        for line in open(filename):
            raw = line.strip().split()
            tID = raw[0]
            seqname = raw[1]
            ith = 0

            if tID in self.transcript:
                logger.info(f"duplicate tID {tID} seen, ignore!")
                continue

            self.transcript_info[tID] = {"chr": seqname}

            for i in range(4, len(raw), 2):
                start0 = int(raw[i]) - 1
                end1 = int(raw[i + 1])
                self.genome[seqname].insert(start0, end1, tID)
                self.transcript[tID].insert(start0, end1, {
                    "ith": ith,
                    "chr": seqname
                })
                self.exon[(start0, end1)].append((tID, ith, seqname))

                ith += 1
Example 6
def parse_matchAnnot(fa_or_fq,
                     filename,
                     not_pbid=False,
                     parse_FL_coverage=False):
    pbids = []
    fl_cov = {}  # only used if parse_FL_coverage is True
    for r in SeqIO.parse(open(fa_or_fq), type_fa_or_fq(fa_or_fq)):
        _id = r.id if not_pbid else r.id.split("|")[0]
        pbids.append(_id)
        if parse_FL_coverage:
            try:
                cov = int(
                    r.description.split("full_length_coverage=")[1].split(";")
                    [0])
                fl_cov[_id] = cov
            except (IndexError, ValueError):
                logger.warning(
                    f"Unable to extract `full_length_coverage=` from {r.description}. Mark as NA."
                )
                fl_cov[_id] = "NA"

    match = defaultdict(lambda:
                        (None, None, 0))  # ex: PB.1.1 -> (NOC2L, NOC2L-001, 5)

    for line in open(filename):
        i = line.find("result:")
        if i >= 0:
            raw = line[i:].strip().split()
            if len(raw) < 8:
                continue
            pbid = raw[1] if not_pbid else raw[1].split("|")[0]
            gene = raw[2]
            isoform = raw[3]
            score = int(raw[7])
            if score > match[pbid][2]:
                match[pbid] = (gene, isoform, score)

    f = open(f"{filename}.parsed.txt", "w")
    f.write("pbid\tpbgene\trefisoform\trefgene\tscore")
    if parse_FL_coverage:
        f.write("\tcount_fl")
    f.write("\n")
    for pbid in pbids:
        if not_pbid:
            pbpre = pbid
        else:
            pbpre = pbid.split(".")[1]
        _cov_text = f"\t{fl_cov[pbid]}" if parse_FL_coverage else ""
        if pbid not in match:
            f.write(f"{pbid}\t{pbpre}\tNA\tNA\tNA{_cov_text}\n")
        else:
            gene, isoform, score = match[pbid]
            if gene is None:
                f.write(f"{pbid}\t{pbpre}\tNA\tNA\tNA{_cov_text}\n")
            else:
                f.write(
                    f"{pbid}\t{pbpre}\t{isoform}\t{gene}\t{score}{_cov_text}\n"
                )
    f.close()
    logger.info(f"Output written to: {f.name}")
Example 7
def get_abundance_post_collapse(
    group_file: Path,
    cluster_report_csv: Path,
    output_prefix: str,
    restricted_movies: Optional[List[str]] = None,
):
    """

    :param collapse_prefix: collapse prefix filename (must have .group.txt present)
    :param prefix_dict:
    :param output_prefix:
    :param restricted_movies:
    :return:
    """

    if not group_file.exists():
        logger.error(f"File {group_file.name} does not exist. Abort!")
        sys.exit(-1)

    if not cluster_report_csv.exists():
        logger.error(f"File {cluster_report_csv.name} does not exist. Abort!")
        sys.exit(-1)

    cid_info = read_group_filename(group_file, is_cid=True)

    output_read_count_IsoSeq_csv(cid_info, cluster_report_csv,
                                 f"{output_prefix}.read_stat.txt")
    logger.info(f"Read stat file written to {output_prefix}.read_stat.txt")
    make_abundance_file(
        f"{output_prefix}.read_stat.txt",
        f"{output_prefix}.abundance.txt",
        restricted_movies=restricted_movies,
    )
    logger.info(f"Abundance file written to {output_prefix}.abundance.txt")
Example 8
def main(
    fasta_filename: str = typer.Argument(
        ..., help="Fasta file from which to simulate phasing data."),
    ploidity: int = typer.Option(2, "-p"),
    err_sub: float = typer.Option(...),
    copies: str = typer.Option(...),
    write_fastq: bool = typer.Option(False),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:

    assert 2 <= ploidity <= 6

    copies = list(map(int, copies.split(",")))
    assert len(copies) == ploidity

    for r in SeqIO.parse(open(fasta_filename), "fasta"):
        d2 = r.id.split("|")[0]
        logger.info(f"making {d2}")
        Path(d2).mkdir(parents=True, exist_ok=True)
        simulate_phasing_data(
            seq0=str(r.seq),
            err_sub=err_sub,
            ploidity=ploidity,
            copies=copies,
            write_fastq=write_fastq,
            working_dir=d2,
        )
Example 9
def scrub_sample_GFFs(
    sample_dirs: Dict[str, str],
    gff_filename: Union[str, Path],
    count_filename: Union[str, Path],
    group_filename: Union[str, Path],
    fastq_filename: Union[str, Path],
    output_prefix: str,
    tree: IntervalTree,
) -> None:

    for _, d in sample_dirs.items():
        with Path(d, f"{output_prefix}.gff.tmp").open("w") as outf:
            for r in GFF.collapseGFFReader(Path(d, gff_filename)):
                n = len(r.ref_exons)
                if n == 1:
                    GFF.write_collapseGFF_format(outf, r)
                    continue

                new_ref_exons = scrub_ref_exons(r, tree)
                if new_ref_exons is None:
                    logger.info(f"No changes made due to error: {r.seqid}")
                else:
                    # print "before:", r.ref_exons
                    # print "after :", new_ref_exons
                    r.ref_exons = new_ref_exons
                GFF.write_collapseGFF_format(outf, r)
        cleanup_scrubbed_files_redundancy(
            outf.name,
            Path(d, group_filename),
            Path(d, count_filename),
            Path(d, fastq_filename) if fastq_filename is not None else None,
            Path(d, output_prefix),
        )
Example 10
def trim5p3p_multithreaded(fastq_filename: Union[str, Path],
                           output_prefix: str, chunks: int) -> None:
    # first figure out how many records there are and record positions
    num_lines = 0
    for _ in open(fastq_filename, "r"):
        num_lines += 1
    num_records = num_lines // 4
    chunk_size = (num_records // chunks) + (num_records % chunks > 0)
    logger.info(
        f"{fastq_filename} has {num_records} records, {chunk_size} per chunk")

    pools = []
    records = []
    count = 0
    i = 1
    for r in SeqIO.parse(open(fastq_filename), "fastq"):
        count += 1
        records.append(r)
        if count >= chunk_size:
            p = Process(target=trim5p3p,
                        args=(records, f"{output_prefix}.{str(i)}"))
            p.start()
            print(f"Starting worker {i}...")
            pools.append(p)
            records = []
            count = 0
            i += 1
    p = Process(target=trim5p3p, args=(records, f"{output_prefix}.{str(i)}"))
    p.start()
    logger.info(f"Starting worker {i}...")
    pools.append(p)

    for p in pools:
        p.join()
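
The chunk size above is a ceiling division written without math.ceil; a quick check of the arithmetic:

num_records, chunks = 10_250, 4
chunk_size = (num_records // chunks) + (num_records % chunks > 0)
assert chunk_size == 2563  # three workers take 2563 records each, the last one takes the remainder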
Example 11
def link_files(
    src_dir: str, out_dir=Path.cwd()) -> Tuple[Path, Path, Path, Path]:
    """
    :param src_dir: job directory
    Locate mapped.fastq, read-stat, classify report link to current directory
    """

    src_dir = Path(src_dir)
    # location for mapped fastq in IsoSeq3
    mapped_fastq = src_dir.joinpath("outputs",
                                    "collapse_isoforms.fastq")  # for <SL8
    mapped_fasta = src_dir.joinpath(
        "outputs", "collapse_isoforms.fasta")  # SL8+ only fasta
    # mapped_gff = os.path.join(
    #     os.path.abspath(src_dir), "outputs", "collapse_isoforms.gff"
    # )
    read_stat = src_dir.joinpath("outputs", "collapse_isoforms.read_stat.txt")
    primer_csv = src_dir.joinpath("outputs", "flnc.report.csv")

    if mapped_fastq.exists():
        logger.info("Detecting IsoSeq task directories...")
        return out_dir, mapped_fastq, read_stat, primer_csv
    elif mapped_fasta.exists():
        logger.info("Detecting IsoSeq task directories...")
        return out_dir, mapped_fasta, read_stat, primer_csv
    else:
        raise FileNotFoundError(
            "Cannot find expected files (ex: collapse_isoforms.fastq) in job directory! Does not look like a Iso-Seq job!"
        )
Example 12
def brangus(vcf_filename, out_filename, unzip_snps=None):
    if unzip_snps is None:
        unzip_snps = defaultdict(lambda: {})
        for r in vcfpy.Reader.from_path(vcf_filename):
            unzip_snps[r.CHROM][r.POS] = r

    logger.info(f"Finished reading {vcf_filename}")
    with open(out_filename, "w") as out_f:
        FIELDS = [
            "dir",
            "chrom",
            "pos",
            "strand",
            "ref",
            "alt_Short",
            "alt_PB",
            "in_Short",
            "in_PB",
            "cov_Short",
            "cov_PB",
            "genomic_HP",
        ]
        writer = DictWriter(out_f, FIELDS, delimiter="\t")
        writer.writeheader()
        dirs = glob.glob("by_loci/*size*/")
        for d1 in dirs:
            mpileup = Path(d1, "ccs.mpileup")
            mapfile = Path(d1, "fake.mapping.txt")
            vcffile = Path(d1, "phased.partial.vcf")
            config = Path(d1, "config")
            nosnp = Path(d1, "phased.partial.NO_SNPS_FOUND")
            if not vcffile.exists():
                if not nosnp.exists():
                    logger.error(f"Skipping {d1} because no SNPs found.")
            else:
                logger.info(f"Evaluating {d1}.")
                strand = "NA"
                if config.exists():  # find the strand this gene family is on
                    for line in open(config):
                        if line.startswith("ref_strand="):
                            strand = line.strip().split("=")[1]
                good_positions, cov_at_pos = get_positions_to_recover(
                    mapfile, mpileup, unzip_snps, min_cov=30)
                name = d1.split("/")[1]
                eval_isophase(
                    vcffile,
                    unzip_snps,
                    good_positions,
                    cov_at_pos,
                    {},
                    {},
                    writer,
                    name,
                    strand,
                )

    return
Example 13
def convert_sam_rec_to_gff3_rec(r, source, qid_index_dict=None):
    """
    :param r: GMAPSAMRecord record
    :param qid_seen: list of qIDs processed so far -- if redundant, we have to put a unique suffix
    :return SeqRecord ready to be written as GFF3
    """
    if r.sID == "*":
        logger.info(f"Skipping {r.qID} because unmapped.")
        return None
    t_len = sum(e.end - e.start for e in r.segments)
    seq = Seq("A" * t_len)  # DO NOT CARE since sequence is not written in GFF3
    rec = SeqRecord(seq, r.sID)
    strand = 1 if r.flag.strand == "+" else -1

    # indels = r.num_ins + r.num_del
    # mismatches = r.num_nonmatches
    # matches = r.num_mat_or_sub - r.num_nonmatches

    if qid_index_dict is not None:
        if r.qID in qid_index_dict:
            qid_index_dict[r.qID] += 1
            r.qID += f"_dup{str(qid_index_dict[r.qID])}"
        else:
            qid_index_dict[r.qID] += 1

    gene_qualifiers = {"source": source, "ID": r.qID, "Name": r.qID}  # for gene record
    #    mRNA_qualifiers = {"source": source, "ID": r.qID+'.mRNA', "Name": r.qID+'.mRNA', "Parent": r.qID,
    #                       "coverage": "{0:.2f}".format(r.qCoverage*10**2) if r.qCoverage is not None else "NA",
    #                       "identity": "{0:.2f}".format(r.identity*10**2),
    #                       "matches": matches, "mismatches": mismatches, "indels": indels}

    # gene line, one per record
    top_feature = SeqFeature(
        FeatureLocation(r.sStart, r.sEnd),
        type="gene",
        strand=strand,
        qualifiers=gene_qualifiers,
    )
    # mRNA line, one per record
    # alternatively: [SeqFeature(FeatureLocation(r.sStart, r.sEnd), type="mRNA", strand=strand, qualifiers=mRNA_qualifiers)]
    top_feature.sub_features = []

    # exon lines, as many exons per record
    for i, e in enumerate(r.segments):
        _id = f"{r.qID}.exon{i+1}"
        exon_qual = {"source": source, "ID": _id, "Name": _id}
        top_feature.sub_features.append(
            SeqFeature(
                FeatureLocation(e.start, e.end),
                type="exon",
                strand=strand,
                qualifiers=exon_qual,
            )
        )
    rec.features = [top_feature]
    return rec
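
A usage sketch for the converter above, serializing the records with bcbio-gff's GFF.write (whose sub_features convention the code follows); the SAM path, the "GMAP" source tag, and the availability of GMAPSAMReader in scope are assumptions.

from collections import defaultdict

from BCBio import GFF  # bcbio-gff package

# qid_index_dict must default missing keys to 0 (see the else-branch above).
qid_index_dict = defaultdict(int)
records = []
for r in GMAPSAMReader("aligned.sam", True):  # hypothetical SAM path
    rec = convert_sam_rec_to_gff3_rec(r, "GMAP", qid_index_dict)
    if rec is not None:
        records.append(rec)

with open("aligned.gff3", "w") as out_f:
    GFF.write(records, out_f)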
Example 14
def shaded_bed12_post_sqanti(
    sqanti_class_filename: Union[str, Path],
    input_bed12: Union[str, Path],
    output_prefix: str,
    FL_fieldnames: List[str] = ["FL"],
    ok_to_ignore: bool = False,
) -> None:

    # read input BED12 file into dict
    bed_info = {}  # isoform --> bed record
    for line in open(input_bed12):
        raw = line.strip().split()
        bed_info[raw[3]] = raw

    CPM_fieldnames = {}  # CPM -> FL field names
    for k in FL_fieldnames:
        assert k.startswith("FL")
        if k == "FL":
            CPM_fieldnames["CPM"] = "FL"
        else:
            assert k.startswith("FL.")
            CPM_fieldnames["CPM." + k[3:]] = k

    # group SQANTI3 classification file by `associated_gene`
    records_by_gene = defaultdict(lambda: [])
    total_fl_count_dict = Counter()
    for r in DictReader(open(sqanti_class_filename), delimiter="\t"):
        records_by_gene[r["associated_gene"]].append(r)
        for cpm_k, fl_k in CPM_fieldnames.items():
            total_fl_count_dict[cpm_k] += int(r[fl_k]) if r[fl_k] != "NA" else 0

    for cpm_k in total_fl_count_dict:
        if total_fl_count_dict[cpm_k] == 0:
            raise RuntimeError(
                f"No counts observed in column `{CPM_fieldnames[cpm_k]}`. Ignore!"
            )

    logger.info(f"Generating count RGB for columns: {', '.join(CPM_fieldnames.keys())}")
    bed_writers = {}
    for cpm_k in CPM_fieldnames:
        outfile = f"{output_prefix}.{cpm_k}.bed12"
        logger.info(f"Writing output to {outfile}....")
        bed_writers[cpm_k] = open(outfile, "w")
        bed_writers[cpm_k].write("track name=PacBioColored itemRgb=On\n")

    # calculate FL CPM
    for _, records in records_by_gene.items():
        for r in records:
            for cpm_k, fl_k in CPM_fieldnames.items():
                r[cpm_k] = (
                    (int(r[fl_k]) if r[fl_k] != "NA" else 0)
                    * (10 ** 6)
                    / total_fl_count_dict[cpm_k]
                )
        shade_isoforms_for_gene_group(records, bed_info, bed_writers, ok_to_ignore)

    for handle in bed_writers.values():
        handle.close()
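
The per-column CPM computed in the loop above is just the FL count scaled to counts per million of that column's total; a quick numeric check:

fl_count, total_fl = 25, 1_250_000
cpm = fl_count * (10 ** 6) / total_fl
assert cpm == 20.0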
Example 15
def fq2fa(input_file):
    if not (input_file.lower().endswith(".fastq")
            or input_file.lower().endswith(".fq")):
        raise AssertionError(
            f"Input {input_file} does not end with .fastq or .fq! Abort")

    output = Path(input_file).with_suffix(".fasta")

    with open(output, "w") as f:
        for r in SeqIO.parse(open(input_file), "fastq"):
            SeqIO.write(r, f, "fasta")

    logger.info(f"Output written to {output}")
Example 16
def collect_all_vcf(
    dirs: List[str],
    vcf_filename: str = "phased.partial.vcf",
    output: str = "IsoSeq_IsoPhase.vcf",
) -> None:
    no_snp_found_filename = Path(f"{Path(vcf_filename).stem}.NO_SNPS_FOUND")
    snps_by_chrom = defaultdict(lambda: [])

    reader = None

    for d in dirs:
        filename = Path(d, vcf_filename)
        if not filename.exists():
            if not no_snp_found_filename.exists():
                logger.info("VCF file {filename} does not exist. Skipping.")
            continue
        with open(filename) as rf:
            reader = vcfpy.Reader.from_stream(rf)

            for r in reader:
                c = Counter()  # genotype -> count
                for x in r.samples:
                    if x.data.GT.count("|") == 0:
                        c[x.data.GT] += x.data.HQ
                    else:
                        for i, gt in enumerate(x.data.GT.split("|")):
                            c[gt] += x.data.HQ[i]
                c_keys = c.keys()
                genotype = "|".join(str(k) for k in c_keys)
                counts = ",".join(str(c[k]) for k in c_keys)
                r.samples = [
                    vcfpy.Call(
                        r,
                        "SAMPLE",
                        vcfpy.OrderedDict([("GT", genotype), ("HQ", counts)]),
                    )
                ]
                snps_by_chrom[r.CHROM].append((r.POS, r))

    keys = list(snps_by_chrom.keys())
    keys.sort()

    if reader is not None:
        reader.samples = ["SAMPLE"]
        with open(output, "w") as out_f:
            writer = vcfpy.Writer(out_f, reader)
            for k in keys:
                v = snps_by_chrom[k]
                v.sort(key=lambda x: x[0])
                for _, rec in v:
                    writer.write_record(rec)
        logger.info(f"Output written to: {output}")
Example 17
def main(
    sample_config: str = typer.Argument(...),
    summary_report: str = typer.Argument(...),
    output_prefix: str = typer.Argument(...),
    min_sample: int = typer.Option(
        1, "-S", help="Minimum number of samples as evidence (default: 1)"),
    min_transcript: int = typer.Option(
        2, "-T",
        help="Minimum number of transcripts as evidence (default: 2)"),
    # parser.add_argument("-C", "--accept_all_canonical", action="store_true", default=False, help="Accept all canonical jucntions (default: false)")
    scrubbed_junction_file: Optional[Union[str, Path]] = typer.Option(
        None,
        help=
        "Scrubbed junction bed --- if given, directly use it to scrub GFFs."),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
):
    (
        sample_dirs,
        sample_names,
        group_filename,
        gff_filename,
        count_filename,
        fastq_filename,
    ) = sp.read_config(sample_config)

    report_filename = summary_report

    if scrubbed_junction_file is None:
        output_filename = f"{output_prefix}.scrubbed.junction.bed"
        tree = scrub_junctions(report_filename, output_filename, min_sample,
                               min_transcript, True)
        logger.info(f"Scrubbed junction written to: {output_filename}")
    else:
        output_filename = scrubbed_junction_file
        logger.info(f"Reading scrubbed junction file: {output_filename}")
        tree = read_scrubbed_junction_to_tree(output_filename)

    scrub_sample_GFFs(
        sample_dirs,
        gff_filename,
        count_filename,
        group_filename,
        fastq_filename,
        output_prefix,
        tree,
    )
Example 18
def regroup_gff(
    pooled_gff, demux_count_file, output_prefix, out_group_dict, in_fafq=None
):
    """
    :param pooled_gff: pooled GFF file of collapsed isoforms
    :param demux_count_file: comma-delimited per-barcode count file
    :param output_prefix: output prefix for GFF
    :param out_group_dict: dict of barcode name --> group to belong to (ex: {'EM1':'EM', 'EM2':'EM'})
    :param in_fafq: optional fasta/fastq that was input to SAM
    """
    if in_fafq is not None:
        type_fafq = get_type_fafq(in_fafq)
    in_tissue = defaultdict(
        lambda: set()
    )  # pbid --> list of tissue it is in (EM, END, R)

    for r in DictReader(open(demux_count_file), delimiter=","):
        for k, v in r.items():
            if k != "id" and int(v) > 0:
                in_tissue[r["id"]].add(k)

    # in_tissue = dict(in_tissue)

    handles = {}
    handles_fafq = {}
    for g in out_group_dict.values():
        handles[g] = open(f"{output_prefix}_{g}_only.gff", "w")
        if in_fafq is not None:
            handles_fafq[g] = open(f"{output_prefix}_{g}_only.{type_fafq}", "w")

    if in_fafq is not None:
        fafq_dict = SeqIO.to_dict(SeqIO.parse(open(in_fafq), type_fafq))
        fafq_dict_keys = list(fafq_dict.keys())
        for k in fafq_dict_keys:
            m = rex_pbid.match(k)
            if m is not None:
                fafq_dict[m.group(1)] = fafq_dict[k]
    reader = GFF.collapseGFFReader(pooled_gff)
    for r in reader:
        groups_to_write_in = set()
        pbid = r.seqid
        if pbid not in in_tissue:
            logger.warning(
                f"{pbid} does not belong to any group indicated by outgroup_dict"
            )
            continue
        for tissue in in_tissue[pbid]:
            groups_to_write_in.add(out_group_dict[tissue])

        for g in groups_to_write_in:
            GFF.write_collapseGFF_format(handles[g], r)
            if in_fafq is not None:
                SeqIO.write(fafq_dict[pbid], handles_fafq[g], type_fafq)

    for h in handles.values():
        h.close()
    for h in handles_fafq.values():
        h.close()
Example 19
def link_files(src_dir, out_dir=Path.cwd()):
    """
    :param src_dir: job directory
    Locate HQ isoform, (cluster) report.csv, (classify) file.csv link to current directory
    """
    src_dir = Path(src_dir)
    # location for HQ fastq in IsoSeq1
    hq_fastq = src_dir.joinpath(
        "tasks",
        "pbtranscript.tasks.combine_cluster_bins-0",
        "hq_isoforms.fastq",
    )
    # location for HQ fastq in IsoSeq2
    hq_fastq2 = src_dir.joinpath(
        "tasks",
        "pbtranscript2tools.tasks.collect_polish-0",
        "all_arrowed_hq.fastq",
    )
    # location for cluster report in IsoSeq1
    cluster_csv = src_dir.joinpath(
        "tasks",
        "pbtranscript.tasks.combine_cluster_bins-0",
        "cluster_report.csv",
    )
    cluster_csv2 = src_dir.joinpath(
        "tasks",
        "pbtranscript2tools.tasks.collect_polish-0",
        "report.csv",
    )
    # location for classify report in IsoSeq1 and 2
    primer_csv = src_dir.joinpath("tasks", "pbcoretools.tasks.gather_csv-1",
                                  "file.csv")

    if hq_fastq.exists():
        logger.info("Detecting IsoSeq1 task directories...")
        out_dir.joinpath("hq_isoforms.fastq").symlink_to(hq_fastq)
        out_dir.joinpath("cluster_report.csv").symlink_to(cluster_csv)
        out_dir.joinpath("classify_report.csv").symlink_to(primer_csv)
        isoseq_version = "1"
    else:
        logger.info("Detecting IsoSeq2 task directories...")
        out_dir.joinpath("hq_isoforms.fastq").symlink_to(hq_fastq2)
        out_dir.joinpath("cluster_report.csv").symlink_to(cluster_csv2)
        out_dir.joinpath("classify_report.csv").symlink_to(primer_csv)
        isoseq_version = "2"
    return (
        out_dir,
        "hq_isoforms.fastq",
        "cluster_report.csv",
        "classify_report.csv",
        isoseq_version,
    )
Example 20
def main(gtf):
    transcript_tally = {}
    for tID in gtf.transcript:
        transcript_tally[tID] = [0] * len(gtf.get_exons(tID))
    for r in btabBlockReader(
            "sim_gencode_20x_first1000_test2.gmap.tophits.btab"):
        path = btab_reclist_to_interval_list_0basedStart(r)
        info = match_transcript(gtf, r[0]["chr"], path)
        if info["matchedExons"] is None:
            logger.info(f"Did not find a match for {r[0]['seqid']}!")
            continue
        for i, _ in info["matchedExons"]:
            transcript_tally[info["tID"]][i] += 1
    return transcript_tally
Example 21
def err_correct(
    genome_file: Path,
    sam_file: Path,
    output_err_corrected_fasta: Path,
    genome_dict: Optional[Dict[str, SeqIO.SeqRecord]] = None,
) -> None:
    if genome_dict is None:
        genome_dict = {}
        logger.info(f"Loading {genome_file.name}")
        for r in tqdm(SeqIO.parse(OpenFile(genome_file, "r"), "fasta")):
            genome_dict[r.name] = r
        logger.info(f"Finished reading {genome_file}")

    with open(output_err_corrected_fasta, "w") as f:
        reader = BioReaders.GMAPSAMReader(str(sam_file), True)
        for r in tqdm(reader):
            logger.debug(r)  # per-record trace; too noisy at INFO level
            if r.sID == "*":
                continue
            seq = consistute_genome_seq_from_exons(genome_dict, r.sID,
                                                   r.segments, r.flag.strand)
            # logger.info(f">{r.qID}")
            f.write(f">{r.qID}\n{seq}\n")

    logger.info(f"output written to {output_err_corrected_fasta}")
Example 22
def get_roi_len(seqid: str):
    # before isoseq3: <movie>/<zmw>/<start>_<end>_CCS
    # for isoseq3: <movie>/<zmw>/ccs
    if seqid.endswith("/ccs"):
        logger.info(
            "WARNING: isoseq3 format detected. Output `length` column will be `NA`."
        )
        return "NA"
    elif not seqid.endswith("_CCS"):
        logger.error(
            "Sequence ID format must be <movie>/<zmw>/<start>_<end>_CCS or <movie>/<zmw>/ccs! Abort!"
        )
        sys.exit(-1)
    s, e, junk = seqid.split("/")[2].split("_")
    return abs(int(s) - int(e))
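
Two illustrative IDs (movie/ZMW names hypothetical) showing the two supported formats:

assert get_roi_len("m64012_181221_231243/4194370/30_1837_CCS") == 1807
assert get_roi_len("m64012_181221_231243/4194370/ccs") == "NA"  # isoseq3 style, logs a warning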
Example 23
def main(
    genome_fasta: str = typer.Argument(..., help="Reference genome fasta"),
    flnc_filename: str = typer.Argument(..., help="FLNC fastq file"),
    gff_filename: str = typer.Argument(
        ..., help="GFF file of transcripts, IDs must be PB.X.Y"),
    stat_filename: str = typer.Argument(
        ..., help="Tab-delimited read stat file linking FLNC to PB.X.Y"),
    coverage: int = typer.Option(
        40,
        "--coverage",
        "-c",
        help="Minimum FLNC coverage required (default: 40)",
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    if Path("by_loci").exists() and Path("by_loci").is_dir():
        logger.error(
            "Directory by_loci/ already exists. Delete before running!")
        sys.exit(-1)

    if not Path(genome_fasta).exists():
        logger.error(f"Cannot find genome FASTA {genome_fasta}. Abort!")
        sys.exit(-1)

    if not Path(flnc_filename).exists():
        logger.error(f"Cannot find FLNC file {flnc_filename}. Abort!")
        sys.exit(-1)

    if not Path(gff_filename).exists():
        logger.error(f"Cannot find GFF file {gff_filename}. Abort!")
        sys.exit(-1)

    if not Path(stat_filename).exists():
        logger.error(f"Cannot find Stat file {stat_filename}. Abort!")
        sys.exit(-1)

    logger.info(f"Reading genome fasta {genome_fasta}...")
    genome_d = SeqIO.to_dict(SeqIO.parse(open(genome_fasta), "fasta"))

    select_loci_to_phase(genome_d, gff_filename, stat_filename, flnc_filename,
                         coverage)
Example 24
def fa2fq(input_file):
    if not (
        input_file.lower().endswith(".fasta") or input_file.lower().endswith(".fa")
    ):
        raise AssertionError(
            f"Input {input_file} does not end with .fasta or .fa! Abort"
        )
    output = Path(input_file).with_suffix(".fastq")

    f = open(output, "w")
    for r in SeqIO.parse(open(input_file), "fasta"):
        r.letter_annotations["phred_quality"] = [60] * len(r.seq)
        SeqIO.write(r, f, "fastq")
    f.close()

    logger.info(f"Output written to {f.name}")
    return f.name
Example 25
def main_pasa(gtf):
    pasa_tally = {}
    for tID in gtf.transcript:
        pasa_tally[tID] = [0] * len(gtf.get_exons(tID))
    pasa = GTF(
        "sim_gencode_20x_first1000_test2.pasa_assemblies.denovo_transcript_isoforms.gtf"
    )
    for tID in pasa.transcript:
        path = pasa.get_exons(tID)
        seqname = pasa.exon[(path[0].start, path[0].end)][0][2]

        info = match_transcript(gtf, seqname, path)
        if info["matchedExons"] is None:
            logger.info(f"Did not find a match for {format(tID)}!")
            continue
        for i, j in info["matchedExons"]:
            pasa_tally[info["tID"]][i] += 1
    return pasa_tally
Example 26
def scrub_ref_exons(r: Dict[str, Any],
                    tree: IntervalTree) -> Optional[List[Interval]]:
    n = len(r.ref_exons)
    new_ref_exons = []
    cur_start = r.ref_exons[0].start
    for i in range(n - 1):
        donor = r.ref_exons[i].end - 1  # make it 0-based
        accep = r.ref_exons[i + 1].start  # start is already 0-based
        match = find_best_match_junction(tree[r.chr, r.strand], donor, accep)
        if match is None:
            logger.info(
                f"donor-acceptor site {r.chr},{r.strand},{donor}-{accep} has no hit in tree!"
            )
            return None

        new_ref_exons.append(Interval(cur_start, match.start + 1))
        cur_start = match.end
    new_ref_exons.append(Interval(cur_start, r.ref_exons[-1].end))
    return new_ref_exons
Example 27
def sep_by_primer(filename, output_prefix, sample_size):
    filetype = type_fa_or_fq(filename)

    ids = [r.id for r in SeqIO.parse(open(filename), filetype)]

    n = len(ids)
    if sample_size > n:
        logger.warning(
            f"WARNING: {filename} contains only {n} sequences but subsample size at {sample_size}! Simply output whole file."
        )

    chosen_ids = random.sample(ids, min(n, sample_size))

    with open(f"{output_prefix}.random{str(sample_size)}.{filetype}", "w") as f:
        for r in SeqIO.parse(open(filename), filetype):
            if r.id in chosen_ids:
                SeqIO.write(r, f, filetype)

        logger.info(f"Randomly selected sequences written to {f.name}.")
Example 28
def main(
    snps_filename: str = typer.Argument(
        ..., help="Filename containing the list of .snps files to process."),
    genome_filename: str = typer.Argument(
        ...,
        help="Genome fasta. Chromosome IDs must agree with the .snps files!",
    ),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
):
    snps_filename = Path(snps_filename)
    genome_filename = Path(genome_filename)

    snps_files = []
    # sanity checking of input files
    for line in open(snps_filename):
        filename = Path(line.strip())
        if filename.suffix != ".snps":
            raise FileNotFoundError(
                f"Input files listed in {snps_filename} must end with .snps!"
            )
        if not filename.exists():
            raise FileNotFoundError(f"{filename} does not exist! Abort.")
        snps_files.append(filename)

    if not genome_filename.exists():
        raise FileNotFoundError(
            f"Genome file {genome_filename} does not exist!")

    logger.info(f"Reading genome file {genome_filename}....")
    genome_d = LazyFastaReader(genome_filename)

    # quick checking if the genome chromosomes have the |arrow|arrow style suffix, if they do, process it
    keys = list(genome_d.keys())
    for k in keys:
        k2 = k.split("|")[0]
        if k2 != k and k2 not in keys:
            genome_d.d[k2] = genome_d.d[k]
            logger.info(
                f"Detected | string in chromosome ID, stripping {k} to {k2}...."
            )
    logger.info("Finished reading genome.")

    for snp_file in snps_files:
        assert snp_file.suffix == ".snps"
        vcf_file = snp_file.with_suffix(".vcf")
        logger.info(f"Processing {snp_file} --> {vcf_file}")
        write_snp_to_vcf(snp_file, vcf_file, genome_filename, genome_d)
Example 29
def demux_isoseq2_no_genome(
    job_dir: Optional[Path] = None,
    hq_fastq: Optional[Path] = None,
    cluster_csv: Optional[Path] = None,
    classify_csv: Optional[Path] = None,
    output_filename=sys.stdout,
):

    if job_dir is not None:
        (
            out_dir_ignore,
            hq_fastq,
            cluster_csv,
            classify_csv,
            isoseq_version,
        ) = link_files(job_dir)
        assert isoseq_version in ("1", "2")
    else:
        for _ in (
                hq_fastq,
                cluster_csv,
                classify_csv,
        ):
            if not _.exists():
                raise FileNotFoundError(f"{_.name} was not found!")

    # info: dict of hq_isoform --> primer --> FL count
    logger.info(f"Reading {classify_csv}...")
    max_primer, classify_csv = read_classify_csv(classify_csv)
    logger.info(f"Reading {cluster_csv}...")
    info = read_cluster_csv(cluster_csv, classify_csv, isoseq_version)

    with open(output_filename, "w") as f:
        f.write(
            f"id,{','.join('primer' + str(i) for i in range(max_primer + 1))}\n"
        )
        logger.info(f"Reading {hq_fastq}...")
        for r in SeqIO.parse(open(hq_fastq), "fastq"):
            if isoseq_version == "1":
                m = hq1_id_rex.match(r.id)
            else:
                m = hq2_id_rex.match(r.id)

            if m is None:
                raise RuntimeError(
                    f"Unexpected HQ isoform ID format: {r.id}! Abort.")
            cid = m.group(1)
            f.write(r.id)
            for p in range(max_primer + 1):
                f.write(f",{info[cid][p]}")
            f.write("\n")
        logger.info(f"Count file written to {f.name}.")
Example 30
def main(
    input_file: str = typer.Option(...,
                                   "--input",
                                   "-i",
                                   help="Input fasta or fastq."),
    sam_filename: str = typer.Option(...,
                                     "--sam_filename",
                                     "-s",
                                     help="Aligned SAM filename."),
    genome_filename: str = typer.Option(...,
                                        "--genome_filename",
                                        "-g",
                                        help="Genome fasta."),
    output_prefix: str = typer.Option(...,
                                      "--output_prefix",
                                      "-o",
                                      help="Output prefix."),
    gff: Optional[str] = typer.Option(None, "--gff", help="Annotation GFF."),
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
):

    # read genome
    logger.info(f"Reading genome {genome_filename}...")
    genome_d = SeqIO.to_dict(SeqIO.parse(open(genome_filename), "fasta"))

    # read gff
    if gff is not None:
        logger.info(f"Reading annotation {gff}...")
        junction_info = read_annotation_for_junction_info(gff)
    else:
        junction_info = None

    evaluate_alignment_sam(input_file, sam_filename, genome_d, output_prefix,
                           junction_info)