Code Example #1
File: fusion_finder.py Project: Magdoll/cDNA_Cupcake
def fusion_main(fa_or_fq_filename, sam_filename, output_prefix, cluster_report_csv=None, is_fq=False, allow_extra_5_exons=True, skip_5_exon_alt=True, prefix_dict_pickle_filename=None, min_locus_coverage=.05, min_total_coverage=.99, min_locus_coverage_bp=1, min_dist_between_loci=10000):
    """
    (1) identify fusion candidates (based on mapping, total coverage, identity, etc)
    (2) group/merge the fusion exons, using an index to point to each individual part
    (3) use BranchSimple to write out a tmp GFF where
         PBfusion.1.1 is the first part of a fusion gene
         PBfusion.1.2 is the second part of a fusion gene
    (4) read the tmp file from <3> and modify it so that
         PBfusion.1 just represents the fusion gene (a single transcript GFF format)
    """
    compressed_records_pointer_dict = defaultdict(lambda: [])
    merged_exons = []
    merged_i = 0

    # step (0). check for duplicate IDs
    check_ids_unique(fa_or_fq_filename, is_fq=is_fq)

    # step (1). identify fusion candidates
    bs = branch_simple2.BranchSimple(fa_or_fq_filename, is_fq=is_fq)
    fusion_candidates = find_fusion_candidates(sam_filename, bs.transfrag_len_dict, min_locus_coverage, min_locus_coverage_bp, min_total_coverage, min_dist_between_loci)

    # step (2). merge the fusion exons
    for recs in iter_gmap_sam_for_fusion(sam_filename, fusion_candidates, bs.transfrag_len_dict):
        for v in recs.itervalues():
            if len(v) > 0:
                o = merge_fusion_exons(v, max_fusion_point_dist=100, max_exon_end_dist=0, allow_extra_5_exons=allow_extra_5_exons)
                for group in o:
                    merged_exons.append(group)
                    for r in group: compressed_records_pointer_dict[r.qID].append(merged_i)
                    merged_i += 1

    # step (3). use BranchSimple to write a temporary file
#    f_good = open(output_prefix + '.gff', 'w')
    f_group = open('branch_tmp.group.txt', 'w')
#    f_bad = f_good
    gene_index = 1
    already_seen = set()
    for qid,indices in compressed_records_pointer_dict.iteritems():
        combo = tuple(indices)
        if combo in already_seen:
            print "combo seen:", combo
            #raw_input("")
            continue
        already_seen.add(combo)
#        if gene_index == 7:
#            pdb.set_trace()
        for isoform_index,i in enumerate(indices):
            bs.cuff_index = gene_index # all parts of one fusion share the same gene index
            records = merged_exons[i]
            f_group.write("{p}.{i}.{j}\t{ids}\n".format(p="PBfusion", i=gene_index, j=isoform_index, ids=",".join(r.qID for r in records)))
#            bs.process_records(records, allow_extra_5_exons, skip_5_exon_alt, \
#                    f_good, f_bad, f_group, tolerate_end=100, \
#                    starting_isoform_index=isoform_index, gene_prefix='PBfusion')
        gene_index += 1
#    f_good.close()
#    f_bad.close()
    f_group.close()


    # step (4). read the tmp file and modify to display per fusion gene
    # IMPORTANT: sometimes a fusion can involve more than 2 loci!
    f_group = open(output_prefix + '.group.txt', 'w')
    group_info = {} # ex: PBfusion.1 --> [id1, id2, id3...]
    count = 0
    with open('branch_tmp.group.txt') as f:
        while True:
            line = f.readline().strip()
            if len(line) == 0: break
            pbid1, groups1 = line.strip().split('\t')
            group = set(groups1.split(','))
            while True:
                cur_pos = f.tell()
                line = f.readline().strip()
                if len(line) == 0: break
                new_pbid, new_group = line.strip().split('\t')
                if new_pbid.split('.')[1]!=pbid1.split('.')[1]:
                    f.seek(cur_pos)
                    break
                else: # still in the same fusion group
                    group = group.intersection(new_group.split(','))
            f_group.write("{0}\t{1}\n".format(pbid1[:pbid1.rfind('.')], ",".join(group)))
            group_info[pbid1[:pbid1.rfind('.')]] = list(group)
            count += 1
    f_group.close()
    #os.remove('branch_tmp.group.txt')

    gff_filename = output_prefix + '.gff'
    group_filename = output_prefix + '.group.txt'
    if is_fq:
        output_filename = output_prefix + '.rep.fq'
    else:
        output_filename = output_prefix + '.rep.fa'
    pick_rep(fa_or_fq_filename, sam_filename, gff_filename, group_filename, output_filename, is_fq=is_fq, pick_least_err_instead=False)

    print >> sys.stderr, "{0} fusion candidates identified.".format(count)
    print >> sys.stderr, "Output written to: {0}.gff, {0}.group.txt, {1}".format(output_prefix, output_filename)

    # (optional) step 5. get count information
    if cluster_report_csv is not None:
        get_abundance_post_collapse(output_prefix, cluster_report_csv, output_prefix)
        print >> sys.stderr, "Count information written to: {0}.abundance.txt".format(output_prefix)
Code Example #2
def main(args):

    ### sanity check that input file and input SAM exists
    if not os.path.exists(args.input):
        print >> sys.stderr, "Input file {0} does not exist. Abort.".format(
            args.fasta)
        sys.exit(-1)

    if not os.path.exists(args.sam):
        print >> sys.stderr, "SAM file {0} does not exist. Abort.".format(
            args.sam)
        sys.exit(-1)

    # check for duplicate IDs
    check_ids_unique(args.input, is_fq=args.fq)

    ignored_fout = open(args.prefix + '.ignored_ids.txt', 'w')

    if args.flnc_coverage > 0:
        f_good = open(args.prefix + '.collapsed.good.gff', 'w')
        f_bad = open(args.prefix + '.collapsed.bad.gff', 'w')
        cov_threshold = args.flnc_coverage
    else:
        f_good = open(args.prefix + '.collapsed.gff', 'w')
        f_bad = f_good
        cov_threshold = 1
    f_txt = open(args.prefix + '.collapsed.group.txt', 'w')

    b = branch_simple2.BranchSimple(args.input,
                                    cov_threshold=cov_threshold,
                                    min_aln_coverage=args.min_aln_coverage,
                                    min_aln_identity=args.min_aln_identity,
                                    is_fq=args.fq)
    iter = b.iter_gmap_sam(args.sam, ignored_fout)
    for recs in iter:  # recs is {'+': list of list of records, '-': list of list of records}
        for v in recs.itervalues():
            for v2 in v:
                if len(v2) > 0:
                    b.process_records(v2, args.allow_extra_5exon, False,
                                      f_good, f_bad, f_txt)

    ignored_fout.close()
    f_good.close()
    try:
        f_bad.close()
    except:
        pass
    f_txt.close()

    if args.max_fuzzy_junction > 0:  # need to further collapse those that have fuzzy junctions!
        collapse_fuzzy_junctions(
            f_good.name,
            f_txt.name,
            args.allow_extra_5exon,
            internal_fuzzy_max_dist=args.max_fuzzy_junction)
        os.rename(f_good.name, f_good.name + '.unfuzzy')
        os.rename(f_txt.name, f_txt.name + '.unfuzzy')
        os.rename(f_good.name + '.fuzzy', f_good.name)
        os.rename(f_txt.name + '.fuzzy', f_txt.name)

    if args.fq:
        outfile = args.prefix + ".collapsed.rep.fq"
    else:
        outfile = args.prefix + ".collapsed.rep.fa"
    if args.allow_extra_5exon:  # 5merge, pick longest
        pick_rep(args.input,
                 f_good.name,
                 f_txt.name,
                 outfile,
                 is_fq=args.fq,
                 pick_least_err_instead=False,
                 bad_gff_filename=f_bad.name)
    else:
        pick_rep(args.input,
                 f_good.name,
                 f_txt.name,
                 outfile,
                 is_fq=args.fq,
                 pick_least_err_instead=True,
                 bad_gff_filename=f_bad.name)

    print >> sys.stderr, "Ignored IDs written to:", ignored_fout.name
    print >> sys.stderr, "Output written to:"
    print >> sys.stderr, f_good.name
    print >> sys.stderr, f_txt.name
    print >> sys.stderr, outfile
    print >> sys.stderr, args
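
When max_fuzzy_junction > 0, collapse_fuzzy_junctions is assumed (judging from the rename sequence above) to write its results to <name>.fuzzy files; the four os.rename calls then archive the pre-collapse outputs as <name>.unfuzzy and promote the .fuzzy files to the canonical names. The same dance isolated as a helper (a sketch; promote_fuzzy is not part of the project):

import os

def promote_fuzzy(path):
    """Archive `path` as `<path>.unfuzzy`, then promote `<path>.fuzzy` to `path`."""
    os.rename(path, path + '.unfuzzy')
    os.rename(path + '.fuzzy', path)

# equivalent to the four renames above:
# promote_fuzzy(f_good.name); promote_fuzzy(f_txt.name)
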
Code Example #3
def fusion_main(fa_or_fq_filename,
                sam_filename,
                output_prefix,
                cluster_report_csv=None,
                is_fq=False,
                allow_extra_5_exons=True,
                skip_5_exon_alt=True,
                min_locus_coverage=.05,
                min_total_coverage=.99,
                min_locus_coverage_bp=1,
                min_dist_between_loci=10000,
                min_identity=0.95,
                is_flnc=False):
    """
    (1) identify fusion candidates (based on mapping, total coverage, identity, etc)
    (2) group/merge the fusion exons, using an index to point to each individual part
    (3) use BranchSimple to write out a tmp GFF where
         PBfusion.1.1 is the first part of a fusion gene
         PBfusion.1.2 is the second part of a fusion gene
    (4) read the tmp file from <3> and modify it so that
         PBfusion.1 just represents the fusion gene (a single transcript GFF format)
    """
    compressed_records_pointer_dict = defaultdict(lambda: [])
    merged_exons = []
    merged_i = 0

    # step (0). check for duplicate IDs
    check_ids_unique(fa_or_fq_filename, is_fq=is_fq)

    # step (1). identify fusion candidates
    bs = branch_simple2.BranchSimple(fa_or_fq_filename, is_fq=is_fq)
    fusion_candidates = find_fusion_candidates(sam_filename,
                                               bs.transfrag_len_dict,
                                               min_locus_coverage,
                                               min_locus_coverage_bp,
                                               min_total_coverage,
                                               min_dist_between_loci,
                                               min_identity=min_identity)

    # step (2). merge the fusion exons
    for recs in iter_gmap_sam_for_fusion(sam_filename, fusion_candidates,
                                         bs.transfrag_len_dict):
        for v in recs.values():
            if len(v) > 0:
                o = merge_fusion_exons(v,
                                       max_fusion_point_dist=100,
                                       max_exon_end_dist=0,
                                       allow_extra_5_exons=allow_extra_5_exons)
                for group in o:
                    merged_exons.append(group)
                    for r in group:
                        compressed_records_pointer_dict[r.qID].append(merged_i)
                    merged_i += 1

    # step (3). use BranchSimple to write a temporary file
    f_group = open('branch_tmp.group.txt', 'w')
    gene_index = 1
    already_seen = set()
    for qid, indices in compressed_records_pointer_dict.items():
        combo = tuple(indices)
        if combo in already_seen:
            #print("combo seen:", combo)
            continue
        already_seen.add(combo)

        # write the fusion parts in transcription order
        for i in indices:
            bs.cuff_index = gene_index  # all parts of one fusion share the same gene index
            records = merged_exons[i]
            isoform_index = get_isoform_index(fusion_candidates[qid],
                                              records[0].sID,
                                              records[0].sStart,
                                              records[0].sEnd)
            f_group.write("{p}.{i}.{j}\t{ids}\n".format(
                p="PBfusion",
                i=gene_index,
                j=isoform_index + 1,
                ids=",".join(r.qID for r in records)))
        gene_index += 1
    f_group.close()

    # step (4). read the tmp file and modify to display per fusion gene
    # IMPORTANT: sometimes a fusion can involve more than 2 loci!
    f_group = open(output_prefix + '.group.txt', 'w')
    group_info = {}  # ex: PBfusion.1 --> [id1, id2, id3...]
    count = 0
    with open('branch_tmp.group.txt') as f:
        while True:
            line = f.readline().strip()
            if len(line) == 0: break
            pbid1, groups1 = line.strip().split('\t')
            group = set(groups1.split(','))
            while True:
                cur_pos = f.tell()
                line = f.readline().strip()
                if len(line) == 0: break
                new_pbid, new_group = line.strip().split('\t')
                if new_pbid.split('.')[1] != pbid1.split('.')[1]:
                    f.seek(cur_pos)
                    break
                else:  # still in the same fusion group
                    group = group.intersection(new_group.split(','))
            f_group.write("{0}\t{1}\n".format(pbid1[:pbid1.rfind('.')],
                                              ",".join(group)))
            group_info[pbid1[:pbid1.rfind('.')]] = list(group)
            count += 1
    f_group.close()
    #os.remove('branch_tmp.group.txt')

    gff_filename = output_prefix + '.gff'
    group_filename = output_prefix + '.group.txt'
    if is_fq:
        output_filename = output_prefix + '.rep.fq'
    else:
        output_filename = output_prefix + '.rep.fa'
    pick_rep(fa_or_fq_filename,
             sam_filename,
             gff_filename,
             group_filename,
             output_filename,
             fusion_candidates,
             is_fq=is_fq)

    print("{0} fusion candidates identified.".format(count), file=sys.stdout)
    print("Output written to: {0}.gff, {0}.group.txt, {1}".format(
        output_prefix, output_filename),
          file=sys.stdout)

    # (optional) step 5. get count information
    if cluster_report_csv is not None:
        get_abundance_post_collapse(output_prefix, cluster_report_csv,
                                    output_prefix)
        print("Count information written to: {0}.abundance.txt".format(
            output_prefix),
              file=sys.stdout)
    elif is_flnc:
        print("Input is FLNC. Outputting FLNC counts per fusion.")
        with open(output_prefix + '.abundance.txt', 'w') as f:
            f.write("pbid\tcount_fl\n")
            for pbid, members in group_info.items():
                f.write("{0}\t{1}\n".format(pbid, len(members)))
        print("Count information written to: {0}.abundance.txt".format(
            output_prefix),
              file=sys.stdout)
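
Compared with Example #1, the step-(3) loop here numbers fusion parts with get_isoform_index instead of enumerate, so parts come out in transcription order rather than collection order. A plausible sketch of that lookup, assuming fusion_candidates[qid] is an ordered list of (sID, sStart, sEnd) loci (the real signature may differ):

def get_isoform_index(loci, sid, sstart, send):
    # return the 0-based position of the candidate locus this record overlaps;
    # hypothetical reimplementation for illustration only
    for j, (cand_sid, cand_start, cand_end) in enumerate(loci):
        if cand_sid == sid and cand_start < send and sstart < cand_end:
            return j
    raise ValueError("{0}:{1}-{2} matches no candidate locus".format(sid, sstart, send))
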
Code Example #4
def main(
    input_filename: str = typer.Option(...,
                                       "--input",
                                       help="Input FA/FQ filename"),
    sam: str = typer.Option(..., help="Sorted SAM filename"),
    fq: bool = typer.Option(False, "--fq",
                            help="Input is a fastq file"),  # store_true
    prefix: str = typer.Option(...,
                               "-p",
                               "--prefix",
                               help="Output filename prefix"),
    min_aln_coverage: float = typer.Option(0.99,
                                           "--min-coverage",
                                           "-c",
                                           help="Minimum alignment coverage"),
    min_aln_identity: float = typer.Option(0.95,
                                           "--min-identity",
                                           "-i",
                                           help="Minimum alignment identity"),
    max_fuzzy_junction: int = typer.Option(5, help="Max fuzzy junction dist"),
    max_5_diff: int = typer.Option(
        1000, help="Maximum allowed 5' difference if on same exon"),
    max_3_diff: int = typer.Option(
        100, help="Maximum allowed 3' difference if on same exon"),
    flnc_coverage: int = typer.Option(
        -1,
        help=
        "Minimum # of FLNC reads, only use this for aligned FLNC reads, otherwise results undefined!",
    ),
    gen_mol_count: bool = typer.Option(
        False,
        help=
        "Generate a .abundance.txt file based on the number of input sequences collapsed. Use only if input is FLNC or UMI-dedup output",
    ),  # store_true
    allow_extra_5exon: bool = typer.Option(
        True,
        "--dun-merge-5-shorter",
        help="Don't collapse shorter 5' transcripts (default: turned off)",
    ),  # store_false
    version: bool = typer.Option(
        None,
        "--version",
        callback=version_callback,
        is_eager=True,
        help="Prints the version of the SQANTI3 package.",
    ),
) -> None:
    # sanity check that input file and input SAM exists
    if not Path(str(input_filename)).exists():
        raise FileNotFoundError(
            f"Input file {input_filename} does not exist. Abort.")

    if not Path(sam).exists():
        raise FileNotFoundError(f"SAM file {sam} does not exist. Abort.")

    # check for duplicate IDs
    check_ids_unique(input_filename, is_fq=fq)

    with open(f"{prefix}.ignored_ids.txt", "w") as ignored_fout:

        if flnc_coverage > 0:
            # keep these files closed *until* we need to write to them
            f_good = Path(f"{prefix}.collapsed.good.gff")
            f_bad = Path(f"{prefix}.collapsed.bad.gff")
            cov_threshold = flnc_coverage
        else:
            f_good = Path(f"{prefix}.collapsed.gff")
            f_bad = f_good
            cov_threshold = 1
        f_txt = Path(f"{prefix}.collapsed.group.txt")

        b = branch_simple2.BranchSimple(
            transfrag_filename=input_filename,
            cov_threshold=cov_threshold,
            min_aln_coverage=min_aln_coverage,
            min_aln_identity=min_aln_identity,
            is_fq=fq,
            max_5_diff=max_5_diff,
            max_3_diff=max_3_diff,
        )
        rec_iter = b.iter_gmap_sam(sam, ignored_fout)
        # recs is {'+': list of list of records, '-': list of list of records}
        for recs in rec_iter:
            for v in recs.values():
                for v2 in v:
                    if len(v2) > 0:
                        b.process_records(
                            records=v2,
                            allow_extra_5_exons=allow_extra_5exon,
                            skip_5_exon_alt=False,
                            f_good=f_good,
                            f_bad=f_bad,
                            f_group=f_txt,
                        )

    # need to further collapse those that have fuzzy junctions!
    if max_fuzzy_junction > 0:
        collapse_fuzzy_junctions(
            f_good,
            f_txt,
            allow_extra_5exon,
            internal_fuzzy_max_dist=max_fuzzy_junction,
            max_5_diff=max_5_diff,
            max_3_diff=max_3_diff,
        )
        # f_good/f_txt are Paths; rename via the full path, not Path.name,
        # which would drop any directory component in the prefix
        f_good.rename(f"{f_good}.unfuzzy")
        f_txt.rename(f"{f_txt}.unfuzzy")
        Path(f"{f_good}.fuzzy").rename(f_good)
        Path(f"{f_txt}.fuzzy").rename(f_txt)

    if fq:
        outfile = f"{prefix}.collapsed.rep.fq"
    else:
        outfile = f"{prefix}.collapsed.rep.fa"
    if allow_extra_5exon:  # 5merge, pick longest
        pick_rep(
            fa_fq_filename=input_filename,
            gff_filename=f_good,
            group_filename=f_txt,
            output_filename=outfile,
            is_fq=fq,
            pick_least_err_instead=False,
            bad_gff_filename=f_bad.name,
        )
    else:
        pick_rep(
            fa_fq_filename=input_filename,
            gff_filename=f_good,
            group_filename=f_txt,
            output_filename=outfile,
            is_fq=fq,
            pick_least_err_instead=True,
            bad_gff_filename=f_bad.name,
        )

    if gen_mol_count:
        outfile = f"{prefix}.collapsed.abundance.txt"
        with open(outfile, "w") as f:
            f.write("pbid\tcount_fl\n")
            for line in open(f_txt):
                pbid, members = line.strip().split()
                f.write(f"{pbid}\t{members.count(',')+1}\n")

    logger.info(f"Ignored IDs written to: {ignored_fout.name}")
    logger.info(f"Output written to: {f_good.name}\n{f_txt.name}\n{outfile}\n")
Code Example #5
def main(args):

    ### sanity check that input file and input SAM exists
    if not os.path.exists(args.input):
        print("Input file {0} does not exist. Abort.".format(args.input),
              file=sys.stderr)
        sys.exit(-1)

    if not os.path.exists(args.sam):
        print("SAM file {0} does not exist. Abort.".format(args.sam),
              file=sys.stderr)
        sys.exit(-1)

    # check for duplicate IDs
    check_ids_unique(args.input, is_fq=args.fq)

    ignored_fout = open(args.prefix + '.ignored_ids.txt', 'w')

    if args.flnc_coverage > 0:
        f_good = open(args.prefix + '.collapsed.good.gff', 'w')
        f_bad = open(args.prefix + '.collapsed.bad.gff', 'w')
        cov_threshold = args.flnc_coverage
    else:
        f_good = open(args.prefix + '.collapsed.gff', 'w')
        f_bad = f_good
        cov_threshold = 1
    f_txt = open(args.prefix + '.collapsed.group.txt', 'w')

    b = branch_simple2.BranchSimple(args.input,
                                    cov_threshold=cov_threshold,
                                    min_aln_coverage=args.min_aln_coverage,
                                    min_aln_identity=args.min_aln_identity,
                                    is_fq=args.fq,
                                    max_5_diff=args.max_5_diff,
                                    max_3_diff=args.max_3_diff)
    iter = b.iter_gmap_sam(args.sam, ignored_fout)
    for recs in iter:  # recs is {'+': list of list of records, '-': list of list of records}
        for v in recs.values():
            for v2 in v:
                if len(v2) > 0:
                    b.process_records(v2, args.allow_extra_5exon, False,
                                      f_good, f_bad, f_txt)

    ignored_fout.close()
    f_good.close()
    try:
        f_bad.close()
    except:
        pass
    f_txt.close()

    if args.max_fuzzy_junction > 0:  # need to further collapse those that have fuzzy junctions!
        collapse_fuzzy_junctions(
            f_good.name,
            f_txt.name,
            args.allow_extra_5exon,
            internal_fuzzy_max_dist=args.max_fuzzy_junction)
        os.rename(f_good.name, f_good.name + '.unfuzzy')
        os.rename(f_txt.name, f_txt.name + '.unfuzzy')
        os.rename(f_good.name + '.fuzzy', f_good.name)
        os.rename(f_txt.name + '.fuzzy', f_txt.name)

    if args.fq:
        outfile = args.prefix + ".collapsed.rep.fq"
    else:
        outfile = args.prefix + ".collapsed.rep.fa"
    if args.allow_extra_5exon:  # 5merge, pick longest
        pick_rep(args.input,
                 f_good.name,
                 f_txt.name,
                 outfile,
                 is_fq=args.fq,
                 pick_least_err_instead=False,
                 bad_gff_filename=f_bad.name)
    else:
        pick_rep(args.input,
                 f_good.name,
                 f_txt.name,
                 outfile,
                 is_fq=args.fq,
                 pick_least_err_instead=True,
                 bad_gff_filename=f_bad.name)

    if args.gen_mol_count:
        outfile = args.prefix + '.collapsed.abundance.txt'
        with open(outfile, 'w') as f:
            f.write("pbid\tcount_fl\n")
            for line in open(f_txt.name):
                pbid, members = line.strip().split()
                f.write("{0}\t{1}\n".format(pbid, members.count(',') + 1))

    print("Ignored IDs written to: {0}".format(ignored_fout.name),
          file=sys.stdout)
    print("Output written to: {0}\n{1}\n{2}\n{3}\n".format(
        f_good.name, f_txt.name, outfile, args),
          file=sys.stdout)
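
Note the f_bad = f_good aliasing when flnc_coverage <= 0: both names refer to a single handle, so the paired close() calls close the same file twice. That is safe because file.close() is idempotent in Python, which makes the bare try/except around f_bad.close() defensive rather than necessary:

f_good = open('demo.gff', 'w')   # hypothetical filename
f_bad = f_good                   # alias: no good/bad split requested
f_good.close()
f_bad.close()                    # second close of the same handle is a no-op
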
Code Example #6
File: fusion_finder.py Project: zgt1021/cDNA_Cupcake
def fusion_main(fa_or_fq_filename,
                sam_filename,
                output_prefix,
                cluster_report_csv=None,
                is_fq=False,
                allow_extra_5_exons=True,
                skip_5_exon_alt=True,
                prefix_dict_pickle_filename=None,
                min_locus_coverage=.05,
                min_total_coverage=.99,
                min_locus_coverage_bp=1,
                min_dist_between_loci=10000):
    """
    (1) identify fusion candidates (based on mapping, total coverage, identity, etc)
    (2) group/merge the fusion exons, using an index to point to each individual part
    (3) use BranchSimple to write out a tmp GFF where
         PBfusion.1.1 is the first part of a fusion gene
         PBfusion.1.2 is the second part of a fusion gene
    (4) read the tmp file from <3> and modify it so that
         PBfusion.1 just represents the fusion gene (a single transcript GFF format)
    """
    compressed_records_pointer_dict = defaultdict(lambda: [])
    merged_exons = []
    merged_i = 0

    # step (0). check for duplicate IDs
    check_ids_unique(fa_or_fq_filename, is_fq=is_fq)

    # step (1). identify fusion candidates
    bs = branch_simple2.BranchSimple(fa_or_fq_filename, is_fq=is_fq)
    fusion_candidates = find_fusion_candidates(
        sam_filename, bs.transfrag_len_dict, min_locus_coverage,
        min_locus_coverage_bp, min_total_coverage, min_dist_between_loci)

    # step (2). merge the fusion exons
    for recs in iter_gmap_sam_for_fusion(sam_filename, fusion_candidates,
                                         bs.transfrag_len_dict):
        for v in recs.itervalues():
            if len(v) > 0:
                o = merge_fusion_exons(v,
                                       max_fusion_point_dist=100,
                                       max_exon_end_dist=0,
                                       allow_extra_5_exons=allow_extra_5_exons)
                for group in o:
                    merged_exons.append(group)
                    for r in group:
                        compressed_records_pointer_dict[r.qID].append(merged_i)
                    merged_i += 1

    # step (3). use BranchSimple to write a temporary file
#    f_good = open(output_prefix + '.gff', 'w')
    f_group = open('branch_tmp.group.txt', 'w')
    #    f_bad = f_good
    gene_index = 1
    already_seen = set()
    for qid, indices in compressed_records_pointer_dict.iteritems():
        combo = tuple(indices)
        if combo in already_seen:
            print "combo seen:", combo
            #raw_input("")
            continue
        already_seen.add(combo)
        #        if gene_index == 7:
        #            pdb.set_trace()
        for isoform_index, i in enumerate(indices):
            bs.cuff_index = gene_index  # all parts of one fusion share the same gene index
            records = merged_exons[i]
            f_group.write("{p}.{i}.{j}\t{ids}\n".format(
                p="PBfusion",
                i=gene_index,
                j=isoform_index,
                ids=",".join(r.qID for r in records)))
#            bs.process_records(records, allow_extra_5_exons, skip_5_exon_alt, \
#                    f_good, f_bad, f_group, tolerate_end=100, \
#                    starting_isoform_index=isoform_index, gene_prefix='PBfusion')
        gene_index += 1


#    f_good.close()
#    f_bad.close()
    f_group.close()

    # step (4). read the tmp file and modify to display per fusion gene
    f_group = open(output_prefix + '.group.txt', 'w')
    group_info = {}  # ex: PBfusion.1 --> [id1, id2, id3...]
    count = 0
    with open('branch_tmp.group.txt') as f:
        while True:
            line = f.readline().strip()
            if len(line) == 0: break
            pbid1, groups1 = line.strip().split('\t')
            pbid2, groups2 = f.readline().strip().split('\t')
            assert pbid1.split('.')[1] == pbid2.split('.')[1]
            group = set(groups1.split(',')).intersection(groups2.split(','))
            f_group.write("{0}\t{1}\n".format(pbid1[:pbid1.rfind('.')],
                                              ",".join(group)))
            group_info[pbid1[:pbid1.rfind('.')]] = list(group)
            count += 1
    f_group.close()
    #os.remove('branch_tmp.group.txt')

    gff_filename = output_prefix + '.gff'
    group_filename = output_prefix + '.group.txt'
    if is_fq:
        output_filename = output_prefix + '.rep.fq'
    else:
        output_filename = output_prefix + '.rep.fa'
    pick_rep(fa_or_fq_filename,
             sam_filename,
             gff_filename,
             group_filename,
             output_filename,
             is_fq=is_fq,
             pick_least_err_instead=False)

    print >> sys.stderr, "{0} fusion candidates identified.".format(count)
    print >> sys.stderr, "Output written to: {0}.gff, {0}.group.txt, {1}".format(
        output_prefix, output_filename)

    # (optional) step 5. get count information
    if cluster_report_csv is not None:
        get_abundance_post_collapse(output_prefix, cluster_report_csv,
                                    output_prefix)
        print >> sys.stderr, "Count information written to: {0}.abundance.txt".format(
            output_prefix)
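
Unlike Examples #1 and #3, the step-(4) loop in this variant consumes the tmp file strictly two lines at a time and asserts both lines share a gene index, so it assumes every fusion involves exactly two loci; given a three-part fusion, the next pass would pair PBfusion.1.3 with PBfusion.2.1 and fail the assert. The tell()/seek() loop in Examples #1 and #3 removes that assumption.
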
Code Example #7
def main(args):

    ### sanity check that input file and input SAM exists
    if not os.path.exists(args.input):
        print >> sys.stderr, "Input file {0} does not exist. Abort.".format(args.fasta)
        sys.exit(-1)

    if not os.path.exists(args.sam):
        print >> sys.stderr, "SAM file {0} does not exist. Abort.".format(args.sam)
        sys.exit(-1)

    # check for duplicate IDs
    check_ids_unique(args.input, is_fq=args.fq)

    ignored_fout = open(args.prefix + '.ignored_ids.txt', 'w')

    if args.flnc_coverage > 0:
        f_good = open(args.prefix + '.collapsed.good.gff', 'w')
        f_bad = open(args.prefix + '.collapsed.bad.gff', 'w')
        cov_threshold = args.flnc_coverage
    else:
        f_good = open(args.prefix + '.collapsed.gff', 'w')
        f_bad = f_good
        cov_threshold = 1
    f_txt = open(args.prefix + '.collapsed.group.txt', 'w')

    b = branch_simple2.BranchSimple(args.input, cov_threshold=cov_threshold, min_aln_coverage=args.min_aln_coverage, min_aln_identity=args.min_aln_identity, is_fq=args.fq, max_5_diff=args.max_5_diff, max_3_diff=args.max_3_diff)
    iter = b.iter_gmap_sam(args.sam, ignored_fout)
    for recs in iter: # recs is {'+': list of list of records, '-': list of list of records}
        for v in recs.itervalues():
            for v2 in v:
                if len(v2) > 0: b.process_records(v2, args.allow_extra_5exon, False, f_good, f_bad, f_txt)

    ignored_fout.close()
    f_good.close()
    try:
        f_bad.close()
    except:
        pass
    f_txt.close()

    if args.max_fuzzy_junction > 0: # need to further collapse those that have fuzzy junctions!
        collapse_fuzzy_junctions(f_good.name, f_txt.name, args.allow_extra_5exon, internal_fuzzy_max_dist=args.max_fuzzy_junction)
        os.rename(f_good.name, f_good.name+'.unfuzzy')
        os.rename(f_txt.name, f_txt.name+'.unfuzzy')
        os.rename(f_good.name+'.fuzzy', f_good.name)
        os.rename(f_txt.name+'.fuzzy', f_txt.name)

    if args.fq:
        outfile = args.prefix+".collapsed.rep.fq"
    else:
        outfile = args.prefix+".collapsed.rep.fa"
    if args.allow_extra_5exon: # 5merge, pick longest
        pick_rep(args.input, f_good.name, f_txt.name, outfile, is_fq=args.fq, pick_least_err_instead=False, bad_gff_filename=f_bad.name)
    else:
        pick_rep(args.input, f_good.name, f_txt.name, outfile, is_fq=args.fq, pick_least_err_instead=True, bad_gff_filename=f_bad.name)

    print >> sys.stderr, "Ignored IDs written to:", ignored_fout.name
    print >> sys.stderr, "Output written to:"
    print >> sys.stderr, f_good.name
    print >> sys.stderr, f_txt.name
    print >> sys.stderr, outfile
    print >> sys.stderr, args