# Code example #1 (score: 0)
def main():
    """Build a clone-specific karyotype graph from SCNT/ACNT tensors and write it out."""
    parser = argparse.ArgumentParser(prog="RCK-UTILS-KAR-graph")
    logging_parser = get_logging_cli_parser()
    # Adjacency / segment copy-number tensor inputs and their field separators.
    parser.add_argument("--acnt", required=True, type=argparse.FileType("rt"))
    parser.add_argument("--acnt-separator", default="\t")
    parser.add_argument("--acnt-extra-separator", default=";")
    parser.add_argument("--scnt", required=True, type=argparse.FileType("rt"))
    parser.add_argument("--scnt-separator", default="\t")
    parser.add_argument("--scnt-extra-separator", default=";")
    parser.add_argument("--clone")
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    writer_parser = subparsers.add_parser("write", parents=[logging_parser])
    writer_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    writer_parser.add_argument("--style", choices=["edge-list"], default="edge-list")
    writer_parser.add_argument("--separator", default="\t")
    writer_parser.add_argument("--include-absent", action="store_true", dest="include_cn_0")
    args = parser.parse_args()
    # Copy-number data is stripped from the segment/adjacency objects; it lives in the tensors.
    segments, scnt = read_scnt_from_source(source=args.scnt, separator=args.scnt_separator,
                                           extra_separator=args.scnt_extra_separator,
                                           remove_cn_data_from_segs=True)
    adjacencies, acnt = read_acnt_from_source(source=args.acnt, separator=args.acnt_separator,
                                              extra_separator=args.acnt_extra_separator,
                                              remove_cn_data_from_adj=True)
    if args.command != "write":
        return
    hiag = construct_hiag_inflate_from_haploid_data(hapl_segments=segments, hapl_adjacencies=adjacencies)
    if args.clone is None:
        # Default to the smallest clone id present in both tensors.
        common_clones = set(acnt.keys()) & set(scnt.keys())
        if not common_clones:
            raise ValueError("No common clones in Adjacency and Segment Copy Number tensors")
        args.clone = min(common_clones)
    scnp = scnt[args.clone]
    acnp = acnt[args.clone]
    hiag.assign_copy_numbers_from_scn_profile(scn_profile=scnp)
    hiag.assign_copy_numbers_from_acn_profile(acn_profile=acnp)
    if not args.include_cn_0:
        hiag.remove_edges_with_zero_cn()
    write_graph_to_destination(graph=hiag, destination=args.output, style=args.style)
# Code example #2 (score: 0) — file: rck_adg_stats.py, project: raphael-group/RCK
def main():
    """Print a CSV tally of LABELING adjacency-group sizes, bucketing the tails."""
    logging_parser = get_logging_cli_parser()
    parser = argparse.ArgumentParser(prog="RCK-UTILS-ADG-STATS")
    parser.add_argument('--version', action='version', version=rck.version)
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    #######
    size_l_parser = subparsers.add_parser(
        "size-l",
        parents=[logging_parser],
        help="Group size for RCK AdjGROUP in input file")
    size_l_parser.add_argument("rck_adg", type=argparse.FileType("rt"), nargs="?", default=sys.stdin)
    size_l_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    size_l_parser.add_argument("--no-allow-zero-values", action="store_false", dest="allow_zero_values")
    size_l_parser.add_argument("--min", type=int, default=-1)
    size_l_parser.add_argument("--max", type=int, default=-1)
    #######
    args = parser.parse_args()
    if args.command != "size-l":
        return
    groups = read_adjacency_groups_from_source(source=args.rck_adg)
    labeling = [group for group in groups if group.group_type == AdjacencyGroupType.LABELING]
    tally = groups_size_tally(adjacency_groups=labeling)
    # Reporting window defaults to the observed size range; --min/--max (when
    # not -1) override either end. Sizes outside the window are bucketed into
    # "<lower" and ">=upper" summary rows.
    lower, upper = min(tally.keys()), max(tally.keys())
    if args.max != -1:
        upper = args.max
    if args.min != -1:
        lower = args.min
    below = sum(cnt for size, cnt in tally.items() if size < lower)
    print("<{min_key}".format(min_key=lower), below, sep=",", file=args.output)
    for size in range(lower, upper):
        # Skip absent sizes only when zero rows are disallowed.
        if size not in tally and not args.allow_zero_values:
            continue
        print(size, tally.get(size, 0), sep=",", file=args.output)
    above = sum(cnt for size, cnt in tally.items() if size >= upper)
    print(">={max_key}".format(max_key=upper), above, sep=",", file=args.output)
# Code example #3 (score: 0) — file: rck_adj_process.py, project: raphael-group/RCK
def main():
    """Process RCK adjacency files: filter, cat, reciprocal, haploid, or update.

    Reads RCK-formatted adjacencies, applies the selected sub-command, and
    writes the resulting adjacencies to --output (default: stdout).
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-ADJ-process")
    parser.add_argument('--version', action='version', version=rck.version)
    ####
    # Options shared by every sub-command (output destination, sorting, plus
    # whatever get_shared_nas_parser() provides -- e.g. --o-extra-fields and
    # the chromosome include/exclude flags consumed in the filter branch).
    shared_parser = get_shared_nas_parser()
    cli_logging_parser = get_logging_cli_parser()
    shared_parser.add_argument("--output", "-o", dest="rck_adj_file",
                               type=argparse.FileType("wt"), default=sys.stdout)
    shared_parser.add_argument("--no-sort", action="store_false", dest="sort")
    ####
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    ####
    filter_parser = subparsers.add_parser(
        "filter", parents=[shared_parser, cli_logging_parser])
    filter_parser.add_argument("rck_adj", type=argparse.FileType("rt"),
                               nargs="+", default=[sys.stdin])
    filter_parser.add_argument("--keep-extra-field-regex", action="append", default=None)
    filter_parser.add_argument("--keep-extra-field-regex-file",
                               type=argparse.FileType("rt"), default=None)
    filter_parser.add_argument("--keep-extra-field-missing-strategy",
                               choices=[KEEP, REMOVE], default=KEEP)
    filter_parser.add_argument("--keep-annotate", action="store_true",
                               dest="annotate_retained")
    filter_parser.add_argument("--keep-annotate-s-extra-field", default=None,
                               dest="annotate_seg_extra_field")
    # dest fixed from the original misspelling "annotate_shirt_circ" (internal name only).
    filter_parser.add_argument("--keep-annotate-short-circ", action="store_true",
                               dest="annotate_short_circ")
    filter_parser.add_argument("--keep-annotate-extra-prefix",
                               dest="annotate_extra_prefix")
    filter_parser.add_argument("--remove-extra-field-regex", action="append", default=None)
    filter_parser.add_argument("--remove-extra-field-regex-file",
                               type=argparse.FileType("rt"), default=None)
    filter_parser.add_argument("--remove-extra-field-missing-strategy",
                               choices=[KEEP, REMOVE], default=KEEP)
    filter_parser.add_argument("--min-size", type=int, default=0)
    filter_parser.add_argument("--max-size", type=int, default=1000000000)
    filter_parser.add_argument("--no-allow-inter-chr", action="store_false",
                               dest="allow_inter_chr")
    filter_parser.add_argument("--no-allow-intra-chr", action="store_false",
                               dest="allow_intra_chr")
    filter_parser.add_argument("--size-extra-field", default="svlen")
    filter_parser.add_argument("--size-extra-field-no-abs", action="store_false",
                               dest="size_extra_field_abs")
    filter_parser.add_argument("--size-extra-seq-field")
    ####
    cat_parser = subparsers.add_parser(
        "cat",
        parents=[shared_parser, cli_logging_parser],
        help=
        "Concatenate Adjacencies in input files (NOTE: different from \"merge\")"
    )
    cat_parser.add_argument("rck_adj", type=argparse.FileType("rt"),
                            nargs="+", default=[sys.stdin])
    cat_parser.add_argument("--enforce-unique-ids", action="store_true",
                            dest="enforce_unique_ids")
    cat_parser.add_argument("--id-collision-strategy",
                            choices=['skip', 'error'], default='error')
    ####
    reciprocal_parser = subparsers.add_parser(
        "reciprocal",
        parents=[shared_parser, cli_logging_parser],
        help="ensure that reciprocal novel adjacencies are treated as such")
    reciprocal_parser.add_argument("rck_adj", type=argparse.FileType("rt"),
                                   default=sys.stdin)
    reciprocal_parser.add_argument("--max-distance", type=int, default=50)
    ####
    haploid_parser = subparsers.add_parser(
        "haploid",
        parents=[shared_parser, cli_logging_parser],
        help=
        "collapse any info that is allele/haplotype-specific into a haploid mode"
    )
    haploid_parser.add_argument("rck_adj", type=argparse.FileType("rt"),
                                nargs="+", default=[sys.stdin])
    ####
    update_parser = subparsers.add_parser(
        "update",
        parents=[shared_parser, cli_logging_parser],
        help=
        "Updates adjacencies in the 'adj' with the info from --source based on aid matches. Outputs updated --target entries"
    )
    update_parser.add_argument("rck_adj", type=argparse.FileType("rt"))
    update_parser.add_argument("--source", type=argparse.FileType("rt"), required=True)
    update_parser.add_argument("--exclude-extra-fields", default="")
    update_parser.add_argument("--include-extra-fields", default="")
    update_parser.add_argument("--no-include-missing", action="store_false",
                               dest="include_missing")
    update_parser.add_argument("--no-coords-update", action="store_false",
                               dest="coord_update")
    update_parser.add_argument("--no-coord1-update", action="store_false",
                               dest="coord1_update")
    update_parser.add_argument("--no-coord2-update", action="store_false",
                               dest="coord2_update")
    update_parser.add_argument("--no-strands-update", action="store_false",
                               dest="strands_update")
    update_parser.add_argument("--no-strand1-update", action="store_false",
                               dest="strand1_update")
    update_parser.add_argument("--no-strand2-update", action="store_false",
                               dest="strand2_update")
    args = parser.parse_args()
    logger = get_standard_logger_from_args(
        args=args, program_name="RCK-UTILS-ADK-process")
    processed_adjacencies = []
    # Translate --o-extra-fields into the `extra` value understood by
    # write_adjacencies_to_destination: None (no extras), a list of field
    # names, or the literal string "all".
    if args.o_extra_fields is None or len(
            args.o_extra_fields) == 0 or args.o_extra_fields == ",":
        extra = None
    elif args.o_extra_fields != "all":
        extra = args.o_extra_fields.split(",")
    else:
        extra = args.o_extra_fields
    if args.command == "cat":
        adjacencies = itertools.chain(*(stream_adjacencies_from_source(
            source=rck_adj_source) for rck_adj_source in args.rck_adj))
        if args.enforce_unique_ids:
            # BUGFIX: the original rebound `adjacencies` to an empty list right
            # before iterating it, so the de-duplication loop never ran and
            # --enforce-unique-ids output nothing. Accumulate into a separate
            # list instead.
            processed_ids = set()
            unique_adjacencies = []
            for adj in adjacencies:
                aid = adj.extra.get(EXTERNAL_NA_ID, adj.idx)
                if aid in processed_ids:
                    logger.debug(
                        "Adjacency id {aid} has been encountered more than once"
                        .format(aid=aid))
                    if args.id_collision_strategy == "skip":
                        continue
                    elif args.id_collision_strategy == "error":
                        raise ValueError(
                            "More than one adjacency with id {aid}".format(
                                aid=aid))
                unique_adjacencies.append(adj)
                processed_ids.add(aid)
            adjacencies = unique_adjacencies
        write_adjacencies_to_destination(destination=args.rck_adj_file,
                                         adjacencies=adjacencies,
                                         extra=extra,
                                         sort_adjacencies=args.sort)
        exit(0)
    elif args.command == "filter":
        logger.info(
            "Filtering input adjacencies from following sources {sources}".
            format(sources=",".join(map(str, args.rck_adj))))
        adjacencies = itertools.chain(*(stream_adjacencies_from_source(
            source=rck_adj_source) for rck_adj_source in args.rck_adj))
        # Collect chromosome/region include & exclude strings from flags and files.
        include_chrs_regions_strings = []
        exclude_chrs_regions_strings = []
        if args.chrs_include is not None:
            for chrs_lists in args.chrs_include:
                for chrs_list in chrs_lists:
                    for chr_name in chrs_list.split(","):
                        include_chrs_regions_strings.append(chr_name)
        if args.chrs_include_file is not None:
            for chr_name in get_chrs_regions_string_lists_from_source(
                    source=args.chrs_include_file):
                include_chrs_regions_strings.append(chr_name)
        if args.chrs_exclude is not None:
            for chrs_lists in args.chrs_exclude:
                for chrs_list in chrs_lists:
                    for chr_name in chrs_list.split(","):
                        exclude_chrs_regions_strings.append(chr_name)
        if args.chrs_exclude_file is not None:
            # NOTE(review): include uses get_chrs_regions_string_lists_from_source
            # while exclude uses get_chrs_regions_string_list_from_file -- confirm
            # both helpers accept the argparse FileType value they are given.
            for chr_name in get_chrs_regions_string_list_from_file(
                    file_name=args.chrs_exclude_file):
                exclude_chrs_regions_strings.append(chr_name)
        include_regions = [
            parse_segment_chr_region(string)
            for string in include_chrs_regions_strings
        ]
        exclude_regions = [
            parse_segment_chr_region(string)
            for string in exclude_chrs_regions_strings
        ]
        # Filtering pipeline: regions -> extra-field regexes -> size constraints.
        adjacencies = filter_adjacencies_by_chromosomal_regions(
            adjacencies=adjacencies,
            include=include_regions,
            exclude=exclude_regions,
            include_both=args.include_both,
            exclude_both=args.exclude_both,
            include_spanning=args.include_spanning,
            exclude_spanning=args.exclude_spanning,
            annotate_retained=args.annotate_retained,
            annotate_retained_extra_field_prefix=args.annotate_extra_prefix,
            annotated_retained_segments_extra_field=args.annotate_seg_extra_field,
            annotate_short_circ=args.annotate_short_circ)
        keep_extra_field_entries = args.keep_extra_field_regex if args.keep_extra_field_regex is not None else []
        if args.keep_extra_field_regex_file is not None:
            keep_extra_field_entries.extend(
                list(
                    iter_over_string_entries_from_source(
                        source=args.keep_extra_field_regex_file)))
        remove_extra_field_entries = args.remove_extra_field_regex if args.remove_extra_field_regex is not None else []
        if args.remove_extra_field_regex_file is not None:
            remove_extra_field_entries.extend(
                list(
                    iter_over_string_entries_from_source(
                        source=args.remove_extra_field_regex_file)))
        keep_extra_field = get_extra_field_regexes(
            string_entries=keep_extra_field_entries)
        remove_extra_field = get_extra_field_regexes(
            string_entries=remove_extra_field_entries)
        adjacencies = filter_adjacencies_by_extra(
            adjacencies=adjacencies,
            keep_extra_field=keep_extra_field,
            keep_extra_field_missing_strategy=args.keep_extra_field_missing_strategy,
            remove_extra_field=remove_extra_field,
            remove_extra_field_missing_strategy=args.remove_extra_field_missing_strategy)
        adjacencies = filter_adjacencies_by_size(
            adjacencies=adjacencies,
            min_size=args.min_size,
            max_size=args.max_size,
            size_extra_field=args.size_extra_field,
            size_extra_seq_field=args.size_extra_seq_field,
            allow_inter_chr=args.allow_inter_chr,
            size_extra_field_abs=args.size_extra_field_abs,
            allow_intra_chr=args.allow_intra_chr,
        )
        write_adjacencies_to_destination(destination=args.rck_adj_file,
                                         adjacencies=adjacencies,
                                         sort_adjacencies=False,
                                         extra=extra)
        exit(0)
    elif args.command == "reciprocal":
        adjacencies = read_adjacencies_from_source(source=args.rck_adj)
        processed_adjacencies = refined_adjacencies_reciprocal(
            novel_adjacencies=adjacencies,
            max_distance=args.max_distance,
            inplace=True)
    elif args.command == "haploid":
        adjacencies = itertools.chain(*(stream_adjacencies_from_source(
            source=rck_adj_source) for rck_adj_source in args.rck_adj))
        haploid_adjacencies = iter_haploid_adjacencies(adjacencies=adjacencies,
                                                       copy=False)
        write_adjacencies_to_destination(destination=args.rck_adj_file,
                                         adjacencies=haploid_adjacencies,
                                         sort_adjacencies=False,
                                         extra=extra)
        exit(0)
    elif args.command == "update":
        adjacencies = read_adjacencies_from_source(source=args.rck_adj)
        source_adjacencies = read_adjacencies_from_source(source=args.source)
        extra_include = {
            v
            for v in args.include_extra_fields.split(",") if len(v) > 0
        }
        extra_exclude = {
            v
            for v in args.exclude_extra_fields.split(",") if len(v) > 0
        }
        # BUGFIX: the original read args.update_coords / args.update_strand1 /
        # etc., which do not exist -- the argparse dests are coord_update,
        # strand1_update, ... so every `update` invocation raised AttributeError.
        processed_adjacencies = update_adjacencies(
            target_adjacencies=adjacencies,
            source_adjacencies=source_adjacencies,
            update_coords=args.coord_update,
            update_coord1=args.coord1_update,
            update_coord2=args.coord2_update,
            update_strands=args.strands_update,
            update_strand1=args.strand1_update,
            update_strand2=args.strand2_update,
            extra_exclude=extra_exclude,
            extra_include=extra_include,
            include_missing=args.include_missing)
    # reciprocal/update fall through to this shared write step.
    if len(processed_adjacencies) > 0:
        write_adjacencies_to_destination(destination=args.rck_adj_file,
                                         adjacencies=processed_adjacencies,
                                         extra=extra,
                                         sort_adjacencies=args.sort)
# Code example #4 (score: 0) — file: rck_kar_stats.py, project: raphael-group/RCK
def main():
    """Report statistics for an RCK karyotype and validate it.

    Reads segment (SCNT) and adjacency (ACNT) copy number tensors, optional
    copy-number boundaries, adjacency groups, and telomere positions; prints
    segment/adjacency counts, checks adjacency-group compliance, and verifies
    per-clone copy-number balance and inferred telomeres.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-KAR-stats", parents=[get_logging_cli_parser()])
    parser.add_argument("--verbose", choices=[0, 1, 2, 3, 4, 5], type=int, default=5)
    parser.add_argument("--acnt", required=True, type=argparse.FileType("rt"))
    parser.add_argument("--acnt-separator", default="\t")
    parser.add_argument("--acnt-extra-separator", default=";")
    parser.add_argument("--scnt", required=True, type=argparse.FileType("rt"))
    parser.add_argument("--scnt-separator", default="\t")
    parser.add_argument("--scnt-extra-separator", default=";")
    parser.add_argument("--scnb", type=argparse.FileType("rt"))
    parser.add_argument("--scnb-separator", default="\t")
    parser.add_argument("--scnb-extra-separator", default=";")
    parser.add_argument("--nas-fp", type=float, default=-1.0)
    parser.add_argument("--adjacency-groups", type=argparse.FileType("rt"))
    parser.add_argument("--adg-separator", default="\t")
    parser.add_argument("--adg-aids-separator", default=",")
    parser.add_argument("--adg-extra-separator", default=";")
    parser.add_argument("--telomere-positions", type=argparse.FileType("rt"))
    parser.add_argument("--telomere-positions-separator", default="\t")
    parser.add_argument("--telomere-positions-extra-separator", default=";")
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-KAR-stats")
    logger.info("Reading segment copy number tensor from {file}".format(file=args.scnt))
    segments, scnt = read_scnt_from_source(source=args.scnt, separator=args.scnt_separator, extra_separator=args.scnt_extra_separator, remove_cn_data_from_segs=True)
    logger.info("Reading adjacency copy number tensor from {file}".format(file=args.acnt))
    adjacencies, acnt = read_acnt_from_source(source=args.acnt, separator=args.acnt_separator, extra_separator=args.acnt_extra_separator, remove_cn_data_from_adj=True)
    if args.scnb is not None:
        logger.info("Reading segment copy number boundaries tensor from {file}".format(file=args.scnb))
        # NOTE(review): scnb is read but never used later in this function -- confirm intent.
        _, scnb = read_scnb_from_source(source=args.scnb, separator=args.scnb_separator, extra_separator=args.scnb_extra_separator, remove_cnb_data_from_segs=True)
    else:
        logger.info("No segment copy number boundaries tensor is provided via --scnb flag")
        scnb = None
    if args.adjacency_groups is not None:
        logger.info("Reading adjacency groups information from {file}".format(file=args.adjacency_groups))
        groups = read_adjacency_groups_from_source(source=args.adjacency_groups, separator=args.adg_separator,
                                                   extra_separator=args.adg_extra_separator, aids_separator=args.adg_aids_separator)
    else:
        logger.info("No adjacency groups information is provided via --adjacency-groups flag")
        # BUGFIX: was `groups = []`, which made the later `groups is not None`
        # check always true and its "no information" branch unreachable.
        groups = None
    if args.telomere_positions is not None:
        logger.info("Reading telomere positions from {file}".format(file=args.telomere_positions))
        # BUGFIX: was args.telomeres_positions_separator (AttributeError);
        # the argparse dest is telomere_positions_separator.
        telomeres = read_positions_from_source(source=args.telomere_positions, separator=args.telomere_positions_separator,
                                               extra_separator=args.telomere_positions_extra_separator)
    else:
        logger.info("No telomere positions are provided via --telomere-positions flag. Defaulting to reference telomere positions")
        telomeres = get_ref_telomeres_from_segments(segments=segments)
    # Per-chromosome segment tally.
    segments_by_chrs = defaultdict(list)
    for segment in segments:
        segments_by_chrs[segment.chromosome].append(segment)
    print("A total of {cnt} chromosomes are observed".format(cnt=len(segments_by_chrs)))
    total_segments_cnt = 0
    for chr_name, chr_segments in segments_by_chrs.items():
        total_segments_cnt += len(chr_segments)
        if args.verbose >= 3:
            print("Chromosome {chr_name} has {cnt} segments".format(chr_name=chr_name, cnt=len(chr_segments)))
    print("A total of {cnt} segments are observed".format(cnt=total_segments_cnt))
    novel_adjacencies = [adj for adj in adjacencies if adj.adjacency_type == AdjacencyType.NOVEL]
    reference_adjacencies = [adj for adj in adjacencies if adj.adjacency_type == AdjacencyType.REFERENCE]
    print("A total of {cnt} adjacencies ({n_cnt} novel; {r_cnt} reference)".format(cnt=len(novel_adjacencies) + len(reference_adjacencies),
                                                                                   n_cnt=len(novel_adjacencies), r_cnt=len(reference_adjacencies)))

    adjacencies_by_external_ids = {adj.extra.get(EXTERNAL_NA_ID, adj.stable_id_non_phased): adj for adj in adjacencies}
    if groups is not None:
        for ag in groups:
            ag.populate_adjacencies_via_ids(source=adjacencies, source_by_ids=adjacencies_by_external_ids)
        molecule_groups = [ag for ag in groups if ag.group_type == AdjacencyGroupType.MOLECULE]
        labeling_groups = [ag for ag in groups if ag.group_type == AdjacencyGroupType.LABELING]
        general_groups = [ag for ag in groups if ag.group_type == AdjacencyGroupType.GENERAL]
        if len(molecule_groups) > 0:
            logger.info("Checking compliance with {cnt} molecule groups".format(cnt=len(molecule_groups)))
            molecule_groups_violations = adjacency_groups_molecule_violations(groups=molecule_groups, acnt=acnt)
            if len(molecule_groups_violations):
                # BUGFIX: was len(molecule_groups); report the number of
                # violations, matching the labeling/general branches below.
                logger.error("A total of {cnt} molecule groups DO NOT agree with input karyotype. See molecule groups ids below".format(cnt=len(molecule_groups_violations)))
                logger.error(", ".join([ag.gid for ag in molecule_groups_violations]))
            else:
                logger.info("All molecule groups agree with input karyotype")
        else:
            logger.info("No molecule groups were provided. Nothing to check.")
        if len(labeling_groups) > 0:
            logger.info("Checking compliance with {cnt} labeling groups".format(cnt=len(labeling_groups)))
            labeling_groups_violations = adjacency_groups_labeling_violations(groups=labeling_groups, acnt=acnt)
            if len(labeling_groups_violations):
                logger.error("A total of {cnt} labeling groups DO NOT agree with input karyotype. See labeling groups ids below".format(cnt=len(labeling_groups_violations)))
                logger.error(", ".join([ag.gid for ag in labeling_groups_violations]))
            else:
                logger.info("All labeling groups agree with input karyotype")
        else:
            logger.info("No labeling groups were provided. Nothing to check.")
        if len(general_groups) > 0:
            logger.info("Checking compliance with {cnt} general groups".format(cnt=len(general_groups)))
            general_groups_violations = adjacency_groups_general_violations(groups=general_groups, acnt=acnt)
            if len(general_groups_violations):
                logger.error("A total of {cnt} general groups DO NOT agree with input karyotype. See general groups ids below".format(cnt=len(general_groups_violations)))
                logger.error(", ".join([ag.gid for ag in general_groups_violations]))
            else:
                logger.info("All general groups agree with input karyotype")
    else:
        logger.info("No information about adjacency groups were provided. Nothing to check.")

    # Per-clone balance / telomere checks on clones present in both tensors.
    clone_ids = sorted(set(scnt.keys()) & set(acnt.keys()))
    for clone_id in clone_ids:
        logger.info("Checking balancing and telomeres for clone {clone_id}".format(clone_id=clone_id))
        hiag = construct_hiag_inflate_from_haploid_data(hapl_segments=segments, hapl_adjacencies=adjacencies)
        scnp = scnt[clone_id]
        acnp = acnt[clone_id]
        hiag.assign_copy_numbers_from_scn_profile(scn_profile=scnp)
        hiag.assign_copy_numbers_from_acn_profile(acn_profile=acnp)
        hiag.remove_edges_with_zero_cn()
        logger.info("Checking that every vertex has a copy number excess >= 0.")
        for node in hiag.nodes(data=False):
            if hiag.node_imbalance(node=node) < 0:
                logger.warning("Something went WRONG! On segment extremity {node} there is a negative copy number excess...".format(node=str(node)))
        logger.info("Getting inferred telomeres.")
        diploid_telomeres = hiag.get_telomeres()
        inferred_hapl_telomeres_ids = {p.stable_id_non_hap for p in diploid_telomeres}
        input_hapl_telomers_ids = {p.stable_id_non_hap for p in telomeres}
        # BUGFIX: was a strict-superset test (`>`), which missed the case where
        # some inferred telomeres are unexpected while some input telomeres are
        # unused. Flag whenever any inferred telomere is not in the input set.
        if inferred_hapl_telomeres_ids - input_hapl_telomers_ids:
            logger.error("Something went WRONG! Following segments extremities, while not specified as possible telomere sites were inferred as such.")
            logger.error(",".join(map(str, sorted(inferred_hapl_telomeres_ids - input_hapl_telomers_ids))))
        else:
            logger.info("Everything is OK! in clone {clone_id} all extremities have non-negative copy number excess, and inferred telomere sites concur with the input"
                        "".format(clone_id=clone_id))
        # Copy-number-weighted genome length for this clone.
        length = 0
        for u, v, data in hiag.segment_edges():
            s: Segment = data["object"]
            length += s.length * data["copy_number"]
        logger.info(f"Total length for clone {clone_id} = {length}")
        # Each linear chromosome contributes two telomeric extremities, i.e.
        # two units of node imbalance -- hence the division by 2.
        chromosome_cnt = sum(hiag.node_imbalance(node) for node in hiag.nodes(data=False)) / 2
        logger.info(f"Total number of chromosomes in clone {clone_id} = {chromosome_cnt}")
# Code example #5 (score: 0) — file: rck_scnt_stats.py, project: raphael-group/RCK
def main():
    """Compute length-weighted distances between two segment copy-number tensors."""
    parser = argparse.ArgumentParser(prog="RCK-UTILS-SCNT-stats")
    logging_parser = get_logging_cli_parser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    #####
    distance_parser = subparsers.add_parser("distance", parents=[logging_parser])
    # Symmetric options for the two input tensors.
    for idx in ("1", "2"):
        distance_parser.add_argument(f"--scnt{idx}", type=argparse.FileType("rt"), required=True)
        distance_parser.add_argument(f"--scnt{idx}-separator", default="\t")
        distance_parser.add_argument(f"--scnt{idx}-extra-separator", default=";")
        distance_parser.add_argument(f"--scnt{idx}-clone-ids", default=None)
    distance_parser.add_argument("--topn", type=int, default=3)
    distance_parser.add_argument("--verbose", action="store_true", dest="verbose")
    distance_parser.add_argument("--both-haplotype-specific", action="store_true",
                                 dest="both_haplotype_specific")
    distance_parser.add_argument('-o', '--output', type=argparse.FileType("wt"),
                                 default=sys.stdout)
    #####
    args = parser.parse_args()
    if args.command != "distance":
        return
    # Clone-id filters: None means "all clones"; otherwise a comma-separated list.
    clone_ids1 = None if args.scnt1_clone_ids is None else args.scnt1_clone_ids.split(",")
    segments1, scnt1 = read_scnt_from_source(source=args.scnt1,
                                             separator=args.scnt1_separator,
                                             extra_separator=args.scnt1_extra_separator,
                                             clone_ids=clone_ids1)
    clone_ids2 = None if args.scnt2_clone_ids is None else args.scnt2_clone_ids.split(",")
    segments2, scnt2 = read_scnt_from_source(source=args.scnt2,
                                             separator=args.scnt2_separator,
                                             extra_separator=args.scnt2_extra_separator,
                                             clone_ids=clone_ids2)
    result = cn_distance(segments1=segments1, scnt1=scnt1,
                         segments2=segments2, scnt2=scnt2,
                         both_haplotype_specific=args.both_haplotype_specific)
    # Rank cases by total (summed over clones) distance, smallest first.
    ranked = sorted(result.items(), key=lambda entry: sum(entry[1].values()))
    top_cases = ranked[:args.topn]
    if args.verbose:
        print(
            f'Length-weighted segment copy number distance for tensors in {args.scnt1.name} and {args.scnt2.name}',
            file=args.output)
    for cnt, (case, clone_specific_distance) in enumerate(top_cases, start=1):
        print(
            f'{cnt}. Best distance (total) of {sum(clone_specific_distance.values()):,} with clone-specific ones {clone_specific_distance}, for case {case}',
            file=args.output)
# Code example #6 (score: 0)
def main():
    """CLI entry point: export RCK Segment Copy Number Tensor data to external formats.

    Subcommands:
      * ``shatterseek`` -- write a haploid (collapsed) version of the specified
        clone's segment copy numbers in a ShatterSeek-suitable format.
      * ``circa-dens``  -- compute per-window ampl/del CNA fractions and write
        them in a Circa-suitable segments format.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-SCNT-rck2x")
    cli_logging_parser = get_logging_cli_parser()
    chr_strip_parser = get_chromosome_strip_parser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    #### "shatterseek" subcommand arguments
    shatterseek_parser = subparsers.add_parser(
        "shatterseek", parents=[cli_logging_parser, chr_strip_parser])
    shatterseek_parser.add_argument("rck_scnt",
                                    type=argparse.FileType("rt"),
                                    default=sys.stdin)
    shatterseek_parser.add_argument("--clone-id", required=True)
    shatterseek_parser.add_argument("--separator", default="\t")
    shatterseek_parser.add_argument("--extra-separator", default=";")
    shatterseek_parser.add_argument("--default-cn", type=int, default=0)
    shatterseek_parser.add_argument("--output-header",
                                    action="store_true",
                                    dest="output_header")
    shatterseek_parser.add_argument("-o",
                                    "--output",
                                    type=argparse.FileType("wt"),
                                    default=sys.stdout)
    #### "circa-dens" subcommand arguments
    circa_dens_parser = subparsers.add_parser(
        "circa-dens", parents=[cli_logging_parser, chr_strip_parser])
    circa_dens_parser.add_argument("rck_scnt",
                                   type=argparse.FileType("rt"),
                                   default=sys.stdin)
    circa_dens_parser.add_argument("--clone-id", required=True)
    circa_dens_parser.add_argument("--separator", default="\t")
    circa_dens_parser.add_argument("--extra-separator", default=";")
    circa_dens_parser.add_argument("--cna-type",
                                   choices=["ampl", "del"],
                                   default="ampl")
    circa_dens_parser.add_argument("--haploid",
                                   action="store_true",
                                   dest="haploid")
    circa_dens_parser.add_argument("--inverse",
                                   action="store_true",
                                   dest="inverse")
    circa_dens_parser.add_argument("--window-size", type=int, default=10000000)
    circa_dens_parser.add_argument("--chr-sizes", type=argparse.FileType("rt"))
    circa_dens_parser.add_argument("-o",
                                   "--output",
                                   type=argparse.FileType("wt"),
                                   default=sys.stdout)
    ####
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args,
                                           program_name="RCK-UTILS-SCNT")

    if args.command == "shatterseek":
        logger.info(
            "Starting converting RCK Segment Copy Number Tensor data to ShatterSeek"
        )
        logger.debug(
            "Specified clone is {clone_id}".format(clone_id=args.clone_id))
        logger.info("Reading RCK formatted data from {file}".format(
            file=args.rck_scnt))
        segments, scnt = read_scnt_from_source(
            source=args.rck_scnt,
            separator=args.separator,
            extra_separator=args.extra_separator)
        logger.info(
            "Read CN data is translated into a haploid (!!!) version of itself."
        )
        haploid_scnt = get_haploid_scnt(segments=segments, scnt=scnt)
        logger.info(
            "Writing data for clone {clone_id} in a ShatterSeek suitable format to {file}"
            .format(clone_id=args.clone_id, file=args.output))
        write_scnt_to_shatterseek_destination(destination=args.output,
                                              segments=segments,
                                              scnt=haploid_scnt,
                                              clone_id=args.clone_id,
                                              default=args.default_cn,
                                              output_header=args.output_header)
    elif args.command == "circa-dens":
        # FIX: log message said "RKC ... Tensor Format" -- typo for "RCK".
        logger.info(
            "Starting computing ampl/del statistics from RCK Segment Copy Number Tensor Format"
        )
        logger.debug(
            "Specified clone is {clone_id}".format(clone_id=args.clone_id))
        logger.info("Reading RCK formatted data from {file}".format(
            file=args.rck_scnt))
        segments, scnt = read_scnt_from_source(
            source=args.rck_scnt,
            separator=args.separator,
            extra_separator=args.extra_separator)
        chr_sizes = args.chr_sizes
        if args.chr_sizes is not None:
            chr_sizes = read_chr_sizes_from_source(source=args.chr_sizes)
        circa_segments_cna_fractions = get_circa_segments_cna_fractions(
            segments=segments,
            scnt=scnt,
            clone_id=args.clone_id,
            window_size=args.window_size,
            chr_sizes=chr_sizes,
            cna_type=args.cna_type,
            haploid=args.haploid)
        # `segments` is re-purposed here to hold the per-window output records.
        segments = []
        total_average = 0
        total_length = 0
        for segment, cna_fraction in circa_segments_cna_fractions.items():
            # Scale the fraction by the segment's share of a full window.
            value = cna_fraction * segment.length / args.window_size
            if args.inverse:
                value = 1 - value
            segment.extra[args.cna_type + "_fraction"] = value
            total_length += segment.length
            total_average += cna_fraction * segment.length
            segments.append(segment)
        # FIX: guard against ZeroDivisionError when no windows/segments were
        # produced (e.g., an empty input tensor).
        if total_length > 0:
            logger.info("Total average cna fraction is " +
                        str(total_average / total_length))
        else:
            logger.info("Total average cna fraction is undefined (no segments)")
        write_segments_to_circa_destination(
            destination=args.output,
            segments=segments,
            extra=[args.cna_type + "_fraction"])
    logger.info("Success!")
Code example #7
0
File: rck_adj_rck2x.py  Project: raphael-group/RCK
def main():
    """CLI entry point: export RCK Adjacencies (NAS) to external formats.

    Subcommands:
      * ``vcf-sniffles`` -- VCF (Sniffles) format.
      * ``circa``        -- Circa-suitable TSV (an extra field becomes a size column).
      * ``circa-dens``   -- Circa-suitable TSV of per-window adjacency/breakend counts.
      * ``bedpe``        -- BEDPE format.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-NAS-rck2x")
    parser.add_argument('--version', action='version', version=rck.version)
    cli_logging_parser = get_logging_cli_parser()
    ### subcommand scaffolding
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    ### "vcf-sniffles" subcommand
    vcf_parser = subparsers.add_parser(
        "vcf-sniffles",
        parents=[cli_logging_parser],
        help="Convert RCK Adjacencies to the VCF (Sniffles) format")
    vcf_parser.add_argument("rck_adj",
                            type=argparse.FileType("rt"),
                            default=sys.stdin)
    vcf_parser.add_argument("--separator", default="\t")
    vcf_parser.add_argument("--extra-separator", default=";")
    vcf_parser.add_argument("--output",
                            "-o",
                            type=argparse.FileType("wt"),
                            default=sys.stdout)
    vcf_parser.add_argument("--o-extra-fields", default="all")
    # store_false: passing the flag EXCLUDES reference adjacencies.
    vcf_parser.add_argument("--o-no-include-ref",
                            action="store_false",
                            dest="include_ref")
    vcf_parser.add_argument("--clone-suffix", default="")
    vcf_parser.add_argument("--dummy-clone", default="dummy_clone")
    vcf_parser.add_argument("--dummy-clone-gt-extra")
    vcf_parser.add_argument("--dummy-gt", default="./.")
    vcf_parser.add_argument("--alt-extra")
    vcf_parser.add_argument("--ref-extra")
    ### "circa" subcommand
    circa_parser = subparsers.add_parser(
        "circa",
        parents=[cli_logging_parser],
        help="Convert RCK Adjacencies to the TSV format supported by Circa")
    circa_parser.add_argument("rck_adj",
                              type=argparse.FileType("rt"),
                              default=sys.stdin)
    circa_parser.add_argument("--separator", default="\t")
    circa_parser.add_argument("--extra-separator", default=";")
    circa_parser.add_argument("--size-extra-field")
    circa_parser.add_argument("--size-extra-field-no-abs",
                              action="store_false",
                              dest="size_extra_field_abs")
    circa_parser.add_argument("--size-extra-seq-field")
    circa_parser.add_argument("--output",
                              "-o",
                              type=argparse.FileType("wt"),
                              default=sys.stdout)
    ### "circa-dens" subcommand
    circa_density_parser = subparsers.add_parser(
        "circa-dens",
        parents=[cli_logging_parser],
        help=
        "Convert RCK Adjacencies to the TSV format with adjacencies density cnt per window supported by Circa"
    )
    circa_density_parser.add_argument("rck_adj",
                                      type=argparse.FileType("rt"),
                                      default=sys.stdin)
    circa_density_parser.add_argument("--separator", default="\t")
    circa_density_parser.add_argument("--extra-separator", default=";")
    circa_density_parser.add_argument("--window-size",
                                      type=int,
                                      default=10000000)
    circa_density_parser.add_argument("--chr-sizes",
                                      type=argparse.FileType("rt"))
    circa_density_parser.add_argument("--element",
                                      choices=["breakend", "adj"],
                                      default="breakend")
    circa_density_parser.add_argument("--element-adj-cnt-full",
                                      action="store_true",
                                      dest="circa_element_adj_cnt_full")
    circa_density_parser.add_argument("-o",
                                      "--output",
                                      type=argparse.FileType("wt"),
                                      default=sys.stdout)
    ### "bedpe" subcommand
    bedpe_parser = subparsers.add_parser(
        "bedpe",
        parents=[cli_logging_parser],
        help=
        "Convert RCK Adjacencies to the BEDPE format with only intra-chromosomal adjacencies considered"
    )
    bedpe_parser.add_argument("rck_adj",
                              type=argparse.FileType("rt"),
                              default=sys.stdin)
    bedpe_parser.add_argument("--separator", default="\t")
    bedpe_parser.add_argument("--extra-separator", default=";")
    bedpe_parser.add_argument("--name-extra-field", default=None)
    bedpe_parser.add_argument("-o",
                              "--output",
                              type=argparse.FileType("wt"),
                              default=sys.stdout)
    ###
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args,
                                           program_name="RCK-UTILS-NAS-rck2x")
    # Adjacencies are read up front; every subcommand consumes the same input.
    logger.info("Reading adjacencies from {file}".format(file=args.rck_adj))
    adjacencies = read_adjacencies_from_source(
        source=args.rck_adj,
        extra_separator=args.extra_separator,
        separator=args.separator)
    if args.command == "vcf-sniffles":
        # --o-no-include-ref stored False into include_ref: keep NOVEL
        # adjacencies only when reference ones are excluded.
        if not args.include_ref:
            logger.debug(
                "Reference adjacencies were excluded from the output.")
            adjacencies = list(
                filter(lambda a: a.adjacency_type == AdjacencyType.NOVEL,
                       adjacencies))
        # --o-extra-fields: empty string or "," -> no extra fields; the
        # literal "all" is passed through verbatim; anything else is a
        # comma-separated list of field names.
        if args.o_extra_fields is None or len(
                args.o_extra_fields) == 0 or args.o_extra_fields == ",":
            extra = None
        elif args.o_extra_fields != "all":
            extra = args.o_extra_fields.split(",")
        else:
            extra = args.o_extra_fields
        logger.debug("Output extra fields are identified as {o_extra}".format(
            o_extra=",".join(extra) if extra is not None else ""))
        logger.info(
            "Converting RCK formatted adjacencies to the VCF (Sniffles) format"
        )
        logger.info("Writing adjacencies to {file}".format(file=args.output))
        write_adjacencies_to_vcf_sniffles_destination(
            destination=args.output,
            adjacencies=adjacencies,
            extra=extra,
            dummy_clone=args.dummy_clone,
            clone_suffix=args.clone_suffix,
            alt_extra=args.alt_extra,
            ref_extra=args.ref_extra,
            dummy_clone_gt_extra=args.dummy_clone_gt_extra,
            dummy_gt=args.dummy_gt)
    elif args.command == "circa":
        logger.info(
            "Converting input RCK formatted adjacencies into a Circa suitable format (extra column get transformed into a size column)"
        )
        logger.info(
            "Writing adjacencies info suitable for Circa to {file}".format(
                file=args.output))
        write_adjacencies_to_circa_destination(
            destination=args.output,
            adjacencies=adjacencies,
            size_extra_field=args.size_extra_field,
            size_extra_seq_field=args.size_extra_seq_field,
            size_abs=args.size_extra_field_abs)
    elif args.command == "circa-dens":
        logger.info(
            "Computing cnt of input RCK formatted adjacencies per window into a CIRCA suitable format"
        )
        # --chr-sizes is optional: when a file is supplied, read sizes from
        # it; otherwise None is passed through to the counting helper.
        chr_sizes = args.chr_sizes
        if args.chr_sizes is not None:
            chr_sizes = read_chr_sizes_from_source(source=args.chr_sizes)
        circa_adj_cnts = get_circa_adj_cnt(
            adjacencies=adjacencies,
            window_size=args.window_size,
            chr_sizes=chr_sizes,
            element=args.element,
            adj_full_cnt=args.circa_element_adj_cnt_full)
        segments = []
        for segment, cnt in circa_adj_cnts.items():
            # Per-window count scaled by the segment's share of a full window.
            segment.extra[args.element +
                          "_cnt"] = cnt * segment.length / args.window_size
            segments.append(segment)
        write_segments_to_circa_destination(destination=args.output,
                                            segments=segments,
                                            extra=[args.element + "_cnt"])
    elif args.command == "bedpe":
        logger.info(
            f"Converting and writing input RCK formatted adjacencies into BEDPE format to {args.output}"
        )
        # NOTE(review): the subcommand help says only intra-chromosomal
        # adjacencies are considered, yet allow_inter_chr=True permits
        # inter-chromosomal ones -- confirm which is intended.
        adjacencies = filter_adjacencies_by_size(adjacencies=adjacencies,
                                                 allow_inter_chr=True)
        write_adjacencies_to_bedpe_destination(
            destination=args.output,
            adjacencies=adjacencies,
            name_extra_field=args.name_extra_field)
    logger.info("Success")
Code example #8
0
def main():
    """CLI entry point: refine RCK input data (SCNT + adjacencies).

    Reads a segment copy number tensor and novel adjacencies, refines the
    tensor (fragment merging, gap filling, and boundary refinement against
    adjacencies and telomere positions), then writes the refined tensor and
    the pre-refinement fragments to the requested output files.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-input-refine",
                                     parents=[get_logging_cli_parser()])
    parser.add_argument("--version", action="version", version=rck.version)
    parser.add_argument("--scnt", required=True)
    parser.add_argument("--adjacencies", required=True)
    parser.add_argument("--clone-ids", default=None)
    parser.add_argument("--scnt-separator", default="\t")
    parser.add_argument("--adjacencies-separator", default="\t")
    parser.add_argument("--no-merge-fragments",
                        action="store_false",
                        dest="merge_fragments")
    parser.add_argument("--fragments-max-merge-gap",
                        type=int,
                        default=1000000000)
    parser.add_argument("--no-fill-gaps-fragments",
                        action="store_false",
                        dest="fill_gaps_fragments")
    parser.add_argument("--fragments-max-fill-gap",
                        type=int,
                        default=1000000000)
    parser.add_argument("--no-allow-unit-segments",
                        action="store_false",
                        dest="allow_unit_segments")
    parser.add_argument("--telomere-positions", type=argparse.FileType("rt"))
    parser.add_argument("--telomere-positions-separator", default="\t")
    parser.add_argument("--output-scnt", required=True)
    parser.add_argument("--output-fragments", required=True)
    args = parser.parse_args()
    logger = get_standard_logger_from_args(
        args=args, program_name="RCK-UTILS-input-refine")
    # --clone-ids is an optional comma-separated list; None means "all clones".
    clone_ids = args.clone_ids.split(
        ",") if args.clone_ids is not None else None
    # FIX: the flags are --scnt / --adjacencies, so the parsed values live in
    # args.scnt / args.adjacencies (args.scnt_file and args.adj do not exist
    # and raised AttributeError at runtime).
    scnt_file = get_full_path(args.scnt)
    adj_file = get_full_path(args.adjacencies)
    segments, scnt = read_scnt_from_file(file_name=scnt_file,
                                         clone_ids=clone_ids,
                                         separator=args.scnt_separator)
    clone_ids = sorted(set(scnt.keys()))
    segments, scnt, segments_ids_mapping = refined_scnt(
        segments=segments,
        scnt=scnt,
        merge_fragments=args.merge_fragments,
        max_merge_gap=args.fragments_max_merge_gap,
        fill_gaps=args.fill_gaps_fragments,
        max_fill_gap=args.fragments_max_fill_gap)

    adjacencies = read_adjacencies_from_file(
        file_name=adj_file, separator=args.adjacencies_separator)
    if args.telomere_positions is not None:
        telomere_positions = read_positions_from_source(
            source=args.telomere_positions,
            separator=args.telomere_positions_separator)
    else:
        telomere_positions = []
    # Keep a pre-refinement copy of the segments: these are written out as
    # "fragments" alongside the refined tensor.
    fragments = deepcopy(segments)
    segments, scnt = refined_scnt_with_adjacencies_and_telomeres(
        segments=segments,
        scnt=scnt,
        adjacencies=adjacencies,
        telomere_positions=telomere_positions)
    # FIX: the output flag is --output-scnt, so the value is args.output_scnt
    # (args.refined_scnt_file does not exist and raised AttributeError).
    refined_scnt_file = os.path.expanduser(args.output_scnt)
    refined_scnt_file = os.path.abspath(refined_scnt_file)
    fragments_file = get_full_path(path=args.output_fragments)

    write_segments_to_file(file_name=fragments_file, segments=fragments)
    write_scnt_to_file(file_name=refined_scnt_file,
                       scnt=scnt,
                       segments=segments)
Code example #9
0
def main():
    """CLI entry point for long-read (LR) utilities.

    Subcommands:
      * ``extract-lr``       -- print names of reads supporting at least
        ``--min-sv-cnt`` adjacencies.
      * ``filter-alignment`` -- keep only alignment records whose read names
        appear in the supplied reads list.
      * ``label-const-inf``  -- infer labeling constraints from adjacencies
        and an alignment file.
      * ``label-const-com``  -- combine labeling constraints (stub, not
        implemented).
    """
    parser = argparse.ArgumentParser()
    logging_parser = get_logging_cli_parser()
    ######## subcommand scaffolding
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    ######## "extract-lr"
    lr_extraction_parser = subparsers.add_parser("extract-lr",
                                                 parents=[logging_parser])
    lr_extraction_parser.add_argument("rck_nas",
                                      type=argparse.FileType("rt"),
                                      default=sys.stdin)
    lr_extraction_parser.add_argument("-o",
                                      "--output",
                                      type=argparse.FileType("wt"),
                                      default=sys.stdout)
    lr_extraction_parser.add_argument("--min-sv-cnt", type=int, default=2)
    lr_extraction_parser.add_argument("--lr-field",
                                      default="support_read_names")
    ######### "filter-alignment"
    lr_alignment_filter_parser = subparsers.add_parser(
        "filter-alignment", parents=[logging_parser])
    lr_alignment_filter_parser.add_argument("alignment",
                                            nargs="?",
                                            type=str,
                                            default="-")
    lr_alignment_filter_parser.add_argument("--i-alignment-format",
                                            type=str,
                                            choices=["bam", "sam", "cram"],
                                            default="bam")
    lr_alignment_filter_parser.add_argument("-r",
                                            "--reads",
                                            type=argparse.FileType("rt"),
                                            required=True)
    lr_alignment_filter_parser.add_argument("--r-separator", default="\t")
    lr_alignment_filter_parser.add_argument("--s-separator", default="\t")
    lr_alignment_filter_parser.add_argument("-o",
                                            "--output",
                                            type=str,
                                            default="-")
    lr_alignment_filter_parser.add_argument("--o-alignment-format",
                                            type=str,
                                            choices=["bam", "sam", "cram"],
                                            default="bam")
    ######### "label-const-inf"
    labeling_constraint_inference_parser = subparsers.add_parser(
        "label-const-inf", parents=[logging_parser])
    labeling_constraint_inference_parser.add_argument("alignment",
                                                      type=str,
                                                      default="-")
    labeling_constraint_inference_parser.add_argument(
        "--i-alignment-format",
        type=str,
        choices=["bam", "sam", "cram"],
        default="bam")
    labeling_constraint_inference_parser.add_argument(
        "--rck-nas", type=argparse.FileType("rt"), required=True)
    labeling_constraint_inference_parser.add_argument("--min-sv-cnt",
                                                      type=int,
                                                      default=2)
    labeling_constraint_inference_parser.add_argument(
        "--lr-field", default="support_read_names")
    # FIX: the output option was declared with FileType("rt") (read mode)
    # while defaulting to sys.stdout; it must be opened for writing.
    labeling_constraint_inference_parser.add_argument(
        "-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ######### "label-const-com"
    labeling_constraint_combine_parser = subparsers.add_parser(
        "label-const-com", parents=[logging_parser])
    # NOTE(review): this positional name contains a hyphen, so the parsed
    # value is only reachable via getattr(args, "label-constr"); harmless
    # while the subcommand is a stub, but worth renaming when implemented.
    labeling_constraint_combine_parser.add_argument(
        "label-constr", type=argparse.FileType("rt"), nargs="+")
    # FIX: same FileType("rt")-with-stdout-default bug as above.
    labeling_constraint_combine_parser.add_argument(
        "-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    #########
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args,
                                           program_name="RCK-UTILS-LR")
    if args.command == "extract-lr":
        nas = read_adjacencies_from_source(source=args.rck_nas)
        # Invert the mapping: read name -> adjacencies that list it in the
        # (comma-separated) --lr-field extra field.
        reads_to_nas = defaultdict(list)
        for na in nas:
            reads_str = na.extra.get(args.lr_field, "")
            reads = reads_str.split(",")
            for read in reads:
                if len(read) == 0:
                    continue
                reads_to_nas[read].append(na)
        extracted_read_names = {
            read
            for read in reads_to_nas
            if len(reads_to_nas[read]) >= args.min_sv_cnt
        }
        for read_name in extracted_read_names:
            print(read_name, file=args.output)
    elif args.command == "filter-alignment":
        reads = get_reads_set_from_source(source=args.reads)
        imode = get_mode_str(format=args.i_alignment_format, input=True)
        omode = get_mode_str(format=args.o_alignment_format, input=False)
        with pysam.AlignmentFile(args.alignment, imode) as i_stream:
            with pysam.AlignmentFile(args.output, omode,
                                     template=i_stream) as o_stream:
                # Copy through only records whose query name is whitelisted.
                for entry in i_stream:
                    if entry.qname in reads:
                        o_stream.write(entry)
    elif args.command == "label-const-inf":
        constraints = infer_labeling_constraints(
            rck_nas_source=args.rck_nas,
            alignment_file=args.alignment,
            i_alignment_format=args.i_alignment_format,
            lr_field=args.lr_field,
            min_sv_cnt=args.min_sv_cnt,
            logger=logger)
        # TODO(review): `constraints` is computed but never written to
        # args.output -- the serialization step appears to be missing.
    # FIX: this branch compared against "label-constr-com", which never
    # matches the subparser name "label-const-com", making it dead code.
    elif args.command == "label-const-com":
        pass
Code example #10
0
File: rck_adj_x2rck.py  Project: raphael-group/RCK
def main():
    parser = argparse.ArgumentParser(prog="RCK-UTILS-ADJ-x2rck")
    parser.add_argument('--version', action='version', version=rck.version)
    ####
    shared_parser = get_shared_nas_parser()
    shared_parser.add_argument("--output",
                               "-o",
                               dest="rck_adj_file",
                               type=argparse.FileType("wt"),
                               default=sys.stdout)
    cli_logging_parser = get_logging_cli_parser()
    chr_strip_parser = get_chromosome_strip_parser()
    ####
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    ####
    lumpy_parser = subparsers.add_parser(
        "lumpy",
        parents=[shared_parser, cli_logging_parser, chr_strip_parser],
        help="Convert Lumpy VCF SV calls into RCK NAS format")
    lumpy_parser.add_argument("--id-suffix", dest="id_suffix", default="lumpy")
    lumpy_parser.add_argument("lumpy_vcf_file",
                              type=argparse.FileType("rt"),
                              default=sys.stdin)
    ####
    longranger_parser = subparsers.add_parser(
        "longranger",
        parents=[shared_parser, cli_logging_parser, chr_strip_parser],
        help="Convert LongRanger VCF SV calls into RCK NAS format")
    longranger_parser.add_argument("--id-suffix",
                                   dest="id_suffix",
                                   default="longranger")
    longranger_parser.add_argument("longranger_vcf_file",
                                   type=argparse.FileType("rt"),
                                   default=sys.stdin)
    ####
    naibr_parser = subparsers.add_parser(
        "naibr",
        parents=[shared_parser, cli_logging_parser, chr_strip_parser],
        help="Convert NAIBR NAS calls into RCK NAS format")
    naibr_parser.add_argument("--id-suffix", dest="id_suffix", default="naibr")
    naibr_parser.add_argument("naibr_file",
                              type=argparse.FileType("rt"),
                              default=sys.stdin)
    ####
    manta_parser = subparsers.add_parser(
        "manta",
        parents=[shared_parser, cli_logging_parser, chr_strip_parser],
        help="Convert Manta VCF SV calls into RCK NAS format")
    manta_parser.add_argument("--id-suffix", dest="id_suffix", default="manta")
    manta_parser.add_argument("manta_vcf_file",
                              type=argparse.FileType("rt"),
                              default=sys.stdin)
    ####
    sniffles_parser = subparsers.add_parser(
        "sniffles",
        parents=[shared_parser, cli_logging_parser, chr_strip_parser],
        help="Convert Sniffles VCF SV calls into RCK NAS format")
    sniffles_parser.add_argument("--id-suffix",
                                 dest="id_suffix",
                                 default="sniffles")
    sniffles_parser.add_argument("sniffles_vcf_file",
                                 type=argparse.FileType("rt"),
                                 default=sys.stdin)
    ####
    grocsv = subparsers.add_parser(
        "grocsvs",
        parents=[shared_parser, cli_logging_parser, chr_strip_parser],
        help="Convert GROCSVS VCF SV calls into RCK NAS format")
    grocsv.add_argument("--id-suffix", dest="id_suffix", default="grocsv")
    grocsv.add_argument("grocsv_vcf_file",
                        type=argparse.FileType("rt"),
                        default=sys.stdin)
    grocsv.add_argument("--samples")
    grocsv.add_argument("--samples-all-any",
                        choices=["all", "any"],
                        default="any")
    grocsv.add_argument("--samples-only",
                        action="store_true",
                        dest="samples_only")
    ####
    delly = subparsers.add_parser(
        "delly",
        parents=[shared_parser, cli_logging_parser, chr_strip_parser],
        help="Convert Delly VCF SV calls into RCK NAS format")
    delly.add_argument("--id-suffix", dest="id_suffix", default="delly")
    delly.add_argument("delly_vcf_file",
                       type=argparse.FileType("rt"),
                       default=sys.stdin)
    delly.add_argument("--stream",
                       action="store_true",
                       dest="delly_force_stream")
    ####
    pbsv = subparsers.add_parser(
        "pbsv",
        parents=[shared_parser, cli_logging_parser, chr_strip_parser],
        help="Convert PBSV VCF SV calls into RCK NAS format")
    pbsv.add_argument("--id-suffix", dest="id_suffix", default="pbsv")
    pbsv.add_argument("--sample", default=None)
    pbsv.add_argument("pbsv_vcf_file",
                      type=argparse.FileType("rt"),
                      default=sys.stdin)
    ####
    remixt = subparsers.add_parser(
        "remixt",
        parents=[shared_parser, cli_logging_parser, chr_strip_parser],
        help="Convert ReMixT Novel adjacencies calls into RCK format")
    remixt.add_argument("--i-separator", default="\t")
    remixt.add_argument("--id-suffix", dest="id_suffix", default="remixt")
    remixt.add_argument("--clone-ids",
                        choices=["1", "2", "1,2"],
                        default="1,2")
    remixt.add_argument("--skip-absent",
                        action="store_true",
                        dest="skip_absent")
    remixt.add_argument("--no-remixt-na-correction",
                        action="store_false",
                        dest="remixt_correction")
    remixt.add_argument("remixt_file",
                        type=argparse.FileType("rt"),
                        default=sys.stdin)
    ####
    gundem2015_parser = subparsers.add_parser(
        "gundem2015",
        parents=[shared_parser, cli_logging_parser, chr_strip_parser],
        help="Convert SV calls from Gundem et al (2015) (BRASS2???) "
        "into RCK NAS format")
    gundem2015_parser.add_argument("--id-suffix",
                                   dest="id_suffix",
                                   default="gundem2015")
    gundem2015_parser.add_argument("gundem2015_file",
                                   type=argparse.FileType("rt"),
                                   default=sys.stdin)
    gundem2015_parser.add_argument("--i-separator", default="\t")
    gundem2015_parser.add_argument("--samples", nargs="+", required=True)
    gundem2015_parser.add_argument("--min-sample-cnt", type=int, default=1)
    gundem2015_parser.add_argument("--no-flip-second-strand",
                                   action="store_false",
                                   dest="flip_second_strand")
    ####
    survivor_parser = subparsers.add_parser(
        "survivor",
        parents=[shared_parser, cli_logging_parser, chr_strip_parser],
        help="Covert SURVIVOR SV merging results into RCK format")
    survivor_parser.add_argument("--id-suffix",
                                 dest="id_suffix",
                                 default="survivor")
    survivor_parser.add_argument("survivor_vcf_file",
                                 type=argparse.FileType("rt"),
                                 default=sys.stdin)
    survivor_parser.add_argument("--samples")
    survivor_parser.add_argument("--samples-sources")
    survivor_parser.add_argument("--samples-separator", default="\t")
    survivor_parser.add_argument("--samples-extra-separator", default=";")
    survivor_parser.add_argument("--samples-suffix-extra",
                                 action="store_true",
                                 dest="suffix_sample_extra")
    survivor_parser.add_argument("--survivor-prefix", default="")
    ####
    svaba_parser = subparsers.add_parser(
        "svaba",
        parents=[shared_parser, cli_logging_parser, chr_strip_parser],
        help="Convert SvABA SV calls into RCK format")
    svaba_parser.add_argument("--id-suffix", dest="id_suffix", default="svaba")
    svaba_parser.add_argument("svaba_vcf_file",
                              type=argparse.FileType("rt"),
                              default=sys.stdin)
    svaba_parser.add_argument("--i-type",
                              choices=["indel", "sv"],
                              default="sv")
    svaba_parser.add_argument("--samples")
    svaba_parser.add_argument("--samples-all-any",
                              choices=["all", "any"],
                              default="any")
    svaba_parser.add_argument("--samples-only",
                              action="store_true",
                              dest="samples_only")
    ####
    breakdancer_parser = subparsers.add_parser(
        "breakdancer",
        parents=[shared_parser, cli_logging_parser, chr_strip_parser],
        help="Convert Breakdancer(-max) SV calls into RCK format")
    breakdancer_parser.add_argument("--id-suffix",
                                    dest="id_suffix",
                                    default="breakdancer")
    breakdancer_parser.add_argument("breakdancer_file",
                                    type=argparse.FileType("rt"),
                                    default=sys.stdin)
    ####
    args = parser.parse_args()
    setup = build_setup(args=args)
    logger = get_standard_logger_from_args(args=args,
                                           program_name="RCK-UTILS-ADJ-x2rck")
    nas = []
    if args.o_extra_fields is None or len(
            args.o_extra_fields) == 0 or args.o_extra_fields == ",":
        extra = None
    elif args.o_extra_fields != "all":
        extra = args.o_extra_fields.split(",")
    else:
        extra = args.o_extra_fields
    if args.command == "lumpy":
        logger.info(
            "Starting converting adjacencies from the Lumpy VCF format to that of RCK"
        )
        logger.info("Reading Lumpy VCF records from {file}".format(
            file=args.lumpy_vcf_file))
        lumpy_vcf_records = get_vcf_records_from_source(
            source=args.lumpy_vcf_file)
        logger.info("Converting Lumpy VCF records to RCK adjacencies")
        nas = get_nas_from_lumpy_vcf_records(
            lumpy_vcf_records=lumpy_vcf_records, setup=setup)
    elif args.command == "longranger":
        logger.info(
            "Starting converting adjacencies from the LongRanger VCf format to that of RCK"
        )
        logger.info("Reading LongRanger VCF records from {file}".format(
            file=args.longranger_vcf_file))
        longranger_vcf_records = get_vcf_records_from_source(
            source=args.longranger_vcf_file)
        logger.info('Converting LongRanger VCF records to RCK adjacencies')
        nas = get_nas_from_longranger_vcf_records(
            longranger_vcf_records=longranger_vcf_records, setup=setup)
    elif args.command == "naibr":
        logger.info(
            "Starting converting adjacencies from NAIBR records to that of RCK"
        )
        logger.info(
            "Reading and converting NAIBR records from {file} to RCK adjacencies"
            .format(file=args.naibr_file))
        nas = get_nas_from_naibr_source(source=args.naibr_file, setup=setup)
    elif args.command == "manta":
        logger.info(
            "Starting converting adjacencies from Manta records to that of RCK"
        )
        logger.info("Reading Manta VCF records from {file}".format(
            file=args.manta_vcf_file))
        manta_vcf_records = get_vcf_records_from_source(
            source=args.manta_vcf_file)
        logger.info("Converting Manta VCF records to RCK adjacencies")
        nas = get_nas_from_manta_vcf_records(
            manta_vcf_records=manta_vcf_records, setup=setup)
    elif args.command == "sniffles":
        logger.info(
            "Starting converting adjacencies from Sinffles records to that of RCK"
        )
        logger.info("Reading Sniffles VCF records from {file}".format(
            file=args.sniffles_vcf_file))
        sniffles_vcf_records = get_vcf_records_from_source(
            source=args.sniffles_vcf_file)
        logger.info("Converting Sniffles VCF records to RCK adjacencies")
        nas = get_nas_from_sniffles_vcf_records(
            sniffles_vcf_records=sniffles_vcf_records, setup=setup)
    elif args.command == "grocsvs":
        logger.info(
            "Starting converting adjacencies from GROCSVS records to that of RCK"
        )
        logger.info("Reading GROCSVS VCF records from {file}".format(
            file=args.grocsv_vcf_file))
        samples = args.samples.split(
            ",") if args.samples is not None else args.samples
        grocsv_vcf_records = get_vcf_records_from_source(
            source=args.grocsv_vcf_file)
        logger.info("Converting GROCSVS VCF records to RCK adjacencies")
        nas = get_nas_from_grocsv_vcf_records(
            grocsv_vcf_records=grocsv_vcf_records,
            setup=setup,
            samples=samples,
            samples_all_any=args.samples_all_any,
            samples_only=args.samples_only)
    elif args.command == "delly":
        logger.info(
            "Starting converting adjacencies from Delly records to that of RCK"
        )
        if args.delly_force_stream:
            logger.info("Forced stream is enabled")
            logger.info(
                "Streamlining reading and converting Delly VCF from {i_file} to RCK adjacencies into {o_file}"
                "".format(i_file=args.delly_vcf_file,
                          o_file=args.rck_nas_file))
            delly_vcf_to_nas_stream(source=args.delly_vcf_file,
                                    dest=args.rck_nas_file,
                                    setup=setup,
                                    extra=extra)
            sys.exit(0)
        else:
            logger.info("Reading Delly VCF records from {file}".format(
                file=args.delly_vcf_file))
            delly_vcf_records = get_vcf_records_from_source(
                source=args.delly_vcf_file)
            logger.info("Converting Delly VCF records to rCK adjacencies")
            nas = get_nas_from_delly_vcf_records(
                delly_vcf_records=delly_vcf_records, setup=setup)
    elif args.command == "pbsv":
        logger.info(
            "Starting converting adjacencies from PBSV records to that of RCK")
        logger.info("Reading PBSV VCF records from {file}".format(
            file=args.pbsv_vcf_file))
        pbsv_vcf_records = get_vcf_records_from_source(
            source=args.pbsv_vcf_file)
        logger.info("Converting PBSV VCF records to RCK adjacencies")
        nas = get_nas_from_pbsv_vcf_records(pbsv_vcf_records=pbsv_vcf_records,
                                            setup=setup,
                                            sample=args.sample)
    elif args.command == "gundem2015":
        logger.info(
            "Starting converting adjacencies from Gundem et al 2015 (BRASS2???) to that of RCK"
        )
        logger.info(
            "Reading Gundem 2015 et al (BRASS???) records from {file}".format(
                file=args.gundem2015_file))
        nas = get_nas_from_gundem2015_source(
            source=args.gundem2015_file,
            setup=setup,
            separator=args.i_separator,
            flip_second_strand=args.flip_second_strand)
        logger.info(
            "Extracting adjacencies for sample {samples} with a minimum cnt of {min_cnt}"
            .format(samples=",".join(args.samples),
                    min_cnt=args.min_sample_cnt))
        nas = processed_gundem2015_adjacencies(
            adjacencies=nas,
            sample_names=args.samples,
            min_per_sample_cnt=args.min_sample_cnt)
    elif args.command == "remixt":
        logger.info(
            "Starting converting adjacencies and their (haploid) copy numbers from ReMixT to that of RCK"
        )
        logger.info(
            "Reading and converting ReMixT resotds from {file} to RCK adjacencies"
            .format(file=args.remixt_file))
        clone_ids = args.clone_ids.split(",")
        nas = get_nas_from_remixt_source(
            source=args.remixt_file,
            setup=setup,
            separator=args.i_separator,
            clone_ids=clone_ids,
            skip_absent=args.skip_absent,
            remixt_na_correction=args.remixt_correction)
    elif args.command == "survivor":
        sample_names = args.samples.split(
            ",") if args.samples is not None else []
        sample_sources = args.samples_sources.split(
            ",") if args.samples_sources is not None else []
        if len(sample_names) != len(sample_sources):
            logger.warning(
                "Provided samples' length {sample_cnt} ({samples}) does not match that of samples source length {sample_sources_cnt} (sample_sources)"
                "".format(sample_cnt=len(sample_names),
                          samples=",".join(sample_names),
                          sample_sources_cnt=len(sample_sources),
                          sample_sources=",".join(sample_sources)))
        logger.info(
            "Starting converting adjacencies from SURVIVOR to that of RCK")
        logger.info("Reading SURVIVOR records from {file}".format(
            file=args.survivor_vcf_file))
        survivor_vcf_records = get_vcf_records_from_source(
            source=args.survivor_vcf_file)
        logger.debug("Reading source-samples adjacencies (in RCK format)")
        adjacencies_by_ids_by_sample_name = {}
        for sample_name, sample_source in zip(sample_names, sample_sources):
            try:
                file_name = get_full_path(sample_source)
                adjacencies = read_adjacencies_from_file(
                    file_name=file_name,
                    separator=args.samples_separator,
                    extra_separator=args.samples_extra_separator)
                adjacencies_by_ids = {
                    adj.extra.get(EXTERNAL_NA_ID, adj.stable_id_non_phased):
                    adj
                    for adj in adjacencies
                }
                adjacencies_by_ids_by_sample_name[
                    sample_name] = adjacencies_by_ids
            except IOError:
                logger.warning(
                    "Unable to reader source adjacency information from {source}"
                    .format(source=sample_source))
        logger.info("Converting SURVIVOR VCF records from {file}".format(
            file=args.survivor_vcf_file))
        nas = get_nas_from_survivor_vcf_records(
            survivor_vcf_records=survivor_vcf_records,
            setup=setup,
            adjacencies_by_ids_by_sample_name=adjacencies_by_ids_by_sample_name,
            suffix_sample_extra=args.suffix_sample_extra,
            survivor_prefix=args.survivor_prefix)
    elif args.command == "svaba":
        logger.info("Starting converting adjacencies from SvABA to RCK")
        logger.info("Reading SvABA VCF records from {file}".format(
            file=args.svaba_vcf_file))
        svaba_vcf_records = get_vcf_records_from_source(
            source=args.svaba_vcf_file)
        logger.info("Converting SvABA VCF records to RCK adjacencies")
        samples = args.samples.split(
            ",") if args.samples is not None else args.samples
        nas = get_nas_from_svaba_vcf_records(
            svaba_vcf_records=svaba_vcf_records,
            source_type=args.i_type,
            setup=setup,
            samples=samples,
            samples_all_any=args.samples_all_any,
            samples_only=args.samples_only)
    elif args.command == "breakdancer":
        logger.info(
            "Starting converting adjacencies from Breakdancer(-max) to RCK")
        logger.info(
            "Reading and converting Breakdancer(-max) records from {file}".
            format(file=args.breakdancer_file))
        nas = get_nas_from_breakdancer_source(source=args.breakdancer_file,
                                              setup=setup)
    logger.info(
        "A total of {cnt} adjacencies were obtained.".format(cnt=len(nas)))
    logger.debug("Output extra fields were identified as {o_extra}".format(
        o_extra=",".join(extra)))
    include_chrs_regions_strings = []
    exclude_chrs_regions_strings = []
    if args.chrs_include is not None:
        for chrs_lists in args.chrs_include:
            for chrs_list in chrs_lists:
                for chr_name in chrs_list.split(","):
                    include_chrs_regions_strings.append(chr_name)
    if args.chrs_include_file is not None:
        for chr_name in get_chrs_regions_string_lists_from_source(
                source=args.chrs_include_file):
            include_chrs_regions_strings.append(chr_name)
    if args.chrs_exclude is not None:
        for chrs_lists in args.chrs_exclude:
            for chrs_list in chrs_lists:
                for chr_name in chrs_list.split(","):
                    exclude_chrs_regions_strings.append(chr_name)
    if args.chrs_exclude_file is not None:
        for chr_name in get_chrs_regions_string_list_from_file(
                file_name=args.chrs_exclude_file):
            exclude_chrs_regions_strings.append(chr_name)
    include_regions = [
        parse_segment_chr_region(string)
        for string in include_chrs_regions_strings
    ]
    exclude_regions = [
        parse_segment_chr_region(string)
        for string in exclude_chrs_regions_strings
    ]
    logger.debug("Include chromosomes : {include_chromosomes}".format(
        include_chromosomes=",".join(map(str, include_regions))))
    logger.debug("Exclude chromosomes : {exclude_chromosomes}".format(
        exclude_chromosomes=",".join(map(str, exclude_regions))))
    logger.info("Filtering adjacencies based on input/exclude chromosomes")
    nas = filter_adjacencies_by_chromosomal_regions(
        adjacencies=nas,
        include=include_regions,
        exclude=exclude_regions,
        include_both=args.include_both,
        exclude_both=args.exclude_both)
    nas = list(nas)
    logger.info(
        "A total of {cnt} adjacencies were retained after filtering".format(
            cnt=len(nas)))
    logger.info(
        "Writing RCK adjacencies to {file}".format(file=args.rck_adj_file))
    write_adjacencies_to_destination(destination=args.rck_adj_file,
                                     adjacencies=nas,
                                     extra=extra)
    logger.info("Success")
コード例 #11
0
def main():
    """CLI entry point for RCK-UTILS-ADJ-GROUPS-infer.

    Builds an argparse CLI with four subcommands and dispatches on the
    parsed command:

    * ``sniffles-m``       -- infer *molecule* adjacency groups from
      adjacencies carrying Sniffles RNAMES support info in their extra field.
    * ``short-l``          -- infer *labeling* adjacency groups from short
      novel adjacencies, with optional refinement.
    * ``sniffles-l``       -- infer *labeling* adjacency groups from
      adjacencies plus their reads-of-origin alignment file.
    * ``filter-alignment`` -- filter an alignment (sam/bam/cram) down to
      reads mentioned as supporting the input adjacencies.

    Reads RCK-formatted adjacencies, writes RCK-formatted adjacency groups
    (or a filtered alignment file for ``filter-alignment``).
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-ADJ-GROUPS-infer")
    parser.add_argument("--version", action="version", version=rck.version)
    cli_logging_parser = get_logging_cli_parser()

    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    ### sniffles-m: molecule groups from Sniffles RNAMES extra info
    sniffles_molecule_group_parser = subparsers.add_parser(
        "sniffles-m", parents=[cli_logging_parser])
    sniffles_molecule_group_parser.add_argument("rck_adj",
                                                type=argparse.FileType("rt"),
                                                default=sys.stdin)
    sniffles_molecule_group_parser.add_argument("--i-separator", default="\t")
    sniffles_molecule_group_parser.add_argument("--i-extra-separator",
                                                default=";")
    sniffles_molecule_group_parser.add_argument("--extra-rnames-field",
                                                default="rnames")
    # NOTE(review): --fp is parsed but never referenced in this entry point.
    sniffles_molecule_group_parser.add_argument("--fp",
                                                type=float,
                                                default=0.5)
    sniffles_molecule_group_parser.add_argument("--gid-suffix",
                                                dest="gid_suffix",
                                                default="sniffles-M")
    sniffles_molecule_group_parser.add_argument("-o",
                                                "--output",
                                                type=argparse.FileType("wt"),
                                                default=sys.stdout)
    sniffles_molecule_group_parser.add_argument("--o-separator", default="\t")
    sniffles_molecule_group_parser.add_argument("--o-aids-separator",
                                                default=",")
    sniffles_molecule_group_parser.add_argument("--o-extra-separator",
                                                default=";")
    ### short-l: labeling groups from short novel adjacencies
    short_nas_labeling_group_parser = subparsers.add_parser(
        "short-l", parents=[cli_logging_parser])
    short_nas_labeling_group_parser.add_argument("rck_adj",
                                                 type=argparse.FileType("rt"),
                                                 default=sys.stdin)
    short_nas_labeling_group_parser.add_argument("--i-separator", default="\t")
    short_nas_labeling_group_parser.add_argument("--i-extra-separator",
                                                 default=";")
    short_nas_labeling_group_parser.add_argument("--max-size",
                                                 type=int,
                                                 default=50000000)
    short_nas_labeling_group_parser.add_argument(
        "--allow-intermediate-same",
        action="store_true",
        dest="allow_intermediate_same")
    short_nas_labeling_group_parser.add_argument("--allow-intermediate-tra",
                                                 action="store_true",
                                                 dest="allow_intermediate_tra")
    # "--no-*" flags use store_false: passing the flag DISABLES the feature.
    short_nas_labeling_group_parser.add_argument("--no-inv-signatures",
                                                 action="store_false",
                                                 dest="allow_inv_signature")
    short_nas_labeling_group_parser.add_argument("--no-refine",
                                                 action="store_false",
                                                 dest="refine")
    short_nas_labeling_group_parser.add_argument("--fp", type=float, default=1)
    short_nas_labeling_group_parser.add_argument("--gid-suffix",
                                                 dest="gid_suffix",
                                                 default="short-nas-L")
    short_nas_labeling_group_parser.add_argument("-o",
                                                 "--output",
                                                 type=argparse.FileType("wt"),
                                                 default=sys.stdout)
    short_nas_labeling_group_parser.add_argument("--o-separator", default="\t")
    short_nas_labeling_group_parser.add_argument("--o-aids-separator",
                                                 default=",")
    short_nas_labeling_group_parser.add_argument("--o-extra-separator",
                                                 default=";")
    ### sniffles-l: labeling groups from adjacencies + reads-of-origin alignment
    sniffles_labeling_group_parser = subparsers.add_parser(
        "sniffles-l", parents=[cli_logging_parser])
    sniffles_labeling_group_parser.add_argument("--rck-adj",
                                                type=argparse.FileType("rt"),
                                                required=True)
    sniffles_labeling_group_parser.add_argument("--i-separator", default="\t")
    sniffles_labeling_group_parser.add_argument("--i-extra-separator",
                                                default=";")
    sniffles_labeling_group_parser.add_argument("--alignment", required=True)
    sniffles_labeling_group_parser.add_argument("--alignment-format",
                                                choices=["sam", "bam", "cram"],
                                                default="bam")
    sniffles_labeling_group_parser.add_argument("--extra-rnames-field",
                                                default="rnames")
    sniffles_labeling_group_parser.add_argument("--no-refine",
                                                action="store_false",
                                                dest="refine")
    sniffles_labeling_group_parser.add_argument("--fp", type=float, default=1)
    sniffles_labeling_group_parser.add_argument("--gid-suffix",
                                                default="sniffles-L")
    sniffles_labeling_group_parser.add_argument("-o",
                                                "--output",
                                                default=sys.stdout,
                                                type=argparse.FileType("wt"))
    sniffles_labeling_group_parser.add_argument("--o-separator", default="\t")
    sniffles_labeling_group_parser.add_argument("--o-aids-separator",
                                                default=",")
    sniffles_labeling_group_parser.add_argument("--o-extra-separator",
                                                default=";")
    ### filter-alignment: retain only reads supporting the input adjacencies
    filter_alignment_parser = subparsers.add_parser(
        "filter-alignment", parents=[cli_logging_parser])
    filter_alignment_parser.add_argument("--rck-adj",
                                         type=argparse.FileType("rt"),
                                         required=True)
    filter_alignment_parser.add_argument("--i-separator", default="\t")
    filter_alignment_parser.add_argument("--i-extra-separator", default=";")
    filter_alignment_parser.add_argument("--extra-rnames-field",
                                         default="rnames")
    filter_alignment_parser.add_argument("--alignment", required=True)
    filter_alignment_parser.add_argument("--alignment-format",
                                         choices=["sam", "bam", "cram"],
                                         default="bam")
    filter_alignment_parser.add_argument("-o", "--output", required=True)
    filter_alignment_parser.add_argument("--output-format",
                                         choices=["sam", "bam", "cram"],
                                         default="bam")
    ###
    args = parser.parse_args()
    logger = get_standard_logger_from_args(
        args=args, program_name="RCK-UTILS-ADJ-GROUPS-infer")
    if args.command == "sniffles-m":
        logger.info(
            "Inferring molecule adjacency groups from adjacencies with Sniffles RNAMES support extra info."
        )
        logger.info(
            "Reading adjacencies from {file}".format(file=args.rck_adj))
        adjacencies = read_adjacencies_from_source(
            source=args.rck_adj,
            separator=args.i_separator,
            extra_separator=args.i_extra_separator)
        logger.info(
            "Inferring molecule adjacency groups from read adjacencies")
        adj_groups = infer_sniffles_molecule_groups(
            adjacencies=adjacencies,
            extra_rnames_field=args.extra_rnames_field,
            gid_suffix=args.gid_suffix)
        logger.info("Inferred {cnt} molecule adjacency groups".format(
            cnt=len(adj_groups)))
        logger.info(
            "Writing inferred molecule adjacency groups to {file}".format(
                file=args.output))
        write_adjacency_groups_to_destination(
            destination=args.output,
            adjacency_groups=adj_groups,
            separator=args.o_separator,
            extra_separator=args.o_extra_separator,
            aids_separator=args.o_aids_separator,
            extra_fill="")
    elif args.command == "short-l":
        # fix: original log message duplicated "from adjacencies".
        logger.info(
            "Inferring labeling adjacency groups from adjacencies."
        )
        logger.info(
            "Reading adjacencies from {file}".format(file=args.rck_adj))
        adjacencies = read_adjacencies_from_source(
            source=args.rck_adj,
            separator=args.i_separator,
            extra_separator=args.i_extra_separator)
        logger.info(
            "Inferring labeling adjacency groups from read adjacencies")
        adj_groups = infer_short_nas_labeling_groups(
            adjacencies=adjacencies,
            gid_suffix=args.gid_suffix,
            max_size=args.max_size,
            allow_intermediate_same=args.allow_intermediate_same,
            allow_intermediate_tra=args.allow_intermediate_tra,
            allow_inv_signatures=args.allow_inv_signature)
        logger.info("Inferred {cnt} labeling adjacency groups".format(
            cnt=len(adj_groups)))
        if args.refine:
            logger.info("Refining inferred labeling adjacency groups")
            adj_groups = refined_labeling_groups(adj_groups=adj_groups,
                                                 gid_suffix=args.gid_suffix)
        logger.info(
            "A total of {cnt} refined labeling adjacency groups remain".format(
                cnt=len(adj_groups)))
        logger.info(
            # fix: "group s" -> "groups" in original message.
            "Writing inferred labeling adjacency groups to {file}".format(
                file=args.output))
        write_adjacency_groups_to_destination(
            destination=args.output,
            adjacency_groups=adj_groups,
            separator=args.o_separator,
            aids_separator=args.o_aids_separator,
            extra_separator=args.o_extra_separator,
            extra_fill="")
    elif args.command == "sniffles-l":
        logger.info(
            "Inferring labeling adjacency groups from adjacencies, and their reads-of-origin alignments"
        )
        logger.info(
            "Reading adjacencies from {file}".format(file=args.rck_adj))
        adjacencies = read_adjacencies_from_source(
            source=args.rck_adj,
            extra_separator=args.i_extra_separator,
            separator=args.i_separator)
        logger.info(
            "Inferring labeling adjacency groups from read adjacencies and their reads-of-origin alignments"
        )
        adj_groups = infer_alignment_labeling_groups(
            adjacencies=adjacencies,
            alignment_file_name=args.alignment,
            alignment_format=args.alignment_format,
            extra_rnames_field=args.extra_rnames_field,
            gid_suffix=args.gid_suffix)
        logger.info(
            "Inferred {cnt} labeling adjacency groups. There can be many duplicates, refinement shall take care of it."
            .format(cnt=len(adj_groups)))
        if args.refine:
            logger.info("Refining inferred labeling adjacency groups")
            adj_groups = refined_labeling_groups(adj_groups=adj_groups,
                                                 gid_suffix=args.gid_suffix)
        logger.info(
            "A total of {cnt} refined labeling adjacency groups remain".format(
                cnt=len(adj_groups)))
        logger.info(
            # fix: "group s" -> "groups" in original message.
            "Writing inferred labeling adjacency groups to {file}".format(
                file=args.output))
        write_adjacency_groups_to_destination(
            destination=args.output,
            adjacency_groups=adj_groups,
            separator=args.o_separator,
            aids_separator=args.o_aids_separator,
            extra_separator=args.o_extra_separator,
            extra_fill="")
    elif args.command == "filter-alignment":
        logger.info(
            "Filtering input read alignment to retain only reads mentioned as supporting adjacencies from the input"
        )
        logger.info(
            "Reading adjacencies from {file}".format(file=args.rck_adj))
        adjacencies = read_adjacencies_from_source(
            source=args.rck_adj,
            extra_separator=args.i_extra_separator,
            separator=args.i_separator)
        logger.info(
            # fix: "form file" -> "from file" in original message.
            "Filtering input alignment from file {file} and writing result in {o_file}"
            .format(file=args.alignment, o_file=args.output))
        filter_alignment(adjacencies=adjacencies,
                         alignment_file_name=args.alignment,
                         alignment_format=args.alignment_format,
                         extra_rnames_field=args.extra_rnames_field,
                         output_alignment_file_name=args.output,
                         output_alignment_format=args.output_format)
        # sys.exit is the reliable idiom: the bare exit() builtin is a `site`
        # convenience and is not guaranteed to exist in all run modes.
        sys.exit(0)
コード例 #12
0
ファイル: rck_scnt_process.py プロジェクト: raphael-group/RCK
def main():
    """CLI entry point for RCK-UTILS-SCNT-process.

    Dispatches to one of five subcommands operating on Segment Copy Number
    Tensors (SCNTs):
      * refine   -- merge adjacent fragments / fill gaps in a single SCNT
      * align    -- align segment boundaries across multiple input SCNTs
      * distance -- compute a copy-number distance between two SCNTs
      * filter   -- filter segments by chromosome/region, extra-field regexes, and size
      * haploid  -- collapse segments to their haploid representation
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-SCNT-process")
    cli_logging_parser = get_logging_cli_parser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    ### "refine" subcommand arguments
    refine_parser = subparsers.add_parser("refine",
                                          parents=[cli_logging_parser])
    refine_parser.add_argument('scnt',
                               type=argparse.FileType("rt"),
                               default=sys.stdin)
    refine_parser.add_argument("--separator", default="\t")
    refine_parser.add_argument("--no-allow-missing-clones",
                               action="store_false",
                               dest="allow_missing_clones")
    refine_parser.add_argument("--clone-ids", default=None)
    refine_parser.add_argument("--no-merge-fragments",
                               action="store_false",
                               dest="merge_fragments")
    refine_parser.add_argument("--max-merge-gap", type=int, default=1000000)
    refine_parser.add_argument("--no-fill-gaps",
                               action="store_false",
                               dest="fill_gaps")
    refine_parser.add_argument("--max-fill-gap", type=int, default=1000000)
    refine_parser.add_argument('--output',
                               type=argparse.FileType("wt"),
                               default=sys.stdout)
    ### "align" subcommand arguments
    align_parser = subparsers.add_parser("align", parents=[cli_logging_parser])
    align_parser.add_argument("scnt", nargs="+")
    align_parser.add_argument("--separator", default="\t")
    align_parser.add_argument("--output-suffix", default="aligned")
    align_parser.add_argument("--no-allow-unit-segments",
                              action="store_false",
                              dest="allow_unit_segments")
    align_parser.add_argument("--output-dir", default="")
    ### "distance" subcommand arguments
    distance_parser = subparsers.add_parser("distance",
                                            parents=[cli_logging_parser])
    distance_parser.add_argument("--scnt1",
                                 type=argparse.FileType("rt"),
                                 required=True)
    distance_parser.add_argument("--scnt1-separator", default="\t")
    distance_parser.add_argument("--scnt1-extra-separator", default=";")
    distance_parser.add_argument("--scnt2",
                                 type=argparse.FileType("rt"),
                                 required=True)
    distance_parser.add_argument("--scnt2-separator", default="\t")
    distance_parser.add_argument("--scnt2-extra-separator", default=";")
    distance_parser.add_argument("--clone-ids", default=None)
    distance_parser.add_argument("--output",
                                 "-o",
                                 type=argparse.FileType("wt"),
                                 default=sys.stdout)
    ### "filter" subcommand arguments
    filter_parser = subparsers.add_parser("filter",
                                          parents=[cli_logging_parser])
    filter_parser.add_argument("scnt",
                               type=argparse.FileType("rt"),
                               default=sys.stdin)
    filter_parser.add_argument("--separator", default="\t")
    filter_parser.add_argument("--extra-separator", default=";")
    filter_parser.add_argument("--o-extra-fields", default="all")
    filter_parser.add_argument("--chrs-include", action="append", nargs=1)
    filter_parser.add_argument("--chrs-include-file",
                               type=argparse.FileType("rt"))
    filter_parser.add_argument("--chrs-include-no-full",
                               action="store_false",
                               dest="include_full")
    filter_parser.add_argument("--chrs-exclude", action="append", nargs=1)
    filter_parser.add_argument("--chrs-exclude-file",
                               type=argparse.FileType("rt"))
    filter_parser.add_argument("--chrs-exclude-full",
                               action="store_true",
                               dest="exclude_full")
    filter_parser.add_argument("--keep-extra-field-regex",
                               nargs="+",
                               default=None)
    filter_parser.add_argument("--keep-extra-field-regex-file",
                               type=argparse.FileType("rt"),
                               default=None)
    filter_parser.add_argument("--keep-extra-field-missing-strategy",
                               choices=[KEEP, REMOVE],
                               default=KEEP)
    filter_parser.add_argument("--remove-extra-field-regex",
                               nargs="+",
                               default=None)
    filter_parser.add_argument("--remove-extra-field-regex-file",
                               type=argparse.FileType("rt"),
                               default=None)
    filter_parser.add_argument("--remove-extra-field-missing-strategy",
                               choices=[KEEP, REMOVE],
                               default=KEEP)
    filter_parser.add_argument("--min-size", type=int, default=0)
    filter_parser.add_argument("--max-size", type=int, default=1000000000)
    filter_parser.add_argument("-o",
                               "--output",
                               type=argparse.FileType("wt"),
                               default=sys.stdout)
    ### "haploid" subcommand arguments
    haploid_parser = subparsers.add_parser("haploid",
                                           parents=[cli_logging_parser])
    haploid_parser.add_argument("scnt",
                                type=argparse.FileType("rt"),
                                default=sys.stdin)
    haploid_parser.add_argument("--separator", default="\t")
    haploid_parser.add_argument("--extra-separator", default=";")
    haploid_parser.add_argument("--output",
                                "-o",
                                type=argparse.FileType("wt"),
                                default=sys.stdout)
    ###
    args = parser.parse_args()
    logger = get_standard_logger_from_args(
        args=args, program_name="RCK-UTILS-SCNT-process")

    if args.command == "refine":
        clone_ids = args.clone_ids.split(
            ",") if args.clone_ids is not None else None
        # BUGFIX: the original unconditionally did ",".join(clone_ids), which
        # raised TypeError whenever --clone-ids was omitted (clone_ids is None).
        logger.debug(
            "Clone ids identified as {clone_ids}. If None -- all clone ids will be processed."
            .format(clone_ids=",".join(clone_ids) if clone_ids is not None else "None"))
        logger.info("Reading Segment Copy Number Tensor from {file}".format(
            file=args.scnt))
        # NOTE(review): args.allow_missing_clones is parsed but not forwarded
        # to read_scnt_from_source -- confirm whether that is intentional.
        segments, scnt = read_scnt_from_source(source=args.scnt,
                                               clone_ids=clone_ids,
                                               separator=args.separator)
        logger.info("Refining Segment Copy Number Tensor from {file}".format(
            file=args.scnt))
        segments, scnt, _ = refined_scnt(segments=segments,
                                         scnt=scnt,
                                         merge_fragments=args.merge_fragments,
                                         max_merge_gap=args.max_merge_gap,
                                         fill_gaps=args.fill_gaps,
                                         max_fill_gap=args.max_fill_gap)
        logger.info(
            "Writing refined Segment Copy Number Tensor to {file}".format(
                file=args.output))
        write_scnt_to_destination(destination=args.output,
                                  scnt=scnt,
                                  segments=segments,
                                  clone_ids=clone_ids,
                                  separator=args.separator)
    elif args.command == "align":
        # Map a short sample name (basename without trailing ".scnt"
        # extension) to the full path of each input tensor file.
        scnt_files = {}
        for path in args.scnt:
            full_path = get_full_path(path=path)
            name = os.path.splitext(os.path.basename(full_path))[0]
            if name.endswith(".scnt"):
                name = name[:-5]
            if name.endswith("."):
                name = name[:-1]
            scnt_files[name] = full_path
        logger.debug(
            "Input Segment Copy Number Tensors (SCNT) identified as {input_scnts}"
            .format(input_scnts=" , ".join(scnt_files.values())))
        scnts_by_name = {}
        segments_by_name = {}
        clone_ids_by_scnt = {}
        logger.info("Reading input SCNTs")
        for name, path in scnt_files.items():
            logger.debug(
                "Reading SCNT from {file}".format(file=scnt_files[name]))
            segments, scnt = read_scnt_from_file(file_name=scnt_files[name],
                                                 separator=args.separator)
            clone_ids_by_scnt[name] = sorted(scnt.keys())
            scnts_by_name[name] = scnt
            segments_by_name[name] = segments
        if len(scnts_by_name.values()) == 1:
            # A single tensor has nothing to align against; pass it through.
            logger.warning(
                "Only one input SCNT identified. Doing nothing with it, outputting as is."
            )
            aligned_segments_by_name, aligned_scnts_by_name = segments_by_name, scnts_by_name
        else:
            logger.info("Aligning input SCNTs.")
            aligned_segments_by_name, aligned_scnts_by_name = aligned_scnts(
                segments_by_sample_names=segments_by_name,
                scnts_by_sample_names=scnts_by_name)
        # Disambiguate duplicate base names with a numeric counter and append
        # the output suffix (e.g. "sample" -> "sample.aligned").
        result_base_names = {}
        cnt = 0
        for name in sorted(scnt_files.keys()):
            new_name = name
            if name in result_base_names:
                new_name = name + str(cnt)
                cnt += 1
            new_name = new_name + "." + args.output_suffix
            result_base_names[name] = new_name
        output_dir = args.output_dir if args.output_dir != "" else os.getcwd()
        output_dir = get_full_path(path=output_dir)
        logger.info("Writing aligned SCNTs")
        for name, new_name in result_base_names.items():
            scnt = aligned_scnts_by_name[name]
            segments = aligned_segments_by_name[name]
            # BUGFIX: the "." separator was missing, producing file names like
            # "sample.alignedrck.scnt.tsv" instead of "sample.aligned.rck.scnt.tsv".
            scnt_path = os.path.join(output_dir, new_name + ".rck.scnt.tsv")
            logger.debug("Writing aligned SCNT {scnt_name} to {file}".format(
                scnt_name=name, file=scnt_path))
            write_scnt_to_file(file_name=scnt_path,
                               segments=segments,
                               scnt=scnt,
                               separator=args.separator)
    elif args.command == "filter":
        logger.info(
            "Filtering input segments from following sources {sources}".format(
                sources=args.scnt))
        segments = stream_segments_from_source(
            source=args.scnt,
            separator=args.separator,
            extra_separator=args.extra_separator)
        # Collect chromosome/region strings from both CLI flags (which may be
        # comma-separated lists and may repeat) and optional files.
        include_chrs_regions_strings = []
        exclude_chrs_regions_strings = []
        if args.chrs_include is not None:
            for chrs_lists in args.chrs_include:
                for chrs_list in chrs_lists:
                    for chr_name in chrs_list.split(","):
                        include_chrs_regions_strings.append(chr_name)
        if args.chrs_include_file is not None:
            for chr_name in get_chrs_regions_string_lists_from_source(
                    source=args.chrs_include_file):
                include_chrs_regions_strings.append(chr_name)
        if args.chrs_exclude is not None:
            for chrs_lists in args.chrs_exclude:
                for chrs_list in chrs_lists:
                    for chr_name in chrs_list.split(","):
                        exclude_chrs_regions_strings.append(chr_name)
        if args.chrs_exclude_file is not None:
            for chr_name in get_chrs_regions_string_list_from_file(
                    file_name=args.chrs_exclude_file):
                exclude_chrs_regions_strings.append(chr_name)
        include_regions = [
            parse_segment_chr_region(string)
            for string in include_chrs_regions_strings
        ]
        exclude_regions = [
            parse_segment_chr_region(string)
            for string in exclude_chrs_regions_strings
        ]
        segments = filter_segments_by_chromosomal_regions(
            segments=segments,
            include=include_regions,
            exclude=exclude_regions,
            include_full=args.include_full,
            exclude_full=args.exclude_full)
        # Merge regex filters supplied on the command line and via files.
        keep_extra_field_entries = args.keep_extra_field_regex if args.keep_extra_field_regex is not None else []
        if args.keep_extra_field_regex_file is not None:
            keep_extra_field_entries.extend(
                list(
                    iter_over_string_entries_from_source(
                        source=args.keep_extra_field_regex_file)))
        remove_extra_field_entries = args.remove_extra_field_regex if args.remove_extra_field_regex is not None else []
        if args.remove_extra_field_regex_file is not None:
            remove_extra_field_entries.extend(
                list(
                    iter_over_string_entries_from_source(
                        source=args.remove_extra_field_regex_file)))
        keep_extra_field = get_extra_field_regexes(
            string_entries=keep_extra_field_entries)
        remove_extra_field = get_extra_field_regexes(
            string_entries=remove_extra_field_entries)
        segments = filter_segments_by_extra(
            segments=segments,
            keep_extra_field=keep_extra_field,
            keep_extra_field_missing_strategy=args.
            keep_extra_field_missing_strategy,
            remove_extra_field=remove_extra_field,
            remove_extra_field_missing_strategy=args.
            remove_extra_field_missing_strategy)
        segments = filter_segments_by_size(segments=segments,
                                           min_size=args.min_size,
                                           max_size=args.max_size)
        write_segments_to_destination(destination=args.output,
                                      segments=segments)

    elif args.command == "haploid":
        segments = stream_segments_from_source(
            source=args.scnt,
            separator=args.separator,
            extra_separator=args.extra_separator)
        haploid_segments = iter_haploid_segments(segments=segments, copy=False)
        write_segments_to_destination(destination=args.output,
                                      segments=haploid_segments)
    elif args.command == "distance":
        clone_ids = args.clone_ids
        if args.clone_ids is not None:
            clone_ids = args.clone_ids.split(",")
        segments1, scnt1 = read_scnt_from_source(
            source=args.scnt1,
            clone_ids=clone_ids,
            separator=args.scnt1_separator,
            extra_separator=args.scnt1_extra_separator,
            remove_cn_data_from_segs=True)
        segments2, scnt2 = read_scnt_from_source(
            source=args.scnt2,
            clone_ids=clone_ids,
            separator=args.scnt2_separator,
            extra_separator=args.scnt2_extra_separator,
            remove_cn_data_from_segs=True)
        # Align the two tensors on a common segmentation before comparing.
        segments_by_sample_names = {"1": segments1, "2": segments2}
        scnts_by_sample_names = {"1": scnt1, "2": scnt2}
        segments_by_sample_names, scnts_by_sample_names = aligned_scnts(
            segments_by_sample_names=segments_by_sample_names,
            scnts_by_sample_names=scnts_by_sample_names)
        segments = segments_by_sample_names["1"]
        scnt1, scnt2 = scnts_by_sample_names["1"], scnts_by_sample_names["2"]
        distance = cn_distance_inter_scnt(tensor1=scnt1,
                                          tensor2=scnt2,
                                          segments=segments,
                                          check_clone_ids_match=True)
        # NOTE(review): result goes to stdout via print; args.output is parsed
        # for this subcommand but unused -- confirm intended destination.
        print("distance = ", distance)

    logger.info("Success!")
Code example #13
0
File: rck_adj_stats.py  Project: raphael-group/RCK
def main():
    cli_logging_parser = get_logging_cli_parser()
    parser = argparse.ArgumentParser(prog="RCK-UTILS-ADJ-STATS")
    parser.add_argument('--version', action='version', version=rck.version)
    ######
    shared_parser = get_shared_nas_parser()
    shared_parser.add_argument("--no-vis", action="store_false", dest="vis")
    shared_parser.add_argument("--vis-interactive", action="store_true")
    shared_parser.add_argument("--output-dir",
                               "-o",
                               dest="output_dir",
                               default=os.getcwd())
    ######
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    ######
    cnt_parser = subparsers.add_parser(
        "cnt",
        parents=[shared_parser],
        help="Counting stats for RCK NAS in input file")
    cnt_parser.add_argument("rck_nas",
                            type=argparse.FileType("rt"),
                            default=sys.stdin)
    cnt_parser.add_argument("--ann", action="store_true")
    cnt_parser.add_argument("--ann-list", nargs=1)
    cnt_parser.add_argument("--ann-field", default="svtype", type=str)
    cnt_parser.add_argument("--ann-missing", default="unknown")
    cnt_parser.add_argument(
        "--bins",
        nargs=1,
        default=",".join([
            "-1", "0", "50", "100", "500", "1000", "5000", "10000", "50000",
            "100000", "500000", "1000000", "5000000", "10000000"
        ]),
        # default=",".join(["-1", "0", "50", "100", "200", "300", "400", "500"])
    )
    cnt_parser.add_argument("--cnt-output-subdir", default="cnt")
    cnt_parser.add_argument("--title", default="")
    cnt_parser.add_argument("--no-bar-values",
                            action="store_false",
                            dest="bar_values")
    # cnt_parser.add_argument("--per-chr", action="store_true")
    ######
    lr_parser = subparsers.add_parser(
        "lr",
        parents=[shared_parser],
        help=
        "Counting stats w.r.t. long read information in the input RCK NAS file"
    )
    lr_parser.add_argument("rck_nas",
                           type=argparse.FileType("rt"),
                           default=sys.stdin)
    lr_parser.add_argument("--lr-field", default="support_read_names")
    lr_parser.add_argument("--title", default="")
    lr_parser.add_argument("--no-bar-values",
                           action="store_false",
                           dest="bar_values")
    lr_parser.add_argument("--lr-output-subdir", default="lr")
    ######
    merged_parser = subparsers.add_parser(
        "merged",
        parents=[shared_parser],
        help="Counting statistics over merged RCK NAS")
    merged_parser.add_argument("rck_nas",
                               type=argparse.FileType("rt"),
                               default=sys.stdin)
    merged_parser.add_argument("--origin-field", default=ORIGIN_IDS)
    merged_parser.add_argument("--origin-sep", default=",")
    merged_parser.add_argument("--origin-regex", default=".*_(?P<source>.*)")
    merged_parser.add_argument("--no-origin-field", choices=["skip", "self"])
    merged_parser.add_argument("---merged-output-subdir", default="merged")
    merged_parser.add_argument("--title", default="")
    #######
    support_parser = subparsers.add_parser(
        "support",
        parents=[shared_parser],
        help="Counting support statistics over merged RCK NAS")
    support_parser.add_argument("rck_nas",
                                type=argparse.FileType("rt"),
                                default=sys.stdin)
    support_parser.add_argument("--sources",
                                type=argparse.FileType("rt"),
                                nargs="+")
    support_parser.add_argument("--title", default="")
    support_parser.add_argument("--no-bar-values",
                                action="store_false",
                                dest="bar_values")
    support_parser.add_argument("--support-output-subdir", default="support")
    #######
    survivor_stat_parser = subparsers.add_parser("survivor-stat",
                                                 parents=[cli_logging_parser])
    survivor_stat_parser.add_argument("rck_adj",
                                      type=argparse.FileType("rt"),
                                      default=sys.stdin)
    survivor_stat_parser.add_argument("--separator", default="\t")
    survivor_stat_parser.add_argument("--extra-separator", default=";")
    survivor_stat_parser.add_argument("--sources-field",
                                      default="supporting_sources")
    survivor_stat_parser.add_argument(
        "--size-bins",
        type=str,
        default=
        "1,100,200,300,400,500,750,1000,2000,5000,10000,50000,100000,500000")
    survivor_stat_parser.add_argument("--size-extra-field", default="svlen")
    survivor_stat_parser.add_argument("--size-extra-field-no-abs",
                                      action="store_false",
                                      dest="size_extra_field_abs")
    survivor_stat_parser.add_argument("--size-extra-seq-field")
    survivor_stat_parser.add_argument("-o",
                                      "--output",
                                      type=argparse.FileType("wt"),
                                      default=sys.stdout)
    #######
    complex_parser = subparsers.add_parser("complex-signatures",
                                           parents=[cli_logging_parser])
    complex_parser.add_argument("rck_adj",
                                nargs="?",
                                type=argparse.FileType("rt"),
                                default=sys.stdin)
    complex_parser.add_argument("--separator", default="\t")
    complex_parser.add_argument("--extra-separator", default=";")
    complex_parser.add_argument("--pre-reciprocal",
                                action="store_true",
                                dest="pre_reciprocal")
    complex_parser.add_argument("--pre-reciprocal-max-dist",
                                type=int,
                                default=50)
    complex_parser.add_argument("--min-k", type=int, default=3)
    complex_parser.add_argument("--output",
                                "-o",
                                type=argparse.FileType("wt"),
                                default=sys.stdout)
    complex_parser.add_argument("--output-separator", default="\t")
    complex_parser.add_argument("--output-internal-separator", default=",")
    #######
    args = parser.parse_args()
    # if args.vis
    if args.command == "survivor-stat":
        adjacencies = read_adjacencies_from_source(
            source=args.rck_adj,
            separator=args.separator,
            extra_separator=args.extra_separator)
        bins = get_size_bins(bins_strs=args.size_bins.split(","))
        tally = merged_source_tally(
            adjacencies=adjacencies,
            bins=bins,
            extra_sources_field=args.sources_field,
            size_extra_field=args.size_extra_field,
            size_extra_field_abs=args.size_extra_field_abs,
            size_extra_seq_field=args.size_extra_seq_field)
        header_entries = ["bin"] + [
            ";".join(entry) for entry in sorted(tally.keys())
        ]
        writer = csv.DictWriter(args.output,
                                fieldnames=header_entries,
                                delimiter=",")
        writer.writeheader()
        for lb, rb in zip([None] + bins, bins):
            data = {"bin": "[{lb}-{rb})".format(lb=lb, rb=rb)}
            for source in sorted(tally.keys()):
                str_source = ";".join(source)
                data[str_source] = tally[source].get(rb, 0)
            writer.writerow(data)
    elif args.command == "complex-signatures":
        adjacencies = read_adjacencies_from_source(
            source=args.rck_adj,
            separator=args.separator,
            extra_separator=args.extra_separator)
        if args.pre_reciprocal:
            adjacencies = refined_adjacencies_reciprocal(
                novel_adjacencies=adjacencies,
                max_distance=args.pre_reciprocal_max_dist)
        complex_rearrangements_signatures = get_complex_rearrangements_signatures(
            adjacencies=adjacencies)
        complex_rearrangements_signatures = [
            crs for crs in complex_rearrangements_signatures
            if crs.k >= args.min_k
        ]
        write_complex_rearr_signature_groups_to_destination(
            destination=args.output,
            signatures=complex_rearrangements_signatures,
            separator=args.output_separator,
            internal_separator=args.output_internal_separator)
    elif args.command == "cnt":
        import seaborn as sns
        import matplotlib.pyplot as plt
        sns.set(color_codes=True)
        nas = read_adjacencies_from_source(source=args.rck_nas)
        use_annotations = args.ann
        bins = set()
        for bin_str in args.bins.split(","):
            bin_value = int(bin_str)
            bins.add(bin_value)
        bins = sorted(bins)
        if bins[-1] < 500000000:
            bins.append(500000000)
        if use_annotations:
            annotations = set()
            if len(args.ann_list) == 1 and args.ann_list[0] == "all":
                allow_all_annotations = True
            else:
                allow_all_annotations = False
            for anns_str in args.ann_list:
                anns = [ann.lower() for ann in anns_str.split(",")]
                for ann in anns:
                    annotations.add(ann)
            annotations = sorted(annotations)
            nas_by_anns = defaultdict(list)
            for na in nas:
                na_ann = na.extra.get(args.ann_field, args.ann_missing).lower()
                if allow_all_annotations or na_ann in annotations:
                    nas_by_anns[na_ann].append(na)
        else:
            nas_by_anns = defaultdict(list)
            for na in nas:
                na_ann = str(na.position1.strand) + str(na.position2.strand)
                nas_by_anns[na_ann].append(na)
            # separate figures
        for na_ann, nas in nas_by_anns.items():
            lengths = [get_length(na=na) for na in nas]
            bin_cnts = defaultdict(int)
            for length in lengths:
                for l, r in zip(bins[:-1], bins[1:]):
                    if length < r:
                        bin_cnts[l] += 1
                        break

            if args.vis:
                values = [bin_cnts[i] for i in bins]
                x_axis_values = [i for i in range(len(bins))]
                plt.figure(figsize=(20, 10))
                bars = plt.bar(x_axis_values, values, label=na_ann, color="g")
                if args.bar_values:
                    add_bar_values(ax=plt.gca(), bars=bars)
                x_axis_values = [i for i in range(len(bins))]
                plt.xticks(x_axis_values, [
                    "[{l}-\n{r})".format(l=get_str_size_label(int_size=l),
                                         r=get_str_size_label(int_size=r))
                    for l, r in zip(bins[:-1], bins[1:])
                ])
                plt.tick_params(labelsize=20)
                plt.legend(prop={'size': 20})
                plt.xlabel("{ann} lengths".format(ann=na_ann), fontsize=20)
                plt.ylabel("# of {ann}".format(ann=na_ann), fontsize=20)
                plt.title(args.title, fontsize=20)
                # plt.xticks(bins, [str(b) for b in bins])
                plot_path = os.path.abspath(
                    os.path.join(args.output_dir, args.cnt_output_subdir))
                if not os.path.exists(plot_path):
                    os.makedirs(plot_path)
                plot_file_name = os.path.join(
                    plot_path, "{prefix}_cnt.png".format(
                        prefix=na_ann.replace(os.path.sep, "_")))
                plt.savefig(plot_file_name)
                if args.vis_interactive:
                    plt.show()
                plt.clf()
        # one figure
        fig, ax = plt.subplots()
        fig.set_figheight(20)
        fig.set_figwidth(20)
        diff = .9 / len(list(nas_by_anns.keys()))
        if len(list(nas_by_anns.keys())) % 2 == 0:
            extra_diff = diff / 2
        else:
            extra_diff = 0
        x_axis_values = [i for i in range(len(bins))]
        colors = ["b", "g", "r", "c", "m", "y", "b", "w"]
        for cnt, (na_ann, nas) in enumerate(nas_by_anns.items()):
            lengths = [get_length(na=na) for na in nas]
            bin_cnts = defaultdict(int)
            for length in lengths:
                for l, r in zip(bins[:-1], bins[1:]):
                    if length < r:
                        bin_cnts[l] += 1
                        break
            if args.vis:
                values = [bin_cnts[i] for i in bins]
                x_axis_values_tmp = [x + diff * cnt for x in x_axis_values]
                color = colors[cnt]
                ax.bar(x_axis_values_tmp,
                       values,
                       width=diff,
                       color=color,
                       label="{ann}".format(ann=na_ann))
        plt.xticks(x_axis_values, [
            "[{l}-\n{r})".format(l=get_str_size_label(int_size=l),
                                 r=get_str_size_label(int_size=r))
            for l, r in zip(bins[:-1], bins[1:])
        ])
        plt.tick_params(labelsize=20)
        plt.legend(prop={'size': 20})
        plt.xlabel("lenghts", fontsize=20)
        plt.ylabel("# of SVs", fontsize=20)
        plt.title(args.title, fontsize=20)
        plot_path = os.path.abspath(
            os.path.join(args.output_dir, args.cnt_output_subdir))
        if not os.path.exists(plot_path):
            os.makedirs(plot_path)
        plot_file_name = os.path.join(plot_path, "bins_cnt.png")
        plt.savefig(plot_file_name)
        if args.vis_interactive:
            plt.show()
        plt.gcf()

        fig, ax = plt.subplots()
        fig.set_figheight(20)
        fig.set_figwidth(20)
        x_axis_values = {
            ann: cnt
            for cnt, ann in enumerate(nas_by_anns.keys())
        }
        values = {ann: len(nas_by_anns[ann]) for ann in nas_by_anns.keys()}
        x_axis_values = [
            x_axis_values[ann] for ann in sorted(nas_by_anns.keys())
        ]
        values = [values[ann] for ann in sorted(nas_by_anns.keys())]
        x_ticks = [ann for ann in sorted(nas_by_anns.keys())]
        bars = plt.bar(x_axis_values, values, color="g")
        if args.bar_values:
            add_bar_values(ax=ax, bars=bars)
        plt.xticks(x_axis_values, x_ticks)
        plt.tick_params(labelsize=20)
        plt.xlabel("SVs", fontsize=20)
        plt.ylabel("# of SVs", fontsize=20)
        plt.title(args.title, fontsize=20)
        plot_path = os.path.abspath(
            os.path.join(args.output_dir, args.cnt_output_subdir))
        if not os.path.exists(plot_path):
            os.makedirs(plot_path)
        plot_file_name = os.path.join(plot_path, "cnt.png")
        plt.savefig(plot_file_name)
        if args.vis_interactive:
            plt.show()
        plt.gcf()
    elif args.command == "lr":
        import seaborn as sns
        import matplotlib.pyplot as plt
        sns.set(color_codes=True)
        nas = read_adjacencies_from_source(source=args.rck_nas)
        reads_to_nas = defaultdict(list)
        for na in nas:
            reads_str = na.extra.get(args.lr_field, "")
            reads = reads_str.split(",")
            for read in reads:
                if len(read) == 0:
                    continue
                reads_to_nas[read].append(na)
        nas_cnts = {len(reads_to_nas[read]) for read in reads_to_nas.keys()}
        x_axis_values = [i for i in range(2, max(nas_cnts))]
        values = defaultdict(int)
        for read in reads_to_nas.keys():
            values[len(reads_to_nas[read])] += 1
        values = [values[i] for i in x_axis_values]
        x_ticks = sorted(x_axis_values)
        fig, ax = plt.subplots()
        fig.set_figheight(20)
        fig.set_figwidth(20)
        bars = plt.bar(x_axis_values, values, color="g")
        if args.bar_values:
            add_bar_values(ax=ax, bars=bars)
        plt.xticks(x_axis_values, x_ticks)
        plt.xlabel("# of SVs")
        plt.ylabel("# of reads supporting x SVs")
        plt.title(args.title)
        plot_path = os.path.abspath(
            os.path.join(args.output_dir, args.lr_output_subdir))
        if not os.path.exists(plot_path):
            os.makedirs(plot_path)
        plot_file_name = os.path.join(plot_path, "lr_cnt.png")
        plt.savefig(plot_file_name)
        if args.vis_interactive:
            plt.show()
        plt.gcf()
    elif args.command == "merged":
        import seaborn as sns
        import matplotlib.pyplot as plt
        sns.set(color_codes=True)
        nas = read_adjacencies_from_source(source=args.rck_nas)
        source_pattern = re.compile(args.origin_regex)
        source_groups_cnt = defaultdict(int)
        for na in nas:
            naid = na.extra.get(EXTERNAL_NA_ID, na.idx)
            if args.origin_field not in na.extra:
                if args.no_origin_field == "skip":
                    continue
                elif args.no_origin_field == "self":
                    origin = naid
                else:
                    raise Exception(
                        "Unknown strategy {no_origin_field} for Adjacency {naid} that misses the origin field"
                        "".format(no_origin_field=args.no_origin_field,
                                  naid=naid))
            else:
                origin = na.extra[args.origin_field]
            origin_strings = origin.split(args.origin_sep)
            origins = []
            for origin_string in origin_strings:
                origin_match = source_pattern.match(origin_string)
                if origin_match is None:
                    continue
                origins.append(origin_match.group("source"))
            if len(origins) == 0:
                continue
            origins = tuple(sorted(origins))
            source_groups_cnt[origins] += 1
        source_group_set_cnt = defaultdict(int)
        groups = set()
        pairwise_cnts = defaultdict(int)
        print("\n----- Quantitative subgroups ----\n")
        for key, value in sorted(source_groups_cnt.items(),
                                 key=lambda entry: entry[1]):
            for element in key:
                groups.add(element)
            counter = Counter(key)
            if len(set(key)) == 1:
                group = set(key).pop()
                pairwise_cnts[(group, group)] += value
            else:
                for g1, g2 in itertools.combinations(sorted(set(key)), r=2):
                    pairwise_cnts[tuple(sorted([g1, g2]))] += value
                    pairwise_cnts[tuple(reversed(sorted([g1, g2])))] += value
            source_group_set_cnt[tuple(sorted(counter.keys()))] += value
            counter_str = ", ".join([
                "{name} ({cnt})".format(name=name, cnt=cnt)
                for name, cnt in sorted(counter.items(),
                                        key=lambda entry: (entry[1], entry[0]))
            ])
            print("{key} :: {value}".format(key=counter_str, value=value))

        print("\n----- Groups ----- \n")
        print(", ".join(sorted(groups)))

        print("\n----- By group size (then by size)------\n")
        for key, value in sorted(source_group_set_cnt.items(),
                                 key=lambda entry: (len(entry[0]), entry[1])):
            print("{key} :: {value}".format(key=",".join(key), value=value))

        print("\n----- By size (then by group size)------\n")
        for key, value in sorted(source_group_set_cnt.items(),
                                 key=lambda entry: (entry[1], len(entry[0]))):
            print("{key} :: {value}".format(key=",".join(key), value=value))

        print("\n----- Pairsize ------\n")
        for key, value in sorted(pairwise_cnts.items()):
            if key[0] >= key[1]:
                print("{key} :: {value}".format(key=",".join(key),
                                                value=value))

        if args.vis:
            ser = pd.Series(list(pairwise_cnts.values()),
                            index=pd.MultiIndex.from_tuples(
                                pairwise_cnts.keys()))
            df = ser.unstack().fillna(0)
            df = df.astype('int64')
            sns.set(font_scale=1.5)
            sns.heatmap(df, annot=True, fmt="d")
            plt.title(args.title)
            plot_path = os.path.abspath(
                os.path.join(args.output_dir, args.merged_output_subdir))
            if not os.path.exists(plot_path):
                os.makedirs(plot_path)
            plot_file_name = os.path.join(plot_path,
                                          "pairwise_merged_cnts.png")

            plt.savefig(plot_file_name, bbox_inches='tight')
        if args.vis_interactive:
            plt.show()
    elif args.command == "support":
        import seaborn as sns
        import matplotlib.pyplot as plt
        sns.set(color_codes=True)
        max_support = 30
        import statistics
        nas = read_adjacencies_from_source(source=args.rck_nas)
        source_nas = []
        if args.sources is not None:
            for source in args.sources:
                source_nas.extend(read_adjacencies_from_source(source=source))
        source_nas_by_ids = {
            na.extra.get(EXTERNAL_NA_ID, na.idx): na
            for na in source_nas
        }
        counts = defaultdict(int)
        nas_read_cnt = defaultdict(int)
        for na in nas:
            if "support_read_names" in na.extra:
                support_cnt = len(na.extra["support_read_names"].split(","))
                if support_cnt > max_support:
                    support_cnt = max_support
                counts[support_cnt] += 1
                nas_read_cnt[na] += support_cnt
            elif "origin_ids" in na.extra:
                origin_nas = [
                    source_nas_by_ids[origin_id]
                    for origin_id in na.extra["origin_ids"].split(",")
                    if origin_id in source_nas_by_ids
                ]
                if len(origin_nas) == 0:
                    continue
                supports = []
                for origin_na in origin_nas:
                    if "support_read_names" in origin_na.extra:
                        supports.append(
                            len(origin_na.extra["support_read_names"].split(
                                ",")))
                support_cnt = int(statistics.mean(supports))
                if support_cnt > max_support:
                    support_cnt = max_support
                counts[support_cnt] += 1
                nas_read_cnt[na] += support_cnt
            else:
                counts[-1] += 1
        x_axis_values = [i for i in range(1, max(counts) + 1)]
        values = [counts[i] for i in x_axis_values]
        x_ticks = sorted(x_axis_values)
        fig, ax = plt.subplots()
        fig.set_figheight(20)
        fig.set_figwidth(20)
        bars = plt.bar(x_axis_values, values, color="g")
        if args.bar_values:
            add_bar_values(ax=ax, bars=bars)
        plt.xticks(x_axis_values, x_ticks)
        plt.xlabel("# of supporting reads")
        plt.ylabel("# of SVs")
        plt.title(args.title)
        plot_path = os.path.abspath(
            os.path.join(args.output_dir, args.support_output_subdir))
        if not os.path.exists(plot_path):
            os.makedirs(plot_path)
        plot_file_name = os.path.join(plot_path, "support_cnt.png")
        plt.savefig(plot_file_name)
        if args.vis_interactive:
            plt.show()
        plt.gcf()
        top_svs = {
            na: cnt
            for na, cnt in nas_read_cnt.items()
            if cnt >= 28 and na.distance_non_hap > 300
        }
        result = random.sample(top_svs.keys(), 10)
        for entry in result:
            print(entry, entry.distance_non_hap)
Code example #14
0
def main():
    """CLI entry point for processing RCK adjacency-group files.

    Subcommands:
      * ``cat``     -- concatenate one or more adjacency-group files into one stream;
      * ``refine``  -- refine molecule/labeling/general adjacency groups;
      * ``project`` -- project adjacency groups onto a supplied set of adjacencies.

    Reads and writes RCK-formatted adjacency-group tables via the rck helper
    functions; configuration comes entirely from command-line arguments.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-ADJ-GROUPS-process")
    parser.add_argument("--version", action="version", version=rck.version)
    cli_logging_parser = get_logging_cli_parser()

    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    ### "cat": concatenate several adjacency-group sources into one output
    cat_parser = subparsers.add_parser("cat", parents=[cli_logging_parser])
    cat_parser.add_argument("rck_adg",
                            type=argparse.FileType("rt"),
                            nargs="+",
                            default=[sys.stdin])
    cat_parser.add_argument("--i-separator", default="\t")
    cat_parser.add_argument("--i-extra-separator", default=";")
    cat_parser.add_argument("--i-aids-separator", default=",")
    cat_parser.add_argument("--enforce-unique-ids",
                            action="store_true",
                            dest="enforce_unique_ids")
    cat_parser.add_argument("--id-collision-strategy",
                            choices=["skip", "error"],
                            default="error")
    cat_parser.add_argument("-o",
                            "--output",
                            type=argparse.FileType("wt"),
                            default=sys.stdout)
    cat_parser.add_argument("--o-separator", default="\t")
    cat_parser.add_argument("--o-aids-separator", default=",")
    cat_parser.add_argument("--o-extra-separator", default=";")
    ### "refine": refine adjacency groups, per group type
    refine_parser = subparsers.add_parser("refine",
                                          parents=[cli_logging_parser])
    refine_parser.add_argument("rck_adg",
                               nargs="?",
                               type=argparse.FileType("rt"),
                               default=sys.stdin)
    refine_parser.add_argument("--i-separator", default="\t")
    refine_parser.add_argument("--i-extra-separator", default=";")
    refine_parser.add_argument("--i-aids-separator", default=",")
    refine_parser.add_argument("--gid-suffix", default="refined")
    refine_parser.add_argument("-o",
                               "--output",
                               type=argparse.FileType("wt"),
                               default=sys.stdout)
    refine_parser.add_argument("--o-separator", default="\t")
    refine_parser.add_argument("--o-aids-separator", default=",")
    refine_parser.add_argument("--o-extra-separator", default=";")
    ### "project": project adjacency groups based on a set of adjacencies
    project_parser = subparsers.add_parser("project",
                                           parents=[cli_logging_parser])
    # nargs="?" makes the stdin default reachable (without it argparse would
    # require the positional and the default would be dead), matching "refine".
    project_parser.add_argument("rck_adg",
                                nargs="?",
                                type=argparse.FileType("rt"),
                                default=sys.stdin)
    project_parser.add_argument("--i-separator", default="\t")
    project_parser.add_argument("--i-extra-separator", default=";")
    project_parser.add_argument("--i-aids-separator", default=",")
    project_parser.add_argument("--adjacencies",
                                required=True,
                                type=argparse.FileType("rt"))
    project_parser.add_argument("--adj-separator", default="\t")
    project_parser.add_argument("--adj-extra-separator", default=";")
    project_parser.add_argument("--gid-suffix", default="projected")
    project_parser.add_argument("-o",
                                "--output",
                                type=argparse.FileType("wt"),
                                default=sys.stdout)
    project_parser.add_argument("--o-separator", default="\t")
    project_parser.add_argument("--o-aids-separator", default=",")
    project_parser.add_argument("--o-extra-separator", default=";")
    ###
    args = parser.parse_args()
    logger = get_standard_logger_from_args(
        args=args, program_name="RCK-UTILS-ADJ-GROUPS-process")
    if args.command == "cat":
        # Lazily chain the group streams so sources are read one at a time.
        adj_groups = itertools.chain(*(stream_adjacency_groups_from_source(
            source=adj_group_source,
            separator=args.i_separator,
            aids_separator=args.i_aids_separator,
            extra_separator=args.i_extra_separator)
                                       for adj_group_source in args.rck_adg))
        if args.enforce_unique_ids:
            # TODO: unique-id enforcement (with --id-collision-strategy) is
            #  declared on the CLI but not implemented yet.
            pass
        write_adjacency_groups_to_destination(
            destination=args.output,
            adjacency_groups=adj_groups,
            separator=args.o_separator,
            aids_separator=args.o_aids_separator,
            extra_separator=args.o_extra_separator)
    elif args.command == "refine":
        logger.info("Refining input adjacency groups")
        logger.info(
            "Reading adjacency groups from {file}".format(file=args.rck_adg))
        adg_groups = read_adjacency_groups_from_source(
            source=args.rck_adg,
            separator=args.i_separator,
            extra_separator=args.i_extra_separator,
            aids_separator=args.i_aids_separator)
        logger.info("A total of {cnt} adjacency groups has been read".format(
            cnt=len(adg_groups)))
        # Partition input groups by type; only labeling groups currently get
        # a real refinement pass, the other two are passed through unchanged.
        molecule_groups = [
            ag for ag in adg_groups
            if ag.group_type == AdjacencyGroupType.MOLECULE
        ]
        logger.info(
            "A total of {cnt} molecule adjacency groups has been read".format(
                cnt=len(molecule_groups)))
        labeling_groups = [
            ag for ag in adg_groups
            if ag.group_type == AdjacencyGroupType.LABELING
        ]
        logger.info(
            "A total of {cnt} labeling adjacency groups has been read".format(
                cnt=len(labeling_groups)))
        general_groups = [
            ag for ag in adg_groups
            if ag.group_type == AdjacencyGroupType.GENERAL
        ]
        logger.info(
            "A total of {cnt} general adjacency groups has been read".format(
                cnt=len(general_groups)))
        logger.info("Refining molecule adjacency groups")
        refined_molecule_groups = molecule_groups  # pass-through (no-op refinement)
        logger.info(
            "A total of {cnt} refined molecule adjacency groups remains".
            format(cnt=len(refined_molecule_groups)))
        logger.info("Refining labeling adjacency groups")
        r_labeling_groups = refined_labeling_groups(
            adj_groups=labeling_groups,
            gid_suffix="" if len(args.gid_suffix) == 0 else args.gid_suffix +
            "-L",
            retain_source_gids=True)
        logger.info(
            "A total of {cnt} refined labeling adjacency groups remains".
            format(cnt=len(r_labeling_groups)))
        logger.info("Refining general adjacency groups")
        refined_general_groups = general_groups  # pass-through (no-op refinement)
        logger.info(
            "A total of {cnt} refined labeling general adjacency groups remains"
            .format(cnt=len(refined_general_groups)))
        adj_groups = itertools.chain(refined_molecule_groups,
                                     r_labeling_groups, refined_general_groups)
        logger.info("Writing refined adjacency groups to {file}".format(
            file=args.output))
        write_adjacency_groups_to_destination(
            destination=args.output,
            adjacency_groups=adj_groups,
            separator=args.o_separator,
            aids_separator=args.o_aids_separator,
            # Pass the declared --o-extra-separator through (was previously
            # omitted, silently ignoring the CLI option in this branch).
            extra_separator=args.o_extra_separator)
    elif args.command == "project":
        logger.info(
            "Projecting input adjacency groups based on input adjacencies")
        logger.info(
            "Reading adjacency groups from {file}".format(file=args.rck_adg))
        adg_groups = read_adjacency_groups_from_source(
            source=args.rck_adg,
            separator=args.i_separator,
            extra_separator=args.i_extra_separator,
            aids_separator=args.i_aids_separator)
        logger.info("A total of {cnt} adjacency groups has been read".format(
            cnt=len(adg_groups)))
        adjacencies = read_adjacencies_from_source(
            source=args.adjacencies,
            separator=args.adj_separator,
            extra_separator=args.adj_extra_separator)
        p_groups = projected_groups(groups=adg_groups,
                                    adjacencies=adjacencies,
                                    gid_suffix=args.gid_suffix)
        logger.info("A total of {cnt} projected groups remained".format(
            cnt=len(p_groups)))
        logger.info("Writing projected adjacency groups to {file}".format(
            file=args.output))
        write_adjacency_groups_to_destination(
            destination=args.output,
            adjacency_groups=p_groups,
            separator=args.o_separator,
            aids_separator=args.o_aids_separator,
            extra_separator=args.o_extra_separator)
Code example #15
0
def main():
    """CLI entry point converting segment copy-number calls to RCK format.

    One subcommand per supported upstream tool: ``titan``, ``battenberg``,
    ``hatchet``, ``remixt`` (allele-specific copy numbers), ``ginkgo``
    (haploid copy numbers), and ``gff`` (plain segments). Each branch reads
    the tool-specific file, converts it with the matching ``get_scnt_from_*``
    helper, and writes RCK-formatted output.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-SCNT-x2rck")
    cli_logging_parser = get_logging_cli_parser()
    chr_strip_parser = get_chromosome_strip_parser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    ####
    titan_parser = subparsers.add_parser("titan", parents=[cli_logging_parser, chr_strip_parser])
    titan_parser.add_argument("titan_ichor_seg")
    titan_parser.add_argument("--sample-name", required=True)
    titan_parser.add_argument("--clone-ids", default=None)
    titan_parser.add_argument("--separator", default="\t")
    titan_parser.add_argument("--corrected-cn-fix", choices=["None", "equal", "relative-dist"], default="None")
    titan_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ####
    battenberg_parser = subparsers.add_parser("battenberg", parents=[cli_logging_parser, chr_strip_parser])
    battenberg_parser.add_argument("battenberg", type=argparse.FileType("rt"), default=sys.stdin)
    battenberg_parser.add_argument("--separator", default="\t")
    battenberg_parser.add_argument("--sample-name", required=True)
    battenberg_parser.add_argument("--clone-ids", choices=["1", "2", "1,2"], default="1,2")
    battenberg_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ####
    hatchet_parser = subparsers.add_parser("hatchet", parents=[cli_logging_parser, chr_strip_parser])
    hatchet_parser.add_argument("hatchet", type=str)
    hatchet_parser.add_argument("--separator", default="\t")
    hatchet_parser.add_argument("--min-usage", type=float, default=0.01)
    hatchet_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    # For HATCHet either a sample name (auto-detect clones) or explicit clone
    # ids must be given, but not both.
    group = hatchet_parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--sample-name", default=None)
    group.add_argument("--clone-ids", default=None)
    ####
    remixt_parser = subparsers.add_parser("remixt", parents=[cli_logging_parser, chr_strip_parser])
    remixt_parser.add_argument("remixt", type=argparse.FileType("rt"), default=sys.stdin)
    remixt_parser.add_argument("--separator", default="\t")
    remixt_parser.add_argument("--clone-ids", choices=["1", "2", "1,2"], default="1,2")
    remixt_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ####
    ginkgo_parser = subparsers.add_parser("ginkgo", parents=[cli_logging_parser, chr_strip_parser])
    ginkgo_parser.add_argument("ginkgo", type=argparse.FileType("rt"), default=sys.stdin)
    ginkgo_parser.add_argument("--separator", default="\t")
    ginkgo_parser.add_argument("--sample-name", required=True)
    ginkgo_parser.add_argument("--dummy-clone-name", default="1")
    ginkgo_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ####
    gff_parser = subparsers.add_parser("gff", parents=[cli_logging_parser, chr_strip_parser])
    gff_parser.add_argument("gff", type=str)
    gff_parser.add_argument("--chr-mapping-file", type=argparse.FileType("rt"))
    gff_parser.add_argument("--chr-mapping-missing-strategy", choices=["keep", "skip"], default="keep")
    gff_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-SCNT")

    if args.command == "titan":
        logger.info("Converting allele-specific segment copy values from TitanCNA format to RCK")
        titan_full_path = get_full_path(path=args.titan_ichor_seg)
        if args.clone_ids is None:
            logger.debug("Clone ids were not provided, extracting all clone ids from {file}".format(file=titan_full_path))
            clone_ids = titan_get_clone_ids_from_file(file_name=titan_full_path, sample_name=args.sample_name, separator=args.separator)
        else:
            clone_ids = sorted(set(args.clone_ids.split(",")))
        logger.debug("Clone ids are identified as {clone_ids}".format(clone_ids=",".join(clone_ids)))
        with open(args.titan_ichor_seg, "rt") as source:
            logger.info("Reading allele-specific segment copy number values from {file}".format(file=titan_full_path))
            segments, scnt = get_scnt_from_titan_source(source=source, sample_name=args.sample_name, clone_ids=clone_ids, separator=args.separator,
                                                        corrected_cn_fix=args.corrected_cn_fix, chr_strip=args.strip_chr)
            logger.info("Writing allele-specific segment copy number values in RCK format to {file}".format(file=args.output))
            write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, clone_ids=clone_ids, separator=args.separator)
    elif args.command == "battenberg":
        logger.info("Converting allele-specific segment copy values from Battenberg format to RCK")
        clone_ids = args.clone_ids.split(",")
        logger.debug("Clone ids are identified as {clone_ids}".format(clone_ids=",".join(clone_ids)))
        logger.info("Reading allele-specific segment copy number values from {file}".format(file=args.battenberg))
        segments, scnt = get_scnt_from_battenberg_source(source=args.battenberg, sample_name=args.sample_name, separator=args.separator, chr_strip=args.strip_chr)
        logger.info("Writing allele-specific segment copy number values in RCK format to {file}".format(file=args.output))
        write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, separator=args.separator, clone_ids=clone_ids)
    elif args.command == "hatchet":
        hatchet_full_path = get_full_path(path=args.hatchet)
        logger.info("Converting allele-specific segment copy values from HATCHet format to RCK")
        if args.clone_ids is None:
            # was: file=hatchet_parser -- that logged the argparse parser object
            logger.debug("Clone ids were not provided, extracting all clone ids from {file}".format(file=hatchet_full_path))
            clone_ids = hatchet_get_clone_ids_from_file(file_name=hatchet_full_path, sample_name=args.sample_name, separator=args.separator, min_usage=args.min_usage)
        else:
            clone_ids = sorted(set(args.clone_ids.split(",")))
        logger.debug("Clone ids were identified as {clone_ids}".format(clone_ids=",".join(clone_ids)))
        with open(hatchet_full_path) as source:
            logger.info("Reading allele-specific segment copy number values from {file}".format(file=hatchet_full_path))
            segments, scnt = get_scnt_from_hatchet_source(source=source, sample_name=args.sample_name, clone_ids=clone_ids, separator=args.separator, chr_strip=args.strip_chr)
            logger.info("Writing allele-specific segment copy number values in RCK format to {file}".format(file=args.output))
            write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, clone_ids=clone_ids, separator=args.separator)
    elif args.command == "remixt":
        logger.info("Converting allele-specific segment copy values from ReMixT format to RCK")
        clone_ids = args.clone_ids.split(",")
        logger.debug("Clone ids were identified as {clone_ids}".format(clone_ids=",".join(clone_ids)))
        logger.info("Reading allele-specific segment copy number values from {file}".format(file=args.remixt))
        segments, scnt = get_scnt_from_remixt_source(source=args.remixt, separator=args.separator, chr_strip=args.strip_chr)
        logger.info("Writing allele-specific segment copy number values in RCK format to {file}".format(file=args.output))
        write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, separator=args.separator, clone_ids=clone_ids)
    elif args.command == "ginkgo":
        logger.info("Converting *haploid* segments copy values from Ginkgo format to RCK")
        logger.info("Reading *haploid* segments copy values from {file}".format(file=args.ginkgo))
        segments, scnt = get_scnt_from_ginkgo_source(source=args.ginkgo, sample_name=args.sample_name, dummy_clone=args.dummy_clone_name,
                                                     separator=args.separator, chr_strip=args.strip_chr)
        logger.info("Writing *haploid* segments copy number values in RCK format to {file}".format(file=args.output))
        # was: clone_ids=set(args.dummy_clone_name) -- set() of a string
        # iterates its characters (e.g. "10" -> {"1", "0"}); the single dummy
        # clone id must be wrapped in a one-element collection instead.
        write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, clone_ids=[args.dummy_clone_name], separator=args.separator)
    elif args.command == "gff":
        logger.info("Converting segments data from GFF format to RCK")
        logger.info("Reading segments from {file}".format(file=args.gff))
        chr_mappings = None
        if args.chr_mapping_file is not None:
            chr_mappings = {}
            logger.info("Reading chromosome mapping data from {file}".format(file=args.chr_mapping_file))
            # Mapping file format: one "<source_chr>\t<target_chr>" pair per line.
            for line in args.chr_mapping_file:
                line = line.strip()
                data = line.split("\t")
                chr_mappings[data[0]] = data[1]
        segments = get_segments_from_gff_file(file_name=args.gff, chr_strip=args.strip_chr,
                                              chr_mapping=chr_mappings, chr_mapping_missing_strategy=args.chr_mapping_missing_strategy)
        logger.info("Writing segments in RCK format to {file}".format(file=args.output))
        write_segments_to_destination(destination=args.output, segments=segments)
    logger.info("Success!")