Ejemplo n.º 1
0
def main():
    """Command-line entry point of ``RCK-UTILS-KAR-stats``.

    Reads a segment copy number tensor (``--scnt``) and an adjacency copy
    number tensor (``--acnt``), plus optional copy number boundaries,
    adjacency groups and telomere positions, then:

    * prints per-chromosome / total segment counts and adjacency counts;
    * logs which adjacency groups (molecule / labeling / general) are
      violated by the adjacency copy numbers;
    * for every clone present in both tensors, builds the interval adjacency
      graph, checks that no vertex has a negative copy number excess, compares
      inferred telomeres with the allowed ones, and logs the total length and
      the chromosome count.

    Arguments are taken from ``sys.argv`` via ``argparse``; nothing is returned.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-KAR-stats", parents=[get_logging_cli_parser()])
    parser.add_argument("--verbose", choices=[0, 1, 2, 3, 4, 5], type=int, default=5)
    parser.add_argument("--acnt", required=True, type=argparse.FileType("rt"))
    parser.add_argument("--acnt-separator", default="\t")
    parser.add_argument("--acnt-extra-separator", default=";")
    parser.add_argument("--scnt", required=True, type=argparse.FileType("rt"))
    parser.add_argument("--scnt-separator", default="\t")
    parser.add_argument("--scnt-extra-separator", default=";")
    parser.add_argument("--scnb", type=argparse.FileType("rt"))
    parser.add_argument("--scnb-separator", default="\t")
    parser.add_argument("--scnb-extra-separator", default=";")
    parser.add_argument("--nas-fp", type=float, default=-1.0)
    parser.add_argument("--adjacency-groups", type=argparse.FileType("rt"))
    parser.add_argument("--adg-separator", default="\t")
    parser.add_argument("--adg-aids-separator", default=",")
    parser.add_argument("--adg-extra-separator", default=";")
    parser.add_argument("--telomere-positions", type=argparse.FileType("rt"))
    parser.add_argument("--telomere-positions-separator", default="\t")
    parser.add_argument("--telomere-positions-extra-separator", default=";")
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-KAR-stats")

    # ---- input reading ----
    logger.info("Reading segment copy number tensor from {file}".format(file=args.scnt))
    segments, scnt = read_scnt_from_source(source=args.scnt, separator=args.scnt_separator, extra_separator=args.scnt_extra_separator, remove_cn_data_from_segs=True)
    logger.info("Reading adjacency copy number tensor from {file}".format(file=args.acnt))
    adjacencies, acnt = read_acnt_from_source(source=args.acnt, separator=args.acnt_separator, extra_separator=args.acnt_extra_separator, remove_cn_data_from_adj=True)
    if args.scnb is not None:
        logger.info("Reading segment copy number boundaries tensor from {file}".format(file=args.scnb))
        _, scnb = read_scnb_from_source(source=args.scnb, separator=args.scnb_separator, extra_separator=args.scnb_extra_separator, remove_cnb_data_from_segs=True)
    else:
        logger.info("No segment copy number boundaries tensor is provided via --scnb flag")
        scnb = None
    if args.adjacency_groups is not None:
        logger.info("Reading adjacency groups information from {file}".format(file=args.adjacency_groups))
        groups = read_adjacency_groups_from_source(source=args.adjacency_groups, separator=args.adg_separator,
                                                   extra_separator=args.adg_extra_separator, aids_separator=args.adg_aids_separator)
    else:
        logger.info("No adjacency groups information is provided via --adjacency-groups flag")
        groups = []
    if args.telomere_positions is not None:
        logger.info("Reading telomere positions from {file}".format(file=args.telomere_positions))
        # argparse derives the attribute name from "--telomere-positions-separator",
        # i.e. ``telomere_positions_separator`` (no extra "s" after "telomere").
        telomeres = read_positions_from_source(source=args.telomere_positions, separator=args.telomere_positions_separator,
                                               extra_separator=args.telomere_positions_extra_separator)
    else:
        logger.info("No telomere positions are provided via --telomere-positions flag. Defaulting to reference telomere positions")
        telomeres = get_ref_telomeres_from_segments(segments=segments)

    # ---- segment / adjacency counts ----
    segments_by_chrs = defaultdict(list)
    for segment in segments:
        segments_by_chrs[segment.chromosome].append(segment)
    print("A total of {cnt} chromosomes are observed".format(cnt=len(segments_by_chrs)))
    total_segments_cnt = 0
    for chr_name, chr_segments in segments_by_chrs.items():
        total_segments_cnt += len(chr_segments)
        if args.verbose >= 3:
            print("Chromosome {chr_name} has {cnt} segments".format(chr_name=chr_name, cnt=len(chr_segments)))
    print("A total of {cnt} segments are observed".format(cnt=total_segments_cnt))
    novel_adjacencies = [adj for adj in adjacencies if adj.adjacency_type == AdjacencyType.NOVEL]
    reference_adjacencies = [adj for adj in adjacencies if adj.adjacency_type == AdjacencyType.REFERENCE]
    print("A total of {cnt} adjacencies ({n_cnt} novel; {r_cnt} reference)".format(cnt=len(novel_adjacencies) + len(reference_adjacencies),
                                                                                   n_cnt=len(novel_adjacencies), r_cnt=len(reference_adjacencies)))

    # ---- adjacency groups compliance ----
    adjacencies_by_external_ids = {adj.extra.get(EXTERNAL_NA_ID, adj.stable_id_non_phased): adj for adj in adjacencies}
    if groups is not None:
        for ag in groups:
            ag.populate_adjacencies_via_ids(source=adjacencies, source_by_ids=adjacencies_by_external_ids)
        molecule_groups = [ag for ag in groups if ag.group_type == AdjacencyGroupType.MOLECULE]
        labeling_groups = [ag for ag in groups if ag.group_type == AdjacencyGroupType.LABELING]
        general_groups = [ag for ag in groups if ag.group_type == AdjacencyGroupType.GENERAL]
        if len(molecule_groups) > 0:
            logger.info("Checking compliance with {cnt} molecule groups".format(cnt=len(molecule_groups)))
            molecule_groups_violations = adjacency_groups_molecule_violations(groups=molecule_groups, acnt=acnt)
            if len(molecule_groups_violations):
                # Report the number of *violating* groups, not the number of all molecule groups.
                logger.error("A total of {cnt} molecule groups DO NOT agree with input karyotype. See molecule groups ids below".format(cnt=len(molecule_groups_violations)))
                logger.error(", ".join([ag.gid for ag in molecule_groups_violations]))
            else:
                logger.info("All molecule groups agree with input karyotype")
        else:
            logger.info("No molecule groups were provided. Nothing to check.")
        if len(labeling_groups) > 0:
            logger.info("Checking compliance with {cnt} labeling groups".format(cnt=len(labeling_groups)))
            labeling_groups_violations = adjacency_groups_labeling_violations(groups=labeling_groups, acnt=acnt)
            if len(labeling_groups_violations):
                logger.error("A total of {cnt} labeling groups DO NOT agree with input karyotype. See labeling groups ids below".format(cnt=len(labeling_groups_violations)))
                logger.error(", ".join([ag.gid for ag in labeling_groups_violations]))
            else:
                logger.info("All labeling groups agree with input karyotype")
        else:
            logger.info("No labeling groups were provided. Nothing to check.")
        if len(general_groups) > 0:
            logger.info("Checking compliance with {cnt} general groups".format(cnt=len(general_groups)))
            general_groups_violations = adjacency_groups_general_violations(groups=general_groups, acnt=acnt)
            if len(general_groups_violations):
                logger.error("A total of {cnt} general groups DO NOT agree with input karyotype. See general groups ids below".format(cnt=len(general_groups_violations)))
                logger.error(", ".join([ag.gid for ag in general_groups_violations]))
            else:
                logger.info("All general groups agree with input karyotype")
        else:
            # Mirrors the molecule / labeling branches above.
            logger.info("No general groups were provided. Nothing to check.")
    else:
        logger.info("No information about adjacency groups were provided. Nothing to check.")

    # ---- per-clone balancing and telomere checks ----
    # Only clones present in both tensors can be checked.
    clone_ids = sorted(set(scnt.keys()) & set(acnt.keys()))
    for clone_id in clone_ids:
        logger.info("Checking balancing and telomeres for clone {clone_id}".format(clone_id=clone_id))
        hiag = construct_hiag_inflate_from_haploid_data(hapl_segments=segments, hapl_adjacencies=adjacencies)
        scnp = scnt[clone_id]
        acnp = acnt[clone_id]
        hiag.assign_copy_numbers_from_scn_profile(scn_profile=scnp)
        hiag.assign_copy_numbers_from_acn_profile(acn_profile=acnp)
        hiag.remove_edges_with_zero_cn()
        logger.info("Checking that every vertex has a copy number excess >= 0.")
        negative_excess_found = False
        for node in hiag.nodes(data=False):
            if hiag.node_imbalance(node=node) < 0:
                negative_excess_found = True
                logger.warning("Something went WRONG! On segment extremity {node} there is a negative copy number excess...".format(node=str(node)))
        logger.info("Getting inferred telomeres.")
        diploid_telomeres = hiag.get_telomeres()
        inferred_hapl_telomeres_ids = {p.stable_id_non_hap for p in diploid_telomeres}
        input_hapl_telomers_ids = {p.stable_id_non_hap for p in telomeres}
        # Any inferred telomere outside the allowed set is a problem. A strict
        # superset test (``>``) would miss the case where the inferred set
        # merely overlaps the input set, so test the set difference instead.
        unexpected_telomeres_ids = inferred_hapl_telomeres_ids - input_hapl_telomers_ids
        if unexpected_telomeres_ids:
            logger.error("Something went WRONG! Following segments extremities, while not specified specified as possible telomere sites were inferred as such.")
            logger.error(",".join(map(str, sorted(unexpected_telomeres_ids))))
        elif not negative_excess_found:
            # Only claim success when the excess check above raised no warning either.
            logger.info("Everything is OK! in clone {clone_id} all extremities have non-negative copy number excess, and inferred telomere sites concur with the input"
                        "".format(clone_id=clone_id))
        # Total length = sum over segment edges of segment length times its copy number.
        length = 0
        for u, v, data in hiag.segment_edges():
            s: Segment = data["object"]
            length += s.length * data["copy_number"]
        logger.info(f"Total length for clone {clone_id} = {length}")
        # Chromosome count is computed as half of the summed node imbalances.
        chromosome_cnt = sum(hiag.node_imbalance(node) for node in hiag.nodes(data=False)) / 2
        logger.info(f"Total number of chromosomes in clone {clone_id} = {chromosome_cnt}")
Ejemplo n.º 2
0
def main():
    """Command-line entry point of ``RCK-UTILS-ADJ-process``.

    Dispatches on the chosen sub-command:

    * ``cat``        - concatenate adjacencies from several inputs, optionally
                       enforcing unique adjacency ids;
    * ``filter``     - filter adjacencies by chromosomal regions, extra-field
                       regexes and size;
    * ``reciprocal`` - refine reciprocal novel adjacencies;
    * ``haploid``    - collapse adjacencies into a haploid form;
    * ``update``     - update adjacencies with information from ``--source``.

    Arguments are taken from ``sys.argv``. The streaming commands (``cat``,
    ``filter``, ``haploid``) write their output and terminate the process with
    exit status 0; the remaining commands fall through to the shared writer at
    the end of the function.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-ADJ-process")
    parser.add_argument('--version', action='version', version=rck.version)
    ####
    shared_parser = get_shared_nas_parser()
    cli_logging_parser = get_logging_cli_parser()
    shared_parser.add_argument("--output",
                               "-o",
                               dest="rck_adj_file",
                               type=argparse.FileType("wt"),
                               default=sys.stdout)
    shared_parser.add_argument("--no-sort", action="store_false", dest="sort")
    ####
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    ####
    filter_parser = subparsers.add_parser(
        "filter", parents=[shared_parser, cli_logging_parser])
    filter_parser.add_argument("rck_adj",
                               type=argparse.FileType("rt"),
                               nargs="+",
                               default=[sys.stdin])
    filter_parser.add_argument("--keep-extra-field-regex",
                               action="append",
                               default=None)
    filter_parser.add_argument("--keep-extra-field-regex-file",
                               type=argparse.FileType("rt"),
                               default=None)
    filter_parser.add_argument("--keep-extra-field-missing-strategy",
                               choices=[KEEP, REMOVE],
                               default=KEEP)
    filter_parser.add_argument("--keep-annotate",
                               action="store_true",
                               dest="annotate_retained")
    filter_parser.add_argument("--keep-annotate-s-extra-field",
                               default=None,
                               dest="annotate_seg_extra_field")
    filter_parser.add_argument("--keep-annotate-short-circ",
                               action="store_true",
                               dest="annotate_shirt_circ")
    filter_parser.add_argument("--keep-annotate-extra-prefix",
                               dest="annotate_extra_prefix")
    filter_parser.add_argument("--remove-extra-field-regex",
                               action="append",
                               default=None)
    filter_parser.add_argument("--remove-extra-field-regex-file",
                               type=argparse.FileType("rt"),
                               default=None)
    filter_parser.add_argument("--remove-extra-field-missing-strategy",
                               choices=[KEEP, REMOVE],
                               default=KEEP)
    filter_parser.add_argument("--min-size", type=int, default=0)
    filter_parser.add_argument("--max-size", type=int, default=1000000000)
    filter_parser.add_argument("--no-allow-inter-chr",
                               action="store_false",
                               dest="allow_inter_chr")
    filter_parser.add_argument("--no-allow-intra-chr",
                               action="store_false",
                               dest="allow_intra_chr")
    filter_parser.add_argument("--size-extra-field", default="svlen")
    filter_parser.add_argument("--size-extra-field-no-abs",
                               action="store_false",
                               dest="size_extra_field_abs")
    filter_parser.add_argument("--size-extra-seq-field")
    ####
    cat_parser = subparsers.add_parser(
        "cat",
        parents=[shared_parser, cli_logging_parser],
        help=
        "Concatenate Adjacencies in input files (NOTE: different from \"merge\")"
    )
    cat_parser.add_argument("rck_adj",
                            type=argparse.FileType("rt"),
                            nargs="+",
                            default=[sys.stdin])
    cat_parser.add_argument("--enforce-unique-ids",
                            action="store_true",
                            dest="enforce_unique_ids")
    cat_parser.add_argument("--id-collision-strategy",
                            choices=['skip', 'error'],
                            default='error')
    ####
    reciprocal_parser = subparsers.add_parser(
        "reciprocal",
        parents=[shared_parser, cli_logging_parser],
        help="ensure that reciprocal novel adjacencies are treated as such")
    reciprocal_parser.add_argument("rck_adj",
                                   type=argparse.FileType("rt"),
                                   default=sys.stdin)
    reciprocal_parser.add_argument("--max-distance", type=int, default=50)
    ####
    haploid_parser = subparsers.add_parser(
        "haploid",
        parents=[shared_parser, cli_logging_parser],
        help=
        "collapse any info that is allele/haplotype-specific into a haploid mode"
    )
    haploid_parser.add_argument("rck_adj",
                                type=argparse.FileType("rt"),
                                nargs="+",
                                default=[sys.stdin])
    ####
    update_parser = subparsers.add_parser(
        "update",
        parents=[shared_parser, cli_logging_parser],
        help=
        "Updates adjacencies in the 'adj' with the info from --source based on aid matches. Outputs updated --target entries"
    )
    update_parser.add_argument("rck_adj", type=argparse.FileType("rt"))
    update_parser.add_argument("--source",
                               type=argparse.FileType("rt"),
                               required=True)
    update_parser.add_argument("--exclude-extra-fields", default="")
    update_parser.add_argument("--include-extra-fields", default="")
    update_parser.add_argument("--no-include-missing",
                               action="store_false",
                               dest="include_missing")
    update_parser.add_argument("--no-coords-update",
                               action="store_false",
                               dest="coord_update")
    update_parser.add_argument("--no-coord1-update",
                               action="store_false",
                               dest="coord1_update")
    update_parser.add_argument("--no-coord2-update",
                               action="store_false",
                               dest="coord2_update")
    update_parser.add_argument("--no-strands-update",
                               action="store_false",
                               dest="strands_update")
    update_parser.add_argument("--no-strand1-update",
                               action="store_false",
                               dest="strand1_update")
    update_parser.add_argument("--no-strand2-update",
                               action="store_false",
                               dest="strand2_update")
    args = parser.parse_args()
    # Logger name now matches the parser's ``prog`` ("ADJ", previously misspelled "ADK").
    logger = get_standard_logger_from_args(
        args=args, program_name="RCK-UTILS-ADJ-process")
    processed_adjacencies = []

    # Resolve which extra fields to output: None (no fields), the literal
    # string "all", or an explicit list of field names.
    # ``o_extra_fields`` is not declared in this function; presumably it comes
    # from the shared NAS parser.
    if args.o_extra_fields is None or len(
            args.o_extra_fields) == 0 or args.o_extra_fields == ",":
        extra = None
    elif args.o_extra_fields != "all":
        extra = args.o_extra_fields.split(",")
    else:
        extra = args.o_extra_fields

    if args.command == "cat":
        adjacencies = itertools.chain(*(stream_adjacencies_from_source(
            source=rck_adj_source) for rck_adj_source in args.rck_adj))
        if args.enforce_unique_ids:
            processed_ids = set()
            # Collect into a separate list: rebinding ``adjacencies`` before
            # the loop would make the loop iterate over an empty list and
            # silently drop every input adjacency.
            unique_adjacencies = []
            for adj in adjacencies:
                aid = adj.extra.get(EXTERNAL_NA_ID, adj.idx)
                if aid in processed_ids:
                    logger.debug(
                        "Adjacency id {aid} has been encountered more than once"
                        .format(aid=aid))
                    if args.id_collision_strategy == "skip":
                        continue
                    elif args.id_collision_strategy == "error":
                        raise ValueError(
                            "More than one adjacency with id {aid}".format(
                                aid=aid))
                unique_adjacencies.append(adj)
                processed_ids.add(aid)
            adjacencies = unique_adjacencies
        write_adjacencies_to_destination(destination=args.rck_adj_file,
                                         adjacencies=adjacencies,
                                         extra=extra,
                                         sort_adjacencies=args.sort)
        sys.exit(0)
    elif args.command == "filter":
        logger.info(
            "Filtering input adjacencies from following sources {sources}".
            format(sources=",".join(map(str, args.rck_adj))))
        adjacencies = itertools.chain(*(stream_adjacencies_from_source(
            source=rck_adj_source) for rck_adj_source in args.rck_adj))
        # Gather region strings from both the inline flags and the list files.
        include_chrs_regions_strings = []
        exclude_chrs_regions_strings = []
        if args.chrs_include is not None:
            for chrs_lists in args.chrs_include:
                for chrs_list in chrs_lists:
                    for chr_name in chrs_list.split(","):
                        include_chrs_regions_strings.append(chr_name)
        if args.chrs_include_file is not None:
            for chr_name in get_chrs_regions_string_lists_from_source(
                    source=args.chrs_include_file):
                include_chrs_regions_strings.append(chr_name)
        if args.chrs_exclude is not None:
            for chrs_lists in args.chrs_exclude:
                for chrs_list in chrs_lists:
                    for chr_name in chrs_list.split(","):
                        exclude_chrs_regions_strings.append(chr_name)
        if args.chrs_exclude_file is not None:
            # NOTE(review): the include branch above uses
            # get_chrs_regions_string_lists_from_source(source=...); confirm
            # that this differently named helper exists and accepts what
            # --chrs-exclude-file yields.
            for chr_name in get_chrs_regions_string_list_from_file(
                    file_name=args.chrs_exclude_file):
                exclude_chrs_regions_strings.append(chr_name)
        include_regions = [
            parse_segment_chr_region(string)
            for string in include_chrs_regions_strings
        ]
        exclude_regions = [
            parse_segment_chr_region(string)
            for string in exclude_chrs_regions_strings
        ]
        adjacencies = filter_adjacencies_by_chromosomal_regions(
            adjacencies=adjacencies,
            include=include_regions,
            exclude=exclude_regions,
            include_both=args.include_both,
            exclude_both=args.exclude_both,
            include_spanning=args.include_spanning,
            exclude_spanning=args.exclude_spanning,
            annotate_retained=args.annotate_retained,
            annotate_retained_extra_field_prefix=args.annotate_extra_prefix,
            annotated_retained_segments_extra_field=args.
            annotate_seg_extra_field,
            annotate_short_circ=args.annotate_shirt_circ)
        # Regex entries come from repeated flags plus an optional file.
        keep_extra_field_entries = args.keep_extra_field_regex if args.keep_extra_field_regex is not None else []
        if args.keep_extra_field_regex_file is not None:
            keep_extra_field_entries.extend(
                list(
                    iter_over_string_entries_from_source(
                        source=args.keep_extra_field_regex_file)))
        remove_extra_field_entries = args.remove_extra_field_regex if args.remove_extra_field_regex is not None else []
        if args.remove_extra_field_regex_file is not None:
            remove_extra_field_entries.extend(
                list(
                    iter_over_string_entries_from_source(
                        source=args.remove_extra_field_regex_file)))
        keep_extra_field = get_extra_field_regexes(
            string_entries=keep_extra_field_entries)
        remove_extra_field = get_extra_field_regexes(
            string_entries=remove_extra_field_entries)
        adjacencies = filter_adjacencies_by_extra(
            adjacencies=adjacencies,
            keep_extra_field=keep_extra_field,
            keep_extra_field_missing_strategy=args.
            keep_extra_field_missing_strategy,
            remove_extra_field=remove_extra_field,
            remove_extra_field_missing_strategy=args.
            remove_extra_field_missing_strategy)
        adjacencies = filter_adjacencies_by_size(
            adjacencies=adjacencies,
            min_size=args.min_size,
            max_size=args.max_size,
            size_extra_field=args.size_extra_field,
            size_extra_seq_field=args.size_extra_seq_field,
            allow_inter_chr=args.allow_inter_chr,
            size_extra_field_abs=args.size_extra_field_abs,
            allow_intra_chr=args.allow_intra_chr,
        )
        write_adjacencies_to_destination(destination=args.rck_adj_file,
                                         adjacencies=adjacencies,
                                         sort_adjacencies=False,
                                         extra=extra)
        sys.exit(0)
    elif args.command == "reciprocal":
        adjacencies = read_adjacencies_from_source(source=args.rck_adj)
        processed_adjacencies = refined_adjacencies_reciprocal(
            novel_adjacencies=adjacencies,
            max_distance=args.max_distance,
            inplace=True)
    elif args.command == "haploid":
        adjacencies = itertools.chain(*(stream_adjacencies_from_source(
            source=rck_adj_source) for rck_adj_source in args.rck_adj))
        haploid_adjacencies = iter_haploid_adjacencies(adjacencies=adjacencies,
                                                       copy=False)
        write_adjacencies_to_destination(destination=args.rck_adj_file,
                                         adjacencies=haploid_adjacencies,
                                         sort_adjacencies=False,
                                         extra=extra)
        sys.exit(0)
    elif args.command == "update":
        adjacencies = read_adjacencies_from_source(source=args.rck_adj)
        source_adjacencies = read_adjacencies_from_source(source=args.source)
        extra_include = {
            v
            for v in args.include_extra_fields.split(",") if len(v) > 0
        }
        extra_exclude = {
            v
            for v in args.exclude_extra_fields.split(",") if len(v) > 0
        }
        # Attribute names must match the ``dest`` values declared on
        # update_parser (``coord_update``, ``coord1_update``, ...); the
        # previous ``args.update_*`` spellings raised AttributeError.
        processed_adjacencies = update_adjacencies(
            target_adjacencies=adjacencies,
            source_adjacencies=source_adjacencies,
            update_coords=args.coord_update,
            update_coord1=args.coord1_update,
            update_coord2=args.coord2_update,
            update_strands=args.strands_update,
            update_strand1=args.strand1_update,
            update_strand2=args.strand2_update,
            extra_exclude=extra_exclude,
            extra_include=extra_include,
            include_missing=args.include_missing)
    # Shared writer for the commands that did not exit above.
    if len(processed_adjacencies) > 0:
        write_adjacencies_to_destination(destination=args.rck_adj_file,
                                         adjacencies=processed_adjacencies,
                                         extra=extra,
                                         sort_adjacencies=args.sort)
Ejemplo n.º 3
0
def main():
    """Command-line entry point of ``RCK-UTILS-input-refine``.

    Reads a segment copy number tensor (``--scnt``) and adjacencies
    (``--adjacencies``), refines the tensor (fragment merging / gap filling,
    then refinement against adjacencies and telomere positions), and writes:

    * the fragments (segments after the first refinement step) to
      ``--output-fragments``;
    * the fully refined tensor to ``--output-scnt``.

    Arguments are taken from ``sys.argv``; nothing is returned.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-input-refine",
                                     parents=[get_logging_cli_parser()])
    parser.add_argument("--version", action="version", version=rck.version)
    parser.add_argument("--scnt", required=True)
    parser.add_argument("--adjacencies", required=True)
    parser.add_argument("--clone-ids", default=None)
    parser.add_argument("--scnt-separator", default="\t")
    parser.add_argument("--adjacencies-separator", default="\t")
    parser.add_argument("--no-merge-fragments",
                        action="store_false",
                        dest="merge_fragments")
    parser.add_argument("--fragments-max-merge-gap",
                        type=int,
                        default=1000000000)
    parser.add_argument("--no-fill-gaps-fragments",
                        action="store_false",
                        dest="fill_gaps_fragments")
    parser.add_argument("--fragments-max-fill-gap",
                        type=int,
                        default=1000000000)
    parser.add_argument("--no-allow-unit-segments",
                        action="store_false",
                        dest="allow_unit_segments")
    parser.add_argument("--telomere-positions", type=argparse.FileType("rt"))
    parser.add_argument("--telomere-positions-separator", default="\t")
    parser.add_argument("--output-scnt", required=True)
    parser.add_argument("--output-fragments", required=True)
    args = parser.parse_args()
    logger = get_standard_logger_from_args(
        args=args, program_name="RCK-UTILS-input-refine")
    clone_ids = args.clone_ids.split(
        ",") if args.clone_ids is not None else None
    # The attribute names below are the ones argparse derives from the
    # declared options ("--scnt" -> scnt, "--adjacencies" -> adjacencies);
    # the previous ``args.scnt_file`` / ``args.adj`` raised AttributeError.
    scnt_file = get_full_path(args.scnt)
    adj_file = get_full_path(args.adjacencies)
    segments, scnt = read_scnt_from_file(file_name=scnt_file,
                                         clone_ids=clone_ids,
                                         separator=args.scnt_separator)
    clone_ids = sorted(set(scnt.keys()))
    segments, scnt, segments_ids_mapping = refined_scnt(
        segments=segments,
        scnt=scnt,
        merge_fragments=args.merge_fragments,
        max_merge_gap=args.fragments_max_merge_gap,
        fill_gaps=args.fill_gaps_fragments,
        max_fill_gap=args.fragments_max_fill_gap)

    adjacencies = read_adjacencies_from_file(
        file_name=adj_file, separator=args.adjacencies_separator)
    if args.telomere_positions is not None:
        telomere_positions = read_positions_from_source(
            source=args.telomere_positions,
            separator=args.telomere_positions_separator)
    else:
        telomere_positions = []
    # Snapshot the segments before the adjacency/telomere refinement: this
    # copy is what gets written out as "fragments".
    fragments = deepcopy(segments)
    segments, scnt = refined_scnt_with_adjacencies_and_telomeres(
        segments=segments,
        scnt=scnt,
        adjacencies=adjacencies,
        telomere_positions=telomere_positions)
    # "--output-scnt" is stored as ``output_scnt``; ``args.refined_scnt_file``
    # never existed on the namespace.
    refined_scnt_file = os.path.expanduser(args.output_scnt)
    refined_scnt_file = os.path.abspath(refined_scnt_file)
    fragments_file = get_full_path(path=args.output_fragments)

    write_segments_to_file(file_name=fragments_file, segments=fragments)
    write_scnt_to_file(file_name=refined_scnt_file,
                       scnt=scnt,
                       segments=segments)
Ejemplo n.º 4
0
def main():
    """Command-line entry point of ``RCK-UTILS-SCNT-rck2x``.

    Converts an RCK segment copy number tensor into another representation,
    depending on the chosen sub-command:

    * ``shatterseek`` - writes a haploid version of the tensor for one clone
                        in a ShatterSeek-suitable format;
    * ``circa-dens``  - computes windowed amplification / deletion fractions
                        for one clone and writes them as Circa segments.

    Arguments are taken from ``sys.argv``; nothing is returned.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-SCNT-rck2x")
    cli_logging_parser = get_logging_cli_parser()
    chr_strip_parser = get_chromosome_strip_parser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    ####
    shatterseek_parser = subparsers.add_parser(
        "shatterseek", parents=[cli_logging_parser, chr_strip_parser])
    shatterseek_parser.add_argument("rck_scnt",
                                    type=argparse.FileType("rt"),
                                    default=sys.stdin)
    shatterseek_parser.add_argument("--clone-id", required=True)
    shatterseek_parser.add_argument("--separator", default="\t")
    shatterseek_parser.add_argument("--extra-separator", default=";")
    shatterseek_parser.add_argument("--default-cn", type=int, default=0)
    shatterseek_parser.add_argument("--output-header",
                                    action="store_true",
                                    dest="output_header")
    shatterseek_parser.add_argument("-o",
                                    "--output",
                                    type=argparse.FileType("wt"),
                                    default=sys.stdout)
    ####
    circa_dens_parser = subparsers.add_parser(
        "circa-dens", parents=[cli_logging_parser, chr_strip_parser])
    circa_dens_parser.add_argument("rck_scnt",
                                   type=argparse.FileType("rt"),
                                   default=sys.stdin)
    circa_dens_parser.add_argument("--clone-id", required=True)
    circa_dens_parser.add_argument("--separator", default="\t")
    circa_dens_parser.add_argument("--extra-separator", default=";")
    circa_dens_parser.add_argument("--cna-type",
                                   choices=["ampl", "del"],
                                   default="ampl")
    circa_dens_parser.add_argument("--haploid",
                                   action="store_true",
                                   dest="haploid")
    circa_dens_parser.add_argument("--inverse",
                                   action="store_true",
                                   dest="inverse")
    circa_dens_parser.add_argument("--window-size", type=int, default=10000000)
    circa_dens_parser.add_argument("--chr-sizes", type=argparse.FileType("rt"))
    circa_dens_parser.add_argument("-o",
                                   "--output",
                                   type=argparse.FileType("wt"),
                                   default=sys.stdout)
    ####
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args,
                                           program_name="RCK-UTILS-SCNT")

    if args.command == "shatterseek":
        logger.info(
            "Starting converting RCK Segment Copy Number Tensor data to ShatterSeek"
        )
        logger.debug(
            "Specified clone is {clone_id}".format(clone_id=args.clone_id))
        logger.info("Reading RCK formatted data from {file}".format(
            file=args.rck_scnt))
        segments, scnt = read_scnt_from_source(
            source=args.rck_scnt,
            separator=args.separator,
            extra_separator=args.extra_separator)
        logger.info(
            "Read CN data is translated into a haploid (!!!) version of itself."
        )
        haploid_scnt = get_haploid_scnt(segments=segments, scnt=scnt)
        logger.info(
            "Writing data for clone {clone_id} in a ShatterSeek suitable format to {file}"
            .format(clone_id=args.clone_id, file=args.output))
        write_scnt_to_shatterseek_destination(destination=args.output,
                                              segments=segments,
                                              scnt=haploid_scnt,
                                              clone_id=args.clone_id,
                                              default=args.default_cn,
                                              output_header=args.output_header)
    elif args.command == "circa-dens":
        logger.info(
            "Starting computing ampl/del statistics from RKC Segment Copy Number Tensor Format"
        )
        logger.debug(
            "Specified clone is {clone_id}".format(clone_id=args.clone_id))
        logger.info("Reading RCK formatted data from {file}".format(
            file=args.rck_scnt))
        segments, scnt = read_scnt_from_source(
            source=args.rck_scnt,
            separator=args.separator,
            extra_separator=args.extra_separator)
        # ``chr_sizes`` stays None when --chr-sizes was not supplied.
        chr_sizes = args.chr_sizes
        if args.chr_sizes is not None:
            chr_sizes = read_chr_sizes_from_source(source=args.chr_sizes)
        circa_segments_cna_fractions = get_circa_segments_cna_fractions(
            segments=segments,
            scnt=scnt,
            clone_id=args.clone_id,
            window_size=args.window_size,
            chr_sizes=chr_sizes,
            cna_type=args.cna_type,
            haploid=args.haploid)
        # From here on ``segments`` holds the Circa window segments, each
        # annotated with its (optionally inverted) fraction value.
        segments = []
        total_average = 0
        total_length = 0
        fraction_field = args.cna_type + "_fraction"
        for segment, cna_fraction in circa_segments_cna_fractions.items():
            value = cna_fraction * segment.length / args.window_size
            if args.inverse:
                value = 1 - value
            segment.extra[fraction_field] = value
            total_length += segment.length
            total_average += cna_fraction * segment.length
            segments.append(segment)
        if total_length > 0:
            logger.info("Total average cna fraction is " +
                        str(total_average / total_length))
        else:
            # No windows (or only zero-length ones): the length-weighted
            # average is undefined, so avoid a ZeroDivisionError.
            logger.warning(
                "Total average cna fraction is undefined: no segments of positive length were produced"
            )
        write_segments_to_circa_destination(
            destination=args.output,
            segments=segments,
            extra=[fraction_field])
    logger.info("Success!")
Ejemplo n.º 5
0
def main():
    """Command-line entry point of RCK-UTILS-ADJ-x2rck.

    One sub-command is registered per supported input format (lumpy,
    longranger, naibr, manta, sniffles, grocsvs, delly, pbsv, remixt,
    gundem2015, survivor, svaba, breakdancer).  The selected input is
    converted into RCK adjacencies, the adjacencies are filtered by the
    requested include/exclude chromosomal regions and the survivors are
    written to the ``--output`` destination (stdout by default).

    With ``delly --stream`` the work is delegated to
    ``delly_vcf_to_nas_stream`` and the process exits with status 0 right
    after it, so the filtering and writing steps at the end are skipped.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-ADJ-x2rck")
    parser.add_argument('--version', action='version', version=rck.version)
    # Options shared by every sub-command; the output destination is stored
    # as ``args.rck_adj_file``.
    shared_parser = get_shared_nas_parser()
    shared_parser.add_argument("--output",
                               "-o",
                               dest="rck_adj_file",
                               type=argparse.FileType("wt"),
                               default=sys.stdout)
    cli_logging_parser = get_logging_cli_parser()
    chr_strip_parser = get_chromosome_strip_parser()
    shared_parents = [shared_parser, cli_logging_parser, chr_strip_parser]
    ####
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    # NOTE: argparse only honours ``default`` on a positional argument when
    # ``nargs`` is "?" or "*"; the positional inputs below are therefore
    # required and their ``default=sys.stdin`` is never used.
    ####
    lumpy_parser = subparsers.add_parser(
        "lumpy",
        parents=shared_parents,
        help="Convert Lumpy VCF SV calls into RCK NAS format")
    lumpy_parser.add_argument("--id-suffix", dest="id_suffix", default="lumpy")
    lumpy_parser.add_argument("lumpy_vcf_file",
                              type=argparse.FileType("rt"),
                              default=sys.stdin)
    ####
    longranger_parser = subparsers.add_parser(
        "longranger",
        parents=shared_parents,
        help="Convert LongRanger VCF SV calls into RCK NAS format")
    longranger_parser.add_argument("--id-suffix",
                                   dest="id_suffix",
                                   default="longranger")
    longranger_parser.add_argument("longranger_vcf_file",
                                   type=argparse.FileType("rt"),
                                   default=sys.stdin)
    ####
    naibr_parser = subparsers.add_parser(
        "naibr",
        parents=shared_parents,
        help="Convert NAIBR NAS calls into RCK NAS format")
    naibr_parser.add_argument("--id-suffix", dest="id_suffix", default="naibr")
    naibr_parser.add_argument("naibr_file",
                              type=argparse.FileType("rt"),
                              default=sys.stdin)
    ####
    manta_parser = subparsers.add_parser(
        "manta",
        parents=shared_parents,
        help="Convert Manta VCF SV calls into RCK NAS format")
    manta_parser.add_argument("--id-suffix", dest="id_suffix", default="manta")
    manta_parser.add_argument("manta_vcf_file",
                              type=argparse.FileType("rt"),
                              default=sys.stdin)
    ####
    sniffles_parser = subparsers.add_parser(
        "sniffles",
        parents=shared_parents,
        help="Convert Sniffles VCF SV calls into RCK NAS format")
    sniffles_parser.add_argument("--id-suffix",
                                 dest="id_suffix",
                                 default="sniffles")
    sniffles_parser.add_argument("sniffles_vcf_file",
                                 type=argparse.FileType("rt"),
                                 default=sys.stdin)
    ####
    grocsv = subparsers.add_parser(
        "grocsvs",
        parents=shared_parents,
        help="Convert GROCSVS VCF SV calls into RCK NAS format")
    grocsv.add_argument("--id-suffix", dest="id_suffix", default="grocsv")
    grocsv.add_argument("grocsv_vcf_file",
                        type=argparse.FileType("rt"),
                        default=sys.stdin)
    grocsv.add_argument("--samples")
    grocsv.add_argument("--samples-all-any",
                        choices=["all", "any"],
                        default="any")
    grocsv.add_argument("--samples-only",
                        action="store_true",
                        dest="samples_only")
    ####
    delly = subparsers.add_parser(
        "delly",
        parents=shared_parents,
        help="Convert Delly VCF SV calls into RCK NAS format")
    delly.add_argument("--id-suffix", dest="id_suffix", default="delly")
    delly.add_argument("delly_vcf_file",
                       type=argparse.FileType("rt"),
                       default=sys.stdin)
    delly.add_argument("--stream",
                       action="store_true",
                       dest="delly_force_stream")
    ####
    pbsv = subparsers.add_parser(
        "pbsv",
        parents=shared_parents,
        help="Convert PBSV VCF SV calls into RCK NAS format")
    pbsv.add_argument("--id-suffix", dest="id_suffix", default="pbsv")
    pbsv.add_argument("--sample", default=None)
    pbsv.add_argument("pbsv_vcf_file",
                      type=argparse.FileType("rt"),
                      default=sys.stdin)
    ####
    remixt = subparsers.add_parser(
        "remixt",
        parents=shared_parents,
        help="Convert ReMixT Novel adjacencies calls into RCK format")
    remixt.add_argument("--i-separator", default="\t")
    remixt.add_argument("--id-suffix", dest="id_suffix", default="remixt")
    remixt.add_argument("--clone-ids",
                        choices=["1", "2", "1,2"],
                        default="1,2")
    remixt.add_argument("--skip-absent",
                        action="store_true",
                        dest="skip_absent")
    remixt.add_argument("--no-remixt-na-correction",
                        action="store_false",
                        dest="remixt_correction")
    remixt.add_argument("remixt_file",
                        type=argparse.FileType("rt"),
                        default=sys.stdin)
    ####
    gundem2015_parser = subparsers.add_parser(
        "gundem2015",
        parents=shared_parents,
        help="Convert SV calls from Gundem et al (2015) (BRASS2???) "
        "into RCK NAS format")
    gundem2015_parser.add_argument("--id-suffix",
                                   dest="id_suffix",
                                   default="gundem2015")
    gundem2015_parser.add_argument("gundem2015_file",
                                   type=argparse.FileType("rt"),
                                   default=sys.stdin)
    gundem2015_parser.add_argument("--i-separator", default="\t")
    gundem2015_parser.add_argument("--samples", nargs="+", required=True)
    gundem2015_parser.add_argument("--min-sample-cnt", type=int, default=1)
    gundem2015_parser.add_argument("--no-flip-second-strand",
                                   action="store_false",
                                   dest="flip_second_strand")
    ####
    survivor_parser = subparsers.add_parser(
        "survivor",
        parents=shared_parents,
        help="Covert SURVIVOR SV merging results into RCK format")
    survivor_parser.add_argument("--id-suffix",
                                 dest="id_suffix",
                                 default="survivor")
    survivor_parser.add_argument("survivor_vcf_file",
                                 type=argparse.FileType("rt"),
                                 default=sys.stdin)
    survivor_parser.add_argument("--samples")
    survivor_parser.add_argument("--samples-sources")
    survivor_parser.add_argument("--samples-separator", default="\t")
    survivor_parser.add_argument("--samples-extra-separator", default=";")
    survivor_parser.add_argument("--samples-suffix-extra",
                                 action="store_true",
                                 dest="suffix_sample_extra")
    survivor_parser.add_argument("--survivor-prefix", default="")
    ####
    svaba_parser = subparsers.add_parser(
        "svaba",
        parents=shared_parents,
        help="Convert SvABA SV calls into RCK format")
    svaba_parser.add_argument("--id-suffix", dest="id_suffix", default="svaba")
    svaba_parser.add_argument("svaba_vcf_file",
                              type=argparse.FileType("rt"),
                              default=sys.stdin)
    svaba_parser.add_argument("--i-type",
                              choices=["indel", "sv"],
                              default="sv")
    svaba_parser.add_argument("--samples")
    svaba_parser.add_argument("--samples-all-any",
                              choices=["all", "any"],
                              default="any")
    svaba_parser.add_argument("--samples-only",
                              action="store_true",
                              dest="samples_only")
    ####
    breakdancer_parser = subparsers.add_parser(
        "breakdancer",
        parents=shared_parents,
        help="Convert Breakdancer(-max) SV calls into RCK format")
    breakdancer_parser.add_argument("--id-suffix",
                                    dest="id_suffix",
                                    default="breakdancer")
    breakdancer_parser.add_argument("breakdancer_file",
                                    type=argparse.FileType("rt"),
                                    default=sys.stdin)
    ####
    args = parser.parse_args()
    setup = build_setup(args=args)
    logger = get_standard_logger_from_args(args=args,
                                           program_name="RCK-UTILS-ADJ-x2rck")
    nas = []
    # ``extra`` ends up as None (no extra fields), the literal string "all",
    # or a list of field names split on commas.
    if args.o_extra_fields is None or len(
            args.o_extra_fields) == 0 or args.o_extra_fields == ",":
        extra = None
    elif args.o_extra_fields != "all":
        extra = args.o_extra_fields.split(",")
    else:
        extra = args.o_extra_fields
    if args.command == "lumpy":
        logger.info(
            "Starting converting adjacencies from the Lumpy VCF format to that of RCK"
        )
        logger.info("Reading Lumpy VCF records from {file}".format(
            file=args.lumpy_vcf_file))
        lumpy_vcf_records = get_vcf_records_from_source(
            source=args.lumpy_vcf_file)
        logger.info("Converting Lumpy VCF records to RCK adjacencies")
        nas = get_nas_from_lumpy_vcf_records(
            lumpy_vcf_records=lumpy_vcf_records, setup=setup)
    elif args.command == "longranger":
        logger.info(
            "Starting converting adjacencies from the LongRanger VCf format to that of RCK"
        )
        logger.info("Reading LongRanger VCF records from {file}".format(
            file=args.longranger_vcf_file))
        longranger_vcf_records = get_vcf_records_from_source(
            source=args.longranger_vcf_file)
        logger.info('Converting LongRanger VCF records to RCK adjacencies')
        nas = get_nas_from_longranger_vcf_records(
            longranger_vcf_records=longranger_vcf_records, setup=setup)
    elif args.command == "naibr":
        logger.info(
            "Starting converting adjacencies from NAIBR records to that of RCK"
        )
        logger.info(
            "Reading and converting NAIBR records from {file} to RCK adjacencies"
            .format(file=args.naibr_file))
        nas = get_nas_from_naibr_source(source=args.naibr_file, setup=setup)
    elif args.command == "manta":
        logger.info(
            "Starting converting adjacencies from Manta records to that of RCK"
        )
        logger.info("Reading Manta VCF records from {file}".format(
            file=args.manta_vcf_file))
        manta_vcf_records = get_vcf_records_from_source(
            source=args.manta_vcf_file)
        logger.info("Converting Manta VCF records to RCK adjacencies")
        nas = get_nas_from_manta_vcf_records(
            manta_vcf_records=manta_vcf_records, setup=setup)
    elif args.command == "sniffles":
        logger.info(
            "Starting converting adjacencies from Sinffles records to that of RCK"
        )
        logger.info("Reading Sniffles VCF records from {file}".format(
            file=args.sniffles_vcf_file))
        sniffles_vcf_records = get_vcf_records_from_source(
            source=args.sniffles_vcf_file)
        logger.info("Converting Sniffles VCF records to RCK adjacencies")
        nas = get_nas_from_sniffles_vcf_records(
            sniffles_vcf_records=sniffles_vcf_records, setup=setup)
    elif args.command == "grocsvs":
        logger.info(
            "Starting converting adjacencies from GROCSVS records to that of RCK"
        )
        logger.info("Reading GROCSVS VCF records from {file}".format(
            file=args.grocsv_vcf_file))
        samples = args.samples.split(
            ",") if args.samples is not None else args.samples
        grocsv_vcf_records = get_vcf_records_from_source(
            source=args.grocsv_vcf_file)
        logger.info("Converting GROCSVS VCF records to RCK adjacencies")
        nas = get_nas_from_grocsv_vcf_records(
            grocsv_vcf_records=grocsv_vcf_records,
            setup=setup,
            samples=samples,
            samples_all_any=args.samples_all_any,
            samples_only=args.samples_only)
    elif args.command == "delly":
        logger.info(
            "Starting converting adjacencies from Delly records to that of RCK"
        )
        if args.delly_force_stream:
            # The output destination is registered with dest="rck_adj_file";
            # the stream path must use that attribute as well.
            logger.info("Forced stream is enabled")
            logger.info(
                "Streamlining reading and converting Delly VCF from {i_file} to RCK adjacencies into {o_file}"
                "".format(i_file=args.delly_vcf_file,
                          o_file=args.rck_adj_file))
            delly_vcf_to_nas_stream(source=args.delly_vcf_file,
                                    dest=args.rck_adj_file,
                                    setup=setup,
                                    extra=extra)
            # Streaming mode stops here; the filtering/writing below is
            # not performed.
            sys.exit(0)
        else:
            logger.info("Reading Delly VCF records from {file}".format(
                file=args.delly_vcf_file))
            delly_vcf_records = get_vcf_records_from_source(
                source=args.delly_vcf_file)
            logger.info("Converting Delly VCF records to rCK adjacencies")
            nas = get_nas_from_delly_vcf_records(
                delly_vcf_records=delly_vcf_records, setup=setup)
    elif args.command == "pbsv":
        logger.info(
            "Starting converting adjacencies from PBSV records to that of RCK")
        logger.info("Reading PBSV VCF records from {file}".format(
            file=args.pbsv_vcf_file))
        pbsv_vcf_records = get_vcf_records_from_source(
            source=args.pbsv_vcf_file)
        logger.info("Converting PBSV VCF records to RCK adjacencies")
        nas = get_nas_from_pbsv_vcf_records(pbsv_vcf_records=pbsv_vcf_records,
                                            setup=setup,
                                            sample=args.sample)
    elif args.command == "gundem2015":
        logger.info(
            "Starting converting adjacencies from Gundem et al 2015 (BRASS2???) to that of RCK"
        )
        logger.info(
            "Reading Gundem 2015 et al (BRASS???) records from {file}".format(
                file=args.gundem2015_file))
        nas = get_nas_from_gundem2015_source(
            source=args.gundem2015_file,
            setup=setup,
            separator=args.i_separator,
            flip_second_strand=args.flip_second_strand)
        logger.info(
            "Extracting adjacencies for sample {samples} with a minimum cnt of {min_cnt}"
            .format(samples=",".join(args.samples),
                    min_cnt=args.min_sample_cnt))
        nas = processed_gundem2015_adjacencies(
            adjacencies=nas,
            sample_names=args.samples,
            min_per_sample_cnt=args.min_sample_cnt)
    elif args.command == "remixt":
        logger.info(
            "Starting converting adjacencies and their (haploid) copy numbers from ReMixT to that of RCK"
        )
        logger.info(
            "Reading and converting ReMixT resotds from {file} to RCK adjacencies"
            .format(file=args.remixt_file))
        clone_ids = args.clone_ids.split(",")
        nas = get_nas_from_remixt_source(
            source=args.remixt_file,
            setup=setup,
            separator=args.i_separator,
            clone_ids=clone_ids,
            skip_absent=args.skip_absent,
            remixt_na_correction=args.remixt_correction)
    elif args.command == "survivor":
        sample_names = args.samples.split(
            ",") if args.samples is not None else []
        sample_sources = args.samples_sources.split(
            ",") if args.samples_sources is not None else []
        if len(sample_names) != len(sample_sources):
            # Only a warning: zip() below pairs names and sources up to the
            # shorter of the two lists.
            logger.warning(
                "Provided samples' length {sample_cnt} ({samples}) does not match that of samples source length {sample_sources_cnt} ({sample_sources})"
                "".format(sample_cnt=len(sample_names),
                          samples=",".join(sample_names),
                          sample_sources_cnt=len(sample_sources),
                          sample_sources=",".join(sample_sources)))
        logger.info(
            "Starting converting adjacencies from SURVIVOR to that of RCK")
        logger.info("Reading SURVIVOR records from {file}".format(
            file=args.survivor_vcf_file))
        survivor_vcf_records = get_vcf_records_from_source(
            source=args.survivor_vcf_file)
        logger.debug("Reading source-samples adjacencies (in RCK format)")
        adjacencies_by_ids_by_sample_name = {}
        for sample_name, sample_source in zip(sample_names, sample_sources):
            try:
                file_name = get_full_path(sample_source)
                adjacencies = read_adjacencies_from_file(
                    file_name=file_name,
                    separator=args.samples_separator,
                    extra_separator=args.samples_extra_separator)
                # Index by the external id when present, otherwise by the
                # adjacency's own non-phased stable id.
                adjacencies_by_ids = {
                    adj.extra.get(EXTERNAL_NA_ID, adj.stable_id_non_phased):
                    adj
                    for adj in adjacencies
                }
                adjacencies_by_ids_by_sample_name[
                    sample_name] = adjacencies_by_ids
            except IOError:
                # Best effort: an unreadable sample source is skipped.
                logger.warning(
                    "Unable to reader source adjacency information from {source}"
                    .format(source=sample_source))
        logger.info("Converting SURVIVOR VCF records from {file}".format(
            file=args.survivor_vcf_file))
        nas = get_nas_from_survivor_vcf_records(
            survivor_vcf_records=survivor_vcf_records,
            setup=setup,
            adjacencies_by_ids_by_sample_name=adjacencies_by_ids_by_sample_name,
            suffix_sample_extra=args.suffix_sample_extra,
            survivor_prefix=args.survivor_prefix)
    elif args.command == "svaba":
        logger.info("Starting converting adjacencies from SvABA to RCK")
        logger.info("Reading SvABA VCF records from {file}".format(
            file=args.svaba_vcf_file))
        svaba_vcf_records = get_vcf_records_from_source(
            source=args.svaba_vcf_file)
        logger.info("Converting SvABA VCF records to RCK adjacencies")
        samples = args.samples.split(
            ",") if args.samples is not None else args.samples
        nas = get_nas_from_svaba_vcf_records(
            svaba_vcf_records=svaba_vcf_records,
            source_type=args.i_type,
            setup=setup,
            samples=samples,
            samples_all_any=args.samples_all_any,
            samples_only=args.samples_only)
    elif args.command == "breakdancer":
        logger.info(
            "Starting converting adjacencies from Breakdancer(-max) to RCK")
        logger.info(
            "Reading and converting Breakdancer(-max) records from {file}".
            format(file=args.breakdancer_file))
        nas = get_nas_from_breakdancer_source(source=args.breakdancer_file,
                                              setup=setup)
    logger.info(
        "A total of {cnt} adjacencies were obtained.".format(cnt=len(nas)))
    # ``extra`` may be None or the plain string "all"; joining either one
    # directly would raise or produce "a,l,l".
    if extra is None:
        extra_repr = ""
    elif isinstance(extra, str):
        extra_repr = extra
    else:
        extra_repr = ",".join(extra)
    logger.debug("Output extra fields were identified as {o_extra}".format(
        o_extra=extra_repr))
    include_chrs_regions_strings = []
    exclude_chrs_regions_strings = []
    # ``--chrs-include`` / ``--chrs-exclude`` values are iterated as a list
    # of lists of comma-separated region strings.
    if args.chrs_include is not None:
        for chrs_lists in args.chrs_include:
            for chrs_list in chrs_lists:
                include_chrs_regions_strings.extend(chrs_list.split(","))
    if args.chrs_include_file is not None:
        include_chrs_regions_strings.extend(
            get_chrs_regions_string_lists_from_source(
                source=args.chrs_include_file))
    if args.chrs_exclude is not None:
        for chrs_lists in args.chrs_exclude:
            for chrs_list in chrs_lists:
                exclude_chrs_regions_strings.extend(chrs_list.split(","))
    if args.chrs_exclude_file is not None:
        # NOTE(review): the include counterpart above goes through
        # get_chrs_regions_string_lists_from_source(source=...); confirm that
        # the *_from_file(file_name=...) helper accepts whatever type
        # args.chrs_exclude_file holds.
        exclude_chrs_regions_strings.extend(
            get_chrs_regions_string_list_from_file(
                file_name=args.chrs_exclude_file))
    include_regions = [
        parse_segment_chr_region(string)
        for string in include_chrs_regions_strings
    ]
    exclude_regions = [
        parse_segment_chr_region(string)
        for string in exclude_chrs_regions_strings
    ]
    logger.debug("Include chromosomes : {include_chromosomes}".format(
        include_chromosomes=",".join(map(str, include_regions))))
    logger.debug("Exclude chromosomes : {exclude_chromosomes}".format(
        exclude_chromosomes=",".join(map(str, exclude_regions))))
    logger.info("Filtering adjacencies based on input/exclude chromosomes")
    nas = filter_adjacencies_by_chromosomal_regions(
        adjacencies=nas,
        include=include_regions,
        exclude=exclude_regions,
        include_both=args.include_both,
        exclude_both=args.exclude_both)
    # Materialise the filter result so that len() can be reported.
    nas = list(nas)
    logger.info(
        "A total of {cnt} adjacencies were retained after filtering".format(
            cnt=len(nas)))
    logger.info(
        "Writing RCK adjacencies to {file}".format(file=args.rck_adj_file))
    write_adjacencies_to_destination(destination=args.rck_adj_file,
                                     adjacencies=nas,
                                     extra=extra)
    logger.info("Success")
Ejemplo n.º 6
0
def main():
    """Command-line entry point of RCK-UTILS-NAS-rck2x.

    Reads RCK-formatted adjacencies from the positional ``rck_adj`` input
    and, depending on the chosen sub-command, writes them out as

    * ``vcf-sniffles`` -- VCF (Sniffles) records,
    * ``circa``        -- a Circa-suitable TSV,
    * ``circa-dens``   -- per-window breakend/adjacency counts for Circa,
    * ``bedpe``        -- BEDPE records (intra-chromosomal adjacencies only).

    Every sub-command writes to ``--output`` / ``-o`` (stdout by default).
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-NAS-rck2x")
    parser.add_argument('--version', action='version', version=rck.version)
    cli_logging_parser = get_logging_cli_parser()
    ###
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    # NOTE: argparse only honours ``default`` on a positional argument when
    # ``nargs`` is "?" or "*"; the ``rck_adj`` positionals below are
    # therefore required and their ``default=sys.stdin`` is never used.
    ###
    vcf_parser = subparsers.add_parser(
        "vcf-sniffles",
        parents=[cli_logging_parser],
        help="Convert RCK Adjacencies to the VCF (Sniffles) format")
    vcf_parser.add_argument("rck_adj",
                            type=argparse.FileType("rt"),
                            default=sys.stdin)
    vcf_parser.add_argument("--separator", default="\t")
    vcf_parser.add_argument("--extra-separator", default=";")
    vcf_parser.add_argument("--output",
                            "-o",
                            type=argparse.FileType("wt"),
                            default=sys.stdout)
    vcf_parser.add_argument("--o-extra-fields", default="all")
    vcf_parser.add_argument("--o-no-include-ref",
                            action="store_false",
                            dest="include_ref")
    vcf_parser.add_argument("--clone-suffix", default="")
    vcf_parser.add_argument("--dummy-clone", default="dummy_clone")
    vcf_parser.add_argument("--dummy-clone-gt-extra")
    vcf_parser.add_argument("--dummy-gt", default="./.")
    vcf_parser.add_argument("--alt-extra")
    vcf_parser.add_argument("--ref-extra")
    ###
    circa_parser = subparsers.add_parser(
        "circa",
        parents=[cli_logging_parser],
        help="Convert RCK Adjacencies to the TSV format supported by Circa")
    circa_parser.add_argument("rck_adj",
                              type=argparse.FileType("rt"),
                              default=sys.stdin)
    circa_parser.add_argument("--separator", default="\t")
    circa_parser.add_argument("--extra-separator", default=";")
    circa_parser.add_argument("--size-extra-field")
    circa_parser.add_argument("--size-extra-field-no-abs",
                              action="store_false",
                              dest="size_extra_field_abs")
    circa_parser.add_argument("--size-extra-seq-field")
    circa_parser.add_argument("--output",
                              "-o",
                              type=argparse.FileType("wt"),
                              default=sys.stdout)
    ###
    circa_density_parser = subparsers.add_parser(
        "circa-dens",
        parents=[cli_logging_parser],
        help=
        "Convert RCK Adjacencies to the TSV format with adjacencies density cnt per window supported by Circa"
    )
    circa_density_parser.add_argument("rck_adj",
                                      type=argparse.FileType("rt"),
                                      default=sys.stdin)
    circa_density_parser.add_argument("--separator", default="\t")
    circa_density_parser.add_argument("--extra-separator", default=";")
    circa_density_parser.add_argument("--window-size",
                                      type=int,
                                      default=10000000)
    circa_density_parser.add_argument("--chr-sizes",
                                      type=argparse.FileType("rt"))
    circa_density_parser.add_argument("--element",
                                      choices=["breakend", "adj"],
                                      default="breakend")
    circa_density_parser.add_argument("--element-adj-cnt-full",
                                      action="store_true",
                                      dest="circa_element_adj_cnt_full")
    circa_density_parser.add_argument("-o",
                                      "--output",
                                      type=argparse.FileType("wt"),
                                      default=sys.stdout)
    ###
    bedpe_parser = subparsers.add_parser(
        "bedpe",
        parents=[cli_logging_parser],
        help=
        "Convert RCK Adjacencies to the BEDPE format with only intra-chromosomal adjacencies considered"
    )
    bedpe_parser.add_argument("rck_adj",
                              type=argparse.FileType("rt"),
                              default=sys.stdin)
    bedpe_parser.add_argument("--separator", default="\t")
    bedpe_parser.add_argument("--extra-separator", default=";")
    bedpe_parser.add_argument("--name-extra-field", default=None)
    bedpe_parser.add_argument("-o",
                              "--output",
                              type=argparse.FileType("wt"),
                              default=sys.stdout)
    ###
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args,
                                           program_name="RCK-UTILS-NAS-rck2x")
    logger.info("Reading adjacencies from {file}".format(file=args.rck_adj))
    adjacencies = read_adjacencies_from_source(
        source=args.rck_adj,
        extra_separator=args.extra_separator,
        separator=args.separator)
    if args.command == "vcf-sniffles":
        if not args.include_ref:
            logger.debug(
                "Reference adjacencies were excluded from the output.")
            adjacencies = [
                adj for adj in adjacencies
                if adj.adjacency_type == AdjacencyType.NOVEL
            ]
        # ``extra`` ends up as None (no extra fields), the literal string
        # "all", or a list of field names split on commas.
        if args.o_extra_fields is None or len(
                args.o_extra_fields) == 0 or args.o_extra_fields == ",":
            extra = None
        elif args.o_extra_fields != "all":
            extra = args.o_extra_fields.split(",")
        else:
            extra = args.o_extra_fields
        # Joining the plain string "all" would log "a,l,l"; show it verbatim.
        if extra is None:
            extra_repr = ""
        elif isinstance(extra, str):
            extra_repr = extra
        else:
            extra_repr = ",".join(extra)
        logger.debug("Output extra fields are identified as {o_extra}".format(
            o_extra=extra_repr))
        logger.info(
            "Converting RCK formatted adjacencies to the VCF (Sniffles) format"
        )
        logger.info("Writing adjacencies to {file}".format(file=args.output))
        write_adjacencies_to_vcf_sniffles_destination(
            destination=args.output,
            adjacencies=adjacencies,
            extra=extra,
            dummy_clone=args.dummy_clone,
            clone_suffix=args.clone_suffix,
            alt_extra=args.alt_extra,
            ref_extra=args.ref_extra,
            dummy_clone_gt_extra=args.dummy_clone_gt_extra,
            dummy_gt=args.dummy_gt)
    elif args.command == "circa":
        logger.info(
            "Converting input RCK formatted adjacencies into a Circa suitable format (extra column get transformed into a size column)"
        )
        logger.info(
            "Writing adjacencies info suitable for Circa to {file}".format(
                file=args.output))
        write_adjacencies_to_circa_destination(
            destination=args.output,
            adjacencies=adjacencies,
            size_extra_field=args.size_extra_field,
            size_extra_seq_field=args.size_extra_seq_field,
            size_abs=args.size_extra_field_abs)
    elif args.command == "circa-dens":
        logger.info(
            "Computing cnt of input RCK formatted adjacencies per window into a CIRCA suitable format"
        )
        # ``chr_sizes`` stays None when no --chr-sizes file was supplied.
        chr_sizes = args.chr_sizes
        if args.chr_sizes is not None:
            chr_sizes = read_chr_sizes_from_source(source=args.chr_sizes)
        circa_adj_cnts = get_circa_adj_cnt(
            adjacencies=adjacencies,
            window_size=args.window_size,
            chr_sizes=chr_sizes,
            element=args.element,
            adj_full_cnt=args.circa_element_adj_cnt_full)
        cnt_field = args.element + "_cnt"
        segments = []
        for segment, cnt in circa_adj_cnts.items():
            # Scale the count by the segment's share of a full window.
            segment.extra[cnt_field] = cnt * segment.length / args.window_size
            segments.append(segment)
        write_segments_to_circa_destination(destination=args.output,
                                            segments=segments,
                                            extra=[cnt_field])
    elif args.command == "bedpe":
        logger.info(
            f"Converting and writing input RCK formatted adjacencies into BEDPE format to {args.output}"
        )
        # The sub-command is documented as considering only
        # intra-chromosomal adjacencies, so inter-chromosomal ones must not
        # be allowed through this filter.
        adjacencies = filter_adjacencies_by_size(adjacencies=adjacencies,
                                                 allow_inter_chr=False)
        write_adjacencies_to_bedpe_destination(
            destination=args.output,
            adjacencies=adjacencies,
            name_extra_field=args.name_extra_field)
    logger.info("Success")
Ejemplo n.º 7
0
def main():
    """Command-line entry point for the RCK long-read (LR) utilities.

    Sub-commands:

    * ``extract-lr`` -- read adjacencies, split the comma-separated read names
      stored under the ``--lr-field`` extra key of every adjacency, and print
      each read name that is listed by at least ``--min-sv-cnt`` adjacencies.
    * ``filter-alignment`` -- copy to the output alignment only those entries
      of the input alignment whose query name is in the ``--reads`` set.
    * ``label-const-inf`` -- call ``infer_labeling_constraints`` on the given
      adjacencies and alignment.
    * ``label-const-com`` -- placeholder; currently does nothing.
    """
    parser = argparse.ArgumentParser()
    logging_parser = get_logging_cli_parser()
    ########
    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    ########
    lr_extraction_parser = subparsers.add_parser("extract-lr",
                                                 parents=[logging_parser])
    # nargs="?" makes the positional optional; without it argparse always
    # demands a value and the stdin default can never take effect.
    lr_extraction_parser.add_argument("rck_nas",
                                      nargs="?",
                                      type=argparse.FileType("rt"),
                                      default=sys.stdin)
    lr_extraction_parser.add_argument("-o",
                                      "--output",
                                      type=argparse.FileType("wt"),
                                      default=sys.stdout)
    lr_extraction_parser.add_argument("--min-sv-cnt", type=int, default=2)
    lr_extraction_parser.add_argument("--lr-field",
                                      default="support_read_names")
    #########
    lr_alignment_filter_parser = subparsers.add_parser(
        "filter-alignment", parents=[logging_parser])
    lr_alignment_filter_parser.add_argument("alignment",
                                            nargs="?",
                                            type=str,
                                            default="-")
    lr_alignment_filter_parser.add_argument("--i-alignment-format",
                                            type=str,
                                            choices=["bam", "sam", "cram"],
                                            default="bam")
    lr_alignment_filter_parser.add_argument("-r",
                                            "--reads",
                                            type=argparse.FileType("rt"),
                                            required=True)
    lr_alignment_filter_parser.add_argument("--r-separator", default="\t")
    lr_alignment_filter_parser.add_argument("--s-separator", default="\t")
    lr_alignment_filter_parser.add_argument("-o",
                                            "--output",
                                            type=str,
                                            default="-")
    lr_alignment_filter_parser.add_argument("--o-alignment-format",
                                            type=str,
                                            choices=["bam", "sam", "cram"],
                                            default="bam")
    #########
    labeling_constraint_inference_parser = subparsers.add_parser(
        "label-const-inf", parents=[logging_parser])
    # Same optional-positional treatment as in "filter-alignment", so that the
    # "-" default is actually reachable.
    labeling_constraint_inference_parser.add_argument("alignment",
                                                      nargs="?",
                                                      type=str,
                                                      default="-")
    labeling_constraint_inference_parser.add_argument(
        "--i-alignment-format",
        type=str,
        choices=["bam", "sam", "cram"],
        default="bam")
    labeling_constraint_inference_parser.add_argument(
        "--rck-nas", type=argparse.FileType("rt"), required=True)
    labeling_constraint_inference_parser.add_argument("--min-sv-cnt",
                                                      type=int,
                                                      default=2)
    labeling_constraint_inference_parser.add_argument(
        "--lr-field", default="support_read_names")
    # Output destinations must be opened for writing ("wt"), not reading.
    labeling_constraint_inference_parser.add_argument(
        "-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    #########
    labeling_constraint_combine_parser = subparsers.add_parser(
        "label-const-com", parents=[logging_parser])
    # NOTE: argparse does not rewrite dashes in positional names, so this value
    # is only reachable via getattr(args, "label-constr").
    labeling_constraint_combine_parser.add_argument(
        "label-constr", type=argparse.FileType("rt"), nargs="+")
    labeling_constraint_combine_parser.add_argument(
        "-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    #########
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args,
                                           program_name="RCK-UTILS-LR")
    if args.command == "extract-lr":
        nas = read_adjacencies_from_source(source=args.rck_nas)
        # read name -> adjacencies that list this read as support
        reads_to_nas = defaultdict(list)
        for na in nas:
            reads_str = na.extra.get(args.lr_field, "")
            for read in reads_str.split(","):
                if not read:
                    # empty tokens come from a missing/empty field or stray commas
                    continue
                reads_to_nas[read].append(na)
        # Dictionary keys are already unique; iterating them directly keeps
        # the output in first-seen order instead of arbitrary set order.
        for read_name, supporting_nas in reads_to_nas.items():
            if len(supporting_nas) >= args.min_sv_cnt:
                print(read_name, file=args.output)
    elif args.command == "filter-alignment":
        reads = get_reads_set_from_source(source=args.reads)
        imode = get_mode_str(format=args.i_alignment_format, input=True)
        omode = get_mode_str(format=args.o_alignment_format, input=False)
        with pysam.AlignmentFile(args.alignment, imode) as i_stream:
            # template=i_stream makes the output reuse the input header
            with pysam.AlignmentFile(args.output, omode,
                                     template=i_stream) as o_stream:
                for entry in i_stream:
                    if entry.qname in reads:
                        o_stream.write(entry)
    elif args.command == "label-const-inf":
        # TODO: the inferred constraints are not written to args.output yet.
        constraints = infer_labeling_constraints(
            rck_nas_source=args.rck_nas,
            alignment_file=args.alignment,
            i_alignment_format=args.i_alignment_format,
            lr_field=args.lr_field,
            min_sv_cnt=args.min_sv_cnt,
            logger=logger)

    elif args.command == "label-const-com":
        # Must match the name the sub-parser was registered under.
        pass
Ejemplo n.º 8
0
def main():
    """Command-line entry point for inferring RCK adjacency groups.

    Sub-commands:

    * ``sniffles-m`` -- read adjacencies, infer molecule adjacency groups with
      ``infer_sniffles_molecule_groups`` and write them to ``--output``.
    * ``short-l`` -- read adjacencies, infer labeling adjacency groups with
      ``infer_short_nas_labeling_groups``, refine them unless ``--no-refine``
      is given, and write them to ``--output``.
    * ``sniffles-l`` -- same flow as ``short-l`` but groups are inferred with
      ``infer_alignment_labeling_groups`` from the adjacencies plus an
      alignment file.
    * ``filter-alignment`` -- call ``filter_alignment`` on the adjacencies and
      the input alignment, then terminate with exit status 0.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-ADJ-GROUPS-infer")
    parser.add_argument("--version", action="version", version=rck.version)
    cli_logging_parser = get_logging_cli_parser()

    subparsers = parser.add_subparsers(title="commands", dest="command")
    subparsers.required = True
    ###
    sniffles_molecule_group_parser = subparsers.add_parser(
        "sniffles-m", parents=[cli_logging_parser])
    # nargs="?" makes the positional optional so the stdin default is usable.
    sniffles_molecule_group_parser.add_argument("rck_adj",
                                                nargs="?",
                                                type=argparse.FileType("rt"),
                                                default=sys.stdin)
    sniffles_molecule_group_parser.add_argument("--i-separator", default="\t")
    sniffles_molecule_group_parser.add_argument("--i-extra-separator",
                                                default=";")
    sniffles_molecule_group_parser.add_argument("--extra-rnames-field",
                                                default="rnames")
    sniffles_molecule_group_parser.add_argument("--fp",
                                                type=float,
                                                default=0.5)
    sniffles_molecule_group_parser.add_argument("--gid-suffix",
                                                dest="gid_suffix",
                                                default="sniffles-M")
    sniffles_molecule_group_parser.add_argument("-o",
                                                "--output",
                                                type=argparse.FileType("wt"),
                                                default=sys.stdout)
    sniffles_molecule_group_parser.add_argument("--o-separator", default="\t")
    sniffles_molecule_group_parser.add_argument("--o-aids-separator",
                                                default=",")
    sniffles_molecule_group_parser.add_argument("--o-extra-separator",
                                                default=";")
    ###
    short_nas_labeling_group_parser = subparsers.add_parser(
        "short-l", parents=[cli_logging_parser])
    short_nas_labeling_group_parser.add_argument("rck_adj",
                                                 nargs="?",
                                                 type=argparse.FileType("rt"),
                                                 default=sys.stdin)
    short_nas_labeling_group_parser.add_argument("--i-separator", default="\t")
    short_nas_labeling_group_parser.add_argument("--i-extra-separator",
                                                 default=";")
    short_nas_labeling_group_parser.add_argument("--max-size",
                                                 type=int,
                                                 default=50000000)
    short_nas_labeling_group_parser.add_argument(
        "--allow-intermediate-same",
        action="store_true",
        dest="allow_intermediate_same")
    short_nas_labeling_group_parser.add_argument("--allow-intermediate-tra",
                                                 action="store_true",
                                                 dest="allow_intermediate_tra")
    short_nas_labeling_group_parser.add_argument("--no-inv-signatures",
                                                 action="store_false",
                                                 dest="allow_inv_signature")
    short_nas_labeling_group_parser.add_argument("--no-refine",
                                                 action="store_false",
                                                 dest="refine")
    short_nas_labeling_group_parser.add_argument("--fp", type=float, default=1)
    short_nas_labeling_group_parser.add_argument("--gid-suffix",
                                                 dest="gid_suffix",
                                                 default="short-nas-L")
    short_nas_labeling_group_parser.add_argument("-o",
                                                 "--output",
                                                 type=argparse.FileType("wt"),
                                                 default=sys.stdout)
    short_nas_labeling_group_parser.add_argument("--o-separator", default="\t")
    short_nas_labeling_group_parser.add_argument("--o-aids-separator",
                                                 default=",")
    short_nas_labeling_group_parser.add_argument("--o-extra-separator",
                                                 default=";")
    ###
    sniffles_labeling_group_parser = subparsers.add_parser(
        "sniffles-l", parents=[cli_logging_parser])
    sniffles_labeling_group_parser.add_argument("--rck-adj",
                                                type=argparse.FileType("rt"),
                                                required=True)
    sniffles_labeling_group_parser.add_argument("--i-separator", default="\t")
    sniffles_labeling_group_parser.add_argument("--i-extra-separator",
                                                default=";")
    sniffles_labeling_group_parser.add_argument("--alignment", required=True)
    sniffles_labeling_group_parser.add_argument("--alignment-format",
                                                choices=["sam", "bam", "cram"],
                                                default="bam")
    sniffles_labeling_group_parser.add_argument("--extra-rnames-field",
                                                default="rnames")
    sniffles_labeling_group_parser.add_argument("--no-refine",
                                                action="store_false",
                                                dest="refine")
    sniffles_labeling_group_parser.add_argument("--fp", type=float, default=1)
    sniffles_labeling_group_parser.add_argument("--gid-suffix",
                                                default="sniffles-L")
    sniffles_labeling_group_parser.add_argument("-o",
                                                "--output",
                                                default=sys.stdout,
                                                type=argparse.FileType("wt"))
    sniffles_labeling_group_parser.add_argument("--o-separator", default="\t")
    sniffles_labeling_group_parser.add_argument("--o-aids-separator",
                                                default=",")
    sniffles_labeling_group_parser.add_argument("--o-extra-separator",
                                                default=";")
    ###
    filter_alignment_parser = subparsers.add_parser(
        "filter-alignment", parents=[cli_logging_parser])
    filter_alignment_parser.add_argument("--rck-adj",
                                         type=argparse.FileType("rt"),
                                         required=True)
    filter_alignment_parser.add_argument("--i-separator", default="\t")
    filter_alignment_parser.add_argument("--i-extra-separator", default=";")
    filter_alignment_parser.add_argument("--extra-rnames-field",
                                         default="rnames")
    filter_alignment_parser.add_argument("--alignment", required=True)
    filter_alignment_parser.add_argument("--alignment-format",
                                         choices=["sam", "bam", "cram"],
                                         default="bam")
    filter_alignment_parser.add_argument("-o", "--output", required=True)
    filter_alignment_parser.add_argument("--output-format",
                                         choices=["sam", "bam", "cram"],
                                         default="bam")
    ###
    args = parser.parse_args()
    logger = get_standard_logger_from_args(
        args=args, program_name="RCK-UTILS-ADJ-GROUPS-infer")
    if args.command == "sniffles-m":
        logger.info(
            "Inferring molecule adjacency groups from adjacencies with Sniffles RNAMES support extra info."
        )
        logger.info(
            "Reading adjacencies from {file}".format(file=args.rck_adj))
        adjacencies = read_adjacencies_from_source(
            source=args.rck_adj,
            separator=args.i_separator,
            extra_separator=args.i_extra_separator)
        logger.info(
            "Inferring molecule adjacency groups from read adjacencies")
        adj_groups = infer_sniffles_molecule_groups(
            adjacencies=adjacencies,
            extra_rnames_field=args.extra_rnames_field,
            gid_suffix=args.gid_suffix)
        logger.info("Inferred {cnt} molecule adjacency groups".format(
            cnt=len(adj_groups)))
        logger.info(
            "Writing inferred molecule adjacency groups to {file}".format(
                file=args.output))
        write_adjacency_groups_to_destination(
            destination=args.output,
            adjacency_groups=adj_groups,
            separator=args.o_separator,
            extra_separator=args.o_extra_separator,
            aids_separator=args.o_aids_separator,
            extra_fill="")
    elif args.command == "short-l":
        logger.info(
            "Inferring labeling adjacency groups from adjacencies from adjacencies."
        )
        logger.info(
            "Reading adjacencies from {file}".format(file=args.rck_adj))
        adjacencies = read_adjacencies_from_source(
            source=args.rck_adj,
            separator=args.i_separator,
            extra_separator=args.i_extra_separator)
        logger.info(
            "Inferring labeling adjacency groups from read adjacencies")
        adj_groups = infer_short_nas_labeling_groups(
            adjacencies=adjacencies,
            gid_suffix=args.gid_suffix,
            max_size=args.max_size,
            allow_intermediate_same=args.allow_intermediate_same,
            allow_intermediate_tra=args.allow_intermediate_tra,
            allow_inv_signatures=args.allow_inv_signature)
        logger.info("Inferred {cnt} labeling adjacency groups".format(
            cnt=len(adj_groups)))
        if args.refine:
            logger.info("Refining inferred labeling adjacency groups")
            adj_groups = refined_labeling_groups(adj_groups=adj_groups,
                                                 gid_suffix=args.gid_suffix)
            # Only report a "refined" count when refinement actually ran.
            logger.info(
                "A total of {cnt} refined labeling adjacency groups remain".
                format(cnt=len(adj_groups)))
        logger.info(
            "Writing inferred labeling adjacency group s to {file}".format(
                file=args.output))
        write_adjacency_groups_to_destination(
            destination=args.output,
            adjacency_groups=adj_groups,
            separator=args.o_separator,
            aids_separator=args.o_aids_separator,
            extra_separator=args.o_extra_separator,
            extra_fill="")
    elif args.command == "sniffles-l":
        logger.info(
            "Inferring labeling adjacency groups from adjacencies, and their reads-of-origin alignments"
        )
        logger.info(
            "Reading adjacencies from {file}".format(file=args.rck_adj))
        adjacencies = read_adjacencies_from_source(
            source=args.rck_adj,
            extra_separator=args.i_extra_separator,
            separator=args.i_separator)
        logger.info(
            "Inferring labeling adjacency groups from read adjacencies and their reads-of-origin alignments"
        )
        adj_groups = infer_alignment_labeling_groups(
            adjacencies=adjacencies,
            alignment_file_name=args.alignment,
            alignment_format=args.alignment_format,
            extra_rnames_field=args.extra_rnames_field,
            gid_suffix=args.gid_suffix)
        logger.info(
            "Inferred {cnt} labeling adjacency groups. There can be many duplicates, refinement shall take care of it."
            .format(cnt=len(adj_groups)))
        if args.refine:
            logger.info("Refining inferred labeling adjacency groups")
            adj_groups = refined_labeling_groups(adj_groups=adj_groups,
                                                 gid_suffix=args.gid_suffix)
            # Only report a "refined" count when refinement actually ran.
            logger.info(
                "A total of {cnt} refined labeling adjacency groups remain".
                format(cnt=len(adj_groups)))
        logger.info(
            "Writing inferred labeling adjacency group s to {file}".format(
                file=args.output))
        write_adjacency_groups_to_destination(
            destination=args.output,
            adjacency_groups=adj_groups,
            separator=args.o_separator,
            aids_separator=args.o_aids_separator,
            extra_separator=args.o_extra_separator,
            extra_fill="")
    elif args.command == "filter-alignment":
        logger.info(
            "Filtering input read alignment to retain only reads mentioned as supporting adjacencies from the input"
        )
        logger.info(
            "Reading adjacencies from {file}".format(file=args.rck_adj))
        adjacencies = read_adjacencies_from_source(
            source=args.rck_adj,
            extra_separator=args.i_extra_separator,
            separator=args.i_separator)
        logger.info(
            "Filtering input alignment form file {file} and writing result in {o_file}"
            .format(file=args.alignment, o_file=args.output))
        filter_alignment(adjacencies=adjacencies,
                         alignment_file_name=args.alignment,
                         alignment_format=args.alignment_format,
                         extra_rnames_field=args.extra_rnames_field,
                         output_alignment_file_name=args.output,
                         output_alignment_format=args.output_format)
        # sys.exit instead of the site-provided exit(), which is absent when
        # the interpreter runs without the site module.
        sys.exit(0)
Ejemplo n.º 9
0
def main():
    """Command-line entry point for processing Segment Copy Number Tensors.

    Sub-commands:

    * ``refine`` -- read an SCNT, pass it through ``refined_scnt`` (fragment
      merging / gap filling controlled by the CLI flags) and write the result.
    * ``align`` -- read several SCNT files, align them with ``aligned_scnts``
      (a single input is passed through unchanged) and write one output file
      per input into ``--output-dir``.
    * ``filter`` -- stream segments and filter them by chromosomal regions,
      by extra-field regexes and by size, then write the survivors.
    * ``haploid`` -- stream segments through ``iter_haploid_segments`` and
      write the result.
    * ``distance`` -- read two SCNTs, align them and write the value returned
      by ``cn_distance_inter_scnt`` to ``--output``.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-SCNT-process")
    cli_logging_parser = get_logging_cli_parser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    ###
    refine_parser = subparsers.add_parser("refine",
                                          parents=[cli_logging_parser])
    # nargs="?" makes the positional optional so the stdin default is usable.
    refine_parser.add_argument('scnt',
                               nargs="?",
                               type=argparse.FileType("rt"),
                               default=sys.stdin)
    refine_parser.add_argument("--separator", default="\t")
    refine_parser.add_argument("--no-allow-missing-clones",
                               action="store_false",
                               dest="allow_missing_clones")
    refine_parser.add_argument("--clone-ids", default=None)
    refine_parser.add_argument("--no-merge-fragments",
                               action="store_false",
                               dest="merge_fragments")
    refine_parser.add_argument("--max-merge-gap", type=int, default=1000000)
    refine_parser.add_argument("--no-fill-gaps",
                               action="store_false",
                               dest="fill_gaps")
    refine_parser.add_argument("--max-fill-gap", type=int, default=1000000)
    refine_parser.add_argument('--output',
                               type=argparse.FileType("wt"),
                               default=sys.stdout)
    ###
    align_parser = subparsers.add_parser("align", parents=[cli_logging_parser])
    align_parser.add_argument("scnt", nargs="+")
    align_parser.add_argument("--separator", default="\t")
    align_parser.add_argument("--output-suffix", default="aligned")
    align_parser.add_argument("--no-allow-unit-segments",
                              action="store_false",
                              dest="allow_unit_segments")
    align_parser.add_argument("--output-dir", default="")
    ###
    distance_parser = subparsers.add_parser("distance",
                                            parents=[cli_logging_parser])
    distance_parser.add_argument("--scnt1",
                                 type=argparse.FileType("rt"),
                                 required=True)
    distance_parser.add_argument("--scnt1-separator", default="\t")
    distance_parser.add_argument("--scnt1-extra-separator", default=";")
    distance_parser.add_argument("--scnt2",
                                 type=argparse.FileType("rt"),
                                 required=True)
    distance_parser.add_argument("--scnt2-separator", default="\t")
    distance_parser.add_argument("--scnt2-extra-separator", default=";")
    distance_parser.add_argument("--clone-ids", default=None)
    distance_parser.add_argument("--output",
                                 "-o",
                                 type=argparse.FileType("wt"),
                                 default=sys.stdout)
    ###
    filter_parser = subparsers.add_parser("filter",
                                          parents=[cli_logging_parser])
    filter_parser.add_argument("scnt",
                               nargs="?",
                               type=argparse.FileType("rt"),
                               default=sys.stdin)
    filter_parser.add_argument("--separator", default="\t")
    filter_parser.add_argument("--extra-separator", default=";")
    filter_parser.add_argument("--o-extra-fields", default="all")
    filter_parser.add_argument("--chrs-include", action="append", nargs=1)
    filter_parser.add_argument("--chrs-include-file",
                               type=argparse.FileType("rt"))
    filter_parser.add_argument("--chrs-include-no-full",
                               action="store_false",
                               dest="include_full")
    filter_parser.add_argument("--chrs-exclude", action="append", nargs=1)
    filter_parser.add_argument("--chrs-exclude-file",
                               type=argparse.FileType("rt"))
    filter_parser.add_argument("--chrs-exclude-full",
                               action="store_true",
                               dest="exclude_full")
    filter_parser.add_argument("--keep-extra-field-regex",
                               nargs="+",
                               default=None)
    filter_parser.add_argument("--keep-extra-field-regex-file",
                               type=argparse.FileType("rt"),
                               default=None)
    filter_parser.add_argument("--keep-extra-field-missing-strategy",
                               choices=[KEEP, REMOVE],
                               default=KEEP)
    filter_parser.add_argument("--remove-extra-field-regex",
                               nargs="+",
                               default=None)
    filter_parser.add_argument("--remove-extra-field-regex-file",
                               type=argparse.FileType("rt"),
                               default=None)
    filter_parser.add_argument("--remove-extra-field-missing-strategy",
                               choices=[KEEP, REMOVE],
                               default=KEEP)
    filter_parser.add_argument("--min-size", type=int, default=0)
    filter_parser.add_argument("--max-size", type=int, default=1000000000)
    filter_parser.add_argument("-o",
                               "--output",
                               type=argparse.FileType("wt"),
                               default=sys.stdout)
    ###
    haploid_parser = subparsers.add_parser("haploid",
                                           parents=[cli_logging_parser])
    haploid_parser.add_argument("scnt",
                                nargs="?",
                                type=argparse.FileType("rt"),
                                default=sys.stdin)
    haploid_parser.add_argument("--separator", default="\t")
    haploid_parser.add_argument("--extra-separator", default=";")
    haploid_parser.add_argument("--output",
                                "-o",
                                type=argparse.FileType("wt"),
                                default=sys.stdout)
    ###
    args = parser.parse_args()
    logger = get_standard_logger_from_args(
        args=args, program_name="RCK-UTILS-SCNT-process")

    if args.command == "refine":
        clone_ids = args.clone_ids.split(
            ",") if args.clone_ids is not None else None
        # The message arguments are evaluated even when debug logging is off,
        # so joining must be guarded against the default clone_ids=None.
        logger.debug(
            "Clone ids identified as {clone_ids}. If None -- all clone ids will be processed."
            .format(clone_ids=",".join(clone_ids)
                    if clone_ids is not None else None))
        logger.info("Reading Segment Copy Number Tensor form {file}".format(
            file=args.scnt))
        segments, scnt = read_scnt_from_source(source=args.scnt,
                                               clone_ids=clone_ids,
                                               separator=args.separator)
        logger.info("Refining Segment Copy Number Tensor from {file}".format(
            file=args.scnt))
        segments, scnt, _ = refined_scnt(segments=segments,
                                         scnt=scnt,
                                         merge_fragments=args.merge_fragments,
                                         max_merge_gap=args.max_merge_gap,
                                         fill_gaps=args.fill_gaps,
                                         max_fill_gap=args.max_fill_gap)
        logger.info(
            "Writing refined Segment Copy Number Tensor to {file}".format(
                file=args.output))
        write_scnt_to_destination(destination=args.output,
                                  scnt=scnt,
                                  segments=segments,
                                  clone_ids=clone_ids,
                                  separator=args.separator)
    elif args.command == "align":
        # sample name (file base name minus extension and ".scnt") -> full path;
        # inputs that share a base name overwrite each other here.
        scnt_files = {}
        for path in args.scnt:
            full_path = get_full_path(path=path)
            name = os.path.splitext(os.path.basename(full_path))[0]
            if name.endswith(".scnt"):
                name = name[:-5]
            if name.endswith("."):
                name = name[:-1]
            scnt_files[name] = full_path
        logger.debug(
            "Input Segment Copy Number Tensors (SCNT) identified as {input_scnts}"
            .format(input_scnts=" , ".join(scnt_files.values())))
        scnts_by_name = {}
        segments_by_name = {}
        clone_ids_by_scnt = {}
        logger.info("Reading input SCNTs")
        for name, path in scnt_files.items():
            logger.debug("Reading SCNT from {file}".format(file=path))
            segments, scnt = read_scnt_from_file(file_name=path,
                                                 separator=args.separator)
            clone_ids_by_scnt[name] = sorted(scnt.keys())
            scnts_by_name[name] = scnt
            segments_by_name[name] = segments
        if len(scnts_by_name) == 1:
            logger.warning(
                "Only one input SCNT identified. Doing nothing with it, outputting as is."
            )
            aligned_segments_by_name, aligned_scnts_by_name = segments_by_name, scnts_by_name
        else:
            logger.info("Aligning input SCNTs.")
            aligned_segments_by_name, aligned_scnts_by_name = aligned_scnts(
                segments_by_sample_names=segments_by_name,
                scnts_by_sample_names=scnts_by_name)
        # Keys of scnt_files are unique, so no collision handling is needed
        # when deriving the output base names.
        result_base_names = {
            name: name + "." + args.output_suffix
            for name in sorted(scnt_files)
        }
        output_dir = args.output_dir if args.output_dir != "" else os.getcwd()
        output_dir = get_full_path(path=output_dir)
        logger.info("Writing aligned SCNTs")
        for name, new_name in result_base_names.items():
            scnt = aligned_scnts_by_name[name]
            segments = aligned_segments_by_name[name]
            # NOTE(review): no "." is inserted between the suffix and
            # "rck.scnt.tsv"; kept as is so existing output names do not change.
            scnt_path = os.path.join(output_dir, new_name + "rck.scnt.tsv")
            logger.debug("Writing aligned SCNT {scnt_name} to {file}".format(
                scnt_name=name, file=scnt_path))
            write_scnt_to_file(file_name=scnt_path,
                               segments=segments,
                               scnt=scnt,
                               separator=args.separator)
    elif args.command == "filter":
        logger.info(
            "Filtering input segments from following sources {sources}".format(
                sources=args.scnt))
        segments = stream_segments_from_source(
            source=args.scnt,
            separator=args.separator,
            extra_separator=args.extra_separator)
        include_chrs_regions_strings = []
        exclude_chrs_regions_strings = []
        # --chrs-include/--chrs-exclude use action="append" with nargs=1, so
        # each is a list of one-element lists of comma-separated region strings.
        if args.chrs_include is not None:
            for chrs_lists in args.chrs_include:
                for chrs_list in chrs_lists:
                    include_chrs_regions_strings.extend(chrs_list.split(","))
        if args.chrs_include_file is not None:
            include_chrs_regions_strings.extend(
                get_chrs_regions_string_lists_from_source(
                    source=args.chrs_include_file))
        if args.chrs_exclude is not None:
            for chrs_lists in args.chrs_exclude:
                for chrs_list in chrs_lists:
                    exclude_chrs_regions_strings.extend(chrs_list.split(","))
        if args.chrs_exclude_file is not None:
            # args.chrs_exclude_file is an already opened file object
            # (argparse.FileType), so it is read through the same source-based
            # helper as the include file rather than a file-name based one.
            exclude_chrs_regions_strings.extend(
                get_chrs_regions_string_lists_from_source(
                    source=args.chrs_exclude_file))
        include_regions = [
            parse_segment_chr_region(string)
            for string in include_chrs_regions_strings
        ]
        exclude_regions = [
            parse_segment_chr_region(string)
            for string in exclude_chrs_regions_strings
        ]
        segments = filter_segments_by_chromosomal_regions(
            segments=segments,
            include=include_regions,
            exclude=exclude_regions,
            include_full=args.include_full,
            exclude_full=args.exclude_full)
        keep_extra_field_entries = args.keep_extra_field_regex if args.keep_extra_field_regex is not None else []
        if args.keep_extra_field_regex_file is not None:
            keep_extra_field_entries.extend(
                iter_over_string_entries_from_source(
                    source=args.keep_extra_field_regex_file))
        remove_extra_field_entries = args.remove_extra_field_regex if args.remove_extra_field_regex is not None else []
        if args.remove_extra_field_regex_file is not None:
            remove_extra_field_entries.extend(
                iter_over_string_entries_from_source(
                    source=args.remove_extra_field_regex_file))
        keep_extra_field = get_extra_field_regexes(
            string_entries=keep_extra_field_entries)
        remove_extra_field = get_extra_field_regexes(
            string_entries=remove_extra_field_entries)
        segments = filter_segments_by_extra(
            segments=segments,
            keep_extra_field=keep_extra_field,
            keep_extra_field_missing_strategy=args.
            keep_extra_field_missing_strategy,
            remove_extra_field=remove_extra_field,
            remove_extra_field_missing_strategy=args.
            remove_extra_field_missing_strategy)
        segments = filter_segments_by_size(segments=segments,
                                           min_size=args.min_size,
                                           max_size=args.max_size)
        write_segments_to_destination(destination=args.output,
                                      segments=segments)

    elif args.command == "haploid":
        segments = stream_segments_from_source(
            source=args.scnt,
            separator=args.separator,
            extra_separator=args.extra_separator)
        haploid_segments = iter_haploid_segments(segments=segments, copy=False)
        write_segments_to_destination(destination=args.output,
                                      segments=haploid_segments)
    elif args.command == "distance":
        clone_ids = args.clone_ids
        if args.clone_ids is not None:
            clone_ids = args.clone_ids.split(",")
        segments1, scnt1 = read_scnt_from_source(
            source=args.scnt1,
            clone_ids=clone_ids,
            separator=args.scnt1_separator,
            extra_separator=args.scnt1_extra_separator,
            remove_cn_data_from_segs=True)
        segments2, scnt2 = read_scnt_from_source(
            source=args.scnt2,
            clone_ids=clone_ids,
            separator=args.scnt2_separator,
            extra_separator=args.scnt2_extra_separator,
            remove_cn_data_from_segs=True)
        segments_by_sample_names = {"1": segments1, "2": segments2}
        scnts_by_sample_names = {"1": scnt1, "2": scnt2}
        segments_by_sample_names, scnts_by_sample_names = aligned_scnts(
            segments_by_sample_names=segments_by_sample_names,
            scnts_by_sample_names=scnts_by_sample_names)
        segments = segments_by_sample_names["1"]
        scnt1, scnt2 = scnts_by_sample_names["1"], scnts_by_sample_names["2"]
        distance = cn_distance_inter_scnt(tensor1=scnt1,
                                          tensor2=scnt2,
                                          segments=segments,
                                          check_clone_ids_match=True)
        # Honor -o/--output; it defaults to stdout, so the behavior without
        # the option is unchanged.
        print("distance = ", distance, file=args.output)

    logger.info("Success!")
Ejemplo n.º 10
0
def main():
    """Command-line entry point for processing RCK adjacency groups.

    Three sub-commands are available:

    * ``cat``     -- stream adjacency groups from one or more inputs and write
      them to a single output;
    * ``refine``  -- read adjacency groups, split them by group type, refine
      the labeling groups, and write molecule, refined labeling, and general
      groups to the output (molecule and general groups are passed through
      unchanged);
    * ``project`` -- project adjacency groups onto the supplied adjacencies
      and write the projected groups to the output.
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-ADJ-GROUPS-process")
    parser.add_argument("--version", action="version", version=rck.version)
    cli_logging_parser = get_logging_cli_parser()

    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    ###
    cat_parser = subparsers.add_parser("cat", parents=[cli_logging_parser])
    # nargs="*" (rather than "+") so that the stdin default is actually
    # reachable when no input file is given on the command line.
    cat_parser.add_argument("rck_adg",
                            type=argparse.FileType("rt"),
                            nargs="*",
                            default=[sys.stdin])
    cat_parser.add_argument("--i-separator", default="\t")
    cat_parser.add_argument("--i-extra-separator", default=";")
    cat_parser.add_argument("--i-aids-separator", default=",")
    cat_parser.add_argument("--enforce-unique-ids",
                            action="store_true",
                            dest="enforce_unique_ids")
    cat_parser.add_argument("--id-collision-strategy",
                            choices=["skip", "error"],
                            default="error")
    cat_parser.add_argument("-o",
                            "--output",
                            type=argparse.FileType("wt"),
                            default=sys.stdout)
    cat_parser.add_argument("--o-separator", default="\t")
    cat_parser.add_argument("--o-aids-separator", default=",")
    cat_parser.add_argument("--o-extra-separator", default=";")
    ###
    refine_parser = subparsers.add_parser("refine",
                                          parents=[cli_logging_parser])
    refine_parser.add_argument("rck_adg",
                               nargs="?",
                               type=argparse.FileType("rt"),
                               default=sys.stdin)
    refine_parser.add_argument("--i-separator", default="\t")
    refine_parser.add_argument("--i-extra-separator", default=";")
    refine_parser.add_argument("--i-aids-separator", default=",")
    # refine_parser.add_argument("--no-refine-m", action="store_false", dest="refine_m")
    # refine_parser.add_argument("--no-refine-l", action="store_false", dest="refine_l")
    # refine_parser.add_argument("--no-refine-n", action="store_false", dest="refine_n")
    refine_parser.add_argument("--gid-suffix", default="refined")
    refine_parser.add_argument("-o",
                               "--output",
                               type=argparse.FileType("wt"),
                               default=sys.stdout)
    refine_parser.add_argument("--o-separator", default="\t")
    refine_parser.add_argument("--o-aids-separator", default=",")
    refine_parser.add_argument("--o-extra-separator", default=";")
    ###
    project_parser = subparsers.add_parser("project",
                                           parents=[cli_logging_parser])
    # nargs="?" makes the positional optional so the stdin default applies,
    # matching the "refine" sub-command.
    project_parser.add_argument("rck_adg",
                                nargs="?",
                                type=argparse.FileType("rt"),
                                default=sys.stdin)
    project_parser.add_argument("--i-separator", default="\t")
    project_parser.add_argument("--i-extra-separator", default=";")
    project_parser.add_argument("--i-aids-separator", default=",")
    project_parser.add_argument("--adjacencies",
                                required=True,
                                type=argparse.FileType("rt"))
    project_parser.add_argument("--adj-separator", default="\t")
    project_parser.add_argument("--adj-extra-separator", default=";")
    project_parser.add_argument("--gid-suffix", default="projected")
    project_parser.add_argument("-o",
                                "--output",
                                type=argparse.FileType("wt"),
                                default=sys.stdout)
    project_parser.add_argument("--o-separator", default="\t")
    project_parser.add_argument("--o-aids-separator", default=",")
    project_parser.add_argument("--o-extra-separator", default=";")
    ###
    args = parser.parse_args()
    logger = get_standard_logger_from_args(
        args=args, program_name="RCK-UTILS-ADJ-GROUPS-process")
    if args.command == "cat":
        # Lazily chain the per-file streams so that inputs are consumed one
        # after another while writing.
        adj_groups = itertools.chain(*(stream_adjacency_groups_from_source(
            source=adj_group_source,
            separator=args.i_separator,
            aids_separator=args.i_aids_separator,
            extra_separator=args.i_extra_separator)
                                       for adj_group_source in args.rck_adg))
        if args.enforce_unique_ids:
            # Unique-id enforcement (and --id-collision-strategy) is not
            # implemented; tell the user instead of silently ignoring the flag.
            logger.warning(
                "--enforce-unique-ids is not implemented; adjacency groups are written without id collision checks"
            )
        write_adjacency_groups_to_destination(
            destination=args.output,
            adjacency_groups=adj_groups,
            separator=args.o_separator,
            aids_separator=args.o_aids_separator,
            extra_separator=args.o_extra_separator)
    elif args.command == "refine":
        logger.info("Refining input adjacency groups")
        logger.info(
            "Reading adjacency groups from {file}".format(file=args.rck_adg))
        adg_groups = read_adjacency_groups_from_source(
            source=args.rck_adg,
            separator=args.i_separator,
            extra_separator=args.i_extra_separator,
            aids_separator=args.i_aids_separator)
        logger.info("A total of {cnt} adjacency groups has been read".format(
            cnt=len(adg_groups)))
        molecule_groups = [
            ag for ag in adg_groups
            if ag.group_type == AdjacencyGroupType.MOLECULE
        ]
        logger.info(
            "A total of {cnt} molecule adjacency groups has been read".format(
                cnt=len(molecule_groups)))
        labeling_groups = [
            ag for ag in adg_groups
            if ag.group_type == AdjacencyGroupType.LABELING
        ]
        logger.info(
            "A total of {cnt} labeling adjacency groups has been read".format(
                cnt=len(labeling_groups)))
        general_groups = [
            ag for ag in adg_groups
            if ag.group_type == AdjacencyGroupType.GENERAL
        ]
        logger.info(
            "A total of {cnt} general adjacency groups has been read".format(
                cnt=len(general_groups)))
        logger.info("Refining molecule adjacency groups")
        # Molecule groups are currently passed through unchanged.
        refined_molecule_groups = molecule_groups
        logger.info(
            "A total of {cnt} refined molecule adjacency groups remains".
            format(cnt=len(refined_molecule_groups)))
        logger.info("Refining labeling adjacency groups")
        # An empty --gid-suffix disables suffixing altogether; otherwise the
        # labeling-specific "-L" marker is appended to the user suffix.
        labeling_gid_suffix = args.gid_suffix + "-L" if args.gid_suffix else ""
        r_labeling_groups = refined_labeling_groups(
            adj_groups=labeling_groups,
            gid_suffix=labeling_gid_suffix,
            retain_source_gids=True)
        logger.info(
            "A total of {cnt} refined labeling adjacency groups remains".
            format(cnt=len(r_labeling_groups)))
        logger.info("Refining general adjacency groups")
        # General groups are currently passed through unchanged.
        refined_general_groups = general_groups
        logger.info(
            "A total of {cnt} refined general adjacency groups remains"
            .format(cnt=len(refined_general_groups)))
        adj_groups = itertools.chain(refined_molecule_groups,
                                     r_labeling_groups, refined_general_groups)
        logger.info("Writing refined adjacency groups to {file}".format(
            file=args.output))
        # Pass the extra separator as well, so that --o-extra-separator is
        # honored here just like in the "cat" and "project" sub-commands.
        write_adjacency_groups_to_destination(
            destination=args.output,
            adjacency_groups=adj_groups,
            separator=args.o_separator,
            aids_separator=args.o_aids_separator,
            extra_separator=args.o_extra_separator)
    elif args.command == "project":
        logger.info(
            "Projecting input adjacency groups based on input adjacencies")
        logger.info(
            "Reading adjacency groups from {file}".format(file=args.rck_adg))
        adg_groups = read_adjacency_groups_from_source(
            source=args.rck_adg,
            separator=args.i_separator,
            extra_separator=args.i_extra_separator,
            aids_separator=args.i_aids_separator)
        logger.info("A total of {cnt} adjacency groups has been read".format(
            cnt=len(adg_groups)))
        adjacencies = read_adjacencies_from_source(
            source=args.adjacencies,
            separator=args.adj_separator,
            extra_separator=args.adj_extra_separator)
        p_groups = projected_groups(groups=adg_groups,
                                    adjacencies=adjacencies,
                                    gid_suffix=args.gid_suffix)
        logger.info("A total of {cnt} projected groups remained".format(
            cnt=len(p_groups)))
        logger.info("Writing projected adjacency groups to {file}".format(
            file=args.output))
        write_adjacency_groups_to_destination(
            destination=args.output,
            adjacency_groups=p_groups,
            separator=args.o_separator,
            aids_separator=args.o_aids_separator,
            extra_separator=args.o_extra_separator)
Ejemplo n.º 11
0
def main():
    """Command-line entry point converting third-party copy number outputs to RCK format.

    One sub-command exists per supported input format: ``titan``,
    ``battenberg``, ``hatchet``, ``remixt``, ``ginkgo`` (all of which produce a
    segment copy number tensor) and ``gff`` (which produces segments only).
    Each sub-command reads its input, converts it with the matching
    ``get_*`` helper, and writes the result to ``--output`` (stdout by default).
    """
    parser = argparse.ArgumentParser(prog="RCK-UTILS-SCNT-x2rck")
    cli_logging_parser = get_logging_cli_parser()
    chr_strip_parser = get_chromosome_strip_parser()
    subparsers = parser.add_subparsers(title="command", dest="command")
    subparsers.required = True
    ####
    titan_parser = subparsers.add_parser("titan", parents=[cli_logging_parser, chr_strip_parser])
    titan_parser.add_argument("titan_ichor_seg")
    titan_parser.add_argument("--sample-name", required=True)
    titan_parser.add_argument("--clone-ids", default=None)
    titan_parser.add_argument("--separator", default="\t")
    titan_parser.add_argument("--corrected-cn-fix", choices=["None", "equal", "relative-dist"], default="None")
    titan_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ####
    battenberg_parser = subparsers.add_parser("battenberg", parents=[cli_logging_parser, chr_strip_parser])
    # nargs="?" makes the positional optional, so the stdin default can actually take effect.
    battenberg_parser.add_argument("battenberg", nargs="?", type=argparse.FileType("rt"), default=sys.stdin)
    battenberg_parser.add_argument("--separator", default="\t")
    battenberg_parser.add_argument("--sample-name", required=True)
    battenberg_parser.add_argument("--clone-ids", choices=["1", "2", "1,2"], default="1,2")
    battenberg_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ####
    hatchet_parser = subparsers.add_parser("hatchet", parents=[cli_logging_parser, chr_strip_parser])
    hatchet_parser.add_argument("hatchet", type=str)
    hatchet_parser.add_argument("--separator", default="\t")
    hatchet_parser.add_argument("--min-usage", type=float, default=0.01)
    hatchet_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    group = hatchet_parser.add_mutually_exclusive_group(required=True)
    group.add_argument("--sample-name", default=None)
    group.add_argument("--clone-ids", default=None)
    ####
    remixt_parser = subparsers.add_parser("remixt", parents=[cli_logging_parser, chr_strip_parser])
    remixt_parser.add_argument("remixt", nargs="?", type=argparse.FileType("rt"), default=sys.stdin)
    remixt_parser.add_argument("--separator", default="\t")
    remixt_parser.add_argument("--clone-ids", choices=["1", "2", "1,2"], default="1,2")
    remixt_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ####
    ginkgo_parser = subparsers.add_parser("ginkgo", parents=[cli_logging_parser, chr_strip_parser])
    ginkgo_parser.add_argument("ginkgo", nargs="?", type=argparse.FileType("rt"), default=sys.stdin)
    ginkgo_parser.add_argument("--separator", default="\t")
    ginkgo_parser.add_argument("--sample-name", required=True)
    ginkgo_parser.add_argument("--dummy-clone-name", default="1")
    ginkgo_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    ####
    gff_parser = subparsers.add_parser("gff", parents=[cli_logging_parser, chr_strip_parser])
    gff_parser.add_argument("gff", type=str)
    gff_parser.add_argument("--chr-mapping-file", type=argparse.FileType("rt"))
    gff_parser.add_argument("--chr-mapping-missing-strategy", choices=["keep", "skip"], default="keep")
    gff_parser.add_argument("-o", "--output", type=argparse.FileType("wt"), default=sys.stdout)
    args = parser.parse_args()
    logger = get_standard_logger_from_args(args=args, program_name="RCK-UTILS-SCNT")

    if args.command == "titan":
        logger.info("Converting allele-specific segment copy values from TitanCNA format to RCK")
        titan_full_path = get_full_path(path=args.titan_ichor_seg)
        if args.clone_ids is None:
            logger.debug("Clone ids were not provided, extracting all clone ids from {file}".format(file=titan_full_path))
            clone_ids = titan_get_clone_ids_from_file(file_name=titan_full_path, sample_name=args.sample_name, separator=args.separator)
        else:
            # De-duplicate and order the user-supplied, comma-separated clone ids.
            clone_ids = sorted(set(args.clone_ids.split(",")))
        logger.debug("Clone ids are identified as {clone_ids}".format(clone_ids=",".join(clone_ids)))
        # Open the same resolved path that was used for clone id extraction and is reported in the logs.
        with open(titan_full_path, "rt") as source:
            logger.info("Reading allele-specific segment copy number values from {file}".format(file=titan_full_path))
            segments, scnt = get_scnt_from_titan_source(source=source, sample_name=args.sample_name, clone_ids=clone_ids, separator=args.separator,
                                                        corrected_cn_fix=args.corrected_cn_fix, chr_strip=args.strip_chr)
            logger.info("Writing allele-specific segment copy number values in RCK format to {file}".format(file=args.output))
            write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, clone_ids=clone_ids, separator=args.separator)
    elif args.command == "battenberg":
        logger.info("Converting allele-specific segment copy values from Battenberg format to RCK")
        clone_ids = args.clone_ids.split(",")
        logger.debug("Clone ids are identified as {clone_ids}".format(clone_ids=",".join(clone_ids)))
        logger.info("Reading allele-specific segment copy number values from {file}".format(file=args.battenberg))
        segments, scnt = get_scnt_from_battenberg_source(source=args.battenberg, sample_name=args.sample_name, separator=args.separator, chr_strip=args.strip_chr)
        logger.info("Writing allele-specific segment copy number values in RCK format to {file}".format(file=args.output))
        write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, separator=args.separator, clone_ids=clone_ids)
    elif args.command == "hatchet":
        hatchet_full_path = get_full_path(path=args.hatchet)
        logger.info("Converting allele-specific segment copy values from HATCHet format to RCK")
        if args.clone_ids is None:
            # Report the input file path (not the argparse sub-parser object) in the log message.
            logger.debug("Clone ids were not provided, extracting all clone ids from {file}".format(file=hatchet_full_path))
            clone_ids = hatchet_get_clone_ids_from_file(file_name=hatchet_full_path, sample_name=args.sample_name, separator=args.separator, min_usage=args.min_usage)
        else:
            clone_ids = sorted(set(args.clone_ids.split(",")))
        logger.debug("Clone ids were identified as {clone_ids}".format(clone_ids=",".join(clone_ids)))
        with open(hatchet_full_path) as source:
            logger.info("Reading allele-specific segment copy number values from {file}".format(file=hatchet_full_path))
            segments, scnt = get_scnt_from_hatchet_source(source=source, sample_name=args.sample_name, clone_ids=clone_ids, separator=args.separator, chr_strip=args.strip_chr)
            logger.info("Writing allele-specific segment copy number values in RCK format to {file}".format(file=args.output))
            write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, clone_ids=clone_ids, separator=args.separator)
    elif args.command == "remixt":
        logger.info("Converting allele-specific segment copy values from ReMixT format to RCK")
        clone_ids = args.clone_ids.split(",")
        logger.debug("Clone ids were identified as {clone_ids}".format(clone_ids=",".join(clone_ids)))
        logger.info("Reading allele-specific segment copy number values from {file}".format(file=args.remixt))
        segments, scnt = get_scnt_from_remixt_source(source=args.remixt, separator=args.separator, chr_strip=args.strip_chr)
        logger.info("Writing allele-specific segment copy number values in RCK format to {file}".format(file=args.output))
        write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, separator=args.separator, clone_ids=clone_ids)
    elif args.command == "ginkgo":
        logger.info("Converting *haploid* segments copy values from Ginkgo format to RCK")
        logger.info("Reading *haploid* segments copy values from {file}".format(file=args.ginkgo))
        segments, scnt = get_scnt_from_ginkgo_source(source=args.ginkgo, sample_name=args.sample_name, dummy_clone=args.dummy_clone_name,
                                                     separator=args.separator, chr_strip=args.strip_chr)
        logger.info("Writing *haploid* segments copy number values in RCK format to {file}".format(file=args.output))
        # A set literal keeps the clone name whole; set("abc") would split it into single characters.
        write_scnt_to_destination(destination=args.output, segments=segments, scnt=scnt, clone_ids={args.dummy_clone_name}, separator=args.separator)
    elif args.command == "gff":
        logger.info("Converting segments data from GFF format to RCK")
        logger.info("Reading segments from {file}".format(file=args.gff))
        chr_mappings = None
        if args.chr_mapping_file is not None:
            chr_mappings = {}
            logger.info("Reading chromosome mapping data from {file}".format(file=args.chr_mapping_file))
            # Each non-empty line is tab-separated: first column is the key, second column is its mapped value.
            for line in args.chr_mapping_file:
                line = line.strip()
                if not line:
                    # Tolerate blank lines (e.g. a trailing newline at the end of the file).
                    continue
                data = line.split("\t")
                chr_mappings[data[0]] = data[1]
        segments = get_segments_from_gff_file(file_name=args.gff, chr_strip=args.strip_chr,
                                              chr_mapping=chr_mappings, chr_mapping_missing_strategy=args.chr_mapping_missing_strategy)
        logger.info("Writing segments in RCK format to {file}".format(file=args.output))
        write_segments_to_destination(destination=args.output, segments=segments)
    logger.info("Success!")