Esempio n. 1
0
    def callback(self, args):
        """Dispatch the benchmark action selected by ``args.mode``.

        The mode is matched by prefix and suffix:

        * ``pair*``: for every entry of ``args.signatures`` the signature set
          is loaded and the 2-combination benchmark is generated (``*gen``),
          run (``*run``) or run through the deconstruct-sigs variant
          (``*run_ds``) under ``args.root``.
        * ``multiple*``: with a ``*run`` suffix the multiple-signature
          benchmark is run for every entry of ``args.signatures``; afterwards
          ``multiple_benchmark()`` and ``aggregate_multiple_benchmarks()`` are
          always called. The ``*gen`` and ``*run_ds`` suffixes have no
          dedicated step in this branch.
        * ``aggregate``: aggregate the benchmark results under ``args.root``.

        Any other mode prints a message and the parser usage, then exits with
        status 1.
        """
        mode = args.mode

        if mode.startswith("pair"):
            for raw_set in args.signatures:
                sig_set = int(raw_set)
                W, signature_names = read_signatures(sig_set)

                if mode.endswith('gen'):
                    gen_benchmark_2combinations(args.root, signature_names, W)
                elif mode.endswith('run'):
                    run_benchmark_2combinations(args.root, sig_set, signature_names, W, force=True)
                elif mode.endswith('run_ds'):
                    run_benchmark_2combinations_deconstruct_sigs(args.root, sig_set, signature_names, W, force=True)

        elif mode.startswith('multiple'):
            if mode.endswith('run'):
                # The signature set, its names and W must be loaded here: they
                # are not bound outside the 'pair' branch, so using them
                # directly raised NameError.
                # NOTE(review): iterating args.signatures mirrors the 'pair'
                # branch - confirm this is the intended set of runs.
                for raw_set in args.signatures:
                    sig_set = int(raw_set)
                    W, signature_names = read_signatures(sig_set)
                    multiple_benchmark_run(sig_set, signature_names, W, force=True)
            multiple_benchmark()
            aggregate_multiple_benchmarks()

        elif mode == 'aggregate':
            aggregate_benchmarks(args.root)

        else:
            print("Unknown benchmark action mode")
            self.parser.print_usage()
            sys.exit(1)
Esempio n. 2
0
    def __init__(self,
                 profile,
                 sig_set,
                 method='MLEZ',
                 others_threshold=0,
                 bootstrap=True,
                 dummy_sigs=True,
                 global_optimization=None):
        """
        Validate the inputs, store the settings, load the signature set and
        start the decomposition via `self._main()`.

        Arguments:
        `profile`: Profile to decompose. Must have length 96.
        `sig_set`: signature set to use for the decomposition (5, 10, 30 or 49).
        `method`: solver method; its lowercase form must be in IDENTIFY_MIN_FUNCTIONS.
        `others_threshold`: minimum threshold for acceptable results.
        `bootstrap`: Use the bootstrap to calculate confidence intervals.
        `dummy_sigs`: Account for unexplained variance (non-context dependent mutational processes and unknown signatures)
        `global_optimization`: stored unchanged on the instance.

        Raises AssertionError when any of the three input checks fails.
        """
        # Input checks, in the same order as they are reported to the caller.
        assert len(profile) == 96, \
            "Invalid sample. Must be vector of length 96"
        assert sig_set in (5, 10, 30, 49), \
            "Invalid sig_set choice. Must be 5,10,30 or 49"
        assert method.lower() in IDENTIFY_MIN_FUNCTIONS, \
            "Unknown method provided"

        # What to decompose and against which signatures.
        self.profile = profile
        self.sig_set = sig_set

        # Solver configuration.
        self.method = method
        self.others_threshold = others_threshold
        self.global_optimization = global_optimization
        self.enable_dummy = dummy_sigs
        self.bootstrap = bootstrap

        # Load the signatures last, then run straight away.
        self.W_and_labels = read_signatures(self.sig_set)
        self._main()
Esempio n. 3
0
def multiple_benchmark_helper(j):
    """Generate one synthetic multi-signature sample and decompose it.

    For the 30-signature set a random number ``r`` of signatures gets random
    exposures, a mutation-count profile is drawn from the resulting mixture,
    and the profile, the true exposures and the decompositions produced by
    deconstruct_sigs_custom, MLE and MLEZ are written next to each other
    under ``data/benchmark/multiple``.

    The argument ``j`` is not used by the body.
    """
    out_dir = "data/benchmark/multiple"

    # Only the 30-signature set is exercised here.
    for i in (30,):
        W, signature_names = read_signatures(i)
        n_signatures = W.shape[1]

        r = random.randrange(2, min(i + 1, 15))

        # Redraw until exactly r exposures exceed 0.05; a draw that repeats an
        # index fills fewer than r slots and is rejected.
        h0 = None
        while h0 is None or np.greater(h0, 0.05).sum() != r:
            h0 = np.zeros(n_signatures)
            # The weights are drawn before the indices so the random streams
            # are consumed in the same order as a single assignment statement.
            weights = 0.05 + np.random.dirichlet(np.ones(r), 1)
            chosen = np.random.choice(n_signatures, r)
            h0[chosen] = weights
        h0 /= h0.sum()

        v0 = W.dot(h0)
        n_mutations = random.randrange(10, 50)
        v0_counts = np.random.multinomial(n_mutations, v0 / v0.sum())

        random_name = str(uuid.uuid4())[:4]
        fname = out_dir + "/{:02d}_{}_{}_{}".format(
            i, r, n_mutations, random_name)
        print(fname)

        profile_fname = fname + ".profile"

        # Ground truth: the sampled counts and the exposures behind them.
        write_profile(profile_fname, v0_counts)
        write_decomposition(fname + ".info", h0, signature_names)

        # Reference decomposition, computed from the profile file on disk.
        ds_results = deconstruct_sigs_custom(profile_fname, signatures=i)
        write_decomposition(fname + ".ds.info", ds_results, signature_names)

        # Decompositions with both likelihood-based methods.
        profile = read_profile_file(profile_fname)
        for method in ("MLE", "MLEZ"):
            _, _, results = decompose_mutational_profile_counts(
                profile, (W, signature_names),
                method,
                debug=False,
                others_threshold=0.0)
            write_decomposition(
                fname + "." + method + ".info", results, signature_names)
Esempio n. 4
0
 def _gen(self):
     """Build a synthetic mutational profile and initialise the base class with it.

     The reference signatures are loaded with ``read_signatures(self.ref_sig)``
     and kept in ``self.ref_w_labels``. If the instance has a
     ``predef_sig_names`` attribute, those signatures are used together with
     the exposures in ``self.predef_h``; otherwise ``self.complexity``
     signatures are picked at random without replacement and given random
     exposures normalised to sum to 1.

     The profile ``W.dot(h)`` is scaled to ``self.N_mut`` mutations and
     rounded, integer noise drawn uniformly from ``[-self.noise, self.noise]``
     is added to each of the 96 channels, and negative counts are set to 0.
     The ingredients are stored in ``self.info`` and the profile is passed to
     ``super().__init__``.

     Raises AssertionError when a name in ``predef_sig_names`` is not part of
     the reference signature set.
     """
     W_og, signature_names = read_signatures(self.ref_sig)
     self.ref_w_labels = (W_og, signature_names)
     # One record per reference signature: its column of W and its name.
     sig_dict = [{'w': w, 'name': name}
                 for w, name in zip(W_og.T, signature_names)]

     if hasattr(self, 'predef_sig_names'):
         sig_names = self.predef_sig_names
         by_name = {entry['name']: entry for entry in sig_dict}
         assert all(name in by_name for name in sig_names), \
             "sig_names provided were not found"
         # Select in the caller's order so that column k of W matches
         # predef_h[k] and sig_names[k]; filtering the reference list kept
         # the reference order instead and could misalign them.
         sample_sigs = [by_name[name] for name in sig_names]
         h = self.predef_h
     else:
         sample_sigs = np.random.choice(
             sig_dict, self.complexity,
             replace=False)  # synthetic sample sigs
         sig_names = [sig["name"] for sig in sample_sigs]
         h = np.random.rand(self.complexity)  # init exposure
         h = h / h.sum()  # normalize exposures

     W = np.array([sig["w"] for sig in sample_sigs]).T  # sample specific W
     v = W.dot(h)  # calculate mutational profile
     v = np.rint(v * (self.N_mut / v.sum()))  # scale to N_mut mutations

     # randint excludes its upper bound, hence the +1 to keep the noise range
     # inclusive (replaces the deprecated np.random.random_integers).
     v_noise = np.random.randint(-self.noise, int(self.noise) + 1, 96)
     v += v_noise  # add noise to v
     v[v < 0] = 0  # make any negative counts 0

     self.info = {
         'sig': {
             'group': self.ref_sig,
             'used': sig_names
         },
         'W': W,
         'h': h,
         'noise': v_noise,
         'total_mut': v.sum()
     }
     super().__init__(v)
Esempio n. 5
0
    def identify(self, args):
        """Decompose the mutational profile(s) of an input file into signatures.

        Returns early with a logged warning when the input file, the genome,
        the signature set or a known method is missing.

        Behaviour depends on ``args.input_format``:

        * ``'TCGI'`` / ``'MAF'``: the file is read with the matching
          context-window reader, one profile per sample is decomposed and the
          results are written with ``write_multisample_decomposition``.
        * ``'VCF'``: a single profile is decomposed; with ``args.bootstrap``
          set, 100 resampled profiles are decomposed instead and written with
          ``write_bootstrap_decomposition``.
        * anything else: a warning is logged and nothing is written.

        Results go to ``args.outfile``.
        """
        if not args.infile:
            logger.warning("Provide input file in VCF or MAF format (-i) and a corresponding genome assembly (-g)")
            return
        if not args.genome:
            logger.warning(genome_error_message)
            return
        if not args.signatures:
            logger.warning("Set of signatures required. Use 5 and 10 for MUTAGENE-5 and MUTAGENE-10. Use 30 for COSMIC-30")
            return

        if args.method.lower() not in IDENTIFY_MIN_FUNCTIONS:
            logger.warning('Unknown method provided')
            return

        method = args.method
        W, signature_names = read_signatures(int(args.signatures))

        def _decompose(profile, **extra):
            # Shared decomposition call; only the exposures are of interest.
            _, _, results = decompose_mutational_profile_counts(
                profile,
                (W, signature_names),
                method,
                others_threshold=0.0,
                **extra)
            return results

        input_format = args.input_format

        if input_format in ('TCGI', 'MAF'):
            # Both multi-sample formats share the same pipeline and differ
            # only in the reader.
            if input_format == 'TCGI':
                reader = read_TCGI_with_context_window
            else:
                reader = read_MAF_with_context_window
            mutations, _, _ = reader(args.infile, args.genome, window_size=1)
            samples_profiles = get_multisample_mutational_profile(mutations, counts=True)

            # NOTE(review): unlike the VCF branch, enable_dummy is not passed
            # here - confirm whether args.no_unexplained_variance should apply.
            samples_results = {
                sample: _decompose(profile)
                for sample, profile in samples_profiles.items()
            }
            write_multisample_decomposition(args.outfile, samples_results, signature_names)
        elif input_format == 'VCF':
            mutations, _ = read_auto_profile(args.infile, fmt=input_format, asm=args.genome)
            profile = get_mutational_profile(mutations, counts=True)
            enable_dummy = args.no_unexplained_variance
            if not args.bootstrap:
                results = _decompose(profile, enable_dummy=enable_dummy)
                write_decomposition(args.outfile, results, signature_names, 'VCF')
            else:
                bootstrap_results = [
                    _decompose(resampled_profile, enable_dummy=enable_dummy)
                    for resampled_profile in generate_resampled_profiles(profile, 100)
                ]
                write_bootstrap_decomposition(args.outfile, bootstrap_results, signature_names, 'VCF')
        else:
            # An unrecognised format used to fall through without any output
            # or message.
            logger.warning("Unsupported input format: {}".format(input_format))
Esempio n. 6
0
    def identify(self, args):
        """Decompose per-sample mutational profiles into signature exposures.

        Validates the arguments (input file, genome, signature set, method,
        bootstrap replicates and confidence level), optionally restricts the
        analysis to the signatures listed in ``args.keep_only``, guesses the
        input format from the file name when ``args`` has no ``input_format``,
        reads the mutations and decomposes the profile of every sample.

        Without ``args.bootstrap`` the point estimates are written to
        ``args.outfile``. With it, ``args.bootstrap_replicates`` resampled
        profiles are decomposed per sample and passed to
        ``write_decomposition`` together with the point estimates.

        Every validation or parsing failure is logged as a warning and makes
        the method return without writing anything; a parsing exception is
        re-raised only when the root logger is at DEBUG level.
        """
        if not args.infile:
            logger.warning("Provide input file in VCF or MAF format (-i) and a corresponding genome assembly (-g)")
            return
        if not args.genome:
            logger.warning(genome_error_message)
            return
        if not args.signatures:
            logger.warning("Set of signatures required. Use 5 and 10 for MUTAGENE-5 and MUTAGENE-10. Use 30 for COSMIC-30")
            return

        if args.method.lower() not in IDENTIFY_MIN_FUNCTIONS:
            logger.warning('Unknown method provided')
            return

        if args.bootstrap_replicates < 10:
            logger.warning("Number of bootstrap replicates too low. Specify at least 10 replicates")
            return

        # NOTE(review): the message mentions an upper limit of 99% that is
        # not enforced here - confirm whether it should be.
        if args.bootstrap_confidence_level < 70:
            logger.warning("Specify confidence level of at least 70% and less than 99%")
            return

        only = None
        if args.keep_only is not None:
            # str.split never returns an empty list ("" -> ['']), so blank
            # entries are dropped before testing for emptiness.
            only = [name.strip() for name in args.keep_only.split(",") if name.strip()]
            if not only:
                logger.warning("List of signatures to keep for the analysis is empty")
                return
            logger.warning("We will only analyze signatures in this list: {}".format(", ".join(only)))

        if 'input_format' not in args:
            # guess format from file name
            name = args.infile.name.upper()
            if name.endswith("MAF"):
                args.input_format = "MAF"
            elif name.endswith("VCF"):
                args.input_format = "VCF"
            else:
                logger.warning("Input format was not specified. Assuming it is MAF")
                args.input_format = "MAF"

        W, signature_names = read_signatures(args.signatures, only=only)

        try:
            mutations, _, _ = read_mutations(args.input_format, args.infile, args.genome, window_size=1)
        except Exception as e:
            e_message = getattr(e, 'message', repr(e))
            logger.warning(
                "Parsing {0} failed. "
                "Check that the input file is in {0} format "
                "or specify a different format using option -f \n"
                "{1}".format(args.input_format, e_message))

            # Keep the traceback available when debugging.
            if logger.root.level == logging.DEBUG:
                raise
            return

        def _decompose(profile):
            # Shared decomposition call; only the exposures are of interest.
            _, _, results = decompose_mutational_profile_counts(
                profile,
                (W, signature_names),
                args.method,
                others_threshold=0.0,
                enable_dummy=args.no_unexplained_variance)
            return results

        samples_profiles = get_multisample_mutational_profile(mutations, counts=True)
        samples_results = {
            sample: _decompose(profile)
            for sample, profile in samples_profiles.items()
        }

        if not args.bootstrap:
            write_decomposition(args.outfile, samples_results, signature_names, mutations_threshold=args.mutations_threshold)
            return

        bootstrap_samples_results = {}
        for sample, profile in samples_profiles.items():
            resampled_profiles = tqdm(
                generate_resampled_profiles(profile, args.bootstrap_replicates),
                total=args.bootstrap_replicates)
            bootstrap_samples_results[sample] = [
                _decompose(resampled_profile)
                for resampled_profile in resampled_profiles
            ]

        write_decomposition(
            args.outfile, samples_results, signature_names,
            mutations_threshold=args.mutations_threshold,
            bootstrap_method=args.bootstrap_method,
            profile=samples_profiles,
            bootstrap_results=bootstrap_samples_results,
            bootstrap_level=args.bootstrap_confidence_level)
Esempio n. 7
0
def aggregate_multiple_benchmarks():
    """Score every decomposition of the multiple-signature benchmark.

    Each ``*.profile`` file in ``data/benchmark/multiple`` is expected to be
    named ``<sigtype>_<r>_<nmut>_<replica>.profile`` and to have a sibling
    ``.info`` file with the true exposures. Only files of the 30-signature
    set whose true exposures contain between 2 and 10 signatures above the
    0.06 threshold are scored.

    For every method result file found next to the profile, one tab-separated
    row is written to ``data/benchmark/multiple/res1.txt`` with RMSE values
    (exposures, thresholded exposures, reconstructed profile), log-likelihoods
    of the true and estimated exposures (raw and thresholded) and the
    precision / recall / accuracy / F1 of the detected signature set.

    Raises ValueError when a profile file name does not consist of exactly
    four underscore-separated fields.
    """
    # Local import: keeps the block self-contained regardless of what the
    # module already imports.
    import os

    # Method label -> suffix of the result file next to each profile.
    methods = {
        "mle": ".MLE.info",
        "mlez": ".MLEZ.info",
        "ds": ".ds.info",
        'aicc': '.AICc.info',
        'bic': '.BIC.info',
        'aiccz': '.AICcz.info',
        'bicz': '.BICz.info',
    }

    # Exposures at or below this value are treated as absent.
    threshold = 0.06

    with open("data/benchmark/multiple/res1.txt", 'w') as o:
        o.write(
            "file_id\tsigtype\tnsig\tnmut\tmethod\tSRMSE\tPRMSE\tSTRMSE\tLLIK\tLLIK0\tTLLIK\tTLLIK0\tprecision\trecall\taccuracy\tf1\n"
        )
        for fname in glob.glob("data/benchmark/multiple/*.profile"):
            # os.path handles whichever separator glob returns on this OS.
            file_id = os.path.basename(fname).split(".")[0]
            prefix = os.path.join(os.path.dirname(fname), file_id)

            sigtype, r, nmut, _replica = file_id.split("_")
            sigtype = int(sigtype)

            if sigtype != 30:
                continue

            W, signature_names = read_signatures(sigtype)

            orig_profile = read_profile_file(fname)
            h0, _ = read_decomposition(prefix + '.info')
            if h0 is None:
                # No ground truth available: nothing to compare against.
                continue

            h0_threshold = np.where(h0 > threshold, h0,
                                    0.0)  # zero below threshold
            h0_binary = h0_threshold > 0.0  # true / false for threshold
            nsig = np.count_nonzero(h0_binary)

            if nsig < int(r):
                print("LESS", sigtype, nsig, r)

            if nsig > int(r):
                print("MORE", sigtype, nsig, r)

            if nsig <= 1 or nsig > 10:
                continue

            # These depend only on the sample, not on the method.
            observed = np.array(orig_profile)
            observed_freq = observed / observed.sum()
            LLIK0 = -NegLogLik(h0, W, orig_profile)
            TLLIK0 = -NegLogLik(h0_threshold, W, orig_profile)

            for method, suffix in methods.items():
                values, _ = read_decomposition(prefix + suffix)
                if values is None:
                    continue

                h = np.array(values)
                if h.sum() == 0:
                    continue

                h_threshold = np.where(h > threshold, h,
                                       0.0)  # zero below threshold
                h_binary = h_threshold > 0.0  # true / false for threshold

                reconstructed = np.array(W.dot(h))
                PRMSE = np.sqrt(
                    mean_squared_error(
                        observed_freq,
                        reconstructed / reconstructed.sum()))
                SRMSE = np.sqrt(mean_squared_error(h0, h))
                STRMSE = np.sqrt(mean_squared_error(h0_threshold, h_threshold))
                LLIK = -NegLogLik(h, W, orig_profile)
                TLLIK = -NegLogLik(h_threshold, W, orig_profile)

                precision = precision_score(h0_binary, h_binary)
                recall = recall_score(h0_binary, h_binary)
                accuracy = accuracy_score(h0_binary, h_binary)
                f1 = f1_score(h0_binary, h_binary)

                row = (file_id, sigtype, nsig, nmut, method, SRMSE, PRMSE,
                       STRMSE, LLIK, LLIK0, TLLIK, TLLIK0, precision,
                       recall, accuracy, f1)
                o.write("\t".join(map(str, row)) + "\n")