Exemple #1
0
def tool_newref(args):
    """Create a new reference from the NPZ samples listed in ``args.infiles``.

    The output base path is ``args.outfile`` with a trailing ``.npz`` removed;
    ``basepath``, ``prepfile``, ``partfile`` and (per sub-reference)
    ``tmpoutfile`` are stored on ``args`` for the downstream helpers. Every
    sample is rescaled to ``args.binsize``, the gender model is trained, and
    the autosomal ('A') reference is built, followed by the female ('F') and,
    unless ``args.nipt`` is set, the male ('M') gonosomal references whenever
    more than four samples of that gender are available. The partial outputs
    are finally combined by ``tool_newref_merge``.

    Args:
        args: parsed command-line namespace; reads ``outfile``, ``infiles``,
            ``binsize``, ``nipt`` and ``cpus`` and is extended in place with
            the path attributes listed above.

    Raises:
        SystemExit: with status 1 when fewer than 10 input samples are given.
    """
    logging.info('Creating new reference')

    out_dir, out_name = os.path.split(args.outfile)
    if out_name.endswith('.npz'):
        out_name = out_name[:-4]
    base_path = os.path.join(out_dir, out_name)

    args.basepath = base_path
    args.prepfile = '{}_prep.npz'.format(base_path)
    args.partfile = '{}_part'.format(base_path)

    # Fail fast: one gender label is produced per sample (the boolean masks
    # below index `samples` with `genders`), so the sample count is already
    # known here. This avoids loading every file and training the gender model
    # only to abort afterwards.
    if len(args.infiles) < 10:
        logging.critical(
            'Provide at least 10 samples to enable the generation of a reference.'
        )
        sys.exit(1)

    samples = []
    logging.info('Importing data ...')
    for infile in args.infiles:
        logging.info('Loading: {}'.format(infile))
        # 'sample' is stored as a pickled object array; NumPy >= 1.16.3
        # refuses to load it unless allow_pickle=True is passed.
        # NOTE(review): unpickling executes arbitrary code, so only load
        # sample files from trusted sources.
        with np.load(infile, encoding='latin1', allow_pickle=True) as npzdata:
            sample = npzdata['sample'].item()
            binsize = int(npzdata['binsize'])
        logging.info('Binsize: {}'.format(int(binsize)))
        samples.append(scale_sample(sample, binsize, args.binsize))

    samples = np.array(samples)
    genders, trained_cutoff = train_gender_model(samples)

    gender_labels = np.array(genders)
    is_female = gender_labels == 'F'
    is_male = gender_labels == 'M'
    enough_females = genders.count('F') > 4
    enough_males = genders.count('M') > 4

    if not args.nipt:
        for i, sample in enumerate(samples):
            samples[i] = gender_correct(sample, genders[i])

    # The final mask is the intersection of the overall mask with the
    # gender-specific masks of every gender that gets its own reference.
    total_mask, bins_per_chr = get_mask(samples)
    if enough_females:
        mask_F, _ = get_mask(samples[is_female])
        total_mask = total_mask & mask_F
    if enough_males and not args.nipt:
        mask_M, _ = get_mask(samples[is_male])
        total_mask = total_mask & mask_M

    outfiles = []

    logging.info('Starting autosomal reference creation ...')
    args.tmpoutfile = '{}.tmp.A.npz'.format(args.basepath)
    outfiles.append(args.tmpoutfile)
    tool_newref_prep(args, samples, 'A', total_mask, bins_per_chr)
    logging.info('This might take a while ...')
    tool_newref_main(args, args.cpus)

    if enough_females:
        logging.info('Starting female gonosomal reference creation ...')
        args.tmpoutfile = '{}.tmp.F.npz'.format(args.basepath)
        outfiles.append(args.tmpoutfile)
        tool_newref_prep(args, samples[is_female], 'F', total_mask,
                         bins_per_chr)
        logging.info('This might take a while ...')
        tool_newref_main(args, 1)
    else:
        logging.warning(
            'Provide at least 5 female samples to enable normalization of female gonosomes.'
        )

    if not args.nipt:
        if enough_males:
            logging.info('Starting male gonosomal reference creation ...')
            args.tmpoutfile = '{}.tmp.M.npz'.format(args.basepath)
            outfiles.append(args.tmpoutfile)
            tool_newref_prep(args, samples[is_male], 'M', total_mask,
                             bins_per_chr)
            tool_newref_main(args, 1)
        else:
            logging.warning(
                'Provide at least 5 male samples to enable normalization of male gonosomes.'
            )

    tool_newref_merge(args, outfiles, trained_cutoff)

    logging.info('Finished creating reference')
Exemple #2
0
def tool_test(args):
    """Predict copy number aberrations for one sample against a reference.

    Validates the command-line parameters, loads the reference
    (``args.reference``) and the sample (``args.infile``), rescales the sample
    to the reference bin size, normalizes autosomes and gonosomes, combines
    and post-processes the results, optionally applies the blacklist, runs the
    segmentation step and writes the requested outputs.

    Args:
        args: parsed command-line namespace; reads ``bed``, ``plot``,
            ``zscore``, ``beta``, ``alpha``, ``reference``, ``infile``,
            ``gender`` and ``blacklist`` and is forwarded to the helpers.

    Raises:
        SystemExit: with status 1 when no output format is selected or when
            ``--zscore``, ``--beta`` or ``--alpha`` is out of range.
    """
    logging.info('Starting CNA prediction')

    if not args.bed and not args.plot:
        logging.critical(
            'No output format selected. '
            'Select at least one of the supported output formats (--bed, --plot)'
        )
        sys.exit(1)

    if args.zscore <= 0:
        logging.critical(
            'Parameter --zscore should be a strictly positive number')
        sys.exit(1)

    # Both --beta and --alpha accept the closed upper bound 1, so the messages
    # say "lower than or equal to 1".
    if args.beta is not None:
        if args.beta <= 0 or args.beta > 1:
            logging.critical(
                'Parameter --beta should be a strictly positive number lower than or equal to 1'
            )
            sys.exit(1)

    if args.alpha <= 0 or args.alpha > 1:
        logging.critical(
            'Parameter --alpha should be a strictly positive number lower than or equal to 1'
        )
        sys.exit(1)

    logging.info('Importing data ...')
    # 'sample' is a pickled object array; NumPy >= 1.16.3 needs
    # allow_pickle=True to read it. The reference is opened the same way
    # (presumably it may hold object arrays too, given encoding='latin1').
    # NOTE(review): unpickling executes arbitrary code, so only load trusted
    # files.
    with np.load(args.reference, encoding='latin1',
                 allow_pickle=True) as ref_file:
        with np.load(args.infile, encoding='latin1',
                     allow_pickle=True) as sample_file:
            sample = sample_file['sample'].item()
            sample_binsize = int(sample_file['binsize'].item())

        # Read count is taken before rescaling.
        n_reads = sum(sum(sample[chrom]) for chrom in sample)

        sample = scale_sample(sample, sample_binsize,
                              int(ref_file['binsize']))

        is_nipt = bool(ref_file['is_nipt'])
        if is_nipt:
            actual_gender = 'F'
        else:
            actual_gender = predict_gender(sample, ref_file['trained_cutoff'])

        # A user-supplied gender always overrides the predicted/default one.
        if args.gender:
            actual_gender = args.gender

        if not is_nipt:
            sample = gender_correct(sample, actual_gender)

        ref_gender = actual_gender

        logging.info('Normalizing autosomes ...')

        results_r, results_z, results_w, ref_sizes, m_lr, m_z = normalize(
            args, sample, ref_file, 'A')

        # Fall back to the other gender's gonosomal reference when the
        # reference lacks one for the sample's gender.
        if not ref_file['has_male'] and actual_gender == 'M':
            logging.warning(
                'This sample is male, whilst the reference is created with fewer than 5 males. '
                'The female gonosomal reference will be used for X predictions. Note that these might '
                'not be accurate. If the latter is desired, create a new reference and include more '
                'male samples.')
            ref_gender = 'F'

        elif not ref_file['has_female'] and actual_gender == 'F':
            logging.warning(
                'This sample is female, whilst the reference is created with fewer than 5 females. '
                'The male gonosomal reference will be used for XY predictions. Note that these might '
                'not be accurate. If the latter is desired, create a new reference and include more '
                'female samples.')
            ref_gender = 'M'

        logging.info('Normalizing gonosomes ...')

        null_ratios_aut_per_bin = ref_file['null_ratios']
        n_aut_bins = len(null_ratios_aut_per_bin)
        # Only the rows after the autosomal ones are kept for the gonosomes.
        null_ratios_gon_per_bin = ref_file['null_ratios.{}'.format(
            ref_gender)][n_aut_bins:]

        results_r_2, results_z_2, results_w_2, ref_sizes_2, _, _ = normalize(
            args, sample, ref_file, ref_gender)

        rem_input = {
            'args': args,
            'wd': str(os.path.dirname(os.path.realpath(__file__))),
            'binsize': int(ref_file['binsize']),
            'n_reads': n_reads,
            'ref_gender': ref_gender,
            'actual_gender': actual_gender,
            'mask': ref_file['mask.{}'.format(ref_gender)],
            'bins_per_chr': ref_file['bins_per_chr.{}'.format(ref_gender)],
            'masked_bins_per_chr':
            ref_file['masked_bins_per_chr.{}'.format(ref_gender)],
            'masked_bins_per_chr_cum':
            ref_file['masked_bins_per_chr_cum.{}'.format(ref_gender)],
        }
    # The reference archive is closed here; everything needed from it has been
    # read into local arrays above.

    results_r = np.append(results_r, results_r_2)
    results_z = np.append(results_z, results_z_2) - m_z
    results_w = np.append(results_w * np.nanmedian(results_w_2),
                          results_w_2 * np.nanmedian(results_w))
    results_w = results_w / np.nanmedian(results_w)
    ref_sizes = np.append(ref_sizes, ref_sizes_2)

    # Per null sample, the median autosomal ratio over the bins for which this
    # sample has a (non-NaN) result; it is subtracted from that null sample.
    aut_mask = (~np.isnan(results_r))[:n_aut_bins]
    null_m_lr_aut = np.array([
        np.nanmedian(per_sample[aut_mask])
        for per_sample in np.transpose(null_ratios_aut_per_bin)
    ])

    null_ratios_aut_per_bin = null_ratios_aut_per_bin - null_m_lr_aut
    null_ratios = np.array([x.tolist() for x in null_ratios_aut_per_bin] +
                           [x.tolist() for x in null_ratios_gon_per_bin])

    results = {
        'results_r': results_r,
        'results_z': results_z,
        'results_w': results_w,
        'results_nr': null_ratios
    }

    for key in results:
        results[key] = get_post_processed_result(args, results[key],
                                                 ref_sizes, rem_input)

    log_trans(results, m_lr)

    if args.blacklist:
        logging.info('Applying blacklist ...')
        apply_blacklist(rem_input, results)

    logging.info('Executing circular binary segmentation ...')

    results['results_c'] = exec_cbs(rem_input, results)

    if args.bed:
        logging.info('Writing tables ...')
        generate_output_tables(rem_input, results)

    if args.plot:
        logging.info('Writing plots ...')
        exec_write_plots(rem_input, results)

    logging.info('Finished prediction')