Code Example #1
def process_one_species(species_name):
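    """
    Compute within-host SNP run statistics for one species.

    Loads the precomputed within-sample SFS, allele counts, sample list and
    passed-site maps, locates runs in the per-gene SNP vector of each sample,
    and pickles the per-sample SNP counts and run statistics.
    """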
    t0 = time.time()
    # Load data first; these pickles are computed by parse_snps_to_pickle.py
    all_genes = core_gene_utils.get_sorted_core_genes(species_name)
    data_dir = os.path.join(config.analysis_directory,
                            "within_hosts_checkpoints/")

    if os.path.exists("{}{}/all_runs_map.pickle".format(data_dir, species_name)):
        print('{} already processed'.format(species_name))
        return
    _, sfs_map = parse_midas_data.parse_within_sample_sfs(
            species_name, allowed_variant_types=set(['4D']))
    allele_counts_map = pickle.load(
        open("{}{}/allele_counts_map.pickle".format(data_dir, species_name), 'rb'))
    found_samples = pickle.load(
        open("{}{}/found_samples.pickle".format(data_dir, species_name), 'rb'))
    passed_sites_map = pickle.load(
        open("{}{}/passed_sites_map.pickle".format(data_dir, species_name), 'rb'))
    print("Finish loading data for {} at {} min".format(
        species_name, (time.time() - t0)/60))

    counts_map = dict()
    runs_map = dict()
    for sample_idx in xrange(len(found_samples)):
        sample_id = found_samples[sample_idx]
        gene_snp_map = HGT_utils.find_single_host_relative_snps(
                sample_idx, found_samples, allele_counts_map, sfs_map)
        if gene_snp_map is None:
            print("Sample {} has no clear peak".format(sample_id))
            continue
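        # Vectorize SNP counts along the sorted core genes, then locate runs
        # in that vector together with their start/end indices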
        all_gene_counts = HGT_utils.get_gene_snp_vector(
                gene_snp_map, all_genes)
        counts_map[sample_idx] = sum(all_gene_counts)
        runs, starts, ends = HGT_utils.find_runs(all_gene_counts)

        # Now count the number of passed sites for each run
        passed_site_vec = get_passed_site_vector(
                passed_sites_map, all_genes, sample_idx, sample_idx)
        site_counts = np.array([sum(passed_site_vec[start:end+1])
                                for (start, end) in zip(starts, ends)])
        # Now count the number of anomalous events
        runs_map[sample_idx] = (runs, starts, ends, site_counts)

    # save data
    pickle.dump(runs_map, open(
        "{}{}/all_runs_map.pickle".format(data_dir, species_name), 'wb'))
    pickle.dump(counts_map, open(
        "{}{}/snp_counts_map.pickle".format(data_dir, species_name), 'wb'))
    print("Finish saving data for {} at {} min".format(
        species_name, (time.time() - t0)/60))
Code Example #2
def process_one_species(species_name):
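    """
    For every high-coverage sample of a species, plot the folded within-sample
    SFS, mark the detected peaks and SNP cutoff, and save the figure under
    analysis_directory/allele_freq/<species_name>/<number of peaks>/.
    """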
    if os.path.exists(os.path.join(config.analysis_directory, 'allele_freq', species_name)):
        print("{} already processed".format(species_name))
        return

    samples, sfs_map = parse_midas_data.parse_within_sample_sfs(
        species_name, allowed_variant_types=set(['4D']))
    highcoverage_samples = list(
        diversity_utils.calculate_highcoverage_samples(species_name))

    for sample in highcoverage_samples:
        all_fs, all_pfs = sfs_utils.calculate_binned_sfs_from_sfs_map(
            sfs_map[sample], folding='major')
        df = all_fs[1] - all_fs[0]
        # For peak finding, only use the polymorphic sites
        pfs = all_pfs[all_fs < 0.95]
        fs = all_fs[all_fs < 0.95]

        # Find the max peak size
        within_sites, between_sites, total_sites = sfs_utils.calculate_polymorphism_rates_from_sfs_map(
            sfs_map[sample])
        between_line = (between_sites * 1.0 / total_sites
                        / ((fs > 0.2) * (fs < 0.5)).sum())
        pmax = np.max([pfs[(fs > 0.1)*(fs < 0.95)].max(), between_line])

        peak_idx, cutoff = HGT_utils._find_sfs_peaks_and_cutoff(fs, pfs, pmax)

        num_peaks = len(peak_idx)

        # Now plot and save the figure
        _ = plt.figure()
        ax = plt.gca()
        ax.set_xlim([0.50, 1.00])
        ax.set_ylim([0, pmax*3])
        ax.bar((all_fs-df/2), all_pfs, width=df)
        ax.plot(fs[peak_idx]-df/2, pfs[peak_idx], 'rx', label='peaks detected')
        ax.set_xlabel('Major allele freq')

        if cutoff:
            ax.axvspan(min(fs), cutoff, alpha=0.1, color='red', label='SNP sites')
        ax.legend()

        path = os.path.join(config.analysis_directory, 'allele_freq',
                            species_name, str(num_peaks))
        if not os.path.exists(path):
            os.makedirs(path)
        plt.savefig(os.path.join(path, sample + '.png'))
        plt.close()
Code Example #3
def get_single_peak_sample_mask(species_name):
    """
    Compute a mask that keep only samples suitable for within host analysis
    A sample need to be 1) well covered, 2) has single clean peak
    The list of sample names and the list of peak cutoffs will also be returned
    """
    sample_names = get_sample_names(species_name)

    blacklist = set(HGT_utils.get_within_host_bad_samples(species_name))

    highcoverage_samples = set(
        diversity_utils.calculate_highcoverage_samples(species_name))
    single_peak_dir = os.path.join(config.analysis_directory, 'allele_freq',
                                   species_name, '1')
    if not os.path.exists(single_peak_dir):
        print("No single peak samples found for {}".format(species_name))
        mask = np.zeros(len(sample_names)).astype(bool)
        return mask, sample_names, np.array([])
    else:
        single_peak_samples = set([
            f.split('.')[0] for f in os.listdir(single_peak_dir)
            if not f.startswith('.')
        ])
        allowed_samples = single_peak_samples & (highcoverage_samples - blacklist)
    mask = np.isin(sample_names, list(allowed_samples))

    # filter samples with a clean single peak
    _, sfs_map = parse_midas_data.parse_within_sample_sfs(
        species_name, allowed_variant_types=set(['4D']))
    results = [
        HGT_utils.find_sfs_peaks_and_cutoff(sample, sfs_map)
        for sample in sample_names[mask]
    ]
    cutoffs = np.array([res[1] for res in results])
    clean_peak_mask = np.array([cutoff is not None for cutoff in cutoffs])
    mask[mask] = clean_peak_mask
    good_cutoffs = cutoffs[clean_peak_mask]
    return mask, sample_names, good_cutoffs.astype(float)
Code Example #4
        temporal_change_directory, species_name)

    output_file = gzip.open(intermediate_filename, "w")
    # header!
    output_file.write(", ".join([
        'Species', 'Sample1', 'Sample2', 'Type', 'L', 'Perr', 'Change1', '...'
    ]))
    output_file.write("\n")

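    # For each species: load per-sample coverage and SFSs, restrict to samples
    # involved in a timecourse, and require enough same-subject pairs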
    for species_name in good_species_list:

        sample_coverage_map = parse_midas_data.parse_sample_coverage_map(
            species_name)

        sys.stderr.write("Loading SFSs for %s...\t" % species_name)
        samples, sfs_map = parse_midas_data.parse_within_sample_sfs(
            species_name, allowed_variant_types=set(['1D', '2D', '3D', '4D']))
        sys.stderr.write("Done!\n")

        sys.stderr.write("Loading temporal samples...\n")
        # Only plot samples above a certain depth threshold that are involved in a timecourse
        snp_samples = diversity_utils.calculate_temporal_samples(species_name)

        # Deliberately include non-consecutive pairs too
        # (the restriction to consecutive pairs happens later)
        same_sample_idxs, same_subject_idxs, diff_subject_idxs = sample_utils.calculate_nonconsecutive_ordered_subject_pairs(
            sample_order_map, snp_samples)

        if len(same_subject_idxs[0]) < min_sample_size:
            sys.stderr.write("Not enough temporal samples!\n")
            continue
Code Example #5
    divergence_matrices[species_name] = snp_substitution_matrix

    between_divergences[species_name] = []
    for i in xrange(0, divergence_matrices[species_name].shape[0]):
        for j in xrange(i + 1, divergence_matrices[species_name].shape[0]):

            if divergence_matrices[species_name][i, j] >= 0:

                between_divergences[species_name].append(
                    divergence_matrices[species_name][i, j])
    between_divergences[species_name] = numpy.array(
        between_divergences[species_name])

    # Load SNP information for species_name
    sys.stderr.write("Loading SFSs for %s...\t" % species_name)
    sfs_samples, sfs_map = parse_midas_data.parse_within_sample_sfs(
        species_name)
    sys.stderr.write("Done!\n")

    highcoverage_samples = diversity_utils.calculate_highcoverage_samples(
        species_name)
    desired_samples = snp_samples

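    # Within-sample polymorphism rate: fraction of passed sites that are
    # polymorphic within each sample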
    within_polymorphisms[species_name] = []
    for sample in desired_samples:
        within_sites, between_sites, total_sites = sfs_utils.calculate_polymorphism_rates_from_sfs_map(
            sfs_map[sample])
        within_polymorphisms[species_name].append(within_sites * 1.0 /
                                                  total_sites)

species_names = []
sample_sizes = []
Code Example #6
    species_name)
median_coverages = numpy.array([
    stats_utils.calculate_nonzero_median_from_histogram(
        sample_coverage_histogram)
    for sample_coverage_histogram in sample_coverage_histograms
])
sample_coverage_map = {
    samples[i]: median_coverages[i]
    for i in xrange(0, len(samples))
}

# Load pi information for species_name
sys.stderr.write("Loading within-sample diversity for %s...\n" % species_name)
samples, site_map = parse_midas_data.parse_within_sample_sfs(
    species_name,
    allowed_variant_types=set(['4D']),
    allowed_genes=core_genes,
    debug=debug)
sys.stderr.write("Done!\n")

median_coverages = numpy.array(
    [sample_coverage_map[samples[i]] for i in xrange(0, len(samples))])

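# Debug check: print the number of SFS entries per sample, then stop early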
print([len(site_map[sample].keys()) for sample in samples])

sys.exit(0)

# Only plot samples above a certain depth threshold that are "haploids"
snp_samples = samples[(median_coverages >= min_coverage) * (pis <= 1e-03)]

num_haploids = len(snp_samples)