def get_QP_sample_mask(species_name):
    """Build a boolean mask over the species' samples for between-host use.

    A sample is kept when it appears both in the haploid ("QP") sample list
    and in the high-coverage sample list reported by ``diversity_utils``.

    Returns a tuple ``(mask, sample_names)`` where ``mask`` is the result of
    ``np.isin`` over ``sample_names`` (the value returned by
    ``get_sample_names``).
    """
    sample_names = get_sample_names(species_name)
    haploid = set(diversity_utils.calculate_haploid_samples(species_name))
    well_covered = set(
        diversity_utils.calculate_highcoverage_samples(species_name))

    # Only samples present in both collections are eligible.
    keep = list(haploid.intersection(well_covered))
    mask = np.isin(sample_names, keep)
    return mask, sample_names
def process_one_species(species_name):
    """Plot the major-allele SFS of every high-coverage sample of a species.

    For each sample the binned, major-folded SFS of 4D sites is computed,
    peaks are detected with ``HGT_utils._find_sfs_peaks_and_cutoff`` and the
    figure is written to
    ``<analysis_directory>/allele_freq/<species_name>/<num_peaks>/<sample>.png``.

    If the species output directory already exists the species is considered
    processed and nothing is done. Returns ``None``.
    """
    species_dir = os.path.join(
        config.analysis_directory, 'allele_freq', species_name)
    if os.path.exists(species_dir):
        print("{} already processed".format(species_name))
        return

    _, sfs_map = parse_midas_data.parse_within_sample_sfs(
        species_name, allowed_variant_types=set(['4D']))
    highcoverage_samples = list(
        diversity_utils.calculate_highcoverage_samples(species_name))

    for sample in highcoverage_samples:
        all_fs, all_pfs = sfs_utils.calculate_binned_sfs_from_sfs_map(
            sfs_map[sample], folding='major')
        df = all_fs[1] - all_fs[0]  # bin width, taken from the first two bins

        # For peak finding, only use the polymorphic sites
        polymorphic = all_fs < 0.95
        pfs = all_pfs[polymorphic]
        fs = all_fs[polymorphic]

        # Find the max peak size
        _, between_sites, total_sites = \
            sfs_utils.calculate_polymorphism_rates_from_sfs_map(
                sfs_map[sample])
        between_line = between_sites*1.0 / \
            total_sites/((fs > 0.2)*(fs < 0.5)).sum()
        pmax = np.max([pfs[(fs > 0.1)*(fs < 0.95)].max(), between_line])
        peak_idx, cutoff = HGT_utils._find_sfs_peaks_and_cutoff(fs, pfs, pmax)
        num_peaks = len(peak_idx)

        # Now plot and save the figure. The figure is closed in ``finally``
        # so that a failure while drawing or saving does not leak it.
        fig = plt.figure()
        try:
            ax = plt.gca()
            ax.set_xlim([0.50, 1.00])
            ax.set_ylim([0, pmax*3])
            ax.bar((all_fs-df/2), all_pfs, width=df)
            ax.plot(fs[peak_idx]-df/2, pfs[peak_idx],
                    'rx', label='peaks detected')
            ax.set_xlabel('Major allele freq')
            # NOTE(review): a falsy cutoff is treated as "no cutoff found";
            # confirm against the helper's return contract.
            if cutoff:
                ax.axvspan(min(fs), cutoff, alpha=0.1,
                           color='red', label='SNP sites')
            ax.legend()

            # Samples are grouped into sub-directories by detected peak count.
            path = os.path.join(species_dir, str(num_peaks))
            if not os.path.exists(path):
                os.makedirs(path)
            plt.savefig(os.path.join(path, sample + '.png'))
        finally:
            plt.close(fig)
def get_desired_samples(species_name, between_host=False):
    """Return the set of sample names to analyse for a species.

    With ``between_host=True`` the result is the intersection of the haploid
    samples and the high-coverage samples reported by ``diversity_utils``.

    Otherwise the result is the intersection of the high-coverage samples
    with the samples that have a figure in
    ``<analysis_directory>/allele_freq/<species_name>/1``. If that directory
    does not exist, a message is printed and ``None`` is returned.
    """
    highcoverage_samples = set(
        diversity_utils.calculate_highcoverage_samples(species_name))

    if between_host:
        QP_samples = set(
            diversity_utils.calculate_haploid_samples(species_name))
        return QP_samples & highcoverage_samples

    single_peak_dir = os.path.join(config.analysis_directory, 'allele_freq',
                                   species_name, '1')
    if not os.path.exists(single_peak_dir):
        print("Please plot sfs by peaks first for {}".format(species_name))
        return None

    # Figures are named "<sample>.png"; strip only the final extension so
    # that sample names which themselves contain a '.' are recovered intact.
    desired_samples = {
        os.path.splitext(f)[0]
        for f in os.listdir(single_peak_dir)
        if not f.startswith('.')  # skip hidden files
    }
    return desired_samples & highcoverage_samples
def get_single_peak_sample_mask(species_name):
    """
    Compute a mask that keeps only samples suitable for within-host analysis.

    A sample is kept when it
      1) is in the high-coverage sample list,
      2) has a figure in the single-peak directory
         ``<analysis_directory>/allele_freq/<species_name>/1``,
      3) is not in the within-host blacklist from ``HGT_utils``, and
      4) gets a non-``None`` cutoff from
         ``HGT_utils.find_sfs_peaks_and_cutoff``.

    Returns a tuple ``(mask, sample_names, cutoffs)``: the boolean mask over
    ``sample_names``, the sample names themselves, and a float array with one
    cutoff per kept sample (in mask order). When the single-peak directory
    does not exist, the mask is all ``False`` and ``cutoffs`` is empty.
    """
    sample_names = get_sample_names(species_name)
    blacklist = set(HGT_utils.get_within_host_bad_samples(species_name))
    highcoverage_samples = set(
        diversity_utils.calculate_highcoverage_samples(species_name))
    single_peak_dir = os.path.join(config.analysis_directory, 'allele_freq',
                                   species_name, '1')

    if not os.path.exists(single_peak_dir):
        print("No single peak samples found for {}".format(species_name))
        mask = np.zeros(len(sample_names), dtype=bool)
        return mask, sample_names, np.array([])

    # Figures are named "<sample>.png"; strip only the final extension so
    # that sample names which themselves contain a '.' are recovered intact.
    single_peak_samples = {
        os.path.splitext(f)[0]
        for f in os.listdir(single_peak_dir)
        if not f.startswith('.')  # skip hidden files
    }
    allowed_samples = (single_peak_samples & highcoverage_samples) - blacklist
    mask = np.isin(sample_names, list(allowed_samples))

    # filter samples with a clean single peak
    _, sfs_map = parse_midas_data.parse_within_sample_sfs(
        species_name, allowed_variant_types=set(['4D']))
    cutoffs = [
        HGT_utils.find_sfs_peaks_and_cutoff(sample, sfs_map)[1]
        for sample in sample_names[mask]
    ]

    # Force a boolean dtype: with no candidate samples the list is empty and
    # numpy would otherwise create a float array, which cannot be used as an
    # index or mask.
    clean_peak_mask = np.array(
        [cutoff is not None for cutoff in cutoffs], dtype=bool)
    # Narrow the currently-True entries of the mask down to the clean ones.
    mask[mask] = clean_peak_mask
    good_cutoffs = np.array(
        [cutoff for cutoff in cutoffs if cutoff is not None], dtype=float)
    return mask, sample_names, good_cutoffs
good_species_list = parse_midas_data.parse_good_species_list() if species!='all': good_species_list = [species] else: if debug: good_species_list = good_species_list[:3] # header for the output file. record_strs = [] for species_name in good_species_list: sys.stderr.write("Loading samples...\n") # Only plot samples above a certain depth threshold that are confidently phaseable. snp_samples = diversity_utils.calculate_highcoverage_samples(species_name, min_coverage=min_coverage) if len(snp_samples)<2: continue sys.stderr.write("found %d samples\n" % len(snp_samples)) # Analyze SNPs, looping over chunk sizes. # Clunky, but necessary to limit memory usage on cluster # Load SNP information for species_name sys.stderr.write("Loading SNPs for %s...\n" % species_name) snps = [] snp_map = {} # contig: list of locations map
for j in xrange(i + 1, divergence_matrices[species_name].shape[0]): if divergence_matrices[species_name][i, j] >= 0: between_divergences[species_name].append( divergence_matrices[species_name][i, j]) between_divergences[species_name] = numpy.array( between_divergences[species_name]) # Load SNP information for species_name sys.stderr.write("Loading SFSs for %s...\t" % species_name) sfs_samples, sfs_map = parse_midas_data.parse_within_sample_sfs( species_name) sys.stderr.write("Done!\n") highcoverage_samples = diversity_utils.calculate_highcoverage_samples( species_name) desired_samples = snp_samples within_polymorphisms[species_name] = [] for sample in desired_samples: within_sites, between_sites, total_sites = sfs_utils.calculate_polymorphism_rates_from_sfs_map( sfs_map[sample]) within_polymorphisms[species_name].append(within_sites * 1.0 / total_sites) species_names = [] sample_sizes = [] avg_divergences = [] for species_name in species_phylogeny_utils.sort_phylogenetically( divergence_matrices.keys()):