def get_QP_sample_mask(species_name):
    """Build a boolean mask over a species' samples that are QP and high coverage.

    Args:
        species_name: Species identifier forwarded to the sample lookups.

    Returns:
        A ``(mask, sample_names)`` pair. ``sample_names`` is the value returned
        by ``get_sample_names``; ``mask`` is the ``np.isin`` result flagging the
        names present in both the haploid (QP) and the high-coverage sample
        sets reported by ``diversity_utils``.
    """
    sample_names = get_sample_names(species_name)

    haploid = set(diversity_utils.calculate_haploid_samples(species_name))
    well_covered = set(
        diversity_utils.calculate_highcoverage_samples(species_name))

    # np.isin is given a list rather than the set itself.
    keep = list(haploid.intersection(well_covered))
    mask = np.isin(sample_names, keep)
    return mask, sample_names
def get_desired_samples(species_name, between_host=False):
    """Return the set of high-coverage samples to analyse for a species.

    Args:
        species_name: Species identifier forwarded to ``diversity_utils`` and
            used as a path component under ``config.analysis_directory``.
        between_host: When true, restrict to haploid (QP) samples. Otherwise
            restrict to samples that have a file in the species'
            ``allele_freq/<species_name>/1`` directory.

    Returns:
        A set of sample names intersected with the high-coverage samples, or
        ``None`` (after printing a hint) when ``between_host`` is false and the
        ``allele_freq/<species_name>/1`` directory does not exist.
    """
    well_covered = set(
        diversity_utils.calculate_highcoverage_samples(species_name))

    if between_host:
        haploid = set(diversity_utils.calculate_haploid_samples(species_name))
        return haploid & well_covered

    peak_dir = os.path.join(
        config.analysis_directory, 'allele_freq', species_name, '1')
    if not os.path.exists(peak_dir):
        print("Please plot sfs by peaks first for {}".format(species_name))
        return None

    # The sample name is the part of each file name before the first '.';
    # entries starting with '.' are skipped.
    single_peak = set()
    for entry in os.listdir(peak_dir):
        if entry.startswith('.'):
            continue
        single_peak.add(entry.split('.')[0])
    return single_peak & well_covered
# Script-level settings. Neither value is used within this section.
# NOTE(review): min_change looks like a threshold on frequency change -- confirm
# against the code that consumes it.
min_change = 0.8
include_high_copynum = False
#include_high_copynum = True

# Load subject and sample metadata (progress is reported on stderr)
sys.stderr.write("Loading sample metadata...\n")
subject_sample_map = parse_HMP_data.parse_subject_sample_map()
sample_country_map = parse_HMP_data.parse_sample_country_map()
sample_order_map = parse_HMP_data.parse_sample_order_map()
sys.stderr.write("Done!\n")

# Only plot samples above a certain depth threshold that are involved in a timecourse.
# species_name must already be defined before this point.
snp_samples = diversity_utils.calculate_temporal_samples(species_name)

# The subset of samples that are haploid (a set, for membership tests)
haploid_samples = set(diversity_utils.calculate_haploid_samples(species_name))

# Optional country filter, currently disabled:
# Only use the subset from North America.
# The only temporal samples are from here, so it is best not to contaminate the
# between-subject comparisons with out-of-sample effects.
#snp_samples = snp_samples[parse_HMP_data.calculate_country_samples(sample_country_map, sample_list=snp_samples, allowed_countries=set(["United States"]))]

####################################################
#
# Set up Figure (4 panels, arranged in 2x2 grid)
#
# NOTE(review): this header says 4 panels in a 2x2 grid, while the comment
# after the figure creation says three panels -- confirm which is current.
#
####################################################

# Create (or select) figure 1 with figsize=(5, 2) and keep a handle to it.
pylab.figure(1, figsize=(5, 2))
fig = pylab.gcf()

# make three panels
# Load subject and sample metadata (progress is reported on stderr)
sys.stderr.write("Loading sample metadata...\n")
subject_sample_map = sample_utils.parse_subject_sample_map()
# Parsed once here and reused for every example below; it was previously
# re-parsed on every loop iteration.
sample_order_map = sample_utils.parse_sample_order_map()
sys.stderr.write("Done!\n")

# Each entry starts as [species_name, initial_sample]. The loop below appends,
# in place, every (sample_i, sample_j, sample_k) triplet from the same subject
# whose first sample equals initial_sample.
examples = [['Bacteroides_vulgatus_57955', '700021876'],
            ['Bacteroides_uniformis_57318', '700016456'],
            ['Bacteroides_caccae_53434', '700024998']]

# enumerate() replaces the xrange index loop so this runs on Python 2 and 3.
for example_idx, example in enumerate(examples):

    species_name = example[0]
    initial_sample = example[1]

    # Only plot samples above a certain depth threshold that are "haploids"
    haploid_samples = diversity_utils.calculate_haploid_samples(
        species_name, debug=debug)

    # Calculate which triplets of idxs belong to the same subject
    same_subject_idxs = parse_midas_data.calculate_ordered_subject_triplets(
        sample_order_map, haploid_samples)

    # Not used within this section; kept because later code may read it.
    temporal_samples = set()

    for i, j, k in same_subject_idxs:
        # Keep only triplets that start at the chosen initial sample.
        if haploid_samples[i] == initial_sample:
            example.append(
                (haploid_samples[i], haploid_samples[j], haploid_samples[k]))

# Now can plot them the same way as before