Esempio n. 1
0
def get_QP_sample_mask(species_name):
    """Build a boolean mask over the samples of ``species_name``.

    A sample is selected when it appears both in
    ``diversity_utils.calculate_haploid_samples(species_name)`` and in
    ``diversity_utils.calculate_highcoverage_samples(species_name)``.

    Returns:
        A ``(mask, sample_names)`` pair: ``sample_names`` is whatever
        ``get_sample_names(species_name)`` returned, and ``mask`` is the
        ``np.isin`` membership test of those names against the selected
        samples.
    """
    all_names = get_sample_names(species_name)

    haploid = set(diversity_utils.calculate_haploid_samples(species_name))
    well_covered = set(
        diversity_utils.calculate_highcoverage_samples(species_name))

    # Keep only samples that pass both filters.
    keep = list(haploid.intersection(well_covered))

    mask = np.isin(all_names, keep)
    return mask, all_names
Esempio n. 2
0
def get_desired_samples(species_name, between_host=False):
    """Return the set of samples to analyse for ``species_name``.

    The result is always intersected with
    ``diversity_utils.calculate_highcoverage_samples(species_name)``.

    Args:
        species_name: species identifier passed on to ``diversity_utils``
            and used as a path component.
        between_host: when true, the candidates are the samples returned by
            ``diversity_utils.calculate_haploid_samples``. Otherwise they are
            taken from the file names found in
            ``<config.analysis_directory>/allele_freq/<species_name>/1``.

    Returns:
        A set of sample names, or ``None`` (after printing a hint) when the
        directory required for the ``between_host=False`` case is missing.
    """
    well_covered = set(
        diversity_utils.calculate_highcoverage_samples(species_name))

    if between_host:
        haploid = set(diversity_utils.calculate_haploid_samples(species_name))
        return haploid & well_covered

    peak_dir = os.path.join(config.analysis_directory, 'allele_freq',
                            species_name, '1')
    if not os.path.exists(peak_dir):
        print("Please plot sfs by peaks first for {}".format(species_name))
        return None

    # The sample name is the part of each file name before the first dot;
    # entries starting with a dot (hidden files) are ignored.
    from_files = set()
    for entry in os.listdir(peak_dir):
        if entry.startswith('.'):
            continue
        from_files.add(entry.split('.')[0])
    return from_files.intersection(well_covered)
# Analysis settings. Neither value is read in the visible part of this
# script -- presumably they are consumed further down (TODO confirm).
min_change = 0.8
include_high_copynum = False
#include_high_copynum = True

# Load subject and sample metadata (progress is reported on stderr)
sys.stderr.write("Loading sample metadata...\n")
subject_sample_map = parse_HMP_data.parse_subject_sample_map()
sample_country_map = parse_HMP_data.parse_sample_country_map()
sample_order_map = parse_HMP_data.parse_sample_order_map()
sys.stderr.write("Done!\n")

# Only plot samples above a certain depth threshold that are involved in a timecourse
# NOTE(review): species_name is not defined in the visible part of the script;
# it must be set earlier.
snp_samples = diversity_utils.calculate_temporal_samples(species_name)

# The subset of samples that are haploid
haploid_samples = set(diversity_utils.calculate_haploid_samples(species_name))

# Only use the subset from North America.
# The only temporal samples are from there, so it is best not to contaminate the
# between-subject comparisons with out-of-sample effects.
# NOTE: this filter is currently disabled (the line below is commented out),
# so snp_samples is not restricted by country here.
#snp_samples = snp_samples[parse_HMP_data.calculate_country_samples(sample_country_map, sample_list=snp_samples, allowed_countries=set(["United States"]))]

####################################################
#
# Set up Figure
# NOTE(review): this header used to say "4 panels, arranged in 2x2 grid",
# while the comment after the figure creation says three panels; the
# panel-creation code is not visible here -- confirm the actual layout.
#
####################################################

pylab.figure(1, figsize=(5, 2))
fig = pylab.gcf()
# make three panels
Esempio n. 4
0
# Load subject and sample metadata (progress is reported on stderr)
sys.stderr.write("Loading sample metadata...\n")
subject_sample_map = sample_utils.parse_subject_sample_map()
sample_order_map = sample_utils.parse_sample_order_map()
sys.stderr.write("Done!\n")

# Each entry starts as [species_name, initial_sample]. The loop below appends
# one (sample_i, sample_j, sample_k) tuple for every same-subject triplet
# whose first sample equals initial_sample.
examples = [['Bacteroides_vulgatus_57955', '700021876'],
            ['Bacteroides_uniformis_57318', '700016456'],
            ['Bacteroides_caccae_53434', '700024998']]

for example in examples:
    species_name = example[0]
    initial_sample = example[1]

    # Only plot samples above a certain depth threshold that are "haploids"
    haploid_samples = diversity_utils.calculate_haploid_samples(species_name,
                                                                debug=debug)

    # Calculate which triplets of idxs belong to the same subject.
    # sample_order_map was loaded once above and is reused for every species
    # instead of being re-parsed on each iteration.
    same_subject_idxs = parse_midas_data.calculate_ordered_subject_triplets(
        sample_order_map, haploid_samples)

    # NOTE(review): never filled in the visible code; kept in case code
    # further down the script reads it.
    temporal_samples = set()
    for i, j, k in same_subject_idxs:
        if haploid_samples[i] == initial_sample:
            example.append(
                (haploid_samples[i], haploid_samples[j], haploid_samples[k]))

# Now can plot them the same way as before