def get_core_gene_vector(species_name):
    """Return the core genes of *species_name*, ordered by numeric gene index.

    Gene names are assumed to end in '.<integer>'; the returned numpy array
    of gene names is sorted ascending by that integer suffix.
    """
    genes = np.array(list(core_gene_utils.parse_core_genes(species_name)))
    # extract the trailing integer of each gene name and sort by it
    suffixes = np.array([int(gene.split('.')[-1]) for gene in genes])
    return genes[np.argsort(suffixes)]
# Example #2
# 0
def main(between_host):
    """Parse 4D SNPs on core genes for every desired species and pickle them.

    Parameters
    ----------
    between_host : bool
        If True, checkpoints go to 'between_hosts_checkpoints'; otherwise
        to 'within_hosts_checkpoints' (both under config.analysis_directory).

    Species whose checkpoint directory already exists are skipped, so the
    run is resumable.
    """
    t0 = time.time()
    subdir = ('between_hosts_checkpoints' if between_host
              else 'within_hosts_checkpoints')
    intermediate_file_path = os.path.join(config.analysis_directory, subdir)

    for species_name in desired_species:
        print("Start processing {}".format(species_name))
        core_genes = core_gene_utils.parse_core_genes(species_name)
        desired_samples = get_desired_samples(species_name,
                                              between_host=between_host)
        if desired_samples is None or len(desired_samples) == 0:
            print("{} has no qualified samples".format(species_name))
            continue
        pickle_path = os.path.join(intermediate_file_path, species_name)
        if not os.path.exists(pickle_path):
            print('{} has not been processed'.format(species_name))
            # makedirs (not mkdir) so a missing checkpoint root is created too
            os.makedirs(pickle_path)
        else:
            print('{} already processed'.format(species_name))
            continue
        found_samples, allele_counts_map, passed_sites_map, final_line_number = parse_snps(
            species_name,
            allowed_samples=desired_samples,
            allowed_genes=core_genes,
            allowed_variant_types=['4D'])
        # use `with` so each pickle file is flushed and closed; the original
        # pickle.dump(obj, open(...)) leaked the file handles
        outputs = (('allele_counts_map.pickle', allele_counts_map),
                   ('found_samples.pickle', found_samples),
                   ('passed_sites_map.pickle', passed_sites_map))
        for filename, obj in outputs:
            with open(os.path.join(pickle_path, filename), 'wb') as f:
                pickle.dump(obj, f)
        print("Done processing {} at {} min".format(species_name,
                                                    (time.time() - t0) / 60))
# Example #3
# 0
                                                                debug=debug)
        # Only consider one sample per person
        snp_samples = snp_samples[parse_midas_data.calculate_unique_samples(
            subject_sample_map, sample_list=snp_samples)]
        sys.stderr.write("Proceeding with %d haploid samples!\n" %
                         len(snp_samples))

        if len(snp_samples) < min_sample_size:
            sys.stderr.write("Not enough haploid samples!\n")
            continue

        sys.stderr.write("Proceeding with %d haploid samples!\n" %
                         len(snp_samples))

        sys.stderr.write("Loading core genes...\n")
        core_genes = core_gene_utils.parse_core_genes(species_name)
        non_shared_genes = core_gene_utils.parse_non_shared_reference_genes(
            species_name)
        shared_pangenome_genes = core_gene_utils.parse_shared_genes(
            species_name)
        sys.stderr.write("Done! Core genome consists of %d genes\n" %
                         len(core_genes))
        sys.stderr.write("%d shared genes and %d non-shared genes\n" %
                         (len(shared_pangenome_genes), len(non_shared_genes)))

        sys.stderr.write(
            "Loading pre-computed substitution rates for %s...\n" %
            species_name)
        substitution_rate_map = calculate_substitution_rates.load_substitution_rate_map(
            species_name)
        sys.stderr.write("Calculating matrix...\n")