def load_centroid_gene_map(desired_species_name=None):
    """Build a map from pangenome centroid ids to representative gene ids.

    For each requested species, reads
    ``<config.midas_directory>pan_genomes/<species>/gene_info.txt.gz``,
    skips its header line, and takes the gene id from column 0 and the
    centroid id from column 3 of every tab-separated row.

    Each centroid maps to itself by default.  If a gene in the row is in
    the species' reference gene set (from ``load_reference_genes``) while
    the centroid id itself is not, the centroid is mapped to that gene
    instead; when several rows qualify, the last one in the file wins.

    Args:
        desired_species_name: Species to load.  When ``None``, every
            species returned by ``parse_midas_data.parse_good_species_list()``
            is loaded.

    Returns:
        dict mapping centroid id -> gene id (or the centroid id itself).
        With ``desired_species_name=None`` the entries of all species are
        accumulated into the one dict, so no species' mapping is dropped.
        NOTE(review): this assumes centroid ids do not collide across
        species -- confirm against the MIDAS database layout.
    """
    if desired_species_name is None:
        # Imported lazily, exactly as the original code did.
        from parsers import parse_midas_data
        species_names = parse_midas_data.parse_good_species_list()
    else:
        species_names = [desired_species_name]

    centroid_gene_map = {}

    for species_name in species_names:
        # First load reference genes
        reference_genes = load_reference_genes(species_name)

        gene_info_path = "%span_genomes/%s/gene_info.txt.gz" % (
            config.midas_directory, species_name)

        # The context manager closes the file even if a row fails to parse.
        with gzip.open(gene_info_path, 'r') as gene_info_file:
            gene_info_file.readline()  # header

            for line in gene_info_file:
                # On Python 3, gzip.open(..., 'r') yields bytes; on Python 2
                # the lines are already str and are left untouched.
                if not isinstance(line, str):
                    line = line.decode("utf-8")

                items = line.split("\t")
                gene_id = items[0].strip()
                centroid_id = items[3].strip()

                if centroid_id not in centroid_gene_map:
                    centroid_gene_map[centroid_id] = centroid_id

                # Prefer a reference-genome gene as the representative when
                # the centroid itself is not a reference gene.
                if (gene_id in reference_genes) and (centroid_id not in reference_genes):
                    centroid_gene_map[centroid_id] = gene_id

    return centroid_gene_map
parser.add_argument("--species", help="Name of specific species to run code on", default="all")
args = parser.parse_args()

# Pull the command-line options into module-level names.
debug, chunk_size, species = args.debug, args.chunk_size, args.species

# Gzipped output handle, opened for writing before any processing starts.
file = gzip.GzipFile(intermediate_filename, "w")

# Subject/sample metadata.
sys.stderr.write("Loading sample metadata...\n")
subject_sample_map = parse_HMP_data.parse_subject_sample_map()
sys.stderr.write("Done!\n")

# Decide which species to process: the single one named on the command
# line, or the full list (cut down to its first three entries when
# debugging).
good_species_list = parse_midas_data.parse_good_species_list()
if species != 'all':
    good_species_list = [species]
elif debug:
    good_species_list = good_species_list[:3]

# Accumulator that starts out empty (described in the original as the
# header for the output file).
record_strs = []

for species_name in good_species_list:

    sys.stderr.write("Loading samples...\n")

    # High-coverage samples for this species, using the min_coverage
    # threshold defined elsewhere in the script.
    snp_samples = diversity_utils.calculate_highcoverage_samples(species_name, min_coverage=min_coverage)
from parsers import parse_midas_data if len(sys.argv) < 3: sys.stderr.write( "Usage: python loop_over_species_wrapper.py all|debug|species command...\n" ) sys.exit(1) # First argument is either 'all', 'debug', or a species name debug_flag = "" if sys.argv[1] == 'debug': species_names = [parse_midas_data.debug_species_name] debug_flag = "--debug" elif sys.argv[1] == 'all': species_names = parse_midas_data.parse_good_species_list() else: good_species_names = parse_midas_data.parse_good_species_list() species_names = [] pattern = sys.argv[1] for species_name in good_species_names: if species_name.startswith(pattern): species_names.append(species_name) # Remaining arguments are command to run, with species name appended as last argument command = " ".join(sys.argv[2:]) sys.stderr.write("Running command: %s\n" % command) sys.stderr.write("for %d species...\n\n" % len(species_names)) for species_name in species_names:
f = float(items[1]) gene_freq_map[gene_name] = f file.close() return gene_freq_map # Actually calculate the core genes if __name__ == '__main__': from parsers import parse_midas_data os.system('mkdir -p %s' % core_genes_directory) os.system('mkdir -p %s' % external_core_genes_directory) pangenome_species = parse_midas_data.parse_good_species_list() cmin = config.core_genome_min_copynum cmax = config.core_genome_max_copynum shared_cmin = config.shared_genome_min_copynum min_good_fraction = config.core_genome_min_prevalence min_coverage = 5 # (for assessing core genome, we'll use a lower coverage value than when we look at real changes) output_filename = default_core_gene_filename output_file = gzip.GzipFile(output_filename, "w") stringent_output_filename = default_stringent_core_gene_filename stringent_output_file = gzip.GzipFile(stringent_output_filename, "w") shared_output_file = gzip.GzipFile(default_shared_gene_filename, "w")