Esempio n. 1
0
def load_centroid_gene_map(desired_species_name=None):
    """Build a map from centroid gene ids to representative gene ids.

    For every requested species the file
    ``<config.midas_directory>pan_genomes/<species>/gene_info.txt.gz`` is
    read (first line skipped as a header; tab-separated; column 0 is the
    gene id and column 3 is the centroid id).

    Each centroid initially maps to itself.  If a gene in the centroid's
    cluster is one of the species' reference genes while the centroid itself
    is not, the centroid is mapped to that reference gene instead (when
    several qualify, the last one in the file wins).

    Args:
        desired_species_name: Species to load.  When ``None``, every species
            returned by ``parse_midas_data.parse_good_species_list()`` is
            loaded and the entries are merged into a single dictionary.

    Returns:
        dict mapping centroid id -> gene id.  Empty if no species were
        processed.
    """
    if desired_species_name is None:
        from parsers import parse_midas_data
        desired_speciess = parse_midas_data.parse_good_species_list()
    else:
        desired_speciess = [desired_species_name]

    # Created once, outside the loop: previously the map was reset for every
    # species, so only the last species survived, and an empty species list
    # made the final `return` raise UnboundLocalError.
    centroid_gene_map = {}

    for species_name in desired_speciess:
        # First load reference genes
        reference_genes = load_reference_genes(species_name)

        gene_info_filename = "%span_genomes/%s/gene_info.txt.gz" % (config.midas_directory, species_name)

        # The context manager closes the file even if a line fails to parse.
        with gzip.open(gene_info_filename, 'r') as gene_info_file:

            gene_info_file.readline()  # header

            for line in gene_info_file:

                items = line.split("\t")
                gene_id = items[0].strip()
                centroid_id = items[3].strip()

                # Default: a centroid represents itself.
                if centroid_id not in centroid_gene_map:
                    centroid_gene_map[centroid_id] = centroid_id

                # Prefer a reference-genome gene as the representative when
                # the centroid itself is not in the reference genome.
                if (gene_id in reference_genes) and (centroid_id not in reference_genes):
                    centroid_gene_map[centroid_id] = gene_id

    return centroid_gene_map
Esempio n. 2
0
    # Optional species selector. With the default "all", the full good-species
    # list is used (or only its first three entries when debug is set, below).
    parser.add_argument("--species", help="Name of specific species to run code on", default="all")
    args = parser.parse_args()

    debug = args.debug
    chunk_size = args.chunk_size
    species=args.species

    # Open the gzip-compressed intermediate output file for writing.
    # NOTE(review): the name `file` shadows the Python 2 builtin of the same name.
    file = gzip.GzipFile(intermediate_filename,"w")

    # Load subject and sample metadata
    sys.stderr.write("Loading sample metadata...\n")
    subject_sample_map = parse_HMP_data.parse_subject_sample_map()
    sys.stderr.write("Done!\n")
    
    # Get the list of species to run this script on.
    good_species_list = parse_midas_data.parse_good_species_list()
    if species!='all':
        # A specific species was requested: process only that one.
        # NOTE(review): the name is not checked against the good-species list.
        good_species_list = [species]
    else:    
        if debug:
            # Debug runs are limited to the first three species.
            good_species_list = good_species_list[:3]
    
    # Starts as an empty list. NOTE(review): the original comment called this
    # the "header for the output file"; how it is filled is outside this view.
    record_strs = []
    
    for species_name in good_species_list:

        sys.stderr.write("Loading samples...\n")

        # Select this species' samples using the min_coverage threshold.
        # NOTE(review): the original comment mentioned plotting and samples being
        # "confidently phaseable"; confirm against calculate_highcoverage_samples.
        snp_samples = diversity_utils.calculate_highcoverage_samples(species_name, min_coverage=min_coverage)
Esempio n. 3
0
from parsers import parse_midas_data

# Two positional arguments are required: a species selector and a command.
if len(sys.argv) <= 2:
    sys.stderr.write(
        "Usage: python loop_over_species_wrapper.py all|debug|species command...\n"
    )
    raise SystemExit(1)

# The selector (first argument) is 'all', 'debug', or a species-name prefix.
# Only the 'debug' selector forwards an extra --debug flag.
debug_flag = "--debug" if sys.argv[1] == 'debug' else ""

if sys.argv[1] == 'all':
    # Every good species.
    species_names = parse_midas_data.parse_good_species_list()
elif sys.argv[1] == 'debug':
    # Just the designated debug species.
    species_names = [parse_midas_data.debug_species_name]
else:
    # Keep the good species whose name starts with the given pattern.
    pattern = sys.argv[1]
    good_species_names = parse_midas_data.parse_good_species_list()
    species_names = [
        species_name
        for species_name in good_species_names
        if species_name.startswith(pattern)
    ]

# Everything after the selector forms the command to run; the species name
# is appended as the last argument when the command is executed.
command = " ".join(sys.argv[2:])

sys.stderr.write("Running command: %s\n" % command)
sys.stderr.write("for %d species...\n\n" % len(species_names))

for species_name in species_names:
        f = float(items[1])
        gene_freq_map[gene_name] = f
    file.close()

    return gene_freq_map


# Actually calculate the core genes
if __name__ == '__main__':
    # Script entry point: set up directories, thresholds and output files
    # for the core-gene calculation.

    # Imported here rather than at module level, so it is only loaded when
    # the file is run as a script.
    from parsers import parse_midas_data

    # Create the output directories through the shell; `mkdir -p` does not
    # fail when the directory already exists.
    os.system('mkdir -p %s' % core_genes_directory)
    os.system('mkdir -p %s' % external_core_genes_directory)

    # Species to process.
    pangenome_species = parse_midas_data.parse_good_species_list()

    # Copy-number thresholds, read from the project config.
    cmin = config.core_genome_min_copynum
    cmax = config.core_genome_max_copynum
    shared_cmin = config.shared_genome_min_copynum

    # Prevalence threshold, read from the project config.
    min_good_fraction = config.core_genome_min_prevalence
    min_coverage = 5  # (for assessing core genome, we'll use a lower coverage value than when we look at real changes)

    # Open the three gzip-compressed output files for writing: core genes,
    # stringent core genes and shared genes.
    # NOTE(review): they are not closed within this view; confirm they are
    # closed further down.
    output_filename = default_core_gene_filename
    output_file = gzip.GzipFile(output_filename, "w")

    stringent_output_filename = default_stringent_core_gene_filename
    stringent_output_file = gzip.GzipFile(stringent_output_filename, "w")

    shared_output_file = gzip.GzipFile(default_shared_gene_filename, "w")