Exemple #1
0
def get_patric_annotation_files():
    """Build the shell commands that fetch PATRIC annotation files per genome.

    Creates ``patric_directory`` (via ``mkdir -p``), then for every species
    returned by ``parse_midas_data.parse_good_species_list()`` derives the set
    of genome ids from its core gene names and assembles, for each genome id,
    the ``curl`` commands for the subsystem, pathway, features and ``.frn``
    files on ftp.patricbrc.org, plus the ``cut``/``bzip2`` commands that
    derive ``<genome_id>.kegg.txt`` from the features table.

    NOTE(review): in the portion of the function visible here the command
    strings are only built, never executed; presumably the code that runs
    them follows this excerpt -- confirm against the full file.
    """
    #pathway_directory = '%spatric_pathway/' % (parse_midas_data.data_directory)

    os.system("mkdir -p %s" % patric_directory)
    #os.system("mkdir -p %s" % pathway_directory)

    #intermediate_filename = intermediate_filename_template % (pairwise_directory, species_name)

    # get a list of species to run this script on.
    good_species_list = parse_midas_data.parse_good_species_list()

    for species_name in good_species_list:

        core_genes = core_gene_utils.parse_core_genes(species_name)

        # A genome id is the first two dot-separated fields of a core gene
        # name; the set removes the duplicates shared by genes of one genome.
        genome_ids = set([
            ".".join(core_gene.split(".", 2)[:2]) for core_gene in core_genes
        ])

        for genome_id in genome_ids:

            print(species_name, genome_id)

            # Download commands: each file is saved under patric_directory
            # with the same name it has on the FTP server.
            cmnd_subsystem = "curl ftp://ftp.patricbrc.org/genomes/%s/%s.PATRIC.subsystem.tab -o %s/%s.PATRIC.subsystem.tab" % (
                genome_id, genome_id, patric_directory, genome_id)
            cmnd_pathway = "curl ftp://ftp.patricbrc.org/genomes/%s/%s.PATRIC.pathway.tab -o %s/%s.PATRIC.pathway.tab" % (
                genome_id, genome_id, patric_directory, genome_id)
            cmnd_features = "curl ftp://ftp.patricbrc.org/genomes/%s/%s.PATRIC.features.tab -o %s/%s.PATRIC.features.tab" % (
                genome_id, genome_id, patric_directory, genome_id)

            # Keep only columns 6 and 21 of the features table and compress
            # the result; bzip2 -k leaves the uncompressed copy in place.
            # NOTE(review): which fields columns 6 and 21 hold is not shown
            # here -- the output name suggests KEGG annotations; confirm.
            cmnd_cat = "cat %s/%s.PATRIC.features.tab | cut -f6,21 > %s/%s.kegg.txt" % (
                patric_directory, genome_id, patric_directory, genome_id)
            cmnd_bzip2 = "bzip2 -k %s/%s.kegg.txt" % (patric_directory,
                                                      genome_id)

            # The .frn file is the one later read by get_16S_fasta-style code
            # paths that look for "<genome_id>.PATRIC.frn" in patric_directory.
            cmnd_rRNAs = "curl ftp://ftp.patricbrc.org/genomes/%s/%s.PATRIC.frn -o %s/%s.PATRIC.frn" % (
                genome_id, genome_id, patric_directory, genome_id)
Exemple #2
0
def load_centroid_gene_map(desired_species_name=None):
    """Map each pangenome centroid id to a gene id from the reference genome.

    Reads ``pan_genomes/<species>/gene_info.txt.gz`` under
    ``config.midas_directory``. After skipping the header line, the first
    tab-separated column of every row is taken as the gene id and the fourth
    as the centroid id.  Every centroid initially maps to itself; when a row's
    gene id is in the reference gene set and the centroid id is not, the
    centroid is remapped to that gene id (the last such row wins).

    Args:
        desired_species_name: species to load.  When ``None``, every species
            returned by ``parse_midas_data.parse_good_species_list()`` is
            processed in turn.

    Returns:
        dict mapping centroid id -> gene id.  The map is rebuilt for each
        species, so when several species are processed only the map of the
        last one is returned (behaviour kept from the original code).  An
        empty dict is returned when there is no species to process.
    """
    if desired_species_name is None:
        # Imported lazily, as in the original, to avoid a module-level
        # dependency on parse_midas_data.
        import parse_midas_data
        species_names = parse_midas_data.parse_good_species_list()
    else:
        species_names = [desired_species_name]

    # Bound up front so an empty species list yields {} instead of an
    # UnboundLocalError at the return statement.
    centroid_gene_map = {}

    # A separate loop name keeps the caller-supplied argument intact.
    for species_name in species_names:
        # First load reference genes
        reference_genes = load_reference_genes(species_name)

        gene_info_file = gzip.open(
            "%span_genomes/%s/gene_info.txt.gz" %
            (config.midas_directory, species_name), 'r')

        # try/finally guarantees the handle is released even if a malformed
        # row raises while parsing.
        try:
            gene_info_file.readline()  # header

            # NOTE(review): resetting here discards the previous species'
            # entries; confirm this is intended for the "all species" case.
            centroid_gene_map = {}

            for line in gene_info_file:
                items = line.split("\t")
                gene_id = items[0].strip()
                centroid_id = items[3].strip()

                # Default: a centroid represents itself.
                centroid_gene_map.setdefault(centroid_id, centroid_id)

                # Prefer a reference-genome gene from the same cluster when
                # the centroid itself is not in the reference genome.
                if (gene_id in reference_genes
                        and centroid_id not in reference_genes):
                    centroid_gene_map[centroid_id] = gene_id
        finally:
            gene_info_file.close()

    return centroid_gene_map
################################################################################

# Analysis parameters, read from the shared config module.
# NOTE(review): the trailing remark on min_sample_size is cut off in the
# original ("at least 1000 ..."); what the 1000 counts is not stated here.
min_sample_size = config.between_host_min_sample_size # 46 gives at least 1000 
low_divergence_threshold = config.between_low_divergence_threshold
# Only this variant type is allowed.
# NOTE(review): '4D' presumably denotes fourfold-degenerate sites -- confirm.
allowed_variant_types = set(['4D'])

# Species highlighted in the figure; earlier choices are kept for reference.
#focal_speciess = ['Bacteroides_vulgatus_57955', 'Roseburia_inulinivorans_61943']
#focal_speciess = ['Bacteroides_vulgatus_57955', 'Faecalibacterium_prausnitzii_62201']
focal_speciess = ['Bacteroides_vulgatus_57955', 'Akkermansia_muciniphila_55290']

# NOTE(review): presumably one plotting colour per entry of focal_speciess,
# in the same order -- confirm where these are consumed.
focal_colors = ['b','g']

#supplemental_focal_species = ['Bacteroides_fragilis_54507', 'Alistipes_putredinis_61533', 'Eubacterium_rectale_56927']
supplemental_focal_species = ['Bacteroides_fragilis_54507', 'Parabacteroides_distasonis_56985', 'Alistipes_shahii_62199'] # 'Ruminococcus_bromii_62047']  
good_species_list = parse_midas_data.parse_good_species_list()
# In debug mode only the first two species are kept, to shorten the run.
if debug:
    good_species_list = good_species_list[0:2]

# Progress messages go to stderr, leaving stdout untouched.
sys.stderr.write("Loading sample metadata...\n")
subject_sample_map = sample_utils.parse_subject_sample_map()
sample_continent_map = sample_utils.parse_sample_continent_map()
sys.stderr.write("Done!\n")

####################################################
#
# Set up Figure (2 panels, arranged in 2x1 grid)
#
####################################################

# Create figure number 1 with figsize (5, 6).
pylab.figure(1,figsize=(5,6))
import parse_midas_data

# At least two arguments are required after the script name: the species
# selector and the command to run.  Otherwise print usage and exit with 1.
if len(sys.argv) < 3:
    sys.stderr.write(
        "Usage: python loop_over_species_wrapper.py all|debug|species command...\n"
    )
    sys.exit(1)

# First argument is either 'all', 'debug', or a species name

# NOTE(review): debug_flag is set here but not used in the visible portion
# of the script; presumably it is appended to the command later -- confirm.
debug_flag = ""
if sys.argv[1] == 'debug':
    # Debug mode runs on the single designated debug species.
    species_names = [parse_midas_data.debug_species_name]
    debug_flag = "--debug"
elif sys.argv[1] == 'all':
    species_names = parse_midas_data.parse_good_species_list()
else:
    # Any other value is treated as a prefix: every good species whose name
    # starts with it is selected (so the result may hold zero or many names).
    good_species_names = parse_midas_data.parse_good_species_list()
    species_names = []
    pattern = sys.argv[1]
    for species_name in good_species_names:
        if species_name.startswith(pattern):
            species_names.append(species_name)

# Remaining arguments are command to run, with species name appended as last argument
# (the arguments are joined with single spaces into one command string)
command = " ".join(sys.argv[2:])

sys.stderr.write("Running command: %s\n" % command)
sys.stderr.write("for %d species...\n\n" % len(species_names))

for species_name in species_names:
Exemple #5
0
        f = float(items[1])
        gene_freq_map[gene_name] = f
    file.close()

    return gene_freq_map


# Actually calculate the core genes
# (script entry point: runs only when this module is executed directly)
if __name__ == '__main__':

    import parse_midas_data

    # Make sure both output directories exist before any file is opened.
    os.system('mkdir -p %s' % core_genes_directory)
    os.system('mkdir -p %s' % external_core_genes_directory)

    pangenome_species = parse_midas_data.parse_good_species_list()

    # Copy-number and prevalence thresholds, read from the shared config.
    cmin = config.core_genome_min_copynum
    cmax = config.core_genome_max_copynum
    shared_cmin = config.shared_genome_min_copynum

    min_good_fraction = config.core_genome_min_prevalence
    min_coverage = 5  # (for assessing core genome, we'll use a lower coverage value than when we look at real changes)

    # Three gzip-compressed outputs are opened for writing: the default core
    # gene file, the stringent core gene file and the shared gene file.
    # NOTE(review): none of these handles is closed within the visible part
    # of the block; confirm they are closed further down.
    output_filename = default_core_gene_filename
    output_file = gzip.GzipFile(output_filename, "w")

    stringent_output_filename = default_stringent_core_gene_filename
    stringent_output_file = gzip.GzipFile(stringent_output_filename, "w")

    shared_output_file = gzip.GzipFile(default_shared_gene_filename, "w")
Exemple #6
0
def _write_fasta_record(handle, header, sequence, width=80):
    """Write one FASTA record to *handle*, wrapping the sequence at *width*.

    The record is followed by one blank line, which is the layout the
    original inline code produced for every record.
    """
    handle.write('>%s\n' % header)
    for start in range(0, len(sequence), width):
        handle.write('%s\n' % sequence[start:start + width])
    handle.write('\n')


def get_16S_fasta(min_length=1200):
    """Collect one small-subunit rRNA sequence per species and align them.

    For every species returned by ``parse_midas_data.parse_good_species_list()``
    one genome id is derived from the species' core gene names, the matching
    ``<genome_id>.PATRIC.frn`` file in ``patric_directory`` is read, and the
    first record whose name contains ``'ssuRNA'`` and whose sequence has at
    least ``min_length`` characters is written to ``frn_path``.  The first
    record of the outgroup FASTA (``outgroup_path``) is appended, and finally
    ``muscle`` is run on ``frn_path`` to produce ``frn_aligned_path``.

    Species without a qualifying record contribute nothing to the output, and
    species without any core gene are skipped with a message on stderr.

    Args:
        min_length: minimum sequence length for a record to be accepted.
            Defaults to 1200, the value that was previously hard-coded.
    """
    import sys  # local import: only needed for the warning below

    good_species_list = parse_midas_data.parse_good_species_list()

    # The context manager closes (and flushes) the file even if a FASTA file
    # is missing or malformed, and always before muscle reads it.
    with open(frn_path, 'w') as frn:

        for species_name in good_species_list:

            core_genes = core_gene_utils.parse_core_genes(species_name)

            # A genome id is the first two dot-separated fields of a gene name.
            genome_ids = set(
                ".".join(core_gene.split(".", 2)[:2])
                for core_gene in core_genes)

            if not genome_ids:
                # Previously an IndexError; skipping matches how species
                # without a usable rRNA record are already handled.
                sys.stderr.write(
                    "No core genes for %s, skipping\n" % species_name)
                continue

            # min() makes the choice reproducible; taking element 0 of a set
            # depends on hash ordering and can differ between runs.
            genome_id = min(genome_ids)

            species_name_frn_path = "%s/%s.PATRIC.frn" % (patric_directory,
                                                          genome_id)

            records = classFASTA(species_name_frn_path).readFASTA()

            for record_name, record_seq in records:

                if 'ssuRNA' not in record_name:
                    continue

                if len(record_seq) < min_length:
                    continue

                # Header is "<species>|<second '|'-field of the record's
                # first whitespace-separated token>".
                frn_header = (species_name + '|' +
                              record_name.split()[0].split('|')[1])

                _write_fasta_record(frn, frn_header, record_seq)

                # Only the first qualifying record per species is kept.
                break

        # same for outgroup
        outgroup = classFASTA(outgroup_path).readFASTA()
        print(outgroup[0][0])
        _write_fasta_record(frn, outgroup[0][0], outgroup[0][1])

    os.system('muscle -in %s -out %s' % (frn_path, frn_aligned_path))