Exemple #1
0
def _read_id_name_map(id_file):
    """Return a dict mapping the first tab-separated column of each line in id_file to the second column."""
    with open(id_file) as read_handle:
        return dict((line.split('\t')[0], line.strip().split('\t')[1]) for line in read_handle)


def main(args):
    """Main function called when run from command line or as part of pipeline.

    Reads the four taxon files named on the command line and verifies that the genome IDs clustered into the two
    taxa are the same for the unfiltered and the filtered concatemer tree, regardless of which of the two filtered
    taxa ended up being labeled A or B. Calls fail(...) (defined elsewhere) when the clustering differs.

    args -- command line arguments, handed to parse_options together with the usage text
    """
    usage = """
Usage: compare_taxa.py
--unfiltered-taxon-a=FILE    genome IDs for taxon A as deduced from phylogenetic tree of unfiltered concatemers
--unfiltered-taxon-b=FILE    genome IDs for taxon B as deduced from phylogenetic tree of unfiltered concatemers
--filtered-taxon-a=FILE      genome IDs for taxon A as deduced from phylogenetic tree of filtered concatemers
--filtered-taxon-b=FILE      genome IDs for taxon B as deduced from phylogenetic tree of filtered concatemers
"""
    options = ['unfiltered-taxon-a', 'unfiltered-taxon-b', 'filtered-taxon-a', 'filtered-taxon-b']
    unfiltered_a_file, unfiltered_b_file, filtered_a_file, filtered_b_file = parse_options(usage, options, args)

    # Parse ID files to extract GenBank Project IDs & Organism Name
    unfiltered_a = _read_id_name_map(unfiltered_a_file)
    unfiltered_b = _read_id_name_map(unfiltered_b_file)
    filtered_a = _read_id_name_map(filtered_a_file)
    filtered_b = _read_id_name_map(filtered_b_file)

    # Decide whether filtered A corresponds to unfiltered A, or whether the labels were swapped, by looking up a
    # single representative ID. next(iter(...)) works on both Python 2 and 3, unlike dict.keys()[0], and does not
    # raise on an empty taxon.
    representative = next(iter(unfiltered_a), None)
    if representative is not None:
        same_labels = representative in filtered_a
    else:
        # Taxon A is empty: fall back to a representative from taxon B to determine the orientation
        representative = next(iter(unfiltered_b), None)
        same_labels = representative is None or representative in filtered_b

    if same_labels:
        if not (set(unfiltered_a) == set(filtered_a)
                and set(unfiltered_b) == set(filtered_b)):
            fail(unfiltered_a, unfiltered_b, filtered_a, filtered_b)
    else:
        # Labels were swapped: compare crosswise, and hand the filtered taxa to fail in swapped order as well
        if not (set(unfiltered_a) == set(filtered_b)
                and set(unfiltered_b) == set(filtered_a)):
            fail(unfiltered_a, unfiltered_b, filtered_b, filtered_a)

    # Else: no problems were found
    log.info('Succes: Unfiltered & filtered tree clustering did not result in different taxa.')
Exemple #2
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts the orthologs archive into a private temporary run directory, runs PhiPack over all extracted ortholog
    files through _phipack_for_all_orthologs, and removes the run directory again, even when a step fails.

    args -- command line arguments, handed to parse_options together with the usage text
    """
    usage = """
Usage: run_phipack.py
--orthologs-zip=FILE     archive of orthologous genes in FASTA format
--stats-file=FILE        destination file path for values found through PhiPack for each ortholog
"""
    options = ('orthologs-zip', 'stats-file')
    orthologs_zip, stats_file = parse_options(usage, options, args)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='run_phipack_')
    try:
        # Extract files from zip archive
        extraction_dir = create_directory('extracted_orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, extraction_dir)

        # Find recombination in all ortholog_files
        _phipack_for_all_orthologs(run_dir, ortholog_files, stats_file)
    finally:
        # Remove unused files to free disk space, also when one of the steps above raised
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info('Produced:\n%s', stats_file)
Exemple #3
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts the orthologs archive into a private temporary run directory, runs PhiPack over all extracted ortholog
    files through _phipack_for_all_orthologs, and removes the run directory again, even when a step fails.

    args -- command line arguments, handed to parse_options together with the usage text
    """
    usage = """
Usage: run_phipack.py
--orthologs-zip=FILE     archive of orthologous genes in FASTA format
--stats-file=FILE        destination file path for values found through PhiPack for each ortholog
"""
    options = ('orthologs-zip', 'stats-file')
    orthologs_zip, stats_file = parse_options(usage, options, args)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='run_phipack_')
    try:
        # Extract files from zip archive
        extraction_dir = create_directory('extracted_orthologs',
                                          inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, extraction_dir)

        # Find recombination in all ortholog_files
        _phipack_for_all_orthologs(run_dir, ortholog_files, stats_file)
    finally:
        # Remove unused files to free disk space, also when one of the steps above raised
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info('Produced:\n%s', stats_file)
Exemple #4
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Reformats the headers of every user supplied label:file genome pair through format_fasta_genome_headers, stores
    the formatted files in the external-zip archive and removes the intermediate formatted files afterwards.

    args -- command line arguments, handed to parse_options together with the usage text

    Raises ValueError when a non-empty entry does not contain a ':' separating label from file.
    """
    usage = """
Usage: select_taxa.py
--external-genomes=    comma-separated list of label:nucleotide fasta file pairs of externally supplied genomes.
    label:FILE,...     labels should be unique as genomes will be identified by this label in further output files
--external-zip=FILE    destination path for archive of user provided external genomes containing formatted nucleotide fasta files
"""
    options = ['external-genomes', 'external-zip']
    external_genomes, external_zip = parse_options(usage, options, args)

    # External genomes are nucleotide fasta files uploaded by the user of which we will reformat the header
    external_fasta_files = {}

    try:
        # Handle externally uploaded genomes
        # Sample line: label1:file1,label2:file2, # Note the trailing , that's a Galaxy artifact we'll ignore
        for label_file in external_genomes.split(','):
            if not label_file:
                continue
            # Split on the first colon only, so file paths that themselves contain a colon remain intact
            label, filename = label_file.split(':', 1)
            if not label:
                log.error('Empty label provided for upload genome %s. Please provide a label and try again.', filename)
                # Stop handling further genomes; those formatted so far are still archived below
                break
            log.info('Formatting external genome labeled %s at %s', label, filename)
            external_fasta_files[label] = format_fasta_genome_headers(label, filename)

        # Copy formatted external genome files to archive that will be output as well
        create_archive_of_files(external_zip, external_fasta_files.values())
    finally:
        # Remove temporary formatted files, also when formatting or archiving failed halfway
        for formatted_file in external_fasta_files.values():
            os.remove(formatted_file)

    # Exit after a comforting log message
    log.info("Produced: \n%s", external_zip)
Exemple #5
0
def _write_taxon_file(target_file, genome_ids, id_to_name_map):
    """Write one 'id<TAB>name' line per genome ID to target_file, using the ID itself when no name is known."""
    with open(target_file, mode='w') as write_handle:
        for genome_id in genome_ids:
            write_handle.write('{id}\t{name}\n'.format(id=genome_id, name=id_to_name_map.get(genome_id, genome_id)))


def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts the orthologs archive, creates coding region files & concatemers per genome, combines those into a super
    concatemer, builds a tree from it, and writes the genome IDs of the two taxa read from that tree to the taxon
    files. All intermediate files live in a temporary run directory that is removed afterwards, even on failure.

    args -- command line arguments, handed to parse_options together with the usage text
    """
    usage = """
Usage: concatenate_orthologs.py
--orthologs-zip=FILE     archive of orthologous genes in FASTA format
--coding-regions=FILE    destination file path archive of trimmed orthologous coding regions per genomes
--concatemer=FILE        destination file path for super-concatemer of all genomes
--taxon-a=FILE           destination file path for genome IDs for taxon A
--taxon-b=FILE           destination file path for genome IDs for taxon B
--tree=FILE              destination file path for tree visualization
"""
    options = ['orthologs-zip', 'coding-regions', 'concatemer', 'taxon-a', 'taxon-b', 'tree']
    orthologs_zip, target_coding_regions, target_concat_file, target_taxon_a, target_taxon_b, target_tree = \
        parse_options(usage, options, args)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='concatemer_tree_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

        # Separate out orthologs per genome to create trimmed coding region files per genome
        genome_coding_regions_files = coding_regions_per_genome(run_dir, ortholog_files)
        create_archive_of_files(target_coding_regions, genome_coding_regions_files)

        # Concatenate coding region files per genome
        concatemer_files = concatemer_per_genome(run_dir, genome_coding_regions_files)
        # Create super concatemer
        create_super_concatemer(concatemer_files, target_concat_file)

        # Determine the taxa present in the super concatemer tree by building a phylogenetic tree from genome
        # concatemer and reading genome ids in the two largest clades.
        super_distance_file = _run_dna_dist(run_dir, target_concat_file)
        super_tree_file = _run_neighbor(run_dir, super_distance_file)
        genome_ids_a, genome_ids_b = _read_taxa_from_tree(super_tree_file)

        # Map Project IDs to Organism names; items() rather than iteritems() so this runs on Python 2 and 3 alike
        selected_genomes = select_genomes_by_ids(genome_ids_a + genome_ids_b)
        id_to_name_map = dict((gid, genome['Organism/Name']) for gid, genome in selected_genomes.items())

        # Write Project IDs and Organism Names to files, with a fallback to genome_id for external genome
        _write_taxon_file(target_taxon_a, genome_ids_a, id_to_name_map)
        _write_taxon_file(target_taxon_b, genome_ids_b, id_to_name_map)

        # Visualize tree
        visualize_tree(super_tree_file, id_to_name_map, target_tree)
    finally:
        # Remove unused files to free disk space, also when one of the steps above raised
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info('Produced: \n%s\n%s\n%s\n%s\n%s', target_coding_regions, target_concat_file,
             target_taxon_a, target_taxon_b, target_tree)
Exemple #6
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Collects genome IDs from the --genomes option and/or a previous genomes file, downloads the files for each
    selected genome through download_genome_files (which is handed the genomes file path), and finally verifies
    that at least two genomes remain whenever some were skipped.

    args -- command line arguments, handed to parse_options together with the usage text

    Exits with status 1 when more than the maximum number of genomes was selected.
    Raises AssertionError when genomes were skipped and fewer than two remain.
    """
    usage = """
Usage: select_taxa.py
--genomes=ID,...           optional comma-separated list of selected GenBank Project IDs from complete genomes table
--previous-file=FILE       optional previously or externally created GenBank Project IDs file whose genomes should be reselected
--require-protein-table    require protein table files to be present for all downloaded genomes
--genomes-file=FILE        destination path for file with selected genome IDs followed by Organism Name on each line
"""
    options = ['genomes=?', 'previous-file=?', 'require-protein-table?', 'genomes-file']
    genomes_line, previous_file, require_ptt, genomes_file = parse_options(usage, options, args)

    # Genome IDs selected by the user that refer to GenBank or RefSeq entries
    genome_ids = []

    # Split ids on comma
    if genomes_line:
        genome_ids.extend(val for val in genomes_line.split(',') if val)

    # Allow for input of previous or externally created genomes-file to rerun an analysis
    if previous_file:
        # Read previous GenBank Project IDs from previous_file, each on their own line
        with open(previous_file) as read_handle:
            genome_ids.extend(line.split()[0] for line in read_handle
                              # Skip blank lines, which have no first column to read
                              if line.strip()
                              # And skip external genomes as their IDs will fail to download
                              and 'external genome' not in line)

    # Guard against too large a selection; only the upper bound is verified at this point
    maximum = 100
    # TODO Move this test to translate, where we can see how many translations succeeded + how many externals there are
    if maximum < len(genome_ids):
        logging.error('Expected between two and %s selected genomes, but was %s', maximum, len(genome_ids))
        sys.exit(1)

    # Retrieve genome dictionaries to get to Organism Name
    genomes = select_genomes_by_ids(genome_ids).values()
    genomes = sorted(genomes, key=itemgetter('Organism/Name'))

    # Semi-touch genomes file in case no genomes were selected, for instance when uploading external genomes
    open(genomes_file, mode='a').close()

    # Write IDs to file, with organism name as second column to make the project ID files more self explanatory.
    for genome in genomes:
        # Download files here, but ignore returned files: These can be retrieved from cache during extraction/translation
        download_genome_files(genome, genomes_file, require_ptt=require_ptt)

    # Post check after translation to see if more than one genome actually had some genomic contents
    with open(genomes_file) as read_handle:
        genome_ids = [line.split()[0] for line in read_handle if line.strip()]

    # If some genomes were skipped, marked by a leading #, ensure at least two genomes remain
    if any(gid.startswith('#') for gid in genome_ids):
        retained = [gid for gid in genome_ids if not gid.startswith('#')]
        if len(retained) < 2:
            # Raised explicitly rather than through an assert statement, which would be stripped under python -O
            raise AssertionError("Some genomes were skipped, leaving us with less than two genomes to operate on; "
                                 "Inspect messages in Project ID list and reevaluate genome selection")

    # Exit after a comforting log message
    logging.info("Produced: \n%s", genomes_file)
Exemple #7
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Translates the genomes listed in the genomes file through translate_genomes, optionally translates the external
    genomes found in the external archive as well, and stores all DNA & protein files in the two output archives.
    The directory holding the extracted external genomes is removed afterwards, even when a step fails.

    args -- command line arguments, handed to parse_options together with the usage text
    """
    usage = """
Usage: translate.py
--genomes=FILE         file with selected genome IDs followed by Organism Name on each line
--external-zip=FILE    optional archive of user provided external genomes containing formatted nucleotide fasta files
--dna-zip=FILE         destination file path for zip archive of extracted DNA files
--protein-zip=FILE     destination file path for zip archive of translated protein files
"""
    options = ['genomes', 'external-zip=?', 'dna-zip', 'protein-zip']
    genome_ids_file, external_zip, dna_zipfile, protein_zipfile = parse_options(usage, options, args)

    dna_files = []
    protein_files = []

    # Read GenBank Project IDs from genomes_file, each on their own line, skipping blank lines, lines starting
    # with # and external genomes
    with open(genome_ids_file) as read_handle:
        genome_ids = [line.split()[0] for line in read_handle
                      if line.strip() and not line.startswith('#') and 'external genome' not in line]

    if genome_ids:
        # Retrieve associated genome dictionaries from complete genomes table
        genomes = select_genomes_by_ids(genome_ids).values()
        genomes = sorted(genomes, key=itemgetter('Organism/Name'))

        # Actually translate the genomes to produce a set of files for both dna files & protein files
        dna_files, protein_files = translate_genomes(genomes)

    external_dir = None
    try:
        # Also translate the external genomes
        if external_zip:
            # Extract external genomes archive
            external_dir = tempfile.mkdtemp(prefix='external_genomes_')
            external_dna_files = extract_archive_of_files(external_zip, external_dir)

            # Append IDs of external fasta files to genome IDs file
            _append_external_genomes(external_dna_files, genome_ids_file)

            # Translate individual files
            external_protein_files = [translate_fasta_coding_regions(dna_file) for dna_file in external_dna_files]

            # Add the files to the appropriate collections
            dna_files.extend(external_dna_files)
            protein_files.extend(external_protein_files)

        # Write the produced files to command line argument filenames
        create_archive_of_files(dna_zipfile, dna_files)
        create_archive_of_files(protein_zipfile, protein_files)
    finally:
        # Do not clean up extracted DNA files or Protein translations: Keep them as cache

        # But do clean up external_dir once the archives are created, or when a step above raised
        if external_dir is not None:
            shutil.rmtree(external_dir)

    # Exit after a comforting log message
    log.info("Produced: \n%s &\n%s", dna_zipfile, protein_zipfile)
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts the orthologs archive into a temporary run directory, aligns the orthologs, trims the alignments, and
    archives the aligned, misaligned & trimmed files. The run directory is removed afterwards, even on failure.

    args -- command line arguments, handed to parse_options together with the usage text

    Raises ValueError when retained-threshold or max-indel-length can not be converted to an integer.
    """
    usage = """
Usage: filter_orthologs.py
--orthologs-zip=FILE           archive of orthologous genes in FASTA format
--retained-threshold=PERC      filter orthologs that retain less than PERC % of sequence after trimming alignment
--max-indel-length=NUMBER      filter orthologs that contain insertions / deletions longer than N in middle of alignment
--aligned-zip=FILE             destination file path for archive of aligned orthologous genes
--misaligned-zip=FILE          destination file path for archive of misaligned orthologous genes
--trimmed-zip=FILE             destination file path for archive of aligned & trimmed orthologous genes
--stats=FILE                   destination file path for ortholog trimming statistics file
--scatterplot=FILE             destination file path for scatterplot of retained and filtered sequences by length
"""
    options = [
        'orthologs-zip', 'retained-threshold', 'max-indel-length',
        'aligned-zip', 'misaligned-zip', 'trimmed-zip', 'stats', 'scatterplot'
    ]
    orthologs_zip, retained_threshold, max_indel_length, \
        aligned_zip, misaligned_zip, trimmed_zip, target_stats_path, target_scatterplot = \
        parse_options(usage, options, args)

    # Convert numeric arguments to integers up front, so we can fail fast if argument value format was wrong
    retained_threshold = int(retained_threshold)
    max_indel_length = int(max_indel_length)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='align_trim_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        sico_files = extract_archive_of_files(orthologs_zip, temp_dir)

        # Align SICOs so all sequences become equal length sequences
        aligned_files = _align_sicos(run_dir, sico_files)

        # Filter orthologs that retain less than PERC % of sequence after trimming alignment
        trimmed_files, misaligned_files = _trim_alignments(run_dir, aligned_files,
                                                           retained_threshold,
                                                           max_indel_length,
                                                           target_stats_path,
                                                           target_scatterplot)

        # Create archives of files on command line specified output paths
        create_archive_of_files(aligned_zip, aligned_files)
        create_archive_of_files(misaligned_zip, misaligned_files)
        create_archive_of_files(trimmed_zip, trimmed_files)
    finally:
        # Remove unused files to free disk space, also when one of the steps above raised
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info(
        'Produced: \n%s', '\n'.join((aligned_zip, misaligned_zip, trimmed_zip,
                                     target_stats_path, target_scatterplot)))
Exemple #9
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Reads the genome IDs & organism names of both taxa, extracts the orthologs archive into a temporary run
    directory, splits every alignment per taxon through split_alignment_by_taxa, and archives the resulting files.
    The run directory is removed afterwards, even on failure.

    args -- command line arguments, handed to parse_options together with the usage text

    Returns the tuple (taxon_a_zip, taxon_b_zip) of output archive paths.
    """
    usage = """
Usage: split_by_taxa.py
--genomes-a=FILE        file with genome GenBank Project ID and Organism name on each line for taxon A
--genomes-b=FILE        file with genome GenBank Project ID and Organism name on each line for taxon B
--orthologs-zip=FILE    archive of aligned & trimmed single copy orthologous (SICO) genes
--taxon-a-zip=FILE      destination file path for archive of SICO genes belonging to taxon A
--taxon-b-zip=FILE      destination file path for archive of SICO genes belonging to taxon B
"""
    options = [
        'genomes-a', 'genomes-b', 'orthologs-zip', 'taxon-a-zip', 'taxon-b-zip'
    ]
    genome_a_ids_file, genome_b_ids_file, orthologs_zip, taxon_a_zip, taxon_b_zip = parse_options(
        usage, options, args)

    # Parse files to extract project IDs from the first column and organism names from the second column.
    # The line terminator is stripped first, so it does not end up as part of the organism name.
    with open(genome_a_ids_file) as read_handle:
        rows = [line.rstrip('\r\n').split('\t') for line in read_handle]
    genome_ids_a = [row[0] for row in rows]
    common_prefix_a = _common_prefix([row[1] for row in rows], 'taxon_a')

    with open(genome_b_ids_file) as read_handle:
        rows = [line.rstrip('\r\n').split('\t') for line in read_handle]
    genome_ids_b = [row[0] for row in rows]
    common_prefix_b = _common_prefix([row[1] for row in rows], 'taxon_b')

    # Create run_dir to hold files related to this run
    run_dir = tempfile.mkdtemp(prefix='split_by_taxa_')
    try:
        # Extract files from zip archive
        ortholog_files = extract_archive_of_files(
            orthologs_zip, create_directory('alignments', inside_dir=run_dir))

        # Actually split alignments per taxon
        taxon_a_files, taxon_b_files = split_alignment_by_taxa(
            run_dir, ortholog_files, (genome_ids_a, common_prefix_a),
            (genome_ids_b, common_prefix_b))

        # Write the produced files to command line argument filenames
        create_archive_of_files(taxon_a_zip, taxon_a_files)
        create_archive_of_files(taxon_b_zip, taxon_b_files)
    finally:
        # Remove unused files to free disk space, also when one of the steps above raised
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info("Produced: \n%s\n%s", taxon_a_zip, taxon_b_zip)
    return taxon_a_zip, taxon_b_zip
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts the orthologs archive into a temporary run directory, aligns the orthologs, trims the alignments, and
    archives the aligned, misaligned & trimmed files. The run directory is removed afterwards, even on failure.

    args -- command line arguments, handed to parse_options together with the usage text

    Raises ValueError when retained-threshold or max-indel-length can not be converted to an integer.
    """
    usage = """
Usage: filter_orthologs.py
--orthologs-zip=FILE           archive of orthologous genes in FASTA format
--retained-threshold=PERC      filter orthologs that retain less than PERC % of sequence after trimming alignment
--max-indel-length=NUMBER      filter orthologs that contain insertions / deletions longer than N in middle of alignment
--aligned-zip=FILE             destination file path for archive of aligned orthologous genes
--misaligned-zip=FILE          destination file path for archive of misaligned orthologous genes
--trimmed-zip=FILE             destination file path for archive of aligned & trimmed orthologous genes
--stats=FILE                   destination file path for ortholog trimming statistics file
--scatterplot=FILE             destination file path for scatterplot of retained and filtered sequences by length
"""
    options = ['orthologs-zip', 'retained-threshold', 'max-indel-length',
               'aligned-zip', 'misaligned-zip', 'trimmed-zip', 'stats', 'scatterplot']
    orthologs_zip, retained_threshold, max_indel_length, \
        aligned_zip, misaligned_zip, trimmed_zip, target_stats_path, target_scatterplot = \
        parse_options(usage, options, args)

    # Convert numeric arguments to integers up front, so we can fail fast if argument value format was wrong
    retained_threshold = int(retained_threshold)
    max_indel_length = int(max_indel_length)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='align_trim_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        sico_files = extract_archive_of_files(orthologs_zip, temp_dir)

        # Align SICOs so all sequences become equal length sequences
        aligned_files = _align_sicos(run_dir, sico_files)

        # Filter orthologs that retain less than PERC % of sequence after trimming alignment
        trimmed_files, misaligned_files = _trim_alignments(run_dir, aligned_files, retained_threshold,
                                                           max_indel_length, target_stats_path, target_scatterplot)

        # Create archives of files on command line specified output paths
        create_archive_of_files(aligned_zip, aligned_files)
        create_archive_of_files(misaligned_zip, misaligned_files)
        create_archive_of_files(trimmed_zip, trimmed_files)
    finally:
        # Remove unused files to free disk space, also when one of the steps above raised
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info('Produced: \n%s', '\n'.join((aligned_zip, misaligned_zip, trimmed_zip,
                                         target_stats_path, target_scatterplot)))
Exemple #11
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Reads the genome IDs & organism names of both taxa, extracts the orthologs archive into a temporary run
    directory, splits every alignment per taxon through split_alignment_by_taxa, and archives the resulting files.
    The run directory is removed afterwards, even on failure.

    args -- command line arguments, handed to parse_options together with the usage text

    Returns the tuple (taxon_a_zip, taxon_b_zip) of output archive paths.
    """
    usage = """
Usage: split_by_taxa.py
--genomes-a=FILE        file with genome GenBank Project ID and Organism name on each line for taxon A
--genomes-b=FILE        file with genome GenBank Project ID and Organism name on each line for taxon B
--orthologs-zip=FILE    archive of aligned & trimmed single copy orthologous (SICO) genes
--taxon-a-zip=FILE      destination file path for archive of SICO genes belonging to taxon A
--taxon-b-zip=FILE      destination file path for archive of SICO genes belonging to taxon B
"""
    options = ['genomes-a', 'genomes-b', 'orthologs-zip', 'taxon-a-zip', 'taxon-b-zip']
    genome_a_ids_file, genome_b_ids_file, orthologs_zip, taxon_a_zip, taxon_b_zip = parse_options(usage, options, args)

    # Parse files to extract project IDs from the first column and organism names from the second column.
    # The line terminator is stripped first, so it does not end up as part of the organism name.
    with open(genome_a_ids_file) as read_handle:
        rows = [line.rstrip('\r\n').split('\t') for line in read_handle]
    genome_ids_a = [row[0] for row in rows]
    common_prefix_a = _common_prefix([row[1] for row in rows], 'taxon_a')

    with open(genome_b_ids_file) as read_handle:
        rows = [line.rstrip('\r\n').split('\t') for line in read_handle]
    genome_ids_b = [row[0] for row in rows]
    common_prefix_b = _common_prefix([row[1] for row in rows], 'taxon_b')

    # Create run_dir to hold files related to this run
    run_dir = tempfile.mkdtemp(prefix='split_by_taxa_')
    try:
        # Extract files from zip archive
        ortholog_files = extract_archive_of_files(orthologs_zip, create_directory('alignments', inside_dir=run_dir))

        # Actually split alignments per taxon
        taxon_a_files, taxon_b_files = split_alignment_by_taxa(run_dir, ortholog_files,
                                                               (genome_ids_a, common_prefix_a),
                                                               (genome_ids_b, common_prefix_b))

        # Write the produced files to command line argument filenames
        create_archive_of_files(taxon_a_zip, taxon_a_files)
        create_archive_of_files(taxon_b_zip, taxon_b_files)
    finally:
        # Remove unused files to free disk space, also when one of the steps above raised
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info("Produced: \n%s\n%s", taxon_a_zip, taxon_b_zip)
    return taxon_a_zip, taxon_b_zip
Exemple #12
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Reads the genome IDs of both taxa, extracts the SICO archive into a temporary run directory, runs codeml for the
    extracted files through run_codeml_for_sicos, writes the dN/dS values per ortholog, and archives the codeml
    output files. The run directory is removed afterwards, even on failure.

    args -- command line arguments, handed to parse_options together with the usage text
    """
    usage = """
Usage: run_codeml.py
--genomes-a=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon A
--genomes-b=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon B
--sico-zip=FILE      archive of aligned & trimmed single copy orthologous (SICO) genes
--codeml-zip=FILE     destination file path for archive of codeml output per SICO gene
--dnds-stats=FILE     destination file path for file with dN, dS & dN/dS values per SICO gene
"""
    options = [
        'genomes-a', 'genomes-b', 'sico-zip', 'codeml-zip', 'dnds-stats'
    ]
    genome_a_ids_file, genome_b_ids_file, sico_zip, codeml_zip, dnds_file = parse_options(
        usage, options, args)

    # Parse file to extract GenBank Project IDs from the first column, skipping blank lines
    with open(genome_a_ids_file) as read_handle:
        genome_ids_a = [line.split()[0] for line in read_handle if line.strip()]
    with open(genome_b_ids_file) as read_handle:
        genome_ids_b = [line.split()[0] for line in read_handle if line.strip()]

    # Create run_dir to hold files relating to this run
    run_dir = tempfile.mkdtemp(prefix='run_codeml_')
    try:
        # Extract files from zip archive
        sico_files = extract_archive_of_files(
            sico_zip, create_directory('sicos', inside_dir=run_dir))

        # Actually run codeml
        codeml_files = run_codeml_for_sicos(run_dir, genome_ids_a, genome_ids_b,
                                            sico_files)

        # Write dnds values to single output file
        _write_dnds_per_ortholog(dnds_file, codeml_files)

        # Write the produced files to command line argument filenames
        create_archive_of_files(codeml_zip, codeml_files)
    finally:
        # Remove unused files to free disk space, also when one of the steps above raised
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    logging.info("Produced: \n%s\n%s", codeml_zip, dnds_file)
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Extracts the SICO archive into a temporary run directory and creates the crosstable from the extracted files
    through create_crosstable. The run directory is removed afterwards, even on failure.

    args -- command line arguments, handed to parse_options together with the usage text
    """
    usage = """
Usage: crosstable_gene_ids.py
--sico-zip=FILE      archive of single copy orthologous (SICO) genes in separate files per ortholog
--crosstable=FILE    destination file path for crosstable between orthologs & genomes with gene IDs at intersections
"""
    options = ['sico-zip', 'crosstable']
    sico_zip, target_crosstable = parse_options(usage, options, args)

    # Create tempdir
    run_dir = tempfile.mkdtemp(prefix='crosstable_')
    try:
        sico_files = extract_archive_of_files(sico_zip, run_dir)

        # Create crosstable
        create_crosstable(sico_files, target_crosstable)
    finally:
        # Remove extracted files to free disk space, also when one of the steps above raised
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    logging.info("Produced: \n%s", target_crosstable)
Exemple #14
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Reads the genome IDs of both taxa, extracts the SICO archive into a temporary run directory, runs codeml for the
    extracted files through run_codeml_for_sicos, writes the dN/dS values per ortholog, and archives the codeml
    output files. The run directory is removed afterwards, even on failure.

    args -- command line arguments, handed to parse_options together with the usage text
    """
    usage = """
Usage: run_codeml.py
--genomes-a=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon A
--genomes-b=FILE     file with GenBank Project IDs from complete genomes table on each line for taxon B
--sico-zip=FILE      archive of aligned & trimmed single copy orthologous (SICO) genes
--codeml-zip=FILE     destination file path for archive of codeml output per SICO gene
--dnds-stats=FILE     destination file path for file with dN, dS & dN/dS values per SICO gene
"""
    options = ['genomes-a', 'genomes-b', 'sico-zip', 'codeml-zip', 'dnds-stats']
    genome_a_ids_file, genome_b_ids_file, sico_zip, codeml_zip, dnds_file = parse_options(usage, options, args)

    # Parse file to extract GenBank Project IDs from the first column, skipping blank lines
    with open(genome_a_ids_file) as read_handle:
        genome_ids_a = [line.split()[0] for line in read_handle if line.strip()]
    with open(genome_b_ids_file) as read_handle:
        genome_ids_b = [line.split()[0] for line in read_handle if line.strip()]

    # Create run_dir to hold files relating to this run
    run_dir = tempfile.mkdtemp(prefix='run_codeml_')
    try:
        # Extract files from zip archive
        sico_files = extract_archive_of_files(sico_zip, create_directory('sicos', inside_dir=run_dir))

        # Actually run codeml
        codeml_files = run_codeml_for_sicos(run_dir, genome_ids_a, genome_ids_b, sico_files)

        # Write dnds values to single output file
        _write_dnds_per_ortholog(dnds_file, codeml_files)

        # Write the produced files to command line argument filenames
        create_archive_of_files(codeml_zip, codeml_files)
    finally:
        # Remove unused files to free disk space, also when one of the steps above raised
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    logging.info("Produced: \n%s\n%s", codeml_zip, dnds_file)
Exemple #15
0
def main(args):
    """Main function called when run from command line or as part of pipeline.

    Reformats the headers of every user supplied label:file genome pair through format_fasta_genome_headers, stores
    the formatted files in the external-zip archive and removes the intermediate formatted files afterwards.

    args -- command line arguments, handed to parse_options together with the usage text

    Raises ValueError when a non-empty entry does not contain a ':' separating label from file.
    """
    usage = """
Usage: select_taxa.py
--external-genomes=    comma-separated list of label:nucleotide fasta file pairs of externally supplied genomes.
    label:FILE,...     labels should be unique as genomes will be identified by this label in further output files
--external-zip=FILE    destination path for archive of user provided external genomes containing formatted nucleotide fasta files
"""
    options = ['external-genomes', 'external-zip']
    external_genomes, external_zip = parse_options(usage, options, args)

    # External genomes are nucleotide fasta files uploaded by the user of which we will reformat the header
    external_fasta_files = {}

    try:
        # Handle externally uploaded genomes
        # Sample line: label1:file1,label2:file2, # Note the trailing , that's a Galaxy artifact we'll ignore
        for label_file in external_genomes.split(','):
            if not label_file:
                continue
            # Split on the first colon only, so file paths that themselves contain a colon remain intact
            label, filename = label_file.split(':', 1)
            if not label:
                log.error(
                    'Empty label provided for upload genome %s. Please provide a label and try again.',
                    filename)
                # Stop handling further genomes; those formatted so far are still archived below
                break
            log.info('Formatting external genome labeled %s at %s', label,
                     filename)
            external_fasta_files[label] = format_fasta_genome_headers(label, filename)

        # Copy formatted external genome files to archive that will be output as well
        create_archive_of_files(external_zip, external_fasta_files.values())
    finally:
        # Remove temporary formatted files, also when formatting or archiving failed halfway
        for formatted_file in external_fasta_files.values():
            os.remove(formatted_file)

    # Exit after a comforting log message
    log.info("Produced: \n%s", external_zip)
Exemple #16
0
def main(args):
    """Extract orthologs for the selected genomes and move the results to the requested output paths.

    Main function called when run from command line or as part of pipeline.

    Arguments:
    args -- command line arguments handed to parse_options; see the usage text below for the options.

    The work is done inside a fresh temporary directory, which is removed again when the function finishes,
    also when one of the steps raises. The muco and subset archives are only created when their options are given.
    """
    usage = """
Usage: extract_orthologs.py
--genomes=FILE       file with GenBank Project IDs from complete genomes table on each line
--dna-zip=FILE       zip archive of extracted DNA files
--groups=FILE        file listing groups of orthologous proteins
--require-limiter    flag whether extracted core set of genomes should contain the limiter added in OrthoMCL [OPTIONAL]

--sico-zip=FILE      destination file path for archive of shared single copy orthologous (SICO) genes
--muco-zip=FILE      destination file path for archive of shared multiple copy orthologous genes
--subset-zip=FILE    destination file path for archive of variable copy orthologous genes shared for a subset only
--stats=FILE         destination file path for ortholog statistics file
--heatmap=FILE       destination file path heatmap of orthologs and occurrences of ortholog per genome
--orfans=FILE        destination file path ORFans
"""
    options = ['genomes', 'dna-zip', 'groups', 'require-limiter?',
               'sico-zip', 'muco-zip=?', 'subset-zip=?', 'stats', 'heatmap', 'orfans']
    genome_ids_file, dna_zip, groups_file, require_limiter, \
        target_sico, target_muco, target_subset, target_stats_path, target_heat, target_orfans = \
        parse_options(usage, options, args)

    # Parse file to extract GenBank Project IDs: the first column of every line that is not a '#' comment.
    # Blank lines are skipped, as they have no first column to read.
    with open(genome_ids_file) as read_handle:
        genomes = [line.split()[0] for line in read_handle
                   if line.strip() and not line.startswith('#')]

    # Create temporary directory within which to extract orthologs
    run_dir = tempfile.mkdtemp(prefix='extract_orthologs_run_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('dna_files', inside_dir=run_dir)
        dna_files = extract_archive_of_files(dna_zip, temp_dir)

        # Actually run ortholog extraction
        sico_files, muco_files, subset_files, stats_file, heatmap_file, orfans_file = \
            extract_orthologs(run_dir, genomes, dna_files, groups_file, require_limiter)

        # Append the orfans to the heatmap file
        _append_orfans_to_heatmap(orfans_file, genomes, heatmap_file)

        # Move produced files to command line specified output paths
        create_archive_of_files(target_sico, sico_files)
        if target_muco:
            create_archive_of_files(target_muco, muco_files)
        if target_subset:
            create_archive_of_files(target_subset, subset_files)
        shutil.move(stats_file, target_stats_path)
        shutil.move(heatmap_file, target_heat)
        shutil.move(orfans_file, target_orfans)
    finally:
        # Remove unused files to free disk space, even when one of the steps above failed
        shutil.rmtree(run_dir)

    # Exit after a comforting log message listing every output that was written
    log.info("Produced:")
    log.info("%s", target_sico)
    if target_muco:
        log.info("%s", target_muco)
    if target_subset:
        log.info("%s", target_subset)
    log.info("%s", target_stats_path)
    log.info("%s", target_heat)
    log.info("%s", target_orfans)
Exemple #17
0
def main(args):
    """Build per genome coding regions, a super-concatemer, two taxon ID files and a tree visualization.

    Main function called when run from command line or as part of pipeline.

    Arguments:
    args -- command line arguments handed to parse_options; see the usage text below for the options.

    The work is done inside a fresh temporary directory, which is removed again when the function finishes,
    also when one of the steps raises.
    """
    usage = """
Usage: concatenate_orthologs.py
--orthologs-zip=FILE     archive of orthologous genes in FASTA format
--coding-regions=FILE    destination file path archive of trimmed orthologous coding regions per genomes
--concatemer=FILE        destination file path for super-concatemer of all genomes
--taxon-a=FILE           destination file path for genome IDs for taxon A
--taxon-b=FILE           destination file path for genome IDs for taxon B
--tree=FILE              destination file path for tree visualization
"""
    options = [
        'orthologs-zip', 'coding-regions', 'concatemer', 'taxon-a', 'taxon-b',
        'tree'
    ]
    orthologs_zip, target_coding_regions, target_concat_file, target_taxon_a, target_taxon_b, target_tree = \
        parse_options(usage, options, args)

    # Run in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='concatemer_tree_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

        # Separate out orthologs per genome to create trimmed coding region files per genome
        genome_coding_regions_files = coding_regions_per_genome(run_dir, ortholog_files)
        create_archive_of_files(target_coding_regions, genome_coding_regions_files)

        # Concatenate coding region files per genome
        concatemer_files = concatemer_per_genome(run_dir, genome_coding_regions_files)
        # Create super concatemer
        create_super_concatemer(concatemer_files, target_concat_file)

        # Determine the taxa present in the super concatemer tree by building a phylogenetic tree from genome
        # concatemer and reading genome ids in the two largest clades.
        super_distance_file = _run_dna_dist(run_dir, target_concat_file)
        super_tree_file = _run_neighbor(run_dir, super_distance_file)
        genome_ids_a, genome_ids_b = _read_taxa_from_tree(super_tree_file)

        # Map Project IDs to Organism names; .items() works on Python 2 as well as Python 3 mappings
        selected_genomes = select_genomes_by_ids(genome_ids_a + genome_ids_b)
        id_to_name_map = dict((gid, genome['Organism/Name'])
                              for gid, genome in selected_genomes.items())

        # Write Project IDs and Organism Names to files, with a fallback to genome_id for external genome
        for target_path, genome_ids in ((target_taxon_a, genome_ids_a),
                                        (target_taxon_b, genome_ids_b)):
            with open(target_path, mode='w') as write_handle:
                for genome_id in genome_ids:
                    name = id_to_name_map.get(genome_id, genome_id)
                    write_handle.write('{id}\t{name}\n'.format(id=genome_id, name=name))

        # Visualize tree
        visualize_tree(super_tree_file, id_to_name_map, target_tree)
    finally:
        # Remove unused files to free disk space, even when one of the steps above failed
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info('Produced: \n%s\n%s\n%s\n%s\n%s', target_coding_regions,
             target_concat_file, target_taxon_a, target_taxon_b, target_tree)
Exemple #18
0
def main(args):
    """Select genomes by GenBank Project ID, download their files and verify the resulting genomes file.

    Main function called when run from command line or as part of pipeline.

    Arguments:
    args -- command line arguments handed to parse_options; see the usage text below for the options.

    Exits with status 1 when more than the maximum number of genomes was selected.
    Raises AssertionError when the genomes file contains skipped genomes (IDs starting with '#') and fewer than
    two genomes remain.
    """
    usage = """
Usage: select_taxa.py
--genomes=ID,...           optional comma-separated list of selected GenBank Project IDs from complete genomes table
--previous-file=FILE       optional previously or externally created GenBank Project IDs file whose genomes should be reselected
--require-protein-table    require protein table files to be present for all downloaded genomes
--genomes-file=FILE        destination path for file with selected genome IDs followed by Organism Name on each line
"""
    options = [
        'genomes=?', 'previous-file=?', 'require-protein-table?',
        'genomes-file'
    ]
    genomes_line, previous_file, require_ptt, genomes_file = parse_options(
        usage, options, args)

    # Genome IDs selected by the user that refer to GenBank or RefSeq entries
    genome_ids = []

    # Split ids on comma, ignoring empty values such as those caused by a trailing comma
    if genomes_line:
        genome_ids.extend(val for val in genomes_line.split(',') if val)

    # Allow for input of previous or externally created genomes-file to rerun an analysis
    if previous_file:
        # Read previous GenBank Project IDs from previous_file, each on their own line
        with open(previous_file) as read_handle:
            genome_ids.extend(
                line.split()[0] for line in read_handle
                # Skip blank lines, which have no first column to read,
                # and skip external genomes as their IDs will fail to download
                if line.strip() and 'external genome' not in line)

    # Assert each clade contains enough IDs
    maximum = 100
    # TODO Move this test to translate, where we can see how many translations succeeded + how many externals there are
    if maximum < len(genome_ids):
        logging.error(
            'Expected between two and %s selected genomes, but was %s',
            maximum, len(genome_ids))
        sys.exit(1)

    # Retrieve genome dictionaries to get to Organism Name
    genomes = select_genomes_by_ids(genome_ids).values()
    genomes = sorted(genomes, key=itemgetter('Organism/Name'))

    # Semi-touch genomes file in case no genomes were selected, for instance when uploading external genomes
    open(genomes_file, mode='a').close()

    # Download files here, but ignore returned files: These can be retrieved from cache during extraction/translation.
    # NOTE(review): download_genome_files is handed the genomes_file path; presumably it records every genome
    # there, as the file is read back below -- verify against its implementation.
    for genome in genomes:
        download_genome_files(genome, genomes_file, require_ptt=require_ptt)

    # Post check to see if more than one genome actually had some genomic contents
    with open(genomes_file) as read_handle:
        genome_ids = [line.split()[0] for line in read_handle if line.strip()]

    # If some genomes were skipped, ensure at least two genomes remain
    skipped_ids = [gid for gid in genome_ids if gid.startswith('#')]
    retained_ids = [gid for gid in genome_ids if not gid.startswith('#')]
    if skipped_ids and len(retained_ids) < 2:
        # Raised explicitly rather than through an assert statement, so the check survives python -O
        raise AssertionError(
            "Some genomes were skipped, leaving us with less than two genomes to operate on; "
            "Inspect messages in Project ID list and reevaluate genome selection")

    # Exit after a comforting log message
    logging.info("Produced: \n%s", genomes_file)
Exemple #19
0
def main(args):
    """Run ortholog extraction for the listed genomes and deliver the outputs to the given destinations.

    Main function called when run from command line or as part of pipeline.

    Arguments:
    args -- command line arguments handed to parse_options; see the usage text below for the options.

    Intermediate files live in a temporary run directory that is always removed afterwards, whether or not
    extraction succeeded. The muco and subset archives are optional and only written when requested.
    """
    usage = """
Usage: extract_orthologs.py
--genomes=FILE       file with GenBank Project IDs from complete genomes table on each line
--dna-zip=FILE       zip archive of extracted DNA files
--groups=FILE        file listing groups of orthologous proteins
--require-limiter    flag whether extracted core set of genomes should contain the limiter added in OrthoMCL [OPTIONAL]

--sico-zip=FILE      destination file path for archive of shared single copy orthologous (SICO) genes
--muco-zip=FILE      destination file path for archive of shared multiple copy orthologous genes
--subset-zip=FILE    destination file path for archive of variable copy orthologous genes shared for a subset only
--stats=FILE         destination file path for ortholog statistics file
--heatmap=FILE       destination file path heatmap of orthologs and occurrences of ortholog per genome
--orfans=FILE        destination file path ORFans
"""
    options = [
        'genomes', 'dna-zip', 'groups', 'require-limiter?', 'sico-zip',
        'muco-zip=?', 'subset-zip=?', 'stats', 'heatmap', 'orfans'
    ]
    genome_ids_file, dna_zip, groups_file, require_limiter, \
        target_sico, target_muco, target_subset, target_stats_path, target_heat, target_orfans = \
        parse_options(usage, options, args)

    # Parse file to extract GenBank Project IDs from the first column,
    # ignoring '#' comment lines and blank lines (the latter have no first column)
    with open(genome_ids_file) as read_handle:
        genomes = [
            line.split()[0] for line in read_handle
            if line.strip() and not line.startswith('#')
        ]

    # Create temporary directory within which to extract orthologs
    run_dir = tempfile.mkdtemp(prefix='extract_orthologs_run_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('dna_files', inside_dir=run_dir)
        dna_files = extract_archive_of_files(dna_zip, temp_dir)

        # Actually run ortholog extraction
        sico_files, muco_files, subset_files, stats_file, heatmap_file, orfans_file = \
            extract_orthologs(run_dir, genomes, dna_files, groups_file, require_limiter)

        # Append the orfans to the heatmap file
        _append_orfans_to_heatmap(orfans_file, genomes, heatmap_file)

        # Move produced files to command line specified output paths
        create_archive_of_files(target_sico, sico_files)
        if target_muco:
            create_archive_of_files(target_muco, muco_files)
        if target_subset:
            create_archive_of_files(target_subset, subset_files)
        shutil.move(stats_file, target_stats_path)
        shutil.move(heatmap_file, target_heat)
        shutil.move(orfans_file, target_orfans)
    finally:
        # Free disk space regardless of the outcome, so failed runs do not leave run directories behind
        shutil.rmtree(run_dir)

    # Exit after a comforting log message that names every produced output
    log.info("Produced:")
    log.info("%s", target_sico)
    if target_muco:
        log.info("%s", target_muco)
    if target_subset:
        log.info("%s", target_subset)
    log.info("%s", target_stats_path)
    log.info("%s", target_heat)
    log.info("%s", target_orfans)
Exemple #20
0
def main(args):
    """Filter orthologs on multiple COG annotations and/or recombination, then archive the retained set.

    Main function called when run from command line or as part of pipeline.

    Arguments:
    args -- command line arguments handed to parse_options; see the usage text below for the options.

    Both filters are optional and only run when their option is given. The work is done inside a fresh
    temporary directory, which is removed again when the function finishes, also when one of the steps raises.
    """
    usage = """
Usage: filter_orthologs.py
--orthologs-zip=FILE            archive of orthologous genes in FASTA format
--filter-multiple-cogs          filter orthologs with multiple COG annotations among genes [OPTIONAL]

--filter-recombination=FILE     filter orthologs that show recombination when comparing phylogenetic trees [OPTIONAL]
                                destination file path for archive of recombination orthologs
--recombined-crosstable=FILE    destination file path for recombined crosstable of GeneIDs, COGs and Products [OPTIONAL]
--taxon-a=FILE                  file with genome IDs for taxon A to use in recombination filtering
--taxon-b=FILE                  file with genome IDs for taxon B to use in recombination filtering
--retained-zip=FILE             destination file path for archive of retained orthologs after filtering

--orthologs-per-genome=FILE      destination file path for orthologs split out per genome, based on the retained.zip
--concatemer=FILE                destination file path for super-concatemer of all genomes
"""
    options = ('orthologs-zip', 'filter-multiple-cogs=?', 'filter-recombination=?', 'recombined-crosstable=?',
               'taxon-a=?', 'taxon-b=?', 'retained-zip', 'orthologs-per-genome', 'concatemer')
    orthologs_zip, filter_cogs, filter_recombination, recombined_crosstable, \
        taxona, taxonb, retained_zip, target_orth_per_genome, target_concat_file = \
        parse_options(usage, options, args)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='filter_orthologs_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

        # Filter orthologs with multiple COG annotations among genes if flag was set
        if filter_cogs:
            ortholog_files, transfered_cogs = _filter_multiple_cog_orthologs(run_dir, ortholog_files)

        # Possible extension: filter ortholog when any strain has been flagged as 'mobile element', 'phage' or
        # 'IS element'

        # Filter orthologs that show recombination when comparing phylogenetic trees if flag was set
        if filter_recombination:
            # Parse files to extract GenBank Project IDs from the first column; blank lines are skipped
            with open(taxona) as read_handle:
                genome_ids_a = [line.split()[0] for line in read_handle if line.strip()]
            with open(taxonb) as read_handle:
                genome_ids_b = [line.split()[0] for line in read_handle if line.strip()]
            ortholog_files, recombined_files = _phipack_for_all_orthologs(run_dir, ortholog_files,
                                                                           genome_ids_a, genome_ids_b)
            # Create crosstable
            create_crosstable(recombined_files, recombined_crosstable)

        # Create archives of files on command line specified output paths
        if filter_cogs:
            # The filter_cogs option value doubles as the destination path for the transferred COGs file
            shutil.move(transfered_cogs, filter_cogs)
        if filter_recombination:
            create_archive_of_files(filter_recombination, recombined_files)
        create_archive_of_files(retained_zip, ortholog_files)

        # Run the steps required after filtering orthologs
        post_recombination_filter(taxona, taxonb, retained_zip,
                                  target_orth_per_genome, target_concat_file, run_dir)
    finally:
        # Remove unused files to free disk space, even when one of the steps above failed
        shutil.rmtree(run_dir)

    # Exit after a comforting log message; paths are passed as arguments rather than as the format string
    log.info('Produced:')
    if filter_cogs:
        log.info('%s', filter_cogs)
    if filter_recombination:
        log.info('%s', filter_recombination)
    log.info('%s', retained_zip)
    log.info('%s', target_orth_per_genome)
    log.info('%s', target_concat_file)
Exemple #21
0
def main(args):
    """Apply the optional COG and recombination filters to orthologs and write the retained orthologs.

    Main function called when run from command line or as part of pipeline.

    Arguments:
    args -- command line arguments handed to parse_options; see the usage text below for the options.

    Intermediate files live in a temporary run directory that is always removed afterwards, whether or not
    filtering succeeded. Each filter only runs when its option is given.
    """
    usage = """
Usage: filter_orthologs.py
--orthologs-zip=FILE            archive of orthologous genes in FASTA format
--filter-multiple-cogs          filter orthologs with multiple COG annotations among genes [OPTIONAL]

--filter-recombination=FILE     filter orthologs that show recombination when comparing phylogenetic trees [OPTIONAL]
                                destination file path for archive of recombination orthologs
--recombined-crosstable=FILE    destination file path for recombined crosstable of GeneIDs, COGs and Products [OPTIONAL]
--taxon-a=FILE                  file with genome IDs for taxon A to use in recombination filtering
--taxon-b=FILE                  file with genome IDs for taxon B to use in recombination filtering
--retained-zip=FILE             destination file path for archive of retained orthologs after filtering

--orthologs-per-genome=FILE      destination file path for orthologs split out per genome, based on the retained.zip
--concatemer=FILE                destination file path for super-concatemer of all genomes
"""
    options = ('orthologs-zip', 'filter-multiple-cogs=?',
               'filter-recombination=?', 'recombined-crosstable=?',
               'taxon-a=?', 'taxon-b=?', 'retained-zip',
               'orthologs-per-genome', 'concatemer')
    orthologs_zip, filter_cogs, filter_recombination, recombined_crosstable, \
        taxona, taxonb, retained_zip, target_orth_per_genome, target_concat_file = \
        parse_options(usage, options, args)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='filter_orthologs_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

        # Filter orthologs with multiple COG annotations among genes if flag was set
        if filter_cogs:
            ortholog_files, transfered_cogs = _filter_multiple_cog_orthologs(
                run_dir, ortholog_files)

        # Possible extension: filter ortholog when any strain has been flagged as 'mobile element', 'phage' or
        # 'IS element'

        # Filter orthologs that show recombination when comparing phylogenetic trees if flag was set
        if filter_recombination:
            # Parse files to extract GenBank Project IDs; blank lines carry no ID and are ignored
            with open(taxona) as read_handle:
                genome_ids_a = [line.split()[0] for line in read_handle if line.strip()]
            with open(taxonb) as read_handle:
                genome_ids_b = [line.split()[0] for line in read_handle if line.strip()]
            ortholog_files, recombined_files = _phipack_for_all_orthologs(
                run_dir, ortholog_files, genome_ids_a, genome_ids_b)
            # Create crosstable
            create_crosstable(recombined_files, recombined_crosstable)

        # Create archives of files on command line specified output paths
        if filter_cogs:
            # The value given for filter_cogs is used as the destination path of the transferred COGs file
            shutil.move(transfered_cogs, filter_cogs)
        if filter_recombination:
            create_archive_of_files(filter_recombination, recombined_files)
        create_archive_of_files(retained_zip, ortholog_files)

        # Run the steps required after filtering orthologs
        post_recombination_filter(taxona, taxonb, retained_zip,
                                  target_orth_per_genome, target_concat_file,
                                  run_dir)
    finally:
        # Free disk space regardless of the outcome, so failed runs do not leave run directories behind
        shutil.rmtree(run_dir)

    # Exit after a comforting log message; each path is logged as an argument, not as the format string
    log.info('Produced:')
    if filter_cogs:
        log.info('%s', filter_cogs)
    if filter_recombination:
        log.info('%s', filter_recombination)
    log.info('%s', retained_zip)
    log.info('%s', target_orth_per_genome)
    log.info('%s', target_concat_file)