def main(args):
    """Compare the taxa deduced from the unfiltered and the filtered concatemer trees.

    args holds the command line arguments, which parse_options resolves to four tab-separated files of
    genome ID & organism name. The taxon labels A & B may be swapped between the two trees, so the
    orientation is determined first; fail() is called with the files in matching orientation when the
    sets of genome IDs differ.
    """
    usage = """ Usage: compare_taxa.py --unfiltered-taxon-a=FILE genome IDs for taxon A as deduced from phylogenetic tree of unfiltered concatemers --unfiltered-taxon-b=FILE genome IDs for taxon B as deduced from phylogenetic tree of unfiltered concatemers --filtered-taxon-a=FILE genome IDs for taxon A as deduced from phylogenetic tree of filtered concatemers --filtered-taxon-b=FILE genome IDs for taxon B as deduced from phylogenetic tree of filtered concatemers """
    options = ['unfiltered-taxon-a', 'unfiltered-taxon-b', 'filtered-taxon-a', 'filtered-taxon-b']
    unfiltered_a_file, unfiltered_b_file, filtered_a_file, filtered_b_file = parse_options(usage, options, args)

    # Parse ID files to extract GenBank Project IDs & Organism Name
    unfiltered_a = _read_id_to_name(unfiltered_a_file)
    unfiltered_b = _read_id_to_name(unfiltered_b_file)
    filtered_a = _read_id_to_name(filtered_a_file)
    filtered_b = _read_id_to_name(filtered_b_file)

    if _taxa_labels_match(unfiltered_a, unfiltered_b, filtered_a, filtered_b):
        # Taxon A of the unfiltered tree corresponds to taxon A of the filtered tree
        if not (set(unfiltered_a) == set(filtered_a) and set(unfiltered_b) == set(filtered_b)):
            fail(unfiltered_a, unfiltered_b, filtered_a, filtered_b)
    else:
        # Labels are swapped: compare unfiltered A to filtered B and vice versa
        if not (set(unfiltered_a) == set(filtered_b) and set(unfiltered_b) == set(filtered_a)):
            fail(unfiltered_a, unfiltered_b, filtered_b, filtered_a)

    # Else: no problems were found
    log.info('Succes: Unfiltered & filtered tree clustering did not result in different taxa.')


def _read_id_to_name(id_file):
    """Return a dictionary mapping the first tab-separated column of each line to the second column."""
    id_to_name = {}
    with open(id_file) as read_handle:
        for line in read_handle:
            # Blank lines carry no ID and would otherwise raise an IndexError on the missing name column
            if not line.strip():
                continue
            id_to_name[line.split('\t')[0]] = line.strip().split('\t')[1]
    return id_to_name


def _taxa_labels_match(unfiltered_a, unfiltered_b, filtered_a, filtered_b):
    """Return True when an arbitrary unfiltered genome ID is found under the same taxon label after filtering."""
    # next(iter(..)) works on both Python 2 & 3, whereas dict.keys()[0] is Python 2 only
    if unfiltered_a:
        return next(iter(unfiltered_a)) in filtered_a
    # Taxon A is empty: fall back to taxon B rather than raising on the missing first key
    if unfiltered_b:
        return next(iter(unfiltered_b)) in filtered_b
    return True
def main(args):
    """Run PhiPack over all orthologs in an archive and write the values found to a stats file.

    args holds the command line arguments, which parse_options resolves to the orthologs archive and
    the destination stats file path.
    """
    usage = """ Usage: run_phipack.py --orthologs-zip=FILE archive of orthologous genes in FASTA format --stats-file=FILE destination file path for values found through PhiPack for each ortholog """
    options = ('orthologs-zip', 'stats-file')
    orthologs_zip, stats_file = parse_options(usage, options, args)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='run_phipack_')
    try:
        # Extract files from zip archive
        extraction_dir = create_directory('extracted_orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, extraction_dir)

        # Find recombination in all ortholog_files
        _phipack_for_all_orthologs(run_dir, ortholog_files, stats_file)
    finally:
        # Free disk space even when a step above raised, so failed runs do not leave run_dir behind
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info('Produced:\n%s', stats_file)
def main(args):
    """Reformat the headers of user supplied genomes and collect the formatted files in an archive.

    args holds the command line arguments, which parse_options resolves to a comma-separated list of
    label:file pairs and the destination archive path. Raises ValueError for a pair without a colon.
    """
    usage = """ Usage: select_taxa.py --external-genomes= comma-separated list of label:nucleotide fasta file pairs of externally supplied genomes. label:FILE,... labels should be unique as genomes will be identified by this label in further output files --external-zip=FILE destination path for archive of user provided external genomes containing formatted nucleotide fasta files """
    options = ['external-genomes', 'external-zip']
    external_genomes, external_zip = parse_options(usage, options, args)

    # External genomes are nucleotide fasta files uploaded by the user of which we will reformat the header
    external_fasta_files = {}

    try:
        # Sample line: label1:file1,label2:file2,
        for label_file in external_genomes.split(','):
            # The trailing comma is a Galaxy artifact, which results in an empty pair we ignore
            if not label_file:
                continue

            # Split on the first colon only, so file paths that themselves contain a colon remain intact
            label, filename = label_file.split(':', 1)
            if not label:
                log.error('Empty label provided for upload genome %s. Please provide a label and try again.', filename)
                break

            log.info('Formatting external genome labeled %s at %s', label, filename)
            external_fasta_files[label] = format_fasta_genome_headers(label, filename)

        # Copy formatted external genome files to archive that will be output as well
        create_archive_of_files(external_zip, external_fasta_files.values())
    finally:
        # Remove temporary formatted files, also when formatting or archiving failed halfway
        for formatted_file in external_fasta_files.values():
            os.remove(formatted_file)

    # Exit after a comforting log message
    log.info("Produced: \n%s", external_zip)
def main(args):
    """Concatenate orthologs per genome, build a tree from the super concatemer and write out both taxa.

    args holds the command line arguments, which parse_options resolves to the orthologs archive and
    the destination paths for coding regions archive, super concatemer, taxon A & B ID files and tree.
    """
    usage = """ Usage: concatenate_orthologs.py --orthologs-zip=FILE archive of orthologous genes in FASTA format --coding-regions=FILE destination file path archive of trimmed orthologous coding regions per genomes --concatemer=FILE destination file path for super-concatemer of all genomes --taxon-a=FILE destination file path for genome IDs for taxon A --taxon-b=FILE destination file path for genome IDs for taxon B --tree=FILE destination file path for tree visualization """
    options = ['orthologs-zip', 'coding-regions', 'concatemer', 'taxon-a', 'taxon-b', 'tree']
    orthologs_zip, target_coding_regions, target_concat_file, target_taxon_a, target_taxon_b, target_tree = \
        parse_options(usage, options, args)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='concatemer_tree_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

        # Separate out orthologs per genome to create trimmed coding region files per genome
        genome_coding_regions_files = coding_regions_per_genome(run_dir, ortholog_files)
        create_archive_of_files(target_coding_regions, genome_coding_regions_files)

        # Concatenate coding region files per genome
        concatemer_files = concatemer_per_genome(run_dir, genome_coding_regions_files)

        # Create super concatemer
        create_super_concatemer(concatemer_files, target_concat_file)

        # Determine the taxa present in the super concatemer tree by building a phylogenetic tree from genome
        # concatemer and reading genome ids in the two largest clades.
        super_distance_file = _run_dna_dist(run_dir, target_concat_file)
        super_tree_file = _run_neighbor(run_dir, super_distance_file)
        genome_ids_a, genome_ids_b = _read_taxa_from_tree(super_tree_file)

        # Map Project IDs to Organism names; items() works on both Python 2 & 3, unlike iteritems()
        selected_genomes = select_genomes_by_ids(genome_ids_a + genome_ids_b)
        id_to_name_map = dict((gid, genome['Organism/Name']) for gid, genome in selected_genomes.items())

        # Write Project IDs and Organism Names to files
        _write_taxon_file(target_taxon_a, genome_ids_a, id_to_name_map)
        _write_taxon_file(target_taxon_b, genome_ids_b, id_to_name_map)

        # Visualize tree
        visualize_tree(super_tree_file, id_to_name_map, target_tree)
    finally:
        # Free disk space even when a step above raised, so failed runs do not leave run_dir behind
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info('Produced: \n%s\n%s\n%s\n%s\n%s', target_coding_regions, target_concat_file,
             target_taxon_a, target_taxon_b, target_tree)


def _write_taxon_file(target_file, genome_ids, id_to_name_map):
    """Write a tab-separated line of ID & name per genome, with a fallback to the ID for unmapped genomes."""
    with open(target_file, mode='w') as write_handle:
        for genome_id in genome_ids:
            # IDs absent from the map, such as external genomes, are written with their ID as name
            write_handle.write('{id}\t{name}\n'.format(id=genome_id,
                                                       name=id_to_name_map.get(genome_id, genome_id)))
def main(args):
    """Select genomes by ID, download their files and verify at least two genomes remain when some were skipped.

    args holds the command line arguments, which parse_options resolves to an optional comma-separated
    ID list, an optional previous genomes file, the require-protein-table flag and the destination file.
    Exits with status 1 when more than the maximum number of genomes is selected. Raises AssertionError
    when the genomes file contains skipped entries (IDs starting with '#') and fewer than two others.
    """
    usage = """ Usage: select_taxa.py --genomes=ID,... optional comma-separated list of selected GenBank Project IDs from complete genomes table --previous-file=FILE optional previously or externally created GenBank Project IDs file whose genomes should be reselected --require-protein-table require protein table files to be present for all downloaded genomes --genomes-file=FILE destination path for file with selected genome IDs followed by Organism Name on each line """
    options = ['genomes=?', 'previous-file=?', 'require-protein-table?', 'genomes-file']
    genomes_line, previous_file, require_ptt, genomes_file = parse_options(usage, options, args)

    # Genome IDs selected by the user that refer to GenBank or RefSeq entries
    genome_ids = []

    # Split ids on comma
    if genomes_line:
        genome_ids.extend(val for val in genomes_line.split(',') if val)

    # Allow for input of previous or externally created genomes-file to rerun an analysis
    if previous_file:
        # Read previous GenBank Project IDs from previous_file, each on their own line
        with open(previous_file) as read_handle:
            genome_ids.extend(line.split()[0] for line in read_handle
                              # Skip blank lines, which have no ID column to read
                              if line.strip()
                              # But skip external genomes as their IDs will fail to download
                              and 'external genome' not in line)

    # Only the upper bound on the number of selected genomes is enforced here
    maximum = 100
    # TODO Move this test to translate, where we can see how many translations succeeded + how many externals there are
    if maximum < len(genome_ids):
        logging.error('Expected between two and %s selected genomes, but was %s', maximum, len(genome_ids))
        sys.exit(1)

    # Retrieve genome dictionaries to get to Organism Name
    genomes = sorted(select_genomes_by_ids(genome_ids).values(), key=itemgetter('Organism/Name'))

    # Semi-touch genomes file in case no genomes were selected, for instance when uploading external genomes
    with open(genomes_file, mode='a'):
        pass

    for genome in genomes:
        # Download files here, but ignore returned files: These can be retrieved from cache during
        # extraction/translation. NOTE(review): presumably this call also records the genome in genomes_file,
        # as that file is read back below - confirm against download_genome_files
        download_genome_files(genome, genomes_file, require_ptt=require_ptt)

    # Post check to see if more than one genome actually had some genomic contents
    with open(genomes_file) as read_handle:
        genome_ids = [line.split()[0] for line in read_handle if line.strip()]

    # If some genomes were skipped, ensure at least two genomes remain
    skipped = [gid for gid in genome_ids if gid.startswith('#')]
    retained = [gid for gid in genome_ids if not gid.startswith('#')]
    if skipped and len(retained) < 2:
        # Raise explicitly rather than use an assert statement, as the latter is stripped when run with -O
        raise AssertionError("Some genomes were skipped, leaving us with less than two genomes to operate on; "
                             "Inspect messages in Project ID list and reevaluate genome selection")

    # Exit after a comforting log message
    logging.info("Produced: \n%s", genomes_file)
def main(args):
    """Translate the selected and any external genomes, and archive the resulting DNA & protein files.

    args holds the command line arguments, which parse_options resolves to the genome IDs file, an
    optional archive of external genomes and the destination paths of the DNA & protein archives.
    """
    usage = """ Usage: translate.py --genomes=FILE file with selected genome IDs followed by Organism Name on each line --external-zip=FILE optional archive of user provided external genomes containing formatted nucleotide fasta files --dna-zip=FILE destination file path for zip archive of extracted DNA files --protein-zip=FILE destination file path for zip archive of translated protein files """
    options = ['genomes', 'external-zip=?', 'dna-zip', 'protein-zip']
    genome_ids_file, external_zip, dna_zipfile, protein_zipfile = parse_options(usage, options, args)

    dna_files = []
    protein_files = []

    # Read GenBank Project IDs from genomes_file, each on their own line
    with open(genome_ids_file) as read_handle:
        genome_ids = [line.split()[0] for line in read_handle
                      # Blank lines have no ID column, while '#' and external genome lines are not to be selected
                      if line.strip() and not line.startswith('#') and 'external genome' not in line]

    if genome_ids:
        # Retrieve associated genome dictionaries from complete genomes table
        genomes = sorted(select_genomes_by_ids(genome_ids).values(), key=itemgetter('Organism/Name'))

        # Actually translate the genomes to produce a set of files for both dna files & protein files
        dna_files, protein_files = translate_genomes(genomes)

    external_dir = None
    try:
        # Also translate the external genomes
        if external_zip:
            # Extract external genomes archive
            external_dir = tempfile.mkdtemp(prefix='external_genomes_')
            external_dna_files = extract_archive_of_files(external_zip, external_dir)

            # Append IDs of external fasta files to genome IDs file
            _append_external_genomes(external_dna_files, genome_ids_file)

            # Translate individual files
            external_protein_files = [translate_fasta_coding_regions(dna_file) for dna_file in external_dna_files]

            # Add the files to the appropriate collections
            dna_files.extend(external_dna_files)
            protein_files.extend(external_protein_files)

        # Write the produced files to command line argument filenames
        create_archive_of_files(dna_zipfile, dna_files)
        create_archive_of_files(protein_zipfile, protein_files)
    finally:
        # Do not clean up extracted DNA files or Protein translations: Keep them as cache
        # But do clean up external_dir, also when translation or archiving failed halfway
        if external_dir is not None:
            shutil.rmtree(external_dir)

    # Exit after a comforting log message
    log.info("Produced: \n%s &\n%s", dna_zipfile, protein_zipfile)
def main(args):
    """Align orthologs, trim the alignments and archive the aligned, misaligned & trimmed files.

    args holds the command line arguments, which parse_options resolves to the orthologs archive, the
    two integer filter settings and the destination paths. int() raises when retained-threshold or
    max-indel-length is not formatted as an integer.
    """
    usage = """ Usage: filter_orthologs.py --orthologs-zip=FILE archive of orthologous genes in FASTA format --retained-threshold=PERC filter orthologs that retain less than PERC % of sequence after trimming alignment --max-indel-length=NUMBER filter orthologs that contain insertions / deletions longer than N in middle of alignment --aligned-zip=FILE destination file path for archive of aligned orthologous genes --misaligned-zip=FILE destination file path for archive of misaligned orthologous genes --trimmed-zip=FILE destination file path for archive of aligned & trimmed orthologous genes --stats=FILE destination file path for ortholog trimming statistics file --scatterplot=FILE destination file path for scatterplot of retained and filtered sequences by length """
    options = ['orthologs-zip', 'retained-threshold', 'max-indel-length',
               'aligned-zip', 'misaligned-zip', 'trimmed-zip', 'stats', 'scatterplot']
    orthologs_zip, retained_threshold, max_indel_length, \
        aligned_zip, misaligned_zip, trimmed_zip, target_stats_path, target_scatterplot = \
        parse_options(usage, options, args)

    # Convert to integers before any work is done, so we can fail fast if argument value format was wrong
    retained_threshold = int(retained_threshold)
    max_indel_length = int(max_indel_length)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='align_trim_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        sico_files = extract_archive_of_files(orthologs_zip, temp_dir)

        # Align SICOs so all sequences become equal length sequences
        aligned_files = _align_sicos(run_dir, sico_files)

        # Filter orthologs that retain less than PERC % of sequence after trimming alignment
        trimmed_files, misaligned_files = _trim_alignments(run_dir, aligned_files, retained_threshold,
                                                           max_indel_length, target_stats_path,
                                                           target_scatterplot)

        # Create archives of files on command line specified output paths
        create_archive_of_files(aligned_zip, aligned_files)
        create_archive_of_files(misaligned_zip, misaligned_files)
        create_archive_of_files(trimmed_zip, trimmed_files)
    finally:
        # Free disk space even when a step above raised, so failed runs do not leave run_dir behind
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info('Produced: \n%s', '\n'.join((aligned_zip, misaligned_zip, trimmed_zip,
                                          target_stats_path, target_scatterplot)))
def main(args):
    """Split aligned orthologs into separate archives for taxon A and taxon B.

    args holds the command line arguments, which parse_options resolves to the two genome ID files,
    the orthologs archive and the two destination archive paths. Returns the tuple of destination
    paths (taxon_a_zip, taxon_b_zip).
    """
    usage = """ Usage: split_by_taxa.py --genomes-a=FILE file with genome GenBank Project ID and Organism name on each line for taxon A --genomes-b=FILE file with genome GenBank Project ID and Organism name on each line for taxon B --orthologs-zip=FILE archive of aligned & trimmed single copy orthologous (SICO) genes --taxon-a-zip=FILE destination file path for archive of SICO genes belonging to taxon A --taxon-b-zip=FILE destination file path for archive of SICO genes belonging to taxon B """
    options = ['genomes-a', 'genomes-b', 'orthologs-zip', 'taxon-a-zip', 'taxon-b-zip']
    genome_a_ids_file, genome_b_ids_file, orthologs_zip, taxon_a_zip, taxon_b_zip = \
        parse_options(usage, options, args)

    # Parse files containing project IDs & organism names per taxon
    genome_ids_a, common_prefix_a = _read_ids_and_prefix(genome_a_ids_file, 'taxon_a')
    genome_ids_b, common_prefix_b = _read_ids_and_prefix(genome_b_ids_file, 'taxon_b')

    # Create run_dir to hold files related to this run
    run_dir = tempfile.mkdtemp(prefix='split_by_taxa_')
    try:
        # Extract files from zip archive
        ortholog_files = extract_archive_of_files(orthologs_zip, create_directory('alignments', inside_dir=run_dir))

        # Actually split alignments per taxon
        taxon_a_files, taxon_b_files = split_alignment_by_taxa(run_dir, ortholog_files,
                                                               (genome_ids_a, common_prefix_a),
                                                               (genome_ids_b, common_prefix_b))

        # Write the produced files to command line argument filenames
        create_archive_of_files(taxon_a_zip, taxon_a_files)
        create_archive_of_files(taxon_b_zip, taxon_b_files)
    finally:
        # Free disk space even when a step above raised, so failed runs do not leave run_dir behind
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info("Produced: \n%s\n%s", taxon_a_zip, taxon_b_zip)
    return taxon_a_zip, taxon_b_zip


def _read_ids_and_prefix(ids_file, taxon_label):
    """Return the IDs from the first column, and the _common_prefix of the names in the second column."""
    with open(ids_file) as read_handle:
        # Drop the line ending, so the last organism name does not end in a newline, and skip blank lines
        rows = [line.rstrip('\r\n').split('\t') for line in read_handle if line.strip()]
    genome_ids = [row[0] for row in rows]
    # NOTE(review): taxon_label is presumably the fallback used by _common_prefix - confirm against its definition
    return genome_ids, _common_prefix([row[1] for row in rows], taxon_label)
def main(args):
    """Align orthologs, trim the alignments and archive the aligned, misaligned & trimmed files.

    args holds the command line arguments, which parse_options resolves to the orthologs archive, the
    two integer filter settings and the destination paths. int() raises when retained-threshold or
    max-indel-length is not formatted as an integer.
    """
    usage = """ Usage: filter_orthologs.py --orthologs-zip=FILE archive of orthologous genes in FASTA format --retained-threshold=PERC filter orthologs that retain less than PERC % of sequence after trimming alignment --max-indel-length=NUMBER filter orthologs that contain insertions / deletions longer than N in middle of alignment --aligned-zip=FILE destination file path for archive of aligned orthologous genes --misaligned-zip=FILE destination file path for archive of misaligned orthologous genes --trimmed-zip=FILE destination file path for archive of aligned & trimmed orthologous genes --stats=FILE destination file path for ortholog trimming statistics file --scatterplot=FILE destination file path for scatterplot of retained and filtered sequences by length """
    options = ['orthologs-zip', 'retained-threshold', 'max-indel-length',
               'aligned-zip', 'misaligned-zip', 'trimmed-zip', 'stats', 'scatterplot']
    orthologs_zip, retained_threshold, max_indel_length, \
        aligned_zip, misaligned_zip, trimmed_zip, target_stats_path, target_scatterplot = \
        parse_options(usage, options, args)

    # Convert to integers before any work is done, so we can fail fast if argument value format was wrong
    retained_threshold = int(retained_threshold)
    max_indel_length = int(max_indel_length)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='align_trim_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        sico_files = extract_archive_of_files(orthologs_zip, temp_dir)

        # Align SICOs so all sequences become equal length sequences
        aligned_files = _align_sicos(run_dir, sico_files)

        # Filter orthologs that retain less than PERC % of sequence after trimming alignment
        trimmed_files, misaligned_files = _trim_alignments(run_dir, aligned_files, retained_threshold,
                                                           max_indel_length, target_stats_path,
                                                           target_scatterplot)

        # Create archives of files on command line specified output paths
        create_archive_of_files(aligned_zip, aligned_files)
        create_archive_of_files(misaligned_zip, misaligned_files)
        create_archive_of_files(trimmed_zip, trimmed_files)
    finally:
        # Free disk space even when a step above raised, so failed runs do not leave run_dir behind
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info('Produced: \n%s', '\n'.join((aligned_zip, misaligned_zip, trimmed_zip,
                                          target_stats_path, target_scatterplot)))
def main(args):
    """Split aligned orthologs into separate archives for taxon A and taxon B.

    args holds the command line arguments, which parse_options resolves to the two genome ID files,
    the orthologs archive and the two destination archive paths. Returns the tuple of destination
    paths (taxon_a_zip, taxon_b_zip).
    """
    usage = """ Usage: split_by_taxa.py --genomes-a=FILE file with genome GenBank Project ID and Organism name on each line for taxon A --genomes-b=FILE file with genome GenBank Project ID and Organism name on each line for taxon B --orthologs-zip=FILE archive of aligned & trimmed single copy orthologous (SICO) genes --taxon-a-zip=FILE destination file path for archive of SICO genes belonging to taxon A --taxon-b-zip=FILE destination file path for archive of SICO genes belonging to taxon B """
    options = ['genomes-a', 'genomes-b', 'orthologs-zip', 'taxon-a-zip', 'taxon-b-zip']
    genome_a_ids_file, genome_b_ids_file, orthologs_zip, taxon_a_zip, taxon_b_zip = \
        parse_options(usage, options, args)

    # Parse files containing project IDs & organism names per taxon
    genome_ids_a, common_prefix_a = _read_ids_and_prefix(genome_a_ids_file, 'taxon_a')
    genome_ids_b, common_prefix_b = _read_ids_and_prefix(genome_b_ids_file, 'taxon_b')

    # Create run_dir to hold files related to this run
    run_dir = tempfile.mkdtemp(prefix='split_by_taxa_')
    try:
        # Extract files from zip archive
        ortholog_files = extract_archive_of_files(orthologs_zip, create_directory('alignments', inside_dir=run_dir))

        # Actually split alignments per taxon
        taxon_a_files, taxon_b_files = split_alignment_by_taxa(run_dir, ortholog_files,
                                                               (genome_ids_a, common_prefix_a),
                                                               (genome_ids_b, common_prefix_b))

        # Write the produced files to command line argument filenames
        create_archive_of_files(taxon_a_zip, taxon_a_files)
        create_archive_of_files(taxon_b_zip, taxon_b_files)
    finally:
        # Free disk space even when a step above raised, so failed runs do not leave run_dir behind
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info("Produced: \n%s\n%s", taxon_a_zip, taxon_b_zip)
    return taxon_a_zip, taxon_b_zip


def _read_ids_and_prefix(ids_file, taxon_label):
    """Return the IDs from the first column, and the _common_prefix of the names in the second column."""
    with open(ids_file) as read_handle:
        # Drop the line ending, so the last organism name does not end in a newline, and skip blank lines
        rows = [line.rstrip('\r\n').split('\t') for line in read_handle if line.strip()]
    genome_ids = [row[0] for row in rows]
    # NOTE(review): taxon_label is presumably the fallback used by _common_prefix - confirm against its definition
    return genome_ids, _common_prefix([row[1] for row in rows], taxon_label)
def main(args):
    """Run codeml for all SICO genes, then write the dN/dS values and archive the codeml output.

    args holds the command line arguments, which parse_options resolves to the two genome ID files,
    the SICO archive and the destination paths for the codeml archive & dN/dS statistics file.
    """
    usage = """ Usage: run_codeml.py --genomes-a=FILE file with GenBank Project IDs from complete genomes table on each line for taxon A --genomes-b=FILE file with GenBank Project IDs from complete genomes table on each line for taxon B --sico-zip=FILE archive of aligned & trimmed single copy orthologous (SICO) genes --codeml-zip=FILE destination file path for archive of codeml output per SICO gene --dnds-stats=FILE destination file path for file with dN, dS & dN/dS values per SICO gene """
    options = ['genomes-a', 'genomes-b', 'sico-zip', 'codeml-zip', 'dnds-stats']
    genome_a_ids_file, genome_b_ids_file, sico_zip, codeml_zip, dnds_file = parse_options(usage, options, args)

    # Parse files to extract GenBank Project IDs, skipping blank lines that have no ID column
    with open(genome_a_ids_file) as read_handle:
        genome_ids_a = [line.split()[0] for line in read_handle if line.strip()]
    with open(genome_b_ids_file) as read_handle:
        genome_ids_b = [line.split()[0] for line in read_handle if line.strip()]

    # Create run_dir to hold files relating to this run
    run_dir = tempfile.mkdtemp(prefix='run_codeml_')
    try:
        # Extract files from zip archive
        sico_files = extract_archive_of_files(sico_zip, create_directory('sicos', inside_dir=run_dir))

        # Actually run codeml
        codeml_files = run_codeml_for_sicos(run_dir, genome_ids_a, genome_ids_b, sico_files)

        # Write dnds values to single output file
        _write_dnds_per_ortholog(dnds_file, codeml_files)

        # Write the produced files to command line argument filenames
        create_archive_of_files(codeml_zip, codeml_files)
    finally:
        # Free disk space even when a step above raised, so failed runs do not leave run_dir behind
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    logging.info("Produced: \n%s\n%s", codeml_zip, dnds_file)
def main(args):
    """Create a crosstable between orthologs & genomes from an archive of SICO files.

    args holds the command line arguments, which parse_options resolves to the SICO archive and the
    destination path of the crosstable.
    """
    usage = """ Usage: crosstable_gene_ids.py --sico-zip=FILE archive of single copy orthologous (SICO) genes in separate files per ortholog --crosstable=FILE destination file path for crosstable between orthologs & genomes with gene IDs at intersections """
    options = ['sico-zip', 'crosstable']
    sico_zip, target_crosstable = parse_options(usage, options, args)

    # Extract into a temporary directory that is private to this run
    run_dir = tempfile.mkdtemp(prefix='crosstable_')
    try:
        sico_files = extract_archive_of_files(sico_zip, run_dir)

        # Create crosstable
        create_crosstable(sico_files, target_crosstable)
    finally:
        # Remove extracted files to free disk space, also when extraction or crosstable creation failed
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    logging.info("Produced: \n%s", target_crosstable)
def main(args):
    """Run codeml for all SICO genes, then write the dN/dS values and archive the codeml output.

    args holds the command line arguments, which parse_options resolves to the two genome ID files,
    the SICO archive and the destination paths for the codeml archive & dN/dS statistics file.
    """
    usage = """ Usage: run_codeml.py --genomes-a=FILE file with GenBank Project IDs from complete genomes table on each line for taxon A --genomes-b=FILE file with GenBank Project IDs from complete genomes table on each line for taxon B --sico-zip=FILE archive of aligned & trimmed single copy orthologous (SICO) genes --codeml-zip=FILE destination file path for archive of codeml output per SICO gene --dnds-stats=FILE destination file path for file with dN, dS & dN/dS values per SICO gene """
    options = ['genomes-a', 'genomes-b', 'sico-zip', 'codeml-zip', 'dnds-stats']
    genome_a_ids_file, genome_b_ids_file, sico_zip, codeml_zip, dnds_file = parse_options(usage, options, args)

    # Parse files to extract GenBank Project IDs, skipping blank lines that have no ID column
    with open(genome_a_ids_file) as read_handle:
        genome_ids_a = [line.split()[0] for line in read_handle if line.strip()]
    with open(genome_b_ids_file) as read_handle:
        genome_ids_b = [line.split()[0] for line in read_handle if line.strip()]

    # Create run_dir to hold files relating to this run
    run_dir = tempfile.mkdtemp(prefix='run_codeml_')
    try:
        # Extract files from zip archive
        sico_files = extract_archive_of_files(sico_zip, create_directory('sicos', inside_dir=run_dir))

        # Actually run codeml
        codeml_files = run_codeml_for_sicos(run_dir, genome_ids_a, genome_ids_b, sico_files)

        # Write dnds values to single output file
        _write_dnds_per_ortholog(dnds_file, codeml_files)

        # Write the produced files to command line argument filenames
        create_archive_of_files(codeml_zip, codeml_files)
    finally:
        # Free disk space even when a step above raised, so failed runs do not leave run_dir behind
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    logging.info("Produced: \n%s\n%s", codeml_zip, dnds_file)
def main(args):
    """Reformat the headers of user supplied genomes and collect the formatted files in an archive.

    args holds the command line arguments, which parse_options resolves to a comma-separated list of
    label:file pairs and the destination archive path. Raises ValueError for a pair without a colon.
    """
    usage = """ Usage: select_taxa.py --external-genomes= comma-separated list of label:nucleotide fasta file pairs of externally supplied genomes. label:FILE,... labels should be unique as genomes will be identified by this label in further output files --external-zip=FILE destination path for archive of user provided external genomes containing formatted nucleotide fasta files """
    options = ['external-genomes', 'external-zip']
    external_genomes, external_zip = parse_options(usage, options, args)

    # External genomes are nucleotide fasta files uploaded by the user of which we will reformat the header
    external_fasta_files = {}

    try:
        # Sample line: label1:file1,label2:file2,
        for label_file in external_genomes.split(','):
            # The trailing comma is a Galaxy artifact, which results in an empty pair we ignore
            if not label_file:
                continue

            # Split on the first colon only, so file paths that themselves contain a colon remain intact
            label, filename = label_file.split(':', 1)
            if not label:
                log.error('Empty label provided for upload genome %s. Please provide a label and try again.', filename)
                break

            log.info('Formatting external genome labeled %s at %s', label, filename)
            external_fasta_files[label] = format_fasta_genome_headers(label, filename)

        # Copy formatted external genome files to archive that will be output as well
        create_archive_of_files(external_zip, external_fasta_files.values())
    finally:
        # Remove temporary formatted files, also when formatting or archiving failed halfway
        for formatted_file in external_fasta_files.values():
            os.remove(formatted_file)

    # Exit after a comforting log message
    log.info("Produced: \n%s", external_zip)
def main(args):
    """Extract orthologs for the selected genomes and move the results to the requested output paths.

    args holds the command line arguments, which parse_options resolves to the genome IDs file, the
    DNA archive, the groups file, the require-limiter flag and the destination paths. The muco & subset
    archives are optional, and are only created when their destination path was provided.
    """
    usage = """ Usage: extract_orthologs.py --genomes=FILE file with GenBank Project IDs from complete genomes table on each line --dna-zip=FILE zip archive of extracted DNA files --groups=FILE file listing groups of orthologous proteins --require-limiter flag whether extracted core set of genomes should contain the limiter added in OrthoMCL [OPTIONAL] --sico-zip=FILE destination file path for archive of shared single copy orthologous (SICO) genes --muco-zip=FILE destination file path for archive of shared multiple copy orthologous genes --subset-zip=FILE destination file path for archive of variable copy orthologous genes shared for a subset only --stats=FILE destination file path for ortholog statistics file --heatmap=FILE destination file path heatmap of orthologs and occurrences of ortholog per genome --orfans=FILE destination file path ORFans """
    options = ['genomes', 'dna-zip', 'groups', 'require-limiter?',
               'sico-zip', 'muco-zip=?', 'subset-zip=?', 'stats', 'heatmap', 'orfans']
    genome_ids_file, dna_zip, groups_file, require_limiter, \
        target_sico, target_muco, target_subset, target_stats_path, target_heat, target_orfans = \
        parse_options(usage, options, args)

    # Parse file to extract GenBank Project IDs, skipping blank lines and lines that start with '#'
    with open(genome_ids_file) as read_handle:
        genomes = [line.split()[0] for line in read_handle if line.strip() and not line.startswith('#')]

    # Create temporary directory within which to extract orthologs
    run_dir = tempfile.mkdtemp(prefix='extract_orthologs_run_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('dna_files', inside_dir=run_dir)
        dna_files = extract_archive_of_files(dna_zip, temp_dir)

        # Actually run ortholog extraction
        sico_files, muco_files, subset_files, stats_file, heatmap_file, orfans_file = \
            extract_orthologs(run_dir, genomes, dna_files, groups_file, require_limiter)

        # Append the orfans to the heatmap file
        _append_orfans_to_heatmap(orfans_file, genomes, heatmap_file)

        # Move produced files to command line specified output paths
        create_archive_of_files(target_sico, sico_files)
        if target_muco:
            create_archive_of_files(target_muco, muco_files)
        if target_subset:
            create_archive_of_files(target_subset, subset_files)
        shutil.move(stats_file, target_stats_path)
        shutil.move(heatmap_file, target_heat)
        shutil.move(orfans_file, target_orfans)
    finally:
        # Free disk space even when a step above raised, so failed runs do not leave run_dir behind
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info("Produced:")
    log.info("%s", target_sico)
    if target_muco:
        log.info("%s", target_muco)
    if target_subset:
        log.info("%s", target_subset)
    log.info("%s", target_stats_path)
    log.info("%s", target_heat)
    # The orfans file is produced above as well, so report it along with the other output files
    log.info("%s", target_orfans)
def main(args):
    """Concatenate orthologs per genome, build a tree from the super concatemer and write out both taxa.

    args holds the command line arguments, which parse_options resolves to the orthologs archive and
    the destination paths for coding regions archive, super concatemer, taxon A & B ID files and tree.
    """
    usage = """ Usage: concatenate_orthologs.py --orthologs-zip=FILE archive of orthologous genes in FASTA format --coding-regions=FILE destination file path archive of trimmed orthologous coding regions per genomes --concatemer=FILE destination file path for super-concatemer of all genomes --taxon-a=FILE destination file path for genome IDs for taxon A --taxon-b=FILE destination file path for genome IDs for taxon B --tree=FILE destination file path for tree visualization """
    options = ['orthologs-zip', 'coding-regions', 'concatemer', 'taxon-a', 'taxon-b', 'tree']
    orthologs_zip, target_coding_regions, target_concat_file, target_taxon_a, target_taxon_b, target_tree = \
        parse_options(usage, options, args)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='concatemer_tree_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

        # Separate out orthologs per genome to create trimmed coding region files per genome
        genome_coding_regions_files = coding_regions_per_genome(run_dir, ortholog_files)
        create_archive_of_files(target_coding_regions, genome_coding_regions_files)

        # Concatenate coding region files per genome
        concatemer_files = concatemer_per_genome(run_dir, genome_coding_regions_files)

        # Create super concatemer
        create_super_concatemer(concatemer_files, target_concat_file)

        # Determine the taxa present in the super concatemer tree by building a phylogenetic tree from genome
        # concatemer and reading genome ids in the two largest clades.
        super_distance_file = _run_dna_dist(run_dir, target_concat_file)
        super_tree_file = _run_neighbor(run_dir, super_distance_file)
        genome_ids_a, genome_ids_b = _read_taxa_from_tree(super_tree_file)

        # Map Project IDs to Organism names; items() works on both Python 2 & 3, unlike iteritems()
        selected_genomes = select_genomes_by_ids(genome_ids_a + genome_ids_b)
        id_to_name_map = dict((gid, genome['Organism/Name']) for gid, genome in selected_genomes.items())

        # Write Project IDs and Organism Names to files
        _write_taxon_file(target_taxon_a, genome_ids_a, id_to_name_map)
        _write_taxon_file(target_taxon_b, genome_ids_b, id_to_name_map)

        # Visualize tree
        visualize_tree(super_tree_file, id_to_name_map, target_tree)
    finally:
        # Free disk space even when a step above raised, so failed runs do not leave run_dir behind
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info('Produced: \n%s\n%s\n%s\n%s\n%s', target_coding_regions, target_concat_file,
             target_taxon_a, target_taxon_b, target_tree)


def _write_taxon_file(target_file, genome_ids, id_to_name_map):
    """Write a tab-separated line of ID & name per genome, with a fallback to the ID for unmapped genomes."""
    with open(target_file, mode='w') as write_handle:
        for genome_id in genome_ids:
            # IDs absent from the map, such as external genomes, are written with their ID as name
            write_handle.write('{id}\t{name}\n'.format(id=genome_id,
                                                       name=id_to_name_map.get(genome_id, genome_id)))
def main(args):
    """Select genomes by GenBank Project ID, download their files and record the selection in a genomes file.

    Main function called when run from command line or as part of pipeline.

    IDs are gathered from the --genomes option and/or a previous genomes file. The process exits with status 1
    when more than the maximum number of genomes is selected.

    :param args: command line arguments, handed to parse_options together with the usage text below
    :raises AssertionError: when some genomes were skipped (IDs starting with '#' in the genomes file) and fewer
        than two usable genomes remain
    """
    usage = """
Usage: select_taxa.py
--genomes=ID,...           optional comma-separated list of selected GenBank Project IDs from complete genomes table
--previous-file=FILE       optional previously or externally created GenBank Project IDs file whose genomes should be reselected
--require-protein-table    require protein table files to be present for all downloaded genomes
--genomes-file=FILE        destination path for file with selected genome IDs followed by Organism Name on each line
"""
    options = ['genomes=?', 'previous-file=?', 'require-protein-table?', 'genomes-file']
    genomes_line, previous_file, require_ptt, genomes_file = parse_options(usage, options, args)

    # Genome IDs selected by the user that refer to GenBank or RefSeq entries
    genome_ids = []

    # Split ids on comma
    if genomes_line:
        genome_ids.extend(val for val in genomes_line.split(',') if val)

    # Allow for input of previous or externally created genomes-file to rerun an analysis
    if previous_file:
        # Read previous GenBank Project IDs from previous_file, each on their own line
        with open(previous_file) as read_handle:
            genome_ids.extend(
                line.split()[0] for line in read_handle
                # Blank lines carry no ID; indexing their empty split() would raise IndexError
                if line.strip()
                # But skip external genomes as their IDs will fail to download
                and 'external genome' not in line)

    # Guard against selecting more genomes than the maximum; no lower bound is enforced at this point
    maximum = 100
    # TODO Move this test to translate, where we can see how many translations succeeded + how many externals there are
    if maximum < len(genome_ids):
        logging.error('Expected between two and %s selected genomes, but was %s', maximum, len(genome_ids))
        sys.exit(1)

    # Retrieve genome dictionaries to get to Organism Name
    genomes = sorted(select_genomes_by_ids(genome_ids).values(), key=itemgetter('Organism/Name'))

    # Semi-touch genomes file in case no genomes were selected, for instance when uploading external genomes
    open(genomes_file, mode='a').close()

    # Write IDs to file, with organism name as second column to make the project ID files more self explanatory.
    # NOTE(review): the writing itself presumably happens inside download_genome_files, which receives genomes_file
    for genome in genomes:
        # Download files here, but ignore returned files: These can be retrieved from cache during extraction/translation
        download_genome_files(genome, genomes_file, require_ptt=require_ptt)

    # Post check after translation to see if more than one genome actually had some genomic contents
    with open(genomes_file) as read_handle:
        genome_ids = [line.split()[0] for line in read_handle if line.strip()]

    # If some genomes were skipped, ensure at least two genomes remain. Raised explicitly rather than through an
    # assert statement, so the check is not silently dropped when Python runs with -O.
    skipped_count = sum(1 for gid in genome_ids if gid.startswith('#'))
    if skipped_count and len(genome_ids) - skipped_count < 2:
        raise AssertionError("Some genomes were skipped, leaving us with less than two genomes to operate on; "
                             "Inspect messages in Project ID list and reevaluate genome selection")

    # Exit after a comforting log message
    logging.info("Produced: \n%s", genomes_file)
def main(args):
    """Extract orthologs for the listed genomes and deliver archives, statistics, heatmap and ORFans files.

    Main function called when run from command line or as part of pipeline.

    Work is done inside a private temporary directory, which is removed again when the function finishes,
    whether it succeeded or raised.

    :param args: command line arguments, handed to parse_options together with the usage text below
    """
    usage = """
Usage: extract_orthologs.py
--genomes=FILE       file with GenBank Project IDs from complete genomes table on each line
--dna-zip=FILE       zip archive of extracted DNA files
--groups=FILE        file listing groups of orthologous proteins
--require-limiter    flag whether extracted core set of genomes should contain the limiter added in OrthoMCL [OPTIONAL]
--sico-zip=FILE      destination file path for archive of shared single copy orthologous (SICO) genes
--muco-zip=FILE      destination file path for archive of shared multiple copy orthologous genes
--subset-zip=FILE    destination file path for archive of variable copy orthologous genes shared for a subset only
--stats=FILE         destination file path for ortholog statistics file
--heatmap=FILE       destination file path heatmap of orthologs and occurrences of ortholog per genome
--orfans=FILE        destination file path ORFans
"""
    options = ['genomes', 'dna-zip', 'groups', 'require-limiter?',
               'sico-zip', 'muco-zip=?', 'subset-zip=?', 'stats', 'heatmap', 'orfans']
    genome_ids_file, dna_zip, groups_file, require_limiter, \
        target_sico, target_muco, target_subset, target_stats_path, target_heat, target_orfans = \
        parse_options(usage, options, args)

    # Parse file extract GenBank Project IDs, leaving out '#' lines and blank lines (a blank line has no first
    # column, so indexing its split() would raise IndexError)
    with open(genome_ids_file) as read_handle:
        genomes = [line.split()[0] for line in read_handle
                   if line.strip() and not line.startswith('#')]

    # Create temporary directory within which to extract orthologs
    run_dir = tempfile.mkdtemp(prefix='extract_orthologs_run_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('dna_files', inside_dir=run_dir)
        dna_files = extract_archive_of_files(dna_zip, temp_dir)

        # Actually run ortholog extraction
        sico_files, muco_files, subset_files, stats_file, heatmap_file, orfans_file = \
            extract_orthologs(run_dir, genomes, dna_files, groups_file, require_limiter)

        # Append the orfans to the heatmap file
        _append_orfans_to_heatmap(orfans_file, genomes, heatmap_file)

        # Move produced files to command line specified output paths
        create_archive_of_files(target_sico, sico_files)
        if target_muco:
            create_archive_of_files(target_muco, muco_files)
        if target_subset:
            create_archive_of_files(target_subset, subset_files)
        shutil.move(stats_file, target_stats_path)
        shutil.move(heatmap_file, target_heat)
        shutil.move(orfans_file, target_orfans)
    finally:
        # Remove unused files to free disk space, also when one of the steps above failed
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info("Produced:")
    log.info("%s", target_sico)
    if target_muco:
        log.info("%s", target_muco)
    if target_subset:
        log.info("%s", target_subset)
    log.info("%s", target_stats_path)
    log.info("%s", target_heat)
    # The ORFans file is moved to its target above as well, so report it along with the other outputs
    log.info("%s", target_orfans)
def main(args):
    """Filter an archive of orthologs on multiple COG annotations and/or recombination and archive what remains.

    Main function called when run from command line or as part of pipeline.

    Both filters are optional and only run when their option was given. Work is done inside a private temporary
    directory, which is removed again when the function finishes, whether it succeeded or raised.

    :param args: command line arguments, handed to parse_options together with the usage text below
    """
    # --filter-multiple-cogs is listed with =FILE: its option spec below ends in '=?' and its value is used as the
    # destination path of a shutil.move further down, so it takes a value rather than being a bare flag.
    usage = """
Usage: filter_orthologs.py
--orthologs-zip=FILE            archive of orthologous genes in FASTA format
--filter-multiple-cogs=FILE     filter orthologs with multiple COG annotations among genes [OPTIONAL]
                                destination file path for the file produced by this filter
--filter-recombination=FILE     filter orthologs that show recombination when comparing phylogenetic trees [OPTIONAL]
                                destination file path for archive of recombination orthologs
--recombined-crosstable=FILE    destination file path for recombined crosstable of GeneIDs, COGs and Products [OPTIONAL]
--taxon-a=FILE                  file with genome IDs for taxon A to use in recombination filtering
--taxon-b=FILE                  file with genome IDs for taxon B to use in recombination filtering
--retained-zip=FILE             destination file path for archive of retained orthologs after filtering
--orthologs-per-genome=FILE     destination file path for orthologs split out per genome, based on the retained.zip
--concatemer=FILE               destination file path for super-concatemer of all genomes
"""
    options = ('orthologs-zip', 'filter-multiple-cogs=?', 'filter-recombination=?', 'recombined-crosstable=?',
               'taxon-a=?', 'taxon-b=?', 'retained-zip', 'orthologs-per-genome', 'concatemer')
    orthologs_zip, filter_cogs, filter_recombination, recombined_crosstable, \
        taxona, taxonb, retained_zip, target_orth_per_genome, target_concat_file = \
        parse_options(usage, options, args)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='filter_orthologs_')
    try:
        # Extract files from zip archive
        temp_dir = create_directory('orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, temp_dir)

        # Filter orthologs with multiple COG annotations among genes if flag was set
        if filter_cogs:
            ortholog_files, transfered_cogs = _filter_multiple_cog_orthologs(run_dir, ortholog_files)

        # Possible extension: filter ortholog when any strain has been flagged as 'mobile element', 'phage' or
        # 'IS element'

        # Filter orthologs that show recombination when comparing phylogenetic trees if flag was set
        if filter_recombination:
            # Parse file to extract GenBank Project IDs; blank lines have no first column and are skipped
            with open(taxona) as read_handle:
                genome_ids_a = [line.split()[0] for line in read_handle if line.strip()]
            with open(taxonb) as read_handle:
                genome_ids_b = [line.split()[0] for line in read_handle if line.strip()]
            ortholog_files, recombined_files = _phipack_for_all_orthologs(run_dir, ortholog_files,
                                                                          genome_ids_a, genome_ids_b)
            # Create crosstable
            create_crosstable(recombined_files, recombined_crosstable)

        # Create archives of files on command line specified output paths
        if filter_cogs:
            shutil.move(transfered_cogs, filter_cogs)
        if filter_recombination:
            create_archive_of_files(filter_recombination, recombined_files)
        create_archive_of_files(retained_zip, ortholog_files)

        # Run the steps required after filtering orthologs
        post_recombination_filter(taxona, taxonb, retained_zip, target_orth_per_genome, target_concat_file, run_dir)
    finally:
        # Remove unused files to free disk space, also when one of the steps above failed
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info('Produced:')
    if filter_cogs:
        log.info(filter_cogs)
    if filter_recombination:
        log.info(filter_recombination)
    log.info(retained_zip)
    log.info(target_orth_per_genome)
    log.info(target_concat_file)
def main(args):
    """Run the optional COG and recombination filters over an orthologs archive and write the retained set.

    Main function called when run from command line or as part of pipeline.

    Intermediate files live in a private temporary directory that is always removed before returning or
    propagating an exception.

    :param args: command line arguments, handed to parse_options together with the usage text below
    """

    def _first_columns(path):
        """Return the first whitespace separated field of every non-blank line in the file at path."""
        with open(path) as read_handle:
            # Blank lines would make split()[0] raise IndexError, so they are left out
            return [line.split()[0] for line in read_handle if line.strip()]

    # --filter-multiple-cogs is listed with =FILE: its option spec below ends in '=?' and its value is used as the
    # destination path of a shutil.move further down, so it takes a value rather than being a bare flag.
    usage = """
Usage: filter_orthologs.py
--orthologs-zip=FILE            archive of orthologous genes in FASTA format
--filter-multiple-cogs=FILE     filter orthologs with multiple COG annotations among genes [OPTIONAL]
                                destination file path for the file produced by this filter
--filter-recombination=FILE     filter orthologs that show recombination when comparing phylogenetic trees [OPTIONAL]
                                destination file path for archive of recombination orthologs
--recombined-crosstable=FILE    destination file path for recombined crosstable of GeneIDs, COGs and Products [OPTIONAL]
--taxon-a=FILE                  file with genome IDs for taxon A to use in recombination filtering
--taxon-b=FILE                  file with genome IDs for taxon B to use in recombination filtering
--retained-zip=FILE             destination file path for archive of retained orthologs after filtering
--orthologs-per-genome=FILE     destination file path for orthologs split out per genome, based on the retained.zip
--concatemer=FILE               destination file path for super-concatemer of all genomes
"""
    options = ('orthologs-zip', 'filter-multiple-cogs=?', 'filter-recombination=?', 'recombined-crosstable=?',
               'taxon-a=?', 'taxon-b=?', 'retained-zip', 'orthologs-per-genome', 'concatemer')
    orthologs_zip, filter_cogs, filter_recombination, recombined_crosstable, \
        taxona, taxonb, retained_zip, target_orth_per_genome, target_concat_file = \
        parse_options(usage, options, args)

    # Run filtering in a temporary folder, to prevent interference from simultaneous runs
    run_dir = tempfile.mkdtemp(prefix='filter_orthologs_')
    try:
        # Extract files from zip archive
        extraction_dir = create_directory('orthologs', inside_dir=run_dir)
        ortholog_files = extract_archive_of_files(orthologs_zip, extraction_dir)

        # Filter orthologs with multiple COG annotations among genes if flag was set
        if filter_cogs:
            ortholog_files, transfered_cogs = _filter_multiple_cog_orthologs(run_dir, ortholog_files)

        # Possible extension: filter ortholog when any strain has been flagged as 'mobile element', 'phage' or
        # 'IS element'

        # Filter orthologs that show recombination when comparing phylogenetic trees if flag was set
        if filter_recombination:
            # Parse files to extract GenBank Project IDs
            genome_ids_a = _first_columns(taxona)
            genome_ids_b = _first_columns(taxonb)
            ortholog_files, recombined_files = _phipack_for_all_orthologs(run_dir, ortholog_files,
                                                                          genome_ids_a, genome_ids_b)
            # Create crosstable
            create_crosstable(recombined_files, recombined_crosstable)

        # Create archives of files on command line specified output paths
        if filter_cogs:
            shutil.move(transfered_cogs, filter_cogs)
        if filter_recombination:
            create_archive_of_files(filter_recombination, recombined_files)
        create_archive_of_files(retained_zip, ortholog_files)

        # Run the steps required after filtering orthologs
        post_recombination_filter(taxona, taxonb, retained_zip, target_orth_per_genome, target_concat_file, run_dir)
    finally:
        # Remove unused files to free disk space, also when one of the steps above failed
        shutil.rmtree(run_dir)

    # Exit after a comforting log message
    log.info('Produced:')
    for produced in (filter_cogs, filter_recombination):
        # The two filter outputs only exist when their option was given
        if produced:
            log.info(produced)
    log.info(retained_zip)
    log.info(target_orth_per_genome)
    log.info(target_concat_file)