# *** setquery.py can be used to search for the presence of OGs in 3+ groups.
# This can be used to link OGs to particular character traits. ***
# Now modified to enable the use of different sp. code dictionaries and group
# lists in the analysis, determined by sys.argv[1] and sys.argv[2].
import sys

import group

# sys.argv[1] must equal groups, alt_groups or alt_groups_18.
# sys.argv[2] must equal codes, alt_codes, or alt_codes_18.
# function_mappings maps those command-line strings to the data they select.
# NOTE: every group.*() helper is called here, when the dict is built, so the
# values are the returned objects rather than the functions themselves.
function_mappings = {
    'groups': group.groups(),
    'alt_groups': group.alt_groups(),
    'alt_groups_18': group.alt_groups_18(),
    'codes': group.codes(),
    'alt_codes': group.alt_codes(),
    'alt_codes_18': group.alt_codes_18(),
}


def select_function():
    """Return the (group list, species codes) pair chosen on the command line.

    sys.argv[1] and sys.argv[2] are looked up in function_mappings and the two
    matching values are returned together as a tuple.

    Raises:
        SystemExit: if fewer than two arguments were supplied, or if either
            argument is not a key of function_mappings.
    """
    if len(sys.argv) < 3:
        raise SystemExit(
            f'Usage: {sys.argv[0]} <groups> <codes>; valid names are: '
            f'{", ".join(function_mappings)}'
        )
    try:
        return function_mappings[sys.argv[1]], function_mappings[sys.argv[2]]
    except KeyError as err:
        # sys.argv cannot change while the script runs, so retrying the lookup
        # would print the same error forever; report it once and stop instead.
        raise SystemExit(
            f'Invalid function {err}; valid names are: '
            f'{", ".join(function_mappings)}'
        ) from None


# select_function is called with the relevant group list and codes - these
# should correspond, else the program may fail.
group_list, sp_codes = select_function()
# Another alternative to the original find_group.py: iterates over the divided
# SAR and Haptista subgroups.
# A separate script is used for the extra groups rather than iterating over
# every group again.
import re
import glob
import group

codes = group.alt_codes_18()
new_groups = [
    'Alveolata',
    'Centrohelids',
    'Haptophyta',
    'Rhizaria',
    'Stramenopiles',
]
all_groups = group.alt_groups_18()
sorted_all_groups = sorted(all_groups)

# Every query analysed so far, kept so that no analysis is run twice.
all_sets = []

for eugroup in new_groups:
    for ogroup in sorted_all_groups:
        # A subgroup paired with itself is queried alone, as a plain string;
        # any other pairing is held as a set so that (A, B) and (B, A) compare
        # equal.
        group_set = eugroup if eugroup == ogroup else {eugroup, ogroup}

        if group_set in all_sets:
            continue  # This query has already been analysed.
        all_sets.append(group_set)

        # Sets are turned back into a sorted list so the names are passed in
        # alphabetical order.
        if isinstance(group_set, set):
            group_set = sorted(group_set)

        # The 2nd argument of find_group() selects which code set is used.
        group.find_group(group_set, codes)
] correct_order_15 = [ "Telonemids", "Haptista", "SAR", "Atwista", "Archaeplastida", "Ancyromonadida", "Obazoa", "Discoba", "Collodictyonids", "Cryptista", "Amoebozoa", "Metamonads", "Hemimastigophora", "Apusomonada", "Malawimonadidae" ] correct_order_includingown_18 = [ "Alveolata", "Stramenopiles", "Archaeplastida", "Obazoa", "Telonemids", "Centrohelids", "Ancyromonadida", "Discoba", "Rhizaria", "Cryptista", "Atwista", "Haptophyta", "Collodictyonids", "Amoebozoa", "Metamonads", "Hemimastigophora", "Apusomonada", "Malawimonadidae" ] # Choose the necessary group list from the group module. alt_groups = sorted(group.alt_groups_18()) group_names = alt_groups data = [] for name in group_names: to_parse = glob.glob( "/mnt/c/Users/scamb/Documents/uob_msc/Genome_data/OG_arb-fal/new_outputs/*.txt" ) for file in to_parse: filename = re.search(r"([A-Z]\w*)_([A-Z]\w*)_output.txt$", file) if filename: if filename.group( 1 ) not in group_names: # Skips 'non-target' files in new_outputs. pass elif filename.group(
# This script finds all the OGs of each group, regardless of exclusivity.
# One "<group>_allOGs.txt" file per group is written to the current working
# directory (the total_genome directory in the original workflow).
import glob
import group
import re

codes = group.alt_codes_18()
# alt_groups_18() has eukaryote groups SAR and Haptista, as well as 'Other',
# split into their respective subgroups.
group_names = group.alt_groups_18()


def _groups_in_file(path, sp_codes):
    """Return the set of groups that have at least one sequence in *path*.

    Only lines starting with ">" are inspected. The species code is the text
    between that ">" and the first underscore, and it is looked up in
    *sp_codes* to obtain the group name.
    (Presumably *sp_codes* is a dict of species code -> group name; confirm
    against the group module.)

    Raises:
        KeyError: if a header carries a species code missing from *sp_codes*.
    """
    present = set()
    with open(path) as f:
        for line in f:
            if not line.startswith(">"):
                continue
            species_code = line[1:].split("_", 1)[0]
            try:
                present.add(sp_codes[species_code])
            except KeyError:
                raise KeyError(
                    f"Unknown species code {species_code!r} in {path}"
                ) from None
    return present


# Each .fal file is scanned once and its groups remembered, instead of being
# re-read for every group name.
to_parse = glob.glob("*.fal")
file_groups = [(file, _groups_in_file(file, codes)) for file in to_parse]

for name in group_names:
    filename = "%s_allOGs.txt" % name
    i = 0
    with open(filename, "w") as genome:
        for file, groups_present in file_groups:
            # Functions in the same way as findgroup, only without the
            # exclusive set matching.
            if name in groups_present:
                genome.write(f'{file}\n')
                i += 1
        genome.write(f'Total gene families in group: {i}')