import glob
import re

import group

# To be executed in total_genome dir.
# 18 groups list is currently being used.
to_parse = glob.glob("*_allOGs.txt")

original_groups = group.groups()

# Group orderings for the 15-way and 18-way splits.
correct_order_groups_15 = [
    "SAR",
    "Haptista",
    "Archaeplastida",
    "Obazoa",
    "Telonemids",
    "Discoba",
    "Ancyromonadida",
    "Cryptista",
    "Atwista",
    "Amoebozoa",
    "Collodictyonids",
    "Metamonads",
    "Apusomonada",
    "Hemimastigophora",
    "Malawimonadidae",
]
# NOTE(review): "Haptyophyta" looks like a misspelling of "Haptophyta"; it is
# left unchanged because these names presumably have to match the names used
# in the output file names -- confirm before renaming.
correct_order_groups_18 = [
    "Alveolata",
    "Stramenopiles",
    "Archaeplastida",
    "Obazoa",
    "Telonemids",
    "Discoba",
    "Centrohelids",
    "Ancyromonadida",
    "Cryptista",
    "Rhizaria",
    "Haptyophyta",
    "Atwista",
    "Amoebozoa",
    "Collodictyonids",
    "Metamonads",
    "Apusomonada",
    "Hemimastigophora",
    "Malawimonadidae",
]

# Write one "<file name prefix>\t<parse_OG result>" row per *_allOGs.txt file.
# The context manager closes the file even if parsing one input fails.
# NOTE(review): this path spells "Genome_Data" while the paths further down
# spell "Genome_data"; harmless on a case-insensitive mount, worth unifying.
with open(
    "/mnt/c/Users/scamb/Documents/uob_msc/Genome_Data/OG_arb-fal/new_outputs/summary_data/group_total.txt",
    "w",
) as output:
    for file in to_parse:
        # The row label is the part of the file name before the first "_".
        name = file.split("_")
        og = group.parse_OG(file)
        output.write(f"{name[0]}\t{og}\n")

# Writes out totals and own OGs to a file as a single vector.
# The data can then be processed in R.
own_OGs_to_parse = glob.glob(
    "/mnt/c/Users/scamb/Documents/uob_msc/Genome_data/OG_arb-fal/new_outputs/*.txt"
)
# Kept as a plain open(): the code that writes to and closes this handle is
# not visible here, so it cannot safely be wrapped in a `with` block.
totals_own_output = open(
    "/mnt/c/Users/scamb/Documents/uob_msc/Genome_data/OG_arb-fal/new_outputs/summary_data/vector_totals_own_18.txt",
    "w",
)

# correct_order_groups must be at top of loop to ensure correct order is maintained.
# list of choice can be configured here.
for eugroup in correct_order_groups_18:
    for file in own_OGs_to_parse:
        res = re.search(r"(\w+)_(\w+)_output.txt$", file)
        if res:
            # TODO(review): the body of this branch is truncated in the source
            # as received; restore the original statements here.
            ...
# Extract unique OG data from /new_outputs dir.
# The directory listing does not depend on the group being processed, so it is
# globbed once instead of once per group.
# NOTE(review): values are appended in glob order, which the glob module does
# not guarantee; the column order of each matrix row therefore depends on the
# directory listing -- confirm this matches the intended group order.
to_parse = glob.glob(
    "/mnt/c/Users/scamb/Documents/uob_msc/Genome_data/OG_arb-fal/new_outputs/*.txt"
)
for name in group_list:
    for file in to_parse:
        filename = re.search(r"([A-Z]\w+)_([A-Z]\w+)_output.txt$", file)
        if not filename:
            continue
        first_group, second_group = filename.group(1), filename.group(2)
        # Skips files in new_outputs (if certain groups are being excluded).
        # Must be done for group(1) and group(2) since name order in filename can vary.
        if first_group not in group_list or second_group not in group_list:
            continue
        # A file is counted once for `name`, whichever side of the pair it is on.
        if name in (first_group, second_group):
            shared_og = group.parse_OG(file)
            data.append(shared_og)

# Create an array from the data.
data_array = np.array(data, dtype=float)
# Adjust the array shape to match the length of group list.
# reshape() is used rather than assigning to .shape, which NumPy discourages;
# both raise ValueError if the element count is not len(group_list) ** 2.
data_array = data_array.reshape(len(group_list), len(group_list))


def convert_to_proportional(data_point, euk_group):
    """
    convert_to_proportional takes OG data individually and uses eukaryote
    genome total data from parse_total to calculate the percentage of that
    genome the OGs constitute.
    """
    # NOTE(review): the function body is cut off in the source as received;
    # only the signature and docstring are reproduced here.
import glob
import sys

# The project-local `group` module is imported from the pipeline directory, so
# that directory has to be on the import path before the import below.
sys.path.insert(
    0, "/mnt/c/Users/scamb/Documents/uob_msc/Genome_data/OG_arb-fal/OG_results_pipeline"
)
import group

# Reports, for each split system, how many orthogroups are common to every
# group and how many are common to every group but one. Run from the directory
# holding the *_<system>_output.txt files.
split_system = [15, 18]

for system in split_system:
    # Both tallies are reset per split so a value from the previous split can
    # never leak into this one.
    common = None
    near_ubiquitous = 0

    # Finds the relevant output files (15 or 18).
    to_parse = glob.glob(f"*_{system}_output.txt")
    for file in to_parse:
        if "common" in file.split("_"):
            # Finds output file containing OGs common to all groups.
            common = int(group.parse_OG(file))
        else:
            # Accrues the total for all other output files over the loop.
            near_ubiquitous += int(group.parse_OG(file))

    if common is None:
        # Without the "common" file the summary cannot be computed; say so
        # instead of failing with a NameError or reusing a stale value.
        print(
            f"No 'common' output file found for the {system}-way split; skipping.",
            file=sys.stderr,
        )
        continue

    total = str(near_ubiquitous + common)
    print(
        f"The number of orthogroups common to every eukaryote group ({system}-way split) is {common}"
        f". An additional {near_ubiquitous} are common to every group but one, for a total of {total}"
        " ubiquitous or near-ubiquitous orthogroups."
    )