Exemple #1
0
import glob
import group
import re

# To be executed in total_genome dir.
# 18 groups list is currently being used.
# Writes one "<prefix>\t<OG value>" line per *_allOGs.txt file found in the
# current working directory, where <prefix> is the part of the file name
# before the first underscore and <OG value> is whatever group.parse_OG returns.
to_parse = glob.glob("*_allOGs.txt")
original_groups = group.groups()
correct_order_groups_15 = ["SAR", "Haptista", "Archaeplastida", "Obazoa", "Telonemids", "Discoba", "Ancyromonadida", "Cryptista", "Atwista", "Amoebozoa", "Collodictyonids", "Metamonads", "Apusomonada", "Hemimastigophora", "Malawimonadidae"]
correct_order_groups_18 = ["Alveolata", "Stramenopiles", "Archaeplastida", "Obazoa", "Telonemids", "Discoba", "Centrohelids", "Ancyromonadida", "Cryptista", "Rhizaria", "Haptyophyta", "Atwista", "Amoebozoa", "Collodictyonids", "Metamonads", "Apusomonada", "Hemimastigophora", "Malawimonadidae"]

# NOTE(review): this path spells the directory "Genome_Data" while the other
# paths in this script use "Genome_data" - confirm which is intended on a
# case-sensitive file system.
# The context manager guarantees the summary file is flushed and closed even
# if parsing one of the input files raises.
with open("/mnt/c/Users/scamb/Documents/uob_msc/Genome_Data/OG_arb-fal/new_outputs/summary_data/group_total.txt", "w") as output:
	for file in to_parse:
		# Only the text before the first underscore identifies the genome/group.
		name = file.split("_")
		og = group.parse_OG(file)
		output.write(f"{name[0]}\t{og}\n")

# Writes out totals and own OGs to a file as a single vector.
# The data can then be processed in R.
# own_OGs_to_parse lists every .txt file directly inside new_outputs;
# totals_own_output is the destination file, opened for writing (truncating
# any previous contents).
# NOTE(review): totals_own_output is not opened with a context manager -
# confirm it is closed explicitly once the loop below has finished.
own_OGs_to_parse = glob.glob("/mnt/c/Users/scamb/Documents/uob_msc/Genome_data/OG_arb-fal/new_outputs/*.txt")
totals_own_output = open("/mnt/c/Users/scamb/Documents/uob_msc/Genome_data/OG_arb-fal/new_outputs/summary_data/vector_totals_own_18.txt", "w")

# The ordered group list must drive the OUTER loop so that results are
# produced in that order, regardless of the order glob returns the files in.
# To use a different split, swap the list iterated here (e.g.
# correct_order_groups_15 instead of correct_order_groups_18).
for eugroup in correct_order_groups_18:
	for file in own_OGs_to_parse:
		res = re.search(r"(\w+)_(\w+)_output.txt$", file)
		if res:
Exemple #2
0
# Extract unique OG data from /new_outputs dir.
# For every group in group_list, append to `data` the parsed OG value of each
# "<GroupA>_<GroupB>_output.txt" file in which that group appears as either
# GroupA or GroupB.
#
# The directory listing and the file-name pattern do not depend on the group
# currently being processed, so both are computed once rather than per group.
to_parse = glob.glob(
    "/mnt/c/Users/scamb/Documents/uob_msc/Genome_data/OG_arb-fal/new_outputs/*.txt"
)
pair_pattern = re.compile(r"([A-Z]\w+)_([A-Z]\w+)_output.txt$")

for name in group_list:
    for file in to_parse:
        filename = pair_pattern.search(file)
        if not filename:
            continue
        first_group, second_group = filename.group(1), filename.group(2)
        # Skips files in new_outputs (if certain groups are being excluded).
        # Both names are checked since name order in the filename can vary.
        if first_group not in group_list or second_group not in group_list:
            continue
        # A file contributes once to a group whether that group is named
        # first, second, or in both positions.
        if name in (first_group, second_group):
            data.append(group.parse_OG(file))

# Create a square float array from the flat list of OG values: one row and one
# column per entry of group_list.
# reshape() is used instead of assigning to .shape, which NumPy discourages;
# it raises ValueError if `data` does not hold exactly
# len(group_list) ** 2 values.
group_count = len(group_list)
data_array = np.array(data, dtype=float).reshape(group_count, group_count)


def convert_to_proportional(data_point, euk_group):
    """ convert_to_proportional takes OG data individually and uses eukaryote genome total data from parse_total to
	calculate the percentage of that genome the OGs constitute. """
import sys
sys.path.insert(
    0,
    "/mnt/c/Users/scamb/Documents/uob_msc/Genome_data/OG_arb-fal/OG_results_pipeline"
)
import group
import glob

split_system = [15, 18]

# For each split system, report how many orthogroups are common to every
# eukaryote group and how many are common to every group but one. Input files
# are the "*_<system>_output.txt" files in the current working directory.
for system in split_system:
    # The split size is embedded in the file name, so one pattern serves both
    # the 15-way and the 18-way split.
    to_parse = glob.glob(f"*_{system}_output.txt")

    # Reset per split so that a value from the previous split can never leak
    # into this one when its "common" file is missing.
    common = None
    near_ubiquitous = 0  # running total over all non-"common" output files

    for file in to_parse:
        og_count = int(group.parse_OG(file))
        if "common" in file.split("_"):
            # Output file containing OGs common to all groups.
            common = og_count
        else:
            near_ubiquitous += og_count

    if common is None:
        # Fail loudly instead of reporting a stale or undefined count.
        raise FileNotFoundError(
            f"No 'common' output file found for the {system}-way split "
            f"(looked for *_{system}_output.txt in the current directory)."
        )

    total = common + near_ubiquitous
    print(
        f"The number of orthogroups common to every eukaryote group ("
        f"{system}-way split) is {common}. An additional "
        f"{near_ubiquitous} are common to every group but one, for a total of "
        f"{total} ubiquitous or near-ubiquitous orthogroups."
    )