Esempio n. 1
0
# *** setquery.py can be used to search for the presence of OGs across 3+ groups. This can be used to link OGs to particular character traits. ***
# Now modified to enable the use of different sp. code dictionaries and group lists in the analysis, determined by sys.argv[1] and sys.argv[2].
import group
import sys

# sys.argv[1] must equal groups, alt_groups or alt_groups_18.
# sys.argv[2] must equal codes, alt_codes, or alt_codes_18.

# function_mappings and select_function() map the sys.argv strings to the data returned by the corresponding group-module functions.
# select_function() uses a comma-separated return so that both selections come back from the same call (as a tuple).
# Each selector string is mapped to the value returned by the function of the
# same name in the group module. All six functions are called once, here, in
# the order listed.
function_mappings = {
    selector: getattr(group, selector)()
    for selector in (
        'groups',
        'alt_groups',
        'alt_groups_18',
        'codes',
        'alt_codes',
        'alt_codes_18',
    )
}


def select_function(argv=None, mappings=None):
    """Look up the group list and species codes chosen on the command line.

    Args:
        argv: Argument vector to read; defaults to ``sys.argv``. Element 1
            selects the group list and element 2 selects the code set.
        mappings: Mapping from selector name to data; defaults to the
            module-level ``function_mappings``.

    Returns:
        The tuple ``(mappings[argv[1]], mappings[argv[2]])``.

    Raises:
        SystemExit: If fewer than two selectors are supplied, or if either
            selector is not a key of ``mappings``.
    """
    if argv is None:
        argv = sys.argv
    if mappings is None:
        mappings = function_mappings

    valid = ', '.join(sorted(mappings))
    if len(argv) < 3:
        raise SystemExit(f'Expected two arguments; valid names are: {valid}')
    try:
        return mappings[argv[1]], mappings[argv[2]]
    except KeyError as err:
        # The arguments cannot change between attempts, so retrying in a loop
        # would only repeat the same failure forever; stop with a message.
        raise SystemExit(
            f'Invalid function {err}; valid names are: {valid}') from None


# select_function() is called here, at module level, to resolve sys.argv[1] and sys.argv[2].
# group_list receives the first selection and sp_codes the second - these should correspond else the program may fail.
group_list, sp_codes = select_function()
Esempio n. 2
0
				# Another alternative to the original find_group.py, iterates over the divided SAR and Haptista subgroups.
				# New script for the extra groups is created as an alternative to iterating over every group again.
import re
import glob
import group

codes = group.alt_codes_18()
new_groups = ['Alveolata', 'Centrohelids', 'Haptophyta', 'Rhizaria', 'Stramenopiles']
all_groups = group.alt_groups_18()
sorted_all_groups = sorted(all_groups)
all_sets = []  # Every query already analysed: a set for a pair, a plain name for a group on its own.

for eugroup in new_groups:
    for ogroup in sorted_all_groups:
        # A group paired with itself is queried by name alone; otherwise the
        # unordered pair is used so that (A, B) and (B, A) count as the same query.
        group_set = eugroup if eugroup == ogroup else {eugroup, ogroup}
        if group_set in all_sets:
            continue  # Already analysed - skip the duplicate.
        all_sets.append(group_set)
        # Pairs are handed over as an alphabetically sorted list of names.
        target = sorted(group_set) if isinstance(group_set, set) else group_set
        # 2nd argument of find_group() selects which code set is used.
        group.find_group(target, codes)



Esempio n. 3
0
]
# Ordered list of 15 group names.
correct_order_15 = [
    "Telonemids",
    "Haptista",
    "SAR",
    "Atwista",
    "Archaeplastida",
    "Ancyromonadida",
    "Obazoa",
    "Discoba",
    "Collodictyonids",
    "Cryptista",
    "Amoebozoa",
    "Metamonads",
    "Hemimastigophora",
    "Apusomonada",
    "Malawimonadidae",
]
# Ordered list of 18 group names.
correct_order_includingown_18 = [
    "Alveolata",
    "Stramenopiles",
    "Archaeplastida",
    "Obazoa",
    "Telonemids",
    "Centrohelids",
    "Ancyromonadida",
    "Discoba",
    "Rhizaria",
    "Cryptista",
    "Atwista",
    "Haptophyta",
    "Collodictyonids",
    "Amoebozoa",
    "Metamonads",
    "Hemimastigophora",
    "Apusomonada",
    "Malawimonadidae",
]

# Choose the necessary group list from the group module.
# sorted() builds a new, sorted list from whatever alt_groups_18() returns.
alt_groups = sorted(group.alt_groups_18())
# group_names refers to the same list object as alt_groups (it is not a copy).
group_names = alt_groups
# Starts empty; presumably filled by the parsing loop that follows - confirm.
data = []

for name in group_names:
    to_parse = glob.glob(
        "/mnt/c/Users/scamb/Documents/uob_msc/Genome_data/OG_arb-fal/new_outputs/*.txt"
    )
    for file in to_parse:
        filename = re.search(r"([A-Z]\w*)_([A-Z]\w*)_output.txt$", file)
        if filename:
            if filename.group(
                    1
            ) not in group_names:  # Skips 'non-target' files in new_outputs.
                pass
            elif filename.group(
Esempio n. 4
0
				# This script finds all the OGs of each group, regardless of exclusivity.
				# Outputs are stored in total_genome directory.
import glob
import group
import re

def _groups_in_file(path, codes):
    """Return the set of groups that have at least one sequence in *path*.

    Only lines starting with ">" are inspected. The species code is the text
    between the ">" and the first "_" of such a line, and it is looked up in
    *codes* to get its group.

    Args:
        path: File to scan.
        codes: Lookup from species code to group (assumed to be a dict of
            species code -> group name - confirm against the group module).

    Returns:
        A set of the groups found in the file.

    Raises:
        KeyError: If a header carries a species code that is not in *codes*.
    """
    present = set()
    with open(path) as handle:
        for line in handle:
            if line.startswith(">"):
                species_code = line[1:].split("_", 1)[0]
                present.add(codes[species_code])
    return present


codes = group.alt_codes_18()
group_names = group.alt_groups_18() 		# alt_groups_18() has eukaryote groups SAR and Haptista, as well as 'Other', split into their respective subgroups.

# Each .fal file is parsed once up front, rather than once per group name;
# the per-file result does not depend on which group is being written.
to_parse = glob.glob("*.fal")
groups_by_file = [(file, _groups_in_file(file, codes)) for file in to_parse]

for name in group_names:
    filename = "%s_allOGs.txt" % name
    i = 0
    # The with-block closes the output file even if a write fails.
    with open(filename, "w") as genome:
        for file, groups_present in groups_by_file:
            if name in groups_present:		# Functions in the same way as findgroup, only without the exclusive set matching.
                genome.write(f'{file}\n')
                i += 1
        genome.write(f'Total gene families in group: {i}')