# The PRE evo-events summary contains duplications/losses per node/ortholog in an OG and in total.
# We need to parse the taxa in the human lineage.
# Full-lineage analysis: consider duplications in the amniote-to-human branch. For comparison to human datasets, use duplications in this lineage trace only (e.g. require 1+).
# Human-only analysis: consider duplications in the direct human branch, i.e. after the split from the MRCA with mouse.

import gzip
import json
import sys

import numpy as np
import pandas as pd
from scipy.stats.mstats import mode, gmean, hmean

import pipeline_methods as pre

# Input files: locations are taken from the shared pipeline_methods module (imported as `pre`)
human_mapping_file = pre.human_mapping_file
pfam_results_file = pre.pfam_evo_events_file
meme_results_file = pre.meme_evo_events_file
# Result of pre.get_pfam_clans on the Pfam clans file
# (presumably a Pfam hit -> clan mapping; verify against pipeline_methods)
clans_dict = pre.get_pfam_clans(pre.pfam_clans_file)

# Output: one result location per analysis (full lineage / human only)
output_full_lineage = pre.phyrepid_results_human_full_lineage
output_human_only = pre.phyrepid_results_human_only

## Taxon id settings

## Mappings (both defined in pipeline_methods)
root_nodes = pre.root_nodes
node_projection = pre.node_projection

# Full lineage: labels that count as part of the human lineage.
# NOTE(review): entries look like the Ensembl protein prefix ('ENSP') followed by
# 't'-prefixed NCBI taxonomy IDs (t9606 would be Homo sapiens) — confirm.
taxa_full_lineage = [
    'ENSP', 't9606', 't314146', 't32525', 't40674', 't32524', 't8287'
]
# Derive the FASTA output path from the .tblout path: swap the input directory
# for the output directory, then the input extension for the output extension.
if hmmr_tbl_input_path not in hmmr_tbl:
    # sys.exit rather than quit(): quit is injected by the `site` module for
    # interactive sessions and may be missing (python -S, frozen executables).
    # Both raise SystemExit with the same message.
    sys.exit('Need valid .tblout from profile pfam scan of one hit only')
outfile_path = hmmr_tbl.replace(hmmr_tbl_input_path, fasta_output_path)
outfile_path = outfile_path.replace(input_ext, output_ext)

# Start from the stored results when file_notempty() accepts the file,
# otherwise from an empty dict (file_notempty is defined outside this view).
if file_notempty(hmm_results_file):
    with open(hmm_results_file, 'r') as log:
        hmm_results_dict = json.load(log)
else:
    hmm_results_dict = {}

# Make sure this gene tree has an entry without overwriting an existing one.
hmm_results_dict.setdefault(genetree_name, {})

# Read clans via pipeline_methods; indexed by Pfam hit in the loop below
pfam_clans = pre.get_pfam_clans(pfam_clans_file)

# Read the FASTA input; passed on unchanged to pre.write_fasta_file below
fasta = pre.read_fasta(fasta_file)

# HMM data: parse the domain table in 'iterative' mode with the configured spacing.
# Returns the repeats plus a dict of Pfam hits (iterated with .items() below).
repeats, pfam_hits = pre.parse_domtblout(hmmr_tbl, 'iterative', spacing)

# Should only be one clan and one hit

# Keep only what pre.get_best_hit selects and write it to the derived output path
best_hit_repeats = pre.get_best_hit(repeats)
pre.write_fasta_file(outfile_path, best_hit_repeats, fasta, padding)

# Logging for statistics
for hit, protein_uri in pfam_hits.items():
    clan = pfam_clans[hit]
    # NOTE(review): the loop body appears to be truncated at this point; neither
    # `clan` nor `protein_uri` is used in the visible code.
# Example #3
# 0
# Schaper-related settings read from pipeline_methods
schaper_summary_df_file = pre.schaper_comparison_file #used in phyrepid export
schaper_detailed_df_file = pre.schaper_detailed_df_file
schaper_detailed_export = pre.schaper_detailed_export

## Build Schaper data dictionary

# Only compare to species in our pipeline: collect their Ensembl stable IDs
with open(pre.species_mapping_file,'r') as sp_mapping:
	species_dict = json.load(sp_mapping)
species_selection_lst = [entry['ensembl_stable_id'] for entry in species_dict]

# Load Pfam data
clans = pre.get_pfam_clans(pre.pfam_clans_file) #pfam hit to clan mapping
# Maps pfam accession ID (column 0) to hit name (column 3) of the
# tab-separated clans file
with open(pre.pfam_clans_file, 'r') as pfam_file:
	pfam_accession = {
		fields[0]: fields[3]
		for fields in (raw.strip().split('\t') for raw in pfam_file)
	}
			
# Walk every Schaper result file and parse the parameters on its header lines.
for filename in schaper_files:
	# Open in text mode ('rt'): plain 'r' yields bytes on Python 3, which would
	# make the '>' comparison below always fail and break the str splits.
	with gzip.open(filename, 'rt') as f:
		for line in f:
			# Only lines starting with '>' are parsed; everything else is skipped
			if not line.startswith('>'): continue
			param = {}

			# Header fields are space-separated 'key:value' tokens
			for col in line.strip().split(' '):
				key,value = col.split(':')
				param[key] = value