# PRE evo events summary contains dup/loss per node/ortholog in an OG and in total.
# The taxa in the human lineage need to be parsed out of that summary.
#
# Full-lineage analysis:
#   consider duplications in the amniote-to-human branch. For comparison to
#   human datasets, use duplications in this lineage trace only
#   (like require 1+).
# Human-only analysis:
#   consider duplications in the direct human branch, i.e. after the split
#   from the MRCA with mouse.

import json
import sys

import numpy as np
import pandas as pd
from scipy.stats.mstats import mode, gmean, hmean

import pipeline_methods as pre

# --- Input files (paths are defined centrally in pipeline_methods) ---
human_mapping_file = pre.human_mapping_file
pfam_results_file = pre.pfam_evo_events_file
meme_results_file = pre.meme_evo_events_file
# Clan lookup built by pre.get_pfam_clans from the Pfam clans file.
clans_dict = pre.get_pfam_clans(pre.pfam_clans_file)

# --- Output files ---
output_full_lineage = pre.phyrepid_results_human_full_lineage
output_human_only = pre.phyrepid_results_human_only

# --- Taxon id settings ---
# Mappings
root_nodes = pre.root_nodes
node_projection = pre.node_projection

# Full lineage: node labels that count towards the full human lineage.
# NOTE(review): entries look like an 'ENSP' protein-id prefix followed by
# 't'-prefixed taxonomy ids -- confirm against pre.node_projection.
taxa_full_lineage = [
    'ENSP',
    't9606',
    't314146',
    't32525',
    't40674',
    't32524',
    't8287',
]
# Derive the FASTA output path from the .tblout path: swap the input directory
# for the output directory, then the input extension for the output extension.
if hmmr_tbl_input_path in hmmr_tbl:
    outfile_path = hmmr_tbl.replace(hmmr_tbl_input_path, fasta_output_path)
    outfile_path = outfile_path.replace(input_ext, output_ext)
else:
    # SystemExit with a string prints it to stderr and exits with status 1,
    # exactly like quit(msg), but does not depend on the `site` module having
    # injected the interactive-only quit() helper.
    raise SystemExit('Need valid .tblout from profile pfam scan of one hit only')

# Load the existing results log when it has content, otherwise start fresh.
if file_notempty(hmm_results_file):
    with open(hmm_results_file, 'r') as log:
        hmm_results_dict = json.load(log)
else:
    hmm_results_dict = {}

# Make sure this gene tree has an entry in the results log.
hmm_results_dict.setdefault(genetree_name, {})

# read clans
pfam_clans = pre.get_pfam_clans(pfam_clans_file)

# read fasta
fasta = pre.read_fasta(fasta_file)

# hmm data
repeats, pfam_hits = pre.parse_domtblout(hmmr_tbl, 'iterative', spacing)

# should only be one clan and one hit
best_hit_repeats = pre.get_best_hit(repeats)
pre.write_fasta_file(outfile_path, best_hit_repeats, fasta, padding)

# logging for statistics
for hit, protein_uri in pfam_hits.items():
    clan = pfam_clans[hit]
schaper_summary_df_file = pre.schaper_comparison_file  # used in phyrepid export
schaper_detailed_df_file = pre.schaper_detailed_df_file
schaper_detailed_export = pre.schaper_detailed_export

## Build Schaper data dictionary

# Only compare to species in our pipeline: collect the Ensembl stable id of
# every entry in the species mapping JSON.
with open(pre.species_mapping_file, 'r') as sp_mapping:
    species_dict = json.load(sp_mapping)
species_selection_lst = [item['ensembl_stable_id'] for item in species_dict]

# Load Pfam data
clans = pre.get_pfam_clans(pre.pfam_clans_file)  # pfam hit to clan mapping
pfam_accession = {}  # maps pfam accession ID to hit name
with open(pre.pfam_clans_file, 'r') as pfam_file:
    for rows in pfam_file:
        # Tab-separated: column 0 is the key, column 3 the mapped value.
        cols = rows.strip().split('\t')
        pfam_accession[cols[0]] = cols[3]

for filename in schaper_files:
    # Open in text mode ('rt'): with plain 'r' gzip yields bytes on Python 3,
    # so the '>' comparison below never matched and every line was skipped.
    with gzip.open(filename, 'rt') as f:
        for line in f:
            # NOTE(review): echoes every line to stdout; looks like leftover
            # debugging output -- confirm before removing.
            print(line)
            # Only header lines (starting with '>') carry key:value fields.
            if not line.startswith('>'):
                continue
            param = {}
            for col in line.strip().split(' '):
                # Split on the first ':' only so values may contain colons.
                key, value = col.split(':', 1)
                param[key] = value