def extract_para_from_trees(filename):
    # prepare list
    out_para = list()

    # load filename and saved ortho@para records
    tmp = open(path_tmp / filename, 'rb')
    content_pickle = pickle.load(tmp)

    # save para
    for st in content_pickle:
        ortho, para = st.split('@')
        l_ortho = ortho.split(' ')
        l_para = para.split(' ')
        for name1 in l_para:
            for name2 in l_ortho:
                if name1 < name2:
                    combined_name = str(len(name1)) + name1 + name2
                else:
                    combined_name = str(len(name2)) + name2 + name1
                if len(combined_name) > 2:
                    # check if present in filter and save it
                    reduced = int(combined_name[2:])
                    if reduced in set_filter:
                        out_para.append(int(combined_name))

    # dump out_para
    utils.save_pickle(path_tmp_para / filename, out_para)
    return [0, 0]
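# A minimal sketch (not part of the pipeline) of the pair encoding used above: the two
# protein names, which are strings of digits, are ordered and the smaller one is prefixed
# by its length, so the pair collapses into a single integer key; stripping the first two
# digits of that key gives the "reduced" form checked against set_filter. The names used
# below are hypothetical.
def _sketch_pair_encoding(name1='123', name2='456'):
    small, large = (name1, name2) if name1 < name2 else (name2, name1)
    pair_int = int(str(len(small)) + small + large)   # -> 3123456
    reduced = int(str(pair_int)[2:])                  # -> 23456
    return pair_int, reduced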
def save_prot_2_sp(d):
    # create new dicts with only the species index
    d_str = {k: t[0] for k, t in d.items()}
    d_int = {int(k): t[0] for k, t in d.items()}
    # save them on disk
    utils.save_pickle(out_dir / 'prot_str_2_species.pic', d_str)
    utils.save_pickle(out_dir / 'prot_int_2_species.pic', d_int)
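# A usage sketch for save_prot_2_sp, assuming the input maps protein names (strings of
# digits) to tuples whose first element is the species index; the other tuple elements are
# ignored here and the values are hypothetical:
#   save_prot_2_sp({'101': (0, 'x'), '202': (1, 'y')})
#     -> prot_str_2_species.pic  {'101': 0, '202': 1}
#     -> prot_int_2_species.pic  {101: 0, 202: 1}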
def extract_para(l_trees):
    global set_filter
    set_filter = pickle.load(open(out_dir / 's_filter.pic', 'rb'))
    global path_tmp_para
    path_tmp_para = utils.create_out_dir('./dir_step3/tmp_para')

    # extract para from tree files
    print(' extract para from trees')
    pool = Pool(nb_threads)
    tmp_res = pool.map_async(extract_para_from_trees, l_trees, chunksize=1)
    pool.close()
    pool.join()

    # combine the per-file para lists
    d_para = utils.get_pickle(out_dir / 'd_ortho.pic')
    d_para = {x: 0 for x in d_para}
    for filename in l_trees:
        # load pickle file with list of para pairs
        content_pickle = pickle.load(open(path_tmp_para / filename, 'rb'))
        for pair in content_pickle:
            try:
                d_para[pair] += 1
            except KeyError:
                pass

    # save it to file
    utils.save_pickle(out_dir / 'd_para.pic', d_para)

    # free memory
    set_filter = None
    shutil.rmtree(path_tmp_para)
    shutil.rmtree(path_tmp)
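# A minimal sketch (hypothetical values) of the combine step above: d_para starts with the
# same keys as d_ortho, all counters at 0, and every para pair reported by a worker
# increments its counter; pairs that were never seen as orthologous are silently ignored.
def _sketch_combine_para():
    d_para = {3123456: 0, 3123789: 0}
    for pair in [3123456, 3123456, 9999999]:   # 9999999 is not a known ortho pair
        try:
            d_para[pair] += 1
        except KeyError:
            pass
    return d_para                              # -> {3123456: 2, 3123789: 0}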
def extract_ortho(l_trees):
    print(' extract ortho from similarity')
    # load pickle files
    tmp_d = utils.get_multi_pickle(Path('dir_step2') / 'dict_similarity_ortho', '_similarity_ortho.pic')

    # extract ortho
    d_ortho = collections.defaultdict(int)
    for l in tmp_d.values():
        for sub in itertools.combinations(l, 2):
            pair_int = int(str(len(sub[0])) + sub[0] + sub[1])
            d_ortho[pair_int] += 1

    global path_tmp, path_tmp_ortho
    path_tmp = utils.create_out_dir('./dir_step3/tmp')
    path_tmp_ortho = utils.create_out_dir('./dir_step3/tmp_ortho')

    # extract ortho from tree files
    print(' extract ortho from trees')
    pool = Pool(nb_threads)
    tmp_res = pool.map_async(extract_ortho_from_trees, l_trees, chunksize=1)
    pool.close()
    pool.join()

    # unpack ortho and save them
    for filename in l_trees:
        content_pickle = pickle.load(open(path_tmp_ortho / filename, 'rb'))
        for pair in content_pickle:
            d_ortho[pair] += 1

    # free memory
    content_pickle = None
    shutil.rmtree(path_tmp_ortho)

    # remove ortho found only once
    print(' remove ortho found only once')
    d_ortho = {k: v for k, v in d_ortho.items() if v > 1}

    # save it to file
    utils.save_pickle(out_dir / 'd_ortho.pic', d_ortho)

    # save a simplified version (without the 2 first digits) as a set to file
    s_filter = set()
    for k in d_ortho:
        k2 = str(k)
        s_filter.add(int(k2[2:]))
    utils.save_pickle(out_dir / 's_filter.pic', s_filter)
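# A minimal sketch (hypothetical names) of the pair counting above: each similarity group
# is a sorted list of protein names, every unordered pair inside the group is encoded as a
# single integer, and the counter records in how many groups (or trees) the pair was seen.
def _sketch_count_ortho_pairs():
    d_ortho = collections.defaultdict(int)
    for group in [['12', '34', '56'], ['12', '34']]:
        for sub in itertools.combinations(group, 2):
            pair_int = int(str(len(sub[0])) + sub[0] + sub[1])
            d_ortho[pair_int] += 1
    return d_ortho                             # -> {21234: 2, 21256: 1, 23456: 1}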
def extract_ortho_from_trees(filename):
    # prepare output variables
    l_ortho = list()
    l_ortho_para = list()

    # load dict of trees
    tmp_d = utils.get_pickle(Path('dir_step2') / 'dict_trees' / filename)

    # analyse trees 1 by 1
    for ref_leaf, newick in tmp_d.items():
        # load tree and get all leaves
        tree = PhyloTree(newick)
        all_leaves = {leaf.name for leaf in tree}

        # get all leaves from the last interesting nodes
        ref_node = tree.search_nodes(name=ref_leaf)[0]
        ortho = custom_species_overlap(ref_node)

        # add ref_leaf to ortho in case no good node was selected
        if len(ortho) == 0:
            ortho.add(ref_leaf)

        # get para
        para = all_leaves - ortho

        # save ortho
        xx = list(ortho)
        xx.sort()
        for sub in itertools.combinations(xx, 2):
            pair_int = int(str(len(sub[0])) + sub[0] + sub[1])
            l_ortho.append(pair_int)

        # save ortho@para if there is a paralogous group
        if para:
            l_ortho_para.append(' '.join(ortho) + '@' + ' '.join(para))

    # save ortho@para
    utils.save_pickle(path_tmp / filename, l_ortho_para)
    # save ortho
    utils.save_pickle(path_tmp_ortho / filename, l_ortho)
    return [0, 0]
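# A minimal sketch (hypothetical leaf names) of the ortho@para records produced above:
# leaves kept by custom_species_overlap form the ortho set, every remaining leaf of the
# tree is para, and the two groups are stored as space-separated strings joined by '@'
# (parsed back in extract_para_from_trees). Sorting here is only to make the example
# deterministic; the pipeline joins the sets directly.
def _sketch_ortho_para_record():
    all_leaves = {'12', '34', '56', '78'}
    ortho = {'12', '34'}
    para = all_leaves - ortho
    return ' '.join(sorted(ortho)) + '@' + ' '.join(sorted(para))   # -> '12 34@56 78'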
def step1_kmer_clustering(dir, ext, lk, ma, nt):
    # convert the parameters to global variables (horrible hack)
    global directory, extension, length_kmer, min_aa, nb_threads
    directory, extension, length_kmer, min_aa, nb_threads = Path(dir), ext, lk, ma, nt

    print('\n --- STEP 1: kmer clustering\n')
    print(' # parameters')
    print(' input dir : ' + str(directory))
    print(' kmer size : ' + str(length_kmer))
    print(' kmer nb aa : ' + str(min_aa))

    ## create output directory (delete it first if it already exists)
    global out_dir
    out_dir = utils.create_out_dir('dir_step1')

    ## check directory and files
    print('\n # check input files')
    global dict_files, list_files, list_start
    dict_files, list_files, list_start = pre_checking(directory, extension)

    ## analyse each fasta file (multithreading)
    print('\n # kmer clustering\n ' + str(len(list_files)) + ' proteomes on ' + str(nb_threads) + ' threads')
    pool = ThreadPool(nb_threads)
    tmp_res = pool.map_async(process_file, list_files, chunksize=1)
    results_2 = tmp_res.get()
    pool.close()
    pool.join()

    ## create log file
    log_file = open(out_dir / 'log_step1.txt', 'w+')
    log_file.write('#index file_name nb_initial nb_final\n')

    ## save log file and combine other info
    combined = dict()
    names = dict()
    nb_final = 0
    for l in results_2:
        log_file.write(' '.join(l[:4]) + '\n')
        names.update(l[4])
        combined.update(l[5])
        nb_final += int(l[3])

    ## save pickle files
    utils.save_pickle(out_dir / 'combined_names.pic', combined)
    utils.save_pickle(out_dir / 'original_names.pic', names)
    utils.save_pickle(out_dir / 'species_index.pic', dict_files)

    print(' -> ' + str(nb_final) + ' proteins saved for the next step')
    print('')
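# A usage sketch for the step-1 driver above; the argument values are hypothetical and
# would normally come from the command-line interface:
#   step1_kmer_clustering('proteomes/', 'fasta', 6, 4, 8)
# clusters the kmers of every proteome with the given extension found in 'proteomes/' on
# 8 threads, and writes dir_step1/ (log_step1.txt plus the combined_names, original_names
# and species_index pickle files used by the later steps).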
def process_file(file):
    ## extract index
    index = file.split('.')[0]

    ## create output directory
    index_dir = out_dir / index
    index_dir.mkdir(parents=True, exist_ok=True)

    ## perform local search against each database
    for file_db in list_files:
        search_output = index + '_' + file_db.replace('.fas', '.gz')
        subprocess.check_output(path_diamond + ' blastp --quiet --threads 1 --db ' + str(db_dir / file_db.replace('.fas', '.db'))
                                + ' --max-target-seqs ' + str(max_per_species) + ' --query ' + str(Path('dir_step1') / file)
                                + ' --compress 1 --more-sensitive -e ' + str(evalue) + ' -o ' + str(index_dir / search_output)
                                + ' --outfmt 6 qseqid sseqid qstart qend sstart cigar 2>&1', shell=True)

    ## get all DIAMOND output files
    p = index_dir.glob('*.gz')
    tmp_l = [x for x in p if x.is_file()]

    ## get all hits in a dict of lists
    all_output = collections.defaultdict(list)
    for out_file in tmp_l:
        with gzip.open(out_file, mode="rt") as f:
            file_content = csv.reader(f, delimiter='\t')
            for line in file_content:
                # save output
                all_output[line[0]].append(line[1:])

    ## analyse DIAMOND hits
    nb_phylo = 0
    nb_NO_phylo = 0
    nb_empty_ali = 0
    all_alis = dict()
    no_phylo = dict()

    for prot in all_output:
        ## variable for reduced list of output
        reduced = list()

        ## get all species hits (initialise with the query prot)
        ref_species = dict(all_species)
        ref_species[name_2_sp_phylip_seq[prot][0]] += 1
        all_hits = {prot}
        for ll in all_output[prot]:
            target = ll[0]
            target_sp = name_2_sp_phylip_seq[target][0]
            if target not in all_hits:
                ref_species[target_sp] += 1
                all_hits.add(target)
            # reduce output for pickle (convert all elements to integers)
            reduced.append(tuple(int(x) for x in ll[:3]))

        ## analyse species content
        nb_present, nb_dupli = analyse_species(ref_species)

        ## case phylogenetic analysis
        if nb_present > 1 and nb_dupli > 0:
            nb_phylo += 1
            min_start = math.inf
            max_end = 0
            all_hits = dict()

            # get all hits for this prot
            for ll in all_output[prot]:
                target = ll[0]
                species = name_2_sp_phylip_seq[target][0]
                qu_start = int(ll[1]) - 1
                qu_end = int(ll[2]) - 1
                ta_start = int(ll[3]) - 1
                cigar = ll[4]
                if target in all_hits:
                    # extract HSP and add it to the target seq
                    HSP = extract_HSP(name_2_sp_phylip_seq[target][2], ta_start, cigar)
                    all_hits[target] = all_hits[target][:qu_start] + HSP + all_hits[target][qu_end + 1:]
                    min_start, max_end = process_location(qu_start, qu_end, min_start, max_end)
                else:
                    ref_species[species] += 1
                    # create target seq
                    all_hits[target] = '-' * len(name_2_sp_phylip_seq[prot][2])
                    # extract HSP
                    HSP = extract_HSP(name_2_sp_phylip_seq[target][2], ta_start, cigar)
                    all_hits[target] = all_hits[target][:qu_start] + HSP + all_hits[target][qu_end + 1:]
                    min_start, max_end = process_location(qu_start, qu_end, min_start, max_end)

            # add the query to the hits if not there (it happens sometimes when there are many similar sequences from the same species)
            if prot not in all_hits:
                all_hits[prot] = name_2_sp_phylip_seq[prot][2]

            # find good positions
            good_positions = get_positions(prot, all_hits, trim_thres)

            # save alignment
            if len(good_positions) == 0:
                nb_empty_ali += 1
            elif len(good_positions) < 4988:
                ## FastTree2 limitation (5000 per line)
                new_ali = [str(len(all_hits)) + ' ' + str(len(good_positions))]
                for name, seq in all_hits.items():
                    trimed_seq = [seq[n] for n in range(len(seq)) if n in good_positions]
                    new_ali.append(name_2_sp_phylip_seq[name][1] + ''.join(trimed_seq))
                all_alis[prot] = '\n'.join(new_ali)
            else:
                # take only the first 4988 positions (longer alignments are very rare anyway)
                new_ali = [str(len(all_hits)) + ' 4988']
                for name, seq in all_hits.items():
                    trimed_seq = [seq[n] for n in range(len(seq)) if n in good_positions][:4988]
                    new_ali.append(name_2_sp_phylip_seq[name][1] + ''.join(trimed_seq))
                all_alis[prot] = '\n'.join(new_ali)

        ## case NO phylogenetic analysis
        else:
            nb_NO_phylo += 1
            # sort the list of names for further processing
            xx = list(all_hits)
            xx.sort()
            no_phylo[prot] = xx

        ## save reduced output
        all_output[prot] = tuple(reduced)

    ## convert DIAMOND output (keys to integers) and save it to file
    all_output = {int(x): t for x, t in all_output.items()}
    output_file = index + '_output.pic'
    utils.save_pickle(out_dir / 'dict_output' / output_file, all_output)

    ## save similarity_ortho groups to file
    blast_ortho_file = index + '_similarity_ortho.pic'
    utils.save_pickle(out_dir / 'dict_similarity_ortho' / blast_ortho_file, no_phylo)

    ## save all alignments to file
    name_ali_file = 'alis_' + index + '.phy'
    write_ali = open(out_dir / name_ali_file, 'w+')
    all_ref_prot = list()
    for ref_prot, ali in all_alis.items():
        write_ali.write(ali + '\n')
        all_ref_prot.append(ref_prot)
    write_ali.close()

    # free memory
    nb_alis = len(all_alis)
    all_alis = None
    all_output = None

    ## deal with method
    if phylo_method == 'nj':
        insert = '-noml -nome'
    elif phylo_method == 'me':
        insert = '-noml'
    elif phylo_method == 'ml':
        insert = ''

    ## perform phylogenetic analyses and root trees
    all_trees = dict()
    nb_pbm_tree = 0
    a = subprocess.check_output(path_fasttree + ' -quiet -nosupport -fastest -bionj -pseudo ' + insert + ' -n ' + str(nb_alis)
                                + ' ' + str(Path(out_dir / name_ali_file)) + ' 2>&1', shell=True)
    a2 = a.strip().decode("utf-8")
    a3 = a2.split('\n')
    c = -1
    for line in a3:
        # case the line is of the form 'Ignored unknown character ...'
        if line.startswith('Ign'):
            pass
        else:
            c += 1
            if not line.startswith('('):
                nb_pbm_tree += 1
                # security
                if nb_pbm_tree > 100:
                    sys.exit("\n ERROR STEP 2: too many errors in phylogenetic analyses -> stopped\n\n")
            else:
                # import tree in ete3 and root it
                ete_tree = PhyloTree(line)
                mid = ete_tree.get_midpoint_outgroup()
                try:
                    ete_tree.set_outgroup(mid)
                except:
                    pass
                # get reference protein name
                prot = all_ref_prot[c]
                # save rooted tree
                all_trees[prot] = ete_tree.write()

    ## save trees to file
    tree_file = index + '_trees.pic'
    utils.save_pickle(out_dir / 'dict_trees' / tree_file, all_trees)

    ## clean directory
    # delete ali file
    Path.unlink(out_dir / name_ali_file)
    # delete DIAMOND outputs
    shutil.rmtree(index_dir)

    return [index, str(nb_phylo), str(nb_NO_phylo), str(nb_empty_ali), str(nb_pbm_tree)]
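# A minimal sketch (hypothetical newick, not part of the pipeline) of the rooting step
# above: each tree returned by FastTree is re-rooted on its midpoint with ete3 before
# being stored, so the downstream ortho/para analysis starts from a rooted topology.
def _sketch_midpoint_rooting():
    ete_tree = PhyloTree('((A:1,B:1):1,(C:1,(D:4,E:1):1):1);')
    mid = ete_tree.get_midpoint_outgroup()
    ete_tree.set_outgroup(mid)
    return ete_tree.write()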
def save_outputs(l_com, d_chimeric, d_species):
    ## load original and combined names
    original_name = utils.get_pickle(Path('dir_step1') / 'original_names.pic')
    combined_prot = utils.get_pickle(Path('dir_step1') / 'combined_names.pic')

    # create vector with nb_species as index and nb_OG as value
    vector_sp = [0] * (len(d_species) + 1)
    # create dict for species counts
    nb_per_sp = collections.defaultdict(int)
    # create dict for the table of OGs
    table_og = dict()

    ## save the lists of OGs, and get fusion and OG info
    d_OG = collections.defaultdict(list)
    c = 0
    OGs_in_network = dict()
    file_list_OGs = open(out_dir / 'orthologous_groups.txt', 'w+')
    file_list_OGs.write('#OG_name protein_names\n')
    for l in l_com:
        nb_species = count_species(l)
        # keep OG if more than 1 species
        if nb_species > 1:
            vector_sp[nb_species] += 1
            # create OG name
            c += 1
            name_OG = 'OG_' + str(c)
            # create OG vector
            table_og[name_OG] = {x: list() for x in d_species}
            # save old names
            OGs_in_network[name_OG] = l

            # prepare full OG with combined proteins and save it
            l2 = list()
            for k in l:
                if k in combined_prot:
                    for k2 in combined_prot[k]:
                        l2.append(original_name[k2])
                else:
                    l2.append(original_name[k])
            file_list_OGs.write(name_OG + ' ' + ' '.join(l2) + '\n')

            # update dict per species and OG vector
            for k in l:
                sp = prot_2_sp[k]
                if k in combined_prot:
                    nb_per_sp[sp] += len(combined_prot[k])
                    for k2 in combined_prot[k]:
                        table_og[name_OG][sp].append(original_name[k2])
                else:
                    nb_per_sp[sp] += 1
                    table_og[name_OG][sp].append(original_name[k])

            # check if gene fusion in OG -> save OG name for each gene-fusion
            for k in l:
                if k in d_chimeric:
                    d_chimeric[k].append(name_OG)

            # count the number of edges corresponding to this OG and calculate the clustering coefficient
            nb_edges = 0
            s = set(l)
            for node in l:
                for node2 in all_edges[node]:
                    if node2 in s:
                        nb_edges += 1
            clustering_coefficient = nb_edges / (len(s) * (len(s) - 1))

            # save OG info
            d_OG[name_OG] = [str(nb_species), str(len(l)), str(len(l2)), str(round(clustering_coefficient, 4))]

    ## save dict of old names
    utils.save_pickle(out_dir / 'OGs_in_network.pic', OGs_in_network)

    ## save gene-fusions
    file_fusions = open(out_dir / 'chimeric_proteins.txt', 'w+')
    file_fusions.write('#species_file protein_name nb_OG_fused list_fused_OGs\n')
    for k, l in d_chimeric.items():
        file_fusions.write(d_species[str(prot_2_sp[k])] + ' ' + original_name[k] + ' ' + str(len(l)) + ' ' + ' '.join(l) + '\n')

    ## save statistics for each OG
    file_stats_each_OG = open(out_dir / 'statistics_per_OG.txt', 'w+')
    file_stats_each_OG.write('#OG_name nb_species nb_reduced_prot nb_all_prot clustering_coefficient\n')
    for k, l in d_OG.items():
        file_stats_each_OG.write(k + ' ' + ' '.join(l) + '\n')

    ## save table of OG counts
    file_table_counts = open(out_dir / 'table_OGs_protein_counts.txt', 'w+')
    file_table_counts.write('#OG_name ' + ' '.join(x for x in d_species.values()) + '\n')
    for og_name, d in table_og.items():
        file_table_counts.write(og_name + ' ' + ' '.join(str(len(x)) for x in d.values()) + '\n')

    ## save table of OG names
    file_table_names = open(out_dir / 'table_OGs_protein_names.txt', 'w+')
    file_table_names.write('#OG_name ' + ' '.join(x for x in d_species.values()) + '\n')
    for og_name, d in table_og.items():
        file_table_names.write(og_name + ' ' + ' '.join(' '.join(x) for x in d.values()) + '\n')

    ## calculate nb total prot per species, then % assigned
    total_per_sp = collections.defaultdict(int)
    for sp in prot_2_sp.values():
        total_per_sp[sp] += 1
    perc_per_sp = {sp: (100 * nb_per_sp[sp] / total) for sp, total in total_per_sp.items()}

    ## save statistics for each species
    file_stats_each_species = open(out_dir / 'statistics_per_species.txt', 'w+')
    file_stats_each_species.write('#species perc_prot_assigned nb_prot_assigned\n')
    for sp, perc in perc_per_sp.items():
        file_stats_each_species.write(d_species[sp] + ' ' + str(round(perc, 1)) + ' ' + str(nb_per_sp[sp]) + '\n')

    ## save OG stats: nb sp VS nb OGs
    file_stats_OGs_sp = open(out_dir / 'statistics_nb_OGs_VS_nb_species.txt', 'w+')
    file_stats_OGs_sp.write('#nb_species nb_OGs\n')
    for i, v in enumerate(vector_sp):
        file_stats_OGs_sp.write(str(i) + ' ' + str(v) + '\n')
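# A minimal sketch (hypothetical edges) of the clustering-coefficient computation above:
# every ordered pair of connected members inside the OG is counted, so each undirected
# edge contributes 2, and the total is divided by len(s) * (len(s) - 1). The ratio is the
# edge density of the OG subgraph: a fully connected OG scores 1.0.
def _sketch_clustering_coefficient():
    l = ['1', '2', '3']
    all_edges = {'1': {'2', '3'}, '2': {'1'}, '3': {'1'}}   # the '2'-'3' edge is missing
    nb_edges = 0
    s = set(l)
    for node in l:
        for node2 in all_edges[node]:
            if node2 in s:
                nb_edges += 1
    return nb_edges / (len(s) * (len(s) - 1))               # -> 4 / 6 = 0.667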