def __init__(self, fasta_file, matrix_path, feature_file_path, phenotypes,
             phenotype_mapping, selected_samples, p_value_threshold=0.01,
             remove_redundants=False, num_p=4, blastn_path=''):
    """
    Load marker sequences, the sample-by-feature matrix, and phenotype
    metadata, then run marker alignment and redundancy identification.

    :param fasta_file: FASTA file of marker sequences (read via FileUtility).
    :param matrix_path: path to a saved sparse CSR matrix (samples x features).
    :param feature_file_path: path to the feature-name list (one per line).
    :param phenotypes: phenotype labels, stored on the instance.
    :param phenotype_mapping: mapping of phenotype values, stored on the instance.
    :param selected_samples: row indices of the samples to keep from the matrix.
    :param p_value_threshold: significance cutoff forwarded to
        align_markers_parallel.
    :param remove_redundants: flag stored for later redundancy removal.
    :param num_p: number of worker processes, stored on the instance.
    :param blastn_path: optional directory appended to PATH so the blastn
        binary can be found; empty string leaves PATH untouched.
    """
    if blastn_path:
        # os.pathsep instead of a hard-coded ':' keeps this correct on
        # Windows while remaining identical on Unix.
        os.environ['PATH'] += os.pathsep + blastn_path
    self.num_p = num_p
    self.seq_IDS = FileUtility.read_fasta_sequences_ids(fasta_file)
    self.remove_redundants = remove_redundants
    # EzTaxon index -> taxonomy lineage; each line is "<idx> <a;b;c;...>".
    self.ez_taxa_dict = {
        x.split()[0]: x.split()[1].split(';')
        for x in FileUtility.load_list('db/ez_idx_taxonomy.txt')
    }
    # CSR matrices support row fancy-indexing directly and return CSR, so
    # there is no need for the original toarray()/csr_matrix() round-trip,
    # which densified the whole matrix and could exhaust memory.
    self.mat = FileUtility.load_sparse_csr(matrix_path)[selected_samples, :]
    self.features = FileUtility.load_list(feature_file_path)
    # These two must run after the matrix/features are loaded; attribute
    # assignments below keep the original statement order.
    self.align_markers_parallel(p_value_threshold)
    self.redundant_columns_indentification()
    self.phenotype_mapping = phenotype_mapping
    self.phenotypes = phenotypes
import collections
import pandas as pd
import tqdm
import itertools
import numpy as np
from make_representations.cpe_efficient import train_cpe
from multiprocessing import Pool
#############################################################
# Simple script for learning segmentation steps from a fasta file
# Output: the file containing merging steps (i.e., "path_to_mergings"),
# can be used instead of Swiss-Prot merging steps
#############################################################
# Inputs
# NOTE(review): FileUtility is not imported in this chunk — presumably
# imported elsewhere in the file; verify before running standalone.
seq_dict = FileUtility.read_fasta_sequences_ids('sequences.fasta')
max_symbols = 10000
min_freq_for_merging = 10
# Output
path_to_mergings = 'ppe_mergings.txt'
path_to_merging_freqs = 'ppe_freq.txt'
#############################################################
# Sort the sequence IDs so the training order is deterministic.
SID = list(seq_dict.keys())
SID.sort()
# Each dict value is indexed at [0] for the sequence string — assumes
# read_fasta_sequences_ids maps ID -> (sequence, ...); TODO confirm.
seqs = [seq_dict[seqID][0] for seqID in SID]
# Learn the merging operations; this call's remaining arguments continue
# beyond this chunk of the file.
train_cpe(seqs, path_to_mergings, max_symbols,