def __init__(self,
             fasta_file,
             matrix_path,
             feature_file_path,
             phenotypes,
             phenotype_mapping,
             selected_samples,
             p_value_threshold=0.01,
             remove_redundants=False,
             num_p=4,
             blastn_path=''):
    """Load marker sequences, taxonomy, and the sample/feature matrix, then
    kick off marker alignment and redundancy detection.

    :param fasta_file: FASTA file of marker sequences; parsed into self.seq_IDS.
    :param matrix_path: path to a sparse CSR matrix saved via FileUtility.
    :param feature_file_path: path to the list of feature names (one per line).
    :param phenotypes: phenotype labels, stored as-is on the instance.
    :param phenotype_mapping: mapping of phenotype values, stored as-is.
    :param selected_samples: row indices (or boolean mask) selecting which
        samples of the loaded matrix to keep.
    :param p_value_threshold: threshold forwarded to align_markers_parallel.
    :param remove_redundants: flag stored for later use by the class.
    :param num_p: number of parallel processes, stored for later use.
    :param blastn_path: optional directory containing the blastn binary;
        appended to PATH so subprocess calls can find it.
    """
    # Make a custom blastn install discoverable by subprocesses.
    if blastn_path:
        os.environ['PATH'] += ':' + blastn_path
    self.num_p = num_p
    self.seq_IDS = FileUtility.read_fasta_sequences_ids(fasta_file)
    self.remove_redundants = remove_redundants
    # EzTaxon index -> taxonomy lineage (lineage is ';'-separated in the file).
    self.ez_taxa_dict = {
        x.split()[0]: x.split()[1].split(';')
        for x in FileUtility.load_list('db/ez_idx_taxonomy.txt')
    }
    # Subset rows directly on the sparse matrix: CSR supports fancy row
    # indexing and returns a CSR result, so the original
    # toarray() -> slice -> csr_matrix() round-trip (which densified the
    # whole matrix in memory) is unnecessary.
    self.mat = FileUtility.load_sparse_csr(matrix_path)[selected_samples, :]
    self.features = FileUtility.load_list(feature_file_path)
    self.align_markers_parallel(p_value_threshold)
    self.redundant_columns_indentification()
    self.phenotype_mapping = phenotype_mapping
    self.phenotypes = phenotypes
# Example #2
import collections
import pandas as pd
import tqdm
import itertools
import numpy as np
from make_representations.cpe_efficient import train_cpe
from multiprocessing import Pool

#############################################################
# Simple script for learning segmentation steps from a fasta file
# Output: the file containing the merging steps (i.e., "path_to_mergings"),
# which can be used instead of the Swiss-Prot merging steps
#############################################################

# Inputs
# Inputs: parse the FASTA file into {sequence_id: sequence} entries and set
# the training hyper-parameters.
seq_dict = FileUtility.read_fasta_sequences_ids('sequences.fasta')
max_symbols = 10000
min_freq_for_merging = 10

# Output files: learned merge operations and their frequencies.
path_to_mergings = 'ppe_mergings.txt'
path_to_merging_freqs = 'ppe_freq.txt'

#############################################################

# Deterministic ordering: iterate the sequences by sorted ID.
SID = sorted(seq_dict)
seqs = [seq_dict[sequence_id][0] for sequence_id in SID]
train_cpe(seqs,
          path_to_mergings,
          max_symbols,