def _load_encode_rna(self, class_files):
    """ Load RNA fasta files and one-hot encode sequence + structure (or PWM) data.

    Each fasta entry block (joined by io.parse_fasta with "_") holds the
    sequence on its first line. If self.is_rna_pwm is set, the following
    lines hold one whitespace-separated position weight matrix column per
    structure-alphabet character; otherwise the second line holds a
    structure string. Characters outside the respective alphabets are
    replaced with random in-alphabet characters. Encoded matrices are
    appended to self.data; labels (class index, or comma-separated ints
    from the header when self.multilabel is set) to self.labels.

    Parameters
    ----------
    class_files: [str]
        Fasta file paths, one per class (a single file for multi-label data).
    """
    self.data, self.labels = [], []

    # re.sub replacement callbacks: swap any out-of-alphabet character
    # for a random character of the corresponding alphabet.
    def _random_seq_char(_match):
        return choice(self.alpha_coder.alph0)

    def _random_struct_char(_match):
        return choice(self.alpha_coder.alph1)

    pattern_seq = r"[^{}]".format(re.escape(self.alpha_coder.alph0))
    pattern_struct = r"[^{}]".format(re.escape(self.alpha_coder.alph1))
    for class_id, file_name in enumerate(class_files):
        handle = io.get_handle(file_name, "rt")
        try:  # ensure the handle is closed even if parsing raises
            for header, block in io.parse_fasta(handle, "_"):
                lines = block.split("_")
                sequence = re.sub(pattern_seq, _random_seq_char,
                                  lines[0].upper())
                if self.is_rna_pwm:
                    # lines[1:] hold one PWM column per structure character
                    pwm = np.zeros(
                        (len(sequence), len(self.alpha_coder.alph1)),
                        dtype=np.float32)
                    for col in range(pwm.shape[1]):
                        pwm[:, col] = list(map(float, lines[col + 1].split()))
                    self.data.append(self._join_seq_pwm(sequence, pwm))
                else:
                    structure = re.sub(pattern_struct, _random_struct_char,
                                       lines[1].split(" ")[0].upper())
                    joined = self.alpha_coder.encode((sequence, structure))
                    self.data.append(self.one_hot_encoder.encode(joined))
                if self.multilabel:
                    self.labels.append(list(map(int, header.split(','))))
                else:
                    self.labels.append([class_id])
        finally:
            handle.close()
def _load_encode_dna(self, class_files):
    """ Load DNA fasta files and one-hot encode every sequence.

    Ambiguous IUPAC characters (N, Y, M, R, W, K) are replaced by a random
    character from the one-hot encoder alphabet before encoding. Encoded
    matrices are appended to self.data; labels (class index, or
    comma-separated ints parsed from the header when self.multilabel is
    set) to self.labels.

    Parameters
    ----------
    class_files: [str]
        Fasta file paths, one per class (a single file for multi-label data).
    """
    self.data, self.labels = [], []

    # Swap an ambiguous base for a random character of the alphabet.
    def _randomize(_match):
        return choice(self.one_hot_encoder.alphabet)

    for class_id, file_name in enumerate(class_files):
        handle = io.get_handle(file_name, "rt")
        for header, sequence in io.parse_fasta(handle):
            cleaned = re.sub(r"[NYMRWK]", _randomize, sequence.upper())
            self.data.append(self.one_hot_encoder.encode(cleaned))
            if self.multilabel:
                entry_labels = [int(value) for value in header.split(',')]
            else:
                entry_labels = [class_id]
            self.labels.append(entry_labels)
        handle.close()
def _load_encode_rna(self, class_files):
    """ Load RNA fasta files and one-hot encode sequence + structure strings.

    Each fasta entry block (joined by io.parse_fasta with "_") holds the
    sequence on its first line and a structure string on the second line —
    or the third line when the structure alphabet is "HIMS". Ambiguous
    sequence characters (N, Y, M, R, W, K) and the structure characters
    F/T are replaced by random characters of the respective alphabets.
    Encoded matrices are appended to self.data; labels (class index, or
    comma-separated ints from the header when self.multilabel is set) to
    self.labels.

    Parameters
    ----------
    class_files: [str]
        Fasta file paths, one per class (a single file for multi-label data).
    """
    self.data, self.labels = [], []

    def _random_seq_char(_match):
        return choice(self.alpha_coder.alph0)

    def _random_struct_char(_match):
        return choice(self.alpha_coder.alph1)

    # "HIMS" annotation files carry an extra line before the structure.
    struct_idx = 2 if self.alpha_coder.alph1 == "HIMS" else 1
    for class_id, file_name in enumerate(class_files):
        handle = io.get_handle(file_name, "rt")
        for header, block in io.parse_fasta(handle, "_"):
            lines = block.split("_")
            sequence = re.sub(r"[NYMRWK]", _random_seq_char, lines[0])
            structure = re.sub(r"[FT]", _random_struct_char,
                               lines[struct_idx].split(" ")[0].upper())
            joined = self.alpha_coder.encode((sequence, structure))
            self.data.append(self.one_hot_encoder.encode(joined))
            if self.multilabel:
                self.labels.append([int(value) for value in header.split(',')])
            else:
                self.labels.append([class_id])
        handle.close()
def load_additional_positionwise_data(self, class_files, identifier, standardize=False):
    """ Add additional numerical features to the network (for each nucleotide in a sequence).

    For every position in an input sequence additional numerical data can be
    added to the network (e.g. ChIP-seq signal, conservation for every
    nucleotide). The data will be added to the input matrix. E.g.: Using
    sequences of length 200 over the alphabet "ACGT" results in input
    matrices of size 4x200. Additional position-wise data will be added to
    these matrices as a new row resulting in matrices of size 5x200.

    Input files are text files and must contain as many whitespace-separated
    values in each line as the sequences are long, e.g.:

    '0.679 1.223 -0.296 ...
    '0.961 0.532 0.112 ...
    '0.065 -0.333 -0.256 ...
    '...

    The number of provided files must match the fasta files provided to the
    __init__ function (e.g. if you provided a list of 3 files to __init__
    you must provide a list of 3 files here as well) and the number of lines
    in each file must match the number of entries in the corresponding fasta
    file. If you want to add multiple features simply call this function
    multiple times.

    Input features should be standardized in some way prior to adding them
    to the network, as this tends to improve the predictive performance.

    In the same way network kernels are visualized as sequence motifs after
    the network training (based on the first 4 rows of the input matrices
    and using the visualize_kernel() Model function), the rows corresponding
    to additional features are summarized as line plots as well.

    Parameters
    ----------
    class_files: str or [str]
        A text file (multi-label) or a list of text files (single-label).

    identifier: str
        A short feature name (will be shown in kernel output plots).

    standardize: bool
        Scale each column according to the interquartile range.
    """
    if not hasattr(self, "positionwise"):
        self.positionwise = OrderedDict()
    if identifier in self.positionwise:
        raise RuntimeError(
            "Identifier '{}' already exists.".format(identifier))
    if not isinstance(class_files, list):
        class_files = [class_files]
    # One row per sequence, one column per sequence position.
    len_sequence = self.data[0].shape[0]
    new_data = np.empty((len(self.labels), len_sequence), dtype=np.float32)
    row = 0
    for file_name in class_files:
        handle = io.get_handle(file_name, 'rt')
        try:  # close the handle even when a malformed line raises below
            for i, line in enumerate(handle):
                try:
                    new_data[row, :] = [float(x) for x in line.split()]
                except ValueError as err:
                    raise RuntimeError(
                        "ValueError: {} (in line {} in {}).".format(
                            err, i + 1, file_name)) from err
                row += 1
        finally:
            handle.close()
    if row != len(self.labels):
        raise RuntimeError(
            "Amount of additional data ({}) doesn't match number of sequences ({})."
            .format(row, len(self.labels)))
    if standardize:
        # Scale per position (column) by the interquartile range; keep the
        # raw values around for later visualization.
        from sklearn.preprocessing import robust_scale
        self.positionwise[identifier] = robust_scale(new_data, axis=0)
        if not hasattr(self, "positionwise_unscaled"):
            self.positionwise_unscaled = OrderedDict()
        self.positionwise_unscaled[identifier] = new_data
    else:
        self.positionwise[identifier] = new_data
def load_additional_data(self, class_files, is_categorical=False, categories=None, standardize=False):
    """ Add additional numerical or categorical features to the network (for each sequence as a whole).

    For every input sequence additional data can be added to the network
    (e.g. location, average sequence conservation, etc.). The data will be
    concatenated to the input of the first dense layer (i.e. additional
    neurons in the first dense layer will be created).

    Input files are text files and must contain one value per line, e.g.:

    '0.679
    '0.961
    '0.065
    '0.871
    '...

    The number of provided files must match the fasta files provided to the
    __init__ function (e.g. if you provided a list of 3 files to __init__
    you must provide a list of 3 files here as well) and the number of lines
    in each file must match the number of entries in the corresponding fasta
    file. If you want to add multiple features simply call this function
    multiple times.

    Interpreting the influence of arbitrary additional data for a neural
    network is hard and at the moment we don't provide any means to do so.
    You should run your model with and without the additional data and check
    if the predictive performance improves. In general, if you have many
    handcrafted features you might want to consider using a different
    machine learning technique.

    Parameters
    ----------
    class_files: str or [str]
        A text file (multi-label) or a list of text files (single-label).

    is_categorical: bool
        Is the provided data categorical or numerical?

    categories: [str]
        A list containing all possible categories (only needed if
        is_categorial == True).

    standardize: bool
        Should the z-score be computed for numerical data?
    """
    if not isinstance(class_files, list):
        class_files = [class_files]
    # load raw data; each call creates a new meta entry
    idx = len(self.meta)
    self.meta[idx] = {"data": [], "is_categorical": is_categorical}
    values = self.meta[idx]['data']
    for file_name in class_files:
        handle = io.get_handle(file_name, "rt")
        try:  # close the handle even when float() raises on a bad line
            if is_categorical:
                for line in handle:
                    values.append(line.strip())
            else:
                for line in handle:
                    values.append(float(line))
        finally:
            handle.close()
    if len(self.labels) != len(values):
        raise RuntimeError(
            "Number of additional data ({}) doesn't match number of main data ({})."
            .format(len(values), len(self.labels)))
    # one hot encode categorical data
    if is_categorical:
        if not isinstance(categories, list):
            raise RuntimeError(
                "is_categorical set to True, but no categories list provided."
            )
        categories = sorted(categories)
        mapping = {val: i for i, val in enumerate(categories)}
        for i, val in enumerate(values):
            one_hot = np.zeros(len(categories), dtype=np.uint8)
            one_hot[mapping[val]] = 1
            values[i] = one_hot
    # standardize numerical data if desired
    elif standardize:
        from scipy import stats
        self.meta[idx]['data'] = stats.zscore(values)