Code Example #1
File: Data.py Project: vivekmathema/pysster
 # Assumed module-level imports for these Data.py snippets (not shown on this page):
 #   import re; import numpy as np; from random import choice
 #   from collections import OrderedDict; from pysster import utils as io
 def _load_encode_rna(self, class_files):
     self.data, self.labels = [], []
     # characters outside the sequence/structure alphabets are replaced
     # with a random character from the respective alphabet
     replacer_seq = lambda x: choice(self.alpha_coder.alph0)
     replacer_struct = lambda x: choice(self.alpha_coder.alph1)
     pattern_seq = r"[^{}]".format(re.escape(self.alpha_coder.alph0))
     pattern_struct = r"[^{}]".format(re.escape(self.alpha_coder.alph1))
     for class_id, file_name in enumerate(class_files):
         handle = io.get_handle(file_name, "rt")
         for header, block in io.parse_fasta(handle, "_"):
             lines = block.split("_")
             sequence = re.sub(pattern_seq, replacer_seq, lines[0].upper())
             if self.is_rna_pwm:
                 # each of the following len(alph1) lines holds one PWM column:
                 # the values of one structure character at every sequence position
                 pwm = np.zeros(
                     (len(sequence), len(self.alpha_coder.alph1)),
                     dtype=np.float32)
                 for x in range(1, pwm.shape[1] + 1):
                     pwm[:, x - 1] = list(map(float, lines[x].split()))
                 self.data.append(self._join_seq_pwm(sequence, pwm))
             else:
                 structure = re.sub(pattern_struct, replacer_struct,
                                    lines[1].split(" ")[0].upper())
                 joined = self.alpha_coder.encode((sequence, structure))
                 self.data.append(self.one_hot_encoder.encode(joined))
             if self.multilabel:
                 self.labels.append(list(map(int, header.split(','))))
             else:
                 self.labels.append([class_id])
         handle.close()
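
To illustrate the cleanup step above: every character outside the configured sequence alphabet is replaced by a random character drawn from that alphabet. A minimal standalone sketch, assuming the alphabet "ACGU" (the real alphabets come from alpha_coder):

 import re
 from random import choice

 alph0 = "ACGU"  # assumed sequence alphabet (pysster takes it from alpha_coder.alph0)
 replacer = lambda x: choice(alph0)
 pattern = r"[^{}]".format(re.escape(alph0))

 print(re.sub(pattern, replacer, "ACGNNUX"))  # e.g. "ACGAGUC": N, N and X randomized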
Code Example #2
 def _load_encode_dna(self, class_files):
     self.data, self.labels = [], []
     replacer = lambda x: choice(self.one_hot_encoder.alphabet)
     for class_id, file_name in enumerate(class_files):
         handle = io.get_handle(file_name, "rt")
         for header, sequence in io.parse_fasta(handle):
             # replace common IUPAC ambiguity codes with a random base from the alphabet
             sequence = re.sub(r"[NYMRWK]", replacer, sequence.upper())
             self.data.append(self.one_hot_encoder.encode(sequence))
             if self.multilabel:
                 self.labels.append(list(map(int, header.split(','))))
             else:
                 self.labels.append([class_id])
         handle.close()
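
The matrices stored in self.data are position-by-alphabet one-hot encodings (example #4 below relies on this layout when it reads the sequence length from self.data[0].shape[0]). pysster's One_Hot_Encoder is not shown on this page; a minimal sketch of the idea:

 import numpy as np

 def one_hot_encode(sequence, alphabet="ACGT"):
     # one row per sequence position, one column per alphabet character
     index = {char: i for i, char in enumerate(alphabet)}
     matrix = np.zeros((len(sequence), len(alphabet)), dtype=np.uint8)
     for pos, char in enumerate(sequence):
         matrix[pos, index[char]] = 1
     return matrix

 print(one_hot_encode("GATC"))
 # [[0 0 1 0]
 #  [1 0 0 0]
 #  [0 0 0 1]
 #  [0 1 0 0]]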
Code Example #3
 def _load_encode_rna(self, class_files):
     self.data, self.labels = [], []
     replacer_seq = lambda x: choice(self.alpha_coder.alph0)
     replacer_struct = lambda x: choice(self.alpha_coder.alph1)
     # structure strings over the "HIMS" annotation alphabet are expected on the
     # third line of each record, otherwise on the second
     if self.alpha_coder.alph1 == "HIMS":
         idx = 2
     else:
         idx = 1
     for class_id, file_name in enumerate(class_files):
         handle = io.get_handle(file_name, "rt")
         for header, block in io.parse_fasta(handle, "_"):
             lines = block.split("_")
             sequence = re.sub(r"[NYMRWK]", replacer_seq, lines[0].upper())
             structure = re.sub(r"[FT]", replacer_struct,
                                lines[idx].split(" ")[0].upper())
             joined = self.alpha_coder.encode((sequence, structure))
             self.data.append(self.one_hot_encoder.encode(joined))
             if self.multilabel:
                 self.labels.append(list(map(int, header.split(','))))
             else:
                 self.labels.append([class_id])
         handle.close()
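
In multi-label mode all three loaders take the class ids from the fasta header itself, which is expected to hold comma-separated integers:

 header = "0,2"  # hypothetical fasta header ">0,2": the sequence belongs to classes 0 and 2
 labels = list(map(int, header.split(',')))
 print(labels)  # [0, 2]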
Code Example #4
File: Data.py Project: vivekmathema/pysster
    def load_additional_positionwise_data(self,
                                          class_files,
                                          identifier,
                                          standardize=False):
        """ Add additional numerical features to the network (for each nucleotide in a sequence).

        For every position in an input sequence additional numerical data can be added to
        the network (e.g. ChIP-seq signal, conservation for every nucleotide).
        The data will be added to the input matrix. E.g.: Using sequences of length 200
        over the alphabet "ACGT" results in input matrices of size 4x200. Additional position-wise
        data will be added to these matrices as a new row resulting in matrices of size 5x200.
        
        Input files are text files; each line must contain as many whitespace-separated
        values as the corresponding sequence is long, e.g.:
        
        '0.679 1.223 -0.296  ...
        '0.961 0.532 0.112   ...
        '0.065 -0.333 -0.256 ...
        '...
        
        The number of provided files must match the number of fasta files provided to the
        __init__ function (e.g. if you provided a list of 3 files to __init__ you must
        provide a list of 3 files here as well) and the number of lines in each file must
        match the number of entries in the corresponding fasta file. If you want to add
        multiple features, simply call this function multiple times.

        Input features should be standardized in some way prior to adding them to the
        network, as this tends to improve the predictive performance.

        In the same way network kernels are visualized as sequence motifs after the network
        training (based on the first 4 rows of the input matrices and using the visualize_kernel()
        Model function), the rows corresponding to additional features are summarized
        as line plots as well.

        Parameters
        ----------
        class_files: str or [str]
            A text file (multi-label) or a list of text files (single-label).
        
        identifier: str
            A short feature name (will be shown in kernel output plots).

        standardize: bool
            Scale each column according to the interquartile range.
        """
        if not "positionwise" in dir(self):
            self.positionwise = OrderedDict()
        if identifier in self.positionwise:
            raise RuntimeError(
                "Identifier '{}' already exists.".format(identifier))
        if not isinstance(class_files, list):
            class_files = [class_files]
        len_sequence = self.data[0].shape[0]

        new_data = np.empty((len(self.labels), len_sequence), dtype=np.float32)
        row = 0
        for file_name in class_files:
            handle = io.get_handle(file_name, 'rt')
            for i, line in enumerate(handle):
                try:
                    new_data[row, :] = [float(x) for x in line.split()]
                except ValueError as err:
                    raise RuntimeError(
                        "ValueError: {} (in line {} in {}).".format(
                            err, i + 1, file_name))
                row += 1
            handle.close()
        if row != len(self.labels):
            raise RuntimeError(
                "Amount of additional data ({}) doesn't match number of sequences ({})."
                .format(row, len(self.labels)))
        if standardize:
            from sklearn.preprocessing import robust_scale
            self.positionwise[identifier] = robust_scale(new_data, axis=0)
            if not hasattr(self, "positionwise_unscaled"):
                self.positionwise_unscaled = OrderedDict()
            self.positionwise_unscaled[identifier] = new_data
        else:
            self.positionwise[identifier] = new_data
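
A hypothetical usage sketch (file names are placeholders; the Data constructor call follows pysster's documented two-argument form):

 from pysster.Data import Data

 # two single-label classes, DNA sequences over "ACGT"
 data = Data(["positives.fasta", "negatives.fasta"], "ACGT")
 # one score file per fasta file; each line holds one value per nucleotide
 data.load_additional_positionwise_data(
     ["positives_conservation.txt", "negatives_conservation.txt"],
     identifier="conservation",
     standardize=True)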
Code Example #5
File: Data.py Project: vivekmathema/pysster
    def load_additional_data(self,
                             class_files,
                             is_categorical=False,
                             categories=None,
                             standardize=False):
        """ Add additional numerical or categorical features to the network (for each sequence as a whole).

        For every input sequence additional data can be added to the network (e.g. location,
        average sequence conservation, etc.). The data will be concatenated to the input of the
        first dense layer (i.e. additional neurons in the first dense layer will be created). Input
        files are text files and must contain one value per line, e.g.:
        
        '0.679
        '0.961
        '0.065
        '0.871
        '...
        
        The number of provided files must match the number of fasta files provided to the
        __init__ function (e.g. if you provided a list of 3 files to __init__ you must
        provide a list of 3 files here as well) and the number of lines in each file must
        match the number of entries in the corresponding fasta file. If you want to add
        multiple features, simply call this function multiple times.

        Interpreting the influence of arbitrary additional data for a neural network is hard and at
        the moment we don't provide any means to do so. You should run your model with and without the
        additional data and check if the predictive performance improves. In general, if you have
        many handcrafted features you might want to consider using a different machine learning
        technique.

        Parameters
        ----------
        class_files: str or [str]
            A text file (multi-label) or a list of text files (single-label).
        
        is_categorical: bool
            Is the provided data categorical or numerical?

        categories: [str]
            A list containing all possible categories (only needed if is_categorical == True).

        standardize: bool
            Should the z-score be computed for numerical data?
        """
        if not isinstance(class_files, list):
            class_files = [class_files]
        # load raw data
        idx = len(self.meta)
        self.meta[idx] = {"data": [], "is_categorical": is_categorical}
        for file_name in class_files:
            handle = io.get_handle(file_name, "rt")
            if is_categorical:
                for line in handle:
                    self.meta[idx]['data'].append(line.strip())
            else:
                for line in handle:
                    self.meta[idx]['data'].append(float(line))
            handle.close()
        if len(self.labels) != len(self.meta[idx]['data']):
            raise RuntimeError(
                "Number of additional data ({}) doesn't match number of main data ({})."
                .format(len(self.meta[idx]['data']), len(self.labels)))
        # one hot encode categorical data
        if is_categorical:
            if not isinstance(categories, list):
                raise RuntimeError(
                    "is_categorical set to True, but no categories list provided."
                )
            categories = sorted(categories)
            mapping = {val: i for i, val in enumerate(categories)}
            for i, val in enumerate(self.meta[idx]['data']):
                one_hot = np.zeros(len(categories), dtype=np.uint8)
                one_hot[mapping[val]] = 1
                self.meta[idx]['data'][i] = one_hot
        # standardize numerical data if desired
        else:
            if standardize:
                from scipy import stats
                self.meta[idx]['data'] = stats.zscore(self.meta[idx]['data'])
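
A hypothetical usage sketch, continuing the example above (category names and file names are placeholders):

 # a categorical feature, one-hot encoded internally
 data.load_additional_data(["positives_region.txt", "negatives_region.txt"],
                           is_categorical=True,
                           categories=["exon", "intron", "utr"])
 # a numerical feature, z-score standardized
 data.load_additional_data(["positives_gc.txt", "negatives_gc.txt"],
                           standardize=True)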