Example #1
    def __init__(self, hamming_distance_probabilities: dict = None, min_gap: int = 0, max_gap: int = 0,
                 alphabet_weights: dict = None, position_weights: dict = None):
        if hamming_distance_probabilities is not None:
            hamming_distance_probabilities = {key: float(value) for key, value in hamming_distance_probabilities.items()}
            # the 0.99 lower bound leaves room for floating-point error in the sum
            assert all(isinstance(key, int) for key in hamming_distance_probabilities.keys()) \
                   and all(isinstance(val, float) for val in hamming_distance_probabilities.values()) \
                   and 0.99 <= sum(hamming_distance_probabilities.values()) <= 1, \
                "GappedKmerInstantiation: a probability between 0 and 1 must be assigned to each possible Hamming distance, " \
                "and the probabilities across all distances must sum to 1."

        self._hamming_distance_probabilities = hamming_distance_probabilities
        self.position_weights = position_weights
        # if weights are not given for each letter of the alphabet, distribute the remaining probability
        # equally among letters
        self.alphabet_weights = self.set_default_weights(alphabet_weights, EnvironmentSettings.get_sequence_alphabet())
        self._min_gap = min_gap
        self._max_gap = max_gap
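
A minimal usage sketch for the constructor above (the class name comes from the assertion message; the argument values are purely illustrative):

    # hypothetical values: 80% chance of an exact motif instance, 20% chance of one substitution
    instantiation = GappedKmerInstantiation(hamming_distance_probabilities={0: 0.8, 1: 0.2},
                                            min_gap=0, max_gap=1)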
Example #2
    @staticmethod
    def drop_illegal_character_sequences(dataframe: pd.DataFrame, import_illegal_characters: bool) -> pd.DataFrame:
        if not import_illegal_characters:
            sequence_type = EnvironmentSettings.get_sequence_type()
            sequence_name = sequence_type.name.lower().replace("_", " ")

            legal_alphabet = EnvironmentSettings.get_sequence_alphabet(sequence_type)
            if sequence_type == SequenceType.AMINO_ACID:
                legal_alphabet.append(Constants.STOP_CODON)

            is_illegal_seq = [ImportHelper.is_illegal_sequence(sequence, legal_alphabet) for
                              sequence in dataframe[sequence_type.value]]
            n_illegal = sum(is_illegal_seq)

            if n_illegal > 0:
                dataframe.drop(dataframe.loc[is_illegal_seq].index, inplace=True)
                warnings.warn(
                    f"{ImportHelper.__name__}: {n_illegal} sequences were removed from the dataset because their "
                    f"{sequence_name} sequence contained illegal characters.")
        return dataframe
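
The core pattern above (build a boolean mask, then drop the matching rows in place) can be reproduced standalone; the column name and alphabet below are illustrative, not taken from the library:

    import pandas as pd

    df = pd.DataFrame({"sequence_aa": ["ACDE", "AC?E", "GHIK"]})
    legal_alphabet = set("ACDEFGHIKLMNPQRSTVWY")
    is_illegal_seq = [any(char not in legal_alphabet for char in seq) for seq in df["sequence_aa"]]
    df.drop(df.loc[is_illegal_seq].index, inplace=True)  # removes the row containing '?'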
Example #3
    def _substitute_letters(self, position_weights, alphabet_weights, allowed_positions: list, instance: list):
        if self._hamming_distance_probabilities:
            # sample how many positions to mutate, weighted by the Hamming distance probabilities
            substitution_count = random.choices(list(self._hamming_distance_probabilities.keys()),
                                                list(self._hamming_distance_probabilities.values()), k=1)[0]
            allowed_position_weights = {key: value for key, value in position_weights.items() if key in allowed_positions}
            position_probabilities = self._prepare_probabilities(allowed_position_weights)
            # sample which positions to mutate, weighted by the normalized position weights
            positions = list(np.random.choice(allowed_positions, size=substitution_count, p=position_probabilities))

            while substitution_count > 0:
                if position_weights[positions[substitution_count - 1]] > 0:  # if the position is allowed to be changed
                    position = positions[substitution_count - 1]
                    alphabet_probabilities = self._prepare_probabilities(alphabet_weights)
                    instance[position] = np.random.choice(EnvironmentSettings.get_sequence_alphabet(), size=1,
                                                          p=alphabet_probabilities)[0]
                substitution_count -= 1

        return instance
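
The two sampling calls above can be illustrated in isolation; the distributions below are made up for the sketch:

    import random
    import numpy as np

    hamming_probs = {0: 0.7, 1: 0.25, 2: 0.05}          # hypothetical distance distribution
    count = random.choices(list(hamming_probs.keys()),
                           list(hamming_probs.values()), k=1)[0]  # how many positions to mutate

    weights = np.array([0.0, 1.0, 1.0, 1.0])            # position 0 may never be changed
    probabilities = weights / weights.sum()             # normalize into a probability vector
    positions = np.random.choice(len(weights), size=count, p=probabilities)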
Example #4
    def __init__(self, kernel_count: int, kernel_size,
                 positional_channels: int, sequence_type: SequenceType,
                 background_probabilities, chain_names):
        super(PyTorchReceptorCNN, self).__init__()
        self.background_probabilities = background_probabilities
        self.threshold = 0.1
        self.pseudocount = 0.05
        self.in_channels = len(EnvironmentSettings.get_sequence_alphabet(sequence_type)) + positional_channels
        self.positional_channels = positional_channels
        self.max_information_gain = self.get_max_information_gain()
        self.chain_names = chain_names

        self.conv_chain_1 = [f"chain_1_kernel_{size}" for size in kernel_size]
        self.conv_chain_2 = [f"chain_2_kernel_{size}" for size in kernel_size]

        for size in kernel_size:
            # one convolutional layer per kernel size and per receptor chain
            for chain in ("chain_1", "chain_2"):
                kernel = nn.Conv1d(in_channels=self.in_channels, out_channels=kernel_count,
                                   kernel_size=size, bias=True)
                # zero-mean normal initialization, scaled by the total number of weights in the kernel
                kernel.weight.data.normal_(0.0, np.sqrt(1 / np.prod(kernel.weight.shape)))
                setattr(self, f"{chain}_kernel_{size}", kernel)

        self.fully_connected = nn.Linear(in_features=kernel_count * len(kernel_size) * 2, out_features=1, bias=True)
        self.fully_connected.weight.data.normal_(0.0, np.sqrt(1 / np.prod(self.fully_connected.weight.shape)))
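
A stripped-down sketch of the dynamic-kernel pattern above; it relies on the fact that assigning an nn.Module via setattr registers it as a submodule. All names and sizes here are illustrative:

    import numpy as np
    import torch.nn as nn

    class TwoChainSketch(nn.Module):
        def __init__(self, in_channels=24, kernel_count=8, kernel_sizes=(3, 5)):
            super().__init__()
            for size in kernel_sizes:
                conv = nn.Conv1d(in_channels, kernel_count, kernel_size=size, bias=True)
                conv.weight.data.normal_(0.0, np.sqrt(1 / np.prod(conv.weight.shape)))
                setattr(self, f"chain_1_kernel_{size}", conv)  # registered like a named submodule

    print([name for name, _ in TwoChainSketch().named_children()])
    # ['chain_1_kernel_3', 'chain_1_kernel_5']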
Example #5
    def create_model(self, dataset: RepertoireDataset, k: int,
                     vector_size: int, batch_size: int, model_path: str):
        # `size` is the gensim 3.x keyword; gensim >= 4.0 renamed it to `vector_size`
        model = Word2Vec(size=vector_size, min_count=1, window=5)  # creates an empty model
        all_kmers = KmerHelper.create_all_kmers(
            k=k, alphabet=EnvironmentSettings.get_sequence_alphabet())
        all_kmers = [[kmer] for kmer in all_kmers]
        model.build_vocab(all_kmers)

        for repertoire in dataset.get_data(batch_size=batch_size):
            sentences = KmerHelper.create_sentences_from_repertoire(
                repertoire=repertoire, k=k)
            model.train(sentences=sentences,
                        total_words=len(all_kmers),
                        epochs=15)

        model.save(model_path)

        return model
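
After training, individual k-mer vectors can be looked up through the model's wv attribute. A hedged sketch, assuming the gensim 3.x API used above and a hypothetical saved path:

    from gensim.models import Word2Vec

    model = Word2Vec.load("word2vec_model")  # hypothetical path, as passed via model_path above
    vector = model.wv["AAA"]                 # embedding of the 3-mer "AAA", shape (vector_size,)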
Example #6
    def test_sequence_flattened(self):
        path = EnvironmentSettings.root_path + "test/tmp/onehot_seq_flat/"

        PathBuilder.build(path)

        dataset = self.construct_test_flatten_dataset(path)

        encoder = OneHotEncoder.build_object(dataset, use_positional_info=False, distance_to_seq_middle=None, flatten=True)

        encoded_data = encoder.encode(dataset, EncoderParams(
            result_path=path,
            label_config=LabelConfiguration([Label(name="l1", values=[1, 0], positive_class="1")]),
            pool_size=1,
            learn_model=True,
            model={},
            filename="dataset.pkl"
        ))

        self.assertIsInstance(encoded_data, SequenceDataset)

        onehot_a = [1.0] + [0.0] * 19                # 'A' is index 0 in the 20-letter amino acid alphabet
        onehot_t = [0.0] * 16 + [1.0] + [0.0] * 3    # 'T' is index 16

        self.assertListEqual(list(encoded_data.encoded_data.examples[0]), onehot_a+onehot_a+onehot_a+onehot_t+onehot_t+onehot_t)
        self.assertListEqual(list(encoded_data.encoded_data.examples[1]), onehot_a+onehot_t+onehot_a+onehot_t+onehot_a+onehot_t)

        self.assertListEqual(list(encoded_data.encoded_data.feature_names), [f"{pos}_{char}" for pos in range(6) for char in EnvironmentSettings.get_sequence_alphabet()])
        shutil.rmtree(path)
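
The expected values in the test follow from the flattened one-hot layout. A standalone sketch, assuming the alphabet is the 20 amino acids in alphabetical order (which puts 'A' at index 0 and 'T' at index 16, matching the vectors above):

    import numpy as np

    alphabet = list("ACDEFGHIKLMNPQRSTVWY")

    def one_hot_flat(sequence):
        encoding = np.zeros(len(sequence) * len(alphabet))
        for pos, char in enumerate(sequence):
            encoding[pos * len(alphabet) + alphabet.index(char)] = 1.0
        return encoding

    assert one_hot_flat("AAATTT")[0] == 1.0             # 'A' at position 0
    assert one_hot_flat("AAATTT")[3 * 20 + 16] == 1.0   # 'T' at position 3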