def __init__(self, hamming_distance_probabilities: dict = None, min_gap: int = 0, max_gap: int = 0,
             alphabet_weights: dict = None, position_weights: dict = None):
    if hamming_distance_probabilities is not None:
        hamming_distance_probabilities = {key: float(value) for key, value in hamming_distance_probabilities.items()}
        assert all(isinstance(key, int) for key in hamming_distance_probabilities.keys()) \
               and all(isinstance(val, float) for val in hamming_distance_probabilities.values()) \
               and 0.99 <= sum(hamming_distance_probabilities.values()) <= 1, \
            "GappedKmerInstantiation: for each possible Hamming distance a probability between 0 and 1 has to be assigned " \
            "so that the probabilities for all distance possibilities sum to 1."

    self._hamming_distance_probabilities = hamming_distance_probabilities
    self.position_weights = position_weights
    # if weights are not given for each letter of the alphabet, distribute the remaining probability
    # equally among letters
    self.alphabet_weights = self.set_default_weights(alphabet_weights, EnvironmentSettings.get_sequence_alphabet())
    self._min_gap = min_gap
    self._max_gap = max_gap
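# Hedged standalone sketch (illustrative values, not immuneML's API) of why the assert
# above accepts sums in [0.99, 1] rather than requiring exactly 1: float literals rarely
# sum to exactly 1.0 under IEEE-754 arithmetic.
probabilities = {0: 0.7, 1: 0.2, 2: 0.1}  # hypothetical Hamming-distance probabilities
total = sum(float(value) for value in probabilities.values())
print(total)               # prints 0.9999999999999999, not 1.0
assert 0.99 <= total <= 1  # passes thanks to the tolerant lower bound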
def drop_illegal_character_sequences(dataframe: pd.DataFrame, import_illegal_characters: bool) -> pd.DataFrame:
    if not import_illegal_characters:
        sequence_type = EnvironmentSettings.get_sequence_type()
        sequence_name = sequence_type.name.lower().replace("_", " ")

        legal_alphabet = EnvironmentSettings.get_sequence_alphabet(sequence_type)
        if sequence_type == SequenceType.AMINO_ACID:
            legal_alphabet.append(Constants.STOP_CODON)

        is_illegal_seq = [ImportHelper.is_illegal_sequence(sequence, legal_alphabet) for sequence in
                          dataframe[sequence_type.value]]
        n_illegal = sum(is_illegal_seq)

        if n_illegal > 0:
            dataframe.drop(dataframe.loc[is_illegal_seq].index, inplace=True)
            warnings.warn(f"{ImportHelper.__name__}: {n_illegal} sequences were removed from the dataset "
                          f"because their {sequence_name} sequence contained illegal characters.")
    return dataframe
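# Hedged standalone sketch of the same filtering idea (toy data, not the immuneML call):
# build a boolean mask of rows whose sequence leaves the legal alphabet, drop them in
# place, and warn about how many were removed.
import warnings
import pandas as pd

legal_alphabet = set("ACDEFGHIKLMNPQRSTVWY")  # the 20 amino acids, as an example alphabet
df = pd.DataFrame({"sequence_aa": ["CASSLG", "CA$XSS", "CASSQY"]})

is_illegal_seq = [not set(sequence).issubset(legal_alphabet) for sequence in df["sequence_aa"]]
if sum(is_illegal_seq) > 0:
    df.drop(df.loc[is_illegal_seq].index, inplace=True)
    warnings.warn(f"{sum(is_illegal_seq)} sequences were removed because they contained illegal characters.")
print(df)  # only CASSLG and CASSQY remain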
def _substitute_letters(self, position_weights, alphabet_weights, allowed_positions: list, instance: list):
    if self._hamming_distance_probabilities:
        substitution_count = random.choices(list(self._hamming_distance_probabilities.keys()),
                                            list(self._hamming_distance_probabilities.values()), k=1)[0]
        allowed_position_weights = {key: value for key, value in position_weights.items() if key in allowed_positions}
        position_probabilities = self._prepare_probabilities(allowed_position_weights)
        positions = list(np.random.choice(allowed_positions, size=substitution_count, p=position_probabilities))

        while substitution_count > 0:
            if position_weights[positions[substitution_count - 1]] > 0:  # if the position is allowed to be changed
                position = positions[substitution_count - 1]
                alphabet_probabilities = self._prepare_probabilities(alphabet_weights)
                instance[position] = np.random.choice(EnvironmentSettings.get_sequence_alphabet(), size=1,
                                                      p=alphabet_probabilities)[0]
            # decrement outside the if-branch: a zero-weight position must still consume
            # its iteration, otherwise the loop would never terminate
            substitution_count -= 1

    return instance
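# Hedged standalone sketch of the substitution mechanics above (toy alphabet and uniform
# weights, not the class method): draw the number of substitutions from the
# Hamming-distance distribution, then draw positions and replacement letters.
import random
import numpy as np

alphabet = list("ACGT")  # hypothetical alphabet for illustration
instance = list("ACGTAC")
hamming_distance_probabilities = {0: 0.5, 1: 0.3, 2: 0.2}

substitution_count = random.choices(list(hamming_distance_probabilities.keys()),
                                    list(hamming_distance_probabilities.values()), k=1)[0]
positions = list(np.random.choice(list(range(len(instance))), size=substitution_count))
for position in positions:
    instance[position] = np.random.choice(alphabet, size=1)[0]
print("".join(instance))  # e.g. "ACGGAC" if one substitution landed at position 3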
def __init__(self, kernel_count: int, kernel_size, positional_channels: int, sequence_type: SequenceType,
             background_probabilities, chain_names):
    super(PyTorchReceptorCNN, self).__init__()
    self.background_probabilities = background_probabilities
    self.threshold = 0.1
    self.pseudocount = 0.05
    self.in_channels = len(EnvironmentSettings.get_sequence_alphabet(sequence_type)) + positional_channels
    self.positional_channels = positional_channels
    self.max_information_gain = self.get_max_information_gain()
    self.chain_names = chain_names

    self.conv_chain_1 = [f"chain_1_kernel_{size}" for size in kernel_size]
    self.conv_chain_2 = [f"chain_2_kernel_{size}" for size in kernel_size]

    for size in kernel_size:
        # chain 1
        setattr(self, f"chain_1_kernel_{size}",
                nn.Conv1d(in_channels=self.in_channels, out_channels=kernel_count, kernel_size=size, bias=True))
        getattr(self, f"chain_1_kernel_{size}").weight.data. \
            normal_(0.0, np.sqrt(1 / np.prod(getattr(self, f"chain_1_kernel_{size}").weight.shape)))

        # chain 2
        setattr(self, f"chain_2_kernel_{size}",
                nn.Conv1d(in_channels=self.in_channels, out_channels=kernel_count, kernel_size=size, bias=True))
        getattr(self, f"chain_2_kernel_{size}").weight.data. \
            normal_(0.0, np.sqrt(1 / np.prod(getattr(self, f"chain_2_kernel_{size}").weight.shape)))

    self.fully_connected = nn.Linear(in_features=kernel_count * len(kernel_size) * 2, out_features=1, bias=True)
    self.fully_connected.weight.data.normal_(0.0, np.sqrt(1 / np.prod(self.fully_connected.weight.shape)))
def create_model(self, dataset: RepertoireDataset, k: int, vector_size: int, batch_size: int, model_path: str):
    model = Word2Vec(size=vector_size, min_count=1, window=5)  # creates an empty model

    all_kmers = KmerHelper.create_all_kmers(k=k, alphabet=EnvironmentSettings.get_sequence_alphabet())
    all_kmers = [[kmer] for kmer in all_kmers]
    model.build_vocab(all_kmers)

    for repertoire in dataset.get_data(batch_size=batch_size):
        sentences = KmerHelper.create_sentences_from_repertoire(repertoire=repertoire, k=k)
        model.train(sentences=sentences, total_words=len(all_kmers), epochs=15)

    model.save(model_path)

    return model
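# Hedged standalone sketch of the same gensim workflow with toy k-mers. Note that the
# `size` and `sentences=` keywords used above follow the gensim 3.x API; gensim >= 4.0
# renamed the constructor argument to `vector_size` and train()'s `sentences` to
# `corpus_iterable`, so this sketch assumes gensim 3.x.
from gensim.models import Word2Vec

all_kmers = [["AAA"], ["AAC"], ["ACA"]]  # toy vocabulary of 3-mers
model = Word2Vec(size=16, min_count=1, window=5)
model.build_vocab(all_kmers)

sentences = [["AAA", "AAC", "ACA"]]  # k-mer "sentences" from one hypothetical repertoire
model.train(sentences=sentences, total_words=len(all_kmers), epochs=15)
model.save("kmer_word2vec.model")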
def test_sequence_flattened(self):
    path = EnvironmentSettings.root_path + "test/tmp/onehot_seq_flat/"
    PathBuilder.build(path)

    dataset = self.construct_test_flatten_dataset(path)

    encoder = OneHotEncoder.build_object(dataset, **{"use_positional_info": False, "distance_to_seq_middle": None,
                                                     "flatten": True})
    encoded_data = encoder.encode(dataset, EncoderParams(
        result_path=path,
        label_config=LabelConfiguration([Label(name="l1", values=[1, 0], positive_class="1")]),
        pool_size=1,
        learn_model=True,
        model={},
        filename="dataset.pkl"
    ))

    self.assertTrue(isinstance(encoded_data, SequenceDataset))

    onehot_a = [1.0] + [0.0] * 19
    onehot_t = [0.0] * 16 + [1.0] + [0.0] * 3

    self.assertListEqual(list(encoded_data.encoded_data.examples[0]),
                         onehot_a + onehot_a + onehot_a + onehot_t + onehot_t + onehot_t)
    self.assertListEqual(list(encoded_data.encoded_data.examples[1]),
                         onehot_a + onehot_t + onehot_a + onehot_t + onehot_a + onehot_t)

    self.assertListEqual(list(encoded_data.encoded_data.feature_names),
                         [f"{pos}_{char}" for pos in range(6) for char in EnvironmentSettings.get_sequence_alphabet()])

    shutil.rmtree(path)
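# Hedged standalone sketch of the one-hot layout the assertions above expect: with the
# 20-letter amino-acid alphabet in alphabetical order, 'A' sits at index 0 and 'T' at
# index 16, so a flattened 6-position sequence becomes a 6 * 20 vector.
alphabet = list("ACDEFGHIKLMNPQRSTVWY")

def one_hot(char):
    return [1.0 if letter == char else 0.0 for letter in alphabet]

assert one_hot("A")[0] == 1.0 and one_hot("T")[16] == 1.0
flattened = [value for char in "AAATTT" for value in one_hot(char)]
assert len(flattened) == 6 * 20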