class Test(unittest.TestCase):
    """Unit tests for the text_processing module (TextProcessor)."""

    def setUp(self):
        """Create a fresh TextProcessor instance before each test."""
        self.textprocessor = TextProcessor()

    def test_clean_text(self):
        """Test whether clean_text removes hashtags and @-mentions.

        The expected output shows the '#hahaha @li' suffix stripped
        while the rest of the sentence (and trailing space) is kept.
        """
        text = 'I love you from the Moon and back #hahaha @li'
        expected_result = 'I love you from the Moon and back '
        result = self.textprocessor.clean_text(text)
        self.assertEqual(result, expected_result)

    def test_tokenize_text(self):
        """Test whether tokenize_text splits text into tokens.

        The expected output keeps only 'love', 'moon', 'back' —
        presumably stopwords are dropped by the tokenizer (verify
        against TextProcessor.tokenize_text).
        """
        text = 'i love from the moon and back'
        expected_result = ['love', 'moon', 'back']
        result = self.textprocessor.tokenize_text(text)
        self.assertEqual(result, expected_result)
class PreProcessor():
    """Pre Processing class used to convert json file into index list.

    Pipeline: clean text -> tokenize -> map tokens to embedding
    indexes -> pad/truncate to a fixed length.
    """

    def __init__(self, padding_size=20, max_dictionary_size=500000):
        """Load the embedding dictionary and store the padding size.

        padding_size: target length of every output index sequence.
        max_dictionary_size: cap passed through to WordEmbedding.
        """
        self.text_processor = TextProcessor()
        self.embedding = WordEmbedding(max_dictionary_size=max_dictionary_size)
        self.embedding.load_embedding_dictionary(self.embedding.dictionary_path)
        self.padding_size = padding_size

    def pre_process_text(self, text):
        """Convert one raw text into a fixed-length list of indexes.

        Returns the padded index list produced by pad_sequence.
        """
        cleaned_text = self.text_processor.clean_text(text)
        tokens = self.text_processor.tokenize_text(cleaned_text)
        # Removed stray debug print(tokens) left over from development.
        embedding_indexes = self.embedding.replace_tokens_with_index(tokens)
        padded_indexes = self.pad_sequence(embedding_indexes)
        return padded_indexes

    def pad_sequence(self, input_sequence):
        """Step 4 pad_sequence.

        Keep only the LAST padding_size items, then right-pad with
        zeros so the result is exactly padding_size long.
        """
        # Slicing copies, so the caller's list is never mutated.
        sequence = input_sequence[-self.padding_size:]
        if len(sequence) < self.padding_size:
            pad_sequence = [0] * (self.padding_size - len(sequence))
            sequence += pad_sequence
        return sequence
def __init__(self, padding_size=20, max_dictionary_size=500000):
    """Set up the helpers used by the pre-processing pipeline.

    padding_size: target length of the output index sequences.
    max_dictionary_size: cap passed through to WordEmbedding.
    """
    self.padding_size = padding_size
    self.text_processor = TextProcessor()
    embedding = WordEmbedding(max_dictionary_size=max_dictionary_size)
    embedding.load_embedding_dictionary(embedding.dictionary_path)
    self.embedding = embedding
class PreProcessor():
    """Run the pre processing end to end.

    Converts raw text into a fixed-length list of embedding indexes.
    """

    def __init__(self, padding_size=20, max_dictionary_size=500000):
        """Load the embedding dictionary and remember the padding size.

        padding_size: target length of every output index sequence.
        max_dictionary_size: cap passed through to WordEmbedding.
        """
        self.text_processor = TextProcessor()
        self.embedding = WordEmbedding(max_dictionary_size=max_dictionary_size)
        self.embedding.load_embedding_dictionary(
            self.embedding.dictionary_path)
        self.padding_size = padding_size

    def pre_process_text(self, text):
        """Run the pre processing end to end: clean, tokenize, index, pad."""
        cleaned_text = self.text_processor.clean_text(text)
        tokens = self.text_processor.tokenize_text(cleaned_text)
        embedding_indexes = self.embedding.replace_tokens_with_index(tokens)
        padded_index = self.pad_sequence(embedding_indexes)
        return padded_index

    def pad_sequence(self, input_squence):
        """Add 0 padding to a sequence until it is padding_size long.

        Longer inputs are truncated to their LAST padding_size items.
        (Parameter name keeps the original spelling for keyword-call
        compatibility.)
        """
        sequence = input_squence[-self.padding_size:]
        if len(sequence) < self.padding_size:
            pad_sequence = [0] * (self.padding_size - len(sequence))
            sequence = sequence + pad_sequence
        return sequence
class PreProcessor():
    """ Pre-process tweets """

    def __init__(self, padding_size=20, max_dictionary_size=500000):
        """Initialise the text processor, embedding, and padding size."""
        self.padding_size = padding_size
        self.text_processor = TextProcessor()
        self.embedding = WordEmbedding(max_dictionary_size=max_dictionary_size)
        self.embedding.load_embedding_dictionary(
            self.embedding.dictionary_path)

    def pre_process_text(self, text):
        """ Clean and tokenize text, replace tokens with index """
        tokens = self.text_processor.tokenize_text(
            self.text_processor.clean_text(text))
        indexed = self.embedding.replace_tokens_with_index(tokens)
        return self.pad_sequence(indexed)

    def pad_sequence(self, input_squence):
        """ Padding: add 0 until max length """
        # Take the trailing window, then top up with zeros as needed.
        tail = input_squence[-self.padding_size:]
        return tail + [0] * (self.padding_size - len(tail))
def setUp(self):
    """Create a fresh TextProcessor instance before each test."""
    # One new processor per test keeps test cases independent.
    self.textprocessor = TextProcessor()