import unittest

# NOTE: the import paths below are assumptions based on the module names used in
# this listing; adjust them to match the actual project layout.
from text_processing import TextProcessor
from word_embedding import WordEmbedding


class Test(unittest.TestCase):
    """
    Unit tests for TextProcessor.

    Run with: pytest text_processing
    """
    def setUp(self):
        """
        Call Text Processor Library
        """
        self.textprocessor = TextProcessor()

    def test_clean_text(self):
        """
        Test wheter clean text properly

        """
        text = 'I love you from the Moon and back #hahaha @li'
        expected_result = 'I love you from the Moon and back  '
        result = self.textprocessor.clean_text(text)
        self.assertEqual(result, expected_result)

    def test_tokenize_text(self):
        """
        Test wether text being tokenized properly
        """
        text = 'i love from the moon and back'
        expected_result = ['love', 'moon', 'back']
        result = self.textprocessor.tokenize_text(text)
        self.assertEqual(result, expected_result)
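
# The real TextProcessor lives elsewhere in the project; the class below is a
# minimal hypothetical sketch of the behaviour the tests above exercise, included
# only as a reading aid. The class name, regex, and stop-word list are assumptions,
# not the project's actual implementation: clean_text() strips '#hashtag' and
# '@mention' tokens (leaving surrounding whitespace untouched), and tokenize_text()
# splits on whitespace and drops a small set of stop words.
import re

_STOP_WORDS = {'i', 'from', 'the', 'and'}


class TextProcessorSketch:
    def clean_text(self, text):
        # Remove hashtags and mentions; the spaces around them remain, which is
        # why the expected string in test_clean_text ends with two spaces.
        return re.sub(r'[#@]\w+', '', text)

    def tokenize_text(self, text):
        # Whitespace tokenization followed by stop-word filtering.
        return [token for token in text.split() if token not in _STOP_WORDS]
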

class PreProcessor:
    """Pre-processing class: converts raw tweet text into a padded list of embedding indexes."""

    def __init__(self, padding_size=20, max_dictionary_size=500000):

        self.text_processor = TextProcessor()
        self.embedding = WordEmbedding(max_dictionary_size=max_dictionary_size)

        self.embedding.load_embedding_dictionary(self.embedding.dictionary_path)

        self.padding_size = padding_size

    def pre_process_text(self, text):
        """
        Run the pre-processing end to end: clean and tokenize the text,
        replace tokens with embedding indexes, then pad the sequence.
        """
        cleaned_text = self.text_processor.clean_text(text)
        tokens = self.text_processor.tokenize_text(cleaned_text)

        embedding_indexes = self.embedding.replace_tokens_with_index(tokens)

        padded_indexes = self.pad_sequence(embedding_indexes)

        return padded_indexes

    def pad_sequence(self, input_sequence):
        """
        Keep the last `padding_size` elements and right-pad the sequence
        with zeros until it reaches `padding_size`.
        """
        sequence = input_sequence[-self.padding_size:]

        if len(sequence) < self.padding_size:
            padding = [0] * (self.padding_size - len(sequence))
            sequence = sequence + padding

        return sequence
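
# Example usage (a sketch, assuming the embedding dictionary file referenced by
# WordEmbedding.dictionary_path exists on disk): pre-process a single tweet into
# a fixed-length list of embedding indexes.
if __name__ == '__main__':
    preprocessor = PreProcessor(padding_size=20)
    indexes = preprocessor.pre_process_text(
        'I love you from the Moon and back #hahaha @li')
    print(indexes)  # 20 integers, zero-padded on the right when the tweet is short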