class PreProcessor():
    """Pre-processing pipeline: convert raw text into a fixed-length list
    of embedding indexes.

    Steps: clean -> tokenize -> map tokens to indexes -> pad/truncate.
    """

    def __init__(self, padding_size=20, max_dictionary_size=500000):
        """
        :param padding_size: fixed length of the output index sequence
        :param max_dictionary_size: cap on the embedding dictionary size
        """
        self.text_processor = TextProcessor()
        self.embedding = WordEmbedding(max_dictionary_size=max_dictionary_size)

        # Load the dictionary eagerly so pre_process_text can map tokens
        # to indexes without a separate setup call.
        self.embedding.load_embedding_dictionary(self.embedding.dictionary_path)

        self.padding_size = padding_size

    def pre_process_text(self, text):
        """Run the full pipeline on *text* and return the padded indexes."""

        cleaned_text = self.text_processor.clean_text(text)
        tokens = self.text_processor.tokenize_text(cleaned_text)
        # (removed a leftover debug print of the token list)
        embedding_indexes = self.embedding.replace_tokens_with_index(tokens)

        return self.pad_sequence(embedding_indexes)

    def pad_sequence(self, input_sequence):
        """Keep the last `padding_size` items, then right-pad with zeros."""

        # Truncate from the left: the most recent indexes are preserved.
        sequence = input_sequence[-self.padding_size:]

        if len(sequence) < self.padding_size:
            sequence = sequence + [0] * (self.padding_size - len(sequence))

        return sequence
Example #2
0
 def setUp(self):
     """
     Build a small WordEmbedding fixture and preload its dictionary.

     Runs before each test; the dictionary is capped at 1000 entries to
     keep the fixture fast.
     """
     self.embeddings = WordEmbedding(max_dictionary_size=1000)
     # NOTE(review): presumably load_embedding_dictionary returns the
     # dictionary it loads — confirm against WordEmbedding.
     self.embedding_dictionary = self.embeddings.load_embedding_dictionary(
         self.embeddings.dictionary_path)
    def __init__(self, padding_size=20, max_dictionary_size=500000):
        """
        Set up the text processor and word embedding used by the
        pre-processing pipeline.

        :param padding_size: target length of padded index sequences
        :param max_dictionary_size: cap on embedding dictionary entries
        """
        self.text_processor = TextProcessor()
        self.embedding = WordEmbedding(max_dictionary_size=max_dictionary_size)

        # Load the dictionary eagerly so token-to-index lookups work
        # immediately after construction.
        self.embedding.load_embedding_dictionary(self.embedding.dictionary_path)

        self.padding_size = padding_size
class PreProcessor():
    """
    End-to-end text pre-processing: clean, tokenize, index, and pad.
    """

    def __init__(self, padding_size=20, max_dictionary_size=500000):
        # Helpers for cleaning/tokenizing and for token -> index lookup.
        self.text_processor = TextProcessor()
        self.embedding = WordEmbedding(max_dictionary_size=max_dictionary_size)

        # Preload the embedding dictionary so lookups work immediately.
        self.embedding.load_embedding_dictionary(
            self.embedding.dictionary_path)

        self.padding_size = padding_size

    def pre_process_text(self, text):
        """
        Convert raw *text* into a fixed-length list of embedding indexes.
        """
        tokens = self.text_processor.tokenize_text(
            self.text_processor.clean_text(text))
        return self.pad_sequence(
            self.embedding.replace_tokens_with_index(tokens))

    def pad_sequence(self, input_squence):
        """
        Keep the last `padding_size` items and right-pad with zeros.
        """
        tail = input_squence[-self.padding_size:]
        missing = self.padding_size - len(tail)
        # A non-positive `missing` multiplies out to an empty pad.
        return tail + [0] * max(0, missing)
Example #5
0
class PreProcessor():
    """
    Pre-process tweets into fixed-length lists of embedding indexes.
    """

    def __init__(self, padding_size=20, max_dictionary_size=500000):
        # Text cleaning / tokenizing helper.
        self.text_processor = TextProcessor()
        # Token-to-index lookup backed by a bounded dictionary, loaded
        # up front so it is ready for the first call.
        self.embedding = WordEmbedding(max_dictionary_size=max_dictionary_size)
        self.embedding.load_embedding_dictionary(
            self.embedding.dictionary_path)
        self.padding_size = padding_size

    def pre_process_text(self, text):
        """
        Clean and tokenize *text*, replace tokens with their embedding
        indexes, and pad the result to `padding_size`.
        """
        cleaned = self.text_processor.clean_text(text)
        token_list = self.text_processor.tokenize_text(cleaned)
        indexes = self.embedding.replace_tokens_with_index(token_list)
        return self.pad_sequence(indexes)

    def pad_sequence(self, input_squence):
        """
        Left-truncate, then right-pad a sequence with zeros so it is
        exactly `padding_size` entries long.
        """
        trimmed = input_squence[-self.padding_size:]
        padding_needed = self.padding_size - len(trimmed)
        if padding_needed > 0:
            trimmed = trimmed + [0] * padding_needed
        return trimmed
Example #6
0
class Test(unittest.TestCase):
    """
    Unit tests for WordEmbedding dictionary loading and token lookup.
    """

    def setUp(self):
        """
        Create a small (1000-entry) WordEmbedding fixture before each test.
        """
        self.embeddings = WordEmbedding(max_dictionary_size=1000)
        self.embedding_dictionary = self.embeddings.load_embedding_dictionary(
            self.embeddings.dictionary_path)

    def test_load_embedding_dictionary_resource(self):
        """
        The bundled resource dictionary loads exactly 1000 entries.
        """
        self.embeddings.load_embedding_dictionary(
            self.embeddings.dictionary_path)
        self.assertEqual(len(self.embeddings.embedding_dictionary), 1000)

    def test_load_from_zip(self):
        """
        The dictionary can also be loaded out of a zip archive.
        """
        # Build a path into the test_resources.zip archive, starting from
        # the default dictionary location.
        archive = self.embeddings.dictionary_path.replace("glove.twitter.txt",
                                                          "test_resources.zip")
        zip_resource = os.path.join(archive, "pre_processing", "..",
                                    "test_resources", "glove.twitter.txt")

        self.embeddings.load_embedding_dictionary(zip_resource)
        self.assertEqual(len(self.embeddings.embedding_dictionary), 1000)

    def test_replace_tokens_with_index(self):
        """
        Known tokens map to their expected dictionary indexes.
        """
        expected_result = [56, 2, 1, 98]
        token_list = ['love', '<user>', 'moon', 'back']
        self.assertEqual(
            self.embeddings.replace_tokens_with_index(token_list),
            expected_result)