Beispiel #1
0
    def test_process_tweet(self):
        """
        Run a raw tweet through ``process_tweet`` and compare the returned
        value with the expected list.
        """

        raw_tweet = (
            "@skand this (abc) [email protected] is a random Tweet. don't run 12:"
            " https://www.google.com⚡️ #columbia"
        )
        expected_output = [19, 1]

        pipeline = TextPreprocessing(2, 1000)

        self.assertEqual(pipeline.process_tweet(raw_tweet), expected_output)
Beispiel #2
0
    def test_pad_sequence2(self):
        """
        Test ``pad_sequence`` condition 2: a six-item input, with a processor
        built as ``TextPreprocessing(10, 1000)``, is expected to come back
        as ten items, the original six followed by four zeros.
        """

        short_sequence = [19, 1, 22, 1, 306, 50]
        # Kept as an independent list so the expectation does not alias the
        # input handed to pad_sequence.
        expected_padded = [19, 1, 22, 1, 306, 50] + [0] * 4

        padded = TextPreprocessing(10, 1000).pad_sequence(short_sequence)

        self.assertEqual(padded, expected_padded)
Beispiel #3
0
    def test_tokenize_text(self):
        """
        Check that ``tokenize_text`` turns an already-cleaned tweet into the
        expected list of tokens.
        """

        cleaned = "(abc) skand@'t run : ⚡️ columbia"
        # The emoji is expected to come back as two tokens: the base
        # character and the invisible character that follows it.
        expected_tokens = [
            '(', 'abc', ')', 'skand', '@', "'", 't', 'run', ':', '⚡', '️',
            'columbia',
        ]

        tokens = TextPreprocessing(2, 1000).tokenize_text(cleaned)

        self.assertEqual(tokens, expected_tokens)
Beispiel #4
0
    def test_clean_text(self):
        """
        Check that ``clean_text`` reduces a raw tweet to the expected
        cleaned string.
        """

        raw_tweet = (
            "@skand this (abc) [email protected] is a random Tweet. don't run 12:"
            " https://www.google.com⚡️ #columbia"
        )
        expected_clean = "(abc) skand@'t run : ⚡️ columbia"

        cleaner = TextPreprocessing(2, 1000)

        self.assertEqual(cleaner.clean_text(raw_tweet), expected_clean)
Beispiel #5
0
    def test_replace_token_with_index(self):
        """
        Check that ``replace_token_with_index`` maps a token list to the
        expected list of integer indices (one index per token).
        """

        tokens = [
            '(', 'abc', ')', 'skand', '@', "'", 't', 'run', ':', '⚡', '️',
            'columbia',
        ]
        expected_indices = [19, 1, 22, 1, 306, 50, 189, 901, 4, 1, 1, 1]

        indices = TextPreprocessing(2, 1000).replace_token_with_index(tokens)

        self.assertEqual(indices, expected_indices)
Beispiel #6
0
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from tweet import TextPreprocessing

# Progress marker: printed once all of the imports above have succeeded.
print('--IMPORTED--')

# Define text_pre_processor
# Built with TextPreprocessing's default constructor arguments (none are
# passed here).
processor = TextPreprocessing()

# Progress marker: the preprocessor was constructed without raising.
print('--PROCESSOR DEFINED--')

## @params: [JOB_NAME]
# Resolve the JOB_NAME option from the script's command-line arguments.
args = getResolvedOptions(sys.argv, ['JOB_NAME'])

# Create the Spark context, wrap it in a GlueContext, keep a handle on the
# context's Spark session, and initialise the Glue Job with the resolved
# job name and arguments.
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
## @type: DataSource
## @args: [database = "a6_db", table_name = "train", transformation_ctx = "datasource0"]
## @return: datasource0
## @inputs: []
# Load the "train" table of the "a6_db" catalog database as a dynamic frame.
datasource0 = glueContext.create_dynamic_frame.from_catalog(
    database="a6_db", table_name="train", transformation_ctx="datasource0")
## @type: ApplyMapping
## @args: [mapping = [("sentiment", "long", "sentiment", "long"), ("twitterid", "long", "twitterid", "long"), ("tweet", "string", "tweet", "string")], transformation_ctx = "applymapping1"]
Beispiel #7
0
import json
import tweet
import datetime
import time
from tweet import TextPreprocessing
import boto3

# Module-level setup: the boto3 clients and the preprocessor are created once
# when the module is imported, not on every lambda_handler call.
# lambda_handler uses sage_maker_client to invoke the model endpoint.
sage_maker_client = boto3.client("runtime.sagemaker")
# NOTE(review): no use of s3_client is visible in this part of the file.
s3_client = boto3.client("s3")

# Preprocessor used by lambda_handler to turn the incoming tweet into model
# features; configured with max_length_tweet=100 and
# max_length_dictionary=10000.
preprocess = TextPreprocessing(max_length_tweet=100,
                               max_length_dictionary=10000)


def lambda_handler(event, context):
    # TODO implement
    tweet = event["tweet"]

    ## Preprocessing
    time_preprocessing_start = time.time()  ## Start
    features = preprocess.process_tweet(tweet)
    time_preprocessing_end = time.time()  ## End

    model_payload = {"embedding_input": features}

    ## Model Output

    time_model_start = time.time()  ## Start
    response = sage_maker_client.invoke_endpoint(EndpointName='a6-endpoint',\
                                                ContentType='application/json',\
                                                    Body=json.dumps(model_payload))