def test_from_files_success(self):
        """
        WordEmbedding.from_files must build an embedding from existing
        word/vector files; the factory returns a non-None object.
        """
        embedding = WordEmbedding.from_files('words.txt',
                                             'vectors.npy.gz')
        assert embedding is not None
    def test_embed_document_success(self):
        """
        Embedding a non-empty document must yield a non-None vector.
        """
        text = 'All the worlds a stage, and all the men and women merely players'
        embedding = WordEmbedding.from_files('words.txt',
                                             'vectors.npy.gz')
        assert embedding.embed_document(text) is not None
    def test_embed_document_failure(self):
        """
        Embedding an empty document must yield the all-zeros vector.
        """
        embedding = WordEmbedding.from_files('words.txt',
                                             'vectors.npy.gz')
        result = embedding.embed_document('')
        # Every component of the returned vector is expected to be zero.
        assert numpy.all(result == 0)
    def test_word_vector_success(self):
        """
        Calling the embedding object with a common word must return a
        non-None vector.
        """
        embedding = WordEmbedding.from_files('words.txt',
                                             'vectors.npy.gz')
        # WordEmbedding instances are callable: embedding(word) -> vector.
        assert embedding('the') is not None
# Ejemplo n.º 5 (scraped "Example no. 5" separator from the source page)
# 0 (vote count carried over from the scraped page)
 def run(self):
     """
     Read the product CSV from the source S3 path, compute an embedding
     vector for each product description, store it in a new
     'vectorized_value' column, and write the result back out to the
     target S3 location as partitioned CSV files.
     """
     try:
         # Read from S3 (requester-pays bucket, authenticated access).
         ddf = dd.read_csv(  # pragma: no_cover
             self.s3_data_path,
             dtype={'Upc Ean Code': 'str'},
             storage_options={
                 'anon': False,
                 'requester_pays': True
             })
         cols = [
             'Uniq Id', 'Product Name', 'Selling Price', 'Image',
             'About Product'
         ]
         word_embedding = WordEmbedding.from_files('words.txt',
                                                   'vectors.npy.gz')
         partitions = []
         for i in range(ddf.npartitions):
             df_partition = ddf.get_partition(i)[cols].compute()
             product = []
             for index, row in df_partition.iterrows():
                 unique_id = row['Uniq Id']
                 description = row['Product Name']
                 # pd.notna also skips NaN cells read from the CSV, which
                 # an 'is not None' check would pass to embed_document.
                 if pd.notna(description):
                     print(unique_id, description)
                     product.append(
                         word_embedding.embed_document(description))
                 else:
                     # Placeholder keeps `product` the same length as the
                     # partition; otherwise the column assignment below
                     # raises a length-mismatch error.
                     product.append(None)
             df_partition['vectorized_value'] = product
             partitions.append(df_partition)
         # DataFrame.append was removed in pandas 2.x; concatenate instead.
         df_to_write = (pd.concat(partitions, ignore_index=True)
                        if partitions else pd.DataFrame())
         print('Ready to write')
         ddf_to_write = dd.from_pandas(df_to_write,
                                       npartitions=ddf.npartitions)
         # fillna returns a new dataframe; the result must be rebound
         # (the original call discarded it, leaving NaNs in the output).
         ddf_to_write = ddf_to_write.fillna(0)
         file_path = self.output().path + 'product-data-*.csv'
         dask.dataframe.to_csv(df=ddf_to_write,
                               filename=file_path,
                               single_file=False,
                               encoding='utf-8',
                               mode='wt',
                               name_function=None,
                               compression=None,
                               compute=True)
         print('File write complete')
     except Exception as err:
         # Best-effort logging; the task framework decides on retries.
         print('Error: ', err)
    def test_from_files_failure(self):
        """
        from_files must fail when the words file does not exist.

        A random, almost-certainly-missing filename is used; the test
        passes when from_files raises FileNotFoundError, or — if no
        exception is raised — when it signals failure by returning None.
        (Replaces the original's meaningless `assert 1 == 1`.)
        """
        file_name = "words" + str(randint(0, 999999)) + ".txt"
        try:
            word_embedding = WordEmbedding.from_files(file_name,
                                                      'vectors.npy.gz')
        except FileNotFoundError:
            # Expected outcome: a missing file cannot be loaded.
            return
        # No exception was raised, so the factory must return None.
        assert word_embedding is None
# Ejemplo n.º 7 (scraped "Example no. 7" separator from the source page)
# 0 (vote count carried over from the scraped page)
def lambda_handler(event, context):
    """Search S3-stored product data by embedding similarity.

    Parameters
    ----------
    event: dict, required
        API Gateway Lambda Proxy Input Format; the query-string
        parameter 'searchText' carries the free-text search.

        Event doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html#api-gateway-simple-proxy-for-lambda-input-format

    context: object, required
        Lambda Context runtime methods and attributes

        Context doc: https://docs.aws.amazon.com/lambda/latest/dg/python-context-object.html

    Returns
    ------
    API Gateway Lambda Proxy Output Format: dict
        On success, the body lists the 10 products whose stored
        'vectorized_value' is closest (by cosine distance) to the
        embedded search text; otherwise a "No search text" status.

        Return doc: https://docs.aws.amazon.com/apigateway/latest/developerguide/set-up-lambda-proxy-integrations.html
    """
    # API Gateway sends queryStringParameters as None (not {}) when the
    # request has no query string; the bare `in` test raised TypeError.
    query_params = event.get('queryStringParameters') or {}
    if 'searchText' in query_params:
        search_text = query_params['searchText']
        s3 = boto3.resource('s3')
        bucket = s3.Bucket('finalproject-spring-2021')
        data_files_in_s3 = bucket.objects.filter(Prefix="output/")
        distance_list = []
        list_of_dataframes = []
        try:
            # Load every vectorized product CSV under output/.
            for obj in data_files_in_s3:
                body = obj.get()['Body'].read()
                list_of_dataframes.append(
                    pd.read_csv(io.BytesIO(body), encoding='utf8'))

            product_data_frame = pd.concat(list_of_dataframes)

            # Embed the search text in the same space the product
            # vectors were written with.
            word_embedding = WordEmbedding.from_files('words.txt',
                                                      'vectors.npy.gz')
            search_vector = word_embedding.embed_document(search_text)
            # Cosine distance between the query and every product row.
            for index, row in product_data_frame.iterrows():
                distance = get_cosine_distance(search_vector,
                                               row['vectorized_value'])
                distance_list.append({
                    'unique_id': row['Uniq Id'],
                    'product_name': row['Product Name'],
                    'selling_price': row['Selling Price'],
                    'image': row['Image'],
                    'about_product': row['About Product'],
                    'distance': distance
                })
            # Ten closest products; NaN distances sort last.
            distance_df = pd.DataFrame(distance_list).sort_values(
                by='distance', ascending=True, na_position='last')
            data_to_api = distance_df.head(10)

        except Exception as e:
            # Send some context about this error to Lambda Logs.
            print(e)
            raise e

        return {
            "statusCode":
            200,
            "headers": {
                "Access-Control-Allow-Headers": "*",
                "Access-Control-Allow-Origin": "*",
                "Access-Control-Allow-Methods": "OPTIONS,POST,GET"
            },
            "body":
            json.dumps({
                "status": "success",
                "product_list": data_to_api.to_json(orient='index')
            })
        }

    return {
        "statusCode": 200,
        "headers": {
            "Access-Control-Allow-Headers": "*",
            "Access-Control-Allow-Origin": "*",
            "Access-Control-Allow-Methods": "OPTIONS,POST,GET"
        },
        "body": json.dumps({"status": "No search text"})
    }