Beispiel #1
0
def get_status(sku):
    """
    Report the processing status of a product together with its details.

    The status message is read from ``db_status``. Name, url and image are
    read from the ``product_details`` collection when an entry for the sku
    exists there; otherwise they are empty strings.

    :param sku: product sku
    :return: a dict that always has the key "status". When a status entry
        was found it also has "product_name", "product_url" and "image_url".
    """
    try:
        # Indexing an empty result list raises IndexError (handled below).
        status = list(db_status.find({"sku": sku}))[0].get("msg")
        db_details = DB.init_db(config.get("details_db")).product_details
        product = list(db_details.find({"sku": sku}))
        product_url, product_name, image_url = "", "", ""
        if product:
            product_name = product[0].get("product_name")
            product_url = product[0].get("url")
            image_url = product[0].get("img")
        logger.info("Status for {}: {}".format(sku, status))
        return {
            "status": status,
            "product_name": product_name,
            "product_url": product_url,
            "image_url": image_url,
        }
    except IndexError:
        # this happens due to a race condition because the sku hasn't been
        # added to the database yet or because it simply doesn't exist. The
        # second case is only true if the URL has been typed in manually or
        # bookmarked but the sku is missing from the URL.
        logger.warning(
            "Product status not yet available for sku {}".format(sku))
        _set_status(__in_queue__, sku)
        # Report the status that was just stored. Previously this branch
        # returned {"status": None} although the sku had been marked as
        # queued one line above.
        return {"status": __in_queue__}
    except Exception as e:
        logger.exception(e)
        return {"status": __error__}
Beispiel #2
0
    def edit_project(self, pParameters):
        '''
        Updates an existing project entry in db and re-uploads its files

        The entry is looked up by the name and author of self.project. When
        no such entry exists only an error is logged.

        :param list pParameters: a list of optional parameters (not read here)
        '''
        project = self.project
        key = {'name': project['name'], 'author': project['author']}
        if not self.client.check_item("Klambda_projects", key):
            logger.error("The project %s doesn't exist" % self.project['name'])
            return

        item = self.client.get_item("Klambda_projects", dict(key))
        # verify if current user can perfom this action
        self.validate_user(item)

        # copy the editable fields over the stored entry
        for field in ('users', 'description', 'repo_url', 'files'):
            item[field] = self.project[field]
        now = datetime.datetime.now()
        item['last_update'] = now.strftime("%Y-%m-%d %H:%M:%S")
        self.client.put_item("Klambda_projects", item)

        # replace the project folder on S3 with the listed files
        folder = self.project['name'] + "-" + self.project['author']
        self.__s3_client.delete_object('klambda', folder)
        self.upload_files(self.project['files'])
        logger.info("The project %s edited succesfully" %
                    self.project['name'])
Beispiel #3
0
    def delete_project(self, pParameters):
        '''
        Removes a project entry from db and its folder from S3

        The entry is looked up by the name and author of self.project. When
        no such entry exists only an error is logged.

        :param list pParameters: a list of optional parameters (not read here)
        '''
        project = self.project
        key = {'name': project['name'], 'author': project['author']}
        if not self.client.check_item("Klambda_projects", key):
            logger.error("The project %s doesn't exist" % self.project['name'])
            return

        item = self.client.get_item("Klambda_projects", dict(key))
        # verify if current user can perfom this action
        self.validate_user(item)
        self.client.delete_item("Klambda_projects", dict(key))

        folder = self.project['name'] + "-" + self.project['author']
        self.__s3_client.delete_object('klambda', folder)
        logger.info("The project %s deleted succesfully" %
                    self.project['name'])
Beispiel #4
0
def _db_product_details(sku):
    """
    Look up the stored details of a product.

    An entry in which any of name, url, image, review count or page count
    is missing or falsy is treated as invalid and is deleted.

    :param sku: product sku
    :return: a dict with "product_name", "review_count" and "page_count",
        or an empty dict when there is no valid entry
    """
    collection = DB.init_db(config.get("details_db")).product_details
    matches = list(collection.find({"sku": sku}))
    if not matches:
        return {}

    entry = matches[0]
    keys = ("product_name", "url", "img", "review_count",
            "review_page_count")
    values = [entry.get(key) for key in keys]
    if not all(values):
        logger.info(
            "Unable to validate product details for {}. Deleting entry".
            format(sku))
        collection.delete_one({"sku": sku})
        return {}

    # url and image only take part in the validation, they are not returned
    name, _, _, reviews, pages = values
    return {
        "product_name": name,
        "review_count": reviews,
        "page_count": pages,
    }
Beispiel #5
0
def _get_product_details(source, url, sku):
    """
    Download and parse the product page and store the parsed details.

    Nothing is stored when the parser returns a falsy result.

    :param source: merchant, handed to the scraper and the parser
    :param url: canonical product url
    :param sku: product sku
    :return: whatever the parser returned for the page
    """
    page = scraper.Scraper(source=source).get_request(url)
    details = parser.Parser(sku=sku, source=source).parse(page, init=True)
    if not details:
        return details

    # Save it to the database
    collection = DB.init_db(config.get("details_db")).product_details
    record = {
        "status": "processing",
        "url": url,
        "product_name": details.get("product_name"),
        "review_count": details.get("review_count"),
        "review_page_count": details.get("page_count"),
        "source": source,
        "sku": sku,
        "img": details.get("img_url"),
        "timestamp": time.time(),
    }
    collection.insert_one(record)
    logger.info("Saved new product details: ")
    logger.info(record)
    return details
Beispiel #6
0
def start(url, progress=False):
    """
    Entry point for a product url.

    With ``progress`` set, the current status of the product is looked up
    and returned. Otherwise the workflow (``_workflow``) is submitted to a
    single-worker thread pool and the call returns without waiting for it,
    so the caller gets control back while the processing continues.

    :param url: product url
    :param progress: report progress instead of starting the workflow
    :return: an empty dict when the url cannot be decoded; the progress
        report (a dict) when ``progress`` is set; None otherwise
    """
    logger.info("Received new url to process: {}".format(url))
    decoded = _decode_url(url)
    if not decoded:
        return {}

    if not progress:
        pool = ThreadPoolExecutor(max_workers=1)
        pool.submit(_workflow, decoded, url)
        # do not block on the submitted job
        pool.shutdown(wait=False)
        return None

    sku = decoded[1]
    status = get_status(sku)
    report = {
        "sku": sku,
        "product_url": decoded[2],
        # in progress until the status says ready
        "in_progress": status.get("status") != __ready__,
    }
    # keys reported by get_status take precedence over the ones set above
    report.update(status)
    return report
Beispiel #7
0
    def infer(self, query):
        '''
        Query the model to infer a vector for an unseen sentence and
        summarize the sentences most similar to it.

        Both settings are read from the "doc2vec" / "inference" section of
        the config:

        steps: number of iterations (?); tested step=1 to step=10e6. 10e5
                is optimal
        topn: number of top n sentences to return; n is determined
                empirically based on the quality of the predictions
                generated at or above a given probability threshold

        param query: an enriched query
        return: a tuple (summary, confidence); confidence is the mean
                similarity of the matched sentences rounded to two
                decimals. When no summary could be produced the default
                answer is returned with a confidence of 0.0
        '''
        #TODO: validate the query is a valid statement

        inference_config = config.get("doc2vec").get("inference")
        steps = inference_config.get("steps")
        topn = inference_config.get("topn")

        logger.info('Sentence inference in progress')
        query_tokens = query.split()
        inference = self.d2v_model.infer_vector(query_tokens, steps=steps)
        sims = self.d2v_model.docvecs.most_similar([inference], topn=topn)
        # each entry of sims is indexed as (tag, similarity)
        summary = self._summarize([self._lookup(sim[0]) for sim in sims])
        probs = [sim[1] for sim in sims]
        # An empty result must not crash the averaging (division by zero).
        confidence = sum(probs) / len(probs) if probs else 0.0
        if not summary:
            summary = __default_answer__
            confidence = 0.00
        return summary, round(confidence, 2)
Beispiel #8
0
    def _amazon_detail_parser(self, soup):
        """
        Extract name, image url, review count and page count from a parsed
        product detail page.

        :param soup: the parsed detail page; ``select`` is called on it
        :return: a dict with "product_name", "review_count", "page_count"
            and "img_url", or an empty dict when the review count cannot
            be parsed
        """
        logger.info("Started parsing Amazon product detail page for {}".format(self.sku))
        outer_selector = selectors.get(self.source).get("review_count_outer")
        inner_selector = selectors.get(self.source).get("review_count_inner")
        name_selector = selectors.get(self.source).get("product_name")
        image_selector = selectors.get(self.source).get("product_image")

        try:
            review_count = self._amazon_review_count(soup, outer_selector, inner_selector)
        except Exception as e:
            # without a review count there is nothing to work with
            logger.exception(e)
            return {}

        page_count = self._get_page_count(review_count, divisor=10)

        # the first element matching the name selector carries the name
        name = soup.select(name_selector)[0].text.strip()
        img_url = self._amazon_image_url(soup, image_selector)
        logger.info("Finished parsing Amazon product detail page for {}".format(self.sku))
        return {
            "product_name": name,
            "review_count": review_count,
            "page_count": page_count,
            "img_url": img_url,
        }
Beispiel #9
0
    def edit_lambdas(self, pParameters):
        '''
        Updates the db entries of functions with the data of the klambda file

        Functions that fail validation or that have no db entry are reported
        with an error and skipped.

        :param list pParameters: names of the functions to edit; an empty
            list selects everything returned by get_lambdas()
        '''
        functions = self.get_lambdas() if len(pParameters) == 0 else pParameters
        for function in self.functions_list:
            function_name = next(iter(function)) # first key of the dict
            if function_name not in functions:
                continue
            info = function[function_name]
            if not self.validate_function(info):
                logger.error("The function %s information is incomplete" % function_name)
                continue
            key = {'name': function_name, 'author': info['author']}
            if not self.client.check_item("Klambda_functions", key):
                logger.error("The function %s does not exist under %s runtime" % (function_name, info['runtime']))
                continue

            item = self.client.get_item("Klambda_functions", dict(key))
            self.validate_user(item['author']) # verify if current user can perfom this action
            # The assignment order is kept as is: it decides the key order
            # of newly added fields and with it the serialized form below.
            item['runtime'] = info['runtime']
            item['description'] = info['description']
            item['category'] = info['category']
            item['version'] = str(info['version'])
            item['repo_url'] = info['repo_url']
            item['folder_path'] = info['folder_path']
            item['last_update'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            # NOTE(review): create_lambdas hashes with sha256 while this
            # uses md5 - confirm which digest checksum consumers expect.
            serialized = json.dumps(item).encode('utf-8')
            item['checksum'] = hashlib.md5(serialized).hexdigest()
            self.client.put_item("Klambda_functions", item)
            self.write_data(item, function_name) # stores checksum on klambda file
            logger.info("The function %s edited succesfully under %s runtime" % (function_name, info['runtime']))
Beispiel #10
0
    def create_lambdas(self, pParameters):
        '''
        Creates db entries for functions described in the klambda file

        Functions that fail validation or that already have a db entry for
        their author are reported with an error and skipped.

        :param list pParameters: names of the functions to create; an empty
            list selects everything returned by get_lambdas()
        '''
        functions = self.get_lambdas() if len(pParameters) == 0 else pParameters
        for function in self.functions_list:
            function_name = next(iter(function)) # first key of the dict
            if function_name not in functions:
                continue
            info = function[function_name]
            if not self.validate_function(info):
                logger.error("The function %s information is incomplete" % function_name)
                continue
            self.validate_user(info['author']) # validates the user logged in
            if self.client.check_item("Klambda_functions", {'name': function_name, 'author': info['author']}):
                logger.error("The function %s already exists for the author %s" % (function_name, info['author']))
                continue

            # The entry of the klambda file itself is completed and stored.
            info['name'] = function_name
            info['version'] = str(info['version'])
            info['created_on'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            info['last_update'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
            serialized = json.dumps(info).encode('utf-8')
            info['checksum'] = hashlib.sha256(serialized).hexdigest()
            self.client.put_item("Klambda_functions", info)
            self.write_data(info, function_name) # stores checksum on klambda file
            logger.info("The function %s created succesfully from the author %s" % (function_name, info['author']))
Beispiel #11
0
def _nlp_reset(sku):
    """
    Remove every stored sentence that belongs to a product.

    :param sku: product sku
    """
    DB.init_db(config.get("sent_db")).sentences.delete_many({"sku": sku})
    logger.info("Cleared sentence table for " + sku)
Beispiel #12
0
def ingest(raw):
    """
    Save raw product reviews to the database.

    Every record is upserted with the record itself as the filter, so a
    record that is already stored unchanged is not inserted a second time.

    :param raw: an iterable of review records (dictionaries)
    """
    # Look the collection up once and under its own name. It used to be
    # bound to ``raw`` inside the loop, shadowing the argument being
    # iterated.
    collection = db.raw
    for record in raw:
        # replace_one is the non-deprecated equivalent of the legacy
        # update(spec, document, upsert=True) call for whole documents.
        # NOTE(review): assumes db is a PyMongo (>= 3.0) database - confirm.
        collection.replace_one(record, record, upsert=True)
        logger.info("Added record to raw feed db")
Beispiel #13
0
 def get_validation_token(self, uid, email):
     """
     Return the confirmation code of a pending email address of a user.

     :param uid: user id; the stored key is USER_DB followed by uid
     :param email: email address to look up
     :return: the stored 'conf_code' when the address is known and its
         state is 'pending'; None otherwise
     """
     stored = self._db.get('%s%s' % (self.USER_DB, uid))
     if stored is None:
         logger.warn('No user information found for uid %s' % uid)
         return None

     user = cjson.decode(stored)
     address = email.encode('ascii')
     if address in user.get('emails', {}):
         entry = user['emails'][address]
         if entry.get('state', None) == 'pending':
             return entry.get('conf_code', None)

     logger.info('No validation token found for uid %s ' % uid)
     return None
Beispiel #14
0
    def update_function(self, pFunctionName):
        '''
        Refreshes the local folder of a function from its repository

        The folder is expected under the current directory. When it is
        missing only an error is logged.

        :param str pFunctionName: the function name, also its folder name
        '''
        folder = './'+pFunctionName
        if not os.path.isdir(folder):
            logger.error("The function %s is not configured in your project" % pFunctionName)
            return
        local.LocalClient(folder).update(folder)
        logger.info("The function %s updated succesfully" % pFunctionName)
Beispiel #15
0
    def download_function(self, pUrl, pFunctionName):
        '''
        Checks a function folder out of a repository into the current directory

        When a folder with the function name already exists nothing is
        downloaded and an error is logged.

        :param str pUrl: the url of the repository to download from
        :param str pFunctionName: the function name, also its folder name
        '''
        # the client is created before the folder check, as before
        repository = remote.RemoteClient(pUrl)
        folder = './'+pFunctionName
        if os.path.isdir(folder):
            logger.error("The function %s is already configured in your project, please update in order to get last changes" % pFunctionName)
            return
        repository.checkout(folder)
        logger.info("The function %s downloaded succesfully" % pFunctionName)
Beispiel #16
0
def _is_in_queue(sku):
    """Tell whether the queue holds at least one entry for sku.

    :param sku: product sku
    :return: True when the queue collection has an entry with this sku,
        False otherwise
    """
    queue = DB.init_db(config.get("queue_db")).queue
    queued = list(queue.find({"sku": sku}))
    if not queued:
        return False
    logger.info(sku + " is already in the queue")
    return True
Beispiel #17
0
def _reviews_scraped(sku):
    """
    Tell whether raw reviews are stored for sku.

    :param sku: product sku
    :return: True when the raw collection has at least one record with
        this sku, False otherwise
    """
    raw_reviews = DB.init_db(config.get("ingestion_db")).raw
    ingested = len(list(raw_reviews.find({"sku": sku}))) > 0
    if ingested:
        message = sku + " reviews have been parsed and ingested"
    else:
        message = sku + " has neither been parsed nor ingested"
    logger.info(message)
    return ingested
Beispiel #18
0
 def _amazon_review_parser(self, soup):
     """
     Collect the reviews of one parsed review page and hand them to
     ingestion.

     :param soup: the parsed review page; ``find_all`` is called on it
     """
     # Raw string: '\w' in a plain literal is an invalid escape sequence
     # that newer Python versions warn about. The pattern is unchanged.
     review_id = re.compile(r'customer_review-\w+')
     sel = selectors.get(self.source).get("review_text")
     raw = [
         {
             "product_name": self.prod_name,
             "source": self.source,
             "sku": self.sku,
             "review_text": review.find('span', sel).text,
             "sent_tokenized": False
         }
         for review in soup.find_all('div', id=review_id)
     ]
     ingestion.ingest(raw)
     logger.info("Finished parsing single-page reviews for {} from {}".format(self.sku, self.source))
Beispiel #19
0
    def delete_object(self, pBucketName, pObjectPath):
        '''
        Deletes from an S3 bucket every object whose key starts with a prefix

        A Botocore client error is not propagated to the caller: it is
        logged and exit() is called.

        :param str pBucketName: name of the bucket
        :param str pObjectPath: key prefix of the objects to delete
        '''
        try:
            bucket = self.__resource.Bucket(pBucketName)
            bucket.objects.filter(Prefix=pObjectPath).delete()
        except botocore.exceptions.ClientError as err:
            logger.error(err)
            exit()
        else:
            logger.info("Object %s deleted succesfully from %s" %
                        (pObjectPath, bucket))
Beispiel #20
0
def _detail_parsed(sku):
    """
    Tell whether product details are available for sku.

    :param sku: product sku
    :return: True when _db_product_details returns a non-empty result,
        False otherwise
    """
    parsed = bool(_db_product_details(sku))
    if parsed:
        message = "Product detail page for {} has already been parsed"
    else:
        message = (
            "Product detail page for {} is yet to be downloaded and parsed")
    logger.info(message.format(sku))
    return parsed
Beispiel #21
0
    def resend_code(self, pClientId, pUsername):
        '''
        Asks the user pool to resend the confirmation code of a user

        A Botocore client error is not propagated to the caller: it is
        logged and exit() is called.

        :param str pClientId: id of user pool app client
        :param str pUsername: name of the user the code is resent to
        '''
        request = {'ClientId': pClientId, 'Username': pUsername}
        try:
            self.__client.resend_confirmation_code(**request)
            logger.info("Code resent...")
        except botocore.exceptions.ClientError as err:
            logger.error(err)
            exit()
Beispiel #22
0
def _is_trained(sku):
    """
    Tell whether a model file named after sku exists in the doc2vec path.

    When it does, the status of the sku is set to ready.

    :param sku: product sku
    :return: True when a regular file named sku is found, False otherwise
    """
    #TODO: Always best to retrain with more data so if there are new raw reviews, retrain
    model_dir = config.get("doc2vec").get("path")
    model_files = [
        name for name in listdir(model_dir) if isfile(join(model_dir, name))
    ]
    if sku not in model_files:
        logger.info(sku + " does not have a trained model. Start training")
        return False
    _set_status(__ready__, sku)
    logger.info(sku + " has a trained model")
    return True
Beispiel #23
0
def add_to_queue(source, sku, page_count):
    """
    Generate review URLs and add them to the queue.

    A URL that is already queued for the sku is replaced (its timestamp is
    refreshed) instead of being queued a second time.

    :param source: merchant
    :param sku: product sku
    :param page_count: number of pages with reviews
    """
    for url in _build_urls(source, sku, page_count):
        # Match on what identifies a queue entry. The fresh timestamp used
        # to be part of the filter, so the upsert never matched an existing
        # entry and every call inserted duplicates.
        key = {"url": url, "sku": sku}
        record = dict(key, timestamp=time.time())
        # NOTE(review): assumes q_db is a PyMongo (>= 3.0) collection, where
        # replace_one supersedes the legacy update() call - confirm.
        q_db.replace_one(key, record, upsert=True)
    logger.info("Added new URLs to the queue for " + sku)
Beispiel #24
0
    def verify_email(self, pAccessToken, pCode):
        '''
        Verifies the email attribute of a user with a verification code

        A Botocore client error is not propagated to the caller: it is
        logged and exit() is called.

        :param str pAccessToken: access token of the user
        :param str pCode: the verification code
        '''
        request = {
            'AccessToken': pAccessToken,
            'AttributeName': 'email',
            'Code': pCode,
        }
        try:
            self.__client.verify_user_attribute(**request)
            logger.info("Your email verified correctly")
        except botocore.exceptions.ClientError as err:
            logger.error(err)
            exit()
Beispiel #25
0
    def upload_object(self, pBucketName, pObjectPath, pDestPath):
        '''
        Uploads an object to S3 bucket

        A Botocore client error is not propagated to the caller: it is
        logged and exit() is called.

        :param str pBucketName: name of the bucket
        :param str pObjectPath: path of file to upload
        :param str pDestPath: path to upload file
        '''
        try:
            klambda_bucket = self.__resource.Bucket(pBucketName)
            klambda_bucket.upload_file(pObjectPath, pDestPath)
            # Log the uploaded file; the bucket object used to be printed
            # in the place of the object.
            logger.info("Object %s successfully uploaded to %s" %
                        (pObjectPath, pDestPath))
        except botocore.exceptions.ClientError as err:
            logger.error(err)
            exit()
Beispiel #26
0
    def _load_model(self):
        """
        Load the stored model of the sku, if there is one.

        Side effect: "/" and the sku are appended to self.path on every
        call; the model is loaded from the resulting path.

        return model: the loaded model, or None when loading fails with
        an IOError
        """
        #TODO: could models belonging to products from different merchants
        #TODO: have conflicting names due to identical SKUs? How likely is this?
        self.path = self.path + "/" + self.sku
        try:
            loaded = Doc2Vec.load(self.path)
            logger.info("Model successfully loaded")
            return loaded
        except IOError:
            logger.warn("Model not found")
            return None
Beispiel #27
0
def vote_to_db(question, answer, sku, up_count, down_count):
    """
    Store the user votes on a question-answer pair.

    The vote is wrapped in a qna_clustering.Cluster, which writes it to the
    votes database. Storing every question-answer combination as its own
    pair would create many duplicate entries; see the comments of the
    question/answer pair classifier on how this is dealt with.
    """
    vote = {
        "question": question,
        "answer": answer,
        "sku": sku,
        "up_count": up_count,
        "down_count": down_count
    }
    votes_db = DB.init_db(config.get("votes_db"))
    qna_clustering.Cluster(vote).put_votes(votes_db)
    logger.info(vote)
Beispiel #28
0
    def delete_lambdas(self, pParameters):
        '''
        Deletes the db entries of functions listed in the klambda file

        Functions without a db entry are reported with an error and skipped.

        :param list pParameters: names of the functions to delete; an empty
            list selects everything returned by get_lambdas()
        '''
        functions = self.get_lambdas() if len(pParameters) == 0 else pParameters
        for function in self.functions_list:
            function_name = next(iter(function)) # first key of the dict
            if function_name not in functions:
                continue
            info = function[function_name]
            key = {'name': function_name, 'author': info['author']}
            if not self.client.check_item("Klambda_functions", key):
                logger.error("The function %s does not exist under %s runtime" % (function_name, info['runtime']))
                continue
            item = self.client.get_item("Klambda_functions", dict(key))
            self.validate_user(item['author']) # verify if current user can perfom this action
            self.client.delete_item("Klambda_functions", {'name': function_name, 'author': info['author']})
            logger.info("The function %s deleted succesfully under %s runtime" % (function_name, info['runtime']))
Beispiel #29
0
    def train(self):
        """
        Train a new doc2vec model for a given SKU using the PV-DBOW
        (paragraph vector - distributed bag of words) algorithm. If the
        model already exists, do nothing.

        We set dm=0 to disable the distributed memory algorithm
        (presumably through the parameters returned by _get_params()).
        dm=1 gave us vector inferences that made no sense. However,
        PV-DBOW gives us exactly what we want even though the actual
        probability of the predictions is 0.60 - 0.65. Predictions up to
        0.90 are possible with an optimized, i.e. less ambiguous, match
        query.

        The learning rate is lowered by hand: it starts at the configured
        "alpha" and is reduced by the same amount after each of the
        configured "epochs" passes. The trained model is saved to
        self.path.
        """
        d2v_model = self._load_model()
        if d2v_model is not None:
            return

        logger.info("Training a new model for SKU " + self.sku)
        tagged_docs = self._tagged_docs()

        # Set some parameters
        params = self._get_params()
        doc2vec_config = config.get("doc2vec")
        alpha = doc2vec_config.get("alpha")
        min_alpha = doc2vec_config.get("min_alpha")
        epochs = doc2vec_config.get("epochs")
        alpha_delta = (alpha - min_alpha) / epochs

        # Build an untrained model
        d2v_model = Doc2Vec(**params)
        d2v_model.build_vocab(tagged_docs)

        # Train away!
        for epoch in range(epochs):
            random.shuffle(tagged_docs)
            # pin the learning rate for this pass; it is decayed below
            d2v_model.alpha, d2v_model.min_alpha = alpha, alpha
            d2v_model.train(
                tagged_docs,
                total_examples=d2v_model.corpus_count,
                epochs=d2v_model.iter)
            alpha -= alpha_delta
        logger.info("Finished training for SKU " + self.sku)
        d2v_model.save(self.path)
Beispiel #30
0
    def sign_up(self, pClientId, pKlambdaUser):
        '''
        Registers a user in a user pool with the name, email and updated_at
        attributes and then confirms the user

        A Botocore client error is not propagated to the caller: it is
        logged and exit() is called.

        :param str pClientId: id of user pool app client
        :param object pKlambdaUser: KlambdaUser object; its username,
            password, name and email attributes are read
        '''
        import time  # local import, only needed for the timestamp below

        # Unix timestamp of the registration. This used to be a fixed date
        # (2012-04-01) formatted with '%s', a strftime directive that is
        # not available on every platform.
        updated_at = str(int(time.time()))
        try:
            self.__client.sign_up(
                ClientId=pClientId,
                Username=pKlambdaUser.username,
                Password=pKlambdaUser.password,
                UserAttributes=[
                    {
                        'Name': 'name',
                        'Value': pKlambdaUser.name
                    },
                    {
                        'Name': 'email',
                        'Value': pKlambdaUser.email
                    },
                    {
                        'Name': 'updated_at',
                        'Value': updated_at
                    }
                ],
            )
            self.confirm_user(pKlambdaUser.username)
            logger.info("User %s successfully registered" %
                        pKlambdaUser.username)
        except botocore.exceptions.ClientError as err:
            logger.error(err)
            exit()
Beispiel #31
0
    def create_project(self, pParameters):
        '''
        Stores self.project as a new project entry in db and uploads its files

        When an entry with the same name and author already exists only an
        error is logged.

        :param list pParameters: a list of optional parameters (not read here)
        '''
        # verify if current user can perfom this action
        self.validate_user(self.project)
        key = {
            'name': self.project['name'],
            'author': self.project['author']
        }
        if self.client.check_item("Klambda_projects", key):
            logger.error("The project %s already exists" %
                         self.project['name'])
            return

        stamp_format = "%Y-%m-%d %H:%M:%S"
        self.project['created_on'] = datetime.datetime.now().strftime(
            stamp_format)
        self.project['last_update'] = datetime.datetime.now().strftime(
            stamp_format)
        self.client.put_item("Klambda_projects", self.project)
        logger.info("The project %s created succesfully" %
                    self.project['name'])
        # uploads listed files on S3
        self.upload_files(self.project['files'])