def get_status(sku):
    response = {"status": __in_queue__}
    status = None
    try:
        status = list(db_status.find({"sku": sku}))[0]
        status = status.get("msg")
        db_details = DB.init_db(config.get("details_db")).product_details
        product = list(db_details.find({"sku": sku}))
        product_url, product_name, image_url = "", "", ""
        if product:
            product_name = product[0].get("product_name")
            product_url = product[0].get("url")
            image_url = product[0].get("img")
        logger.info("Status for {}: {}".format(sku, status))
        return {
            "status": status,
            "product_name": product_name,
            "product_url": product_url,
            "image_url": image_url,
        }
    except IndexError:
        # This happens either because of a race condition (the sku hasn't
        # been added to the database yet) or because the sku simply doesn't
        # exist. The second case is only true if the URL has been typed in
        # manually or bookmarked but the sku is missing from the URL.
        logger.warning(
            "Product status not yet available for sku {}".format(sku))
        _set_status(__in_queue__, sku)
        response = {"status": __in_queue__}
    except Exception as e:
        logger.exception(e)
        response = {"status": __error__}
    return response
def edit_project(self, pParameters):
    '''
    Edits a project in the db with klambda file info
    :param list pParameters: a list of optional parameters
    '''
    if self.client.check_item("Klambda_projects", {
            'name': self.project['name'],
            'author': self.project['author']
    }):
        item = self.client.get_item("Klambda_projects", {
            'name': self.project['name'],
            'author': self.project['author']
        })
        self.validate_user(item)  # verify the current user can perform this action
        item['users'] = self.project['users']
        item['description'] = self.project['description']
        item['repo_url'] = self.project['repo_url']
        item['files'] = self.project['files']
        item['last_update'] = datetime.datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")
        self.client.put_item("Klambda_projects", item)
        destination_path = self.project['name'] + "-" + self.project['author']
        self.__s3_client.delete_object('klambda', destination_path)  # deletes the project folder
        self.upload_files(self.project['files'])  # uploads the listed files to S3
        logger.info("The project %s edited successfully" % self.project['name'])
    else:
        logger.error("The project %s doesn't exist" % self.project['name'])
def delete_project(self, pParameters):
    '''
    Deletes a project from the db
    :param list pParameters: a list of optional parameters
    '''
    if self.client.check_item("Klambda_projects", {
            'name': self.project['name'],
            'author': self.project['author']
    }):
        item = self.client.get_item("Klambda_projects", {
            'name': self.project['name'],
            'author': self.project['author']
        })
        self.validate_user(item)  # verify the current user can perform this action
        self.client.delete_item("Klambda_projects", {
            'name': self.project['name'],
            'author': self.project['author']
        })
        destination_path = self.project['name'] + "-" + self.project['author']
        self.__s3_client.delete_object('klambda', destination_path)
        logger.info("The project %s deleted successfully" % self.project['name'])
    else:
        logger.error("The project %s doesn't exist" % self.project['name'])
def _db_product_details(sku):
    """ Return product details from the database. """
    db_details = DB.init_db(config.get("details_db")).product_details
    product = list(db_details.find({"sku": sku}))
    if product:
        product_name = product[0].get("product_name")
        product_url = product[0].get("url")
        image_url = product[0].get("img")
        review_count = product[0].get("review_count")
        page_count = product[0].get("review_page_count")
        valid = all(
            [product_name, product_url, image_url, review_count, page_count])
        if valid:
            return {
                "product_name": product_name,
                "review_count": review_count,
                "page_count": page_count,
            }
        else:
            logger.info(
                "Unable to validate product details for {}. Deleting entry".format(sku))
            db_details.delete_one({"sku": sku})
    return {}
def _get_product_details(source, url, sku):
    """ Scrape product metadata.

    :param source: merchant
    :param url: canonical product url
    :param sku: product sku
    :return: parsed product details (name, review count, page count, image url)
    """
    sc = scraper.Scraper(source=source)
    response = sc.get_request(url)
    pr = parser.Parser(sku=sku, source=source)
    res = pr.parse(response, init=True)
    if res:
        # Save it to the database
        db_details = DB.init_db(config.get("details_db"))
        db_details = db_details.product_details
        record = {
            "status": "processing",
            "url": url,
            "product_name": res.get("product_name"),
            "review_count": res.get("review_count"),
            "review_page_count": res.get("page_count"),
            "source": source,
            "sku": sku,
            "img": res.get("img_url"),
            "timestamp": time.time(),
        }
        db_details.insert_one(record)
        logger.info("Saved new product details: ")
        logger.info(record)
    return res
def start(url, progress=False):
    """ Initiate scraping, parsing, data ingestion, preprocessing, and training.

    Note that this is a blocking call. Flask will wait on start(). We don't
    want that. We want the script to return control to Flask immediately and
    continue with the data processing. That way, the sku status can be updated
    at the appropriate time and everyone is happy :).
    """
    logger.info("Received new url to process: {}".format(url))
    decoded = _decode_url(url)
    if not decoded:
        return {}
    if progress:
        sku = decoded[1]
        __in_progress__ = True
        status = get_status(sku)
        if status.get("status") == __ready__:
            __in_progress__ = False
        response = {
            "sku": sku,
            "product_url": decoded[2],
            "in_progress": __in_progress__,
        }
        response.update(status)
        return response
    else:
        executor = ThreadPoolExecutor(max_workers=1)
        executor.submit(_workflow, decoded, url)
        executor.shutdown(wait=False)
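# A minimal sketch of how a Flask route might hand a URL to start() and return
# immediately; the route path, app object, and query parameter below are
# illustrative assumptions and not part of this module.
from flask import Flask, jsonify, request

app = Flask(__name__)


@app.route("/process")
def process():
    url = request.args.get("url", "")
    # start() submits _workflow to a ThreadPoolExecutor and returns right
    # away, so the request is not blocked while scraping and training run.
    result = start(url, progress=False)
    return jsonify(result or {})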
def infer(self, query):
    '''
    Query the model to infer vectors for an unseen sentence and return a
    list of sentence tags and their probabilities.

    steps: number of iterations (?); tested step=1 to step=10e6. 10e5 is optimal
    topn: number of top n sentences to return; n is determined empirically
          based on the quality of the predictions generated at or above a
          given probability threshold

    :param query: an enriched query
    :return sents: a list of inferred sentences
    '''
    # TODO: validate the query is a valid statement
    steps = config.get("doc2vec").get("inference").get("steps")
    topn = config.get("doc2vec").get("inference").get("topn")
    logger.info('Sentence inference in progress')
    query_tokens = query.split()
    inference = self.d2v_model.infer_vector(query_tokens, steps=steps)
    sims = self.d2v_model.docvecs.most_similar([inference], topn=topn)
    summary = self._summarize(map(lambda x: self._lookup(x[0]), sims))
    probs = [tup[1] for tup in sims]
    confidence = reduce(lambda x, y: x + y, probs) / len(probs)
    if not summary:
        summary = __default_answer__
        confidence = 0.00
    return summary, round(confidence, 2)
def _amazon_detail_parser(self, soup):
    logger.info("Started parsing Amazon product detail page for {}".format(self.sku))
    rcount_sel_outer = selectors.get(self.source).get("review_count_outer")
    rcount_sel_inner = selectors.get(self.source).get("review_count_inner")
    name_selector = selectors.get(self.source).get("product_name")
    image_selector = selectors.get(self.source).get("product_image")

    # get review count
    review_count = -1
    try:
        review_count = self._amazon_review_count(soup, rcount_sel_outer,
                                                 rcount_sel_inner)
    except Exception as e:
        # if we have a problem parsing the review count, then
        # we have nothing to work with
        logger.exception(e)
        return {}
    page_count = self._get_page_count(review_count, divisor=10)

    # get product name and image url
    name = soup.select(name_selector)[0].text.strip()
    img_url = self._amazon_image_url(soup, image_selector)

    logger.info("Finished parsing Amazon product detail page for {}".format(self.sku))
    return {
        "product_name": name,
        "review_count": review_count,
        "page_count": page_count,
        "img_url": img_url,
    }
def edit_lambdas(self, pParameters):
    '''
    Edits a function's information in the db from the klambda file
    :param list pParameters: a list of function names to edit
    '''
    if len(pParameters) == 0:
        # if no functions are listed, get all the functions from the klambda file
        functions = self.get_lambdas()
    else:
        functions = pParameters
    for function in self.functions_list:
        function_name = next(iter(function))  # get first key of dict
        if function_name in functions:
            if self.validate_function(function[function_name]):
                if self.client.check_item("Klambda_functions",
                                          {'name': function_name,
                                           'author': function[function_name]['author']}):
                    item = self.client.get_item("Klambda_functions",
                                                {'name': function_name,
                                                 'author': function[function_name]['author']})
                    self.validate_user(item['author'])  # verify the current user can perform this action
                    item['runtime'] = function[function_name]['runtime']
                    item['description'] = function[function_name]['description']
                    item['category'] = function[function_name]['category']
                    item['version'] = str(function[function_name]['version'])
                    item['repo_url'] = function[function_name]['repo_url']
                    item['folder_path'] = function[function_name]['folder_path']
                    item['last_update'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    item['checksum'] = hashlib.md5(json.dumps(item).encode('utf-8')).hexdigest()
                    self.client.put_item("Klambda_functions", item)
                    self.write_data(item, function_name)  # stores the checksum in the klambda file
                    logger.info("The function %s edited successfully under %s runtime"
                                % (function_name, function[function_name]['runtime']))
                else:
                    logger.error("The function %s does not exist under %s runtime"
                                 % (function_name, function[function_name]['runtime']))
            else:
                logger.error("The function %s information is incomplete" % function_name)
def create_lambdas(self, pParameters):
    '''
    Uploads a function's information and creates it in the db from the klambda file
    :param list pParameters: a list of function names to create
    '''
    if len(pParameters) == 0:
        # if no functions are listed, get all the functions from the klambda file
        functions = self.get_lambdas()
    else:
        functions = pParameters
    for function in self.functions_list:
        function_name = next(iter(function))  # get first key of dict
        if function_name in functions:
            if self.validate_function(function[function_name]):
                self.validate_user(function[function_name]['author'])  # validates the logged-in user
                if not self.client.check_item("Klambda_functions",
                                              {'name': function_name,
                                               'author': function[function_name]['author']}):
                    function[function_name]['name'] = function_name
                    function[function_name]['version'] = str(function[function_name]['version'])
                    function[function_name]['created_on'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    function[function_name]['last_update'] = datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S")
                    function[function_name]['checksum'] = hashlib.sha256(
                        json.dumps(function[function_name]).encode('utf-8')).hexdigest()
                    self.client.put_item("Klambda_functions", function[function_name])
                    self.write_data(function[function_name], function_name)  # stores the checksum in the klambda file
                    logger.info("The function %s created successfully for the author %s"
                                % (function_name, function[function_name]['author']))
                else:
                    logger.error("The function %s already exists for the author %s"
                                 % (function_name, function[function_name]['author']))
            else:
                logger.error("The function %s information is incomplete" % function_name)
def _nlp_reset(sku):
    """ Clear the database of any existing sentences for sku.

    :param sku: product sku
    """
    db_sents = DB.init_db(config.get("sent_db")).sentences
    db_sents.delete_many({"sku": sku})
    logger.info("Cleared sentence table for " + sku)
def ingest(raw):
    """ Save raw product reviews to the database.

    :param raw: a list of review records (JSON dicts)
    """
    # Create/load the raw collection once instead of rebinding the `raw`
    # argument inside the loop
    raw_collection = db.raw
    for record in raw:
        raw_collection.update(record, record, upsert=True)
        logger.info("Added record to raw feed db")
def get_validation_token(self, uid, email):
    user_info = self._db.get('%s%s' % (self.USER_DB, uid))
    if user_info is None:
        logger.warn('No user information found for uid %s' % uid)
        return None
    user = cjson.decode(user_info)
    aemail = email.encode('ascii')
    if (aemail in user.get('emails', {})
            and user['emails'][aemail].get('state', None) == 'pending'):
        return user['emails'][aemail].get('conf_code', None)
    logger.info('No validation token found for uid %s ' % uid)
    return None
def update_function(self, pFunctionName):
    '''
    Updates the folder of a function from a repository
    :param str pFunctionName: the function name
    '''
    if os.path.isdir('./' + pFunctionName):
        l = local.LocalClient('./' + pFunctionName)
        l.update('./' + pFunctionName)
        logger.info("The function %s updated successfully" % pFunctionName)
    else:
        logger.error("The function %s is not configured in your project" % pFunctionName)
def download_function(self, pUrl, pFunctionName):
    '''
    Downloads the folder of a function from a repository
    :param str pUrl: the download URL of the repository
    :param str pFunctionName: the function name
    '''
    r = remote.RemoteClient(pUrl)
    if not os.path.isdir('./' + pFunctionName):
        r.checkout('./' + pFunctionName)
        logger.info("The function %s downloaded successfully" % pFunctionName)
    else:
        logger.error("The function %s is already configured in your project; "
                     "please update it to get the latest changes" % pFunctionName)
def _is_in_queue(sku):
    """ Return True if any URLs belonging to sku are in the queue.
    Return False otherwise.

    :param sku: product sku
    :return: whether or not there are review urls in the queue
    """
    db_q = DB.init_db(config.get("queue_db")).queue
    queue = list(db_q.find({"sku": sku}))
    if len(queue) > 0:
        logger.info(sku + " is already in the queue")
        return True
    return False
def _reviews_scraped(sku):
    """ Return True if sku reviews have been parsed. Return False otherwise.

    :param sku: product sku
    :return: whether or not the reviews have been scraped, parsed, and ingested
    """
    db_raw = DB.init_db(config.get("ingestion_db")).raw
    feed = list(db_raw.find({"sku": sku}))
    if len(feed) > 0:
        logger.info(sku + " reviews have been parsed and ingested")
        return True
    logger.info(sku + " has neither been parsed nor ingested")
    return False
def _amazon_review_parser(self, soup):
    review_list = soup.find_all('div', id=re.compile(r'customer_review-\w+'))
    sel = selectors.get(self.source).get("review_text")
    raw = []
    for review in review_list:
        record = {
            "product_name": self.prod_name,
            "source": self.source,
            "sku": self.sku,
            "review_text": review.find('span', sel).text,
            "sent_tokenized": False
        }
        raw.append(record)
    ingestion.ingest(raw)
    logger.info("Finished parsing single-page reviews for {} from {}".format(self.sku, self.source))
def delete_object(self, pBucketName, pObjectPath):
    '''
    Deletes an object from an S3 bucket
    :param str pBucketName: name of the bucket
    :param str pObjectPath: path of the file in S3
    :raises: Botocore Client Exception
    '''
    try:
        klambda_bucket = self.__resource.Bucket(pBucketName)
        klambda_bucket.objects.filter(Prefix=pObjectPath).delete()
        logger.info("Object %s deleted successfully from %s" % (pObjectPath, klambda_bucket))
    except botocore.exceptions.ClientError as err:
        logger.error(err)
        exit()
def _detail_parsed(sku):
    """ Return True if the detail page of sku has been parsed.
    Return False otherwise.

    :param sku: product sku
    :return: whether or not the product detail page has been parsed
    """
    if _db_product_details(sku):
        logger.info(
            "Product detail page for {} has already been parsed".format(sku))
        return True
    logger.info(
        "Product detail page for {} is yet to be downloaded and parsed".format(sku))
    return False
def resend_code(self, pClientId, pUsername):
    '''
    Resends the confirmation code to a user in the user pool
    :param str pClientId: id of user pool app client
    :param str pUsername: username of the user
    :raises: Botocore Client Exception
    '''
    try:
        self.__client.resend_confirmation_code(ClientId=pClientId, Username=pUsername)
    except botocore.exceptions.ClientError as err:
        logger.error(err)
        exit()
    else:
        logger.info("Code resent...")
def _is_trained(sku):
    """ Return True if a model has already been trained for this product.
    Return False otherwise.

    :param sku: product sku
    :return: whether or not there is a doc2vec model
    """
    # TODO: Always best to retrain with more data, so if there are new raw reviews, retrain
    mypath = config.get("doc2vec").get("path")
    onlyfiles = [f for f in listdir(mypath) if isfile(join(mypath, f))]
    if sku in onlyfiles:
        _set_status(__ready__, sku)
        logger.info(sku + " has a trained model")
        return True
    logger.info(sku + " does not have a trained model. Start training")
    return False
def add_to_queue(source, sku, page_count):
    """ Generate review URLs and add them to the queue.

    :param source: merchant
    :param sku: product sku
    :param page_count: number of pages with reviews
    """
    urls = _build_urls(source, sku, page_count)
    for url in urls:
        record = {
            "url": url,
            "sku": sku,
            "timestamp": time.time(),
        }
        q_db.update(record, record, upsert=True)
    logger.info("Added new URLs to the queue for " + sku)
def verify_email(self, pAccessToken, pCode):
    '''
    Verifies the email attribute of a user in the user pool
    :param str pAccessToken: access token of the logged-in user
    :param str pCode: verification code sent to the email address
    :raises: Botocore Client Exception
    '''
    try:
        self.__client.verify_user_attribute(AccessToken=pAccessToken,
                                            AttributeName='email',
                                            Code=pCode)
    except botocore.exceptions.ClientError as err:
        logger.error(err)
        exit()
    else:
        logger.info("Your email was verified correctly")
def upload_object(self, pBucketName, pObjectPath, pDestPath):
    '''
    Uploads an object to an S3 bucket
    :param str pBucketName: name of the bucket
    :param str pObjectPath: path of the file to upload
    :param str pDestPath: destination path for the uploaded file
    :raises: Botocore Client Exception
    '''
    try:
        klambda_bucket = self.__resource.Bucket(pBucketName)
        klambda_bucket.upload_file(pObjectPath, pDestPath)
        logger.info("Object %s successfully uploaded to %s" % (pObjectPath, pDestPath))
    except botocore.exceptions.ClientError as err:
        logger.error(err)
        exit()
def _load_model(self):
    """ Load a trained model if one already exists for a given sku.
    Otherwise, return None.

    :return model: a previously trained model or None

    # TODO: could models belonging to products from different merchants
    # TODO: have conflicting names due to identical SKUs? How likely is this?
    """
    self.path += "/" + self.sku
    model = None
    try:
        model = Doc2Vec.load(self.path)
        logger.info("Model successfully loaded")
    except IOError:
        logger.warn("Model not found")
    return model
def vote_to_db(question, answer, sku, up_count, down_count):
    """ Save user voting on a question-answer pair to the database.

    Saving every question-answer combination as a unique pair would result in
    many duplicate entries in the database. Please refer to the comments on
    the question/answer pair classifier for more details on how to deal with
    this problem.
    """
    db_votes = DB.init_db(config.get("votes_db"))
    record = {
        "question": question,
        "answer": answer,
        "sku": sku,
        "up_count": up_count,
        "down_count": down_count
    }
    cluster = qna_clustering.Cluster(record)
    cluster.put_votes(db_votes)
    logger.info(record)
def delete_lambdas(self, pParameters):
    '''
    Deletes a function from the database
    :param list pParameters: a list of function names to delete
    '''
    if len(pParameters) == 0:
        # if no functions are listed, get all the functions from the klambda file
        functions = self.get_lambdas()
    else:
        functions = pParameters
    for function in self.functions_list:
        function_name = next(iter(function))  # get first key of dict
        if function_name in functions:
            if self.client.check_item("Klambda_functions",
                                      {'name': function_name,
                                       'author': function[function_name]['author']}):
                item = self.client.get_item("Klambda_functions",
                                            {'name': function_name,
                                             'author': function[function_name]['author']})
                self.validate_user(item['author'])  # verify the current user can perform this action
                self.client.delete_item("Klambda_functions",
                                        {'name': function_name,
                                         'author': function[function_name]['author']})
                logger.info("The function %s deleted successfully under %s runtime"
                            % (function_name, function[function_name]['runtime']))
            else:
                logger.error("The function %s does not exist under %s runtime"
                             % (function_name, function[function_name]['runtime']))
def train(self):
    """ Train a new doc2vec model for a given SKU using the PV-DBOW
    (paragraph vectors - distributed bag of words) algorithm. If the model
    already exists, do nothing.

    We set dm=0 to disable the distributed memory algorithm. dm=1 gave us
    vector inferences that made no sense. However, PV-DBOW gives us exactly
    what we want, even though the actual probability of the predictions is
    0.60 - 0.65. Predictions up to 0.90 are possible with an optimized, i.e.
    less ambiguous, match query.
    """
    d2v_model = self._load_model()
    if d2v_model is None:
        logger.info("Training a new model for SKU " + self.sku)
        tagged_docs = self._tagged_docs()

        # Set some parameters
        params = self._get_params()
        alpha = config.get("doc2vec").get("alpha")
        min_alpha = config.get("doc2vec").get("min_alpha")
        epochs = config.get("doc2vec").get("epochs")
        alpha_delta = (alpha - min_alpha) / epochs

        # Build an untrained model
        d2v_model = Doc2Vec(**params)
        d2v_model.build_vocab(tagged_docs)

        # Train away!
        for epoch in range(epochs):
            random.shuffle(tagged_docs)
            d2v_model.alpha, d2v_model.min_alpha = alpha, alpha
            train_params = {
                'total_examples': d2v_model.corpus_count,
                'epochs': d2v_model.iter
            }
            d2v_model.train(tagged_docs, **train_params)
            alpha -= alpha_delta

        logger.info("Finished training for SKU " + self.sku)
        d2v_model.save(self.path)
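# A standalone sketch of the PV-DBOW training scheme used above (dm=0 with a
# manually decayed learning rate); the toy corpus and parameter values are
# illustrative assumptions, not the project's config.
import random
from gensim.models.doc2vec import Doc2Vec, TaggedDocument

corpus = [
    TaggedDocument(words=["great", "battery", "life"], tags=["0"]),
    TaggedDocument(words=["screen", "cracked", "after", "a", "week"], tags=["1"]),
]

alpha, min_alpha, epochs = 0.025, 0.001, 10
alpha_delta = (alpha - min_alpha) / epochs

model = Doc2Vec(dm=0, min_count=1)  # dm=0 selects PV-DBOW
model.build_vocab(corpus)

for _ in range(epochs):
    random.shuffle(corpus)
    model.alpha = model.min_alpha = alpha  # freeze the learning rate for this pass
    model.train(corpus, total_examples=model.corpus_count, epochs=1)
    alpha -= alpha_delta  # decay the rate between passes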
def sign_up(self, pClientId, pKlambdaUser):
    '''
    Register a user in a user pool, with given attributes
    :param str pClientId: id of user pool app client
    :param object pKlambdaUser: KlambdaUser object
    :raises: Botocore Client Exception
    '''
    try:
        self.__client.sign_up(
            ClientId=pClientId,
            Username=pKlambdaUser.username,
            Password=pKlambdaUser.password,
            UserAttributes=[
                {
                    'Name': 'name',
                    'Value': pKlambdaUser.name
                },
                {
                    'Name': 'email',
                    'Value': pKlambdaUser.email
                },
                {
                    'Name': 'updated_at',
                    'Value': datetime.datetime(2012, 4, 1, 0, 0).strftime('%s')  # unix timestamp
                }
            ],
        )
        self.confirm_user(pKlambdaUser.username)
        logger.info("User %s successfully registered" % pKlambdaUser.username)
    except botocore.exceptions.ClientError as err:
        logger.error(err)
        exit()
def create_project(self, pParameters):
    '''
    Creates a project in the db with klambda file info
    :param list pParameters: a list of optional parameters
    '''
    self.validate_user(self.project)  # verify the current user can perform this action
    if not self.client.check_item("Klambda_projects", {
            'name': self.project['name'],
            'author': self.project['author']
    }):
        self.project['created_on'] = datetime.datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")
        self.project['last_update'] = datetime.datetime.now().strftime(
            "%Y-%m-%d %H:%M:%S")
        self.client.put_item("Klambda_projects", self.project)
        logger.info("The project %s created successfully" % self.project['name'])
        self.upload_files(self.project['files'])  # uploads the listed files to S3
    else:
        logger.error("The project %s already exists" % self.project['name'])