def get_status(sku):
    """Return the processing status (and product metadata, if known) for *sku*.

    Looks up the latest status message in the status DB and, when available,
    enriches the response with the product's name, URL, and image from the
    details DB.

    :param sku: product identifier to look up.
    :returns: dict with a ``status`` key; on success it also carries
        ``product_name``, ``product_url``, and ``image_url`` (empty strings
        when the detail record is missing).
    """
    try:
        # IndexError here means the sku has no status record yet (see below).
        status_doc = list(db_status.find({"sku": sku}))[0]
        status = status_doc.get("msg")
        db_details = DB.init_db(config.get("details_db")).product_details
        product = list(db_details.find({"sku": sku}))
        product_url, product_name, image_url = "", "", ""
        if product:
            product_name = product[0].get("product_name")
            product_url = product[0].get("url")
            image_url = product[0].get("img")
        logger.info("Status for {}: {}".format(sku, status))
        return {
            "status": status,
            "product_name": product_name,
            "product_url": product_url,
            "image_url": image_url,
        }
    except IndexError:
        # this happens due to a race condition because the sku hasn't been
        # added to the database yet or because it simply doesn't exist. The
        # second case only true if the URL has been typed in manually or
        # bookmarked but the sku is missing from the URL.
        logger.warning(
            "Product status not yet available for sku {}".format(sku))
        _set_status(__in_queue__, sku)
        # Fix: report the queued status we just set; the previous code
        # returned {"status": None} here because `status` was never assigned.
        return {"status": __in_queue__}
    except Exception as e:
        logger.exception(e)
        return {"status": __error__}
def _format_msg(fields, kw, maxlen=_MAXLEN):
    """Render the base CEF message and append custom extensions.

    Extensions (entries of *kw* not already consumed as standard fields) are
    appended smallest-first so that as many as possible fit under *maxlen*;
    once the next extension would overflow, a warning is logged and the
    remainder is dropped.
    """
    parts = [_CEF_FORMAT % fields]
    total = len(parts[0])
    # Sort by (value length, key length) so small extensions go first.
    pending = sorted(
        (len(str(v)), len(k), k, v)
        for k, v in kw.items()
        if k not in _EXTENSIONS
    )
    for v_len, k_len, raw_key, raw_value in pending:
        # +2 accounts for the leading space and the '=' separator.
        cost = v_len + k_len + 2
        ext_value = _convert_ext(raw_value)
        ext_key = _check_key(raw_key)
        if maxlen and total + cost > maxlen:
            # msg is too big.
            warn = 'CEF Message too big. %s %s' % (''.join(parts),
                                                  str(kw.items()))
            logger.warning(warn)
            break
        parts.append(' %s=%s' % (ext_key, ext_value))
        total += cost
    return ''.join(parts)
def main():
    """Entry point for the klambda CLI.

    Parses the command line, handles user sign-up, or authenticates and
    dispatches the requested module command, logging an event per executed
    command.
    """
    logger.config_logs(ROOT_DIR)
    # instance for opening klambda.yml
    processor = file_processor.FileProcessor(os.getcwd() + '/klambda.yml')
    cli_tool = cli.CLI(processor)  # instance of CLI Tool
    client = cognito_client.CognitoClient()
    args = cli_tool.parser.parse_args()  # reads the arguments written by the user
    if args.signup:
        if len(args.signup) == 4:
            user = klambda_user.KlambdaUser(args.signup[0], args.signup[1],
                                            args.signup[2], args.signup[3])
            client.sign_up(klambda_config.KlambdaConfig.COGNITO_APP_CLIENT,
                           user)
        else:
            # Fix: a wrong argument count previously failed silently.
            logger.warning("signup requires exactly 4 arguments")
        sys.exit()
    elif args.command:
        credentials = file_processor.FileProcessor(os.getcwd() + '/user.yml')
        # authenticate user
        client.initiate_auth(klambda_config.KlambdaConfig.COGNITO_APP_CLIENT,
                             credentials.data['USERNAME'],
                             str(credentials.data['PASSWORD']))
        # gets the module specified by the user
        module = cli_tool.modules[args.command]
        for com in module.commands:
            # checks if command has parameters and starts execution
            if vars(args)[com] is not None:
                module.execute(com, vars(args)[com])
                event_body = event.Event(
                    processor.data['project']['name'],
                    processor.data['project']['author'],
                    datetime.datetime.now().strftime("%Y-%m-%d %H:%M:%S"),
                    credentials.data['USERNAME'],
                    com,
                    module.name)
                event_logger.save_event(event_body)
    else:
        logger.warning("No module nor command typed, please try again...")
        sys.exit()
def _check_key(key):
    """Return *key* unchanged when it is CEF-safe, else a sanitized copy.

    Any character rejected by the ``_KEY`` pattern is replaced with ``'?'``,
    and a warning is logged so the caller can track bad keys.
    """
    if _KEY.match(key) is None:
        logger.warning('The "%s" key contains illegal characters' % key)
        # replacing illegal characters with a '?'
        return _KEY.sub('?', key)
    return key
def test_graberrors(self):
    """Verify capture_logs grabs records at the requested logger and level."""
    # simplest case: the services logger at ERROR level
    with capture_logs() as captured:
        logger.error("Yeah")
        self.assertEqual(captured.read(), "Yeah\n")

    # services logger filtered at WARNING: DEBUG is dropped, WARNING kept
    with capture_logs(level=logging.WARNING) as captured:
        logger.debug("Yeah")
        logger.warning("Yeah2")
        self.assertEqual(captured.read(), "Yeah2\n")

    # root logger filtered at WARNING behaves the same way
    root = logging.getLogger()
    with capture_logs(logger="root", level=logging.WARNING) as captured:
        root.debug("Yeah")
        root.warning("Yeah2")
        self.assertEqual(captured.read(), "Yeah2\n")
def _format_msg(fields, kw, maxlen=_MAXLEN, as_unicode=False):
    """Render the base CEF message and append custom extensions.

    Extensions (entries of *kw* not in ``_EXTENSIONS``) are appended
    smallest-first; once the next extension would push the message past
    *maxlen*, a warning is logged and the rest are dropped.

    :param fields: mapping used to fill ``_CEF_FORMAT``.
    :param kw: extra key/value extensions.
    :param maxlen: maximum message length; falsy disables the limit.
    :param as_unicode: when True, coerce field values and fragments through
        ``_force_unicode``.
    """
    if as_unicode:
        # Fix: convert into a fresh dict instead of mutating the caller's
        # `fields` mapping in place.
        fields = dict((k, _force_unicode(v)) for k, v in fields.items())
    msg = _CEF_FORMAT % fields
    # adding custom extensions, sorted by size so small ones go first
    extensions = [(_len(value), len(key), key, value)
                  for key, value in kw.items()
                  if key not in _EXTENSIONS]
    extensions.sort()
    msg_len = len(msg)
    if as_unicode:
        msg = _force_unicode(msg)
    for value_len, key_len, key, value in extensions:
        # +2 accounts for the leading space and the '=' separator
        added_len = value_len + key_len + 2
        value = _convert_ext(value)
        key = _check_key(key)
        if maxlen and msg_len + added_len > maxlen:
            # msg is too big.
            warn = 'CEF Message too big. %s %s' % (msg, str(kw.items()))
            logger.warning(warn)
            break
        fragment = ' %s=%s' % (key, value)
        if as_unicode:
            fragment = _force_unicode(fragment)
        msg += fragment
        msg_len += added_len
    return msg
def log_cef(name, severity, environ, config, username='******',
            signature=None, **kw):
    """Creates a CEF record, and emit it in syslog or another file.

    Args:
        - name: name to log
        - severity: integer from 0 to 10
        - environ: the WSGI environ object
        - config: configuration dict
        - signature: CEF signature code - defaults to name value
        - username: user name - defaults to 'none'
        - extra keywords: extra keys used in the CEF extension
    """
    # XXX might want to remove the request dependency here
    # so this module is standalone
    from services.util import filter_params

    name = _convert_prefix(name)
    if signature is None:
        signature = name
    else:
        signature = _convert_prefix(signature)
    severity = _convert_prefix(severity)
    config = filter_params('cef', config)
    source = get_source_ip(environ)

    fields = {
        'severity': severity,
        'source': source,
        'method': _convert_ext(environ['REQUEST_METHOD']),
        'url': _convert_ext(environ['PATH_INFO']),
        'dest': _convert_ext(environ.get('HTTP_HOST', u'none')),
        'user_agent': _convert_ext(environ.get('HTTP_USER_AGENT', u'none')),
        'signature': signature,
        'name': name,
        'version': config['version'],
        'vendor': config['vendor'],
        'device_version': config['device_version'],
        'product': config['product'],
        'host': _HOST,
        'suser': username,
        'date': strftime("%b %d %H:%M:%S")
    }

    # make sure we don't have a | anymore in regular fields
    for key, value in list(kw.items()):
        new_key = _check_key(key)
        if new_key == key:
            continue
        kw[new_key] = value
        del kw[key]

    # overriding with provided datas
    fields.update(kw)

    # Fix: delegate message rendering and extension appending to the shared
    # _format_msg helper instead of duplicating its loop inline; this also
    # repairs the 'CEF Message too big.' literal that was broken across lines.
    msg = _format_msg(fields, kw, maxlen=_MAXLEN)

    if config['file'] == 'syslog':
        if not SYSLOG:
            raise ValueError('syslog not supported on this platform')
        _syslog(msg, config)
    else:
        # serialize appends from concurrent callers
        with _log_lock:
            with open(config['file'], 'a') as f:
                f.write('%s\n' % msg)
def _workflow(decoded, url):
    """
    Run the whole data scraping, processing, and analysis in new threads.
    Each thread, beginning with this one, will make its calls in a try-except
    block. Why do it this way? Because the parent thread that launched this
    thread dies immediately. Therefore, when an exception is raised in the
    child thread, there's no one to receive it. This is bad for the client.
    The client relies on the status of the job. If an exception is raised in
    the child thread, the thread would die and the status would no longer be
    updated. This would cause the client to stall forever with a progress
    animation. If an exception is raised, we want to update the status right
    away so that the user doesn't have to wait. All operations down the line
    like scraping or some other launch their own child threads. Those
    operations also need to update the status before exiting.
    """
    logger.info("Running a new thread for scraping and data processing")
    source = decoded[0]
    sku = decoded[1]
    # NOTE(review): this shadows the `url` parameter — decoded[2] wins;
    # confirm the parameter is actually unused by callers.
    url = decoded[2]
    _set_status(__in_queue__, sku)
    parsed = _db_product_details(sku)
    try:
        # Has the detail page been parsed?
        if not parsed:
            logger.info(
                "Detail page not available for {}. Proceeding to download...".
                format(sku))
            parsed = _get_product_details(source, url, sku)
            if not parsed:
                logger.error("Error while parsing product detail page for " +
                             sku)
                logger.error("Aborting process")
                _set_status(__error__, sku)
                return
        else:
            logger.info(
                "Detail page for {} already parsed. Skipping download...".
                format(sku))
        _set_status("Gathering data", sku)
        prod_name = parsed.get("product_name")
        review_count = parsed.get("review_count")
        page_count = parsed.get("page_count")
        # Do we have enough data to train on?
        if review_count <= config.get("misc").get("min_review_count"):
            logger.warning("Not enough reviews for " + sku)
            logger.error("Aborting process")
            _set_status("Not Enough Data", sku)
            return
        # If it's not in the queue, add it
        if not _is_in_queue(sku):
            logger.info(sku +
                        " not in queue. Checking if it's been scraped before")
            if not _reviews_scraped(sku):
                logger.info(sku +
                            " has not been scraped. Adding to the queue...")
                sc_helper.add_to_queue(source, sku, page_count)
        # If it's in the queue, scrape it
        if _is_in_queue(sku):
            logger.info(sku + " is in the queue. Launching the scraper")
            sc_helper.scrape(sku, prod_name, source)
        # If it hasn't been trained, train it
        if not _is_trained(sku):
            _nlp_reset(sku)
            logger.info("Starting NLP preprocessing")
            _set_status("Analyzing language", sku)
            preprocess.NLPreprocessor(sku).tokenize()
            logger.info("Finished NLP preprocessing")
            # Fix: typo "trianing" -> "training" in the log message
            logger.info("Starting model training")
            _set_status("Building knowledge base", sku)
            # Fix: the trained model was bound to an unused local (`d2v`)
            training.Document2Vector(sku).train()
            logger.info("Finished model training")
        _update_details_db(sku)
        _set_status("Ready", sku)
    except Exception as e:
        logger.exception(e)
        _set_status(__error__, sku)
def log_cef(name, severity, environ, config, username='******',
            signature=None, **kw):
    """Creates a CEF record, and emit it in syslog or another file.

    Args:
        - name: name to log
        - severity: integer from 0 to 10
        - environ: the WSGI environ object
        - config: configuration dict
        - signature: CEF signature code - defaults to name value
        - username: user name - defaults to 'none'
        - extra keywords: extra keys used in the CEF extension
    """
    # XXX might want to remove the request dependency here
    # so this module is standalone
    from services.util import filter_params

    name = _convert_prefix(name)
    signature = name if signature is None else _convert_prefix(signature)
    severity = _convert_prefix(severity)
    config = filter_params('cef', config)
    source = get_source_ip(environ)

    fields = {'severity': severity,
              'source': source,
              'method': _convert_ext(environ['REQUEST_METHOD']),
              'url': _convert_ext(environ['PATH_INFO']),
              'dest': _convert_ext(environ.get('HTTP_HOST', u'none')),
              'user_agent': _convert_ext(environ.get('HTTP_USER_AGENT',
                                                     u'none')),
              'signature': signature,
              'name': name,
              'version': config['version'],
              'vendor': config['vendor'],
              'device_version': config['device_version'],
              'product': config['product'],
              'host': _HOST,
              'suser': username,
              'date': strftime("%b %d %H:%M:%S")}

    # make sure we don't have a | anymore in regular fields
    for original_key, ext_value in list(kw.items()):
        sanitized = _check_key(original_key)
        if sanitized != original_key:
            kw[sanitized] = ext_value
            del kw[original_key]

    # overriding with provided datas
    fields.update(kw)

    # resulting message
    msg = _CEF_FORMAT % fields

    # adding custom extensions, smallest first so the most fit under _MAXLEN
    pending = sorted((len(str(v)), len(k), k, v)
                     for k, v in kw.items() if k not in _EXTENSIONS)
    msg_len = len(msg)
    for value_len, key_len, key, value in pending:
        # +2 for the leading space and the '=' separator
        added_len = value_len + key_len + 2
        value = _convert_ext(value)
        key = _check_key(key)
        if msg_len + added_len > _MAXLEN:
            # msg is too big.
            logger.warning('CEF Message too big. %s %s' %
                           (msg, str(kw.items())))
            break
        msg += ' %s=%s' % (key, value)
        msg_len += added_len

    if config['file'] == 'syslog':
        if not SYSLOG:
            raise ValueError('syslog not supported on this platform')
        _syslog(msg, config)
    else:
        # serialize appends from concurrent callers
        with _log_lock:
            with open(config['file'], 'a') as f:
                f.write('%s\n' % msg)