def check_condition_to_start(version_id):
    global product_api
    product_api = Products()
    crawl_api = Crawls()
    model_api = Models()

    try:
        # If a text-classification model already exists for this version,
        # there is nothing to start.
        model = model_api.get_model(TEXT_CLASSIFICATION_MODEL_TYPE,
                                    version_id=version_id)
        if model is not None:
            return False

        log.info("check_condition_to_start")

        # Check if the crawling process is done
        total_crawl_size = crawl_api.get_size_crawls(version_id)
        crawled_size = crawl_api.get_size_crawls(version_id, status='done')
        if total_crawl_size != crawled_size:
            return False

        # If every product already carries the text-class-model flag,
        # this stage has already run to completion.
        total_product_size = product_api.get_size_products(version_id)
        processed_product_size = product_api.get_size_products(
            version_id, is_processed_for_text_class_model=True)
        not_processed_product_size = product_api.get_size_products(
            version_id, is_processed_for_text_class_model=False)
        if (processed_product_size + not_processed_product_size) == total_product_size:
            return False
    except Exception as e:
        log.error(str(e))

    return True
def check_condition_to_start(version_id):
    global product_api
    product_api = Products()
    crawl_api = Crawls()

    try:
        log.info("check_condition_to_start")

        # Check if the crawling process is done
        total_crawl_size = crawl_api.get_size_crawls(version_id)
        crawled_size = crawl_api.get_size_crawls(version_id, status='done')
        if total_crawl_size != crawled_size:
            return False

        # Check that the text-model process queue has been drained
        queue_size = rconn.llen(REDIS_PRODUCT_TEXT_MODEL_PROCESS_QUEUE)
        if queue_size > 0:
            return False

        # If every product already carries the text-class-model flag,
        # this stage has already run to completion.
        total_product_size = product_api.get_size_products(version_id)
        processed_product_size = product_api.get_size_products(
            version_id, is_processed_for_text_class_model=True)
        not_processed_product_size = product_api.get_size_products(
            version_id, is_processed_for_text_class_model=False)
        if (processed_product_size + not_processed_product_size) == total_product_size:
            return False
    except Exception as e:
        log.error(str(e))

    return True
def get_products_by_keyword(self, keyword, offset=0, limit=100):
    self.log.debug('get_products_by_keyword')
    product_api = Products()

    # Initialize defaults so a failed API call cannot leave these
    # names unbound at the return statement below.
    total_count = 0
    products = None

    try:
        total_count = product_api.get_products_count_by_keyword(keyword)
    except Exception as e:
        self.log.error(
            "Exception when calling get_products_count_by_keyword: %s\n" % e)

    try:
        products = product_api.get_products_by_keyword(keyword,
                                                       only_text=False,
                                                       offset=offset,
                                                       limit=limit)
    except Exception as e:
        self.log.error(
            "Exception when calling get_products_by_keyword: %s\n" % e)

    return total_count, products
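A minimal pagination sketch built on the same Products API calls used above. The helper name and the empty-page termination condition are assumptions for illustration, not part of the original source.

def iterate_products_by_keyword(keyword, page_size=100):
    # Hypothetical helper: walks all result pages for a keyword using
    # the Products API shown above. Stopping on an empty page is an
    # assumption about the API's paging behavior.
    product_api = Products()
    offset = 0
    while True:
        products = product_api.get_products_by_keyword(keyword,
                                                       only_text=False,
                                                       offset=offset,
                                                       limit=page_size)
        if not products:
            break
        for product in products:
            yield product
        offset += page_size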
def check_condition_to_start(version_id):
    global product_api
    product_api = Products()

    try:
        # Check that the classifying process is done
        total_product_size = product_api.get_size_products(version_id)
        classified_size = product_api.get_size_products(version_id,
                                                        is_classified=True)
        if total_product_size != classified_size:
            return False

        # Check that the object-index queue has been drained
        queue_size = rconn.llen(REDIS_OBJECT_INDEX_QUEUE)
        if queue_size != 0:
            return False
    except Exception as e:
        log.error(str(e))

    return True
def check_condition_to_start(version_id):
    global product_api
    product_api = Products()
    crawl_api = Crawls()

    try:
        # Check that the product classify queue has been drained
        queue_size = rconn.llen(REDIS_PRODUCT_CLASSIFY_QUEUE)
        if queue_size != 0:
            return False

        # Check that the crawling process is done
        total_crawl_size = crawl_api.get_size_crawls(version_id)
        crawled_size = crawl_api.get_size_crawls(version_id, status='done')
        if total_crawl_size != crawled_size:
            return False

        # Check that all images are processed
        total_product_size = product_api.get_size_products(version_id)
        available_product_size = product_api.get_size_products(
            version_id, is_available=True)
        unavailable_product_size = product_api.get_size_products(
            version_id, is_available=False)
        # processed_size = product_api.get_size_products(version_id, is_processed=True)
        if (available_product_size + unavailable_product_size) != total_product_size:
            return False

        # If every product is already classified one way or the other,
        # the classifying stage has already run to completion.
        classified_size = product_api.get_size_products(version_id,
                                                        is_classified=True)
        not_classified_size = product_api.get_size_products(version_id,
                                                            is_classified=False)
        if (classified_size + not_classified_size) == total_product_size:
            return False
    except Exception as e:
        log.error(str(e))

    return True
def start(rconn):
    global object_api
    global feature_api
    global product_api
    global version_id

    try:
        log.info("Start bl-object-index:1")
        object_api = Objects()
        feature_api = Features()
        product_api = Products()
        crawl_api = Crawls()
        file = os.path.join(os.getcwd(), INDEX_FILE)
        # index_file = load_index_file(file)

        while True:
            version_id = get_latest_crawl_version()
            if version_id is not None:
                log.info("check_condition_to_start")
                ok = check_condition_to_start(version_id)
                log.info("check_condition_to_start: " + str(ok))
                if ok is True:
                    index_file = None
                    reset_index(version_id)
                    # dispatch(rconn)
                    # prepare_objects_to_index(rconn, version_id)
                    if DATA_SOURCE == DATA_SOURCE_QUEUE:
                        load_from_queue(index_file)
                    elif DATA_SOURCE == DATA_SOURCE_DB:
                        load_from_db(index_file, version_id)
            time.sleep(60 * 10)
    except Exception as e:
        log.error(str(e))
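get_latest_crawl_version() is called above but not shown. A plausible sketch, assuming the latest version id is stored under the REDIS_CRAWL_VERSION hash defined in the setup snippet further down; the hash layout is an assumption, not confirmed by the source.

def get_latest_crawl_version():
    # Assumed implementation: read the 'latest' field of the
    # bl:crawl:version hash. rconn is created with
    # decode_responses=False, so the value comes back as bytes.
    value = rconn.hget(REDIS_CRAWL_VERSION, REDIS_CRAWL_VERSION_LATEST)
    if value is None:
        return None
    return value.decode('utf-8')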
def check_condition_to_start(version_id):
    global product_api
    product_api = Products()
    crawl_api = Crawls()

    try:
        log.info("check_condition_to_start")

        # Check if the image processing queue is empty
        queue_size = rconn.llen(REDIS_PRODUCT_IMAGE_PROCESS_QUEUE)
        if queue_size != 0:
            return False

        # Check if the crawling process is done
        total_crawl_size = crawl_api.get_size_crawls(version_id)
        crawled_size = crawl_api.get_size_crawls(version_id, status='done')
        if total_crawl_size != crawled_size:
            return False

        # If every product is already marked available or unavailable,
        # image processing has already run to completion.
        total_product_size = product_api.get_size_products(version_id)
        available_product_size = product_api.get_size_products(
            version_id, is_available=True)
        unavailable_product_size = product_api.get_size_products(
            version_id, is_available=False)
        # processed_size = product_api.get_size_products(version_id, is_processed=True)
        if (available_product_size + unavailable_product_size) == total_product_size:
            return False
    except Exception as e:
        log.error(str(e))

    return True
REDIS_PRODUCT_IMAGE_PROCESS_QUEUE = 'bl:product:image:process:queue'
REDIS_CRAWL_VERSION = 'bl:crawl:version'
REDIS_CRAWL_VERSION_LATEST = 'latest'

options = {
    'REDIS_SERVER': REDIS_SERVER,
    'REDIS_PASSWORD': REDIS_PASSWORD
}
log = Logging(options, tag='bl-object-classifier')
rconn = redis.StrictRedis(REDIS_SERVER,
                          decode_responses=False,
                          port=6379,
                          password=REDIS_PASSWORD)
storage = s3.S3(AWS_ACCESS_KEY, AWS_SECRET_ACCESS_KEY)

heart_bit = True
product_api = Products()
object_api = Objects()
image_api = Images()
version_id = None


def analyze_product(p_data):
    log.info('analyze_product')
    product = pickle.loads(p_data)
    try:
        main_class_code, main_objects = analyze_main_image(product)
    except Exception as e:
        log.error('analyze_product:' + str(e))
        delete_product_from_db(str(product['_id']))
        return
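A sketch of how a worker might drain the image-process queue and hand each payload to analyze_product(). The blocking-pop loop is an assumption about the dispatch mechanism, not code from the original source.

def dequeue_and_analyze():
    # Hypothetical worker loop: block until a pickled product arrives
    # on the image-process queue, then analyze it. blpop returns a
    # (key, value) tuple, or None when the timeout expires.
    while True:
        item = rconn.blpop(REDIS_PRODUCT_IMAGE_PROCESS_QUEUE, timeout=30)
        if item is None:
            continue
        _, p_data = item
        analyze_product(p_data)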
from __future__ import print_function
from stylelens_product.products import Products
from pprint import pprint

api_instance = Products()

try:
    # api_response = api_instance.get_products_by_keyword('Coat', only_text=True, is_processed_for_text_class_model=False, offset=0, limit=100)
    # pprint(api_response)
    keyword = 'coat'
    test_str = 'coating'
    offset = 0
    limit = 100
    while True:
        api_response = api_instance.get_products_by_keyword(
            keyword,
            only_text=True,
            is_processed_for_text_class_model=False,
            offset=offset,
            limit=limit)
        # pprint(api_response)
        # Stop once a page comes back empty (the original snippet was
        # truncated here; the loop needs a termination condition).
        if not api_response:
            break
        for res in api_response:
            name = res.get('name')
            if test_str in name:
                pprint(test_str + ' in keyword: ' + keyword)
                pprint(name)
                pprint(res.get('cate'))
        offset += limit
except Exception as e:
    pprint("Exception when calling get_products_by_keyword: %s" % e)
def crawl(host_code, version_id):
    global product_api
    product_api = Products()

    options = {}
    log.setTag('bl-crawler-' + SPAWN_ID)
    log.debug('start crawl')
    options['host_code'] = host_code
    crawler = StylensCrawler(options)

    try:
        if crawler.start():
            items = crawler.get_items()
            for item in items:
                product = {}
                product['name'] = item['name']
                product['host_url'] = item['host_url']
                product['host_code'] = item['host_code']
                product['host_name'] = item['host_name']
                product['product_no'] = item['product_no']
                product['main_image'] = item['main_image']
                product['sub_images'] = item['sub_images']
                try:
                    res = product_api.update_product_by_hostcode_and_productno(product)

                    product['version_id'] = version_id
                    product['product_url'] = item['product_url']
                    product['tags'] = item['tags']
                    product['price'] = item['price']
                    product['currency_unit'] = item['currency_unit']
                    product['nation'] = item['nation']
                    product['cate'] = item['cate']
                    product['sale_price'] = item['sale_price']
                    product['related_product'] = item['related_product']
                    product['thumbnail'] = item['thumbnail']

                    if 'upserted' in res:
                        # A new product document was created.
                        product_id = str(res['upserted'])
                        log.debug("Created a product: " + product_id)
                        product['is_processed'] = False
                        update_product_by_id(product_id, product)
                    elif res['nModified'] > 0:
                        # An existing product changed and was updated.
                        log.debug("Existing product is updated: product_no: "
                                  + product['product_no'])
                        product['is_processed'] = False
                        update_product_by_hostcode_and_productno(product)
                    else:
                        # The product is unchanged.
                        log.debug("The product is the same")
                        product['is_processed'] = True
                        update_product_by_hostcode_and_productno(product)
                except Exception as e:
                    log.error(
                        "Exception when calling ProductApi->update_product_by_hostcode_and_productno: %s\n" % e)
                    # delete_pod()
    except Exception as e:
        log.error("host_code: " + host_code + " error: " + str(e))
        delete_pod()

    notify_to_classify(host_code)
    delete_pod()
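notify_to_classify() is referenced above but not defined in these snippets. A minimal sketch, assuming it enqueues the finished host code onto the classify queue named in an earlier snippet; the queue choice and payload format are assumptions.

def notify_to_classify(host_code):
    # Assumed implementation: push the finished host_code onto the
    # classify queue so the classifier picks up this host's products.
    rconn.lpush(REDIS_PRODUCT_CLASSIFY_QUEUE, host_code)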