def format_date(self, key, rules, mp=None, ct=None):
    """
    Resolve a start/end timestamp for the current raw data.

    When ``rules`` is the literal ``'TODAY'`` the current timestamp is
    rendered as a day boundary with a fixed ``+07:00`` offset: midnight
    for ``key == 'start_time'`` and ``23:59:59`` for any other key.
    Otherwise the date is extracted from ``self.raw_data`` through
    ``ct.parse`` and passed through ``self.convert``.

    :param key: field being formatted; only ``'start_time'`` is
        special-cased
    :param rules: ``'TODAY'`` or an extraction rule handed to ``ct.parse``
    :param mp: marketplace config; ``mp['rule_type']`` is read on the
        non-``'TODAY'`` path
    :param ct: controller object providing ``parse``
    :return: the formatted date, or ``None`` when nothing was extracted
        or any step failed (failures are logged, never raised)
    """
    formatted = None
    try:
        if rules != 'TODAY':
            extracted = ct.parse(rule_type=mp['rule_type'],
                                 data=self.raw_data,
                                 rules=rules,
                                 flattening=False)
            if extracted:
                converted = self.convert(extracted)
                # Indexing the result of ``values()`` requires it to be a
                # list (Python 2 dict semantics).
                formatted = converted.values()[0]
        else:
            if key == 'start_time':
                day_boundary = "%Y-%m-%dT00:00:00.000+07:00"
            else:
                day_boundary = "%Y-%m-%dT23:59:59.000+07:00"
            formatted = date_formatter(
                current_timestamp(milliseconds=False),
                date_format=day_boundary)
    except Exception as err:
        logger('{}: {}'.format(self.__class__.__name__, str(err)))
    finally:
        # Returning from ``finally`` discards any in-flight exception, so
        # this method never raises.
        return formatted
def publish(self, data):
    """
    Serialise ``data`` to JSON and hand it to ``self.producer``.

    When ``self.debug`` is truthy the original (unserialised) payload is
    also sent to the logger.

    :param data: JSON-serialisable payload
    :return: None
    :raises Exception: anything raised while serialising, sending or
        logging propagates to the caller unchanged
    """
    send = self.producer.put_message
    send(json.dumps(data))
    if self.debug:
        logger(message=data)
def build_url(self, url, mp):
    """
    Turn a root-relative path into an absolute URL.

    A value that does not validate as a URL and starts with ``'/'`` is
    prefixed with ``mp['mp_link']``; anything else is returned as given.

    :param url: candidate URL or root-relative path
    :param mp: marketplace config; ``mp['mp_link']`` is the prefix
    :return: the (possibly prefixed) URL; the input value when any step
        failed (failures are logged, never raised)
    """
    try:
        already_valid = validate(url, data_type='url')
        if not already_valid and url.startswith('/'):
            url = '{}{}'.format(mp['mp_link'], url)
    except Exception as err:
        logger('{}: {}'.format(self.__class__.__name__, str(err)))
    finally:
        # ``return`` in ``finally`` suppresses any pending exception.
        return url
def format_item_url(self, mp=None, ct=None):
    """
    Build the item URL by filling the marketplace URL template.

    Every mapping in ``self.raw_data`` is merged into a single dict
    (later entries win on duplicate keys) and passed to
    ``ct.fill_arguments`` together with ``mp['mp_item_url']``.

    :param mp: Marketplace config
    :param ct: Object Controller
    :return: the filled template, or ``self.item`` when building fails
        (failures are logged, never raised)
    """
    item_url = self.item
    try:
        merged = {}
        for entry in self.raw_data:
            for name, val in entry.items():
                merged[name] = val
        item_url = ct.fill_arguments(mp['mp_item_url'], arguments=merged)
    except Exception as err:
        logger('{}: {}'.format(self.__class__.__name__, str(err)))
    finally:
        # ``return`` in ``finally`` suppresses any pending exception.
        return item_url
def format_number(self, key, mp):
    """
    Convert ``self.item`` to a number, scaling price fields.

    A truthy ``self.item`` is passed through ``validate`` with
    ``data_type=long``; for the ``price_before`` / ``price_after`` keys
    the result is then floor-divided by ``mp['price_divider']``.

    :param key: field name being formatted
    :param mp: marketplace config; ``mp['price_divider']`` is read for
        price keys
    :return: long -- ``0`` when ``self.item`` is falsy or conversion
        fails before a value is assigned (failures are logged, never
        raised)
    """
    number = 0
    try:
        if self.item:
            number = validate(self.item, data_type=long)
            if key in ('price_before', 'price_after'):
                number = number // mp['price_divider']
    except Exception as err:
        logger('{}: {}'.format(self.__class__.__name__, str(err)))
    finally:
        # ``return`` in ``finally`` suppresses any pending exception.
        return number
def html_parser(self, rule, attr=None):
    """
    Run a selector against ``self.bs``.

    :param rule: selector passed to ``self.bs.select`` /
        ``self.bs.select_one``
    :param attr: optional attribute name; when given, only the first
        match is looked up and indexed with it
    :return: ``select_one(rule)[attr]`` when ``attr`` is truthy, else the
        result of ``select(rule)``; ``None`` when the lookup raises
        (the error is logged together with ``rule`` and ``attr``)
    """
    try:
        if not attr:
            return self.bs.select(rule)
        first_match = self.bs.select_one(rule)
        return first_match[attr]
    except Exception as err:
        logger('{}: {} ({}:{})'.format(self.__class__.__name__, str(err),
                                       rule, attr))
        return None
def format_image_url(self, key, mp=None, ct=None):
    """
    Produce the image URL for the current item.

    A ``self.item`` that already validates as a URL is returned as is.
    Otherwise the item is passed through ``self.convert`` and, when the
    marketplace defines ``mp_item_image_url``, substituted into that
    template under ``key``.

    :param key: Rule key, used as the template argument name
    :param mp: Marketplace config
    :param ct: Object Controller
    :return: the image URL; ``self.item`` when any step failed (failures
        are logged, never raised)
    """
    image_url = self.item
    try:
        if not validate(self.item, data_type='url'):
            template = mp['mp_item_image_url']
            if not template:
                image_url = self.convert(self.item)
            else:
                image_url = ct.fill_arguments(
                    template, {key: self.convert(self.item)})
    except Exception as err:
        logger('{}: {}'.format(self.__class__.__name__, str(err)))
    finally:
        # ``return`` in ``finally`` suppresses any pending exception.
        return image_url
def extractor(data):
    """
    Extracting bs4.element.* to text.

    Only lists are transformed: ``None`` entries are dropped, each
    ``bs4.element.Tag`` / ``NavigableString`` entry is replaced by the
    last item of its ``find_all(text=True)`` result, and the pieces are
    joined into one string. Any other input is returned untouched.

    :param data: bs4.* or other type.
    :return: the joined string for list input; the original ``data``
        for non-list input or when extraction fails (failures are
        logged, never raised)
    """
    try:
        if isinstance(data, list):
            pieces = []
            for node in data:
                if node is None:
                    continue
                if isinstance(node, (bs4.element.Tag,
                                     bs4.element.NavigableString)):
                    node = node.find_all(text=True)[-1]
                pieces.append(node)
            data = "".join(pieces)
    except Exception as err:
        # Log under this function's own name, read from the live frame.
        logger('{}: {}'.format(inspect.currentframe().f_code.co_name,
                               str(err)))
    finally:
        # ``return`` in ``finally`` suppresses any pending exception.
        return data
def json_parser(self, rule, data):
    """
    Look up ``rule`` in ``data``, optionally picking one list element.

    ``rule`` is matched against the module-level ``IS_ARRAY`` pattern.
    On a match, group 1 replaces the lookup key and a non-empty group 2
    is used as an integer index into the looked-up value, provided that
    value is a list.

    :param rule: lookup key, optionally carrying an index part
        recognised by ``IS_ARRAY``
    :param data: container indexed with the key
    :return: the looked-up value (or the selected list element); an
        empty dict when the lookup raises ``KeyError``. Other
        exceptions are not caught here.
    """
    try:
        position = None
        found = IS_ARRAY.search(rule)
        if found:
            rule = found.group(1)
            position = found.group(2) or None
        value = data[rule]
        if position and isinstance(value, list):
            value = value[int(position)]
        return value
    except KeyError as err:
        logger('{}: KeyError: {}'.format(self.__class__.__name__, str(err)))
        return dict()
def run(mp_name=None, output=None, file_path=None, file_name=None,
        publish=None, debug=None):
    """
    Crawl one marketplace and collect its flash-sale items.

    Sessions are fetched first; for every session id the item index URL
    is filled in and fetched, and each item is mapped through the
    controller's item template into a flat dict.

    :param mp_name: market place name
    :param output: type of file for output; when set, ``file_path`` is
        required and the export result is returned instead of the list
    :param file_path: file path
    :param file_name: file name; falls back to ``mp_name``
    :param publish: publish data to NSQ (only reached when ``output`` is
        not set, because the export branch returns first)
    :param debug: log intermediate values
    :return: list of dict; ``Export(...).save`` when ``output`` is set;
        ``None`` when any step raised -- the error is logged at level
        ``'error'`` and not propagated (this includes the
        'File path required' error)
    """
    try:
        shop_items = []
        start_time = end_time = None

        ct = Controller(mp_name=mp_name)
        marketplace = ct.mp

        session_arguments = list_to_dict(
            get_arguments(marketplace['mp_sessions_url']))
        if debug:
            logger("{}: {}".format("session_arguments", session_arguments))

        ses, html = ct.get_sessions(arguments=session_arguments)
        if debug:
            logger("{}: {}".format("response", remove_whitespace(str(html))))
            logger("{}: {}".format("session", ses))

        items_url = marketplace['mp_item_index_url']
        items_arguments = list_to_dict(get_arguments(items_url))
        if debug:
            logger("{}: {}".format("items_arguments", items_arguments))

        # Get start & end flash sale date from index page
        if marketplace['period_source'] == 'root':
            ft = Formatter(data=html)
            start_time = ft.format_date(
                key='start_time',
                rules=marketplace['rule_item_start_time'],
                mp=marketplace, ct=ct)
            end_time = ft.format_date(
                key='end_time',
                rules=marketplace['rule_item_end_time'],
                mp=marketplace, ct=ct)
            if debug:
                logger("{}: {}".format("start_time", start_time))
                logger("{}: {}".format("end_time", end_time))

        # Running item counter across all sessions; used for debug output.
        index = 0
        # Only the value stored under the first key yielded by ``ses`` is
        # walked.
        for s in ses[next(iter(ses))]:
            items_arguments['id'] = s
            target_url = ct.fill_arguments(items_url, items_arguments)
            if debug:
                logger("{}: {}".format("target_url", target_url))

            items = ct.get_items(target_url)
            if debug:
                logger("{}: {}".format("total items",
                                       len(items[next(iter(items))])))

            for item in items[next(iter(items))]:
                shop_item = dict()
                template = ct.item_template()
                shop_item['marketplace'] = mp_name
                if debug:
                    logger("{0:<11}: {1:<11}".format("item index", index))
                    logger("{0:<11}: {1:<11}".format(
                        "item", remove_whitespace(str(item))))

                for t_key, t_value in template.iteritems():
                    value = ct.parse(rule_type=marketplace['rule_type'],
                                     data=item,
                                     rules=marketplace[t_value['rule']],
                                     flattening=False)
                    # NOTE(review): the nesting of this section was
                    # rebuilt from whitespace-collapsed source -- confirm
                    # that the per-field cleaning belongs under
                    # ``if len(value)`` and the assignment below it.
                    if len(value):
                        ft = Formatter(value)
                        if debug:
                            print()
                            logger("{0:<11}: {1:<11}".format("key", t_key))
                            logger("{0:<11}: {1:<11}".format(
                                "raw_value", value))

                        if len(value) > 1 and t_key == 'url':
                            value = ft.format_item_url(mp=marketplace, ct=ct)
                        else:
                            # Single match: take the value stored under
                            # the first key of the first parsed entry.
                            raw_value = value[0]
                            value = raw_value[next(iter(raw_value))]
                            if t_key == 'image':
                                value = ft.format_image_url(
                                    key=t_key, mp=marketplace, ct=ct)
                            elif t_key in ['start_time', 'end_time']:
                                value = date_formatter(value)
                            elif t_key in ['price_before', 'price_after',
                                           'discount']:
                                value = ft.format_number(key=t_key,
                                                         mp=marketplace)
                            else:
                                value = ft.item

                        if t_key in ['image', 'url']:
                            value = ft.build_url(value, mp=marketplace)
                        if debug:
                            logger("{0:<11}: {1:<11}".format(
                                "clean_value", value))

                    shop_item[t_key] = remove_whitespace(value)

                # Fall back to the dates read from the index page. ``get``
                # keeps a template without these keys from raising
                # KeyError, which the outer handler would turn into the
                # loss of the whole crawl.
                if not shop_item.get('start_time'):
                    shop_item['start_time'] = date_formatter(start_time)
                if not shop_item.get('end_time'):
                    shop_item['end_time'] = date_formatter(end_time)

                if debug:
                    print()
                    logger("{0:<11}: {1:<11}".format(
                        "result", pformat(shop_item, indent=4)))
                    print()
                    print()

                shop_items.append(shop_item)
                # Advance the counter so the "item index" debug line does
                # not report 0 for every item.
                index += 1

        if output:
            if file_path:
                file_name = file_name if file_name else mp_name
                # NOTE(review): ``save`` is accessed, not called --
                # presumably a property on Export; confirm.
                return Export(data=shop_items, file_path=file_path,
                              output_format=output,
                              file_name=file_name).save
            else:
                raise Exception('File path required')

        if publish:
            nsq = Nsq(debug=debug)
            for item in shop_items:
                nsq.publish(item)

        return shop_items
    except Exception as e:
        logger(str(e), level='error')
        return None