def add_market():
    """Prompt the user (native folder dialog) to pick a market directory and
    register it in the app-wide market registry.

    Returns:
        dict: the newly added market record ``{'name', 'directory'}`` on
        success, ``None`` when the directory was already registered or the
        user cancelled the dialog (a 'notify' socket message is emitted to
        the requesting client in both failure cases).
    """
    # Hidden Tk root so only the folder-picker dialog is shown.
    root = tkinter.Tk()
    root.withdraw()
    path = filedialog.askdirectory(parent=root, initialdir="/",
                                   title='请选择上传产品目录')
    if path:
        name = os.path.basename(path)
        # Normalize to Windows-style separators for storage.
        path = path.replace('/', '\\')
        market = {'name': name, 'directory': path}
        markets = current_app.data.markets
        if market['name'] not in markets:
            markets[market['name']] = market
            JSON.serialize(markets, '.', 'storage', 'markets.json')
            return market
        else:
            # BUG FIX: original used `market.name`, which raises
            # AttributeError on a plain dict — must be `market['name']`.
            msg = {
                'type': 'warning',
                'content': 'The Market of ' + market['name'] +
                           ' was already in system!'
            }
            emit('notify', msg, room=request.sid)
            return
    else:
        msg = {
            'type': 'primary',
            'content': 'No directory of market was selected.'
        }
        emit('notify', msg, room=request.sid)
        return
def serialize(self, visitors):
    """Group visitor records by their 'date' field and persist one JSON
    file per day (``visitors_<date>.json``) under the 'visitors' folder.

    Args:
        visitors: iterable of dicts, each carrying at least a 'date' key.
    """
    by_date = {}
    for visitor in visitors:
        by_date.setdefault(visitor['date'], []).append(visitor)
    # NOTE(review): `confi_dir` looks like a typo for `config_dir`, but the
    # attribute name is referenced as-is elsewhere — confirm before renaming.
    for day, records in by_date.items():
        JSON.serialize(records, self.confi_dir, 'visitors',
                       'visitors_' + day + '.json')
def save_tracking_ids(self):
    """Persist the in-memory inquiry tracking-id table as JSON.

    The 'datetime' entries are objects exposing ``to_atom_string()``
    (presumably pendulum datetimes — confirm), so each entry is converted
    to a plain serializable dict first. The optional 'emails' field is
    copied only when present.
    """
    fn = 'inquiry_tracking_ids_' + self.lname.split(' ')[0] + '.json'
    root = self.market['directory'] + '_config'
    serializable = {}
    for key, entry in self.tracking_ids.items():
        record = {
            'datetime': entry['datetime'].to_atom_string(),
            'status': entry['status'],
        }
        if 'emails' in entry:
            record['emails'] = entry['emails']
        serializable[key] = record
    JSON.serialize(serializable, root, [], fn)
def remove_market(market):
    """Remove `market` from the persisted market registry.

    Args:
        market: dict with at least a 'name' key.

    Returns:
        bool: True when the market existed and was removed (registry file
        rewritten), False otherwise (a warning 'notify' message is emitted
        to the requesting client).
    """
    markets = JSON.deserialize('.', 'storage', 'markets.json')
    if market['name'] in markets:
        del markets[market['name']]
        JSON.serialize(markets, '.', 'storage', 'markets.json')
        return True
    else:
        # Fixed user-facing typo: 'Refesh' -> 'Refresh'.
        msg = {
            'type': 'warning',
            'content': 'Market ' + market['name'] +
                       ' was not found. Try Refresh Your Browser!'
        }
        emit('notify', msg, room=request.sid)
        return False
def crawl_product_ranking(self, keyword, pages):
    """Crawl up to `pages` result pages for `keyword`, accumulate the
    parsed records, persist them under 'products_ranking', and return the
    timestamped result object.
    """
    self.current_page = 0
    collected = []
    print(keyword, end=': ')
    # `next_page` is expected to advance self.current_page — TODO confirm.
    while self.current_page < pages:
        page_source = self.next_page(keyword)
        self.crawl_current_page(page_source, records=collected)
        print('length:', len(collected), end=', ')
    print('done!')
    result = {
        'datetime': pendulum.now().to_datetime_string(),
        'records': collected
    }
    JSON.serialize(result, self.market['directory'] + '_config',
                   'products_ranking', keyword + '.json')
    return result
def check_balance(self):
    """Read the current P4P account balance from the page and record any
    change since the last observation.

    Two modes:
      * broker mode (``self.broker_url`` truthy): the previous balance is
        kept in Redis (shared across workers); a sub-budget counter in
        Redis is decremented by the spent amount and an overflow flag is
        set when it would go negative.
      * standalone mode: the previous balance is kept on ``self.balance``.

    In both modes a ``[time_str, diff]`` pair is appended to the day's
    gzip'd balance-change history file when the balance changed.
    """
    balance = self.browser.find_element_by_css_selector(
        '.sc-manage-edit-price-dialog span[data-role="span-balance"]').text
    if self.broker_url:
        # GETSET atomically stores the new balance and returns the
        # previous one (bytes, or None on first observation).
        self.balance = self.redis.getset(self.market['name']+'_p4p_balance', balance)
        # Skip recording when the balance is still the initial top-up value
        # or there is no previous observation to diff against.
        if float(balance) == self.initial_balance or self.balance is None:
            return
        else:
            # Redis returns bytes — decode to compare against the page text.
            self.balance = self.balance.decode()
            if self.balance != balance:
                # Positive diff = money spent since last check.
                diff = format(float(self.balance) - float(balance), '.2f')
                changes = 0 - float(diff)
                sub_budget = self.redis.get(self.market['name'] + '_p4p_sub_budget')
                if sub_budget is not None and float(sub_budget) > 0:
                    # Deduct the spend from the sub-budget; flag overflow
                    # when the remaining budget drops below zero.
                    if self.redis.incrbyfloat(self.market['name'] + '_p4p_sub_budget', changes) < 0:
                        self.redis.set(self.market['name'] + '_p4p_sub_budget_overflow', True)
                time_str = arrow.now().format('YYYY-MM-DD HH:mm:ss')
                date_str = time_str.split(' ')[0]
                root = self.market['directory'] + '_config'
                fn = 'p4p_balance_change_history_' + date_str + '.json.gz'
                JSON.serialize([time_str, diff], root, [], fn, append=True)
    else:
        # Standalone mode: previous balance lives on the instance only.
        if self.balance is None:
            self.balance = balance
        elif float(balance) == self.initial_balance:
            return
        elif self.balance != balance:
            diff = format(float(self.balance) - float(balance), '.2f')
            self.balance = balance
            time_str = arrow.now().format('YYYY-MM-DD HH:mm:ss')
            date_str = time_str.split(' ')[0]
            root = self.market['directory'] + '_config'
            fn = 'p4p_balance_change_history_'+date_str+'.json.gz'
            JSON.serialize([time_str, diff], root, [], fn, append=True)
def crawl_keywords(self):
    """Scrape the P4P keyword-management table, page by page, into a flat
    list of keyword dicts (one entry per keyword *per group* it belongs
    to), persist the list as 'p4p_keywords_list.json' and return it.

    Holds ``self.lock`` for the whole crawl so only one thread drives the
    shared browser at a time.
    """
    keywords = []
    with self.lock:
        self.load_url()
        while True:
            # Grab the raw table body HTML and parse it with pyquery
            # instead of walking it element-by-element through Selenium.
            html = self.browser.find_element_by_css_selector("div.keyword-manage .bp-table-main-wraper>table tbody").get_attribute('outerHTML')
            tbody = pq(html)
            trs = tbody.find('tr')
            for tr in trs:
                kws = {}
                kws['id'] = pq(tr).find('td:first-child input').val()
                # Status is encoded as the last dash-separated token of the
                # status icon's CSS class.
                kws['status'] = pq(tr).find('td.bp-cell-status .bp-dropdown-main i').attr('class').split('-').pop()
                kws['kws'] = pq(tr).find('td.bp-cell-left').text().strip()
                # Comma-separated group tags; fanned out below.
                groups = pq(tr).find('td[data-role="table-col-tag"]').text().strip()
                kws['my_price'] = pq(tr).find('td:nth-child(5) a').text().strip()
                kws['average_price'] = pq(tr).find('td:nth-child(6)').text().strip()
                # Match quality: numeric star count embedded in the class name.
                string = pq(tr).find('span.qs-star-wrap i').attr('class')
                kws['match_level'] = re.search('qsstar-(\d+)',string).group(1)
                # Search/buy popularity: percentage embedded in progress-bar HTML.
                string = pq(tr).find('.bp-icon-progress-orange').html()
                kws['search_count'] = re.search(':(\d+%)',string).group(1)
                string = pq(tr).find('.bp-icon-progress-blue').html()
                kws['buy_count'] = re.search(':(\d+%)',string).group(1)
                # One output record per group the keyword is tagged with.
                for grp in groups.split(','):
                    group = grp.strip()
                    obj = kws.copy()
                    obj['group'] = group
                    keywords.append(obj)
            # next_page() presumably returns falsy on the last page — confirm.
            if not self.next_page():
                break
    root = self.market['directory'] + '_config'
    fn = 'p4p_keywords_list.json'
    JSON.serialize(keywords, root, [], fn)
    return keywords
def set_keywords(self, tp, kws_list):
    """Replace the type-`tp` keyword list with `kws_list` and persist it."""
    self.keywords_list[tp] = kws_list
    target_root = self.market['directory'] + '_config'
    JSON.serialize(self.keywords_list[tp], target_root, [],
                   'p4p_keywords_list_' + tp + '.json')
def save_crawling_result(self, keywords):
    """Append crawled keyword rows to the day's gzip'd result file.

    The date is taken from the timestamp of the first row
    (``keywords[0][0]``, 'YYYY-MM-DD HH:mm:ss' — TODO confirm format).
    """
    day = keywords[0][0].split(' ')[0]
    JSON.serialize(keywords,
                   self.market['directory'] + '_config',
                   [],
                   'p4p_keywords_crawl_result_' + day + '.json.gz',
                   append=True)
def del_keywords(self, tp, kws):
    """Remove `kws` from the type-`tp` keyword list (if present) and
    persist the (possibly unchanged) list to disk."""
    entries = self.keywords_list[tp]
    if kws in entries:
        entries.remove(kws)
    JSON.serialize(entries,
                   self.market['directory'] + '_config',
                   [],
                   'p4p_keywords_list_' + tp + '.json')
def add_keywords(self, tp, kws):
    """Append `kws` to the type-`tp` keyword list and persist it."""
    entries = self.keywords_list[tp]
    entries.append(kws)
    JSON.serialize(entries,
                   self.market['directory'] + '_config',
                   [],
                   'p4p_keywords_list_' + tp + '.json')
def backgound_crawling_keywords(keyword, website, page_quantity, sid,
                                socketio, market):
    """Background task: drive a Chrome browser through one of the keyword
    crawlers, then merge the result into the market's 'keywords.json'.

    Args:
        keyword: search keyword, or a supplier URL for 'alibaba_sp'.
        website: one of 'alibaba', 'alibaba_sp', 'alibaba_sr', 'amazon'.
        page_quantity: number of result pages to crawl.
        sid: Socket.IO room (client session) for progress notifications.
        socketio: the Socket.IO server instance.
        market: market record with a 'directory' key.

    Raises:
        ValueError: when `website` is not one of the supported values
        (previously this fell through to an UnboundLocalError).
    """
    filename = 'keywords.json'
    root = market['directory'] + '_config'
    msg = {'type': "primary", 'content': "打开浏览器 ... ..."}
    socketio.emit('notify', msg, namespace='/markets', room=sid)
    chrome_options = webdriver.ChromeOptions()
    # chrome_options_headless.add_argument('--headless')
    chrome_options.add_argument('--disable-gpu')
    chrome_options.add_argument('--disable-extensions')
    chrome_options.add_argument('--disable-logging')
    chrome_options.add_argument('--ignore-certificate-errors')
    browser = webdriver.Chrome(chrome_options=chrome_options)
    if website == 'alibaba':
        crawler_name = re.sub(' ', '_', keyword) + ' - ' + \
            str(page_quantity) + '页 - 阿里'
        crawler = KwclrAlibaba(browser, keyword, page_quantity, sid, socketio)
    elif website == 'alibaba_sp':
        # `keyword` is a supplier URL here; derive a display name from the
        # subdomain and (optionally) the product-group page.
        supplier = re.search(r'https:\/\/([^\.]+)', keyword).group(1)
        category = 'all_products'
        if 'productgrouplist' in keyword:
            category = re.search(r'\/([^\/]+.html)', keyword).group(1)
        crawler_name = supplier + ' - ' + category + ' - ' + str(
            page_quantity) + '页 - 阿里(商家)'
        crawler = KwclrAliSp(browser, keyword, page_quantity, sid, socketio)
    elif website == 'alibaba_sr':
        # BUG FIX: the pattern was '' (empty), which inserted '_' between
        # every character of the keyword; spaces are what should be replaced,
        # matching the other branches.
        crawler_name = re.sub(
            ' ', '_', keyword) + ' - ' + str(page_quantity) + '页 - 阿里(橱窗)'
        crawler = KwclrAliSr(browser, keyword, page_quantity, sid, socketio)
    elif website == 'amazon':
        crawler_name = re.sub(
            ' ', '_', keyword) + ' - ' + str(page_quantity) + '页 - Amazon'
        crawler = KwclrAmazon(browser, keyword, page_quantity, sid, socketio)
    else:
        browser.quit()
        raise ValueError('unsupported website: ' + repr(website))
    msg = {'type': 'primary', 'content': "开始爬取 ... ..."}
    socketio.emit('notify', msg, namespace='/markets', room=sid)
    # Ensure the browser is closed even if the crawler raises (the original
    # leaked the Chrome process on failure and also called quit() twice).
    try:
        result = crawler.start()
    finally:
        msg = {'type': "primary", 'content': "爬取结束,关闭浏览器 ... ..."}
        socketio.emit('notify', msg, namespace='/markets', room=sid)
        browser.quit()
    msg = {'type': "primary", 'content': "保存结果 ... ..."}
    socketio.emit('notify', msg, namespace='/markets', room=sid)
    obj = JSON.deserialize(root, [], filename)
    if not obj:
        obj = {}
    obj[crawler_name] = result
    JSON.serialize(obj, root, [], filename)
    socketio.emit('keyword_crawling_result', {
        'key': crawler_name,
        'result': result
    }, namespace='/markets', room=sid)
def serialize(obj, market, paths, filename):
    """Persist `obj` under the market's '<directory>_config' root via the
    project JSON helper."""
    JSON.serialize(obj, market['directory'] + '_config', paths, filename)
def update_market(market):
    """Overwrite the stored record for `market` in the registry file —
    only when a market with the same name already exists; silently does
    nothing otherwise."""
    markets = JSON.deserialize('.', 'storage', 'markets.json')
    name = market['name']
    if name in markets:
        markets[name] = market
        JSON.serialize(markets, '.', 'storage', 'markets.json')