def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Parse an infobanc directory page and follow wp-pagenavi pagination.

    Repeatedly POSTs the next page number to the listing endpoint and
    accumulates tasks/items until the server reports no more results.

    Fix: the failure paths previously returned None, which breaks callers
    that tuple-unpack the result; they now return the (possibly partial)
    (tasks, items) accumulated so far.
    """
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    while True:
        try:
            page_doc = lxml.html.document_fromstring(page)
            current_page_number = page_doc.xpath(
                "//div[@class='wp-pagenavi']/span[@class='current']/text()")
            next_page_number = int(current_page_number[0]) + 1
        except Exception:
            # No pagination widget (or unparsable page): stop here but
            # still hand back what was collected so far.
            traceback.print_exc()
            return tasks, items
        try:
            session = requests.Session()
            post_data = {'p': next_page_number}
            # NOTE(review): the pagination endpoint is hard-coded to one
            # category; confirm it should not be derived from `url`.
            new_page = session.post(
                'http://suppliers.infobanc.com/directory/edible-oils-oil-seeds-biofuels/S-1119/index.htm',
                data=post_data)
            new_tasks, new_items = conf_parser.parse(
                new_page.text, new_page.content, url, parse_conf,
                page_encoding)
        except Exception:
            traceback.print_exc()
            return tasks, items
        if not new_items or 'No result found' in new_page.text:
            break
        # Next iteration re-reads pagination state from the new page.
        page = new_page.text
        tasks += new_tasks
        items += new_items
    return tasks, items
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Parse with the supplied conf; when it produces no follow-up tasks,
    re-parse the page with the 'life_taobao_list' conf instead."""
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    if tasks:
        return tasks, items
    fallback_conf = task_db.get_task_conf('life_taobao_list')['task_conf']
    return conf_parser.parse(page, raw_page, url, fallback_conf,
                             page_encoding)
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Re-parse with the 'anjuke_list' conf when no anjuke_list task was
    produced, carrying any collected 'anjuke_zone' tasks into the result."""
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    zone_tasks = []
    for task in tasks:
        task_name = task[0]
        if task_name == 'anjuke_list':
            # A listing task is already present: nothing more to do.
            return tasks, items
        if task_name == 'anjuke_zone':
            zone_tasks.append(task)
    list_conf = task_db.get_task_conf('anjuke_list')['task_conf']
    tasks, items = conf_parser.parse(page, raw_page, url, list_conf,
                                     page_encoding)
    return tasks + zone_tasks, items
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Queue the shop-detail page as an introduction follow-up task
    whenever the menu page produced any items."""
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    if items:
        detail_url = url.replace('qt=shopmenu', 'qt=shopdetail')
        tasks.append(('baidu_food_introduction', detail_url))
    return tasks, items
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Enrich each item with Mobile/Phone/Fax numbers fetched from the
    tradeindia phone pop-up referenced by item['phone_url']."""
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    session = requests.Session()
    for item in items:
        phone_url = item.get('phone_url', None)
        if not phone_url:
            continue
        try:
            # phone_url is a javascript snippet; the real path sits
            # between the first pair of single quotes.
            phone_url = ("http://www.tradeindia.com"
                         + phone_url.split("'")[1])
            resp = session.get(phone_url)
            doc = lxml.html.document_fromstring(resp.text)
            phone_list = item.get('phone', [])
            for label in ['Mobile', 'Phone', 'Fax']:
                numbers = doc.xpath(
                    "//tr[contains(td/strong/text(), '%s')]/td[3]/text()"
                    % label)
                if not numbers:
                    continue
                if type(numbers) is list:
                    phone_list += numbers
                else:
                    phone_list.append(numbers)
        except:
            # Best-effort enrichment: any failure leaves the item as-is.
            continue
        item['phone'] = phone_list
    return tasks, items
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Strip every query parameter except 'entinfo' (and 'target') from
    58_type1_detail task URLs; other tasks pass through untouched.

    Fix: the rebuilt URL previously used the leaked loop variable `v`
    (whatever query parameter happened to come last) instead of the
    collected `entinfo` value, which was assigned but never used.
    """
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    new_tasks = []
    for task_type, task_url in tasks:
        if task_type != '58_type1_detail':
            new_tasks.append((task_type, task_url))
            continue
        if task_url.find('entinfo') <= 0:
            # No entinfo parameter: keep the URL as-is.
            new_tasks.append((task_type, task_url))
            continue
        task_url_list = task_url.split('?')
        if len(task_url_list) < 2:
            new_tasks.append((task_type, task_url))
            continue
        entinfo = ''
        target = ''
        for param in task_url_list[1].split('&'):
            if 'entinfo' in param:
                entinfo = param
                continue
            if 'target' in param:
                target = param
        new_url = task_url_list[0] + '?' + entinfo
        if target:
            new_url += '&' + target
        new_tasks.append((task_type, new_url))
    return new_tasks, items
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Resolve each item's phone number from its 'phone_id' via get_phone."""
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    # An empty items list simply skips the loop (the original guarded
    # with `if items:` -- equivalent).
    for item in items:
        phone_id = item.get('phone_id', None)
        item['phone'] = get_phone(phone_id)
    return tasks, items
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Append a follow-up 'meituan_food_area' task for the next listing
    page, but only when the current page produced tasks at all."""
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    if tasks:
        next_page_url = get_next_page_url(url)
        if next_page_url:
            tasks.append(('meituan_food_area', next_page_url))
    return tasks, items
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Append the '/restaurants' path segment to every zomato_list task
    URL; other task types pass through unchanged."""
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    rewritten = []
    for task_type, task_url in tasks:
        if task_type == "zomato_list":
            task_url = "%s/restaurants" % task_url
        rewritten.append((task_type, task_url))
    return rewritten, items
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Drop '../' relative-path fragments from judysbook_list task URLs."""
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    fixed = []
    for task_type, task_url in tasks:
        if task_type == 'judysbook_list':
            task_url = task_url.replace("../", "")
        fixed.append((task_type, task_url))
    return fixed, items
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Expand grotal_detail javascript Navigate("...") pseudo-URLs into
    absolute detail URLs; other tasks pass through unchanged."""
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    new_tasks = []
    for task_type, task_url in tasks:
        if task_type != 'grotal_detail':
            new_tasks.append((task_type, task_url))
            continue
        # The conf yields javascript:Navigate("<path>"); wrappers.
        for path in re.findall('javascript:Navigate\("(.*)"\);', task_url):
            new_tasks.append((task_type, "http://www.grotal.com/%s" % path))
    return new_tasks, items
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Rewrite life_taobao_city tasks onto the koubei listing endpoint,
    keeping only the original query string; non-http URLs are dropped."""
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    new_tasks = []
    for task_type, task_url in tasks:
        if task_type != 'life_taobao_city':
            new_tasks.append((task_type, task_url))
            continue
        if not task_url.startswith('http'):
            continue
        query = urlparse.urlparse(task_url).query
        new_tasks.append(
            (task_type, 'http://bendi.koubei.com/list.htm?%s' % query))
    return new_tasks, items
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Clean golden_chennai_list URLs scraped from inline
    "window.location='...'" javascript handlers."""
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    cleaned = []
    for task_type, task_url in tasks:
        if task_type == 'golden_chennai_list':
            task_url = task_url.replace("window.location=", "")
            task_url = task_url.replace("'", "")
        cleaned.append((task_type, task_url))
    return cleaned, items
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Strip stray commas out of csc86_list task URLs."""
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    if not tasks:
        return (tasks, items)
    new_tasks = []
    for task_type, task_url in tasks:
        if task_type == 'csc86_list':
            # Plain comma removal (the original used re.sub(',', '', url)).
            task_url = task_url.replace(',', '')
        new_tasks.append((task_type, task_url))
    return (new_tasks, items)
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Turn numeric ccb_bank task payloads (page numbers) into full
    pagination URLs derived from the current request URL.

    NOTE(review): tasks of any other type, and non-numeric payloads, are
    silently dropped -- this matches the original behavior; confirm it
    is intentional.
    """
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    new_tasks = []
    for task_type, task_url in tasks:
        if task_type != 'ccb_bank':
            continue
        try:
            page_num = int(task_url)
        except:
            continue
        base = url[:url.find('&pageNo=')]
        new_tasks.append(('ccb_bank', base + '&pageNo=' + str(page_num)))
    return new_tasks, items
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Expand icbc_office_detail province payloads into full JSON search
    endpoint URLs; everything else passes through unchanged."""
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    template = ("http://116.213.115.38/thememap/icbc_bank/json/"
                "getResultByKeywordJson.jsp?Province=%s&City=&"
                "Keyword=&pageno=1&pagesize=10&Type=0")
    new_tasks = []
    for task_type, task_url in tasks:
        if task_type == 'icbc_office_detail':
            # The task payload is the province name/code.
            new_tasks.append((task_type, template % task_url))
        else:
            new_tasks.append((task_type, task_url))
    return new_tasks, items
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Fetch the phone number for an indiacom listing from its AJAX
    endpoint, deriving the district code and prospect id from the page
    URL. Any failure (unexpected URL shape, request error, empty items)
    is swallowed and the parse result is returned unchanged."""
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    try:
        # URL tail looks like ..._<district>_<prospectid>.<ext>
        stem = url.split('.')[-2]
        prospect_id = stem.split('_')[-1]
        city = stem.split('_')[-2]
        phone_url = ("http://www.indiacom.com/rwd/asp/getphone.asp"
                     "?districtcd=%s&Prospectid=%s" % (city, prospect_id))
        phone = requests.get(phone_url, headers={'Referer': url}).text
        if phone:
            items[0]['phone'] = phone
    except:
        pass
    return tasks, items
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None): tasks, items = conf_parser.parse(page, raw_page, url, parse_conf, page_encoding) new_tasks = [] mcatId = re.findall("'fch_mcatID':'(.*)','fch_mcatName'", page) mcatName = re.findall("'fch_mcatName':'(.*)','fch_catID'", page) catId = re.findall("'fch_catID':'(.*)','fch_searchCityId'", page) end = 28 post_data = { 'biz': '', 'catId': catId, 'debug_mod': 0, 'end': end, 'mcatId': mcatId, 'mcatName': mcatName, 'prod_serv': 'P', 'rand': 4, 'searchCity': '', 'showkm': '' } session = requests.Session() while True: try: post_data['end'] = end new_page = session.post( "http://dir.indiamart.com/impcatProductPagination.php", data=post_data) new_tasks, new_items = conf_parser.parse(new_page.text, new_page.content, url, parse_conf, page_encoding) if not new_items or 'No result found' in new_page.text: break end += 20 tasks += new_tasks items += new_items except Exception, e: break
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """For each hc360_category task, additionally queue af=1 and af=2
    variants of the same URL, joining with '?' or '&' depending on
    whether the URL already carries a query string.

    Fix: the query-string presence test previously read `url[4]` -- the
    fifth CHARACTER of the URL -- instead of the parsed query component
    (`urlparse` result index 4), so the separator choice was wrong.
    """
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    new_tasks = []
    for task_type, task_url in tasks:
        new_tasks.append((task_type, task_url))
        if task_type != 'hc360_category':
            continue
        # Index 4 of the 6-tuple is the query string.
        query = list(urlparse.urlparse(task_url))[4]
        sep = '&' if query else '?'
        new_tasks.append((task_type, task_url + sep + 'af=1'))
        new_tasks.append((task_type, task_url + sep + 'af=2'))
    return new_tasks, items
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None): tasks, items = conf_parser.parse(page, raw_page, url, parse_conf, page_encoding) new_tasks = [] for task_type, url in tasks: if task_type == 'bankofchina': try: new_url = ("http://srh.bankofchina.com/search/operation" "/search.jsp?page=%s" % int(url)) new_tasks.append((task_type, new_url)) except Exception, e: print url, e continue else: new_tasks.append((task_type, url))
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None): tasks, items = conf_parser.parse(page, raw_page, url, parse_conf, page_encoding) new_tasks = [] for task_type, task_url in tasks: if task_type == 'psbc_bank_city': try: params = re.findall('\((.*)\)', task_url)[0] province_id, city_code = re.findall('(\d+)', params) new_tasks.append(( task_type, "http://www.psbc.com/portal/main" "?transName=queryDeptAtm&ProvinceId=%s&CityCode=%s&intpage=1" % (province_id, city_code))) except Exception, e: print e continue
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Rewrite zjs_detail tasks onto the PCA-data endpoint using the
    'areaid' query parameter of the scraped URL; unparsable URLs are
    logged and skipped.

    Fix: `except exception:` referenced an undefined lowercase name and
    would itself raise NameError whenever the handler fired; it is now
    `except Exception:` as intended.
    """
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    if not tasks:
        return (tasks, items)
    new_tasks = []
    task_detail_url = "http://www.zjs.com.cn/WS_Business/GetPCAData.ashx?PCC=%s"
    import urlparse
    for task_type, task_url in tasks:
        if task_type == "zjs_detail":
            try:
                parse_res = urlparse.urlparse(task_url)
                parse_qs = urlparse.parse_qs(parse_res.query, True)
                new_url = task_detail_url % (parse_qs['areaid'][0])
            except Exception:
                logging.error('parse url failed, url: %s' % task_url)
                continue
            new_tasks.append((task_type, new_url))
    return (new_tasks, items)
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None): tasks, items = conf_parser.parse(page, raw_page, url, parse_conf, page_encoding) if len(items) > 0: try: contacts = re.findall("linkman:'(.*?)'", page)[0] items[0]['contacts'] = contacts except Exception, e: print e pass try: headers = headers = { 'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3', 'Accept-Encoding': 'gzip, deflate', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)' } uid = re.findall("uid\s*:\s*'(\d+)", page)[0] userdata_url = "http://user.58.com/userdata/?callback=jsonp567&userid=%s&type=26" % uid for i in xrange(3): try: userdata_page = requests.get(userdata_url, headers=headers, timeout=30).text company_title = re.findall("corpname\s*:\s*'\s*(.*?)'", userdata_page)[0] if len(company_title) > 0: items[0]['company_title'] = company_title break except Exception, e: print e continue except Exception, e: print e pass
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Scrape delivery fee/threshold/time figures out of the page's
    embedded JSON and attach them to the first item. Each field is
    best-effort: a missing pattern leaves the item untouched."""
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    if items:
        field_patterns = (
            ('delivery_min_price', '(?:"takeout_price":")\s*(\d+)'),
            ('delivery_price', '(?:"takeout_cost":")\s*(\d+)'),
            ('delivery_time', '(?:"delivery_time":")\s*(\d+)'),
        )
        for field, pattern in field_patterns:
            try:
                items[0][field] = re.findall(pattern, page)[0]
            except:
                pass
    return (tasks, items)
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None):
    """Fetch phone numbers for each item through eworldtradefair's AJAX
    endpoint, keyed by the id embedded in item['phone_div']."""
    tasks, items = conf_parser.parse(page, raw_page, url, parse_conf,
                                     page_encoding)
    session = requests.Session()
    for item in items:
        div_info = item.get('phone_div', None)
        phone_list = []
        if not div_info:
            continue
        try:
            # The company id follows the literal 'id' in the div text.
            div_id = div_info.split('id')[1]
            resp = session.post('http://www.eworldtradefair.com/ajax.php',
                                data={'action': 'viewcompanyphones',
                                      'campsid': div_id})
            doc = lxml.html.document_fromstring(resp.text)
            numbers = doc.xpath("//strong/following-sibling::text()")
            if not numbers:
                continue
            if type(numbers) is list:
                for number in numbers:
                    phone_list.append(number.strip())
            else:
                phone_list.append(numbers.strip())
        except:
            traceback.print_exc()
            continue
        item['phone'] = phone_list
    return tasks, items
def parse(page, raw_page, url, parse_conf, page_encoding=None, task_db=None): tasks, items = conf_parser.parse(page, raw_page, url, parse_conf, page_encoding) if len(items) > 0: if 'comment_num' in items[0]: headers = headers = { 'Accept-Language': 'zh-cn,zh;q=0.8,en-us;q=0.5,en;q=0.3', 'Accept-Encoding': 'gzip, deflate', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'User-Agent': 'Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1)' } comment_num = re.findall('(\d+)', items[0]['comment_num'])[0] items[0]['comment_num'] = comment_num if int(comment_num) > 0: cateid = re.findall('var _cateid\s*=\s*(\d+)\s*;', page)[0] #objectid = re.findall('zuche/(\d+)x\.shtml', url)[0] objectid = re.findall('entinfo=(\d+)', url)[0] items[0]['objectid'] = objectid userid = re.findall('var\s*_userid\s*=\s*(\d+)\s*;', page)[0] comment_url = "http://comment.58.com/comment/pc/listByCateid/1/?callback=getInfoJson1464073500495&userid=%s&cateid=%s&objectid=%s&objecttype=1&star=0" % ( userid, cateid, objectid) comments = {} for i in xrange(3): try: comment_page = requests.get(comment_url, headers=headers, timeout=30).text comment_page = comment_page.strip() if len(comment_page) <= 0: continue comment_page = comment_page[comment_page.find('(') + 1:comment_page.rfind(')')] comments = json.loads(comment_page) break except Exception, e: continue items[0]['comment'] = comments items[0]['comment_url'] = comment_url if 'description_address' in items[0] and type( items[0]['description_address']) is list: items[0]['description_address'] = ''.join( items[0]['description_address']) if 'description_content' in items[0] and type( items[0]['description_content']) is list: items[0]['description_content'] = '\n'.join( items[0]['description_content']) if 'ext_info' in items[0]: new_ext_info = [] ext_info = items[0]['ext_info'] if type(ext_info) is not list: ext_info = [ext_info] for v in ext_info: value = v.get('value', []) if type(value) is list: if len(value) > 1: 
new_ext_info.append({ 'key': value[0], 'value': ''.join(value[1:]) }) if len(value) == 1: new_ext_info.append({'value': value[0]}) else: new_ext_info.append({'value': value}) if new_ext_info: items[0]['ext_info'] = new_ext_info