# Shared imports assumed by the crawler snippets collected below; crawl_util,
# headers, proxy, ali_catids, email and password are module-level globals
# defined elsewhere in the project.
import csv
import json
import re
import time
from urllib.parse import quote

from lxml import html

import crawl_util


def crawl_tag(tag, cookies):
    url = 'https://www.instagram.com/explore/tags/%s/' % tag
    content = crawl_util.crawl(url)
    print(cookies)
    # owner ids and the paging cursor embedded in the first (HTML) tag page
    end_cursor = re.findall(r'"end_cursor":"([^"]+)"', content)
    userids = re.findall(r'"owner":{"id":"([^"]+)"}', content)
    end_cursor = end_cursor[0] if end_cursor else ''
    for page in range(10):
        if end_cursor:
            data = dict(query_hash='ded47faa9a1aaded10161a2ff32abb6b',
                        variables={
                            "tag_name": "%s" % tag,
                            "first": page + 1,
                            "after": "%s" % end_cursor
                        })
            variables = json.dumps(data['variables'])
            variables = quote(variables)
            url = 'https://www.instagram.com/graphql/query/?query_hash=%s&variables=%s' % (
                data['query_hash'], variables)
            print(url)
            content = crawl_util.crawl(url, cookies=cookies)
            data = json.loads(content)
            edge_hashtag_to_media = data['data']['hashtag']['edge_hashtag_to_media']
            edges = edge_hashtag_to_media['edges']
            end_cursor = edge_hashtag_to_media['page_info']['end_cursor']
            for edge in edges:
                userid = edge['node']['owner']['id']
                userids.append(userid)
        else:
            break
    return userids
def crawl_cates():
    url = 'http://home.manmanbuy.com/bijia.aspx'
    page = crawl_util.crawl(url)
    content = page.text.encode('utf-8')
    sel = html.document_fromstring(content,
                                   parser=html.HTMLParser(encoding='utf-8'))
    lefts = sel.xpath('//div[@style="float:left;width:470px;"]')
    rights = sel.xpath('//div[@style="float:right;width:470px;"]')
    sku = {}
    for items in [lefts, rights]:
        if items:
            for ele in items[0].getchildren():
                if ele.tag == 'h2':
                    cat1 = ele.text
                    if cat1 not in sku:
                        sku[cat1] = {}
                elif ele.tag == 'div' and 'sclassBlock' == ele.attrib['class']:
                    divs = ele.getchildren()
                    cat2 = divs[0].text
                    if cat2 not in sku[cat1]:
                        sku[cat1][cat2] = {}
                    for cat3 in divs[1]:
                        if cat3.tag == 'a':
                            href = cat3.attrib['href']
                            name = cat3.text
                            if not name:
                                name = cat3.getchildren()[0].text
                            sku[cat1][cat2][name] = {'url': href}
    return sku
def crawl_list(url, cate, page_key=0):
    page = crawl_util.crawl(url, headers=headers)
    content = page.content.decode('utf-8')
    infos = re.findall(r'window.runParams\s*=\s*(\{.*?\});', content)
    products = []
    dead = True
    total = 0
    if infos and len(infos) > 1:
        dead = False
        info = json.loads(infos[1])
        total = int(info.get('resultCount', '0'))
        for item in info.get('items', []):
            shop = item.get('store', {}).get('storeName', '')
            product = [
                cate, shop, item['title'],
                item.get('starRating', ''),
                item.get('price', ''),
                item.get('tradeDesc', ''),
                item.get('imageUrl', ''),
                item.get('productDetailUrl', ''),
                info.get('resultCount', ''),
                page_key
            ]
            products.append(product)
    print('products: %s' % len(products))
    return products, dead, total
def crawl_cates(url):
    page = crawl_util.crawl(url)
    content = page.content.decode('utf-8')
    sel = html.document_fromstring(content,
                                   parser=html.HTMLParser(encoding='utf-8'))
    links = sel.xpath('//div[@class="cg-main"]//li/a')
    cnt = len(links)
    with open('ali.csv', 'a') as w:
        w = csv.writer(w)
        for i, link in enumerate(links):
            url = 'https:' + link.attrib['href']
            items = url.split('/')
            name = items[-1].replace('.html', '')
            catid = items[-2]
            for page in range(1, 5):
                nextpage = url + '?trafficChannel=main&catName=%s&CatId=%s&ltype=wholesale&SortType=total_tranpro_desc&page=%s&isrefine=y' % (
                    name, catid, page)
                page_key = '%s-%s' % (catid, page)
                if page_key in ali_catids:
                    print('find ' + page_key)
                    continue
                print('%s/%s' % (i + 1, cnt), link.text, page, nextpage)
                products, dead, total = crawl_list(nextpage, name, page_key)
                if dead:
                    print('I am dead, reset cookie')
                    return
                if total < page * 60:
                    break
                w.writerows(products)
                time.sleep(2)
def get_headers(url):
    content, cookies = crawl_util.crawl(url, need_return_cookies=True)
    csrftoken = ''
    for cookie in cookies:
        if cookie.name == 'csrftoken':
            csrftoken = cookie.value
            break
    return {'x-csrftoken': csrftoken}
def crawl_info(url):
    page = crawl_util.crawl(url)
    content = page.content.decode('utf-8')
    print(page.cookies)
    js = re.findall(r'href="(.*/main.[a-z0-9]+.js)"', content)
    if js:
        bearer = get_Bearer(js[0])
        print(bearer)
    gt = re.findall(r'"gt=([0-9]+);', content)
    print(gt)
def crawl_user_info(userid, cookies):
    url = 'https://i.instagram.com/api/v1/users/%s/info/' % userid
    rsp = crawl_util.crawl(url, headers=headers, cookies=cookies)
    if rsp.status_code in [403, 429]:
        print('%s forbidden sleep 60s... %s' % (rsp.status_code, url))
        time.sleep(60)
    elif rsp.status_code == 404:
        return 404, {}
    if rsp and 'user' in rsp.json():
        return 200, rsp.json()['user']
    else:
        print(url, rsp)
def login():
    d = {"phone_id": "e40cf722-2116-474e-b7dc-fccdc5e01c50",
         "username": email,
         "adid": "2acaf82e-080f-404d-8268-50d487a7c5e2",
         "guid": "758492fb-2663-4f27-a99c-beedcf904d33",
         "device_id": "android-b209e9dbbbf0f081",
         "password": password,
         "login_attempt_count": "0"}
    d = json.dumps(d)
    signed_body = 'd17c8c06534e46f3db82b1915fa5a178a29ff8e0728c71eef426fef60fe716bf.' + d
    data = dict(signed_body=signed_body, ig_sig_key_version=4)
    api = 'https://i.instagram.com/api/v1/accounts/login/'
    rsp = crawl_util.crawl(api, data=data, method='post', headers=headers)
    # for cookie in rsp.cookies:
    #     if cookie.name == 'csrftoken':
    #         csrftoken = cookie.value
    #         headers['x-csrftoken'] = csrftoken
    #         break
    return rsp.cookies
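# Hedged usage sketch, not part of the original scripts: it chains login(),
# crawl_tag() and the tuple-returning crawl_user_info() defined in this
# collection. The tag name is a placeholder and the user-dict field name is an
# assumption about the API response.
def _example_tag_pipeline(tag='travel'):
    cookies = login()                        # session cookies from the login endpoint
    userids = set(crawl_tag(tag, cookies))   # de-duplicate owner ids from the hashtag pages
    for userid in userids:
        result = crawl_user_info(userid, cookies)
        if result and result[0] == 200:
            print(userid, result[1].get('username', ''))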
def crawl_info(url, h=None):
    page = crawl_util.crawl(url, headers=h)
    content = page.content.decode('utf-8')
    infos = re.findall(r'window.runParams\s*=\s*(\{[\s\S]*?\});', content)
    info = infos[0] if infos else '{}'
    lines = info.split('\n')
    if len(lines) > 1:
        info = lines[1]
        info = info.replace('data:', '')
        info = info[:-1]
    info = json.loads(info)
    if 'pageModule' in info:
        title = info['pageModule']['title']
        desc = info['pageModule']['description']
        return [title, desc]
    with open('a.html', 'w') as w:
        w.write(content)
    return None
def get_content(url, cookies, max_page=1, callback_func=None):
    print(url)
    max_id = None
    items = []
    for page in range(max_page):
        # build the page URL from the base url so the cursor is not appended repeatedly
        page_url = url
        if max_id:
            page_url = url + '?max_id=%s' % max_id
        rsp = crawl_util.crawl(page_url, headers=headers, cookies=cookies)
        data = rsp.json()
        if 'items' in data:
            items += data['items']
        elif 'users' in data:
            items += data['users']
        max_id = data.get('next_max_id')
        if max_id is None:
            break
    return items
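# Hedged sketch (the endpoint below is a placeholder, not taken from the
# original code): get_content() pages any JSON endpoint whose payload carries
# an 'items' or 'users' list together with a 'next_max_id' cursor.
def _example_paged_fetch(cookies):
    endpoint = 'https://i.instagram.com/api/v1/...'  # placeholder, fill in a real list endpoint
    return get_content(endpoint, cookies, max_page=3)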
def crawl_list(url, cate_page):
    page = crawl_util.crawl(url, headers=headers, proxy=proxy)
    if page is None or page.status_code != 200:
        succ = True
        if page:
            print('status code %s' % page.status_code)
            succ = False
        return None, 0, succ
    content = page.content.decode('utf-8')
    sel = html.document_fromstring(content,
                                   parser=html.HTMLParser(encoding='utf-8'))
    total = sel.xpath('//span[@id="s-result-count"]')
    if not total:
        total = sel.xpath('//div[@class="a-section a-spacing-small a-spacing-top-small"]/span[1]')
    total = total[0].text if total else '0'
    if total:
        total = total.replace('results for', '')
        total = total.replace(',', '')
        total = total.split('over')[-1].strip()
        total = total.split('of')[-1].strip()
    else:
        total = '0'
    categories = sel.xpath('//span[@id="s-result-count"]/span')
    cates = []
    if not categories:
        categories = sel.xpath('//div[@class="a-section a-spacing-small a-spacing-top-small"]/a/span')
        for cate in categories:
            cates.append(cate.text)
        cat3 = sel.xpath(
            '//div[@class="a-section a-spacing-small a-spacing-top-small"]/span[@class="a-color-state a-text-bold"]')
        if cat3:
            cates.append(cat3[0].text)
    else:
        for cate in categories[0].getchildren():
            cates.append(cate.text)
    category = ':'.join(cates)
    products = get_products(sel, category, total, cate_page)
    find = len(products) if products else 0
    print(category, cate_page, total, find)
    succ = True
    if content.find('Enter the characters you see below') > 0:
        print('failed: ' + url)
        succ = False
    return products, int(total), succ
def get_brand_trademark(url):
    page = crawl_util.crawl(url, headers=headers)
    content = page.content.decode('GB18030')
    if '品牌申请' not in content:
        print('Fail, need change cookie.')
        return False
    sel = html.document_fromstring(content,
                                   parser=html.HTMLParser(encoding='GB18030'))
    trs = sel.xpath('//table/tr')
    rows = []
    for tr in trs[1:]:
        row = []
        for td in tr.getchildren()[0:5]:
            # td.text can be None for cells that only wrap a child element
            txt = (td.text or '').strip()
            if not txt:
                txt = td.getchildren()[0].text
                txt = txt.strip() if txt else ''
            row.append(txt)
        rows.append(row)
    return rows
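# Hedged usage sketch (the output filename is a placeholder, not from the
# original script): get_brand_trademark() returns False when the cookie has
# expired, otherwise a list of 5-column rows that can be appended to a CSV in
# the same style the other crawlers use.
def _example_dump_trademarks(url):
    rows = get_brand_trademark(url)
    if rows is False:
        print('cookie expired, refresh it before retrying')
        return
    with open('trademarks.csv', 'a') as f:
        csv.writer(f).writerows(rows)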
def get_list(url):
    page = crawl_util.crawl(url, headers=headers)
    content = page.content.decode('utf-8')
    sel = html.document_fromstring(content,
                                   parser=html.HTMLParser(encoding='utf-8'))
    products = sel.xpath('//div[@class="sg-col-inner"]')
    if not products:
        products = sel.xpath('//div[@class="s-item-container"]')
    if products:
        print(len(products))
    next_page = sel.xpath('//li[@class="a-last"]/a')
    if not next_page:
        next_page = sel.xpath('//a[@id="pagnNextLink"]')
    if next_page:
        href = next_page[0].attrib['href']
        next_url = 'https://www.amazon.co.uk' + href
        print('next page', next_url)
    with open('../b.html', 'w') as w:
        w.write(content)
def register(email, password):
    url = 'https://www.instagram.com/accounts/emailsignup/'
    headers = get_headers(url)
    api = 'https://www.instagram.com/accounts/web_create_ajax/'
    first_name = email.split('@')[0]
    username = first_name + '2018'
    data = dict(email=email,
                password=password,
                username=username,
                first_name=first_name,
                seamless_login_enabled=1,
                tos_version='row',
                opt_into_one_tap=False)
    content, cookies = crawl_util.crawl(api, data=data, headers=headers,
                                        method='post', need_return_cookies=True)
    data = json.loads(content)
    if 'user_id' in data:
        return cookies
    return None
def crawl_list(url):
    page = crawl_util.crawl(url)
    # match against the decoded text directly; encoding it to bytes would break
    # the str regex under Python 3
    content = page.text
    cnt = re.findall(r'共有(\d+)条记录', content)
    return cnt[0] if cnt else 20
def crawl_cate(url):
    page = crawl_util.crawl(url, headers=headers)
    content = page.content.decode('utf-8')
    get_cats(content)
def crawl(url, referer=''):
    headers['referer'] = referer
    rsp = crawl_util.crawl(url, headers=headers)
    return rsp.content
def get_Bearer(url):
    page = crawl_util.crawl(url)
    print(url)
    content = page.content.decode('utf-8')
    bearer = re.findall(r'c="(AAAAAAAAAAAAAAAA[A-Za-z0-9%]+)"', content)
    return ('Bearer ' + bearer[0]) if bearer else ''
def crawl_info(url):
    page = crawl_util.crawl(url)
    content = page.content.decode('utf-8')
    with open('a.html', 'w') as w:
        w.write(content)
    return page.cookies
def crawl_user_info(userid, cookies):
    url = 'https://i.instagram.com/api/v1/users/%s/info/' % userid
    content = crawl_util.crawl(url, cookies=cookies)
    return content
def crawl_list(url, cookies=None):
    page = crawl_util.crawl(url, cookies=cookies)
    content = page.content.decode('utf-8')
    with open('b.html', 'w') as w:
        w.write(content)