def get_conferences():
    files = util.listdir(CONFERENCE_FOLDER)
    util.mkdir(CONFERENCE_CRALWED_FOLDER)
    cnt = 0
    conf = util.load_json('conf_name.json')
    for file_name in files:
        save_path = os.path.join(CONFERENCE_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
        if data['short'] not in conf.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = conf[data['short']]
        data['sub'] = {}
        for sub in subs:
            if sub not in conf.keys():
                continue
            html = util.get_page('http://dblp.uni-trier.de/db/conf/' + sub)
            data['sub'][sub] = {}
            data['sub'][sub]['pub'] = get_publications(html)
            data['sub'][sub]['name'] = conf[sub]
        cnt += 1
        print cnt, len(files), data['short']
        util.save_json(save_path, data)
def get_links(prefix, html):
    journals = util.find_journals(html)
    conferences = util.find_conferences(html)
    journals = list(set(journals))
    conferences = list(set(conferences))
    #authors = util.find_authors(html)
    links = []
    for j in journals:
        if j[0].startswith(prefix):
            print '\t', j[0]
            inner_url = 'http://dblp.uni-trier.de/db/journals/' + j[0]
            inner = util.get_page(inner_url)
            #links += get_links('###', inner)
        else:
            links.append(('journal', j[0].split('/')[0].split('#')[0]))
    for c in conferences:
        if c[0].startswith(prefix):
            print '\t', c[0]
            inner_url = 'http://dblp.uni-trier.de/db/conf/' + c[0]
            inner = util.get_page(inner_url)
            #links += get_links('###', inner)
        else:
            links.append(('conference', c[0].split('/')[0].split('#')[0]))
    #for a in authors:
    #    links.append(('author', a[0].split('#')[0]))
    pass
    links = list(set(links))
    return links
def get_user_info(u, fp):
    ourl = "http://www.toutiao.com/c/user/%s/" % u
    url, html = util.get_page({"url": ourl})
    p, h = get_urlinfo(url)
    params = get_params("func.js")
    uid = get_userinfo(html)
    if params is not None and uid is not None:
        params = json.loads(params)
        params["user_id"] = uid
        path = "/c/user/article/"
        nurl = "%s//%s%s" % (p, h, path)
        count = 3
        while True:
            url, html = util.get_page({"url": nurl, "data": params, "method": "post"})
            if html is None or len(html) == 0:
                util.log_msg("could not get data from url:%s,data:%s,uid:%s" % (nurl, str(params), u))
                break
            mp = json.loads(html)
            if "data" in mp and isinstance(mp["data"], list):
                if len(mp["data"]) == 0:
                    util.log_msg("no data from response.url:%s" % nurl)
                result = []
                for item in mp["data"]:
                    turl = util.parse_url(url, item["source_url"])
                    try:
                        get_article(turl, url, item, fp, result)
                    except Exception:
                        tp, e, trace = sys.exc_info()
                        util.log_msg("get article(url:%s) info error:%s" % (turl, str(e)))
                if len(result) > 0:
                    if fp is None:
                        insert_into_db(result)
                    else:
                        for item in result:
                            fp.write("[%s]\t%s\t%s\n" % (time.ctime(), u, json.dumps(item)))
            else:
                util.log_msg("no data in content.url:%s" % nurl)
            if mp["has_more"]:
                params = get_params("func.js")
                params = json.loads(params)
                params["user_id"] = uid
                nxt = mp["next"]
                for key in nxt.keys():
                    params[key] = nxt[key]
            else:
                break
            count -= 1
            if count <= 0:
                break
    else:
        # no referer is available in this scope, so only the original URL is logged
        util.log_msg("could not parse data from html file, need to check this out. url:%s." % ourl)
def get_journals():
    pos, cnt = 1, 0
    util.mkdir(JOURNAL_FOLDER)
    while True:
        html = util.get_page(JOURNAL_URL + str(pos))
        links = util.find_journals(html)
        once_cnt = 0
        for link in links:
            if link[0] == '' or '?' in link[0]:
                continue
            data = {}
            data['type'] = 'journal'
            data['short'] = link[0]
            data['name'] = link[1]
            data['url'] = 'http://dblp.uni-trier.de/db/journals/' + data['short']
            util.save_json(os.path.join(JOURNAL_FOLDER, util.hex_hash(data['short'])), data)
            cnt += 1
            once_cnt += 1
        if once_cnt == 0:
            break
        pos += 100
    print 'Journal', cnt
def crawl_data5u(self):
    start_url = 'http://www.data5u.com/free/gngn/index.shtml'
    headers = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
        'Accept-Encoding': 'gzip, deflate',
        'Accept-Language': 'en-US,en;q=0.9,zh-CN;q=0.8,zh;q=0.7',
        'Cache-Control': 'max-age=0',
        'Connection': 'keep-alive',
        'Cookie': 'JSESSIONID=47AA0C887112A2D83EE040405F837A86',
        'Host': 'www.data5u.com',
        'Referer': 'http://www.data5u.com/free/index.shtml',
        'Upgrade-Insecure-Requests': '1',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_1) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36',
    }
    html = get_page(start_url, options=headers)
    if html:
        ip_address = re.compile(
            '<span><li>(\d+\.\d+\.\d+\.\d+)</li>.*?<li class=\"port.*?>(\d+)</li>', re.S)
        re_ip_address = ip_address.findall(html)
        for address, port in re_ip_address:
            result = address + ':' + port
            yield result.replace(' ', '')
def main(forum_post_number, version, api_url='https://wiki.factorio.com/api.php', version_nav=True):
    session = requests.Session()
    edit_token = get_edit_token(session, api_url)

    latest_version_page_name = 'Main_Page/Latest_versions'
    version_nav_page_name = 'Template:VersionNav'

    latest_version_page = get_page(session, api_url, latest_version_page_name)
    if version_nav:
        version_nav_page = get_page(session, api_url, version_nav_page_name)

    if version in latest_version_page:
        return f'Version {version} already found on "{latest_version_page_name}". Aborting.'
    if version_nav:
        if version in version_nav_page:
            return f'Version {version} already found on "{version_nav_page_name}". Aborting.'

    if 'None' not in latest_version_page:
        new_latest_version_page = re.sub(
            r'({{Translation\|Latest experimental version}}: \[https:\/\/forums\.factorio\.com\/)\d+ \d\.\d+\.\d+',
            rf'\g<1>{forum_post_number} {version}',
            latest_version_page)
    else:
        new_latest_version_page = re.sub(
            r'({{Translation\|Latest experimental version}}: ){{Translation\|None}}',
            rf'\g<1>[https://forums.factorio.com/{forum_post_number} {version}]',
            latest_version_page)

    if version_nav:
        new_version_nav_page = re.sub(
            r'(}}\n)(}}\n<noinclude>{{Documentation}}<\/noinclude>)',
            rf'\1* {{{{TransLink|Version history/{version[:version.rfind(".")+1]}0#{version}|{version}}}}}\n\2',
            version_nav_page)

    edit_response_latest_version_page = edit_page(session, api_url, edit_token, latest_version_page_name,
                                                  new_latest_version_page, f'{version}')
    if version_nav:
        edit_response_version_nav_page = edit_page(session, api_url, edit_token, version_nav_page_name,
                                                   new_version_nav_page, f'{version}')

    return edit_response_latest_version_page.text + (
        ('\n' + edit_response_version_nav_page.text) if version_nav else '')
def crawl_89ip(self):
    start_url = 'http://www.89ip.cn/tqdl.html?num=50&address=&kill_address=&port=&kill_port=&isp='
    html = get_page(start_url)
    if html:
        find_ips = re.compile('(\d+\.\d+\.\d+\.\d+:\d+)', re.S)
        ip_ports = find_ips.findall(html)
        for address_port in ip_ports:
            yield address_port
def crawl_kxdaili(self):
    for i in range(1, 4):
        start_url = 'http://www.kxdaili.com/ipList/{}.html#ip'.format(i)
        html = get_page(start_url)
        ip_adress = re.compile('<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
        re_ip_adress = ip_adress.findall(html)
        for adress, port in re_ip_adress:
            result = adress + ':' + port
            yield result.replace(' ', '')
def crawl_ip181(self):
    start_url = 'http://www.ip181.com/'
    html = get_page(start_url)
    ip_adress = re.compile('<tr.*?>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')  # \s* matches whitespace
    re_ip_adress = ip_adress.findall(html)
    for adress, port in re_ip_adress:
        result = adress + ':' + port
        yield result.replace(' ', '')
def get_journals():
    files = util.listdir(JOURNAL_FOLDER)
    cnt = 0
    jour = {}
    for file_name in files:
        data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
        html = util.get_page(data['url'])
        full_name = get_full_name(html)
        cnt += 1
        print cnt, len(files), data['short'], '|', full_name
        if '404' not in full_name:
            jour[data['short']] = full_name
            subs = get_subs(data['short'], html)
            for sub in subs:
                html = util.get_page('http://dblp.uni-trier.de/db/journals/' + sub)
                jour[sub] = get_full_name(html)
                print '\t', sub, jour[sub]
    util.save_json('jour_name.json', jour)
def crawl_premproxy(self):
    for i in ['China-01', 'China-02', 'China-03', 'China-04', 'Taiwan-01']:
        start_url = 'https://premproxy.com/proxy-by-country/{}.htm'.format(i)
        html = get_page(start_url)
        if html:
            ip_adress = re.compile('<td data-label="IP:port ">(.*?)</td>')
            re_ip_adress = ip_adress.findall(html)
            for adress_port in re_ip_adress:
                yield adress_port.replace(' ', '')
def __init__(self, shelf_id, username='', password=''):
    self.shelf = shelf_id
    self.usr = username
    self.pas = password
    self.first_page = bs4.BeautifulSoup(get_page(self.shelf, 1, LIST_VIEW), 'lxml')
    self.story_count = None
    self.pages = None
    self.stories = None
    self.perchap_wc = None
    self.wordcount = None
def crawl_kuaidaili(self):
    for page in range(1, 4):
        start_url = 'https://www.kuaidaili.com/free/inha/{}/'.format(page)
        html = get_page(start_url)
        ip_adress = re.compile(
            '<td data-title="IP">(.*)</td>\s*<td data-title="PORT">(\w+)</td>')
        re_ip_adress = ip_adress.findall(html)
        for adress, port in re_ip_adress:
            result = adress + ':' + port
            yield result.replace(' ', '')
def crawl_xicidaili(self):
    for page in range(1, 4):
        start_url = 'http://www.xicidaili.com/wt/{}'.format(page)
        html = get_page(start_url)
        ip_adress = re.compile(
            '<td class="country"><img src="http://fs.xicidaili.com/images/flag/cn.png" alt="Cn" /></td>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
        re_ip_adress = ip_adress.findall(html)
        for adress, port in re_ip_adress:
            result = adress + ':' + port
            yield result.replace(' ', '')
def crawl_ip3366(self):
    for page in range(1, 4):
        start_url = 'http://www.ip3366.net/free/?stype=1&page={}'.format(page)
        html = get_page(start_url)
        ip_address = re.compile('<tr>\s*<td>(.*?)</td>\s*<td>(.*?)</td>')
        # \s* matches whitespace, letting the pattern span line breaks
        re_ip_address = ip_address.findall(html)
        for address, port in re_ip_address:
            result = address + ':' + port
            yield result.replace(' ', '')
def crawl_data5u(self):
    for i in ['gngn', 'gnpt']:
        start_url = 'http://www.data5u.com/free/{}/index.shtml'.format(i)
        html = get_page(start_url)
        ip_adress = re.compile(
            ' <ul class="l2">\s*<span><li>(.*?)</li></span>\s*<span style="width: 100px;"><li class=".*">(.*?)</li></span>')
        re_ip_adress = ip_adress.findall(html)
        for adress, port in re_ip_adress:
            result = adress + ':' + port
            yield result.replace(' ', '')
def get_conferences():
    files = util.listdir(CONFERENCE_FOLDER)
    cnt = 0
    conf = {}
    for file_name in files:
        data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
        html = util.get_page(data['url'])
        full_name = get_full_name(html)
        cnt += 1
        try:
            print cnt, len(files), data['short'], '|', full_name
            if '404' not in full_name:
                conf[data['short']] = full_name
                subs = get_subs(data['short'], html)
                for sub in subs:
                    html = util.get_page('http://dblp.uni-trier.de/db/conf/' + sub)
                    conf[sub] = get_full_name(html)
                    print '\t', sub, conf[sub]
        except:
            pass
    util.save_json('conf_name.json', conf)
def crawl_daili66(self, page_count=4):
    start_url = 'http://www.66ip.cn/{}.html'
    urls = [start_url.format(page) for page in range(1, page_count + 1)]
    for url in urls:
        print('Crawling', url)
        html = get_page(url)
        if html:
            doc = pq(html)
            trs = doc('.containerbox table tr:gt(0)').items()
            for tr in trs:
                ip = tr.find('td:nth-child(1)').text()
                port = tr.find('td:nth-child(2)').text()
                yield ':'.join([ip, port])
def get_journals():
    files = util.listdir(JOURNAL_FOLDER)
    util.mkdir(JOURNAL_CRALWED_FOLDER)
    cnt = 0
    for file_name in files:
        save_path = os.path.join(JOURNAL_CRALWED_FOLDER, file_name)
        data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
        html = util.get_page(data['url'])
        full_name = get_full_name(html)
        data['name'] = full_name
        cnt += 1
        print cnt, len(files), data['short']
        data['links'] = get_links(data['short'], html)
def get_authors():
    files = util.listdir(AUTHOR_FOLDER)
    util.mkdir(AUTHOR_CRALWED_FOLDER)
    for file_name in files:
        save_path = os.path.join(AUTHOR_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(AUTHOR_FOLDER, file_name))
        html = util.get_page(data['url'])
        full_name = get_full_name(html)
        data['name'] = full_name
        print data['short'], full_name
        data['links'] = get_links(data['short'], html)
        util.save_json(save_path, data)
def crawl_daili66(self, count=20):
    """
    Fetch proxies from the 66ip free list.
    :param count: number of proxies to request
    :return: generator of 'ip:port' strings
    """
    url = 'http://www.66ip.cn/mo.php?tqsl={}'.format(count)
    print('Crawling 66', url)
    html = get_page(url)
    if html:
        ret = re.findall(r'\d+\.\d+\.\d+\.\d+:\d+', html)
        for ip in ret:
            yield ip
def get_journals():
    files = util.listdir(JOURNAL_FOLDER)
    util.mkdir(JOURNAL_CRALWED_FOLDER)
    cnt = 0
    jour = util.load_json('jour_name.json')
    for file_name in files:
        save_path = os.path.join(JOURNAL_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
        if data['short'] not in jour.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = jour[data['short']]
        data['sub'] = {}
        for sub in subs:
            html = util.get_page('http://dblp.uni-trier.de/db/journals/' + sub)
            data['sub'][sub] = {}
            data['sub'][sub]['pub'] = get_publications(html)
            data['sub'][sub]['name'] = jour[sub]
        cnt += 1
        print cnt, len(files), data['short']
        util.save_json(save_path, data)
def crawl_xroxy(self):
    for i in ['CN', 'TW']:
        start_url = 'http://www.xroxy.com/proxylist.php?country={}'.format(i)
        html = get_page(start_url)
        if html:
            ip_adress1 = re.compile(
                "title='View this Proxy details'>\s*(.*).*")
            re_ip_adress1 = ip_adress1.findall(html)
            ip_adress2 = re.compile(
                "title='Select proxies with port number .*'>(.*)</a>")
            re_ip_adress2 = ip_adress2.findall(html)
            for adress, port in zip(re_ip_adress1, re_ip_adress2):
                adress_port = adress + ':' + port
                yield adress_port.replace(' ', '')
def crawl_iphai(self):
    start_url = 'http://www.iphai.com/'
    html = get_page(start_url)
    if html:
        find_tr = re.compile('<tr>(.*?)</tr>', re.S)
        trs = find_tr.findall(html)
        for s in range(1, len(trs)):
            find_ip = re.compile('<td>\s+(\d+\.\d+\.\d+\.\d+)\s+</td>', re.S)
            re_ip_address = find_ip.findall(trs[s])
            find_port = re.compile('<td>\s+(\d+)\s+</td>', re.S)
            re_port = find_port.findall(trs[s])
            for address, port in zip(re_ip_address, re_port):
                address_port = address + ':' + port
                yield address_port.replace(' ', '')
def crawl_ip3366(self):
    for i in range(1, 4):
        start_url = 'http://www.ip3366.net/?stype=1&page={}'.format(i)
        html = get_page(start_url)
        if html:
            find_tr = re.compile('<tr>(.*?)</tr>', re.S)
            trs = find_tr.findall(html)
            for s in range(1, len(trs)):
                find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                re_ip_address = find_ip.findall(trs[s])
                find_port = re.compile('<td>(\d+)</td>')
                re_port = find_port.findall(trs[s])
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')
def crawl_kuaidaili(self):
    for i in range(1, 4):
        start_url = [
            'http://www.kuaidaili.com/free/{}/{}/'.format(style, i)
            for style in ['intr', 'inha']
        ]
        for url in start_url:
            html = get_page(url)
            if html:
                ip_address = re.compile('<td data-title="IP">(.*?)</td>')
                re_ip_address = ip_address.findall(html)
                port = re.compile('<td data-title="PORT">(.*?)</td>')
                re_port = port.findall(html)
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')
def get_conferences():
    files = util.listdir(CONFERENCE_FOLDER)
    util.mkdir(CONFERENCE_CRALWED_FOLDER)
    cnt = 0
    for file_name in files:
        cnt += 1
        if cnt < 1970:
            continue
        save_path = os.path.join(CONFERENCE_CRALWED_FOLDER, file_name)
        data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
        html = util.get_page(data['url'])
        full_name = get_full_name(html)
        data['name'] = full_name
        try:
            print cnt, len(files), data['short']
        except:
            pass
        data['links'] = get_links(data['short'], html)
def load_stories(self):
    if self.stories is None:
        print('Loading story urls for', self.shelf)
        s = []
        for page in range(self.pages):
            print('Loading page', page, 'out of', self.pages, 'for', self.shelf)
            soup = self.first_page if page == 0 else bs4.BeautifulSoup(
                get_page(self.shelf, page + 1, LIST_VIEW), 'lxml')
            bold_tags = soup(class_="search_results_count")[0]('b')
            from_ = int(bold_tags[0].string)
            to = int(bold_tags[1].string)
            # there are 1-60 stories on the first page which means 60, but 60-1=59 so we add one
            count = (to - from_) + 1
            story_list = soup(class_="story-list")[0]('li')
            for story in story_list:
                s.append(story(class_="right")[0].h2.a['href'])
        self.stories = tuple(s)
        print(number_objects(len(self.stories), 'url(|s)'), 'loaded for', self.shelf)
    return self.stories
def cb():
    url = "http://esf.cq.fang.com"
    #url,html = util.get_page({"url":url})
    html = open("data/esf.html").read()
    entries = get_entry(html)
    fp = open("data/result.txt", "a")
    for entry in entries:
        if entry[0] is None:
            continue
        eurl = util.parse_url(url, entry[0])
        html = open("data/yubei.html").read()
        # assumption: the entry page is fetched here; util.get_page expects a request dict
        eurl, html = util.get_page({"url": eurl})
        subs = get_sub(html)
        for sub in subs:
            surl = util.parse_url(eurl, sub[0])
            if surl == eurl:
                continue
            html = open("data/list.html").read()
            nurl, items = get_list(html, surl)
    fp.close()
def get_conferences():
    files = util.listdir(CONFERENCE_FOLDER)
    util.mkdir(CONFERENCE_CRALWED_FOLDER)
    cnt = 0
    conf = util.load_json('conf_name.json')
    for file_name in files:
        save_path = os.path.join(CONFERENCE_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(CONFERENCE_FOLDER, file_name))
        if data['short'] not in conf.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = conf[data['short']]
        data['sub'] = {}
        if len(subs) == 0:
            data['sub']['#'] = get_publications(html)
            util.save_json(save_path, data)
            cnt += 1
def mail_fetch(mail_domain):
    regex_domain = mail_domain.replace(".", "\\.").replace("-", "\\-")
    this_regex = mail_regex + regex_domain
    keyword = '"@%s"' % mail_domain
    mails = []
    for result in search(keyword):
        logging.info(result)
        try:
            text = get_page(result)
        except:
            logging.info(traceback.format_exc())
            continue  # skip this result: `text` would be undefined if the fetch failed
        matchs = re.findall(this_regex, text)
        matchs = list(set(matchs))
        for match in matchs:
            logging.info(match)
            mails.append(match)
    return list(set(mails))
def get_journals():
    files = util.listdir(JOURNAL_FOLDER)
    util.mkdir(JOURNAL_CRALWED_FOLDER)
    cnt = 0
    jour = util.load_json('jour_name.json')
    for file_name in files:
        save_path = os.path.join(JOURNAL_CRALWED_FOLDER, file_name)
        if util.exists(save_path):
            continue
        data = util.load_json(os.path.join(JOURNAL_FOLDER, file_name))
        if data['short'] not in jour.keys():
            continue
        html = util.get_page(data['url'])
        subs = get_subs(data['short'], html)
        data['name'] = jour[data['short']]
        data['sub'] = {}
        if len(subs) == 0:
            data['sub']['#'] = get_publications(html)
            util.save_json(save_path, data)
            cnt += 1
            print cnt, len(files), data['short']
def get_article(url, referer, data, fp, result2):
    url, h = util.get_page({"url": url, "headers": {"Referer": referer}})
    tree = etree.HTML(h)
    scripts = [o for o in tree.xpath("//script/text()")
               if o.find("BASE_DATA") > -1 or o.find("__pgcInfo") > -1]
    scripts.append("console.log(JSON.stringify(BASE_DATA))")
    open("data/tmp.js", "w").write("\n".join(scripts))
    r = get_params("data/tmp.js")
    if r is not None:
        mp = json.loads(r)
        obj = {"entry": data, "data": mp}
        conf = [
            ("title", ["data", "artilceInfo", "title"]),
            ("content", ["data", "artilceInfo", "content"], None, html.unescape),
            ("comments", ["data", "commentInfo", "comments_count"], 0),
            ("isOriginal", ["data", "artilceInfo", "subInfo", "isOriginal"], False),
            ("url", ["__const", url]),
            ("views", ["entry", "go_detail_count"], 0),
            ("cover", ["entry", "image_url"], ""),
            ("abstract", ["entry", "abstract"], ""),
            ("source", ["data", "artilceInfo", "subInfo", "source"], ""),
            ("publishtime", ["data", "artilceInfo", "subInfo", "time"]),
            ("tags", ["data", "artilceInfo", "tagInfo", "tags"], "",
             lambda o: ",".join([so["name"] for so in o])),
            ("category", ["data", "headerInfo", "chineseTag"], ""),
        ]
        result = {}
        for cf in conf:
            v = util.get_jpath(obj, cf[1],
                               cf[2] if len(cf) > 2 else None,
                               cf[3] if len(cf) > 3 else None)
            if v is not None:
                result[cf[0]] = v
        result["id"] = hashlib.md5(url.encode("utf-8")).hexdigest()
        if "content" in result:
            result["content"], result["images"] = replace_image(result["content"], url)
        if "cover" in result and len(result["cover"]) > 0:
            result["cover"] = imge_transfer(util.parse_url(url, result["cover"]), url)[1]
        if len(result) > 0:
            result2.append(result)
        else:
            util.log_msg("could not parse content from html file,need to check this out.url:%s,referer:%s." % (url, referer))
    else:
        util.log_msg("could not parse data from html file,need to check this out.url:%s,referer:%s." % (url, referer))
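# Hedged sketch (not the original util module): get_article() above calls
# util.get_jpath(obj, path, default, transform). A helper with that signature
# plausibly walks nested dict keys, falls back to `default` when a key is missing,
# and applies an optional transform; the ["__const", value] path used for "url"
# appears to mean "return the literal value". The function name below is hypothetical.
def get_jpath_sketch(obj, path, default=None, transform=None):
    if path and path[0] == "__const":
        value = path[1]
    else:
        value = obj
        for key in path:
            if isinstance(value, dict) and key in value:
                value = value[key]
            else:
                return default
    if value is not None and transform is not None:
        return transform(value)
    return value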
def get_authors():
    pos, cnt = 545504, 0
    util.mkdir(AUTHOR_FOLDER)
    while True:
        html = util.get_page(AUTHOR_URL + str(pos))
        links = util.find_authors(html)
        once_cnt = 0
        for link in links:
            if link[0] == '' or '?' in link[0]:
                continue
            data = {}
            data['type'] = 'author'
            data['short'] = link[0]
            data['name'] = link[1]
            data['url'] = 'http://dblp.uni-trier.de/pers/hd/a/' + data['short']
            util.save_json(os.path.join(AUTHOR_FOLDER, util.hex_hash(data['short'])), data)
            cnt += 1
            once_cnt += 1
        if once_cnt == 0:
            break
        pos += 300
        print 'Author', pos, cnt
def get_conferences():
    pos, cnt = 1, 0
    util.mkdir(CONFERENCE_FOLDER)
    while True:
        html = util.get_page(CONFERENCE_URL + str(pos))
        links = util.find_conferences(html)
        once_cnt = 0
        for link in links:
            if link[0] == '' or '?' in link[0]:
                continue
            data = {}
            data['type'] = 'conference'
            data['short'] = link[0]
            data['name'] = link[1]
            data['url'] = 'http://dblp.uni-trier.de/db/conf/' + data['short']
            util.save_json(os.path.join(CONFERENCE_FOLDER, util.hex_hash(data['short'])), data)
            cnt += 1
            once_cnt += 1
        if once_cnt == 0:
            break
        pos += 100
    print 'Conference', cnt
def crawl_xicidaili(self):
    for i in range(1, 3):
        start_url = 'http://www.xicidaili.com/nn/{}'.format(i)
        headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Cookie': '_free_proxy_session=BAh7B0kiD3Nlc3Npb25faWQGOgZFVEkiJWRjYzc5MmM1MTBiMDMzYTUzNTZjNzA4NjBhNWRjZjliBjsAVEkiEF9jc3JmX3Rva2VuBjsARkkiMUp6S2tXT3g5a0FCT01ndzlmWWZqRVJNek1WanRuUDBCbTJUN21GMTBKd3M9BjsARg%3D%3D--2a69429cb2115c6a0cc9a86e0ebe2800c0d471b3',
            'Host': 'www.xicidaili.com',
            'Referer': 'http://www.xicidaili.com/nn/3',
            'Upgrade-Insecure-Requests': '1',
        }
        html = get_page(start_url, options=headers)
        if html:
            find_trs = re.compile('<tr class.*?>(.*?)</tr>', re.S)
            trs = find_trs.findall(html)
            for tr in trs:
                find_ip = re.compile('<td>(\d+\.\d+\.\d+\.\d+)</td>')
                re_ip_address = find_ip.findall(tr)
                find_port = re.compile('<td>(\d+)</td>')
                re_port = find_port.findall(tr)
                for address, port in zip(re_ip_address, re_port):
                    address_port = address + ':' + port
                    yield address_port.replace(' ', '')
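# Usage sketch (assumption, not part of the original source): the crawl_* methods
# above are generators yielding 'ip:port' strings, so a driver can discover and
# drain them generically. `crawler` is any object exposing such methods; the
# function name and error handling here are hypothetical.
def collect_proxies(crawler):
    proxies = []
    for name in dir(crawler):
        if name.startswith('crawl_'):
            try:
                proxies.extend(getattr(crawler, name)())
            except Exception as exc:
                # one failing source should not abort the whole collection
                print('skipping', name, 'due to error:', exc)
    return proxies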
def cb():
    '''
    url = "https://www.facebook.com/"
    url,html = util.get_page({"url":url})
    #print(html)
    ##print(type(html))
    open("data/facebook.html","w").write(html)
    #html = open("data/facebook.html").read()
    ts = int(time.time())
    lsd = re.findall("name=\"lsd\" value=\"[^\"]+\"", html)
    lsd = lsd[0].split("\"")[3] if len(lsd) > 0 else None
    lgnrnd = re.findall("name=\"lgnrnd\" value=\"[^\"]+\"",html)
    lgnrnd = lgnrnd[0].split("\"")[3] if len(lgnrnd) > 0 else None
    if lsd is None or lgnrnd is None:
        print("could not parse lsd")
        sys.exit(0)
    data = {
        "lsd":lsd,
        "email":"*****@*****.**",
        "pass":"******",
        "timezone":-480,
        "lgndim":"eyJ3IjoxOTIwLCJoIjoxMDgwLCJhdyI6MTkyMCwiYWgiOjEwNTMsImMiOjI0fQ==",
        "lgnrnd":lgnrnd,
        "lgnjs":ts,
        "ab_test_data":"AA///AAAAAAAA/A/AAA/AAAAAAAAAAAAAAAAAAAAAAAAf//fA/DBAB",
        "locale":"zh_CN",
        "login_source":"login_bluebar",
    }
    url = "https://www.facebook.com/login.php?login_attempt=1&lwv=110"
    url,html = util.get_page({"url":url,"data":data})
    open("data/facebook2.html","w").write(html)
    #url = "https://www.facebook.com/100008346345446"
    url = "https://www.facebook.com/profile.php?id=100008346345446"
    url,html = util.get_page({"url":url})
    print(url)
    open("data/profile2.html","w").write(html)
    '''
    '''
    c_id = "100014233620831"
    uid = "100008346345446"
    ts = int(time.time())
    url = "https://www.facebook.com/profile.php?id=%s&lst=%s%%3A%s%%3A%d&sk=friends&source_ref=pb_friends_tl" % (uid,c_id,uid,ts)
    url,html = util.get_page({"url":url})
    print(url)
    open("data/friends.html","w").write(html)
    '''
    params = {
        "dpr": "1",
        "__user": "******",
        "__a": "1",
        "__dyn": "7AgNeyfyGmaxx2u6aEyx91qeCwKAKGgyi8zQC-C267UKewWhE98nwgUy22EaUgxebkwy8xa5WjzEgDKuEjKewExaFQ12VVojxCUSbAWCDxi5-78O5u5o5aayrhVo9ohxGbwYUmC-UjDQ6Evwwh8gUW5oy5EG2ut5xq48a9Ef8Cu4rGUpCzo-8Gm8z8O784afxK9yUvy8lUGdyU4eQEB0",
        "__af": "j0",
        "__req": "26",
        "__be": "-1",
        "__pc": "EXP4:DEFAULT",
        "__rev": "3161010",
        "__spin_r": "3161010",
        "__spin_b": "trunk",
        "__spin_t": "1500360303",
    }
    content = open("data/friends.html").read()
    data, info = get_info(content, None)
    params["data"] = data
    ts = int(time.time())
    params["__spin_t"] = ts
    url = "https://www.facebook.com/ajax/pagelet/generic.php/AllFriendsAppCollectionPagelet"
    url, html = util.get_page({"url": url, "data": params})
    print(url)
    open("data/friends_page.html", "w").write(html)
def get(url, fname):
    url, html = util.get_page({"url": url})
    open(fname, "w").write(html)