async def main():
    # We use a session to take advantage of TCP keep-alive.
    # Set a 3 second read and connect timeout. The default is 5 minutes.
    async with aiohttp.ClientSession(conn_timeout=3, read_timeout=3) as session:
        tasks = [async_download_link(session, link) for link in get_links()]
        # gather aggregates all of the tasks and schedules them in the event loop
        await asyncio.gather(*tasks, return_exceptions=True)
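# `async_download_link` is not shown in this snippet. A minimal sketch of what
# it presumably looks like (stream the response body to a local file); this is
# an assumption, not the author's verbatim helper:
import os

async def async_download_link(session, link):
    download_path = os.path.basename(link)
    async with session.get(link) as response:
        with open(download_path, 'wb') as f:
            f.write(await response.read())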
def main():
    """Scrape all of today's articles on https://blogs.fangraphs.com/."""
    # get the links to all the articles
    url = 'https://blogs.fangraphs.com/'
    response = requests.get(url)
    page = response.text
    parser = "html.parser"
    soup = BeautifulSoup(page, parser)
    links = utils.get_links(soup)

    # if an article was written today, save it
    today_datetime = datetime.datetime.now().date()
    article_title_lst = []
    article_date_lst = []
    article_text_lst = []
    for link in links:
        response = requests.get(link)
        page = response.text
        soup = BeautifulSoup(page, parser)
        article_date = utils.get_article_date(soup)
        article_date_datetime = datetime.datetime.strptime(
            article_date, '%B %d, %Y').date()
        if today_datetime == article_date_datetime:
            article_title = utils.get_article_title(soup)
            article_text = utils.get_article_text(soup)
            article_title_lst.append(article_title)
            article_date_lst.append(article_date)
            article_text_lst.append(article_text)
    return article_title_lst, article_date_lst, article_text_lst
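# `utils.get_links` is not shown here. A plausible sketch for the FanGraphs
# index page, assuming each headline is an <a> tag inside an <h3>; the real
# helper and its selector may well differ:
def get_links(soup):
    return [h3.find('a')['href'] for h3 in soup.find_all('h3') if h3.find('a')]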
def multi_threaded_execution(num_of_threads=8):
    class WorkerThread(threading.Thread):
        def run(self):
            while True:
                directory, link = task_queue.get()
                download_link(directory, link)
                task_queue.task_done()

    start_ts = time()
    download_dir = setup_download_dir()
    links = get_links(CLIENT_ID)
    count = len(links)
    # In Python 3 the stdlib module is `queue` (it was `Queue` in Python 2)
    task_queue = queue.Queue()
    for _ in range(num_of_threads):
        t = WorkerThread()
        t.daemon = True
        t.start()
    for link in links:
        task_queue.put((download_dir, link))
    task_queue.join()
    logging.info('Parallel download of %s images took %s seconds',
                 count, time() - start_ts)
def crawl(records, to_be_visited_queue, item_topic_arn, max_children=999):
    """
    Consume catalogs from the queue and insert their children for future visits.

    Input:
        records (list): catalogs to be visited
        to_be_visited_queue (string): SQS queue that receives the children
        item_topic_arn (string): ARN of the SNS topic that receives the STAC items
        max_children (int): maximum number of children to be visited
    """
    for record in records:
        catalog = url_to_json(record['body'])
        clinks, items = get_links(catalog, record['body'])
        # Child catalogs are placed into the queue to be visited
        for index, clink in enumerate(clinks):
            if index == max_children:
                break
            SQS_CLIENT.send_message(QueueUrl=to_be_visited_queue,
                                    MessageBody=clink)
            print('Catalog inserted: ', clink)
        # Items are sent to the SNS topic
        for item in items:
            json_item = url_to_json(item)
            SNS_CLIENT.publish(TargetArn=item_topic_arn,
                               Message=json.dumps(json_item))
def parse_tweet(self, response):
    # logging.info('Processing --> ' + response.url)
    username = response.xpath(
        '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="username u-dir u-textTruncate"]/b/text()'
    ).get(default='')
    full_name = response.xpath(
        '//*[@class="permalink-inner permalink-tweet-container"]//*[@class="FullNameGroup"]/strong/text()'
    ).get(default='')
    try:
        # the <title> text is usually of the form '<name> on Twitter: "<tweet>"'
        tweet_text = response.xpath('//title/text()').get(
            default='').split(':')[1].strip()
    except IndexError:
        # fall back to assembling the text from the tweet container
        tweet_text = ' '.join(
            response.xpath(
                '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-text-container"]/p//text()'
            ).getall()).strip()
    image_list = response.xpath(
        '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="AdaptiveMediaOuterContainer"]//img/@src'
    ).getall()
    date_time = response.xpath(
        '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="client-and-actions"]/span[@class="metadata"]/span/text()'
    ).get(default='')
    date_time = parser.parse(date_time.replace(
        '-', '')).strftime('%Y-%m-%d %H:%M:%S')
    retweets = response.xpath(
        '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-retweets stat-count"]/a/strong/text()'
    ).get(default='')
    likes = response.xpath(
        '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[@class="js-tweet-details-fixer tweet-details-fixer"]/div[@class="js-tweet-stats-container tweet-stats-container"]//*[@class="js-stat-count js-stat-favorites stat-count"]/a/strong/text()'
    ).get(default='')
    replies = response.xpath(
        '//*[contains(@class,"permalink-inner permalink-tweet-container")]//*[contains(@id,"profile-tweet-action-reply-count")]/parent::span/@data-tweet-stat-count'
    ).get(default='')
    mentions = get_mentions(tweet_text)
    hashtags = get_hashtags(tweet_text)
    cta = get_links(tweet_text)

    result = {
        'username': username.lower(),
        'full_name': full_name,
        'twitter_url': response.url,
        'tweet_text': tweet_text,
        'tweet_time': str(date_time),
        'number_of_likes': str(likes),
        'no_of_retweets': str(retweets),
        'no_of_replies': str(replies),
        'mentions': ' | '.join(mentions),
        'no_of_mentions': str(len(mentions)),
        'hashtags': ' | '.join(hashtags),
        'no_of_hashtags': str(len(hashtags)),
        'call_to_action': ' | '.join(cta),
        'image_url': ' | '.join(image_list),
        'tag': self.tag,
    }
    yield result
def queue_backed_execution():
    start_time = time()
    download_dir = setup_download_dir()
    links = get_links(CLIENT_ID)
    count = len(links)
    # RQ's Queue class (from rq import Queue; from redis import Redis)
    q = Queue(connection=Redis(host='localhost', port=6379))
    for link in links:
        q.enqueue(download_link, download_dir, link)
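# The enqueued jobs are not executed in this process: a separate RQ worker has
# to consume them. A minimal sketch, assuming `download_link` is importable by
# the worker process (equivalently, run `rq worker` from the shell):
from redis import Redis
from rq import Queue, Worker

redis_conn = Redis(host='localhost', port=6379)
worker = Worker([Queue(connection=redis_conn)], connection=redis_conn)
worker.work()  # blocks, executing download_link jobs as they arrive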
def get_my_url():
    """Fetch my own profile URL to use as the crawl's starting page."""
    myself_soup = utils.get_links(
        session, "https://www.zhihu.com/settings/profile")
    my_url = myself_soup.find(
        "div", {"id": "js-url-preview", "class": "url-preview"})
    return "https://www." + my_url.get_text()
def single_threaded_execution():
    start_ts = time()
    download_directory = setup_download_dir()
    links = get_links(CLIENT_ID)
    count = len(links)
    for link in links:
        download_link(download_directory, link)
    logging.info('Sequential download of %s images took %s seconds',
                 count, time() - start_ts)
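# The Imgur-based variants above all share `get_links(CLIENT_ID)`. A sketch of
# a plausible implementation against the Imgur gallery API; the endpoint and
# the filtering are assumptions, not the author's verbatim code:
import requests

def get_links(client_id):
    headers = {'Authorization': 'Client-ID {}'.format(client_id)}
    resp = requests.get('https://api.imgur.com/3/gallery/random/random/',
                        headers=headers)
    resp.raise_for_status()
    # keep only direct image links, skipping albums
    return [item['link'] for item in resp.json()['data']
            if 'link' in item and not item.get('is_album')]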
def crawlmainpage(url, session):
    print("entering crawl mainpage")
    mainpage_soup = utils.get_links(session, url)
    # print(mainpage_soup)
    # print('\n')
    mainpage_soup_str = str(mainpage_soup)
    with open('../mainpage.txt', 'wt', encoding='utf-8') as main:
        main.write(mainpage_soup_str)
    firstrule = re.compile(
        r'https:\\u002F\\u002Fapi.zhihu.com\\u002Fquestions\\u002F[0-9]+')
    firstmatch = re.findall(firstrule, mainpage_soup_str)
def multi_process_execution(pool_size=8):
    start_time = time()
    download_dir = setup_download_dir()
    links = get_links(CLIENT_ID)
    count = len(links)
    download = partial(download_link, download_dir)
    # use the pool as a context manager so its processes are cleaned up
    with Pool(pool_size) as p:
        p.map(download, links)
    logging.info('Multiprocess download of %s images took %s seconds',
                 count, time() - start_time)
def get_followees(user_url):
    """
    Get a user's followee list. When a user follows many people, the site only
    renders the first part of the list and loads the rest via AJAX; we only
    scrape that initial part.
    """
    user_followees_url = user_url + "/followees"
    followees_list = []
    followees_soup = utils.get_links(session, user_followees_url)
    for i in followees_soup.find_all("span", {"class": "author-link-line"}):
        followee_url = i.find("a").attrs['href']
        followees_list.append(followee_url)
    if followees_list:
        return followees_list
    raise NoFolloweeError
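# `NoFolloweeError` is raised above but never defined in these snippets; it is
# presumably a trivial custom exception along these lines:
class NoFolloweeError(Exception):
    """Raised when a user's followee list comes back empty."""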
def main():
    """Run the script."""
    args = parse_args()
    setup_logging(args)
    outdir = Path(get_output_dir())
    if not outdir.exists():
        outdir.mkdir()
    copy_ui()
    nb_workers = args.j if args.j else os.cpu_count()
    get_logger().warning("Using %s workers", nb_workers)
    if nb_workers > 1:
        with contextlib.closing(multiprocessing.Pool(nb_workers)) as pool:
            pool.starmap(start_link_handler,
                         [(link, args) for link in get_links(args.file[0])])
    else:
        for link in get_links(args.file[0]):
            start_link_handler(link, args)
def create_links_pagelist(pagelist_noclean):
    start_time = time.time()
    pagelist_links = []
    listsize = len(pagelist_noclean)
    for i, (page_id, title, content) in enumerate(pagelist_noclean):
        links = get_links(content)
        pagelist_links.append((page_id, title, links))
        print_percentage(i, listsize)
    elapsed_time = time.time() - start_time
    print(" ** Finished creating links pagelist")
    print(" - Elapsed time creating links pagelist: {}".format(
        hms_string(elapsed_time)))
    return pagelist_links
def get_my_url(session):
    """Fetch my own profile URL to use as the crawl's starting page."""
    myself_soup = utils.get_links(session, val['account_url'])
    # with open('../temp.txt', 'rt', encoding='utf-8') as temp:
    #     with open('../accoutpage.txt', 'wt', encoding='utf-8') as accoutp:
    #         accoutp.truncate()
    #         accoutp.write(temp.read())
    print(myself_soup)
    myidrule = re.compile(r'(?<=people","id":")[0-9a-z]+')
    myid = re.search(myidrule, str(myself_soup))
    try:
        my_id = myid.group(0)
    except AttributeError:
        return False
    print("\n\nmy_id is " + my_id)
    return "https://www.zhihu.com/people/" + my_id
def run(self):
    """Thread main loop."""
    while True:
        self.get_task()
        # print the URL currently being scraped
        print("Thread #" + str(self.thread_grade) + ": " + self.url)
        soup = utils.get_links(self.session, self.url, self.proxy)
        if not soup:
            print("Url Error")
            continue
        data_dict = self.get_info(soup)
        # store the result in the database
        dbAPI.store_by_mongodb(data_dict)
        # throttle the crawl speed
        time.sleep(val['sleep'])
def get_followees(user_url, session):
    """
    Get a user's followee list. When a user follows many people, the site only
    renders the first part of the list and loads the rest via AJAX; we only
    scrape that initial part.
    """
    user_followees_url = user_url + "/following"
    print(user_followees_url)
    followees_list = []
    followees_soup = utils.get_links(session, user_followees_url)
    with open("../following.txt", 'w', encoding='utf-8') as temp:
        temp.write(str(followees_soup.prettify()))
    for i in followees_soup.find_all("a", {
            "class": "UserLink-link",
            "data-za-detail-view-element_name": "User"
    }):
        print("this is i " + i.get('href'))
        followee_url = "http:" + i.get('href')
        # skip duplicate entries
        if followee_url not in followees_list:
            followees_list.append(followee_url)
    if followees_list:
        return followees_list
    raise NoFolloweeError
""" Download images asynchronous use concurrent.futures """ from concurrent.futures import ThreadPoolExecutor from time import time from utils import get_links, download_link if __name__ == '__main__': ts = time() links = get_links() # By placing the executor inside a with block, the executors shutdown method # will be called cleaning up threads. # # By default, the executor sets number of workers to 5 times the number of # CPUs. with ThreadPoolExecutor(4) as executor: executor.map(download_link, links, timeout=30) print("Total time download: {}s".format(time() - ts))
""" Download images synchronous """ from utils import get_links, download_link from time import time if __name__ == '__main__': start_time = time() for link in get_links(): download_link(link) print("Total time download: {}s".format(time() - start_time))
def process_detail(self, data):
    # -1- First update the fetched page's check_time
    assert len(data) == self.detail_len
    head_dict = json.loads(data[0:self.piece1_len - 1])
    update_item = {}
    update_item['url_no'] = head_dict['url_no']
    update_item['check_time'] = head_dict['timestamp']
    update_item['status_code'] = head_dict['Status-Code']
    ret = self.linkdb_logic.update(update_item)
    if ret['retcode'] != 0:
        logging.warning("update error. uno[%u] status_code[%u] check_time[%u] code[%u] mesg[%s] fno[%u] bid[%u]"
                        % (update_item['url_no'], update_item['status_code'],
                           update_item['check_time'], ret['retcode'],
                           ret['message'], self.fileno, self.blockid))

    # -2- Parse the page to extract follow-up links (expanding within the host only)
    assert self.detail_len >= self.piece1_len
    if head_dict['Status-Code'] in (301, 302) and 'Location' in head_dict:
        pass  # Skip all redirects!
    elif head_dict['Status-Code'] == 200 and self.detail_len > self.piece1_len:
        t = time.time()
        try:
            page_content = utils.unzipData(data[self.piece1_len:])
            t = time.time()
            url_set = utils.get_links(page_content, host=head_dict['host'],
                                      inhost=True, base_url=head_dict['url'])
            try:
                # Signs are the low 63 bits of the MD5 of the URL
                refer_sign = (int(hashlib.md5(head_dict['url'].encode()).hexdigest()[:16], 16)
                              & 0x7fffffffffffffff)
                # First check which url_signs already exist in the database
                url_sign_list = [int(hashlib.md5(url.encode()).hexdigest()[:16], 16)
                                 & 0x7fffffffffffffff for url in url_set]
                if url_sign_list:
                    t = time.time()
                    url_sign_retlist = self.linkdb_logic.select_url_sign(url_sign_list)
                    t = time.time()
                    url_sign_set = set(x['url_sign'] for x in url_sign_retlist)
                else:
                    url_sign_set = set()
                link_item_list = []
                for url in url_set:
                    link_item = {}
                    link_item['url_sign'] = (int(hashlib.md5(url.encode()).hexdigest()[:16], 16)
                                             & 0x7fffffffffffffff)
                    if link_item['url_sign'] in url_sign_set:
                        continue
                    link_item['refer_sign'] = refer_sign
                    link_item['creat_time'] = head_dict['timestamp']
                    link_item['host_no'] = head_dict['host_no']
                    link_item['check_time'] = 0
                    link_item['url_type'] = 0
                    link_item['status_code'] = 0
                    link_item['url'] = str(url)
                    link_item_list.append(link_item)
                if link_item_list:
                    ret = self.linkdb_logic.insert_batch(link_item_list)
                    if ret['retcode'] != 0:
                        logging.warning("insert error. code[%u] mesg[%s] fno[%u] bid[%u]"
                                        % (ret['retcode'], ret['message'],
                                           self.fileno, self.blockid))
                    log_message = ("log_id[%u] p1l[%u] zlen[%u] delay[%u] all[%u] err[%u] jsonstr[%s] time_cost[%0.3f] fno[%u] bid[%u]"
                                   % (self.log_id, self.piece1_len,
                                      self.detail_len - self.piece1_len,
                                      int(time.time()) - head_dict['timestamp'],
                                      len(link_item_list), ret['failnum'],
                                      data[0:self.piece1_len - 1],
                                      time.time() - t, self.fileno, self.blockid))
                else:
                    log_message = ("link_item_list empty. uno[%u] host[%s] delay[%u] p1l[%u] dlen[%u] url_count[%u] filtered to zero. fno:%u bid:%u"
                                   % (head_dict['url_no'], head_dict['host'],
                                      int(time.time()) - head_dict['timestamp'],
                                      self.piece1_len, self.detail_len,
                                      len(url_set), self.fileno, self.blockid))
                logging.info(log_message)
            except Exception:
                logging.warning('error while extracting urls. jsonstr:%s fno[%u] bid[%u]'
                                % (data[0:self.piece1_len - 1], self.fileno, self.blockid))
        except Exception:
            logging.warning('unzip fail jsonstr:%s fno:%u bid:%u'
                            % (data[0:self.piece1_len - 1], self.fileno, self.blockid))
    else:
        # TODO handle other status codes
        logging.warning('unexpected response. jsonstr:%s fno[%u] bid[%u]'
                        % (data[0:self.piece1_len - 1], self.fileno, self.blockid))
    self.send_response()
def crawlentpage(url, session, fromwhere):
    print("entering crawl entpage\n")
    myrule = re.compile(
        r'<a class=\"zu-top-nav-userinfo\" href=\"\/people\/(.*?)\">')
    followerrule = re.compile(
        r'<a class=\"zg-link author-link\" href=\"\/people\/(.*?)\">')
    nextfollowerrule = re.compile(
        r'<a class=\"zg-link author-link\" href=\"\\/people\\/(.*?)\">')
    ppidrule = re.compile(r'(?<=id="pp-)[0-9a-z]+')
    lastonerule = re.compile(r'(?<=mi-)[0-9]+')
    entpage_soup = utils.get_links(session, url)
    entpage_soup_str = str(entpage_soup)
    mymatch = re.findall(myrule, entpage_soup_str)
    followermatch = re.findall(followerrule, entpage_soup_str)
    ppidmatch = re.findall(ppidrule, entpage_soup_str)
    cirnum = 0
    ip = utils.prival['mongodbnet']['host']
    port = utils.prival['mongodbnet']['port']
    remoteclient = pymongo.MongoClient(str(ip) + ":" + str(port))
    while cirnum < len(followermatch):
        try:
            insertdomainid(remoteclient, val['dbnamenet'], val['colnamenet'],
                           followermatch[cirnum], ppidmatch[cirnum], fromwhere)
        except OSError as ee:
            print("{0}".format(ee))
        cirnum = cirnum + 1
    print('\nmy Domainhack is ' + str(mymatch) + '\n')
    print("\nthe followers' Domainhacks are " + str(followermatch) + '\n')
    print('while their ppid are ' + str(ppidmatch) + '\n')
    if len(followermatch) < 20:
        return
    allstart = re.findall(lastonerule, entpage_soup_str)
    try:
        truestart = allstart[len(allstart) - 1]
        lasttruestart = truestart
    except IndexError as ie:
        print("{0}".format(ie))
        return
    print('\nthe truestart is: ' + truestart)
    time.sleep(random.randint(5, 10))
    postnum = 0
    while postnum >= 0:  # loops until a return; utils.prival['flippagenum'] could cap this
        offsetnum = 40 + postnum * 20
        startnum = truestart
        print('offsetnum=' + str(offsetnum) + ' startnum=' + str(startnum))
        params = {"offset": str(offsetnum), "start": str(startnum)}
        nextentpage_soup = utils.mypost(session, url, params)
        try:
            nextentpage_soup_str = str(nextentpage_soup.prettify().encode(
                'latin-1').decode('unicode_escape'))
            if '"errcode": 1991832' in nextentpage_soup_str:
                print(nextentpage_soup_str)
                print('please switch to an account whose status is normal')
        except UnicodeEncodeError:
            nextentpage_soup_str = str(nextentpage_soup)
        nextfollowermatch = re.findall(nextfollowerrule, nextentpage_soup_str)
        ppidmatch = re.findall(ppidrule, nextentpage_soup_str)
        cirnum = 0
        while cirnum < len(nextfollowermatch):
            try:
                insertdomainid(remoteclient, val['dbnamenet'], val['colnamenet'],
                               nextfollowermatch[cirnum], ppidmatch[cirnum],
                               fromwhere)
            except OSError as ee:
                print("{0}".format(ee))
            cirnum = cirnum + 1
        print("\nthe followers' Domainhacks are " + str(nextfollowermatch) + '\n')
        print('while their ppid are ' + str(ppidmatch) + '\n')
        if len(nextfollowermatch) < 20:
            return
        postnum = postnum + 1
        wholestart = re.findall(lastonerule, nextentpage_soup_str)
        if len(wholestart) > 0:
            truestart = wholestart[len(wholestart) - 1]
            if truestart == lasttruestart:
                print('flipped ' + str(postnum) + ' pages')
                return
            lasttruestart = truestart
            print('\nthe truestart is: ' + truestart)
        else:
            print('next start not found')
            print('flipped ' + str(postnum) + ' pages')
            return
        time.sleep(random.randint(10, 15))
def login(self, captcha_lang: str = 'en', load_cookies: bool = True):
    """
    Simulate logging in to Zhihu.

    :param captcha_lang: captcha language, 'en' or 'cn'
    :param load_cookies: whether to load the Cookies saved last time
    :return: (bool, session)
    """
    if load_cookies and self.load_cookies():
        print('Loaded Cookies file')
        print('the cookie.txt selected is ' + str(orifromtxt))
        if self.check_login():
            print('Login succeeded')
            with open(utils.lastfile, 'wt', encoding='utf-8') as tempname:
                tempname.writelines(utils.cookiepath)
            personinfo = utils.get_links(self.session, utils.val['apime_url'])
            if not isinstance(personinfo, str):
                try:
                    print(personinfo.text)
                except AttributeError:
                    print("AttributeError: object has no attribute 'text'")
            else:
                print(personinfo)
            return True, self.session
        print('Cookies have expired')
    self._check_user_pass()
    self.login_data.update({
        'username': self.username,
        'password': self.password,
        'lang': captcha_lang
    })
    timestamp = int(time.time() * 1000)
    self.login_data.update({
        'captcha': self._get_captcha(self.login_data['lang']),
        'timestamp': timestamp,
        'signature': self._get_signature(timestamp)
    })
    headers = self.session.headers.copy()
    headers.update({
        'content-type': 'application/x-www-form-urlencoded',
        'x-zse-83': '3_1.1',
        'x-xsrftoken': self._get_xsrf()
    })
    data = self._encrypt(self.login_data)
    login_api = 'https://www.zhihu.com/api/v3/oauth/sign_in'
    resp = self.session.post(login_api, data=data, headers=headers)
    while 'error' in resp.text:
        print(json.loads(resp.text)['error'])
        # retry with a fresh captcha
        self.login_data.update(
            {'captcha': self._get_captcha(self.login_data['lang'])})
        data = self._encrypt(self.login_data)
        resp = self.session.post(login_api, data=data, headers=headers)
    if self.check_login():
        print('Login succeeded')
        with open(utils.lastfile, 'wt', encoding='utf-8') as tempname:
            tempname.writelines(utils.cookiepath)
        personinfo = utils.get_links(self.session, utils.val['apime_url'])
        print(personinfo)
        return True, self.session
    print('Login failed')
    return False, self.session
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
import sys
import time

from utils import read_last_email, get_links, logger, Requester, auth_3dhub
from config import NAME_3DHUBS, PASSWORD_3DHUBS

if __name__ == "__main__":
    logger.info("Starting '{}' script.".format(sys.argv[0]))
    logger.debug("Read last email.")
    message_body = read_last_email()
    links_from_message = get_links(message_body)
    ignore_links_from_message = True
    if ignore_links_from_message and len(links_from_message) == 0:
        logger.error("There are no links in the email body.")
        sys.exit(1)
    requester = Requester()
    logger.debug("Authorize via account '{}'.".format(NAME_3DHUBS))
    auth_3dhub(requester, NAME_3DHUBS, PASSWORD_3DHUBS)
    time.sleep(1)
    if not ignore_links_from_message:
        link = links_from_message[0]
        logger.debug("Go to link [{}].".format(link))
        requester.get(link)
        sys.exit(0)
    logger.debug("Go to 'My orders'.")