def crawl_href(anchor_url, anchor_text, encoding, selenium):
    """Crawl a terminal article page: extract the main body text and try to grab the main image."""
    tweet = Tweet()
    tweet.href = anchor_url
    tweet.title = anchor_text

    # get content
    _logger.debug('extracting content from (%s)' % tweet.href)
    content = pbrowser.extract_main_body(tweet.href, selenium, encoding)
    if content == '':
        # skip articles that have no text at all
        return None
    else:
        tweet.content = content.encode('utf-8')

    # get image
    _logger.debug('trying to grab the main image from webpage, hint:(%s)' % tweet.title)
    image_url = ''
    image = None
    try:
        image, image_url = pbrowser.get_main_image_with_hint(
            url=tweet.href, hint=tweet.title, selenium=selenium,
            hint_encoding=encoding)
        _logger.debug('image url: %s' % image_url)
    except Exception, err:
        _logger.error('failed to grab image from %s: %s,%s' %
                      (tweet.href, unicode(err).encode('utf-8'),
                       traceback.format_exc()))

def recursive_crawl(url, encoding, selenium, agent, domain, terminate):
    """Crawl a hub page: follow every same-domain link and store each successfully crawled tweet."""
    if crawled_as_hub(agent, url, day_limit=3):
        _logger.debug('ignore, recently(3 days) crawled as hub: %s' % (url))
        return
    links = pbrowser.get_all_href(url, encoding)
    _logger.debug("processing %d links" % (len(links)))
    count = 0
    for idx, link in enumerate(links):
        # ignore hrefs to a different domain; accept all hrefs if 'domain' is an empty string
        if urlparse(link['href'].encode('utf-8')).netloc.find(domain) == -1:
            _logger.debug('ignore (%s), different from domain (%s)' %
                          (link['href'].encode('utf-8'), domain))
            continue
        tweet = None
        try:
            #tweet = try_crawl_href(link, encoding, agent, selenium)
            tweet = try_crawl_href(link['href'].encode('utf-8').lower(),
                                   link.text.encode('utf-8').strip(),
                                   encoding, agent, selenium)
        except Exception, err:
            _logger.error('crawl href failed: %s, %s' % (err, traceback.format_exc()))
            continue
        if tweet != None:
            count += 1
            try:
                agent.add_crawled_tweet(url, tweet)
                _logger.info('new tweet added to db, %d total, (%d / %d) processed' %
                             (count, idx, len(links)))
            except Exception, err:
                _logger.error('failed to add crawled tweet to DB: %s' % err)

def process_entry(self, entry, source):
    _logger.debug("pool stat: %d working %d waiting" %
                  (self.pool.running(), self.pool.waiting()))
    url = entry.link.encode('utf-8')
    if self.agent.wee_exists(url):
        _logger.debug("ignore existing wee with url:%s" % url)
        return
    _logger.debug("processing entry from (%s)" % url)
    title = entry.title.encode('utf-8')
    if entry.has_key('author'):
        author = entry.author.encode('utf-8')
    else:
        author = ''
    if entry.has_key('updated_parsed') and entry.updated_parsed != None:
        updated_time = int(time.mktime(entry.updated_parsed))
    else:
        # FeedParser doesn't understand the 'updated' field of this feed,
        # and neither can we. Probably some CJK chars.
        updated_time = int(time.time())
    text, html = self.process_content(entry)
    if entry.has_key('tags'):
        tags = [tag.term.encode('utf-8') for tag in entry.tags]
    else:
        tags = []
    try:
        self.agent.add_wee(source['id'], url, title, text, html,
                           updated_time, author, tags)
    except Exception, err:
        _logger.error("DB failed to add wee: %s" % traceback.format_exc())

def update_proxy_log(self, proxy_addr, log_type):
    cur_date = datetime.now().strftime("%Y-%m-%d")
    self.cursor.execute(
        "select * from proxy_log where proxy_ip = %s and collect_date = %s",
        (proxy_addr, cur_date))
    if self.cursor.rowcount == 0:
        use = 0
        fail = 0
    else:
        row = self.cursor.fetchone()
        use = row['use_count']
        fail = row['fail_count']
    if log_type == "use":
        use += 1
    elif log_type == "fail":
        fail += 1
    else:
        _logger.error("unknown proxy log type: %s" % log_type)
        return
    self.cursor.execute(
        "replace into proxy_log(proxy_ip, collect_date, use_count, fail_count) "
        "values(%s, %s, %s, %s)",
        (proxy_addr, cur_date, use, fail))
    self.conn.commit()

def _crawl_thirdary(self, anchor):
    self.output.write(' %s\n' % anchor.text.encode('utf-8'))
    _logger.info('crawling fourth (%s)' % anchor['href'])
    try:
        self._crawl_fourth(anchor['href'])
    except Exception, err:
        _logger.error('fourth(%s) failed: %s' % (anchor['href'], err))

def run(self):
    while True:
        self.heartbeat(pending_input=True)
        self.agent.restart()
        tasks = self.agent.get_all_crawler_task()
        my_task = None
        for task in tasks:
            if task['id'] % self.shard_count == self.shard_id:
                my_task = task
                break
        if not my_task:
            _logger.debug('no task for process shard %d' % self.shard_id)
            time.sleep(10)
            continue
        self.heartbeat(pending_input=False)
        _logger.debug("Got task:%s" % (my_task))
        try:
            if my_task['ttl'] > 1:
                self.process_hub(my_task)
            elif my_task['ttl'] == 1:
                self.process_terminal(my_task)
        except Exception, err:
            _logger.error('unexpected exception with url(%s):%s, %s' %
                          (my_task['anchor_url'], err, traceback.format_exc()))
        finally:
            pass

def _wait_load(self, minutes=1):
    MIN = 60 * 1000
    try:
        self.selenium.wait_for_page_to_load(timeout=MIN * minutes)
    except Exception, err:
        _logger.error('error waiting for page to load(%d min), will continue:%s' %
                      (minutes, err))

def find(self, predicate):
    if self.parent != None:
        _logger.error('can\'t be called from non-root node')
        return None
    for child in self.children:
        result = child._find_in_depth(predicate)
        if result != None:
            return result
    return None

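# A minimal sketch of the _find_in_depth helper that find() relies on, assuming a plain
# recursive depth-first search over the same `children` attribute. The real body is not
# shown in the source, so treat this as an illustration only.
def _find_in_depth(self, predicate):
    # return this node if it satisfies the predicate
    if predicate(self):
        return self
    # otherwise search each subtree in depth-first order
    for child in self.children:
        result = child._find_in_depth(predicate)
        if result != None:
            return result
    return None
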
def fill_account(daemon, helper, user):
    sele = daemon.selenium
    daemon.user = user
    _logger.info('start joining groups')
    try:
        daemon.grouping(force=True)
    except Exception, err:
        _logger.error('grouping failed: %s' % err)

def remove_statistic(self, email):
    email = email.strip()
    if email == '':
        _logger.error('email is empty')
        # bail out; an empty pattern would match every user's statistics
        return
    today = date.today().strftime("%Y-%m-%d")
    self.cursor.execute(
        "delete from user_statistic where user like '%%%s%%' and collect_date = '%s'"
        % (email, today))
    self.conn.commit()

def _post_article(self, browser, post_config):
    fail = 0
    while fail < 5:
        try:
            browser.open("post-new.php", timeout=10)
            break
        except Exception, err:
            _logger.error('open submit url:(post-new.php) failed %d / 5' % (fail + 1))
            fail += 1

def get_random_proxy(self):
    self.cursor.execute('select * from proxy')
    if self.cursor.rowcount == 0:
        _logger.error('no proxy in DB')
        return None
    all_proxy = list(self.cursor.fetchall())
    all_proxy.append(None)  # simulate direct access as one proxy
    return random.choice(all_proxy)

def update_token(self, user_email, app_id, value):
    try:
        self.cursor.execute("insert into sina_token values(%s, %s, %s)",
                            (user_email, app_id, value))
        self.conn.commit()
        return True
    except Exception, err:
        _logger.error("failed to update new token using insert:%s, will try update" % err)

def _login(self, browser, login_config):
    # pre-login
    fail = 0
    while fail < 5:
        try:
            browser.open(login_config['login_url'])
            break
        except Exception, err:
            _logger.error('open login page failed (%d/5)' % (fail + 1))
            fail += 1

def parse_html(doc):
    html = doc['content']
    # Remove comments, <script>, <style>
    try:
        soup = BeautifulSoup(html)
    except Exception, err:
        _logger.error('Failed to create BeautifulSoup for the document with url: ' +
                      doc['url'] + '\n' + traceback.format_exc())
        return []

def stop_follow(self, user, followee_id):
    try:
        self.cursor.execute(
            'delete from follow_date where user_email = %s and followee_id = %s',
            (user.uname, followee_id))
        self.conn.commit()
    except Exception, err:
        _logger.error('failed deleting follow date, user:(%s), followee_id:(%s), error:(%s)' %
                      (user.uname, followee_id, err))

def read_one_blog(self, rawblog):
    blog = _BlogPost()
    blog.url = rawblog['url']
    blog.paragraphs = filter(self._is_valid_paragraph, parse_html(rawblog))
    if self.parse_blog_meta(rawblog['content'], blog) != True:
        _logger.error('parse blog meta failed, url:%s' % blog.url)
        return None
    #print ('\n\n' + paragraph_sep + '\n\n').join(blog.contents).encode('utf-8')
    #print '\n\n' + doc_sep + '\n\n'
    return blog

def safe_execute(self, *argv, **kwargv):
    while True:
        try:
            ret = self.cursor.old_execute(*argv, **kwargv)
            return ret
        except MySQLdb.OperationalError, err:
            if err[0] == 2006:
                _logger.error('MySQL has gone away, will restart agent')
                self.restart()
            else:
                return None

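# A minimal sketch of the wiring safe_execute relies on, assuming the agent keeps the
# cursor's original execute around as old_execute and routes every query through the
# retrying wrapper above. _install_safe_execute is a hypothetical name; only
# safe_execute itself appears in the source.
def _install_safe_execute(self):
    self.cursor.old_execute = self.cursor.execute
    self.cursor.execute = self.safe_execute
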
def process_terminal(self, task):
    anchor_text = task['anchor_text']
    anchor_url = task['anchor_url']
    _logger.info('processing terminal link, url:%s' % anchor_url)
    tweet = None
    try:
        tweet = try_crawl_href(anchor_url, anchor_text, task['encoding'],
                               self.agent, self.sele)
    except Exception, err:
        _logger.error('crawl href failed: %s, %s' % (err, traceback.format_exc()))

def process_hub(self, task):
    """Process a hub page: queue every fresh same-domain link as a new crawler task with ttl - 1."""
    url = task['anchor_url']
    _logger.info('processing hub page, url:%s' % url)
    last_crawl = self.agent.get_crawl_history(url)
    now = datetime.now()
    if (now - last_crawl).days <= 3:
        _logger.debug('ignore, recently crawled: %s' % str(last_crawl))
        return
    domain = task['domain']
    encoding = task['encoding']
    links = pbrowser.get_all_href(url, encoding)
    _logger.debug("got %d links" % (len(links)))
    for idx, link in enumerate(links):
        if urlparse(link['href'].encode('utf-8')).netloc.find(domain) == -1:
            _logger.debug('ignore (%s), different from domain (%s)' %
                          (link['href'].encode('utf-8'), domain))
            continue
        # make a temporary source
        cur_url = link['href'].encode('utf-8').lower()
        cur_text = link.text.encode('utf-8').strip()
        if crawled_as_hub(self.agent, cur_url, day_limit=3):
            _logger.debug('ignore, recently(3 days) crawled as hub: %s' % (cur_url))
            continue
        if crawled_as_terminal(self.agent, cur_url, cur_text, day_limit=30):
            _logger.debug('ignore, recently(30 days) crawled as terminal: %s' % (cur_url))
            continue
        if in_task_queue(self.agent, cur_url, cur_text):
            _logger.debug('ignore, already added to task queue: %s' % (cur_url))
            continue
        ttl = task['ttl'] - 1
        try:
            self.agent.add_crawler_task(anchor_url=cur_url, anchor_text=cur_text,
                                        encoding=encoding, domain=domain, ttl=ttl)
            _logger.debug('%s added to task in DB' % cur_url)
        except Exception, err:
            _logger.error('failed to add crawler task, url:(%s), %s' % (cur_url, err))

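# A minimal sketch of the crawled_as_hub freshness check used above, assuming it simply
# compares the stored hub-crawl timestamp (as returned by agent.get_crawl_history) against
# day_limit. The real helper is not shown in the source; the body below illustrates the
# intended logic only.
def crawled_as_hub(agent, url, day_limit=3):
    last_crawl = agent.get_crawl_history(url)
    if last_crawl == None:
        # never crawled as a hub before
        return False
    return (datetime.now() - last_crawl).days <= day_limit
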
def shutdown(self):
    self.agent.stop()
    if hasattr(self, 'workers'):
        for worker in self.workers:
            pid = worker.pid
            try:
                self.kill_worker(worker)
                _logger.info('child process %d killed' % pid)
            except Exception, err:
                _logger.error('failed to kill child pid:%d, %s, it will become orphan' %
                              (pid, err))

def pick_proxy_for_slot(agent, slot_id, all_proxy):
    proxies = [proxy for proxy in all_proxy if proxy['slot_id'] == None]
    if len(proxies) == 0:
        _logger.error("No free proxy for slot %d" % slot_id)
        return
    for proxy in proxies:
        if not bad_proxy(proxy):
            _logger.debug("got healthy proxy at %s" % proxy['addr'])
            agent.update_proxy_slot(slot_id, proxy)
            proxy['slot_id'] = slot_id
            return
    _logger.error("Can't find any decent proxy for slot %d" % slot_id)

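# A minimal sketch of the bad_proxy health check used above. The source does not show it;
# assuming each proxy row carries the per-day use/fail counters maintained by
# update_proxy_log, a proxy could be considered bad once its failure ratio gets too high.
# The field names 'use_count' / 'fail_count' and the 0.5 threshold are assumptions for
# illustration only.
def bad_proxy(proxy, max_fail_ratio=0.5):
    use = proxy.get('use_count', 0)
    fail = proxy.get('fail_count', 0)
    if use + fail == 0:
        # never tried yet, give it a chance
        return False
    return float(fail) / (use + fail) > max_fail_ratio
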
def _crawl_secondary(self, div):
    tb = div
    self.output.write(' %s\n' % div.text.encode('utf-8'))
    while not hasattr(tb, 'name') or tb.name != u"table":
        tb = tb.nextSibling
    for third in tb.findAll('a'):
        _logger.info('crawling thirdary (%s)' % third.text)
        try:
            self._crawl_thirdary(third)
        except Exception, err:
            _logger.error('third(%s) failed: %s\n%s' %
                          (third.text.encode('utf-8'), err, traceback.format_exc()))

def __init__(self, thread=3):
    self._success_count = 0
    self._attempt_count = 0
    # Load comments
    with open('blog-comments') as comm:
        self.comment_seg = filter(lambda c: len(c) > 5,
                                  map(str.strip, comm.read().split('#')))
    if len(self.comment_seg) < 5:
        _logger.error('%d comments found, too small' % len(self.comment_seg))
    self.thread_pool = xthreading.ThreadPool(maxThreads=thread)

def crawl_authors(self, authors, callback):
    for author in authors:
        cur_url = author
        _logger.info("crawling author from %s" % cur_url)
        try:
            self.crawl_one_author(cur_url, callback)
            _logger.debug('sleeping for 5 sec')
            time.sleep(5)
        except Exception, err:
            _logger.error("crawl one author failed, url:(%s), error:%s, %s" %
                          (cur_url, err, traceback.format_exc()))
            continue

def post_blog(self, browser, post_config):
    self._wp_base_url = post_config['base-url']
    self._wp_config['login']['username'] = post_config['username']
    self._wp_config['login']['password'] = post_config['password']
    self._wp_config['login']['login_url'] = self._wp_base_url + 'wp-login.php'
    if not self._login(browser, self._wp_config['login']):
        _logger.error('login failed (%s:%s), url:(%s)' %
                      (post_config['username'], post_config['password'],
                       self._wp_config['login']['login_url']))
        return False, ''
    self._sleep('after_login')
    self._wp_config['post']['title'] = post_config['title']
    self._wp_config['post']['content'] = post_config['content']
    return self._post_article(browser, self._wp_config['post'])

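# Example of the post_config dict that post_blog() expects. Every key is taken from the
# code above; the values are placeholders and the URL is hypothetical.
example_post_config = {
    'base-url': 'http://blog.example.com/',  # login_url becomes base-url + 'wp-login.php'
    'username': 'author',
    'password': 'secret',
    'title': 'Post title',
    'content': '<p>Post body</p>',
}
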
def crawl_second(self, url):
    self._randsleep()
    _logger.debug('opening url:%s' % url)
    html = self.br.open(url).read()
    soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"))
    for anchor in soup.findAll('a'):
        try:
            href = anchor['href']
            # Ignore internal links
            if href[:4] != "http" or href.find('hao123.com') != -1:
                continue
            self.output.write(' %s %s\n' % (href.encode('utf8'),
                                            anchor.text.encode('utf8')))
        except Exception, err:
            _logger.error('got error with anchor(%s): %s' % (str(anchor), err))

def fetch_source(self, source):
    _logger.debug("pool stat: %d working %d waiting" %
                  (self.pool.running(), self.pool.waiting()))
    _logger.info("crawling source id=%d url=%s" % (source['id'], source['url']))
    cur_time = int(time.time())
    last_crawl_time = source['last_crawl_time']
    if cur_time - last_crawl_time < HOUR:
        _logger.info("ignore source(%s), last crawled %d minutes ago" %
                     (source['url'], (cur_time - last_crawl_time) / 60))
        return
    try:
        _logger.debug("fetching feed from (%s)" % source['url'])
        p = feedparser.parse(source['url'])
        _logger.debug("fetched from (%s)" % source['url'])
        if p.feed.has_key('updated_parsed') and p.feed.updated_parsed != None:
            cur_feed_time = int(time.mktime(p.feed.updated_parsed))
        else:
            # FeedParser doesn't understand the 'updated' field of this feed,
            # and neither can we. Probably some CJK chars.
            cur_feed_time = int(time.time())
        db_feed_time = source['last_feed_time']
        if db_feed_time >= cur_feed_time:
            _logger.info("ignore source(%s), no new feed. Last feed:%s, cur feed:%s" %
                         (source['url'], datetime.fromtimestamp(db_feed_time),
                          datetime.fromtimestamp(cur_feed_time)))
            self.agent.update_source_time(source)
        else:
            _logger.info("processing %d entries from %s" % (len(p.entries), source['url']))
            for entry in p.entries:
                self.process_entry(entry, source)
            self.agent.update_source_time(source, cur_feed_time)
            _logger.debug("source(%s) updated: %s" %
                          (source['url'], datetime.fromtimestamp(cur_feed_time)))
        _logger.info("source(id=%d) success" % source['id'])
        _logger.debug("pool stat: %d working %d waiting" %
                      (self.pool.running(), self.pool.waiting()))
    except Exception, err:
        _logger.error("crawling failed for source id=%d, %s: %s" %
                      (source['id'], source['url'], traceback.format_exc()))

def _crawl_primary(self, anchor):
    self.output.write(anchor.text.encode('utf-8') + '\n')
    self._randsleep()
    html = self.br.open(anchor['href']).read()
    html = util.convert_to_utf8(html, 'gb2312')
    soup = BeautifulSoup(html)
    seconds = soup.findAll('div', 'dirtit')
    for second in seconds:
        _logger.info('crawling secondary category: (%s)' % second.text.encode('utf-8'))
        try:
            self._crawl_secondary(second)
        except Exception, err:
            _logger.error('secondary(%s) failed: %s' % (second.text.encode('utf-8'), err))

class FeedCrawler(object):
    def __init__(self, agent, pool):
        self.agent = agent
        self.pool = pool

    def grab_image(self, html, entry):
        wee_url = entry.link.encode('utf-8')
        soup = BeautifulSoup(html, fromEncoding="utf-8")
        img = soup.find('img', src=True)
        if img == None:
            _logger.debug("%s has no image inside" % wee_url)
            return
        url = img['src']
        _logger.debug('downloading image from %s' % url)
        try:
            br = pbrowser.get_browser()
            image = br.download_image(url, base_url=wee_url).read()
        except Exception, err:
            _logger.error("downloading image failed(%s), baseurl(%s): %s" %
                          (url, wee_url, traceback.format_exc()))
            return
        try:
            self.agent.add_wee_image(wee_url, image)
            _logger.debug("image added for wee:%s" % wee_url)
        except Exception, err:
            _logger.error("db error, failed to add image for wee %s: %s" % (wee_url, err))