Example #1
def crawl_href(anchor_url, anchor_text, encoding, selenium):
    tweet = Tweet()
    tweet.href = anchor_url
    tweet.title = anchor_text

    # get content
    _logger.debug('extracting content from (%s)' % tweet.href)
    content = pbrowser.extract_main_body(tweet.href, selenium, encoding)
    if content == '':  # skip articles with no extractable text
        return None
    else:
        tweet.content = content.encode('utf-8')

    # get image
    _logger.debug('trying to grab the main image from webpage, hint:(%s)' %
                  tweet.title)
    image_url = ''
    image = None

    try:
        image, image_url = pbrowser.get_main_image_with_hint(
            url=tweet.href,
            hint=tweet.title,
            selenium=selenium,
            hint_encoding=encoding)
        _logger.debug('image url: %s' % image_url)
    except Exception, err:
        _logger.error(
            'failed to grab image from %s: %s,%s' %
            (tweet.href, unicode(err).encode('utf-8'), traceback.format_exc()))
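
The Tweet container is not defined anywhere in these examples; a minimal hypothetical stand-in with only the fields crawl_href assigns would be:

class Tweet(object):
    # Hypothetical stand-in; the real class is not shown in this listing.
    def __init__(self):
        self.href = ''
        self.title = ''
        self.content = ''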
Example #2
def recursive_crawl(url, encoding, selenium, agent, domain, terminate):
    if crawled_as_hub(agent, url, day_limit=3):
        _logger.debug('ignore, recently(3 days) crawled as hub: %s' % (url))
        return

    links = pbrowser.get_all_href(url, encoding)
    _logger.debug("processing %d links" % (len(links)))
    count = 0
    for idx, link in enumerate(links):
        # ignore hrefs to a different domain; accept all hrefs if 'domain' is the empty string
        if urlparse(link['href'].encode('utf-8')).netloc.find(domain) == -1:
            _logger.debug('ignore (%s), different from domain (%s)' %
                          (link['href'].encode('utf-8'), domain))
            continue

        tweet = None
        try:
            #tweet = try_crawl_href(link, encoding, agent, selenium)
            tweet = try_crawl_href(link['href'].encode('utf-8').lower(),
                                   link.text.encode('utf-8').strip(), encoding,
                                   agent, selenium)
        except Exception, err:
            _logger.error('crawl href failed: %s, %s' %
                          (err, traceback.format_exc()))
            continue

        if tweet != None:
            count += 1
            try:
                agent.add_crawled_tweet(url, tweet)
                _logger.info(
                    'new tweet added to db, %d total, (%d / %d) processed' %
                    (count, idx, len(links)))
            except Exception, err:
                _logger.error('failed to add crawled tweet to DB: %s' % err)
Example #3
    def process_entry(self, entry, source):
        _logger.debug("pool stat: %d working %d waiting" %
                      (self.pool.running(), self.pool.waiting()))
        url = entry.link.encode('utf-8')
        if self.agent.wee_exists(url):
            _logger.debug("ignore existed wee with url:%s" % url)
            return

        _logger.debug("processing entry from (%s)" % url)
        title = entry.title.encode('utf-8')
        if entry.has_key('author'):
            author = entry.author.encode('utf-8')
        else:
            author = ''

        if entry.has_key('updated_parsed') and entry.updated_parsed != None:
            updated_time = int(time.mktime(entry.updated_parsed))
        else:
            # FeedParser doesn't understand the 'updated' field of this feed,
            # neither can we. Probably some CJK chars.
            updated_time = int(time.time())
        text, html = self.process_content(entry)
        if entry.has_key('tags'):
            tags = [tag.term.encode('utf-8') for tag in entry.tags]
        else:
            tags = []
        try:
            self.agent.add_wee(source['id'], url, title, text, html,
                               updated_time, author, tags)
        except Exception, err:
            _logger.error("DB failed to add wee: %s" % traceback.format_exc())
Example #4
    def update_proxy_log(self, proxy_addr, log_type):
        cur_date = datetime.now().strftime("%Y-%m-%d")

        self.cursor.execute(
            "select * from proxy_log where proxy_ip = %s and collect_date = %s",
            (proxy_addr, cur_date))
        if self.cursor.rowcount == 0:
            use = 0
            fail = 0
        else:
            row = self.cursor.fetchone()
            use = row['use_count']
            fail = row['fail_count']

        if log_type == "use":
            use += 1
        elif log_type == "fail":
            fail += 1
        else:
            _logger.error("unknown proxy log type: %s" % log_type)
            return

        self.cursor.execute(
            "replace into proxy_log(proxy_ip, collect_date, use_count, fail_count) values(%s, %s, %s, %s)",
            (proxy_addr, cur_date, use, fail))
        self.conn.commit()
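
The select-then-replace pair in update_proxy_log is not atomic if two workers log the same proxy at the same time; a hypothetical single-statement variant is sketched below, assuming the same self.cursor/self.conn context and a unique key over (proxy_ip, collect_date), which the replace into above already relies on.

def update_proxy_log_atomic(self, proxy_addr, log_type):
    # Hypothetical sketch: let MySQL do the upsert in one statement instead of
    # the select/replace pair above.
    if log_type not in ("use", "fail"):
        _logger.error("unknown proxy log type: %s" % log_type)
        return
    column = "use_count" if log_type == "use" else "fail_count"
    cur_date = datetime.now().strftime("%Y-%m-%d")
    self.cursor.execute(
        "insert into proxy_log(proxy_ip, collect_date, use_count, fail_count) "
        "values(%s, %s, %s, %s) "
        "on duplicate key update " + column + " = " + column + " + 1",
        (proxy_addr, cur_date,
         1 if log_type == "use" else 0,
         1 if log_type == "fail" else 0))
    self.conn.commit()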
Example #5
 def _crawl_thirdary(self, anchor):
     self.output.write('    %s\n' % anchor.text.encode('utf-8'))
     _logger.info('crawling fourth (%s)' % anchor['href'])
     try:
         self._crawl_fourth(anchor['href'])
     except Exception, err:
         _logger.error('fourth(%s) failed: %s' % (anchor['href'], err))
Example #6
    def run(self):
        while True:
            self.heartbeat(pending_input=True)
            self.agent.restart()
            tasks = self.agent.get_all_crawler_task()
            my_task = None
            for task in tasks:
                if task['id'] % self.shard_count == self.shard_id:
                    my_task = task
                    break
            if not my_task:
                _logger.debug('no task for process shard %d' % self.shard_id)
                time.sleep(10)
                continue

            self.heartbeat(pending_input=False)
            _logger.debug("Got task:%s" % (my_task))

            try:
                if my_task['ttl'] > 1:
                    self.process_hub(my_task)

                elif my_task['ttl'] == 1:
                    self.process_terminal(my_task)
            except Exception, err:
                _logger.error(
                    'unexpected exception with url(%s):%s, %s' %
                    (my_task['anchor_url'], err, traceback.format_exc()))
            finally:
Example #7
 def _wait_load(self, minutes=1):
     MIN = 60 * 1000
     try:
         self.selenium.wait_for_page_to_load(timeout=MIN * minutes)
     except Exception, err:
         _logger.error(
             'error waiting page to load(%d min), will continue:%s' %
             (minutes, err))
Example #8
 def find(self, predicate):
     if self.parent != None:
         _logger.error('can\'t be called from non-root node')
         return None
     for child in self.children:
         result = child._find_in_depth(predicate)
         if result != None:
             return result
     return None
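
find() delegates to a _find_in_depth helper that is not part of this listing; inferred only from how it is called, a hypothetical depth-first version might look like:

def _find_in_depth(self, predicate):
    # Hypothetical helper, not in the original listing: test this node first,
    # then search its subtree depth-first; return the first match or None.
    if predicate(self):
        return self
    for child in self.children:
        result = child._find_in_depth(predicate)
        if result != None:
            return result
    return None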
Example #9
def fill_account(daemon, helper, user):

    sele = daemon.selenium
    daemon.user = user
    _logger.info('start joining groups')
    try:
        daemon.grouping(force=True)
    except Exception, err:
        _logger.error('grouping failed: %s' % err)
Example #10
 def remove_statistic(self, email):
     email = email.strip()
     if email == '':
         _logger.error('email is empty')
         return
     today = date.today().strftime("%Y-%m-%d")
     self.cursor.execute(
         "delete from user_statistic where user like %s and collect_date = %s",
         ('%' + email + '%', today))
     self.conn.commit()
Example #11
 def _post_article(self, browser, post_config):
     fail = 0
     while fail < 5:
         try:
             browser.open("post-new.php", timeout=10)
             break
         except Exception, err:
             _logger.error('open submit url:(post-new.php) failed %d / 5' % (fail + 1))
             fail += 1
Example #12
    def get_random_proxy(self):
        self.cursor.execute('select * from proxy')
        if self.cursor.rowcount == 0:
            _logger.error('no proxy in DB')
            return None
        all_proxy = list(self.cursor.fetchall())
        all_proxy.append(None)  # simulate direct access as one proxy

        return random.choice(all_proxy)
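
A caller has to treat a None result from get_random_proxy as direct access (the same value an empty proxy table yields). A minimal usage sketch with urllib2 follows, assuming the proxy rows expose host:port in an 'addr' field as in Example #22; the 'agent' name is a placeholder for whatever object owns this method.

import urllib2

proxy = agent.get_random_proxy()
if proxy is None:
    opener = urllib2.build_opener()  # direct access, no proxy
else:
    opener = urllib2.build_opener(urllib2.ProxyHandler({'http': proxy['addr']}))
html = opener.open('http://example.com').read()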
Example #13
 def update_token(self, user_email, app_id, value):
     try:
         self.cursor.execute("insert into sina_token values(%s, %s, %s)",
                             (user_email, app_id, value))
         self.conn.commit()
         return True
     except Exception, err:
         _logger.error(
             "failed to update new token using insert:%s, will try update" %
             err)
Example #14
 def _login(self, browser, login_config):
     #pre-login
     fail = 0
     while fail < 5:
         try:
             browser.open(login_config['login_url'])
             break
         except Exception, err:
             _logger.error('open login page failed (%d/5)' % (fail + 1))
             fail += 1
Example #15
def parse_html(doc):
    html = doc['content']
    # Remove comments, <script>, <style>
    try:
        soup = BeautifulSoup(html)
    except Exception, err:
        _logger.error(
            'Failed to create BeautifulSoup for the document with url: ' +
            doc['url'] + '\n' + traceback.format_exc())
        return []
Example #16
 def stop_follow(self, user, followee_id):
     try:
         self.cursor.execute(
             'delete from follow_date where user_email = %s and followee_id = %s',
             (user.uname, followee_id))
         self.conn.commit()
     except Exception, err:
         _logger.error(
             'failed deleting follow date, user:(%s), followee_id:(%s), error:(%s)'
             % (user.uname, followee_id, err))
Example #17
 def read_one_blog(self, rawblog):
     blog = _BlogPost()
     blog.url = rawblog['url']
     blog.paragraphs = filter(self._is_valid_paragraph, parse_html(rawblog))
     if self.parse_blog_meta(rawblog['content'], blog) != True:
         _logger.error('parse blog meta failed, url:%s' % blog.url)
         return None
     #print ('\n\n' + paragraph_sep + '\n\n') .join(blog.contents).encode('utf-8')
     #print '\n\n' + doc_sep + '\n\n'
     return blog
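
The _is_valid_paragraph predicate is not included in this listing; assuming parse_html yields plain strings, a hypothetical minimal version could simply drop very short paragraphs:

def _is_valid_paragraph(self, paragraph):
    # Hypothetical filter (the real criterion is not shown): keep paragraphs
    # that still contain a reasonable amount of text after stripping whitespace.
    return len(paragraph.strip()) > 20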
Example #18
 def safe_execute(self, *argv, **kwargv):
     while True:
         try:
             ret = self.cursor.old_execute(*argv, **kwargv)
             return ret
         except MySQLdb.OperationalError, err:
             if err[0] == 2006:
                 _logger.error('MySQL has gone away, will restart agent')
                 self.restart()
             else:
                 return None
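
How old_execute gets installed is not shown anywhere in these examples; one plausible wiring, purely a hypothetical sketch, is to alias the cursor's original execute and route calls through safe_execute:

# Hypothetical setup, not part of the listing: keep the original execute as
# old_execute and send every call through safe_execute (re-done after restart()).
self.cursor.old_execute = self.cursor.execute
self.cursor.execute = self.safe_execute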
Example #19
    def process_terminal(self, task):
        anchor_text = task['anchor_text']
        anchor_url = task['anchor_url']
        _logger.info('processing terminal link, url:%s' % anchor_url)

        tweet = None
        try:
            tweet = try_crawl_href(anchor_url, anchor_text, task['encoding'],
                                   self.agent, self.sele)
        except Exception, err:
            _logger.error('crawl href failed: %s, %s' %
                          (err, traceback.format_exc()))
Example #20
    def process_hub(self, task):
        url = task['anchor_url']
        _logger.info('processing hub page, url:%s' % url)
        last_crawl = self.agent.get_crawl_history(url)
        now = datetime.now()
        if (now - last_crawl).days <= 3:
            _logger.debug('ignore, recently crawled: %s' % str(last_crawl))
            return

        domain = task['domain']
        encoding = task['encoding']
        links = pbrowser.get_all_href(url, encoding)
        _logger.debug("got %d links" % (len(links)))

        for idx, link in enumerate(links):
            if urlparse(
                    link['href'].encode('utf-8')).netloc.find(domain) == -1:
                _logger.debug('ignore (%s), different from domain (%s)' %
                              (link['href'].encode('utf-8'), domain))
                continue

            # make temporary source
            cur_url = link['href'].encode('utf-8').lower()
            cur_text = link.text.encode('utf-8').strip()

            if crawled_as_hub(self.agent, cur_url, day_limit=3):
                _logger.debug('ignore, recently(3 days) crawled as hub: %s' %
                              (cur_url))
                continue

            if crawled_as_terminal(self.agent, cur_url, cur_text,
                                   day_limit=30):
                _logger.debug(
                    'ignore, recently(30 days) crawled as terminal: %s' %
                    (cur_url))
                continue

            if in_task_queue(self.agent, cur_url, cur_text):
                _logger.debug('ignore, already added to task queue: %s' %
                              (cur_url))
                continue

            ttl = task['ttl'] - 1
            try:
                self.agent.add_crawler_task(anchor_url=cur_url,
                                            anchor_text=cur_text,
                                            encoding=encoding,
                                            domain=domain,
                                            ttl=ttl)
                _logger.debug('%s added to task in DB' % cur_url)
            except Exception, err:
                _logger.error('failed to add crawler task, url:(%s), %s' %
                              (cur_url, err))
Example #21
 def shutdown(self):
     self.agent.stop()
     if hasattr(self, 'workers'):
         for worker in self.workers:
             pid = worker.pid
             try:
                 self.kill_worker(worker)
                 _logger.info('child process %d killed' % pid)
             except Exception, err:
                 _logger.error(
                     'failed to kill child pid:%d, %s, it will become an orphan'
                     % (pid, err))
Example #22
def pick_proxy_for_slot(agent, slot_id, all_proxy):
    proxies = [proxy for proxy in all_proxy if proxy['slot_id'] == None]
    if len(proxies) == 0:
        _logger.error("No free proxy for slot %d" % slot_id)
        return
    for proxy in proxies:
        if not bad_proxy(proxy):
            _logger.debug("got healthy proxy at %s" % proxy['addr'])
            agent.update_proxy_slot(slot_id, proxy)
            proxy['slot_id'] = slot_id
            return

    _logger.error("Can't find any decent proxy for slot %d" % slot_id)
Example #23
 def _crawl_secondary(self, div):
     tb = div
     self.output.write('  %s\n' % div.text.encode('utf-8'))
     while not hasattr(tb, 'name') or tb.name != u"table":
         tb = tb.nextSibling
     for third in tb.findAll('a'):
         _logger.info('crawling thirdary (%s)' % third.text)
         try:
             self._crawl_thirdary(third)
         except Exception, err:
             _logger.error(
                 'third(%s) failed: %s\n%s' %
                 (third.text.encode('utf-8'), err, traceback.format_exc()))
Example #24
    def __init__(self, thread=3):
        self._success_count = 0
        self._attempt_count = 0

        # Load comments
        with open('blog-comments') as comm:
            self.comment_seg = filter(lambda c: len(c) > 5,
                                      map(str.strip,
                                          comm.read().split('#')))
        if len(self.comment_seg) < 5:
            _logger.error('%d comments found, too small' %
                          len(self.comment_seg))
        self.thread_pool = xthreading.ThreadPool(maxThreads=thread)
Example #25
 def crawl_authors(self, authors, callback):
     for author in authors:
         cur_url = author
         _logger.info("crawling author from %s" % cur_url)
         try:
             self.crawl_one_author(cur_url, callback)
             _logger.debug('sleeping for 5 sec')
             time.sleep(5)
         except Exception, err:
             _logger.error(
                 "crawl one author failed, url:(%s), error:%s, %s" %
                 (cur_url, err, traceback.format_exc()))
             continue
Example #26
    def post_blog(self, browser, post_config):
        self._wp_base_url = post_config['base-url']
        self._wp_config['login']['username'] = post_config['username']
        self._wp_config['login']['password'] = post_config['password']
        self._wp_config['login']['login_url'] = self._wp_base_url + 'wp-login.php'
        if not self._login(browser, self._wp_config['login']):
            _logger.error('login failed (%s:%s), url:(%s)' %
                          (post_config['username'], post_config['password'],
                           self._wp_config['login']['login_url']))
            return False, ''

        self._sleep('after_login')

        self._wp_config['post']['title'] = post_config['title']
        self._wp_config['post']['content'] = post_config['content']
        return self._post_article(browser, self._wp_config['post'])
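
For reference, the keys post_blog reads from post_config suggest a call shaped roughly like the sketch below; the poster and browser names and all dict values are placeholders, and unpacking two return values mirrors the return False, '' failure path.

# Hypothetical invocation; the keys match those read in post_blog above.
post_config = {
    'base-url': 'http://example.com/blog/',  # 'wp-login.php' is appended directly
    'username': 'author',
    'password': 'secret',
    'title': 'Hello world',
    'content': '<p>Post body</p>',
}
success, post_url = poster.post_blog(browser, post_config)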
Example #27
    def crawl_second(self, url):
        self._randsleep()
        _logger.debug('opening url:%s' % url)
        html = self.br.open(url).read()
        soup = BeautifulSoup(util.convert_to_utf8(html, "gb2312"))        

        for anchor in soup.findAll('a'):
            try:
                href = anchor['href']
                # Ignore internal links
                if href[:4] != "http" or href.find('hao123.com') != -1:
                    continue
                self.output.write('  %s %s\n' % (href.encode('utf8'), anchor.text.encode('utf8')))
            except Exception, err:
                _logger.error('got error with anchor(%s): %s' % (str(anchor), err))
Example #28
    def fetch_source(self, source):
        _logger.debug("pool stat: %d working %d waiting" %
                      (self.pool.running(), self.pool.waiting()))
        _logger.info("crawling source id=%d url=%s" %
                     (source['id'], source['url']))

        cur_time = int(time.time())
        last_crawl_time = source['last_crawl_time']
        if cur_time - last_crawl_time < HOUR:
            _logger.info("ignore source(%s), last crawled %d minutes ago" %
                         (source['url'], (cur_time - last_crawl_time) / 60))
            return

        try:
            _logger.debug("fetching feed from (%s)" % source['url'])
            p = feedparser.parse(source['url'])
            _logger.debug("fetched from (%s)" % source['url'])
            if p.feed.has_key('updated_parsed') and p.feed.updated_parsed != None:
                cur_feed_time = int(time.mktime(p.feed.updated_parsed))
            else:
                # FeedParser doesn't understand the 'updated' field of this feed,
                # neither can we. Probably some CJK chars.
                cur_feed_time = int(time.time())
            db_feed_time = source['last_feed_time']
            if db_feed_time >= cur_feed_time:
                _logger.info(
                    "ignore source(%s), no new feed. Last feed:%s, cur feed:%s"
                    % (source['url'], datetime.fromtimestamp(db_feed_time),
                       datetime.fromtimestamp(cur_feed_time)))
                self.agent.update_source_time(source)
            else:
                _logger.info("processing %d entries from %s" %
                             (len(p.entries), source['url']))
                for entry in p.entries:
                    self.process_entry(entry, source)
                self.agent.update_source_time(source, cur_feed_time)
                _logger.debug(
                    "source(%s) updated: %s" %
                    (source['url'], datetime.fromtimestamp(cur_feed_time)))

            _logger.info("source(id=%d) success" % source['id'])
            _logger.debug("pool stat: %d working %d waiting" %
                          (self.pool.running(), self.pool.waiting()))
        except Exception, err:
            _logger.error(
                "crawling faild for source id=%d, %s: %s" %
                (source['id'], source['url'], traceback.format_exc()))
Example #29
    def _crawl_primary(self, anchor):
        self.output.write(anchor.text.encode('utf-8') + '\n')
        self._randsleep()
        html = self.br.open(anchor['href']).read()
        html = util.convert_to_utf8(html, 'gb2312')
        soup = BeautifulSoup(html)

        seconds = soup.findAll('div', 'dirtit')
        for second in seconds:
            _logger.info('crawling secondary category: (%s)' %
                         second.text.encode('utf-8'))
            try:
                self._crawl_secondary(second)
            except Exception, err:
                _logger.error('secondary(%s) failed: %s' %
                              (second.text.encode('utf-8'), err))
Example #30
class FeedCrawler(object):
    def __init__(self, agent, pool):
        self.agent = agent
        self.pool = pool

    def grab_image(self, html, entry):

        wee_url = entry.link.encode('utf-8')
        soup = BeautifulSoup(html, fromEncoding="utf-8")
        img = soup.find('img', src=True)
        if img == None:
            _logger.debug("%s has no image inside" % wee_url)
            return
        url = img['src']

        _logger.debug('downloading image from %s' % url)
        try:
            br = pbrowser.get_browser()
            image = br.download_image(url, base_url=wee_url).read()
        except Exception, err:
            _logger.error("downloading image failed(%s), baseurl(%s): %s" %
                          (url, wee_url, traceback.format_exc()))
            return

        try:
            self.agent.add_wee_image(wee_url, image)
            _logger.debug("imaged added for wee:%s" % wee_url)
        except Exception, err:
            _logger.error("db error, failed to add image for wee %s: %s" %
                          (wee_url, err))