Example #1
class ImpMakerParser(Parser):
    def __init__(self, opener=None, url=None, **kw):
        super(ImpMakerParser, self).__init__(opener=opener, url=url, **kw)
        self.logger = kw.get('logger')

    def parse(self, url=None):
        url = url or self.url
        if 'click' not in url:
            times = random.randrange(2, 5)
        else:
            times = 1
        i = 0

        self.opener = MechanizeOpener(user_agent=random_user_agent())
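        # non-click URLs are always fetched (2-4 times); "click" URLs only about
        # 5% of the time; a proxy is attached whenever one is available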
        odds = random.randint(0, 100)
        if 'click' not in url or odds <= 5:
            # add proxy
            p_ = get_ip_proxy()
            if p_:
                self.opener.remove_proxy()
                self.opener.add_proxy(p_)
            while i < times:
                html = self.opener.open(url)
                #print(html)
                i = i + 1
                time.sleep(.1)

        return url
Example #2
    def testMechanizeOpener(self):
        test_url = "http://www.baidu.com"
        opener = MechanizeOpener()

        assert "baidu" in opener.open(test_url)

        br = opener.browse_open(test_url)
        assert "百度" in br.title()
        assert "baidu" in br.response().read()
Example #3
 def testMechanizeOpener(self):
     test_url = 'http://www.baidu.com'
     opener = MechanizeOpener()
      
     assert 'baidu' in opener.open(test_url)
      
     br = opener.browse_open(test_url)
     assert u'百度' in br.title()
     assert 'baidu' in br.response().read()
Example #4
    def __init__(self, opener=None, url=None, **kw):
        super(WikiParser, self).__init__(opener=opener, url=url, **kw)

        if self.opener is None:
            self.opener = MechanizeOpener()
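        # regexes for stripping HTML comments and matching the English and
        # Chinese "last modified" timestamps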
        self.html_comment_reg = re.compile(r'<!--[^-]+-->', re.DOTALL)
        self.en_time_reg = re.compile(
            r'\d{1,2} [A-Z][a-z]{2,} \d{4} at \d{1,2}:\d{1,2}')
        self.zh_time_reg = re.compile(
            ur'\d{4}年\d{1,2}月\d{1,2}日 \(.+\) \d{1,2}:\d{1,2}')
Example #5
    def __init__(self, opener=None, url=None, bundle=None, **kwargs):
        super(DoubanMovieParser, self).__init__(opener=opener,
                                                url=url,
                                                **kwargs)
        if self.opener is None:
            self.opener = MechanizeOpener()

        self.url = url

        self.opener.set_default_timeout(TIMEOUT)

        if not hasattr(self, 'logger') or self.logger is None:
            self.logger = get_logger(name='douban_parser')
Example #6
 def __init__(self, opener=None, url=None, **kw):
     super(WikiParser, self).__init__(opener=opener, url=url, **kw)
     
     if self.opener is None:
         self.opener = MechanizeOpener()
     self.html_comment_reg = re.compile(r'<!--[^-]+-->', re.DOTALL)
     self.en_time_reg = re.compile(r'\d{1,2} [A-Z][a-z]{2,} \d{4} at \d{1,2}:\d{1,2}')
     self.zh_time_reg = re.compile(ur'\d{4}年\d{1,2}月\d{1,2}日 \(.+\) \d{1,2}:\d{1,2}')
Example #7
 def refresh_cookies(self, ck_dir):
     """
         Refresh the cookie store: wipe all saved cookie files and rebuild
         them by logging in each account listed in the files under ck_dir.
     """
     idx = 0
     # del all cookies
     if os.path.isdir(cookie_dir):
         shutil.rmtree(cookie_dir)
     os.mkdir(cookie_dir)
     # add cookie from folder
     accounts = []
     for root, dirs, files in os.walk(ck_dir):
         for filespath in files:
             full_name = os.path.join(root, filespath)
             with open(full_name) as f:
                 for line in f.readlines():
                     if line:
                         u, p = line.split('\t')
                         if u and p:
                             accounts.append((u.strip(), p.strip()))
     # save cookie
     for u, p in accounts:
         opener = MechanizeOpener(
             user_agent=
             'Baiduspider+(+http://www.baidu.com/search/spider.htm)',
             timeout=10)
         opener.browser.set_proxies({'http': get_ip_proxy(size=10)})
         lm = WeiboLogin(opener, u, p)
         try:
             status = lm.login()
         except Exception as ex:
             self.logger.warn("login error:%s" % u)
             self.logger.error(ex)
             continue
         if status:
             idx += 1
             opener.cj.save(os.path.join(cookie_dir, '%d.txt' % idx),
                            ignore_discard=True,
                            ignore_expires=True)
             self.validated.append("%s\t%s\r\n" % (u, p))
         opener.close()
Example #8
    def setUp(self):
        self.test_uid = '1784725941'
        self.bundle = WeiboUserBundle(self.test_uid)
        self.opener = MechanizeOpener()

        self.conn = Connection()
        self.db = self.conn[getattr(user_config.job, 'db')]
        self.collection = self.db.weibo_user

        assert len(user_config.job['login']) > 0

        login_hook(self.opener, **user_config.job['login'][0])
Example #9
    def setUp(self):
        self.test_uid = '1667486960'
        self.bundle = WeiboUserBundle(self.test_uid)
        self.opener = MechanizeOpener()

        self.conn = MongoClient()
        self.db = self.conn[getattr(user_config.job, 'db')]
        self.users_collection = self.db.weibo_user
        self.weibos_collection = self.db.micro_blog

        #assert len(user_config.job['login']) > 0

        login_hook(self.opener, **user_config.job['login'][0])
Example #10
from cola.core.opener import MechanizeOpener
import re


for i in range(100):
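    # open() returns the page HTML as a string; grab the first <input> tag from it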
    html = MechanizeOpener().open('https://google.com')
    mainform = re.search('<input .+?>', html).group()

    print(mainform)
Example #11
class WikiParser(Parser):
    def __init__(self, opener=None, url=None, **kw):
        super(WikiParser, self).__init__(opener=opener, url=url, **kw)

        if self.opener is None:
            self.opener = MechanizeOpener()
        self.html_comment_reg = re.compile(r"<!--[^-]+-->", re.DOTALL)
        self.en_time_reg = re.compile(r"\d{1,2} [A-Z][a-z]{2,} \d{4} at \d{1,2}:\d{1,2}")
        self.zh_time_reg = re.compile(ur"\d{4}年\d{1,2}月\d{1,2}日 \(.+\) \d{1,2}:\d{1,2}")

    def store(self, title, content, last_update):
        try:
            doc = WikiDocument.objects.get(title=title)
            if last_update > doc.last_update:
                doc.content = content
                doc.last_update = last_update
                doc.update(upsert=True)
        except DoesNotExist:
            doc = WikiDocument(title=title, content=content, last_update=last_update)
            doc.save()

    def _extract(self, soup):
        if soup.head is None:
            return None, None, None

        title = soup.head.title.text
        if "-" in title:
            title = title.split("-")[0].strip()
        content = soup.find("div", attrs={"id": "mw-content-text", "class": "mw-content-ltr"})
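        # drop embedded tables (infoboxes, navigation boxes) so only article text remains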
        while content.table is not None:
            content.table.extract()
        content = content.text

        last_update_str = soup.find("li", attrs={"id": "footer-info-lastmod"}).text
        last_update = None
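        # try the English timestamp first, then the Chinese one; fall back to the current time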
        match_en_time = self.en_time_reg.search(last_update_str)
        if match_en_time:
            last_update = match_en_time.group()
            last_update = parse(last_update)
        match_zh_time = self.zh_time_reg.search(last_update_str)
        if match_zh_time:
            last_update = match_zh_time.group()
            last_update = re.sub(r"\([^\)]+\)\s", "", last_update)
            last_update = last_update.replace(u"年", "-").replace(u"月", "-").replace(u"日", "")
            last_update = parse(last_update)
        if last_update is None:
            last_update = datetime.now()

        return title, content, last_update

    def parse(self, url=None):
        url = url or self.url

        lang = url.strip("http://").split(".", 1)[0]

        br = self.opener.browse_open(url)
        html = br.response().read()
        html = self.html_comment_reg.sub("", html)
        soup = BeautifulSoup(html)

        title, content, last_update = self._extract(soup)
        if title is None:
            return
        title = title + " " + lang
        self.store(title, content, last_update)

        def _is_same(out_url):
            return out_url.rsplit("#", 1)[0] == url

        for link in br.links():
            if link.url.startswith("http://"):
                out_url = link.url
                if not _is_same(out_url):
                    yield out_url
            else:
                out_url = urlparse.urljoin(link.base_url, link.url)
                if not _is_same(out_url):
                    yield out_url
Example #12
class WikiParser(Parser):
    def __init__(self, opener=None, url=None, **kw):
        super(WikiParser, self).__init__(opener=opener, url=url, **kw)
        
        if self.opener is None:
            self.opener = MechanizeOpener()
        self.html_comment_reg = re.compile(r'<!--[^-]+-->', re.DOTALL)
        self.en_time_reg = re.compile(r'\d{1,2} [A-Z][a-z]{2,} \d{4} at \d{1,2}:\d{1,2}')
        self.zh_time_reg = re.compile(ur'\d{4}年\d{1,2}月\d{1,2}日 \(.+\) \d{1,2}:\d{1,2}')
        
    def store(self, title, content, last_update):
        try:
            doc = WikiDocument.objects.get(title=title)
            if last_update > doc.last_update:
                doc.content = content
                doc.last_update = last_update
                doc.update(upsert=True)
        except DoesNotExist:
            doc = WikiDocument(title=title, content=content, last_update=last_update)
            doc.save()
            
    def _extract(self, soup):
        if soup.head is None:
            return None, None, None
        
        title = soup.head.title.text
        if '-' in title:
            title = title.split('-')[0].strip()
        content = soup.find('div', attrs={'id': 'mw-content-text', 'class': 'mw-content-ltr'})
        while content.table is not None:
            content.table.extract()
        content = content.text
        
        last_update_str = soup.find('li', attrs={'id': 'footer-info-lastmod'}).text
        last_update = None
        match_en_time = self.en_time_reg.search(last_update_str)
        if match_en_time:
            last_update = match_en_time.group()
            last_update = parse(last_update)
        match_zh_time = self.zh_time_reg.search(last_update_str)
        if match_zh_time:
            last_update = match_zh_time.group()
            last_update = re.sub(r'\([^\)]+\)\s', '', last_update)
            last_update = last_update.replace(u'年', '-').replace(u'月', '-').replace(u'日', '')
            last_update = parse(last_update)
        if last_update is None:
            last_update = datetime.now()
        
        return title, content, last_update
    
    def parse(self, url=None):
        url = url or self.url
        
        lang = url.strip('http://').split('.', 1)[0]
        
        br = self.opener.browse_open(url)
        html = br.response().read()
        html = self.html_comment_reg.sub('', html)
        soup = BeautifulSoup(html)
        
        title, content, last_update = self._extract(soup)
        if title is None:
            return []
        title = title + ' ' + lang
        self.store(title, content, last_update)
        
        def _is_same(out_url):
            return out_url.rsplit('#', 1)[0] == url
        
        links = []
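        # collect absolute out-links, resolving relative ones against the page URL
        # and skipping links that differ from it only by fragment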
        for link in br.links():
            if link.url.startswith('http://'):
                out_url = link.url
                if not _is_same(out_url):
                    links.append(out_url)
            else:
                out_url = urlparse.urljoin(link.base_url, link.url)
                if not _is_same(out_url):
                    links.append(out_url)
        return links
Example #13
class DoubanMovieParser(Parser):
    def __init__(self, opener=None, url=None, bundle=None, **kwargs):
        super(DoubanMovieParser, self).__init__(opener=opener,
                                                url=url,
                                                **kwargs)
        if self.opener is None:
            self.opener = MechanizeOpener()

        self.url = url

        self.opener.set_default_timeout(TIMEOUT)

        if not hasattr(self, 'logger') or self.logger is None:
            self.logger = get_logger(name='douban_parser')

    def get_subject_id(self, url):
        """
            extract subject id from url
        """
        id_arr = re.findall('https://movie.douban.com/subject/(\d+)', url)
        if id_arr:
            return id_arr[0]

    def _check_url(self, dest_url, src_url):
        """
            check whether url are same domain path
        """
        return dest_url.split('?')[0] == src_url.split('?')[0]

    def check(self, url, br):
        dest_url = br.geturl()
        if not self._check_url(dest_url, url):
            if dest_url.startswith('http://douban.com/login.php'):
                raise DoubanLoginFailure('Douban not login or login expired')
        return True

    def get_movie_subject(self, sid):
        try:
            movie = getattr(DoubanMovie, 'objects').get(sid=sid)
        except DoesNotExist:
            movie = DoubanMovie(sid=sid)
            movie.save()
        return movie

    def parse(self, url=None):

        url = url or self.url
        sid = self.get_subject_id(url)
        movie = self.get_movie_subject(sid)
        print(datetime.utcnow())
        # if entry has updated in latest 24 hours, skip this url
        if movie.last_update and abs(
            (datetime.utcnow() - movie.last_update).days) > 1:
            self.logger.warn('Skip visited url: %s' % url)
            return

        self.logger.debug('proxy:{}'.format(self.opener.proxies))

        try:
            br = self.opener.browse_open(url)
        except URLError:
            raise FetchBannedError()

        if not self.check(url, br):
            return
        html = br.response().read()

        if html is None:
            raise FetchBannedError()

        soup = beautiful_soup(html)
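        # pages that list an episode count ("集数") are TV series, everything else is a movie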

        if re.compile('<span class="pl">集数:</span>').findall(html):
            subtype = 't'
        else:
            subtype = 'm'
        try:
            title = soup.select(
                "span[property='v:itemreviewed']")[0].text.strip()
        except Exception:
            raise FetchBannedError()

        year_tags = soup.select("div#content > h1 span.year")
        if year_tags:
            year = year_tags[0].text[1:-1]
        else:
            year = None
        # self.logger.debug(title)

        summary_tags = soup.select("span[property='v:summary']")
        summary = summary_tags[0].text.strip() if summary_tags else ''

        # tags
        tag_tags = soup.select('div .tags-body a')
        tags = [t.text for t in tag_tags]

        # get directors
        director_tags = soup.select('div #info > span a[rel="v:directedBy"]')
        p1 = re.compile(r'<[^>]+>(?P<director>[^<]+)</a>')
        directors = [p1.match(str(t)).group('director') for t in director_tags]

        # get stars
        star_tags = soup.select('div #info > span a[rel="v:starring"]')
        p2 = re.compile(r'<[^>]+>(?P<star>[^<]+)</a>')
        casts = [p2.match(str(t)).group('star') for t in star_tags]

        # get writers
        writers_tags = soup.select('div #info > span')[1].select('a')
        p2 = re.compile(r'<[^>]+>(?P<writer>[^<]+)</a>')
        writers = [p2.match(str(t)).group('writer') for t in writers_tags]

        # get genre
        genre_tags = soup.select('div #info > span[property="v:genre"]')
        p3 = re.compile(r'<span property="v:genre">(?P<genre>[^<]+)</span>')
        genres = [p3.match(str(t)).group('genre') for t in genre_tags]

        # get release date
        pubdate_tag = soup.select(
            'div #info > span[property="v:initialReleaseDate"]')
        f4 = 0
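        # prefer the mainland China / Hong Kong release date, otherwise take the first bare date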
        if pubdate_tag:
            p41 = re.compile(
                r'<[^>]+>(?P<pubdate>[^(]+)[(]中国大陆([ ]3D)*[)]<[^>]+>')
            p42 = re.compile(
                r'<[^>]+>(?P<pubdate>[^(]+)[(]中国内地([ ]3D)*[)]<[^>]+>')
            p43 = re.compile(
                r'<[^>]+>(?P<pubdate>[^(]+)[(]香港([ ]3D)*[)]<[^>]+>')
            p44 = re.compile(r'[0-9-]+')
            for t in pubdate_tag:
                m = p41.search(str(t))
                if m is not None:
                    f4 = 1
                    pubdate = m.group('pubdate')
                    break
                m = p42.search(str(t))
                if m is not None:
                    f4 = 1
                    pubdate = m.group('pubdate')
                    break
                m = p43.search(str(t))
                if m is not None:
                    f4 = 1
                    pubdate = m.group('pubdate')
                    break
                m = p44.search(str(t))
                if m is not None:
                    f4 = 1
                    pubdate = m.group()
                    break
        if f4 == 0:
            self.logger.critical('{0} has no pubdate'.format(sid))
            pubdate = year
        # append month/date if just year is known
        if len(pubdate) == 4:
            pubdate = pubdate + "-6-30"
        elif len(pubdate) == 7:
            pubdate = pubdate + "-15"
        pubdate = datetime.strptime(pubdate, '%Y-%m-%d')
        if not year:
            year = pubdate.strftime('%Y')
        # get wishes
        wishes_tags = soup.select(
            'div #subject-others-interests > .subject-others-interests-ft > a')
        #print wishes_tags
        if len(wishes_tags) == 0:
            self.logger.critical('{0} has no wish count'.format(sid))
        wish_count = None
        collect_count = None
        for i in range(len(wishes_tags)):
            m = re.match(u'(?P<wishes>[0-9]+)人想看', wishes_tags[i].text)
            if m:
                wish_count = m.group('wishes')
                continue

            m = re.match(u'(?P<collections>[0-9]+)人看过', wishes_tags[i].text)
            if m:
                collect_count = m.group('collections')

        rating_num = soup.select(r'strong.rating_num')[0].text
        if not rating_num:
            rating_num = None
        rating_lvls = soup.select(r'div.ratings-on-weight span.rating_per')
        if rating_lvls:
            rating_lvls = [float(r.text[:-1]) for r in rating_lvls]

        # season
        season_tags = soup.select('div #info select#season')
        if season_tags:
            movie.seasons_count = len(season_tags)
            movie.current_season = season_tags[0].select(
                'option[selected]')[0].text
        photo_url = soup.select('a[class="nbgnbg"] img')[0].attrs['src']

        #region save movie
        def parseNumber(v):
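            # extract a leading integer (duration, episode count); fall back to Chinese numerals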
            m = re.findall('(\d+).*', v)
            if m:
                return int(m[0])
            else:
                # parse chinese
                return convert(v.strip())

        info_map = {
            u'制片国家/地区': {
                'field': 'countries'
            },
            u'语言': {
                'field': 'languages'
            },
            u'集数': {
                'field': 'episodes_count',
                'func': parseNumber
            },
            u'单集片长': {
                'field': 'duration',
                'func': parseNumber
            },
            u'片长': {
                'field': 'duration',
                'func': parseNumber
            },
            u'又名': {
                'field': 'aka',
                'func': lambda v: v.split('/')
            },
            u'IMDb链接': {
                'field': 'imdb_id'
            }
        }

        info_str = soup.select('div #info')[0].text
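        # copy each labelled value from the "info" block onto the matching movie field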
        for k, f in info_map.items():
            v = re.findall(k + "\:(.*)", info_str, re.MULTILINE)
            if v:
                func = (lambda s: s.strip()) \
                            if 'func' not in f \
                            else f['func']
                f_val = func(v[0].strip())
                setattr(movie, f['field'], f_val)
        movie.sid = sid
        movie.title = title
        movie.photo_alt = photo_url
        movie.year = year
        movie.summary = summary
        movie.tags = tags
        movie.subtype = subtype
        movie.directors = directors
        movie.casts = casts
        movie.writers = writers
        if rating_num:
            movie.rating = float(rating_num)
        if rating_lvls:
            movie.high_rating_pct = rating_lvls[0] + rating_lvls[1]
            movie.low_rating_pct = rating_lvls[3] + rating_lvls[4]
        if wish_count:
            movie.wish_count = wish_count
        if collect_count:
            movie.collect_count = collect_count
        movie.pubdate = pubdate
        movie.genres = genres
        movie.alt = url
        movie.last_update = datetime.now()
        movie.save()

        def _is_same(out_url, url):
            return out_url.rsplit('#', 1)[0] == url

        next_urls = soup.select("div.recommendations-bd a")
        for link in next_urls:
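            # only yield other subject pages, skipping links back to the current subject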
            out_url = link.attrs['href']

            if not _is_same(out_url, url) and out_url.startswith(
                    "https://movie.douban.com/subject"):
                sid_next = self.get_subject_id(out_url)
                if sid_next != sid:
                    yield out_url
Example #14
class WikiParser(Parser):
    def __init__(self, opener=None, url=None, **kw):
        super(WikiParser, self).__init__(opener=opener, url=url, **kw)

        if self.opener is None:
            self.opener = MechanizeOpener()
        self.html_comment_reg = re.compile(r'<!--[^-]+-->', re.DOTALL)
        self.en_time_reg = re.compile(
            r'\d{1,2} [A-Z][a-z]{2,} \d{4} at \d{1,2}:\d{1,2}')
        self.zh_time_reg = re.compile(
            ur'\d{4}年\d{1,2}月\d{1,2}日 \(.+\) \d{1,2}:\d{1,2}')

    def store(self, title, content, last_update):
        try:
            doc = WikiDocument.objects.get(title=title)
            if last_update > doc.last_update:
                doc.content = content
                doc.last_update = last_update
                doc.update(upsert=True)
        except DoesNotExist:
            doc = WikiDocument(title=title,
                               content=content,
                               last_update=last_update)
            doc.save()

    def _extract(self, soup):
        if soup.head is None:
            return None, None, None

        title = soup.head.title.text
        if '-' in title:
            title = title.split('-')[0].strip()
        content = soup.find('div',
                            attrs={
                                'id': 'mw-content-text',
                                'class': 'mw-content-ltr'
                            })
        while content.table is not None:
            content.table.extract()
        content = content.text

        last_update_str = soup.find('li', attrs={
            'id': 'footer-info-lastmod'
        }).text
        last_update = None
        match_en_time = self.en_time_reg.search(last_update_str)
        if match_en_time:
            last_update = match_en_time.group()
            last_update = parse(last_update)
        match_zh_time = self.zh_time_reg.search(last_update_str)
        if match_zh_time:
            last_update = match_zh_time.group()
            last_update = re.sub(r'\([^\)]+\)\s', '', last_update)
            last_update = last_update.replace(u'年', '-').replace(
                u'月', '-').replace(u'日', '')
            last_update = parse(last_update)
        if last_update is None:
            last_update = datetime.now()

        return title, content, last_update

    def parse(self, url=None):
        url = url or self.url

        lang = url.strip('http://').split('.', 1)[0]

        br = self.opener.browse_open(url)
        html = br.response().read()
        html = self.html_comment_reg.sub('', html)
        soup = BeautifulSoup(html)

        title, content, last_update = self._extract(soup)
        if not title:
            return
        title = title + ' ' + lang
        self.store(title, content, last_update)

        def _is_same(out_url, url):
            return out_url.rsplit('#', 1)[0] == url

        for link in br.links():
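            # keep absolute http/https links; resolve scheme-less links against the page URL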
            q = urlparse.urlparse(link.url)
            if q.scheme in ['http', 'https']:
                out_url = link.url
                if not _is_same(out_url, url):
                    yield out_url
            elif not q.scheme:
                out_url = urlparse.urljoin(link.base_url, link.url)
                if not _is_same(out_url, url):
                    yield out_url
Example #15
 def setUp(self):
     self.base_url = 'http://zhidao.baidu.com'
     self.url = 'http://zhidao.baidu.com/question/559110619.html'
     self.html = MechanizeOpener().open(self.url)
Example #16
import os
import urllib2
import re
import urlparse
from bs4 import BeautifulSoup
from cola.core.opener import MechanizeOpener

url = 'http://commons.wikimedia.org/wiki/File:Aerial_View_of_Trout_Lake.JPG'
#url = 'http://commons.wikimedia.org/wiki/File:Capturing_the_rain_water_falling_from_roof.jpg'
br = MechanizeOpener().browse_open(url)
html = br.response().read()
#print html
soup = BeautifulSoup(html)
def saveImg(picurl):
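	# download picurl into local_path, naming the file after the last URL path segment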
	local_path = '/data/test/'
	names = picurl.split('/')
	picname = names[-1]
	print picname
	#name = re.match(pattern,picurl)
	#print name
	print 'downloading', picurl
	#filename = local_path +  name.group()
	filename = local_path +  picname
	print filename
#print picurl
	try:
		response = urllib2.urlopen(picurl,timeout=10)
		cont = response.read()
	except urllib2.URLError as e:
		print e.reason
#	cont = MechanizeOpener().browse_open(picurl).read()