Example #1
0
 def testUrlPatterns(self):
     """Check UrlPatterns matching and parser lookup, and that Url rejects a File: page."""
     # The first class, [^FILE], rejects any title whose first character is
     # one of F, I, L or E.
     wiki_regex = r'^http://zh.wikipedia.org/wiki/[^FILE][^/]+$'
     item_url = 'http://zh.wikipedia.org/wiki/%E6%97%A0%E6%95%8C%E8%88%B0%E9%98%9F'
     file_url = 'http://zh.wikipedia.org/wiki/File:Flag_of_Cross_of_Burgundy.svg'

     url_patterns = UrlPatterns(Url(wiki_regex, 'wiki_item', FakeParser))
     urls = [item_url]

     # The URL list doubles as the failure message of the first assertion.
     matched = list(url_patterns.matches(urls))
     self.assertTrue(matched, urls)
     self.assertEqual(url_patterns.get_parser(item_url), FakeParser)

     # A stand-alone Url (no name, no parser) must not match the File: page.
     file_pattern = Url(wiki_regex, None, None)
     self.assertFalse(file_pattern.match(file_url))
Example #2
0
    def setUp(self):
        """Prepare a fake wiki crawling job with its message queue and RPC server.

        Creates a temporary directory (``self.dir``), a ``Job`` seeded with one
        zh.wikipedia.org URL (``self.job``), a ``ColaRPCServer`` on the job's
        configured port (``self.rpc_server``) and a ``JobLoader`` whose message
        queue is initialised against that server (``self.loader``).  The RPC
        server's ``serve_forever`` is then run in a background daemon thread.
        """
        url_patterns = UrlPatterns(
            Url(r'^http://zh.wikipedia.org/wiki/[^(:|/)]+$', 'wiki_item',
                FakeWikiParser))
        fake_user_conf = Config(StringIO(user_conf))

        self.dir = tempfile.mkdtemp()

        self.job = Job(
            'fake wiki crawler',
            url_patterns,
            MechanizeOpener, [
                'http://zh.wikipedia.org/wiki/%E6%97%A0%E6%95%8C%E8%88%B0%E9%98%9F',
            ],
            user_conf=fake_user_conf)

        # The single local node is both the complete node list and this node.
        port = self.job.context.job.port
        local_node = 'localhost:%s' % port
        nodes = [local_node]

        self.rpc_server = ColaRPCServer(('localhost', port))
        self.loader = JobLoader(self.job)
        self.loader.init_mq(self.rpc_server, nodes, local_node, self.dir)

        # Set the ``daemon`` attribute directly: Thread.setDaemon() is a
        # deprecated alias for it.
        server_thread = threading.Thread(target=self.rpc_server.serve_forever)
        server_thread.daemon = True
        server_thread.start()
Example #3
0
    def setUp(self):
        """Create master/worker directories under a temp root and build both job loaders."""
        self.job = Job('test job', UrlPatterns(), BuiltinOpener, [])
        self.root = tempfile.mkdtemp()

        # One sub-directory per role, both below the temporary root.
        master_dir = os.path.join(self.root, 'master')
        worker_dir = os.path.join(self.root, 'worker')
        for role_dir in (master_dir, worker_dir):
            os.makedirs(role_dir)

        # Addresses are "<ip>:<port>" strings built from the job's own settings.
        worker_addr = '%s:%s' % (get_ip(), self.job.context.job.port)
        master_addr = '%s:%s' % (get_ip(), self.job.context.job.master_port)

        self.master_loader = MasterJobLoader(self.job, master_dir, [worker_addr])
        self.worker_loader = WorkerJobLoader(self.job, worker_dir, master_addr)
Example #4
0
def get_job():
    """Build the Job described by ``user_config.job``.

    Each entry of ``user_config.job.patterns`` becomes a ``Url`` using
    ``GenericParser``, carrying the entry's ``store`` and ``extract`` settings.
    The job is opened with ``MechanizeOpener`` and seeded with ``starts``.
    """
    url_patterns = UrlPatterns(*[
        Url(entry.regex,
            entry.name,
            GenericParser,
            store=entry.store,
            extract=entry.extract)
        for entry in user_config.job.patterns
    ])

    job_conf = user_config.job
    return Job(job_conf.name, url_patterns, MechanizeOpener, starts,
               instances=job_conf.instances,
               user_conf=user_config)
Example #5
0
def get_job_desc():
    """Assemble the JobDescription for the job configured in ``user_config``.

    Every entry of ``user_config.job.patterns`` becomes a ``Url`` using
    ``ImpMakerParser`` with ``priority=1``.  No unit class and no login hook
    are configured.
    """
    url_patterns = UrlPatterns(*[
        Url(entry.regex, entry.name, ImpMakerParser, priority=1)
        for entry in user_config.job.patterns
    ])

    return JobDescription(
        user_config.job.name,
        url_patterns,
        MechanizeOpener,
        user_config,
        starts,
        unit_cls=None,
        login_hook=None)
Example #6
0
    def setUp(self):
        """Create the fake wiki Job fixture and record the local node address.

        Sets ``self.dir`` (a fresh temp directory), ``self.job``,
        ``self.local_node`` ("localhost:<job port>") and ``self.nodes``
        (a list holding only the local node).
        """
        wiki_item = Url(r'^http://zh.wikipedia.org/wiki/[^(:|/)]+$',
                        'wiki_item', FakeWikiParser)
        url_patterns = UrlPatterns(wiki_item)
        fake_user_conf = Config(StringIO(user_conf))

        self.dir = tempfile.mkdtemp()

        start_urls = [
            'http://zh.wikipedia.org/wiki/%E6%97%A0%E6%95%8C%E8%88%B0%E9%98%9F',
        ]
        self.job = Job('fake wiki crawler', url_patterns, MechanizeOpener,
                       start_urls, user_conf=fake_user_conf)

        self.local_node = 'localhost:%s' % self.job.context.job.port
        self.nodes = [self.local_node]
Example #7
0
from login import WeiboLogin
from parsers import WeiboSearchParser
from conf import user_config, instances
from bundle import WeiboSearchBundle

# Debug switch: when true, get_opener() calls opener.br.show(); the flag is
# also forwarded to Job(debug=...) in get_job().
debug = False

def login_hook(opener, **kw):
    """Log in with ``WeiboLogin`` using the given opener and credentials.

    Expects ``username`` and ``password`` keyword arguments (``KeyError`` if
    either is missing) and returns the result of ``WeiboLogin.login()``.
    """
    return WeiboLogin(opener, kw['username'], kw['password']).login()

# URL routing table: s.weibo.com search URLs are registered under the name
# 'weibo_search' with WeiboSearchParser.
url_patterns = UrlPatterns(
    Url(
        r'http://s.weibo.com/weibo/.*',
        'weibo_search',
        WeiboSearchParser
    ),
)

def get_opener():
    """Return a new ``SpynnerOpener``; when ``debug`` is set, call ``br.show()`` on it first."""
    spynner_opener = SpynnerOpener()
    if not debug:
        return spynner_opener
    spynner_opener.br.show()  # debug
    return spynner_opener

def get_job():
    """Return the 'weibo search crawler' bundle Job.

    The job uses ``get_opener`` as its opener, starts with an empty URL list,
    and groups work into ``WeiboSearchBundle`` units.
    """
    job_options = dict(
        is_bundle=True,
        unit_cls=WeiboSearchBundle,
        instances=instances,
        debug=debug,
        user_conf=user_config,
        login_hook=login_hook)
    return Job('weibo search crawler', url_patterns, get_opener, [],
               **job_options)
    
if __name__ == "__main__":
from conf import starts, user_config, instances
from bundle import WeiboUserBundle


def login_hook(opener, **kw):
    """Log in with ``WeiboLogin`` using credentials coerced to ``str``.

    Expects ``username`` and ``password`` keyword arguments (``KeyError`` if
    either is missing) and returns the result of ``WeiboLogin.login()``.
    """
    credentials = (str(kw['username']), str(kw['password']))
    return WeiboLogin(opener, *credentials).login()


# URL routing table: one Url(regex, name, parser) entry per kind of weibo.com
# page this job registers.
url_patterns = UrlPatterns(
    # Micro-blog list requests.
    Url(r'http://weibo.com/aj/mblog/mbloglist.*',
        'micro_blog', MicroBlogParser),
    # "big" requests, registered as forward/comment/like data.
    Url(r'http://weibo.com/aj/.+/big.*',
        'forward_comment_like', ForwardCommentLikeParser),
    # Per-user pages, keyed by numeric user id.
    Url(r'http://weibo.com/\d+/info', 'user_info', UserInfoParser),
    Url(r'http://weibo.com/\d+/follow.*', 'follows', UserFriendParser),
    Url(r'http://weibo.com/\d+/fans.*', 'fans', UserFriendParser))


def get_job():
    return Job('sina weibo crawler',
               url_patterns,
               MechanizeOpener,
               starts,
               is_bundle=True,
               unit_cls=WeiboUserBundle,
               instances=instances,
               debug=False,
               user_conf=user_config,
Example #9
0
        self.store(title, content, last_update)

        def _is_same(out_url, url):
            return out_url.rsplit('#', 1)[0] == url

        for link in br.links():
            q = urlparse.urlparse(link.url)
            if q.scheme in ['http', 'https']:
                out_url = link.url
                if not _is_same(out_url, url):
                    yield out_url
            elif not q.scheme:
                out_url = urlparse.urljoin(link.base_url, link.url)
                if not _is_same(out_url, url):
                    yield out_url


# Register article pages of the zh/en Wikipedia only: the title segment must
# not contain ':' (e.g. "File:..." pages) or '/'.
# NOTE: inside a character class '(', '|' and ')' are literal characters, so
# a class written as [^(:|/)] would also reject titles containing parentheses
# or '|'.  The dots of the host name are escaped so they only match a '.'.
url_patterns = UrlPatterns(
    Url(r'^https://(zh|en)\.wikipedia\.org/wiki/[^:/]+$', 'wiki_page',
        WikiParser))


def get_job_desc():
    """Return the JobDescription of the 'wikipedia crawler' job.

    Combines the module's ``url_patterns`` with ``MechanizeOpener``,
    ``user_config`` and the ``starts`` URLs.
    """
    description = JobDescription(
        'wikipedia crawler',
        url_patterns,
        MechanizeOpener,
        user_config,
        starts)
    return description


if __name__ == "__main__":
    # Script entry point: run the job found in this file's directory through
    # a Context created with local_mode=True.
    from cola.context import Context

    ctx = Context(local_mode=True)
    job_dir = os.path.dirname(os.path.abspath(__file__))
    ctx.run_job(job_dir)
Example #10
0
from cola.core.urls import Url, UrlPatterns
from cola.job import JobDescription

from parsers import DoubanMovieParser
from conf import starts, user_config, instances, mongo_host, mongo_port, db_name
from cola.core.opener import MechanizeOpener
import random


def login_hook(opener, **kw):
    """No-op login hook: ignores ``opener`` and all keyword arguments, always returns True."""
    return True


# URL routing table: douban movie "subject" URLs are registered under the name
# 'subject' with DoubanMovieParser at priority 0.
url_patterns = UrlPatterns(
    # Raw string so that ``\d`` reaches the regex engine verbatim instead of
    # relying on Python passing an unrecognised escape sequence through.
    Url(r'https://movie.douban.com/subject/\d+.*',
        'subject',
        DoubanMovieParser,
        priority=0), )


def get_job_desc():
    """Return the JobDescription of the 'douban spider' job.

    Combines the module's ``url_patterns`` with ``MechanizeOpener``,
    ``user_config`` and the ``starts`` URLs.
    """
    description = JobDescription(
        'douban spider',
        url_patterns,
        MechanizeOpener,
        user_config,
        starts)
    return description


if __name__ == "__main__":
    # Script entry point: run the job found in this file's directory through
    # a Context created with local_mode=True.
    from cola.context import Context

    # Sets http_proxy to '' only when the variable is not already defined.
    os.environ.setdefault('http_proxy', '')

    ctx = Context(local_mode=True)
    job_dir = os.path.dirname(os.path.abspath(__file__))
    ctx.run_job(job_dir)
Example #11
0
from cola.core.urls import Url, UrlPatterns
from cola.job import JobDescription
from cola.core.opener import MechanizeOpener

from login import WeiboLogin as AccountLogin
from parsers import MicroBlogParser, ForwardCommentLikeParser,UserInfoParser
from conf import starts, user_config

def login_hook(opener, **kw):
    """Log in with ``AccountLogin`` using credentials coerced to ``str``.

    Expects ``username`` and ``password`` keyword arguments (``KeyError`` if
    either is missing) and returns the result of ``AccountLogin.login()``.
    """
    credentials = (str(kw['username']), str(kw['password']))
    return AccountLogin(opener, *credentials).login()

# URL routing table: one Url(regex, name, parser, priority) entry per kind of
# weibo.com page this job registers.  All regexes are raw strings so that
# backslash escapes (\w, \d, \?, \=) reach the regex engine verbatim.
url_patterns = UrlPatterns(
        Url(r'http://weibo.com/\w+/\w+\?.*type\=.*', 'micro_blog', MicroBlogParser, priority=0),
        Url(r'http://weibo.com/aj/.+/big.*', 'forward_comment_like', ForwardCommentLikeParser, priority=1),
        Url(r'http://weibo.com/\d+/info.*', 'user_info', UserInfoParser, priority=1),
)

def get_job_desc():
    """Return the JobDescription of the 'weibo post spider' job.

    Combines the module's ``url_patterns`` with ``MechanizeOpener``,
    ``user_config`` and the ``starts`` URLs, and installs ``login_hook``.
    """
    description = JobDescription(
        'weibo post spider',
        url_patterns,
        MechanizeOpener,
        user_config,
        starts,
        login_hook=login_hook)
    return description
    
if __name__ == "__main__":
    # Script entry point: run the job found in this file's directory through
    # a Context created with local_mode=True.
    from cola.context import Context

    ctx = Context(local_mode=True)
    job_dir = os.path.dirname(os.path.abspath(__file__))
    ctx.run_job(job_dir)
Example #12
0
from parsers import MicroBlogParser, UserInfoParser, UserFriendParser
from conf import starts, user_config, instances
from bundle import WeiboUserBundle


def login_hook(opener, **kw):
    """Log in with ``WeiboLogin`` using the given opener and credentials.

    Expects ``username`` and ``password`` keyword arguments (``KeyError`` if
    either is missing) and returns the result of ``WeiboLogin.login()``.
    """
    return WeiboLogin(opener, kw['username'], kw['password']).login()


# URL routing table: one Url(regex, name, parser) entry per kind of weibo.com
# page this job registers.
url_patterns = UrlPatterns(
    # Micro-blog list requests.
    Url(r'http://weibo.com/aj/mblog/mbloglist.*',
        'micro_blog', MicroBlogParser),
    # Per-user pages, keyed by numeric user id.
    Url(r'http://weibo.com/\d+/info', 'user_info', UserInfoParser),
    Url(r'http://weibo.com/\d+/follow.*', 'follows', UserFriendParser),
    Url(r'http://weibo.com/\d+/fans.*', 'fans', UserFriendParser))


def get_job():
    """Return the 'sina weibo crawler' bundle Job.

    The job uses ``MechanizeOpener``, is seeded with ``starts`` and groups
    work into ``WeiboUserBundle`` units; debugging is disabled.
    """
    job_options = dict(
        is_bundle=True,
        unit_cls=WeiboUserBundle,
        instances=instances,
        debug=False,
        user_conf=user_config,
        login_hook=login_hook)
    return Job('sina weibo crawler', url_patterns, MechanizeOpener, starts,
               **job_options)
Example #13
0
from cola.core.opener import SpynnerOpener
from cola.core.urls import Url, UrlPatterns
from cola.job import JobDescription
from cola.core.opener import MechanizeOpener

from login import WeiboLogin
from parsers import WeiboSearchParser, UserHomePageParser
from conf import user_config, instances
from bundle import WeiboSearchBundle
from conf import starts, user_config, instances

def login_hook(opener, **kw):
    """Log in with ``WeiboLogin`` using the given opener and credentials.

    Expects ``username`` and ``password`` keyword arguments (``KeyError`` if
    either is missing) and returns the result of ``WeiboLogin.login()``.
    """
    return WeiboLogin(opener, kw['username'], kw['password']).login()

# URL routing table: search-result pages and user home pages, both at
# priority 0.
# The backslashes of the second regex are doubled so the unicode literal
# carries ``\d`` and ``\?`` explicitly rather than through unrecognised
# escape sequences; the resulting pattern text is unchanged.
url_patterns = UrlPatterns(
    Url(u'http://s.weibo.com/weibo/.*', 'weibo_search', WeiboSearchParser, priority=0),
    Url(u'http://weibo.com/\\d+\\?.*', 'user_home', UserHomePageParser, priority=0),
)

def get_job_desc():
    """Return the JobDescription of the 'weibo search' job.

    Combines the module's ``url_patterns`` with ``MechanizeOpener``,
    ``user_config`` and the ``starts`` URLs, and installs ``login_hook``.
    """
    description = JobDescription(
        'weibo search',
        url_patterns,
        MechanizeOpener,
        user_config,
        starts,
        login_hook=login_hook)
    return description
    
if __name__ == "__main__":
    # Script entry point: run the job found in this file's directory through
    # a Context created with local_mode=True.
    from cola.context import Context

    ctx = Context(local_mode=True)
    job_dir = os.path.dirname(os.path.abspath(__file__))
    ctx.run_job(job_dir)