Example #1
    def testUrlPatterns(self):
        # Note that [^FILE] is a character class, not the literal word
        # "FILE": it rejects any title starting with F, I, L or E, which
        # is what keeps 'File:' pages from matching below.
        url_patterns = UrlPatterns(
            Url(r'^http://zh.wikipedia.org/wiki/[^FILE][^/]+$', 'wiki_item',
                FakeParser)
        )

        urls = ['http://zh.wikipedia.org/wiki/%E6%97%A0%E6%95%8C%E8%88%B0%E9%98%9F']
        self.assertTrue(list(url_patterns.matches(urls)), urls)
        self.assertEqual(url_patterns.get_parser(urls[0]), FakeParser)

        # A 'File:' URL must not match the pattern.
        self.assertFalse(
            Url(r'^http://zh.wikipedia.org/wiki/[^FILE][^/]+$', None, None)
            .match('http://zh.wikipedia.org/wiki/File:Flag_of_Cross_of_Burgundy.svg'))
Example #2
    def setUp(self):
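        # [^(:|/)]+ is a character class: it excludes the literal
        # characters ( : | / ), which filters out 'File:'-style pages
        # and sub-paths.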
        url_patterns = UrlPatterns(
            Url(r'^http://zh.wikipedia.org/wiki/[^(:|/)]+$', 'wiki_item',
                FakeWikiParser))
        fake_user_conf = Config(StringIO(user_conf))

        self.dir = tempfile.mkdtemp()

        self.job = Job(
            'fake wiki crawler',
            url_patterns,
            MechanizeOpener, [
                'http://zh.wikipedia.org/wiki/%E6%97%A0%E6%95%8C%E8%88%B0%E9%98%9F',
            ],
            user_conf=fake_user_conf)

        local_node = 'localhost:%s' % self.job.context.job.port
        nodes = [
            local_node,
        ]

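        # Stand up a local RPC server and bind the job's message queue
        # to it, so the loader runs as a single local node.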
        self.rpc_server = ColaRPCServer(
            ('localhost', self.job.context.job.port))
        self.loader = JobLoader(self.job)
        self.loader.init_mq(self.rpc_server, nodes, local_node, self.dir)

        # Serve RPC requests on a daemon thread so the test process can
        # exit without joining it.
        thd = threading.Thread(target=self.rpc_server.serve_forever)
        thd.daemon = True
        thd.start()
Example #3
    def setUp(self):
        self.job = Job('test job', UrlPatterns(), BuiltinOpener, [])
        self.root = tempfile.mkdtemp()

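        # Master and worker each get their own working directory under
        # a throw-away temp root.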
        master_root = os.path.join(self.root, 'master')
        worker_root = os.path.join(self.root, 'worker')
        os.makedirs(master_root)
        os.makedirs(worker_root)

        node = '%s:%s' % (get_ip(), self.job.context.job.port)
        nodes = [node]
        master = '%s:%s' % (get_ip(), self.job.context.job.master_port)

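        # Run a master loader and a worker loader side by side in one
        # process; the worker points at the master's address.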
        self.master_loader = MasterJobLoader(self.job, master_root, nodes)
        self.worker_loader = WorkerJobLoader(self.job, worker_root, master)
Example #4
def get_job():
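    # Build one Url pattern per entry in the user's job config ('starts'
    # and 'user_config' come from the job's conf module); each pattern
    # keeps its own store/extract settings.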
    urls = []
    for pattern in user_config.job.patterns:
        url_pattern = Url(pattern.regex,
                          pattern.name,
                          GenericParser,
                          store=pattern.store,
                          extract=pattern.extract)
        urls.append(url_pattern)
    url_patterns = UrlPatterns(*urls)

    return Job(user_config.job.name,
               url_patterns,
               MechanizeOpener,
               starts,
               instances=user_config.job.instances,
               user_conf=user_config)
Example #5
def get_job_desc():
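    # Like get_job above, but builds the newer JobDescription; every
    # pattern is routed to ImpMakerParser at the same priority.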
    urls = []
    for pattern in user_config.job.patterns:
        url_pattern = Url(pattern.regex,
                          pattern.name,
                          ImpMakerParser,
                          priority=1)
        urls.append(url_pattern)
    url_patterns = UrlPatterns(*urls)

    return JobDescription(user_config.job.name,
                          url_patterns,
                          MechanizeOpener,
                          user_config,
                          starts,
                          unit_cls=None,
                          login_hook=None)
Example #6
    def setUp(self):
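        # Same fixture as Example #2, but it stops after recording the
        # local node address list.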
        url_patterns = UrlPatterns(
            Url(r'^http://zh.wikipedia.org/wiki/[^(:|/)]+$', 'wiki_item',
                FakeWikiParser))
        fake_user_conf = Config(StringIO(user_conf))

        self.dir = tempfile.mkdtemp()

        self.job = Job(
            'fake wiki crawler',
            url_patterns,
            MechanizeOpener, [
                'http://zh.wikipedia.org/wiki/%E6%97%A0%E6%95%8C%E8%88%B0%E9%98%9F',
            ],
            user_conf=fake_user_conf)

        self.local_node = 'localhost:%s' % self.job.context.job.port
        self.nodes = [
            self.local_node,
        ]
Example #7
# cola imports implied by the code below; module paths are assumed to
# match the other examples in this project.
from cola.core.urls import Url, UrlPatterns
from cola.core.opener import SpynnerOpener
from cola.job import Job

from login import WeiboLogin
from parsers import WeiboSearchParser
from conf import user_config, instances
from bundle import WeiboSearchBundle

debug = False

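# Cola calls login_hook with the opener plus the credentials taken from
# the job's config; a falsy return value is treated as a failed login.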
def login_hook(opener, **kw):
    username = kw['username']
    passwd = kw['password']
    
    loginer = WeiboLogin(opener, username, passwd)
    return loginer.login()

url_patterns = UrlPatterns(
    Url(r'http://s.weibo.com/weibo/.*', 'weibo_search', WeiboSearchParser),
)

def get_opener():
    opener = SpynnerOpener()
    if debug:
        opener.br.show() # debug
    return opener

def get_job():
    return Job('weibo search crawler', url_patterns, get_opener, [],
               is_bundle=True, unit_cls=WeiboSearchBundle, 
               instances=instances, debug=debug, user_conf=user_config,
               login_hook=login_hook)
    
Example #8
# Imports for the names used below; module paths are assumed to match
# the other weibo examples in this project.
from cola.core.urls import Url, UrlPatterns
from cola.core.opener import MechanizeOpener
from cola.job import Job

from login import WeiboLogin
from parsers import (MicroBlogParser, ForwardCommentLikeParser,
                     UserInfoParser, UserFriendParser)
from conf import starts, user_config, instances
from bundle import WeiboUserBundle


def login_hook(opener, **kw):
    username = str(kw['username'])
    passwd = str(kw['password'])

    loginer = WeiboLogin(opener, username, passwd)
    return loginer.login()


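# One pattern per weibo.com URL family: the user's micro-blog list,
# forwards/comments/likes, profile info, follows and fans.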
url_patterns = UrlPatterns(
    Url(r'http://weibo.com/aj/mblog/mbloglist.*', 'micro_blog',
        MicroBlogParser),
    Url(r'http://weibo.com/aj/.+/big.*', 'forward_comment_like',
        ForwardCommentLikeParser),
    Url(r'http://weibo.com/\d+/info', 'user_info', UserInfoParser),
    Url(r'http://weibo.com/\d+/follow.*', 'follows', UserFriendParser),
    Url(r'http://weibo.com/\d+/fans.*', 'fans', UserFriendParser))


def get_job():
    return Job('sina weibo crawler',
               url_patterns,
               MechanizeOpener,
               starts,
               is_bundle=True,
               unit_cls=WeiboUserBundle,
               instances=instances,
               debug=False,
               user_conf=user_config,
               login_hook=login_hook)
Example #9
File: __init__.py Project: brightgems/cola
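        # Tail of a wiki page parser: store the parsed page, then yield
        # each out-link for the crawler to follow.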
        self.store(title, content, last_update)

        def _is_same(out_url, url):
            return out_url.rsplit('#', 1)[0] == url

        for link in br.links():
            q = urlparse.urlparse(link.url)
            if q.scheme in ['http', 'https']:
                out_url = link.url
                if not _is_same(out_url, url):
                    yield out_url
            elif not q.scheme:
                out_url = urlparse.urljoin(link.base_url, link.url)
                if not _is_same(out_url, url):
                    yield out_url


url_patterns = UrlPatterns(
    Url(r'^https://(zh|en).wikipedia.org/wiki/[^(:|/)]+$', 'wiki_page',
        WikiParser))


def get_job_desc():
    return JobDescription('wikipedia crawler', url_patterns, MechanizeOpener,
                          user_config, starts)


if __name__ == "__main__":
    import os
    from cola.context import Context
    ctx = Context(local_mode=True)
    ctx.run_job(os.path.dirname(os.path.abspath(__file__)))
Example #10
from cola.core.urls import Url, UrlPatterns
from cola.job import JobDescription

from parsers import DoubanMovieParser
from conf import starts, user_config, instances, mongo_host, mongo_port, db_name
from cola.core.opener import MechanizeOpener
import os
import random


def login_hook(opener, **kw):
    # Douban needs no login here, so the hook just reports success.
    return True


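# Every Douban movie page lives under /subject/<id>, so a single
# pattern covers the whole crawl.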
url_patterns = UrlPatterns(
    Url(r'https://movie.douban.com/subject/\d+.*',
        'subject',
        DoubanMovieParser,
        priority=0),
)


def get_job_desc():
    return JobDescription('douban spider', url_patterns, MechanizeOpener,
                          user_config, starts)


if __name__ == "__main__":
    from cola.context import Context
    os.environ.setdefault('http_proxy', '')
    ctx = Context(local_mode=True)
    ctx.run_job(os.path.dirname(os.path.abspath(__file__)))
Example #11
File: __init__.py Project: brightgems/cola
import os

from cola.core.urls import Url, UrlPatterns
from cola.job import JobDescription
from cola.core.opener import MechanizeOpener

from login import WeiboLogin as AccountLogin
from parsers import MicroBlogParser, ForwardCommentLikeParser, UserInfoParser
from conf import starts, user_config

def login_hook(opener, **kw):
    username = str(kw['username'])
    passwd = str(kw['password'])

    loginer = AccountLogin(opener, username, passwd)
    return loginer.login()

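# Explicit priorities hint the scheduler about fetch order across the
# three pattern families.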
url_patterns = UrlPatterns(
    Url(r'http://weibo.com/\w+/\w+\?.*type=.*', 'micro_blog',
        MicroBlogParser, priority=0),
    Url(r'http://weibo.com/aj/.+/big.*', 'forward_comment_like',
        ForwardCommentLikeParser, priority=1),
    Url(r'http://weibo.com/\d+/info.*', 'user_info', UserInfoParser,
        priority=1),
)

def get_job_desc():
    return JobDescription('weibo post spider', url_patterns, MechanizeOpener, user_config, 
                          starts, login_hook=login_hook)
    
if __name__ == "__main__":
    from cola.context import Context
    ctx = Context(local_mode=True)
    ctx.run_job(os.path.dirname(os.path.abspath(__file__)))
Example #12
File: __init__.py Project: iswangheng/cola
# Imports for the names used below; module paths are assumed to match
# the other weibo examples in this project.
from cola.core.urls import Url, UrlPatterns
from cola.core.opener import MechanizeOpener
from cola.job import Job

from login import WeiboLogin
from parsers import MicroBlogParser, UserInfoParser, UserFriendParser
from conf import starts, user_config, instances
from bundle import WeiboUserBundle


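# Same credential hook as in Example #8: delegate to WeiboLogin and
# return its success flag.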
def login_hook(opener, **kw):
    username = kw['username']
    passwd = kw['password']

    loginer = WeiboLogin(opener, username, passwd)
    return loginer.login()


url_patterns = UrlPatterns(
    Url(r'http://weibo.com/aj/mblog/mbloglist.*', 'micro_blog',
        MicroBlogParser),
    Url(r'http://weibo.com/\d+/info', 'user_info', UserInfoParser),
    Url(r'http://weibo.com/\d+/follow.*', 'follows', UserFriendParser),
    Url(r'http://weibo.com/\d+/fans.*', 'fans', UserFriendParser))


def get_job():
    return Job('sina weibo crawler',
               url_patterns,
               MechanizeOpener,
               starts,
               is_bundle=True,
               unit_cls=WeiboUserBundle,
               instances=instances,
               debug=False,
               user_conf=user_config,
               login_hook=login_hook)
Example #13
File: __init__.py Project: brightgems/cola
import os

from cola.core.urls import Url, UrlPatterns
from cola.job import JobDescription
from cola.core.opener import MechanizeOpener, SpynnerOpener

from login import WeiboLogin
from parsers import WeiboSearchParser, UserHomePageParser
from conf import starts, user_config, instances
from bundle import WeiboSearchBundle

def login_hook(opener, **kw):
    username = kw['username']
    passwd = kw['password']
    
    loginer = WeiboLogin(opener, username, passwd)
    return loginer.login()

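# Search-result pages and user home pages are routed to separate
# parsers at equal priority.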
url_patterns = UrlPatterns(
    Url(r'http://s.weibo.com/weibo/.*', 'weibo_search', WeiboSearchParser,
        priority=0),
    Url(r'http://weibo.com/\d+\?.*', 'user_home', UserHomePageParser,
        priority=0),
)

def get_job_desc():
    return JobDescription('weibo search', url_patterns, MechanizeOpener, user_config, 
                          starts, login_hook=login_hook)
    
if __name__ == "__main__":
    from cola.context import Context
    ctx = Context(local_mode=True)
    ctx.run_job(os.path.dirname(os.path.abspath(__file__)))