def testUrlPatterns(self):
    url_patterns = UrlPatterns(
        Url(r'^http://zh.wikipedia.org/wiki/[^FILE][^/]+$', 'wiki_item', FakeParser)
    )
    urls = ['http://zh.wikipedia.org/wiki/%E6%97%A0%E6%95%8C%E8%88%B0%E9%98%9F', ]
    self.assertTrue(list(url_patterns.matches(urls)), urls)
    self.assertEqual(url_patterns.get_parser(urls[0]), FakeParser)
    self.assertFalse(
        Url(r'^http://zh.wikipedia.org/wiki/[^FILE][^/]+$', None, None)
        .match('http://zh.wikipedia.org/wiki/File:Flag_of_Cross_of_Burgundy.svg'))
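# Aside: '[^FILE]' in the pattern above is a character class (one character
# that is not F, I, L or E), not a negated literal 'FILE'. The assertion still
# holds because 'File:...' starts with 'F', but a negative lookahead states
# the intent directly. A minimal sketch with the stdlib re module, reusing the
# same URLs as the test:
import re

NON_FILE_PAGE = re.compile(r'^http://zh.wikipedia.org/wiki/(?!File:)[^/]+$')
assert NON_FILE_PAGE.match('http://zh.wikipedia.org/wiki/%E6%97%A0%E6%95%8C%E8%88%B0%E9%98%9F')
assert not NON_FILE_PAGE.match('http://zh.wikipedia.org/wiki/File:Flag_of_Cross_of_Burgundy.svg')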
def setUp(self):
    url_patterns = UrlPatterns(
        Url(r'^http://zh.wikipedia.org/wiki/[^(:|/)]+$', 'wiki_item', FakeWikiParser))
    fake_user_conf = Config(StringIO(user_conf))
    self.dir = tempfile.mkdtemp()
    self.job = Job('fake wiki crawler', url_patterns, MechanizeOpener,
                   ['http://zh.wikipedia.org/wiki/%E6%97%A0%E6%95%8C%E8%88%B0%E9%98%9F', ],
                   user_conf=fake_user_conf)
    local_node = 'localhost:%s' % self.job.context.job.port
    nodes = [local_node, ]
    self.rpc_server = ColaRPCServer(('localhost', self.job.context.job.port))
    self.loader = JobLoader(self.job)
    self.loader.init_mq(self.rpc_server, nodes, local_node, self.dir)
    thd = threading.Thread(target=self.rpc_server.serve_forever)
    thd.setDaemon(True)
    thd.start()
def setUp(self):
    self.job = Job('test job', UrlPatterns(), BuiltinOpener, [])
    self.root = tempfile.mkdtemp()
    master_root = os.path.join(self.root, 'master')
    worker_root = os.path.join(self.root, 'worker')
    os.makedirs(master_root)
    os.makedirs(worker_root)
    node = '%s:%s' % (get_ip(), self.job.context.job.port)
    nodes = [node]
    master = '%s:%s' % (get_ip(), self.job.context.job.master_port)
    self.master_loader = MasterJobLoader(self.job, master_root, nodes)
    self.worker_loader = WorkerJobLoader(self.job, worker_root, master)
def get_job():
    urls = []
    for pattern in user_config.job.patterns:
        url_pattern = Url(pattern.regex, pattern.name, GenericParser,
                          store=pattern.store, extract=pattern.extract)
        urls.append(url_pattern)
    url_patterns = UrlPatterns(*urls)
    return Job(user_config.job.name, url_patterns, MechanizeOpener, starts,
               instances=user_config.job.instances, user_conf=user_config)
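# For reference, a self-contained stand-in for user_config.job.patterns
# (hypothetical values) that can exercise a loop like the one in get_job()
# without a real conf file; the attribute names mirror the pattern.regex /
# pattern.name / pattern.store / pattern.extract accesses above:
from collections import namedtuple

FakePattern = namedtuple('FakePattern', 'regex name store extract')
fake_patterns = [
    FakePattern(r'^http://example.com/item/\d+$', 'item', True, True),
    FakePattern(r'^http://example.com/list\?page=\d+$', 'list', False, True),
]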
def get_job_desc():
    urls = []
    for pattern in user_config.job.patterns:
        url_pattern = Url(pattern.regex, pattern.name, ImpMakerParser, priority=1)
        urls.append(url_pattern)
    url_patterns = UrlPatterns(*urls)
    return JobDescription(user_config.job.name, url_patterns, MechanizeOpener,
                          user_config, starts, unit_cls=None, login_hook=None)
def setUp(self):
    url_patterns = UrlPatterns(
        Url(r'^http://zh.wikipedia.org/wiki/[^(:|/)]+$', 'wiki_item', FakeWikiParser))
    fake_user_conf = Config(StringIO(user_conf))
    self.dir = tempfile.mkdtemp()
    self.job = Job('fake wiki crawler', url_patterns, MechanizeOpener,
                   ['http://zh.wikipedia.org/wiki/%E6%97%A0%E6%95%8C%E8%88%B0%E9%98%9F', ],
                   user_conf=fake_user_conf)
    self.local_node = 'localhost:%s' % self.job.context.job.port
    self.nodes = [self.local_node, ]
# The cola framework imports below are assumed; they are required by the code
# but were not present in the original snippet.
from cola.core.opener import SpynnerOpener
from cola.core.urls import Url, UrlPatterns
from cola.job import Job

from login import WeiboLogin
from parsers import WeiboSearchParser
from conf import user_config, instances
from bundle import WeiboSearchBundle

debug = False

def login_hook(opener, **kw):
    username = kw['username']
    passwd = kw['password']
    loginer = WeiboLogin(opener, username, passwd)
    return loginer.login()

url_patterns = UrlPatterns(
    Url(r'http://s.weibo.com/weibo/.*', 'weibo_search', WeiboSearchParser),
)

def get_opener():
    opener = SpynnerOpener()
    if debug:
        opener.br.show()  # show the browser window when debugging
    return opener

def get_job():
    return Job('weibo search crawler', url_patterns, get_opener, [],
               is_bundle=True, unit_cls=WeiboSearchBundle, instances=instances,
               debug=debug, user_conf=user_config, login_hook=login_hook)

if __name__ == "__main__":
    pass  # entry-point body missing from the original snippet
# The framework and login/parser imports are assumed; the original snippet
# begins at the conf import.
from cola.core.opener import MechanizeOpener
from cola.core.urls import Url, UrlPatterns
from cola.job import Job

from login import WeiboLogin
from parsers import MicroBlogParser, ForwardCommentLikeParser, UserInfoParser, UserFriendParser
from conf import starts, user_config, instances
from bundle import WeiboUserBundle

def login_hook(opener, **kw):
    username = str(kw['username'])
    passwd = str(kw['password'])
    loginer = WeiboLogin(opener, username, passwd)
    return loginer.login()

url_patterns = UrlPatterns(
    Url(r'http://weibo.com/aj/mblog/mbloglist.*', 'micro_blog', MicroBlogParser),
    Url(r'http://weibo.com/aj/.+/big.*', 'forward_comment_like', ForwardCommentLikeParser),
    Url(r'http://weibo.com/\d+/info', 'user_info', UserInfoParser),
    Url(r'http://weibo.com/\d+/follow.*', 'follows', UserFriendParser),
    Url(r'http://weibo.com/\d+/fans.*', 'fans', UserFriendParser))

def get_job():
    return Job('sina weibo crawler', url_patterns, MechanizeOpener, starts,
               is_bundle=True, unit_cls=WeiboUserBundle, instances=instances,
               debug=False, user_conf=user_config,
               login_hook=login_hook)  # closing argument reconstructed from the parallel module below
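# A quick routing sanity check against the patterns above, mirroring the
# get_parser usage in the UrlPatterns test earlier; the URL is a made-up
# example of the /info form.
if __name__ == "__main__":
    sample = 'http://weibo.com/1234567890/info'
    assert url_patterns.get_parser(sample) == UserInfoParser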
# Fragment: tail of the WikiParser parse method (the module top, including
# the os/urlparse imports it relies on, is not shown in the original).
        self.store(title, content, last_update)

        def _is_same(out_url, url):
            # treat URLs differing only by a #fragment as the same page
            return out_url.rsplit('#', 1)[0] == url

        for link in br.links():
            q = urlparse.urlparse(link.url)
            if q.scheme in ['http', 'https']:
                out_url = link.url
                if not _is_same(out_url, url):
                    yield out_url
            elif not q.scheme:
                # relative link: resolve against the page it was found on
                out_url = urlparse.urljoin(link.base_url, link.url)
                if not _is_same(out_url, url):
                    yield out_url

url_patterns = UrlPatterns(
    Url(r'^https://(zh|en).wikipedia.org/wiki/[^(:|/)]+$', 'wiki_page', WikiParser))

def get_job_desc():
    return JobDescription('wikipedia crawler', url_patterns, MechanizeOpener,
                          user_config, starts)

if __name__ == "__main__":
    from cola.context import Context

    ctx = Context(local_mode=True)
    ctx.run_job(os.path.dirname(os.path.abspath(__file__)))
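# Standalone illustration of the two link rules used in the parser tail above,
# assuming the snippet's Python 2-style urlparse module; the URLs are made up.
import urlparse

base = 'https://zh.wikipedia.org/wiki/A'
# relative links are resolved against the page they were found on
assert urlparse.urljoin(base, '/wiki/B') == 'https://zh.wikipedia.org/wiki/B'
# fragment-only links point back at the same page, so they are skipped
assert 'https://zh.wikipedia.org/wiki/A#refs'.rsplit('#', 1)[0] == base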
import os
import random

from cola.core.opener import MechanizeOpener
from cola.core.urls import Url, UrlPatterns
from cola.job import JobDescription

from parsers import DoubanMovieParser
from conf import starts, user_config, instances, mongo_host, mongo_port, db_name

def login_hook(opener, **kw):
    # douban needs no login; accept unconditionally
    return True

url_patterns = UrlPatterns(
    Url(r'https://movie.douban.com/subject/\d+.*', 'subject', DoubanMovieParser, priority=0),
)

def get_job_desc():
    return JobDescription('douban spider', url_patterns, MechanizeOpener,
                          user_config, starts)

if __name__ == "__main__":
    from cola.context import Context

    os.environ.setdefault('http_proxy', '')
    ctx = Context(local_mode=True)
    ctx.run_job(os.path.dirname(os.path.abspath(__file__)))
import os

from cola.core.opener import MechanizeOpener
from cola.core.urls import Url, UrlPatterns
from cola.job import JobDescription

from login import WeiboLogin as AccountLogin
from parsers import MicroBlogParser, ForwardCommentLikeParser, UserInfoParser
from conf import starts, user_config

def login_hook(opener, **kw):
    username = str(kw['username'])
    passwd = str(kw['password'])
    loginer = AccountLogin(opener, username, passwd)
    ret = loginer.login()
    return ret

url_patterns = UrlPatterns(
    Url(r'http://weibo.com/\w+/\w+\?.*type=.*', 'micro_blog', MicroBlogParser, priority=0),
    Url(r'http://weibo.com/aj/.+/big.*', 'forward_comment_like', ForwardCommentLikeParser, priority=1),
    Url(r'http://weibo.com/\d+/info.*', 'user_info', UserInfoParser, priority=1),
)

def get_job_desc():
    return JobDescription('weibo post spider', url_patterns, MechanizeOpener,
                          user_config, starts, login_hook=login_hook)

if __name__ == "__main__":
    from cola.context import Context

    ctx = Context(local_mode=True)
    ctx.run_job(os.path.dirname(os.path.abspath(__file__)))
# The login and framework imports are assumed; the original snippet begins at
# the parsers import.
from cola.core.opener import MechanizeOpener
from cola.core.urls import Url, UrlPatterns
from cola.job import Job

from login import WeiboLogin
from parsers import MicroBlogParser, UserInfoParser, UserFriendParser
from conf import starts, user_config, instances
from bundle import WeiboUserBundle

def login_hook(opener, **kw):
    username = kw['username']
    passwd = kw['password']
    loginer = WeiboLogin(opener, username, passwd)
    return loginer.login()

url_patterns = UrlPatterns(
    Url(r'http://weibo.com/aj/mblog/mbloglist.*', 'micro_blog', MicroBlogParser),
    Url(r'http://weibo.com/\d+/info', 'user_info', UserInfoParser),
    Url(r'http://weibo.com/\d+/follow.*', 'follows', UserFriendParser),
    Url(r'http://weibo.com/\d+/fans.*', 'fans', UserFriendParser))

def get_job():
    return Job('sina weibo crawler', url_patterns, MechanizeOpener, starts,
               is_bundle=True, unit_cls=WeiboUserBundle, instances=instances,
               debug=False, user_conf=user_config,
               login_hook=login_hook)
import os

from cola.core.opener import MechanizeOpener, SpynnerOpener
from cola.core.urls import Url, UrlPatterns
from cola.job import JobDescription

from login import WeiboLogin
from parsers import WeiboSearchParser, UserHomePageParser
from bundle import WeiboSearchBundle
from conf import starts, user_config, instances

def login_hook(opener, **kw):
    username = kw['username']
    passwd = kw['password']
    loginer = WeiboLogin(opener, username, passwd)
    return loginer.login()

url_patterns = UrlPatterns(
    Url(r'http://s.weibo.com/weibo/.*', 'weibo_search', WeiboSearchParser, priority=0),
    Url(r'http://weibo.com/\d+\?.*', 'user_home', UserHomePageParser, priority=0),
)

def get_job_desc():
    return JobDescription('weibo search', url_patterns, MechanizeOpener,
                          user_config, starts, login_hook=login_hook)

if __name__ == "__main__":
    from cola.context import Context

    ctx = Context(local_mode=True)
    ctx.run_job(os.path.dirname(os.path.abspath(__file__)))