Example #1
def get_project_settings():

# Designating the settings
# ENVVAR = 'SCRAPY_SETTINGS_MODULE'
# When you use Scrapy, you have to tell it which settings you are using. You can
# do this by using the environment variable SCRAPY_SETTINGS_MODULE.
#
# The value of SCRAPY_SETTINGS_MODULE should be in Python path syntax, e.g.
# myproject.settings. Note that the settings module should be on the Python
# import search path.
    if ENVVAR not in os.environ:
        project = os.environ.get('SCRAPY_PROJECT', 'default')
        init_env(project)
    settings_module_path = os.environ.get(ENVVAR)
    if settings_module_path:
        settings_module = import_module(settings_module_path)
    else:
        settings_module = None
    settings = CrawlerSettings(settings_module)

    # XXX: remove this hack
    pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
    settings.overrides = pickle.loads(pickled_settings) if pickled_settings else {}

    # XXX: deprecate and remove this functionality
    for k, v in os.environ.items():
        if k.startswith('SCRAPY_'):
            settings.overrides[k[7:]] = v

    return settings
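
A minimal usage sketch, assuming a module named myproject.settings is importable; the helper then wraps it in a CrawlerSettings object that falls back to Scrapy's built-in defaults:

import os

os.environ['SCRAPY_SETTINGS_MODULE'] = 'myproject.settings'  # hypothetical module path
settings = get_project_settings()
print(settings.get('BOT_NAME'))  # value from myproject.settings, or Scrapy's default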
Example #2
def _build_settings(settings=None):
    if settings is None:
        settings = CrawlerSettings()
    elif isinstance(settings, dict):
        values = settings
        settings = CrawlerSettings()
        settings.defaults.update(values)
    return settings
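
A short sketch of calling _build_settings with a plain dict (values chosen arbitrarily); in the old CrawlerSettings lookup order, explicit overrides take precedence over these defaults:

settings = _build_settings({'DOWNLOAD_DELAY': 2.0, 'LOG_ENABLED': False})
print(settings.getfloat('DOWNLOAD_DELAY'))  # 2.0, served from settings.defaults
settings.overrides['DOWNLOAD_DELAY'] = 0.5
print(settings.getfloat('DOWNLOAD_DELAY'))  # 0.5, overrides win over defaults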
Example #3
 def __init__(self,
     start_url_loader,
     loger,
     setting_module_path = 'le_crawler.common.headline_album_settings',
     *kargs,
     **kwargs):
   __import__(setting_module_path)
   self.__settings = CrawlerSettings(settings_module
       = sys.modules[setting_module_path])
   self.loger = loger
   # {album_id, {}}
   self.album_ids = {}
   self.__init_regs()
   self.__extend_map_handler = kwargs['extend_map_handler'] if \
   kwargs.has_key('extend_map_handler') else None
   if kwargs.has_key('extract_setting'):
     self.__extend_map_handler = \
     ExtendMapHandler.get_instance(start_url_loader,
         kwargs['extract_setting'])
   else:
     self.__extend_map_handler =\
         ExtendMapHandler.get_instance(start_url_loader)
   self.__url_normalize = UrlNormalize.get_instance()
   self.album_infos = {}
   self.url_filter = UrlFilter().get_instance()
Example #4
 def setUp(self):
     """Initialize the test."""
     crawler = Crawler(CrawlerSettings(settings))
     crawler.configure()
     self.spider = ebird_spider.EBirdSpider('REG')
     self.spider.set_crawler(crawler)
     self.requests = self.spider.start_requests()
Example #5
def create_root(config):
    from scrapy import log
    from scrapy.settings import CrawlerSettings
    from slyd.crawlerspec import (CrawlerSpecManager,
                                  create_crawler_spec_resource)
    from slyd.bot import create_bot_resource
    import slyd.settings
    from slyd.projects import ProjectsResource

    root = Resource()
    root.putChild("static", File(config['docroot']))

    crawler_settings = CrawlerSettings(settings_module=slyd.settings)
    spec_manager = CrawlerSpecManager(crawler_settings)

    # add project management at /projects
    projects = ProjectsResource(crawler_settings)
    root.putChild('projects', projects)

    # add crawler at /projects/PROJECT_ID/bot
    log.msg("Slybot specs loading from %s/[PROJECT]" % spec_manager.basedir,
            level=log.DEBUG)
    projects.putChild("bot", create_bot_resource(spec_manager))

    # add spec at /projects/PROJECT_ID/spec
    spec = create_crawler_spec_resource(spec_manager)
    projects.putChild("spec", spec)
    return root
Example #6
    def run(self):
        feconfig = self.configdata[const.FE_CONFIG]
        try:
            #=======================================================================
            # fall back to the default config if the city has no entry of its own
            #=======================================================================
            city_config = eval(feconfig[self.city_name])
        except Exception:
            city_config = {}

        start_page = city_config.get(const.START_PAGE,
                                     feconfig[const.DEFAULT_START_PAGE])
        end_page = city_config.get(const.END_PAGE,
                                   feconfig[const.DEFAULT_END_PAGE])

        values = {
            const.CONFIG_DATA: self.configdata,
            const.START_PAGE: int(start_page),
            const.END_PAGE: int(end_page),
        }

        settings = u'crawler.shc.fe.settings'
        module_import = __import__(settings, {}, {}, [''])
        settings = CrawlerSettings(module_import, values=values)
        execute(argv=["scrapy", "crawl", 'SHCSpider'], settings=settings)
Example #7
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings

    def catch_item(sender, item, **kwargs):
        #log.msg("Got:" + str(item))
        pass

    dispatcher.connect(catch_item, signal=signals.item_passed)
    """clean storage"""
    scraperwiki.sqlite.execute("drop table if exists " + spider.name)
    scraperwiki.sqlite.commit()

    from scrapy.crawler import CrawlerProcess

    settings = CrawlerSettings(values=settings)

    crawler = CrawlerProcess(settings)
    crawler.install()
    crawler.configure()
    crawler.crawl(spider)

    #log.start(loglevel='DEBUG')

    crawler.start()
Example #8
 def setUp(self):
     """Initialize the test."""
     settings.LOG_LEVEL = 'DEBUG'
     crawler = Crawler(CrawlerSettings(settings))
     crawler.configure()
     self.spider = ebird_spider.EBirdSpider('REG')
     self.spider.set_crawler(crawler)
Example #9
def valid_proxy():
    module_ = __import__('crawler.httpproxy.settings', {}, {}, [''])
    values = {
        u'RETRY_ENABLED': 0,
        u'DOWNLOAD_TIMEOUT': 2,
    }
    settings = CrawlerSettings(module_, values=values)
    execute(argv=["scrapy", "crawl", "BaiDuHomePageSpider"], settings=settings)
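
The values mapping is stored on the underlying Settings object and can be read back through the typed getters; a minimal sketch with no settings module attached (key names as in the function above):

from scrapy.settings import CrawlerSettings

settings = CrawlerSettings(None, values={u'RETRY_ENABLED': 0, u'DOWNLOAD_TIMEOUT': 2})
print(settings.getbool('RETRY_ENABLED'))    # False
print(settings.getint('DOWNLOAD_TIMEOUT'))  # 2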
Example #10
def main():
    """Setups item signal and run the spider"""

    # set up signal to catch items scraped
    def catch_item(sender, item, **kwargs):
        try:
            print(
                """\
    <item>
        <title>#{number}: {title}</title>
        <link>{link}</link>
        <description>{description}</description>
        <pubDate>{pubdate}</pubDate>
        <guid>{audio_url}</guid>
        <enclosure url="{audio_url}" length="0" type="audio/mpeg" />
    </item>\
""".format(**item))
        except:
            print 'ERROR', item

    # shut off log
    settings = CrawlerSettings()
    settings.overrides['LOG_ENABLED'] = False

    # set up crawler

    crawler = Crawler(settings)
    crawler.signals.connect(catch_item, signal=signals.item_passed)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()

    # schedule spider
    crawler.crawl(TALSpider())

    # print header
    with open('header.xml') as f:
        print f.read()

    # start engine scrapy/twisted
    crawler.start()
    reactor.run()

    # print footer
    with open('footer.xml') as f:
        print f.read()
Example #12
 def setUp(self):
     """Initialize the test."""
     crawler = Crawler(CrawlerSettings(settings))
     crawler.configure()
     self.spider = ebird_spider.EBirdSpider('REG')
     self.spider.set_crawler(crawler)
     self.spider.start_requests()
     self.records = [{
         'checklistID': 'CL00001',
         'comName': 'Common Name',
         'countryCode': 'CC',
         'countryName': 'Country',
         'firstName': 'Name',
         'howMany': 1,
         'lastName': 'Surname',
         'lat': 45.000000,
         'lng': -45.000000,
         'locID': 'L0000001',
         'locName': 'Location 1',
         'locationPrivate': True,
         'obsDt': '2013-03-27 09:00',
         'obsID': 'OBS0000001',
         'obsReviewed': False,
         'obsValid': True,
         'presenceNoted': False,
         'sciName': 'Scientific Name',
         'subID': 'S0000001',
         'subnational1Code': 'SN-01',
         'subnational1Name': 'Region',
         'subnational2Code': 'SN-02',
         'subnational2Name': 'County',
     }, {
         'checklistID': 'CL00002',
         'comName': 'Common Name',
         'countryCode': 'CC',
         'countryName': 'Country',
         'firstName': 'Name',
         'howMany': 1,
         'lastName': 'Surname',
         'lat': 50.000000,
         'lng': -50.000000,
         'locID': 'L0000002',
         'locName': 'Location 2',
         'locationPrivate': True,
         'obsDt': '2013-03-27 10:00',
         'obsID': 'OBS0000002',
         'obsReviewed': False,
         'obsValid': True,
         'presenceNoted': False,
         'sciName': 'Scientific Name',
         'subID': 'S0000002',
         'subnational1Code': 'SN-01',
         'subnational1Name': 'Region',
         'subnational2Code': 'SN-02',
         'subnational2Name': 'County',
     }]
Example #13
def fetch_proxy():
    module_ = __import__('crawler.httpproxy.settings', {}, {}, [''])
    values = {
        u'DOWNLOAD_DELAY': 0,
        u'DOWNLOAD_TIMEOUT': 1,
        u'RETRY_ENABLED': 0
    }

    settings = CrawlerSettings(module_, values=values)
    execute(argv=["scrapy", "crawl", "FiveOneNewHTTPProxySpider"],
            settings=settings)
Example #14
    def __init__(self, spider, results):
        Process.__init__(self)

        self.results = results     
        settings_module = importlib.import_module('Extractors.HTMLScraper.settings')
        settings = CrawlerSettings(settings_module)
        self.crawlerProcess = CrawlerProcess(settings)

        self.items = []
        self.spider = spider
        dispatcher.connect(self._item_passed, signals.item_passed)
Example #15
def get_project_settings():
    if ENVVAR not in os.environ:
        project = os.environ.get("SCRAPY_PROJECT", "default")
        init_env(project)
    settings_module_path = os.environ.get(ENVVAR, "scrapy_settings")
    try:
        settings_module = __import__(settings_module_path, {}, {}, [""])
    except ImportError:
        settings_module = None
    settings = CrawlerSettings(settings_module)

    # XXX: remove this hack
    pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
    settings.overrides = pickle.loads(pickled_settings) if pickled_settings else {}

    # XXX: deprecate and remove this functionality
    for k, v in os.environ.items():
        if k.startswith("SCRAPY_"):
            settings.overrides[k[7:]] = v

    return settings
Example #16
    def test_skip_parsing_webpages(self):
        """Verify no web requests are made if include_html is False."""
        crawler = Crawler(CrawlerSettings(settings))
        crawler.configure()
        spider = ebird_spider.EBirdSpider('REG')
        spider.set_crawler(crawler)
        spider.start_requests()
        spider.include_html = False

        response = response_for_data(self.records)
        results = spider.parse_locations(response)
        self.assertEqual(0, sum(1 for _ in results))
Example #17
def get_project_settings():
    if ENVVAR not in os.environ:
        project = os.environ.get('SCRAPY_PROJECT', 'default')
        init_env(project)
    settings_module_path = os.environ.get(ENVVAR)
    if settings_module_path:
        settings_module = import_module(settings_module_path)
    else:
        settings_module = None
    settings = CrawlerSettings(settings_module)

    # XXX: remove this hack
    pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
    settings.overrides = pickle.loads(pickled_settings) if pickled_settings else {}

    # XXX: deprecate and remove this functionality
    for k, v in os.environ.items():
        if k.startswith('SCRAPY_'):
            settings.overrides[k[7:]] = v

    return settings
Example #18
def get_crawler(settings_dict=None):
    """Return an unconfigured Crawler object. If settings_dict is given, it
    will be used as the settings present in the settings module of the
    CrawlerSettings.
    """
    class SettingsModuleMock(object):
        pass
    settings_module = SettingsModuleMock()
    if settings_dict:
        for k, v in settings_dict.items():
            setattr(settings_module, k, v)
    settings = CrawlerSettings(settings_module)
    return Crawler(settings)
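
A brief sketch of using the helper in a test, with an arbitrary settings key; the attributes of the mocked settings module become visible through crawler.settings:

crawler = get_crawler({'USER_AGENT': 'TestAgent (+http://example.com)'})
assert crawler.settings.get('USER_AGENT') == 'TestAgent (+http://example.com)'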
Example #19
def get_project_settings():
    if ENVVAR not in os.environ:
        project = os.environ.get('SCRAPY_PROJECT', 'default')
        init_env(project)
    settings_module_path = os.environ.get(ENVVAR, 'scrapy_settings')
    try:
        settings_module = __import__(settings_module_path, {}, {}, [''])
    except ImportError:
        settings_module = None
    settings = CrawlerSettings(settings_module)

    # XXX: remove this hack
    pickled_settings = os.environ.get("SCRAPY_PICKLED_SETTINGS_TO_OVERRIDE")
    settings.overrides = pickle.loads(
        pickled_settings) if pickled_settings else {}

    # XXX: deprecate and remove this functionality
    for k, v in os.environ.items():
        if k.startswith('SCRAPY_'):
            settings.overrides[k[7:]] = v

    return settings
Example #20
def crawl_spider(domain, day1, day2):
    spider_dict ={'agoda.com': AgodaSpider, 'ivivu.com': IvivuSpider}
    
    args = {'from_date': datetime.now() + timedelta(days=day1),
            'to_date'  : datetime.now() + timedelta(days=day2)
        }
    
    print "\n crawl spider==========="
 
    spider = spider_dict.get(domain, AgodaSpider)
    spider = spider(args)
        
    settings_module = import_module('scraper.scraper.settings')
    settings = CrawlerSettings(settings_module)
    settings.overrides['SPIDER_MODULES'] = ['scraper.scraper.spiders']
    
#        settings = get_project_settings()
    crawler = Crawler(settings)
    crawler.signals.connect(reactor.stop, signal=signals.spider_closed)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.start()
    reactor.run()
Example #21
def _runCrawler(spider, results):
    settings_module = importlib.import_module(
        'Extractors.HTMLScraper.settings')
    settings = CrawlerSettings(settings_module)
    crawlerProcess = CrawlerProcess(settings)
    items = []

    def _item_passed(item, response, spider):
        items.append(item)

    dispatcher.connect(_item_passed, signals.item_scraped)

    crawler = crawlerProcess.create_crawler("currentCrawler")
    crawler.crawl(spider)
    crawlerProcess.start()
    crawlerProcess.stop()
    results.put(items)
Example #22
    def test(self):
        crawler = mock.MagicMock()
        crawler.settings = CrawlerSettings()
        crawler.settings.overrides['USER_AGENT'] = 'CustomAgent'
        self.assertRaises(NotConfigured, RobotsTxtMiddleware, crawler)
        crawler.settings.overrides['ROBOTSTXT_OBEY'] = True
        crawler.engine.download = mock.MagicMock()
        ROBOTS = re.sub(
            r'(?m)^\s+', '', '''
        User-Agent: *
        Disallow: /admin/
        Disallow: /static/
        ''')
        response = Response('http://site.local/robots.txt', body=ROBOTS)

        def return_response(request, spider):
            deferred = Deferred()
            reactor.callFromThread(deferred.callback, response)
            return deferred

        crawler.engine.download.side_effect = return_response
        middleware = RobotsTxtMiddleware(crawler)
        spider = None  # not actually used
        # There is a bit of neglect in robotstxt.py: robots.txt is fetched asynchronously,
        # and only *after* the first process_request completes, so the first
        # process_request will always succeed. We defer test() because otherwise the
        # robots.txt download mock would be called after the assertRaises failure.
        self.assertIsNone(
            middleware.process_request(Request('http://site.local'),
                                       spider))  # not affected by robots.txt

        def test(r):
            self.assertIsNone(
                middleware.process_request(
                    Request('http://site.local/allowed'), spider))
            self.assertRaises(IgnoreRequest, middleware.process_request,
                              Request('http://site.local/admin/main'), spider)
            self.assertRaises(IgnoreRequest, middleware.process_request,
                              Request('http://site.local/static/'), spider)

        deferred = Deferred()
        deferred.addCallback(test)
        reactor.callFromThread(deferred.callback, None)
        return deferred
Example #23
    def run(self):
        if self.proxies:
            values = configdata.get(const.vpsettings, {})
            values[AppConst.proxies] = self.proxies
            values[const.DOWNLOAD_TIMEOUT] = int(
                values.get(const.DOWNLOAD_TIMEOUT, 5))
            if const.Console in values:
                if values[const.Console] == u'1':  # out to console
                    values[const.LOG_FILE] = None
                else:
                    log_dir = values.get(const.LOG_DIR, os.getcwd())
                    if const.LOG_FILE in values:
                        logfile_prefix = datetime.datetime.now().strftime(
                            "%Y%m%d_%H%M%S_%f")
                        log_file = '%s_%s' % (logfile_prefix,
                                              values[const.LOG_FILE])
                        values[const.LOG_FILE] = os.sep.join(
                            [log_dir, log_file])

            settings = CrawlerSettings(None, values=values)
            execute(argv=["scrapy", "crawl", 'SOSOSpider'], settings=settings)
Example #24
def run_spider(spider, settings):
    """Run a spider with given settings"""
    from scrapy import signals
    from scrapy.xlib.pydispatch import dispatcher
    from scrapy.settings import CrawlerSettings

    def catch_item(sender, item, **kwargs):
        #log.msg("Got:" + str(item))
        pass

    dispatcher.connect(catch_item, signal=signals.item_passed)

    from scrapy.crawler import CrawlerProcess

    settings = CrawlerSettings(values=settings)

    crawler = CrawlerProcess(settings)
    crawler.crawl(spider)

    #log.start(loglevel='DEBUG')

    crawler.start()
Example #25
import json
import shutil
import sys
import tempfile

import nose

from scrapy.settings import CrawlerSettings

from checklists_scrapers import settings
from checklists_scrapers.spiders.worldbirds_spider import WorldBirdsSpider
from checklists_scrapers.tests.utils import RunCrawler
from checklists_scrapers.utils import list_files

from checklists_scrapers.tests.validation import checklists


settings.DOWNLOAD_DIR = tempfile.mkdtemp()
settings.REPORT_RECIPIENTS = ''

username = sys.argv[1]
password = sys.argv[2]
country = sys.argv[3]

spider = WorldBirdsSpider(username=username, password=password, country=country)
RunCrawler(CrawlerSettings(settings)).crawl(spider)

for path in list_files(settings.DOWNLOAD_DIR, 'json'):
    with open(path, 'rb') as fp:
        checklists.append(json.load(fp))

nose.run(argv=['checklists_scrapers.tests.validation'])

shutil.rmtree(settings.DOWNLOAD_DIR)
Example #26
def test_spec_manager():
    """Create a CrawlerSpecManager configured to use test settings"""
    crawler_settings = CrawlerSettings(settings_module=test_settings)
    return SpecManager(crawler_settings)
Example #27
	def __contains__(self, item):
		#print(type(item))
		return True if CrawlerSettings.__getitem__(self, item) else False
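
Presumably this override lives on a CrawlerSettings subclass, so that the in operator reports only settings whose values are truthy; a minimal sketch of that idea (the subclass name is illustrative):

from scrapy.settings import CrawlerSettings

class TruthySettings(CrawlerSettings):
    def __contains__(self, item):
        return bool(CrawlerSettings.__getitem__(self, item))

s = TruthySettings()
s.overrides['LOG_ENABLED'] = False
print('LOG_ENABLED' in s)  # False: the key is set, but its value is falsy
print('BOT_NAME' in s)     # True: Scrapy's default 'scrapybot' is truthy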
Example #28
class HeadLineAlbumExtractor(object):
  _instance = None
  _instance_lock = threading.Lock()
  @staticmethod
  def get_instance(
      start_url_loader,
      setting_module_path = 'le_crawler.common.headline_album_settings',
      *kargs,
      **kwargs):
    HeadLineAlbumExtractor._instance_lock.acquire()
    if not HeadLineAlbumExtractor._instance:
      loger = Log('album_crawler', '../log/album_crawler.log')
      HeadLineAlbumExtractor._instance = \
      HeadLineAlbumExtractor(start_url_loader,
          loger,
          setting_module_path,
          *kargs,
          **kwargs)
    HeadLineAlbumExtractor._instance_lock.release()
    return HeadLineAlbumExtractor._instance

  def __init__(self,
      start_url_loader,
      loger,
      setting_module_path = 'le_crawler.common.headline_album_settings',
      *kargs,
      **kwargs):
    __import__(setting_module_path)
    self.__settings = CrawlerSettings(settings_module
        = sys.modules[setting_module_path])
    self.loger = loger
    # {album_id, {}}
    self.album_ids = {}
    self.__init_regs()
    self.__extend_map_handler = kwargs['extend_map_handler'] if \
    kwargs.has_key('extend_map_handler') else None
    if kwargs.has_key('extract_setting'):
      self.__extend_map_handler = \
      ExtendMapHandler.get_instance(start_url_loader,
          kwargs['extract_setting'])
    else:
      self.__extend_map_handler =\
          ExtendMapHandler.get_instance(start_url_loader)
    self.__url_normalize = UrlNormalize.get_instance()
    self.album_infos = {}
    self.url_filter = UrlFilter().get_instance()

  def get_category_id(self, refer_url):
    ca = self.__get_category_name(refer_url)
    if 'joke' == ca:
      return 109
    elif 'ent' == ca:
      return 104
    else:
      return -1

  def __get_category_name(self, refer_url):
    return self.__extend_map_handler.settings.get_category_name(refer_url)\
        if self.__extend_map_handler  else 'UNKONWN_CATEGORY'

  def __init_regs(self):
    # load glob id url reg
    self.local_id_reg = []
    for r in self.__settings.getlist('LOCAL_ID_REG', []):
      self.local_id_reg.append(re.compile(r, re.I | re.S))
    # load album id url
    self.album_id_match_regs = {}
    for k, v in self.__settings.getdict('ALBUM_ID_URL', {}).items():
      for r in v:
        self.album_id_match_regs.setdefault(k, []).append(re.compile(r, re.I |
          re.S))
    # load extend url dict
    self.extend_album_pages = self.__settings.getdict('ALBUM_PAGE_URL', {})
    # load global album id reg
    self.global_albumid_reg = self.__settings.getdict('GLOBAL_ALBUMID_REG', {})

    # href tags
    self.href_tags = self.__settings.getlist('HREF_TAGs', [])
    self.loger.log.info('load href url tags: %s' % len(self.href_tags))

  def __get_global_albumid(self, localid, site_album_id):
    url = self.global_albumid_reg[localid].replace('(*albumid*)', site_album_id)
    from le_crawler.core.docid_generator import gen_docid
    return gen_docid(url)

  def __get_localid(self, url):
    return query_domain_from_url(url) or self.url_filter.get_domain_from_url(url)
  # deprecated: using above
  #def __get_localid(self, url):
  #  for r in self.local_id_reg:
  #    sg = r.search(url)
  #    if not sg:
  #      continue
  #    g = sg.groups()
  #    if g:
  #      return g[0]
  #  return None

  def __parser_urls(self, sels):
    returls = []
    for urls in sels.xpath('//a'):
      for attr in self.href_tags:
        u = urls.xpath('./@%s' % attr)
        if u:
          returls.append(u.extract()[0].encode('utf8'))
    return returls

  # return [idlist]
  def __get_site_album_id(self, localid, urls):
    retlist = set()
    for u in urls:
      for r in self.album_id_match_regs[localid]:
        sg = r.search(u)
        if sg:
          g = sg.groups()
          if g:
            retlist.add(g[0])
            break
    return list(retlist)

  # return album video urls
  def __get_album_pags(self, localid, idlist, refer_url):
    returls = []
    category = self.__get_category_name(refer_url)
    postfix = ' %s|channel' % category
    if self.extend_album_pages.has_key(localid):
      for pageurl in self.extend_album_pages[localid]:
        for id in idlist:
          glid = self.__get_global_albumid(localid, id)
          strtmp = pageurl.replace('(*albumid*)', id).replace('(*pagenum*)',
              '(*)') + postfix
          from le_crawler.base.url_extend import extend_url
          sta, extedurls = extend_url(strtmp, '1', '8', 0)
          if not sta:
            continue
          for eu in extedurls:
            # preprocess url
            self.album_ids[eu.split(' ')[0]] = glid
          returls.extend(extedurls)
    return returls

  def get_global_albumid_by_refer(self, refer_url):
    if self.album_ids.has_key(refer_url):
      return self.album_ids[refer_url]
    else:
      print 'Error: can not found global id:', refer_url

  # return description
  # dict = {'enter_page': [], 'album_pages':[], 'album_infos_pages' : []}
  def parser_enter(self, url, pages):
    localid = self.__get_localid(url)
    if not localid:
      return []
    from scrapy.selector import Selector
    sel = Selector(text = pages, type = 'html')
    if not sel:
      return []
    urls = self.__parser_urls(sel)
    albumids = self.__get_site_album_id(localid, urls)
    albumurls = self.__get_album_pags(localid, albumids, url)
    # hock start urls to extend_map_handler
    self.__extend_map_handler.settings.add_start_urls(albumurls)
    return albumurls

  # input enter page
  # return album info pages
  def parser_album_info_pages(self, body, url, refer_url):
    sta = False
    albumid = self.get_global_albumid_by_refer(url)
    if not self.album_infos.has_key(albumid):
      sta, items = self.__extend_map_handler.settings.extract_custom_map(
          body = body,
          pageurl = url)
      if not sta:
        return []
      cateid = self.get_category_id(refer_url)
      self.album_infos[albumid] = items
      self.album_infos[albumid]['album_cid'] = cateid
      self.album_infos[albumid]['album_id'] = albumid
      self.album_infos[albumid]['album_url'] = \
        self.__url_normalize.get_unique_url(url)
    # second extract urls
    status, extend_url = self.__extend_map_handler.extract_extend_map(body = body,
        pageurl = url, ignore_empty_property = True)
    if status:
      ldict = self.__extend_map_handler.get_inlink_location_dict()
      if not ldict.has_key(extend_url[0]):
        self.loger.log.error('Failed found inlink location for %s' %
            extend_url[0])
        assert False, 'Failed found inlink location, %s' %  extend_url[0]
      else:
        locationstr = ldict[extend_url[0]]
      self.album_infos[albumid].setdefault('album_vids', {})[locationstr] = \
            [self._get_store_key(i) for i in extend_url]
      video_url = extend_url[0] if extend_url else None
      album_pic = self.__extend_map_handler.lookup_extend_map(video_url, type
          = 'dict')['cover']\
          if video_url and \
          self.__extend_map_handler.lookup_extend_map(video_url, type = 'dict')\
              and self.__extend_map_handler.lookup_extend_map(video_url, type
                  = 'dict').has_key('cover') else None
      if album_pic:
        self.album_infos[albumid]['album_pic'] = album_pic

    return extend_url
  #Note: this docid should same as today_tv_writer
  def _get_store_key(self, url):
    return md5.new(url).hexdigest()

  def parser_enter_page(self, url, sels):
    glbid = self.get_global_albumid_by_refer(url)
    if not glbid:
      return

  def parser_ablum_pages(self, sels):
    pass

  def ignore_crawl_link(self, url):
    return self.__extend_map_handler.settings.ignore_link_to_crawler(url)

  def get_album_info(self, albumid):
    return self.album_infos[albumid] \
        if self.album_infos.has_key(albumid) else {}

  def get_album_infos(self):
    return self.album_infos

  def debug_album_infos(self):
    for k, v in self.album_infos.items():
      print '-------', 'albuminfo', '-------'
      for k1, v1 in v.items():
        print k1, v1
Example #29
# encoding=UTF-8
'''
Created on 2013-4-20
@author: Administrator
'''
from robot.configutil import ConfigFile
from scrapy.cmdline import execute
from scrapy.settings import CrawlerSettings

if __name__ == '__main__':

    cfg_file = r'fetchqkyj.cfg'
    configdata = ConfigFile.readconfig(cfg_file).data
    import_modules = __import__('fost.zxy.qkyj.settings',
                                globals={},
                                locals={},
                                fromlist=[
                                    '',
                                ])
    values = dict(configdata)
    settings = CrawlerSettings(import_modules, values=values)
    execute([
        'scrapy',
        'crawl',
        'ZXY_QKYJ_Spider',
    ], settings=settings)
Example #30
def test_projects_resource(temp_projects_dir):
    """Create a ProjectsResource configured to use test settings"""
    crawler_settings = CrawlerSettings(settings_module=test_settings)
    projects = ProjectsResource(crawler_settings)
    projects.projectsdir = temp_projects_dir
    return projects
Example #31
from twisted.internet import reactor
from scrapy.crawler import Crawler
from scrapy.settings import CrawlerSettings
from scrapy import log, signals
from scrapy.xlib.pydispatch import dispatcher
from finance.spiders.financeSpider import FinanceSpider
import sys


def stop_reactor():
    reactor.stop()


dispatcher.connect(stop_reactor, signal=signals.spider_closed)
spider = FinanceSpider(symbol=sys.argv[1])
crawler = Crawler(CrawlerSettings())
crawler.configure()
crawler.crawl(spider)
crawler.start()
log.start(loglevel=log.ERROR)
log.msg('Running reactor...')
reactor.run()  # the script will block here
log.msg('Reactor stopped.')

print spider.risultato
Example #32
    def setUp(self):
        """Initialize the test."""
        crawler = Crawler(CrawlerSettings(settings))
        crawler.configure()
        self.spider = ebird_spider.EBirdSpider('REG')
        self.spider.set_crawler(crawler)
        self.lista = {
            'meta': {
                'version': 1,
                'language': 'en',
            },
            'identifier':
            'S0000001',
            'date':
            '2013-03-27',
            'protocol': {
                'time': '09:00',
            },
            'observers': {
                'names': ['Name Surname'],
                'count': 1,
            },
            'source': {
                'name': 'eBird',
                'submitted_by': 'Name Surname',
            },
            'location': {
                'identifier': 'L0000001',
                'name': 'Location 1',
                'county': 'County',
                'region': 'Region',
                'country': 'Country',
                'lat': 45.0,
                'lon': -45.0,
            },
            'entries': [{
                'identifier': 'OBS0000001',
                'species': {
                    'name': 'Common Name',
                },
                'count': 23
            }]
        }
        self.listb = {
            'meta': {
                'version': 1,
                'language': 'en',
            },
            'source':
            'ebird',
            'url':
            'http://ebird.org/',
            'observers': {
                'names': ['Other Name'],
                'count': 1,
            },
            'activity':
            'Birding',
            'protocol': {
                'name': 'Traveling',
                'duration_hours': 2,
                'duration_minutes': 35,
                'distance': 2000,
                'area': 0,
            },
            'comment':
            'A comment',
            'entries': [{
                'species': {
                    'name': 'Common Name',
                },
                'count':
                23,
                'details': [{
                    'age': 'AD',
                    'sex': 'M',
                    'count': 9
                }, {
                    'age': 'AD',
                    'sex': 'F',
                    'count': 6
                }, {
                    'age': 'JUV',
                    'sex': 'X',
                    'count': 8
                }]
            }]
        }

        self.fixture = self.spider.merge_checklists(self.lista, self.listb)
Example #33
 def __init__(self, module_path = 'le_crawler.base.url_normalize_settings'):
   module_path = module_path
   __import__(module_path)
   self.__settings = CrawlerSettings(settings_module = sys.modules[module_path])
   self.__load_settings()
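
The same wrapping pattern can be sketched with a throwaway module object; the module name and the LOCAL_ID_REG value below are purely illustrative:

import types
from scrapy.settings import CrawlerSettings

fake_settings = types.ModuleType('fake_url_normalize_settings')
fake_settings.LOCAL_ID_REG = [r'^https?://example\.com/\d+']

settings = CrawlerSettings(settings_module=fake_settings)
print(settings.getlist('LOCAL_ID_REG', []))  # the list defined on the module
print(settings.get('MISSING_KEY', 'none'))   # falls back to the supplied default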
Example #34
class UrlNormalize():
  #def init_onece(self, *args, **kwargs):
  def __init__(self, module_path = 'le_crawler.base.url_normalize_settings'):
    module_path = module_path
    __import__(module_path)
    self.__settings = CrawlerSettings(settings_module = sys.modules[module_path])
    self.__load_settings()

  def __convert_map_regs(self, regmap):
    if not regmap:
      return {}
    tmpres = {}
    for (id, reglist) in regmap.items():
      regstmp = []
      for r in reglist:
        #print 're....%s' % r
        regstmp.append(re.compile(r, re.IGNORECASE))
      if regstmp:
        tmpres[id] = regstmp
    return tmpres

  def __accept_reg(self, id_reglist, item):
    if not item:
      return None
    for (id, tmpres) in id_reglist.items():
      for r in tmpres:
        if r.search(item):
          return id
    return None

  def __get_keep_para_list(self, mapdict, id):
    if mapdict.has_key(id):
      return mapdict[id]
    return []

  def __get_keep_query_lst(self, id):
    return self.__get_keep_para_list(self.__keep_query, id)

  def __keep_fragment(self, id):
    if not self.__keep_fragments or not self.__keep_fragments.has_key(id):
      return False
    return self.__keep_fragments[id]
  def __update_paras_with_extra(self, input_dict, id):
    if not self.__extra_para.has_key(id):
      return input_dict
    input_dict.update(self.__extra_para[id])

  def get_mapping_id(self, url = None, domain = None):
    # first mapping domain
    if self.__id_mapping_domain.has_key(domain):
      return self.__id_mapping_domain[domain]
    # second try match reg
    return self.__accept_reg(self.__id_mapping_reg, url)

  def __set_query_dict(self, org_dict, id):
    if org_dict is None or id is None:
      return {}
    domain_k_p = self.__get_keep_query_lst(id)
    retdict = {}
    for (k,ef) in domain_k_p:
      if org_dict.has_key(k) and org_dict[k] != '':
        retdict[k] = org_dict[k]
      elif ef:
        retdict[k] = ''
    return retdict

  def __join_query(self, inputd):
    if not inputd:
      return ''
    query_str = None
    reslist = sorted(inputd.items(), key = lambda d: d[0], reverse = True)
    for (k, v) in reslist:
      if query_str:
        query_str += '&%s=%s' % (k, v[0])
      else:
        query_str = '%s=%s' % (k, v[0])
    return query_str

  def __load_settings(self):
    self.__id_mapping_reg = self.__convert_map_regs(self.__settings.getdict('ID_MAPPING_REG', {}))
    self.__id_mapping_domain = self.__convert_map_regs(self.__settings.getdict('ID_MAPPING_DOMAIN', {}))
    self.__keep_query = self.__settings.getdict('KEEP_QUERY', {})
    self.__keep_fragments = self.__settings.getdict('KEEP_FRAGEMENT', {})
    self.__extra_para = self.__settings.getdict('ADD_EXTRA_PARA', {})

  def get_unique_url(self,
      url,
      scheme = None,
      netloc = None,
      domain = None,
      no_conf_no_oper = False):
    id = self.get_mapping_id(url = url, domain = domain)
    if id is None:
      if not no_conf_no_oper:
        id = 'DEFAULT'
      else:
        return url
    if id is None or url is None:
      raise Exception('Failed get mapping id for: %s, %s' % (domain, url))
    urlp = urlparse.urlsplit(url.strip(), allow_fragments = self.__keep_fragment(id))
    if not urlp:
      raise Exception('Failed convert urlparse %s' % url)
    nscheme = urlp.scheme or scheme
    nnetloc = urlp.netloc or netloc
    qdict = urlparse.parse_qs(urlp.query)
    fqdict = self.__set_query_dict(qdict, id)
    self.__update_paras_with_extra(fqdict, id)
    nquery = self.__join_query(fqdict)
    return urlparse.urlunsplit((nscheme, nnetloc, urlp.path, nquery,
      urlp.fragment)).strip()
Example #35
	def __contains__(self, item):
		return bool(CrawlerSettings.__getitem__(self, item))
Example #36
 def _get_settings(settings_dict=None):
     settings_module = type('SettingsModuleMock', (object, ),
                            settings_dict or {})
     return CrawlerSettings(settings_module)
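
A quick sketch of calling the helper (shown here as a plain function call); the dict keys become attributes of the generated mock module and are readable through the usual getters:

settings = _get_settings({'ROBOTSTXT_OBEY': True, 'CONCURRENT_REQUESTS': 8})
print(settings.getbool('ROBOTSTXT_OBEY'))      # True
print(settings.getint('CONCURRENT_REQUESTS'))  # 8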
Example #37
def fetch51freeproxy():
    values = configdata.get(const.vpsettings, {})
    settings = CrawlerSettings(values=values)
    execute(argv=["scrapy", "crawl", "FOSpider"], settings=settings)