def valid_proxy():
    module_ = __import__('crawler.httpproxy.settings', {}, {}, [''])
    values = {u'RETRY_ENABLED': 0,
              u'DOWNLOAD_TIMEOUT': 1,
              }
    settings = CrawlerSettings(module_, values=values)
    execute(argv=["scrapy", "crawl", "BaiDuHomePageSpider"], settings=settings)

def main2(i, q):
    while True:
        pth = q.get()
        logging.debug(pth)
        cmdline.execute(['scrapy', 'runspider', 'code6s_allinone2.py', '-a', 'pth=' + pth])
        time.sleep(2)

def main():
    with project_environment():
        from scrapy.cmdline import execute
        execute()

def run(self):
    feconfig = self.configdata[const.FE_CONFIG]
    try:
        #=======================================================================
        # use the city-specific config if it exists; otherwise fall back to the defaults
        #=======================================================================
        city_config = eval(feconfig[self.city_name])
    except Exception:
        city_config = {}
    start_page = city_config.get(const.START_PAGE, feconfig[const.DEFAULT_START_PAGE])
    end_page = city_config.get(const.END_PAGE, feconfig[const.DEFAULT_END_PAGE])
    values = {
        const.CONFIG_DATA: self.configdata,
        const.START_PAGE: int(start_page),
        const.END_PAGE: int(end_page),
    }
    settings = u'crawler.shc.fe.settings'
    module_import = __import__(settings, {}, {}, [''])
    settings = CrawlerSettings(module_import, values=values)
    execute(argv=["scrapy", "crawl", 'SHCSpider'], settings=settings)

def run(self):
    values = configdata.get(DetailSpiderConst.DetailStatusSettings, {})
    values[const.DETAIL_LIST] = self.cis
    values.update(**{
        const.CONFIG_DATA: self.configdata,
    })
    if ScrapyConst.Console in values:
        if values[ScrapyConst.Console] == u'1':  # log to console
            values[ScrapyConst.LOG_FILE] = None
        else:
            log_dir = values.get(ScrapyConst.LOG_DIR, os.getcwd())
            if not os.path.exists(log_dir):
                os.makedirs(log_dir)
            if ScrapyConst.LOG_FILE in values:
                log_file = values[ScrapyConst.LOG_FILE]
                values[ScrapyConst.LOG_FILE] = os.sep.join([log_dir, log_file])
    settings_path = u'crawler.shc.fe.settings'
    module_import = __import__(settings_path, {}, {}, [''])
    settings = CrawlerSettings(module_import, values=values)
    execute(argv=["scrapy", "crawl", 'CarStatusSpider'], settings=settings)

def work():
    #projectLists = ['GLYD', 'YBJR']
    #cmdline.execute("scrapy crawl mininova".split())
    #for i in xrange(len(projectLists)):
    cmdline.execute("scrapy crawl GLYD".split())
    print "hello world."

def run(self):
    try:
        settings = CrawlerSettings(__import__(self.dp.dfcfg.settingspath, {}, {}, ['']))
        execute(argv=["scrapy", "crawl", self.dp.dfcfg.spidername], settings=settings)
    except Exception as e:
        raise e

def main():
    ##todo
    ##generate range of dates
    #parse date argument
    #when calling tsn, these are dates to specify range in current season
    #when calling nhl, these are season start year and season end year
    parser = argparse.ArgumentParser()
    parser.add_argument("startDate", help="input a start date: mm/dd/yr")
    parser.add_argument("endDate", help="input an end date: mm/dd/yr")
    args = parser.parse_args()
    print args
    #first delete boxscoreAddressList from previous executions
    try:
        os.remove("gameCrawlerItems.json")
    except OSError:
        pass
    ##execute call to scrapy
    ##with date and output file as parameters
    ##writes to JSON object boxscoreAddressList
    if re.match(r'(\d)(\d)(\d)(\d) (\d)(\d)(\d)(\d)', args.startDate + ' ' + args.endDate):
        print "season specified - scraping nhl"
        execString = ("scrapy crawl nhl -a seasonStart=%s -a seasonEnd=%s " % (args.startDate, args.endDate))
    else:
        print "date range specified - scraping tsn"
        execString = ("scrapy crawl tsn -a startDate=%s -a endDate=%s " % (args.startDate, args.endDate))
    cmdline.execute(execString.split())

def fetch_proxy():
    module_ = __import__('crawler.httpproxy.settings', {}, {}, [''])
    values = {u'DOWNLOAD_DELAY': 0,
              u'DOWNLOAD_TIMEOUT': 1,
              u'RETRY_ENABLED': 0
              }
    settings = CrawlerSettings(module_, values=values)
    execute(argv=["scrapy", "crawl", "FiveOneNewHTTPProxySpider"], settings=settings)

def main():
    # args: city, district
    args = sys.argv[1:]
    args_str = "scrapy crawl dbhouse"
    level = ["city", "area"]
    for i in range(len(args)):
        args_str = args_str + " -a " + level[i] + "=" + args[i]
    print(args_str)
    execute(args_str.split())

def run(self):
    try:
        values = self.build_values()
    except Exception as e:
        raise e
    settings = u'crawler.shc.fe.settings'
    module_import = __import__(settings, {}, {}, [''])
    settings = CrawlerSettings(module_import, values=values)
    execute(argv=["scrapy", "crawl", 'SHCSpider'], settings=settings)

def test():
    try:
        os.unlink("scraped/forums.db")
    except OSError:
        pass
    #cmd = 'scrapy crawl -L INFO -a config=phpbb3 -a url=http://www.raspberrypi.org/phpBB3 generic'
    #cmd = 'scrapy crawl -L INFO -a config=phpbb3.0.x generic -s JOBDIR=crawls/generic-1'
    #cmd = 'scrapy crawl -L INFO -a config=phpbb3.0.x generic'
    #cmd = 'scrapy crawl -L DEBUG -a config=phpbb3.0.x.test generic'
    cmd = 'scrapy crawl -L INFO -a config=qnap generic'
    execute(cmd.split())

def main2(i, q):
    while True:
        pth = q.get()
        logging.debug(pth)
        #code6_allinone.main(pth)
        #os.getenv("export SCRAPY_SETTINGS_MODULE=allinone.settings")
        #sys.path.append("/home/desktop/flipkart/allinone")
        #output = subprocess.check_output(['scrapy', 'runspider', 'code6s_allinone.py', '-a', 'pth='+pth])
        #os.environ.get("export PYTHONPATH=/home/desktop/flipkart/allinone/")
        cmdline.execute(['scrapy', 'runspider', 'code6s_allinone.py', '-a', 'pth='+pth])
        time.sleep(2)

def main():
    args = get_args()
    url = args.url
    user = args.login
    password = args.password
    try:
        execute(['scrapy', 'crawl', 'quickscan_spider',
                 '-a', 'url=%s' % url,
                 '-a', 'user=%s' % user,
                 '-a', 'pw=%s' % password,
                 '-s', 'CONCURRENT_REQUESTS=%s' % args.connections,
                 '-a', 'basic=%s' % args.basic,
                 '-a', 'fast=%s' % args.fast])
    except KeyboardInterrupt:
        sys.exit()

def main():
    args = get_args()
    rate = args.ratelimit
    if rate not in [None, '0']:
        # convert a requests-per-minute limit into a per-request delay in seconds
        rate = str(60 / float(rate))
    try:
        execute(['scrapy', 'crawl', 'spider',
                 '-a', 'url=%s' % args.url,
                 '-s', 'CONCURRENT_REQUESTS=%s' % args.connections,
                 '-s', 'DOWNLOAD_DELAY=%s' % rate])
    except KeyboardInterrupt:
        sys.exit()

def main():
    args = get_args()
    rate = args.ratelimit
    if rate not in [None, '0']:
        rate = str(60 / float(rate))
    try:
        execute(['scrapy', 'crawl', 'xsscrapy',
                 '-a', 'url=%s' % args.url,
                 '-a', 'user=%s' % args.login,
                 '-a', 'pw=%s' % args.password,
                 '-a', 'basic=%s' % args.basic,
                 '-a', 'hostlimit=%s' % args.hostlimit,
                 '-s', 'CONCURRENT_REQUESTS=%s' % args.connections,
                 '-s', 'DOWNLOAD_DELAY=%s' % rate])
    except KeyboardInterrupt:
        sys.exit()

def main(argv):
    # error messages are in Portuguese: '-s argument is required' /
    # 'Error! Use the command: main.py -s <spider name>'
    try:
        opts, args = getopt.getopt(argv, 's:')
        if len(opts) == 0:
            raise getopt.GetoptError(u'Argumento "-s" obrigatorio')
        for opt, arg in opts:
            if opt != '-s' or arg == '':
                raise getopt.GetoptError(u'Argumento "-s" obrigatorio')
            cmdline.execute(('scrapy crawl ' + arg).split())
            break
    except getopt.GetoptError:
        print('Erro!\nUtilize o comando: main.py -s <nome do spider>')
        sys.exit(2)

def main():
    #try:
    #    os.unlink("scraped/forums.db")
    #except OSError:
    #    pass
    #config = Config(file('example.cfg'))
    #print config
    # Start scrapy with some arguments!
    #try:
    cmd = 'scrapy crawl -L INFO -a config=qnap generic -s JOBDIR=crawls/generic-1'
    execute(cmd.split())

def handle(self, *args, **options):
    default_args = ['scrapy']
    argc = len(self._argv)
    if argc >= 4:
        default_args.extend(['crawl', 'walker'])
        default_args.extend(['-a', self._argv[2]])
        default_args.extend(['-a', self._argv[3]])
    else:
        self.stdout.write(self.help)
        return
    execute(default_args)
    self.stdout.write(str(args))
    self.stdout.write(str(options))

def main3(i, q):
    for pth in iter(q.get, None):
        try:
            cmdline.execute(['scrapy', 'runspider', 'page5_second_scrapy_amazon.py', '-a', 'pth=%s' % (pth)])
            print pth
        except:
            # bare except also swallows the SystemExit raised by cmdline.execute()
            pass
        logging.debug(pth)
        time.sleep(i + 2)
        q.task_done()
    q.task_done()

def handle(self, *args, **options):
    scrapydir = get_scrapyroot()
    chdir(scrapydir)
    default_args = ['scrapy', 'crawl']
    if len(self._argv) == 3:
        default_args.append(self._argv[2])
    else:
        self.stdout.write(self.help)
        return
    try:
        execute(default_args)
    except KeyError:
        self.stdout.write('iyo spider haionekani')  # Swahili: "that spider was not found"

def main3(i, q):
    for pth in iter(q.get, None):
        try:
            cmdline.execute(["scrapy", "runspider", "page3_second_scrapy_homeshop18.py", "-a", "pth=%s" % (pth)])
            print pth
        except:
            pass
        logging.debug(pth)
        time.sleep(i + 2)
        q.task_done()
    q.task_done()

def fetch51anonymousfreeproxy():
    values = configdata.get(FetchProxySpiderConst.FetchFOAnonymousProxySettings, {})
    values[ScrapyConst.DOWNLOAD_TIMEOUT] = int(values.get(ScrapyConst.DOWNLOAD_TIMEOUT, 0))
    if ScrapyConst.Console in values:
        if values[ScrapyConst.Console] == u'1':  # log to console
            values[ScrapyConst.LOG_FILE] = None
        else:
            log_dir = values.get(ScrapyConst.LOG_DIR, os.getcwd())
            if ScrapyConst.LOG_FILE in values:
                log_file = values[ScrapyConst.LOG_FILE]
                values[ScrapyConst.LOG_FILE] = os.sep.join([log_dir, log_file])
    settings = CrawlerSettings(None, values=values)
    execute(argv=["scrapy", "crawl", "FOAnonymousSpider"], settings=settings)

def handle(self, *args, **options):
    if (not len(args) == 1) or (args[0] == u"help"):
        self.stdout.write(u"Usage: {0}\n".format(self.args))
        self.stdout.write(self.help)
    else:
        # Take a filename from command line to crawl
        default = [u""]
        default.append(u"crawl")
        default.append(u"all")
        default.append(u"-s")
        default.append(u"URLS=" + unicode(args[0]))
        from scrapy.cmdline import execute
        execute(default)

def main():
    args = get_args()
    rate = args.ratelimit
    if rate not in [None, '0']:
        rate = str(60 / float(rate))
    try:
        cookie_key = args.cookie.split('=', 1)[0] if args.cookie else None
        cookie_value = ''.join(args.cookie.split('=', 1)[1:]) if args.cookie else None
        execute(['scrapy', 'crawl', 'xsscrapy',
                 '-a', 'url=%s' % args.url,
                 '-a', 'user=%s' % args.login,
                 '-a', 'pw=%s' % args.password,
                 '-a', 'basic=%s' % args.basic,
                 '-a', 'cookie_key=%s' % cookie_key,
                 '-a', 'cookie_value=%s' % cookie_value,
                 '-s', 'CONCURRENT_REQUESTS=%s' % args.connections,
                 '-s', 'DOWNLOAD_DELAY=%s' % rate])
    except KeyboardInterrupt:
        sys.exit()

def main():
    # os.environ.get("export SCRAPY_SETTINGS_MODULE=allinone.settings")
    # sys.path.append("/home/desktop/flipkart/allinone/")
    # os.environ.get("export PYTHONPATH=/home/desktop/flipkart/allinone/")
    # output = subprocess.check_output(['scrapy', 'crawl', 'collect_link_and_extract', '-a', 'pth=dirthree08022014/women/womens-footwear/womens-footwear-xx-sports-shoes-xx-bnbcbl/ZEMgear.csv'])
    # time.sleep(2)
    # q.task_done()
    cmdline.execute([
        "scrapy",
        "crawl",
        "collect_link_and_extract",
        "-a",
        "pth=dirthree08022014/women/womens-footwear/womens-footwear-xx-sports-shoes-xx-bnbcbl/ZEMgear.csv",
    ])

def run(self):
    feconfig = self.configdata[const.FE_CONFIG]
    try:
        #=======================================================================
        # use the city-specific config if it exists; otherwise fall back to the defaults
        #=======================================================================
        city_config = eval(feconfig[self.city_name])
    except Exception:
        city_config = {}
    start_page = city_config.get(const.START_PAGE, feconfig[const.DEFAULT_START_PAGE])
    end_page = city_config.get(const.END_PAGE, feconfig[const.DEFAULT_END_PAGE])
    # values = {
    #     const.CONFIG_DATA: self.configdata,
    #     const.START_PAGE: int(start_page),
    #     const.END_PAGE: int(end_page),
    # }
    # settings = u'crawler.shc.fe.settings'
    # module_import = __import__(settings, {}, {}, [''])
    # settings = CrawlerSettings(module_import, values=values)
    # execute(argv=["scrapy", "crawl", 'SHCSpider'], settings=settings)
    values = configdata.get(ListSpiderConst.ListSettings, {})
    values.update(**{
        const.CONFIG_DATA: self.configdata,
        const.START_PAGE: int(start_page),
        const.END_PAGE: int(end_page),
    })
    if ScrapyConst.Console in values:
        if values[ScrapyConst.Console] == u'1':  # log to console
            values[ScrapyConst.LOG_FILE] = None
        else:
            log_dir = values.get(ScrapyConst.LOG_DIR, os.getcwd())
            if ScrapyConst.LOG_FILE in values:
                log_file = values[ScrapyConst.LOG_FILE]
                values[ScrapyConst.LOG_FILE] = os.sep.join([log_dir, log_file])
    settings_path = u'crawler.shc.fe.settings'
    module_import = __import__(settings_path, {}, {}, [''])
    settings = CrawlerSettings(module_import, values=values)
    execute(argv=["scrapy", "crawl", 'SHCSpider'], settings=settings)

def main():
    args = get_args()
    url = args.url
    user = args.login
    password = args.password
    if args.basic:
        basic = 'true'
    else:
        basic = 'false'
    if args.connections:
        conns = args.connections
    try:
        execute(['scrapy', 'crawl', 'xsscrapy',
                 '-a', 'url=%s' % url,
                 '-a', 'user=%s' % user,
                 '-a', 'pw=%s' % password,
                 '-s', 'CONCURRENT_REQUESTS=%s' % conns,
                 '-a', 'basic=%s' % basic])
    except KeyboardInterrupt:
        sys.exit()

def run(self):
    values = configdata.get(const.vpsettings, {})
    values[AppConst.proxies] = self.proxies
    values[const.DOWNLOAD_TIMEOUT] = int(values.get(const.DOWNLOAD_TIMEOUT, 5))
    if const.Console in values:
        if values[const.Console] == u'1':  # log to console
            values[const.LOG_FILE] = None
        else:
            log_dir = values.get(const.LOG_DIR, os.getcwd())
            if const.LOG_FILE in values:
                logfile_prefix = datetime.datetime.now().strftime("%Y%m%d_%H%M%S_%f")
                log_file = '%s_%s' % (logfile_prefix, values[const.LOG_FILE])
                values[const.LOG_FILE] = os.sep.join([log_dir, log_file])
    values[const.RETRY_TIMES] = len(valid_urls)
    settings = u'vp.settings'
    module_import = __import__(settings, {}, {}, [''])
    settings = CrawlerSettings(module_import, values=values)
    execute(argv=["scrapy", "crawl", 'SOSOSpider'], settings=settings)

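# The CrawlerSettings(...) / execute(argv=..., settings=...) pattern used in the
# snippets above comes from old Scrapy releases; CrawlerSettings is no longer
# available in current versions. A minimal sketch of the same idea on a modern
# Scrapy (run one spider from a script with per-run setting overrides), assuming
# a regular Scrapy project; the spider name is reused from the snippets above and
# the override values are illustrative, not taken from those projects:
from scrapy.crawler import CrawlerProcess
from scrapy.utils.project import get_project_settings


def run_spider_with_overrides():
    settings = get_project_settings()      # load the project's settings module
    settings.set('RETRY_ENABLED', False)   # per-run overrides (illustrative values)
    settings.set('DOWNLOAD_TIMEOUT', 1)
    process = CrawlerProcess(settings)
    process.crawl('SHCSpider')             # spider looked up by name in the project
    process.start()                        # blocks until the crawl finishes
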
def parse_course(url):
    if os.path.exists('video_url.json'):
        os.remove('video_url.json')
    try:
        execute([
            'scrapy', 'crawl', 'video',
            '-o', 'video_url.json',
            '-a', 'url={}'.format(url)
        ])
    except SystemExit:
        # execute() calls sys.exit() when the crawl ends; catch it so the script continues
        print(u'Crawled course video urls.')
    with open('video_url.json') as fp:
        rv = json.loads(fp.read())
    rv.sort()
    count = len(rv)
    print(u'Start downloading... total %d items' % count)
    # processes = 2
    pool = multiprocessing.Pool(2)
    for i in range(count):
        pool.apply_async(download_file, args=(rv[i].get('url'), rv[i].get('title')))
    pool.close()
    pool.join()

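# parse_course() above hands each (url, title) pair to a download_file helper that
# is not shown in this snippet. A hypothetical sketch of such a helper (not the
# original implementation; the file-naming scheme and extension are assumptions):
import os
import urllib.request


def download_file(url, title):
    # Hypothetical: save one video to disk, named after its title.
    filename = '%s.mp4' % title               # extension is an assumption
    if os.path.exists(filename):
        return filename                       # skip files from a previous run
    urllib.request.urlretrieve(url, filename)
    return filename
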
    pass

    def parse(self, response):
        filename = response.url.split("/")[-2]
        logger.info("filename is {0}".format(filename))
        for book in response.xpath("//li[@class='subject-item']"):
            content = ComonItemLoader(BookItem(), selector=book)
            content.add_css("book_name", "div.info h2 a::attr(title)")
            content.add_css("auth_info", "div.info div.pub::text")
            content.add_css("point", "div.star.clearfix span.rating_nums::text")
            content.add_css("person_num", "div.star.clearfix span.pl::text")
            content.add_css("resume", "div.info p::text")
            content.add_css("book_detail_url", "div.info h2 a::attr(href)")
            ci2 = content.load_item()
            # logger.info("!!!!!!!{0}".format(ci2))
            yield ci2
        # next_page = response.css("div.paginator span.next a::attr(href)").extract_first()
        # if next_page and self.page_count < 3:
        #     yield scrapy.Request(self.ORIGIN_URL + next_page, callback=self.parse)
        #     self.page_count += 1
        # with open("Resources/report/" + filename, 'wb') as f:
        #     f.write(response.body)


if __name__ == '__main__':
    # ds = DoubanSpider()
    # ds.parse()
    cmdline.execute("scrapy crawl douban".split())

# -*- coding: utf-8 -*-
from scrapy.cmdline import execute
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "btdidi"])


# -*- coding: utf-8 -*-
import os
import sys

from scrapy.cmdline import execute

filename = os.path.dirname(os.path.abspath(__file__))
sys.path.append(filename)
execute(['scrapy', 'crawl', 'tianqiSpider'])

import os
import sys

from scrapy.cmdline import execute

sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# crawl movie reviews and their responses
SPIDER_NAME = "movie_review"
execute(["scrapy", "crawl", SPIDER_NAME])

from scrapy import cmdline

name = 'example'
cmdline.execute(
    'scrapy crawl {} -s LOG_FILE=cuiqingcai.log'.format(name).split())

from scrapy import cmdline

cmdline.execute(
    'scrapy crawl northCapital -a startDate=2020-06-30 -a duration=7'.split())


from scrapy import cmdline

cmdline.execute('scrapy crawl douban_spider -o ../../output.json'.split())


from scrapy.cmdline import execute
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
execute(["scrapy", "crawl", "distributed_spider"])

#!/usr/python3.6.0/bin/python3.6
# -*- coding: utf-8 -*-
import re
import sys

from scrapy.cmdline import execute

if __name__ == '__main__':
    # strip the console-script suffix so Scrapy sees a clean program name
    sys.argv[0] = re.sub(r'(-script\.pyw?|\.exe)?$', '', sys.argv[0])
    args = [sys.argv[0], "crawl", "zongheng"]
    sys.exit(execute(args))

from scrapy.cmdline import execute

execute('scrapy crawl carrankSP'.split())


from scrapy import cmdline

cmdline.execute("scrapy crawl fcbtexas".split())


from scrapy.cmdline import execute

# the third element is the name of the spider to run
execute(['scrapy', 'crawl', 'search'])

#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""
@version: python 3.7.0
@author: liuxuchao
@contact: [email protected]
@software: PyCharm
@file: run.py
@time: 2020-05-07 23:35
"""
from scrapy import cmdline

name = 'huawa1'
cmd = 'scrapy crawl {0}'.format(name)
cmdline.execute(cmd.split())

import os
import sys

from scrapy.cmdline import execute

# put the project directory on sys.path
sys.path.append(os.path.dirname(os.path.abspath(__file__)))

# print these out to see what they are, which makes it easier to understand why we write it this way
print(os.path.abspath(__file__))
print(os.path.dirname(os.path.abspath(__file__)))

# start the spider
execute(["scrapy", "crawl", "tencentJob"])

from scrapy import cmdline

cmdline.execute('scrapy crawl pm'.split())


from scrapy import cmdline

cmdline.execute('scrapy crawl examples -o examples.json'.split())


from scrapy.cmdline import execute
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# execute(["scrapy", "crawl", "jobbole"])
# execute(["scrapy", "crawl", "zhihu"])
execute(["scrapy", "crawl", "lagou"])

import os
import sys

from scrapy.cmdline import execute

# sys.path.append("/root/PycharmProjects/flasktools/SpiderUsingScrapy/ArticleSpider")
print(os.path.dirname(os.path.abspath(__file__)))
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# execute(["scrapy", "crawl", "jobbole"])
execute(["scrapy", "crawl", "zhihu"])

from scrapy import cmdline

# cmdline.execute("scrapy crawl douban_spider -o douban.csv".split())
cmdline.execute("scrapy crawl douban_spider".split())


# encoding='utf-8'
import scrapy.cmdline as cmd

cmd.execute('scrapy crawl douban'.split())


from scrapy import cmdline

cmdline.execute("scrapy crawl bancosardegnait".split())

# -*- coding: utf-8 -*-
'''
@time: 2018/8/22 20:22
@author: Jack Luo
@file: rum.py
'''

'''
Download images and music at the same time, sharing a single pipeline.
'''
# todo: 1. add a thumbnail of the current image to each album, rather than the full image

from scrapy.cmdline import execute

execute('scrapy crawl luo'.split())
# execute('scrapy crawl luo -o luowang.xml'.split())

        cardNum_rule = '"cardNum":"(.*?)"'
        areaName_rule = '"areaName":"(.*?)"'
        courtName_rule = '"courtName":"(.*?)"'
        gistId_rule = '"gistId":"(.*?)"'
        duty_rule = '"duty":"(.*?)"'
        performance_rule = '"performance":"(.*?)"'
        name = re.findall(name_rule, res.text)
        cardNum = re.findall(cardNum_rule, res.text)
        areaName = re.findall(areaName_rule, res.text)
        courtName = re.findall(courtName_rule, res.text)
        gistId = re.findall(gistId_rule, res.text)
        duty = re.findall(duty_rule, res.text)
        performance = re.findall(performance_rule, res.text)
        # iterate by position; list.index() returns only the first match and
        # would pick the wrong row whenever two names are identical
        for index, _ in enumerate(name):
            item = LoseMenItem()
            item['name'] = name[index]
            item['cardNum'] = cardNum[index]
            item['areaName'] = areaName[index]
            item['courtName'] = courtName[index]
            item['gistId'] = gistId[index]
            item['duty'] = duty[index]
            item['performance'] = performance[index]
            yield item


if __name__ == '__main__':
    from scrapy import cmdline
    cmdline.execute(['scrapy', 'crawl', 'poor_men'])

from scrapy import cmdline cmdline.execute("scrapy crawl huangye888".split())
from scrapy import cmdline cmdline.execute("scrapy crawl weibostock -o items.json".split())
#!/usr/bin/env python # -*- coding: utf-8 -*- from scrapy.cmdline import execute execute("scrapy crawl GameSpider -s JOBDIR=jobs".split())
from scrapy.cmdline import execute import os import sys base_path = os.path.dirname(os.path.abspath(__file__)) sys.path.append(base_path) execute(['scrapy','crawl','douban'])
from scrapy.cmdline import execute  # calling execute() runs a scrapy command from a script
import sys
import os

# the scrapy command only runs once the project directory is found; you can print it to take a look
sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# print(os.path.dirname(os.path.abspath(__file__)))

# execute(['scrapy', 'crawl', 'HuaErJie'])
# execute(['scrapy', 'crawl', 'hejnews'])
execute(['scrapy', 'crawl', 'proxySpider'])

# -*- coding:utf-8 -*-
__author__ = 'neuclil'

from scrapy.cmdline import execute
import sys
import os

sys.path.append(os.path.dirname(os.path.abspath(__file__)))
# execute(["scrapy", "crawl", "sz_stock_exchange"])
execute(["scrapy", "crawl", "cninfo"])


#-*-coding:utf-8-*-
__author__ = 'Dzr'

from scrapy import cmdline

cmdline.execute("scrapy crawl atguigu_teacher".split())