def runspider():
    date = datetime.datetime.utcnow()
    unix_date = calendar.timegm(date.utctimetuple())
    route = request.args.get('route')
    domain = request.args.get('domain')
    directory = r"{0}\initiator\static\scrapes\{1}\{2}".format(os.getcwd(), domain, unix_date)
    if not os.path.exists(directory):
        os.makedirs(directory)
    logfile = open('testlog.log', 'w')
    log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
    log_observer.start()
    log.start(loglevel=logging.DEBUG)
    dispatcher.connect(stop_reactor, signal=signals.spider_closed)
    spider = MySpider(route, unix_date)
    settings_module = importlib.import_module('SiteCrawler.settings')
    settings = CrawlerSettings(settings_module)
    crawler = Crawler(settings)
    crawler.configure()
    crawler.crawl(spider)
    crawler.start()
    log.msg('Running reactor...')
    reactor.run()  # the script will block here until the spider is closed
    log.msg('Reactor stopped.')
    return redirect(url_for('choose_graph', domain=domain, date=unix_date))
def __init__(self, dbpool):
    self.dbpool = dbpool
    reload(sys)  # restore sys.setdefaultencoding, which site.py removes at interpreter startup
    sys.setdefaultencoding('utf-8')  # Python 2 hack to avoid UnicodeDecodeError when mixing str and unicode
    logfile = open('testlog.log', 'w')
    log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
    log_observer.start()
def __init__(self, date=None, coursecode=None):
    if date is None or coursecode is None:
        self.historical = False
        # start_url = "http://racing.hkjc.com/racing/Info/meeting/RaceCard/english/Local/"
        # raise ValueError("Invalid spider parameters")
    else:
        self.racedate = date
        self.racecode = coursecode
        self.historical = True
    logfile = open('testlog.log', 'w')
    log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
    log_observer.start()
def __init__(self):
    # log.start(logfile=time.strftime("log/%Y%m%d%H%M%S") + ".log", logstdout=False)
    # log.start(logfile='log/testlog.log', logstdout=False)
    logfile = open('log/testlog.log', 'w')
    log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
    log_observer.start()
    log.msg("initiating crawler...", level=log.INFO)
    chromedriver = "/Users/starsdeep/tools/chromedriver"
    os.environ["webdriver.chrome.driver"] = chromedriver
    self.driver = webdriver.Chrome(chromedriver)
    self.username = '******'
    self.password = '******'
def start_logger(spider_name):
    # TODO: FIX read for files like spidername.log.1
    filename = datetime.now().strftime("%Y-%m-%d." + spider_name + ".log")
    logfile_ = logfile.LogFile(filename, GLOBAL_PATH + '/logs')
    logger = ScrapyFileLogObserver(logfile_, logging.DEBUG)
    tlog.addObserver(logger.emit)
def __init__(self, *args, **kwargs):
    super(TaobaoSpider, self).__init__(*args, **kwargs)
    self.login_data['logname'] = 'your account'
    self.login_data['originalLogpasswd'] = 'your password'
    self.login_data['logpasswd'] = md5(self.login_data['originalLogpasswd']).hexdigest()
    self.cookie_handle = cookielib.CookieJar()
    self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookie_handle))
    urllib2.install_opener(self.opener)
    # check whether the page we are redirected to is Tmall or Taobao
    self.r_route_page_mark = re.compile(r"<h1 id=\"mallLogo\" >")
    ScrapyFileLogObserver(open("spider.log", 'w'), level=log.INFO).start()
    ScrapyFileLogObserver(open("spider_error.log", 'w'), level=log.ERROR).start()
def start_logger(debug):
    """
    Print log output to stdout if debug is True, otherwise log to rotating files.
    The log files rotate once they exceed 1 MB, keeping at most 100 rotated files.
    """
    if debug:
        tlog.startLogging(sys.stdout)
    else:
        filename = datetime.now().strftime("%Y-%m-%d.scrapy.log")
        logfile_ = logfile.LogFile(filename, 'logs/', maxRotatedFiles=100)
        logger = ScrapyFileLogObserver(logfile_, logging.INFO)
        tlog.addObserver(logger.emit)
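For context, a minimal usage sketch of the function above. The imports and call sites are assumptions inferred from the names in the snippet (tlog as twisted.python.log, logfile as twisted.python.logfile, datetime from the standard library), not taken from the original project.

# Assumed imports for the snippet above (inferred, not confirmed by the source).
import sys
import logging
from datetime import datetime
from twisted.python import log as tlog, logfile
from scrapy.log import ScrapyFileLogObserver

# Hypothetical call sites: print to stdout while developing,
# write rotating files (~1 MB each, at most 100 kept) otherwise.
start_logger(debug=True)
# start_logger(debug=False)   # writes logs/YYYY-MM-DD.scrapy.log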
def __init__(self, key=None, **kwargs):
    if key is None:
        raise Exception("Must specify a spider type!")
    else:
        self._type = key
        print key
    time = datetime.utcnow()
    log_path = '/var/log/scrapyd/logs/'
    # exist = os.path.exists(log_path)
    # if not exist:
    #     os.makedirs(log_path)
    logfile = "scrapy_%s_%s_%s.log" % (self.name, self._type, time)
    logfile = os.path.join(log_path, logfile)
    print logfile
    handle = open(logfile, 'w')
    log_observer = ScrapyFileLogObserver(handle, level=logging.INFO)
    log_observer.start()
    error_file = "scrapy_%s_%s_%s_Error.log" % (self.name, self._type, time)
    error_file = os.path.join(log_path, error_file)
    error_handle = open(error_file, 'w')
    error_observer = ScrapyFileLogObserver(error_handle, level=logging.WARNING)
    error_observer.start()
    self.key = "%s:%s" % (self.name, self._type)
    self.lastmodified = datetime.utcnow()
    # load urls, load last crawled time
    super(GeneralSitemapSpider, self).__init__(self.name, **kwargs)
def __init__(self, key=None, **kwargs):
    # fetch general to-crawl list here from file or DB
    if key is None:
        raise Exception("No start urls selected!")
    else:
        print key
        self._type = key
        self.start_urls = URL_MAP.get(key)
        print(self.start_urls)
        self.rules = RULE_MAP.get(key)
        print(self.rules)
    time = datetime.datetime.utcnow()
    log_path = '/var/log/scrapyd/logs/'
    logfile = "scrapy_%s_%s_%s.log" % (self.name, self._type, time)
    logfile = os.path.join(log_path, logfile)
    print logfile
    handle = open(logfile, 'w')
    log_observer = ScrapyFileLogObserver(handle, level=logging.INFO)
    log_observer.start()
    error_file = "scrapy_%s_%s_%s_Error.log" % (self.name, self._type, time)
    error_file = os.path.join(log_path, error_file)
    error_handle = open(error_file, 'w')
    error_observer = ScrapyFileLogObserver(error_handle, level=logging.WARNING)
    error_observer.start()
    self.first_not_filter = []
    # select self.start_urls and self.rules based on the parameter
    # self.start_urls = ['http://www.yahoo.com']
    super(GeneralSpider, self).__init__(self.name, **kwargs)
import os
import errno
import logging
from scrapy.log import ScrapyFileLogObserver
# date parsing module, not used by default, more info http://code.google.com/p/parsedatetime/
# import parsedatetime.parsedatetime as pdt

"""Creates the folder if it doesn't exist already"""
FOLDER = './data'
try:
    os.mkdir(FOLDER)
except OSError, e:
    if e.errno != errno.EEXIST:
        raise Exception("Can't create directory")

"""Enables logging to file and to standard output"""
logfile = open('%s/google.log' % FOLDER, 'a+b')
log_observer = ScrapyFileLogObserver(logfile, level=logging.INFO)
log_observer.start()

"""Google custom search API query parameters
Required parameters are:
    cx - custom search engine unique ID
    key - unique API key, provides API access
    q - search query
Other parameters are optional:
    filter - 0 disables the duplicate content filter (default is 1)
    sort - date:a - ascending sort by date
    dateRestrict - w[number] - restrict results to that number of weeks
More info: https://developers.google.com/custom-search/v1/using_rest#query-params
"""
PARAMS = {
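The PARAMS dict is cut off above, so its real contents are unknown. The sketch below only illustrates how such a dict could be turned into a Custom Search request URL; the endpoint and every placeholder value are assumptions for illustration, not values from the original project.

# Hypothetical example only: placeholder cx/key/q values; the endpoint is assumed
# to be the Custom Search JSON API at https://www.googleapis.com/customsearch/v1.
import urllib

params = {
    'cx': 'YOUR_SEARCH_ENGINE_ID',   # custom search engine unique ID
    'key': 'YOUR_API_KEY',           # API key
    'q': 'scrapy log observer',      # search query
    'filter': 0,                     # disable the duplicate content filter
    'sort': 'date:a',                # ascending sort by date
    'dateRestrict': 'w4',            # only results from the last 4 weeks
}
url = 'https://www.googleapis.com/customsearch/v1?' + urllib.urlencode(params)
print url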
def __init__(self, *args, **kwargs):
    ScrapyFileLogObserver(open("spider.log", 'w'), level=logging.INFO).start()
    ScrapyFileLogObserver(open("spider_error.log", 'w'), level=logging.ERROR).start()
def __init__(self, *args, **kwargs):
    locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
    ScrapyFileLogObserver(open("spider.log", 'w'), level=logging.INFO).start()
    ScrapyFileLogObserver(open("spider_error.log", 'w'), level=logging.ERROR).start()
import sys, os, datetime, errno
import logging
from scrapy.log import ScrapyFileLogObserver

try:
    today = datetime.datetime.utcnow().strftime("%Y%m%d")
    logdir = None
    logfile = None
    # ACCUM is the root directory
    try:
        accum = os.environ["ACCUM"]
    except KeyError:
        accum = "/lfs1/users/wat"
    logdir = os.path.join(accum, "log/escort/%s/www.eros.com/" % today)
    # ensure log directory exists
    try:
        os.makedirs(logdir)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise
    logfile = open(os.path.join(logdir, "scrapy.log"), 'a')
    log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
    log_observer.start()
except Exception as e:
    print >> sys.stderr, "Failed to create log dir %r [%r]" % (logdir, e)

print >> sys.stderr, "SETTINGS: log file %r" % logfile
def __init__(self, name=None, **kwargs):
    ScrapyFileLogObserver(open("spider.log", 'w'), level=log.INFO).start()
    ScrapyFileLogObserver(open("spider_error.log", 'w'), level=log.ERROR).start()
    super(PostloopSpider, self).__init__(name, **kwargs)