Example #1
def runspider():
	date = datetime.datetime.utcnow()
	unix_date = calendar.timegm(date.utctimetuple())
	
	route = request.args.get('route')
	domain = request.args.get('domain')
	
	directory = r"{0}\initiator\static\scrapes\{1}\{2}".format(os.getcwd(), domain, unix_date)
	
	if not os.path.exists(directory):
		os.makedirs(directory)
	
	logfile = open('testlog.log', 'w')
	log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
	log_observer.start()
	log.start(loglevel=logging.DEBUG)
	
	dispatcher.connect(stop_reactor, signal=signals.spider_closed)
	
	spider = MySpider(route, unix_date)
	
	settings_module = importlib.import_module('SiteCrawler.settings')
	settings = CrawlerSettings(settings_module)
	crawler = Crawler(settings)
	
	crawler.configure()
	crawler.crawl(spider)
	crawler.start()
	
	log.msg('Running reactor...')
	reactor.run()  # the script will block here until the spider is closed
	log.msg('Reactor stopped.')
	return redirect(url_for('choose_graph', domain = domain, date = unix_date))
Example #2
	def __init__(self, dbpool):
		self.dbpool = dbpool
		reload(sys)                         # reload sys so setdefaultencoding is exposed again
		sys.setdefaultencoding('utf-8')     # Python 2 workaround: make UTF-8 the default string encoding
		logfile = open('testlog.log', 'w')
		log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
		log_observer.start()
Example #3
 def __init__(self, date=None, coursecode=None):
     if date is None or coursecode is None:
         self.historical = False
         # start_url = "http://racing.hkjc.com/racing/Info/meeting/RaceCard/english/Local/"
         # raise ValueError("Invalid spider parameters")
     else:
         self.racedate = date
         self.racecode = coursecode
         self.historical = True
     logfile = open('testlog.log', 'w')
     log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
     log_observer.start()
Example #4
    def __init__(self):
        #log.start(logfile=time.strftime("log/%Y%m%d%H%M%S")+".log", logstdout=False)
        #log.start(logfile='log/testlog.log', logstdout=False)
        logfile = open('log/testlog.log', 'w')
        log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
        log_observer.start()

        log.msg("initiating crawler...",level=log.INFO)
        chromedriver = "/Users/starsdeep/tools/chromedriver"
        os.environ["webdriver.chrome.driver"] = chromedriver
        self.driver = webdriver.Chrome(chromedriver)
        self.username = '******'
        self.password = '******'
Example #5
def start_logger(spider_name):

    # TODO: FIX read for files like spidername.log.1
    filename = datetime.now().strftime("%Y-%m-%d." + spider_name + ".log")
    logfile_ = logfile.LogFile(filename, GLOBAL_PATH + '/logs')
    logger = ScrapyFileLogObserver(logfile_, logging.DEBUG)
    tlog.addObserver(logger.emit)
Example #6
    def __init__(self, *args, **kwargs):
        super(TaobaoSpider, self).__init__(*args, **kwargs)
        
        self.login_data['logname'] ='your account'
        self.login_data['originalLogpasswd'] = 'your password'
        self.login_data['logpasswd'] = md5(self.login_data['originalLogpasswd']).hexdigest()
        
        self.cookie_handle = cookielib.CookieJar()
        self.opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(self.cookie_handle))
        urllib2.install_opener(self.opener)
        

        # Check whether the post-login redirect page is Tmall or Taobao
        self.r_route_page_mark = re.compile(r"<h1 id=\"mallLogo\" >")
        
        ScrapyFileLogObserver(open("spider.log", 'w'), level=log.INFO).start()
        ScrapyFileLogObserver(open("spider_error.log", 'w'), level=log.ERROR).start()
Example #7
def start_logger(debug):
    """
    Logger will log for file if debug set to True else will print to cmdline.
    The logfiles will rotate after exceeding since of 1M and 100 count.
    """
    if debug:
        tlog.startLogging(sys.stdout)
    else:
        filename = datetime.now().strftime("%Y-%m-%d.scrapy.log")
        logfile_ = logfile.LogFile(filename, 'logs/', maxRotatedFiles=100)
        logger = ScrapyFileLogObserver(logfile_, logging.INFO)
        tlog.addObserver(logger.emit)
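
A minimal usage sketch for the helper above (the call site is hypothetical, not part of the original snippet): pass debug=True during development to keep output on the console, and debug=False to write the rotating files described in the docstring.

# Hypothetical call site for start_logger (assumed, not from the original code).
# debug=True  -> everything is logged to stdout
# debug=False -> logs go to logs/<date>.scrapy.log, rotated at ~1 MB, up to 100 files kept
DEBUG = False
start_logger(DEBUG)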
Example #8
    def __init__(self, key=None, **kwargs):
        if key is None:
            raise Exception("Must specify a spider type!")
        else:
            self._type = key
            print key
            time = datetime.utcnow()
            log_path = '/var/log/scrapyd/logs/'
#            exist = os.path.exists(log_path)
#            if not exist:
#                os.makedirs(log_path)
            logfile = "scrapy_%s_%s_%s.log" % (self.name, self._type,time)
            logfile = os.path.join(log_path, logfile)
            print logfile
            handle = open(logfile, 'w')
            log_observer = ScrapyFileLogObserver(handle, level=logging.INFO)
            log_observer.start()
            
            error_file = "scrapy_%s_%s_%s_Error.log" % (self.name, self._type, time)
            error_file = os.path.join(log_path, error_file)
            error_handle = open(error_file, 'w')
            error_observer = ScrapyFileLogObserver(error_handle, level=logging.WARNING)
            error_observer.start()

            self.key = "%s:%s" % (self.name, self._type)
            self.lastmodified = datetime.utcnow()

            # load urls, load last crawled time
        super(GeneralSitemapSpider, self).__init__(self.name, **kwargs)
Example #9
    def __init__(self, key=None, **kwargs):
        #fetch general to crawl list here from file or DB
        if key is None:
            raise Exception("No start urls selected!")
        else:
            print key
            self._type = key
            self.start_urls = URL_MAP.get(key)
            print(self.start_urls)
            self.rules = RULE_MAP.get(key)
            print(self.rules)

            time = datetime.datetime.utcnow()
            log_path = '/var/log/scrapyd/logs/'
            logfile = "scrapy_%s_%s_%s.log" % (self.name, self._type, time)
            logfile = os.path.join(log_path, logfile)
            print logfile
            handle = open(logfile, 'w')
            log_observer = ScrapyFileLogObserver(handle, level=logging.INFO)
            log_observer.start()
            
            error_file = "scrapy_%s_%s_%s_Error.log" % (self.name, self._type, time)
            error_file = os.path.join(log_path, error_file)
            error_handle = open(error_file, 'w')
            error_observer = ScrapyFileLogObserver(error_handle, level=logging.WARNING)
            error_observer.start()

            self.first_not_filter = []
            #select self_start_urls and self_rules based on the parameter
#        self.start_urls = ['http://www.yahoo.com']
        super(GeneralSpider, self).__init__(self.name, **kwargs)
Example #10
import logging
from scrapy.log import ScrapyFileLogObserver
#date parsing module, not used by default, more info http://code.google.com/p/parsedatetime/
#import parsedatetime.parsedatetime as pdt

"""Creates the folder if it doesn't exist already'"""
FOLDER = './data'
try:
    os.mkdir(FOLDER)
except OSError, e:
    if e.errno != errno.EEXIST:
        raise Exception("Can't create directory'")

"""Enables loging into file and to standard output"""
logfile = open('%s/google.log' %FOLDER, 'a+b')
log_observer = ScrapyFileLogObserver(logfile, level=logging.INFO)
log_observer.start()

"""Google custom search API query parameters
Required parameters are:
cx - custom search engine unique ID
key - unique API key, provides API access
q - search query
other parameters are optional: 
filter - 0 disables duplicate content filter (default is 1)
sort - date:a - ascending sort by date
dateRestrict - w[number] - restrict results to number of weeks
more info:
https://developers.google.com/custom-search/v1/using_rest#query-params
"""
PARAMS = {
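
The snippet above is cut off at the PARAMS definition. A minimal sketch of what such a dict could look like, built only from the parameters listed in the docstring; every value below is a placeholder, not the original configuration.

PARAMS = {
    'cx': 'YOUR_SEARCH_ENGINE_ID',   # custom search engine unique ID (placeholder)
    'key': 'YOUR_API_KEY',           # unique API key providing API access (placeholder)
    'q': 'example query',            # search query (placeholder)
    'filter': 0,                     # optional: 0 disables the duplicate content filter
    'sort': 'date:a',                # optional: ascending sort by date
    'dateRestrict': 'w4',            # optional: restrict results to the last 4 weeks
}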
Example #11
 def __init__(self, *args, **kwargs):
     ScrapyFileLogObserver(open("spider.log", 'w'),
                           level=logging.INFO).start()
     ScrapyFileLogObserver(open("spider_error.log", 'w'),
                           level=logging.ERROR).start()
Example #12
 def __init__(self, *args, **kwargs):
     locale.setlocale(locale.LC_ALL, 'en_US.UTF-8')
     ScrapyFileLogObserver(open("spider.log", 'w'),
                           level=logging.INFO).start()
     ScrapyFileLogObserver(open("spider_error.log", 'w'),
                           level=logging.ERROR).start()
Example #13
import sys, os, datetime, errno
import logging
from scrapy.log import ScrapyFileLogObserver

try:
    import sys, os, datetime, errno
    today = datetime.datetime.utcnow().strftime("%Y%m%d")
    logdir = None
    # ACCUM is the root directory
    try:
        accum = os.environ["ACCUM"]
    except KeyError:
        # fall back to a default root when ACCUM is not set
        accum = "/lfs1/users/wat"
    logdir = os.path.join(accum, "log/escort/%s/www.eros.com/" % today)
    # ensure log directory exists
    try:
        os.makedirs(logdir)
    except OSError as exception:
        if exception.errno != errno.EEXIST:
            raise
    logfile = open(os.path.join(logdir, "scrapy.log"), 'a')
    log_observer = ScrapyFileLogObserver(logfile, level=logging.DEBUG)
    log_observer.start()

except Exception as e:
    print >> sys.stderr, "Failed to create log dir %r [%r]" % (logdir, e)

import sys
print >> sys.stderr, "SETTINGS: log file %r" % logfile
Example #14
 def __init__(self, name=None, **kwargs):
     ScrapyFileLogObserver(open("spider.log", 'w'), level=log.INFO).start()
     ScrapyFileLogObserver(open("spider_error.log", 'w'),
                           level=log.ERROR).start()
     super(PostloopSpider, self).__init__(name, **kwargs)