def __init__(self): BaseSpider.__init__(self) self.handle_httpstatus_list = range(0,1000) self.requestCount = 0 print 'Opening Alexa URL CSV, please wait.' maxSites = 200000 selectionInterval = 5 #Include every nth site skipSites = 861010 #Skip the first n sites csv_file = open('top-1m.csv','r') alexaReader = csv.reader(csv_file) rank=1 queuedCount = 0 for line in alexaReader : domain = line[1] if (rank % selectionInterval) == 0 and rank > skipSites: self.allowed_domains.append( domain ) self.start_urls.append(domain) queuedCount = queuedCount + 1 if (queuedCount >= maxSites) : break rank += 1 csv_file.close() print 'Done opening URLs, starting crawler....'
def __init__(self, **kwargs):
    # Build the start URL from optional 'from', 'to', 'date' and 'rtn'
    # keyword arguments, then drive a remote Firefox instance to it.
    BaseSpider.__init__(self)
    # Selenium Grid / standalone server expected on localhost:4444.
    self.driver = webdriver.Remote( command_executor='http://127.0.0.1:4444/wd/hub', desired_capabilities=DesiredCapabilities.FIREFOX)
    url = self.default
    if 'from' in kwargs:
        self.start_urls[0] += kwargs['from']
    if 'to' in kwargs:
        self.start_urls[0] += '/' + kwargs['to']
    if 'date' in kwargs:
        dat = kwargs['date']
        # NOTE(review): order matters — a 6-digit value also matches the
        # 4-digit prefix test, so the \d{6} branch must come first.
        if re.match('\d{6}', dat):
            # presumably a full date selects the daily view — TODO confirm
            self.monthly = False
            self.start_urls[0] += '/' + dat
        elif re.match('\d{4}', dat):
            # presumably YYMM/YYYY selects the monthly chart — TODO confirm
            self.monthly = True
            self.start_urls[0] += ('/blah.html?oym=' + dat + '&charttype=1')
        else:
            # unrecognized date format: fall back to the current month
            self.this_month()
    else:
        self.this_month()
    # assumes this_month() sets self.monthly — TODO confirm
    if self.monthly and 'rtn' in kwargs:
        self.rtn = kwargs['rtn']
        self.start_urls[0] += '&rtn=' + self.rtn
    else:
        self.rtn = '0'
    url = self.start_urls[0]
    self.driver.get(url)
def __init__(self, *arg1, **arg2):
    """Register the callback lookup table used to dispatch spider actions."""
    log.msg(message="man_spider, __init__", _level=log.INFO)
    BaseSpider.__init__(self, *arg1, **arg2)
    # Map action names to their handler methods.
    self.man_spider_callback = {
        'list': self.callback_list,
        'parse': self.callback_parse,
        'all': self.callback_all,
    }
def __init__(self): BaseSpider.__init__(self) # settings settings.overrides['DOWNLOAD_DELAY'] = 0.1 # regex object for extracting image url self.reobj_image = re.compile(r"http://\S+.gstatic.com[^\"\s]+") self.num_images_per_page = 20 self.num_images = 200 # initialize word searching url list self.base_url = "http://images.google.com/search?tbm=isch&safe=off" f_word_dict = file(r'SogouLabDic_tab_utf8_linux.dic') # f_word_dict = file(r'test_dict') word_lines = f_word_dict.readlines() print "initialize image searching urls" for word_line in word_lines: word = word_line[ : word_line.index("\t")] start = 0 while start < self.num_images: self.start_urls.append( self.base_url + "&q=" + word + "&start=" + str(start) ) start += self.num_images_per_page print "created " + str( len(self.start_urls) ) + " image searching urls."
def __init__(self, **kwargs):
    """Queue Wikipedia year pages between startYear and endYear.

    Requires kwargs: outDir, startYear, endYear.  Negative years map to
    "_BC" pages; an output directory is created per queued year.
    """
    BaseSpider.__init__(self)
    try:
        self.outDir = kwargs['outDir']
        if self.outDir[-1] != '/':
            self.outDir += '/'
        startYear = int(kwargs['startYear'])
        endYear = int(kwargs['endYear'])
        assert startYear <= endYear
    except (KeyError, IndexError, ValueError, AssertionError):
        # was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; now only argument errors trigger the usage text
        print >> sys.stderr, "eventSpider needs 3 arguments: outDir, startYear, endYear"
        exit(1)
    startingAdd = "https://en.wikipedia.org/wiki/"
    self.start_urls = []

    def queue_year(label):
        # Queue the wiki page for `label` and ensure its output dir exists.
        self.start_urls.append(startingAdd + label)
        path = self.outDir + label + "/"
        if not os.path.exists(path):
            os.makedirs(path)

    # Years before 500 BC are covered by decade pages.
    if startYear < -500:
        for i in range(startYear, min(-499, endYear), 10):
            queue_year(str(-i) + "_BC")
        if endYear > -500:
            startYear = -499
    # Individual BC years from 499 BC to 1 BC.
    if startYear > -500 and startYear < 0:
        for i in range(max(startYear, -499), min(0, endYear), 1):
            queue_year(str(-i) + "_BC")
        if endYear > 0:
            startYear = 1
    # AD years.
    if startYear > 0:
        for i in range(startYear, endYear + 1):
            queue_year(str(i))
def __init__(self, **kwargs):
    """Queue Wikipedia year pages between startYear and endYear.

    Requires kwargs: outDir, startYear, endYear.  Negative years map to
    "_BC" pages; an output directory is created per queued year.
    """
    BaseSpider.__init__(self)
    try:
        self.outDir = kwargs['outDir']
        if self.outDir[-1] != '/':
            self.outDir += '/'
        startYear = int(kwargs['startYear'])
        endYear = int(kwargs['endYear'])
        assert startYear <= endYear
    except (KeyError, IndexError, ValueError, AssertionError):
        # was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; now only argument errors trigger the usage text
        print >> sys.stderr, "eventSpider needs 3 arguments: outDir, startYear, endYear"
        exit(1)
    startingAdd = "https://en.wikipedia.org/wiki/"
    self.start_urls = []
    # Years before 500 BC are covered by decade pages.
    if startYear < -500:
        for i in range(startYear, min(-499, endYear), 10):
            self.start_urls.append(startingAdd + str(-i) + "_BC")
            path = self.outDir + str(-i) + "_BC/"
            if not os.path.exists(path):
                os.makedirs(path)
        if endYear > -500:
            startYear = -499
    # Individual BC years from 499 BC to 1 BC.
    if startYear > -500 and startYear < 0:
        for i in range(max(startYear, -499), min(0, endYear), 1):
            self.start_urls.append(startingAdd + str(-i) + "_BC")
            path = self.outDir + str(-i) + "_BC/"
            if not os.path.exists(path):
                os.makedirs(path)
        if endYear > 0:
            startYear = 1
    # AD years.
    if startYear > 0:
        for i in range(startYear, endYear + 1):
            self.start_urls.append(startingAdd + str(i))
            path = self.outDir + str(i) + "/"
            if not os.path.exists(path):
                os.makedirs(path)
def __init__(self, **kwargs):
    """Queue Wikipedia year pages from 1500 BC up to endYear.

    Requires kwargs: outDir, endYear.  An output directory is created for
    every queued year.
    """
    BaseSpider.__init__(self)
    try:
        self.outDir = kwargs['outDir']
        if self.outDir[-1] != '/':
            self.outDir += '/'
        self.endYear = int(kwargs['endYear'])
    except (KeyError, IndexError, ValueError):
        # was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; now only argument errors trigger the usage text.
        # NOTE(review): the message mentions "outFile" but this method only
        # reads outDir and endYear — confirm the intended argument list.
        print >> sys.stderr, "eventSpider needs 3 arguments: outDir, outFile, endYear"
        exit(1)
    startingAdd = "http://en.wikipedia.org/wiki/"
    self.start_urls = []
    # Decade pages from 1500 BC down to 500 BC ...
    for i in range(1500, 499, -10):
        self.start_urls.append(startingAdd + str(i) + "_BC")
        path = self.outDir + str(i) + "_BC/"
        if not os.path.exists(path):
            os.makedirs(path)
    # ... then individual BC years ...
    for i in range(499, 0, -1):
        self.start_urls.append(startingAdd + str(i) + "_BC")
        path = self.outDir + str(i) + "_BC/"
        if not os.path.exists(path):
            os.makedirs(path)
    # ... then AD years up to endYear.
    for i in range(1, self.endYear + 1):
        self.start_urls.append(startingAdd + str(i))
        path = self.outDir + str(i) + "/"
        if not os.path.exists(path):
            os.makedirs(path)
def __init__(self):
    """Configure crawl settings and compile the regexes used to scrape
    synonym/antonym pages from 365zn.com."""
    BaseSpider.__init__(self)
    # crawl settings
    settings.overrides["DOWNLOAD_DELAY"] = 0
    settings.overrides["LOG_FILE"] = "scrapy.log"
    settings.overrides["LOG_STDOUT"] = True
    settings.overrides["DOWNLOAD_TIMEOUT"] = 180
    settings.overrides["RETRY_TIMES"] = 10
    # base url of all the pages
    self.base_url = "http://www.365zn.com/fyc/"
    # example: <a href="fyc_h.htm"
    # (dot escaped: the original 'fyc_\w+.htm' let '.' match any character)
    self.reobj_word_list_page = re.compile(r"fyc_\w+\.htm")
    # example: <a href=htm/11474.htm title='把持'>
    self.reobj_word_and_page = re.compile(r"href=\S+\s+title='[^']+'")
    # synonyms line, e.g.: 【同义词】 <font color=blue>胸有成竹 心中有数 稳操胜券</font>
    self.reobj_synonym = re.compile(r"【同义词】\W+<font color=blue>([^<]*)</font>")
    # antonyms line, e.g.: 【反义词】 <font color=red>心中无数 手忙脚乱</font>
    self.reobj_antonym = re.compile(r"【反义词】\W+<font color=red>([^<]*)</font>")
    # Chinese character(s): byte range covering multi-byte characters in the
    # page encoding (a proper unicode alternative would be [\u4e00-\u9fa5]+)
    self.reobj_chinese = re.compile(r"[\x80-\xff]+")
def __init__(self):
    """Start a virtual display, then launch the Firefox browser driver."""
    BaseSpider.__init__(self)
    # Bring up the virtual display first (comment this call out when a real
    # desktop display is available).
    display.start()
    # Attach a Firefox instance to it.
    self.browser = webdriver.Firefox()
def __init__(self):
    """Open Firefox with a saved profile and hook the spider lifecycle signals."""
    BaseSpider.__init__(self)
    self.verificationErrors = []
    self.duplicatesurl = {}
    # Reuse the stored Firefox profile (cookies, extensions, preferences).
    profile_dir = "C:/Users/Administrator/AppData/Roaming/Mozilla/Firefox/Profiles/rbqs2eme."
    self.profile = webdriver.FirefoxProfile(profile_dir)
    self.browser = webdriver.Firefox(self.profile)
    # Run our handlers when the spider opens and closes.
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def __init__(self, domain_name=None):
    """Set up plaintext-signed OAuth credentials for the Yammer API."""
    BaseSpider.__init__(self, domain_name)
    # Read application credentials from the [yammer] config section.
    key = config.get('yammer', 'consumer_key')
    secret = config.get('yammer', 'consumer_secret')
    token_string = config.get('yammer', 'app_token')
    self.consumer = OAuthConsumer(key, secret)
    self.signature = OAuthSignatureMethod_PLAINTEXT()
    self.token = OAuthToken.from_string(token_string)
def __init__(self, domain_name=None):
    """Initialise the spider with OAuth credentials loaded from config."""
    BaseSpider.__init__(self, domain_name)
    section = 'yammer'
    # Consumer identifies the application; the token authorises API calls.
    self.consumer = OAuthConsumer(config.get(section, 'consumer_key'),
                                  config.get(section, 'consumer_secret'))
    self.signature = OAuthSignatureMethod_PLAINTEXT()
    self.token = OAuthToken.from_string(config.get(section, 'app_token'))
def __init__(self):
    """Launch Chrome via a local chromedriver and hook spider lifecycle signals."""
    BaseSpider.__init__(self)
    self.verificationErrors = []
    # Raw string for the Windows path: the original non-raw 'C:\Users\...'
    # only worked because Python 2 leaves unknown escapes (\U, \A, \Z...)
    # untouched; under Python 3 '\U' is a syntax error.
    self.browser = webdriver.Chrome(r'C:\Users\ZERO\AppData\Local\Google\Chrome\Application\chromedriver.exe')
    self.duplicatesurl = {}
    dispatcher.connect(self.spider_opened, signals.spider_opened)
    dispatcher.connect(self.spider_closed, signals.spider_closed)
def __init__(self, **kwargs):
    """Load input URLs from `infile` and truncate the UTF-8 output file."""
    BaseSpider.__init__(self)
    startingAdd = "https://en.wikipedia.org/wiki/"
    self.inFile = kwargs['infile']
    self.outFile = kwargs['outfile']
    self.start_urls = []
    self.url2locDic = {}
    # readFile populates start_urls / url2locDic from the input file.
    self.readFile(self.inFile)
    # Opening in 'w' mode truncates any previous output, then close at once.
    codecs.open(self.outFile, "w", encoding='utf-8').close()
def __init__(self, **kwargs):
    """Load input URLs from `infile` and truncate the UTF-8 output file."""
    BaseSpider.__init__(self)
    startingAdd = "http://en.wikipedia.org/wiki/"
    self.inFile = kwargs['infile']
    self.outFile = kwargs['outfile']
    self.start_urls = []
    self.url2locDic = {}
    # readFile populates start_urls / url2locDic from the input file.
    self.readFile(self.inFile)
    # Truncate the output file.  BUG FIX: the original wrote `fout.close`
    # without parentheses, which referenced the method without calling it
    # and left the handle open.
    fout = codecs.open(self.outFile, "w", encoding='utf-8')
    fout.close()
def __init__(self):
    """Initialise the base spider and start Firefox with images disabled."""
    # Initialise BaseSpider with its original method (we override __init__).
    BaseSpider.__init__(self)
    self.verificationErrors = []
    # --- Hiding the browser window ---------------------------------------
    # Works only on Linux, because of the graphical dependencies:
    # self.display = Display(visible=0, size=(800, 600))
    # self.display.start()
    # ----------------------------------------------------------------------
    # Load the webdriver with the profile built by disableImages().
    self.driver = webdriver.Firefox(self.disableImages())
def __init__(self):
    """Load HTTPS contact URLs from the contacts CSV into start_urls."""
    BaseSpider.__init__(self)
    self.verificationErrors = []
    with open(self.contactsDataFile, 'rb') as csvfile:
        csvreader = csv.reader(csvfile, delimiter=',', quotechar='"')
        self.log('Initialing with contact urls from file : ' + self.contactsDataFile + ' ...')
        # Only secure (https) URLs in column 1 are queued.  The redundant
        # `== True` comparison is dropped: startswith already returns a bool.
        for row in csvreader:
            if row[1].startswith('https'):
                self.start_urls.append(row[1])
    self.log('Total contacts loaded : %d' % len(self.start_urls))
def __init__(self, name=None, **kwargs):
    # Queue every previously saved result page (as a file:// URL) for
    # re-parsing from the local cache.
    if not hasattr(self, 'start_urls'):
        self.start_urls = []
    # All cached .html pages under result_path become start URLs.
    file_list = [i for i in os.listdir( self.result_path) if i.endswith('.html')]
    for i in file_list:
        # '?' is not usable verbatim in a file:// URL, so percent-encode it.
        path = os.path.join(self.result_path, i).replace('?', '%3F')
        url = 'file://%s' % (path)
        self.start_urls.append(url)
    # NOTE(review): this passes the dict as a single keyword argument
    # literally named 'kwargs' (so the base class sees kwargs={'kwargs': ...})
    # and drops `name` entirely — confirm this is intentional and not a
    # missing ** / name argument.
    BaseSpider.__init__(self, kwargs=kwargs)
    self.item = Commonshop()
def __init__(self):
    """Start a (virtual) display and a Firefox that auto-saves CSV downloads."""
    BaseSpider.__init__(self)
    # use any browser you wish
    display.start()
    # Configure the profile so CSV files are saved straight to ROOT_DIR
    # without any download prompt from the browser.
    profile = webdriver.FirefoxProfile()
    prefs = [
        ("browser.download.folderList", 2),
        ("browser.download.manager.showWhenStarting", False),
        ("browser.download.dir", ROOT_DIR),
        ("browser.helperApps.neverAsk.saveToDisk", "application/csv"),
    ]
    for key, value in prefs:
        profile.set_preference(key, value)
    self.browser = webdriver.Firefox(firefox_profile=profile)
def __init__(self, **kwargs):
    """Read input URLs and truncate both output files (locations, persons)."""
    BaseSpider.__init__(self)
    startingAdd = "http://en.wikipedia.org/wiki/"
    self.inFile = kwargs['infile']
    self.outFileLoc = kwargs['outfileLoc']
    self.outFilePer = kwargs['outfilePer']
    self.start_urls = []
    self.url2locDic = {}
    self.url2urlDic = {}
    # readFile populates start_urls / dictionaries from the input file.
    self.readFile(self.inFile)
    # Truncate both output files.  BUG FIX: the original rebound `fout` to
    # the second file without closing the first (handle leak) and then
    # wrote `fout.close` without parentheses, never calling close() at all.
    open(self.outFileLoc, "w").close()
    open(self.outFilePer, "w").close()
def __init__(self, username=None):
    """Connect to MySQL, log in to Weibo, and queue the post-login URL.

    `username` is a "user:password" pair.
    """
    BaseSpider.__init__(self)
    self.username, self.pwd = username.split(":")
    self.db = MySQLdb.connect(host="localhost", port=3306, user="******",
                              passwd="pw", db="weibosearch2",
                              charset="utf8", use_unicode=True)
    self.cursor = self.db.cursor()
    self.logined = False
    log.msg("login with %s" % self.username, level=log.INFO)
    login_url = self.weibo.login(self.username, self.pwd)
    if not login_url:
        log.msg("login failed", level=log.ERROR)
    else:
        # Crawl starts from the authenticated landing page.
        log.msg("login successful, start crawling.", level=log.INFO)
        self.start_urls.append(login_url)
def __init__(self, username=None):
    """Connect to MySQL, log in to Weibo, and queue the post-login URL.

    `username` is a "user:password" pair.
    """
    BaseSpider.__init__(self)
    self.username, self.pwd = username.split(':')
    self.db = MySQLdb.connect(host="localhost", port=3306, user="******",
                              passwd="pw", db="weibosearch2", charset='utf8',
                              use_unicode=True)
    self.cursor = self.db.cursor()
    self.logined = False
    # Dead stores removed: the original read REDIS_HOST/REDIS_PORT into
    # locals `host`/`port` via settings.get and never used either value.
    log.msg('login with %s' % self.username, level=log.INFO)
    login_url = self.weibo.login(self.username, self.pwd)
    if login_url:
        # Crawl starts from the authenticated landing page.
        log.msg('login successful, start crawling.', level=log.INFO)
        self.start_urls.append(login_url)
    else:
        log.msg('login failed', level=log.ERROR)
def __init__(self, **kwargs):
    """Read input URLs and truncate the four output files (URL/Loc/Per/Org)."""
    BaseSpider.__init__(self)
    startingAdd = "https://en.wikipedia.org/wiki/"
    self.inFile = kwargs['infile']
    self.outFileURL = kwargs['outfile']
    self.outFileLoc = kwargs['outfileLoc']
    self.outFilePer = kwargs['outfilePer']
    self.outFileOrg = kwargs['outfileOrg']
    self.start_urls = []
    self.url2locDic = {}
    self.url2urlDic = {}
    # readFile populates start_urls / dictionaries from the input file.
    self.readFile(self.inFile)
    # Truncate every output file.  BUG FIX: the original wrote `fout.close`
    # (no parentheses) four times, so none of the handles were ever closed.
    for out_name in (self.outFileURL, self.outFileLoc,
                     self.outFilePer, self.outFileOrg):
        open(out_name, "w").close()
def __init__(self):
    """Configure crawl settings and the image-URL regex, then build start URLs."""
    BaseSpider.__init__(self)
    # settings
    settings.overrides['DOWNLOAD_DELAY'] = 0
    settings.overrides['LOG_FILE'] = "scrapy.log"
    settings.overrides['LOG_STDOUT'] = True
    settings.overrides['DOWNLOAD_TIMEOUT'] = 180
    settings.overrides['RETRY_TIMES'] = 10
    self.num_images_per_page = 20
    self.num_images = 60
    # base url for image searching
    self.base_url = "http://images.google.com/search?tbm=isch&safe=off"
    # Regex for extracting image URLs.  The dots in "gstatic.com" are now
    # escaped; the original bare '.' matched any character.
    self.reobj_image = re.compile(r"http://\S+\.gstatic\.com[^\"\s]+")
    # initialize start_urls
    self.fill_start_urls()
def __init__(self, **kwargs):
    """Queue Wikipedia year pages from 1500 BC up to endYear.

    Requires kwargs: outfile, endYear.
    """
    BaseSpider.__init__(self)
    try:
        self.outFile = kwargs['outfile']
        self.endYear = int(kwargs['endYear'])
    except (KeyError, ValueError):
        # was a bare `except:`, which also swallowed SystemExit and
        # KeyboardInterrupt; now only argument errors trigger the usage text
        print >> sys.stderr, "eventSpider needs 2 arguments: outfile, endYear"
        exit(1)
    startingAdd = "http://en.wikipedia.org/wiki/"
    self.start_urls = []
    # Decade pages from 1500 BC down to 500 BC ...
    for i in range(1500, 499, -10):
        self.start_urls.append(startingAdd + str(i) + "_BC")
    # ... then individual BC years ...
    for i in range(499, 0, -1):
        self.start_urls.append(startingAdd + str(i) + "_BC")
    # ... then AD years up to endYear.
    for i in range(1, self.endYear + 1):
        self.start_urls.append(startingAdd + str(i))
    # Truncate the output file.  BUG FIX: the original wrote `fout.close`
    # without parentheses, never actually calling close().
    open(self.outFile, "w").close()
def __init__(self, name=None, **kwargs):
    """Build shop-id start URLs, honouring the skip list and cached pages."""
    if not hasattr(self, 'start_urls'):
        self.start_urls = []
    # Load previously skipped shop ids (one numeric id per line).
    skiplist = []
    skip_file = '%s/skip.txt' % self.result_path
    if not os.path.isdir(self.result_path):
        os.makedirs(self.result_path)
    if os.path.isfile(skip_file):
        with open(skip_file, 'r') as fp:
            for eachline in fp:
                shopid = eachline.replace('\n', '')
                if shopid.isdigit():
                    skiplist.append(int(shopid))
    # id_range is either (start, stop), (start, stop, step) or an explicit
    # iterable of ids.  BUG FIX: the original tested len == 2 and len == 3
    # with two independent `if`s, so a 2-tuple also fell into the trailing
    # `else` and queued its two endpoint values a second time; the branches
    # are now chained with elif.
    id_range = self.id_range
    if len(id_range) == 2:
        for i in xrange(id_range[0], id_range[1]):
            if i not in skiplist:
                self.start_urls.append('%s%s' % (self.start_url, i))
    elif len(id_range) == 3:
        for i in xrange(id_range[0], id_range[1], id_range[2]):
            if i not in skiplist:
                self.start_urls.append('%s%s' % (self.start_url, i))
    else:
        for i in id_range:
            if i not in skiplist:
                self.start_urls.append('%s%s' % (self.start_url, i))
    # Drop URLs whose page is already cached on disk.  (The pointless
    # `if 1:` wrapper around this step was removed.)
    file_list = [i for i in os.listdir(self.result_path) if i.endswith('.html')]
    exist_urls = [i.replace('.html', '') for i in file_list]
    self.start_urls = [i for i in self.start_urls
                       if i.split('/')[-1] not in exist_urls]
    # NOTE(review): passes the dict as a keyword arg literally named
    # 'kwargs' and drops `name` — confirm this matches BaseSpider's contract.
    BaseSpider.__init__(self, kwargs=kwargs)
def __init__(self):
    """Track already-seen teams so continent and championship listings
    don't produce duplicates."""
    BaseSpider.__init__(self)
    self.seenTeams = set()
def __init__(self, settings):
    """Name the spider from DEFAULT_SPIDER (falling back to 'default')
    and keep the settings object for later use."""
    spider_name = settings.get('DEFAULT_SPIDER', 'default')
    BaseSpider.__init__(self, name=spider_name)
    self._settings = settings
def __init__(self):
    """Initialise the spider with an empty de-duplication set.

    Teams can appear under both a continent and a championship listing;
    this set prevents emitting them twice.
    """
    BaseSpider.__init__(self)
    self.seenTeams = set()
def __init__(self):
    """Delegate straight to the BaseSpider constructor."""
    BaseSpider.__init__(self)
def __init__(self):
    """Initialise the spider with a 1-based counter."""
    BaseSpider.__init__(self)
    self.counter = 1
def __init__(self):
    """Compute the start URLs once and publish them on the class."""
    BaseSpider.__init__(self)
    # NOTE: assigns the class attribute, so every instance shares this list.
    DeputadoSpider.start_urls = self.get_start_urls()
def __init__(self):
    """Initialise the spider and the list collecting verification errors."""
    BaseSpider.__init__(self)
    self.verificationErrors = []
def __init__(self, name=None, **kwargs):
    """Forward construction to BaseSpider, then attach a configured logger."""
    BaseSpider.__init__(self, name, **kwargs)
    self.logger = self.setupLogging()
def __init__(self, *args, **kwargs):
    """Initialise the spider and re-queue pending work whenever it goes idle."""
    BaseSpider.__init__(self, *args, **kwargs)
    # When the spider runs out of requests, process_pending refills it.
    dispatcher.connect(self.process_pending, signals.spider_idle)
    self.run_num = 1
def __init__(self):
    """Set up a mapping for de-duplicating teams across listings."""
    BaseSpider.__init__(self)
    self.seenTeams = {}
def __init__(self, url=None):
    """Remember the target URL before running the base initialiser."""
    # Assigned before BaseSpider.__init__ in case base-class setup reads it.
    self.url = url
    BaseSpider.__init__(self)
def __init__(self, *args, **kwargs):
    """Run both parent initialisers: BaseSpider first, then CssSpiderMixin."""
    BaseSpider.__init__(self, *args, **kwargs)
    CssSpiderMixin.__init__(self, *args, **kwargs)