def __init__(self, shelf, pswd):
    Crawler.__init__(self, pswd)
    self.respage = Resultpage()
    self.article = Article()
    self.storage = shelf['komm']
    self.dateend = ('26.03.2017'
                    if self.storage['end_reached'][self.storage['politNum']]
                    else self.storage['dateEnd'])
    # Politician names as percent-encoded windows-1251 (cp1251) query terms.
    # Decoded: 'николя саркози', 'франсуа олланд', 'дмитрий медведев',
    # 'дэвид кэмерон', 'владимир путин', 'ангела меркель', 'тереза мэй'.
    self.politicians = ('%ED%E8%EA%EE%EB%FF+%F1%E0%F0%EA%EE%E7%E8',
                        '%F4%F0%E0%ED%F1%F3%E0+%EE%EB%EB%E0%ED%E4',
                        '%E4%EC%E8%F2%F0%E8%E9+%EC%E5%E4%E2%E5%E4%E5%E2',
                        '%E4%FD%E2%E8%E4+%EA%FD%EC%E5%F0%EE%ED',
                        '%E2%EB%E0%E4%E8%EC%E8%F0+%EF%F3%F2%E8%ED',
                        '%E0%ED%E3%E5%EB%E0+%EC%E5%F0%EA%E5%EB%FC',
                        '%F2%E5%F0%E5%E7%E0+%EC%FD%E9')
    self.data_format = '%Y-%m-%d'
    self.starting_page = 1
    self.payload = {}
    self.update_payload()
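# The percent-escaped tuples above (and in the 'vz' crawler below) are
# cp1251-encoded query terms. A minimal sketch of decoding them back to
# readable Cyrillic, assuming only the standard library:
from urllib.parse import unquote_plus

def decode_cp1251_term(term):
    # unquote_plus also translates '+' into a space.
    return unquote_plus(term, encoding='cp1251')

print(decode_cp1251_term('%ED%E8%EA%EE%EB%FF+%F1%E0%F0%EA%EE%E7%E8'))
# -> 'николя саркози'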
def __init__(self):
    Crawler.__init__(self)
    self.link_crawler = None
    self.url = 'https://www.instagram.com'
    # Create the data directory and files.
    self.create_data_storage()
    # Configure logging.
    Crawler.set_logs('Instagram_Crawler_log', './logging/logfile_instagram.log')
def __init__(self, auth=None, urls=None, force_sync=False, config=None, api_limit=0):
    # None defaults replace the original mutable {} defaults, which would be
    # shared across every instance of the class.
    auth = auth if auth is not None else {}
    urls = urls if urls is not None else {}
    config = config if config is not None else {}
    Crawler.__init__(self, auth, urls, force_sync, config, api_limit)
    self._type = config['fetch_by_type']
    self._filter = config['filter_key']
    self._count_cfg = Config(storage=self._config_strategy, type='counts')
    self._offset_cfg = Config(storage=self._config_strategy, type='offsets')
    self._MAX_RESULT_PER_TARGET = 0
    self._recipe_factory = RecipeFactory(connector=self._data_get_connector,
                                         storage=self._data_strategy)
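# The None defaults above avoid Python's shared-mutable-default pitfall,
# illustrated here with a hypothetical Collector class (not from the source):
class Collector:
    def __init__(self, items=[]):  # one list object reused by every call
        self.items = items

a, b = Collector(), Collector()
a.items.append('x')
print(b.items)  # ['x'] -- b observes a's mutation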
def __init__(self, proj_name): Crawler.__init__(self, proj_name) self.name = "雷锋网" self.root_url = "http://www.leiphone.com" self.headers = { 'Host': 'www.leiphone.com', 'User-Agent': 'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0', 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8', 'Connection': 'keep-alive' }
def __init__(self):
    Crawler.__init__(self)
    self.content_crawler = None
    # Sorted by product reviews:
    # self.url = "https://store.musinsa.com/app/product/search?search_type=1&pre_q=&d_cat_cd=&brand=&rate=&page_kind=search&list_kind=small&sort=emt_high&page=%s&display_cnt=120&sale_goods=&ex_soldout=&color=&popup=&q=%s&price1=&price2="
    # Sorted by lowest price:
    self.url = "https://store.musinsa.com/app/product/search?search_type=1&pre_q=&d_cat_cd=&brand=&rate=&page_kind=search&list_kind=small&sort=price_low&page=%s&display_cnt=120&sale_goods=&ex_soldout=&color=&popup=&chk_research=&q=%s&chk_brand=&price1=&price2=&chk_color=&chk_sale=&chk_soldout="
    self.content_url = "https://store.musinsa.com"
    # Create the data directory and files.
    self.create_data_storage()
    # Configure logging.
    Crawler.set_logs('Musinsa_Crawler_log', './logging/logfile_musinsa.log')
def __init__(self, student_id, password):
    """Constructor for getting student id and password."""
    # Initialize the base class Crawler.
    Crawler.__init__(self)
    self.student_id = student_id
    self.password = password
    # Structure the authentication data into a dict for posting to the server.
    self.auth_data = {'dfUsernameHidden': student_id,
                      'dfPasswordHidden': password}
    # Log in to the website so later requests can reuse this session;
    # store the login status.
    self.status = self.login()
def __init__(self, shelf, pswd=None):
    Crawler.__init__(self, pswd)
    self.respage = Resultpage()
    self.article = Article()
    self.storage = shelf['suedd']
    self.politicians = ('sarkozy', 'hollande', 'medwedew', 'cameron',
                        'putin', 'merkel', 'theresa+AND+may')
    self.starting_page = 1
    self.update_payload()
    self.data_format = '%d.%m.%Y'
def __init__(self, shelf):
    Crawler.__init__(self)
    self.respage = Resultpage()
    self.article = Article()
    self.storage = shelf['ksta_de']
    self.politicians = ('sarkozy', 'hollande', 'dmitri|dmitrij+medwedew',
                        'david+cameron', 'putin', 'merkel', 'theresa+may')
    self.site = r'http://www.berliner-zeitung.de/action/berliner-zeitung/4484314/search?'
    self.starting_page = 0
    self.data_format = '%Y-%m-%d'
    self.update_payload()
def __init__(self, shelf, pswd):
    Crawler.__init__(self, pswd)
    self.respage = Resultpage()
    self.article = Article()
    self.storage = shelf['vz']
    # Same cp1251 percent-encoded politician names as the 'komm' crawler above.
    self.politicians = ('%ED%E8%EA%EE%EB%FF+%F1%E0%F0%EA%EE%E7%E8',
                        '%F4%F0%E0%ED%F1%F3%E0+%EE%EB%EB%E0%ED%E4',
                        '%E4%EC%E8%F2%F0%E8%E9+%EC%E5%E4%E2%E5%E4%E5%E2',
                        '%E4%FD%E2%E8%E4+%EA%FD%EC%E5%F0%EE%ED',
                        '%E2%EB%E0%E4%E8%EC%E8%F0+%EF%F3%F2%E8%ED',
                        '%E0%ED%E3%E5%EB%E0+%EC%E5%F0%EA%E5%EB%FC',
                        '%F2%E5%F0%E5%E7%E0+%EC%FD%E9')
    self.data_format = '%Y-%m-%d'
    self.starting_page = 1
    self.payload = {}
    self.update_payload()
def __init__(self, shelf, pswd):
    Crawler.__init__(self, pswd)
    self.respage = Resultpage()
    self.article = Article()
    self.storage = shelf['lemonde']
    self.politicians = ('nicolas sarkozy', 'francois hollande', 'dmitry medvedev',
                        'david cameron', 'vladimir putin', 'angela merkel',
                        'theresa may')
    self.starting_page = 1
    self.data_format = '%Y-%m-%d'
    # 'keywords={}' is an assumed query field for the search term: the
    # original string contained no placeholder, so its .format() was a no-op.
    self.site = (r'http://www.lemonde.fr/recherche/?keywords={}&operator=and'
                 r'&exclude_keywords=&qt=recherche_texte_titre&author='
                 r'&period=custom_date&start_day=01&start_month=01&start_year=2000'
                 r'&end_day=28&end_month=03&end_year=2017&sort=desc').format(
                     self.politicians[self.storage['politNum']])
def __init__(self, shelf, pswd):
    Crawler.__init__(self, pswd)
    self.respage = Resultpage()
    self.article = Article()
    self.storage = shelf['ksta_de']
    self.politicians = ('sarkozy', 'hollande', 'medwedew', 'cameron',
                        'putin', 'merkel', 'theresa+may')
    self.site = r'http://www.ksta.de/action/ksta/4484314/search?'
    self.data_format = '%Y-%m-%d'
    self.starting_page = 0
    self.update_payload()
def __init__(self, shelf, pswd):
    Crawler.__init__(self, pswd)
    self.respage = Resultpage()
    self.article = Article()
    self.storage = shelf['independent']
    self.politicians = ('nicolas sarkozy', 'francois hollande', 'dmitry medvedev',
                        'david cameron', 'vladimir putin', 'angela merkel',
                        'theresa may')
    self.site = r'http://www.independent.co.uk/search/site/{}'.format(
        self.politicians[self.storage['politNum']])
    self.data_format = '%Y-%m-%d'
    self.starting_page = 0
    self.update_payload()
def __init__(self, shelf, pswd):
    Crawler.__init__(self, pswd)
    self.respage = Resultpage()
    self.article = Article()
    self.storage = shelf['spiegel']
    self.politicians = ('nicolas_sarkozy', 'francois_hollande', 'dmitrij_medwedew',
                        'david_cameron', 'wladimir_putin', 'angela_merkel',
                        'theresa_may')
    self.starting_page = 1
    self.data_format = '%d.%m.%Y'
    # Resume from the stored page number, but never before the first page.
    self.site = r'http://www.spiegel.de/thema/{}/dossierarchiv-{}.html'.format(
        self.politicians[self.storage['politNum']],
        max(self.starting_page, self.storage['pn']))
    self.payload = None
def __init__(self, shelf, pswd):
    Crawler.__init__(self, pswd)
    self.respage = Resultpage()
    self.article = Article()
    self.storage = shelf['guardian']
    self.politicians = ('nicolas-sarkozy', 'francois-hollande', 'dmitry-medvedev',
                        'davidcameron', 'vladimir-putin', 'angela-merkel',
                        'theresamay')
    # Indices of the UK politicians (davidcameron, theresamay), which are
    # filed under /politics rather than /world on theguardian.com.
    self.local = (3, 6)
    self.starting_page = 1
    self.data_format = '%Y-%m-%d'
    self.site = r'https://www.theguardian.com/{}/{}?'.format(
        'world' if self.storage['politNum'] not in self.local else 'politics',
        self.politicians[self.storage['politNum']])
def __init__(self, login_id=None, last_name=None, pin=None):
    """Constructor for getting login credentials."""
    # Initialize the base class Crawler.
    Crawler.__init__(self)
    # Structure the authentication data into a dict for posting to the server.
    self.auth_data = {
        'loginType': 'B',
        'loginId': login_id,
        'lastName': last_name,
        'pin': pin,
        'page.logIn.library': '1@VYKDB20011102005217',
    }
    self.books = None
    self.content = ''
    # Log in to the website so later requests can reuse this session;
    # store the login status.
    self.status = self.login()
    if self.status is True:
        self.books = self.get_books()
def __init__(self): Crawler.__init__(self) self.HOST = "http://56110.cn" self.suffix = "/Huo/list.html"
def __init__(self, start_url=START_URL):
    Crawler.__init__(self, start_url)
    self.tasks = []
def __init__(self): Crawler.__init__(self) self.HOST = "http://wb.56888.net" self.prefix = "/OutSourceList.aspx?tendertype=4&p="
def __init__(self):
    Crawler.__init__(self)
    self.url = 'http://www.google.com/search'
    self.params = {"tbs": "li:1"}  # tbs=li:1 requests Google's verbatim results
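# A minimal sketch of how this crawler's url/params pair could be issued,
# assuming the widely used 'requests' library (not shown in the source);
# the 'q' query parameter is the standard Google search term field:
import requests

response = requests.get('http://www.google.com/search',
                        params={'tbs': 'li:1', 'q': 'example query'},
                        timeout=10)
print(response.status_code)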
def __init__(self, config):
    Crawler.__init__(self, config)
def __init__(self, proj_name): Crawler.__init__(self, proj_name) self.name = "虎嗅" self.root_url = "http://www.huxiu.com"
def __init__(self): Crawler.__init__(self) self.HOST = "http://fala56.com" self.prefix = "/Views/Huoyuan" self.suffix = "/GoodsLandList.aspx?area=-1"
def __init__(self, proj_name): Crawler.__init__(self, proj_name) self.name = "36氪" self.root_url = "http://36kr.com"
def __init__(self): Crawler.__init__(self) self.HOST = "http://www.chinawutong.com" self.prefix = "/103.html?pid="
def __init__(self, proj_name): Crawler.__init__(self, proj_name) self.name = "网易科技" self.root_url = "http://tech.163.com/gd/"
def __init__(self, proj_name): Crawler.__init__(self, proj_name) self.name = "极客公园" self.root_url = "http://www.geekpark.net"
def __init__(self, proj_name): Crawler.__init__(self, proj_name) self.name = "砍柴网" self.root_url = "http://www.ikanchai.com/"
def __init__(self): Crawler.__init__(self) self.HOST = "http://www.51yunli.com" self.prefix = "/goods/0/0/" self.suffix = "/0" self.MAX_PAGE = 7
def __init__(self): Crawler.__init__(self) self.HOST = "http://www.8glw.com" self.prefix = "/main_info.asp?id=1&page="
def __init__(self, dbname=""): Crawler.__init__(self, dbname)
def __init__(self): Crawler.__init__(self) self.HOST = "http://www.0256.cn" self.prefix = "/goods/?PageIndex="