Esempio n. 1
0
    def __init__(self, shelf, pswd):
        """Prepare crawler state backed by the ``'komm'`` entry of *shelf*.

        Args:
            shelf: Mapping that holds the persisted state under ``'komm'``.
            pswd: Forwarded unchanged to ``Crawler.__init__``.
        """
        Crawler.__init__(self, pswd)

        self.respage = Resultpage()
        self.article = Article()

        state = shelf['komm']
        self.storage = state

        # When the end was already reached for the current politician use the
        # fixed date; otherwise take the stored 'dateEnd' value.
        if state['end_reached'][state['politNum']]:
            self.dateend = '26.03.2017'
        else:
            self.dateend = state['dateEnd']

        # Search terms are percent-encoded Cyrillic names (the bytes decode
        # as Windows-1251), in this order: Nicolas Sarkozy, Francois
        # Hollande, Dmitry Medvedev, David Cameron, Vladimir Putin,
        # Angela Merkel, Theresa May.
        self.politicians = (
            '%ED%E8%EA%EE%EB%FF+%F1%E0%F0%EA%EE%E7%E8',
            '%F4%F0%E0%ED%F1%F3%E0+%EE%EB%EB%E0%ED%E4',
            '%E4%EC%E8%F2%F0%E8%E9+%EC%E5%E4%E2%E5%E4%E5%E2',
            '%E4%FD%E2%E8%E4+%EA%FD%EC%E5%F0%EE%ED',
            '%E2%EB%E0%E4%E8%EC%E8%F0+%EF%F3%F2%E8%ED',
            '%E0%ED%E3%E5%EB%E0+%EC%E5%F0%EA%E5%EB%FC',
            '%F2%E5%F0%E5%E7%E0+%EC%FD%E9',
        )

        self.data_format = '%Y-%m-%d'
        self.starting_page = 1

        # Start from an empty payload, then let update_payload() fill it in.
        self.payload = {}
        self.update_payload()
Esempio n. 2
0
 def __init__(self):
     """Set up the Instagram crawler: base state, data storage and logging."""
     Crawler.__init__(self)
     self.url = 'https://www.instagram.com'
     self.link_crawler = None

     # Create the data directory and files.
     self.create_data_storage()

     # Configure logging.
     Crawler.set_logs(
         'Instagram_Crawler_log',
         './logging/logfile_instagram.log',
     )
	def __init__(self, auth=None, urls=None, force_sync=False, config=None, api_limit=0):
		"""Initialise the crawler and its count/offset configs and recipe factory.

		Args:
			auth: Forwarded to ``Crawler.__init__``; a new empty dict when omitted.
			urls: Forwarded to ``Crawler.__init__``; a new empty dict when omitted.
			force_sync: Forwarded to ``Crawler.__init__``.
			config: Mapping that must provide ``'fetch_by_type'`` and
				``'filter_key'``; a new empty dict when omitted.
			api_limit: Forwarded to ``Crawler.__init__``.

		Raises:
			KeyError: If ``config`` lacks ``'fetch_by_type'`` or ``'filter_key'``
				(which includes the case where ``config`` is omitted).
		"""
		# Build fresh dicts per call: a ``{}`` default in the signature is
		# created once and would be shared (and possibly mutated) across
		# every instance constructed without that argument.
		auth = {} if auth is None else auth
		urls = {} if urls is None else urls
		config = {} if config is None else config

		Crawler.__init__(self, auth, urls, force_sync, config, api_limit)
		self._type = config['fetch_by_type']
		self._filter = config['filter_key']

		# NOTE(review): _config_strategy, _data_get_connector and
		# _data_strategy are not set in this method; presumably provided by
		# Crawler.__init__ - confirm.
		self._count_cfg = Config(storage=self._config_strategy, type='counts')
		self._offset_cfg = Config(storage=self._config_strategy, type='offsets')

		self._MAX_RESULT_PER_TARGET = 0
		self._recipe_factory = RecipeFactory(connector=self._data_get_connector, storage=self._data_strategy)
Esempio n. 4
0
 def __init__(self, proj_name):
     """Initialise the Leiphone crawler.

     Forwards ``proj_name`` to ``Crawler.__init__`` and records the site's
     display name, root URL and the HTTP headers kept on the instance.
     """
     Crawler.__init__(self, proj_name)
     self.name = "雷锋网"
     self.root_url = "http://www.leiphone.com"

     # Firefox-style request headers, inserted in the original key order.
     request_headers = {}
     request_headers['Host'] = 'www.leiphone.com'
     request_headers['User-Agent'] = (
         'Mozilla/5.0 (Windows NT 6.2; rv:16.0) Gecko/20100101 Firefox/16.0'
     )
     request_headers['Accept'] = (
         'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8'
     )
     request_headers['Connection'] = 'keep-alive'
     self.headers = request_headers
    def __init__(self):
        """Set up the Musinsa crawler: search URLs, data storage and logging."""
        Crawler.__init__(self)
        self.content_crawler = None

        # Disabled alternative: search sorted by product reviews.
        ##self.url = "https://store.musinsa.com/app/product/search?search_type=1&pre_q=&d_cat_cd=&brand=&rate=&page_kind=search&list_kind=small&sort=emt_high&page=%s&display_cnt=120&sale_goods=&ex_soldout=&color=&popup=&q=%s&price1=&price2="

        # Active search URL, sorted by lowest price. It carries two '%s'
        # slots: the page number first, then the query string.
        self.url = (
            "https://store.musinsa.com/app/product/search"
            "?search_type=1&pre_q=&d_cat_cd=&brand=&rate="
            "&page_kind=search&list_kind=small&sort=price_low"
            "&page=%s&display_cnt=120&sale_goods=&ex_soldout="
            "&color=&popup=&chk_research=&q=%s&chk_brand="
            "&price1=&price2=&chk_color=&chk_sale=&chk_soldout="
        )
        self.content_url = "https://store.musinsa.com"

        # Create the data directory and files.
        self.create_data_storage()

        # Configure logging.
        log_name = 'Musinsa_Crawler_log'
        log_path = './logging/logfile_musinsa.log'
        Crawler.set_logs(log_name, log_path)
Esempio n. 6
0
    def __init__(self, student_id, password):
        """Store the student's credentials and log in immediately.

        Args:
            student_id: Value kept on the instance and sent as
                ``dfUsernameHidden``.
            password: Value kept on the instance and sent as
                ``dfPasswordHidden``.
        """
        # Base-class set-up comes first.
        Crawler.__init__(self)

        self.student_id, self.password = student_id, password

        # Form fields posted to the server for authentication.
        self.auth_data = dict(
            dfUsernameHidden=student_id,
            dfPasswordHidden=password,
        )

        # Log in right away and keep whatever login() reports as the status.
        login_result = self.login()
        self.status = login_result
Esempio n. 7
0
    def __init__(self, shelf, pswd=None):
        """Prepare crawler state backed by the ``'suedd'`` entry of *shelf*.

        Args:
            shelf: Mapping that holds the persisted state under ``'suedd'``.
            pswd: Forwarded unchanged to ``Crawler.__init__``; optional.
        """
        Crawler.__init__(self, pswd)

        self.respage = Resultpage()
        self.article = Article()

        state = shelf['suedd']
        self.storage = state

        # One search term per politician; the order is significant because
        # entries are addressed by index elsewhere.
        self.politicians = (
            'sarkozy',
            'hollande',
            'medwedew',
            'cameron',
            'putin',
            'merkel',
            'theresa+AND+may',
        )

        self.starting_page = 1

        # NOTE(review): update_payload() runs before data_format is assigned;
        # confirm it does not need data_format.
        self.update_payload()

        self.data_format = '%d.%m.%Y'
Esempio n. 8
0
    def __init__(self, shelf):
        """Prepare crawler state for the berliner-zeitung.de search endpoint.

        Args:
            shelf: Mapping that holds the persisted state under ``'ksta_de'``.
        """
        Crawler.__init__(self)

        self.respage = Resultpage()

        self.article = Article()

        # NOTE(review): the storage key is 'ksta_de' although the site below
        # is berliner-zeitung.de - confirm this sharing is intended.
        self.storage = shelf['ksta_de']

        # Fixed: the separator after 'merkel' was a period instead of a
        # comma, which is a syntax error and dropped 'theresa+may' as an entry.
        self.politicians = ('sarkozy', 'hollande', 'dmitri|dmitrij+medwedew',
                            'david+cameron', 'putin', 'merkel', 'theresa+may')

        self.site = r'http://www.berliner-zeitung.de/action/berliner-zeitung/4484314/search?'

        self.starting_page = 0

        self.data_format = '%Y-%m-%d'

        self.update_payload()
Esempio n. 9
0
    def __init__(self, shelf, pswd):
        """Prepare crawler state backed by the ``'vz'`` entry of *shelf*.

        Args:
            shelf: Mapping that holds the persisted state under ``'vz'``.
            pswd: Forwarded unchanged to ``Crawler.__init__``.
        """
        Crawler.__init__(self, pswd)

        self.respage = Resultpage()
        self.article = Article()

        state = shelf['vz']
        self.storage = state

        # Percent-encoded search terms; the bytes decode as Windows-1251
        # Cyrillic names (Sarkozy, Hollande, Medvedev, Cameron, Putin,
        # Merkel, May, in that order).
        self.politicians = (
            '%ED%E8%EA%EE%EB%FF+%F1%E0%F0%EA%EE%E7%E8',
            '%F4%F0%E0%ED%F1%F3%E0+%EE%EB%EB%E0%ED%E4',
            '%E4%EC%E8%F2%F0%E8%E9+%EC%E5%E4%E2%E5%E4%E5%E2',
            '%E4%FD%E2%E8%E4+%EA%FD%EC%E5%F0%EE%ED',
            '%E2%EB%E0%E4%E8%EC%E8%F0+%EF%F3%F2%E8%ED',
            '%E0%ED%E3%E5%EB%E0+%EC%E5%F0%EA%E5%EB%FC',
            '%F2%E5%F0%E5%E7%E0+%EC%FD%E9',
        )

        self.data_format = '%Y-%m-%d'
        self.starting_page = 1

        # Start from an empty payload, then let update_payload() fill it in.
        self.payload = {}
        self.update_payload()
Esempio n. 10
0
    def __init__(self, shelf, pswd):
        """Prepare crawler state backed by the ``'lemonde'`` entry of *shelf*.

        Args:
            shelf: Mapping that holds the persisted state under ``'lemonde'``.
            pswd: Forwarded unchanged to ``Crawler.__init__``.
        """
        Crawler.__init__(self, pswd)

        self.respage = Resultpage()
        self.article = Article()

        state = shelf['lemonde']
        self.storage = state

        self.politicians = (
            'nicolas sarkozy',
            'francois hollande',
            'dmitry medvedev',
            'david cameron',
            'vladimir putin',
            'angela merkel',
            'theresa may',
        )

        self.starting_page = 1
        self.data_format = '%Y-%m-%d'

        current_politician = self.politicians[state['politNum']]

        # NOTE(review): the URL below contains no '{}' placeholder, so
        # format() returns it unchanged and the politician is not inserted.
        # Kept as-is; confirm where the search keyword was meant to go.
        search_url = r'http://www.lemonde.fr/recherche/?operator=and&exclude_keywords=&qt=recherche_texte_titre&author=&period=custom_date&start_day=01&start_month=01&start_year=2000&end_day=28&end_month=03&end_year=2017&sort=desc'
        self.site = search_url.format(current_politician)
Esempio n. 11
0
    def __init__(self, shelf, pswd):
        """Prepare crawler state for the ksta.de search endpoint.

        Args:
            shelf: Mapping that holds the persisted state under ``'ksta_de'``.
            pswd: Forwarded unchanged to ``Crawler.__init__``.
        """
        Crawler.__init__(self, pswd)

        self.respage = Resultpage()
        self.article = Article()

        state = shelf['ksta_de']
        self.storage = state

        self.politicians = (
            'sarkozy',
            'hollande',
            'medwedew',
            'cameron',
            'putin',
            'merkel',
            'theresa+may',
        )

        # Search endpoint; ends with '?' so query parameters can follow.
        self.site = r'http://www.ksta.de/action/ksta/4484314/search?'

        self.data_format = '%Y-%m-%d'
        self.starting_page = 0

        self.update_payload()
Esempio n. 12
0
    def __init__(self, shelf, pswd):
        """Prepare crawler state backed by the ``'independent'`` shelf entry.

        Args:
            shelf: Mapping that holds the persisted state under
                ``'independent'``.
            pswd: Forwarded unchanged to ``Crawler.__init__``.
        """
        Crawler.__init__(self, pswd)

        self.respage = Resultpage()
        self.article = Article()

        state = shelf['independent']
        self.storage = state

        self.politicians = (
            'nicolas sarkozy',
            'francois hollande',
            'dmitry medvedev',
            'david cameron',
            'vladimir putin',
            'angela merkel',
            'theresa may',
        )

        # The site search URL embeds the currently selected politician.
        current_politician = self.politicians[state['politNum']]
        url_template = r'http://www.independent.co.uk/search/site/{}'
        self.site = url_template.format(current_politician)

        self.data_format = '%Y-%m-%d'
        self.starting_page = 0

        self.update_payload()
Esempio n. 13
0
    def __init__(self, shelf, pswd):
        """Prepare crawler state backed by the ``'spiegel'`` entry of *shelf*.

        Args:
            shelf: Mapping that holds the persisted state under ``'spiegel'``.
            pswd: Forwarded unchanged to ``Crawler.__init__``.
        """
        Crawler.__init__(self, pswd)

        self.respage = Resultpage()
        self.article = Article()

        state = shelf['spiegel']
        self.storage = state

        self.politicians = (
            'nicolas_sarkozy',
            'francois_hollande',
            'dmitrij_medwedew',
            'david_cameron',
            'wladimir_putin',
            'angela_merkel',
            'theresa_may',
        )

        self.starting_page = 1
        self.data_format = '%d.%m.%Y'

        # Topic archive URL: politician slug plus a page number that is never
        # lower than starting_page.
        topic_slug = self.politicians[state['politNum']]
        page_number = max(self.starting_page, state['pn'])
        url_template = r'http://www.spiegel.de/thema/{}/dossierarchiv-{}.html'
        self.site = url_template.format(topic_slug, page_number)

        self.payload = None
Esempio n. 14
0
    def __init__(self, shelf, pswd):
        """Prepare crawler state backed by the ``'guardian'`` entry of *shelf*.

        Args:
            shelf: Mapping that holds the persisted state under ``'guardian'``.
            pswd: Forwarded unchanged to ``Crawler.__init__``.
        """
        Crawler.__init__(self, pswd)

        self.respage = Resultpage()
        self.article = Article()

        state = shelf['guardian']
        self.storage = state

        self.politicians = (
            'nicolas-sarkozy',
            'francois-hollande',
            'dmitry-medvedev',
            'davidcameron',
            'vladimir-putin',
            'angela-merkel',
            'theresamay',
        )

        # Indices into self.politicians that use the 'politics' section
        # rather than 'world' (here: 'davidcameron' and 'theresamay').
        self.local = (3, 6)

        self.starting_page = 1
        self.data_format = '%Y-%m-%d'

        if state['politNum'] in self.local:
            section = 'politics'
        else:
            section = 'world'
        topic_slug = self.politicians[state['politNum']]
        self.site = r'https://www.theguardian.com/{}/{}?'.format(section, topic_slug)
Esempio n. 15
0
    def __init__(self, login_id=None, last_name=None, pin=None):
        """Store the login form data, log in, and fetch books on success.

        Args:
            login_id: Sent as the ``loginId`` form field.
            last_name: Sent as the ``lastName`` form field.
            pin: Sent as the ``pin`` form field.
        """
        # Base-class set-up comes first.
        Crawler.__init__(self)

        # Form fields posted to the server for authentication.
        form_fields = {}
        form_fields['loginType'] = 'B'
        form_fields['loginId'] = login_id
        form_fields['lastName'] = last_name
        form_fields['pin'] = pin
        form_fields['page.logIn.library'] = '1@VYKDB20011102005217'
        self.auth_data = form_fields

        self.books = None
        self.content = ''

        # Log in right away; books are loaded only when login() returned
        # exactly True.
        self.status = self.login()
        if self.status is not True:
            return
        self.books = self.get_books()
Esempio n. 16
0
 def __init__(self):
     """Initialise the base crawler and the 56110.cn host and path."""
     Crawler.__init__(self)
     # NOTE(review): presumably HOST + suffix forms the listing URL - confirm
     # against the code that reads these attributes.
     self.suffix = '/Huo/list.html'
     self.HOST = 'http://56110.cn'
Esempio n. 17
0
 def __init__(self, start_url=START_URL):
     """Initialise the base crawler and an empty task list.

     Args:
         start_url: Forwarded to ``Crawler.__init__``; defaults to the
             module-level ``START_URL``.
     """
     Crawler.__init__(self, start_url)
     # Starts empty; what gets stored here is decided elsewhere in the class.
     self.tasks = []
Esempio n. 18
0
 def __init__(self):
     """Initialise the base crawler and the wb.56888.net host and path."""
     Crawler.__init__(self)
     # The prefix ends with 'p='; presumably a page number is appended by
     # the caller - confirm.
     self.prefix = '/OutSourceList.aspx?tendertype=4&p='
     self.HOST = 'http://wb.56888.net'
Esempio n. 19
0
 def __init__(self):
     """Initialise the base crawler with the Google search URL and params."""
     Crawler.__init__(self)
     # Fixed query parameters stored alongside the search endpoint.
     self.params = dict(tbs='li:1')
     self.url = 'http://www.google.com/search'
Esempio n. 20
0
 def __init__(self, config):
     """Initialise the crawler by delegating to ``Crawler.__init__``.

     Args:
         config: Forwarded unchanged to the base initialiser.
     """
     Crawler.__init__(self, config)
 def __init__(self, proj_name):
     """Initialise the Huxiu crawler.

     Forwards ``proj_name`` to ``Crawler.__init__`` and records the site's
     display name and root URL.
     """
     Crawler.__init__(self, proj_name)
     site_name, site_root = "虎嗅", "http://www.huxiu.com"
     self.name = site_name
     self.root_url = site_root
Esempio n. 22
0
 def __init__(self):
     """Initialise the base crawler and the fala56.com host and paths."""
     Crawler.__init__(self)
     self.HOST = 'http://fala56.com'
     # NOTE(review): presumably combined as HOST + prefix + ... + suffix by
     # the caller - confirm against the code that reads these attributes.
     self.suffix = '/GoodsLandList.aspx?area=-1'
     self.prefix = '/Views/Huoyuan'
 def __init__(self, proj_name):
     """Initialise the 36Kr crawler.

     Forwards ``proj_name`` to ``Crawler.__init__`` and records the site's
     display name and root URL.
     """
     Crawler.__init__(self, proj_name)
     site_name, site_root = "36氪", "http://36kr.com"
     self.name = site_name
     self.root_url = site_root
Esempio n. 24
0
 def __init__(self):
     """Initialise the base crawler and the chinawutong.com host and path."""
     Crawler.__init__(self)
     # The prefix ends with 'pid='; presumably an id or page value is
     # appended by the caller - confirm.
     self.prefix = '/103.html?pid='
     self.HOST = 'http://www.chinawutong.com'
Esempio n. 25
0
 def __init__(self, proj_name):
     """Initialise the crawler for tech.163.com.

     Forwards ``proj_name`` to ``Crawler.__init__`` and records the site's
     display name and root URL.
     """
     Crawler.__init__(self, proj_name)
     site_name, site_root = "网易科技", "http://tech.163.com/gd/"
     self.name = site_name
     self.root_url = site_root
 def __init__(self, proj_name):
     """Initialise the GeekPark crawler.

     Forwards ``proj_name`` to ``Crawler.__init__`` and records the site's
     display name and root URL.
     """
     Crawler.__init__(self, proj_name)
     site_name, site_root = "极客公园", "http://www.geekpark.net"
     self.name = site_name
     self.root_url = site_root
Esempio n. 27
0
 def __init__(self, proj_name):
     """Initialise the crawler for ikanchai.com.

     Forwards ``proj_name`` to ``Crawler.__init__`` and records the site's
     display name and root URL.
     """
     Crawler.__init__(self, proj_name)
     site_name, site_root = "砍柴网", "http://www.ikanchai.com/"
     self.name = site_name
     self.root_url = site_root
 def __init__(self):
     """Initialise the base crawler and the 51yunli.com URL parts and page cap."""
     Crawler.__init__(self)
     self.HOST = 'http://www.51yunli.com'
     # NOTE(review): presumably a page number goes between prefix and
     # suffix, bounded by MAX_PAGE - confirm against the caller.
     self.prefix, self.suffix = '/goods/0/0/', '/0'
     self.MAX_PAGE = 7
 def __init__(self):
     """Initialise the base crawler and the chinawutong.com host and path."""
     Crawler.__init__(self)
     # The prefix ends with 'pid='; presumably an id or page value is
     # appended by the caller - confirm.
     self.prefix = '/103.html?pid='
     self.HOST = 'http://www.chinawutong.com'
Esempio n. 30
0
 def __init__(self):
     """Initialise the base crawler and the 56110.cn host and path."""
     Crawler.__init__(self)
     # NOTE(review): presumably HOST + suffix forms the listing URL - confirm
     # against the code that reads these attributes.
     self.suffix = '/Huo/list.html'
     self.HOST = 'http://56110.cn'
 def __init__(self):
     """Initialise the base crawler and the 8glw.com host and path."""
     Crawler.__init__(self)
     # The prefix ends with 'page='; presumably a page number is appended
     # by the caller - confirm.
     self.prefix = '/main_info.asp?id=1&page='
     self.HOST = 'http://www.8glw.com'
Esempio n. 32
0
	def __init__(self, dbname=""):
		"""Initialise the crawler by delegating to ``Crawler.__init__``.

		Args:
			dbname: Forwarded unchanged to the base initialiser; defaults to
				an empty string.
		"""
		Crawler.__init__(self, dbname)
Esempio n. 33
0
 def __init__(self):
     """Initialise the base crawler and the 0256.cn host and path."""
     Crawler.__init__(self)
     # The prefix ends with 'PageIndex='; presumably a page number is
     # appended by the caller - confirm.
     self.prefix = '/goods/?PageIndex='
     self.HOST = 'http://www.0256.cn'
Esempio n. 34
0
 def __init__(self):
     """Initialise the base crawler and the fala56.com host and paths."""
     Crawler.__init__(self)
     self.HOST = 'http://fala56.com'
     # NOTE(review): presumably combined as HOST + prefix + ... + suffix by
     # the caller - confirm against the code that reads these attributes.
     self.suffix = '/GoodsLandList.aspx?area=-1'
     self.prefix = '/Views/Huoyuan'
Esempio n. 35
0
 def __init__(self):
     """Initialise the base crawler and the wb.56888.net host and path."""
     Crawler.__init__(self)
     # The prefix ends with 'p='; presumably a page number is appended by
     # the caller - confirm.
     self.prefix = '/OutSourceList.aspx?tendertype=4&p='
     self.HOST = 'http://wb.56888.net'
 def __init__(self):
     """Initialise the base crawler and the 0256.cn host and path."""
     Crawler.__init__(self)
     # The prefix ends with 'PageIndex='; presumably a page number is
     # appended by the caller - confirm.
     self.prefix = '/goods/?PageIndex='
     self.HOST = 'http://www.0256.cn'