# -*- coding: utf-8 -*-
# Standard-library imports used by the classes below; the project-internal
# helpers (Settings, FileIOMiddleware, Doraemon, BrowserRequest) are assumed
# to be importable from the surrounding package.
import os
import re
import gc
import time
from lxml import etree


class UploadMongoData():
    def __init__(self):
        self.settings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.log_path = self.settings.LOG_PATH
        self.doraemon.createFilePath(self.log_path)

    def startUpload(self):
        fromFile = self.settings.LOCAL_MONGO_DATA_PATH
        toFile = self.settings.REMOTE_MONGO_DATA_PATH
        if not os.path.exists(fromFile):
            print 'No mongo data file to upload'
            return
        # Keep retrying until the local dump has been uploaded and removed.
        while os.path.exists(fromFile):
            try:
                if self.doraemon.sshUpload(
                        self.settings.IP_WEBSERVER0,
                        self.settings.PORT_WEBSERVER0,
                        self.settings.USER_ROOT_WEBSERVER0,
                        self.settings.USER_ROOT_PASSWORD_WEBSERVER0,
                        fromFile, toFile):
                    # Delete the local file only after a successful upload.
                    self.doraemon.deleteFile(fromFile)
                    message1 = 'Success to upload mongo data file: {0}'.format(fromFile)
                    print message1
                    self.file.logger(self.log_path, message1)
            except Exception as e:
                message2 = 'Exception {0} to upload mongo data file: {1}'.format(e.message, fromFile)
                print message2
                self.file.logger(self.log_path, message2)
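
# Minimal usage sketch for UploadMongoData, assuming the module is run as a
# standalone job; every path and credential comes from Settings, nothing else
# is configured here.
if __name__ == '__main__':
    uploader = UploadMongoData()
    uploader.startUpload()
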
class FengReceptorContent():
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "feng_receptor_content"
        self.finished_ids = "feng_receptor_content"
        self.log_path = "/home/dev/Data/rsyncData/test/"

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        href_contents = html.xpath(".//*[contains(@class, 'newLine-4rktaWav')]")
        if len(href_contents) == 0:
            print 'No data for: {0}'.format(key)
            return
        texts = href_contents[0].xpath(".//*[contains(@class, 'time-RyJJYUOX')]/text()")
        time_source = ''.join(texts).strip()
        self.doraemon.hashSet(self.finished_ids, current_url, current_url)
        data = {'id': key, 'url': current_url, 'date': time_source}
        print 'Start to store mongo {0}'.format(data['url'])
        self.doraemon.storeMongodb(self.mongo, data)
        print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        file_path = '/home/dev/Data/rsyncData/test/feng_receptor.csv'
        items = self.file.readFromCSV(file_path)
        items.pop(0)  # drop the CSV header row
        for item in items:
            key = item[0]
            if key not in all_finished_id:
                name = key.strip()
                url = item[1]
                new_urls.append([url, name])
        if len(new_urls) == 0:
            print 'No more urls.'
            return
        request = BrowserRequest()
        request.start_chrome(new_urls, 2, self.log_path, None, callback=self.parse)
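
# Sketch of the CSV layout start_requests() above expects: a header row
# (dropped by items.pop(0)) followed by one (name, url) pair per row, matching
# the item[0]/item[1] access pattern. The path and sample row below are
# illustrative assumptions, not values used by the class.
import csv

def write_sample_receptor_csv(path='/tmp/feng_receptor_sample.csv'):
    with open(path, 'wb') as f:  # binary mode for the Python 2 csv module
        writer = csv.writer(f)
        writer.writerow(['name', 'url'])
        writer.writerow(['some_author', 'https://example.com/author/123456'])
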
class ChuansongmeReceptor():
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/"
        self.mongo = "gongzhonghao_test"
        self.finished_ids = "gongzhonghao_test"
        self.log_path = "/home/dev/Data/rsyncData/"

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        # Search the whole tree; './*' would only inspect direct children of
        # the root element and never match the article list items.
        href_items = html.xpath(".//*[contains(@class, 'pagedlist_item')]")
        if len(href_items) == 0:
            print 'No data for: {0}'.format(key)
            return
        self.doraemon.hashSet(self.finished_ids, key, key)
        data = {'id': key, 'url': current_url}
        print 'Start to store mongo {0}'.format(data['url'])
        self.doraemon.storeMongodb(self.mongo, data)
        print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/gongzhonghao_test.txt'
        gongzhonghao = self.file.readFromTxt(txt_path)
        keys = gongzhonghao.split('\n')
        for key in keys:
            if key not in all_finished_id:
                tmp_url = "https://chuansongme.com/account/{0}".format(key)
                new_urls.append([tmp_url, key])
        if len(new_urls) == 0:
            print 'No more urls.'
            return
        request = BrowserRequest()
        request.start_chrome(new_urls, 2, self.log_path, None, callback=self.parse)
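
# The key-to-URL expansion from start_requests() above, pulled out as a pure
# helper so the dedupe logic is easy to test in isolation; a sketch only, the
# class keeps its own inline version.
def build_chuansongme_urls(keys, finished_ids):
    # Return [url, key] pairs for every non-empty account key not yet finished.
    new_urls = []
    for key in keys:
        if key and key not in finished_ids:
            new_urls.append(["https://chuansongme.com/account/{0}".format(key), key])
    return new_urls

# build_chuansongme_urls(['demo_account'], set())
# -> [['https://chuansongme.com/account/demo_account', 'demo_account']]
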
class XueqiuReceptor():
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "xueqiu_test"
        self.finished_ids = "xueqiu_test"
        self.log_path = "/home/dev/Data/rsyncData/test/"

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        href_contents = html.xpath(".//*[contains(@class, 'search__user__card__content')]")
        if len(href_contents) == 0:
            print 'No data for: {0}'.format(key)
            return
        for item in href_contents:
            href = item.xpath(".//*[contains(@class, 'user-name')]/@href")
            title_content = item.xpath(".//*[contains(@class, 'user-name')]//span/text()")
            title = "".join(title_content).strip()
            # Only accept the card whose user name exactly matches the search key.
            if len(href) > 0 and title == key:
                url = "https://xueqiu.com/u{0}".format(href[0])
                self.doraemon.hashSet(self.finished_ids, url, url)
                data = {'id': key, 'url': url}
                print 'Start to store mongo {0}'.format(data['url'])
                self.doraemon.storeMongodb(self.mongo, data)
                print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/xueqiu.txt'
        names = self.file.readFromTxt(txt_path)
        keys = names.split('\n')
        for key in keys:
            if key not in all_finished_id:
                name = key.strip()
                tmp_url = "https://xueqiu.com/k?q={0}".format(name)
                new_urls.append([tmp_url, name])
        if len(new_urls) == 0:
            print 'No more urls.'
            return
        request = BrowserRequest()
        request.start_chrome(new_urls, 5, self.log_path, None, callback=self.parse)
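
# Self-contained demonstration of the contains(@class, ...) XPath pattern used
# in parse() above; the inline HTML is a made-up stand-in for one search
# result card, not real xueqiu.com markup.
def demo_xueqiu_xpath():
    sample = etree.HTML(
        '<div class="search__user__card__content">'
        '<a class="user-name" href="/1234567890"><span>demo</span></a></div>')
    cards = sample.xpath(".//*[contains(@class, 'search__user__card__content')]")
    for card in cards:
        href = card.xpath(".//*[contains(@class, 'user-name')]/@href")
        title = "".join(card.xpath(".//*[contains(@class, 'user-name')]//span/text()")).strip()
        print href, title  # ['/1234567890'] demo
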
class WoshipmReceptor():
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "woshipm_receptor"
        self.finished_ids = "woshipm_receptor"
        self.log_path = "/home/dev/Data/rsyncData/test/"
        # Profile links look like /u/<digits>.
        self.regx = re.compile("/u/[0-9]{0,}")

    def parse(self, response):
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        key = response['request_title'].strip()
        # Search the whole tree for anchors; './a' would only inspect direct
        # children of the root element and never match.
        href_contents = html.xpath(".//a")
        if len(href_contents) == 0:
            print 'No data for: {0}'.format(key)
            return
        for item in href_contents:
            href = item.xpath("@href")
            title_content = item.xpath(".//text()")
            title = "".join(title_content).strip()
            if len(href) > 0 and title == key:
                isValidUrl = self.regx.match(href[0])
                if isValidUrl is None:
                    print 'Invalid url for not match: {0}'.format(href[0])
                    continue
                url = "http://www.woshipm.com{0}".format(href[0])
                self.doraemon.hashSet(self.finished_ids, url, url)
                data = {'id': key, 'url': url}
                print 'Start to store mongo {0}'.format(data['url'])
                self.doraemon.storeMongodb(self.mongo, data)
                print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/woshipm_receptor.txt'
        names = self.file.readFromTxt(txt_path)
        keys = names.split('\n')
        for key in keys:
            key = key.strip()
            if key not in all_finished_id:
                name = key.strip()
                tmp_url = "http://www.woshipm.com/search-posts?k={0}".format(name)
                new_urls.append([tmp_url, name])
            else:
                print 'Finished or no data for {0}'.format(key)
                self.doraemon.hashSet(self.finished_ids, key, key)
        if len(new_urls) == 0:
            print 'No more urls.'
            return
        request = BrowserRequest()
        request.start_chrome(new_urls, 2, self.log_path, None, callback=self.parse)
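
# Behavior check for the profile-URL filter above: '{0,}' is equivalent to
# '*', so the pattern also accepts a bare '/u/' with no digits; '/u/[0-9]+'
# would require at least one digit.
def demo_woshipm_regx():
    regx = re.compile("/u/[0-9]{0,}")
    print regx.match('/u/12345') is not None  # True
    print regx.match('/u/') is not None       # True: zero digits still matches
    print regx.match('/tag/abc') is not None  # False
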
class FengReceptor():
    def __init__(self):
        self.settings = Settings()
        self.getSettings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.doraemon.createFilePath(self.work_path_prd2)

    def getSettings(self):
        self.work_path_prd2 = "/home/dev/Data/rsyncData/test/"
        self.mongo = "feng_receptor"
        self.finished_ids = "feng_receptor"
        self.log_path = "/home/dev/Data/rsyncData/test/"

    def parse(self, response):
        time.sleep(1)
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        key = response['request_title'].strip()
        raw = response['response'].page_source.encode('utf-8')
        # Unwrap the JSONP payload: drop the 'getData(' prefix and the
        # fixed-length trailing wrapper, then map JSON null to Python None
        # so the text can be evaluated as a Python literal.
        raw_json = raw[raw.find('(') + 1:-21]
        raw_json = raw_json.replace('null', 'None')
        dics = eval(raw_json)
        if len(dics['items']) == 0:
            print 'No data for: {0}'.format(key)
            self.doraemon.hashSet(self.finished_ids, key, key)
            return
        for item in dics['items']:
            # Strip the <em>...</em> highlight markup and escaped slashes
            # from the author name returned by the search API.
            name = item['name'].replace('<', '').replace('em>', '').replace('\\/', '')
            author_id = item['id']
            if len(author_id) > 0 and name == key:
                url = "https://feng.ifeng.com/author/{0}".format(author_id)
                self.doraemon.hashSet(self.finished_ids, key, key)
                data = {'id': key, 'url': url}
                print 'Start to store mongo {0}'.format(data['url'])
                self.doraemon.storeMongodb(self.mongo, data)
                print 'Finished for {0}'.format(key)

    def start_requests(self):
        print 'Start requests'
        new_urls = []
        all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids))
        txt_path = '/home/dev/Data/rsyncData/test/feng_receptor.txt'
        names = self.file.readFromTxt(txt_path)
        keys = names.split('\n')
        for key in keys:
            key = key.strip()
            if key not in all_finished_id:
                name = key.strip()
                tmp_url = "https://so.v.ifeng.com/websearch/ifeng-search-server/sub/websearch?k={0}&page=1&distinct=1&n=10&hl=1&os=ios&gv=6.2.5&uid=70b6a1d8f6c64618bf9dfa092fc4e34c&callback=getData".format(name)
                new_urls.append([tmp_url, name])
            else:
                print 'Finished or no data for {0}'.format(key)
                self.doraemon.hashSet(self.finished_ids, key, key)
        if len(new_urls) == 0:
            print 'No more urls.'
            return
        request = BrowserRequest()
        request.start_chrome(new_urls, 5, self.log_path, None, callback=self.parse)
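
# The JSONP unwrapping in parse() above relies on eval() plus a textual
# null -> None rewrite. A sketch of a stricter alternative using the stdlib
# json module; the sample payload and the rfind-based trimming are assumptions
# (the class itself trims a fixed-length tail with [:-21]).
import json

def parse_jsonp(raw):
    # Strip the 'getData(' prefix and the trailing ')' wrapper, then parse the
    # remainder as real JSON, which handles null/true/false natively.
    body = raw[raw.find('(') + 1:raw.rfind(')')]
    return json.loads(body)

# parse_jsonp('getData({"items": []})') -> {u'items': []}
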
class CamelBone():
    def __init__(self, siteinfo=None, callback=callable):
        self.siteinfo = siteinfo
        self.callBack = callback
        self.globalSettings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.getSettings()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.log_path = self.globalSettings.LOG_PATH_PRD2
        self.today = self.globalSettings.TODAY
        self.source = self.settings.SOURCE_NAME
        self.work_path_prd2 = self.settings.WORK_PATH_PRD2
        self.mongo = self.settings.MONGO_URLS
        self.name = self.settings.NAME
        self.max_pool_size = self.settings.MAX_POOL_SIZE_URL
        self.urls = self.settings.URLS
        self.max_concurrency = self.globalSettings.MAX_CONCURRENCY
        self.concurrency_file = self.globalSettings.CONCURRENCY_FILE
        self.url_backup_folder_path = self.settings.URL_BACKUP_FOLDER_PATH
        self.url_timeout = self.settings.URL_TIMEOUT
        self.createPath()

    def createPath(self):
        self.doraemon.createFilePath(self.work_path_prd2)
        self.doraemon.createFilePath(self.log_path)
        self.doraemon.createFilePath(self.url_backup_folder_path)

    def parse(self, response):
        time.sleep(1)
        current_url = response['response'].current_url.encode('gbk')
        print 'Start to parse: {0}'.format(current_url)
        html = etree.HTML(response['response'].page_source)
        results = self.callBack(current_url, html)
        if len(results) == 0:
            message1 = 'No url for page: {0}'.format(current_url)
            self.file.logger(self.log_path, message1)
            print message1
        for item in results:
            is_title_empty = self.doraemon.isEmpty(item.title)
            # Store only items with a title that has not been seen before.
            if (is_title_empty is False) and (self.doraemon.isDuplicated(self.doraemon.bf_urls, item.title) is False):
                message2 = 'Start to store mongo {0}'.format(item.url)
                self.file.logger(self.log_path, message2)
                print message2
                self.doraemon.storeMongodb(self.mongo, self.doraemon.createCamelMongoJson(item))
                message3 = 'End to store mongo {0}'.format(item.url)
                self.file.logger(self.log_path, message3)
                print message3
                self.file.logger(self.log_path, 'Done for {0}'.format(item.url))
            else:
                if is_title_empty is True:
                    message4 = 'Empty title for {0}'.format(item.url)
                    self.file.logger(self.log_path, message4)
                    print message4
                else:
                    print 'Finished title for {0}'.format(item.url)
        print 'End to parse {0}'.format(current_url)
        del current_url, results, html
        gc.collect()

    def start(self, isdebug=False):
        if self.doraemon.isCamelReadyToRun(self.settings) is False and isdebug is False:
            message5 = 'It is not ready to run for {0}'.format(self.name)
            print message5
            return
        message6 = 'Start {0} requests'.format(self.name)
        self.file.logger(self.log_path, message6)
        print message6
        new_urls = []
        content = self.file.readFromTxt(self.urls)
        url_list = content.split('\n')
        for url in url_list:
            if self.doraemon.isEmpty(url) is False:
                new_urls.append([url, ''])
        if len(new_urls) == 0:
            print 'No url.'
            return
        request = BrowserRequest()
        content = request.start_chrome(new_urls, self.url_timeout, self.max_pool_size, self.log_path, None, callback=self.parse)
        self.doraemon.recoveryConcurrency(self.concurrency_file, self.max_concurrency)
        message7 = 'End for {0} requests of {1}.'.format(str(len(content)), self.name)
        self.file.logger(self.log_path, message7)
        print message7
        del new_urls, content, url_list, request
        gc.collect()
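
# CamelBone calls callback(current_url, html) and expects a list of objects
# exposing .title and .url (see parse() above). A minimal sketch of such a
# callback; the XPath and the CamelItem container are illustrative
# assumptions, and a real callback must return whatever
# createCamelMongoJson() accepts.
from collections import namedtuple

CamelItem = namedtuple('CamelItem', ['title', 'url'])

def demo_url_callback(current_url, html):
    items = []
    for a in html.xpath(".//a[@href]"):
        title = "".join(a.xpath(".//text()")).strip()
        items.append(CamelItem(title=title, url=a.xpath("@href")[0]))
    return items

# Wiring (siteinfo comes from the project's settings machinery):
# CamelBone(siteinfo=some_siteinfo, callback=demo_url_callback).start()
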
class SpiderBone():
    def __init__(self, siteinfo=None, callback=callable):
        self.siteinfo = siteinfo
        self.callBack = callback
        self.globalSettings = Settings()
        self.file = FileIOMiddleware()
        self.doraemon = Doraemon()
        self.getSettings()

    def getSettings(self):
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.log_path = self.globalSettings.LOG_PATH
        self.today = self.globalSettings.TODAY
        self.source = self.settings.SOURCE_NAME
        self.work_path_prd1 = self.settings.WORK_PATH_PRD1
        self.finished_txt_path = self.settings.FINISHED_TXT_PATH
        self.finished_html_path = self.settings.FINISHED_HTML_PATH
        self.finished_image_path = self.settings.FINISHED_IMG_PATH
        self.template_path = self.globalSettings.TEMPLATE_PATH
        self.article_url = self.globalSettings.ARTICLE_URL
        self.ali_domain = self.globalSettings.ALI_DOMAIN
        self.ali_domain_deepinews = self.globalSettings.ALI_BUCKET_NAME_DEEPINEWS
        self.ali_domain_deepinews_img = self.globalSettings.ALI_BUCKET_NAME_DEEPINEWS_IMG
        self.ip_webserver0 = self.globalSettings.IP_WEBSERVER0
        self.port_webserver0 = self.globalSettings.PORT_WEBSERVER0
        self.user_root_webserver0 = self.globalSettings.USER_ROOT_WEBSERVER0
        self.user_root_password_webserver0 = self.globalSettings.USER_ROOT_PASSWORD_WEBSERVER0
        self.html_webserver0 = self.globalSettings.HTML_WEBSERVER0
        self.mongo = self.settings.MONGO
        self.name = self.settings.NAME
        self.max_pool_size = self.settings.MAX_POOL_SIZE_CONTENT
        self.url_path = self.settings.URL_PATH
        self.is_open_cache = self.settings.IS_OPEN_CACHE
        self.finished_backup_folder_path = self.settings.FINISHED_BACKUP_FOLDER_PATH
        self.max_concurrency_spider = self.globalSettings.MAX_CONCURRENCY_SPIDER
        self.concurrency_file_spider = self.globalSettings.CONCURRENCY_FILE_SPIDER
        self.monitor_upload_local = self.globalSettings.MONITOR_UPLOAD_LOCAL
        self.local_html_path = self.globalSettings.LOCAL_HTML_PATH
        self.content_timeout = self.settings.CONTENT_TIMEOUT
        self.createPath()

    def createPath(self):
        self.doraemon.createFilePath(self.work_path_prd1)
        self.doraemon.createFilePath(self.log_path)
        self.doraemon.createFilePath(self.finished_backup_folder_path)
        self.doraemon.createFilePath(self.monitor_upload_local)
        self.doraemon.createFilePath(self.local_html_path)

    def parse(self, response):
        time.sleep(1)
        current_url = response['response'].current_url.encode('gbk')
        request_title = response['request_title']
        print 'Start to parse: {0}'.format(current_url)
        page_source = response['response'].page_source
        html = etree.HTML(page_source)
        results = None
        dataToMongo = None
        try:
            results = self.callBack(current_url, html, page_source)
            if results is None:
                message1 = 'No content for: {0}'.format(current_url)
                print message1
                self.file.logger(self.log_path, message1)
                return
            dataToMongo = self.doraemon.createSpiderMongoJson(results)
        except Exception as e:
            message1 = 'Exception when parse: {0} for {1}'.format(current_url, e.message)
            print message1
            self.file.logger(self.log_path, message1)
            # Treat a failed parse as no data, so the flow below does not
            # try to store a half-built record.
            results = None
        print 'End to parse: {0}'.format(current_url)
        if results is None:
            self.doraemon.storeFinished(self.doraemon.bf_content, request_title)
            print 'No data for {0}'.format(request_title)
        else:
            message2 = 'Start to store mongo {0}'.format(results.url)
            self.file.logger(self.log_path, message2)
            print message2
            self.doraemon.storeMongodb(self.mongo, dataToMongo)
            message3 = 'End to store mongo {0}'.format(results.url)
            self.file.logger(self.log_path, message3)
            print message3
            self.doraemon.storeTxt(results.id, results.content, self.finished_txt_path, self.name)
            self.doraemon.storeFinished(self.doraemon.bf_content, request_title)
    def start(self):
        if self.doraemon.isSpiderReadyToRun() is False:
            message4 = 'It is not ready to run spider: {0}'.format(self.name)
            print message4
            return
        message5 = 'Start {0} requests'.format(self.name)
        self.file.logger(self.log_path, message5)
        print message5
        new_url_titles = self.doraemon.readNewUrls(self.doraemon.bf_content, self.url_path)
        if len(new_url_titles) == 0:
            self.doraemon.recoveryConcurrency(self.concurrency_file_spider, self.max_concurrency_spider)
            message7 = 'No new url for {0}'.format(self.name)
            self.file.logger(self.log_path, message7)
            print message7
            return
        request = BrowserRequest()
        content = request.start_chrome(new_url_titles, self.content_timeout, self.max_pool_size, self.log_path, None, callback=self.parse)
        self.doraemon.recoveryConcurrency(self.concurrency_file_spider, self.max_concurrency_spider)
        message8 = 'End requests for {0}'.format(str(len(content)))
        self.file.logger(self.log_path, message8)
        print message8
        del content, new_url_titles, request
        gc.collect()
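
# SpiderBone calls callback(current_url, html, page_source) and expects None
# when a page has no usable content, or an object exposing .url, .id and
# .content (see parse() above). A sketch under those assumptions; the XPath
# and the SpiderResult container are illustrative, and a real callback must
# return whatever createSpiderMongoJson() accepts.
from collections import namedtuple

SpiderResult = namedtuple('SpiderResult', ['id', 'url', 'content'])

def demo_content_callback(current_url, html, page_source):
    content = "".join(html.xpath(".//p//text()")).strip()
    if len(content) == 0:
        return None  # parse() logs this as 'No content' and skips the page
    return SpiderResult(id=current_url, url=current_url, content=content)

# Wiring (siteinfo comes from the project's settings machinery):
# SpiderBone(siteinfo=some_siteinfo, callback=demo_content_callback).start()
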