class Recovery(): def __init__(self): self.settings = Settings() self.file = FileIOMiddleware() self.doraemon = Doraemon() self.root = '/home/dev/Data/rsyncData/prd4/sites' self.dest = '/home/dev/Data/rsyncData/prd4/local' self.resume = '/home/dev/Repository/news/Tegenaria/tSpider/tSpider/dataRecovery/resume.txt' def start(self): sites = os.listdir(self.root) if os.path.exists(self.resume) is False: print 'resume file does not exit and create an new one' self.file.writeToTxtCover(self.resume, '\n') finished = [] items = self.file.readFromTxt(self.resume).strip().split('\n') for item in items: finished.append(item) for site in sites: p1 = '{0}/{1}/html'.format(self.root, site) if os.path.exists(p1) is False: print '{0} has no html.'.format(site) continue allTime = os.listdir(p1) for t in allTime: p2 = '{0}/{1}'.format(p1, t) files = os.listdir(p2) for file in files: fromFile = '{0}/{1}'.format(p2, file) if fromFile not in finished: toFile = '{0}/{1}'.format(self.dest, file) if self.doraemon.copyFile(fromFile, toFile): self.file.writeToTxtAdd(self.resume, fromFile) print '{0} is recovered.'.format(fromFile) print '{0} is finished.'.format(site)
class ChuansongmeReceptor(): def __init__(self): self.settings = Settings() self.getSettings() self.file = FileIOMiddleware() self.doraemon = Doraemon() self.doraemon.createFilePath(self.work_path_prd2) def getSettings(self): self.work_path_prd2 = "/home/dev/Data/rsyncData/" self.mongo = "gongzhonghao_test" self.finished_ids = "gongzhonghao_test" self.log_path = "/home/dev/Data/rsyncData/" def parse(self, response): current_url = response['response'].current_url.encode('gbk') print 'Start to parse: {0}'.format(current_url) html = etree.HTML(response['response'].page_source) key = response['request_title'].strip() href_item = html.xpath("./*[contains(@class, 'pagedlist_item')]") if len(href_item) == 0: print 'No data for: {0}'.format(key) return self.doraemon.hashSet(self.finished_ids, key, key) data = { 'id': key, 'url': current_url } print 'Start to store mongo {0}'.format(data['url']) self.doraemon.storeMongodb(self.mongo, data) print 'Finished for {0}'.format(key) def start_requests(self): print 'Start requests' new_urls = [] all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids)) txt_path = '/home/dev/Data/rsyncData/gongzhonghao_test.txt' gonzhonghao = self.file.readFromTxt(txt_path) keys = gonzhonghao.split('\n') for key in keys: if key not in all_finished_id: tmp_url = "https://chuansongme.com/account/{0}".format(key) new_urls.append([tmp_url, key]) if len(new_urls) == 0: print 'No more urls.' return request = BrowserRequest() request.start_chrome(new_urls, 2, self.log_path, None, callback=self.parse)
class XueqiuReceptor(): def __init__(self): self.settings = Settings() self.getSettings() self.file = FileIOMiddleware() self.doraemon = Doraemon() self.doraemon.createFilePath(self.work_path_prd2) def getSettings(self): self.work_path_prd2 = "/home/dev/Data/rsyncData/test/" self.mongo = "xueqiu_test" self.finished_ids = "xueqiu_test" self.log_path = "/home/dev/Data/rsyncData/test/" def parse(self, response): current_url = response['response'].current_url.encode('gbk') print 'Start to parse: {0}'.format(current_url) html = etree.HTML(response['response'].page_source) key = response['request_title'].strip() href_contens = html.xpath( ".//*[contains(@class, 'search__user__card__content')]") if len(href_contens) == 0: print 'No data for: {0}'.format(key) return for item in href_contens: href = item.xpath(".//*[contains(@class, 'user-name')]/@href") title_content = item.xpath( ".//*[contains(@class, 'user-name')]//span/text()") title = "".join(title_content).strip() if len(href) > 0 and title == key: url = "https://xueqiu.com/u{0}".format(href[0]) self.doraemon.hashSet(self.finished_ids, url, url) data = {'id': key, 'url': url} print 'Start to store mongo {0}'.format(data['url']) self.doraemon.storeMongodb(self.mongo, data) print 'Finished for {0}'.format(key) def start_requests(self): print 'Start requests' new_urls = [] all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids)) txt_path = '/home/dev/Data/rsyncData/test/xueqiu.txt' gonzhonghao = self.file.readFromTxt(txt_path) keys = gonzhonghao.split('\n') for key in keys: if key not in all_finished_id: name = key.strip() tmp_url = "https://xueqiu.com/k?q={0}".format(name) new_urls.append([tmp_url, name]) if len(new_urls) == 0: print 'No more urls.' return request = BrowserRequest() request.start_chrome(new_urls, 5, self.log_path, None, callback=self.parse)
class ProcessTimeoutHandler(): def __init__(self): self.doraemon = Doraemon() self.file = FileIOMiddleware() self.settings = Settings() self.cache_file = self.settings.TIMEOUT_CACHE_FILE self.timeout = self.settings.PROCESS_TIMEOUT self.timeout_content = self.settings.PROCESS_TIMEOUT_CONTENT def updateTimeoutCacheFile(self, processes): if self.doraemon.isFileExists(self.cache_file): self.doraemon.deleteFile(self.cache_file) for process in processes: tmp = '{0}-{1}-{2}'.format(process.pid, process.pname, process.past) self.file.writeToTxtAdd(self.cache_file, tmp) def getTimeoutCache(self): result = [] if self.doraemon.isFileExists(self.cache_file): data = self.file.readFromTxt(self.cache_file) pidTimeoutList = data.split('\n') for item in pidTimeoutList: if self.doraemon.isEmpty(item) is False: tmp = item.split('-') if len(tmp) == 3: result.append( ProcessTimeoutDto(int(tmp[0]), tmp[1], float(tmp[2]), False)) return result def findTarget(self, pid, pids): result = None for p in pids: if p.pid == pid: result = p return result def filterTimeoutProcesses(self, curpids, prepids): result = [] if self.doraemon.isEmpty(curpids): return result for p in curpids: pre = self.findTarget(p.pid, prepids) if pre is not None: if (pre.pname == 'chrome' and self.doraemon.isExceedTimeoutInterval(self.timeout, pre.past)) or \ (pre.pname == 'python' and self.doraemon.isExceedTimeoutInterval(self.timeout_content, pre.past)): result.append( ProcessTimeoutDto(pre.pid, pre.pname, pre.past, True)) else: result.append(p) return result def getCurrentProcesses(self): result = [] pids = psutil.pids() if len(pids): print 'No Chrome process.' for pid in pids: try: p = psutil.Process(pid) pname = p.name() if pname == 'chrome' or pname == 'python': print 'Start to store process {0}.'.format(pid) result.append( ProcessTimeoutDto(pid, pname, p._create_time, False)) except Exception as e: print 'Exception {0} to find process: pid - {1}'.format( e, p.pid) return result def processTimeoutProcesses(self, curpids, prepids): updatedPids = self.filterTimeoutProcesses(curpids, prepids) notTimeoutProcesses = [] for p in updatedPids: if p.isTimeout: try: print 'kill timeout process: pid - {0}'.format(p.pid) os.kill(p.pid, signal.SIGKILL) except Exception as e: print 'Exception {0} to kill process: pid - {1}'.format( e, p.pid) else: notTimeoutProcesses.append(p) self.updateTimeoutCacheFile(notTimeoutProcesses) def start(self): print 'Start to process.' self.processTimeoutProcesses(self.getCurrentProcesses(), self.getTimeoutCache())
class Doraemon(): def __init__(self): settings = Settings() settings.CreateCommonSettings() self.file = FileIOMiddleware() self.rconn = redis.Redis(settings.REDIS_HOST, settings.REDIS_PORT) self.bf_urls = BloomFilter(self.rconn, settings.BLOOMFILTER_URLS) self.bf_content = BloomFilter(self.rconn, settings.BLOOMFILTER_CONTENT) self.bf_authors = BloomFilter(self.rconn, settings.BLOOMFILTER_AUTHORS) self.disable_restart_interval = settings.DISABLE_RESTART_INTERVAL self.bf_weixin_url = BloomFilter(self.rconn, settings.FINISHED_WEIXIN_URL_ARTICLE) self.bf_weixin_content = BloomFilter( self.rconn, settings.FINISHED_WEIXIN_CONTENT_ARTICLE) self.bf_weixin_id = BloomFilter(self.rconn, settings.FINISHED_WEIXIN_URL_ID) self.bf_finished_image_id = BloomFilter(self.rconn, settings.FINISHED_IMAGE_ID) self.bf_finished_temp_weixin = BloomFilter( self.rconn, settings.FINISHED_TEMP_WEIXIN) self.md5 = hashlib.md5() self.max_concurrency = settings.MAX_CONCURRENCY self.concurrency_file = settings.CONCURRENCY_FILE self.concurrency_refresh_file = settings.CONCURRENCY_REFRESH_FILE self.refresh_concurrency_interval = settings.REFRESH_CONCURRENCY_INTERVAL self.max_concurrency_spider = settings.MAX_CONCURRENCY_SPIDER self.concurrency_file_spider = settings.CONCURRENCY_FILE_SPIDER self.concurrency_refresh_file_spider = settings.CONCURRENCY_REFRESH_FILE_SPIDER self.refresh_concurrency_interval_spider = settings.REFRESH_CONCURRENCY_INTERVAL_SPIDER self.bf_huxiu_nlp = BloomFilter(self.rconn, settings.FINISHED_HUXIU_NLP) self.sites_info = settings.SITES_INFO self.sites_debug = settings.SITES_DEBUG def sshUpload(self, address, port, username, password, fromFile, toFile): transport = paramiko.Transport((address, port)) try: print 'Start to upload file: {0}'.format(fromFile) transport.connect(username=username, password=password) sftp = paramiko.SFTPClient.from_transport(transport) sftp.put(fromFile, toFile) transport.close() print 'Finished to upload file: {0}'.format(fromFile) return True except Exception as e: print 'Exception {0} to upload file: {1}'.format( e.message, fromFile) return False def moveFile(self, fromfile=None, tofile=None): if fromfile is None or os.path.exists(fromfile) is False: print "Source file {0} is not exits".format(fromfile) return False try: retry = 1 retryLimit = 60 while not os.path.exists(tofile) and retry <= retryLimit: if retry > 1: time.sleep(1) shutil.move(fromfile, tofile) retry += 1 if retryLimit < retry: raise Exception('Move file retry limit time reached.') return True except Exception as e: raise Exception( "Exception {0} to move file {1} to file {2}.".format( e.message, fromfile, tofile)) def copyFile(self, fromfile=None, tofile=None): if fromfile is None or os.path.exists(fromfile) is False: print "Source file {0} is not exits".format(fromfile) return False try: retry = 1 retryLimit = 60 while not os.path.exists(tofile) and retry <= retryLimit: if retry > 1: time.sleep(1) shutil.copy(fromfile, tofile) retry += 1 if retryLimit < retry: raise Exception('Copy file retry limit time reached.') return True except Exception as e: raise Exception( "Exception {0} to copy file {1} to file {2}.".format( e.message, fromfile, tofile)) def createFilePath(self, path): isFilePathExists = os.path.exists(path) if isFilePathExists is False: os.makedirs(path) def isExceedRestartInterval(self, path, restart_interval): isRestartPathExists = os.path.exists(path) if isRestartPathExists is False: print 'restart file does not exit and create an new one' self.file.writeToTxtCover(path, time.time()) return True past = float(self.file.readFromTxt(path)) now = time.time() isExceed = ((now - past) // 60 >= restart_interval) or ( self.disable_restart_interval is True) if isExceed is True: print 'exceeds the restart interval and restart' self.file.writeToTxtCover(path, time.time()) else: print 'does not exceed the restart interval and stop' return isExceed def isExceedTimeoutInterval(self, timeout, past): if self.isEmpty(timeout): print 'timeout is empty' return False now = time.time() isExceed = ((now - past) // 60 >= timeout) if isExceed is True: print 'exceeds the timeout.' else: print 'does not exceeds the timeout.' return isExceed def isEmpty(self, obj=None): if isinstance(obj, unicode): obj = obj.encode('utf-8') if isinstance(obj, str): return len([obj for i in obj if i.strip()]) == 0 elif isinstance(obj, int) or isinstance(obj, float): return False elif isinstance(obj, list) or isinstance(obj, dict) or isinstance( obj, tuple) or isinstance(obj, set): return len(obj) == 0 else: return obj == None def isNumber(self, item): return isinstance(item, int) or isinstance(item, float) def isTitleEmpty(self, title, url): if self.isEmpty(title): print 'Empty title for: {0}'.format(url) return True return False def isUrlValid(self, url, good_keys, bad_keys, regx, valid): is_match = False for regx_item in regx: if regx_item.match(url) != None: is_match = True if is_match == False: print 'Invalid url for not match: {0}'.format(url) return False for good in good_keys: if valid == True: continue if good in url: print 'Match good key: {0}'.format(good) valid = True for bad in bad_keys: if valid == False: continue if bad in url: print 'Match bad key: {0}'.format(bad) valid = False return valid def getImageTypeFromUrl(self, url): if 'jpeg' or '.jpg' in url: return 'jpg' if '.png' in url or 'png' in url: return 'png' if '.gif' in url or 'gif' in url: return 'gif' else: print 'Other image type use default type' return 'png' def isDuplicated(self, filter, content): content_encode = str(content).encode("utf-8") if filter.isContains(content_encode): print 'Content {0} duplicated!'.format(content) return True else: filter.insert(content_encode) print 'Content {0} not duplicated!'.format(content) return False def isFinished(self, filter, content): content_encode = str(content).encode("utf-8") if filter.isContains(content_encode): print 'Content {0} exists!'.format(content) return True else: return False def storeFinished(self, filter, content): print 'Start to store content: {0}'.format(content) content_encode = str(content).encode("utf-8") filter.insert(content_encode) def storeMongodb(self, mongo_url, data): mongo = MongoMiddleware() mongo.insert(mongo_url, data) def storeTxt(self, id, content, finished_txt_path, name): try: self.createFilePath(finished_txt_path) print 'Start to store txt: {0}'.format(id) self.file.writeToTxtCover( '{0}/{1}_{2}.txt'.format(finished_txt_path, name, id), content) print 'End to store txt: {0}'.format(id) except Exception as e: print 'Exception {0} to store txt: {1}'.format(e.message, id) def storeTxtAdd(self, author_txt_path, author_name, settingName): try: self.createFilePath(author_txt_path) print 'Start to store txt: {0}'.format(author_name) self.file.writeToTxtAdd( '{0}/{1}_authors.txt'.format(author_txt_path, settingName), author_name) except Exception as e: print 'Exception to store txt: {0} , for {1}'.format( author_name, e.strerror) print 'End to store txt: {0}'.format(author_name) def storeHtml(self, id, content, finished_html_path): try: self.createFilePath(finished_html_path) print 'Start to store html: {0}'.format(id) self.file.writeToHtmlCover( '{0}/{1}.html'.format(finished_html_path, id), content) print 'End to store html: {0}'.format(id) return True except Exception as e: print 'Exception {0} to store html: {1}'.format(e.message, id) return False def filter(self, filter, url_titles): new_url_titles = [] for url_title in url_titles: if self.isFinished(filter, url_title[1]) is False: new_url_titles.append(url_title) return new_url_titles def imageFilter(self, filter, ids): new_ids = [] for id in ids: if self.isFinished(filter, id) is False: new_ids.append(id) return new_ids def readNewUrls(self, filter, url_path): print 'Start to read urls' isUrlPathExit = os.path.exists(url_path) new_url_titles = [] if isUrlPathExit is True: url_titles = np.array( self.file.readColsFromCSV(url_path, ['url', 'title'])) new_url_titles = self.filter(filter, url_titles) return new_url_titles def readNewImageIds(self, filter, content_path): print 'Start to read ids' isContentPathExit = os.path.exists(content_path) new_ids = [] id_list = [] if isContentPathExit is True: ids = np.array(self.file.readColsFromCSV(content_path, ['id'])) for id in ids: id_list.append(id[0]) new_ids = self.imageFilter(filter, id_list) return new_ids def downloadImage(self, image_url, store_path, image_name): try: self.createFilePath(store_path) print 'start to download image: {0}'.format(image_url) urllib.urlretrieve(image_url, '{0}/{1}'.format(store_path, image_name)) return True except Exception as e: print 'exception to download image: {0} for {1}'.format( image_url, e.message) return False def hashSet(self, name, key, value): self.rconn.hset(name, key, value) def getHashSet(self, name, key): return self.rconn.hget(name, key) def getAllHasSet(self, name): return self.rconn.hgetall(name) def delHashSet(self, name, key): return self.rconn.hdel(name, key) def delKey(self, key): return self.rconn.delete(key) def getKeyLen(self, key): return self.rconn.hlen(key) def getDateOfDaysBefore(self, days): return (datetime.now() - timedelta(days=days)).strftime("%Y-%m-%d") def getCurrentYear(self): return time.strftime('%Y', time.localtime(time.time())) def getCurrentDate(self): return time.strftime('%Y-%m-%d', time.localtime(time.time())) def getCurrentLocalTime(self): return datetime.now().strftime('%Y-%m-%d %H:%M:%S') def getDateTime(self, string, dateFormat, pattern, isMatchDate): try: match = re.search(dateFormat, string) strp = datetime.strptime(match.group(), pattern) if isMatchDate: print "'Match date success: {0}".format(strp.date()) return strp.date() else: print "'Match date success: {0}".format(strp.date()) return strp.date() except: print("'Match date fail") return None def getDateFromChinese(self, string): year = self.getCurrentYear() try: if "今天" in string or \ "秒前" in string or \ "分钟前" in string or \ "小时前" in string or \ "Today" in string: return self.getDateOfDaysBefore(0) if "昨天" in string or \ "1天前" in string or \ "Yesterday" in string: return self.getDateOfDaysBefore(1) if "前天" in string or \ "2天前" in string or \ "2 days ago" in string: return self.getDateOfDaysBefore(2) if "3天前" in string or \ "3 days ago" in string: return self.getDateOfDaysBefore(3) if "4天前" in string or \ "4 days ago" in string: return self.getDateOfDaysBefore(4) if "5天前" in string or \ "5 days ago" in string: return self.getDateOfDaysBefore(5) if "6天前" in string or \ "6 days ago" in string: return self.getDateOfDaysBefore(6) if "1周前" in string or \ "1 week ago" in string: return self.getDateOfDaysBefore(7) if "年" not in string and "月" in string and "日" in string: data = re.split(",", string.replace('月', ',').replace('日', '')) return "{0}-{1}-{2}".format(year, self.getNumberFromString(data[0]), self.getNumberFromString(data[1])) if "年" in string and "月" in string and "日" in string: data = re.split( ",", string.replace('年', ',').replace('月', ',').replace('日', ',')) return "{0}-{1}-{2}".format(self.getNumberFromString(data[0]), self.getNumberFromString(data[1]), self.getNumberFromString(data[2])) except: print("Fail to match date from Chinese.") return None def getNumberFromString(self, string): return ''.join(re.findall(r'\d+', string)).strip() def getFinalDate(self, year, month, day): return "{0}-{1}-{2}".format(year, month, day) def formateMonthDay(self, MD): return '{:02d}'.format(MD) def getDateFromString(self, string_date): _date_chinese = self.getDateFromChinese(string_date) if _date_chinese is not None: string_date = _date_chinese _date_year_month_day_crossing = self.getDateTime( string_date, r'\d{4}-\d{1,2}-\d{1,2}', '%Y-%m-%d', True) _date_year2_month_day_crossing = self.getDateTime( string_date, r'\d{2}-\d{1,2}-\d{1,2}', '%y-%m-%d', True) _date_month_day_crossing = self.getDateTime(string_date, r'\d{1,2}-\d{1,2}', '%m-%d', True) _date_year_month_day_dot = self.getDateTime(string_date, r'\d{4}.\d{1,2}.\d{1,2}', '%Y.%m.%d', True) _date_month_day_dot = self.getDateTime(string_date, r'\d{1,2}.\d{1,2}', '%m.%d', True) _date_year_month_day_slash = self.getDateTime( string_date, r'\d{4}\/\d{1,2}\/\d{1,2}', '%Y/%m/%d', True) _date_month_day_slash = self.getDateTime(string_date, r'\d{1,2}\/\d{1,2}', '%m/%d', True) _time_hour_minute_second = self.getDateTime( string_date, r'\d{1,2}:\d{1,2}:\d{1,2}', '%H:%M:%S', False) _time_hour_minute = self.getDateTime(string_date, r'\d{1,2}:\d{1,2}', '%H:%M', False) year = self.getCurrentYear() if _date_year_month_day_crossing is not None: return self.getFinalDate( _date_year_month_day_crossing.year, self.formateMonthDay(_date_year_month_day_crossing.month), self.formateMonthDay(_date_year_month_day_crossing.day)) if _date_year2_month_day_crossing is not None: return self.getFinalDate( _date_year2_month_day_crossing.year, self.formateMonthDay(_date_year2_month_day_crossing.month), self.formateMonthDay(_date_year2_month_day_crossing.day)) if _date_year_month_day_crossing is None and _date_month_day_crossing is not None: return self.getFinalDate( year, self.formateMonthDay(_date_month_day_crossing.month), self.formateMonthDay(_date_month_day_crossing.day)) if _date_year_month_day_dot is not None: return self.getFinalDate( _date_year_month_day_dot.year, self.formateMonthDay(_date_year_month_day_dot.month), self.formateMonthDay(_date_year_month_day_dot.day)) if _date_year_month_day_dot is None and _date_month_day_dot is not None: return self.getFinalDate( year, self.formateMonthDay(_date_month_day_dot.month), self.formateMonthDay(_date_month_day_dot.day)) if _date_year_month_day_slash is not None: return self.getFinalDate( _date_year_month_day_slash.year, self.formateMonthDay(_date_year_month_day_slash.month), self.formateMonthDay(_date_year_month_day_slash.day)) if _date_year_month_day_slash is None and _date_month_day_slash is not None: return self.getFinalDate( year, self.formateMonthDay(_date_month_day_slash.month), self.formateMonthDay(_date_month_day_slash.day)) if _time_hour_minute_second is not None or _time_hour_minute is not None: return self.getCurrentDate() def getMD5(self, content): self.md5.update(content.encode('utf-8')) return self.md5.hexdigest() def compressImage(self, origin_image_path, destination_image_path, multiplier): try: sImg = Image.open(origin_image_path) w, h = sImg.size dImg = sImg.resize((int(w / multiplier), int(h / multiplier)), Image.ANTIALIAS) os.remove(origin_image_path) dImg.save(destination_image_path) print "Compress picture {0} success!".format( destination_image_path) except Exception as e: print "Compress picture {0} failed for {1}".format( destination_image_path, e.message) def getSizeOfImage(self, image_path): try: img = Image.open(image_path) return img.size except Exception as e: print "Exception to open picture {0}, for {1}.".format( image_path, e.message) def getFileSize(self, file_path): try: fsize = os.path.getsize(file_path) fsize = fsize / float(1024) return round(fsize, 2) except Exception as e: print "Exception to get file size of {0}, for {1}.".format( file_path, e.message) def getFileList(self, diractory): file_list = [] isFilePathExists = os.path.exists(diractory) if isFilePathExists is True: file_list = os.listdir(diractory) return file_list def isFileExists(self, file_path): return os.path.exists(file_path) def deleteFile(self, file_path): try: print "Start to delete file: {0}".format(file_path) os.remove(file_path) print "Finished to delete file: {0}".format(file_path) except Exception as e: print "Exception to delete file: {0} for : {1}".format( file_path, e.message) def tar(self, directory): file_list = os.listdir(directory) if len(file_list) == 0: print "There is no file to compress for: {0}".format(directory) return try: print "Start to compress directory: {0}".format(directory) t = tarfile.open(directory + ".tar.gz", "w:gz") for root, dir, files in os.walk(directory): for file in files: fullpath = os.path.join(root, file) t.add(fullpath) t.close() print "Finished to compress directory: {0}".format(directory) except Exception as e: print "Exception to compress directory: {0} for :{1}".format( directory, e.message) def tarList(self, directory): file_list = os.listdir(directory) if len(file_list) == 0: print "There is no file to compress for: {0}".format(directory) return try: print "Start to compress directory: {0}".format(directory) lst = [] t = tarfile.open(directory + ".tar.gz", "w:gz") for root, dir, files in os.walk(directory): for file in files: fullpath = os.path.join(root, file) t.add(fullpath) lst.append(fullpath) t.close() print "Finished to compress directory: {0}".format(directory) return lst except Exception as e: print "Exception to compress directory: {0} for :{1}".format( directory, e.message) return [] def isCamelReadyToRun(self, settings): if self.isWorkTime(settings.START_TIME, settings.END_TIME) is False: return False if self.isConcurrencyAllowToRun(self.concurrency_refresh_file, self.refresh_concurrency_interval, self.concurrency_file, self.max_concurrency) is False: return False if self.isExceedRestartInterval(settings.RESTART_PATH, settings.RESTART_INTERVAL) is False: self.recoveryConcurrency(self.concurrency_file, self.max_concurrency) return False return True def isSpiderReadyToRun(self): return self.isConcurrencyAllowToRun( self.concurrency_refresh_file_spider, self.refresh_concurrency_interval_spider, self.concurrency_file_spider, self.max_concurrency_spider) def isWorkTime(self, start_time, end_time): if self.isNumber(start_time) is False: print 'start time is empty' return False if self.isNumber(end_time) is False: print 'end time is empty' return False if self.isAfterHour(start_time) and self.isBeforeHour(end_time): print 'it is work time' return True else: print 'it is not work time before {0} or after {1}'.format( start_time, end_time) return False def isAfterHour(self, hour): if self.isNumber(hour) is False: print 'input hour is empty.' return current_time = time.strftime('%H', time.localtime(time.time())) if int(hour) < int(current_time): return True else: return False def isBeforeHour(self, hour): if self.isNumber(hour) is False: print 'input hour is empty.' return current_time = time.strftime('%H', time.localtime(time.time())) if int(hour) >= int(current_time): return True else: return False def readFile(self, file): waiting = 0 data = self.file.readFromTxt(file).strip() while self.isEmpty(data): print 'file {0} is under update, waitting... {1} s'.format( file, waiting) time.sleep(1) waiting += 1 data = self.file.readFromTxt(file).strip() return data def isConcurrencyAllowToRun(self, concurrency_refresh_file, refresh_concurrency_interval, concurrency_file, max_concurrency): self.updateConcurrencyFile(concurrency_refresh_file, refresh_concurrency_interval, concurrency_file, max_concurrency) isFilePathExists = os.path.exists(concurrency_file) if isFilePathExists is False: print 'concurrency file not exists and create an new one with max concurrency: {0}'.format( str(max_concurrency)) self.file.writeToTxtCover(concurrency_file, str(max_concurrency)) concurrency_available = int(self.readFile(concurrency_file)) print 'concurrency file exists : {0}'.format( str(concurrency_available)) if int(concurrency_available) > 0: print 'app is able to run.' new_concurrency_available = concurrency_available - 1 print 'new concurrency is : {0}'.format( str(new_concurrency_available)) self.file.writeToTxtCover(concurrency_file, str(new_concurrency_available)) return True else: print 'app is not able to run for no available concurrency.' return False return True def recoveryConcurrency(self, concurrency_file, max_concurrency): isFilePathExists = os.path.exists(concurrency_file) if isFilePathExists is False: print 'concurrency file not exists and create an new one with max concurrency: {0}'.format( str(max_concurrency)) self.file.writeToTxtCover(concurrency_file, str(max_concurrency)) return concurrency_available = int(self.readFile(concurrency_file)) print 'concurrency file exists and start to recovery: {0}'.format( str(concurrency_available)) if int(concurrency_available) < max_concurrency: print 'start to recovery concurrenct.' new_concurrency_available = concurrency_available + 1 print 'new concurrency is : {0}'.format( str(new_concurrency_available)) self.file.writeToTxtCover(concurrency_file, str(new_concurrency_available)) else: print 'concurrency is not normal and write max concurrency to it.' self.file.writeToTxtCover(concurrency_file, str(max_concurrency)) def updateConcurrencyFile(self, concurrency_refresh_file, refresh_concurrency_interval, concurrency_file, max_concurrency): if self.isExceedRestartInterval(concurrency_refresh_file, refresh_concurrency_interval) is True: print 'refresh concurrency file: {0}'.format(str(max_concurrency)) self.file.writeToTxtCover(concurrency_file, str(max_concurrency)) def createCamelData(self, title, url, id, download_time, source): return camelDto(title, url, id, download_time, source) def createCamelMongoJson(self, camelDto): return { 'title': camelDto.title, 'url': camelDto.url, 'id': camelDto.id, 'download_time': camelDto.download_time, 'source': camelDto.source } def createSpiderData(self, url, origin_url, public_time, author_name, title, id, download_time, source, images, is_open_cache, content): return spiderDto(url, origin_url, public_time, author_name, title, id, download_time, source, images, is_open_cache, content) def createSpiderMongoJson(self, spiderDto): return { 'url': spiderDto.url, 'origin_url,': spiderDto.origin_url, 'public_time': spiderDto.public_time, 'author_name': spiderDto.author_name, 'title': spiderDto.title, 'id': spiderDto.id, 'download_time': spiderDto.download_time, 'source': spiderDto.source, 'images': spiderDto.images, 'is_open_cache': spiderDto.is_open_cache } def updateImages(self, images, newImages): for image in newImages: data = image.strip() if self.isEmpty(data) is False and data not in images: images.append(data) def completeImageUrls(self, newImages, current_url): result = [] if len(newImages) == 0: print 'No images urls to process' return result for url in newImages: entireUrl = urlparse.urljoin(current_url, url).strip() if re.match('https', entireUrl) is not None: result.append(entireUrl) return result def getSitesInfo(self, isdebug=False): if isdebug: site_info_path = self.sites_debug else: site_info_path = self.sites_info content = self.file.readFromTxt(site_info_path) if self.isEmpty(content): print 'sites info is empty' return None sitesInfo = content.split('[SITE]') results = [] for site in sitesInfo: if self.isEmpty(site): continue results.append(self.extractSiteInfo(site)) return results def extractSiteInfo(self, siteInfo): items = siteInfo.split('\n') result = siteInfoDto(domain=None, name=None, restart_interval=None, url_parallel_number=None, content_parallel_number=None, is_open_cache=None, work_time_start=None, work_time_end=None, good_keys=[], bad_keys=[], href_items=[], href=[], url_match=[], url_title_match=[], url_id_tag=[], content_match=[], content_child_match=[], content_url_match=[], content_id_tag=[], article_match=[], content_title_match=[], content_image_match=[], content_time_match=[], need_self_image=None, url_timeout=None, content_timeout=None) for item in items: if self.isEmpty(item): continue content = item.split('==') key = ''.join(content[0]).strip() value = ''.join(content[1]).strip() if key == 'DOMAIN': result.domain = value continue if key == 'NAME': result.name = value continue if key == 'RESTARTINTERVAL': result.restart_interval = int(value) continue if key == 'URLPARALLELNUMBER': result.url_parallel_number = int(value) continue if key == 'CONTENTPARALLELNUMBER': result.content_parallel_number = int(value) continue if key == 'ISOPENCACHE': result.is_open_cache = bool(value) continue if key == 'WORKTIMESTART': result.work_time_start = int(value) continue if key == 'WORKTIMEEND': result.work_time_end = int(value) continue if key == 'GOODKEYS': if self.isEmpty(value) is False: result.good_keys.append(value) continue if key == 'BADKEYS': if self.isEmpty(value) is False: result.bad_keys.append(value) continue if key == 'URLMATCH': if self.isEmpty(value) is False: result.url_match.append(self.extractRegxRule(value)) continue if key == 'URLTITLEMATCH': if self.isEmpty(value) is False: result.url_title_match.append(self.extractHtmlTag(value)) continue if key == 'URLIDTAG': if self.isEmpty(value) is False: result.url_id_tag.append(self.extractHtmlTag(value)) continue if key == 'CONTENTURLMATCH': if self.isEmpty(value) is False: result.content_url_match.append( self.extractRegxRule(value)) continue if key == 'CONTENTIDTAG': if self.isEmpty(value) is False: result.content_id_tag.append(self.extractHtmlTag(value)) continue if key == 'HREFITEMS': if self.isEmpty(value) is False: result.href_items.append(self.extractHtmlTag(value)) continue if key == 'HREF': if self.isEmpty(value) is False: result.href.append(self.extractHtmlTag(value)) continue if key == 'ARTICLEMATCH': if self.isEmpty(value) is False: result.article_match.append(self.extractHtmlTag(value)) continue if key == 'CONTENTMATCH': if self.isEmpty(value) is False: result.content_match.append(self.extractHtmlTag(value)) continue if key == 'CONTENTCHILDMATCH': if self.isEmpty(value) is False: result.content_child_match.append( self.extractHtmlTag(value)) continue if key == 'CONTENTTITLEMATCH': if self.isEmpty(value) is False: result.content_title_match.append( self.extractHtmlTag(value)) continue if key == 'CONTENTIMAGEMATCH': if self.isEmpty(value) is False: result.content_image_match.append( self.extractHtmlTag(value)) continue if key == 'CONTENTTIMEMATCH': if self.isEmpty(value) is False: result.content_time_match.append( self.extractHtmlTag(value)) continue if key == 'NEEDSELFIMAGE': if self.isEmpty(value) is False: result.need_self_image = value == 'True' if key == 'NEEDSELFHTML': if self.isEmpty(value) is False: result.need_self_html = value == 'True' if key == 'URLTIMEOUT': if self.isEmpty(value) is False: result.url_timeout = value if key == 'CONTENTTIMEOUT': if self.isEmpty(value) is False: result.content_timeout = value return result def getUrlId(self, url, idTag): id = None for item in idTag: matchItem = item.regx if matchItem in url: index = url.index(item.regx) + item.index if len(url) <= index: continue id = url[index] if id == None: continue return id def extractRegxRule(self, regxMatch): return re.compile(regxMatch) def extractHtmlTag(self, regxMatch): items = regxMatch.split('|') id = ''.join(items[0]).strip() index = int(items[1]) return regxMatchDto(id, index) def getMatchContent(self, content, regx): if regx.index == -1 or len(content) == 0: return content return content[regx.index] def uploadFileApi(self, url, fileName, fullPath): try: with open(fullPath, mode="r") as f: file = {"file": (fileName, f.read())} encode_data = encode_multipart_formdata(file) file_data = encode_data[0] headers_from_data = {"Content-Type": encode_data[1]} response = requests.post(url=url, headers=headers_from_data, data=file_data).json() if response['code'] != 200: print 'Fail to upload file {0} through api {1}'.format( fileName, url) return False print 'Success to upload file {0} through api {1}'.format( fileName, url) return True except Exception as e: print 'Exception to upload file {0} through api {1} : {2}'.format( fileName, url, e.message) return False
class WoshipmReceptor(): def __init__(self): self.settings = Settings() self.getSettings() self.file = FileIOMiddleware() self.doraemon = Doraemon() self.doraemon.createFilePath(self.work_path_prd2) def getSettings(self): self.work_path_prd2 = "/home/dev/Data/rsyncData/test/" self.mongo = "whoispm_receptor" self.finished_ids = "woshipm_receptor" self.log_path = "/home/dev/Data/rsyncData/test/" self.regx = re.compile("/u/[0-9]{0,}") def parse(self, response): current_url = response['response'].current_url.encode('gbk') print 'Start to parse: {0}'.format(current_url) html = etree.HTML(response['response'].page_source) key = response['request_title'].strip() href_contens = html.xpath("./a") if len(href_contens) == 0: print 'No data for: {0}'.format(key) return for item in href_contens: href = item.xpath("@href") title_content = item.xpath(".//text()") title = "".join(title_content).strip() if len(href) > 0 and title == key: isValidUrl = self.regx.match(href[0]) if isValidUrl is None: print 'Invalid url for not match: {0}'.format(href[0]) continue url = "http://www.woshipm.com{0}".format(href[0]) self.doraemon.hashSet(self.finished_ids, url, url) data = { 'id': key, 'url': url } print 'Start to store mongo {0}'.format(data['url']) self.doraemon.storeMongodb(self.mongo, data) print 'Finished for {0}'.format(key) def start_requests(self): print 'Start requests' new_urls = [] all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids)) txt_path = '/home/dev/Data/rsyncData/test/woshipm_receptor.txt' gonzhonghao = self.file.readFromTxt(txt_path) keys = gonzhonghao.split('\n') for key in keys: key = key.strip() if key not in all_finished_id: name = key.strip() tmp_url = "http://www.woshipm.com/search-posts?k={0}".format(name) new_urls.append([tmp_url, name]) else: print 'Finished or no data for {0}'.format(key) self.doraemon.hashSet(self.finished_ids, key, key) if len(new_urls) == 0: print 'No more urls.' return request = BrowserRequest() request.start_chrome(new_urls, 2, self.log_path, None, callback=self.parse)
class FengReceptor(): def __init__(self): self.settings = Settings() self.getSettings() self.file = FileIOMiddleware() self.doraemon = Doraemon() self.doraemon.createFilePath(self.work_path_prd2) def getSettings(self): self.work_path_prd2 = "/home/dev/Data/rsyncData/test/" self.mongo = "feng_receptor" self.finished_ids = "feng_receptor" self.log_path = "/home/dev/Data/rsyncData/test/" def parse(self, response): time.sleep(1) current_url = response['response'].current_url.encode('gbk') print 'Start to parse: {0}'.format(current_url) key = response['request_title'].strip() str = response['response'].page_source.encode('utf-8') str_n = str[str.find('(') + 1:-21] str_n = str_n.replace('null', 'None') dics = eval(str_n) if len(dics['items']) == 0: print 'No data for: {0}'.format(key) self.doraemon.hashSet(self.finished_ids, key, key) return for item in dics['items']: name = item['name'].replace('<','').replace('em>','').replace('\\/','') id = item['id'] if len(id) > 0 and name == key: url = "https://feng.ifeng.com/author/{0}".format(id) self.doraemon.hashSet(self.finished_ids, key, key) data = { 'id': key, 'url': url } print 'Start to store mongo {0}'.format(data['url']) self.doraemon.storeMongodb(self.mongo, data) print 'Finished for {0}'.format(key) def start_requests(self): print 'Start requests' new_urls = [] all_finished_id = list(self.doraemon.getAllHasSet(self.finished_ids)) txt_path = '/home/dev/Data/rsyncData/test/feng_receptor.txt' gonzhonghao = self.file.readFromTxt(txt_path) keys = gonzhonghao.split('\n') for key in keys: key = key.strip() if key not in all_finished_id: name = key.strip() tmp_url = "https://so.v.ifeng.com/websearch/ifeng-search-server/sub/websearch?k={0}&page=1&distinct=1&n=10&hl=1&os=ios&gv=6.2.5&uid=70b6a1d8f6c64618bf9dfa092fc4e34c&callback=getData".format(name) new_urls.append([tmp_url, name]) else: print 'Finished or no data for {0}'.format(key) self.doraemon.hashSet(self.finished_ids, key, key) if len(new_urls) == 0: print 'No more urls.' return request = BrowserRequest() request.start_chrome(new_urls, 5, self.log_path, None, callback=self.parse)
class StoreFiles(): def __init__(self, htmlpath=None, imagepath=None, templatepath=None, articleurl=None, alidomain=None, alidomaindeepinews=None, alidomaindeepinewsimg=None, ipwebserver0=None, portwebserver0=None, userrootwebserver0=None, userrootpasswordwebserver0=None, htmlwebserver0=None, needselfimage=None, needselfhtml=None, localhtmlpath=None, logpath=None): self.doraemon = Doraemon() self.file = FileIOMiddleware() self.image_count = 0 self.htmlpath = htmlpath self.imagepath = imagepath self.templatepath = templatepath self.articleurl = articleurl self.alidomain = alidomain self.alidomaindeepinews = alidomaindeepinews self.alidomaindeepinewsimg = alidomaindeepinewsimg self.ipwebserver0 = ipwebserver0 self.portwebserver0 = portwebserver0 self.userrootwebserver0 = userrootwebserver0 self.userrootpasswordwebserver0 = userrootpasswordwebserver0 self.htmlwebserver0 = htmlwebserver0 self.needselfimage = needselfimage self.needselfhtml = needselfhtml self.localhtmlpath = localhtmlpath self.logpath = logpath def parseContentRegxRule(self, content_regx_rule): result = matchRules(None, None, None) rules = [ matchRules(r'[\.][\/][\/](.*?)[[]', r'[\@](.*?)[\=]', r'[\'](.*?)[\']'), matchRules(r'[\.][\/][\/](.*?)[[]', r'[\@](.*?)[,]', r'[\'](.*?)[\']'), ] for rule in rules: tag = re.findall(rule.tag, content_regx_rule) key = re.findall(rule.key, content_regx_rule) value = re.findall(rule.value, content_regx_rule) if self.doraemon.isEmpty(tag) is False and \ self.doraemon.isEmpty(key) is False and \ self.doraemon.isEmpty(value) is False: result.tag = tag[0] result.key = key[0] result.value = value[0] break return result def addHighlightTextInner(self, content): return '<strong class="article_paragraph_border">{0}</strong>'.format(content) + \ '<p class="article_paragraph">' + \ '<br class="article_paragraph_border"/>' + \ '</p>' def addHighlightTextOuter(self, node, content): return '{0}<p class="article_paragraph">'.format(node) + \ '<strong class="article_paragraph_border">{0}</strong>'.format(content) + \ '</p>' + \ '<p class="article_paragraph">' + \ '<br class="article_paragraph_border"/>' + \ '</p>' def addImgNode(self, node, dataSrc, dataRef, width, dataRatio): if width == None: width = 1000 return '{0}<p class="article_paragraph_imag">'.format(node) + \ '<img data-ratio="{0}"'.format(dataRatio) + \ 'data-src="{0}"'.format(dataSrc) + \ 'data-ref="{0}"'.format(dataRef) + \ 'data-type="jpeg"' + \ 'data-w={0} '.format(width) + \ 'class="article_paragraph_img"/>' + \ '</p>' + \ '<p class="article_paragraph">' + \ '<br class="article_paragraph_border"/>' + \ '</p>' def addTextNodeOuter(self, node, content): if self.doraemon.isEmpty(content): return '' return '{0}<p class="article_paragraph">{1}'.format(node, content) + \ '</p>' + \ '<p class="article_paragraph">' + \ '<br class="article_paragraph_border"/>' + \ '</p>' def addParagraphGapNode(self, node): return '{0}<p class="article_paragraph">'.format(node) + \ '<br class="article_paragraph_border"/>' + \ '</p>' def addH1Node(self, node, content): return '{0}<p label="h1" class="article_paragraph_h1">'.format(node) + \ '<span class="article_paragraph_h1_1">' + \ '<span class="article_paragraph_h1_1_1">' + \ '<span class="article_paragraph_h1_1_1_1">{0}'.format(content) + \ '</span>' + \ '</span>' + \ '</span>' + \ '</p>' + \ '<p class="article_paragraph">' + \ '<br class="article_paragraph_border"/>' + \ '</p>' def extractImgSize(self, style, mode): size = re.findall(r'{0}:(.*?)px;'.format(mode), style) if len(size) == 1: return size[0].strip() return None def extractImg(self, url, node): result = imgInfo(None, None, None) if isinstance(node, NavigableString): return None if node.name != 'img' and len(node.contents) == 0: return result if node.name == 'img': if node.attrs.has_key('src') and self.doraemon.isEmpty(result.src): if 'data:image/' not in node.attrs['src']: result.src = node.attrs['src'] if node.attrs.has_key('_src') and self.doraemon.isEmpty( result.src): if 'data:image/' not in node.attrs['_src']: result.src = node.attrs['_src'] if node.attrs.has_key('data-original') and self.doraemon.isEmpty( result.src): if 'data:image/' not in node.attrs['data-original']: result.src = node.attrs['data-original'] if node.attrs.has_key('data-src') and self.doraemon.isEmpty( result.src): if 'data:image/' not in node.attrs['data-src']: result.src = node.attrs['data-src'] if node.attrs.has_key('data-lazy-src') and self.doraemon.isEmpty( result.src): if 'data:image/' not in node.attrs['data-lazy-src']: result.src = node.attrs['data-lazy-src'] if node.attrs.has_key('width') and result.width == None: result.width = node.attrs['width'] if node.attrs.has_key('height') and result.height == None: result.height = node.attrs['height'] if node.attrs.has_key('data-w') and result.width == None: result.width = node.attrs['data-w'] if node.attrs.has_key('data-h') and result.height == None: result.height = node.attrs['data-h'] if node.attrs.has_key('data-backh') and result.height == None: result.height = node.attrs['data-backh'] if node.attrs.has_key('data-backw') and result.width == None: result.width = node.attrs['data-backw'] if node.attrs.has_key('data-wscnh') and result.height == None: result.height = node.attrs['data-wscnh'] if node.attrs.has_key('data-wscnw') and result.width == None: result.width = node.attrs['data-wscnw'] if node.attrs.has_key('style') and (result.width == None or result.height == None): result.width = self.extractImgSize(node.attrs['style'], 'width') result.height = self.extractImgSize(node.attrs['style'], 'height') if isinstance(result.width, int) and isinstance( result.height, int): result.dataRatio = float( float(result.height) / float(result.width)) if result.src != None: result.src = urlparse.urljoin(url, result.src).strip() return result if len(node.contents) > 0: for n in node.contents: result = self.extractImg(url, n) if result != None: return result return result def nodeTraversal(self, url, node, newNode, articleId): if node.name == 'strong' and \ node.parent.name == 'div' and \ self.doraemon.isEmpty(node.string) is False: newNode = '{0}{1}'.format( newNode, self.addHighlightTextOuter(newNode, node.string)) if (node.name == 'h1' or \ node.name == 'h2' or \ node.name == 'h3' or \ node.name == 'h4') and \ self.doraemon.isEmpty(node.string) is False: newNode = '{0}{1}'.format(newNode, self.addH1Node(newNode, node.string)) if isinstance(node, NavigableString) or \ node.name == 'a' or \ node.name == 'p' or \ node.name == 'span' or \ node.name == 'section': if isinstance(node, NavigableString): newNode = self.addTextNodeOuter(newNode, str(node)) else: if self.doraemon.isEmpty(node.text) == False: newNode = self.addTextNodeOuter(newNode, node.text) img = self.extractImg(url, node) updatedNode = updateNode(False, newNode, None, None) if img != None and img.src != None: updatedNode.isImageNode = True updatedNode.imageOriginUrl = img.src updatedNode.imageNewUrl = img.src try: imageType = self.doraemon.getImageTypeFromUrl( updatedNode.imageOriginUrl) imageId = '{0}_{1}'.format(articleId, self.image_count) newImageName = '{0}.{1}'.format(imageId, imageType) if self.doraemon.downloadImage(updatedNode.imageOriginUrl, self.imagepath, newImageName): imageInfo = Image.open('{0}/{1}'.format( self.imagepath, newImageName)) if self.doraemon.isEmpty(imageInfo.width) is False: img.width = imageInfo.width if self.doraemon.isEmpty(imageInfo.height) is False: img.height = imageInfo.height if isinstance(img.width, int) and isinstance( img.height, int): img.dataRatio = float( float(img.height) / float(img.width)) if self.needselfimage: updatedNode.imageNewUrl = 'https://{0}.{1}/{2}/{3}'.format( self.alidomaindeepinews, self.alidomain, self.alidomaindeepinewsimg, newImageName) imageUpload = AliUpload( '{0}'.format(self.imagepath), newImageName, '{0}'.format(self.alidomaindeepinews), '{0}'.format(self.alidomaindeepinewsimg)) if imageUpload.start(): updatedNode.node = '{0}{1}'.format( newNode, self.addImgNode(newNode, updatedNode.imageNewUrl, updatedNode.imageNewUrl, img.width, img.dataRatio)) self.image_count += 1 else: updatedNode.node = '{0}{1}'.format( newNode, self.addImgNode(newNode, img.src, img.src, img.width, img.dataRatio)) else: updatedNode.node = '{0}{1}'.format( newNode, self.addImgNode(newNode, img.src, img.src, img.width, img.dataRatio)) except Exception as e: updatedNode.node = '{0}{1}'.format( newNode, self.addImgNode(newNode, img.src, img.src, img.width, img.dataRatio)) print 'Exception {0} to download image: {1}'.format( e.message, updatedNode.imageOriginUrl) return updatedNode def updateTemplate(self, template, articleHeadDescription, articleHeadAuthor, articleHeadTitle, articleHeadOriginUrl, articleBodyTitle, articleBodyAuthor, articleBodyPublishTime, articleBodyParagraph, articleBodyOriginUrl): template = template.replace('ArticleHeadDescription', articleHeadDescription) template = template.replace('ArticleHeadAuthor', articleHeadAuthor) template = template.replace('ArticleHeadTitle', articleHeadTitle) template = template.replace('ArticleHeadOriginUrl', articleHeadOriginUrl) template = template.replace('ArticleBodyTitle', articleBodyTitle) template = template.replace('ArticleBodyAuthor', articleBodyAuthor) template = template.replace('ArticleBodyPublishTime', articleBodyPublishTime) template = template.replace('ArticleBodyParagraph', articleBodyParagraph) template = template.replace('ArticleBodyOriginUrl', articleBodyOriginUrl) return template def hasText(self, nodes): for node in nodes: if isinstance(node, NavigableString): continue if node.name == 'img' or \ node.name == 'a' or \ node.name == 'p' or \ node.name == 'span' or \ node.name == 'section': return True return False def goDeepToArticleBody(self, contents): if isinstance(contents, NavigableString): return contents if len(contents) == 0: return contents if self.hasText(contents): return contents if len(contents) > 0: for n in contents: if isinstance(n, NavigableString): continue return self.goDeepToArticleBody(n.contents) def storeFiles(self, data, page_source, content_regx_rule): if self.needselfhtml == False: return data try: self.image_count = 0 newData = copy.copy(data) newArticleId = self.doraemon.getMD5('{0}_{1}'.format( data.author_name, data.id)) newData.url = '{0}{1}.html'.format(self.articleurl, newArticleId) template = self.file.readFromTxt(self.templatepath) match = self.parseContentRegxRule(content_regx_rule) if match.tag is None or \ match.key is None or \ match.value is None: print 'No match rule available for html' return data soup = BeautifulSoup(page_source, 'lxml') matchTags = soup.select('{0}[{1}="{2}"]'.format( match.tag, match.key, match.value)) if len(matchTags) == 0: print 'No tag matched for html' return data nodes = self.goDeepToArticleBody(matchTags[0].contents) articleContent = '' for node in nodes: if isinstance(node, NavigableString): continue if self.doraemon.isEmpty(node): continue newNode = '' updateNodeInfo = self.nodeTraversal(data.url, node, newNode, newArticleId) articleContent = '{0}{1}'.format(articleContent, updateNodeInfo.node) if updateNodeInfo.isImageNode: if updateNodeInfo.imageOriginUrl in newData.images: for i in newData.images: if updateNodeInfo.imageOriginUrl in i or \ updateNodeInfo.imageOriginUrl == i: newData.images[newData.images.index( i)] = updateNodeInfo.imageNewUrl else: newData.images.append(updateNodeInfo.imageNewUrl) template = self.updateTemplate(template, newData.title, '深度资讯DeepINews', newData.title, newData.url, newData.title, newData.source, newData.public_time, articleContent, data.url) if self.doraemon.storeHtml(newArticleId, template, self.htmlpath): htmlName = '{0}.html'.format(newArticleId) fromFile = '{0}/{1}'.format(self.htmlpath, htmlName) toFile = '{0}/{1}'.format(self.localhtmlpath, htmlName) if self.doraemon.copyFile(fromFile, toFile): print 'Copy file {0} done.'.format(fromFile) return newData else: message1 = 'Copy file {0} fail.'.format(fromFile) print message1 self.file.logger(self.logpath, message1) return data except Exception as e: message2 = 'Exception {0} when update : {1}'.format( e.message, data.url) print message2 self.file.logger(self.logpath, message2) return data
class SSHUpload(): def __init__(self): self.settings = Settings() self.file = FileIOMiddleware() self.doraemon = Doraemon() def writeBack(self, fromFiles): print "Start to update retry file: {0}".format( self.settings.RETRY_FILE) writeBackContent = '' for file in fromFiles: writeBackContent = '{0}{1}\n'.format(writeBackContent, file) self.file.writeToTxtAdd(self.settings.RETRY_FILE, writeBackContent) print "Finished to update retry file: {0}".format( self.settings.RETRY_FILE) def updateRemoveFile(self, fromFiles): if self.doraemon.isEmpty(fromFiles): print "No need to update to remove retry file." return content = self.readFile() print "Start to delete retry file: {0}".format( self.settings.RETRY_FILE) os.remove(self.settings.RETRY_FILE) print "Finished to delete retry file: {0}".format( self.settings.RETRY_FILE) for file in fromFiles: if file in content: del content[content.index(file)] else: content.append(file) self.writeBack(content) def updateAddFile(self, fromFiles): if self.doraemon.isEmpty(fromFiles): print "No need to update to add retry file" return content = self.readFile() if self.doraemon.isEmpty(content): self.writeBack(fromFiles) return print "Start to delete retry file: {0}".format( self.settings.RETRY_FILE) os.remove(self.settings.RETRY_FILE) print "Finished to delete retry file: {0}".format( self.settings.RETRY_FILE) for file in content: if file not in fromFiles: fromFiles.append(file) self.writeBack(fromFiles) def readFile(self): files = [] isRetryFileExists = os.path.exists(self.settings.RETRY_FILE) if isRetryFileExists == False: return files content = self.file.readFromTxt(self.settings.RETRY_FILE) if self.doraemon.isEmpty(content): return files items = content.split('\n') for item in items: if self.doraemon.isEmpty(item): continue files.append(item) return files def retry(self): while True: files = self.readFile() updateFiles = [] try: for fromFile in files: fileParts = re.split(r'[/]', fromFile) fileName = fileParts[len(fileParts) - 1] toFile = '{0}/{1}'.format(self.settings.HTML_WEBSERVER0, fileName) if self.doraemon.sshUpload( self.settings.IP_WEBSERVER0, self.settings.PORT_WEBSERVER0, self.settings.USER_ROOT_WEBSERVER0, self.settings.USER_ROOT_PASSWORD_WEBSERVER0, fromFile, toFile): updateFiles.append(fromFile) print 'Success to retry to upload: {0}'.format( fromFile) self.updateRemoveFile(updateFiles) except Exception as e: self.updateRemoveFile(updateFiles) print 'Exception {0} to retry to upload: {1}'.format( e.message, fromFile) def startUpload(self): fromFile = '{0}.tar.gz'.format(self.settings.LOCAL_HTML_PATH) if not os.listdir(self.settings.LOCAL_HTML_PATH) and os.path.exists( fromFile) is False: print 'no html file to tar' return uploadedList = [] if os.path.exists(fromFile) is False: uploadedList = self.doraemon.tarList(self.settings.LOCAL_HTML_PATH) while os.path.exists(fromFile): try: if self.doraemon.uploadFileApi(self.settings.UPLOAD_HTML_API, 'local.tar.gz', fromFile): os.remove(fromFile) for file in uploadedList: self.doraemon.deleteFile(file) print 'Success to upload html file: {0}'.format(fromFile) except Exception as e: print 'Exception {0} to upload html file: {1}'.format( e.message, fromFile)
class UpdateMonitorFiles(): def __init__(self, siteinfo=None): self.siteinfo = siteinfo self.globalSettings = Settings() self.doraemon = Doraemon() self.getSettings() self.file = FileIOMiddleware() def getSettings(self): self.settings = self.globalSettings.CreateSettings(self.siteinfo) self.work_path_prd4 = self.settings.WORK_PATH_PRD1 self.work_path_prd3 = self.settings.WORK_PATH_PRD2 self.content_backup_path = self.settings.FINISHED_BACKUP_PATH self.content_backup_post_path = self.settings.FINISHED_BACKUP_POST_PATH self.url_backup_path = self.settings.URL_BACKUP_PATH self.url_backup_post_path = self.settings.URL_BACKUP_POST_PATH self.monitor_site_template_path = self.globalSettings.MONITOR_SITE_TEMPLATE_PATH self.monitor_spiders_template_path = self.globalSettings.MONITOR_SPIDERS_TEMPLATE_PATH self.monitor_upload_local = self.globalSettings.MONITOR_UPLOAD_LOCAL self.monitor_site_webserver0 = self.globalSettings.MONITOR_SITE_HTML_WEBSERVER0 self.monitor_site_url = self.globalSettings.MONITOR_SITE_URL self.monitor_upload_webserver0 = self.globalSettings.MONITOR_UPLOAD_PATH_WEBSERVER0 def updateSpiders(self, siteName, ycount1, tcount1, turl1, diff1, ycount2, tcount2, turl2, diff2): return '<tr>' + \ '<th align="center" valign="middle">{0}</th>'.format(siteName) + \ '<td align="center" valign="middle">{0}</td>'.format(ycount1) + \ '<td align="center" valign="middle"><a href="{0}" target="_blank">{1}</a></td>'.format(turl1, tcount1) + \ '<td align="center" valign="middle">{0}</td>'.format(diff1) + \ '<td align="center" valign="middle">{0}</td>'.format(ycount2) + \ '<td align="center" valign="middle"><a href="{0}" target="_blank">{1}</a></td>'.format(turl2, tcount2) + \ '<td align="center" valign="middle">{0}</td>'.format(diff2) + \ '</tr>' def updateSite(self, number, title, url): return '<tr>' + \ '<td align="center" valign="middle">{0}</td>'.format(number) + \ '<td align="center" valign="middle"><a href="{0}" target="_blank">{1}</a></td>'.format(url, title) + \ '</tr>' def uploadFile(self, fromFile, toFile): while os.path.exists(fromFile): try: if self.doraemon.sshUpload( self.globalSettings.IP_WEBSERVER0, self.globalSettings.PORT_WEBSERVER0, self.globalSettings.USER_ROOT_WEBSERVER0, self.globalSettings.USER_ROOT_PASSWORD_WEBSERVER0, fromFile, toFile): print 'Success to retry to upload monitor file: {0}'.format( fromFile) return True except Exception as e: print 'Exception {0} to upload monitor site file: {1}'.format( e.message, fromFile) return False def updateSingleSite(self, preBackupPath, postBackupPath, siteName): singleSiteData = singleSiteDto(self.siteinfo.name, 0, 0, None, 0) isPreBackupFileExists = os.path.exists(preBackupPath) isPostBackupFileExists = os.path.exists(postBackupPath) preCsvContent = None if isPreBackupFileExists: print "Start to read url back up file: {0}".format( self.settings.NAME) preCsvContent = self.file.readColsFromCSV(preBackupPath, ['title', 'url']) singleSiteData.tcount = len(preCsvContent.values) else: print "Url back up file not exits: {0}".format(self.settings.NAME) singleSiteData.tcount = 0 if isPostBackupFileExists: print "Start to read post url back up file: {0}".format( self.settings.NAME) postCsvContent = self.file.readColsFromCSV(postBackupPath, ['title', 'url']) singleSiteData.ycount = len(postCsvContent.values) else: print "Post url back up file not exits: {0}".format( self.settings.NAME) singleSiteData.ycount = 0 singleSiteData.diff = singleSiteData.tcount - singleSiteData.ycount if preCsvContent is not None: if preCsvContent.empty: print "No new back up url: {0}".format(self.settings.NAME) else: template = self.file.readFromTxt( self.monitor_site_template_path) finalContent = '' number = 1 for item in preCsvContent.values: finalContent = "{0}{1}".format( finalContent, self.updateSite(number, item[1], item[0])) number += 1 template = template.replace( 'UpdateTime', self.doraemon.getCurrentLocalTime()) template = template.replace('ServerName', siteName) template = template.replace('SiteName', self.siteinfo.name) template = template.replace('MainContent', finalContent) turl = '{0}{1}_{2}.html'.format(self.monitor_site_url, self.settings.NAME, siteName) singleSiteData.turl = turl uploadLocalHtmlPath = '{0}/{1}_{2}.html'.format( self.monitor_upload_local, self.settings.NAME, siteName) self.file.writeToHtmlCover(uploadLocalHtmlPath, template) return singleSiteData def processAllSites(self, allSitesData=None): template = self.file.readFromTxt(self.monitor_spiders_template_path) mainContent = '' t = totalDto(0, 0, 0, 0, 0, 0) for data in allSitesData: mainContent = '{0}{1}'.format( mainContent, self.updateSpiders(data.prd3.sitename, data.prd3.ycount, data.prd3.tcount, data.prd3.turl, data.prd3.diff, data.prd4.ycount, data.prd4.tcount, data.prd4.turl, data.prd4.diff)) t.prd3ytotal += data.prd3.ycount t.prd3ttotal += data.prd3.tcount t.prd4ytotal += data.prd4.ycount t.prd4ttotal += data.prd4.tcount t.prd3difftotal = t.prd3ttotal - t.prd3ytotal t.prd4difftotal = t.prd4ttotal - t.prd4ytotal mainContent = '{0}{1}'.format( mainContent, self.updateSpiders('Summary', t.prd3ytotal, t.prd3ttotal, '', t.prd3difftotal, t.prd4ytotal, t.prd4ttotal, '', t.prd4difftotal)) template = template.replace('UpdateTime', self.doraemon.getCurrentLocalTime()) template = template.replace('MainContent', mainContent) localHtmlPath = '{0}/index.html'.format(self.monitor_upload_local) self.file.writeToHtmlCover(localHtmlPath, template) self.doraemon.tar(self.monitor_upload_local) fromFile = '{0}.tar.gz'.format(self.monitor_upload_local) self.uploadFile( fromFile, '{0}/monitor.tar.gz'.format(self.monitor_upload_webserver0)) os.remove(fromFile) def processSingleSite(self): spidersContent = allSitesDto(None, None) spidersContent.prd3 = self.updateSingleSite(self.url_backup_path, self.url_backup_post_path, 'prd3') spidersContent.prd4 = self.updateSingleSite( self.content_backup_path, self.content_backup_post_path, 'prd4') return spidersContent
class CamelBone(): def __init__(self, siteinfo=None, callback=callable): self.siteinfo = siteinfo self.callBack = callback self.globalSettings = Settings() self.file = FileIOMiddleware() self.doraemon = Doraemon() self.getSettings() def getSettings(self): self.settings = self.globalSettings.CreateSettings(self.siteinfo) self.log_path = self.globalSettings.LOG_PATH_PRD2 self.today = self.globalSettings.TODAY self.source = self.settings.SOURCE_NAME self.work_path_prd2 = self.settings.WORK_PATH_PRD2 self.mongo = self.settings.MONGO_URLS self.name = self.settings.NAME self.max_pool_size = self.settings.MAX_POOL_SIZE_URL self.urls = self.settings.URLS self.max_concurrency = self.globalSettings.MAX_CONCURRENCY self.concurrency_file = self.globalSettings.CONCURRENCY_FILE self.url_backup_folder_path = self.settings.URL_BACKUP_FOLDER_PATH self.url_timeout = self.settings.URL_TIMEOUT self.createPath() def createPath(self): self.doraemon.createFilePath(self.work_path_prd2) self.doraemon.createFilePath(self.log_path) self.doraemon.createFilePath(self.url_backup_folder_path) def parse(self, response): time.sleep(1) current_url = response['response'].current_url.encode('gbk') print 'Start to parse: {0}'.format(current_url) html = etree.HTML(response['response'].page_source) results = self.callBack(current_url, html) if len(results) == 0: message1 = 'No url for page: {0}'.format(current_url) self.file.logger(self.log_path, message1) print message1 for item in results: is_title_empty = self.doraemon.isEmpty(item.title) if (is_title_empty is False) and (self.doraemon.isDuplicated( self.doraemon.bf_urls, item.title) is False): message2 = 'Start to store mongo {0}'.format(item.url) self.file.logger(self.log_path, message2) print message2 self.doraemon.storeMongodb( self.mongo, self.doraemon.createCamelMongoJson(item)) message3 = 'End to store mongo {0}'.format(item.url) self.file.logger(self.log_path, message3) print message3 self.file.logger(self.log_path, 'Done for {0}'.format(item.url)) else: if is_title_empty is True: message4 = 'Empty title for {0}'.format(item.url) self.file.logger(self.log_path, message4) print message4 else: print 'Finished title for {0}'.format(item.url) print 'End to parse {0}'.format(current_url) del current_url, results, html gc.collect() def start(self, isdebug=False): if self.doraemon.isCamelReadyToRun( self.settings) is False and isdebug is False: message5 = 'It is not ready to run for {0}'.format(self.name) print message5 return message6 = 'Start {0} requests'.format(self.name) self.file.logger(self.log_path, message6) print message6 new_urls = [] content = self.file.readFromTxt(self.urls) url_list = content.split('\n') for url in url_list: if self.doraemon.isEmpty(url) is False: new_urls.append([url, '']) if len(new_urls) == 0: print 'No url.' return request = BrowserRequest() content = request.start_chrome(new_urls, self.url_timeout, self.max_pool_size, self.log_path, None, callback=self.parse) self.doraemon.recoveryConcurrency(self.concurrency_file, self.max_concurrency) message7 = 'End for {0} requests of {1}.'.format( str(len(content)), self.name) self.file.logger(self.log_path, message7) print message7 del new_urls, content, url_list, request gc.collect()
class Settings(): def __init__(self): self.file = FileIOMiddleware() self.RSYNC_PRD1 = "/home/dev/Data/rsyncData/prd4" self.RSYNC_PRD2 = "/home/dev/Data/rsyncData/prd3" self.CAMEL_FOOD = "/home/dev/Repository/news/Tegenaria/tSpider/tSpider/food" self.SITES_INFO = "/home/dev/Repository/news/Tegenaria/tSpider/tSpider/cobwebs/sites_info.txt" self.SITES_DEBUG = "/home/dev/Repository/news/Tegenaria/tSpider/tSpider/cobwebs/sites_debug.txt" self.SELENIUM_TIMEOUT = 120 #second self.CHROMEDRIVER_PATH = "/usr/bin/chromedriver" #timeout handler self.PROCESS_TIMEOUT = 2 # minutes self.TIMEOUT_CACHE_FILE = "/home/dev/Data/rsyncData/timeout.cache" self.PROCESS_TIMEOUT_CONTENT = 60 # minutes #concurrency self.REFRESH_CONCURRENCY_INTERVAL = 30 #minute self.MAX_CONCURRENCY = 10 self.CONCURRENCY_FILE = "{0}/max_concurrency.txt".format( self.RSYNC_PRD2) self.CONCURRENCY_REFRESH_FILE = "{0}/concurrency_refresh.txt".format( self.RSYNC_PRD2) self.REFRESH_CONCURRENCY_INTERVAL_SPIDER = 30 # minute self.MAX_CONCURRENCY_SPIDER = 10 self.CONCURRENCY_FILE_SPIDER = "{0}/max_concurrency.txt".format( self.RSYNC_PRD1) self.CONCURRENCY_REFRESH_FILE_SPIDER = "{0}/concurrency_refresh.txt".format( self.RSYNC_PRD1) self.USER_AGENTS = [ "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; AcooBrowser; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.0; Acoo Browser; SLCC1; .NET CLR 2.0.50727; Media Center PC 5.0; .NET CLR 3.0.04506)", "Mozilla/4.0 (compatible; MSIE 7.0; AOL 9.5; AOLBuild 4337.35; Windows NT 5.1; .NET CLR 1.1.4322; .NET CLR 2.0.50727)", "Mozilla/5.0 (Windows; U; MSIE 9.0; Windows NT 9.0; en-US)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; Win64; x64; Trident/5.0; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 2.0.50727; Media Center PC 6.0)", "Mozilla/5.0 (compatible; MSIE 8.0; Windows NT 6.0; Trident/4.0; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; .NET CLR 1.0.3705; .NET CLR 1.1.4322)", "Mozilla/4.0 (compatible; MSIE 7.0b; Windows NT 5.2; .NET CLR 1.1.4322; .NET CLR 2.0.50727; InfoPath.2; .NET CLR 3.0.04506.30)", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN) AppleWebKit/523.15 (KHTML, like Gecko, Safari/419.3) Arora/0.3 (Change: 287 c9dfb30)", "Mozilla/5.0 (X11; U; Linux; en-US) AppleWebKit/527+ (KHTML, like Gecko, Safari/419.3) Arora/0.6", "Mozilla/5.0 (Windows; U; Windows NT 5.1; en-US; rv:1.8.1.2pre) Gecko/20070215 K-Ninja/2.1.1", "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9) Gecko/20080705 Firefox/3.0 Kapiko/3.0", "Mozilla/5.0 (X11; Linux i686; U;) Gecko/20070322 Kazehakase/0.4.5", "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.8) Gecko Fedora/1.9.0.8-1.fc10 Kazehakase/0.5.6", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.56 Safari/535.11", "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_7_3) AppleWebKit/535.20 (KHTML, like Gecko) Chrome/19.0.1036.7 Safari/535.20", "Opera/9.80 (Macintosh; Intel Mac OS X 10.6.8; U; fr) Presto/2.9.168 Version/11.52", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/536.11 (KHTML, like Gecko) Chrome/20.0.1132.11 TaoBrowser/2.0 Safari/536.11", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.71 Safari/537.1 LBBROWSER", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; LBBROWSER)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E; LBBROWSER)", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/535.11 (KHTML, like Gecko) Chrome/17.0.963.84 Safari/535.11 LBBROWSER", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", "Mozilla/5.0 (compatible; MSIE 9.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E; QQBrowser/7.0.3698.400)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 5.1; Trident/4.0; SV1; QQDownload 732; .NET4.0C; .NET4.0E; 360SE)", "Mozilla/4.0 (compatible; MSIE 6.0; Windows NT 5.1; SV1; QQDownload 732; .NET4.0C; .NET4.0E)", "Mozilla/4.0 (compatible; MSIE 7.0; Windows NT 6.1; WOW64; Trident/5.0; SLCC2; .NET CLR 2.0.50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)", "Mozilla/5.0 (Windows NT 5.1) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.1 (KHTML, like Gecko) Chrome/21.0.1180.89 Safari/537.1", "Mozilla/5.0 (iPad; U; CPU OS 4_2_1 like Mac OS X; zh-cn) AppleWebKit/533.17.9 (KHTML, like Gecko) Version/5.0.2 Mobile/8C148 Safari/6533.18.5", "Mozilla/5.0 (Windows NT 6.1; Win64; x64; rv:2.0b13pre) Gecko/20110307 Firefox/4.0b13pre", "Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:16.0) Gecko/20100101 Firefox/16.0", "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11", "Mozilla/5.0 (X11; U; Linux x86_64; zh-CN; rv:1.9.2.10) Gecko/20100922 Ubuntu/10.10 (maverick) Firefox/3.6.10" ] self.ACCEPT = "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8" self.ACCEPT_LANGUAGE = "zh-CN,zh;q=0.9,en;q=0.8" self.ACCEPT_ENC0DING = "gzip, deflate" self.CONNECTION = "keep-alive" self.CACHE_CONTROL = "max-age=0" self.PRAGMA = "no-cache" self.UPGRADE_INSECURE_REQUESTS = "1" self.LOG_PATH = "{0}/log".format(self.RSYNC_PRD1) self.LOG_PATH_PRD2 = "{0}/log".format(self.RSYNC_PRD2) self.MONGO_URI = 'mongodb://127.0.0.1:27017' self.MONGO_DEEPINEWS = 'DeepNewsDatabase' self.REDIS_HOST = '127.0.0.1' self.REDIS_PORT = 6379 self.BLOOMFILTER_URLS = "tegenaria:urls" self.BLOOMFILTER_CONTENT = "tegenaria:content" self.BLOOMFILTER_AUTHORS = "tegenaria:authors" self.TODAY = time.strftime('%Y-%m-%d', time.localtime(time.time())) self.YESTERDAY = (datetime.now() - timedelta(days=1)).strftime("%Y-%m-%d") self.CHRONUS_SETTINGS = "{0}/log/chronus.csv".format(self.RSYNC_PRD1) self.DISABLE_RESTART_INTERVAL = False #sogo-sogo-weixin self.VALID_PROXY_POOL_SOGO_ACCOUNT = "valid_proxy_pool:sogo_account" self.INVALID_PROXY_POOL_SOGO_ACCOUNT = "invalid_proxy_pool:sogo_account" self.VALID_PROXY_POOL_SOGO_ARTICLE_LIST = "valid_proxy_pool:sogo_article_list" self.INVALID_PROXY_POOL_SOGO_ARTICLE_LIST = "invalid_proxy_pool:sogo_article_list" self.VALID_PROXY_POOL_WX = "valid_proxy_pool:wx" self.INVALID_PROXY_POOL_WX = "invalid_proxy_pool:wx" self.FINISHED_SOGO_ACCOUNT = "finished:sogo_account" self.FINISHED_SOGO_ARTICLE_LIST = "finished:sogo_article_list" self.FINISHED__WX = "finished:wx" # shenjian-weixin self.FINISHED_WEIXIN_URL_ID = "finished:weixin_url_id" self.FINISHED_WEIXIN_URL_ARTICLE = "finished:weixin_url_article" self.FINISHED_WEIXIN_CONTENT_ARTICLE = "finished:weixin_content_article" #sites self.URL_DEEPINEWS_10002_ARTICLE = "http://www.deepinews.com:10002/article/" self.URL_DEEPINEWS_10002_IMAGE = 'http://www.deepinews.com:10002/img/' # self.URL_DEEPINEWS_10002_ARTICLE = 'http://192.168.163.26:8081/article/' # self.URL_DEEPINEWS_10002_IMAGE = 'http://192.168.163.26:8081/img/' #images filter self.FINISHED_IMAGE_ID = "finished:image_id" #temp folder for html and img self.TEMP_FOLDER_HTML = '/home/dev/Data/Production/data4deepinews/html' self.TEMP_FOLDER_IMG = '/home/dev/Data/Production/data4deepinews/img' self.FINISHED_TEMP_WEIXIN = "finished:temp_weixin" #remove server information self.HOST_PASSWORD_FILE = '/home/dev/Repository/news/servers/webserver0.txt' self.HOST_INFO = self.getServerInfo(self.HOST_PASSWORD_FILE) self.HOST_NAME = self.HOST_INFO.ip self.USER_NAME = 'root' self.PASSWORD = self.HOST_INFO.password self.PORT = 22 self.REMOTE_IMG_PATH = '/home/dev/Data/Production/img_tmp' self.REMOTE_HTML_PATH = '/home/dev/Data/Production/html_tmp' self.MAX_UPLOAD_PROCESS = 20 #aliyun oss access token self.ALI_OSS_TOKEN_FILE = '/home/dev/Repository/news/servers/aliyun.txt' self.ALI_OSS_INFO = self.getServerInfo(self.ALI_OSS_TOKEN_FILE) #refresh the redis interval self.REFRESH_REDIS_INTERVAL = 1440 #huxiu_nlp self.FINISHED_HUXIU_NLP = "finished:huxiu_nlp" #mongodb self.SPIDERDB = "SPIDERS" #article url self.ARTICLE_URL = "https://www.deepinews.com/article/" #aliyun self.ALI_DOMAIN = "oss-cn-beijing.aliyuncs.com" self.ALI_BUCKET_NAME_DEEPINEWS = "deepinews" self.ALI_BUCKET_NAME_DEEPINEWS_IMG = "img" # local html info self.LOCAL_HTML_PATH = "{0}/local".format(self.RSYNC_PRD1) # webserver0 html info self.WEBSERVER0_PASSWORD_FILE = '/home/dev/Repository/news/servers/webserver0.txt' self.WEBSERVER0_INFO = self.getServerInfo( self.WEBSERVER0_PASSWORD_FILE) self.IP_WEBSERVER0 = self.WEBSERVER0_INFO.ip self.PORT_WEBSERVER0 = 22 self.USER_ROOT_WEBSERVER0 = "root" self.USER_ROOT_PASSWORD_WEBSERVER0 = self.WEBSERVER0_INFO.password self.HTML_WEBSERVER0 = "/home/dev/Data/Production/article" self.RETRY_FILE = "{0}/retry.txt".format(self.RSYNC_PRD1) self.UPLOAD_HTML_API = "https://www.deepinews.com/api/articles/uploadhtml" #webserver0 mongo data info self.LOCAL_MONGO_DATA_PATH = "/home/dev/Data/Production/data4deepinews/{0}.csv".format( self.TODAY) self.REMOTE_MONGO_DATA_PATH = "/home/dev/Data/Production/data4deepinews/{0}.csv".format( self.TODAY) #template self.TEMPLATE_PATH = "/home/dev/Repository/news/Tegenaria/tSpider/tSpider/storeHtml/template_1.html" #monitor self.MONITOR_SPIDERS_URL = "https://www.deepinews.com/sites/index.html" self.MONITOR_SITE_URL = "https://www.deepinews.com/sites/" self.MONITOR_SPIDERS_TEMPLATE_PATH = \ "/home/dev/Repository/news/Tegenaria/tSpider/tSpider/spiderMonitor/index.html" self.MONITOR_SITE_TEMPLATE_PATH = \ "/home/dev/Repository/news/Tegenaria/tSpider/tSpider/spiderMonitor/site.html" self.MONITOR_UPLOAD_LOCAL = "{0}/monitor".format(self.RSYNC_PRD1) self.MONITOR_UPLOAD_PATH_WEBSERVER0 = "/home/dev/Data/Production" self.MONITOR_SITE_HTML_WEBSERVER0 = "/home/dev/Data/Production/statics/sites" def getServerInfo(self, file): contents = self.file.readFromTxt(file) machines = contents.split('\n') result = None for machine in machines: if machine == '': continue info = machine.split('==') result = machineDto(info[0].strip(), info[1].strip()) return result def SettingsFormat(self, SETTINGS_NAME, SOURCE_NAME, RESTART_INTERVAL, MAX_POOL_SIZE_URL, MAX_POOL_SIZE_CONTENT, IS_OPEN_CACHE, START_TIME, END_TIME, URL_TIMEOUT, CONTENT_TIMEOUT): return settingsSpec(SETTINGS_NAME, SOURCE_NAME, RESTART_INTERVAL, MAX_POOL_SIZE_URL, MAX_POOL_SIZE_CONTENT, IS_OPEN_CACHE, START_TIME, END_TIME, URL_TIMEOUT, CONTENT_TIMEOUT) def CreateSettings(self, siteinfo=None): print "Create setting for: {0}".format(siteinfo.domain) return self.SettingsFormat( siteinfo.domain, siteinfo.name, siteinfo.restart_interval, siteinfo.url_parallel_number, siteinfo.content_parallel_number, siteinfo.is_open_cache, siteinfo.work_time_start, siteinfo.work_time_end, siteinfo.url_timeout, siteinfo.content_timeout) def CreateCommonSettings(self): return self.SettingsFormat('0', '0', '0', '0', '0', '0', '0', '0', '0', '0')