Exemples Python de FileIOMiddleware.readColsFromCSV

Langage de programmation: Python

Espace de noms/Paquet : Tegenaria.tSpider.tSpider.middlewares.fileIOMiddleware

Class/Type: FileIOMiddleware

Méthode/Fonction: readColsFromCSV

Exemples sur hotexamples.com : 2

Python FileIOMiddleware.readColsFromCSV - 2 exemples trouvés. Ce sont les exemples réels les mieux notés de Tegenaria.tSpider.tSpider.middlewares.fileIOMiddleware.FileIOMiddleware.readColsFromCSV extraits de projets open source. Vous pouvez noter les exemples pour nous aider à en améliorer la qualité.

Méthodes fréquemment utilisées

Afficher / Cacher

readFromTxt(12)

FileIOMiddleware(11)

logger(5)

writeToTxtAdd(4)

readColsFromCSV(2)

writeToHtmlCover(2)

writeToTxtCover(2)

readFromCSV(1)

Méthodes fréquemment utilisées

readFromTxt (12)

FileIOMiddleware (11)

logger (5)

writeToTxtAdd (4)

readColsFromCSV (2)

writeToHtmlCover (2)

writeToTxtCover (2)

readFromCSV (1)

Exemple #1

0

Afficher le fichier

Fichier : doraemonMiddleware.py Projet : hulu7/news

class Doraemon():
    """Shared helper toolbox for the crawlers.

    Bundles Redis-backed bloom filters, file/date/image utilities,
    counter-file based concurrency gating and parsing of the site
    configuration text into DTOs.

    ``print`` is always called with one parenthesised argument, which behaves
    the same as the Python 2 print statement used by the rest of the project.
    """

    # Relative-date keywords mapped to "days before today"; checked in order.
    _RELATIVE_DAY_KEYWORDS = (
        (0, ("今天", "秒前", "分钟前", "小时前", "Today")),
        (1, ("昨天", "1天前", "Yesterday")),
        (2, ("前天", "2天前", "2 days ago")),
        (3, ("3天前", "3 days ago")),
        (4, ("4天前", "4 days ago")),
        (5, ("5天前", "5 days ago")),
        (6, ("6天前", "6 days ago")),
        (7, ("1周前", "1 week ago")),
    )

    # (regex, strptime format, isMatchDate flag, kind), in priority order.
    # kind: 'full' = year comes from the text, 'partial' = current year is
    # used, 'time' = only a clock time was found, so today's date is used.
    # The dots are escaped: an unescaped '.' matches any character, which let
    # a bogus earlier match (that strptime then rejects) hide a real date.
    _DATE_PATTERNS = (
        (r'\d{4}-\d{1,2}-\d{1,2}', '%Y-%m-%d', True, 'full'),
        (r'\d{2}-\d{1,2}-\d{1,2}', '%y-%m-%d', True, 'full'),
        (r'\d{1,2}-\d{1,2}', '%m-%d', True, 'partial'),
        (r'\d{4}\.\d{1,2}\.\d{1,2}', '%Y.%m.%d', True, 'full'),
        (r'\d{1,2}\.\d{1,2}', '%m.%d', True, 'partial'),
        (r'\d{4}\/\d{1,2}\/\d{1,2}', '%Y/%m/%d', True, 'full'),
        (r'\d{1,2}\/\d{1,2}', '%m/%d', True, 'partial'),
        (r'\d{1,2}:\d{1,2}:\d{1,2}', '%H:%M:%S', False, 'time'),
        (r'\d{1,2}:\d{1,2}', '%H:%M', False, 'time'),
    )

    def __init__(self):
        """Load the common settings and create the Redis bloom filters."""
        settings = Settings()
        settings.CreateCommonSettings()
        self.file = FileIOMiddleware()
        self.rconn = redis.Redis(settings.REDIS_HOST, settings.REDIS_PORT)
        self.bf_urls = BloomFilter(self.rconn, settings.BLOOMFILTER_URLS)
        self.bf_content = BloomFilter(self.rconn, settings.BLOOMFILTER_CONTENT)
        self.bf_authors = BloomFilter(self.rconn, settings.BLOOMFILTER_AUTHORS)
        self.disable_restart_interval = settings.DISABLE_RESTART_INTERVAL
        self.bf_weixin_url = BloomFilter(
            self.rconn, settings.FINISHED_WEIXIN_URL_ARTICLE)
        self.bf_weixin_content = BloomFilter(
            self.rconn, settings.FINISHED_WEIXIN_CONTENT_ARTICLE)
        self.bf_weixin_id = BloomFilter(
            self.rconn, settings.FINISHED_WEIXIN_URL_ID)
        self.bf_finished_image_id = BloomFilter(
            self.rconn, settings.FINISHED_IMAGE_ID)
        self.bf_finished_temp_weixin = BloomFilter(
            self.rconn, settings.FINISHED_TEMP_WEIXIN)
        # Kept for backward compatibility; getMD5 no longer accumulates into it.
        self.md5 = hashlib.md5()
        self.max_concurrency = settings.MAX_CONCURRENCY
        self.concurrency_file = settings.CONCURRENCY_FILE
        self.concurrency_refresh_file = settings.CONCURRENCY_REFRESH_FILE
        self.refresh_concurrency_interval = settings.REFRESH_CONCURRENCY_INTERVAL
        self.max_concurrency_spider = settings.MAX_CONCURRENCY_SPIDER
        self.concurrency_file_spider = settings.CONCURRENCY_FILE_SPIDER
        self.concurrency_refresh_file_spider = settings.CONCURRENCY_REFRESH_FILE_SPIDER
        self.refresh_concurrency_interval_spider = settings.REFRESH_CONCURRENCY_INTERVAL_SPIDER
        self.bf_huxiu_nlp = BloomFilter(self.rconn, settings.FINISHED_HUXIU_NLP)
        self.sites_info = settings.SITES_INFO
        self.sites_debug = settings.SITES_DEBUG

    # ------------------------------------------------------------------
    # File transfer helpers
    # ------------------------------------------------------------------
    def sshUpload(self, address, port, username, password, fromFile, toFile):
        """Upload ``fromFile`` to ``toFile`` over SFTP.

        Returns True on success and False when any exception occurs. The
        transport is closed in both cases.
        """
        transport = paramiko.Transport((address, port))
        try:
            print('Start to upload file: {0}'.format(fromFile))
            transport.connect(username=username, password=password)
            sftp = paramiko.SFTPClient.from_transport(transport)
            sftp.put(fromFile, toFile)
            print('Finished to upload file: {0}'.format(fromFile))
            return True
        except Exception as e:
            print('Exception {0} to upload file: {1}'.format(e, fromFile))
            return False
        finally:
            # Previously the connection leaked whenever the upload failed.
            transport.close()

    def _transferWithRetry(self, operation, verb, fromfile, tofile):
        """Run ``operation(fromfile, tofile)`` until ``tofile`` exists.

        Shared body of moveFile/copyFile. ``verb`` ('move' or 'copy') is only
        used to build the messages. Up to 60 attempts are made, sleeping one
        second before every attempt after the first.
        """
        if fromfile is None or os.path.exists(fromfile) is False:
            print("Source file {0} is not exits".format(fromfile))
            return False
        try:
            retry = 1
            retryLimit = 60
            while not os.path.exists(tofile) and retry <= retryLimit:
                if retry > 1:
                    time.sleep(1)
                operation(fromfile, tofile)
                retry += 1
            if retryLimit < retry:
                raise Exception('{0} file retry limit time reached.'.format(
                    verb.capitalize()))
            return True
        except Exception as e:
            raise Exception(
                "Exception {0} to {1} file {2} to file {3}.".format(
                    e, verb, fromfile, tofile))

    def moveFile(self, fromfile=None, tofile=None):
        """Move ``fromfile`` to ``tofile``.

        Returns False when the source is missing, True otherwise; failures
        are re-raised as a generic Exception.
        """
        return self._transferWithRetry(shutil.move, 'move', fromfile, tofile)

    def copyFile(self, fromfile=None, tofile=None):
        """Copy ``fromfile`` to ``tofile`` with the same contract as moveFile."""
        return self._transferWithRetry(shutil.copy, 'copy', fromfile, tofile)

    def createFilePath(self, path):
        """Create directory ``path`` (with parents) when it does not exist."""
        if os.path.exists(path) is False:
            os.makedirs(path)

    # ------------------------------------------------------------------
    # Interval checks
    # ------------------------------------------------------------------
    def isExceedRestartInterval(self, path, restart_interval):
        """Tell whether the timestamp stored in ``path`` is old enough.

        A missing file is created with the current time and counts as
        exceeded. Otherwise the result is True when at least
        ``restart_interval`` whole minutes elapsed, or when
        ``disable_restart_interval`` is set; in that case the stored
        timestamp is refreshed.
        """
        if os.path.exists(path) is False:
            print('restart file does not exit and create an new one')
            self.file.writeToTxtCover(path, time.time())
            return True
        past = float(self.file.readFromTxt(path))
        now = time.time()
        isExceed = ((now - past) // 60 >= restart_interval) or (
            self.disable_restart_interval is True)
        if isExceed is True:
            print('exceeds the restart interval and restart')
            self.file.writeToTxtCover(path, time.time())
        else:
            print('does not exceed the restart interval and stop')
        return isExceed

    def isExceedTimeoutInterval(self, timeout, past):
        """Return True when ``timeout`` whole minutes passed since ``past``.

        An empty ``timeout`` never expires (returns False).
        """
        if self.isEmpty(timeout):
            print('timeout is empty')
            return False
        now = time.time()
        isExceed = ((now - past) // 60 >= timeout)
        if isExceed is True:
            print('exceeds the timeout.')
        else:
            print('does not exceeds the timeout.')
        return isExceed

    # ------------------------------------------------------------------
    # Small predicates
    # ------------------------------------------------------------------
    def isEmpty(self, obj=None):
        """Return True for None, blank strings and empty containers.

        Numbers are never empty. ``unicode`` input is encoded to UTF-8 first
        (Python 2 text type).
        """
        if isinstance(obj, unicode):
            obj = obj.encode('utf-8')
        if isinstance(obj, str):
            # Blank means: no character survives stripping.
            return not obj.strip()
        if isinstance(obj, (int, float)):
            return False
        if isinstance(obj, (list, dict, tuple, set)):
            return len(obj) == 0
        return obj == None

    def isNumber(self, item):
        """Return True when ``item`` is an int or a float."""
        return isinstance(item, (int, float))

    def isTitleEmpty(self, title, url):
        """Return True (and log the url) when ``title`` is empty."""
        if self.isEmpty(title):
            print('Empty title for: {0}'.format(url))
            return True
        return False

    def isUrlValid(self, url, good_keys, bad_keys, regx, valid):
        """Validate ``url`` against compiled patterns and key lists.

        The url must match at least one pattern in ``regx``. Starting from
        ``valid``, a contained good key switches the result to True and a
        contained bad key switches it back to False.
        """
        if not any(pattern.match(url) is not None for pattern in regx):
            print('Invalid url for not match: {0}'.format(url))
            return False
        for good in good_keys:
            if valid == True:
                continue
            if good in url:
                print('Match good key: {0}'.format(good))
                valid = True
        for bad in bad_keys:
            if valid == False:
                continue
            if bad in url:
                print('Match bad key: {0}'.format(bad))
                valid = False
        return valid

    def getImageTypeFromUrl(self, url):
        """Guess the image extension from ``url``; defaults to 'png'."""
        # The original test `'jpeg' or '.jpg' in url` was always true.
        if 'jpeg' in url or '.jpg' in url:
            return 'jpg'
        if 'png' in url:
            return 'png'
        if 'gif' in url:
            return 'gif'
        print('Other image type use default type')
        return 'png'

    # ------------------------------------------------------------------
    # Bloom filter / storage helpers
    # ------------------------------------------------------------------
    def isDuplicated(self, filter, content):
        """Return True when ``content`` is already in ``filter``.

        Content that is not present is inserted before returning False.
        """
        content_encode = str(content).encode("utf-8")
        if filter.isContains(content_encode):
            print('Content {0} duplicated!'.format(content))
            return True
        filter.insert(content_encode)
        print('Content {0} not duplicated!'.format(content))
        return False

    def isFinished(self, filter, content):
        """Return True when ``content`` is in ``filter`` (no insertion)."""
        content_encode = str(content).encode("utf-8")
        if filter.isContains(content_encode):
            print('Content {0} exists!'.format(content))
            return True
        return False

    def storeFinished(self, filter, content):
        """Insert ``content`` into ``filter``."""
        print('Start to store content: {0}'.format(content))
        content_encode = str(content).encode("utf-8")
        filter.insert(content_encode)

    def storeMongodb(self, mongo_url, data):
        """Insert ``data`` through a fresh MongoMiddleware."""
        mongo = MongoMiddleware()
        mongo.insert(mongo_url, data)

    def storeTxt(self, id, content, finished_txt_path, name):
        """Write ``content`` to ``<path>/<name>_<id>.txt``; errors are logged."""
        try:
            self.createFilePath(finished_txt_path)
            print('Start to store txt: {0}'.format(id))
            self.file.writeToTxtCover(
                '{0}/{1}_{2}.txt'.format(finished_txt_path, name, id), content)
            print('End to store txt: {0}'.format(id))
        except Exception as e:
            print('Exception {0} to store txt: {1}'.format(e, id))

    def storeTxtAdd(self, author_txt_path, author_name, settingName):
        """Append ``author_name`` to ``<path>/<settingName>_authors.txt``."""
        try:
            self.createFilePath(author_txt_path)
            print('Start to store txt: {0}'.format(author_name))
            self.file.writeToTxtAdd(
                '{0}/{1}_authors.txt'.format(author_txt_path, settingName),
                author_name)
        except Exception as e:
            # Only OS errors carry `strerror`; fall back to the exception so
            # the handler itself cannot raise AttributeError.
            print('Exception to store txt: {0} , for {1}'.format(
                author_name, getattr(e, 'strerror', None) or e))
        print('End to store txt: {0}'.format(author_name))

    def storeHtml(self, id, content, finished_html_path):
        """Write ``content`` to ``<path>/<id>.html``; returns success flag."""
        try:
            self.createFilePath(finished_html_path)
            print('Start to store html: {0}'.format(id))
            self.file.writeToHtmlCover(
                '{0}/{1}.html'.format(finished_html_path, id), content)
            print('End to store html: {0}'.format(id))
            return True
        except Exception as e:
            print('Exception {0} to store html: {1}'.format(e, id))
            return False

    def filter(self, filter, url_titles):
        """Keep the rows whose second field is not yet in ``filter``."""
        return [
            url_title for url_title in url_titles
            if self.isFinished(filter, url_title[1]) is False
        ]

    def imageFilter(self, filter, ids):
        """Keep the ids that are not yet in ``filter``."""
        return [
            image_id for image_id in ids
            if self.isFinished(filter, image_id) is False
        ]

    def readNewUrls(self, filter, url_path):
        """Read the 'url'/'title' columns of the CSV and drop finished rows.

        Returns an empty list when ``url_path`` does not exist.
        """
        print('Start to read urls')
        new_url_titles = []
        if os.path.exists(url_path) is True:
            url_titles = np.array(
                self.file.readColsFromCSV(url_path, ['url', 'title']))
            new_url_titles = self.filter(filter, url_titles)
        return new_url_titles

    def readNewImageIds(self, filter, content_path):
        """Read the 'id' column of the CSV and drop finished ids."""
        print('Start to read ids')
        id_list = []
        if os.path.exists(content_path) is True:
            ids = np.array(self.file.readColsFromCSV(content_path, ['id']))
            id_list = [row[0] for row in ids]
        return self.imageFilter(filter, id_list)

    def downloadImage(self, image_url, store_path, image_name):
        """Download ``image_url`` into ``store_path``; returns success flag."""
        try:
            self.createFilePath(store_path)
            print('start to download image: {0}'.format(image_url))
            urllib.urlretrieve(image_url,
                               '{0}/{1}'.format(store_path, image_name))
            return True
        except Exception as e:
            print('exception to download image: {0} for {1}'.format(
                image_url, e))
            return False

    # ------------------------------------------------------------------
    # Thin Redis wrappers
    # ------------------------------------------------------------------
    def hashSet(self, name, key, value):
        """HSET ``key`` to ``value`` in hash ``name``."""
        self.rconn.hset(name, key, value)

    def getHashSet(self, name, key):
        """HGET ``key`` from hash ``name``."""
        return self.rconn.hget(name, key)

    def getAllHasSet(self, name):
        """HGETALL of hash ``name``."""
        return self.rconn.hgetall(name)

    def delHashSet(self, name, key):
        """HDEL ``key`` from hash ``name``."""
        return self.rconn.hdel(name, key)

    def delKey(self, key):
        """DEL ``key``."""
        return self.rconn.delete(key)

    def getKeyLen(self, key):
        """HLEN of hash ``key``."""
        return self.rconn.hlen(key)

    # ------------------------------------------------------------------
    # Date helpers
    # ------------------------------------------------------------------
    def getDateOfDaysBefore(self, days):
        """Return the date ``days`` days ago formatted as YYYY-MM-DD."""
        return (datetime.now() - timedelta(days=days)).strftime("%Y-%m-%d")

    def getCurrentYear(self):
        """Return the current local year as a string."""
        return time.strftime('%Y', time.localtime(time.time()))

    def getCurrentDate(self):
        """Return the current local date formatted as YYYY-MM-DD."""
        return time.strftime('%Y-%m-%d', time.localtime(time.time()))

    def getCurrentLocalTime(self):
        """Return the current local time as 'YYYY-MM-DD HH:MM:SS'."""
        return datetime.now().strftime('%Y-%m-%d %H:%M:%S')

    def getDateTime(self, string, dateFormat, pattern, isMatchDate):
        """Find ``dateFormat`` (regex) in ``string`` and parse it.

        The first match is parsed with the strptime ``pattern``. Returns the
        resulting ``date`` or None when nothing matches or parsing fails.
        ``isMatchDate`` is accepted for compatibility only: both branches of
        the original implementation were identical.
        """
        try:
            match = re.search(dateFormat, string)
            strp = datetime.strptime(match.group(), pattern)
            print("'Match date success: {0}".format(strp.date()))
            return strp.date()
        except Exception:
            # Covers "no match" (match is None) and strptime rejections.
            print("'Match date fail")
            return None

    def getDateFromChinese(self, string):
        """Translate relative or Chinese-style dates to 'Y-M-D' text.

        Handles keywords such as "昨天" / "3 days ago" and the
        "(年)月日" notation. Returns None when nothing applies or on error.
        """
        year = self.getCurrentYear()
        try:
            for days, keywords in self._RELATIVE_DAY_KEYWORDS:
                if any(keyword in string for keyword in keywords):
                    return self.getDateOfDaysBefore(days)
            if "年" not in string and "月" in string and "日" in string:
                data = re.split(",",
                                string.replace('月', ',').replace('日', ''))
                return "{0}-{1}-{2}".format(
                    year, self.getNumberFromString(data[0]),
                    self.getNumberFromString(data[1]))
            if "年" in string and "月" in string and "日" in string:
                data = re.split(
                    ",",
                    string.replace('年', ',').replace('月', ',').replace(
                        '日', ','))
                return "{0}-{1}-{2}".format(
                    self.getNumberFromString(data[0]),
                    self.getNumberFromString(data[1]),
                    self.getNumberFromString(data[2]))
        except Exception:
            print("Fail to match date from Chinese.")
        return None

    def getNumberFromString(self, string):
        """Return all digit runs of ``string`` concatenated."""
        return ''.join(re.findall(r'\d+', string)).strip()

    def getFinalDate(self, year, month, day):
        """Join the three parts as 'year-month-day'."""
        return "{0}-{1}-{2}".format(year, month, day)

    def formateMonthDay(self, MD):
        """Zero-pad an integer month/day to two digits."""
        return '{:02d}'.format(MD)

    def getDateFromString(self, string_date):
        """Extract a 'YYYY-MM-DD' date from free text.

        The text is first normalised through getDateFromChinese, then every
        pattern of ``_DATE_PATTERNS`` is tried (all of them, so the log
        output stays the same) and the first successful one in priority
        order wins. Returns None when nothing is recognised.
        """
        chinese_date = self.getDateFromChinese(string_date)
        if chinese_date is not None:
            string_date = chinese_date
        parsed = [(kind, self.getDateTime(string_date, regex, fmt, is_date))
                  for regex, fmt, is_date, kind in self._DATE_PATTERNS]
        year = self.getCurrentYear()
        for kind, value in parsed:
            if value is None:
                continue
            if kind == 'time':
                return self.getCurrentDate()
            return self.getFinalDate(
                value.year if kind == 'full' else year,
                self.formateMonthDay(value.month),
                self.formateMonthDay(value.day))
        return None

    def getMD5(self, content):
        """Return the hex MD5 digest of ``content`` encoded as UTF-8."""
        # A fresh hash per call: updating the shared self.md5 made every
        # digest depend on all content hashed before it.
        return hashlib.md5(content.encode('utf-8')).hexdigest()

    # ------------------------------------------------------------------
    # Image / file helpers
    # ------------------------------------------------------------------
    def compressImage(self, origin_image_path, destination_image_path,
                      multiplier):
        """Shrink the image by ``multiplier`` and save it to the destination.

        The original file is removed before the result is saved. Errors are
        logged, not raised.
        """
        try:
            sImg = Image.open(origin_image_path)
            w, h = sImg.size
            dImg = sImg.resize((int(w / multiplier), int(h / multiplier)),
                               Image.ANTIALIAS)
            os.remove(origin_image_path)
            dImg.save(destination_image_path)
            print("Compress picture {0} success!".format(
                destination_image_path))
        except Exception as e:
            print("Compress picture {0} failed for {1}".format(
                destination_image_path, e))

    def getSizeOfImage(self, image_path):
        """Return the image's ``size`` attribute, or None on error."""
        try:
            img = Image.open(image_path)
            return img.size
        except Exception as e:
            print("Exception to open picture {0}, for {1}.".format(
                image_path, e))

    def getFileSize(self, file_path):
        """Return the file size in KiB rounded to 2 decimals (None on error)."""
        try:
            fsize = os.path.getsize(file_path) / float(1024)
            return round(fsize, 2)
        except Exception as e:
            print("Exception to get file size of {0}, for {1}.".format(
                file_path, e))

    def getFileList(self, diractory):
        """List the entries of ``diractory``; empty list when it is missing."""
        if os.path.exists(diractory) is True:
            return os.listdir(diractory)
        return []

    def isFileExists(self, file_path):
        """Return whether ``file_path`` exists."""
        return os.path.exists(file_path)

    def deleteFile(self, file_path):
        """Remove ``file_path``; errors are logged, not raised."""
        try:
            print("Start to delete file: {0}".format(file_path))
            os.remove(file_path)
            print("Finished to delete file: {0}".format(file_path))
        except Exception as e:
            print("Exception to delete file: {0} for : {1}".format(
                file_path, e))

    def _compressDirectory(self, directory):
        """Write ``<directory>.tar.gz`` and return the added file paths."""
        added = []
        # The context manager closes the archive even when adding fails.
        with tarfile.open(directory + ".tar.gz", "w:gz") as archive:
            for root, _subdirs, files in os.walk(directory):
                for name in files:
                    fullpath = os.path.join(root, name)
                    archive.add(fullpath)
                    added.append(fullpath)
        return added

    def tar(self, directory):
        """Compress ``directory`` into ``<directory>.tar.gz``.

        Does nothing for an empty directory; errors are logged.
        """
        if len(os.listdir(directory)) == 0:
            print("There is no file to compress for: {0}".format(directory))
            return
        try:
            print("Start to compress directory: {0}".format(directory))
            self._compressDirectory(directory)
            print("Finished to compress directory: {0}".format(directory))
        except Exception as e:
            print("Exception to compress directory: {0} for :{1}".format(
                directory, e))

    def tarList(self, directory):
        """Like tar, but return the list of archived file paths.

        Returns None for an empty directory and [] when compression fails.
        """
        if len(os.listdir(directory)) == 0:
            print("There is no file to compress for: {0}".format(directory))
            return
        try:
            print("Start to compress directory: {0}".format(directory))
            lst = self._compressDirectory(directory)
            print("Finished to compress directory: {0}".format(directory))
            return lst
        except Exception as e:
            print("Exception to compress directory: {0} for :{1}".format(
                directory, e))
            return []

    # ------------------------------------------------------------------
    # Scheduling / concurrency gating
    # ------------------------------------------------------------------
    def isCamelReadyToRun(self, settings):
        """Check work time, concurrency slot and restart interval in order.

        When the restart interval is not exceeded, the concurrency slot
        taken just before is given back.
        """
        if self.isWorkTime(settings.START_TIME, settings.END_TIME) is False:
            return False
        if self.isConcurrencyAllowToRun(self.concurrency_refresh_file,
                                        self.refresh_concurrency_interval,
                                        self.concurrency_file,
                                        self.max_concurrency) is False:
            return False
        if self.isExceedRestartInterval(settings.RESTART_PATH,
                                        settings.RESTART_INTERVAL) is False:
            self.recoveryConcurrency(self.concurrency_file,
                                     self.max_concurrency)
            return False
        return True

    def isSpiderReadyToRun(self):
        """Try to take a slot from the spider concurrency counter."""
        return self.isConcurrencyAllowToRun(
            self.concurrency_refresh_file_spider,
            self.refresh_concurrency_interval_spider,
            self.concurrency_file_spider, self.max_concurrency_spider)

    def isWorkTime(self, start_time, end_time):
        """Return True when the current hour is after start and not past end."""
        if self.isNumber(start_time) is False:
            print('start time is empty')
            return False
        if self.isNumber(end_time) is False:
            print('end time is empty')
            return False
        if self.isAfterHour(start_time) and self.isBeforeHour(end_time):
            print('it is work time')
            return True
        print('it is not work time before {0} or after {1}'.format(
            start_time, end_time))
        return False

    def isAfterHour(self, hour):
        """Return True when the current hour is strictly greater than ``hour``.

        Returns None when ``hour`` is not a number.
        """
        if self.isNumber(hour) is False:
            print('input hour is empty.')
            return
        current_time = time.strftime('%H', time.localtime(time.time()))
        return int(hour) < int(current_time)

    def isBeforeHour(self, hour):
        """Return True when the current hour is at most ``hour``.

        Returns None when ``hour`` is not a number.
        """
        if self.isNumber(hour) is False:
            print('input hour is empty.')
            return
        current_time = time.strftime('%H', time.localtime(time.time()))
        return int(hour) >= int(current_time)

    def readFile(self, file):
        """Read ``file``, polling once per second until it is non-empty."""
        waiting = 0
        data = self.file.readFromTxt(file).strip()
        while self.isEmpty(data):
            print('file {0} is under update, waitting... {1} s'.format(
                file, waiting))
            time.sleep(1)
            waiting += 1
            data = self.file.readFromTxt(file).strip()
        return data

    def isConcurrencyAllowToRun(self, concurrency_refresh_file,
                                refresh_concurrency_interval, concurrency_file,
                                max_concurrency):
        """Take one slot from the counter stored in ``concurrency_file``.

        The counter is first refreshed if due and created with
        ``max_concurrency`` when missing. Returns True and decrements the
        counter when a slot is available, False otherwise.
        """
        self.updateConcurrencyFile(concurrency_refresh_file,
                                   refresh_concurrency_interval,
                                   concurrency_file, max_concurrency)
        if os.path.exists(concurrency_file) is False:
            print(
                'concurrency file not exists and create an new one with max concurrency: {0}'
                .format(str(max_concurrency)))
            self.file.writeToTxtCover(concurrency_file, str(max_concurrency))
        concurrency_available = int(self.readFile(concurrency_file))
        print('concurrency file exists : {0}'.format(
            str(concurrency_available)))
        if concurrency_available > 0:
            print('app is able to run.')
            new_concurrency_available = concurrency_available - 1
            print('new concurrency is : {0}'.format(
                str(new_concurrency_available)))
            self.file.writeToTxtCover(concurrency_file,
                                      str(new_concurrency_available))
            return True
        print('app is not able to run for no available concurrency.')
        return False

    def recoveryConcurrency(self, concurrency_file, max_concurrency):
        """Give one slot back to the counter in ``concurrency_file``.

        A missing file, or a counter that is not below ``max_concurrency``,
        is reset to ``max_concurrency``.
        """
        if os.path.exists(concurrency_file) is False:
            print(
                'concurrency file not exists and create an new one with max concurrency: {0}'
                .format(str(max_concurrency)))
            self.file.writeToTxtCover(concurrency_file, str(max_concurrency))
            return
        concurrency_available = int(self.readFile(concurrency_file))
        print('concurrency file exists and start to recovery: {0}'.format(
            str(concurrency_available)))
        if concurrency_available < max_concurrency:
            print('start to recovery concurrenct.')
            new_concurrency_available = concurrency_available + 1
            print('new concurrency is : {0}'.format(
                str(new_concurrency_available)))
            self.file.writeToTxtCover(concurrency_file,
                                      str(new_concurrency_available))
        else:
            print('concurrency is not normal and write max concurrency to it.')
            self.file.writeToTxtCover(concurrency_file, str(max_concurrency))

    def updateConcurrencyFile(self, concurrency_refresh_file,
                              refresh_concurrency_interval, concurrency_file,
                              max_concurrency):
        """Reset the counter to ``max_concurrency`` when the refresh is due."""
        if self.isExceedRestartInterval(concurrency_refresh_file,
                                        refresh_concurrency_interval) is True:
            print('refresh concurrency file: {0}'.format(str(max_concurrency)))
            self.file.writeToTxtCover(concurrency_file, str(max_concurrency))

    # ------------------------------------------------------------------
    # DTO factories
    # ------------------------------------------------------------------
    def createCamelData(self, title, url, id, download_time, source):
        """Build a camelDto from the given fields."""
        return camelDto(title, url, id, download_time, source)

    def createCamelMongoJson(self, camelDto):
        """Convert a camelDto to the dict stored in MongoDB."""
        return {
            'title': camelDto.title,
            'url': camelDto.url,
            'id': camelDto.id,
            'download_time': camelDto.download_time,
            'source': camelDto.source
        }

    def createSpiderData(self, url, origin_url, public_time, author_name,
                         title, id, download_time, source, images,
                         is_open_cache, content):
        """Build a spiderDto from the given fields."""
        return spiderDto(url, origin_url, public_time, author_name, title, id,
                         download_time, source, images, is_open_cache, content)

    def createSpiderMongoJson(self, spiderDto):
        """Convert a spiderDto to the dict stored in MongoDB."""
        return {
            'url': spiderDto.url,
            # NOTE(review): this key ends with a comma, which looks like a
            # typo; left unchanged because stored documents may already use
            # it -- confirm before renaming.
            'origin_url,': spiderDto.origin_url,
            'public_time': spiderDto.public_time,
            'author_name': spiderDto.author_name,
            'title': spiderDto.title,
            'id': spiderDto.id,
            'download_time': spiderDto.download_time,
            'source': spiderDto.source,
            'images': spiderDto.images,
            'is_open_cache': spiderDto.is_open_cache
        }

    def updateImages(self, images, newImages):
        """Append the stripped, non-empty, unseen entries to ``images``."""
        for image in newImages:
            data = image.strip()
            if self.isEmpty(data) is False and data not in images:
                images.append(data)

    def completeImageUrls(self, newImages, current_url):
        """Resolve image urls against ``current_url``; keep the https ones."""
        result = []
        if len(newImages) == 0:
            print('No images urls to process')
            return result
        for url in newImages:
            entireUrl = urlparse.urljoin(current_url, url).strip()
            if re.match('https', entireUrl) is not None:
                result.append(entireUrl)
        return result

    # ------------------------------------------------------------------
    # Site configuration parsing
    # ------------------------------------------------------------------
    def getSitesInfo(self, isdebug=False):
        """Parse the sites file into a list of siteInfoDto.

        The debug file is used when ``isdebug`` is true. Sections are
        separated by '[SITE]'. Returns None when the file is empty.
        """
        site_info_path = self.sites_debug if isdebug else self.sites_info
        content = self.file.readFromTxt(site_info_path)
        if self.isEmpty(content):
            print('sites info is empty')
            return None
        return [
            self.extractSiteInfo(site) for site in content.split('[SITE]')
            if not self.isEmpty(site)
        ]

    def _parseFlag(self, value):
        """Interpret a config flag: '', 'false' and '0' are False.

        Any other text stays True, as it was with the former ``bool(value)``,
        which wrongly treated the text 'False' as True.
        """
        return value.strip().lower() not in ('', 'false', '0')

    def extractSiteInfo(self, siteInfo):
        """Parse one '[SITE]' section made of ``KEY==value`` lines.

        Scalar keys overwrite the field, list keys append each non-empty
        value (converted with extractRegxRule / extractHtmlTag where needed)
        and optional keys are set only for non-empty values. Timeouts are
        stored as the raw text.
        """
        keep = lambda text: text
        is_true = lambda text: text == 'True'
        scalar_fields = {
            'DOMAIN': ('domain', keep),
            'NAME': ('name', keep),
            'RESTARTINTERVAL': ('restart_interval', int),
            'URLPARALLELNUMBER': ('url_parallel_number', int),
            'CONTENTPARALLELNUMBER': ('content_parallel_number', int),
            'ISOPENCACHE': ('is_open_cache', self._parseFlag),
            'WORKTIMESTART': ('work_time_start', int),
            'WORKTIMEEND': ('work_time_end', int),
        }
        list_fields = {
            'GOODKEYS': ('good_keys', keep),
            'BADKEYS': ('bad_keys', keep),
            'URLMATCH': ('url_match', self.extractRegxRule),
            'URLTITLEMATCH': ('url_title_match', self.extractHtmlTag),
            'URLIDTAG': ('url_id_tag', self.extractHtmlTag),
            'CONTENTURLMATCH': ('content_url_match', self.extractRegxRule),
            'CONTENTIDTAG': ('content_id_tag', self.extractHtmlTag),
            'HREFITEMS': ('href_items', self.extractHtmlTag),
            'HREF': ('href', self.extractHtmlTag),
            'ARTICLEMATCH': ('article_match', self.extractHtmlTag),
            'CONTENTMATCH': ('content_match', self.extractHtmlTag),
            'CONTENTCHILDMATCH': ('content_child_match', self.extractHtmlTag),
            'CONTENTTITLEMATCH': ('content_title_match', self.extractHtmlTag),
            'CONTENTIMAGEMATCH': ('content_image_match', self.extractHtmlTag),
            'CONTENTTIMEMATCH': ('content_time_match', self.extractHtmlTag),
        }
        optional_fields = {
            'NEEDSELFIMAGE': ('need_self_image', is_true),
            'NEEDSELFHTML': ('need_self_html', is_true),
            'URLTIMEOUT': ('url_timeout', keep),
            'CONTENTTIMEOUT': ('content_timeout', keep),
        }
        result = siteInfoDto(domain=None,
                             name=None,
                             restart_interval=None,
                             url_parallel_number=None,
                             content_parallel_number=None,
                             is_open_cache=None,
                             work_time_start=None,
                             work_time_end=None,
                             good_keys=[],
                             bad_keys=[],
                             href_items=[],
                             href=[],
                             url_match=[],
                             url_title_match=[],
                             url_id_tag=[],
                             content_match=[],
                             content_child_match=[],
                             content_url_match=[],
                             content_id_tag=[],
                             article_match=[],
                             content_title_match=[],
                             content_image_match=[],
                             content_time_match=[],
                             need_self_image=None,
                             url_timeout=None,
                             content_timeout=None)
        for item in siteInfo.split('\n'):
            if self.isEmpty(item):
                continue
            # partition keeps everything after the first '==', so values that
            # themselves contain '==' (e.g. regex rules) are not truncated.
            key, separator, value = item.partition('==')
            if not separator:
                # Formerly an IndexError that aborted the whole section.
                print('Skip site info line without "==": {0}'.format(item))
                continue
            key = key.strip()
            value = value.strip()
            if key in scalar_fields:
                attribute, convert = scalar_fields[key]
                setattr(result, attribute, convert(value))
            elif key in list_fields:
                if self.isEmpty(value) is False:
                    attribute, convert = list_fields[key]
                    getattr(result, attribute).append(convert(value))
            elif key in optional_fields:
                if self.isEmpty(value) is False:
                    attribute, convert = optional_fields[key]
                    setattr(result, attribute, convert(value))
        return result

    def getUrlId(self, url, idTag):
        """Return ``url[position + offset]`` for the first usable tag.

        For each tag whose ``regx`` occurs in ``url`` the element at
        ``url.index(regx) + tag.index`` is returned, unless that index is out
        of range. Returns None when no tag yields a value.
        """
        # NOTE(review): the nesting of the final `continue` / `return` was
        # reconstructed from a flattened listing (first match wins) --
        # confirm against the project source.
        for item in idTag:
            if item.regx not in url:
                continue
            index = url.index(item.regx) + item.index
            if len(url) <= index:
                continue
            id = url[index]
            if id == None:
                continue
            return id

    def extractRegxRule(self, regxMatch):
        """Compile ``regxMatch`` into a regular expression object."""
        return re.compile(regxMatch)

    def extractHtmlTag(self, regxMatch):
        """Parse a 'tag|index' rule into a regxMatchDto."""
        items = regxMatch.split('|')
        return regxMatchDto(items[0].strip(), int(items[1]))

    def getMatchContent(self, content, regx):
        """Return ``content[regx.index]``.

        ``content`` is returned unchanged when the index is -1 or the
        content is empty.
        """
        if regx.index == -1 or len(content) == 0:
            return content
        return content[regx.index]

    def uploadFileApi(self, url, fileName, fullPath):
        """POST the file as multipart form data; returns success flag.

        Success means the JSON response carries ``code == 200``.
        """
        try:
            with open(fullPath, mode="r") as f:
                file = {"file": (fileName, f.read())}
            encode_data = encode_multipart_formdata(file)
            file_data = encode_data[0]
            headers_from_data = {"Content-Type": encode_data[1]}
            response = requests.post(url=url,
                                     headers=headers_from_data,
                                     data=file_data).json()
            if response['code'] != 200:
                print('Fail to upload file {0} through api {1}'.format(
                    fileName, url))
                return False
            print('Success to upload file {0} through api {1}'.format(
                fileName, url))
            return True
        except Exception as e:
            print('Exception to upload file {0} through api {1} : {2}'.format(
                fileName, url, e))
            return False

Exemple #2

0

Afficher le fichier

Fichier : updateMonitorFiles.py Projet : hulu7/news

class UpdateMonitorFiles():
    """Build the HTML monitor pages for the crawled sites and upload them.

    ``print`` is always called with one parenthesised argument, which behaves
    the same as the Python 2 print statement used by the rest of the project.
    """

    def __init__(self, siteinfo=None):
        """Store ``siteinfo`` and load the global and per-site settings."""
        self.siteinfo = siteinfo
        self.globalSettings = Settings()
        self.doraemon = Doraemon()
        self.getSettings()
        self.file = FileIOMiddleware()

    def getSettings(self):
        """Copy the paths and urls needed later from the settings objects."""
        self.settings = self.globalSettings.CreateSettings(self.siteinfo)
        self.work_path_prd4 = self.settings.WORK_PATH_PRD1
        self.work_path_prd3 = self.settings.WORK_PATH_PRD2
        self.content_backup_path = self.settings.FINISHED_BACKUP_PATH
        self.content_backup_post_path = self.settings.FINISHED_BACKUP_POST_PATH
        self.url_backup_path = self.settings.URL_BACKUP_PATH
        self.url_backup_post_path = self.settings.URL_BACKUP_POST_PATH
        self.monitor_site_template_path = self.globalSettings.MONITOR_SITE_TEMPLATE_PATH
        self.monitor_spiders_template_path = self.globalSettings.MONITOR_SPIDERS_TEMPLATE_PATH
        self.monitor_upload_local = self.globalSettings.MONITOR_UPLOAD_LOCAL
        self.monitor_site_webserver0 = self.globalSettings.MONITOR_SITE_HTML_WEBSERVER0
        self.monitor_site_url = self.globalSettings.MONITOR_SITE_URL
        self.monitor_upload_webserver0 = self.globalSettings.MONITOR_UPLOAD_PATH_WEBSERVER0

    def updateSpiders(self, siteName, ycount1, tcount1, turl1, diff1, ycount2,
                      tcount2, turl2, diff2):
        """Return one ``<tr>`` of the spiders overview table.

        The row holds the site name followed by count / linked count / diff
        for each of the two servers.
        """
        plain = '<td align="center" valign="middle">{0}</td>'
        linked = ('<td align="center" valign="middle">'
                  '<a href="{0}" target="_blank">{1}</a></td>')
        cells = [
            '<tr>',
            '<th align="center" valign="middle">{0}</th>'.format(siteName),
            plain.format(ycount1),
            linked.format(turl1, tcount1),
            plain.format(diff1),
            plain.format(ycount2),
            linked.format(turl2, tcount2),
            plain.format(diff2),
            '</tr>',
        ]
        return ''.join(cells)

    def updateSite(self, number, title, url):
        """Return one ``<tr>`` of a site page: row number and linked title."""
        return ''.join([
            '<tr>',
            '<td align="center" valign="middle">{0}</td>'.format(number),
            '<td align="center" valign="middle"><a href="{0}" target="_blank">{1}</a></td>'
            .format(url, title),
            '</tr>',
        ])

    def uploadFile(self, fromFile, toFile, retryLimit=60, retryInterval=1):
        """Upload ``fromFile`` to the web server over SSH, with retries.

        The upload is retried while the source file exists, at most
        ``retryLimit`` times, sleeping ``retryInterval`` seconds between
        attempts. The original loop retried forever without any pause when
        the upload kept failing.

        Returns True on success, False when an exception occurs or the retry
        limit is reached, and None when ``fromFile`` does not exist (kept
        from the original behaviour).
        """
        import time  # local import: the file's import section is not visible here

        attempt = 0
        while os.path.exists(fromFile):
            if attempt >= retryLimit:
                print('Upload retry limit reached for monitor file: {0}'.format(
                    fromFile))
                return False
            if attempt > 0:
                time.sleep(retryInterval)
            attempt += 1
            try:
                if self.doraemon.sshUpload(
                        self.globalSettings.IP_WEBSERVER0,
                        self.globalSettings.PORT_WEBSERVER0,
                        self.globalSettings.USER_ROOT_WEBSERVER0,
                        self.globalSettings.USER_ROOT_PASSWORD_WEBSERVER0,
                        fromFile, toFile):
                    print('Success to retry to upload monitor file: {0}'.format(
                        fromFile))
                    return True
            except Exception as e:
                print('Exception {0} to upload monitor site file: {1}'.format(
                    e, fromFile))
                return False

    def updateSingleSite(self, preBackupPath, postBackupPath, siteName):
        """Count backed-up urls and render the site page for one server.

        ``tcount`` / ``ycount`` are the row counts of the pre / post backup
        CSV files (0 when a file is missing) and ``diff`` is their
        difference. When the pre backup has rows, the site template is
        filled and written to the local upload directory, and ``turl`` is
        set to the page url. Returns the singleSiteDto.
        """
        singleSiteData = singleSiteDto(self.siteinfo.name, 0, 0, None, 0)
        preCsvContent = None
        if os.path.exists(preBackupPath):
            print("Start to read url back up file: {0}".format(
                self.settings.NAME))
            preCsvContent = self.file.readColsFromCSV(preBackupPath,
                                                      ['title', 'url'])
            singleSiteData.tcount = len(preCsvContent.values)
        else:
            print("Url back up file not exits: {0}".format(self.settings.NAME))
            singleSiteData.tcount = 0
        if os.path.exists(postBackupPath):
            print("Start to read post url back up file: {0}".format(
                self.settings.NAME))
            postCsvContent = self.file.readColsFromCSV(postBackupPath,
                                                       ['title', 'url'])
            singleSiteData.ycount = len(postCsvContent.values)
        else:
            print("Post url back up file not exits: {0}".format(
                self.settings.NAME))
            singleSiteData.ycount = 0
        singleSiteData.diff = singleSiteData.tcount - singleSiteData.ycount
        if preCsvContent is None:
            return singleSiteData
        if preCsvContent.empty:
            print("No new back up url: {0}".format(self.settings.NAME))
            return singleSiteData
        template = self.file.readFromTxt(self.monitor_site_template_path)
        # NOTE(review): item[1] is passed as title and item[0] as url; whether
        # that matches the column order returned by readColsFromCSV cannot be
        # told from here -- confirm.
        finalContent = ''.join(
            self.updateSite(number, item[1], item[0])
            for number, item in enumerate(preCsvContent.values, 1))
        template = template.replace('UpdateTime',
                                    self.doraemon.getCurrentLocalTime())
        template = template.replace('ServerName', siteName)
        template = template.replace('SiteName', self.siteinfo.name)
        template = template.replace('MainContent', finalContent)
        singleSiteData.turl = '{0}{1}_{2}.html'.format(
            self.monitor_site_url, self.settings.NAME, siteName)
        uploadLocalHtmlPath = '{0}/{1}_{2}.html'.format(
            self.monitor_upload_local, self.settings.NAME, siteName)
        self.file.writeToHtmlCover(uploadLocalHtmlPath, template)
        return singleSiteData

    def processAllSites(self, allSitesData=None):
        """Render the overview page, archive the upload directory, upload it.

        One table row is written per entry of ``allSitesData`` plus a
        'Summary' row with the totals. ``None`` is treated as "no sites".
        The local archive is removed afterwards when it exists.
        """
        if allSitesData is None:
            # The default used to fail with "NoneType is not iterable".
            allSitesData = []
        template = self.file.readFromTxt(self.monitor_spiders_template_path)
        rows = []
        t = totalDto(0, 0, 0, 0, 0, 0)
        for data in allSitesData:
            rows.append(
                self.updateSpiders(data.prd3.sitename, data.prd3.ycount,
                                   data.prd3.tcount, data.prd3.turl,
                                   data.prd3.diff, data.prd4.ycount,
                                   data.prd4.tcount, data.prd4.turl,
                                   data.prd4.diff))
            t.prd3ytotal += data.prd3.ycount
            t.prd3ttotal += data.prd3.tcount
            t.prd4ytotal += data.prd4.ycount
            t.prd4ttotal += data.prd4.tcount
        t.prd3difftotal = t.prd3ttotal - t.prd3ytotal
        t.prd4difftotal = t.prd4ttotal - t.prd4ytotal
        rows.append(
            self.updateSpiders('Summary', t.prd3ytotal, t.prd3ttotal, '',
                               t.prd3difftotal, t.prd4ytotal, t.prd4ttotal,
                               '', t.prd4difftotal))
        mainContent = ''.join(rows)
        template = template.replace('UpdateTime',
                                    self.doraemon.getCurrentLocalTime())
        template = template.replace('MainContent', mainContent)
        localHtmlPath = '{0}/index.html'.format(self.monitor_upload_local)
        self.file.writeToHtmlCover(localHtmlPath, template)
        self.doraemon.tar(self.monitor_upload_local)
        fromFile = '{0}.tar.gz'.format(self.monitor_upload_local)
        self.uploadFile(
            fromFile,
            '{0}/monitor.tar.gz'.format(self.monitor_upload_webserver0))
        # tar() only logs its errors, so the archive may not exist.
        if os.path.exists(fromFile):
            os.remove(fromFile)

    def processSingleSite(self):
        """Build the per-server data (prd3: url backups, prd4: content backups)."""
        spidersContent = allSitesDto(None, None)
        spidersContent.prd3 = self.updateSingleSite(self.url_backup_path,
                                                    self.url_backup_post_path,
                                                    'prd3')
        spidersContent.prd4 = self.updateSingleSite(
            self.content_backup_path, self.content_backup_post_path, 'prd4')
        return spidersContent