class FilePipeline(object):
    """Item pipeline that prepares a local directory for a product's files.

    For every item carrying a ``proCode`` the pipeline computes a dated,
    per-product directory path and remembers it on ``self.base_path``;
    :meth:`download` stores files below that directory.

    NOTE(review): the ``process_item(item, spider)`` signature and the use of
    ``spider.settings`` suggest a Scrapy item pipeline -- confirm.
    """

    # HTTP helper shared by all instances; used by download().
    bank_http_service = BankHttpService()

    def process_item(self, item, spider):
        """Compute the storage directory for *item* and pass the item on.

        Args:
            item: Mapping-like item; ``bankCode``, ``channel`` and
                ``proCode`` are read when ``proCode`` is present.
            spider: Object whose ``settings.get("SAVE_PATH")`` yields the
                root directory for saved files.

        Returns:
            The unmodified *item*.
        """
        # Local storage root: <SAVE_PATH>/<YYYYMMDD>. strftime() without an
        # explicit time tuple formats the current local time.
        dated_root = spider.settings.get("SAVE_PATH") + '/' + time.strftime(
            '%Y%m%d')
        if 'proCode' in item:
            # <root>/<bankCode>/<channel>/<proCode>/ -- the trailing slash
            # matters because download() appends the file name directly.
            self.base_path = (dated_root + "/" + item['bankCode'] + "/" +
                              item['channel'] + "/" + item['proCode'] + "/")
            # NOTE: the downloads below are intentionally left disabled.
            # Product instruction document, when a URL is present:
            # if 'instructionUrl' in item.keys() and item['instructionUrl']:
            #     self.download(item['instructionUrl'], item['bankCode'])
            # Risk disclosure document, when a URL is present:
            # if 'riskDisclosureUrl' in item.keys() and item['riskDisclosureUrl']:
            #     self.download(item['riskDisclosureUrl'], item['bankCode'])
        return item

    def download(self, downloadUrl, bucket_name):
        """Download *downloadUrl* into the current ``self.base_path``.

        ``self.base_path`` must have been set by :meth:`process_item`
        beforehand.

        Args:
            downloadUrl: URL of the file to fetch.
            bucket_name: Currently unused; kept for interface compatibility.

        Raises:
            OSError: If the target directory cannot be created.
        """
        # Create the target directory if needed. Attempting the creation and
        # tolerating an already-existing directory avoids the race inherent
        # in a separate "exists?" check followed by makedirs().
        try:
            os.makedirs(self.base_path)
        except OSError:
            if not os.path.isdir(self.base_path):
                raise
        # Support several URL shapes: the file name is whatever follows the
        # last '/' or '=' in the URL.
        file_name = re.split('/|=', downloadUrl)[-1]
        # Destination path of the downloaded file.
        file_path = self.base_path + file_name
        self.bank_http_service.downloadFile(downloadUrl, file_path)

    def close(self, reason):
        """Upload the result record for bank code ``'ceb'``.

        Args:
            reason: Not used by this method.
        """
        # NOTE(review): a fresh BankHttpService is created here instead of
        # reusing the class-level instance -- confirm whether that is
        # intentional.
        bank_http_service = BankHttpService()
        bank_http_service.uploadResult({'bankCode': 'ceb'})