def file_downloaded(self, response, request, info):
    """Persist the body under <folder>/<pin_id>.<pic_type> (fields taken
    from the response's item) and return its md5 checksum.

    Fix: md5sum was computed *after* persist_file, with no rewind — any
    store that reads the stream (rather than calling getvalue()) leaves
    the buffer at EOF, so the checksum was that of an empty stream. The
    checksum is now taken first and the buffer rewound before persisting,
    matching the canonical Scrapy ordering.
    """
    item = response.meta.get('item')
    path = item['folder'][0] + '/' + item['pin_id'][0] + '.' + item['pic_type'][0]
    buf = BytesIO(response.body)
    checksum = md5sum(buf)
    buf.seek(0)  # rewind so the store sees the whole body
    self.store.persist_file(path, buf, info)
    return checksum
def _transform_downloaded(self, response, request, info, *, item):
    """Persist a downloaded tile, track per-date tile arrival on the item,
    and once a date's tile set matches the expected checklist, stitch and
    reproject the completed set. Returns the md5 checksum of the body.

    NOTE(review): ``subset``, ``output_type`` and ``save_stitch_file`` are
    neither locals nor parameters — presumably module-level settings;
    confirm they are defined at import time.
    """
    date_code, origin_path = parse_save_url(request.url)
    buf = BytesIO(response.body)
    checksum = md5sum(buf)
    buf.seek(0)
    self.store.persist_file(origin_path, buf, info)
    # XML metadata files do not count toward tile completion.
    if not origin_path.endswith("xml"):
        # Re-derive the grouping key from the URL's directory component.
        date_code = os.path.dirname(urlparse(request.url).path)
        item.fields['date_tiles'].setdefault(date_code, set())
        item.fields['date_tiles'][date_code].add(origin_path)
        if item.fields['date_tiles'][date_code] == item['tile_chklist'][date_code]:
            # All expected tiles for this date have landed: stitch them.
            product = os.path.dirname(origin_path)
            tile_list = [os.path.join(settings.FILES_STORE, v) for v in item.fields['date_tiles'][date_code]]
            stitch_and_reproject(tile_list, ALL_DONE_FILE_OUTPUT_PATH, product=product, date=self.parse_output_date(request.url), reproject_options=item.reproject_options, subset=subset, outformat=output_type, save_stitch_file=yes_no_parser(save_stitch_file))
            # Drop the completed date so the tracking dict does not grow.
            item.fields['date_tiles'].pop(date_code)
    return checksum
def file_downloaded(self, response, request, info):
    """Persist the response body under its computed store path and
    return the body's md5 checksum."""
    target = self.file_path(request, response=response, info=info)
    body_buf = BytesIO(response.body)
    digest = md5sum(body_buf)
    body_buf.seek(0)
    self.store.persist_file(target, body_buf, info)
    return digest
def file_downloaded(self, response, request, info, unzip_path=None):
    """Persist the downloaded archive, unzip it, write metadata/report
    files beside each extracted entry, re-zip and upload to a MinIO
    bucket named after the file extension. Returns the md5 of the body.

    Fix: ``dict.update`` returns None, so the old
    ``integrated_data = metadata.update(json_response)`` always bound
    None and was never used — the merge is now done in place and the
    dead variable removed.
    """
    path = self.file_path(request, response=response, info=info)
    buf = BytesIO(response.body)
    checksum = md5sum(buf)
    buf.seek(0)
    self.store.persist_file(path, buf, info)
    file_path = zip_download_path + "/" + path
    key = path.split("/")[-1].split(".")[0]
    FileService.unzip_files(file_path, key)
    unzip_path = base_unzip_path + "/" + key
    for file in os.listdir(unzip_path):
        file_path = unzip_path + "/" + file
        json_response = {}  # MaliciousCheck.check_malicious(file_path)
        metadata = FileService.get_file_meta(path)
        metadata.update(json_response)  # merge scan results into metadata
        zip_path = unzip_path + "/"
        with open(zip_path + 'metadata.txt', 'w') as outfile:
            json.dump(metadata, outfile)
        with open(zip_path + 'report.txt', 'w') as outfile:
            json.dump(json_response, outfile)
        zipped_file = FileService.zip_files(file_path, key=key)
        # Bucket is keyed by the file's extension, per FileService metadata.
        bucket_name = metadata['extension']
        client = MinioClient.get_client()
        if not client.bucket_exists(bucket_name):
            client.create_bucket(bucket_name)
        client.upload_file(bucket_name, path.split("/")[-1], zipped_file)
    return checksum
def image_downloaded(self, response, request, info):
    """Persist every image under a sequential
    'full/dl/<dir>/<NNNNNNNNNN>.jpg' path; return md5 of the first."""
    print("*****pipeline****")
    first_checksum = None
    for _orig_path, img, img_buf in self.get_images(response, request, info):
        if first_checksum is None:
            img_buf.seek(0)
            first_checksum = md5sum(img_buf)
        w, h = img.size
        name = "{0:010}.jpg".format(self.count)
        directory = response.meta['image_directory_name']
        self.count += 1
        store_path = 'full/dl/{0}/{1}'.format(directory, name)
        self.store.persist_file(store_path, img_buf, info, meta={'width': w, 'height': h})
    return first_checksum
def image_downloaded(self, response, request, info, *, item=None):
    """Persist every image yielded by get_images, skipping individual
    failures; return the md5 checksum of the first image buffer.

    The manual while/next loop (instead of ``for``) lets OSError and
    unexpected exceptions raised *by the generator itself* be logged and
    skipped without abandoning the remaining images.
    """
    checksum = None
    image_stream = self.get_images(response, request, info, item=item)
    while True:
        try:
            path, image, buf = next(image_stream)
        except OSError:
            # One bad image should not stop the rest of the batch.
            logger.exception('Could not process image')
            continue
        except StopIteration:
            break
        except Exception:
            logger.exception('Stopped processing images')
            continue
        if checksum is None:
            buf.seek(0)
            checksum = md5sum(buf)
        width, height = image.size
        self.store.persist_file(path, buf, info, meta={'width': width, 'height': height}, headers={'Content-Type': 'image/jpeg'})
    return checksum
def image_downloaded(self, response, request, info, item=None):
    """Decode, validate and persist a downloaded image as PNG.

    Raises ImageException when Pillow cannot identify the body as an
    image, or when either dimension exceeds ``self.max_size_px``.
    Returns the md5 checksum of the converted image buffer.

    Fix: the "too large" message previously printed '<', contradicting
    the ``>`` comparison it reports on.
    """
    path = self.file_path(request, response=response, info=info)
    try:
        orig_image = Image.open(BytesIO(response.body))
    except UnidentifiedImageError:
        raise ImageException(f'Image cannot be identified ({request.url})')
    width, height = orig_image.size
    if width > self.max_size_px or height > self.max_size_px:
        raise ImageException(
            f'Image too large ({width}x{height} > {self.max_size_px}x{self.max_size_px})'
        )
    image, buffer = self.convert_image(orig_image)
    buffer.seek(0)
    checksum = md5sum(buffer)
    width, height = image.size
    self.store.persist_file(path, buffer, info, meta={'width': width, 'height': height}, headers={'Content-Type': 'image/png'})
    return checksum
def image_downloaded(self, response, request, info):
    """Store each downloaded image into an HBase table (instead of the
    default file store); return the md5 of the first image buffer.

    NOTE(review): the success/failure format strings pass an unused
    ``type=type`` keyword (the builtin) — presumably leftover from an
    earlier message template; confirm before cleaning up.
    """
    print("*\n" * 5, "正在下载图片")  # debug banner ("downloading image")
    checksum = None
    for path, image, buf in self.get_images(response, request, info):
        if checksum is None:
            buf.seek(0)
            checksum = md5sum(buf)
        time = self._get_time()
        try:
            # One pooled HBase connection per image; row key is the store
            # path, cell "cf:content" holds the raw image bytes.
            with self.pool.connection() as connection:
                table = connection.table(self.table_name)
                table.put(path, {"cf:content": buf.getvalue(), "cf:size": "880X600"})
                connection.close()
            print("successfully storing image into hbase,{time},{id}".format(type=type, time=time, id=path))
        except Exception as e:
            print("Caught Hbase exception of image storing:{e}".format(e=str(e)))
            print("failed storing image into hbase,{time},{id}".format(type=type, time=time, id=path))
    return checksum
def image_downloaded(self, response, request, info):
    """Persist only the first image, and only when both dimensions exceed
    600px; also writes a 200x200 JPEG thumbnail into a local
    ./data/images mirror. Returns the md5 checksum, or None when the
    first image is too small.

    NOTE(review): the function returns from inside the first loop
    iteration, so any further images from get_images are ignored —
    presumably single-image responses; confirm.
    """
    checksum = None
    for path, image, buf in self.get_images(response, request, info):
        if checksum is None:
            buf.seek(0)
            checksum = md5sum(buf)
        width, height = image.size
        if width > 600 and height > 600:
            self.store.persist_file(path, buf, info, meta={'width': width, 'height': height}, headers={'Content-Type': 'image/jpeg'})
            # Build the thumbnail path under ./data/images, mirroring the
            # store path with a "_thumb" suffix.
            thumb = image.copy()
            thumb.thumbnail((200, 200))
            p = path.replace('.jpg', '_thumb.jpg')
            p = os.path.join(os.path.abspath('./data/images'), p)
            p = os.path.abspath(p)
            p, filename = os.path.split(p)[0], os.path.split(p)[1]
            os.makedirs(p, exist_ok=True)
            thumb.save(os.path.join(p, filename), 'JPEG')
            return checksum
        else:
            return None
    return checksum  # no images yielded -> None
def image_downloaded(self, response, request, info):
    """Persist each image, logging success or failure; terminate the
    process when persistence fails. Returns the md5 of the first buffer.

    Fix: the bare ``except:`` also swallowed SystemExit and
    KeyboardInterrupt; it is narrowed to ``except Exception`` and the
    underlying error is now logged with its traceback before exiting.
    """
    checksum = None
    for path, image, buf in self.get_images(response, request, info):
        if checksum is None:
            buf.seek(0)
            checksum = md5sum(buf)
        width, height = image.size
        # They save the image Here!
        try:
            self.store.persist_file(path, buf, info, meta={'width': width, 'height': height}, headers={'Content-Type': 'image/jpeg'})
            logging.info('We store the following path: %s', path)
        except Exception:
            logging.critical('We did not store the following path: %s', path, exc_info=True)
            sys.exit()
        else:
            logging.info('Image_downloaded Checksum: %s', checksum)
    return checksum
def image_downloaded(self, response, request, info, *, item=None):  # function 4
    """Persist images together with their most-common-colour metadata.

    Unlike the stock ImagesPipeline, get_images here yields a 4-tuple
    including ``most_common`` (colour info), which is flattened via
    most_common_to_property and merged into the persisted meta dict.

    Returns ``(checksum, most_common)`` — callers must expect a tuple,
    not the usual bare checksum. NOTE(review): ``most_common`` after the
    loop is the *last* yielded value, and raises NameError when no
    images are yielded — presumably responses always carry one; confirm.
    """
    checksum = None
    for path, image, buf, most_common in self.get_images(response, request, info, item=item):
        if checksum is None:
            buf.seek(0)
            checksum = md5sum(buf)
        # init meta data with width and height
        width, height = image.size
        meta_dict = {'width': width, 'height': height}
        most_common_dict = self.most_common_to_property([most_common])[0]
        meta_dict.update(most_common_dict)
        # TODO add most common color info to meta information
        self.store.persist_file(path, buf, info, meta=meta_dict, headers={'Content-Type': 'image/jpeg'})
        # HERE ARE CUSTOM CHANGES
    return checksum, most_common
def image_downloaded(self, response, request, info):
    """Persist every image and return the md5 of the first buffer.

    Fix: the original unconditionally called ``first_buf.seek(0)`` after
    the loop, raising AttributeError when get_images yielded nothing;
    an empty result set now returns None.
    """
    first_buf = None
    for key, image, buf in self.get_images(response, request, info):
        self.store.persist_image(key, image, buf, info)
        if first_buf is None:
            first_buf = buf
    if first_buf is None:
        return None
    first_buf.seek(0)
    return md5sum(first_buf)
def image_downloaded(self, response, request, info):
    """Persist every image via the store; return the md5 checksum of the
    first image buffer (None when nothing was yielded)."""
    digest = None
    for store_key, img, stream in self.get_images(response, request, info):
        if digest is None:
            stream.seek(0)
            digest = md5sum(stream)
        self.store.persist_image(store_key, img, stream, info)
    return digest
def file_downloaded(self, response, request, info):
    """Store the body under <folder>/<pin_id>.<pic_type> (taken from the
    response's item) and return its md5 checksum.

    Fix: md5sum previously ran after persist_file with no rewind; a
    store that reads the stream leaves the buffer at EOF, producing the
    checksum of an empty stream. Checksum is now taken first, then the
    buffer is rewound before persisting (canonical Scrapy order).
    """
    item = response.meta.get('item')
    path = '{0}/{1}.{2}'.format(item['folder'][0], item['pin_id'][0], item['pic_type'][0])
    buf = BytesIO(response.body)
    checksum = md5sum(buf)
    buf.seek(0)
    self.store.persist_file(path, buf, info)
    return checksum
def file_downloaded(self, response, request, info):
    """Write the response body to the file store as binary and return
    its md5 checksum."""
    store_path = self.file_path(request, response=response, info=info)
    stream = BytesIO(response.body)
    digest = md5sum(stream)
    stream.seek(0)
    # Delegate the actual write to the configured store backend.
    self.store.persist_file(store_path, stream, info)
    return digest
def file_downloaded(self, response, request, info):
    """Persist the (optionally encrypted) response body and return the
    md5 checksum of what was written."""
    path = self.file_path(request, response=response, info=info)
    # Encrypt only when a cipher has been configured on the pipeline.
    payload = response.body if self.cipher is None else self.cipher.encrypt(response.body)
    stream = BytesIO(payload)
    digest = md5sum(stream)
    stream.seek(0)
    self.store.persist_file(path, stream, info)
    return digest
def file_downloaded(self, response, request, info, path):
    """Persist the response body under the caller-supplied store path
    and return the body's md5 checksum."""
    stream = BytesIO(response.body)
    digest = md5sum(stream)
    stream.seek(0)
    self.store.persist_file(path, stream, info)
    return digest
def persist_file(self, key, file_content, info, filename):
    """Write ``file_content`` to <basedir>/<key>/<filename> and return
    the md5 checksum of the bytes actually written to disk.

    Fix: the read handle no longer shadows the ``file_content``
    parameter, which made the function confusing to maintain.
    """
    self._mkdir(os.path.join(self.basedir, *key.split('/')), info)
    absolute_path = self._get_filesystem_path(key, filename)
    with open(absolute_path, "w") as wf:
        wf.write(file_content)
    # Re-read what was written so the checksum reflects on-disk bytes.
    with open(absolute_path, 'rb') as stored:
        checksum = md5sum(stored)
    return checksum
def persist_file(self, key, file_content, info, filename):
    """Write ``file_content`` under <basedir>/<key>/<filename>, then
    return the md5 checksum of the stored file as re-read from disk."""
    target_dir = os.path.join(self.basedir, *key.split('/'))
    self._mkdir(target_dir, info)
    absolute_path = self._get_filesystem_path(key, filename)
    with open(absolute_path, "w") as out:
        out.write(file_content)
    with open(absolute_path, 'rb') as stored:
        digest = md5sum(stored)
    return digest
def file_downloaded(self, response, request, info):
    """Persist the body, printing its size for debugging; return the md5
    checksum.

    Fixes: ``sys.getsizeof(response.body)`` measured the Python bytes
    *object* (payload plus header), not the payload — ``len()`` gives
    the true file size; the redundant second ``seek(0)`` is removed.
    """
    path = self.file_path(request, response=response, info=info)
    buf = BytesIO(response.body)
    checksum = md5sum(buf)
    buf.seek(0)
    file_size = len(response.body)  # exact payload size in bytes
    print(file_size, '************' * 30)
    self.store.persist_file(path, buf, info)
    return checksum
def stat_image(self, key, info):
    """Return {'last_modified', 'checksum'} for the stored image, or {}
    when the file cannot be stat'ed.

    Fix: the bare ``except:`` (flagged FIXME in the original) is
    narrowed to OSError — what os.path.getmtime raises for a missing or
    unreadable path.
    """
    absolute_path = self._get_filesystem_path(key)
    try:
        last_modified = os.path.getmtime(absolute_path)
    except OSError:
        return {}
    with open(absolute_path, "rb") as imagefile:
        checksum = md5sum(imagefile)
    return {"last_modified": last_modified, "checksum": checksum}
def file_downloaded(self, response, request, info):
    """Persist a PDF named from request.meta; record suspiciously small
    files (<100 bytes) in self.json for re-download bookkeeping.
    Returns the md5 checksum of the body.

    Fixes: the checksum was computed after persist_file with no rewind
    (stores that read the stream leave it at EOF); and
    ``os.path.getsize(path)`` was called on the store-*relative* path,
    which is not a real filesystem path — the payload length is used
    instead.
    """
    path = 'full/' + request.meta['filename'] + '.pdf'
    buf = BytesIO(response.body)
    checksum = md5sum(buf)
    buf.seek(0)
    self.store.persist_file(path, buf, info)
    size = len(response.body)
    if size < 100:
        # Tiny responses are almost certainly error pages, not PDFs.
        self.json['filename'][response.url] = request.meta['filename']
        self.json['file_urls'].append(response.url)
    return checksum
def image_downloaded(self, response, request, info):
    """Persist each image with width/height metadata and a JPEG content
    type; return the md5 of the first image buffer."""
    digest = None
    for store_path, img, stream in self.get_images(response, request, info):
        if digest is None:
            stream.seek(0)
            digest = md5sum(stream)
        w, h = img.size
        self.store.persist_file(store_path, stream, info, meta={"width": w, "height": h}, headers={"Content-Type": "image/jpeg"})
    return digest
def stat_file(self, path, info):
    """Return {'last_modified', 'checksum'} for a stored file, or {}
    when the file is missing or unreadable.

    Fix: ``os.error`` is a legacy alias of OSError kept only for
    backwards compatibility — catch OSError directly.
    """
    absolute_path = self._get_filesystem_path(path)
    try:
        last_modified = os.path.getmtime(absolute_path)
    except OSError:
        return {}
    with open(absolute_path, 'rb') as f:
        checksum = md5sum(f)
    return {'last_modified': last_modified, 'checksum': checksum}
def stat_file(self, path, info):
    """Return {'last_modified', 'checksum'} for a stored file, or {}
    when it cannot be stat'ed.

    Fix: resolves the original's "FIXME: catching everything!" — the
    bare ``except:`` is narrowed to OSError, which is what
    os.path.getmtime raises for a missing/unreadable path.
    """
    absolute_path = self._get_filesystem_path(path)
    try:
        last_modified = os.path.getmtime(absolute_path)
    except OSError:
        return {}
    with open(absolute_path, 'rb') as f:
        checksum = md5sum(f)
    return {'last_modified': last_modified, 'checksum': checksum}
def image_downloaded(self, response, request, info):
    """Persist each image and return a stats dict: checksum of the first
    buffer plus the image's width/height and its size on disk.

    NOTE(review): unlike the stock pipeline this returns a dict rather
    than a bare checksum — downstream code must expect that shape. If
    get_images yields nothing, the trailing statements raise NameError
    on ``image``/``image_size`` — presumably every response carries at
    least one image; confirm.
    """
    checksum = None
    for key, image, buf in self.get_images(response, request, info):
        if checksum is None:
            buf.seek(0)
            checksum = md5sum(buf)
        self.store.persist_image(key, image, buf, info)
        # Resolve the store key to a real path to measure the written file.
        abs_path = self.store._get_filesystem_path(key)
        image_size = os.path.getsize(abs_path)
    width, height = image.size
    return {"checksum": checksum, "width": width, "height": height, "size": image_size}
def file_downloaded(self, response, request, info):
    """Persist the downloaded audio under "<title>.mp3" (title from
    request meta) instead of the hash-based store path; return the md5
    checksum.

    NOTE(review): Python-2 era code — under Python 3,
    ``"{0}.mp3".format(title.encode('utf-8').strip())`` formats a bytes
    object and yields "b'...'.mp3"; md5sum also runs *after*
    persist_file, so if the store consumes the buffer the checksum is
    that of an empty stream. Both worth confirming before reuse.
    """
    # print "2-HELLLLLLOOOOOOOO"
    path = self.file_path(request, response=response, info=info)
    buf = BytesIO(response.body)
    # print response.meta['title']
    # print "{0}.mp3".format()
    fname = "{0}.mp3".format(response.meta['title'].encode('utf-8').strip())
    self.store.persist_file(fname, buf, info)
    # self.store.persist_file(path, buf, info)
    checksum = md5sum(buf)
    return checksum
def stat_file(self, key, info):
    """Return {'last_modified', 'checksum'} for the file stored under
    ``key``, or {} when it cannot be stat'ed.

    Fix: the bare ``except:`` (self-flagged FIXME) is narrowed to
    OSError, the exception os.path.getmtime actually raises.
    """
    absolute_path = self._get_filesystem_path(key)
    try:
        last_modified = os.path.getmtime(absolute_path)
    except OSError:
        return {}
    with open(absolute_path, 'rb') as f:
        checksum = md5sum(f)
    return {'last_modified': last_modified, 'checksum': checksum}
def image_downloaded(self, response, request, info):
    """Track (but do not persist) the largest image seen so far across
    all downloads via the module-level ``biggestItem`` global; return
    the md5 of the first image buffer in this response.

    NOTE(review): nothing is written to the store here — presumably a
    later stage persists ``biggestItem``; confirm. The shared global
    makes this non-reentrant under concurrent crawls.
    """
    global biggestItem
    checksum = None
    for path, image, buf in self.get_images(response, request, info):
        if checksum is None:
            buf.seek(0)
            checksum = md5sum(buf)
        width, height = image.size
        # Keep whichever image has the larger pixel area.
        if biggestItem['width']*biggestItem['height'] < width*height:
            biggestItem = {'width': width, 'height': height, 'path': path, 'buf': buf, 'info': info}
    return checksum
def image_downloaded(self, response, request, info):
    """Persist every image with size metadata; return the md5 of the
    first buffer (None if get_images yields nothing)."""
    first_digest = None
    for img_path, img, img_buf in self.get_images(response, request, info):
        if first_digest is None:
            img_buf.seek(0)
            first_digest = md5sum(img_buf)
        w, h = img.size
        self.store.persist_file(img_path, img_buf, info, meta={'width': w, 'height': h}, headers={'Content-Type': 'image/jpeg'})
    return first_digest
def image_downloaded(self, response, request, info):
    """Persist each image under its store key with width/height metadata
    and a JPEG content type; return the md5 of the first buffer."""
    digest = None
    for store_key, picture, raw in self.get_images(response, request, info):
        if digest is None:
            raw.seek(0)
            digest = md5sum(raw)
        w, h = picture.size
        self.store.persist_file(store_key, raw, info, meta={'width': w, 'height': h}, headers={'Content-Type': 'image/jpeg'})
    return digest
def image_downloaded(self, response, request, info):
    """Persist images; GIFs destined for 'full/...' are written from the
    raw response body via persist_gif so animation frames survive
    re-encoding. Returns the md5 of the first image buffer."""
    digest = None
    for store_path, img, stream in self.get_images(response, request, info):
        if digest is None:
            stream.seek(0)
            digest = md5sum(stream)
        w, h = img.size
        if store_path.startswith('full') and self.check_gif(img):
            # Save gif from response directly (no re-encode).
            self.persist_gif(store_path, response.body, info)
        else:
            self.store.persist_file(store_path, stream, info, meta={'width': w, 'height': h}, headers={'Content-Type': 'image/jpeg'})
    return digest
def image_downloaded(self, response, request, info):
    """Persist each image under 'full/<basename-of-request-url>' with
    size metadata; return the md5 of the first image buffer.

    Fixes: the meta key was misspelled 'heigth', making the stored
    height unreadable under the conventional key; and the private
    ``request._url`` attribute is replaced with the public
    ``request.url`` (same value, stable API).
    """
    checksum = None
    for path, image, buf in self.get_images(response, request, info):
        if checksum is None:
            buf.seek(0)
            checksum = md5sum(buf)
        width, height = image.size
        # Name the stored file after the last URL path segment.
        filename = request.url.rsplit("/", 1)[1]
        path = 'full/{}'.format(filename)
        self.store.persist_file(path, buf, info, meta={'width': width, 'height': height})
    return checksum
def stat_file(self, path, info):
    """Return the stored file's last-modified time and md5 checksum as
    {'last_modified', 'checksum'}; return {} when the file is absent.

    Fix: the bare ``except:`` (self-flagged FIXME) is narrowed to
    OSError, which is what os.path.getmtime raises.
    """
    absolute_path = self._get_filesystem_path(path)
    try:
        last_modified = os.path.getmtime(absolute_path)  # mtime of the stored file
    except OSError:
        return {}
    with open(absolute_path, 'rb') as f:
        checksum = md5sum(f)  # md5 of a file-like object
    return {'last_modified': last_modified, 'checksum': checksum}
def image_downloaded(self, response, request, info):
    """Persist each image under 'full/<image_name from response meta>',
    overriding the default hash-based path; return the md5 of the first
    image buffer."""
    digest = None
    for _default_path, img, stream in self.get_images(response, request, info):
        if digest is None:
            stream.seek(0)
            digest = md5sum(stream)
        w, h = img.size
        # Store under the crawler-chosen name rather than the hash path.
        target = 'full/%s' % response.meta['image_name']
        self.store.persist_file(target, stream, info, meta={'width': w, 'height': h}, headers={'Content-Type': 'image/jpeg'})
    return digest
def file_downloaded(self, response, request, info):
    """Persist the file only when its payload exceeds 4 MiB; smaller
    downloads are skipped. Returns the md5 checksum when stored,
    otherwise None.

    Fixes: ``sys.getsizeof`` measured the bytes *object* (payload plus
    object header) rather than the payload — ``len()`` is exact; the
    implicit fall-through (``pass``) is now an explicit ``return None``.
    """
    path = self.file_path(request, response=response, info=info)
    buf = BytesIO(response.body)
    checksum = md5sum(buf)
    file_size = len(response.body)
    print(file_size, '************' * 30)
    buf.seek(0)
    # Skip anything at or below 4 MiB (4194304 bytes).
    if file_size > 4194304:
        self.store.persist_file(path, buf, info)
        return checksum
    return None
def file_downloaded(self, response, request, info):
    """Extract the base64-encoded RTF payload from the downloaded XML
    wrapper, persist the decoded bytes, and return their md5 checksum.

    The XML stores the actual RTF file as a base64 string under
    sessao/discursoRTFBase64; we decode it and persist the result
    instead of ``response.body``.

    Fix: md5sum previously ran after persist_file with no rewind — a
    store that reads the stream leaves it at EOF, yielding the checksum
    of an empty stream. Checksum first, rewind, then persist.
    """
    data = xmltodict.parse(response.body)
    data = data.get('sessao').get('discursoRTFBase64')
    data = base64.b64decode(data)
    path = self.file_path(request, response=response, info=info)
    buf = BytesIO(data)
    checksum = md5sum(buf)
    buf.seek(0)
    self.store.persist_file(path, buf, info)
    return checksum
def image_downloaded(self, response, request, info):
    """Save each image into a local images/<folder>/<filename> tree
    derived from the response URL, skipping files that already exist;
    return the md5 of the first image buffer."""
    digest = None
    for _key, img, stream in self.get_images(response, request, info):
        if digest is None:
            stream.seek(0)
            digest = md5sum(stream)
        folder, filename = self.get_path_from_url(response.url)
        # to do, move the folder under screenshot containers.
        target_dir = os.path.join("images", folder)
        if not os.path.exists(target_dir):
            os.makedirs(target_dir)
        target = os.path.join(target_dir, filename)
        if not os.path.exists(target):
            img.save(target)
    return digest
def file_downloaded(self, response, request, info):
    """Accumulate transformed response content across multiple requests
    and write a single concatenated file once all expected parts
    (request.meta['file_urls_len']) have arrived.

    Returns the md5 checksum of the combined payload on the final call,
    and None for every earlier (buffering) call.

    NOTE(review): ``self.loop`` is shared mutable state on the pipeline —
    concurrent batches would interleave; the store path used is that of
    the *last* response, presumably identical across the batch; confirm.
    """
    path = self.file_path(request, response=response, info=info)
    content = self.modify_response(response)
    self.loop.append(content)
    if len(self.loop) == request.meta['file_urls_len']:
        # All parts received: join with newlines and persist once.
        cont = ''
        for item in self.loop:
            cont += item + '\n'
        buf = BytesIO(cont.encode())
        checksum = md5sum(buf)
        buf.seek(0)
        self.store.persist_file(path, buf, info)
        self.loop = []  # reset the accumulator for the next batch
        return checksum
    return None
def image_downloaded(self, response, request, info):
    """Write downloaded images into a local 'images/<folder>' directory
    keyed off the response URL; existing files are left untouched.
    Returns the md5 checksum of the first image buffer."""
    checksum = None
    for _store_key, picture, raw in self.get_images(response, request, info):
        if checksum is None:
            raw.seek(0)
            checksum = md5sum(raw)
        subdir, name = self.get_path_from_url(response.url)
        #to do, move the folder under screenshot containers.
        dest_dir = os.path.join('images', subdir)
        if not os.path.exists(dest_dir):
            os.makedirs(dest_dir)
        dest = os.path.join(dest_dir, name)
        if not os.path.exists(dest):
            picture.save(dest)
    return checksum
def image_downloaded(self, response, request, info):
    """Consume get_images — which here yields (width, height, url sha-2,
    perceptual-hash string, buffer) rather than store paths — and return
    those values plus the md5 of the first buffer.

    Persisting to the store is deliberately disabled (commented out) —
    presumably the caller records the hashes elsewhere; confirm.
    NOTE(review): the returned tuple uses the *last* yielded values and
    raises NameError when no images are yielded.
    """
    checksum = None
    for width, height, url_sha2, phash_str, buf in self.get_images(response, request, info):
        if checksum is None:
            buf.seek(0)
            checksum = md5sum(buf)
        # self.store.persist_file(
        #     path, buf, info,
        #     meta={'width': width, 'height': height},
        #     headers={'Content-Type': 'image/jpeg'})
        # hashString="".join([1 if x else 0 for x in hash])
    return width, height, url_sha2, phash_str, checksum
def write_item(self, item):
    """Render the item as a standalone HTML file in the store, record
    inline URL/meta entries on the item, and feed a synthetic successful
    download result through item_completed. Returns the updated item.

    Fix: ``hashlib.sha1()`` requires bytes — hashing the str URL raises
    TypeError on Python 3; the URL is now UTF-8 encoded first.
    """
    title = item.get('title', 'Untitled')
    header = """<html lang="en">
<head>
<meta charset="utf-8" />
<title>%s</title>
</head>
<body>
""" % title
    body = self.make_body(item, title)
    closer = """
</body>
</html>
"""
    url = item['location']
    # sha1 needs bytes, not str (Python 3).
    media_guid = hashlib.sha1(url.encode('utf-8')).hexdigest()
    media_ext = '.html'
    path = 'full/%s%s' % (media_guid, media_ext)
    absolute_path = os.path.join(self.store.basedir, path)
    with codecs.open(absolute_path, 'wb', 'utf-8') as f:
        f.write(header)
        f.write(body)
        f.write(closer)
    item['inline_urls'] = [urljoin('file://', pathname2url(absolute_path))]
    item['inline_metas'] = [{'link_url': item['request_url'], 'location': item['location'], 'title': title, 'content_type': 'text/html'}]
    with open(absolute_path, 'rb') as f:
        checksum = md5sum(f)
    # Shape the result like a Twisted Deferred list for item_completed.
    results = [(True, {'url': url, 'path': path, 'checksum': checksum})]
    item = self.item_completed(results, item, self.spiderinfo)
    return item
def fileparse(self, fname):
    """fileparse — collect basic info about a file on disk.

    Returns a dict shaped like:
        {
            "file_name": "8845970f...e.apk",
            "file_md5": "3bb744c851097281aa64180a8c9a1c3b",
            "file_size": 946163,
            "file_path": "/tmp",
        }

    Fix: the file is now opened in binary mode ('rb') — hashing via a
    text-mode handle decodes bytes and fails (or corrupts the digest)
    on binary files such as APKs.
    """
    info = {}
    info['file_path'] = os.path.dirname(fname)
    info['file_name'] = os.path.basename(fname)
    info['file_size'] = os.path.getsize(fname)
    with open(fname, 'rb') as fh:
        info['file_md5'] = md5sum(fh)
    return info
def file_downloaded(self, response, request, info):
    """Parse a downloaded market-report file: extract its text, log the
    parsed index rows and "Regular Market" rows, persist the raw buffer
    and return its md5 checksum.

    NOTE(review): Python-2 era code — ``StringIO(response.body)``
    requires a str body, and the legacy scrapy ``log.msg`` API is used.
    ``text_extract`` likely consumes ``buf``; with no seek(0) afterward,
    persist_file and md5sum may operate on an exhausted stream — confirm
    against the store implementation.
    """
    path = self.file_path(request, response=response, info=info)
    buf = StringIO(response.body)
    txt = self.text_extract(buf)
    lns = [ln.strip() for ln in txt.splitlines()]
    # Line 2 of the report carries the trading date.
    dd = parser.parse(lns[1]).strftime('%Y-%m-%d')
    data = self.get_index_data(dd, lns)
    for d in data:
        log.msg("|".join(d), level=log.INFO)
    reg_mkt = self.market_data(dd, lns, "Regular Market")
    for r in reg_mkt:
        for el in r:
            log.msg(str(el), level=log.INFO)
    self.store.persist_file(path, buf, info)
    checksum = md5sum(buf)
    return checksum
def stat_file(self, key, info):
    """
    the stat is the file key dir, the last_modified is the file that saved
    to the file key dir.

    Returns {'last_modified', 'checksum'} for the single file stored
    under the key directory, or {} when the entry is missing/corrupt.
    """
    keydir = os.path.join(self.basedir, *key.split('/'))
    filenames = os.listdir(keydir)
    if len(filenames) != 1:
        # Zero or multiple files means a corrupt cache entry: remove the
        # whole key directory (ignore_errors=True) and report "missing"
        # so the file gets re-downloaded.
        shutil.rmtree(keydir, True)
        return {}
    else:
        filename = list_first_item(filenames)
        absolute_path = self._get_filesystem_path(key)
        try:
            last_modified = os.path.getmtime(absolute_path)
        except:  # FIXME: catching everything!
            return {}
        with open(os.path.join(absolute_path, filename), 'rb') as file_content:
            checksum = md5sum(file_content)
        return {'last_modified': last_modified, 'checksum': checksum}
def stat_file(self, path, info):
    """Report the checksum of the in-memory image buffer cached in
    ``self.__hub`` for this path; last_modified is not tracked (empty
    string).

    NOTE(review): md5sum reads from the buffer's *current* position and
    no seek(0) is done here — if the buffer was consumed earlier this
    hashes an empty stream; confirm how __hub entries are stored.
    """
    image_buf = self.__hub[path][0]
    checksum = md5sum(image_buf)
    return {'last_modified': "", 'checksum': checksum}
def file_downloaded(self, response, request, info):
    """Obtain the (path, buffer) pair for the response via get_image,
    persist the buffer, and return its md5 checksum.

    Fix: the buffer was not rewound between md5sum (which reads to EOF)
    and persist_file — a store that reads the stream would write an
    empty file. A second seek(0) restores the canonical order.
    """
    path, image_buf = self.get_image(response, request, info)
    image_buf.seek(0)
    checksum = md5sum(image_buf)
    image_buf.seek(0)  # rewind: md5sum left the stream at EOF
    self.store.persist_file(path, image_buf, info)
    return checksum
def file_downloaded(self, response, request, info):
    """Persist the body under the key derived from the request URL and
    return its md5 checksum.

    Fix: the checksum is now taken *before* persisting, with a rewind in
    between — under the old order, a store that reads the stream (rather
    than calling getvalue()) left the buffer at EOF, so md5sum hashed an
    empty stream.
    """
    key = self.file_key(request.url)
    buf = StringIO(response.body)
    checksum = md5sum(buf)
    buf.seek(0)
    self.store.persist_file(key, buf, info)
    return checksum