def check_file(self, data, uri): mime_type = magic.from_buffer(data, mime=True) if mime_type == 'application/gzip': #peek inside compressed content data = zlib.decompress(data, 16+zlib.MAX_WBITS) mime_type = magic.from_buffer(data, mime=True) if mime_type in self.mime_types: sha256 = hashlib.sha256(data).hexdigest() print "Checking %s from %s" %(sha256, uri) result = self.get_vt_result(sha256) if result['response_code'] == 1 and result['positives'] >= self.vt_threshold: #in VT and we care about it return False if result['response_code'] == 0: #not in VT files = {"file": (sha256, data), "apikey" : self.vt_apikey} requests.post("https://www.virustotal.com/vtapi/v2/file/scan", files=files) max_count = 6 # We'll wait 3 minutes and then fail open count = 0 while count < max_count: result = self.get_vt_result(sha256) if result['response_code'] == 1 and result['positives'] >= self.vt_threshold: #in VT and we care about it return False if result['response_code'] == 1 and result['positives'] < self.vt_threshold: #in VT and we don't care about it return True time.sleep(30) count += 1 return True
def main(): try: opts, args = getopt.getopt(sys.argv[1:], "b:f:hm", ["bytes=", "file=", "help", "mime"]) except getopt.GetoptError as err: print str(err) bytes = 10 file = "" mime = False for o, a in opts: if o in ("-f", "--file"): file = a elif o in ("-m", "--mime"): mime = True elif o in ("-b", "--bytes"): bytes = int(a) elif o in ("-h", "--help"): printUsage() sys.exit(0) if len(file) > 0: print magic.from_buffer(open(file).read(bytes), mime) else: print "Please specify a file"
def _generate_file_metadata(self, data):
    """Populate filetype, mimetype, size, and hash fields from raw file bytes.

    Each probe is best-effort: failures store a sentinel value instead of
    propagating.
    """
    import pydeep
    import magic
    from hashlib import md5, sha1, sha256
    try:
        self.filetype = magic.from_buffer(data)
    except Exception:  # was a bare except; narrowed so KeyboardInterrupt escapes
        self.filetype = "Unavailable"
    try:
        mimetype = magic.from_buffer(data, mime=True)
        if mimetype:
            # strip any "; charset=..." suffix libmagic may append
            self.mimetype = mimetype.split(";")[0]
        else:
            self.mimetype = "unknown"
    except Exception:
        self.mimetype = "Unavailable"
    self.size = len(data)
    # this is a shard key. you can't modify it once it's set.
    # MongoEngine will still mark the field as modified even if you set it
    # to the same value.
    if not self.md5:
        self.md5 = md5(data).hexdigest()
    self.sha1 = sha1(data).hexdigest()
    self.sha256 = sha256(data).hexdigest()
    try:
        self.ssdeep = pydeep.hash_bytes(data)
    except Exception:
        self.ssdeep = None
def test_old_from_buffer(self):
    """Check from_buffer() output, textual and MIME, against every known fixture."""
    for name, expected in TEST_FILES.items():
        path = os.path.join(TEST_DATA_DIR, name)
        with open(path, 'rb') as fh:
            head = fh.read(1024)
        # expected[2] is the textual description, expected[0] the MIME type
        self.assertMatches(magic.from_buffer(head, mime=False), expected[2])
        self.assertMatches(magic.from_buffer(head, mime=True), expected[0])
def get_byte_mime(bytes):
    """
    Shortcut to get a mime from bytes in a variable.

    :param bytes: raw file content (bytes-like)
    :return: decoded MIME type string
    """
    # Bug fix: the result was computed but never returned, so callers
    # always received None.
    return magic.from_buffer(bytes, mime=True).decode("utf-8")
def __init__(self, p_path):
    """Open Redis/ARDB connections from the AIL config file and derive paste
    metadata (name, size, mime type, date, source) from its path.

    Raises Exception when the configuration file cannot be located.
    """
    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        raise Exception('Unable to find the configuration file. \
                        Did you set environment variables? \
                        Or activate the virtualenv.')
    cfg = configparser.ConfigParser()
    cfg.read(configfile)
    self.cache = redis.StrictRedis(
        host=cfg.get("Redis_Queues", "host"),
        port=cfg.getint("Redis_Queues", "port"),
        db=cfg.getint("Redis_Queues", "db"),
        decode_responses=True)
    self.store = redis.StrictRedis(
        host=cfg.get("Redis_Data_Merging", "host"),
        port=cfg.getint("Redis_Data_Merging", "port"),
        db=cfg.getint("Redis_Data_Merging", "db"),
        decode_responses=True)
    self.store_metadata = redis.StrictRedis(
        host=cfg.get("ARDB_Metadata", "host"),
        port=cfg.getint("ARDB_Metadata", "port"),
        db=cfg.getint("ARDB_Metadata", "db"),
        decode_responses=True)
    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes"))
    if PASTES_FOLDER not in p_path:
        self.p_rel_path = p_path
        p_path = os.path.join(PASTES_FOLDER, p_path)
    else:
        self.p_rel_path = None
    self.p_path = p_path
    self.p_name = os.path.basename(self.p_path)
    self.p_size = round(os.path.getsize(self.p_path) / 1024.0, 2)
    # Bug fix: removed the dead `magic.from_buffer("test", mime=True)` call
    # whose result was immediately overwritten by the line below.
    self.p_mime = magic.from_buffer(self.get_p_content(), mime=True)
    # Assuming that the paste will alway be in a day folder which is itself
    # in a month folder which is itself in a year folder.
    # /year/month/day/paste.gz
    var = self.p_path.split('/')
    self.p_date = Date(var[-4], var[-3], var[-2])
    # NOTE(review): this unconditionally overwrites the p_rel_path set above
    self.p_rel_path = os.path.join(var[-4], var[-3], var[-2], self.p_name)
    self.p_source = var[-5]
    self.supposed_url = 'https://{}/{}'.format(self.p_source.replace('_pro', ''), var[-1].split('.gz')[0])
    self.p_encoding = None
    self.p_hash_kind = {}
    self.p_hash = {}
    self.p_langage = None
    self.p_nb_lines = None
    self.p_max_length_line = None
    self.array_line_above_threshold = None
    self.p_duplicate = None
    self.p_tags = None
def test_rethrow(self):
    """A MagicException raised by magic_buffer must propagate out of from_buffer."""
    saved = magic.magic_buffer
    try:
        def fake_buffer(cookie, buf):
            raise magic.MagicException("passthrough")
        magic.magic_buffer = fake_buffer
        with self.assertRaises(magic.MagicException):
            magic.from_buffer("hello", True)
    finally:
        # always restore the real implementation
        magic.magic_buffer = saved
def mimetype(file):
    """Read MIME type from file"""
    # Fast path: treat the argument as a path usable by from_file().
    try:
        return from_file(file, mime=True)
    except (OSError, TypeError, ValueError):
        pass
    # Fallback: the argument may be a file-like object...
    try:
        data = file.read()
    except AttributeError:
        # ...or already raw bytes.
        return from_buffer(file, mime=True)
    return from_buffer(data, mime=True)
def test_format(self):
    """Converting one source image to jpg then gif must yield matching MIME types."""
    client = self.get_muto_client()
    client.from_url(self.TEST_IMAGES[0])
    for fmt, expected_mime in (('jpg', 'image/jpeg'), ('gif', 'image/gif')):
        client.format = fmt
        processed = client.process()
        self.assertEqual(magic.from_buffer(processed.read(), mime=True), expected_mime)
def guess_mimetype(blob):
    """
    uses file magic to determine the mime-type of the given data blob.

    :param blob: file content as read by file.read()
    :type blob: data
    :returns: mime-type, falls back to 'application/octet-stream'
    :rtype: str
    """
    # Two incompatible python bindings for libmagic exist: the bindings
    # shipped with the file source (exposing magic.open, packaged as
    # python-magic on Debian/Ubuntu) and the pypi python-magic package from
    # https://github.com/ahupp/python-magic (exposing from_buffer).
    # See https://github.com/pazz/alot/pull/588 for details.
    if hasattr(magic, 'open'):
        ms = magic.open(magic.MAGIC_MIME_TYPE)
        ms.load()
        reported = ms.buffer(blob)
    elif hasattr(magic, 'from_buffer'):
        reported = magic.from_buffer(blob, mime=True)
    else:
        raise Exception('Unknown magic API')
    # libmagic does not always return proper mimetype strings, cf. issue #459
    if re.match(r'\w+\/\w+', reported):
        return reported
    return 'application/octet-stream'
def check_rsrc(self):
    """
    Function needed to determine the compilation language

    Walks the PE resource directory and returns a dict mapping a running
    index to (name, rva, size, filetype, lang, sublang) tuples; returns
    False when resource parsing fails entirely.
    """
    try:
        ret = {}
        if hasattr(self.pe, 'DIRECTORY_ENTRY_RESOURCE'):
            # The libmagic description is computed from the file header and is
            # identical for every resource, so read the file once up front
            # (previously re-opened, and leaked, on every inner iteration).
            with open(self.filename, 'rb') as fh:
                filetype = magic.from_buffer(fh.read(1024))
            i = 0
            for resource_type in self.pe.DIRECTORY_ENTRY_RESOURCE.entries:
                if resource_type.name is not None:
                    name = "%s" % resource_type.name
                else:
                    type_id = pefile.RESOURCE_TYPE.get(resource_type.struct.Id)
                    if type_id is None:
                        # Bug fix: the old `if name == None` check compared the
                        # *string* "None" against None and never fired, so
                        # unknown resource types were labelled "None" instead
                        # of their numeric id.
                        name = "%d" % resource_type.struct.Id
                    else:
                        name = "%s" % type_id
                if hasattr(resource_type, 'directory'):
                    for resource_id in resource_type.directory.entries:
                        if hasattr(resource_id, 'directory'):
                            for resource_lang in resource_id.directory.entries:
                                try:
                                    # get_data() can raise PEFormatError for
                                    # malformed entries; the data itself is
                                    # only fetched for that validation.
                                    data = self.pe.get_data(resource_lang.data.struct.OffsetToData, resource_lang.data.struct.Size)
                                    lang = pefile.LANG.get(resource_lang.data.lang, 'qq_*unknown*')
                                    sublang = pefile.get_sublang_name_for_lang(
                                        resource_lang.data.lang, resource_lang.data.sublang
                                    )
                                    ret[i] = (name, resource_lang.data.struct.OffsetToData,
                                              resource_lang.data.struct.Size, filetype, lang, sublang)
                                    i += 1
                                except pefile.PEFormatError:
                                    pass
    except Exception:  # was a bare except
        ret = False
    finally:
        # NOTE(review): returning from finally swallows any in-flight
        # exception — kept for backward compatibility with callers.
        return ret
def compute_hashes(self):
    """
    Compute the file hashes (sha256/sha1/md5/ssdeep), MIME type and entropy
    for the sample on disk, filling only fields that are still None.
    Returns False when the file is missing or unreadable.
    """
    filename = self.get_file_path(self.sha256)
    # Make sure the file exists and is readable
    if not os.access(filename, os.R_OK):
        flash('There was an error while trying to analyse the file.', 'danger')
        return False
    with open(filename, 'rb') as f:
        buf = f.read()
    if self.sha256 is None:
        self.sha256 = hashlib.sha256(buf).hexdigest()
    if self.sha1 is None:
        self.sha1 = hashlib.sha1(buf).hexdigest()
    if self.md5 is None:
        self.md5 = hashlib.md5(buf).hexdigest()
    if self.ssdeep is None:
        self.ssdeep = ssdeep.hash(buf)
    if self.mime is None:
        try:
            self.mime = magic.from_buffer(buf, mime=True).decode('utf-8')
        except Exception:
            # was a bare except; kept best-effort (mime stays None) but
            # narrowed so SystemExit/KeyboardInterrupt propagate
            self.mime = None
    if self.entropy is None:
        self.entropy = self.compute_entropy(buf)
def getBuffertype(self, buffercontent):
    """Return the MIME type of an in-memory buffer using whichever libmagic
    binding was detected at import time."""
    if MAGIC_AVAILABLE == MAGIC_PYTHON_FILE:
        # file-source bindings: use a pre-loaded magic cookie
        btype = self._get_file_magic().buffer(buffercontent)
    elif MAGIC_AVAILABLE == MAGIC_PYTHON_MAGIC:
        # pypi python-magic bindings
        btype = magic.from_buffer(buffercontent, mime=True)
    # NOTE(review): btype is unbound (NameError) if MAGIC_AVAILABLE matches
    # neither constant — presumably callers only run when magic is available.
    return btype
def __call__(self, data):
    """Validate an uploaded file's size bounds and its MIME type as detected
    by libmagic; raises ValidationError on any violation."""
    if self.max_size is not None and data.size > self.max_size:
        params = {
            'max_size': filesizeformat(self.max_size),
            'size': filesizeformat(data.size),
        }
        raise ValidationError(self.error_messages['max_size'], 'max_size', params)
    if self.min_size is not None and data.size < self.min_size:
        params = {
            # Bug fix: was `self.mix_size`, an AttributeError whenever an
            # undersized file was submitted.
            'min_size': filesizeformat(self.min_size),
            'size': filesizeformat(data.size)
        }
        raise ValidationError(self.error_messages['min_size'], 'min_size', params)
    if self.content_types:
        # sniff the real content type instead of trusting the client
        content_type = magic.from_buffer(data.read(), mime=True)
        if content_type not in self.content_types:
            params = {'content_type': content_type}
            raise ValidationError(self.error_messages['content_type'], 'content_type', params)

def __eq__(self, other):
    # All FileValidator instances compare equal (as in the original).
    return isinstance(other, FileValidator)
def save_malware(response, directory, black_list, white_list):
    """Persist a downloaded sample unless its MIME type is filtered.

    Returns True when the sample was processed (uploaded and/or written to
    disk), None when it was skipped by the black/white lists.
    """
    url = response.url
    data = response.content
    mime_type = magic.from_buffer(data, mime=True)
    if mime_type in black_list:
        logging.info('%s in ignore list for %s', mime_type, url)
        return
    # idiom fix: was `if mime_type in white_list: pass / else: ... return`
    if white_list and mime_type not in white_list:
        logging.info('%s not in whitelist for %s', mime_type, url)
        return
    # Hash and log
    md5 = hashlib.md5(data).hexdigest()
    # idiom fix: lazy logging args instead of eager %-formatting
    logging.info("%s hashes to %s", url, md5)
    # Assume that if viper or vxcage then we dont need to write to file as well.
    stored = False
    # Submit to external services
    if cfg['vxcage']:
        upload_vxcage(response, md5)
        stored = True
    if cfg['cuckoo']:
        upload_cuckoo(response, md5)
    if cfg['viper']:
        upload_viper(response, md5)
        stored = True
    # else save to disk
    if not stored:
        with open(os.path.join(directory, md5), 'wb') as f:
            f.write(data)
        logging.info("Saved %s to dump dir", md5)
    return True
def upload_to_gs(bucket_name, client_id, client_secret, file, key, acl='public-read'):
    """Upload `file` into a Google Storage bucket under a hashed key prefix.

    Returns the public URL on success, False when fewer bytes than expected
    were transmitted.
    """
    conn = GSConnection(client_id, client_secret, calling_format=OrdinaryCallingFormat())
    bucket = conn.get_bucket(bucket_name)
    obj = Key(bucket)
    # generate key
    filename = secure_filename(file.filename)
    key_dir = key + '/' + generate_hash(key) + '/'
    obj.key = key_dir + filename
    # delete old data under this prefix
    for stale in bucket.list(prefix='/' + key_dir):
        stale.delete()
    # set object settings
    payload = file.read()
    payload_mime = magic.from_buffer(payload, mime=True)
    size = len(payload)
    sent = obj.set_contents_from_string(
        payload,
        headers={
            'Content-Disposition': 'attachment; filename=%s' % filename,
            'Content-Type': '%s' % payload_mime
        }
    )
    obj.set_acl(acl)
    gs_url = 'https://storage.googleapis.com/%s/' % bucket_name
    if sent == size:
        return gs_url + obj.key
    return False
def get_objects_from_ceph(): local_cur.execute("SELECT etag FROM objects") existing_objects = set() for r in local_cur: existing_objects.add(r[0]) print len(existing_objects) s = IDigBioStorage() buckets = ["datasets","images"] count = 0 rowcount = 0 lrc = 0 for b_k in buckets: b = s.get_bucket("idigbio-" + b_k + "-prod") for k in b.list(): if k.name not in existing_objects: try: ks = k.get_contents_as_string(headers={'Range' : 'bytes=0-100'}) detected_mime = magic.from_buffer(ks, mime=True) local_cur.execute("INSERT INTO objects (bucket,etag,detected_mime) SELECT %(bucket)s,%(etag)s,%(dm)s WHERE NOT EXISTS (SELECT 1 FROM objects WHERE etag=%(etag)s)", {"bucket": b_k, "etag": k.name, "dm": detected_mime}) existing_objects.add(k.name) rowcount += local_cur.rowcount except: print "Ceph Error", b_k, k.name count += 1 if rowcount != lrc and rowcount % 10000 == 0: print count, rowcount local_pg.commit() lrc = rowcount print count, rowcount local_pg.commit()
def generate_filename(self, instance, filename):
    # Build a random (uuid-based) upload path, choosing a file extension that
    # matches the detected content type. Python 2 code (uses unicode()).
    if not self.random_filename:
        return super(WebDAVMixin, self).generate_filename(instance, filename)
    uuid_string = unicode(uuid.uuid4())
    file = getattr(instance, self.attname)
    # Prefer the declared content type when it is one we trust...
    if hasattr(file._file, 'content_type') and file._file.content_type in self.valid_content_types:
        content_type = file._file.content_type
    else:
        # ...otherwise sniff the first KiB with libmagic.
        try:
            file._file.seek(0)
            if self.custom_magic_file:
                content_type = magic.Magic(mime=True, magic_file=self.custom_magic_file).from_buffer(file._file.read(1024))
            else:
                content_type = magic.from_buffer(file._file.read(1024), mime=True)
        except TypeError as e:
            content_type = 'application/x-unknown'
    #Receiving all extensions and checking if file extension matches MIME Type
    extensions = mimetypes.guess_all_extensions(content_type)
    try:
        file_ext = re.findall(r'\.[^.]+$', filename)[0]
    except IndexError:
        # filename has no extension at all
        file_ext = None
    if file_ext in extensions:
        # keep the original extension when it is plausible for the MIME type
        ext = file_ext
    elif extensions:
        ext = extensions[0]
    else:
        ext = '.bin'
    # two-level fan-out directory derived from the uuid, e.g. ab/cd/<uuid>.<ext>
    return os.path.join(self.upload_to, uuid_string[:2], uuid_string[2:4], '%s%s' % (uuid_string, ext))
def from_subpacket(cls, subpacket):
    """Build an instance from an image-attribute subpacket.

    Format 1 (JPEG) has a fixed MIME type; anything else is sniffed from
    the first KiB of image data with libmagic.
    """
    if subpacket.image_format == C.JPEG_IMAGE_FORMAT:
        mime_type = 'image/jpeg'
    else:
        mime_type = magic.from_buffer(subpacket.data[:1024], mime=True).decode('ascii')
    return cls(mime_type, subpacket.data)
def is_rtf(self, data):
    """Return True when libmagic identifies `data` as an RTF document."""
    detected = magic.from_buffer(data, mime=True)
    return detected in ('text/rtf', 'application/rtf')
def get_attachment_file(self, attachment, form):
    """
    Loads the attachment file from the server and stores it into the
    attachment object given as a parameter. The form parameter is the
    mechanize Form to be submitted for downloading the attachment.
    The attachment parameter has to be an object of type
    model.attachment.Attachment.
    """
    # rate-limit scraping (WAIT_TIME seconds between requests)
    time.sleep(self.config.WAIT_TIME)
    logging.info("Getting attachment '%s'", attachment.identifier)
    if self.options.verbose:
        print "Getting attachment '%s'" % attachment.identifier
    mechanize_request = form.click()
    try:
        mform_response = mechanize.urlopen(mechanize_request)
        mform_url = mform_response.geturl()
        # only accept the download when the final (post-redirect) URL matches
        # one of the configured attachment-download targets
        if self.list_in_string(self.urls['ATTACHMENT_DOWNLOAD_TARGET'], mform_url):
            attachment.content = mform_response.read()
            attachment.mimetype = magic.from_buffer(attachment.content, mime=True)
            attachment.filename = self.make_attachment_filename(attachment.identifier, attachment.mimetype)
        else:
            logging.warn("Unexpected form target URL '%s'", mform_url)
            if self.options.verbose:
                sys.stderr.write("Unexpected form target URL '%s'\n" % mform_url)
    except mechanize.HTTPError as e:
        # download failure: attachment is returned without content
        logging.warn("HTTP Error: code %s, info: %s", e.code, e.msg)
        if self.options.verbose:
            print "HTTP-FEHLER:", e.code, e.msg
    return attachment
def validate(randname):
    # Validate an uploaded temp file as a known image type, rename it with
    # the right extension and thumbnail it; returns the new name, a dedup
    # hash, or None for unsupported types.
    newname = ''
    # libmagic description prefixes (note the trailing spaces: "GIF image...",
    # "PNG image..." yield 4-char prefixes 'GIF ' / 'PNG ')
    filetype = {'JPEG': 'jpg', 'GIF ': 'gif', 'PNG ': 'png', 'JPG ': 'jpg'}
    # NOTE(review): file handle is never closed, and the read is text-mode —
    # presumably Python 2; under Python 3 binary data would need open(..., 'rb')
    fileext = magic.from_buffer(open(randname).read(1024))[:4]
    if fileext in filetype:
        fileext = filetype[fileext]
        newname = str(randname) + '.' + str(fileext)
        # hasher returns a previously-stored name when this content is a duplicate
        hashed = hasher.append(hasher.hasher(randname, newname))
        if hashed:
            os.remove(randname)
            return hashed
        # HACK(review): shelling out to mv with unescaped names — shell
        # injection risk if randname/newname are ever attacker-controlled
        cmd = 'mv %s %s' % (randname, newname)
        os.popen(cmd)
        thumb(newname)
        return newname
    else:
        # unsupported type: discard the upload
        os.remove(randname)
        return None
def is_doc(self, data):
    """Return True when libmagic identifies `data` as a Word document
    (legacy .doc or OOXML .docx)."""
    detected = magic.from_buffer(data, mime=True)
    return detected in (
        'application/msword',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    )
def getBenefits(results, dir, ignore_invisibles):
    # For every analyzed resource, measure the byte savings achievable by
    # lossless optimization, lossy (q85) re-encode, and resizing to the
    # displayed dimensions. Returns rows of [filename, original size,
    # lossless saving, lossy saving, "WxH=>wxh", resize saving].
    benefits = []
    devnull = open(os.devnull, "wb")
    for result in results:
        (url, width, height) = analyzeResult(result)
        filedir, filename = resourceSlug(url, dir)
        try:
            buffer = open(filename, "rb").read()
        except IOError:
            # resource not present in the dump dir
            continue
        # first word of the libmagic description, e.g. "jpeg"/"png"/"gif"
        ext = magic.from_buffer(buffer).split()[0].lower()
        # If it's not one of the known image formats, return!
        # Sorry WebP
        if (ext != "jpeg") and (ext != "png") and (ext != "gif"):
            continue
        optimized_file_name = filename + "_lslsopt" + ext
        lossy_optimized_file_name = filename + "_lossyopt" + ext
        resized_file_name = filename + "_" + width + "_" + height + ext
        # optimize the original image
        copyfile(filename, optimized_file_name)
        call(["image_optim", optimized_file_name], stdout=devnull, stderr=devnull)
        # Lossy optimize the original image
        call(["convert", optimized_file_name, "-quality", "85", lossy_optimized_file_name])
        # call(["image_optim", lossy_optimized_file_name], stdout=devnull, stderr=devnull)
        # Resize the original image
        call(["convert", optimized_file_name, "-geometry", width + "x" + height, "-quality", "85", resized_file_name])
        # call(["image_optim", resized_file_name], stdout=devnull, stderr=devnull)
        # Get the original image's dimensions (ImageMagick identify via shell)
        original_dimensions = check_output('identify -format "%w,%h" ' + filename + "|sed 's/,/x/'", shell=True).strip()
        original_size = fileSize(filename)
        optimized_size = fileSize(optimized_file_name)
        lossy_optimized_size = fileSize(lossy_optimized_file_name)
        resized_size = fileSize(resized_file_name)
        # If resizing made the image larger, ignore it
        if resized_size > optimized_size:
            resized_size = optimized_size
        # if the image is not displayed, consider all its data as a waste
        if width == "0":
            resized_size = 0
            if ignore_invisibles:
                continue
        benefits.append(
            [
                filename,
                original_size,
                original_size - optimized_size,
                original_size - lossy_optimized_size,
                original_dimensions + "=>" + width + "x" + height,
                original_size - resized_size,
            ]
        )
    devnull.close()
    return benefits
def type_link(link, file_size=1024, sleep_time=2): """ type_link() will download a link, and then deturmine it's file type using magic numbers. """ #log( "Attempting to type link: '{0}' (using filesize: {1})".format(link,file_size) ) success = False file_type = "" try: if 'mailto:' in link: raise Exception('Invalid link type.') req = urllib2.Request(link, headers={'Range':"byte=0-{0}".format(file_size)}) # try and download the file five times ( in case the site is being fussy ) error_count = 0 while(error_count < 5): try: payload = urllib2.urlopen(req,timeout=5).read(file_size) #log( "Successfully downloaded the first {0} bytes of '{1}'.".format(file_size, link) ) break except Exception, e: #log( "Error within type_link while trying to download {0} bytes from URL:\n\t{1}\n".format(link,str(e)) ) if str(e) != 'time out': raise Exception(e) else: error_count += 1 time.sleep(sleep_time) # type file using libmagic file_type = magic.from_buffer(payload, mime=True) success = True
def get_filetype(file_path): """ Get file format identifier based on the type of the given file. @param file_path: file path @return: file type identifier or magic signature if format is not supported """ log = logging.getLogger("Core.GetFileType") if not os.path.exists(file_path): return None data = open(file_path, "rb").read() # Thanks to Jesse from malc0de.com for this suggestion. # First try official magic bindings, if something fails try to failover # on the unofficial bindings. try: ms = magic.open(magic.MAGIC_NONE) ms.load() file_type = ms.buffer(data) except: try: file_type = magic.from_buffer(data) except Exception, why: log.error("Something went wrong while retrieving magic: %s" % why) return None
def downloadFile(url, dir):
    # Download a URL into a 2-hex-char fan-out directory named after the md5
    # of the URL, saving body and headers side by side. Python 2 code.
    # NOTE(review): chdir mutates process-wide state for all later relative paths
    os.chdir(dir)
    url = url.strip()
    try:
        print "Downloading: ", url
        if url.startswith("http://"):
            url = url[7:]
        urlhost = url.split("/")[0]
        urlpath = "/".join(url.split("/")[1:])
        f = urlopen("http://" + url)
        hash = hashlib.md5()
        hash.update(url)
        # reuse `dir` as the 2-char fan-out subdirectory (shadows the parameter)
        dir = hash.hexdigest()[:2]
        if not os.path.exists(dir):
            os.mkdir(dir)
        buffer = f.read()
        # pick an extension from the first word of the libmagic description
        ext = magic.from_buffer(buffer).split()[0].lower()
        if "html" in ext:
            ext = "html.txt"
        filename = dir + "/" + urlhost + "_" + hash.hexdigest() + "." + ext
        with open(filename, "wb") as local_file:
            local_file.write(buffer)
            # NOTE(review): close() inside `with` is redundant (harmless)
            local_file.close()
        # store status code + headers next to the body
        with open(filename + ".hdr.txt", "wb") as local_file:
            local_file.write(str(f.getcode()) + "\n" + str(f.info()))
            local_file.close()
    except HTTPError, e:
        print "HTTPError:", e.code, url
def get_document_file(self, document, document_url, post=False):
    """
    Loads the document file from the server and stores it into the
    document object given as a parameter. The form parameter is the
    mechanize Form to be submitted for downloading the document.
    The document parameter has to be an object of type model.document.Document.
    """
    # rate-limit scraping
    time.sleep(self.config.WAIT_TIME)
    logging.info("Getting document '%s'", document.identifier)
    document_backup = document
    logging.info("Getting document %s from %s", document.identifier, document_url)
    if post:
        document_file = self.get_url(document_url, post_data={'DOLFDNR': '55434', 'options': '64'})
    else:
        document_file = self.get_url(document_url)
    if not document_file:
        # Bug fix: format string was "file %" (bare percent, no conversion
        # type), which makes logging emit a formatting error instead of the URL.
        logging.error("Error downloading file %s", document_url)
        return document
    document.content = document_file.content
    # catch strange magic exception
    try:
        document.mimetype = magic.from_buffer(document.content, mime=True)
    except magic.MagicException:
        logging.warn("Warning: unknown magic error at document %s from %s", document.identifier, document_url)
        return document_backup
    document.filename = self.make_document_filename(document.identifier, document.mimetype)
    return document
def filetype(data):
    """Best-effort file type description: try the official file-source magic
    bindings first, then fall back to the pypi python-magic API."""
    try:
        cookie = magic.open(magic.MAGIC_NONE)
        cookie.load()
        return cookie.buffer(data)
    except:
        # any failure (missing API, load error) falls through to python-magic
        return magic.from_buffer(data)
def getImage(self, imageUrl, referrer):
    """Fetch an image URL (sending a Referer header) and return (filename, content).

    Raises ValueError when the page yields no content/handle.
    """
    content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer})
    if not content or not handle:
        raise ValueError("Failed to retreive image from page '%s'!" % referrer)
    # derive a filename from the final (post-redirect) URL path
    fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
    fileN = bs4.UnicodeDammit(fileN).unicode_markup
    self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content)/1000.0)
    if not "." in fileN:
        # No extension in the URL: derive one from the Content-Type header.
        info = handle.info()
        if 'Content-Type' in info:
            tp = info['Content-Type']
            if ";" in tp:
                tp = tp.split(";")[0]
            ext = guess_extension(tp)
            if ext == None:
                ext = "unknown_ftype"
            print(info['Content-Type'], ext)
            fileN += "." + ext
        else:
            fileN += ".jpg"
    # Let magic figure out the files for us (it's probably smarter then kissmanga, anyways.)
    guessed = magic.from_buffer(content, mime=True)
    # Bug fix: was guess_extension(tp) — `tp` is unbound whenever the filename
    # already contained a dot (NameError); use the libmagic-sniffed MIME type.
    ext = guess_extension(guessed)
    if ext:
        fileN = fileN + ext
    return fileN, content
def get_mime(file):
    """Given a file, returns mimetype and extension"""
    detected = magic.from_buffer(file.read(2048), mime=True)
    # second argument False: do not restrict to strict/official extensions
    return detected, guess_extension(detected, False)
def upload_media(request):
    """
    Upload a media file from multi-part HTTP file request.
    @see https://docs.djangoproject.com/fr/1.10/ref/files/uploads/#custom-upload-handlers
    """
    if not request.FILES:
        raise SuspiciousOperation(_("No file specified"))
    up = request.FILES['file']
    # check file size
    if up.size > localsettings.max_file_size:
        # Bug fix: the exception was instantiated but never raised, so
        # oversized uploads were silently accepted.
        raise SuspiciousOperation(_("Upload file size limit is set to %i bytes") % localsettings.max_file_size)
    # simple check mime-types using the file extension (can process a test using libmagic)
    guessed_mime_type = mimetypes.guess_type(up.name)[0]
    if guessed_mime_type is None:
        # Bug fix: same missing `raise` as above.
        raise SuspiciousOperation(_("Undetermined uploaded file type"))
    # validate the file name and update it in way to be multi OS compliant
    # remove any '.' before and after
    name = up.name.strip('.')
    valid_name = io.StringIO()
    # replace forbidden characters by '_'
    for c in name:
        if ord(c) < 32 or c in ('<', '>', '"', '|', '\\', '`', '*', '?', ':', '/'):
            c = '_'
        valid_name.write(c)
    media = Media()
    # generate two levels of path from the uuid node
    l1_path = '%02x' % (((media.uuid.node & 0xffffff000000) >> 24) % 256)
    l2_path = '%02x' % ((media.uuid.node & 0x000000ffffff) % 256)
    local_path = os.path.join(l1_path, l2_path)
    local_file_name = str(media.uuid)
    media.name = os.path.join(local_path, local_file_name)
    media.version = 1
    media.file_name = valid_name.getvalue()
    media.file_size = up.size
    # default owner is the user of the upload
    media.owner_content_type = ContentType.objects.get_by_natural_key("auth", "user")
    media.owner_object_id = request.user.pk
    # create the path if necessary
    abs_path = os.path.join(localsettings.storage_path, local_path)
    if not os.path.exists(abs_path):
        os.makedirs(abs_path, 0o770)
    abs_file_name = os.path.join(abs_path, local_file_name)
    # test mime-type with a buffer of a least 1024 bytes
    test_mime_buffer = io.BytesIO()
    # copy file content, teeing the first chunks into the sniff buffer;
    # `with` ensures the destination is closed even if a chunk read fails
    with open(abs_file_name, "wb") as dst_file:
        for chunk in up.chunks():
            dst_file.write(chunk)
            if test_mime_buffer.tell() < 1024:
                test_mime_buffer.write(chunk)
    guessed_mime_type = magic.from_buffer(test_mime_buffer.getvalue(), mime=True)
    # 0660 on file
    os.chmod(abs_file_name, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)
    media.mime_type = guessed_mime_type  # up.content_type
    # save the model once file is correctly saved
    media.save()
    result = {
        'id': media.id,
        'uuid': media.uuid,
        'name': media.name,
        'created_date': media.created_date,
        'modified_date': media.modified_date,
        'file_name': media.file_name,
        'file_size': media.file_size
    }
    return HttpResponseRest(request, result)
def update_upload_media(request, uuid):
    """
    Upload a media file from multi-part HTTP file request.

    Replaces the content of an existing Media identified by its uuid,
    after checking the caller's ownership/change permission.
    """
    if not request.FILES:
        raise SuspiciousOperation(_("No file specified"))
    up = request.FILES['file']
    # check file size
    if up.size > localsettings.max_file_size:
        # Bug fix: the exception was instantiated but never raised.
        raise SuspiciousOperation(_("Upload file size limit is set to %i bytes") % localsettings.max_file_size)
    # simple check mime-types using the file extension (can process a test using libmagic)
    guessed_mime_type = mimetypes.guess_type(up.name)[0]
    if guessed_mime_type is None:
        # Bug fix: same missing `raise` as above.
        raise SuspiciousOperation(_("Undetermined uploaded file type"))
    media = get_object_or_404(Media, uuid=uuid)
    # check user permission on the media
    if media.owner_content_type == "auth.user":
        if media.owner_object_id != request.user.pk:
            raise PermissionDenied(_('Your are not the owner of the media'))
    else:
        perms = get_permissions_for(request.user, media.owner_content_type.app_label, media.owner_content_type.model, media.owner_object_id)
        if '%s.change_%s' % (media.owner_content_type.app_label, media.owner_content_type.model) not in perms:
            raise PermissionDenied(_('No change permission to the owner entity'))
    version = media.version + 1
    abs_file_name = os.path.join(localsettings.storage_path, media.name)
    if not os.path.isfile(abs_file_name):
        # Bug fix: same missing `raise` as above.
        raise SuspiciousOperation(_("Trying to update a non-existing file"))
    # test mime-type with a buffer of a least 1024 bytes
    test_mime_buffer = io.BytesIO()
    # copy file content, teeing the first chunks into the sniff buffer
    with open(abs_file_name, "wb") as dst_file:
        for chunk in up.chunks():
            dst_file.write(chunk)
            if test_mime_buffer.tell() < 1024:
                test_mime_buffer.write(chunk)
    guessed_mime_type = magic.from_buffer(test_mime_buffer.getvalue(), mime=True)
    # 0660 on file
    os.chmod(abs_file_name, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)
    # upgrade the version number and file size
    media.version = version
    media.file_size = up.size
    media.mime_type = guessed_mime_type  # up.content_type
    # update the model once file is correctly saved
    media.save()
    result = {
        'id': media.id,
        'uuid': media.uuid,
        'version': media.version,
        # Bug fix: was `media.content_type` — the field written above is
        # `mime_type`, so serialization failed with AttributeError.
        'mime_type': media.mime_type,
        'file_size': media.file_size,
        'modified_date': media.modified_date
    }
    return HttpResponseRest(request, result)
item_text_data = item_text_data.text if not os.path.exists(directory): os.makedirs(directory) try: r_image = get(item_image_data['src'], allow_redirects=True) if r_image.status_code != 404: filename, file_extension = os.path.splitext( urlparse( item_image_data['src']).path.split('/')[2]) if file_extension == '': mime = from_buffer( r_image.iter_content(256).__next__(), mime=True) if mime == 'image/jpeg': file_extension = '.jpg' else: print('Mime no adicionado. Agregar') quit() open(directory + '/' + filename + file_extension, 'wb').write(r_image.content) filename = filename + file_extension else: filename = '' print(id_item)
def get_manga_data(self, initial_data):
    """
    Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's url (provided by search)
    """
    assert 'url' in initial_data, 'Manga url is missing in initial data'
    r = self.session_get(self.manga_url.format(initial_data['url']), headers={'user-agent': USER_AGENT})
    if r is None:
        return None
    # sniff only the first 128 bytes: enough for libmagic to rule out non-HTML
    mime_type = magic.from_buffer(r.content[:128], mime=True)
    if r.status_code != 200 or mime_type != 'text/html':
        return None
    # Get true URL after redirects
    split_url = urlsplit(r.url)
    url = '{0}?{1}'.format(split_url.path, split_url.query)
    soup = BeautifulSoup(r.text, 'html.parser')
    data = initial_data.copy()
    data.update(
        dict(
            url=url,
            authors=[],
            scanlators=[],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
            cover=None,
        ))
    # Details
    info_element = soup.find('div', class_='info')
    for element in info_element.find_all(class_='genre'):
        # drop the label <span> before reading the text
        if element.span:
            element.span.extract()
        data['genres'].append(element.text.strip())
    for element in info_element.find_all(class_='author'):
        if element.span:
            element.span.extract()
        if element.a:
            element.a.extract()
        data['authors'].append(element.text.strip())
    detail_element = soup.find('div', class_='detail_body')
    if 'challenge' in data['url']:
        # Challenge (Canvas)
        data['cover'] = soup.find('div', class_='detail_header').img.get('src')
    else:
        # Original: cover URL is embedded in the inline style attribute
        data['cover'] = detail_element.get('style').split(
            ' ')[1][4:-1].split('?')[0] + '?type=q90'
    # Status
    value = detail_element.find('p', class_='day_info').text.strip()
    if value.find('COMPLETED') >= 0:
        data['status'] = 'complete'
    elif value.find('UP') >= 0:
        data['status'] = 'ongoing'
    data['synopsis'] = detail_element.find('p', class_='summary').text.strip()
    # Chapters
    data['chapters'] = self.get_manga_chapters_data(data['url'])
    return data
def attachment_pre_save(sender, instance, **kwargs):
    """Signal handler: sniff and store the attachment's MIME type before saving."""
    if not instance.file:
        return
    # the first KiB is enough for libmagic detection
    head = instance.file.read(1024)
    instance.mimetype = magic.from_buffer(head, mime=True)
def process(self):
    """Run one sandbox analysis task.

    Fetches the sample from the incoming task, boots the analysis VM (up
    to 3 attempts), injects and executes the sample under DRAKVUF for the
    requested timeout, then uploads collected artifacts as a follow-up
    "analysis" task.

    Returns early (producing no task) when: the requested timeout exceeds
    the hard limit, the file name contains invalid characters, no start
    command can be generated, or all 3 VM attempts fail.
    """
    sample = self.current_task.get_resource("sample")
    self.log.info("hostname: {}".format(socket.gethostname()))
    sha256sum = hashlib.sha256(sample.content).hexdigest()
    # Human-readable libmagic description; used below to detect DLLs
    magic_output = magic.from_buffer(sample.content)
    self.log.info("running sample sha256: {}".format(sha256sum))

    # Default timeout: 10 minutes; hard cap: 20 minutes
    timeout = self.current_task.payload.get('timeout') or 60 * 10
    hard_time_limit = 60 * 20
    if timeout > hard_time_limit:
        self.log.error(
            "Tried to run the analysis for more than hard limit of %d seconds",
            hard_time_limit)
        return

    analysis_uid = self.current_task.uid
    override_uid = self.current_task.payload.get('override_uid')

    self.log.info(f"analysis UID: {analysis_uid}")

    if override_uid:
        analysis_uid = override_uid
        self.log.info(f"override UID: {override_uid}")
        self.log.info(
            "note that artifacts will be stored under this overriden identifier"
        )

    # Expose which instance handles this analysis (for VNC access)
    self.rs.set(f"drakvnc:{analysis_uid}", INSTANCE_ID, ex=3600)  # 1h

    workdir = '/tmp/drakrun/vm-{}'.format(int(INSTANCE_ID))

    extension = self.current_task.headers.get("extension", "exe").lower()
    # libmagic labels PE DLLs with "(DLL)" in its description
    if '(DLL)' in magic_output:
        extension = 'dll'
    self.log.info("Running file as %s", extension)

    file_name = self.current_task.payload.get("file_name",
                                              "malwar") + f".{extension}"
    # Alphanumeric, dot, underscore, dash
    if not re.match(r"^[a-zA-Z0-9\._\-]+$", file_name):
        self.log.error("Filename contains invalid characters")
        return
    self.log.info("Using file name %s", file_name)

    start_command = self.current_task.payload.get(
        "start_command", self._get_start_command(extension, sample))
    if not start_command:
        self.log.error(
            "Unable to run malware sample, could not generate any suitable command to run it."
        )
        return

    # Wipe any leftovers from a previous run of this VM slot
    try:
        shutil.rmtree(workdir)
    except Exception as e:
        print(e)

    outdir = os.path.join(workdir, 'output')
    os.makedirs(workdir, exist_ok=True)
    os.mkdir(outdir)
    os.mkdir(os.path.join(outdir, 'dumps'))

    metadata = {
        "sample_sha256": sha256sum,
        "magic_output": magic_output,
        "time_started": int(time.time())
    }

    with open(os.path.join(outdir, 'sample_sha256.txt'), 'w') as f:
        f.write(hashlib.sha256(sample.content).hexdigest())

    with open(os.path.join(workdir, file_name), 'wb') as f:
        f.write(sample.content)

    watcher_tcpdump = None
    watcher_dnsmasq = None

    # Retry the whole VM + injection sequence up to 3 times; the for/else
    # below fires only when every attempt failed (no break).
    for _ in range(3):
        try:
            self.log.info("running vm {}".format(INSTANCE_ID))
            watcher_dnsmasq = start_dnsmasq(
                INSTANCE_ID, self.config.config['drakrun'].get(
                    'dns_server', '8.8.8.8'))

            d_run.logging = self.log
            d_run.run_vm(INSTANCE_ID)

            watcher_tcpdump = start_tcpdump_collector(INSTANCE_ID, outdir)

            self.log.info("running monitor {}".format(INSTANCE_ID))

            kernel_profile = os.path.join(PROFILE_DIR, "kernel.json")
            runtime_profile = os.path.join(PROFILE_DIR, "runtime.json")
            with open(runtime_profile, 'r') as runtime_f:
                rp = json.loads(runtime_f.read())
                inject_pid = rp['inject_pid']
                kpgd = rp['vmi_offsets']['kpgd']

            hooks_list = os.path.join(ETC_DIR, "hooks.txt")
            dump_dir = os.path.join(outdir, "dumps")
            drakmon_log_fp = os.path.join(outdir, "drakmon.log")

            # Copy the sample onto the guest's desktop via DRAKVUF injector
            injector_cmd = [
                "injector", "-o", "json", "-d",
                "vm-{vm_id}".format(vm_id=INSTANCE_ID), "-r", kernel_profile,
                "-i", inject_pid, "-k", kpgd, "-m", "writefile", "-e",
                f"%USERPROFILE%\\Desktop\\{file_name}", "-B",
                os.path.join(workdir, file_name)
            ]

            self.log.info("Running injector...")
            injector = subprocess.Popen(injector_cmd, stdout=subprocess.PIPE)
            outs, errs = injector.communicate(b"", 20)

            if injector.returncode != 0:
                raise subprocess.CalledProcessError(
                    injector.returncode, injector_cmd)

            # Path of the sample as seen inside the guest
            injected_fn = json.loads(outs)['ProcessName']
            net_enable = int(self.config.config['drakrun'].get(
                'net_enable', '0'))

            if "%f" not in start_command:
                self.log.warning("No file name in start command")

            cwd = subprocess.list2cmdline([ntpath.dirname(injected_fn)])
            cur_start_command = start_command.replace("%f", injected_fn)

            # don't include our internal maintenance commands
            metadata['start_command'] = cur_start_command
            cur_start_command = f"cd {cwd} & " + cur_start_command

            if net_enable:
                cur_start_command = "ipconfig /renew & " + cur_start_command

            full_cmd = subprocess.list2cmdline(
                ["cmd.exe", "/C", cur_start_command])
            self.log.info("Using command: %s", full_cmd)

            drakvuf_cmd = [
                "drakvuf", "-o", "json",
                "-x", "poolmon",
                "-x", "objmon",
                "-x", "socketmon",
                "-j", "5",
                "-t", str(timeout),
                "-i", inject_pid,
                "-k", kpgd,
                "-d", "vm-{vm_id}".format(vm_id=INSTANCE_ID),
                "--dll-hooks-list", hooks_list,
                "--memdump-dir", dump_dir,
                "-r", kernel_profile,
                "-e", full_cmd
            ]

            drakvuf_cmd.extend(self.get_profile_list())

            syscall_filter = self.config.config['drakrun'].get(
                'syscall_filter', None)
            if syscall_filter:
                drakvuf_cmd.extend(["-S", syscall_filter])

            with open(drakmon_log_fp, "wb") as drakmon_log:
                drakvuf = subprocess.Popen(drakvuf_cmd, stdout=drakmon_log)

                try:
                    # Grace period of 60s on top of the in-guest timeout
                    exit_code = drakvuf.wait(timeout + 60)
                except subprocess.TimeoutExpired as e:
                    # Escalate: SIGTERM, then SIGKILL, then re-raise
                    logging.error(
                        "BUG: Monitor command doesn\'t terminate automatically after timeout expires."
                    )
                    logging.error("Trying to terminate DRAKVUF...")
                    drakvuf.terminate()
                    drakvuf.wait(10)
                    logging.error(
                        "BUG: Monitor command also doesn\'t terminate after sending SIGTERM."
                    )
                    drakvuf.kill()
                    drakvuf.wait()
                    logging.error("Monitor command was forcefully killed.")
                    raise e

                if exit_code != 0:
                    raise subprocess.CalledProcessError(
                        exit_code, drakvuf_cmd)
            break
        except subprocess.CalledProcessError:
            self.log.info(
                "Something went wrong with the VM {}".format(INSTANCE_ID),
                exc_info=True)
        finally:
            # Always tear the VM down between attempts
            try:
                subprocess.run(
                    ["xl", "destroy", "vm-{}".format(INSTANCE_ID)],
                    cwd=workdir,
                    check=True)
            except subprocess.CalledProcessError:
                self.log.info(
                    "Failed to destroy VM {}".format(INSTANCE_ID),
                    exc_info=True)

            if watcher_dnsmasq:
                watcher_dnsmasq.terminate()
    else:
        self.log.info("Failed to analyze sample after 3 retries, giving up.")
        return

    self.log.info("waiting for tcpdump to exit")

    if watcher_tcpdump:
        try:
            watcher_tcpdump.wait(timeout=60)
        except subprocess.TimeoutExpired:
            self.log.exception("tcpdump doesn't exit cleanly after 60s")

    self.crop_dumps(os.path.join(outdir, 'dumps'),
                    os.path.join(outdir, 'dumps.zip'))

    if os.path.exists("/opt/procdot/procmon2dot"):
        self.generate_graphs(outdir)

    self.slice_logs(outdir)
    self.log.info("uploading artifacts")

    metadata['time_finished'] = int(time.time())

    with open(os.path.join(outdir, 'metadata.json'), 'w') as f:
        f.write(json.dumps(metadata))

    payload = {"analysis_uid": analysis_uid}
    payload.update(metadata)

    # Emit a follow-up task carrying all artifacts plus the sample itself
    t = Task(
        {
            "type": "analysis",
            "kind": "drakrun",
            "quality": self.current_task.headers.get("quality", "high")
        },
        payload=payload)

    for resource in self.upload_artifacts(analysis_uid, workdir):
        t.add_payload(resource.name, resource)

    t.add_payload('sample', sample)
    self.send_task(t)
DIR = os.path.join(DIR, "Dota_2") if not os.path.exists(DIR): os.mkdir(DIR) filepath = search_keywords filepath = filepath.replace(prefix, "") filepath = filepath.split() filepath = "_".join(filepath) DIR = os.path.join(DIR, filepath) if not os.path.exists(DIR): os.mkdir(DIR) if len(items[k][1]) != 0: with open(os.path.join(DIR, str(k+1)+"." + items[k][1]),'wb') as output_file: try: data = response.read() try: type_i = magic.from_buffer(data) print type_i if type_i.find("image", 0, 30): output_file.write(data) print ("saved ====> " + str(k+1)) + " url: " + items[k][0] else: print "wrong data type" response.close(); except Exception: print("Probably Magic.from_buffer exception at image "+str(k)) except Exception: print("Probably httplib.IncompleteRead: IncompleteRead at image "+str(k)) k=k+1; except IOError:
def detect_content_type(f):
    """Sniff the MIME type of a seekable binary stream via libmagic.

    A 2 KiB head sample is read and the stream is rewound to the start,
    so the caller can re-read the file from the beginning afterwards.
    """
    head = f.read(2048)
    f.seek(0)
    detected = magic.from_buffer(head, mime=True)
    return detected
def check_in_memory_mime(in_memory_file):
    """Return the libmagic-detected MIME type of an in-memory uploaded file.

    Args:
        in_memory_file: a seekable file-like object (e.g. a Django
            ``InMemoryUploadedFile`` — assumed seekable; confirm for other
            callers).

    Returns:
        The MIME type string reported by libmagic.
    """
    mime = magic.from_buffer(in_memory_file.read(), mime=True)
    # Fix: read() consumed the whole stream and left the pointer at EOF,
    # so any later consumer (e.g. the storage backend saving the upload)
    # would see empty content. Rewind before returning.
    in_memory_file.seek(0)
    return mime
def save(request, biz_cc_id):
    """Create or edit an "app maker" mini-app.

    POST params:
        id: primary key — decides whether this is a create or an edit
        name: app name
        desc: short description
        template_id: template ID
        template_scheme_id: execution scheme ID
    """
    try:
        params = request.POST.dict()
        jsonschema.validate(params, APP_MAKER_PARAMS_SCHEMA)
    except jsonschema.ValidationError as e:
        logger.warning(u"APP_MAKER_PARAMS_SCHEMA raise error: %s" % e)
        message = _(u"参数格式错误:%s" % e)
        return JsonResponse({'result': False, 'message': message})

    logo_obj = request.FILES.get('logo')
    if logo_obj:
        valid_mime = {'image/png', 'image/jpg', 'image/jpeg'}
        # First pass: the client-declared Content-Type (cheap but spoofable)
        is_png_or_jpg = (logo_obj.content_type in valid_mime)
        if not is_png_or_jpg:
            return JsonResponse({
                'result': False,
                'message': _(u"请上传 jpg 或 png 格式的图片")
            })
        file_size = logo_obj.size
        # Logo size must be under 100 KB
        if file_size > 100 * 1024:
            message = _(u"LOGO 文件大小必须小于 100K")
            return JsonResponse({'result': False, 'message': message})
        logo_content = logo_obj.read()
        # Second pass: verify the real content with libmagic, since the
        # declared Content-Type above is fully client-controlled.
        real_mime = magic.from_buffer(logo_content, mime=True)
        if real_mime not in valid_mime:
            return JsonResponse({'result': False, 'message': _(u"图片格式非法")})
    else:
        logo_content = None

    params.update({
        'username': request.user.username,
        'logo_content': logo_content,
    })
    if settings.IS_LOCAL:
        params['link_prefix'] = '%s/appmaker/' % request.get_host()
        fake = True
    else:
        params['link_prefix'] = '%sappmaker/' % settings.APP_HOST
        fake = False
    result, data = AppMaker.objects.save_app_maker(biz_cc_id, params, fake)
    if not result:
        # On failure `data` carries the error message
        return JsonResponse({'result': False, 'message': data})
    data = {
        'id': data.id,
        'code': data.code,
        'logo_url': data.logo_url,
    }
    return JsonResponse({"result": True, "data": data})
async def _unlocked_transfer_file_to_matrix(client: MautrixTelegramClient, intent: IntentAPI,
                                            loc_id: str, location: TypeLocation,
                                            thumbnail: TypeThumbnail, is_sticker: bool,
                                            tgs_convert: Optional[dict], filename: Optional[str],
                                            encrypt: bool, parallel_id: Optional[int]
                                            ) -> Optional[DBTelegramFile]:
    """Transfer a Telegram file to Matrix media storage (caller holds the
    per-file lock).

    Checks the DB cache first; optionally converts TGS stickers and webp
    images, encrypts the payload when requested, uploads it, and records
    the result. Returns the DB row, or None when the download fails.
    """
    # Already transferred previously? Reuse the cached record.
    db_file = DBTelegramFile.get(loc_id)
    if db_file:
        return db_file

    if parallel_id and isinstance(location, Document) and (not is_sticker or not tgs_convert):
        # Large documents go through the chunked parallel transfer path
        db_file = await parallel_transfer_to_matrix(client, intent, loc_id, location, filename,
                                                    encrypt, parallel_id)
        mime_type = location.mime_type
        file = None
    else:
        try:
            file = await client.download_file(location)
        except (LocationInvalidError, FileIdInvalidError):
            return None
        except (AuthBytesInvalidError, AuthKeyInvalidError, SecurityError) as e:
            log.exception(f"{e.__class__.__name__} while downloading a file.")
            return None

        width, height = None, None
        mime_type = magic.from_buffer(file, mime=True)
        image_converted = False

        # A weird bug in alpine/magic makes it return application/octet-stream for gzips...
        if is_sticker and tgs_convert and (mime_type == "application/gzip" or (
                mime_type == "application/octet-stream"
                and magic.from_buffer(file).startswith("gzip"))):
            # TGS stickers are gzipped Lottie animations; convert to the
            # configured target format
            mime_type, file, width, height = await convert_tgs_to(
                file, tgs_convert["target"], **tgs_convert["args"])
            thumbnail = None
            # Still gzip after conversion means the conversion was a no-op
            image_converted = mime_type != "application/gzip"

        if mime_type == "image/webp":
            new_mime_type, file, width, height = convert_image(
                file, source_mime="image/webp", target_type="png",
                thumbnail_to=(256, 256) if is_sticker else None)
            image_converted = new_mime_type != mime_type
            mime_type = new_mime_type
            thumbnail = None

        decryption_info = None
        upload_mime_type = mime_type
        if encrypt and encrypt_attachment:
            # Encrypted uploads are stored opaque; real type stays in the DB
            file, decryption_info_dict = encrypt_attachment(file)
            decryption_info = EncryptedFile.deserialize(decryption_info_dict)
            upload_mime_type = "application/octet-stream"
        content_uri = await intent.upload_media(file, upload_mime_type)
        if decryption_info:
            decryption_info.url = content_uri

        db_file = DBTelegramFile(id=loc_id, mxc=content_uri,
                                 decryption_info=decryption_info,
                                 mime_type=mime_type,
                                 was_converted=image_converted,
                                 timestamp=int(time.time()), size=len(file),
                                 width=width, height=height)
        if thumbnail and (mime_type.startswith("video/") or mime_type == "image/gif"):
            if isinstance(thumbnail, (PhotoSize, PhotoCachedSize)):
                thumbnail = thumbnail.location
            try:
                db_file.thumbnail = await transfer_thumbnail_to_matrix(
                    client, intent, thumbnail, file, mime_type, encrypt)
            except FileIdInvalidError:
                log.warning(f"Failed to transfer thumbnail for {thumbnail!s}",
                            exc_info=True)

    try:
        db_file.insert()
    except (IntegrityError, InvalidRequestError) as e:
        # Duplicate insert from a concurrent transfer of the same file is
        # harmless — log and return the row we built anyway.
        log.exception(f"{e.__class__.__name__} while saving transferred file data. "
                      "This was probably caused by two simultaneous transfers of the same file, "
                      "and should not cause any problems.")
    return db_file
def getMime(data=None, mimestr=None):
    """Best-effort MIME type detection.

    Args:
        data: raw bytes to sniff with libmagic (used when `mimestr` absent).
        mimestr: a file extension such as "png", looked up via `mimetypes`.

    Returns:
        A MIME type string; falls back to 'text/plain' when the extension
        is unknown or neither argument is given.
    """
    if mimestr:
        # Fix: guess_type() returns (type, encoding) and type is None for
        # unknown extensions — the original leaked that None to callers.
        guessed = mimetypes.guess_type('file.{0}'.format(mimestr))[0]
        return guessed or 'text/plain'
    elif data:
        mime = magic.from_buffer(data, mime=True)
        # Fix: current python-magic returns str, older versions returned
        # bytes; decoding unconditionally crashed on modern versions.
        return mime.decode('utf-8') if isinstance(mime, bytes) else mime
    return 'text/plain'
def get_buffer_ext(buffer):
    """Return the MIME subtype of *buffer*'s content (e.g. "png").

    The buffer is rewound before and after sniffing so the caller's read
    position is left at the start of the stream.
    """
    buffer.seek(0)
    detected = magic.from_buffer(buffer.read(), mime=True)
    buffer.seek(0)
    # "image/png" -> "png"
    return detected.rsplit('/', 1)[-1]
def guess_mime_type(data):
    """
    Guess a MIME type from magic bytes in a data stream.

    Note: `from_buffer` without `mime=True` yields libmagic's textual
    file-type description rather than a `type/subtype` MIME string.
    """
    description = magic.from_buffer(data)
    return description
def get_manga_data(self, initial_data):
    """
    Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Manga slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    # Sniff the first 128 bytes to confirm the response really is HTML
    mime_type = magic.from_buffer(r.content[:128], mime=True)
    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'html.parser')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            scanlators=[],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
            cover=None,
        ))

    data['name'] = soup.find('h2', class_='widget-title').text.strip()
    data['cover'] = self.cover_url.format(data['slug'])

    # Details: the info list is a <dl> of alternating <dt> label /
    # <dd> value pairs — remember the label, act on the value.
    elements = soup.find(
        'dl', class_='dl-horizontal').findChildren(recursive=False)
    for element in elements:
        if element.name not in ('dt', 'dd'):
            continue
        if element.name == 'dt':
            label = element.text
            continue
        value = element.text.strip()

        if label.startswith('Author') or label.startswith('Artist'):
            for t in value.split(','):
                t = t.strip()
                if t not in data['authors']:
                    data['authors'].append(t)
        elif label.startswith('Categories'):
            data['genres'] = [t.strip() for t in value.split(',')]
        elif label.startswith('Status'):
            status = value.lower()
            # Only the two values the UI understands are kept
            if status in ('ongoing', 'complete'):
                data['status'] = status

    data['synopsis'] = soup.find('div', class_='well').p.text.strip()
    alert_element = soup.find('div', class_='alert-danger')
    if alert_element:
        # Append any site warning banner to the synopsis
        data['synopsis'] += '\n\n' + alert_element.text.strip()

    # Chapters (listed newest-first on the page, stored oldest-first)
    elements = soup.find('ul', class_='chapters').find_all('li',
                                                           recursive=False)
    for element in reversed(elements):
        h5 = element.h5
        if not h5:
            continue

        slug = h5.a.get('href').split('/')[-1]
        title = '{0}: {1}'.format(h5.a.text.strip(), h5.em.text.strip())
        date = element.div.div

        data['chapters'].append(
            dict(slug=slug,
                 date=convert_date_string(date.text.strip(),
                                          format='%d %b. %Y'),
                 title=title))

    return data
def get_manga_data(self, initial_data):
    """
    Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Manga slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    # Sniff the first 128 bytes to confirm the response really is HTML
    mime_type = magic.from_buffer(r.content[:128], mime=True)
    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'html.parser')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            scanlators=[],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
        ))

    # The title element differs between page layouts; try both
    title_element = soup.find('h1', class_='manga-bg__title')
    if title_element is None:
        title_element = soup.find('h1', class_='manga__title')
    data['name'] = title_element.text.strip()
    if data.get('cover') is None:
        data['cover'] = self.cover_url.format(data['slug'])

    # Details (French-language site: labels are Auteur/Artiste/Scantrad/...)
    elements = soup.find(
        'div', class_='manga-info').find_all(class_='info-list__row')
    for element in elements:
        label = element.strong.text.strip()

        if label.startswith('Auteur') or label.startswith('Artiste'):
            value = element.a.text.strip()
            for t in value.split(','):
                t = t.strip()
                if t not in data['authors']:
                    data['authors'].append(t)
        elif label.startswith('Scantrad'):
            a_element = element.find_all('a')[0]
            # Scanlator names are rendered as "[name]" — strip the brackets
            data['scanlators'] = [
                a_element.text.replace('[', '').replace(']', '').strip(),
            ]
        elif label.startswith('Genres'):
            a_elements = element.find_all('a')
            data['genres'] = [
                a_element.text.strip() for a_element in a_elements
            ]
        elif label.startswith('Statut'):
            status = element.span.text.strip().lower()
            if status == 'en cours':
                data['status'] = 'ongoing'
            elif status == 'terminé':
                data['status'] = 'complete'

    # Synopsis
    data['synopsis'] = soup.find('div',
                                 class_='info-desc__content').text.strip()

    # Chapters (listed newest-first on the page, stored oldest-first)
    elements = soup.find('div', class_='chapters-list').find_all(
        'div', class_='chapter-item')
    for element in reversed(elements):
        a_element = element.find('div', class_='chapter-item__name').a
        slug = a_element.get('href').split('/')[-1]
        title = a_element.text.strip()
        date = element.find('div',
                            class_='chapter-item__date').text.strip()

        data['chapters'].append(
            dict(
                slug=slug,
                title=title,
                date=convert_date_string(date, format='%d.%m.%Y'),
            ))

    return data
def is_text_file(file):
    """Heuristically decide whether *file* (raw bytes) holds text.

    Sniffs only the first 1 KiB; libmagic descriptions containing
    "text" or "empty" count as text.
    """
    description = magic.from_buffer(file[:1024])
    return ("text" in description) or ("empty" in description)
def download_from_url(self, url):
    """ Download a package from an url

    Returns:
        A ``(local_path, mime_type)`` tuple, or ``(None, None)`` on HTTP
        error, disallowed mime type, or download failure.
    """
    # Fix: initialize up front — previously an exception raised before
    # these were assigned made the final return crash with a NameError.
    downloaded_file = None
    mime = None
    try:
        self.log.info("Start downloading {0}".format(url))
        # create an empty temporary file
        downloaded_file = tempfile.NamedTemporaryFile(delete=False).name

        # process the download
        with open(downloaded_file, "wb") as f:
            response = requests.get(url, stream=STREAM)
            total_length = response.headers.get('content-length')

            # check the http response code
            if response.status_code != 200:
                self.log.error(
                    "Error while downloading the package : HTTP {0}".
                    format(response.status_code))
                return None, None

            # check the mime type from the first bytes.
            # Fix: use the builtin next() — the py2-only .next() method
            # crashed on Python 3.
            peek = next(response.iter_content(256))
            mime = magic.from_buffer(peek, mime=True)
            if mime not in ALLOWED_MIMES:
                self.log.error(
                    "The package downloaded has not a compliant mime type : {0}. The mime type should be one of these : {1}"
                    .format(mime, ALLOWED_MIMES))
                return None, None

            # download
            if STREAM:
                # Fix: the peeked chunk was consumed from the wire and was
                # never written, so streamed downloads were missing their
                # first 256 bytes.
                f.write(peek)
                for data in response.iter_content(chunk_size=1024):
                    if data:
                        f.write(data)
                        f.flush()
                os.fsync(f)
            else:
                # Non-streaming: response.content is fully cached and
                # still includes the peeked bytes.
                f.write(response.content)
        # total_length is kept only for logging/diagnostic purposes
        del total_length
    except Exception:
        self.log.error("Error while downloading the package : {0}".format(
            traceback.format_exc()))
        return None, None

    self.log.info("Download finished")
    return downloaded_file, mime
def get_content_type(scan):
    """Sniff the MIME type of the *scan* stream via libmagic.

    Rewinds before and after reading, so the caller's position is always
    left at the start of the stream.
    """
    scan.seek(0)
    detected = force_text(magic.from_buffer(scan.read(1024), mime=True))
    scan.seek(0)
    return detected
def clean_avatar(self):
    """Validate the uploaded avatar: MIME type, file extension, size and
    per-user avatar quota.

    Raises forms.ValidationError on any violation.
    """
    data = self.cleaned_data['avatar']

    if settings.AVATAR_ALLOWED_MIMETYPES:
        # python-magic is an optional dependency, only needed when content
        # sniffing is enabled
        try:
            import magic
        except ImportError:
            raise ImportError("python-magic library must be installed in "
                              "order to use uploaded file content "
                              "limitation")

        # Construct 256 bytes needed for mime validation
        # NOTE(review): on Python 3, data.chunks() yields bytes, so the
        # "" + chunk concatenation below assumes Python 2 — confirm.
        magic_buffer = ""
        for chunk in data.chunks():
            magic_buffer += chunk
            if len(magic_buffer) >= 256:
                break

        # https://github.com/ahupp/python-magic#usage
        mime = magic.from_buffer(magic_buffer, mime=True)
        if mime not in settings.AVATAR_ALLOWED_MIMETYPES:
            err = _("File content is invalid. Detected: %(mimetype)s "
                    "Allowed content types are: %(valid_mime_list)s")
            conf = {
                'valid_mime_list':
                ", ".join(settings.AVATAR_ALLOWED_MIMETYPES),
                'mimetype': mime
            }
            raise forms.ValidationError(err % conf)

    if settings.AVATAR_ALLOWED_FILE_EXTS:
        root, ext = os.path.splitext(data.name.lower())
        if ext not in settings.AVATAR_ALLOWED_FILE_EXTS:
            valid_exts = ", ".join(settings.AVATAR_ALLOWED_FILE_EXTS)
            error = _("%(ext)s is an invalid file extension. "
                      "Authorized extensions are : %(valid_exts_list)s")
            raise forms.ValidationError(error % {
                'ext': ext,
                'valid_exts_list': valid_exts
            })

    if data.size > settings.AVATAR_MAX_SIZE:
        error = _("Your file is too big (%(size)s), "
                  "the maximum allowed size is %(max_valid_size)s")
        raise forms.ValidationError(
            error % {
                'size': filesizeformat(data.size),
                'max_valid_size': filesizeformat(settings.AVATAR_MAX_SIZE)
            })

    count = Avatar.objects.filter(user=self.user).count()
    if (settings.AVATAR_MAX_AVATARS_PER_USER > 1
            and count >= settings.AVATAR_MAX_AVATARS_PER_USER):
        error = _("You already have %(nb_avatars)d avatars, "
                  "and the maximum allowed is %(nb_max_avatars)d.")
        raise forms.ValidationError(
            error % {
                'nb_avatars': count,
                'nb_max_avatars': settings.AVATAR_MAX_AVATARS_PER_USER,
            })
    # NOTE(review): Django clean_<field> methods conventionally return the
    # cleaned value; this returns None — confirm callers rely on that.
    return
def file_mime(self, jfile):
    """Return True when libmagic classifies *jfile*'s content as
    'text/plain', False otherwise.

    Note: consumes the stream — the caller is left at EOF.
    """
    detected = magic.from_buffer(jfile.read(), mime=True)
    return detected == 'text/plain'
# CLI helper: upload a local file to the JSON-RPC API on behalf of a user.
import sys
import base64

from jsonrpc.proxy import ServiceProxy
from django.core.serializers import serialize
import magic, json

if __name__ == '__main__':
    # Expect exactly two arguments: the user's primary key and a file path.
    if len(sys.argv) != 3:
        print('Usage {} <user_pk> <filename>'.format(sys.argv[0]))
        sys.exit(1)

    user_pk, filename = sys.argv[1], sys.argv[2]

    rpc_server = ServiceProxy('http://localhost:8000/api/')

    with open(filename, 'rb') as fh:
        raw = fh.read()

    # Package the file as JSON: sniffed MIME type plus base64-encoded body.
    payload = {
        'filename': filename,
        'mime_type': magic.from_buffer(raw, mime=True),
        'content': base64.b64encode(raw).decode('utf-8'),
    }
    rpc_server.api.upload_file(user_pk, json.dumps(payload))
def mime(self) -> str:
    """MIME type of this object, sniffed from its leading bytes.

    ``self.read(count=261)`` yields byte chunks which are joined before
    sniffing; 261 bytes presumably covers the longest supported file
    signature — confirm against the reader's contract.
    """
    head = b''.join(self.read(count=261))
    return magic.from_buffer(head, mime=True)
async def transfer_thumbnail_to_matrix(
    client: MautrixTelegramClient,
    intent: IntentAPI,
    thumbnail_loc: TypeLocation,
    mime_type: str,
    encrypt: bool,
    video: bytes | None,
    custom_data: bytes | None = None,
    width: int | None = None,
    height: int | None = None,
    async_upload: bool = False,
) -> DBTelegramFile | None:
    """Transfer a file thumbnail to Matrix media storage.

    The thumbnail content comes from, in order of preference: the caller's
    pre-rendered `custom_data`, a frame extracted from `video`, or a direct
    Telegram download of `thumbnail_loc`. Returns the cached or newly
    inserted DB record, or None when thumbnailing isn't possible.
    """
    # Pillow/moviepy are optional dependencies; no thumbnails without them
    if not Image or not VideoFileClip:
        return None

    loc_id = _location_to_id(thumbnail_loc)
    if not loc_id:
        return None

    if custom_data:
        # Distinguish caller-provided thumbnails from Telegram-sourced ones
        # in the cache key
        loc_id += "-mau_custom_thumbnail"

    # Already transferred previously? Reuse the cached record.
    db_file = await DBTelegramFile.get(loc_id)
    if db_file:
        return db_file

    video_ext = sane_mimetypes.guess_extension(mime_type)
    if custom_data:
        file = custom_data
    elif VideoFileClip and video_ext and video:
        try:
            file, width, height = _read_video_thumbnail(video, video_ext,
                                                        frame_ext="png")
        except OSError:
            return None
        mime_type = "image/png"
    else:
        file = await client.download_file(thumbnail_loc)
        width, height = None, None
        mime_type = magic.from_buffer(file, mime=True)

    decryption_info = None
    upload_mime_type = mime_type
    if encrypt:
        # Encrypted uploads are stored opaque; real type stays in the DB
        file, decryption_info = encrypt_attachment(file)
        upload_mime_type = "application/octet-stream"
    content_uri = await intent.upload_media(file, upload_mime_type,
                                            async_upload=async_upload)
    if decryption_info:
        decryption_info.url = content_uri

    db_file = DBTelegramFile(
        id=loc_id,
        mxc=content_uri,
        mime_type=mime_type,
        was_converted=False,
        timestamp=int(time.time()),
        size=len(file),
        width=width,
        height=height,
        decryption_info=decryption_info,
    )
    try:
        await db_file.insert()
    except (UniqueViolationError, IntegrityError) as e:
        # Duplicate insert from a concurrent transfer — log and carry on
        log.exception(
            f"{e.__class__.__name__} while saving transferred file thumbnail data. "
            "This was probably caused by two simultaneous transfers of the same file, "
            "and might (but probably won't) cause problems with thumbnails or something."
        )
    return db_file
def _process_downloads(cls, properties, sheets):
    """Validate and persist an attached download delivered as a data URI.

    Parses the data URI in properties[cls.download_property]['href'],
    cross-checks the declared MIME type against libmagic and against the
    file extension, enforces the schema's allowed types, records image
    dimensions, stores the payload as a Blob, and rewrites 'href' to a
    @@download path.

    Returns the (possibly copied) ``(properties, sheets)`` pair.
    Raises ValidationFailure on any inconsistency.
    """
    prop_name = cls.download_property
    attachment = properties.get(prop_name, {})
    href = attachment.get('href', None)
    if href is not None:
        if not href.startswith('data:'):
            msg = "Expected data URI."
            raise ValidationFailure('body', [prop_name, 'href'], msg)

        # Copy before mutating: the incoming dicts belong to the caller
        properties = properties.copy()
        properties[prop_name] = attachment = attachment.copy()

        if sheets is None:
            sheets = {}
        else:
            sheets = sheets.copy()
        sheets['downloads'] = downloads = {}
        download_meta = downloads[prop_name] = {}

        try:
            mime_type_declared, charset, data = parse_data_uri(href)
        except (ValueError, TypeError):
            msg = 'Could not parse data URI.'
            raise ValidationFailure('body', [prop_name, 'href'], msg)
        if charset is not None:
            download_meta['charset'] = charset

        # Make sure the mimetype appears to be what the client says it is
        # NOTE(review): .decode('utf-8') assumes an older python-magic that
        # returns bytes; current releases return str — confirm the pinned
        # version.
        mime_type_detected = magic.from_buffer(data, mime=True).decode('utf-8')
        if mime_type_declared and not mimetypes_are_equal(
                mime_type_declared, mime_type_detected):
            msg = "Incorrect file type. (Appears to be %s)" % mime_type_detected
            raise ValidationFailure('body', [prop_name, 'href'], msg)
        mime_type = mime_type_declared or mime_type_detected
        attachment['type'] = mime_type
        if mime_type is not None:
            download_meta['type'] = mime_type

        # Make sure mimetype is not disallowed
        try:
            allowed_types = cls.schema['properties'][prop_name][
                'properties']['type']['enum']
        except KeyError:
            # Schema declares no enum: all types allowed
            pass
        else:
            if mime_type not in allowed_types:
                raise ValidationFailure('body', [prop_name, 'href'],
                                        'Mimetype is not allowed.')

        # Make sure the file extensions matches the mimetype
        download_meta['download'] = filename = attachment['download']
        mime_type_from_filename, _ = mimetypes.guess_type(filename)
        if not mimetypes_are_equal(mime_type, mime_type_from_filename):
            raise ValidationFailure(
                'body', [prop_name, 'href'],
                'Wrong file extension for %s mimetype.' % mime_type)

        # Validate images and store height/width
        major, minor = mime_type.split('/')
        if major == 'image' and minor in ('png', 'jpeg', 'gif', 'tiff'):
            stream = BytesIO(data)
            im = Image.open(stream)
            im.verify()
            attachment['width'], attachment['height'] = im.size

        # Persist the raw payload and point 'href' at the download view
        blob_id = uuid4()
        download_meta['blob_id'] = str(blob_id)
        session = DBSession()
        blob = Blob(blob_id=blob_id, data=data)
        session.add(blob)
        attachment['href'] = '@@download/%s/%s' % (prop_name, quote(filename))

    return properties, sheets
async def _unlocked_transfer_file_to_matrix(
    client: MautrixTelegramClient,
    intent: IntentAPI,
    loc_id: str,
    location: TypeLocation,
    thumbnail: TypeThumbnail,
    is_sticker: bool,
    tgs_convert: dict | None,
    filename: str | None,
    encrypt: bool,
    parallel_id: int | None,
    async_upload: bool = False,
) -> DBTelegramFile | None:
    """Transfer a Telegram file to Matrix media storage (caller holds the
    per-file lock).

    Checks the DB cache first; optionally converts TGS stickers, encrypts
    the payload when requested, uploads it (plus a thumbnail for videos /
    GIFs / converted animations), and records the result. Returns the DB
    row, or None when the download fails.
    """
    # Already transferred previously? Reuse the cached record.
    db_file = await DBTelegramFile.get(loc_id)
    if db_file:
        return db_file

    converted_anim = None

    if parallel_id and isinstance(location, Document) and (not is_sticker
                                                           or not tgs_convert):
        # Large documents go through the chunked parallel transfer path
        db_file = await parallel_transfer_to_matrix(client, intent, loc_id,
                                                    location, filename,
                                                    encrypt, parallel_id)
        mime_type = location.mime_type
        file = None
    else:
        try:
            file = await client.download_file(location)
        except (LocationInvalidError, FileIdInvalidError):
            return None
        except (AuthBytesInvalidError, AuthKeyInvalidError, SecurityError) as e:
            log.exception(f"{e.__class__.__name__} while downloading a file.")
            return None

        width, height = None, None
        mime_type = magic.from_buffer(file, mime=True)
        image_converted = False

        # A weird bug in alpine/magic makes it return application/octet-stream for gzips...
        is_tgs = mime_type == "application/gzip" or (
            mime_type == "application/octet-stream"
            and magic.from_buffer(file).startswith("gzip"))
        if is_sticker and tgs_convert and is_tgs:
            # TGS stickers are gzipped Lottie animations; convert to the
            # configured target format
            converted_anim = await convert_tgs_to(file, tgs_convert["target"],
                                                  **tgs_convert["args"])
            mime_type = converted_anim.mime
            file = converted_anim.data
            width, height = converted_anim.width, converted_anim.height
            # Still gzip after conversion means the conversion was a no-op
            image_converted = mime_type != "application/gzip"
            thumbnail = None

        decryption_info = None
        upload_mime_type = mime_type
        if encrypt and encrypt_attachment:
            # Encrypted uploads are stored opaque; real type stays in the DB
            file, decryption_info = encrypt_attachment(file)
            upload_mime_type = "application/octet-stream"
        content_uri = await intent.upload_media(file, upload_mime_type,
                                                async_upload=async_upload)
        if decryption_info:
            decryption_info.url = content_uri

        db_file = DBTelegramFile(
            id=loc_id,
            mxc=content_uri,
            decryption_info=decryption_info,
            mime_type=mime_type,
            was_converted=image_converted,
            timestamp=int(time.time()),
            size=len(file),
            width=width,
            height=height,
        )
        if thumbnail and (mime_type.startswith("video/")
                          or mime_type == "image/gif"):
            if isinstance(thumbnail, (PhotoSize, PhotoCachedSize)):
                thumbnail = thumbnail.location
            try:
                db_file.thumbnail = await transfer_thumbnail_to_matrix(
                    client,
                    intent,
                    thumbnail,
                    video=file,
                    mime_type=mime_type,
                    encrypt=encrypt,
                    async_upload=async_upload,
                )
            except FileIdInvalidError:
                log.warning(f"Failed to transfer thumbnail for {thumbnail!s}",
                            exc_info=True)
        elif converted_anim and converted_anim.thumbnail_data:
            # Use the thumbnail produced by the TGS conversion itself
            db_file.thumbnail = await transfer_thumbnail_to_matrix(
                client,
                intent,
                location,
                video=None,
                encrypt=encrypt,
                custom_data=converted_anim.thumbnail_data,
                mime_type=converted_anim.thumbnail_mime,
                width=converted_anim.width,
                height=converted_anim.height,
                async_upload=async_upload,
            )

    try:
        await db_file.insert()
    except (UniqueViolationError, IntegrityError) as e:
        # Duplicate insert from a concurrent transfer of the same file is
        # harmless — log and return the row we built anyway.
        log.exception(
            f"{e.__class__.__name__} while saving transferred file data. "
            "This was probably caused by two simultaneous transfers of the same file, "
            "and should not cause any problems.")
    return db_file
# coding: utf-8
# Tiny demo: sniff the MIME type of a local GIF with python-magic.
import magic

with open("test.gif", mode="rb") as gif:
    header = gif.read(512)
    print(magic.from_buffer(header, mime=True))
def load_known_pii(known_pii_locations: List[str],
                   storage_connection_string: Optional[str] = None
                   ) -> List[KnownFilthItem]:
    """This function loads tagged filth from a csv and transforms it into a dict that the detector can use"""
    start_time = time.time()
    click.echo("Loading Known Filth...")
    # Imported lazily: pandas is only needed for this benchmark helper
    import pandas as pd

    # This will be a list of records containing all the info from the loaded tagged pii files
    known_pii = []  # type: List[Dict[str, Any]]
    logger = logging.getLogger(
        'scrubadub.tests.benchmark_accuracy_real_data.load_known_pii')

    # These are the column names that we want
    target_cols = {'match', 'filth_type'}
    # These are some optional column names that we will use to filter extra columns out
    target_cols_optional = {'match_end', 'limit', 'ignore_case',
                            'ignore_whitespace',
                            'ignore_partial_word_matches'}
    # This is an alternate set of column names that are also accepted instead of the ones listed in `target_cols`
    target_cols_alt = {'pii_type', 'pii_start', 'pii_end'}

    # We loop over all tagged PII files
    for known_pii_location in known_pii_locations:
        file_data = load_files(
            known_pii_location,
            storage_connection_string=storage_connection_string)
        # Loop over the results from the load_files function, could be more than one file if we provide a directory
        # in `known_pii_location`
        for file_name, data in file_data.items():
            # Pick the pandas reader based on the sniffed content type:
            # xlsx files go to read_excel, everything else is treated as CSV
            mime_type = magic.from_buffer(data, mime=True)
            pandas_reader = pd.read_csv
            if mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
                pandas_reader = pd.read_excel
            else:
                data = decode_text(
                    {file_name: data},
                    allowed_mime_types=['application/csv']
                )[file_name].encode('utf-8')

            dataframe = None  # type: Optional[DataFrame]
            # Work out how many rows to skip in this loop, starting at zero going up to 9
            for n_rows_to_skip in range(10):
                dataframe = pandas_reader(
                    io.BytesIO(data),
                    skiprows=n_rows_to_skip,
                    dtype={
                        'match': str,
                        'match_end': str,
                        'filth_type': str,
                        'pii_start': str,
                        'pii_end': str,
                        'pii_type': str,
                    }).rename(columns=lambda x: x.strip())
                # If we find the `target_cols` then we found the correct number of rows to skip so we break from
                # this loop
                if (set(dataframe.columns.to_list()) & target_cols) == target_cols:
                    break
                # if we find the `target_cols_alt`, we convert those to the standard set of names and then break
                elif (set(dataframe.columns.to_list()) & target_cols_alt) == target_cols_alt:
                    dataframe = dataframe.rename(
                        columns={
                            'pii_type': 'filth_type',
                            'pii_start': 'match',
                            'pii_end': 'match_end',
                        }
                    )
                    # Normalize alternate filth-type spellings to the
                    # canonical detector names
                    dataframe = dataframe.replace({
                        "filth_type": {
                            "organisation": "organization",
                            "card-number": "credit_card",
                            "dob": "date_of_birth",
                            "driverslicence": "drivers_licence",
                            "postcode": "postalcode",
                            "licenceplate": "vehicle_licence_plate",
                        }
                    })
                    break
                dataframe = None

            # We weren't able to find the correct columns so raise an error
            if dataframe is None:
                raise ValueError(
                    f'Unable to read file: {known_pii_location} Are the file format (csv or xslx) and '
                    f'columns (match, match_end, filth_type and optionally limit) correct?')

            # strip() the main columns
            for col in ['match', 'match_end', 'filth_type']:
                dataframe[col] = dataframe[col].str.strip()

            # drop rows if the column 'match' has null values
            if pd.isnull(dataframe['match']).sum() > 0:
                dataframe = dataframe.dropna(axis='index', subset=['match'])
                logger.warning(
                    f"The KnownFilth column 'match' contains some null/blank entries in '{file_name}'. "
                    f"Skipping these rows."
                )

            # drop rows if the column 'filth_type' has null values
            if pd.isnull(dataframe['filth_type']).sum() > 0:
                dataframe = dataframe.dropna(axis='index',
                                             subset=['filth_type'])
                logger.warning(
                    f"The KnownFilth column 'filth_type' contains some null/blank entries in '{file_name}'. "
                    f"Skipping these rows."
                )

            # Convert the dataframe to a dict in records format and add it to the big list of tagged pii
            known_pii += dataframe[
                [col for col in dataframe.columns
                 if col in (target_cols | target_cols_optional)]
            ].to_dict(orient='records')

    # Loop over each of the tagged pieces of pii
    for item in known_pii:
        for sub_item in ('limit', 'match_end', 'ignore_case',
                         'ignore_whitespace',
                         'ignore_partial_word_matches'):
            # if each of the above keys exist, delete it if its empty
            if sub_item in item.keys():
                if pd.isnull(item[sub_item]):
                    del item[sub_item]
                elif isinstance(item[sub_item], str) and len(item[sub_item].strip()) == 0:
                    del item[sub_item]
                elif 'ignore' in sub_item:
                    # if ignore is in the name of the item, then try to convert it to a bool
                    item[sub_item] = convert_to_bool(item[sub_item])
            if 'ignore' in sub_item and sub_item not in item:
                # if an ignore flag is not set then default it to true
                item[sub_item] = True

    end_time = time.time()
    click.echo("Loaded Known Filth in {:.2f}s".format(end_time-start_time))

    return known_pii
def add_file(
    app_context,
    url,
    original_url=None,
    key=None,
    filename=None,
    *args,
    **kwargs,
):
    """Adds files to s3.

    Downloads the file at ``url`` (unless it is already on S3 / public),
    keys it by a hash of its content, verifies the MIME type against the
    restricted list, and uploads it (or refreshes its metadata).

    Args:
        app_context: Original app context should be passed here if running
            in separate thread
        url: source location of the file.
        original_url: pre-existing original URL, if any.
        key: existing object key, used as a filename fallback.
        filename: preferred stored filename.

    Returns:
        A dict with "key", "filename", "url" (and "original_url" when the
        source was an external http URL).

    Raises:
        UnsupportedFileError: when the detected MIME type is restricted.
    """
    with app_context.app.app_context():
        is_s3_or_public_url = current_s3_instance.is_s3_url_with_bucket_prefix(
            url) or current_s3_instance.is_public_url(url)
        # Fast path: file already lives on S3 and metadata refresh is off
        if is_s3_or_public_url and not current_app.config.get(
                "UPDATE_S3_FILES_METADATA", False):
            result = {}
            if key not in url:
                # Derive the key from the URL's last path segment
                filename = filename or key
                key = url.split("/")[-1]
            result.update({"key": key, "filename": filename})
            if current_s3_instance.is_s3_url(url):
                url = current_s3_instance.get_public_url(key)
            result.update({"url": url})
            LOGGER.info(
                "File already on S3 - Skipping",
                url=url,
                key=key,
                thread=threading.get_ident(),
            )
            return result

        file_data = download_file_from_url(url)
        # Content-addressed key: identical content maps to the same object
        new_key = hash_data(file_data)
        mimetype = magic.from_buffer(file_data, mime=True)
        file_data = BytesIO(file_data)
        filename = filename or key
        if not filename:
            filename = new_key
        if mimetype in current_app.config.get(
                "FILES_RESTRICTED_MIMETYPES"):
            LOGGER.error(
                "Unsupported file type - Aborting",
                key=key,
                mimetype=mimetype,
                thread=threading.get_ident(),
            )
            raise UnsupportedFileError(mimetype)
        acl = current_app.config["S3_FILE_ACL"]
        if current_s3_instance.file_exists(new_key):
            # Same content already stored: only refresh its metadata
            LOGGER.info(
                "Replacing file metadata",
                key=new_key,
                thread=threading.get_ident(),
            )
            current_s3_instance.replace_file_metadata(
                new_key, filename, mimetype, acl)
        else:
            LOGGER.info(
                "Uploading file to s3",
                key=new_key,
                thread=threading.get_ident(),
            )
            current_s3_instance.upload_file(file_data, new_key, filename,
                                            mimetype, acl)
        result = {
            "key": new_key,
            "filename": filename,
            "url": current_s3_instance.get_public_url(new_key),
        }
        # Remember where an external (non-S3) file originally came from
        if (url.startswith("http")
                and not current_s3_instance.is_s3_url(url)
                and not current_s3_instance.is_public_url(url)
                and not original_url):
            result["original_url"] = url
        return result