def check_file(self, data, uri): mime_type = magic.from_buffer(data, mime=True) if mime_type == 'application/gzip': #peek inside compressed content data = zlib.decompress(data, 16+zlib.MAX_WBITS) mime_type = magic.from_buffer(data, mime=True) if mime_type in self.mime_types: sha256 = hashlib.sha256(data).hexdigest() print "Checking %s from %s" %(sha256, uri) result = self.get_vt_result(sha256) if result['response_code'] == 1 and result['positives'] >= self.vt_threshold: #in VT and we care about it return False if result['response_code'] == 0: #not in VT files = {"file": (sha256, data), "apikey" : self.vt_apikey} requests.post("https://www.virustotal.com/vtapi/v2/file/scan", files=files) max_count = 6 # We'll wait 3 minutes and then fail open count = 0 while count < max_count: result = self.get_vt_result(sha256) if result['response_code'] == 1 and result['positives'] >= self.vt_threshold: #in VT and we care about it return False if result['response_code'] == 1 and result['positives'] < self.vt_threshold: #in VT and we don't care about it return True time.sleep(30) count += 1 return True
def main(): try: opts, args = getopt.getopt(sys.argv[1:], "b:f:hm", ["bytes=", "file=", "help", "mime"]) except getopt.GetoptError as err: print str(err) bytes = 10 file = "" mime = False for o, a in opts: if o in ("-f", "--file"): file = a elif o in ("-m", "--mime"): mime = True elif o in ("-b", "--bytes"): bytes = int(a) elif o in ("-h", "--help"): printUsage() sys.exit(0) if len(file) > 0: print magic.from_buffer(open(file).read(bytes), mime) else: print "Please specify a file"
def _generate_file_metadata(self, data):
    """Populate filetype, mimetype, size, and hash fields from raw file bytes.

    Each probe is best-effort: failures store a sentinel value instead of
    propagating.
    """
    import pydeep
    import magic
    from hashlib import md5, sha1, sha256
    try:
        self.filetype = magic.from_buffer(data)
    except Exception:  # was a bare except; narrowed so KeyboardInterrupt escapes
        self.filetype = "Unavailable"
    try:
        mimetype = magic.from_buffer(data, mime=True)
        if mimetype:
            # strip any "; charset=..." suffix libmagic may append
            self.mimetype = mimetype.split(";")[0]
        else:
            self.mimetype = "unknown"
    except Exception:
        self.mimetype = "Unavailable"
    self.size = len(data)
    # this is a shard key. you can't modify it once it's set.
    # MongoEngine will still mark the field as modified even if you set it
    # to the same value.
    if not self.md5:
        self.md5 = md5(data).hexdigest()
    self.sha1 = sha1(data).hexdigest()
    self.sha256 = sha256(data).hexdigest()
    try:
        self.ssdeep = pydeep.hash_bytes(data)
    except Exception:
        self.ssdeep = None
def test_old_from_buffer(self):
    """Check from_buffer() output, textual and MIME, against every known fixture."""
    for name, expected in TEST_FILES.items():
        path = os.path.join(TEST_DATA_DIR, name)
        with open(path, 'rb') as fh:
            head = fh.read(1024)
        # expected[2] is the textual description, expected[0] the MIME type
        self.assertMatches(magic.from_buffer(head, mime=False), expected[2])
        self.assertMatches(magic.from_buffer(head, mime=True), expected[0])
def get_byte_mime(bytes):
    """
    Shortcut to get a mime from bytes in a variable.

    :param bytes: raw file content (bytes-like)
    :return: decoded MIME type string
    """
    # Bug fix: the result was computed but never returned, so callers
    # always received None.
    return magic.from_buffer(bytes, mime=True).decode("utf-8")
def __init__(self, p_path):
    """Open Redis/ARDB connections from the AIL config file and derive paste
    metadata (name, size, mime type, date, source) from its path.

    Raises Exception when the configuration file cannot be located.
    """
    configfile = os.path.join(os.environ['AIL_BIN'], 'packages/config.cfg')
    if not os.path.exists(configfile):
        raise Exception('Unable to find the configuration file. \
                        Did you set environment variables? \
                        Or activate the virtualenv.')
    cfg = configparser.ConfigParser()
    cfg.read(configfile)
    self.cache = redis.StrictRedis(
        host=cfg.get("Redis_Queues", "host"),
        port=cfg.getint("Redis_Queues", "port"),
        db=cfg.getint("Redis_Queues", "db"),
        decode_responses=True)
    self.store = redis.StrictRedis(
        host=cfg.get("Redis_Data_Merging", "host"),
        port=cfg.getint("Redis_Data_Merging", "port"),
        db=cfg.getint("Redis_Data_Merging", "db"),
        decode_responses=True)
    self.store_metadata = redis.StrictRedis(
        host=cfg.get("ARDB_Metadata", "host"),
        port=cfg.getint("ARDB_Metadata", "port"),
        db=cfg.getint("ARDB_Metadata", "db"),
        decode_responses=True)
    PASTES_FOLDER = os.path.join(os.environ['AIL_HOME'], cfg.get("Directories", "pastes"))
    if PASTES_FOLDER not in p_path:
        self.p_rel_path = p_path
        p_path = os.path.join(PASTES_FOLDER, p_path)
    else:
        self.p_rel_path = None
    self.p_path = p_path
    self.p_name = os.path.basename(self.p_path)
    self.p_size = round(os.path.getsize(self.p_path) / 1024.0, 2)
    # Bug fix: removed the dead `magic.from_buffer("test", mime=True)` call
    # whose result was immediately overwritten by the line below.
    self.p_mime = magic.from_buffer(self.get_p_content(), mime=True)
    # Assuming that the paste will alway be in a day folder which is itself
    # in a month folder which is itself in a year folder.
    # /year/month/day/paste.gz
    var = self.p_path.split('/')
    self.p_date = Date(var[-4], var[-3], var[-2])
    # NOTE(review): this unconditionally overwrites the p_rel_path set above
    self.p_rel_path = os.path.join(var[-4], var[-3], var[-2], self.p_name)
    self.p_source = var[-5]
    self.supposed_url = 'https://{}/{}'.format(self.p_source.replace('_pro', ''), var[-1].split('.gz')[0])
    self.p_encoding = None
    self.p_hash_kind = {}
    self.p_hash = {}
    self.p_langage = None
    self.p_nb_lines = None
    self.p_max_length_line = None
    self.array_line_above_threshold = None
    self.p_duplicate = None
    self.p_tags = None
def test_rethrow(self):
    """A MagicException raised by magic_buffer must propagate out of from_buffer."""
    saved = magic.magic_buffer
    try:
        def fake_buffer(cookie, buf):
            raise magic.MagicException("passthrough")
        magic.magic_buffer = fake_buffer
        with self.assertRaises(magic.MagicException):
            magic.from_buffer("hello", True)
    finally:
        # always restore the real implementation
        magic.magic_buffer = saved
def mimetype(file):
    """Read MIME type from file"""
    # Fast path: treat the argument as a path usable by from_file().
    try:
        return from_file(file, mime=True)
    except (OSError, TypeError, ValueError):
        pass
    # Fallback: the argument may be a file-like object...
    try:
        data = file.read()
    except AttributeError:
        # ...or already raw bytes.
        return from_buffer(file, mime=True)
    return from_buffer(data, mime=True)
def test_format(self):
    """Converting one source image to jpg then gif must yield matching MIME types."""
    client = self.get_muto_client()
    client.from_url(self.TEST_IMAGES[0])
    for fmt, expected_mime in (('jpg', 'image/jpeg'), ('gif', 'image/gif')):
        client.format = fmt
        processed = client.process()
        self.assertEqual(magic.from_buffer(processed.read(), mime=True), expected_mime)
def guess_mimetype(blob):
    """
    uses file magic to determine the mime-type of the given data blob.

    :param blob: file content as read by file.read()
    :type blob: data
    :returns: mime-type, falls back to 'application/octet-stream'
    :rtype: str
    """
    # Two incompatible python bindings for libmagic exist: the bindings
    # shipped with the file source (exposing magic.open, packaged as
    # python-magic on Debian/Ubuntu) and the pypi python-magic package from
    # https://github.com/ahupp/python-magic (exposing from_buffer).
    # See https://github.com/pazz/alot/pull/588 for details.
    if hasattr(magic, 'open'):
        ms = magic.open(magic.MAGIC_MIME_TYPE)
        ms.load()
        reported = ms.buffer(blob)
    elif hasattr(magic, 'from_buffer'):
        reported = magic.from_buffer(blob, mime=True)
    else:
        raise Exception('Unknown magic API')
    # libmagic does not always return proper mimetype strings, cf. issue #459
    if re.match(r'\w+\/\w+', reported):
        return reported
    return 'application/octet-stream'
def check_rsrc(self):
    """
    Function needed to determine the compilation language

    Walks the PE resource directory and returns a dict mapping a running
    index to (name, rva, size, filetype, lang, sublang) tuples; returns
    False when resource parsing fails entirely.
    """
    try:
        ret = {}
        if hasattr(self.pe, 'DIRECTORY_ENTRY_RESOURCE'):
            # The libmagic description is computed from the file header and is
            # identical for every resource, so read the file once up front
            # (previously re-opened, and leaked, on every inner iteration).
            with open(self.filename, 'rb') as fh:
                filetype = magic.from_buffer(fh.read(1024))
            i = 0
            for resource_type in self.pe.DIRECTORY_ENTRY_RESOURCE.entries:
                if resource_type.name is not None:
                    name = "%s" % resource_type.name
                else:
                    type_id = pefile.RESOURCE_TYPE.get(resource_type.struct.Id)
                    if type_id is None:
                        # Bug fix: the old `if name == None` check compared the
                        # *string* "None" against None and never fired, so
                        # unknown resource types were labelled "None" instead
                        # of their numeric id.
                        name = "%d" % resource_type.struct.Id
                    else:
                        name = "%s" % type_id
                if hasattr(resource_type, 'directory'):
                    for resource_id in resource_type.directory.entries:
                        if hasattr(resource_id, 'directory'):
                            for resource_lang in resource_id.directory.entries:
                                try:
                                    # get_data() can raise PEFormatError for
                                    # malformed entries; the data itself is
                                    # only fetched for that validation.
                                    data = self.pe.get_data(resource_lang.data.struct.OffsetToData, resource_lang.data.struct.Size)
                                    lang = pefile.LANG.get(resource_lang.data.lang, 'qq_*unknown*')
                                    sublang = pefile.get_sublang_name_for_lang(
                                        resource_lang.data.lang, resource_lang.data.sublang
                                    )
                                    ret[i] = (name, resource_lang.data.struct.OffsetToData,
                                              resource_lang.data.struct.Size, filetype, lang, sublang)
                                    i += 1
                                except pefile.PEFormatError:
                                    pass
    except Exception:  # was a bare except
        ret = False
    finally:
        # NOTE(review): returning from finally swallows any in-flight
        # exception — kept for backward compatibility with callers.
        return ret
def compute_hashes(self):
    """
    Compute the file hashes (sha256/sha1/md5/ssdeep), MIME type and entropy
    for the sample on disk, filling only fields that are still None.
    Returns False when the file is missing or unreadable.
    """
    filename = self.get_file_path(self.sha256)
    # Make sure the file exists and is readable
    if not os.access(filename, os.R_OK):
        flash('There was an error while trying to analyse the file.', 'danger')
        return False
    with open(filename, 'rb') as f:
        buf = f.read()
    if self.sha256 is None:
        self.sha256 = hashlib.sha256(buf).hexdigest()
    if self.sha1 is None:
        self.sha1 = hashlib.sha1(buf).hexdigest()
    if self.md5 is None:
        self.md5 = hashlib.md5(buf).hexdigest()
    if self.ssdeep is None:
        self.ssdeep = ssdeep.hash(buf)
    if self.mime is None:
        try:
            self.mime = magic.from_buffer(buf, mime=True).decode('utf-8')
        except Exception:
            # was a bare except; kept best-effort (mime stays None) but
            # narrowed so SystemExit/KeyboardInterrupt propagate
            self.mime = None
    if self.entropy is None:
        self.entropy = self.compute_entropy(buf)
def getBuffertype(self, buffercontent):
    """Return the MIME type of an in-memory buffer using whichever libmagic
    binding was detected at import time."""
    if MAGIC_AVAILABLE == MAGIC_PYTHON_FILE:
        # file-source bindings: use a pre-loaded magic cookie
        btype = self._get_file_magic().buffer(buffercontent)
    elif MAGIC_AVAILABLE == MAGIC_PYTHON_MAGIC:
        # pypi python-magic bindings
        btype = magic.from_buffer(buffercontent, mime=True)
    # NOTE(review): btype is unbound (NameError) if MAGIC_AVAILABLE matches
    # neither constant — presumably callers only run when magic is available.
    return btype
def __call__(self, data):
    """Validate an uploaded file's size bounds and its MIME type as detected
    by libmagic; raises ValidationError on any violation."""
    if self.max_size is not None and data.size > self.max_size:
        params = {
            'max_size': filesizeformat(self.max_size),
            'size': filesizeformat(data.size),
        }
        raise ValidationError(self.error_messages['max_size'], 'max_size', params)
    if self.min_size is not None and data.size < self.min_size:
        params = {
            # Bug fix: was `self.mix_size`, an AttributeError whenever an
            # undersized file was submitted.
            'min_size': filesizeformat(self.min_size),
            'size': filesizeformat(data.size)
        }
        raise ValidationError(self.error_messages['min_size'], 'min_size', params)
    if self.content_types:
        # sniff the real content type instead of trusting the client
        content_type = magic.from_buffer(data.read(), mime=True)
        if content_type not in self.content_types:
            params = {'content_type': content_type}
            raise ValidationError(self.error_messages['content_type'], 'content_type', params)

def __eq__(self, other):
    # All FileValidator instances compare equal (as in the original).
    return isinstance(other, FileValidator)
def save_malware(response, directory, black_list, white_list):
    """Persist a downloaded sample unless its MIME type is filtered.

    Returns True when the sample was processed (uploaded and/or written to
    disk), None when it was skipped by the black/white lists.
    """
    url = response.url
    data = response.content
    mime_type = magic.from_buffer(data, mime=True)
    if mime_type in black_list:
        logging.info('%s in ignore list for %s', mime_type, url)
        return
    # idiom fix: was `if mime_type in white_list: pass / else: ... return`
    if white_list and mime_type not in white_list:
        logging.info('%s not in whitelist for %s', mime_type, url)
        return
    # Hash and log
    md5 = hashlib.md5(data).hexdigest()
    # idiom fix: lazy logging args instead of eager %-formatting
    logging.info("%s hashes to %s", url, md5)
    # Assume that if viper or vxcage then we dont need to write to file as well.
    stored = False
    # Submit to external services
    if cfg['vxcage']:
        upload_vxcage(response, md5)
        stored = True
    if cfg['cuckoo']:
        upload_cuckoo(response, md5)
    if cfg['viper']:
        upload_viper(response, md5)
        stored = True
    # else save to disk
    if not stored:
        with open(os.path.join(directory, md5), 'wb') as f:
            f.write(data)
        logging.info("Saved %s to dump dir", md5)
    return True
def upload_to_gs(bucket_name, client_id, client_secret, file, key, acl='public-read'):
    """Upload `file` into a Google Storage bucket under a hashed key prefix.

    Returns the public URL on success, False when fewer bytes than expected
    were transmitted.
    """
    conn = GSConnection(client_id, client_secret, calling_format=OrdinaryCallingFormat())
    bucket = conn.get_bucket(bucket_name)
    obj = Key(bucket)
    # generate key
    filename = secure_filename(file.filename)
    key_dir = key + '/' + generate_hash(key) + '/'
    obj.key = key_dir + filename
    # delete old data under this prefix
    for stale in bucket.list(prefix='/' + key_dir):
        stale.delete()
    # set object settings
    payload = file.read()
    payload_mime = magic.from_buffer(payload, mime=True)
    size = len(payload)
    sent = obj.set_contents_from_string(
        payload,
        headers={
            'Content-Disposition': 'attachment; filename=%s' % filename,
            'Content-Type': '%s' % payload_mime
        }
    )
    obj.set_acl(acl)
    gs_url = 'https://storage.googleapis.com/%s/' % bucket_name
    if sent == size:
        return gs_url + obj.key
    return False
def get_objects_from_ceph(): local_cur.execute("SELECT etag FROM objects") existing_objects = set() for r in local_cur: existing_objects.add(r[0]) print len(existing_objects) s = IDigBioStorage() buckets = ["datasets","images"] count = 0 rowcount = 0 lrc = 0 for b_k in buckets: b = s.get_bucket("idigbio-" + b_k + "-prod") for k in b.list(): if k.name not in existing_objects: try: ks = k.get_contents_as_string(headers={'Range' : 'bytes=0-100'}) detected_mime = magic.from_buffer(ks, mime=True) local_cur.execute("INSERT INTO objects (bucket,etag,detected_mime) SELECT %(bucket)s,%(etag)s,%(dm)s WHERE NOT EXISTS (SELECT 1 FROM objects WHERE etag=%(etag)s)", {"bucket": b_k, "etag": k.name, "dm": detected_mime}) existing_objects.add(k.name) rowcount += local_cur.rowcount except: print "Ceph Error", b_k, k.name count += 1 if rowcount != lrc and rowcount % 10000 == 0: print count, rowcount local_pg.commit() lrc = rowcount print count, rowcount local_pg.commit()
def generate_filename(self, instance, filename):
    # Build a random (uuid-based) upload path, choosing a file extension that
    # matches the detected content type. Python 2 code (uses unicode()).
    if not self.random_filename:
        return super(WebDAVMixin, self).generate_filename(instance, filename)
    uuid_string = unicode(uuid.uuid4())
    file = getattr(instance, self.attname)
    # Prefer the declared content type when it is one we trust...
    if hasattr(file._file, 'content_type') and file._file.content_type in self.valid_content_types:
        content_type = file._file.content_type
    else:
        # ...otherwise sniff the first KiB with libmagic.
        try:
            file._file.seek(0)
            if self.custom_magic_file:
                content_type = magic.Magic(mime=True, magic_file=self.custom_magic_file).from_buffer(file._file.read(1024))
            else:
                content_type = magic.from_buffer(file._file.read(1024), mime=True)
        except TypeError as e:
            content_type = 'application/x-unknown'
    #Receiving all extensions and checking if file extension matches MIME Type
    extensions = mimetypes.guess_all_extensions(content_type)
    try:
        file_ext = re.findall(r'\.[^.]+$', filename)[0]
    except IndexError:
        # filename has no extension at all
        file_ext = None
    if file_ext in extensions:
        # keep the original extension when it is plausible for the MIME type
        ext = file_ext
    elif extensions:
        ext = extensions[0]
    else:
        ext = '.bin'
    # two-level fan-out directory derived from the uuid, e.g. ab/cd/<uuid>.<ext>
    return os.path.join(self.upload_to, uuid_string[:2], uuid_string[2:4], '%s%s' % (uuid_string, ext))
def from_subpacket(cls, subpacket):
    """Build an instance from an image-attribute subpacket.

    Format 1 (JPEG) has a fixed MIME type; anything else is sniffed from
    the first KiB of image data with libmagic.
    """
    if subpacket.image_format == C.JPEG_IMAGE_FORMAT:
        mime_type = 'image/jpeg'
    else:
        mime_type = magic.from_buffer(subpacket.data[:1024], mime=True).decode('ascii')
    return cls(mime_type, subpacket.data)
def is_rtf(self, data):
    """Return True when libmagic identifies `data` as an RTF document."""
    detected = magic.from_buffer(data, mime=True)
    return detected in ('text/rtf', 'application/rtf')
def get_attachment_file(self, attachment, form):
    """
    Loads the attachment file from the server and stores it into the
    attachment object given as a parameter. The form parameter is the
    mechanize Form to be submitted for downloading the attachment.
    The attachment parameter has to be an object of type
    model.attachment.Attachment.
    """
    # rate-limit scraping (WAIT_TIME seconds between requests)
    time.sleep(self.config.WAIT_TIME)
    logging.info("Getting attachment '%s'", attachment.identifier)
    if self.options.verbose:
        print "Getting attachment '%s'" % attachment.identifier
    mechanize_request = form.click()
    try:
        mform_response = mechanize.urlopen(mechanize_request)
        mform_url = mform_response.geturl()
        # only accept the download when the final (post-redirect) URL matches
        # one of the configured attachment-download targets
        if self.list_in_string(self.urls['ATTACHMENT_DOWNLOAD_TARGET'], mform_url):
            attachment.content = mform_response.read()
            attachment.mimetype = magic.from_buffer(attachment.content, mime=True)
            attachment.filename = self.make_attachment_filename(attachment.identifier, attachment.mimetype)
        else:
            logging.warn("Unexpected form target URL '%s'", mform_url)
            if self.options.verbose:
                sys.stderr.write("Unexpected form target URL '%s'\n" % mform_url)
    except mechanize.HTTPError as e:
        # download failure: attachment is returned without content
        logging.warn("HTTP Error: code %s, info: %s", e.code, e.msg)
        if self.options.verbose:
            print "HTTP-FEHLER:", e.code, e.msg
    return attachment
def validate(randname):
    # Validate an uploaded temp file as a known image type, rename it with
    # the right extension and thumbnail it; returns the new name, a dedup
    # hash, or None for unsupported types.
    newname = ''
    # libmagic description prefixes (note the trailing spaces: "GIF image...",
    # "PNG image..." yield 4-char prefixes 'GIF ' / 'PNG ')
    filetype = {'JPEG': 'jpg', 'GIF ': 'gif', 'PNG ': 'png', 'JPG ': 'jpg'}
    # NOTE(review): file handle is never closed, and the read is text-mode —
    # presumably Python 2; under Python 3 binary data would need open(..., 'rb')
    fileext = magic.from_buffer(open(randname).read(1024))[:4]
    if fileext in filetype:
        fileext = filetype[fileext]
        newname = str(randname) + '.' + str(fileext)
        # hasher returns a previously-stored name when this content is a duplicate
        hashed = hasher.append(hasher.hasher(randname, newname))
        if hashed:
            os.remove(randname)
            return hashed
        # HACK(review): shelling out to mv with unescaped names — shell
        # injection risk if randname/newname are ever attacker-controlled
        cmd = 'mv %s %s' % (randname, newname)
        os.popen(cmd)
        thumb(newname)
        return newname
    else:
        # unsupported type: discard the upload
        os.remove(randname)
        return None
def is_doc(self, data):
    """Return True when libmagic identifies `data` as a Word document
    (legacy .doc or OOXML .docx)."""
    detected = magic.from_buffer(data, mime=True)
    return detected in (
        'application/msword',
        'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    )
def getBenefits(results, dir, ignore_invisibles):
    # For every analyzed resource, measure the byte savings achievable by
    # lossless optimization, lossy (q85) re-encode, and resizing to the
    # displayed dimensions. Returns rows of [filename, original size,
    # lossless saving, lossy saving, "WxH=>wxh", resize saving].
    benefits = []
    devnull = open(os.devnull, "wb")
    for result in results:
        (url, width, height) = analyzeResult(result)
        filedir, filename = resourceSlug(url, dir)
        try:
            buffer = open(filename, "rb").read()
        except IOError:
            # resource not present in the dump dir
            continue
        # first word of the libmagic description, e.g. "jpeg"/"png"/"gif"
        ext = magic.from_buffer(buffer).split()[0].lower()
        # If it's not one of the known image formats, return!
        # Sorry WebP
        if (ext != "jpeg") and (ext != "png") and (ext != "gif"):
            continue
        optimized_file_name = filename + "_lslsopt" + ext
        lossy_optimized_file_name = filename + "_lossyopt" + ext
        resized_file_name = filename + "_" + width + "_" + height + ext
        # optimize the original image
        copyfile(filename, optimized_file_name)
        call(["image_optim", optimized_file_name], stdout=devnull, stderr=devnull)
        # Lossy optimize the original image
        call(["convert", optimized_file_name, "-quality", "85", lossy_optimized_file_name])
        # call(["image_optim", lossy_optimized_file_name], stdout=devnull, stderr=devnull)
        # Resize the original image
        call(["convert", optimized_file_name, "-geometry", width + "x" + height, "-quality", "85", resized_file_name])
        # call(["image_optim", resized_file_name], stdout=devnull, stderr=devnull)
        # Get the original image's dimensions (ImageMagick identify via shell)
        original_dimensions = check_output('identify -format "%w,%h" ' + filename + "|sed 's/,/x/'", shell=True).strip()
        original_size = fileSize(filename)
        optimized_size = fileSize(optimized_file_name)
        lossy_optimized_size = fileSize(lossy_optimized_file_name)
        resized_size = fileSize(resized_file_name)
        # If resizing made the image larger, ignore it
        if resized_size > optimized_size:
            resized_size = optimized_size
        # if the image is not displayed, consider all its data as a waste
        if width == "0":
            resized_size = 0
            if ignore_invisibles:
                continue
        benefits.append(
            [
                filename,
                original_size,
                original_size - optimized_size,
                original_size - lossy_optimized_size,
                original_dimensions + "=>" + width + "x" + height,
                original_size - resized_size,
            ]
        )
    devnull.close()
    return benefits
def type_link(link, file_size=1024, sleep_time=2): """ type_link() will download a link, and then deturmine it's file type using magic numbers. """ #log( "Attempting to type link: '{0}' (using filesize: {1})".format(link,file_size) ) success = False file_type = "" try: if 'mailto:' in link: raise Exception('Invalid link type.') req = urllib2.Request(link, headers={'Range':"byte=0-{0}".format(file_size)}) # try and download the file five times ( in case the site is being fussy ) error_count = 0 while(error_count < 5): try: payload = urllib2.urlopen(req,timeout=5).read(file_size) #log( "Successfully downloaded the first {0} bytes of '{1}'.".format(file_size, link) ) break except Exception, e: #log( "Error within type_link while trying to download {0} bytes from URL:\n\t{1}\n".format(link,str(e)) ) if str(e) != 'time out': raise Exception(e) else: error_count += 1 time.sleep(sleep_time) # type file using libmagic file_type = magic.from_buffer(payload, mime=True) success = True
def get_filetype(file_path): """ Get file format identifier based on the type of the given file. @param file_path: file path @return: file type identifier or magic signature if format is not supported """ log = logging.getLogger("Core.GetFileType") if not os.path.exists(file_path): return None data = open(file_path, "rb").read() # Thanks to Jesse from malc0de.com for this suggestion. # First try official magic bindings, if something fails try to failover # on the unofficial bindings. try: ms = magic.open(magic.MAGIC_NONE) ms.load() file_type = ms.buffer(data) except: try: file_type = magic.from_buffer(data) except Exception, why: log.error("Something went wrong while retrieving magic: %s" % why) return None
def downloadFile(url, dir):
    # Download a URL into a 2-hex-char fan-out directory named after the md5
    # of the URL, saving body and headers side by side. Python 2 code.
    # NOTE(review): chdir mutates process-wide state for all later relative paths
    os.chdir(dir)
    url = url.strip()
    try:
        print "Downloading: ", url
        if url.startswith("http://"):
            url = url[7:]
        urlhost = url.split("/")[0]
        urlpath = "/".join(url.split("/")[1:])
        f = urlopen("http://" + url)
        hash = hashlib.md5()
        hash.update(url)
        # reuse `dir` as the 2-char fan-out subdirectory (shadows the parameter)
        dir = hash.hexdigest()[:2]
        if not os.path.exists(dir):
            os.mkdir(dir)
        buffer = f.read()
        # pick an extension from the first word of the libmagic description
        ext = magic.from_buffer(buffer).split()[0].lower()
        if "html" in ext:
            ext = "html.txt"
        filename = dir + "/" + urlhost + "_" + hash.hexdigest() + "." + ext
        with open(filename, "wb") as local_file:
            local_file.write(buffer)
            # NOTE(review): close() inside `with` is redundant (harmless)
            local_file.close()
        # store status code + headers next to the body
        with open(filename + ".hdr.txt", "wb") as local_file:
            local_file.write(str(f.getcode()) + "\n" + str(f.info()))
            local_file.close()
    except HTTPError, e:
        print "HTTPError:", e.code, url
def get_document_file(self, document, document_url, post=False):
    """
    Loads the document file from the server and stores it into the
    document object given as a parameter. The form parameter is the
    mechanize Form to be submitted for downloading the document.
    The document parameter has to be an object of type model.document.Document.
    """
    # rate-limit scraping
    time.sleep(self.config.WAIT_TIME)
    logging.info("Getting document '%s'", document.identifier)
    document_backup = document
    logging.info("Getting document %s from %s", document.identifier, document_url)
    if post:
        document_file = self.get_url(document_url, post_data={'DOLFDNR': '55434', 'options': '64'})
    else:
        document_file = self.get_url(document_url)
    if not document_file:
        # Bug fix: format string was "file %" (bare percent, no conversion
        # type), which makes logging emit a formatting error instead of the URL.
        logging.error("Error downloading file %s", document_url)
        return document
    document.content = document_file.content
    # catch strange magic exception
    try:
        document.mimetype = magic.from_buffer(document.content, mime=True)
    except magic.MagicException:
        logging.warn("Warning: unknown magic error at document %s from %s", document.identifier, document_url)
        return document_backup
    document.filename = self.make_document_filename(document.identifier, document.mimetype)
    return document
def filetype(data):
    """Best-effort file type description: try the official file-source magic
    bindings first, then fall back to the pypi python-magic API."""
    try:
        cookie = magic.open(magic.MAGIC_NONE)
        cookie.load()
        return cookie.buffer(data)
    except:
        # any failure (missing API, load error) falls through to python-magic
        return magic.from_buffer(data)
def getImage(self, imageUrl, referrer):
    """Fetch an image URL (sending a Referer header) and return (filename, content).

    Raises ValueError when the page yields no content/handle.
    """
    content, handle = self.wg.getpage(imageUrl, returnMultiple=True, addlHeaders={'Referer': referrer})
    if not content or not handle:
        raise ValueError("Failed to retreive image from page '%s'!" % referrer)
    # derive a filename from the final (post-redirect) URL path
    fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
    fileN = bs4.UnicodeDammit(fileN).unicode_markup
    self.log.info("retreived image '%s' with a size of %0.3f K", fileN, len(content)/1000.0)
    if not "." in fileN:
        # No extension in the URL: derive one from the Content-Type header.
        info = handle.info()
        if 'Content-Type' in info:
            tp = info['Content-Type']
            if ";" in tp:
                tp = tp.split(";")[0]
            ext = guess_extension(tp)
            if ext == None:
                ext = "unknown_ftype"
            print(info['Content-Type'], ext)
            fileN += "." + ext
        else:
            fileN += ".jpg"
    # Let magic figure out the files for us (it's probably smarter then kissmanga, anyways.)
    guessed = magic.from_buffer(content, mime=True)
    # Bug fix: was guess_extension(tp) — `tp` is unbound whenever the filename
    # already contained a dot (NameError); use the libmagic-sniffed MIME type.
    ext = guess_extension(guessed)
    if ext:
        fileN = fileN + ext
    return fileN, content
def get_mime(file):
    """Given a file, returns mimetype and extension"""
    detected = magic.from_buffer(file.read(2048), mime=True)
    # second argument False: do not restrict to strict/official extensions
    return detected, guess_extension(detected, False)
def upload_media(request):
    """
    Upload a media file from multi-part HTTP file request.
    @see https://docs.djangoproject.com/fr/1.10/ref/files/uploads/#custom-upload-handlers
    """
    if not request.FILES:
        raise SuspiciousOperation(_("No file specified"))
    up = request.FILES['file']
    # check file size
    if up.size > localsettings.max_file_size:
        # Bug fix: the exception was instantiated but never raised, so
        # oversized uploads were silently accepted.
        raise SuspiciousOperation(_("Upload file size limit is set to %i bytes") % localsettings.max_file_size)
    # simple check mime-types using the file extension (can process a test using libmagic)
    guessed_mime_type = mimetypes.guess_type(up.name)[0]
    if guessed_mime_type is None:
        # Bug fix: same missing `raise` as above.
        raise SuspiciousOperation(_("Undetermined uploaded file type"))
    # validate the file name and update it in way to be multi OS compliant
    # remove any '.' before and after
    name = up.name.strip('.')
    valid_name = io.StringIO()
    # replace forbidden characters by '_'
    for c in name:
        if ord(c) < 32 or c in ('<', '>', '"', '|', '\\', '`', '*', '?', ':', '/'):
            c = '_'
        valid_name.write(c)
    media = Media()
    # generate two levels of path from the uuid node
    l1_path = '%02x' % (((media.uuid.node & 0xffffff000000) >> 24) % 256)
    l2_path = '%02x' % ((media.uuid.node & 0x000000ffffff) % 256)
    local_path = os.path.join(l1_path, l2_path)
    local_file_name = str(media.uuid)
    media.name = os.path.join(local_path, local_file_name)
    media.version = 1
    media.file_name = valid_name.getvalue()
    media.file_size = up.size
    # default owner is the user of the upload
    media.owner_content_type = ContentType.objects.get_by_natural_key("auth", "user")
    media.owner_object_id = request.user.pk
    # create the path if necessary
    abs_path = os.path.join(localsettings.storage_path, local_path)
    if not os.path.exists(abs_path):
        os.makedirs(abs_path, 0o770)
    abs_file_name = os.path.join(abs_path, local_file_name)
    # test mime-type with a buffer of a least 1024 bytes
    test_mime_buffer = io.BytesIO()
    # copy file content, teeing the first chunks into the sniff buffer;
    # `with` ensures the destination is closed even if a chunk read fails
    with open(abs_file_name, "wb") as dst_file:
        for chunk in up.chunks():
            dst_file.write(chunk)
            if test_mime_buffer.tell() < 1024:
                test_mime_buffer.write(chunk)
    guessed_mime_type = magic.from_buffer(test_mime_buffer.getvalue(), mime=True)
    # 0660 on file
    os.chmod(abs_file_name, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)
    media.mime_type = guessed_mime_type  # up.content_type
    # save the model once file is correctly saved
    media.save()
    result = {
        'id': media.id,
        'uuid': media.uuid,
        'name': media.name,
        'created_date': media.created_date,
        'modified_date': media.modified_date,
        'file_name': media.file_name,
        'file_size': media.file_size
    }
    return HttpResponseRest(request, result)
def update_upload_media(request, uuid):
    """
    Upload a media file from multi-part HTTP file request.

    Replaces the content of an existing Media identified by its uuid,
    after checking the caller's ownership/change permission.
    """
    if not request.FILES:
        raise SuspiciousOperation(_("No file specified"))
    up = request.FILES['file']
    # check file size
    if up.size > localsettings.max_file_size:
        # Bug fix: the exception was instantiated but never raised.
        raise SuspiciousOperation(_("Upload file size limit is set to %i bytes") % localsettings.max_file_size)
    # simple check mime-types using the file extension (can process a test using libmagic)
    guessed_mime_type = mimetypes.guess_type(up.name)[0]
    if guessed_mime_type is None:
        # Bug fix: same missing `raise` as above.
        raise SuspiciousOperation(_("Undetermined uploaded file type"))
    media = get_object_or_404(Media, uuid=uuid)
    # check user permission on the media
    if media.owner_content_type == "auth.user":
        if media.owner_object_id != request.user.pk:
            raise PermissionDenied(_('Your are not the owner of the media'))
    else:
        perms = get_permissions_for(request.user, media.owner_content_type.app_label, media.owner_content_type.model, media.owner_object_id)
        if '%s.change_%s' % (media.owner_content_type.app_label, media.owner_content_type.model) not in perms:
            raise PermissionDenied(_('No change permission to the owner entity'))
    version = media.version + 1
    abs_file_name = os.path.join(localsettings.storage_path, media.name)
    if not os.path.isfile(abs_file_name):
        # Bug fix: same missing `raise` as above.
        raise SuspiciousOperation(_("Trying to update a non-existing file"))
    # test mime-type with a buffer of a least 1024 bytes
    test_mime_buffer = io.BytesIO()
    # copy file content, teeing the first chunks into the sniff buffer
    with open(abs_file_name, "wb") as dst_file:
        for chunk in up.chunks():
            dst_file.write(chunk)
            if test_mime_buffer.tell() < 1024:
                test_mime_buffer.write(chunk)
    guessed_mime_type = magic.from_buffer(test_mime_buffer.getvalue(), mime=True)
    # 0660 on file
    os.chmod(abs_file_name, stat.S_IRUSR | stat.S_IWUSR | stat.S_IRGRP | stat.S_IWGRP)
    # upgrade the version number and file size
    media.version = version
    media.file_size = up.size
    media.mime_type = guessed_mime_type  # up.content_type
    # update the model once file is correctly saved
    media.save()
    result = {
        'id': media.id,
        'uuid': media.uuid,
        'version': media.version,
        # Bug fix: was `media.content_type` — the field written above is
        # `mime_type`, so serialization failed with AttributeError.
        'mime_type': media.mime_type,
        'file_size': media.file_size,
        'modified_date': media.modified_date
    }
    return HttpResponseRest(request, result)
item_text_data = item_text_data.text if not os.path.exists(directory): os.makedirs(directory) try: r_image = get(item_image_data['src'], allow_redirects=True) if r_image.status_code != 404: filename, file_extension = os.path.splitext( urlparse( item_image_data['src']).path.split('/')[2]) if file_extension == '': mime = from_buffer( r_image.iter_content(256).__next__(), mime=True) if mime == 'image/jpeg': file_extension = '.jpg' else: print('Mime no adicionado. Agregar') quit() open(directory + '/' + filename + file_extension, 'wb').write(r_image.content) filename = filename + file_extension else: filename = '' print(id_item)
def get_manga_data(self, initial_data):
    """
    Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's url (provided by search)
    """
    assert 'url' in initial_data, 'Manga url is missing in initial data'
    r = self.session_get(self.manga_url.format(initial_data['url']), headers={'user-agent': USER_AGENT})
    if r is None:
        return None
    # sniff only the first 128 bytes: enough for libmagic to rule out non-HTML
    mime_type = magic.from_buffer(r.content[:128], mime=True)
    if r.status_code != 200 or mime_type != 'text/html':
        return None
    # Get true URL after redirects
    split_url = urlsplit(r.url)
    url = '{0}?{1}'.format(split_url.path, split_url.query)
    soup = BeautifulSoup(r.text, 'html.parser')
    data = initial_data.copy()
    data.update(
        dict(
            url=url,
            authors=[],
            scanlators=[],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
            cover=None,
        ))
    # Details
    info_element = soup.find('div', class_='info')
    for element in info_element.find_all(class_='genre'):
        # drop the label <span> before reading the text
        if element.span:
            element.span.extract()
        data['genres'].append(element.text.strip())
    for element in info_element.find_all(class_='author'):
        if element.span:
            element.span.extract()
        if element.a:
            element.a.extract()
        data['authors'].append(element.text.strip())
    detail_element = soup.find('div', class_='detail_body')
    if 'challenge' in data['url']:
        # Challenge (Canvas)
        data['cover'] = soup.find('div', class_='detail_header').img.get('src')
    else:
        # Original: cover URL is embedded in the inline style attribute
        data['cover'] = detail_element.get('style').split(
            ' ')[1][4:-1].split('?')[0] + '?type=q90'
    # Status
    value = detail_element.find('p', class_='day_info').text.strip()
    if value.find('COMPLETED') >= 0:
        data['status'] = 'complete'
    elif value.find('UP') >= 0:
        data['status'] = 'ongoing'
    data['synopsis'] = detail_element.find('p', class_='summary').text.strip()
    # Chapters
    data['chapters'] = self.get_manga_chapters_data(data['url'])
    return data
def attachment_pre_save(sender, instance, **kwargs):
    """Signal handler: sniff and store the attachment's MIME type before saving."""
    if not instance.file:
        return
    # the first KiB is enough for libmagic detection
    head = instance.file.read(1024)
    instance.mimetype = magic.from_buffer(head, mime=True)
def process(self):
    """Run one sandbox analysis task.

    Fetches the sample from the incoming task, boots the analysis VM (up
    to 3 attempts), injects and executes the sample under DRAKVUF for the
    requested timeout, then uploads collected artifacts as a follow-up
    "analysis" task.

    Returns early (producing no task) when: the requested timeout exceeds
    the hard limit, the file name contains invalid characters, no start
    command can be generated, or all 3 VM attempts fail.
    """
    sample = self.current_task.get_resource("sample")
    self.log.info("hostname: {}".format(socket.gethostname()))
    sha256sum = hashlib.sha256(sample.content).hexdigest()
    # Human-readable libmagic description; used below to detect DLLs
    magic_output = magic.from_buffer(sample.content)
    self.log.info("running sample sha256: {}".format(sha256sum))

    # Default timeout: 10 minutes; hard cap: 20 minutes
    timeout = self.current_task.payload.get('timeout') or 60 * 10
    hard_time_limit = 60 * 20
    if timeout > hard_time_limit:
        self.log.error(
            "Tried to run the analysis for more than hard limit of %d seconds",
            hard_time_limit)
        return

    analysis_uid = self.current_task.uid
    override_uid = self.current_task.payload.get('override_uid')

    self.log.info(f"analysis UID: {analysis_uid}")

    if override_uid:
        analysis_uid = override_uid
        self.log.info(f"override UID: {override_uid}")
        self.log.info(
            "note that artifacts will be stored under this overriden identifier"
        )

    # Expose which instance handles this analysis (for VNC access)
    self.rs.set(f"drakvnc:{analysis_uid}", INSTANCE_ID, ex=3600)  # 1h

    workdir = '/tmp/drakrun/vm-{}'.format(int(INSTANCE_ID))

    extension = self.current_task.headers.get("extension", "exe").lower()
    # libmagic labels PE DLLs with "(DLL)" in its description
    if '(DLL)' in magic_output:
        extension = 'dll'
    self.log.info("Running file as %s", extension)

    file_name = self.current_task.payload.get("file_name",
                                              "malwar") + f".{extension}"
    # Alphanumeric, dot, underscore, dash
    if not re.match(r"^[a-zA-Z0-9\._\-]+$", file_name):
        self.log.error("Filename contains invalid characters")
        return
    self.log.info("Using file name %s", file_name)

    start_command = self.current_task.payload.get(
        "start_command", self._get_start_command(extension, sample))
    if not start_command:
        self.log.error(
            "Unable to run malware sample, could not generate any suitable command to run it."
        )
        return

    # Wipe any leftovers from a previous run of this VM slot
    try:
        shutil.rmtree(workdir)
    except Exception as e:
        print(e)

    outdir = os.path.join(workdir, 'output')
    os.makedirs(workdir, exist_ok=True)
    os.mkdir(outdir)
    os.mkdir(os.path.join(outdir, 'dumps'))

    metadata = {
        "sample_sha256": sha256sum,
        "magic_output": magic_output,
        "time_started": int(time.time())
    }

    with open(os.path.join(outdir, 'sample_sha256.txt'), 'w') as f:
        f.write(hashlib.sha256(sample.content).hexdigest())

    with open(os.path.join(workdir, file_name), 'wb') as f:
        f.write(sample.content)

    watcher_tcpdump = None
    watcher_dnsmasq = None

    # Retry the whole VM + injection sequence up to 3 times; the for/else
    # below fires only when every attempt failed (no break).
    for _ in range(3):
        try:
            self.log.info("running vm {}".format(INSTANCE_ID))
            watcher_dnsmasq = start_dnsmasq(
                INSTANCE_ID, self.config.config['drakrun'].get(
                    'dns_server', '8.8.8.8'))

            d_run.logging = self.log
            d_run.run_vm(INSTANCE_ID)

            watcher_tcpdump = start_tcpdump_collector(INSTANCE_ID, outdir)

            self.log.info("running monitor {}".format(INSTANCE_ID))

            kernel_profile = os.path.join(PROFILE_DIR, "kernel.json")
            runtime_profile = os.path.join(PROFILE_DIR, "runtime.json")
            with open(runtime_profile, 'r') as runtime_f:
                rp = json.loads(runtime_f.read())
                inject_pid = rp['inject_pid']
                kpgd = rp['vmi_offsets']['kpgd']

            hooks_list = os.path.join(ETC_DIR, "hooks.txt")
            dump_dir = os.path.join(outdir, "dumps")
            drakmon_log_fp = os.path.join(outdir, "drakmon.log")

            # Copy the sample onto the guest's desktop via DRAKVUF injector
            injector_cmd = [
                "injector", "-o", "json", "-d",
                "vm-{vm_id}".format(vm_id=INSTANCE_ID), "-r", kernel_profile,
                "-i", inject_pid, "-k", kpgd, "-m", "writefile", "-e",
                f"%USERPROFILE%\\Desktop\\{file_name}", "-B",
                os.path.join(workdir, file_name)
            ]

            self.log.info("Running injector...")
            injector = subprocess.Popen(injector_cmd, stdout=subprocess.PIPE)
            outs, errs = injector.communicate(b"", 20)

            if injector.returncode != 0:
                raise subprocess.CalledProcessError(
                    injector.returncode, injector_cmd)

            # Path of the sample as seen inside the guest
            injected_fn = json.loads(outs)['ProcessName']
            net_enable = int(self.config.config['drakrun'].get(
                'net_enable', '0'))

            if "%f" not in start_command:
                self.log.warning("No file name in start command")

            cwd = subprocess.list2cmdline([ntpath.dirname(injected_fn)])
            cur_start_command = start_command.replace("%f", injected_fn)

            # don't include our internal maintenance commands
            metadata['start_command'] = cur_start_command
            cur_start_command = f"cd {cwd} & " + cur_start_command

            if net_enable:
                cur_start_command = "ipconfig /renew & " + cur_start_command

            full_cmd = subprocess.list2cmdline(
                ["cmd.exe", "/C", cur_start_command])
            self.log.info("Using command: %s", full_cmd)

            drakvuf_cmd = [
                "drakvuf", "-o", "json",
                "-x", "poolmon",
                "-x", "objmon",
                "-x", "socketmon",
                "-j", "5",
                "-t", str(timeout),
                "-i", inject_pid,
                "-k", kpgd,
                "-d", "vm-{vm_id}".format(vm_id=INSTANCE_ID),
                "--dll-hooks-list", hooks_list,
                "--memdump-dir", dump_dir,
                "-r", kernel_profile,
                "-e", full_cmd
            ]

            drakvuf_cmd.extend(self.get_profile_list())

            syscall_filter = self.config.config['drakrun'].get(
                'syscall_filter', None)
            if syscall_filter:
                drakvuf_cmd.extend(["-S", syscall_filter])

            with open(drakmon_log_fp, "wb") as drakmon_log:
                drakvuf = subprocess.Popen(drakvuf_cmd, stdout=drakmon_log)

                try:
                    # Grace period of 60s on top of the in-guest timeout
                    exit_code = drakvuf.wait(timeout + 60)
                except subprocess.TimeoutExpired as e:
                    # Escalate: SIGTERM, then SIGKILL, then re-raise
                    logging.error(
                        "BUG: Monitor command doesn\'t terminate automatically after timeout expires."
                    )
                    logging.error("Trying to terminate DRAKVUF...")
                    drakvuf.terminate()
                    drakvuf.wait(10)
                    logging.error(
                        "BUG: Monitor command also doesn\'t terminate after sending SIGTERM."
                    )
                    drakvuf.kill()
                    drakvuf.wait()
                    logging.error("Monitor command was forcefully killed.")
                    raise e

                if exit_code != 0:
                    raise subprocess.CalledProcessError(
                        exit_code, drakvuf_cmd)
            break
        except subprocess.CalledProcessError:
            self.log.info(
                "Something went wrong with the VM {}".format(INSTANCE_ID),
                exc_info=True)
        finally:
            # Always tear the VM down between attempts
            try:
                subprocess.run(
                    ["xl", "destroy", "vm-{}".format(INSTANCE_ID)],
                    cwd=workdir,
                    check=True)
            except subprocess.CalledProcessError:
                self.log.info(
                    "Failed to destroy VM {}".format(INSTANCE_ID),
                    exc_info=True)

            if watcher_dnsmasq:
                watcher_dnsmasq.terminate()
    else:
        self.log.info("Failed to analyze sample after 3 retries, giving up.")
        return

    self.log.info("waiting for tcpdump to exit")

    if watcher_tcpdump:
        try:
            watcher_tcpdump.wait(timeout=60)
        except subprocess.TimeoutExpired:
            self.log.exception("tcpdump doesn't exit cleanly after 60s")

    self.crop_dumps(os.path.join(outdir, 'dumps'),
                    os.path.join(outdir, 'dumps.zip'))

    if os.path.exists("/opt/procdot/procmon2dot"):
        self.generate_graphs(outdir)

    self.slice_logs(outdir)
    self.log.info("uploading artifacts")

    metadata['time_finished'] = int(time.time())

    with open(os.path.join(outdir, 'metadata.json'), 'w') as f:
        f.write(json.dumps(metadata))

    payload = {"analysis_uid": analysis_uid}
    payload.update(metadata)

    # Emit a follow-up task carrying all artifacts plus the sample itself
    t = Task(
        {
            "type": "analysis",
            "kind": "drakrun",
            "quality": self.current_task.headers.get("quality", "high")
        },
        payload=payload)

    for resource in self.upload_artifacts(analysis_uid, workdir):
        t.add_payload(resource.name, resource)

    t.add_payload('sample', sample)
    self.send_task(t)
DIR = os.path.join(DIR, "Dota_2") if not os.path.exists(DIR): os.mkdir(DIR) filepath = search_keywords filepath = filepath.replace(prefix, "") filepath = filepath.split() filepath = "_".join(filepath) DIR = os.path.join(DIR, filepath) if not os.path.exists(DIR): os.mkdir(DIR) if len(items[k][1]) != 0: with open(os.path.join(DIR, str(k+1)+"." + items[k][1]),'wb') as output_file: try: data = response.read() try: type_i = magic.from_buffer(data) print type_i if type_i.find("image", 0, 30): output_file.write(data) print ("saved ====> " + str(k+1)) + " url: " + items[k][0] else: print "wrong data type" response.close(); except Exception: print("Probably Magic.from_buffer exception at image "+str(k)) except Exception: print("Probably httplib.IncompleteRead: IncompleteRead at image "+str(k)) k=k+1; except IOError:
def detect_content_type(f):
    """Sniff the MIME type of a seekable binary stream via libmagic.

    A 2 KiB head sample is read and the stream is rewound to the start,
    so the caller can re-read the file from the beginning afterwards.
    """
    head = f.read(2048)
    f.seek(0)
    detected = magic.from_buffer(head, mime=True)
    return detected
def check_in_memory_mime(in_memory_file):
    """Return the libmagic-detected MIME type of an in-memory uploaded file.

    Args:
        in_memory_file: a seekable file-like object (e.g. a Django
            ``InMemoryUploadedFile`` — assumed seekable; confirm for other
            callers).

    Returns:
        The MIME type string reported by libmagic.
    """
    mime = magic.from_buffer(in_memory_file.read(), mime=True)
    # Fix: read() consumed the whole stream and left the pointer at EOF,
    # so any later consumer (e.g. the storage backend saving the upload)
    # would see empty content. Rewind before returning.
    in_memory_file.seek(0)
    return mime
def save(request, biz_cc_id):
    """Create or edit an "app maker" mini-app.

    POST params:
        id: primary key — decides whether this is a create or an edit
        name: app name
        desc: short description
        template_id: template ID
        template_scheme_id: execution scheme ID
    """
    try:
        params = request.POST.dict()
        jsonschema.validate(params, APP_MAKER_PARAMS_SCHEMA)
    except jsonschema.ValidationError as e:
        logger.warning(u"APP_MAKER_PARAMS_SCHEMA raise error: %s" % e)
        message = _(u"参数格式错误:%s" % e)
        return JsonResponse({'result': False, 'message': message})

    logo_obj = request.FILES.get('logo')
    if logo_obj:
        valid_mime = {'image/png', 'image/jpg', 'image/jpeg'}
        # First pass: the client-declared Content-Type (cheap but spoofable)
        is_png_or_jpg = (logo_obj.content_type in valid_mime)
        if not is_png_or_jpg:
            return JsonResponse({
                'result': False,
                'message': _(u"请上传 jpg 或 png 格式的图片")
            })
        file_size = logo_obj.size
        # Logo size must be under 100 KB
        if file_size > 100 * 1024:
            message = _(u"LOGO 文件大小必须小于 100K")
            return JsonResponse({'result': False, 'message': message})
        logo_content = logo_obj.read()
        # Second pass: verify the real content with libmagic, since the
        # declared Content-Type above is fully client-controlled.
        real_mime = magic.from_buffer(logo_content, mime=True)
        if real_mime not in valid_mime:
            return JsonResponse({'result': False, 'message': _(u"图片格式非法")})
    else:
        logo_content = None

    params.update({
        'username': request.user.username,
        'logo_content': logo_content,
    })
    if settings.IS_LOCAL:
        params['link_prefix'] = '%s/appmaker/' % request.get_host()
        fake = True
    else:
        params['link_prefix'] = '%sappmaker/' % settings.APP_HOST
        fake = False
    result, data = AppMaker.objects.save_app_maker(biz_cc_id, params, fake)
    if not result:
        # On failure `data` carries the error message
        return JsonResponse({'result': False, 'message': data})
    data = {
        'id': data.id,
        'code': data.code,
        'logo_url': data.logo_url,
    }
    return JsonResponse({"result": True, "data": data})
async def _unlocked_transfer_file_to_matrix(client: MautrixTelegramClient, intent: IntentAPI,
                                            loc_id: str, location: TypeLocation,
                                            thumbnail: TypeThumbnail, is_sticker: bool,
                                            tgs_convert: Optional[dict], filename: Optional[str],
                                            encrypt: bool, parallel_id: Optional[int]
                                            ) -> Optional[DBTelegramFile]:
    """Transfer a Telegram file to Matrix media storage (caller holds the
    per-file lock).

    Checks the DB cache first; optionally converts TGS stickers and webp
    images, encrypts the payload when requested, uploads it, and records
    the result. Returns the DB row, or None when the download fails.
    """
    # Already transferred previously? Reuse the cached record.
    db_file = DBTelegramFile.get(loc_id)
    if db_file:
        return db_file

    if parallel_id and isinstance(location, Document) and (not is_sticker or not tgs_convert):
        # Large documents go through the chunked parallel transfer path
        db_file = await parallel_transfer_to_matrix(client, intent, loc_id, location, filename,
                                                    encrypt, parallel_id)
        mime_type = location.mime_type
        file = None
    else:
        try:
            file = await client.download_file(location)
        except (LocationInvalidError, FileIdInvalidError):
            return None
        except (AuthBytesInvalidError, AuthKeyInvalidError, SecurityError) as e:
            log.exception(f"{e.__class__.__name__} while downloading a file.")
            return None

        width, height = None, None
        mime_type = magic.from_buffer(file, mime=True)
        image_converted = False

        # A weird bug in alpine/magic makes it return application/octet-stream for gzips...
        if is_sticker and tgs_convert and (mime_type == "application/gzip" or (
                mime_type == "application/octet-stream"
                and magic.from_buffer(file).startswith("gzip"))):
            # TGS stickers are gzipped Lottie animations; convert to the
            # configured target format
            mime_type, file, width, height = await convert_tgs_to(
                file, tgs_convert["target"], **tgs_convert["args"])
            thumbnail = None
            # Still gzip after conversion means the conversion was a no-op
            image_converted = mime_type != "application/gzip"

        if mime_type == "image/webp":
            new_mime_type, file, width, height = convert_image(
                file, source_mime="image/webp", target_type="png",
                thumbnail_to=(256, 256) if is_sticker else None)
            image_converted = new_mime_type != mime_type
            mime_type = new_mime_type
            thumbnail = None

        decryption_info = None
        upload_mime_type = mime_type
        if encrypt and encrypt_attachment:
            # Encrypted uploads are stored opaque; real type stays in the DB
            file, decryption_info_dict = encrypt_attachment(file)
            decryption_info = EncryptedFile.deserialize(decryption_info_dict)
            upload_mime_type = "application/octet-stream"
        content_uri = await intent.upload_media(file, upload_mime_type)
        if decryption_info:
            decryption_info.url = content_uri

        db_file = DBTelegramFile(id=loc_id, mxc=content_uri,
                                 decryption_info=decryption_info,
                                 mime_type=mime_type,
                                 was_converted=image_converted,
                                 timestamp=int(time.time()), size=len(file),
                                 width=width, height=height)
        if thumbnail and (mime_type.startswith("video/") or mime_type == "image/gif"):
            if isinstance(thumbnail, (PhotoSize, PhotoCachedSize)):
                thumbnail = thumbnail.location
            try:
                db_file.thumbnail = await transfer_thumbnail_to_matrix(
                    client, intent, thumbnail, file, mime_type, encrypt)
            except FileIdInvalidError:
                log.warning(f"Failed to transfer thumbnail for {thumbnail!s}",
                            exc_info=True)

    try:
        db_file.insert()
    except (IntegrityError, InvalidRequestError) as e:
        # Duplicate insert from a concurrent transfer of the same file is
        # harmless — log and return the row we built anyway.
        log.exception(f"{e.__class__.__name__} while saving transferred file data. "
                      "This was probably caused by two simultaneous transfers of the same file, "
                      "and should not cause any problems.")
    return db_file
def getMime(data=None, mimestr=None):
    """Best-effort MIME type detection.

    Args:
        data: raw bytes to sniff with libmagic (used when `mimestr` absent).
        mimestr: a file extension such as "png", looked up via `mimetypes`.

    Returns:
        A MIME type string; falls back to 'text/plain' when the extension
        is unknown or neither argument is given.
    """
    if mimestr:
        # Fix: guess_type() returns (type, encoding) and type is None for
        # unknown extensions — the original leaked that None to callers.
        guessed = mimetypes.guess_type('file.{0}'.format(mimestr))[0]
        return guessed or 'text/plain'
    elif data:
        mime = magic.from_buffer(data, mime=True)
        # Fix: current python-magic returns str, older versions returned
        # bytes; decoding unconditionally crashed on modern versions.
        return mime.decode('utf-8') if isinstance(mime, bytes) else mime
    return 'text/plain'
def get_buffer_ext(buffer):
    """Return the MIME subtype of *buffer*'s content (e.g. "png").

    The buffer is rewound before and after sniffing so the caller's read
    position is left at the start of the stream.
    """
    buffer.seek(0)
    detected = magic.from_buffer(buffer.read(), mime=True)
    buffer.seek(0)
    # "image/png" -> "png"
    return detected.rsplit('/', 1)[-1]
def guess_mime_type(data):
    """
    Guess a MIME type from magic bytes in a data stream.

    Note: `from_buffer` without `mime=True` yields libmagic's textual
    file-type description rather than a `type/subtype` MIME string.
    """
    description = magic.from_buffer(data)
    return description
def get_manga_data(self, initial_data):
    """
    Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Manga slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    # Sniff the first 128 bytes to confirm the response really is HTML
    mime_type = magic.from_buffer(r.content[:128], mime=True)
    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'html.parser')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            scanlators=[],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
            cover=None,
        ))

    data['name'] = soup.find('h2', class_='widget-title').text.strip()
    data['cover'] = self.cover_url.format(data['slug'])

    # Details: the info list is a <dl> of alternating <dt> label /
    # <dd> value pairs — remember the label, act on the value.
    elements = soup.find(
        'dl', class_='dl-horizontal').findChildren(recursive=False)
    for element in elements:
        if element.name not in ('dt', 'dd'):
            continue
        if element.name == 'dt':
            label = element.text
            continue
        value = element.text.strip()

        if label.startswith('Author') or label.startswith('Artist'):
            for t in value.split(','):
                t = t.strip()
                if t not in data['authors']:
                    data['authors'].append(t)
        elif label.startswith('Categories'):
            data['genres'] = [t.strip() for t in value.split(',')]
        elif label.startswith('Status'):
            status = value.lower()
            # Only the two values the UI understands are kept
            if status in ('ongoing', 'complete'):
                data['status'] = status

    data['synopsis'] = soup.find('div', class_='well').p.text.strip()
    alert_element = soup.find('div', class_='alert-danger')
    if alert_element:
        # Append any site warning banner to the synopsis
        data['synopsis'] += '\n\n' + alert_element.text.strip()

    # Chapters (listed newest-first on the page, stored oldest-first)
    elements = soup.find('ul', class_='chapters').find_all('li',
                                                           recursive=False)
    for element in reversed(elements):
        h5 = element.h5
        if not h5:
            continue

        slug = h5.a.get('href').split('/')[-1]
        title = '{0}: {1}'.format(h5.a.text.strip(), h5.em.text.strip())
        date = element.div.div

        data['chapters'].append(
            dict(slug=slug,
                 date=convert_date_string(date.text.strip(),
                                          format='%d %b. %Y'),
                 title=title))

    return data
def get_manga_data(self, initial_data):
    """
    Returns manga data by scraping manga HTML page content

    Initial data should contain at least manga's slug (provided by search)
    """
    assert 'slug' in initial_data, 'Manga slug is missing in initial data'

    r = self.session_get(self.manga_url.format(initial_data['slug']))
    if r is None:
        return None

    # Sniff the first 128 bytes to confirm the response really is HTML
    mime_type = magic.from_buffer(r.content[:128], mime=True)
    if r.status_code != 200 or mime_type != 'text/html':
        return None

    soup = BeautifulSoup(r.text, 'html.parser')

    data = initial_data.copy()
    data.update(
        dict(
            authors=[],
            scanlators=[],
            genres=[],
            status=None,
            synopsis=None,
            chapters=[],
            server_id=self.id,
        ))

    # The title element differs between page layouts; try both
    title_element = soup.find('h1', class_='manga-bg__title')
    if title_element is None:
        title_element = soup.find('h1', class_='manga__title')
    data['name'] = title_element.text.strip()
    if data.get('cover') is None:
        data['cover'] = self.cover_url.format(data['slug'])

    # Details (French-language site: labels are Auteur/Artiste/Scantrad/...)
    elements = soup.find(
        'div', class_='manga-info').find_all(class_='info-list__row')
    for element in elements:
        label = element.strong.text.strip()

        if label.startswith('Auteur') or label.startswith('Artiste'):
            value = element.a.text.strip()
            for t in value.split(','):
                t = t.strip()
                if t not in data['authors']:
                    data['authors'].append(t)
        elif label.startswith('Scantrad'):
            a_element = element.find_all('a')[0]
            # Scanlator names are rendered as "[name]" — strip the brackets
            data['scanlators'] = [
                a_element.text.replace('[', '').replace(']', '').strip(),
            ]
        elif label.startswith('Genres'):
            a_elements = element.find_all('a')
            data['genres'] = [
                a_element.text.strip() for a_element in a_elements
            ]
        elif label.startswith('Statut'):
            status = element.span.text.strip().lower()
            if status == 'en cours':
                data['status'] = 'ongoing'
            elif status == 'terminé':
                data['status'] = 'complete'

    # Synopsis
    data['synopsis'] = soup.find('div',
                                 class_='info-desc__content').text.strip()

    # Chapters (listed newest-first on the page, stored oldest-first)
    elements = soup.find('div', class_='chapters-list').find_all(
        'div', class_='chapter-item')
    for element in reversed(elements):
        a_element = element.find('div', class_='chapter-item__name').a
        slug = a_element.get('href').split('/')[-1]
        title = a_element.text.strip()
        date = element.find('div',
                            class_='chapter-item__date').text.strip()

        data['chapters'].append(
            dict(
                slug=slug,
                title=title,
                date=convert_date_string(date, format='%d.%m.%Y'),
            ))

    return data
def is_text_file(file):
    """Heuristically decide whether *file* (raw bytes) holds text.

    Sniffs only the first 1 KiB; libmagic descriptions containing
    "text" or "empty" count as text.
    """
    description = magic.from_buffer(file[:1024])
    return ("text" in description) or ("empty" in description)
def download_from_url(self, url):
    """ Download a package from an url

    Returns:
        A ``(local_path, mime_type)`` tuple, or ``(None, None)`` on HTTP
        error, disallowed mime type, or download failure.
    """
    # Fix: initialize up front — previously an exception raised before
    # these were assigned made the final return crash with a NameError.
    downloaded_file = None
    mime = None
    try:
        self.log.info("Start downloading {0}".format(url))
        # create an empty temporary file
        downloaded_file = tempfile.NamedTemporaryFile(delete=False).name

        # process the download
        with open(downloaded_file, "wb") as f:
            response = requests.get(url, stream=STREAM)
            total_length = response.headers.get('content-length')

            # check the http response code
            if response.status_code != 200:
                self.log.error(
                    "Error while downloading the package : HTTP {0}".
                    format(response.status_code))
                return None, None

            # check the mime type from the first bytes.
            # Fix: use the builtin next() — the py2-only .next() method
            # crashed on Python 3.
            peek = next(response.iter_content(256))
            mime = magic.from_buffer(peek, mime=True)
            if mime not in ALLOWED_MIMES:
                self.log.error(
                    "The package downloaded has not a compliant mime type : {0}. The mime type should be one of these : {1}"
                    .format(mime, ALLOWED_MIMES))
                return None, None

            # download
            if STREAM:
                # Fix: the peeked chunk was consumed from the wire and was
                # never written, so streamed downloads were missing their
                # first 256 bytes.
                f.write(peek)
                for data in response.iter_content(chunk_size=1024):
                    if data:
                        f.write(data)
                        f.flush()
                os.fsync(f)
            else:
                # Non-streaming: response.content is fully cached and
                # still includes the peeked bytes.
                f.write(response.content)
        # total_length is kept only for logging/diagnostic purposes
        del total_length
    except Exception:
        self.log.error("Error while downloading the package : {0}".format(
            traceback.format_exc()))
        return None, None

    self.log.info("Download finished")
    return downloaded_file, mime
def get_content_type(scan):
    """Sniff the MIME type of the *scan* stream via libmagic.

    Rewinds before and after reading, so the caller's position is always
    left at the start of the stream.
    """
    scan.seek(0)
    detected = force_text(magic.from_buffer(scan.read(1024), mime=True))
    scan.seek(0)
    return detected
def clean_avatar(self):
    """Validate the uploaded avatar: MIME type, file extension, size and
    per-user avatar quota.

    Raises forms.ValidationError on any violation.
    """
    data = self.cleaned_data['avatar']

    if settings.AVATAR_ALLOWED_MIMETYPES:
        # python-magic is an optional dependency, only needed when content
        # sniffing is enabled
        try:
            import magic
        except ImportError:
            raise ImportError("python-magic library must be installed in "
                              "order to use uploaded file content "
                              "limitation")

        # Construct 256 bytes needed for mime validation
        # NOTE(review): on Python 3, data.chunks() yields bytes, so the
        # "" + chunk concatenation below assumes Python 2 — confirm.
        magic_buffer = ""
        for chunk in data.chunks():
            magic_buffer += chunk
            if len(magic_buffer) >= 256:
                break

        # https://github.com/ahupp/python-magic#usage
        mime = magic.from_buffer(magic_buffer, mime=True)
        if mime not in settings.AVATAR_ALLOWED_MIMETYPES:
            err = _("File content is invalid. Detected: %(mimetype)s "
                    "Allowed content types are: %(valid_mime_list)s")
            conf = {
                'valid_mime_list':
                ", ".join(settings.AVATAR_ALLOWED_MIMETYPES),
                'mimetype': mime
            }
            raise forms.ValidationError(err % conf)

    if settings.AVATAR_ALLOWED_FILE_EXTS:
        root, ext = os.path.splitext(data.name.lower())
        if ext not in settings.AVATAR_ALLOWED_FILE_EXTS:
            valid_exts = ", ".join(settings.AVATAR_ALLOWED_FILE_EXTS)
            error = _("%(ext)s is an invalid file extension. "
                      "Authorized extensions are : %(valid_exts_list)s")
            raise forms.ValidationError(error % {
                'ext': ext,
                'valid_exts_list': valid_exts
            })

    if data.size > settings.AVATAR_MAX_SIZE:
        error = _("Your file is too big (%(size)s), "
                  "the maximum allowed size is %(max_valid_size)s")
        raise forms.ValidationError(
            error % {
                'size': filesizeformat(data.size),
                'max_valid_size': filesizeformat(settings.AVATAR_MAX_SIZE)
            })

    count = Avatar.objects.filter(user=self.user).count()
    if (settings.AVATAR_MAX_AVATARS_PER_USER > 1
            and count >= settings.AVATAR_MAX_AVATARS_PER_USER):
        error = _("You already have %(nb_avatars)d avatars, "
                  "and the maximum allowed is %(nb_max_avatars)d.")
        raise forms.ValidationError(
            error % {
                'nb_avatars': count,
                'nb_max_avatars': settings.AVATAR_MAX_AVATARS_PER_USER,
            })
    # NOTE(review): Django clean_<field> methods conventionally return the
    # cleaned value; this returns None — confirm callers rely on that.
    return
def file_mime(self, jfile):
    """Return True when libmagic classifies *jfile*'s content as
    'text/plain', False otherwise.

    Note: consumes the stream — the caller is left at EOF.
    """
    detected = magic.from_buffer(jfile.read(), mime=True)
    return detected == 'text/plain'
# CLI helper: upload a local file to the JSON-RPC API on behalf of a user.
import sys
import base64

from jsonrpc.proxy import ServiceProxy
from django.core.serializers import serialize
import magic, json

if __name__ == '__main__':
    # Expect exactly two arguments: the user's primary key and a file path.
    if len(sys.argv) != 3:
        print('Usage {} <user_pk> <filename>'.format(sys.argv[0]))
        sys.exit(1)

    user_pk, filename = sys.argv[1], sys.argv[2]

    rpc_server = ServiceProxy('http://localhost:8000/api/')

    with open(filename, 'rb') as fh:
        raw = fh.read()

    # Package the file as JSON: sniffed MIME type plus base64-encoded body.
    payload = {
        'filename': filename,
        'mime_type': magic.from_buffer(raw, mime=True),
        'content': base64.b64encode(raw).decode('utf-8'),
    }
    rpc_server.api.upload_file(user_pk, json.dumps(payload))
def mime(self) -> str:
    """MIME type of this object, sniffed from its leading bytes.

    ``self.read(count=261)`` yields byte chunks which are joined before
    sniffing; 261 bytes presumably covers the longest supported file
    signature — confirm against the reader's contract.
    """
    head = b''.join(self.read(count=261))
    return magic.from_buffer(head, mime=True)
async def transfer_thumbnail_to_matrix(
    client: MautrixTelegramClient,
    intent: IntentAPI,
    thumbnail_loc: TypeLocation,
    mime_type: str,
    encrypt: bool,
    video: bytes | None,
    custom_data: bytes | None = None,
    width: int | None = None,
    height: int | None = None,
    async_upload: bool = False,
) -> DBTelegramFile | None:
    """Transfer a file thumbnail to Matrix media storage.

    The thumbnail content comes from, in order of preference: the caller's
    pre-rendered `custom_data`, a frame extracted from `video`, or a direct
    Telegram download of `thumbnail_loc`. Returns the cached or newly
    inserted DB record, or None when thumbnailing isn't possible.
    """
    # Pillow/moviepy are optional dependencies; no thumbnails without them
    if not Image or not VideoFileClip:
        return None

    loc_id = _location_to_id(thumbnail_loc)
    if not loc_id:
        return None

    if custom_data:
        # Distinguish caller-provided thumbnails from Telegram-sourced ones
        # in the cache key
        loc_id += "-mau_custom_thumbnail"

    # Already transferred previously? Reuse the cached record.
    db_file = await DBTelegramFile.get(loc_id)
    if db_file:
        return db_file

    video_ext = sane_mimetypes.guess_extension(mime_type)
    if custom_data:
        file = custom_data
    elif VideoFileClip and video_ext and video:
        try:
            file, width, height = _read_video_thumbnail(video, video_ext,
                                                        frame_ext="png")
        except OSError:
            return None
        mime_type = "image/png"
    else:
        file = await client.download_file(thumbnail_loc)
        width, height = None, None
        mime_type = magic.from_buffer(file, mime=True)

    decryption_info = None
    upload_mime_type = mime_type
    if encrypt:
        # Encrypted uploads are stored opaque; real type stays in the DB
        file, decryption_info = encrypt_attachment(file)
        upload_mime_type = "application/octet-stream"
    content_uri = await intent.upload_media(file, upload_mime_type,
                                            async_upload=async_upload)
    if decryption_info:
        decryption_info.url = content_uri

    db_file = DBTelegramFile(
        id=loc_id,
        mxc=content_uri,
        mime_type=mime_type,
        was_converted=False,
        timestamp=int(time.time()),
        size=len(file),
        width=width,
        height=height,
        decryption_info=decryption_info,
    )
    try:
        await db_file.insert()
    except (UniqueViolationError, IntegrityError) as e:
        # Duplicate insert from a concurrent transfer — log and carry on
        log.exception(
            f"{e.__class__.__name__} while saving transferred file thumbnail data. "
            "This was probably caused by two simultaneous transfers of the same file, "
            "and might (but probably won't) cause problems with thumbnails or something."
        )
    return db_file
def _process_downloads(cls, properties, sheets):
    """Validate and persist an attached download delivered as a data URI.

    Parses the data URI in properties[cls.download_property]['href'],
    cross-checks the declared MIME type against libmagic and against the
    file extension, enforces the schema's allowed types, records image
    dimensions, stores the payload as a Blob, and rewrites 'href' to a
    @@download path.

    Returns the (possibly copied) ``(properties, sheets)`` pair.
    Raises ValidationFailure on any inconsistency.
    """
    prop_name = cls.download_property
    attachment = properties.get(prop_name, {})
    href = attachment.get('href', None)
    if href is not None:
        if not href.startswith('data:'):
            msg = "Expected data URI."
            raise ValidationFailure('body', [prop_name, 'href'], msg)

        # Copy before mutating: the incoming dicts belong to the caller
        properties = properties.copy()
        properties[prop_name] = attachment = attachment.copy()

        if sheets is None:
            sheets = {}
        else:
            sheets = sheets.copy()
        sheets['downloads'] = downloads = {}
        download_meta = downloads[prop_name] = {}

        try:
            mime_type_declared, charset, data = parse_data_uri(href)
        except (ValueError, TypeError):
            msg = 'Could not parse data URI.'
            raise ValidationFailure('body', [prop_name, 'href'], msg)
        if charset is not None:
            download_meta['charset'] = charset

        # Make sure the mimetype appears to be what the client says it is
        # NOTE(review): .decode('utf-8') assumes an older python-magic that
        # returns bytes; current releases return str — confirm the pinned
        # version.
        mime_type_detected = magic.from_buffer(data, mime=True).decode('utf-8')
        if mime_type_declared and not mimetypes_are_equal(
                mime_type_declared, mime_type_detected):
            msg = "Incorrect file type. (Appears to be %s)" % mime_type_detected
            raise ValidationFailure('body', [prop_name, 'href'], msg)
        mime_type = mime_type_declared or mime_type_detected
        attachment['type'] = mime_type
        if mime_type is not None:
            download_meta['type'] = mime_type

        # Make sure mimetype is not disallowed
        try:
            allowed_types = cls.schema['properties'][prop_name][
                'properties']['type']['enum']
        except KeyError:
            # Schema declares no enum: all types allowed
            pass
        else:
            if mime_type not in allowed_types:
                raise ValidationFailure('body', [prop_name, 'href'],
                                        'Mimetype is not allowed.')

        # Make sure the file extensions matches the mimetype
        download_meta['download'] = filename = attachment['download']
        mime_type_from_filename, _ = mimetypes.guess_type(filename)
        if not mimetypes_are_equal(mime_type, mime_type_from_filename):
            raise ValidationFailure(
                'body', [prop_name, 'href'],
                'Wrong file extension for %s mimetype.' % mime_type)

        # Validate images and store height/width
        major, minor = mime_type.split('/')
        if major == 'image' and minor in ('png', 'jpeg', 'gif', 'tiff'):
            stream = BytesIO(data)
            im = Image.open(stream)
            im.verify()
            attachment['width'], attachment['height'] = im.size

        # Persist the raw payload and point 'href' at the download view
        blob_id = uuid4()
        download_meta['blob_id'] = str(blob_id)
        session = DBSession()
        blob = Blob(blob_id=blob_id, data=data)
        session.add(blob)
        attachment['href'] = '@@download/%s/%s' % (prop_name, quote(filename))

    return properties, sheets
async def _unlocked_transfer_file_to_matrix(
    client: MautrixTelegramClient,
    intent: IntentAPI,
    loc_id: str,
    location: TypeLocation,
    thumbnail: TypeThumbnail,
    is_sticker: bool,
    tgs_convert: dict | None,
    filename: str | None,
    encrypt: bool,
    parallel_id: int | None,
    async_upload: bool = False,
) -> DBTelegramFile | None:
    """Transfer a Telegram file to Matrix media storage (caller holds the
    per-file lock).

    Checks the DB cache first; optionally converts TGS stickers, encrypts
    the payload when requested, uploads it (plus a thumbnail for videos /
    GIFs / converted animations), and records the result. Returns the DB
    row, or None when the download fails.
    """
    # Already transferred previously? Reuse the cached record.
    db_file = await DBTelegramFile.get(loc_id)
    if db_file:
        return db_file

    converted_anim = None

    if parallel_id and isinstance(location, Document) and (not is_sticker
                                                           or not tgs_convert):
        # Large documents go through the chunked parallel transfer path
        db_file = await parallel_transfer_to_matrix(client, intent, loc_id,
                                                    location, filename,
                                                    encrypt, parallel_id)
        mime_type = location.mime_type
        file = None
    else:
        try:
            file = await client.download_file(location)
        except (LocationInvalidError, FileIdInvalidError):
            return None
        except (AuthBytesInvalidError, AuthKeyInvalidError, SecurityError) as e:
            log.exception(f"{e.__class__.__name__} while downloading a file.")
            return None

        width, height = None, None
        mime_type = magic.from_buffer(file, mime=True)
        image_converted = False

        # A weird bug in alpine/magic makes it return application/octet-stream for gzips...
        is_tgs = mime_type == "application/gzip" or (
            mime_type == "application/octet-stream"
            and magic.from_buffer(file).startswith("gzip"))
        if is_sticker and tgs_convert and is_tgs:
            # TGS stickers are gzipped Lottie animations; convert to the
            # configured target format
            converted_anim = await convert_tgs_to(file, tgs_convert["target"],
                                                  **tgs_convert["args"])
            mime_type = converted_anim.mime
            file = converted_anim.data
            width, height = converted_anim.width, converted_anim.height
            # Still gzip after conversion means the conversion was a no-op
            image_converted = mime_type != "application/gzip"
            thumbnail = None

        decryption_info = None
        upload_mime_type = mime_type
        if encrypt and encrypt_attachment:
            # Encrypted uploads are stored opaque; real type stays in the DB
            file, decryption_info = encrypt_attachment(file)
            upload_mime_type = "application/octet-stream"
        content_uri = await intent.upload_media(file, upload_mime_type,
                                                async_upload=async_upload)
        if decryption_info:
            decryption_info.url = content_uri

        db_file = DBTelegramFile(
            id=loc_id,
            mxc=content_uri,
            decryption_info=decryption_info,
            mime_type=mime_type,
            was_converted=image_converted,
            timestamp=int(time.time()),
            size=len(file),
            width=width,
            height=height,
        )
        if thumbnail and (mime_type.startswith("video/")
                          or mime_type == "image/gif"):
            if isinstance(thumbnail, (PhotoSize, PhotoCachedSize)):
                thumbnail = thumbnail.location
            try:
                db_file.thumbnail = await transfer_thumbnail_to_matrix(
                    client,
                    intent,
                    thumbnail,
                    video=file,
                    mime_type=mime_type,
                    encrypt=encrypt,
                    async_upload=async_upload,
                )
            except FileIdInvalidError:
                log.warning(f"Failed to transfer thumbnail for {thumbnail!s}",
                            exc_info=True)
        elif converted_anim and converted_anim.thumbnail_data:
            # Use the thumbnail produced by the TGS conversion itself
            db_file.thumbnail = await transfer_thumbnail_to_matrix(
                client,
                intent,
                location,
                video=None,
                encrypt=encrypt,
                custom_data=converted_anim.thumbnail_data,
                mime_type=converted_anim.thumbnail_mime,
                width=converted_anim.width,
                height=converted_anim.height,
                async_upload=async_upload,
            )

    try:
        await db_file.insert()
    except (UniqueViolationError, IntegrityError) as e:
        # Duplicate insert from a concurrent transfer of the same file is
        # harmless — log and return the row we built anyway.
        log.exception(
            f"{e.__class__.__name__} while saving transferred file data. "
            "This was probably caused by two simultaneous transfers of the same file, "
            "and should not cause any problems.")
    return db_file
# coding: utf-8
# Tiny demo: sniff the MIME type of a local GIF with python-magic.
import magic

with open("test.gif", mode="rb") as gif:
    header = gif.read(512)
    print(magic.from_buffer(header, mime=True))
def load_known_pii(known_pii_locations: List[str],
                   storage_connection_string: Optional[str] = None
                   ) -> List[KnownFilthItem]:
    """This function loads tagged filth from a csv and transforms it into a dict that the detector can use"""
    start_time = time.time()
    click.echo("Loading Known Filth...")
    # Imported lazily: pandas is only needed for this benchmark helper
    import pandas as pd

    # This will be a list of records containing all the info from the loaded tagged pii files
    known_pii = []  # type: List[Dict[str, Any]]
    logger = logging.getLogger(
        'scrubadub.tests.benchmark_accuracy_real_data.load_known_pii')

    # These are the column names that we want
    target_cols = {'match', 'filth_type'}
    # These are some optional column names that we will use to filter extra columns out
    target_cols_optional = {'match_end', 'limit', 'ignore_case',
                            'ignore_whitespace',
                            'ignore_partial_word_matches'}
    # This is an alternate set of column names that are also accepted instead of the ones listed in `target_cols`
    target_cols_alt = {'pii_type', 'pii_start', 'pii_end'}

    # We loop over all tagged PII files
    for known_pii_location in known_pii_locations:
        file_data = load_files(
            known_pii_location,
            storage_connection_string=storage_connection_string)
        # Loop over the results from the load_files function, could be more than one file if we provide a directory
        # in `known_pii_location`
        for file_name, data in file_data.items():
            # Pick the pandas reader based on the sniffed content type:
            # xlsx files go to read_excel, everything else is treated as CSV
            mime_type = magic.from_buffer(data, mime=True)
            pandas_reader = pd.read_csv
            if mime_type == "application/vnd.openxmlformats-officedocument.spreadsheetml.sheet":
                pandas_reader = pd.read_excel
            else:
                data = decode_text(
                    {file_name: data},
                    allowed_mime_types=['application/csv']
                )[file_name].encode('utf-8')

            dataframe = None  # type: Optional[DataFrame]
            # Work out how many rows to skip in this loop, starting at zero going up to 9
            for n_rows_to_skip in range(10):
                dataframe = pandas_reader(
                    io.BytesIO(data),
                    skiprows=n_rows_to_skip,
                    dtype={
                        'match': str,
                        'match_end': str,
                        'filth_type': str,
                        'pii_start': str,
                        'pii_end': str,
                        'pii_type': str,
                    }).rename(columns=lambda x: x.strip())
                # If we find the `target_cols` then we found the correct number of rows to skip so we break from
                # this loop
                if (set(dataframe.columns.to_list()) & target_cols) == target_cols:
                    break
                # if we find the `target_cols_alt`, we convert those to the standard set of names and then break
                elif (set(dataframe.columns.to_list()) & target_cols_alt) == target_cols_alt:
                    dataframe = dataframe.rename(
                        columns={
                            'pii_type': 'filth_type',
                            'pii_start': 'match',
                            'pii_end': 'match_end',
                        }
                    )
                    # Normalize alternate filth-type spellings to the
                    # canonical detector names
                    dataframe = dataframe.replace({
                        "filth_type": {
                            "organisation": "organization",
                            "card-number": "credit_card",
                            "dob": "date_of_birth",
                            "driverslicence": "drivers_licence",
                            "postcode": "postalcode",
                            "licenceplate": "vehicle_licence_plate",
                        }
                    })
                    break
                dataframe = None

            # We weren't able to find the correct columns so raise an error
            if dataframe is None:
                raise ValueError(
                    f'Unable to read file: {known_pii_location} Are the file format (csv or xslx) and '
                    f'columns (match, match_end, filth_type and optionally limit) correct?')

            # strip() the main columns
            for col in ['match', 'match_end', 'filth_type']:
                dataframe[col] = dataframe[col].str.strip()

            # drop rows if the column 'match' has null values
            if pd.isnull(dataframe['match']).sum() > 0:
                dataframe = dataframe.dropna(axis='index', subset=['match'])
                logger.warning(
                    f"The KnownFilth column 'match' contains some null/blank entries in '{file_name}'. "
                    f"Skipping these rows."
                )

            # drop rows if the column 'filth_type' has null values
            if pd.isnull(dataframe['filth_type']).sum() > 0:
                dataframe = dataframe.dropna(axis='index',
                                             subset=['filth_type'])
                logger.warning(
                    f"The KnownFilth column 'filth_type' contains some null/blank entries in '{file_name}'. "
                    f"Skipping these rows."
                )

            # Convert the dataframe to a dict in records format and add it to the big list of tagged pii
            known_pii += dataframe[
                [col for col in dataframe.columns
                 if col in (target_cols | target_cols_optional)]
            ].to_dict(orient='records')

    # Loop over each of the tagged pieces of pii
    for item in known_pii:
        for sub_item in ('limit', 'match_end', 'ignore_case',
                         'ignore_whitespace',
                         'ignore_partial_word_matches'):
            # if each of the above keys exist, delete it if its empty
            if sub_item in item.keys():
                if pd.isnull(item[sub_item]):
                    del item[sub_item]
                elif isinstance(item[sub_item], str) and len(item[sub_item].strip()) == 0:
                    del item[sub_item]
                elif 'ignore' in sub_item:
                    # if ignore is in the name of the item, then try to convert it to a bool
                    item[sub_item] = convert_to_bool(item[sub_item])
            if 'ignore' in sub_item and sub_item not in item:
                # if an ignore flag is not set then default it to true
                item[sub_item] = True

    end_time = time.time()
    click.echo("Loaded Known Filth in {:.2f}s".format(end_time-start_time))

    return known_pii
def add_file(
    app_context,
    url,
    original_url=None,
    key=None,
    filename=None,
    *args,
    **kwargs,
):
    """Adds files to s3.

    Downloads the file at ``url`` (unless it is already on S3 / public),
    keys it by a hash of its content, verifies the MIME type against the
    restricted list, and uploads it (or refreshes its metadata).

    Args:
        app_context: Original app context should be passed here if running
            in separate thread
        url: source location of the file.
        original_url: pre-existing original URL, if any.
        key: existing object key, used as a filename fallback.
        filename: preferred stored filename.

    Returns:
        A dict with "key", "filename", "url" (and "original_url" when the
        source was an external http URL).

    Raises:
        UnsupportedFileError: when the detected MIME type is restricted.
    """
    with app_context.app.app_context():
        is_s3_or_public_url = current_s3_instance.is_s3_url_with_bucket_prefix(
            url) or current_s3_instance.is_public_url(url)
        # Fast path: file already lives on S3 and metadata refresh is off
        if is_s3_or_public_url and not current_app.config.get(
                "UPDATE_S3_FILES_METADATA", False):
            result = {}
            if key not in url:
                # Derive the key from the URL's last path segment
                filename = filename or key
                key = url.split("/")[-1]
            result.update({"key": key, "filename": filename})
            if current_s3_instance.is_s3_url(url):
                url = current_s3_instance.get_public_url(key)
            result.update({"url": url})
            LOGGER.info(
                "File already on S3 - Skipping",
                url=url,
                key=key,
                thread=threading.get_ident(),
            )
            return result

        file_data = download_file_from_url(url)
        # Content-addressed key: identical content maps to the same object
        new_key = hash_data(file_data)
        mimetype = magic.from_buffer(file_data, mime=True)
        file_data = BytesIO(file_data)
        filename = filename or key
        if not filename:
            filename = new_key
        if mimetype in current_app.config.get(
                "FILES_RESTRICTED_MIMETYPES"):
            LOGGER.error(
                "Unsupported file type - Aborting",
                key=key,
                mimetype=mimetype,
                thread=threading.get_ident(),
            )
            raise UnsupportedFileError(mimetype)
        acl = current_app.config["S3_FILE_ACL"]
        if current_s3_instance.file_exists(new_key):
            # Same content already stored: only refresh its metadata
            LOGGER.info(
                "Replacing file metadata",
                key=new_key,
                thread=threading.get_ident(),
            )
            current_s3_instance.replace_file_metadata(
                new_key, filename, mimetype, acl)
        else:
            LOGGER.info(
                "Uploading file to s3",
                key=new_key,
                thread=threading.get_ident(),
            )
            current_s3_instance.upload_file(file_data, new_key, filename,
                                            mimetype, acl)
        result = {
            "key": new_key,
            "filename": filename,
            "url": current_s3_instance.get_public_url(new_key),
        }
        # Remember where an external (non-S3) file originally came from
        if (url.startswith("http")
                and not current_s3_instance.is_s3_url(url)
                and not current_s3_instance.is_public_url(url)
                and not original_url):
            result["original_url"] = url
        return result