import mimetypes

import mutagen.easymp4
import mutagen.easyid3
import mutagen.flac


def extract_data(tag):
    if type(tag) is mutagen.easymp4.EasyMP4:
        if 'cover' in tag:
            picture = tag['cover']
            exts = {picture.FORMAT_JPEG: '.jpeg', picture.FORMAT_PNG: '.png'}
            return ((picture, exts[picture.imageformat]), clean_keys(tag))
        else:
            return (None, clean_keys(tag))
    if type(tag) is mutagen.easyid3.EasyID3:
        if 'cover' in tag:
            picture = tag['cover']
            ext = mimetypes.guess_extension(picture.mime)
            return ((picture.data, ext), clean_keys(tag))
        else:
            return (None, clean_keys(tag))
    if type(tag) is mutagen.flac.FLAC:
        if tag.pictures:
            picture = tag.pictures[0]
            ext = mimetypes.guess_extension(picture.mime)
            return ((picture.data, ext), clean_keys(tag))
        else:
            return (None, clean_keys(tag))
def getImage(self, imageUrl, referrer):
    content, handle = self.wg.getpage(imageUrl, returnMultiple=True,
                                      addlHeaders={'Referer': referrer})
    if not content or not handle:
        raise ValueError("Failed to retrieve image from page '%s'!" % referrer)

    fileN = urllib.parse.unquote(urllib.parse.urlparse(handle.geturl())[2].split("/")[-1])
    fileN = bs4.UnicodeDammit(fileN).unicode_markup
    self.log.info("Retrieved image '%s' with a size of %0.3f K", fileN, len(content) / 1000.0)

    if "." not in fileN:
        info = handle.info()
        if 'Content-Type' in info:
            tp = info['Content-Type']
            if ";" in tp:
                tp = tp.split(";")[0]
            ext = guess_extension(tp)
            if ext is None:
                ext = "unknown_ftype"
            fileN += "." + ext
        else:
            fileN += ".jpg"

    # Let magic figure out the file type for us (it's probably smarter than kissmanga, anyway.)
    guessed = magic.from_buffer(content, mime=True)
    ext = guess_extension(guessed)  # use the magic-detected mimetype, not the header value
    if ext:
        fileN = fileN + ext
    return fileN, content
def create_url_filename(url_str, content_type):
    # See also: http://stackoverflow.com/a/7406369/1391325
    split_url = urlsplit(url_str)
    netloc = split_url[1]
    netloc_dirname = os.path.sep.join(reversed(netloc.split('.')))
    path = split_url[2]
    stripped_url_str = "".join((netloc_dirname, path))
    url_without_ext, existing_ext = os.path.splitext(stripped_url_str)
    filename_without_ext = url_without_ext.translate(URL_FILENAME_TRANSLATION_TABLE)
    if filename_without_ext.endswith(os.path.sep):
        filename_without_ext = filename_without_ext[:-len(os.path.sep)]

    if existing_ext:
        acceptable_filename_exts = mimetypes.guess_all_extensions(content_type)
        if existing_ext in acceptable_filename_exts:
            # Re-concatenate the now-normalized filename base with the original extension
            result = filename_without_ext + existing_ext
        else:
            canonical_ext = mimetypes.guess_extension(content_type)
            if canonical_ext:
                # If a canonical extension was found for the given content type,
                # concatenate it to the now-normalized filename base
                result = filename_without_ext + canonical_ext
            else:
                # If no canonical extension was found, re-concatenate the original
                # extension after normalizing it
                normalized_existing_ext = normalize_url_component(existing_ext, ".")
                result = filename_without_ext + normalized_existing_ext
    else:
        # Concatenate the canonical extension for the given content type to the result
        # filename in order to avoid potential clashes with other URLs
        canonical_ext = mimetypes.guess_extension(content_type)
        if canonical_ext:
            result = filename_without_ext + canonical_ext
        else:
            # Just add some extension
            result = filename_without_ext + DEFAULT_OUTPATH_SUFFIX
    return result
def post(self):
    # Check if the image was uploaded as multipart/form-data
    if self.multipart_form_data():
        file_data = self.request.files['media'][0]
        body = file_data['body']
        # Retrieve filename from the 'filename' field
        filename = file_data['filename']
    else:
        body = self.request.body
        # Retrieve filename from the 'Slug' header
        filename = self.request.headers.get('Slug')

    # Check if the uploaded image is valid
    if self.validate(body):
        # Use the default filename for the uploaded images
        if not filename:
            content_type = self.request.headers.get('Content-Type',
                                                    BaseEngine.get_mimetype(body))
            extension = mimetypes.guess_extension(content_type.split(';', 1)[0], False)
            if extension is None:
                # Content-Type is unknown, try with the body
                extension = mimetypes.guess_extension(BaseEngine.get_mimetype(body), False)
            if extension == '.jpe':
                # Hack because mimetypes returns '.jpe' by default for JPEG
                extension = '.jpg'
            if extension is None:
                # Even the body is unknown; use an empty string for concatenation
                extension = ''
            filename = self.context.config.UPLOAD_DEFAULT_FILENAME + extension

        # Build image id based on a random uuid (32 characters)
        id = str(uuid.uuid4().hex)
        self.write_file(id, body)
        self.set_status(201)
        self.set_header('Location', self.location(id, filename))
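The '.jpe' workaround above reflects a stdlib quirk: mimetypes.guess_extension() returns the first matching entry in its internal table, which on many Python versions is '.jpe' for image/jpeg. A minimal standalone sketch of the same normalization (the helper name is ours, not from the handler above):

import mimetypes

def normalized_extension(content_type, default=''):
    # Strip any parameters ("image/jpeg; charset=...") before guessing.
    ext = mimetypes.guess_extension(content_type.split(';', 1)[0], strict=False)
    if ext == '.jpe':  # the stdlib's historical pick for image/jpeg
        ext = '.jpg'
    return ext or default

print(normalized_extension('image/jpeg'))                                # typically '.jpg'
print(normalized_extension('application/x-not-a-type', default='.bin'))  # '.bin'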
def get_file_name_mime(self, url):
    pgctnt, hName, mime = self.wg.getFileNameMime(url)
    parsed = urllib.parse.urlparse(url)
    pathname = os.path.split(parsed.path)[-1]

    if not hName and not mime and not pathname:
        self.log.error("cannot figure out content type for url: %s", url)
        return pgctnt, "unknown.unknown", "application/octet-stream"

    # An empty path with a mimetype of text/html generally means it's a directory
    # index (or some horrible dynamic shit).
    if not hName and not pathname and mime == "text/html":
        self.log.info("No path and root location. Assuming index.html")
        return pgctnt, "index.html", "text/html"

    # mimetypes.guess_type() returns (type, encoding)
    guessed_mime, encoding = mimetypes.guess_type(hName)
    if guessed_mime:
        return pgctnt, hName, mime if mime else guessed_mime

    guessed_mime, encoding = mimetypes.guess_type(pathname)
    if guessed_mime:
        return pgctnt, pathname, mime if mime else guessed_mime

    chunks = [hName, pathname]
    chunks = [chunk for chunk in chunks if chunk]
    outname = " - ".join(chunks)

    if mime and mimetypes.guess_extension(mime):
        newext = mimetypes.guess_extension(mime)
    else:
        newext = ".unknown"

    if not outname:
        outname = "unknown"
    return pgctnt, outname + newext, mime if mime else "application/octet-stream"
def start_recording(name, queue):
    """ Starts stream recording """
    date_format = CONFIG.get(__SETTINGS_SECTION, "date_format")
    jar = cookielib.CookieJar()
    opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(jar))
    while True:
        schedule = queue.get()
        log.info("Start recording for %s" % schedule)
        end_time = datetime.strptime(schedule['end'], date_format)
        shutil.move(schedule['file'], INPROGRESS_DIR)
        schedule['file'] = os.path.join(INPROGRESS_DIR, schedule['file_name'])
        response = opener.open(schedule['url'])
        extension = ".mpeg"
        # guess_extension() may return None; fall back to the default in that case
        if 'Content-type' in response.info():
            extension = mimetypes.guess_extension(response.info()['Content-type']) or ".mpeg"
        elif 'mime' in schedule:
            extension = mimetypes.guess_extension(schedule['mime']) or ".mpeg"
        f = open(os.path.join(DOWNLOADS_DIR, schedule['name'] + extension), "a+")
        try:
            while end_time >= datetime.now() and os.path.exists(schedule['file']):
                f.write(response.read(__BUFFER))
                f.flush()
            shutil.move(schedule['file'], COMPLETED_DIR)
            log.info("Recording of %s is done" % schedule)
        except Exception:
            log.exception("Failed to record %s. Will try to restart." % schedule)
            queue.put(schedule, True, __RETRY_INTERVAL)
        finally:
            f.close()
            response.close()
def execute(self, transform_manager):
    filename, headers = retrieve(url=self.url,
                                 user=transform_manager.owner,
                                 username=self.username,
                                 password=self.password,
                                 user_agent=self.user_agent)
    try:
        if headers.get('error'):
            raise TransformException("Failed to download %s" % self.url)
        if not filename:
            raise TransformException(headers.get('message'))

        content_type = headers.get('content-type', 'unknown/unknown')
        content_type = content_type.split(';')[0].strip()

        extension = self.extension \
            or self.mimetype_overrides.get(content_type) \
            or (mimetypes.guess_extension(content_type) or '').lstrip('.') \
            or (mimetypes.guess_extension(content_type, strict=False) or '').lstrip('.') \
            or 'unknown'

        logger.debug("Response had content-type %r; assigning extension %r"
                     % (content_type, extension))

        with open(transform_manager(extension, self.name), 'w') as output:
            transform_manager.start(self, [input], type='identity')
            with open(filename, 'r') as f:
                shutil.copyfileobj(f, output)
            logger.info("File from %r saved to %r" % (self.url, output.name))
            return output.name
    finally:
        if headers['delete-after']:
            os.unlink(filename)
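The extension lookup above is a five-step fallback: explicit setting, per-type override table, strict stdlib guess, non-strict stdlib guess, then a literal 'unknown'. A standalone sketch of the same cascade, with a hypothetical override table:

import mimetypes

MIMETYPE_OVERRIDES = {'application/x-ndjson': 'ndjson'}  # hypothetical table

def pick_extension(content_type, explicit=None):
    return (explicit
            or MIMETYPE_OVERRIDES.get(content_type)
            or (mimetypes.guess_extension(content_type) or '').lstrip('.')
            or (mimetypes.guess_extension(content_type, strict=False) or '').lstrip('.')
            or 'unknown')

print(pick_extension('text/html'))     # 'html'
print(pick_extension('no/such-type'))  # 'unknown'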
def write_html(client, xml_string, filename):
    tree = etree.fromstring(strip_ns(xml_string))
    elements_to_download = (tree.xpath('//img[@src]') +
                            tree.xpath('//object[starts-with(@data, "https://")]'))
    if not elements_to_download:
        return serialise_html(xml_string, filename)

    serialise_html(xml_string, filename)
    filename_base = os.path.splitext(filename)[0]
    for external in elements_to_download:
        part_id = id_generator()
        if external.tag == 'img':
            if not os.path.exists(filename_base + '_images/'):
                os.makedirs(filename_base + '_images/')
            data = client.do_request(external.get('data-fullres-src'), raw=True)
            outfile = os.path.join(
                filename_base + '_images/',
                part_id + mimetypes.guess_extension(external.get('data-src-type')))
            write_image(data, outfile)
            data = client.do_request(external.get('src'), raw=True)
            encoded = base64.b64encode(data)
            external.set('src', 'data:' + external.get('data-src-type') + ';base64,' + encoded)
        if external.tag == 'object':
            if not os.path.exists(filename_base + '_attachments/'):
                os.makedirs(filename_base + '_attachments/')
            extension = mimetypes.guess_extension(external.get('type'))
            if external.get('type') == 'application/vnd.ms-excel':
                extension = ''
            outfile = os.path.join(
                filename_base + '_attachments/',
                external.get('data-attachment') + extension)
            data = client.do_request(external.get('data'), raw=True)
            write_image(data, outfile)
            external.set('data', 'file://' + os.path.abspath(outfile))
    ET.ElementTree(tree).write(filename, method="html")
def _guess_destination(self, torrent_files):
    """
    Try to identify the correct category of the finished torrent and return
    the destination path where the torrent has to be moved.
    """
    download_path = self.config["download_path"]
    for file in torrent_files:
        try:
            ext = os.path.splitext(file["path"])[1]
            ext = ext.lower()
            # types_map maps an extension to its MIME type; an unknown
            # extension raises KeyError, handled below.
            res = mt.types_map[ext]
            if res in GREY_LIST:
                log.debug("skipping GREY_LIST extension %s", res)
                continue
            if res.startswith("audio"):
                return [os.path.join(download_path, self.config["sub_audio"]), "audio"]
            elif res.startswith("video"):
                return [os.path.join(download_path, self.config["sub_video"]), "video"]
            elif ext in DOC_FORMAT:
                return [os.path.join(download_path, self.config["sub_documents"]), "doc"]
            elif ext in DATA_FORMAT:
                return [os.path.join(download_path, self.config["sub_data"]), "data"]
        except KeyError:
            log.debug("unknown extension %s, trying again", ext)
            continue
    return [os.path.join(download_path, self.config["sub_uncat"]), "uncategorized"]
def unpack_mail(msg, only_headers=False, exclude_headers=True):
    # TODO: headers, msg_text, msg_html, attachments
    msg_text = ""
    msg_html = ""
    if not msg.is_multipart():
        msg_payload = msg.get_payload(decode=True)
        msg_payload = decode_text(msg_payload)
        if msg.get_content_type() == 'text/html':
            msg_html = msg_payload
        else:
            # text/plain, or other?
            msg_text = msg_payload
        return msg_text, msg_html, []

    attachments = []
    counter = 1
    for part in msg.walk():
        # multipart/* are just containers
        if part.get_content_maintype() == 'multipart':
            continue
        is_multipart = part.is_multipart()
        filename = part.get_filename()
        filename = decode_mail_header(filename)
        content_type = part.get_content_type()
        if is_multipart or filename:
            # an attachment
            if not filename:
                # maybe not possible
                ext = mimetypes.guess_extension(content_type)
                if not ext:
                    ext = '.bin'
                filename = 'part-%03d%s' % (counter, ext)
            attachments.append({
                "data": part.get_payload(),
                "filename": filename,
                "content_type": content_type,
                "is_multipart": is_multipart,
            })
        else:
            part_payload = part.get_payload(decode=True)
            part_payload = decode_text(part_payload)
            if content_type == 'text/plain':
                msg_text = part_payload
            elif content_type == 'text/html':
                msg_html = part_payload
            else:
                # maybe not possible
                ext = mimetypes.guess_extension(content_type)
                if not ext:
                    ext = '.bin'
                filename = 'part-%03d%s' % (counter, ext)
                attachments.append({
                    "data": part.get_payload(),
                    "filename": filename,
                    "content_type": content_type,
                    "is_multipart": is_multipart,
                })
        counter += 1
    return msg_text, msg_html, attachments
def fetch(self, fetch_info, target_path, progress):
    """
    Fetch a file.
    """
    response = requests.get(fetch_info['url'], stream=True)
    response.raise_for_status()

    mimetype = fetch_info['mimetype'] or response.headers.get('content-type')
    encoding = response.headers.get('content-encoding')
    archive_type = (mimetype, encoding)

    # If the source has an overridden type, we use that instead.
    extension = None
    if fetch_info['mimetype']:
        extension = mimetypes.guess_extension(fetch_info['mimetype'])
    if not extension:
        extension = mimetypes.guess_extension(mimetype)
    if not extension:
        LOGGER.debug('No extension registered for this mimetype (%s). '
                     'Guessing one from the URL...', mimetype)
        extension = os.path.splitext(urlparse.urlparse(fetch_info['url']).path)[1]
    if extension and extension.startswith('.'):
        extension = extension[1:]

    content_disposition = parse_requests_response(response)
    filename = content_disposition.filename_sanitized(extension=extension,
                                                      default_filename='archive')
    content_length = response.headers.get('content-length')
    if content_length is not None:
        content_length = int(content_length)

    archive_path = os.path.join(target_path, filename)
    progress.on_start(target=os.path.basename(archive_path), size=content_length)
    with open(archive_path, 'wb') as target_file:
        current_size = 0
        for buf in response.iter_content(1024):
            if buf:
                target_file.write(buf)
                current_size += len(buf)
                progress.on_update(progress=current_size)
    progress.on_finish()
    return {
        'archive_path': archive_path,
        'archive_type': archive_type,
    }
def _find_store_dir(file_path):
    mime = magic.from_file(file_path, mime=True)
    store_dir = None
    if mime in ['application/octet-stream', 'text/plain'] or mimetypes.guess_extension(mime) is None:
        # Generic or unknown MIME type: fall back to the file's own extension.
        store_dir = os.path.splitext(file_path)[1]
    else:
        store_dir = mimetypes.guess_extension(mime)
    return store_dir.lstrip(".").lower()
def download(self, resource):
    if resource in Main.completedResources:
        # Check if they're using the global id
        with open(str(resource) + mimetypes.guess_extension(Main.completedResources[resource][2]), "wb") as output:
            output.write(Main.completedResources[resource][1])
    else:
        # Loop through to check if they're using the local id
        for key in Main.completedResources:
            completedResource = Main.completedResources[key]
            if completedResource[3] == resource:
                with open(str(resource) + mimetypes.guess_extension(completedResource[2]), "wb") as output:
                    output.write(completedResource[1])
def _oooConvertByFormat(self, printout, content_type, extra_context, REQUEST,
                        format, batch_mode):
    """Convert the ODF document into the given format.

    Keyword arguments:
    printout -- ODF document
    content_type -- the content type of the printout
    extra_context -- extra_context including a format
    REQUEST -- Request object
    format -- requested output format
    batch_mode -- Disable headers overriding
    """
    if REQUEST is not None and not format:
        format = REQUEST.get("format", None)
    filename = self.getProperty("filename")

    # Call refresh through cloudooo
    # XXX This is a temporary implementation:
    # Calling a webservice must be done through a WebServiceMethod
    # and a WebServiceConnection
    from Products.ERP5OOo.Document.OOoDocument import OOoServerProxy, enc, dec
    server_proxy = OOoServerProxy(self)
    extension = guess_extension(content_type).strip(".")
    printout = dec(server_proxy.convertFile(
        enc(printout),
        extension,  # source_format
        extension,  # destination_format
        False,      # zip
        True,       # refresh
    ))
    # End of temporary implementation

    if not format:
        if REQUEST is not None and not batch_mode:
            REQUEST.RESPONSE.setHeader("Content-Length", len(printout))
            REQUEST.RESPONSE.setHeader("Content-Type", "%s" % content_type)
            REQUEST.RESPONSE.setHeader(
                "Content-disposition",
                'inline;filename="%s%s"' % (filename, guess_extension(content_type) or ""))
        return printout

    from Products.ERP5Type.Document import newTempOOoDocument
    tmp_ooo = newTempOOoDocument(self, self.title_or_id())
    tmp_ooo.edit(
        data=printout,
        base_data=printout,
        filename=self.title_or_id(),
        content_type=content_type,
        base_content_type=content_type,
    )
    mime, data = tmp_ooo.convert(format)
    if REQUEST is not None and not batch_mode:
        REQUEST.RESPONSE.setHeader("Content-Length", len(data))
        REQUEST.RESPONSE.setHeader("Content-type", mime)
        REQUEST.RESPONSE.setHeader("Content-disposition",
                                   'attachment;filename="%s.%s"' % (filename, format))
    return str(data)
def guess_extension(response):
    "Return an extension based on the Content-Type header in the response"
    if not response:
        return None
    ct = response.info().get('content-type')
    if ct:
        mimetype = ct.split(';')[0]
        ext = mimetypes.guess_extension(mimetype)
        if ext:
            return ext
    # otherwise try based on URL
    mimetype, _ = mimetypes.guess_type(response.geturl())
    if mimetype:
        ext = mimetypes.guess_extension(mimetype)
        return ext
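A usage sketch for the helper above, assuming a urllib-style response object (the URL is a placeholder, and network access is assumed):

from urllib.request import urlopen

response = urlopen('https://example.com/download')  # hypothetical URL
print(guess_extension(response) or '.bin')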
def receive_pop():
    temp = []
    M = poplib.POP3('pop.sina.com')
    M.user('*****@*****.**')
    M.pass_(getpass.getpass())
    numMessages = len(M.list()[1])
    for i in range(numMessages):
        for j in M.retr(i + 1)[1]:
            temp.append(j)
    body = "\n".join(temp)
    begin = re.search("Content", body)
    print(body[begin.start():])
    msg = email.message_from_string(body[begin.start():])
    counter = 1
    for part in msg.walk():
        # multipart/* are just containers
        if part.get_content_maintype() == 'multipart':
            continue
        # Applications should really sanitize the given filename so that an
        # email message can't be used to overwrite important files
        filename = part.get_filename()
        if not filename:
            ext = mimetypes.guess_extension(part.get_content_type())
            if not ext:
                # Use a generic bag-of-bits extension
                ext = '.bin'
            filename = 'part-%03d%s' % (counter, ext)
        counter += 1
        fp = open(os.path.join(".", filename), 'wb')
        fp.write(part.get_payload(decode=True))
        fp.close()
def __indexCycleProcess(self):
    self.__run = 1
    while self.__run:
        self.__lockUrls.acquire()
        if len(self.__urls) > 0:
            url, search_query, weight = self.__urls.pop()
        else:
            url = None
        self.__lockUrls.release()
        if url:
            tmp_filename = tempfile.mktemp(
                "", maay.globalvars.config.getValue("TemporaryDocumentRoot") + os.path.sep)
            fd = file(tmp_filename, "wb")
            infos = self.__fetchURL(url, fd)
            if infos:
                mime_type, last_modified, content_size, document_id = infos
            else:
                continue
            newname = document_id + (mimetypes.guess_extension(mime_type) or ".txt")
            absolute_newname = "%s%s%s" % (
                maay.globalvars.config.getValue("CachedDocumentRoot"),
                os.path.sep, newname)
            if os.path.exists(absolute_newname):
                os.remove(absolute_newname)
            maay.globalvars.logger.debug("rename %s => %s" % (tmp_filename, absolute_newname))
            os.rename(tmp_filename, absolute_newname)
            maay.globalvars.logger.debug("done => %s" % absolute_newname)
            maay.globalvars.indexer.addNewDocumentToIndex(
                absolute_newname, mime_type, last_modified, url,
                search_query=search_query, weight=weight)
        else:
            time.sleep(2)
def addPicture(self, filename, mediatype=None, content=None):
    """ Add a picture

    It uses the same convention as OOo, in that it saves the picture in
    the zipfile in the subdirectory 'Pictures'.
    If passed a file pointer, mediatype must be set.

    @param filename unicode string: name of a file for Pictures
    @param mediatype unicode string: name of a media, None by default
    @param content bytes: content of media, None by default
    @return a unicode string: the file name of the media, possibly created
        on the fly
    """
    if content is None:
        if mediatype is None:
            mediatype, encoding = mimetypes.guess_type(filename)
        if mediatype is None:
            mediatype = u''
            try:
                ext = filename[filename.rindex(u'.'):]
            except ValueError:
                ext = u''
        else:
            ext = mimetypes.guess_extension(mediatype)
        manifestfn = u"Pictures/%s%s" % (uuid.uuid4().hex.upper(), ext)
        self.Pictures[manifestfn] = (IS_FILENAME, filename, mediatype)
        content = b""   # this value is only used by the assert below
        filename = u""  # this value is only used by the assert below
    else:
        manifestfn = filename
        self.Pictures[manifestfn] = (IS_IMAGE, content, mediatype)
    assert(type(filename) == type(u""))
    assert(type(content) == type(b""))
    return manifestfn
def get_result(self, path, original_name=False, default_ext='.bin',
               delete_msg=True, get_file=True):
    q = self.get_queue(self.output_queue_name)
    m = q.read()
    if m:
        if get_file:
            outputs = m['OutputKey'].split(',')
            for output in outputs:
                key_name, type = output.split(';')
                mime_type = type.split('=')[1]
                if original_name:
                    file_name = m.get('OriginalFileName', key_name)
                    file_name, ext = os.path.splitext(file_name)
                    ext = mimetypes.guess_extension(mime_type)
                    if not ext:
                        ext = default_ext
                    file_name = file_name + ext
                else:
                    file_name = key_name
                bucket = self.get_bucket(m['Bucket'])
                key = bucket.lookup(key_name)
                print 'retrieving file: %s' % file_name
                key.get_contents_to_filename(os.path.join(path, file_name))
        if delete_msg:
            q.delete_message(m)
    return m
def upload_image_url(request):
    if request.method != 'POST':
        return HttpResponse(status=403)
    image_url = request.POST.get('image_url', None)
    source_domain = request.POST.get('source_domain', None)
    headers = {
        'User-Agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.11 (KHTML, like Gecko) Chrome/23.0.1271.64 Safari/537.11',
        # 'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        # 'Accept-Charset': 'ISO-8859-1,utf-8;q=0.7,*;q=0.3',
        # 'Accept-Encoding': 'none',
        # 'Accept-Language': 'en-US,en;q=0.8',
        # 'Connection': 'keep-alive',
        'referer': source_domain,
    }
    ext = mimetypes.guess_extension(mimetypes.guess_type(image_url)[0])
    req = urllib2.Request(image_url, headers=headers)
    img_temp = NamedTemporaryFile(delete=True)
    img_temp.write(urllib2.urlopen(req).read())
    img_temp.flush()

    post_photo = Post_photo()
    post_photo.photo.save('%s%s' % (uuid.uuid4(), ext), File(img_temp))
    post_photo.save()

    res = {
        'link': post_photo.photo.url,
    }
    return JsonResponse(res, safe=False)
def get_extension(content):
    """A handful of workarounds for getting extensions we can trust."""
    file_str = magic.from_buffer(content)
    if file_str.startswith('Composite Document File V2 Document'):
        # Workaround for issue with libmagic1==5.09-2 in Ubuntu 12.04. Fixed
        # in libmagic 5.11-2.
        mime = 'application/msword'
    elif file_str == '(Corel/WP)':
        mime = 'application/vnd.wordperfect'
    elif file_str == 'C source, ASCII text':
        mime = 'text/plain'
    else:
        # No workaround necessary
        mime = magic.from_buffer(content, mime=True)
    extension = mimetypes.guess_extension(mime)
    if extension == '.obj':
        # It could be a wpd, if it's not a PDF
        if 'PDF' in content[0:40]:
            # Does 'PDF' appear in the beginning of the content?
            extension = '.pdf'
        else:
            extension = '.wpd'
    if extension == '.wsdl':
        # It's probably an HTML file, like those from Resource.org
        extension = '.html'
    if extension == '.ksh':
        extension = '.txt'
    if extension == '.asf':
        extension = '.wma'
    return extension
def func_image_svg_plus_xml(request, cf):
    __argument = request.GET.copy()
    try:
        convert = mimetypes.guess_extension(
            __argument.get("force_mimetype", "").strip()).split(".")[1]
    except:
        convert = None

    if convert and convert in ("png",):
        try:
            s = svg.SVG(cf)
            output = s.render(
                outputtype=convert,
                width=__argument.get("width"),
                height=__argument.get("height"),
            )
            tmp = func_image(request, output)
            return tmp
        except:
            return cf
    return cf
def addNewVideo(request):
    import mimetypes
    data = {}
    try:
        if request.FILES:
            _file = request.FILES['video']
            visible = request.POST.get('visible')
            _name = _file.name
            _type, enc = mimetypes.guess_type(_name)
            extns = mimetypes.guess_extension(_type)
            _dir = 'videos/'
            uploaded = f**k.util.UploadVideo(request, path='f**k/media/' + _dir)
            if uploaded is not None:
                data['token'] = uploaded.out_file_name[0]
                info = myutil.video_info(data['token'])
                thumb = info.get('thumb')
                size = info.get('size')
                dur = info.get('dur')
                success = m.Videos.objects.create(
                    user=request.user, title=_name, token=data['token'],
                    type=_type, encoding=enc, visibility=visible,
                    thumbnail=thumb, duration=dur, resolution=size)
                if success is not None:
                    data['success'] = True
                    data['message'] = 'uploaded successfully'
                    data['html'] = get_template('ajax/video/render_video.html').render(
                        Context({'video': success}))
                else:
                    raise Exception('Something went wrong, cannot upload this file!')
        else:
            data['error'] = True
            data['message'] = "File not found!"
    except Exception, ex:
        traceback.print_exc(file=sys.stdout)
        data['error'] = True
        data['message'] = 'Internal Error'
        return HttpResponse("Exception %s" % ex.message)
def clean(self):
    data = self.cleaned_data
    if data.get("scheduled_datetime"):
        sched_dt = data["scheduled_datetime"]
        sched_tz = timezone.pytz.timezone(data.get("scheduled_tz"))
        sched_dt = sched_tz.localize(sched_dt.replace(tzinfo=None))
        data["scheduled_datetime"] = timezone.localtime(sched_dt)
    if data.get("attached_media") and data.get("media_url"):
        raise forms.ValidationError(_("Only one of media URL or "
                                      "attached media may be provided"))
    if data.get("media_url"):
        response = requests.get(data["media_url"])
        if not response.ok:
            raise forms.ValidationError(_("An error occurred while "
                                          "downloading the media from the URL"))
        ext = mimetypes.guess_extension(response.headers['content-type'])
        ff = tempfile.NamedTemporaryFile(suffix=ext)
        ff.write(response.content)
        img_file = ImageFile(ff, name=ff.name)
        height, width = get_image_dimensions(img_file)
        if height is None or width is None:
            ff.close()
            raise forms.ValidationError(_("Invalid image"))
        data["attached_media"] = img_file
    return data
def __init__(self, transmogrifier, name, options, previous):
    self.previous = previous
    self.logger = logging.getLogger(
        options.get("name", transmogrifier.configuration_id + "." + name))
    self.key = defaultMatcher(options, "url-key", name, "url")
    self.cachekey = Expression(
        options.get("cache-key", "string:_cache"), transmogrifier, name, options)
    self.headerskey = Expression(
        options.get("headers-key", "string:_headers"), transmogrifier, name, options)
    self.headersext = options.get(
        "headers-extension", mimetypes.guess_extension("message/rfc822"))
    self.cachedir = resolvePackageReferenceOrFile(
        options.get("cache-directory",
                    os.path.join(os.environ.get("PWD", os.getcwd()),
                                 "var/urlopener.cache.d")))
    if not os.path.isdir(self.cachedir):
        os.makedirs(self.cachedir)
    self.defaultpagename = options.get(
        "default-page-name", ".{}.cache".format(options["blueprint"]))
    handlers = Expression(
        options.get("handlers", "python:[]"), transmogrifier, name, options)(options)
    if "ignore-error" in options:
        self.ignore_error = Expression(
            options["ignore-error"], transmogrifier, name, options)
        self.ignore_handler = HTTPDefaultErrorHandler()
        self.ignore_handler.section = self
        handlers.append(self.ignore_handler)
    if not [handler for handler in handlers
            if isinstance(handler, urllib2.HTTPRedirectHandler)]:
        handlers.append(HTTPRedirectHandler())
    self.opener = urllib2.build_opener(*handlers)
def check_content_id(msg):
    """Check message part for Content-Id key.

    The use of Content-ID in mail messages seems to mostly be related to
    inline images. Since this key is case insensitive, we have to loop
    through all kinds of keys to get there :(

    The content-id URI seems... pretty loose.
    <https://tools.ietf.org/html/rfc2392>
    """
    for k in msg.keys():
        if k.lower() != 'content-id':
            continue
        content_id = msg[k]
        if content_id.startswith('<'):
            content_id = content_id[1:]
        if content_id.endswith('>'):
            content_id = content_id[:-1]
        # guess_extension() may return None for unknown content types
        ext = mimetypes.guess_extension(msg.get_content_type()) or ''
        fn = os.path.join(TMPDIR, '%s%s' % (str(uuid.uuid4()), ext))
        CONTENTS.append({'id': content_id, 'filename': fn})
        with open(fn, 'wb') as f:
            f.write(msg.get_payload(decode=True))
def _save(self, name, content):
    name = self._clean_name(name)
    headers = self.headers
    content_type = mimetypes.guess_type(name)[0] or "application/x-octet-stream"
    if self.gzip and content_type in self.gzip_content_types:
        content = self._compress_content(content)
        headers.update({'Content-Encoding': 'gzip'})
    headers.update({
        'Content-Type': content_type,
        'Content-Length': len(content),
    })
    newname = _compute_hash(content)
    extension = mimetypes.guess_extension(content_type)
    if extension:
        newname = newname + extension
    content.name = newname
    k = self.bucket.get_key(newname)
    if not k:
        k = self.bucket.new_key(newname)
        k.set_metadata('original_filename', name)
        k.set_contents_from_file(content, headers=headers, policy=self.acl)
    return newname
def download(self, url):
    r = requests.get(url, stream=True)
    for chunk in r.iter_content(chunk_size=1024):
        self.f.write(chunk)
        self.f.flush()
    if r.status_code == 404:
        return False
    parsed_url = urlparse(url)
    self.filename = list(reversed(parsed_url.path.split("/")))[0]
    if "content-type" in r.headers:
        self.content_type = r.headers['content-type']
        ext = mimetypes.guess_extension(self.content_type)
        if ext:
            self.filename = self.filename + ext
    if "content-disposition" in r.headers:
        disposition = r.headers['content-disposition']
        parts = disposition.split(';')
        if len(parts) > 1:
            self.filename = parts[1].strip(' ')
            self.filename = self.filename[self.filename.find('=') + 1:].strip(' ')
            self.filename = ''.join([c for c in self.filename if c.isalpha() or c == '.'])
    print self.filename
    return True
def addPictureFromFile(self, filename, mediatype=None):
    """ Add a picture

    It uses the same convention as OOo, in that it saves the picture in
    the zipfile in the subdirectory 'Pictures'. If mediatype is not given,
    it will be guessed from the filename extension.

    @param filename unicode string: name of an image file
    @param mediatype unicode string: type of media, defaults to None
    @return a unicode string, the name of the created file
    """
    if mediatype is None:
        mediatype, encoding = mimetypes.guess_type(filename)
    if mediatype is None:
        mediatype = u''
        try:
            ext = filename[filename.rindex(u'.'):]
        except ValueError:
            ext = u''
    else:
        ext = mimetypes.guess_extension(mediatype)
    manifestfn = u"Pictures/%s%s" % (uuid.uuid4().hex.upper(), ext)
    self.Pictures[manifestfn] = (IS_FILENAME, filename, mediatype)
    assert(type(filename) == type(u""))
    assert(type(mediatype) == type(u""))
    return manifestfn
def get_extension(content):
    """A handful of workarounds for getting extensions we can trust."""
    file_str = magic.from_buffer(content)
    if file_str.startswith("Composite Document File V2 Document"):
        # Workaround for issue with libmagic1==5.09-2 in Ubuntu 12.04. Fixed
        # in libmagic 5.11-2.
        mime = "application/msword"
    elif file_str == "(Corel/WP)":
        mime = "application/vnd.wordperfect"
    elif file_str == "C source, ASCII text":
        mime = "text/plain"
    else:
        # No workaround necessary
        mime = magic.from_buffer(content, mime=True)
    extension = mimetypes.guess_extension(mime)
    if extension == ".obj":
        # It could be a wpd, if it's not a PDF
        if "PDF" in content[0:40]:
            # Does 'PDF' appear in the beginning of the content?
            extension = ".pdf"
        elif is_html(content):
            extension = ".html"
        else:
            extension = ".wpd"
    if extension == ".wsdl":
        # It's probably an HTML file, like those from Resource.org
        extension = ".html"
    if extension == ".ksh":
        extension = ".txt"
    if extension == ".asf":
        extension = ".wma"
    return extension
def ticket_from_message(message, queue, quiet):
    # 'message' must be an RFC822 formatted message.
    msg = message
    message = email.message_from_string(msg)
    subject = message.get('subject', _('Created from e-mail'))
    subject = decode_mail_headers(decodeUnknown(message.get_charset(), subject))
    subject = subject.replace("Re: ", "").replace("Fw: ", "").replace("RE: ", "").replace("FW: ", "").replace("Automatic reply: ", "").strip()

    sender = message.get('from', _('Unknown Sender'))
    sender = decode_mail_headers(decodeUnknown(message.get_charset(), sender))
    sender_email = parseaddr(sender)[1]

    body_plain, body_html = '', ''

    for ignore in IgnoreEmail.objects.filter(Q(queues=queue) | Q(queues__isnull=True)):
        if ignore.test(sender_email):
            if ignore.keep_in_mailbox:
                # By returning 'False' the message will be kept in the mailbox,
                # and 'True' will cause the message to be deleted.
                return False
            return True

    matchobj = re.match(r".*\[" + queue.slug + r"-(?P<id>\d+)\]", subject)
    if matchobj:
        # This is a reply or forward.
        ticket = matchobj.group('id')
    else:
        ticket = None

    counter = 0
    files = []

    for part in message.walk():
        if part.get_content_maintype() == 'multipart':
            continue

        name = part.get_param("name")
        if name:
            name = collapse_rfc2231_value(name)

        if part.get_content_maintype() == 'text' and name is None:
            if part.get_content_subtype() == 'plain':
                body_plain = EmailReplyParser.parse_reply(
                    decodeUnknown(part.get_content_charset(), part.get_payload(decode=True)))
            else:
                body_html = part.get_payload(decode=True)
                try:
                    # strip html tags
                    body_plain = striptags(body_html)
                except DjangoUnicodeDecodeError as e:
                    charset = chardet.detect(body_html)['encoding']
                    body_plain = striptags(unicode(body_html, charset))
                # remove extra new lines
                body_plain, n = re.subn(r'[\r\n]+', r'\n', body_plain)
                # remove extra spaces
                body_plain, n = re.subn(r'\s+$', '', body_plain, flags=re.M)
                body_plain = unescape(body_plain)
        else:
            if not name:
                ext = mimetypes.guess_extension(part.get_content_type())
                name = "part-%i%s" % (counter, ext)
            files.append({
                'filename': name,
                'content': part.get_payload(decode=True),
                'type': part.get_content_type()},
            )
        counter += 1

    if body_plain:
        body = body_plain
        if body_html:
            body += '\n\n'
            body += _('***Note that HTML tags are stripped out. Please see attachment email_html_body.html for the full html content.')
    else:
        body = _('No plain-text email body available. Please see attachment email_html_body.html.')

    if body_html:
        files.append({
            'filename': _("email_html_body.html"),
            'content': body_html,
            'type': 'text/html',
        })

    now = timezone.now()

    if ticket:
        try:
            t = Ticket.objects.get(id=ticket)
            new = False
        except Ticket.DoesNotExist:
            ticket = None

    priority = 3

    smtp_priority = message.get('priority', '')
    smtp_importance = message.get('importance', '')

    high_priority_types = ('high', 'important', '1', 'urgent')

    if smtp_priority in high_priority_types or smtp_importance in high_priority_types:
        priority = 2

    if ticket is None:
        t = Ticket(
            title=subject,
            queue=queue,
            submitter_email=sender_email,
            created=now,
            description=body,
            priority=priority,
        )
        t.save()
        new = True
        update = ''
    elif t.status == Ticket.CLOSED_STATUS:
        t.status = Ticket.REOPENED_STATUS
        t.save()

    f = FollowUp(
        ticket=t,
        title=_('E-Mail Received from %(sender_email)s' % {'sender_email': sender_email}),
        date=timezone.now(),
        public=True,
        comment=body,
    )

    if t.status == Ticket.REOPENED_STATUS:
        f.new_status = Ticket.REOPENED_STATUS
        f.title = _('Ticket Re-Opened by E-Mail Received from %(sender_email)s' % {'sender_email': sender_email})

    f.save()

    if not quiet:
        print (" [%s-%s] %s" % (t.queue.slug, t.id, t.title,)).encode('ascii', 'replace')

    for file in files:
        if file['content']:
            filename = file['filename'].encode('ascii', 'replace').replace(' ', '_')
            filename = re.sub('[^a-zA-Z0-9._-]+', '', filename)
            a = Attachment(
                followup=f,
                filename=filename,
                mime_type=file['type'],
                size=len(file['content']),
            )
            a.file.save(filename, ContentFile(file['content']), save=False)
            a.save()
            if not quiet:
                print " - %s" % filename

    context = safe_template_context(t)

    if new:
        if sender_email:
            send_templated_mail(
                'newticket_submitter',
                context,
                recipients=sender_email,
                sender=queue.from_address,
                fail_silently=True,
            )
        if queue.new_ticket_cc:
            send_templated_mail(
                'newticket_cc',
                context,
                recipients=queue.new_ticket_cc,
                sender=queue.from_address,
                fail_silently=True,
            )
        if queue.updated_ticket_cc and queue.updated_ticket_cc != queue.new_ticket_cc:
            send_templated_mail(
                'newticket_cc',
                context,
                recipients=queue.updated_ticket_cc,
                sender=queue.from_address,
                fail_silently=True,
            )
    else:
        context.update(comment=f.comment)
        if t.status == Ticket.REOPENED_STATUS:
            update = _(' (Reopened)')
        else:
            update = _(' (Updated)')
        if t.assigned_to:
            send_templated_mail(
                'updated_owner',
                context,
                recipients=t.assigned_to.email,
                sender=queue.from_address,
                fail_silently=True,
            )
        if queue.updated_ticket_cc:
            send_templated_mail(
                'updated_cc',
                context,
                recipients=queue.updated_ticket_cc,
                sender=queue.from_address,
                fail_silently=True,
            )
    return t
def get_mime(file):
    """Given a file, returns mimetype and extension"""
    mime = magic.from_buffer(file.read(2048), mime=True)
    extension = guess_extension(mime, False)
    return mime, extension
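Note that the helper above consumes the first 2 KiB of the stream. A caller-side sketch (the wrapper name is ours, not from the source) that rewinds so the file can be re-read afterwards:

import mimetypes
import magic  # python-magic

def get_mime_and_rewind(fh):
    # Sniff the MIME type from the first 2 KiB, then rewind the stream.
    mime = magic.from_buffer(fh.read(2048), mime=True)
    fh.seek(0)
    return mime, mimetypes.guess_extension(mime, False)

with open('example.pdf', 'rb') as fh:  # hypothetical file
    print(get_mime_and_rewind(fh))     # e.g. ('application/pdf', '.pdf')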
def getExtension(mimeType):
    global contentTypes
    return contentTypes.get(mimeType, mimetypes.guess_extension(mimeType))
def test_attach_http(mock_get):
    """
    API: AttachHTTP() object
    """

    # Define our good:// url
    class GoodNotification(NotifyBase):
        def __init__(self, *args, **kwargs):
            super(GoodNotification, self).__init__(*args, **kwargs)

        def notify(self, *args, **kwargs):
            # Pretend everything is okay
            return True

        def url(self):
            # Support url() function
            return ''

    # Store our good notification in our schema map
    SCHEMA_MAP['good'] = GoodNotification

    # Temporary path
    path = join(TEST_VAR_DIR, 'apprise-test.gif')

    class DummyResponse(object):
        """
        A dummy response used to manage our object
        """
        status_code = requests.codes.ok
        headers = {
            'Content-Length': getsize(path),
            'Content-Type': 'image/gif',
        }

        # Pointer to file
        ptr = None

        # used to return random keep-alive chunks
        _keepalive_chunk_ref = 0

        def close(self):
            return

        def iter_content(self, chunk_size=1024):
            """Lazy function (generator) to read a file piece by piece.
            Default chunk size: 1k."""
            while True:
                self._keepalive_chunk_ref += 1
                if 16 % self._keepalive_chunk_ref == 0:
                    # Yield a keep-alive block
                    yield ''
                data = self.ptr.read(chunk_size)
                if not data:
                    break
                yield data

        def raise_for_status(self):
            return

        def __enter__(self):
            self.ptr = open(path, 'rb')
            return self

        def __exit__(self, *args, **kwargs):
            self.ptr.close()

    # Prepare Mock
    dummy_response = DummyResponse()
    mock_get.return_value = dummy_response

    # Test custom url get parameters
    results = AttachHTTP.parse_url(
        'http://*****:*****@localhost/apprise.gif?dl=1&cache=300')
    assert isinstance(results, dict)
    attachment = AttachHTTP(**results)
    assert isinstance(attachment.url(), six.string_types) is True

    # Test that our extended variables are passed along
    assert mock_get.call_count == 0
    assert attachment
    assert mock_get.call_count == 1
    assert 'params' in mock_get.call_args_list[0][1]
    assert 'dl' in mock_get.call_args_list[0][1]['params']

    # Verify that arguments that are reserved for apprise are not
    # passed along
    assert 'cache' not in mock_get.call_args_list[0][1]['params']

    results = AttachHTTP.parse_url(
        'http://*****:*****@localhost/apprise.gif?+key=value&cache=True')
    assert isinstance(results, dict)
    attachment = AttachHTTP(**results)
    assert isinstance(attachment.url(), six.string_types) is True
    # No mime-type and/or filename over-ride was specified, so therefore it
    # won't show up in the generated URL
    assert re.search(r'[?&]mime=', attachment.url()) is None
    assert re.search(r'[?&]name=', attachment.url()) is None
    # No Content-Disposition; so we use filename from path
    assert attachment.name == 'apprise.gif'
    assert attachment.mimetype == 'image/gif'

    results = AttachHTTP.parse_url(
        'http://*****:*****@localhost/ignore-filename.gif')
    assert isinstance(results, dict)
    attachment = AttachHTTP(**results)
    assert isinstance(attachment.url(), six.string_types) is True
    # No mime-type and/or filename over-ride was specified, so therefore it
    # won't show up in the generated URL
    assert re.search(r'[?&]mime=', attachment.url()) is None
    assert re.search(r'[?&]name=', attachment.url()) is None
    assert attachment.mimetype == 'image/gif'
    # Because we could determine our mime type, we could build an extension
    # for our unknown filename
    assert attachment.name == 'myimage.gif'
    assert attachment
    assert len(attachment) == getsize(path)

    # Similar to test above except we make our max message size just 1 byte
    # smaller than our gif file. This will cause us to fail to read the
    # attachment
    AttachHTTP.max_file_size = getsize(path) - 1
    results = AttachHTTP.parse_url('http://localhost/toobig.jpg')
    assert isinstance(results, dict)
    attachment = AttachHTTP(**results)
    # we can not download this attachment
    assert not attachment
    assert isinstance(attachment.url(), six.string_types) is True
    # No mime-type and/or filename over-ride was specified, so therefore it
    # won't show up in the generated URL
    assert re.search(r'[?&]mime=', attachment.url()) is None
    assert re.search(r'[?&]name=', attachment.url()) is None
    assert attachment.mimetype is None
    assert attachment.name is None
    assert len(attachment) == 0

    # Disable our file size limitations
    AttachHTTP.max_file_size = 0
    results = AttachHTTP.parse_url('http://user@localhost')
    assert isinstance(results, dict)
    attachment = AttachHTTP(**results)
    assert isinstance(attachment.url(), six.string_types) is True
    # No mime-type and/or filename over-ride was specified, so therefore it
    # won't show up in the generated URL
    assert re.search(r'[?&]mime=', attachment.url()) is None
    assert re.search(r'[?&]name=', attachment.url()) is None
    assert attachment.mimetype == 'image/gif'
    # Because we could determine our mime type, we could build an extension
    # for our unknown filename
    assert attachment.name == 'myimage.gif'
    assert attachment
    assert len(attachment) == getsize(path)

    # Set our header up with an invalid Content-Length; we can still process
    # this data. It just means we track it lower when reading back content
    dummy_response.headers = {
        'Content-Length': 'invalid'
    }
    results = AttachHTTP.parse_url('http://localhost/invalid-length.gif')
    assert isinstance(results, dict)
    attachment = AttachHTTP(**results)
    assert isinstance(attachment.url(), six.string_types) is True
    # No mime-type and/or filename over-ride was specified, so therefore it
    # won't show up in the generated URL
    assert re.search(r'[?&]mime=', attachment.url()) is None
    assert re.search(r'[?&]name=', attachment.url()) is None
    assert attachment.mimetype == 'image/gif'
    # Because we could determine our mime type, we could build an extension
    # for our unknown filename
    assert attachment.name == 'invalid-length.gif'
    assert attachment

    # Give ourselves nothing to work with
    dummy_response.headers = {}
    results = AttachHTTP.parse_url('http://user@localhost')
    assert isinstance(results, dict)
    attachment = AttachHTTP(**results)
    # we can not download this attachment
    assert attachment
    assert isinstance(attachment.url(), six.string_types) is True
    # No mime-type and/or filename over-ride was specified, so therefore it
    # won't show up in the generated URL
    assert re.search(r'[?&]mime=', attachment.url()) is None
    assert re.search(r'[?&]name=', attachment.url()) is None

    # Handle edge-case where detected_name is None for whatever reason
    attachment.detected_name = None
    assert attachment.mimetype == attachment.unknown_mimetype
    assert attachment.name.startswith(AttachHTTP.unknown_filename)
    assert len(attachment) == getsize(path)

    # Exception handling
    mock_get.return_value = None
    for _exception in REQUEST_EXCEPTIONS:
        aa = AppriseAttachment.instantiate(
            'http://localhost/exception.gif?cache=30')
        assert isinstance(aa, AttachHTTP)

        mock_get.side_effect = _exception
        assert not aa

    # Restore value (max_file_size was captured at module scope before the test)
    AttachHTTP.max_file_size = max_file_size
def get_extension(image_type):
    try:
        return mimetypes.guess_extension(image_type)
    except:
        return None
def guess_extension(mime: str) -> str:
    try:
        return sanity_overrides[mime]
    except KeyError:
        return mimetypes.guess_extension(mime)
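sanity_overrides is defined elsewhere in that snippet's module; a hypothetical example of what such a table might contain, pinning extensions where the stdlib's first pick is surprising:

# Hypothetical override table; the entries are illustrative, not from the source.
sanity_overrides = {
    'image/jpeg': '.jpg',  # the stdlib may pick '.jpe'
    'text/plain': '.txt',  # the stdlib may pick '.bat' from its table
    'audio/mpeg': '.mp3',
}

print(guess_extension('image/jpeg'))  # '.jpg' via the override
print(guess_extension('image/png'))   # '.png' via mimetypes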
# print(dir(email_message))
to_ = email_message["To"]
from_ = email_message["From"]
subject_ = email_message["Subject"]
date_ = email_message["date"]

# payload is the message in a list;
# iterate the list to retrieve each message
counter = 0
for part in email_message.walk():
    if part.get_content_maintype() == "multipart":
        continue
    filename = part.get_filename()
    content_type = part.get_content_type()
    if not filename:
        # ext = '.html'
        ext = mimetypes.guess_extension(content_type)
        if not ext:
            ext = ".bin"
        # if 'text' in content_type:
        #     ext = ".txt"
        # elif "html" in content_type:
        #     ext = ".html"
        filename = 'msg-part-%08d%s' % (counter, ext)
    counter += 1

    # save file
    save_path = os.path.join(os.getcwd(), "emails", date_, subject_)
    if not os.path.exists(save_path):  # if the path doesn't exist
        os.makedirs(save_path)
    with open(os.path.join(save_path, filename), "wb") as fp:
        fp.write(part.get_payload(decode=True))

# print(subject_)
def guess_extension(mimetype):
    return OVERRIDE_MIMETYPES.get(mimetype, mimetypes.guess_extension(mimetype))
def save_image(counter, url, response, datasetpath, name, image_id, face_id,
               bbox, save_face=False):
    """Save image

    Full images saved to datasetpath/images/name_image_id.ext
    Face images saved to datasetpath/faces/name_image_id_face_id.ext

    Returns True if successful else False
    """
    logger = logging.getLogger("logger")

    # Output dir for images is datasetpath/images/name
    output_dir = os.path.join(datasetpath, "images", name)
    ensure_dir_exists(output_dir)

    # Filename without extension
    filename = "{name}_{image_id}".format(name=name, image_id=image_id)
    outpath = os.path.join(output_dir, filename)

    # Save file without file extension
    with open(outpath, 'wb') as outfile:
        outfile.write(response.content)

    filetype = imghdr.what(outpath)

    # Cannot determine filetype.
    if filetype is None and not has_magic_lib:
        os.remove(outpath)
        logger.error("Line {number}: Cannot determine file type: {url}".format(
            number=counter, url=url))
        return False
    # Get filetype using lib magic
    elif filetype is None and has_magic_lib:
        mimetype = magic.from_buffer(response.content, mime=True)
        if mimetype is None:
            logger.error(
                "Line {number}: Cannot determine file type: {url}".format(
                    number=counter, url=url))
            return False
        # Check for None before stripping the dot; guess_extension() returns
        # None for unknown MIME types.
        ext = mimetypes.guess_extension(mimetype)
        if ext is None:
            logger.error(
                "Line {number}: Cannot determine file type: {url}".format(
                    number=counter, url=url))
            return False
        filetype = ext.lstrip('.')
        if filetype == "jpe":
            filetype = "jpeg"

    # Rename file to have extension
    newpath = "{}.{}".format(outpath, filetype)
    shutil.move(outpath, newpath)

    # If user wants face images
    if save_face:
        try:
            I = Image.open(newpath)
            output_dir = os.path.join(datasetpath, "faces", name)
            ensure_dir_exists(output_dir)
            filename = "{name}_{image_id}_{face_id}.{ext}".format(
                name=name, image_id=image_id, face_id=face_id, ext=filetype)
            I.crop(bbox).save(os.path.join(output_dir, filename))
        except IOError as e:
            logger.error("Line {number}: {error}: {url}".format(
                number=counter, error=e, url=url))
            return False
    return True
def get_info_from_file_reference(file_reference, **kwargs):
    #logmessage('file reference is ' + str(file_reference))
    if 'convert' in kwargs:
        convert = kwargs['convert']
    else:
        convert = None
    if 'privileged' in kwargs:
        privileged = kwargs['privileged']
    else:
        privileged = None
    has_info = False
    if re.search(r'^[0-9]+$', str(file_reference)):
        if 'uids' in kwargs:
            uids = kwargs['uids']
        else:
            uids = None
        if uids is None or len(uids) == 0:
            new_uid = docassemble.base.functions.get_uid()
            if new_uid is not None:
                uids = [new_uid]
            else:
                uids = []
        if 'filename' in kwargs:
            result = get_info_from_file_number(int(file_reference), privileged=privileged,
                                               filename=kwargs['filename'], uids=uids)
        else:
            result = get_info_from_file_number(int(file_reference), privileged=privileged,
                                               uids=uids)
        if 'fullpath' not in result:
            result['fullpath'] = None
        has_info = True
    elif re.search(r'^https?://', str(file_reference)):
        #logmessage("get_info_from_file_reference: " + str(file_reference) + " is a URL")
        possible_filename = re.sub(r'.*/', '', file_reference)
        if possible_filename == '':
            possible_filename = 'index.html'
        if re.search(r'\.', possible_filename):
            (possible_ext, possible_mimetype) = get_ext_and_mimetype(possible_filename)
            possible_ext = re.sub(r'[^A-Za-z0-9\.].*', '', possible_ext)
        else:
            possible_ext = 'txt'
            possible_mimetype = 'text/plain'
        result = dict()
        temp_file = tempfile.NamedTemporaryFile(prefix="datemp",
                                                suffix='.' + possible_ext,
                                                delete=False)
        req = Request(file_reference, headers={
            'User-Agent': docassemble.base.config.daconfig.get('user agent', 'curl/7.64.0')
        })
        response = urlopen(req)
        temp_file.write(response.read())
        result['fullpath'] = temp_file.name
        try:
            result['mimetype'] = response.headers['Content-Type']
        except Exception as errmess:
            logmessage("get_info_from_file_reference: could not get mimetype from headers")
            result['mimetype'] = possible_mimetype
            result['extension'] = possible_ext
        if 'extension' not in result:
            result['extension'] = re.sub(r'^\.', '',
                                         mimetypes.guess_extension(result['mimetype']))
        if re.search(r'\.', possible_filename):
            result['filename'] = possible_filename
        else:
            result['filename'] = possible_filename + '.' + result['extension']
        path_parts = os.path.splitext(result['fullpath'])
        result['path'] = path_parts[0]
        has_info = True
        #logmessage("get_info_from_file_reference: downloaded to " + str(result['fullpath']))
    else:
        #logmessage(str(file_reference) + " is not a URL")
        result = dict()
        question = kwargs.get('question', None)
        manual_package = kwargs.get('package', None)
        folder = kwargs.get('folder', None)
        the_package = None
        parts = file_reference.split(':')
        if len(parts) == 1:
            the_package = None
            if question is not None:
                the_package = question.from_source.package
            elif manual_package is not None:
                the_package = manual_package
            if the_package is None:
                the_package = docassemble.base.functions.get_current_package()
            if folder is None:
                m = re.search(r'^data/(templates|sources|static)/(.*)', file_reference)
                if m:
                    folder = m.group(1)
                    file_reference = m.group(2)
            if folder is not None and not re.search(r'/', file_reference):
                file_reference = 'data/' + str(folder) + '/' + file_reference
            if the_package is not None:
                file_reference = the_package + ':' + file_reference
            else:
                file_reference = 'docassemble.base:' + file_reference
            if the_package is not None:
                result['package'] = the_package
        elif len(parts) == 2:
            result['package'] = parts[0]
        result['fullpath'] = docassemble.base.functions.static_filename_path(file_reference)
    if result['fullpath'] is not None:
        if not has_info:
            result['filename'] = os.path.basename(result['fullpath'])
            ext_type, result['mimetype'] = get_ext_and_mimetype(result['fullpath'])
            path_parts = os.path.splitext(result['fullpath'])
            result['path'] = path_parts[0]
            result['extension'] = path_parts[1].lower()
            result['extension'] = re.sub(r'\.', '', result['extension'])
        if convert is not None and result['extension'] in convert:
            if os.path.isfile(result['path'] + '.' + convert[result['extension']]):
                result['extension'] = convert[result['extension']]
                result['fullpath'] = result['path'] + '.' + result['extension']
                ext_type, result['mimetype'] = get_ext_and_mimetype(result['fullpath'])
            else:
                sys.stderr.write("Did not find file " + result['path'] + '.' +
                                 convert[result['extension']] + "\n")
                return dict()
        if os.path.isfile(result['fullpath']) and not has_info:
            add_info_about_file(result['fullpath'], result['path'], result)
    else:
        sys.stderr.write("File reference " + str(file_reference) + " DID NOT EXIST.\n")
    return result
def _populate_projects(self, iter_obj, yr):
    """Loop through iter_obj and sort/clean data based on project_id.

    Produces a list of dictionaries. Sample:

    {'end': '2012-12-31',
     'operating_unit_email': '*****@*****.**',
     'inst_id': '',
     'operating_unit': 'Lithuania, Republic of',
     'iati_op_id': 'LT',
     'inst_descr': '',
     'start': '2005-01-01',
     'operating_unit_id': 'LTU',
     'operating_unit_website': 'http://www.undp.lt/',
     'project_id': '00038726',
     'inst_type_id': '',
     'document_name': u'http://www.undp.org/content/dam/undp/documents/projects/LTU/00038726/RC fund.pdf'}

    Arguments:
    iter_obj -- an iterable etree object
    """
    counter = 0
    # Get sorted units
    report_units = self.get_and_sort(self.undp_export + '/report_units.csv',
                                     'operating_unit')
    # Sorting table for documents by importance
    docs_sort = ['A02', 'A03', 'A04', 'A05', 'A01', 'A07', 'A08', 'A09',
                 'A06', 'A11', 'A10']

    # Loop through each IATI activity in the XML
    for event, p in iter_obj:
        # IATI hierarchy used to determine if output or input
        hierarchy = p.attrib['hierarchy']
        # Check for projects
        if hierarchy == '1':
            obj = Project()
            obj.project_id.value = self._grab_award_id(p[1].text)
            # Check if the project_id is unique
            if obj.project_id.value in self.projects.pks:
                continue
            obj.fiscal_year.value.append(yr)
            obj.project_title.value = p.find(obj.project_title.xml_key).text.lower()
            obj.project_descr.value = p.find(obj.project_descr.xml_key).text

            documents = p.findall('./document-link')
            if documents:
                names = []
                links = []
                format = []
                places = []
                for doc in documents:
                    # Avoid adding circular links to the same site/project
                    if ('open.undp.org/#project/' + obj.project_id.value) not in doc.get('url'):
                        try:
                            links.append(urllib2.unquote(
                                doc.get('url')).encode('utf-8').decode('utf-8'))
                        except UnicodeDecodeError:
                            links.append(urllib2.unquote(doc.get('url')).decode('utf-8'))
                        #links.append(doc.get('url'))
                        if 'application/' in doc.get('format'):
                            ft = mimetypes.guess_extension(doc.get('format'), False)
                            if ft is None:
                                format.append('')
                            else:
                                format.append(ft.lstrip('.'))
                        else:
                            format.append('')
                        for d in doc.iterchildren(tag=obj.document_name.key):
                            names.append(d.text)
                        # Default place is last
                        place = 100
                        for t in doc.iterchildren(tag='category'):
                            try:
                                tp = docs_sort.index(t.get('code'))
                            except ValueError:
                                tp = 100
                            if tp < place:
                                place = tp
                        places.append(place)
                obj.document_name.value.extend([names, links, format, places])

            # Find start and end dates
            obj.start.value = p.find(obj.start.xml_key).text
            obj.end.value = p.find(obj.end.xml_key).text

            contact = p.findall('./contact-info')
            obj.operating_unit_email.value = [
                e.text for email in contact
                for e in email.iterchildren(tag=obj.operating_unit_email.key)
            ][0]

            # Find operating_unit.
            # If recipient country doesn't exist, look for recipient region.
            try:
                obj.iati_op_id.value = p.find(obj.iati_op_id.xml_key).attrib.get('code')
                obj.operating_unit.value = p.find(obj.operating_unit.xml_key).text
                for r in report_units:
                    if (obj.iati_op_id.value == r['iati_operating_unit']
                            or obj.iati_op_id.value == r['operating_unit']):
                        obj.operating_unit_id.value = r['operating_unit']
                        obj.region_id.value = r[obj.region_id.key]
            except:
                region_unit = p.findall("./recipient-region")
                for ru in region_unit:
                    for r in report_units:
                        if type(ru.text) == type(r['ou_descr']) and ru.text == r['ou_descr']:
                            obj.operating_unit_id.value = r['operating_unit']
                            obj.operating_unit.value = r['ou_descr']
                            obj.iati_op_id.value = '998'

            # Find contact info
            try:
                for email in contact:
                    for e in email.iterchildren(tag=obj.operating_unit_email.key):
                        obj.operating_unit_email.value = e.text
                obj.operating_unit_website.value = p.find(
                    obj.operating_unit_website.xml_key).text
            except:
                pass

            # Check for implementing organization
            try:
                inst = p.find("./participating-org[@role='Implementing']")
                obj.inst_id.value = inst.attrib.get(obj.inst_id.key)
                obj.inst_type_id.value = inst.attrib.get(obj.inst_type_id.key)
                obj.inst_descr.value = inst.text
            except:
                pass

            # Populate the Unit Collection
            self._populate_units(obj)
            counter += 1
            self.log('Processing: %s' % counter, True)
            self.projects.add(obj.project_id.value, obj)
    self.log('%s - Project Annuals: %s rows processed' % (yr, counter))
def _get_dehydrated_message(self, msg, record):
    settings = utils.get_settings()
    new = EmailMessage()
    if msg.is_multipart():
        for header, value in msg.items():
            new[header] = value
        for part in msg.get_payload():
            new.attach(self._get_dehydrated_message(part, record))
    elif (settings['strip_unallowed_mimetypes']
          and not msg.get_content_type() in settings['allowed_mimetypes']):
        for header, value in msg.items():
            new[header] = value
        # Delete header, otherwise when attempting to deserialize the
        # payload, it will be expecting a body for this.
        del new['Content-Transfer-Encoding']
        new[settings['altered_message_header']] = (
            'Stripped; Content type %s not allowed' % (msg.get_content_type()))
        new.set_payload('')
    elif ((msg.get_content_type() not in settings['text_stored_mimetypes'])
          or ('attachment' in msg.get('Content-Disposition', ''))):
        filename = None
        raw_filename = msg.get_filename()
        if raw_filename:
            filename = utils.convert_header_to_unicode(raw_filename)
        if not filename:
            extension = mimetypes.guess_extension(msg.get_content_type())
        else:
            _, extension = os.path.splitext(filename)
        if not extension:
            extension = '.bin'

        attachment = MessageAttachment()
        attachment.document.save(
            uuid.uuid4().hex + extension,
            ContentFile(six.BytesIO(msg.get_payload(decode=True)).getvalue()))
        attachment.message = record
        for key, value in msg.items():
            attachment[key] = value
        attachment.save()

        placeholder = EmailMessage()
        placeholder[settings['attachment_interpolation_header']] = str(attachment.pk)
        new = placeholder
    else:
        content_charset = msg.get_content_charset()
        if not content_charset:
            content_charset = 'ascii'
        try:
            # Make sure that the payload can be properly decoded in the
            # defined charset; if it can't, let's mash some things
            # inside the payload :-\
            msg.get_payload(decode=True).decode(content_charset)
        except LookupError:
            logger.warning("Unknown encoding %s; interpreting as ASCII!",
                           content_charset)
            msg.set_payload(msg.get_payload(decode=True).decode('ascii', 'ignore'))
        except ValueError:
            logger.warning("Decoding error encountered; interpreting %s as ASCII!",
                           content_charset)
            msg.set_payload(msg.get_payload(decode=True).decode('ascii', 'ignore'))
        new = msg
    return new
def get_extension_by_filename(filename): try: mime_type = mimetypes.guess_type(filename)[0] except TypeError: return None if mime_type is None: return None return mimetypes.guess_extension(mime_type)
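For reference, `mimetypes.guess_type` returns `(None, None)` when it cannot classify a name, which is exactly the case the `None` guard above handles:

import mimetypes

mimetypes.guess_type('photo.jpg')  # ('image/jpeg', None)
mimetypes.guess_type('README')     # (None, None) -> no extension can be derived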
def main(): fp = open("/tmp/mail.log", "a") #fp.write("The file is " + sys.argv[1] + "\n") try: with open(sys.argv[1], 'rU') as email_fp: msg = email.message_from_file(email_fp) except Exception as errMess: fp.write("Failed to read e-mail message: " + str(errMess) + "\n") sys.exit("Failed to read e-mail message") raw_date = msg.get('Date', msg.get('Resent-Date', None)) addr_return_path = msg.get('Return-path', None) addr_reply_to = msg.get('Reply-to', None) addr_to = msg.get('Envelope-to', None) addr_from = msg.get('From', msg.get('Sender', None)) subject = msg.get('Subject', None) fp.write("Message to " + str(addr_to) + "\n") #fp.write("From was " + str(addr_from) + "\n") #fp.write("Subject was " + str(subject) + "\n") to_recipients = list() for recipient in getaddresses( msg.get_all('to', []) + msg.get_all('resent-to', [])): to_recipients.append(dict(name=recipient[0], address=recipient[1])) cc_recipients = list() for recipient in getaddresses( msg.get_all('cc', []) + msg.get_all('resent-cc', [])): cc_recipients.append(dict(name=recipient[0], address=recipient[1])) recipients = list() for recipient in getaddresses( msg.get_all('to', []) + msg.get_all('cc', []) + msg.get_all('resent-to', []) + msg.get_all('resent-cc', [])): recipients.append(dict(name=recipient[0], address=recipient[1])) if addr_to is None and len(recipients): addr_to = recipients[0]['address'] #fp.write("recipients are " + str(recipients) + "\n") if addr_to is not None: #fp.write("parsed envelope-to: " + str(parseaddr(addr_to)) + "\n") short_code = re.sub(r'@.*', '', parseaddr(addr_to)[1]) else: short_code = None #fp.write("short code is " + str(short_code) + "\n") record = db.session.query(Shortener).filter_by(short=short_code).first() if record is None: fp.write("short code not found\n") sys.exit("short code not found") #fp.write("short code found\n") #file_number = get_new_file_number(record.uid, 'email', yaml_file_name=record.filename) ##fp.write("file number is " + str(file_number) + "\n") #saved_file_email = SavedFile(file_number, fix=True) if addr_from is not None: #fp.write("parsed from: " + str(parseaddr(addr_from)[1]) + "\n") addr_from = dict(name=parseaddr(addr_from)[0], address=parseaddr(addr_from)[1]) else: addr_from = dict(empty=True) if addr_return_path is not None: #fp.write("parsed return_path: " + str(parseaddr(addr_return_path)[1]) + "\n") addr_return_path = dict(name=parseaddr(addr_return_path)[0], address=parseaddr(addr_return_path)[1]) else: addr_return_path = dict(empty=True) #fp.write("return_path is " + str(addr_return_path) + "\n") if addr_reply_to is not None: #fp.write("parsed reply-to: " + str(parseaddr(addr_reply_to)[1]) + "\n") addr_reply_to = dict(name=parseaddr(addr_reply_to)[0], address=parseaddr(addr_reply_to)[1]) #fp.write("reply-to is " + str(addr_reply_to) + "\n") else: addr_reply_to = dict(empty=True) #fp.write("reply-to is " + str(addr_reply_to) + "\n") msg_current_time = datetime.datetime.now() if raw_date is not None: msg_date = datetime.datetime.fromtimestamp(mktime(parsedate(raw_date))) #fp.write("msg_date is " + str(msg_date) + "\n") else: msg_date = msg_current_time #fp.write("msg_date set to current time\n") headers = list() for item in msg.items(): headers.append([item[0], item[1]]) #fp.write("headers:\n" + json.dumps(headers) + "\n") email_record = Email(short=short_code, to_addr=json.dumps(to_recipients), cc_addr=json.dumps(cc_recipients), from_addr=json.dumps(addr_from), reply_to_addr=json.dumps(addr_reply_to), return_path_addr=json.dumps(addr_return_path), 
subject=subject, datetime_message=msg_date, datetime_received=msg_current_time) db.session.add(email_record) db.session.commit() save_attachment(record.uid, record.filename, 'headers.json', email_record.id, 0, 'application/json', 'json', json.dumps(headers)) counter = 1 for part in msg.walk(): if part.get_content_maintype() == 'multipart': continue filename = part.get_filename() if part.get_content_type() == 'text/plain': ext = '.txt' else: ext = mimetypes.guess_extension(part.get_content_type()) if not ext: ext = '.bin' if filename: filename = '%03d-%s' % (counter, secure_filename(filename)) else: filename = '%03d-attachment%s' % (counter, ext) #fp.write("Filename is " + str(filename) + "\n") #fp.write("Content type is " + str(part.get_content_type()) + "\n") real_filename = re.sub(r'[0-9][0-9][0-9]-', r'', filename) real_ext = re.sub(r'^\.', r'', ext) save_attachment(record.uid, record.filename, real_filename, email_record.id, counter, part.get_content_type(), real_ext, part.get_payload(decode=True)) counter += 1 fp.close() user = None if record.user_id is not None: user = db.session.query(UserModel).filter_by(id=record.user_id).first() if user is None: user_info = dict(email=None, the_user_id='t' + str(record.temp_user_id), theid=record.temp_user_id, roles=list()) else: user_info = dict(email=user.email, roles=[role.name for role in user.roles], the_user_id=user.id, theid=user.id, firstname=user.first_name, lastname=user.last_name, nickname=user.nickname, country=user.country, subdivisionfirst=user.subdivisionfirst, subdivisionsecond=user.subdivisionsecond, subdivisionthird=user.subdivisionthird, organization=user.organization) result = docassemble.webapp.worker.background_action.delay( record.filename, user_info, record.uid, None, 'http://localhost', 'http://localhost', dict(action='incoming_email', arguments=dict(id=email_record.id)), extra=None)
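The `text/plain` special case above is defensive: before the type map's ordering was made deterministic (around Python 3.8, if I recall correctly), `guess_extension('text/plain')` could return any registered alias, such as '.ksh'. The guard is easy to reuse:

import mimetypes

def extension_for(content_type, default='.bin'):
    if content_type == 'text/plain':
        return '.txt'  # guess_extension() may prefer another alias for text/plain
    return mimetypes.guess_extension(content_type) or default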
def parse(args): #Clear output warc file. if args.dump == "warc": if args.silence: print("Recording", args.dump, "to", args.output + ".") with open(args.output_path_sub + args.output, "wb"): pass for record in warc_records(args.string, args.path): try: #Filter out unwanted entries. if not checkFilter(args.filter, record): continue #Increment Index counters. if args.silence: inc("records") inc(record, "warc-type", "types") inc(record, "content_type", "warc-content") if record.http: inc(record.http, "content_type", "http-content") inc(record.http, "error", "status") #Dump records to file. if args.dump == "warc": with open(args.output_path_sub + args.output, "ab") as output: record.write_to(output) if args.dump == "content": url = urlparse(unquote(record['WARC-Target-URI'])) #Set up folder index = url.path.rfind("/") + 1 file = url.path[index:] path = url.path[:index] #Process filename if "." not in file: path += file if not path.endswith("/"): path += "/" file = 'index.html' #Final fixes. path = path.replace(".", "-") host = url.hostname.replace('www.', '', 1) path = args.output_path_sub + host + path #Create new directories if not os.path.exists(path): try: os.makedirs(path) except OSError: path = "/".join([i[:25] for i in path.split("/")]) os.makedirs(path) #Test if file has a proper extension. index = file.index(".") suffix = file[index:] content = record.http.get("content_type", "") slist = mimetypes.guess_all_extensions(content) if suffix not in slist: #Correct suffix if we can. suffix = mimetypes.guess_extension(content) if suffix: file = file[:index] + suffix else: inc(record.http, "content_type", "unknown mime type") #Check for gzip compression. if record.http.get("content-encoding", None) == "gzip": file += ".gz" path += file #If Duplicate file then insert numbers index = path.rfind(".") temp = path n = 0 while os.path.isfile(temp): n += 1 temp = path[:index] + "(" + str(n) + ")" + path[index:] path = temp #Write file. with open(path, 'wb') as fp: record.http.write_payload_to(fp) except Exception: if args.error: if args.silence: print("Error in record. Recording to error.warc.") with open(args.output_path_sub + "error.warc", "ab") as fp: record.write_to(fp) else: raise #print results if args.silence: print("-----------------------------") for i in counts: print("\nCount of {}.".format(i)) pprint(counts[i])
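The suffix-repair step above (keep the URL's suffix only if it is acceptable for the served Content-Type) can be isolated into a helper; a sketch under the same assumptions:

import mimetypes

def repair_suffix(filename, content_type):
    index = filename.rfind('.')
    if index != -1 and filename[index:] in mimetypes.guess_all_extensions(content_type):
        return filename  # existing suffix already matches the content type
    suffix = mimetypes.guess_extension(content_type)
    if suffix is None:
        return filename  # unknown MIME type: leave the name untouched
    base = filename[:index] if index != -1 else filename
    return base + suffix

# repair_suffix('index.php', 'text/html') -> 'index.htm' or 'index.html',
# depending on the platform's type map.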
def _load_urllib(self, filename, kwargs): '''(internal) Loading a network file. First download it, save it to a temporary file, and pass it to _load_local().''' if PY2: import urllib2 as urllib_request def gettype(info): return info.gettype() else: import urllib.request as urllib_request def gettype(info): return info.get_content_type() proto = filename.split(':', 1)[0] if proto == 'smb': try: # note: it's important to load SMBHandler every time # otherwise the data is occasionally not loaded from smb.SMBHandler import SMBHandler except ImportError: Logger.warning( 'Loader: can not load PySMB: make sure it is installed') return import tempfile data = fd = _out_osfd = None try: _out_filename = '' if proto == 'smb': # read from samba shares fd = urllib_request.build_opener(SMBHandler).open(filename) else: # read from internet request = urllib_request.Request(filename) if Config.has_option('network', 'useragent'): useragent = Config.get('network', 'useragent') if useragent: request.add_header('User-Agent', useragent) opener = urllib_request.build_opener() fd = opener.open(request) if '#.' in filename: # allow extension override from URL fragment suffix = '.' + filename.split('#.')[-1] else: ctype = gettype(fd.info()) suffix = mimetypes.guess_extension(ctype) suffix = LoaderBase.EXT_ALIAS.get(suffix, suffix) if not suffix: # strip query string and split on path parts = filename.split('?')[0].split('/')[1:] while len(parts) > 1 and not parts[0]: # strip out blanks from '//' parts = parts[1:] if len(parts) > 1 and '.' in parts[-1]: # we don't want '.com', '.net', etc. as the extension suffix = '.' + parts[-1].split('.')[-1] _out_osfd, _out_filename = tempfile.mkstemp(prefix='kivyloader', suffix=suffix) idata = fd.read() fd.close() fd = None # write to local filename write(_out_osfd, idata) close(_out_osfd) _out_osfd = None # load data data = self._load_local(_out_filename, kwargs) # FIXME create a clean API for that for imdata in data._data: imdata.source = filename except Exception as ex: Logger.exception('Loader: Failed to load image <%s>' % filename) # close file when remote file not found or download error try: if _out_osfd: close(_out_osfd) except OSError: pass # update client for c_filename, client in self._client[:]: if filename != c_filename: continue # got one client to update client.image = self.error_image client.dispatch('on_error', error=ex) self._client.remove((c_filename, client)) return self.error_image finally: if fd: fd.close() if _out_osfd: close(_out_osfd) if _out_filename != '': unlink(_out_filename) return data
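The `'#.'` fragment override above (e.g. `http://host/image#.png`) lets a caller pin the saved suffix regardless of the server's Content-Type; reduced to its essentials:

import mimetypes

def choose_suffix(url, content_type):
    if '#.' in url:
        return '.' + url.split('#.')[-1]  # explicit override wins
    return mimetypes.guess_extension(content_type) or ''

# choose_suffix('http://example.invalid/raw#.png', 'application/octet-stream') -> '.png'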
def getExtension(mimetype): ext = mimetypes.guess_extension(mimetype.split(';')[0]) if ext is None: raise Exception("Unsupported/unrecognized mimetype: " + mimetype) return ext
def process_apids(apid_matches, *, session, csv_writer, logger): """ Given a list of APID tuples as returned by `process_gedcom_text()`, an active session, and a csv writer, it downloads images from Ancestry.com. Presumes the current working directory is the output directory. Returns a set of APIDs with errors. """ total_apid_matches = len(apid_matches) processed_apids = defaultdict( list) # A dict with dbids as keys, and items as a list of pids. iid_regex = re.compile(r"var iid='([^\s']+)';") processed_iids = { } # A dict with IID's as keys, and the following object as items. class processed_iid(object): def __init__(self, extension, apids=None): self.extension = extension self.apids = apids if apids is not None else [] problem_apids = set() # Process each apid. for i, match in enumerate(apid_matches, start=1): sour, apid, indiv, dbid, pid = match fields = { 'sour': sour, 'apid': apid, 'indiv': indiv, 'dbid': dbid, 'pid': pid, } logger.info("Processing APID {0} of {1} <APID {2}>...".format( i, total_apid_matches, apid)) # Check if the apid has previously been processed. if dbid in processed_apids and pid in processed_apids[dbid]: logger.info( " > APID previously processed as part of another source.") logger.info(" > Finished!") continue else: # Mark the apid as processed now, so even if something fails, we know not to check it again. processed_apids[dbid].append(pid) # Visit the record page corresponding to the APID. logger.info(" > Getting the record page for the APID...") record_page = session.get( 'http://search.ancestry.com/cgi-bin/sse.dll?indiv={0}&dbid={1}&h={2}' .format(indiv, dbid, pid)) if record_page.status_code != 200: logger.error( " > There was an error when trying to get the record page for the APID." ) problem_apids.add(apid) logger.info(" > Aborted!") continue # Extract the image id associated with the record from the returned html. logger.info( " > Processing the record page to determine the image ID...") match = iid_regex.search(record_page.text) if not match: # TODO, more and better checks could be performed rather than presuming there is no image at this stage, such as checking for a thumbnail. logger.info( " > An image ID could not be found. Either the record does not have an image, or the record page was in an unexpected format." ) fields['image'] = '' fields['extension'] = '' logger.info(" > Writing results to CSV file...") csv_writer.writerow(fields) logger.info(" > Finished!") continue fields['image'] = iid = match.group(1) # Check if the iid has previously been processed. if iid in processed_iids: logger.info( " > The image for this record has previously been processed." ) fields['extension'] = processed_iids[iid].extension logger.info(" > Writing results to CSV file...") csv_writer.writerow(fields) processed_iids[iid].apids.append(apid) logger.info(" > Finished!") continue else: # Mark the iid as processed now, so even if something fails, we know not to check it again. processed_iids[iid] = processed_iid(None, [apid]) # Get the api data related to the image. logger.info(" > Get information regarding the image...") image_page = session.get( 'http://www.ancestry.com/interactive/api/v2/Media/GetMediaInfo/{0}/{1}/{2}' .format(dbid, iid, pid)) if image_page.status_code != 200: logger.error( " > There was an error when trying to get the image info.") problem_apids.add(apid) logger.info(" > Aborted!") continue # Extract the download url from the returned json.
logger.info(" > Processing the image information...") image_page_json = image_page.json() try: download_url = image_page_json['ImageServiceUrlForDownload'] except KeyError: logger.error( " > There was an error when trying to get the download URL from the image info." ) problem_apids.add(apid) logger.info(" > Aborted!") continue # Download the image. logger.info(" > Downloading image...") image_download = session.get(download_url, stream=True) if image_download.status_code != 200: logger.error( " > There was an error when trying to download the image.") problem_apids.add(apid) logger.info(" > Aborted!") continue # Save the image to a file. logger.info(" > Saving image...") # Ensure the dbid has a folder for saving the image into. if not os.path.exists(dbid): os.makedirs(dbid) content_type = image_download.headers['content-type'] # guess_extension() can return None for unknown content types extension = mimetypes.guess_extension(content_type) extension = extension.strip('.') if extension else 'bin' if extension == 'jpeg' or extension == 'jpe': extension = 'jpg' fields['extension'] = extension # Ensure the extension has been recorded for later use. if processed_iids[iid].extension is None: processed_iids[iid].extension = extension try: with open("{0}/{1}.{2}".format(dbid, iid, extension), 'wb') as f: for chunk in image_download.iter_content(1024): f.write(chunk) except Exception as e: logger.error( ' > There was an unknown error when saving the file: ' + str(e)) logger.info(" > Aborted!") continue logger.info(" > Image file saved successfully.") # Write results to csv file. logger.info(" > Writing results to CSV file...") csv_writer.writerow(fields) logger.info(" > Finished!") # All done. return problem_apids
def binary_content(cls, xmlid=None, model='ir.attachment', id=None, field='datas', unique=False, filename=None, filename_field='datas_fname', download=False, mimetype=None, default_mimetype='application/octet-stream', access_token=None, related_id=None, access_mode=None, env=None): """ Get file, attachment or downloadable content If the ``xmlid`` and ``id`` parameters are omitted, fetches the default value for the binary field (via ``default_get``), otherwise fetches the field for that precise record. :param str xmlid: xmlid of the record :param str model: name of the model to fetch the binary from :param int id: id of the record from which to fetch the binary :param str field: binary field :param bool unique: add a max-age for the cache control :param str filename: choose a filename :param str filename_field: if no filename is given, build one as model-id-field :param bool download: apply headers to download the file :param str mimetype: mimetype of the field (for headers) :param related_id: the id of another record used for custom_check :param access_mode: if truthy, will call custom_check to fetch the object that contains the binary. :param str default_mimetype: default mimetype if no mimetype found :param str access_token: optional token for unauthenticated access only available for ir.attachment :param Environment env: by default use request.env :returns: (status, headers, content) """ env = env or request.env # get object and content obj = None if xmlid: obj = cls._xmlid_to_obj(env, xmlid) elif id and model in env.registry: obj = env[model].browse(int(id)) # obj exists if not obj or not obj.exists() or field not in obj: return (404, [], None) # access token grant access if model == 'ir.attachment' and access_token: obj = obj.sudo() if access_mode: if not cls._check_access_mode(env, id, access_mode, model, access_token=access_token, related_id=related_id): return (403, [], None) elif not consteq(obj.access_token or u'', access_token): return (403, [], None) # check read access try: last_update = obj['__last_update'] except AccessError: return (403, [], None) status, headers, content = None, [], None # attachment by url check module_resource_path = None if model == 'ir.attachment' and obj.type == 'url' and obj.url: url_match = re.match(r"^/(\w+)/(.+)$", obj.url) if url_match: module = url_match.group(1) module_path = get_module_path(module) module_resource_path = get_resource_path(module, url_match.group(2)) if module_path and module_resource_path: module_path = os.path.join(os.path.normpath(module_path), '') # join ensures the path ends with '/' module_resource_path = os.path.normpath(module_resource_path) if module_resource_path.startswith(module_path): with open(module_resource_path, 'rb') as f: content = base64.b64encode(f.read()) last_update = pycompat.text_type(os.path.getmtime(module_resource_path)) if not module_resource_path: module_resource_path = obj.url if not content: status = 301 content = module_resource_path else: content = obj[field] or '' # filename default_filename = False if not filename: if filename_field in obj: filename = obj[filename_field] if not filename and module_resource_path: filename = os.path.basename(module_resource_path) if not filename: default_filename = True filename = "%s-%s-%s" % (obj._name, obj.id, field) # mimetype mimetype = 'mimetype' in obj and obj.mimetype or False if not mimetype: if filename: mimetype = mimetypes.guess_type(filename)[0] if not mimetype and getattr(env[model]._fields[field], 'attachment', False): # for binary fields, fetch the ir.attachment for the mimetype check
attach_mimetype = env['ir.attachment'].search_read(domain=[('res_model', '=', model), ('res_id', '=', id), ('res_field', '=', field)], fields=['mimetype'], limit=1) mimetype = attach_mimetype and attach_mimetype[0]['mimetype'] if not mimetype: try: decoded_content = base64.b64decode(content) except base64.binascii.Error: # if we could not decode it, no need to pass it down: it would crash elsewhere... return (404, [], None) mimetype = guess_mimetype(decoded_content, default=default_mimetype) # extension _, existing_extension = os.path.splitext(filename) if not existing_extension or default_filename: extension = mimetypes.guess_extension(mimetype) if extension: filename = "%s%s" % (filename, extension) headers += [('Content-Type', mimetype), ('X-Content-Type-Options', 'nosniff')] # cache etag = bool(request) and request.httprequest.headers.get('If-None-Match') retag = '"%s"' % hashlib.md5(pycompat.to_text(content).encode('utf-8')).hexdigest() status = status or (304 if etag == retag else 200) headers.append(('ETag', retag)) headers.append(('Cache-Control', 'max-age=%s' % (STATIC_CACHE if unique else 0))) # content-disposition default name if download: headers.append(('Content-Disposition', cls.content_disposition(filename))) return (status, headers, content)
def get_suffix_for(value: bytes): mime_type = magic.from_buffer(value, mime=True) extension = mimetypes.guess_extension(mime_type) return extension
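`magic` here is presumably the third-party python-magic package; sniffing the buffer and then mapping the MIME type back to a suffix can fail at the second step, so a fallback is worth adding. A sketch:

import mimetypes
import magic  # third-party: pip install python-magic

def suffix_or_default(value: bytes, default: str = '.bin') -> str:
    mime_type = magic.from_buffer(value, mime=True)
    return mimetypes.guess_extension(mime_type) or default

# suffix_or_default(b'%PDF-1.4\n...') -> '.pdf' (libmagic recognises the header)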
def url_fetch_completed(cls, usr, url_name, directory, archive_html, row, settings_row, media_path, media_element, *args): ext = None save = False save_text = False favicon_link = None final_og_link = None summary = 'none' req = args[-1] tags_list = [] save_summary = False if req and req.content_type: if ';' in req.content_type: content_type = req.content_type.split(';')[0].strip() else: content_type = req.content_type if content_type == 'text/plain': ext = '.txt' else: ext = guess_extension(content_type) logger.debug('{} ----> {}'.format(content_type, ext)) if req and req.html and not req.binary: if 'text/html' in req.content_type: soup = BeautifulSoup(req.html, 'html.parser') if soup.title: title = soup.title.text if title.lower() == 'youtube': try_srch = re.search('document.title[^;]*', req.html) if try_srch: title = try_srch.group().replace('document.title = ', '') else: title = cls.unquote_title(url_name) ilink = soup.find('link', {'rel':'icon'}) slink = soup.find('link', {'rel':'shortcut icon'}) mlink = soup.find('meta', {'property':'og:image'}) if mlink: final_og_link = mlink.get('content', '') if ilink: favicon_link = cls.format_link(ilink.get('href'), url_name) elif slink: favicon_link = cls.format_link(slink.get('href'), url_name) else: for link in soup.find_all('link'): rel = link.get('href') if (rel and (rel.endswith('.ico') or '.ico' in rel)): favicon_link = cls.format_link(rel, url_name) if not favicon_link: urlp = urlparse(url_name) favicon_link = urlp.scheme + '://' + urlp.netloc + '/favicon.ico' if archive_html or (settings_row and settings_row.auto_archive): save_text = True if settings_row and (settings_row.autotag or settings_row.auto_summary): summary, tags_list = Summarizer.get_summary_and_tags(req.html, settings_row.total_tags) else: title = cls.unquote_title(url_name) save = True elif req and req.binary: title = cls.unquote_title(url_name) save = True else: ext = '.bin' title = url_name.rsplit('/', 1)[-1] if row is None: if settings_row and settings_row.reader_theme: reader_theme = settings_row.reader_theme else: reader_theme = UserSettings.WHITE row = Library.objects.create(usr=usr, directory=directory, url=url_name, title=title, summary=summary, timestamp=timezone.now(), media_element=media_element, reader_mode=reader_theme) else: logger.debug('row - exists') if not media_path: if ext and ext.startswith('.'): out_dir = ext[1:].upper() else: out_dir = str(ext).upper() if not ext: print(req.content_type) out_title = str(row.id) + str(ext) media_dir = os.path.join(settings.ARCHIVE_LOCATION, out_dir) if not os.path.exists(media_dir): os.makedirs(media_dir) if not os.path.exists(settings.FAVICONS_STATIC): os.makedirs(settings.FAVICONS_STATIC) media_path_parent = os.path.join(media_dir, str(row.id)) final_favicon_path = os.path.join(settings.FAVICONS_STATIC, str(row.id) + '.ico') final_og_image_path = os.path.join(settings.FAVICONS_STATIC, str(row.id) + '.png') media_path = os.path.join(media_path_parent, out_title) row.media_path = media_path row.save() if favicon_link and favicon_link.startswith('http'): cls.vnt.get(favicon_link, out=final_favicon_path) logger.debug(final_og_link) if final_og_link and final_og_link.startswith('http'): cls.vnt.get(final_og_link, out=final_og_image_path) elif media_path and row: final_favicon_path = os.path.join(settings.FAVICONS_STATIC, str(row.id) + '.ico') final_og_image_path = os.path.join(settings.FAVICONS_STATIC, str(row.id) + '.png') media_path_parent, out_title = os.path.split(media_path) if settings_row and 
settings_row.auto_summary and summary: row.summary = summary if settings_row and not tags_list: row.save() else: save_summary = True if (not os.path.exists(final_favicon_path) and favicon_link and favicon_link.startswith('http')): cls.vnt.get(favicon_link, out=final_favicon_path) if (not os.path.exists(final_og_image_path) and final_og_link and final_og_link.startswith('http')): cls.vnt.get(final_og_link, out=final_og_image_path) if save or save_text: if not os.path.exists(media_path_parent): os.makedirs(media_path_parent) if save: cls.vnt.get(url_name, out=media_path) else: with open(media_path, 'w') as fd: fd.write(req.html) if settings_row and ext in ['.htm', '.html']: cls.convert_html_pdf(media_path_parent, settings_row, row, url_name, media_path, media_element) if settings_row and tags_list: if save_summary: cls.edit_tags(usr, row.id, ','.join(tags_list), '', old_row=row) else: cls.edit_tags(usr, row.id, ','.join(tags_list), '') return row.id
def export_inbox(connection, fetch_uid, fetch_protocol): sp_ch = [ '<', '>', '?', '*', ':', '|', '/', '"', '\\', '\r', '\n', '\t', '\b', '\a' ] success_var, email_data = connection.uid('fetch', fetch_uid, fetch_protocol) email_data = email.message_from_bytes(email_data[0][1]) email_info = { 'Subject': '', 'From': '', 'To': '', 'Date': '', 'Encryption': '' } disposition = [] for email_part in email_data.walk(): if bool(email_part['Subject']): email_info['Subject'] = email_part['Subject'] if bool(email_part['From']): email_info['From'] = email_part['From'] if bool(email_part['To']): email_info['To'] = email_part['To'] if bool(email_part['Date']): email_info['Date'] = email_part['Date'] if bool(email_part['Content-Disposition']): disposition.append(email_part) if bool(email_part['Encryption']): email_info['Encryption'] = email_part['Encryption'] email_subject = email_info['Subject'] if not bool(email_subject): email_subject = '(no subject)' email_sender = email_info['From'] email_sender = re.findall('[^<> ]+@[^<> ]+', email_sender)[0] email_sender = ''.join(x for x in email_sender if x not in sp_ch) email_receiver = email_info['To'] if not bool(email_receiver): email_receiver = '(empty)' email_time = email_info['Date'] sub_dirname = dateutil.parser.parse(email_time).astimezone( dateutil.tz.tzlocal()).strftime('%d.%m.%Y-%H.%M.%S') sub_dirname = f'{fetch_uid.decode("utf-8")}_{email_sender}_{sub_dirname}' datetime = dateutil.parser.parse(email_time).astimezone( dateutil.tz.tzlocal()).strftime('%d/%m/%Y %X') key = None if bool(email_info['Encryption']): print('\n\tThis E-Mail is secured by encryption.') print( f'\n\tHINTS:\n\t\tSender : {email_sender}\n\t\tSubject : {email_subject}' ) print('\n\tYou have only 5 attempts to enter the correct password.') print('\tAfter that the E-Mail will be deleted.') print('\n\tBrowse for a key containing file instead (5 attempts) ?') response = yesno() if response in ['Y', 'y']: win = tkinter.Tk() win.geometry('200x50') win.title('Non functional window') label = tkinter.Label(win, text='IGNORE THIS WINDOW') label.pack() for i in range(5): print(f'\n\t[Attempt {i+1} of 5]') key_path = filedialog.askopenfilename( initialdir=os.environ.get('userprofile'), title='Select the file having the key', filetypes=[('Binary', '*.bin')]) with open(key_path, 'rb') as f: key = f.read() #os.remove(key_path) if len(key) == 32: print('\tKEY ACCEPTED\n\n\tDecrypting .....') break else: print('\tWRONG KEY') key = None continue win.destroy() else: for i in range(5): print(f'\n\t[Attempt {i+1} of 5]') anon_id = input('\n\tPASSWORD : ') key_name = 'temp-{}'.format(random.randint(0, 9999)) api_handler = AnonFile('api_key') sys.stdout = open(os.devnull, 'w') api_handler.download_file( f'https://anonfiles.com/{anon_id}/{key_name}_bin') sys.stdout = sys.__stdout__ #with open(api_res, 'r') as f : # dl_res = f.read() #os.remove(api_res) #if 'Error -- 403: Forbidden' in dl_res : # print('\tWRONG PASSWORD') # continue if os.path.exists(f'{key_name}.bin'): with open(f'{key_name}.bin', 'rb') as f: key = f.read() os.remove(f'{key_name}.bin') print('\tPASSWORD ACCEPTED\n\n\tDecrypting .....') break else: print('\tWRONG PASSWORD') continue else: print('\tWRONG PASSWORD') continue else: print('\tWRONG PASSWORD') continue if not key: print('\n\tAll attempts failed.\n\tDeleting the E-Mail .....') connection.uid('store', fetch_uid, '+FLAGS', '\\Deleted') connection.expunge() print('\tE-Mail deleted.') return # existence of key implies existence of encryption and vice-versa otherwise the function would have
quit by now. sub_dirname = os.path.join(os.getcwd(), sub_dirname) check_isdir(sub_dirname) os.chdir(sub_dirname) attachment_dir = os.path.join(os.getcwd(), 'attachments') for part in disposition: filename = part.get_filename() if filename == None: filename = '' else: filename = os.path.basename(filename) if key: if filename.endswith('.encrypted'): filename = filename.replace('.encrypted', '') else: print( '\nSomeone messed with the mail-sending source.\nAttachment names may be affected.' ) filename = ''.join(x for x in filename if not x in sp_ch) name, ext = os.path.splitext(filename) if not ext: ext = mimetypes.guess_extension(part.get_content_type()) if not name: if 'attachment' in part['Content-Disposition']: name = 'untitled-attachment-{}'.format(random.randint(0, 9999)) elif 'inline' in part['Content-Disposition']: name = 'untitled-inline-{}'.format(random.randint(0, 999)) filename = name + ext if len(filename) > 255: name, ext = os.path.splitext(filename) name = name[:(255 - len(ext))] filename = name + ext check_isdir(attachment_dir) os.chdir(attachment_dir) with open(filename, 'wb') as file: payload = part.get_payload(decode=True) if key: payload = decrypt(payload, key) file.write(payload) os.chdir(sub_dirname) with open('main-body.html', 'wb') as file: initial = ( f'<html><body>\nSubject\t:\t{email_subject}<br><br>From\t:\t{email_sender}<br><br>\ To\t:\t{email_receiver}<br><br>Date\t:\t{datetime}<br><br>' + '*' * 200 + '<br><br>\n</body></html>\n\n').encode('utf-8') file.write(initial) for email_part in email_data.walk(): if email_part.get_content_maintype() == 'text': if mimetypes.guess_extension( email_part.get_content_type()) == '.html': payload = email_part.get_payload(decode=True) if key: payload = base64.b64decode(payload) payload = decrypt(payload, key) file.write(payload) elif mimetypes.guess_extension( email_part.get_content_type()) == '.txt': tmp_name = ''.join( chr(random.randint(97, 122)) for i in range(10)) tmp_file = open(f'{tmp_name}.txt', 'wb') payload = email_part.get_payload(decode=True) if key: payload = base64.b64decode(payload) payload = decrypt(payload, key) tmp_file.write(payload) tmp_file.close() os.system('rst2html5 {0}.txt > {0}.html'.format(tmp_name)) tmp_file = open(f'{tmp_name}.html', 'rb') file.write(tmp_file.read()) tmp_file.close() os.remove(f'{tmp_name}.txt') os.remove(f'{tmp_name}.html')
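The 255-character clamp near the end of `export_inbox` deserves to be a helper of its own, since most filesystems limit the name component (not the whole path) to 255 bytes; a sketch:

import os

def clamp_filename(filename, limit=255):
    # Trim the stem so stem + extension still fits within the limit.
    if len(filename) <= limit:
        return filename
    name, ext = os.path.splitext(filename)
    return name[:limit - len(ext)] + ext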
def main(): global args global config global es global verbose global rcode parser = argparse.ArgumentParser( description = 'Unpack MIME attachments from a file and check them against virustotal.com') parser.add_argument('-d', '--directory', dest = 'directory', help = 'directory where files will be extracted (default: /tmp) %%d,%%m,%%y can be used for dynamic names', metavar = 'DIRECTORY') parser.add_argument('-v', '--verbose', action = 'store_true', dest = 'verbose', help = 'verbose output', default = False) parser.add_argument('-c', '--config', dest = 'config_file', help = 'configuration file (default: /etc/mime2vt.conf)', metavar = 'CONFIG') parser.add_argument('-l', '--log', dest = 'dump_file', help = 'mail dump file (default /tmp/message.dump)', metavar = 'DUMPFILE') args = parser.parse_args() # Default values if not args.directory: args.directory = '/tmp' if not args.config_file: args.config_file = '/etc/mime2vt.conf' #writeLog('DEBUG: config_file = %s' % args.config_file) try: c = ConfigParser.ConfigParser() c.read(args.config_file) config['apiKey'] = c.get('virustotal', 'apikey') excludetypes = c.get('virustotal', 'exclude').split(',') # Elasticsearch config config['esServer'] = c.get('elasticsearch', 'server') config['esIndex'] = c.get('elasticsearch', 'index') config['dbPath'] = c.get('database', 'dbpath') except OSError as e: writeLog('Cannot read config file %s: %s' % (args.config_file, e.errno)) sys.exit(1) if config['esServer']: logging.basicConfig() es = Elasticsearch([config['esServer']]) # Create the SQLite DB dbCreate() # Read the mail flow from STDIN data = "".join(sys.stdin) msg = email.message_from_string(data) if usePyzMail: mailheaders = parseMailheaders(data) if args.dump_file: try: fp = open(args.dump_file, 'a') except OSError as e: writeLog('Cannot dump message to %s: %s' % (args.dump_file, e.errno)) fp.write(data) fp.close() # Process MIME parts for part in msg.walk(): contenttype = part.get_content_type() filename = part.get_param('name') # Hack: Search for a .js extension try: fname, fextension = os.path.splitext(filename) except Exception: fextension = "none" data = part.get_payload(None, True) if data: md5 = hashlib.md5(data).hexdigest() #if dbMD5Exists(md5): # writeLog("Skipping existing MD5 %s" % md5) # continue # New: Extract URLS if contenttype in [ 'text/html', 'text/plain' ]: urls = [] # Source: https://gist.github.com/uogbuji/705383 GRUBER_URLINTEXT_PAT = re.compile(ur'(?i)\b((?:https?://|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?\xab\xbb\u201c\u201d\u2018\u2019]))') lines = data.split('\n') for line in lines: try: #urls.append(re.search("(?P<url>https?://[^\s]+)", word).group("url")) for url in GRUBER_URLINTEXT_PAT.findall(line): if url[0]: urls.append(url[0]) except Exception: pass fp = open('/var/tmp/urls.log', 'a') for url in urls: fp.write("%s\n" % url) fp.close() # Process only interesting files # if contenttype not in ('text/plain', 'text/html', 'image/jpeg', 'image/gif', 'image/png'): if contenttype not in excludetypes or fextension == '.js': if not filename: filename = md5 mime_ext = mimetypes.guess_extension(contenttype) if not mime_ext: # Use a generic bag-of-bits extension mime_ext = '.bin' f_name, f_ext = os.path.splitext(filename) if not f_ext: filename += mime_ext writeLog('Found interesting file: %s (%s)' % (filename, contenttype)) fp = open(os.path.join(generateDumpDirectory(args.directory), filename), 'wb') fp.write(data) fp.close() if contenttype
in ['application/zip', 'application/x-zip-compressed']: # Process ZIP archive writeLog('Processing zip archive: %s' % filename) processZipFile(os.path.join(generateDumpDirectory(args.directory), filename)) else: # Check VT score vt = VirusTotalPublicApi(config['apiKey']) response = vt.get_file_report(md5) # Save results to Elasticsearch if config['esServer']: try: response['@timestamp'] = time.strftime("%Y-%m-%dT%H:%M:%S+01:00") response['filename'] = filename if usePyzMail: response['mail'] = mailheaders res = es.index(index=config['esIndex'], doc_type="VTresult", body=json.dumps(response)) except Exception: writeLog("Cannot index to Elasticsearch") # DEBUG fp = open('/tmp/vt.debug', 'a') fp.write(json.dumps(response, sort_keys=False, indent=4)) fp.close() vtScore = "0/0" if response['response_code'] == 200: if response['results']['response_code']: positives = response['results']['positives'] total = response['results']['total'] scan_date = response['results']['scan_date'] vtScore = str(positives) + "/" + str(total) if positives > 0: rcode = 1 writeLog('File: %s (%s) Score: %s Scanned: %s (%s)' % (filename, md5, vtScore, scan_date, timeDiff(scan_date))) else: # Do not resubmit existing MD5 if not dbMD5Exists(md5): writeLog('File: %s (%s) not found, submitted for scanning' % (filename, md5)) submit2vt(os.path.join(generateDumpDirectory(args.directory), filename)) dbAddMD5(md5, filename, vtScore) else: writeLog('VT Error: %s' % response['error']) # Analyze OLE documents if API is available parseOLEDocument(os.path.join(generateDumpDirectory(args.directory), filename))
def makesane(row): # replace % characters in URL eg %20 by space urlpath = urllib2.unquote(row[3]) urlprefix = "http://10.129.50.5/nvli/data/" # Strip urlprefix to find the relative path in local partition srcfile = re.sub(urlprefix, '', urlpath) # the local partition srcdir = "/NFSMount/SV-Patel_Data/nvli" srcpath = '/'.join([srcdir, srcfile]) title = row[10].strip() title = title.rstrip('.') title = title.strip() # mimetypes library does not seem to use magic, so this does not work #fmt = mimetypes.guess_extension(mimetypes.guess_type(srcpath)[0]) ## https://github.com/ahupp/python-magic ## pip install python-magic try: ext = mimetypes.guess_extension(magic.from_file(srcpath, mime=True)) except EnvironmentError: # parent of IOError, OSError *and* WindowsError where available # print 'Error, file not found: %s' % srcpath ext = "." + row[20].lower() except: print "For from_file missing error: pip install python-magic" sys.exit(0) # some standard mappings where the default provided by python is # not conventional (eg: .jpe for .jpeg or .jpg ext = extnmap(ext) # rename the original extension to that of magic row[20] = ext.lstrip('.') # made from uniq ID, title and extension filename = sanefilename([title]) # remove common words # get the first n words of title filename = '-'.join(filename.split('-')[:6]) # prefix and suffix filename = sanefilename([row[9], filename]) + ext #filename = sanefilename([row[9], title]) + ext # because sanefilename strips . from filename # source sourceid = row[6] dirname = sanefilename([sourceid]) # create a subdir for each hyphenated part of uniq ID dirname = re.sub(r"-", '/', dirname) relpath = 'archive/' + dirname + '/' + filename destroot = "/NFSMount/sardar/files" destpath = '/'.join([destroot, relpath]) dirname = os.path.dirname(destpath) if not os.path.exists(dirname): os.makedirs(dirname) #os.rename is a mv and needs permissions to delete srcpath print 'Copying from %s to \n\t%s ' % (srcpath, destpath) sys.stdout.flush() try: shutil.copyfile(srcpath, destpath) sane = [row[9], relpath, row[4]] + row[10:] # eg. src and dest are the same file except shutil.Error as e: print('Error: %s' % e) sane = "Error" # eg. source or destination doesn't exist except IOError as e: print('Error: %s, %s' % (srcpath, e.strerror)) sane = "Error" #sane = [filename] return sane
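`extnmap` is this script's own helper and its table isn't shown; a hypothetical equivalent illustrating the kind of aliasing the comments describe (the mappings are common conventions, not taken from the source):

EXT_ALIASES = {
    '.jpe': '.jpg',   # mimetypes can yield '.jpe' for image/jpeg
    '.jpeg': '.jpg',
    '.htm': '.html',
}

def extnmap_sketch(ext):
    return EXT_ALIASES.get(ext, ext)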
def _download_http_url( link, # type: Link session, # type: PipSession temp_dir, # type: str hashes, # type: Hashes progress_bar # type: str ): # type: (...) -> Tuple[str, str] """Download link url into temp_dir using provided session""" target_url = link.url.split('#', 1)[0] try: resp = session.get( target_url, # We use Accept-Encoding: identity here because requests # defaults to accepting compressed responses. This breaks in # a variety of ways depending on how the server is configured. # - Some servers will notice that the file isn't a compressible # file and will leave the file alone and with an empty # Content-Encoding # - Some servers will notice that the file is already # compressed and will leave the file alone and will add a # Content-Encoding: gzip header # - Some servers won't notice anything at all and will take # a file that's already been compressed and compress it again # and set the Content-Encoding: gzip header # By setting this to request only the identity encoding We're # hoping to eliminate the third case. Hopefully there does not # exist a server which when given a file will notice it is # already compressed and that you're not asking for a # compressed file and will then decompress it before sending # because if that's the case I don't think it'll ever be # possible to make this work. headers={"Accept-Encoding": "identity"}, stream=True, ) resp.raise_for_status() except requests.HTTPError as exc: logger.critical( "HTTP error %s while getting %s", exc.response.status_code, link, ) raise content_type = resp.headers.get('content-type', '') filename = link.filename # fallback # Have a look at the Content-Disposition header for a better guess content_disposition = resp.headers.get('content-disposition') if content_disposition: type, params = cgi.parse_header(content_disposition) # We use ``or`` here because we don't want to use an "empty" value # from the filename param. filename = params.get('filename') or filename ext = splitext(filename)[1] if not ext: ext = mimetypes.guess_extension(content_type) if ext: filename += ext if not ext and link.url != resp.url: ext = os.path.splitext(resp.url)[1] if ext: filename += ext file_path = os.path.join(temp_dir, filename) with open(file_path, 'wb') as content_file: _download_url(resp, link, content_file, hashes, progress_bar) return file_path, content_type
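`cgi.parse_header`, used above for Content-Disposition, was removed from the stdlib in Python 3.13; the `email` package can parse the same header. A sketch of the filename logic on that basis (the helper name is mine):

import mimetypes
import os
from email.message import Message

def filename_from_headers(content_disposition, content_type, fallback):
    msg = Message()
    if content_disposition:
        msg['Content-Disposition'] = content_disposition
    # get_filename() extracts the filename= parameter, handling quoting for us.
    filename = msg.get_filename() or fallback
    if not os.path.splitext(filename)[1]:
        ext = mimetypes.guess_extension(content_type.split(';')[0])
        if ext:
            filename += ext
    return filename

# filename_from_headers('attachment; filename="pkg-1.0.tar.gz"', 'application/gzip', 'download')
# -> 'pkg-1.0.tar.gz'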
def output(self, task, entry, config): """Moves temp-file into final destination Raises: PluginError if operation fails """ if 'file' not in entry and not task.options.test: log.debug('file missing, entry: %s', entry) raise plugin.PluginError( 'Entry `%s` has no temp file associated with it' % entry['title']) try: # use path from entry if it has one, otherwise use from download definition parameter path = entry.get('path', config.get('path')) if not isinstance(path, str): raise plugin.PluginError('Invalid `path` in entry `%s`' % entry['title']) # override path from command line parameter if task.options.dl_path: path = task.options.dl_path # expand variables in path try: path = os.path.expanduser(entry.render(path)) except RenderError as e: entry.fail( 'Could not set path. Error during string replacement: %s' % e) return # Clean illegal characters from path name path = pathscrub(path) # If we are in test mode, report and return if task.options.test: log.info('Would write `%s` to `%s`', entry['title'], path) # Set a fake location, so the exec plugin can do string replacement during --test #1015 entry['location'] = os.path.join(path, 'TEST_MODE_NO_OUTPUT') return # make path if not os.path.isdir(path): log.debug('Creating directory %s', path) try: os.makedirs(path) except OSError: raise plugin.PluginError('Cannot create path %s' % path, log) # check that temp file is present if not os.path.exists(entry['file']): log.debug('entry: %s', entry) raise plugin.PluginWarning( 'Downloaded temp file `%s` doesn\'t exist!?' % entry['file']) if config.get('filename'): try: entry['filename'] = entry.render(config['filename']) log.debug('set filename from config %s' % entry['filename']) except RenderError as e: entry.fail( 'Could not set filename. Error during string replacement: %s' % e) return # if we still don't have a filename, try making one from title (last resort) elif not entry.get('filename'): entry['filename'] = entry['title'] log.debug('set filename from title %s', entry['filename']) if 'mime-type' not in entry: log.warning( 'Unable to figure proper filename for %s. Using title.', entry['title']) else: guess = mimetypes.guess_extension(entry['mime-type']) if not guess: log.warning( 'Unable to guess extension for mime-type %s', entry['mime-type']) else: self.filename_ext_from_mime(entry) name = entry.get('filename', entry['title']) # Remove illegal characters from filename #325, #353 name = pathscrub(name) # Remove directory separators from filename #208 name = name.replace('/', ' ') if sys.platform.startswith('win'): name = name.replace('\\', ' ') # remove duplicate spaces name = ' '.join(name.split()) # combine to full path + filename destfile = os.path.join(path, name) log.debug('destfile: %s', destfile) if os.path.exists(destfile): import filecmp if filecmp.cmp(entry['file'], destfile): log.debug("Identical destination file '%s' already exists", destfile) elif config.get('overwrite'): log.debug("Overwriting already existing file %s", destfile) else: log.info( 'File `%s` already exists and is not identical, download failed.', destfile) entry.fail( 'File `%s` already exists and is not identical.'
% destfile) return else: # move temp file log.debug('moving %s to %s', entry['file'], destfile) try: shutil.move(entry['file'], destfile) except (IOError, OSError) as err: # ignore permission errors, see ticket #555 import errno if not os.path.exists(destfile): raise plugin.PluginError('Unable to write %s: %s' % (destfile, err)) if err.errno != errno.EPERM and err.errno != errno.EACCES: raise else: del entry['file'] # store final destination as output key entry['location'] = destfile finally: self.cleanup_temp_file(entry)
def guess_extension(mimetype): x = mimetypes.guess_extension(mimetype) if x == '.jpe': return '.jpeg' return x
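The '.jpe' check above appears all over this collection because '.jpe' sorts first among the image/jpeg aliases in many type maps:

import mimetypes

mimetypes.guess_all_extensions('image/jpeg')  # e.g. ['.jpe', '.jpeg', '.jpg']
mimetypes.guess_extension('image/jpeg')       # may be '.jpe', hence the wrapper above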
def process_mailbox(M): rv, data = M.search(None, "ALL") if rv != 'OK': print("No messages found!") return counter = 0 for num in data[0].split(): try: # time.sleep(1) rv, data = M.fetch(num, '(RFC822)') if rv != 'OK': print("ERROR getting message", num) return msg = email.message_from_bytes(data[0][1]) hdr = email.header.make_header( email.header.decode_header(msg['Subject'])) subject = str(hdr) date_tuple = email.utils.parsedate_tz(msg['Date']) local_date = '' if date_tuple: local_date = datetime.datetime.fromtimestamp( email.utils.mktime_tz(date_tuple)) conn = '' try: conn = MongoClient('localhost', 27017) print("Connected successfully!!!") except: print("Could not connect to MongoDB") # database db = conn.hash # Created or Switched to collection names: testGmailAnup collection = db.ib if db.ib.find({'email_timestamp': str(msg['Date'])}).count() > 0: continue # Attachment attachmenturl = '' emailbody = '' for part in msg.walk(): try: # multipart/* are just containers if part.get_content_maintype() == 'multipart': for part in msg.walk(): if part.get_content_type() == 'text/plain': emailbody = part.get_payload( ) # prints the raw text filename = part.get_filename() if not filename: ext = mimetypes.guess_extension( part.get_content_type()) if not ext: ext = '.bin' filename = 'part-%03d%s' % (counter, ext) filename = str(uuid.uuid1()) + filename f = open('%s/%s' % (Attachment_DIRECTORY, filename), 'wb') f.write(part.get_payload(decode=True)) f.close() if attachmenturl == '': attachmenturl = Attachment_DIRECTORY + "/" + filename else: attachmenturl = attachmenturl + "," + Attachment_DIRECTORY + "/" + filename except Exception: continue # or you could use 'continue' timestamp = int(time.time()) # timestamp # From if not found email_sender = '' email_sender_id = '' try: if '<' in msg['From']: email_sender = msg['From'].split('<')[0] email_sender_id = msg['From'].split('<')[1].replace( ">", " ") else: email_sender_id = msg['From'] except: print("") # To if not found email_recipeint = '' email_recipient_id = '' try: if '<' in msg['To']: email_recipeint = msg['To'].split('<')[0] email_recipient_id = msg['To'].split('<')[1].replace( ">", " ") else: email_recipient_id = msg['To'] except: print("") # CC if not found email_recipeint_CC = '' email_recipient_CC_id = '' try: if '<' in msg['Cc']: email_recipeint_CC = msg['Cc'].split('<')[0] email_recipient_CC_id = msg['Cc'].split('<')[1].replace( ">", " ") else: email_recipient_CC_id = msg['Cc'] except: print("") # CC if not found email_recipeint_CCO = '' email_recipient_CCO_ID = '' try: if '<' in msg['Bcc']: email_recipeint_CCO = msg['Bcc'].split('<')[0] email_recipient_CCO_ID = msg['Bcc'].split('<')[1].replace( ">", " ") else: email_recipient_CCO_ID = msg['Bcc'] except: print("") emp_rec1 = { "tphashobject_metadata_tib": "8f7074d8-a520-4f7d-b2d3-09dc36acb5fd", "tphashobject_metadata_tib_name": "TPEMAIL", "tpemail_metadata_mail_box_name": "Frederico Gmail", "tpemail_metadata_id_mail_box": str(uuid.uuid1()), "tpemail_metadata_time": timestamp, "tpemail_metadata_time_zone": strftime("%z", gmtime()), "tpemail_metadata_email_subject": subject, "tpemail_metadata_email_sender": email_sender, "tpemail_metadata_email_sender_id": email_sender_id, "tpemail_metadata_email_recipeint": email_recipeint, "tpemail_metadata_email_recipient_id": email_recipient_id, "tpemail_metadata_email_timestamp": msg['Date'], "tpemail_metadata_email_header": "", "tpemail_metadata_email_body": emailbody, "tpemail_metadata_email_seq": "", "tpemail_metadata_email_text_content": "", 
"tpemail_metadata_email_html_content": "", "tpemail_metadata_email_eml_content": "", "tpemail_metadata_email_links": "", "tpemail_metadata_email_atach": attachmenturl, "tpemail_metadata_email_template_id": "", "tpemail_metadata_email_track_link": "", "tpemail_metadata_email_recipeint_cc": email_recipeint_CC, "tpemail_metadata_email_recipient_cc_id": email_recipient_CC_id, "tpemail_metadata_email_recipeint_cco": email_recipeint_CCO, "tpemail_metadata_email_recipient_cco_id": email_recipient_CCO_ID, "tphashobject_metadata_hash_owner_id": "g00zNU6n7WfhUI1u4A5ebxSN0732", "tpemail_metadata_hash_sender_id": "", "tpemail_metadata_hash_recipt_id": "", "tpemail_metadata_hash_sender_name": "", "tpemail_metadata_hash_receipt_name": "", "tpemail_metadata_hash_recipt_cc_id": "", "tpemail_metadata_hash_recipt_cco_id": "", "tphashobject_metadata_hub_group_id": "da0a7b22-fb15-46e0-9f5a-019263d79e36", "tphashobject_metadata_data_sinc_mongodb": "", "tphashobject_metadata_action": "", "tphashobject_metadata_role": "", "tphashobject_metadata_layout_role": "", "tphashobject_metadata_group_id": "9529b03b-38e8-4bdc-aa62-8055a4c36a55", "tpemailbox_metadata_IP_machine": host_ip } # Insert Data rec_id1 = collection.insert_one(emp_rec1) print("Data inserted with record ids", rec_id1) counter += 1 print(str(counter), "]") print('Subject :', subject) print('Raw Date:', msg['Date']) print('From :', msg['From'].split('<')[0]) print("") except Exception as e: print('Main Fail: ' + str(e)) continue # or you could use 'continue'
if e.errno != errno.EEXIST: raise counter = 1 numAttachments = 0 start = time.time() for part in msg.walk(): # multipart/* are just containers if part.get_content_maintype() == 'multipart': continue # Applications should really sanitize the given filename so that an # email message can't be used to overwrite important files filename = part.get_filename() LOG(("\tFound attachment: %s" % filename), False, True) if not filename: orig = '' ext = mimetypes.guess_extension(part.get_content_type()) if ((ext == '.exe') or (ext == '.pdf')): orig = ext ext = '.txt' if not ext: # Use a generic bag-of-bits extension ext = '.bin' filename = 'part-%03d%s%s' % (counter, orig, ext) counter += 1 # write the file to targetFolder outputFilename = ("%s-%s" % (GUID, filename)) LOG(("\tWriting as: %s" % outputFilename), False, True) fp = open(os.path.join(targetFolder, outputFilename), 'wb') fp.write(part.get_payload(decode=True)) numAttachments += 1 fp.close()
def download_past_media(self, dumper, target_id): """ Downloads the past media that has already been dumped into the database but has not been downloaded for the given target ID yet. Media whose formatted filename results in an already-existing file will be *ignored* and not re-downloaded again. """ # TODO Should this respect and download only allowed media? Or all? target_in = self.client.get_input_entity(target_id) target = self.client.get_entity(target_in) target_id = utils.get_peer_id(target) msg_cursor = dumper.conn.cursor() msg_cursor.execute( 'SELECT ID, Date, FromID, MediaID FROM Message ' 'WHERE ContextID = ? AND MediaID IS NOT NULL', (target_id, )) msg_row = msg_cursor.fetchone() while msg_row: media_row = dumper.conn.execute( 'SELECT LocalID, VolumeID, Secret, Type, MimeType, Name ' 'FROM Media WHERE ID = ?', (msg_row[3], )).fetchone() # Documents have attributes and they're saved under the "document" # namespace so we need to split it before actually comparing. media_type = media_row[3].split('.') media_type, media_subtype = media_type[0], media_type[-1] if media_type not in ('photo', 'document'): # Only photos or documents are actually downloadable msg_row = msg_cursor.fetchone() continue user_row = dumper.conn.execute( 'SELECT FirstName, LastName FROM User WHERE ID = ?', (msg_row[2], )).fetchone() if user_row: sender_name = '{} {}'.format(user_row[0] or '', user_row[1] or '').strip() else: sender_name = '' date = datetime.datetime.utcfromtimestamp(msg_row[1]) formatter = defaultdict(str, id=msg_row[0], context_id=target_id, sender_id=msg_row[2] or 0, type=media_subtype or 'unknown', ext=mimetypes.guess_extension(media_row[4] or '') or '.bin', name=utils.get_display_name(target) or 'unknown', sender_name=sender_name or 'unknown') if formatter['ext'] == '.jpe': formatter['ext'] = '.jpg' # Nobody uses .jpe for photos name = None if media_subtype == 'photo' else media_row[5] formatter['filename'] = name or date.strftime( '{}_%Y-%m-%d_%H-%M-%S'.format(formatter['type'])) filename = date.strftime(self.media_fmt).format_map(formatter) if not filename.endswith(formatter['ext']): if filename.endswith('.'): filename = filename[:-1] filename += formatter['ext'] if os.path.isfile(filename): __log__.debug('Skipping existing file %s', filename) else: __log__.info('Downloading to %s', filename) os.makedirs(os.path.dirname(filename), exist_ok=True) if media_type == 'document': self.client.download_file(types.InputDocumentFileLocation( id=media_row[0], version=media_row[1], access_hash=media_row[2]), file=filename) else: self.client.download_file(types.InputFileLocation( local_id=media_row[0], volume_id=media_row[1], secret=media_row[2]), file=filename) time.sleep(1) msg_row = msg_cursor.fetchone()
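The `defaultdict(str)` plus `str.format_map` pairing above is what lets a user-supplied filename template reference fields that may be absent; a reduced sketch with hypothetical values:

import datetime
from collections import defaultdict

formatter = defaultdict(str, id=42, type='photo', ext='.jpg')
template = '{type}/%Y-%m-%d_{id}{ext}'
date = datetime.datetime(2018, 1, 1)
print(date.strftime(template).format_map(formatter))  # photo/2018-01-01_42.jpg
print('{missing}{id}'.format_map(formatter))          # '42': unknown fields become ''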