def sync(self):
    """Acquire the remote account.

    Always fetches per-file metadata; when the run mode is "full" the
    file content is downloaded as well.  Work items are queued onto a
    threaded Downloader and this method blocks until the queue drains.
    Progress and normalization warnings are logged throughout.
    """
    started = datetime.now()
    downloader = Downloader.Downloader(self.project,
                                       self.oauth_provider.http_intercept,
                                       self._save_file,
                                       self.oauth_provider.get_auth_header,
                                       self.project.threads)
    if self.project.args.mode == "full":
        self.project.log("transaction", "Full acquisition initiated", "info", True)
    else:
        self.project.log("transaction", "Metadata acquisition initiated", "info", True)
    self.initialize_items()
    cnt = len(self.files)
    self.project.log("transaction", "Total items queued for acquisition: " + str(cnt), "info", True)
    self.metadata()
    for file in self.files:
        self.project.log("transaction", "Calculating " + file['path'], "info", True)
        # Directories have neither a content payload nor a metadata blob to fetch.
        if not file['is_dir']:
            # Bind the current item as a default argument so the deferred
            # callable is not affected by lambda late binding.
            download_uri = lambda f=file: self._get_download_uri(f)
            metadata_download_uri = (self.oauth_provider.config['API_ENDPOINT']
                                     + '/metadata/auto' + file['path'])
            parentmap = self._get_parent_mapping(file)
            filetitle = self._get_file_name(file)
            orig = os.path.basename(file['path'])
            if filetitle != orig:
                self.project.log("exception",
                                 "Normalized '{}' to '{}'".format(orig, filetitle),
                                 "warning", True)
            if 'bytes' in file:
                self.file_size_bytes += int(file['bytes'])
            save_metadata_path = Common.assert_path(
                os.path.normpath(os.path.join(
                    os.path.join(self.project.project_folders['metadata'], parentmap),
                    filetitle + ".json")),
                self.project)
            if save_metadata_path:
                self.project.log("transaction",
                                 "Queueing {} for download...".format(orig), "info", True)
                downloader.put(Downloader.DownloadSlip(metadata_download_uri, file,
                                                       save_metadata_path, 'path'))
            # Content is only pulled on a full acquisition.
            if self.project.args.mode == "full":
                save_download_path = Common.assert_path(
                    os.path.normpath(os.path.join(
                        os.path.join(self.project.project_folders['data'], parentmap),
                        filetitle)),
                    self.project)
                if save_download_path:
                    self.project.log("transaction",
                                     "Queueing {} for download...".format(orig), "info", True)
                    downloader.put(Downloader.DownloadSlip(download_uri, file,
                                                           save_download_path, 'path'))
    self.project.log("transaction",
                     "Total size of files to be acquired is {}".format(
                         Common.sizeof_fmt(self.file_size_bytes, "B")),
                     "highlight", True)
    if self.project.args.prompt:
        IO.get("Press ENTER to begin acquisition...")
    downloader.start()
    downloader.wait_for_complete()
    delt = datetime.now() - started
    self.project.log("transaction",
                     "Acquisition completed in {}".format(str(delt)), "highlight", True)
def _save_raw_mail(self, data, slip):
    """Persist one raw Gmail message.

    For every label on the message: append it to that label's mbox,
    save the decoded message text under data/<label>/, and extract any
    attachments into a sibling directory named after the message file.

    data -- response stream whose body is a Gmail API message resource
            (JSON containing a base64url 'raw' payload and 'labelIds')
    slip -- DownloadSlip carrying the relative save path
    """
    body = data.read().decode('utf-8')
    msg = json.loads(body)
    msg_data = base64.urlsafe_b64decode(msg["raw"]).decode('utf-8')
    labels = msg["labelIds"]
    data_dir = self.project.project_folders["data"]
    for label in labels:
        mbox = mailbox.mbox(os.path.join(self.mbox_dir, label))
        mbox_msg = email.message_from_bytes(msg_data.encode(), mailbox.mboxMessage)
        mbox.add(mbox_msg)
        label_path = os.path.join(data_dir, label)
        save_path = Common.assert_path(os.path.join(label_path, slip.savepath),
                                       self.project)
        if save_path:
            if not os.path.isdir(os.path.dirname(save_path)):
                os.makedirs(os.path.dirname(save_path), exist_ok=True)
            self.project.savedata(msg_data, save_path, False)
            self.project.log("transaction", "Saved file to " + save_path, "info", True)
        for part in mbox_msg.walk():
            content_disposition = part.get("Content-Disposition", None)
            if content_disposition:
                payload = part.get_payload(decode=True)
                att_name = part.get_filename()
                if att_name:
                    # Attachment dir is the message filename without its extension.
                    att_dir = os.path.join(label_path,
                                           slip.savepath[:slip.savepath.index('.')])
                    att_path = os.path.join(att_dir, att_name)
                    os.makedirs(att_dir, exist_ok=True)
                    with open(att_path, 'wb') as f:
                        f.write(payload)
                    # Fix: log the attachment's own path (was logging the
                    # enclosing message's save_path).
                    self.project.log("transaction",
                                     "Saved attachment to " + att_path, "info", True)
        # Flush per label so every mbox is committed, not just the last one.
        mbox.flush()
def _save_metadata(self, data, slip):
    """Persist metadata for one Gmail thread.

    Writes a JSON file per message (metadata/<label>/<thread>/<msg>/)
    and per thread (metadata/<label>/<thread>/), and appends one
    quoted-CSV row per message to the aggregate metadata file.
    """
    body = data.read().decode('utf-8')
    thread = json.loads(body)
    # 'with' guarantees the aggregate CSV handle is closed even if a
    # message raises part-way through (the old open()/close() leaked it).
    with open(self.metadata_file, 'ab') as f:
        for message in thread['messages']:
            for label in message['labelIds']:
                label_dir = os.path.join(self.project.project_folders['metadata'], label)
                thread_dir = os.path.join(label_dir, thread['id'])
                message_dir = os.path.join(thread_dir, message['id'])
                msg_metadata_path = os.path.join(message_dir, message['id'] + ".json")
                msg_metadata_path = Common.assert_path(msg_metadata_path, self.project)
                # Save metadata of each message individually, inside
                # label/thread/message directory
                if msg_metadata_path:
                    os.makedirs(message_dir, exist_ok=True)
                    self.project.savedata(
                        json.dumps(message, sort_keys=True, indent=4),
                        msg_metadata_path, False)
                    self.project.log(
                        "transaction",
                        "Saving metadata to {}".format(msg_metadata_path),
                        "info", True)
                thread_metadata_path = os.path.join(thread_dir, thread['id'] + ".json")
                # Save metadata of each thread individually inside
                # label/thread directory
                thread_metadata_path = Common.assert_path(thread_metadata_path,
                                                          self.project)
                if thread_metadata_path:
                    os.makedirs(thread_dir, exist_ok=True)
                    self.project.savedata(
                        json.dumps(thread, sort_keys=True, indent=4),
                        thread_metadata_path, False)
                    self.project.log(
                        "transaction",
                        "Saving metadata to {}".format(thread_metadata_path),
                        "info", True)
            headers = message['payload']['headers']
            label_list = ",".join(message['labelIds'])
            # internalDate is epoch milliseconds; render as a UTC timestamp.
            internal_date = time.strftime("%Y-%m-%d %H:%M:%S",
                                          time.gmtime(int(message['internalDate']) / 1000))
            # One lookup per header; any falsy result collapses to 'N/A'
            # (the old form called extract_header_value twice per header).
            header_date = self.extract_header_value(headers, 'Date') or 'N/A'
            header_to = self.extract_header_value(headers, 'To') or 'N/A'
            header_from = self.extract_header_value(headers, 'From') or 'N/A'
            header_subject = self.extract_header_value(headers, 'Subject') or 'N/A'
            snippet = message['snippet']
            thread_id = thread['id']
            f.write(
                '"{id}","{internaldate}","{labels}","{headerdate}","{to}","{xfrom}","{subject}","{snippet}","{threadid}"{sep}'
                .format(id=message['id'], internaldate=internal_date,
                        labels=label_list, headerdate=header_date, to=header_to,
                        xfrom=header_from, subject=header_subject,
                        snippet=snippet, threadid=thread_id,
                        sep=os.linesep).encode('utf-8'))
def _save_raw_mail(self, data, slip):
    """Save a single raw Gmail message to disk and to per-label mboxes.

    The incoming stream holds the Gmail API message resource as JSON;
    its base64url 'raw' field is decoded to RFC-2822 text.  For each of
    the message's labels the text is appended to that label's mbox,
    written under data/<label>/, and attachments are extracted.
    """
    raw_json = data.read().decode('utf-8')
    msg = json.loads(raw_json)
    msg_data = base64.urlsafe_b64decode(msg["raw"]).decode('utf-8')
    data_dir = self.project.project_folders["data"]
    for label in msg["labelIds"]:
        mbox = mailbox.mbox(os.path.join(self.mbox_dir, label))
        mbox_msg = email.message_from_bytes(msg_data.encode(), mailbox.mboxMessage)
        mbox.add(mbox_msg)
        label_path = os.path.join(data_dir, label)
        save_path = Common.assert_path(os.path.join(label_path, slip.savepath),
                                       self.project)
        if save_path:
            parent = os.path.dirname(save_path)
            if not os.path.isdir(parent):
                os.makedirs(parent, exist_ok=True)
            self.project.savedata(msg_data, save_path, False)
            self.project.log("transaction", "Saved file to " + save_path, "info", True)
        for part in mbox_msg.walk():
            # Only MIME parts carrying a Content-Disposition are attachments.
            if not part.get("Content-Disposition", None):
                continue
            att_payload = part.get_payload(decode=True)
            att_name = part.get_filename()
            if att_name:
                att_dir = os.path.join(label_path,
                                       slip.savepath[:slip.savepath.index('.')])
                att_path = os.path.join(att_dir, att_name)
                os.makedirs(att_dir, exist_ok=True)
                with open(att_path, 'wb') as fh:
                    fh.write(att_payload)
                # Fix: report the attachment path, not the message path.
                self.project.log("transaction",
                                 "Saved attachment to " + att_path, "info", True)
        # Commit this label's mbox before moving to the next label.
        mbox.flush()
def _save_metadata(self, data, slip):
    """Write per-message and per-thread metadata JSON for a Gmail thread
    and append one CSV row per message to the aggregate metadata file.
    """
    raw_json = data.read().decode('utf-8')
    thread = json.loads(raw_json)
    # Context manager closes the aggregate file even on error; the old
    # bare open()/close() pair leaked the handle on any exception.
    with open(self.metadata_file, 'ab') as f:
        for message in thread['messages']:
            for label in message['labelIds']:
                label_dir = os.path.join(
                    self.project.project_folders['metadata'], label)
                thread_dir = os.path.join(label_dir, thread['id'])
                message_dir = os.path.join(thread_dir, message['id'])
                msg_metadata_path = Common.assert_path(
                    os.path.join(message_dir, message['id'] + ".json"),
                    self.project)
                # Save metadata of each message individually, inside
                # label/thread/message directory
                if msg_metadata_path:
                    os.makedirs(message_dir, exist_ok=True)
                    self.project.savedata(
                        json.dumps(message, sort_keys=True, indent=4),
                        msg_metadata_path, False)
                    self.project.log(
                        "transaction",
                        "Saving metadata to {}".format(msg_metadata_path),
                        "info", True)
                # Save metadata of each thread individually inside
                # label/thread directory
                thread_metadata_path = Common.assert_path(
                    os.path.join(thread_dir, thread['id'] + ".json"),
                    self.project)
                if thread_metadata_path:
                    os.makedirs(thread_dir, exist_ok=True)
                    self.project.savedata(
                        json.dumps(thread, sort_keys=True, indent=4),
                        thread_metadata_path, False)
                    self.project.log(
                        "transaction",
                        "Saving metadata to {}".format(thread_metadata_path),
                        "info", True)
            headers = message['payload']['headers']
            label_list = ",".join(message['labelIds'])
            # Gmail's internalDate is epoch milliseconds -> UTC string.
            internal_date = time.strftime(
                "%Y-%m-%d %H:%M:%S",
                time.gmtime(int(message['internalDate']) / 1000))
            # Single extract_header_value call per header; falsy -> 'N/A'.
            header_date = self.extract_header_value(headers, 'Date') or 'N/A'
            header_to = self.extract_header_value(headers, 'To') or 'N/A'
            header_from = self.extract_header_value(headers, 'From') or 'N/A'
            header_subject = self.extract_header_value(headers, 'Subject') or 'N/A'
            snippet = message['snippet']
            thread_id = thread['id']
            f.write(
                '"{id}","{internaldate}","{labels}","{headerdate}","{to}","{xfrom}","{subject}","{snippet}","{threadid}"{sep}'
                .format(id=message['id'],
                        internaldate=internal_date,
                        labels=label_list,
                        headerdate=header_date,
                        to=header_to,
                        xfrom=header_from,
                        subject=header_subject,
                        snippet=snippet,
                        threadid=thread_id,
                        sep=os.linesep).encode('utf-8'))
def sync(self):
    """Acquire a Google Drive account.

    Saves metadata for every item and, in "full" mode, downloads file
    content — skipping files whose local MD5 already matches the remote
    md5Checksum.  Trashed items are routed to trash/ and trash_metadata/
    folders.  Ends by running hash verification over the acquired set.
    """
    started = datetime.now()
    # Fix: always build a real Downloader instance.  Previously 'd' was
    # left bound to the Downloader class in metadata mode, so the
    # unconditional d.start()/d.wait_for_complete() below failed.
    downloader = Downloader.Downloader(self.project,
                                       self.oauth_provider.http_intercept,
                                       self._save_file,
                                       self.oauth_provider.get_auth_header,
                                       self.project.threads)
    if self.project.args.mode == "full":
        self.project.log("transaction", "Full acquisition initiated", "info", True)
    else:
        self.project.log("transaction", "Metadata acquisition initiated", "info", True)
    self.initialize_items()
    cnt = len(self.files)
    self.project.log("transaction", "Total items queued for acquisition: " + str(cnt), "info", True)
    self.metadata()
    trash_folder = os.path.join(self.project.acquisition_dir, "trash")
    trash_metadata_folder = os.path.join(self.project.acquisition_dir, "trash_metadata")
    for file in self.files:
        self.project.log("transaction", "Calculating " + file['title'], "info", True)
        download_uri = self._get_download_url(file)
        parentmap = self._get_parent_mapping(file, self.files)
        filetitle = self._get_file_name(file)
        if filetitle != file['title']:
            self.project.log("exception",
                             "Normalized '" + file['title'] + "' to '" + filetitle + "'",
                             "warning", True)
        # Trashed items are quarantined under their own folder pair.
        if file['labels']['trashed']:
            save_download_path = os.path.normpath(
                os.path.join(trash_folder, parentmap, filetitle))
            save_metadata_path = os.path.normpath(
                os.path.join(trash_metadata_folder, parentmap, filetitle + '.json'))
        else:
            save_download_path = os.path.normpath(
                os.path.join(self.project.project_folders["data"], parentmap, filetitle))
            save_metadata_path = os.path.normpath(
                os.path.join(self.project.project_folders["metadata"], parentmap,
                             filetitle + ".json"))
        save_download_path = Common.assert_path(save_download_path, self.project)
        save_metadata_path = Common.assert_path(save_metadata_path, self.project)
        if self.project.args.mode == "full" and save_download_path:
            v = {"remote_file": os.path.join(parentmap, file['title']),
                 "local_file": save_download_path}
            download_file = True
            if 'md5Checksum' in file:
                v['remote_hash'] = file['md5Checksum']
            if os.path.isfile(save_download_path):
                if 'md5Checksum' in file:
                    # Fix: close the handle deterministically (the bare
                    # open(...) passed to hashfile was never closed).
                    with open(save_download_path, 'rb') as existing:
                        file_hash = Common.hashfile(existing, hashlib.md5())
                    if file_hash == file['md5Checksum']:
                        download_file = False
                        self.project.log("exception",
                                         "Local and remote hash matches for " + file['title'] + " ... Skipping download",
                                         "warning", True)
                    else:
                        self.project.log("exception",
                                         "Local and remote hash differs for " + file['title'] + " ... Queuing for download",
                                         "critical", True)
                else:
                    self.project.log("exception",
                                     "No hash information for file ' " + file['title'] + "'",
                                     "warning", True)
            if download_file and download_uri:
                self.project.log("transaction",
                                 "Queueing " + file['title'] + " for download...",
                                 "info", True)
                downloader.put(Downloader.DownloadSlip(download_uri, file,
                                                       save_download_path, 'title'))
                if 'fileSize' in file:
                    self.file_size_bytes += int(file['fileSize'])
            # If it's a file we can add it to verification file
            if download_uri:
                self.verification.append(v)
        if save_metadata_path:
            self._save_file(json.dumps(file, sort_keys=True, indent=4),
                            Downloader.DownloadSlip(download_uri, file,
                                                    save_metadata_path, 'title'),
                            False)
    self.project.log("transaction",
                     "Total size of files to be acquired is {}".format(
                         Common.sizeof_fmt(self.file_size_bytes, "B")),
                     "highlight", True)
    if self.project.args.prompt:
        IO.get("Press ENTER to begin acquisition...")
    downloader.start()
    downloader.wait_for_complete()
    delt = datetime.now() - started
    self.verify()
    self.project.log("transaction",
                     "Acquisition completed in {}".format(str(delt)), "highlight", True)
def sync(self):
    """Run a Google Drive acquisition pass.

    Metadata is saved for every item; in "full" mode content is also
    downloaded, with an MD5 comparison used to skip files already on
    disk.  Trashed items land in trash/ and trash_metadata/.  Finishes
    with verification of the downloaded set.
    """
    t_start = datetime.now()
    # Instantiate unconditionally: leaving 'd' as the bare Downloader
    # class (as before, in metadata mode) broke the d.start() /
    # d.wait_for_complete() calls at the end of this method.
    dl = Downloader.Downloader(self.project,
                               self.oauth_provider.http_intercept,
                               self._save_file,
                               self.oauth_provider.get_auth_header,
                               self.project.threads)
    if self.project.args.mode == "full":
        self.project.log("transaction", "Full acquisition initiated", "info", True)
    else:
        self.project.log("transaction", "Metadata acquisition initiated", "info", True)
    self.initialize_items()
    cnt = len(self.files)
    self.project.log("transaction",
                     "Total items queued for acquisition: " + str(cnt), "info", True)
    self.metadata()
    trash_folder = os.path.join(self.project.acquisition_dir, "trash")
    trash_metadata_folder = os.path.join(self.project.acquisition_dir, "trash_metadata")
    for file in self.files:
        self.project.log("transaction", "Calculating " + file['title'], "info", True)
        download_uri = self._get_download_url(file)
        parentmap = self._get_parent_mapping(file, self.files)
        filetitle = self._get_file_name(file)
        if filetitle != file['title']:
            self.project.log("exception",
                             "Normalized '" + file['title'] + "' to '" + filetitle + "'",
                             "warning", True)
        # Route trashed items to the quarantine folders.
        if file['labels']['trashed']:
            dl_path = os.path.normpath(os.path.join(trash_folder, parentmap, filetitle))
            md_path = os.path.normpath(os.path.join(trash_metadata_folder, parentmap,
                                                    filetitle + '.json'))
        else:
            dl_path = os.path.normpath(os.path.join(
                self.project.project_folders["data"], parentmap, filetitle))
            md_path = os.path.normpath(os.path.join(
                self.project.project_folders["metadata"], parentmap, filetitle + ".json"))
        save_download_path = Common.assert_path(dl_path, self.project)
        save_metadata_path = Common.assert_path(md_path, self.project)
        if self.project.args.mode == "full" and save_download_path:
            v = {"remote_file": os.path.join(parentmap, file['title']),
                 "local_file": save_download_path}
            download_file = True
            if 'md5Checksum' in file:
                v['remote_hash'] = file['md5Checksum']
            if os.path.isfile(save_download_path):
                if 'md5Checksum' in file:
                    # with-block closes the local file after hashing
                    # (previously the handle was leaked).
                    with open(save_download_path, 'rb') as fh:
                        file_hash = Common.hashfile(fh, hashlib.md5())
                    if file_hash == file['md5Checksum']:
                        download_file = False
                        self.project.log("exception",
                                         "Local and remote hash matches for " + file['title'] + " ... Skipping download",
                                         "warning", True)
                    else:
                        self.project.log("exception",
                                         "Local and remote hash differs for " + file['title'] + " ... Queuing for download",
                                         "critical", True)
                else:
                    self.project.log("exception",
                                     "No hash information for file ' " + file['title'] + "'",
                                     "warning", True)
            if download_file and download_uri:
                self.project.log("transaction",
                                 "Queueing " + file['title'] + " for download...",
                                 "info", True)
                dl.put(Downloader.DownloadSlip(download_uri, file,
                                               save_download_path, 'title'))
                if 'fileSize' in file:
                    self.file_size_bytes += int(file['fileSize'])
            # If it's a file we can add it to verification file
            if download_uri:
                self.verification.append(v)
        if save_metadata_path:
            self._save_file(json.dumps(file, sort_keys=True, indent=4),
                            Downloader.DownloadSlip(download_uri, file,
                                                    save_metadata_path, 'title'),
                            False)
    self.project.log("transaction",
                     "Total size of files to be acquired is {}".format(
                         Common.sizeof_fmt(self.file_size_bytes, "B")),
                     "highlight", True)
    if self.project.args.prompt:
        IO.get("Press ENTER to begin acquisition...")
    dl.start()
    dl.wait_for_complete()
    delt = datetime.now() - t_start
    self.verify()
    self.project.log("transaction",
                     "Acquisition completed in {}".format(str(delt)), "highlight", True)
def sync(self):
    """Acquire the account's files: metadata always, content in "full" mode.

    Each non-directory item contributes a metadata download slip (and a
    content slip in full mode) to a threaded Downloader; the method
    blocks until all queued downloads complete.
    """
    t_start = datetime.now()
    dl = Downloader.Downloader(self.project,
                               self.oauth_provider.http_intercept,
                               self._save_file,
                               self.oauth_provider.get_auth_header,
                               self.project.threads)
    if self.project.args.mode == "full":
        self.project.log("transaction", "Full acquisition initiated", "info", True)
    else:
        self.project.log("transaction", "Metadata acquisition initiated", "info", True)
    self.initialize_items()
    cnt = len(self.files)
    self.project.log("transaction",
                     "Total items queued for acquisition: " + str(cnt), "info", True)
    self.metadata()
    for file in self.files:
        self.project.log("transaction", "Calculating " + file['path'], "info", True)
        # Only files carry content/metadata payloads; skip directories.
        if not file['is_dir']:
            # Default-argument binding pins this iteration's item for the
            # deferred URI lookup (avoids lambda late-binding).
            download_uri = lambda f=file: self._get_download_uri(f)
            metadata_download_uri = (self.oauth_provider.config['API_ENDPOINT']
                                     + '/metadata/auto' + file['path'])
            parentmap = self._get_parent_mapping(file)
            filetitle = self._get_file_name(file)
            orig = os.path.basename(file['path'])
            if filetitle != orig:
                self.project.log("exception",
                                 "Normalized '{}' to '{}'".format(orig, filetitle),
                                 "warning", True)
            if 'bytes' in file:
                self.file_size_bytes += int(file['bytes'])
            metadata_target = os.path.normpath(os.path.join(
                os.path.join(self.project.project_folders['metadata'], parentmap),
                filetitle + ".json"))
            save_metadata_path = Common.assert_path(metadata_target, self.project)
            if save_metadata_path:
                self.project.log("transaction",
                                 "Queueing {} for download...".format(orig),
                                 "info", True)
                dl.put(Downloader.DownloadSlip(metadata_download_uri, file,
                                               save_metadata_path, 'path'))
            if self.project.args.mode == "full":
                content_target = os.path.normpath(os.path.join(
                    os.path.join(self.project.project_folders['data'], parentmap),
                    filetitle))
                save_download_path = Common.assert_path(content_target, self.project)
                if save_download_path:
                    self.project.log("transaction",
                                     "Queueing {} for download...".format(orig),
                                     "info", True)
                    dl.put(Downloader.DownloadSlip(download_uri, file,
                                                   save_download_path, 'path'))
    self.project.log("transaction",
                     "Total size of files to be acquired is {}".format(
                         Common.sizeof_fmt(self.file_size_bytes, "B")),
                     "highlight", True)
    if self.project.args.prompt:
        IO.get("Press ENTER to begin acquisition...")
    dl.start()
    dl.wait_for_complete()
    delt = datetime.now() - t_start
    self.project.log("transaction",
                     "Acquisition completed in {}".format(str(delt)), "highlight", True)