def get_messages_for_cleaning(user_email=None, process_id=None):
    clean_process = CleanUserProcess.get_by_id(process_id)
    imap = IMAPHelper()
    imap.login(email=user_email, password=clean_process.source_password)
    msg_ids = imap.list_messages(criteria=clean_process.search_criteria,
                                 only_with_attachments=True,
                                 not_migrated=True)
    imap.close()

    if len(msg_ids) > 0:
        # n is only consumed by the commented-out chunking below.
        n = min(constants.USER_CONNECTION_LIMIT, len(msg_ids))
        counter.load_and_increment_counter(
            'cleaning_%s_total_count' % user_email,
            delta=len(msg_ids), namespace=str(process_id))
        # Chunkify, due to the migration API's 1 QPS limit.
        # Should this optimization be used?
        # return [msg_ids[i::n] for i in xrange(n)]
        return [msg_ids]
    else:
        counter.load_and_increment_counter(
            'cleaning_%s_total_count' % user_email,
            delta=0, namespace=str(process_id))
        clean_process.status = constants.FINISHED
        clean_process.put()
        return []

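# A minimal sketch (not called anywhere) of the striped chunking that the
# commented-out optimization above would produce: each of the n connections
# gets an interleaved slice of msg_ids, spreading the migration API's 1 QPS
# budget across workers instead of one worker draining the whole list.
def _stripe_example():
    msg_ids = ['101', '102', '103', '104', '105', '106']
    n = 2  # stands in for constants.USER_CONNECTION_LIMIT
    chunks = [msg_ids[i::n] for i in xrange(n)]
    assert chunks == [['101', '103', '105'], ['102', '104', '106']]
    return chunks
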
def schedule_user_cleaning(user_email=None, process_id=None):
    all_messages = get_messages_for_cleaning(user_email=user_email,
                                             process_id=process_id)
    number_of_messages = sum(len(chunk) for chunk in all_messages)

    process = CleanUserProcess.get_by_id(process_id)
    process.number_of_messages = number_of_messages
    process.put()

    for chunk_ids in all_messages:
        if len(chunk_ids) > 0:
            logging.info('Scheduling user [%s] messages cleaning', user_email)
            deferred.defer(clean_messages, user_email=user_email,
                           chunk_ids=chunk_ids, process_id=process_id)

def clean_messages(user_email=None, password=None, chunk_ids=list(),
                   retry_count=0, process_id=None):
    cleaned_successfully = []
    remaining = []
    imap = None  # so the finally block is safe if login never happens

    if len(chunk_ids) <= 0:
        process = CleanUserProcess.get_by_id(process_id)
        process.status = constants.FINISHED
        process.put()
        return True

    try:
        process = CleanUserProcess.get_by_id(process_id)
        imap = IMAPHelper()
        imap.login(email=user_email, password=process.source_password)
        imap.select()

        domain_name = user_email.split('@')[1]
        primary_domain = PrimaryDomain.get_or_create(domain_name)

        try:
            drive = DriveHelper(credentials_json=primary_domain.credentials,
                                admin_email=primary_domain.admin_email,
                                refresh_token=primary_domain.refresh_token)
            folder = drive.get_folder(constants.ATTACHMENT_FOLDER)
            if not folder:
                folder = drive.create_folder(constants.ATTACHMENT_FOLDER)
            sub_folder = drive.get_folder(user_email)
            if not sub_folder:
                sub_folder = drive.create_folder(user_email,
                                                 [{'id': folder['id']}])
        except Exception as e:
            logging.error("Couldn't authenticate drive for user %s"
                          % user_email)
            raise e

        try:
            migration = MigrationHelper(
                credentials_json=primary_domain.credentials,
                refresh_token=primary_domain.refresh_token)
        except Exception as e:
            logging.error("Couldn't authenticate migration api for user %s"
                          % user_email)
            raise e

        for message_id in chunk_ids:
            try:
                result = clean_message(msg_id=message_id, imap=imap,
                                       drive=drive, migration=migration,
                                       folder_id=sub_folder['id'],
                                       user_email=user_email,
                                       process_id=process_id)
                if result:
                    counter.load_and_increment_counter(
                        'cleaning_%s_ok_count' % user_email,
                        namespace=str(process_id))
                    cleaned_successfully.append(message_id)
                else:
                    counter.load_and_increment_counter(
                        'cleaning_%s_error_count' % user_email,
                        namespace=str(process_id))
                    logging.error(
                        'Error cleaning message ID [%s] for user [%s]: [%s]',
                        message_id, user_email, result)
            except Exception:
                logging.exception(
                    'Failed cleaning individual message ID [%s] for user [%s]',
                    message_id, user_email)
                remaining = []
                if retry_count < constants.MAX_CLEAN_RETRIES:
                    # Re-schedule everything not yet cleaned, including the
                    # message that just failed.
                    for chunk_msg in chunk_ids:
                        if chunk_msg not in cleaned_successfully:
                            remaining.append(chunk_msg)
                    logging.info(
                        'Scheduling [%s] remaining cleaning messages for user [%s]',
                        len(remaining), user_email)
                    deferred.defer(clean_messages, user_email=user_email,
                                   chunk_ids=remaining, process_id=process_id,
                                   retry_count=retry_count + 1)
                else:
                    # Give up on the failing message and re-schedule the rest
                    # with a fresh retry budget.
                    for chunk_msg in chunk_ids:
                        if message_id == chunk_msg:
                            continue
                        if chunk_msg not in cleaned_successfully:
                            remaining.append(chunk_msg)
                    logging.info('Giving up cleaning message [%s] for user [%s]',
                                 message_id, user_email)
                    counter.load_and_increment_counter(
                        'cleaning_%s_error_count' % user_email, delta=1,
                        namespace=str(process_id))
                    deferred.defer(clean_messages, user_email=user_email,
                                   chunk_ids=remaining, process_id=process_id)
                break
    except Exception as e:
        logging.exception('Failed cleaning messages chunk')
        raise e
    finally:
        if imap:
            imap.close()

    # Heuristic: consider the process finished when the chunk was small or
    # (nearly) all of it was cleaned in this run.
    if len(chunk_ids) < 10 or (len(cleaned_successfully) + 10 > len(chunk_ids)):
        process.status = constants.FINISHED
        process.put()

def delayed_delete_message(msg_id=None, process_id=None, retries=0):
    process = CleanUserProcess.get_by_id(process_id)
    criteria = process.search_criteria
    msg_process = CleanMessageProcess.query(ndb.AND(
        CleanMessageProcess.msg_id == msg_id,
        CleanMessageProcess.clean_process_id == process_id)).get()

    if msg_process.status != constants.MIGRATED:
        if retries < constants.MAX_RETRIES:
            deferred.defer(delayed_delete_message, msg_id=msg_id,
                           process_id=process_id, retries=retries + 1,
                           _countdown=60 * 2 ** retries, _queue="elimination")
        else:
            logging.error("Couldn't delete msg %s for user %s"
                          % (msg_id, process.source_email))
        return

    imap = IMAPHelper()
    imap.login(process.source_email, process.source_password)
    imap.select()

    # Look for the migrated email; if it doesn't exist yet, retry later.
    try:
        subject = imap.get_subject(msg_id=msg_id)
    except Exception as e:
        imap.close()
        if retries < constants.MAX_RETRIES:
            # Keep retries on the elimination queue, as in the other paths.
            deferred.defer(delayed_delete_message, msg_id=msg_id,
                           process_id=process_id, retries=retries + 1,
                           _countdown=60 * 2 ** retries, _queue="elimination")
        else:
            logging.error("Couldn't delete msg %s for user %s, error %s"
                          % (msg_id, process.source_email, e.message))
        return

    messages = imap.list_messages(
        criteria="subject:(%s) label:Migrated-Migrados" % subject)
    if len(messages) < 1:
        imap.close()
        if retries < constants.MAX_RETRIES:
            deferred.defer(delayed_delete_message, msg_id=msg_id,
                           process_id=process_id, retries=retries + 1,
                           _countdown=60 * 2 ** retries, _queue="elimination")
        else:
            logging.error("Couldn't delete msg %s for user %s"
                          % (msg_id, process.source_email))
        return

    imap.delete_message(msg_id=msg_id, criteria=criteria)
    imap.close()

    msg_process.status = constants.FINISHED
    msg_process.put()

    # Update overall progress; the process is done once every message
    # process has finished.
    all_done = True
    progress = 0
    all_cleaning_messages = CleanMessageProcess.query(
        CleanMessageProcess.clean_process_id == process_id).fetch()
    for message in all_cleaning_messages:
        if message.status != constants.FINISHED:
            all_done = False
        else:
            progress += 1

    if all_done:
        process.status = constants.FINISHED
    # Record the latest activity timestamp in local (America/Bogota) time.
    utc_now = datetime.datetime.utcnow()
    local_tz = pytz.timezone('America/Bogota')
    now = utc_now + local_tz.utcoffset(utc_now)
    process.progress = progress
    process.latest_activity = "%s" % now
    process.put()

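# A minimal sketch of the retry backoff used above: _countdown=60 * 2 ** retries
# doubles the wait before each re-attempt (60s, 120s, 240s, ...). The cap of 5
# here is illustrative only; the real limit is constants.MAX_RETRIES.
def _backoff_schedule_example(max_retries=5):
    # Returns the per-retry countdowns, in seconds.
    return [60 * 2 ** retries for retries in range(max_retries)]
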
def clean_message(msg_id='', imap=None, drive=None, migration=None,
                  folder_id=None, user_email=None, process_id=None):
    logging.info("Trying to clean message %s for user %s"
                 % (msg_id, user_email))
    process = CleanUserProcess.get_by_id(process_id)
    criteria = process.search_criteria
    msg_process = CleanMessageProcess.get_or_create(msg_id, process_id)
    if msg_process.status == constants.FINISHED:
        return True

    result, message = imap.get_message(msg_id=msg_id)
    if result != 'OK':
        raise Exception("Couldn't read message")

    result, label_data = imap.get_message_labels(msg_id=msg_id)
    labels = []
    if label_data and label_data[0]:
        # Extract the label names from the parenthesized IMAP response.
        labels = (((label_data[0].split('('))[2].split(')'))[0]).split()

    mail = email.message_from_string(message[0][1])

    attachments = []
    number_of_attachments = 0
    if mail.get_content_maintype() == 'multipart':
        for part in mail.walk():
            if part.get_content_maintype() == 'multipart':
                continue
            if part.get('Content-Disposition') is None:
                continue
            # This part is an attachment.
            attached = False
            number_of_attachments += 1
            attachment_process = CleanAttachmentProcess.get_or_create(
                msg_id, msg_process.key.id(), number_of_attachments)
            file_id = ''
            # Skip the upload if this attachment was already migrated on a
            # previous attempt.
            if (attachment_process.status == constants.FINISHED
                    and attachment_process.url
                    and attachment_process.filename
                    and attachment_process.file_id):
                attached = True
                attachments.append(
                    (attachment_process.url, attachment_process.filename))
                file_id = attachment_process.file_id
            if not attached:
                attachment = part.get_payload(decode=True)
                mime_type = part.get_content_type()
                filename = part.get_filename()
                text, encoding = email.Header.decode_header(filename)[0]
                if encoding:
                    filename = text.decode(encoding)
                insert_result = drive.insert_file(filename=filename,
                                                  mime_type=mime_type,
                                                  content=attachment,
                                                  parent_id=folder_id)
                if not insert_result:
                    attachment_process.error_description = (
                        "Error inserting file")
                    attachment_process.put()
                    raise Exception("Insert file error")
                drive_url = insert_result['webContentLink']
                file_id = insert_result['id']
                attachment_process.url = drive_url
                attachment_process.file_id = file_id
                attachment_process.status = constants.FINISHED
                attachment_process.filename = filename
                attachment_process.put()
                attachments.append((drive_url, filename))
            permission_result = drive.insert_permission(file_id=file_id,
                                                        value=user_email,
                                                        type='user',
                                                        role='writer')
            if not permission_result:
                attachment_process.error_description = "Permission error"
                attachment_process.put()
                raise Exception("Permission error")
            # Strip the attachment bytes and its headers from the message.
            part.set_payload("")
            for header in part.keys():
                del part[header]

    msg_process.status = constants.DUPLICATED
    msg_process.put()

    # Append an HTML link to each uploaded Drive copy.
    for url, filename in attachments:
        body_suffix = '<a href="%s">%s</a>' % (url, filename)
        new_payload = email.MIMEText.MIMEText(body_suffix.encode('utf-8'),
                                              'html', 'utf-8')
        mail.attach(new_payload)

    # Send the new mail; sleep to respect the migration API's 1 QPS limit.
    time.sleep(1.2)
    migration_result = migration.migrate_mail(user_email=user_email,
                                              msg=mail, labels=labels)
    if not migration_result:
        msg_process.error_description = "Migration error"
        msg_process.put()
        raise Exception("Migration error")
    else:
        msg_process.status = constants.MIGRATED
        msg_process.put()

    # Then delete the original email.
    logging.info("Delaying delete of msg %s for user %s"
                 % (msg_id, user_email))
    deferred.defer(delayed_delete_message, msg_id=msg_id,
                   process_id=process_id, _countdown=30, _queue="elimination")
    return True

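# A standalone sketch of the label parsing in clean_message, run against an
# illustrative Gmail X-GM-LABELS fetch line (the real shape depends on what
# IMAPHelper.get_message_labels returns). Note that quoted labels containing
# spaces would be split apart by this parse.
def _parse_labels_example():
    label_data = [r'1 (X-GM-LABELS (\Inbox Work) UID 4)']
    labels = (((label_data[0].split('('))[2].split(')'))[0]).split()
    assert labels == ['\\Inbox', 'Work']
    return labels
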
def list_process():
    form = CleanUserProcessForm()
    user = users.get_current_user()
    clean_process_saved = False
    clean_processes = []
    clean_process_query = CleanUserProcess.query(
        CleanUserProcess.owner_email == user.email())

    if request.method == 'POST':
        if form.validate_on_submit():
            primary_domain = PrimaryDomain.get_or_create(
                domain_name=user.email().split('@')[1])
            logged_in = 'NO'
            current_user = users.get_current_user()
            if current_user.email() == primary_domain.admin_email:
                imap = IMAPHelper()
                logged_in, _ = imap.login(form.data['source_email'],
                                          form.data['source_password'])
                imap.close()
            if logged_in != 'OK':
                form.source_email.errors.append(
                    "Can't access the email with those credentials")
            else:
                clean_user_process = CleanUserProcess(
                    owner_email=user.email(),
                    destination_message_email=user.email(),
                    status=constants.STARTED)
                for key, value in form.data.iteritems():
                    setattr(clean_user_process, key, value)
                clean_process_key = clean_user_process.put()
                clean_process_saved = True
                # TODO: the process does not appear immediately after it's
                # saved.
                # Launch the cleaning pipeline.
                deferred.defer(schedule_user_cleaning,
                               user_email=form.data['source_email'],
                               process_id=clean_process_key.id())

    is_prev = request.args.get('prev', False)
    url_cursor = request.args.get('cursor', None)
    cursor = Cursor(urlsafe=url_cursor) if url_cursor else None

    if is_prev:
        clean_process_query = clean_process_query.order(
            CleanUserProcess.created)
        if cursor:
            cursor = cursor.reversed()
    else:
        clean_process_query = clean_process_query.order(
            -CleanUserProcess.created)

    data, next_curs, more = clean_process_query.fetch_page(
        constants.PAGE_SIZE, start_cursor=cursor)
    clean_processes.extend(data)

    if is_prev:
        prev_curs = next_curs.reversed().urlsafe() if more else None
        next_curs = url_cursor
    else:
        prev_curs = url_cursor
        next_curs = next_curs.urlsafe() if more else None

    return render_template('process.html', form=form, user=user.email(),
                           clean_process_saved=clean_process_saved,
                           clean_processes=clean_processes,
                           next_curs=next_curs, more=more,
                           prev_curs=prev_curs)

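# A minimal sketch of the two-way cursor pagination used in list_process:
# paging backwards reverses both the sort order and the incoming cursor, and
# the cursor returned by fetch_page is reversed again before being handed out
# as a "previous page" link. Names here are illustrative.
def _fetch_page_example(url_cursor=None, backwards=False, page_size=10):
    query = CleanUserProcess.query()
    cursor = Cursor(urlsafe=url_cursor) if url_cursor else None
    if backwards:
        query = query.order(CleanUserProcess.created)
        if cursor:
            cursor = cursor.reversed()
    else:
        query = query.order(-CleanUserProcess.created)
    return query.fetch_page(page_size, start_cursor=cursor)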