def fetch_email(imap, email_id):
    """Fetch one message over IMAP and turn it into an Avro-ready record.

    Args:
        imap: IMAP client object exposing ``fetch(message_id, parts)``.
        email_id: Sequence number of the message; passed through ``str``.

    Returns:
        A ``(status, record, charset)`` tuple. On success ``status`` is the
        status returned by ``imap.fetch`` (``'OK'``) and ``record`` is the
        value produced by ``EmailUtils.process_email``. Otherwise ``record``
        is ``{}`` and ``status`` is one of:

        * ``'TIMEOUT'`` - ``imap.fetch`` raised ``TimeoutException``.
        * ``'ABORT'``   - ``imap.fetch`` raised any other exception.
        * ``'ERROR'``   - non-OK fetch status, malformed fetch payload, or a
          non-Unicode failure while processing the message.
        * ``'UNICODE'`` - a ``UnicodeDecodeError`` while processing.
        * ``'CHARSET'`` - processing finished without a usable charset.
    """
    # Single-argument print() behaves the same as a Python 2 print statement.
    print("*******************fetch_email**************************")

    utils = EmailUtils()
    try:
        # Gmail's X-GM-THRID item returns the thread id of the message.
        status, data = imap.fetch(str(email_id), '(X-GM-THRID RFC822)')
    except TimeoutException:
        return 'TIMEOUT', {}, None
    except Exception:
        # Exception (not a bare except) so Ctrl-C / SystemExit still propagate.
        return 'ABORT', {}, None

    if status != 'OK':
        return 'ERROR', {}, None

    charset = None
    try:
        # Indexing happens inside the try: a reply such as [None] (no such
        # message) is reported as 'ERROR' instead of raising TypeError.
        raw_thread_id = data[0][0]
        encoded_email = data[0][1]

        # RFC 2822 says the default charset is us-ascii, which often saves us
        # when the message does not declare one.
        charset = utils.get_charset(encoded_email) or 'us-ascii'
        thread_id = utils.get_thread_id(raw_thread_id)
        avro_record, charset = utils.process_email(encoded_email, thread_id)
    except UnicodeDecodeError:
        return 'UNICODE', {}, charset
    except Exception:
        return 'ERROR', {}, None

    # Without a charset we would pass bad characters to Avro, and it dies.
    # See AVRO-565.
    if not charset:
        return 'CHARSET', {}, charset
    return status, avro_record, charset
def __init__(self):
    """Create an idle slurper with no connection or output configured.

    The owning class downloads all emails in folders from a 163mail inbox
    and writes them as raw UTF-8 text in simple Avro records for further
    processing. Apart from the parsing helper and the size threshold, every
    attribute starts out as ``None``.
    """
    self.utils = EmailUtils()

    # Account credentials and the IMAP connection.
    self.username = self.password = self.imap = None

    # Avro schema text and the writer handles.
    self.schema = self.avro_writer = self.avro_writertmp = None

    # Selected folder, its message ids and its message count.
    self.imap_folder = self.id_list = self.folder_count = None

    # Size cut-off in bytes (2 MiB). Per the original note, only emails whose
    # RFC822.SIZE is below the cut-off get their BODY fetched; for larger ones
    # only the HEADER is fetched.
    # NOTE(review): the original note said "3M" but the value is 2 MiB -
    # confirm which one is intended.
    self.threshold_size = 2 * 1024 ** 2
def fetch_email(imap, email_id):
    """Try to fetch one message (Gmail thread id plus RFC822 body) over IMAP.

    The fetched status and data are not used any further: the function
    always returns ``None`` and only reports a failed fetch on stdout.

    Args:
        imap: IMAP client object exposing ``fetch(message_id, parts)``.
        email_id: Sequence number of the message; passed through ``str``.
    """
    # Single-argument print() emits the same text as the Python 2 statement.
    print("*******************fetch_email**************************")

    utils = EmailUtils()

    # X-GM-THRID is Gmail's extension item carrying the message's thread id.
    wanted_parts = '(X-GM-THRID RFC822)'
    message_number = str(email_id)
    try:
        status, data = imap.fetch(message_number, wanted_parts)
    except Exception as error:
        # Same output as printing the three items separated by spaces.
        print("%s %s %s" % (Exception, " : ", error))
def __init__(self):
    """Create the slurper and attach its email parsing helper.

    The owning class downloads all emails in folders from your Gmail inbox
    and writes them as raw UTF-8 text in simple Avro records for further
    processing.
    """
    # The description above used to sit after this statement, where Python
    # treats it as a discarded string expression rather than a docstring.
    self.utils = EmailUtils()
class GmailSlurper(object):
    """Download the emails of a Gmail IMAP folder into an Avro data file.

    This class downloads all emails in folders from your Gmail inbox and
    writes them as raw UTF-8 text in simple Avro records for further
    processing.

    Judging by the attributes each method reads, the intended call order is
    ``init_avro`` / ``init_imap`` -> ``init_folder`` -> ``slurp`` ->
    ``shutdown``.

    All console output goes through single-argument ``print(...)`` calls so
    the text is identical under the Python 2 print statement and the
    Python 3 print function.
    """

    def __init__(self):
        """Create an unconnected slurper; nothing is opened here."""
        self.utils = EmailUtils()
        # Declare all state up front so slurp()/init_imap() can test these
        # attributes before the corresponding init_* method has run.
        self.username = None
        self.password = None
        self.imap = None
        self.schema = None
        self.avro_writer = None
        self.imap_folder = None
        self.id_list = None
        self.folder_count = None

    def init_directory(self, directory):
        """Create ``directory`` if needed (warn if it exists) and return it."""
        if os.path.exists(directory):
            print('Warning: %(directory)s already exists:'
                  % {"directory": directory})
        else:
            os.makedirs(directory)
        return directory

    def init_imap(self, username, password):
        """Open and authenticate a fresh SSL connection to Gmail's IMAP server.

        Any connection left over from an earlier call is closed first, which
        is what makes ``reset()`` safe to call repeatedly.

        Args:
            username: Gmail login name; remembered for ``reset()``.
            password: Gmail password; remembered for ``reset()``.
        """
        self.username = username
        self.password = password

        previous = getattr(self, 'imap', None)
        if previous is not None:
            try:
                # shutdown() only closes the socket. Unlike logout() it does
                # not wait for a server reply, so it cannot hang on the dead
                # connection that triggered the reset.
                previous.shutdown()
            except Exception:
                pass  # best effort: the old connection may already be gone

        self.imap = imaplib.IMAP4_SSL('imap.gmail.com', 993)
        self.imap.login(username, password)
        self.imap.is_readonly = True

    # part_id will be helpful once we're splitting files among multiple
    # slurpers.
    def init_avro(self, output_path, part_id, schema_path):
        """Open ``<output_path>/part-<part_id>.avro`` for writing.

        Args:
            output_path: Output directory; created when it does not exist.
            part_id: Identifier embedded in the output file name.
            schema_path: Path of the Avro schema (JSON text) for the records.
        """
        output_dir = None
        # NOTE(review): a non-str path (e.g. Python 2 unicode) leaves
        # output_dir as None, so the file name starts with "None/" - confirm
        # whether such paths should be accepted.
        if isinstance(output_path, str):
            output_dir = self.init_directory(output_path)
        out_filename = '%(output_dir)s/part-%(part_id)s.avro' % \
            {"output_dir": output_dir, "part_id": str(part_id)}

        # Read through a context manager so the schema file is closed again.
        with open(schema_path, 'r') as schema_file:
            self.schema = schema_file.read()

        email_schema = schema.parse(self.schema)
        rec_writer = io.DatumWriter(email_schema)
        self.avro_writer = datafile.DataFileWriter(
            open(out_filename, 'wb'),
            rec_writer,
            email_schema
        )

    def init_folder(self, folder):
        """Select ``folder`` and queue its message numbers, newest first.

        Args:
            folder: Name of the IMAP folder to select.

        Returns:
            ``(status, count)``. On ``'OK'`` ``count`` is the message count as
            an ``int``; otherwise it is the raw data returned by ``select``.
        """
        self.imap_folder = folder
        status, count = self.imap.select(folder)
        print("Folder '" + str(folder) + "' SELECT status: " + status)
        if status == 'OK':
            count = int(count[0])
            # IMAP sequence numbers run from 1 to count inclusive. Counting
            # down from count keeps the newest message in the list.
            self.id_list = list(range(count, 0, -1))
            print("Folder '" + str(folder) + " has " + str(count)
                  + "' emails...\n")
            self.folder_count = count
        return status, count

    def fetch_email(self, email_id):
        """Fetch and parse one message, giving the fetch at most 30 seconds.

        Args:
            email_id: Sequence number of the message; passed through ``str``.

        Returns:
            ``(status, record, charset)``. On success ``status`` is ``'OK'``
            and ``record`` comes from ``self.utils.process_email``. Otherwise
            ``record`` is ``{}`` and ``status`` is ``'TIMEOUT'``, ``'ABORT'``,
            ``'ERROR'``, ``'UNICODE'`` or ``'CHARSET'``.
        """
        def timeout_handler(signum, frame):
            raise self.TimeoutException()

        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(30)  # trigger the alarm in 30 seconds
        try:
            # Gmail's X-GM-THRID item returns the thread id of the message.
            status, data = self.imap.fetch(str(email_id),
                                           '(X-GM-THRID RFC822)')
        except self.TimeoutException:
            return 'TIMEOUT', {}, None
        except Exception:
            return 'ABORT', {}, None
        finally:
            # Always disarm the alarm. Left pending, it would raise
            # TimeoutException up to 30 seconds later in unrelated code.
            signal.alarm(0)

        if status != 'OK':
            return 'ERROR', {}, None
        # A fetch for a message that no longer exists can answer
        # ('OK', [None]); report that instead of failing on data[0][0].
        if not data or not isinstance(data[0], tuple):
            return 'ERROR', {}, None

        raw_thread_id = data[0][0]
        encoded_email = data[0][1]
        charset = None
        try:
            # RFC 2822 says the default charset is us-ascii, which often
            # saves us when the message does not declare one.
            charset = self.utils.get_charset(encoded_email) or 'us-ascii'
            thread_id = self.utils.get_thread_id(raw_thread_id)
            print("CHARSET: " + charset)
            avro_record, charset = self.utils.process_email(encoded_email,
                                                            thread_id)
        except UnicodeDecodeError:
            return 'UNICODE', {}, charset

        # Without a charset we would pass bad characters to Avro, and it
        # dies. See AVRO-565.
        if not charset:
            return 'CHARSET', {}, charset
        return status, avro_record, charset

    def shutdown(self):
        """Close the Avro writer, then close the folder and log out."""
        self.avro_writer.close()
        self.imap.close()
        self.imap.logout()

    def write(self, record):
        """Append one record to the Avro output."""
        self.avro_writer.append(record)

    def flush(self):
        """Flush buffered Avro records to the output file."""
        self.avro_writer.flush()
        print("Flushed avro writer...")

    def slurp(self):
        """Fetch every queued message and write the usable ones to Avro.

        Does nothing unless a connection and a folder have been set up.
        Messages that fail to fetch or parse are reported on stderr and
        skipped. An aborted or timed-out fetch also re-creates the IMAP
        connection before continuing.
        """
        if not (self.imap and self.imap_folder):
            return

        for email_id in self.id_list:
            status, email_hash, charset = self.fetch_email(email_id)
            if (status == 'OK' and charset
                    and 'thread_id' in email_hash and 'froms' in email_hash):
                print("%s %s %s" % (email_id, charset,
                                    email_hash['thread_id']))
                self.write(email_hash)
                # Flush periodically so little is lost if the run dies.
                if int(email_id) % 1000 == 0:
                    self.flush()
            elif status in ('ERROR', 'PARSE', 'UNICODE', 'CHARSET', 'FROM'):
                sys.stderr.write("Problem fetching email id " + str(email_id)
                                 + ": " + status + "\n")
            elif status in ('ABORT', 'TIMEOUT'):
                sys.stderr.write("resetting imap for " + status + "\n")
                stat, c = self.reset()
                sys.stderr.write("IMAP RESET: " + str(stat) + " " + str(c)
                                 + "\n")
            else:
                sys.stderr.write("ERROR IN PARSING EMAIL, SKIPPED ONE")

    def reset(self):
        """Reconnect with the stored credentials and re-select the folder.

        Returns:
            The ``(status, count)`` pair from ``init_folder``.
        """
        self.init_imap(self.username, self.password)
        status, count = self.init_folder(self.imap_folder)
        return status, count

    class TimeoutException(Exception):
        """Indicates an operation timed out."""
        pass
class GmailSlurper(object):
    """Download the emails of a Gmail IMAP folder into an Avro data file.

    This class downloads all emails in folders from your Gmail inbox and
    writes them as raw UTF-8 text in simple Avro records for further
    processing.

    Judging by the attributes each method reads, the intended call order is
    ``init_avro`` / ``init_imap`` -> ``init_folder`` -> ``slurp`` ->
    ``shutdown``.

    All console output goes through single-argument ``print(...)`` calls so
    the text is identical under the Python 2 print statement and the
    Python 3 print function.
    """

    def __init__(self):
        """Create an unconnected slurper; nothing is opened here."""
        self.utils = EmailUtils()
        # Declare all state up front so slurp()/init_imap() can test these
        # attributes before the corresponding init_* method has run.
        self.username = None
        self.password = None
        self.imap = None
        self.schema = None
        self.avro_writer = None
        self.imap_folder = None
        self.id_list = None
        self.folder_count = None

    def init_directory(self, directory):
        """Create ``directory`` if needed (warn if it exists) and return it."""
        if os.path.exists(directory):
            print('Warning: %(directory)s already exists:'
                  % {"directory": directory})
        else:
            os.makedirs(directory)
        return directory

    def init_imap(self, username, password):
        """Open and authenticate a fresh SSL connection to Gmail's IMAP server.

        Any connection left over from an earlier call is closed first, which
        is what makes ``reset()`` safe to call repeatedly.

        Args:
            username: Gmail login name; remembered for ``reset()``.
            password: Gmail password; remembered for ``reset()``.
        """
        self.username = username
        self.password = password

        previous = getattr(self, 'imap', None)
        if previous is not None:
            try:
                # shutdown() only closes the socket. Unlike logout() it does
                # not wait for a server reply, so it cannot hang on the dead
                # connection that triggered the reset.
                previous.shutdown()
            except Exception:
                pass  # best effort: the old connection may already be gone

        self.imap = imaplib.IMAP4_SSL('imap.gmail.com', 993)
        self.imap.login(username, password)
        self.imap.is_readonly = True

    # part_id will be helpful once we're splitting files among multiple
    # slurpers.
    def init_avro(self, output_path, part_id, schema_path):
        """Open ``<output_path>/part-<part_id>.avro`` for writing.

        Args:
            output_path: Output directory; created when it does not exist.
            part_id: Identifier embedded in the output file name.
            schema_path: Path of the Avro schema (JSON text) for the records.
        """
        output_dir = None
        # NOTE(review): a non-str path (e.g. Python 2 unicode) leaves
        # output_dir as None, so the file name starts with "None/" - confirm
        # whether such paths should be accepted.
        if isinstance(output_path, str):
            output_dir = self.init_directory(output_path)
        out_filename = '%(output_dir)s/part-%(part_id)s.avro' % \
            {"output_dir": output_dir, "part_id": str(part_id)}

        # Read through a context manager so the schema file is closed again.
        with open(schema_path, 'r') as schema_file:
            self.schema = schema_file.read()

        email_schema = schema.parse(self.schema)
        rec_writer = io.DatumWriter(email_schema)
        self.avro_writer = datafile.DataFileWriter(
            open(out_filename, 'wb'),
            rec_writer,
            email_schema
        )

    def init_folder(self, folder):
        """Select ``folder`` and queue its message numbers, newest first.

        Args:
            folder: Name of the IMAP folder to select.

        Returns:
            ``(status, count)``. On ``'OK'`` ``count`` is the message count as
            an ``int``; otherwise it is the raw data returned by ``select``.
        """
        self.imap_folder = folder
        status, count = self.imap.select(folder)
        print("Folder '" + str(folder) + "' SELECT status: " + status)
        if status == 'OK':
            count = int(count[0])
            # IMAP sequence numbers run from 1 to count inclusive. Counting
            # down from count keeps the newest message in the list.
            self.id_list = list(range(count, 0, -1))
            print("Folder '" + str(folder) + " has " + str(count)
                  + "' emails...\n")
            self.folder_count = count
        return status, count

    def fetch_email(self, email_id):
        """Fetch and parse one message, giving the fetch at most 30 seconds.

        Args:
            email_id: Sequence number of the message; passed through ``str``.

        Returns:
            ``(status, record, charset)``. On success ``status`` is ``'OK'``
            and ``record`` comes from ``self.utils.process_email``. Otherwise
            ``record`` is ``{}`` and ``status`` is ``'TIMEOUT'``, ``'ABORT'``,
            ``'ERROR'``, ``'UNICODE'`` or ``'CHARSET'``.
        """
        def timeout_handler(signum, frame):
            raise self.TimeoutException()

        signal.signal(signal.SIGALRM, timeout_handler)
        signal.alarm(30)  # trigger the alarm in 30 seconds
        try:
            # Gmail's X-GM-THRID item returns the thread id of the message.
            status, data = self.imap.fetch(str(email_id),
                                           '(X-GM-THRID RFC822)')
        except self.TimeoutException:
            return 'TIMEOUT', {}, None
        except Exception:
            return 'ABORT', {}, None
        finally:
            # Always disarm the alarm. Left pending, it would raise
            # TimeoutException up to 30 seconds later in unrelated code.
            signal.alarm(0)

        if status != 'OK':
            return 'ERROR', {}, None
        # A fetch for a message that no longer exists can answer
        # ('OK', [None]); report that instead of failing on data[0][0].
        if not data or not isinstance(data[0], tuple):
            return 'ERROR', {}, None

        raw_thread_id = data[0][0]
        encoded_email = data[0][1]
        charset = None
        try:
            # RFC 2822 says the default charset is us-ascii, which often
            # saves us when the message does not declare one.
            charset = self.utils.get_charset(encoded_email) or 'us-ascii'
            thread_id = self.utils.get_thread_id(raw_thread_id)
            print("CHARSET: " + charset)
            avro_record, charset = self.utils.process_email(encoded_email,
                                                            thread_id)
        except UnicodeDecodeError:
            return 'UNICODE', {}, charset

        # Without a charset we would pass bad characters to Avro, and it
        # dies. See AVRO-565.
        if not charset:
            return 'CHARSET', {}, charset
        return status, avro_record, charset

    def shutdown(self):
        """Close the Avro writer, then close the folder and log out."""
        self.avro_writer.close()
        self.imap.close()
        self.imap.logout()

    def write(self, record):
        """Append one record to the Avro output."""
        self.avro_writer.append(record)

    def flush(self):
        """Flush buffered Avro records to the output file."""
        self.avro_writer.flush()
        print("Flushed avro writer...")

    def slurp(self):
        """Fetch every queued message and write the usable ones to Avro.

        Does nothing unless a connection and a folder have been set up.
        Messages that fail to fetch or parse are reported on stderr and
        skipped. An aborted or timed-out fetch also re-creates the IMAP
        connection before continuing.
        """
        if not (self.imap and self.imap_folder):
            return

        for email_id in self.id_list:
            status, email_hash, charset = self.fetch_email(email_id)
            if (status == 'OK' and charset
                    and 'thread_id' in email_hash and 'froms' in email_hash):
                print("%s %s %s" % (email_id, charset,
                                    email_hash['thread_id']))
                self.write(email_hash)
                # Flush periodically so little is lost if the run dies.
                if int(email_id) % 1000 == 0:
                    self.flush()
            elif status in ('ERROR', 'PARSE', 'UNICODE', 'CHARSET', 'FROM'):
                sys.stderr.write("Problem fetching email id " + str(email_id)
                                 + ": " + status + "\n")
            elif status in ('ABORT', 'TIMEOUT'):
                sys.stderr.write("resetting imap for " + status + "\n")
                stat, c = self.reset()
                sys.stderr.write("IMAP RESET: " + str(stat) + " " + str(c)
                                 + "\n")
            else:
                sys.stderr.write("ERROR IN PARSING EMAIL, SKIPPED ONE")

    def reset(self):
        """Reconnect with the stored credentials and re-select the folder.

        Returns:
            The ``(status, count)`` pair from ``init_folder``.
        """
        self.init_imap(self.username, self.password)
        status, count = self.init_folder(self.imap_folder)
        return status, count

    class TimeoutException(Exception):
        """Indicates an operation timed out."""
        pass