def process_msg(file_path_tuple):
    '''Processes a single email file and returns its body tokens.

    Arguments:
    file_path_tuple - a tuple of (idx, root, file_name)

    Returns:
    a list of lower-cased tokens produced by punkt_word_tokenizer
    from the email body text
    '''
    (idx, root, file_name) = file_path_tuple
    # join once and reuse; the original joined the path twice
    file_path = os.path.join(root, file_name)
    # lazy %-args: formatting is deferred until the record is emitted
    logging.info('[#%d] file: %s', idx, file_path)
    # only the body text (5th element) of the parsed email is needed here
    _, _, _, _, body_text = parse_plain_text_email(file_path)
    tokens = punkt_word_tokenizer(body_text.lower())
    return tokens
def index_plain_text_emails(data_folder, path_index_file, store_dir, lemmatize = False, stem = False, nonascii = True): ''' Indexes all the plain text emails in the input directory and stores the index in the store_dir Arguments: data_folder - input directory absolute path path_index_file - file paths index file store_dir - index store directory absolute path Returns: None ''' if not os.path.exists(store_dir): os.mkdir(store_dir) if os.path.exists(path_index_file): logging.info('Loading file paths index...') file_tuples = load_file_paths_index(path_index_file) logging.info('%d files found in the file paths index.' % len(file_tuples)) else: logging.info('Loading files in the data folder %s...' % data_folder) file_tuples = get_file_paths_index(data_folder) logging.info('%d email documents found.' % len(file_tuples)) store_file_paths_index(path_index_file, file_tuples) logging.info('File paths index is stored into %s' % path_index_file) logging.info('Lucene: Stem = %s, Lemmatize = %s, Number of documents = %d' % (stem, lemmatize, len(file_tuples))) store = SimpleFSDirectory(File(store_dir)) writer = IndexWriter(store, STD_ANALYZER, True, IndexWriter.MaxFieldLength.LIMITED) print 'Lucene:', len(file_tuples), 'files found in %s.' 
% data_folder print 'Lucene: Stem =', stem, 'Lemmatize =', lemmatize, 'Allow non-ASCII =', nonascii for ft in file_tuples: idx, root, file_name = ft file_path = os.path.join(root, file_name) logging.info("[%d] file: %s - adding to Lucene index.", idx, file_name) # parses the emails in plain text format receiver, sender, cc, subject, message_text, bcc, date, email_text = parse_plain_text_email(file_path, tokenize = True, lemmatize = lemmatize, stem = stem, nonascii = nonascii) doc = Document() doc.add(Field(MetadataType.FILE_ID, str(idx), Field.Store.YES, Field.Index.NOT_ANALYZED)) doc.add(Field(MetadataType.FILE_NAME, file_name, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.YES)) doc.add(Field(MetadataType.FILE_PATH, file_path, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)) doc.add(Field(MetadataType.EMAIL_RECEIVER, receiver, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)) doc.add(Field(MetadataType.EMAIL_SENDER, sender, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)) doc.add(Field(MetadataType.EMAIL_CC, cc, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)) doc.add(Field(MetadataType.EMAIL_SUBJECT, subject, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)) #Subodh-Rahul - Added BCC field in indexing. 
doc.add(Field(MetadataType.EMAIL_BCC, bcc, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)) #Subodh-Rahul - Added Email-Date field in indexing doc.add(Field(MetadataType.EMAIL_DATE, date, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS)) if len(message_text) > 0: doc.add(Field(MetadataType.EMAIL_BODY, message_text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES)) else: logging.error("[%d] file: %s - body text is empty.", idx, file_name) # Adds all documents fields as a separate index so that we can search through them doc.add(Field(MetadataType.ALL, email_text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES)) writer.addDocument(doc) logging.info("[%d] file: %s - added to Lucene index.", idx, file_name) writer.commit() writer.close() logging.info('Lucene: All files are indexed.')
def index_plain_text_emails(data_folder, path_index_file, store_dir, lemmatize=False, stem=False, nonascii=True):
    '''
    Indexes all the plain text emails in the input directory and stores
    the Whoosh index in the store_dir

    NOTE(review): this definition has the same name as the Lucene-based
    indexer defined earlier in this module and will shadow it when both
    are present - confirm which one is intended to be public.

    Arguments:
    data_folder - input directory (absolute path)
    path_index_file - file paths index file
    store_dir - index store directory absolute path
    lemmatize - lemmatize tokens based on the NLTK WordNet lemmatizer
    stem - stem tokens
    nonascii - allow non-ASCII characters

    Returns:
    None
    '''
    if not os.path.exists(store_dir):
        os.mkdir(store_dir)

    # Reuse an existing file paths index when available; otherwise build
    # one from the data folder and persist it for subsequent runs.
    if os.path.exists(path_index_file):
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.', len(file_tuples))
    else:
        logging.info('Loading files in the data folder %s...', data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.', len(file_tuples))
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('Index file path: %s', path_index_file)

    # field names (incl. the 'email_reciever' spelling) are part of the
    # stored index schema - do not rename without reindexing/search updates
    schema = Schema(file_id=NUMERIC(int, stored=True),
                    file_name=ID(stored=True),
                    file_path=ID(stored=True),
                    email_reciever=TEXT(stored=True),
                    email_sender=TEXT(stored=True),
                    email_cc=TEXT(stored=True),
                    email_subject=TEXT(stored=True),
                    email_bcc=TEXT(stored=True),
                    date=ID(stored=True),
                    email_body=TEXT(stored=True),
                    all=TEXT(stored=True))
    ix = create_in(store_dir, schema)
    writer = ix.writer()

    logging.info('Stem = %s, Lemmatize = %s, D = %d, non-ASCII = %s', stem, lemmatize, len(file_tuples), nonascii)

    try:
        for ft in file_tuples:
            idx, root, file_name, file_type = ft
            file_path = os.path.join(root, file_name)
            logging.info("[%d] creating index for %s...", idx, file_name)

            ret = parse_plain_text_email(file_path, lemmatize=lemmatize, stem=stem,
                                         nonascii=nonascii, file_type=file_type)
            (receiver, sender, cc, subject, body_text, bcc, date, doc_text) = ret

            writer.add_document(file_id=idx,
                                file_name=unicode(file_name),
                                file_path=unicode(file_path),
                                email_reciever=unicode(receiver),
                                email_sender=unicode(sender),
                                email_cc=unicode(cc),
                                email_subject=unicode(subject),
                                email_bcc=unicode(bcc),
                                date=unicode(date),
                                email_body=unicode(body_text),
                                all=unicode(doc_text))
        writer.commit()
    except:  # re-raised below; cancel() must run for any failure
        # discard partial changes and release the writer lock; the
        # original left the lock held when parsing/indexing raised
        writer.cancel()
        raise

    logging.info('All files are indexed.')