def build_lda_corpus(data_folder, path_index_file, stop_words_file,
                     dictionary_file, ldac_file, min_frequency, min_word_len):
    '''
    Builds the LDA corpus: creates the gensim dictionary from the email
    documents and serializes the corpus in the Blei LDA-C format.

    Arguments:
        data_folder - input directory (absolute path)
        path_index_file - file paths index file
        stop_words_file - stop words file
        dictionary_file - output gensim dictionary file
        ldac_file - output corpus file in the Blei LDA-C format
        min_frequency - minimum frequency threshold used when building the dictionary
        min_word_len - minimum required length for a token
    Returns:
        None
    '''
    if os.path.exists(path_index_file):
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the index.' % len(file_tuples))
    else:
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('File paths index is stored into %s' % path_index_file)

    # Creates the dictionary
    create_dictionary(stop_words_file, file_tuples, dictionary_file,
                      min_frequency, min_word_len)

    # Creates the corpus
    dictionary = corpora.Dictionary.load(dictionary_file)
    corpus_memory_friendly = TextCorpus(dictionary, file_tuples)  # doesn't load the corpus into memory!
    corpora.BleiCorpus.serialize(ldac_file, corpus_memory_friendly,
                                 id2word=dictionary)

    logging.info('The Enron corpus building is completed.')
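# A minimal usage sketch for build_lda_corpus (the paths and threshold
# values below are hypothetical; adjust them to your own layout):
#
#   build_lda_corpus(data_folder='/data/enron/emails',
#                    path_index_file='/data/enron/file_paths.idx',
#                    stop_words_file='/data/enron/stop_words.txt',
#                    dictionary_file='/data/enron/enron.dict',
#                    ldac_file='/data/enron/enron.ldac',
#                    min_frequency=5, min_word_len=3)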
def index_plain_text_emails(data_folder, path_index_file, store_dir,
                            lemmatize=False, stem=False, nonascii=True):
    '''
    Indexes all the plain text emails in the input directory and stores
    the index in store_dir

    Arguments:
        data_folder - input directory (absolute path)
        path_index_file - file paths index file
        store_dir - index store directory (absolute path)
        lemmatize - lemmatize tokens
        stem - stem tokens
        nonascii - allow non-ASCII characters
    Returns:
        None
    '''
    if not os.path.exists(store_dir):
        os.mkdir(store_dir)

    if os.path.exists(path_index_file):
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.' % len(file_tuples))
    else:
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('File paths index is stored into %s' % path_index_file)

    logging.info('Lucene: Stem = %s, Lemmatize = %s, Number of documents = %d'
                 % (stem, lemmatize, len(file_tuples)))

    store = SimpleFSDirectory(File(store_dir))
    writer = IndexWriter(store, STD_ANALYZER, True,
                         IndexWriter.MaxFieldLength.LIMITED)

    print 'Lucene:', len(file_tuples), 'files found in %s.' % data_folder
    print 'Lucene: Stem =', stem, 'Lemmatize =', lemmatize, 'Allow non-ASCII =', nonascii

    for ft in file_tuples:
        idx, root, file_name = ft
        file_path = os.path.join(root, file_name)
        logging.info("[%d] file: %s - adding to Lucene index.", idx, file_name)

        # Parses the email in plain text format
        (receiver, sender, cc, subject, message_text,
         bcc, date, email_text) = parse_plain_text_email(file_path,
                                                         tokenize=True,
                                                         lemmatize=lemmatize,
                                                         stem=stem,
                                                         nonascii=nonascii)

        doc = Document()
        doc.add(Field(MetadataType.FILE_ID, str(idx), Field.Store.YES,
                      Field.Index.NOT_ANALYZED))
        doc.add(Field(MetadataType.FILE_NAME, file_name, Field.Store.YES,
                      Field.Index.NOT_ANALYZED, Field.TermVector.YES))
        doc.add(Field(MetadataType.FILE_PATH, file_path, Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_RECEIVER, receiver, Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_SENDER, sender, Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_CC, cc, Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_SUBJECT, subject, Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        # Subodh-Rahul - added the BCC field to the index
        doc.add(Field(MetadataType.EMAIL_BCC, bcc, Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        # Subodh-Rahul - added the email date field to the index
        doc.add(Field(MetadataType.EMAIL_DATE, date, Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))

        if len(message_text) > 0:
            doc.add(Field(MetadataType.EMAIL_BODY, message_text, Field.Store.YES,
                          Field.Index.ANALYZED, Field.TermVector.YES))
        else:
            logging.error("[%d] file: %s - body text is empty.", idx, file_name)

        # Adds the whole document text as a separate field so that we can
        # search across all fields at once
        doc.add(Field(MetadataType.ALL, email_text, Field.Store.YES,
                      Field.Index.ANALYZED, Field.TermVector.YES))

        writer.addDocument(doc)
        logging.info("[%d] file: %s - added to Lucene index.", idx, file_name)

    writer.commit()
    writer.close()
    logging.info('Lucene: All files are indexed.')
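# A usage sketch for the Lucene-backed indexer (hypothetical paths; PyLucene
# also requires the JVM to be started, e.g. with lucene.initVM(), before any
# indexing call):
#
#   index_plain_text_emails(data_folder='/data/enron/emails',
#                           path_index_file='/data/enron/file_paths.idx',
#                           store_dir='/data/enron/lucene_index',
#                           lemmatize=True, stem=False, nonascii=True)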
def index_plain_text_emails(data_folder, path_index_file, store_dir,
                            lemmatize=False, stem=False, nonascii=True):
    '''
    Indexes all the plain text emails in the input directory and stores
    the index in store_dir

    Arguments:
        data_folder - input directory (absolute path)
        path_index_file - file paths index file
        store_dir - index store directory (absolute path)
        lemmatize - lemmatize tokens based on the NLTK WordNet lemmatizer
        stem - stem tokens
        nonascii - allow non-ASCII characters
    Returns:
        None
    '''
    if not os.path.exists(store_dir):
        os.mkdir(store_dir)

    if os.path.exists(path_index_file):
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.' % len(file_tuples))
    else:
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('Index file path: %s' % path_index_file)

    schema = Schema(file_id=NUMERIC(int, stored=True),
                    file_name=ID(stored=True),
                    file_path=ID(stored=True),
                    email_reciever=TEXT(stored=True),
                    email_sender=TEXT(stored=True),
                    email_cc=TEXT(stored=True),
                    email_subject=TEXT(stored=True),
                    email_bcc=TEXT(stored=True),
                    date=ID(stored=True),
                    email_body=TEXT(stored=True),
                    all=TEXT(stored=True))
    ix = create_in(store_dir, schema)
    writer = ix.writer()

    logging.info('Stem = %s, Lemmatize = %s, D = %d, non-ASCII = %s'
                 % (stem, lemmatize, len(file_tuples), nonascii))

    for ft in file_tuples:
        idx, root, file_name, file_type = ft
        file_path = os.path.join(root, file_name)
        logging.info("[%d] creating index for %s...", idx, file_name)

        (receiver, sender, cc, subject, body_text,
         bcc, date, doc_text) = parse_plain_text_email(file_path,
                                                       lemmatize=lemmatize,
                                                       stem=stem,
                                                       nonascii=nonascii,
                                                       file_type=file_type)

        writer.add_document(file_id=idx,
                            file_name=unicode(file_name),
                            file_path=unicode(file_path),
                            email_reciever=unicode(receiver),
                            email_sender=unicode(sender),
                            email_cc=unicode(cc),
                            email_subject=unicode(subject),
                            email_bcc=unicode(bcc),
                            date=unicode(date),
                            email_body=unicode(body_text),
                            all=unicode(doc_text))

    writer.commit()
    logging.info('All files are indexed.')
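# A usage sketch for the Whoosh-backed indexer (hypothetical paths). Note
# that it shares its name with the Lucene-backed function above, so if both
# definitions live in the same module the later one shadows the earlier:
#
#   index_plain_text_emails(data_folder='/data/enron/emails',
#                           path_index_file='/data/enron/file_paths.idx',
#                           store_dir='/data/enron/whoosh_index',
#                           lemmatize=False, stem=True)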
def index_plain_text_emails2(data_folder, path_index_file, store_dir,
                             stem=False, min_token_len=2, max_token_len=40,
                             procs=1, limitmb=128, multisegment=False,
                             max_doc_length=-1):
    '''
    Indexes all the plain text emails and attachments in the input
    directory and stores the index in store_dir

    Arguments:
        data_folder - input directory (absolute path)
        path_index_file - file paths index file
        store_dir - index store directory (absolute path)
        stem - stem tokens
        min_token_len - minimum required length for a token
        max_token_len - maximum allowed length for a token
        procs - number of processors used for indexing
        limitmb - memory limit (in MB) for each indexing process
        multisegment - allow multi-segment writes
        max_doc_length - maximum document length in tokens (-1 disables
            truncation)
    Returns:
        None
    '''
    if os.path.exists(path_index_file):
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.' % len(file_tuples))
    else:
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('Index file path: %s' % path_index_file)

    # The email body and the catch-all field use the custom analyzer; the
    # header fields are tokenized with a plain StandardAnalyzer
    if stem:
        analyzer = StemmingAnalyzer(expression=pat4, stoplist=stop_words,
                                    minsize=min_token_len,
                                    maxsize=max_token_len, cachesize=-1)
    else:
        analyzer = StandardAnalyzer(expression=pat4, stoplist=stop_words,
                                    minsize=min_token_len,
                                    maxsize=max_token_len)
    std_ana = StandardAnalyzer(stoplist=None)

    schema = Schema(file_id=NUMERIC(int, stored=True),
                    file_name=ID(stored=True),
                    file_path=ID(stored=True),
                    email_reciever=TEXT(stored=True, analyzer=std_ana),
                    email_sender=TEXT(stored=True, analyzer=std_ana),
                    email_cc=TEXT(stored=True, analyzer=std_ana),
                    email_subject=TEXT(stored=True, analyzer=std_ana),
                    email_bcc=TEXT(stored=True, analyzer=std_ana),
                    date=ID(stored=True),
                    email_body=TEXT(stored=True, analyzer=analyzer),
                    all=TEXT(stored=True, analyzer=analyzer))

    if not os.path.exists(store_dir):
        os.mkdir(store_dir)
    ix = create_in(store_dir, schema)
    if procs > 1:
        writer = ix.writer(procs=procs, limitmb=limitmb,
                           multisegment=multisegment)
    else:
        writer = ix.writer(limitmb=limitmb)

    logging.info('Stem = %s, D = %d' % (stem, len(file_tuples)))

    truncate_count = 0
    for ft in file_tuples:
        idx, root, file_name, file_type = ft
        file_path = os.path.join(root, file_name)
        logging.info("[%d] creating index for %s...", idx, file_name)

        (receiver, sender, cc, subject, body_text,
         bcc, date, doc_text) = parse_text_emails_and_attachments(file_path,
                                                                  file_type)

        # TODO this needs to be removed: truncates documents longer than
        # max_doc_length tokens
        et = doc_text.split()
        if max_doc_length > 1 and len(et) > max_doc_length:
            doc_text = " ".join(et[:max_doc_length])
            truncate_count += 1

        writer.add_document(file_id=idx,
                            file_name=unicode(file_name),
                            file_path=unicode(file_path),
                            email_reciever=unicode(receiver),
                            email_sender=unicode(sender),
                            email_cc=unicode(cc),
                            email_subject=unicode(subject),
                            email_bcc=unicode(bcc),
                            date=unicode(date),
                            email_body=unicode(body_text),
                            all=unicode(doc_text))

    writer.commit()
    logging.info('%d documents are truncated.', truncate_count)
    logging.info('All files are indexed.')
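# A usage sketch for index_plain_text_emails2 with a parallel writer,
# followed by a query against the resulting Whoosh index (paths, parameter
# values, and the query string are all hypothetical):
#
#   index_plain_text_emails2(data_folder='/data/enron/emails',
#                            path_index_file='/data/enron/file_paths.idx',
#                            store_dir='/data/enron/whoosh_index',
#                            stem=True, procs=4, limitmb=512,
#                            multisegment=True)
#
#   from whoosh.index import open_dir
#   from whoosh.qparser import QueryParser
#   ix = open_dir('/data/enron/whoosh_index')
#   with ix.searcher() as searcher:
#       query = QueryParser('all', ix.schema).parse(u'energy trading')
#       for hit in searcher.search(query, limit=10):
#           print hit['file_path']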