def build_lda_corpus(data_folder, path_index_file, stop_words_file, dictionary_file, ldac_file, min_frequency, min_word_len):
    '''
    Builds the LDA corpus from the email documents in the data folder 
    and saves it in Blei's LDA-C format. 
    
    Arguments: 
        data_folder - input directory (absolute path) 
        path_index_file - file paths index file 
        stop_words_file - stop words file (one word per line) 
        dictionary_file - output file for the gensim dictionary 
        ldac_file - output file for the corpus in Blei's LDA-C format 
        min_frequency - minimum frequency threshold used when building 
            the dictionary 
        min_word_len - minimum required length for a token 
        
    Returns: 
        None 

    '''
    
    if os.path.exists(path_index_file): 
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the index.' % len(file_tuples))
    else: 
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))

        store_file_paths_index(path_index_file, file_tuples)
        logging.info('File paths index is stored into %s' % path_index_file)

    
    # Creates the dictionary 
    create_dictionary(stop_words_file, file_tuples, dictionary_file, min_frequency, min_word_len)
    
    # Creates the corpus 
    dictionary = corpora.Dictionary.load(dictionary_file)
    corpus_memory_friendly = TextCorpus(dictionary, file_tuples) # doesn't load the corpus into the memory!
    corpora.BleiCorpus.serialize(ldac_file, corpus_memory_friendly, id2word=dictionary)
    
    logging.info('The Enron corpus building is completed.')
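

# A minimal usage sketch for build_lda_corpus (illustrative, not part of the 
# pipeline). All paths and the threshold values below are hypothetical 
# placeholders; adjust them to your Enron data layout. 
def _example_build_lda_corpus():
    logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', 
                        level=logging.INFO)
    build_lda_corpus(data_folder='/data/enron/maildir', 
                     path_index_file='/data/enron/file_paths.idx', 
                     stop_words_file='/data/enron/stop_words.txt', 
                     dictionary_file='/data/enron/enron.dict', 
                     ldac_file='/data/enron/enron.ldac', 
                     min_frequency=5, min_word_len=2)
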
def index_plain_text_emails(data_folder, path_index_file, store_dir, 
                            lemmatize=False, stem=False, nonascii=True):
    '''
    Indexes all the plain text emails in the input directory 
    and stores the index in the store_dir 
    
    Arguments: 
        data_folder - input directory (absolute path) 
        path_index_file - file paths index file 
        store_dir - index store directory (absolute path) 
        lemmatize - lemmatize tokens based on the NLTK WordNet lemmatizer 
        stem - stem tokens 
        nonascii - allow non-ASCII characters 
        
    Returns: 
        None 

    '''
    
    if not os.path.exists(store_dir): 
        os.mkdir(store_dir)
    
    
    if os.path.exists(path_index_file): 
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.' % len(file_tuples))
    else: 
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
    
        logging.info('%d email documents found.' % len(file_tuples))
    
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('File paths index is stored into %s' % path_index_file)
    
    logging.info('Lucene: Stem = %s, Lemmatize = %s, Number of documents = %d' % (stem, lemmatize, len(file_tuples)))
        
    store = SimpleFSDirectory(File(store_dir))
    writer = IndexWriter(store, STD_ANALYZER, True, IndexWriter.MaxFieldLength.LIMITED)
    
    print 'Lucene:', len(file_tuples), 'files found in %s.' % data_folder
    print 'Lucene: Stem =', stem, 'Lemmatize =', lemmatize, 'Allow non-ASCII =', nonascii  
    
    for ft in file_tuples: 
        idx, root, file_name = ft
        file_path = os.path.join(root, file_name)
        logging.info("[%d] file: %s - adding to Lucene index.", idx, file_name)
        # parses the emails in plain text format 
        receiver, sender, cc, subject, message_text, bcc, date, email_text = parse_plain_text_email(file_path, 
                                                                                                    tokenize = True, 
                                                                                                    lemmatize = lemmatize, 
                                                                                                    stem = stem, 
                                                                                                    nonascii = nonascii)

        doc = Document()
        doc.add(Field(MetadataType.FILE_ID, str(idx), Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field(MetadataType.FILE_NAME, file_name, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.YES))
        doc.add(Field(MetadataType.FILE_PATH, file_path, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_RECEIVER, receiver, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_SENDER, sender, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_CC, cc, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_SUBJECT, subject, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        #Subodh-Rahul - Added BCC field in indexing.
        doc.add(Field(MetadataType.EMAIL_BCC, bcc, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        #Subodh-Rahul - Added Email-Date field in indexing
        doc.add(Field(MetadataType.EMAIL_DATE, date, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        
        if len(message_text) > 0:
            doc.add(Field(MetadataType.EMAIL_BODY, message_text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))
        else:
            logging.error("[%d] file: %s - body text is empty.", idx, file_name)
            
        # Adds the full email text as a single combined field (ALL) so that all fields can be searched together 
        doc.add(Field(MetadataType.ALL, email_text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))

        writer.addDocument(doc)
        logging.info("[%d] file: %s - added to Lucene index.", idx, file_name)


    writer.commit()
    writer.close()

    logging.info('Lucene: All files are indexed.')
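

# A minimal usage sketch for the PyLucene-based indexer above (hypothetical 
# paths; assumes lucene.initVM() has been called elsewhere to start the JVM). 
# Note that a Whoosh-based function with the same name is defined below and 
# will shadow this one when both are kept in the same module. 
def _example_index_with_lucene():
    index_plain_text_emails(data_folder='/data/enron/maildir', 
                            path_index_file='/data/enron/file_paths.idx', 
                            store_dir='/data/enron/lucene_index', 
                            lemmatize=False, stem=True, nonascii=True)
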
def index_plain_text_emails(data_folder, path_index_file, store_dir, 
                            lemmatize=False, stem=False, nonascii=True):
    '''
    Indexes all the plain text emails in the input directory and stores the 
    index in the store_dir  
    
    Arguments: 
        data_folder - input directory (absolute path)
        path_index_file - file paths index file 
        store_dir - index store directory absolute path 
        lemmatize - lemmatize tokens based on the NLTK WordNet lemmatizer 
        stem - stem tokens 
        nonascii - allow non-ASCII characters 
        
    Returns: 
        None 

    '''
    
    if not os.path.exists(store_dir): os.mkdir(store_dir)
    
    if os.path.exists(path_index_file): 
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.' % len(file_tuples))
    else: 
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))    
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('File paths index is stored into %s' % path_index_file)

    schema = Schema(file_id=NUMERIC(int, stored=True), 
                    file_name=ID(stored=True), 
                    file_path=ID(stored=True), 
                    email_reciever=TEXT(stored=True), 
                    email_sender=TEXT(stored=True), 
                    email_cc=TEXT(stored=True), 
                    email_subject=TEXT(stored=True), 
                    email_bcc=TEXT(stored=True),
                    date=ID(stored=True),
                    email_body=TEXT(stored=True),
                    all=TEXT(stored=True))
    ix = create_in(store_dir, schema)
    writer = ix.writer()
    logging.info('Stem = %s, Lemmatize = %s, D = %d, non-ASCII = %s' 
                 % (stem, lemmatize, len(file_tuples), nonascii))
    
    for ft in file_tuples: 
        idx, root, file_name, file_type = ft
        file_path = os.path.join(root, file_name)
        logging.info("[%d] creating index for %s...", idx, file_name)
        
        
        ret = parse_plain_text_email(file_path, lemmatize=lemmatize, stem=stem, 
                                     nonascii=nonascii, file_type=file_type)

        (receiver, sender, cc, subject, body_text, bcc, date, doc_text) = ret
        
        writer.add_document(file_id = idx, 
                            file_name = unicode(file_name), 
                            file_path = unicode(file_path), 
                            email_reciever = unicode(receiver), 
                            email_sender = unicode(sender), 
                            email_cc = unicode(cc),
                            email_subject = unicode(subject), 
                            email_bcc = unicode(bcc), 
                            date = unicode(date), 
                            email_body = unicode(body_text), 
                            all = unicode(doc_text))
 
    writer.commit()
    logging.info('All files are indexed.')
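

# A hedged sketch of querying the Whoosh index built above. The store 
# directory and query string are hypothetical placeholders; open_dir and 
# QueryParser are standard Whoosh APIs, and 'all' is the combined field 
# defined in the schema. 
def _example_search_whoosh_index(store_dir='/data/enron/whoosh_index'):
    from whoosh.index import open_dir
    from whoosh.qparser import QueryParser
    ix = open_dir(store_dir)
    with ix.searcher() as searcher: 
        query = QueryParser('all', ix.schema).parse(u'energy trading')
        for hit in searcher.search(query, limit=10): 
            # file_id and file_path are stored fields in the schema above 
            print hit['file_id'], hit['file_path']
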
def index_plain_text_emails2(data_folder, path_index_file, store_dir, 
                             stem=False, min_token_len=2, max_token_len=40,
                             procs=1, limitmb=128, multisegment=False, 
                             max_doc_length=-1):
    '''
    Indexes all the plain text emails and attachments in the input 
    directory and stores the index in the store_dir 
    
    Arguments: 
        data_folder - input directory (absolute path)
        path_index_file - file paths index file 
        store_dir - index store directory (absolute path) 
        stem - stem tokens 
        min_token_len - minimum required length for a token 
        max_token_len - maximum allowed length for a token 
        procs - number of processes used for indexing 
        limitmb - memory limit (in MB) for each indexing process 
        multisegment - allow multi-segment writes 
        max_doc_length - maximum document length in tokens; longer 
            documents are truncated (-1 disables truncation) 
        
    Returns: 
        None 

    '''

    if os.path.exists(path_index_file): 
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.' % len(file_tuples))
    else: 
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))    
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('File paths index is stored into %s' % path_index_file)

    if stem:
        analyzer = StemmingAnalyzer(expression=pat4, stoplist=stop_words, 
                                    minsize=min_token_len, 
                                    maxsize=max_token_len, 
                                    cachesize=-1)
    else: 
        analyzer = StandardAnalyzer(expression=pat4, stoplist=stop_words, 
                                    minsize=min_token_len, 
                                    maxsize=max_token_len)        
    std_ana = StandardAnalyzer(stoplist=None)    
    schema = Schema(file_id=NUMERIC(int, stored=True), 
                    file_name=ID(stored=True), file_path=ID(stored=True), 
                    email_reciever=TEXT(stored=True, analyzer=std_ana), 
                    email_sender=TEXT(stored=True, analyzer=std_ana), 
                    email_cc=TEXT(stored=True, analyzer=std_ana), 
                    email_subject=TEXT(stored=True, analyzer=std_ana), 
                    email_bcc=TEXT(stored=True, analyzer=std_ana),
                    date=ID(stored=True), 
                    email_body=TEXT(stored=True, analyzer=analyzer),
                    all=TEXT(stored=True, analyzer=analyzer))
    if not os.path.exists(store_dir): os.mkdir(store_dir)
    ix = create_in(store_dir, schema)
    
    if procs > 1: 
        writer = ix.writer(procs=procs, limitmb=limitmb, 
                           multisegment=multisegment)
    else: 
        writer = ix.writer(limitmb=limitmb)

    logging.info('Stem = %s, D = %d' % (stem, len(file_tuples)))
    
    truncate_count = 0
    for ft in file_tuples: 
        idx, root, file_name, file_type = ft
        file_path = os.path.join(root, file_name)
        logging.info("[%d] creating index for %s...", idx, file_name)
        
        (receiver, sender, cc, subject, body_text, bcc, date, 
         doc_text) = parse_text_emails_and_attachments(file_path, file_type)
        
        # TODO this truncation needs to be removed 
        et = doc_text.split()
        if max_doc_length > 1 and len(et) > max_doc_length: 
            doc_text = " ".join(et[:max_doc_length])
            truncate_count += 1
        
        writer.add_document(file_id = idx, 
                            file_name = unicode(file_name), 
                            file_path = unicode(file_path), 
                            email_reciever = unicode(receiver), 
                            email_sender = unicode(sender), 
                            email_cc = unicode(cc),
                            email_subject = unicode(subject), 
                            email_bcc = unicode(bcc), 
                            date = unicode(date), 
                            email_body = unicode(body_text), 
                            all = unicode(doc_text))
    writer.commit()

    logging.info('%d documents were truncated.', truncate_count)
    logging.info('All files are indexed.')
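

# A minimal usage sketch for index_plain_text_emails2 with a multi-process 
# writer. Paths and parameter values are illustrative assumptions; with 
# procs > 1, Whoosh spawns that many indexing processes, each limited to 
# limitmb megabytes of memory. 
def _example_index_with_procs():
    index_plain_text_emails2(data_folder='/data/enron/maildir', 
                             path_index_file='/data/enron/file_paths.idx', 
                             store_dir='/data/enron/whoosh_index', 
                             stem=True, min_token_len=2, max_token_len=40, 
                             procs=4, limitmb=512, multisegment=True, 
                             max_doc_length=10000)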