Code Example #1
def build_lda_corpus(data_folder, path_index_file, stop_words_file, dictionary_file, ldac_file, min_frequency, min_word_len):
    '''
    Builds the LDA corpus: loads (or builds and stores) the file paths index,
    creates the dictionary, and serializes the corpus in the Blei LDA-C format.
    '''
    
    if os.path.exists(path_index_file): 
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the index.' % len(file_tuples))
    else: 
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))

        store_file_paths_index(path_index_file, file_tuples)
        logging.info('File paths index is stored into %s' % path_index_file)

    
    # Creates the dictionary 
    create_dictionary(stop_words_file, file_tuples, dictionary_file, min_frequency, min_word_len)
    
    # Creates the corpus 
    dictionary = corpora.Dictionary.load(dictionary_file)
    corpus_memory_friendly = TextCorpus(dictionary, file_tuples) # doesn't load the corpus into the memory!
    corpora.BleiCorpus.serialize(ldac_file, corpus_memory_friendly, id2word=dictionary)
    
    logging.info('The Enron corpus building is completed.')
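A minimal usage sketch for build_lda_corpus; the paths and the frequency/length thresholds below are hypothetical placeholders, not values taken from the original project.

import logging
logging.basicConfig(format='%(asctime)s : %(levelname)s : %(message)s', level=logging.INFO)

# hypothetical paths and thresholds; adjust to your own layout
build_lda_corpus(data_folder='/data/enron/plain_text',
                 path_index_file='/data/enron/enron.path.index',
                 stop_words_file='/data/enron/stop_words.txt',
                 dictionary_file='/data/enron/enron.dict',
                 ldac_file='/data/enron/enron.ldac',
                 min_frequency=5,
                 min_word_len=2)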
Code Example #2
def load_lda_parameters(mdl_cfg):
    
    dictionary_file = mdl_cfg['CORPUS']['dict_file']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']
    lda_mdl_file = mdl_cfg['LDA']['lda_model_file']
    lda_cos_index_file = mdl_cfg['LDA']['lda_cos_index_file']
    
    if nexists(dictionary_file) and nexists(path_index_file):       
        lda_file_path_index = load_file_paths_index(path_index_file)
        lda_dictionary = load_dictionary(dictionary_file)
        
    if nexists(lda_mdl_file) and nexists(lda_cos_index_file): 
        lda_mdl, lda_index = load_lda_variables(lda_mdl_file, lda_cos_index_file)
        
    lda_theta_file = mdl_cfg['LDA']['lda_theta_file']
    lda_theta = np.loadtxt(lda_theta_file) # loads the LDA theta from the model theta file 
    num_docs, num_topics = lda_theta.shape
    min_lda_theta = np.min(np.min(lda_theta))
    print 'LDA-theta is loaded: # of documents:', num_docs, \
        '# of topics:', num_topics, 'min(Theta):', min_lda_theta  
    
    lda_beta_file = mdl_cfg['LDA']['lda_beta_file']
    lda_beta = np.loadtxt(lda_beta_file) # loads the LDA beta from the model beta file 
    num_topics, vocab_size = lda_beta.shape
    min_lda_beta = np.min(np.min(lda_beta))
    print 'LDA-beta is loaded: # of topics:', num_topics, \
        '# of terms in the vocabulary:', vocab_size, \
        'min(Bheta):', min_lda_beta
    print     
    
    return lda_dictionary, lda_mdl, lda_index, lda_file_path_index, lda_theta, lda_beta
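A small follow-up sketch (not part of the original code) showing one way the returned lda_beta and lda_dictionary could be used to list the highest-weight terms of a topic; topic_id and the top-10 cutoff are arbitrary choices for illustration.

import numpy as np

topic_id = 0  # arbitrary topic to inspect
top_n = 10    # arbitrary number of terms to display

# lda_beta has shape (num_topics, vocab_size); take the largest entries of one row
top_term_ids = np.argsort(lda_beta[topic_id, :])[::-1][:top_n]
top_terms = [lda_dictionary[int(term_id)] for term_id in top_term_ids]
print(top_terms)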
Code Example #3
def load_lsi_parameters(mdl_cfg):
    
    dictionary_file = mdl_cfg['CORPUS']['dict_file']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']
    lsi_mdl_file = mdl_cfg['LSI']['lsi_model_file']
    lsi_cos_index_file = mdl_cfg['LSI']['lsi_cos_index_file']
    
    if nexists(dictionary_file) and nexists(path_index_file):       
        lsi_file_path_index = load_file_paths_index(path_index_file)
        lsi_dictionary = load_dictionary(dictionary_file)
        
    if nexists(lsi_mdl_file) and nexists(lsi_cos_index_file): 
        lsi_mdl, lsi_index = load_lsi_variables(lsi_mdl_file, lsi_cos_index_file)
        
    return lsi_dictionary, lsi_mdl, lsi_index, lsi_file_path_index
Code Example #4
def load_tm(mdl_cfg):
    
    dictionary_file = mdl_cfg['CORPUS']['dict_file']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']
    lda_mdl_file = mdl_cfg['LDA']['lda_model_file']
    lda_cos_index_file = mdl_cfg['LDA']['lda_cos_index_file']
    
    if nexists(dictionary_file) and nexists(path_index_file):       
        lda_file_path_index = load_file_paths_index(path_index_file)
        lda_dictionary = load_dictionary(dictionary_file)
        
    if nexists(lda_mdl_file) and nexists(lda_cos_index_file): 
        lda_mdl, lda_index = load_lda_variables(lda_mdl_file, lda_cos_index_file)
        
    return lda_dictionary, lda_mdl, lda_index, lda_file_path_index
Code Example #5
def test_query():
    '''
    This function tests a sample query 
    '''

    dictionary_file = '/home/cgeorge/data/tm/enron.dict'
    doc_paths_file = '/home/cgeorge/data/enron.path.index'
    lsi_mdl_file = '/home/cgeorge/data/tm/enron.lsi'
    lsi_index_file = '/home/cgeorge/data/tm/enron.lsi.cos.index'
    query = 'half from new deals and the other half from reserve releases, and when you back out the prudency release you get back to zero net curve shift for 2001, which is what the original file had'
    
    dictionary, lsi, index = load_lsi_variables(dictionary_file, lsi_mdl_file, lsi_index_file)
    files_info = load_file_paths_index(doc_paths_file)
    
    responsive_docs, non_responsive_docs = process_query(query, dictionary, lsi, index, files_info, limit=5)
    
    return responsive_docs, non_responsive_docs
Code Example #6
def search_tm_topics(topics_list, limit, mdl_cfg):   
    '''
    Performs a search on the topic model using the
    relevant topic indices
    '''

    EPS = 1e-24 # a small constant to avoid taking log(0) 
    lda_theta_file = mdl_cfg['LDA']['lda_theta_file']
    index_dir = mdl_cfg['LUCENE']['lucene_index_dir']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']    
    lda_file_path_index = load_file_paths_index(path_index_file) # loads the file paths    
    lda_theta = np.loadtxt(lda_theta_file, dtype=np.longdouble) # loads the LDA theta from the model theta file 
    num_docs, num_topics = lda_theta.shape
    
    print 'LDA-theta is loaded: number of documents: ', num_docs, ' number of topics: ', num_topics  
    
    unsel_topic_idx = [idx for idx in range(0, num_topics) if idx not in topics_list]
    sel = np.log(lda_theta[:, topics_list] + EPS)
    unsel = np.log(1.0 - lda_theta[:, unsel_topic_idx] + EPS)
    ln_score = sel.sum(axis=1) + unsel.sum(axis=1)  
    sorted_idx = ln_score.argsort(axis=0)[::-1]
    # score = np.exp(ln_score)
    
    # Normalize the topic index search score 
    # TODO: this is an ad-hoc method right now. May come back later... 
    min_ln_score = min(ln_score)
    n_ln_score = (1.0 - ln_score / min_ln_score)

    ts_results = []
    for i in range(0, min(limit, num_docs)):
        ts_results.append([lda_file_path_index[sorted_idx[i]][0], # document id  
                          lda_file_path_index[sorted_idx[i]][1], # document directory path   
                          lda_file_path_index[sorted_idx[i]][2], # document name
                          n_ln_score[sorted_idx[i]]]) # similarity score 
        # print lda_file_path_index[sorted_idx[i]], ln_score[sorted_idx[i]], n_ln_score[sorted_idx[i]], score[sorted_idx[i]] 
        

    # grabs the files details from the index     
    ts_results = get_indexed_file_details(ts_results, index_dir) 
    
    results = [[row[0], float(row[10])] for row in ts_results] # Note: we need a float conversion because it's retrieving as string 
    
    return results
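To make the scoring rule above concrete, here is a tiny self-contained sketch on a made-up 3-document, 4-topic theta matrix: each document's log score is the sum of log(theta) over the selected topics plus the sum of log(1 - theta) over the remaining topics, so documents that concentrate their probability mass on the selected topics rank first.

import numpy as np

EPS = 1e-24 # small constant to avoid log(0)
lda_theta = np.array([[0.70, 0.10, 0.10, 0.10],   # mostly topic 0
                      [0.05, 0.45, 0.45, 0.05],   # mostly topics 1 and 2
                      [0.25, 0.25, 0.25, 0.25]])  # spread evenly
topics_list = [1, 2]
unsel_topic_idx = [t for t in range(lda_theta.shape[1]) if t not in topics_list]

ln_score = (np.log(lda_theta[:, topics_list] + EPS).sum(axis=1)
            + np.log(1.0 - lda_theta[:, unsel_topic_idx] + EPS).sum(axis=1))
print(ln_score.argsort()[::-1])  # [1 2 0]: document 1 ranks first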
Code Example #7
def get_tm_classification_dataset(mdl_cfg_file, positive_dir):   
    
    mdl_cfg = read_config(mdl_cfg_file)

    lda_theta_file = mdl_cfg['LDA']['lda_theta_file']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']    
    lda_file_path_index = load_file_paths_index(path_index_file)    
    lda_theta = np.loadtxt(lda_theta_file, dtype=np.float64)
    num_docs, num_topics = lda_theta.shape
    
    print 'LDA Theta: Number of documents ', num_docs, ' number of topics ', num_topics  
    
    class_ids = np.zeros(num_docs)
    file_paths = [] 
    for i, (_, root, file_name) in enumerate(lda_file_path_index):
        if positive_dir == root: # os.path.exists(os.path.join(positive_dir, file_name)):
            class_ids[i] = RELEVANT_CLASS_ID
        else:
            class_ids[i] = IRRELEVANT_CLASS_ID
        file_paths.append(os.path.join(root, file_name))

        
    return (class_ids, lda_theta, file_paths)
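The tuple returned above has the shape of a standard supervised-learning dataset (per-document topic proportions as features, relevance labels as targets). As an illustration only, and not something mandated by the original code, a sketch of feeding it to a scikit-learn classifier could look like this, assuming mdl_cfg_file and positive_dir are already defined:

from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

class_ids, lda_theta, file_paths = get_tm_classification_dataset(mdl_cfg_file, positive_dir)

# hold out 20% of the documents for a quick sanity check of the classifier
X_train, X_test, y_train, y_test = train_test_split(lda_theta, class_ids,
                                                    test_size=0.2, random_state=0)
clf = LogisticRegression()
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))  # mean accuracy on the held-out split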
Code Example #8
def search_tm_sel_topics_cos(topics_list, topics_prob, limit, mdl_cfg):   

    lda_theta_file = mdl_cfg['LDA']['lda_theta_file']
    index_dir = mdl_cfg['LUCENE']['lucene_index_dir']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']    
    lda_file_path_index = load_file_paths_index(path_index_file)    
    lda_theta = np.loadtxt(lda_theta_file, dtype=np.longdouble)
    num_docs, num_topics = lda_theta.shape
    
    print 'Number of documents: ', num_docs, ' number of topics: ', num_topics  
    
    from scipy.spatial.distance import cosine 
    topics_prob = np.array(topics_prob)
    sel = lda_theta[:, topics_list]
    cos_scores = np.zeros(num_docs)
    for i in range(0, num_docs):
        cos_scores[i] = 1.0 - cosine(topics_prob, sel[i, :]) # cosine similarity (scipy's cosine() returns the distance)

    sorted_idx = cos_scores.argsort(axis=0)[::-1]
    
    ts_results = []
    
    for i in range(0, min(limit, num_docs)):
        ts_results.append([lda_file_path_index[sorted_idx[i]][0], 
                          lda_file_path_index[sorted_idx[i]][1], 
                          lda_file_path_index[sorted_idx[i]][2], 
                          cos_scores[sorted_idx[i]]])
        print lda_file_path_index[sorted_idx[i]], cos_scores[sorted_idx[i]]
        

    # grabs the files details from the index 
    
    ts_results = get_indexed_file_details(ts_results, index_dir) 
    results = [[row[0], row[10]] for row in ts_results]
    
    return results
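Note that scipy.spatial.distance.cosine returns a cosine distance (one minus the cosine similarity), which is why the score above is formed as 1.0 - cosine(...) before the descending sort. An equivalent NumPy-only sketch of the same ranking step, shown here purely as an illustration:

import numpy as np

def rank_by_cosine_similarity(theta_sel, query_vec, limit):
    # theta_sel: (num_docs, num_selected_topics); query_vec: (num_selected_topics,)
    query_vec = np.asarray(query_vec, dtype=np.float64)
    norms = np.linalg.norm(theta_sel, axis=1) * np.linalg.norm(query_vec)
    sims = theta_sel.dot(query_vec) / np.where(norms > 0.0, norms, 1.0)
    return sims.argsort()[::-1][:limit]  # indices of the most similar documents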
Code Example #9
def index_plain_text_emails(data_folder, 
                            path_index_file, store_dir, 
                            lemmatize = False, stem = False, 
                            nonascii = True):
    '''
    Indexes all the plain text emails in the input directory 
    and stores the index in the store_dir  
    
    Arguments: 
        data_folder - input directory absolute path 
        path_index_file - file paths index file 
        store_dir - index store directory absolute path 
    Returns: 
        None 

    '''
    
    if not os.path.exists(store_dir): 
        os.mkdir(store_dir)
    
    
    if os.path.exists(path_index_file): 
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.' % len(file_tuples))
    else: 
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
    
        logging.info('%d email documents found.' % len(file_tuples))
    
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('File paths index is stored into %s' % path_index_file)
    
    logging.info('Lucene: Stem = %s, Lemmatize = %s, Number of documents = %d' % (stem, lemmatize, len(file_tuples)))
        
    store = SimpleFSDirectory(File(store_dir))
    writer = IndexWriter(store, STD_ANALYZER, True, IndexWriter.MaxFieldLength.LIMITED)
    
    print 'Lucene:', len(file_tuples), 'files found in %s.' % data_folder
    print 'Lucene: Stem =', stem, 'Lemmatize =', lemmatize, 'Allow non-ASCII =', nonascii  
    
    for ft in file_tuples: 
        idx, root, file_name = ft
        file_path = os.path.join(root, file_name)
        logging.info("[%d] file: %s - adding to Lucene index.", idx, file_name)
        # parses the emails in plain text format 
        receiver, sender, cc, subject, message_text, bcc, date, email_text = parse_plain_text_email(file_path, 
                                                                                                    tokenize = True, 
                                                                                                    lemmatize = lemmatize, 
                                                                                                    stem = stem, 
                                                                                                    nonascii = nonascii)

        doc = Document()
        doc.add(Field(MetadataType.FILE_ID, str(idx), Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field(MetadataType.FILE_NAME, file_name, Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.YES))
        doc.add(Field(MetadataType.FILE_PATH, file_path, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_RECEIVER, receiver, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_SENDER, sender, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_CC, cc, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_SUBJECT, subject, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        #Subodh-Rahul - Added BCC field in indexing.
        doc.add(Field(MetadataType.EMAIL_BCC, bcc, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        #Subodh-Rahul - Added Email-Date field in indexing
        doc.add(Field(MetadataType.EMAIL_DATE, date, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        
        if len(message_text) > 0:
            doc.add(Field(MetadataType.EMAIL_BODY, message_text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))
        else:
            logging.error("[%d] file: %s - body text is empty.", idx, file_name)
            
        # Adds the full email text as a single combined field so that all fields can be searched together 
        doc.add(Field(MetadataType.ALL, email_text, Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))

        writer.addDocument(doc)
        logging.info("[%d] file: %s - added to Lucene index.", idx, file_name)


    writer.commit()
    writer.close()

    logging.info('Lucene: All files are indexed.')
Code Example #10
def index_plain_text_emails(data_folder, path_index_file, store_dir, 
                            lemmatize=False, stem=False, nonascii=True):
    '''
    Indexes all the plain text emails in the input directory and stores the 
    index in the store_dir  
    
    Arguments: 
        data_folder - input directory (absolute path)
        path_index_file - file paths index file 
        store_dir - index store directory absolute path 
        lemmatize - lemmatize tokens based on the NLTK WordNet lemmatizer 
        stem - stem tokens 
        nonascii - allow non-ASCII characters  
         
        
    Returns: 
        None 

    '''
    
    if not os.path.exists(store_dir): os.mkdir(store_dir)
    
    if os.path.exists(path_index_file): 
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.' % len(file_tuples))
    else: 
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))    
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('Index file path: %s' % path_index_file)

    schema = Schema(file_id=NUMERIC(int, stored=True), 
                    file_name=ID(stored=True), 
                    file_path=ID(stored=True), 
                    email_reciever=TEXT(stored=True), 
                    email_sender=TEXT(stored=True), 
                    email_cc=TEXT(stored=True), 
                    email_subject=TEXT(stored=True), 
                    email_bcc=TEXT(stored=True),
                    date=ID(stored=True),
                    email_body=TEXT(stored=True),
                    all=TEXT(stored=True))
    ix = create_in(store_dir, schema)
    writer = ix.writer()
    logging.info('Stem = %s, Lemmatize = %s, D = %d, non-ASCII = %s' 
                 % (stem, lemmatize, len(file_tuples), nonascii))
    
    for ft in file_tuples: 
        idx, root, file_name, file_type = ft
        file_path = os.path.join(root, file_name)
        logging.info("[%d] creating index for %s...", idx, file_name)
        
        
        ret = parse_plain_text_email(file_path, lemmatize=lemmatize, stem=stem, 
                                     nonascii=nonascii, file_type=file_type)

        (receiver, sender, cc, subject, body_text, bcc, date, doc_text) = ret
        
        writer.add_document(file_id = idx, 
                            file_name = unicode(file_name), 
                            file_path = unicode(file_path), 
                            email_reciever = unicode(receiver), 
                            email_sender = unicode(sender), 
                            email_cc = unicode(cc),
                            email_subject = unicode(subject), 
                            email_bcc = unicode(bcc), 
                            date = unicode(date), 
                            email_body = unicode(body_text), 
                            all = unicode(doc_text))
 
    writer.commit()
    logging.info('All files are indexed.')
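Once the Whoosh index has been written, it can be queried back with the standard Whoosh search API. A minimal sketch, assuming store_dir is the same directory used above; the query string is just an example:

from whoosh.index import open_dir
from whoosh.qparser import QueryParser

ix = open_dir(store_dir)
with ix.searcher() as searcher:
    query = QueryParser("email_body", ix.schema).parse(u"reserve releases")
    hits = searcher.search(query, limit=10)
    for hit in hits:
        print(hit["file_path"])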
Code Example #11
def index_plain_text_emails2(data_folder, path_index_file, store_dir, 
                             stem=False, min_token_len=2, max_token_len=40,
                             procs=1, limitmb=128, multisegment=False, 
                             max_doc_length=-1):
    '''
    Indexes all the plain text emails and attachements in the input directory 
    and stores the index in the store_dir  
    
    Arguments: 
        data_folder - input directory (absolute path)
        path_index_file - file paths index file 
        store_dir - index store directory absolute path 
        stem - stem tokens 
        min_token_len - minimum required length for a token 
        max_token_len - maximum required length for a token 
        procs - number of processors 
        limitmb - memory limit
        multisegment - allow multi-segment write  
        max_doc_length - max document length 
        
    Returns: 
        None 

    '''

    if os.path.exists(path_index_file): 
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.' % len(file_tuples))
    else: 
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))    
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('Index file path: %s' % path_index_file)

    if stem:
        analyzer = StemmingAnalyzer(expression=pat4, stoplist=stop_words, 
                                    minsize=min_token_len, 
                                    maxsize=max_token_len, 
                                    cachesize=-1)
    else: 
        analyzer = StandardAnalyzer(expression=pat4, stoplist=stop_words, 
                                    minsize=min_token_len, 
                                    maxsize=max_token_len)        
    std_ana = StandardAnalyzer(stoplist=None)    
    schema = Schema(file_id=NUMERIC(int, stored=True), 
                    file_name=ID(stored=True), file_path=ID(stored=True), 
                    email_reciever=TEXT(stored=True, analyzer=std_ana), 
                    email_sender=TEXT(stored=True, analyzer=std_ana), 
                    email_cc=TEXT(stored=True, analyzer=std_ana), 
                    email_subject=TEXT(stored=True, analyzer=std_ana), 
                    email_bcc=TEXT(stored=True, analyzer=std_ana),
                    date=ID(stored=True), 
                    email_body=TEXT(stored=True, analyzer=analyzer),
                    all=TEXT(stored=True, analyzer=analyzer))
    if not os.path.exists(store_dir): os.mkdir(store_dir)
    ix = create_in(store_dir, schema)
    
    if procs > 1: 
        writer = ix.writer(procs=procs, limitmb=limitmb, 
                           multisegment=multisegment)
    else: 
        writer = ix.writer(limitmb=limitmb)

    logging.info('Stem = %s, D = %d' % (stem, len(file_tuples)))
    
    truncate_count = 0
    for ft in file_tuples: 
        idx, root, file_name, file_type = ft
        file_path = os.path.join(root, file_name)
        logging.info("[%d] creating index for %s...", idx, file_name)
        
        (receiver, sender, cc, subject, body_text, bcc, date, 
         doc_text) = parse_text_emails_and_attachments(file_path, file_type)
        
        # TODO this needs to be removed 
        et = doc_text.split()
        if max_doc_length > 1 and len(et) > max_doc_length: 
            doc_text = " ".join(et[:max_doc_length])
            truncate_count += 1
        
        writer.add_document(file_id = idx, 
                            file_name = unicode(file_name), 
                            file_path = unicode(file_path), 
                            email_reciever = unicode(receiver), 
                            email_sender = unicode(sender), 
                            email_cc = unicode(cc),
                            email_subject = unicode(subject), 
                            email_bcc = unicode(bcc), 
                            date = unicode(date), 
                            email_body = unicode(body_text), 
                            all = unicode(doc_text))
    writer.commit()

    logging.info('%d documents were truncated.', truncate_count)
    logging.info('All files are indexed.')