def build_lda_corpus(data_folder, path_index_file, stop_words_file,
                     dictionary_file, ldac_file, min_frequency, min_word_len):
    '''
    Builds the dictionary and the LDA-C (Blei) corpus from the email
    documents found in the data folder
    '''
    if os.path.exists(path_index_file):
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the index.' % len(file_tuples))
    else:
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('File paths index is stored into %s' % path_index_file)

    # Creates the dictionary
    create_dictionary(stop_words_file, file_tuples, dictionary_file,
                      min_frequency, min_word_len)

    # Creates the corpus
    dictionary = corpora.Dictionary.load(dictionary_file)
    corpus_memory_friendly = TextCorpus(dictionary, file_tuples)  # doesn't load the corpus into memory!
    corpora.BleiCorpus.serialize(ldac_file, corpus_memory_friendly,
                                 id2word=dictionary)

    logging.info('The Enron corpus building is completed.')

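# The following is an illustrative usage sketch only (not part of the original
# pipeline): the file paths and parameter values are hypothetical, and it
# assumes gensim's BleiCorpus and LdaModel API, which this module already
# relies on for corpus serialization.
def example_build_corpus_and_train_lda():
    from gensim import models

    data_folder = '/data/enron/emails'          # hypothetical input directory
    path_index_file = '/data/enron.path.index'  # hypothetical file paths index
    stop_words_file = '/data/stop_words.txt'    # hypothetical stop-word list
    dictionary_file = '/data/tm/enron.dict'
    ldac_file = '/data/tm/enron.ldac'

    # builds the dictionary and serializes the corpus in the LDA-C (Blei) format
    build_lda_corpus(data_folder, path_index_file, stop_words_file,
                     dictionary_file, ldac_file, min_frequency=5, min_word_len=2)

    # trains a gensim LDA model on the serialized corpus
    dictionary = corpora.Dictionary.load(dictionary_file)
    corpus = corpora.BleiCorpus(ldac_file)
    lda_mdl = models.LdaModel(corpus, id2word=dictionary, num_topics=50)
    lda_mdl.save('/data/tm/enron.lda')
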
def load_lda_parameters(mdl_cfg):

    dictionary_file = mdl_cfg['CORPUS']['dict_file']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']
    lda_mdl_file = mdl_cfg['LDA']['lda_model_file']
    lda_cos_index_file = mdl_cfg['LDA']['lda_cos_index_file']

    if nexists(dictionary_file) and nexists(path_index_file):
        lda_file_path_index = load_file_paths_index(path_index_file)
        lda_dictionary = load_dictionary(dictionary_file)

    if nexists(lda_mdl_file) and nexists(lda_cos_index_file):
        lda_mdl, lda_index = load_lda_variables(lda_mdl_file, lda_cos_index_file)

    lda_theta_file = mdl_cfg['LDA']['lda_theta_file']
    lda_theta = np.loadtxt(lda_theta_file)  # loads the LDA theta from the model theta file
    num_docs, num_topics = lda_theta.shape
    min_lda_theta = np.min(lda_theta)
    print 'LDA-theta is loaded: # of documents:', num_docs, \
        '# of topics:', num_topics, 'min(Theta):', min_lda_theta

    lda_beta_file = mdl_cfg['LDA']['lda_beta_file']
    lda_beta = np.loadtxt(lda_beta_file)  # loads the LDA beta from the model beta file
    num_topics, vocab_size = lda_beta.shape
    min_lda_beta = np.min(lda_beta)
    print 'LDA-beta is loaded: # of topics:', num_topics, \
        '# of terms in the vocabulary:', vocab_size, \
        'min(Beta):', min_lda_beta
    print

    return lda_dictionary, lda_mdl, lda_index, lda_file_path_index, lda_theta, lda_beta

def load_lsi_parameters(mdl_cfg):

    dictionary_file = mdl_cfg['CORPUS']['dict_file']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']
    lsi_mdl_file = mdl_cfg['LSI']['lsi_model_file']
    lsi_cos_index_file = mdl_cfg['LSI']['lsi_cos_index_file']

    if nexists(dictionary_file) and nexists(path_index_file):
        lsi_file_path_index = load_file_paths_index(path_index_file)
        lsi_dictionary = load_dictionary(dictionary_file)

    if nexists(lsi_mdl_file) and nexists(lsi_cos_index_file):
        lsi_mdl, lsi_index = load_lsi_variables(lsi_mdl_file, lsi_cos_index_file)

    return lsi_dictionary, lsi_mdl, lsi_index, lsi_file_path_index

def load_tm(mdl_cfg):

    dictionary_file = mdl_cfg['CORPUS']['dict_file']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']
    lda_mdl_file = mdl_cfg['LDA']['lda_model_file']
    lda_cos_index_file = mdl_cfg['LDA']['lda_cos_index_file']

    if nexists(dictionary_file) and nexists(path_index_file):
        lda_file_path_index = load_file_paths_index(path_index_file)
        lda_dictionary = load_dictionary(dictionary_file)

    if nexists(lda_mdl_file) and nexists(lda_cos_index_file):
        lda_mdl, lda_index = load_lda_variables(lda_mdl_file, lda_cos_index_file)

    return lda_dictionary, lda_mdl, lda_index, lda_file_path_index

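# Illustrative usage sketch (not in the original code base): 'enron.cfg' is a
# hypothetical configuration file; read_config is the same helper used
# elsewhere in this code base to parse the model configuration.
def example_load_tm():
    mdl_cfg = read_config('enron.cfg')  # hypothetical config file path
    lda_dictionary, lda_mdl, lda_index, lda_file_path_index = load_tm(mdl_cfg)
    print '# of terms in the LDA dictionary:', len(lda_dictionary)
    print '# of documents in the file path index:', len(lda_file_path_index)
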
def test_query():
    '''
    This function tests a sample query
    '''
    dictionary_file = '/home/cgeorge/data/tm/enron.dict'
    doc_paths_file = '/home/cgeorge/data/enron.path.index'
    lsi_mdl_file = '/home/cgeorge/data/tm/enron.lsi'
    lsi_index_file = '/home/cgeorge/data/tm/enron.lsi.cos.index'
    query = ('half from new deals and the other half from reserve releases, '
             'and when you back out the prudency release you get back to zero '
             'net curve shift for 2001, which is what the original file had')

    dictionary, lsi, index = load_lsi_variables(dictionary_file, lsi_mdl_file,
                                                lsi_index_file)
    files_info = load_file_paths_index(doc_paths_file)
    responsive_docs, non_responsive_docs = process_query(query, dictionary, lsi,
                                                         index, files_info, limit=5)

    return responsive_docs, non_responsive_docs

def search_tm_topics(topics_list, limit, mdl_cfg):
    '''
    Performs search on the topic model using relevant topic indices
    '''
    EPS = 1e-24  # a small constant to avoid log(0)
    lda_theta_file = mdl_cfg['LDA']['lda_theta_file']
    index_dir = mdl_cfg['LUCENE']['lucene_index_dir']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']
    lda_file_path_index = load_file_paths_index(path_index_file)  # loads the file paths
    lda_theta = np.loadtxt(lda_theta_file, dtype=np.longdouble)  # loads the LDA theta from the model theta file
    num_docs, num_topics = lda_theta.shape
    print 'LDA-theta is loaded: number of documents:', num_docs, 'number of topics:', num_topics

    unsel_topic_idx = [idx for idx in range(0, num_topics) if idx not in topics_list]
    sel = np.log(lda_theta[:, topics_list] + EPS)
    unsel = np.log(1.0 - lda_theta[:, unsel_topic_idx] + EPS)
    ln_score = sel.sum(axis=1) + unsel.sum(axis=1)
    sorted_idx = ln_score.argsort(axis=0)[::-1]
    # score = np.exp(ln_score)

    # Normalizes the topic index search score
    # TODO: this is an ad hoc method right now. May come back later...
    min_ln_score = min(ln_score)
    n_ln_score = (1.0 - ln_score / min_ln_score)

    ts_results = []
    for i in range(0, min(limit, num_docs)):
        ts_results.append([lda_file_path_index[sorted_idx[i]][0],  # document id
                           lda_file_path_index[sorted_idx[i]][1],  # document directory path
                           lda_file_path_index[sorted_idx[i]][2],  # document name
                           n_ln_score[sorted_idx[i]]])             # similarity score
        # print lda_file_path_index[sorted_idx[i]], ln_score[sorted_idx[i]], n_ln_score[sorted_idx[i]], score[sorted_idx[i]]

    # grabs the file details from the index
    ts_results = get_indexed_file_details(ts_results, index_dir)
    results = [[row[0], float(row[10])] for row in ts_results]  # Note: float conversion is needed because the score is retrieved as a string

    return results

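# Illustrative usage sketch (not part of the original module). The score
# computed by search_tm_topics() for a document d is
#   ln_score(d) = sum_{k in selected} ln(theta[d, k] + EPS)
#               + sum_{k not in selected} ln(1 - theta[d, k] + EPS),
# which favors documents whose topic mass is concentrated on the selected
# topics. The config path, topic indices, and limit below are hypothetical.
def example_search_tm_topics():
    mdl_cfg = read_config('enron.cfg')  # hypothetical config file path
    relevant_topics = [3, 17, 42]       # hypothetical topic indices
    results = search_tm_topics(relevant_topics, 100, mdl_cfg)
    for doc_id, score in results:
        print doc_id, score
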
def get_tm_classification_dataset(mdl_cfg_file, positive_dir):

    mdl_cfg = read_config(mdl_cfg_file)
    lda_theta_file = mdl_cfg['LDA']['lda_theta_file']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']
    lda_file_path_index = load_file_paths_index(path_index_file)
    lda_theta = np.loadtxt(lda_theta_file, dtype=np.float)
    num_docs, num_topics = lda_theta.shape
    print 'LDA Theta: Number of documents', num_docs, ' number of topics', num_topics

    class_ids = np.zeros(num_docs)
    file_paths = []
    for i, (_, root, file_name) in enumerate(lda_file_path_index):
        if positive_dir == root:  # os.path.exists(os.path.join(positive_dir, file_name)):
            class_ids[i] = RELEVANT_CLASS_ID
        else:
            class_ids[i] = IRRELEVANT_CLASS_ID
        file_paths.append(os.path.join(root, file_name))

    return (class_ids, lda_theta, file_paths)

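# Illustrative sketch only: one possible way to use the dataset returned by
# get_tm_classification_dataset(), assuming scikit-learn is available (it is
# not a dependency of the original code). The config file and positive-class
# directory are hypothetical.
def example_train_tm_classifier():
    from sklearn.linear_model import LogisticRegression

    mdl_cfg_file = 'enron.cfg'               # hypothetical config file
    positive_dir = '/data/enron/responsive'  # hypothetical positive-class directory
    class_ids, lda_theta, _ = get_tm_classification_dataset(mdl_cfg_file, positive_dir)

    # fits a classifier using the document-topic proportions as features
    clf = LogisticRegression()
    clf.fit(lda_theta, class_ids)
    print 'Training accuracy:', clf.score(lda_theta, class_ids)
    return clf
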
def search_tm_sel_topics_cos(topics_list, topics_prob, limit, mdl_cfg):

    lda_theta_file = mdl_cfg['LDA']['lda_theta_file']
    index_dir = mdl_cfg['LUCENE']['lucene_index_dir']
    path_index_file = mdl_cfg['CORPUS']['path_index_file']
    lda_file_path_index = load_file_paths_index(path_index_file)
    lda_theta = np.loadtxt(lda_theta_file, dtype=np.longdouble)
    num_docs, num_topics = lda_theta.shape
    print 'Number of documents:', num_docs, 'number of topics:', num_topics

    from scipy.spatial.distance import cosine

    topics_prob = np.array(topics_prob)
    sel = lda_theta[:, topics_list]
    cos_scores = np.zeros(num_docs)
    for i in range(0, num_docs):
        # scipy's cosine() returns a distance (1 - cosine similarity),
        # so it is converted into a similarity score here
        cos_scores[i] = 1.0 - cosine(topics_prob, sel[i, :])
    sorted_idx = cos_scores.argsort(axis=0)[::-1]  # most similar documents first

    ts_results = []
    for i in range(0, min(limit, num_docs)):
        ts_results.append([lda_file_path_index[sorted_idx[i]][0],  # document id
                           lda_file_path_index[sorted_idx[i]][1],  # document directory path
                           lda_file_path_index[sorted_idx[i]][2],  # document name
                           cos_scores[sorted_idx[i]]])             # similarity score
        print lda_file_path_index[sorted_idx[i]], cos_scores[sorted_idx[i]]

    # grabs the file details from the index
    ts_results = get_indexed_file_details(ts_results, index_dir)
    results = [[row[0], row[10]] for row in ts_results]

    return results

def index_plain_text_emails(data_folder, path_index_file, store_dir,
                            lemmatize=False, stem=False, nonascii=True):
    '''
    Indexes all the plain text emails in the input directory
    and stores the index in the store_dir

    Arguments:
        data_folder - input directory (absolute path)
        path_index_file - file paths index file
        store_dir - index store directory (absolute path)
        lemmatize - lemmatize tokens based on the NLTK WordNet lemmatizer
        stem - stem tokens
        nonascii - allow non-ASCII characters

    Returns:
        None
    '''

    if not os.path.exists(store_dir):
        os.mkdir(store_dir)

    if os.path.exists(path_index_file):
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.' % len(file_tuples))
    else:
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('File paths index is stored into %s' % path_index_file)

    logging.info('Lucene: Stem = %s, Lemmatize = %s, Number of documents = %d'
                 % (stem, lemmatize, len(file_tuples)))

    store = SimpleFSDirectory(File(store_dir))
    writer = IndexWriter(store, STD_ANALYZER, True, IndexWriter.MaxFieldLength.LIMITED)

    print 'Lucene:', len(file_tuples), 'files found in %s.' % data_folder
    print 'Lucene: Stem =', stem, 'Lemmatize =', lemmatize, 'Allow non-ASCII =', nonascii

    for ft in file_tuples:
        idx, root, file_name = ft
        file_path = os.path.join(root, file_name)
        logging.info('[%d] file: %s - adding to Lucene index.', idx, file_name)

        # parses the emails in plain text format
        (receiver, sender, cc, subject, message_text,
         bcc, date, email_text) = parse_plain_text_email(file_path,
                                                         tokenize=True,
                                                         lemmatize=lemmatize,
                                                         stem=stem,
                                                         nonascii=nonascii)

        doc = Document()
        doc.add(Field(MetadataType.FILE_ID, str(idx),
                      Field.Store.YES, Field.Index.NOT_ANALYZED))
        doc.add(Field(MetadataType.FILE_NAME, file_name,
                      Field.Store.YES, Field.Index.NOT_ANALYZED, Field.TermVector.YES))
        doc.add(Field(MetadataType.FILE_PATH, file_path,
                      Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_RECEIVER, receiver,
                      Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_SENDER, sender,
                      Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_CC, cc,
                      Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        doc.add(Field(MetadataType.EMAIL_SUBJECT, subject,
                      Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        # Subodh-Rahul - Added BCC field in indexing.
        doc.add(Field(MetadataType.EMAIL_BCC, bcc,
                      Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))
        # Subodh-Rahul - Added Email-Date field in indexing
        doc.add(Field(MetadataType.EMAIL_DATE, date,
                      Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.WITH_POSITIONS_OFFSETS))

        if len(message_text) > 0:
            doc.add(Field(MetadataType.EMAIL_BODY, message_text,
                          Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))
        else:
            logging.error('[%d] file: %s - body text is empty.', idx, file_name)

        # Adds all document fields as a separate index field so that we can search through them
        doc.add(Field(MetadataType.ALL, email_text,
                      Field.Store.YES, Field.Index.ANALYZED, Field.TermVector.YES))

        writer.addDocument(doc)
        logging.info('[%d] file: %s - added to Lucene index.', idx, file_name)

    writer.commit()
    writer.close()
    logging.info('Lucene: All files are indexed.')

def index_plain_text_emails(data_folder, path_index_file, store_dir,
                            lemmatize=False, stem=False, nonascii=True):
    '''
    Indexes all the plain text emails in the input directory
    and stores the index in the store_dir

    Arguments:
        data_folder - input directory (absolute path)
        path_index_file - file paths index file
        store_dir - index store directory (absolute path)
        lemmatize - lemmatize tokens based on the NLTK WordNet lemmatizer
        stem - stem tokens
        nonascii - allow non-ASCII characters

    Returns:
        None
    '''

    if not os.path.exists(store_dir):
        os.mkdir(store_dir)

    if os.path.exists(path_index_file):
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.' % len(file_tuples))
    else:
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('Index file path: %s' % path_index_file)

    schema = Schema(file_id=NUMERIC(int, stored=True),
                    file_name=ID(stored=True),
                    file_path=ID(stored=True),
                    email_reciever=TEXT(stored=True),
                    email_sender=TEXT(stored=True),
                    email_cc=TEXT(stored=True),
                    email_subject=TEXT(stored=True),
                    email_bcc=TEXT(stored=True),
                    date=ID(stored=True),
                    email_body=TEXT(stored=True),
                    all=TEXT(stored=True))
    ix = create_in(store_dir, schema)
    writer = ix.writer()

    logging.info('Stem = %s, Lemmatize = %s, D = %d, non-ASCII = %s'
                 % (stem, lemmatize, len(file_tuples), nonascii))

    for ft in file_tuples:
        idx, root, file_name, file_type = ft
        file_path = os.path.join(root, file_name)
        logging.info('[%d] creating index for %s...', idx, file_name)

        ret = parse_plain_text_email(file_path, lemmatize=lemmatize, stem=stem,
                                     nonascii=nonascii, file_type=file_type)
        (receiver, sender, cc, subject, body_text, bcc, date, doc_text) = ret

        writer.add_document(file_id=idx,
                            file_name=unicode(file_name),
                            file_path=unicode(file_path),
                            email_reciever=unicode(receiver),
                            email_sender=unicode(sender),
                            email_cc=unicode(cc),
                            email_subject=unicode(subject),
                            email_bcc=unicode(bcc),
                            date=unicode(date),
                            email_body=unicode(body_text),
                            all=unicode(doc_text))

    writer.commit()
    logging.info('All files are indexed.')

def index_plain_text_emails2(data_folder, path_index_file, store_dir, stem=False,
                             min_token_len=2, max_token_len=40, procs=1,
                             limitmb=128, multisegment=False, max_doc_length=-1):
    '''
    Indexes all the plain text emails and attachments in the input directory
    and stores the index in the store_dir

    Arguments:
        data_folder - input directory (absolute path)
        path_index_file - file paths index file
        store_dir - index store directory (absolute path)
        stem - stem tokens
        min_token_len - minimum allowed length for a token
        max_token_len - maximum allowed length for a token
        procs - number of processors
        limitmb - memory limit (in MB)
        multisegment - allow multi-segment writes
        max_doc_length - maximum document length in tokens (-1 disables truncation)

    Returns:
        None
    '''

    if os.path.exists(path_index_file):
        logging.info('Loading file paths index...')
        file_tuples = load_file_paths_index(path_index_file)
        logging.info('%d files found in the file paths index.' % len(file_tuples))
    else:
        logging.info('Loading files in the data folder %s...' % data_folder)
        file_tuples = get_file_paths_index(data_folder)
        logging.info('%d email documents found.' % len(file_tuples))
        store_file_paths_index(path_index_file, file_tuples)
        logging.info('Index file path: %s' % path_index_file)

    if stem:
        analyzer = StemmingAnalyzer(expression=pat4, stoplist=stop_words,
                                    minsize=min_token_len, maxsize=max_token_len,
                                    cachesize=-1)
    else:
        analyzer = StandardAnalyzer(expression=pat4, stoplist=stop_words,
                                    minsize=min_token_len, maxsize=max_token_len)
    std_ana = StandardAnalyzer(stoplist=None)

    schema = Schema(file_id=NUMERIC(int, stored=True),
                    file_name=ID(stored=True),
                    file_path=ID(stored=True),
                    email_reciever=TEXT(stored=True, analyzer=std_ana),
                    email_sender=TEXT(stored=True, analyzer=std_ana),
                    email_cc=TEXT(stored=True, analyzer=std_ana),
                    email_subject=TEXT(stored=True, analyzer=std_ana),
                    email_bcc=TEXT(stored=True, analyzer=std_ana),
                    date=ID(stored=True),
                    email_body=TEXT(stored=True, analyzer=analyzer),
                    all=TEXT(stored=True, analyzer=analyzer))

    if not os.path.exists(store_dir):
        os.mkdir(store_dir)
    ix = create_in(store_dir, schema)

    if procs > 1:
        writer = ix.writer(procs=procs, limitmb=limitmb, multisegment=multisegment)
    else:
        writer = ix.writer(limitmb=limitmb)

    logging.info('Stem = %s, D = %d' % (stem, len(file_tuples)))

    truncate_count = 0
    for ft in file_tuples:
        idx, root, file_name, file_type = ft
        file_path = os.path.join(root, file_name)
        logging.info('[%d] creating index for %s...', idx, file_name)

        (receiver, sender, cc, subject, body_text,
         bcc, date, doc_text) = parse_text_emails_and_attachments(file_path, file_type)

        # TODO: this needs to be removed (truncates overly long documents)
        et = doc_text.split()
        if max_doc_length > 1 and len(et) > max_doc_length:
            doc_text = ' '.join(et[:max_doc_length])
            truncate_count += 1

        writer.add_document(file_id=idx,
                            file_name=unicode(file_name),
                            file_path=unicode(file_path),
                            email_reciever=unicode(receiver),
                            email_sender=unicode(sender),
                            email_cc=unicode(cc),
                            email_subject=unicode(subject),
                            email_bcc=unicode(bcc),
                            date=unicode(date),
                            email_body=unicode(body_text),
                            all=unicode(doc_text))

    writer.commit()

    logging.info('%d documents are truncated.', truncate_count)
    logging.info('All files are indexed.')

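# Illustrative sketch (not part of the original module): querying an index
# created by either Whoosh-based indexer above, using the standard Whoosh
# query API (open_dir, QueryParser). The 'all' field matches the schemas
# defined above; the index directory and query string are hypothetical.
def example_search_whoosh_index(store_dir):
    from whoosh.index import open_dir
    from whoosh.qparser import QueryParser

    ix = open_dir(store_dir)
    with ix.searcher() as searcher:
        query = QueryParser('all', ix.schema).parse(u'prudency release net curve shift')
        hits = searcher.search(query, limit=10)
        for hit in hits:
            print hit['file_id'], hit['file_path'], hit.score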