def get_all_tokens_worker(entry_queue, results_queue, ngram):
    """Worker process: counts all ngrams in the files pulled from entry_queue.

    Partial counters are pushed to results_queue. A FLAG_ALL_DONE entry tells the
    worker to flush its counter, signal completion, and terminate.
    """

    token_counter = Counter()

    # ngram levels >= 2 need the vocabulary of the next lower level for validity checks
    if ngram >= 2:
        vocabulary = load_vocabulary_trie(ngram - 1)
    else:
        vocabulary = None  # not needed for 1-grams (the validity check is skipped)

    while True:
        entry = entry_queue.get()
        if entry == FLAG_ALL_DONE:
            results_queue.put(token_counter)
            results_queue.put(FLAG_WORKER_FINISHED_PROCESSING)
            break
        else:
            # back off if the results queue or the local counter grow too large
            if results_queue.qsize() > 30 or len(token_counter) > 10000000:
                print("Sleeping. Results qsize: ", results_queue.qsize())
                time.sleep(10)
            # flush the local counter once it is large and the consumer can keep up
            if len(token_counter) > 1000000 and results_queue.qsize() < 5:
                results_queue.put(token_counter)
                token_counter = Counter()

            ocr = open(entry, encoding='cp1252', errors='ignore').read()
            for token in ngram_generator(ocr, ngram):
                if ngram == 1 or check_ngram_validity(token, vocabulary, ngram):
                    token_counter[" ".join(token)] += 1

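# Hedged sketch (illustration only, not the repository's actual driver): one way
# get_all_tokens_worker can be wired up with multiprocessing queues and the
# FLAG_ALL_DONE / FLAG_WORKER_FINISHED_PROCESSING sentinels. `file_paths` and
# `n_workers` are hypothetical parameters.
def _example_run_token_workers(file_paths, ngram=1, n_workers=4):
    import multiprocessing as mp
    from collections import Counter

    entry_queue = mp.Queue()
    results_queue = mp.Queue()

    for path in file_paths:
        entry_queue.put(path)
    # one sentinel per worker so that every worker terminates
    for _ in range(n_workers):
        entry_queue.put(FLAG_ALL_DONE)

    workers = [mp.Process(target=get_all_tokens_worker, args=(entry_queue, results_queue, ngram))
               for _ in range(n_workers)]
    for w in workers:
        w.start()

    # merge partial counters until every worker has signaled completion
    merged = Counter()
    finished = 0
    while finished < n_workers:
        result = results_queue.get()
        if result == FLAG_WORKER_FINISHED_PROCESSING:
            finished += 1
        else:
            merged.update(result)

    for w in workers:
        w.join()
    return merged
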
def distinctive_terms_overall(main_name):
    """Computes and prints the distinctive terms for main_name by comparing the
    local totals against the global 1-gram totals, localized to the local vocabulary."""

    global_totals = get_vocabulary_totals(1)
    vocabulary_trie = load_vocabulary_trie(1)
    local_vocabulary, local_totals = get_vocabulary_and_totals(main_name)

    # map the global totals onto the local vocabulary ids
    global_totals_localized = np.zeros(len(local_vocabulary['id_to_token']))
    for token in local_vocabulary['token_to_id']:
        local_token_id = local_vocabulary['token_to_id'][token]
        global_token_id = vocabulary_trie[token]
        global_totals_localized[local_token_id] = global_totals[global_token_id]

    print(len(global_totals), len(local_totals), len(global_totals_localized))

    distinctive_terms = get_distinctive_terms(local_totals, global_totals_localized, local_vocabulary)
    print(distinctive_terms)

    db = Database("TOB_NETWORKS")
    con, cur = db.connect()
    cur.execute('SELECT DISTINCT(tid) as tid from {}_docs'.format(main_name))
    tids = [row['tid'] for row in cur.fetchall()]

    totals2 = get_totals(tids, local_vocabulary)
    dist = get_distinctive_terms(totals2, global_totals_localized, local_vocabulary)
    print("\n", dist)

    totals3 = get_totals(tids, local_vocabulary, tf=True)
    dist = get_distinctive_terms(totals3, global_totals_localized, local_vocabulary)
    print("\n", dist)

def __get_distinctive_terms(self):

    try:
        distinctive_terms = pickle.load(
            open(PATH_TOKENIZED + 'networks/{}_distinctive_terms.pickle'.format(self.main_name), 'rb'))
    except IOError:
        distinctive_terms = {'overall': None, 'nodes': {}, 'edges': {}}

        global_totals = get_vocabulary_totals(1)
        vocabulary_trie = load_vocabulary_trie(1)

        totals_legacy = np.zeros(len(self.vocabulary['ordered']))
        for token_id, token in enumerate(self.vocabulary['ordered']):
            token_id_global = vocabulary_trie[token]
            token_total_global = global_totals[token_id_global]
            totals_legacy[token_id] = token_total_global

        distinctive_terms['overall'] = self.__calculate_distinctive_terms(self.totals, totals_legacy)
        print("overall", distinctive_terms['overall'])

        names_set = set([n.lower() for name in self.nodes for n in name.split(',')])
        for term in distinctive_terms['overall']:
            if term[0] not in names_set:
                print(term)

        for node in self.nodes:
            node_dtm = self.apply_filter(nodes=[node])
            node_counts = np.array(node_dtm.sum(axis=0)).flatten()
            distinctive_terms['nodes'][node] = self.__calculate_distinctive_terms(node_counts, self.totals)
            print("\n", node, distinctive_terms['nodes'][node])

        for edge in self.edges:
            edge_dtm = self.apply_filter(edges=[edge])
            # skip edges without content
            if edge_dtm.sum() == 0:
                continue
            edge_counts = np.array(edge_dtm.sum(axis=0)).flatten()
            distinctive_terms['edges'][edge] = self.__calculate_distinctive_terms(edge_counts, self.totals)
            print("\n", edge, distinctive_terms['edges'][edge])

        pickle.dump(
            distinctive_terms,
            open(PATH_TOKENIZED + 'networks/{}_distinctive_terms.pickle'.format(self.main_name), 'wb'))

    return distinctive_terms

def add_terms():

    for ngram in range(1, 3):

        # update the vocabulary trie
        # this messes up the ids, but they are no longer used because the doc-term matrices are no longer used
        start = time.time()
        vocabulary = load_vocabulary_trie(ngram)
        keys = vocabulary.keys() + ADDED_TOKENS[ngram]
        vocabulary_new = Trie(keys)
        vocabulary_new.save(PATH_TOKENIZED + 'tries/full_vocabulary_{}_grams.trie'.format(ngram))

        full_db_to_tokens(ngram, add_new_terms=set(ADDED_TOKENS[ngram]))
        print("Adding new tokens for {}-grams took {} seconds.".format(ngram, time.time() - start))

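# Hedged illustration (assumption, not from the original module): add_terms() expects
# ADDED_TOKENS to map each ngram level to a list of new terms, e.g.:
#
# ADDED_TOKENS = {
#     1: ['vaping'],                  # hypothetical 1-gram additions
#     2: ['electronic cigarette'],    # hypothetical 2-gram additions
# }
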
def test(ngram=1):

    voc = load_vocabulary_trie(ngram)
    file = open('{}grams.txt'.format(ngram), 'w')
    count = 0
    for key in sorted(voc.keys()):
        validity = ""
        if ngram == 1:
            validity = str(check_1gram_validity(key))
        elif ngram == 2:
            if re.match('[0-9]+ [0-9]+[a-z]*', key):
                validity = "False"
            else:
                validity = "True"
        file.write(validity + "\t" + key + "\n")
        if validity == "True":
            count += 1
    print(len(voc), count)

def __get_vocabulary_and_totals(self):

    try:
        vocabulary_dict = pickle.load(
            open(PATH_TOKENIZED + 'networks/{}_vocabulary.pickle'.format(self.main_name), 'rb'))
        totals = np.load(
            open(PATH_TOKENIZED + 'networks/{}_totals.npy'.format(self.main_name), 'rb'))
    except IOError:
        from tobacco.frequencies_preprocessing.preprocessing_docs import get_ocr_by_tid

        vocabulary_trie = load_vocabulary_trie(1)
        db = Database("TOB_NETWORKS")
        con, cur = db.connect()

        totals = np.zeros(len(vocabulary_trie), dtype=np.int64)
        vocabulary_dict = {'token_to_id': {}, 'id_to_token': {}, 'ordered': []}

        cur.execute('SELECT DISTINCT(tid) as tid FROM {}_sections'.format(self.main_name))
        while True:
            row = cur.fetchone()
            if not row:
                break
            else:
                text = get_ocr_by_tid(row['tid'], return_bytearray=False)
                for token in text.split():
                    if token in vocabulary_trie:
                        totals[vocabulary_trie[token]] += 1

        # keep the 5000 most frequent tokens, skipping stop words and tokens with fewer than 10 occurrences
        token_id = 0
        for i in totals.argsort()[-5000:][::-1]:
            token = vocabulary_trie.restore_key(i)
            if token in STOP_WORDS or totals[i] < 10:
                continue
            else:
                vocabulary_dict['id_to_token'][token_id] = token
                vocabulary_dict['token_to_id'][token] = token_id
                vocabulary_dict['ordered'].append(token)
                assert vocabulary_dict['ordered'][token_id] == vocabulary_dict['id_to_token'][token_id]
                token_id += 1

        assert len(vocabulary_dict['ordered']) == len(vocabulary_dict['id_to_token']) == len(vocabulary_dict['token_to_id'])

        # store totals of the main person, localized to the new vocabulary
        totals_name = np.zeros(len(vocabulary_dict['id_to_token']))
        for token in vocabulary_dict['token_to_id']:
            token_id_global = vocabulary_trie[token]
            token_id_local = vocabulary_dict['token_to_id'][token]
            totals_name[token_id_local] = totals[token_id_global]

        np.save(
            open(PATH_TOKENIZED + 'networks/{}_totals.npy'.format(self.main_name), 'wb'),
            totals_name)
        pickle.dump(
            vocabulary_dict,
            open(PATH_TOKENIZED + 'networks/{}_vocabulary.pickle'.format(self.main_name), 'wb'))

        # return the localized totals so that fresh and cached runs yield arrays of the same length
        totals = totals_name

    return vocabulary_dict, totals

from tobacco.utilities.ocr import load_vocabulary_trie

GLOBAL_IDF_WEIGHTS = np.load(PATH_TOKENIZED + 'idf_weights.npy')

'''
8/15/17
Recap of basic linear algebra w/r/t sparsity:

sklearn's NMF (and presumably other decompositions) offers l1 and l2 regularization,
and the two can be weighted against each other (e.g. 50% l1, 50% l2).

l1 regularization: reduces the number of non-zero weights.
l2 regularization: reduces the squared sum of the weights.

i.e. to enforce sparsity in the terms, use l1 regularization.
'''

VOCABULARY = load_vocabulary_trie(1)


def tokenize_sections(output_sections, vocabulary, log_likelihoods, tokenizer_type='count', use_global_idf=True):

    start = time.time()
    sections = [i[7] for i in output_sections]

    # map term to token_id
    vocabulary_dict = {token: idx for idx, token in enumerate(vocabulary)}

    indices = array.array(str("i"))

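# Hedged sketch (illustration only, not part of the original pipeline): the note above about
# l1/l2 regularization corresponds to sklearn's l1_ratio parameter. `dtm` is a hypothetical
# document-term matrix; the penalty weight is named `alpha` in older sklearn releases and
# `alpha_W`/`alpha_H` in versions 1.0 and later.
def _example_sparse_nmf(dtm, n_topics=20):
    from sklearn.decomposition import NMF

    # l1_ratio=1.0 -> pure l1 penalty, i.e. sparse topic-term weights as described above
    nmf = NMF(n_components=n_topics, l1_ratio=1.0, alpha_W=0.1, init='nndsvd', random_state=0)
    doc_topics = nmf.fit_transform(dtm)   # documents x topics
    topic_terms = nmf.components_         # topics x terms
    return doc_topics, topic_terms
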
def full_db_to_tokens(ngram=1, use_sections=False, add_new_terms=None):
    """ Stores the full database as a csc doc-term matrix (one for each ngram level)

    :param ngram: 1-5
    :param use_sections: use 200 word sections if true, else use full documents
    :return:

    Step 1: Take a slice of the vocabulary and tokenize all documents to a csr matrix.
            Then transform that csr matrix to a csc matrix.
    Step 2: Take all the csc slices and stack them next to each other.

    Rationale: Creating the full csr matrix at once and then turning it into a csc matrix
    uses absurd amounts of memory.
    Note to future: yes, you have tried, and no, it didn't work.
    """

    # Load vocabulary according to ngram level
    vocabulary = load_vocabulary_trie(ngram)

    # Slice the vocabulary into slices of length n, depending on the ngram level.
    # ngram_to_interval = {1: 130000, 2: 2600000, 3: 3100000, 4: 3600000, 5: 2500000}
    ngram_to_interval = {1: 140000, 2: 1000000, 3: 3100000, 4: 3600000, 5: 2500000}
    voc_interval = ngram_to_interval[ngram]

    # if we are just adding new terms, the vocabulary does not need to be sliced
    if add_new_terms:
        voc_interval = 100000000

    print("{} vocabulary slices to process.".format(len(range(0, len(vocabulary) - 1, voc_interval))))

    for voc_idx in range(0, len(vocabulary) - 1, voc_interval):
        print("Working on voc_idx {} out of {}".format(voc_idx, len(vocabulary)))

        # 2/1/17: just to make it clear: voc_idx is the vocabulary offset
        vocabulary_offset = voc_idx

        vocabulary_slice = {}
        if add_new_terms:
            vocabulary_slice = {new_term: vocabulary[new_term] for new_term in add_new_terms}
            print(vocabulary_slice)
        else:
            for i in range(voc_idx, voc_idx + voc_interval):
                try:
                    vocabulary_slice[vocabulary.restore_key(i)] = i
                except KeyError:
                    pass

        # Initialize arrays for data, indices, and indptr; add the first element to indptr
        data = array.array(str("l"))
        indices = array.array(str("l"))
        indptr = array.array(str("l"))
        indptr.append(0)

        entry_queue = get_entry_queue()
        for i in range(NUMBER_OF_PROCESSES):
            entry_queue.put(FLAG_ALL_DONE)
        print("entry queue size", entry_queue.qsize())
        results_queue = mp.Queue()

        # Initialize and start processes
        for process_n in range(NUMBER_OF_PROCESSES):
            p = mp.Process(target=tokenize_document_worker,
                           args=(entry_queue, results_queue, ngram, vocabulary_slice,
                                 vocabulary_offset, use_sections))
            p.start()

        processors_finished = 0
        # next id to be added to the results
        current_id = 0
        # storage dict for returned but not yet added results
        pending_results = {}

        while True:
            new_result = results_queue.get()
            if new_result == FLAG_WORKER_FINISHED_PROCESSING:
                processors_finished += 1
                if processors_finished == NUMBER_OF_PROCESSES:
                    assert entry_queue.qsize() == 0
                    store_vocabulary_slice(data, indices, indptr, vocabulary_slice, ngram,
                                           vocabulary_offset, add_new_terms, use_sections)
                    break
            else:
                # all results first get added to the pending_results dict
                pending_results[new_result['id']] = {'indices': new_result['indices'],
                                                     'data': new_result['data']}
                while True:
                    # then, if the next id to be added is among the pending results,
                    # it gets moved from the dict to the indices and data arrays
                    if current_id in pending_results:
                        if current_id % 10000 == 0:
                            print("Current id: {}. qsize: {}. Data length: {}.".format(
                                current_id, results_queue.qsize(), len(data)))
                            print(len(indptr))
                        if use_sections:
                            for section_id in range(len(pending_results[current_id]['indices'])):
                                indices += pending_results[current_id]['indices'][section_id]
                                data += pending_results[current_id]['data'][section_id]
                                indptr.append(len(indices))
                        else:
                            indices += pending_results[current_id]['indices']
                            data += pending_results[current_id]['data']
                            indptr.append(len(indices))
                        pending_results.pop(current_id, None)
                        current_id += 1
                    else:
                        break

def store_vocabulary_slice(data, indices, indptr, vocabulary_slice, ngram, vocabulary_offset,
                           add_new_terms, use_sections=False):
    '''
    Iterates through the vocabulary slice processed so far and stores every token
    a) in the tokens table of tob_full (token, token_reversed, id, ngram, total)
    b) as a compressed sparse matrix

    :param data:
    :param indices:
    :param indptr:
    :param vocabulary_slice:
    :param ngram:
    :return:
    '''

    print("finished tokenizing. storing vocabulary slice.")

    # parse to int (may not be necessary)
    data = np.frombuffer(data, dtype=np.int64)
    indices = np.frombuffer(indices, dtype=np.int64)
    indptr = np.frombuffer(indptr, dtype=np.int64)

    # if adding new terms, the temp matrix has to have as many columns as the vocabulary as a whole,
    # not just the current vocabulary slice
    if add_new_terms:
        shape = (len(indptr) - 1, len(load_vocabulary_trie(ngram)))
    else:
        shape = (len(indptr) - 1, len(vocabulary_slice))
    temp_matrix = csr_matrix((data, indices, indptr), shape=shape, dtype=np.int64)

    # get global tfidf weights here
    # note: interactive debugging hook; execution pauses here in an IPython shell
    from IPython import embed
    embed()

    temp_matrix = temp_matrix.tocsc()
    print("temp matrix")
    print("shape", temp_matrix.shape)
    print("indptr, voc slice", len(indptr), len(vocabulary_slice))
    print("nnz", temp_matrix.getnnz())
    print("len, sum of data", len(data), np.sum(data))

    db = Database("TOB_FULL")
    tokens = []
    for token in vocabulary_slice:

        if len(tokens) >= 20000:
            print("Quality control on first token vector")
            test_vector = get_ngram_vector(tokens[0]['token'])
            print("token: ", tokens[0]['token'], " total db: ", tokens[0]['total'],
                  "total vector ", test_vector.sum(), "Shape: ", test_vector.shape,
                  " nnz: ", test_vector.getnnz(), "indptr: ", test_vector.indptr,
                  " data len ", len(test_vector.data), " indices len ", len(test_vector.indices))
            if not use_sections:
                db.batch_insert('tokens', ['token', 'token_reversed', 'id', 'ngram', 'total'], tokens)
            tokens = []

        id = vocabulary_slice[token]

        # extract indptr, data, and indices directly instead of forming a column slice first
        # (the column slice takes about 3 secs per term)
        # subtract the vocabulary offset to get the correct local column ids
        indptr_token_start = temp_matrix.indptr[id - vocabulary_offset]
        indptr_token_end = temp_matrix.indptr[id + 1 - vocabulary_offset]
        indices_token = temp_matrix.indices[indptr_token_start:indptr_token_end]
        data_token = temp_matrix.data[indptr_token_start:indptr_token_end]
        indptr_token = np.array([0, len(indices_token)], dtype=np.int64)

        # if add_new_terms:
        #     shape = (len(load_vocabulary_trie(ngram)), 1)
        # else:
        shape = (temp_matrix.shape[0], 1)
        token_vector = csc_matrix((data_token, indices_token, indptr_token), shape=shape)

        # to compress directory: tar -c tokens | pv --size `du -csh tokens | grep total | cut -f1` | pigz -9 > tokens.tar.gz
        hash_path = hashlib.sha256(token.encode()).hexdigest()
        if use_sections:
            hash_path += '_sections'
        token_path = PATH_TOKENS + '{}/{}/{}/{}/'.format(hash_path[0], hash_path[1], hash_path[2], hash_path[3])
        if not os.path.exists(token_path):
            os.makedirs(token_path)
        store_csr_matrix_to_file(token_vector, token_path + hash_path, compressed=True)

        if not use_sections:
            tokens.append({
                'token': token,
                'token_reversed': token[::-1],
                'id': id,
                'ngram': ngram,
                'total': np.sum(data_token)
            })

    if not use_sections:
        db.batch_insert('tokens', ['token', 'token_reversed', 'id', 'ngram', 'total'], tokens)

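# Hedged sketch (illustration only): how a stored token vector can be located again from the
# token string. The path layout mirrors the hashing scheme in store_vocabulary_slice above;
# the repository's actual loader appears to be get_ngram_vector, so this helper is purely
# hypothetical.
def _example_token_vector_path(token, use_sections=False):
    import hashlib

    hash_path = hashlib.sha256(token.encode()).hexdigest()
    if use_sections:
        hash_path += '_sections'
    # the first four hex characters of the hash spread the files over nested directories
    token_path = PATH_TOKENS + '{}/{}/{}/{}/'.format(hash_path[0], hash_path[1], hash_path[2], hash_path[3])
    return token_path + hash_path
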
def get_globals(globals_type='frequencies', load_only_docs=False):
    """ Returns all the globals necessary to process a frequencies or text_passages task.

    Use load_only_docs to load only the doc (but not section) filters and totals in frequencies mode.
    Mode: frequencies, load: docs and sections. 14s
    Mode: frequencies, load: docs. 2s

    :param globals_type: 'frequencies' or 'passages'
    :return:
    """

    s = time.time()

    if globals_type == 'frequencies':
        globals_dict = {
            'filters': {
                'docs': {
                    'doc_type': get_doc_type_filters(return_type='csc', docs_or_sections='docs'),          # 66 MB
                    'collection': get_collection_filters(return_type='csc', docs_or_sections='docs'),      # 121 MB
                    'availability': get_availability_filters(return_type='csc', docs_or_sections='docs')   # 40 MB
                },
            },
            'totals': {
                'totals': {
                    'docs': {
                        'np': get_totals_vector(docs_or_sections='docs', return_type='np_int32'),          # 43 MB
                    },
                },
                'collection': {
                    'docs': get_collection_totals_vectors(docs_or_sections='docs')
                },
                'doc_type': {
                    'docs': get_doc_type_totals_vectors(docs_or_sections='docs')
                }
            },
            'vocabulary_totals': get_vocabulary_totals(1),
            'vocabulary_trie': load_vocabulary_trie(1),                    # 1 MB
            'vocabulary_set': load_vocabulary_trie(1, return_type='set'),  # 15 MB
            'collections_and_idx_dict': get_col_name_and_idx_dict(),
            'doc_type_and_idx_dict': get_doc_types_to_idx_dict(),
            'year_parts_id_list': {
                'docs': get_year_doc_id_list('docs'),                      # 45 MB
            }
        }

        if not load_only_docs:
            globals_dict['filters']['sections'] = {
                'doc_type': get_doc_type_filters(return_type='csc', docs_or_sections='sections'),          # 1009 MB
                'collection': get_collection_filters(return_type='csc', docs_or_sections='sections'),      # 828 MB
                'availability': get_availability_filters(return_type='csc', docs_or_sections='sections')   # 427 MB
            }
            globals_dict['totals']['totals']['sections'] = {}
            globals_dict['totals']['totals']['sections']['np'] = get_totals_vector(
                docs_or_sections='sections', return_type='csc')                                            # 341 MB
            globals_dict['year_parts_id_list']['sections'] = get_year_doc_id_list('sections')  # 1 MB ?? this seems wrong ??
            globals_dict['totals']['collection']['sections'] = get_collection_totals_vectors(docs_or_sections='sections')
            globals_dict['totals']['doc_type']['sections'] = get_doc_type_totals_vectors(docs_or_sections='sections')

        # Legacy version of the frequencies globals dict, kept for reference:
        # if globals_type == 'frequencies':
        #     globals_dict = {
        #         'filters': {
        #             'docs': {
        #                 'doc_type': get_doc_type_filters(return_type='csc', docs_or_sections='docs'),          # 66 MB
        #                 'collection': get_collection_filters(return_type='csc', docs_or_sections='docs'),      # 121 MB
        #                 'availability': get_availability_filters(return_type='csc', docs_or_sections='docs')   # 40 MB
        #             },
        #             'sections': {
        #                 'doc_type': get_doc_type_filters(return_type='csc', docs_or_sections='sections'),          # 1009 MB
        #                 'collection': get_collection_filters(return_type='csc', docs_or_sections='sections'),      # 828 MB
        #                 'availability': get_availability_filters(return_type='csc', docs_or_sections='sections')   # 427 MB
        #             }
        #         },
        #
        #         'totals': {
        #             'totals': {
        #                 'docs': {
        #                     'np': csc_to_np_int32(get_totals_vector(docs_or_sections='docs')),      # 43 MB
        #                 },
        #                 'sections': {
        #                     'np': csc_to_np_int32(get_totals_vector(docs_or_sections='sections')),  # 341 MB
        #                 },
        #             },
        #         },
        #
        #         # 8/31/18 I don't think these are used -> commented out for the time being.
        #         # 'year_doc_matrix': {
        #         #     'docs': get_year_doc_transformation_matrix(docs_or_sections='docs'),          # 170 MB
        #         #     'sections': get_year_doc_transformation_matrix(docs_or_sections='sections')   # 1360 MB
        #         # },
        #
        #         'vocabulary_totals': get_vocabulary_totals(1),
        #         'vocabulary_trie': load_vocabulary_trie(1),                    # 1 MB
        #         'vocabulary_set': load_vocabulary_trie(1, return_type='set'),  # 15 MB
        #
        #         'year_parts_id_list': {
        #             'docs': get_year_doc_id_list('docs'),          # 45 MB
        #             'sections': get_year_doc_id_list('sections')   # 1 MB ?? why so much less than docs??
        #         }
        #     }

    elif globals_type == 'passages':
        globals_dict = {
            'filters': {
                'sections': {
                    'doc_type': get_doc_type_filters(return_type='csc', docs_or_sections='sections'),          # 958 MB
                    'collection': get_collection_filters(return_type='csc', docs_or_sections='sections'),      # 828 MB
                    'availability': get_availability_filters(return_type='csc', docs_or_sections='sections')   # 427 MB
                }
            },
            'doc_types_and_idx_dict': get_doc_types_to_idx_dict(),
            'collections_and_idx_dict': get_col_name_and_idx_dict(),
            'section_to_doc_and_offset_arr': get_section_to_doc_and_offset_arr(),  # 1024 MB
            'vocabulary_totals': get_vocabulary_totals(1),
            'vocabulary_trie': load_vocabulary_trie(1),                    # 1 MB
            'vocabulary_set': load_vocabulary_trie(1, return_type='set'),  # 15 MB
            'year_parts_id_list': {
                'docs': get_year_doc_id_list('docs'),          # 45 MB
                'sections': get_year_doc_id_list('sections')   # 1 MB ?? why so much less than docs??
            }
        }

    else:
        raise ValueError("Only 'frequencies' and 'passages' are valid values for globals_type, not {}.".format(globals_type))

    # print("Loading globals in {} mode took: {}".format(globals_type, time.time() - s))

    return globals_dict

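# Hedged usage sketch (illustration only): how the nested globals dict built above is typically
# accessed. The keys follow the structure assembled in get_globals(); the surrounding variable
# names are hypothetical.
def _example_use_globals():
    globals_dict = get_globals(globals_type='frequencies', load_only_docs=True)

    # totals vector over all documents (np.int32)
    doc_totals = globals_dict['totals']['totals']['docs']['np']
    # per-collection document filters, as built by get_collection_filters()
    collection_filters = globals_dict['filters']['docs']['collection']
    # 1-gram vocabulary trie used to map tokens to ids
    vocabulary_trie = globals_dict['vocabulary_trie']

    return doc_totals, collection_filters, vocabulary_trie
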
import time
from collections import namedtuple

import numpy as np

from tobacco.frequencies_preprocessing.preprocessing_globals_loader import get_globals
from tobacco.text_passages.text_passages_helper_process_year_of_sections import process_year_of_sections_cython
from tobacco.text_passages.text_passages_helper_search import parse_text_passages_tokens
from tobacco.frequencies_preprocessing.preprocessing_filters import get_active_filters_np
from tobacco.utilities.ocr import load_vocabulary_trie

VOCABULARY = load_vocabulary_trie(1, return_type='set')

from tobacco.frequencies.calculate_ngrams_class import NgramResult

# only use end_year, not start_year
Document = namedtuple('Document', ['tid', 'title', 'date', 'year', 'collection'])
Passage = namedtuple('Passage', ['Document', 'text'])


def find_text_passages(tokens, active_filters, years_to_process, passage_length, globals, logging=False,
                       insert_result_to_db=True):
    """ This is the main task to find text passages matching one or more search terms.

    The main processing itself is done year by year in the cython function process_year_of_sections_cython