def get_years_to_extract_docs(tokens, doc_type_filters, collection_filters,
                              availability_filters, passage_length,
                              start_year, end_year, cur):
    """
    Split the requested year range into years that still need processing and
    years whose passage results are already cached in the database.

    :param tokens: search tokens (part of the section hash)
    :param doc_type_filters: document type filters (part of the section hash)
    :param collection_filters: collection filters (part of the section hash)
    :param availability_filters: availability filters (part of the section hash)
    :param passage_length: passage length (part of the section hash)
    :param start_year: first year of the requested range (inclusive)
    :param end_year: last year of the requested range (inclusive)
    :param cur: open database cursor to query results_passages_yearly with
    :return: (years_to_process, years_to_load_from_db) -- two sorted lists
        that partition range(start_year, end_year + 1)
    """
    requested_years = set(range(start_year, end_year + 1))
    section_hash = generate_hash((tokens, doc_type_filters, collection_filters,
                                  availability_filters, passage_length))
    # Years already stored under this query hash can be loaded instead of
    # being recomputed.
    cur.execute('SELECT year FROM results_passages_yearly WHERE query_hash = "{}";'
                .format(section_hash))
    cached_years = {year_row['year'] for year_row in cur.fetchall()}

    # Set operations instead of repeated list.remove() (O(n) vs O(n^2));
    # sorting restores the original ascending order.
    years_to_process = sorted(requested_years - cached_years)
    years_to_load_from_db = sorted(requested_years & cached_years)

    print("process: {}. \nload:{}.".format(years_to_process,
                                           years_to_load_from_db))
    return years_to_process, years_to_load_from_db
def store_result_in_db(self, database):
    """
    Store this query's frequency results in the results_frequencies table.

    Replaces any existing row with the same query hash, stamps last_accessed
    with the current date, and resets count_accessed to 0.

    :param database: database object exposing connect() -> (connection, cursor)
    :return: None
    """
    # FIX: renamed local from `hash`, which shadowed the builtin.
    # NOTE(review): the hash is built from self.parsed_search_tokens while the
    # stored tokens column uses self.tokens -- lookups elsewhere go through
    # query_hash only, but confirm the two are meant to differ.
    query_hash = generate_hash(
        (self.parsed_search_tokens, self.doc_type_filters,
         self.collection_filters, self.availability_filters,
         self.term_filters))
    store_cmd = '''REPLACE INTO results_frequencies (tokens, doc_type_filters, collection_filters, availability_filters, term_filters, query_hash, results, last_accessed, count_accessed ) VALUES(%s, %s, %s, %s, %s, %s, %s, DATE(NOW()), 0);'''
    con, cur = database.connect()
    cur.execute(store_cmd,
                (str(self.tokens), str(self.doc_type_filters),
                 str(self.collection_filters), str(self.availability_filters),
                 str(self.term_filters), query_hash,
                 json.dumps(self.generate_results_dict())))
    con.commit()
    con.close()
def get_sections(tokens, doc_type_filters, collection_filters,
                 availability_filters, passage_length, passages_per_year,
                 start_year, end_year, min_readability):
    """
    Load the text-passage sections for a query.

    Fast path: when the query uses the default parameters (600-token passages,
    100 passages per year, 0.85 min readability), the pre-selected rows in
    results_passages_yearly_default are returned directly.

    Slow path: poll results_passages_yearly until one row per requested year
    is present (a worker fills the table asynchronously), then sample up to
    passages_per_year sections per year in a deterministic shuffled order,
    keeping only sections whose readability score (index 6 of each section
    record) exceeds min_readability.

    :return: list of section records, or an {'errors': ...} dict if the
        results did not appear within ~5 minutes
    :raises: re-raises any unexpected exception after emailing a report
    """
    try:
        output_sections = []
        section_hash = generate_hash(
            (tokens, doc_type_filters, collection_filters,
             availability_filters, passage_length))

        # Fast path: load only pre-selected passages if params == default.
        if passage_length == 600 and passages_per_year == 100 and min_readability == 0.85:
            print('default true')
            con, cur = DB.connect()
            cur.execute(
                '''SELECT results, year FROM results_passages_yearly_default WHERE query_hash = "{}" AND year >= {} AND year <= {} ORDER BY year ASC;'''
                .format(section_hash, start_year, end_year))
            rows = cur.fetchall()
            print("len rows", len(rows), len(rows) == end_year + 1 - start_year)
            # All years available only if count of rows == number of years.
            if len(rows) == end_year + 1 - start_year:
                for row in rows:
                    output_sections += json.loads(
                        row['results'].decode('utf-8'))
                con.close()  # FIX: connection was leaked on this early return
                return output_sections
            # FIX: close before falling through to the generic path, which
            # opens its own connections; previously this connection leaked.
            con.close()

        # Deterministic sampling order: seeding with passages_per_year makes
        # repeated identical queries return the same selection.
        random.seed(passages_per_year)
        shuffled_section_ids = list(range(5000))
        random.shuffle(shuffled_section_ids)
        year_counter = Counter()

        # Poll until every requested year has a row in results_passages_yearly
        # (filled asynchronously by a worker task).
        cur = None  # make cur from while loop available below
        con = None
        count = 0
        while True:
            if count == 3000:
                # ~5 minutes of 0.1s sleeps -- give up and report.
                print(
                    "did not find section result after 3000 loops or 5 minutes for {}"
                    .format(tokens, doc_type_filters, collection_filters,
                            availability_filters))
                result = {
                    'errors':
                    "Something went wrong with this query and we terminated the calculation after 5 minutes."
                }
                send_email(
                    'Text passages failure',
                    '''Tokens: {}. Doc Type Filters: {}. Collection Filters: {}. Availability Filters: {}. Start Year: {}. End Year: {}. Passages Length: {}. Passages per Year: {}. 
Min Readability: {}.'''
                    .format(tokens, doc_type_filters, collection_filters,
                            availability_filters, start_year, end_year,
                            passage_length, passages_per_year,
                            min_readability))
                con.close()
                return result

            if con is not None:
                # FIX: the old code reconnected without closing, leaking one
                # connection per poll iteration (up to 3000).
                con.close()
            con, cur = DB.connect()
            cur.execute(
                '''SELECT COUNT(*) as count_years FROM results_passages_yearly WHERE query_hash = "{}" AND year >= {} AND year <= {}'''
                .format(section_hash, start_year, end_year))
            print(
                os.getpid(), count, 'sections',
                '''SELECT COUNT(*) as count_years FROM results_passages_yearly WHERE query_hash = "{}" AND year >= {} AND year <= {}'''
                .format(section_hash, start_year, end_year))
            count_years = cur.fetchall()[0]['count_years']
            if count_years == end_year + 1 - start_year:
                break
            else:
                time.sleep(0.1)
                count += 1

        # All years present: stream the rows and sample per year.
        cur.execute(
            '''SELECT results, year FROM results_passages_yearly WHERE query_hash = "{}" AND year >= {} AND year <= {} ORDER BY year ASC;'''
            .format(section_hash, start_year, end_year))
        while True:
            sections_row = cur.fetchone()
            if not sections_row:
                break
            year = sections_row['year']
            year_sections = json.loads(sections_row['results'].decode('utf-8'))
            if year_sections == []:
                continue
            for section_id in shuffled_section_ids:
                if year_counter[year] == passages_per_year:
                    break
                # shuffled_section_ids goes to 4999 -> there will often be
                # misses for years with fewer sections.
                try:
                    section = year_sections[section_id]
                except IndexError:
                    continue
                if section[6] > min_readability:
                    output_sections.append(section)
                    year_counter[year] += 1

        con.close()
        return output_sections

    # Intentionally broad: report any failure by email, then re-raise.
    except:
        send_email(
            'Section passages failure',
            '''Tokens: {}. Doc Type Filters: {}. Collection Filters: {}. Availability Filters: {}. Start Year: {}. End Year: {}. Passages Length: {}. Passages per Year: {}. Min Readability: {}. Traceback: {}.'''.format(
                tokens, doc_type_filters, collection_filters,
                availability_filters, start_year, end_year, passage_length,
                passages_per_year, min_readability, traceback.format_exc()))
        try:
            con.close()
        except ProgrammingError:
            pass
        raise
def get_topic_model_results(tokens, doc_type_filters, collection_filters,
                            availability_filters, start_year, end_year,
                            passage_length, passages_per_year,
                            min_readability):
    """
    Poll the results_passages table until the topic model for this query is
    available, then return it.

    Topic models can take up to a minute to compute, so they are fetched
    separately from the main passage results -- the rest of the result set
    can already be explored while the model finishes.

    :return: the stored topic model, or an {'errors': ...} dict on timeout
    """
    print("Getting text passages result", tokens, start_year, end_year)
    con = None
    tokens, errors, doc_type_filters, collection_filters, availability_filters = sanitize_query_input(
        tokens, doc_type_filters, collection_filters, availability_filters,
        type='passages')
    query_hash = generate_hash(
        (tokens, doc_type_filters, collection_filters, availability_filters,
         start_year, end_year, passage_length, passages_per_year,
         min_readability))

    # The SELECT text never changes, so build it once outside the loop.
    select_stmt = 'SELECT results, count_accessed FROM results_passages WHERE query_hash = "{}"'.format(
        query_hash)
    attempt = 0
    while True:
        # Give up after ~5 minutes of polling.
        if attempt == 2000:
            timeout_result = {
                'errors':
                "Something went wrong with this query and we terminated the calculation after 5 minutes."
            }
            send_email(
                'Topic Model',
                '''Tokens: {}. Doc Type Filters: {}. Collection Filters: {}. Availability Filters: {}. Start Year: {}. End Year: {}. Passages Length: {}. Passages per Year: {}. 
Min Readability: {}.'''
                .format(tokens, doc_type_filters, collection_filters,
                        availability_filters, start_year, end_year,
                        passage_length, passages_per_year, min_readability))
            con.close()
            return timeout_result

        con, cur = DB.connect()
        cur.execute(select_stmt)
        print(os.getpid(), attempt, 'topic_model', select_stmt)
        rows = cur.fetchall()

        if not rows:
            # No row yet at all -- the worker has not stored anything.
            attempt += 1
            con.close()
            time.sleep(0.2)
            continue

        results = json.loads(rows[0]['results'].decode('utf-8'))
        if len(results['topic_model']) > 0:
            con.close()
            return results['topic_model']

        # Row exists but the topic model has not been filled in yet.
        time.sleep(0.3)
        attempt += 1
        con.close()
def get_frequencies_results(tokens, doc_type_filters, collection_filters,
                            availability_filters, term_filters):
    """
    Load the results for one frequency query.

    First checks if they are already in the database. If not, it adds the
    task to the redis queue and polls until the worker stores the result.
    On success, increments the row's access counter.

    :param tokens: search tokens
    :param doc_type_filters: document type filters
    :param collection_filters: collection filters
    :param availability_filters: availability filters
    :param term_filters: term filters
    :return: decoded results dict, or an {'errors': ...} dict on timeout
    :raises: re-raises any unexpected exception after emailing a report
    """
    try:
        tokens, doc_type_filters, collection_filters, availability_filters, term_filters = sanitize_query_input(
            tokens, doc_type_filters, collection_filters,
            availability_filters, term_filters)

        result = None
        con, cur = DB.connect()
        # FIX: renamed from `hash`, which shadowed the builtin and disguised
        # the `hex` typo in the UPDATE below.
        query_hash = generate_hash(
            (tokens, doc_type_filters, collection_filters,
             availability_filters, term_filters))
        cur.execute(
            'SELECT results, count_accessed FROM results_frequencies WHERE query_hash = "{}";'
            .format(query_hash))
        rows = cur.fetchall()

        # if result was precalculated, just take it
        if len(rows) > 0:
            result = rows[0]['results']
        # else, queue the calculation and poll for the stored result
        else:
            REDIS_CON.push_task_frequencies(
                (tokens, doc_type_filters, collection_filters,
                 availability_filters, term_filters))
            count = 0
            while True:
                if count == 3000:
                    # FIX: message previously claimed "20000 loops or 10
                    # minutes" -- it matched neither the loop limit nor the
                    # ~5-minute timeout reported to the user.
                    print(
                        "did not find result after 3000 loops or 5 minutes for {}"
                        .format(tokens, doc_type_filters, collection_filters,
                                availability_filters, term_filters))
                    result = {
                        'errors':
                        "Something went wrong with this query and we terminated the calculation after 5 minutes."
                    }
                    send_email(
                        'Frequencies failure',
                        '''Tokens: {}. Doc Type Filters: {}. Collection Filters: {}. Availability Filters: {}. Term Filters: {}.'''
                        .format(tokens, doc_type_filters, collection_filters,
                                availability_filters, term_filters))
                    con.close()
                    return result

                # FIX: close before reconnecting; the old code leaked one
                # connection per poll iteration (up to 3000).
                con.close()
                con, cur = DB.connect()
                cur.execute(
                    'SELECT results, count_accessed FROM results_frequencies WHERE query_hash = "{}";'
                    .format(query_hash))
                print(
                    os.getpid(), count, 'frequencies',
                    'SELECT results, count_accessed FROM results_frequencies WHERE query_hash = "{}";'
                    .format(query_hash))
                rows = cur.fetchall()
                if len(rows) == 0:
                    count += 1
                    time.sleep(0.1)
                    continue
                else:
                    result = rows[0]['results']
                    break

        result = json.loads(result.decode('utf-8'))
        count_accessed = rows[0]['count_accessed'] + 1
        # FIX: was `.format(count_accessed, hex)` -- formatting the *builtin*
        # `hex` function instead of the query hash, so the WHERE clause never
        # matched and count_accessed was never actually updated.
        cur.execute(
            'UPDATE results_frequencies SET count_accessed = {} WHERE query_hash = "{}";'
            .format(count_accessed, query_hash))
        con.commit()
        con.close()
        return result

    # Intentionally broad: report any failure by email, then re-raise.
    except:
        send_email(
            'Frequencies error',
            '''Tokens: {}. Doc Type Filters: {}. Collection Filters: {}. Availability Filters: {}. Term Filters: {}. Stack Trace: {}'''.format(
                tokens, doc_type_filters, collection_filters,
                availability_filters, term_filters, traceback.format_exc()))
        try:
            con.close()
        except ProgrammingError:
            pass
        raise
def get_text_passages_results(tokens, doc_type_filters, collection_filters, availability_filters, start_year, end_year, passage_length, passages_per_year, min_readability):
    """
    This is the main task set up to retrieve text passages results.

    It checks if the result is already available in the database. If not, it
    adds the task to the redis queue and polls until a worker stores the
    result. Sections are loaded separately through the get_sections function
    and merged into the returned dict under 'sections'.

    :param tokens: raw search tokens (sanitized before use)
    :param doc_type_filters: document type filters
    :param collection_filters: collection filters
    :param availability_filters: availability filters
    :param start_year: first year of the requested range (inclusive)
    :param end_year: last year of the requested range (inclusive)
    :param passage_length: passage length in tokens
    :param passages_per_year: number of passages to sample per year
    :param min_readability: minimum readability threshold for sections
    :return: results dict including 'sections' (and possibly 'errors'), or
        an {'errors': ...} dict on timeout
    """
    try:
        start_time = time.time()
        print("Getting text passages result", tokens, start_year, end_year)
        tokens, token_parse_errors, doc_type_filters, collection_filters, availability_filters = sanitize_query_input(
            tokens, doc_type_filters, collection_filters, availability_filters,
            type='passages')

        con, cur = DB.connect()
        # Hash over the full parameter set identifies a cached result row.
        query_hash = generate_hash(
            (tokens, doc_type_filters, collection_filters,
             availability_filters, start_year, end_year, passage_length,
             passages_per_year, min_readability))
        cur.execute(
            'SELECT results, count_accessed FROM results_passages WHERE query_hash = "{}"'
            .format(query_hash))
        rows = cur.fetchall()

        # if result available, just load
        if len(rows) > 0:
            results = json.loads(rows[0]['results'].decode('utf-8'))
            # Debug print; KeyError when 'errors' is absent is swallowed on
            # purpose.
            try:
                print("errors", results['errors'], 'errors' in results)
            except:
                pass
            # only load sections if there are no errors
            if 'errors' in results and len(results['errors']) > 0:
                results['sections'] = []
            else:
                results['sections'] = get_sections(
                    tokens, doc_type_filters, collection_filters,
                    availability_filters, passage_length, passages_per_year,
                    start_year, end_year, min_readability)
            count_accessed = rows[0]['count_accessed'] + 1

        # else, create new task, load section, then load overall results
        else:
            print("result not available, creating task")
            REDIS_CON.push_task_passages(
                ('complete', tokens, doc_type_filters, collection_filters,
                 availability_filters, start_year, end_year, passage_length,
                 passages_per_year, min_readability))
            results = dict()
            count = 0
            # Poll until the worker stores the result (~5 minutes max:
            # 3000 iterations of 0.1s sleeps).
            while True:
                if count == 3000:
                    print(
                        "did not find result after 3000 loops or 5 minutes for {}"
                        .format(tokens, doc_type_filters, collection_filters,
                                availability_filters))
                    result = {
                        'errors':
                        "Something went wrong with this query and we terminated the calculation after 5 minutes."
                    }
                    send_email(
                        'Text passages failure',
                        '''Tokens: {}. Doc Type Filters: {}. Collection Filters: {}. Availability Filters: {}. Start Year: {}. End Year: {}. Passages Length: {}. Passages per Year: {}. Min Readability: {}.'''
                        .format(tokens, doc_type_filters, collection_filters,
                                availability_filters, start_year, end_year,
                                passage_length, passages_per_year,
                                min_readability))
                    con.close()
                    return result
                # NOTE(review): reconnects each iteration without closing the
                # previous connection -- looks like a leak; confirm whether
                # DB.connect() pools/reuses connections.
                con, cur = DB.connect()
                cur.execute(
                    'SELECT results, count_accessed FROM results_passages WHERE query_hash = "{}";'
                    .format(query_hash))
                print(
                    os.getpid(), count,
                    'SELECT results, count_accessed FROM results_passages WHERE query_hash = "{}";'
                    .format(query_hash))
                rows = cur.fetchall()
                if len(rows) == 0:
                    count += 1
                    time.sleep(0.1)
                    continue
                else:
                    # Merge stored result into the (empty) results dict.
                    results = {
                        **results,
                        **json.loads(rows[0]['results'].decode('utf-8'))
                    }
                    break
            count_accessed = 1
            # only load sections if there are no errors
            if 'errors' in results and len(results['errors']) > 0:
                results['sections'] = []
            else:
                results['sections'] = get_sections(
                    tokens, doc_type_filters, collection_filters,
                    availability_filters, passage_length, passages_per_year,
                    start_year, end_year, min_readability)

        # Record the access on the cached row.
        cur.execute(
            'UPDATE results_passages SET count_accessed = {} WHERE query_hash = "{}";'
            .format(count_accessed, query_hash))
        con.commit()
        con.close()
        print("Passages search for {} took {} and returned {} docs.".format(
            tokens, time.time() - start_time, len(results['sections'])))

        # Surface token-parsing problems alongside any stored errors.
        if token_parse_errors:
            if 'errors' in results:
                results['errors'] += token_parse_errors
            else:
                results['errors'] = token_parse_errors
        return results

    # Intentionally broad: report any failure by email, then re-raise.
    except:
        send_email(
            'Text passages failure',
            '''Tokens: {}. Doc Type Filters: {}. Collection Filters: {}. Availability Filters: {}. Start Year: {}. End Year: {}. Passages Length: {}. Passages per Year: {}. Min Readability: {}. Stack Trace: {}'''.format(
                tokens, doc_type_filters, collection_filters,
                availability_filters, start_year, end_year, passage_length,
                passages_per_year, min_readability, traceback.format_exc()))
        try:
            con.close()
        except ProgrammingError:
            pass
        raise
def execute_full_text_passages_task(task_params):
    """
    Executes a full text passages task, i.e. retrieve sections, distinctive
    terms, and a topic model, and store them in the db.

    :param task_params: tuple of (tokens, doc_type_filters,
        collection_filters, availability_filters, start_year, end_year,
        passage_length, passages_per_year, min_readability)
    :return: None (the result is written to the results_passages table)
    """
    start = time.time()
    tokens, doc_type_filters, collection_filters, availability_filters, start_year, end_year, passage_length, \
        passages_per_year, min_readability = task_params
    active_filters = {'doc_type': doc_type_filters,
                      'collection': collection_filters,
                      'availability': availability_filters}

    query_hash = generate_hash((tokens, doc_type_filters, collection_filters,
                                availability_filters, start_year, end_year,
                                passage_length, passages_per_year,
                                min_readability))
    section_hash = generate_hash((tokens, doc_type_filters,
                                  collection_filters, availability_filters,
                                  passage_length))

    insert_cmd = '''REPLACE INTO results_passages (tokens, doc_type_filters, collection_filters, availability_filters, start_year, end_year, passage_length, passages_per_year, min_readability, query_hash, results, last_accessed, count_accessed) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, DATE(NOW()), 0);'''

    con, cur = DB.connect()
    error_while_processing = False
    results_with_potential_error = None

    # Step 1: Find years to process
    years_to_process, years_to_load_from_db = get_years_to_extract_docs(
        tokens, doc_type_filters, collection_filters, availability_filters,
        passage_length, start_year, end_year, cur)

    # Step 2: If more than 20 years, split passage retrieval into multiple
    # tasks and change years_to_load to the whole range. The first chunk is
    # executed by this task.
    if len(years_to_process) > 21:
        years_to_process_chunks = [
            years_to_process[i:i + 21]
            for i in range(0, len(years_to_process), 21)
        ]
        for years_to_process_chunk in years_to_process_chunks[1:]:
            # NOTE(review): other functions in this module queue work through
            # REDIS_CON; confirm REDIS_HOST is the intended object here.
            REDIS_HOST.push_task_sections(
                ('sections', tokens, active_filters, years_to_process_chunk,
                 passage_length))
        years_to_load_from_db = list(range(start_year, end_year + 1))
        years_to_process = years_to_process_chunks[0]

    # Step 3: Process years if necessary and add to database
    print("step 3", years_to_process)
    if len(years_to_process) > 0:
        results_with_potential_error = find_text_passages(
            tokens, active_filters, years_to_process, passage_length,
            GLOBALS_PASSAGES)
        if results_with_potential_error['errors'] != '':
            error_while_processing = True

    # Step 4: if errors while processing, store error and stop
    if error_while_processing:
        cur.execute(insert_cmd,
                    (str(tokens), str(doc_type_filters),
                     str(collection_filters), str(availability_filters),
                     start_year, end_year, passage_length, passages_per_year,
                     min_readability, query_hash,
                     json.dumps(results_with_potential_error)))
        con.commit()
        con.close()  # FIX: connection was leaked on this early return
        return

    # Step 5: Load raw sections if no errors
    if years_to_process == []:
        raw_sections = {}
    else:
        # if there are no sections, fall back to an empty dict
        try:
            raw_sections = results_with_potential_error['sections']
        except KeyError:
            raw_sections = {}
    if len(years_to_load_from_db) > 0:
        loaded_sections = load_sections(years_to_load_from_db, section_hash)
        raw_sections.update(loaded_sections)

    # Step 6: select and process sections
    final_sections, docs_by_year_list, section_id_to_year_dict = select_and_process_sections(
        raw_sections, passages_per_year, min_readability, start_year,
        end_year)

    # Step 7: Distinctive authors, terms, topic models
    frequent_authors = get_frequent_authors(final_sections)
    text_passages_totals, top_2000_terms_set = get_text_passages_totals(
        final_sections, tokens)
    distinctive_terms, log_likelihoods = calculate_distinctive_terms(
        text_passages_totals, final_sections, tokens)

    # 9/29: insert temporary result without topic model if more than 10000
    # docs, so clients have data to show while the model is computed
    if len(final_sections) > 10000:
        print("calculating topic model separately...")
        temp_result = {'distinctive_terms': distinctive_terms,
                       'frequent_authors': frequent_authors,
                       'docs_by_year_list': docs_by_year_list,
                       'topic_model': {},
                       'errors': ''}
        cur.execute(insert_cmd,
                    (str(tokens), str(doc_type_filters),
                     str(collection_filters), str(availability_filters),
                     start_year, end_year, passage_length, passages_per_year,
                     min_readability, query_hash, json.dumps(temp_result)))
        con.commit()

    # add all distinctive terms that weren't in the top 2000 terms
    for term in distinctive_terms:
        if term[0] not in top_2000_terms_set:
            top_2000_terms_set.add(term[0])

    topic_model_type = 'nmf'
    # in case I want to run nmf_sparse / nmf_local_idf (signaled through
    # off-by-one passage lengths)
    if passage_length == 601:
        topic_model_type = 'nmf_sparse'
    if passage_length == 599:
        topic_model_type = 'nmf_local_idf'
    topic_model = calculate_topic_model(final_sections,
                                        section_id_to_year_dict,
                                        vocabulary=list(top_2000_terms_set),
                                        log_likelihoods=log_likelihoods,
                                        n_components=20,
                                        topic_model_type=topic_model_type)

    # Step 8: Insert final result
    result = {'distinctive_terms': distinctive_terms,
              'frequent_authors': frequent_authors,
              'docs_by_year_list': docs_by_year_list,
              'topic_model': topic_model,
              'errors': ''}
    cur.execute(insert_cmd,
                (str(tokens), str(doc_type_filters), str(collection_filters),
                 str(availability_filters), start_year, end_year,
                 passage_length, passages_per_year, min_readability,
                 query_hash, json.dumps(result)))
    con.commit()
    con.close()  # FIX: connection was never closed on the normal path

    print("full passages task for {}, {}, {} took {}.\n".format(
        tokens, start_year, end_year, time.time() - start))
def insert_passages_yearly_result(tokens, active_filters, year,
                                  passage_length, complete, output_sections):
    """
    Insert the passage results of a single year into the results db.

    Two tables are written. First, all results for the year (up to 5000) go
    into results_passages_yearly, from which an arbitrary requested sample
    (e.g. 400 passages) can be drawn later. Second, for faster loading with
    the default search parameters (100 passages per year, 0.85 readability),
    a pre-selected subset goes into results_passages_yearly_default.

    :param tokens: search tokens
    :param active_filters: dict with 'doc_type', 'collection', 'availability'
    :param year: the year these sections belong to
    :param passage_length: passage length used for the query
    :param complete: completeness flag stored alongside the results
    :param output_sections: list of section records for this year
    :return: None
    """
    con, cur = DB_TOB_RESULTS.connect()
    print("inserting yearly", year, len(output_sections))

    section_hash = generate_hash(
        (tokens, active_filters['doc_type'], active_filters['collection'],
         active_filters['availability'], passage_length))
    # Stringified filter columns shared by both INSERTs.
    filter_strs = (str(tokens), str(active_filters['doc_type']),
                   str(active_filters['collection']),
                   str(active_filters['availability']))

    # Full result set for this year.
    yearly_cmd = '''REPLACE INTO results_passages_yearly (tokens, doc_type_filters, collection_filters, availability_filters, year, passage_length, complete, results, query_hash ) VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s);'''
    cur.execute(yearly_cmd,
                filter_strs + (year, passage_length, complete,
                               json.dumps(output_sections), section_hash))
    con.commit()

    # Pre-select up to 100 sections with readability > 0.85 (the default
    # search parameters), using the shared deterministic shuffled order.
    default_sections = []
    for candidate_id in SHUFFLED_SECTION_IDS:
        if len(default_sections) == 100:
            break
        try:
            candidate = output_sections[candidate_id]
        except IndexError:
            # The shuffled id range exceeds most years' section counts, so
            # misses are expected.
            continue
        if candidate[6] > 0.85:
            default_sections.append(candidate)

    default_cmd = '''REPLACE INTO results_passages_yearly_default (tokens, doc_type_filters, collection_filters, availability_filters, year, results, query_hash ) VALUES(%s, %s, %s, %s, %s, %s, %s);'''
    cur.execute(default_cmd,
                filter_strs + (year, json.dumps(default_sections),
                               section_hash))
    con.commit()
    con.close()