def get_years_to_extract_docs(tokens, doc_type_filters, collection_filters, availability_filters, passage_length,
                              start_year, end_year, cur):
    '''
    Split the requested year range into years that still need to be processed and
    years whose results were already stored and can be loaded from the database.
    '''

    years_to_process = list(range(start_year, end_year + 1))
    section_hash = generate_hash((tokens, doc_type_filters, collection_filters, availability_filters, passage_length))

    cur.execute('SELECT year FROM results_passages_yearly WHERE query_hash = "{}";'.format(section_hash))
    for year_row in cur.fetchall():
        if year_row['year'] in years_to_process:
            years_to_process.remove(year_row['year'])

    # create a list of years to load from db
    years_to_load_from_db = list(range(start_year, end_year + 1))
    for year in years_to_process:
        years_to_load_from_db.remove(year)

    print("process: {}. \nload:{}.".format(years_to_process, years_to_load_from_db))

    return years_to_process, years_to_load_from_db
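
# generate_hash is imported from elsewhere in the project and is not shown in this file.
# The sketch below only illustrates the contract this module relies on -- a deterministic
# digest of a tuple of query parameters, so identical queries map to the same query_hash
# row in the results tables. The name and implementation are assumptions, not the
# project's actual code:
#
# import hashlib
#
# def generate_hash(query_params):
#     """Return a stable hex digest for a tuple of query parameters."""
#     return hashlib.sha256(repr(query_params).encode('utf-8')).hexdigest()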
    def store_result_in_db(self, database):

        hash = generate_hash((self.parsed_search_tokens, self.doc_type_filters,
                              self.collection_filters,
                              self.availability_filters, self.term_filters))
        store_cmd = '''REPLACE INTO results_frequencies (tokens,
                                                        doc_type_filters,
                                                        collection_filters,
                                                        availability_filters,
                                                        term_filters,
                                                        query_hash,
                                                        results,
                                                        last_accessed,
                                                        count_accessed
                                                        )
                                    VALUES(%s, %s, %s, %s, %s, %s, %s, DATE(NOW()), 0);'''
        con, cur = database.connect()
        cur.execute(store_cmd,
                    (str(self.tokens), str(self.doc_type_filters),
                     str(self.collection_filters), str(self.availability_filters),
                     str(self.term_filters), hash,
                     json.dumps(self.generate_results_dict())))
        con.commit()
        con.close()
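
        # The REPLACE INTO above presumably relies on results_frequencies having a
        # unique key on query_hash, so that REPLACE acts as an upsert (the old row with
        # the same key is deleted before the new one is inserted). A rough sketch of the
        # assumed schema, inferred from the insert statement -- column types are
        # assumptions, not the project's actual DDL:
        #
        # CREATE TABLE results_frequencies (
        #     tokens               TEXT,
        #     doc_type_filters     TEXT,
        #     collection_filters   TEXT,
        #     availability_filters TEXT,
        #     term_filters         TEXT,
        #     query_hash           VARCHAR(64) UNIQUE,
        #     results              LONGBLOB,
        #     last_accessed        DATE,
        #     count_accessed       INT
        # );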
def get_sections(tokens, doc_type_filters, collection_filters,
                 availability_filters, passage_length, passages_per_year,
                 start_year, end_year, min_readability):
    """ Loads the sections for a text passages query

    :param tokens:
    :param doc_type_filters:
    :param collection_filters:
    :param availability_filters:
    :param passage_length:
    :param passages_per_year:
    :param start_year:
    :param end_year:
    :param min_readability:
    :return:
    """

    try:
        output_sections = []
        section_hash = generate_hash(
            (tokens, doc_type_filters, collection_filters,
             availability_filters, passage_length))

        # if the query uses the default params, load the precomputed passages from the default table
        if passage_length == 600 and passages_per_year == 100 and min_readability == 0.85:
            print('default true')
            con, cur = DB.connect()
            cur.execute(
                '''SELECT results, year FROM results_passages_yearly_default
                                WHERE query_hash = "{}" AND year >= {} AND year <= {} ORDER BY year ASC;'''
                .format(section_hash, start_year, end_year))

            rows = cur.fetchall()
            print("len rows", len(rows),
                  len(rows) == end_year + 1 - start_year)
            if len(rows) == end_year + 1 - start_year:
                for row in rows:
                    output_sections += json.loads(
                        row['results'].decode('utf-8'))
                return output_sections

        # fixed seed (derived from passages_per_year) so the same query always selects the same passages
        random.seed(passages_per_year)
        shuffled_section_ids = list(range(5000))
        random.shuffle(shuffled_section_ids)
        year_counter = Counter()

        # make sure all years available
        # all years are available if count of rows == number of years to load
        cur = None  # make cur from while loop available below
        con = None

        count = 0
        while True:

            if count == 3000:
                print(
                    "did not find section result after 3000 loops or 5 minutes for {}, {}, {}, {}"
                    .format(tokens, doc_type_filters, collection_filters,
                            availability_filters))
                result = {
                    'errors':
                    "Something went wrong with this query and we terminated the calculation after 5 minutes."
                }
                send_email(
                    'Text passages failure',
                    '''Tokens: {}. Doc Type Filters: {}. Collection Filters: {}.
                            Availability Filters: {}.
                            Start Year: {}. End Year: {}. Passages Length: {}. Passages per Year: {}. Min Readability: {}.'''
                    .format(tokens, doc_type_filters, collection_filters,
                            availability_filters, start_year, end_year,
                            passage_length, passages_per_year,
                            min_readability))
                con.close()
                return result

            con, cur = DB.connect()
            cur.execute(
                '''SELECT COUNT(*) as count_years FROM results_passages_yearly
                                WHERE query_hash = "{}" AND year >= {} AND year <= {}'''
                .format(section_hash, start_year, end_year))
            print(
                os.getpid(), count, 'sections',
                '''SELECT COUNT(*) as count_years FROM results_passages_yearly
                                WHERE query_hash = "{}" AND year >= {} AND year <= {}'''
                .format(section_hash, start_year, end_year))
            count_years = cur.fetchall()[0]['count_years']
            if count_years == end_year + 1 - start_year:
                break
            else:
                con.close()
                time.sleep(0.1)
                count += 1

        cur.execute('''SELECT results, year FROM results_passages_yearly
                          WHERE query_hash = "{}" AND year >= {} AND year <= {} ORDER BY year ASC;'''
                    .format(section_hash, start_year, end_year))

        while True:
            sections_row = cur.fetchone()

            if not sections_row: break
            year = sections_row['year']
            year_sections = json.loads(sections_row['results'].decode('utf-8'))

            if year_sections == []: continue
            for section_id in shuffled_section_ids:
                if year_counter[year] == passages_per_year:
                    break

                # shuffled_section_ids goes to 4999 -> there will often be misses
                try:
                    section = year_sections[section_id]
                except IndexError:
                    continue

                if section[6] > min_readability:
                    output_sections.append(section)
                    year_counter[year] += 1

        con.close()
        return output_sections

    except:
        send_email(
            'Section passages failure',
            '''Tokens: {}. Doc Type Filters: {}. Collection Filters: {}.
                            Availability Filters: {}.
                            Start Year: {}. End Year: {}. Passages Length: {}. Passages per Year: {}. Min Readability: {}.
                            Traceback: {}.'''.format(
                tokens, doc_type_filters, collection_filters,
                availability_filters, start_year, end_year, passage_length,
                passages_per_year, min_readability, traceback.format_exc()))

        try:
            con.close()
        except ProgrammingError:
            pass

        raise
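
# The wait loop above (reconnect, poll the results table, sleep, give up after about
# 5 minutes) reappears in several functions below. A minimal sketch of the same pattern
# as a reusable helper, assuming a connect() callable that returns (connection, cursor)
# like DB.connect does here; the helper itself is hypothetical and not part of the project.
def _poll_for_rows_sketch(connect, query, max_tries=3000, delay=0.1):
    """Poll `query` on a fresh connection until rows appear or the tries run out."""
    import time
    for _ in range(max_tries):
        con, cur = connect()
        try:
            cur.execute(query)
            rows = cur.fetchall()
            if rows:
                return rows
        finally:
            con.close()
        time.sleep(delay)
    return None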
def get_topic_model_results(tokens, doc_type_filters, collection_filters,
                            availability_filters, start_year, end_year,
                            passage_length, passages_per_year,
                            min_readability):
    """ Loads just the topic model that results from a query

    Topic models can take up to 1 minute to process and are loaded separately so there are already results to explore
    before the topic model is finished.

    :param tokens:
    :param doc_type_filters:
    :param collection_filters:
    :param availability_filters:
    :param start_year:
    :param end_year:
    :param passage_length:
    :param passages_per_year:
    :param min_readability:
    :return:
    """

    print("Getting text passages result", tokens, start_year, end_year)
    con = None

    tokens, errors, doc_type_filters, collection_filters, availability_filters = sanitize_query_input(
        tokens,
        doc_type_filters,
        collection_filters,
        availability_filters,
        type='passages')

    query_hash = generate_hash(
        (tokens, doc_type_filters, collection_filters, availability_filters,
         start_year, end_year, passage_length, passages_per_year,
         min_readability))

    count = 0
    while True:
        if count == 2000:
            result = {
                'errors':
                "Something went wrong with this query and we terminated the calculation after 5 minutes."
            }
            send_email(
                'Topic Model',
                '''Tokens: {}. Doc Type Filters: {}. Collection Filters: {}.
                        Availability Filters: {}.
                        Start Year: {}. End Year: {}. Passages Length: {}. Passages per Year: {}. Min Readability: {}.'''
                .format(tokens, doc_type_filters, collection_filters,
                        availability_filters, start_year, end_year,
                        passage_length, passages_per_year, min_readability))
            con.close()
            return result
        con, cur = DB.connect()
        cur.execute(
            'SELECT results, count_accessed FROM results_passages WHERE query_hash = "{}"'
            .format(query_hash))
        print(
            os.getpid(), count, 'topic_model',
            'SELECT results, count_accessed FROM results_passages WHERE query_hash = "{}"'
            .format(query_hash))
        rows = cur.fetchall()
        if len(rows) == 0:
            count += 1
            con.close()
            time.sleep(0.2)
            continue
        else:
            results = json.loads(rows[0]['results'].decode('utf-8'))
            if len(results['topic_model']) == 0:
                time.sleep(0.3)
                count += 1
                con.close()
                continue
            else:
                con.close()
                return results['topic_model']
def get_frequencies_results(tokens, doc_type_filters, collection_filters,
                            availability_filters, term_filters):
    """ Loads the results for one frequency query.

    First checks if they are already in the database. If not, it adds the task to the redis queue and waits for the
    result to be added to the database

    :param tokens:
    :param doc_type_filters:
    :param collection_filters:
    :param availability_filters:
    :param term_filters:
    :return:
    """

    try:

        tokens, doc_type_filters, collection_filters, availability_filters, term_filters = sanitize_query_input(
            tokens, doc_type_filters, collection_filters, availability_filters,
            term_filters)

        result = None

        con, cur = DB.connect()

        hash = generate_hash((tokens, doc_type_filters, collection_filters,
                              availability_filters, term_filters))

        cur.execute(
            'SELECT results, count_accessed FROM results_frequencies WHERE query_hash = "{}";'
            .format(hash))
        rows = cur.fetchall()

        # if the result was precalculated
        if len(rows) > 0:
            result = rows[0]['results']

        # else, calculate and store the result
        else:
            REDIS_CON.push_task_frequencies(
                (tokens, doc_type_filters, collection_filters,
                 availability_filters, term_filters))

            count = 0
            while True:
                if count == 3000:
                    print(
                        "did not find result after 3000 loops or 5 minutes for {}, {}, {}, {}, {}"
                        .format(tokens, doc_type_filters, collection_filters,
                                availability_filters, term_filters))
                    result = {
                        'errors':
                        "Something went wrong with this query and we terminated the calculation after 5 minutes."
                    }
                    send_email(
                        'Frequencies failure',
                        '''Tokens: {}. Doc Type Filters: {}. Collection Filters: {}.
                                Availability Filters: {}. Term Filters: {}.'''.
                        format(tokens, doc_type_filters, collection_filters,
                               availability_filters, term_filters))
                    con.close()
                    return result
                con, cur = DB.connect()
                cur.execute(
                    'SELECT results, count_accessed FROM results_frequencies WHERE query_hash = "{}";'
                    .format(hash))
                print(
                    os.getpid(), count, 'frequencies',
                    'SELECT results, count_accessed FROM results_frequencies WHERE query_hash = "{}";'
                    .format(hash))
                rows = cur.fetchall()
                if len(rows) == 0:
                    count += 1
                    con.close()
                    time.sleep(0.1)
                    continue
                else:
                    result = rows[0]['results']
                    break

        result = json.loads(result.decode('utf-8'))

        count_accessed = rows[0]['count_accessed'] + 1
        cur.execute(
            'UPDATE results_frequencies SET count_accessed = {} WHERE query_hash = "{}";'
            .format(count_accessed, hash))
        con.commit()
        con.close()

        return result

    except:

        send_email(
            'Frequencies error',
            '''Tokens: {}. Doc Type Filters: {}. Collection Filters: {}.
                    Availability Filters: {}. Term Filters: {}.
                    Stack Trace: {}'''.format(tokens, doc_type_filters,
                                              collection_filters,
                                              availability_filters,
                                              term_filters,
                                              traceback.format_exc()))

        try:
            con.close()
        except ProgrammingError:
            pass

        raise
def get_text_passages_results(tokens, doc_type_filters, collection_filters,
                              availability_filters, start_year, end_year,
                              passage_length, passages_per_year,
                              min_readability):
    """ This is the main task set up to retrieve text passages results.

    It checks if the result is already available in the database. If not, it adds the task to the redis queue
    Sections are loaded separately through the get_sections function.

    :param tokens:
    :param doc_type_filters:
    :param collection_filters:
    :param availability_filters:
    :param start_year:
    :param end_year:
    :param passage_length:
    :param passages_per_year:
    :param min_readability:
    :return:
    """

    try:

        start_time = time.time()
        print("Getting text passages result", tokens, start_year, end_year)

        tokens, token_parse_errors, doc_type_filters, collection_filters, availability_filters = sanitize_query_input(
            tokens,
            doc_type_filters,
            collection_filters,
            availability_filters,
            type='passages')

        con, cur = DB.connect()
        query_hash = generate_hash(
            (tokens, doc_type_filters, collection_filters,
             availability_filters, start_year, end_year, passage_length,
             passages_per_year, min_readability))

        cur.execute(
            'SELECT results, count_accessed FROM results_passages WHERE query_hash = "{}"'
            .format(query_hash))
        rows = cur.fetchall()

        # if result available, just load
        if len(rows) > 0:
            results = json.loads(rows[0]['results'].decode('utf-8'))

            # debugging output: only print the errors entry if it exists
            if 'errors' in results:
                print("errors", results['errors'])

            # only load sections if there are no errors
            if 'errors' in results and len(results['errors']) > 0:
                results['sections'] = []
            else:
                results['sections'] = get_sections(
                    tokens, doc_type_filters, collection_filters,
                    availability_filters, passage_length, passages_per_year,
                    start_year, end_year, min_readability)

            count_accessed = rows[0]['count_accessed'] + 1

        # else, create a new task, wait for the overall results, then load the sections
        else:
            print("result not available, creating task")
            REDIS_CON.push_task_passages(
                ('complete', tokens, doc_type_filters, collection_filters,
                 availability_filters, start_year, end_year, passage_length,
                 passages_per_year, min_readability))
            results = dict()

            count = 0
            while True:
                if count == 3000:
                    print(
                        "did not find result after 3000 loops or 5 minutes for {}, {}, {}, {}"
                        .format(tokens, doc_type_filters, collection_filters,
                                availability_filters))
                    result = {
                        'errors':
                        "Something went wrong with this query and we terminated the calculation after 5 minutes."
                    }
                    send_email(
                        'Text passages failure',
                        '''Tokens: {}. Doc Type Filters: {}. Collection Filters: {}.
                                    Availability Filters: {}.
                                    Start Year: {}. End Year: {}. Passages Length: {}. Passages per Year: {}. Min Readability: {}.'''
                        .format(tokens, doc_type_filters, collection_filters,
                                availability_filters, start_year, end_year,
                                passage_length, passages_per_year,
                                min_readability))
                    con.close()
                    return result

                con, cur = DB.connect()
                cur.execute(
                    'SELECT results, count_accessed FROM results_passages WHERE query_hash = "{}";'
                    .format(query_hash))
                print(
                    os.getpid(), count,
                    'SELECT results, count_accessed FROM results_passages WHERE query_hash = "{}";'
                    .format(query_hash))
                rows = cur.fetchall()
                if len(rows) == 0:
                    count += 1
                    con.close()
                    time.sleep(0.1)
                    continue
                else:
                    results = {
                        **results,
                        **json.loads(rows[0]['results'].decode('utf-8'))
                    }
                    break

            count_accessed = 1

            # only load sections if there are no errors
            if 'errors' in results and len(results['errors']) > 0:
                results['sections'] = []
            else:
                results['sections'] = get_sections(
                    tokens, doc_type_filters, collection_filters,
                    availability_filters, passage_length, passages_per_year,
                    start_year, end_year, min_readability)

        cur.execute(
            'UPDATE results_passages SET count_accessed = {} WHERE query_hash = "{}";'
            .format(count_accessed, query_hash))
        con.commit()
        con.close()

        print("Passages search for {} took {} and returned {} docs.".format(
            tokens,
            time.time() - start_time, len(results['sections'])))

        if token_parse_errors:
            if 'errors' in results:
                results['errors'] += token_parse_errors
            else:
                results['errors'] = token_parse_errors
        return results

    except:
        send_email(
            'Text passages failure',
            '''Tokens: {}. Doc Type Filters: {}. Collection Filters: {}.
                            Availability Filters: {}.
                            Start Year: {}. End Year: {}. Passages Length: {}. Passages per Year: {}. Min Readability: {}.
                            Stack Trace: {}'''.format(
                tokens, doc_type_filters, collection_filters,
                availability_filters, start_year, end_year, passage_length,
                passages_per_year, min_readability, traceback.format_exc()))

        try:
            con.close()
        except ProgrammingError:
            pass

        raise
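
# A hedged usage sketch for get_text_passages_results; the argument values below are
# only examples, and the exact token/filter formats depend on sanitize_query_input,
# which is not shown in this file:
#
# results = get_text_passages_results(
#     tokens='addiction', doc_type_filters=[], collection_filters=[],
#     availability_filters=[], start_year=1950, end_year=1970,
#     passage_length=600, passages_per_year=100, min_readability=0.85)
# sections = results['sections']          # selected text passages
# topics = results['topic_model']         # topic model computed by the worker task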
def execute_full_text_passages_task(task_params):
    """ Executes a full text passages task, i.e. retrieve sections, distinctive terms, topic and store them in the db

    :param task_params:
    :return:
    """

    start = time.time()

    tokens, doc_type_filters, collection_filters, availability_filters, start_year, end_year, passage_length, \
    passages_per_year, min_readability = task_params
    active_filters = {'doc_type': doc_type_filters, 'collection': collection_filters, 'availability': availability_filters}
    query_hash = generate_hash((tokens, doc_type_filters, collection_filters, availability_filters, start_year, end_year,
                          passage_length, passages_per_year, min_readability))
    section_hash = generate_hash((tokens, doc_type_filters, collection_filters, availability_filters, passage_length))

    insert_cmd = '''REPLACE INTO results_passages (tokens, doc_type_filters, collection_filters, availability_filters,
                                                                start_year, end_year, passage_length, passages_per_year,
                                                                min_readability,
                                                                query_hash,
                                                                results, last_accessed, count_accessed)
                                        VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s, %s, %s, DATE(NOW()), 0);'''

    con, cur = DB.connect()

    error_while_processing = False
    results_with_potential_error = None

    # Step 1: Find years to process
    years_to_process, years_to_load_from_db = get_years_to_extract_docs(
        tokens, doc_type_filters, collection_filters, availability_filters,
        passage_length, start_year, end_year, cur)

    # Step 2: If more than 21 years need processing, split passage retrieval into multiple tasks and change
    #         years_to_load to the whole range. The first chunk will be executed by this task
    #         (see the illustration below).
    if len(years_to_process) > 21:
        years_to_process_chunks = [years_to_process[i:i+21] for i in range(0, len(years_to_process), 21)]
        for years_to_process_chunk in years_to_process_chunks[1:]:
            REDIS_HOST.push_task_sections(('sections', tokens, active_filters, years_to_process_chunk, passage_length))
        years_to_load_from_db = list(range(start_year, end_year + 1))
        years_to_process = years_to_process_chunks[0]
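
    # Illustration of the chunking above (values are only an example): a 1900-1950
    # request yields slices of at most 21 years; this task processes the first slice
    # itself and queues the rest.
    #
    # >>> years = list(range(1900, 1951))
    # >>> [len(chunk) for chunk in (years[i:i + 21] for i in range(0, len(years), 21))]
    # [21, 21, 9]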

    # Step 3: Process years if necessary and add results to the database
    print("step 3", years_to_process)
    if len(years_to_process) > 0:
        results_with_potential_error = find_text_passages(tokens, active_filters, years_to_process, passage_length,
                                                          GLOBALS_PASSAGES)
        if results_with_potential_error['errors'] != '':
            error_while_processing = True

    # Step 4: if errors while processing, store error
    if error_while_processing:
        cur.execute(insert_cmd, (str(tokens), str(doc_type_filters), str(collection_filters), str(availability_filters),
                                 start_year, end_year, passage_length, passages_per_year, min_readability, query_hash,
                                 json.dumps(results_with_potential_error)))
        con.commit()
        con.close()
        return

    # Step 5: Load raw sections if no errors
    if years_to_process == []:
        raw_sections = {}
    else:
        # if there are no sections, return empty dict
        try:
            raw_sections = results_with_potential_error['sections']
        except KeyError:
            raw_sections = {}
    if len(years_to_load_from_db) > 0:
        loaded_sections = load_sections(years_to_load_from_db, section_hash)
        raw_sections.update(loaded_sections)

    # Step 6: Select and process sections
    final_sections, docs_by_year_list, section_id_to_year_dict = select_and_process_sections(raw_sections,
                                                             passages_per_year, min_readability, start_year, end_year)

    # Step 7: Distinctive authors, terms, topic models
    frequent_authors = get_frequent_authors(final_sections)

    text_passages_totals, top_2000_terms_set = get_text_passages_totals(final_sections, tokens)
    distinctive_terms, log_likelihoods = calculate_distinctive_terms(text_passages_totals, final_sections, tokens)


    #9/29: insert temporary result without topic model if more than 10000 docs
    if len(final_sections) > 10000:

        print("calculating topic model separately... ")

        temp_result = {'distinctive_terms': distinctive_terms,
                  'frequent_authors': frequent_authors,
                  'docs_by_year_list': docs_by_year_list,
                  'topic_model': {},
                  'errors' : ''}
        cur.execute(insert_cmd, (str(tokens), str(doc_type_filters), str(collection_filters), str(availability_filters),
                                 start_year, end_year, passage_length, passages_per_year, min_readability, query_hash,
                                 json.dumps(temp_result)))
        con.commit()

    # add all distinctive terms that weren't in the top 2000 terms
    for term in distinctive_terms:
        if term[0] not in top_2000_terms_set:
            top_2000_terms_set.add(term[0])



    # special passage_length values act as flags for alternative topic model types
    # (601 -> nmf_sparse, 599 -> nmf_local_idf)
    topic_model_type = 'nmf'
    if passage_length == 601:
        topic_model_type = 'nmf_sparse'
    if passage_length == 599:
        topic_model_type = 'nmf_local_idf'
    topic_model = calculate_topic_model(final_sections, section_id_to_year_dict, vocabulary=list(top_2000_terms_set),
                                        log_likelihoods=log_likelihoods, n_components=20,
                                        topic_model_type=topic_model_type)

    # Step 8: Insert result
    result = {'distinctive_terms': distinctive_terms,
              'frequent_authors': frequent_authors,
              'docs_by_year_list': docs_by_year_list,
              'topic_model': topic_model,
              'errors' : ''}

    cur.execute(insert_cmd, (str(tokens), str(doc_type_filters), str(collection_filters), str(availability_filters),
                             start_year, end_year, passage_length, passages_per_year, min_readability, query_hash,
                             json.dumps(result)))
    con.commit()
    con.close()

    print("full passages task for {}, {}, {} took {}.\n".format(tokens, start_year, end_year, time.time() - start))
def insert_passages_yearly_result(tokens, active_filters, year, passage_length,
                                  complete, output_sections):
    """ Insert the results of one year inot the results db

    The process runs through 2 storage options.
    First, it stores all results (up to 5000) in the database. Out of these, a requested result of, e.g. 400 can be
    selected later.
    Second, for faster loading, it stores a set with default search params (100 passages per year, 0.85 readability)
    in the results_passages_yearly_default table.

    :param tokens:
    :param active_filters:
    :param year:
    :param passage_length:
    :param complete:
    :param output_sections:
    :return:
    """

    con, cur = DB_TOB_RESULTS.connect()

    print("inserting yearly", year, len(output_sections))

    section_hash = generate_hash(
        (tokens, active_filters['doc_type'], active_filters['collection'],
         active_filters['availability'], passage_length))

    store_cmd = '''REPLACE INTO results_passages_yearly (tokens,
                                                        doc_type_filters,
                                                        collection_filters,
                                                        availability_filters,
                                                        year,
                                                        passage_length,
                                                        complete,
                                                        results,
                                                        query_hash
                                                        )
                                              VALUES(%s, %s, %s, %s, %s, %s, %s, %s, %s);'''

    cur.execute(store_cmd,
                (str(tokens), str(active_filters['doc_type']),
                 str(active_filters['collection']),
                 str(active_filters['availability']), year, passage_length,
                 complete, json.dumps(output_sections), section_hash))
    con.commit()

    default_sections = []
    year_counter = 0
    for section_id in SHUFFLED_SECTION_IDS:
        if year_counter == 100:
            break
        try:
            section = output_sections[section_id]
        except IndexError:
            continue

        if section[6] > 0.85:
            default_sections.append(section)
            year_counter += 1

    store_cmd = '''REPLACE INTO results_passages_yearly_default (tokens,
                                                        doc_type_filters,
                                                        collection_filters,
                                                        availability_filters,
                                                        year,
                                                        results,
                                                        query_hash
                                                        )
                                              VALUES(%s, %s, %s, %s, %s, %s, %s);'''

    cur.execute(store_cmd,
                (str(tokens), str(active_filters['doc_type']),
                 str(active_filters['collection']),
                 str(active_filters['availability']),
                 year, json.dumps(default_sections), section_hash))
    con.commit()
    con.close()
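
# SHUFFLED_SECTION_IDS is defined elsewhere in the project. For the default selection
# above to line up with what get_sections reproduces at query time (random.seed(100),
# i.e. the default passages_per_year, shuffling range(5000)), it is presumably built
# the same way -- an assumption based on this file, not a verified fact:
#
# random.seed(100)
# SHUFFLED_SECTION_IDS = list(range(5000))
# random.shuffle(SHUFFLED_SECTION_IDS)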