Example #1
def find_distinctive_terms(term='younger adult smoker', collection='rj', start_year=1981,
                           end_year=1984):
    from tobacco.text_passages.find_text_passages import find_text_passages
    from tobacco.text_passages.text_passages_helper_distinctive_terms import get_text_passages_totals, \
        calculate_distinctive_terms
    from tobacco.results_storage.results_storage_worker_text_passages import select_and_process_sections

    # COL_NAME_TO_ID, get_globals, and embed come from module level in the
    # original file (apparently the same module as Example #5)
    col_id = COL_NAME_TO_ID[collection]
    globals = get_globals(globals_type='passages')
    active_filters = {'doc_type': ['internal communication'], 'collection': [col_id],
                      'availability': [], 'term': []}

    res = find_text_passages([term], active_filters=active_filters,
                             years_to_process=[i for i in range(start_year, end_year+1)],
                             globals=globals, passage_length=600, insert_result_to_db=False)

    raw_sections = res['sections']

    # Step 6: select and process sections
    final_sections, docs_by_year_list, section_id_to_year_dict = select_and_process_sections(
        raw_sections, passages_per_year=10000, min_readability=0.0,
        start_year=start_year, end_year=end_year)


    text_passages_totals, top_2000_terms_set = get_text_passages_totals(final_sections, [term])
    distinctive_terms, log_likelihoods = calculate_distinctive_terms(text_passages_totals,
                                                                     final_sections, [term])

    # map each distinctive term (d[0]) to the statistic at index 2 of its tuple
    dt = {d[0]: d[2] for d in distinctive_terms}

    # drop into an IPython shell to inspect the results interactively
    embed()
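
calculate_distinctive_terms is project-internal, but the log_likelihoods return value suggests a log-likelihood keyness statistic (Dunning's G2) comparing term counts in the retrieved passages against a reference corpus. A minimal sketch of that statistic, assuming raw counts and corpus sizes (the helper's actual signature and tuple layout may differ):

import math

def log_likelihood(count_target, count_ref, total_target, total_ref):
    """Dunning's G2 for one term: target subcorpus vs. reference corpus."""
    expected_target = total_target * (count_target + count_ref) / (total_target + total_ref)
    expected_ref = total_ref * (count_target + count_ref) / (total_target + total_ref)
    g2 = 0.0
    if count_target > 0:
        g2 += count_target * math.log(count_target / expected_target)
    if count_ref > 0:
        g2 += count_ref * math.log(count_ref / expected_ref)
    return 2 * g2

# e.g. 120 hits in 50k target tokens vs. 300 hits in 5M reference tokens
print(log_likelihood(120, 300, 50_000, 5_000_000))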
Example #2
def sentiment(search_term):

    # pickle, get_globals, find_text_passages, and embed are imported at
    # module level in the original file
    try:
        with open(f'{search_term}.pickle', 'rb') as f:
            sections = pickle.load(f)
    except FileNotFoundError:

        globals = get_globals(globals_type='passages')
        active_filters = {
            'doc_type': ['internal communication'],
            'collection': [],
            'availability': [],
            'term': []
        }

        results = find_text_passages(
            [search_term],
            active_filters=active_filters,
            years_to_process=[i for i in range(1950, 1990)],
            globals=globals,
            passage_length=200,
            logging=True)
        sections = []
        for year in results['sections']:
            if not results['sections'][year]:
                continue

            for section in results['sections'][year]:
                s = {
                    'text': section[7].replace('<b>', '').replace('</b>', ''),
                    'date': section[4],
                    'id': section[0]
                }
                sections.append(s)

    with open(f'{search_term}.pickle', mode='wb') as f:
        pickle.dump(sections, f)

    # load_emotion_lexicon and EMOTIONS come from module level in the original
    # file (see the sketch after this example)
    lexicon = load_emotion_lexicon()
    for section in sections:
        for emotion in EMOTIONS:
            section[emotion] = 0
            section[f'{emotion}_terms'] = []

        for word in section['text'].split():
            if word in lexicon:
                for emotion in EMOTIONS:
                    try:
                        section[emotion] += lexicon[word][emotion]
                        if lexicon[word][emotion] > 0:
                            section[f'{emotion}_terms'].append(word)
                    except KeyError:
                        pass

    for emotion in EMOTIONS:
        print("")
        for section in sorted(sections, key=lambda x: x[emotion], reverse=True)[:5]:
            print('{:8s} {:9s}. {:.1f}. {:12s}. {}. ({})'.format(
                section['id'], section['date'], section[emotion], emotion,
                section['text'], " ".join(section[f'{emotion}_terms'])))

    embed()
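
The emotion-scoring loop above only assumes the lexicon's shape: a dict mapping each word to a dict of per-emotion counts, plus an EMOTIONS list of category names. A minimal stand-in with the same shape (load_emotion_lexicon is project-internal, and the category names here are assumptions):

EMOTIONS = ['anger', 'fear', 'joy', 'sadness', 'trust']  # assumed names

def load_emotion_lexicon():
    # toy stand-in: word -> {emotion: count}; missing emotions are handled by
    # the KeyError catch in the scoring loop
    return {
        'cancer': {'fear': 1, 'sadness': 1},
        'profit': {'joy': 1, 'trust': 1},
    }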
Example #3

import json
import traceback

from MySQLdb import ProgrammingError
from tobacco.configuration import RESULTS_DB_NAME
from tobacco.frequencies.calculate_ngrams import get_frequencies
from tobacco.frequencies_preprocessing.preprocessing_globals_loader import get_globals
from tobacco.results_storage.results_storage_redis import Redis_Con
from tobacco.utilities.databases import Database
from tobacco.utilities.email_notifications import send_email
from tobacco.utilities.hash import generate_hash
from tobacco.frequencies.calculate_ngrams_class import NgramResult

GLOBALS_FREQUENCIES = get_globals(globals_type='frequencies')
DB = Database(RESULTS_DB_NAME)

REDIS_HOST = Redis_Con()


def look_for_frequencies_tasks_and_execute():
    """ This function is the interface between the redis task manager and the frequency calculation process

    If a frequency result needs to be calculated, this function activates the task to do so and adds the result
    to the results db.

    :return:
    """

    print("Frequencies worker is ready")

    while True:
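        # The loop body is cut off in this excerpt; the lines below are a
        # sketch only. They assume a redis-py client `r = redis.Redis()`
        # created at module level and a hypothetical 'frequencies_tasks'
        # list; the project's actual Redis_Con interface may differ.
        _, raw_task = r.blpop('frequencies_tasks')  # block until a task arrives
        task = json.loads(raw_task)
        result = get_frequencies(task['tokens'], task['active_filters'],
                                 globals=GLOBALS_FREQUENCIES)
        # store the result under a hash of the task for the web app to fetch
        r.set(generate_hash(raw_task), json.dumps(result))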
Example #4
        # Excerpt from a method of NgramResult: dts, dts_filtered, globals, and
        # self.collection_filters_np are set up earlier in the original method.
        for dt in dts_filtered:
            dt_totals = globals['totals']['doc_type'][self.docs_or_sections][
                dt['name']]
            dt_totals_filtered = dt_totals.convert_to_year_array(
                filter_vec=self.collection_filters_np)
            # relative frequency = absolute counts / filtered totals; zero
            # totals yield NaNs, which nan_to_num replaces with 0
            freqs = dt['absolute_counts'] / dt_totals_filtered
            freqs.vector = np.nan_to_num(freqs.vector)
            dt['frequencies'] = freqs
            dts.append(dt)

        self.doc_types = dts


if __name__ == "__main__":
    unparsed_search_tokens = ['addic*']
    doc_type_filters = ['letter']
    collection_filters = [5, 6, 7]
    availability_filters = []
    term_filters = []

    # override the filters above, apparently for an unfiltered test run
    doc_type_filters = []
    collection_filters = []

    globals = get_globals(load_only_docs=True)
    ngram = NgramResult(doc_type_filters,
                        collection_filters,
                        availability_filters,
                        term_filters,
                        unparsed_search_tokens=unparsed_search_tokens)
    ngram.compute_result(globals)
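
The divide-then-nan_to_num step above is plain numpy underneath: dividing by a totals array that contains zeros produces NaN (0/0) or inf (n/0), and np.nan_to_num cleans the result. A standalone illustration with raw arrays rather than the project's Vector class:

import numpy as np

counts = np.array([10.0, 0.0, 5.0])
totals = np.array([100.0, 0.0, 0.0])

with np.errstate(divide='ignore', invalid='ignore'):
    freqs = counts / totals                  # [0.1, nan, inf]

print(np.nan_to_num(freqs))                  # nan -> 0.0, but inf -> largest float
print(np.nan_to_num(freqs, posinf=0.0))      # [0.1, 0.0, 0.0] (numpy >= 1.17)

In the excerpt above a count should only be nonzero where the filtered total is nonzero, so the 0/0 (NaN) case is the one that matters.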
Example #5
from tobacco.utilities.vector import Vector
from tobacco.frequencies_preprocessing.preprocessing_globals_loader import get_globals
from tobacco.utilities.ocr import load_vocabulary_trie
from IPython import embed
import math
import sqlite3

from tobacco.utilities.databases import Database

GLOBAL_DOCS = get_globals(load_only_docs=True)
FILTERS = GLOBAL_DOCS['filters']['docs']
TOTALS_COL = GLOBAL_DOCS['totals']['collection']['docs']

COL_NAME_TO_ID = {
    'pm': 5,
    'rj': 6,
    'll': 7,
    'bw': 8,
    'at': 9,
    'ti': 10,
    'ba': 15,
#    'msa_bat': 'msa_bat'
}


def process_all():

    # create_sqlite_table is assumed to be defined elsewhere in this module
    create_sqlite_table()

    terms = []
Example #6

from tobacco.configuration import RESULTS_DB_NAME
from tobacco.frequencies_preprocessing.preprocessing_globals_loader import get_globals
from tobacco.results_storage.results_storage_redis import Redis_Con
#from tobacco.text_passages.find_text_passages_mysql3 import find_text_passages
from tobacco.text_passages.text_passages_helper_distinctive_terms import get_text_passages_totals, \
    calculate_distinctive_terms
from tobacco.text_passages.text_passages_helper_topic_model import calculate_topic_model
from tobacco.utilities.databases import Database
from tobacco.utilities.email_notifications import send_email
from tobacco.utilities.hash import generate_hash

DB = Database(RESULTS_DB_NAME)
WORKERS_TO_RUN = 2

#VOCABULARY = load_vocabulary_trie(1)

GLOBALS_PASSAGES = get_globals(globals_type='passages')

REDIS_HOST = Redis_Con()


con_g, cur_g = DB.connect()

def look_for_text_passages_tasks_and_execute():
    """ This function is the interface between the redis task manager and the text passages search process

    If a text passage result needs to be calculated, this function activates the task to do so and adds the result
    to the results db.

    :return:
    """
Example #7
# COUNTRY_CODES, ALIASES, and get_population are project-internal and assumed
# to be defined at module level in the original file
import json

import numpy as np

from IPython import embed

def load_country_data():

    population_dict = {}
    for country_name in COUNTRY_CODES:
        print(country_name)
        # one slot per year from 1901 through 2016, indexed by year - 1901
        population_dict[country_name] = 116 * [0.0]
        for year in range(1940, 2017):
            population_in_year = get_population(country_name, year)
            population_dict[country_name][year - 1901] = population_in_year

    countries_to_totals = {}
    country_data = {}

    from tobacco.frequencies_preprocessing.preprocessing_globals_loader import get_globals
    from tobacco.frequencies.calculate_ngrams import get_frequencies
    globals = get_globals()

    for name in COUNTRY_CODES:
        print(name)
        id = COUNTRY_CODES[name]

        if name in ALIASES:
            search_term = ALIASES[name]
        else:
            search_term = [name.lower()]

        d = {}
        collection_ids = {
            'at': 9,
            'ba': 15,
            'bw': 8,
            'll': 7,
            'pm': 5,
            'rj': 6
        }

        for collection in ['at', 'ba', 'bw', 'll', 'pm', 'rj']:

            active_filters = {
                'doc_type': ['internal communication', 'marketing documents'],
                'collection': [collection_ids[collection]],
                'availability': [],
                'term': []
            }

            res = get_frequencies(search_term,
                                  active_filters,
                                  globals,
                                  profiling_run=False)

            d[collection] = {'name': name}

            # add data from combined collections
            try:
                countries_to_totals[name] = res['data']['tokens'][0]['total']
                d[collection]['total'] = res['data']['tokens'][0]['total']
                d[collection]['mean_freq'] = np.mean(
                    res['data']['tokens'][0]['frequencies'])
                d[collection]['counts'] = res['data']['tokens'][0]['counts']
                d[collection]['frequencies'] = res['data']['tokens'][0]['frequencies']

            # some countries don't exist in TA, e.g. French Southern and Antarctic Lands
            except KeyError:
                import traceback
                print(traceback.format_exc())
                d[collection]['total'] = 0
                d[collection]['mean_freq'] = 0.0
                d[collection]['counts'] = 116 * [0]
                d[collection]['frequencies'] = 116 * [0.0]

            print(d[collection]['counts'])

        country_data[id] = d

    output = {'country_data': country_data, 'population_dict': population_dict}

    with open('/pcie/tobacco/tokenized/tobacco_flask_data/country_data.json',
              'w') as f:
        json.dump(output, f)

    embed()
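
The branching on ALIASES implies two shapes: COUNTRY_CODES maps a country name to a numeric id, and ALIASES maps a name to a ready-made list of search tokens for countries whose bare lowercased name would make a poor query. A minimal stand-in with those shapes (all values are illustrative, not the project's data):

COUNTRY_CODES = {'Germany': 276, 'United States': 840}  # name -> id (illustrative)
ALIASES = {'United States': ['united states', 'usa']}   # name -> search token list

def get_population(country_name, year):
    # illustrative stub for the project's population lookup
    return 0.0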
Example #8
def get_frequencies(search_tokens,
                    active_filters,
                    globals=None,
                    profiling_run=False):
    """ Processes one frequency query and returns the results as a dict

    :param search_tokens: unparsed search token string
    :param active_filters: dict of lists, e.g. {'doc_type': ["internal communication"], 'collection': [1,2],
                                                'availability': [], 'term': []}
    :param globals: the frequency globals
    :param profiling_run: Used to profile multiprocessing function. If true, they are run as a single process.
    :return: Dict consisting of 'data' and 'errors'


    Results include data for:
    - tokens (absolute counts, relative frequencies, z-scores)
    - collections (absolute counts and relative frequencies for 9 most frequent collections)
    - document type groups (absolute counts and relative frequencies)
    - document types (absolute counts and relative frequencies for 9 most frequent document types)

    Processing steps (* indicates run as a separate process through multiprocessing)
    - *Parse search terms
    - Load overall, collections, and doc type filters
    - Load token vectors
    - *Calculate z-scores for tokens
    - *Add collections data
    - *Add document type data (including document type groups)

    >>> globals = get_globals()
    >>> active_filters = {'doc_type': [], 'collection': [], 'availability': [], 'term': []}
    >>> result = get_frequencies(['cancer', 'neuro*', 'carcin*'], active_filters, globals, profiling_run=False)

    """

    import multiprocessing
    import time

    import numpy as np
    from IPython import embed

    from tobacco.frequencies_preprocessing.preprocessing_years_cython import \
        transform_doc_to_year_array
    from tobacco.frequencies_preprocessing.preprocessing_filters import get_active_filters_np

    from tobacco.frequencies.calculate_ngrams_collections import add_collections_mp
    from tobacco.frequencies.calculate_ngrams_doc_types import add_doc_types_mp
    from tobacco.frequencies_preprocessing.preprocessing_globals_loader import get_globals
    from tobacco.frequencies_preprocessing.preprocessing_search import parse_search_tokens
    from tobacco.frequencies_preprocessing.preprocessing_tokens import get_tokens
    from tobacco.frequencies_preprocessing.preprocessing_z_scores import get_z_scores
    if globals is None:
        globals = get_globals(globals_type='frequencies')

    if len(active_filters['term']) == 0:
        docs_or_sections = 'docs'
    else:
        docs_or_sections = 'sections'

    print("Calculating Frequencies. Term filter is: {}. Using {}.".format(
        active_filters['term'], docs_or_sections))
    start_time = time.time()

    # Initialize multiprocessing queue to handle the results for the collections and document types
    mp_results_queue = multiprocessing.Queue()

    # parse the search tokens as a separate process...
    multiprocessing.Process(target=parse_search_tokens,
                            args=(search_tokens, mp_results_queue)).start()

    # ... in the meantime, load the active doc type and collection filters.
    active_doc_type_filters_np, active_collection_filters_np, active_filters_np = get_active_filters_np(
        active_filters,
        globals['filters'],
        docs_or_sections=docs_or_sections,
        return_type=np.uint8)

    df = {
        'active_doc_type_filters_np': active_doc_type_filters_np,
        'active_collection_filters_np': active_collection_filters_np,
        'active_filters_np': active_filters_np
    }

    # create a total count per year array. Add 1 to totals to avoid division by 0 errors.
    df['totals_years'] = transform_doc_to_year_array(
        data=globals['totals']['totals'][docs_or_sections]['np'],
        filter=df['active_filters_np'],
        docs_or_sections=docs_or_sections) + 1

    # get the parsed search tokens. If there were errors, return them.
    token_list, token_search_errors = mp_results_queue.get()
    if len(token_list) == 0:
        print({'error': token_search_errors})
        return {'error': token_search_errors}

    # get the count data for all tokens.
    df = get_tokens(df, token_list, docs_or_sections)
    print("time tokens: {}".format(time.time() - start_time))

    # Second round of multiprocessing: calculate z-scores, collection and doc type data
    tokens_for_z_scores = [{
        'token': token['token'],
        'counts': token['counts']
    } for token in df['tokens']]
    multiprocessing.Process(target=get_z_scores,
                            args=(tokens_for_z_scores, df['totals_years'],
                                  mp_results_queue)).start()
    multiprocessing.Process(target=add_doc_types_mp,
                            args=(df['aggregate'],
                                  df['active_collection_filters_np'],
                                  docs_or_sections, mp_results_queue)).start()
    multiprocessing.Process(target=add_collections_mp,
                            args=(df['aggregate'],
                                  df['active_doc_type_filters_np'],
                                  docs_or_sections, mp_results_queue)).start()

    # for profiling purposes, make the multiprocessing parts use a single process
    # otherwise, profiling with the line profiler doesn't work.
    if profiling_run:
        test_queue = multiprocessing.Queue()
        add_collections_mp(df['aggregate'], df['active_doc_type_filters_np'],
                           docs_or_sections, test_queue)
        cols = test_queue.get()

        add_doc_types_mp(df['aggregate'], df['active_collection_filters_np'],
                         docs_or_sections, test_queue)
        doc_types_mp = test_queue.get()
        doc_type_groups_mp = test_queue.get()

    del df['aggregate']
    del df['aggregate_years']
    del df['totals_years']
    del df['active_filters_np']
    del df['active_collection_filters_np']
    del df['active_doc_type_filters_np']

    # collect the four queued results: z_scores, collections, and the two
    # results posted by add_doc_types_mp (doc types and doc type groups)
    for i in range(4):
        print(i)
        mp_result = mp_results_queue.get()
        if mp_result[0] == 'z_scores':
            z_scores = mp_result[1]
            for token_id in range(len(z_scores)):
                df['tokens'][token_id]['z_scores'] = z_scores[token_id].tolist()
        else:
            df[mp_result[0]] = mp_result[1]

    for token_dict in df['tokens']:
        token_dict['counts'] = token_dict['counts'].tolist()
        token_dict['frequencies'] = token_dict['frequencies'].tolist()

    print("Time total: ", time.time() - start_time)

    # apparently a debugging leftover: opens an IPython shell before returning
    embed()

    return {'data': df, 'error': token_search_errors}
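
get_frequencies fans work out to several processes but collects everything over one queue by tagging each result with a name, as the mp_result[0] dispatch above shows. The pattern in isolation, as a minimal self-contained sketch:

import multiprocessing

def worker(tag, value, queue):
    # each worker posts a (tag, payload) tuple onto the shared queue
    queue.put((tag, value * 2))

if __name__ == '__main__':
    queue = multiprocessing.Queue()
    for tag, value in [('z_scores', 1), ('collections', 2), ('doc_types', 3)]:
        multiprocessing.Process(target=worker, args=(tag, value, queue)).start()

    results = {}
    for _ in range(3):
        tag, payload = queue.get()  # arrival order is nondeterministic
        results[tag] = payload
    print(results)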
Example #9
        # apparently the tail of find_text_passages: log per-year section counts
        if logging:
            try:
                print("\nSections for {}: {}".format(
                    year, len(output_docs['sections'][year])))
                print(output_docs['sections'][year])
            except TypeError:
                print("\nSections for {}: 0".format(year))

    return output_docs


if __name__ == "__main__":

    # time, get_globals, and find_text_passages are imported at module level
    # in the original file
    globals = get_globals(globals_type='passages')
    active_filters = {
        'doc_type': ['internal communication'],
        'collection': [6],
        'availability': [],
        'term': []
    }
    start = time.time()
    # results = find_text_passages('compound w', active_filters=active_filters, start_year=2000, end_year=2016, globals=globals, passage_length=200,
    #                     passages_per_year=20, min_readability=0.00, prepare_for_html=True)
    # print("Time", time.time() - start)

    start = time.time()
    results = find_text_passages(
        ['youth smoking'],
        active_filters=active_filters,