Example #1
def _setup_mongo():
    '''
    Create a uniquely named test corpus in MongoDB and return its name.
    '''
    test_corpus_name = 'Test ' + datetime.now().isoformat()[:-7]

    docs = _setup_iatv_corpus(test_corpus_name)

    ic = IatvCorpus(name=test_corpus_name, documents=docs)
    ic.save()

    return test_corpus_name
Example #2
def build_iatv_corpus(start_datetime,
                      stop_datetime,
                      corpus_name,
                      program_names=None):
    '''
    Build a corpus of IATV documents airing between two datetimes, optionally
    restricted to a set of program names.

    Arguments:
        start_datetime (datetime.datetime): earliest local start time a show
            may have to be included
        stop_datetime (datetime.datetime): latest local start time a show
            may have to be included
        corpus_name (str): descriptive name for the corpus for future discovery
        program_names (list): optional list of program names; if None, all
            documents in the date range are included

    Returns:
        (IatvCorpus): Newly created corpus; not yet saved to the database
    '''
    if program_names is None:
        corpus_docs = IatvDocument.objects(start_localtime__gte=start_datetime,
                                           start_localtime__lte=stop_datetime)
    else:
        corpus_docs = IatvDocument.objects(program_name__in=program_names,
                                           start_localtime__gte=start_datetime,
                                           start_localtime__lte=stop_datetime)

    corpus = IatvCorpus(name=corpus_name, documents=corpus_docs)

    return corpus
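A short usage sketch for build_iatv_corpus, assuming the IatvDocument/IatvCorpus models are connected to a MongoDB instance; the date range and corpus name below are placeholders.

from datetime import datetime

# Hypothetical usage: collect every document airing in a three-month window
# into a named corpus. The database connection is assumed to exist already.
start = datetime(2016, 9, 1)
stop = datetime(2016, 11, 30, 23, 59, 59)

corpus = build_iatv_corpus(start, stop, 'Viomet Sep-Nov 2016')

# build_iatv_corpus only constructs the corpus; persist it explicitly, as
# _setup_mongo does with IatvCorpus.save().
corpus.save()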
Example #3
def _teardown_mongo(test_corpus_name):
    '''
    Delete the test corpus created by _setup_mongo along with its documents.
    '''
    ic = IatvCorpus.objects(name=test_corpus_name)[0]

    # remove every document belonging to the corpus, then the corpus itself
    for doc in ic.documents:
        IatvDocument.objects(pk=doc.pk)[0].delete()

    ic.delete()
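The two helpers above are meant to bracket a test: _setup_mongo creates a throwaway corpus and _teardown_mongo removes it. A minimal sketch of that pattern follows; the try/finally and the assertion are illustrative additions, not taken from the tests below.

def test_sketch():
    # create a uniquely named test corpus and make sure it is cleaned up
    test_corpus_name = _setup_mongo()
    try:
        ic = IatvCorpus.objects(name=test_corpus_name)[0]
        assert len(ic.documents) > 0  # placeholder assertion
    finally:
        # remove the corpus and its documents even if the assertion fails
        _teardown_mongo(test_corpus_name)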
Example #4
def test_daily_frequency():

    test_corpus_name = _setup_mongo()
    date_index = pd.date_range('2016-9-1', '2016-9-4', freq='D')
    ic = IatvCorpus.objects(name=test_corpus_name)[0]

    # obtained by dividing total metaphor counts by total shows per day
    expected_metaphor_freq_all = pd.DataFrame(
        index=date_index, data={'freq': [.75, 1.5, 2.0/3.0, 2.0/3.0]}
    )

    pn = [
        'Tracy Morgans news hour', 'Dingbat Alley', 'iCry Sad News Time',
        'Digging Turnips with Ethan Land', 'Good morning, middle america!'
    ]
    n = ['MSNBCW', 'CNNW', 'FOXNEWSW']
    fw = ['kill', 'murder', 'punch', 'attack']
    so = ['trump', 'clinton', 'obama', 'media']

    input_df = _gen_test_input(pn, n, fw, so)
    daily_freq = daily_frequency(input_df, date_index, ic)

    pd.testing.assert_frame_equal(daily_freq, expected_metaphor_freq_all)

    daily_freq_by_network = daily_frequency(
        input_df, date_index, ic, by=['network']
    )[['MSNBCW', 'CNNW', 'FOXNEWSW']]

    expected_metaphor_freq_by_network = pd.DataFrame(
        index=date_index,
        data=[
            (0, 2, 1),
            (2.5, np.nan, .5),
            (0, np.nan, 1),
            (np.nan, 0, 1)
        ],
        dtype=np.float64,
        columns=pd.Index(['MSNBCW', 'CNNW', 'FOXNEWSW'], name='network')
    )

    pd.testing.assert_frame_equal(
        daily_freq_by_network, expected_metaphor_freq_by_network
    )

    # clean up the corpus created by _setup_mongo, as test_shows_per_day does
    _teardown_mongo(test_corpus_name)
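The expected frequencies above can be sanity-checked against the shows-per-day totals asserted in test_shows_per_day below; the implied per-day metaphor counts are derived here for illustration, not read from _gen_test_input.

# Illustrative arithmetic only: frequency = metaphor count / shows that day.
shows_per_day = [4, 4, 3, 3]
expected_freqs = [.75, 1.5, 2.0 / 3.0, 2.0 / 3.0]
implied_counts = [f * s for f, s in zip(expected_freqs, shows_per_day)]
# implied_counts -> [3.0, 6.0, 2.0, 2.0]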
Example #5
def test_shows_per_day():
    '''
    Insert some shows into the database using the program names (pn) and
    networks (n) from other parts of the tests.

    XXX the program names and their associated networks should live in a
    single dictionary so it is obvious which network goes with which
    program; a sketch of such a mapping follows this example. XXX
    '''
    test_corpus_name = _setup_mongo()

    ic = IatvCorpus.objects(name=test_corpus_name)[0]

    date_index = pd.date_range('2016-9-1', '2016-9-4', freq='D')

    expected_spd = pd.Series(
        index=date_index,
        data=[4, 4, 3, 3],
        dtype=np.float64
    )
    spd = shows_per_date(date_index, ic)

    pd.testing.assert_series_equal(expected_spd, spd)

    expected_spd_by_network = pd.DataFrame(
        index=date_index,
        data={
            'MSNBCW':   [2, 2, 1, 0],
            'CNNW':     [1, 0, 0, 1],
            'FOXNEWSW': [1, 2, 2, 2]
        },
        dtype=np.float64
    )
    spd_by_network = shows_per_date(date_index, ic, by_network=True)

    pd.testing.assert_frame_equal(expected_spd_by_network, spd_by_network)

    _teardown_mongo(test_corpus_name)
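A sketch of the dictionary the XXX note asks for; the pairings below are placeholders, since the actual program-to-network assignments are made in _setup_iatv_corpus and are not shown here.

# Hypothetical mapping of test program names to networks; illustrative only.
PROGRAM_NETWORKS = {
    'Tracy Morgans news hour': 'MSNBCW',
    'Dingbat Alley': 'MSNBCW',
    'iCry Sad News Time': 'CNNW',
    'Digging Turnips with Ethan Land': 'FOXNEWSW',
    'Good morning, middle america!': 'FOXNEWSW',
}

pn = list(PROGRAM_NETWORKS)
n = sorted(set(PROGRAM_NETWORKS.values()))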
Example #6
def shows_per_date(date_index, iatv_corpus, by_network=False):
    '''
    Arguments:
        date_index (pandas.DatetimeIndex): Full index of dates covered by
            data
        iatv_corpus (app.models.IatvCorpus): Obtained, e.g., using
            `iatv_corpus = IatvCorpus.objects.get(name='Viomet Sep-Nov 2016')`
        by_network (bool): whether or not to do a faceted daily count
            by network

    Returns:
        (pandas.Series) if by_network is False, (pandas.DataFrame)
            if by_network is True.
    '''
    # accept a corpus name in place of an IatvCorpus instance
    if isinstance(iatv_corpus, str):
        iatv_corpus = IatvCorpus.objects(name=iatv_corpus)[0]

    docs = iatv_corpus.documents

    n_dates = len(date_index)

    if not by_network:

        # get all date/show name tuples & remove show re-runs from same date
        prog_dates = set(
            [
                (d.program_name, d.start_localtime.date())
                for d in docs
            ]
        )

        # count total number of shows on each date
        # note we count the second entry of the tuples, which is just the
        # date, excluding program name
        shows_per_date = Counter(el[1] for el in prog_dates)

        spd_series = pd.Series(
            index=date_index,
            data=np.zeros(n_dates)
        ).sort_index()

        for date in shows_per_date:
            spd_series.loc[date] = shows_per_date[date]

        return spd_series

    else:
        # get all date/network/show name tuples
        # & remove show re-runs from same date
        prog_dates = set(
            [
                (d.program_name, d.network, d.start_localtime.date())
                for d in docs
            ]
        )

        # count total number of shows on each date for each network
        # note we count the (network, date) part of each tuple, i.e. the
        # tuple excluding the program name
        shows_per_network_per_date = Counter(el[1:] for el in prog_dates)

        spd_frame = pd.DataFrame(
            index=date_index,
            data={
                'MSNBCW': np.zeros(n_dates),
                'CNNW': np.zeros(n_dates),
                'FOXNEWSW': np.zeros(n_dates)
            }
        )

        # each tup is (network, date): index rows by date and columns by
        # network in a single .loc call to avoid chained assignment
        for tup in shows_per_network_per_date:
            spd_frame.loc[tup[1], tup[0]] = shows_per_network_per_date[tup]

        return spd_frame
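A brief usage sketch for shows_per_date; the corpus name is the one used as an example in the docstring and is assumed to exist in the connected database.

import pandas as pd

# Hypothetical call; shows_per_date also accepts a corpus name string and
# looks the corpus up itself.
date_index = pd.date_range('2016-9-1', '2016-11-30', freq='D')

total_per_day = shows_per_date(date_index, 'Viomet Sep-Nov 2016')
per_network_per_day = shows_per_date(date_index, 'Viomet Sep-Nov 2016',
                                     by_network=True)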
Example #7
def fit_all_networks(df,
                     date_range,
                     iatv_corpus_name,
                     by_network=True,
                     poisson=False,
                     verbose=False):
    '''
    Fit the two-state (ground/excited) frequency model over all candidate
    excited-state date ranges and return the best fit(s) by AIC, either per
    network or for all networks pooled.
    '''
    # The corpus is needed in both branches below, so require its name
    # before attempting the lookup.
    if iatv_corpus_name is None:
        raise RuntimeError('must provide iatv_corpus_name')

    ic = IatvCorpus.objects(name=iatv_corpus_name)[0]

    # The first date of date_range can't be the last excited state date.
    last_excited_date_candidates = date_range[1:]

    candidate_excited_date_pairs = [(fd, ld)
                                    for ld in last_excited_date_candidates
                                    for fd in date_range[date_range < ld]]

    if by_network:

        network_freq = daily_frequency(df, date_range, ic, by=['network'])

        results = {}
        for network in ['MSNBCW', 'CNNW', 'FOXNEWSW']:

            single_network = \
                network_freq[network].to_frame().reset_index().dropna()

            # this is ugly but required to match partition_AICs at this time
            single_network.columns = ['date', 'freq']

            all_fits = partition_AICs(single_network,
                                      candidate_excited_date_pairs,
                                      model_formula='freq ~ state',
                                      poisson=poisson,
                                      verbose=verbose)

            # The first date of the second-level (excited) state cannot be
            # the first date in the dataset.
            all_fits = all_fits[all_fits.first_date != date_range[0]]

            # The best fit is the one with the minimum AIC; use .loc since
            # idxmin returns an index label, not a position.
            best_fit = all_fits.loc[all_fits['AIC'].idxmin()]

            # PartitionInfo provides a data structure wrapper around data row.
            pinfo = PartitionInfo.from_fit(best_fit)

            if poisson:
                pinfo.f_ground /= 2.0
                pinfo.f_excited /= 2.0

            results.update({network: (pinfo, best_fit, all_fits)})

        return results

    else:

        all_freq = daily_frequency(df, date_range, ic).reset_index().dropna()

        all_freq.columns = ['date', 'freq']

        all_fits = partition_AICs(all_freq,
                                  candidate_excited_date_pairs,
                                  model_formula='freq ~ state')

        best_fit = all_fits.loc[all_fits['AIC'].idxmin()]

        return best_fit
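A hedged driver sketch for fit_all_networks; df is assumed to be a metaphor-annotation DataFrame accepted by daily_frequency (it is not constructed here), and the corpus name and date range are placeholders.

import pandas as pd

# Hypothetical usage; the named corpus must exist in MongoDB and df must
# already hold the annotations that daily_frequency expects.
date_range = pd.date_range('2016-9-1', '2016-11-30', freq='D')

results = fit_all_networks(df, date_range, 'Viomet Sep-Nov 2016',
                           by_network=True)

# each network maps to a (PartitionInfo, best_fit, all_fits) tuple
for network, (pinfo, best_fit, all_fits) in results.items():
    print(network, best_fit['AIC'])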