def _setup_mongo():
    test_corpus_name = 'Test ' + datetime.now().isoformat()[:-7]

    docs = _setup_iatv_corpus(test_corpus_name)

    ic = IatvCorpus(name=test_corpus_name, documents=docs)
    ic.save()

    return test_corpus_name
def build_iatv_corpus(start_datetime, stop_datetime, corpus_name,
                      program_names=None):
    '''
    Build a corpus of IATV documents airing within a given datetime range,
    optionally restricted to a set of program names.

    Arguments:
        start_datetime (datetime.datetime): datetime at/after which shows
            should be taken
        stop_datetime (datetime.datetime): datetime at/before which shows
            should be taken
        corpus_name (str): descriptive name for the corpus for future
            discovery
        program_names (list): program names to restrict the corpus to; if
            None, all programs in the datetime range are included

    Returns:
        (IatvCorpus): Newly created corpus
    '''
    if program_names is None:
        corpus_docs = IatvDocument.objects(
            start_localtime__gte=start_datetime,
            start_localtime__lte=stop_datetime
        )
    else:
        corpus_docs = IatvDocument.objects(
            program_name__in=program_names,
            start_localtime__gte=start_datetime,
            start_localtime__lte=stop_datetime
        )

    return IatvCorpus(name=corpus_name, documents=corpus_docs)
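# A minimal usage sketch for `build_iatv_corpus`; the corpus name and
# program names below are hypothetical, and a live MongoDB connection with
# IatvDocument records already loaded is assumed.
def _example_build_and_save_corpus():
    from datetime import datetime

    corpus = build_iatv_corpus(
        datetime(2016, 9, 1),
        datetime(2016, 9, 30, 23, 59, 59),
        'Example Sep 2016',
        program_names=['Dingbat Alley', 'iCry Sad News Time']
    )
    # Persist so the corpus can later be fetched by name, e.g. with
    # IatvCorpus.objects.get(name='Example Sep 2016').
    corpus.save()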
def _teardown_mongo(test_corpus_name):
    ic = IatvCorpus.objects(name=test_corpus_name)[0]

    # Iterate over all documents in the corpus and remove them, then
    # remove the corpus itself.
    for doc in ic.documents:
        IatvDocument.objects(pk=doc.pk)[0].delete()

    ic.delete()
def test_daily_frequency():

    test_corpus_name = _setup_mongo()

    date_index = pd.date_range('2016-9-1', '2016-9-4', freq='D')

    ic = IatvCorpus.objects(name=test_corpus_name)[0]

    # Obtained by dividing total metaphor counts by total shows per day.
    expected_metaphor_freq_all = pd.DataFrame(
        index=date_index,
        data={'freq': [.75, 1.5, 2.0/3.0, 2.0/3.0]}
    )

    pn = [
        'Tracy Morgans news hour',
        'Dingbat Alley',
        'iCry Sad News Time',
        'Digging Turnips with Ethan Land',
        'Good morning, middle america!'
    ]
    n = ['MSNBCW', 'CNNW', 'FOXNEWSW']
    fw = ['kill', 'murder', 'punch', 'attack']
    so = ['trump', 'clinton', 'obama', 'media']

    input_df = _gen_test_input(pn, n, fw, so)

    daily_freq = daily_frequency(input_df, date_index, ic)

    pd.testing.assert_frame_equal(daily_freq, expected_metaphor_freq_all)

    daily_freq_by_network = daily_frequency(
        input_df, date_index, ic, by=['network']
    )[['MSNBCW', 'CNNW', 'FOXNEWSW']]

    expected_metaphor_freq_by_network = pd.DataFrame(
        index=date_index,
        data=[
            (0, 2, 1),
            (2.5, np.nan, .5),
            (0, np.nan, 1),
            (np.nan, 0, 1)
        ],
        dtype=np.float64,
        columns=pd.Index(['MSNBCW', 'CNNW', 'FOXNEWSW'], name='network')
    )

    pd.testing.assert_frame_equal(
        daily_freq_by_network, expected_metaphor_freq_by_network
    )

    _teardown_mongo(test_corpus_name)
def test_shows_per_day():
    '''
    Insert some shows into the database using the program names (pn) and
    networks (n) from other parts of the tests.

    XXX The program names and their associated networks should live in a
    single {network: [programs]} dictionary so it is not necessary to
    remember which network goes with which program name. XXX
    '''
    test_corpus_name = _setup_mongo()

    ic = IatvCorpus.objects(name=test_corpus_name)[0]

    date_index = pd.date_range('2016-9-1', '2016-9-4', freq='D')

    expected_spd = pd.Series(
        index=date_index, data=[4, 4, 3, 3], dtype=np.float64
    )

    spd = shows_per_date(date_index, ic)

    pd.testing.assert_series_equal(expected_spd, spd)

    expected_spd_by_network = pd.DataFrame(
        index=date_index,
        data={
            'MSNBCW': [2, 2, 1, 0],
            'CNNW': [1, 0, 0, 1],
            'FOXNEWSW': [1, 2, 2, 2]
        },
        dtype=np.float64
    )

    spd_by_network = shows_per_date(date_index, ic, by_network=True)

    pd.testing.assert_frame_equal(expected_spd_by_network, spd_by_network)

    _teardown_mongo(test_corpus_name)
def shows_per_date(date_index, iatv_corpus, by_network=False):
    '''
    Count the number of shows aired on each date, optionally faceted by
    network.

    Arguments:
        date_index (pandas.DatetimeIndex): Full index of dates covered by
            the data
        iatv_corpus (app.models.IatvCorpus): Corpus obtained, e.g., using
            `iatv_corpus = IatvCorpus.objects.get(name='Viomet Sep-Nov 2016')`;
            a corpus name (str) is also accepted and looked up directly
        by_network (bool): whether or not to do a faceted daily count by
            network

    Returns:
        (pandas.Series) if by_network is False,
        (pandas.DataFrame) if by_network is True.
    '''
    if isinstance(iatv_corpus, str):
        iatv_corpus = IatvCorpus.objects(name=iatv_corpus)[0]

    docs = iatv_corpus.documents

    n_dates = len(date_index)

    if not by_network:
        # Get all (program name, date) pairs; building a set removes
        # re-runs of a show on the same date.
        prog_dates = {
            (d.program_name, d.start_localtime.date()) for d in docs
        }

        # Count the total number of shows on each date. Note we count only
        # the second entry of each pair, the date, excluding program name.
        shows_per_date = Counter(el[1] for el in prog_dates)

        spd_series = pd.Series(
            index=date_index, data=np.zeros(n_dates)
        ).sort_index()

        for date in shows_per_date:
            spd_series.loc[date] = shows_per_date[date]

        return spd_series

    else:
        # Get all (program name, network, date) triples; building a set
        # removes re-runs of a show on the same date.
        prog_dates = {
            (d.program_name, d.network, d.start_localtime.date())
            for d in docs
        }

        # Count the total number of shows on each date for each network.
        # Note we count the (network, date) tail of each triple, excluding
        # program name.
        shows_per_network_per_date = Counter(el[1:] for el in prog_dates)

        spd_frame = pd.DataFrame(
            index=date_index,
            data={
                'MSNBCW': np.zeros(n_dates),
                'CNNW': np.zeros(n_dates),
                'FOXNEWSW': np.zeros(n_dates)
            }
        )

        for network, date in shows_per_network_per_date:
            spd_frame.loc[date, network] = \
                shows_per_network_per_date[(network, date)]

        return spd_frame
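# A minimal sketch of how `shows_per_date` might be called, assuming the
# corpus 'Viomet Sep-Nov 2016' named in the docstring above exists in the
# connected database.
def _example_shows_per_date():
    date_index = pd.date_range('2016-9-1', '2016-11-30', freq='D')

    # A Series of total daily show counts across all networks...
    spd = shows_per_date(date_index, 'Viomet Sep-Nov 2016')

    # ...and a DataFrame with one column of daily counts per network.
    spd_by_network = shows_per_date(
        date_index, 'Viomet Sep-Nov 2016', by_network=True
    )

    return spd, spd_by_network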
def fit_all_networks(df, date_range, iatv_corpus_name, by_network=True,
                     poisson=False, verbose=False):

    # The corpus is needed whether or not we facet by network, so check
    # before attempting the lookup.
    if iatv_corpus_name is None:
        raise RuntimeError('Must provide iatv_corpus_name')

    ic = IatvCorpus.objects(name=iatv_corpus_name)[0]

    # The first date of date_range can't be the last excited-state date.
    last_excited_date_candidates = date_range[1:]

    candidate_excited_date_pairs = [
        (fd, ld)
        for ld in last_excited_date_candidates
        for fd in date_range[date_range < ld]
    ]

    if by_network:
        network_freq = daily_frequency(df, date_range, ic, by=['network'])

        results = {}
        for network in ['MSNBCW', 'CNNW', 'FOXNEWSW']:

            single_network = \
                network_freq[network].to_frame().reset_index().dropna()

            # This is ugly, but required to match partition_AICs at this
            # time.
            single_network.columns = ['date', 'freq']

            all_fits = partition_AICs(single_network,
                                      candidate_excited_date_pairs,
                                      model_formula='freq ~ state',
                                      poisson=poisson,
                                      verbose=verbose)

            # The first date of the second-level (excited) state cannot be
            # the first date in the dataset.
            all_fits = all_fits[all_fits.first_date != date_range[0]]

            # The best fit is the one with the minimum AIC; idxmin returns
            # an index label, so select with .loc.
            best_fit = all_fits.loc[all_fits['AIC'].idxmin()]

            # PartitionInfo provides a data-structure wrapper around the
            # best-fit row.
            pinfo = PartitionInfo.from_fit(best_fit)

            if poisson:
                pinfo.f_ground /= 2.0
                pinfo.f_excited /= 2.0

            results.update({network: (pinfo, best_fit, all_fits)})

        return results

    else:
        all_freq = daily_frequency(df, date_range, ic).reset_index().dropna()
        all_freq.columns = ['date', 'freq']

        all_fits = partition_AICs(all_freq,
                                  candidate_excited_date_pairs,
                                  model_formula='freq ~ state',
                                  poisson=poisson,
                                  verbose=verbose)

        return all_fits.loc[all_fits['AIC'].idxmin()]
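# A minimal sketch of how `fit_all_networks` might be driven; `input_df`
# stands in for a metaphor-instance DataFrame like the one built by
# `_gen_test_input` in the tests above, and the corpus name is assumed to
# exist in the connected database.
def _example_fit_all_networks(input_df):
    date_range = pd.date_range('2016-9-1', '2016-11-30', freq='D')

    results = fit_all_networks(
        input_df, date_range, 'Viomet Sep-Nov 2016', by_network=True
    )

    # Each network maps to (PartitionInfo, best-fit row, all fits).
    for network, (pinfo, best_fit, all_fits) in results.items():
        print(network, best_fit['AIC'])

    return results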