def interrogation_from_conclines(newdata):
    """Rebuild an interrogation result from a table of concordance lines.

    Rows of *newdata* are grouped by their ``'c'`` value and the ``'m'``
    values inside each group are tallied. The tallies are laid out as a
    DataFrame with one row per distinct ``'c'`` value (in sorted order) and
    one column per distinct ``'m'`` value, and that frame is passed to
    corpkit's ``editor`` with ``sort_by='total'``.

    :param newdata: concordance lines; must support ``newdata['c']``,
        ``newdata['m']`` and boolean-mask row selection (presumably a
        pandas DataFrame of subcorpus / match columns -- confirm with
        callers)
    :returns: whatever ``editor`` returns, with the concordance lines
        attached as its ``concordance`` attribute
    """
    from collections import Counter
    from pandas import DataFrame
    import corpkit
    from corpkit import editor

    lines = newdata

    # one Counter of 'm' values per distinct 'c' value
    tallies = {}
    for label in set(lines['c']):
        matched = lines[lines['c'] == label]['m']
        tallies[label] = Counter(list(matched))

    # row order is fixed once and shared by every column
    ordered_labels = sorted(tallies.keys())

    # every 'm' value seen in any group becomes a column; a Counter yields
    # 0 for groups in which the value never occurred
    seen = {token for tally in tallies.values() for token in tally}
    columns = {}
    for token in seen:
        columns[token] = [tallies[label][token] for label in ordered_labels]

    table = DataFrame(columns, index=ordered_labels)
    result = editor(table, sort_by='total', print_info=False)
    result.concordance = lines
    return result
def edit(self, *args, **kwargs):
    """Pass this object through the corpkit editor.

    Every positional and keyword argument is forwarded untouched; see
    :func:`~corpkit.interrogation.Interrogation.edit` for the arguments
    that are understood. Each value is edited with
    :func:`~corpkit.interrogation.Interrogation.edit`.

    :returns: :class:`corpkit.interrogation.Interrodict`
    """
    from corpkit import editor as run_editor

    edited = run_editor(self, *args, **kwargs)
    return edited
def interroplot(path, query):
    """Demo: interrogate a corpus, relativise the counts and plot them.

    Steps performed:

    1. ``interrogator`` searches *path* for ``'words'`` matching *query*,
       showing ``['w']``
    2. ``editor`` is applied to the results with the ``'%'`` operation and
       the interrogation totals as second operand
    3. ``plotter`` draws the edited results, titled with ``str(path)``
       (how many entries are drawn is left to ``plotter``'s defaults)

    :param path: path to corpus
    :type path: str
    :param query: Tregex query
    :type query: str
    """
    import corpkit
    from corpkit import interrogator, editor, plotter

    search = interrogator(path, 'words', query, show=['w'])
    relative = editor(
        search.results,
        '%',
        search.totals,
        print_info=False,
    )
    title = str(path)
    plotter(title, relative.results)
def interrogation_from_conclines(newdata):
    """Make a new interrogation result from its concordance lines.

    The ``'m'`` values of *newdata* are counted separately for every
    distinct ``'c'`` value. The counts are assembled into a DataFrame
    (rows: sorted ``'c'`` values, columns: distinct ``'m'`` values) that
    is run through corpkit's ``editor`` sorted by total.

    :param newdata: concordance lines exposing ``'c'`` and ``'m'`` columns
        and boolean-mask selection (looks like a pandas DataFrame --
        confirm against callers)
    :returns: the ``editor`` output, with *newdata* stored on it as
        ``concordance``
    """
    from collections import Counter
    from pandas import DataFrame
    import corpkit
    from corpkit import editor

    conc = newdata

    # map each 'c' value to a Counter over the 'm' values of its rows
    per_subcorpus = {
        name: Counter(list(conc[conc['c'] == name]['m']))
        for name in set(conc['c'])
    }

    row_names = sorted(per_subcorpus.keys())

    # union of everything counted anywhere; missing entries count as 0
    all_entries = {
        entry
        for counter in per_subcorpus.values()
        for entry in counter
    }
    by_entry = {
        entry: [per_subcorpus[name][entry] for name in row_names]
        for entry in all_entries
    }

    # turn the master dict into a dataframe, then sort it by total
    frame = DataFrame(by_entry, index=row_names)
    frame = editor(frame, sort_by='total', print_info=False)
    frame.concordance = conc
    return frame
# <markdowncell> # Lists of years and totals are pretty dry. Luckily, we can use the `plotter()` function to visualise our results. At minimum, `plotter()` needs two arguments: # 1. a title (in quotation marks) # 2. a list of results to plot # <codecell> plotter('Word counts in each subcorpus', allwords.totals) # <markdowncell> # Because we have smaller samples for 1963 and 2014, we might want to project them. To do that, we can pass subcorpus names and projection values to `editor()`: # <codecell> proj_vals = [(1963, 5), (2014, 1.37)] projected = editor(allwords.totals, projection = proj_vals) plotter('Word counts in each subcorpus (projected)', projected.totals) # <markdowncell> # Great! So, we can see that the number of words per year varies quite a lot, even after projection. That's worth keeping in mind. # <markdowncell> # ### Frequency of risk words in the NYT # <markdowncell> # Next, let's count the total number of risk words. Notice that we are using the `'both'` flag, instead of the `'count'` flag, because we want both the word and its tag. # <codecell> # our query: riskwords_query = r'__ < /(?i).?\brisk.?\b/' # any risk word and its word class/part of speech # get all risk words and their tags :
def edit(self, *args, **kwargs):
    """Forward this object and all arguments to ``corpkit.editor``.

    :returns: whatever ``editor`` returns for ``self``
    """
    from corpkit import editor

    outcome = editor(self, *args, **kwargs)
    return outcome
def edit(self, *args, **kwargs):
    """Edit results of interrogations, do keywording, sort, etc.

    >>> # rel. frequencies for words without initial capital
    >>> rel = data.edit('%', 'self', skip_entries = r'^[A-Z]')

    ``just/skip_entries`` and ``just/skip_subcorpora`` can take a few
    different kinds of input:

    * str: treated as regular expression to match
    * list:

      * of integers: indices to match
      * of strings: entries/subcorpora to match

    ``merge_entries`` and ``merge_subcorpora``, however, are best entered
    as dicts:

    ``{newname: criteria, newname2: criteria2}``

    where criteria is a string, list, etc.

    :param operation: Kind of maths to do on inputted lists:

        ``'+'``, ``'-'``, ``'/'``, ``'*'``, ``'%'``: self explanatory

        ``'k'``: log likelihood (keywords)

        ``'a'``: get distance metric

        ``'d'``: get percent difference (alternative approach to keywording)

    :type operation: str

    :param dataframe2: List of results or totals.

        If list of results, for each entry in dataframe 1, locate
        entry with same name in dataframe 2, and do maths there

        if 'self', do all merging/keeping operations, then use
        edited dataframe1 as dataframe2

    :type dataframe2: pandas.Series/pandas.DataFrame/dict/`self`

    :param sort_by: Calculate slope, stderr, r, p values, then sort by.

        ``'increase'``: highest to lowest slope value

        ``'decrease'``: lowest to highest slope value

        ``'turbulent'``: most change in y axis values

        ``'static'``: least change in y axis values

        ``'total/most'``: largest number first

        ``'infreq/least'``: smallest number first

        ``'name'``: alphabetically

    :type sort_by: str

    :param keep_stats: Keep/drop stats values from dataframe after sorting
    :type keep_stats: bool

    :param keep_top: After sorting, remove all but the top *keep_top* results
    :type keep_top: int

    :param just_totals: Sum each column and work with sums
    :type just_totals: bool

    :param threshold: When using results list as dataframe 2, drop values
        occurring fewer than n times. If not keywording, you can use:

        ``'high'``: ``dataframe2 total / 2500``

        ``'medium'``: ``dataframe2 total / 5000``

        ``'low'``: ``dataframe2 total / 10000``

        If keywording, there are smaller default thresholds

    :type threshold: int/bool

    :param just_entries: Keep matching entries
    :type just_entries: see above

    :param skip_entries: Skip matching entries
    :type skip_entries: see above

    :param merge_entries: Merge matching entries
    :type merge_entries: see above

    :param newname: New name for merged entries
    :type newname: str/``'combine'``

    :param just_subcorpora: Keep matching subcorpora
    :type just_subcorpora: see above

    :param skip_subcorpora: Skip matching subcorpora
    :type skip_subcorpora: see above

    :param span_subcorpora: If subcorpora are numerically named, span all
        from *int* to *int2*, inclusive
    :type span_subcorpora: tuple -- ``(int, int2)``

    :param merge_subcorpora: Merge matching subcorpora
    :type merge_subcorpora: see above

    :param new_subcorpus_name: Name for merged subcorpora
    :type new_subcorpus_name: str/``'combine'``

    :param replace_names: Edit result names and then merge duplicate names.
    :type replace_names: dict -- ``{criteria: replacement_text}``;
        str -- a regex to delete from names

    :param projection: Multiply the results in a subcorpus by n
    :type projection: tuple -- ``(subcorpus_name, n)``

    :param remove_above_p: Delete any result over ``p``
    :type remove_above_p: bool

    :param p: set the p value
    :type p: float

    :param revert_year: When doing linear regression on years, turn annual
        subcorpora into 1, 2 ...
    :type revert_year: bool

    :param print_info: Print stuff to console showing what's being edited
    :type print_info: bool

    :param spelling: Convert/normalise spelling:
    :type spelling: str -- ``'US'``/``'UK'``

    :param selfdrop: When keywording, try to remove target corpus from
        reference corpus
    :type selfdrop: bool

    :param calc_all: When keywording, calculate words that appear in
        either corpus
    :type calc_all: bool

    :param branch: Keyword-only. Which attribute of this object to edit:
        a value starting with ``'r'`` (case-insensitive) selects
        ``self.results``, one starting with ``'t'`` selects
        ``self.totals``. Defaults to ``'results'``. It is removed from
        the keyword arguments before they are forwarded to the editor.
    :type branch: str

    :raises ValueError: if *branch* starts with neither ``'r'`` nor ``'t'``

    :returns: :class:`corpkit.interrogation.Interrogation`
    """
    branch = kwargs.pop('branch', 'results')
    selector = branch.lower()

    if selector.startswith('r'):
        attribute = 'results'
    elif selector.startswith('t'):
        attribute = 'totals'
    else:
        # previously this case fell off the end and silently returned None
        raise ValueError(
            "branch must start with 'r' (results) or 't' (totals), "
            "got %r" % (branch,))

    # imported only once the arguments are known to be usable
    from corpkit import editor
    return editor(getattr(self, attribute), *args, **kwargs)
def make_multi(interrogation, indexnames=None):
    """Make a pandas MultiIndex version of an interrogation (for pandas geeks).

    For an ``Interrodict``, the ``results`` of every member are stacked
    into one DataFrame whose rows carry a two-level
    ``('corpus', 'subcorpus')`` index; columns are ordered by descending
    total, and that frame is returned directly.

    For anything else, each column name is split on ``'/'`` and the pieces
    become the levels of a column MultiIndex; the new frame is then passed
    through corpkit's ``editor``.

    Side effect: the pandas display option ``display.multi_sparse`` is set
    to ``False`` on the non-``Interrodict`` path.

    :param interrogation: a corpkit interrogation
    :type interrogation: a corpkit interrogation, pd.DataFrame or pd.Series
    :param indexnames: a list of names for the multiindex levels; leave as
        ``None`` to derive them from ``interrogation.query['show']`` when
        that is available; use ``False`` to explicitly leave the levels
        unnamed
    :type indexnames: list of strings/None/False
    :returns: pd.DataFrame with multiindex (for non-``Interrodict`` input,
        whatever ``editor`` returns for that frame)
    """
    import numpy as np
    import pandas as pd
    from corpkit.interrogation import Interrodict

    # readable level names for the codes found in ``query['show']``
    translator = {'f': 'Function',
                  'l': 'Lemma',
                  'r': 'Distance from root',
                  'w': 'Word',
                  't': 'Trees',
                  'i': 'Index',
                  'n': 'N-grams',
                  'p': 'POS',
                  'g': 'Governor',
                  'd': 'Dependent',
                  'gp': 'Governor POS',
                  'dp': 'Dependent POS',
                  'gl': 'Governor lemma',
                  'dl': 'Dependent lemma',
                  'gf': 'Governor function',
                  'df': 'Dependent function'}

    # if it's an interrodict, we want to make it into a single df
    if isinstance(interrogation, Interrodict):
        corpus_names, subcorpus_names, rows = [], [], []
        for name, data in interrogation.items():
            for subcorpus in data.results.index:
                corpus_names.append(name)
                subcorpus_names.append(subcorpus)
                # labels come straight from the index, so .loc is exact
                rows.append(data.results.loc[subcorpus])

        df = pd.DataFrame(
            rows, index=[np.array(corpus_names), np.array(subcorpus_names)])
        df.index.names = ['corpus', 'subcorpus']
        df = df.fillna(0)

        # order entries by overall frequency using a temporary total column
        total = ('Total', 'Total')
        df = df.T
        df[total] = df.sum(axis=1)
        df = df.sort_values(by=total, ascending=False).drop(total, axis=1).T

        # best effort: keep the original dtypes if the values are not integral
        try:
            df = df.astype(int)
        except (ValueError, TypeError):
            pass
        return df

    # determine datatype, get df and cols
    if isinstance(interrogation, pd.DataFrame):
        df = interrogation
        cols = list(interrogation.columns)
    elif isinstance(interrogation, pd.Series):
        cols = list(interrogation.index)
        df = pd.DataFrame(interrogation).T
    else:
        df = interrogation.results
        cols = list(df.columns)

    # Only derive names when the caller did not supply any. Plain pandas
    # input carries no usable ``query`` mapping (``DataFrame.query`` is a
    # method), in which case the levels simply stay unnamed.
    if indexnames is None:
        try:
            indexnames = [translator[i] for i in interrogation.query['show']]
        except (AttributeError, KeyError, TypeError):
            indexnames = None

    # split column names on slash; the first name fixes the number of levels
    split_cols = [name.split('/') for name in cols]
    n_levels = len(split_cols[0])
    arrays = [np.array([parts[level] for parts in split_cols])
              for level in range(n_levels)]

    # make output df, add names if we have them
    # NOTE(review): the row labels of ``df`` are not carried over to
    # ``newdf`` here -- confirm whether that is intended
    newdf = pd.DataFrame(df.T.values, index=arrays).T
    if indexnames:
        newdf.columns.names = indexnames
    pd.set_option('display.multi_sparse', False)

    from corpkit import editor
    return editor(newdf)