def shannon(self):
    """
    Compute per-word self-information for every row of the results.

    For each row, every non-zero cell is divided by the number of
    entries in that row (``len(row)``) and converted to
    ``log2(1 / probability)``. Falsy cells (e.g. zero counts) become NaN.

    :returns: An ``Interrogation`` whose ``results`` hold the
              self-information values and whose ``totals`` (named
              'Entropy') hold each row's sum divided by the number
              of columns
    """
    from corpkit.interrogation import Interrogation
    import numpy as np
    import pandas as pd

    def row_information(row):
        """Map one row of counts onto self-information values."""
        size = float(1.0 * len(row))
        values = []
        for key in row.index:
            count = row[key]
            if count:
                values.append(np.log2(1.0 / (count / size)))
            else:
                # zero counts carry no information value
                values.append(np.nan)
        return pd.Series(values, index=row.index)

    # work on .results when present, otherwise on the object itself
    table = getattr(self, 'results', self)
    information = table.apply(row_information, axis=1)
    entropy = information.sum(axis=1) / information.shape[1]
    entropy.name = 'Entropy'
    return Interrogation(results=information, totals=entropy)
def interrogator(corpus,
                 search='w',
                 query='any',
                 show='w',
                 exclude=False,
                 excludemode='any',
                 searchmode='all',
                 case_sensitive=False,
                 save=False,
                 subcorpora=False,
                 just_metadata=False,
                 skip_metadata=False,
                 preserve_case=False,
                 lemmatag=False,
                 files_as_subcorpora=False,
                 only_unique=False,
                 only_format_match=True,
                 multiprocess=False,
                 spelling=False,
                 regex_nonword_filter=r'[A-Za-z0-9]',
                 gramsize=1,
                 conc=False,
                 maxconc=9999,
                 window=None,
                 no_closed=False,
                 no_punct=True,
                 discard=False,
                 **kwargs):
    """
    Interrogate corpus, corpora, subcorpus and file objects.

    See corpkit.interrogation.interrogate() for the user-facing docstring.

    The function normalises `search`/`query`/`show`, hands the job to
    ``corpkit.multiprocess.pmultiquery`` when several queries, corpora or
    processes are involved, and otherwise runs either Tregex over whole
    subcorpora or ``corpkit.conll.pipeline`` over each file, tallying the
    matches per subcorpus.

    :param corpus: corpus-like object; anything that is not a corpkit
                   corpus type is wrapped in ``Corpus`` unless
                   multiprocessing or an ``outname`` is in play
    :param search: what to search; the shortcuts 'features', 'postags',
                   'wordclasses' and 'lexicon' are expanded here
    :param query: the pattern(s), merged with `search` by ``searchfixer``
    :param show: string or list of what to return; lowercased, padded to
                 full codes and expanded when `gramsize` > 1
    :param exclude: exclusion criteria, normalised by ``fix_search``
    :param excludemode: forwarded to the search backends
    :param searchmode: forwarded to ``pipeline``
    :param case_sensitive: compile and normalise patterns case-sensitively
    :param save: name to save the result under; ``True`` is rejected
    :param subcorpora: metadata used to group results; ``None`` collapses
                       everything into a single 'Total' entry
    :param just_metadata: forwarded to ``pipeline``
    :param skip_metadata: forwarded to ``pipeline``
    :param preserve_case: keep case in results and concordance lines
    :param lemmatag: forwarded to ``gettag`` for tree searches
    :param files_as_subcorpora: treat every file as its own subcorpus
    :param only_unique: drop duplicate concordance lines
    :param only_format_match: forwarded to ``pipeline``
    :param multiprocess: split the job and send it to ``pmultiquery``
    :param spelling: spelling normalisation; 'uk' inverts the US mapping
    :param regex_nonword_filter: regex compiled and passed to ``pipeline``
                                 as `is_a_word`
    :param gramsize: n-gram size used to expand `show`
    :param conc: ``False`` for no concordancing, 'only' to return just
                 the concordance
    :param maxconc: maximum number of concordance lines to keep
    :param window: forwarded to ``pipeline``
    :param no_closed: forwarded to ``pipeline``
    :param no_punct: forwarded to ``pipeline``
    :param discard: float or int threshold for dropping rare results from
                    each file's counts
    :param kwargs: extra options read here include 'root', 'note',
                   'quiet', 'printstatus', 'outname', 'paralleling',
                   'cql', 'tgrep', 'df1_always_df', 'denominator' and
                   'startnum'; the remainder is forwarded to ``pipeline``

    :returns: whatever ``pmultiquery`` returns for multi-part jobs; the
              string 'Bad query' (or ``None``) for invalid queries; a
              ``Concordance`` (or ``None``) when `conc` is 'only';
              otherwise an ``Interrogation``
    :raises ValueError: if `search` is an unrecognised string longer than
                        three characters, or if `save` is ``True``
    """
    conc = kwargs.get('do_concordancing', conc)
    quiet = kwargs.get('quiet', False)
    coref = kwargs.pop('coref', False)
    show_conc_metadata = kwargs.pop('show_conc_metadata', False)
    fsi_index = kwargs.pop('fsi_index', True)
    dep_type = kwargs.pop('dep_type', 'collapsed-ccprocessed-dependencies')

    nosubmode = subcorpora is None
    #todo: temporary
    #if getattr(corpus, '_dlist', False):
    #    subcorpora = 'file'

    # store kwargs and locs
    # NOTE: this snapshot is later passed to pmultiquery(**locs), so no new
    # local names may be introduced above this line
    locs = locals().copy()
    locs.update(kwargs)
    locs.pop('kwargs', None)

    import codecs
    import signal
    import os
    from time import localtime, strftime
    from collections import Counter

    import pandas as pd
    from pandas import DataFrame, Series

    from corpkit.interrogation import Interrogation, Interrodict
    from corpkit.corpus import Datalist, Corpora, Corpus, File, Subcorpus
    from corpkit.process import (tregex_engine, get_deps, unsplitter,
                                 sanitise_dict, animator, filtermaker,
                                 fix_search, pat_format, auto_usecols,
                                 format_tregex,
                                 make_conc_lines_from_whole_mid)
    from corpkit.other import as_regex
    from corpkit.dictionaries.process_types import Wordlist
    from corpkit.build import check_jdk
    from corpkit.conll import pipeline
    from corpkit.process import delete_files_and_subcorpora

    have_java = check_jdk()

    # remake corpus without bad files and folders
    corpus, skip_metadata, just_metadata = delete_files_and_subcorpora(corpus, skip_metadata, just_metadata)

    # so you can do corpus.interrogate('features/postags/wordclasses/lexicon')
    if search == 'features':
        search = 'v'
        query = 'any'
    if search in ['postags', 'wordclasses']:
        query = 'any'
        preserve_case = True
        show = 'p' if search == 'postags' else 'x'
        # use tregex if simple because it's faster
        # but use dependencies otherwise
        search = 't' if not subcorpora and not just_metadata and not skip_metadata and have_java else {'w': 'any'}
    if search == 'lexicon':
        search = 't' if not subcorpora and not just_metadata and not skip_metadata and have_java else {'w': 'any'}
        query = 'any'
        show = ['w']

    if not kwargs.get('cql') and isinstance(search, STRINGTYPE) and len(search) > 3:
        raise ValueError('search argument not recognised.')

    import re
    if regex_nonword_filter:
        is_a_word = re.compile(regex_nonword_filter)
    else:
        is_a_word = re.compile(r'.*')

    from traitlets import TraitError

    # convert cql-style queries---pop for the sake of multiprocessing
    cql = kwargs.pop('cql', None)
    if cql:
        from corpkit.cql import to_corpkit
        search, exclude = to_corpkit(search)

    def signal_handler(signal, _):
        """
        Allow pausing and restarting when not in GUI
        """
        if root:
            return
        import signal
        import sys
        from time import localtime, strftime
        # restore the original handler so a second ctrl+c really quits
        signal.signal(signal.SIGINT, original_sigint)
        thetime = strftime("%H:%M:%S", localtime())
        INPUTFUNC('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime)
        time = strftime("%H:%M:%S", localtime())
        print('%s: Interrogation resumed.\n' % time)
        signal.signal(signal.SIGINT, signal_handler)

    def add_adj_for_ngram(show, gramsize):
        """
        If there's a gramsize of more than 1, remake show
        for ngramming
        """
        if gramsize == 1:
            return show
        out = []
        for i in show:
            out.append(i)
        for i in range(1, gramsize):
            for bit in show:
                out.append('+%d%s' % (i, bit))
        return out

    def fix_show_bit(show_bit):
        """
        Take a single search/show_bit type, return match
        """
        ends = ['w', 'l', 'i', 'n', 'f', 'p', 'x', 's', 'a', 'e', 'c']
        starts = ['d', 'g', 'm', 'b', 'h', '+', '-', 'r', 'c']
        show_bit = show_bit.lstrip('n')
        show_bit = show_bit.lstrip('b')
        show_bit = list(show_bit)
        if show_bit[-1] not in ends:
            show_bit.append('w')
        if show_bit[0] not in starts:
            show_bit.insert(0, 'm')
        return ''.join(show_bit)

    def fix_show(show, gramsize):
        """
        Lowercase anything in show and turn into list
        """
        if isinstance(show, list):
            show = [i.lower() for i in show]
        elif isinstance(show, STRINGTYPE):
            show = show.lower()
            show = [show]
        show = [fix_show_bit(i) for i in show]
        return add_adj_for_ngram(show, gramsize)

    def is_multiquery(corpus, search, query, outname):
        """
        Determine if multiprocessing is needed/possible, and
        do some retyping if need be as well
        """
        is_mul = False
        from collections import OrderedDict
        from corpkit.dictionaries.process_types import Wordlist

        if isinstance(query, Wordlist):
            query = list(query)

        if subcorpora and multiprocess:
            is_mul = 'subcorpora'

        if isinstance(subcorpora, (list, tuple)):
            is_mul = 'subcorpora'

        if isinstance(query, (dict, OrderedDict)):
            is_mul = 'namedqueriessingle'

        if isinstance(search, dict):
            if all(isinstance(i, dict) for i in list(search.values())):
                is_mul = 'namedqueriesmultiple'
        return is_mul, corpus, search, query

    def ispunct(s):
        """Return True if every character of s is punctuation"""
        import string
        return all(c in string.punctuation for c in s)

    def uniquify(conc_lines):
        """get unique concordance lines"""
        from collections import OrderedDict
        unique_lines = []
        checking = []
        for index, (_, speakr, start, middle, end) in enumerate(conc_lines):
            joined = ' '.join([speakr, start, 'MIDDLEHERE:', middle, ':MIDDLEHERE', end])
            if joined not in checking:
                unique_lines.append(conc_lines[index])
            checking.append(joined)
        return unique_lines

    def compiler(pattern):
        """
        Compile regex or fail gracefully
        """
        # already compiled
        if hasattr(pattern, 'pattern'):
            return pattern
        import re
        try:
            if case_sensitive:
                comped = re.compile(pattern)
            else:
                comped = re.compile(pattern, re.IGNORECASE)
            return comped
        except Exception:
            import traceback
            import sys
            from time import localtime, strftime
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lst = traceback.format_exception(exc_type, exc_value, exc_traceback)
            error_message = lst[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Query %s' % (thetime, error_message))
            if root:
                return 'Bad query'
            else:
                raise ValueError('%s: Query %s' % (thetime, error_message))

    def determine_search_func(show):
        """Figure out what search function we're using"""

        simple_tregex_mode = False
        statsmode = False
        tree_to_text = False
        search_trees = False

        # simple mode is only possible when nothing needs per-file metadata
        simp_crit = all(not i for i in [kwargs.get('tgrep'),
                                        files_as_subcorpora,
                                        subcorpora,
                                        just_metadata,
                                        skip_metadata])

        if search.get('t') and simp_crit:
            if have_java:
                simple_tregex_mode = True
            else:
                search_trees = 'tgrep'
            optiontext = 'Searching parse trees'

        elif datatype == 'conll':

            if any(i.endswith('t') for i in search.keys()):
                if have_java and not kwargs.get('tgrep'):
                    search_trees = 'tregex'
                else:
                    search_trees = 'tgrep'
                optiontext = 'Searching parse trees'
            elif any(i.endswith('v') for i in search.keys()):
                # either of these searchers now seems to work
                #seacher = get_stats_conll
                statsmode = True
                optiontext = 'General statistics'
            elif any(i.endswith('r') for i in search.keys()):
                optiontext = 'Distance from root'
            else:
                optiontext = 'Querying CONLL data'

        return optiontext, simple_tregex_mode, statsmode, tree_to_text, search_trees

    def get_tregex_values(show):
        """If using Tregex, set appropriate values

        - Check for valid query
        - Make 'any' query
        - Make list query
        """

        translated_option = 't'
        if isinstance(search['t'], Wordlist):
            search['t'] = list(search['t'])
        q = tregex_engine(corpus=False,
                          query=search.get('t'),
                          options=['-t'],
                          check_query=True,
                          root=root,
                          preserve_case=preserve_case
                         )

        # so many of these bad fixing loops!
        nshow = []
        for i in show:
            if i == 'm':
                nshow.append('w')
            else:
                nshow.append(i.lstrip('m'))
        show = nshow

        if q is False:
            if root:
                return 'Bad query', None
            else:
                return 'Bad query', None

        if isinstance(search['t'], list):
            regex = as_regex(search['t'], boundaries='line', case_sensitive=case_sensitive)
        else:
            regex = ''

        # listquery, anyquery, translated_option
        treg_dict = {'p': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     'pl': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     'x': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     't': [r'__ < (/%s/ !< __)' % regex, r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'o'],
                     'w': [r'/%s/ !< __' % regex, r'/.?[A-Za-z0-9].?/ !< __', 't'],
                     'c': [r'/%s/ !< __' % regex, r'/.?[A-Za-z0-9].?/ !< __', 'C'],
                     'l': [r'/%s/ !< __' % regex, r'/.?[A-Za-z0-9].?/ !< __', 't'],
                     'u': [r'/%s/ !< __' % regex, r'/.?[A-Za-z0-9].?/ !< __', 'v']
                    }

        newshow = []

        listq, anyq, translated_option = treg_dict.get(show[0][-1].lower())
        newshow.append(translated_option)
        for item in show[1:]:
            _, _, noption = treg_dict.get(item.lower())
            newshow.append(noption)

        if isinstance(search['t'], list):
            search['t'] = listq
        elif search['t'] == 'any':
            search['t'] = anyq
        return search['t'], newshow

    def correct_spelling(a_string):
        """correct spelling within a string"""
        if not spelling:
            return a_string
        from corpkit.dictionaries.word_transforms import usa_convert
        if spelling.lower() == 'uk':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        bits = a_string.split('/')
        for index, i in enumerate(bits):
            converted = usa_convert.get(i.lower(), i)
            if i.islower() or preserve_case is False:
                converted = converted.lower()
            elif i.isupper() and preserve_case:
                converted = converted.upper()
            elif i.istitle() and preserve_case:
                converted = converted.title()
            bits[index] = converted
        r = '/'.join(bits)
        return r

    def make_search_iterable(corpus):
        """determine how to structure the corpus for interrogation"""
        # skip file definitions if they are not needed
        if getattr(corpus, '_dlist', False):

            return {(i.name, i.path): [i] for i in list(corpus.files)}
            #return {('Sample', 'Sample'): list(corpus.files)}

        if simple_tregex_mode:
            if corpus.level in ['s', 'f', 'd']:
                return {(corpus.name, corpus.path): False}
            else:
                return {(os.path.basename(i), os.path.join(corpus.path, i)): False
                        for i in os.listdir(corpus.path)
                        if os.path.isdir(os.path.join(corpus.path, i))}

        if isinstance(corpus, Datalist):
            to_iterate_over = {}
            # it could be files or subcorpus objects
            if corpus[0].level in ['s', 'd']:
                if files_as_subcorpora:
                    for subc in corpus:
                        for f in subc.files:
                            to_iterate_over[(f.name, f.path)] = [f]
                else:
                    for subc in corpus:
                        to_iterate_over[(subc.name, subc.path)] = subc.files
            elif corpus[0].level == 'f':
                for f in corpus:
                    to_iterate_over[(f.name, f.path)] = [f]
        elif corpus.singlefile:
            to_iterate_over = {(corpus.name, corpus.path): [corpus]}
        elif not hasattr(corpus, 'subcorpora') or not corpus.subcorpora:
            # just files in a directory
            if files_as_subcorpora:
                to_iterate_over = {}
                for f in corpus.files:
                    to_iterate_over[(f.name, f.path)] = [f]
            else:
                to_iterate_over = {(corpus.name, corpus.path): corpus.files}
        else:
            to_iterate_over = {}
            if files_as_subcorpora:
                # don't know if possible: has subcorpora but also .files
                if hasattr(corpus, 'files') and corpus.files is not None:
                    for f in corpus.files:
                        to_iterate_over[(f.name, f.path)] = [f]
                # has subcorpora with files in those
                elif hasattr(corpus, 'files') and corpus.files is None:
                    for subc in corpus.subcorpora:
                        for f in subc.files:
                            to_iterate_over[(f.name, f.path)] = [f]
            else:
                if corpus[0].level == 's':
                    for subcorpus in corpus:
                        to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files
                elif corpus[0].level == 'f':
                    for f in corpus:
                        to_iterate_over[(f.name, f.path)] = [f]
                else:
                    for subcorpus in corpus.subcorpora:
                        to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files
        return to_iterate_over

    def welcome_printer(return_it=False):
        """Print welcome message"""
        if no_conc:
            message = 'Interrogating'
        else:
            message = 'Interrogating and concordancing'
        if only_conc:
            message = 'Concordancing'
        if kwargs.get('printstatus', True):
            thetime = strftime("%H:%M:%S", localtime())
            from corpkit.process import dictformat
            sformat = dictformat(search)
            welcome = ('\n%s: %s %s ...\n %s\n '
                       'Query: %s\n %s corpus ... \n' %
                       (thetime, message, cname, optiontext, sformat, message))
            if return_it:
                return welcome
            else:
                print(welcome)

    def goodbye_printer(return_it=False, only_conc=False):
        """Say goodbye before exiting"""
        if not kwargs.get('printstatus', True):
            return
        thetime = strftime("%H:%M:%S", localtime())
        if only_conc:
            finalstring = '\n\n%s: Concordancing finished! %s results.' % (thetime, format(len(conc_df), ','))
        else:
            finalstring = '\n\n%s: Interrogation finished!' % thetime
            if countmode:
                finalstring += ' %s matches.' % format(tot, ',')
            else:
                finalstring += ' %s unique results, %s total occurrences.' % (format(numentries, ','), format(total_total, ','))
        if return_it:
            return finalstring
        else:
            print(finalstring)

    def get_conc_colnames(corpus,
                          fsi_index=False,
                          simple_tregex_mode=False):
        """Build the list of concordance column names"""
        fields = []
        base = 'c f s l m r'

        if simple_tregex_mode:
            base = base.replace('f ', '')

        if fsi_index and not simple_tregex_mode:
            base = 'i ' + base

        if PYTHON_VERSION == 2:
            base = base.encode('utf-8').split()
        else:
            base = base.split()

        if show_conc_metadata:
            from corpkit.build import get_all_metadata_fields
            meta = get_all_metadata_fields(corpus.path)

            if isinstance(show_conc_metadata, list):
                meta = [i for i in meta if i in show_conc_metadata]
            #elif show_conc_metadata is True:
            #    pass
            for i in sorted(meta):
                if i in ['speaker', 'sent_id', 'parse']:
                    continue
                if PYTHON_VERSION == 2:
                    base.append(i.encode('utf-8'))
                else:
                    base.append(i)
        return base

    def make_conc_obj_from_conclines(conc_results, fsi_index=False):
        """
        Turn conclines into DataFrame
        """
        from corpkit.interrogation import Concordance
        #fsi_place = 2 if fsi_index else 0

        all_conc_lines = []
        for sc_name, resu in sorted(conc_results.items()):
            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            #make into series
            for lin in unique_results:
                #spkr = str(spkr, errors = 'ignore')
                #if not subcorpora:
                #    lin[fsi_place] = lin[fsi_place]
                #lin.insert(fsi_place, sc_name)

                # pad short lines so they fit the column names
                if len(lin) < len(conc_col_names):
                    diff = len(conc_col_names) - len(lin)
                    lin.extend(['none'] * diff)

                all_conc_lines.append(Series(lin, index=conc_col_names))

        try:
            conc_df = pd.concat(all_conc_lines, axis=1).T
        except ValueError:
            # nothing to concatenate
            return

        if all(x == '' for x in list(conc_df['s'].values)) or \
           all(x == 'none' for x in list(conc_df['s'].values)):
            conc_df.drop('s', axis=1, inplace=True)

        locs['corpus'] = corpus.name

        if maxconc:
            conc_df = Concordance(conc_df[:maxconc])
        else:
            conc_df = Concordance(conc_df)
        try:
            conc_df.query = locs
        except AttributeError:
            pass
        return conc_df

    def lowercase_result(res):
        """
        Take any result and do spelling/lowercasing if need be

        todo: remove lowercase and change name
        """
        if not res or statsmode:
            return res
        # this is likely broken, but spelling in interrogate is deprecated anyway
        if spelling:
            res = [correct_spelling(r) for r in res]
        return res

    def postprocess_concline(line, fsi_index=False, conc=False):
        """Lowercase and spell-correct a slice of a concordance line"""
        # todo: are these right?
        if not conc:
            return line
        subc, star, en = 0, 2, 5
        if fsi_index:
            subc, star, en = 2, 4, 7
        if not preserve_case:
            line[star:en] = [str(x).lower() for x in line[star:en]]
        if spelling:
            line[star:en] = [correct_spelling(str(b)) for b in line[star:en]]
        return line

    def make_progress_bar():
        """generate a progress bar"""

        if simple_tregex_mode:
            total_files = len(list(to_iterate_over.keys()))
        else:
            total_files = sum(len(x) for x in list(to_iterate_over.values()))

        par_args = {'printstatus': kwargs.get('printstatus', True),
                    'root': root,
                    'note': note,
                    'quiet': quiet,
                    'length': total_files,
                    'startnum': kwargs.get('startnum'),
                    'denom': kwargs.get('denominator', 1)}

        term = None
        if kwargs.get('paralleling', None) is not None:
            from blessings import Terminal
            term = Terminal()
            par_args['terminal'] = term
            par_args['linenum'] = kwargs.get('paralleling')

        if in_notebook:
            par_args['welcome_message'] = welcome_message

        outn = kwargs.get('outname', '')
        if outn:
            outn = getattr(outn, 'name', outn)
            outn = outn + ': '

        tstr = '%s%d/%d' % (outn, current_iter, total_files)
        p = animator(None, None, init=True, tot_string=tstr, **par_args)
        tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
        animator(p, current_iter, tstr, **par_args)
        return p, outn, total_files, par_args

    # find out if using gui
    root = kwargs.get('root')
    note = kwargs.get('note')
    language_model = kwargs.get('language_model')

    # set up pause method
    original_sigint = signal.getsignal(signal.SIGINT)
    if kwargs.get('paralleling', None) is None:
        if not root:
            original_sigint = signal.getsignal(signal.SIGINT)
            signal.signal(signal.SIGINT, signal_handler)

    # find out about concordancing
    only_conc = False
    no_conc = False
    if conc is False:
        no_conc = True
    if isinstance(conc, str) and conc.lower() == 'only':
        only_conc = True
        no_conc = False
    numconc = 0

    # wipe non essential class attributes to not bloat query attrib
    if isinstance(corpus, Corpus):
        import copy
        corpus = copy.copy(corpus)
        # iterate over a snapshot: popping from a dict while iterating
        # over its live view raises RuntimeError
        for k, v in list(corpus.__dict__.items()):
            if isinstance(v, (Interrogation, Interrodict)):
                corpus.__dict__.pop(k, None)

    # convert path to corpus object
    if not isinstance(corpus, (Corpus, Corpora, Subcorpus, File, Datalist)):
        if not multiprocess and not kwargs.get('outname'):
            corpus = Corpus(corpus, print_info=False)

    # figure out how the user has entered the query and show, and normalise
    from corpkit.process import searchfixer
    search = searchfixer(search, query)
    show = fix_show(show, gramsize)
    locs['show'] = show

    # instantiate lemmatiser if need be
    lem_instance = False
    if any(i.endswith('l') for i in show) and isinstance(search, dict) and search.get('t'):
        from nltk.stem.wordnet import WordNetLemmatizer
        lem_instance = WordNetLemmatizer()

    # do multiprocessing if need be
    im, corpus, search, query, = is_multiquery(corpus, search, query,
                                               kwargs.get('outname', False))

    # figure out if we can multiprocess the corpus
    if hasattr(corpus, '__iter__') and im:
        corpus = Corpus(corpus, print_info=False)
    if hasattr(corpus, '__iter__') and not im:
        im = 'datalist'
    if isinstance(corpus, Corpora):
        im = 'multiplecorpora'

    # split corpus if the user wants multiprocessing but no other iterable
    if not im and multiprocess:
        im = 'datalist'
        if getattr(corpus, 'subcorpora', False):
            corpus = corpus[:]
        else:
            corpus = corpus.files

    search = fix_search(search, case_sensitive=case_sensitive, root=root)
    exclude = fix_search(exclude, case_sensitive=case_sensitive, root=root)

    # if it's already been through pmultiquery, don't do it again
    locs['search'] = search
    locs['exclude'] = exclude
    locs['query'] = query
    locs['corpus'] = corpus
    locs['multiprocess'] = multiprocess
    locs['print_info'] = kwargs.get('printstatus', True)
    locs['multiple'] = im
    locs['subcorpora'] = subcorpora
    locs['nosubmode'] = nosubmode

    # send to multiprocess function
    if im:
        signal.signal(signal.SIGINT, original_sigint)
        from corpkit.multiprocess import pmultiquery
        return pmultiquery(**locs)

    # get corpus metadata
    cname = corpus.name
    if isinstance(save, STRINGTYPE):
        savename = corpus.name + '-' + save
    if save is True:
        raise ValueError('save must be str, not bool.')

    datatype = getattr(corpus, 'datatype', 'conll')
    singlefile = getattr(corpus, 'singlefile', False)
    level = getattr(corpus, 'level', 'c')

    # store all results in here
    from collections import defaultdict
    results = defaultdict(Counter)
    count_results = defaultdict(list)
    conc_results = defaultdict(list)

    # check if just counting, turn off conc if so
    countmode = 'c' in show or 'mc' in show
    if countmode:
        no_conc = True
        only_conc = False
    # where we are at in interrogation
    current_iter = 0

    # multiprocessing progress bar
    denom = kwargs.get('denominator', 1)
    startnum = kwargs.get('startnum', 0)

    # Determine the search function to be used #
    optiontext, simple_tregex_mode, statsmode, tree_to_text, search_trees = determine_search_func(show)

    # no conc for statsmode
    if statsmode:
        no_conc = True
        only_conc = False
        conc = False

    # Set some Tregex-related values
    translated_option = False
    if search.get('t'):
        query, translated_option = get_tregex_values(show)
        if query == 'Bad query' and translated_option is None:
            if root:
                return 'Bad query'
            else:
                return
    # more tregex options
    if tree_to_text:
        treg_q = r'ROOT << __'
        op = ['-o', '-t', '-w', '-f']
    elif simple_tregex_mode:
        treg_q = search['t']
        op = ['-%s' % i for i in translated_option] + ['-o', '-f']

    # make iterable object for corpus interrogation
    to_iterate_over = make_search_iterable(corpus)

    try:
        from ipywidgets import IntProgress
        _ = IntProgress(min=0, max=10, value=1)
        in_notebook = True
    except TraitError:
        in_notebook = False
    except ImportError:
        in_notebook = False
    # caused in newest ipython
    except AttributeError:
        in_notebook = False

    lemtag = False
    if search.get('t'):
        from corpkit.process import gettag
        lemtag = gettag(search.get('t'), lemmatag)

    usecols = auto_usecols(search, exclude, show, kwargs.pop('usecols', None), coref=coref)

    # print welcome message
    welcome_message = welcome_printer(return_it=in_notebook)

    # create a progress bar
    p, outn, total_files, par_args = make_progress_bar()

    if conc:
        conc_col_names = get_conc_colnames(corpus,
                                           fsi_index=fsi_index,
                                           simple_tregex_mode=False)

    # Iterate over data, doing interrogations
    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):
        if nosubmode:
            subcorpus_name = 'Total'

        # results for subcorpus go here
        #conc_results[subcorpus_name] = []
        #count_results[subcorpus_name] = []
        #results[subcorpus_name] = Counter()

        # get either everything (tree_to_text) or the search['t'] query
        if tree_to_text or simple_tregex_mode:
            result = tregex_engine(query=treg_q,
                                   options=op,
                                   corpus=subcorpus_path,
                                   root=root,
                                   preserve_case=preserve_case)

            # format search results with slashes etc
            if not countmode and not tree_to_text:
                result = format_tregex(result, show, translated_option=translated_option,
                                       exclude=exclude, excludemode=excludemode, lemtag=lemtag,
                                       lem_instance=lem_instance, countmode=countmode,
                                       speaker_data=False)

            # if concordancing, do the query again with 'whole' sent and fname
            if not no_conc:
                ops = ['-w'] + op
                #ops = [i for i in ops if i != '-n']
                whole_result = tregex_engine(query=search['t'],
                                             options=ops,
                                             corpus=subcorpus_path,
                                             root=root,
                                             preserve_case=preserve_case
                                            )

                # format match too depending on option
                # NOTE(review): the formatted value is bound to `wholeresult`
                # and never read; `whole_result` may have been intended --
                # confirm before changing
                if not only_format_match:
                    wholeresult = format_tregex(whole_result, show, translated_option=translated_option,
                                                exclude=exclude, excludemode=excludemode, lemtag=lemtag,
                                                lem_instance=lem_instance, countmode=countmode,
                                                speaker_data=False, whole=True)

                # make conc lines from conc results
                conc_result = make_conc_lines_from_whole_mid(whole_result, result, show=show)
                for lin in conc_result:
                    if maxconc is False or numconc < maxconc:
                        conc_results[subcorpus_name].append(lin)
                    numconc += 1

            # add matches to ongoing counts
            if countmode:
                count_results[subcorpus_name] += [result]
            else:
                if result:
                    results[subcorpus_name] += Counter([i[-1] for i in result])
                else:
                    results[subcorpus_name] += Counter()

            # update progress bar
            current_iter += 1
            tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)
            continue

        # todo: move this
        kwargs.pop('by_metadata', None)

        # conll querying goes by file, not subcorpus
        for f in files:
            slow_treg_speaker_guess = kwargs.get('outname', '') if kwargs.get('multispeaker') else ''
            filepath, corefs = f.path, coref
            res, conc_res = pipeline(filepath, search=search, show=show,
                                     dep_type=dep_type,
                                     exclude=exclude,
                                     excludemode=excludemode,
                                     searchmode=searchmode,
                                     case_sensitive=case_sensitive,
                                     conc=conc,
                                     only_format_match=only_format_match,
                                     speaker=slow_treg_speaker_guess,
                                     gramsize=gramsize,
                                     no_punct=no_punct,
                                     no_closed=no_closed,
                                     window=window,
                                     filename=f.path,
                                     coref=corefs,
                                     countmode=countmode,
                                     maxconc=(maxconc, numconc),
                                     is_a_word=is_a_word,
                                     by_metadata=subcorpora,
                                     show_conc_metadata=show_conc_metadata,
                                     just_metadata=just_metadata,
                                     skip_metadata=skip_metadata,
                                     fsi_index=fsi_index,
                                     category=subcorpus_name,
                                     translated_option=translated_option,
                                     statsmode=statsmode,
                                     preserve_case=preserve_case,
                                     usecols=usecols,
                                     search_trees=search_trees,
                                     lem_instance=lem_instance,
                                     lemtag=lemtag,
                                     **kwargs)

            if res is None and conc_res is None:
                current_iter += 1
                tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                animator(p, current_iter, tstr, **par_args)
                continue

            # deal with symbolic structures---that is, rather than adding
            # results by subcorpora, add them by metadata value
            # todo: sorting?
            if subcorpora:
                for (k, v), concl in zip(res.items(), conc_res.values()):
                    v = lowercase_result(v)
                    results[k] += Counter(v)
                    for line in concl:
                        if maxconc is False or numconc < maxconc:
                            line = postprocess_concline(line,
                                                        fsi_index=fsi_index,
                                                        conc=conc)
                            conc_results[k].append(line)
                        numconc += 1

                current_iter += 1
                tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                animator(p, current_iter, tstr, **par_args)
                continue

            # garbage collection needed?
            sents = None
            corefs = None

            if res == 'Bad query':
                return 'Bad query'

            if countmode:
                count_results[subcorpus_name] += [res]

            else:
                # add filename and do lowercasing for conc
                if not no_conc:
                    for line in conc_res:
                        line = postprocess_concline(line,
                                                    fsi_index=fsi_index,
                                                    conc=conc)
                        if maxconc is False or numconc < maxconc:
                            conc_results[subcorpus_name].append(line)
                        numconc += 1

                # do lowercasing and spelling
                if not only_conc:
                    res = lowercase_result(res)
                    # discard removes low results, helping with
                    # curse of dimensionality
                    countres = Counter(res)
                    if isinstance(discard, float):
                        # keep the most common share of the distinct results
                        nkeep = len(countres) - len(countres) * discard
                        countres = Counter({k: v for i, (k, v) in enumerate(countres.most_common()) if i <= nkeep})
                    elif isinstance(discard, int):
                        countres = Counter({k: v for k, v in countres.most_common() if v >= discard})
                    results[subcorpus_name] += countres
                    #else:
                    #results[subcorpus_name] += res

            # update progress bar
            current_iter += 1
            tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)

    # Get concordances into DataFrame, return if just conc
    if not no_conc:
        # fail on this line with typeerror if no results?
        conc_df = make_conc_obj_from_conclines(conc_results, fsi_index=fsi_index)
        if only_conc and conc_df is None:
            return
        elif only_conc:
            locs = sanitise_dict(locs)
            try:
                conc_df.query = locs
            except AttributeError:
                return conc_df
            if save and not kwargs.get('outname'):
                if conc_df is not None:
                    conc_df.save(savename)
            goodbye_printer(only_conc=True)
            if not root:
                signal.signal(signal.SIGINT, original_sigint)
            return conc_df
    else:
        conc_df = None

    # Get interrogation into DataFrame
    if countmode:
        df = Series({k: sum(v) for k, v in sorted(count_results.items())})
        tot = df.sum()
    else:
        the_big_dict = {}
        unique_results = set(item for sublist in list(results.values()) for item in sublist)
        sortres = sorted(results.items(), key=lambda x: x[0])
        for word in unique_results:
            the_big_dict[word] = [subcorp_result[word] for _, subcorp_result in sortres]
        # turn master dict into dataframe, sorted
        df = DataFrame(the_big_dict, index=sorted(results.keys()))

        # for ngrams, remove hapaxes
        #if show_ngram or show_collocates:
        #    if not language_model:
        #        df = df[[i for i in list(df.columns) if df[i].sum() > 1]]

        numentries = len(df.columns)
        tot = df.sum(axis=1)
        total_total = df.sum().sum()

    # turn df into series if all conditions met
    conds = [countmode,
             files_as_subcorpora,
             subcorpora,
             kwargs.get('df1_always_df', False)]
    anyxs = [level == 's',
             singlefile,
             nosubmode]
    if all(not x for x in conds) and any(x for x in anyxs):
        # first row by position (.ix is gone from current pandas)
        df = Series(df.iloc[0])
        df.sort_values(ascending=False, inplace=True)
        tot = df.sum()
        numentries = len(df.index)
        total_total = tot

    # turn data into DF for GUI if need be
    if isinstance(df, Series) and kwargs.get('df1_always_df', False):
        total_total = df.sum()
        df = DataFrame(df)
        tot = Series(total_total, index=['Total'])

    # if we're doing files as subcorpora, we can remove the extension etc
    if isinstance(df, DataFrame) and files_as_subcorpora:
        cname = corpus.name.replace('-stripped', '').replace('-parsed', '')
        edits = [(r'(-[0-9][0-9][0-9])?\.txt\.conllu?', ''),
                 (r'-%s(-stripped)?(-parsed)?' % cname, '')]
        from corpkit.editor import editor
        df = editor(df, replace_subcorpus_names=edits).results
        tot = df.sum(axis=1)
        total_total = df.sum().sum()

    if conc_df is not None and conc_df is not False:
        # removed 'f' from here for now
        # NOTE(review): these patterns rely on Series.str.replace treating
        # them as regular expressions, which depends on the installed
        # pandas version's default -- confirm
        for col in ['c']:
            for pat in ['.txt', '.conll', '.conllu']:
                conc_df[col] = conc_df[col].str.replace(pat, '')
            conc_df[col] = conc_df[col].str.replace(r'-[0-9][0-9][0-9]$', '')

        #df.index = df.index.str.replace('w', 'this')

    # make interrogation object
    locs['corpus'] = corpus.path
    locs = sanitise_dict(locs)
    if nosubmode and isinstance(df, pd.DataFrame):
        df = df.sum()
    interro = Interrogation(results=df, totals=tot, query=locs, concordance=conc_df)

    # save it
    if save and not kwargs.get('outname'):
        print('\n')
        interro.save(savename)

    goodbye = goodbye_printer(return_it=in_notebook)
    if in_notebook:
        try:
            p.children[2].value = goodbye.replace('\n', '')
        except AttributeError:
            pass
    if not root:
        signal.signal(signal.SIGINT, original_sigint)
    return interro
def multiindex(self):
    """Create a `pandas.MultiIndex` version of results.

    Every (possibly nested) mapping level becomes one index level; the
    innermost level is the row index of each leaf's ``results``.
    Missing cells are filled with 0, values are cast to int and columns
    are ordered by descending column total.

    :Example:

    >>> d = corpora.interrogate({F: 'compound', GL: '^risk'}, show=L)
    >>> d.keys()
        ['CHT', 'WAP', 'WSJ']
    >>> d['CHT'].results
    ....  health  cancer  security  credit  flight  safety  heart
    1987      87      25        28      13       7       6      4
    1988      72      24        20      15       7       4      9
    1989     137      61        23      10       5       5      6
    >>> d.multiindex().results
    ...               health  cancer  credit  security  downside
    Corpus Subcorpus
    CHT    1987           87      25      13        28        20
           1988           72      24      15        20        12
           1989          137      61      10        23        10
    WAP    1987           83      44       8        44        10
           1988           83      27      13        40         6
           1989           95      77      18        25        12
    WSJ    1987           52      27      33         4        21
           1988           39      11      37         9        22
           1989           55      47      43         9        24

    :returns: A :class:`corpkit.interrogation.Interrogation`
    """
    import pandas as pd
    from corpkit.interrogation import Interrogation

    query = self.query

    def collect_rows(node, path, rows):
        """Append every result row below `node` to `rows`, naming each
        row with the tuple of keys leading to it."""
        if hasattr(node, 'items'):
            # mapping level: descend, extending the key path
            for key, child in list(node.items()):
                collect_rows(child, path + [key], rows)
        elif not node.results.empty:
            for _, ser in node.results.iterrows():
                ser.name = tuple(path + [ser.name])
                rows.append(ser)
        return rows

    # fresh containers per call; nothing is shared through defaults
    data = collect_rows(self, [], [])
    index = [i.name for i in data]

    # todo: better default for speakers?
    if isinstance(self.query, dict) and self.query.get('subcorpora'):
        nms = {'names': self.query['subcorpora']}
    else:
        nms = {}
    ix = pd.MultiIndex.from_tuples(index, **nms)
    df = pd.DataFrame(data, index=ix)
    df = df.fillna(0).astype(int)
    # most frequent columns first
    df = df[df.sum().sort_values(ascending=False).index]
    totals = df.sum(axis=1)
    return Interrogation(results=df, totals=totals, query=query)
def make_multi(interrogation, indexnames=None):
    """
    Make pd.MultiIndex version of an interrogation (for pandas geeks)

    For a dict/Interrodict, the entries are stacked into one DataFrame
    whose row index has two levels (dict key, original row label), with
    columns ordered by descending overall frequency.

    For anything else, the column names are split on ``/`` and the pieces
    become the levels of a column MultiIndex.

    :param interrogation: a corpkit interrogation
    :type interrogation: a corpkit interrogation, pd.DataFrame or pd.Series
    :param indexnames: pass in a list of names for the multiindex;
                       leave as None to get them if possible from
                       interrogation (its ``query['show']`` value);
                       use False to explicitly not get them
    :type indexnames: list of strings/None/False
    :raises TypeError: if *interrogation* is not one of the supported types
    :returns: :class:`corpkit.interrogation.Interrogation` holding the
              multiindexed pd.DataFrame
    """
    # get proper names for index if possible
    from corpkit.constants import transshow, transobjs
    import numpy as np
    import pandas as pd
    from corpkit.interrogation import Interrodict, Interrogation

    # if it's an interrodict, we want to make it into a single df
    if isinstance(interrogation, (Interrodict, dict)):
        outer, inner, rowdata = [], [], []
        for name, data in list(interrogation.items()):
            for subcorpus in list(data.results.index):
                # make multiindex
                outer.append(name)
                inner.append(subcorpus)
                # add results; the label comes from this very index, so
                # plain label-based lookup is what is wanted
                rowdata.append(data.results.loc[subcorpus])
        df = pd.DataFrame(rowdata, index=[np.array(outer), np.array(inner)])
        if indexnames is None:
            indexnames = ['Corpus', 'Subcorpus']
        df.index.names = indexnames
        df = df.fillna(0)
        # order the columns by overall frequency, using a temporary
        # ('Total', 'Total') column on the transposed frame
        df = df.T
        df[('Total', 'Total')] = df.sum(axis=1)
        df = df.sort_values(by=('Total', 'Total'), ascending=False).drop(
            ('Total', 'Total'), axis=1).T
        try:
            df = df.astype(int)
        except (ValueError, TypeError):
            # non-numeric cells: leave the dtype as it is
            pass
        return Interrogation(df, df.sum(axis=1),
                             getattr(interrogation, 'query', None))

    # determine datatype, get df and cols
    rows = False
    is_pandas = isinstance(interrogation, (pd.DataFrame, pd.Series))
    if isinstance(interrogation, pd.DataFrame):
        df = interrogation
    elif isinstance(interrogation, pd.Series):
        df = interrogation
    elif isinstance(interrogation, Interrogation):
        df = interrogation.results
    else:
        raise TypeError('make_multi() cannot handle input of type %s'
                        % type(interrogation).__name__)

    if isinstance(df, pd.Series):
        # a series becomes a one-row frame; its index holds the names
        cols = list(df.index)
        df = pd.DataFrame(df).T
    else:
        cols = list(df.columns)
        rows = list(df.index)

    # plain pandas objects carry no corpkit query (DataFrame.query is a
    # method, Series has no such attribute)
    query = None if is_pandas else getattr(interrogation, 'query', None)

    # work out indexnames from the query only when the caller gave none
    if indexnames is None:
        show = query.get('show') if hasattr(query, 'get') else None
        if show:
            indexnames = []
            ends = ['w', 'l', 'i', 'n', 'f', 'p', 'x', 's']
            for showval in show:
                # expand one-letter show values to object + attribute
                if len(showval) == 1:
                    if showval in ends:
                        showval = 'm' + showval
                    else:
                        showval = showval + 'w'
                a = transobjs.get(showval[0], showval[0])
                b = transshow.get(showval[-1], showval[-1])
                indexnames.append('%s %s' % (a, b.lower()))
        else:
            indexnames = False

    # split column names on slash
    cols = [name.split('/') for name in cols]

    # make numpy arrays, one per level of the new column index
    arrays = [np.array([parts[i] for parts in cols])
              for i in range(len(cols[0]))]

    # make output df, add names if we have them
    newdf = pd.DataFrame(df.T.values, index=arrays).T
    if indexnames:
        newdf.columns.names = indexnames
    if rows:
        newdf.index = rows
    pd.set_option('display.multi_sparse', False)
    totals = newdf.sum(axis=1)
    conco = getattr(interrogation, 'concordance', None)
    return Interrogation(newdf, totals, query, conco)
def editor(interrogation, operation=None, denominator=False, sort_by=False,
           keep_stats=False, keep_top=False, just_totals=False,
           threshold='medium', just_entries=False, skip_entries=False,
           span_entries=False, merge_entries=False, just_subcorpora=False,
           skip_subcorpora=False, span_subcorpora=False,
           merge_subcorpora=False, replace_names=False,
           replace_subcorpus_names=False, projection=False,
           remove_above_p=False, p=0.05, print_info=False, spelling=False,
           selfdrop=True, calc_all=True, keyword_measure='ll', **kwargs):
    """
    Edit results of an interrogation: filter, merge, rename, combine with
    a denominator, keyword, sort and trim.

    See corpkit.interrogation.Interrogation.edit() for the user-facing
    docstring. In outline:

    :param interrogation: data to edit: an Interrodict (each entry is
                          edited in turn), an Interrogation, a
                          DataFrame/Series, or concordance lines
    :param operation: ``'%'``, ``'+'``, ``'-'``, ``'*'``, ``'/'``, a string
                      starting with ``'a'``, ``'c'`` (concatenate with the
                      denominator) or ``'k'`` (keywording through
                      ``corpkit.keys.keywords``); None for no maths
    :param denominator: second dataset for *operation*: DataFrame, Series,
                        Interrogation, or ``'self'``
    :param sort_by: ``'total'``/``'most'``/True, ``'infreq'``/``'least'``,
                    ``'name'``, ``'reverse'``, ``'increase'``,
                    ``'decrease'``, ``'static'``, ``'turbulent'``, one of
                    the stat fields (``'slope'``, ``'intercept'``, ``'r'``,
                    ``'p'``, ``'stderr'``); anything else is used as a row
                    label to sort by
    :param keep_stats: keep the linear regression rows in the output
    :param keep_top: keep only the first *keep_top* entries after sorting
    :param just_totals: collapse the data into a 'Combined total' series
    :param threshold: a number, or a string starting with l/m/h, from
                      which a frequency threshold is derived
    :param just_entries/skip_entries/span_entries/merge_entries:
        keep, drop, slice or merge columns (list, Wordlist or regex)
    :param just_subcorpora/skip_subcorpora/span_subcorpora/merge_subcorpora:
        the same, for rows
    :param replace_names/replace_subcorpus_names: regex based renaming of
        columns/rows; names that become identical are summed together
    :param projection: mapping (or list of pairs) of row label to
                       multiplier
    :param remove_above_p: drop entries whose regression p value exceeds *p*
    :param print_info: print progress information
    :param spelling: ``'US'`` or ``'UK'``, to normalise column spelling
    :param selfdrop/calc_all/keyword_measure: passed on to keywording
    :param kwargs: ``branch`` selects which part of an Interrogation to
                   edit (results/totals/concordance); ``reverse`` and
                   ``df1_always_df`` are also read; the remainder is
                   passed to keywording
    :returns: an Interrogation; an Interrodict for Interrodict input; a
              Concordance when editing concordance lines; or the string
              ``'linregress'`` when statistics were needed but scipy
              could not be imported
    """
    # grab arguments, in case we get dict input and have to iterate
    locs = locals()

    import pandas as pd
    import numpy as np
    from pandas import DataFrame, Series
    from time import localtime, strftime

    # to use if we also need to worry about concordance lines
    return_conc = False

    from corpkit.interrogation import Interrodict, Interrogation, Concordance

    if interrogation.__class__ == Interrodict:
        locs.pop('interrogation', None)
        from collections import OrderedDict
        outdict = OrderedDict()
        for i, (k, v) in enumerate(interrogation.items()):
            # only print the first time around
            if i != 0:
                locs['print_info'] = False
            if isinstance(denominator, STRINGTYPE) \
                    and denominator.lower() == 'self':
                denominator = interrogation
            # if df2 is also a dict, get the relevant entry
            if isinstance(denominator, (dict, Interrodict)):
                # the three choices are mutually exclusive; an explicit
                # denominator_sum must not be overwritten by the default
                if kwargs.get('denominator_sum'):
                    locs['denominator'] = denominator.collapse(axis='key')
                elif kwargs.get('denominator_totals'):
                    locs['denominator'] = denominator[k].totals
                else:
                    locs['denominator'] = denominator[k].results
            outdict[k] = v.results.edit(**locs)
        if print_info:
            thetime = strftime("%H:%M:%S", localtime())
            print("\n%s: Finished! Output is a dictionary with keys:\n\n '%s'\n"
                  % (thetime, "'\n '".join(sorted(outdict.keys()))))
        return Interrodict(outdict)
    elif isinstance(interrogation, (DataFrame, Series)):
        dataframe1 = interrogation
    elif isinstance(interrogation, Interrogation):
        branch = kwargs.pop('branch', 'results')
        if branch.lower().startswith('t'):
            dataframe1 = interrogation.totals
        elif branch.lower().startswith('c'):
            dataframe1 = interrogation.concordance
            return_conc = True
        else:
            dataframe1 = interrogation.results
    else:
        # hope for the best; treat it as concordance lines if it has the
        # left/match/right columns
        dataframe1 = interrogation
        colnames = list(getattr(interrogation, 'columns', []))
        if isinstance(interrogation, Concordance) or \
                all(x in colnames for x in ['l', 'm', 'r']):
            return_conc = True

    pd.options.mode.chained_assignment = None

    try:
        from process import checkstack
    except ImportError:
        from corpkit.process import checkstack

    if checkstack('pythontex'):
        print_info = False

    def combiney(df, df2, operation='%', threshold='medium', prinf=True):
        """
        Mash df and df2 together in appropriate way

        :returns: (combined data, totals); totals is False when the
                  operation does not produce any
        """
        totals = False
        # delete under threshold
        if just_totals:
            if using_totals:
                if not single_totals:
                    to_drop = list(
                        df2[df2['Combined total'] < threshold].index)
                    df = df.drop([e for e in to_drop if e in list(df.index)])
                    if prinf:
                        to_show = list(to_drop[:5])
                        if len(to_drop) > 10:
                            to_show.append('...')
                            to_show.extend(to_drop[-5:])
                        if len(to_drop) > 0:
                            print('Removing %d entries below threshold:\n %s'
                                  % (len(to_drop), '\n '.join(to_show)))
                        if len(to_drop) > 10:
                            print('... and %d more ... \n'
                                  % (len(to_drop) - len(to_show) + 1))
                        else:
                            print('')
                else:
                    denom = df2
        else:
            denom = list(df2)

        def shape_warning():
            """Report that the two datasets could not be aligned"""
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: cannot combine DataFrame 1 and 2: different shapes'
                  % thetime)

        if single_totals:
            if operation == '%':
                totals = df.sum() * 100.0 / float(df.sum().sum())
                df = df * 100.0
                try:
                    df = df.div(denom, axis=0)
                except ValueError:
                    shape_warning()
            elif operation == '+':
                try:
                    df = df.add(denom, axis=0)
                except ValueError:
                    shape_warning()
            elif operation == '-':
                try:
                    df = df.sub(denom, axis=0)
                except ValueError:
                    shape_warning()
            elif operation == '*':
                totals = df.sum() * float(df.sum().sum())
                try:
                    df = df.mul(denom, axis=0)
                except ValueError:
                    shape_warning()
            elif operation == '/':
                try:
                    totals = df.sum() / float(df.sum().sum())
                    df = df.div(denom, axis=0)
                except ValueError:
                    shape_warning()
            elif operation == 'a':
                # weight every column by the reciprocal of its numeric name
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2
            elif operation.startswith('c'):
                import warnings
                with warnings.catch_warnings():
                    warnings.simplefilter("ignore")
                    df = pd.concat([df, df2], axis=1)
            return df, totals

        elif not single_totals:
            if not operation.startswith('a'):
                # generate totals
                if operation == '%':
                    totals = df.sum() * 100.0 / float(df2.sum().sum())
                if operation == '*':
                    totals = df.sum() * float(df2.sum().sum())
                if operation == '/':
                    totals = df.sum() / float(df2.sum().sum())
                if operation.startswith('c'):
                    # add here the info that merging will not work
                    # with identical colnames
                    import warnings
                    with warnings.catch_warnings():
                        warnings.simplefilter("ignore")
                        d = pd.concat([df.T, df2.T])
                        # make index nums
                        d = d.reset_index()
                        # sum and remove duplicates
                        d = d.groupby('index').sum()
                        dx = d.reset_index('index')
                        dx.index = list(dx['index'])
                        df = dx.drop('index', axis=1).T

                def editf(datum):
                    """Combine one column with its namesake in df2"""
                    meth = {'%': datum.div,
                            '*': datum.mul,
                            '/': datum.div,
                            '+': datum.add,
                            '-': datum.sub}
                    if datum.name in list(df2.columns):
                        method = meth[operation]
                        mathed = method(df2[datum.name], fill_value=0.0)
                        if operation == '%':
                            return mathed * 100.0
                        else:
                            return mathed
                    else:
                        # nothing to compare against: zero the column
                        return datum * 0.0

                df = df.apply(editf)
            else:
                for c in [c for c in list(df.columns) if int(c) > 1]:
                    df[c] = df[c] * (1.0 / int(c))
                df = df.sum(axis=1) / df2.T.sum()
        return df, totals

    def skip_keep_merge_span(df):
        """
        Do all skipping, keeping, merging and spanning
        """
        from corpkit.dictionaries.process_types import Wordlist
        listlike = (list, Wordlist)

        if skip_entries:
            if isinstance(skip_entries, listlike):
                df = df.drop(list(skip_entries), axis=1, errors='ignore')
            else:
                df = df.loc[:, ~df.columns.str.contains(skip_entries)]
        if just_entries:
            if isinstance(just_entries, listlike):
                je = [i for i in list(just_entries) if i in list(df.columns)]
                df = df[je]
            else:
                df = df.loc[:, df.columns.str.contains(just_entries)]
        if merge_entries:
            for newname, crit in merge_entries.items():
                if isinstance(crit, listlike):
                    cr = [i for i in list(crit) if i in list(df.columns)]
                    summed = df[cr].sum(axis=1)
                    df = df.drop(cr, axis=1, errors='ignore')
                else:
                    matching = df.columns.str.contains(crit)
                    summed = df.loc[:, matching].sum(axis=1)
                    df = df.loc[:, ~matching]
                df.insert(0, newname, summed, allow_duplicates=True)
        if span_entries:
            df = df.iloc[:, span_entries[0]:span_entries[1]]
        if skip_subcorpora:
            if isinstance(skip_subcorpora, listlike):
                df = df.drop(list(skip_subcorpora), axis=0, errors='ignore')
            else:
                df = df[~df.index.str.contains(skip_subcorpora)]
        if just_subcorpora:
            if isinstance(just_subcorpora, listlike):
                js = [i for i in list(just_subcorpora) if i in list(df.index)]
                df = df.loc[js]
            else:
                df = df[df.index.str.contains(just_subcorpora)]
        if merge_subcorpora:
            # same as merging entries, but on the transposed frame
            df = df.T
            for newname, crit in merge_subcorpora.items():
                if isinstance(crit, listlike):
                    crit = [i for i in list(crit) if i in list(df.columns)]
                    summed = df[list(crit)].sum(axis=1)
                    df = df.drop(list(crit), axis=1, errors='ignore')
                else:
                    matching = df.columns.str.contains(crit)
                    summed = df.loc[:, matching].sum(axis=1)
                    df = df.loc[:, ~matching]
                df.insert(0, newname, summed, allow_duplicates=True)
            df = df.T
        if span_subcorpora:
            df = df.iloc[span_subcorpora[0]:span_subcorpora[1], :]
        return df

    def convert_spell(df, convert_to='US', print_info=print_info):
        """turn dataframes into us/uk spelling"""
        try:
            from corpkit.dictionaries.word_transforms import usa_convert
        except ImportError:
            from dictionaries.word_transforms import usa_convert
        if print_info:
            print('Converting spelling ... \n')
        mapping = usa_convert
        if convert_to == 'UK':
            mapping = {v: k for k, v in list(usa_convert.items())}
        # unknown words are left as they are
        df.columns = [mapping.get(val, val) for val in list(df.columns)]
        return df

    def merge_duplicates(df, print_info=print_info):
        """sum together columns sharing a name, keeping one of each"""
        if print_info:
            print('Merging duplicate entries ... \n')
        # now we have to merge all duplicates
        dupes = sorted(df.columns[df.columns.duplicated()].unique())
        for dup in dupes:
            temp = df[dup].sum(axis=1)
            df = df.drop(dup, axis=1)
            df[dup] = temp
        return df

    def name_replacer(df, replace_names, print_info=print_info):
        """replace entry names and merge"""
        import re
        # get input into list of tuples
        # if it's a string, we want to delete it
        if isinstance(replace_names, STRINGTYPE):
            replace_names = [(replace_names, '')]
        # this is for some malformed list
        if not isinstance(replace_names, dict):
            if isinstance(replace_names[0], STRINGTYPE):
                replace_names = [replace_names]
        # if dict, make into list of tuples
        if isinstance(replace_names, dict):
            replace_names = [(v, k) for k, v in replace_names.items()]
        for to_find, replacement in replace_names:
            if print_info:
                if replacement:
                    print('Replacing "%s" with "%s" ...\n'
                          % (to_find, replacement))
                else:
                    print('Deleting "%s" from entry names ...\n' % to_find)
            to_find = re.compile(to_find)
            if not replacement:
                replacement = ''
            df.columns = [re.sub(to_find, replacement, l)
                          for l in list(df.columns)]
        df = merge_duplicates(df, print_info=False)
        return df

    def projector(df, list_of_tuples, prinf=True):
        """project abs values"""
        if isinstance(list_of_tuples, list):
            list_of_tuples = dict((a, b) for a, b in list_of_tuples)
        for subcorpus, projection_value in list(list_of_tuples.items()):
            if isinstance(subcorpus, int):
                subcorpus = str(subcorpus)
            df.loc[subcorpus] = df.loc[subcorpus] * projection_value
            if prinf:
                if isinstance(projection_value, float):
                    print('Projection: %s * %s'
                          % (subcorpus, projection_value))
                if isinstance(projection_value, int):
                    print('Projection: %s * %d'
                          % (subcorpus, projection_value))
        if prinf:
            print('')
        return df

    def lingres(ser, index):
        """linear regression of one column against *index*"""
        from scipy.stats import linregress
        ix = ['slope', 'intercept', 'r', 'p', 'stderr']
        return Series(linregress(index, ser.values), index=ix)

    def do_stats(df):
        """do linregress and add to df"""
        try:
            from scipy.stats import linregress
        except ImportError:
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: sort type not available in this version of corpkit.'
                  % thetime)
            return False
        indices = list(df.index)
        first_year = indices[0]
        try:
            # numeric row labels: use their distance from the first one
            x = [int(y) - int(first_year) for y in indices]
        except ValueError:
            x = list(range(len(indices)))
        stats = df.apply(lingres, axis=0, index=x)
        df = pd.concat([df, stats])
        df = df.replace([np.inf, -np.inf], 0.0)
        return df

    def resort(df, sort_by=False, keep_stats=False):
        """
        Sort results, potentially using scipy's linregress

        :returns: the sorted data, or False if statistics were needed
                  but could not be calculated
        """
        # translate options and make sure they are parseable
        stat_field = ['slope', 'intercept', 'r', 'p', 'stderr']
        easy_sorts = ['total', 'infreq', 'name', 'most', 'least', 'reverse']
        stat_sorts = ['increase', 'decrease', 'static', 'turbulent']
        options = stat_field + easy_sorts + stat_sorts
        sort_by_convert = {'most': 'total', True: 'total', 'least': 'infreq'}
        sort_by = sort_by_convert.get(sort_by, sort_by)

        # probably broken :(
        if just_totals:
            if sort_by == 'name':
                return df.sort_index()
            else:
                return df.sort_values(by='Combined total',
                                      ascending=sort_by != 'total',
                                      axis=1)

        stats_done = False
        if keep_stats or sort_by in stat_field + stat_sorts:
            df = do_stats(df)
            stats_done = True
            if isinstance(df, bool):
                if df is False:
                    return False

        if isinstance(df, Series):
            if stats_done:
                # the five stat values were appended last; sort without them
                stats = df.iloc[-5:]
                df = df.drop(list(stats.index))
            if sort_by == 'name':
                df = df.sort_index()
            elif sort_by == 'reverse':
                df = df[::-1]
            else:
                df = df.sort_values(ascending=sort_by != 'total')
            if stats_done:
                df = pd.concat([df, stats])
            return df

        if sort_by == 'name':
            # currently case sensitive
            df = df.reindex(columns=sorted(df.columns))
        elif sort_by in ['total', 'infreq']:
            if df1_istotals:
                df = df.T
            df = df[list(
                df.sum().sort_values(ascending=sort_by != 'total').index)]
        elif sort_by == 'reverse':
            df = df.T[::-1].T

        # sort by slope etc., or search by subcorpus name; sort_by is
        # False when we are only here because of keep_stats
        if sort_by is not False \
                and (sort_by in stat_field or sort_by not in options):
            asc = kwargs.get('reverse', False)
            df = df.T.sort_values(by=sort_by, ascending=asc).T

        if sort_by in stat_sorts:
            slopes = df.loc['slope'].astype(float)
            # column positions ordered by slope / by absolute slope
            by_slope = np.argsort(slopes.values)
            by_size = np.argsort(slopes.abs().values)
            if sort_by == 'increase':
                df = df.iloc[:, by_slope[::-1]]
            elif sort_by == 'decrease':
                df = df.iloc[:, by_slope]
            elif sort_by == 'static':
                df = df.iloc[:, by_size]
            elif sort_by == 'turbulent':
                df = df.iloc[:, by_size[::-1]]

        if remove_above_p:
            df = df.T
            df = df[df['p'] <= p]
            df = df.T

        # remove stats field by default
        if not keep_stats:
            df = df.drop(stat_field, axis=0, errors='ignore')
        return df

    def set_threshold(big_list, threshold, prinf=True):
        """turn a named or numeric threshold into a number"""
        if isinstance(threshold, STRINGTYPE):
            if threshold.startswith('l'):
                divisor = 10000
            if threshold.startswith('m'):
                divisor = 5000
            if threshold.startswith('h'):
                divisor = 2500
            if isinstance(big_list, DataFrame):
                tot = big_list.sum().sum()
            if isinstance(big_list, Series):
                tot = big_list.sum()
            tshld = float(tot) / float(divisor)
        else:
            tshld = threshold
        if prinf:
            print('Threshold: %d\n' % tshld)
        return tshld

    def filter_concordance(conc, crit, column, keep):
        """
        Keep (or, if *keep* is False, drop) concordance lines by *crit*

        *crit* is a regex matched against *column*, a list of exact
        values for *column*, or an int/list of ints naming index labels.
        """
        if isinstance(crit, int):
            crit = [crit]
        if isinstance(crit, STRINGTYPE):
            matching = conc[column].str.contains(crit)
            return conc[matching] if keep else conc[~matching]
        if isinstance(crit, list):
            if all(isinstance(e, STRINGTYPE) for e in crit):
                wanted = conc[column].map(lambda x: (x in crit) == keep)
                return conc[wanted]
            if keep:
                return conc.loc[crit]
            return conc.drop(crit, axis=0)
        return conc

    # copy dataframe to be very safe
    df = dataframe1.copy()

    # make cols into strings
    try:
        df.columns = [str(c) for c in list(df.columns)]
    except AttributeError:
        # a Series has no columns
        pass

    if operation is None:
        operation = 'None'

    if isinstance(interrogation, Concordance):
        return_conc = True

    # do concordance work
    if return_conc:
        if just_entries:
            df = filter_concordance(df, just_entries, 'm', True)
        if skip_entries:
            df = filter_concordance(df, skip_entries, 'm', False)
        if just_subcorpora:
            df = filter_concordance(df, just_subcorpora, 'c', True)
        if skip_subcorpora:
            df = filter_concordance(df, skip_subcorpora, 'c', False)
        return Concordance(df)

    if print_info:
        print('\n***Processing results***\n========================\n')

    # if just a single result, remember that and work on a frame anyway
    df1_istotals = isinstance(df, Series)
    df = DataFrame(df)

    if operation.startswith('k'):
        if sort_by is False:
            if not df1_istotals:
                sort_by = 'turbulent'
        if df1_istotals:
            df = df.T

    # figure out if there's a second list
    # copy and remove totals if there is
    single_totals = True
    using_totals = False
    outputmode = False
    df2 = None

    if denominator.__class__ == Interrogation:
        try:
            denominator = denominator.results
        except AttributeError:
            denominator = denominator.totals

    if denominator is not False and not isinstance(denominator, STRINGTYPE):
        df2 = denominator.copy()
        using_totals = True
        if isinstance(df2, DataFrame):
            if len(df2.columns) > 1:
                single_totals = False
            else:
                df2 = Series(df2.iloc[:, 0])
        elif isinstance(df2, Series):
            single_totals = True
    else:
        if operation in ['k', 'a', '%', '/', '*', '-', '+']:
            denominator = 'self'
        if denominator == 'self':
            outputmode = True

    if operation.startswith('a') or operation.startswith('A'):
        if list(df.columns)[0] != '0' and list(df.columns)[0] != 0:
            df = df.T
        if using_totals:
            if not single_totals:
                df2 = df2.T

    if projection:
        # projection shouldn't do anything when working with '%', remember.
        df = projector(df, projection)
        if using_totals:
            df2 = projector(df2, projection)

    if spelling:
        df = convert_spell(df, convert_to=spelling)
        df = merge_duplicates(df, print_info=False)
        if not single_totals:
            df2 = convert_spell(df2, convert_to=spelling, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
        if not df1_istotals:
            sort_by = 'total'

    if replace_names:
        df = name_replacer(df, replace_names)
        df = merge_duplicates(df)
        if not single_totals:
            df2 = name_replacer(df2, replace_names, print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
        if not sort_by:
            sort_by = 'total'

    if replace_subcorpus_names:
        df = name_replacer(df.T, replace_subcorpus_names)
        df = merge_duplicates(df).T
        df = df.sort_index()
        if not single_totals:
            if isinstance(df2, DataFrame):
                df2 = df2.T
            df2 = name_replacer(df2, replace_subcorpus_names,
                                print_info=False)
            df2 = merge_duplicates(df2, print_info=False)
            if isinstance(df2, DataFrame):
                df2 = df2.T
            df2 = df2.sort_index()
        if not sort_by:
            sort_by = 'total'

    # remove old stats if they're there:
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    df = df.drop(statfields, axis=0, errors='ignore')
    if using_totals:
        df2 = df2.drop(statfields, axis=0, errors='ignore')

    # remove totals and tkinter order
    leftovers = [('Total', 0), ('Total', 1),
                 ('tkintertable-order', 0), ('tkintertable-order', 1)]
    for name, ax in leftovers:
        if name == 'Total' and df1_istotals:
            continue
        df = df.drop(name, axis=ax, errors='ignore')
    if using_totals:
        for name, ax in leftovers:
            if name == 'Total' and single_totals:
                continue
            try:
                df2 = df2.drop(name, axis=ax, errors='ignore')
            except ValueError:
                # a Series denominator has no axis 1
                pass

    df = skip_keep_merge_span(df)
    if using_totals:
        try:
            df2 = skip_keep_merge_span(df2)
        except Exception:
            # best effort only: the filters are written for frames and
            # may not apply to the denominator (e.g. when it is a Series)
            pass

    # drop infinites and nans
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0.0)

    if just_totals:
        df = DataFrame(df.sum(), columns=['Combined total'])
        if using_totals:
            if not single_totals:
                df2 = DataFrame(df2.sum(), columns=['Combined total'])
            else:
                df2 = df2.sum()

    tots = df.sum(axis=1)

    if using_totals or outputmode:
        if not operation.startswith('k'):
            tshld = 0
            if outputmode is True:
                # no denominator given: use the data's own row totals
                df2 = df.T.sum()
                if not just_totals:
                    df2.name = 'Total'
                else:
                    df2.name = 'Combined total'
                using_totals = True
                single_totals = True
            # set a threshold if just_totals
            if just_totals:
                if not single_totals:
                    tshld = set_threshold(df2, threshold, prinf=print_info)
            df, tots = combiney(df, df2, operation=operation,
                                threshold=tshld, prinf=print_info)

    # if doing keywording...
    if operation.startswith('k'):
        if isinstance(denominator, STRINGTYPE):
            if denominator == 'self':
                df2 = df.copy()
            else:
                df2 = denominator
        from corpkit.keys import keywords
        df = keywords(df, df2,
                      selfdrop=selfdrop,
                      threshold=threshold,
                      print_info=print_info,
                      editing=True,
                      calc_all=calc_all,
                      sort_by=sort_by,
                      measure=keyword_measure,
                      **kwargs)

    # drop infinites and nans
    df = df.replace([np.inf, -np.inf], np.nan)
    df = df.fillna(0.0)

    # resort data
    if sort_by or keep_stats:
        df = resort(df, keep_stats=keep_stats, sort_by=sort_by)
        if isinstance(df, bool):
            if df is False:
                return 'linregress'

    if keep_top:
        if not just_totals:
            df = df[list(df.columns)[:keep_top]]
        else:
            df = df.head(keep_top)

    if just_totals:
        # turn just_totals into series:
        df = Series(df['Combined total'], name='Combined total')

    if df1_istotals:
        if operation.startswith('k'):
            try:
                df = Series(df.loc[dataframe1.name])
                df.name = '%s: keyness' % df.name
            except Exception:
                # no row named after the input: fall back to the first row
                df = df.iloc[0, :]
                df.name = 'keyness'

    # totals come from the combining step when it produced any, and from
    # the edited data otherwise
    if not isinstance(tots, DataFrame) and not isinstance(tots, Series):
        total = df.sum(axis=1)
    else:
        total = tots

    if isinstance(df, DataFrame):
        if df.empty:
            datatype = 'object'
        else:
            datatype = df.iloc[0].dtype
    else:
        datatype = df.dtype
    locs['datatype'] = datatype

    def add_tkt_index(df):
        """add an order for tkintertable if using gui"""
        if isinstance(df, Series):
            df = df.T
        df = df.drop('tkintertable-order', errors='ignore', axis=0)
        df = df.drop('tkintertable-order', errors='ignore', axis=1)
        dat = list(range(len(df.index)))
        df['tkintertable-order'] = Series(dat, index=list(df.index))
        df = df.T
        return df

    # while tkintertable can't sort rows
    if checkstack('tkinter'):
        df = add_tkt_index(df)

    if kwargs.get('df1_always_df'):
        if isinstance(df, Series):
            df = DataFrame(df)

    # delete non-appearing conc lines
    lns = None
    if isinstance(getattr(interrogation, 'concordance', None), Concordance):
        try:
            col_crit = interrogation.concordance['m'].map(
                lambda x: x in list(df.columns))
            ind_crit = interrogation.concordance['c'].map(
                lambda x: x in list(df.index))
            lns = interrogation.concordance[col_crit]
            lns = lns.loc[ind_crit]
            lns = Concordance(lns)
        except ValueError:
            lns = None

    output = Interrogation(results=df, totals=total,
                           query=locs, concordance=lns)

    if print_info:
        print('***Done!***\n========================\n')

    return output
def plotter(df,
            title=False,
            kind='line',
            x_label=None,
            y_label=None,
            style='ggplot',
            figsize=(8, 4),
            save=False,
            legend_pos='best',
            reverse_legend='guess',
            num_to_plot=7,
            tex='try',
            colours='default',
            cumulative=False,
            pie_legend=True,
            partial_pie=False,
            show_totals=False,
            transparent=False,
            output_format='png',
            interactive=False,
            black_and_white=False,
            show_p_val=False,
            indices=False,
            transpose=False,
            rot=False,
            **kwargs):
    """Visualise corpus interrogations.

    :param df: Data to be plotted
    :type df: Pandas DataFrame or Series
    :param title: A title for the plot
    :type title: str
    :param kind: The kind of chart to make
    :type kind: str ('line'/'bar'/'barh'/'pie'/'area'/'heatmap')
    :param x_label: A label for the x axis
    :type x_label: str
    :param y_label: A label for the y axis
    :type y_label: str
    :param style: Visual theme of plot
    :type style: str ('ggplot'/'bmh'/'fivethirtyeight'/'seaborn-talk'/etc)
    :param figsize: Size of plot
    :type figsize: tuple (int, int)
    :param save: If bool, save with *title* as name; if str, use str as name
    :type save: bool/str
    :param legend_pos: Where to place legend
    :type legend_pos: str ('upper right'/'outside right'/etc) or int
    :param reverse_legend: Reverse the order of the legend
    :type reverse_legend: bool/'guess'
    :param num_to_plot: How many columns to plot
    :type num_to_plot: int/'all'
    :param tex: Use TeX to draw plot text
    :type tex: bool/'try'
    :param colours: Colourmap for lines/bars/slices
    :type colours: str
    :param cumulative: Plot values cumulatively
    :type cumulative: bool
    :param pie_legend: Show a legend for pie chart
    :type pie_legend: bool
    :param partial_pie: Allow plotting of pie slices only
    :type partial_pie: bool
    :param show_totals: Print sums in plot where possible
    :type show_totals: str -- 'legend'/'plot'/'both'
    :param transparent: Transparent .png background
    :type transparent: bool
    :param output_format: File format for saved image
    :type output_format: str -- 'png'/'pdf'/etc
    :param interactive: Experimental interactive (mpld3) options
    :type interactive: bool
    :param black_and_white: Create black and white line styles
    :type black_and_white: bool
    :param show_p_val: Attempt to print p values in legend if contained in df
    :type show_p_val: bool
    :param indices: To use when plotting "distance from root"
    :type indices: bool/'guess'
    :param transpose: Transpose a DataFrame before plotting
    :type transpose: bool
    :param rot: Rotate x axis ticks by *rot* degrees
    :type rot: int/bool

    Recognised keyword arguments (everything else is handed on to the
    pandas plotting call):

    :param stacked: When making bar chart, stack bars on top of one another
    :param filled: For area and bar charts, make every column sum to 100
    :param legend: Show a legend
    :param subplots: Plot each column separately
    :param layout: Grid shape to use when *subplots* is True -- (int, int)
    :param xtickspan: Spacing for the major x tick locator on subplots
    :param grid: Show a grid (default True)
    :param draggable: Make the legend draggable
    :param legend_frame: Draw a frame around the legend (default True)
    :param legend_alpha: Legend frame alpha (default 0.8)
    :param savepath: Value for the ``savefig.directory`` rc parameter
    :param explode: Pie slices to explode (label, position, or list of them)
    :param shape/truncate/name_format: Grid shape, maximum subplot index
        and title template used when *df* has a MultiIndex

    :returns: the ``matplotlib.pyplot`` module after drawing; the result of
        ``mpld3.display()`` when *interactive*; ``None`` when interactive
        subplots are requested (unsupported)
    :raises ValueError: if *output_format* is not recognised, if no file
        name can be derived when saving, or if the saved file is missing
    :raises KeyError: if *legend_pos* is an unknown string
    """
    import corpkit
    import os
    # imported unconditionally: warnings.warn() is used further down even
    # when the optional IPython import below fails
    import warnings

    try:
        from IPython.utils.shimmodule import ShimWarning
        warnings.simplefilter('ignore', ShimWarning)
    except Exception:
        # best effort only: IPython is optional
        pass

    kwargs['rot'] = rot

    xtickspan = kwargs.pop('xtickspan', False)

    # prefer seaborn plotting
    try:
        import seaborn as sns
    except (ImportError, AttributeError):
        pass

    import matplotlib as mpl
    from matplotlib import rc
    import matplotlib.pyplot as plt
    import matplotlib.ticker as ticker

    import pandas
    from pandas import DataFrame, Series, MultiIndex

    from time import localtime, strftime
    # package-qualified, like the urlify import in get_savename below
    from corpkit.process import checkstack

    if interactive:
        import mpld3
        import collections
        from mpld3 import plugins, utils
        # NOTE(review): implicit relative import; presumably corpkit's own
        # plugins module -- confirm and qualify it
        from plugins import InteractiveLegendPlugin, HighlightLines

    have_mpldc = False
    try:
        from mpldatacursor import datacursor, HighlightingDataCursor
        have_mpldc = True
    except ImportError:
        pass

    # if the data was multiindexed, the default is a little different!
    # one subplot per first-level index value, each drawn via
    # Interrogation.visualise
    from corpkit.interrogation import Interrogation
    if isinstance(df.index, MultiIndex):
        import matplotlib.pyplot as nplt
        shape = kwargs.get('shape', 'auto')
        truncate = kwargs.get('truncate', 8)
        if shape == 'auto':
            shape = (int(len(df.index.levels[0]) / 2), 2)
        f, axes = nplt.subplots(*shape)
        for i, ((name, data), ax) in enumerate(zip(df.groupby(level=0),
                                                   axes.flatten())):
            data = data.loc[name]
            if isinstance(truncate, int) and i > truncate:
                continue
            if kwargs.get('name_format'):
                name = kwargs.get('name_format').format(name)
            data = Interrogation(results=data,
                                 totals=data.sum(axis=1),
                                 query=None)
            data.visualise(title=name,
                           ax=ax,
                           kind=kind,
                           x_label=x_label,
                           y_label=y_label,
                           style=style,
                           figsize=figsize,
                           save=save,
                           legend_pos=legend_pos,
                           reverse_legend=reverse_legend,
                           num_to_plot=num_to_plot,
                           tex=tex,
                           colours=colours,
                           cumulative=cumulative,
                           pie_legend=pie_legend,
                           partial_pie=partial_pie,
                           show_totals=show_totals,
                           transparent=transparent,
                           output_format=output_format,
                           interactive=interactive,
                           black_and_white=black_and_white,
                           show_p_val=show_p_val,
                           indices=indices,
                           transpose=transpose,
                           rot=rot)
        return nplt

    # check what environment we're in
    tk = checkstack('tkinter')
    running_python_tex = checkstack('pythontex')
    running_spider = checkstack('spyder')

    if not title:
        title = ''

    def truncate_colormap(cmap, minval=0.0, maxval=1.0, n=100):
        """remove extreme values from colourmap --- no pure white"""
        import matplotlib.colors as colors
        import numpy as np
        new_cmap = colors.LinearSegmentedColormap.from_list(
            'trunc({n},{a:.2f},{b:.2f})'.format(n=cmap.name,
                                                 a=minval,
                                                 b=maxval),
            cmap(np.linspace(minval, maxval, n)))
        return new_cmap

    def get_savename(imagefolder, save=False, title=False, ext='png'):
        """Come up with the savename for the image."""
        import os
        from corpkit.process import urlify
        if not ext.startswith('.'):
            ext = '.' + ext
        if isinstance(save, STRINGTYPE):
            savename = os.path.join(imagefolder, (urlify(save) + ext))
        elif title:
            savename = os.path.join(imagefolder, urlify(title) + ext)
        else:
            # previously fell through to an UnboundLocalError
            raise ValueError('Cannot save plot: give a title, '
                             'or pass a filename as save.')
        # remove duplicated ext
        doubled = '%s%s' % (ext, ext)
        if savename.endswith(doubled):
            savename = savename.replace(doubled, ext, 1)
        return savename

    def rename_data_with_total(dataframe,
                               was_series=False,
                               using_tex=False,
                               absolutes=True):
        """adds totals (abs, rel, keyness) to entry name strings"""
        if was_series:
            where_the_words_are = dataframe.index
        else:
            where_the_words_are = dataframe.columns
        the_labs = []
        for w in list(where_the_words_are):
            if not absolutes:
                if not was_series:
                    # no single percentage to show for a whole column
                    the_labs.append(w)
                    continue
                perc = dataframe.T[w][0]
                if using_tex:
                    # the percent sign must be escaped for TeX
                    the_labs.append('%s (%.2f\\%%)' % (w, perc))
                else:
                    the_labs.append('%s (%.2f %%)' % (w, perc))
            else:
                if was_series:
                    score = dataframe.T[w].sum()
                else:
                    score = dataframe[w].sum()
                the_labs.append('%s (n=%d)' % (w, score))
        if not was_series:
            dataframe.columns = the_labs
        else:
            vals = list(dataframe[list(dataframe.columns)[0]].values)
            dataframe = pandas.DataFrame(vals, index=the_labs)
            dataframe.columns = ['Total']
        return dataframe

    def auto_explode(dataframe, tinput, was_series=False, num_to_plot=7):
        """give me a list of strings and i'll output explode option"""
        output = [0 for s in range(num_to_plot)]
        if was_series:
            l = list(dataframe.index)
        else:
            l = list(dataframe.columns)

        if isinstance(tinput, (STRINGTYPE, int)):
            tinput = [tinput]
        if isinstance(tinput, list):
            for i in tinput:
                # strings are looked up by label, anything else is a position
                if isinstance(i, STRINGTYPE):
                    index = l.index(i)
                else:
                    index = i
                output[index] = 0.1
        return output

    def flatten_axes(axs):
        """Return the subplot axes in *axs* as a flat list."""
        if 'layout' not in kwargs:
            return [a for a in axs]
        # with a layout, the axes come back as a grid of rows
        return [bit for col in axs for bit in col]

    # get a few options from kwargs
    sbplt = kwargs.get('subplots', False)
    show_grid = kwargs.pop('grid', True)
    the_rotation = kwargs.get('rot', False)
    dragmode = kwargs.pop('draggable', False)
    leg_frame = kwargs.pop('legend_frame', True)
    leg_alpha = kwargs.pop('legend_alpha', 0.8)
    # auto set num to plot based on layout
    lo = kwargs.get('layout', None)
    if lo:
        num_to_plot = lo[0] * lo[1]

    if style == 'mpl-white':
        try:
            sns.set_style("whitegrid")
        except Exception:
            # seaborn may not have been importable
            pass
        style = 'matplotlib'

    # always removed, so that it never reaches the pandas plotting call
    savepath = kwargs.pop('savepath', None)
    if savepath:
        mpl.rcParams['savefig.directory'] = savepath

    mpl.rcParams['savefig.bbox'] = 'tight'
    mpl.rcParams.update({'figure.autolayout': True})

    # try to use tex
    # TO DO:
    # make some font kwargs here
    using_tex = False
    mpl.rcParams['font.family'] = 'sans-serif'
    # this rc key only exists in older matplotlib releases
    if 'text.latex.unicode' in mpl.rcParams:
        mpl.rcParams['text.latex.unicode'] = True

    if tex == 'try' or tex is True:
        try:
            rc('text', usetex=True)
            rc('font', **{'family': 'serif', 'serif': ['Computer Modern']})
            using_tex = True
        except Exception:
            # fall back to non-TeX text (the original referenced an
            # unbound name `matplotlib` here)
            rc('font', family='sans-serif')
            rc('font', serif='Helvetica Neue')
            rc('text', usetex=False)
    else:
        rc('text', usetex=False)

    if interactive:
        using_tex = False

    if show_totals is False:
        show_totals = 'none'

    # find out what kind of plot we're making, and enable
    # or disable interactive values if need be
    kwargs['kind'] = kind.lower()

    # 1 = highlight lines, 2 = line labels, 3 = legend switches;
    # [None] enables none of them
    interactive_types = [None]
    if interactive:
        if kwargs['kind'].startswith('bar'):
            interactive_types = [3]
        elif kwargs['kind'] in ('area', 'line'):
            interactive_types = [2, 3]
        elif kwargs['kind'] == 'pie':
            warnings.warn(
                'Interactive plotting not yet available for pie plots.')

    # find out if pie mode, add autopct format
    piemode = False
    if kind == 'pie':
        piemode = True
        if show_totals.endswith('plot') or show_totals.endswith('both'):
            kwargs['pctdistance'] = 0.6
            if using_tex:
                kwargs['autopct'] = r'%1.1f\%%'
            else:
                kwargs['autopct'] = '%1.1f%%'

    # copy data, make series into df
    dataframe = df.copy()
    if kind == 'heatmap':
        try:
            dataframe = dataframe.T
        except Exception:
            pass
    was_series = False
    if isinstance(dataframe, Series):
        was_series = True
        if not cumulative:
            dataframe = DataFrame(dataframe)
        else:
            dataframe = DataFrame(dataframe.cumsum())
    else:
        # don't know if this is much good.
        if transpose:
            dataframe = dataframe.T
        if cumulative:
            dataframe = DataFrame(dataframe.cumsum())
        if len(list(dataframe.columns)) == 1:
            was_series = True

    # remove totals and tkinter order
    if not was_series:
        for name, axis_num in zip(['Total'] * 2 + ['tkintertable-order'] * 2,
                                  [0, 1, 0, 1]):
            try:
                dataframe = dataframe.drop(name, axis=axis_num,
                                           errors='ignore')
            except Exception:
                pass
    try:
        dataframe = dataframe.drop('tkintertable-order', errors='ignore')
    except Exception:
        pass
    try:
        dataframe = dataframe.drop('tkintertable-order', axis=1,
                                   errors='ignore')
    except Exception:
        pass

    # look at columns to see if all can be ints, in which case, set up figure
    # for depnumming
    if not was_series:
        if indices == 'guess':

            def isint(x):
                try:
                    a = float(x)
                    b = int(a)
                except (ValueError, OverflowError):
                    return False
                else:
                    return a == b

            indices = all(isint(x) is True for x in list(dataframe.columns))

        # if depnumming, plot all, transpose, and rename axes
        if indices is True:
            num_to_plot = 'all'
            dataframe = dataframe.T
            if y_label is None:
                y_label = 'Percentage of all matches'
            if x_label is None:
                x_label = ''

    # set backend?
    output_formats = [
        'svgz', 'ps', 'emf', 'rgba', 'raw', 'pdf', 'svg', 'eps', 'png', 'pgf'
    ]
    if output_format not in output_formats:
        raise ValueError('%s output format not recognised. Must be: %s' %
                         (output_format, ', '.join(output_formats)))

    # don't know if these are necessary
    if 'pdf' in output_format:
        plt.switch_backend(output_format)
    if 'pgf' in output_format:
        plt.switch_backend(output_format)

    if num_to_plot == 'all':
        if was_series or piemode:
            # len(dataframe) is the number of rows, i.e. len(dataframe.index)
            num_to_plot = len(dataframe)
        else:
            num_to_plot = len(list(dataframe.columns))

    # explode pie, or remove if not piemode
    if piemode and not sbplt and kwargs.get('explode'):
        kwargs['explode'] = auto_explode(dataframe,
                                         kwargs['explode'],
                                         was_series=was_series,
                                         num_to_plot=num_to_plot)
    else:
        kwargs.pop('explode', None)

    legend = kwargs.get('legend', True)

    # cut data short
    plotting_a_totals_column = False
    if was_series:
        if list(dataframe.columns)[0] != 'Total':
            try:
                # an index that is entirely int-like is plotted in full
                can_be_ints = [int(x) for x in list(dataframe.index)]
                num_to_plot = len(dataframe)
            except Exception:
                dataframe = dataframe[:num_to_plot]
        else:
            plotting_a_totals_column = True
            if 'legend' not in kwargs:
                legend = False
            num_to_plot = len(dataframe)
    else:
        if transpose:
            dataframe = dataframe.head(num_to_plot)
        else:
            dataframe = dataframe.T.head(num_to_plot).T

    # remove stats fields, put p in entry text, etc.
    statfields = ['slope', 'intercept', 'r', 'p', 'stderr']
    try:
        dataframe = dataframe.drop(statfields, axis=1, errors='ignore')
    except Exception:
        pass
    try:
        # .loc rather than the removed .ix indexer
        dataframe.loc['p']
        there_are_p_vals = True
    except Exception:
        there_are_p_vals = False

    def p_string_formatter(val):
        """Format a p value for display in an entry name."""
        if val < 0.001:
            if not using_tex:
                return 'p < 0.001'
            else:
                return r'p $<$ 0.001'
        else:
            return 'p = %s' % format(val, '.3f')

    if show_p_val:
        if there_are_p_vals:
            newnames = []
            for col in list(dataframe.columns):
                pstr = p_string_formatter(dataframe[col]['p'])
                newnames.append('%s (%s)' % (col, pstr))
            dataframe.columns = newnames
            dataframe.drop(statfields, axis=0, inplace=True, errors='ignore')
        else:
            warnings.warn(
                'No p-values calculated to show.\n\nUse keep_stats kwarg while editing to generate these values.'
            )
    else:
        if there_are_p_vals:
            dataframe.drop(statfields, axis=0, inplace=True, errors='ignore')

    # absolute frequencies are assumed unless the first row holds
    # non-integer values
    absolutes = True
    if isinstance(dataframe, DataFrame):
        try:
            if not all([s.is_integer() for s in dataframe.iloc[0, :].values]):
                absolutes = False
        except Exception:
            pass
    else:
        if not all([s.is_integer() for s in dataframe.values]):
            absolutes = False

    ##########################################
    ################ COLOURS #################
    ##########################################

    # set defaults, with nothing for heatmap yet
    if colours is True or colours == 'default' or colours == 'Default':
        if kind != 'heatmap':
            colours = 'viridis'
        else:
            colours = 'default'

    # assume it's a single color, unless string denoting map
    cmap_or_c = 'color'
    if isinstance(colours, str):
        cmap_or_c = 'colormap'
    from matplotlib.colors import LinearSegmentedColormap
    if isinstance(colours, LinearSegmentedColormap):
        cmap_or_c = 'colormap'

    # for heatmaps, it's always a colormap
    if kind == 'heatmap':
        cmap_or_c = 'cmap'
        # if it's a defaulty string, set accordingly
        if isinstance(colours, str):
            if colours.lower().startswith('diverg'):
                colours = sns.diverging_palette(10, 133, as_cmap=True)

            # if default not set, do diverge for any df with a number < 0
            elif colours.lower() == 'default':
                mn = dataframe.min()
                if isinstance(mn, Series):
                    mn = mn.min()
                if mn < 0:
                    colours = sns.diverging_palette(10, 133, as_cmap=True)
                else:
                    colours = sns.light_palette("green", as_cmap=True)

    if 'seaborn' not in style:
        kwargs[cmap_or_c] = colours

    # reversing legend option: explicit True wins, otherwise 'guess'
    # reverses for horizontal bar and area charts only
    rev_leg = reverse_legend is True
    if kind in ['barh', 'area'] and reverse_legend == 'guess':
        rev_leg = True

    # show legend or don't
    if kind in ['bar', 'barh', 'area', 'line', 'pie']:
        if was_series:
            legend = False
        if kind == 'pie':
            legend = bool(pie_legend)

    # the default legend placement
    if legend_pos is True:
        legend_pos = 'best'

    # cut dataframe if just_totals
    try:
        tst = dataframe['Combined total']
        dataframe = dataframe.head(num_to_plot)
    except Exception:
        pass

    # no title for subplots because ugly,
    if title and not sbplt:
        kwargs['title'] = title

    # no interactive subplots yet:
    if sbplt and interactive:
        interactive = False
        warnings.warn('No interactive subplots yet, sorry.')
        return None

    if legend:
        if num_to_plot > 6:
            if not kwargs.get('ncol'):
                kwargs['ncol'] = num_to_plot // 7
        # kwarg options go in leg_options
        leg_options = {
            'framealpha': leg_alpha,
            'shadow': kwargs.get('shadow', False),
            'ncol': kwargs.pop('ncol', 1)
        }

        # determine legend position based on this dict
        if legend_pos:
            possible = {
                'best': 0,
                'upper right': 1,
                'upper left': 2,
                'lower left': 3,
                'lower right': 4,
                'right': 5,
                'center left': 6,
                'center right': 7,
                'lower center': 8,
                'upper center': 9,
                'center': 10,
                'o r': 2,
                'outside right': 2,
                'outside upper right': 2,
                'outside center right': 'center left',
                'outside lower right': 'lower left'
            }
            if isinstance(legend_pos, int):
                the_loc = legend_pos
            elif isinstance(legend_pos, str):
                try:
                    the_loc = possible[legend_pos]
                except KeyError:
                    raise KeyError(
                        'legend_pos value must be one of:\n%s\n or an int between 0-10.'
                        % ', '.join(list(possible.keys())))
            leg_options['loc'] = the_loc
            # weirdness needed for outside plot
            if legend_pos in ['o r', 'outside right', 'outside upper right']:
                leg_options['bbox_to_anchor'] = (1.02, 1)
            if legend_pos == 'outside center right':
                leg_options['bbox_to_anchor'] = (1.02, 0.5)
            if legend_pos == 'outside lower right':
                # NOTE(review): the original also had the no-op comparison
                # `leg_options['loc'] == 'upper right'` here (dropped). The
                # anchor below puts the legend inside the axes; (1.02, 0)
                # looks like what was intended -- confirm before changing.
                leg_options['bbox_to_anchor'] = (0.5, 0.5)

        # a bit of distance between legend and plot for outside legends
        if isinstance(legend_pos, str):
            if legend_pos.startswith('o'):
                leg_options['borderaxespad'] = 1

    # put totals into the entry names (for pies, only if a legend is shown)
    if not piemode or pie_legend:
        if show_totals.endswith('both') or show_totals.endswith('legend'):
            dataframe = rename_data_with_total(dataframe,
                                               was_series=was_series,
                                               using_tex=using_tex,
                                               absolutes=absolutes)

    # some pie things
    if piemode:
        if partial_pie:
            dataframe = dataframe / 100.0
        if not sbplt:
            kwargs['y'] = list(dataframe.columns)[0]

    def filler(df):
        """Rescale every row of *df* so that it sums to 100."""
        pby = df.T.copy()
        for i in list(pby.columns):
            tot = pby[i].sum()
            pby[i] = pby[i] * 100.0 / tot
        return pby.T

    areamode = kind == 'area'

    if legend is False:
        kwargs['legend'] = False

    # line highlighting option for interactive!
    if interactive:
        if 2 in interactive_types:
            if kind == 'line':
                kwargs['marker'] = ','
        if not piemode:
            kwargs['alpha'] = 0.1

    if kwargs.get('filled'):
        if areamode or kind.startswith('bar'):
            dataframe = filler(dataframe)
        kwargs.pop('filled', None)

    # dash/marker combinations cycled through for black and white lines
    MARKERSIZE = 4
    COLORMAP = {
        0: {'marker': None, 'dash': (None, None)},
        1: {'marker': None, 'dash': [5, 5]},
        2: {'marker': "o", 'dash': (None, None)},
        3: {'marker': None, 'dash': [1, 3]},
        4: {'marker': "s", 'dash': [5, 2, 5, 2, 5, 10]},
        5: {'marker': None, 'dash': [5, 3, 1, 2, 1, 10]},
        6: {'marker': 'o', 'dash': (None, None)},
        7: {'marker': None, 'dash': [5, 3, 1, 3]},
        8: {'marker': "1", 'dash': [1, 3]},
        9: {'marker': "*", 'dash': [5, 5]},
        10: {'marker': "2", 'dash': [5, 2, 5, 2, 5, 10]},
        11: {'marker': "s", 'dash': (None, None)},
    }

    if black_and_white:
        if kind == 'line':
            kwargs['linewidth'] = 1

        cmap = plt.get_cmap('Greys')
        new_cmap = truncate_colormap(cmap, 0.25, 0.95)
        if kind == 'bar':
            # darker if just one entry
            if len(dataframe.columns) == 1:
                new_cmap = truncate_colormap(cmap, 0.70, 0.90)
        kwargs[cmap_or_c] = new_cmap

    # remove things from kwargs if heatmap
    if kind == 'heatmap':
        hmargs = {
            'annot': kwargs.pop('annot', True),
            cmap_or_c: kwargs.pop(cmap_or_c, None),
            'fmt': kwargs.pop('fmt', ".2f"),
            'cbar': kwargs.pop('cbar', False)
        }

        for i in [
                'vmin', 'vmax', 'linewidths', 'linecolor', 'robust', 'center',
                'cbar_kws', 'cbar_ax', 'square', 'mask', 'norm'
        ]:
            if i in kwargs:
                hmargs[i] = kwargs.pop(i, None)

    class dummy_context_mgr():
        """a fake context for plotting without style
        perhaps made obsolete by 'classic' style in new mpl"""

        def __enter__(self):
            return None

        def __exit__(self, one, two, three):
            return False

    def rotate_degrees(rotation, labels):
        """Turn the *rot* option into a number of degrees."""
        if rotation is None:
            # rotate only when the longest label has more than 6 characters
            # (the original compared the label itself to 6)
            if labels and len(max(labels, key=len)) > 6:
                return 45
            return 0
        elif rotation is False:
            return 0
        elif rotation is True:
            return 45
        else:
            return rotation

    # set when a legend is drawn by hand below; needed again when saving
    lgd = None

    if style != 'matplotlib':
        style_context = plt.style.context((style))
    else:
        style_context = dummy_context_mgr()

    # NOTE(review): SOURCE's indentation was lost; the extent of this
    # `with` block (through the legend drawing) is reconstructed -- confirm.
    with style_context:

        kwargs.pop('filled', None)

        if not sbplt:
            # check if negative values, no stacked if so
            if areamode:
                if not kwargs.get('ax'):
                    kwargs['legend'] = False
                if dataframe.applymap(lambda x: x < 0.0).any().any():
                    kwargs['stacked'] = False
                    rev_leg = False
            if kind != 'heatmap':
                # turn off pie labels at the last minute
                if kind == 'pie' and pie_legend:
                    kwargs['labels'] = None
                    kwargs['autopct'] = '%.2f'
                if kind == 'pie':
                    kwargs.pop('color', None)
                ax = dataframe.plot(figsize=figsize, **kwargs)
            else:
                fg = plt.figure(figsize=figsize)
                if title:
                    plt.title(title)
                ax = kwargs.get('ax', plt.axes())
                tmp = sns.heatmap(dataframe, ax=ax, **hmargs)
                ax.set_title(title)
                for item in tmp.get_yticklabels():
                    item.set_rotation(0)
                plt.close(fg)

            if areamode and not kwargs.get('ax'):
                handles, labels = plt.gca().get_legend_handles_labels()
                del handles
                del labels

            if x_label:
                ax.set_xlabel(x_label)
            if y_label:
                ax.set_ylabel(y_label)

        else:
            if not kwargs.get('layout'):
                plt.gcf().set_tight_layout(False)

            if kind != 'heatmap':
                ax = dataframe.plot(figsize=figsize, **kwargs)
            else:
                plt.figure(figsize=figsize)
                if title:
                    plt.title(title)
                ax = plt.axes()
                sns.heatmap(dataframe, ax=ax, **hmargs)
                plt.xticks(rotation=0)
                plt.yticks(rotation=0)

        if sbplt:
            for index, a in enumerate(flatten_axes(ax)):
                if xtickspan is not False:
                    a.xaxis.set_major_locator(
                        ticker.MultipleLocator(xtickspan))
                labels = [item.get_text() for item in a.get_xticklabels()]
                rotation = rotate_degrees(the_rotation, labels)
                # NOTE(review): this addresses `ax` (the collection of
                # axes), not `a`; the AttributeError is swallowed, so it
                # may do nothing. Left as is because the labels read above
                # may be empty before the figure is drawn -- confirm intent.
                try:
                    if the_rotation == 0:
                        ax.set_xticklabels(labels,
                                           rotation=rotation,
                                           ha='center')
                    else:
                        ax.set_xticklabels(labels,
                                           rotation=rotation,
                                           ha='right')
                except AttributeError:
                    pass
        else:
            if kind == 'heatmap':
                labels = [item.get_text() for item in ax.get_xticklabels()]
                rotation = rotate_degrees(the_rotation, labels)
                if the_rotation == 0:
                    ax.set_xticklabels(labels, rotation=rotation, ha='center')
                else:
                    ax.set_xticklabels(labels, rotation=rotation, ha='right')

        if transparent:
            plt.gcf().patch.set_facecolor('white')
            plt.gcf().patch.set_alpha(0)

        if black_and_white:
            if kind == 'line':
                # white background
                # change everything to black and white with interesting
                # dashes and markers, cycling through COLORMAP
                for c, line in enumerate(ax.get_lines()):
                    spec = COLORMAP[c % len(COLORMAP)]
                    line.set_color('black')
                    line.set_dashes(spec['dash'])
                    line.set_marker(spec['marker'])
                    line.set_markersize(MARKERSIZE)

        # draw legend with proper placement etc
        if legend:
            if not piemode and not sbplt and kind != 'heatmap':
                if 3 not in interactive_types:
                    handles, labels = plt.gca().get_legend_handles_labels()
                    if rev_leg:
                        handles = handles[::-1]
                        labels = labels[::-1]
                    if kwargs.get('ax'):
                        lgd = plt.gca().legend(handles, labels, **leg_options)
                        ax.get_legend().draw_frame(leg_frame)
                    else:
                        lgd = plt.legend(handles, labels, **leg_options)
                        lgd.draw_frame(leg_frame)

    if interactive:
        # 1 = highlight lines
        # 2 = line labels
        # 3 = legend switches
        ax = plt.gca()
        # fails for piemode
        lines = ax.lines
        handles, labels = plt.gca().get_legend_handles_labels()
        if 1 in interactive_types:
            plugins.connect(plt.gcf(), HighlightLines(lines))

        if 3 in interactive_types:
            plugins.connect(
                plt.gcf(),
                InteractiveLegendPlugin(lines, labels, alpha_unsel=0.0))

        for i, l in enumerate(lines):
            y_vals = l.get_ydata()
            x_vals = l.get_xdata()
            x_vals = [str(x) for x in x_vals]
            if absolutes:
                ls = [
                    '%s (%s: %d)' % (labels[i], x_val, y_val)
                    for x_val, y_val in zip(x_vals, y_vals)
                ]
            else:
                ls = [
                    '%s (%s: %.2f%%)' % (labels[i], x_val, y_val)
                    for x_val, y_val in zip(x_vals, y_vals)
                ]
            if 2 in interactive_types:
                tooltip_line = mpld3.plugins.LineLabelTooltip(
                    lines[i], labels[i])
                mpld3.plugins.connect(plt.gcf(), tooltip_line)
                if kind == 'line':
                    tooltip_point = mpld3.plugins.PointLabelTooltip(l,
                                                                    labels=ls)
                    mpld3.plugins.connect(plt.gcf(), tooltip_point)

    if piemode:
        if not sbplt:
            plt.axis('equal')
            ax.get_xaxis().set_visible(False)
            ax.get_yaxis().set_visible(False)

    # hacky: turn legend into subplot titles :)
    if sbplt:
        for index, a in enumerate(flatten_axes(ax)):
            try:
                titletext = list(dataframe.columns)[index]
            except Exception:
                # more axes than columns: the previous title is reused
                pass
            a.set_title(titletext)
            try:
                a.legend_.remove()
            except Exception:
                pass
            # remove axis labels for pie plots
            if piemode:
                a.axes.get_xaxis().set_visible(False)
                a.axes.get_yaxis().set_visible(False)
                a.axis('equal')

            # positional: the keyword was renamed between matplotlib releases
            a.grid(show_grid)

    # add sums to bar graphs
    if not sbplt:
        # show grid
        ax.grid(show_grid)

        # NOTE(review): SOURCE's indentation was lost; restricting the
        # annotations to bar charts is reconstructed -- confirm.
        if kind.startswith('bar'):
            if show_totals.endswith('plot') or show_totals.endswith('both'):
                if was_series:
                    # make plot a bit higher if putting these totals on it
                    plt.ylim([0, plt.ylim()[1] * 1.05])
                    groups = [dataframe.loc[label]
                              for label in list(dataframe.index)]
                else:
                    groups = [dataframe[label]
                              for label in list(dataframe.columns)]
                for i, values in enumerate(groups):
                    if len(values) == 1:
                        score = values.iloc[0]
                    elif absolutes:
                        score = values.sum()
                    else:
                        # a total percentage cannot be determined from
                        # individual percentages
                        continue
                    if not absolutes:
                        plt.annotate('%.2f' % score, (i, score),
                                     ha='center',
                                     va='bottom')
                    else:
                        plt.annotate(score, (i, score),
                                     ha='center',
                                     va='bottom')

    if not kwargs.get('layout') and not sbplt and not kwargs.get('ax'):
        plt.tight_layout()

    if kwargs.get('ax'):
        try:
            plt.gcf().set_tight_layout(False)
        except Exception:
            pass

    if save:
        if running_python_tex:
            imagefolder = '../images'
        else:
            imagefolder = 'images'

        savename = get_savename(imagefolder,
                                save=save,
                                title=title,
                                ext=output_format)

        if not os.path.isdir(imagefolder):
            os.makedirs(imagefolder)

        # save image and get on with our lives
        # an outside legend has to be passed along explicitly so that it is
        # not cropped; only possible when one was actually drawn above
        outside_legend = (isinstance(legend_pos, str)
                          and legend_pos.startswith('o')
                          and not sbplt
                          and lgd is not None)
        if outside_legend:
            plt.gcf().savefig(savename,
                              dpi=150,
                              bbox_extra_artists=(lgd, ),
                              bbox_inches='tight',
                              format=output_format)
        else:
            plt.gcf().savefig(savename, dpi=150, format=output_format)
        time = strftime("%H:%M:%S", localtime())
        if os.path.isfile(savename):
            print('\n' + time + ": " + savename + " created.")
        else:
            raise ValueError("Error making %s." % savename)

    if dragmode:
        plt.legend().draggable()

    if sbplt:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)

    # add DataCursor to notebook backend if possible
    if have_mpldc:
        if kind == 'line':
            HighlightingDataCursor(
                plt.gca().get_lines(),
                highlight_width=4,
                highlight_color=False,
                formatter=lambda **kwargs: '%s: %s' %
                (kwargs['label'], "{0:.3f}".format(kwargs['y'])))
        else:
            datacursor(formatter=lambda **kwargs: '%s: %s' %
                       (kwargs['label'], "{0:.3f}".format(kwargs['height'])))

    if interactive:
        plt.subplots_adjust(right=.8)
        plt.subplots_adjust(left=.1)
        try:
            ax.legend_.remove()
        except Exception:
            pass
        return mpld3.display()
    else:
        return plt
def interrogator(
    corpus,
    search,
    query="any",
    show="w",
    exclude=False,
    excludemode="any",
    searchmode="all",
    dep_type="collapsed-ccprocessed-dependencies",
    case_sensitive=False,
    quicksave=False,
    just_speakers=False,
    preserve_case=False,
    lemmatag=False,
    files_as_subcorpora=False,
    conc=False,
    only_unique=False,
    random=False,
    only_format_match=False,
    multiprocess=False,
    spelling=False,
    regex_nonword_filter=r"[A-Za-z0-9:_]",
    gramsize=2,
    split_contractions=False,
    **kwargs
):
    """interrogate corpus, corpora, subcorpus and file objects

    see corpkit.interrogation.interrogate() for the user-facing docstring

    :param corpus: path string (wrapped in ``Corpus``) or a corpus object
                   exposing ``datatype``, ``singlefile``, ``subcorpora``,
                   ``files``, ``structure``, ``name`` and ``path``; an
                   iterable of corpora is handed to ``pmultiquery``
    :param search: what to search, normalised by ``searchfixer`` into a dict
                   keyed by search type (``t``, ``w``, ``n``, ``s``, ...)
    :param query: the query itself; a list/dict of queries is handed to
                  ``pmultiquery``
    :param show: string or list of things to return (``w``, ``l``, ``p``,
                 ``pl``, ``t``; ``c`` means just count)
    :param exclude: False, or a dict of regexes keyed by ``w``/``l``/``p``/``pl``
    :param excludemode: ``any`` (one matching pattern excludes) or ``all``
    :param searchmode: passed through to the per-file searcher
    :param dep_type: dependency type passed to ``get_deps`` and the searcher
    :param case_sensitive: compile regexes without ``re.IGNORECASE``
    :param quicksave: call ``.save()`` on the returned object
    :param just_speakers: restrict parsed sentences to these speaker names
    :param preserve_case: do not lowercase results
    :param lemmatag: WordNet tag to lemmatise with; when falsy it is guessed
                     from the tregex query
    :param files_as_subcorpora: treat every file as its own subcorpus
    :param conc: produce a ``Concordance`` rather than an ``Interrogation``
    :param only_unique: drop duplicate concordance lines
    :param random: shuffle concordance lines
    :param only_format_match: passed to the searcher; when False the whole
                              concordance line is formatted as well
    :param multiprocess: stored in the arguments forwarded to ``pmultiquery``
    :param spelling: False, or a variety name; ``uk`` inverts ``usa_convert``
    :param regex_nonword_filter: tokens must match this to enter an n-gram
    :param gramsize: number of tokens per n-gram
    :param split_contractions: leave contractions split in tokenised text
    :param kwargs: recognised keys are ``root``, ``note``, ``outname``,
                   ``regex``, ``printstatus``, ``paralleling``,
                   ``denominator``, ``startnum`` and ``df1_always_df``
    :returns: ``Interrogation``, or ``Concordance`` if *conc*; whatever
              ``pmultiquery`` returns for multi-queries; ``"Bad query"`` if a
              query is rejected while a GUI ``root`` is present (``None`` for
              a rejected tregex query without one)
    :raises ValueError: bad regex without a GUI root, a parse-only search on
                        an unparsed corpus, or no searcher for the request
    :raises NotImplementedError: n-gramming a plaintext corpus
    """
    # progress counter shared with get_stats()
    global numdone

    # store kwargs. this has to happen before any other local name is bound,
    # because the dict is later forwarded wholesale to pmultiquery()
    locs = locals()

    # os and re are used by several of the nested helpers below
    import os
    import re

    from corpkit.interrogation import Interrogation
    from corpkit.process import tregex_engine
    import pandas as pd
    from pandas import DataFrame, Series
    from collections import Counter
    from corpkit.other import as_regex
    from corpkit.process import get_deps
    from time import localtime, strftime
    from corpkit.textprogressbar import TextProgressBar
    from corpkit.process import animator
    from corpkit.dictionaries.word_transforms import wordlist, taglemma

    # find out if using gui
    root = kwargs.get("root")
    note = kwargs.get("note")

    # convert path to corpus object
    if type(corpus) == str:
        from corpkit.corpus import Corpus
        corpus = Corpus(corpus)

    # figure out how the user has entered the query and normalise
    from corpkit.process import searchfixer
    search, search_iterable = searchfixer(search, query)

    # for better printing of query, esp during multiprocess
    # can remove if multiprocess printing improved
    if len(search.keys()) == 1:
        query = search.values()[0]

    if "l" in show and search.get("t"):
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr = WordNetLemmatizer()

    if type(show) == str:
        show = [show]

    def is_multiquery(corpus, search, query, just_speakers):
        """determine if multiprocessing is needed
        do some retyping if need be as well"""
        from collections import OrderedDict
        im = False
        if hasattr(corpus, "__iter__"):
            im = True
        # so we can do search = 't', query = ['NP', 'VP']:
        if type(query) == list:
            if query != search.values()[0] or len(search.keys()) > 1:
                query = {c.title(): c for c in query}
        if type(query) == dict or type(query) == OrderedDict:
            im = True
        if just_speakers:
            if just_speakers == "each":
                im = True
                just_speakers = ["each"]
            if just_speakers == ["each"]:
                im = True
            if type(just_speakers) == str:
                im = False
                just_speakers = [just_speakers]
            if type(just_speakers) == list:
                if len(just_speakers) > 1:
                    im = True
        if type(search) == dict:
            if all(type(i) == dict for i in search.values()):
                im = True
        return im, corpus, search, query, just_speakers

    def slow_tregex(sents, **dummy_args):
        """do the speaker-specific version of tregex queries"""
        # first, put the relevant trees into temp file
        if kwargs.get("outname"):
            to_open = "tmp-%s.txt" % kwargs["outname"]
        else:
            to_open = "tmp.txt"
        to_write = "\n".join([sent._parse_string.strip() for sent in sents if sent.parse_string is not None])
        # keep the encoded text: the result used to be thrown away
        to_write = to_write.encode("utf-8", errors="ignore")
        with open(to_open, "w") as fo:
            fo.write(to_write)
        try:
            q = search.values()[0]
            res = tregex_engine(
                query=q, options=["-o", "-%s" % translated_option], corpus=to_open, root=root, preserve_case=True
            )
            if root:
                root.update()
        finally:
            # never leave the temp file behind, even if the engine fails
            os.remove(to_open)
        if countmode:
            return len(res)
        else:
            return res

    def get_stats(sents, **dummy_args):
        """get a bunch of frequencies on interpersonal phenomena"""
        global numdone
        statsmode_results = Counter()
        # first, put the relevant trees into temp file
        if kwargs.get("outname"):
            to_open = "tmp-%s.txt" % kwargs["outname"]
        else:
            to_open = "tmp.txt"
        try:
            with open(to_open, "w") as fo:
                for sent in sents:
                    statsmode_results["Sentences"] += 1
                    sts = sent.parse_string.rstrip()
                    encd = sts.encode("utf-8", errors="ignore") + "\n"
                    fo.write(encd)
                    deps = get_deps(sent, dep_type)
                    numpass = len([x for x in deps.links if x.type.endswith("pass")])
                    statsmode_results["Passives"] += numpass
                    statsmode_results["Tokens"] += len(sent.tokens)
                    words = [w.word for w in sent.tokens if w.word.isalnum()]
                    statsmode_results["Words"] += len(words)
                    statsmode_results["Characters"] += len("".join(words))

            # count moods via trees          (/\?/ !< __)
            from corpkit.dictionaries.process_types import processes
            tregex_qs = {
                "Imperative": r"ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/",
                "Open interrogative": r"ROOT < SBARQ <<- (/\?/ !< __)",
                "Closed interrogative": r"ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))",
                "Unmodalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))",
                "Modalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))",
                "Open class words": r"/^(NN|JJ|VB|RB)/ < __",
                "Closed class words": r"__ !< __ !> /^(NN|JJ|VB|RB)/",
                "Clauses": r"/^S/ < __",
                "Interrogative": r"ROOT << (/\?/ !< __)",
                "Mental processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.mental, boundaries="w"),
                "Verbal processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.verbal, boundaries="w"),
                "Relational processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.relational, boundaries="w"),
            }

            for name, q in sorted(tregex_qs.items()):
                res = tregex_engine(query=q, options=["-o", "-C"], corpus=to_open, root=root)
                statsmode_results[name] += int(res)
                numdone += 1
                if root:
                    root.update()
                else:
                    tot_string = str(numdone + 1) + "/" + str(total_files)
                    if kwargs.get("outname"):
                        tot_string = "%s: %s" % (kwargs["outname"], tot_string)
                    animator(p, numdone, tot_string, **par_args)
                if kwargs.get("note", False):
                    kwargs["note"].progvar.set((numdone * 100.0 / total_files / denom) + startnum)
        finally:
            # never leave the temp file behind, even if a query fails
            if os.path.isfile(to_open):
                os.remove(to_open)
        return statsmode_results

    def make_conc_lines_from_whole_mid(wholes, middle_column_result, speakr=False):
        """split whole matching lines into [file, speaker, left, match, right]

        wholes is a sequence of (filename, whole line) pairs and
        middle_column_result the matching text for each of them"""
        if speakr is False:
            speakr = ""
        conc_lines = []
        # remove duplicates from results
        unique_pairs = []
        seen = set()
        for (f, whole), mid in zip(wholes, middle_column_result):
            key = "-join-".join([f, whole, mid])
            if key not in seen:
                seen.add(key)
                unique_pairs.append((f, whole, mid))

        # split into start, middle and end, dealing with multiple occurrences
        for f, whole, mid in unique_pairs:
            reg = re.compile(
                r"([^a-zA-Z0-9-]|^)(" + re.escape(mid) + r")([^a-zA-Z0-9-]|$)", re.IGNORECASE | re.UNICODE
            )
            for m in re.finditer(reg, whole):
                offstart, offend = m.start(), m.end()
                start = whole[0:offstart].strip()
                middle = whole[offstart:offend].strip()
                end = whole[offend:].strip()
                conc_lines.append([os.path.basename(f), speakr, start, middle, end])
        return conc_lines

    def uniquify(conc_lines):
        """drop concordance lines that repeat speaker, left, match and right"""
        unique_lines = []
        seen = set()
        for line in conc_lines:
            f, speakr, start, middle, end = line
            joined = " ".join([speakr, start, "MIDDLEHERE:", middle, ":MIDDLEHERE", end])
            if joined not in seen:
                seen.add(joined)
                unique_lines.append(line)
        return unique_lines

    def lemmatiser(list_of_words, tag):
        """take a list of unicode words and a tag and return a lemmatised list."""
        output = []
        for word in list_of_words:
            if translated_option.startswith("u"):
                # results are tags here: collapse them with the tag lemma table
                if word.lower() in taglemma:
                    word = taglemma[word.lower()]
                elif word == "x":
                    word = "Other"
            # only use wordnet lemmatiser when appropriate
            else:
                if word in wordlist:
                    word = wordlist[word]
                word = lmtzr.lemmatize(word, tag)
            output.append(word)
        return output

    def gettag(query, lemmatag=False):
        """
        Find tag for WordNet lemmatisation

        An explicit lemmatag wins; otherwise the first label in the tregex
        query is inspected, falling back to "n" (the wordnet default)
        """
        if lemmatag:
            return lemmatag
        # first letter of a treebank label -> wordnet pos. the old table had
        # "A" twice (adjective lost to adverb) and was looked up with the
        # whole label, so anything longer than one letter came back as "n"
        tagdict = {"N": "n", "J": "a", "V": "v", "R": "r", "A": "r"}
        tag = "n"  # same default as wordnet
        # attempt to find tag from tregex query
        tagfinder = re.compile(r"^[^A-Za-z]*([A-Za-z]*)")
        tagchecker = re.compile(r"^[A-Z]{1,4}$")
        qr = query.replace(r"\w", "").replace(r"\s", "").replace(r"\b", "")
        treebank_tag = re.findall(tagfinder, qr)
        if treebank_tag and re.match(tagchecker, treebank_tag[0]):
            label = treebank_tag[0]
            if label.startswith("ADJ"):
                tag = "a"
            else:
                tag = tagdict.get(label[0], "n")
        return tag

    def format_tregex(results):
        """format tregex by show list"""
        if countmode:
            return results
        done = []
        if "l" in show or "pl" in show:
            lemmata = lemmatiser(results, gettag(search.get("t"), lemmatag))
        else:
            lemmata = [None for i in results]
        for word, lemma in zip(results, lemmata):
            if exclude:
                # which of the user's exclude patterns hit this result. keys
                # the user did not give, and lemmata we did not compute, are
                # skipped rather than passed to re.search as None
                targets = (("w", word), ("l", lemma), ("p", word), ("pl", lemma))
                hits = [
                    key for key, target in targets
                    if exclude.get(key) and target is not None and re.search(exclude.get(key), target)
                ]
                if len(exclude.keys()) == 1 or excludemode == "any":
                    if hits:
                        continue
                if excludemode == "all" and len(hits) == len(exclude.keys()):
                    continue

            bits = []
            for i in show:
                if i in ("t", "w", "p"):
                    bits.append(word)
                elif i in ("l", "pl"):
                    bits.append(lemma)
            joined = "/".join(bits)
            done.append(joined)
        return done

    def tok_by_list(pattern, list_of_toks, concordancing=False, **kwargs):
        """search tokenised corpora for tokens found in a wordlist"""
        if type(pattern) == str:
            pattern = [pattern]
        if not case_sensitive:
            pattern = [p.lower() for p in pattern]
        wanted = set(pattern)

        def is_wanted(token):
            # concordancing used to compare the raw token with the lowercased
            # wordlist, so case-insensitive searches missed capitalised tokens
            if case_sensitive:
                return token in wanted
            return token.lower() in wanted

        if not concordancing:
            matches = [m for m in list_of_toks if is_wanted(m)]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if is_wanted(token):
                    match = [" ".join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(token)
                    match.append(" ".join([t for t in unsplitter(list_of_toks[index + 1 :])])[:140])
                    matches.append(match)
        if countmode:
            return len(matches)
        else:
            return matches

    def unsplitter(lst):
        """unsplit contractions and apostophes from tokenised text"""
        if split_contractions:
            return lst
        unsplit = []
        last = len(lst) - 1
        for index, t in enumerate(lst):
            if index == 0 or index == last:
                unsplit.append(t)
                continue
            if "'" in t and not t.endswith("'"):
                # glue the contraction onto the token before it
                unsplit.append("".join([lst[index - 1], t]))
            elif "'" not in lst[index + 1]:
                unsplit.append(t)
        return unsplit

    def tok_ngrams(pattern, list_of_toks, concordancing=False, split_contractions=True):
        """get n-grams of gramsize tokens; only n-grams seen more than once
        are returned, once per occurrence"""
        ngrams = Counter()
        result = []
        list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)]
        if pattern.lower() == "any":
            pattern = r".*"
        if not split_contractions:
            list_of_toks = unsplitter(list_of_toks)
        for index in range(len(list_of_toks) - gramsize + 1):
            the_gram = list_of_toks[index : index + gramsize]
            if not any(re.search(pattern, x) for x in the_gram):
                continue
            ngrams[" ".join(the_gram)] += 1
        # turn counter into list of results
        for k, v in ngrams.items():
            if v > 1:
                result.extend([k] * v)
        if countmode:
            return len(result)
        else:
            return result

    def compiler(pattern):
        """compile regex or fail gracefully

        returns the compiled pattern, or "Bad query" when compilation fails
        under a gui; without a gui a ValueError is raised instead"""
        try:
            if case_sensitive:
                comped = re.compile(pattern)
            else:
                comped = re.compile(pattern, re.IGNORECASE)
            return comped
        except Exception:
            import traceback
            import sys
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lst = traceback.format_exception(exc_type, exc_value, exc_traceback)
            error_message = lst[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print("%s: Query %s" % (thetime, error_message))
            if root:
                return "Bad query"
            else:
                raise ValueError("%s: Query %s" % (thetime, error_message))

    def tok_by_reg(pattern, list_of_toks, concordancing=False, **kwargs):
        """search for regex in tokenised corpora"""
        comped = compiler(pattern)
        if comped == "Bad query":
            return "Bad query"
        if not concordancing:
            matches = [m for m in list_of_toks if re.search(comped, m)]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                found = re.search(comped, token)
                if found:
                    match = [" ".join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(found.group(0))
                    match.append(" ".join([t for t in unsplitter(list_of_toks[index + 1 :])])[:140])
                    matches.append(match)
        if countmode:
            return len(matches)
        else:
            return matches

    def plaintext_regex_search(pattern, plaintext_data, concordancing=False, **kwargs):
        """search for regex in plaintext corpora

        it searches over lines, so the user needs to be careful.
        """
        if concordancing:
            pattern = r"(.{,140})\b(" + pattern + r")\b(.{,140})"
        compiled_pattern = compiler(pattern)
        if compiled_pattern == "Bad query":
            return "Bad query"
        matches = re.findall(compiled_pattern, plaintext_data)
        if concordancing:
            matches = [list(m) for m in matches]
        else:
            # with groups in the pattern findall gives tuples: keep group one
            for index, i in enumerate(matches):
                if type(i) == tuple:
                    matches[index] = i[0]
        if countmode:
            return len(matches)
        else:
            return matches

    def correct_spelling(a_string):
        """convert each slash-separated part of a result to the spelling
        variety asked for, restoring the original casing afterwards"""
        if not spelling:
            return a_string
        from corpkit.dictionaries.word_transforms import usa_convert
        if spelling.lower() == "uk":
            usa_convert = {v: k for k, v in usa_convert.items()}
        bits = a_string.split("/")
        for index, i in enumerate(bits):
            converted = usa_convert.get(i.lower(), i)
            if i.islower() or preserve_case is False:
                converted = converted.lower()
            elif i.isupper() and preserve_case:
                converted = converted.upper()
            elif i.istitle() and preserve_case:
                converted = converted.title()
            bits[index] = converted
        return "/".join(bits)

    def plaintext_simple_search(pattern, plaintext_data, concordancing=False, **kwargs):
        """search for tokens in plaintext corpora"""
        result = []
        if type(pattern) == str:
            pattern = [pattern]
        for p in pattern:
            if concordancing:
                pat = r"(.{0,140})\b(" + re.escape(p) + r")\b(.{0,140})"
            else:
                # there was no pattern at all on this path before, which
                # ended in an unbound name on the first word
                pat = r"\b" + re.escape(p) + r"\b"
            pat = compiler(pat)
            if pat == "Bad query":
                return "Bad query"
            matches = re.findall(pat, plaintext_data)
            if concordancing:
                result.extend(list(m) for m in matches)
            else:
                result.extend([p] * len(matches))
        # same contract as the other searchers
        if countmode:
            return len(result)
        else:
            return result

    # do multiprocessing if need be
    im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers)
    locs["search"] = search
    locs["query"] = query
    locs["just_speakers"] = just_speakers
    locs["corpus"] = corpus
    locs["multiprocess"] = multiprocess
    if im:
        from corpkit.multiprocess import pmultiquery
        return pmultiquery(**locs)

    datatype = corpus.datatype
    singlefile = corpus.singlefile

    # store all results in here
    results = {}
    # check if just counting
    countmode = "c" in show
    # where we are at in interrogation
    current_iter = 0
    # multiprocessing progress bar
    denom = kwargs.get("denominator", 1)
    startnum = kwargs.get("startnum", 0)

    ############################################
    # Determine the search function to be used #
    ############################################

    # simple tregex is tregex over whole dirs
    simple_tregex_mode = False
    statsmode = False
    searcher = None
    optiontext = ""
    if not just_speakers and "t" in search.keys():
        simple_tregex_mode = True
    else:
        if corpus.datatype == "plaintext":
            if search.get("n"):
                raise NotImplementedError("Use a tokenised corpus for n-gramming.")
            if search.get("w"):
                if kwargs.get("regex", True):
                    searcher = plaintext_regex_search
                else:
                    searcher = plaintext_simple_search
                optiontext = "Searching plaintext"

        elif corpus.datatype == "tokens":
            if search.get("n"):
                searcher = tok_ngrams
                optiontext = "n-grams via tokens"
            elif search.get("w"):
                if kwargs.get("regex", True):
                    searcher = tok_by_reg
                else:
                    searcher = tok_by_list
                if type(search.get("w")) == list:
                    searcher = tok_by_list
                optiontext = "Searching tokens"

        only_parse = ["r", "d", "g", "dl", "gl", "df", "gf", "dp", "gp", "f"]
        if corpus.datatype != "parse" and any(i in only_parse for i in search.keys()):
            raise ValueError(
                'Need parsed corpus to search with "%s" option(s).'
                % ", ".join([i for i in search.keys() if i in only_parse])
            )

        elif corpus.datatype == "parse":
            if search.get("t"):
                searcher = slow_tregex
            elif search.get("s"):
                searcher = get_stats
                statsmode = True
                optiontext = "General statistics"
                numdone = 0
            else:
                from corpkit.depsearch import dep_searcher
                searcher = dep_searcher
                optiontext = "Dependency querying"

        # fail here with a readable message rather than with an unbound
        # name further down
        if searcher is None:
            raise ValueError(
                'No search method for "%s" corpus with search option(s): %s.'
                % (corpus.datatype, ", ".join(search.keys()))
            )

    ############################################
    #      Set some Tregex-related values      #
    ############################################

    if search.get("t"):
        optiontext = "Searching parse trees"
        # queries built for lists of words and for 'any', either returning
        # the node above the word or the word itself
        node_list, node_any = r"__ < (/%s/ !< __)", r"__ < (/.?[A-Za-z0-9].?/ !< __)"
        leaf_list, leaf_any = r"/%s/ !< __", r"/.?[A-Za-z0-9].?/ !< __"
        # checked in this order; the first show value present decides
        tregex_modes = (
            (("p", "pl"), "u", node_list, node_any),
            (("t",), "o", node_list, node_any),
            (("w",), "t", leaf_list, leaf_any),
            (("c",), "C", leaf_list, leaf_any),
            (("l",), "t", leaf_list, leaf_any),
        )
        for show_keys, option, list_template, any_query in tregex_modes:
            if any(key in show for key in show_keys):
                translated_option = option
                if type(search["t"]) == list:
                    search["t"] = list_template % as_regex(
                        search["t"], boundaries="line", case_sensitive=case_sensitive
                    )
                if search["t"] == "any":
                    search["t"] = any_query
                break
        query = search["t"]

        # check the query that will actually be run. the result of the check
        # used to be ignored: the test was made on `query`, never False here
        # NOTE(review): assumes tregex_engine(check_query=True) returns False
        # for a bad query -- confirm against corpkit.process
        q = tregex_engine(corpus=False, query=query, options=["-t"], check_query=True, root=root)
        if q is False:
            if root:
                return "Bad query"
            else:
                return

    ############################################
    # Make iterable for corpus/subcorpus/file  #
    ############################################

    if corpus.singlefile:
        to_iterate_over = {(corpus.name, corpus.path): [corpus]}
    elif not corpus.subcorpora:
        to_iterate_over = {(corpus.name, corpus.path): corpus.files}
    else:
        to_iterate_over = {}
        for k, v in sorted(corpus.structure.items()):
            to_iterate_over[(k.name, k.path)] = v
    if files_as_subcorpora:
        to_iterate_over = {}
        for f in corpus.files:
            to_iterate_over[(f.name, f.path)] = [f]

    ############################################
    #           Print welcome message          #
    ############################################

    if conc:
        message = "Concordancing"
    else:
        message = "Interrogating"
    if kwargs.get("printstatus", True):
        thetime = strftime("%H:%M:%S", localtime())
        sformat = "\n ".join(["%s: %s" % (k.rjust(3), v) for k, v in search.items()])
        if search == {"s": r".*"}:
            sformat = "features"
        welcome = "\n%s: %s %s ...\n %s\n Query: %s\n" % (
            thetime,
            message,
            corpus.name,
            optiontext,
            sformat,
        )
        print(welcome)

    ############################################
    #           Make progress bar              #
    ############################################

    if simple_tregex_mode:
        total_files = len(to_iterate_over.keys())
    else:
        total_files = sum([len(x) for x in to_iterate_over.values()])
        if search.get("s"):
            # get_stats ticks once for each of its 12 tregex queries
            total_files = total_files * 12

    par_args = {"printstatus": kwargs.get("printstatus", True), "root": root, "note": note, "length": total_files}

    term = None
    if kwargs.get("paralleling", None) is not None:
        from blessings import Terminal
        term = Terminal()
        par_args["terminal"] = term
        par_args["linenum"] = kwargs.get("paralleling")

    outn = kwargs.get("outname", "")
    if outn:
        outn = outn + ": "
    tstr = "%s%d/%d" % (outn, current_iter, total_files)
    p = animator(None, None, init=True, tot_string=tstr, **par_args)
    tstr = "%s%d/%d" % (outn, current_iter + 1, total_files)
    animator(p, current_iter, tstr, **par_args)

    ############################################
    # Iterate over data, doing interrogations  #
    ############################################

    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):
        if countmode or conc:
            results[subcorpus_name] = []
        else:
            results[subcorpus_name] = Counter()

        # tregex over subcorpora, not files
        if simple_tregex_mode:
            op = ["-o", "-" + translated_option]
            result = tregex_engine(
                query=search["t"], options=op, corpus=subcorpus_path, root=root, preserve_case=preserve_case
            )
            if countmode:
                results[subcorpus_name].append(result)
                continue

            # keep this a list until the very end: concordancing pairs it
            # positionally with the whole lines, which a Counter cannot do
            result = format_tregex(result)

            if conc:
                op.append("-w")
                whole_result = tregex_engine(
                    query=search["t"], options=op, corpus=subcorpus_path, root=root, preserve_case=preserve_case
                )
                if not only_format_match:
                    whole_result = format_tregex(whole_result)
                result = make_conc_lines_from_whole_mid(whole_result, result, speakr=False)
                if spelling:
                    result = [[correct_spelling(b) for b in line] for line in result]
            else:
                if spelling:
                    result = [correct_spelling(r) for r in result]
                result = Counter(result)

            results[subcorpus_name] += result

            current_iter += 1
            if kwargs.get("paralleling", None) is not None:
                tstr = "%s%d/%d" % (outn, current_iter + 2, total_files)
            else:
                tstr = "%s%d/%d" % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)

        # dependencies, plaintext, tokens or slow_tregex
        else:
            for f in files:

                if corpus.datatype == "parse":
                    with open(f.path, "r") as fo:
                        data = fo.read()
                    from corenlp_xml.document import Document
                    try:
                        corenlp_xml = Document(data)
                    except Exception:
                        print("Could not read file: %s" % f.path)
                        continue
                    if just_speakers:
                        sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers]
                        if not sents:
                            continue
                    else:
                        sents = corenlp_xml.sentences

                    res = searcher(
                        sents,
                        search=search,
                        show=show,
                        dep_type=dep_type,
                        exclude=exclude,
                        excludemode=excludemode,
                        searchmode=searchmode,
                        lemmatise=False,
                        case_sensitive=case_sensitive,
                        concordancing=conc,
                        only_format_match=only_format_match,
                    )

                    if res == "Bad query":
                        return "Bad query"

                    if searcher == slow_tregex and not countmode:
                        res = format_tregex(res)

                elif corpus.datatype == "tokens":
                    import pickle
                    # NOTE: unpickling runs whatever the file says; only
                    # interrogate tokenised corpora you built yourself
                    with open(f.path, "rb") as fo:
                        data = pickle.load(fo)
                    res = searcher(search.values()[0], data, split_contractions=split_contractions, concordancing=conc)
                    if res == "Bad query":
                        return "Bad query"
                    if conc and not countmode:
                        # no speaker information in tokenised corpora
                        for line in res:
                            line.insert(0, "")

                elif corpus.datatype == "plaintext":
                    with open(f.path, "rb") as fo:
                        data = fo.read()
                    data = unicode(data, errors="ignore")
                    res = searcher(search.values()[0], data, concordancing=conc)
                    if res == "Bad query":
                        return "Bad query"
                    if conc and not countmode:
                        # no speaker information in plaintext corpora
                        for line in res:
                            line.insert(0, "")

                if countmode:
                    # the searchers defined above hand back a bare number,
                    # which cannot be added to a list with +=
                    if isinstance(res, int):
                        results[subcorpus_name].append(res)
                    else:
                        results[subcorpus_name] += res
                    continue

                # add filename and do lowercasing for conc
                if conc:
                    for line in res:
                        line.insert(0, f.name)
                        if not preserve_case:
                            line = [b.lower() for b in line]
                        if spelling:
                            line = [correct_spelling(b) for b in line]
                        results[subcorpus_name] += [line]

                # the stats searcher already returns counts by name: going
                # through a list of its keys would lose every count
                elif statsmode:
                    results[subcorpus_name] += Counter(res)

                # do lowercasing and spelling
                else:
                    if not preserve_case:
                        res = [r.lower() for r in res]
                    if spelling:
                        res = [correct_spelling(r) for r in res]
                    results[subcorpus_name] += Counter(res)

                if not statsmode:
                    current_iter += 1
                    if kwargs.get("paralleling", None) is not None:
                        tstr = "%s%d/%d" % (outn, current_iter + 2, total_files)
                    else:
                        tstr = "%s%d/%d" % (outn, current_iter + 1, total_files)
                    # the string was built but never shown before
                    animator(p, current_iter, tstr, **par_args)

            # delete temp file if there
            if os.path.isfile("tmp.txt"):
                os.remove("tmp.txt")

    ############################################
    #     Get concordances into DataFrame      #
    ############################################

    if conc:
        all_conc_lines = []
        pindex = "c f s l m r".encode("utf-8").split()
        for sc_name, resu in sorted(results.items()):
            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu

            # make into series
            for fname, spkr, start, word, end in unique_results:
                spkr = unicode(spkr, errors="ignore")
                fname = os.path.basename(fname)

                # the use of ascii here makes sure the string formats ok, but will also screw over
                # anyone doing non-english work. so, change to utf-8, then fix errors as they come
                # in the corpkit-gui "add_conc_lines_to_window" function
                all_conc_lines.append(
                    Series(
                        [
                            sc_name.encode("ascii", errors="ignore"),
                            fname.encode("ascii", errors="ignore"),
                            spkr.encode("ascii", errors="ignore"),
                            start.encode("ascii", errors="ignore"),
                            word.encode("ascii", errors="ignore"),
                            end.encode("ascii", errors="ignore"),
                        ],
                        index=pindex,
                    )
                )

        # randomise results...
        if random:
            from random import shuffle
            shuffle(all_conc_lines)

        # not doing anything yet --- this is for multimodal concordancing
        add_links = False
        if not add_links:
            conc_columns = ["c", "f", "s", "l", "m", "r"]
        else:
            conc_columns = ["c", "f", "s", "l", "m", "r", "link"]

        if all_conc_lines:
            df = pd.concat(all_conc_lines, axis=1).T
            df.columns = conc_columns
        else:
            # concat refuses an empty list: no matches is an empty table
            df = DataFrame(columns=conc_columns)

        if all(x == "" for x in list(df["s"].values)):
            df.drop("s", axis=1, inplace=True)

        if kwargs.get("note"):
            kwargs["note"].progvar.set(100)

        if kwargs.get("printstatus", True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = "\n\n%s: Concordancing finished! %d matches.\n" % (thetime, len(df.index))
            print(finalstring)

        from corpkit.interrogation import Concordance
        output = Concordance(df)
        output.query = locs
        if quicksave:
            # this used to call save() on a name that does not exist here
            output.save()
        return output

    ############################################
    #     Get interrogation into DataFrame     #
    ############################################

    else:
        if countmode:
            df = Series({k: sum(v) for k, v in sorted(results.items())})
            tot = df.sum()
        else:
            # one row per subcorpus, in the same order as the index. the
            # counters themselves used to be sorted, which need not agree
            # with the sorted subcorpus names
            subcorpus_names = sorted(results.keys())
            unique_results = set(item for name in subcorpus_names for item in results[name])
            the_big_dict = {}
            for word in unique_results:
                the_big_dict[word] = [results[name][word] for name in subcorpus_names]
            # turn master dict into dataframe, sorted
            df = DataFrame(the_big_dict, index=subcorpus_names)

            numentries = len(df.columns)
            tot = df.sum(axis=1)
            total_total = df.sum().sum()

        ############################################
        # Format, output as Interrogation object   #
        ############################################

        # a corpus without subcorpora gives a Series rather than a
        # one-row DataFrame, unless the caller asked otherwise
        if not countmode:
            if not corpus.subcorpora or singlefile:
                if not files_as_subcorpora:
                    if not kwargs.get("df1_always_df"):
                        df = Series(df.ix[0])
                        df.sort(ascending=False)
                        tot = df.sum()
                        numentries = len(df.index)
                        total_total = tot

        # sort by total
        if type(df) == pd.core.frame.DataFrame:
            if not df.empty:
                df.ix["Total-tmp"] = df.sum()
                the_tot = df.ix["Total-tmp"]
                df = df[the_tot.argsort()[::-1]]
                df = df.drop("Total-tmp", axis=0)

        # format final string
        if kwargs.get("printstatus", True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = "\n\n%s: Interrogation finished!" % thetime
            if countmode:
                finalstring += " %d matches." % tot
            else:
                finalstring += " %d unique results, %d total occurrences." % (numentries, total_total)
            print(finalstring)

        interro = Interrogation(results=df, totals=tot, query=locs)

        if quicksave:
            interro.save()

        return interro
def pmultiquery(corpus,
                search,
                show='words',
                query='any',
                sort_by='total',
                save=False,
                multiprocess='default',
                root=False,
                note=False,
                print_info=True,
                subcorpora=False,
                **kwargs):
    """
    Run one interrogation per corpus, query or subcorpus and merge the results.

    This function is used by corpkit.interrogator.interrogator() for
    multiprocessing. There's no reason to call this function yourself.

    :param corpus: the corpus to search; iterated over when ``multiple`` is
                   'multiplecorpora' or 'datalist'
    :param search: forwarded to interrogator(); iterated over (as a dict of
                   named searches) when ``multiple`` is 'namedqueriesmultiple'
    :param show: forwarded to interrogator(); also checked for 'c' when the
                 corpora are all subcorpora
    :param query: forwarded to interrogator(); iterated over (as a dict of
                  named queries) when ``multiple`` is 'namedqueriessingle'
    :param sort_by: not used here directly; forwarded to interrogator()
    :param save: name under which to save the merged result; ``True`` is
                 rejected because a name is required
    :param multiprocess: an integer sets the number of parallel jobs; a falsy
                         value runs the queries one after another
    :param root: when truthy, queries are run sequentially and no per-query
                 terminal status lines are drawn
    :param note: not used here directly; forwarded to interrogator()
    :param print_info: print progress and summary messages
    :param subcorpora: ``True``, a metadata field name, or a list of field
                       names (the first is iterated over here, the rest are
                       forwarded to interrogator())
    :param kwargs: forwarded to interrogator(). Keys read here: 'multiple'
                   (what to iterate over), 'conc', 'maxconc',
                   'just_metadata', 'nosubmode' and 'df1_always_df'

    :returns: a Concordance when ``conc`` is 'only'; an Interrodict when the
              individual results are not all Series; otherwise an
              Interrogation. ``None`` when the requested metadata field has
              no values or the results cannot be concatenated.
    :raises ValueError: if ``save`` is ``True``
    """
    # NOTE: every name bound before the locals() snapshot below ends up in
    # ``locs`` and, if it survives the canpickle() filter, is forwarded to
    # interrogator() as a keyword argument. The imports that look unused are
    # therefore kept exactly as they were.
    import os
    from pandas import DataFrame, Series
    import pandas as pd
    import collections
    from collections import namedtuple, OrderedDict
    from time import strftime, localtime
    import corpkit
    from corpkit.interrogator import interrogator
    from corpkit.interrogation import Interrogation, Interrodict
    from corpkit.process import canpickle
    try:
        from joblib import Parallel, delayed
    except ImportError:
        # joblib is optional: remember that it is missing so that we can fall
        # back to sequential processing instead of hitting a NameError later
        Parallel = delayed = None
    import multiprocessing

    # copy, as interrogator() does, so that the snapshot is independent of
    # the frame's own locals dict
    locs = locals().copy()
    locs.update(kwargs)

    def best_num_parallel(num_cores, num_queries):
        """Decide how many parallel processes to run.

        The idea, more or less, is to balance the load when possible."""
        import math
        if num_queries <= num_cores:
            return num_queries
        if (num_queries / num_cores) == num_cores:
            return int(num_cores)
        if num_queries % num_cores == 0:
            try:
                return max([int(num_queries / n) for n in range(2, num_cores)
                            if int(num_queries / n) <= num_cores])
            except ValueError:
                # max() of an empty list: no divisor gave a usable job count
                return num_cores
        else:
            if (float(math.sqrt(num_queries))).is_integer():
                square_root = math.sqrt(num_queries)
                if square_root <= num_queries / num_cores:
                    return int(square_root)
        return num_cores

    num_cores = multiprocessing.cpu_count()

    # what is our iterable? ...
    multiple = kwargs.get('multiple', False)
    mult_corp_are_subs = False
    if hasattr(corpus, '__iter__'):
        if all(getattr(x, 'level', False) == 's' for x in corpus):
            mult_corp_are_subs = True

    non_first_sub = None
    if subcorpora:
        non_first_sub = subcorpora[1:] if isinstance(subcorpora, list) else None
        subval = subcorpora if not non_first_sub else subcorpora[0]
        if subcorpora is True:
            import re
            subcorpora = re.compile(r'.*')
        else:
            # strange travis error happened here
            subcorpora = corpus.metadata['fields'][subval]
            if len(subcorpora) == 0:
                print('No %s metadata found.' % str(subval))
                return

    # for each kind of multiple query: the thing to iterate over, and the
    # name of the interrogator() argument that each item replaces
    mapcores = {'datalist': [corpus, 'corpus'],
                'multiplecorpora': [corpus, 'corpus'],
                'namedqueriessingle': [query, 'query'],
                'namedqueriesmultiple': [search, 'search'],
                'subcorpora': [subcorpora, 'subcorpora']}

    # NOTE: for any other value of ``multiple`` toiter is False, and the
    # len() call below raises TypeError
    toiter, itsname = mapcores.get(multiple, [False, False])
    if isinstance(toiter, dict):
        toiter = toiter.items()
    denom = len(toiter)
    num_cores = best_num_parallel(num_cores, denom)

    # todo: code below makes no sense
    # NOTE: vals holds only strings, so ``x is True`` never matches and this
    # branch never runs as written
    vals = ['eachspeaker', 'multiplespeaker', 'namedqueriesmultiple']
    if multiple == 'multiplecorpora' and any(x is True for x in vals):
        from corpkit.corpus import Corpus, Corpora
        if isinstance(corpus, Corpora):
            multiprocess = False
        else:
            corpus = Corpus(corpus)

    # bool is a subclass of int, so exclude it explicitly: multiprocess=True
    # keeps the balanced job count instead of becoming n_jobs=True (one job)
    if isinstance(multiprocess, int) and not isinstance(multiprocess, bool):
        num_cores = multiprocess
    if multiprocess is False:
        num_cores = 1

    # make sure saves are right type
    if save is True:
        raise ValueError('save must be string when multiprocessing.')

    # make a list of dicts to pass to interrogator,
    # with the iterable unique in every one
    locs['printstatus'] = False
    locs['multiprocess'] = False
    locs['df1_always_df'] = False
    locs['files_as_subcorpora'] = False
    locs['corpus'] = corpus

    if multiple == 'multiplespeaker':
        locs['multispeaker'] = True

    if isinstance(non_first_sub, list) and len(non_first_sub) == 1:
        non_first_sub = non_first_sub[0]

    # make the default query
    locs = {k: v for k, v in locs.items() if canpickle(v)}

    # make a new dict for every iteration
    ds = [dict(**locs) for _ in range(denom)]
    for index, (d, bit) in enumerate(zip(ds, toiter)):
        d['paralleling'] = index
        if multiple in ['namedqueriessingle', 'namedqueriesmultiple']:
            # bit is a (name, query) pair from the dict of named queries
            d[itsname] = bit[1]
            d['outname'] = bit[0]
        elif multiple in ['multiplecorpora', 'datalist']:
            d['outname'] = bit.name.replace('-parsed', '')
            d[itsname] = bit
        elif multiple in ['subcorpora']:
            d[itsname] = bit
            # restrict this run to the current metadata value; any
            # just_metadata given by the caller is merged on top
            jmd = {subval: bit}
            # put this earlier
            j2 = kwargs.get('just_metadata', False)
            if not j2:
                j2 = {}
            jmd.update(j2)
            d['just_metadata'] = jmd
            d['outname'] = bit
            d['by_metadata'] = False
            d['subcorpora'] = non_first_sub
            if non_first_sub:
                d['print_info'] = False

    # message printer should be a function...
    # an absent, None or unrecognised ``conc`` means plain interrogation, so
    # ``message`` is always bound
    conc = kwargs.get('conc', False)
    if conc is True:
        message = 'Interrogating and concordancing'
    elif hasattr(conc, 'lower') and conc.lower() == 'only':
        message = 'Concordancing'
    else:
        message = 'Interrogating'

    starttime = strftime("%H:%M:%S", localtime())
    from corpkit.process import dictformat

    if print_info:
        # proper printing for plurals
        # in truth this needs to be revised, it's horrible.
        sformat = dictformat(search, query)

        if num_cores == 1:
            add_es = ''
        else:
            add_es = 'es'

        if multiple in ['multiplecorpora', 'datalist']:
            corplist = "\n ".join([i.name for i in list(corpus)[:20]])
            if len(corpus) > 20:
                corplist += '\n ... and %d more ...\n' % (len(corpus) - 20)
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s):\n %s"
                   "\n Query: %s\n %s corpus ... \n" % (starttime, len(corpus), num_cores,
                                                        add_es, corplist, sformat, message)))
        elif multiple == 'namedqueriessingle':
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s"
                   "\n Queries: %s\n %s corpus ... \n" % (starttime, len(query), num_cores,
                                                          add_es, corpus.name, sformat, message)))
        elif multiple == 'namedqueriesmultiple':
            print(("\n%s: Beginning %d corpus interrogations (in %d parallel process%s): %s"
                   "\n Queries: %s\n %s corpus ... \n" % (starttime, len(list(search.keys())),
                                                          num_cores, add_es, corpus.name,
                                                          sformat, message)))
        elif multiple in ['eachspeaker', 'multiplespeaker']:
            print(("\n%s: Beginning %d parallel corpus interrogation%s: %s"
                   "\n Query: %s\n %s corpus ... \n" % (starttime, num_cores,
                                                        add_es.lstrip('e'), corpus.name,
                                                        sformat, message)))
        elif multiple in ['subcorpora']:
            print(("\n%s: Beginning %d parallel corpus interrogation%s: %s"
                   "\n Query: %s\n %s corpus ... \n" % (starttime, num_cores,
                                                        add_es.lstrip('e'), corpus.name,
                                                        sformat, message)))

    # run in parallel, get either a list of tuples (non-c option)
    # or a dataframe (c option)
    terminal = False
    #todo: the number of blank lines to print can be way wrong
    if not root and print_info:
        from blessings import Terminal
        terminal = Terminal()
        print('\n' * (len(ds) - 2))
        for dobj in ds:
            linenum = dobj['paralleling']
            # this try handles nosetest problems in sublime text; drawing the
            # status line is best-effort, so failures are ignored
            try:
                with terminal.location(0, terminal.height - (linenum + 1)):
                    # this is a really bad idea.
                    thetime = strftime("%H:%M:%S", localtime())
                    print('%s: QUEUED: %s' % (thetime, dobj['outname']))
            except Exception:
                pass

    run_parallel = bool(not root and multiprocess)
    if run_parallel and Parallel is None:
        # joblib could not be imported: do the same work one query at a time
        print('joblib is not installed; running queries sequentially.')
        run_parallel = False

    if run_parallel:
        try:
            res = Parallel(n_jobs=num_cores)(delayed(interrogator)(**x) for x in ds)
        except Exception:
            print('Multiprocessing failed.')
            raise
    else:
        res = []
        for index, d in enumerate(ds):
            d['startnum'] = (100 / denom) * index
            res.append(interrogator(**d))
        # sorting is best-effort: results that cannot be truth-tested or
        # ordered are left in query order
        try:
            res = sorted([i for i in res if i])
        except Exception:
            pass

    # remove unpicklable bits from query
    from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType
    badtypes = (ModuleType, FunctionType, BuiltinFunctionType, BuiltinMethodType)
    qlocs = {k: v for k, v in locs.items() if not isinstance(v, badtypes)}

    # store path(s) rather than corpus objects in the query
    if hasattr(qlocs.get('corpus', False), 'name'):
        qlocs['corpus'] = qlocs['corpus'].path
    else:
        qlocs['corpus'] = [i.path for i in qlocs.get('corpus', [])]

    # return just a concordance
    from corpkit.interrogation import Concordance
    if conc == 'only':
        concs = pd.concat([x for x in res])
        thetime = strftime("%H:%M:%S", localtime())
        concs = concs.reset_index(drop=True)
        if kwargs.get('maxconc'):
            concs = concs[:kwargs.get('maxconc')]
        lines = Concordance(concs)

        if save:
            lines.save(save, print_info=print_info)

        if print_info:
            # format() returns a string, so it must be inserted with %s
            print('\n\n%s: Finished! %s results.\n\n' % (thetime, format(len(concs.index), ',')))

        return lines

    # return interrodict (to become multiindex)
    if isinstance(res[0], Interrodict) or not all(isinstance(i.results, Series) for i in res):
        out = OrderedDict()
        for interrog, d in zip(res, ds):
            for unpicklable in ['note', 'root']:
                interrog.query.pop(unpicklable, None)
            try:
                out[interrog.query['outname']] = interrog
            except KeyError:
                out[d['outname']] = interrog

        idict = Interrodict(out)

        if print_info:
            thetime = strftime("%H:%M:%S", localtime())
            print("\n\n%s: Finished! Output is multi-indexed." % thetime)
        idict.query = qlocs

        if save:
            idict.save(save, print_info=print_info)

        return idict

    # make query and total branch, save, return
    # todo: standardise this so we don't have to guess transposes
    if multiple == 'multiplecorpora' and not mult_corp_are_subs:
        sers = [i.results for i in res]
        out = DataFrame(sers, index=[i.query['outname'] for i in res])
        # reindex(columns=...) is the supported spelling of the removed
        # reindex_axis(..., axis=1)
        out = out.reindex(columns=sorted(out.columns))  # sort cols
        out = out.fillna(0)  # nan to zero
        out = out.astype(int)  # float to int
        out = out.T
    else:
        # make a series from counts
        if all(len(i.results) == 1 for i in res):
            out = pd.concat([r.results for r in res])
            out = out.sort_index()
        else:
            try:
                out = pd.concat([r.results for r in res], axis=1)
                out = out.T
                out.index = [i.query['outname'] for i in res]
            except ValueError:
                return None
            # format like normal
            # this sorts subcorpora, which are cls
            out = out[sorted(list(out.columns))]
            # puts subcorpora in the right place
            if not mult_corp_are_subs and multiple != 'subcorpora':
                out = out.T
            if multiple == 'subcorpora':
                out = out.sort_index()
            out = out.fillna(0)  # nan to zero
            out = out.astype(int)
            if 'c' in show and mult_corp_are_subs:
                out = out.sum()
                out.index = sorted(list(out.index))

    # sort by total
    if isinstance(out, DataFrame):
        out = out[list(out.sum().sort_values(ascending=False).index)]

        # really need to figure out the deal with tranposing!
        if all(x.endswith('.xml') for x in list(out.columns)) \
                or all(x.endswith('.txt') for x in list(out.columns)) \
                or all(x.endswith('.conll') for x in list(out.columns)):
            out = out.T

    if kwargs.get('nosubmode'):
        out = out.sum()

    tt = out.sum(axis=1) if isinstance(out, DataFrame) else out.sum()
    out = Interrogation(results=out, totals=tt, query=qlocs)

    if hasattr(out, 'columns') and len(out.columns) == 1:
        out = out.sort_index()

    if conc is True:
        try:
            concs = pd.concat([x.concordance for x in res], ignore_index=True)
            concs = concs.sort_values(by='c')
            concs = concs.reset_index(drop=True)
            if kwargs.get('maxconc'):
                concs = concs[:kwargs.get('maxconc')]
            out.concordance = Concordance(concs)
        except ValueError:
            out.concordance = None

    thetime = strftime("%H:%M:%S", localtime())
    if terminal:
        print(terminal.move(terminal.height-1, 0))
    if print_info:
        if terminal:
            print(terminal.move(terminal.height-1, 0))
        if hasattr(out.results, 'columns'):
            print('%s: Interrogation finished! %s unique results, %s total.' % (
                thetime,
                format(len(out.results.columns), ','),
                format(out.totals.sum(), ',')))
        else:
            print('%s: Interrogation finished! %s matches.' % (thetime, format(tt, ',')))
    if save:
        out.save(save, print_info=print_info)

    if list(out.results.index) == ['0'] and not kwargs.get('df1_always_df'):
        # the only row is labelled '0', so take it by position (.ix is gone
        # from current pandas)
        out.results = out.results.iloc[0].sort_index()
    return out