def download_large_file(proj_path, url, actually_download=True,
                        root=False, **kwargs):
    """
    Download something to proj_path, unless it's CoreNLP,
    which goes to ~/corenlp
    """
    import os
    import shutil
    import glob
    import zipfile
    from time import localtime, strftime
    from corpkit.textprogressbar import TextProgressBar
    from corpkit.process import animator
    # module-level in the original corpkit package; imported here so the
    # function stands alone
    from corpkit.constants import INPUTFUNC

    file_name = url.split('/')[-1]
    home = os.path.expanduser("~")
    customdir = kwargs.get('custom_corenlp_dir', False)

    # if it's corenlp, put it in home/corenlp
    # if that dir exists, check it for a zip file
    # if there's a zipfile and it works, move on
    # if there's a zipfile and it's broken, delete it
    if 'stanford' in url:
        if customdir:
            downloaded_dir = customdir
        else:
            downloaded_dir = os.path.join(home, 'corenlp')
        if not os.path.isdir(downloaded_dir):
            os.makedirs(downloaded_dir)
        else:
            poss_zips = glob.glob(
                os.path.join(downloaded_dir, 'stanford-corenlp-full*.zip'))
            if poss_zips:
                fullfile = poss_zips[-1]
                from zipfile import BadZipfile
                try:
                    the_zip_file = zipfile.ZipFile(fullfile)
                    ret = the_zip_file.testzip()
                    if ret is None:
                        return downloaded_dir, fullfile
                    else:
                        os.remove(fullfile)
                except BadZipfile:
                    os.remove(fullfile)
            #else:
            #    shutil.rmtree(downloaded_dir)
    else:
        downloaded_dir = os.path.join(proj_path, 'temp')
        try:
            os.makedirs(downloaded_dir)
        except OSError:
            pass
    fullfile = os.path.join(downloaded_dir, file_name)

    if actually_download:
        import __main__ as main
        if not root and not hasattr(main, '__file__'):
            txt = 'CoreNLP not found. Download latest version (%s)? (y/n) ' % url
            selection = INPUTFUNC(txt)
            if 'n' in selection.lower():
                return None, None
        try:
            import requests
            # NOTE the stream=True parameter
            r = requests.get(url, stream=True, verify=False)
            file_size = int(r.headers['content-length'])
            file_size_dl = 0
            block_sz = 8192
            showlength = file_size / block_sz
            thetime = strftime("%H:%M:%S", localtime())
            print('\n%s: Downloading ... \n' % thetime)
            par_args = {'printstatus': kwargs.get('printstatus', True),
                        'length': showlength}
            if not root:
                # was: file_size_dl + 1 / block_sz, which parses as
                # file_size_dl + (1 / block_sz); the sum needs parentheses
                tstr = '%d/%d' % ((file_size_dl + 1) / block_sz, showlength)
                p = animator(None, None, init=True, tot_string=tstr, **par_args)
                animator(p, file_size_dl + 1, tstr)
            with open(fullfile, 'wb') as f:
                for chunk in r.iter_content(chunk_size=block_sz):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
                        file_size_dl += len(chunk)
                        #print file_size_dl * 100.0 / file_size
                        if kwargs.get('note'):
                            kwargs['note'].progvar.set(
                                file_size_dl * 100.0 / int(file_size))
                        else:
                            tstr = '%d/%d' % (file_size_dl / block_sz, showlength)
                            animator(p, file_size_dl / block_sz, tstr, **par_args)
                        if root:
                            root.update()
        except Exception as err:
            import traceback
            print(traceback.format_exc())
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Download failed' % thetime)
            try:
                f.close()
            except:
                pass
            if root:
                root.update()
            return None, None
        if kwargs.get('note'):
            kwargs['note'].progvar.set(100)
        else:
            p.animate(int(file_size))
        thetime = strftime("%H:%M:%S", localtime())
        print('\n%s: Downloaded successfully.' % thetime)
        try:
            f.close()
        except:
            pass
    return downloaded_dir, fullfile
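# Usage sketch (editor's addition, not part of the original module): how
# download_large_file is typically driven for a CoreNLP fetch. The project
# path and URL below are illustrative assumptions; the real caller passes
# whichever mirror corpkit currently uses.
def _example_corenlp_download(proj_path='/tmp/myproject'):  # hypothetical path
    corenlp_url = ('http://nlp.stanford.edu/software/'
                   'stanford-corenlp-full-2015-12-09.zip')  # assumed URL
    downloaded_dir, fullfile = download_large_file(proj_path, corenlp_url,
                                                   actually_download=True)
    if fullfile is None:
        print('Download declined or failed.')
    return downloaded_dir, fullfile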
def interrogator(corpus, search='w', query='any', show='w',
                 exclude=False, excludemode='any', searchmode='all',
                 case_sensitive=False, save=False, subcorpora=False,
                 just_metadata=False, skip_metadata=False, preserve_case=False,
                 lemmatag=False, files_as_subcorpora=False, only_unique=False,
                 only_format_match=True, multiprocess=False, spelling=False,
                 regex_nonword_filter=r'[A-Za-z0-9]', gramsize=1, conc=False,
                 maxconc=9999, window=None, no_closed=False, no_punct=True,
                 discard=False, **kwargs):
    """
    Interrogate corpus, corpora, subcorpus and file objects.
    See corpkit.interrogation.interrogate() for docstring
    """
    conc = kwargs.get('do_concordancing', conc)
    quiet = kwargs.get('quiet', False)
    coref = kwargs.pop('coref', False)
    show_conc_metadata = kwargs.pop('show_conc_metadata', False)
    fsi_index = kwargs.pop('fsi_index', True)
    dep_type = kwargs.pop('dep_type', 'collapsed-ccprocessed-dependencies')
    nosubmode = subcorpora is None
    #todo: temporary
    #if getattr(corpus, '_dlist', False):
    #    subcorpora = 'file'

    # store kwargs and locs
    locs = locals().copy()
    locs.update(kwargs)
    locs.pop('kwargs', None)

    import codecs
    import signal
    import os
    from time import localtime, strftime
    from collections import Counter
    import pandas as pd
    from pandas import DataFrame, Series
    from corpkit.interrogation import Interrogation, Interrodict
    from corpkit.corpus import Datalist, Corpora, Corpus, File, Subcorpus
    from corpkit.process import (tregex_engine, get_deps, unsplitter,
                                 sanitise_dict, animator, filtermaker,
                                 fix_search, pat_format, auto_usecols,
                                 format_tregex,
                                 make_conc_lines_from_whole_mid)
    from corpkit.other import as_regex
    from corpkit.dictionaries.process_types import Wordlist
    from corpkit.build import check_jdk
    from corpkit.conll import pipeline
    from corpkit.process import delete_files_and_subcorpora
    # module-level names in the original corpkit package; imported here so
    # the function stands alone
    from corpkit.constants import STRINGTYPE, PYTHON_VERSION, INPUTFUNC

    have_java = check_jdk()

    # remake corpus without bad files and folders
    corpus, skip_metadata, just_metadata = delete_files_and_subcorpora(
        corpus, skip_metadata, just_metadata)

    # so you can do corpus.interrogate('features/postags/wordclasses/lexicon')
    if search == 'features':
        search = 'v'
        query = 'any'
    if search in ['postags', 'wordclasses']:
        query = 'any'
        preserve_case = True
        show = 'p' if search == 'postags' else 'x'
        # use tregex if simple because it's faster
        # but use dependencies otherwise
        search = 't' if not subcorpora and not just_metadata \
            and not skip_metadata and have_java else {'w': 'any'}
    if search == 'lexicon':
        search = 't' if not subcorpora and not just_metadata \
            and not skip_metadata and have_java else {'w': 'any'}
        query = 'any'
        show = ['w']

    if not kwargs.get('cql') and isinstance(search, STRINGTYPE) \
            and len(search) > 3:
        raise ValueError('search argument not recognised.')

    import re
    if regex_nonword_filter:
        is_a_word = re.compile(regex_nonword_filter)
    else:
        is_a_word = re.compile(r'.*')

    from traitlets import TraitError

    # convert cql-style queries---pop for the sake of multiprocessing
    cql = kwargs.pop('cql', None)
    if cql:
        from corpkit.cql import to_corpkit
        search, exclude = to_corpkit(search)

    def signal_handler(signal, _):
        """
        Allow pausing and restarting when not in GUI
        """
        if root:
            return
        import signal
        import sys
        from time import localtime, strftime
        signal.signal(signal.SIGINT, original_sigint)
        thetime = strftime("%H:%M:%S", localtime())
        INPUTFUNC('\n\n%s: Paused. Press any key to resume, '
                  'or ctrl+c to quit.\n' % thetime)
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Interrogation resumed.\n' % thetime)
        signal.signal(signal.SIGINT, signal_handler)

    def add_adj_for_ngram(show, gramsize):
        """
        If there's a gramsize of more than 1, remake show for ngramming
        """
        if gramsize == 1:
            return show
        out = []
        for i in show:
            out.append(i)
        for i in range(1, gramsize):
            for bit in show:
                out.append('+%d%s' % (i, bit))
        return out

    def fix_show_bit(show_bit):
        """
        Take a single search/show_bit type, return match
        """
        ends = ['w', 'l', 'i', 'n', 'f', 'p', 'x', 's', 'a', 'e', 'c']
        starts = ['d', 'g', 'm', 'b', 'h', '+', '-', 'r', 'c']
        show_bit = show_bit.lstrip('n')
        show_bit = show_bit.lstrip('b')
        show_bit = list(show_bit)
        if show_bit[-1] not in ends:
            show_bit.append('w')
        if show_bit[0] not in starts:
            show_bit.insert(0, 'm')
        return ''.join(show_bit)

    def fix_show(show, gramsize):
        """
        Lowercase anything in show and turn into list
        """
        if isinstance(show, list):
            show = [i.lower() for i in show]
        elif isinstance(show, STRINGTYPE):
            show = show.lower()
            show = [show]
        show = [fix_show_bit(i) for i in show]
        return add_adj_for_ngram(show, gramsize)

    def is_multiquery(corpus, search, query, outname):
        """
        Determine if multiprocessing is needed/possible,
        and do some retyping if need be as well
        """
        is_mul = False
        from collections import OrderedDict
        from corpkit.dictionaries.process_types import Wordlist
        if isinstance(query, Wordlist):
            query = list(query)
        if subcorpora and multiprocess:
            is_mul = 'subcorpora'
        if isinstance(subcorpora, (list, tuple)):
            is_mul = 'subcorpora'
        if isinstance(query, (dict, OrderedDict)):
            is_mul = 'namedqueriessingle'
        if isinstance(search, dict):
            if all(isinstance(i, dict) for i in list(search.values())):
                is_mul = 'namedqueriesmultiple'
        return is_mul, corpus, search, query

    def ispunct(s):
        import string
        return all(c in string.punctuation for c in s)

    def uniquify(conc_lines):
        """get unique concordance lines"""
        from collections import OrderedDict
        unique_lines = []
        checking = []
        for index, (_, speakr, start, middle, end) in enumerate(conc_lines):
            joined = ' '.join([speakr, start, 'MIDDLEHERE:',
                               middle, ':MIDDLEHERE', end])
            if joined not in checking:
                unique_lines.append(conc_lines[index])
            checking.append(joined)
        return unique_lines

    def compiler(pattern):
        """
        Compile regex or fail gracefully
        """
        if hasattr(pattern, 'pattern'):
            return pattern
        import re
        try:
            if case_sensitive:
                comped = re.compile(pattern)
            else:
                comped = re.compile(pattern, re.IGNORECASE)
            return comped
        except:
            import traceback
            import sys
            from time import localtime, strftime
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lst = traceback.format_exception(exc_type, exc_value, exc_traceback)
            error_message = lst[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Query %s' % (thetime, error_message))
            if root:
                return 'Bad query'
            else:
                raise ValueError('%s: Query %s' % (thetime, error_message))

    def determine_search_func(show):
        """Figure out what search function we're using"""
        simple_tregex_mode = False
        statsmode = False
        tree_to_text = False
        search_trees = False
        simp_crit = all(not i for i in [kwargs.get('tgrep'),
                                        files_as_subcorpora,
                                        subcorpora,
                                        just_metadata,
                                        skip_metadata])
        if search.get('t') and simp_crit:
            if have_java:
                simple_tregex_mode = True
            else:
                search_trees = 'tgrep'
            optiontext = 'Searching parse trees'
        elif datatype == 'conll':
            if any(i.endswith('t') for i in search.keys()):
                if have_java and not kwargs.get('tgrep'):
                    search_trees = 'tregex'
                else:
                    search_trees = 'tgrep'
                optiontext = 'Searching parse trees'
            elif any(i.endswith('v') for i in search.keys()):
                # either of these searchers now seems to work
                #searcher = get_stats_conll
                statsmode = True
                optiontext = 'General statistics'
            elif any(i.endswith('r') for i in search.keys()):
                optiontext = 'Distance from root'
            else:
                optiontext = 'Querying CONLL data'
        return (optiontext, simple_tregex_mode, statsmode,
                tree_to_text, search_trees)

    def get_tregex_values(show):
        """If using Tregex, set appropriate values

        - Check for valid query
        - Make 'any' query
        - Make list query
        """
        translated_option = 't'
        if isinstance(search['t'], Wordlist):
            search['t'] = list(search['t'])
        q = tregex_engine(corpus=False,
                          query=search.get('t'),
                          options=['-t'],
                          check_query=True,
                          root=root,
                          preserve_case=preserve_case)

        # so many of these bad fixing loops!
        nshow = []
        for i in show:
            if i == 'm':
                nshow.append('w')
            else:
                nshow.append(i.lstrip('m'))
        show = nshow

        if q is False:
            # previously branched on root here, but both branches
            # returned the same thing
            return 'Bad query', None

        if isinstance(search['t'], list):
            regex = as_regex(search['t'], boundaries='line',
                             case_sensitive=case_sensitive)
        else:
            regex = ''

        # listquery, anyquery, translated_option
        treg_dict = {'p': [r'__ < (/%s/ !< __)' % regex,
                           r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     'pl': [r'__ < (/%s/ !< __)' % regex,
                            r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     'x': [r'__ < (/%s/ !< __)' % regex,
                           r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'u'],
                     't': [r'__ < (/%s/ !< __)' % regex,
                           r'__ < (/.?[A-Za-z0-9].?/ !< __)', 'o'],
                     'w': [r'/%s/ !< __' % regex,
                           r'/.?[A-Za-z0-9].?/ !< __', 't'],
                     'c': [r'/%s/ !< __' % regex,
                           r'/.?[A-Za-z0-9].?/ !< __', 'C'],
                     'l': [r'/%s/ !< __' % regex,
                           r'/.?[A-Za-z0-9].?/ !< __', 't'],
                     'u': [r'/%s/ !< __' % regex,
                           r'/.?[A-Za-z0-9].?/ !< __', 'v']}

        newshow = []
        listq, anyq, translated_option = treg_dict.get(show[0][-1].lower())
        newshow.append(translated_option)
        for item in show[1:]:
            _, _, noption = treg_dict.get(item.lower())
            newshow.append(noption)

        if isinstance(search['t'], list):
            search['t'] = listq
        elif search['t'] == 'any':
            search['t'] = anyq
        return search['t'], newshow

    def correct_spelling(a_string):
        """correct spelling within a string"""
        if not spelling:
            return a_string
        from corpkit.dictionaries.word_transforms import usa_convert
        if spelling.lower() == 'uk':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        bits = a_string.split('/')
        for index, i in enumerate(bits):
            converted = usa_convert.get(i.lower(), i)
            if i.islower() or preserve_case is False:
                converted = converted.lower()
            elif i.isupper() and preserve_case:
                converted = converted.upper()
            elif i.istitle() and preserve_case:
                converted = converted.title()
            bits[index] = converted
        r = '/'.join(bits)
        return r

    def make_search_iterable(corpus):
        """determine how to structure the corpus for interrogation"""
        # skip file definitions if they are not needed
        if getattr(corpus, '_dlist', False):
            return {(i.name, i.path): [i] for i in list(corpus.files)}
            #return {('Sample', 'Sample'): list(corpus.files)}

        if simple_tregex_mode:
            if corpus.level in ['s', 'f', 'd']:
                return {(corpus.name, corpus.path): False}
            else:
                return {(os.path.basename(i),
                         os.path.join(corpus.path, i)): False
                        for i in os.listdir(corpus.path)
                        if os.path.isdir(os.path.join(corpus.path, i))}

        if isinstance(corpus, Datalist):
            to_iterate_over = {}
            # it could be files or subcorpus objects
            if corpus[0].level in ['s', 'd']:
                if files_as_subcorpora:
                    for subc in corpus:
                        for f in subc.files:
                            to_iterate_over[(f.name, f.path)] = [f]
                else:
                    for subc in corpus:
                        to_iterate_over[(subc.name, subc.path)] = subc.files
            elif corpus[0].level == 'f':
                for f in corpus:
                    to_iterate_over[(f.name, f.path)] = [f]
        elif corpus.singlefile:
            to_iterate_over = {(corpus.name, corpus.path): [corpus]}
        elif not hasattr(corpus, 'subcorpora') or not corpus.subcorpora:
            # just files in a directory
            if files_as_subcorpora:
                to_iterate_over = {}
                for f in corpus.files:
                    to_iterate_over[(f.name, f.path)] = [f]
            else:
                to_iterate_over = {(corpus.name, corpus.path): corpus.files}
        else:
            to_iterate_over = {}
            if files_as_subcorpora:
                # don't know if possible: has subcorpora but also .files
                if hasattr(corpus, 'files') and corpus.files is not None:
                    for f in corpus.files:
                        to_iterate_over[(f.name, f.path)] = [f]
                # has subcorpora with files in those
                elif hasattr(corpus, 'files') and corpus.files is None:
                    for subc in corpus.subcorpora:
                        for f in subc.files:
                            to_iterate_over[(f.name, f.path)] = [f]
            else:
                if corpus[0].level == 's':
                    for subcorpus in corpus:
                        to_iterate_over[(subcorpus.name,
                                         subcorpus.path)] = subcorpus.files
                elif corpus[0].level == 'f':
                    for f in corpus:
                        to_iterate_over[(f.name, f.path)] = [f]
                else:
                    for subcorpus in corpus.subcorpora:
                        to_iterate_over[(subcorpus.name,
                                         subcorpus.path)] = subcorpus.files
        return to_iterate_over

    def welcome_printer(return_it=False):
        """Print welcome message"""
        if no_conc:
            message = 'Interrogating'
        else:
            message = 'Interrogating and concordancing'
        if only_conc:
            message = 'Concordancing'
        if kwargs.get('printstatus', True):
            thetime = strftime("%H:%M:%S", localtime())
            from corpkit.process import dictformat
            sformat = dictformat(search)
            welcome = ('\n%s: %s %s ...\n %s\n '
                       'Query: %s\n %s corpus ... \n' %
                       (thetime, message, cname, optiontext, sformat, message))
            if return_it:
                return welcome
            else:
                print(welcome)

    def goodbye_printer(return_it=False, only_conc=False):
        """Say goodbye before exiting"""
        if not kwargs.get('printstatus', True):
            return
        thetime = strftime("%H:%M:%S", localtime())
        if only_conc:
            finalstring = '\n\n%s: Concordancing finished! %s results.' % \
                          (thetime, format(len(conc_df), ','))
        else:
            finalstring = '\n\n%s: Interrogation finished!' % thetime
            if countmode:
                finalstring += ' %s matches.' % format(tot, ',')
            else:
                finalstring += ' %s unique results, %s total occurrences.' % (
                    format(numentries, ','), format(total_total, ','))
        if return_it:
            return finalstring
        else:
            print(finalstring)

    def get_conc_colnames(corpus, fsi_index=False, simple_tregex_mode=False):
        fields = []
        base = 'c f s l m r'
        if simple_tregex_mode:
            base = base.replace('f ', '')
        if fsi_index and not simple_tregex_mode:
            base = 'i ' + base
        if PYTHON_VERSION == 2:
            base = base.encode('utf-8').split()
        else:
            base = base.split()
        if show_conc_metadata:
            from corpkit.build import get_all_metadata_fields
            meta = get_all_metadata_fields(corpus.path)
            if isinstance(show_conc_metadata, list):
                meta = [i for i in meta if i in show_conc_metadata]
            #elif show_conc_metadata is True:
            #    pass
            for i in sorted(meta):
                if i in ['speaker', 'sent_id', 'parse']:
                    continue
                if PYTHON_VERSION == 2:
                    base.append(i.encode('utf-8'))
                else:
                    base.append(i)
        return base

    def make_conc_obj_from_conclines(conc_results, fsi_index=False):
        """
        Turn conclines into DataFrame
        """
        from corpkit.interrogation import Concordance
        #fsi_place = 2 if fsi_index else 0
        all_conc_lines = []
        for sc_name, resu in sorted(conc_results.items()):
            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            #make into series
            for lin in unique_results:
                #spkr = str(spkr, errors='ignore')
                #if not subcorpora:
                #    lin[fsi_place] = lin[fsi_place]
                #lin.insert(fsi_place, sc_name)
                if len(lin) < len(conc_col_names):
                    diff = len(conc_col_names) - len(lin)
                    lin.extend(['none'] * diff)
                all_conc_lines.append(Series(lin, index=conc_col_names))
        try:
            conc_df = pd.concat(all_conc_lines, axis=1).T
        except ValueError:
            return
        if all(x == '' for x in list(conc_df['s'].values)) or \
                all(x == 'none' for x in list(conc_df['s'].values)):
            conc_df.drop('s', axis=1, inplace=True)
        locs['corpus'] = corpus.name
        if maxconc:
            conc_df = Concordance(conc_df[:maxconc])
        else:
            conc_df = Concordance(conc_df)
        try:
            conc_df.query = locs
        except AttributeError:
            pass
        return conc_df

    def lowercase_result(res):
        """
        Take any result and do spelling/lowercasing if need be

        todo: remove lowercase and change name
        """
        if not res or statsmode:
            return res
        # this is likely broken, but spelling in interrogate is deprecated anyway
        if spelling:
            res = [correct_spelling(r) for r in res]
        return res

    def postprocess_concline(line, fsi_index=False, conc=False):
        # todo: are these right?
        if not conc:
            return line
        subc, star, en = 0, 2, 5
        if fsi_index:
            subc, star, en = 2, 4, 7
        if not preserve_case:
            line[star:en] = [str(x).lower() for x in line[star:en]]
        if spelling:
            line[star:en] = [correct_spelling(str(b)) for b in line[star:en]]
        return line

    def make_progress_bar():
        """generate a progress bar"""
        if simple_tregex_mode:
            total_files = len(list(to_iterate_over.keys()))
        else:
            total_files = sum(len(x) for x in list(to_iterate_over.values()))
        par_args = {'printstatus': kwargs.get('printstatus', True),
                    'root': root,
                    'note': note,
                    'quiet': quiet,
                    'length': total_files,
                    'startnum': kwargs.get('startnum'),
                    'denom': kwargs.get('denominator', 1)}
        term = None
        if kwargs.get('paralleling', None) is not None:
            from blessings import Terminal
            term = Terminal()
            par_args['terminal'] = term
            par_args['linenum'] = kwargs.get('paralleling')
        if in_notebook:
            par_args['welcome_message'] = welcome_message
        outn = kwargs.get('outname', '')
        if outn:
            outn = getattr(outn, 'name', outn)
            outn = outn + ': '
        tstr = '%s%d/%d' % (outn, current_iter, total_files)
        p = animator(None, None, init=True, tot_string=tstr, **par_args)
        tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
        animator(p, current_iter, tstr, **par_args)
        return p, outn, total_files, par_args

    # find out if using gui
    root = kwargs.get('root')
    note = kwargs.get('note')
    language_model = kwargs.get('language_model')

    # set up pause method
    original_sigint = signal.getsignal(signal.SIGINT)
    if kwargs.get('paralleling', None) is None:
        if not root:
            original_sigint = signal.getsignal(signal.SIGINT)
            signal.signal(signal.SIGINT, signal_handler)

    # find out about concordancing
    only_conc = False
    no_conc = False
    if conc is False:
        no_conc = True
    if isinstance(conc, str) and conc.lower() == 'only':
        only_conc = True
        no_conc = False
    numconc = 0

    # wipe non essential class attributes to not bloat query attrib
    if isinstance(corpus, Corpus):
        import copy
        corpus = copy.copy(corpus)
        for k, v in corpus.__dict__.items():
            if isinstance(v, (Interrogation, Interrodict)):
                corpus.__dict__.pop(k, None)

    # convert path to corpus object
    if not isinstance(corpus, (Corpus, Corpora, Subcorpus, File, Datalist)):
        if not multiprocess and not kwargs.get('outname'):
            corpus = Corpus(corpus, print_info=False)

    # figure out how the user has entered the query and show, and normalise
    from corpkit.process import searchfixer
    search = searchfixer(search, query)
    show = fix_show(show, gramsize)
    locs['show'] = show

    # instantiate lemmatiser if need be
    lem_instance = False
    if any(i.endswith('l') for i in show) and isinstance(search, dict) \
            and search.get('t'):
        from nltk.stem.wordnet import WordNetLemmatizer
        lem_instance = WordNetLemmatizer()

    # do multiprocessing if need be
    im, corpus, search, query = is_multiquery(corpus, search, query,
                                              kwargs.get('outname', False))

    # figure out if we can multiprocess the corpus
    if hasattr(corpus, '__iter__') and im:
        corpus = Corpus(corpus, print_info=False)
    if hasattr(corpus, '__iter__') and not im:
        im = 'datalist'
    if isinstance(corpus, Corpora):
        im = 'multiplecorpora'

    # split corpus if the user wants multiprocessing but no other iterable
    if not im and multiprocess:
        im = 'datalist'
        if getattr(corpus, 'subcorpora', False):
            corpus = corpus[:]
        else:
            corpus = corpus.files

    search = fix_search(search, case_sensitive=case_sensitive, root=root)
    exclude = fix_search(exclude, case_sensitive=case_sensitive, root=root)

    # if it's already been through pmultiquery, don't do it again
    locs['search'] = search
    locs['exclude'] = exclude
    locs['query'] = query
    locs['corpus'] = corpus
    locs['multiprocess'] = multiprocess
    locs['print_info'] = kwargs.get('printstatus', True)
    locs['multiple'] = im
    locs['subcorpora'] = subcorpora
    locs['nosubmode'] = nosubmode

    # send to multiprocess function
    if im:
        signal.signal(signal.SIGINT, original_sigint)
        from corpkit.multiprocess import pmultiquery
        return pmultiquery(**locs)

    # get corpus metadata
    cname = corpus.name
    if isinstance(save, STRINGTYPE):
        savename = corpus.name + '-' + save
    if save is True:
        raise ValueError('save must be str, not bool.')
    datatype = getattr(corpus, 'datatype', 'conll')
    singlefile = getattr(corpus, 'singlefile', False)
    level = getattr(corpus, 'level', 'c')

    # store all results in here
    from collections import defaultdict
    results = defaultdict(Counter)
    count_results = defaultdict(list)
    conc_results = defaultdict(list)

    # check if just counting, turn off conc if so
    countmode = 'c' in show or 'mc' in show
    if countmode:
        no_conc = True
        only_conc = False

    # where we are at in interrogation
    current_iter = 0

    # multiprocessing progress bar
    denom = kwargs.get('denominator', 1)
    startnum = kwargs.get('startnum', 0)

    # Determine the search function to be used
    optiontext, simple_tregex_mode, statsmode, tree_to_text, search_trees = \
        determine_search_func(show)

    # no conc for statsmode
    if statsmode:
        no_conc = True
        only_conc = False
        conc = False

    # Set some Tregex-related values
    translated_option = False
    if search.get('t'):
        query, translated_option = get_tregex_values(show)
        if query == 'Bad query' and translated_option is None:
            if root:
                return 'Bad query'
            else:
                return

    # more tregex options
    if tree_to_text:
        treg_q = r'ROOT << __'
        op = ['-o', '-t', '-w', '-f']
    elif simple_tregex_mode:
        treg_q = search['t']
        op = ['-%s' % i for i in translated_option] + ['-o', '-f']

    # make iterable object for corpus interrogation
    to_iterate_over = make_search_iterable(corpus)

    try:
        from ipywidgets import IntProgress
        _ = IntProgress(min=0, max=10, value=1)
        in_notebook = True
    except TraitError:
        in_notebook = False
    except ImportError:
        in_notebook = False
    # caused in newest ipython
    except AttributeError:
        in_notebook = False

    lemtag = False
    if search.get('t'):
        from corpkit.process import gettag
        lemtag = gettag(search.get('t'), lemmatag)

    usecols = auto_usecols(search, exclude, show,
                           kwargs.pop('usecols', None), coref=coref)

    # print welcome message
    welcome_message = welcome_printer(return_it=in_notebook)

    # create a progress bar
    p, outn, total_files, par_args = make_progress_bar()

    if conc:
        conc_col_names = get_conc_colnames(corpus,
                                           fsi_index=fsi_index,
                                           simple_tregex_mode=False)

    # Iterate over data, doing interrogations
    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):
        if nosubmode:
            subcorpus_name = 'Total'

        # results for subcorpus go here
        #conc_results[subcorpus_name] = []
        #count_results[subcorpus_name] = []
        #results[subcorpus_name] = Counter()

        # get either everything (tree_to_text) or the search['t'] query
        if tree_to_text or simple_tregex_mode:
            result = tregex_engine(query=treg_q,
                                   options=op,
                                   corpus=subcorpus_path,
                                   root=root,
                                   preserve_case=preserve_case)

            # format search results with slashes etc
            if not countmode and not tree_to_text:
                result = format_tregex(result, show,
                                       translated_option=translated_option,
                                       exclude=exclude,
                                       excludemode=excludemode,
                                       lemtag=lemtag,
                                       lem_instance=lem_instance,
                                       countmode=countmode,
                                       speaker_data=False)

            # if concordancing, do the query again with 'whole' sent and fname
            if not no_conc:
                ops = ['-w'] + op
                #ops = [i for i in ops if i != '-n']
                whole_result = tregex_engine(query=search['t'],
                                             options=ops,
                                             corpus=subcorpus_path,
                                             root=root,
                                             preserve_case=preserve_case)

                # format match too depending on option
                if not only_format_match:
                    # was assigned to 'wholeresult', which was never used
                    whole_result = format_tregex(whole_result, show,
                                                 translated_option=translated_option,
                                                 exclude=exclude,
                                                 excludemode=excludemode,
                                                 lemtag=lemtag,
                                                 lem_instance=lem_instance,
                                                 countmode=countmode,
                                                 speaker_data=False,
                                                 whole=True)

                # make conc lines from conc results
                conc_result = make_conc_lines_from_whole_mid(whole_result,
                                                             result,
                                                             show=show)
                for lin in conc_result:
                    if maxconc is False or numconc < maxconc:
                        conc_results[subcorpus_name].append(lin)
                        numconc += 1

            # add matches to ongoing counts
            if countmode:
                count_results[subcorpus_name] += [result]
            else:
                if result:
                    results[subcorpus_name] += Counter([i[-1] for i in result])
                else:
                    results[subcorpus_name] += Counter()

            # update progress bar
            current_iter += 1
            tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)
            continue

        # todo: move this
        kwargs.pop('by_metadata', None)

        # conll querying goes by file, not subcorpus
        for f in files:
            slow_treg_speaker_guess = kwargs.get('outname', '') \
                if kwargs.get('multispeaker') else ''
            filepath, corefs = f.path, coref
            res, conc_res = pipeline(filepath,
                                     search=search,
                                     show=show,
                                     dep_type=dep_type,
                                     exclude=exclude,
                                     excludemode=excludemode,
                                     searchmode=searchmode,
                                     case_sensitive=case_sensitive,
                                     conc=conc,
                                     only_format_match=only_format_match,
                                     speaker=slow_treg_speaker_guess,
                                     gramsize=gramsize,
                                     no_punct=no_punct,
                                     no_closed=no_closed,
                                     window=window,
                                     filename=f.path,
                                     coref=corefs,
                                     countmode=countmode,
                                     maxconc=(maxconc, numconc),
                                     is_a_word=is_a_word,
                                     by_metadata=subcorpora,
                                     show_conc_metadata=show_conc_metadata,
                                     just_metadata=just_metadata,
                                     skip_metadata=skip_metadata,
                                     fsi_index=fsi_index,
                                     category=subcorpus_name,
                                     translated_option=translated_option,
                                     statsmode=statsmode,
                                     preserve_case=preserve_case,
                                     usecols=usecols,
                                     search_trees=search_trees,
                                     lem_instance=lem_instance,
                                     lemtag=lemtag,
                                     **kwargs)

            if res is None and conc_res is None:
                current_iter += 1
                tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                animator(p, current_iter, tstr, **par_args)
                continue

            # deal with symbolic structures---that is, rather than adding
            # results by subcorpora, add them by metadata value
            # todo: sorting?
            if subcorpora:
                for (k, v), concl in zip(res.items(), conc_res.values()):
                    v = lowercase_result(v)
                    results[k] += Counter(v)
                    for line in concl:
                        if maxconc is False or numconc < maxconc:
                            line = postprocess_concline(line,
                                                        fsi_index=fsi_index,
                                                        conc=conc)
                            conc_results[k].append(line)
                            numconc += 1
                current_iter += 1
                tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                animator(p, current_iter, tstr, **par_args)
                continue

            # garbage collection needed?
            sents = None
            corefs = None

            if res == 'Bad query':
                return 'Bad query'

            if countmode:
                count_results[subcorpus_name] += [res]
            else:
                # add filename and do lowercasing for conc
                if not no_conc:
                    for line in conc_res:
                        line = postprocess_concline(line,
                                                    fsi_index=fsi_index,
                                                    conc=conc)
                        if maxconc is False or numconc < maxconc:
                            conc_results[subcorpus_name].append(line)
                            numconc += 1

                # do lowercasing and spelling
                if not only_conc:
                    res = lowercase_result(res)
                    # discard removes low results, helping with
                    # curse of dimensionality
                    countres = Counter(res)
                    if isinstance(discard, float):
                        countres.most_common()
                        # was: len(counter) - len(counter) * discard, a
                        # NameError --- 'counter' was never defined
                        nkeep = len(countres) - len(countres) * discard
                        countres = Counter({k: v for i, (k, v)
                                            in enumerate(countres.most_common())
                                            if i <= nkeep})
                    elif isinstance(discard, int):
                        countres = Counter({k: v for k, v
                                            in countres.most_common()
                                            if v >= discard})
                    results[subcorpus_name] += countres
                #else:
                #    results[subcorpus_name] += res

            # update progress bar
            current_iter += 1
            tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)

    # Get concordances into DataFrame, return if just conc
    if not no_conc:
        # fail on this line with typeerror if no results?
        conc_df = make_conc_obj_from_conclines(conc_results,
                                               fsi_index=fsi_index)
        if only_conc and conc_df is None:
            return
        elif only_conc:
            locs = sanitise_dict(locs)
            try:
                conc_df.query = locs
            except AttributeError:
                return conc_df
            if save and not kwargs.get('outname'):
                if conc_df is not None:
                    conc_df.save(savename)
            goodbye_printer(only_conc=True)
            if not root:
                signal.signal(signal.SIGINT, original_sigint)
            return conc_df
    else:
        conc_df = None

    # Get interrogation into DataFrame
    if countmode:
        df = Series({k: sum(v) for k, v in sorted(count_results.items())})
        tot = df.sum()
    else:
        the_big_dict = {}
        unique_results = set(item for sublist in list(results.values())
                             for item in sublist)
        sortres = sorted(results.items(), key=lambda x: x[0])
        for word in unique_results:
            the_big_dict[word] = [subcorp_result[word]
                                  for _, subcorp_result in sortres]
        # turn master dict into dataframe, sorted
        df = DataFrame(the_big_dict, index=sorted(results.keys()))

        # for ngrams, remove hapaxes
        #if show_ngram or show_collocates:
        #    if not language_model:
        #        df = df[[i for i in list(df.columns) if df[i].sum() > 1]]

        numentries = len(df.columns)
        tot = df.sum(axis=1)
        total_total = df.sum().sum()

    # turn df into series if all conditions met
    conds = [countmode, files_as_subcorpora, subcorpora,
             kwargs.get('df1_always_df', False)]
    anyxs = [level == 's', singlefile, nosubmode]
    if all(not x for x in conds) and any(x for x in anyxs):
        df = Series(df.ix[0])
        df.sort_values(ascending=False, inplace=True)
        tot = df.sum()
        numentries = len(df.index)
        total_total = tot

    # turn data into DF for GUI if need be
    if isinstance(df, Series) and kwargs.get('df1_always_df', False):
        total_total = df.sum()
        df = DataFrame(df)
        tot = Series(total_total, index=['Total'])

    # if we're doing files as subcorpora, we can remove the extension etc
    if isinstance(df, DataFrame) and files_as_subcorpora:
        cname = corpus.name.replace('-stripped', '').replace('-parsed', '')
        edits = [(r'(-[0-9][0-9][0-9])?\.txt\.conllu?', ''),
                 (r'-%s(-stripped)?(-parsed)?' % cname, '')]
        from corpkit.editor import editor
        df = editor(df, replace_subcorpus_names=edits).results
        tot = df.sum(axis=1)
        total_total = df.sum().sum()

    if conc_df is not None and conc_df is not False:
        # removed 'f' from here for now
        for col in ['c']:
            for pat in ['.txt', '.conll', '.conllu']:
                conc_df[col] = conc_df[col].str.replace(pat, '')
            conc_df[col] = conc_df[col].str.replace(r'-[0-9][0-9][0-9]$', '')
        #df.index = df.index.str.replace('w', 'this')

    # make interrogation object
    locs['corpus'] = corpus.path
    locs = sanitise_dict(locs)
    if nosubmode and isinstance(df, pd.DataFrame):
        df = df.sum()
    interro = Interrogation(results=df, totals=tot,
                            query=locs, concordance=conc_df)

    # save it
    if save and not kwargs.get('outname'):
        print('\n')
        interro.save(savename)

    goodbye = goodbye_printer(return_it=in_notebook)
    if in_notebook:
        try:
            p.children[2].value = goodbye.replace('\n', '')
        except AttributeError:
            pass
    if not root:
        signal.signal(signal.SIGINT, original_sigint)
    return interro
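# Usage sketch (editor's addition): a minimal, hedged example of calling
# interrogator() directly. The corpus path and the {'w': ...} query are
# assumptions for illustration; in practice this function is usually reached
# via Corpus.interrogate().
def _example_interrogation(path='data/sample-parsed'):  # hypothetical path
    from corpkit.corpus import Corpus
    corp = Corpus(path, print_info=False)
    # search word forms against a regex, keeping up to 100 concordance lines
    res = interrogator(corp, search={'w': r'^transport'}, show=['w'],
                       conc=True, maxconc=100)
    return res.results, res.concordance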
def download_large_file(proj_path, url, actually_download=True,
                        root=False, **kwargs):
    """
    Download something to proj_path
    """
    import corpkit
    import os
    import shutil
    import glob
    import sys
    import zipfile
    from time import localtime, strftime
    from corpkit.textprogressbar import TextProgressBar
    from corpkit.process import animator
    # module-level in the original corpkit package; imported here so the
    # function stands alone
    from corpkit.constants import INPUTFUNC

    file_name = url.split('/')[-1]
    home = os.path.expanduser("~")

    # if it's corenlp, put it in home/corenlp
    # if that dir exists, check it for a zip file
    # if there's a zipfile and it works, move on
    # if there's a zipfile and it's broken, delete it
    if 'stanford' in url:
        downloaded_dir = os.path.join(home, 'corenlp')
        if not os.path.isdir(downloaded_dir):
            os.makedirs(downloaded_dir)
        else:
            poss_zips = glob.glob(os.path.join(downloaded_dir,
                                               'stanford-corenlp-full*.zip'))
            if poss_zips:
                fullfile = poss_zips[-1]
                the_zip_file = zipfile.ZipFile(fullfile)
                ret = the_zip_file.testzip()
                if ret is None:
                    return downloaded_dir, fullfile
                else:
                    os.remove(fullfile)
            #else:
            #    shutil.rmtree(downloaded_dir)
    else:
        downloaded_dir = os.path.join(proj_path, 'temp')
        try:
            os.makedirs(downloaded_dir)
        except OSError:
            pass
    fullfile = os.path.join(downloaded_dir, file_name)

    if actually_download:
        if not root:
            txt = 'CoreNLP not found. Download latest version (%s)? (y/n) ' % url
            selection = INPUTFUNC(txt)
            if 'n' in selection.lower():
                return None, None
        try:
            import requests
            # NOTE the stream=True parameter
            r = requests.get(url, stream=True, verify=False)
            file_size = int(r.headers['content-length'])
            file_size_dl = 0
            block_sz = 8192
            showlength = file_size / block_sz
            thetime = strftime("%H:%M:%S", localtime())
            print('\n%s: Downloading ... \n' % thetime)
            par_args = {'printstatus': kwargs.get('printstatus', True),
                        'length': showlength}
            if not root:
                # parenthesised: was file_size_dl + 1 / block_sz
                tstr = '%d/%d' % ((file_size_dl + 1) / block_sz, showlength)
                p = animator(None, None, init=True, tot_string=tstr, **par_args)
                animator(p, file_size_dl + 1, tstr)
            with open(fullfile, 'wb') as f:
                for chunk in r.iter_content(chunk_size=block_sz):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
                        file_size_dl += len(chunk)
                        #print file_size_dl * 100.0 / file_size
                        if kwargs.get('note'):
                            kwargs['note'].progvar.set(
                                file_size_dl * 100.0 / int(file_size))
                        else:
                            tstr = '%d/%d' % (file_size_dl / block_sz, showlength)
                            animator(p, file_size_dl / block_sz, tstr, **par_args)
                        if root:
                            root.update()
        except Exception as err:
            import traceback
            print(traceback.format_exc())
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Download failed' % thetime)
            try:
                f.close()
            except:
                pass
            if root:
                root.update()
            # return a pair, like the success path, so callers can unpack
            return None, None
        if kwargs.get('note'):
            kwargs['note'].progvar.set(100)
        else:
            p.animate(int(file_size))
        thetime = strftime("%H:%M:%S", localtime())
        print('\n%s: Downloaded successfully.' % thetime)
        try:
            f.close()
        except:
            pass
    return downloaded_dir, fullfile
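# Sketch (editor's addition): the cached-zip check used in both versions of
# download_large_file, isolated for clarity. Assumption: ZipFile.testzip()
# returns None when every archive member passes its CRC check.
def _usable_corenlp_zip(path):
    import os
    import zipfile
    try:
        if zipfile.ZipFile(path).testzip() is None:
            return path  # archive is intact; reuse it
    except zipfile.BadZipfile:
        pass
    os.remove(path)  # broken download; force a re-fetch
    return None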
def interrogator( corpus, search, query="any", show="w", exclude=False, excludemode="any", searchmode="all", dep_type="collapsed-ccprocessed-dependencies", case_sensitive=False, quicksave=False, just_speakers=False, preserve_case=False, lemmatag=False, files_as_subcorpora=False, conc=False, only_unique=False, random=False, only_format_match=False, multiprocess=False, spelling=False, regex_nonword_filter=r"[A-Za-z0-9:_]", gramsize=2, split_contractions=False, **kwargs ): """interrogate corpus, corpora, subcorpus and file objects see corpkit.interrogation.interrogate() for docstring""" # store kwargs locs = locals() from corpkit.interrogation import Interrogation from corpkit.process import tregex_engine import pandas as pd from pandas import DataFrame, Series from collections import Counter from corpkit.other import as_regex from corpkit.process import get_deps from time import localtime, strftime thetime = strftime("%H:%M:%S", localtime()) from corpkit.textprogressbar import TextProgressBar from corpkit.process import animator from corpkit.dictionaries.word_transforms import wordlist, taglemma # find out if using gui root = kwargs.get("root") note = kwargs.get("note") # convert path to corpus object if type(corpus) == str: from corpkit.corpus import Corpus corpus = Corpus(corpus) # figure out how the user has entered the query and normalise from corpkit.process import searchfixer search, search_iterable = searchfixer(search, query) # for better printing of query, esp during multiprocess # can remove if multiprocess printing improved if len(search.keys()) == 1: query = search.values()[0] if "l" in show and search.get("t"): from nltk.stem.wordnet import WordNetLemmatizer lmtzr = WordNetLemmatizer() if type(show) == str: show = [show] def is_multiquery(corpus, search, query, just_speakers): """determine if multiprocessing is needed do some retyping if need be as well""" im = False from collections import OrderedDict if hasattr(corpus, "__iter__"): im = True # so we can do search = 't', query = ['NP', 'VP']: if type(query) == list: if query != search.values()[0] or len(search.keys()) > 1: query = {c.title(): c for c in query} if type(query) == dict or type(query) == OrderedDict: im = True if just_speakers: if just_speakers == "each": im = True just_speakers = ["each"] if just_speakers == ["each"]: im = True if type(just_speakers) == str: im = False just_speakers = [just_speakers] if type(just_speakers) == list: if len(just_speakers) > 1: im = True if type(search) == dict: if all(type(i) == dict for i in search.values()): im = True return im, corpus, search, query, just_speakers def slow_tregex(sents, **dummy_args): """do the speaker-specific version of tregex queries""" import os from corpkit.process import tregex_engine # first, put the relevant trees into temp file if kwargs.get("outname"): to_open = "tmp-%s.txt" % kwargs["outname"] else: to_open = "tmp.txt" to_write = "\n".join([sent._parse_string.strip() for sent in sents if sent.parse_string is not None]) to_write.encode("utf-8", errors="ignore") with open(to_open, "w") as fo: fo.write(to_write) q = search.values()[0] res = tregex_engine( query=q, options=["-o", "-%s" % translated_option], corpus=to_open, root=root, preserve_case=True ) if root: root.update() os.remove(to_open) if countmode: return len(res) else: return res def get_stats(sents, **dummy_args): """get a bunch of frequencies on interpersonal phenomena""" import os import re from collections import Counter statsmode_results = Counter() # first, put the relevant trees into 
temp file if kwargs.get("outname"): to_open = "tmp-%s.txt" % kwargs["outname"] else: to_open = "tmp.txt" with open(to_open, "w") as fo: for sent in sents: statsmode_results["Sentences"] += 1 sts = sent.parse_string.rstrip() encd = sts.encode("utf-8", errors="ignore") + "\n" fo.write(encd) deps = get_deps(sent, dep_type) numpass = len([x for x in deps.links if x.type.endswith("pass")]) statsmode_results["Passives"] += numpass statsmode_results["Tokens"] += len(sent.tokens) words = [w.word for w in sent.tokens if w.word.isalnum()] statsmode_results["Words"] += len(words) statsmode_results["Characters"] += len("".join(words)) # count moods via trees (/\?/ !< __) from dictionaries.process_types import processes from corpkit.other import as_regex tregex_qs = { "Imperative": r"ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/", "Open interrogative": r"ROOT < SBARQ <<- (/\?/ !< __)", "Closed interrogative": r"ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))", "Unmodalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))", "Modalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))", "Open class words": r"/^(NN|JJ|VB|RB)/ < __", "Closed class words": r"__ !< __ !> /^(NN|JJ|VB|RB)/", "Clauses": r"/^S/ < __", "Interrogative": r"ROOT << (/\?/ !< __)", "Mental processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.mental, boundaries="w"), "Verbal processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.verbal, boundaries="w"), "Relational processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)" % as_regex(processes.relational, boundaries="w"), } for name, q in sorted(tregex_qs.items()): res = tregex_engine(query=q, options=["-o", "-C"], corpus=to_open, root=root) statsmode_results[name] += int(res) global numdone numdone += 1 if root: root.update() else: tot_string = str(numdone + 1) + "/" + str(total_files) if kwargs.get("outname"): tot_string = "%s: %s" % (kwargs["outname"], tot_string) animator(p, numdone, tot_string, **par_args) if kwargs.get("note", False): kwargs["note"].progvar.set((numdone * 100.0 / total_files / denom) + startnum) os.remove(to_open) return statsmode_results def make_conc_lines_from_whole_mid(wholes, middle_column_result, speakr=False): if speakr is False: speakr = "" conc_lines = [] # remove duplicates from results unique_wholes = [] unique_middle_column_result = [] duplicates = [] for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)): if "-join-".join([f, whole, mid]) not in duplicates: duplicates.append("-join-".join([f, whole, mid])) unique_wholes.append([f, whole]) unique_middle_column_result.append(mid) # split into start, middle and end, dealing with multiple occurrences for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)): reg = re.compile(r"([^a-zA-Z0-9-]|^)(" + re.escape(mid) + r")([^a-zA-Z0-9-]|$)", re.IGNORECASE | re.UNICODE) offsets = [(m.start(), m.end()) for m in re.finditer(reg, whole)] for offstart, offend in offsets: start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip() conc_lines.append([os.path.basename(f), speakr, start, middle, end]) return conc_lines def uniquify(conc_lines): from collections import OrderedDict unique_lines = [] checking = [] for index, (f, speakr, start, middle, end) in enumerate(conc_lines): joined = " 
".join([speakr, start, "MIDDLEHERE:", middle, ":MIDDLEHERE", end]) if joined not in checking: unique_lines.append(conc_lines[index]) checking.append(joined) return unique_lines def lemmatiser(list_of_words, tag): """take a list of unicode words and a tag and return a lemmatised list.""" output = [] for word in list_of_words: if translated_option.startswith("u"): if word.lower() in taglemma.keys(): word = taglemma[word.lower()] else: if word == "x": word = "Other" # only use wordnet lemmatiser when appropriate else: if word in wordlist: word = wordlist[word] word = lmtzr.lemmatize(word, tag) output.append(word) return output def gettag(query, lemmatag=False): """ Find tag for WordNet lemmatisation """ import re tagdict = {"N": "n", "A": "a", "V": "v", "A": "r", "None": False, "": False, "Off": False} if lemmatag is False: tag = "n" # same default as wordnet # attempt to find tag from tregex query tagfinder = re.compile(r"^[^A-Za-z]*([A-Za-z]*)") tagchecker = re.compile(r"^[A-Z]{1,4}$") qr = query.replace(r"\w", "").replace(r"\s", "").replace(r"\b", "") treebank_tag = re.findall(tagfinder, qr) if re.match(tagchecker, treebank_tag[0]): tag = tagdict.get(treebank_tag[0], "n") elif lemmatag: tag = lemmatag return tag def format_tregex(results): """format tregex by show list""" if countmode: return results import re done = [] if "l" in show or "pl" in show: lemmata = lemmatiser(results, gettag(search.get("t"), lemmatag)) else: lemmata = [None for i in results] for word, lemma in zip(results, lemmata): bits = [] if exclude and exclude.get("w"): if len(exclude.keys()) == 1 or excludemode == "any": if re.search(exclude.get("w"), word): continue if len(exclude.keys()) == 1 or excludemode == "any": if re.search(exclude.get("l"), lemma): continue if len(exclude.keys()) == 1 or excludemode == "any": if re.search(exclude.get("p"), word): continue if len(exclude.keys()) == 1 or excludemode == "any": if re.search(exclude.get("pl"), lemma): continue if exclude and excludemode == "all": num_to_cause_exclude = len(exclude.keys()) current_num = 0 if exclude.get("w"): if re.search(exclude.get("w"), word): current_num += 1 if exclude.get("l"): if re.search(exclude.get("l"), lemma): current_num += 1 if exclude.get("p"): if re.search(exclude.get("p"), word): current_num += 1 if exclude.get("pl"): if re.search(exclude.get("pl"), lemma): current_num += 1 if current_num == num_to_cause_exclude: continue for i in show: if i == "t": bits.append(word) if i == "l": bits.append(lemma) elif i == "w": bits.append(word) elif i == "p": bits.append(word) elif i == "pl": bits.append(lemma) joined = "/".join(bits) done.append(joined) return done def tok_by_list(pattern, list_of_toks, concordancing=False, **kwargs): """search for regex in plaintext corpora""" import re if type(pattern) == str: pattern = [pattern] if not case_sensitive: pattern = [p.lower() for p in pattern] if not concordancing: if case_sensitive: matches = [m for m in list_of_toks if m in pattern] else: matches = [m for m in list_of_toks if m.lower() in pattern] else: matches = [] for index, token in enumerate(list_of_toks): if token in pattern: match = [" ".join([t for t in unsplitter(list_of_toks[:index])])[-140:]] match.append(token) match.append(" ".join([t for t in unsplitter(list_of_toks[index + 1 :])])[:140]) matches.append(match) if countmode: return len(matches) else: return matches def unsplitter(lst): """unsplit contractions and apostophes from tokenised text""" if split_contractions: return lst unsplit = [] for index, t in enumerate(lst): if index 
== 0 or index == len(lst) - 1: unsplit.append(t) continue if "'" in t and not t.endswith("'"): rejoined = "".join([lst[index - 1], t]) unsplit.append(rejoined) else: if not "'" in lst[index + 1]: unsplit.append(t) return unsplit def tok_ngrams(pattern, list_of_toks, concordancing=False, split_contractions=True): from collections import Counter import re ngrams = Counter() result = [] # if it's not a compiled regex list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)] if pattern.lower() == "any": pattern = r".*" if not split_contractions: list_of_toks = unsplitter(list_of_toks) # list_of_toks = [x for x in list_of_toks if "'" not in x] for index, w in enumerate(list_of_toks): try: the_gram = [list_of_toks[index + x] for x in range(gramsize)] if not any(re.search(pattern, x) for x in the_gram): continue ngrams[" ".join(the_gram)] += 1 except IndexError: pass # turn counter into list of results for k, v in ngrams.items(): if v > 1: for i in range(v): result.append(k) if countmode: return len(result) else: return result def compiler(pattern): """compile regex or fail gracefully""" import re try: if case_sensitive: comped = re.compile(pattern) else: comped = re.compile(pattern, re.IGNORECASE) return comped except: import traceback import sys from time import localtime, strftime exc_type, exc_value, exc_traceback = sys.exc_info() lst = traceback.format_exception(exc_type, exc_value, exc_traceback) error_message = lst[-1] thetime = strftime("%H:%M:%S", localtime()) print "%s: Query %s" % (thetime, error_message) if root: return "Bad query" else: raise ValueError("%s: Query %s" % (thetime, error_message)) def tok_by_reg(pattern, list_of_toks, concordancing=False, **kwargs): """search for regex in plaintext corpora""" import re comped = compiler(pattern) if comped == "Bad query": return "Bad query" if not concordancing: matches = [m for m in list_of_toks if re.search(comped, m)] else: matches = [] for index, token in enumerate(list_of_toks): if re.search(comped, token): match = [" ".join([t for t in unsplitter(list_of_toks[:index])])[-140:]] match.append(re.search(comped, token).group(0)) match.append(" ".join([t for t in unsplitter(list_of_toks[index + 1 :])])[:140]) matches.append(match) if countmode: return len(matches) else: return matches def plaintext_regex_search(pattern, plaintext_data, concordancing=False, **kwargs): """search for regex in plaintext corpora it searches over lines, so the user needs to be careful. 
""" import re if concordancing: pattern = r"(.{,140})\b(" + pattern + r")\b(.{,140})" compiled_pattern = compiler(pattern) if compiled_pattern == "Bad query": return "Bad query" matches = re.findall(compiled_pattern, plaintext_data) if concordancing: matches = [list(m) for m in matches] if not concordancing: for index, i in enumerate(matches): if type(i) == tuple: matches[index] = i[0] if countmode: return len(matches) else: return matches def correct_spelling(a_string): if not spelling: return a_string from dictionaries.word_transforms import usa_convert if spelling.lower() == "uk": usa_convert = {v: k for k, v in usa_convert.items()} spell_out = [] bits = a_string.split("/") for index, i in enumerate(bits): converted = usa_convert.get(i.lower(), i) if i.islower() or preserve_case is False: converted = converted.lower() elif i.isupper() and preserve_case: converted = converted.upper() elif i.istitle() and preserve_case: converted = converted.title() bits[index] = converted r = "/".join(bits) return r def plaintext_simple_search(pattern, plaintext_data, concordancing=False, **kwargs): """search for tokens in plaintext corpora""" import re result = [] if type(pattern) == str: pattern = [pattern] for p in pattern: if concordancing: pat = r"(.{0,140})\b(" + re.escape(p) + r")\b(.{0,140})" pat = compiler(pat) if pat == "Bad query": return "Bad query" matches = re.findall(pat, plaintext_data) if concordancing: matches = [list(m) for m in matches] for i in matches: result.append(i) else: for m in range(len(matches)): result.append(p) return result # do multiprocessing if need be im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers) locs["search"] = search locs["query"] = query locs["just_speakers"] = just_speakers locs["corpus"] = corpus locs["multiprocess"] = multiprocess if im: from corpkit.multiprocess import pmultiquery return pmultiquery(**locs) datatype = corpus.datatype singlefile = corpus.singlefile # store all results in here results = {} # check if just counting countmode = "c" in show # where we are at in interrogation current_iter = 0 # multiprocessing progress bar denom = kwargs.get("denominator", 1) startnum = kwargs.get("startnum", 0) ############################################ # Determine the search function to be used # ############################################ # simple tregex is tregex over whole dirs simple_tregex_mode = False statsmode = False if not just_speakers and "t" in search.keys(): simple_tregex_mode = True else: if corpus.datatype == "plaintext": if search.get("n"): raise NotImplementedError("Use a tokenised corpus for n-gramming.") # searcher = plaintext_ngram optiontext = "n-grams via plaintext" if search.get("w"): if kwargs.get("regex", True): searcher = plaintext_regex_search else: searcher = plaintext_simple_search optiontext = "Searching plaintext" elif corpus.datatype == "tokens": if search.get("n"): searcher = tok_ngrams optiontext = "n-grams via tokens" elif search.get("w"): if kwargs.get("regex", True): searcher = tok_by_reg else: searcher = tok_by_list if type(search.get("w")) == list: searcher = tok_by_list optiontext = "Searching tokens" only_parse = ["r", "d", "g", "dl", "gl", "df", "gf", "dp", "gp", "f"] if corpus.datatype != "parse" and any(i in only_parse for i in search.keys()): raise ValueError( 'Need parsed corpus to search with "%s" option(s).' 
% ", ".join([i for i in search.keys() if i in only_parse]) ) elif corpus.datatype == "parse": if search.get("t"): searcher = slow_tregex elif search.get("s"): searcher = get_stats statsmode = True optiontext = "General statistics" global numdone numdone = 0 else: from corpkit.depsearch import dep_searcher searcher = dep_searcher optiontext = "Dependency querying" ############################################ # Set some Tregex-related values # ############################################ if search.get("t"): query = search.get("t") # check the query q = tregex_engine(corpus=False, query=search.get("t"), options=["-t"], check_query=True, root=root) if query is False: if root: return "Bad query" else: return optiontext = "Searching parse trees" if "p" in show or "pl" in show: translated_option = "u" if type(search["t"]) == list: search["t"] = r"__ < (/%s/ !< __)" % as_regex( search["t"], boundaries="line", case_sensitive=case_sensitive ) if search["t"] == "any": search["t"] = r"__ < (/.?[A-Za-z0-9].?/ !< __)" elif "t" in show: translated_option = "o" if type(search["t"]) == list: search["t"] = r"__ < (/%s/ !< __)" % as_regex( search["t"], boundaries="line", case_sensitive=case_sensitive ) if search["t"] == "any": search["t"] = r"__ < (/.?[A-Za-z0-9].?/ !< __)" elif "w" in show: translated_option = "t" if type(search["t"]) == list: search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive) if search["t"] == "any": search["t"] = r"/.?[A-Za-z0-9].?/ !< __" elif "c" in show: count_results = {} only_count = True translated_option = "C" if type(search["t"]) == list: search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive) if search["t"] == "any": search["t"] = r"/.?[A-Za-z0-9].?/ !< __" elif "l" in show: translated_option = "t" if type(search["t"]) == list: search["t"] = r"/%s/ !< __" % as_regex(search["t"], boundaries="line", case_sensitive=case_sensitive) if search["t"] == "any": search["t"] = r"/.?[A-Za-z0-9].?/ !< __" query = search["t"] ############################################ # Make iterable for corpus/subcorpus/file # ############################################ if corpus.singlefile: to_iterate_over = {(corpus.name, corpus.path): [corpus]} elif not corpus.subcorpora: to_iterate_over = {(corpus.name, corpus.path): corpus.files} else: to_iterate_over = {} for k, v in sorted(corpus.structure.items()): to_iterate_over[(k.name, k.path)] = v if files_as_subcorpora: to_iterate_over = {} for f in corpus.files: to_iterate_over[(f.name, f.path)] = [f] ############################################ # Print welcome message # ############################################ if conc: message = "Concordancing" else: message = "Interrogating" if kwargs.get("printstatus", True): thetime = strftime("%H:%M:%S", localtime()) sformat = "\n ".join(["%s: %s" % (k.rjust(3), v) for k, v in search.items()]) if search == {"s": r".*"}: sformat = "features" welcome = "\n%s: %s %s ...\n %s\n Query: %s\n" % ( thetime, message, corpus.name, optiontext, sformat, ) print welcome ############################################ # Make progress bar # ############################################ if simple_tregex_mode: total_files = len(to_iterate_over.keys()) else: if search.get("s"): total_files = sum([len(x) for x in to_iterate_over.values()]) * 12 else: total_files = sum([len(x) for x in to_iterate_over.values()]) par_args = {"printstatus": kwargs.get("printstatus", True), "root": root, "note": note, "length": total_files} term = None if 
kwargs.get("paralleling", None) is not None: from blessings import Terminal term = Terminal() par_args["terminal"] = term par_args["linenum"] = kwargs.get("paralleling") outn = kwargs.get("outname", "") if outn: outn = outn + ": " tstr = "%s%d/%d" % (outn, current_iter, total_files) p = animator(None, None, init=True, tot_string=tstr, **par_args) tstr = "%s%d/%d" % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) ############################################ # Iterate over data, doing interrogations # ############################################ for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()): if countmode or conc: results[subcorpus_name] = [] else: results[subcorpus_name] = Counter() # tregex over subcorpora, not files if simple_tregex_mode: op = ["-o", "-" + translated_option] result = tregex_engine( query=search["t"], options=op, corpus=subcorpus_path, root=root, preserve_case=preserve_case ) if countmode: results[subcorpus_name].append(result) continue result = Counter(format_tregex(result)) if conc: op.append("-w") whole_result = tregex_engine( query=search["t"], options=op, corpus=subcorpus_path, root=root, preserve_case=preserve_case ) if not only_format_match: whole_result = format_tregex(whole_result) result = make_conc_lines_from_whole_mid(whole_result, result, speakr=False) if spelling: for index, line in enumerate(result): result[index] = [correct_spelling(b) for b in line] results[subcorpus_name] += result current_iter += 1 if kwargs.get("paralleling", None) is not None: tstr = "%s%d/%d" % (outn, current_iter + 2, total_files) else: tstr = "%s%d/%d" % (outn, current_iter + 1, total_files) animator(p, current_iter, tstr, **par_args) # dependencies, plaintext, tokens or slow_tregex else: for f in files: if corpus.datatype == "parse": with open(f.path, "r") as data: data = data.read() from corenlp_xml.document import Document try: corenlp_xml = Document(data) except: print "Could not read file: %s" % f.path continue if just_speakers: sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers] if not sents: continue else: sents = corenlp_xml.sentences res = searcher( sents, search=search, show=show, dep_type=dep_type, exclude=exclude, excludemode=excludemode, searchmode=searchmode, lemmatise=False, case_sensitive=case_sensitive, concordancing=conc, only_format_match=only_format_match, ) if res == "Bad query": return "Bad query" if searcher == slow_tregex and not countmode: res = format_tregex(res) elif corpus.datatype == "tokens": import pickle with open(f.path, "rb") as fo: data = pickle.load(fo) res = searcher(search.values()[0], data, split_contractions=split_contractions, concordancing=conc) if conc: for index, line in enumerate(res): line.insert(0, "") elif corpus.datatype == "plaintext": with open(f.path, "rb") as data: data = data.read() data = unicode(data, errors="ignore") res = searcher(search.values()[0], data, concordancing=conc) if conc: for index, line in enumerate(res): line.insert(0, "") if countmode: results[subcorpus_name] += res continue # add filename and do lowercasing for conc if conc: for index, line in enumerate(res): line.insert(0, f.name) if not preserve_case: line = [b.lower() for b in line] if spelling: line = [correct_spelling(b) for b in line] results[subcorpus_name] += [line] # do lowercasing and spelling else: if not preserve_case: res = [r.lower() for r in res] if spelling: res = [correct_spelling(r) for r in res] results[subcorpus_name] += Counter(res) if not 
                if not statsmode:
                    current_iter += 1
                    if kwargs.get("paralleling", None) is not None:
                        tstr = "%s%d/%d" % (outn, current_iter + 2, total_files)
                    else:
                        tstr = "%s%d/%d" % (outn, current_iter + 1, total_files)
                    # assumed: advance the progress bar, as in the tregex branch
                    animator(p, current_iter, tstr, **par_args)

    # delete temp file if there
    import os
    if os.path.isfile("tmp.txt"):
        os.remove("tmp.txt")

    ############################################
    #    Get concordances into DataFrame       #
    ############################################

    if conc:
        all_conc_lines = []
        for sc_name, resu in sorted(results.items()):
            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            # make into series
            pindex = "c f s l m r".encode("utf-8").split()
            for fname, spkr, start, word, end in unique_results:
                spkr = unicode(spkr, errors="ignore")
                fname = os.path.basename(fname)
                # the use of ascii here makes sure the string formats ok,
                # but will also screw over anyone doing non-english work.
                # so, change to utf-8, then fix errors as they come in the
                # corpkit-gui "add_conc_lines_to_window" function
                all_conc_lines.append(
                    Series([sc_name.encode("ascii", errors="ignore"),
                            fname.encode("ascii", errors="ignore"),
                            spkr.encode("ascii", errors="ignore"),
                            start.encode("ascii", errors="ignore"),
                            word.encode("ascii", errors="ignore"),
                            end.encode("ascii", errors="ignore")],
                           index=pindex))

        # randomise results...
        if random:
            from random import shuffle
            shuffle(all_conc_lines)

        df = pd.concat(all_conc_lines, axis=1).T

        # not doing anything yet --- this is for multimodal concordancing
        add_links = False
        if not add_links:
            df.columns = ["c", "f", "s", "l", "m", "r"]
        else:
            df.columns = ["c", "f", "s", "l", "m", "r", "link"]

        if all(x == "" for x in list(df["s"].values)):
            df.drop("s", axis=1, inplace=True)

        if kwargs.get("note"):
            kwargs["note"].progvar.set(100)
        if kwargs.get("printstatus", True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = "\n\n%s: Concordancing finished! %d matches.\n" % (
                thetime, len(df.index))
            print(finalstring)

        from corpkit.interrogation import Concordance
        output = Concordance(df)
        output.query = locs
        if quicksave:
            output.save()
        return output

    ############################################
    #    Get interrogation into DataFrame      #
    ############################################

    else:
        if countmode:
            df = Series({k: sum(v) for k, v in sorted(results.items())})
            tot = df.sum()
        else:
            the_big_dict = {}
            unique_results = set([item for sublist in results.values()
                                  for item in sublist])
            for word in unique_results:
                the_big_dict[word] = [subcorp_result[word]
                                      for subcorp_result in sorted(results.values())]
            # turn master dict into dataframe, sorted
            df = DataFrame(the_big_dict, index=sorted(results.keys()))
            numentries = len(df.columns)
            tot = df.sum(axis=1)
            total_total = df.sum().sum()

        ############################################
        # Format, output as Interrogation object   #
        ############################################

        if not countmode:
            if not corpus.subcorpora or singlefile:
                if not files_as_subcorpora:
                    if not kwargs.get("df1_always_df"):
                        df = Series(df.ix[0])
                        df.sort(ascending=False)
                        tot = df.sum()
                        numentries = len(df.index)
                        total_total = tot

        # sort by total
        if type(df) == pd.core.frame.DataFrame:
            if not df.empty:
                df.ix["Total-tmp"] = df.sum()
                the_tot = df.ix["Total-tmp"]
                df = df[the_tot.argsort()[::-1]]
                df = df.drop("Total-tmp", axis=0)

        # format final string
        if kwargs.get("printstatus", True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = "\n\n%s: Interrogation finished!" % thetime
            if countmode:
                finalstring += " %d matches." % tot
            else:
                finalstring += (" %d unique results, %d total occurrences."
                                % (numentries, total_total))
            print(finalstring)

        interro = Interrogation(results=df, totals=tot, query=locs)
        if quicksave:
            interro.save()
        return interro
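
# ---------------------------------------------------------------------------
# Usage sketch (illustrative only, never executed here). This assumes the
# enclosing function is corpkit's interrogator(corpus, search=..., show=...)
# and that a parsed corpus exists at the hypothetical path below.
# ---------------------------------------------------------------------------
def _example_interrogation():
    """A minimal sketch of consuming the returned Interrogation object."""
    from corpkit import Corpus
    corpus = Corpus("data/example-parsed")  # hypothetical corpus path
    # search parse trees ('t') for plural nouns, showing the matched words
    interro = interrogator(corpus, search={"t": r"/NNS?/ < __"}, show=["w"])
    print(interro.results.head())  # DataFrame: subcorpora as rows
    print(interro.totals)          # Series: per-subcorpus match totals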
def get_stats(sents, **dummy_args):
    """get a bunch of frequencies on interpersonal phenomena"""
    # note: relies on names from the enclosing interrogation scope
    # (kwargs, dep_type, root, total_files, p, par_args, denom, startnum)
    import os
    import re
    from collections import Counter
    global numdone
    statsmode_results = Counter()

    # first, put the relevant trees into temp file
    if kwargs.get("outname"):
        to_open = "tmp-%s.txt" % kwargs["outname"]
    else:
        to_open = "tmp.txt"
    with open(to_open, "w") as fo:
        for sent in sents:
            statsmode_results["Sentences"] += 1
            sts = sent.parse_string.rstrip()
            encd = sts.encode("utf-8", errors="ignore") + "\n"
            fo.write(encd)
            deps = get_deps(sent, dep_type)
            numpass = len([x for x in deps.links if x.type.endswith("pass")])
            statsmode_results["Passives"] += numpass
            statsmode_results["Tokens"] += len(sent.tokens)
            words = [w.word for w in sent.tokens if w.word.isalnum()]
            statsmode_results["Words"] += len(words)
            statsmode_results["Characters"] += len("".join(words))

    # count moods via trees (/\?/ !< __)
    from dictionaries.process_types import processes
    from corpkit.other import as_regex
    tregex_qs = {
        "Imperative": r"ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/",
        "Open interrogative": r"ROOT < SBARQ <<- (/\?/ !< __)",
        "Closed interrogative": r"ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))",
        "Unmodalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))",
        "Modalised declarative": r"ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))",
        "Open class words": r"/^(NN|JJ|VB|RB)/ < __",
        "Closed class words": r"__ !< __ !> /^(NN|JJ|VB|RB)/",
        "Clauses": r"/^S/ < __",
        "Interrogative": r"ROOT << (/\?/ !< __)",
        "Mental processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)"
                            % as_regex(processes.mental, boundaries="w"),
        "Verbal processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)"
                            % as_regex(processes.verbal, boundaries="w"),
        "Relational processes": r"VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)"
                                % as_regex(processes.relational, boundaries="w"),
    }

    for name, q in sorted(tregex_qs.items()):
        res = tregex_engine(query=q, options=["-o", "-C"],
                            corpus=to_open, root=root)
        statsmode_results[name] += int(res)
        numdone += 1
        if root:
            root.update()
        else:
            tot_string = str(numdone + 1) + "/" + str(total_files)
            if kwargs.get("outname"):
                tot_string = "%s: %s" % (kwargs["outname"], tot_string)
            animator(p, numdone, tot_string, **par_args)
        if kwargs.get("note", False):
            kwargs["note"].progvar.set(
                (numdone * 100.0 / total_files / denom) + startnum)
    os.remove(to_open)
    return statsmode_results
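
# ---------------------------------------------------------------------------
# Sketch of the counting pattern get_stats relies on (illustrative, never
# called): with the '-o' and '-C' options, tregex_engine returns a count of
# matches rather than the matches themselves, so each named query costs one
# Tregex run over the temp file of trees.
# ---------------------------------------------------------------------------
def _example_stats_count(tree_file, root=None):
    """A minimal sketch, reusing two of the queries defined above."""
    from collections import Counter
    counts = Counter()
    queries = {"Clauses": r"/^S/ < __",
               "Interrogative": r"ROOT << (/\?/ !< __)"}
    for name, q in sorted(queries.items()):
        res = tregex_engine(query=q, options=["-o", "-C"],
                            corpus=tree_file, root=root)
        counts[name] += int(res)
    return counts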
def dep_searcher(sents,
                 search,
                 show='w',
                 dep_type='collapsed-ccprocessed-dependencies',
                 regex_nonword_filter=r'[A-Za-z0-9:_]',
                 concordancing=False,
                 exclude=False,
                 excludemode='any',
                 searchmode='all',
                 lemmatise=False,
                 case_sensitive=False,
                 progbar=False,
                 only_format_match=False):
    """
    Search corenlp dependency parse:

    1. search for 'search' keyword arg:
       governor, dependent, function, pos, lemma, word, index, etc.
    2. exclude entries if need be, using same method as search
    3. return '/'-sep list of 'show' keyword arg, or conc lines:
       governor, dependent, function, pos, lemma, word, index, distance, etc.
       ... or just return int count.
    """
    import re
    from corenlp_xml.document import Document
    from collections import Counter
    from corpkit.build import flatten_treestring
    from corpkit.process import filtermaker, animator, get_deps

    def distancer(lks, lk):
        "determine number of jumps to root"
        c = 0
        # get the gov index, stop when it's zero
        root_found = False
        while not root_found:
            if c == 0:
                try:
                    link_to_check = next(i for i in lks
                                         if i.dependent.idx == lk.id)
                except StopIteration:
                    root_found = True
                    break
            gov_index = link_to_check.governor.idx
            if gov_index == 0:
                root_found = True
            else:
                if c > 29:
                    root_found = True
                    break
                link_to_check = [l for l in lks if l.dependent.idx == gov_index]
                if len(link_to_check) > 0:
                    link_to_check = link_to_check[0]
                else:
                    break
                c += 1
        if c < 30:
            return c

    def get_matches_from_sent(s, search, deps=False, tokens=False,
                              dep_type='basic-dependencies', mode='all'):
        """process a sentence object, returning matching tok ids"""
        from corpkit.process import get_deps
        import re
        lks = []
        if not deps:
            deps = get_deps(s, dep_type)
        if not tokens:
            tokens = s.tokens

        # normalise search values: lists become regexes, dicts are
        # flattened, and 'any' becomes a match-all pattern
        for opt, pat in search.items():
            if isinstance(pat, list):
                if all(isinstance(x, int) for x in pat):
                    pat = [str(x) for x in pat]
                pat = filtermaker(pat, case_sensitive=case_sensitive)
                search[opt] = pat
            if isinstance(pat, dict):
                del search[opt]
                for k, v in pat.items():
                    if k != 'w':
                        search[opt + k] = v
                    else:
                        search[opt] = v
            if isinstance(pat, str) and pat.lower() == 'any':
                search[opt] = re.compile(r'.*')

        for opt, pat in search.items():
            if opt == 'g':
                got = []
                for l in deps.links:
                    if re.search(pat, l.governor.text):
                        got.append(s.get_token_by_id(l.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gf':
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        gov_index = l.dependent.idx
                        for l2 in deps.links:
                            if l2.governor.idx == gov_index:
                                got.append(s.get_token_by_id(l2.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'df':
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        got.append(s.get_token_by_id(l.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gl':
                got = []
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        for i in deps.links:
                            if i.governor.idx == tok.id:
                                got.append(s.get_token_by_id(i.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gp':
                got = []
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        for i in deps.links:
                            if i.governor.idx == tok.id:
                                got.append(s.get_token_by_id(i.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'dl':
                got = []
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        for i in deps.links:
                            if i.dependent.idx == tok.id:
                                got.append(s.get_token_by_id(i.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'dp':
                got = []
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        for i in deps.links:
                            if i.dependent.idx == tok.id:
                                got.append(s.get_token_by_id(i.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'd':
                got = []
                for l in deps.links:
                    if re.search(pat, l.dependent.text):
                        got.append(s.get_token_by_id(l.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'f':
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        got.append(s.get_token_by_id(l.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'p':
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        lks.append(tok)
            elif opt == 'pl':
                for tok in tokens:
                    from dictionaries.word_transforms import taglemma
                    postag = tok.pos
                    if postag.lower() in taglemma.keys():
                        stemmedtag = taglemma[postag.lower()]
                    else:
                        stemmedtag = postag.lower()
                    if re.search(pat, stemmedtag):
                        lks.append(tok)
            elif opt == 'l':
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        lks.append(tok)
            elif opt == 'w':
                for tok in tokens:
                    if re.search(pat, tok.word):
                        lks.append(tok)
            elif opt == 'i':
                for tok in tokens:
                    if re.search(pat, str(tok.id)):
                        lks.append(tok)
            elif opt == 'r':
                for tok in tokens:
                    dist = distancer(deps.links, tok)
                    if dist is not None and dist is not False:
                        try:
                            if int(dist) == int(pat):
                                lks.append(tok)
                        except TypeError:
                            if re.search(pat, str(dist)):
                                lks.append(tok)

        if mode == 'all':
            from collections import Counter
            counted = Counter(lks)
            lks = [k for k, v in sorted(counted.items())
                   if v >= len(search.keys())]
        return lks

    result = []
    numdone = 0
    for s in sents:
        numdone += 1
        deps = get_deps(s, dep_type)
        tokens = s.tokens
        lks = get_matches_from_sent(s, search, deps, tokens,
                                    dep_type, mode=searchmode)
        if not concordancing:
            lks = list(set([x for x in lks
                            if x and re.search(regex_nonword_filter, x.word)]))
        if exclude is not False:
            to_remove = get_matches_from_sent(s, exclude, deps, tokens,
                                              dep_type, mode=excludemode)
            for i in to_remove:
                try:
                    lks.remove(i)
                except ValueError:
                    pass

        if progbar:
            tstr = '%d/%d' % (numdone, len(sents))
            animator(progbar, numdone, tstr)

        if 'c' in show:
            result.append(len(lks))
            continue

        if concordancing:
            # for each concordance middle part
            for lk in lks:
                one_result = []
                if not lk:
                    continue
                # get the index of the match
                windex = int(lk.id) - 1
                speakr = s.speakername
                if not speakr:
                    speakr = ''
                # begin building line with speaker first
                conc_line = [speakr]
                # format a single word correctly
                if only_format_match:
                    start = ' '.join([t.word for index, t in enumerate(s.tokens)
                                      if index < windex])
                    end = ' '.join([t.word for index, t in enumerate(s.tokens)
                                    if index > windex])
                    s.tokens = [s.get_token_by_id(lk.id)]
                for tok in s.tokens:
                    single_wd = {}
                    intermediate_result = []
                    if 'w' in show:
                        single_wd['w'] = tok.word
                    if 'l' in show:
                        from dictionaries.word_transforms import wordlist
                        if tok.lemma in wordlist.keys():
                            lem = wordlist[tok.lemma]
                        else:
                            lem = tok.lemma
                        single_wd['l'] = lem
                    if 'p' in show:
                        single_wd['p'] = tok.pos
                    if 'pl' in show:
                        from dictionaries.word_transforms import taglemma
                        postag = tok.pos
                        if postag.lower() in taglemma.keys():
                            single_wd['pl'] = taglemma[postag.lower()]
                        else:
                            single_wd['pl'] = postag.lower()
                        if not single_wd['pl']:
                            single_wd['pl'] = 'none'
                    if 'r' in show:
                        all_lks = [l for l in deps.links]
                        distance = distancer(all_lks, tok)
                        if distance:
                            single_wd['r'] = str(distance)
                        else:
                            single_wd['r'] = '0'
                    if 'f' in show:
                        # default to '.' until a dependency for this token is found
                        single_wd['f'] = '.'
                        for lin in deps.links:
                            if tok.id == lin.dependent.idx:
                                single_wd['f'] = lin.type
                                break
                    if 'i' in show:
                        single_wd['i'] = str(tok.id)
                    for i in show:
                        intermediate_result.append(single_wd[i])
                    intermediate_result = [i.replace('/', '-slash-')
                                           for i in intermediate_result]
                    one_result.append('/'.join(intermediate_result))

                # now we have formatted tokens as a list. we need to split
                # it into start, middle and end
                if not only_format_match:
                    start = ' '.join([w for index, w in enumerate(one_result)
                                      if index < windex])
                    end = ' '.join([w for index, w in enumerate(one_result)
                                    if index > windex])
                    middle = one_result[windex]
                else:
                    middle = one_result[0]
                for bit in start, middle, end:
                    conc_line.append(bit)
                result.append(conc_line)

        else:
            # figure out what to show
            for lk in lks:
                single_result = {}
                if not lk:
                    continue
                if 'w' in show:
                    single_result['w'] = 'none'
                    if lemmatise:
                        single_result['w'] = lk.lemma
                    else:
                        single_result['w'] = lk.word
                if 'l' in show:
                    from dictionaries.word_transforms import wordlist
                    if lk.lemma in wordlist.keys():
                        lem = wordlist[lk.lemma]
                    else:
                        lem = lk.lemma
                    single_result['l'] = lem
                if 'p' in show:
                    single_result['p'] = 'none'
                    postag = lk.pos
                    if lemmatise:
                        from dictionaries.word_transforms import taglemma
                        if postag.lower() in taglemma.keys():
                            single_result['p'] = taglemma[postag.lower()]
                        else:
                            single_result['p'] = postag.lower()
                    else:
                        single_result['p'] = postag
                    if not single_result['p']:
                        single_result['p'] = 'none'
                if 'pl' in show:
                    single_result['pl'] = 'none'
                    postag = lk.pos
                    from dictionaries.word_transforms import taglemma
                    if postag.lower() in taglemma.keys():
                        single_result['pl'] = taglemma[postag.lower()]
                    else:
                        single_result['pl'] = postag.lower()
                    if not single_result['pl']:
                        single_result['pl'] = 'none'
                if 'f' in show:
                    single_result['f'] = 'none'
                    for i in deps.links:
                        if i.dependent.idx == lk.id:
                            single_result['f'] = i.type.rstrip(',')
                            break
                    if single_result['f'] == '':
                        single_result['f'] = 'root'
                if 'g' in show:
                    single_result['g'] = 'none'
                    for i in deps.links:
                        if i.dependent.idx == lk.id:
                            if s.get_token_by_id(i.governor.idx):
                                if lemmatise:
                                    single_result['g'] = s.get_token_by_id(i.governor.idx).lemma
                                else:
                                    single_result['g'] = i.governor.text
                            else:
                                single_result['g'] = 'root'
                            break
                if 'd' in show:
                    single_result['d'] = 'none'
                    for i in deps.links:
                        if i.governor.idx == lk.id:
                            if s.get_token_by_id(i.dependent.idx):
                                if lemmatise:
                                    single_result['d'] = s.get_token_by_id(i.dependent.idx).lemma
                                else:
                                    single_result['d'] = i.dependent.text
                            break
                if 'gl' in show:
                    single_result['gl'] = 'none'
                    for i in deps.links:
                        if i.dependent.idx == lk.id:
                            if s.get_token_by_id(i.governor.idx):
                                single_result['gl'] = s.get_token_by_id(i.governor.idx).lemma
                            else:
                                single_result['gl'] = 'root'
                            break
                if 'dl' in show:
                    single_result['dl'] = 'none'
                    for i in deps.links:
                        if i.governor.idx == lk.id:
                            if s.get_token_by_id(i.dependent.idx):
                                single_result['dl'] = s.get_token_by_id(i.dependent.idx).lemma
                            break
                if 'gp' in show:
                    single_result['gp'] = 'none'
                    for i in deps.links:
                        if i.dependent.idx == lk.id:
                            if s.get_token_by_id(i.governor.idx):
                                single_result['gp'] = s.get_token_by_id(i.governor.idx).pos
                            break
                if 'dp' in show:
                    single_result['dp'] = 'none'
                    for i in deps.links:
                        if i.governor.idx == lk.id:
                            if s.get_token_by_id(i.dependent.idx):
                                single_result['dp'] = s.get_token_by_id(i.dependent.idx).pos
                            break
                if 'df' in show:
                    single_result['df'] = 'none'
                    for i in deps.links:
                        if i.governor.idx == lk.id:
                            single_result['df'] = i.type
                            break
                if 'gf' in show:
                    single_result['gf'] = 'none'
                    for i in deps.links:
                        # if the result is the dependent, get the governor,
                        # find where it is a dependent, then get the type
                        if i.dependent.idx == lk.id:
                            gv = next(x for x in deps.links
                                      if x.dependent.idx == i.governor.idx)
                            single_result['gf'] = gv.type
                            break
                if 'r' in show:
                    all_lks = [l for l in deps.links]
                    distance = distancer(all_lks, lk)
                    if distance is not False and distance is not None:
                        single_result['r'] = str(distance)
                if 'i' in show:
                    single_result['i'] = str(lk.id)

                if 'c' not in show:
                    # add them in order
                    out = []
                    for i in show:
                        out.append(single_result[i])
                    out = [i.replace('/', '-slash-') for i in out]
                    result.append('/'.join(out))

    if 'c' in show:
        result = sum(result)

    return result
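
# ---------------------------------------------------------------------------
# Usage sketch (illustrative, never called by this module): dep_searcher
# takes corenlp_xml sentence objects and a search dict keyed by the
# single-letter options handled above. The XML string here is assumed to be
# CoreNLP output read elsewhere.
# ---------------------------------------------------------------------------
def _example_dep_search(corenlp_xml_string):
    """A minimal sketch: lemmas of tokens whose function matches nsubj."""
    from corenlp_xml.document import Document
    doc = Document(corenlp_xml_string)
    # 'f' matches against the dependency type; show='l' returns lemmas
    return dep_searcher(doc.sentences, search={'f': r'^nsubj$'}, show='l')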