Example #1
def dep_searcher(sents,
                 search,
                 show = 'w',
                 dep_type = 'collapsed-ccprocessed-dependencies',
                 regex_nonword_filter = r'[A-Za-z0-9:_]',
                 do_concordancing = False,
                 exclude = False,
                 excludemode = 'any',
                 searchmode = 'all',
                 lemmatise = False,
                 case_sensitive = False,
                 progbar = False,
                 only_format_match = False,
                 speaker = False):
    """
    Search a CoreNLP dependency parse.

    1. Search using the 'search' keyword arg, whose keys may target:
       governor, dependent, function, pos, lemma, word, index, etc.

    2. Exclude entries if need be, using the same method as search.

    3. Return a '/'-separated list of the 'show' keyword arg (governor,
       dependent, function, pos, lemma, word, index, distance, etc.),
       or concordance lines, or just an int count.
    """
    import re
    from corenlp_xml.document import Document
    from collections import Counter
    from build import flatten_treestring
    from process import filtermaker, animator, get_deps

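    # A minimal usage sketch (hypothetical filename; sentence objects come
    # from corenlp_xml, as imported above):
    #
    #   from corenlp_xml.document import Document
    #   doc = Document(open('parsed.xml').read())
    #   res, conc = dep_searcher(doc.sentences,
    #                            search = {'f': r'^nsubj'},
    #                            show = ['w', 'p'])
    #   # res is a list like ['dog/NN', ...], one '/'-joined string per match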
    def distancer(lks, lk):
        "determine number of jumps to root"      
        c = 0
        # get the gov index, stop when it's zero
        root_found = False
        while not root_found:
            if c == 0:
                try:
                    link_to_check = next(i for i in lks if i.dependent.idx == lk.id)
                except StopIteration:
                    root_found = True
                    break
                #link_to_check = lk
            gov_index = link_to_check.governor.idx
            if gov_index == 0:
                root_found = True
            else:
                if c > 29:
                    root_found = True
                    break
                link_to_check = [l for l in lks if l.dependent.idx == gov_index]
                if len(link_to_check) > 0:
                    link_to_check = link_to_check[0]
                else:
                    break
                c += 1
        if c < 30:
            return c
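    # Worked example: in "The dog barks", 'barks' is the root (0 jumps),
    # 'dog' is governed by the root (1 jump), and 'The' is governed by
    # 'dog' (2 jumps). Tokens more than 30 jumps deep return None.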

    def get_matches_from_sent(s, search, deps = False, tokens = False, 
        dep_type = 'basic-dependencies', mode = 'all'):
        """process a sentence object, returning matching tok ids"""
        from process import get_deps
        import re
        lks = []
        if not deps:
            deps = get_deps(s, dep_type)
        if not tokens:
            tokens = s.tokens

        for opt, pat in list(search.items()):
            if type(pat) == list:
                if all(type(x) == int for x in pat):
                    pat = [str(x) for x in pat]
                pat = filtermaker(pat, case_sensitive = case_sensitive)
                search[opt] = pat
            if type(pat) == dict:
                del search[opt]
                for k, v in list(pat.items()):
                    if k != 'w':
                        search[opt + k] = v
                    else:
                        search[opt] = v
            if pat == 'any':
                search[opt] = re.compile(r'.*')

        for opt, pat in list(search.items()):
            if opt == 'g':
                got = []
                for l in deps.links:
                    if re.search(pat, l.governor.text):
                        got.append(s.get_token_by_id(l.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gf':
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        gov_index = l.dependent.idx
                        for l2 in deps.links:
                            if l2.governor.idx == gov_index:
                                got.append(s.get_token_by_id(l2.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'df':
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        got.append(s.get_token_by_id(l.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gl':
                got = []
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        for i in deps.links:
                            if i.governor.idx == tok.id:
                                got.append(s.get_token_by_id(i.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gp':
                got = []
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        for i in deps.links:
                            if i.governor.idx == tok.id:
                                got.append(s.get_token_by_id(i.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'dl':
                got = []
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        for i in deps.links:
                            if i.dependent.idx == tok.id:
                                got.append(s.get_token_by_id(i.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'dp':
                got = []
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        for i in deps.links:
                            if i.dependent.idx == tok.id:
                                got.append(s.get_token_by_id(i.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)

            elif opt == 'd':
                got = []
                for l in deps.links:
                    if re.search(pat, l.dependent.text):
                        got.append(s.get_token_by_id(l.governor.idx))

                got = set(got)
                for i in got:
                    lks.append(i)

            elif opt == 'f':
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        got.append(s.get_token_by_id(l.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'p':
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        lks.append(tok)
            elif opt == 'pl':
                for tok in tokens:
                    from dictionaries.word_transforms import taglemma
                    postag = tok.pos
                    if postag.lower() in list(taglemma.keys()):
                        stemmedtag = taglemma[postag.lower()]
                    else:
                        stemmedtag = postag.lower()
                    if re.search(pat, stemmedtag):
                        lks.append(tok)
            elif opt == 'l':
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        lks.append(tok)
            elif opt == 'w':
                for tok in tokens:
                    if re.search(pat, tok.word):
                        lks.append(tok)
            elif opt == 'i':
                for tok in tokens:
                    if re.search(pat, str(tok.id)):
                        lks.append(tok)
            elif opt == 'r':
                # 'r' matches on distance to root: pat may be an int or a regex
                for tok in tokens:
                    dist = distancer(deps.links, tok)
                    if dist is not None and dist is not False:
                        try:
                            if int(dist) == int(pat):
                                lks.append(tok)
                        except TypeError:
                            if re.search(pat, str(dist)):
                                lks.append(tok)
        if mode == 'all':
            from collections import Counter
            counted = Counter(lks)
            lks = [k for k, v in counted.items() if v >= len(list(search.keys()))]
        return lks
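    # For example, search = {'f': r'^nsubj', 'p': r'^NN'} with mode 'all'
    # keeps only tokens matched by *both* keys (nominal subjects), while
    # mode 'any' keeps every token matched by either key.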

    result = []
    conc_result = []
    numdone = 0

    for s in sents:
        numdone += 1
        deps = get_deps(s, dep_type)
        tokens = s.tokens
        lks = get_matches_from_sent(s, search, deps, tokens, dep_type, mode = searchmode)

        #if not concordancing:
        #    lks = list(set([x for x in lks if x and re.search(regex_nonword_filter, x.word)]))

        if exclude is not False:
            to_remove = get_matches_from_sent(s, exclude, deps, tokens, dep_type, mode = excludemode)

            for i in to_remove:
                try:
                    lks.remove(i)
                except ValueError:
                    pass

        if progbar:
            tstr = '%d/%d' % (numdone, len(sents))
            animator(progbar, numdone, tstr)

        if 'c' in show:
            result.append(len(lks))
            continue

        if do_concordancing:
            for lk in lks: # for each concordance middle part
                one_result = []
                if not lk:
                    continue
                # get the index of the match
                windex = int(lk.id) - 1
                speakr = s.speakername
                if not speakr:
                    speakr = ''
                # begin building line with speaker first
                conc_line = [speakr]
                # format a single word correctly
                if only_format_match:
                    start = ' '.join([t.word for index, t in enumerate(s.tokens) if index < windex])
                    end = ' '.join([t.word for index, t in enumerate(s.tokens) if index > windex])
                    # use a local list rather than mutating s.tokens, which
                    # would corrupt later matches in the same sentence
                    toks = [s.get_token_by_id(lk.id)]
                else:
                    toks = s.tokens
                for tok in toks:
                    single_wd = {}
                    intermediate_result = []
                    if 'w' in show:
                        single_wd['w'] = tok.word
                    if 'l' in show:
                        from dictionaries.word_transforms import wordlist
                        if tok.lemma in list(wordlist.keys()):
                            lem = wordlist[tok.lemma]
                        else:
                            lem = tok.lemma
                        single_wd['l'] = lem
                    if 'p' in show:
                        single_wd['p'] = tok.pos

                    if 'pl' in show:
                        from dictionaries.word_transforms import taglemma
                        postag = tok.pos
                        if postag.lower() in list(taglemma.keys()):
                            single_wd['pl'] = taglemma[postag.lower()]
                        else:
                            single_wd['pl'] = postag.lower()
                        if not single_wd['pl']:
                            single_wd['pl'] = 'none'

                    if 'r' in show:
                        all_lks = [l for l in deps.links]
                        distance = distancer(all_lks, tok)
                        if distance:
                            single_wd['r'] = str(distance)
                        else:
                            single_wd['r'] = '0'
                    if 'f' in show:
                        single_wd['f'] = '.'
                        for lin in deps.links:
                            if tok.id == lin.dependent.idx:
                                single_wd['f'] = lin.type
                                break
                    if 'i' in show:
                        single_wd['i'] = str(tok.id)

                    if any(x.startswith('g') for x in show):
                        thegovid = next((q.governor.idx for q in deps.links \
                                        if q.dependent.idx == tok.id), False)
                        govtok = False
                        if thegovid is not False:
                            govtok = s.get_token_by_id(thegovid)
                            
                        if 'g' in show:
                            if govtok:
                                single_wd['g'] = govtok.word
                            else:
                                single_wd['g'] = 'none'
                        if 'gl' in show:
                            if govtok:
                                single_wd['gl'] = govtok.lemma
                            else: 
                                single_wd['gl'] = 'none'
                        if 'gp' in show:
                            if govtok:
                                single_wd['gp'] = govtok.pos
                            else: 
                                single_wd['gp'] = 'none'

                        if 'gf' in show:
                            if govtok:
                                single_wd['gf'] = next((x.type for x in deps.links \
                                            if x.dependent.idx == thegovid), 'root')
                            else: 
                                single_wd['gf'] = 'none'

                    if any(x.startswith('d') for x in show):
                        thedepid = next((q.dependent.idx for q in deps.links \
                                        if q.governor.idx == tok.id), False)

                        deptok = False
                        if thedepid is not False:
                            deptok = s.get_token_by_id(thedepid)

                        if 'd' in show:
                            if thedepid:
                                single_wd['d'] = deptok.word
                            else: 
                                single_wd['d'] = 'none'

                        if 'dl' in show:
                            if thedepid:
                                single_wd['dl'] = deptok.lemma
                            else: 
                                single_wd['dl'] = 'none'
                        if 'dp' in show:
                            if thedepid:
                                single_wd['dp'] = deptok.pos
                            else: 
                                single_wd['dp'] = 'none'
                        if 'df' in show:
                            if thedepid:
                                single_wd['df'] = next((x.type for x in deps.links \
                                    if x.dependent.idx == thedepid), 'none')
                            else: 
                                single_wd['df'] = 'none'
                    for i in show:
                        intermediate_result.append(single_wd[i])
                    intermediate_result = [i.replace('/', '-slash-').encode('utf-8', errors = 'ignore') for i in intermediate_result]
                    one_result.append('/'.join(intermediate_result))
                # now we have formatted tokens as a list. we need to split
                # it into start, middle and end
                if not only_format_match:
                    start = ' '.join([w for index, w in enumerate(one_result) if index < windex])
                    end = ' '.join([w for index, w in enumerate(one_result) if index > windex])
                    middle = one_result[windex]
                else:
                    middle = one_result[0]

                for bit in start, middle, end:
                    conc_line.append(bit)
                conc_result.append(conc_line)

        # figure out what to show
        for lk in lks:
            single_result = {}
            if not lk:
                continue
            if 'w' in show:
                single_result['w'] = 'none'
                if lemmatise:
                    single_result['w'] = lk.lemma
                else:
                    single_result['w'] = lk.word
            if 'l' in show:
                from dictionaries.word_transforms import wordlist
                if lk.lemma in list(wordlist.keys()):
                    lem = wordlist[lk.lemma]
                else:
                    lem = lk.lemma
                single_result['l'] = lem
            if 'p' in show:
                single_result['p'] = 'none'
                postag = lk.pos
                if lemmatise:
                    from dictionaries.word_transforms import taglemma
                    if postag.lower() in list(taglemma.keys()):
                        single_result['p'] = taglemma[postag.lower()]
                    else:
                        single_result['p'] = postag.lower()
                else:
                    single_result['p'] = postag
                if not single_result['p']:
                    single_result['p'] = 'none'

            if 'pl' in show:
                single_result['pl'] = 'none'
                postag = lk.pos
                from dictionaries.word_transforms import taglemma
                if postag.lower() in list(taglemma.keys()):
                    single_result['pl'] = taglemma[postag.lower()]
                else:
                    single_result['pl'] = postag.lower()
                if not single_result['pl']:
                    single_result['pl'] = 'none'

            if 'f' in show:
                single_result['f'] = 'none'
                for i in deps.links:
                    if i.dependent.idx == lk.id:
                        single_result['f'] = i.type.rstrip(',')
                        break
                if single_result['f'] == '':
                    single_result['f'] = 'root'

            if 'g' in show:
                single_result['g'] = 'none'
                for i in deps.links:
                    if i.dependent.idx == lk.id:
                        if s.get_token_by_id(i.governor.idx):
                            if lemmatise:
                                single_result['g'] = s.get_token_by_id(i.governor.idx).lemma
                            else:
                                single_result['g'] = i.governor.text
                        else:
                            single_result['g'] = 'root'
                        break

            if 'd' in show:
                single_result['d'] = 'none'
                for i in deps.links:
                    if i.governor.idx == lk.id:
                        if s.get_token_by_id(i.dependent.idx):       
                            if lemmatise:
                                single_result['d'] = s.get_token_by_id(i.dependent.idx).lemma
                            else:
                                single_result['d'] = i.dependent.text
                        break

            if 'gl' in show:
                single_result['gl'] = 'none'
                for i in deps.links:
                    if i.dependent.idx == lk.id:
                        if s.get_token_by_id(i.governor.idx):
                            single_result['gl'] = s.get_token_by_id(i.governor.idx).lemma
                        else:
                            single_result['gl'] = 'root'
                        break

            if 'dl' in show:
                single_result['dl'] = 'none'
                for i in deps.links:
                    if i.governor.idx == lk.id:
                        if s.get_token_by_id(i.dependent.idx):       
                            single_result['dl'] = s.get_token_by_id(i.dependent.idx).lemma
                        break

            if 'gp' in show:
                single_result['gp'] = 'none'
                for i in deps.links:
                    if i.dependent.idx == lk.id:
                        if s.get_token_by_id(i.governor.idx):       
                            single_result['gp'] = s.get_token_by_id(i.governor.idx).pos
                        break

            if 'dp' in show:
                single_result['dp'] = 'none'
                for i in deps.links:
                    if i.governor.idx == lk.id:
                        if s.get_token_by_id(i.dependent.idx):       
                            single_result['dp'] = s.get_token_by_id(i.dependent.idx).pos
                        break

            if 'df' in show:
                single_result['df'] = 'none'
                for i in deps.links:
                    if i.governor.idx == lk.id:
                        single_result['df'] = i.type
                        break  

            if 'gf' in show:
                single_result['gf'] = 'none'
                for i in deps.links:
                    # if the result is the dependent, get the governor, find where
                    # it is a dependent, then gt the type
                    if i.dependent.idx == lk.id:
                        gv = next((x for x in deps.links if x.dependent.idx == i.governor.idx), None)
                        single_result['gf'] = gv.type if gv else 'root'
                        break                

            if 'r' in show:
                all_lks = [l for l in deps.links]
                distance = distancer(all_lks, lk)
                if distance is not False and distance is not None:
                    single_result['r'] = str(distance)
                else:
                    single_result['r'] = '-1'

            if 'i' in show:
                single_result['i'] = str(lk.id)

            if 'c' not in show:
                
                # add them in order
                out = []
                for i in show:
                    out.append(single_result[i])

                out = [i.replace('/', '-slash-') for i in out]
                result.append('/'.join(out))
    
    if 'c' in show:
        result = sum(result)

    if type(do_concordancing) == str and do_concordancing.lower() == 'only':
        result = []
    return result, conc_result
Example #2
def interrogator(corpus, 
            search, 
            query = 'any', 
            show = 'w',
            exclude = False,
            excludemode = 'any',
            searchmode = 'all',
            dep_type = 'collapsed-ccprocessed-dependencies',
            case_sensitive = False,
            quicksave = False,
            just_speakers = False,
            preserve_case = False,
            lemmatag = False,
            files_as_subcorpora = False,
            only_unique = False,
            random = False,
            only_format_match = False,
            multiprocess = False,
            spelling = False,
            regex_nonword_filter = r'[A-Za-z0-9:_]',
            gramsize = 2,
            split_contractions = False,
            do_concordancing = False,
            maxconc = 9999,
            **kwargs):
    """interrogate corpus, corpora, subcorpus and file objects

    see corpkit.interrogation.interrogate() for docstring"""

    only_conc = False
    no_conc = False
    if do_concordancing is False:
        no_conc = True
    if type(do_concordancing) == str and do_concordancing.lower() == 'only':
        only_conc = True
        no_conc = False

    # iteratively count conc lines
    numconc = 0

    # store kwargs
    locs = locals()
    
    if kwargs:
        for k, v in kwargs.items():
            locs[k] = v
        locs.pop('kwargs', None)

    import corpkit
    from interrogation import Interrogation
    from process import tregex_engine
    import pandas as pd
    from pandas import DataFrame, Series
    from collections import Counter
    from other import as_regex
    from process import get_deps
    from time import localtime, strftime
    from textprogressbar import TextProgressBar
    from process import animator
    from dictionaries.word_transforms import wordlist, taglemma
    import corenlp_xml
    import codecs
    import signal

    original_sigint = signal.getsignal(signal.SIGINT)

    if kwargs.get('paralleling', None) is None:
        
        def signal_handler(sig, frame):
            """pause on ctrl+c, rather than just stop loop"""
            import signal
            import sys
            from time import localtime, strftime
            signal.signal(signal.SIGINT, original_sigint)
            thetime = strftime("%H:%M:%S", localtime())
            try:
                sel = raw_input('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime)
            except NameError:
                sel = input('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime)
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Interrogation resumed.\n' % thetime)
            signal.signal(signal.SIGINT, signal_handler)

        signal.signal(signal.SIGINT, signal_handler)

    # find out if using gui
    root = kwargs.get('root')
    note = kwargs.get('note')

    # convert path to corpus object
    if type(corpus) == str:
        from corpus import Corpus
        corpus = Corpus(corpus)

    # figure out how the user has entered the query and normalise
    from process import searchfixer
    search, search_iterable = searchfixer(search, query)
    
    # for better printing of query, esp during multiprocess
    # can remove if multiprocess printing improved
    if len(list(search.keys())) == 1:
        query = list(search.values())[0]

    if 'l' in show and search.get('t'):
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr=WordNetLemmatizer()

    if type(show) == str:
        show = [show]

    def is_multiquery(corpus, search, query, just_speakers):
        """determine if multiprocessing is needed
        do some retyping if need be as well"""
        im = False
        from collections import OrderedDict
        if hasattr(corpus, '__iter__'):
            im = True
        # so we can do search = 't', query = ['NP', 'VP']:
        if type(query) == list:
            if query != list(search.values())[0] or len(list(search.keys())) > 1:
                query = {c.title(): c for c in query}
        if type(query) == dict or type(query) == OrderedDict:
            im = True
        if just_speakers:
            if just_speakers == 'each':
                im = True
                just_speakers = ['each']
            if just_speakers == ['each']:
                im = True
            if type(just_speakers) == str:
                im = False
                just_speakers = [just_speakers]
            if type(just_speakers) == list:
                if len(just_speakers) > 1:
                    im = True
        if type(search) == dict:
            if all(type(i) == dict for i in list(search.values())):
                im = True
        return im, corpus, search, query, just_speakers
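    # For example, each of these (hypothetical values) flips `im` to True
    # and routes the interrogation to pmultiquery:
    #
    #   is_multiquery([corp1, corp2], {'t': r'NP'}, 'any', False)  # corpus list
    #   is_multiquery(corp, {'t': 'q'}, {'NPs': r'NP'}, False)     # named queries
    #   is_multiquery(corp, {'t': r'NP'}, 'any', ['A', 'B'])       # >1 speaker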

    def slow_tregex(sents, **dummy_args):
        """do the speaker-specific version of tregex queries"""
        speakr = dummy_args.get('speaker', False)
        import os
        from process import tregex_engine
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        to_write = '\n'.join([sent.parse_string.strip() for sent in sents \
                              if sent.parse_string is not None])
        with open(to_open, "w") as fo:
            encd = to_write.encode('utf-8', errors = 'ignore') + '\n'
            fo.write(encd)
        q = list(search.values())[0]
        ops = ['-o', '-%s' % translated_option]
        concs = []
        res = tregex_engine(query = q, 
                            options = ops, 
                            corpus = to_open,
                            root = root,
                            preserve_case = True)
        if not no_conc:
            ops += ['-w', '-f']
            whole_res = tregex_engine(query = q, 
                            options = ops, 
                            corpus = to_open,
                            root = root,
                            preserve_case = True) 

            res = format_tregex(res)
            whole_res = format_tregex(whole_res, whole = True)
            concs = make_conc_lines_from_whole_mid(whole_res, res, speakr)

        if root:
            root.update()
        try:
            os.remove(to_open)
        except OSError:
            pass
        if countmode:
            return(len(res))
        else:
            return res, concs

    def get_stats(sents, **dummy_args):
        """get a bunch of frequencies on interpersonal phenomena"""
        import os
        import re
        from collections import Counter
        statsmode_results = Counter()  
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        with open(to_open, "w") as fo:
            for sent in sents:
                statsmode_results['Sentences'] += 1
                sts = sent.parse_string.rstrip()
                encd = sts.encode('utf-8', errors = 'ignore') + '\n'
                fo.write(encd)
                deps = get_deps(sent, dep_type)
                numpass = len([x for x in deps.links if x.type.endswith('pass')])
                statsmode_results['Passives'] += numpass
                statsmode_results['Tokens'] += len(sent.tokens)
                words = [w.word for w in sent.tokens if w.word.isalnum()]
                statsmode_results['Words'] += len(words)
                statsmode_results['Characters'] += len(''.join(words))

        # count moods via trees          (/\?/ !< __)
        from dictionaries.process_types import processes
        from other import as_regex
        tregex_qs = {'Imperative': r'ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/',
                     'Open interrogative': r'ROOT < SBARQ <<- (/\?/ !< __)', 
                     'Closed interrogative': r'ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))',
                     'Unmodalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))',
                     'Modalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))',
                     'Open class words': r'/^(NN|JJ|VB|RB)/ < __',
                     'Closed class words': r'__ !< __ !> /^(NN|JJ|VB|RB)/',
                     'Clauses': r'/^S/ < __',
                     'Interrogative': r'ROOT << (/\?/ !< __)',
                     'Mental processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.mental, boundaries = 'w'),
                     'Verbal processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.verbal, boundaries = 'w'),
                     'Relational processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.relational, boundaries = 'w')
                     }

        for name, q in sorted(tregex_qs.items()):
            res = tregex_engine(query = q, 
                  options = ['-o', '-C'], 
                  corpus = to_open,  
                  root = root)
            statsmode_results[name] += int(res)
            global numdone
            numdone += 1
            if root:
                root.update()
            else:
                tot_string = str(numdone + 1) + '/' + str(total_files)
                if kwargs.get('outname'):
                    tot_string = '%s: %s' % (kwargs['outname'], tot_string)
                animator(p, numdone, tot_string, **par_args)
            if kwargs.get('note', False):
                kwargs['note'].progvar.set((numdone * 100.0 / total_files / denom) + startnum)
        os.remove(to_open)
        return statsmode_results, []

    def make_conc_lines_from_whole_mid(wholes, middle_column_result, 
                                       speakr = False):
        import re, os
        if speakr is False:
            speakr = ''
        conc_lines = []
        # remove duplicates from results
        unique_wholes = []
        unique_middle_column_result = []
        duplicates = []
        for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)):
            if '-join-'.join([f, whole, mid]) not in duplicates:
                duplicates.append('-join-'.join([f, whole, mid]))
                unique_wholes.append([f, whole])
                unique_middle_column_result.append(mid)

        # split into start, middle and end, dealing with multiple occurrences
        for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)):
            reg = re.compile(r'([^a-zA-Z0-9-]|^)(' + re.escape(mid) + r')([^a-zA-Z0-9-]|$)', re.IGNORECASE | re.UNICODE)
            offsets = [(m.start(), m.end()) for m in re.finditer(reg,whole)]
            for offstart, offend in offsets:              
                start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip()
                conc_lines.append([os.path.basename(f), speakr, start, middle, end])
        return conc_lines
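    # Shape sketch: `wholes` is [(filename, whole_sent), ...] and
    # `middle_column_result` holds the matched strings, so e.g.
    #   make_conc_lines_from_whole_mid([('f.txt', 'the cat sat')], ['cat'])
    # yields [['f.txt', '', 'the', 'cat', 'sat']].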

    def uniquify(conc_lines):
        from collections import OrderedDict
        unique_lines = []
        checking = []
        for index, (f, speakr, start, middle, end) in enumerate(conc_lines):
            joined = ' '.join([speakr, start, 'MIDDLEHERE:', middle, ':MIDDLEHERE', end])
            if joined not in checking:
                unique_lines.append(conc_lines[index])
            checking.append(joined)
        return unique_lines

    def lemmatiser(list_of_words, tag):
        """take a list of unicode words and a tag and return a lemmatised list."""
        output = []
        for word in list_of_words:
            if translated_option.startswith('u'):
                if word.lower() in list(taglemma.keys()):
                    word = taglemma[word.lower()]
                else:
                    if word == 'x':
                        word = 'Other'
            # only use wordnet lemmatiser when appropriate
            else:
                if word in wordlist:
                    word = wordlist[word]
                word = lmtzr.lemmatize(word, tag)
            output.append(word)
        return output
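    # e.g. lemmatiser(['dogs', 'cats'], 'n') -> ['dog', 'cat'] via WordNet;
    # for POS shows (translated_option 'u'), tags are instead collapsed
    # through dictionaries.word_transforms.taglemma.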

    def gettag(query, lemmatag = False):
        """
        Find tag for WordNet lemmatisation
        """
        import re

        tagdict = {'N': 'n',
                   'A': 'a',
                   'V': 'v',
                   'R': 'r',
                   'None': False,
                   '': False,
                   'Off': False}

        if lemmatag is False:
            tag = 'n' # same default as wordnet
            # attempt to find tag from tregex query
            tagfinder = re.compile(r'^[^A-Za-z]*([A-Za-z]*)')
            tagchecker = re.compile(r'^[A-Z]{1,4}$')
            qr = query.replace(r'\w', '').replace(r'\s', '').replace(r'\b', '')
            treebank_tag = re.findall(tagfinder, qr)
            if re.match(tagchecker, treebank_tag[0]):
                tag = tagdict.get(treebank_tag[0], 'n')
        elif lemmatag:
            tag = lemmatag
        return tag
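    # e.g. gettag(r'/VB.?/ < __') pulls 'VB' from the query, but since only
    # the single-letter keys above map directly, it falls back to the
    # WordNet default 'n'; pass lemmatag = 'v' to skip detection entirely.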

    def format_tregex(results, whole = False):
        """format tregex by show list"""
        if countmode:
            return results
        import re
        done = []
        
        if whole:
            fnames = [x for x, y in results]
            results = [y for x, y in results]

        if 'l' in show or 'pl' in show:
            lemmata = lemmatiser(results, gettag(search.get('t'), lemmatag))
        else:
            lemmata = [None for i in results]
        for word, lemma in zip(results, lemmata):
            bits = []
            if exclude and (len(list(exclude.keys())) == 1 or excludemode == 'any'):
                if exclude.get('w') and re.search(exclude.get('w'), word):
                    continue
                if exclude.get('l') and re.search(exclude.get('l'), lemma):
                    continue
                if exclude.get('p') and re.search(exclude.get('p'), word):
                    continue
                if exclude.get('pl') and re.search(exclude.get('pl'), lemma):
                    continue
            if exclude and excludemode == 'all':
                num_to_cause_exclude = len(list(exclude.keys()))
                current_num = 0
                if exclude.get('w'):
                    if re.search(exclude.get('w'), word):
                        current_num += 1
                if exclude.get('l'):
                    if re.search(exclude.get('l'), lemma):
                        current_num += 1
                if exclude.get('p'):
                    if re.search(exclude.get('p'), word):
                        current_num += 1
                if exclude.get('pl'):
                    if re.search(exclude.get('pl'), lemma):
                        current_num += 1   
                if current_num == num_to_cause_exclude:
                    continue                 

            for i in show:
                if i == 't':
                    bits.append(word)
                elif i == 'l':
                    bits.append(lemma)
                elif i == 'w':
                    bits.append(word)
                elif i == 'p':
                    bits.append(word)
                elif i == 'pl':
                    bits.append(lemma)
            joined = '/'.join(bits)
            done.append(joined)

        if whole:
            done = list(zip(fnames, done))

        return done

    def tok_by_list(pattern, list_of_toks, concordancing = False, **kwargs):
        """search for regex in plaintext corpora"""
        import re
        if type(pattern) == str:
            pattern = [pattern]
        if not case_sensitive:
            pattern = [p.lower() for p in pattern]
        if not concordancing:
            if case_sensitive:
                matches = [m for m in list_of_toks if m in pattern]
            else:
                matches = [m for m in list_of_toks if m.lower() in pattern]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if token in pattern:
                    match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(token)
                    match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140])
                    matches.append(match)
        if countmode:
            return(len(matches))
        else:
            return matches

    def unsplitter(lst):
        """unsplit contractions and apostophes from tokenised text"""
        if split_contractions:
            return lst
        unsplit = []
        for index, t in enumerate(lst):
            if index == 0 or index == len(lst) - 1:
                unsplit.append(t)
                continue
            if "'" in t and not t.endswith("'"):
                rejoined = ''.join([lst[index - 1], t])
                unsplit.append(rejoined)
            else:
                if not "'" in lst[index + 1]:
                    unsplit.append(t)
        return unsplit
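    # e.g. with split_contractions False:
    #   unsplitter(['I', 'do', "n't", 'know']) -> ['I', "don't", 'know']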

    def tok_ngrams(pattern, list_of_toks, concordancing = False, split_contractions = True):
        from collections import Counter
        import re
        ngrams = Counter()
        result = []
        list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)]
        # only take the 'any' shortcut if the pattern isn't a compiled regex
        if type(pattern) == str and pattern.lower() == 'any':
            pattern = r'.*'

        if not split_contractions:
            list_of_toks = unsplitter(list_of_toks)
            
            #list_of_toks = [x for x in list_of_toks if "'" not in x]
        for index, w in enumerate(list_of_toks):
            try:
                the_gram = [list_of_toks[index+x] for x in range(gramsize)]
                if not any(re.search(pattern, x) for x in the_gram):
                    continue
                ngrams[' '.join(the_gram)] += 1
            except IndexError:
                pass

        # turn counter into list of results
        for k, v in list(ngrams.items()):
            if v > 1:
                for i in range(v):
                    result.append(k)
        if countmode:
            return(len(result))
        else:
            return result
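    # e.g. tok_ngrams('dog', ['the', 'dog', 'saw', 'the', 'dog']) with
    # gramsize 2 counts 'the dog' twice and keeps it; 'dog saw' occurs only
    # once, and only n-grams seen more than once survive into the result.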

    def compiler(pattern):
        """compile regex or fail gracefully"""
        import re
        try:
            if case_sensitive:
                comped = re.compile(pattern)
            else:
                comped = re.compile(pattern, re.IGNORECASE)
            return comped
        except Exception:
            import traceback
            import sys
            from time import localtime, strftime
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lst = traceback.format_exception(exc_type, exc_value,
                          exc_traceback)
            error_message = lst[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Query %s' % (thetime, error_message))
            if root:
                return 'Bad query'
            else:
                raise ValueError('%s: Query %s' % (thetime, error_message))
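    # e.g. compiler(r'^dogs?$') compiles case-insensitively unless
    # case_sensitive is True; a malformed pattern such as r'(' prints the
    # error, then returns 'Bad query' (GUI mode) or raises ValueError.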

    def tok_by_reg(pattern, list_of_toks, concordancing = False, **kwargs):
        """search for regex in plaintext corpora"""
        import re
        comped = compiler(pattern)
        if comped == 'Bad query':
            return 'Bad query'
        if not concordancing:
            matches = [m for m in list_of_toks if re.search(comped, m)]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if re.search(comped, token):
                    match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(re.search(comped, token).group(0))
                    match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140])
                    matches.append(match)
        if countmode:
            return(len(matches))
        else:
            return matches

    def plaintext_regex_search(pattern, plaintext_data, concordancing = False, **kwargs):
        """search for regex in plaintext corpora

        it searches over lines, so the user needs to be careful.
        """
        import re
        if concordancing:
            pattern = r'(.{,140})\b(' + pattern + r')\b(.{,140})'
        compiled_pattern = compiler(pattern)
        if compiled_pattern == 'Bad query':
            return 'Bad query'
        matches = re.findall(compiled_pattern, plaintext_data)
        if concordancing:
            matches = [list(m) for m in matches]
        if not concordancing:
            for index, i in enumerate(matches):
                if type(i) == tuple:
                    matches[index] = i[0]
        if countmode:
            return(len(matches))
        else:
            return matches

    def correct_spelling(a_string):
        if not spelling:
            return a_string
        from dictionaries.word_transforms import usa_convert
        if spelling.lower() == 'uk':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        spell_out = []
        bits = a_string.split('/')
        for index, i in enumerate(bits):
            converted = usa_convert.get(i.lower(), i)
            if i.islower() or preserve_case is False:
                converted = converted.lower()
            elif i.isupper() and preserve_case:
                converted = converted.upper()
            elif i.istitle() and preserve_case:
                converted = converted.title()
            bits[index] = converted
        r = '/'.join(bits)
        return r
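    # e.g. with spelling = 'UK' (and assuming usa_convert maps UK to US
    # spellings), correct_spelling('color/red') -> 'colour/red': each
    # '/'-separated bit is converted and recased independently.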

    def plaintext_simple_search(pattern, plaintext_data, concordancing = False, **kwargs):
        """search for tokens in plaintext corpora"""
        import re
        result = []
        if type(pattern) == str:
            pattern = [pattern]
        for p in pattern:
            if concordancing:
                pat = r'(.{0,140})\b(' + re.escape(p) + r')\b(.{0,140})'
            else:
                # without this branch, `pat` was undefined when not concordancing
                pat = r'\b' + re.escape(p) + r'\b'
            pat = compiler(pat)
            if pat == 'Bad query':
                return 'Bad query'
            matches = re.findall(pat, plaintext_data)
            if concordancing:
                matches = [list(m) for m in matches]
                for i in matches:
                    result.append(i)
            else:   
                for m in range(len(matches)):
                    result.append(p)
        return result
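    # e.g. plaintext_simple_search('dog', 'the dog saw the dog') returns
    # ['dog', 'dog']: one copy of the search term per non-concordance hit
    # (using the non-concordancing branch sketched above).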

    # do multiprocessing if need be
    im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers)
    
    locs['search'] = search
    locs['query'] = query
    locs['just_speakers'] = just_speakers
    locs['corpus'] = corpus
    locs['multiprocess'] = multiprocess

    if im:
        signal.signal(signal.SIGINT, original_sigint)
        from multiprocess import pmultiquery
        return pmultiquery(**locs)

    datatype = corpus.datatype
    singlefile = corpus.singlefile

    # store all results in here
    results = {}
    count_results = {}
    conc_results = {}
    # check if just counting
    countmode = 'c' in show
    if countmode:
        no_conc = True
        only_conc = False
    # where we are at in interrogation
    current_iter = 0

    # multiprocessing progress bar
    denom = kwargs.get('denominator', 1)
    startnum = kwargs.get('startnum', 0)

    ############################################
    # Determine the search function to be used #
    ############################################
    
    # simple tregex is tregex over whole dirs
    simple_tregex_mode = False
    statsmode = False
    if not just_speakers and 't' in list(search.keys()):
        simple_tregex_mode = True
    else:
        if corpus.datatype == 'plaintext':
            if search.get('n'):
                raise NotImplementedError('Use a tokenised corpus for n-gramming.')
            if search.get('w'):
                if kwargs.get('regex', True):
                    searcher = plaintext_regex_search
                else:
                    searcher = plaintext_simple_search
                optiontext = 'Searching plaintext'

        elif corpus.datatype == 'tokens':
            if search.get('n'):
                searcher = tok_ngrams
                optiontext = 'n-grams via tokens'
            elif search.get('w'):
                if kwargs.get('regex', True):
                    searcher = tok_by_reg
                else:
                    searcher = tok_by_list
                if type(search.get('w')) == list:
                    searcher = tok_by_list
                optiontext = 'Searching tokens'
        only_parse = ['r', 'd', 'g', 'dl', 'gl', 'df', 'gf', 'dp', 'gp', 'f', 'd2', 'd2f', 'd2p', 'd2l']
        if corpus.datatype != 'parse' and any(i in only_parse for i in list(search.keys())):
            raise ValueError('Need parsed corpus to search with "%s" option(s).' % ', '.join([i for i in list(search.keys()) if i in only_parse]))

        if corpus.datatype == 'parse':
            if search.get('t'):
                searcher = slow_tregex
            elif search.get('s'):
                searcher = get_stats
                statsmode = True
                optiontext = 'General statistics'
                global numdone
                numdone = 0
                no_conc = True
                only_conc = False
                do_concordancing = False
            else:
                from depsearch import dep_searcher
                searcher = dep_searcher
                optiontext = 'Dependency querying'

    ############################################
    #      Set some Tregex-related values      #
    ############################################

    if search.get('t'):
        translated_option = 't'
        query = search.get('t')

        # check the query
        q = tregex_engine(corpus = False, query = search.get('t'), 
                          options = ['-t'], check_query = True, root = root)
        if q is False:
            if root:
                return 'Bad query'
            else:
                return

        optiontext = 'Searching parse trees'
        if 'p' in show or 'pl' in show:
            translated_option = 'u'
            if type(search['t']) == list:
                search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)'
        elif 't' in show:
            translated_option = 'o'
            if type(search['t']) == list:
                search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)'
        elif 'w' in show:
            translated_option = 't'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'
        elif 'c' in show:
            only_count = True
            translated_option = 'C'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __'  % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'
        elif 'l' in show:
            translated_option = 't'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'

        query = search['t']

    ############################################
    # Make iterable for corpus/subcorpus/file  #
    ############################################

    if corpus.singlefile:
        to_iterate_over = {(corpus.name, corpus.path): [corpus]}
    elif not corpus.subcorpora:
        to_iterate_over = {(corpus.name, corpus.path): corpus.files}
    else:
        to_iterate_over = {}
        for subcorpus in corpus.subcorpora:
            to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files
        #for k, v in sorted(corpus.structure.items(), key=lambda obj: obj[0].name):
        #    to_iterate_over[(k.name, k.path)] = v
    if files_as_subcorpora:
        to_iterate_over = {}
        for f in corpus.files:
            to_iterate_over[(f.name, f.path)] = [f]

    ############################################
    #           Print welcome message          #
    ############################################

    if no_conc:
        message = 'Interrogating'
    else:
        message = 'Interrogating and concordancing'
    if kwargs.get('printstatus', True):
        thetime = strftime("%H:%M:%S", localtime())

        sformat = '\n                 '.join(['%s: %s' % (k.rjust(3), v) for k, v in list(search.items())])
        if search == {'s': r'.*'}:
            sformat = 'features'
        welcome = '\n%s: %s %s ...\n          %s\n          Query: %s\n          %s corpus ... \n' % \
                  (thetime, message, corpus.name, optiontext, sformat, message)
        print(welcome)

    ############################################
    #           Make progress bar              #
    ############################################

    if simple_tregex_mode:
        total_files = len(list(to_iterate_over.keys()))
    else:
        if search.get('s'):
            total_files = sum([len(x) for x in list(to_iterate_over.values())]) * 12
        else:
            total_files = sum([len(x) for x in list(to_iterate_over.values())])

    par_args = {'printstatus': kwargs.get('printstatus', True),
                'root': root, 
                'note': note,
                'length': total_files,
                'startnum': kwargs.get('startnum'),
                'denom': kwargs.get('denominator', 1)}

    term = None
    if kwargs.get('paralleling', None) is not None:
        from blessings import Terminal
        term = Terminal()
        par_args['terminal'] = term
        par_args['linenum'] = kwargs.get('paralleling')

    outn = kwargs.get('outname', '')
    if outn:
        outn = outn + ': '
    tstr = '%s%d/%d' % (outn, current_iter, total_files)
    p = animator(None, None, init = True, tot_string = tstr, **par_args)
    tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
    animator(p, current_iter, tstr, **par_args)

    ############################################
    # Iterate over data, doing interrogations  #
    ############################################

    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):

        conc_results[subcorpus_name] = []
        count_results[subcorpus_name] = []
        results[subcorpus_name] = Counter()
        
        # tregex over subcorpora, not files
        if simple_tregex_mode:

            op = ['-o', '-' + translated_option]                
            result = tregex_engine(query = search['t'], options = op, 
                                   corpus = subcorpus_path, root = root, preserve_case = preserve_case)

            if not countmode:
                result = format_tregex(result)

            if not no_conc:
                op += ['-w', '-f']
                whole_result = tregex_engine(query = search['t'], options = op, 
                                   corpus = subcorpus_path, root = root, preserve_case = preserve_case)
                
                if not only_format_match:
                    whole_result = format_tregex(whole_result, whole = True)

                conc_result = make_conc_lines_from_whole_mid(whole_result, result, speakr = False)

            if countmode:
                count_results[subcorpus_name] += [result]            
            else:
                result = Counter(result)
                results[subcorpus_name] += result
                if not no_conc:
                    for lin in conc_result:
                        if numconc < maxconc or not maxconc:
                            conc_results[subcorpus_name].append(lin)
                            numconc += 1

            current_iter += 1
            tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
            animator(p, current_iter, tstr, **par_args)

        # dependencies, plaintext, tokens or slow_tregex
        else:
            for f in files:
                slow_treg_speaker_guess = kwargs.get('outname', False)
                if corpus.datatype == 'parse':
                    with open(f.path, 'r') as data:
                        data = data.read()
                        from corenlp_xml.document import Document
                        try:
                            corenlp_xml = Document(data)
                        except Exception:
                            print('Could not read file: %s' % f.path)
                            continue
                        if just_speakers:  
                            sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers]
                            if len(just_speakers) == 1:
                                slow_treg_speaker_guess = just_speakers[0]
                            if not sents:
                                continue
                        else:
                            sents = corenlp_xml.sentences

                        res, conc_res = searcher(sents, search = search, show = show,
                            dep_type = dep_type,
                            exclude = exclude,
                            excludemode = excludemode,
                            searchmode = searchmode,
                            lemmatise = False,
                            case_sensitive = case_sensitive,
                            do_concordancing = do_concordancing,
                            only_format_match = only_format_match,
                            speaker = slow_treg_speaker_guess)
                        
                        if res == 'Bad query':
                            return 'Bad query'

                elif corpus.datatype == 'tokens':
                    import pickle
                    with codecs.open(f.path, "rb") as fo:
                        data = pickle.load(fo)
                    if not only_conc:
                        res = searcher(list(search.values())[0], data,
                                       split_contractions = split_contractions,
                                       concordancing = False)
                    if not no_conc:
                        conc_res = searcher(list(search.values())[0], data,
                                            split_contractions = split_contractions,
                                            concordancing = True)
                        for index, line in enumerate(conc_res):
                            # placeholder speaker column, filled in later
                            line.insert(0, '')

                elif corpus.datatype == 'plaintext':
                    with codecs.open(f.path, 'rb', encoding = 'utf-8') as data:
                        data = data.read()
                        if not only_conc:
                            res = searcher(list(search.values())[0], data,
                                           concordancing = False)
                        if not no_conc:
                            conc_res = searcher(list(search.values())[0], data,
                                                concordancing = True)
                            for index, line in enumerate(conc_res):
                                # placeholder speaker column, filled in later
                                line.insert(0, '')

                if countmode:
                    count_results[subcorpus_name] += [res]
                else:
                    # add filename and do lowercasing for conc
                    if not no_conc:
                        for index, line in enumerate(conc_res):
                            if searcher != slow_tregex:
                                line.insert(0, f.name)
                            else:
                                line[0] = f.name
                            if not preserve_case:
                                line[3:] = [x.lower() for x in line[3:]]
                            if spelling:
                                line = [correct_spelling(b) for b in line]
                            if numconc < maxconc or not maxconc:
                                conc_results[subcorpus_name].append(line)
                                numconc += 1

                    # do lowercasing and spelling
                    if not only_conc:
                        if not preserve_case and not statsmode:
                            res = [i.lower() for i in res]
                        if spelling and not statsmode:
                            res = [correct_spelling(r) for r in res]
                        results[subcorpus_name] += Counter(res)

                if not statsmode:
                    current_iter += 1
                    tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                    animator(p, current_iter, tstr, **par_args)

    # delete temp file if there
    import os
    if os.path.isfile('tmp.txt'):
        os.remove('tmp.txt')

    ############################################
    #     Get concordances into DataFrame      #
    ############################################

    if not no_conc:
        all_conc_lines = []
        for sc_name, resu in sorted(conc_results.items()):
            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            #make into series
            pindex = 'c f s l m r'.split()
            for fname, spkr, start, word, end in unique_results:
                #spkr = str(spkr, errors = 'ignore')
                fname = os.path.basename(fname)
                all_conc_lines.append(Series([sc_name,
                                              fname,
                                              spkr,
                                              start,
                                              word,
                                              end],
                                             index = pindex))

        # randomise results...
        if random:
            from random import shuffle
            shuffle(all_conc_lines)

        conc_df = pd.concat(all_conc_lines, axis = 1).T

        # not doing anything yet --- this is for multimodal concordancing
        add_links = False
        if not add_links:
            conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r']
        else:
            conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r', 'link']

        if all(x == '' for x in list(conc_df['s'].values)):
            conc_df.drop('s', axis = 1, inplace = True)

        from interrogation import Concordance
        output = Concordance(conc_df)
        if only_conc:
            output.query = locs
            if quicksave:
                output.save()

            if kwargs.get('printstatus', True):
                thetime = strftime("%H:%M:%S", localtime())
                finalstring = '\n\n%s: Concordancing finished! %d results.' % (thetime, len(conc_df))
                print(finalstring)
            return output

    ############################################
    #     Get interrogation into DataFrame     #
    ############################################

    if not only_conc:
        if countmode:
            df = Series({k: sum(v) for k, v in sorted(count_results.items())})
            tot = df.sum()
        else:
            the_big_dict = {}
            unique_results = set([item for sublist in list(results.values()) for item in sublist])
            for word in unique_results:
                the_big_dict[word] = [subcorp_result[word] for name, subcorp_result in sorted(results.items(), key=lambda x: x[0])]
            # turn master dict into dataframe, sorted
            df = DataFrame(the_big_dict, index = sorted(results.keys()))

            numentries = len(df.columns)
            tot = df.sum(axis = 1)
            total_total = df.sum().sum()

        ############################################
        # Format, output as Interrogation object   #
        ############################################

        if not countmode:
            if not corpus.subcorpora or singlefile:
                if not files_as_subcorpora:
                    if not kwargs.get('df1_always_df'):
                        df = Series(df.iloc[0])
                        df.sort_values(ascending = False, inplace = True)
                        tot = df.sum()
                        numentries = len(df.index)
                        total_total = tot

        # sort columns by their totals, descending
        if type(df) == pd.core.frame.DataFrame:
            if not df.empty:
                df = df[df.sum().sort_values(ascending = False).index]

        # format final string
        if kwargs.get('printstatus', True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = '\n\n%s: Interrogation finished!' % thetime
            if countmode:
                finalstring += ' %d matches.' % tot
            else:
                finalstring += ' %d unique results, %d total occurrences.' % (numentries, total_total)
            print(finalstring)

        if not no_conc:
            interro = Interrogation(results = df, totals = tot, query = locs, concordance = output)
        else:
            interro = Interrogation(results = df, totals = tot, query = locs)

        if quicksave:
            interro.save()
        
        return interro
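
A minimal, self-contained sketch of the shape of the data built above. The Interrogation object holds a results DataFrame, a totals Series and the query dict; the values here are invented for illustration:

from pandas import DataFrame

# subcorpora as rows, results as columns, as assembled from the_big_dict above
results = DataFrame({'cat': [2, 0], 'dog': [1, 3]},
                    index = ['subcorpus-a', 'subcorpus-b'])
totals = results.sum(axis = 1)    # per-subcorpus totals, like `tot` above
print(results)
print(totals)                     # subcorpus-a: 3, subcorpus-b: 4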
Example #3
    def get_stats(sents, **dummy_args):
        """get a bunch of frequencies on interpersonal phenomena"""
        import os
        import re
        from collections import Counter
        statsmode_results = Counter()  
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        # write as bytes so the encoded parse strings work in Python 2 and 3
        with open(to_open, "wb") as fo:
            for sent in sents:
                statsmode_results['Sentences'] += 1
                sts = sent.parse_string.rstrip()
                fo.write(sts.encode('utf-8', errors = 'ignore') + b'\n')
                deps = get_deps(sent, dep_type)
                numpass = len([x for x in deps.links if x.type.endswith('pass')])
                statsmode_results['Passives'] += numpass
                statsmode_results['Tokens'] += len(sent.tokens)
                words = [w.word for w in sent.tokens if w.word.isalnum()]
                statsmode_results['Words'] += len(words)
                statsmode_results['Characters'] += len(''.join(words))

        # count moods via trees          (/\?/ !< __)
        from dictionaries.process_types import processes
        from other import as_regex
        tregex_qs = {'Imperative': r'ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/',
                     'Open interrogative': r'ROOT < SBARQ <<- (/\?/ !< __)', 
                     'Closed interrogative': r'ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))',
                     'Unmodalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))',
                     'Modalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))',
                     'Open class words': r'/^(NN|JJ|VB|RB)/ < __',
                     'Closed class words': r'__ !< __ !> /^(NN|JJ|VB|RB)/',
                     'Clauses': r'/^S/ < __',
                     'Interrogative': r'ROOT << (/\?/ !< __)',
                     'Mental processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.mental, boundaries = 'w'),
                     'Verbal processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.verbal, boundaries = 'w'),
                     'Relational processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.relational, boundaries = 'w')
                     }

        for name, q in sorted(tregex_qs.items()):
            res = tregex_engine(query = q, 
                  options = ['-o', '-C'], 
                  corpus = to_open,  
                  root = root)
            statsmode_results[name] += int(res)
            global numdone
            numdone += 1
            if root:
                root.update()
            else:
                tot_string = str(numdone + 1) + '/' + str(total_files)
                if kwargs.get('outname'):
                    tot_string = '%s: %s' % (kwargs['outname'], tot_string)
                animator(p, numdone, tot_string, **par_args)
            if kwargs.get('note', False):
                kwargs['note'].progvar.set((numdone * 100.0 / total_files / denom) + startnum)
        os.remove(to_open)
        return statsmode_results, []
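
The stats helper above returns a (Counter, concordance-list) pair. A small, runnable sketch of how per-file Counters combine (the numbers are invented):

from collections import Counter

# suppose two files each produced a (Counter, conc_lines) pair via get_stats
per_file = [(Counter({'Sentences': 10, 'Passives': 2}), []),
            (Counter({'Sentences': 7, 'Passives': 1}), [])]

totals = Counter()
for stats, _conc in per_file:
    totals += stats           # Counter addition sums per-key frequencies
print(totals['Passives'])     # -> 3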
Example #4
def interrogator(corpus, 
            search, 
            query = 'any', 
            show = 'w',
            exclude = False,
            excludemode = 'any',
            searchmode = 'all',
            dep_type = 'collapsed-ccprocessed-dependencies',
            case_sensitive = False,
            save = False,
            just_speakers = False,
            preserve_case = False,
            lemmatag = False,
            files_as_subcorpora = False,
            only_unique = False,
            random = False,
            only_format_match = False,
            multiprocess = False,
            spelling = False,
            regex_nonword_filter = r'[A-Za-z0-9:_]',
            gramsize = 2,
            split_contractions = False,
            do_concordancing = False,
            maxconc = 9999,
            **kwargs):
    """interrogate corpus, corpora, subcorpus and file objects

    see corpkit.interrogation.interrogate() for docstring"""

    only_conc = False
    no_conc = False
    if do_concordancing is False:
        no_conc = True
    if type(do_concordancing) == str and do_concordancing.lower() == 'only':
        only_conc = True
        no_conc = False

    # iteratively count conc lines
    numconc = 0

    # store kwargs
    locs = locals()
    
    if kwargs:
        for k, v in kwargs.items():
            locs[k] = v
        locs.pop('kwargs', None)

    import corpkit
    from interrogation import Interrogation
    from corpus import Datalist, Corpora, Corpus, File
    from process import tregex_engine, get_deps
    import pandas as pd
    from pandas import DataFrame, Series
    from collections import Counter
    from other import as_regex
    from time import localtime, strftime
    from textprogressbar import TextProgressBar
    from process import animator
    from dictionaries.word_transforms import wordlist, taglemma
    import corenlp_xml
    import codecs
    import signal

    original_sigint = signal.getsignal(signal.SIGINT)

    if kwargs.get('paralleling', None) is None:
        
        def signal_handler(signal, frame):
            """pause on ctrl+c, rather than just stop loop"""   
            import signal
            import sys
            from time import localtime, strftime
            signal.signal(signal.SIGINT, original_sigint)
            thetime = strftime("%H:%M:%S", localtime())
            try:
                sel = raw_input('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime)
            except NameError:
                sel = input('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime)
            time = strftime("%H:%M:%S", localtime())
            print('%s: Interrogation resumed.\n' % time)
            signal.signal(signal.SIGINT, signal_handler)

        signal.signal(signal.SIGINT, signal_handler)

    # find out if using gui
    root = kwargs.get('root')
    note = kwargs.get('note')

    # convert path to corpus object
    if corpus.__class__ not in [Corpus, Corpora, File]:
        if not multiprocess and not kwargs.get('outname'):
            corpus = Corpus(corpus, print_info = False)

    # figure out how the user has entered the query and normalise
    from process import searchfixer
    search = searchfixer(search, query)
    
    if 'l' in show and search.get('t'):
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr = WordNetLemmatizer()

    if type(show) == str:
        show = [show]

    def is_multiquery(corpus, search, query, just_speakers):
        """determine if multiprocessing is needed
        do some retyping if need be as well"""
        im = False
        from collections import OrderedDict
        #if hasattr(corpus, '__iter__'):
        #    im = True
        # so we can do search = 't', query = ['NP', 'VP']:
        if type(query) == list:
            if query != list(search.values())[0] or len(list(search.keys())) > 1:
                query = {c.title(): c for c in query}
        if type(query) == dict or type(query) == OrderedDict:
            im = True
        if just_speakers:
            if just_speakers == 'each':
                im = True
                just_speakers = ['each']
            if just_speakers == ['each']:
                im = True
            if type(just_speakers) == str:
                im = False
                just_speakers = [just_speakers]
            if type(just_speakers) == list:
                if len(just_speakers) > 1:
                    im = True
        if type(search) == dict:
            if all(type(i) == dict for i in list(search.values())):
                im = True
        return im, corpus, search, query, just_speakers

    def slow_tregex(sents, **dummy_args):
        """do the speaker-specific version of tregex queries"""
        speakr = dummy_args.get('speaker', False)
        import os
        from process import tregex_engine
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        to_write = '\n'.join([sent.parse_string.strip() for sent in sents
                              if sent.parse_string is not None])
        # write as bytes so the encoded trees work in Python 2 and 3
        with open(to_open, "wb") as fo:
            fo.write(to_write.encode('utf-8', errors = 'ignore') + b'\n')
        q = list(search.values())[0]
        ops = ['-o', '-%s' % translated_option]
        concs = []
        res = tregex_engine(query = q, 
                            options = ops, 
                            corpus = to_open,
                            root = root,
                            preserve_case = True)
        if not no_conc:
            ops += ['-w', '-f']
            whole_res = tregex_engine(query = q, 
                            options = ops, 
                            corpus = to_open,
                            root = root,
                            preserve_case = True) 

            res = format_tregex(res)
            whole_res = format_tregex(whole_res, whole = True)
            concs = make_conc_lines_from_whole_mid(whole_res, res, speakr)

        if root:
            root.update()
        try:
            os.remove(to_open)
        except OSError:
            pass
        if countmode:
            return(len(res))
        else:
            return res, concs

    def get_stats(sents, **dummy_args):
        """get a bunch of frequencies on interpersonal phenomena"""
        import os
        import re
        from collections import Counter
        statsmode_results = Counter()  
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        # write as bytes so the encoded parse strings work in Python 2 and 3
        with open(to_open, "wb") as fo:
            for sent in sents:
                statsmode_results['Sentences'] += 1
                sts = sent.parse_string.rstrip()
                fo.write(sts.encode('utf-8', errors = 'ignore') + b'\n')
                deps = get_deps(sent, dep_type)
                numpass = len([x for x in deps.links if x.type.endswith('pass')])
                statsmode_results['Passives'] += numpass
                statsmode_results['Tokens'] += len(sent.tokens)
                words = [w.word for w in sent.tokens if w.word.isalnum()]
                statsmode_results['Words'] += len(words)
                statsmode_results['Characters'] += len(''.join(words))

        # count moods via trees          (/\?/ !< __)
        from dictionaries.process_types import processes
        from other import as_regex
        tregex_qs = {'Imperative': r'ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/',
                     'Open interrogative': r'ROOT < SBARQ <<- (/\?/ !< __)', 
                     'Closed interrogative': r'ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))',
                     'Unmodalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))',
                     'Modalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))',
                     'Open class words': r'/^(NN|JJ|VB|RB)/ < __',
                     'Closed class words': r'__ !< __ !> /^(NN|JJ|VB|RB)/',
                     'Clauses': r'/^S/ < __',
                     'Interrogative': r'ROOT << (/\?/ !< __)',
                     'Mental processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.mental, boundaries = 'w'),
                     'Verbal processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.verbal, boundaries = 'w'),
                     'Relational processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.relational, boundaries = 'w')
                     }

        for name, q in sorted(tregex_qs.items()):
            res = tregex_engine(query = q, 
                  options = ['-o', '-C'], 
                  corpus = to_open,  
                  root = root)
            statsmode_results[name] += int(res)
            global numdone
            numdone += 1
            if root:
                root.update()
            else:
                tot_string = str(numdone + 1) + '/' + str(total_files)
                if kwargs.get('outname'):
                    tot_string = '%s: %s' % (kwargs['outname'], tot_string)
                animator(p, numdone, tot_string, **par_args)
            if kwargs.get('note', False):
                kwargs['note'].progvar.set((numdone * 100.0 / total_files / denom) + startnum)
        os.remove(to_open)
        return statsmode_results, []

    def make_conc_lines_from_whole_mid(wholes, middle_column_result, 
                                       speakr = False):
        import re, os
        if speakr is False:
            speakr = ''
        conc_lines = []
        # remove duplicates from results
        unique_wholes = []
        unique_middle_column_result = []
        duplicates = []
        for index, ((f, whole), mid) in enumerate(zip(wholes, middle_column_result)):
            if '-join-'.join([f, whole, mid]) not in duplicates:
                duplicates.append('-join-'.join([f, whole, mid]))
                unique_wholes.append([f, whole])
                unique_middle_column_result.append(mid)

        # split into start, middle and end, dealing with multiple occurrences
        for index, ((f, whole), mid) in enumerate(zip(unique_wholes, unique_middle_column_result)):
            reg = re.compile(r'([^a-zA-Z0-9-]|^)(' + re.escape(mid) + r')([^a-zA-Z0-9-]|$)', re.IGNORECASE | re.UNICODE)
            offsets = [(m.start(), m.end()) for m in re.finditer(reg, whole)]
            for offstart, offend in offsets:              
                start, middle, end = whole[0:offstart].strip(), whole[offstart:offend].strip(), whole[offend:].strip()
                conc_lines.append([os.path.basename(f), speakr, start, middle, end])
        return conc_lines
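
    # each concordance line has the shape:
    #   [filename, speaker, left-context, match, right-context]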

    def uniquify(conc_lines):
        from collections import OrderedDict
        unique_lines = []
        checking = []
        for index, (f, speakr, start, middle, end) in enumerate(conc_lines):
            joined = ' '.join([speakr, start, 'MIDDLEHERE:', middle, ':MIDDLEHERE', end])
            if joined not in checking:
                unique_lines.append(conc_lines[index])
            checking.append(joined)
        return unique_lines

    def lemmatiser(list_of_words, tag):
        """take a list of unicode words and a tag and return a lemmatised list."""
        output = []
        for word in list_of_words:
            if translated_option.startswith('u'):
                if word.lower() in list(taglemma.keys()):
                    word = taglemma[word.lower()]
                else:
                    if word == 'x':
                        word = 'Other'
            # only use wordnet lemmatiser when appropriate
            else:
                if word in wordlist:
                    word = wordlist[word]
                word = lmtzr.lemmatize(word, tag)
            output.append(word)
        return output
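
    # e.g. (assuming 'running' is not in wordlist and tag = 'v'):
    #   lemmatiser(['running'], 'v') -> ['run']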

    def gettag(query, lemmatag = False):
        """
        Find tag for WordNet lemmatisation
        """
        import re

        tagdict = {'N': 'n',
                   'A': 'a',
                   'V': 'v',
                   'R': 'r',  # assumed: adverb (the source dict repeated 'A' here)
                   'None': False,
                   '': False,
                   'Off': False}

        if lemmatag is False:
            tag = 'n' # same default as wordnet
            # attempt to find tag from tregex query
            tagfinder = re.compile(r'^[^A-Za-z]*([A-Za-z]*)')
            tagchecker = re.compile(r'^[A-Z]{1,4}$')
            qr = query.replace(r'\w', '').replace(r'\s', '').replace(r'\b', '')
            treebank_tag = re.findall(tagfinder, qr)
            if re.match(tagchecker, treebank_tag[0]):
                tag = tagdict.get(treebank_tag[0], 'n')
        elif lemmatag:
            tag = lemmatag
        return tag
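
    # e.g. gettag(r'__ < NN', lemmatag = 'v') -> 'v' (an explicit lemmatag wins);
    # with lemmatag = False, the tag is guessed from the query and defaults to 'n'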

    def format_tregex(results, whole = False):
        """format tregex by show list"""
        if countmode:
            return results
        import re
        done = []
        
        if whole:
            fnames = [x for x, y in results]
            results = [y for x, y in results]

        if 'l' in show or 'pl' in show:
            lemmata = lemmatiser(results, gettag(search.get('t'), lemmatag))
        else:
            lemmata = [None for i in results]
        for word, lemma in zip(results, lemmata):
            bits = []
            if exclude and (excludemode == 'any' or len(exclude) == 1):
                # test each supplied exclude pattern against the right attribute;
                # unsupplied patterns are skipped (the source could pass None to re.search)
                checks = [('w', word), ('l', lemma), ('p', word), ('pl', lemma)]
                if any(exclude.get(k) and t and re.search(exclude.get(k), t)
                       for k, t in checks):
                    continue
            if exclude and excludemode == 'all':
                num_to_cause_exclude = len(list(exclude.keys()))
                current_num = 0
                if exclude.get('w'):
                    if re.search(exclude.get('w'), word):
                        current_num += 1
                if exclude.get('l'):
                    if re.search(exclude.get('l'), lemma):
                        current_num += 1
                if exclude.get('p'):
                    if re.search(exclude.get('p'), word):
                        current_num += 1
                if exclude.get('pl'):
                    if re.search(exclude.get('pl'), lemma):
                        current_num += 1   
                if current_num == num_to_cause_exclude:
                    continue                 

            for i in show:
                if i == 't':
                    bits.append(word)
                elif i == 'l':
                    bits.append(lemma)
                elif i == 'w':
                    bits.append(word)
                elif i == 'p':
                    bits.append(word)
                elif i == 'pl':
                    bits.append(lemma)
            joined = '/'.join(bits)
            done.append(joined)

        if whole:
            done = list(zip(fnames, done))

        return done
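
    # e.g. with show = ['w', 'l'], a match on 'running' comes back as 'running/run':
    # each requested attribute is joined with '/'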

    def tok_by_list(pattern, list_of_toks, concordancing = False, **kwargs):
        """search for regex in plaintext corpora"""
        import re
        if type(pattern) == str:
            pattern = [pattern]
        if not case_sensitive:
            pattern = [p.lower() for p in pattern]
        if not concordancing:
            if case_sensitive:
                matches = [m for m in list_of_toks if m in pattern]
            else:
                matches = [m for m in list_of_toks if m.lower() in pattern]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                tok = token if case_sensitive else token.lower()
                if tok in pattern:
                    match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(token)
                    match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140])
                    matches.append(match)
        if countmode:
            return(len(matches))
        else:
            return matches

    def unsplitter(lst):
        """unsplit contractions and apostophes from tokenised text"""
        if split_contractions:
            return lst
        unsplit = []
        for index, t in enumerate(lst):
            if index == 0 or index == len(lst) - 1:
                unsplit.append(t)
                continue
            if "'" in t and not t.endswith("'"):
                rejoined = ''.join([lst[index - 1], t])
                unsplit.append(rejoined)
            else:
                if not "'" in lst[index + 1]:
                    unsplit.append(t)
        return unsplit
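
    # e.g. unsplitter(['I', 'do', "n't", 'know']) -> ['I', "don't", 'know']
    # (only when split_contractions is False; otherwise the input is returned unchanged)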

    def tok_ngrams(pattern, list_of_toks, concordancing = False, split_contractions = True):
        from collections import Counter
        import re
        ngrams = Counter()
        result = []
        list_of_toks = [x for x in list_of_toks if re.search(regex_nonword_filter, x)]
        # 'any' matches everything; the isinstance guard covers compiled regexes
        if isinstance(pattern, str) and pattern.lower() == 'any':
            pattern = r'.*'

        if not split_contractions:
            list_of_toks = unsplitter(list_of_toks)
            
            #list_of_toks = [x for x in list_of_toks if "'" not in x]
        for index, w in enumerate(list_of_toks):
            try:
                the_gram = [list_of_toks[index+x] for x in range(gramsize)]
                if not any(re.search(pattern, x) for x in the_gram):
                    continue
                ngrams[' '.join(the_gram)] += 1
            except IndexError:
                pass

        # turn counter into list of results (n-grams seen only once are dropped)
        for k, v in list(ngrams.items()):
            if v > 1:
                for i in range(v):
                    result.append(k)
        if countmode:
            return(len(result))
        else:
            return result
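
    # e.g. with gramsize = 2, ['the', 'cat', 'sat'] yields candidate bigrams
    # 'the cat' and 'cat sat'; a candidate is returned only if some token
    # matches `pattern` and the n-gram occurs more than once in the file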

    def compiler(pattern):
        """compile regex or fail gracefully"""
        import re
        try:
            if case_sensitive:
                comped = re.compile(pattern)
            else:
                comped = re.compile(pattern, re.IGNORECASE)
            return comped
        except Exception:
            import traceback
            import sys
            from time import localtime, strftime
            exc_type, exc_value, exc_traceback = sys.exc_info()
            lst = traceback.format_exception(exc_type, exc_value,
                          exc_traceback)
            error_message = lst[-1]
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Query %s' % (thetime, error_message))
            if root:
                return 'Bad query'
            else:
                raise ValueError('%s: Query %s' % (thetime, error_message))

    def tok_by_reg(pattern, list_of_toks, concordancing = False, **kwargs):
        """search for regex in plaintext corpora"""
        import re
        comped = compiler(pattern)
        if comped == 'Bad query':
            return 'Bad query'
        if not concordancing:
            matches = [m for m in list_of_toks if re.search(comped, m)]
        else:
            matches = []
            for index, token in enumerate(list_of_toks):
                if re.search(comped, token):
                    match = [' '.join([t for t in unsplitter(list_of_toks[:index])])[-140:]]
                    match.append(re.search(comped, token).group(0))
                    match.append(' '.join([t for t in unsplitter(list_of_toks[index + 1:])])[:140])
                    matches.append(match)
        if countmode:
            return(len(matches))
        else:
            return matches

    def plaintext_regex_search(pattern, plaintext_data, concordancing = False, **kwargs):
        """search for regex in plaintext corpora

        it searches over lines, so the user needs to be careful.
        """
        import re
        if concordancing:
            pattern = r'(.{,140})\b(' + pattern + r')\b(.{,140})'
        compiled_pattern = compiler(pattern)
        if compiled_pattern == 'Bad query':
            return 'Bad query'
        matches = re.findall(compiled_pattern, plaintext_data)
        if concordancing:
            matches = [list(m) for m in matches]
        if not concordancing:
            for index, i in enumerate(matches):
                if type(i) == tuple:
                    matches[index] = i[0]
        if countmode:
            return(len(matches))
        else:
            return matches

    def correct_spelling(a_string):
        if not spelling:
            return a_string
        from dictionaries.word_transforms import usa_convert
        if spelling.lower() == 'uk':
            usa_convert = {v: k for k, v in list(usa_convert.items())}
        spell_out = []
        bits = a_string.split('/')
        for index, i in enumerate(bits):
            converted = usa_convert.get(i.lower(), i)
            if i.islower() or preserve_case is False:
                converted = converted.lower()
            elif i.isupper() and preserve_case:
                converted = converted.upper()
            elif i.istitle() and preserve_case:
                converted = converted.title()
            bits[index] = converted
        r = '/'.join(bits)
        return r
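
    # e.g. with spelling = 'US', 'colour/colour' becomes 'color/color'
    # (assuming dictionaries.word_transforms.usa_convert maps UK -> US forms)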

    def plaintext_simple_search(pattern, plaintext_data, concordancing = False, **kwargs):
        """search for tokens in plaintext corpora"""
        import re
        result = []
        if type(pattern) == str:
            pattern = [pattern]
        for p in pattern:
            if concordancing:
                pat = r'(.{0,140})\b(' + re.escape(p) + r')\b(.{0,140})'
            else:
                # assumed: plain whole-word match when not concordancing
                pat = r'\b' + re.escape(p) + r'\b'
            pat = compiler(pat)
            if pat == 'Bad query':
                return 'Bad query'
            matches = re.findall(pat, plaintext_data)
            if concordancing:
                matches = [list(m) for m in matches]
                for i in matches:
                    result.append(i)
            else:   
                for m in range(len(matches)):
                    result.append(p)
        return result

    # do multiprocessing if need be
    im, corpus, search, query, just_speakers = is_multiquery(corpus, search, query, just_speakers)

    if hasattr(corpus, '__iter__') and im:
        corpus = Corpus(corpus)
    if hasattr(corpus, '__iter__') and not im:
        im = True
    if corpus.__class__ == Corpora:
        im = True

    if not im and multiprocess:
        im = True
        corpus = corpus[:]
    # if it's already been through pmultiquery, don't do it again
    
    locs['search'] = search
    locs['query'] = query
    locs['just_speakers'] = just_speakers
    locs['corpus'] = corpus
    locs['multiprocess'] = multiprocess
    locs['print_info'] = kwargs.get('printstatus', True)

    if im:
        signal.signal(signal.SIGINT, original_sigint)
        from multiprocess import pmultiquery
        return pmultiquery(**locs)

    cname = corpus.name
    subcorpora = corpus.subcorpora
    
    try:
        datatype = corpus.datatype
        singlefile = corpus.singlefile
    except AttributeError:
        datatype = 'parse'
        singlefile = False
        
    # store all results in here
    results = {}
    count_results = {}
    conc_results = {}
    # check if just counting
    countmode = 'c' in show
    if countmode:
        no_conc = True
        only_conc = False
    # where we are at in interrogation
    current_iter = 0

    # multiprocessing progress bar
    denom = kwargs.get('denominator', 1)
    startnum = kwargs.get('startnum', 0)

    ############################################
    # Determine the search function to be used #
    ############################################
    
    # simple tregex is tregex over whole dirs
    simple_tregex_mode = False
    statsmode = False
    if not just_speakers and 't' in list(search.keys()):
        simple_tregex_mode = True
    else:
        if datatype == 'plaintext':
            if search.get('n'):
                raise NotImplementedError('Use a tokenised corpus for n-gramming.')
            if search.get('w'):
                if kwargs.get('regex', True):
                    searcher = plaintext_regex_search
                else:
                    searcher = plaintext_simple_search
                optiontext = 'Searching plaintext'

        elif datatype == 'tokens':
            if search.get('n'):
                searcher = tok_ngrams
                optiontext = 'n-grams via tokens'
            elif search.get('w'):
                if kwargs.get('regex', True):
                    searcher = tok_by_reg
                else:
                    searcher = tok_by_list
                if type(search.get('w')) == list:
                    searcher = tok_by_list
                optiontext = 'Searching tokens'
        elif datatype == 'parse':
            if search.get('t'):
                searcher = slow_tregex
            elif search.get('s'):
                searcher = get_stats
                statsmode = True
                optiontext = 'General statistics'
                global numdone
                numdone = 0
                no_conc = True
                only_conc = False
                do_concordancing = False
            else:
                from depsearch import dep_searcher
                searcher = dep_searcher
                optiontext = 'Dependency querying'

        # these search options need a parsed corpus
        only_parse = ['r', 'd', 'g', 'dl', 'gl', 'df', 'gf', 'dp', 'gp', 'f', 'd2', 'd2f', 'd2p', 'd2l']
        if datatype != 'parse' and any(i in only_parse for i in list(search.keys())):
            raise ValueError('Need parsed corpus to search with "%s" option(s).' % ', '.join([i for i in list(search.keys()) if i in only_parse]))

    ############################################
    #      Set some Tregex-related values      #
    ############################################

    if search.get('t'):
        translated_option = 't'
        query = search.get('t')

        # check the query (tregex_engine returns False for a bad query)
        q = tregex_engine(corpus = False, query = search.get('t'),
                          options = ['-t'], check_query = True, root = root)
        if q is False:
            if root:
                return 'Bad query'
            else:
                return

        optiontext = 'Searching parse trees'
        if 'p' in show or 'pl' in show:
            translated_option = 'u'
            if type(search['t']) == list:
                search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)'
        elif 't' in show:
            translated_option = 'o'
            if type(search['t']) == list:
                search['t'] = r'__ < (/%s/ !< __)' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'__ < (/.?[A-Za-z0-9].?/ !< __)'
        elif 'w' in show:
            translated_option = 't'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'
        elif 'c' in show:
            translated_option = 'C'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __'  % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'
        elif 'l' in show:
            translated_option = 't'
            if type(search['t']) == list:
                search['t'] = r'/%s/ !< __' % as_regex(search['t'], boundaries = 'line', 
                                            case_sensitive = case_sensitive)
            if search['t'] == 'any':
                search['t'] = r'/.?[A-Za-z0-9].?/ !< __'

        query = search['t']

    ############################################
    # Make iterable for corpus/subcorpus/file  #
    ############################################

    if corpus.__class__ == Datalist:
        to_iterate_over = {}
        for subcorpus in corpus:
            to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files
    elif singlefile:
        to_iterate_over = {(corpus.name, corpus.path): [corpus]}
    elif not subcorpora:
        to_iterate_over = {(corpus.name, corpus.path): corpus.files}
    else:
        to_iterate_over = {}
        for subcorpus in subcorpora:
            to_iterate_over[(subcorpus.name, subcorpus.path)] = subcorpus.files
        #for k, v in sorted(corpus.structure.items(), key=lambda obj: obj[0].name):
        #    to_iterate_over[(k.name, k.path)] = v
    if files_as_subcorpora:
        to_iterate_over = {}
        for f in corpus.files:
            to_iterate_over[(f.name, f.path)] = [f]

    ############################################
    #           Print welcome message          #
    ############################################

    if no_conc:
        message = 'Interrogating'
    else:
        message = 'Interrogating and concordancing'
    if kwargs.get('printstatus', True):
        thetime = strftime("%H:%M:%S", localtime())

        sformat = '\n                 '.join(['%s: %s' % (k.rjust(3), v) for k, v in list(search.items())])
        if search == {'s': r'.*'}:
            sformat = 'features'
        welcome = '\n%s: %s %s ...\n          %s\n          Query: %s\n          %s corpus ... \n' % \
                  (thetime, message, cname, optiontext, sformat, message)
        print(welcome)

    ############################################
    #           Make progress bar              #
    ############################################

    if simple_tregex_mode:
        total_files = len(list(to_iterate_over.keys()))
    else:
        if search.get('s'):
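            # statsmode runs 12 tregex queries per file (see tregex_qs), hence the * 12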
            total_files = sum([len(x) for x in list(to_iterate_over.values())]) * 12
        else:
            total_files = sum([len(x) for x in list(to_iterate_over.values())])

    par_args = {'printstatus': kwargs.get('printstatus', True),
                'root': root, 
                'note': note,
                'length': total_files,
                'startnum': kwargs.get('startnum'),
                'denom': kwargs.get('denominator', 1)}

    term = None
    if kwargs.get('paralleling', None) is not None:
        from blessings import Terminal
        term = Terminal()
        par_args['terminal'] = term
        par_args['linenum'] = kwargs.get('paralleling')

    outn = kwargs.get('outname', '')
    if outn:
        outn = outn + ': '
    tstr = '%s%d/%d' % (outn, current_iter, total_files)
    p = animator(None, None, init = True, tot_string = tstr, **par_args)
    tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
    animator(p, current_iter, tstr, **par_args)

    ############################################
    # Iterate over data, doing interrogations  #
    ############################################

    for (subcorpus_name, subcorpus_path), files in sorted(to_iterate_over.items()):

        conc_results[subcorpus_name] = []
        count_results[subcorpus_name] = []
        results[subcorpus_name] = Counter()
        
        # tregex over subcorpora, not files
        if simple_tregex_mode:

            op = ['-o', '-' + translated_option]                
            result = tregex_engine(query = search['t'], options = op, 
                                   corpus = subcorpus_path, root = root, preserve_case = preserve_case)

            if not countmode:
                result = format_tregex(result)

            if not no_conc:
                op += ['-w', '-f']
                whole_result = tregex_engine(query = search['t'], options = op, 
                                   corpus = subcorpus_path, root = root, preserve_case = preserve_case)
                
                if not only_format_match:
                    whole_result = format_tregex(whole_result, whole = True)

                conc_result = make_conc_lines_from_whole_mid(whole_result, result, speakr = False)

            if countmode:
                count_results[subcorpus_name] += [result]            
            else:
                result = Counter(result)
                results[subcorpus_name] += result
                if not no_conc:
                    for lin in conc_result:
                        if numconc < maxconc or not maxconc:
                            conc_results[subcorpus_name].append(lin)
                            numconc += 1

            current_iter += 1
            tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)

            animator(p, current_iter, tstr, **par_args)

        # dependencies, plaintext, tokens or slow_tregex
        else:
            for f in files:
                slow_treg_speaker_guess = kwargs.get('outname', False)
                if datatype == 'parse':
                    with open(f.path, 'r') as data:
                        data = data.read()
                        from corenlp_xml.document import Document
                        try:
                            corenlp_xml = Document(data)
                        except Exception:
                            print('Could not read file: %s' % f.path)
                            continue
                        if just_speakers:  
                            sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers]
                            if len(just_speakers) == 1:
                                slow_treg_speaker_guess = just_speakers[0]
                            if not sents:
                                continue
                        else:
                            sents = corenlp_xml.sentences

                        res, conc_res = searcher(sents, search = search, show = show,
                            dep_type = dep_type,
                            exclude = exclude,
                            excludemode = excludemode,
                            searchmode = searchmode,
                            lemmatise = False,
                            case_sensitive = case_sensitive,
                            do_concordancing = do_concordancing,
                            only_format_match = only_format_match,
                            speaker = slow_treg_speaker_guess)
                        
                        if res == 'Bad query':
                            return 'Bad query'

                elif datatype == 'tokens':
                    import pickle
                    with codecs.open(f.path, "rb") as fo:
                        data = pickle.load(fo)
                    if not only_conc:
                        res = searcher(list(search.values())[0], data,
                                       split_contractions = split_contractions,
                                       concordancing = False)
                    if not no_conc:
                        conc_res = searcher(list(search.values())[0], data,
                                            split_contractions = split_contractions,
                                            concordancing = True)
                        for index, line in enumerate(conc_res):
                            # placeholder speaker column, filled in later
                            line.insert(0, '')

                elif datatype == 'plaintext':
                    with codecs.open(f.path, 'rb', encoding = 'utf-8') as data:
                        data = data.read()
                        if not only_conc:
                            res = searcher(list(search.values())[0], data,
                                           concordancing = False)
                        if not no_conc:
                            conc_res = searcher(list(search.values())[0], data,
                                                concordancing = True)
                            for index, line in enumerate(conc_res):
                                # placeholder speaker column, filled in later
                                line.insert(0, '')

                if countmode:
                    count_results[subcorpus_name] += [res]
                else:
                    # add filename and do lowercasing for conc
                    if not no_conc:
                        for index, line in enumerate(conc_res):
                            if searcher != slow_tregex:
                                line.insert(0, f.name)
                            else:
                                line[0] = f.name
                            if not preserve_case:
                                line[3:] = [x.lower() for x in line[3:]]
                            if spelling:
                                line = [correct_spelling(b) for b in line]
                            if numconc < maxconc or not maxconc:
                                conc_results[subcorpus_name].append(line)
                                numconc += 1

                    # do lowercasing and spelling
                    if not only_conc:
                        if not preserve_case and not statsmode:
                            res = [i.lower() for i in res]
                        if spelling and not statsmode:
                            res = [correct_spelling(r) for r in res]
                        results[subcorpus_name] += Counter(res)

                if not statsmode:
                    current_iter += 1
                    tstr = '%s%d/%d' % (outn, current_iter + 1, total_files)
                    animator(p, current_iter, tstr, **par_args)

    # delete temp file if there
    import os
    if os.path.isfile('tmp.txt'):
        os.remove('tmp.txt')

    ############################################
    #     Get concordances into DataFrame      #
    ############################################

    if not no_conc:
        all_conc_lines = []
        for sc_name, resu in sorted(conc_results.items()):
            if only_unique:
                unique_results = uniquify(resu)
            else:
                unique_results = resu
            #make into series
            pindex = 'c f s l m r'.split()
            for fname, spkr, start, word, end in unique_results:
                #spkr = str(spkr, errors = 'ignore')
                fname = os.path.basename(fname)
                all_conc_lines.append(Series([sc_name,
                                              fname,
                                              spkr,
                                              start,
                                              word,
                                              end],
                                             index = pindex))

        # randomise results...
        if random:
            from random import shuffle
            shuffle(all_conc_lines)

        conc_df = pd.concat(all_conc_lines, axis = 1).T

        # not doing anything yet --- this is for multimodal concordancing
        add_links = False
        if not add_links:
            conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r']
        else:
            conc_df.columns = ['c', 'f', 's', 'l', 'm', 'r', 'link']

        if all(x == '' for x in list(conc_df['s'].values)):
            conc_df.drop('s', axis = 1, inplace = True)

        from interrogation import Concordance
        output = Concordance(conc_df)
        if only_conc:
            output.query = locs
            if save:
                output.save(save)

            if kwargs.get('printstatus', True):
                thetime = strftime("%H:%M:%S", localtime())
                finalstring = '\n\n%s: Concordancing finished! %d results.' % (thetime, len(conc_df))
                print(finalstring)
            signal.signal(signal.SIGINT, original_sigint)
            return output

    ############################################
    #     Get interrogation into DataFrame     #
    ############################################

    if not only_conc:
        if countmode:
            df = Series({k: sum(v) for k, v in sorted(count_results.items())})
            tot = df.sum()
        else:
            the_big_dict = {}
            unique_results = set([item for sublist in list(results.values()) for item in sublist])
            for word in unique_results:
                the_big_dict[word] = [subcorp_result[word] for name, subcorp_result in sorted(results.items(), key=lambda x: x[0])]
            # turn master dict into dataframe, sorted
            df = DataFrame(the_big_dict, index = sorted(results.keys()))

            numentries = len(df.columns)
            tot = df.sum(axis = 1)
            total_total = df.sum().sum()

        ############################################
        # Format, output as Interrogation object   #
        ############################################

        if not countmode:
            if not subcorpora or singlefile:
                if not files_as_subcorpora:
                    if not kwargs.get('df1_always_df'):
                        df = Series(df.iloc[0])
                        df.sort_values(ascending = False, inplace = True)
                        tot = df.sum()
                        numentries = len(df.index)
                        total_total = tot

        # sort columns by their totals, descending
        if type(df) == pd.core.frame.DataFrame:
            if not df.empty:
                df = df[df.sum().sort_values(ascending = False).index]

        # format final string
        if kwargs.get('printstatus', True):
            thetime = strftime("%H:%M:%S", localtime())
            finalstring = '\n\n%s: Interrogation finished!' % thetime
            if countmode:
                finalstring += ' %d matches.' % tot
            else:
                finalstring += ' %d unique results, %d total occurrences.' % (numentries, total_total)
            print(finalstring)

        if not no_conc:
            interro = Interrogation(results = df, totals = tot, query = locs, concordance = output)
        else:
            interro = Interrogation(results = df, totals = tot, query = locs)

        if save:
            interro.save(save)
        signal.signal(signal.SIGINT, original_sigint)
        return interro
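
A hedged usage sketch for interrogator() above. The module name, corpus path and query are assumptions, the keyword names mirror the signature at the top of this example, and the attribute access assumes Interrogation and Concordance expose what is passed to their constructors:

from interrogator import interrogator    # assumed module name

interro = interrogator('data/my-parsed-corpus',          # hypothetical path
                       search = {'t': r'/NN.?/ < __'},   # tregex over parse trees
                       show = 'w',
                       do_concordancing = True,
                       maxconc = 500)
print(interro.results.head())        # frequencies: subcorpora x results
print(interro.concordance.head())    # c/f/s/l/m/r concordance lines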
Example #5
0
    def get_stats(sents, **dummy_args):
        """get a bunch of frequencies on interpersonal phenomena"""
        import os
        import re
        from collections import Counter
        statsmode_results = Counter()  
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        with open(to_open, "w") as fo:
            for sent in sents:
                statsmode_results['Sentences'] += 1
                sts = sent.parse_string.rstrip()
                encd = sts.encode('utf-8', errors = 'ignore') + '\n'
                fo.write(encd)
                deps = get_deps(sent, dep_type)
                numpass = len([x for x in deps.links if x.type.endswith('pass')])
                statsmode_results['Passives'] += numpass
                statsmode_results['Tokens'] += len(sent.tokens)
                words = [w.word for w in sent.tokens if w.word.isalnum()]
                statsmode_results['Words'] += len(words)
                statsmode_results['Characters'] += len(''.join(words))

        # count moods via trees          (/\?/ !< __)
        from dictionaries.process_types import processes
        from other import as_regex
        tregex_qs = {'Imperative': r'ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/',
                     'Open interrogative': r'ROOT < SBARQ <<- (/\?/ !< __)', 
                     'Closed interrogative': r'ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))',
                     'Unmodalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))',
                     'Modalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))',
                     'Open class words': r'/^(NN|JJ|VB|RB)/ < __',
                     'Closed class words': r'__ !< __ !> /^(NN|JJ|VB|RB)/',
                     'Clauses': r'/^S/ < __',
                     'Interrogative': r'ROOT << (/\?/ !< __)',
                     'Mental processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.mental, boundaries = 'w'),
                     'Verbal processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.verbal, boundaries = 'w'),
                     'Relational processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.relational, boundaries = 'w')
                     }

        for name, q in sorted(tregex_qs.items()):
            res = tregex_engine(query = q, 
                  options = ['-o', '-C'], 
                  corpus = to_open,  
                  root = root)
            statsmode_results[name] += int(res)
            global numdone
            numdone += 1
            if root:
                root.update()
            else:
                tot_string = str(numdone + 1) + '/' + str(total_files)
                if kwargs.get('outname'):
                    tot_string = '%s: %s' % (kwargs['outname'], tot_string)
                animator(p, numdone, tot_string, **par_args)
            if kwargs.get('note', False):
                kwargs['note'].progvar.set((numdone * 100.0 / total_files / denom) + startnum)
        os.remove(to_open)
        return statsmode_results, []
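
get_stats above accumulates every frequency in a single collections.Counter, so per-sentence tallies merge for free. A minimal sketch of that tallying pattern, with plain token lists standing in for parsed corenlp_xml sentences:

# sketch only: pre-tokenised sentences stand in for parsed XML
from collections import Counter

sents = [['This', 'is', 'a', 'clause', '.'], ['So', 'is', 'this', '.']]
statsmode_results = Counter()
for sent in sents:
    statsmode_results['Sentences'] += 1
    statsmode_results['Tokens'] += len(sent)
    words = [w for w in sent if w.isalnum()]
    statsmode_results['Words'] += len(words)
    statsmode_results['Characters'] += len(''.join(words))

print(statsmode_results.most_common())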
Example #6
0
def dep_searcher(sents,
                 search,
                 show='w',
                 dep_type='collapsed-ccprocessed-dependencies',
                 regex_nonword_filter=r'[A-Za-z0-9:_]',
                 do_concordancing=False,
                 exclude=False,
                 excludemode='any',
                 searchmode='all',
                 lemmatise=False,
                 case_sensitive=False,
                 progbar=False,
                 only_format_match=False,
                 speaker=False):
    """
    search corenlp dependency parse
    1. search for 'search' keyword arg
       governor
       dependent
       function
       pos
       lemma
       word
       index
       etc

    2. exclude entries if need be, using same method as search

    3. return '/'-sep list of 'show' keyword arg, or conc lines:
       governor
       dependent
       function
       pos
       lemma
       word
       index
       distance
       etc

       ... or just return int count.
    """
    import re
    from corenlp_xml.document import Document
    from collections import Counter
    from build import flatten_treestring
    from process import filtermaker, animator, get_deps

    def distancer(lks, lk):
        "determine number of jumps to root"
        c = 0
        # get the gov index, stop when it's zero
        root_found = False
        while not root_found:
            if c == 0:
                try:
                    link_to_check = next(i for i in lks
                                         if i.dependent.idx == lk.id)
                except StopIteration:
                    root_found = True
                    break
                #link_to_check = lk
            gov_index = link_to_check.governor.idx
            if gov_index == 0:
                root_found = True
            else:
                if c > 29:
                    root_found = True
                    break
                link_to_check = [
                    l for l in lks if l.dependent.idx == gov_index
                ]
                if len(link_to_check) > 0:
                    link_to_check = link_to_check[0]
                else:
                    break
                c += 1
        if c < 30:
            return c

    def get_matches_from_sent(s,
                              search,
                              deps=False,
                              tokens=False,
                              dep_type='basic-dependencies',
                              mode='all'):
        """process a sentence object, returning matching tok ids"""
        from process import get_deps
        import re
        lks = []
        if not deps:
            deps = get_deps(s, dep_type)
        if not tokens:
            tokens = s.tokens

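        # flatten dict-valued searches: {'g': {'w': ..., 'p': ...}}
        # becomes {'g': ..., 'gp': ...}, so every key maps to one pattern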
        for opt, pat in list(search.items()):
            if type(pat) == dict:
                del search[opt]
                for k, v in list(pat.items()):
                    if k != 'w':
                        search[opt + k] = v
                    else:
                        search[opt] = v

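        # normalise each pattern: 'any' matches everything, lists become a
        # single alternation via filtermaker, and plain strings are compiled
        # with the requested case sensitivity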
        for opt, pat in list(search.items()):
            if pat == 'any':
                pat = re.compile(r'.*')
            elif type(pat) == list:
                if all(type(x) == int for x in pat):
                    pat = [str(x) for x in pat]
                pat = filtermaker(pat, case_sensitive=case_sensitive)
            else:
                if case_sensitive:
                    pat = re.compile(pat)
                else:
                    pat = re.compile(pat, re.IGNORECASE)
            if opt == 'g':
                got = []
                for l in deps.links:
                    if re.search(pat, l.governor.text):
                        got.append(s.get_token_by_id(l.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gf':
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        gov_index = l.dependent.idx
                        for l2 in deps.links:
                            if l2.governor.idx == gov_index:
                                got.append(s.get_token_by_id(l2.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'df':
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        got.append(s.get_token_by_id(l.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gl':
                got = []
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        for i in deps.links:
                            if i.governor.idx == tok.id:
                                got.append(s.get_token_by_id(i.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gp':
                got = []
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        for i in deps.links:
                            if i.governor.idx == tok.id:
                                got.append(s.get_token_by_id(i.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'dl':
                got = []
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        for i in deps.links:
                            if i.dependent.idx == tok.id:
                                got.append(s.get_token_by_id(i.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'dp':
                got = []
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        for i in deps.links:
                            if i.dependent.idx == tok.id:
                                got.append(s.get_token_by_id(i.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)

            elif opt == 'd':
                got = []
                for l in deps.links:
                    if re.search(pat, l.dependent.text):
                        got.append(s.get_token_by_id(l.governor.idx))

                got = set(got)
                for i in got:
                    lks.append(i)

            elif opt == 'f':
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        got.append(s.get_token_by_id(l.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'p':
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        lks.append(tok)
            elif opt == 'pl':
                for tok in tokens:
                    from dictionaries.word_transforms import taglemma
                    postag = tok.pos
                    if postag.lower() in list(taglemma.keys()):
                        stemmedtag = taglemma[postag.lower()]
                    else:
                        stemmedtag = postag.lower()
                    if re.search(pat, stemmedtag):
                        lks.append(tok)
            elif opt == 'l':
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        lks.append(tok)
            elif opt == 'w':
                for tok in tokens:
                    if re.search(pat, tok.word):
                        lks.append(tok)
            elif opt == 'i':
                for tok in tokens:
                    if re.search(pat, str(tok.id)):
                        lks.append(tok)
            elif opt == 'r':
                for tok in tokens:
                    dist = distancer(deps.links, tok)
                    if dist is not None and dist is not False:
                        try:
                            if int(dist) == int(pat):
                                lks.append(tok)
                        except TypeError:
                            # pat is a compiled regex at this point, so
                            # match it against the distance as a string
                            if re.search(pat, str(dist)):
                                lks.append(tok)

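        # in 'all' mode a token must have matched every search key, so keep
        # only tokens that appeared at least once per key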
        if mode == 'all':
            from collections import Counter
            counted = Counter(lks)
            lks = [
                k for k, v in counted.items() if v >= len(list(search.keys()))
            ]
        return lks

    result = []
    conc_result = []
    numdone = 0

    for s in sents:
        numdone += 1
        deps = get_deps(s, dep_type)
        tokens = s.tokens
        lks = get_matches_from_sent(s,
                                    search,
                                    deps,
                                    tokens,
                                    dep_type,
                                    mode=searchmode)

        #if not concordancing:
        #    lks = list(set([x for x in lks if x and re.search(regex_nonword_filter, x.word)]))

        if exclude is not False:
            to_remove = get_matches_from_sent(s,
                                              exclude,
                                              deps,
                                              tokens,
                                              dep_type,
                                              mode=excludemode)

            for i in to_remove:
                try:
                    lks.remove(i)
                except ValueError:
                    pass

        if progbar:
            tstr = '%d/%d' % (numdone, len(sents))
            animator(progbar, numdone, tstr)

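        # count mode ('c'): store only the number of matches per sentence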
        if 'c' in show:
            result.append(len(lks))
            continue

        if do_concordancing:
            for lk in lks:  # for each concordance middle part
                one_result = []
                if not lk:
                    continue
                # get the index of the match
                windex = int(lk.id) - 1
                speakr = s.speakername
                if not speakr:
                    speakr = ''
                # begin building line with speaker first
                conc_line = [speakr]
                # format a single word correctly
                if only_format_match:
                    start = ' '.join([
                        t.word for index, t in enumerate(s.tokens)
                        if index < windex
                    ])
                    end = ' '.join([
                        t.word for index, t in enumerate(s.tokens)
                        if index > windex
                    ])
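                    # NB: this replaces the sentence's token list so that
                    # only the matched token is formatted below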
                    s.tokens = [s.get_token_by_id(lk.id)]
                for tok in s.tokens:
                    single_wd = {}
                    intermediate_result = []
                    if 'w' in show:
                        single_wd['w'] = tok.word
                    if 'l' in show:
                        from dictionaries.word_transforms import wordlist
                        if tok.lemma in list(wordlist.keys()):
                            lem = wordlist[tok.lemma]
                        else:
                            lem = tok.lemma
                        single_wd['l'] = lem
                    if 'p' in show:
                        single_wd['p'] = tok.pos

                    if 'pl' in show:
                        from dictionaries.word_transforms import taglemma
                        postag = tok.pos
                        if postag.lower() in list(taglemma.keys()):
                            single_wd['pl'] = taglemma[postag.lower()]
                        else:
                            single_wd['pl'] = postag.lower()
                        if not single_wd['pl']:
                            single_wd['pl'] = 'none'

                    if 'r' in show:
                        all_lks = [l for l in deps.links]
                        distance = distancer(all_lks, tok)
                        if distance:
                            single_wd['r'] = str(distance)
                        else:
                            single_wd['r'] = '0'
                    if 'f' in show:
                        # default '.' if the token is not a dependent anywhere
                        single_wd['f'] = '.'
                        for lin in deps.links:
                            if tok.id == lin.dependent.idx:
                                single_wd['f'] = lin.type
                                break
                    if 'i' in show:
                        single_wd['i'] = str(tok.id)

                    if any(x.startswith('g') for x in show):
                        thegovid = next((q.governor.idx for q in deps.links \
                                        if q.dependent.idx == tok.id), False)
                        govtok = False
                        if thegovid is not False:
                            govtok = s.get_token_by_id(thegovid)

                        if 'g' in show:
                            if govtok:
                                single_wd['g'] = govtok.word
                            else:
                                single_wd['g'] = 'none'
                        if 'gl' in show:
                            if govtok:
                                single_wd['gl'] = govtok.lemma
                            else:
                                single_wd['gl'] = 'none'
                        if 'gp' in show:
                            if govtok:
                                single_wd['gp'] = govtok.pos
                            else:
                                single_wd['gp'] = 'none'

                        if 'gf' in show:
                            if govtok:
                                single_wd['gf'] = next((x.type for x in deps.links
                                            if x.dependent.idx == thegovid), 'root')
                            else:
                                single_wd['gf'] = 'none'

                    if any(x.startswith('d') for x in show):
                        thedepid = next((q.dependent.idx for q in deps.links \
                                        if q.governor.idx == tok.id), False)

                        deptok = False
                        if thedepid is not False:
                            deptok = s.get_token_by_id(thedepid)

                        if 'd' in show:
                            if deptok:
                                single_wd['d'] = deptok.word
                            else:
                                single_wd['d'] = 'none'

                        if 'dl' in show:
                            if deptok:
                                single_wd['dl'] = deptok.lemma
                            else:
                                single_wd['dl'] = 'none'
                        if 'dp' in show:
                            if deptok:
                                single_wd['dp'] = deptok.pos
                            else:
                                single_wd['dp'] = 'none'
                        if 'df' in show:
                            if thedepid is not False:
                                single_wd['df'] = next(x.type for x in deps.links
                                                       if x.dependent.idx == thedepid)
                            else:
                                single_wd['df'] = 'none'
                    for i in show:
                        intermediate_result.append(single_wd[i])
                    intermediate_result = [
                        i.replace('/', '-slash-')
                        for i in intermediate_result
                    ]
                    one_result.append('/'.join(intermediate_result))
                # now we have formatted tokens as a list. we need to split
                # it into start, middle and end
                if not only_format_match:
                    start = ' '.join([
                        w for index, w in enumerate(one_result)
                        if index < windex
                    ])
                    end = ' '.join([
                        w for index, w in enumerate(one_result)
                        if index > windex
                    ])
                    middle = one_result[windex]
                else:
                    middle = one_result[0]

                for bit in start, middle, end:
                    conc_line.append(bit)
                conc_result.append(conc_line)

        # figure out what to show
        for lk in lks:
            single_result = {}
            if not lk:
                continue
            if 'w' in show:
                single_result['w'] = 'none'
                if lemmatise:
                    single_result['w'] = lk.lemma
                else:
                    single_result['w'] = lk.word
            if 'l' in show:
                from dictionaries.word_transforms import wordlist
                if lk.lemma in list(wordlist.keys()):
                    lem = wordlist[lk.lemma]
                else:
                    lem = lk.lemma
                single_result['l'] = lem
            if 'p' in show:
                single_result['p'] = 'none'
                postag = lk.pos
                if lemmatise:
                    from dictionaries.word_transforms import taglemma
                    if postag.lower() in list(taglemma.keys()):
                        single_result['p'] = taglemma[postag.lower()]
                    else:
                        single_result['p'] = postag.lower()
                else:
                    single_result['p'] = postag
                if not single_result['p']:
                    single_result['p'] = 'none'

            if 'pl' in show:
                single_result['pl'] = 'none'
                postag = lk.pos
                from dictionaries.word_transforms import taglemma
                if postag.lower() in list(taglemma.keys()):
                    single_result['pl'] = taglemma[postag.lower()]
                else:
                    single_result['pl'] = postag.lower()
                if not single_result['pl']:
                    single_result['pl'] = 'none'

            if 'f' in show:
                single_result['f'] = 'none'
                for i in deps.links:
                    if i.dependent.idx == lk.id:
                        single_result['f'] = i.type.rstrip(',')
                        break
                if single_result['f'] == '':
                    single_result['f'] = 'root'

            if 'g' in show:
                single_result['g'] = 'none'
                for i in deps.links:
                    if i.dependent.idx == lk.id:
                        if s.get_token_by_id(i.governor.idx):
                            if lemmatise:
                                single_result['g'] = s.get_token_by_id(
                                    i.governor.idx).lemma
                            else:
                                single_result['g'] = i.governor.text
                        else:
                            single_result['g'] = 'root'
                        break

            if 'd' in show:
                single_result['d'] = 'none'
                for i in deps.links:
                    if i.governor.idx == lk.id:
                        if s.get_token_by_id(i.dependent.idx):
                            if lemmatise:
                                single_result['d'] = s.get_token_by_id(
                                    i.dependent.idx).lemma
                            else:
                                single_result['d'] = i.dependent.text
                        break

            if 'gl' in show:
                single_result['gl'] = 'none'
                for i in deps.links:
                    if i.dependent.idx == lk.id:
                        if s.get_token_by_id(i.governor.idx):
                            single_result['gl'] = s.get_token_by_id(
                                i.governor.idx).lemma
                        else:
                            single_result['gl'] = 'root'
                        break

            if 'dl' in show:
                single_result['dl'] = 'none'
                for i in deps.links:
                    if i.governor.idx == lk.id:
                        if s.get_token_by_id(i.dependent.idx):
                            single_result['dl'] = s.get_token_by_id(
                                i.dependent.idx).lemma
                        break

            if 'gp' in show:
                single_result['gp'] = 'none'
                for i in deps.links:
                    if i.dependent.idx == lk.id:
                        if s.get_token_by_id(i.governor.idx):
                            single_result['gp'] = s.get_token_by_id(
                                i.governor.idx).pos
                        break

            if 'dp' in show:
                single_result['dp'] = 'none'
                for i in deps.links:
                    if i.governor.idx == lk.id:
                        if s.get_token_by_id(i.dependent.idx):
                            single_result['dp'] = s.get_token_by_id(
                                i.dependent.idx).pos
                        break

            if 'df' in show:
                single_result['df'] = 'none'
                for i in deps.links:
                    if i.governor.idx == lk.id:
                        single_result['df'] = i.type
                        break

            if 'gf' in show:
                single_result['gf'] = 'none'
                for i in deps.links:
                    # if the result is the dependent, get the governor, find
                    # where it is itself a dependent, then get that link's type
                    if i.dependent.idx == lk.id:
                        gv = next((x for x in deps.links
                                   if x.dependent.idx == i.governor.idx), None)
                        single_result['gf'] = gv.type if gv is not None else 'root'
                        break

            if 'r' in show:
                all_lks = [l for l in deps.links]
                distance = distancer(all_lks, lk)
                if distance is not False and distance is not None:
                    single_result['r'] = str(distance)
                else:
                    single_result['r'] = '-1'

            if 'i' in show:
                single_result['i'] = str(lk.id)

            if 'c' not in show:

                # add them in order
                out = []
                for i in show:
                    out.append(single_result[i])

                out = [i.replace('/', '-slash-') for i in out]
                result.append('/'.join(out))

    if 'c' in show:
        result = sum(result)

    if type(do_concordancing) == str and do_concordancing.lower() == 'only':
        result = []
    return result, conc_result
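
For completeness, a hedged sketch of how dep_searcher might be driven. The filename, the nsubj query, and the assumption that corenlp_xml exposes Document(xml).sentences are illustrative, not taken from the source:

# hypothetical driver; 'parsed.xml' and the query are illustrative,
# and corenlp_xml must be installed and importable
from corenlp_xml.document import Document

with open('parsed.xml', 'rb') as fo:
    doc = Document(fo.read())

# find tokens whose grammatical function matches 'nsubj',
# showing word and POS for each hit
res, conc = dep_searcher(doc.sentences,
                         search = {'f': r'^nsubj'},
                         show = ['w', 'p'])
print(res[:10])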