コード例 #1
0
ファイル: depsearch.py プロジェクト: nkhuyu/corpkit
    def get_matches_from_sent(s, search, deps = False, tokens = False, 
        dep_type = 'basic-dependencies', mode = 'all'):
        """process a sentence object, returning matching tok ids

        :param s: a parsed sentence object (corenlp_xml style)
        :param search: dict mapping option codes to patterns; codes seen here:
                       'w' word, 'l' lemma, 'p' POS, 'pl' stemmed POS,
                       'g'/'gl'/'gp'/'gf' governor word/lemma/POS/function,
                       'd'/'dl'/'dp'/'df' dependent word/lemma/POS/function,
                       'f' link function, 'i' token index, 'r' distance to root
        :param deps: pre-fetched dependency graph (looked up when falsy)
        :param tokens: pre-fetched token list (looked up when falsy)
        :param dep_type: which dependency representation to load
        :param mode: 'all' keeps only tokens matched by every search option;
                     anything else keeps tokens matched by any option
        :returns: list of matching token objects
        """
        # NOTE(review): `filtermaker`, `case_sensitive` and `distancer` are
        # free names from an enclosing scope not shown in this listing --
        # confirm they are in scope wherever this function is defined.
        from process import get_deps
        import re
        lks = []
        if not deps:
            deps = get_deps(s, dep_type)
        if not tokens:
            tokens = s.tokens

        # Normalise `search` in place (iterating over a list copy so the
        # dict may be mutated safely): value lists are collapsed into a
        # single compiled regex, nested dicts are flattened into
        # '<opt><key>' entries, and the literal string 'any' becomes a
        # match-everything pattern.
        for opt, pat in list(search.items()):
            if type(pat) == list:
                if all(type(x) == int for x in pat):
                    pat = [str(x) for x in pat]
                pat = filtermaker(pat, case_sensitive = case_sensitive)
                search[opt] = pat
            if type(pat) == dict:
                del search[opt]
                for k, v in list(pat.items()):
                    if k != 'w':
                        search[opt + k] = v
                    else:
                        search[opt] = v
            if pat == 'any':
                search[opt] = re.compile(r'.*')

        # Run every search option, appending each option's matches to lks.
        # Most branches dedupe via set() so one option contributes a token
        # at most once -- that matters for the mode == 'all' count below.
        for opt, pat in list(search.items()):
            if opt == 'g':
                # governor word matches -> collect the dependents
                got = []
                for l in deps.links:
                    if re.search(pat, l.governor.text):
                        got.append(s.get_token_by_id(l.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gf':
                # governor's own grammatical function matches -> collect
                # the dependents of that governor
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        gov_index = l.dependent.idx
                        for l2 in deps.links:
                            if l2.governor.idx == gov_index:
                                got.append(s.get_token_by_id(l2.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'df':
                # dependent's function matches -> collect the governors
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        got.append(s.get_token_by_id(l.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gl':
                # governor lemma matches -> collect its dependents
                got = []
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        for i in deps.links:
                            if i.governor.idx == tok.id:
                                got.append(s.get_token_by_id(i.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gp':
                # governor POS matches -> collect its dependents
                got = []
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        for i in deps.links:
                            if i.governor.idx == tok.id:
                                got.append(s.get_token_by_id(i.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'dl':
                # dependent lemma matches -> collect its governors
                got = []
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        for i in deps.links:
                            if i.dependent.idx == tok.id:
                                got.append(s.get_token_by_id(i.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'dp':
                # dependent POS matches -> collect its governors
                got = []
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        for i in deps.links:
                            if i.dependent.idx == tok.id:
                                got.append(s.get_token_by_id(i.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)

            elif opt == 'd':
                # dependent word matches -> collect the governors
                got = []
                for l in deps.links:
                    if re.search(pat, l.dependent.text):
                        got.append(s.get_token_by_id(l.governor.idx))

                got = set(got)
                for i in got:
                    lks.append(i)

            elif opt == 'f':
                # link function matches -> collect the dependents
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        got.append(s.get_token_by_id(l.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'p':
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        lks.append(tok)
            elif opt == 'pl':
                # POS 'lemma': stem the tag via the taglemma mapping first
                for tok in tokens:
                    from dictionaries.word_transforms import taglemma
                    postag = tok.pos
                    if postag.lower() in list(taglemma.keys()):
                        stemmedtag = taglemma[postag.lower()]
                    else:
                        stemmedtag = postag.lower()
                    if re.search(pat, stemmedtag):
                        lks.append(tok)
            elif opt == 'l':
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        lks.append(tok)
            elif opt == 'w':
                for tok in tokens:
                    if re.search(pat, tok.word):
                        lks.append(tok)
            elif opt == 'i':
                for tok in tokens:
                    if re.search(pat, str(tok.id)):
                        lks.append(tok)
            elif opt == 'r':
                # distance to root; NOTE(review): `got` is dead here --
                # matches are appended straight to lks
                got = []
                for tok in tokens:
                    dist = distancer(deps.links, tok)
                    if dist is not None and dist is not False:
                        try:
                            # int(pat) raises TypeError for a compiled regex,
                            # falling through to the regex comparison below
                            if int(dist) == int(pat):
                                lks.append(tok)

                        except TypeError:
                            if re.search(pat, str(dist)):
                                lks.append(tok)

        if mode == 'all':
            from collections import Counter
            # a token must have been matched once per search option to survive
            counted = Counter(lks)
            lks = [k for k, v in counted.items() if v >= len(list(search.keys()))]
        return lks
コード例 #2
0
ファイル: depsearch.py プロジェクト: nkhuyu/corpkit
def dep_searcher(sents,
                 search,
                 show = 'w',
                 dep_type = 'collapsed-ccprocessed-dependencies',
                 regex_nonword_filter = r'[A-Za-z0-9:_]',
                 do_concordancing = False,
                 exclude = False,
                 excludemode = 'any',
                 searchmode = 'all',
                 lemmatise = False,
                 case_sensitive = False,
                 progbar = False,
                 only_format_match = False,
                 speaker = False):
    import re
    from corenlp_xml.document import Document
    from collections import Counter
    from build import flatten_treestring
    from process import filtermaker, animator, get_deps
    # NOTE(review): the triple-quoted string below is NOT a real docstring --
    # it comes after the imports, so it is just a no-op string expression.
    # Consider moving it directly beneath the `def` line.
    """
    search corenlp dependency parse
    1. search for 'search' keyword arg
       governor
       dependent
       function
       pos
       lemma
       word
       index
       etc

    2. exclude entries if need be, using same method as search

    3. return '/'-sep list of 'show' keyword arg, or conc lines:
       governor
       dependent
       function
       pos
       lemma
       word
       index
       distance
       etc
       
       ... or just return int count.
       """
    def distancer(lks, lk):
        """Count dependency-link hops from token *lk* up to the root.

        Follows governor links until a governor index of 0 (the root) is
        reached. Returns the hop count, or None once the walk exceeds the
        30-hop cycle guard. A token with no incoming link counts as 0 hops.
        """
        hops = 0
        current = None
        while True:
            if hops == 0:
                # locate the link whose dependent is the token itself
                try:
                    current = next(lnk for lnk in lks
                                   if lnk.dependent.idx == lk.id)
                except StopIteration:
                    # token never appears as a dependent: treat as root-level
                    break
            head = current.governor.idx
            if head == 0:
                break
            if hops > 29:
                # cycle guard tripped; fall through so None is returned
                break
            candidates = [lnk for lnk in lks if lnk.dependent.idx == head]
            if not candidates:
                # dangling chain: report the distance walked so far
                break
            current = candidates[0]
            hops += 1
        if hops < 30:
            return hops

    def get_matches_from_sent(s, search, deps = False, tokens = False, 
        dep_type = 'basic-dependencies', mode = 'all'):
        """process a sentence object, returning matching tok ids

        Option codes: 'w' word, 'l' lemma, 'p' POS, 'pl' stemmed POS,
        'g'/'gl'/'gp'/'gf' governor word/lemma/POS/function,
        'd'/'dl'/'dp'/'df' dependent word/lemma/POS/function,
        'f' link function, 'i' token index, 'r' distance to root.

        With mode == 'all', only tokens matched by every option are kept;
        otherwise a match on any option suffices. Returns token objects.
        """
        # Uses `filtermaker` and `case_sensitive` from the enclosing
        # dep_searcher scope, and the sibling `distancer` helper.
        from process import get_deps
        import re
        lks = []
        if not deps:
            deps = get_deps(s, dep_type)
        if not tokens:
            tokens = s.tokens

        # Normalise `search` in place (iterating a list copy so mutation is
        # safe): lists -> one compiled regex, nested dicts -> flattened
        # '<opt><key>' entries, 'any' -> match-everything pattern.
        for opt, pat in list(search.items()):
            if type(pat) == list:
                if all(type(x) == int for x in pat):
                    pat = [str(x) for x in pat]
                pat = filtermaker(pat, case_sensitive = case_sensitive)
                search[opt] = pat
            if type(pat) == dict:
                del search[opt]
                for k, v in list(pat.items()):
                    if k != 'w':
                        search[opt + k] = v
                    else:
                        search[opt] = v
            if pat == 'any':
                search[opt] = re.compile(r'.*')

        # Evaluate each option; most branches dedupe through set() so one
        # option contributes a token at most once (needed for 'all' mode).
        for opt, pat in list(search.items()):
            if opt == 'g':
                # governor word matches -> collect the dependents
                got = []
                for l in deps.links:
                    if re.search(pat, l.governor.text):
                        got.append(s.get_token_by_id(l.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gf':
                # governor's own function matches -> collect its dependents
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        gov_index = l.dependent.idx
                        for l2 in deps.links:
                            if l2.governor.idx == gov_index:
                                got.append(s.get_token_by_id(l2.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'df':
                # dependent's function matches -> collect the governors
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        got.append(s.get_token_by_id(l.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gl':
                # governor lemma matches -> collect its dependents
                got = []
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        for i in deps.links:
                            if i.governor.idx == tok.id:
                                got.append(s.get_token_by_id(i.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gp':
                # governor POS matches -> collect its dependents
                got = []
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        for i in deps.links:
                            if i.governor.idx == tok.id:
                                got.append(s.get_token_by_id(i.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'dl':
                # dependent lemma matches -> collect its governors
                got = []
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        for i in deps.links:
                            if i.dependent.idx == tok.id:
                                got.append(s.get_token_by_id(i.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'dp':
                # dependent POS matches -> collect its governors
                got = []
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        for i in deps.links:
                            if i.dependent.idx == tok.id:
                                got.append(s.get_token_by_id(i.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)

            elif opt == 'd':
                # dependent word matches -> collect the governors
                got = []
                for l in deps.links:
                    if re.search(pat, l.dependent.text):
                        got.append(s.get_token_by_id(l.governor.idx))

                got = set(got)
                for i in got:
                    lks.append(i)

            elif opt == 'f':
                # link function matches -> collect the dependents
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        got.append(s.get_token_by_id(l.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'p':
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        lks.append(tok)
            elif opt == 'pl':
                # stem the POS tag through taglemma before matching
                for tok in tokens:
                    from dictionaries.word_transforms import taglemma
                    postag = tok.pos
                    if postag.lower() in list(taglemma.keys()):
                        stemmedtag = taglemma[postag.lower()]
                    else:
                        stemmedtag = postag.lower()
                    if re.search(pat, stemmedtag):
                        lks.append(tok)
            elif opt == 'l':
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        lks.append(tok)
            elif opt == 'w':
                for tok in tokens:
                    if re.search(pat, tok.word):
                        lks.append(tok)
            elif opt == 'i':
                for tok in tokens:
                    if re.search(pat, str(tok.id)):
                        lks.append(tok)
            elif opt == 'r':
                # NOTE(review): `got` is dead in this branch -- matches are
                # appended straight to lks
                got = []
                for tok in tokens:
                    dist = distancer(deps.links, tok)
                    if dist is not None and dist is not False:
                        try:
                            # int(pat) raises TypeError for a compiled regex,
                            # falling back to the regex comparison below
                            if int(dist) == int(pat):
                                lks.append(tok)

                        except TypeError:
                            if re.search(pat, str(dist)):
                                lks.append(tok)

        if mode == 'all':
            from collections import Counter
            # a token must have matched once per search option to survive
            counted = Counter(lks)
            lks = [k for k, v in counted.items() if v >= len(list(search.keys()))]
        return lks

    result = []       # formatted '/'-joined match strings, or per-sentence counts
    conc_result = []  # concordance lines when do_concordancing is truthy
    numdone = 0

    # main loop: one parsed sentence at a time
    for s in sents:
        numdone += 1
        deps = get_deps(s, dep_type)
        tokens = s.tokens
        lks = get_matches_from_sent(s, search, deps, tokens, dep_type, mode = searchmode)

        #if not concordancing:
        #    lks = list(set([x for x in lks if x and re.search(regex_nonword_filter, x.word)]))

        # drop any match that also satisfies the exclude criteria
        if exclude is not False:
            to_remove = get_matches_from_sent(s, exclude, deps, tokens, dep_type, mode = excludemode)

            for i in to_remove:
                try:
                    lks.remove(i)
                except ValueError:
                    pass

        if progbar:
            tstr = '%d/%d' % (numdone, len(sents))
            animator(progbar, numdone, tstr)

        # counting mode: store the number of matches and skip formatting
        if 'c' in show:
            result.append(len(lks))
            continue

        if do_concordancing:
            for lk in lks: # for each concordance middle part
                one_result = []
                if not lk:
                    continue
                # get the index of the match
                windex = int(lk.id) - 1
                speakr = s.speakername
                if not speakr:
                    speakr = ''
                # begin building line with speaker first
                conc_line = [speakr]
                # format a single word correctly
                if only_format_match:
                    start = ' '.join([t.word for index, t in enumerate(s.tokens) if index < windex])
                    end = ' '.join([t.word for index, t in enumerate(s.tokens) if index > windex])
                    # NOTE(review): this overwrites s.tokens on the sentence
                    # object itself, which is destructive -- confirm callers
                    # never reuse the sentence afterwards
                    s.tokens = [s.get_token_by_id(lk.id)]
                # build a '/'-joined representation for every token kept
                for tok in s.tokens:
                    single_wd = {}
                    intermediate_result = []
                    if 'w' in show:
                        single_wd['w'] = tok.word
                    if 'l' in show:
                        from dictionaries.word_transforms import wordlist
                        if tok.lemma in list(wordlist.keys()):
                            lem = wordlist[tok.lemma]
                        else:
                            lem = tok.lemma
                        single_wd['l'] = lem
                    if 'p' in show:
                        single_wd['p'] = tok.pos

                    if 'pl' in show:
                        single_wd['pl'] = lk.pos
                        from dictionaries.word_transforms import taglemma
                        # NOTE(review): `postag` is never assigned in this
                        # loop, so this branch raises NameError -- it likely
                        # should read `lk.pos` (or `tok.pos`) instead
                        if postag.lower() in list(taglemma.keys()):
                            single_wd['pl'] = taglemma[postag.lower()]
                        else:
                            single_wd['pl'] = postag.lower()
                        # NOTE(review): `==` is a no-op comparison here;
                        # assignment (`=`) was presumably intended
                        if not single_wd['pl']:
                            single_wd['pl'] == 'none'

                    if 'r' in show:
                        all_lks = [l for l in deps.links]
                        distance = distancer(all_lks, tok)
                        if distance:
                            single_wd['r'] = str(distance)
                        else:
                            single_wd['r'] = '0'
                    if 'f' in show:
                        # function of the link where this token is dependent
                        for lin in deps.links:
                            single_wd['f'] = '.'
                            if tok.id == lin.dependent.idx:
                                single_wd['f'] = lin.type
                                break
                    if 'i' in show:
                        single_wd['i'] = str(tok.id)

                    # governor-based show values share one governor lookup
                    if any(x.startswith('g') for x in show):
                        thegovid = next((q.governor.idx for q in deps.links \
                                        if q.dependent.idx == tok.id), False)
                        govtok = False
                        if thegovid is not False:
                            govtok = s.get_token_by_id(thegovid)
                            
                        if 'g' in show:
                            if govtok:
                                single_wd['g'] = govtok.word
                            else:
                                single_wd['g'] = 'none'
                        if 'gl' in show:
                            if govtok:
                                single_wd['gl'] = govtok.lemma
                            else: 
                                single_wd['gl'] = 'none'
                        if 'gp' in show:
                            if govtok:
                                single_wd['gp'] = govtok.pos
                            else: 
                                single_wd['gp'] = 'none'

                        if 'gf' in show:
                            if govtok:
                                single_wd['gf'] = next(x.type for x in deps.links \
                                            if x.dependent.idx == thegovid)
                            else: 
                                single_wd['gf'] = 'none'

                    # dependent-based show values share one dependent lookup
                    if any(x.startswith('d') for x in show):
                        thedepid = next((q.dependent.idx for q in deps.links \
                                        if q.governor.idx == tok.id), False)

                        deptok = False
                        if thedepid is not False:
                            deptok = s.get_token_by_id(thedepid)

                        if 'd' in show:
                            if thedepid:
                                single_wd['d'] = deptok.word
                            else: 
                                single_wd['d'] = 'none'

                        if 'dl' in show:
                            if thedepid:
                                single_wd['dl'] = deptok.lemma
                            else: 
                                single_wd['dl'] = 'none'
                        if 'dp' in show:
                            if thedepid:
                                single_wd['dp'] = deptok.pos
                            else: 
                                single_wd['dp'] = 'none'
                        if 'df' in show:
                            if thedepid:
                                single_wd['df'] = next(x.type for x in deps.links \
                                if x.dependent.idx == thedepid)
                            else: 
                                single_wd['df'] = 'none'
                    # emit the requested fields in `show` order
                    for i in show:
                        intermediate_result.append(single_wd[i])
                    # NOTE(review): Python 2 idiom -- on Python 3 `.encode`
                    # yields bytes, so the str '/'.join below would fail
                    intermediate_result = [i.replace('/', '-slash-').encode('utf-8', errors = 'ignore') for i in intermediate_result]
                    one_result.append('/'.join(intermediate_result))
                # now we have formatted tokens as a list. we need to split
                # it into start, middle and end
                if not only_format_match:
                    start = ' '.join([w for index, w in enumerate(one_result) if index < windex])
                    end = ' '.join([w for index, w in enumerate(one_result) if index > windex])
                    middle = one_result[windex]
                else:
                    middle = one_result[0]

                for bit in start, middle, end:
                    conc_line.append(bit)
                conc_result.append(conc_line)

        # figure out what to show
        for lk in lks:
            single_result = {}
            if not lk:
                continue
            if 'w' in show:
                single_result['w'] = 'none'
                if lemmatise:
                    single_result['w'] = lk.lemma
                else:
                    single_result['w'] = lk.word
            if 'l' in show:
                # normalise the lemma through the project wordlist
                from dictionaries.word_transforms import wordlist
                if lk.lemma in list(wordlist.keys()):
                    lem = wordlist[lk.lemma]
                else:
                    lem = lk.lemma
                single_result['l'] = lem
            if 'p' in show:
                single_result['p'] = 'none'
                postag = lk.pos
                if lemmatise:
                    from dictionaries.word_transforms import taglemma
                    if postag.lower() in list(taglemma.keys()):
                        single_result['p'] = taglemma[postag.lower()]
                    else:
                        single_result['p'] = postag.lower()
                else:
                    single_result['p'] = postag
                # NOTE(review): `==` is a no-op comparison; `=` was
                # presumably intended
                if not single_result['p']:
                    single_result['p'] == 'none'

            if 'pl' in show:
                single_result['pl'] = 'none'
                postag = lk.pos
                from dictionaries.word_transforms import taglemma
                if postag.lower() in list(taglemma.keys()):
                    single_result['pl'] = taglemma[postag.lower()]
                else:
                    single_result['pl'] = postag.lower()
                # NOTE(review): same no-op `==` as above
                if not single_result['pl']:
                    single_result['pl'] == 'none'

            if 'f' in show:
                # grammatical function: link where the match is dependent
                single_result['f'] = 'none'
                for i in deps.links:
                    if i.dependent.idx == lk.id:
                        single_result['f'] = i.type.rstrip(',')
                        break
                if single_result['f'] == '':
                    single_result['f'] = 'root'

            if 'g' in show:
                # governor word (or lemma when lemmatise is on)
                single_result['g'] = 'none'
                for i in deps.links:
                    if i.dependent.idx == lk.id:
                        if s.get_token_by_id(i.governor.idx):
                            if lemmatise:                          
                                    single_result['g'] = s.get_token_by_id(i.governor.idx).lemma
                            else:
                                single_result['g'] = i.governor.text
                        else:
                            single_result['g'] = 'root'
                        break

            if 'd' in show:
                # dependent word (or lemma when lemmatise is on)
                single_result['d'] = 'none'
                for i in deps.links:
                    if i.governor.idx == lk.id:
                        if s.get_token_by_id(i.dependent.idx):       
                            if lemmatise:
                                single_result['d'] = s.get_token_by_id(i.dependent.idx).lemma
                            else:
                                single_result['d'] = i.dependent.text
                        break

            if 'gl' in show:
                # governor lemma
                single_result['gl'] = 'none'
                for i in deps.links:
                    if i.dependent.idx == lk.id:
                        if s.get_token_by_id(i.governor.idx):
                            single_result['gl'] = s.get_token_by_id(i.governor.idx).lemma
                        else:
                            single_result['gl'] = 'root'
                        break

            if 'dl' in show:
                # dependent lemma
                single_result['dl'] = 'none'
                for i in deps.links:
                    if i.governor.idx == lk.id:
                        if s.get_token_by_id(i.dependent.idx):       
                            single_result['dl'] = s.get_token_by_id(i.dependent.idx).lemma
                        break

            if 'gp' in show:
                # governor POS
                single_result['gp'] = 'none'
                for i in deps.links:
                    if i.dependent.idx == lk.id:
                        if s.get_token_by_id(i.governor.idx):       
                            single_result['gp'] = s.get_token_by_id(i.governor.idx).pos
                        break

            if 'dp' in show:
                # dependent POS
                single_result['dp'] = 'none'
                for i in deps.links:
                    if i.governor.idx == lk.id:
                        if s.get_token_by_id(i.dependent.idx):       
                            single_result['dp'] = s.get_token_by_id(i.dependent.idx).pos
                        break

            if 'df' in show:
                # function of the link where the match is governor
                single_result['df'] = 'none'
                for i in deps.links:
                    if i.governor.idx == lk.id:
                        single_result['df'] = i.type
                        break  

            if 'gf' in show:
                single_result['gf'] = 'none'
                for i in deps.links:
                    # if the result is the dependent, get the governor, find where
                    # it is a dependent, then gt the type
                    if i.dependent.idx == lk.id:
                        gv = next(x for x in deps.links if x.dependent.idx == i.governor.idx)
                        single_result['gf'] = gv.type
                        break                

            if 'r' in show:
                # distance (in links) from the match to the root
                all_lks = [l for l in deps.links]
                distance = distancer(all_lks, lk)
                if distance is not False and distance is not None:
                    single_result['r'] = str(distance)
                else:
                    single_result['r'] = '-1'

            if 'i' in show:
                single_result['i'] = str(lk.id)

            if 'c' not in show:
                
                # add them in order
                out = []
                for i in show:
                    out.append(single_result[i])

                out = [i.replace('/', '-slash-') for i in out]
                result.append('/'.join(out))
    
    # counting mode: collapse per-sentence counts into one total
    if 'c' in show:
        result = sum(result)

    # 'only' concordancing: suppress the regular results entirely
    if type(do_concordancing) == str and do_concordancing.lower() == 'only':
        result = []
    return result, conc_result
コード例 #3
0
ファイル: interrogator.py プロジェクト: nkhuyu/corpkit
    def get_stats(sents, **dummy_args):
        """get a bunch of frequencies on interpersonal phenomena

        Writes every sentence's parse tree to a temp file, tallies
        sentence/token/word/character counts and passive constructions from
        the dependency parse, then runs a battery of Tregex mood/process
        queries over the temp file and adds those counts too.

        :param sents: parsed sentence objects
        :param dummy_args: ignored; keeps the searcher signature uniform
        :returns: tuple of (Counter of feature counts, empty list)
        """
        # NOTE(review): depends on many names from the enclosing scope that
        # are not visible in this listing -- kwargs, dep_type, root,
        # total_files, p, par_args, denom, startnum, tregex_engine, animator,
        # get_deps, and the module-level `numdone` -- confirm all in scope.
        import os
        import re
        from collections import Counter
        statsmode_results = Counter()  
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        with open(to_open, "w") as fo:
            for sent in sents:
                statsmode_results['Sentences'] += 1
                sts = sent.parse_string.rstrip()
                # NOTE(review): Python 2 idiom -- on Python 3 `.encode`
                # returns bytes, so `+ '\n'` and writing to this text-mode
                # file both raise TypeError
                encd = sts.encode('utf-8', errors = 'ignore') + '\n'
                fo.write(encd)
                deps = get_deps(sent, dep_type)
                # passives are links whose type ends in 'pass' (e.g. nsubjpass)
                numpass = len([x for x in deps.links if x.type.endswith('pass')])
                statsmode_results['Passives'] += numpass
                statsmode_results['Tokens'] += len(sent.tokens)
                words = [w.word for w in sent.tokens if w.word.isalnum()]
                statsmode_results['Words'] += len(words)
                statsmode_results['Characters'] += len(''.join(words))

        # count moods via trees          (/\?/ !< __)
        from dictionaries.process_types import processes
        from other import as_regex
        tregex_qs = {'Imperative': r'ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/',
                     'Open interrogative': r'ROOT < SBARQ <<- (/\?/ !< __)', 
                     'Closed interrogative': r'ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))',
                     'Unmodalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))',
                     'Modalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))',
                     'Open class words': r'/^(NN|JJ|VB|RB)/ < __',
                     'Closed class words': r'__ !< __ !> /^(NN|JJ|VB|RB)/',
                     'Clauses': r'/^S/ < __',
                     'Interrogative': r'ROOT << (/\?/ !< __)',
                     'Mental processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.mental, boundaries = 'w'),
                     'Verbal processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.verbal, boundaries = 'w'),
                     'Relational processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.relational, boundaries = 'w')
                     }

        # run each Tregex query over the temp file, updating progress UI
        for name, q in sorted(tregex_qs.items()):
            res = tregex_engine(query = q, 
                  options = ['-o', '-C'], 
                  corpus = to_open,  
                  root = root)
            statsmode_results[name] += int(res)
            global numdone
            numdone += 1
            if root:
                root.update()
            else:
                tot_string = str(numdone + 1) + '/' + str(total_files)
                if kwargs.get('outname'):
                    tot_string = '%s: %s' % (kwargs['outname'], tot_string)
                animator(p, numdone, tot_string, **par_args)
            if kwargs.get('note', False):
                kwargs['note'].progvar.set((numdone * 100.0 / total_files / denom) + startnum)
        os.remove(to_open)
        return statsmode_results, []
コード例 #4
0
    def get_stats(sents, **dummy_args):
        """Count interpersonal/mood phenomena across parsed sentences.

        Writes each sentence's parse-tree string to a temp file, tallies
        token/word/character/passive counts from the dependency parse,
        then runs a battery of Tregex mood/process queries over the temp
        file, adding each query's hit count to a Counter.

        :param sents: iterable of CoreNLP sentence objects
        :returns: tuple of (Counter of feature counts, empty list)
        """
        import os
        import re
        from collections import Counter
        statsmode_results = Counter()
        # first, put the relevant trees into temp file
        if kwargs.get('outname'):
            to_open = 'tmp-%s.txt' % kwargs['outname']
        else:
            to_open = 'tmp.txt'
        # open in binary mode: the parse strings are encoded to bytes below,
        # and bytes + '\n' (str) raises TypeError on Python 3 -- append
        # b'\n' and write bytes so this works on both Python 2 and 3
        with open(to_open, "wb") as fo:
            for sent in sents:
                statsmode_results['Sentences'] += 1
                sts = sent.parse_string.rstrip()
                encd = sts.encode('utf-8', errors = 'ignore') + b'\n'
                fo.write(encd)
                deps = get_deps(sent, dep_type)
                # dependency types ending in 'pass' mark passive constructions
                numpass = len([x for x in deps.links if x.type.endswith('pass')])
                statsmode_results['Passives'] += numpass
                statsmode_results['Tokens'] += len(sent.tokens)
                words = [w.word for w in sent.tokens if w.word.isalnum()]
                statsmode_results['Words'] += len(words)
                statsmode_results['Characters'] += len(''.join(words))

        # count moods via trees          (/\?/ !< __)
        from dictionaries.process_types import processes
        from other import as_regex
        tregex_qs = {'Imperative': r'ROOT < (/(S|SBAR)/ < (VP !< VBD !< VBG !$ NP !$ SBAR < NP !$-- S !$-- VP !$ VP)) !<< (/\?/ !< __) !<<- /-R.B-/ !<<, /(?i)^(-l.b-|hi|hey|hello|oh|wow|thank|thankyou|thanks|welcome)$/',
                     'Open interrogative': r'ROOT < SBARQ <<- (/\?/ !< __)', 
                     'Closed interrogative': r'ROOT ( < (SQ < (NP $+ VP)) << (/\?/ !< __) | < (/(S|SBAR)/ < (VP $+ NP)) <<- (/\?/ !< __))',
                     'Unmodalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP !< MD)))',
                     'Modalised declarative': r'ROOT < (S < (/(NP|SBAR|VP)/ $+ (VP < MD)))',
                     'Open class words': r'/^(NN|JJ|VB|RB)/ < __',
                     'Closed class words': r'__ !< __ !> /^(NN|JJ|VB|RB)/',
                     'Clauses': r'/^S/ < __',
                     'Interrogative': r'ROOT << (/\?/ !< __)',
                     'Mental processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.mental, boundaries = 'w'),
                     'Verbal processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.verbal, boundaries = 'w'),
                     'Relational processes': r'VP > /^(S|ROOT)/ <+(VP) (VP <<# /%s/)' % as_regex(processes.relational, boundaries = 'w')
                     }

        # run each query over the temp file; -o -C makes Tregex emit a count
        for name, q in sorted(tregex_qs.items()):
            res = tregex_engine(query = q, 
                  options = ['-o', '-C'], 
                  corpus = to_open,  
                  root = root)
            statsmode_results[name] += int(res)
            global numdone
            numdone += 1
            # update GUI if present, else the CLI progress animation
            if root:
                root.update()
            else:
                tot_string = str(numdone + 1) + '/' + str(total_files)
                if kwargs.get('outname'):
                    tot_string = '%s: %s' % (kwargs['outname'], tot_string)
                animator(p, numdone, tot_string, **par_args)
            if kwargs.get('note', False):
                kwargs['note'].progvar.set((numdone * 100.0 / total_files / denom) + startnum)
        os.remove(to_open)
        return statsmode_results, []
コード例 #5
0
    def get_matches_from_sent(s,
                              search,
                              deps=False,
                              tokens=False,
                              dep_type='basic-dependencies',
                              mode='all'):
        """Process a sentence object, returning the tokens that match *search*.

        *search* maps option codes to patterns: 'w' word, 'l' lemma,
        'p' POS, 'pl' stemmed POS tag, 'i' token index, 'f' dependency
        function, 'g'/'d' governor/dependent word, 'gf'/'df' function of
        governor/dependent, 'gl'/'gp' governor lemma/POS, 'dl'/'dp'
        dependent lemma/POS, 'r' distance to root.  A pattern may be a
        regex string, a list (compiled to an alternation by filtermaker),
        or the literal 'any'.

        With mode='all', only tokens matching every criterion are kept.
        NOTE(review): *search* is mutated in place (dict patterns are
        flattened into new keys) -- callers appear to rely on this.
        """
        from process import get_deps
        import re
        lks = []
        if not deps:
            deps = get_deps(s, dep_type)
        if not tokens:
            tokens = s.tokens

        # flatten nested dict patterns, e.g. {'g': {'l': x}} -> {'gl': x}
        for opt, pat in list(search.items()):
            if type(pat) == dict:
                del search[opt]
                for k, v in list(pat.items()):
                    if k != 'w':
                        search[opt + k] = v
                    else:
                        search[opt] = v

        for opt, pat in list(search.items()):
            # normalise the pattern into something re.search accepts
            if pat == 'any':
                pat = re.compile(r'.*')
            elif type(pat) == list:
                if all(type(x) == int for x in pat):
                    pat = [str(x) for x in pat]
                pat = filtermaker(pat, case_sensitive=case_sensitive)
            else:
                if case_sensitive:
                    pat = re.compile(pat)
                else:
                    pat = re.compile(pat, re.IGNORECASE)
            if opt == 'g':
                # dependents whose governor's text matches
                got = []
                for l in deps.links:
                    if re.search(pat, l.governor.text):
                        got.append(s.get_token_by_id(l.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gf':
                # tokens governed by a token whose own function matches
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        gov_index = l.dependent.idx
                        for l2 in deps.links:
                            if l2.governor.idx == gov_index:
                                got.append(s.get_token_by_id(l2.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'df':
                # governors of links whose function matches
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        got.append(s.get_token_by_id(l.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gl':
                # dependents of tokens with a matching lemma
                got = []
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        for i in deps.links:
                            if i.governor.idx == tok.id:
                                got.append(s.get_token_by_id(i.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gp':
                # dependents of tokens with a matching POS
                got = []
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        for i in deps.links:
                            if i.governor.idx == tok.id:
                                got.append(s.get_token_by_id(i.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'dl':
                # governors of tokens with a matching lemma
                got = []
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        for i in deps.links:
                            if i.dependent.idx == tok.id:
                                got.append(s.get_token_by_id(i.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'dp':
                # governors of tokens with a matching POS
                got = []
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        for i in deps.links:
                            if i.dependent.idx == tok.id:
                                got.append(s.get_token_by_id(i.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)

            elif opt == 'd':
                # governors whose dependent's text matches
                got = []
                for l in deps.links:
                    if re.search(pat, l.dependent.text):
                        got.append(s.get_token_by_id(l.governor.idx))

                got = set(got)
                for i in got:
                    lks.append(i)

            elif opt == 'f':
                # dependents of links whose function matches
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        got.append(s.get_token_by_id(l.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'p':
                # match on POS tag
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        lks.append(tok)
            elif opt == 'pl':
                # match on the stemmed ('lemmatised') POS tag
                for tok in tokens:
                    from dictionaries.word_transforms import taglemma
                    postag = tok.pos
                    if postag.lower() in list(taglemma.keys()):
                        stemmedtag = taglemma[postag.lower()]
                    else:
                        stemmedtag = postag.lower()
                    if re.search(pat, stemmedtag):
                        lks.append(tok)
            elif opt == 'l':
                # match on lemma
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        lks.append(tok)
            elif opt == 'w':
                # match on word form
                for tok in tokens:
                    if re.search(pat, tok.word):
                        lks.append(tok)
            elif opt == 'i':
                # match on token index within the sentence
                for tok in tokens:
                    if re.search(pat, str(tok.id)):
                        lks.append(tok)
            elif opt == 'r':
                # match on number of dependency hops to the root
                got = []
                for tok in tokens:
                    dist = distancer(deps.links, tok)
                    if dist is not None and dist is not False:
                        try:
                            # numeric pattern: exact distance match
                            if int(dist) == int(pat):
                                lks.append(tok)

                        except TypeError:
                            # regex pattern: match against the distance string
                            if re.search(pat, str(dist)):
                                lks.append(tok)

        if mode == 'all':
            # keep only tokens that satisfied every search criterion
            from collections import Counter
            counted = Counter(lks)
            lks = [
                k for k, v in counted.items() if v >= len(list(search.keys()))
            ]
        return lks
コード例 #6
0
def dep_searcher(sents,
                 search,
                 show='w',
                 dep_type='collapsed-ccprocessed-dependencies',
                 regex_nonword_filter=r'[A-Za-z0-9:_]',
                 do_concordancing=False,
                 exclude=False,
                 excludemode='any',
                 searchmode='all',
                 lemmatise=False,
                 case_sensitive=False,
                 progbar=False,
                 only_format_match=False,
                 speaker=False):
    import re
    from corenlp_xml.document import Document
    from collections import Counter
    from build import flatten_treestring
    from process import filtermaker, animator, get_deps
    """
    search corenlp dependency parse
    1. search for 'search' keyword arg
       governor
       dependent
       function
       pos
       lemma
       word
       index
       etc

    2. exclude entries if need be, using same method as search

    3. return '/'-sep list of 'show' keyword arg, or conc lines:
       governor
       dependent
       function
       pos
       lemma
       word
       index
       distance
       etc
       
       ... or just return int count.
       """
    def distancer(lks, lk):
        """Count dependency-link hops from token *lk* up to the root.

        Returns 0 when the token has no incoming link, the hop count
        otherwise, and None (implicitly) for chains of 30+ hops, which
        are treated as runaway/cyclic and abandoned.
        """
        hops = 0
        # find the link in which this token is the dependent
        link = next((l for l in lks if l.dependent.idx == lk.id), None)
        if link is None:
            return 0
        while True:
            head_idx = link.governor.idx
            if head_idx == 0:
                # reached the root
                break
            if hops > 29:
                # give up on implausibly deep (or cyclic) chains
                break
            candidates = [l for l in lks if l.dependent.idx == head_idx]
            if not candidates:
                break
            link = candidates[0]
            hops += 1
        if hops < 30:
            return hops

    def get_matches_from_sent(s,
                              search,
                              deps=False,
                              tokens=False,
                              dep_type='basic-dependencies',
                              mode='all'):
        """Process a sentence object, returning the tokens that match *search*.

        *search* maps option codes to patterns: 'w' word, 'l' lemma,
        'p' POS, 'pl' stemmed POS tag, 'i' token index, 'f' dependency
        function, 'g'/'d' governor/dependent word, 'gf'/'df' function of
        governor/dependent, 'gl'/'gp' governor lemma/POS, 'dl'/'dp'
        dependent lemma/POS, 'r' distance to root.  A pattern may be a
        regex string, a list (compiled to an alternation by filtermaker),
        or the literal 'any'.

        With mode='all', only tokens matching every criterion are kept.
        NOTE(review): *search* is mutated in place (dict patterns are
        flattened into new keys) -- callers appear to rely on this.
        """
        from process import get_deps
        import re
        lks = []
        if not deps:
            deps = get_deps(s, dep_type)
        if not tokens:
            tokens = s.tokens

        # flatten nested dict patterns, e.g. {'g': {'l': x}} -> {'gl': x}
        for opt, pat in list(search.items()):
            if type(pat) == dict:
                del search[opt]
                for k, v in list(pat.items()):
                    if k != 'w':
                        search[opt + k] = v
                    else:
                        search[opt] = v

        for opt, pat in list(search.items()):
            # normalise the pattern into something re.search accepts
            if pat == 'any':
                pat = re.compile(r'.*')
            elif type(pat) == list:
                if all(type(x) == int for x in pat):
                    pat = [str(x) for x in pat]
                pat = filtermaker(pat, case_sensitive=case_sensitive)
            else:
                if case_sensitive:
                    pat = re.compile(pat)
                else:
                    pat = re.compile(pat, re.IGNORECASE)
            if opt == 'g':
                # dependents whose governor's text matches
                got = []
                for l in deps.links:
                    if re.search(pat, l.governor.text):
                        got.append(s.get_token_by_id(l.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gf':
                # tokens governed by a token whose own function matches
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        gov_index = l.dependent.idx
                        for l2 in deps.links:
                            if l2.governor.idx == gov_index:
                                got.append(s.get_token_by_id(l2.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'df':
                # governors of links whose function matches
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        got.append(s.get_token_by_id(l.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gl':
                # dependents of tokens with a matching lemma
                got = []
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        for i in deps.links:
                            if i.governor.idx == tok.id:
                                got.append(s.get_token_by_id(i.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'gp':
                # dependents of tokens with a matching POS
                got = []
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        for i in deps.links:
                            if i.governor.idx == tok.id:
                                got.append(s.get_token_by_id(i.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'dl':
                # governors of tokens with a matching lemma
                got = []
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        for i in deps.links:
                            if i.dependent.idx == tok.id:
                                got.append(s.get_token_by_id(i.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'dp':
                # governors of tokens with a matching POS
                got = []
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        for i in deps.links:
                            if i.dependent.idx == tok.id:
                                got.append(s.get_token_by_id(i.governor.idx))
                got = set(got)
                for i in got:
                    lks.append(i)

            elif opt == 'd':
                # governors whose dependent's text matches
                got = []
                for l in deps.links:
                    if re.search(pat, l.dependent.text):
                        got.append(s.get_token_by_id(l.governor.idx))

                got = set(got)
                for i in got:
                    lks.append(i)

            elif opt == 'f':
                # dependents of links whose function matches
                got = []
                for l in deps.links:
                    if re.search(pat, l.type):
                        got.append(s.get_token_by_id(l.dependent.idx))
                got = set(got)
                for i in got:
                    lks.append(i)
            elif opt == 'p':
                # match on POS tag
                for tok in tokens:
                    if re.search(pat, tok.pos):
                        lks.append(tok)
            elif opt == 'pl':
                # match on the stemmed ('lemmatised') POS tag
                for tok in tokens:
                    from dictionaries.word_transforms import taglemma
                    postag = tok.pos
                    if postag.lower() in list(taglemma.keys()):
                        stemmedtag = taglemma[postag.lower()]
                    else:
                        stemmedtag = postag.lower()
                    if re.search(pat, stemmedtag):
                        lks.append(tok)
            elif opt == 'l':
                # match on lemma
                for tok in tokens:
                    if re.search(pat, tok.lemma):
                        lks.append(tok)
            elif opt == 'w':
                # match on word form
                for tok in tokens:
                    if re.search(pat, tok.word):
                        lks.append(tok)
            elif opt == 'i':
                # match on token index within the sentence
                for tok in tokens:
                    if re.search(pat, str(tok.id)):
                        lks.append(tok)
            elif opt == 'r':
                # match on number of dependency hops to the root
                got = []
                for tok in tokens:
                    dist = distancer(deps.links, tok)
                    if dist is not None and dist is not False:
                        try:
                            # numeric pattern: exact distance match
                            if int(dist) == int(pat):
                                lks.append(tok)

                        except TypeError:
                            # regex pattern: match against the distance string
                            if re.search(pat, str(dist)):
                                lks.append(tok)

        if mode == 'all':
            # keep only tokens that satisfied every search criterion
            from collections import Counter
            counted = Counter(lks)
            lks = [
                k for k, v in counted.items() if v >= len(list(search.keys()))
            ]
        return lks

    result = []
    conc_result = []
    numdone = 0

    for s in sents:
        numdone += 1
        deps = get_deps(s, dep_type)
        tokens = s.tokens
        lks = get_matches_from_sent(s,
                                    search,
                                    deps,
                                    tokens,
                                    dep_type,
                                    mode=searchmode)

        #if not concordancing:
        #    lks = list(set([x for x in lks if x and re.search(regex_nonword_filter, x.word)]))

        if exclude is not False:
            to_remove = get_matches_from_sent(s,
                                              exclude,
                                              deps,
                                              tokens,
                                              dep_type,
                                              mode=excludemode)

            for i in to_remove:
                try:
                    lks.remove(i)
                except ValueError:
                    pass

        if progbar:
            tstr = '%d/%d' % (numdone, len(sents))
            animator(progbar, numdone, tstr)

        if 'c' in show:
            result.append(len(lks))
            continue

        if do_concordancing:
            for lk in lks:  # for each concordance middle part
                one_result = []
                if not lk:
                    continue
                # get the index of the match
                windex = int(lk.id) - 1
                speakr = s.speakername
                if not speakr:
                    speakr = ''
                # begin building line with speaker first
                conc_line = [speakr]
                # format a single word correctly
                if only_format_match:
                    start = ' '.join([
                        t.word for index, t in enumerate(s.tokens)
                        if index < windex
                    ])
                    end = ' '.join([
                        t.word for index, t in enumerate(s.tokens)
                        if index > windex
                    ])
                    s.tokens = [s.get_token_by_id(lk.id)]
                for tok in s.tokens:
                    single_wd = {}
                    intermediate_result = []
                    if 'w' in show:
                        single_wd['w'] = tok.word
                    if 'l' in show:
                        from dictionaries.word_transforms import wordlist
                        if tok.lemma in list(wordlist.keys()):
                            lem = wordlist[tok.lemma]
                        else:
                            lem = tok.lemma
                        single_wd['l'] = lem
                    if 'p' in show:
                        single_wd['p'] = tok.pos

                    if 'pl' in show:
                        single_wd['pl'] = lk.pos
                        from dictionaries.word_transforms import taglemma
                        if postag.lower() in list(taglemma.keys()):
                            single_wd['pl'] = taglemma[postag.lower()]
                        else:
                            single_wd['pl'] = postag.lower()
                        if not single_wd['pl']:
                            single_wd['pl'] == 'none'

                    if 'r' in show:
                        all_lks = [l for l in deps.links]
                        distance = distancer(all_lks, tok)
                        if distance:
                            single_wd['r'] = str(distance)
                        else:
                            single_wd['r'] = '0'
                    if 'f' in show:
                        for lin in deps.links:
                            single_wd['f'] = '.'
                            if tok.id == lin.dependent.idx:
                                single_wd['f'] = lin.type
                                break
                    if 'i' in show:
                        single_wd['i'] = str(tok.id)

                    if any(x.startswith('g') for x in show):
                        thegovid = next((q.governor.idx for q in deps.links \
                                        if q.dependent.idx == tok.id), False)
                        govtok = False
                        if thegovid is not False:
                            govtok = s.get_token_by_id(thegovid)

                        if 'g' in show:
                            if govtok:
                                single_wd['g'] = govtok.word
                            else:
                                single_wd['g'] = 'none'
                        if 'gl' in show:
                            if govtok:
                                single_wd['gl'] = govtok.lemma
                            else:
                                single_wd['gl'] = 'none'
                        if 'gp' in show:
                            if govtok:
                                single_wd['gp'] = govtok.pos
                            else:
                                single_wd['gp'] = 'none'

                        if 'gf' in show:
                            if govtok:
                                single_wd['gf'] = next(x.type for x in deps.links \
                                            if x.dependent.idx == thegovid)
                            else:
                                single_wd['gf'] = 'none'

                    if any(x.startswith('d') for x in show):
                        thedepid = next((q.dependent.idx for q in deps.links \
                                        if q.governor.idx == tok.id), False)

                        deptok = False
                        if thedepid is not False:
                            deptok = s.get_token_by_id(thedepid)

                        if 'd' in show:
                            if thedepid:
                                single_wd['d'] = deptok.word
                            else:
                                single_wd['d'] = 'none'

                        if 'dl' in show:
                            if thedepid:
                                single_wd['dl'] = deptok.lemma
                            else:
                                single_wd['dl'] = 'none'
                        if 'dp' in show:
                            if thedepid:
                                single_wd['dp'] = deptok.pos
                            else:
                                single_wd['dp'] = 'none'
                        if 'df' in show:
                            if thedepid:
                                single_wd['df'] = next(x.type for x in deps.links \
                                if x.dependent.idx == thedepid)
                            else:
                                single_wd['df'] = 'none'
                    for i in show:
                        intermediate_result.append(single_wd[i])
                    intermediate_result = [
                        i.replace('/', '-slash-').encode('utf-8',
                                                         errors='ignore')
                        for i in intermediate_result
                    ]
                    one_result.append('/'.join(intermediate_result))
                # now we have formatted tokens as a list. we need to split
                # it into start, middle and end
                if not only_format_match:
                    start = ' '.join([
                        w for index, w in enumerate(one_result)
                        if index < windex
                    ])
                    end = ' '.join([
                        w for index, w in enumerate(one_result)
                        if index > windex
                    ])
                    middle = one_result[windex]
                else:
                    middle = one_result[0]

                for bit in start, middle, end:
                    conc_line.append(bit)
                conc_result.append(conc_line)

        # figure out what to show
        for lk in lks:
            single_result = {}
            if not lk:
                continue
            if 'w' in show:
                single_result['w'] = 'none'
                if lemmatise:
                    single_result['w'] = lk.lemma
                else:
                    single_result['w'] = lk.word
            if 'l' in show:
                from dictionaries.word_transforms import wordlist
                if lk.lemma in list(wordlist.keys()):
                    lem = wordlist[lk.lemma]
                else:
                    lem = lk.lemma
                single_result['l'] = lem
            if 'p' in show:
                single_result['p'] = 'none'
                postag = lk.pos
                if lemmatise:
                    from dictionaries.word_transforms import taglemma
                    if postag.lower() in list(taglemma.keys()):
                        single_result['p'] = taglemma[postag.lower()]
                    else:
                        single_result['p'] = postag.lower()
                else:
                    single_result['p'] = postag
                if not single_result['p']:
                    single_result['p'] == 'none'

            if 'pl' in show:
                single_result['pl'] = 'none'
                postag = lk.pos
                from dictionaries.word_transforms import taglemma
                if postag.lower() in list(taglemma.keys()):
                    single_result['pl'] = taglemma[postag.lower()]
                else:
                    single_result['pl'] = postag.lower()
                if not single_result['pl']:
                    single_result['pl'] == 'none'

            if 'f' in show:
                single_result['f'] = 'none'
                for i in deps.links:
                    if i.dependent.idx == lk.id:
                        single_result['f'] = i.type.rstrip(',')
                        break
                if single_result['f'] == '':
                    single_result['f'] = 'root'

            if 'g' in show:
                single_result['g'] = 'none'
                for i in deps.links:
                    if i.dependent.idx == lk.id:
                        if s.get_token_by_id(i.governor.idx):
                            if lemmatise:
                                single_result['g'] = s.get_token_by_id(
                                    i.governor.idx).lemma
                            else:
                                single_result['g'] = i.governor.text
                        else:
                            single_result['g'] = 'root'
                        break

            if 'd' in show:
                single_result['d'] = 'none'
                for i in deps.links:
                    if i.governor.idx == lk.id:
                        if s.get_token_by_id(i.dependent.idx):
                            if lemmatise:
                                single_result['d'] = s.get_token_by_id(
                                    i.dependent.idx).lemma
                            else:
                                single_result['d'] = i.dependent.text
                        break

            if 'gl' in show:
                single_result['gl'] = 'none'
                for i in deps.links:
                    if i.dependent.idx == lk.id:
                        if s.get_token_by_id(i.governor.idx):
                            single_result['gl'] = s.get_token_by_id(
                                i.governor.idx).lemma
                        else:
                            single_result['gl'] = 'root'
                        break

            if 'dl' in show:
                single_result['dl'] = 'none'
                for i in deps.links:
                    if i.governor.idx == lk.id:
                        if s.get_token_by_id(i.dependent.idx):
                            single_result['dl'] = s.get_token_by_id(
                                i.dependent.idx).lemma
                        break

            if 'gp' in show:
                single_result['gp'] = 'none'
                for i in deps.links:
                    if i.dependent.idx == lk.id:
                        if s.get_token_by_id(i.governor.idx):
                            single_result['gp'] = s.get_token_by_id(
                                i.governor.idx).pos
                        break

            if 'dp' in show:
                single_result['dp'] = 'none'
                for i in deps.links:
                    if i.governor.idx == lk.id:
                        if s.get_token_by_id(i.dependent.idx):
                            single_result['dp'] = s.get_token_by_id(
                                i.dependent.idx).pos
                        break

            if 'df' in show:
                single_result['df'] = 'none'
                for i in deps.links:
                    if i.governor.idx == lk.id:
                        single_result['df'] = i.type
                        break

            if 'gf' in show:
                single_result['gf'] = 'none'
                for i in deps.links:
                    # if the result is the dependent, get the governor, find where
                    # it is a dependent, then gt the type
                    if i.dependent.idx == lk.id:
                        gv = next(x for x in deps.links
                                  if x.dependent.idx == i.governor.idx)
                        single_result['gf'] = gv.type
                        break

            if 'r' in show:
                all_lks = [l for l in deps.links]
                distance = distancer(all_lks, lk)
                if distance is not False and distance is not None:
                    single_result['r'] = str(distance)
                else:
                    single_result['r'] = '-1'

            if 'i' in show:
                single_result['i'] = str(lk.id)

            if 'c' not in show:

                # add them in order
                out = []
                for i in show:
                    out.append(single_result[i])

                out = [i.replace('/', '-slash-') for i in out]
                result.append('/'.join(out))

    if 'c' in show:
        result = sum(result)

    if type(do_concordancing) == str and do_concordancing.lower() == 'only':
        result = []
    return result, conc_result