Example #1
def searchtree(tree, query, options = ['-t', '-o']):
    "Searches a tree with Tregex and returns matching terminals"
    import os
    from corpkit.other import tregex_engine
    from corpkit.tests import check_dit
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False
    fo = open('tree.tmp',"w")
    fo.write(tree + '\n')
    fo.close()
    result = tregex_engine(query = query, check_query = True)
    result = tregex_engine(query = query, options = options, corpus = "tree.tmp")
    os.remove("tree.tmp")
    return result
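A minimal usage sketch, assuming corpkit and its Tregex backend are installed; the bracketed tree string and the query below are illustrative only:

# a single bracketed parse tree and a Tregex pattern for noun terminals
tree = '(ROOT (S (NP (DT The) (NN dog)) (VP (VBD barked))))'
terminals = searchtree(tree, r'/NN.?/ >> NP')
print terminals
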
Example #2
def make_nltk_text(directory, 
                   collapse_dirs = True, 
                   tagged = False, 
                   lemmatise = False, 
                   just_content_words = False):
    """turn a lot of trees into an nltk style text"""
    import nltk
    import os
    from corpkit.other import tregex_engine
    if type(directory) == str:
        dirs = [os.path.join(directory, d) for d in os.listdir(directory) if os.path.isdir(os.path.join(directory, d))]
        if len(dirs) == 0:
            dirs = [directory]
    elif type(directory) == list:
        dirs = directory

    return_tuples = False
    if tagged:
        return_tuples = True

    if just_content_words:
        lemmatise = True

    query = r'__ < (/.?[A-Za-z0-9].?/ !< __)'
    if not return_tuples and not lemmatise:
        options = ['-o', '-t']
    else:
        options = ['-o']

    # filthy code.
    all_out = []

    for d in dirs:
        print "Flattening %s ... " % str(d)
        res = tregex_engine(corpus = d, 
                            query = query, 
                            options = options,
                            lemmatise = lemmatise,
                            just_content_words = just_content_words,
                            return_tuples = return_tuples)
        all_out.append(res)

    if collapse_dirs:
        tmp = []
        for res in all_out:
            for w in res:
                tmp.append(w)
        all_out = tmp
        textx = nltk.Text(all_out)
    else:
        textx = {}
        for name, text in zip(dirs, all_out):
            t = nltk.Text(text)
            textx[os.path.basename(name)] = t
    return textx
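A hedged usage sketch; the corpus path below is a hypothetical directory of parsed subcorpora, and NLTK plus corpkit's tregex_engine are assumed to be available:

# hypothetical path; with collapse_dirs = True an nltk.Text is returned
text = make_nltk_text('data/corpus-parsed', collapse_dirs = True)
text.concordance('risk')
text.collocations()
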
Example #3
def make_nltk_text(directory):
    """turn a lot of trees into an nltk style text"""
    import nltk
    import os
    from corpkit.other import tregex_engine
    if type(directory) == str:
        dirs = [os.path.join(directory, d) for d in os.listdir(directory) if os.path.isdir(os.path.join(directory, d))]
        if len(dirs) == 0:
            dirs = [directory]
    elif type(directory) == list:
        dirs = directory
    out = []
    for d in dirs:
        print d
        res = tregex_engine(corpus = d, query = 'ROOT < __', options = ['-w', '-t'])
        for r in res:
            out.append(r)
    print 'Tokenising ...'
    as_string = '\n'.join(out)
    as_list_of_tokens = nltk.word_tokenize(as_string)
    text = nltk.Text(as_list_of_tokens)
    return text
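As above, a sketch with a hypothetical corpus path; this simpler version flattens every tree, tokenises with NLTK and returns a single Text:

text = make_nltk_text('data/corpus-parsed')
text.similar('risk')
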
Example #4
def conc(corpus, query, 
        n = 100, 
        random = False, 
        window = 40, 
        trees = False,
        plaintext = 'guess',
        add_links = False,
        show_links = False): 
    """A concordancer for Tregex queries over trees or regexes over plain text"""
    import os
    import re
    import pandas as pd
    from pandas import DataFrame
    from time import localtime, strftime
    
    try:
        from IPython.display import display, clear_output, HTML
    except ImportError:
        pass
    from corpkit.other import tregex_engine
    from corpkit.tests import check_pytex, check_dit
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False
    
    # convert list to query
    if type(query) == list:
        from corpkit.other import as_regex
        query = r'/%s/ !< __' % as_regex(query, boundaries = 'line')
    
    # lazy, daniel!
    if window == 'all':
        window = 9999

    # check query
    good_tregex_query = tregex_engine(query, check_query = True)
    if good_tregex_query is False:
        return

    # make sure there's a corpus
    if not os.path.exists(corpus):
        raise ValueError('Corpus file or folder not found: %s' % corpus)

    # welcome message
    time = strftime("%H:%M:%S", localtime())
    print "\n%s: Getting concordances for %s ... \n          Query: %s\n" % (time, corpus, query)
    output = []

    if plaintext == 'guess':
        if not tregex_engine(corpus = corpus, check_for_trees = True):
            plaintext = True
        else:
            plaintext = False

    if trees:
        options = '-s'
    else:
        options = '-t'
    if not plaintext:
        whole_results = tregex_engine(query, 
                                  options = ['-o', '-w', options], 
                                  corpus = corpus)
        middle_column_result = tregex_engine(query, 
                                  options = ['-o', options], 
                                  corpus = corpus)
    
    if plaintext:
        import nltk
        sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
        whole_results = []
        middle_column_result = []
        small_regex = re.compile(query)
        big_regex = re.compile(r'.*' + query + r'.*')
        fs = [os.path.join(corpus, f) for f in os.listdir(corpus)]
        # do recursive if need
        if any(os.path.isdir(f) for f in fs):
            recursive_files = []
            for dirname, dirnames, filenames in os.walk(corpus):
                for filename in filenames:
                    recursive_files.append(os.path.join(dirname, filename))
            fs = recursive_files
        for f in fs:
            raw = open(f).read().replace('\n', ' ')
            # encoding ... ?
            sents = sent_tokenizer.tokenize(raw)
            for sent in sents:
                try:
                    for match in re.findall(small_regex, sent):
                        middle_column_result.append(match)
                        whole_results.append(sent)
                except:
                    continue

    try:
        # get longest middle column result, or discover no results and raise error
        maximum = len(max(middle_column_result, key=len))
    except ValueError:
        time = strftime("%H:%M:%S", localtime())
        print "\n%s: No matches found." % time
        return

    zipped = zip(whole_results, middle_column_result)
    unique_results = []

    for whole_result, middle_part in zipped:
        if not trees:
            regex = re.compile(r"(\b[^\s]{0,1}.{," + re.escape(str(window)) + r"})(\b" + re.escape(middle_part) + r"\b)(.{," + re.escape(str(window)) + r"}[^\s]\b)")
        else:
            regex = re.compile(r"(.{,%s})(%s)(.{,%s})" % (window, re.escape(middle_part), window ))
        search = re.findall(regex, whole_result)
        for result in search:
            unique_results.append(result)
    unique_results = sorted(set(unique_results)) # make unique
    
    #make into series
    series = []

    lname = ' ' * (window/2-1) + 'l'
    # centering middle column
    #mname = ' ' * (maximum/2+1) + 'm'
    mname = ' ' * (maximum/2-1) + 'm'
    rname = ' ' * (window/2-1) + 'r'
    for start, word, end in unique_results:
        #spaces = ' ' * (maximum / 2 - (len(word) / 2))
        #new_word = spaces + word + spaces
        series.append(pd.Series([start.encode('utf-8', errors = 'ignore'), word.encode('utf-8', errors = 'ignore'), end.encode('utf-8', errors = 'ignore')], index = [lname.encode('utf-8', errors = 'ignore'), mname.encode('utf-8', errors = 'ignore'), rname.encode('utf-8', errors = 'ignore')]))

    # randomise results...
    if random:
        from random import shuffle
        shuffle(series)

    try:
        df = pd.concat(series, axis = 1).T
    except ValueError:
        raise ValueError("No results found, I'm afraid. Check your query and path.")

    if add_links:

        def _add_links(lines, links = False, show = 'thread'):
            link = "http://www.healthboards.com/boards/bipolar-disorder/695089-labels.html"
            linktext = '<a href="%s">link</a>' % link
            import pandas as pd
            inds = list(df.index)
            num_objects = len(list(df.index))
            ser = pd.Series([link for n in range(num_objects)], index = inds)
            lines['link'] = ser
            return lines
        
        df = _add_links(df)
        
    # make temporary
    pd.set_option('display.max_columns', 500)
    pd.set_option('max_colwidth',window * 2)
    pd.set_option('display.width', 1000)
    pd.set_option('expand_frame_repr', False)
    pd.set_option('colheader_justify', 'left')
    if add_links:
        if not show_links:
            print df.drop('link', axis = 1).head(n).to_string(header = False, formatters={rname:'{{:<{}s}}'.format(df[rname].str.len().max()).format})
        else:
            print HTML(df.to_html(escape=False))
    else:
        print df.head(n).to_string(header = False, formatters={rname:'{{:<{}s}}'.format(df[rname].str.len().max()).format})

    if not add_links:
        df.columns = ['l', 'm', 'r']
    else:
        df.columns = ['l', 'm', 'r', 'link']
    return df

# r'/NN.?/ < /(?i)\brisk/ $ (/NN.?/ < /(?i)factor/ >># NP)' 
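A usage sketch for this concordancer, assuming a hypothetical parsed corpus; the Tregex pattern is illustrative:

lines = conc('data/corpus-parsed/2013', r'/NN.?/ < /(?i)\brisk/', n = 50, window = 30, random = True)
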
Example #5
def dictmaker(path, 
              dictname,
              query = 'any',
              dictpath = 'data/dictionaries',
              lemmatise = False,
              just_content_words = False,
              use_dependencies = False):
    """makes a pickle wordlist named dictname in dictpath"""
    import os
    import pickle
    import re
    import nltk
    from time import localtime, strftime
    from StringIO import StringIO
    import shutil
    from collections import Counter
    from corpkit.progressbar import ProgressBar
    from corpkit.other import tregex_engine
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False
    
    if lemmatise:
        dictname = dictname + '-lemmatised'
    if not dictname.endswith('.p'):
        dictname = dictname + '.p'

    # allow direct passing of dirs
    path_is_list = False
    one_big_corpus = False
    if type(path) == str:
        sorted_dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path,d))]
    # if no subcorpora, just do the dir passed in
        if len(sorted_dirs) == 0:
            one_big_corpus = True
            sorted_dirs = [path]
    elif type(path) == list:
        path_is_list = True
        sorted_dirs = sorted(path)
        if type(sorted_dirs[0]) == int:
            sorted_dirs = [str(d) for d in sorted_dirs]

    try:
        sorted_dirs.sort(key=int)
    except:
        pass
    try:
        if not os.path.exists(dictpath):
            os.makedirs(dictpath)
    except IOError:
        print "Error making " + dictpath + "/ directory."
    while os.path.isfile(os.path.join(dictpath, dictname)):
        time = strftime("%H:%M:%S", localtime())
        selection = raw_input('\n%s: %s already exists in %s.\n' \
               '          You have the following options:\n\n' \
               '              a) save with a new name\n' \
               '              b) delete %s\n' \
               '              c) exit\n\nYour selection: ' % (time, dictname, dictpath, os.path.join(dictpath, dictname)))
        if 'a' in selection:
            sel = raw_input('\nNew save name: ')
            dictname = sel
            if lemmatise:
                dictname = dictname.replace('-lemmatised.p', '')
                dictname = dictname + '-lemmatised'
            if not dictname.endswith('.p'):
                dictname = dictname + '.p'
        elif 'b' in selection:
            os.remove(os.path.join(dictpath, dictname))
        elif 'c' in selection:
            print ''
            return
        else:
            as_str = str(selection)
            print '          Choice "%s" not recognised.' % selection

    time = strftime("%H:%M:%S", localtime())
    print '\n%s: Extracting words from files ... \n' % time

    # all this just to get a list of files and make a better progress bar
    if use_dependencies:
        counts = []
        for d in sorted_dirs:
            if not one_big_corpus:
                subcorpus = os.path.join(path, d)
            else:
                subcorpus = path
            if use_dependencies:
                files = [f for f in os.listdir(subcorpus) if f.endswith('.xml')]
            else:
                files = [f for f in os.listdir(subcorpus)]
            counts.append(len(files))
        num_files = sum(counts)
        c = 0
        p = ProgressBar(num_files)
    else:
        p = ProgressBar(len(sorted_dirs))

    def tokener(xmldata):
        """print word, using good lemmatisation"""
        from bs4 import BeautifulSoup
        import gc
        open_classes = ['N', 'V', 'R', 'J']
        result = []
        just_good_deps = SoupStrainer('tokens')
        soup = BeautifulSoup(xmldata, parse_only=just_good_deps)   
        for token in soup.find_all('token'):
            word = token.word.text
            query = re.compile(r'.*')
            if re.search(query, word):
                if lemmatise:
                    word = token.lemma.text
                    if just_content_words:
                        if not token.pos.text[0] in open_classes:
                            continue        
                result.append(word)
        # attempt to stop memory problems. 
        # not sure if this helps, though:
        soup.decompose()
        soup = None
        data = None
        gc.collect()
        return result
    
    # translate 'any' query
    if query == 'any':
        if lemmatise:
            query = r'__ <# (__ !< __)'
        else:
            query = r'__ !< __'
    
    if lemmatise:
        options = ['-o']
    else:
        options = ['-t', '-o']
    
    if use_dependencies:
        from bs4 import BeautifulSoup, SoupStrainer
        if query == 'any':
            query = r'.*'
        query = re.compile(query)

    allwords = []

    for index, d in enumerate(sorted_dirs):
        if not use_dependencies:
            p.animate(index)
        if not path_is_list:
            if len(sorted_dirs) == 1:
                subcorp = d
            else:
                subcorp = os.path.join(path, d)
        else:
            subcorp = d

        # check query first time through    
        if not use_dependencies:
            if index == 0:
                trees_found = tregex_engine(corpus = subcorp, check_for_trees = True)
                if not trees_found:
                    lemmatise = False
                    dictname = dictname.replace('-lemmatised', '')
            if trees_found:
                results = tregex_engine(corpus = subcorp, options = options, query = query, 
                                        lemmatise = lemmatise,
                                        just_content_words = just_content_words)

                for result in results:
                    allwords.append(result)  

        elif use_dependencies:
            regex_nonword_filter = re.compile("[A-Za-z]")
            results = []
            fs = [os.path.join(subcorp, f) for f in os.listdir(subcorp)]
            for f in fs:
                p.animate(c, str(c) + '/' + str(num_files))
                c += 1
                data = open(f).read()
                result_from_a_file = tokener(data)
                for w in result_from_a_file:
                    if re.search(regex_nonword_filter, w):
                        allwords.append(w.lower())

        if not use_dependencies:
            if not trees_found:
                for f in os.listdir(subcorp):
                    raw = unicode(open(os.path.join(subcorp, f)).read(), 'utf-8', errors = 'ignore')
                    sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
                    sents = sent_tokenizer.tokenize(raw)
                    tokenized_sents = [nltk.word_tokenize(i) for i in sents]
                    for sent in tokenized_sents:
                        for w in sent:
                            allwords.append(w.lower()) 

    #100%
    p.animate(len(sorted_dirs))
    
    # make a dict
    dictionary = Counter(allwords)

    with open(os.path.join(dictpath, dictname), 'wb') as handle:
        pickle.dump(dictionary, handle)
    time = strftime("%H:%M:%S", localtime())
    print '\n\n' + time + ': Done! ' + dictname + ' created in ' + dictpath + '/'
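A hedged sketch of building and then reloading a wordlist; the paths and the dictionary name are hypothetical:

# build a lemmatised wordlist from a parsed corpus
dictmaker('data/corpus-parsed', 'mycorpus', query = 'any', lemmatise = True)

# reload the pickled Counter it creates
import pickle
wordcounts = pickle.load(open('data/dictionaries/mycorpus-lemmatised.p', 'rb'))
print wordcounts.most_common(10)
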
Example #6
def datareader(data, plaintext = False, **kwargs):
    """
    Returns a string of plain text from a number of kinds of data.

    The kinds of data currently accepted are:

    path to corpus : all trees are flattened
    path to subcorpus : all trees are flattened
    conc() output (list of concordance lines)
    csv file generated with conc()
    a string of text
    """
    import os
    import pandas
    from corpkit.other import tregex_engine
    from corpkit.tests import check_dit
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False

    tregex_engine_used = False
    
    # if unicode and not a path, it's already text: return it as-is
    if type(data) == unicode:
        if not os.path.isdir(data):
            if not os.path.isfile(data):
                return data
    if type(data) == str:
        # if it's a file, read it
        if os.path.isfile(data):
            good = open(data).read()
        # if it's a dir, flatten all trees
        elif os.path.isdir(data):
            # get all sentences newline separated
            query = r'__ !< __'
            options = ['-o', '-t']

            # if lemmatise, we get each word on a newline
            if 'lemmatise' in kwargs:
                if kwargs['lemmatise'] is True:
                    query = r'__ <# (__ !< __)'
                    options = ['-o']
 
            # check for trees ...
            #while plaintext is False:
                #for f in first_twenty:
                    #plaintext = tregex_engine(corpus = f, check_for_trees = True)
            
            if not plaintext:
                tregex_engine_used = True
                results = tregex_engine(corpus = data,
                                              options = options,
                                              query = query, 
                                              **kwargs)
            else:
                results = []
                fs = [os.path.join(data, f) for f in os.listdir(data)]
                # do recursive if need
                if any(os.path.isdir(f) for f in fs):
                    recursive_files = []
                    for dirname, dirnames, filenames in os.walk(data):
                        for filename in filenames:
                            recursive_files.append(os.path.join(dirname, filename))
                    fs = recursive_files
                
                import nltk
                sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
                for f in fs:
                    raw = unicode(open(f).read(), 'utf-8', errors = 'ignore')
                    sents = sent_tokenizer.tokenize(raw)
                    tokenized_sents = [nltk.word_tokenize(i) for i in sents]
                    for sent in tokenized_sents:
                        for w in sent:
                            results.append(w.lower()) 

            return results

            #good = '\n'.join(results)
        # if a string of text, 
        else:
            good = data
    # if conc results, turn into string...
    elif type(data) == pandas.core.frame.DataFrame:
        # if conc lines:
        try:
            if list(data.columns) == ['l', 'm', 'r']:
                conc_lines = True
            else:
                conc_lines = False
        except:
            conc_lines = False
        if conc_lines:
            # may not be unicode!?
            good = [' '.join(list(data.ix[l])) for l in list(data.index)]

    else:
        good = data

    # make unicode
    if not tregex_engine_used:
        try:
            good = unicode(good, 'utf-8', errors = 'ignore')
        except TypeError:
            pass

    return good
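A hedged sketch of the kinds of input datareader() accepts; the file and directory paths are hypothetical:

raw_string = datareader('a plain string of text is passed through unchanged')
from_file = datareader('data/notes.txt')
from_corpus = datareader('data/corpus-parsed', lemmatise = True)
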
Example #7
def eugener(path, 
            query, 
            depth = 5, 
            top = 20, 
            lemmatise = False,
            just_content_words = False,
            remove_query_from_output = False,
            remove_zero_depth = False,
            return_tags = False):
    """ 
    ***This is probably broken now, can fix if there's a use for it.***
    
    get most frequent words in corpus path to left and right of query regex

    path: path to corpus containing subcorpora
    query: regex to match word to be zero depth
    depth: number of places left and right to look
    top: number of most frequent entries to return
    lemmatise: wordnet lemmatisation
    just_content_words: keep only n, v, a, r tagged words
    remove_query_from_output: remove words matching the query regex from the output
    remove_zero_depth: drop the zero-depth (query match) row from the output
    return_tags: return POS tags rather than words
    """
    import os
    import nltk
    import re
    from collections import Counter
    import pandas as pd
    from corpkit.progressbar import ProgressBar
    from corpkit.other import tregex_engine

    # manual lemmatisation here:
    from dictionaries.word_transforms import wordlist
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
        from IPython.display import display, clear_output
    except NameError:
        import subprocess
        have_ipython = False
    from corpkit.tests import check_dit # probably never needed
    
    if lemmatise:
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr=WordNetLemmatizer()

    regex = re.compile(query)
    wordregex = re.compile('[A-Za-z0-9]')

    print ''

    # get list of subcorpora
    dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
    sorted_dirs = sorted(dirs)
    # define risk word
    # place for our output
    dfs = {}
    p = ProgressBar(len(sorted_dirs))
    for index, corpus in enumerate(sorted_dirs):
        p.animate(index)
        # search the corpus for whole sents containing risk word
        subcorpus = os.path.join(path, corpus)
        if lemmatise:
            query = r'__ <# (__ !< __)'
        else:
            query = r'__ !> __'
        results = tregex_engine(query, ['-o'], subcorpus, 
                                lemmatise = lemmatise, 
                                just_content_words = just_content_words)

        # lowercase
        processed = [(r.lower(), tag) for r, tag in results]

        # remove punct
        processed = [w for w in processed if re.search(wordregex, w[0])]

        # a place for info about each corpus
        # word list to use later
        all_words = []
        dicts = []

        # go left and right depth times (for 2, makes [-2, -1, 0, 1, 2])
        for i in range(-depth, (depth + 1)):
            newdict = Counter()
            matching = []
            # go through each token
            for index, (token, tag) in enumerate(processed):
                # if token matches risk expression
                if re.search(regex, token):
                    # get the word at depth index
                    # try statement for cases where the target word index isn't there
                    try:
                        if i < 0:
                            num = index - abs(i)
                            if return_tags:
                                matching.append(processed[num][1])
                            else:
                                matching.append(processed[num][0])
                        else:
                            if return_tags:
                                matching.append(processed[index + i][1])
                            else:
                                matching.append(processed[index + i][0])
                    except:
                        pass
            # tally results
            counted = Counter(matching)
            # remove punctuation etc
            for key in counted:
                # commented because this stuff was moved earlier.
                #if key.isalnum():
                    #if key not in stopwords:
                    #if remove_stopwords:
                        #if key not in stopwords:
                            #newdict[key] = counted[key]
                    #else:
                        #newdict[key] = counted[key]
                newdict[key] = counted[key]
            for w in counted.keys():
                all_words.append(w)
            #top_tokens = newdict.most_common(top)
            dicts.append(newdict)
        
        # make pandas series
        sers = []
        # for each unique word
        for word in list(set(all_words)):
            #get counts for each depth
            series = [dct[word] for dct in dicts]
            # add a total
            series.append(sum([dct[word] for dct in dicts]))
            #make index names for depths plus total
            index_names = range(-depth, (depth + 1))
            index_names.append('Total')
            # turn into pandas data, and name the series the word
            ser = pd.Series(series, index = index_names)
            ser.name = word
            sers.append(ser)
        
        # concatenate series into dataframe
        df = pd.concat(sers, axis=1)

        # sort by total
        tot = df.ix['Total']
        df = df[tot.argsort()[::-1]]

        # remove words matching the regex if need be
        if remove_query_from_output:
            cols = [c for c in list(df.columns) if not re.search(regex, c)]
            df = pd.DataFrame(df[cols])
        # remove zero depth if need be
        if remove_zero_depth:
            df = df.drop(0, axis = 0)

        # just top entries
        df = pd.DataFrame(df[list(df.columns)[:top]])
        
        #transpose
        dfs[corpus] = df.T

    # complete animation, then clear
    p.animate(len(sorted_dirs))
    if have_ipython:
        clear_output()

    # some settings for good display
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)
    pd.set_option('expand_frame_repr', False)
    pd.set_option('colheader_justify', 'right')

    # print the start of each frame, then return them all
    for item in sorted(dfs):
        print item, '\n', dfs[item].head(), '\n'
    return dfs
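A call sketch only (the docstring above warns the function may be broken); the corpus path and regex are hypothetical:

dfs = eugener('data/corpus-parsed', r'(?i)\brisk', depth = 3, top = 10,
              remove_query_from_output = True)
# dfs maps each subcorpus name to a DataFrame of collocates by depth
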
Example #8
def conc(corpus, 
        query, 
        option = 'tregex',
        dep_function = 'any',
        dep_type = 'basic-dependencies',
        n = 100, 
        random = False, 
        window = 100, 
        trees = False,
        plaintext = False, #'guess',
        add_links = False,
        show_links = False,
        print_status = True,
        print_output = True,
        just_speakers = False,
        root = False,
        **kwargs): 
    """A concordancer for Tregex queries and dependencies"""
    import corpkit
    import os
    import re
    import pandas as pd
    from pandas import DataFrame
    from time import localtime, strftime    
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass
    from corpkit.other import tregex_engine
    from corpkit.tests import check_pytex, check_dit
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False

    if query == 'any':
        query = r'.*'
    
    # convert list to query
    if type(query) == list:
        from other import as_regex
        if option.startswith('t'):
            query = r'/%s/ !< __' % as_regex(query, boundaries = 'line')
        else:
            query = as_regex(query, boundaries = 'w')

    can_do_fast = False
    if option.startswith('t'):
        if just_speakers is False:
            can_do_fast = True

    just_speakers_is_list = False
    if type(just_speakers) == list:
        just_speakers_is_list = True

    if type(just_speakers) == str:
        if just_speakers.lower() != 'all':
            just_speakers = [just_speakers]

    def get_deps(sentence, dep_type):
        if dep_type == 'basic-dependencies':
            return sentence.basic_dependencies
        if dep_type == 'collapsed-dependencies':
            return sentence.collapsed_dependencies
        if dep_type == 'collapsed-ccprocessed-dependencies':
            return sentence.collapsed_ccprocessed_dependencies

    conc_lines = []
    if option.startswith('t'):
        if trees:
            options = '-s'
        else:
            options = '-t'
    if can_do_fast:
        speakr = ''
        tregex_engine(query = query, check_query = True, root = root)
        wholes = tregex_engine(query = query, 
                                options = ['-o', '-w', '-f', options], 
                                corpus = corpus,
                                preserve_case = True,
                                root = root)
        middle_column_result = tregex_engine(query = query, 
                                options = ['-o', options], 
                                corpus = corpus,
                                preserve_case = True,
                                root = root)
        for (f, whole), mid in zip(wholes, middle_column_result):
            reg = re.compile(r'(' + re.escape(mid) + r')', re.IGNORECASE)
            start, middle, end = re.split(reg, whole, 1)
            conc_lines.append([os.path.basename(f), speakr, start, middle, end])
    else:

        fs_to_conc = []
        for r, dirs, fs in os.walk(corpus):
            for f in fs:
                if not os.path.isfile(os.path.join(r, f)):
                    continue
                if not f.endswith('.txt') and not f.endswith('.xml'):
                    continue
                fs_to_conc.append(os.path.join(r, f))

        def normalise(concline):
            import re
            reg = re.compile(r'\([^ ]+')
            spaces = re.compile(r'\s+')
            concline = re.sub(reg, '', concline)
            concline = re.sub(spaces, ' ', concline)
            concline = concline.replace(')', '').replace('  ', ' ')
            return concline.strip()

        num_fs = len(fs_to_conc)
        for index, filepath in enumerate(fs_to_conc):
            f = os.path.basename(filepath)
            if num_fs > 1:
                if 'note' in kwargs.keys():
                    kwargs['note'].progvar.set((index) * 100.0 / num_fs)
            from time import localtime, strftime
            thetime = strftime("%H:%M:%S", localtime())
            print '%s: Extracting data from %s ...' % (thetime, f)
            if root:
                root.update()
            with open(filepath, "rb") as text:
                parsetreedict = {}
                data = text.read()
                if option.startswith('p') or option.startswith('l'):
                    if option.startswith('l'):
                        import pickle
                        lstokens = pickle.load(open(filepath, 'rb'))
                        data = ' '.join(lstokens)
                        lines = data.split(' . ')
                    else:
                        lines = data.splitlines()
                    for l in lines:
                        m = re.compile(r'^(.*?)(' + query + r')(.*)$', re.IGNORECASE)
                        mat = re.search(m, l)
                        if mat:
                            conc_lines.append([f, '', mat.group(1), mat.group(2), mat.group(3)])
                    continue
                from corenlp_xml.document import Document
                corenlp_xml = Document(data)
                #corenlp_xml = Beautifulcorenlp_xml(data, parse_only=justsents)  
                if just_speakers:
                    for s in just_speakers:
                        parsetreedict[s] = []
                    sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers]
                    #sents = [s for s in corenlp_xml.find_all('sentence') \
                    #if s.speakername.text.strip() in just_speakers]
                else:
                    sents = corenlp_xml.sentences
                nsents = len(sents)
                for i, s in enumerate(sents):
                    if num_fs == 1:
                        if 'note' in kwargs.keys():
                            kwargs['note'].progvar.set((index) * 100.0 / nsents)
                            if root:
                                root.update()
                    try:
                        speakr = s.speakername.strip()
                    except:
                        speakr = '' 
                    parsetree = s.parse_string
                    if option.startswith('t'):
                        parsetreedict[speakr].append(parsetree)
                        continue
                    elif option.startswith('d'): 
                        #right_dependency_grammar = s.find_all('dependencies', type=dep_type, limit = 1)
                        deps = get_deps(s, dep_type)
                        if dep_function == 'any' or dep_function is False:
                            wdsmatching = [l.dependent.text.strip() for l in deps.links \
                                           if re.match(query, l.dependent.text.strip())]
                        else:
                            comped = re.compile(dep_function, re.IGNORECASE)
                            #goodsent = any(re.match(query, l.dependent.text.strip()) for l in deps.links if re.match(comped, l.type.strip()))
                            wdsmatching = [l.dependent.text.strip() for l in deps.links \
                                           if re.match(comped, l.type.strip()) and \
                                           re.match(query, l.dependent.text.strip())]
                        # this is shit, needs indexing or something
                        for wd in wdsmatching:
                            line = normalise(parsetree)
                            start, middle, end = re.split(r'(' + wd + r')', line, 1)
                            conc_lines.append([f, speakr, start, middle, end])

                if option.startswith('t'):
                    for speakr, dt in parsetreedict.items():
                        trees_as_string = '\n'.join(dt)
                        if trees:
                            options = '-s'
                        else:
                            options = '-t'
                        with open('tmp.txt', 'w') as fo:
                            fo.write(trees_as_string.encode('utf-8', errors = 'ignore'))
                        tregex_engine(query = query, check_query = True, root = root)
                        wholes = tregex_engine(query = query, 
                                    options = ['-o', '-w', options], 
                                    corpus = 'tmp.txt',
                                    preserve_case = True,
                                    root = root)
                        middle_column_result = tregex_engine(query = query, 
                                    options = ['-o', options], 
                                    corpus = 'tmp.txt',
                                    preserve_case = True,
                                    root = root)
                        for whole, mid in zip(wholes, middle_column_result):
                            reg = re.compile(r'(' + re.escape(mid) + r')', re.IGNORECASE)
                            start, middle, end = re.split(reg, whole, 1)
                            conc_lines.append([f, speakr, start, middle, end])

    # does not keep results ordered!
    try:
        os.remove('tmp.txt')
    except:
        pass

    unique_results = [list(x) for x in set(tuple(x) for x in conc_lines)]

    #make into series
    series = []
    pindex = 'f s l m r'.encode('utf-8').split()

    for fname, spkr, start, word, end in unique_results:
        import os
        fname = os.path.basename(fname)
        start = start.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',').replace(' .', '.').replace("'' ", "''").replace(" n't", "n't").replace(" 're","'re").replace(" 'm","'m").replace(" 's","'s").replace(" 'd","'d").replace(" 'll","'ll").replace('  ', ' ')
        word = word.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',').replace(' .', '.').replace("'' ", "''").replace(" n't", "n't").replace(" 're","'re").replace(" 'm","'m").replace(" 's","'s").replace(" 'd","'d").replace(" 'll","'ll").replace('  ', ' ')
        end = end.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',').replace(' .', '.').replace("'' ", "''").replace(" n't", "n't").replace(" 're","'re").replace(" 'm","'m").replace(" 's","'s").replace(" 'd","'d").replace(" 'll","'ll").replace('  ', ' ')
        #spaces = ' ' * (maximum / 2 - (len(word) / 2))
        #new_word = spaces + word + spaces
        series.append(pd.Series([fname.encode('utf-8', errors = 'ignore'), \
                                 spkr.encode('utf-8', errors = 'ignore'), \
                                 start.encode('utf-8', errors = 'ignore'), \
                                 word.encode('utf-8', errors = 'ignore'), \
                                 end.encode('utf-8', errors = 'ignore')], index = pindex))

    # randomise results...
    if random:
        from random import shuffle
        shuffle(series)

    if series == []:
        if root:
            print 'No results found, sorry.'
            return
        else:
            raise ValueError("No results found, I'm afraid. Check your query and path.")

    df = pd.concat(series, axis = 1).T


    if not add_links:
        df.columns = ['f', 's', 'l', 'm', 'r']
    else:
        df.columns = ['f', 's', 'l', 'm', 'r', 'link']

    if all(x == '' for x in list(df['s'].values)):
        df.drop('s', axis = 1, inplace = True)

    formatl = lambda x: "{0}".format(x[-window:])
    formatf = lambda x: "{0}".format(x[-20:])
    #formatr = lambda x: 
    formatr = lambda x: "{{:<{}s}}".format(df['r'].str.len().max()).format(x[:window])
    st = df.head(n).to_string(header = False, formatters={'l': formatl,
                                                          'r': formatr,
                                                          'f': formatf}).splitlines()
    
    # hack because i can't figure out formatter:
    rem = '\n'.join([re.sub('\s*\.\.\.\s*$', '', s) for s in st])
    if print_output:
        print rem

    if 'note' in kwargs.keys():
        kwargs['note'].progvar.set(100)

    return df

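A usage sketch for this version of conc(), assuming a hypothetical parsed corpus and an illustrative Tregex query:

df = conc('data/corpus-parsed', r'/VB.?/ < /(?i)risk/', option = 'tregex', n = 50, window = 60)
print df.head()
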
Example #9
File: conc.py Project: whrl/corpkit
def conc(corpus, 
        option = 'tregex',
        query = 'any', 
        dep_function = 'any',
        dep_type = 'collapsed-ccprocessed-dependencies',
        n = 100, 
        random = False, 
        split_sents = True,
        window = 100, 
        trees = False,
        plaintext = False,
        add_links = False,
        show_links = False,
        print_status = True,
        print_output = True,
        just_speakers = False,
        root = False,
        **kwargs): 
    """
    A concordancer for Tregex queries and dependencies.

    * Revisions forthcoming to facilitate better dependency querying

    :returns: a Pandas DataFrame containing concordance lines"""
    import corpkit
    import os
    import re
    import pandas as pd
    from pandas import DataFrame
    from time import localtime, strftime    
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass
    from corpkit.other import tregex_engine
    from corpkit.tests import check_pytex, check_dit
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False

    if query == 'any':
        query = r'.*'
    
    # convert list to query
    if type(query) == list:
        from other import as_regex
        if option.startswith('t'):
            query = r'/%s/ !< __' % as_regex(query, boundaries = 'line')
        else:
            query = as_regex(query, boundaries = 'w')

    can_do_fast = False
    if option.startswith('t'):
        if just_speakers is False:
            can_do_fast = True

    just_speakers_is_list = False
    if type(just_speakers) == list:
        just_speakers_is_list = True

    if type(just_speakers) == str:
        if just_speakers.lower() != 'all':
            just_speakers = [just_speakers]

    def get_deps(sentence, dep_type):
        if dep_type == 'basic-dependencies':
            return sentence.basic_dependencies
        if dep_type == 'collapsed-dependencies':
            return sentence.collapsed_dependencies
        if dep_type == 'collapsed-ccprocessed-dependencies':
            return sentence.collapsed_ccprocessed_dependencies

    conc_lines = []
    if option.startswith('t'):
        if trees:
            options = '-s'
        else:
            options = '-t'
    if can_do_fast:
        speakr = ''
        tregex_engine(query = query, check_query = True, root = root)
        wholes = tregex_engine(query = query, 
                                options = ['-o', '-w', '-f', options], 
                                corpus = corpus,
                                preserve_case = True,
                                root = root)
        middle_column_result = tregex_engine(query = query, 
                                options = ['-o', options], 
                                corpus = corpus,
                                preserve_case = True,
                                root = root)
        for (f, whole), mid in zip(wholes, middle_column_result):
            reg = re.compile(r'(' + re.escape(mid) + r')', re.IGNORECASE)
            start, middle, end = re.split(reg, whole, 1)
            conc_lines.append([os.path.basename(f), speakr, start, middle, end])
    else:

        if query.startswith(r'\b'):
            query = query[2:]
        if query.endswith(r'\b'):
            query = query[:-2]

        fs_to_conc = []
        for r, dirs, fs in os.walk(corpus):
            for f in fs:
                if not os.path.isfile(os.path.join(r, f)):
                    continue
                if not f.endswith('.txt') and not f.endswith('.xml') and not f.endswith('.p'):
                    continue
                fs_to_conc.append(os.path.join(r, f))

        def normalise(concline):
            import re
            reg = re.compile(r'\([^ ]+')
            spaces = re.compile(r'\s+')
            concline = re.sub(reg, '', concline)
            concline = re.sub(spaces, ' ', concline)
            concline = concline.replace(')', '').replace('  ', ' ')
            return concline.strip()

        num_fs = len(fs_to_conc)
        for index, filepath in enumerate(fs_to_conc):
            f = os.path.basename(filepath)
            if num_fs > 1:
                if 'note' in kwargs.keys():
                    kwargs['note'].progvar.set((index) * 100.0 / num_fs)
            if print_status:
                from time import localtime, strftime
                thetime = strftime("%H:%M:%S", localtime())
                print '%s: Extracting data from %s ...' % (thetime, f)
            if root:
                root.update()
            with open(filepath, "r") as text:
                parsetreedict = {}
                data = text.read()
                if option.startswith('p'):
                    import chardet
                    enc = chardet.detect(data)
                    data = unicode(data, enc['encoding'], errors = 'ignore')
                if option.startswith('p') or option.startswith('l'):
                    if option.startswith('l'):
                        import pickle
                        try:
                            lstokens = pickle.load(open(filepath, 'rb'))
                        except EOFError:
                            thetime = strftime("%H:%M:%S", localtime())
                            print '%s: File "%s" could not be opened.' % (thetime, os.path.basename(filepath))
                        data = ' '.join(lstokens)
                        if split_sents:
                            lines = data.split(' . ')
                        else:
                            lines = [data.replace('\n', '')]
                    else:
                        if split_sents:
                            lines = data.splitlines()
                        else:
                            lines = [data.replace('\n', '')]
                    for l in lines:
                        if split_sents:
                            m = re.compile(r'(?i)^(.*?)(\b' + query + r'\b)(.*)$', re.UNICODE)
                        else:
                            m = re.compile(r'(?i)(.{,%s})(\b' % window + query + r'\b)(.{,%s})' % window, re.UNICODE)
                        if split_sents:
                            mat = re.search(m, l)
                        else:
                            mat = re.findall(m, l)
                        if split_sents:
                            if mat:
                                last_num = len(mat.groups())
                                conc_lines.append([f, '', mat.group(1), mat.group(2), mat.group(last_num)])
                        else:
                            if mat:
                                #print len(mat)
                                for ent in mat:
                                    #print len(ent)
                                    last_num = len(ent) - 1
                                    conc_lines.append([f, '', ent[0], ent[1], ent[last_num]])

                if any(f.endswith('.xml') for f in fs_to_conc):
                    from corenlp_xml.document import Document
                    corenlp_xml = Document(data)
                    #corenlp_xml = Beautifulcorenlp_xml(data, parse_only=justsents)  
                    if just_speakers:
                        for s in just_speakers:
                            parsetreedict[s] = []
                        sents = [s for s in corenlp_xml.sentences if s.speakername in just_speakers]
                        #sents = [s for s in corenlp_xml.find_all('sentence') \
                        #if s.speakername.text.strip() in just_speakers]
                    else:
                        sents = corenlp_xml.sentences
                    nsents = len(sents)
                    for i, s in enumerate(sents):
                        if num_fs == 1:
                            if 'note' in kwargs.keys():
                                kwargs['note'].progvar.set((index) * 100.0 / nsents)
                                if root:
                                    root.update()
                        try:
                            speakr = s.speakername.strip()
                        except:
                            speakr = '' 
                        parsetree = s.parse_string
                        if option.startswith('t'):
                            parsetreedict[speakr].append(parsetree)
                            continue
                        elif option.startswith('d'):
                            try:
                                compiled_query = re.compile(query)
                            except:
                                import traceback
                                import sys
                                exc_type, exc_value, exc_traceback = sys.exc_info()
                                lst = traceback.format_exception(exc_type, exc_value,
                                              exc_traceback)
                                error_message = lst[-1]
                                thetime = strftime("%H:%M:%S", localtime())
                                print '%s: Query %s' % (thetime, error_message)
                                return
        
                            #right_dependency_grammar = s.find_all('dependencies', type=dep_type, limit = 1)
                            deps = get_deps(s, dep_type)
                            if dep_function == 'any' or dep_function is False:
                                wdsmatching = [l.dependent.text.strip() for l in deps.links \
                                               if re.match(query, l.dependent.text.strip())]
                            else:
                                comped = re.compile(dep_function, re.IGNORECASE)
                                #goodsent = any(re.match(query, l.dependent.text.strip()) for l in deps.links if re.match(comped, l.type.strip()))
                                wdsmatching = [l.dependent.text.strip() for l in deps.links \
                                               if re.match(comped, l.type.strip()) and \
                                               re.match(query, l.dependent.text.strip())]
                            # this is shit, needs indexing or something
                            for wd in wdsmatching:
                                line = normalise(parsetree)
                                try:
                                    start, middle, end = re.split(r'(' + wd + r')', line, 1)
                                except ValueError:
                                    continue
                                conc_lines.append([f, speakr, start, middle, end])

                    if option.startswith('t'):
                        for speakr, dt in parsetreedict.items():
                            trees_as_string = '\n'.join(dt)
                            if trees:
                                options = '-s'
                            else:
                                options = '-t'
                            with open('tmp.txt', 'w') as fo:
                                fo.write(trees_as_string.encode('utf-8', errors = 'ignore'))
                            tregex_engine(query = query, check_query = True, root = root)
                            wholes = tregex_engine(query = query, 
                                        options = ['-o', '-w', options], 
                                        corpus = 'tmp.txt',
                                        preserve_case = True,
                                        root = root)
                            middle_column_result = tregex_engine(query = query, 
                                        options = ['-o', options], 
                                        corpus = 'tmp.txt',
                                        preserve_case = True,
                                        root = root)
                            for whole, mid in zip(wholes, middle_column_result):
                                reg = re.compile(r'(' + re.escape(mid) + r')', re.IGNORECASE)
                                start, middle, end = re.split(reg, whole, 1)
                                conc_lines.append([f, speakr, start, middle, end])

    # does not keep results ordered!
    try:
        os.remove('tmp.txt')
    except:
        pass

    unique_results = [list(x) for x in set(tuple(x) for x in conc_lines)]

    #make into series
    series = []
    pindex = 'f s l m r'.encode('utf-8').split()

    for fname, spkr, start, word, end in unique_results:
        spkr = unicode(spkr, errors = 'ignore')
        fname = os.path.basename(fname)
        start = start.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',').replace(' .', '.').replace("'' ", "''").replace(" n't", "n't").replace(" 're","'re").replace(" 'm","'m").replace(" 's","'s").replace(" 'd","'d").replace(" 'll","'ll").replace('  ', ' ')
        word = word.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',').replace(' .', '.').replace("'' ", "''").replace(" n't", "n't").replace(" 're","'re").replace(" 'm","'m").replace(" 's","'s").replace(" 'd","'d").replace(" 'll","'ll").replace('  ', ' ')
        end = end.replace('$ ', '$').replace('`` ', '``').replace(' ,', ',').replace(' .', '.').replace("'' ", "''").replace(" n't", "n't").replace(" 're","'re").replace(" 'm","'m").replace(" 's","'s").replace(" 'd","'d").replace(" 'll","'ll").replace('  ', ' ')
        #spaces = ' ' * (maximum / 2 - (len(word) / 2))
        #new_word = spaces + word + spaces

        # the use of ascii here makes sure the string formats ok, but will also screw over
        # anyone doing non-english work. so, change to utf-8, then fix errors as they come
        # in the corpkit-gui "add_conc_lines_to_window" function
        series.append(pd.Series([fname.encode('ascii', errors = 'ignore'), \
                                 spkr.encode('ascii', errors = 'ignore'), \
                                 start.encode('ascii', errors = 'ignore'), \
                                 word.encode('ascii', errors = 'ignore'), \
                                 end.encode('ascii', errors = 'ignore')], index = pindex))

    # randomise results...
    if random:
        from random import shuffle
        shuffle(series)

    if series == []:
        if root:
            print 'No results found, sorry.'
            return
        else:
            raise ValueError("No results found, I'm afraid. Check your query and path.")

    df = pd.concat(series, axis = 1).T

    if not add_links:
        df.columns = ['f', 's', 'l', 'm', 'r']
    else:
        df.columns = ['f', 's', 'l', 'm', 'r', 'link']

    if all(x == '' for x in list(df['s'].values)):
        df.drop('s', axis = 1, inplace = True)

    if 'note' in kwargs.keys():
        kwargs['note'].progvar.set(100)

    if print_output:

        formatl = lambda x: "{0}".format(x[-window:])
        formatf = lambda x: "{0}".format(x[-20:])
        #formatr = lambda x: 
        formatr = lambda x: "{{:<{}s}}".format(df['r'].str.len().max()).format(x[:window])
        st = df.head(n).to_string(header = False, formatters={'l': formatl,
                                                              'r': formatr,
                                                              'f': formatf}).splitlines()
        
        # hack because i can't figure out formatter:
        rem = '\n'.join([re.sub('\s*\.\.\.\s*$', '', s) for s in st])
        print rem

    return df
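Finally, a hedged sketch of dependency-mode concordancing with this revision; the corpus of CoreNLP XML, the speaker name and the dependency label regex are all assumptions:

df = conc('data/transcripts-parsed',
          option = 'dependencies',
          query = r'(?i)^risk',
          dep_function = r'nsubj',
          just_speakers = ['INTERVIEWER'],
          n = 30,
          window = 60)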