Example #1
def ngrams(data,
           reference_corpus = 'bnc.p',
           clear = True, 
           printstatus = True, 
           n = 'all',
           calc_all = True,
           **kwargs):
    """Feed this function some data and get its keywords.

    You can use dictmaker() to build a new reference corpus,
    or use the default, bnc.p.

    A list of what counts as data is available in the 
    docstring of datareader().
    """
    
    import re
    import time
    from time import localtime, strftime
    from dictionaries.stopwords import stopwords as my_stopwords

    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass

    from keys import keywords_and_ngrams, turn_input_into_counter
    from other import datareader

    loaded_ref_corpus = turn_input_into_counter(reference_corpus)

    time = strftime("%H:%M:%S", localtime())
    if printstatus:
        print "\n%s: Generating ngrams... \n" % time
    good = datareader(data, **kwargs)

    # keep only tokens containing a letter, apostrophe or hyphen, minus stopwords
    regex_nonword_filter = re.compile(r"[A-Za-z'-]")
    good = [i for i in good if re.search(regex_nonword_filter, i) and i not in my_stopwords]

    # pass the pre-loaded counter rather than the raw filepath
    ngrams = keywords_and_ngrams(good, reference_corpus = loaded_ref_corpus, 
                                 calc_all = calc_all, show = 'ngrams', **kwargs)

    import pandas as pd
    out = pd.Series([s for k, s in ngrams], index = [k for k, s in ngrams])
    out.name = 'ngrams'

    # print and return
    if clear:
        try:
            clear_output()
        except NameError:
            # IPython is not available, so there is nothing to clear
            pass
    if printstatus:
        time = strftime("%H:%M:%S", localtime())
        print '%s: Done! %d results.\n' % (time, len(list(out.index)))

    if n == 'all':
        n = len(out)

    return out[:n]
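
A minimal usage sketch for the function above. The corpus path is hypothetical; datareader()'s docstring lists everything that counts as valid data:

# 'data/my_corpus' is a hypothetical directory of plain-text files
result = ngrams('data/my_corpus', reference_corpus = 'bnc.p', n = 20)
print result[:10]
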
Example #2
def collocates(data, nbest = 30, window = 5):
    """Feed this data and get its collocations"""
    import nltk
    from nltk import collocations
    from nltk.collocations import BigramCollocationFinder
    import os
    import time
    from time import localtime, strftime
    from other import datareader
    from tests import check_dit
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass
        
    time = strftime("%H:%M:%S", localtime())
    print "\n%s: Generating %d collocates ... \n" % (time, nbest)
    # turn the data into one long string
    good = datareader(data)
    if type(good) != unicode:
        good = unicode(good.lower(), 'utf-8', errors = 'ignore')
    else:
        good = good.lower()
    # sent and word tokenise
    sent_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
    sents = sent_tokenizer.tokenize(good)
    tokenized_sents = [nltk.word_tokenize(i) for i in sents]
    # flatten the tokenised sentences into a single list of words
    allwords = [word for sent in tokenized_sents for word in sent]
    bigram_measures = nltk.collocations.BigramAssocMeasures()
    finder = BigramCollocationFinder.from_words(allwords, window_size=window)
    # filter out short, stopword and non-alphanumeric tokens
    # (NLTK's stopword list; should be consistent with stopwords used elsewhere)
    ignored_words = nltk.corpus.stopwords.words('english')
    finder.apply_word_filter(lambda w: len(w) < 2 or w.lower() \
        in ignored_words or not w.isalnum())
    # drop bigrams that occur only once
    finder.apply_freq_filter(2)
    results = sorted(finder.nbest(bigram_measures.raw_freq, nbest))
    listversion = []
    for index, thecollocation in enumerate(results):
        aslist = [index, thecollocation[0], thecollocation[1]]
        listversion.append(aslist)
    try:
        clear_output()
    except NameError:
        # IPython is not available, so there is nothing to clear
        pass
    return listversion
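
A minimal usage sketch (hypothetical path; collocates() reads anything datareader() accepts):

# ten most frequent collocate pairs within a four-word window
top_pairs = collocates('data/my_corpus', nbest = 10, window = 4)
for index, first, second in top_pairs:
    print index, first, second
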
Example #3
File: keys.py Project: whrl/corpkit
def turn_input_into_counter(data, **kwargs):
    """from string (filepath) or variable, return a counter."""
    import sys
    import os
    import re
    import collections
    import pickle
    import pandas
    from corpkit.other import datareader
    
    dict_found = False

    if type(data) == str:
        if os.path.isdir(data):
            # get list of words
            good = datareader(data, **kwargs)
            # remove bad stuff from result
            regex_nonword_filter = re.compile("[A-Za-z]")
            data = [i for i in good if re.search(regex_nonword_filter, i)]
            return collections.Counter(data)

    while not dict_found:
        if 'interrogation' in str(type(data)):
            try:
                data = data.results
            except AttributeError:
                raise ValueError("Can't find .results branch of input.")

        # if passing in results, sum them
        if type(data) == pandas.core.frame.DataFrame:
            data = data.sum()

        # count sum
        if type(data) == pandas.core.series.Series:
            data = data[data != 0]
            data = collections.Counter(data.to_dict())
            dict_found = True
            return data

        # turn normal dicts into counters
        if type(data) == dict:
            dict_found = True
            return collections.Counter(data)
        
        # the best case scenario:
        if type(data) == collections.Counter:
            dict_found = True
            return data

        # filepath stuff
        if type(data) == str:
            if not data.endswith('.p'):
                data = data + '.p'
            try:
                ref_corp_dict = pickle.load(open(data, 'rb'))
                dict_found = True
                return ref_corp_dict
            except IOError:
                try:
                    ref_corp_dict = pickle.load(open(os.path.join('dictionaries', data), 'rb'))
                    dict_found = True
                    return ref_corp_dict
                except IOError:
                    # fall through to the interactive selection below
                    pass

            dict_of_dicts = {}
            d_for_print = []
            
            dicts = [f for f in os.listdir('dictionaries') if f.endswith('.p')]
            for index, d in enumerate(dicts):
                dict_of_dicts[index] = d
                d_for_print.append('    % 2d) %s' % (index, d))
            
            d_for_print = '\n'.join(d_for_print) 

            selection = raw_input("\nReference corpus not found. Select an existing reference corpus, or type 'exit' to quit.\n\n%s\n\nYour selection: " % d_for_print)
            if selection.startswith('e'):
                import sys
                sys.exit()
            else:
                try:
                    data = dict_of_dicts[int(selection)]
                except (ValueError, KeyError):
                    print '\nInput "%s" not recognised.' % selection
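
A minimal usage sketch showing the simplest accepted input, a plain dict (the counts below are invented for illustration):

import collections
# dicts, Counters, pandas Series/DataFrames, interrogation objects,
# directory paths and pickled-dict filepaths are all valid inputs
counts = turn_input_into_counter({'the': 510, 'of': 320})
assert type(counts) == collections.Counter
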
Example #4
File: keys.py Project: whrl/corpkit
def ngrams(data,
           clear = True, 
           printstatus = True, 
           n = 'all',
           calc_all = True,
           blacklist = False,
           split_contractions = True,
           gramsize = 2,
           **kwargs):
    """Feed this function some data and get ngrams.

    You can use dictmaker() to build a new reference_corpus 
    to serve as reference corpus, or use bnc.p

    A list of what counts as data is available in the 
    docstring of datareader().
    """
    
    import re
    import time
    from time import localtime, strftime

    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass

    stopwords = False
    if blacklist is not False:
        if blacklist is True:
            from dictionaries.stopwords import stopwords as my_stopwords
            stopwords = [i.lower() for i in my_stopwords]
        else:
            stopwords = [i.lower() for i in blacklist]

    from corpkit.keys import keywords_and_ngrams, turn_input_into_counter
    from other import datareader

    time = strftime("%H:%M:%S", localtime())
    if printstatus:
        print "\n%s: Generating ngrams... \n" % time
    
    good = datareader(data, **kwargs)

    # keep only tokens containing a letter, digit, apostrophe or hyphen
    regex_nonword_filter = re.compile(r"[0-9A-Za-z'-]")
    good = [i for i in good if re.search(regex_nonword_filter, i)]

    def unsplitter(lst):
        """Rejoin contraction fragments, e.g. ['do', "n't"] becomes ["don't"]."""
        unsplit = []
        for index, t in enumerate(lst):
            if "'" in t and not t.endswith("'") and index > 0:
                # a contraction fragment: join it to the previous token
                unsplit.append(lst[index - 1] + t)
            elif index < len(lst) - 1 and "'" in lst[index + 1] \
                and not lst[index + 1].endswith("'"):
                # the next token is a fragment that will absorb this one
                continue
            else:
                unsplit.append(t)
        return unsplit

    if not split_contractions:
        good = unsplitter(good)

    ngrams = keywords_and_ngrams(good, stopwords = stopwords,
                                 calc_all = calc_all, show = 'ngrams', gramsize = gramsize, **kwargs)

    import pandas as pd
    out = pd.Series([s for k, s in ngrams], index = [k for k, s in ngrams])
    out.name = 'ngrams'

    # print and return
    if clear:
        try:
            clear_output()
        except NameError:
            pass
    if printstatus:
        time = strftime("%H:%M:%S", localtime())
        print '%s: Done! %d results.\n' % (time, len(list(out.index)))

    if n == 'all':
        n = len(out)

    return out[:n]
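
A minimal usage sketch for this version (hypothetical path), counting trigrams with contractions kept whole and stopwords blacklisted:

trigrams = ngrams('data/my_corpus',
                  gramsize = 3,
                  split_contractions = False,
                  blacklist = True,
                  n = 25)
print trigrams[:10]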