Ejemplo n.º 1
0
def ngrams(data,
           reference_corpus = 'bnc.p',
           clear = True,
           printstatus = True,
           n = 'all',
           calc_all = True,
           **kwargs):
    """Feed this function some data and get its ngrams as a pandas Series.

    You can use dictmaker() to build a new reference_corpus
    to serve as reference corpus, or use bnc.p

    A list of what counts as data is available in the
    docstring of datareader().

    :param data: anything accepted by datareader()
    :param reference_corpus: path/name of the reference corpus
    :param clear: clear IPython output before printing results
    :param printstatus: print progress messages
    :param n: number of results to return, or 'all' for every result
    :param calc_all: forwarded to keywords_and_ngrams()
    :returns: pandas.Series of ngram scores, named 'ngrams'
    """

    import re
    import time
    from time import localtime, strftime
    from dictionaries.stopwords import stopwords as my_stopwords

    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass

    from keys import keywords_and_ngrams, turn_input_into_counter
    from other import datareader

    loaded_ref_corpus = turn_input_into_counter(reference_corpus)

    if printstatus:
        # BUG FIX: py2 print statements replaced with print() calls,
        # consistent with the rest of the file
        thetime = strftime("%H:%M:%S", localtime())
        print("\n%s: Generating ngrams... \n" % thetime)
    good = datareader(data, **kwargs)

    # keep only tokens containing at least one word-ish character,
    # then drop stopwords
    regex_nonword_filter = re.compile(r"[A-Za-z-\']")
    good = [i for i in good if re.search(regex_nonword_filter, i) and i not in my_stopwords]

    # BUG FIX: calc_all was referenced here but never defined (NameError);
    # it is now a real keyword parameter defaulting to True
    ngrams = keywords_and_ngrams(good, reference_corpus = reference_corpus,
                                 calc_all = calc_all, show = 'ngrams', **kwargs)

    import pandas as pd
    out = pd.Series([s for k, s in ngrams], index = [k for k, s in ngrams])
    out.name = 'ngrams'

    # print and return
    if clear:
        # BUG FIX: clear_output is undefined outside IPython; guard it
        # like the sibling functions do
        try:
            clear_output()
        except Exception:
            pass
    if printstatus:
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Done! %d results.\n' % (thetime, len(list(out.index))))

    # BUG FIX: n was previously overwritten with 99999 up front, making
    # this check unreachable; 'all' now really means every result
    if n == 'all':
        n = len(out)

    return out[:n]
Ejemplo n.º 2
0
def keywords(data,
             reference_corpus = 'bnc.p',
             clear = True,
             printstatus = True,
             n = 'all',
             threshold = False,
             selfdrop = True,
             editing = False,
             calc_all = True,
             **kwargs):
    """Feed this function some data and get its keywords.

    Keyness is computed by keywords_and_ngrams(), comparing `data`
    against `reference_corpus`. Returns a pandas Series (single
    corpus) or, when `data` is a DataFrame, a DataFrame with one
    keyword column per subcorpus (row of `data`).

    :param data: target corpus -- DataFrame, Series, or anything
        turn_input_into_counter() accepts
    :param reference_corpus: reference corpus, or 'self' to compare
        the data against a copy of itself
    :param clear: clear IPython output before printing results
    :param printstatus: print progress messages
    :param n: number of results to return, or 'all'
    :param threshold: minimum frequency in the reference corpus:
        False (off), a number, or a string starting with 'l'/'m'/'h'
    :param selfdrop: drop the current subcorpus from the reference
        corpus when their indices are shared
    :param editing: suppress status printing (quiet mode)
    :param calc_all: forwarded to keywords_and_ngrams()
    """
    
    import re
    import time
    from time import localtime, strftime
    import collections
    import pandas
    import pandas as pd
    import numpy as np
    from collections import Counter

    # display tools are optional: only available inside IPython
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass

    from .keys import keywords_and_ngrams, turn_input_into_counter

    # the numeric threshold actually applied (resolved below)
    the_threshold = False

    if type(reference_corpus) == str:
        if reference_corpus == 'self':
            # comparing a Series against its own copy gives 0.0 keyness
            # for everything, so warn the user
            if type(data) == pandas.core.series.Series:
                import warnings
                warnings.warn('Using "self" option with Series as data will result in 0.0 keyness.')
            reference_corpus = data.copy()

        else:
            selfdrop = False

    # turn of selfdrop if df indices aren't shared:
    # this is skipped if loading from file or something
    if type(data) == pandas.core.frame.DataFrame and type(reference_corpus) == pandas.core.frame.DataFrame:
        ref_subc = list(reference_corpus.index)
        tg_subc = list(data.index)
        if not all([x in ref_subc for x in tg_subc]):
            selfdrop = False

    if printstatus and not editing:
        time = strftime("%H:%M:%S", localtime())
        print("\n%s: Generating keywords ...\n" % time)
    
    def set_threshold_and_remove_under(reference_corpus, threshold, for_keywords = False):
        """Resolve `threshold` to a number and remove reference-corpus
        entries below it.

        Returns (reference_corpus, the_threshold, to_drop), where
        to_drop lists the removed words so callers can drop them from
        the target corpus too.
        """
        from collections import Counter
        import pandas

        if type(threshold) == str:
            # named thresholds are a fraction of the reference total:
            # 'l(ow)' -> 1/10000, 'm(edium)' -> 1/5000, 'h(igh)' -> 1/2500
            # NOTE(review): a string starting with anything else leaves
            # `denominator` unbound and raises NameError below -- confirm
            # whether callers can pass arbitrary strings
            if threshold.startswith('l'):
                denominator = 10000
            if threshold.startswith('m'):
                denominator = 5000
            if threshold.startswith('h'):
                denominator = 2500
            if for_keywords:
                denominator = denominator * 5 

            tot = sum(reference_corpus.values())

            the_threshold = float(tot) / float(denominator)

        else:
            the_threshold = threshold
        if printstatus:
            print('Threshold: %d\n' % the_threshold)

        # drop infrequent words from keywording
        to_drop = []
        for w, v in list(reference_corpus.items()):
            if v < the_threshold:
                to_drop.append(w)
                #if type(data) == collections.Counter or type(data) == dict:
                    #del data[w]
                if calc_all:
                    del reference_corpus[w]

        if printstatus:
            # show the first five (and, for long lists, last five) drops
            to_show = [w for w in to_drop[:5]]
            if len(to_drop) > 10:
                to_show.append('...')
                [to_show.append(w) for w in to_drop[-5:]]
            if len(to_drop) > 0:
                print('Removing %d entries below threshold:\n    %s' % (len(to_drop), '\n    '.join(to_show)))
            if len(to_drop) > 10:
                print('... and %d more ... \n' % (len(to_drop) - len(to_show) + 1))
            else:
                print('')
        return reference_corpus, the_threshold, to_drop


    if type(data) == pandas.core.frame.DataFrame:
        # DataFrame input: compute one keyword Series per subcorpus (row)
        loaded_ref_corpus = turn_input_into_counter(reference_corpus, **kwargs)
        # set threshold
        if threshold:
            loaded_ref_corpus, the_threshold, to_drop = set_threshold_and_remove_under(loaded_ref_corpus, threshold, for_keywords = True)
            # remove under threshold from target corpora
            data = data.drop(to_drop, errors = 'ignore', axis = 1)

        else:
            the_threshold = False

        kwds = []
        for i in list(data.index):
            # this could potentially slow down calculation using saved dicts
            if selfdrop:
                try:
                    loaded_ref_corpus = turn_input_into_counter(reference_corpus.drop(i), **kwargs)
                # if dropping doesn't work, make loaded_ref_corpus without dropping, but only once
                except:
                    try:
                        loaded_ref_corpus
                    except NameError:
                        loaded_ref_corpus = turn_input_into_counter(reference_corpus, **kwargs)
            else:
                loaded_ref_corpus = turn_input_into_counter(reference_corpus, **kwargs)
            

            # NOTE(review): DataFrame.ix was removed in pandas 1.0; this
            # line fails on modern pandas -- .loc is the replacement
            loaded_target_corpus = turn_input_into_counter(data.ix[i], **kwargs)


            ser = keywords_and_ngrams(loaded_target_corpus, loaded_ref_corpus, calc_all = calc_all,
                                   show = 'keywords', **kwargs)
            # turn into series
            ser = pd.Series([s for k, s in ser], index = [k for k, s in ser])
            pd.set_option('display.float_format', lambda x: '%.2f' % x)
            ser.name = i
            kwds.append(ser)
        out = pd.concat(kwds, axis = 1)

    else:
        # single-corpus input: one keyword Series
        if selfdrop and type(reference_corpus) == pandas.core.frame.DataFrame:
            try:
                loaded_ref_corpus = turn_input_into_counter(reference_corpus.drop(data.name), **kwargs)
            except:
                try:
                    loaded_ref_corpus
                except NameError:
                    loaded_ref_corpus = turn_input_into_counter(reference_corpus, **kwargs)
        else:
            loaded_ref_corpus = turn_input_into_counter(reference_corpus, **kwargs)
    
        if threshold:
            loaded_ref_corpus, the_threshold, to_drop = set_threshold_and_remove_under(loaded_ref_corpus, threshold, for_keywords = True)
            # remove under threshold from target corpora
            
            data = data.drop(to_drop, errors = 'ignore')
        else:
            the_threshold = False

        loaded_target_corpus = turn_input_into_counter(data, **kwargs)

        kwds = keywords_and_ngrams(loaded_target_corpus, loaded_ref_corpus, calc_all = calc_all,
                               show = 'keywords', **kwargs)
        # turn into series
        out = pd.Series([s for k, s in kwds], index = [k for k, s in kwds])
        pd.set_option('display.float_format', lambda x: '%.2f' % x)
        out.name = 'keywords'

    # drop infinites and nans
    out = out.replace([np.inf, -np.inf], np.nan)
    out = out.fillna(0.0)
    
    # print and return
    if clear:
        try:
            clear_output()
        except:
            pass
    if printstatus and not editing:
        time = strftime("%H:%M:%S", localtime())
        print('%s: Done! %d results.\n' % (time, len(list(out.index))))

    if n  == 'all':
        n = len(out)

    return out[:n]
Ejemplo n.º 3
0
def ngrams(data,
           clear = True,
           printstatus = True,
           n = 'all',
           calc_all = True,
           blacklist = False,
           split_contractions = True,
           gramsize = 2,
           **kwargs):
    """Feed this function some data and get ngrams.

    A list of what counts as data is available in the
    docstring of datareader().

    :param data: anything accepted by datareader()
    :param clear: clear IPython output before printing results
    :param printstatus: print progress messages
    :param n: number of results to return, or 'all' for every result
    :param calc_all: forwarded to keywords_and_ngrams()
    :param blacklist: True for the default stopword list, or a custom
        list of words to exclude
    :param split_contractions: if False, rejoin tokenised contraction
        parts before counting
    :param gramsize: length of each ngram
    :returns: pandas.Series of ngram scores, named 'ngrams'
    """

    import re
    import time
    from time import localtime, strftime

    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass

    # build the lowercased exclusion list, if requested
    stopwords = False
    if blacklist is not False:
        if blacklist is True:
            from dictionaries.stopwords import stopwords as my_stopwords
            stopwords = [i.lower() for i in my_stopwords]
        else:
            stopwords = [i.lower() for i in blacklist]

    from keys import keywords_and_ngrams, turn_input_into_counter
    from process import datareader

    if printstatus:
        thetime = strftime("%H:%M:%S", localtime())
        print("\n%s: Generating ngrams... \n" % thetime)

    good = datareader(data, **kwargs)

    # keep only tokens containing at least one word-ish character
    regex_nonword_filter = re.compile(r"[0-9A-Za-z-\']")
    good = [i for i in good if re.search(regex_nonword_filter, i)]

    def unsplitter(lst):
        """Rejoin contraction parts split by the tokeniser.

        NOTE(review): the first and last tokens are always kept as-is,
        so contractions at the very edges are not rejoined -- confirm
        whether that is acceptable for callers.
        """
        unsplit = []
        for index, t in enumerate(lst):
            if index == 0 or index == len(lst) - 1:
                unsplit.append(t)
                continue
            if "'" in t and not t.endswith("'"):
                # glue this fragment onto the previous token
                rejoined = ''.join([lst[index - 1], t])
                unsplit.append(rejoined)
            else:
                # skip tokens whose successor is a contraction fragment:
                # they will be emitted as part of the rejoined form
                if not "'" in lst[index + 1]:
                    unsplit.append(t)
        return unsplit

    if not split_contractions:
        good = unsplitter(good)

    ngrams = keywords_and_ngrams(good, stopwords = stopwords,
                                 calc_all = calc_all, show = 'ngrams', gramsize = gramsize, **kwargs)

    import pandas as pd
    out = pd.Series([s for k, s in ngrams], index = [k for k, s in ngrams])
    out.name = 'ngrams'

    # print and return
    if clear:
        try:
            clear_output()
        except Exception:
            pass
    if printstatus:
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Done! %d results.\n' % (thetime, len(list(out.index))))

    # BUG FIX: n was previously overwritten with 99999 before this
    # check, making it unreachable and silently capping 'all' at
    # 99999 results; 'all' now returns everything
    if n == 'all':
        n = len(out)

    return out[:n]
Ejemplo n.º 4
0
def keywords(data,
             reference_corpus='bnc.p',
             clear=True,
             printstatus=True,
             n='all',
             threshold=False,
             selfdrop=True,
             editing=False,
             calc_all=True,
             **kwargs):
    """Feed this function some data and get its keywords.

    Keyness is computed by keywords_and_ngrams(), comparing `data`
    against `reference_corpus`. Returns a pandas Series (single
    corpus) or, when `data` is a DataFrame, a DataFrame with one
    keyword column per subcorpus (row of `data`).

    :param data: target corpus -- DataFrame, Series, or anything
        turn_input_into_counter() accepts
    :param reference_corpus: reference corpus, or 'self' to compare
        the data against a copy of itself
    :param clear: clear IPython output before printing results
    :param printstatus: print progress messages
    :param n: number of results to return, or 'all'
    :param threshold: minimum frequency in the reference corpus:
        False (off), a number, or 'low'/'medium'/'high'
    :param selfdrop: drop the current subcorpus from the reference
        corpus when their indices are shared
    :param editing: suppress status printing (quiet mode)
    :param calc_all: forwarded to keywords_and_ngrams()
    :raises ValueError: if a string threshold is not recognised
    """

    import re
    import time
    from time import localtime, strftime
    import collections
    import pandas
    import pandas as pd
    import numpy as np
    from collections import Counter

    # display tools are optional: only available inside IPython
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass

    from keys import keywords_and_ngrams, turn_input_into_counter

    # the numeric threshold actually applied (resolved below)
    the_threshold = False

    if type(reference_corpus) == str:
        if reference_corpus == 'self':
            # a Series compared with its own copy yields 0.0 keyness
            # for everything, so warn the user
            if type(data) == pandas.core.series.Series:
                import warnings
                warnings.warn(
                    'Using "self" option with Series as data will result in 0.0 keyness.'
                )
            reference_corpus = data.copy()

        else:
            selfdrop = False

    # turn off selfdrop if df indices aren't shared:
    # this is skipped if loading from file or something
    if type(data) == pandas.core.frame.DataFrame and type(
            reference_corpus) == pandas.core.frame.DataFrame:
        ref_subc = list(reference_corpus.index)
        tg_subc = list(data.index)
        if not all([x in ref_subc for x in tg_subc]):
            selfdrop = False

    if printstatus and not editing:
        time = strftime("%H:%M:%S", localtime())
        print("\n%s: Generating keywords ...\n" % time)

    def set_threshold_and_remove_under(reference_corpus,
                                       threshold,
                                       for_keywords=False):
        """Resolve `threshold` to a number and drop reference entries
        below it. Returns (reference_corpus, the_threshold, to_drop)."""
        if type(threshold) == str:
            # named thresholds are a fraction of the reference total:
            # 'l(ow)' -> 1/10000, 'm(edium)' -> 1/5000, 'h(igh)' -> 1/2500
            denominators = {'l': 10000, 'm': 5000, 'h': 2500}
            try:
                denominator = denominators[threshold[:1]]
            except KeyError:
                # BUG FIX: an unrecognised string previously left
                # `denominator` unbound and crashed with NameError;
                # fail fast with a clear message instead
                raise ValueError(
                    "threshold must be a number or a string starting "
                    "with 'l', 'm' or 'h', not %r" % threshold)
            if for_keywords:
                denominator = denominator * 5

            tot = sum(reference_corpus.values())

            the_threshold = float(tot) / float(denominator)

        else:
            the_threshold = threshold
        if printstatus:
            print('Threshold: %d\n' % the_threshold)

        # drop infrequent words from keywording
        to_drop = []
        for w, v in list(reference_corpus.items()):
            if v < the_threshold:
                to_drop.append(w)
                if calc_all:
                    del reference_corpus[w]

        if printstatus:
            # show the first five (and, for long lists, last five) drops
            to_show = [w for w in to_drop[:5]]
            if len(to_drop) > 10:
                to_show.append('...')
                to_show.extend(to_drop[-5:])
            if len(to_drop) > 0:
                print('Removing %d entries below threshold:\n    %s' %
                      (len(to_drop), '\n    '.join(to_show)))
            if len(to_drop) > 10:
                print('... and %d more ... \n' %
                      (len(to_drop) - len(to_show) + 1))
            else:
                print('')
        return reference_corpus, the_threshold, to_drop

    if type(data) == pandas.core.frame.DataFrame:
        # DataFrame input: compute one keyword Series per subcorpus (row)
        loaded_ref_corpus = turn_input_into_counter(reference_corpus, **kwargs)
        # set threshold
        if threshold:
            loaded_ref_corpus, the_threshold, to_drop = set_threshold_and_remove_under(
                loaded_ref_corpus, threshold, for_keywords=True)
            # remove under threshold from target corpora
            data = data.drop(to_drop, errors='ignore', axis=1)

        else:
            the_threshold = False

        kwds = []
        for i in list(data.index):
            # this could potentially slow down calculation using saved dicts
            if selfdrop:
                try:
                    loaded_ref_corpus = turn_input_into_counter(
                        reference_corpus.drop(i), **kwargs)
                # if dropping doesn't work, make loaded_ref_corpus without dropping, but only once
                except Exception:
                    try:
                        loaded_ref_corpus
                    except NameError:
                        loaded_ref_corpus = turn_input_into_counter(
                            reference_corpus, **kwargs)
            else:
                loaded_ref_corpus = turn_input_into_counter(
                    reference_corpus, **kwargs)

            # BUG FIX: DataFrame.ix was removed in pandas 1.0; use the
            # label-based .loc indexer instead
            loaded_target_corpus = turn_input_into_counter(
                data.loc[i], **kwargs)

            ser = keywords_and_ngrams(loaded_target_corpus,
                                      loaded_ref_corpus,
                                      calc_all=calc_all,
                                      show='keywords',
                                      **kwargs)
            # turn into series
            ser = pd.Series([s for k, s in ser], index=[k for k, s in ser])
            pd.set_option('display.float_format', lambda x: '%.2f' % x)
            ser.name = i
            kwds.append(ser)
        out = pd.concat(kwds, axis=1)

    else:
        # single-corpus input: one keyword Series
        if selfdrop and type(reference_corpus) == pandas.core.frame.DataFrame:
            try:
                loaded_ref_corpus = turn_input_into_counter(
                    reference_corpus.drop(data.name), **kwargs)
            except Exception:
                try:
                    loaded_ref_corpus
                except NameError:
                    loaded_ref_corpus = turn_input_into_counter(
                        reference_corpus, **kwargs)
        else:
            loaded_ref_corpus = turn_input_into_counter(
                reference_corpus, **kwargs)

        if threshold:
            loaded_ref_corpus, the_threshold, to_drop = set_threshold_and_remove_under(
                loaded_ref_corpus, threshold, for_keywords=True)
            # remove under threshold from target corpora
            data = data.drop(to_drop, errors='ignore')
        else:
            the_threshold = False

        loaded_target_corpus = turn_input_into_counter(data, **kwargs)

        kwds = keywords_and_ngrams(loaded_target_corpus,
                                   loaded_ref_corpus,
                                   calc_all=calc_all,
                                   show='keywords',
                                   **kwargs)
        # turn into series
        out = pd.Series([s for k, s in kwds], index=[k for k, s in kwds])
        pd.set_option('display.float_format', lambda x: '%.2f' % x)
        out.name = 'keywords'

    # drop infinites and nans
    out = out.replace([np.inf, -np.inf], np.nan)
    out = out.fillna(0.0)

    # print and return
    if clear:
        try:
            clear_output()
        except Exception:
            pass
    if printstatus and not editing:
        time = strftime("%H:%M:%S", localtime())
        print('%s: Done! %d results.\n' % (time, len(list(out.index))))

    if n == 'all':
        n = len(out)

    return out[:n]
Ejemplo n.º 5
0
def ngrams(data,
           clear=True,
           printstatus=True,
           n='all',
           calc_all=True,
           blacklist=False,
           split_contractions=True,
           gramsize=2,
           **kwargs):
    """Feed this function some data and get ngrams.

    A list of what counts as data is available in the
    docstring of datareader().

    :param data: anything accepted by datareader()
    :param clear: clear IPython output before printing results
    :param printstatus: print progress messages
    :param n: number of results to return, or 'all' for every result
    :param calc_all: forwarded to keywords_and_ngrams()
    :param blacklist: True for the default stopword list, or a custom
        list of words to exclude
    :param split_contractions: if False, rejoin tokenised contraction
        parts before counting
    :param gramsize: length of each ngram
    :returns: pandas.Series of ngram scores, named 'ngrams'
    """

    import re
    import time
    from time import localtime, strftime

    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass

    # build the lowercased exclusion list, if requested
    stopwords = False
    if blacklist is not False:
        if blacklist is True:
            from dictionaries.stopwords import stopwords as my_stopwords
            stopwords = [i.lower() for i in my_stopwords]
        else:
            stopwords = [i.lower() for i in blacklist]

    from keys import keywords_and_ngrams, turn_input_into_counter
    from process import datareader

    if printstatus:
        thetime = strftime("%H:%M:%S", localtime())
        print("\n%s: Generating ngrams... \n" % thetime)

    good = datareader(data, **kwargs)

    # keep only tokens containing at least one word-ish character
    regex_nonword_filter = re.compile(r"[0-9A-Za-z-\']")
    good = [i for i in good if re.search(regex_nonword_filter, i)]

    def unsplitter(lst):
        """Rejoin contraction parts split by the tokeniser.

        NOTE(review): the first and last tokens are always kept as-is,
        so contractions at the very edges are not rejoined -- confirm
        whether that is acceptable for callers.
        """
        unsplit = []
        for index, t in enumerate(lst):
            if index == 0 or index == len(lst) - 1:
                unsplit.append(t)
                continue
            if "'" in t and not t.endswith("'"):
                # glue this fragment onto the previous token
                rejoined = ''.join([lst[index - 1], t])
                unsplit.append(rejoined)
            else:
                # skip tokens whose successor is a contraction fragment:
                # they will be emitted as part of the rejoined form
                if not "'" in lst[index + 1]:
                    unsplit.append(t)
        return unsplit

    if not split_contractions:
        good = unsplitter(good)

    ngrams = keywords_and_ngrams(good,
                                 stopwords=stopwords,
                                 calc_all=calc_all,
                                 show='ngrams',
                                 gramsize=gramsize,
                                 **kwargs)

    import pandas as pd
    out = pd.Series([s for k, s in ngrams], index=[k for k, s in ngrams])
    out.name = 'ngrams'

    # print and return
    if clear:
        try:
            clear_output()
        except Exception:
            pass
    if printstatus:
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Done! %d results.\n' % (thetime, len(list(out.index))))

    # BUG FIX: n was previously overwritten with 99999 before this
    # check, making it unreachable and silently capping 'all' at
    # 99999 results; 'all' now returns everything
    if n == 'all':
        n = len(out)

    return out[:n]