Example #1
def downloader(url_list, new_path='html', wait=5):
    """download a bunch of urls and store in a local folder"""
    import corpkit
    import urllib.request, urllib.parse, urllib.error
    import time
    import os
    from time import localtime, strftime
    from textprogressbar import TextProgressBar
    thetime = strftime("%H:%M:%S", localtime())
    print(
        "\n%s: Attempting to download %d URLs with %d seconds wait-time ... \n"
        % (thetime, len(url_list), wait))
    p = TextProgressBar(len(url_list))
    if not os.path.exists(new_path):
        os.makedirs(new_path)
    paths = []
    for index, url in enumerate(url_list):
        p.animate(index)
        base = os.path.basename(url)
        new_filename = os.path.join(new_path, base)
        paths.append(new_filename)
        urllib.request.urlretrieve(url, new_filename)
        time.sleep(wait)
    p.animate(len(url_list))
    num_downloaded = len(paths)
    thetime = strftime("%H:%M:%S", localtime())
    print('\n\n%s: Done! %d files downloaded.' % (thetime, num_downloaded))
    return paths
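
A minimal usage sketch (the URLs and folder below are hypothetical; the function only assumes each URL ends in a filename that os.path.basename can extract):

urls = ['http://example.com/a.html',
        'http://example.com/b.html']
saved = downloader(urls, new_path='html', wait=2)
# saved -> ['html/a.html', 'html/b.html']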
Example #2
def animator(progbar,
             count,
             tot_string=False,
             linenum=False,
             terminal=False,
             init=False,
             length=False,
             **kwargs):
    """
    Animates progress bar in unique position in terminal
    Multiple progress bars not supported in jupyter yet.
    """

    # add startnum
    start_at = kwargs.get('startnum', 0)
    if start_at is None:
        start_at = 0.0
    denominator = kwargs.get('denom', 1)
    if kwargs.get('note'):
        if count is None:
            perc_done = 0.0
        else:
            perc_done = (count * 100.0 / float(length)) / float(denominator)
        kwargs['note'].progvar.set(start_at + perc_done)
        kwargs['root'].update()
        return

    if init:
        from textprogressbar import TextProgressBar
        return TextProgressBar(length, dirname=tot_string)
    # deliberate exact type check: linenum defaults to False, and
    # isinstance(False, int) is True because bool subclasses int
    if type(linenum) == int:
        # this try is for Sublime Text nosetests, which don't pass a real
        # terminal object, so terminal.location raises AttributeError
        try:
            with terminal.location(0, terminal.height - (linenum + 1)):
                if tot_string:
                    progbar.animate(count, tot_string)
                else:
                    progbar.animate(count)
        except AttributeError:
            pass
    else:
        if tot_string:
            progbar.animate(count, tot_string)
        else:
            progbar.animate(count)
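
A sketch of the intended call pattern, inferred from the init branch (the total of 50 is made up; textprogressbar must be importable):

total = 50
progbar = animator(None, 0, tot_string='my corpus', init=True, length=total)
for i in range(total):
    animator(progbar, i)  # redraw the bar at position i
animator(progbar, total)  # finish at 100%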
Example #3
def dictmaker(path,
              dictname,
              query='any',
              dictpath='data/dictionaries',
              lemmatise=False,
              just_content_words=False,
              use_dependencies=False):
    """makes a pickle wordlist named dictname in dictpath"""
    import corpkit
    import os
    import pickle
    import re
    import nltk
    from time import localtime, strftime
    from io import StringIO
    import shutil
    from collections import Counter
    from textprogressbar import TextProgressBar
    from process import tregex_engine
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass
    # detect whether we are inside IPython: calling getoutput() with no
    # arguments raises TypeError there, and NameError everywhere else
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False

    if lemmatise:
        dictname = dictname + '-lemmatised'
    if not dictname.endswith('.p'):
        dictname = dictname + '.p'

    # allow direct passing of dirs
    path_is_list = False
    one_big_corpus = False
    if isinstance(path, str):
        sorted_dirs = [
            d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))
        ]
        # if no subcorpora, just do the dir passed in
        if len(sorted_dirs) == 0:
            one_big_corpus = True
            sorted_dirs = [path]
    elif isinstance(path, list):
        path_is_list = True
        sorted_dirs = sorted(path)
        if isinstance(sorted_dirs[0], int):
            sorted_dirs = [str(d) for d in sorted_dirs]

    try:
        sorted_dirs.sort(key=int)
    except ValueError:
        # subcorpus names aren't numeric; keep alphabetical order
        pass
    try:
        if not os.path.exists(dictpath):
            os.makedirs(dictpath)
    except IOError:
        print("Error making " + dictpath + "/ directory.")
    while os.path.isfile(os.path.join(dictpath, dictname)):
        time = strftime("%H:%M:%S", localtime())
        selection = input('\n%s: %s already exists in %s.\n' \
               '          You have the following options:\n\n' \
               '              a) save with a new name\n' \
               '              b) delete %s\n' \
               '              c) exit\n\nYour selection: ' % (time, dictname, dictpath, os.path.join(dictpath, dictname)))
        if 'a' in selection:
            sel = input('\nNew save name: ')
            dictname = sel
            if lemmatise:
                dictname = dictname.replace('-lemmatised.p', '')
                dictname = dictname + '-lemmatised'
            if not dictname.endswith('.p'):
                dictname = dictname + '.p'
        elif 'b' in selection:
            os.remove(os.path.join(dictpath, dictname))
        elif 'c' in selection:
            print('')
            return
        else:
            print('          Choice "%s" not recognised.' % selection)

    time = strftime("%H:%M:%S", localtime())
    print('\n%s: Extracting words from files ... \n' % time)

    # all this just to get a list of files and make a better progress bar
    if use_dependencies:
        counts = []
        for d in sorted_dirs:
            if not one_big_corpus:
                subcorpus = os.path.join(path, d)
            else:
                subcorpus = path
            # dependency mode reads CoreNLP XML output
            files = [f for f in os.listdir(subcorpus) if f.endswith('.xml')]
            counts.append(len(files))
        num_files = sum(counts)
        c = 0
        p = TextProgressBar(num_files)
    else:
        p = TextProgressBar(len(sorted_dirs))

    def tokener(xmldata):
        """get each word from CoreNLP XML, with optional lemmatisation"""
        import corpkit
        from bs4 import BeautifulSoup, SoupStrainer
        import gc
        open_classes = ['N', 'V', 'R', 'J']
        result = []
        # only parse <tokens> elements, for speed
        just_good_deps = SoupStrainer('tokens')
        # 'lxml-xml' assumes lxml is installed, as bs4 recommends for XML
        soup = BeautifulSoup(xmldata, 'lxml-xml', parse_only=just_good_deps)
        for token in soup.find_all('token'):
            word = token.word.text
            # use the compiled query from the enclosing scope
            if re.search(query, word):
                if lemmatise:
                    word = token.lemma.text
                    if just_content_words:
                        if not token.pos.text[0] in open_classes:
                            continue
                result.append(word)
        # attempt to stop memory problems.
        # not sure if this helps, though:
        soup.decompose()
        soup = None
        xmldata = None
        gc.collect()
        return result

    # translate the 'any' query for the relevant backend: a regex for
    # dependency XML, a Tregex pattern for parse trees
    if query == 'any':
        if use_dependencies:
            query = r'.*'
        elif lemmatise:
            query = r'__ <# (__ !< __)'
        else:
            query = r'__ !< __'

    if lemmatise:
        options = ['-o']
    else:
        options = ['-t', '-o']

    if use_dependencies:
        query = re.compile(query)

    allwords = []

    for index, d in enumerate(sorted_dirs):
        if not use_dependencies:
            p.animate(index)
        if not path_is_list:
            if len(sorted_dirs) == 1:
                subcorp = d
            else:
                subcorp = os.path.join(path, d)
        else:
            subcorp = d

        # check query first time through
        if not use_dependencies:
            if index == 0:
                trees_found = tregex_engine(corpus=subcorp,
                                            check_for_trees=True)
                if not trees_found:
                    lemmatise = False
                    dictname = dictname.replace('-lemmatised', '')
            if trees_found:
                results = tregex_engine(corpus=subcorp,
                                        options=options,
                                        query=query,
                                        lemmatise=lemmatise,
                                        just_content_words=just_content_words)

                for result in results:
                    allwords.append(result)

        elif use_dependencies:
            regex_nonword_filter = re.compile("[A-Za-z]")
            fs = [os.path.join(subcorp, f) for f in os.listdir(subcorp)]
            for f in fs:
                p.animate(c, str(c) + '/' + str(num_files))
                c += 1
                with open(f) as fo:
                    data = fo.read()
                result_from_a_file = tokener(data)
                for w in result_from_a_file:
                    if re.search(regex_nonword_filter, w):
                        allwords.append(w.lower())

        if not use_dependencies:
            if not trees_found:
                # no parse trees: fall back to plain-text sentence and
                # word tokenisation with NLTK
                sent_tokenizer = nltk.data.load(
                    'tokenizers/punkt/english.pickle')
                for f in os.listdir(subcorp):
                    with open(os.path.join(subcorp, f),
                              encoding='utf-8', errors='ignore') as fo:
                        raw = fo.read()
                    sents = sent_tokenizer.tokenize(raw)
                    tokenized_sents = [nltk.word_tokenize(i) for i in sents]
                    for sent in tokenized_sents:
                        for w in sent:
                            allwords.append(w.lower())

    #100%
    p.animate(len(sorted_dirs))

    # make a dict
    dictionary = Counter(allwords)

    with open(os.path.join(dictpath, dictname), 'wb') as handle:
        pickle.dump(dictionary, handle)
    time = strftime("%H:%M:%S", localtime())
    print('\n\n%s: Done! %s created in %s/' % (time, dictname, dictpath))
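
A hypothetical invocation (the corpus path is made up; it assumes a directory of subcorpora as handled above and a working process.tregex_engine):

dictmaker('data/mycorpus', 'mydict', lemmatise=True)
# writes data/dictionaries/mydict-lemmatised.p, a pickled collections.Counter:
import pickle
with open('data/dictionaries/mydict-lemmatised.p', 'rb') as f:
    wordcounts = pickle.load(f)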