Example #1
def downloader(url_list, new_path = 'html', wait = 5):
    """download a bunch of urls and store in a local folder"""
    import corpkit
    import urllib
    import time
    import os
    from time import localtime, strftime
    from textprogressbar import TextProgressBar
    thetime = strftime("%H:%M:%S", localtime())
    print "\n%s: Attempting to download %d URLs with %d seconds wait-time ... \n" % (thetime, len(url_list), wait)
    p = TextProgressBar(len(url_list))
    if not os.path.exists(new_path):
        os.makedirs(new_path)
    paths = []
    for index, url in enumerate(url_list):
        p.animate(index)
        base = os.path.basename(url)
        new_filename = os.path.join(new_path, base)
        paths.append(new_filename)
        urllib.urlretrieve(url, new_filename)
        time.sleep(wait)
    p.animate(len(url_list))
    num_downloaded = len(paths)
    thetime = strftime("%H:%M:%S", localtime())
    print '\n\n%s: Done! %d files downloaded.' % (thetime, num_downloaded)
    return paths
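
A minimal usage sketch (hedged: the URLs and output folder below are placeholders, not part of the original example):

# hypothetical usage -- these URLs are placeholders
urls_to_get = ['http://example.com/a.html', 'http://example.com/b.html']
saved_paths = downloader(urls_to_get, new_path='html', wait=5)
# saved_paths would then be ['html/a.html', 'html/b.html']
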
Example #2
def downloader(url_list, new_path='html', wait=5):
    """download a bunch of urls and store in a local folder"""
    import corpkit
    import urllib.request, urllib.parse, urllib.error
    import time
    import os
    from time import localtime, strftime
    from textprogressbar import TextProgressBar
    thetime = strftime("%H:%M:%S", localtime())
    print(
        "\n%s: Attempting to download %d URLs with %d seconds wait-time ... \n"
        % (thetime, len(url_list), wait))
    p = TextProgressBar(len(url_list))
    if not os.path.exists(new_path):
        os.makedirs(new_path)
    paths = []
    for index, url in enumerate(url_list):
        p.animate(index)
        base = os.path.basename(url)
        new_filename = os.path.join(new_path, base)
        paths.append(new_filename)
        urllib.request.urlretrieve(url, new_filename)
        time.sleep(wait)
    p.animate(len(url_list))
    num_downloaded = len(paths)
    thetime = strftime("%H:%M:%S", localtime())
    print('\n\n%s: Done! %d files downloaded.' % (thetime, num_downloaded))
    return paths
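
The Python 3 variant is called the same way; a quick sketch (placeholder URLs again) that also checks the files landed on disk:

# hypothetical usage -- these URLs are placeholders
import os
urls_to_get = ['http://example.com/a.html', 'http://example.com/b.html']
for saved in downloader(urls_to_get, wait=2):
    print(saved, os.path.isfile(saved))
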
Example #3
def dictmaker(path, 
              dictname,
              query = 'any',
              dictpath = 'data/dictionaries',
              lemmatise = False,
              just_content_words = False,
              use_dependencies = False):
    """makes a pickle wordlist named dictname in dictpath"""
    import corpkit
    import os
    import pickle
    import re
    import nltk
    from time import localtime, strftime
    from StringIO import StringIO
    import shutil
    from collections import Counter
    from textprogressbar import TextProgressBar
    from corpkit.process import tregex_engine
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False
    
    if lemmatise:
        dictname = dictname + '-lemmatised'
    if not dictname.endswith('.p'):
        dictname = dictname + '.p'

    # allow direct passing of dirs
    path_is_list = False
    one_big_corpus = False
    if type(path) == str:
        sorted_dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path,d))]
        # if no subcorpora, just do the dir passed in
        if len(sorted_dirs) == 0:
            one_big_corpus = True
            sorted_dirs = [path]
    elif type(path) == list:
        path_is_list = True
        sorted_dirs = sorted(path)
        if type(sorted_dirs[0]) == int:
            sorted_dirs = [str(d) for d in sorted_dirs]

    try:
        sorted_dirs.sort(key=int)
    except:
        pass
    try:
        if not os.path.exists(dictpath):
            os.makedirs(dictpath)
    except OSError:
        print "Error making " + dictpath + "/ directory."
    while os.path.isfile(os.path.join(dictpath, dictname)):
        time = strftime("%H:%M:%S", localtime())
        selection = raw_input('\n%s: %s already exists in %s.\n' \
               '          You have the following options:\n\n' \
               '              a) save with a new name\n' \
               '              b) delete %s\n' \
               '              c) exit\n\nYour selection: ' % (time, dictname, dictpath, os.path.join(dictpath, dictname)))
        if 'a' in selection:
            sel = raw_input('\nNew save name: ')
            dictname = sel
            if lemmatise:
                dictname = dictname.replace('-lemmatised.p', '')
                dictname = dictname + '-lemmatised'
            if not dictname.endswith('.p'):
                dictname = dictname + '.p'
        elif 'b' in selection:
            os.remove(os.path.join(dictpath, dictname))
        elif 'c' in selection:
            print ''
            return
        else:
            as_str = str(selection)
            print '          Choice "%s" not recognised.' % selection

    time = strftime("%H:%M:%S", localtime())
    print '\n%s: Extracting words from files ... \n' % time

    # all this just to get a list of files and make a better progress bar
    if use_dependencies:
        counts = []
        for d in sorted_dirs:
            if not one_big_corpus:
                subcorpus = os.path.join(path, d)
            else:
                subcorpus = path
            if use_dependencies:
                files = [f for f in os.listdir(subcorpus) if f.endswith('.xml')]
            else:
                files = [f for f in os.listdir(subcorpus)]
            counts.append(len(files))
        num_files = sum(counts)
        c = 0
        p = TextProgressBar(num_files)
    else:
        p = TextProgressBar(len(sorted_dirs))

    def tokener(xmldata):
        """return the word (or lemma) of each token in some CoreNLP XML"""
        import corpkit
        from bs4 import BeautifulSoup, SoupStrainer
        import gc
        open_classes = ['N', 'V', 'R', 'J']
        result = []
        just_good_deps = SoupStrainer('tokens')
        soup = BeautifulSoup(xmldata, parse_only=just_good_deps)   
        for token in soup.find_all('token'):
            word = token.word.text
            query = re.compile(r'.*')
            if re.search(query, word):
                if lemmatise:
                    word = token.lemma.text
                    if just_content_words:
                        if not token.pos.text[0] in open_classes:
                            continue        
                result.append(word)
        # attempt to stop memory problems. 
        # not sure if this helps, though:
        soup.decompose()
        soup = None
        data = None
        gc.collect()
        return result
    
    # translate 'any' query
    if query == 'any':
        if lemmatise:
            query = r'__ <# (__ !< __)'
        else:
            query = r'__ !< __'
    
    if lemmatise:
        options = ['-o']
    else:
        options = ['-t', '-o']
    
    if use_dependencies:
        from bs4 import BeautifulSoup, SoupStrainer
        if query == 'any':
            query = r'.*'
        query = re.compile(query)

    allwords = []

    for index, d in enumerate(sorted_dirs):
        if not use_dependencies:
            p.animate(index)
        if not path_is_list:
            if len(sorted_dirs) == 1:
                subcorp = d
            else:
                subcorp = os.path.join(path, d)
        else:
            subcorp = d

        # check query first time through    
        if not use_dependencies:
            if index == 0:
                trees_found = tregex_engine(corpus = subcorp, check_for_trees = True)
                if not trees_found:
                    lemmatise = False
                    dictname = dictname.replace('-lemmatised', '')
            if trees_found:
                results = tregex_engine(corpus = subcorp, options = options, query = query, 
                                        lemmatise = lemmatise,
                                        just_content_words = just_content_words)

                for result in results:
                    allwords.append(result)  

        elif use_dependencies:
            regex_nonword_filter = re.compile("[A-Za-z]")
            results = []
            fs = [os.path.join(subcorp, f) for f in os.listdir(subcorp)]
            for f in fs:
                p.animate(c, str(c) + '/' + str(num_files))
                c += 1
                data = open(f).read()
                result_from_a_file = tokener(data)
                for w in result_from_a_file:
                    if re.search(regex_nonword_filter, w):
                        allwords.append(w.lower())

        if not use_dependencies:
            if not trees_found:
                for f in os.listdir(subcorp):
                    raw = unicode(open(os.path.join(subcorp, f)).read(), 'utf-8', errors = 'ignore')
                    sent_tokenizer=nltk.data.load('tokenizers/punkt/english.pickle')
                    sents = sent_tokenizer.tokenize(raw)
                    tokenized_sents = [nltk.word_tokenize(i) for i in sents]
                    for sent in tokenized_sents:
                        for w in sent:
                            allwords.append(w.lower()) 

    #100%
    p.animate(len(sorted_dirs))
    
    # make a dict
    dictionary = Counter(allwords)

    with open(os.path.join(dictpath, dictname), 'wb') as handle:
        pickle.dump(dictionary, handle)
    time = strftime("%H:%M:%S", localtime())
    print '\n\n' + time + ': Done! ' + dictname + ' created in ' + dictpath + '/'
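
A hedged usage sketch, assuming a corpus directory of subcorpus folders; the paths below are placeholders:

# hypothetical usage -- 'data/corpus' is a placeholder path
dictmaker('data/corpus', 'mycorpus')
# the saved Counter can be loaded back later:
import pickle
with open('data/dictionaries/mycorpus.p', 'rb') as handle:
    wordcounts = pickle.load(handle)
print(wordcounts.most_common(10))
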
Example #4
def eugener(path, 
            query, 
            depth = 5, 
            top = 20, 
            lemmatise = False,
            just_content_words = False,
            remove_query_from_output = False,
            remove_zero_depth = False,
            return_tags = False):
    """ 
    ***This is probably broken now, can fix if there's a use for it.***
    
    get most frequent words in corpus path to left and right of query regex

    path: path to corpus containing subcorpora
    query: regex to match word to be zero depth
    depth: number of places left and right to look
    top: number of most frequent entries to return
    lemmatise: wordnet lemmatisation
    just_content_words: keep only n, v, a, r tagged words
    remove_query_from_output: remove words matching the query regex from the output
    remove_zero_depth: drop the zero-depth (query match) row from the results
    return_tags: return POS tags rather than wordforms
    """
    import os
    import nltk
    import re
    from collections import Counter
    import pandas as pd
    from textprogressbar import TextProgressBar
    from other import tregex_engine

    # manual lemmatisation here:
    from dictionaries.word_transforms import wordlist
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
        from IPython.display import display, clear_output
    except NameError:
        import subprocess
        have_ipython = False
    from tests import check_dit # probably never needed
    
    if lemmatise:
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr=WordNetLemmatizer()

    regex = re.compile(query)
    wordregex = re.compile('[A-Za-z0-9]')

    print ''

    # get list of subcorpora
    dirs = [d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))]
    sorted_dirs = sorted(dirs)
    # define risk word
    # place for our output
    dfs = {}
    p = TextProgressBar(len(sorted_dirs))
    for index, corpus in enumerate(sorted_dirs):
        p.animate(index)
        # search the corpus for whole sents containing risk word
        subcorpus = os.path.join(path, corpus)
        if lemmatise:
            query = r'__ <# (__ !< __)'
        else:
            query = r'__ !> __'
        results = tregex_engine(query, ['-o'], subcorpus, 
                                lemmatise = lemmatise, 
                                just_content_words = just_content_words)

        # lowercase
        processed = [(r.lower(), tag) for r, tag in results]

        # remove punct
        processed = [w for w in processed if re.search(wordregex, w[0])]

        # a place for info about each corpus
        # word list to use later
        all_words = []
        dicts = []

        # go left and right depth times (for 2, makes [-2, -1, 0, 1, 2])
        for i in range(-depth, (depth + 1)):
            newdict = Counter()
            matching = []
            # go through each token
            for index, (token, tag) in enumerate(processed):
                # if token matches risk expression
                if re.search(regex, token):
                    # get the word at depth index
                    # try statement for cases where the target word index isn't there
                    try:
                        if i < 0:
                            num = index - abs(i)
                            if return_tags:
                                matching.append(processed[num][1])
                            else:
                                matching.append(processed[num][0])
                        else:
                            if return_tags:
                                matching.append(processed[index + i][1])
                            else:
                                matching.append(processed[index + i][0])
                    except:
                        pass
            # tally results
            counted = Counter(matching)
            # remove punctuation etc
            for key in counted:
                # commented because this stuff was moved earlier.
                #if key.isalnum():
                    #if key not in stopwords:
                    #if remove_stopwords:
                        #if key not in stopwords:
                            #newdict[key] = counted[key]
                    #else:
                        #newdict[key] = counted[key]
                newdict[key] = counted[key]
            for w in counted.keys():
                all_words.append(w)
            #top_tokens = newdict.most_common(top)
            dicts.append(newdict)
        
        # make pandas series
        sers = []
        # for each unique word
        for word in list(set(all_words)):
            #get counts for each depth
            series = [dct[word] for dct in dicts]
            # add a total
            series.append(sum([dct[word] for dct in dicts]))
            #make index names for depths plus total
            index_names = range(-depth, (depth + 1))
            index_names.append('Total')
            # turn into pandas data, and name the series the word
            ser = pd.Series(series, index = index_names)
            ser.name = word
            sers.append(ser)
        
        # concatenate series into dataframe
        df = pd.concat(sers, axis=1)

        # sort by total
        tot = df.ix['Total']
        df = df[tot.argsort()[::-1]]

        # remove words matching the regex if need be
        if remove_query_from_output:
            cols = [c for c in list(df.columns) if not re.search(regex, c)]
            df = pd.DataFrame(df[cols])
        # remove zero depth if need be
        if remove_zero_depth:
            df = df.drop(0, axis = 0)

        # just top entries
        df = pd.DataFrame(df[list(df.columns)[:top]])
        
        #transpose
        dfs[corpus] = df.T

    # complete animation, then clear
    p.animate(len(sorted_dirs))
    if have_ipython:
        clear_output()

    # some settings for good display
    pd.set_option('display.max_columns', 500)
    pd.set_option('display.width', 1000)
    pd.set_option('expand_frame_repr', False)
    pd.set_option('colheader_justify', 'right')

    # print the start of each frame, then return them all
    for item in sorted(dfs):
        print item, '\n', dfs[item].head(), '\n'
    return dfs
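
For reference, a call might look like the sketch below, though as the docstring warns the function is probably broken; the corpus path and regex are placeholders:

# hypothetical call -- path and regex are placeholders
dfs = eugener('data/corpus', r'(?i)\brisk', depth=3, top=10)
# dfs maps each subcorpus name to a DataFrame of the most frequent
# words at each position to the left and right of the matched word
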
Example #5
def download_large_file(proj_path, url, actually_download = True, root = False, **kwargs):
    """download something to proj_path"""
    import corpkit
    import os
    import urllib2
    from time import localtime, strftime
    from textprogressbar import TextProgressBar
    file_name = url.split('/')[-1]
    home = os.path.expanduser("~")
    if 'stanford' in url:
        downloaded_dir = os.path.join(home, 'corenlp')
    else:
        downloaded_dir = os.path.join(proj_path, 'temp')
    fullfile = os.path.join(downloaded_dir, file_name)
    try:
        os.makedirs(downloaded_dir)
    except OSError:
        if 'stanford-corenlp-full-2015-04-20.zip' in os.listdir(downloaded_dir):
            import zipfile
            the_zip_file = zipfile.ZipFile(fullfile)
            ret = the_zip_file.testzip()
            if ret is None:
                return downloaded_dir, fullfile
            else:
                os.remove(fullfile)
    
    if actually_download:
        try:
            u = urllib2.urlopen(url)
            f = open(fullfile, 'wb')
            meta = u.info()
            file_size = int(meta.getheaders("Content-Length")[0])
            if root:
                root.update()
            if 'note' in kwargs.keys():
                kwargs['note'].progvar.set(0)
            else:
                p = TextProgressBar(int(file_size))
            from time import localtime, strftime
            thetime = strftime("%H:%M:%S", localtime())
            print '%s: Downloading ... ' % thetime
            file_size_dl = 0
            block_sz = 8192
            while True:
                buffer = u.read(block_sz)
                if not buffer:
                    break
                file_size_dl += len(buffer)
                f.write(buffer)
                #status = r"%10d  [%3.2f%%]" % (file_size_dl, file_size_dl * 100. / file_size)
                #status = status + chr(8)*(len(status)+1)
                if 'note' in kwargs.keys():
                    kwargs['note'].progvar.set(file_size_dl * 100.0 / int(file_size))
                else:
                    p.animate(file_size_dl)
                if root:
                    root.update()
            if 'note' in kwargs.keys():  
                kwargs['note'].progvar.set(100)
            else:    
                p.animate(int(file_size))
        except:
            time = strftime("%H:%M:%S", localtime())
            print '%s: Download failed: bad connection.' % time
            f.close()
            if root:
                root.update()
            return
        time = strftime("%H:%M:%S", localtime())
        print '%s: Downloaded successfully.' % time
        f.close()
    return downloaded_dir, fullfile
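
A minimal sketch of a call, assuming a project folder and the CoreNLP archive named above; the URL is an assumption, not taken from the original:

# hypothetical usage -- the URL is a placeholder
corenlp_url = ('http://nlp.stanford.edu/software/'
               'stanford-corenlp-full-2015-04-20.zip')
downloaded_dir, fullfile = download_large_file('myproject', corenlp_url)
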
Example #6
def dictmaker(path,
              dictname,
              query='any',
              dictpath='data/dictionaries',
              lemmatise=False,
              just_content_words=False,
              use_dependencies=False):
    """makes a pickle wordlist named dictname in dictpath"""
    import corpkit
    import os
    import pickle
    import re
    import nltk
    from time import localtime, strftime
    from io import StringIO
    import shutil
    from collections import Counter
    from textprogressbar import TextProgressBar
    from process import tregex_engine
    try:
        from IPython.display import display, clear_output
    except ImportError:
        pass
    try:
        get_ipython().getoutput()
    except TypeError:
        have_ipython = True
    except NameError:
        import subprocess
        have_ipython = False

    if lemmatise:
        dictname = dictname + '-lemmatised'
    if not dictname.endswith('.p'):
        dictname = dictname + '.p'

    # allow direct passing of dirs
    path_is_list = False
    one_big_corpus = False
    if type(path) == str:
        sorted_dirs = [
            d for d in os.listdir(path) if os.path.isdir(os.path.join(path, d))
        ]
        # if no subcorpora, just do the dir passed in
        if len(sorted_dirs) == 0:
            one_big_corpus = True
            sorted_dirs = [path]
    elif type(path) == list:
        path_is_list = True
        sorted_dirs = sorted(path)
        if type(sorted_dirs[0]) == int:
            sorted_dirs = [str(d) for d in sorted_dirs]

    try:
        sorted_dirs.sort(key=int)
    except:
        pass
    try:
        if not os.path.exists(dictpath):
            os.makedirs(dictpath)
    except IOError:
        print("Error making " + dictpath + "/ directory.")
    while os.path.isfile(os.path.join(dictpath, dictname)):
        time = strftime("%H:%M:%S", localtime())
        selection = input('\n%s: %s already exists in %s.\n' \
               '          You have the following options:\n\n' \
               '              a) save with a new name\n' \
               '              b) delete %s\n' \
               '              c) exit\n\nYour selection: ' % (time, dictname, dictpath, os.path.join(dictpath, dictname)))
        if 'a' in selection:
            sel = input('\nNew save name: ')
            dictname = sel
            if lemmatise:
                dictname = dictname.replace('-lemmatised.p', '')
                dictname = dictname + '-lemmatised'
            if not dictname.endswith('.p'):
                dictname = dictname + '.p'
        elif 'b' in selection:
            os.remove(os.path.join(dictpath, dictname))
        elif 'c' in selection:
            print('')
            return
        else:
            as_str = str(selection)
            print('          Choice "%s" not recognised.' % selection)

    time = strftime("%H:%M:%S", localtime())
    print('\n%s: Extracting words from files ... \n' % time)

    # all this just to get a list of files and make a better progress bar
    if use_dependencies:
        counts = []
        for d in sorted_dirs:
            if not one_big_corpus:
                subcorpus = os.path.join(path, d)
            else:
                subcorpus = path
            if use_dependencies:
                files = [
                    f for f in os.listdir(subcorpus) if f.endswith('.xml')
                ]
            else:
                files = [f for f in os.listdir(subcorpus)]
            counts.append(len(files))
        num_files = sum(counts)
        c = 0
        p = TextProgressBar(num_files)
    else:
        p = TextProgressBar(len(sorted_dirs))

    def tokener(xmldata):
        """return the word (or lemma) of each token in some CoreNLP XML"""
        import corpkit
        from bs4 import BeautifulSoup, SoupStrainer
        import gc
        open_classes = ['N', 'V', 'R', 'J']
        result = []
        just_good_deps = SoupStrainer('tokens')
        soup = BeautifulSoup(xmldata, parse_only=just_good_deps)
        for token in soup.find_all('token'):
            word = token.word.text
            query = re.compile(r'.*')
            if re.search(query, word):
                if lemmatise:
                    word = token.lemma.text
                    if just_content_words:
                        if not token.pos.text[0] in open_classes:
                            continue
                result.append(word)
        # attempt to stop memory problems.
        # not sure if this helps, though:
        soup.decompose()
        soup = None
        data = None
        gc.collect()
        return result

    # translate 'any' query
    if query == 'any':
        if lemmatise:
            query = r'__ <# (__ !< __)'
        else:
            query = r'__ !< __'

    if lemmatise:
        options = ['-o']
    else:
        options = ['-t', '-o']

    if use_dependencies:
        from bs4 import BeautifulSoup, SoupStrainer
        if query == 'any':
            query = r'.*'
        query = re.compile(query)

    allwords = []

    for index, d in enumerate(sorted_dirs):
        if not use_dependencies:
            p.animate(index)
        if not path_is_list:
            if len(sorted_dirs) == 1:
                subcorp = d
            else:
                subcorp = os.path.join(path, d)
        else:
            subcorp = d

        # check query first time through
        if not use_dependencies:
            if index == 0:
                trees_found = tregex_engine(corpus=subcorp,
                                            check_for_trees=True)
                if not trees_found:
                    lemmatise = False
                    dictname = dictname.replace('-lemmatised', '')
            if trees_found:
                results = tregex_engine(corpus=subcorp,
                                        options=options,
                                        query=query,
                                        lemmatise=lemmatise,
                                        just_content_words=just_content_words)

                for result in results:
                    allwords.append(result)

        elif use_dependencies:
            regex_nonword_filter = re.compile("[A-Za-z]")
            results = []
            fs = [os.path.join(subcorp, f) for f in os.listdir(subcorp)]
            for f in fs:
                p.animate(c, str(c) + '/' + str(num_files))
                c += 1
                data = open(f).read()
                result_from_a_file = tokener(data)
                for w in result_from_a_file:
                    if re.search(regex_nonword_filter, w):
                        allwords.append(w.lower())

        if not use_dependencies:
            if not trees_found:
                for f in os.listdir(subcorp):
                    raw = open(os.path.join(subcorp, f),
                               encoding='utf-8',
                               errors='ignore').read()
                    sent_tokenizer = nltk.data.load(
                        'tokenizers/punkt/english.pickle')
                    sents = sent_tokenizer.tokenize(raw)
                    tokenized_sents = [nltk.word_tokenize(i) for i in sents]
                    for sent in tokenized_sents:
                        for w in sent:
                            allwords.append(w.lower())

    #100%
    p.animate(len(sorted_dirs))

    # make a dict
    dictionary = Counter(allwords)

    with open(os.path.join(dictpath, dictname), 'wb') as handle:
        pickle.dump(dictionary, handle)
    time = strftime("%H:%M:%S", localtime())
    print('\n\n' + time + ': Done! ' + dictname + ' created in ' + dictpath +
          '/')
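
This variant also accepts a list of subcorpus directories in place of a parent path; a hedged sketch with placeholder directory names:

# hypothetical usage -- these directory paths are placeholders
subcorpora = ['data/corpus/1987', 'data/corpus/1988']
dictmaker(subcorpora, 'eighties', lemmatise=True)
# would write data/dictionaries/eighties-lemmatised.p (or eighties.p
# if no parse trees are found and lemmatisation is switched off)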