def loader(savedir='saved_interrogations'):
    """Show a list of data that can be loaded, and then load by user input of index"""
    import glob
    import os
    import corpkit
    from corpkit.other import load
    fs = [i for i in glob.glob(r'%s/*' % savedir)
          if not os.path.basename(i).startswith('.')]
    if not fs:
        print('No saved data found in %s.' % savedir)
        return
    string_to_show = '\nFiles in %s:\n' % savedir
    most_digits = max([len(str(i)) for i, j in enumerate(fs)])
    for index, fname in enumerate(fs):
        string_to_show += str(index).rjust(most_digits) + ':\t' + os.path.basename(fname) + '\n'
    print(string_to_show)
    index = INPUTFUNC('Enter index of item to load: ')
    # 'varname = index' syntax: load into a named global variable
    if ' ' in index or '=' in index:
        if '=' in index:
            index = index.replace(' = ', ' ')
            index = index.replace('=', ' ')
        varname, ind = index.split(' ', 1)
        globals()[varname] = load(os.path.basename(fs[int(ind)]))
        print("%s = %s. Don't do this again." % (varname, os.path.basename(fs[int(ind)])))
        return
    try:
        index = int(index)
    except ValueError:
        raise ValueError('Selection not recognised.')
    return load(os.path.basename(fs[index]))
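
# A minimal usage sketch for loader(). Illustrative only: it assumes a
# 'saved_interrogations' directory created by save() below, and an
# interactive session where the user can answer the prompt.
def _example_loader_usage():  # pragma: no cover
    # Prints something like:
    #
    #   Files in saved_interrogations:
    #   0:    any_words.p
    #   1:    nouns.p
    #
    #   Enter index of item to load: 1
    #
    # and returns the unpickled interrogation at that index.
    return loader()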
def signal_handler(sig, _):
    """
    Allow pausing and restarting when not in GUI
    """
    if root:
        return
    import signal
    import sys
    from time import localtime, strftime
    # temporarily restore the original handler so a second ctrl+c quits
    signal.signal(signal.SIGINT, original_sigint)
    thetime = strftime("%H:%M:%S", localtime())
    INPUTFUNC('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime)
    thetime = strftime("%H:%M:%S", localtime())
    print('%s: Interrogation resumed.\n' % thetime)
    signal.signal(signal.SIGINT, signal_handler)
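
# A sketch of how signal_handler is intended to be wired up. Assumption:
# `root` and `original_sigint` are module-level globals, as the handler
# body implies; registering the handler makes ctrl+c pause, rather than
# kill, a running interrogation.
def _example_install_pause_handler():  # pragma: no cover
    import signal
    global original_sigint
    # remember the default handler so signal_handler can restore it
    original_sigint = signal.getsignal(signal.SIGINT)
    signal.signal(signal.SIGINT, signal_handler)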
def download_large_file(proj_path, url, actually_download=True, root=False, **kwargs):
    """
    Download something to proj_path, unless it's CoreNLP, which goes to ~/corenlp
    """
    import os
    import shutil
    import glob
    import zipfile
    from time import localtime, strftime
    from corpkit.textprogressbar import TextProgressBar
    from corpkit.process import animator
    file_name = url.split('/')[-1]
    home = os.path.expanduser("~")
    customdir = kwargs.get('custom_corenlp_dir', False)
    # if it's corenlp, put it in home/corenlp
    # if that dir exists, check it for a zip file
    # if there's a zipfile and it works, move on
    # if there's a zipfile and it's broken, delete it
    if 'stanford' in url:
        if customdir:
            downloaded_dir = customdir
        else:
            downloaded_dir = os.path.join(home, 'corenlp')
        if not os.path.isdir(downloaded_dir):
            os.makedirs(downloaded_dir)
        else:
            poss_zips = glob.glob(os.path.join(downloaded_dir, 'stanford-corenlp-full*.zip'))
            if poss_zips:
                fullfile = poss_zips[-1]
                from zipfile import BadZipfile
                try:
                    the_zip_file = zipfile.ZipFile(fullfile)
                    ret = the_zip_file.testzip()
                    if ret is None:
                        return downloaded_dir, fullfile
                    else:
                        os.remove(fullfile)
                except BadZipfile:
                    os.remove(fullfile)
            #else:
            #    shutil.rmtree(downloaded_dir)
    else:
        downloaded_dir = os.path.join(proj_path, 'temp')
        try:
            os.makedirs(downloaded_dir)
        except OSError:
            pass
    fullfile = os.path.join(downloaded_dir, file_name)

    if actually_download:
        import __main__ as main
        if not root and not hasattr(main, '__file__'):
            txt = 'CoreNLP not found. Download latest version (%s)? (y/n) ' % url
            selection = INPUTFUNC(txt)
            if 'n' in selection.lower():
                return None, None
        try:
            import requests
            # NOTE the stream=True parameter
            r = requests.get(url, stream=True, verify=False)
            file_size = int(r.headers['content-length'])
            file_size_dl = 0
            block_sz = 8192
            showlength = file_size / block_sz
            thetime = strftime("%H:%M:%S", localtime())
            print('\n%s: Downloading ... \n' % thetime)
            par_args = {'printstatus': kwargs.get('printstatus', True),
                        'length': showlength}
            if not root:
                tstr = '%d/%d' % ((file_size_dl + 1) / block_sz, showlength)
                p = animator(None, None, init=True, tot_string=tstr, **par_args)
                animator(p, file_size_dl + 1, tstr)
            with open(fullfile, 'wb') as f:
                for chunk in r.iter_content(chunk_size=block_sz):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
                        file_size_dl += len(chunk)
                        #print file_size_dl * 100.0 / file_size
                        if kwargs.get('note'):
                            kwargs['note'].progvar.set(file_size_dl * 100.0 / int(file_size))
                        else:
                            tstr = '%d/%d' % (file_size_dl / block_sz, showlength)
                            animator(p, file_size_dl / block_sz, tstr, **par_args)
                        if root:
                            root.update()
        except Exception as err:
            import traceback
            print(traceback.format_exc())
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Download failed' % thetime)
            try:
                f.close()
            except:
                pass
            if root:
                root.update()
            return None, None
        if kwargs.get('note'):
            kwargs['note'].progvar.set(100)
        else:
            p.animate(int(file_size))
        thetime = strftime("%H:%M:%S", localtime())
        print('\n%s: Downloaded successfully.' % thetime)
        try:
            f.close()
        except:
            pass
    return downloaded_dir, fullfile
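
# Usage sketch for download_large_file(). The URL names a hypothetical
# CoreNLP release, not a guaranteed-current one; actually_download=False
# just resolves and returns the target paths without fetching anything.
def _example_download_usage():  # pragma: no cover
    url = 'http://nlp.stanford.edu/software/stanford-corenlp-full-2015-12-09.zip'
    downloaded_dir, fullfile = download_large_file('.', url,
                                                   actually_download=False)
    # e.g. ('~/corenlp', '~/corenlp/stanford-corenlp-full-2015-12-09.zip')
    return downloaded_dir, fullfile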
def save(interrogation, savename, savedir='saved_interrogations', **kwargs):
    """
    Save an interrogation as pickle to *savedir*.

       >>> interro = interrogator(corpus, 'words', 'any')
       >>> save(interro, 'savename')

    will create ``./saved_interrogations/savename.p``

    :param interrogation: Corpus interrogation to save
    :type interrogation: corpkit interrogation/edited result
    :param savename: A name for the saved file
    :type savename: str
    :param savedir: Relative path to directory in which to save file
    :type savedir: str
    :param print_info: Show/hide stdout
    :type print_info: bool
    :returns: None
    """
    try:
        import cPickle as pickle
    except ImportError:
        import pickle as pickle
    import os
    from time import localtime, strftime
    import corpkit
    from corpkit.process import makesafe, sanitise_dict
    from corpkit.interrogation import Interrogation
    from corpkit.corpus import Corpus, Datalist

    print_info = kwargs.get('print_info', True)

    def make_filename(interrogation, savename):
        """create a filename"""
        if '/' in savename:
            return savename
        firstpart = ''
        if savename.endswith('.p'):
            savename = savename[:-2]
        savename = makesafe(savename, drop_datatype=False, hyphens_ok=True)
        if not savename.endswith('.p'):
            savename = savename + '.p'
        if hasattr(interrogation, 'query') and isinstance(interrogation.query, dict):
            corpus = interrogation.query.get('corpus', False)
            if corpus:
                if isinstance(corpus, STRINGTYPE):
                    firstpart = corpus
                elif isinstance(corpus, Datalist):
                    firstpart = Corpus(corpus).name
                elif hasattr(corpus, 'name'):
                    firstpart = corpus.name
                else:
                    firstpart = ''
        firstpart = os.path.basename(firstpart)
        if firstpart:
            return firstpart + '-' + savename
        else:
            return savename

    savename = make_filename(interrogation, savename)

    # delete unpicklable parts of query
    if hasattr(interrogation, 'query') and isinstance(interrogation.query, dict):
        iq = interrogation.query
        if iq:
            from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType
            interrogation.query = {k: v for k, v in iq.items() if not isinstance(v, ModuleType) \
                                   and not isinstance(v, FunctionType) \
                                   and not isinstance(v, BuiltinFunctionType) \
                                   and not isinstance(v, BuiltinMethodType)}
        else:
            iq = {}

    if savedir and not '/' in savename:
        if not os.path.exists(savedir):
            os.makedirs(savedir)
        fullpath = os.path.join(savedir, savename)
    else:
        fullpath = savename

    while os.path.isfile(fullpath):
        selection = INPUTFUNC(("\nSave error: %s already exists in %s.\n\n"
                               "Type 'o' to overwrite, or enter a new name: " % (savename, savedir)))
        if selection == 'o' or selection == 'O':
            os.remove(fullpath)
        else:
            selection = selection.replace('.p', '')
            if not selection.endswith('.p'):
                selection = selection + '.p'
            fullpath = os.path.join(savedir, selection)

    if hasattr(interrogation, 'query'):
        interrogation.query = sanitise_dict(interrogation.query)

    with open(fullpath, 'wb') as fo:
        pickle.dump(interrogation, fo)

    time = strftime("%H:%M:%S", localtime())
    if print_info:
        print('\n%s: Data saved: %s\n' % (time, fullpath))
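
# Usage sketch for save(). The corpus path and interrogate() arguments are
# hypothetical stand-ins for a real corpkit project; any pickleable
# interrogation or edited result can be saved.
def _example_save_usage():  # pragma: no cover
    from corpkit.corpus import Corpus
    corpus = Corpus('data/example-parsed')      # hypothetical parsed corpus
    result = corpus.interrogate('w', 'any')     # any interrogation will do
    # writes ./saved_interrogations/<corpusname>-any_words.p
    save(result, 'any_words')
    # choose another directory and silence stdout
    save(result, 'any_words', savedir='tmp', print_info=False)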
def make_corpus(unparsed_corpus_path,
                project_path=None,
                parse=True,
                tokenise=False,
                corenlppath=False,
                nltk_data_path=False,
                operations=False,
                speaker_segmentation=False,
                root=False,
                multiprocess=False,
                split_texts=400,
                **kwargs):
    """
    Create a parsed version of unparsed_corpus using CoreNLP or NLTK's tokeniser

    :param unparsed_corpus_path: path to corpus containing text files,
                                 or subdirs containing text files
    :type unparsed_corpus_path: str
    :param project_path: path to corpkit project
    :type project_path: str
    :param parse: Do parsing?
    :type parse: bool
    :param tokenise: Do tokenising?
    :type tokenise: bool
    :param corenlppath: folder containing corenlp jar files
    :type corenlppath: str
    :param nltk_data_path: path to tokeniser if tokenising
    :type nltk_data_path: str
    :param operations: which kinds of annotations to do
    :type operations: str
    :param speaker_segmentation: add speaker name to parser output
                                 if your corpus is script-like
    :type speaker_segmentation: bool
    :returns: list of paths to created corpora
    """
    import sys
    import os
    from os.path import join, isfile, isdir, basename, splitext, exists
    import shutil
    import codecs
    from corpkit.build import folderise, can_folderise
    from corpkit.process import saferead

    pyver = sys.version_info.major

    from corpkit.build import (get_corpus_filepaths, check_jdk, add_ids_to_xml,
                               rename_all_files, make_no_id_corpus, parse_corpus,
                               move_parsed_files)

    if parse is True and tokenise is True:
        raise ValueError('Select either parse or tokenise, not both.')

    if project_path is None:
        project_path = os.getcwd()

    fileparse = isfile(unparsed_corpus_path)
    if fileparse:
        copier = shutil.copyfile
    else:
        copier = shutil.copytree

    # raise error if no tokeniser
    if tokenise:
        newpath = unparsed_corpus_path + '-tokenised'
        if isdir(newpath):
            shutil.rmtree(newpath)
        import nltk
        if nltk_data_path:
            if nltk_data_path not in nltk.data.path:
                nltk.data.path.append(nltk_data_path)
        try:
            from nltk import word_tokenize as tokenise
        except ImportError:
            print('\nTokeniser not found. Pass in its path as keyword arg "nltk_data_path = <path>".\n')
            raise

    if sys.platform == "darwin":
        if not check_jdk():
            print("Get the latest Java from http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html")

    cop_head = kwargs.get('copula_head', True)
    note = kwargs.get('note', False)
    stdout = kwargs.get('stdout', False)

    # make absolute path to corpus
    unparsed_corpus_path = os.path.abspath(unparsed_corpus_path)

    # move it into project
    if fileparse:
        datapath = project_path
    else:
        datapath = join(project_path, 'data')

    if isdir(datapath):
        newp = join(datapath, basename(unparsed_corpus_path))
    else:
        os.makedirs(datapath)
        if fileparse:
            noext = splitext(unparsed_corpus_path)[0]
            newp = join(datapath, basename(noext))
        else:
            newp = join(datapath, basename(unparsed_corpus_path))

    if exists(newp):
        pass
    else:
        copier(unparsed_corpus_path, newp)
    unparsed_corpus_path = newp

    # ask to folderise?
    check_do_folderise = False
    do_folderise = kwargs.get('folderise', None)
    if can_folderise(unparsed_corpus_path):
        if do_folderise is None:
            check_do_folderise = INPUTFUNC("Your corpus has multiple files, but no subcorpora. "
                                           "Would you like each file to be treated as a subcorpus? (y/n) ")
            check_do_folderise = check_do_folderise.lower().startswith('y')
        if check_do_folderise or do_folderise:
            folderise(unparsed_corpus_path)

    # this is bad!
    if join('data', 'data') in unparsed_corpus_path:
        unparsed_corpus_path = unparsed_corpus_path.replace(join('data', 'data'), 'data')

    if parse:

        def chunks(l, n):
            for i in range(0, len(l), n):
                yield l[i:i + n]

        # this loop shortens files containing more than 500 lines, for corenlp memory's sake
        # maybe user needs a warning or something in case s/he is doing coref
        for rootx, dirs, fs in os.walk(unparsed_corpus_path):
            for f in fs:
                if f.startswith('.'):
                    continue
                fp = join(rootx, f)
                data, enc = saferead(fp)
                data = data.splitlines()
                if len(data) > split_texts:
                    chk = chunks(data, split_texts)
                    for index, c in enumerate(chk):
                        newname = fp.replace('.txt', '-%s.txt' % str(index + 1).zfill(3))
                        with codecs.open(newname, 'w', encoding='utf-8') as fo:
                            txt = '\n'.join(c) + '\n'
                            try:
                                fo.write(txt.encode('utf-8'))
                            except TypeError:
                                fo.write(txt)
                    os.remove(fp)
                else:
                    pass
                    #newname = fp.replace('.txt', '-000.txt')
                    #os.rename(fp, newname)

        if speaker_segmentation:
            newpath = unparsed_corpus_path + '-stripped-parsed'
            if isdir(newpath) and not root:
                ans = INPUTFUNC('\n Path exists: %s. Do you want to overwrite? (y/n)\n' % newpath)
                if ans.lower().strip().startswith('y'):
                    shutil.rmtree(newpath)
                else:
                    return
            elif isdir(newpath) and root:
                raise OSError('Path exists: %s' % newpath)
            print('Processing speaker IDs ...')
            make_no_id_corpus(unparsed_corpus_path, unparsed_corpus_path + '-stripped')
            to_parse = unparsed_corpus_path + '-stripped'
        else:
            to_parse = unparsed_corpus_path

        if not fileparse:
            print('Making list of files ... ')

        if not fileparse:
            pp = os.path.dirname(unparsed_corpus_path)
            filelist = get_corpus_filepaths(projpath=pp, corpuspath=to_parse)
        else:
            filelist = unparsed_corpus_path.replace('.txt', '-filelist.txt')
            with open(filelist, 'w') as fo:
                fo.write(unparsed_corpus_path + '\n')

        if multiprocess is not False:
            if multiprocess is True:
                import multiprocessing
                multiprocess = multiprocessing.cpu_count()
            from joblib import Parallel, delayed
            # split old file into n parts
            data, enc = saferead(filelist)
            fs = [i for i in data.splitlines() if i]
            # make generator with list of lists
            divl = int(len(fs) / multiprocess)
            fgen = chunks(fs, divl)
            filelists = []
            # for each list, make new file
            for index, flist in enumerate(fgen):
                as_str = '\n'.join(flist) + '\n'
                new_fpath = filelist.replace('.txt', '-%s.txt' % str(index).zfill(4))
                filelists.append(new_fpath)
                with codecs.open(new_fpath, 'w', encoding='utf-8') as fo:
                    try:
                        fo.write(as_str.encode('utf-8'))
                    except TypeError:
                        fo.write(as_str)
            try:
                os.remove(filelist)
            except:
                pass

            ds = []
            for listpath in filelists:
                d = {'proj_path': project_path,
                     'corpuspath': to_parse,
                     'filelist': listpath,
                     'corenlppath': corenlppath,
                     'nltk_data_path': nltk_data_path,
                     'operations': operations,
                     'copula_head': cop_head,
                     'multiprocessing': True,
                     'root': root,
                     'note': note,
                     'stdout': stdout}
                ds.append(d)

            res = Parallel(n_jobs=multiprocess)(delayed(parse_corpus)(**x) for x in ds)
            if len(res) > 0:
                newparsed = res[0]
            else:
                return
            if all(r is False for r in res):
                return

            for i in filelists:
                try:
                    os.remove(i)
                except:
                    pass
        else:
            newparsed = parse_corpus(proj_path=project_path,
                                     corpuspath=to_parse,
                                     filelist=filelist,
                                     corenlppath=corenlppath,
                                     nltk_data_path=nltk_data_path,
                                     operations=operations,
                                     copula_head=cop_head,
                                     root=root,
                                     note=note,
                                     stdout=stdout,
                                     fileparse=fileparse)

        if not newparsed:
            return
        if all(not x for x in newparsed):
            return

        if fileparse:
            # cleanup mistakes :)
            if isfile(splitext(unparsed_corpus_path)[0]):
                os.remove(splitext(unparsed_corpus_path)[0])
            if isfile(unparsed_corpus_path.replace('.txt', '-filelist.txt')):
                os.remove(unparsed_corpus_path.replace('.txt', '-filelist.txt'))
            return unparsed_corpus_path + '.xml'

        move_parsed_files(project_path, to_parse, newparsed)
        outpath = newparsed
        if speaker_segmentation:
            add_ids_to_xml(newparsed)
        try:
            os.remove(filelist)
        except:
            pass
    else:
        filelist = get_corpus_filepaths(projpath=os.path.dirname(unparsed_corpus_path),
                                        corpuspath=unparsed_corpus_path)

    if tokenise:
        newtok = parse_corpus(proj_path=project_path,
                              corpuspath=unparsed_corpus_path,
                              filelist=filelist,
                              nltk_data_path=nltk_data_path,
                              operations=operations,
                              only_tokenise=True)
        if newtok is False:
            return
        outpath = newtok

    rename_all_files(outpath)
    print('Done!\n')
    return outpath
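
# Usage sketch for make_corpus(). All paths are hypothetical; parsing
# requires a working CoreNLP install (see download_large_file above),
# and tokenising requires NLTK plus its data.
def _example_make_corpus_usage():  # pragma: no cover
    parsed_path = make_corpus('data/example',        # dir of .txt files
                              project_path='.',      # a corpkit project
                              speaker_segmentation=True,
                              multiprocess=False)
    return parsed_path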
def download_large_file(proj_path, url, actually_download=True, root=False, **kwargs):
    """
    Download something to proj_path
    """
    import corpkit
    import os
    import shutil
    import glob
    import sys
    import zipfile
    from time import localtime, strftime
    from corpkit.textprogressbar import TextProgressBar
    from corpkit.process import animator
    file_name = url.split('/')[-1]
    home = os.path.expanduser("~")
    # if it's corenlp, put it in home/corenlp
    # if that dir exists, check it for a zip file
    # if there's a zipfile and it works, move on
    # if there's a zipfile and it's broken, delete it
    if 'stanford' in url:
        downloaded_dir = os.path.join(home, 'corenlp')
        if not os.path.isdir(downloaded_dir):
            os.makedirs(downloaded_dir)
        else:
            poss_zips = glob.glob(os.path.join(downloaded_dir, 'stanford-corenlp-full*.zip'))
            if poss_zips:
                fullfile = poss_zips[-1]
                the_zip_file = zipfile.ZipFile(fullfile)
                ret = the_zip_file.testzip()
                if ret is None:
                    return downloaded_dir, fullfile
                else:
                    os.remove(fullfile)
            #else:
            #    shutil.rmtree(downloaded_dir)
    else:
        downloaded_dir = os.path.join(proj_path, 'temp')
        try:
            os.makedirs(downloaded_dir)
        except OSError:
            pass
    fullfile = os.path.join(downloaded_dir, file_name)

    if actually_download:
        if not root:
            txt = 'CoreNLP not found. Download latest version (%s)? (y/n) ' % url
            selection = INPUTFUNC(txt)
            if 'n' in selection.lower():
                return None, None
        try:
            import requests
            # NOTE the stream=True parameter
            r = requests.get(url, stream=True, verify=False)
            file_size = int(r.headers['content-length'])
            file_size_dl = 0
            block_sz = 8192
            showlength = file_size / block_sz
            thetime = strftime("%H:%M:%S", localtime())
            print('\n%s: Downloading ... \n' % thetime)
            par_args = {'printstatus': kwargs.get('printstatus', True),
                        'length': showlength}
            if not root:
                tstr = '%d/%d' % ((file_size_dl + 1) / block_sz, showlength)
                p = animator(None, None, init=True, tot_string=tstr, **par_args)
                animator(p, file_size_dl + 1, tstr)
            with open(fullfile, 'wb') as f:
                for chunk in r.iter_content(chunk_size=block_sz):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
                        file_size_dl += len(chunk)
                        #print file_size_dl * 100.0 / file_size
                        if kwargs.get('note'):
                            kwargs['note'].progvar.set(file_size_dl * 100.0 / int(file_size))
                        else:
                            tstr = '%d/%d' % (file_size_dl / block_sz, showlength)
                            animator(p, file_size_dl / block_sz, tstr, **par_args)
                        if root:
                            root.update()
        except Exception as err:
            import traceback
            print(traceback.format_exc())
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Download failed' % thetime)
            try:
                f.close()
            except:
                pass
            if root:
                root.update()
            return None, None
        if kwargs.get('note'):
            kwargs['note'].progvar.set(100)
        else:
            p.animate(int(file_size))
        thetime = strftime("%H:%M:%S", localtime())
        print('\n%s: Downloaded successfully.' % thetime)
        try:
            f.close()
        except:
            pass
    return downloaded_dir, fullfile
def tregex_engine(corpus=False,
                  options=False,
                  query=False,
                  check_query=False,
                  check_for_trees=False,
                  just_content_words=False,
                  root=False,
                  preserve_case=False,
                  **kwargs):
    """
    Run a Java Tregex query

    :param query: tregex query
    :type query: str
    :param options: list of tregex options
    :type options: list of strs -- ['-t', '-o']
    :param corpus: place to search
    :type corpus: str
    :param check_query: just make sure query ok
    :type check_query: bool
    :param check_for_trees: find out if corpus contains parse trees
    :type check_for_trees: bool
    :returns: list of search results
    """
    import corpkit
    add_corpkit_to_path()

    # in case someone compiles the tregex query
    try:
        query = query.pattern
    except AttributeError:
        query = query

    import subprocess
    from subprocess import Popen, PIPE, STDOUT
    import re
    from time import localtime, strftime
    from corpkit.dictionaries.word_transforms import wordlist
    import os
    import sys

    DEVNULL = open(os.devnull, 'w')
    if check_query or check_for_trees:
        send_stderr_to = subprocess.STDOUT
        send_stdout_to = DEVNULL
    else:
        send_stderr_to = DEVNULL
        send_stdout_to = subprocess.STDOUT

    filtermode = False
    if isinstance(options, list):
        filtermode = '-filter' in options
    if filtermode:
        options.pop(options.index('-filter'))

    on_cloud = checkstack('/opt/python/lib')

    # if check_query, enter the while loop
    # if not, get out of it
    an_error_occurred = True

    # site pack path
    corpath = os.path.join(os.path.dirname(corpkit.__file__))
    cor1 = os.path.join(corpath, 'tregex.sh')
    cor2 = os.path.join(corpath, 'corpkit', 'tregex.sh')
    # pyinstaller
    pyi = sys.argv[0].split('Contents/MacOS')[0] + 'Contents/MacOS/tregex.sh'

    possible_paths = ['tregex.sh', corpath, pyi, cor1, cor2]

    while an_error_occurred:
        tregex_file_found = False
        for i in possible_paths:
            if os.path.isfile(i):
                tregex_command = [i]
                tregex_file_found = True
                break
        if not tregex_file_found:
            thetime = strftime("%H:%M:%S", localtime())
            print("%s: Couldn't find Tregex in %s."
                  % (thetime, ', '.join(possible_paths)))
            return False

        if not query:
            query = 'NP'
        # if checking for trees, use the -T option
        if check_for_trees:
            options = ['-o', '-T']

        filenaming = False
        if isinstance(options, list):
            if '-f' in options:
                filenaming = True

        # append list of options to query
        if options:
            if '-s' not in options and '-t' not in options:
                options.append('-s')
        else:
            options = ['-o', '-t']
        for opt in options:
            tregex_command.append(opt)
        if query:
            tregex_command.append(query)

        # if corpus is string or unicode, and is path, add that
        # if it's not string or unicode, it's some kind of corpus obj
        # in which case, add its path var
        if corpus:
            if isinstance(corpus, STRINGTYPE):
                if os.path.isdir(corpus) or os.path.isfile(corpus):
                    tregex_command.append(corpus)
                else:
                    filtermode = True
            elif hasattr(corpus, 'path'):
                tregex_command.append(corpus.path)
        if filtermode:
            tregex_command.append('-filter')

        if not filtermode:
            res = subprocess.check_output(tregex_command, stderr=send_stderr_to)
            res = res.decode(encoding='UTF-8').splitlines()
        else:
            p = Popen(tregex_command, stdout=PIPE, stdin=PIPE, stderr=send_stderr_to)
            p.stdin.write(corpus.encode('UTF-8', errors='ignore'))
            res = p.communicate()[0].decode(encoding='UTF-8').splitlines()
            p.stdin.close()

        # Fix up the stderr stdout rubbish
        if check_query:
            # define error searches
            tregex_error = re.compile(r'^Error parsing expression')
            regex_error = re.compile(r'^Exception in thread.*PatternSyntaxException')
            # if tregex error, give general error message
            if re.match(tregex_error, res[0]):
                if root:
                    time = strftime("%H:%M:%S", localtime())
                    print('%s: Error parsing Tregex query.' % time)
                    return False
                time = strftime("%H:%M:%S", localtime())
                selection = INPUTFUNC('\n%s: Error parsing Tregex expression "%s".'
                                      '\nWould you like to:\n\n'
                                      ' a) rewrite it now\n'
                                      ' b) exit\n\nYour selection: ' % (time, query))
                if 'a' in selection.lower():
                    query = INPUTFUNC('\nNew Tregex query: ')
                elif 'b' in selection.lower():
                    print('')
                    return False

            # if regex error, try to help
            elif re.match(regex_error, res[0]):
                if root:
                    time = strftime("%H:%M:%S", localtime())
                    print('%s: Regular expression in Tregex query contains an error.' % time)
                    return False
                info = res[0].split(':')
                index_of_error = re.findall(r'index [0-9]+', info[1])
                justnum = index_of_error[0].split('dex ')
                spaces = ' ' * int(justnum[1])
                remove_start = query.split('/', 1)
                remove_end = remove_start[1].split('/', -1)
                time = strftime("%H:%M:%S", localtime())
                selection = INPUTFUNC('\n%s: Error parsing regex inside Tregex query: %s'
                                      '. Best guess: \n%s\n%s^\n\nYou can either: \n'
                                      ' a) rewrite it now\n'
                                      ' b) exit\n\nYour selection: '
                                      % (time, str(info[1]), str(remove_end[0]), spaces))
                if 'a' in selection:
                    query = INPUTFUNC('\nNew Tregex query: ')
                elif 'b' in selection:
                    print('')
                    return
            else:
                an_error_occurred = False
                return query
        # if not query checking, leave this horrible while loop
        else:
            an_error_occurred = False

    # counting is easy, just get out with the number
    if '-C' in options:
        return int(res[-1])

    res = [r.strip() for r in res if r.strip()]

    # this is way slower than it needs to be, because it searches a whole subcorpus!
    if check_for_trees:
        if res and res[0].startswith('1:Next tree read:'):
            return True
        else:
            return False

    # return if no matches
    if not res:
        return []

    # make unicode and lowercase
    make_tuples = []
    if filenaming:
        for index, r in enumerate(res):
            if r.startswith('# /'):
                make_tuples.append([r, res[index + 1]])
        res = make_tuples

    if not preserve_case:
        if not filenaming:
            res = [w.lower().replace('/', '-slash-') for w in res]
        else:
            res = [[t, w.lower().replace('/', '-slash-')] for t, w in res]
    return res
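
# Usage sketch for tregex_engine(). The query and corpus path are
# illustrative. With check_query=True, a validated (possibly user-corrected)
# query string comes back, or False on failure; '-o' requests
# non-overlapping matches and '-t' the matched terminals.
def _example_tregex_usage():  # pragma: no cover
    # validate the query before running it over a whole corpus
    q = tregex_engine(query=r'NP < NN', check_query=True)
    if q is False:
        return []
    return tregex_engine(corpus='data/example-parsed',  # hypothetical path
                         query=q,
                         options=['-o', '-t'])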
def make_corpus(unparsed_corpus_path,
                project_path=None,
                parse=True,
                tokenise=False,
                postag=False,
                lemmatise=False,
                corenlppath=False,
                nltk_data_path=False,
                operations=False,
                speaker_segmentation=False,
                root=False,
                multiprocess=False,
                split_texts=400,
                outname=False,
                metadata=False,
                restart=False,
                coref=True,
                lang='en',
                **kwargs):
    """
    Create a parsed version of unparsed_corpus using CoreNLP or NLTK's tokeniser

    :param unparsed_corpus_path: path to corpus containing text files,
                                 or subdirs containing text files
    :type unparsed_corpus_path: str
    :param project_path: path to corpkit project
    :type project_path: str
    :param parse: Do parsing?
    :type parse: bool
    :param tokenise: Do tokenising?
    :type tokenise: bool
    :param corenlppath: folder containing corenlp jar files
    :type corenlppath: str
    :param nltk_data_path: path to tokeniser if tokenising
    :type nltk_data_path: str
    :param operations: which kinds of annotations to do
    :type operations: str
    :param speaker_segmentation: add speaker name to parser output
                                 if your corpus is script-like
    :type speaker_segmentation: bool
    :returns: list of paths to created corpora
    """
    import sys
    import os
    from os.path import join, isfile, isdir, basename, splitext, exists
    import shutil
    import codecs
    from corpkit.build import folderise, can_folderise
    from corpkit.process import saferead, make_dotfile

    from corpkit.build import (get_corpus_filepaths, check_jdk,
                               rename_all_files, make_no_id_corpus, parse_corpus,
                               move_parsed_files)

    from corpkit.constants import REPEAT_PARSE_ATTEMPTS, PYTHON_VERSION

    if parse is True and tokenise is True:
        raise ValueError('Select either parse or tokenise, not both.')

    if project_path is None:
        project_path = os.getcwd()

    fileparse = isfile(unparsed_corpus_path)
    if fileparse:
        copier = shutil.copyfile
    else:
        copier = shutil.copytree

    # raise error if no tokeniser
    #if tokenise:
    #    if outname:
    #        newpath = os.path.join(os.path.dirname(unparsed_corpus_path), outname)
    #    else:
    #        newpath = unparsed_corpus_path + '-tokenised'
    #    if isdir(newpath):
    #        shutil.rmtree(newpath)
    #    import nltk
    #    if nltk_data_path:
    #        if nltk_data_path not in nltk.data.path:
    #            nltk.data.path.append(nltk_data_path)
    #    try:
    #        from nltk import word_tokenize as tokenise
    #    except:
    #        print('\nTokeniser not found. Pass in its path as keyword arg "nltk_data_path = <path>".\n')
    #        raise

    if sys.platform == "darwin":
        if not check_jdk():
            print("Get the latest Java from http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html")

    cop_head = kwargs.get('copula_head', True)
    note = kwargs.get('note', False)
    stdout = kwargs.get('stdout', False)

    # make absolute path to corpus
    unparsed_corpus_path = os.path.abspath(unparsed_corpus_path)

    # move it into project
    if fileparse:
        datapath = project_path
    else:
        datapath = join(project_path, 'data')

    if isdir(datapath):
        newp = join(datapath, basename(unparsed_corpus_path))
    else:
        os.makedirs(datapath)
        if fileparse:
            noext = splitext(unparsed_corpus_path)[0]
            newp = join(datapath, basename(noext))
        else:
            newp = join(datapath, basename(unparsed_corpus_path))

    if exists(newp):
        pass
    else:
        copier(unparsed_corpus_path, newp)
    unparsed_corpus_path = newp

    # ask to folderise?
    check_do_folderise = False
    do_folderise = kwargs.get('folderise', None)
    if can_folderise(unparsed_corpus_path):
        import __main__ as main
        if do_folderise is None and not hasattr(main, '__file__'):
            check_do_folderise = INPUTFUNC("Your corpus has multiple files, but no subcorpora. "
                                           "Would you like each file to be treated as a subcorpus? (y/n) ")
            check_do_folderise = check_do_folderise.lower().startswith('y')
        if check_do_folderise or do_folderise:
            folderise(unparsed_corpus_path)

    # this is bad!
    if join('data', 'data') in unparsed_corpus_path:
        unparsed_corpus_path = unparsed_corpus_path.replace(join('data', 'data'), 'data')

    def chunks(l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]

    if parse or tokenise:
        # this loop shortens files containing more than 500 lines,
        # for corenlp memory's sake. maybe user needs a warning or
        # something in case s/he is doing coref?
        for rootx, dirs, fs in os.walk(unparsed_corpus_path):
            for f in fs:
                if f.startswith('.'):
                    continue
                fp = join(rootx, f)
                data, enc = saferead(fp)
                data = data.splitlines()
                if len(data) > split_texts:
                    chk = chunks(data, split_texts)
                    for index, c in enumerate(chk):
                        newname = fp.replace('.txt', '-%s.txt' % str(index + 1).zfill(3))
                        # does this work?
                        if PYTHON_VERSION == 2:
                            with codecs.open(newname, 'w', encoding=enc) as fo:
                                txt = '\n'.join(c) + '\n'
                                fo.write(txt.encode(enc))
                        else:
                            with open(newname, 'w', encoding=enc) as fo:
                                txt = '\n'.join(c) + '\n'
                                fo.write(txt)
                    os.remove(fp)
                else:
                    pass
                    #newname = fp.replace('.txt', '-000.txt')
                    #os.rename(fp, newname)

    if outname:
        newpath = os.path.join(os.path.dirname(unparsed_corpus_path), outname)
    else:
        newpath = unparsed_corpus_path + '-parsed'

    if restart:
        restart = newpath

    if speaker_segmentation or metadata:
        if isdir(newpath) and not root:
            import __main__ as main
            if not restart and not hasattr(main, '__file__'):
                ans = INPUTFUNC('\n Path exists: %s. Do you want to overwrite? (y/n)\n' % newpath)
                if ans.lower().strip().startswith('y'):
                    shutil.rmtree(newpath)
                else:
                    return
        elif isdir(newpath) and root:
            raise OSError('Path exists: %s' % newpath)
        if speaker_segmentation:
            print('Processing speaker IDs ...')
        make_no_id_corpus(unparsed_corpus_path,
                          unparsed_corpus_path + '-stripped',
                          metadata_mode=metadata,
                          speaker_segmentation=speaker_segmentation)
        to_parse = unparsed_corpus_path + '-stripped'
    else:
        to_parse = unparsed_corpus_path

    if not fileparse:
        print('Making list of files ... ')

    # now we enter a while loop while not all files are parsed
    #todo: these file lists are not necessary when not parsing
    if outname:
        newparsed = os.path.join(project_path, 'data', outname)
    else:
        basecp = os.path.basename(to_parse)
        newparsed = os.path.join(project_path, 'data', '%s-parsed' % basecp)
        newparsed = newparsed.replace('-stripped-', '-')

    while REPEAT_PARSE_ATTEMPTS:
        if not parse:
            break

        if not fileparse:
            pp = os.path.dirname(unparsed_corpus_path)
            # if restart mode, the filepaths won't include those already parsed...
            filelist, fs = get_corpus_filepaths(projpath=pp,
                                                corpuspath=to_parse,
                                                restart=restart,
                                                out_ext=kwargs.get('output_format'))
        else:
            filelist = unparsed_corpus_path.replace('.txt', '-filelist.txt')
            with open(filelist, 'w') as fo:
                fo.write(unparsed_corpus_path + '\n')

        # split up filelists
        if multiprocess is not False:
            if multiprocess is True:
                import multiprocessing
                multiprocess = multiprocessing.cpu_count()
            from joblib import Parallel, delayed
            # split old file into n parts
            if os.path.isfile(filelist):
                data, enc = saferead(filelist)
                fs = [i for i in data.splitlines() if i]
            else:
                fs = []
            # if there's nothing here, we're done
            if not fs:
                # double dutch
                REPEAT_PARSE_ATTEMPTS = 0
                break
            if len(fs) <= multiprocess:
                multiprocess = len(fs)
            # make generator with list of lists
            divl = int(len(fs) / multiprocess)
            filelists = []
            if not divl:
                filelists.append(filelist)
            else:
                fgen = chunks(fs, divl)
                # for each list, make new file
                from corpkit.constants import OPENER
                for index, flist in enumerate(fgen):
                    as_str = '\n'.join(flist) + '\n'
                    new_fpath = filelist.replace('.txt', '-%s.txt' % str(index).zfill(4))
                    filelists.append(new_fpath)
                    with OPENER(new_fpath, 'w', encoding='utf-8') as fo:
                        try:
                            fo.write(as_str.encode('utf-8'))
                        except TypeError:
                            fo.write(as_str)
                try:
                    os.remove(filelist)
                except:
                    pass

            ds = []
            for listpath in filelists:
                d = {'proj_path': project_path,
                     'corpuspath': to_parse,
                     'filelist': listpath,
                     'corenlppath': corenlppath,
                     'nltk_data_path': nltk_data_path,
                     'operations': operations,
                     'copula_head': cop_head,
                     'multiprocessing': True,
                     'root': root,
                     'note': note,
                     'stdout': stdout,
                     'outname': outname,
                     'coref': coref,
                     'output_format': kwargs.get('output_format', 'conll')}
                ds.append(d)

            res = Parallel(n_jobs=multiprocess)(delayed(parse_corpus)(**x) for x in ds)
            if len(res) > 0:
                newparsed = res[0]
            else:
                return
            if all(r is False for r in res):
                return
            for i in filelists:
                try:
                    os.remove(i)
                except:
                    pass
        else:
            newparsed = parse_corpus(proj_path=project_path,
                                     corpuspath=to_parse,
                                     filelist=filelist,
                                     corenlppath=corenlppath,
                                     nltk_data_path=nltk_data_path,
                                     operations=operations,
                                     copula_head=cop_head,
                                     root=root,
                                     note=note,
                                     stdout=stdout,
                                     fileparse=fileparse,
                                     outname=outname,
                                     output_format=kwargs.get('output_format', 'conll'))

        if not restart:
            REPEAT_PARSE_ATTEMPTS = 0
        else:
            REPEAT_PARSE_ATTEMPTS -= 1
            print('Repeating parsing due to missing files. '
                  '%d iterations remaining.' % REPEAT_PARSE_ATTEMPTS)

    if parse and not newparsed:
        return
    if parse and all(not x for x in newparsed):
        print('Error after parsing.')
        return

    if parse and fileparse:
        # cleanup mistakes :)
        if isfile(splitext(unparsed_corpus_path)[0]):
            os.remove(splitext(unparsed_corpus_path)[0])
        if isfile(unparsed_corpus_path.replace('.txt', '-filelist.txt')):
            os.remove(unparsed_corpus_path.replace('.txt', '-filelist.txt'))
        return unparsed_corpus_path + '.conll'

    if parse:
        move_parsed_files(project_path, to_parse, newparsed,
                          ext=kwargs.get('output_format', 'conll'), restart=restart)
        from corpkit.conll import convert_json_to_conll
        coref = False
        if operations is False:
            coref = True
        elif 'coref' in operations or 'dcoref' in operations:
            coref = True
        convert_json_to_conll(newparsed,
                              speaker_segmentation=speaker_segmentation,
                              coref=coref,
                              metadata=metadata)
        try:
            os.remove(filelist)
        except:
            pass

    if not parse and tokenise:
        #todo: outname
        newparsed = to_parse.replace('-stripped', '-tokenised')
        from corpkit.tokenise import plaintext_to_conll
        newparsed = plaintext_to_conll(to_parse,
                                       postag=postag,
                                       lemmatise=lemmatise,
                                       lang=lang,
                                       metadata=metadata,
                                       nltk_data_path=nltk_data_path,
                                       speaker_segmentation=speaker_segmentation,
                                       outpath=newparsed)

        if outname:
            if not os.path.isdir(outname):
                outname = os.path.join('data', os.path.basename(outname))
            shutil.copytree(newparsed, outname)
            newparsed = outname
        if newparsed is False:
            return
        else:
            make_dotfile(newparsed)
            return newparsed

    rename_all_files(newparsed)
    print('Generating corpus metadata...')
    make_dotfile(newparsed)
    print('Done!\n')
    return newparsed
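
# Usage sketch for the newer make_corpus(). Everything here is illustrative:
# a plaintext corpus at data/example is parsed into CONLL-style files with
# speaker and metadata handling on, using every available CPU core.
def _example_make_corpus_conll_usage():  # pragma: no cover
    newparsed = make_corpus('data/example',
                            project_path='.',
                            speaker_segmentation=True,
                            metadata=True,
                            multiprocess=True)
    return newparsed  # path to the '-parsed' corpus, or None on failure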