def loader(savedir='saved_interrogations'):
    """Show a list of data that can be loaded, and then load by user input of index"""
    import glob
    import os
    import corpkit
    from corpkit.other import load
    fs = [i for i in glob.glob(r'%s/*' % savedir)
          if not os.path.basename(i).startswith('.')]
    if not fs:
        print('No saved data found in %s.' % savedir)
        return
    string_to_show = '\nFiles in %s:\n' % savedir
    most_digits = max([len(str(i)) for i, j in enumerate(fs)])
    for index, fname in enumerate(fs):
        string_to_show += str(index).rjust(most_digits) + ':\t' + os.path.basename(fname) + '\n'
    print(string_to_show)
    index = INPUTFUNC('Enter index of item to load: ')
    # 'varname = index' syntax: load into a named global variable
    if ' ' in index or '=' in index:
        if '=' in index:
            index = index.replace(' = ', ' ')
            index = index.replace('=', ' ')
        varname, ind = index.split(' ', 1)
        globals()[varname] = load(os.path.basename(fs[int(ind)]))
        print("%s = %s. Don't do this again." % (varname, os.path.basename(fs[int(ind)])))
        return
    try:
        index = int(index)
    except ValueError:
        raise ValueError('Selection not recognised.')
    return load(os.path.basename(fs[index]))
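
# A minimal usage sketch for loader(). Illustrative only: it assumes a
# 'saved_interrogations' directory created by save() below, and an
# interactive session where the user can answer the prompt.
def _example_loader_usage():  # pragma: no cover
    # Prints something like:
    #
    #   Files in saved_interrogations:
    #   0:    any_words.p
    #   1:    nouns.p
    #
    #   Enter index of item to load: 1
    #
    # and returns the unpickled interrogation at that index.
    return loader()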
def signal_handler(sig, _):
    """
    Allow pausing and restarting when not in GUI
    """
    if root:
        return
    import signal
    import sys
    from time import localtime, strftime
    # temporarily restore the original handler so a second ctrl+c quits
    signal.signal(signal.SIGINT, original_sigint)
    thetime = strftime("%H:%M:%S", localtime())
    INPUTFUNC('\n\n%s: Paused. Press any key to resume, or ctrl+c to quit.\n' % thetime)
    thetime = strftime("%H:%M:%S", localtime())
    print('%s: Interrogation resumed.\n' % thetime)
    signal.signal(signal.SIGINT, signal_handler)
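
# A sketch of how signal_handler is intended to be wired up. Assumption:
# `root` and `original_sigint` are module-level globals, as the handler
# body implies; registering the handler makes ctrl+c pause, rather than
# kill, a running interrogation.
def _example_install_pause_handler():  # pragma: no cover
    import signal
    global original_sigint
    # remember the default handler so signal_handler can restore it
    original_sigint = signal.getsignal(signal.SIGINT)
    signal.signal(signal.SIGINT, signal_handler)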
def download_large_file(proj_path, url, actually_download=True, root=False, **kwargs):
    """
    Download something to proj_path, unless it's CoreNLP, which goes to ~/corenlp
    """
    import os
    import shutil
    import glob
    import zipfile
    from time import localtime, strftime
    from corpkit.textprogressbar import TextProgressBar
    from corpkit.process import animator
    file_name = url.split('/')[-1]
    home = os.path.expanduser("~")
    customdir = kwargs.get('custom_corenlp_dir', False)
    # if it's corenlp, put it in home/corenlp
    # if that dir exists, check it for a zip file
    # if there's a zipfile and it works, move on
    # if there's a zipfile and it's broken, delete it
    if 'stanford' in url:
        if customdir:
            downloaded_dir = customdir
        else:
            downloaded_dir = os.path.join(home, 'corenlp')
        if not os.path.isdir(downloaded_dir):
            os.makedirs(downloaded_dir)
        else:
            poss_zips = glob.glob(os.path.join(downloaded_dir, 'stanford-corenlp-full*.zip'))
            if poss_zips:
                fullfile = poss_zips[-1]
                from zipfile import BadZipfile
                try:
                    the_zip_file = zipfile.ZipFile(fullfile)
                    ret = the_zip_file.testzip()
                    if ret is None:
                        return downloaded_dir, fullfile
                    else:
                        os.remove(fullfile)
                except BadZipfile:
                    os.remove(fullfile)
            #else:
            #    shutil.rmtree(downloaded_dir)
    else:
        downloaded_dir = os.path.join(proj_path, 'temp')
        try:
            os.makedirs(downloaded_dir)
        except OSError:
            pass
    fullfile = os.path.join(downloaded_dir, file_name)

    if actually_download:
        import __main__ as main
        if not root and not hasattr(main, '__file__'):
            txt = 'CoreNLP not found. Download latest version (%s)? (y/n) ' % url
            selection = INPUTFUNC(txt)
            if 'n' in selection.lower():
                return None, None
        try:
            import requests
            # NOTE the stream=True parameter
            r = requests.get(url, stream=True, verify=False)
            file_size = int(r.headers['content-length'])
            file_size_dl = 0
            block_sz = 8192
            showlength = file_size / block_sz
            thetime = strftime("%H:%M:%S", localtime())
            print('\n%s: Downloading ... \n' % thetime)
            par_args = {'printstatus': kwargs.get('printstatus', True),
                        'length': showlength}
            if not root:
                tstr = '%d/%d' % ((file_size_dl + 1) / block_sz, showlength)
                p = animator(None, None, init=True, tot_string=tstr, **par_args)
                animator(p, file_size_dl + 1, tstr)
            with open(fullfile, 'wb') as f:
                for chunk in r.iter_content(chunk_size=block_sz):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
                        file_size_dl += len(chunk)
                        #print file_size_dl * 100.0 / file_size
                        if kwargs.get('note'):
                            kwargs['note'].progvar.set(file_size_dl * 100.0 / int(file_size))
                        else:
                            tstr = '%d/%d' % (file_size_dl / block_sz, showlength)
                            animator(p, file_size_dl / block_sz, tstr, **par_args)
                        if root:
                            root.update()
        except Exception as err:
            import traceback
            print(traceback.format_exc())
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Download failed' % thetime)
            try:
                f.close()
            except:
                pass
            if root:
                root.update()
            return None, None
        if kwargs.get('note'):
            kwargs['note'].progvar.set(100)
        else:
            p.animate(int(file_size))
        thetime = strftime("%H:%M:%S", localtime())
        print('\n%s: Downloaded successfully.' % thetime)
        try:
            f.close()
        except:
            pass
    return downloaded_dir, fullfile
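
# Usage sketch for download_large_file(). The URL names a hypothetical
# CoreNLP release, not a guaranteed-current one; actually_download=False
# just resolves and returns the target paths without fetching anything.
def _example_download_usage():  # pragma: no cover
    url = 'http://nlp.stanford.edu/software/stanford-corenlp-full-2015-12-09.zip'
    downloaded_dir, fullfile = download_large_file('.', url,
                                                   actually_download=False)
    # e.g. ('~/corenlp', '~/corenlp/stanford-corenlp-full-2015-12-09.zip')
    return downloaded_dir, fullfile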
def save(interrogation, savename, savedir='saved_interrogations', **kwargs):
    """
    Save an interrogation as pickle to *savedir*.

       >>> interro = interrogator(corpus, 'words', 'any')
       >>> save(interro, 'savename')

    will create ``./saved_interrogations/savename.p``

    :param interrogation: Corpus interrogation to save
    :type interrogation: corpkit interrogation/edited result
    :param savename: A name for the saved file
    :type savename: str
    :param savedir: Relative path to directory in which to save file
    :type savedir: str
    :param print_info: Show/hide stdout
    :type print_info: bool
    :returns: None
    """
    try:
        import cPickle as pickle
    except ImportError:
        import pickle as pickle
    import os
    from time import localtime, strftime
    import corpkit
    from corpkit.process import makesafe, sanitise_dict
    from corpkit.interrogation import Interrogation
    from corpkit.corpus import Corpus, Datalist

    print_info = kwargs.get('print_info', True)

    def make_filename(interrogation, savename):
        """create a filename"""
        if '/' in savename:
            return savename
        firstpart = ''
        if savename.endswith('.p'):
            savename = savename[:-2]
        savename = makesafe(savename, drop_datatype=False, hyphens_ok=True)
        if not savename.endswith('.p'):
            savename = savename + '.p'
        if hasattr(interrogation, 'query') and isinstance(interrogation.query, dict):
            corpus = interrogation.query.get('corpus', False)
            if corpus:
                if isinstance(corpus, STRINGTYPE):
                    firstpart = corpus
                elif isinstance(corpus, Datalist):
                    firstpart = Corpus(corpus).name
                elif hasattr(corpus, 'name'):
                    firstpart = corpus.name
                else:
                    firstpart = ''
        firstpart = os.path.basename(firstpart)
        if firstpart:
            return firstpart + '-' + savename
        else:
            return savename

    savename = make_filename(interrogation, savename)

    # delete unpicklable parts of query
    if hasattr(interrogation, 'query') and isinstance(interrogation.query, dict):
        iq = interrogation.query
        if iq:
            from types import ModuleType, FunctionType, BuiltinMethodType, BuiltinFunctionType
            interrogation.query = {k: v for k, v in iq.items() if not isinstance(v, ModuleType) \
                                   and not isinstance(v, FunctionType) \
                                   and not isinstance(v, BuiltinFunctionType) \
                                   and not isinstance(v, BuiltinMethodType)}
        else:
            iq = {}

    if savedir and not '/' in savename:
        if not os.path.exists(savedir):
            os.makedirs(savedir)
        fullpath = os.path.join(savedir, savename)
    else:
        fullpath = savename

    while os.path.isfile(fullpath):
        selection = INPUTFUNC(("\nSave error: %s already exists in %s.\n\n"
                               "Type 'o' to overwrite, or enter a new name: " % (savename, savedir)))
        if selection == 'o' or selection == 'O':
            os.remove(fullpath)
        else:
            selection = selection.replace('.p', '')
            if not selection.endswith('.p'):
                selection = selection + '.p'
            fullpath = os.path.join(savedir, selection)

    if hasattr(interrogation, 'query'):
        interrogation.query = sanitise_dict(interrogation.query)

    with open(fullpath, 'wb') as fo:
        pickle.dump(interrogation, fo)

    time = strftime("%H:%M:%S", localtime())
    if print_info:
        print('\n%s: Data saved: %s\n' % (time, fullpath))
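
# Usage sketch for save(). The corpus path and interrogate() arguments are
# hypothetical stand-ins for a real corpkit project; any pickleable
# interrogation or edited result can be saved.
def _example_save_usage():  # pragma: no cover
    from corpkit.corpus import Corpus
    corpus = Corpus('data/example-parsed')      # hypothetical parsed corpus
    result = corpus.interrogate('w', 'any')     # any interrogation will do
    # writes ./saved_interrogations/<corpusname>-any_words.p
    save(result, 'any_words')
    # choose another directory and silence stdout
    save(result, 'any_words', savedir='tmp', print_info=False)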
def make_corpus(unparsed_corpus_path,
                project_path=None,
                parse=True,
                tokenise=False,
                corenlppath=False,
                nltk_data_path=False,
                operations=False,
                speaker_segmentation=False,
                root=False,
                multiprocess=False,
                split_texts=400,
                **kwargs):
    """
    Create a parsed version of unparsed_corpus using CoreNLP or NLTK's tokeniser

    :param unparsed_corpus_path: path to corpus containing text files,
                                 or subdirs containing text files
    :type unparsed_corpus_path: str
    :param project_path: path to corpkit project
    :type project_path: str
    :param parse: Do parsing?
    :type parse: bool
    :param tokenise: Do tokenising?
    :type tokenise: bool
    :param corenlppath: folder containing corenlp jar files
    :type corenlppath: str
    :param nltk_data_path: path to tokeniser if tokenising
    :type nltk_data_path: str
    :param operations: which kinds of annotations to do
    :type operations: str
    :param speaker_segmentation: add speaker name to parser output
                                 if your corpus is script-like
    :type speaker_segmentation: bool
    :returns: list of paths to created corpora
    """
    import sys
    import os
    from os.path import join, isfile, isdir, basename, splitext, exists
    import shutil
    import codecs
    from corpkit.build import folderise, can_folderise
    from corpkit.process import saferead

    pyver = sys.version_info.major

    from corpkit.build import (get_corpus_filepaths, check_jdk, add_ids_to_xml,
                               rename_all_files, make_no_id_corpus, parse_corpus,
                               move_parsed_files)

    if parse is True and tokenise is True:
        raise ValueError('Select either parse or tokenise, not both.')

    if project_path is None:
        project_path = os.getcwd()

    fileparse = isfile(unparsed_corpus_path)
    if fileparse:
        copier = shutil.copyfile
    else:
        copier = shutil.copytree

    # raise error if no tokeniser
    if tokenise:
        newpath = unparsed_corpus_path + '-tokenised'
        if isdir(newpath):
            shutil.rmtree(newpath)
        import nltk
        if nltk_data_path:
            if nltk_data_path not in nltk.data.path:
                nltk.data.path.append(nltk_data_path)
        try:
            from nltk import word_tokenize as tokenise
        except ImportError:
            print('\nTokeniser not found. Pass in its path as keyword arg "nltk_data_path = <path>".\n')
            raise

    if sys.platform == "darwin":
        if not check_jdk():
            print("Get the latest Java from http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html")

    cop_head = kwargs.get('copula_head', True)
    note = kwargs.get('note', False)
    stdout = kwargs.get('stdout', False)

    # make absolute path to corpus
    unparsed_corpus_path = os.path.abspath(unparsed_corpus_path)

    # move it into project
    if fileparse:
        datapath = project_path
    else:
        datapath = join(project_path, 'data')

    if isdir(datapath):
        newp = join(datapath, basename(unparsed_corpus_path))
    else:
        os.makedirs(datapath)
        if fileparse:
            noext = splitext(unparsed_corpus_path)[0]
            newp = join(datapath, basename(noext))
        else:
            newp = join(datapath, basename(unparsed_corpus_path))

    if exists(newp):
        pass
    else:
        copier(unparsed_corpus_path, newp)
    unparsed_corpus_path = newp

    # ask to folderise?
    check_do_folderise = False
    do_folderise = kwargs.get('folderise', None)
    if can_folderise(unparsed_corpus_path):
        if do_folderise is None:
            check_do_folderise = INPUTFUNC("Your corpus has multiple files, but no subcorpora. "
                                           "Would you like each file to be treated as a subcorpus? (y/n) ")
            check_do_folderise = check_do_folderise.lower().startswith('y')
        if check_do_folderise or do_folderise:
            folderise(unparsed_corpus_path)

    # this is bad!
    if join('data', 'data') in unparsed_corpus_path:
        unparsed_corpus_path = unparsed_corpus_path.replace(join('data', 'data'), 'data')

    if parse:

        def chunks(l, n):
            for i in range(0, len(l), n):
                yield l[i:i + n]

        # this loop shortens files containing more than 500 lines, for corenlp memory's sake
        # maybe user needs a warning or something in case s/he is doing coref
        for rootx, dirs, fs in os.walk(unparsed_corpus_path):
            for f in fs:
                if f.startswith('.'):
                    continue
                fp = join(rootx, f)
                data, enc = saferead(fp)
                data = data.splitlines()
                if len(data) > split_texts:
                    chk = chunks(data, split_texts)
                    for index, c in enumerate(chk):
                        newname = fp.replace('.txt', '-%s.txt' % str(index + 1).zfill(3))
                        with codecs.open(newname, 'w', encoding='utf-8') as fo:
                            txt = '\n'.join(c) + '\n'
                            try:
                                fo.write(txt.encode('utf-8'))
                            except TypeError:
                                fo.write(txt)
                    os.remove(fp)
                else:
                    pass
                    #newname = fp.replace('.txt', '-000.txt')
                    #os.rename(fp, newname)

        if speaker_segmentation:
            newpath = unparsed_corpus_path + '-stripped-parsed'
            if isdir(newpath) and not root:
                ans = INPUTFUNC('\n Path exists: %s. Do you want to overwrite? (y/n)\n' % newpath)
                if ans.lower().strip().startswith('y'):
                    shutil.rmtree(newpath)
                else:
                    return
            elif isdir(newpath) and root:
                raise OSError('Path exists: %s' % newpath)
            print('Processing speaker IDs ...')
            make_no_id_corpus(unparsed_corpus_path, unparsed_corpus_path + '-stripped')
            to_parse = unparsed_corpus_path + '-stripped'
        else:
            to_parse = unparsed_corpus_path

        if not fileparse:
            print('Making list of files ... ')

        if not fileparse:
            pp = os.path.dirname(unparsed_corpus_path)
            filelist = get_corpus_filepaths(projpath=pp, corpuspath=to_parse)
        else:
            filelist = unparsed_corpus_path.replace('.txt', '-filelist.txt')
            with open(filelist, 'w') as fo:
                fo.write(unparsed_corpus_path + '\n')

        if multiprocess is not False:
            if multiprocess is True:
                import multiprocessing
                multiprocess = multiprocessing.cpu_count()
            from joblib import Parallel, delayed
            # split old file into n parts
            data, enc = saferead(filelist)
            fs = [i for i in data.splitlines() if i]
            # make generator with list of lists
            divl = int(len(fs) / multiprocess)
            fgen = chunks(fs, divl)
            filelists = []
            # for each list, make new file
            for index, flist in enumerate(fgen):
                as_str = '\n'.join(flist) + '\n'
                new_fpath = filelist.replace('.txt', '-%s.txt' % str(index).zfill(4))
                filelists.append(new_fpath)
                with codecs.open(new_fpath, 'w', encoding='utf-8') as fo:
                    try:
                        fo.write(as_str.encode('utf-8'))
                    except TypeError:
                        fo.write(as_str)
            try:
                os.remove(filelist)
            except:
                pass

            ds = []
            for listpath in filelists:
                d = {'proj_path': project_path,
                     'corpuspath': to_parse,
                     'filelist': listpath,
                     'corenlppath': corenlppath,
                     'nltk_data_path': nltk_data_path,
                     'operations': operations,
                     'copula_head': cop_head,
                     'multiprocessing': True,
                     'root': root,
                     'note': note,
                     'stdout': stdout}
                ds.append(d)

            res = Parallel(n_jobs=multiprocess)(delayed(parse_corpus)(**x) for x in ds)
            if len(res) > 0:
                newparsed = res[0]
            else:
                return
            if all(r is False for r in res):
                return

            for i in filelists:
                try:
                    os.remove(i)
                except:
                    pass
        else:
            newparsed = parse_corpus(proj_path=project_path,
                                     corpuspath=to_parse,
                                     filelist=filelist,
                                     corenlppath=corenlppath,
                                     nltk_data_path=nltk_data_path,
                                     operations=operations,
                                     copula_head=cop_head,
                                     root=root,
                                     note=note,
                                     stdout=stdout,
                                     fileparse=fileparse)

        if not newparsed:
            return
        if all(not x for x in newparsed):
            return

        if fileparse:
            # cleanup mistakes :)
            if isfile(splitext(unparsed_corpus_path)[0]):
                os.remove(splitext(unparsed_corpus_path)[0])
            if isfile(unparsed_corpus_path.replace('.txt', '-filelist.txt')):
                os.remove(unparsed_corpus_path.replace('.txt', '-filelist.txt'))
            return unparsed_corpus_path + '.xml'

        move_parsed_files(project_path, to_parse, newparsed)
        outpath = newparsed
        if speaker_segmentation:
            add_ids_to_xml(newparsed)
        try:
            os.remove(filelist)
        except:
            pass
    else:
        filelist = get_corpus_filepaths(projpath=os.path.dirname(unparsed_corpus_path),
                                        corpuspath=unparsed_corpus_path)

    if tokenise:
        newtok = parse_corpus(proj_path=project_path,
                              corpuspath=unparsed_corpus_path,
                              filelist=filelist,
                              nltk_data_path=nltk_data_path,
                              operations=operations,
                              only_tokenise=True)
        if newtok is False:
            return
        outpath = newtok

    rename_all_files(outpath)
    print('Done!\n')
    return outpath
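
# Usage sketch for make_corpus(). All paths are hypothetical; parsing
# requires a working CoreNLP install (see download_large_file above),
# and tokenising requires NLTK plus its data.
def _example_make_corpus_usage():  # pragma: no cover
    parsed_path = make_corpus('data/example',        # dir of .txt files
                              project_path='.',      # a corpkit project
                              speaker_segmentation=True,
                              multiprocess=False)
    return parsed_path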
def download_large_file(proj_path, url, actually_download=True, root=False, **kwargs):
    """
    Download something to proj_path
    """
    import corpkit
    import os
    import shutil
    import glob
    import sys
    import zipfile
    from time import localtime, strftime
    from corpkit.textprogressbar import TextProgressBar
    from corpkit.process import animator
    file_name = url.split('/')[-1]
    home = os.path.expanduser("~")
    # if it's corenlp, put it in home/corenlp
    # if that dir exists, check it for a zip file
    # if there's a zipfile and it works, move on
    # if there's a zipfile and it's broken, delete it
    if 'stanford' in url:
        downloaded_dir = os.path.join(home, 'corenlp')
        if not os.path.isdir(downloaded_dir):
            os.makedirs(downloaded_dir)
        else:
            poss_zips = glob.glob(os.path.join(downloaded_dir, 'stanford-corenlp-full*.zip'))
            if poss_zips:
                fullfile = poss_zips[-1]
                the_zip_file = zipfile.ZipFile(fullfile)
                ret = the_zip_file.testzip()
                if ret is None:
                    return downloaded_dir, fullfile
                else:
                    os.remove(fullfile)
            #else:
            #    shutil.rmtree(downloaded_dir)
    else:
        downloaded_dir = os.path.join(proj_path, 'temp')
        try:
            os.makedirs(downloaded_dir)
        except OSError:
            pass
    fullfile = os.path.join(downloaded_dir, file_name)

    if actually_download:
        if not root:
            txt = 'CoreNLP not found. Download latest version (%s)? (y/n) ' % url
            selection = INPUTFUNC(txt)
            if 'n' in selection.lower():
                return None, None
        try:
            import requests
            # NOTE the stream=True parameter
            r = requests.get(url, stream=True, verify=False)
            file_size = int(r.headers['content-length'])
            file_size_dl = 0
            block_sz = 8192
            showlength = file_size / block_sz
            thetime = strftime("%H:%M:%S", localtime())
            print('\n%s: Downloading ... \n' % thetime)
            par_args = {'printstatus': kwargs.get('printstatus', True),
                        'length': showlength}
            if not root:
                tstr = '%d/%d' % ((file_size_dl + 1) / block_sz, showlength)
                p = animator(None, None, init=True, tot_string=tstr, **par_args)
                animator(p, file_size_dl + 1, tstr)
            with open(fullfile, 'wb') as f:
                for chunk in r.iter_content(chunk_size=block_sz):
                    if chunk:  # filter out keep-alive new chunks
                        f.write(chunk)
                        file_size_dl += len(chunk)
                        #print file_size_dl * 100.0 / file_size
                        if kwargs.get('note'):
                            kwargs['note'].progvar.set(file_size_dl * 100.0 / int(file_size))
                        else:
                            tstr = '%d/%d' % (file_size_dl / block_sz, showlength)
                            animator(p, file_size_dl / block_sz, tstr, **par_args)
                        if root:
                            root.update()
        except Exception as err:
            import traceback
            print(traceback.format_exc())
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Download failed' % thetime)
            try:
                f.close()
            except:
                pass
            if root:
                root.update()
            return None, None
        if kwargs.get('note'):
            kwargs['note'].progvar.set(100)
        else:
            p.animate(int(file_size))
        thetime = strftime("%H:%M:%S", localtime())
        print('\n%s: Downloaded successfully.' % thetime)
        try:
            f.close()
        except:
            pass
    return downloaded_dir, fullfile
def tregex_engine(corpus=False,
                  options=False,
                  query=False,
                  check_query=False,
                  check_for_trees=False,
                  just_content_words=False,
                  root=False,
                  preserve_case=False,
                  **kwargs):
    """
    Run a Java Tregex query

    :param query: tregex query
    :type query: str
    :param options: list of tregex options
    :type options: list of strs -- ['-t', '-o']
    :param corpus: place to search
    :type corpus: str
    :param check_query: just make sure query ok
    :type check_query: bool
    :param check_for_trees: find out if corpus contains parse trees
    :type check_for_trees: bool
    :returns: list of search results
    """
    import corpkit
    add_corpkit_to_path()

    # in case someone compiles the tregex query
    try:
        query = query.pattern
    except AttributeError:
        query = query

    import subprocess
    from subprocess import Popen, PIPE, STDOUT
    import re
    from time import localtime, strftime
    from corpkit.dictionaries.word_transforms import wordlist
    import os
    import sys

    DEVNULL = open(os.devnull, 'w')
    if check_query or check_for_trees:
        send_stderr_to = subprocess.STDOUT
        send_stdout_to = DEVNULL
    else:
        send_stderr_to = DEVNULL
        send_stdout_to = subprocess.STDOUT

    filtermode = False
    if isinstance(options, list):
        filtermode = '-filter' in options
    if filtermode:
        options.pop(options.index('-filter'))

    on_cloud = checkstack('/opt/python/lib')

    # if check_query, enter the while loop
    # if not, get out of it
    an_error_occurred = True

    # site pack path
    corpath = os.path.join(os.path.dirname(corpkit.__file__))
    cor1 = os.path.join(corpath, 'tregex.sh')
    cor2 = os.path.join(corpath, 'corpkit', 'tregex.sh')
    # pyinstaller
    pyi = sys.argv[0].split('Contents/MacOS')[0] + 'Contents/MacOS/tregex.sh'

    possible_paths = ['tregex.sh', corpath, pyi, cor1, cor2]

    while an_error_occurred:
        tregex_file_found = False
        for i in possible_paths:
            if os.path.isfile(i):
                tregex_command = [i]
                tregex_file_found = True
                break
        if not tregex_file_found:
            thetime = strftime("%H:%M:%S", localtime())
            print("%s: Couldn't find Tregex in %s."
                  % (thetime, ', '.join(possible_paths)))
            return False

        if not query:
            query = 'NP'
        # if checking for trees, use the -T option
        if check_for_trees:
            options = ['-o', '-T']

        filenaming = False
        if isinstance(options, list):
            if '-f' in options:
                filenaming = True

        # append list of options to query
        if options:
            if '-s' not in options and '-t' not in options:
                options.append('-s')
        else:
            options = ['-o', '-t']
        for opt in options:
            tregex_command.append(opt)
        if query:
            tregex_command.append(query)

        # if corpus is string or unicode, and is path, add that
        # if it's not string or unicode, it's some kind of corpus obj
        # in which case, add its path var
        if corpus:
            if isinstance(corpus, STRINGTYPE):
                if os.path.isdir(corpus) or os.path.isfile(corpus):
                    tregex_command.append(corpus)
                else:
                    filtermode = True
            elif hasattr(corpus, 'path'):
                tregex_command.append(corpus.path)
        if filtermode:
            tregex_command.append('-filter')

        if not filtermode:
            res = subprocess.check_output(tregex_command, stderr=send_stderr_to)
            res = res.decode(encoding='UTF-8').splitlines()
        else:
            p = Popen(tregex_command, stdout=PIPE, stdin=PIPE, stderr=send_stderr_to)
            p.stdin.write(corpus.encode('UTF-8', errors='ignore'))
            res = p.communicate()[0].decode(encoding='UTF-8').splitlines()
            p.stdin.close()

        # Fix up the stderr stdout rubbish
        if check_query:
            # define error searches
            tregex_error = re.compile(r'^Error parsing expression')
            regex_error = re.compile(r'^Exception in thread.*PatternSyntaxException')
            # if tregex error, give general error message
            if re.match(tregex_error, res[0]):
                if root:
                    time = strftime("%H:%M:%S", localtime())
                    print('%s: Error parsing Tregex query.' % time)
                    return False
                time = strftime("%H:%M:%S", localtime())
                selection = INPUTFUNC('\n%s: Error parsing Tregex expression "%s".'
                                      '\nWould you like to:\n\n'
                                      ' a) rewrite it now\n'
                                      ' b) exit\n\nYour selection: ' % (time, query))
                if 'a' in selection.lower():
                    query = INPUTFUNC('\nNew Tregex query: ')
                elif 'b' in selection.lower():
                    print('')
                    return False

            # if regex error, try to help
            elif re.match(regex_error, res[0]):
                if root:
                    time = strftime("%H:%M:%S", localtime())
                    print('%s: Regular expression in Tregex query contains an error.' % time)
                    return False
                info = res[0].split(':')
                index_of_error = re.findall(r'index [0-9]+', info[1])
                justnum = index_of_error[0].split('dex ')
                spaces = ' ' * int(justnum[1])
                remove_start = query.split('/', 1)
                remove_end = remove_start[1].split('/', -1)
                time = strftime("%H:%M:%S", localtime())
                selection = INPUTFUNC('\n%s: Error parsing regex inside Tregex query: %s'
                                      '. Best guess: \n%s\n%s^\n\nYou can either: \n'
                                      ' a) rewrite it now\n'
                                      ' b) exit\n\nYour selection: '
                                      % (time, str(info[1]), str(remove_end[0]), spaces))
                if 'a' in selection:
                    query = INPUTFUNC('\nNew Tregex query: ')
                elif 'b' in selection:
                    print('')
                    return
            else:
                an_error_occurred = False
                return query
        # if not query checking, leave this horrible while loop
        else:
            an_error_occurred = False

    # counting is easy, just get out with the number
    if '-C' in options:
        return int(res[-1])

    res = [r.strip() for r in res if r.strip()]

    # this is way slower than it needs to be, because it searches a whole subcorpus!
    if check_for_trees:
        if res and res[0].startswith('1:Next tree read:'):
            return True
        else:
            return False

    # return if no matches
    if not res:
        return []

    # make unicode and lowercase
    make_tuples = []
    if filenaming:
        for index, r in enumerate(res):
            if r.startswith('# /'):
                make_tuples.append([r, res[index + 1]])
        res = make_tuples

    if not preserve_case:
        if not filenaming:
            res = [w.lower().replace('/', '-slash-') for w in res]
        else:
            res = [[t, w.lower().replace('/', '-slash-')] for t, w in res]
    return res
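
# Usage sketch for tregex_engine(). The query and corpus path are
# illustrative. With check_query=True, a validated (possibly user-corrected)
# query string comes back, or False on failure; '-o' requests
# non-overlapping matches and '-t' the matched terminals.
def _example_tregex_usage():  # pragma: no cover
    # validate the query before running it over a whole corpus
    q = tregex_engine(query=r'NP < NN', check_query=True)
    if q is False:
        return []
    return tregex_engine(corpus='data/example-parsed',  # hypothetical path
                         query=q,
                         options=['-o', '-t'])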
def make_corpus(unparsed_corpus_path,
                project_path=None,
                parse=True,
                tokenise=False,
                postag=False,
                lemmatise=False,
                corenlppath=False,
                nltk_data_path=False,
                operations=False,
                speaker_segmentation=False,
                root=False,
                multiprocess=False,
                split_texts=400,
                outname=False,
                metadata=False,
                restart=False,
                coref=True,
                lang='en',
                **kwargs):
    """
    Create a parsed version of unparsed_corpus using CoreNLP or NLTK's tokeniser

    :param unparsed_corpus_path: path to corpus containing text files,
                                 or subdirs containing text files
    :type unparsed_corpus_path: str
    :param project_path: path to corpkit project
    :type project_path: str
    :param parse: Do parsing?
    :type parse: bool
    :param tokenise: Do tokenising?
    :type tokenise: bool
    :param corenlppath: folder containing corenlp jar files
    :type corenlppath: str
    :param nltk_data_path: path to tokeniser if tokenising
    :type nltk_data_path: str
    :param operations: which kinds of annotations to do
    :type operations: str
    :param speaker_segmentation: add speaker name to parser output
                                 if your corpus is script-like
    :type speaker_segmentation: bool
    :returns: list of paths to created corpora
    """
    import sys
    import os
    from os.path import join, isfile, isdir, basename, splitext, exists
    import shutil
    import codecs
    from corpkit.build import folderise, can_folderise
    from corpkit.process import saferead, make_dotfile

    from corpkit.build import (get_corpus_filepaths, check_jdk,
                               rename_all_files, make_no_id_corpus, parse_corpus,
                               move_parsed_files)

    from corpkit.constants import REPEAT_PARSE_ATTEMPTS, PYTHON_VERSION

    if parse is True and tokenise is True:
        raise ValueError('Select either parse or tokenise, not both.')

    if project_path is None:
        project_path = os.getcwd()

    fileparse = isfile(unparsed_corpus_path)
    if fileparse:
        copier = shutil.copyfile
    else:
        copier = shutil.copytree

    # raise error if no tokeniser
    #if tokenise:
    #    if outname:
    #        newpath = os.path.join(os.path.dirname(unparsed_corpus_path), outname)
    #    else:
    #        newpath = unparsed_corpus_path + '-tokenised'
    #    if isdir(newpath):
    #        shutil.rmtree(newpath)
    #    import nltk
    #    if nltk_data_path:
    #        if nltk_data_path not in nltk.data.path:
    #            nltk.data.path.append(nltk_data_path)
    #    try:
    #        from nltk import word_tokenize as tokenise
    #    except:
    #        print('\nTokeniser not found. Pass in its path as keyword arg "nltk_data_path = <path>".\n')
    #        raise

    if sys.platform == "darwin":
        if not check_jdk():
            print("Get the latest Java from http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html")

    cop_head = kwargs.get('copula_head', True)
    note = kwargs.get('note', False)
    stdout = kwargs.get('stdout', False)

    # make absolute path to corpus
    unparsed_corpus_path = os.path.abspath(unparsed_corpus_path)

    # move it into project
    if fileparse:
        datapath = project_path
    else:
        datapath = join(project_path, 'data')

    if isdir(datapath):
        newp = join(datapath, basename(unparsed_corpus_path))
    else:
        os.makedirs(datapath)
        if fileparse:
            noext = splitext(unparsed_corpus_path)[0]
            newp = join(datapath, basename(noext))
        else:
            newp = join(datapath, basename(unparsed_corpus_path))

    if exists(newp):
        pass
    else:
        copier(unparsed_corpus_path, newp)
    unparsed_corpus_path = newp

    # ask to folderise?
    check_do_folderise = False
    do_folderise = kwargs.get('folderise', None)
    if can_folderise(unparsed_corpus_path):
        import __main__ as main
        if do_folderise is None and not hasattr(main, '__file__'):
            check_do_folderise = INPUTFUNC("Your corpus has multiple files, but no subcorpora. "
                                           "Would you like each file to be treated as a subcorpus? (y/n) ")
            check_do_folderise = check_do_folderise.lower().startswith('y')
        if check_do_folderise or do_folderise:
            folderise(unparsed_corpus_path)

    # this is bad!
    if join('data', 'data') in unparsed_corpus_path:
        unparsed_corpus_path = unparsed_corpus_path.replace(join('data', 'data'), 'data')

    def chunks(l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]

    if parse or tokenise:
        # this loop shortens files containing more than 500 lines,
        # for corenlp memory's sake. maybe user needs a warning or
        # something in case s/he is doing coref?
        for rootx, dirs, fs in os.walk(unparsed_corpus_path):
            for f in fs:
                if f.startswith('.'):
                    continue
                fp = join(rootx, f)
                data, enc = saferead(fp)
                data = data.splitlines()
                if len(data) > split_texts:
                    chk = chunks(data, split_texts)
                    for index, c in enumerate(chk):
                        newname = fp.replace('.txt', '-%s.txt' % str(index + 1).zfill(3))
                        # does this work?
                        if PYTHON_VERSION == 2:
                            with codecs.open(newname, 'w', encoding=enc) as fo:
                                txt = '\n'.join(c) + '\n'
                                fo.write(txt.encode(enc))
                        else:
                            with open(newname, 'w', encoding=enc) as fo:
                                txt = '\n'.join(c) + '\n'
                                fo.write(txt)
                    os.remove(fp)
                else:
                    pass
                    #newname = fp.replace('.txt', '-000.txt')
                    #os.rename(fp, newname)

    if outname:
        newpath = os.path.join(os.path.dirname(unparsed_corpus_path), outname)
    else:
        newpath = unparsed_corpus_path + '-parsed'

    if restart:
        restart = newpath

    if speaker_segmentation or metadata:
        if isdir(newpath) and not root:
            import __main__ as main
            if not restart and not hasattr(main, '__file__'):
                ans = INPUTFUNC('\n Path exists: %s. Do you want to overwrite? (y/n)\n' % newpath)
                if ans.lower().strip().startswith('y'):
                    shutil.rmtree(newpath)
                else:
                    return
        elif isdir(newpath) and root:
            raise OSError('Path exists: %s' % newpath)
        if speaker_segmentation:
            print('Processing speaker IDs ...')
        make_no_id_corpus(unparsed_corpus_path,
                          unparsed_corpus_path + '-stripped',
                          metadata_mode=metadata,
                          speaker_segmentation=speaker_segmentation)
        to_parse = unparsed_corpus_path + '-stripped'
    else:
        to_parse = unparsed_corpus_path

    if not fileparse:
        print('Making list of files ... ')

    # now we enter a while loop while not all files are parsed
    #todo: these file lists are not necessary when not parsing
    if outname:
        newparsed = os.path.join(project_path, 'data', outname)
    else:
        basecp = os.path.basename(to_parse)
        newparsed = os.path.join(project_path, 'data', '%s-parsed' % basecp)
        newparsed = newparsed.replace('-stripped-', '-')

    while REPEAT_PARSE_ATTEMPTS:
        if not parse:
            break

        if not fileparse:
            pp = os.path.dirname(unparsed_corpus_path)
            # if restart mode, the filepaths won't include those already parsed...
            filelist, fs = get_corpus_filepaths(projpath=pp,
                                                corpuspath=to_parse,
                                                restart=restart,
                                                out_ext=kwargs.get('output_format'))
        else:
            filelist = unparsed_corpus_path.replace('.txt', '-filelist.txt')
            with open(filelist, 'w') as fo:
                fo.write(unparsed_corpus_path + '\n')

        # split up filelists
        if multiprocess is not False:
            if multiprocess is True:
                import multiprocessing
                multiprocess = multiprocessing.cpu_count()
            from joblib import Parallel, delayed
            # split old file into n parts
            if os.path.isfile(filelist):
                data, enc = saferead(filelist)
                fs = [i for i in data.splitlines() if i]
            else:
                fs = []
            # if there's nothing here, we're done
            if not fs:
                # double dutch
                REPEAT_PARSE_ATTEMPTS = 0
                break
            if len(fs) <= multiprocess:
                multiprocess = len(fs)
            # make generator with list of lists
            divl = int(len(fs) / multiprocess)
            filelists = []
            if not divl:
                filelists.append(filelist)
            else:
                fgen = chunks(fs, divl)
                # for each list, make new file
                from corpkit.constants import OPENER
                for index, flist in enumerate(fgen):
                    as_str = '\n'.join(flist) + '\n'
                    new_fpath = filelist.replace('.txt', '-%s.txt' % str(index).zfill(4))
                    filelists.append(new_fpath)
                    with OPENER(new_fpath, 'w', encoding='utf-8') as fo:
                        try:
                            fo.write(as_str.encode('utf-8'))
                        except TypeError:
                            fo.write(as_str)
                try:
                    os.remove(filelist)
                except:
                    pass

            ds = []
            for listpath in filelists:
                d = {'proj_path': project_path,
                     'corpuspath': to_parse,
                     'filelist': listpath,
                     'corenlppath': corenlppath,
                     'nltk_data_path': nltk_data_path,
                     'operations': operations,
                     'copula_head': cop_head,
                     'multiprocessing': True,
                     'root': root,
                     'note': note,
                     'stdout': stdout,
                     'outname': outname,
                     'coref': coref,
                     'output_format': kwargs.get('output_format', 'conll')}
                ds.append(d)

            res = Parallel(n_jobs=multiprocess)(delayed(parse_corpus)(**x) for x in ds)
            if len(res) > 0:
                newparsed = res[0]
            else:
                return
            if all(r is False for r in res):
                return
            for i in filelists:
                try:
                    os.remove(i)
                except:
                    pass
        else:
            newparsed = parse_corpus(proj_path=project_path,
                                     corpuspath=to_parse,
                                     filelist=filelist,
                                     corenlppath=corenlppath,
                                     nltk_data_path=nltk_data_path,
                                     operations=operations,
                                     copula_head=cop_head,
                                     root=root,
                                     note=note,
                                     stdout=stdout,
                                     fileparse=fileparse,
                                     outname=outname,
                                     output_format=kwargs.get('output_format', 'conll'))

        if not restart:
            REPEAT_PARSE_ATTEMPTS = 0
        else:
            REPEAT_PARSE_ATTEMPTS -= 1
            print('Repeating parsing due to missing files. '
                  '%d iterations remaining.' % REPEAT_PARSE_ATTEMPTS)

    if parse and not newparsed:
        return
    if parse and all(not x for x in newparsed):
        print('Error after parsing.')
        return

    if parse and fileparse:
        # cleanup mistakes :)
        if isfile(splitext(unparsed_corpus_path)[0]):
            os.remove(splitext(unparsed_corpus_path)[0])
        if isfile(unparsed_corpus_path.replace('.txt', '-filelist.txt')):
            os.remove(unparsed_corpus_path.replace('.txt', '-filelist.txt'))
        return unparsed_corpus_path + '.conll'

    if parse:
        move_parsed_files(project_path, to_parse, newparsed,
                          ext=kwargs.get('output_format', 'conll'), restart=restart)
        from corpkit.conll import convert_json_to_conll
        coref = False
        if operations is False:
            coref = True
        elif 'coref' in operations or 'dcoref' in operations:
            coref = True
        convert_json_to_conll(newparsed,
                              speaker_segmentation=speaker_segmentation,
                              coref=coref,
                              metadata=metadata)
        try:
            os.remove(filelist)
        except:
            pass

    if not parse and tokenise:
        #todo: outname
        newparsed = to_parse.replace('-stripped', '-tokenised')
        from corpkit.tokenise import plaintext_to_conll
        newparsed = plaintext_to_conll(to_parse,
                                       postag=postag,
                                       lemmatise=lemmatise,
                                       lang=lang,
                                       metadata=metadata,
                                       nltk_data_path=nltk_data_path,
                                       speaker_segmentation=speaker_segmentation,
                                       outpath=newparsed)

        if outname:
            if not os.path.isdir(outname):
                outname = os.path.join('data', os.path.basename(outname))
            shutil.copytree(newparsed, outname)
            newparsed = outname
        if newparsed is False:
            return
        else:
            make_dotfile(newparsed)
            return newparsed

    rename_all_files(newparsed)
    print('Generating corpus metadata...')
    make_dotfile(newparsed)
    print('Done!\n')
    return newparsed
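
# Usage sketch for the newer make_corpus(). Everything here is illustrative:
# a plaintext corpus at data/example is parsed into CONLL-style files with
# speaker and metadata handling on, using every available CPU core.
def _example_make_corpus_conll_usage():  # pragma: no cover
    newparsed = make_corpus('data/example',
                            project_path='.',
                            speaker_segmentation=True,
                            metadata=True,
                            multiprocess=True)
    return newparsed  # path to the '-parsed' corpus, or None on failure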