def parse_corpus(proj_path = False, corpuspath = False, filelist = False,
                 corenlppath = False, operations = False, only_tokenise = False,
                 root = False, stdout = False, nltk_data_path = False,
                 memory_mb = 2000, copula_head = True, **kwargs):
    """
    Create a CoreNLP-parsed and/or NLTK tokenised corpus
    """
    import corpkit
    import subprocess
    from subprocess import PIPE, STDOUT, Popen
    import os
    import sys
    import chardet
    from time import localtime, strftime
    import time

    if not only_tokenise:
        if not check_jdk():
            print 'Need latest Java.'
            return

    curdir = os.getcwd()

    if nltk_data_path:
        if only_tokenise:
            import nltk
            if nltk_data_path not in nltk.data.path:
                nltk.data.path.append(nltk_data_path)
            from nltk import word_tokenize as tokenise

    # add nltk to path
    #td = {}
    #from corpkit.other import add_nltk_data_to_nltk_path
    #if 'note' in kwargs.keys():
    #    td['note'] = kwargs['note']
    #add_nltk_data_to_nltk_path(**td)

    if proj_path is False:
        proj_path = os.path.dirname(os.path.abspath(corpuspath.rstrip('/')))

    basecp = os.path.basename(corpuspath)

    if only_tokenise:
        new_corpus_path = os.path.join(proj_path, 'data', '%s-tokenised' % basecp)
    else:
        new_corpus_path = os.path.join(proj_path, 'data', '%s-parsed' % basecp)

    if os.path.join('data', 'data') in new_corpus_path:
        new_corpus_path = new_corpus_path.replace(os.path.join('data', 'data'), 'data')

    if not os.path.isdir(new_corpus_path):
        os.makedirs(new_corpus_path)
    else:
        fs = os.listdir(new_corpus_path)
        if not only_tokenise:
            if any([f.endswith('.xml') for f in fs]):
                print 'Folder containing xml already exists: "%s-parsed"' % basecp
                return False
        else:
            if any([f.endswith('.txt') for f in fs]):
                print 'Folder containing tokens already exists: "%s-tokenised"' % basecp
                return False

    #javaloc = os.path.join(proj_path, 'corenlp', 'stanford-corenlp-3.6.0.jar:stanford-corenlp-3.6.0-models.jar:xom.jar:joda-time.jar:jollyday.jar:ejml-0.23.jar')
    cwd = os.getcwd()
    if corenlppath is False:
        home = os.path.expanduser("~")
        corenlppath = os.path.join(home, 'corenlp')
        find_install = [d for d in os.listdir(corenlppath) \
                        if os.path.isdir(os.path.join(corenlppath, d)) \
                        and os.path.isfile(os.path.join(corenlppath, d, 'jollyday.jar'))]
        if len(find_install) > 0:
            corenlppath = os.path.join(corenlppath, find_install[0])
        else:
            print 'No parser found. Try using the keyword arg "corenlp = <path>".'
            return

    # if not gui, don't mess with stdout
    if stdout is False:
        stdout = sys.stdout

    if not only_tokenise:
        os.chdir(corenlppath)
        if root:
            root.update_idletasks()
            reload(sys)
        import os
        import time

        if memory_mb is False:
            memory_mb = 2024
        if operations is False:
            operations = 'tokenize,ssplit,pos,lemma,ner,parse,dcoref'
        if type(operations) == list:
            operations = ','.join(operations)

        num_files_to_parse = len([l for l in open(filelist, 'r').read().splitlines() if l])

        # get corenlp version number
        import re
        reg = re.compile(r'stanford-corenlp-([0-9].[0-9].[0-9])-javadoc.jar')
        fver = next(re.search(reg, s).group(1) for s in os.listdir('.') if re.search(reg, s))

        arglist = ['java', '-cp',
                   'stanford-corenlp-%s.jar:stanford-corenlp-%s-models.jar:xom.jar:joda-time.jar:jollyday.jar:ejml-0.23.jar' % (fver, fver),
                   '-Xmx%sm' % str(memory_mb),
                   'edu.stanford.nlp.pipeline.StanfordCoreNLP',
                   '-annotators', operations,
                   '-filelist', filelist,
                   '-noClobber',
                   '-outputDirectory', new_corpus_path]
        if copula_head:
            arglist.append('--parse.flags')
            arglist.append(' -makeCopulaHead')

        try:
            proc = subprocess.Popen(arglist, stdout=sys.stdout)
        # maybe a problem with stdout. sacrifice it if need be
        except AttributeError:
            proc = subprocess.Popen(arglist)

        #p = TextProgressBar(num_files_to_parse)
        while proc.poll() is None:
            sys.stdout = stdout
            thetime = strftime("%H:%M:%S", localtime())
            num_parsed = len([f for f in os.listdir(new_corpus_path) if f.endswith('.xml')])
            if num_parsed == 0:
                if root:
                    print '%s: Initialising parser ... ' % (thetime)
            if num_parsed > 0 and (num_parsed + 1) <= num_files_to_parse:
                if root:
                    print '%s: Parsing file %d/%d ... ' % (thetime, num_parsed + 1, num_files_to_parse)
                if 'note' in kwargs.keys():
                    kwargs['note'].progvar.set((num_parsed) * 100.0 / num_files_to_parse)
            #p.animate(num_parsed - 1, str(num_parsed) + '/' + str(num_files_to_parse))
            time.sleep(1)
            if root:
                root.update()
    else:
        from nltk import word_tokenize as tokenise
        # tokenise each file
        import pickle
        fs = open(filelist).read().splitlines()
        dirs = sorted(list(set([os.path.basename(os.path.dirname(f)) for f in fs])))
        if len(dirs) == 0:
            one_big_corpus = True
        else:
            one_big_corpus = False
        if any(os.path.isdir(os.path.join(new_corpus_path, d)) for d in dirs):
            thetime = strftime("%H:%M:%S", localtime())
            print '%s: Directory already exists. Delete it if need be.' % thetime
            return False
        for d in dirs:
            os.makedirs(os.path.join(new_corpus_path, d))
        nfiles = len(fs)
        thetime = strftime("%H:%M:%S", localtime())
        print '%s: Tokenising ... ' % (thetime)
        for index, f in enumerate(fs):
            data = open(f).read()
            enc = chardet.detect(data)
            enc_text = unicode(data, enc['encoding'], errors = 'ignore')
            tokens = tokenise(enc_text)
            thedir = os.path.basename(os.path.dirname(f))
            newname = os.path.basename(f).replace('.txt', '-tokenised.p')
            if one_big_corpus:
                pth = os.path.join(new_corpus_path, newname)
            else:
                pth = os.path.join(new_corpus_path, thedir, newname)
            with open(pth, "wb") as fo:
                pickle.dump(tokens, fo)
            if 'note' in kwargs.keys():
                kwargs['note'].progvar.set((index + 1) * 100.0 / nfiles)
            if root:
                root.update()

    #p.animate(num_files_to_parse)
    if 'note' in kwargs.keys():
        kwargs['note'].progvar.set(100)
    sys.stdout = stdout
    thetime = strftime("%H:%M:%S", localtime())
    print '%s: Parsing finished. Moving parsed files into place ...' % thetime
    os.chdir(curdir)
    return new_corpus_path
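# A minimal usage sketch for the function above, not part of corpkit itself.
# All paths here are hypothetical: parse_corpus() expects a plain-text corpus,
# a file listing one input path per line, and (optionally) an operations string;
# it returns the path of the newly created parsed/tokenised folder, or False.
if __name__ == '__main__':
    parsed = parse_corpus(proj_path='/home/me/myproject',
                          corpuspath='/home/me/myproject/data/mycorpus',
                          filelist='/home/me/myproject/data/mycorpus-filelist.txt',
                          operations='tokenize,ssplit,pos,lemma,parse',
                          memory_mb=2000)
    if parsed:
        print 'New corpus at %s' % parsed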
def parse_corpus(proj_path=False, corpuspath=False, filelist=False,
                 corenlppath=False, operations=False, only_tokenise=False,
                 root=False, stdout=False, nltk_data_path=False,
                 memory_mb=2000, copula_head=True, multiprocessing=False,
                 **kwargs):
    """
    Create a CoreNLP-parsed and/or NLTK tokenised corpus
    """
    import corpkit
    import subprocess
    from subprocess import PIPE, STDOUT, Popen
    from corpkit.process import get_corenlp_path
    import os
    import sys
    import re
    import chardet
    from time import localtime, strftime
    import time

    fileparse = kwargs.get('fileparse', False)
    url = 'http://nlp.stanford.edu/software/stanford-corenlp-full-2015-12-09.zip'

    if not only_tokenise:
        if not check_jdk():
            print('Need latest Java.')
            return

    curdir = os.getcwd()
    note = kwargs.get('note', False)

    if nltk_data_path:
        if only_tokenise:
            import nltk
            if nltk_data_path not in nltk.data.path:
                nltk.data.path.append(nltk_data_path)
            from nltk import word_tokenize as tokenise

    if proj_path is False:
        proj_path = os.path.dirname(os.path.abspath(corpuspath.rstrip('/')))

    basecp = os.path.basename(corpuspath)

    if fileparse:
        new_corpus_path = os.path.dirname(corpuspath)
    else:
        if only_tokenise:
            new_corpus_path = os.path.join(proj_path, 'data', '%s-tokenised' % basecp)
        else:
            new_corpus_path = os.path.join(proj_path, 'data', '%s-parsed' % basecp)

    # todo:
    # this is not stable
    if os.path.join('data', 'data') in new_corpus_path:
        new_corpus_path = new_corpus_path.replace(os.path.join('data', 'data'), 'data')

    # this caused errors when multiprocessing
    # it used to be isdir, but supposedly there was a file there
    # i don't see how it's possible ...
    if not os.path.exists(new_corpus_path):
        os.makedirs(new_corpus_path)
    else:
        if not os.path.isfile(new_corpus_path):
            fs = os.listdir(new_corpus_path)
            if not multiprocessing:
                if not only_tokenise:
                    if any([f.endswith('.xml') for f in fs]):
                        print('Folder containing xml already exists: "%s-parsed"' % basecp)
                        return False
                else:
                    if any([f.endswith('.p') for f in fs]):
                        print('Folder containing tokens already exists: "%s-tokenised"' % basecp)
                        return False

    corenlppath = get_corenlp_path(corenlppath)

    if not corenlppath:
        cnlp_dir = os.path.join(os.path.expanduser("~"), 'corenlp')
        corenlppath, fpath = download_large_file(cnlp_dir, url,
                                                 root=root,
                                                 note=note,
                                                 actually_download=True,
                                                 custom_corenlp_dir=corenlppath)
        if corenlppath is None and fpath is None:
            import shutil
            shutil.rmtree(new_corpus_path)
            shutil.rmtree(new_corpus_path.replace('-parsed', ''))
            os.remove(new_corpus_path.replace('-parsed', '-filelist.txt'))
            raise ValueError('CoreNLP needed to parse texts.')
        extract_cnlp(fpath)
        import glob
        globpath = os.path.join(corenlppath, 'stanford-corenlp*')
        corenlppath = [i for i in glob.glob(globpath) if os.path.isdir(i)]
        if corenlppath:
            corenlppath = corenlppath[-1]
        else:
            raise ValueError('CoreNLP installation failed for some reason. Try manual download.')

    # if not gui, don't mess with stdout
    if stdout is False:
        stdout = sys.stdout

    if not only_tokenise:
        os.chdir(corenlppath)
        if root:
            root.update_idletasks()
            reload(sys)

        if memory_mb is False:
            memory_mb = 2024
        if operations is False:
            operations = 'tokenize,ssplit,pos,lemma,parse,ner,dcoref'
        if isinstance(operations, list):
            operations = ','.join([i.lower() for i in operations])

        with open(filelist, 'r') as fo:
            dat = fo.read()
        num_files_to_parse = len([l for l in dat.splitlines() if l])

        # get corenlp version number
        reg = re.compile(r'stanford-corenlp-([0-9].[0-9].[0-9])-javadoc.jar')
        fver = next(re.search(reg, s).group(1) for s in os.listdir('.') if re.search(reg, s))
        if fver == '3.6.0':
            extra_jar = 'slf4j-api.jar:slf4j-simple.jar:'
        else:
            extra_jar = ''

        arglist = ['java', '-cp',
                   'stanford-corenlp-%s.jar:stanford-corenlp-%s-models.jar:xom.jar:joda-time.jar:%sjollyday.jar:ejml-0.23.jar' % (fver, fver, extra_jar),
                   '-Xmx%sm' % str(memory_mb),
                   'edu.stanford.nlp.pipeline.StanfordCoreNLP',
                   '-annotators', operations,
                   '-filelist', filelist,
                   '-noClobber',
                   '-outputExtension', '.xml',
                   '-outputDirectory', new_corpus_path]
        if copula_head:
            arglist.append('--parse.flags')
            arglist.append(' -makeCopulaHead')

        try:
            proc = subprocess.Popen(arglist, stdout=sys.stdout)
        # maybe a problem with stdout. sacrifice it if need be
        except:
            proc = subprocess.Popen(arglist)

        #p = TextProgressBar(num_files_to_parse)
        while proc.poll() is None:
            sys.stdout = stdout
            thetime = strftime("%H:%M:%S", localtime())
            if not fileparse:
                num_parsed = len([f for f in os.listdir(new_corpus_path) if f.endswith('.xml')])
                if num_parsed == 0:
                    if root:
                        print('%s: Initialising parser ... ' % (thetime))
                if num_parsed > 0 and (num_parsed + 1) <= num_files_to_parse:
                    if root:
                        print('%s: Parsing file %d/%d ... ' % \
                              (thetime, num_parsed + 1, num_files_to_parse))
                    if kwargs.get('note'):
                        kwargs['note'].progvar.set((num_parsed) * 100.0 / num_files_to_parse)
                #p.animate(num_parsed - 1, str(num_parsed) + '/' + str(num_files_to_parse))
            time.sleep(1)
            if root:
                root.update()
    else:
        from nltk import word_tokenize as tokenise
        # tokenise each file
        import cPickle as pickle
        fs = open(filelist).read().splitlines()
        dirs = sorted(list(set([os.path.basename(os.path.dirname(f)) for f in fs])))
        one_big_corpus = len(dirs) == 0
        if any(os.path.isdir(os.path.join(new_corpus_path, d)) for d in dirs):
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Directory already exists. Delete it if need be.' % thetime)
            return False
        for d in dirs:
            os.makedirs(os.path.join(new_corpus_path, d))
        nfiles = len(fs)
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Tokenising ... ' % (thetime))
        for index, f in enumerate(fs):
            with open(f, 'r') as fo:
                data = fo.read()
            enc = chardet.detect(data)
            enc_text = data.decode(enc['encoding'], errors='ignore')
            tokens = tokenise(enc_text)
            thedir = os.path.basename(os.path.dirname(f))
            newname = os.path.basename(f).replace('.txt', '-tokenised.p')
            if one_big_corpus:
                pth = os.path.join(new_corpus_path, newname)
            else:
                pth = os.path.join(new_corpus_path, thedir, newname)
            with open(pth, "wb") as fo:
                pickle.dump(tokens, fo)
            if kwargs.get('note'):
                kwargs['note'].progvar.set((index + 1) * 100.0 / nfiles)
            if root:
                root.update()

    #p.animate(num_files_to_parse)
    if kwargs.get('note'):
        kwargs['note'].progvar.set(100)
    sys.stdout = stdout
    thetime = strftime("%H:%M:%S", localtime())
    print('%s: Parsing finished. Moving parsed files into place ...' % thetime)
    os.chdir(curdir)
    return new_corpus_path
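# For reference, the arglist assembled above corresponds roughly to the shell
# command below. The version number, memory limit and paths are illustrative
# and depend on the CoreNLP release that was actually found or downloaded:
#
#   java -cp "stanford-corenlp-3.6.0.jar:stanford-corenlp-3.6.0-models.jar:xom.jar:joda-time.jar:slf4j-api.jar:slf4j-simple.jar:jollyday.jar:ejml-0.23.jar" \
#        -Xmx2000m edu.stanford.nlp.pipeline.StanfordCoreNLP \
#        -annotators tokenize,ssplit,pos,lemma,parse,ner,dcoref \
#        -filelist mycorpus-filelist.txt -noClobber \
#        -outputExtension .xml -outputDirectory data/mycorpus-parsed \
#        --parse.flags " -makeCopulaHead"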
import os, json
from nltk import word_tokenize as tokenise

## os.getcwd() returns the current working directory
## os.path.dirname() takes a path and returns the directory above it
corpus_filename = os.path.dirname(os.getcwd()) + '/corpus.json'

## load the corpus
corpus = json.load(open(corpus_filename))

index = {}

## number of training examples in corpus
m = len(corpus)

for i in range(m):
    tokens = tokenise(corpus[i]['title']) + tokenise(corpus[i]['text'])
    for token in tokens:
        token_lower = token.lower()
        if token_lower not in index:
            index[token_lower] = [i]
        else:
            index[token_lower].append(i)

with open('index.json', 'w') as op:
    op.write(json.dumps(index))
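## A small sketch of how the inverted index written above might be queried.
## The filename matches the script above; the query terms are illustrative.
## Each key is a lowercased token, each value a list of corpus positions.
import json

with open('index.json') as f:
    index = json.load(f)

## documents containing every query term: intersect their posting lists
query = ['corpus', 'parser']
postings = [set(index.get(term.lower(), [])) for term in query]
matches = set.intersection(*postings) if postings else set()
print(sorted(matches))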
def parse_corpus(proj_path, corpuspath, filelist, corenlppath = False,
                 operations = False, only_tokenise = False, root = False,
                 stdout = False, **kwargs):
    import corpkit
    import subprocess
    from subprocess import PIPE, STDOUT, Popen
    import os
    import sys
    import chardet
    from time import localtime, strftime
    import time

    if not only_tokenise:
        if not check_jdk():
            print 'Need latest Java.'
            return

    # add nltk to path
    td = {}
    from corpkit.other import add_nltk_data_to_nltk_path
    if 'note' in kwargs.keys():
        td['note'] = kwargs['note']
    add_nltk_data_to_nltk_path(**td)

    basecp = os.path.basename(corpuspath)

    if only_tokenise:
        new_corpus_path = os.path.join(proj_path, 'data', '%s-tokenised' % basecp)
    else:
        new_corpus_path = os.path.join(proj_path, 'data', '%s-parsed' % basecp)

    if not os.path.isdir(new_corpus_path):
        os.makedirs(new_corpus_path)
    else:
        fs = os.listdir(new_corpus_path)
        if not only_tokenise:
            if any([f.endswith('.xml') for f in fs]):
                print 'Folder containing xml already exists: "%s-parsed"' % basecp
                return False
        else:
            if any([f.endswith('.txt') for f in fs]):
                print 'Folder containing tokens already exists: "%s-tokenised"' % basecp
                return False

    #javaloc = os.path.join(proj_path, 'corenlp', 'stanford-corenlp-3.5.2.jar:stanford-corenlp-3.5.2-models.jar:xom.jar:joda-time.jar:jollyday.jar:ejml-0.23.jar')
    cwd = os.getcwd()
    if corenlppath is False:
        home = os.path.expanduser("~")
        corenlppath = os.path.join(home, 'corenlp')
        find_install = [d for d in os.listdir(corenlppath) \
                        if os.path.isdir(os.path.join(corenlppath, d)) \
                        and os.path.isfile(os.path.join(corenlppath, d, 'jollyday.jar'))]
        if len(find_install) > 0:
            corenlppath = os.path.join(corenlppath, find_install[0])
        else:
            print 'No parser found.'
            return

    if not only_tokenise:
        os.chdir(corenlppath)
        root.update_idletasks()
        reload(sys)
        import os
        import time

        if operations is False:
            operations = 'tokenize,ssplit,pos,lemma,ner,parse,dcoref'

        num_files_to_parse = len([l for l in open(filelist, 'r').read().splitlines() if l])

        proc = subprocess.Popen(['java', '-cp',
                                 'stanford-corenlp-3.5.2.jar:stanford-corenlp-3.5.2-models.jar:xom.jar:joda-time.jar:jollyday.jar:ejml-0.23.jar',
                                 '-Xmx2g',
                                 'edu.stanford.nlp.pipeline.StanfordCoreNLP',
                                 '-annotators', operations,
                                 '-filelist', filelist,
                                 '-noClobber',
                                 '-outputDirectory', new_corpus_path,
                                 '--parse.flags', ' -makeCopulaHead'], stdout=sys.stdout)

        #p = TextProgressBar(num_files_to_parse)
        while proc.poll() is None:
            sys.stdout = stdout
            thetime = strftime("%H:%M:%S", localtime())
            num_parsed = len([f for f in os.listdir(new_corpus_path) if f.endswith('.xml')])
            if num_parsed == 0:
                print '%s: Initialising parser ... ' % (thetime)
            if num_parsed > 0 and num_parsed <= num_files_to_parse:
                print '%s: Parsing file %d/%d ... ' % (thetime, num_parsed + 1, num_files_to_parse)
                if 'note' in kwargs.keys():
                    kwargs['note'].progvar.set((num_parsed) * 100.0 / num_files_to_parse)
            #p.animate(num_parsed - 1, str(num_parsed) + '/' + str(num_files_to_parse))
            if root:
                root.update()
            time.sleep(1)
    else:
        # tokenise each file
        from nltk import word_tokenize as tokenise
        import pickle
        fs = open(filelist).read().splitlines()
        dirs = sorted(list(set([os.path.basename(os.path.dirname(f)) for f in fs])))
        if len(dirs) == 0:
            one_big_corpus = True
        else:
            one_big_corpus = False
        if any(os.path.isdir(os.path.join(new_corpus_path, d)) for d in dirs):
            thetime = strftime("%H:%M:%S", localtime())
            print '%s: Directory already exists. Delete it if need be.' % thetime
            return
        for d in dirs:
            os.makedirs(os.path.join(new_corpus_path, d))
        nfiles = len(fs)
        thetime = strftime("%H:%M:%S", localtime())
        print '%s: Tokenising ... ' % (thetime)
        for index, f in enumerate(fs):
            data = open(f).read()
            enc = chardet.detect(data)
            enc_text = unicode(data, enc['encoding'], errors = 'ignore')
            tokens = tokenise(enc_text)
            thedir = os.path.basename(os.path.dirname(f))
            newname = os.path.basename(f).replace('.txt', '-tokenised.p')
            if one_big_corpus:
                pth = os.path.join(new_corpus_path, newname)
            else:
                pth = os.path.join(new_corpus_path, thedir, newname)
            with open(pth, "wb") as fo:
                pickle.dump(tokens, fo)
            if 'note' in kwargs.keys():
                kwargs['note'].progvar.set((index + 1) * 100.0 / nfiles)
            if root:
                root.update()

    #p.animate(num_files_to_parse)
    if 'note' in kwargs.keys():
        kwargs['note'].progvar.set(100)
    sys.stdout = stdout
    print 'Parsing finished. Moving parsed files into place ...'
    os.chdir(proj_path)
    return new_corpus_path
def parse_corpus(proj_path=False, corpuspath=False, filelist=False,
                 corenlppath=False, operations=False, only_tokenise=False,
                 root=False, stdout=False, nltk_data_path=False,
                 memory_mb=2000, copula_head=True, **kwargs):
    """
    Create a CoreNLP-parsed and/or NLTK tokenised corpus
    """
    import corpkit
    import subprocess
    from subprocess import PIPE, STDOUT, Popen
    import os
    import sys
    import chardet
    from time import localtime, strftime
    import time

    if not only_tokenise:
        if not check_jdk():
            print('Need latest Java.')
            return

    curdir = os.getcwd()

    if nltk_data_path:
        if only_tokenise:
            import nltk
            if nltk_data_path not in nltk.data.path:
                nltk.data.path.append(nltk_data_path)
            from nltk import word_tokenize as tokenise

    # add nltk to path
    #td = {}
    #from other import add_nltk_data_to_nltk_path
    #if 'note' in kwargs.keys():
    #    td['note'] = kwargs['note']
    #add_nltk_data_to_nltk_path(**td)

    if proj_path is False:
        proj_path = os.path.dirname(os.path.abspath(corpuspath.rstrip('/')))

    basecp = os.path.basename(corpuspath)

    if only_tokenise:
        new_corpus_path = os.path.join(proj_path, 'data', '%s-tokenised' % basecp)
    else:
        new_corpus_path = os.path.join(proj_path, 'data', '%s-parsed' % basecp)

    if os.path.join('data', 'data') in new_corpus_path:
        new_corpus_path = new_corpus_path.replace(os.path.join('data', 'data'), 'data')

    if not os.path.isdir(new_corpus_path):
        os.makedirs(new_corpus_path)
    else:
        fs = os.listdir(new_corpus_path)
        if not only_tokenise:
            if any([f.endswith('.xml') for f in fs]):
                print('Folder containing xml already exists: "%s-parsed"' % basecp)
                return False
        else:
            if any([f.endswith('.txt') for f in fs]):
                print('Folder containing tokens already exists: "%s-tokenised"' % basecp)
                return False

    #javaloc = os.path.join(proj_path, 'corenlp', 'stanford-corenlp-3.6.0.jar:stanford-corenlp-3.6.0-models.jar:xom.jar:joda-time.jar:jollyday.jar:ejml-0.23.jar')
    cwd = os.getcwd()
    if corenlppath is False:
        home = os.path.expanduser("~")
        corenlppath = os.path.join(home, 'corenlp')
        find_install = [d for d in os.listdir(corenlppath) \
                        if os.path.isdir(os.path.join(corenlppath, d)) \
                        and os.path.isfile(os.path.join(corenlppath, d, 'jollyday.jar'))]
        if len(find_install) > 0:
            corenlppath = os.path.join(corenlppath, find_install[0])
        else:
            print('No parser found. Try using the keyword arg "corenlp = <path>", '
                  'or moving your corenlp folder to ~/corenlp/stanford-corenlp-full ...')
            return

    # if not gui, don't mess with stdout
    if stdout is False:
        stdout = sys.stdout

    if not only_tokenise:
        os.chdir(corenlppath)
        if root:
            root.update_idletasks()
            reload(sys)
        import os
        import time

        if memory_mb is False:
            memory_mb = 2024
        if operations is False:
            operations = 'tokenize,ssplit,pos,lemma,ner,parse,dcoref'
        if type(operations) == list:
            operations = ','.join(operations)

        num_files_to_parse = len([l for l in open(filelist, 'r').read().splitlines() if l])

        # get corenlp version number
        import re
        reg = re.compile(r'stanford-corenlp-([0-9].[0-9].[0-9])-javadoc.jar')
        fver = next(re.search(reg, s).group(1) for s in os.listdir('.') if re.search(reg, s))
        if fver == '3.6.0':
            extra_jar = 'slf4j-api.jar:slf4j-simple.jar:'
        else:
            extra_jar = ''

        arglist = ['java', '-cp',
                   'stanford-corenlp-%s.jar:stanford-corenlp-%s-models.jar:xom.jar:joda-time.jar:%sjollyday.jar:ejml-0.23.jar' % (fver, fver, extra_jar),
                   '-Xmx%sm' % str(memory_mb),
                   'edu.stanford.nlp.pipeline.StanfordCoreNLP',
                   '-annotators', operations,
                   '-filelist', filelist,
                   '-noClobber',
                   '-outputExtension', '.xml',
                   '-outputDirectory', new_corpus_path]
        if copula_head:
            arglist.append('--parse.flags')
            arglist.append(' -makeCopulaHead')

        try:
            proc = subprocess.Popen(arglist, stdout=sys.stdout)
        # maybe a problem with stdout. sacrifice it if need be
        except:
            proc = subprocess.Popen(arglist)

        #p = TextProgressBar(num_files_to_parse)
        while proc.poll() is None:
            sys.stdout = stdout
            thetime = strftime("%H:%M:%S", localtime())
            num_parsed = len([f for f in os.listdir(new_corpus_path) if f.endswith('.xml')])
            if num_parsed == 0:
                if root:
                    print('%s: Initialising parser ... ' % (thetime))
            if num_parsed > 0 and (num_parsed + 1) <= num_files_to_parse:
                if root:
                    print('%s: Parsing file %d/%d ... ' % (thetime, num_parsed + 1, num_files_to_parse))
                if 'note' in list(kwargs.keys()):
                    kwargs['note'].progvar.set((num_parsed) * 100.0 / num_files_to_parse)
            #p.animate(num_parsed - 1, str(num_parsed) + '/' + str(num_files_to_parse))
            time.sleep(1)
            if root:
                root.update()
    else:
        from nltk import word_tokenize as tokenise
        # tokenise each file
        import pickle
        fs = open(filelist).read().splitlines()
        dirs = sorted(list(set([os.path.basename(os.path.dirname(f)) for f in fs])))
        if len(dirs) == 0:
            one_big_corpus = True
        else:
            one_big_corpus = False
        if any(os.path.isdir(os.path.join(new_corpus_path, d)) for d in dirs):
            thetime = strftime("%H:%M:%S", localtime())
            print('%s: Directory already exists. Delete it if need be.' % thetime)
            return False
        for d in dirs:
            os.makedirs(os.path.join(new_corpus_path, d))
        nfiles = len(fs)
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Tokenising ... ' % (thetime))
        for index, f in enumerate(fs):
            # read as bytes: chardet.detect() and str(data, encoding) both need bytes
            data = open(f, 'rb').read()
            enc = chardet.detect(data)
            enc_text = str(data, enc['encoding'], errors='ignore')
            tokens = tokenise(enc_text)
            thedir = os.path.basename(os.path.dirname(f))
            newname = os.path.basename(f).replace('.txt', '-tokenised.p')
            if one_big_corpus:
                pth = os.path.join(new_corpus_path, newname)
            else:
                pth = os.path.join(new_corpus_path, thedir, newname)
            with open(pth, "wb") as fo:
                pickle.dump(tokens, fo)
            if 'note' in list(kwargs.keys()):
                kwargs['note'].progvar.set((index + 1) * 100.0 / nfiles)
            if root:
                root.update()

    #p.animate(num_files_to_parse)
    if 'note' in list(kwargs.keys()):
        kwargs['note'].progvar.set(100)
    sys.stdout = stdout
    thetime = strftime("%H:%M:%S", localtime())
    print('%s: Parsing finished. Moving parsed files into place ...' % thetime)
    os.chdir(curdir)
    return new_corpus_path