def get_names(filepath, speakid):
    """
    Get a list of speaker names from a file
    """
    import re
    from corpkit.process import saferead
    txt, enc = saferead(filepath)
    res = re.findall(speakid, txt)
    if res:
        return sorted(list(set([i.strip() for i in res])))
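# A minimal usage sketch for get_names(). The path and speaker-ID regex below
# are hypothetical, not values shipped with corpkit; the regex just needs a
# single capture group around the speaker name so findall() returns strings.
#
#     import re
#     speakid = re.compile(r'^([A-Z][A-Z0-9 -]{0,39}):\s+', re.MULTILINE)
#     names = get_names('data/my-script/episode-01.txt', speakid)
#     # e.g. ['CARMELA', 'TONY'] -- sorted and deduplicated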
def get_all_metadata_fields(corpus, include_speakers=False):
    """
    Get a list of metadata fields in a corpus

    This could take a while for very little information
    """
    from corpkit.corpus import Corpus
    from corpkit.constants import OPENER, PYTHON_VERSION, MAX_METADATA_FIELDS

    # allow a path or a Corpus object
    if not isinstance(corpus, Corpus):
        corpus = Corpus(corpus, print_info=False)
    if not corpus.datatype == 'conll':
        return []

    path = getattr(corpus, 'path', corpus)
    fs = []
    import os
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            fs.append(os.path.join(root, filename))

    badfields = ['parse', 'sent_id']
    if not include_speakers:
        badfields.append('speaker')

    fields = set()
    for f in fs:
        if PYTHON_VERSION == 2:
            from corpkit.process import saferead
            lines = saferead(f)[0].splitlines()
        else:
            with open(f, 'rb') as fo:
                lines = fo.read().decode('utf-8', errors='ignore')
            lines = lines.strip('\n')
            lines = lines.splitlines()
        lines = [l[2:].split('=', 1)[0] for l in lines if l.startswith('# ')
                 and not l.startswith('# sent_id')]
        for l in lines:
            if l not in fields and l not in badfields:
                fields.add(l)
        if len(fields) > MAX_METADATA_FIELDS:
            break
    return list(fields)
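# Hedged example of how get_all_metadata_fields() might be called; the corpus
# path is hypothetical. Either a path string or a Corpus object is accepted,
# but only CONLL-formatted corpora return anything.
#
#     fields = get_all_metadata_fields('data/my-corpus-parsed', include_speakers=True)
#     # e.g. ['speaker', 'year', 'location'] -- 'parse' and 'sent_id' are always excluded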
def make_no_id_corpus(pth, newpth, metadata_mode=False, speaker_segmentation=False):
    """
    Make a version of pth without speaker IDs
    """
    import os
    import re
    import shutil
    from corpkit.process import saferead
    from corpkit.constants import PYTHON_VERSION, MAX_SPEAKERNAME_SIZE
    # define the regex broadly enough to accept timestamps, locations if need be
    idregex = re.compile(r'(^.{,%d}?):\s+(.*$)' % MAX_SPEAKERNAME_SIZE)
    try:
        shutil.copytree(pth, newpth)
    except OSError:
        shutil.rmtree(newpth)
        shutil.copytree(pth, newpth)

    files = get_filepaths(newpth)
    names = []
    metadata = []
    for f in files:
        good_data = []
        fo, enc = saferead(f)
        data = fo.splitlines()
        # for each line in the file, remove the speaker ID and metadata
        for datum in data:
            if speaker_segmentation:
                matched = re.search(idregex, datum)
                if matched:
                    names.append(matched.group(1))
                    datum = matched.group(2)
            if metadata_mode:
                splitmet = datum.rsplit('<metadata ', 1)
                # for the impossibly rare case of a line that is just '<metadata '
                if not splitmet:
                    continue
                datum = splitmet[0]
            if datum:
                good_data.append(datum)

        with open(f, "w") as fo:
            if PYTHON_VERSION == 2:
                fo.write('\n'.join(good_data).encode('utf-8'))
            else:
                fo.write('\n'.join(good_data))

    if speaker_segmentation:
        from time import localtime, strftime
        thetime = strftime("%H:%M:%S", localtime())
        if len(names) == 0:
            print('%s: No speaker names found. Turn off speaker segmentation.' % thetime)
            shutil.rmtree(newpth)
        else:
            try:
                if len(sorted(set(names))) < 19:
                    print('%s: Speaker names found: %s' % (thetime, ', '.join(sorted(set(names)))))
                else:
                    print('%s: Speaker names found: %s ...' % (thetime, ', '.join(sorted(set(names[:20])))))
            except:
                pass
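# Illustrative call for make_no_id_corpus(); both paths are hypothetical.
# The function copies the corpus, strips 'NAME:' prefixes (and, optionally,
# trailing '<metadata ...>' strings) from every line, and reports the speaker
# names it found.
#
#     make_no_id_corpus('data/my-script', 'data/my-script-stripped',
#                       metadata_mode=True, speaker_segmentation=True)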
def delete_lines(corpus, annotation, dry_run=True, colour={}):
    """
    Show or delete the lines matching an annotation
    """
    from corpkit.constants import OPENER, PYTHON_VERSION
    import re
    import os

    tagmode = True
    no_can_do = ['sent_id', 'parse']

    if isinstance(annotation, dict):
        tagmode = False
        for k, v in annotation.items():
            if k in no_can_do:
                print("You aren't allowed to delete '%s', sorry." % k)
                return
            if not v:
                v = r'.*?'
            regex = re.compile(r'(# %s=%s)\n' % (k, v), re.MULTILINE)
    else:
        if annotation in no_can_do:
            print("You aren't allowed to delete '%s', sorry." % annotation)
            return
        regex = re.compile(r'((# tags=.*?)%s;?(.*?))\n' % annotation, re.MULTILINE)

    fs = []
    for (root, dirs, fls) in os.walk(corpus):
        for f in fls:
            fs.append(os.path.join(root, f))

    for f in fs:
        if PYTHON_VERSION == 2:
            from corpkit.process import saferead
            data = saferead(f)[0]
        else:
            with open(f, 'rb') as fo:
                data = fo.read().decode('utf-8', errors='ignore')

        if dry_run:
            if tagmode:
                repl_str = r'\1 <=======\n%s\2\3 <=======\n' % colour.get('green', '')
            else:
                repl_str = r'\1 <=======\n'
            try:
                repl_str = colour['red'] + repl_str + colour['reset']
            except:
                pass
            data, n = re.subn(regex, repl_str, data)
            nspl = 100 if tagmode else 50
            delim = '<======='
            data = re.split(delim, data, maxsplit=nspl)
            toshow = delim.join(data[:nspl + 1])
            toshow = toshow.rsplit('\n\n', 1)[0]
            print(toshow)
            if n > 50:
                n = n - 50
                print('\n... and %d more changes ... ' % n)
        else:
            if tagmode:
                repl_str = r'\2\3\n'
            else:
                repl_str = ''
            data = re.sub(regex, repl_str, data)
            with OPENER(f, 'w') as fo:
                if PYTHON_VERSION == 2:
                    data = data.encode('utf-8', errors='ignore')
                fo.write(data)
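# Two hedged examples of delete_lines(); the corpus path and field values are
# hypothetical. A dict removes whole '# key=value' metadata lines, while a
# string removes a single tag from '# tags=...' lines. With dry_run=True the
# matches are only previewed, marked with '<======='.
#
#     delete_lines('data/my-corpus-parsed', {'year': '1999'}, dry_run=True)
#     delete_lines('data/my-corpus-parsed', 'checkthis', dry_run=False)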
def make_corpus(unparsed_corpus_path, project_path=None, parse=True, tokenise=False,
                corenlppath=False, nltk_data_path=False, operations=False,
                speaker_segmentation=False, root=False, multiprocess=False,
                split_texts=400, **kwargs):
    """
    Create a parsed version of unparsed_corpus using CoreNLP or NLTK's tokeniser

    :param unparsed_corpus_path: path to corpus containing text files,
                                 or subdirs containing text files
    :type unparsed_corpus_path: str
    :param project_path: path to corpkit project
    :type project_path: str
    :param parse: Do parsing?
    :type parse: bool
    :param tokenise: Do tokenising?
    :type tokenise: bool
    :param corenlppath: folder containing corenlp jar files
    :type corenlppath: str
    :param nltk_data_path: path to tokeniser if tokenising
    :type nltk_data_path: str
    :param operations: which kinds of annotations to do
    :type operations: str
    :param speaker_segmentation: add speaker name to parser output if your corpus is script-like
    :type speaker_segmentation: bool

    :returns: path to the created corpus
    """
    import sys
    import os
    from os.path import join, isfile, isdir, basename, splitext, exists
    import shutil
    import codecs
    from corpkit.build import folderise, can_folderise
    from corpkit.process import saferead
    from corpkit.constants import INPUTFUNC

    pyver = sys.version_info.major

    from corpkit.build import (get_corpus_filepaths, check_jdk, add_ids_to_xml,
                               rename_all_files, make_no_id_corpus, parse_corpus,
                               move_parsed_files)

    if parse is True and tokenise is True:
        raise ValueError('Select either parse or tokenise, not both.')

    if project_path is None:
        project_path = os.getcwd()

    fileparse = isfile(unparsed_corpus_path)
    if fileparse:
        copier = shutil.copyfile
    else:
        copier = shutil.copytree

    # raise error if no tokeniser
    if tokenise:
        newpath = unparsed_corpus_path + '-tokenised'
        if isdir(newpath):
            shutil.rmtree(newpath)
        import nltk
        if nltk_data_path:
            if nltk_data_path not in nltk.data.path:
                nltk.data.path.append(nltk_data_path)
        try:
            from nltk import word_tokenize as tokenise
        except:
            print('\nTokeniser not found. Pass in its path as keyword arg "nltk_data_path = <path>".\n')
            raise

    if sys.platform == "darwin":
        if not check_jdk():
            print("Get the latest Java from http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html")

    cop_head = kwargs.get('copula_head', True)
    note = kwargs.get('note', False)
    stdout = kwargs.get('stdout', False)

    # make absolute path to corpus
    unparsed_corpus_path = os.path.abspath(unparsed_corpus_path)

    # move it into the project
    if fileparse:
        datapath = project_path
    else:
        datapath = join(project_path, 'data')

    if isdir(datapath):
        newp = join(datapath, basename(unparsed_corpus_path))
    else:
        os.makedirs(datapath)
        if fileparse:
            noext = splitext(unparsed_corpus_path)[0]
            newp = join(datapath, basename(noext))
        else:
            newp = join(datapath, basename(unparsed_corpus_path))

    if exists(newp):
        pass
    else:
        copier(unparsed_corpus_path, newp)
    unparsed_corpus_path = newp

    # ask to folderise?
    check_do_folderise = False
    do_folderise = kwargs.get('folderise', None)
    if can_folderise(unparsed_corpus_path):
        if do_folderise is None:
            check_do_folderise = INPUTFUNC("Your corpus has multiple files, but no subcorpora. "
                                           "Would you like each file to be treated as a subcorpus? (y/n)")
            check_do_folderise = check_do_folderise.lower().startswith('y')
        if check_do_folderise or do_folderise:
            folderise(unparsed_corpus_path)

    # this is bad!
    if join('data', 'data') in unparsed_corpus_path:
        unparsed_corpus_path = unparsed_corpus_path.replace(join('data', 'data'), 'data')

    if parse:

        def chunks(l, n):
            for i in range(0, len(l), n):
                yield l[i:i+n]

        # this loop shortens files containing more than 500 lines, for corenlp memory's sake
        # maybe the user needs a warning or something in case s/he is doing coref
        for rootx, dirs, fs in os.walk(unparsed_corpus_path):
            for f in fs:
                if f.startswith('.'):
                    continue
                fp = join(rootx, f)
                data, enc = saferead(fp)
                data = data.splitlines()
                if len(data) > split_texts:
                    chk = chunks(data, split_texts)
                    for index, c in enumerate(chk):
                        newname = fp.replace('.txt', '-%s.txt' % str(index + 1).zfill(3))
                        with codecs.open(newname, 'w', encoding='utf-8') as fo:
                            txt = '\n'.join(c) + '\n'
                            fo.write(txt)
                    os.remove(fp)
                else:
                    pass
                    #newname = fp.replace('.txt', '-000.txt')
                    #os.rename(fp, newname)

        if speaker_segmentation:
            newpath = unparsed_corpus_path + '-stripped-parsed'
            if isdir(newpath) and not root:
                ans = INPUTFUNC('\n Path exists: %s. Do you want to overwrite? (y/n)\n' % newpath)
                if ans.lower().strip()[0] == 'y':
                    shutil.rmtree(newpath)
                else:
                    return
            elif isdir(newpath) and root:
                raise OSError('Path exists: %s' % newpath)
            print('Processing speaker IDs ...')
            make_no_id_corpus(unparsed_corpus_path, unparsed_corpus_path + '-stripped')
            to_parse = unparsed_corpus_path + '-stripped'
        else:
            to_parse = unparsed_corpus_path

        if not fileparse:
            print('Making list of files ... ')

        if not fileparse:
            pp = os.path.dirname(unparsed_corpus_path)
            filelist = get_corpus_filepaths(projpath=pp, corpuspath=to_parse)
        else:
            filelist = unparsed_corpus_path.replace('.txt', '-filelist.txt')
            with open(filelist, 'w') as fo:
                fo.write(unparsed_corpus_path + '\n')

        if multiprocess is not False:

            if multiprocess is True:
                import multiprocessing
                multiprocess = multiprocessing.cpu_count()

            from joblib import Parallel, delayed

            # split the old filelist into n parts
            data, enc = saferead(filelist)
            fs = [i for i in data.splitlines() if i]
            # make a generator with a list of lists
            divl = int(len(fs) / multiprocess)
            fgen = chunks(fs, divl)
            filelists = []
            # for each list, make a new file
            for index, flist in enumerate(fgen):
                as_str = '\n'.join(flist) + '\n'
                new_fpath = filelist.replace('.txt', '-%s.txt' % str(index).zfill(4))
                filelists.append(new_fpath)
                with codecs.open(new_fpath, 'w', encoding='utf-8') as fo:
                    fo.write(as_str)
            try:
                os.remove(filelist)
            except:
                pass

            ds = []
            for listpath in filelists:
                d = {'proj_path': project_path,
                     'corpuspath': to_parse,
                     'filelist': listpath,
                     'corenlppath': corenlppath,
                     'nltk_data_path': nltk_data_path,
                     'operations': operations,
                     'copula_head': cop_head,
                     'multiprocessing': True,
                     'root': root,
                     'note': note,
                     'stdout': stdout}
                ds.append(d)

            res = Parallel(n_jobs=multiprocess)(delayed(parse_corpus)(**x) for x in ds)
            if len(res) > 0:
                newparsed = res[0]
            else:
                return
            if all(r is False for r in res):
                return

            for i in filelists:
                try:
                    os.remove(i)
                except:
                    pass

        else:
            newparsed = parse_corpus(proj_path=project_path,
                                     corpuspath=to_parse,
                                     filelist=filelist,
                                     corenlppath=corenlppath,
                                     nltk_data_path=nltk_data_path,
                                     operations=operations,
                                     copula_head=cop_head,
                                     root=root,
                                     note=note,
                                     stdout=stdout,
                                     fileparse=fileparse)

        if not newparsed:
            return
        if all(not x for x in newparsed):
            return

        if fileparse:
            # cleanup mistakes :)
            if isfile(splitext(unparsed_corpus_path)[0]):
                os.remove(splitext(unparsed_corpus_path)[0])
            if isfile(unparsed_corpus_path.replace('.txt', '-filelist.txt')):
                os.remove(unparsed_corpus_path.replace('.txt', '-filelist.txt'))
            return unparsed_corpus_path + '.xml'

        move_parsed_files(project_path, to_parse, newparsed)
        outpath = newparsed
        if speaker_segmentation:
            add_ids_to_xml(newparsed)
        try:
            os.remove(filelist)
        except:
            pass

    else:
        filelist = get_corpus_filepaths(projpath=os.path.dirname(unparsed_corpus_path),
                                        corpuspath=unparsed_corpus_path)

    if tokenise:
        newtok = parse_corpus(proj_path=project_path,
                              corpuspath=unparsed_corpus_path,
                              filelist=filelist,
                              nltk_data_path=nltk_data_path,
                              operations=operations,
                              only_tokenise=True)
        if newtok is False:
            return
        outpath = newtok

    rename_all_files(outpath)
    print('Done!\n')
    return outpath
def plaintext_to_conll(inpath, postag=False, lemmatise=False, lang='en',
                       metadata=False, outpath=False, nltk_data_path=False,
                       speaker_segmentation=False):
    """
    Take a plaintext corpus and sentence- and word-tokenise it.

    :param inpath: The corpus to read in
    :param postag: do POS tagging?
    :param lemmatise: do lemmatisation?
    :param lang: choose language for pos/lemmatiser (not implemented yet)
    :param metadata: add metadata to conll (not implemented yet)
    :param outpath: custom name for the resulting corpus
    :param speaker_segmentation: does the corpus have speaker names?
    """
    import nltk
    import shutil
    import pandas as pd
    from corpkit.process import saferead
    from corpkit.build import get_filepaths

    fps = get_filepaths(inpath, 'txt')

    # IN THE SECTIONS BELOW, WE COULD ADD MULTILINGUAL
    # ANNOTATORS, PROVIDED THEY BEHAVE AS THE NLTK ONES DO

    # SENT TOKENISERS
    from nltk.tokenize.punkt import PunktSentenceTokenizer
    stoker = PunktSentenceTokenizer()
    s_tokers = {'en': stoker}
    sent_tokenizer = s_tokers.get(lang, stoker)

    # WORD TOKENISERS
    tokenisers = {'en': nltk.word_tokenize}
    tokeniser = tokenisers.get(lang, nltk.word_tokenize)

    # LEMMATISERS
    if lemmatise:
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr = WordNetLemmatizer()
        lemmatisers = {'en': lmtzr}
        lemmatiser = lemmatisers.get(lang, lmtzr)

    # POS TAGGERS
    if postag:
        # nltk.download('averaged_perceptron_tagger')
        postaggers = {'en': nltk.pos_tag}
        tagger = postaggers.get(lang, nltk.pos_tag)

    # iterate over files, make a DataFrame of each, convert this
    # to CONLL and send it to a new filename
    for f in fps:
        for_df = []
        data, enc = saferead(f)
        plain, enc = saferead(f.replace('-stripped', ''))
        #orig_data = data
        #data, offsets = process_meta(data, speaker_segmentation, metadata)
        #nest = []
        sents = sent_tokenizer.tokenize(data)
        soffs = sent_tokenizer.span_tokenize(data)
        toks = [tokeniser(sent) for sent in sents]
        ser = nested_list_to_pandas(toks)
        for_df.append(ser)
        if postag or lemmatise:
            postags = pos_tag_series(ser, tagger)
        if lemmatise:
            lemma = lemmatise_series(ser, postags, lemmatiser)
            for_df.append(lemma)
            for_df.append(postags)
        else:
            if postag:
                for_df.append(postags)
        df = pd.concat(for_df, axis=1)
        fo = new_fname(f, inpath)
        write_df_to_conll(df, fo, metadata=metadata, plain=plain,
                          stripped=data, speaker_segmentation=speaker_segmentation,
                          offsets=soffs)
        nsent = len(set(df.index.labels[0]))
        print('%s created (%d sentences)' % (fo, nsent))

    if '-stripped' in inpath:
        return inpath.replace('-stripped', '-tokenised')
    else:
        return inpath + '-tokenised'
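# A hedged usage sketch for plaintext_to_conll(); the input path is
# hypothetical. POS tagging and lemmatisation rely on the relevant NLTK
# resources (e.g. 'averaged_perceptron_tagger', 'wordnet') being downloaded.
#
#     outpath = plaintext_to_conll('data/my-corpus-stripped',
#                                  postag=True, lemmatise=True,
#                                  speaker_segmentation=True)
#     # returns 'data/my-corpus-tokenised'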
def make_corpus(unparsed_corpus_path, project_path=None, parse=True, tokenise=False,
                postag=False, lemmatise=False, corenlppath=False, nltk_data_path=False,
                operations=False, speaker_segmentation=False, root=False,
                multiprocess=False, split_texts=400, outname=False, metadata=False,
                restart=False, coref=True, lang='en', **kwargs):
    """
    Create a parsed version of unparsed_corpus using CoreNLP or NLTK's tokeniser

    :param unparsed_corpus_path: path to corpus containing text files,
                                 or subdirs containing text files
    :type unparsed_corpus_path: str
    :param project_path: path to corpkit project
    :type project_path: str
    :param parse: Do parsing?
    :type parse: bool
    :param tokenise: Do tokenising?
    :type tokenise: bool
    :param corenlppath: folder containing corenlp jar files
    :type corenlppath: str
    :param nltk_data_path: path to tokeniser if tokenising
    :type nltk_data_path: str
    :param operations: which kinds of annotations to do
    :type operations: str
    :param speaker_segmentation: add speaker name to parser output if your corpus is script-like
    :type speaker_segmentation: bool

    :returns: path to the created corpus
    """
    import sys
    import os
    from os.path import join, isfile, isdir, basename, splitext, exists
    import shutil
    import codecs
    from corpkit.build import folderise, can_folderise
    from corpkit.process import saferead, make_dotfile
    from corpkit.build import (get_corpus_filepaths, check_jdk, rename_all_files,
                               make_no_id_corpus, parse_corpus, move_parsed_files)
    from corpkit.constants import REPEAT_PARSE_ATTEMPTS, PYTHON_VERSION, INPUTFUNC

    if parse is True and tokenise is True:
        raise ValueError('Select either parse or tokenise, not both.')

    if project_path is None:
        project_path = os.getcwd()

    fileparse = isfile(unparsed_corpus_path)
    if fileparse:
        copier = shutil.copyfile
    else:
        copier = shutil.copytree

    # raise error if no tokeniser
    #if tokenise:
    #    if outname:
    #        newpath = os.path.join(os.path.dirname(unparsed_corpus_path), outname)
    #    else:
    #        newpath = unparsed_corpus_path + '-tokenised'
    #    if isdir(newpath):
    #        shutil.rmtree(newpath)
    #    import nltk
    #    if nltk_data_path:
    #        if nltk_data_path not in nltk.data.path:
    #            nltk.data.path.append(nltk_data_path)
    #    try:
    #        from nltk import word_tokenize as tokenise
    #    except:
    #        print('\nTokeniser not found. Pass in its path as keyword arg "nltk_data_path = <path>".\n')
    #        raise

    if sys.platform == "darwin":
        if not check_jdk():
            print("Get the latest Java from http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html")

    cop_head = kwargs.get('copula_head', True)
    note = kwargs.get('note', False)
    stdout = kwargs.get('stdout', False)

    # make absolute path to corpus
    unparsed_corpus_path = os.path.abspath(unparsed_corpus_path)

    # move it into the project
    if fileparse:
        datapath = project_path
    else:
        datapath = join(project_path, 'data')

    if isdir(datapath):
        newp = join(datapath, basename(unparsed_corpus_path))
    else:
        os.makedirs(datapath)
        if fileparse:
            noext = splitext(unparsed_corpus_path)[0]
            newp = join(datapath, basename(noext))
        else:
            newp = join(datapath, basename(unparsed_corpus_path))

    if exists(newp):
        pass
    else:
        copier(unparsed_corpus_path, newp)
    unparsed_corpus_path = newp

    # ask to folderise?
    check_do_folderise = False
    do_folderise = kwargs.get('folderise', None)
    if can_folderise(unparsed_corpus_path):
        import __main__ as main
        if do_folderise is None and not hasattr(main, '__file__'):
            check_do_folderise = INPUTFUNC("Your corpus has multiple files, but no subcorpora. "
                                           "Would you like each file to be treated as a subcorpus? (y/n) ")
            check_do_folderise = check_do_folderise.lower().startswith('y')
        if check_do_folderise or do_folderise:
            folderise(unparsed_corpus_path)

    # this is bad!
    if join('data', 'data') in unparsed_corpus_path:
        unparsed_corpus_path = unparsed_corpus_path.replace(join('data', 'data'), 'data')

    def chunks(l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]

    if parse or tokenise:

        # this loop shortens files containing more than 500 lines,
        # for corenlp memory's sake. maybe user needs a warning or
        # something in case s/he is doing coref?
        for rootx, dirs, fs in os.walk(unparsed_corpus_path):
            for f in fs:
                if f.startswith('.'):
                    continue
                fp = join(rootx, f)
                data, enc = saferead(fp)
                data = data.splitlines()
                if len(data) > split_texts:
                    chk = chunks(data, split_texts)
                    for index, c in enumerate(chk):
                        newname = fp.replace('.txt', '-%s.txt' % str(index + 1).zfill(3))
                        # does this work?
                        if PYTHON_VERSION == 2:
                            with codecs.open(newname, 'w', encoding=enc) as fo:
                                txt = '\n'.join(c) + '\n'
                                fo.write(txt.encode(enc))
                        else:
                            with open(newname, 'w', encoding=enc) as fo:
                                txt = '\n'.join(c) + '\n'
                                fo.write(txt)
                    os.remove(fp)
                else:
                    pass
                    #newname = fp.replace('.txt', '-000.txt')
                    #os.rename(fp, newname)

        if outname:
            newpath = os.path.join(os.path.dirname(unparsed_corpus_path), outname)
        else:
            newpath = unparsed_corpus_path + '-parsed'

        if restart:
            restart = newpath

        if speaker_segmentation or metadata:
            if isdir(newpath) and not root:
                import __main__ as main
                if not restart and not hasattr(main, '__file__'):
                    ans = INPUTFUNC('\n Path exists: %s. Do you want to overwrite? (y/n)\n' % newpath)
                    if ans.lower().strip()[0] == 'y':
                        shutil.rmtree(newpath)
                    else:
                        return
            elif isdir(newpath) and root:
                raise OSError('Path exists: %s' % newpath)
            if speaker_segmentation:
                print('Processing speaker IDs ...')
            make_no_id_corpus(unparsed_corpus_path,
                              unparsed_corpus_path + '-stripped',
                              metadata_mode=metadata,
                              speaker_segmentation=speaker_segmentation)
            to_parse = unparsed_corpus_path + '-stripped'
        else:
            to_parse = unparsed_corpus_path

        if not fileparse:
            print('Making list of files ... ')

    # now we enter a while loop until all files are parsed
    #todo: these file lists are not necessary when not parsing
    if outname:
        newparsed = os.path.join(project_path, 'data', outname)
    else:
        basecp = os.path.basename(to_parse)
        newparsed = os.path.join(project_path, 'data', '%s-parsed' % basecp)
        newparsed = newparsed.replace('-stripped-', '-')

    while REPEAT_PARSE_ATTEMPTS:

        if not parse:
            break

        if not fileparse:
            pp = os.path.dirname(unparsed_corpus_path)
            # if restart mode, the filepaths won't include those already parsed...
            filelist, fs = get_corpus_filepaths(projpath=pp, corpuspath=to_parse,
                                                restart=restart,
                                                out_ext=kwargs.get('output_format'))
        else:
            filelist = unparsed_corpus_path.replace('.txt', '-filelist.txt')
            with open(filelist, 'w') as fo:
                fo.write(unparsed_corpus_path + '\n')

        # split up filelists
        if multiprocess is not False:

            if multiprocess is True:
                import multiprocessing
                multiprocess = multiprocessing.cpu_count()

            from joblib import Parallel, delayed

            # split old file into n parts
            if os.path.isfile(filelist):
                data, enc = saferead(filelist)
                fs = [i for i in data.splitlines() if i]
            else:
                fs = []

            # if there's nothing here, we're done
            if not fs:
                # double dutch
                REPEAT_PARSE_ATTEMPTS = 0
                break

            if len(fs) <= multiprocess:
                multiprocess = len(fs)

            # make generator with list of lists
            divl = int(len(fs) / multiprocess)
            filelists = []
            if not divl:
                filelists.append(filelist)
            else:
                fgen = chunks(fs, divl)
                # for each list, make new file
                from corpkit.constants import OPENER
                for index, flist in enumerate(fgen):
                    as_str = '\n'.join(flist) + '\n'
                    new_fpath = filelist.replace('.txt', '-%s.txt' % str(index).zfill(4))
                    filelists.append(new_fpath)
                    with OPENER(new_fpath, 'w', encoding='utf-8') as fo:
                        try:
                            fo.write(as_str.encode('utf-8'))
                        except TypeError:
                            fo.write(as_str)
                try:
                    os.remove(filelist)
                except:
                    pass

            ds = []
            for listpath in filelists:
                d = {'proj_path': project_path,
                     'corpuspath': to_parse,
                     'filelist': listpath,
                     'corenlppath': corenlppath,
                     'nltk_data_path': nltk_data_path,
                     'operations': operations,
                     'copula_head': cop_head,
                     'multiprocessing': True,
                     'root': root,
                     'note': note,
                     'stdout': stdout,
                     'outname': outname,
                     'coref': coref,
                     'output_format': kwargs.get('output_format', 'xml')}
                ds.append(d)

            res = Parallel(n_jobs=multiprocess)(delayed(parse_corpus)(**x) for x in ds)
            if len(res) > 0:
                newparsed = res[0]
            else:
                return
            if all(r is False for r in res):
                return

            for i in filelists:
                try:
                    os.remove(i)
                except:
                    pass

        else:
            newparsed = parse_corpus(proj_path=project_path,
                                     corpuspath=to_parse,
                                     filelist=filelist,
                                     corenlppath=corenlppath,
                                     nltk_data_path=nltk_data_path,
                                     operations=operations,
                                     copula_head=cop_head,
                                     root=root,
                                     note=note,
                                     stdout=stdout,
                                     fileparse=fileparse,
                                     outname=outname,
                                     output_format=kwargs.get('output_format', 'conll'))

        if not restart:
            REPEAT_PARSE_ATTEMPTS = 0
        else:
            REPEAT_PARSE_ATTEMPTS -= 1
            print('Repeating parsing due to missing files. '
                  '%d iterations remaining.' % REPEAT_PARSE_ATTEMPTS)

    if parse and not newparsed:
        return
    if parse and all(not x for x in newparsed):
        print('Error after parsing.')
        return

    if parse and fileparse:
        # cleanup mistakes :)
        if isfile(splitext(unparsed_corpus_path)[0]):
            os.remove(splitext(unparsed_corpus_path)[0])
        if isfile(unparsed_corpus_path.replace('.txt', '-filelist.txt')):
            os.remove(unparsed_corpus_path.replace('.txt', '-filelist.txt'))
        return unparsed_corpus_path + '.conll'

    if parse:
        move_parsed_files(project_path, to_parse, newparsed,
                          ext=kwargs.get('output_format', 'conll'), restart=restart)

        from corpkit.conll import convert_json_to_conll
        coref = False
        if operations is False:
            coref = True
        elif 'coref' in operations or 'dcoref' in operations:
            coref = True

        convert_json_to_conll(newparsed,
                              speaker_segmentation=speaker_segmentation,
                              coref=coref,
                              metadata=metadata)

        try:
            os.remove(filelist)
        except:
            pass

    if not parse and tokenise:
        #todo: outname
        newparsed = to_parse.replace('-stripped', '-tokenised')
        from corpkit.tokenise import plaintext_to_conll
        newparsed = plaintext_to_conll(to_parse,
                                       postag=postag,
                                       lemmatise=lemmatise,
                                       lang=lang,
                                       metadata=metadata,
                                       nltk_data_path=nltk_data_path,
                                       speaker_segmentation=speaker_segmentation,
                                       outpath=newparsed)

        if outname:
            if not os.path.isdir(outname):
                outname = os.path.join('data', os.path.basename(outname))
            import shutil
            shutil.copytree(newparsed, outname)
            newparsed = outname

        if newparsed is False:
            return
        else:
            make_dotfile(newparsed)
            return newparsed

    rename_all_files(newparsed)
    print('Generating corpus metadata...')
    make_dotfile(newparsed)
    print('Done!\n')
    return newparsed
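# A minimal, hedged example of the newer make_corpus() signature; the project
# and corpus paths are hypothetical. Parsing assumes a local CoreNLP install
# (pass corenlppath if it is not discoverable), while tokenise=True uses NLTK
# instead of CoreNLP.
#
#     parsed_path = make_corpus('data/my-corpus',
#                               project_path='/path/to/my-project',
#                               parse=True,
#                               speaker_segmentation=True,
#                               metadata=True,
#                               multiprocess=4)
#     # returns the path to the newly created '-parsed' corpus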