Example #1
def get_names(filepath, speakid):
    """
    Get a list of speaker names from a file
    """
    import re
    from corpkit.process import saferead
    txt, enc = saferead(filepath)
    res = re.findall(speakid, txt)
    if res:
        return sorted(list(set([i.strip() for i in res])))
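A minimal usage sketch for get_names; the file path and the speaker-matching pattern below are hypothetical, and only corpkit's saferead is assumed to be installed.

import re

# hypothetical transcript file where lines look like "ALICE: some speech"
speaker_pattern = re.compile(r'^([A-Z][A-Z0-9_ ]{0,39}):', re.MULTILINE)
names = get_names('data/transcripts/episode-01.txt', speaker_pattern)
# a sorted, deduplicated list of captured speaker labels, e.g. ['ALICE', 'BOB']
print(names)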
Example #2
def get_all_metadata_fields(corpus, include_speakers=False):
    """
    Get a list of metadata fields in a corpus

    This could take a while for very little info.
    """
    from corpkit.corpus import Corpus
    from corpkit.constants import OPENER, PYTHON_VERSION, MAX_METADATA_FIELDS

    # allow corpus object
    if not isinstance(corpus, Corpus):
        corpus = Corpus(corpus, print_info=False)
    if not corpus.datatype == 'conll':
        return []

    path = getattr(corpus, 'path', corpus)

    fs = []
    import os
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            fs.append(os.path.join(root, filename))

    badfields = ['parse', 'sent_id']
    if not include_speakers:
        badfields.append('speaker')

    fields = set()
    for f in fs:
        if PYTHON_VERSION == 2:
            from corpkit.process import saferead
            lines = saferead(f)[0].splitlines()
        else:
            with open(f, 'rb') as fo:
                lines = fo.read().decode('utf-8', errors='ignore')
                lines = lines.strip('\n')
                lines = lines.splitlines()

        lines = [l[2:].split('=', 1)[0] for l in lines
                 if l.startswith('# ') and not l.startswith('# sent_id')]
        for l in lines:
            if l not in fields and l not in badfields:
                fields.add(l)
        if len(fields) > MAX_METADATA_FIELDS:
            break
    return list(fields)
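A usage sketch for get_all_metadata_fields, assuming a corpkit-style CoNLL corpus whose files carry '# field=value' metadata lines; the corpus path is hypothetical.

# hypothetical path to a parsed (CoNLL) corpus inside a corpkit project
fields = get_all_metadata_fields('data/mycorpus-parsed', include_speakers=True)
print(fields)  # e.g. ['speaker', 'year', 'topic']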
Example #3
def make_no_id_corpus(pth,
                      newpth,
                      metadata_mode=False,
                      speaker_segmentation=False):
    """
    Make version of pth without ids
    """
    import os
    import re
    import shutil
    from corpkit.process import saferead
    from corpkit.build import get_filepaths
    from corpkit.constants import MAX_SPEAKERNAME_SIZE, PYTHON_VERSION
    # define regex broadly enough to accept timestamps, locations if need be
    idregex = re.compile(r'(^.{,%d}?):\s+(.*$)' % MAX_SPEAKERNAME_SIZE)

    try:
        shutil.copytree(pth, newpth)
    except OSError:
        shutil.rmtree(newpth)
        shutil.copytree(pth, newpth)
    files = get_filepaths(newpth)
    names = []
    metadata = []
    for f in files:
        good_data = []
        fo, enc = saferead(f)
        data = fo.splitlines()
        # for each line in the file, remove speaker and metadata
        for datum in data:
            if speaker_segmentation:
                matched = re.search(idregex, datum)
                if matched:
                    names.append(matched.group(1))
                    datum = matched.group(2)
            if metadata_mode:
                splitmet = datum.rsplit('<metadata ', 1)
                # for the impossibly rare case of a line that is '<metadata '
                if not splitmet:
                    continue
                datum = splitmet[0]
            if datum:
                good_data.append(datum)

        with open(f, "w") as fo:
            if PYTHON_VERSION == 2:
                fo.write('\n'.join(good_data).encode('utf-8'))
            else:
                fo.write('\n'.join(good_data))

    if speaker_segmentation:
        from time import localtime, strftime
        thetime = strftime("%H:%M:%S", localtime())
        if len(names) == 0:
            print(
                '%s: No speaker names found. Turn off speaker segmentation.' %
                thetime)
            shutil.rmtree(newpth)
        else:
            try:
                if len(set(names)) < 19:
                    print('%s: Speaker names found: %s' %
                          (thetime, ', '.join(sorted(set(names)))))
                else:
                    print('%s: Speaker names found: %s ... ' %
                          (thetime, ', '.join(sorted(set(names))[:20])))
            except:
                pass
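A sketch of how make_no_id_corpus is typically used before parsing, mirroring the make_corpus examples further down; both paths are hypothetical.

# copy 'data/mycorpus' to 'data/mycorpus-stripped', removing "NAME: " speaker
# prefixes and trailing '<metadata ...>' blocks from every line
make_no_id_corpus('data/mycorpus',
                  'data/mycorpus-stripped',
                  metadata_mode=True,
                  speaker_segmentation=True)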
Example #4
def delete_lines(corpus, annotation, dry_run=True, colour={}):
    """
    Show or delete the necessary lines
    """
    from corpkit.constants import OPENER, PYTHON_VERSION
    import re
    import os
    tagmode = True
    no_can_do = ['sent_id', 'parse']

    if isinstance(annotation, dict):
        tagmode = False
        for k, v in annotation.items():
            if k in no_can_do:
                print("You aren't allowed to delete '%s', sorry." % k)
                return
            if not v:
                v = r'.*?'
            regex = re.compile(r'(# %s=%s)\n' % (k, v), re.MULTILINE)
    else:
        if annotation in no_can_do:
            print("You aren't allowed to delete '%s', sorry." % k)
            return
        regex = re.compile(r'((# tags=.*?)%s;?(.*?))\n' % annotation,
                           re.MULTILINE)

    fs = []
    for (root, dirs, fls) in os.walk(corpus):
        for f in fls:
            fs.append(os.path.join(root, f))

    for f in fs:

        if PYTHON_VERSION == 2:
            from corpkit.process import saferead
            data = saferead(f)[0]
        else:
            with open(f, 'rb') as fo:
                data = fo.read().decode('utf-8', errors='ignore')

        if dry_run:
            if tagmode:
                repl_str = r'\1 <=======\n%s\2\3 <=======\n' % colour.get(
                    'green', '')
            else:
                repl_str = r'\1 <=======\n'
            try:
                repl_str = colour['red'] + repl_str + colour['reset']
            except:
                pass
            data, n = re.subn(regex, repl_str, data)
            nspl = 100 if tagmode else 50
            delim = '<======='
            data = re.split(delim, data, maxsplit=nspl)
            toshow = delim.join(data[:nspl + 1])
            toshow = toshow.rsplit('\n\n', 1)[0]
            print(toshow)
            if n > 50:
                n = n - 50
                print('\n... and %d more changes ... ' % n)

        else:
            if tagmode:
                repl_str = r'\2\3\n'
            else:
                repl_str = ''
            data = re.sub(regex, repl_str, data)
            with OPENER(f, 'w') as fo:
                if PYTHON_VERSION == 2:
                    data = data.encode('utf-8', errors='ignore')
                fo.write(data)
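A usage sketch for delete_lines; the corpus path and field names are hypothetical. A dict argument targets '# key=value' metadata lines, while a string targets entries inside '# tags=...' lines.

# preview (dry run) which '# year=1999' metadata lines would be removed
delete_lines('data/mycorpus-parsed', {'year': '1999'}, dry_run=True)

# actually delete a 'checked' entry from '# tags=...' annotation lines
delete_lines('data/mycorpus-parsed', 'checked', dry_run=False)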
Example #5
def make_corpus(unparsed_corpus_path,
                project_path=None,
                parse=True,
                tokenise=False,
                corenlppath=False,
                nltk_data_path=False,
                operations=False,
                speaker_segmentation=False,
                root=False,
                multiprocess=False,
                split_texts=400,
                **kwargs):
    """
    Create a parsed version of unparsed_corpus using CoreNLP or NLTK's tokeniser

    :param unparsed_corpus_path: path to corpus containing text files, 
                                 or subdirs containing text files
    :type unparsed_corpus_path: str

    :param project_path: path to corpkit project
    :type project_path: str

    :param parse: Do parsing?
    :type parse: bool

    :param tokenise: Do tokenising?
    :type tokenise: bool
    
    :param corenlppath: folder containing corenlp jar files
    :type corenlppath: str
    
    :param nltk_data_path: path to tokeniser if tokenising
    :type nltk_data_path: str
    
    :param operations: which kinds of annotations to do
    :type operations: str
    
    :param speaker_segmentation: add speaker names to parser output if your corpus is script-like
    :type speaker_segmentation: bool

    :returns: list of paths to created corpora
    """

    import sys
    import os
    from os.path import join, isfile, isdir, basename, splitext, exists
    import shutil
    import codecs
    from corpkit.build import folderise, can_folderise
    from corpkit.process import saferead
    from corpkit.constants import INPUTFUNC
    pyver = sys.version_info.major
    from corpkit.build import (get_corpus_filepaths, 
                               check_jdk, 
                               add_ids_to_xml, 
                               rename_all_files,
                               make_no_id_corpus, parse_corpus, move_parsed_files)

    if parse is True and tokenise is True:
        raise ValueError('Select either parse or tokenise, not both.')
    
    if project_path is None:
        project_path = os.getcwd()

    fileparse = isfile(unparsed_corpus_path)
    if fileparse:
        copier = shutil.copyfile
    else:
        copier = shutil.copytree

    # raise error if no tokeniser
    if tokenise:
        newpath = unparsed_corpus_path + '-tokenised'
        if isdir(newpath):
            shutil.rmtree(newpath)
        import nltk
        if nltk_data_path:
            if nltk_data_path not in nltk.data.path:
                nltk.data.path.append(nltk_data_path)
        try:
            from nltk import word_tokenize as tokenise
        except:
            print('\nTokeniser not found. Pass in its path as keyword arg "nltk_data_path = <path>".\n')
            raise

    if sys.platform == "darwin":
        if not check_jdk():
            print("Get the latest Java from http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html")

    cop_head = kwargs.get('copula_head', True)
    note = kwargs.get('note', False)
    stdout = kwargs.get('stdout', False)

    # make absolute path to corpus
    unparsed_corpus_path = os.path.abspath(unparsed_corpus_path)

    # move it into project
    if fileparse:
        datapath = project_path
    else:
        datapath = join(project_path, 'data')
    
    if isdir(datapath):
        newp = join(datapath, basename(unparsed_corpus_path))
    else:
        os.makedirs(datapath)
        if fileparse:
            noext = splitext(unparsed_corpus_path)[0]
            newp = join(datapath, basename(noext))
        else:
            newp = join(datapath, basename(unparsed_corpus_path))

    if exists(newp):
        pass
    else:
        copier(unparsed_corpus_path, newp)
    
    unparsed_corpus_path = newp

    # ask to folderise?
    check_do_folderise = False
    do_folderise = kwargs.get('folderise', None)
    if can_folderise(unparsed_corpus_path):
        if do_folderise is None:
            check_do_folderise = INPUTFUNC("Your corpus has multiple files, but no subcorpora. "
                                           "Would you like each file to be treated as a subcorpus? (y/n) ")
            check_do_folderise = check_do_folderise.lower().startswith('y')
        if check_do_folderise or do_folderise:
            folderise(unparsed_corpus_path)
            
    # this is bad!
    if join('data', 'data') in unparsed_corpus_path:
        unparsed_corpus_path = unparsed_corpus_path.replace(join('data', 'data'), 'data')

    if parse:

        def chunks(l, n):
            for i in range(0, len(l), n):
                yield l[i:i+n]

        # this loop shortens files containing more than 500 lines, for corenlp memory sake
        # maybe user needs a warning or something in case s/he is doing coref
        for rootx, dirs, fs in os.walk(unparsed_corpus_path):
            for f in fs:
                if f.startswith('.'):
                    continue
                fp = join(rootx, f)
                data, enc = saferead(fp)
                data = data.splitlines()
                if len(data) > split_texts:
                    chk = chunks(data, split_texts)
                    for index, c in enumerate(chk):
                        newname = fp.replace('.txt', '-%s.txt' % str(index + 1).zfill(3))
                        with codecs.open(newname, 'w', encoding='utf-8') as fo:
                            txt = '\n'.join(c) + '\n'
                            fo.write(txt)
                    os.remove(fp)
                else:
                    pass
                    #newname = fp.replace('.txt', '-000.txt')
                    #os.rename(fp, newname)

        if speaker_segmentation:
            newpath = unparsed_corpus_path + '-stripped-parsed'
            if isdir(newpath) and not root:
                ans = INPUTFUNC('\n Path exists: %s. Do you want to overwrite? (y/n)\n' %newpath)
                if ans.lower().strip()[0] == 'y':
                    shutil.rmtree(newpath)
                else:
                    return
            elif isdir(newpath) and root:
                raise OSError('Path exists: %s' %newpath)
            print('Processing speaker IDs ...')
            make_no_id_corpus(unparsed_corpus_path, unparsed_corpus_path + '-stripped')
            to_parse = unparsed_corpus_path + '-stripped'
        else:
            to_parse = unparsed_corpus_path

        if not fileparse:
            print('Making list of files ... ')

        if not fileparse:
            pp = os.path.dirname(unparsed_corpus_path)
            filelist = get_corpus_filepaths(projpath=pp, 
                                            corpuspath=to_parse)

        else:
            filelist = unparsed_corpus_path.replace('.txt', '-filelist.txt')
            with open(filelist, 'w') as fo:
                fo.write(unparsed_corpus_path + '\n')

        if multiprocess is not False:

            if multiprocess is True:
                import multiprocessing
                multiprocess = multiprocessing.cpu_count()
            from joblib import Parallel, delayed
            # split old file into n parts
            data, enc = saferead(filelist)
            fs = [i for i in data.splitlines() if i]
            # make generator with list of lists
            divl = int(len(fs) / multiprocess) or 1
            fgen = chunks(fs, divl)
            filelists = []
            # for each list, make new file
            for index, flist in enumerate(fgen):
                as_str = '\n'.join(flist) + '\n'
                new_fpath = filelist.replace('.txt', '-%s.txt' % str(index).zfill(4))
                filelists.append(new_fpath)
                with codecs.open(new_fpath, 'w', encoding='utf-8') as fo:
                    fo.write(as_str)
            try:
                os.remove(filelist)
            except:
                pass

            ds = []
            for listpath in filelists:
                d = {'proj_path': project_path, 
                     'corpuspath': to_parse,
                     'filelist': listpath,
                     'corenlppath': corenlppath,
                     'nltk_data_path': nltk_data_path,
                     'operations': operations,
                     'copula_head': cop_head,
                     'multiprocessing': True,
                     'root': root,
                     'note': note,
                     'stdout': stdout
                    }
                ds.append(d)

            res = Parallel(n_jobs=multiprocess)(delayed(parse_corpus)(**x) for x in ds)
            if len(res) > 0:
                newparsed = res[0]
            else:
                return
            if all(r is False for r in res):
                return

            for i in filelists:
                try:
                    os.remove(i)
                except:
                    pass

        else:
            newparsed = parse_corpus(proj_path=project_path, 
                                     corpuspath=to_parse,
                                     filelist=filelist,
                                     corenlppath=corenlppath,
                                     nltk_data_path=nltk_data_path,
                                     operations=operations,
                                     copula_head=cop_head,
                                     root=root,
                                     note=note,
                                     stdout=stdout,
                                     fileparse=fileparse)

        if not newparsed:
            return 
        if all(not x for x in newparsed):
            return

        if fileparse:
            # cleanup mistakes :)
            if isfile(splitext(unparsed_corpus_path)[0]):
                os.remove(splitext(unparsed_corpus_path)[0])
            if isfile(unparsed_corpus_path.replace('.txt', '-filelist.txt')):
                os.remove(unparsed_corpus_path.replace('.txt', '-filelist.txt'))
            return unparsed_corpus_path + '.xml'
        
        move_parsed_files(project_path, to_parse, newparsed)
        outpath = newparsed
        if speaker_segmentation:
            add_ids_to_xml(newparsed)
        try:
            os.remove(filelist)
        except:
            pass

    else:
        filelist = get_corpus_filepaths(projpath=os.path.dirname(unparsed_corpus_path), 
                                        corpuspath=unparsed_corpus_path)

    if tokenise:
        newtok = parse_corpus(proj_path=project_path, 
                              corpuspath=unparsed_corpus_path,
                              filelist=filelist,
                              nltk_data_path=nltk_data_path,
                              operations=operations,
                              only_tokenise=True
                             )
        if newtok is False:
            return   
        outpath = newtok

    rename_all_files(outpath)
    print('Done!\n')
    return outpath
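A sketch of a typical call to this version of make_corpus, assuming CoreNLP and a Java runtime are available and that the function is importable from your corpkit installation; the corpus and project paths are hypothetical.

# parse a folder of text files with CoreNLP, using four worker processes
parsed_path = make_corpus('forum-posts',
                          project_path='my_project',
                          parse=True,
                          speaker_segmentation=False,
                          multiprocess=4)
print(parsed_path)  # path to the newly parsed corpus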
Example #6
def plaintext_to_conll(inpath,
                       postag=False,
                       lemmatise=False,
                       lang='en',
                       metadata=False,
                       outpath=False,
                       nltk_data_path=False,
                       speaker_segmentation=False):
    """
    Take a plaintext corpus and sent/word tokenise.

    :param inpath: The corpus to read in
    :param postag: do POS tagging?
    :param lemmatise: do lemmatisation?
    :param lang: choose language for pos/lemmatiser (not implemented yet)
    :param metadata: add metadata to conll (not implemented yet)
    :param outpath: custom name for the resulting corpus
    :param speaker_segmentation: does the corpus have speaker names?
    """

    import nltk
    import shutil
    import pandas as pd
    from corpkit.process import saferead

    from corpkit.build import get_filepaths
    fps = get_filepaths(inpath, 'txt')

    # IN THE SECTIONS BELOW, WE COULD ADD MULTILINGUAL
    # ANNOTATORS, PROVIDED THEY BEHAVE AS THE NLTK ONES DO

    # SENT TOKENISERS
    from nltk.tokenize.punkt import PunktSentenceTokenizer
    stoker = PunktSentenceTokenizer()
    s_tokers = {'en': stoker}
    sent_tokenizer = s_tokers.get(lang, stoker)

    # WORD TOKENISERS
    tokenisers = {'en': nltk.word_tokenize}
    tokeniser = tokenisers.get(lang, nltk.word_tokenize)

    # LEMMATISERS
    if lemmatise:
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr = WordNetLemmatizer()
        lemmatisers = {'en': lmtzr}
        lemmatiser = lemmatisers.get(lang, lmtzr)

    # POS TAGGERS
    if postag:
        # nltk.download('averaged_perceptron_tagger')
        postaggers = {'en': nltk.pos_tag}
        tagger = postaggers.get(lang, nltk.pos_tag)

    # iterate over files, make a DataFrame of each, convert it
    # to CoNLL and write it to a new filename
    for f in fps:
        for_df = []
        data, enc = saferead(f)
        plain, enc = saferead(f.replace('-stripped', ''))
        #orig_data = data
        #data, offsets = process_meta(data, speaker_segmentation, metadata)
        #nest = []
        sents = sent_tokenizer.tokenize(data)
        soffs = sent_tokenizer.span_tokenize(data)
        toks = [tokeniser(sent) for sent in sents]
        ser = nested_list_to_pandas(toks)
        for_df.append(ser)
        if postag or lemmatise:
            postags = pos_tag_series(ser, tagger)
        if lemmatise:
            lemma = lemmatise_series(ser, postags, lemmatiser)
            for_df.append(lemma)
            for_df.append(postags)
        else:
            if postag:
                for_df.append(postags)
        df = pd.concat(for_df, axis=1)
        fo = new_fname(f, inpath)
        write_df_to_conll(df,
                          fo,
                          metadata=metadata,
                          plain=plain,
                          stripped=data,
                          speaker_segmentation=speaker_segmentation,
                          offsets=soffs)
        nsent = len(set(df.index.labels[0]))
        print('%s created (%d sentences)' % (fo, nsent))

    if '-stripped' in inpath:
        return inpath.replace('-stripped', '-tokenised')
    else:
        return inpath + '-tokenised'
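A usage sketch for plaintext_to_conll, assuming NLTK's punkt, tagger and WordNet data are installed and that the helpers it calls (nested_list_to_pandas, pos_tag_series, lemmatise_series, new_fname, write_df_to_conll) live in the same module; the input path is hypothetical and follows the '-stripped' convention used by make_corpus.

# sentence/word tokenise, POS tag and lemmatise a plaintext corpus
out = plaintext_to_conll('data/mycorpus-stripped',
                         postag=True,
                         lemmatise=True,
                         speaker_segmentation=True)
print(out)  # 'data/mycorpus-tokenised'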
Example #7
def make_corpus(unparsed_corpus_path,
                project_path=None,
                parse=True,
                tokenise=False,
                postag=False,
                lemmatise=False,
                corenlppath=False,
                nltk_data_path=False,
                operations=False,
                speaker_segmentation=False,
                root=False,
                multiprocess=False,
                split_texts=400,
                outname=False,
                metadata=False,
                restart=False,
                coref=True,
                lang='en',
                **kwargs):
    """
    Create a parsed version of unparsed_corpus using CoreNLP or NLTK's tokeniser
    :param unparsed_corpus_path: path to corpus containing text files, 
                                 or subdirs containing text files
    :type unparsed_corpus_path: str
    
    :param project_path: path to corpkit project
    :type project_path: str

    :param parse: Do parsing?
    :type parse: bool
    
    :param tokenise: Do tokenising?
    :type tokenise: bool
    
    :param corenlppath: folder containing corenlp jar files
    :type corenlppath: str
    
    :param nltk_data_path: path to tokeniser if tokenising
    :type nltk_data_path: str
    
    :param operations: which kinds of annotations to do
    :type operations: str
    
    :param speaker_segmentation: add speaker names to parser output if your corpus is script-like
    :type speaker_segmentation: bool
    :returns: list of paths to created corpora
    """

    import sys
    import os
    from os.path import join, isfile, isdir, basename, splitext, exists
    import shutil
    import codecs
    from corpkit.build import folderise, can_folderise
    from corpkit.process import saferead, make_dotfile

    from corpkit.build import (get_corpus_filepaths, check_jdk,
                               rename_all_files, make_no_id_corpus,
                               parse_corpus, move_parsed_files)
    from corpkit.constants import (REPEAT_PARSE_ATTEMPTS, PYTHON_VERSION,
                                   INPUTFUNC)

    if parse is True and tokenise is True:
        raise ValueError('Select either parse or tokenise, not both.')

    if project_path is None:
        project_path = os.getcwd()

    fileparse = isfile(unparsed_corpus_path)
    if fileparse:
        copier = shutil.copyfile
    else:
        copier = shutil.copytree

    # raise error if no tokeniser
    #if tokenise:
    #    if outname:
    #        newpath = os.path.join(os.path.dirname(unparsed_corpus_path), outname)
    #    else:
    #        newpath = unparsed_corpus_path + '-tokenised'
    #    if isdir(newpath):
    #        shutil.rmtree(newpath)
    #    import nltk
    #    if nltk_data_path:
    #        if nltk_data_path not in nltk.data.path:
    #            nltk.data.path.append(nltk_data_path)
    #    try:
    #        from nltk import word_tokenize as tokenise
    #    except:
    #        print('\nTokeniser not found. Pass in its path as keyword arg "nltk_data_path = <path>".\n')
    #        raise

    if sys.platform == "darwin":
        if not check_jdk():
            print(
                "Get the latest Java from http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html"
            )

    cop_head = kwargs.get('copula_head', True)
    note = kwargs.get('note', False)
    stdout = kwargs.get('stdout', False)

    # make absolute path to corpus
    unparsed_corpus_path = os.path.abspath(unparsed_corpus_path)

    # move it into project
    if fileparse:
        datapath = project_path
    else:
        datapath = join(project_path, 'data')

    if isdir(datapath):
        newp = join(datapath, basename(unparsed_corpus_path))
    else:
        os.makedirs(datapath)
        if fileparse:
            noext = splitext(unparsed_corpus_path)[0]
            newp = join(datapath, basename(noext))
        else:
            newp = join(datapath, basename(unparsed_corpus_path))

    if exists(newp):
        pass
    else:
        copier(unparsed_corpus_path, newp)

    unparsed_corpus_path = newp

    # ask to folderise?
    check_do_folderise = False
    do_folderise = kwargs.get('folderise', None)
    if can_folderise(unparsed_corpus_path):
        import __main__ as main
        if do_folderise is None and not hasattr(main, '__file__'):
            check_do_folderise = INPUTFUNC("Your corpus has multiple files, but no subcorpora. "\
                                 "Would you like each file to be treated as a subcorpus? (y/n) ")
            check_do_folderise = check_do_folderise.lower().startswith('y')
        if check_do_folderise or do_folderise:
            folderise(unparsed_corpus_path)

    # this is bad!
    if join('data', 'data') in unparsed_corpus_path:
        unparsed_corpus_path = unparsed_corpus_path.replace(
            join('data', 'data'), 'data')

    def chunks(l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]

    if parse or tokenise:

        # this loop shortens files containing more than 500 lines,
        # for corenlp memory's sake. maybe user needs a warning or
        # something in case s/he is doing coref?
        for rootx, dirs, fs in os.walk(unparsed_corpus_path):
            for f in fs:
                if f.startswith('.'):
                    continue
                fp = join(rootx, f)
                data, enc = saferead(fp)
                data = data.splitlines()
                if len(data) > split_texts:
                    chk = chunks(data, split_texts)
                    for index, c in enumerate(chk):
                        newname = fp.replace(
                            '.txt', '-%s.txt' % str(index + 1).zfill(3))
                        # does this work?
                        if PYTHON_VERSION == 2:
                            with codecs.open(newname, 'w', encoding=enc) as fo:
                                txt = '\n'.join(c) + '\n'
                                fo.write(txt.encode(enc))
                        else:
                            with open(newname, 'w', encoding=enc) as fo:
                                txt = '\n'.join(c) + '\n'
                                fo.write(txt)

                    os.remove(fp)
                else:
                    pass
                    #newname = fp.replace('.txt', '-000.txt')
                    #os.rename(fp, newname)

        if outname:
            newpath = os.path.join(os.path.dirname(unparsed_corpus_path),
                                   outname)
        else:
            newpath = unparsed_corpus_path + '-parsed'
        if restart:
            restart = newpath
        if speaker_segmentation or metadata:
            if isdir(newpath) and not root:
                import __main__ as main
                if not restart and not hasattr(main, '__file__'):
                    ans = INPUTFUNC(
                        '\n Path exists: %s. Do you want to overwrite? (y/n)\n'
                        % newpath)
                    if ans.lower().strip()[0] == 'y':
                        shutil.rmtree(newpath)
                    else:
                        return
            elif isdir(newpath) and root:
                raise OSError('Path exists: %s' % newpath)
            if speaker_segmentation:
                print('Processing speaker IDs ...')
            make_no_id_corpus(unparsed_corpus_path,
                              unparsed_corpus_path + '-stripped',
                              metadata_mode=metadata,
                              speaker_segmentation=speaker_segmentation)
            to_parse = unparsed_corpus_path + '-stripped'
        else:
            to_parse = unparsed_corpus_path

        if not fileparse:
            print('Making list of files ... ')

        # now we enter a while loop while not all files are parsed
        #todo: these file lists are not necessary when not parsing

        if outname:
            newparsed = os.path.join(project_path, 'data', outname)
        else:
            basecp = os.path.basename(to_parse)
            newparsed = os.path.join(project_path, 'data',
                                     '%s-parsed' % basecp)
            newparsed = newparsed.replace('-stripped-', '-')

        while REPEAT_PARSE_ATTEMPTS:

            if not parse:
                break

            if not fileparse:
                pp = os.path.dirname(unparsed_corpus_path)
                # if restart mode, the filepaths won't include those already parsed...
                filelist, fs = get_corpus_filepaths(
                    projpath=pp,
                    corpuspath=to_parse,
                    restart=restart,
                    out_ext=kwargs.get('output_format'))

            else:
                filelist = unparsed_corpus_path.replace(
                    '.txt', '-filelist.txt')
                with open(filelist, 'w') as fo:
                    fo.write(unparsed_corpus_path + '\n')

            # split up filelists
            if multiprocess is not False:

                if multiprocess is True:
                    import multiprocessing
                    multiprocess = multiprocessing.cpu_count()

                from joblib import Parallel, delayed
                # split old file into n parts
                if os.path.isfile(filelist):
                    data, enc = saferead(filelist)
                    fs = [i for i in data.splitlines() if i]
                else:
                    fs = []
                # if there's nothing here, we're done
                if not fs:
                    # double dutch
                    REPEAT_PARSE_ATTEMPTS = 0
                    break
                if len(fs) <= multiprocess:
                    multiprocess = len(fs)
                # make generator with list of lists
                divl = int(len(fs) / multiprocess)
                filelists = []
                if not divl:
                    filelists.append(filelist)
                else:
                    fgen = chunks(fs, divl)

                    # for each list, make new file
                    from corpkit.constants import OPENER
                    for index, flist in enumerate(fgen):
                        as_str = '\n'.join(flist) + '\n'
                        new_fpath = filelist.replace(
                            '.txt', '-%s.txt' % str(index).zfill(4))
                        filelists.append(new_fpath)
                        with OPENER(new_fpath, 'w', encoding='utf-8') as fo:
                            try:
                                fo.write(as_str.encode('utf-8'))
                            except TypeError:
                                fo.write(as_str)

                    try:
                        os.remove(filelist)
                    except:
                        pass

                ds = []
                for listpath in filelists:
                    d = {
                        'proj_path': project_path,
                        'corpuspath': to_parse,
                        'filelist': listpath,
                        'corenlppath': corenlppath,
                        'nltk_data_path': nltk_data_path,
                        'operations': operations,
                        'copula_head': cop_head,
                        'multiprocessing': True,
                        'root': root,
                        'note': note,
                        'stdout': stdout,
                        'outname': outname,
                        'coref': coref,
                        'output_format': kwargs.get('output_format', 'xml')
                    }
                    ds.append(d)

                res = Parallel(n_jobs=multiprocess)(delayed(parse_corpus)(**x)
                                                    for x in ds)
                if len(res) > 0:
                    newparsed = res[0]
                else:
                    return
                if all(r is False for r in res):
                    return

                for i in filelists:
                    try:
                        os.remove(i)
                    except:
                        pass

            else:
                newparsed = parse_corpus(proj_path=project_path,
                                         corpuspath=to_parse,
                                         filelist=filelist,
                                         corenlppath=corenlppath,
                                         nltk_data_path=nltk_data_path,
                                         operations=operations,
                                         copula_head=cop_head,
                                         root=root,
                                         note=note,
                                         stdout=stdout,
                                         fileparse=fileparse,
                                         outname=outname,
                                         output_format=kwargs.get(
                                             'output_format', 'conll'))

            if not restart:
                REPEAT_PARSE_ATTEMPTS = 0
            else:
                REPEAT_PARSE_ATTEMPTS -= 1
                print('Repeating parsing due to missing files. '\
                      '%d iterations remaining.' % REPEAT_PARSE_ATTEMPTS)

        if parse and not newparsed:
            return

        if parse and all(not x for x in newparsed):
            print('Error after parsing.')
            return

        if parse and fileparse:
            # cleanup mistakes :)
            if isfile(splitext(unparsed_corpus_path)[0]):
                os.remove(splitext(unparsed_corpus_path)[0])
            if isfile(unparsed_corpus_path.replace('.txt', '-filelist.txt')):
                os.remove(unparsed_corpus_path.replace('.txt',
                                                       '-filelist.txt'))
            return unparsed_corpus_path + '.conll'

        if parse:
            move_parsed_files(project_path,
                              to_parse,
                              newparsed,
                              ext=kwargs.get('output_format', 'conll'),
                              restart=restart)

            from corpkit.conll import convert_json_to_conll
            coref = False
            if operations is False:
                coref = True
            elif 'coref' in operations or 'dcoref' in operations:
                coref = True

            convert_json_to_conll(newparsed,
                                  speaker_segmentation=speaker_segmentation,
                                  coref=coref,
                                  metadata=metadata)

        try:
            os.remove(filelist)
        except:
            pass

    if not parse and tokenise:
        #todo: outname
        newparsed = to_parse.replace('-stripped', '-tokenised')
        from corpkit.tokenise import plaintext_to_conll
        newparsed = plaintext_to_conll(
            to_parse,
            postag=postag,
            lemmatise=lemmatise,
            lang=lang,
            metadata=metadata,
            nltk_data_path=nltk_data_path,
            speaker_segmentation=speaker_segmentation,
            outpath=newparsed)

        if outname:
            if not os.path.isdir(outname):
                outname = os.path.join('data', os.path.basename(outname))
            shutil.copytree(newparsed, outname)
            newparsed = outname
        if newparsed is False:
            return
        else:
            make_dotfile(newparsed)
            return newparsed

    rename_all_files(newparsed)
    print('Generating corpus metadata...')
    make_dotfile(newparsed)
    print('Done!\n')
    return newparsed
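A sketch of the tokenise-only path through this newer make_corpus, which avoids CoreNLP entirely and hands off to plaintext_to_conll; the corpus path is hypothetical and corpkit plus the required NLTK data are assumed to be installed.

# build a tokenised, tagged and lemmatised CoNLL corpus without parsing
tokenised = make_corpus('data/interviews',
                        parse=False,
                        tokenise=True,
                        postag=True,
                        lemmatise=True,
                        speaker_segmentation=True,
                        metadata=True)
print(tokenised)  # e.g. 'data/interviews-tokenised'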