def get_names(filepath, speakid):
    """
    Get a list of speaker names from a file
    """
    import re
    from corpkit.process import saferead
    txt, enc = saferead(filepath)
    res = re.findall(speakid, txt)
    if res:
        return sorted(list(set([i.strip() for i in res])))
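# A minimal usage sketch for get_names(). The path and speaker-ID regex below
# are hypothetical, not values shipped with corpkit; the regex just needs a
# single capture group around the speaker name so findall() returns strings.
#
#     import re
#     speakid = re.compile(r'^([A-Z][A-Z0-9 -]{0,39}):\s+', re.MULTILINE)
#     names = get_names('data/my-script/episode-01.txt', speakid)
#     # e.g. ['CARMELA', 'TONY'] -- sorted and deduplicated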
def get_all_metadata_fields(corpus, include_speakers=False):
    """
    Get a list of metadata fields in a corpus

    This could take a while for very little information
    """
    from corpkit.corpus import Corpus
    from corpkit.constants import OPENER, PYTHON_VERSION, MAX_METADATA_FIELDS

    # allow a path or a Corpus object
    if not isinstance(corpus, Corpus):
        corpus = Corpus(corpus, print_info=False)
    if not corpus.datatype == 'conll':
        return []

    path = getattr(corpus, 'path', corpus)
    fs = []
    import os
    for root, dirnames, filenames in os.walk(path):
        for filename in filenames:
            fs.append(os.path.join(root, filename))

    badfields = ['parse', 'sent_id']
    if not include_speakers:
        badfields.append('speaker')

    fields = set()
    for f in fs:
        if PYTHON_VERSION == 2:
            from corpkit.process import saferead
            lines = saferead(f)[0].splitlines()
        else:
            with open(f, 'rb') as fo:
                lines = fo.read().decode('utf-8', errors='ignore')
            lines = lines.strip('\n')
            lines = lines.splitlines()
        lines = [l[2:].split('=', 1)[0] for l in lines if l.startswith('# ')
                 and not l.startswith('# sent_id')]
        for l in lines:
            if l not in fields and l not in badfields:
                fields.add(l)
        if len(fields) > MAX_METADATA_FIELDS:
            break
    return list(fields)
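# Hedged example of how get_all_metadata_fields() might be called; the corpus
# path is hypothetical. Either a path string or a Corpus object is accepted,
# but only CONLL-formatted corpora return anything.
#
#     fields = get_all_metadata_fields('data/my-corpus-parsed', include_speakers=True)
#     # e.g. ['speaker', 'year', 'location'] -- 'parse' and 'sent_id' are always excluded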
def make_no_id_corpus(pth, newpth, metadata_mode=False, speaker_segmentation=False):
    """
    Make a version of pth without speaker IDs
    """
    import os
    import re
    import shutil
    from corpkit.process import saferead
    from corpkit.constants import PYTHON_VERSION, MAX_SPEAKERNAME_SIZE
    # define the regex broadly enough to accept timestamps, locations if need be
    idregex = re.compile(r'(^.{,%d}?):\s+(.*$)' % MAX_SPEAKERNAME_SIZE)
    try:
        shutil.copytree(pth, newpth)
    except OSError:
        shutil.rmtree(newpth)
        shutil.copytree(pth, newpth)

    files = get_filepaths(newpth)
    names = []
    metadata = []
    for f in files:
        good_data = []
        fo, enc = saferead(f)
        data = fo.splitlines()
        # for each line in the file, remove the speaker ID and metadata
        for datum in data:
            if speaker_segmentation:
                matched = re.search(idregex, datum)
                if matched:
                    names.append(matched.group(1))
                    datum = matched.group(2)
            if metadata_mode:
                splitmet = datum.rsplit('<metadata ', 1)
                # for the impossibly rare case of a line that is just '<metadata '
                if not splitmet:
                    continue
                datum = splitmet[0]
            if datum:
                good_data.append(datum)

        with open(f, "w") as fo:
            if PYTHON_VERSION == 2:
                fo.write('\n'.join(good_data).encode('utf-8'))
            else:
                fo.write('\n'.join(good_data))

    if speaker_segmentation:
        from time import localtime, strftime
        thetime = strftime("%H:%M:%S", localtime())
        if len(names) == 0:
            print('%s: No speaker names found. Turn off speaker segmentation.' % thetime)
            shutil.rmtree(newpth)
        else:
            try:
                if len(sorted(set(names))) < 19:
                    print('%s: Speaker names found: %s' % (thetime, ', '.join(sorted(set(names)))))
                else:
                    print('%s: Speaker names found: %s ...' % (thetime, ', '.join(sorted(set(names[:20])))))
            except:
                pass
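# Illustrative call for make_no_id_corpus(); both paths are hypothetical.
# The function copies the corpus, strips 'NAME:' prefixes (and, optionally,
# trailing '<metadata ...>' strings) from every line, and reports the speaker
# names it found.
#
#     make_no_id_corpus('data/my-script', 'data/my-script-stripped',
#                       metadata_mode=True, speaker_segmentation=True)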
def delete_lines(corpus, annotation, dry_run=True, colour={}):
    """
    Show or delete the lines matching an annotation
    """
    from corpkit.constants import OPENER, PYTHON_VERSION
    import re
    import os

    tagmode = True
    no_can_do = ['sent_id', 'parse']

    if isinstance(annotation, dict):
        tagmode = False
        for k, v in annotation.items():
            if k in no_can_do:
                print("You aren't allowed to delete '%s', sorry." % k)
                return
            if not v:
                v = r'.*?'
            regex = re.compile(r'(# %s=%s)\n' % (k, v), re.MULTILINE)
    else:
        if annotation in no_can_do:
            print("You aren't allowed to delete '%s', sorry." % annotation)
            return
        regex = re.compile(r'((# tags=.*?)%s;?(.*?))\n' % annotation, re.MULTILINE)

    fs = []
    for (root, dirs, fls) in os.walk(corpus):
        for f in fls:
            fs.append(os.path.join(root, f))

    for f in fs:
        if PYTHON_VERSION == 2:
            from corpkit.process import saferead
            data = saferead(f)[0]
        else:
            with open(f, 'rb') as fo:
                data = fo.read().decode('utf-8', errors='ignore')

        if dry_run:
            if tagmode:
                repl_str = r'\1 <=======\n%s\2\3 <=======\n' % colour.get('green', '')
            else:
                repl_str = r'\1 <=======\n'
            try:
                repl_str = colour['red'] + repl_str + colour['reset']
            except:
                pass
            data, n = re.subn(regex, repl_str, data)
            nspl = 100 if tagmode else 50
            delim = '<======='
            data = re.split(delim, data, maxsplit=nspl)
            toshow = delim.join(data[:nspl + 1])
            toshow = toshow.rsplit('\n\n', 1)[0]
            print(toshow)
            if n > 50:
                n = n - 50
                print('\n... and %d more changes ... ' % n)
        else:
            if tagmode:
                repl_str = r'\2\3\n'
            else:
                repl_str = ''
            data = re.sub(regex, repl_str, data)
            with OPENER(f, 'w') as fo:
                if PYTHON_VERSION == 2:
                    data = data.encode('utf-8', errors='ignore')
                fo.write(data)
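# Two hedged examples of delete_lines(); the corpus path and field values are
# hypothetical. A dict removes whole '# key=value' metadata lines, while a
# string removes a single tag from '# tags=...' lines. With dry_run=True the
# matches are only previewed, marked with '<======='.
#
#     delete_lines('data/my-corpus-parsed', {'year': '1999'}, dry_run=True)
#     delete_lines('data/my-corpus-parsed', 'checkthis', dry_run=False)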
def make_corpus(unparsed_corpus_path, project_path=None, parse=True, tokenise=False,
                corenlppath=False, nltk_data_path=False, operations=False,
                speaker_segmentation=False, root=False, multiprocess=False,
                split_texts=400, **kwargs):
    """
    Create a parsed version of unparsed_corpus using CoreNLP or NLTK's tokeniser

    :param unparsed_corpus_path: path to corpus containing text files,
                                 or subdirs containing text files
    :type unparsed_corpus_path: str
    :param project_path: path to corpkit project
    :type project_path: str
    :param parse: Do parsing?
    :type parse: bool
    :param tokenise: Do tokenising?
    :type tokenise: bool
    :param corenlppath: folder containing corenlp jar files
    :type corenlppath: str
    :param nltk_data_path: path to tokeniser if tokenising
    :type nltk_data_path: str
    :param operations: which kinds of annotations to do
    :type operations: str
    :param speaker_segmentation: add speaker name to parser output if your corpus is script-like
    :type speaker_segmentation: bool

    :returns: path to the created corpus
    """
    import sys
    import os
    from os.path import join, isfile, isdir, basename, splitext, exists
    import shutil
    import codecs
    from corpkit.build import folderise, can_folderise
    from corpkit.process import saferead
    from corpkit.constants import INPUTFUNC

    pyver = sys.version_info.major

    from corpkit.build import (get_corpus_filepaths, check_jdk, add_ids_to_xml,
                               rename_all_files, make_no_id_corpus, parse_corpus,
                               move_parsed_files)

    if parse is True and tokenise is True:
        raise ValueError('Select either parse or tokenise, not both.')

    if project_path is None:
        project_path = os.getcwd()

    fileparse = isfile(unparsed_corpus_path)
    if fileparse:
        copier = shutil.copyfile
    else:
        copier = shutil.copytree

    # raise error if no tokeniser
    if tokenise:
        newpath = unparsed_corpus_path + '-tokenised'
        if isdir(newpath):
            shutil.rmtree(newpath)
        import nltk
        if nltk_data_path:
            if nltk_data_path not in nltk.data.path:
                nltk.data.path.append(nltk_data_path)
        try:
            from nltk import word_tokenize as tokenise
        except:
            print('\nTokeniser not found. Pass in its path as keyword arg "nltk_data_path = <path>".\n')
            raise

    if sys.platform == "darwin":
        if not check_jdk():
            print("Get the latest Java from http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html")

    cop_head = kwargs.get('copula_head', True)
    note = kwargs.get('note', False)
    stdout = kwargs.get('stdout', False)

    # make absolute path to corpus
    unparsed_corpus_path = os.path.abspath(unparsed_corpus_path)

    # move it into the project
    if fileparse:
        datapath = project_path
    else:
        datapath = join(project_path, 'data')

    if isdir(datapath):
        newp = join(datapath, basename(unparsed_corpus_path))
    else:
        os.makedirs(datapath)
        if fileparse:
            noext = splitext(unparsed_corpus_path)[0]
            newp = join(datapath, basename(noext))
        else:
            newp = join(datapath, basename(unparsed_corpus_path))

    if exists(newp):
        pass
    else:
        copier(unparsed_corpus_path, newp)
    unparsed_corpus_path = newp

    # ask to folderise?
    check_do_folderise = False
    do_folderise = kwargs.get('folderise', None)
    if can_folderise(unparsed_corpus_path):
        if do_folderise is None:
            check_do_folderise = INPUTFUNC("Your corpus has multiple files, but no subcorpora. "
                                           "Would you like each file to be treated as a subcorpus? (y/n)")
            check_do_folderise = check_do_folderise.lower().startswith('y')
        if check_do_folderise or do_folderise:
            folderise(unparsed_corpus_path)

    # this is bad!
    if join('data', 'data') in unparsed_corpus_path:
        unparsed_corpus_path = unparsed_corpus_path.replace(join('data', 'data'), 'data')

    if parse:

        def chunks(l, n):
            for i in range(0, len(l), n):
                yield l[i:i+n]

        # this loop shortens files containing more than 500 lines, for corenlp memory's sake
        # maybe the user needs a warning or something in case s/he is doing coref
        for rootx, dirs, fs in os.walk(unparsed_corpus_path):
            for f in fs:
                if f.startswith('.'):
                    continue
                fp = join(rootx, f)
                data, enc = saferead(fp)
                data = data.splitlines()
                if len(data) > split_texts:
                    chk = chunks(data, split_texts)
                    for index, c in enumerate(chk):
                        newname = fp.replace('.txt', '-%s.txt' % str(index + 1).zfill(3))
                        with codecs.open(newname, 'w', encoding='utf-8') as fo:
                            txt = '\n'.join(c) + '\n'
                            fo.write(txt)
                    os.remove(fp)
                else:
                    pass
                    #newname = fp.replace('.txt', '-000.txt')
                    #os.rename(fp, newname)

        if speaker_segmentation:
            newpath = unparsed_corpus_path + '-stripped-parsed'
            if isdir(newpath) and not root:
                ans = INPUTFUNC('\n Path exists: %s. Do you want to overwrite? (y/n)\n' % newpath)
                if ans.lower().strip()[0] == 'y':
                    shutil.rmtree(newpath)
                else:
                    return
            elif isdir(newpath) and root:
                raise OSError('Path exists: %s' % newpath)
            print('Processing speaker IDs ...')
            make_no_id_corpus(unparsed_corpus_path, unparsed_corpus_path + '-stripped')
            to_parse = unparsed_corpus_path + '-stripped'
        else:
            to_parse = unparsed_corpus_path

        if not fileparse:
            print('Making list of files ... ')

        if not fileparse:
            pp = os.path.dirname(unparsed_corpus_path)
            filelist = get_corpus_filepaths(projpath=pp, corpuspath=to_parse)
        else:
            filelist = unparsed_corpus_path.replace('.txt', '-filelist.txt')
            with open(filelist, 'w') as fo:
                fo.write(unparsed_corpus_path + '\n')

        if multiprocess is not False:

            if multiprocess is True:
                import multiprocessing
                multiprocess = multiprocessing.cpu_count()

            from joblib import Parallel, delayed

            # split the old filelist into n parts
            data, enc = saferead(filelist)
            fs = [i for i in data.splitlines() if i]
            # make a generator with a list of lists
            divl = int(len(fs) / multiprocess)
            fgen = chunks(fs, divl)
            filelists = []
            # for each list, make a new file
            for index, flist in enumerate(fgen):
                as_str = '\n'.join(flist) + '\n'
                new_fpath = filelist.replace('.txt', '-%s.txt' % str(index).zfill(4))
                filelists.append(new_fpath)
                with codecs.open(new_fpath, 'w', encoding='utf-8') as fo:
                    fo.write(as_str)
            try:
                os.remove(filelist)
            except:
                pass

            ds = []
            for listpath in filelists:
                d = {'proj_path': project_path,
                     'corpuspath': to_parse,
                     'filelist': listpath,
                     'corenlppath': corenlppath,
                     'nltk_data_path': nltk_data_path,
                     'operations': operations,
                     'copula_head': cop_head,
                     'multiprocessing': True,
                     'root': root,
                     'note': note,
                     'stdout': stdout}
                ds.append(d)

            res = Parallel(n_jobs=multiprocess)(delayed(parse_corpus)(**x) for x in ds)
            if len(res) > 0:
                newparsed = res[0]
            else:
                return
            if all(r is False for r in res):
                return

            for i in filelists:
                try:
                    os.remove(i)
                except:
                    pass

        else:
            newparsed = parse_corpus(proj_path=project_path,
                                     corpuspath=to_parse,
                                     filelist=filelist,
                                     corenlppath=corenlppath,
                                     nltk_data_path=nltk_data_path,
                                     operations=operations,
                                     copula_head=cop_head,
                                     root=root,
                                     note=note,
                                     stdout=stdout,
                                     fileparse=fileparse)

        if not newparsed:
            return
        if all(not x for x in newparsed):
            return

        if fileparse:
            # cleanup mistakes :)
            if isfile(splitext(unparsed_corpus_path)[0]):
                os.remove(splitext(unparsed_corpus_path)[0])
            if isfile(unparsed_corpus_path.replace('.txt', '-filelist.txt')):
                os.remove(unparsed_corpus_path.replace('.txt', '-filelist.txt'))
            return unparsed_corpus_path + '.xml'

        move_parsed_files(project_path, to_parse, newparsed)
        outpath = newparsed
        if speaker_segmentation:
            add_ids_to_xml(newparsed)
        try:
            os.remove(filelist)
        except:
            pass

    else:
        filelist = get_corpus_filepaths(projpath=os.path.dirname(unparsed_corpus_path),
                                        corpuspath=unparsed_corpus_path)

    if tokenise:
        newtok = parse_corpus(proj_path=project_path,
                              corpuspath=unparsed_corpus_path,
                              filelist=filelist,
                              nltk_data_path=nltk_data_path,
                              operations=operations,
                              only_tokenise=True)
        if newtok is False:
            return
        outpath = newtok

    rename_all_files(outpath)
    print('Done!\n')
    return outpath
def plaintext_to_conll(inpath, postag=False, lemmatise=False, lang='en',
                       metadata=False, outpath=False, nltk_data_path=False,
                       speaker_segmentation=False):
    """
    Take a plaintext corpus and sentence- and word-tokenise it.

    :param inpath: The corpus to read in
    :param postag: do POS tagging?
    :param lemmatise: do lemmatisation?
    :param lang: choose language for pos/lemmatiser (not implemented yet)
    :param metadata: add metadata to conll (not implemented yet)
    :param outpath: custom name for the resulting corpus
    :param speaker_segmentation: does the corpus have speaker names?
    """
    import nltk
    import shutil
    import pandas as pd
    from corpkit.process import saferead
    from corpkit.build import get_filepaths

    fps = get_filepaths(inpath, 'txt')

    # IN THE SECTIONS BELOW, WE COULD ADD MULTILINGUAL
    # ANNOTATORS, PROVIDED THEY BEHAVE AS THE NLTK ONES DO

    # SENT TOKENISERS
    from nltk.tokenize.punkt import PunktSentenceTokenizer
    stoker = PunktSentenceTokenizer()
    s_tokers = {'en': stoker}
    sent_tokenizer = s_tokers.get(lang, stoker)

    # WORD TOKENISERS
    tokenisers = {'en': nltk.word_tokenize}
    tokeniser = tokenisers.get(lang, nltk.word_tokenize)

    # LEMMATISERS
    if lemmatise:
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr = WordNetLemmatizer()
        lemmatisers = {'en': lmtzr}
        lemmatiser = lemmatisers.get(lang, lmtzr)

    # POS TAGGERS
    if postag:
        # nltk.download('averaged_perceptron_tagger')
        postaggers = {'en': nltk.pos_tag}
        tagger = postaggers.get(lang, nltk.pos_tag)

    # iterate over files, make a DataFrame of each, convert this
    # to CONLL and send it to a new filename
    for f in fps:
        for_df = []
        data, enc = saferead(f)
        plain, enc = saferead(f.replace('-stripped', ''))
        #orig_data = data
        #data, offsets = process_meta(data, speaker_segmentation, metadata)
        #nest = []
        sents = sent_tokenizer.tokenize(data)
        soffs = sent_tokenizer.span_tokenize(data)
        toks = [tokeniser(sent) for sent in sents]
        ser = nested_list_to_pandas(toks)
        for_df.append(ser)
        if postag or lemmatise:
            postags = pos_tag_series(ser, tagger)
        if lemmatise:
            lemma = lemmatise_series(ser, postags, lemmatiser)
            for_df.append(lemma)
            for_df.append(postags)
        else:
            if postag:
                for_df.append(postags)
        df = pd.concat(for_df, axis=1)
        fo = new_fname(f, inpath)
        write_df_to_conll(df, fo, metadata=metadata, plain=plain,
                          stripped=data, speaker_segmentation=speaker_segmentation,
                          offsets=soffs)
        nsent = len(set(df.index.labels[0]))
        print('%s created (%d sentences)' % (fo, nsent))

    if '-stripped' in inpath:
        return inpath.replace('-stripped', '-tokenised')
    else:
        return inpath + '-tokenised'
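# A hedged usage sketch for plaintext_to_conll(); the input path is
# hypothetical. POS tagging and lemmatisation rely on the relevant NLTK
# resources (e.g. 'averaged_perceptron_tagger', 'wordnet') being downloaded.
#
#     outpath = plaintext_to_conll('data/my-corpus-stripped',
#                                  postag=True, lemmatise=True,
#                                  speaker_segmentation=True)
#     # returns 'data/my-corpus-tokenised'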
def make_corpus(unparsed_corpus_path, project_path=None, parse=True, tokenise=False,
                postag=False, lemmatise=False, corenlppath=False, nltk_data_path=False,
                operations=False, speaker_segmentation=False, root=False,
                multiprocess=False, split_texts=400, outname=False, metadata=False,
                restart=False, coref=True, lang='en', **kwargs):
    """
    Create a parsed version of unparsed_corpus using CoreNLP or NLTK's tokeniser

    :param unparsed_corpus_path: path to corpus containing text files,
                                 or subdirs containing text files
    :type unparsed_corpus_path: str
    :param project_path: path to corpkit project
    :type project_path: str
    :param parse: Do parsing?
    :type parse: bool
    :param tokenise: Do tokenising?
    :type tokenise: bool
    :param corenlppath: folder containing corenlp jar files
    :type corenlppath: str
    :param nltk_data_path: path to tokeniser if tokenising
    :type nltk_data_path: str
    :param operations: which kinds of annotations to do
    :type operations: str
    :param speaker_segmentation: add speaker name to parser output if your corpus is script-like
    :type speaker_segmentation: bool

    :returns: path to the created corpus
    """
    import sys
    import os
    from os.path import join, isfile, isdir, basename, splitext, exists
    import shutil
    import codecs
    from corpkit.build import folderise, can_folderise
    from corpkit.process import saferead, make_dotfile
    from corpkit.build import (get_corpus_filepaths, check_jdk, rename_all_files,
                               make_no_id_corpus, parse_corpus, move_parsed_files)
    from corpkit.constants import REPEAT_PARSE_ATTEMPTS, PYTHON_VERSION, INPUTFUNC

    if parse is True and tokenise is True:
        raise ValueError('Select either parse or tokenise, not both.')

    if project_path is None:
        project_path = os.getcwd()

    fileparse = isfile(unparsed_corpus_path)
    if fileparse:
        copier = shutil.copyfile
    else:
        copier = shutil.copytree

    # raise error if no tokeniser
    #if tokenise:
    #    if outname:
    #        newpath = os.path.join(os.path.dirname(unparsed_corpus_path), outname)
    #    else:
    #        newpath = unparsed_corpus_path + '-tokenised'
    #    if isdir(newpath):
    #        shutil.rmtree(newpath)
    #    import nltk
    #    if nltk_data_path:
    #        if nltk_data_path not in nltk.data.path:
    #            nltk.data.path.append(nltk_data_path)
    #    try:
    #        from nltk import word_tokenize as tokenise
    #    except:
    #        print('\nTokeniser not found. Pass in its path as keyword arg "nltk_data_path = <path>".\n')
    #        raise

    if sys.platform == "darwin":
        if not check_jdk():
            print("Get the latest Java from http://www.oracle.com/technetwork/java/javase/downloads/jdk8-downloads-2133151.html")

    cop_head = kwargs.get('copula_head', True)
    note = kwargs.get('note', False)
    stdout = kwargs.get('stdout', False)

    # make absolute path to corpus
    unparsed_corpus_path = os.path.abspath(unparsed_corpus_path)

    # move it into the project
    if fileparse:
        datapath = project_path
    else:
        datapath = join(project_path, 'data')

    if isdir(datapath):
        newp = join(datapath, basename(unparsed_corpus_path))
    else:
        os.makedirs(datapath)
        if fileparse:
            noext = splitext(unparsed_corpus_path)[0]
            newp = join(datapath, basename(noext))
        else:
            newp = join(datapath, basename(unparsed_corpus_path))

    if exists(newp):
        pass
    else:
        copier(unparsed_corpus_path, newp)
    unparsed_corpus_path = newp

    # ask to folderise?
    check_do_folderise = False
    do_folderise = kwargs.get('folderise', None)
    if can_folderise(unparsed_corpus_path):
        import __main__ as main
        if do_folderise is None and not hasattr(main, '__file__'):
            check_do_folderise = INPUTFUNC("Your corpus has multiple files, but no subcorpora. "
                                           "Would you like each file to be treated as a subcorpus? (y/n) ")
            check_do_folderise = check_do_folderise.lower().startswith('y')
        if check_do_folderise or do_folderise:
            folderise(unparsed_corpus_path)

    # this is bad!
    if join('data', 'data') in unparsed_corpus_path:
        unparsed_corpus_path = unparsed_corpus_path.replace(join('data', 'data'), 'data')

    def chunks(l, n):
        for i in range(0, len(l), n):
            yield l[i:i + n]

    if parse or tokenise:

        # this loop shortens files containing more than 500 lines,
        # for corenlp memory's sake. maybe user needs a warning or
        # something in case s/he is doing coref?
        for rootx, dirs, fs in os.walk(unparsed_corpus_path):
            for f in fs:
                if f.startswith('.'):
                    continue
                fp = join(rootx, f)
                data, enc = saferead(fp)
                data = data.splitlines()
                if len(data) > split_texts:
                    chk = chunks(data, split_texts)
                    for index, c in enumerate(chk):
                        newname = fp.replace('.txt', '-%s.txt' % str(index + 1).zfill(3))
                        # does this work?
                        if PYTHON_VERSION == 2:
                            with codecs.open(newname, 'w', encoding=enc) as fo:
                                txt = '\n'.join(c) + '\n'
                                fo.write(txt.encode(enc))
                        else:
                            with open(newname, 'w', encoding=enc) as fo:
                                txt = '\n'.join(c) + '\n'
                                fo.write(txt)
                    os.remove(fp)
                else:
                    pass
                    #newname = fp.replace('.txt', '-000.txt')
                    #os.rename(fp, newname)

        if outname:
            newpath = os.path.join(os.path.dirname(unparsed_corpus_path), outname)
        else:
            newpath = unparsed_corpus_path + '-parsed'

        if restart:
            restart = newpath

        if speaker_segmentation or metadata:
            if isdir(newpath) and not root:
                import __main__ as main
                if not restart and not hasattr(main, '__file__'):
                    ans = INPUTFUNC('\n Path exists: %s. Do you want to overwrite? (y/n)\n' % newpath)
                    if ans.lower().strip()[0] == 'y':
                        shutil.rmtree(newpath)
                    else:
                        return
            elif isdir(newpath) and root:
                raise OSError('Path exists: %s' % newpath)
            if speaker_segmentation:
                print('Processing speaker IDs ...')
            make_no_id_corpus(unparsed_corpus_path,
                              unparsed_corpus_path + '-stripped',
                              metadata_mode=metadata,
                              speaker_segmentation=speaker_segmentation)
            to_parse = unparsed_corpus_path + '-stripped'
        else:
            to_parse = unparsed_corpus_path

        if not fileparse:
            print('Making list of files ... ')

    # now we enter a while loop until all files are parsed
    #todo: these file lists are not necessary when not parsing
    if outname:
        newparsed = os.path.join(project_path, 'data', outname)
    else:
        basecp = os.path.basename(to_parse)
        newparsed = os.path.join(project_path, 'data', '%s-parsed' % basecp)
        newparsed = newparsed.replace('-stripped-', '-')

    while REPEAT_PARSE_ATTEMPTS:

        if not parse:
            break

        if not fileparse:
            pp = os.path.dirname(unparsed_corpus_path)
            # if restart mode, the filepaths won't include those already parsed...
            filelist, fs = get_corpus_filepaths(projpath=pp, corpuspath=to_parse,
                                                restart=restart,
                                                out_ext=kwargs.get('output_format'))
        else:
            filelist = unparsed_corpus_path.replace('.txt', '-filelist.txt')
            with open(filelist, 'w') as fo:
                fo.write(unparsed_corpus_path + '\n')

        # split up filelists
        if multiprocess is not False:

            if multiprocess is True:
                import multiprocessing
                multiprocess = multiprocessing.cpu_count()

            from joblib import Parallel, delayed

            # split old file into n parts
            if os.path.isfile(filelist):
                data, enc = saferead(filelist)
                fs = [i for i in data.splitlines() if i]
            else:
                fs = []

            # if there's nothing here, we're done
            if not fs:
                # double dutch
                REPEAT_PARSE_ATTEMPTS = 0
                break

            if len(fs) <= multiprocess:
                multiprocess = len(fs)

            # make generator with list of lists
            divl = int(len(fs) / multiprocess)
            filelists = []
            if not divl:
                filelists.append(filelist)
            else:
                fgen = chunks(fs, divl)
                # for each list, make new file
                from corpkit.constants import OPENER
                for index, flist in enumerate(fgen):
                    as_str = '\n'.join(flist) + '\n'
                    new_fpath = filelist.replace('.txt', '-%s.txt' % str(index).zfill(4))
                    filelists.append(new_fpath)
                    with OPENER(new_fpath, 'w', encoding='utf-8') as fo:
                        try:
                            fo.write(as_str.encode('utf-8'))
                        except TypeError:
                            fo.write(as_str)
                try:
                    os.remove(filelist)
                except:
                    pass

            ds = []
            for listpath in filelists:
                d = {'proj_path': project_path,
                     'corpuspath': to_parse,
                     'filelist': listpath,
                     'corenlppath': corenlppath,
                     'nltk_data_path': nltk_data_path,
                     'operations': operations,
                     'copula_head': cop_head,
                     'multiprocessing': True,
                     'root': root,
                     'note': note,
                     'stdout': stdout,
                     'outname': outname,
                     'coref': coref,
                     'output_format': kwargs.get('output_format', 'xml')}
                ds.append(d)

            res = Parallel(n_jobs=multiprocess)(delayed(parse_corpus)(**x) for x in ds)
            if len(res) > 0:
                newparsed = res[0]
            else:
                return
            if all(r is False for r in res):
                return

            for i in filelists:
                try:
                    os.remove(i)
                except:
                    pass

        else:
            newparsed = parse_corpus(proj_path=project_path,
                                     corpuspath=to_parse,
                                     filelist=filelist,
                                     corenlppath=corenlppath,
                                     nltk_data_path=nltk_data_path,
                                     operations=operations,
                                     copula_head=cop_head,
                                     root=root,
                                     note=note,
                                     stdout=stdout,
                                     fileparse=fileparse,
                                     outname=outname,
                                     output_format=kwargs.get('output_format', 'conll'))

        if not restart:
            REPEAT_PARSE_ATTEMPTS = 0
        else:
            REPEAT_PARSE_ATTEMPTS -= 1
            print('Repeating parsing due to missing files. '
                  '%d iterations remaining.' % REPEAT_PARSE_ATTEMPTS)

    if parse and not newparsed:
        return
    if parse and all(not x for x in newparsed):
        print('Error after parsing.')
        return

    if parse and fileparse:
        # cleanup mistakes :)
        if isfile(splitext(unparsed_corpus_path)[0]):
            os.remove(splitext(unparsed_corpus_path)[0])
        if isfile(unparsed_corpus_path.replace('.txt', '-filelist.txt')):
            os.remove(unparsed_corpus_path.replace('.txt', '-filelist.txt'))
        return unparsed_corpus_path + '.conll'

    if parse:
        move_parsed_files(project_path, to_parse, newparsed,
                          ext=kwargs.get('output_format', 'conll'), restart=restart)

        from corpkit.conll import convert_json_to_conll
        coref = False
        if operations is False:
            coref = True
        elif 'coref' in operations or 'dcoref' in operations:
            coref = True

        convert_json_to_conll(newparsed,
                              speaker_segmentation=speaker_segmentation,
                              coref=coref,
                              metadata=metadata)

        try:
            os.remove(filelist)
        except:
            pass

    if not parse and tokenise:
        #todo: outname
        newparsed = to_parse.replace('-stripped', '-tokenised')
        from corpkit.tokenise import plaintext_to_conll
        newparsed = plaintext_to_conll(to_parse,
                                       postag=postag,
                                       lemmatise=lemmatise,
                                       lang=lang,
                                       metadata=metadata,
                                       nltk_data_path=nltk_data_path,
                                       speaker_segmentation=speaker_segmentation,
                                       outpath=newparsed)

        if outname:
            if not os.path.isdir(outname):
                outname = os.path.join('data', os.path.basename(outname))
            import shutil
            shutil.copytree(newparsed, outname)
            newparsed = outname

        if newparsed is False:
            return
        else:
            make_dotfile(newparsed)
            return newparsed

    rename_all_files(newparsed)
    print('Generating corpus metadata...')
    make_dotfile(newparsed)
    print('Done!\n')
    return newparsed
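# A minimal, hedged example of the newer make_corpus() signature; the project
# and corpus paths are hypothetical. Parsing assumes a local CoreNLP install
# (pass corenlppath if it is not discoverable), while tokenise=True uses NLTK
# instead of CoreNLP.
#
#     parsed_path = make_corpus('data/my-corpus',
#                               project_path='/path/to/my-project',
#                               parse=True,
#                               speaker_segmentation=True,
#                               metadata=True,
#                               multiprocess=4)
#     # returns the path to the newly created '-parsed' corpus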