Example #1
def make_no_id_corpus(pth, newpth):
    """make version of pth without ids"""
    import os
    import re
    import shutil
    from corpkit.build import get_filepaths
    # define regex broadly enough to accept timestamps, locations if need be
    idregex = re.compile(r'(^.*?):\s+(.*$)')
    try:
        shutil.copytree(pth, newpth)
    except OSError:
        shutil.rmtree(newpth)
        shutil.copytree(pth, newpth)
    files = get_filepaths(newpth)
    names = []
    for f in files:
        good_data = []
        with open(f) as fo:
            data = fo.read().splitlines()
            for datum in data:
                matched = re.search(idregex, datum)
                if matched:
                    names.append(matched.group(1))
                    good_data.append(matched.group(2))
        with open(f, "w") as fo:
            fo.write('\n'.join(good_data))
    if len(names) == 0:
        from time import localtime, strftime
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: No speaker names found. Turn off speaker segmentation.' % thetime)
        shutil.rmtree(newpth)
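A minimal sketch of how the speaker-id regex used above behaves; the sample line is invented for illustration:

import re

idregex = re.compile(r'(^.*?):\s+(.*$)')
line = 'JOHN:   so what do you think?'   # hypothetical corpus line
m = re.search(idregex, line)
if m:
    print(m.group(1))  # 'JOHN'                    -> collected as a speaker name
    print(m.group(2))  # 'so what do you think?'   -> kept in the no-id corpus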
Example #2
def rename_all_files(dirs_to_do):
    """get rid of the inserted dirname in filenames after parsing"""
    import os
    from corpkit.build import get_filepaths
    for d in dirs_to_do:
        if not d.endswith('-parsed'):
            ext = 'txt'
        else:
            ext = 'txt.xml'
        fs = get_filepaths(d, ext)
        for f in fs:
            fname = os.path.basename(f)
            justdir = os.path.dirname(f)
            subcorpus = os.path.basename(justdir)
            newname = fname.replace('-%s.%s' % (subcorpus, ext), '.%s' % ext)
            os.rename(f, os.path.join(justdir, newname))
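A rough illustration of the renaming with hypothetical paths, showing how the subcorpus name inserted during parsing is stripped back out of the filename:

import os

f = 'corpus-parsed/chapter1/file01-chapter1.txt.xml'   # hypothetical path
ext = 'txt.xml'
justdir = os.path.dirname(f)                           # 'corpus-parsed/chapter1'
subcorpus = os.path.basename(justdir)                  # 'chapter1'
newname = os.path.basename(f).replace('-%s.%s' % (subcorpus, ext), '.%s' % ext)
print(os.path.join(justdir, newname))                  # 'corpus-parsed/chapter1/file01.txt.xml'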
Example #3
def get_list_of_speaker_names(corpuspath):
    """return a list of speaker names in a pre-processed corpus"""
    import os
    import re
    from corpkit.build import get_filepaths
    files = get_filepaths(corpuspath)
    names = []
    idregex = re.compile(r'(^.*?):\s+(.*$)')
    for f in files:
        with open(f) as fo:
            data = fo.read().splitlines()
        for l in data:
            m = re.search(idregex, l)
            if m:
                if m.group(1) not in names:
                    names.append(m.group(1))
    return sorted(list(set(names)))
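A hedged usage sketch, assuming corpkit is installed and that 'data/interviews-cleaned' is a pre-processed corpus with speaker-prefixed lines:

names = get_list_of_speaker_names('data/interviews-cleaned')  # hypothetical path
print(names)  # e.g. ['INTERVIEWER', 'JOHN', 'MARY']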
Example #4
def add_ids_to_xml(corpuspath, root = False, note = False):
    """add ids to the xml in corpuspath

    needs the raw files to be in the same dir as corpuspath, without
    '-parsed' in the dir name
    also needs the id files to be in the dir, with '-parsed' changed 
    to -cleaned"""
    import os
    import re
    from bs4 import BeautifulSoup, SoupStrainer
    from corpkit.build import get_filepaths
    from time import strftime, localtime

    files = get_filepaths(corpuspath, ext = 'xml')
    if note:
        note.progvar.set(0)
    thetime = strftime("%H:%M:%S", localtime())
    print('%s: Processing speaker IDs ...' % thetime)
    if root:
        root.update()

    for i, f in enumerate(files):
        if note:
            note.progvar.set(i * 100.0 / len(files))
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Processing speaker IDs (%d/%d)' % (thetime, i, len(files)))
        if root:
            root.update()
        with open(f) as xmlf:
            data = xmlf.read()

        # open the unparsed version of the file, read into memory
        stripped_txtfile = f.replace('.xml', '').replace('-parsed', '')
        with open(stripped_txtfile) as old_txt:
            stripped_txtdata = old_txt.read()

        # open the unparsed version with speaker ids
        id_txtfile = f.replace('.xml', '').replace('-stripped-parsed', '')
        with open(id_txtfile) as idtxt:
            id_txtdata = idtxt.read()

        # todo: do this with lxml
        soup = BeautifulSoup(data, "lxml")
        for s in soup.find_all('sentence'):
            # don't get corefs
            if s.parent.name == 'sentences':
                tokens = s.find_all('token')
                start = int(tokens[0].find_all('characteroffsetbegin', limit = 1)[0].text)
                end = int(tokens[-1].find_all('characteroffsetend', limit = 1)[0].text)
                # extract this sentence from the unparsed version
                sent = stripped_txtdata[start:end]
                # find out line number
                # sever at start of match
                cut_old_text = stripped_txtdata[:start]
                line_index = cut_old_text.count('\n')
                # lookup this text
                with_id = id_txtdata.splitlines()[line_index]
                split_line = with_id.split(': ', 1)
                if len(split_line) > 1:
                    speakerid = split_line[0]
                else:
                    speakerid = 'UNIDENTIFIED'
                new_tag = soup.new_tag("speakername")
                s.append(new_tag)
                new_tag.string = speakerid
        html = str(soup.root)
        # make changes
        with open(f, "wb") as fopen:
            fopen.write(html)
    if note:
        note.progvar.set(100)
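A small sketch of the offset-to-speaker lookup used above, with made-up strings standing in for the stripped and id-bearing files:

stripped_txtdata = 'hello there\nhow are you\n'          # no speaker ids
id_txtdata = 'JOHN: hello there\nMARY: how are you\n'    # same lines, with ids

start = stripped_txtdata.index('how are you')            # character offset from the parse
line_index = stripped_txtdata[:start].count('\n')        # -> 1
with_id = id_txtdata.splitlines()[line_index]            # 'MARY: how are you'
speakerid = with_id.split(': ', 1)[0]                    # 'MARY'
print(speakerid)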
Example #5
def turn_to_plaintext(corpus_dir='xml'):
    """
    turn xml corpus into corpkit input
    takes about 5 minutes, most of which is lang detection
    also determines language so that we can remove baddies later
    """
    import os
    # just an os.walk/glob type function
    from corpkit.build import get_filepaths
    from lxml import etree as ET
    from langdetect import detect, detect_langs

    # make a new directory if need be
    outdir = '%s-form' % corpus_dir
    try:
        os.makedirs(outdir)
    except OSError:
        pass

    # this function pretty much just calls os.walk
    fs = get_filepaths(corpus_dir, 'xml')

    # parse metadata, put in corpkit format
    # write to same filepath in outdir
    for index, f in enumerate(fs, start=1):
        # progress info
        print("%.2f: %s" % ((index * 100.0 / len(fs)), f))
        # make new filename
        fpath, fname = os.path.split(f)
        outpath = fpath.replace(corpus_dir, outdir, 1)
        outname = fname.replace('.xml', '.txt')
        try:
            os.makedirs(outpath)
        except OSError:
            pass
        # get all the xml into a string
        root = ET.parse(f).getroot()
        metastring = ' <metadata '
        for k, v in sorted(root.items()):
            k = k.strip('"').strip("'").lstrip().rstrip()
            v = v.strip('"').strip("'").lstrip().rstrip()
            metastring += "%s='%s' " % (k, v)

        pages = ''
        # detect language
        langs = detect_langs('\n'.join([i.text for i in root if i.text]))
        lang, score = format_langs_output(langs)

        # pages don't end at sentence boundaries. join it all together
        for i, page in enumerate(root, start=1):
            if page.text:
                pagetext = page.text.replace('\n', ' ')
                pages += pagetext.rstrip('\n')

        metend = "lang='%s' engprob='%s'>\n" % (lang, score)
        pages += metastring + metend

        # write to file
        with open(os.path.join(outpath, outname), 'wb') as fo:
            fo.write(pages.encode('utf-8'))
    return outdir
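Roughly, each output file in the '-form' directory ends up as one long line of page text followed by a corpkit-style metadata tag; the attribute names and values below are invented for illustration, while 'lang' and 'engprob' come from the language detection step:

# xml-form/<subdir>/<file>.txt, all on one line:
# Some page text joined together ... <metadata author='Smith' title='Example' lang='en' engprob='0.99'>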
Example #6
def plaintext_to_conll(inpath,
                       postag=False,
                       lemmatise=False,
                       lang='en',
                       metadata=False,
                       outpath=False,
                       nltk_data_path=False,
                       speaker_segmentation=False):
    """
    Take a plaintext corpus and sent/word tokenise.

    :param inpath: The corpus to read in
    :param postag: do POS tagging?
    :param lemmatise: do lemmatisation?
    :param lang: choose language for pos/lemmatiser (not implemented yet)
    :param metadata: add metadata to conll (not implemented yet)
    :param outpath: custom name for the resulting corpus
    :param speaker_segmentation: does the corpus have speaker names?
    """

    import nltk
    import shutil
    import pandas as pd
    from corpkit.process import saferead

    from corpkit.build import get_filepaths
    fps = get_filepaths(inpath, 'txt')

    # IN THE SECTIONS BELOW, WE COULD ADD MULTILINGUAL
    # ANNOTATORS, PROVIDED THEY BEHAVE AS THE NLTK ONES DO

    # SENT TOKENISERS
    from nltk.tokenize.punkt import PunktSentenceTokenizer
    stoker = PunktSentenceTokenizer()
    s_tokers = {'en': stoker}
    sent_tokenizer = s_tokers.get(lang, stoker)

    # WORD TOKENISERS
    tokenisers = {'en': nltk.word_tokenize}
    tokeniser = tokenisers.get(lang, nltk.word_tokenize)

    # LEMMATISERS
    if lemmatise:
        from nltk.stem.wordnet import WordNetLemmatizer
        lmtzr = WordNetLemmatizer()
        lemmatisers = {'en': lmtzr}
        lemmatiser = lemmatisers.get(lang, lmtzr)

    # POS TAGGERS
    if postag:
        # nltk.download('averaged_perceptron_tagger')
        postaggers = {'en': nltk.pos_tag}
        tagger = postaggers.get(lang, nltk.pos_tag)

    # iterate over files, make df of each, convert this
    # to conll and sent to new filename
    for f in fps:
        for_df = []
        data, enc = saferead(f)
        plain, enc = saferead(f.replace('-stripped', ''))
        #orig_data = data
        #data, offsets = process_meta(data, speaker_segmentation, metadata)
        #nest = []
        sents = sent_tokenizer.tokenize(data)
        soffs = sent_tokenizer.span_tokenize(data)
        toks = [tokeniser(sent) for sent in sents]
        ser = nested_list_to_pandas(toks)
        for_df.append(ser)
        if postag or lemmatise:
            postags = pos_tag_series(ser, tagger)
        if lemmatise:
            lemma = lemmatise_series(ser, postags, lemmatiser)
            for_df.append(lemma)
            for_df.append(postags)
        else:
            if postag:
                for_df.append(postags)
        df = pd.concat(for_df, axis=1)
        fo = new_fname(f, inpath)
        write_df_to_conll(df,
                          fo,
                          metadata=metadata,
                          plain=plain,
                          stripped=data,
                          speaker_segmentation=speaker_segmentation,
                          offsets=soffs)
        nsent = len(set(df.index.labels[0]))
        print('%s created (%d sentences)' % (fo, nsent))

    if '-stripped' in inpath:
        return inpath.replace('-stripped', '-tokenised')
    else:
        return inpath + '-tokenised'
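A hedged usage sketch; the corpus path is hypothetical, and helpers such as nested_list_to_pandas, pos_tag_series and write_df_to_conll are assumed to live alongside this function in the same module:

out = plaintext_to_conll('mycorpus-stripped',   # hypothetical plaintext corpus
                         postag=True,
                         lemmatise=True,
                         speaker_segmentation=True)
print(out)  # 'mycorpus-tokenised'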