Code Example #1
File: build.py  Project: javelir/corpkit
def rename_all_files(dirs_to_do):
    """get rid of the inserted dirname in filenames after parsing"""
    import os
    # STRINGTYPE (a str/unicode compatibility alias) and get_filepaths are
    # available at module level in corpkit's build.py
    if isinstance(dirs_to_do, STRINGTYPE):
        dirs_to_do = [dirs_to_do]
    for d in dirs_to_do:
        if d.endswith('-parsed'):
            ext = 'txt.xml'
        elif d.endswith('-tokenised'):
            ext = '.p'
        else:
            ext = '.txt'
        fs = get_filepaths(d, ext)
        for f in fs:
            fname = os.path.basename(f)
            justdir = os.path.dirname(f)
            subcorpus = os.path.basename(justdir)
            newname = fname.replace('-%s.%s' % (subcorpus, ext), '.%s' % ext)
            os.rename(f, os.path.join(justdir, newname))
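A minimal sketch of the rename this loop performs, using a hypothetical directory layout (the paths below are illustrative, not taken from corpkit):

import os

# parsing has inserted the subcorpus name ('2013') into the filename;
# rename_all_files strips it back out
f = 'corpus-parsed/2013/chapter1-2013.txt.xml'
ext = 'txt.xml'

fname = os.path.basename(f)                # 'chapter1-2013.txt.xml'
justdir = os.path.dirname(f)               # 'corpus-parsed/2013'
subcorpus = os.path.basename(justdir)      # '2013'
newname = fname.replace('-%s.%s' % (subcorpus, ext), '.%s' % ext)
print(os.path.join(justdir, newname))      # corpus-parsed/2013/chapter1.txt.xml
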
Code Example #2
def make_no_id_corpus(pth, newpth):
    """make version of pth without ids"""
    import os
    import re
    import shutil
    from build import get_filepaths
    # define regex broadly enough to accept timestamps, locations if need be
    idregex = re.compile(r'(^.*?):\s+(.*$)')
    try:
        shutil.copytree(pth, newpth)
    except OSError:
        shutil.rmtree(newpth)
        shutil.copytree(pth, newpth)
    files = get_filepaths(newpth)
    names = []
    for f in files:
        good_data = []
        with open(f, 'r') as fo:
            data = fo.read().splitlines()
            for datum in data:
                matched = re.search(idregex, datum)
                if matched:
                    names.append(matched.group(1))
                    good_data.append(matched.group(2))
                else:
                    names.append('UNIDENTIFIED')
                    good_data.append(datum)
        with open(f, "w") as fo:
            fo.write('\n'.join(good_data))

    from time import localtime, strftime
    thetime = strftime("%H:%M:%S", localtime())
    if len(names) == 0:
        print('%s: No speaker names found. Turn off speaker segmentation.' %
              thetime)
        shutil.rmtree(newpth)
    else:
        if len(sorted(set(names))) < 19:
            print('%s: Speaker names found: %s' %
                  (thetime, ', '.join(sorted(set(names)))))
        else:
            print('%s: Speaker names found: %s ... ' %
                  (thetime, ', '.join(sorted(set(names[:20])))))
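The regex is worth seeing on its own: everything up to the first colon that is followed by whitespace becomes the speaker name (group 1), and the rest of the line is kept as text (group 2), which is why a bracketed timestamp stays attached to the name. A quick sketch with hypothetical transcript lines:

import re

idregex = re.compile(r'(^.*?):\s+(.*$)')

# hypothetical lines: one plain speaker label, one with a timestamp
for line in ['JOHN: Thanks for having me.',
             'SARAH [10:32]: No problem at all.']:
    matched = re.search(idregex, line)
    print(matched.group(1), '->', matched.group(2))
# JOHN -> Thanks for having me.
# SARAH [10:32] -> No problem at all.
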
Code Example #3
def rename_all_files(dirs_to_do):
    """get rid of the inserted dirname in filenames after parsing"""
    import os
    from build import get_filepaths
    if type(dirs_to_do) == str:
        dirs_to_do = [dirs_to_do]
    for d in dirs_to_do:
        if d.endswith('-parsed'):
            ext = 'txt.xml'
        elif d.endswith('-tokenised'):
            ext = '.p'
        else:
            ext = '.txt'
        fs = get_filepaths(d, ext)
        for f in fs:
            fname = os.path.basename(f)
            justdir = os.path.dirname(f)
            subcorpus = os.path.basename(justdir)
            newname = fname.replace('-%s.%s' % (subcorpus, ext), '.%s' % ext)
            os.rename(f, os.path.join(justdir, newname))
Code Example #4
File: build.py  Project: xsongx/corpkit
def make_no_id_corpus(pth, newpth):
    """make version of pth without ids"""
    import os
    import re
    import shutil
    from build import get_filepaths
    # define regex broadly enough to accept timestamps, locations if need be
    idregex = re.compile(r'(^.*?):\s+(.*$)')
    try:
        shutil.copytree(pth, newpth)
    except OSError:
        shutil.rmtree(newpth)
        shutil.copytree(pth, newpth)
    files = get_filepaths(newpth)
    names = []
    for f in files:
        good_data = []
        with open(f, 'r') as fo:
            data = fo.read().splitlines()
            for datum in data:
                matched = re.search(idregex, datum)
                if matched:
                    names.append(matched.group(1))
                    good_data.append(matched.group(2))
                else:
                    names.append('UNIDENTIFIED')
                    good_data.append(datum)
        with open(f, "w") as fo:
            fo.write('\n'.join(good_data))

    from time import localtime, strftime
    thetime = strftime("%H:%M:%S", localtime())
    if len(names) == 0:
        print('%s: No speaker names found. Turn off speaker segmentation.' % thetime)
        shutil.rmtree(newpth)
    else:
        if len(sorted(set(names))) < 19:
            print('%s: Speaker names found: %s' % (thetime, ', '.join(sorted(set(names)))))
        else:
            print('%s: Speaker names found: %s ... ' % (thetime, ', '.join(sorted(set(names[:20])))))
Code Example #5
File: build.py  Project: xsongx/corpkit
def add_ids_to_xml(corpuspath, root = False, note = False):
    """add ids to the xml in corpuspath

    needs the raw files to be in the same dir as corpuspath, without
    '-parsed' in the dir name
    also needs the id files to be in the dir, with '-parsed' changed 
    to -cleaned"""
    import os
    import re
    from bs4 import BeautifulSoup, SoupStrainer
    from build import get_filepaths
    from time import strftime, localtime

    files = get_filepaths(corpuspath, ext = 'xml')
    if note:
        note.progvar.set(0)
    thetime = strftime("%H:%M:%S", localtime())
    print('%s: Processing speaker IDs ...' % thetime)
    if root:
        root.update()

    for i, f in enumerate(files):
        if note:
            note.progvar.set(i * 100.0 / len(files))
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Processing speaker IDs (%d/%d)' % (thetime, i, len(files)))
        if root:
            root.update()
        xmlf = open(f)
        data = xmlf.read()
        xmlf.close()

        # open the unparsed version of the file, read into memory
        stripped_txtfile = f.replace('.xml', '').replace('-parsed', '')
        old_txt = open(stripped_txtfile)
        stripped_txtdata = old_txt.read()
        old_txt.close()

        # open the unparsed version with speaker ids
        id_txtfile = f.replace('.xml', '').replace('-stripped-parsed', '')
        idttxt = open(id_txtfile)
        id_txtdata = idttxt.read()
        idttxt.close()

        # todo: do this with lxml
        soup = BeautifulSoup(data, 'lxml')
        for s in soup.find_all('sentence'):
            # don't get corefs
            if s.parent.name == 'sentences':
                tokens = s.find_all('token')
                start = int(tokens[0].find_all('characteroffsetbegin', limit = 1)[0].text)
                end = int(tokens[-1].find_all('characteroffsetend', limit = 1)[0].text)
                # extract this sentence from the unparsed version
                sent = stripped_txtdata[start:end]
                # find out line number
                # sever at start of match
                cut_old_text = stripped_txtdata[:start]
                line_index = cut_old_text.count('\n')
                # lookup this text
                with_id = id_txtdata.splitlines()[line_index]
                split_line = with_id.split(': ', 1)
                if len(split_line) > 1:
                    speakerid = split_line[0]
                else:
                    speakerid = 'UNIDENTIFIED'
                new_tag = soup.new_tag("speakername")
                s.append(new_tag)
                new_tag.string = speakerid
        html = str(soup.root)
        # make changes
        with open(f, "wb") as fopen:
            fopen.write(bytes(html.encode('utf-8')))
    if note:
        note.progvar.set(100)
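The speaker lookup works by character offsets rather than string matching: the CharacterOffsetBegin of a sentence's first token says where that sentence starts in the stripped text, counting newlines up to that offset gives its line number, and the same line in the id-bearing file supplies the speaker. A minimal sketch with hypothetical file contents:

# hypothetical contents of the stripped file and its id-bearing companion
stripped_txtdata = 'Hello there.\nGood to see you.\n'
id_txtdata = 'ANNA: Hello there.\nBEN: Good to see you.\n'

start = 13                                            # CharacterOffsetBegin of 'Good'
line_index = stripped_txtdata[:start].count('\n')     # 1
with_id = id_txtdata.splitlines()[line_index]         # 'BEN: Good to see you.'
split_line = with_id.split(': ', 1)
speakerid = split_line[0] if len(split_line) > 1 else 'UNIDENTIFIED'
print(speakerid)                                      # BEN
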
Code Example #6
File: build.py  Project: javelir/corpkit
def add_ids_to_xml(corpuspath, root=False, note=False):
    """
    Add ids to the xml in corpuspath

    needs the raw files to be in the same dir as corpuspath, without
    '-parsed' in the dir name
    also needs the id files to be in the dir, with '-parsed' changed 
    to -cleaned
    """
    import os
    import re
    from time import strftime, localtime
    from lxml import etree as ET

    files = get_filepaths(corpuspath, ext='xml')
    if note:
        note.progvar.set(0)
    thetime = strftime("%H:%M:%S", localtime())
    print('%s: Processing speaker IDs ...' % thetime)
    if root:
        root.update()

    for i, f in enumerate(files):
        if note:
            note.progvar.set(i * 100.0 / len(files))
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Processing speaker IDs (%d/%d)' % (thetime, i, len(files)))
        if root:
            root.update()

        # quick check for speakernames already existing
        from itertools import islice
        with open(f, 'r') as xmlf:
            head = list(islice(xmlf, 1000))
        if '<speakername>' in '\n'.join(head):
            continue
        
        tree = ET.parse(f)
        xmlroot = tree.getroot()
        sents = xmlroot[0][0]

        # open the unparsed version of the file, read into memory
        stripped_txtfile = f.replace('.xml', '').replace('-parsed', '')
        old_txt = open(stripped_txtfile, 'r')
        stripped_txtdata = old_txt.read()
        old_txt.close()

        # open the unparsed version with speaker ids
        id_txtfile = f.replace('.xml', '').replace('-stripped-parsed', '')
        idttxt = open(id_txtfile, 'r')
        id_txtdata = idttxt.read()
        idttxt.close()

        for s in sents:
            # don't get corefs
            tokens = [x for x in s.iter('token')]
            # children 2 and 3 of a CoreNLP <token> element are
            # CharacterOffsetBegin and CharacterOffsetEnd (after word and lemma)
            start = int(tokens[0][2].text)
            end = int(tokens[-1][3].text)
            # extract this sentence from the unparsed version
            sent = stripped_txtdata[start:end]
            # find out line number
            # sever at start of match
            cut_old_text = stripped_txtdata[:start]
            line_index = cut_old_text.count('\n')
            # lookup this text
            with_id = id_txtdata.splitlines()[line_index]
            split_line = with_id.split(': ', 1)
            if len(split_line) > 1:
                speakerid = split_line[0]
            else:
                speakerid = 'UNIDENTIFIED'
            newtag = ET.Element('speakername')
            newtag.text = speakerid
            newtag.tail = '\n    '
            s.append(newtag)
        tree.write(f, pretty_print=True)

    if note:
        note.progvar.set(100)
Code Example #7
def add_ids_to_xml(corpuspath, root=False, note=False):
    """add ids to the xml in corpuspath

    needs the raw files to be in the same dir as corpuspath, without
    '-parsed' in the dir name
    also needs the id files to be in the dir, with '-parsed' changed 
    to -cleaned"""
    import os
    import re
    from bs4 import BeautifulSoup, SoupStrainer
    from build import get_filepaths
    from time import strftime, localtime

    files = get_filepaths(corpuspath, ext='xml')
    if note:
        note.progvar.set(0)
    thetime = strftime("%H:%M:%S", localtime())
    print('%s: Processing speaker IDs ...' % thetime)
    if root:
        root.update()

    for i, f in enumerate(files):
        if note:
            note.progvar.set(i * 100.0 / len(files))
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Processing speaker IDs (%d/%d)' % (thetime, i, len(files)))
        if root:
            root.update()
        xmlf = open(f)
        data = xmlf.read()
        xmlf.close()

        # open the unparsed version of the file, read into memory
        stripped_txtfile = f.replace('.xml', '').replace('-parsed', '')
        old_txt = open(stripped_txtfile)
        stripped_txtdata = old_txt.read()
        old_txt.close()

        # open the unparsed version with speaker ids
        id_txtfile = f.replace('.xml', '').replace('-stripped-parsed', '')
        idttxt = open(id_txtfile)
        id_txtdata = idttxt.read()
        idttxt.close()

        # todo: do this with lxml
        soup = BeautifulSoup(data, 'lxml')
        for s in soup.find_all('sentence'):
            # don't get corefs
            if s.parent.name == 'sentences':
                tokens = s.find_all('token')
                start = int(tokens[0].find_all('characteroffsetbegin',
                                               limit=1)[0].text)
                end = int(tokens[-1].find_all('characteroffsetend',
                                              limit=1)[0].text)
                # extract this sentence from the unparsed version
                sent = stripped_txtdata[start:end]
                # find out line number
                # sever at start of match
                cut_old_text = stripped_txtdata[:start]
                line_index = cut_old_text.count('\n')
                # lookup this text
                with_id = id_txtdata.splitlines()[line_index]
                split_line = with_id.split(': ', 1)
                if len(split_line) > 1:
                    speakerid = split_line[0]
                else:
                    speakerid = 'UNIDENTIFIED'
                new_tag = soup.new_tag("speakername")
                s.append(new_tag)
                new_tag.string = speakerid
        html = str(soup.root)
        # make changes
        with open(f, "wb") as fopen:
            fopen.write(bytes(html.encode('utf-8')))
    if note:
        note.progvar.set(100)