def rename_all_files(dirs_to_do):
    """Get rid of the inserted dirname in filenames after parsing.

    :param dirs_to_do: a corpus directory path, or a list of them
    """
    import os
    # was missing here: the other copy of this function imports it from build
    from build import get_filepaths
    # accept a single path as well as a list of paths
    if isinstance(dirs_to_do, str):
        dirs_to_do = [dirs_to_do]
    for d in dirs_to_do:
        # the extension depends on which processing stage produced the dir
        if d.endswith('-parsed'):
            ext = 'txt.xml'
        elif d.endswith('-tokenised'):
            ext = '.p'
        else:
            ext = '.txt'
        for f in get_filepaths(d, ext):
            fname = os.path.basename(f)
            justdir = os.path.dirname(f)
            subcorpus = os.path.basename(justdir)
            # 'file-subcorpus.ext' -> 'file.ext'
            newname = fname.replace('-%s.%s' % (subcorpus, ext), '.%s' % ext)
            os.rename(f, os.path.join(justdir, newname))
def make_no_id_corpus(pth, newpth):
    """Make a copy of the corpus at *pth* with speaker IDs stripped out.

    :param pth: path to the original corpus
    :param newpth: path for the ID-free copy (replaced if it already exists)
    """
    import os
    import re
    import shutil
    from time import localtime, strftime
    from build import get_filepaths
    # define regex broadly enough to accept timestamps, locations if need be
    idregex = re.compile(r'(^.*?):\s+(.*$)')
    try:
        shutil.copytree(pth, newpth)
    except OSError:
        # destination already exists: replace it with a fresh copy
        shutil.rmtree(newpth)
        shutil.copytree(pth, newpth)
    names = []
    for f in get_filepaths(newpth):
        good_data = []
        with open(f, 'r') as fo:
            data = fo.read().splitlines()
        for datum in data:
            matched = re.search(idregex, datum)
            if matched:
                # keep the name for reporting, keep the line minus the id
                names.append(matched.group(1))
                good_data.append(matched.group(2))
            else:
                names.append('UNIDENTIFIED')
                good_data.append(datum)
        with open(f, "w") as fo:
            fo.write('\n'.join(good_data))
    thetime = strftime("%H:%M:%S", localtime())
    if len(names) == 0:
        print('%s: No speaker names found. Turn off speaker segmentation.' % thetime)
        shutil.rmtree(newpth)
    else:
        unique_names = sorted(set(names))
        if len(unique_names) < 19:
            print('%s: Speaker names found: %s' % (thetime, ', '.join(unique_names)))
        else:
            # dedupe BEFORE slicing: the old names[:20] slice counted
            # duplicate turns, so fewer than 20 unique names were shown
            print('%s: Speaker names found: %s ... ' % (thetime, ', '.join(unique_names[:20])))
def rename_all_files(dirs_to_do):
    """Get rid of the inserted dirname in filenames after parsing.

    :param dirs_to_do: a corpus directory path, or a list of them
    """
    import os
    from build import get_filepaths
    # isinstance, not type() ==: also accepts str subclasses
    if isinstance(dirs_to_do, str):
        dirs_to_do = [dirs_to_do]
    for d in dirs_to_do:
        # the extension depends on which processing stage produced the dir
        if d.endswith('-parsed'):
            ext = 'txt.xml'
        elif d.endswith('-tokenised'):
            ext = '.p'
        else:
            ext = '.txt'
        for f in get_filepaths(d, ext):
            fname = os.path.basename(f)
            justdir = os.path.dirname(f)
            subcorpus = os.path.basename(justdir)
            # 'file-subcorpus.ext' -> 'file.ext'
            newname = fname.replace('-%s.%s' % (subcorpus, ext), '.%s' % ext)
            os.rename(f, os.path.join(justdir, newname))
def add_ids_to_xml(corpuspath, root=False, note=False):
    """Add <speakername> ids to the xml in corpuspath.

    Needs the raw files to be in the same dir as corpuspath, without
    '-parsed' in the dir name; also needs the id files to be in the dir,
    with '-parsed' changed to -cleaned.

    :param corpuspath: directory containing the parsed .xml files
    :param root: optional Tk root, updated to keep a GUI responsive
    :param note: optional progress holder with a ``.progvar`` attribute
    """
    from bs4 import BeautifulSoup
    from build import get_filepaths
    from time import strftime, localtime
    files = get_filepaths(corpuspath, ext='xml')
    if note:
        note.progvar.set(0)
    thetime = strftime("%H:%M:%S", localtime())
    print('%s: Processing speaker IDs ...' % thetime)
    if root:
        root.update()
    for i, f in enumerate(files):
        if note:
            note.progvar.set(i * 100.0 / len(files))
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Processing speaker IDs (%d/%d)' % (thetime, i, len(files)))
        if root:
            root.update()
        # context managers so handles are closed even if parsing fails
        with open(f) as xmlf:
            data = xmlf.read()
        # the unparsed version of the file, read into memory
        stripped_txtfile = f.replace('.xml', '').replace('-parsed', '')
        with open(stripped_txtfile) as old_txt:
            stripped_txtdata = old_txt.read()
        # the unparsed version with speaker ids
        id_txtfile = f.replace('.xml', '').replace('-stripped-parsed', '')
        with open(id_txtfile) as idttxt:
            id_lines = idttxt.read().splitlines()
        # todo: do this with lxml
        soup = BeautifulSoup(data, 'lxml')
        for s in soup.find_all('sentence'):
            # don't get corefs: real sentences live under <sentences>
            if s.parent.name != 'sentences':
                continue
            tokens = s.find_all('token')
            start = int(tokens[0].find_all('characteroffsetbegin', limit=1)[0].text)
            end = int(tokens[-1].find_all('characteroffsetend', limit=1)[0].text)
            # which line of the raw text does this sentence start on?
            # sever at start of match and count newlines
            line_index = stripped_txtdata[:start].count('\n')
            # look up the same line in the id'd version
            with_id = id_lines[line_index]
            split_line = with_id.split(': ', 1)
            if len(split_line) > 1:
                speakerid = split_line[0]
            else:
                speakerid = 'UNIDENTIFIED'
            new_tag = soup.new_tag("speakername")
            s.append(new_tag)
            new_tag.string = speakerid
        # write the modified xml back out
        html = str(soup.root)
        with open(f, "wb") as fopen:
            fopen.write(html.encode('utf-8'))
    if note:
        note.progvar.set(100)
def add_ids_to_xml(corpuspath, root=False, note=False):
    """Add <speakername> elements to the xml in corpuspath.

    Needs the raw files to be in the same dir as corpuspath, without
    '-parsed' in the dir name; also needs the id files to be in the dir,
    with '-parsed' changed to -cleaned.

    :param corpuspath: directory containing the parsed .xml files
    :param root: optional Tk root, updated to keep a GUI responsive
    :param note: optional progress holder with a ``.progvar`` attribute
    """
    from itertools import islice
    from time import strftime, localtime
    from lxml import etree as ET
    # was missing: both other versions of this function import it from build
    from build import get_filepaths
    files = get_filepaths(corpuspath, ext='xml')
    if note:
        note.progvar.set(0)
    thetime = strftime("%H:%M:%S", localtime())
    print('%s: Processing speaker IDs ...' % thetime)
    if root:
        root.update()
    for i, f in enumerate(files):
        if note:
            note.progvar.set(i * 100.0 / len(files))
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Processing speaker IDs (%d/%d)' % (thetime, i, len(files)))
        if root:
            root.update()
        # quick check of the first 1000 lines for speakernames already existing
        with open(f, 'r') as xmlf:
            head = list(islice(xmlf, 1000))
        if '<speakername>' in '\n'.join(head):
            continue
        tree = ET.parse(f)
        xmlroot = tree.getroot()
        # root > document > sentences: iterating this skips the corefs
        sents = xmlroot[0][0]
        # the unparsed version of the file, read into memory
        stripped_txtfile = f.replace('.xml', '').replace('-parsed', '')
        with open(stripped_txtfile, 'r') as old_txt:
            stripped_txtdata = old_txt.read()
        # the unparsed version with speaker ids
        id_txtfile = f.replace('.xml', '').replace('-stripped-parsed', '')
        with open(id_txtfile, 'r') as idttxt:
            id_lines = idttxt.read().splitlines()
        for s in sents:
            tokens = [x for x in s.iter('token')]
            # children [2]/[3] are read as the character offset begin/end
            # elements -- presumably CoreNLP's token layout; confirm if the
            # parser version changes
            start = int(tokens[0][2].text)
            end = int(tokens[-1][3].text)
            # which line of the raw text does this sentence start on?
            # sever at start of match and count newlines
            line_index = stripped_txtdata[:start].count('\n')
            # look up the same line in the id'd version
            with_id = id_lines[line_index]
            split_line = with_id.split(': ', 1)
            if len(split_line) > 1:
                speakerid = split_line[0]
            else:
                speakerid = 'UNIDENTIFIED'
            newtag = ET.Element('speakername')
            newtag.text = speakerid
            newtag.tail = '\n '
            s.append(newtag)
        # make changes in place
        tree.write(f, pretty_print=True)
    if note:
        note.progvar.set(100)
def add_ids_to_xml(corpuspath, root=False, note=False):
    """Add <speakername> ids to the xml in corpuspath.

    Needs the raw files to be in the same dir as corpuspath, without
    '-parsed' in the dir name; also needs the id files to be in the dir,
    with '-parsed' changed to -cleaned.

    :param corpuspath: directory containing the parsed .xml files
    :param root: optional Tk root, updated to keep a GUI responsive
    :param note: optional progress holder with a ``.progvar`` attribute
    """
    from bs4 import BeautifulSoup
    from build import get_filepaths
    from time import strftime, localtime
    files = get_filepaths(corpuspath, ext='xml')
    if note:
        note.progvar.set(0)
    thetime = strftime("%H:%M:%S", localtime())
    print('%s: Processing speaker IDs ...' % thetime)
    if root:
        root.update()
    for i, f in enumerate(files):
        if note:
            note.progvar.set(i * 100.0 / len(files))
        thetime = strftime("%H:%M:%S", localtime())
        print('%s: Processing speaker IDs (%d/%d)' % (thetime, i, len(files)))
        if root:
            root.update()
        # context managers close the handles even when parsing raises
        with open(f) as xmlf:
            data = xmlf.read()
        # the unparsed version of the file, read into memory
        stripped_txtfile = f.replace('.xml', '').replace('-parsed', '')
        with open(stripped_txtfile) as old_txt:
            stripped_txtdata = old_txt.read()
        # the unparsed version with speaker ids
        id_txtfile = f.replace('.xml', '').replace('-stripped-parsed', '')
        with open(id_txtfile) as idttxt:
            id_lines = idttxt.read().splitlines()
        # todo: do this with lxml
        soup = BeautifulSoup(data, 'lxml')
        for s in soup.find_all('sentence'):
            # don't get corefs: only sentences under <sentences> count
            if s.parent.name != 'sentences':
                continue
            tokens = s.find_all('token')
            start = int(tokens[0].find_all('characteroffsetbegin', limit=1)[0].text)
            end = int(tokens[-1].find_all('characteroffsetend', limit=1)[0].text)
            # which line of the raw text does this sentence start on?
            # sever at start of match and count newlines
            line_index = stripped_txtdata[:start].count('\n')
            # look up the same line in the id'd version
            with_id = id_lines[line_index]
            split_line = with_id.split(': ', 1)
            if len(split_line) > 1:
                speakerid = split_line[0]
            else:
                speakerid = 'UNIDENTIFIED'
            new_tag = soup.new_tag("speakername")
            s.append(new_tag)
            new_tag.string = speakerid
        # write the modified xml back out
        html = str(soup.root)
        with open(f, "wb") as fopen:
            fopen.write(html.encode('utf-8'))
    if note:
        note.progvar.set(100)