def mktxt(self, type='normalized', lower=False, new_sen_file=None):
    """Write the text of every dialogue act to a sentence file, one per line.

    The build directories are created first (``self.makeDirs()``), then each
    file returned by ``self.getInputXMLs()`` is parsed and the text of each of
    its dialogue acts is written on its own line.

    :param type: dialogue-act type passed to ``DXML.getDialogueActs``.
        (The name shadows the builtin but is kept for caller compatibility.)
    :param lower: if true the text is lower-cased, otherwise upper-cased.
    :param new_sen_file: output path; when ``None`` the file ``sen_file``
        inside ``self.settings['PDT_WORK_DIR']`` is used.

    The output is encoded with ``pdt.PDT_ENCODING``.
    """
    self.logger.info("Making build directories")
    self.makeDirs()
    self.logger.info("Getting text from XMLs")

    if new_sen_file is None:
        sen_path = os.path.join(self.settings['PDT_WORK_DIR'], 'sen_file')
    else:
        sen_path = new_sen_file

    fw = codecs.open(sen_path, 'w', pdt.PDT_ENCODING)
    try:
        for xml_fn in self.getInputXMLs():
            dxml = DXML.readFromFile(xml_fn)
            try:
                for utter in dxml.getDialogueActs(type):
                    for txt, _attrs in utter:
                        if not txt:
                            # Only reported: a (blank) line is still written
                            # below for an empty dialogue act.
                            self.logger.info(
                                "Empty <dialogue_act> in file: %s", xml_fn)
                        if lower:
                            txt = txt.lower()
                        else:
                            txt = txt.upper()
                        fw.write(txt + '\n')
            finally:
                # Release the parsed document even when writing fails.
                dxml.unlink()
    finally:
        fw.close()
def __iter__(self):
    """Yield one dict per dialogue act found in the DXML file ``self.fn``.

    Each yielded dict contains:

    * ``'semantics'``, ``'da_conversational_domain'``, ``'da_speech_act'`` --
      taken from the attributes of the act in the first dialogue type
      (empty string when the attribute is missing),
    * ``'speaker'`` -- from the utterance attributes (``'unknown'`` if absent),
    * ``'id'`` -- ``"<file base name>_<5-digit index>"`` where the index
      counts dialogue acts across the whole file,
    * ``'fn'`` -- ``self.fn``,
    * ``'ne_source'`` -- text of the act read with type ``self.ne_type`` and
      ``removeNE=False``,
    * one key per type returned by ``dxml.getDialogueTypes()`` holding the
      whitespace-split text of the act in that type.

    The parsed document is released with ``dxml.unlink()`` when iteration
    ends for any reason (exhaustion, an exception, or the generator being
    closed early by the consumer).
    """
    dxml = DXML.readFromFile(self.fn)
    try:
        types = dxml.getDialogueTypes()
        utters = zip(dxml.getUtterances(),
                     dxml.getDialogueActs(self.ne_type, removeNE=False),
                     *[dxml.getDialogueActs(t) for t in types])
        file_id = os.path.splitext(os.path.basename(self.fn))[0]

        i = 0
        for multi_utter in utters:
            # Element 0 is the utterance attributes; the rest are the
            # per-type dialogue-act lists (NE-typed list first).
            utter_attrs = multi_utter[0]
            per_type_acts = multi_utter[1:]
            for multi_da in zip(*per_type_acts):
                ne_typed = multi_da[0][0]
                typed_das = multi_da[1:]
                attrs = typed_das[0][1]
                txts = [da[0] for da in typed_das]

                da_dict = {}
                da_dict['semantics'] = attrs.get('semantics', '')
                da_dict['da_conversational_domain'] = attrs.get(
                    'conversational_domain', '')
                da_dict['da_speech_act'] = attrs.get('speech_act', '')
                da_dict['speaker'] = utter_attrs.get('speaker', 'unknown')
                da_dict['id'] = "%s_%.5d" % (file_id, i)
                da_dict['fn'] = self.fn
                da_dict['ne_source'] = ne_typed
                for t, txt in zip(types, txts):
                    da_dict[t] = txt.split()

                yield da_dict
                i += 1
    finally:
        # Runs on GeneratorExit too, so an abandoned iteration no longer
        # leaves the document un-released.
        dxml.unlink()
def readSemantics(files, data_sets, parseType='LR', default_data_set='normalized'):
    """Generate, for every dialogue act in ``files``, a list of Semantics.

    :param files: DXML file names to read.
    :param data_sets: data-set names; each is looked up in ``DATASET_TYPES``
        to obtain the dialogue-act type to read.
    :param parseType: passed through to ``semantics.Semantics``.
    :param default_data_set: type used when ``DATASET_TYPES`` maps a data set
        to ``None``.
    :raises ValueError: when the number of semantics and the number of
        dialogue acts of any requested data set differ within one file.

    Each yielded list holds one ``semantics.Semantics`` per entry of
    ``data_sets``, all sharing the id ``"<file base name>_<5-digit index>"``.
    """
    from svc.ui.dxml import DXML

    # Resolve each data-set name to the dialogue-act type that is read.
    type_names = []
    for set_name in data_sets:
        mapped = DATASET_TYPES[set_name]
        type_names.append(default_data_set if mapped is None else mapped)

    for fn in files:
        dxml = DXML.readFromFile(fn)
        # NOTE(review): dxml is not unlinked here, unlike some other readers
        # in this file -- confirm whether that is intentional.

        # Column 0: the flattened semantics; columns 1..n: the texts of each
        # requested data set.
        columns = [[sem for da in dxml.getSemantics() for sem in da]]
        for set_name, type_name in zip(data_sets, type_names):
            typed_acts = dxml.getDialogueActs(type_name)
            columns.append(list(getDialogueActs(typed_acts, set_name)))

        sizes = [len(column) for column in columns]
        if min(sizes) != max(sizes):
            raise ValueError("Bad count of <parametrized_act>s in file %r" % fn)

        file_id = os.path.splitext(os.path.basename(fn))[0]
        for idx, row in enumerate(zip(*columns)):
            da_id = "%s_%.5d" % (file_id, idx)
            da_semantics = row[0]
            yield [
                semantics.Semantics(da_id, da_semantics, txt, parseType)
                for txt in row[1:]
            ]
def mlfparametrize(self, mlf, inDir=None, outDir=None, source_type='normalized', target_type='decoded', use_empty=False):
    """Parametrize the input XMLs with text taken from an MLF file.

    A temporary sentence file is built with one entry per dialogue act and is
    then handed to ``self.parametrize(target_type, <temp file>)``.  For the
    dialogue act number ``idx`` of file ``<name>.xml`` the key
    ``"*/<name>_<idx as 5 digits>"`` is looked up in the MLF:

    * key present -- the MLF entry is written, its items joined by newlines;
    * key absent and ``use_empty`` false -- the ``source_type`` text is written;
    * key absent and ``use_empty`` true -- an empty line is written.

    :param mlf: path of the MLF file, read with ``MLF.readFromFile``.
    :param inDir: forwarded to ``self.setupDirs``.
    :param outDir: forwarded to ``self.setupDirs``.
    :param source_type: dialogue-act type used as fallback text.
    :param target_type: type passed to ``self.parametrize``.
    :param use_empty: write an empty line instead of the source text for
        dialogue acts missing from the MLF.

    The temporary file is removed even when a step fails.
    """
    self.setupDirs(inDir, outDir)

    fd, new_sen_file = tempfile.mkstemp()
    # mkstemp() returns an already-open OS-level descriptor.  The file is
    # re-opened by name below, so close the descriptor now to avoid leaking it.
    os.close(fd)
    try:
        self.logger.info("Reading MLF")
        mlf = MLF.readFromFile(mlf)
        self.logger.info("Making build directories")
        self.makeDirs()
        self.logger.info("Getting text from XMLs")

        fw = codecs.open(new_sen_file, 'w', 'utf-8')
        try:
            for fn in self.getInputXMLs():
                dxml = DXML.readFromFile(fn)
                try:
                    file_id = os.path.splitext(os.path.basename(fn))[0]
                    idx = 0
                    for utter in dxml.getDialogueActs(source_type):
                        for txt, _attrs in utter:
                            if not txt:
                                self.logger.info(
                                    "Empty <dialogue_act> in file: %s", fn)
                            key = "*/%s_%05d" % (file_id, idx)
                            if key in mlf:
                                # NOTE(review): unlike the fallbacks below, no
                                # trailing newline is appended here -- confirm
                                # that MLF entries already end with one.
                                line = mlf[key]
                            elif not use_empty:
                                line = [txt + '\n']
                            else:
                                line = ['\n']
                            fw.write('\n'.join(line))
                            idx += 1
                finally:
                    # Release the parsed document even when writing fails.
                    dxml.unlink()
        finally:
            fw.close()

        self.parametrize(target_type, new_sen_file)
    finally:
        # Always clean up the temporary sentence file.
        os.remove(new_sen_file)
def _multiParametrize(self, args):
    """Add several parametrized types to every input XML in one pass.

    :param args: sequence of ``(type name, sentence file path)`` pairs.  Each
        sentence file is opened as UTF-8 and read sequentially with
        ``self._readDAFromSenFile`` -- one read per dialogue act, in the
        order of ``self.getInputXMLs()``.

    For each input XML and each pair, the texts and dialogue acts of the new
    type are set on the document (the text of an utterance is the
    space-joined text of its dialogue acts, attributes are copied from the
    original), and the document is written to ``self.outDir`` under its
    original base name.  Types listed in ``LOWER_DATASETS`` are lower-cased.
    """
    self.logger.info("Parametrizing XMLs")
    types = [a[0] for a in args]
    fns = [a[1] for a in args]

    frs = []
    try:
        # Open inside the try block so that a failure on a later file still
        # closes the ones that were opened successfully.
        for sen_fn in fns:
            frs.append(codecs.open(sen_fn, 'r', 'utf-8'))

        for fn in self.getInputXMLs():
            dxml = DXML.readFromFile(fn)
            try:
                txts = dxml.getTexts()
                acts = dxml.getDialogueActs()
                for fr, new_type in zip(frs, types):
                    new_acts = []
                    new_txts = []
                    for utter, (_old_text, attrs_txt) in zip(acts, txts):
                        new_utter_acts = []
                        new_text = []
                        for da_text, attrs_act in utter:
                            # An empty <dialogue_act> is only reported (once,
                            # for the first type); an entry is still read
                            # from the sentence file for it.
                            if not da_text and new_type == types[0]:
                                self.logger.info(
                                    "Empty <dialogue_act> in file: %s", fn)
                            new_txt = self._readDAFromSenFile(fr)
                            if new_type in LOWER_DATASETS:
                                new_txt = new_txt.lower()
                            new_text.append(new_txt)
                            new_utter_acts.append((new_txt, attrs_act))
                        new_txts.append((' '.join(new_text), attrs_txt))
                        new_acts.append(new_utter_acts)
                    dxml.setTexts(new_type, new_txts)
                    dxml.setDialogueActs(new_type, new_acts)
                fn_base = os.path.basename(fn)
                dxml.writeToFile(os.path.join(self.outDir, fn_base))
            finally:
                # Release the parsed document even when processing fails.
                dxml.unlink()
    finally:
        for fr in frs:
            fr.close()
def _multiParametrize(self, args):
    """Add several parametrized types to every input XML in one pass.

    :param args: sequence of ``(type name, sentence file path)`` pairs.  Each
        sentence file is opened as UTF-8 and read sequentially with
        ``self._readDAFromSenFile`` -- one read per dialogue act, in the
        order of ``self.getInputXMLs()``.

    For each input XML and each pair, the texts and dialogue acts of the new
    type are set on the document (the text of an utterance is the
    space-joined text of its dialogue acts, attributes are copied from the
    original), and the document is written to ``self.outDir`` under its
    original base name.  Types listed in ``LOWER_DATASETS`` are lower-cased.
    """
    self.logger.info("Parametrizing XMLs")
    types = [a[0] for a in args]
    fns = [a[1] for a in args]

    frs = []
    try:
        # Open inside the try block so that a failure on a later file still
        # closes the ones that were opened successfully.
        for sen_fn in fns:
            frs.append(codecs.open(sen_fn, 'r', 'utf-8'))

        for fn in self.getInputXMLs():
            dxml = DXML.readFromFile(fn)
            try:
                txts = dxml.getTexts()
                acts = dxml.getDialogueActs()
                for fr, new_type in zip(frs, types):
                    new_acts = []
                    new_txts = []
                    for utter, (_old_text, attrs_txt) in zip(acts, txts):
                        new_utter_acts = []
                        new_text = []
                        for da_text, attrs_act in utter:
                            # An empty <dialogue_act> is only reported (once,
                            # for the first type); an entry is still read
                            # from the sentence file for it.
                            if not da_text and new_type == types[0]:
                                self.logger.info(
                                    "Empty <dialogue_act> in file: %s", fn)
                            new_txt = self._readDAFromSenFile(fr)
                            if new_type in LOWER_DATASETS:
                                new_txt = new_txt.lower()
                            new_text.append(new_txt)
                            new_utter_acts.append((new_txt, attrs_act))
                        new_txts.append((' '.join(new_text), attrs_txt))
                        new_acts.append(new_utter_acts)
                    dxml.setTexts(new_type, new_txts)
                    dxml.setDialogueActs(new_type, new_acts)
                fn_base = os.path.basename(fn)
                dxml.writeToFile(os.path.join(self.outDir, fn_base))
            finally:
                # Release the parsed document even when processing fails.
                dxml.unlink()
    finally:
        for fr in frs:
            fr.close()
def readSemantics(files, data_sets, parseType='LR', default_data_set='normalized'):
    """Generate, for every dialogue act in ``files``, a list of Semantics.

    :param files: DXML file names to read.
    :param data_sets: data-set names; each is looked up in ``DATASET_TYPES``
        to obtain the dialogue-act type to read.
    :param parseType: passed through to ``semantics.Semantics``.
    :param default_data_set: type used when ``DATASET_TYPES`` maps a data set
        to ``None``.
    :raises ValueError: when the number of semantics and the number of
        dialogue acts of any requested data set differ within one file.

    Each yielded list holds one ``semantics.Semantics`` per entry of
    ``data_sets``, all sharing the id ``"<file base name>_<5-digit index>"``.
    """
    from svc.ui.dxml import DXML

    # Resolve each data-set name to the dialogue-act type that is read.
    type_names = []
    for set_name in data_sets:
        mapped = DATASET_TYPES[set_name]
        type_names.append(default_data_set if mapped is None else mapped)

    for fn in files:
        dxml = DXML.readFromFile(fn)
        # NOTE(review): dxml is not unlinked here, unlike some other readers
        # in this file -- confirm whether that is intentional.

        # Column 0: the flattened semantics; columns 1..n: the texts of each
        # requested data set.
        columns = [[sem for da in dxml.getSemantics() for sem in da]]
        for set_name, type_name in zip(data_sets, type_names):
            typed_acts = dxml.getDialogueActs(type_name)
            columns.append(list(getDialogueActs(typed_acts, set_name)))

        sizes = [len(column) for column in columns]
        if min(sizes) != max(sizes):
            raise ValueError("Bad count of <parametrized_act>s in file %r" % fn)

        file_id = os.path.splitext(os.path.basename(fn))[0]
        for idx, row in enumerate(zip(*columns)):
            da_id = "%s_%.5d" % (file_id, idx)
            da_semantics = row[0]
            yield [
                semantics.Semantics(da_id, da_semantics, txt, parseType)
                for txt in row[1:]
            ]
def mlfparametrize(self, mlf, inDir=None, outDir=None, source_type='normalized', target_type='decoded', use_empty=False):
    """Parametrize the input XMLs with text taken from an MLF file.

    A temporary sentence file is built with one entry per dialogue act and is
    then handed to ``self.parametrize(target_type, <temp file>)``.  For the
    dialogue act number ``idx`` of file ``<name>.xml`` the key
    ``"*/<name>_<idx as 5 digits>"`` is looked up in the MLF:

    * key present -- the MLF entry is written, its items joined by newlines;
    * key absent and ``use_empty`` false -- the ``source_type`` text is written;
    * key absent and ``use_empty`` true -- an empty line is written.

    :param mlf: path of the MLF file, read with ``MLF.readFromFile``.
    :param inDir: forwarded to ``self.setupDirs``.
    :param outDir: forwarded to ``self.setupDirs``.
    :param source_type: dialogue-act type used as fallback text.
    :param target_type: type passed to ``self.parametrize``.
    :param use_empty: write an empty line instead of the source text for
        dialogue acts missing from the MLF.

    The temporary file is removed even when a step fails.
    """
    self.setupDirs(inDir, outDir)

    fd, new_sen_file = tempfile.mkstemp()
    # mkstemp() returns an already-open OS-level descriptor.  The file is
    # re-opened by name below, so close the descriptor now to avoid leaking it.
    os.close(fd)
    try:
        self.logger.info("Reading MLF")
        mlf = MLF.readFromFile(mlf)
        self.logger.info("Making build directories")
        self.makeDirs()
        self.logger.info("Getting text from XMLs")

        fw = codecs.open(new_sen_file, 'w', 'utf-8')
        try:
            for fn in self.getInputXMLs():
                dxml = DXML.readFromFile(fn)
                try:
                    file_id = os.path.splitext(os.path.basename(fn))[0]
                    idx = 0
                    for utter in dxml.getDialogueActs(source_type):
                        for txt, _attrs in utter:
                            if not txt:
                                self.logger.info(
                                    "Empty <dialogue_act> in file: %s", fn)
                            key = "*/%s_%05d" % (file_id, idx)
                            if key in mlf:
                                # NOTE(review): unlike the fallbacks below, no
                                # trailing newline is appended here -- confirm
                                # that MLF entries already end with one.
                                line = mlf[key]
                            elif not use_empty:
                                line = [txt + '\n']
                            else:
                                line = ['\n']
                            fw.write('\n'.join(line))
                            idx += 1
                finally:
                    # Release the parsed document even when writing fails.
                    dxml.unlink()
        finally:
            fw.close()

        self.parametrize(target_type, new_sen_file)
    finally:
        # Always clean up the temporary sentence file.
        os.remove(new_sen_file)
def mktxt(self, type='normalized', lower=False, new_sen_file=None):
    """Write the text of every dialogue act to a sentence file, one per line.

    The build directories are created first (``self.makeDirs()``), then each
    file returned by ``self.getInputXMLs()`` is parsed and the text of each of
    its dialogue acts is written on its own line.

    :param type: dialogue-act type passed to ``DXML.getDialogueActs``.
        (The name shadows the builtin but is kept for caller compatibility.)
    :param lower: if true the text is lower-cased, otherwise upper-cased.
    :param new_sen_file: output path; when ``None`` the file ``sen_file``
        inside ``self.settings['PDT_WORK_DIR']`` is used.

    The output is encoded with ``pdt.PDT_ENCODING``.
    """
    self.logger.info("Making build directories")
    self.makeDirs()
    self.logger.info("Getting text from XMLs")

    if new_sen_file is None:
        sen_path = os.path.join(self.settings['PDT_WORK_DIR'], 'sen_file')
    else:
        sen_path = new_sen_file

    fw = codecs.open(sen_path, 'w', pdt.PDT_ENCODING)
    try:
        for xml_fn in self.getInputXMLs():
            dxml = DXML.readFromFile(xml_fn)
            try:
                for utter in dxml.getDialogueActs(type):
                    for txt, _attrs in utter:
                        if not txt:
                            # Only reported: a (blank) line is still written
                            # below for an empty dialogue act.
                            self.logger.info(
                                "Empty <dialogue_act> in file: %s", xml_fn)
                        if lower:
                            txt = txt.lower()
                        else:
                            txt = txt.upper()
                        fw.write(txt + '\n')
            finally:
                # Release the parsed document even when writing fails.
                dxml.unlink()
    finally:
        fw.close()