コード例 #1
0
 def mktxt(self, type='normalized', lower=False, new_sen_file=None):
     """Write the text of every dialogue act of the input XMLs to a file.

     For each file returned by ``self.getInputXMLs()`` the dialogue acts of
     the requested type are read and the text of each act is written,
     followed by a newline. Empty acts are logged and written as an empty
     line.

     :param type: dialogue-act type passed to ``getDialogueActs``.
     :param lower: write the text lower-cased when true, upper-cased
         otherwise.
     :param new_sen_file: path of the output file; when ``None`` the file
         ``sen_file`` inside ``self.settings['PDT_WORK_DIR']`` is used.
     """
     self.logger.info("Making build directories")
     self.makeDirs()
     self.logger.info("Getting text from XMLs")
     # Keep the output path in its own name; ``fn`` below is the input XML.
     if new_sen_file is None:
         sen_path = os.path.join(self.settings['PDT_WORK_DIR'], 'sen_file')
     else:
         sen_path = new_sen_file
     fw = codecs.open(sen_path, 'w', pdt.PDT_ENCODING)
     try:
         for fn in self.getInputXMLs():
             dxml = DXML.readFromFile(fn)
             try:
                 for utter in dxml.getDialogueActs(type):
                     for txt, attrs in utter:
                         if not txt:
                             self.logger.info(
                                 "Empty <dialogue_act> in file: %s", fn)
                             # Normalise a missing text to '' so the case
                             # conversion below cannot fail and an empty
                             # line is still emitted for this act.
                             txt = ''
                         if lower:
                             txt = txt.lower()
                         else:
                             txt = txt.upper()
                         fw.write(txt + '\n')
             finally:
                 # Release the parsed document, as the other DXML users
                 # in this file do, even when processing fails.
                 dxml.unlink()
     finally:
         fw.close()
コード例 #2
0
    def __iter__(self):
        """Yield one dict per dialogue act found in ``self.fn``.

        Every yielded dict contains:

        * ``'semantics'``, ``'da_conversational_domain'``,
          ``'da_speech_act'`` -- taken from the attributes of the act of
          the first dialogue type (``''`` when missing);
        * ``'speaker'`` -- from the utterance attributes (``'unknown'``
          when missing);
        * ``'id'`` -- ``<file base name>_<zero-padded running index>``;
        * ``'fn'`` -- ``self.fn``;
        * ``'ne_source'`` -- text of the act obtained with
          ``getDialogueActs(self.ne_type, removeNE=False)``;
        * one key per dialogue type holding the whitespace-split text of
          the act in that type.
        """
        dxml = DXML.readFromFile(self.fn)
        try:
            types = dxml.getDialogueTypes()
            utters = zip(dxml.getUtterances(),
                         dxml.getDialogueActs(self.ne_type, removeNE=False),
                         *[dxml.getDialogueActs(t) for t in types])
            file_id = os.path.splitext(os.path.basename(self.fn))[0]

            i = 0
            for multi_utter in utters:
                # Element 0 holds the utterance attributes; the remaining
                # elements are the acts of the utterance, one sequence per
                # requested variant (NE-typed first, then each type).
                utter_attrs = multi_utter[0]
                for multi_da in zip(*multi_utter[1:]):
                    ne_typed = multi_da[0][0]
                    typed_das = multi_da[1:]
                    # NOTE(review): raises IndexError when ``types`` is
                    # empty, exactly as the original code did.
                    attrs = typed_das[0][1]

                    da_dict = {}
                    da_dict['semantics'] = attrs.get('semantics', '')
                    da_dict['da_conversational_domain'] = attrs.get(
                        'conversational_domain', '')
                    da_dict['da_speech_act'] = attrs.get('speech_act', '')
                    da_dict['speaker'] = utter_attrs.get('speaker', 'unknown')
                    da_dict['id'] = "%s_%.5d" % (file_id, i)
                    da_dict['fn'] = self.fn
                    da_dict['ne_source'] = ne_typed

                    for t, da in zip(types, typed_das):
                        da_dict[t] = da[0].split()
                    yield da_dict
                    i += 1
        finally:
            # Runs also when the consumer stops iterating early or an error
            # occurs, so the parsed document is always released.
            dxml.unlink()
コード例 #3
0
def readSemantics(files,
                  data_sets,
                  parseType='LR',
                  default_data_set='normalized'):
    """Yield, per dialogue act in *files*, a list of ``Semantics`` objects.

    For each file the semantics of all acts are collected together with the
    act texts of every requested data set. The yielded list contains one
    ``semantics.Semantics`` per data set, all sharing the same id
    ``<file base name>_<zero-padded index>`` and the same semantics.

    :param files: iterable of input file names.
    :param data_sets: data set names; each is looked up in
        ``DATASET_TYPES`` to obtain the dialogue-act type to read.
    :param parseType: forwarded to ``semantics.Semantics``.
    :param default_data_set: type used when ``DATASET_TYPES`` maps a data
        set to ``None``.
    :raises ValueError: when the number of semantics and the number of acts
        of the individual data sets differ within a file.
    :raises KeyError: when a data set name is not in ``DATASET_TYPES``.
    """
    from svc.ui.dxml import DXML

    types = []
    for d in data_sets:
        d = DATASET_TYPES[d]
        if d is None:
            d = default_data_set
        types.append(d)

    for fn in files:
        dxml = DXML.readFromFile(fn)
        try:
            # Flatten the per-utterance semantics into one list per file.
            sem_list = []
            for da in dxml.getSemantics():
                sem_list.extend(da)

            # Column 0 holds the semantics, the following columns hold the
            # act texts of each requested data set.
            to_zip = [sem_list]
            for set_name, type_name in zip(data_sets, types):
                acts = dxml.getDialogueActs(type_name)
                to_zip.append(list(getDialogueActs(acts, set_name)))

            lengths = [len(column) for column in to_zip]
            if min(lengths) != max(lengths):
                raise ValueError(
                    "Bad count of <parametrized_act>s in file %r" % fn)

            file_id = os.path.splitext(os.path.basename(fn))[0]

            for i, item in enumerate(zip(*to_zip)):
                da_id = "%s_%.5d" % (file_id, i)
                da_semantics = item[0]
                da_txts = item[1:]

                yield [
                    semantics.Semantics(da_id, da_semantics, txt, parseType)
                    for txt in da_txts
                ]
        finally:
            # Release the parsed document once the file is done (or on
            # error / early exit), as the other DXML users in this file do.
            dxml.unlink()
コード例 #4
0
    def mlfparametrize(self,
                       mlf,
                       inDir=None,
                       outDir=None,
                       source_type='normalized',
                       target_type='decoded',
                       use_empty=False):
        """Parametrize the input XMLs with texts taken from an MLF file.

        A temporary sentence file is built with one entry per dialogue act:
        the MLF entry stored under ``*/<file base name>_<5-digit index>``
        when present, otherwise the act's own text (or an empty line when
        *use_empty* is true). The file is then handed to
        ``self.parametrize`` and removed afterwards.

        :param mlf: argument for ``MLF.readFromFile``.
        :param inDir: forwarded to ``self.setupDirs``.
        :param outDir: forwarded to ``self.setupDirs``.
        :param source_type: dialogue-act type read from the XMLs.
        :param target_type: forwarded to ``self.parametrize``.
        :param use_empty: write an empty line instead of the source text
            for acts that have no MLF entry.
        """
        self.setupDirs(inDir, outDir)
        fd, new_sen_file = tempfile.mkstemp()
        try:
            # mkstemp() returns an already open OS-level descriptor. The
            # file is reopened by name below, so close the descriptor now
            # instead of leaking it.
            os.close(fd)
            self.logger.info("Reading MLF")
            mlf = MLF.readFromFile(mlf)
            self.logger.info("Making build directories")
            self.makeDirs()
            self.logger.info("Getting text from XMLs")
            fw = codecs.open(new_sen_file, 'w', 'utf-8')
            try:
                for fn in self.getInputXMLs():
                    file_id = os.path.splitext(os.path.basename(fn))[0]
                    dxml = DXML.readFromFile(fn)
                    try:
                        idx = 0
                        for utter in dxml.getDialogueActs(source_type):
                            for txt, attrs in utter:
                                if not txt:
                                    self.logger.info(
                                        "Empty <dialogue_act> in file: %s",
                                        fn)
                                    # Normalise a missing text to '' so the
                                    # concatenation below cannot fail.
                                    txt = ''
                                key = "*/%s_%05d" % (file_id, idx)
                                if key in mlf:
                                    # NOTE(review): unlike the fallback
                                    # branches, no trailing newline is
                                    # appended here -- confirm that MLF
                                    # entries already provide one.
                                    line = mlf[key]
                                elif not use_empty:
                                    line = [txt + '\n']
                                else:
                                    line = ['\n']
                                fw.write('\n'.join(line))
                                idx += 1
                    finally:
                        # Release the parsed document even on failure.
                        dxml.unlink()
            finally:
                fw.close()

            self.parametrize(target_type, new_sen_file)
        finally:
            # Always delete the temporary file, also when an error occurred.
            os.remove(new_sen_file)
コード例 #5
0
    def _multiParametrize(self, args):
        """Store new texts of several types into every input XML.

        For each input XML and each ``(type, sentence file)`` pair, one
        replacement text per dialogue act is read with
        ``self._readDAFromSenFile`` from that pair's file. Texts of types
        listed in ``LOWER_DATASETS`` are lower-cased. The new acts and the
        space-joined utterance texts are stored under the given type and
        the document is written to ``self.outDir`` under its base name.

        :param args: sequence of ``(type, sentence_file_path)`` pairs; the
            files are opened as UTF-8.
        """
        self.logger.info("Parametrizing XMLs")
        types = [a[0] for a in args]
        fns = [a[1] for a in args]
        frs = []
        try:
            # Open inside the try block so that readers opened before a
            # failing open() are still closed by the finally clause.
            for sen_fn in fns:
                frs.append(codecs.open(sen_fn, 'r', 'utf-8'))

            for fn in self.getInputXMLs():
                dxml = DXML.readFromFile(fn)
                try:
                    txts = dxml.getTexts()
                    acts = dxml.getDialogueActs()
                    for pos, (fr, new_type) in enumerate(zip(frs, types)):
                        new_acts = []
                        new_txts = []
                        for utter, (foo1, attrs_txt) in zip(acts, txts):
                            new_utter_acts = []
                            new_text = []
                            for da_text, attrs_act in utter:
                                # Empty acts are only reported; their entry
                                # is still read from the sentence file.
                                # Warn during the first pass only, so each
                                # act is reported once even when the same
                                # type is listed twice.
                                if not da_text and pos == 0:
                                    self.logger.info(
                                        "Empty <dialogue_act> in file: %s",
                                        fn)
                                new_txt = self._readDAFromSenFile(fr)
                                if new_type in LOWER_DATASETS:
                                    new_txt = new_txt.lower()
                                new_text.append(new_txt)
                                new_utter_acts.append((new_txt, attrs_act))
                            new_txts.append((' '.join(new_text), attrs_txt))
                            new_acts.append(new_utter_acts)

                        dxml.setTexts(new_type, new_txts)
                        dxml.setDialogueActs(new_type, new_acts)

                    fn_base = os.path.basename(fn)
                    dxml.writeToFile(os.path.join(self.outDir, fn_base))
                finally:
                    # Release the parsed document even when processing or
                    # writing fails.
                    dxml.unlink()
        finally:
            for fr in frs:
                fr.close()
コード例 #6
0
    def _multiParametrize(self, args):
        """Store new texts of several types into every input XML.

        For each input XML and each ``(type, sentence file)`` pair, one
        replacement text per dialogue act is read with
        ``self._readDAFromSenFile`` from that pair's file. Texts of types
        listed in ``LOWER_DATASETS`` are lower-cased. The new acts and the
        space-joined utterance texts are stored under the given type and
        the document is written to ``self.outDir`` under its base name.

        :param args: sequence of ``(type, sentence_file_path)`` pairs; the
            files are opened as UTF-8.
        """
        self.logger.info("Parametrizing XMLs")
        types = [a[0] for a in args]
        fns = [a[1] for a in args]
        frs = []
        try:
            # Open inside the try block so that readers opened before a
            # failing open() are still closed by the finally clause.
            for sen_fn in fns:
                frs.append(codecs.open(sen_fn, 'r', 'utf-8'))

            for fn in self.getInputXMLs():
                dxml = DXML.readFromFile(fn)
                try:
                    txts = dxml.getTexts()
                    acts = dxml.getDialogueActs()
                    for pos, (fr, new_type) in enumerate(zip(frs, types)):
                        new_acts = []
                        new_txts = []
                        for utter, (foo1, attrs_txt) in zip(acts, txts):
                            new_utter_acts = []
                            new_text = []
                            for da_text, attrs_act in utter:
                                # Empty acts are only reported; their entry
                                # is still read from the sentence file.
                                # Warn during the first pass only, so each
                                # act is reported once even when the same
                                # type is listed twice.
                                if not da_text and pos == 0:
                                    self.logger.info("Empty <dialogue_act> in file: %s", fn)
                                new_txt = self._readDAFromSenFile(fr)
                                if new_type in LOWER_DATASETS:
                                    new_txt = new_txt.lower()
                                new_text.append(new_txt)
                                new_utter_acts.append((new_txt, attrs_act))
                            new_txts.append((' '.join(new_text), attrs_txt))
                            new_acts.append(new_utter_acts)

                        dxml.setTexts(new_type, new_txts)
                        dxml.setDialogueActs(new_type, new_acts)

                    fn_base = os.path.basename(fn)
                    dxml.writeToFile(os.path.join(self.outDir, fn_base))
                finally:
                    # Release the parsed document even when processing or
                    # writing fails.
                    dxml.unlink()
        finally:
            for fr in frs:
                fr.close()
コード例 #7
0
def readSemantics(files, data_sets, parseType='LR', default_data_set='normalized'):
    """Yield, per dialogue act in *files*, a list of ``Semantics`` objects.

    For each file the semantics of all acts are collected together with the
    act texts of every requested data set. The yielded list contains one
    ``semantics.Semantics`` per data set, all sharing the same id
    ``<file base name>_<zero-padded index>`` and the same semantics.

    :param files: iterable of input file names.
    :param data_sets: data set names; each is looked up in
        ``DATASET_TYPES`` to obtain the dialogue-act type to read.
    :param parseType: forwarded to ``semantics.Semantics``.
    :param default_data_set: type used when ``DATASET_TYPES`` maps a data
        set to ``None``.
    :raises ValueError: when the number of semantics and the number of acts
        of the individual data sets differ within a file.
    :raises KeyError: when a data set name is not in ``DATASET_TYPES``.
    """
    from svc.ui.dxml import DXML

    types = []
    for d in data_sets:
        d = DATASET_TYPES[d]
        if d is None:
            d = default_data_set
        types.append(d)

    for fn in files:
        dxml = DXML.readFromFile(fn)
        try:
            # Flatten the per-utterance semantics into one list per file.
            sem_list = []
            for da in dxml.getSemantics():
                sem_list.extend(da)

            # Column 0 holds the semantics, the following columns hold the
            # act texts of each requested data set.
            to_zip = [sem_list]
            for set_name, type_name in zip(data_sets, types):
                acts = dxml.getDialogueActs(type_name)
                to_zip.append(list(getDialogueActs(acts, set_name)))

            lengths = [len(column) for column in to_zip]
            if min(lengths) != max(lengths):
                raise ValueError("Bad count of <parametrized_act>s in file %r" % fn)

            file_id = os.path.splitext(os.path.basename(fn))[0]

            for i, item in enumerate(zip(*to_zip)):
                da_id = "%s_%.5d" % (file_id, i)
                da_semantics = item[0]
                da_txts = item[1:]

                yield [semantics.Semantics(da_id, da_semantics, txt, parseType) for txt in da_txts]
        finally:
            # Release the parsed document once the file is done (or on
            # error / early exit), as the other DXML users in this file do.
            dxml.unlink()
コード例 #8
0
    def mlfparametrize(self, mlf, inDir=None, outDir=None,
            source_type='normalized', target_type='decoded', use_empty=False):
        """Parametrize the input XMLs with texts taken from an MLF file.

        A temporary sentence file is built with one entry per dialogue act:
        the MLF entry stored under ``*/<file base name>_<5-digit index>``
        when present, otherwise the act's own text (or an empty line when
        *use_empty* is true). The file is then handed to
        ``self.parametrize`` and removed afterwards.

        :param mlf: argument for ``MLF.readFromFile``.
        :param inDir: forwarded to ``self.setupDirs``.
        :param outDir: forwarded to ``self.setupDirs``.
        :param source_type: dialogue-act type read from the XMLs.
        :param target_type: forwarded to ``self.parametrize``.
        :param use_empty: write an empty line instead of the source text
            for acts that have no MLF entry.
        """
        self.setupDirs(inDir, outDir)
        fd, new_sen_file = tempfile.mkstemp()
        try:
            # mkstemp() returns an already open OS-level descriptor. The
            # file is reopened by name below, so close the descriptor now
            # instead of leaking it.
            os.close(fd)
            self.logger.info("Reading MLF")
            mlf = MLF.readFromFile(mlf)
            self.logger.info("Making build directories")
            self.makeDirs()
            self.logger.info("Getting text from XMLs")
            fw = codecs.open(new_sen_file, 'w', 'utf-8')
            try:
                for fn in self.getInputXMLs():
                    file_id = os.path.splitext(os.path.basename(fn))[0]
                    dxml = DXML.readFromFile(fn)
                    try:
                        idx = 0
                        for utter in dxml.getDialogueActs(source_type):
                            for txt, attrs in utter:
                                if not txt:
                                    self.logger.info("Empty <dialogue_act> in file: %s", fn)
                                    # Normalise a missing text to '' so the
                                    # concatenation below cannot fail.
                                    txt = ''
                                key = "*/%s_%05d" % (file_id, idx)
                                if key in mlf:
                                    # NOTE(review): unlike the fallback
                                    # branches, no trailing newline is
                                    # appended here -- confirm that MLF
                                    # entries already provide one.
                                    line = mlf[key]
                                elif not use_empty:
                                    line = [txt + '\n']
                                else:
                                    line = ['\n']
                                fw.write('\n'.join(line))
                                idx += 1
                    finally:
                        # Release the parsed document even on failure.
                        dxml.unlink()
            finally:
                fw.close()

            self.parametrize(target_type, new_sen_file)
        finally:
            # Always delete the temporary file, also when an error occurred.
            os.remove(new_sen_file)
コード例 #9
0
 def mktxt(self, type='normalized', lower=False, new_sen_file=None):
     """Write the text of every dialogue act of the input XMLs to a file.

     For each file returned by ``self.getInputXMLs()`` the dialogue acts of
     the requested type are read and the text of each act is written,
     followed by a newline. Empty acts are logged and written as an empty
     line.

     :param type: dialogue-act type passed to ``getDialogueActs``.
     :param lower: write the text lower-cased when true, upper-cased
         otherwise.
     :param new_sen_file: path of the output file; when ``None`` the file
         ``sen_file`` inside ``self.settings['PDT_WORK_DIR']`` is used.
     """
     self.logger.info("Making build directories")
     self.makeDirs()
     self.logger.info("Getting text from XMLs")
     # Keep the output path in its own name; ``fn`` below is the input XML.
     if new_sen_file is None:
         sen_path = os.path.join(self.settings['PDT_WORK_DIR'], 'sen_file')
     else:
         sen_path = new_sen_file
     fw = codecs.open(sen_path, 'w', pdt.PDT_ENCODING)
     try:
         for fn in self.getInputXMLs():
             dxml = DXML.readFromFile(fn)
             try:
                 for utter in dxml.getDialogueActs(type):
                     for txt, attrs in utter:
                         if not txt:
                             self.logger.info("Empty <dialogue_act> in file: %s", fn)
                             # Normalise a missing text to '' so the case
                             # conversion below cannot fail and an empty
                             # line is still emitted for this act.
                             txt = ''
                         if lower:
                             txt = txt.lower()
                         else:
                             txt = txt.upper()
                         fw.write(txt + '\n')
             finally:
                 # Release the parsed document, as the other DXML users
                 # in this file do, even when processing fails.
                 dxml.unlink()
     finally:
         fw.close()