Code Example #1
    def __extractIntentionsFile(self, fn, data_dir, out_fn):
        self.logger.info('Making intention level from: %s', fn)
        if not os.path.exists(fn):
            self.logger.warning('File does not exist, skipping: %s', fn)
            return
        # Derive one intention-level line per entry in the concept MLF.
        mlf = ConceptMLF.readFromFile(fn)
        fw = MLF()
        processor = NadraziProcessor([data_dir])
        for mlf_fn, tree in mlf.iteritems():
            fw[mlf_fn] = ['%s\n' % processor.process(mlf_fn, tree)]

        # Write the intention MLF next to the source file.
        dirname = os.path.split(fn)[0]
        fw.writeToFile(os.path.join(dirname, out_fn))
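Since the helper is name-mangled (leading double underscore), it is meant to be called from inside its own class. Below is a minimal sketch of such a driver method; the `extractIntentions` name, the `self.work_dir` attribute, and the split layout are all hypothetical, not part of the original code.

    def extractIntentions(self, data_dir, out_fn='intention.mlf'):
        # Hypothetical driver on the same class: run the helper over
        # every concept-level MLF in the working directory's splits.
        for split in ('train', 'heldout', 'test'):
            fn = os.path.join(self.work_dir, split, 'concept.mlf')
            self.__extractIntentionsFile(fn, data_dir, out_fn)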
Code Example #2
    def mlfparametrize(self,
                       mlf,
                       inDir=None,
                       outDir=None,
                       source_type='normalized',
                       target_type='decoded',
                       use_empty=False):
        self.setupDirs(inDir, outDir)
        fd, new_sen_file = tempfile.mkstemp()
        os.close(fd)  # only the path is needed; close the raw descriptor
        self.logger.info("Reading MLF")
        mlf = MLF.readFromFile(mlf)
        self.logger.info("Making build directories")
        self.makeDirs()
        self.logger.info("Getting text from XMLs")
        # mkstemp always returns a path, so write directly to the temp file
        fn = new_sen_file
        fw = codecs.open(fn, 'w', 'utf-8')
        try:
            for xml_fn in self.getInputXMLs():
                dxml = DXML.readFromFile(xml_fn)
                acts = dxml.getDialogueActs(source_type)
                idx = 0
                for utter in acts:
                    for txt, attrs in utter:
                        if not txt:
                            self.logger.info(
                                "Empty <dialogue_act> in file: %s", xml_fn)
                        key = "*/%s_%05d" % (os.path.splitext(
                            os.path.basename(xml_fn))[0], idx)
                        if key in mlf:
                            line = mlf[key]
                        elif not use_empty:
                            # key missing: fall back to the XML transcript
                            line = [txt + '\n']
                        else:
                            # key missing: emit an empty record instead
                            line = ['\n']
                        line = '\n'.join(line)
                        fw.write(line)
                        idx += 1
        finally:
            fw.close()

        self.parametrize(target_type, new_sen_file)
        os.remove(new_sen_file)  # clean up the temporary sentence file
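A minimal sketch of how the method might be driven, assuming a hypothetical `Parametrizer` class that owns it; the MLF path and the directories below are illustrative only.

    # Hypothetical driver: re-parametrize the corpus text from a decoded MLF,
    # falling back to the XML transcripts for utterances missing from it.
    p = Parametrizer()
    p.mlfparametrize('build/decoded.mlf',
                     inDir='data/xml',
                     outDir='build/param',
                     source_type='normalized',
                     target_type='decoded',
                     use_empty=False)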
Code Example #3
    def main(
        self,
        model_dir,
        encoding=None,
        batch=False,
        omit_leaves=False,
        mlf=False,
        xml_dir=None,
        ref_mlf=None,
        skip_empty=False,
        input_chain=None,
        batch_size=100,
        no_underscores=True,
        force_pdt=False,
        pdt_dir=None,
    ):
        if encoding is None:
            encoding = sys.stdout.encoding
        if encoding is None:
            if os.name == "nt":
                encoding = "cp1250"
            else:
                encoding = "iso-8859-2"

        datasets_fn = pjoin(model_dir, "datasets")
        datasets = []
        isymMaps = []
        # One dataset name per line; "off" marks a disabled input stream.
        with open(datasets_fn, "r") as datasets_fr:
            for i, line in enumerate(datasets_fr):
                line = line.strip()
                datasets.append(line)
                if line != "off":
                    isymMaps.append(SymMap.readFromFile(
                        pjoin(model_dir, "isym%d.map" % (i + 1,))))

        osymMap = SymMap.readFromFile(pjoin(model_dir, "osym.map"))

        if "signed" in datasets:
            da_type = "signed"
        else:
            da_type = "normalized"

        if not pdt_dir:
            pdt_dir = "/opt/PDT-2.0/tools/machine-annotation"

        if xml_dir:
            reader = input.MultiReader([xml_dir], input.DXMLReader)
            if force_pdt and "lemma" in datasets or "pos" in datasets:
                if os.name == "nt":
                    raise RuntimeError("Datasets 'lemma' and 'pos' are unsupported on Windows")
                reader = input.PDTReader(pdt_dir, reader, online=not batch)
        else:
            reader = input.StdInReader(encoding=encoding, type=da_type)
            if "lemma" in datasets or "pos" in datasets:
                if os.name == "nt":
                    raise RuntimeError("Datasets 'lemma' and 'pos' are unsupported on Windows")
                reader = input.PDTReader(pdt_dir, reader, online=not batch)
        if input_chain is not None:
            reader = input.InputChain(input_chain, reader)
        generator = input.InputGenerator(reader, datasets, datasets[0], noUnderscores=no_underscores)
        hypMLF = MLF()
        refMLF = MLF()
        if not batch:
            for da_fn, da_id, da_semantics, da_txts in generator.readInputs():
                da_empty = not bool(da_semantics.strip())
                if da_empty and skip_empty:
                    continue

                refMLF[da_id] = da_semantics + "\n"
                dcd = self.parseLine(model_dir, [da_txts], isymMaps, osymMap, omitLeaves=omit_leaves)
                if dcd:
                    if len(dcd) == 1:
                        hypMLF[da_id] = dcd[0].encode(encoding) + "\n"
                    else:
                        hypMLF[da_id] = ";".join(dcd).encode(encoding) + "\n"
                else:
                    # Decoder produced no output: emit an empty hypothesis.
                    hypMLF[da_id] = "\n"
                if not mlf:
                    print hypMLF[da_id],
        else:
            # Batch mode: pull batch_size items at a time from the generator.
            # The for/else below marks all_processed only when the generator
            # is exhausted (i.e. the loop ends without a break).
            all_processed = False
            inputs = generator.readInputs()
            while not all_processed:
                da_count = 0
                lines = []
                ids = []
                for da_fn, da_id, da_semantics, da_txts in inputs:
                    da_empty = not bool(da_semantics.strip())
                    if da_empty and skip_empty:
                        continue

                    refMLF[da_id] = da_semantics + "\n"
                    lines.append(da_txts)
                    ids.append(da_id)
                    da_count += 1
                    if da_count >= batch_size:
                        break
                else:
                    all_processed = True

                dcd = self.parseLine(model_dir, lines, isymMaps, osymMap, omitLeaves=omit_leaves)
                for da_id, ol in zip(ids, dcd):
                    hypMLF[da_id] = ol.encode(encoding) + "\n"
                    if not mlf:
                        print hypMLF[da_id],
        if mlf:
            s = "".join(hypMLF.toLines())
            print s

        if ref_mlf:
            refMLF.writeToFile(ref_mlf)
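A minimal sketch of calling main in batch mode, assuming a hypothetical `Decoder` class that owns the method and a trained model directory; the class name and every path below are illustrative, not from the original code.

    # Hypothetical driver: batch-decode dialogue acts from DXML files,
    # print the hypothesis MLF, and save the reference MLF for scoring.
    d = Decoder()
    d.main('models/hvs',
           batch=True,
           mlf=True,
           xml_dir='data/test-xml',
           ref_mlf='build/ref.mlf',
           batch_size=50)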