Esempio n. 1
0
    def eval(self, gold_file, output_file):
        """
        Run the external tool's ``eval`` command and echo the values it reports.

        The command line is ``self.cmd_start`` followed by
        ``eval gold-file:<gold_file> output-file:<output_file>`` and is run
        through the shell.

        :param gold_file: Value substituted into the ``gold-file:`` argument.
        :param output_file: Value substituted into the ``output-file:`` argument.
        :return: Whatever ``ProcessCommunicator.wait()`` returns (stored as the exit code).
        """
        def split_out(string):
            # Blank lines carry nothing to report.
            if not string.strip():
                return

            fields = string.split(':')
            if len(fields) > 1:
                # "label: value" style line -- show only the field that
                # follows the first colon.
                print(fields[1].strip())
            else:
                # No colon at all: indexing [1] would raise IndexError inside
                # the output callback, so pass the line through unchanged.
                print(string.strip())

        cmd = self.cmd_start + 'eval gold-file:{} output-file:{}'.format(gold_file, output_file)
        p = ProcessCommunicator(cmd, shell=True, stdout_func=split_out, stderr_func=watch_for_java_exception)
        exit_code = p.wait()
        return exit_code
Esempio n. 2
0
    def train(self, prefix, e, f):
        """
        Train the giza word alignments on the provided text files.

        :param prefix: Prefix for where the giza output files will be stored.
        :type prefix: path+prefix
        :param e: Path to the "e" file
        :type e: path
        :param f: Path to the "f"
        :type f: path
        :return: The result of ``aligned_sents()`` on the newly created ``GizaFiles``.
        :raises GizaAlignmentException: If the mgiza binary path is undefined or
            does not exist, or if mgiza exits with a non-zero return code.
        """
        GIZA_LOG.info("Starting mgiza training from scratch...")
        self.tf = GizaFiles(prefix, e, f)

        GIZA_LOG.info("Converting txt files to SNTS and VCB files...")
        self.tf.txt_to_snt(ev = Vocab(), fv = Vocab())

        # Now, do the aligning...
        exe = c.getpath('mgiza')

        if exe is None:
            raise GizaAlignmentException('Path to mgiza binary not defined.')
        elif not os.path.exists(exe):
            # The offending path must be interpolated into the message;
            # previously the literal "%s" placeholder was shown to the user.
            raise GizaAlignmentException('Path to mgiza binary "%s" invalid.' % exe)

        elts = [exe,
                '-o', os.path.join(self.tf.prefix, self.tf.name),
                '-S', self.tf.e_vcb,
                '-T', self.tf.f_vcb,
                '-C', self.tf.ef_snt,
                '-CoocurrenceFile', self.tf.ef_cooc,
                '-hmmiterations', '5',
                '-model4iterations', '0',
                '-ncpus', '0']

        # The joined string is only for the debug log; the process itself is
        # started from the argument list.
        cmd = ' '.join(elts)
        GIZA_LOG.debug('Command: "{}"'.format(cmd))

        p = ProcessCommunicator(elts)
        status = p.wait()
        GIZA_LOG.debug("Exit code: {}".format(str(status)))

        if status != 0:
            raise GizaAlignmentException("mgiza exited abnormally with a return code of {}".format(str(status)))

        self.tf.merge_a3()
        # self.tf.clean()

        return self.tf.aligned_sents()
Esempio n. 3
0
def svmlight_to_vectors(txt):
    """
    Convert an SVM-light format text file to a vector file using mallet's
    ``import-svmlight`` command.

    :param txt: Path to the text file.
    :return: Path to the temporary vector file that was created. The file is
        not deleted automatically; the caller is responsible for removing it.
    :raises ClassifierException: If the conversion command exits with a
        non-zero status. The temporary file is removed in that case.
    """
    # Local import so this function does not depend on a module-level import.
    import os

    MAXENT_LOG.info("Attempting to convert {} to a vector file.".format(txt))

    # Reserve a path for the output. The handle is closed right away so the
    # external process can write to the file; delete=False keeps it on disk.
    ntf = NamedTemporaryFile(mode='w', delete=False)
    ntf.close()

    # Pass the arguments as a list (as train_txt does) rather than building a
    # shell string, so paths with spaces or shell metacharacters are passed
    # through verbatim.
    p = ProcessCommunicator([mallet_bin, 'import-svmlight',
                             '--input', txt,
                             '--output', ntf.name],
                            stdout_func=MAXENT_LOG.info, stderr_func=MAXENT_LOG.warning)

    if p.wait() == 0:
        MAXENT_LOG.debug("Successfully created temporary vector file {}".format(ntf.name))
        return ntf.name

    # Conversion failed: nobody will receive the path, so remove the file
    # here instead of leaking it. Cleanup is best-effort and must not mask
    # the real error raised below.
    try:
        os.unlink(ntf.name)
    except OSError:
        pass

    raise ClassifierException("SVMLight Conversion did not complete successfully.")
Esempio n. 4
0
def train_txt(txt_path, model_path):
    """
    Train a classifier from a svm-light format text file.

    The text file is first converted to a temporary vector file, which is
    removed again whether or not training succeeds.

    :param txt_path: Path to the svm-light format text file.
    :param model_path: Path passed to mallet as ``--output-classifier``.
    :return: A ``MalletMaxent`` constructed from ``model_path``.
    :raises ClassifierException: If the training command exits with a
        non-zero status.
    """

    vectors = svmlight_to_vectors(txt_path)

    try:
        MAXENT_LOG.info("Attempting to train classifier {}".format(model_path))
        p = ProcessCommunicator([mallet_bin, 'train-classifier',
                                 '--input', vectors,
                                 '--trainer', 'MaxEntTrainer',
                                 '--output-classifier', model_path],
                                stdout_func=MAXENT_LOG.info, stderr_func=MAXENT_LOG.info)

        succeeded = p.wait() == 0
        if succeeded:
            MAXENT_LOG.debug("Success.")
    finally:
        # The vector file is only an intermediate artifact; remove it on
        # failure too, so that failed runs do not leak temporary files.
        os.unlink(vectors)

    if not succeeded:
        raise ClassifierException("Training the classifier did not complete. Check the logs.")

    return MalletMaxent(model_path)
Esempio n. 5
0
    def resume(self, prefix, new_e, new_f):
        """
        Align a new set of data against the previously trained model
        ("force" alignment), following the instructions at:

        http://www.kyloo.net/software/doku.php/mgiza:forcealignment

        :param prefix: Passed to ``GizaFiles`` as the prefix for the new files.
        :param new_e: Passed to ``GizaFiles`` as the new "e" file.
        :param new_f: Passed to ``GizaFiles`` as the new "f" file.
        :return: The result of ``aligned_sents()`` on the new ``GizaFiles``.
        :raises GizaAlignmentException: If the mgiza binary path is undefined or
            does not exist, or if mgiza exits with a non-zero return code.
        """
        # Container for the files that this run will create.
        target = GizaFiles(prefix, new_e, new_f)

        # Start from the vocabularies of the existing model and grow them
        # with the words found in the new text.
        e_vocab = Vocab.load(self.tf.e_vcb)
        f_vocab = Vocab.load(self.tf.f_vcb)
        e_vocab.add_from_txt(target.e)
        f_vocab.add_from_txt(target.f)

        # Store the grown vocabularies at the new file locations ...
        e_vocab.dump(target.e_vcb)
        f_vocab.dump(target.f_vcb)

        # ... and write the sentence files using those vocabularies.
        target.txt_to_snt(ev=e_vocab, fv=f_vocab)

        exe = c.getpath('mgiza')
        if exe is None:
            raise GizaAlignmentException('Path to mgiza binary not defined.')
        if not os.path.exists(exe):
            raise GizaAlignmentException('Path to mgiza binary "%s" invalid.' % exe)

        # Flag/value pairs, in the order they are handed to mgiza.
        output_base = os.path.join(target.prefix, target.name)
        options = [
            ('-restart', '2'),
            ('-o', output_base),
            ('-m2', '5'),
            ('-previoust', self.tf.t),
            ('-previousa', self.tf.a),
            ('-previousn', self.tf.n),
            ('-previousd', self.tf.d3),
            ('-c', target.ef_snt),
            ('-s', target.e_vcb),
            ('-t', target.f_vcb),
            ('-Coocurrencefile', target.ef_cooc),
        ]
        argv = [exe]
        for flag, value in options:
            argv.extend((flag, value))

        # The joined string is used for logging only.
        GIZA_LOG.debug('Command: "{}"'.format(' '.join(argv)))

        status = ProcessCommunicator(argv).wait()
        GIZA_LOG.debug("Exit status {}".format(str(status)))

        if status != 0:
            raise GizaAlignmentException("mgiza exited abnormally with a return code of {}".format(str(status)))

        target.merge_a3()
        # target.clean()

        return target.aligned_sents()
Esempio n. 6
0
 def test(self, model_file, test_file, output_file):
     """
     Run the external tool's ``test`` command through the shell.

     :param model_file: Value substituted into the ``model-name:`` argument.
     :param test_file: Value substituted into the ``test-file:`` argument.
     :param output_file: Value substituted into the ``output-file:`` argument.
     :return: Whatever ``ProcessCommunicator.wait()`` returns.
     """
     prefix = self.cmd_start
     arguments = 'test model-name:{} test-file:{} output-file:{}'.format(model_file, test_file, output_file)

     # No stdout callback is installed; stderr goes to the Java exception watcher.
     communicator = ProcessCommunicator(prefix + arguments,
                                        shell=True,
                                        stdout_func=None,
                                        stderr_func=watch_for_java_exception)
     return communicator.wait()
Esempio n. 7
0
    def train(self, train_file, model_file):
        """
        Run the external tool's ``train`` command through the shell, sending
        both of its output streams to ``print``.

        :param train_file: Value substituted into the ``train-file:`` argument.
        :param model_file: Value substituted into the ``model-name:`` argument.
        :return: Whatever ``ProcessCommunicator.wait()`` returns.
        """
        prefix = self.cmd_start
        arguments = 'train train-file:{} model-name:{}'.format(train_file, model_file)

        communicator = ProcessCommunicator(prefix + arguments,
                                           shell=True,
                                           stdout_func=print,
                                           stderr_func=print)
        return communicator.wait()