Ejemplo n.º 1
0
    def eval(self, gold_file, output_file):
        """
        Run the external ``eval`` subcommand and return its exit code.

        The command is ``self.cmd_start`` followed by
        ``eval gold-file:<gold_file> output-file:<output_file>`` and is
        executed through the shell. Every non-blank stdout line is split on
        ``':'`` and its second field is printed; stderr lines are handed to
        ``watch_for_java_exception``.

        :param gold_file: Value substituted after ``gold-file:``.
        :param output_file: Value substituted after ``output-file:``.
        :return: The value returned by ``ProcessCommunicator.wait()``.
        """
        def print_second_field(line):
            # Blank lines are ignored; any other line is expected to
            # contain at least one ':' separator.
            if not line.strip():
                return
            fields = line.split(':')
            print(fields[1].strip())

        arguments = 'eval gold-file:{} output-file:{}'.format(gold_file, output_file)
        process = ProcessCommunicator(self.cmd_start + arguments,
                                      shell=True,
                                      stdout_func=print_second_field,
                                      stderr_func=watch_for_java_exception)
        return process.wait()
Ejemplo n.º 2
0
    def train(self, prefix, e, f):
        """
        Train the giza word alignments on the provided text files.

        Builds a new ``GizaFiles`` container (stored on ``self.tf``),
        converts the text files to SNT/VCB files with fresh vocabularies,
        runs the mgiza binary and merges its A3 output.

        :param prefix: Prefix for where the giza output files will be stored.
        :type prefix: path+prefix
        :param e: Path to the "e" file
        :type e: path
        :param f: Path to the "f"
        :type f: path
        :return: The result of ``self.tf.aligned_sents()``.
        :raises GizaAlignmentException: If the mgiza path is not configured,
            does not exist, or mgiza exits with a nonzero status.
        """
        GIZA_LOG.info("Starting mgiza training from scratch...")
        self.tf = GizaFiles(prefix, e, f)

        GIZA_LOG.info("Converting txt files to SNTS and VCB files...")
        self.tf.txt_to_snt(ev=Vocab(), fv=Vocab())

        # Now, do the aligning...
        exe = c.getpath('mgiza')

        if exe is None:
            raise GizaAlignmentException('Path to mgiza binary not defined.')
        elif not os.path.exists(exe):
            # Interpolate the offending path so the message is actionable
            # (previously the literal "%s" was shown).
            raise GizaAlignmentException('Path to mgiza binary "%s" invalid.' % exe)

        elts = [exe,
                '-o', os.path.join(self.tf.prefix, self.tf.name),
                '-S', self.tf.e_vcb,
                '-T', self.tf.f_vcb,
                '-C', self.tf.ef_snt,
                '-CoocurrenceFile', self.tf.ef_cooc,
                '-hmmiterations', '5',
                '-model4iterations', '0',
                '-ncpus', '0']

        # The joined string is only for the debug log; the process itself is
        # started from the argument list.
        cmd = ' '.join(elts)
        GIZA_LOG.debug('Command: "{}"'.format(cmd))

        p = ProcessCommunicator(elts)
        status = p.wait()
        GIZA_LOG.debug("Exit code: {}".format(str(status)))

        if status != 0:
            raise GizaAlignmentException("mgiza exited abnormally with a return code of {}".format(str(status)))

        self.tf.merge_a3()
        # self.tf.clean()

        return self.tf.aligned_sents()
Ejemplo n.º 3
0
def svmlight_to_vectors(txt):
    """
    Convert an SVM-light format text file to a vector file.

    Runs ``mallet_bin import-svmlight`` with *txt* as the input and a newly
    created temporary file as the output.

    :param txt: Path to the text file.
    :return: Path of the temporary vector file. The file is not deleted
        automatically; the caller is responsible for removing it.
    :raises ClassifierException: If the conversion process does not exit
        with status 0.
    """
    import os  # local import so the block does not rely on a file-level one

    MAXENT_LOG.info("Attempting to convert {} to a vector file.".format(txt))

    # Only the path is needed: the handle is closed right away so the
    # external process can write to the file.
    ntf = NamedTemporaryFile(mode='w', delete=False)
    ntf.close()

    converted = False
    try:
        # An argument list (no shell) means paths containing spaces, quotes
        # or shell metacharacters are passed through verbatim.
        p = ProcessCommunicator([mallet_bin, 'import-svmlight',
                                 '--input', txt,
                                 '--output', ntf.name],
                                stdout_func=MAXENT_LOG.info,
                                stderr_func=MAXENT_LOG.warning)
        converted = p.wait() == 0
    finally:
        if not converted:
            # Do not leave the temporary file behind when the conversion
            # failed or raised.
            try:
                os.unlink(ntf.name)
            except OSError:
                pass

    if not converted:
        raise ClassifierException("SVMLight Conversion did not complete successfully.")

    MAXENT_LOG.debug("Successfully created temporary vector file {}".format(ntf.name))
    return ntf.name
Ejemplo n.º 4
0
def train_txt(txt_path, model_path):
    """
    Train a classifier from a svm-light format text file.

    The text file is first converted to a temporary vector file with
    ``svmlight_to_vectors``; ``mallet_bin train-classifier`` is then run
    with the ``MaxEntTrainer`` trainer, writing the classifier to
    *model_path*.

    :param txt_path: Path to the svm-light format text file.
    :param model_path: Path where the trained classifier is written.
    :return: A ``MalletMaxent`` built from *model_path*.
    :raises ClassifierException: If training does not exit with status 0.
    """
    vectors = svmlight_to_vectors(txt_path)
    try:
        MAXENT_LOG.info("Attempting to train classifier {}".format(model_path))
        p = ProcessCommunicator([mallet_bin, 'train-classifier',
                                 '--input', vectors,
                                 '--trainer', 'MaxEntTrainer',
                                 '--output-classifier', model_path],
                                stdout_func=MAXENT_LOG.info, stderr_func=MAXENT_LOG.info)
        status = p.wait()
    finally:
        # The vector file is only an intermediate artifact: remove it
        # whether training succeeded, failed or raised.
        os.unlink(vectors)

    if status != 0:
        raise ClassifierException("Training the classifier did not complete. Check the logs.")

    MAXENT_LOG.debug("Success.")
    return MalletMaxent(model_path)
Ejemplo n.º 5
0
    def __init__(self, model):
        """
        Start the Stanford tagger as a non-blocking child java process.

        :param model: Path to the model file.
        :type model: str
        :raises TaggerError: If ``tagger_dir`` is undefined or does not
            exist, or if ``java_bin`` is undefined.
        """
        # -------------------------------------------
        # Do some error checking.
        # -------------------------------------------
        if tagger_dir is None:
            TAG_LOG.critical('Path to the stanford tagger .jar file is not defined.')
            raise TaggerError('Path to the stanford tagger .jar file is not defined.')

        elif not os.path.exists(tagger_dir):
            raise TaggerError('Path to the stanford tagger "{}" is not found.'.format(tagger_dir))

        if java_bin is None:
            raise TaggerError("Path to java bin is undefined!")
        # -------------------------------------------

        # The tagger jar comes first, followed by every jar in the lib/
        # directory of the tagger installation.
        other_jars = glob.glob(os.path.join(tagger_dir, 'lib/*.jar'))

        # Java expects the platform's path-list separator in the classpath
        # (':' on POSIX, ';' on Windows), which is what os.pathsep provides.
        classpath = os.pathsep.join([tagger_jar] + other_jars)

        # Filled by the stdout handler as the tagger produces output.
        self.results_queue = []

        self.st = ProcessCommunicator([java_bin,
                                       '-cp', classpath,
                                       'edu.stanford.nlp.tagger.maxent.MaxentTagger',
                                       '-model', model,
                                       '-sentenceDelimiter', 'newline',
                                       '-tokenize', 'false'],
                                      stderr_func=stanford_stderr_handler,
                                      stdout_func=lambda x: stanford_stdout_handler(x, self.results_queue),
                                      blocking=False)
Ejemplo n.º 6
0
class StanfordPOSTagger(object):
    """
    Instantiate a java VM to run the stanford tagger.

    The tagger runs as a long-lived child process: sentences are written to
    its stdin and results are collected from ``self.results_queue``, which
    the stdout handler fills.
    """
    def __init__(self, model):
        """
        Start the Stanford tagger as a non-blocking child java process.

        :param model: Path to the model file.
        :type model: str
        :raises TaggerError: If ``tagger_dir`` is undefined or does not
            exist, or if ``java_bin`` is undefined.
        """
        # -------------------------------------------
        # Do some error checking.
        # -------------------------------------------
        if tagger_dir is None:
            TAG_LOG.critical('Path to the stanford tagger .jar file is not defined.')
            raise TaggerError('Path to the stanford tagger .jar file is not defined.')

        elif not os.path.exists(tagger_dir):
            raise TaggerError('Path to the stanford tagger "{}" is not found.'.format(tagger_dir))

        if java_bin is None:
            raise TaggerError("Path to java bin is undefined!")
        # -------------------------------------------

        # The tagger jar comes first, followed by every jar in the lib/
        # directory of the tagger installation.
        other_jars = glob.glob(os.path.join(tagger_dir, 'lib/*.jar'))

        # Java expects the platform's path-list separator in the classpath
        # (':' on POSIX, ';' on Windows), which is what os.pathsep provides.
        classpath = os.pathsep.join([tagger_jar] + other_jars)

        # Filled by the stdout handler as the tagger produces output.
        self.results_queue = []

        self.st = ProcessCommunicator([java_bin,
                                       '-cp', classpath,
                                       'edu.stanford.nlp.tagger.maxent.MaxentTagger',
                                       '-model', model,
                                       '-sentenceDelimiter', 'newline',
                                       '-tokenize', 'false'],
                                      stderr_func=stanford_stderr_handler,
                                      stdout_func=lambda x: stanford_stdout_handler(x, self.results_queue),
                                      blocking=False)

    def tag_tokenization(self, tokenization, **kwargs):
        """
        Tag the text of *tokenization*.

        :param tokenization: Object whose ``text()`` method returns the
            string to tag.
        :param kwargs: Forwarded unchanged to :meth:`tag`.
        :return: Whatever :meth:`tag` returns.
        """
        return self.tag(tokenization.text(), **kwargs)

    def tag(self, s, **kwargs):
        """
        Send one line of text to the tagger process and return its result.

        :param s: Text to tag; it is written to the tagger as a single line.
        :type s: str
        :param kwargs: ``lowercase`` (default ``True``) lowercases *s*
            before it is sent.
        :rtype: list[POSToken]
        :raises CriticalTaggerError: If the pipe to the tagger process is
            broken while writing or flushing.
        """
        # Lowercase if asked for
        if kwargs.get('lowercase', True):
            s = s.lower()

        # A dead child can surface as BrokenPipeError on the write as well
        # as on the flush, so both are guarded.
        try:
            self.st.stdin.write(bytes(s + '\r\n', encoding='utf-8'))
            self.st.stdin.flush()
        except BrokenPipeError as err:
            raise CriticalTaggerError('The Stanford parser unexpectedly quit.') from err

        # Poll until the stdout handler has queued a result.
        # NOTE(review): there is no timeout here; if the process stops
        # producing output this loop never ends -- confirm whether
        # ProcessCommunicator exposes a liveness check that could be used.
        while not self.results_queue:
            time.sleep(0.25)

        return self.results_queue.pop()

    def close(self):
        """Kill the tagger child process."""
        self.st.kill()
0
    def resume(self, prefix, new_e, new_f):
        """
        "Force" align a new set of data using the old model, per the
        instructions at:

        http://www.kyloo.net/software/doku.php/mgiza:forcealignment

        The vocabularies of the existing model (``self.tf``) are loaded,
        extended with the new text and written to the new file set before
        mgiza is restarted against the previous model files.

        :param prefix: Passed to ``GizaFiles`` for the new file set.
        :param new_e: Passed to ``GizaFiles`` as the new "e" file.
        :param new_f: Passed to ``GizaFiles`` as the new "f" file.
        :return: The result of ``aligned_sents()`` on the new file set.
        :raises GizaAlignmentException: If the mgiza path is not configured,
            does not exist, or mgiza exits with a nonzero status.
        """
        # Container for the files this run is going to create.
        fresh = GizaFiles(prefix, new_e, new_f)

        # Extend the old vocabularies with the text that is to be aligned...
        e_vocab = Vocab.load(self.tf.e_vcb)
        f_vocab = Vocab.load(self.tf.f_vcb)

        e_vocab.add_from_txt(fresh.e)
        f_vocab.add_from_txt(fresh.f)

        # ...and store the extended vocabularies under the new file paths.
        e_vocab.dump(fresh.e_vcb)
        f_vocab.dump(fresh.f_vcb)

        # Write out the sentence files using the extended vocabularies.
        fresh.txt_to_snt(ev=e_vocab, fv=f_vocab)

        exe = c.getpath('mgiza')
        if exe is None:
            raise GizaAlignmentException('Path to mgiza binary not defined.')
        if not os.path.exists(exe):
            raise GizaAlignmentException('Path to mgiza binary "%s" invalid.' % exe)

        output_base = os.path.join(fresh.prefix, fresh.name)
        args = [
            exe,  # self.tf.cfg,
            '-restart', '2',
            '-o', output_base,
            '-m2', '5',
            '-previoust', self.tf.t,
            '-previousa', self.tf.a,
            '-previousn', self.tf.n,
            '-previousd', self.tf.d3,
            '-c', fresh.ef_snt,
            '-s', fresh.e_vcb,
            '-t', fresh.f_vcb,
            '-Coocurrencefile', fresh.ef_cooc,
        ]

        # The joined form is only used for logging.
        GIZA_LOG.debug('Command: "{}"'.format(' '.join(args)))

        status = ProcessCommunicator(args).wait()
        GIZA_LOG.debug("Exit status {}".format(str(status)))

        if status != 0:
            raise GizaAlignmentException("mgiza exited abnormally with a return code of {}".format(str(status)))

        fresh.merge_a3()
        # fresh.clean()

        return fresh.aligned_sents()
Ejemplo n.º 8
0
 def test(self, model_file, test_file, output_file):
     """
     Run the external ``test`` subcommand and return its exit code.

     The command is ``self.cmd_start`` followed by
     ``test model-name:<model_file> test-file:<test_file>
     output-file:<output_file>`` and is executed through the shell. No
     stdout handler is installed; stderr lines are handed to
     ``watch_for_java_exception``.

     :return: The value returned by ``ProcessCommunicator.wait()``.
     """
     arguments = 'test model-name:{} test-file:{} output-file:{}'.format(model_file, test_file, output_file)
     process = ProcessCommunicator(self.cmd_start + arguments,
                                   shell=True,
                                   stdout_func=None,
                                   stderr_func=watch_for_java_exception)
     return process.wait()
Ejemplo n.º 9
0
    def train(self, train_file, model_file):
        """
        Run the external ``train`` subcommand and return its exit code.

        The command is ``self.cmd_start`` followed by
        ``train train-file:<train_file> model-name:<model_file>`` and is
        executed through the shell; both stdout and stderr lines are passed
        to ``print``.

        :return: The value returned by ``ProcessCommunicator.wait()``.
        """
        arguments = 'train train-file:{} model-name:{}'.format(train_file, model_file)
        process = ProcessCommunicator(self.cmd_start + arguments,
                                      shell=True,
                                      stdout_func=print,
                                      stderr_func=print)
        return process.wait()