def _init_server(self):
     """Initializes a server that registers new workers."""
     registrar_class, ranker_dump_path = get_worker_registrar_for(self)
     n_tries = 0
     self.server = None
     last_error = None
     while self.server is None and n_tries < 10:
         try:
             n_tries += 1
             self.server = ThreadPoolServer(service=registrar_class,
                                            nbThreads=1,
                                            port=self.port)
         except socket.error as e:
             log_warn('Port %d in use, trying to use a higher port...' %
                      self.port)
             self.port += 1
             last_error = e
     if self.server is None:
         if last_error is not None:
             raise last_error
         raise Exception('Could not initialize server')
     self.services = set()
     self.free_services = deque()
     self.pending_requests = set()
     self.jobs = []
     self.server_thread = Thread(target=self.server.start)
     self.server_thread.setDaemon(True)
     self.server_thread.start()
     self.ranker_dump_path = ranker_dump_path
 def delete(self):
     """Delete this job."""
     if self.submitted:
         try:
             self.__try_command('qdel ' + self.jobid)
         except RuntimeError as e:
             log_warn('Could not delete job: ' + str(e))
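
The __try_command helper is not shown in this snippet; a minimal sketch of what such a helper could look like, using only the standard library (the name, signature and behaviour are assumptions for illustration; it raises RuntimeError on failure, matching the except clause above):

import subprocess

def try_command(cmd):
    """Run a shell command (e.g. 'qdel 12345') and raise RuntimeError on a non-zero exit."""
    proc = subprocess.run(cmd, shell=True, capture_output=True, text=True)
    if proc.returncode != 0:
        # surface stderr so the caller's warning message is informative
        raise RuntimeError('Command failed (%d): %s\n%s' % (proc.returncode, cmd, proc.stderr))
    return proc.stdout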
 def _init_server(self):
     """Initializes a server that registers new workers."""
     registrar_class, ranker_dump_path = get_worker_registrar_for(self)
     n_tries = 0
     self.server = None
     last_error = None
     while self.server is None and n_tries < 10:
         try:
             n_tries += 1
             self.server = ThreadPoolServer(service=registrar_class, nbThreads=1, port=self.port)
         except socket.error as e:
             log_warn('Port %d in use, trying to use a higher port...' % self.port)
             self.port += 1
             last_error = e
     if self.server is None:
         if last_error is not None:
             raise last_error
         raise Exception('Could not initialize server')
     self.services = set()
     self.free_services = deque()
     self.pending_requests = set()
     self.jobs = []
     self.server_thread = Thread(target=self.server.start)
     self.server_thread.setDaemon(True)
     self.server_thread.start()
     self.ranker_dump_path = ranker_dump_path
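
The loop above keeps bumping the port number until the ThreadPoolServer binds. A standalone sketch of the same retry-on-busy-port idiom using only the standard library (the function name and defaults are illustrative, not part of tgen):

import socket

def bind_first_free_port(start_port, max_tries=10):
    """Try successive ports until one binds; re-raise the last error if all fail."""
    port, last_error = start_port, None
    for _ in range(max_tries):
        sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            sock.bind(('', port))
            return sock, port
        except OSError as e:  # socket.error is an alias of OSError in Python 3
            sock.close()
            last_error = e
            port += 1
    raise last_error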
Example #4
    def _get_merged_child_type_cdfs(self, da):
        """Get merged child CDFs (i.e. lists of possible children, given parent IDs) for the
        given DA.

        All nodes occurring in training data items that contain DAIs from the current DA are
        included. If `compatible_dais` is set, nodes that always occur with DAIs not in the
        current DA will be excluded.

        @param da: the current dialogue act
        """
        # get all nodes occurring in training data items containing the DAIs from the current DA
        merged_counts = defaultdict(Counter)
        for dai in da:
            try:
                for parent_id in self.child_type_counts[dai]:
                    merged_counts[parent_id].update(self.child_type_counts[dai][parent_id])
            except KeyError:
                log_warn('DAI ' + unicode(dai) + ' unknown, adding nothing to CDF.')

#         log_info('Node types: %d' % sum(len(c.keys()) for c in merged_counts.values()))

        # remove nodes that are not compatible with the current DA (their list of
        # minimum compatibility DAIs is not similar to the current DA)
        for _, counts in merged_counts.items():
            for node in counts.keys():
                if not self._compatible(da, NodeData(t_lemma=node[1], formeme=node[0])):
                    del counts[node]

#         log_info('Node types after pruning: %d' % sum(len(c.keys()) for c in merged_counts.values()))
#         log_info('Compatible lemmas: %s' % ' '.join(set([n[1] for c in merged_counts.values()
#                                                          for n in c.keys()])))

        return self.cdfs_from_counts(merged_counts)
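
A toy, self-contained illustration of the counter-merging step above (the DAI strings and node tuples are made up; the real keys are DAI objects and (formeme, t_lemma) pairs):

from collections import Counter, defaultdict

# per-DAI child counts: {dai: {parent_id: Counter({(formeme, t_lemma): count})}}
child_type_counts = {
    'inform(food=Chinese)': {'n:subj': Counter({('n:attr', 'Chinese'): 3})},
    'inform(area=centre)': {'n:subj': Counter({('n:in+X', 'centre'): 2})},
}

merged_counts = defaultdict(Counter)
for dai in ['inform(food=Chinese)', 'inform(area=centre)', 'request(phone)']:
    try:
        for parent_id in child_type_counts[dai]:
            merged_counts[parent_id].update(child_type_counts[dai][parent_id])
    except KeyError:
        print('DAI %s unknown, adding nothing to CDF.' % dai)

print(dict(merged_counts))
# {'n:subj': Counter({('n:attr', 'Chinese'): 3, ('n:in+X', 'centre'): 2})}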
Example #5
    def load_from_file(model_fname):
        """Load the generator from a file (actually two files, one for configuration and one
        for the TensorFlow graph, which must be stored separately).

        @param model_fname: file name (for the configuration file); TF graph must be stored with a \
            different extension
        """
        log_info("Loading generator from %s..." % model_fname)
        with file_stream(model_fname, 'rb', encoding=None) as fh:
            data = pickle.load(fh)
            ret = Seq2SeqGen(cfg=data['cfg'])
            ret.load_all_settings(data)

        if ret.classif_filter:
            classif_filter_fname = re.sub(r'((.pickle)?(.gz)?)$',
                                          r'.tftreecl\1', model_fname)
            if os.path.isfile(classif_filter_fname):
                ret.classif_filter = RerankingClassifier.load_from_file(
                    classif_filter_fname)
            else:
                log_warn("Classification filter data not found, ignoring.")
                ret.classif_filter = False

        # re-build TF graph and restore the TF session
        tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
        ret._init_neural_network()
        ret.saver.restore(ret.session, tf_session_fname)

        return ret
Example #6
    def save_model(self, model_fname_pattern):
        if not self._trained_model:
            log_warn('No lexicalizer model trained, skipping saving!')
            return

        model_fname = re.sub(r'(\.pickle)?(\.gz)?$', '.kenlm.bin',
                             model_fname_pattern)
        shutil.copyfile(self._trained_model, model_fname)
Example #7
    def load_from_file(model_fname):
        """Load the generator from a file (actually two files, one for configuration and one
        for the TensorFlow graph, which must be stored separately).

        @param model_fname: file name (for the configuration file); TF graph must be stored with a \
            different extension
        """
        log_info("Loading generator from %s..." % model_fname)
        with file_stream(model_fname, 'rb', encoding=None) as fh:
            data = pickle.load(fh)
            ret = Seq2SeqGen(cfg=data['cfg'])
            ret.load_all_settings(data)

        if ret.classif_filter:
            classif_filter_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tftreecl\1', model_fname)
            if os.path.isfile(classif_filter_fname):
                ret.classif_filter = RerankingClassifier.load_from_file(classif_filter_fname)
            else:
                log_warn("Classification filter data not found, ignoring.")
                ret.classif_filter = False

        # re-build TF graph and restore the TF session
        tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
        ret._init_neural_network()
        ret.saver.restore(ret.session, tf_session_fname)

        return ret
Example #8
 def save_model(self, model_fname_pattern):
     if not self._word_freq:
         log_warn('No lexicalizer model trained, skipping saving!')
         return
     model_fname = re.sub(r'(.pickle)?(.gz)?$', '.wfreq',
                          model_fname_pattern)
     with file_stream(model_fname, 'wb', encoding=None) as fh:
         pickle.dump(self._word_freq, fh, pickle.HIGHEST_PROTOCOL)
Example #9
 def tf_check_filename(self, fname):
     """Checks if a directory is specified in the file name (otherwise newer TF versions
     would crash when saving a model).
     @param fname: The file name to be checked.
     @return: Adjusted file name (with "./" if no directory was specified)."""
     if not os.path.dirname(fname):
         log_warn("Directory not specified, using current directory: %s" % fname)
         fname = os.path.join(os.curdir, fname)
     return fname
Example #10
 def tf_check_filename(self, fname):
     """Checks if a directory is specified in the file name (otherwise newer TF versions
     would crash when saving a model).
     @param fname: The file name to be checked.
     @return: Adjusted file name (with "./" if no directory was specified)."""
     if not os.path.dirname(fname):
         log_warn("Directory not specified, using current directory: %s" %
                  fname)
         fname = os.path.join(os.curdir, fname)
     return fname
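
The effect of the check is easy to demonstrate with os.path alone (output shown for POSIX paths):

import os

for fname in ('model.tfsess', 'out/model.tfsess'):
    if not os.path.dirname(fname):
        fname = os.path.join(os.curdir, fname)
    print(fname)
# ./model.tfsess
# out/model.tfsess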
Example #11
 def create_ttree(self):
     """Convert the TreeData structure to a regular t-tree."""
     tnodes = [T(data={'ord': 0})] + [
         T(data={
             't_lemma': node.t_lemma,
             'formeme': node.formeme,
             'ord': i
         }) for i, node in enumerate(self.nodes[1:], start=1)
     ]
     for parent_idx, tnode in zip(self.parents[1:], tnodes[1:]):
         try:
             tnode.parent = tnodes[parent_idx]
         except RuntimeException as e:
             # if a cycle is attempted, the node will hang on technical root
             log_warn("Error when creatting t-tree: %s\nTree: %s" %
                      (str(e), str(self)))
     return tnodes[0]
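
The conversion walks two parallel lists, parents and nodes, skipping the technical root at index 0. A toy walk over the same kind of structure (node labels and parent indices are invented; -1 is just a placeholder for the root's missing parent):

nodes = ['ROOT', 'be', 'restaurant', 'Chinese']
parents = [-1, 0, 1, 2]   # index of each node's parent

for parent_idx, node in zip(parents[1:], nodes[1:]):
    print('%s -> parent %s' % (node, nodes[parent_idx]))
# be -> parent ROOT
# restaurant -> parent be
# Chinese -> parent restaurant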
Example #12
    def get_merged_limits(self, da):
        """Return merged limits on node counts (total and on each tree level). Uses a
        maximum for all DAIs in the given DA.

        Returns None if the given candidate generator does not have any node limits.

        @param da: the current dialogue act
        @rtype: defaultdict(int)
        """
        if not self.node_limits:
            return None
        merged_limits = defaultdict(int)
        for dai in da:
            try:
                for level, level_limit in self.node_limits[dai].iteritems():
                    merged_limits[level] = max((level_limit, merged_limits[level]))
            except KeyError:
                log_warn('DAI ' + unicode(dai) + ' unknown, limits unchanged.')
        return merged_limits
Example #13
    def get_merged_limits(self, da):
        """Return merged limits on node counts (total and on each tree level). Uses a
        maximum for all DAIs in the given DA.

        Returns None if the given candidate generator does not have any node limits.

        @param da: the current dialogue act
        @rtype: defaultdict(int)
        """
        if not self.node_limits:
            return None
        merged_limits = defaultdict(int)
        for dai in da:
            try:
                for level, level_limit in self.node_limits[dai].items():
                    merged_limits[level] = max(
                        (level_limit, merged_limits[level]))
            except KeyError:
                log_warn('DAI ' + str(dai) + ' unknown, limits unchanged.')
        return merged_limits
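
A toy run of the limit-merging logic above (the DAI keys and per-level limits are made up):

from collections import defaultdict

node_limits = {
    'inform(food=Chinese)': {0: 1, 1: 4},
    'inform(area=centre)': {1: 6, 2: 3},
}

merged_limits = defaultdict(int)
for dai in ['inform(food=Chinese)', 'inform(area=centre)']:
    for level, level_limit in node_limits[dai].items():
        merged_limits[level] = max(level_limit, merged_limits[level])

print(dict(merged_limits))
# {0: 1, 1: 6, 2: 3}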
Example #14
    def _get_merged_child_type_cdfs(self, da):
        """Get merged child CDFs (i.e. lists of possible children, given parent IDs) for the
        given DA.

        All nodes occurring in training data items that contain DAIs from the current DA are
        included. If `compatible_dais` is set, nodes that always occur with DAIs not in the
        current DA will be excluded.

        @param da: the current dialogue act
        """
        # get all nodes occurring in training data items containing the DAIs from the current DA
        merged_counts = defaultdict(Counter)
        for dai in da:
            try:
                for parent_id in self.child_type_counts[dai]:
                    merged_counts[parent_id].update(
                        self.child_type_counts[dai][parent_id])
            except KeyError:
                log_warn('DAI ' + str(dai) +
                         ' unknown, adding nothing to CDF.')

#         log_info('Node types: %d' % sum(len(c.keys()) for c in merged_counts.values()))

        # remove nodes that are not compatible with the current DA (their list of
        # minimum compatibility DAIs is not similar to the current DA)
        for _, counts in list(merged_counts.items()):
            for node in list(counts.keys()):
                if not self._compatible(
                        da, NodeData(t_lemma=node[1], formeme=node[0])):
                    del counts[node]

#         log_info('Node types after pruning: %d' % sum(len(c.keys()) for c in merged_counts.values()))
#         log_info('Compatible lemmas: %s' % ' '.join(set([n[1] for c in merged_counts.values()
#                                                          for n in c.keys()])))

        return self.cdfs_from_counts(merged_counts)
Example #15
def seq2seq_gen(args):
    """Sequence-to-sequence generation"""

    ap = ArgumentParser()

    ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for tokens only)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    ap.add_argument('-b', '--beam-size', type=int,
                    help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files
    das = read_das(args.da_test_file)
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn('Generator is not trained to use context, ignoring context input file.')
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = [(context, da) for context, da in zip(contexts, das)]

    # prepare evaluation
    if args.eval_file is None or args.eval_file.endswith('.txt'):  # just tokens
        gen_doc = []
    else:  # trees: requires PyTreex
        from pytreex.core.document import Document
        eval_doc = read_ttrees(args.eval_file)
        if args.ref_selector == args.target_selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    if args.eval_file:
        tgen.init_slot_err_stats()

    # generate
    log_info('Generating...')
    tgen.selector = args.target_selector  # override target selector for generation
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        tgen.generate_tree(da, gen_doc)

    # evaluate
    if args.eval_file is not None:
        log_info(tgen.get_slot_err_stats())
        # evaluate the generated tokens (F1 and BLEU scores)
        if args.eval_file.endswith('.txt'):
            lexicalize_tokens(gen_doc, lexicalization_from_doc(args.abstr_file))
            eval_tokens(das, read_tokens(args.eval_file, ref_mode=True), gen_doc)
        # evaluate the generated trees against golden trees
        else:
            eval_trees(das,
                       ttrees_from_doc(eval_doc, tgen.language, args.ref_selector),
                       ttrees_from_doc(gen_doc, tgen.language, args.target_selector),
                       eval_doc, tgen.language, tgen.selector)

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        if args.output_file.endswith('.txt'):
            write_tokens(gen_doc, args.output_file)
        else:
            write_ttrees(gen_doc, args.output_file)
Example #16
def main(argv):

    opts, files = getopt(argv, 'f:c:d:')

    folds = 10
    chunk_size = 2
    dir_prefix = 'cv'

    for opt, arg in opts:
        if opt == '-f':
            folds = int(arg)
        elif opt == '-c':
            chunk_size = int(arg)
        elif opt == '-d':
            dir_prefix = arg

    if not files:
        sys.exit(__doc__)

    random.seed(1206)
    ordering = None

    for file in files:
        # read all data
        data = []
        with file_stream(file) as fh:
            chunk = []
            for line in fh:
                chunk.append(line)
                if len(chunk) == chunk_size:
                    data.append(chunk)
                    chunk = []
            if chunk:
                log_warn('Incomplete chunk at end of file %s, size %d' %
                         (file, len(chunk)))

        if ordering is None:
            # create ordering
            ordering = range(len(data))
            random.shuffle(ordering)

            # create directories
            for fold_no in xrange(folds):
                os.mkdir(dir_prefix + "%02d" % fold_no)

        # output as train and test into all CV portions
        fold_size, bigger_folds = divmod(len(data), folds)
        for fold_no in xrange(folds):
            # compute test data bounds
            if fold_no < bigger_folds:
                test_lo = (fold_size + 1) * fold_no
                test_hi = (fold_size + 1) * (fold_no + 1)
            else:
                test_lo = fold_size * fold_no + bigger_folds
                test_hi = fold_size * (fold_no + 1) + bigger_folds
            # select train and test data instances
            train_data = [
                data[idx] for ord, idx in enumerate(ordering)
                if ord < test_lo or ord >= test_hi
            ]
            test_data = [
                data[idx] for ord, idx in enumerate(ordering)
                if ord >= test_lo and ord < test_hi
            ]

            # write them out to a file (replace `all' in name with train/test)
            fname_base = os.path.basename(file)
            write_data(dir_prefix + "%02d" % fold_no, fname_base, 'train',
                       train_data)
            write_data(dir_prefix + "%02d" % fold_no, fname_base, 'test',
                       test_data)
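
The fold boundaries follow from the divmod above: the first bigger_folds folds receive one extra chunk each. A worked example with made-up sizes (23 chunks split into 10 folds):

fold_size, bigger_folds = divmod(23, 10)   # (2, 3): folds 0-2 hold 3 chunks, folds 3-9 hold 2
for fold_no in range(10):
    if fold_no < bigger_folds:
        test_lo = (fold_size + 1) * fold_no
        test_hi = (fold_size + 1) * (fold_no + 1)
    else:
        test_lo = fold_size * fold_no + bigger_folds
        test_hi = fold_size * (fold_no + 1) + bigger_folds
    print(fold_no, (test_lo, test_hi))
# 0 (0, 3), 1 (3, 6), 2 (6, 9), 3 (9, 11), ..., 9 (21, 23) -- every chunk lands in exactly one test fold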
Example #17
def seq2seq_gen(args):
    """Sequence-to-sequence generation"""

    ap = ArgumentParser(prog=' '.join(sys.argv[0:2]))

    ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for postprocessing)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    ap.add_argument('-b', '--beam-size', type=int,
                    help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files (DAs, contexts)
    das = read_das(args.da_test_file)
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn('Generator is not trained to use context, ignoring context input file.')
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = [(context, da) for context, da in zip(contexts, das)]
    elif tgen.use_context or tgen.context_bleu_weight:
        log_warn('Generator is trained to use context. ' +
                 'Using empty contexts, expect lower performance.')
        das = [([], da) for da in das]

    # generate
    log_info('Generating...')
    gen_trees = []
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        gen_trees.append(tgen.generate_tree(da))
        if num % 100 == 0:
            log_info("Generated tree %d" % num)
    log_info(tgen.get_slot_err_stats())

    # evaluate the generated trees against golden trees (delexicalized)
    eval_doc = None
    if args.eval_file and not args.eval_file.endswith('.txt'):
        eval_doc = read_ttrees(args.eval_file)
        evaler = Evaluator()
        evaler.process_eval_doc(eval_doc, gen_trees, tgen.language, args.ref_selector,
                                args.target_selector or tgen.selector)

    # lexicalize, if required
    if args.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, args.abstr_file)

    # we won't need contexts anymore, but we do need DAs
    if tgen.use_context or tgen.context_bleu_weight:
        das = [da for _, da in das]

    # evaluate the generated & lexicalized tokens (F1 and BLEU scores)
    if args.eval_file and args.eval_file.endswith('.txt'):
        eval_tokens(das, read_tokens(args.eval_file, ref_mode=True),
                    [t.to_tok_list() for t in gen_trees])

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        if args.output_file.endswith('.txt'):
            gen_toks = [t.to_tok_list() for t in gen_trees]
            postprocess_tokens(gen_toks, das)
            write_tokens(gen_toks, args.output_file)
        else:
            write_ttrees(create_ttree_doc(gen_trees, eval_doc, tgen.language,
                                          args.target_selector or tgen.selector),
                         args.output_file)
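
Because the function parses an argv-style list itself (args = ap.parse_args(args)), it can be driven directly from Python. A usage sketch with illustrative file names (a trained model is required, so the call is left commented out):

# seq2seq_gen(['-e', 'test-text.txt', '-a', 'test-abst.txt',
#              '-w', 'out-text.txt', 'model.pickle.gz', 'test-das.txt'])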
Example #18
def seq2seq_gen(args):
    """Sequence-to-sequence generation"""

    ap = ArgumentParser()

    ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for postprocessing)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    ap.add_argument('-b', '--beam-size', type=int,
                    help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files
    das = read_das(args.da_test_file)
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn('Generator is not trained to use context, ignoring context input file.')
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = [(context, da) for context, da in zip(contexts, das)]

    # generate
    log_info('Generating...')
    gen_trees = []
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        gen_trees.append(tgen.generate_tree(da))
    log_info(tgen.get_slot_err_stats())

    # evaluate the generated trees against golden trees (delexicalized)
    eval_doc = None
    if args.eval_file and not args.eval_file.endswith('.txt'):
        eval_doc = read_ttrees(args.eval_file)
        evaler = Evaluator()
        evaler.process_eval_doc(eval_doc, gen_trees, tgen.language, args.ref_selector,
                                args.target_selector or tgen.selector)

    # lexicalize, if required
    if args.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, args.abstr_file)

    # evaluate the generated & lexicalized tokens (F1 and BLEU scores)
    if args.eval_file and args.eval_file.endswith('.txt'):
        eval_tokens(das, read_tokens(args.eval_file, ref_mode=True), gen_trees)

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        if args.output_file.endswith('.txt'):
            write_tokens(gen_trees, args.output_file)
        else:
            write_ttrees(create_ttree_doc(gen_trees, eval_doc, tgen.language,
                                          args.target_selector or tgen.selector),
                         args.output_file)
Example #19
def seq2seq_gen(args):
    """Sequence-to-sequence generation"""
    def write_trees_or_tokens(output_file, das, gen_trees, base_doc, language,
                              selector):
        """Decide to write t-trees or tokens based on the output file name."""
        if output_file.endswith('.txt'):
            gen_toks = [t.to_tok_list() for t in gen_trees]
            postprocess_tokens(gen_toks, das)
            write_tokens(gen_toks, output_file)
        else:
            write_ttrees(
                create_ttree_doc(gen_trees, base_doc, language, selector),
                output_file)

    ap = ArgumentParser(prog=' '.join(sys.argv[0:2]))

    ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for postprocessing)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    ap.add_argument('-D', '--delex-output-file', type=str,
                    help='Output file for trees/text before lexicalization')
    ap.add_argument('-b', '--beam-size', type=int,
                    help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str,
                    help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files (DAs, contexts)
    das = read_das(args.da_test_file)
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn('Generator is not trained to use context, ignoring context input file.')
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = [(context, da) for context, da in zip(contexts, das)]
    elif tgen.use_context or tgen.context_bleu_weight:
        log_warn('Generator is trained to use context. ' +
                 'Using empty contexts, expect lower performance.')
        das = [([], da) for da in das]

    # generate
    log_info('Generating...')
    gen_trees = []
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        gen_trees.append(tgen.generate_tree(da))
        if num % 100 == 0:
            log_info("Generated tree %d" % num)
    log_info(tgen.get_slot_err_stats())

    if args.delex_output_file is not None:
        log_info('Writing delex output...')
        write_trees_or_tokens(args.delex_output_file, das, gen_trees, None,
                              tgen.language, args.target_selector
                              or tgen.selector)

    # evaluate the generated trees against golden trees (delexicalized)
    eval_doc = None
    if args.eval_file and not args.eval_file.endswith('.txt'):
        eval_doc = read_ttrees(args.eval_file)
        evaler = Evaluator()
        evaler.process_eval_doc(eval_doc, gen_trees, tgen.language,
                                args.ref_selector, args.target_selector
                                or tgen.selector)

    # lexicalize, if required
    if args.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, args.abstr_file)

    # we won't need contexts anymore, but we do need DAs
    if tgen.use_context or tgen.context_bleu_weight:
        das = [da for _, da in das]

    # evaluate the generated & lexicalized tokens (F1 and BLEU scores)
    if args.eval_file and args.eval_file.endswith('.txt'):
        eval_tokens(das, read_tokens(args.eval_file, ref_mode=True),
                    [t.to_tok_list() for t in gen_trees])

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        write_trees_or_tokens(args.output_file, das, gen_trees, eval_doc,
                              tgen.language, args.target_selector
                              or tgen.selector)
Example #20
"""
Evaluation (t-tree comparison functions).
"""

from __future__ import unicode_literals
from collections import defaultdict
from enum import Enum
from tgen.logf import log_debug
from tgen.logf import log_warn
from tgen.tree import TreeData, TreeNode
import numpy as np

try:
    from pytreex.core.node import T
except ImportError:
    log_warn(
        'Pytreex modules not available, will not be able to evaluate trees.')

EvalTypes = Enum(b'EvalTypes', b'TOKEN NODE DEP')
EvalTypes.__doc__ = """Evaluation flavors (tokens, tree node-only, tree dependency)"""


def collect_counts(sent, eval_type=EvalTypes.NODE):
    """Collects counts of different node/dependency types in the given t-tree.

    @param sent: the tree/sentence to collect counts from
    @param eval_type: if set to EvalTypes.NODE (default), count nodes (formemes, lemmas, dependency \
        direction), if set to EvalTypes.DEP, count dependencies (including parent's formeme, lemma, \
        dependency direction), if set to EvalTypes.TOKEN, count just word forms (in list of tokens).
    @rtype: defaultdict
    """
    counts = defaultdict(int)
Example #21
def main(argv):

    opts, files = getopt(argv, 'f:c:d:')

    folds = 10
    chunk_size = 2
    dir_prefix = 'cv'

    for opt, arg in opts:
        if opt == '-f':
            folds = int(arg)
        elif opt == '-c':
            chunk_size = int(arg)
        elif opt == '-d':
            dir_prefix = arg

    if not files:
        sys.exit(__doc__)
    
    random.seed(1206)
    ordering = None

    for file in files:
        # read all data
        data = []
        with file_stream(file) as fh:
            chunk = []
            for line in fh:
                chunk.append(line)
                if len(chunk) == chunk_size:
                    data.append(chunk)
                    chunk = []
            if chunk:
                log_warn('Incomplete chunk at end of file %s, size %d' % (file, len(chunk)))

        if ordering is None:
            # create ordering
            ordering = range(len(data))
            random.shuffle(ordering)

            # create directories
            for fold_no in xrange(folds):
                os.mkdir(dir_prefix + "%02d" % fold_no)
            
        # output as train and test into all CV portions
        fold_size, bigger_folds = divmod(len(data), folds)
        for fold_no in xrange(folds):
            # compute test data bounds
            if fold_no < bigger_folds:
                test_lo = (fold_size + 1) * fold_no
                test_hi = (fold_size + 1) * (fold_no + 1)
            else:
                test_lo = fold_size * fold_no + bigger_folds
                test_hi = fold_size * (fold_no + 1) + bigger_folds
            # select train and test data instances
            train_data = [data[idx] for ord, idx in enumerate(ordering)
                          if ord < test_lo or ord >= test_hi]
            test_data = [data[idx] for ord, idx in enumerate(ordering)
                         if ord >= test_lo and ord < test_hi]

            # write them out to a file (replace `all' in name with train/test)
            fname_base = os.path.basename(file)
            write_data(dir_prefix + "%02d" % fold_no, fname_base, 'train', train_data)
            write_data(dir_prefix + "%02d" % fold_no, fname_base, 'test', test_data)
Example #22
"""
Evaluation (t-tree comparison functions).
"""

from __future__ import unicode_literals
from collections import defaultdict
from enum import Enum
from tgen.logf import log_debug
from tgen.logf import log_warn
from tgen.tree import TreeData, TreeNode
import numpy as np

try:
    from pytreex.core.node import T
except ImportError:
    log_warn('Pytreex modules not available, will not be able to evaluate trees.')


EvalTypes = Enum(b'EvalTypes', b'TOKEN NODE DEP')
EvalTypes.__doc__ = """Evaluation flavors (tokens, tree node-only, tree dependency)"""


def collect_counts(sent, eval_type=EvalTypes.NODE):
    """Collects counts of different node/dependency types in the given t-tree.

    @param sent: the tree/sentence to collect counts from
    @param eval_type: if set to EvalTypes.NODE (default), count nodes (formemes, lemmas, dependency \
        direction), if set to EvalTypes.DEP, count dependencies (including parent's formeme, lemma, \
        dependency direction), if set to EvalTypes.TOKEN, count just word forms (in list of tokens).
    @rtype: defaultdict
    """