def _init_server(self):
    """Initializes a server that registers new workers."""
    registrar_class, ranker_dump_path = get_worker_registrar_for(self)
    n_tries = 0
    self.server = None
    last_error = None
    while self.server is None and n_tries < 10:
        try:
            n_tries += 1
            self.server = ThreadPoolServer(service=registrar_class, nbThreads=1, port=self.port)
        except socket.error as e:
            log_warn('Port %d in use, trying to use a higher port...' % self.port)
            self.port += 1
            last_error = e
    if self.server is None:
        if last_error is not None:
            raise last_error
        raise Exception('Could not initialize server')
    self.services = set()
    self.free_services = deque()
    self.pending_requests = set()
    self.jobs = []
    self.server_thread = Thread(target=self.server.start)
    self.server_thread.setDaemon(True)
    self.server_thread.start()
    self.ranker_dump_path = ranker_dump_path

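# The same retry-on-busy-port pattern in isolation -- a minimal standalone sketch using only
# the Python 3 standard library (the method above wraps an RPyC ThreadPoolServer instead);
# the starting port and the retry count below are hypothetical.
import socket
import socketserver


def bind_with_fallback(start_port, max_tries=10):
    """Try to bind a TCP server, moving to the next port whenever one is already taken."""
    port = start_port
    for _ in range(max_tries):
        try:
            return port, socketserver.TCPServer(('', port), socketserver.BaseRequestHandler)
        except socket.error:  # OSError in Python 3: address already in use
            port += 1
    raise Exception('Could not initialize server')

# port, server = bind_with_fallback(25125)  # hypothetical starting port
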
def delete(self):
    """Delete this job."""
    if self.submitted:
        try:
            self.__try_command('qdel ' + self.jobid)
        except RuntimeError as e:
            log_warn('Could not delete job: ' + str(e))

def _get_merged_child_type_cdfs(self, da): """Get merged child CDFs (i.e. lists of possible children, given parent IDs) for the given DA. All nodes occurring in training data items that contain DAIs from the current DA are included. If `compatible_dais` is set, nodes that always occur with DAIs not in the current DA will be excluded. @param da: the current dialogue act """ # get all nodes occurring in training data items containing the DAIs from the current DA merged_counts = defaultdict(Counter) for dai in da: try: for parent_id in self.child_type_counts[dai]: merged_counts[parent_id].update(self.child_type_counts[dai][parent_id]) except KeyError: log_warn('DAI ' + unicode(dai) + ' unknown, adding nothing to CDF.') # log_info('Node types: %d' % sum(len(c.keys()) for c in merged_counts.values())) # remove nodes that are not compatible with the current DA (their list of # minimum compatibility DAIs is not similar to the current DA) for _, counts in merged_counts.items(): for node in counts.keys(): if not self._compatible(da, NodeData(t_lemma=node[1], formeme=node[0])): del counts[node] # log_info('Node types after pruning: %d' % sum(len(c.keys()) for c in merged_counts.values())) # log_info('Compatible lemmas: %s' % ' '.join(set([n[1] for c in merged_counts.values() # for n in c.keys()]))) return self.cdfs_from_counts(merged_counts)
def load_from_file(model_fname):
    """Load the generator from a file (actually two files, one for configuration and one
    for the TensorFlow graph, which must be stored separately).

    @param model_fname: file name (for the configuration file); TF graph must be stored with a \
        different extension
    """
    log_info("Loading generator from %s..." % model_fname)
    with file_stream(model_fname, 'rb', encoding=None) as fh:
        data = pickle.load(fh)
        ret = Seq2SeqGen(cfg=data['cfg'])
        ret.load_all_settings(data)

    if ret.classif_filter:
        classif_filter_fname = re.sub(r'((.pickle)?(.gz)?)$', r'.tftreecl\1', model_fname)
        if os.path.isfile(classif_filter_fname):
            ret.classif_filter = RerankingClassifier.load_from_file(classif_filter_fname)
        else:
            log_warn("Classification filter data not found, ignoring.")
            ret.classif_filter = False

    # re-build TF graph and restore the TF session
    tf_session_fname = re.sub(r'(.pickle)?(.gz)?$', '.tfsess', model_fname)
    ret._init_neural_network()
    ret.saver.restore(ret.session, tf_session_fname)
    return ret

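# Hedged usage sketch (file names are hypothetical): the '.tfsess' TensorFlow session dump,
# and the '.tftreecl*' reranker dump if one was trained, are expected to sit next to the
# pickled configuration file; generate_tree and read_das are used as in the CLI snippets below.
gen = Seq2SeqGen.load_from_file('models/seq2seq.pickle.gz')
gen_trees = [gen.generate_tree(da) for da in read_das('test-das.txt')]
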
def save_model(self, model_fname_pattern):
    if not self._trained_model:
        log_warn('No lexicalizer model trained, skipping saving!')
        return
    model_fname = re.sub(r'(\.pickle)?(\.gz)?$', '.kenlm.bin', model_fname_pattern)
    shutil.copyfile(self._trained_model, model_fname)

def save_model(self, model_fname_pattern):
    if not self._word_freq:
        log_warn('No lexicalizer model trained, skipping saving!')
        return
    model_fname = re.sub(r'(.pickle)?(.gz)?$', '.wfreq', model_fname_pattern)
    with file_stream(model_fname, 'wb', encoding=None) as fh:
        pickle.dump(self._word_freq, fh, pickle.HIGHEST_PROTOCOL)

def tf_check_filename(self, fname):
    """Checks if a directory is specified in the file name (otherwise newer TF versions
    would crash when saving a model).

    @param fname: The file name to be checked.
    @return: Adjusted file name (with "./" if no directory was specified).
    """
    if not os.path.dirname(fname):
        log_warn("Directory not specified, using current directory: %s" % fname)
        fname = os.path.join(os.curdir, fname)
    return fname

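# Quick illustration of the underlying check (the file name is hypothetical): a bare name
# has no directory component, so it gets prefixed with the current directory
# ('./' on POSIX, '.\' on Windows).
import os

assert os.path.dirname('model.tfsess') == ''
assert os.path.join(os.curdir, 'model.tfsess') == os.curdir + os.sep + 'model.tfsess'
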
def create_ttree(self):
    """Convert the TreeData structure to a regular t-tree."""
    tnodes = [T(data={'ord': 0})] + [T(data={'t_lemma': node.t_lemma,
                                             'formeme': node.formeme,
                                             'ord': i})
                                     for i, node in enumerate(self.nodes[1:], start=1)]
    for parent_idx, tnode in zip(self.parents[1:], tnodes[1:]):
        try:
            tnode.parent = tnodes[parent_idx]
        except RuntimeException as e:
            # if a cycle is attempted, the node will hang on technical root
            log_warn("Error when creating t-tree: %s\nTree: %s" % (str(e), str(self)))
    return tnodes[0]

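# Toy illustration of the parent-index representation the conversion walks over (data are
# hypothetical): parents[i] holds the index of node i's parent, with index 0 reserved for
# the technical root and parents[0] a dummy value.
parents = [-1, 0, 0, 2]          # nodes 1 and 2 hang under the root, node 3 under node 2
children = {i: [] for i in range(len(parents))}
for idx, parent_idx in enumerate(parents[1:], start=1):
    children[parent_idx].append(idx)
# children == {0: [1, 2], 1: [], 2: [3], 3: []}
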
def get_merged_limits(self, da):
    """Return merged limits on node counts (total and on each tree level). Uses a maximum
    for all DAIs in the given DA. Returns None if the given candidate generator does not
    have any node limits.

    @param da: the current dialogue act
    @rtype: defaultdict(int)
    """
    if not self.node_limits:
        return None
    merged_limits = defaultdict(int)
    for dai in da:
        try:
            for level, level_limit in self.node_limits[dai].iteritems():
                merged_limits[level] = max((level_limit, merged_limits[level]))
        except KeyError:
            log_warn('DAI ' + unicode(dai) + ' unknown, limits unchanged.')
    return merged_limits

def get_merged_limits(self, da):
    """Return merged limits on node counts (total and on each tree level). Uses a maximum
    for all DAIs in the given DA. Returns None if the given candidate generator does not
    have any node limits.

    @param da: the current dialogue act
    @rtype: defaultdict(int)
    """
    if not self.node_limits:
        return None
    merged_limits = defaultdict(int)
    for dai in da:
        try:
            for level, level_limit in self.node_limits[dai].items():
                merged_limits[level] = max((level_limit, merged_limits[level]))
        except KeyError:
            log_warn('DAI ' + str(dai) + ' unknown, limits unchanged.')
    return merged_limits

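# Toy illustration of the max-merge (all keys and numbers are hypothetical): with per-DAI
# limits stored as {tree_level: max_nodes}, the merged limit for each level is the maximum
# over all DAIs of the current DA.
from collections import defaultdict

node_limits = {'inform(food=Chinese)': {1: 2, 2: 4},
               'inform(area=centre)': {1: 3}}
merged = defaultdict(int)
for dai_limits in node_limits.values():
    for level, limit in dai_limits.items():
        merged[level] = max(limit, merged[level])
# merged == {1: 3, 2: 4}
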
def _get_merged_child_type_cdfs(self, da): """Get merged child CDFs (i.e. lists of possible children, given parent IDs) for the given DA. All nodes occurring in training data items that contain DAIs from the current DA are included. If `compatible_dais` is set, nodes that always occur with DAIs not in the current DA will be excluded. @param da: the current dialogue act """ # get all nodes occurring in training data items containing the DAIs from the current DA merged_counts = defaultdict(Counter) for dai in da: try: for parent_id in self.child_type_counts[dai]: merged_counts[parent_id].update( self.child_type_counts[dai][parent_id]) except KeyError: log_warn('DAI ' + str(dai) + ' unknown, adding nothing to CDF.') # log_info('Node types: %d' % sum(len(c.keys()) for c in merged_counts.values())) # remove nodes that are not compatible with the current DA (their list of # minimum compatibility DAIs is not similar to the current DA) for _, counts in list(merged_counts.items()): for node in list(counts.keys()): if not self._compatible( da, NodeData(t_lemma=node[1], formeme=node[0])): del counts[node] # log_info('Node types after pruning: %d' % sum(len(c.keys()) for c in merged_counts.values())) # log_info('Compatible lemmas: %s' % ' '.join(set([n[1] for c in merged_counts.values() # for n in c.keys()]))) return self.cdfs_from_counts(merged_counts)
def seq2seq_gen(args):
    """Sequence-to-sequence generation"""

    ap = ArgumentParser()

    ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for tokens only)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    ap.add_argument('-b', '--beam-size', type=int, help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str, help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files
    das = read_das(args.da_test_file)
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn('Generator is not trained to use context, ignoring context input file.')
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = [(context, da) for context, da in zip(contexts, das)]

    # prepare evaluation
    if args.eval_file is None or args.eval_file.endswith('.txt'):  # just tokens
        gen_doc = []
    else:  # trees: depending on PyTreex
        from pytreex.core.document import Document
        eval_doc = read_ttrees(args.eval_file)
        if args.ref_selector == args.target_selector:
            gen_doc = Document()
        else:
            gen_doc = eval_doc

    if args.eval_file:
        tgen.init_slot_err_stats()

    # generate
    log_info('Generating...')
    tgen.selector = args.target_selector  # override target selector for generation
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        tgen.generate_tree(da, gen_doc)

    # evaluate
    if args.eval_file is not None:
        log_info(tgen.get_slot_err_stats())
        # evaluate the generated tokens (F1 and BLEU scores)
        if args.eval_file.endswith('.txt'):
            lexicalize_tokens(gen_doc, lexicalization_from_doc(args.abstr_file))
            eval_tokens(das, read_tokens(args.eval_file, ref_mode=True), gen_doc)
        # evaluate the generated trees against golden trees
        else:
            eval_trees(das,
                       ttrees_from_doc(eval_doc, tgen.language, args.ref_selector),
                       ttrees_from_doc(gen_doc, tgen.language, args.target_selector),
                       eval_doc, tgen.language, tgen.selector)

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        if args.output_file.endswith('.txt'):
            write_tokens(gen_doc, args.output_file)
        else:
            write_ttrees(gen_doc, args.output_file)

def main(argv):

    opts, files = getopt(argv, 'f:c:d:')
    folds = 10
    chunk_size = 2
    dir_prefix = 'cv'

    for opt, arg in opts:
        if opt == '-f':
            folds = int(arg)
        elif opt == '-c':
            chunk_size = int(arg)
        elif opt == '-d':
            dir_prefix = arg

    if not files:
        sys.exit(__doc__)

    random.seed(1206)
    ordering = None

    for file in files:
        # read all data
        data = []
        with file_stream(file) as fh:
            chunk = []
            for line in fh:
                chunk.append(line)
                if len(chunk) == chunk_size:
                    data.append(chunk)
                    chunk = []
            if chunk:
                log_warn('Incomplete chunk at end of file %s, size %d' % (file, len(chunk)))

        if ordering is None:
            # create ordering
            ordering = range(len(data))
            random.shuffle(ordering)
            # create directories
            for fold_no in xrange(folds):
                os.mkdir(dir_prefix + "%02d" % fold_no)

        # output as train and test into all CV portions
        fold_size, bigger_folds = divmod(len(data), folds)
        for fold_no in xrange(folds):
            # compute test data bounds
            if fold_no < bigger_folds:
                test_lo = (fold_size + 1) * fold_no
                test_hi = (fold_size + 1) * (fold_no + 1)
            else:
                test_lo = fold_size * fold_no + bigger_folds
                test_hi = fold_size * (fold_no + 1) + bigger_folds
            # select train and test data instances
            train_data = [data[idx] for ord, idx in enumerate(ordering)
                          if ord < test_lo or ord >= test_hi]
            test_data = [data[idx] for ord, idx in enumerate(ordering)
                         if ord >= test_lo and ord < test_hi]
            # write them out to a file (replace `all' in name with train/test)
            fname_base = os.path.basename(file)
            write_data(dir_prefix + "%02d" % fold_no, fname_base, 'train', train_data)
            write_data(dir_prefix + "%02d" % fold_no, fname_base, 'test', test_data)

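# How the fold sizes work out (the data size is hypothetical): divmod spreads the remainder
# over the first few folds, so e.g. 103 chunks split into 10 folds gives 3 folds of 11 chunks
# followed by 7 folds of 10 chunks.
fold_size, bigger_folds = divmod(103, 10)  # fold_size == 10, bigger_folds == 3
sizes = [fold_size + 1 if fold_no < bigger_folds else fold_size for fold_no in range(10)]
assert sizes == [11, 11, 11, 10, 10, 10, 10, 10, 10, 10]
assert sum(sizes) == 103
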
def seq2seq_gen(args):
    """Sequence-to-sequence generation"""

    ap = ArgumentParser(prog=' '.join(sys.argv[0:2]))

    ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for postprocessing)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    ap.add_argument('-b', '--beam-size', type=int, help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str, help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files (DAs, contexts)
    das = read_das(args.da_test_file)
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn('Generator is not trained to use context, ignoring context input file.')
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = [(context, da) for context, da in zip(contexts, das)]
    elif tgen.use_context or tgen.context_bleu_weight:
        log_warn('Generator is trained to use context. ' +
                 'Using empty contexts, expect lower performance.')
        das = [([], da) for da in das]

    # generate
    log_info('Generating...')
    gen_trees = []
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        gen_trees.append(tgen.generate_tree(da))
        if num % 100 == 0:
            log_info("Generated tree %d" % num)
    log_info(tgen.get_slot_err_stats())

    # evaluate the generated trees against golden trees (delexicalized)
    eval_doc = None
    if args.eval_file and not args.eval_file.endswith('.txt'):
        eval_doc = read_ttrees(args.eval_file)
        evaler = Evaluator()
        evaler.process_eval_doc(eval_doc, gen_trees, tgen.language, args.ref_selector,
                                args.target_selector or tgen.selector)

    # lexicalize, if required
    if args.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, args.abstr_file)

    # we won't need contexts anymore, but we do need DAs
    if tgen.use_context or tgen.context_bleu_weight:
        das = [da for _, da in das]

    # evaluate the generated & lexicalized tokens (F1 and BLEU scores)
    if args.eval_file and args.eval_file.endswith('.txt'):
        eval_tokens(das, read_tokens(args.eval_file, ref_mode=True),
                    [t.to_tok_list() for t in gen_trees])

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        if args.output_file.endswith('.txt'):
            gen_toks = [t.to_tok_list() for t in gen_trees]
            postprocess_tokens(gen_toks, das)
            write_tokens(gen_toks, args.output_file)
        else:
            write_ttrees(create_ttree_doc(gen_trees, eval_doc, tgen.language,
                                          args.target_selector or tgen.selector),
                         args.output_file)

def seq2seq_gen(args):
    """Sequence-to-sequence generation"""

    ap = ArgumentParser()

    ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for postprocessing)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    ap.add_argument('-b', '--beam-size', type=int, help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str, help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files
    das = read_das(args.da_test_file)
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn('Generator is not trained to use context, ignoring context input file.')
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = [(context, da) for context, da in zip(contexts, das)]

    # generate
    log_info('Generating...')
    gen_trees = []
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        gen_trees.append(tgen.generate_tree(da))
    log_info(tgen.get_slot_err_stats())

    # evaluate the generated trees against golden trees (delexicalized)
    eval_doc = None
    if args.eval_file and not args.eval_file.endswith('.txt'):
        eval_doc = read_ttrees(args.eval_file)
        evaler = Evaluator()
        evaler.process_eval_doc(eval_doc, gen_trees, tgen.language, args.ref_selector,
                                args.target_selector or tgen.selector)

    # lexicalize, if required
    if args.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, args.abstr_file)

    # evaluate the generated & lexicalized tokens (F1 and BLEU scores)
    if args.eval_file and args.eval_file.endswith('.txt'):
        eval_tokens(das, read_tokens(args.eval_file, ref_mode=True), gen_trees)

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        if args.output_file.endswith('.txt'):
            write_tokens(gen_trees, args.output_file)
        else:
            write_ttrees(create_ttree_doc(gen_trees, eval_doc, tgen.language,
                                          args.target_selector or tgen.selector),
                         args.output_file)

def seq2seq_gen(args):
    """Sequence-to-sequence generation"""

    def write_trees_or_tokens(output_file, das, gen_trees, base_doc, language, selector):
        """Decide to write t-trees or tokens based on the output file name."""
        if output_file.endswith('.txt'):
            gen_toks = [t.to_tok_list() for t in gen_trees]
            postprocess_tokens(gen_toks, das)
            write_tokens(gen_toks, output_file)
        else:
            write_ttrees(create_ttree_doc(gen_trees, base_doc, language, selector),
                         output_file)

    ap = ArgumentParser(prog=' '.join(sys.argv[0:2]))

    ap.add_argument('-e', '--eval-file', type=str, help='A ttree/text file for evaluation')
    ap.add_argument('-a', '--abstr-file', type=str,
                    help='Lexicalization file (a.k.a. abstraction instructions, for postprocessing)')
    ap.add_argument('-r', '--ref-selector', type=str, default='',
                    help='Selector for reference trees in the evaluation file')
    ap.add_argument('-t', '--target-selector', type=str, default='',
                    help='Target selector for generated trees in the output file')
    ap.add_argument('-d', '--debug-logfile', type=str, help='Debug output file name')
    ap.add_argument('-w', '--output-file', type=str, help='Output tree/text file')
    ap.add_argument('-D', '--delex-output-file', type=str,
                    help='Output file for trees/text before lexicalization')
    ap.add_argument('-b', '--beam-size', type=int, help='Override beam size for beam search decoding')
    ap.add_argument('-c', '--context-file', type=str, help='Input ttree/text file with context utterances')

    ap.add_argument('seq2seq_model_file', type=str, help='Trained Seq2Seq generator model')
    ap.add_argument('da_test_file', type=str, help='Input DAs for generation')

    args = ap.parse_args(args)

    if args.debug_logfile:
        set_debug_stream(file_stream(args.debug_logfile, mode='w'))

    # load the generator
    tgen = Seq2SeqBase.load_from_file(args.seq2seq_model_file)
    if args.beam_size is not None:
        tgen.beam_size = args.beam_size

    # read input files (DAs, contexts)
    das = read_das(args.da_test_file)
    if args.context_file:
        if not tgen.use_context and not tgen.context_bleu_weight:
            log_warn('Generator is not trained to use context, ignoring context input file.')
        else:
            if args.context_file.endswith('.txt'):
                contexts = read_tokens(args.context_file)
            else:
                contexts = tokens_from_doc(read_ttrees(args.context_file),
                                           tgen.language, tgen.selector)
            das = [(context, da) for context, da in zip(contexts, das)]
    elif tgen.use_context or tgen.context_bleu_weight:
        log_warn('Generator is trained to use context. ' +
                 'Using empty contexts, expect lower performance.')
        das = [([], da) for da in das]

    # generate
    log_info('Generating...')
    gen_trees = []
    for num, da in enumerate(das, start=1):
        log_debug("\n\nTREE No. %03d" % num)
        gen_trees.append(tgen.generate_tree(da))
        if num % 100 == 0:
            log_info("Generated tree %d" % num)
    log_info(tgen.get_slot_err_stats())

    if args.delex_output_file is not None:
        log_info('Writing delex output...')
        write_trees_or_tokens(args.delex_output_file, das, gen_trees, None, tgen.language,
                              args.target_selector or tgen.selector)

    # evaluate the generated trees against golden trees (delexicalized)
    eval_doc = None
    if args.eval_file and not args.eval_file.endswith('.txt'):
        eval_doc = read_ttrees(args.eval_file)
        evaler = Evaluator()
        evaler.process_eval_doc(eval_doc, gen_trees, tgen.language, args.ref_selector,
                                args.target_selector or tgen.selector)

    # lexicalize, if required
    if args.abstr_file and tgen.lexicalizer:
        log_info('Lexicalizing...')
        tgen.lexicalize(gen_trees, args.abstr_file)

    # we won't need contexts anymore, but we do need DAs
    if tgen.use_context or tgen.context_bleu_weight:
        das = [da for _, da in das]

    # evaluate the generated & lexicalized tokens (F1 and BLEU scores)
    if args.eval_file and args.eval_file.endswith('.txt'):
        eval_tokens(das, read_tokens(args.eval_file, ref_mode=True),
                    [t.to_tok_list() for t in gen_trees])

    # write output .yaml.gz or .txt
    if args.output_file is not None:
        log_info('Writing output...')
        write_trees_or_tokens(args.output_file, das, gen_trees, eval_doc, tgen.language,
                              args.target_selector or tgen.selector)

""" Evaluation (t-tree comparison functions). """ from __future__ import unicode_literals from collections import defaultdict from enum import Enum from tgen.logf import log_debug from tgen.logf import log_warn from tgen.tree import TreeData, TreeNode import numpy as np try: from pytreex.core.node import T except ImportError: log_warn( 'Pytreex modules not available, will not be able to evaluate trees.') EvalTypes = Enum(b'EvalTypes', b'TOKEN NODE DEP') EvalTypes.__doc__ = """Evaluation flavors (tokens, tree node-only, tree dependency)""" def collect_counts(sent, eval_type=EvalTypes.NODE): """Collects counts of different node/dependency types in the given t-tree. @param sent: the tree/sentence to collect counts from @param eval_type: if set to EvalTypes.NODE (default), count nodes (formemes, lemmas, dependency \ direction), if set to EvalTypes.DEP, count dependencies (including parent's formeme, lemma, \ dependency direction), if set to EvalTypes.TOKEN, count just word forms (in list of tokens). @rtype: defaultdict """ counts = defaultdict(int)
""" Evaluation (t-tree comparison functions). """ from __future__ import unicode_literals from collections import defaultdict from enum import Enum from tgen.logf import log_debug from tgen.logf import log_warn from tgen.tree import TreeData, TreeNode import numpy as np try: from pytreex.core.node import T except ImportError: log_warn('Pytreex modules not available, will not be able to evaluate trees.') EvalTypes = Enum(b'EvalTypes', b'TOKEN NODE DEP') EvalTypes.__doc__ = """Evaluation flavors (tokens, tree node-only, tree dependency)""" def collect_counts(sent, eval_type=EvalTypes.NODE): """Collects counts of different node/dependency types in the given t-tree. @param sent: the tree/sentence to collect counts from @param eval_type: if set to EvalTypes.NODE (default), count nodes (formemes, lemmas, dependency \ direction), if set to EvalTypes.DEP, count dependencies (including parent's formeme, lemma, \ dependency direction), if set to EvalTypes.TOKEN, count just word forms (in list of tokens). @rtype: defaultdict """