@classmethod
def load_vectors(cls, filename):
    '''Load word vectors from a file.

    Args:
        filename: the name of the file that contains the word vectors.
            Comment lines start with '#'. If the first non-comment line
            contains only two integers, the first is taken as the
            vocabulary size and the second as the word embedding size
            (the same header as word2vec text format).

    Return:
        an instance of WordVectors
    '''
    at_beginning = True
    with Reader(filename) as f:
        idx = 1  # id 0 is reserved for OOV
        vectors = [[0]]  # placeholder for the OOV vector
        word2id = {'OOV': 0}
        for line in f:
            if line.startswith('#'):
                continue
            if at_beginning:
                at_beginning = False
                parts = line.strip().split()
                if len(parts) == 2:
                    # word2vec-style header: <vocab_size> <embsize>
                    embsize = int(parts[1])
                    oov = np.zeros(embsize)
                else:
                    word = parts[0]
                    vec = np.array([float(v) for v in parts[1:]])
                    embsize = len(vec)
                    oov = np.zeros(embsize)
                    oov += vec
                    vectors.append(vec)
                    word2id[word] = idx
                    idx += 1
            else:
                parts = line.strip().split(' ')
                word = parts[0]
                vec = np.array([float(v) for v in parts[1:]])
                assert vec.size == embsize
                oov += vec
                vectors.append(vec)
                word2id[word] = idx
                idx += 1
    # the OOV vector is the average of all loaded word vectors
    oov = oov / (len(vectors) - 1)
    vectors[0] = oov
    word_vectors = WordVectors(embsize)
    word_vectors._vectors = np.array(vectors).T
    word_vectors._word2id = word2id
    return word_vectors
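# Usage sketch for load_vectors (the file name and contents below are
# hypothetical, not from this repo). The loader accepts word2vec-style text:
# '#' comment lines, an optional "<vocab_size> <embsize>" header, then one
# "word v1 v2 ..." line per word; id 0 is reserved for 'OOV', whose vector
# is the average of all loaded vectors.
#
#     # vectors.txt
#     2 3
#     the 0.1 0.2 0.3
#     cat 0.4 0.5 0.6
#
# word_vectors = WordVectors.load_vectors('vectors.txt')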
def prepare_data(word_vectors=None, datafile=None):
    '''Prepare training data.

    Args:
        word_vectors: an instance of vec.wordvector
        datafile: location of data file

    Return:
        instances: a list of Instance
        word_vectors: word_vectors
        total_internal_node: total number of internal nodes
    '''
    if rank == 0:
        # broadcast word vectors
        comm.bcast(word_vectors, root=0)

        # load raw data
        with Reader(datafile) as reader:
            instance_strs = [line for line in reader]

        # send training data
        instance_num = len(instance_strs)
        esize = int(instance_num / worker_num + 0.5)
        sizes = [esize] * worker_num
        sizes[-1] = instance_num - esize * (worker_num - 1)
        offset = sizes[0]
        for i in range(1, worker_num):
            comm.send(instance_strs[offset:offset + sizes[i]], dest=i)
            offset += sizes[i]
        comm.barrier()

        local_instance_strs = instance_strs[0:sizes[0]]
        del instance_strs

        instances, internal_node_num = load_instances(local_instance_strs,
                                                      word_vectors)
        total_internal_node = comm.allreduce(internal_node_num, op=MPI.SUM)
        return instances, word_vectors, total_internal_node
    else:
        word_vectors = comm.bcast(root=0)

        # receive data
        local_instance_strs = comm.recv(source=0)
        comm.barrier()

        instances, internal_node_num = load_instances(local_instance_strs,
                                                      word_vectors)
        total_internal_node = comm.allreduce(internal_node_num, op=MPI.SUM)
        return instances, word_vectors, total_internal_node
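# The MPI variant above relies on module-level mpi4py state (comm, rank,
# worker_num, MPI). A minimal sketch of what that setup presumably looks
# like; the exact initialization in the original module may differ:
#
# from mpi4py import MPI
# comm = MPI.COMM_WORLD
# rank = comm.Get_rank()
# worker_num = comm.Get_size()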
def prepare_data(word_vectors=None, datafile=None):
    '''Prepare training data (single-process version).

    Args:
        word_vectors: an instance of vec.wordvector
        datafile: location of data file

    Return:
        instances: a list of Instance
        word_vectors: word_vectors
        internal_node_num: total number of internal nodes
    '''
    # load raw data
    with Reader(datafile) as reader:
        instance_strs = [line for line in reader]

    instances, internal_node_num = load_instances(instance_strs, word_vectors)
    return instances, word_vectors, internal_node_num
def __init__(self, filename):
    with Reader(filename) as reader:
        # rewrite "key value" lines as "key=value" so ConfigParser accepts them
        config_lines = [re.sub(ur'^([^ \[]*) ', ur'\1=', line)
                        for line in reader]
    config_str = u''.join(config_lines)
    config = ConfigParser.ConfigParser()
    config.readfp(io.StringIO(config_str))

    # decide feature order
    order = {
        'lex-sgt': 0,
        'lex-tgs': 1,
        'trans-sgt': 2,
        'trans-tgs': 3,
        'word-count': 4,
        'rule-count': 5,
        'glue-rule-count': 6,
        'lm': 7,
    }
    # set other feature orders
    # if config.has_option('switch', 'use-nn-feature'):
    #     self.use_neural_feature = config.getboolean('switch', 'use-nn-feature')
    # else:
    #     self.use_neural_feature = False
    # if self.use_neural_feature:
    #     order['nn-feature'] = len(order)
    #     if not config.has_section(self.NN_SECTION):
    #         raise NoSectionError('section "%s" is absent' % self.NN_SECTION)
    order['oov'] = len(order)  # oov must always be the last feature

    # load weights
    all_feature_names = set(order.keys())
    weights = [0] * len(order)
    for key, value in config.items('weights'):
        # DO NOT load useless weights
        # if key == 'nn-feature' and not self.use_neural_feature:
        #     continue
        if key == 'base':  # skip [DEFAULT]
            continue
        weights[order[key]] = float(value)
        all_feature_names.remove(key)
    weights[order['oov']] = -100
    # discard rather than remove: the oov weight is forced above, so the
    # config may legitimately omit it (or include it) without crashing here
    all_feature_names.discard('oov')
    if len(all_feature_names) != 0:
        msg = 'weight(s) absent for feature(s): %s' % all_feature_names
        logger.error(msg)
        raise AbsentWeightError(msg)
    self.order = order
    self.weights = weights
    logger.info('weights: %s' % self.weights)

    # load common parameters
    self.x_beta = config.getfloat('param', 'X-beta')
    self.x_beamsize = config.getint('param', 'X-beamsize')
    self.s_beta = config.getfloat('param', 'S-beta')
    self.s_beamsize = config.getint('param', 'S-beamsize')
    self.rule_beamsize = config.getint('param', 'rule-beamsize')
    self.max_X_len = config.getint('param', 'max-X-len')
    self.epsilon = config.getint('param', 'epsilon')

    # load rule table
    self.rule_table_file = config.get('data', 'rules')

    # load language model data
    self.lm_file = config.get('data', 'lm-file')
    self.lm_order = config.getint('data', 'lm-order')

    self.enable_type3_glue_rule = False
    self.raw_config = config
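# A sketch of the config file this parser expects (all values are made up;
# section and option names come from the reads above). Options are written
# "key value"; the re.sub in __init__ turns them into "key=value". The oov
# weight is forced to -100 internally, so it need not appear here:
#
#     [weights]
#     lex-sgt 0.05
#     lex-tgs 0.05
#     trans-sgt 0.1
#     trans-tgs 0.1
#     word-count -0.2
#     rule-count -0.1
#     glue-rule-count -0.1
#     lm 0.3
#
#     [param]
#     X-beta 0.0001
#     X-beamsize 30
#     S-beta 0.0001
#     S-beamsize 30
#     rule-beamsize 30
#     max-X-len 10
#     epsilon 1
#
#     [data]
#     rules /path/to/rule.table
#     lm-file /path/to/lm.arpa
#     lm-order 3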
@classmethod
def __load_rules(cls, filename, lm, config):
    '''Load the rule table from filename.

    Args:
        filename: the name of the file that stores the rules
        lm: language model
        config: an instance of Config

    Return:
        a RuleTable
    '''
    feature_num = config.get_feature_num()
    glue_rule_index = config.order['glue-rule-count']
    max_rule_num = config.rule_beamsize

    table = RuleTable()
    keys = []
    ranges = []
    idx = 0

    # glue rules
    # S -> X
    # the type 1 glue rule is not counted
    features = [0] * feature_num
    glue_rule1 = Rule('|0', ['|0'], [0], features, cls.GLUE_RULE1_GLOBAL_ID)
    table._rules.append(glue_rule1)
    keys.append(cls.GLUE_RULE1.decode('utf-8'))
    ranges.append((idx, idx))
    idx += 1

    # S -> S X
    features = [0] * feature_num
    features[glue_rule_index] = 1
    glue_rule2 = Rule('|0 |1', ['|0', '|1'], [0, 1], features,
                      cls.GLUE_RULE2_GLOBAL_ID)
    glue_rule2.score = config.weights[config.order['glue-rule-count']]
    table._rules.append(glue_rule2)
    idx += 1

    if config.enable_type3_glue_rule:
        # S -> <S X; X S>
        features = [0] * feature_num
        features[glue_rule_index] = 1
        glue_rule3 = Rule('|0 |1', ['|1', '|0'], [1, 0], features,
                          cls.GLUE_RULE3_GLOBAL_ID)
        glue_rule3.score = config.weights[config.order['glue-rule-count']]
        table._rules.append(glue_rule3)
        idx += 1

    keys.append(cls.GLUE_RULE2.decode('utf-8'))
    ranges.append((1, idx - 1))
    table.glue_rule_ids = tuple(range(idx))

    # normal rules
    with Reader(filename) as reader:
        last_src = None
        current_rules = []
        for rule_str in reader:
            parts = rule_str.strip().split(' ||| ')
            src = parts[0]
            tgt = parts[1].split(' ')
            nonterminal_pos = []
            for pos, tword in enumerate(tgt):
                if tword[0] == '|':
                    if len(nonterminal_pos) == 0:
                        nonterminal_pos.append(pos)
                    else:
                        index = int(tword[1:])
                        nonterminal_pos.insert(index, pos)
            features = [float(f) for f in parts[2].split(' ')]
            features.append(len(tgt) - len(nonterminal_pos))  # word number
            features.append(1)  # rule count
            features.append(0)  # glue rule count
            if len(parts) >= 4:
                global_rule_id = int(parts[3])
                rule = Rule(src, tgt, nonterminal_pos, features, global_rule_id)
            else:
                rule = Rule(src, tgt, nonterminal_pos, features, idx)
            lmscore, hlmscore = cls.__get_lm_scores(rule, lm)
            features.append(lmscore)  # lm score
            rule.hlmscore = hlmscore
            if last_src is None or src == last_src:
                current_rules.append(rule)
                last_src = src
            else:
                # a new source side starts: flush the previous group
                cls.__update_table(table, keys, ranges, last_src,
                                   current_rules, config, max_rule_num)
                current_rules = [rule]
                last_src = src
            idx += 1
    # flush the last group of rules
    cls.__update_table(table, keys, ranges, last_src, current_rules,
                       config, max_rule_num)

    table._idranges = RecordTrie('<II', zip(keys, ranges))
    del keys
    del ranges
    gc.collect()
    return table
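# Rule table line format implied by the parser above (the example values are
# hypothetical): four ' ||| '-separated fields, the last one optional.
# Nonterminals on the target side are written |0, |1, ...; the trailing
# integer, when present, is the rule's global id. The file is assumed to be
# grouped by source side, since consecutive lines sharing a source are
# collected into one group.
#
#     zhi yao |0 ||| as long as |0 ||| 0.5 0.3 0.2 0.4 ||| 17
#
# Fields: source ||| target ||| feature scores ||| global rule id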
word_vectors = WordVectors.load_vectors(word_vector_file)
embsize = word_vectors.embsize()

print >> stderr, 'load RAE parameters...'
theta = unpickle(theta_file)
rae = RecursiveAutoencoder.build(theta, embsize)

total_cost = 0
total_instance_num = 0
total_internal_node_num = 0

print '=' * 63
print '%20s %20s %20s' % ('all', 'avg/node', 'internal node')
print '-' * 63
with Reader(phrases_file) as reader, Writer(output_file) as writer:
    for phrase in reader:
        instance = Instance.parse_from_str(phrase, word_vectors)
        words_embedded = word_vectors[instance.words]
        root_node, cost = rae.forward(words_embedded)
        vec = root_node.p.T[0]  # convert the n*1 matrix into a flat vector
        writer.write(' '.join([str(vec[i]) for i in range(vec.size)]))
        writer.write('\n')

        # a binary tree over n words has n-1 internal nodes
        internal_node_num = len(instance.words) - 1
        if internal_node_num > 0:
            print '%20.8f, %20.8f, %18d' % (cost, cost / internal_node_num,
                                            internal_node_num)
        else:
            print '%20.8f, %20.8f, %18d' % (cost, cost, 0)
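# Reading the embeddings back (a sketch; 'output.txt' stands in for whatever
# output_file was): each output line holds one phrase vector, so the file
# parses as a dense (num_phrases x embsize) matrix.
#
# import numpy as np
# embeddings = np.loadtxt('output.txt')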
logging.config.dictConfig(config_)

k = options.kbest
drop_oov = options.drop_oov
debug = options.debug
output_features = options.features
checking = options.checking
expend_loser = options.expend_loser
with_rule_tree = options.with_rule_tree
threads = options.threads
logger.info('process num: %d' % threads)

if options.input == '-':
    source = sys.stdin  # TODO encoding
else:
    source = Reader(options.input)
if options.output == '-':
    writer = sys.stdout
else:
    writer = Writer(options.output)

if debug:
    rules.DEBUG = 1

config = Config(options.config)
if logger.level <= logging.INFO:
    config.write(sys.stderr)
lm = LanguageModel(config.lm_file, config.lm_order)
rule_table = RuleTable.load(config.rule_table_file, lm, config)