def write_sequence(out, sequence):
    """Write every row of ``sequence`` to ``out`` as one DataSample proto.

    Relies on names that are not parameters and must come from the
    surrounding scope: ``dicts``, ``oov_policy``, ``num_original_columns``,
    ``patterns``, ``logger``, ``DataFormat``, ``write_proto`` and the
    ``OOV_POLICY_*`` constants.

    Args:
        out: destination handed unchanged to ``write_proto``.
        sequence: iterable of rows; each row must hold exactly
            ``len(dicts)`` tokens, one per feature column.

    Raises:
        AssertionError: if a row does not have ``len(dicts)`` entries.
    """
    num_features = len(dicts)
    is_beginning = True
    for features in sequence:
        # Report the offending row itself; the previous message concatenated
        # a name (`line`) that is not defined by this function.
        assert len(features) == num_features, \
            "Wrong number of features: %r" % (features,)

        sample = DataFormat.DataSample()

        # The first `num_original_columns` columns each become one id slot.
        for i in xrange(num_original_columns):
            token_id = dicts[i].get(features[i], -1)
            if token_id != -1:
                sample.id_slots.append(token_id)
            elif oov_policy[i] == OOV_POLICY_IGNORE:
                sample.id_slots.append(0xffffffff)
            elif oov_policy[i] == OOV_POLICY_ERROR:
                # NOTE(review): nothing is appended on this path, so unless
                # logger.fatal aborts, this sample ends up one id slot
                # short -- confirm the logger's behavior.
                logger.fatal("Unknown token: %s" % features[i])
            else:
                sample.id_slots.append(0)

        if patterns:
            # The remaining columns share a single sparse vector slot. Each
            # column's ids are shifted by the combined size of the
            # dictionaries before it so the columns do not collide.
            dim = 0
            vec = sample.vector_slots.add()
            for i in xrange(num_original_columns, num_features):
                token_id = dicts[i].get(features[i], -1)
                if token_id != -1:
                    vec.ids.append(dim + token_id)
                elif oov_policy[i] == OOV_POLICY_IGNORE:
                    pass
                elif oov_policy[i] == OOV_POLICY_ERROR:
                    logger.fatal("Unknown token: %s" % features[i])
                else:
                    vec.ids.append(dim + 0)
                dim += len(dicts[i])

        # Only the first sample of the sequence carries is_beginning=True.
        sample.is_beginning = is_beginning
        is_beginning = False
        write_proto(out, sample)
@return True success, False for end of file """ buf = file.read(8) if not buf: return False result, pos = _DecodeVarint(buf, 0) buf = buf[pos:] + file.read(result - len(buf) + pos) message.ParseFromString(buf) return True def usage(): print >> sys.stderr, "Usage: python show_pb.py PROTO_DATA_FILE" exit(1) if __name__ == '__main__': if len(sys.argv) < 2: usage() f = open(sys.argv[1]) header = DataFormat.DataHeader() read_proto(f, header) print header sample = DataFormat.DataSample() while read_proto(f, sample): print sample
def gen_proto_file(input_file, dicts, oov_policy, output_file):
    """Encode a text feature file as a DataHeader followed by DataSamples.

    The input is read line by line. Every non-blank line is stripped and
    split on single spaces into one token per column; a blank line ends the
    current sequence. Each finished sequence is passed to ``make_features``
    (its return value is not used, so it presumably extends the rows in
    place -- confirm) and then written as one ``DataSample`` per row.

    Relies on the module-level names ``num_original_columns``, ``patterns``,
    ``make_features``, ``write_proto``, ``logger``, ``DataFormat`` and the
    ``OOV_POLICY_*`` constants.

    Args:
        input_file: path of the text file to read.
        dicts: one token -> id mapping per feature column; only ``.get`` and
            ``len`` are used on each entry.
        oov_policy: per-column policy, compared against OOV_POLICY_IGNORE
            and OOV_POLICY_ERROR, for tokens missing from ``dicts``.
        output_file: path of the binary file to write.

    Raises:
        AssertionError: if a row does not have ``len(dicts)`` entries once
            ``make_features`` has run.
    """
    num_features = len(dicts)

    def write_sequence(out, sequence):
        """Write every row of ``sequence`` to ``out`` as one DataSample."""
        is_beginning = True
        for features in sequence:
            # Report the offending row itself. The enclosing `line` is
            # always the empty separator line by the time this runs, so it
            # carried no information.
            assert len(features) == num_features, \
                "Wrong number of features: %r" % (features,)

            sample = DataFormat.DataSample()

            # The first `num_original_columns` columns each become one id
            # slot.
            for i in xrange(num_original_columns):
                token_id = dicts[i].get(features[i], -1)
                if token_id != -1:
                    sample.id_slots.append(token_id)
                elif oov_policy[i] == OOV_POLICY_IGNORE:
                    sample.id_slots.append(0xffffffff)
                elif oov_policy[i] == OOV_POLICY_ERROR:
                    # NOTE(review): nothing is appended on this path, so
                    # unless logger.fatal aborts, this sample ends up one id
                    # slot short -- confirm the logger's behavior.
                    logger.fatal("Unknown token: %s" % features[i])
                else:
                    sample.id_slots.append(0)

            if patterns:
                # The remaining columns share one sparse vector slot. Each
                # column's ids are shifted by the combined size of the
                # dictionaries before it, matching the header's summed dim.
                dim = 0
                vec = sample.vector_slots.add()
                for i in xrange(num_original_columns, num_features):
                    token_id = dicts[i].get(features[i], -1)
                    if token_id != -1:
                        vec.ids.append(dim + token_id)
                    elif oov_policy[i] == OOV_POLICY_IGNORE:
                        pass
                    elif oov_policy[i] == OOV_POLICY_ERROR:
                        logger.fatal("Unknown token: %s" % features[i])
                    else:
                        vec.ids.append(dim + 0)
                    dim += len(dicts[i])

            # Only the first sample of a sequence carries is_beginning=True.
            sample.is_beginning = is_beginning
            is_beginning = False
            write_proto(out, sample)

    num_sequences = 0
    # `with` closes both files even if encoding raises part-way through.
    with open(input_file, 'rb') as f, open(output_file, 'wb') as out:
        header = DataFormat.DataHeader()
        if patterns:
            # The sparse vector slot is declared first and spans all
            # non-original columns.
            slot_def = header.slot_defs.add()
            slot_def.type = DataFormat.SlotDef.VECTOR_SPARSE_NON_VALUE
            slot_def.dim = sum(
                len(dicts[i])
                for i in xrange(num_original_columns, num_features))
            logger.info("feature_dim=%s" % slot_def.dim)

        for i in xrange(num_original_columns):
            slot_def = header.slot_defs.add()
            slot_def.type = DataFormat.SlotDef.INDEX
            slot_def.dim = len(dicts[i])

        write_proto(out, header)

        sequence = []
        for line in f:
            line = line.strip()
            if line:
                sequence.append(line.split(' '))
                continue
            # A blank line terminates the current sequence.
            make_features(sequence)
            write_sequence(out, sequence)
            sequence = []
            num_sequences += 1

        if sequence:
            # The input ended without a trailing blank line; emit the
            # pending sequence instead of silently dropping it.
            make_features(sequence)
            write_sequence(out, sequence)
            num_sequences += 1

    logger.info("num_sequences=%s" % num_sequences)