Exemple #1
0
    def write_sequence(out, sequence):
        """Write every record of ``sequence`` to ``out`` as a DataSample.

        Args:
            out: Destination handed unchanged to ``write_proto``.
            sequence: Iterable of feature lists. Each list must contain
                exactly ``len(dicts)`` entries, one per dictionary.

        Raises:
            AssertionError: If a record has the wrong number of features.

        The first ``num_original_columns`` features are looked up in their
        dictionaries and appended to ``sample.id_slots``. When ``patterns``
        is truthy, the remaining features are appended to the ``ids`` of a
        single vector slot, each shifted by the combined size of the
        dictionaries that precede it in that range. Only the first sample
        written has ``is_beginning`` set to True.
        """
        num_features = len(dicts)
        is_beginning = True
        for features in sequence:
            # Report the record being validated; the message must not depend
            # on a ``line`` variable that this function never defines.
            assert len(features) == num_features, \
                "Wrong number of features: " + " ".join(map(str, features))
            sample = DataFormat.DataSample()
            for i in xrange(num_original_columns):
                token_id = dicts[i].get(features[i], -1)
                if token_id != -1:
                    sample.id_slots.append(token_id)
                elif oov_policy[i] == OOV_POLICY_IGNORE:
                    # Out-of-vocabulary token: store the all-ones marker.
                    sample.id_slots.append(0xffffffff)
                elif oov_policy[i] == OOV_POLICY_ERROR:
                    # NOTE(review): nothing is appended for this column, so
                    # this relies on logger.fatal aborting -- confirm.
                    logger.fatal("Unknown token: %s" % features[i])
                else:
                    # Any other policy maps the unknown token to id 0.
                    sample.id_slots.append(0)

            if patterns:
                # Running offset of the current dictionary inside the
                # concatenated id space of the vector slot.
                dim = 0
                vec = sample.vector_slots.add()
                for i in xrange(num_original_columns, num_features):
                    token_id = dicts[i].get(features[i], -1)
                    if token_id != -1:
                        vec.ids.append(dim + token_id)
                    elif oov_policy[i] == OOV_POLICY_IGNORE:
                        pass
                    elif oov_policy[i] == OOV_POLICY_ERROR:
                        logger.fatal("Unknown token: %s" % features[i])
                    else:
                        # Unknown token falls back to entry 0 of this
                        # dictionary's range.
                        vec.ids.append(dim)

                    dim += len(dicts[i])

            sample.is_beginning = is_beginning
            is_beginning = False
            write_proto(out, sample)
Exemple #2
0
    @return True on success, False on end of file
    """

    buf = file.read(8)
    if not buf:
        return False
    result, pos = _DecodeVarint(buf, 0)
    buf = buf[pos:] + file.read(result - len(buf) + pos)
    message.ParseFromString(buf)

    return True


def usage():
    """Print the command-line usage to stderr and exit with status 1.

    Raises:
        SystemExit: Always, with exit code 1.
    """
    # sys.stderr.write behaves the same on Python 2 and 3, unlike the
    # ``print >>`` statement form.
    sys.stderr.write("Usage: python show_pb.py PROTO_DATA_FILE\n")
    # sys.exit is always available; the bare ``exit`` helper is injected by
    # the ``site`` module and is meant for interactive sessions.
    sys.exit(1)


# Script entry point: dump the header and every sample of a proto data file.
if __name__ == '__main__':
    if len(sys.argv) < 2:
        usage()

    # Open in binary mode because read_proto parses raw message bytes, and
    # use a context manager so the handle is closed even if parsing fails.
    with open(sys.argv[1], 'rb') as f:
        header = DataFormat.DataHeader()
        read_proto(f, header)
        print(header)

        # The same message object is reused for every record; read_proto
        # returns False once the file is exhausted.
        sample = DataFormat.DataSample()
        while read_proto(f, sample):
            print(sample)
Exemple #3
0
def gen_proto_file(
        input_file,
        dicts,
        oov_policy,
        output_file):
    """Convert a whitespace-separated sequence file into proto records.

    The input is read line by line. Each non-blank line is stripped and
    split on single spaces into one record; a blank line ends the current
    sequence. Each finished sequence is passed to ``make_features`` and then
    written out. A sequence still pending at end of file is written as well,
    so the input does not need to end with a blank line.

    Args:
        input_file: Path of the file to read.
        dicts: Indexable collection of dictionaries, one per feature column,
            each supporting ``get`` and ``len``.
        oov_policy: Indexable collection giving the out-of-vocabulary policy
            for each feature column.
        output_file: Path of the file to write; a DataHeader is written
            first, followed by one DataSample per record.

    Raises:
        AssertionError: If a record does not have ``len(dicts)`` features.
    """

    def write_sequence(out, sequence):
        """Write every record of ``sequence`` to ``out`` as a DataSample."""
        num_features = len(dicts)
        is_beginning = True
        for features in sequence:
            # Report the offending record itself. The enclosing ``line``
            # variable holds the blank separator when this runs, so it
            # carries no useful information.
            assert len(features) == num_features, \
                "Wrong number of features: " + " ".join(map(str, features))
            sample = DataFormat.DataSample()
            for i in xrange(num_original_columns):
                token_id = dicts[i].get(features[i], -1)
                if token_id != -1:
                    sample.id_slots.append(token_id)
                elif oov_policy[i] == OOV_POLICY_IGNORE:
                    # Out-of-vocabulary token: store the all-ones marker.
                    sample.id_slots.append(0xffffffff)
                elif oov_policy[i] == OOV_POLICY_ERROR:
                    # NOTE(review): nothing is appended for this column, so
                    # this relies on logger.fatal aborting -- confirm.
                    logger.fatal("Unknown token: %s" % features[i])
                else:
                    # Any other policy maps the unknown token to id 0.
                    sample.id_slots.append(0)

            if patterns:
                # Running offset of the current dictionary inside the
                # concatenated id space of the vector slot.
                dim = 0
                vec = sample.vector_slots.add()
                for i in xrange(num_original_columns, num_features):
                    token_id = dicts[i].get(features[i], -1)
                    if token_id != -1:
                        vec.ids.append(dim + token_id)
                    elif oov_policy[i] == OOV_POLICY_IGNORE:
                        pass
                    elif oov_policy[i] == OOV_POLICY_ERROR:
                        logger.fatal("Unknown token: %s" % features[i])
                    else:
                        # Unknown token falls back to entry 0 of this
                        # dictionary's range.
                        vec.ids.append(dim)

                    dim += len(dicts[i])

            sample.is_beginning = is_beginning
            is_beginning = False
            write_proto(out, sample)

    header = DataFormat.DataHeader()
    if patterns:
        # One sparse vector slot spans all pattern dictionaries, so its
        # dimension is the sum of their sizes.
        slot_def = header.slot_defs.add()
        slot_def.type = DataFormat.SlotDef.VECTOR_SPARSE_NON_VALUE
        slot_def.dim = sum(len(dicts[i])
                           for i in xrange(num_original_columns, len(dicts)))
        logger.info("feature_dim=%s" % slot_def.dim)

    for i in xrange(num_original_columns):
        slot_def = header.slot_defs.add()
        slot_def.type = DataFormat.SlotDef.INDEX
        slot_def.dim = len(dicts[i])

    num_sequences = 0
    sequence = []
    # Context managers close both files even if a record fails validation.
    with open(input_file, 'rb') as f, open(output_file, 'wb') as out:
        write_proto(out, header)

        for line in f:
            line = line.strip()
            if not line:
                # A blank line terminates the current sequence.
                make_features(sequence)
                write_sequence(out, sequence)
                sequence = []
                num_sequences += 1
                continue
            sequence.append(line.split(' '))

        # Flush a trailing sequence that was not followed by a blank line;
        # without this the last sequence of such a file would be lost.
        if sequence:
            make_features(sequence)
            write_sequence(out, sequence)
            num_sequences += 1

    logger.info("num_sequences=%s" % num_sequences)