Ejemplo n.º 1
0
def write_to_tfrecords(filename, destination_dir, responses, prompts, q_ids, grades, speakers, targets, predictions,
                       debug=False):
    """Write parallel per-example inputs to one TFRecord file of SequenceExamples.

    Each example stores `targets`, `grade`, `teacher_pred`, `spkr`, `q_id` and
    `example_idx` as context features, and `response` / `prompt` as feature
    lists.

    Args:
        filename: Name of the TFRecord file to create inside `destination_dir`.
        destination_dir: Directory the file is written to.
        responses: Per-example response sequences, passed to
            `tfrecord_utils.int64_feature_list`.
        prompts: Per-example prompt sequences, passed to
            `tfrecord_utils.int64_feature_list`.
        q_ids: Per-example question ids (stored as int64).
        grades: Per-example grades; each is converted with `float()`.
        speakers: Per-example speaker ids (stored as bytes).
        targets: Per-example target values (stored as float).
        predictions: Per-example iterables of teacher predictions; each is
            converted with `list()` and stored as floats.
        debug: When true, print every example before it is written.

    Raises:
        AssertionError: If the input sequences do not all have the same length.
    """
    # Raised explicitly instead of using `assert` so the check is not stripped
    # under `python -O`; AssertionError is kept so existing callers see the
    # same exception type as before.
    lengths = {len(responses), len(prompts), len(q_ids), len(grades), len(speakers), len(targets), len(predictions)}
    if len(lengths) != 1:
        raise AssertionError(
            'All input sequences must have the same length, got lengths: {}'.format(sorted(lengths)))

    print('Writing: ', filename)

    writer = tf.python_io.TFRecordWriter(os.path.join(destination_dir, filename))
    try:
        # idx is the 0-based position of the example in the input order.
        for idx, (response, prompt, q_id, grd, spkr, tgt, example_pred) in enumerate(
                zip(responses, prompts, q_ids, grades, speakers, targets, predictions)):
            example = tf.train.SequenceExample(
                context=tf.train.Features(feature={
                    'targets': tfrecord_utils.float_feature([tgt]),
                    'grade': tfrecord_utils.float_feature([float(grd)]),
                    'teacher_pred': tfrecord_utils.float_feature(list(example_pred)),
                    'spkr': tfrecord_utils.bytes_feature([spkr]),
                    'q_id': tfrecord_utils.int64_feature([q_id]),
                    # Stores the example number for easy back-reference to the
                    # txt files even when examples get shuffled (0 indexed).
                    'example_idx': tfrecord_utils.int64_feature([idx])
                }),
                feature_lists=tf.train.FeatureLists(feature_list={
                    'response': tfrecord_utils.int64_feature_list(response),
                    'prompt': tfrecord_utils.int64_feature_list(prompt)}))
            if debug:
                # Print out the data that is going to be saved:
                print("-----------------\n", "EXAMPLE: \n", "Response: {}\nPrompt: {}\nQ_id: {}\n\ntarget: {}\ngrade: {}\n,teacher_pred: {}\nexample_num: {}\n\n".format(response, prompt, q_id, tgt, grd, example_pred, idx))
            writer.write(example.SerializeToString())
    finally:
        # Close the file even if building or writing an example fails.
        writer.close()
def _write_relevance_split(path, responses, prompts, q_ids, grades, speakers):
    """Write one data split to `path` as a TFRecord file of SequenceExamples.

    Every example is stored with a fixed `targets` value of 1.0.
    """
    writer = tf.python_io.TFRecordWriter(path)
    try:
        for response, prompt, q_id, grd, spkr in zip(responses, prompts, q_ids,
                                                     grades, speakers):
            example = tf.train.SequenceExample(
                context=tf.train.Features(
                    feature={
                        'targets': tfrecord_utils.float_feature([1.0]),
                        'grade': tfrecord_utils.float_feature([grd]),
                        'spkr': tfrecord_utils.bytes_feature([spkr]),
                        'q_id': tfrecord_utils.int64_feature([q_id])
                    }),
                feature_lists=tf.train.FeatureLists(
                    feature_list={
                        'response': tfrecord_utils.int64_feature_list(response),
                        'prompt': tfrecord_utils.int64_feature_list(prompt)
                    }))
            writer.write(example.SerializeToString())
    finally:
        # Close the file even if building or writing an example fails.
        writer.close()


def main(argv=None):
    """Converts a dataset to train and validation tfrecords.

    Reads its configuration from `commandLineParser`; `argv` is accepted but
    not used. Besides the two TFRecord files, it writes `unigrams.txt` and
    `sorted_topics.txt` into the destination directory and appends the command
    line to `CMDs/step_preprocess_data.cmd`.
    """
    args = commandLineParser.parse_args()

    if os.path.isdir(args.destination_dir):
        # The run has always continued in this case; the message used to claim
        # it was exiting, which did not match what the code does.
        print('destination directory exists. Existing files may be overwritten...')
    else:
        os.makedirs(args.destination_dir)

    if not os.path.isdir('CMDs'):
        os.makedirs('CMDs')

    # Keep a log of the command lines used for this step.
    with open('CMDs/step_preprocess_data.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Load responses and prompts as sequences of word ids
    responses, _ = load_text(args.input_data_path, args.input_wlist_path)
    prompts, _ = load_text(args.input_prompt_path, args.input_wlist_path)

    # Load up the prompts as sequences of words (one prompt per line). The
    # lines are reused below to derive the per-response topic ids.
    with open(args.input_prompt_path, 'r') as prompt_file:
        prompt_lines = [line.replace('\n', '') for line in prompt_file]

    # Get the unique topics and their counts, most frequent first
    unique_topics, topic_counts = np.unique(prompt_lines, return_counts=True)
    order = np.flip(np.argsort(topic_counts), 0)
    topics = unique_topics[order]
    topic_counts = topic_counts[order]

    # Create dictionary for topics mapping sentence to topic id
    # Also create file of sorted topics and unigrams file
    # Unigram file later used for training
    topic_dict = {}
    unigrams_path = os.path.join(args.destination_dir, 'unigrams.txt')
    sorted_topics_path = os.path.join(args.destination_dir, 'sorted_topics.txt')
    with open(unigrams_path, 'w') as ufile, open(sorted_topics_path, 'w') as tfile:
        for i, (topic, count) in enumerate(zip(topics, topic_counts)):
            topic_dict[topic] = i
            ufile.write(str(i) + ',' + str(int(count)) + '\n')
            tfile.write(topic + '\n')

    # Load up the grades and speakers
    grades = np.loadtxt(args.input_grade_path)
    with open(args.input_spkr_path, 'r') as spkr_file:
        speakers = np.asarray(
            [line.replace('\n', '') for line in spkr_file])

    # Create a list of topic IDs for every response
    q_ids = np.asarray([topic_dict[line] for line in prompt_lines])

    # Split data into train and validation data sets
    n = len(responses)
    train_size = int(n * (1.0 - args.valid_fraction))
    valid_size = n - train_size

    print('Total dataset size %s Train dataset size %s Valid dataset size %s'
          % (n, train_size, valid_size))

    # Fixed seed so that the split is reproducible between runs.
    np.random.seed(1000)

    permutation = np.random.choice(np.arange(n), n, replace=False)
    index_train = permutation[:train_size]
    index_valid = permutation[train_size:]

    # Create the training and validation TF Record files
    for split_name, index in (('train', index_train), ('valid', index_valid)):
        filename = 'relevance.%s.tfrecords' % split_name
        print('Writing %s' % filename)
        _write_relevance_split(
            os.path.join(args.destination_dir, filename),
            responses[index], prompts[index], q_ids[index], grades[index],
            speakers[index])
def main(argv=None):
    """Converts a dataset to a single tfrecords file named after `args.name`.

    Reads its configuration from `commandLineParser`; `argv` is accepted but
    not used. Question ids are looked up in the sorted-topics file given by
    `args.sorted_topics_path`; if any prompt is missing from that file, every
    example gets a q_id of -1. The command line is appended to
    `CMDs/step_preprocess_test_data.cmd`.
    """
    args = commandLineParser.parse_args()

    if not os.path.isdir(args.destination_dir):
        os.makedirs(args.destination_dir)

    if not os.path.isdir('CMDs'):
        os.makedirs('CMDs')

    # Keep a log of the command lines used for this step.
    with open('CMDs/step_preprocess_test_data.cmd', 'a') as f:
        f.write(' '.join(sys.argv) + '\n')
        f.write('--------------------------------\n')

    # Load responses and prompts as sequences of word ids
    responses, _ = load_text(args.input_data_path, args.input_wlist_path)
    prompts, _ = load_text(args.input_prompt_path, args.input_wlist_path)

    # Load up the grades, targets and speakers
    grades = np.loadtxt(args.input_grade_path)
    targets = np.loadtxt(args.input_tgt_path, dtype=np.float32)
    with open(args.input_spkr_path, 'r') as spkr_file:
        speakers = np.asarray(
            [line.replace('\n', '') for line in spkr_file])

    # Load up sorted topics and (re)construct the topic dict so that each
    # prompt word sequence maps to its q_id (the 0-based line number).
    topic_dict = {}
    with open(args.sorted_topics_path, 'r') as tfile:
        for i, topic in enumerate(tfile):
            topic_dict[topic.replace('\n', '')] = i

    # Load up the prompts as sequences of words and convert to q_id
    with open(args.input_prompt_path, 'r') as prompt_file:
        prompt_lines = [line.replace('\n', '') for line in prompt_file]
    try:
        q_ids = np.asarray([topic_dict[line] for line in prompt_lines])
    except KeyError:
        # At least one prompt is not in the sorted-topics file: fall back to
        # -1 for every example, as before. Only the failed lookup is caught so
        # that unrelated errors are no longer silently swallowed.
        q_ids = np.asarray([-1] * len(prompt_lines))

    # Create the TF Record file
    filename = args.name + '.tfrecords'
    print('Writing %s' % filename)

    writer = tf.python_io.TFRecordWriter(
        os.path.join(args.destination_dir, filename))
    try:
        for response, prompt, q_id, grd, spkr, tgt in zip(
                responses, prompts, q_ids, grades, speakers, targets):
            example = tf.train.SequenceExample(
                context=tf.train.Features(
                    feature={
                        'targets': tfrecord_utils.float_feature([tgt]),
                        'grade': tfrecord_utils.float_feature([grd]),
                        'spkr': tfrecord_utils.bytes_feature([spkr]),
                        'q_id': tfrecord_utils.int64_feature([q_id])
                    }),
                feature_lists=tf.train.FeatureLists(
                    feature_list={
                        'response': tfrecord_utils.int64_feature_list(response),
                        'prompt': tfrecord_utils.int64_feature_list(prompt)
                    }))
            writer.write(example.SerializeToString())
    finally:
        # Close the file even if building or writing an example fails.
        writer.close()
Ejemplo n.º 4
0
def write_to_tfrecords(filename,
                       destination_dir,
                       responses,
                       prompts,
                       q_ids,
                       grades,
                       speakers,
                       targets=1.0,
                       debug=False):
    """Write parallel per-example inputs to one TFRecord file of SequenceExamples.

    Each example stores `targets`, `grade`, `spkr` and `q_id` as context
    features, and `response` / `prompt` as feature lists.

    Args:
        filename: Name of the TFRecord file to create inside `destination_dir`.
        destination_dir: Directory the file is written to.
        responses: Per-example response sequences, passed to
            `tfrecord_utils.int64_feature_list`.
        prompts: Per-example prompt sequences, passed to
            `tfrecord_utils.int64_feature_list`.
        q_ids: Per-example question ids (stored as int64).
        grades: Per-example grades; each is converted with `float()`.
        speakers: Per-example speaker ids (stored as bytes).
        targets: Either a single number, used as the target of every example,
            or a sized sequence with one target per example.
        debug: When true, print the input lengths and every example before it
            is written.

    Raises:
        AssertionError: If the input sequences differ in length, or `targets`
            is neither a number nor a sequence of matching length.
    """
    lengths = [len(responses), len(prompts), len(q_ids), len(grades),
               len(speakers)]
    if debug:
        for length in lengths:
            print(length)

    # Raised explicitly instead of using `assert` so the checks are not
    # stripped under `python -O`; AssertionError is kept so existing callers
    # see the same exception type as before.
    if len(set(lengths)) != 1:
        raise AssertionError(
            'responses, prompts, q_ids, grades and speakers must have the '
            'same length, got lengths: {}'.format(lengths))
    num_examples = lengths[0]

    if isinstance(targets, (int, float)):
        # A single number: make each target this value
        targets = [float(targets)] * num_examples
    else:
        try:
            num_targets = len(targets)
        except TypeError:
            raise AssertionError(
                'targets must be a number or a sequence, got {}'.format(
                    type(targets).__name__))
        if num_targets != num_examples:
            raise AssertionError(
                'Expected {} targets, got {}'.format(num_examples,
                                                     num_targets))

    # Create the TF Record file
    print('Writing: ', filename)

    writer = tf.python_io.TFRecordWriter(
        os.path.join(destination_dir, filename))
    try:
        for response, prompt, q_id, grd, spkr, tgt in zip(
                responses, prompts, q_ids, grades, speakers, targets):
            if debug:
                # Print out the data that is going to be saved:
                print(
                    "-----------------\n", "EXAMPLE: \n",
                    "Response: {}\nPrompt: {}\nQ_id: {}\n\ntarget: {}\ngrade: {}\n\n"
                    .format(response, prompt, q_id, tgt, grd))
            example = tf.train.SequenceExample(
                context=tf.train.Features(
                    feature={
                        'targets': tfrecord_utils.float_feature([tgt]),
                        'grade': tfrecord_utils.float_feature([float(grd)]),
                        'spkr': tfrecord_utils.bytes_feature([spkr]),
                        'q_id': tfrecord_utils.int64_feature([q_id])
                    }),
                feature_lists=tf.train.FeatureLists(
                    feature_list={
                        'response': tfrecord_utils.int64_feature_list(response),
                        'prompt': tfrecord_utils.int64_feature_list(prompt)
                    }))
            writer.write(example.SerializeToString())
    finally:
        # Close the file even if building or writing an example fails.
        writer.close()