# The snippets below assume module-level imports along these lines; the
# project-specific helpers (convert2indices, hybrid_convert2indices,
# preprocess, preprocess_nlg_text, the process_* field functions, etc.) are
# defined elsewhere in the repository. In Python 3 the cPickle alias is
# typically obtained via "import _pickle as cPickle".
import csv
import gzip
import random
from collections import defaultdict
from os.path import join

import numpy as np
import _pickle as cPickle


def load_text_pairs(fname, config_data, vocabulary, noutputs=3):
    max_input_length = config_data['max_input_length']
    max_output_length = config_data['max_output_length']
    max_idx = max(vocabulary.values())
    dummy_word_idx = max_idx + 1

    # Each line is expected to hold a tab-separated (input text, output text) pair.
    inputs_raw = []
    outputs_raw = []
    with open(fname, encoding='utf-8', mode='rt') as ifile:
        for line in ifile:
            sline = line.rstrip('\n').split('\t')
            inputs_raw.append(sline[0])
            outputs_raw.append(sline[1])

    input_idx = convert2indices(inputs_raw,
                                vocabulary,
                                dummy_word_idx,
                                dummy_word_idx,
                                max_sent_length=max_input_length)
    target_idx = convert2indices(outputs_raw,
                                 vocabulary,
                                 dummy_word_idx,
                                 dummy_word_idx,
                                 max_sent_length=max_output_length)

    # Dummy all-ones targets, one array per model output head.
    outputs = [np.ones(len(input_idx))] * noutputs
    return [input_idx, target_idx], outputs
def load_text_gen_data(fname, config_data, vocabulary, noutputs=3):
    max_input_length = config_data['max_input_length']
    max_output_length = config_data['max_output_length']
    max_idx = max(vocabulary.values())
    dummy_word_idx = max_idx + 1
    inputs_raw = []
    outputs_raw = []
    # The CSV is expected to provide 'mr' (meaning representation) and 'ref'
    # (reference text) columns, as in the E2E NLG challenge data.
    with open(fname, encoding='utf-8', mode='rt') as csv_file:
        reader = csv.DictReader(csv_file)
        for row in reader:
            inputs_raw.append(row['mr'])
            outputs_raw.append(row['ref'])

    input_idx = convert2indices(inputs_raw,
                                vocabulary,
                                dummy_word_idx,
                                dummy_word_idx,
                                max_sent_length=max_input_length)
    target_idx = convert2indices(outputs_raw,
                                 vocabulary,
                                 dummy_word_idx,
                                 dummy_word_idx,
                                 max_sent_length=max_output_length)

    outputs = [np.ones(len(input_idx))] * noutputs

    return [input_idx, target_idx], outputs
def generate_data_stream(fname, config_data, vocabulary, batch_size, noutputs=2, skip_data=0):
    max_sentence_len = config_data['max_sentence_length']
    dummy_word_idx = vocabulary['DUMMY_WORD']
    outputs = [np.ones(batch_size)]*noutputs
    #vocabulary = {k: v[0] for k, v in vocabulary.items()}
    current_batch = []
    while True:
        # Choose the reader based on the file extension; reject anything else
        # so that ifile is never referenced unbound below.
        if fname.endswith('.tsv') or fname.endswith('.txt'):
            ifile = open(fname, mode='rt', encoding='utf-8')
        elif fname.endswith('.gz') or fname.endswith('.gzip'):
            ifile = gzip.open(fname, mode='rt', encoding='utf-8')
        else:
            raise ValueError('Unsupported file extension: {}'.format(fname))

        for line in ifile:
            # Skip the first skip_data lines (e.g. when resuming training).
            if skip_data > 0:
                skip_data -= 1
                continue

            current_batch.append(line)
            # Emit a batch once it actually holds batch_size lines, so that
            # skipped lines cannot misalign the batch boundaries.
            if len(current_batch) == batch_size:
                random.shuffle(current_batch)
                processed_batch = [preprocess(x.replace('\r', '').split('\t')[-1]) for x in current_batch]
                batch_idx = convert2indices(processed_batch, vocabulary, dummy_word_idx, dummy_word_idx, max_sent_length=max_sentence_len)
                yield [batch_idx, batch_idx], outputs
                current_batch = []
        ifile.close()
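
# Below is a minimal, hypothetical usage sketch for the generator above, assuming a
# Keras-style model with two index-matrix inputs and noutputs dummy all-ones targets.
# build_model, the file name, and the step count are placeholders, not part of the
# original code.
train_gen = generate_data_stream('train_data.tsv', config_data, vocabulary,
                                 batch_size=64, noutputs=2)
model = build_model(config_data, vocabulary)  # hypothetical model builder
model.fit_generator(train_gen,
                    steps_per_epoch=1000,  # roughly lines_in_file // batch_size
                    epochs=10)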
def transform_data(fname, vocabulary_word, vocabulary_char, max_sentence_len,
                   noutputs):
    dummy_word_idx = vocabulary_word['DUMMY_WORD'][0]
    dummy_char_idx = max(vocabulary_char.values()) + 1

    # One tweet per line; the tweet text is the last tab-separated field.
    with open(fname, encoding='utf-8', mode='rt') as ifile:
        curr_tweets = [x.replace('\r', '').split('\t')[-1] for x in ifile]
    processed_batch = [preprocess(x) for x in curr_tweets]
    text_idx = hybrid_convert2indices(curr_tweets,
                                      processed_batch,
                                      vocabulary_word,
                                      dummy_word_idx,
                                      dummy_word_idx,
                                      max_sent_length=max_sentence_len)
    char_idx = convert2indices(curr_tweets,
                               vocabulary_char,
                               dummy_char_idx,
                               dummy_char_idx,
                               max_sent_length=max_sentence_len)
    outputs = [np.ones(len(curr_tweets))] * noutputs

    return [char_idx, text_idx], outputs
def transform_data(fname, vocabulary, max_sentence_len, noutputs):
    max_idx = max(vocabulary.values())
    dummy_word_idx = max_idx + 1

    # One tweet per line; the tweet text is the last tab-separated field.
    with open(fname, encoding='utf-8', mode='rt') as ifile:
        curr_tweets = [x.replace('\r', '').split('\t')[-1] for x in ifile]
    text_idx = convert2indices(curr_tweets,
                               vocabulary,
                               dummy_word_idx,
                               dummy_word_idx,
                               max_sent_length=max_sentence_len)
    outputs = [np.ones(len(curr_tweets))] * noutputs

    return [text_idx, text_idx], outputs
def load_text_gen_data(fname, config_data, vocabulary, noutputs=3, random_output=False, word_based=False, random_first_word=False):
    max_output_length = config_data['max_sentence_len']
    vocab_path = config_data['vocab_path']
    fw_vocab = cPickle.load(open(join(vocab_path, 'fw_vocab.pkl'), 'rb'))
    overlap_map_for_fw = cPickle.load(open(join(vocab_path, 'overlap_map_for_fw.pkl'), 'rb'))

    dummy_word_idx = len(vocabulary)
    dropout_word_idx = len(vocabulary) + 1
    reader = csv.DictReader(open(fname, encoding='utf-8', mode='rt'))
    if word_based:
        vocabulary = {token: idx for token, (idx, freq) in vocabulary.items()}

    headers = [
        ('name', process_name),
        ('eatType', process_eat_type),
        ('priceRange', process_price_range),
        ('customer rating', process_customer_rating),
        ('near', process_near),
        ('food', process_food),
        ('area', process_area),
        ('familyFriendly', process_family_friendly)
    ]

    field_ops = {
        'eatType': 3,
        'priceRange': 6,
        'customer rating': 6,
        'food': 7,
        'area': 2,
        'familyFriendly': 2
    }

    processed_fields = defaultdict(list)
    outputs_raw = []
    weights_raw = []
    mr_list = []
    for row in reader:
        i1 = row['mr']
        i2 = row.get('ref', '')
        i3 = row.get('weight', 1.0)
        mr_list.append(i1)
        weights_raw.append(float(i3))
        outputs_raw.append(i2)
        # An MR is a comma-separated list of 'key[value]' chunks,
        # e.g. 'name[Alimentum], area[riverside]'; collect them into a dict.
        keywords = i1.split(',')

        kv = {}
        for keyword in keywords:
            kidx = keyword.find('[')
            key = keyword[:kidx].strip()
            value = keyword[kidx + 1: keyword.find(']')]
            kv[key] = value

        # Normalise each known field with its processing function;
        # missing fields are passed through as None.
        for header, funct in headers:
            val = kv.get(header, None)
            processed_value = funct(val)
            processed_fields[header].append(processed_value)

    inputs = []

    for header, _ in headers:
        values = processed_fields[header]
        if header in ['name', 'near', 'food']:
            # Open-vocabulary fields (delexicalised in the reference text
            # below): encode only presence ([1, 0]) vs. absence ([0, 1]).
            value_idx = []
            for value in values:
                x = np.zeros(2)
                if value:
                    x[0] = 1
                else:
                    x[1] = 1
                value_idx.append(x)
            value_idx = np.array(value_idx).astype('float32')
        else:
            # Closed fields: one-hot over the field's possible values plus
            # one extra slot; an absent field stays all zeros.
            value_idx = []
            for value in values:
                x = np.zeros(field_ops[header] + 1)
                if value is not None:
                    x[value] = 1
                value_idx.append(x)

            value_idx = np.array(value_idx).astype('float32')
        inputs.append(value_idx)

    outputs_delex = [
        preprocess_nlg_text(x, name, near, food, name_tok, near_tok, food_tok,
                            word_based=word_based)
        for x, name, near, food in zip(outputs_raw,
                                       processed_fields['name'],
                                       processed_fields['near'],
                                       processed_fields['food'])
    ]
    if not random_first_word:
        first_words = get_first_words(outputs_delex, fw_vocab, random_first_word)
    else:
        first_words, _ = sample_first_word(inputs, overlap_map_for_fw, fw_vocab)
    inputs.append(first_words)
    target_idx = convert2indices(outputs_delex, vocabulary, dummy_word_idx, dummy_word_idx, max_sent_length=max_output_length)

    if random_output:
        # Replace the gold target indices with Gaussian noise
        # (alternative: np.ones_like(target_idx) * dropout_word_idx).
        target_idx = np.random.normal(loc=0, scale=0.25, size=target_idx.shape)
    inputs.append(target_idx)
    weights = np.array(weights_raw)

    outputs = [np.ones(len(inputs[0]))] * noutputs

    lex_dict = {
        name_tok: processed_fields['name'],
        near_tok: processed_fields['near'],
        food_tok: processed_fields['food'],
    }

    return inputs, outputs, [weights]*noutputs, lex_dict
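
# For reference, a standalone sketch of the 'key[value]' parsing used for the
# 'mr' column above; the example meaning representation is made up but follows
# the same format.
mr = 'name[Alimentum], eatType[pub], priceRange[cheap], area[riverside]'

kv = {}
for keyword in mr.split(','):
    kidx = keyword.find('[')
    kv[keyword[:kidx].strip()] = keyword[kidx + 1: keyword.find(']')]

print(kv)
# {'name': 'Alimentum', 'eatType': 'pub', 'priceRange': 'cheap', 'area': 'riverside'}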
def load_text_gen_data(fname,
                       feature_fname,
                       tree_fname,
                       config_data,
                       vocabulary,
                       noutputs=3,
                       random_output=False,
                       word_based=False,
                       random_first_word=False):
    max_output_length = config_data['max_sentence_len']
    feature_list = config_data['features']
    dummy_word_idx = len(vocabulary)

    if word_based:
        vocabulary = {token: idx for token, (idx, freq) in vocabulary.items()}

    field_ops = {
        'eatType': 3,
        'priceRange': 6,
        'customer rating': 6,
        'food': 7,
        'area': 2,
        'familyFriendly': 2
    }

    inputs = []
    outputs_raw, processed_fields, weights_raw = _load_nlg_data(fname)

    # 'headers' is assumed to be the module-level list of
    # (field, processing function) pairs shown in the previous example.
    for header, _ in headers:
        values = processed_fields[header]
        if header in ['name', 'near', 'food']:
            value_idx = []
            for value in values:
                x = np.zeros(2)
                if value:
                    x[0] = 1
                else:
                    x[1] = 1
                value_idx.append(x)
            value_idx = np.array(value_idx).astype('float32')
        else:
            value_idx = []
            for value in values:
                x = np.zeros(field_ops[header] + 1)
                if value is not None:
                    x[value] = 1
                value_idx.append(x)

            value_idx = np.array(value_idx).astype('float32')
        inputs.append(value_idx)

    outputs_delex = [
        preprocess_nlg_text(x,
                            name,
                            near,
                            food,
                            name_tok,
                            near_tok,
                            food_tok,
                            word_based=word_based) for x, name, near, food in
        zip(outputs_raw, processed_fields['name'], processed_fields['near'],
            processed_fields['food'])
    ]
    target_idx = convert2indices(outputs_delex,
                                 vocabulary,
                                 dummy_word_idx,
                                 dummy_word_idx,
                                 max_sent_length=max_output_length)
    if not random_first_word:
        nsentence_embeddings, tr_fwords_full_vectors, tr_fphrase_full_vectors, tr_fpos_full_vectors, tr_fwords_vectors, tr_fpos_vectors, tr_fphrase_vectors = load_lex_features(
            feature_fname, config_data)
        pos_tag_feature, phrase_tag_feature = load_special_tags(
            tree_fname, config_data)
    else:
        nsentence_embeddings, tr_fwords_full_vectors, tr_fphrase_full_vectors, tr_fpos_full_vectors, tr_fwords_vectors, tr_fpos_vectors, tr_fphrase_vectors = sample_lex_features(
            processed_fields, config_data)
        pos_tag_feature, phrase_tag_feature = sample_special_tags(
            processed_fields, config_data)

    if 'nsent' in feature_list:
        inputs.append(nsentence_embeddings)

    if 'fout_word_vectors' in feature_list:
        inputs.append(tr_fwords_full_vectors)

    if 'fout_phrase_vectors' in feature_list:
        inputs.append(tr_fphrase_full_vectors)

    if 'fout_pos_vectors' in feature_list:
        inputs.append(tr_fpos_full_vectors)

    if 'fword_vectors' in feature_list:
        inputs.extend(tr_fwords_vectors)

    if 'fphrase_vectors' in feature_list:
        inputs.extend(tr_fphrase_vectors)

    if 'fpos_vectors' in feature_list:
        inputs.extend(tr_fpos_vectors)

    if 'pos_tag_feature' in feature_list:
        inputs.append(pos_tag_feature)

    if 'phrase_tag_feature' in feature_list:
        inputs.append(phrase_tag_feature)

    if random_output:
        target_idx = np.random.normal(loc=0, scale=0.25, size=target_idx.shape)
    inputs.append(target_idx)
    weights = np.array(weights_raw)

    outputs = [np.ones(len(inputs[0]))] * noutputs

    lex_dict = {
        name_tok: processed_fields['name'],
        near_tok: processed_fields['near'],
        food_tok: processed_fields['food'],
    }

    return inputs, outputs, [weights] * noutputs, lex_dict
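
# Finally, a hypothetical sketch of how the loader's return values could feed a
# multi-output Keras-style model; the model builder, file names, and config keys
# are assumptions, not part of the original code.
inputs, outputs, weights, lex_dict = load_text_gen_data(
    'trainset.csv', 'train_features.pkl', 'train_trees.pkl',
    config_data, vocabulary, noutputs=3)

model = build_nlg_model(config_data, vocabulary)  # hypothetical model builder
model.fit(inputs, outputs,
          sample_weight=weights,  # one weight vector per output head
          batch_size=config_data.get('batch_size', 64),
          epochs=config_data.get('nb_epochs', 20))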