Example #1
def preprocess_data(dir_path='NKJP_1.2_nltk_POS', window_size=3):
    sentences, tagnames, dictionary = parse_sentences(dir_path)
    print('{} sentences loaded'.format(len(sentences)))
    tagnames.update(['<s>', '</s>'])  # Add special tags
    num_to_tag = dict(enumerate(tagnames))
    tag_to_num = invert_dict(num_to_tag)

    dictionary.update(['UUUNKKK', '<s>', '</s>'])  # Add special tokens
    num_to_word = dict(enumerate(dictionary))
    word_to_num = invert_dict(num_to_word)

    X, y = docs_to_windows(sentences, word_to_num, tag_to_num, window_size)
    print('{} {}-word windows loaded'.format(len(X), window_size))
    print('Shape of X is {}\nShape of y is {}'.format(X.shape, y.shape))
    return X, y, word_to_num, tag_to_num
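
Every snippet on this page calls an invert_dict helper from the surrounding project's utilities; its definition is not shown here. A minimal sketch, assuming the dictionary's values are unique and hashable, would be:

def invert_dict(d):
    # Swap keys and values; assumes the values are unique and hashable,
    # otherwise later entries silently overwrite earlier ones.
    return {value: key for key, value in d.items()}

Note that some projects use a different variant; Example #10 below, for instance, expects the inverse of a dict of lists, mapping each list element back to the list of keys containing it.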
Example #2
def load_wv(vocabfile):
    with open(vocabfile) as fd:
        words = [line.strip() for line in fd if line.strip() != '']
    words = list(set(words))
    num_to_word = dict(enumerate(words))
    word_to_num = invert_dict(num_to_word)
    return word_to_num, num_to_word
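
A hypothetical round trip with the two mappings returned above (the vocab file name is illustrative only):

# Hypothetical usage: 'vocab.txt' is assumed to hold one word per line.
word_to_num, num_to_word = load_wv('vocab.txt')
first_word = num_to_word[0]
assert word_to_num[first_word] == 0  # the two dicts are mutual inverses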
Example #3
def load_wv(vocabfile, wvfile):
    wv = loadtxt(wvfile, dtype=float)
    with open(vocabfile) as fd:
        words = [line.strip() for line in fd]
    num_to_word = dict(enumerate(words))
    word_to_num = invert_dict(num_to_word)
    return wv, word_to_num, num_to_word
Example #4
def load_wv(vocabfile, wvfile):
    wv = loadtxt(wvfile, dtype=float)
    with open(vocabfile) as fd:
        words = [line.strip() for line in fd]
    num_to_word = dict(enumerate(words))
    word_to_num = invert_dict(num_to_word)
    return wv, word_to_num, num_to_word
Example #5
def load_wv(vocabfile, wvfile):
    wv = loadtxt(wvfile, dtype=float)
    with codecs.open(vocabfile,'r',encoding='utf-8') as fd:
        words = [line.strip('\n').strip(' ') for line in fd]
    num_to_word = dict(enumerate(words))
    word_to_num = invert_dict(num_to_word)
#    print words[170:180]
    return wv, word_to_num, num_to_word
Example #6
    def handle_prediction(self):
        probs, classes = self.predict_image(self.image_path, self.model)

        idx_to_class: Dict = invert_dict(self.model.class_to_idx)
        classes = [idx_to_class[c] for c in classes]
        if self.cat_name_map:
            classes = [self.cat_name_map[c] for c in classes]
        print(f"Prediction for top {self.top_k} classes")
        total_p: float = sum(probs)
        for p, c in zip(probs, classes):
            percent: float = 100.0 * p / total_p
            print(f"\t{c.title()}: {percent:.1f}%")
Example #7
    def load_data(self, debug=False):
        """Loads starter word-vectors and train/dev/test-split the data."""

        # Load the training set
        X, y, self.word_to_num, self.tag_to_num = preprocess_data(
            dir_path='NKJP_1.2_nltk_POS')

        self.num_to_word = invert_dict(self.word_to_num)
        self.num_to_tag = invert_dict(self.tag_to_num)
        self.tagset_size = len(self.tag_to_num)

        self.X_train, self.X_dev, self.y_train, self.y_dev = train_test_split(
            X, y, test_size=0.2)
        # A hacky way to get a 3-way split from a 2-way splitting function
        self.X_dev, self.X_test, self.y_dev, self.y_test = train_test_split(
            self.X_dev, self.y_dev, test_size=0.5)

        if debug:
            self.X_train = self.X_train[:1024]
            self.y_train = self.y_train[:1024]
            self.X_dev = self.X_dev[:1024]
            self.y_dev = self.y_dev[:1024]
Example #8
    def __init__(self,
                 trn_file,
                 wav_file,
                 mfcc_file,
                 args,
                 vocab_create_mode='BUILD',
                 mfcc_create='Y'):
        '''
        Args:
            trn_file: trn (label) file path
            wav_file: wav file path
            mfcc_file: mfcc file path
            vocab_create_mode:
                BUILD: create the vocab dict from raw label data
                LOAD : read it from a file directly
        '''
        self.args = args

        #trn file path
        self.trn_file = trn_file
        #wav file path
        self.wav_file = wav_file
        #mfcc file path
        self.mfcc_file = mfcc_file

        # data file path
        #self.data_file = data_file
        # <EOS>: end-of-sentence tag
        # <SOS>: start-of-sentence tag
        # <PAD>: padding tag
        self.special_signs = ['<EOS>', '<SOS>', '<PAD>', '<BIAS>']
        # label to index dict
        self.vocab = {}
        # index to label dict
        self.inverse_vocab = {}

        if vocab_create_mode == 'BUILD':
            self.label_process()
        elif vocab_create_mode == 'LOAD':
            self.vocab = utils.load_from_pkl('vocab.pkl')
            self.inverse_vocab = utils.invert_dict(self.vocab)

        if mfcc_create == 'Y':
            for i in range(len(self.wav_file)):
                wavlist = os.listdir(self.wav_file[i])
                for j in range(len(wavlist)):
                    wav_path = os.path.join(self.wav_file[i], wavlist[j])
                    # convert the audio into MFCC features
                    mfcc = self.read_wav_file(wav_path, 26, 9)
                    mfcc = np.transpose(mfcc)
                    np.save(os.path.join(self.mfcc_file[i],
                            os.path.splitext(wavlist[j])[0]), mfcc)
Example #9
    def load(metadata_path):
        directory = os.path.dirname(metadata_path)
        with open(metadata_path, 'rb') as f:
            iterator = pickle.load(f)
        iterator._deserialize_np_arrays(directory)
        iterator.mappings = None
        # Create inverse lookup of buckets.
        iterator.bucket_idx_to_key = []
        for bucket in iterator.bucketed_data:
            src_len = np.shape(bucket[0])[1]
            label_len = np.shape(bucket[2])[1]
            iterator.bucket_idx_to_key.append((src_len, label_len))
        iterator.bucket_key_to_idx = invert_dict(dict(enumerate(iterator.bucket_idx_to_key)))
        return iterator
Example #10
def create_train_test_dic(total_dic):
    testdic = defaultdict(list)
    traindic = defaultdict(list)

    invert_total_dic = invert_dict(total_dic)
    for user in total_dic:
        if len(total_dic[user]) < 2:
            traindic[user] = total_dic[user]
        else:
            i = 0
            for ref in total_dic[user]:
                i = i + 1
                if i < 2:
                    traindic[user].append(ref)
                else:
                    if len(invert_total_dic[ref]) < 2:
                        traindic[user].append(ref)
                    else:
                        invert_total_dic[ref].remove(user)
                        testdic[user].append(ref)

    return traindic, testdic
Example #11
def parse_freedict(fn, invert=False):
    with open(fn, 'r') as freedict:
        translation_dict = {}
        entry_found = False
        word_source = ""
        for line in freedict:
            if line.strip() == "<entry>":
                entry_found = True
            elif line.strip() == "</entry>":
                entry_found = False
            if entry_found:
                if line.strip()[:6] == "<orth>":
                    word_source = re.sub(r'<orth>|</orth>', '', line.strip()).lower()
                    if " " in word_source:
                        entry_found = False
                    elif word_source not in translation_dict:
                        translation_dict[word_source] = []
                elif line.strip()[:7] == "<quote>":
                    word_target = re.sub(r'<quote>|</quote>', '', line.strip()).lower()
                    if " " not in word_target:
                        translation_dict[word_source].append(word_target)
    if invert:
        translation_dict = invert_dict(translation_dict)
    return translation_dict
Example #12
def main(options):
    args = get_default_args()
    set_args(args, options)
    mode, dataset_name = args['mode'], args['dataset']

    # default setting
    args['raw_data'] = "data/%s/" % args['dataset']
    args['qrels_file'] = "data/%s/qrels.all.txt" % args['dataset']
    print_args(args)

    # get train/val/test names for specific dataset
    train_name, val_name, test_name, train_set, val_set, test_set, num_classes, with_url = config_dataset(
        args)

    max_query_len, max_doc_len, max_url_len = defaultdict(int), defaultdict(
        int), defaultdict(int)
    vocab = {'word': {}, '3gram': {}}
    test_vocab = {'word': {}, '3gram': {}}
    train_vocab_emb, test_vocab_emb = None, None

    ############################# LOAD DATA ##################################
    data_name = ("data_m%s_%s_%s_%s" %
                 (mode, dataset_name, train_name, test_name)).lower()
    if args["load_data"]:
        train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True)
        test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, test_name),
            False)
        if dataset_name != 'twitter' and dataset_name != 'TwitterURL':
            val_dataset, _, _, _, _, _ = load_data(
                "%s/%s/%s" % (args["experimental_data"], data_name, val_name),
                False)
        if args['embedding'] == 'glove':
            train_vocab_emb, test_vocab_emb = construct_vocab_emb(
                "%s/%s" % (args["experimental_data"], data_name),
                vocab['word'],
                test_vocab['word'],
                300,
                "word",
                base_embed_path=args["base_embed_path"],
                type=args["embedding"])
        print('load dataset successfully')
    else:
        train_dataset = gen_data(args["raw_data"], train_set, vocab,
                                 test_vocab, True, max_query_len, max_doc_len,
                                 max_url_len, num_classes, args)
        print("create training set successfully...")
        if dataset_name != 'twitter' and dataset_name != 'TwitterURL':
            val_dataset = gen_data(args["raw_data"], val_set, vocab,
                                   test_vocab, False, max_query_len,
                                   max_doc_len, max_url_len, num_classes, args)
            print("create validation set successfully...")

        test_dataset = gen_data(args["raw_data"], test_set, vocab, test_vocab,
                                False, max_query_len, max_doc_len, max_url_len,
                                num_classes, args)
        train_vocab_emb, test_vocab_emb = construct_vocab_emb(
            "%s/%s" % (args["experimental_data"], data_name),
            vocab['word'],
            test_vocab['word'],
            300,
            "word",
            base_embed_path=args["base_embed_path"])
        save_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True, train_dataset, max_query_len, max_doc_len, max_url_len,
            vocab, train_vocab_emb)
        print("save training set successfully...")
        if dataset_name != 'twitter' and dataset_name != 'TwitterURL':
            save_data("%s/%s/%s" %
                      (args["experimental_data"], data_name, val_name),
                      False,
                      val_dataset,
                      vocab=test_vocab,
                      vocab_emb=test_vocab_emb)
            print("save val set successfully...")
        save_data("%s/%s/%s" %
                  (args["experimental_data"], data_name, test_name),
                  False,
                  test_dataset,
                  vocab=test_vocab,
                  vocab_emb=test_vocab_emb)
        print("save test set successfully...")

    if dataset_name == 'twitter' or dataset_name == 'TwitterURL':
        val_split = args['val_split']
        num_samples, _ = train_dataset["query_word_input"].shape
        # randomly sample queries and all their documents if query_random is True
        # otherwise, query-doc pairs are randomly sampled
        query_random = True if dataset_name == 'twitter' else False
        if query_random:
            del train_dataset["overlap_feat"]
            val_indices = sample_aaai_val_set(args["raw_data"], train_set,
                                              val_split)
        else:
            val_split = 0.1
            val_indices, val_set = [], set()
            for i in range(int(num_samples * val_split)):
                val_index = np.random.randint(num_samples)
                while val_index in val_set:
                    val_index = np.random.randint(num_samples)
                val_indices.append(val_index)
                val_set.add(val_index)

        val_dataset = {}
        for key in train_dataset:
            #print(key, train_dataset[key].shape)
            val_dataset[key] = train_dataset[key][val_indices]
            train_dataset[key] = np.delete(train_dataset[key], val_indices, 0)

    # shuffle the train dataset explicitly to make results reproducible
    # whether the performance will be affected remains a question
    keys, values = [], []
    for key in train_dataset:
        if train_dataset[key].size == 0:
            continue
        keys.append(key)
        values.append(train_dataset[key])
    zipped_values = list(zip(*values))
    random.shuffle(zipped_values)
    shuffled_values = list(zip(*zipped_values))
    for i, key in enumerate(keys):
        train_dataset[key] = np.array(shuffled_values[i])
    print('after shuffle:', train_dataset['id'][:5], train_dataset['sim'][:5],
          train_dataset['query_word_input'][:5])

    # merge the vocabulary of the train and test sets
    merged_vocab = {}
    merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word'])
    merged_vocab['3gram'] = merge_two_dicts(vocab['3gram'],
                                            test_vocab['3gram'])
    print("TRAIN vocab: word(%d) 3gram(%d)" %
          (len(vocab['word']), len(vocab['3gram'])))
    print("TEST vocab: word(%d) 3gram(%d)" %
          (len(test_vocab['word']), len(test_vocab['3gram'])))
    print("MERGED vocab: word(%d) 3gram(%d)" %
          (len(merged_vocab['word']), len(merged_vocab['3gram'])))

    vocab_inv, vocab_size = {}, {}
    for key in vocab:
        vocab_inv[key] = invert_dict(merged_vocab[key])
        vocab_size[key] = len(vocab[key])
    print(vocab_size)

    # Print data samples for debug purpose
    print_dataset(mode, train_dataset, vocab_inv)
    print_dataset(mode, test_dataset, vocab_inv)

    ############################ TRAIN MODEL #################################
    # create model
    model = create_attention_model(max_query_len,
                                   max_doc_len,
                                   max_url_len,
                                   vocab_size,
                                   train_vocab_emb,
                                   args["nb_filters"],
                                   args["nb_layers"],
                                   embed_size=300,
                                   dropout_rate=args['dropout'],
                                   trainable=args["trainable"],
                                   weighting=args['weighting'],
                                   mask=args["mask"],
                                   conv_option=args['conv_option'],
                                   model_option=args['model_option'],
                                   join=args['join'],
                                   num_classes=num_classes,
                                   with_url=with_url,
                                   highway=args['highway'],
                                   att=args['co_attention'],
                                   ext_feat=args["external_feat"],
                                   encoder_option=args['encoder_option'])
    model_name = (
        "model_N%s_data%s_mo%s_e%s_c%s_NumFilter%d_nblayer%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f_Join%s_H%s_Att%s"
        % (mode, train_name, args['model_option'], args["encoder_option"],
           args['conv_option'], args["nb_filters"], args["nb_layers"],
           args["trainable"], args['dropout'], args['weighting'], args['mask'],
           args['batch_size'], args['val_split'], args['join'],
           args['highway'], args['co_attention'])).lower()
    model_path = "%s/%s/%s" % (args['experimental_data'], data_name,
                               model_name)
    print(model_path)

    if args['optimizer'] == "adam":
        opt = optimizers.Adam(lr=args["learning_rate"],
                              beta_1=0.9,
                              beta_2=0.999,
                              epsilon=None,
                              decay=0.0,
                              amsgrad=True)
        print('use Adam optimizer')
    elif args['optimizer'] == "sgd":
        opt = optimizers.SGD(lr=args["learning_rate"],
                             decay=1e-6,
                             momentum=0.9,
                             nesterov=True)
        print('use SGD optimizer')
    elif args['optimizer'] == 'rmsprop':
        opt = optimizers.RMSprop(lr=args["learning_rate"],
                                 rho=0.9,
                                 epsilon=None,
                                 decay=0.0)
        print('use RMSprop optimizer')

    if num_classes <= 2:
        model.compile(loss='binary_crossentropy',
                      optimizer=opt,
                      metrics=['accuracy'])
    else:
        print('compile model with categorical cross-entropy')
        model.compile(loss='categorical_crossentropy',
                      optimizer=opt,
                      metrics=['accuracy'])
    class_weight = None
    if args['dataset'] == 'Quora':
        #class_weight = {0:1, 1:2}
        print('apply class weight:', class_weight)

    print(model.summary())
    print('model init weights sum: %.4f' % get_model_weights(model))
    if not args['load_model']:
        early_stopping = EarlyStopping(monitor='val_loss', patience=4)
        checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights",
                                     monitor='val_loss',
                                     save_best_only=True,
                                     verbose=1)
        lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                       factor=0.5,
                                       patience=2,
                                       min_lr=0.0001,
                                       verbose=1)
        model.fit(
            train_dataset,
            train_dataset['sim'],  #validation_split=0.05,
            batch_size=args['batch_size'],
            validation_data=(val_dataset, val_dataset['sim']),
            epochs=args['epochs'],
            shuffle=False,
            callbacks=[checkpoint, lr_reducer, early_stopping],
            class_weight=class_weight,
            verbose=args['verbose'])

    ############################ TEST MODEL #################################
    print('load best model from %s.best.weights' % model_path)
    model.load_weights("%s.best.weights" % model_path)
    # load trained vocab embedding.
    trained_vocab_emb = model.get_layer('word-embedding').get_weights()[0]
    # merge trained vocab embedding with test OOV word embeddings
    merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300))
    merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb
    merged_vocab_emb[
        len(vocab['word']):len(merged_vocab['word']), :] = test_vocab_emb
    for key in vocab:
        vocab_size[key] = len(merged_vocab[key])
    print(vocab_size)

    new_model = create_attention_model(max_query_len,
                                       max_doc_len,
                                       max_url_len,
                                       vocab_size,
                                       merged_vocab_emb,
                                       args["nb_filters"],
                                       args["nb_layers"],
                                       embed_size=300,
                                       dropout_rate=args['dropout'],
                                       trainable=args["trainable"],
                                       weighting=args['weighting'],
                                       mask=args["mask"],
                                       conv_option=args['conv_option'],
                                       model_option=args['model_option'],
                                       join=args['join'],
                                       num_classes=num_classes,
                                       with_url=with_url,
                                       highway=args['highway'],
                                       att=args['co_attention'],
                                       ext_feat=args["external_feat"],
                                       encoder_option=args['encoder_option'])
    new_model.compile(loss='binary_crossentropy',
                      optimizer='adam',
                      metrics=['accuracy'])
    #print(new_model.summary())
    for layer_id in range(len(model.layers)):
        layer = model.layers[layer_id]
        if layer.name != 'word-embedding':
            new_model.layers[layer_id].set_weights(layer.get_weights())
    print('copy weight done.')
    val_predictions = new_model.predict(val_dataset)
    predictions = new_model.predict(test_dataset)

    if dataset_name == 'twitter' or dataset_name == 'TrecQA':
        val_predictions = val_predictions[:, 1]
        predictions = predictions[:, 1]
        print(predictions[:10])
        predictions_file = "%s/%s/predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(predictions_file, 'w') as f:
            for i in range(test_dataset['id'].shape[0]):
                f.write("%s %.4f %s\n" %
                        (test_dataset['id'][i], predictions[i], args['mode']))
        print('write predictions with trec format to %s' % predictions_file)
        val_predictions_file = "%s/%s/val_predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(val_predictions_file, 'w') as f:
            for i in range(val_dataset['id'].shape[0]):
                f.write(
                    "%s %.4f %s\n" %
                    (val_dataset['id'][i], val_predictions[i], args['mode']))
        map, mrr, p30 = evaluate(val_predictions_file, args["qrels_file"])
        print('write val predictions with trec format to %s' %
              val_predictions_file)
        print('Validation MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
        map, mrr, p30 = evaluate(predictions_file, args["qrels_file"])
        print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
    else:
        preds = np.argmax(predictions, axis=-1)
        labels = np.argmax(test_dataset['sim'], axis=-1)
        corrects = preds == labels
        predictions_file = "%s/%s/predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(predictions_file, 'w') as f:
            f.write("id label pred prob model\n")
            for i in range(len(preds)):
                f.write("%s %s %s %.4f %s\n" %
                        (test_dataset['id'][i], labels[i], preds[i],
                         predictions[i][preds[i]], args['mode']))
        print('write predictions with trec format to %s' % predictions_file)
        val_preds = np.argmax(val_predictions, axis=-1)
        val_labels = np.argmax(val_dataset['sim'], axis=-1)
        val_corrects = val_preds == val_labels
        val_predictions_file = "%s/%s/val_predictions_%s.txt" % (
            args["experimental_data"], data_name, model_name)
        with open(val_predictions_file, 'w') as f:
            for i in range(val_dataset['id'].shape[0]):
                f.write("%s %s %s %.4f %s\n" %
                        (val_dataset['id'][i], val_labels[i], val_preds[i],
                         val_predictions[i][val_preds[i]], args['mode']))
        print('write val predictions with trec format to %s' %
              val_predictions_file)

        print('val accuracy: %.4f' %
              (np.count_nonzero(val_corrects) * 1.0 / len(val_preds)))
        print('accuracy: %.4f' %
              (np.count_nonzero(corrects) * 1.0 / len(preds)))
        macro_prec = precision_score(labels, preds, average="macro")
        macro_recall = recall_score(labels, preds, average="macro")
        print('Macro Precision: %.3f, Recall: %.3f, F1: %.3f' %
              (macro_prec, macro_recall, 2 * macro_prec * macro_recall /
               (macro_prec + macro_recall)))
        print('Micro Precision: %.3f, Recall: %.3f, F1: %.3f' %
              (precision_score(labels, preds, average="micro"),
               recall_score(labels, preds, average="micro"),
               f1_score(labels, preds, average="micro")))
        print('Confusion matrix:', confusion_matrix(labels, preds))
Example #13
def fibonacci_numbers_inverted_mapping(**kwds):
    kwds.setdefault('start', 2)
    return invert_dict(fibonacci_numbers(**kwds))
Example #14
    'P': 'Pro',
    'Q': 'Gln',
    'R': 'Arg',
    'S': 'Ser',
    'T': 'Thr',
    'V': 'Val',
    'W': 'Trp',
    'Y': 'Tyr',
    'Z': 'Glx',
    'X': 'Xaa',
    'U': 'Sec',
    'J': 'Xle',
    'O': 'Pyl'
}

standard_three_to_one = utils.invert_dict(one_to_three)
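
A quick check of what the inverted table yields, using entries visible in the one_to_three fragment above:

# After inversion, three-letter codes map back to one-letter codes.
assert standard_three_to_one['Pro'] == 'P'
assert standard_three_to_one['Trp'] == 'W'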

extended_three_to_one = {
    '2as': 'D',
    '3ah': 'H',
    '5hp': 'E',
    'Acl': 'R',
    'Agm': 'R',
    'Aib': 'A',
    'Ala': 'A',
    'Alm': 'A',
    'Alo': 'T',
    'Aly': 'K',
    'Arg': 'R',
    'Arm': 'R',
    'Asa': 'D',
Example #15
from keras.preprocessing.sequence import pad_sequences
from tokenizer import RE_PATTERN
from utils import get_val_as_str, invert_dict, load_dataset, load_model, load_turk_scores, merge_datasets
import numpy as np
import pandas as pd
import re

ASPECT = 'naturalness'
AUTOMATED_EVALUATION_BASE_PATH = f'../evaluations/automated/{ASPECT}/sentence_level'
CLASSIFIER_BASE_PATH = '../models/naturalness_classifiers'
MAX_SEQ_LEN = 30  # for neural classifier
TEXT_VECTORIZER = load_model('../models/vectorizer.pkl')

# adjust vocabulary to account for unknowns
VOCABULARY = TEXT_VECTORIZER.vocabulary_
INVERSE_VOCABULARY = invert_dict(VOCABULARY)
VOCABULARY[INVERSE_VOCABULARY[0]] = len(VOCABULARY)
VOCABULARY['CUSTOM_UNKNOWN'] = len(VOCABULARY) + 1


## DATA PREP
def convert_to_indices(text):
    # tokenize input text
    tokens = re.compile(RE_PATTERN).split(text)
    non_empty_tokens = list(filter(lambda token: token, tokens))

    indices = []

    # collect indices of tokens in vocabulary
    for token in non_empty_tokens:
        if token in VOCABULARY:
Example #16
import glob
import json

from options import get_options
from datasets import get_dataloader
from model import get_model
import utils

COMP_CAT_DICT_PATH = '/home/ubuntu/vdp/clevr_inference/scene_parse/attr_net/tools/clevr_comp_cat_dict.json'

opt = get_options('test')
test_loader = get_dataloader(opt, 'test')
model = get_model(opt)

if opt.use_cat_label:
    with open(COMP_CAT_DICT_PATH) as f:
        cat_dict = utils.invert_dict(json.load(f))

if opt.dataset == 'clevr':
    num_images = len(glob.glob(opt.clevr_val_img_dir + '/*.png'))
    scenes = [{
        'image_index': i,
        'image_filename': 'CLEVR_val_%06d.png' % i,
        'objects': []
    } for i in range(num_images)]
    # print("run_test.py", scenes)

count = 0
for data, _, idxs, cat_idxs in test_loader:
    model.set_input(data)
    model.forward(idxs=idxs, name=opt.name)
    pred = model.get_pred()
Example #17
def main(options):
    args = get_default_args()
    set_args(args, options)
    print_args(args)
    mode = args['mode']
    train_name, test_name = args['split']['train'], args['split']['test']
    if train_name == 'train_all':
        train_set = ['train_2011', 'test_2011', 'train_2013', 'test_2013']
        train_set.remove(test_name)
    else:
        train_set = [train_name]
    test_set = [test_name]
    print("train_set", train_set)
    print("test_set", test_set)
    max_query_len, max_doc_len, max_url_len = defaultdict(int), defaultdict(
        int), defaultdict(int)
    vocab = {'word': {}, '3gram': {}, 'url': {}}
    test_vocab = {'word': {}, '3gram': {}, 'url': {}}
    train_vocab_emb, test_vocab_emb = None, None

    ############################# LOAD DATA ##################################
    data_name = ("data_m%s_%s_%s" % (mode, train_name, test_name)).lower()
    if args["load_data"]:
        train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True)
        test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, test_name),
            False)
        print('load dataset successfully')
    else:
        #vocab = build_vocab(args["raw_data"], train_set, test_set, vocab)
        #print('build vocab done. %d' % len(vocab['word']))
        train_dataset = gen_data(args["raw_data"], train_set, vocab,
                                 test_vocab, True, max_query_len, max_doc_len,
                                 max_url_len, args)
        print("create training set successfully...")
        test_dataset = gen_data(args["raw_data"], test_set, vocab, test_vocab,
                                False, max_query_len, max_doc_len, max_url_len,
                                args)
        train_vocab_emb, test_vocab_emb = construct_vocab_emb(
            "%s/%s" % (args["experimental_data"], data_name),
            vocab['word'],
            test_vocab['word'],
            300,
            "word",
            base_embed_path=args["base_embed_path"])
        save_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True, train_dataset, max_query_len, max_doc_len, max_url_len,
            vocab, train_vocab_emb)
        print("save training set successfully...")
        save_data("%s/%s/%s" %
                  (args["experimental_data"], data_name, test_name),
                  False,
                  test_dataset,
                  vocab=test_vocab,
                  vocab_emb=test_vocab_emb)
        print("save test set successfully...")

    if mode == 'dssm':
        train_dataset = convert_data_to_dssm_format(train_dataset,
                                                    vocab,
                                                    is_train_or_val=True)
        test_dataset = convert_data_to_dssm_format(test_dataset,
                                                   vocab,
                                                   is_train_or_val=False)
        print('data conversion done!')

    val_split = args['val_split']
    num_samples, _ = train_dataset["query_word_input"].shape
    # randomly sample queries and all their documents if query_random is True
    # otherwise, query-doc pairs are randomly sampled
    query_random = True
    if query_random:
        val_indices = sample_val(train_set,
                                 num_samples=num_samples,
                                 val_split=val_split)
    else:
        val_indices, val_set = [], set()
        for i in range(int(num_samples * val_split)):
            val_index = np.random.randint(num_samples)
            while val_index in val_set:
                val_index = np.random.randint(num_samples)
            val_indices.append(val_index)
            val_set.add(val_index)

    print(val_indices[:5], np.sum(np.array(val_indices)))

    # sample validation set for debug purpose
    # val_indices = val_indices[:100]

    train_dataset["query_word_weight"] = train_dataset[
        "query_word_weight"][:, :args['deeplevel']]
    train_dataset["query_3gram_weight"] = train_dataset[
        "query_3gram_weight"][:, :args['deeplevel']]
    train_dataset["doc_word_weight"] = train_dataset[
        "doc_word_weight"][:, :args['deeplevel']]
    train_dataset["doc_3gram_weight"] = train_dataset[
        "doc_3gram_weight"][:, :args['deeplevel']]
    train_dataset["url_3gram_weight"] = train_dataset[
        "url_3gram_weight"][:, :args['deeplevel']]
    test_dataset["query_word_weight"] = test_dataset[
        "query_word_weight"][:, :args['deeplevel']]
    test_dataset["query_3gram_weight"] = test_dataset[
        "query_3gram_weight"][:, :args['deeplevel']]
    test_dataset["doc_word_weight"] = test_dataset[
        "doc_word_weight"][:, :args['deeplevel']]
    test_dataset["doc_3gram_weight"] = test_dataset[
        "doc_3gram_weight"][:, :args['deeplevel']]
    test_dataset["url_3gram_weight"] = test_dataset[
        "url_3gram_weight"][:, :args['deeplevel']]
    # print("SHAPEEEEEEEEEEEEEEEEEEEE: {}".format(len(train_dataset["query_word_weight"][100])))

    val_dataset = {}
    for key in train_dataset:
        val_dataset[key] = train_dataset[key][val_indices]
        train_dataset[key] = np.delete(train_dataset[key], val_indices, 0)

    # shuffle the train dataset explicitly to make results reproducible
    # whether the performance will be affected remains a question
    keys, values = [], []
    for key in train_dataset:
        keys.append(key)
        values.append(train_dataset[key])
    zipped_values = list(zip(*values))
    random.shuffle(zipped_values)
    shuffled_values = list(zip(*zipped_values))
    for i, key in enumerate(keys):
        train_dataset[key] = np.array(shuffled_values[i])
    print('after shuffle:', train_dataset['id'][:5], train_dataset['sim'][:5],
          train_dataset['query_word_input'][:5])

    # sample training dataset for debug purpose
    # sample_num = 1000
    # for key in train_dataset:
    #     train_dataset[key] = train_dataset[key][:sample_num]

    # merge the vocabulary of the train and test sets
    print("TRAIN vocab: word(%d) 3gram(%d) url(%d)" %
          (len(vocab['word']), len(vocab['3gram']), len(vocab['url'])))
    print("TEST vocab: word(%d) 3gram(%d) url(%d)" % (len(
        test_vocab['word']), len(test_vocab['3gram']), len(test_vocab['url'])))
    merged_vocab = {'url': vocab['url'], '3gram': vocab['3gram']}
    merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word'])
    print("merged vocab: word(%d) 3gram(%d) url(%d)" %
          (len(merged_vocab['word']), len(
              merged_vocab['3gram']), len(merged_vocab['url'])))
    vocab_inv, vocab_size = {}, {}
    vocab['char'] = merge_two_dicts(vocab['3gram'], vocab['url'])
    test_vocab['char'] = merge_two_dicts(test_vocab['3gram'],
                                         test_vocab['url'])
    merged_vocab['char'] = merge_two_dicts(vocab['char'], test_vocab['char'])

    for key in vocab:
        vocab_inv[key] = invert_dict(merged_vocab[key])
        vocab_size[key] = len(vocab[key])

    print(vocab_size)
    # Print data samples for debug purpose
    # print_dataset(mode, train_dataset, vocab_inv)
    # print_dataset(mode, test_dataset, vocab_inv)

    ############################ TRAIN MODEL #################################
    model = None
    if mode == 'deep_twitter':
        model = create_attention_model(max_query_len,
                                       max_doc_len,
                                       max_url_len,
                                       vocab_size,
                                       train_vocab_emb,
                                       args["nb_filters"],
                                       embed_size=300,
                                       dropout_rate=args['dropout'],
                                       trainable=args["trainable"],
                                       weighting=args['weighting'],
                                       mask=args["mask"],
                                       conv_option=args['conv_option'],
                                       model_option=args['model_option'],
                                       external=args["external_feat"],
                                       norm_weight=args['norm_weight'],
                                       cos_norm=args['cos'],
                                       only_word=args['only_word'],
                                       only_char=args['only_char'],
                                       pooling=args['pooling'],
                                       deeplevel=args['deeplevel'])
    elif mode == 'dssm':
        model = create_dssm_model(max_query_len,
                                  max_doc_len,
                                  max_url_len,
                                  vocab_size,
                                  train_vocab_emb,
                                  args["nb_filters"],
                                  embed_size=300,
                                  dropout_rate=args['dropout'],
                                  trainable=args["trainable"])
    model_name = (
        "model_N%s_data%s_mo%s_c%s_NumFilter%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f" %
        (mode, train_name, args['model_option'], args['conv_option'],
         args["nb_filters"], args["trainable"], args['dropout'],
         args['weighting'], args['mask'], args['batch_size'],
         args['val_split'])).lower()
    model_path = "%s/%s/%s" % (args['experimental_data'], data_name,
                               model_name)
    print(model_path)

    if args['optimizer'] == "adam":
        opt = optimizers.Adam(lr=args["learning_rate"],
                              beta_1=0.9,
                              beta_2=0.999,
                              epsilon=None,
                              decay=0.0,
                              amsgrad=False)
    elif args['optimizer'] == "sgd":
        opt = optimizers.SGD(lr=args["learning_rate"],
                             decay=1e-6,
                             momentum=0.9,
                             nesterov=True)

    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    print(model.summary())
    model_weights, parameter_num = get_model_weights(model)
    print('model init weights sum: {} of {} parameters'.format(
        model_weights, parameter_num))
    #

    if not args['load_model']:
        early_stopping = EarlyStopping(monitor='val_loss', patience=4)
        checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights",
                                     monitor='val_loss',
                                     save_best_only=True,
                                     verbose=1)
        lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                       factor=0.3,
                                       patience=3,
                                       min_lr=0.0001)

        fit_mode = "fit"
        if fit_mode == "fit":
            model.fit(
                train_dataset,
                train_dataset['sim'],  # validation_split=0.05,
                batch_size=args['batch_size'],
                validation_data=(val_dataset, val_dataset['sim']),
                epochs=args['epochs'],
                shuffle=False,
                callbacks=[checkpoint, lr_reducer, early_stopping],
                verbose=2)
        else:
            train_steps, train_batches = batch_iter(
                train_dataset,
                train_dataset["sim"],
                batch_size=args['batch_size'])
            valid_steps, valid_batches = batch_iter(
                val_dataset, val_dataset["sim"], batch_size=args['batch_size'])
            model.fit_generator(
                train_batches,
                train_steps,
                epochs=args['epochs'],
                validation_data=valid_batches,
                validation_steps=valid_steps,
                callbacks=[checkpoint, lr_reducer, early_stopping],
                verbose=2)

    #plot_model(model, to_file='model.png')
    ############################ TEST MODEL #################################
    print('load best model from %s.best.weights' % model_path)
    model.load_weights("%s.best.weights" % model_path)
    if mode == 'deep_twitter':
        # load trained vocab embedding.
        if args["only_char"]:
            merged_vocab_emb = None
        else:
            embedding_layer_name = 'word_embedding'
            trained_vocab_emb = model.get_layer(
                embedding_layer_name).get_weights()[0]
            # merge trained vocab embedding with test OOV word embeddings
            merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300))
            merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb
            merged_vocab_emb[len(vocab['word']):len(merged_vocab['word']
                                                    ), :] = test_vocab_emb
            for key in vocab:
                vocab_size[key] = len(merged_vocab[key])
            print(vocab_size)

        new_model = create_attention_model(max_query_len,
                                           max_doc_len,
                                           max_url_len,
                                           vocab_size,
                                           merged_vocab_emb,
                                           args["nb_filters"],
                                           embed_size=300,
                                           dropout_rate=args['dropout'],
                                           trainable=args["trainable"],
                                           weighting=args['weighting'],
                                           mask=args["mask"],
                                           conv_option=args['conv_option'],
                                           model_option=args['model_option'],
                                           external=args["external_feat"],
                                           norm_weight=args['norm_weight'],
                                           cos_norm=args['cos'],
                                           only_word=args['only_word'],
                                           only_char=args['only_char'],
                                           pooling=args['pooling'],
                                           deeplevel=args['deeplevel'])
        new_model.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
        # print(new_model.summary())

        num_layers = 0
        for layer in model.layers:
            num_layers += 1
        for layer_id in range(num_layers):
            layer = model.layers[layer_id]
            if not args["only_char"] and layer.name != embedding_layer_name:
                new_model.layers[layer_id].set_weights(layer.get_weights())
        print('copy weight done.')
        predictions = new_model.predict(test_dataset)
    elif mode == 'dssm':
        getter = K.function([model.layers[0].input, model.layers[1].input],
                            model.layers[-2].output)
        print('create DSSM functional getter...')
        num_samples, _, _ = test_dataset['query_3gram_input'].shape
        batch_size = 128
        num_batch = int(math.ceil(num_samples * 1.0 / batch_size))
        predictions = np.zeros((num_samples, ))
        for i in range(num_batch):
            start_idx, end_idx = i * batch_size, min(num_samples,
                                                     (i + 1) * batch_size)
            predictions[start_idx:end_idx] = getter([
                test_dataset['query_3gram_input'][start_idx:end_idx],
                test_dataset['doc_3gram_input'][start_idx:end_idx]
            ])[:, 0]

    #predictions = getter([test_dataset['query_3gram_input'], test_dataset['doc_3gram_input']])
    print(predictions[:10])
    predictions_file = "%s/%s/predictions_%s.txt" % (args["experimental_data"],
                                                     data_name, model_name)
    with open(predictions_file, 'w') as f:
        for i in range(test_dataset['id'].shape[0]):
            f.write("%s %.4f %s\n" %
                    (test_dataset['id'][i], predictions[i], args['mode']))
    print('write predictions with trec format to %s' % predictions_file)
    map, mrr, p30 = evaluate(predictions_file, args["qrels_file"])
    print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
Example #18
def gen_data(path, datasets, vocab, test_vocab, is_train, max_query_len,
             max_doc_len, max_url_len, nb_classes, args):
    if is_train:
        vocab['word']['PAD_WORD_INDEX'] = PAD_WORD_INDEX
        vocab['word']['OOV_WORD_INDEX'] = OOV_WORD_INDEX
        vocab['3gram']['PAD_3GRAM_INDEX'] = PAD_WORD_INDEX
        vocab['3gram']['OOV_3GRAM_INDEX'] = OOV_WORD_INDEX
    query_word_list, doc_word_list, query_3gram_list, doc_3gram_list = [], [], [], []
    all_url_list, all_ids_list, all_sim_list = [], [], []
    for data_name in datasets:  # there can be multiple data sets combined as the train or test data
        data_folder = "%s/%s" % (path, data_name)
        print('creating dataset %s' % data_name)
        t = time.time()
        q1_word_list, max_q1_word_len = read_sentences("%s/a.toks" %
                                                       data_folder,
                                                       vocab,
                                                       is_train,
                                                       "word",
                                                       test_vocab=test_vocab)
        q2_word_list, max_q2_word_len = read_sentences("%s/b.toks" %
                                                       data_folder,
                                                       vocab,
                                                       is_train,
                                                       "word",
                                                       test_vocab=test_vocab)
        q1_3gram_list, max_q1_3gram_len = read_sentences("%s/a.toks" %
                                                         data_folder,
                                                         vocab,
                                                         is_train,
                                                         "3gram",
                                                         test_vocab=test_vocab)
        q2_3gram_list, max_q2_3gram_len = read_sentences("%s/b.toks" %
                                                         data_folder,
                                                         vocab,
                                                         is_train,
                                                         "3gram",
                                                         test_vocab=test_vocab)
        url_list, max_url_len_dataset = [], 0
        if os.path.exists("%s/url.txt" % data_folder):
            url_list, max_url_len_dataset = read_urls(
                "%s/url.txt" % data_folder, vocab, is_train, '3gram')
        ids_list = read_metadata("%s/id.txt" % data_folder)
        if is_train:
            max_query_len['word'] = max(max_query_len['word'],
                                        min(max_q1_word_len, MAX_WORD_LENGTH))
            max_query_len['3gram'] = max(
                max_query_len['3gram'], min(max_q1_3gram_len,
                                            MAX_3GRAM_LENGTH))
            max_doc_len['word'] = max(max_doc_len['word'],
                                      min(max_q2_word_len, MAX_WORD_LENGTH))
            max_doc_len['3gram'] = max(max_doc_len['3gram'],
                                       min(max_q2_3gram_len, MAX_3GRAM_LENGTH))
            max_url_len['url'] = max(max_url_len['url'],
                                     min(max_url_len_dataset, MAX_URL_LENGTH))
        sim_list = read_relevance("%s/sim.txt" % data_folder)
        categorical_sim_list = np.zeros((len(sim_list), nb_classes),
                                        dtype='int')
        for i, sim in enumerate(sim_list):
            categorical_sim_list[i][sim] = 1
        print(sim_list[:5], categorical_sim_list[:5])
        query_word_list.extend(q1_word_list)
        doc_word_list.extend(q2_word_list)
        query_3gram_list.extend(q1_3gram_list)
        doc_3gram_list.extend(q2_3gram_list)
        all_url_list.extend(url_list)
        all_ids_list.extend(ids_list)
        all_sim_list.extend(categorical_sim_list)
        print("q1 max_word_len: %d, q2 max_word_len: %d, len limit: (%d, %d)" %
              (max_q1_word_len, max_q2_word_len, max_query_len['word'],
               max_doc_len['word']))
        print(
            "q1 max_3gram_len: %d, q2 max_3gram_len: %d, len limit: (%d, %d)" %
            (max_q1_3gram_len, max_q2_3gram_len, max_query_len['3gram'],
             max_doc_len['3gram']))
        print('max_url_len: %d, limit: %d' %
              (max_url_len_dataset, max_url_len['url']))
        print('creating dataset done: %d' % (time.time() - t))

    # question padding
    data = {'sim': np.array(all_sim_list), 'id': np.array(all_ids_list)}
    data['query_word_input'] = pad_sequences(query_word_list,
                                             maxlen=max_query_len['word'],
                                             value=PAD_WORD_INDEX,
                                             padding='post',
                                             truncating='post')
    data['query_word_mask'] = create_masks(data['query_word_input'], args)
    data['doc_word_input'] = pad_sequences(doc_word_list,
                                           maxlen=max_doc_len['word'],
                                           value=PAD_WORD_INDEX,
                                           padding='post',
                                           truncating='post')
    data['doc_word_mask'] = create_masks(data['doc_word_input'], args)
    data['query_3gram_input'] = pad_sequences(query_3gram_list,
                                              maxlen=max_query_len['3gram'],
                                              value=PAD_WORD_INDEX,
                                              padding='post',
                                              truncating='post')
    data['query_3gram_mask'] = create_masks(data['query_3gram_input'], args)
    data['doc_3gram_input'] = pad_sequences(doc_3gram_list,
                                            maxlen=max_doc_len['3gram'],
                                            value=PAD_WORD_INDEX,
                                            padding='post',
                                            truncating='post')
    data['doc_3gram_mask'] = create_masks(data['doc_3gram_input'], args)
    data['url_3gram_input'] = pad_sequences(all_url_list,
                                            maxlen=max_url_len['url'],
                                            value=PAD_WORD_INDEX,
                                            padding='post',
                                            truncating='pre')
    data['url_3gram_mask'] = create_masks(data['url_3gram_input'], args)

    if os.path.exists("%s/collection_ngram_idf.json" % path):
        t = time.time()
        weights = json.load(open("%s/collection_ngram_idf.json" % path, "r"))
        vocab_inv = invert_dict(vocab['3gram'])
        data['query_3gram_weight'] = inject_ngram_weight(
            data['query_3gram_input'], vocab_inv, weights)
        data['doc_3gram_weight'] = inject_ngram_weight(data['doc_3gram_input'],
                                                       vocab_inv, weights)
        data['url_3gram_weight'] = inject_ngram_weight(data['url_3gram_input'],
                                                       vocab_inv, weights)
        print('ngram weight injection done: %d' % (time.time() - t))
    else:
        num_samples, max_query_len = data['query_3gram_input'].shape
        data['query_3gram_weight'] = np.ones(
            (num_samples, ATTENTION_DEEP_LEVEL, max_query_len))
        data['doc_3gram_weight'] = np.ones((num_samples, ATTENTION_DEEP_LEVEL,
                                            data['doc_3gram_input'].shape[1]))

    if os.path.exists("%s/collection_word_idf.json" % path):
        t = time.time()
        weights = json.load(open("%s/collection_word_idf.json" % path, "r"))
        merge_vocab = merge_two_dicts(vocab['word'], test_vocab['word'])
        vocab_inv = invert_dict(merge_vocab)
        print('inject query IDF weights')
        data['query_word_weight'] = inject_word_weight(
            data['query_word_input'], vocab_inv, weights)
        print('inject doc IDF weights')
        data['doc_word_weight'] = inject_word_weight(data['doc_word_input'],
                                                     vocab_inv, weights)
        data['overlap_feat'] = compute_overlap_feat(data['query_word_input'],
                                                    data['doc_word_input'],
                                                    vocab_inv, weights)
        print('word weight injection done: %d' % (time.time() - t))

    return data
Example #19
def load_def(localdir, ent_name, section_def, required_fields):
    if 'type' in section_def and 'fields' in section_def:
        raise Exception("invalid structure for '%s': "
                        "type and fields sections are mutually exclusive" %
                        ent_name)

    if 'type' in section_def:
        csv_filename = section_def.get('path', ent_name + ".csv")
        csv_filepath = complete_path(localdir, csv_filename)
        str_type = section_def['type']
        if isinstance(str_type, basestring):
            celltype = field_str_to_type(str_type, "array '%s'" % ent_name)
        else:
            assert isinstance(str_type, type)
            celltype = str_type
        return 'ndarray', load_ndarray(csv_filepath, celltype)

    fields_def = section_def.get('fields')
    if fields_def is not None:
        for fdef in fields_def:
            if isinstance(fdef, basestring):
                raise SyntaxError("invalid field declaration: '%s', you are "
                                  "probably missing a ':'" % fdef)
        if all(isinstance(fdef, dict) for fdef in fields_def):
            fields = fields_yaml_to_type(fields_def)
        else:
            assert all(isinstance(fdef, tuple) for fdef in fields_def)
            fields = fields_def
        fnames = {name for name, _ in fields}
        for reqname, reqtype in required_fields[::-1]:
            if reqname not in fnames:
                fields.insert(0, (reqname, reqtype))
    else:
        fields = None
    newnames = merge_dicts(invert_dict(section_def.get('oldnames', {})),
                           section_def.get('newnames', {}))
    transpose = section_def.get('transposed', False)

    interpolate_def = section_def.get('interpolate')
    files_def = section_def.get('files')
    if files_def is None:
        # XXX: it might be cleaner to use the same code path as for the
        # multi-file case (however, that would lose the "import any file
        # size" feature that I'm fond of).

        # we can simply return the stream as-is
        # FIXME: stream is not sorted
        # csv file is assumed to be in the correct order (ie by period then id)
        csv_filename = section_def.get('path', ent_name + ".csv")
        csv_filepath = complete_path(localdir, csv_filename)
        csv_file = CSV(csv_filepath,
                       newnames,
                       delimiter=',',
                       transpose=transpose)
        stream = csv_file.read(fields)
        if fields is None:
            fields = csv_file.fields
        if interpolate_def is not None:
            raise Exception('interpolate is currently only supported with '
                            'multiple files')
        return 'table', (fields, csv_file.numlines, stream, csv_file)
    else:
        # we have to load all files, merge them and return a stream out of that
        print(" * computing number of rows...")

        # 1) only load required fields
        default_args = dict(newnames=newnames, transpose=transpose)
        if isinstance(files_def, dict):
            files_items = files_def.items()
        elif isinstance(files_def, list) and files_def:
            if isinstance(files_def[0], dict):
                # handle YAML ordered dict structure
                files_items = [d.items()[0] for d in files_def]
            elif isinstance(files_def[0], basestring):
                files_items = [(path, {}) for path in files_def]
            else:
                raise Exception("invalid structure for 'files'")
        else:
            raise Exception("invalid structure for 'files'")

        # XXX: shouldn't we use the "path" defined for the whole entity if any?
        # section_def.get('path')
        files = []
        for path, kwargs in files_items:
            kwargs['newnames'] = \
                merge_dicts(invert_dict(kwargs.pop('oldnames', {})),
                            kwargs.get('newnames', {}))
            f = CSV(complete_path(localdir, path),
                    **merge_dicts(default_args, kwargs))
            files.append(f)
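        # the union of the required-field rows across all files determines
        # the full set of rows in the merged output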
        id_periods = union1d(f.as_array(required_fields) for f in files)

        print(" * reading files...")
        # 2) load all fields
        if fields is None:
            target_fields = merge_items(*[f.fields for f in files])
            fields_per_file = [None for _ in files]
        else:
            target_fields = fields
            fields_per_file = [[(name, type_) for name, type_ in target_fields
                                if name in f.field_names] for f in files]
            total_fields = set.union(*[set(f.field_names) for f in files])
            missing = set(name for name, _ in target_fields) - total_fields
            if missing:
                raise Exception("the following fields were not found in any "
                                "file: %s" % ", ".join(missing))

        total_lines = len(id_periods)

        # allocate main array
        target = get_default_array(total_lines, np.dtype(target_fields))
        target['period'] = id_periods['period']
        target['id'] = id_periods['id']

        arrays = [
            f.as_array(fields_to_load)
            for f, fields_to_load in zip(files, fields_per_file)
        ]

        # close all files
        for f in files:
            f.close()

        # FIXME: interpolation currently only interpolates missing data points,
        # not data points whose value equals the missing value corresponding
        # to the field type. This can only be fixed once booleans are loaded
        # as int8.
        if interpolate_def is not None:
            if any(v != 'previous_value'
                   for v in interpolate_def.itervalues()):
                raise Exception("currently, only 'previous_value' "
                                "interpolation is supported")
            to_interpolate = [
                k for k, v in interpolate_def.iteritems()
                if v == 'previous_value'
            ]
        else:
            to_interpolate = []

        interpolate(target, arrays, id_periods, to_interpolate)
        return 'table', (target_fields, total_lines, iter(target), None)
Ejemplo n.º 20
0
    def bucketize(self):
        tuples = []
        ctr = 0
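        # pair each (src, targ) sentence with its bucket key, a (src_len, targ_len) tuple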
        for src, targ in zip(self.train_sent, self.targ_sent):
            len_tup = self.bisect.twod_bisect(src, targ)
            tuples.append((src, targ, len_tup))
            
        sorted_keys = sorted(tuples, key=operator.itemgetter(2))
        grouped = groupby(sorted_keys, lambda x: x[2])
        self.sorted_keys = map(lambda x: x[2], sorted_keys)
        self.bucketed_data = [] 
        self.bucket_idx_to_key = []

        global_count = 0L
        error_count  = 0L        

        for group in grouped:
            
            # get src and targ sentences, ignore the last elem of the tuple 
            # (the grouping key of (src_len, targ_len))
            key, value = group[0], map(lambda x: x[:2], group[1])
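            # skip buckets that cannot fill a complete batch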
            if len(value) < self.batch_size:
                continue

            # create padded representation
            new_src = np.full((len(value), key[0]), self.pad_id, dtype=self.dtype)
            new_targ = np.full((len(value), key[1] + 1), self.pad_id, dtype=self.dtype)
            new_label = np.full((len(value), key[1] + 1), self.pad_id, dtype=self.dtype)
            
            for idx, example in enumerate(value):
                try:
                    global_count += 1
                    curr_src, curr_targ = example
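                    # source is reversed and right-aligned in its padded row;
                    # the decoder input gets a leading <go> token and the
                    # label is the target shifted left, ending with <eos>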
                    rev_src = curr_src[::-1]
                    new_src[idx, -len(curr_src):] = rev_src

                    new_targ[idx, 0] = self.go_id
                    new_targ[idx, 1:(len(curr_targ)+1)] = curr_targ

                    new_label[idx, 0:len(curr_targ)] = curr_targ
                    new_label[idx, len(curr_targ)] = self.eos_id
                except ValueError as ve:
                    error_count += 1
                    print(ve.message)
                    print("global count: %d, error count: %d" % (global_count, error_count))
                    continue
                            
            self.bucketed_data.append((new_src, new_targ, new_label))

            self.bucket_idx_to_key.append((key[0], key[1]+1))


        # invert_dict turns {bucket_index: bucket_key} into {bucket_key: bucket_index}
        self.bucket_key_to_idx = invert_dict(dict(enumerate(self.bucket_idx_to_key)))
        self.interbucket_idx = -1
        self.curr_bucket_id = None
        self.curr_chunks = None
        self.curr_buck = None
        self.switch_bucket = True
        self.num_buckets = len(self.bucket_idx_to_key)
        self.bucket_iterator_indices = list(range(self.num_buckets))
        self.default_bucket_key = self.sorted_keys[-1]
Ejemplo n.º 21
0
import utils

content_types = utils.invert_dict(
    {
        "text/html": ["htm", "html"],
        "application/json": ["json"],
        "application/xhtml+xml": ["xht", "xhtm", "xhtml"],
        "application/xml": ["xml"],
        "application/x-xpinstall": ["xpi"],
        "text/javascript": ["js"],
        "text/css": ["css"],
        "text/plain": ["txt", "md"],
        "image/svg+xml": ["svg"],
        "image/gif": ["gif"],
        "image/jpeg": ["jpg", "jpeg"],
        "image/png": ["png"],
        "image/bmp": ["bmp"],
        "text/event-stream": ["event_stream"],
        "text/cache-manifest": ["manifest"],
        "video/mp4": ["mp4", "m4v"],
        "audio/mp4": ["m4a"],
        "audio/mpeg": ["mp3"],
        "video/webm": ["webm"],
        "audio/webm": ["weba"],
        "video/ogg": ["ogg", "ogv"],
        "audio/ogg": ["oga"],
        "audio/x-wav": ["wav"],
        "text/vtt": ["vtt"],
    }
)
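
# Presumably utils.invert_dict expands the list values here, yielding an
# extension -> content type lookup, e.g. content_types["html"] == "text/html".
# A minimal sketch of that behaviour (an assumption, not the actual utils code):
#
#     def invert_dict(d):
#         return {ext: ctype for ctype, exts in d.items() for ext in exts}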

response_codes = {
Ejemplo n.º 22
0
def main(options):
    args = get_default_args()
    load_best_args(args, options, get_best_args())
    set_args(args, options)
    print_args(args)
    mode = args['mode']
    train_name, test_name = args['split']['train'], args['split']['test']
    if train_name == 'train_all':
        train_set = ['trec-2011', 'trec-2012', 'trec-2013', 'trec-2014']
        train_set.remove(test_name)
    else:
        train_set = [train_name]
    test_set = test_name
    print('train_set: {}, test_set: {}'.format(train_set, test_set))
    max_query_len, max_doc_len, max_url_len = (defaultdict(int),
                                               defaultdict(int),
                                               defaultdict(int))
    vocab = {'word': {}, '3gram': {}, 'url': {}}
    test_vocab = {'word': {}, '3gram': {}, 'url': {}}

    ############################# LOAD DATA ##################################
    data_name = ("data_m%s_%s_%s" % (mode, train_name, test_name)).lower()
    if args["load_data"]:
        train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True)
        test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, test_name),
            False)
        print('load dataset successfully')
    else:
        train_dataset = gen_data(args["raw_data"], train_set, vocab,
                                 test_vocab, True, max_query_len, max_doc_len,
                                 max_url_len, args)
        print("create training set successfully...")
        test_dataset = gen_data(args["raw_data"], [test_set], vocab,
                                test_vocab, False, max_query_len, max_doc_len,
                                max_url_len, args)
        train_vocab_emb, test_vocab_emb = construct_vocab_emb(
            "%s/%s" % (args["experimental_data"], data_name),
            vocab['word'],
            test_vocab['word'],
            300,
            "word",
            base_embed_path=args["base_embed_path"])
        save_data(
            "%s/%s/%s" % (args["experimental_data"], data_name, train_name),
            True, train_dataset, max_query_len, max_doc_len, max_url_len,
            vocab, train_vocab_emb)
        print("save training set successfully...")
        save_data("%s/%s/%s" %
                  (args["experimental_data"], data_name, test_name),
                  False,
                  test_dataset,
                  vocab=test_vocab,
                  vocab_emb=test_vocab_emb)
        print("save test set successfully...")

    val_split = args['val_split']
    num_samples, _ = train_dataset["query_word_input"].shape
    # randomly sample queries and all their documents if query_random is True
    # otherwise, query-doc pairs are randomly sampled
    query_random = True
    if query_random:
        val_indices = sample_val_set(args["raw_data"], train_set, val_split)
    else:
        val_indices, val_set = [], set()
        for i in range(int(num_samples * val_split)):
            val_index = np.random.randint(num_samples)
            while val_index in val_set:
                val_index = np.random.randint(num_samples)
            val_indices.append(val_index)
            val_set.add(val_index)

    val_dataset = {}
    for key in train_dataset:
        val_dataset[key] = train_dataset[key][val_indices]
        train_dataset[key] = np.delete(train_dataset[key], val_indices, 0)

    # shuffle the training set explicitly so that results are reproducible;
    # whether this affects performance remains an open question
    keys, values = [], []
    for key in train_dataset:
        keys.append(key)
        values.append(train_dataset[key])
    zipped_values = list(zip(*values))
    random.shuffle(zipped_values)
    shuffled_values = list(zip(*zipped_values))
    for i, key in enumerate(keys):
        train_dataset[key] = np.array(shuffled_values[i])
    print('after shuffle: id {}, sim {}, query_word_input'.format(
        train_dataset['id'][:3], train_dataset['sim'][:3],
        train_dataset['query_word_input'][:3]))

    # merge the vocabularies of the train and test sets
    merged_vocab = {'url': vocab['url'], '3gram': vocab['3gram']}
    merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word'])
    print("merged vocab: word(%d) 3gram(%d)" %
          (len(merged_vocab['word']), len(test_vocab['3gram'])))
    vocab_inv, vocab_size = {}, {}
    vocab['char'] = merge_two_dicts(vocab['3gram'], vocab['url'])
    test_vocab['char'] = merge_two_dicts(test_vocab['3gram'],
                                         test_vocab['url'])
    merged_vocab['char'] = merge_two_dicts(vocab['char'], test_vocab['char'])

    for key in vocab:
        vocab_inv[key] = invert_dict(merged_vocab[key])
        vocab_size[key] = len(vocab[key])
    print(vocab_size)

    # Print data samples for debug purpose
    print_dataset(mode, train_dataset, vocab_inv)
    print_dataset(mode, test_dataset, vocab_inv)

    ############################ TRAIN MODEL #################################
    model = None
    if mode == 'deep_twitter':
        model = create_attention_model(max_query_len,
                                       max_doc_len,
                                       max_url_len,
                                       vocab_size,
                                       train_vocab_emb,
                                       args["nb_filters"],
                                       embed_size=300,
                                       dropout_rate=args['dropout'],
                                       trainable=args["trainable"],
                                       weighting=args['weighting'],
                                       mask=args["mask"],
                                       conv_option=args['conv_option'],
                                       model_option=args['model_option'])
    model_name = (
        "model_N%s_data%s_mo%s_c%s_NumFilter%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f" %
        (mode, train_name, args['model_option'], args['conv_option'],
         args["nb_filters"], args["trainable"], args['dropout'],
         args['weighting'], args['mask'], args['batch_size'],
         args['val_split'])).lower()
    model_path = "%s/%s/%s" % (args['experimental_data'], data_name,
                               model_name)
    print(model_path)

    if args['optimizer'] == "adam":
        opt = optimizers.Adam(lr=args["learning_rate"],
                              beta_1=0.9,
                              beta_2=0.999,
                              epsilon=None,
                              decay=0.0,
                              amsgrad=True)
        print('use Adam optimizer')
    elif args['optimizer'] == "sgd":
        opt = optimizers.SGD(lr=args["learning_rate"],
                             decay=1e-6,
                             momentum=0.9,
                             nesterov=True)
        print('use SGD optimizer')
    elif args['optimizer'] == 'rmsprop':
        opt = optimizers.RMSprop(lr=args["learning_rate"],
                                 rho=0.9,
                                 epsilon=None,
                                 decay=0.0)
        print('use RMSprop optimizer')

    model.compile(loss='binary_crossentropy',
                  optimizer=opt,
                  metrics=['accuracy'])
    print(model.summary())
    print('model init weights sum: %.4f' % get_model_weights(model))
    if not args['load_model']:
        early_stopping = EarlyStopping(monitor='val_loss', patience=4)
        checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights",
                                     monitor='val_loss',
                                     save_best_only=True,
                                     verbose=1)
        lr_reducer = ReduceLROnPlateau(monitor='val_loss',
                                       factor=0.5,
                                       patience=2,
                                       min_lr=0.0001,
                                       verbose=1)
        #print(train_dataset['id'][:3], val_dataset['id'][:3], val_dataset['id'][-3:])
        model.fit(train_dataset,
                  train_dataset['sim'],
                  validation_data=(val_dataset, val_dataset['sim']),
                  batch_size=args['batch_size'],
                  epochs=args['epochs'],
                  shuffle=False,
                  callbacks=[checkpoint, lr_reducer, early_stopping],
                  verbose=args['verbose'])

    ############################ TEST MODEL #################################
    print('load best model from %s.best.weights' % model_path)
    model.load_weights("%s.best.weights" % model_path)
    if mode == 'deep_twitter':
        # load trained vocab embedding.
        trained_vocab_emb = model.get_layer('sequential_2').get_weights()[0]
        # merge trained vocab embedding with test OOV word embeddings
        merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300))
        merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb
        merged_vocab_emb[
            len(vocab['word']):len(merged_vocab['word']), :] = test_vocab_emb
        for key in vocab:
            vocab_size[key] = len(merged_vocab[key])
        print(vocab_size)

        new_model = create_attention_model(max_query_len,
                                           max_doc_len,
                                           max_url_len,
                                           vocab_size,
                                           merged_vocab_emb,
                                           args["nb_filters"],
                                           embed_size=300,
                                           dropout_rate=args['dropout'],
                                           trainable=args["trainable"],
                                           weighting=args['weighting'],
                                           mask=args["mask"],
                                           conv_option=args['conv_option'],
                                           model_option=args['model_option'])
        new_model.compile(loss='binary_crossentropy',
                          optimizer='adam',
                          metrics=['accuracy'])
        print(new_model.summary())
        num_layers = len(model.layers)
        # copy every trained layer except the rebuilt embedding layer
        # ('sequential_2'), which now holds the merged vocabulary
        for layer_id in range(num_layers):
            layer = model.layers[layer_id]
            if layer.name != 'sequential_2':
                new_model.layers[layer_id].set_weights(layer.get_weights())
        print('copy weight done.')
        predictions = new_model.predict(test_dataset)

    print(predictions[:10])
    predictions_file = "%s/%s/predictions_%s.txt" % (args["experimental_data"],
                                                     data_name, model_name)
    with open(predictions_file, 'w') as f:
        for i in range(test_dataset['id'].shape[0]):
            f.write("%s %.4f %s\n" %
                    (test_dataset['id'][i], predictions[i], args['mode']))
    print('write predictions with trec format to %s' % predictions_file)
    map, mrr, p30 = evaluate(predictions_file, args["qrels_file"])
    print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
Ejemplo n.º 23
0
    "G": 379.,
    "U": 340.,
}

one_to_three = {
    'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp',
    'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His',
    'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met',
    'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg',
    'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp',
    'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', 
    'U':'Sec', 'J':'Xle', 'O':'Pyl'
    }


standard_three_to_one = utils.invert_dict(one_to_three)
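# the inverted mapping converts three-letter codes back to one-letter codes,
# e.g. standard_three_to_one['Ala'] == 'A'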

extended_three_to_one= {
'2as':'D', '3ah':'H', '5hp':'E', 'Acl':'R', 'Agm':'R', 'Aib':'A', 'Ala':'A', 'Alm':'A', 'Alo':'T', 'Aly':'K', 'Arg':'R', 'Arm':'R', 'Asa':'D', 'Asb':'D', 'Ask':'D', 'Asl':'D', 'Asn':'N', 'Asp':'D', 'Asq':'D', 'Asx':'B', 'Aya':'A', 'Bcs':'C', 'Bhd':'D', 'Bmt':'T', 'Bnn':'A', 'Buc':'C', 'Bug':'L', 'C5c':'C', 'C6c':'C', 'Ccs':'C', 'Cea':'C', 'Cgu':'E', 'Chg':'A', 'Cle':'L', 'Cme':'C', 'Csd':'A', 'Cso':'C', 'Csp':'C', 'Css':'C', 'Csw':'C', 'Csx':'C', 'Cxm':'M', 'Cy1':'C', 'Cy3':'C', 'Cyg':'C', 'Cym':'C', 'Cyq':'C', 'Cys':'C', 'Dah':'F', 'Dal':'A', 'Dar':'R', 'Das':'D', 'Dcy':'C', 'Dgl':'E', 'Dgn':'Q', 'Dha':'A', 'Dhi':'H', 'Dil':'I', 'Div':'V', 'Dle':'L', 'Dly':'K', 'Dnp':'A', 'Dpn':'F', 'Dpr':'P', 'Dsn':'S', 'Dsp':'D', 'Dth':'T', 'Dtr':'W', 'Dty':'Y', 'Dva':'V', 'Efc':'C', 'Fla':'A', 'Fme':'M', 'Ggl':'E', 'Gl3':'G', 'Gln':'Q', 'Glu':'E', 'Glx':'Z', 'Gly':'G', 'Glz':'G', 'Gma':'E', 'Gsc':'G', 'Hac':'A', 'Har':'R', 'Hic':'H', 'Hip':'H', 'His':'H', 'Hmr':'R', 'Hpq':'F', 'Htr':'W', 'Hyp':'P', 'Iil':'I', 'Ile':'I', 'Iyr':'Y', 'Kcx':'K', 'Leu':'L', 'Llp':'K', 'Lly':'K', 'Ltr':'W', 'Lym':'K', 'Lys':'K', 'Lyz':'K', 'Maa':'A', 'Men':'N', 'Met':'M', 'Mhs':'H', 'Mis':'S', 'Mle':'L', 'Mpq':'G', 'Msa':'G', 'Mse':'M', 'Mva':'V', 'Nem':'H', 'Nep':'H', 'Nle':'L', 'Nln':'L', 'Nlp':'L', 'Nmc':'G', 'Oas':'S', 'Ocs':'C', 'Omt':'M', 'Paq':'Y', 'Pca':'E', 'Pec':'C', 'Phe':'F', 'Phi':'F', 'Phl':'F', 'Pr3':'C', 'Pro':'P', 'Prr':'A', 'Ptr':'Y', 'Pyl':'O', 'Sac':'S', 'Sar':'G', 'Sch':'C', 'Scs':'C', 'Scy':'C', 'Sec':'U', 'Sel':'U', 'Sep':'S', 'Ser':'S', 'Set':'S', 'Shc':'C', 'Shr':'K', 'Smc':'C', 'Soc':'C', 'Sty':'Y', 'Sva':'S', 'Ter':'*', 'Thr':'T', 'Tih':'A', 'Tpl':'W', 'Tpo':'T', 'Tpq':'A', 'Trg':'K', 'Tro':'W', 'Trp':'W', 'Tyb':'Y', 'Tyq':'Y', 'Tyr':'Y', 'Tys':'Y', 'Tyy':'Y', 'Unk':'X', 'Val':'V', 'Xaa':'X', 'Xer':'X', 'Xle':'J'}
# Initial table is from the ASTRAL RAF release notes.
# added UNK
# Extra IUPAC: Xle, Xaa, Sec, Pyl
# The following have been seen in biopython code.
# Ter : '*'     Termination
# Sel : 'U'     A typo for Sec, selenocysteine? 
# Xer : 'X'     Another alternative for unknown?


amino_acid_names = {
    'A': 'alanine',
    'M': 'methionine',
Ejemplo n.º 24
0
import utils

content_types = utils.invert_dict({
    "text/html": ["htm", "html"],
    "application/xhtml+xml": ["xht", "xhtm", "xhtml"],
    "text/javascript": ["js"],
    "text/css": ["css"],
    "text/plain": ["txt", "md"],
    "text/xml": ["xml"],
    "image/svg+xml": ["svg"],
    "image/jpeg": ["jpg", "jpeg"],
    "image/png": ["png"],
    "text/event-stream": ["event_stream"],
    "text/cache-manifest": ["manifest"],
    "video/mp4": ["mp4", "m4v"],
    "audio/mp4": ["m4a"],
    "audio/mpeg": ["mp3"],
    "video/webm": ["webm"],
    "audio/webm": ["weba"],
    "video/ogg": ["ogg", "ogv"],
    "audio/ogg": ["oga"],
    "audio/x-wav": ["wav"],
    "text/vtt": ["vtt"],
})

response_codes = {
    100: ('Continue', 'Request received, please continue'),
    101:
    ('Switching Protocols', 'Switching to new protocol; obey Upgrade header'),
    200: ('OK', 'Request fulfilled, document follows'),
    201: ('Created', 'Document created, URL follows'),
Ejemplo n.º 25
0
import utils

SFA = "http://developer.apple.com/namespaces/sfa"
SF = "http://developer.apple.com/namespaces/sf"
XSI = "http://www.w3.org/2001/XMLSchema-instance"
KEY = "http://developer.apple.com/namespaces/keynote2"

NSMAP = {
    "sfa": SFA,
    "sf": SF,
    "xsi": XSI,
    "key": KEY,
}

NAMESPACE_TO_URL = {k:"{"+v+"}" for k,v in NSMAP.items()}
URL_TO_NAMESPACE = utils.invert_dict(NAMESPACE_TO_URL)
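# the inverse lookup maps an expanded "{url}" form back to its short prefix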

def ns(qname):
    """ returns the lxml representation of an xml namespace,
        using a static lookup table. """
    if len(qname) and qname[0] == "{":
        return qname
    i = qname.find(":")
    if i<0:
        return qname
    return NAMESPACE_TO_URL[qname[0:i]] + qname[i+1:]

class XMLError(Exception):
    pass

class Element(object):
Ejemplo n.º 26
0
                                                  inverse_vocabulary,
                                                  weighted_feature_numbers)
        style_features_and_weights[style] = ranked_features

    return style_features_and_weights


## STEP 1.
## LOAD AND PREPARE DATA
x_tr, y_tr = load_train_set()

# edit fit_vectorizer() to create custom vectorizer for dataset
# otherwise, load existing vectorizer under DATA_VECTORIZER_PATH
# fit_vectorizer(x_tr)
vectorizer = load_model(DATA_VECTORIZER_PATH)
inverse_vocabulary = invert_dict(vectorizer.vocabulary_)

## STEP 2.
## TRAIN MODEL TO OBTAIN STYLE WEIGHTS
# to experiment with style weighting, edit parameters and train new model
regularization_type = 'l1'
C = 3
lr_path = f'../models/style_weights_extractor_{regularization_type}_reg_C_{C}.pkl'
# vec_x_tr = vectorizer.transform(x_tr)
# lr_model = train(regularization_type, C, vec_x_tr, y_tr)
# save_model(lr_model, lr_path)

model = load_model(lr_path)

## STEP 3.
## EXTRACT STYLE FEATURES AND WEIGHTS
Ejemplo n.º 27
0
def gen_data(path, datasets, vocab, test_vocab, is_train, max_query_len,
             max_doc_len, max_url_len, args):
    if is_train:
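        # reserve fixed indices for padding and out-of-vocabulary tokens
        # in every vocabulary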
        vocab['word']['PAD_WORD_INDEX'] = PAD_WORD_INDEX
        vocab['word']['OOV_WORD_INDEX'] = OOV_WORD_INDEX
        vocab['3gram']['PAD_3GRAM_INDEX'] = PAD_WORD_INDEX
        vocab['3gram']['OOV_3GRAM_INDEX'] = OOV_WORD_INDEX
        vocab['url']['PAD_URL_INDEX'] = PAD_WORD_INDEX
        vocab['url']['OOV_URL_INDEX'] = OOV_WORD_INDEX
    query_word_list, doc_word_list, query_3gram_list, doc_3gram_list = [], [], [], []
    all_url_list, all_ids_list, all_sim_list = [], [], []
    t0 = time.time()
    for data_name in datasets:  # multiple datasets may be combined to form the train or test data
        data_folder = "%s/%s" % (path, data_name)
        print('load dataset %s' % data_name)
        t = time.time()
        q1_word_list, max_q1_word_len = read_sentences("%s/a.toks" %
                                                       data_folder,
                                                       vocab,
                                                       is_train,
                                                       "word",
                                                       test_vocab=test_vocab)
        q2_word_list, max_q2_word_len = read_sentences("%s/b.toks" %
                                                       data_folder,
                                                       vocab,
                                                       is_train,
                                                       "word",
                                                       test_vocab=test_vocab)
        q1_3gram_list, max_q1_3gram_len = read_sentences("%s/a.toks" %
                                                         data_folder,
                                                         vocab,
                                                         is_train,
                                                         "3gram",
                                                         test_vocab=test_vocab)
        q2_3gram_list, max_q2_3gram_len = read_sentences("%s/b.toks" %
                                                         data_folder,
                                                         vocab,
                                                         is_train,
                                                         "3gram",
                                                         test_vocab=test_vocab)
        url_list, max_url_len_dataset = read_urls("%s/url.txt" % data_folder,
                                                  vocab, is_train, '3gram')
        ids_list = read_metadata("%s/id.txt" % data_folder)
        if is_train:
            max_query_len['word'] = max(max_query_len['word'], max_q1_word_len)
            max_query_len['3gram'] = max(max_query_len['3gram'],
                                         max_q1_3gram_len)
            max_doc_len['word'] = max(max_doc_len['word'], max_q2_word_len)
            max_doc_len['3gram'] = max(max_doc_len['3gram'],
                                       min(max_q2_3gram_len, MAX_TWEET_LENGTH))
            max_url_len['url'] = max(max_url_len['url'],
                                     min(max_url_len_dataset, MAX_URL_LENGTH))
        sim_list = read_relevance("%s/sim.txt" % data_folder)
        query_word_list.extend(q1_word_list)
        doc_word_list.extend(q2_word_list)
        query_3gram_list.extend(q1_3gram_list)
        doc_3gram_list.extend(q2_3gram_list)
        all_url_list.extend(url_list)
        all_ids_list.extend(ids_list)
        all_sim_list.extend(sim_list)
        print("q1 max_word_len: %d, q2 max_word_len: %d, len limit: (%d, %d)" %
              (max_q1_word_len, max_q2_word_len, max_query_len['word'],
               max_doc_len['word']))
        print(
            "q1 max_3gram_len: %d, q2 max_3gram_len: %d, len limit: (%d, %d)" %
            (max_q1_3gram_len, max_q2_3gram_len, max_query_len['3gram'],
             max_doc_len['3gram']))
        print('max_url_len: %d, limit: %d' %
              (max_url_len_dataset, max_url_len['url']))
        print('load dataset done: %d' % (time.time() - t))

    # question padding
    data = {'sim': np.array(all_sim_list), 'id': np.array(all_ids_list)}
    data['query_word_input'] = pad_sequences(query_word_list,
                                             maxlen=max_query_len['word'],
                                             value=PAD_WORD_INDEX,
                                             padding='post',
                                             truncating='post')
    data['query_word_mask'] = create_masks(data['query_word_input'], args)
    data['doc_word_input'] = pad_sequences(doc_word_list,
                                           maxlen=max_doc_len['word'],
                                           value=PAD_WORD_INDEX,
                                           padding='post',
                                           truncating='post')
    data['doc_word_mask'] = create_masks(data['doc_word_input'], args)
    data['query_3gram_input'] = pad_sequences(query_3gram_list,
                                              maxlen=max_query_len['3gram'],
                                              value=PAD_WORD_INDEX,
                                              padding='post',
                                              truncating='post')
    data['query_3gram_mask'] = create_masks(data['query_3gram_input'], args)
    data['doc_3gram_input'] = pad_sequences(doc_3gram_list,
                                            maxlen=max_doc_len['3gram'],
                                            value=PAD_WORD_INDEX,
                                            padding='post',
                                            truncating='post')
    data['doc_3gram_mask'] = create_masks(data['doc_3gram_input'], args)
    data['url_3gram_input'] = pad_sequences(all_url_list,
                                            maxlen=max_url_len['url'],
                                            value=PAD_WORD_INDEX,
                                            padding='post',
                                            truncating='pre')
    data['url_3gram_mask'] = create_masks(data['url_3gram_input'], args)

    if os.path.exists("%s/collection_ngram_idf.json" % path):
        t = time.time()
        weights = json.load(open("%s/collection_ngram_idf.json" % path, "r"))
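        # the idf weights appear to be keyed by n-gram string, so the
        # index -> n-gram inverse of the vocab is needed to look them up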
        vocab_inv = invert_dict(vocab['3gram'])
        data['query_3gram_weight'] = inject_ngram_weight(
            data['query_3gram_input'], vocab_inv, weights)
        data['doc_3gram_weight'] = inject_ngram_weight(data['doc_3gram_input'],
                                                       vocab_inv, weights)
        vocab_inv = invert_dict(vocab['url'])
        data['url_3gram_weight'] = inject_ngram_weight(data['url_3gram_input'],
                                                       vocab_inv, weights)
        print('ngram weight injection done: %d' % (time.time() - t))

    if os.path.exists("%s/collection_word_idf.json" % path):
        t = time.time()
        weights = json.load(open("%s/collection_word_idf.json" % path, "r"))
        merge_vocab = merge_two_dicts(vocab['word'], test_vocab['word'])
        vocab_inv = invert_dict(merge_vocab)
        data['query_word_weight'] = inject_word_weight(
            data['query_word_input'], vocab_inv, weights)
        data['doc_word_weight'] = inject_word_weight(data['doc_word_input'],
                                                     vocab_inv, weights)
        data['overlap_feat'] = compute_overlap_feat(data['query_word_input'],
                                                    data['doc_word_input'],
                                                    vocab_inv, weights)
        print('word weight injection done: %d' % (time.time() - t))

    print('data creation is done: %d' % (time.time() - t0))
    return data
Ejemplo n.º 28
0
def load_def(localdir, ent_name, section_def, required_fields):
    if 'type' in section_def and 'fields' in section_def:
        raise Exception("invalid structure for '%s': "
                        "type and fields sections are mutually exclusive"
                        % ent_name)

    if 'type' in section_def:
        csv_filename = section_def.get('path', ent_name + ".csv")
        csv_filepath = complete_path(localdir, csv_filename)
        str_type = section_def['type']
        if isinstance(str_type, basestring):
            celltype = field_str_to_type(str_type, "array '%s'" % ent_name)
        else:
            assert isinstance(str_type, type)
            celltype = str_type
        return 'ndarray', load_ndarray(csv_filepath, celltype)

    fields_def = section_def.get('fields')
    if fields_def is not None:
        for fdef in fields_def:
            if isinstance(fdef, basestring):
                raise SyntaxError("invalid field declaration: '%s', you are "
                                  "probably missing a ':'" % fdef)
        if all(isinstance(fdef, dict) for fdef in fields_def):
            fields = fields_yaml_to_type(fields_def)
        else:
            assert all(isinstance(fdef, tuple) for fdef in fields_def)
            fields = fields_def
    else:
        fields = None
    newnames = merge_dicts(invert_dict(section_def.get('oldnames', {})),
                           section_def.get('newnames', {}))
    transpose = section_def.get('transposed', False)

    interpolate_def = section_def.get('interpolate')
    files_def = section_def.get('files')
    if files_def is None:
        #XXX: it might be cleaner to use the same code path as for the
        # multi-file case (however, that would lose the "import any file
        # size" feature that I'm fond of).

        # we can simply return the stream as-is
        #FIXME: stream is not sorted
        # csv file is assumed to be in the correct order (ie by period then id)
        csv_filename = section_def.get('path', ent_name + ".csv")
        csv_filepath = complete_path(localdir, csv_filename)
        csv_file = CSV(csv_filepath, newnames,
                       delimiter=',', transpose=transpose)
        if fields is not None:
            fields = required_fields + fields
        stream = csv_file.read(fields)
        if fields is None:
            fields = csv_file.fields
        if interpolate_def is not None:
            raise Exception('interpolate is currently only supported with '
                            'multiple files')
        return 'table', (fields, csv_file.numlines, stream, csv_file)
    else:
        # we have to load all files, merge them and return a stream out of that
        print(" * computing number of rows...")

        # 1) only load required fields
        default_args = dict(newnames=newnames, transpose=transpose)
        if isinstance(files_def, dict):
            files_items = files_def.items()
        elif isinstance(files_def, list) and files_def:
            if isinstance(files_def[0], dict):
                # handle YAML ordered dict structure
                files_items = [d.items()[0] for d in files_def]
            elif isinstance(files_def[0], basestring):
                files_items = [(path, {}) for path in files_def]
            else:
                raise Exception("invalid structure for 'files'")
        else:
            raise Exception("invalid structure for 'files'")

        #XXX: shouldn't we use the "path" defined for the whole entity if any?
        # section_def.get('path')
        files = []
        for path, kwargs in files_items:
            kwargs['newnames'] = \
                merge_dicts(invert_dict(kwargs.pop('oldnames', {})),
                            kwargs.get('newnames', {}))
            f = CSV(complete_path(localdir, path),
                    **merge_dicts(default_args, kwargs))
            files.append(f)
        id_periods = union1d(f.as_array(required_fields) for f in files)

        print(" * reading files...")
        # 2) load all fields
        if fields is None:
            target_fields = merge_items(*[f.fields for f in files])
            fields_per_file = [None for f in files]
        else:
            target_fields = required_fields + fields
            fields_per_file = [[(name, type_) for name, type_ in target_fields
                               if name in f.field_names]
                              for f in files]
            total_fields = set.union(*[set(f.field_names) for f in files])
            missing = set(name for name, _ in target_fields) - total_fields
            if missing:
                raise Exception("the following fields were not found in any "
                                "file: %s" % ", ".join(missing))

        total_lines = len(id_periods)

        # allocate main array
        target = np.empty(total_lines, dtype=np.dtype(target_fields))
        # fill with default values
        target[:] = tuple(missing_values[ftype] for _, ftype in target_fields)
        target['period'] = id_periods['period']
        target['id'] = id_periods['id']

        arrays = [f.as_array(fields_to_load)
                  for f, fields_to_load in zip(files, fields_per_file)]

        # close all files
        for f in files:
            f.close()

        #FIXME: interpolation currently only interpolates missing data points,
        # not data points whose value equals the missing value corresponding
        # to the field type. This can only be fixed once booleans are loaded
        # as int8.
        if interpolate_def is not None:
            if any(v != 'previous_value'
                   for v in interpolate_def.itervalues()):
                raise Exception("currently, only 'previous_value' "
                                "interpolation is supported")
            to_interpolate = [k for k, v in interpolate_def.iteritems()
                              if v == 'previous_value']
        else:
            to_interpolate = []

        interpolate(target, arrays, id_periods, to_interpolate)
        return 'table', (target_fields, total_lines, iter(target), None)
Ejemplo n.º 29
0
import utils

SFA = "http://developer.apple.com/namespaces/sfa"
SF = "http://developer.apple.com/namespaces/sf"
XSI = "http://www.w3.org/2001/XMLSchema-instance"
KEY = "http://developer.apple.com/namespaces/keynote2"

NSMAP = {
    "sfa": SFA,
    "sf": SF,
    "xsi": XSI,
    "key": KEY,
}

NAMESPACE_TO_URL = {k: "{" + v + "}" for k, v in NSMAP.items()}
URL_TO_NAMESPACE = utils.invert_dict(NAMESPACE_TO_URL)


def ns(qname):
    """ returns the lxml representation of an xml namespace,
        using a static lookup table. """
    if len(qname) and qname[0] == "{":
        return qname
    i = qname.find(":")
    if i < 0:
        return qname
    return NAMESPACE_TO_URL[qname[0:i]] + qname[i + 1:]


class XMLError(Exception):
    pass