def preprocess_data(dir_path='NKJP_1.2_nltk_POS', window_size=3):
    sentences, tagnames, dictionary = parse_sentences(dir_path)
    print '{} sentences loaded'.format(len(sentences))
    tagnames.update(['<s>', '</s>'])  # Add special tags
    num_to_tag = dict(enumerate(tagnames))
    tag_to_num = invert_dict(num_to_tag)
    dictionary.update(['UUUNKKK', '<s>', '</s>'])  # Add special tokens
    num_to_word = dict(enumerate(dictionary))
    word_to_num = invert_dict(num_to_word)
    X, y = docs_to_windows(sentences, word_to_num, tag_to_num, window_size)
    print '{} {}-word windows loaded'.format(len(X), window_size)
    print 'Shape of X is {}\nShape of y is {}'.format(X.shape, y.shape)
    return X, y, word_to_num, tag_to_num
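# All of the snippets in this collection call an `invert_dict` helper defined
# elsewhere in each project. A minimal sketch of the one-to-one variant is
# given below; this is an assumption, not any particular project's
# implementation, and it requires the values of `d` to be unique and hashable.
def invert_dict(d):
    """Return a new dict that maps each value of `d` back to its key."""
    return {value: key for key, value in d.items()}

# e.g. invert_dict({0: '<s>', 1: 'NOUN'}) == {'<s>': 0, 'NOUN': 1}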
def load_wv(vocabfile):
    with open(vocabfile) as fd:
        words = [line.strip() for line in fd if line.strip() != '']
    words = list(set(words))
    num_to_word = dict(enumerate(words))
    word_to_num = invert_dict(num_to_word)
    return word_to_num, num_to_word
def load_wv(vocabfile, wvfile):
    wv = loadtxt(wvfile, dtype=float)
    with open(vocabfile) as fd:
        words = [line.strip() for line in fd]
    num_to_word = dict(enumerate(words))
    word_to_num = invert_dict(num_to_word)
    return wv, word_to_num, num_to_word
def load_wv(vocabfile, wvfile):
    wv = loadtxt(wvfile, dtype=float)
    with codecs.open(vocabfile, 'r', encoding='utf-8') as fd:
        words = [line.strip('\n').strip(' ') for line in fd]
    num_to_word = dict(enumerate(words))
    word_to_num = invert_dict(num_to_word)
    # print words[170:180]
    return wv, word_to_num, num_to_word
def handle_prediction(self):
    probs, classes = self.predict_image(self.image_path, self.model)
    idx_to_class: Dict = invert_dict(self.model.class_to_idx)
    classes = [idx_to_class[c] for c in classes]
    if self.cat_name_map:
        classes = [self.cat_name_map[c] for c in classes]
    print(f"Prediction for top {self.top_k} classes")
    total_p: float = sum(probs)
    for p, c in zip(probs, classes):
        percent: float = 100.0 * p / total_p
        print(f"\t{c.title()}: {percent:.1f}%")
def load_data(self, debug=False):
    """Loads starter word-vectors and train/dev/test-split the data."""
    # Load the training set
    X, y, self.word_to_num, self.tag_to_num = preprocess_data(
        dir_path='NKJP_1.2_nltk_POS')
    self.num_to_word = invert_dict(self.word_to_num)
    self.num_to_tag = invert_dict(self.tag_to_num)
    self.tagset_size = len(self.tag_to_num)
    self.X_train, self.X_dev, self.y_train, self.y_dev = train_test_split(
        X, y, test_size=0.2)
    # A hacky way to get a 3-part split from a 2-part splitting function
    self.X_dev, self.X_test, self.y_dev, self.y_test = train_test_split(
        self.X_dev, self.y_dev, test_size=0.5)
    if debug:
        self.X_train = self.X_train[:1024]
        self.y_train = self.y_train[:1024]
        self.X_dev = self.X_dev[:1024]
        self.y_dev = self.y_dev[:1024]
def __init__(self, trn_file, wav_file, mfcc_file, args,
             vocab_create_mode='BUILD', mfcc_create='Y'):
    '''
    Args:
        data_file: data file path
        vocab_create_mode: BUILD: create the vocab dict from raw label data
                           LOAD : read from file directly
    '''
    self.args = args
    # trn file path
    self.trn_file = trn_file
    # wav file path
    self.wav_file = wav_file
    # mfcc file path
    self.mfcc_file = mfcc_file
    # data file path
    # self.data_file = data_file
    # <EOS>: end-of-sentence tag
    # <SOS>: start-of-sentence tag
    # <PAD>: padding tag
    self.special_signs = ['<EOS>', '<SOS>', '<PAD>', '<BIAS>']
    # label-to-index dict
    self.vocab = {}
    # index-to-label dict
    self.inverse_vocab = {}
    if vocab_create_mode == 'BUILD':
        self.label_process()
    elif vocab_create_mode == 'LOAD':
        self.vocab = utils.load_from_pkl('vocab.pkl')
        self.inverse_vocab = utils.invert_dict(self.vocab)
    if mfcc_create == 'Y':
        for i in range(len(self.wav_file)):
            wavlist = os.listdir(self.wav_file[i])
            for j in range(len(wavlist)):
                wav_path = os.path.join(self.wav_file[i], wavlist[j])
                # convert the audio to MFCC features
                mfcc = self.read_wav_file(wav_path, 26, 9)
                mfcc = np.transpose(mfcc)
                np.save(os.path.join(self.mfcc_file[i],
                                     os.path.splitext(wavlist[j])[0]),
                        mfcc, 'utf-8')
def load(metadata_path):
    directory = os.path.dirname(metadata_path)
    with open(metadata_path, 'rb') as f:
        iterator = pickle.load(f)
    iterator._deserialize_np_arrays(directory)
    iterator.mappings = None
    # Create inverse lookup of buckets.
    iterator.bucket_idx_to_key = []
    for bucket in iterator.bucketed_data:
        src_len = np.shape(bucket[0])[1]
        label_len = np.shape(bucket[2])[1]
        iterator.bucket_idx_to_key.append((src_len, label_len))
    iterator.bucket_key_to_idx = invert_dict(dict(enumerate(iterator.bucket_idx_to_key)))
    return iterator
def create_train_test_dic(total_dic):
    testdic = defaultdict(list)
    traindic = defaultdict(list)
    invert_total_dic = invert_dict(total_dic)
    for user in total_dic:
        if len(total_dic[user]) < 2:
            traindic[user] = total_dic[user]
        else:
            i = 0
            for ref in total_dic[user]:
                i = i + 1
                if i < 2:
                    traindic[user].append(ref)
                else:
                    if len(invert_total_dic[ref]) < 2:
                        traindic[user].append(ref)
                    else:
                        invert_total_dic[ref].remove(user)
                        testdic[user].append(ref)
    return traindic, testdic
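# Note on the snippet above: `total_dic` maps each user to a list of refs, and
# the code calls `.remove(user)` on `invert_total_dic[ref]`, so the
# `invert_dict` used there must turn a dict of lists into a dict of lists in
# the other direction (ref -> list of users). A hypothetical sketch of that
# behaviour; the name and implementation are assumptions:
from collections import defaultdict

def invert_dict_multimap(d):
    """Invert {key: [values]} into {value: [keys that listed it]}."""
    inverted = defaultdict(list)
    for key, values in d.items():
        for value in values:
            inverted[value].append(key)
    return inverted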
def parse_freedict(fn, invert=False):
    with open(fn, 'r') as freedict:
        translation_dict = {}
        entry_found = False
        word_source = ""
        for line in freedict:
            if line.strip() == "<entry>":
                entry_found = True
            elif line.strip() == "</entry>":
                entry_found = False
            if entry_found:
                if line.strip()[:6] == "<orth>":
                    word_source = re.sub(r'<orth>|</orth>', '', line.strip()).lower()
                    if " " in word_source:
                        entry_found = False
                    elif word_source not in translation_dict:
                        translation_dict[word_source] = []
                elif line.strip()[:7] == "<quote>":
                    word_target = re.sub(r'<quote>|</quote>', '', line.strip()).lower()
                    if " " not in word_target:
                        translation_dict[word_source].append(word_target)
    if invert:
        translation_dict = invert_dict(translation_dict)
    return translation_dict
def main(options): args = get_default_args() set_args(args, options) mode, dataset_name = args['mode'], args['dataset'] # default setting args['raw_data'] = "data/%s/" % args['dataset'] args['qrels_file'] = "data/%s/qrels.all.txt" % args['dataset'] print_args(args) # get train/val/test names for specific dataset train_name, val_name, test_name, train_set, val_set, test_set, num_classes, with_url = config_dataset( args) max_query_len, max_doc_len, max_url_len = defaultdict(int), defaultdict( int), defaultdict(int) vocab = {'word': {}, '3gram': {}} test_vocab = {'word': {}, '3gram': {}} train_vocab_emb, test_vocab_emb = None, None ############################# LOAD DATA ################################## data_name = ("data_m%s_%s_%s_%s" % (mode, dataset_name, train_name, test_name)).lower() if args["load_data"]: train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data( "%s/%s/%s" % (args["experimental_data"], data_name, train_name), True) test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data( "%s/%s/%s" % (args["experimental_data"], data_name, test_name), False) if dataset_name != 'twitter' and dataset_name != 'TwitterURL': val_dataset, _, _, _, _, _ = load_data( "%s/%s/%s" % (args["experimental_data"], data_name, val_name), False) if args['embedding'] == 'glove': train_vocab_emb, test_vocab_emb = construct_vocab_emb( "%s/%s" % (args["experimental_data"], data_name), vocab['word'], test_vocab['word'], 300, "word", base_embed_path=args["base_embed_path"], type=args["embedding"]) print('load dataset successfully') else: train_dataset = gen_data(args["raw_data"], train_set, vocab, test_vocab, True, max_query_len, max_doc_len, max_url_len, num_classes, args) print("create training set successfully...") if dataset_name != 'twitter' and dataset_name != 'TwitterURL': val_dataset = gen_data(args["raw_data"], val_set, vocab, test_vocab, False, max_query_len, max_doc_len, max_url_len, num_classes, args) print("create validation set successfully...") test_dataset = gen_data(args["raw_data"], test_set, vocab, test_vocab, False, max_query_len, max_doc_len, max_url_len, num_classes, args) train_vocab_emb, test_vocab_emb = construct_vocab_emb( "%s/%s" % (args["experimental_data"], data_name), vocab['word'], test_vocab['word'], 300, "word", base_embed_path=args["base_embed_path"]) save_data( "%s/%s/%s" % (args["experimental_data"], data_name, train_name), True, train_dataset, max_query_len, max_doc_len, max_url_len, vocab, train_vocab_emb) print("save training set successfully...") if dataset_name != 'twitter' and dataset_name != 'TwitterURL': save_data("%s/%s/%s" % (args["experimental_data"], data_name, val_name), False, val_dataset, vocab=test_vocab, vocab_emb=test_vocab_emb) print("save val set successfully...") save_data("%s/%s/%s" % (args["experimental_data"], data_name, test_name), False, test_dataset, vocab=test_vocab, vocab_emb=test_vocab_emb) print("save test set successfully...") if dataset_name == 'twitter' or dataset_name == 'TwitterURL': val_split = args['val_split'] num_samples, _ = train_dataset["query_word_input"].shape # randomly sample queries and all their documents if query_random is True # otherwise, query-doc pairs are randomly sampled query_random = True if dataset_name == 'twitter' else False if query_random: del train_dataset["overlap_feat"] val_indices = sample_aaai_val_set(args["raw_data"], train_set, val_split) else: val_split = 0.1 val_indices, val_set = [], set() for i in range(int(num_samples * val_split)): val_index = 
np.random.randint(num_samples) while val_index in val_set: val_index = np.random.randint(num_samples) val_indices.append(val_index) val_set.add(val_index) val_dataset = {} for key in train_dataset: #print(key, train_dataset[key].shape) val_dataset[key] = train_dataset[key][val_indices] train_dataset[key] = np.delete(train_dataset[key], val_indices, 0) # shuffle the train dataset explicitly to make results reproducible # whether the performance will be affected remains a question keys, values = [], [] for key in train_dataset: if train_dataset[key].size == 0: continue keys.append(key) values.append(train_dataset[key]) zipped_values = list(zip(*values)) random.shuffle(zipped_values) shuffled_values = list(zip(*zipped_values)) for i, key in enumerate(keys): train_dataset[key] = np.array(shuffled_values[i]) print('after shuffle:', train_dataset['id'][:5], train_dataset['sim'][:5], train_dataset['query_word_input'][:5]) # merge the vocabulory of train and test set merged_vocab = {} merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word']) merged_vocab['3gram'] = merge_two_dicts(vocab['3gram'], test_vocab['3gram']) print("TRAIN vocab: word(%d) 3gram(%d)" % (len(vocab['word']), len(vocab['3gram']))) print("TEST vocab: word(%d) 3gram(%d)" % (len(test_vocab['word']), len(test_vocab['3gram']))) print("MERGED vocab: word(%d) 3gram(%d)" % (len(merged_vocab['word']), len(merged_vocab['3gram']))) vocab_inv, vocab_size = {}, {} for key in vocab: vocab_inv[key] = invert_dict(merged_vocab[key]) vocab_size[key] = len(vocab[key]) print(vocab_size) # Print data samples for debug purpose print_dataset(mode, train_dataset, vocab_inv) print_dataset(mode, test_dataset, vocab_inv) ############################ TRAIN MODEL ################################# # create model model = create_attention_model(max_query_len, max_doc_len, max_url_len, vocab_size, train_vocab_emb, args["nb_filters"], args["nb_layers"], embed_size=300, dropout_rate=args['dropout'], trainable=args["trainable"], weighting=args['weighting'], mask=args["mask"], conv_option=args['conv_option'], model_option=args['model_option'], join=args['join'], num_classes=num_classes, with_url=with_url, highway=args['highway'], att=args['co_attention'], ext_feat=args["external_feat"], encoder_option=args['encoder_option']) model_name = ( "model_N%s_data%s_mo%s_e%s_c%s_NumFilter%d_nblayer%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f_Join%s_H%s_Att%s" % (mode, train_name, args['model_option'], args["encoder_option"], args['conv_option'], args["nb_filters"], args["nb_layers"], args["trainable"], args['dropout'], args['weighting'], args['mask'], args['batch_size'], args['val_split'], args['join'], args['highway'], args['co_attention'])).lower() model_path = "%s/%s/%s" % (args['experimental_data'], data_name, model_name) print(model_path) if args['optimizer'] == "adam": opt = optimizers.Adam(lr=args["learning_rate"], beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True) print('use Adam optimizer') elif args['optimizer'] == "sgd": opt = optimizers.SGD(lr=args["learning_rate"], decay=1e-6, momentum=0.9, nesterov=True) print('use SGD optimizer') elif args['optimizer'] == 'rmsprop': opt = optimizers.RMSprop(lr=args["learning_rate"], rho=0.9, epsilon=None, decay=0.0) print('use RMSprop optimizer') if num_classes <= 2: model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy']) else: print('compile model with categorical cross-entropy') model.compile(loss='categorical_crossentropy', optimizer=opt, metrics=['accuracy']) class_weight = None 
if args['dataset'] == 'Quora': #class_weight = {0:1, 1:2} print('apply class weight:', class_weight) print(model.summary()) print('model init weights sum: %.4f' % get_model_weights(model)) if not args['load_model']: early_stopping = EarlyStopping(monitor='val_loss', patience=4) checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights", monitor='val_loss', save_best_only=True, verbose=1) lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=0.0001, verbose=1) model.fit( train_dataset, train_dataset['sim'], #validation_split=0.05, batch_size=args['batch_size'], validation_data=(val_dataset, val_dataset['sim']), epochs=args['epochs'], shuffle=False, callbacks=[checkpoint, lr_reducer, early_stopping], class_weight=class_weight, verbose=args['verbose']) ############################ TEST MODEL ################################# print('load best model from %s.best.weights' % model_path) model.load_weights("%s.best.weights" % model_path) # load trained vocab embedding. trained_vocab_emb = model.get_layer('word-embedding').get_weights()[0] # merge trained vocab embedding with test OOV word embeddings merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300)) merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb merged_vocab_emb[ len(vocab['word']):len(merged_vocab['word']), :] = test_vocab_emb for key in vocab: vocab_size[key] = len(merged_vocab[key]) print(vocab_size) new_model = create_attention_model(max_query_len, max_doc_len, max_url_len, vocab_size, merged_vocab_emb, args["nb_filters"], args["nb_layers"], embed_size=300, dropout_rate=args['dropout'], trainable=args["trainable"], weighting=args['weighting'], mask=args["mask"], conv_option=args['conv_option'], model_option=args['model_option'], join=args['join'], num_classes=num_classes, with_url=with_url, highway=args['highway'], att=args['co_attention'], ext_feat=args["external_feat"], encoder_option=args['encoder_option']) new_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) #print(new_model.summary()) for layer_id in range(len(model.layers)): layer = model.layers[layer_id] if layer.name != 'word-embedding': new_model.layers[layer_id].set_weights(layer.get_weights()) print('copy weight done.') val_predictions = new_model.predict(val_dataset) predictions = new_model.predict(test_dataset) if dataset_name == 'twitter' or dataset_name == 'TrecQA': val_predictions = val_predictions[:, 1] predictions = predictions[:, 1] print(predictions[:10]) predictions_file = "%s/%s/predictions_%s.txt" % ( args["experimental_data"], data_name, model_name) with open(predictions_file, 'w') as f: for i in range(test_dataset['id'].shape[0]): f.write("%s %.4f %s\n" % (test_dataset['id'][i], predictions[i], args['mode'])) print('write predictions with trec format to %s' % predictions_file) val_predictions_file = "%s/%s/val_predictions_%s.txt" % ( args["experimental_data"], data_name, model_name) with open(val_predictions_file, 'w') as f: for i in range(val_dataset['id'].shape[0]): f.write( "%s %.4f %s\n" % (val_dataset['id'][i], val_predictions[i], args['mode'])) map, mrr, p30 = evaluate(val_predictions_file, args["qrels_file"]) print('write val predictions with trec format to %s' % val_predictions_file) print('Validation MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr)) map, mrr, p30 = evaluate(predictions_file, args["qrels_file"]) print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr)) else: preds = np.argmax(predictions, axis=-1) labels = np.argmax(test_dataset['sim'], axis=-1) 
corrects = preds == labels predictions_file = "%s/%s/predictions_%s.txt" % ( args["experimental_data"], data_name, model_name) with open(predictions_file, 'w') as f: f.write("id label pred prob model\n") for i in range(len(preds)): f.write("%s %s %s %.4f %s\n" % (test_dataset['id'][i], labels[i], preds[i], predictions[i][preds[i]], args['mode'])) print('write predictions with trec format to %s' % predictions_file) val_preds = np.argmax(val_predictions, axis=-1) val_labels = np.argmax(val_dataset['sim'], axis=-1) val_corrects = val_preds == val_labels val_predictions_file = "%s/%s/val_predictions_%s.txt" % ( args["experimental_data"], data_name, model_name) with open(val_predictions_file, 'w') as f: for i in range(val_dataset['id'].shape[0]): f.write("%s %s %s %.4f %s\n" % (val_dataset['id'][i], val_labels[i], val_preds[i], val_predictions[i][val_preds[i]], args['mode'])) print('write val predictions with trec format to %s' % val_predictions_file) print('val accuracy: %.4f' % (np.count_nonzero(val_corrects) * 1.0 / len(val_preds))) print('accuracy: %.4f' % (np.count_nonzero(corrects) * 1.0 / len(preds))) macro_prec = precision_score(labels, preds, average="macro") macro_recall = recall_score(labels, preds, average="macro") print('Macro Precision: %.3f, Recall: %.3f, F1: %.3f' % (macro_prec, macro_recall, 2 * macro_prec * macro_recall / (macro_prec + macro_recall))) print('Micro Precision: %.3f, Recall: %.3f, F1: %.3f' % (precision_score(labels, preds, average="micro"), recall_score(labels, preds, average="micro"), f1_score(labels, preds, average="micro"))) print('Confusion matrix:', confusion_matrix(labels, preds))
def fibonacci_numbers_inverted_mapping(**kwds):
    if 'start' not in kwds:
        kwds['start'] = 2
    return invert_dict(fibonacci_numbers(**kwds))
    'P': 'Pro', 'Q': 'Gln', 'R': 'Arg', 'S': 'Ser', 'T': 'Thr',
    'V': 'Val', 'W': 'Trp', 'Y': 'Tyr', 'Z': 'Glx', 'X': 'Xaa',
    'U': 'Sec', 'J': 'Xle', 'O': 'Pyl'
}

standard_three_to_one = utils.invert_dict(one_to_three)

extended_three_to_one = {
    '2as': 'D', '3ah': 'H', '5hp': 'E', 'Acl': 'R', 'Agm': 'R',
    'Aib': 'A', 'Ala': 'A', 'Alm': 'A', 'Alo': 'T', 'Aly': 'K',
    'Arg': 'R', 'Arm': 'R', 'Asa': 'D',
from keras.preprocessing.sequence import pad_sequences
from tokenizer import RE_PATTERN
from utils import get_val_as_str, invert_dict, load_dataset, load_model, load_turk_scores, merge_datasets
import numpy as np
import pandas as pd
import re

ASPECT = 'naturalness'
AUTOMATED_EVALUATION_BASE_PATH = f'../evaluations/automated/{ASPECT}/sentence_level'
CLASSIFIER_BASE_PATH = '../models/naturalness_classifiers'
MAX_SEQ_LEN = 30  # for neural classifier

TEXT_VECTORIZER = load_model('../models/vectorizer.pkl')

# adjust vocabulary to account for unknowns
VOCABULARY = TEXT_VECTORIZER.vocabulary_
INVERSE_VOCABULARY = invert_dict(VOCABULARY)
VOCABULARY[INVERSE_VOCABULARY[0]] = len(VOCABULARY)
VOCABULARY['CUSTOM_UNKNOWN'] = len(VOCABULARY) + 1


## DATA PREP

def convert_to_indices(text):
    # tokenize input text
    tokens = re.compile(RE_PATTERN).split(text)
    non_empty_tokens = list(filter(lambda token: token, tokens))
    indices = []
    # collect indices of tokens in vocabulary
    for token in non_empty_tokens:
        if token in VOCABULARY:
import glob
import json

from options import get_options
from datasets import get_dataloader
from model import get_model
import utils

COMP_CAT_DICT_PATH = '/home/ubuntu/vdp/clevr_inference/scene_parse/attr_net/tools/clevr_comp_cat_dict.json'

opt = get_options('test')
test_loader = get_dataloader(opt, 'test')
model = get_model(opt)

if opt.use_cat_label:
    with open(COMP_CAT_DICT_PATH) as f:
        cat_dict = utils.invert_dict(json.load(f))

if opt.dataset == 'clevr':
    num_images = len(glob.glob(opt.clevr_val_img_dir + '/*.png'))
    scenes = [{
        'image_index': i,
        'image_filename': 'CLEVR_val_%06d.png' % i,
        'objects': []
    } for i in range(num_images)]
# print("run_test.py", scenes)

count = 0
for data, _, idxs, cat_idxs in test_loader:
    model.set_input(data)
    model.forward(idxs=idxs, name=opt.name)
    pred = model.get_pred()
def main(options): args = get_default_args() set_args(args, options) print_args(args) mode = args['mode'] train_name, test_name = args['split']['train'], args['split']['test'] if train_name == 'train_all': train_set = ['train_2011', 'test_2011', 'train_2013', 'test_2013'] train_set.remove(test_name) else: train_set = [train_name] test_set = [test_name] print("train_set", train_set) print("test_set", test_set) max_query_len, max_doc_len, max_url_len = defaultdict(int), defaultdict( int), defaultdict(int) vocab = {'word': {}, '3gram': {}, 'url': {}} test_vocab = {'word': {}, '3gram': {}, 'url': {}} train_vocab_emb, test_vocab_emb = None, None ############################# LOAD DATA ################################## data_name = ("data_m%s_%s_%s" % (mode, train_name, test_name)).lower() if args["load_data"]: train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data( "%s/%s/%s" % (args["experimental_data"], data_name, train_name), True) test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data( "%s/%s/%s" % (args["experimental_data"], data_name, test_name), False) print('load dataset successfully') else: #vocab = build_vocab(args["raw_data"], train_set, test_set, vocab) #print('build vocab done. %d' % len(vocab['word'])) train_dataset = gen_data(args["raw_data"], train_set, vocab, test_vocab, True, max_query_len, max_doc_len, max_url_len, args) print("create training set successfully...") test_dataset = gen_data(args["raw_data"], test_set, vocab, test_vocab, False, max_query_len, max_doc_len, max_url_len, args) train_vocab_emb, test_vocab_emb = construct_vocab_emb( "%s/%s" % (args["experimental_data"], data_name), vocab['word'], test_vocab['word'], 300, "word", base_embed_path=args["base_embed_path"]) save_data( "%s/%s/%s" % (args["experimental_data"], data_name, train_name), True, train_dataset, max_query_len, max_doc_len, max_url_len, vocab, train_vocab_emb) print("save training set successfully...") save_data("%s/%s/%s" % (args["experimental_data"], data_name, test_name), False, test_dataset, vocab=test_vocab, vocab_emb=test_vocab_emb) print("save test set successfully...") if mode == 'dssm': train_dataset = convert_data_to_dssm_format(train_dataset, vocab, is_train_or_val=True) test_dataset = convert_data_to_dssm_format(test_dataset, vocab, is_train_or_val=False) print('data convertion done!') val_split = args['val_split'] num_samples, _ = train_dataset["query_word_input"].shape # randomly sample queries and all their documents if query_random is True # otherwise, query-doc pairs are randomly sampled query_random = True if query_random: val_indices = sample_val(train_set, num_samples=num_samples, val_split=val_split) else: val_indices, val_set = [], set() for i in range(int(num_samples * val_split)): val_index = np.random.randint(num_samples) while val_index in val_set: val_index = np.random.randint(num_samples) val_indices.append(val_index) val_set.add(val_index) print(val_indices[:5], np.sum(np.array(val_indices))) # sample validation set for debug purpose # val_indices = val_indices[:100] train_dataset["query_word_weight"] = train_dataset[ "query_word_weight"][:, :args['deeplevel']] train_dataset["query_3gram_weight"] = train_dataset[ "query_3gram_weight"][:, :args['deeplevel']] train_dataset["doc_word_weight"] = train_dataset[ "doc_word_weight"][:, :args['deeplevel']] train_dataset["doc_3gram_weight"] = train_dataset[ "doc_3gram_weight"][:, :args['deeplevel']] train_dataset["url_3gram_weight"] = train_dataset[ "url_3gram_weight"][:, 
:args['deeplevel']] test_dataset["query_word_weight"] = test_dataset[ "query_word_weight"][:, :args['deeplevel']] test_dataset["query_3gram_weight"] = test_dataset[ "query_3gram_weight"][:, :args['deeplevel']] test_dataset["doc_word_weight"] = test_dataset[ "doc_word_weight"][:, :args['deeplevel']] test_dataset["doc_3gram_weight"] = test_dataset[ "doc_3gram_weight"][:, :args['deeplevel']] test_dataset["url_3gram_weight"] = test_dataset[ "url_3gram_weight"][:, :args['deeplevel']] # print("SHAPEEEEEEEEEEEEEEEEEEEE: {}".format(len(train_dataset["query_word_weight"][100]))) val_dataset = {} for key in train_dataset: val_dataset[key] = train_dataset[key][val_indices] train_dataset[key] = np.delete(train_dataset[key], val_indices, 0) # shuffle the train dataset explicitly to make results reproducible # whether the performance will be affected remains a question keys, values = [], [] for key in train_dataset: keys.append(key) values.append(train_dataset[key]) zipped_values = list(zip(*values)) random.shuffle(zipped_values) shuffled_values = list(zip(*zipped_values)) for i, key in enumerate(keys): train_dataset[key] = np.array(shuffled_values[i]) print('after shuffle:', train_dataset['id'][:5], train_dataset['sim'][:5], train_dataset['query_word_input'][:5]) # sample training dataset for debug purpose # sample_num = 1000 # for key in train_dataset: # train_dataset[key] = train_dataset[key][:sample_num] # merge the vocabulory of train and test set print("TRAIN vocab: word(%d) 3gram(%d) url(%d)" % (len(vocab['word']), len(vocab['3gram']), len(vocab['url']))) print("TEST vocab: word(%d) 3gram(%d) url(%d)" % (len( test_vocab['word']), len(test_vocab['3gram']), len(test_vocab['url']))) merged_vocab = {'url': vocab['url'], '3gram': vocab['3gram']} merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word']) print("merged vocab: word(%d) 3gram(%d) url(%d)" % (len(merged_vocab['word']), len( merged_vocab['3gram']), len(merged_vocab['url']))) vocab_inv, vocab_size = {}, {} vocab['char'] = merge_two_dicts(vocab['3gram'], vocab['url']) test_vocab['char'] = merge_two_dicts(test_vocab['3gram'], test_vocab['url']) merged_vocab['char'] = merge_two_dicts(vocab['char'], test_vocab['char']) for key in vocab: vocab_inv[key] = invert_dict(merged_vocab[key]) vocab_size[key] = len(vocab[key]) print(vocab_size) # Print data samples for debug purpose # print_dataset(mode, train_dataset, vocab_inv) # print_dataset(mode, test_dataset, vocab_inv) ############################ TRAIN MODEL ################################# model = None if mode == 'deep_twitter': model = create_attention_model(max_query_len, max_doc_len, max_url_len, vocab_size, train_vocab_emb, args["nb_filters"], embed_size=300, dropout_rate=args['dropout'], trainable=args["trainable"], weighting=args['weighting'], mask=args["mask"], conv_option=args['conv_option'], model_option=args['model_option'], external=args["external_feat"], norm_weight=args['norm_weight'], cos_norm=args['cos'], only_word=args['only_word'], only_char=args['only_char'], pooling=args['pooling'], deeplevel=args['deeplevel']) elif mode == 'dssm': model = create_dssm_model(max_query_len, max_doc_len, max_url_len, vocab_size, train_vocab_emb, args["nb_filters"], embed_size=300, dropout_rate=args['dropout'], trainable=args["trainable"]) model_name = ( "model_N%s_data%s_mo%s_c%s_NumFilter%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f" % (mode, train_name, args['model_option'], args['conv_option'], args["nb_filters"], args["trainable"], args['dropout'], args['weighting'], args['mask'], 
args['batch_size'], args['val_split'])).lower() model_path = "%s/%s/%s" % (args['experimental_data'], data_name, model_name) print(model_path) if args['optimizer'] == "adam": opt = optimizers.Adam(lr=args["learning_rate"], beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=False) elif args['optimizer'] == "sgd": opt = optimizers.SGD(lr=args["learning_rate"], decay=1e-6, momentum=0.9, nesterov=True) model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy']) print(model.summary()) model_weights, parameter_num = get_model_weights(model) print('model init weights sum: {} of {} parameters'.format( model_weights, parameter_num)) # if not args['load_model']: early_stopping = EarlyStopping(monitor='val_loss', patience=4) checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights", monitor='val_loss', save_best_only=True, verbose=1) lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.3, patience=3, min_lr=0.0001) fit_mode = "fit" if fit_mode == "fit": model.fit( train_dataset, train_dataset['sim'], # validation_split=0.05, batch_size=args['batch_size'], validation_data=(val_dataset, val_dataset['sim']), epochs=args['epochs'], shuffle=False, callbacks=[checkpoint, lr_reducer, early_stopping], verbose=2) else: train_steps, train_batches = batch_iter( train_dataset, train_dataset["sim"], batch_size=args['batch_size']) valid_steps, valid_batches = batch_iter( val_dataset, val_dataset["sim"], batch_size=args['batch_size']) model.fit_generator( train_batches, train_steps, epochs=args['epochs'], validation_data=valid_batches, validation_steps=valid_steps, callbacks=[checkpoint, lr_reducer, early_stopping], verbose=2) #plot_model(model, to_file='model.png') ############################ TEST MODEL ################################# print('load best model from %s.best.weights' % model_path) model.load_weights("%s.best.weights" % model_path) if mode == 'deep_twitter': # load trained vocab embedding. 
if args["only_char"]: merged_vocab_emb = None else: embedding_layer_name = 'word_embedding' trained_vocab_emb = model.get_layer( embedding_layer_name).get_weights()[0] # merge trained vocab embedding with test OOV word embeddings merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300)) merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb merged_vocab_emb[len(vocab['word']):len(merged_vocab['word'] ), :] = test_vocab_emb for key in vocab: vocab_size[key] = len(merged_vocab[key]) print(vocab_size) new_model = create_attention_model(max_query_len, max_doc_len, max_url_len, vocab_size, merged_vocab_emb, args["nb_filters"], embed_size=300, dropout_rate=args['dropout'], trainable=args["trainable"], weighting=args['weighting'], mask=args["mask"], conv_option=args['conv_option'], model_option=args['model_option'], external=args["external_feat"], norm_weight=args['norm_weight'], cos_norm=args['cos'], only_word=args['only_word'], only_char=args['only_char'], pooling=args['pooling'], deeplevel=args['deeplevel']) new_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) # print(new_model.summary()) num_layers = 0 for layer in model.layers: num_layers += 1 for layer_id in range(num_layers): layer = model.layers[layer_id] if not args["only_char"] and layer.name != embedding_layer_name: new_model.layers[layer_id].set_weights(layer.get_weights()) print('copy weight done.') predictions = new_model.predict(test_dataset) elif mode == 'dssm': getter = K.function([model.layers[0].input, model.layers[1].input], model.layers[-2].output) print('create DSSM functional getter...') num_samples, _, _ = test_dataset['query_3gram_input'].shape batch_size = 128 num_batch = int(math.ceil(num_samples * 1.0 / batch_size)) predictions = np.zeros((num_samples, )) for i in range(num_batch): start_idx, end_idx = i * batch_size, min(num_samples, (i + 1) * batch_size) predictions[start_idx:end_idx] = getter([ test_dataset['query_3gram_input'][start_idx:end_idx], test_dataset['doc_3gram_input'][start_idx:end_idx] ])[:, 0] #predictions = getter([test_dataset['query_3gram_input'], test_dataset['doc_3gram_input']]) print(predictions[:10]) predictions_file = "%s/%s/predictions_%s.txt" % (args["experimental_data"], data_name, model_name) with open(predictions_file, 'w') as f: for i in range(test_dataset['id'].shape[0]): f.write("%s %.4f %s\n" % (test_dataset['id'][i], predictions[i], args['mode'])) print('write predictions with trec format to %s' % predictions_file) map, mrr, p30 = evaluate(predictions_file, args["qrels_file"]) print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
def gen_data(path, datasets, vocab, test_vocab, is_train, max_query_len,
             max_doc_len, max_url_len, nb_classes, args):
    if is_train:
        vocab['word']['PAD_WORD_INDEX'] = PAD_WORD_INDEX
        vocab['word']['OOV_WORD_INDEX'] = OOV_WORD_INDEX
        vocab['3gram']['PAD_3GRAM_INDEX'] = PAD_WORD_INDEX
        vocab['3gram']['OOV_3GRAM_INDEX'] = OOV_WORD_INDEX
    query_word_list, doc_word_list, query_3gram_list, doc_3gram_list = [], [], [], []
    all_url_list, all_ids_list, all_sim_list = [], [], []
    for data_name in datasets:
        # there can be multiple data sets combined as the train or test data
        data_folder = "%s/%s" % (path, data_name)
        print('creating dataset %s' % data_name)
        t = time.time()
        q1_word_list, max_q1_word_len = read_sentences(
            "%s/a.toks" % data_folder, vocab, is_train, "word", test_vocab=test_vocab)
        q2_word_list, max_q2_word_len = read_sentences(
            "%s/b.toks" % data_folder, vocab, is_train, "word", test_vocab=test_vocab)
        q1_3gram_list, max_q1_3gram_len = read_sentences(
            "%s/a.toks" % data_folder, vocab, is_train, "3gram", test_vocab=test_vocab)
        q2_3gram_list, max_q2_3gram_len = read_sentences(
            "%s/b.toks" % data_folder, vocab, is_train, "3gram", test_vocab=test_vocab)
        url_list, max_url_len_dataset = [], 0
        if os.path.exists("%s/url.txt" % data_folder):
            url_list, max_url_len_dataset = read_urls(
                "%s/url.txt" % data_folder, vocab, is_train, '3gram')
        ids_list = read_metadata("%s/id.txt" % data_folder)
        if is_train:
            max_query_len['word'] = max(max_query_len['word'],
                                        min(max_q1_word_len, MAX_WORD_LENGTH))
            max_query_len['3gram'] = max(max_query_len['3gram'],
                                         min(max_q1_3gram_len, MAX_3GRAM_LENGTH))
            max_doc_len['word'] = max(max_doc_len['word'],
                                      min(max_q2_word_len, MAX_WORD_LENGTH))
            max_doc_len['3gram'] = max(max_doc_len['3gram'],
                                       min(max_q2_3gram_len, MAX_3GRAM_LENGTH))
            max_url_len['url'] = max(max_url_len['url'],
                                     min(max_url_len_dataset, MAX_URL_LENGTH))
        sim_list = read_relevance("%s/sim.txt" % data_folder)
        categorical_sim_list = np.zeros((len(sim_list), nb_classes), dtype='int')
        for i, sim in enumerate(sim_list):
            categorical_sim_list[i][sim] = 1
        print(sim_list[:5], categorical_sim_list[:5])
        query_word_list.extend(q1_word_list)
        doc_word_list.extend(q2_word_list)
        query_3gram_list.extend(q1_3gram_list)
        doc_3gram_list.extend(q2_3gram_list)
        all_url_list.extend(url_list)
        all_ids_list.extend(ids_list)
        all_sim_list.extend(categorical_sim_list)
        print("q1 max_word_len: %d, q2 max_word_len: %d, len limit: (%d, %d)" %
              (max_q1_word_len, max_q2_word_len,
               max_query_len['word'], max_doc_len['word']))
        print("q1 max_3gram_len: %d, q2 max_3gram_len: %d, len limit: (%d, %d)" %
              (max_q1_3gram_len, max_q2_3gram_len,
               max_query_len['3gram'], max_doc_len['3gram']))
        print('max_url_len: %d, limit: %d' %
              (max_url_len_dataset, max_url_len['url']))
        print('creating dataset done: %d' % (time.time() - t))

    # question padding
    data = {'sim': np.array(all_sim_list), 'id': np.array(all_ids_list)}
    data['query_word_input'] = pad_sequences(query_word_list,
                                             maxlen=max_query_len['word'],
                                             value=PAD_WORD_INDEX,
                                             padding='post', truncating='post')
    data['query_word_mask'] = create_masks(data['query_word_input'], args)
    data['doc_word_input'] = pad_sequences(doc_word_list,
                                           maxlen=max_doc_len['word'],
                                           value=PAD_WORD_INDEX,
                                           padding='post', truncating='post')
    data['doc_word_mask'] = create_masks(data['doc_word_input'], args)
    data['query_3gram_input'] = pad_sequences(query_3gram_list,
                                              maxlen=max_query_len['3gram'],
                                              value=PAD_WORD_INDEX,
                                              padding='post', truncating='post')
    data['query_3gram_mask'] = create_masks(data['query_3gram_input'], args)
    data['doc_3gram_input'] = pad_sequences(doc_3gram_list,
                                            maxlen=max_doc_len['3gram'],
                                            value=PAD_WORD_INDEX,
                                            padding='post', truncating='post')
    data['doc_3gram_mask'] = create_masks(data['doc_3gram_input'], args)
    data['url_3gram_input'] = pad_sequences(all_url_list,
                                            maxlen=max_url_len['url'],
                                            value=PAD_WORD_INDEX,
                                            padding='post', truncating='pre')
    data['url_3gram_mask'] = create_masks(data['url_3gram_input'], args)
    if os.path.exists("%s/collection_ngram_idf.json" % path):
        t = time.time()
        weights = json.load(open("%s/collection_ngram_idf.json" % path, "r"))
        vocab_inv = invert_dict(vocab['3gram'])
        data['query_3gram_weight'] = inject_ngram_weight(
            data['query_3gram_input'], vocab_inv, weights)
        data['doc_3gram_weight'] = inject_ngram_weight(
            data['doc_3gram_input'], vocab_inv, weights)
        data['url_3gram_weight'] = inject_ngram_weight(
            data['url_3gram_input'], vocab_inv, weights)
        print('ngram weight injection done: %d' % (time.time() - t))
    else:
        num_samples, max_query_len = data['query_3gram_input'].shape
        data['query_3gram_weight'] = np.ones(
            (num_samples, ATTENTION_DEEP_LEVEL, max_query_len))
        data['doc_3gram_weight'] = np.ones(
            (num_samples, ATTENTION_DEEP_LEVEL, data['doc_3gram_input'].shape[1]))
    if os.path.exists("%s/collection_word_idf.json" % path):
        t = time.time()
        weights = json.load(open("%s/collection_word_idf.json" % path, "r"))
        merge_vocab = merge_two_dicts(vocab['word'], test_vocab['word'])
        vocab_inv = invert_dict(merge_vocab)
        print('inject query IDF weights')
        data['query_word_weight'] = inject_word_weight(
            data['query_word_input'], vocab_inv, weights)
        print('inject doc IDF weights')
        data['doc_word_weight'] = inject_word_weight(
            data['doc_word_input'], vocab_inv, weights)
        data['overlap_feat'] = compute_overlap_feat(
            data['query_word_input'], data['doc_word_input'], vocab_inv, weights)
        print('word weight injection done: %d' % (time.time() - t))
    return data
def load_def(localdir, ent_name, section_def, required_fields):
    if 'type' in section_def and 'fields' in section_def:
        raise Exception("invalid structure for '%s': "
                        "type and fields sections are mutually exclusive"
                        % ent_name)

    if 'type' in section_def:
        csv_filename = section_def.get('path', ent_name + ".csv")
        csv_filepath = complete_path(localdir, csv_filename)
        str_type = section_def['type']
        if isinstance(str_type, basestring):
            celltype = field_str_to_type(str_type, "array '%s'" % ent_name)
        else:
            assert isinstance(str_type, type)
            celltype = str_type
        return 'ndarray', load_ndarray(csv_filepath, celltype)

    fields_def = section_def.get('fields')
    if fields_def is not None:
        for fdef in fields_def:
            if isinstance(fdef, basestring):
                raise SyntaxError("invalid field declaration: '%s', you are "
                                  "probably missing a ':'" % fdef)
        if all(isinstance(fdef, dict) for fdef in fields_def):
            fields = fields_yaml_to_type(fields_def)
        else:
            assert all(isinstance(fdef, tuple) for fdef in fields_def)
            fields = fields_def
        fnames = {name for name, _ in fields}
        for reqname, reqtype in required_fields[::-1]:
            if reqname not in fnames:
                fields.insert(0, (reqname, reqtype))
    else:
        fields = None
    newnames = merge_dicts(invert_dict(section_def.get('oldnames', {})),
                           section_def.get('newnames', {}))
    transpose = section_def.get('transposed', False)
    interpolate_def = section_def.get('interpolate')
    files_def = section_def.get('files')
    if files_def is None:
        # XXX: it might be cleaner to use the same code path as for the
        # multi-file case (however, that would lose the "import any file
        # size" feature that I'm fond of).

        # we can simply return the stream as-is
        # FIXME: stream is not sorted
        # csv file is assumed to be in the correct order (ie by period then id)
        csv_filename = section_def.get('path', ent_name + ".csv")
        csv_filepath = complete_path(localdir, csv_filename)
        csv_file = CSV(csv_filepath, newnames,
                       delimiter=',', transpose=transpose)
        stream = csv_file.read(fields)
        if fields is None:
            fields = csv_file.fields
        if interpolate_def is not None:
            raise Exception('interpolate is currently only supported with '
                            'multiple files')
        return 'table', (fields, csv_file.numlines, stream, csv_file)
    else:
        # we have to load all files, merge them and return a stream out of that
        print(" * computing number of rows...")

        # 1) only load required fields
        default_args = dict(newnames=newnames, transpose=transpose)
        if isinstance(files_def, dict):
            files_items = files_def.items()
        elif isinstance(files_def, list) and files_def:
            if isinstance(files_def[0], dict):
                # handle YAML ordered dict structure
                files_items = [d.items()[0] for d in files_def]
            elif isinstance(files_def[0], basestring):
                files_items = [(path, {}) for path in files_def]
            else:
                raise Exception("invalid structure for 'files'")
        else:
            raise Exception("invalid structure for 'files'")

        # XXX: shouldn't we use the "path" defined for the whole entity if any?
        # section_def.get('path')
        files = []
        for path, kwargs in files_items:
            kwargs['newnames'] = \
                merge_dicts(invert_dict(kwargs.pop('oldnames', {})),
                            kwargs.get('newnames', {}))
            f = CSV(complete_path(localdir, path),
                    **merge_dicts(default_args, kwargs))
            files.append(f)
        id_periods = union1d(f.as_array(required_fields) for f in files)

        print(" * reading files...")
        # 2) load all fields
        if fields is None:
            target_fields = merge_items(*[f.fields for f in files])
            fields_per_file = [None for _ in files]
        else:
            target_fields = fields
            fields_per_file = [[(name, type_) for name, type_ in target_fields
                                if name in f.field_names]
                               for f in files]
            total_fields = set.union(*[set(f.field_names) for f in files])
            missing = set(name for name, _ in target_fields) - total_fields
            if missing:
                raise Exception("the following fields were not found in any "
                                "file: %s" % ", ".join(missing))
        total_lines = len(id_periods)

        # allocate main array
        target = get_default_array(total_lines, np.dtype(target_fields))
        target['period'] = id_periods['period']
        target['id'] = id_periods['id']

        arrays = [f.as_array(fields_to_load)
                  for f, fields_to_load in zip(files, fields_per_file)]

        # close all files
        for f in files:
            f.close()

        # FIXME: interpolation currently only interpolates missing data points,
        #        not data points with their value equal to the missing value
        #        corresponding to the field type. This can only be fixed once
        #        booleans are loaded as int8.
        if interpolate_def is not None:
            if any(v != 'previous_value' for v in interpolate_def.itervalues()):
                raise Exception("currently, only 'previous_value' "
                                "interpolation is supported")
            to_interpolate = [k for k, v in interpolate_def.iteritems()
                              if v == 'previous_value']
        else:
            to_interpolate = []

        interpolate(target, arrays, id_periods, to_interpolate)
        return 'table', (target_fields, total_lines, iter(target), None)
def bucketize(self):
    tuples = []
    ctr = 0
    for src, targ in zip(self.train_sent, self.targ_sent):
        len_tup = self.bisect.twod_bisect(src, targ)
        tuples.append((src, targ, len_tup))

    sorted_keys = sorted(tuples, key=operator.itemgetter(2))
    grouped = groupby(sorted_keys, lambda x: x[2])
    self.sorted_keys = map(lambda x: x[2], sorted_keys)
    self.bucketed_data = []
    self.bucket_idx_to_key = []

    global_count = 0L
    error_count = 0L

    for group in grouped:
        # get src and targ sentences, ignore the last elem of the tuple
        # (the grouping key of (src_len, targ_len))
        key, value = group[0], map(lambda x: x[:2], group[1])
        if len(value) < self.batch_size:
            continue

        # create padded representation
        new_src = np.full((len(value), key[0]), self.pad_id, dtype=self.dtype)
        new_targ = np.full((len(value), key[1] + 1), self.pad_id, dtype=self.dtype)
        new_label = np.full((len(value), key[1] + 1), self.pad_id, dtype=self.dtype)

        for idx, example in enumerate(value):
            try:
                global_count += 1
                curr_src, curr_targ = example
                rev_src = curr_src[::-1]
                new_src[idx, -len(curr_src):] = rev_src
                new_targ[idx, 0] = self.go_id
                new_targ[idx, 1:(len(curr_targ) + 1)] = curr_targ
                new_label[idx, 0:len(curr_targ)] = curr_targ
                new_label[idx, len(curr_targ)] = self.eos_id
            except ValueError as ve:
                error_count += 1
                print(ve.message)
                print("global count: %d, error count: %d" % (global_count, error_count))
                continue

        self.bucketed_data.append((new_src, new_targ, new_label))
        self.bucket_idx_to_key.append((key[0], key[1] + 1))

    self.bucket_key_to_idx = invert_dict(dict(enumerate(self.bucket_idx_to_key)))
    self.interbucket_idx = -1
    self.curr_bucket_id = None
    self.curr_chunks = None
    self.curr_buck = None
    self.switch_bucket = True
    self.num_buckets = len(self.bucket_idx_to_key)
    self.bucket_iterator_indices = list(range(self.num_buckets))
    self.default_bucket_key = self.sorted_keys[-1]
import utils

content_types = utils.invert_dict({
    "text/html": ["htm", "html"],
    "application/json": ["json"],
    "application/xhtml+xml": ["xht", "xhtm", "xhtml"],
    "application/xml": ["xml"],
    "application/x-xpinstall": ["xpi"],
    "text/javascript": ["js"],
    "text/css": ["css"],
    "text/plain": ["txt", "md"],
    "image/svg+xml": ["svg"],
    "image/gif": ["gif"],
    "image/jpeg": ["jpg", "jpeg"],
    "image/png": ["png"],
    "image/bmp": ["bmp"],
    "text/event-stream": ["event_stream"],
    "text/cache-manifest": ["manifest"],
    "video/mp4": ["mp4", "m4v"],
    "audio/mp4": ["m4a"],
    "audio/mpeg": ["mp3"],
    "video/webm": ["webm"],
    "audio/webm": ["weba"],
    "video/ogg": ["ogg", "ogv"],
    "audio/ogg": ["oga"],
    "audio/x-wav": ["wav"],
    "text/vtt": ["vtt"],
})

response_codes = {
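# The `utils.invert_dict` used for `content_types` above apparently accepts a
# dict whose values are lists and maps every list element back to its key
# (e.g. "html" -> "text/html"). A hypothetical sketch under that assumption,
# named `invert_dict_of_lists` here to distinguish it from the one-to-one
# variant sketched near the top of this collection:
def invert_dict_of_lists(d):
    """Map each element of every value list back to its key."""
    return {item: key for key, items in d.items() for item in items}

# e.g. invert_dict_of_lists({"image/png": ["png"]}) == {"png": "image/png"}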
def main(options): args = get_default_args() load_best_args(args, options, get_best_args()) set_args(args, options) print_args(args) mode = args['mode'] train_name, test_name = args['split']['train'], args['split']['test'] if train_name == 'train_all': train_set = ['trec-2011', 'trec-2012', 'trec-2013', 'trec-2014'] train_set.remove(test_name) else: train_set = [train_name] test_set = test_name print('train_set: {}, test_set: {}'.format(train_set, test_set)) max_query_len, max_doc_len, max_url_len = defaultdict(int), defaultdict( int), defaultdict(int) vocab = {'word': {}, '3gram': {}, 'url': {}} test_vocab = {'word': {}, '3gram': {}, 'url': {}} ############################# LOAD DATA ################################## data_name = ("data_m%s_%s_%s" % (mode, train_name, test_name)).lower() if args["load_data"]: train_dataset, vocab, train_vocab_emb, max_query_len, max_doc_len, max_url_len = load_data( "%s/%s/%s" % (args["experimental_data"], data_name, train_name), True) test_dataset, test_vocab, test_vocab_emb, _, _, _ = load_data( "%s/%s/%s" % (args["experimental_data"], data_name, test_name), False) print('load dataset successfully') else: train_dataset = gen_data(args["raw_data"], train_set, vocab, test_vocab, True, max_query_len, max_doc_len, max_url_len, args) print("create training set successfully...") test_dataset = gen_data(args["raw_data"], [test_set], vocab, test_vocab, False, max_query_len, max_doc_len, max_url_len, args) train_vocab_emb, test_vocab_emb = construct_vocab_emb( "%s/%s" % (args["experimental_data"], data_name), vocab['word'], test_vocab['word'], 300, "word", base_embed_path=args["base_embed_path"]) save_data( "%s/%s/%s" % (args["experimental_data"], data_name, train_name), True, train_dataset, max_query_len, max_doc_len, max_url_len, vocab, train_vocab_emb) print("save training set successfully...") save_data("%s/%s/%s" % (args["experimental_data"], data_name, test_name), False, test_dataset, vocab=test_vocab, vocab_emb=test_vocab_emb) print("save test set successfully...") val_split = args['val_split'] num_samples, _ = train_dataset["query_word_input"].shape # randomly sample queries and all their documents if query_random is True # otherwise, query-doc pairs are randomly sampled query_random = True if query_random: val_indices = sample_val_set(args["raw_data"], train_set, val_split) else: val_indices, val_set = [], set() for i in range(int(num_samples * val_split)): val_index = np.random.randint(num_samples) while val_index in val_set: val_index = np.random.randint(num_samples) val_indices.append(val_index) val_set.add(val_index) val_dataset = {} for key in train_dataset: val_dataset[key] = train_dataset[key][val_indices] train_dataset[key] = np.delete(train_dataset[key], val_indices, 0) # shuffle the train dataset explicitly to make results reproducible # whether the performance will be affected remains a question keys, values = [], [] for key in train_dataset: keys.append(key) values.append(train_dataset[key]) zipped_values = list(zip(*values)) random.shuffle(zipped_values) shuffled_values = list(zip(*zipped_values)) for i, key in enumerate(keys): train_dataset[key] = np.array(shuffled_values[i]) print('after shuffle: id {}, sim {}, query_word_input'.format( train_dataset['id'][:3], train_dataset['sim'][:3], train_dataset['query_word_input'][:3])) # merge the vocabulory of train and test set merged_vocab = {'url': vocab['url'], '3gram': vocab['3gram']} merged_vocab['word'] = merge_two_dicts(vocab['word'], test_vocab['word']) print("merged vocab: word(%d) 
3gram(%d)" % (len(merged_vocab['word']), len(test_vocab['3gram']))) vocab_inv, vocab_size = {}, {} vocab['char'] = merge_two_dicts(vocab['3gram'], vocab['url']) test_vocab['char'] = merge_two_dicts(test_vocab['3gram'], test_vocab['url']) merged_vocab['char'] = merge_two_dicts(vocab['char'], test_vocab['char']) for key in vocab: vocab_inv[key] = invert_dict(merged_vocab[key]) vocab_size[key] = len(vocab[key]) print(vocab_size) # Print data samples for debug purpose print_dataset(mode, train_dataset, vocab_inv) print_dataset(mode, test_dataset, vocab_inv) ############################ TRAIN MODEL ################################# model = None if mode == 'deep_twitter': model = create_attention_model(max_query_len, max_doc_len, max_url_len, vocab_size, train_vocab_emb, args["nb_filters"], embed_size=300, dropout_rate=args['dropout'], trainable=args["trainable"], weighting=args['weighting'], mask=args["mask"], conv_option=args['conv_option'], model_option=args['model_option']) model_name = ( "model_N%s_data%s_mo%s_c%s_NumFilter%d_T%s_D%.1f_W%s_M%s_B%d_Val%.2f" % (mode, train_name, args['model_option'], args['conv_option'], args["nb_filters"], args["trainable"], args['dropout'], args['weighting'], args['mask'], args['batch_size'], args['val_split'])).lower() model_path = "%s/%s/%s" % (args['experimental_data'], data_name, model_name) print(model_path) if args['optimizer'] == "adam": opt = optimizers.Adam(lr=args["learning_rate"], beta_1=0.9, beta_2=0.999, epsilon=None, decay=0.0, amsgrad=True) print('use Adam optimizer') elif args['optimizer'] == "sgd": opt = optimizers.SGD(lr=args["learning_rate"], decay=1e-6, momentum=0.9, nesterov=True) print('use SGD optimizer') elif args['optimizer'] == 'rmsprop': opt = optimizers.RMSprop(lr=args["learning_rate"], rho=0.9, epsilon=None, decay=0.0) print('use RMSprop optimizer') model.compile(loss='binary_crossentropy', optimizer=opt, metrics=['accuracy']) print(model.summary()) print('model init weights sum: %.4f' % get_model_weights(model)) if not args['load_model']: early_stopping = EarlyStopping(monitor='val_loss', patience=4) checkpoint = ModelCheckpoint(filepath=model_path + ".best.weights", monitor='val_loss', save_best_only=True, verbose=1) lr_reducer = ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=2, min_lr=0.0001, verbose=1) #print(train_dataset['id'][:3], val_dataset['id'][:3], val_dataset['id'][-3:]) model.fit(train_dataset, train_dataset['sim'], validation_data=(val_dataset, val_dataset['sim']), batch_size=args['batch_size'], epochs=args['epochs'], shuffle=False, callbacks=[checkpoint, lr_reducer, early_stopping], verbose=args['verbose']) ############################ TEST MODEL ################################# print('load best model from %s.best.weights' % model_path) model.load_weights("%s.best.weights" % model_path) if mode == 'deep_twitter': # load trained vocab embedding. 
trained_vocab_emb = model.get_layer('sequential_2').get_weights()[0] # merge trained vocab embedding with test OOV word embeddings merged_vocab_emb = np.zeros(shape=(len(merged_vocab['word']), 300)) merged_vocab_emb[0:len(vocab['word']), :] = trained_vocab_emb merged_vocab_emb[ len(vocab['word']):len(merged_vocab['word']), :] = test_vocab_emb for key in vocab: vocab_size[key] = len(merged_vocab[key]) print(vocab_size) new_model = create_attention_model(max_query_len, max_doc_len, max_url_len, vocab_size, merged_vocab_emb, args["nb_filters"], embed_size=300, dropout_rate=args['dropout'], trainable=args["trainable"], weighting=args['weighting'], mask=args["mask"], conv_option=args['conv_option'], model_option=args['model_option']) new_model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy']) print(new_model.summary()) num_layers = 0 for layer in model.layers: num_layers += 1 for layer_id in range(num_layers): layer = model.layers[layer_id] if layer.name != 'sequential_2': new_model.layers[layer_id].set_weights(layer.get_weights()) print('copy weight done.') predictions = new_model.predict(test_dataset) print(predictions[:10]) predictions_file = "%s/%s/predictions_%s.txt" % (args["experimental_data"], data_name, model_name) with open(predictions_file, 'w') as f: for i in range(test_dataset['id'].shape[0]): f.write("%s %.4f %s\n" % (test_dataset['id'][i], predictions[i], args['mode'])) print('write predictions with trec format to %s' % predictions_file) map, mrr, p30 = evaluate(predictions_file, args["qrels_file"]) print('MAP: %.4f P30: %.4f MRR: %.4f' % (map, p30, mrr))
"G": 379., "U": 340., } one_to_three = { 'A':'Ala', 'B':'Asx', 'C':'Cys', 'D':'Asp', 'E':'Glu', 'F':'Phe', 'G':'Gly', 'H':'His', 'I':'Ile', 'K':'Lys', 'L':'Leu', 'M':'Met', 'N':'Asn', 'P':'Pro', 'Q':'Gln', 'R':'Arg', 'S':'Ser', 'T':'Thr', 'V':'Val', 'W':'Trp', 'Y':'Tyr', 'Z':'Glx', 'X':'Xaa', 'U':'Sec', 'J':'Xle', 'O':'Pyl' } standard_three_to_one = utils.invert_dict(one_to_three) extended_three_to_one= { '2as':'D', '3ah':'H', '5hp':'E', 'Acl':'R', 'Agm':'R', 'Aib':'A', 'Ala':'A', 'Alm':'A', 'Alo':'T', 'Aly':'K', 'Arg':'R', 'Arm':'R', 'Asa':'D', 'Asb':'D', 'Ask':'D', 'Asl':'D', 'Asn':'N', 'Asp':'D', 'Asq':'D', 'Asx':'B', 'Aya':'A', 'Bcs':'C', 'Bhd':'D', 'Bmt':'T', 'Bnn':'A', 'Buc':'C', 'Bug':'L', 'C5c':'C', 'C6c':'C', 'Ccs':'C', 'Cea':'C', 'Cgu':'E', 'Chg':'A', 'Cle':'L', 'Cme':'C', 'Csd':'A', 'Cso':'C', 'Csp':'C', 'Css':'C', 'Csw':'C', 'Csx':'C', 'Cxm':'M', 'Cy1':'C', 'Cy3':'C', 'Cyg':'C', 'Cym':'C', 'Cyq':'C', 'Cys':'C', 'Dah':'F', 'Dal':'A', 'Dar':'R', 'Das':'D', 'Dcy':'C', 'Dgl':'E', 'Dgn':'Q', 'Dha':'A', 'Dhi':'H', 'Dil':'I', 'Div':'V', 'Dle':'L', 'Dly':'K', 'Dnp':'A', 'Dpn':'F', 'Dpr':'P', 'Dsn':'S', 'Dsp':'D', 'Dth':'T', 'Dtr':'W', 'Dty':'Y', 'Dva':'V', 'Efc':'C', 'Fla':'A', 'Fme':'M', 'Ggl':'E', 'Gl3':'G', 'Gln':'Q', 'Glu':'E', 'Glx':'Z', 'Gly':'G', 'Glz':'G', 'Gma':'E', 'Gsc':'G', 'Hac':'A', 'Har':'R', 'Hic':'H', 'Hip':'H', 'His':'H', 'Hmr':'R', 'Hpq':'F', 'Htr':'W', 'Hyp':'P', 'Iil':'I', 'Ile':'I', 'Iyr':'Y', 'Kcx':'K', 'Leu':'L', 'Llp':'K', 'Lly':'K', 'Ltr':'W', 'Lym':'K', 'Lys':'K', 'Lyz':'K', 'Maa':'A', 'Men':'N', 'Met':'M', 'Mhs':'H', 'Mis':'S', 'Mle':'L', 'Mpq':'G', 'Msa':'G', 'Mse':'M', 'Mva':'V', 'Nem':'H', 'Nep':'H', 'Nle':'L', 'Nln':'L', 'Nlp':'L', 'Nmc':'G', 'Oas':'S', 'Ocs':'C', 'Omt':'M', 'Paq':'Y', 'Pca':'E', 'Pec':'C', 'Phe':'F', 'Phi':'F', 'Phl':'F', 'Pr3':'C', 'Pro':'P', 'Prr':'A', 'Ptr':'Y', 'Pyl':'O', 'Sac':'S', 'Sar':'G', 'Sch':'C', 'Scs':'C', 'Scy':'C', 'Sec':'U', 'Sel':'U', 'Sep':'S', 'Ser':'S', 'Set':'S', 'Shc':'C', 'Shr':'K', 'Smc':'C', 'Soc':'C', 'Sty':'Y', 'Sva':'S', 'Ter':'*', 'Thr':'T', 'Tih':'A', 'Tpl':'W', 'Tpo':'T', 'Tpq':'A', 'Trg':'K', 'Tro':'W', 'Trp':'W', 'Tyb':'Y', 'Tyq':'Y', 'Tyr':'Y', 'Tys':'Y', 'Tyy':'Y', 'Unk':'X', 'Val':'V', 'Xaa':'X', 'Xer':'X', 'Xle':'J'} # Initial table is from the ASTRAL RAF release notes. # added UNK # Extra IUPAC: Xle, Xaa, Sec, Pyl # The following have been seen in biopython code. # Ter : '*' Termination # Sel : 'U' A typo for Sec, selenocysteine? # Xer : 'X' Another alternative for unknown? amino_acid_names = { 'A' : 'alanine', 'M' : 'methionine',
import utils

content_types = utils.invert_dict({
    "text/html": ["htm", "html"],
    "application/xhtml+xml": ["xht", "xhtm", "xhtml"],
    "text/javascript": ["js"],
    "text/css": ["css"],
    "text/plain": ["txt", "md"],
    "text/xml": ["xml"],
    "image/svg+xml": ["svg"],
    "image/jpeg": ["jpg", "jpeg"],
    "image/png": ["png"],
    "text/event-stream": ["event_stream"],
    "text/cache-manifest": ["manifest"],
    "video/mp4": ["mp4", "m4v"],
    "audio/mp4": ["m4a"],
    "audio/mpeg": ["mp3"],
    "video/webm": ["webm"],
    "audio/webm": ["weba"],
    "video/ogg": ["ogg", "ogv"],
    "audio/ogg": ["oga"],
    "audio/x-wav": ["wav"],
    "text/vtt": ["vtt"],
})

response_codes = {
    100: ('Continue', 'Request received, please continue'),
    101: ('Switching Protocols',
          'Switching to new protocol; obey Upgrade header'),
    200: ('OK', 'Request fulfilled, document follows'),
    201: ('Created', 'Document created, URL follows'),
import utils

SFA = "http://developer.apple.com/namespaces/sfa"
SF = "http://developer.apple.com/namespaces/sf"
XSI = "http://www.w3.org/2001/XMLSchema-instance"
KEY = "http://developer.apple.com/namespaces/keynote2"

NSMAP = {
    "sfa": SFA,
    "sf": SF,
    "xsi": XSI,
    "key": KEY,
}

NAMESPACE_TO_URL = {k: "{" + v + "}" for k, v in NSMAP.items()}
URL_TO_NAMESPACE = utils.invert_dict(NAMESPACE_TO_URL)


def ns(qname):
    """
    returns the lxml representation of an xml namespace, using a
    static lookup table.
    """
    if len(qname) and qname[0] == "{":
        return qname
    i = qname.find(":")
    if i < 0:
        return qname
    return NAMESPACE_TO_URL[qname[0:i]] + qname[i + 1:]


class XMLError(Exception):
    pass


class Element(object):
                                              inverse_vocabulary, weighted_feature_numbers)
        style_features_and_weights[style] = ranked_features
    return style_features_and_weights


## STEP 1.
## LOAD AND PREPARE DATA
x_tr, y_tr = load_train_set()

# edit fit_vectorizer() to create custom vectorizer for dataset
# otherwise, load existing vectorizer under DATA_VECTORIZER_PATH
# fit_vectorizer(x_tr)
vectorizer = load_model(DATA_VECTORIZER_PATH)
inverse_vocabulary = invert_dict(vectorizer.vocabulary_)

## STEP 2.
## TRAIN MODEL TO OBTAIN STYLE WEIGHTS
# to experiment with style weighting, edit parameters and train a new model
regularization_type = 'l1'
C = 3
lr_path = f'../models/style_weights_extractor_{regularization_type}_reg_C_{C}.pkl'
# vec_x_tr = vectorizer.transform(x_tr)
# lr_model = train(regularization_type, C, vec_x_tr, y_tr)
# save_model(lr_model, lr_path)
model = load_model(lr_path)

## STEP 3.
## EXTRACT STYLE FEATURES AND WEIGHTS
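
# Illustrative sketch (not part of the original script): in scikit-learn,
# vectorizer.vocabulary_ maps each term to its column index, so inverting it
# gives an index -> term lookup for translating model coefficients back into
# readable features. invert_dict is assumed here to be a plain one-to-one swap.
from sklearn.feature_extraction.text import CountVectorizer

def _invert_dict_sketch(d):
    """One-to-one inversion: map each value of d back to its key."""
    return {v: k for k, v in d.items()}

_vec = CountVectorizer().fit(["the cat sat", "the dog barked"])
_index_to_term = _invert_dict_sketch(_vec.vocabulary_)
assert _index_to_term[_vec.vocabulary_["cat"]] == "cat"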
def gen_data(path, datasets, vocab, test_vocab, is_train, max_query_len, max_doc_len, max_url_len, args):
    if is_train:
        vocab['word']['PAD_WORD_INDEX'] = PAD_WORD_INDEX
        vocab['word']['OOV_WORD_INDEX'] = OOV_WORD_INDEX
        vocab['3gram']['PAD_3GRAM_INDEX'] = PAD_WORD_INDEX
        vocab['3gram']['OOV_3GRAM_INDEX'] = OOV_WORD_INDEX
        vocab['url']['PAD_URL_INDEX'] = PAD_WORD_INDEX
        vocab['url']['OOV_URL_INDEX'] = OOV_WORD_INDEX
    query_word_list, doc_word_list, query_3gram_list, doc_3gram_list = [], [], [], []
    all_url_list, all_ids_list, all_sim_list = [], [], []
    t0 = time.time()
    for data_name in datasets:  # there can be multiple data sets combined as the train or test data
        data_folder = "%s/%s" % (path, data_name)
        print('load dataset %s' % data_name)
        t = time.time()
        q1_word_list, max_q1_word_len = read_sentences("%s/a.toks" % data_folder, vocab, is_train, "word", test_vocab=test_vocab)
        q2_word_list, max_q2_word_len = read_sentences("%s/b.toks" % data_folder, vocab, is_train, "word", test_vocab=test_vocab)
        q1_3gram_list, max_q1_3gram_len = read_sentences("%s/a.toks" % data_folder, vocab, is_train, "3gram", test_vocab=test_vocab)
        q2_3gram_list, max_q2_3gram_len = read_sentences("%s/b.toks" % data_folder, vocab, is_train, "3gram", test_vocab=test_vocab)
        url_list, max_url_len_dataset = read_urls("%s/url.txt" % data_folder, vocab, is_train, '3gram')
        ids_list = read_metadata("%s/id.txt" % data_folder)
        if is_train:
            max_query_len['word'] = max(max_query_len['word'], max_q1_word_len)
            max_query_len['3gram'] = max(max_query_len['3gram'], max_q1_3gram_len)
            max_doc_len['word'] = max(max_doc_len['word'], max_q2_word_len)
            max_doc_len['3gram'] = max(max_doc_len['3gram'], min(max_q2_3gram_len, MAX_TWEET_LENGTH))
            max_url_len['url'] = max(max_url_len['url'], min(max_url_len_dataset, MAX_URL_LENGTH))
        sim_list = read_relevance("%s/sim.txt" % data_folder)
        query_word_list.extend(q1_word_list)
        doc_word_list.extend(q2_word_list)
        query_3gram_list.extend(q1_3gram_list)
        doc_3gram_list.extend(q2_3gram_list)
        all_url_list.extend(url_list)
        all_ids_list.extend(ids_list)
        all_sim_list.extend(sim_list)
        print("q1 max_word_len: %d, q2 max_word_len: %d, len limit: (%d, %d)" %
              (max_q1_word_len, max_q2_word_len, max_query_len['word'], max_doc_len['word']))
        print("q1 max_3gram_len: %d, q2 max_3gram_len: %d, len limit: (%d, %d)" %
              (max_q1_3gram_len, max_q2_3gram_len, max_query_len['3gram'], max_doc_len['3gram']))
        print('max_url_len: %d, limit: %d' % (max_url_len_dataset, max_url_len['url']))
        print('load dataset done: %d' % (time.time() - t))

    # question padding
    data = {'sim': np.array(all_sim_list), 'id': np.array(all_ids_list)}
    data['query_word_input'] = pad_sequences(query_word_list, maxlen=max_query_len['word'],
                                             value=PAD_WORD_INDEX, padding='post', truncating='post')
    data['query_word_mask'] = create_masks(data['query_word_input'], args)
    data['doc_word_input'] = pad_sequences(doc_word_list, maxlen=max_doc_len['word'],
                                           value=PAD_WORD_INDEX, padding='post', truncating='post')
    data['doc_word_mask'] = create_masks(data['doc_word_input'], args)
    data['query_3gram_input'] = pad_sequences(query_3gram_list, maxlen=max_query_len['3gram'],
                                              value=PAD_WORD_INDEX, padding='post', truncating='post')
    data['query_3gram_mask'] = create_masks(data['query_3gram_input'], args)
    data['doc_3gram_input'] = pad_sequences(doc_3gram_list, maxlen=max_doc_len['3gram'],
                                            value=PAD_WORD_INDEX, padding='post', truncating='post')
    data['doc_3gram_mask'] = create_masks(data['doc_3gram_input'], args)
    data['url_3gram_input'] = pad_sequences(all_url_list, maxlen=max_url_len['url'],
                                            value=PAD_WORD_INDEX, padding='post', truncating='pre')
    data['url_3gram_mask'] = create_masks(data['url_3gram_input'], args)

    if os.path.exists("%s/collection_ngram_idf.json" % path):
        t = time.time()
        weights = json.load(open("%s/collection_ngram_idf.json" % path, "r"))
        vocab_inv = invert_dict(vocab['3gram'])
        data['query_3gram_weight'] = inject_ngram_weight(data['query_3gram_input'], vocab_inv, weights)
        data['doc_3gram_weight'] = inject_ngram_weight(data['doc_3gram_input'], vocab_inv, weights)
        vocab_inv = invert_dict(vocab['url'])
        data['url_3gram_weight'] = inject_ngram_weight(data['url_3gram_input'], vocab_inv, weights)
        print('ngram weight injection done: %d' % (time.time() - t))

    if os.path.exists("%s/collection_word_idf.json" % path):
        t = time.time()
        weights = json.load(open("%s/collection_word_idf.json" % path, "r"))
        merge_vocab = merge_two_dicts(vocab['word'], test_vocab['word'])
        vocab_inv = invert_dict(merge_vocab)
        data['query_word_weight'] = inject_word_weight(data['query_word_input'], vocab_inv, weights)
        data['doc_word_weight'] = inject_word_weight(data['doc_word_input'], vocab_inv, weights)
        data['overlap_feat'] = compute_overlap_feat(data['query_word_input'], data['doc_word_input'],
                                                    vocab_inv, weights)
        print('word weight injection done: %d' % (time.time() - t))

    print('data creation is done: %d' % (time.time() - t0))
    return data
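
# Illustrative sketch (hypothetical; inject_ngram_weight itself is not shown in
# this excerpt): the inverted vocab (index -> token) is what allows a padded
# matrix of token indices to be mapped back to strings so per-token IDF weights
# can be looked up. A minimal version of that idea, with made-up toy data:
import numpy as np

def _lookup_weights_sketch(index_matrix, vocab_inv, idf, pad_index=0, default=0.0):
    """Return a same-shaped float matrix holding the IDF weight of each token."""
    out = np.zeros(index_matrix.shape, dtype=float)
    for i, row in enumerate(index_matrix):
        for j, idx in enumerate(row):
            if idx != pad_index:  # leave padding positions at 0
                out[i, j] = idf.get(vocab_inv.get(idx, ""), default)
    return out

_weights = _lookup_weights_sketch(np.array([[1, 2, 0]]),
                                  {1: "abc", 2: "bcd"},
                                  {"abc": 1.5, "bcd": 0.7})
assert _weights.tolist() == [[1.5, 0.7, 0.0]]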
def load_def(localdir, ent_name, section_def, required_fields):
    if 'type' in section_def and 'fields' in section_def:
        raise Exception("invalid structure for '%s': "
                        "type and fields sections are mutually exclusive"
                        % ent_name)

    if 'type' in section_def:
        csv_filename = section_def.get('path', ent_name + ".csv")
        csv_filepath = complete_path(localdir, csv_filename)
        str_type = section_def['type']
        if isinstance(str_type, basestring):
            celltype = field_str_to_type(str_type, "array '%s'" % ent_name)
        else:
            assert isinstance(str_type, type)
            celltype = str_type
        return 'ndarray', load_ndarray(csv_filepath, celltype)

    fields_def = section_def.get('fields')
    if fields_def is not None:
        for fdef in fields_def:
            if isinstance(fdef, basestring):
                raise SyntaxError("invalid field declaration: '%s', you are "
                                  "probably missing a ':'" % fdef)
        if all(isinstance(fdef, dict) for fdef in fields_def):
            fields = fields_yaml_to_type(fields_def)
        else:
            assert all(isinstance(fdef, tuple) for fdef in fields_def)
            fields = fields_def
    else:
        fields = None

    newnames = merge_dicts(invert_dict(section_def.get('oldnames', {})),
                           section_def.get('newnames', {}))
    transpose = section_def.get('transposed', False)
    interpolate_def = section_def.get('interpolate')
    files_def = section_def.get('files')
    if files_def is None:
        #XXX: it might be cleaner to use the same code path as for the
        # multi-file case (however, that would lose the "import any file
        # size" feature that I'm fond of).

        # we can simply return the stream as-is
        #FIXME: stream is not sorted
        # csv file is assumed to be in the correct order (ie by period then id)
        csv_filename = section_def.get('path', ent_name + ".csv")
        csv_filepath = complete_path(localdir, csv_filename)
        csv_file = CSV(csv_filepath, newnames,
                       delimiter=',', transpose=transpose)
        if fields is not None:
            fields = required_fields + fields
        stream = csv_file.read(fields)
        if fields is None:
            fields = csv_file.fields
        if interpolate_def is not None:
            raise Exception('interpolate is currently only supported with '
                            'multiple files')
        return 'table', (fields, csv_file.numlines, stream, csv_file)
    else:
        # we have to load all files, merge them and return a stream out of that
        print(" * computing number of rows...")

        # 1) only load required fields
        default_args = dict(newnames=newnames, transpose=transpose)
        if isinstance(files_def, dict):
            files_items = files_def.items()
        elif isinstance(files_def, list) and files_def:
            if isinstance(files_def[0], dict):
                # handle YAML ordered dict structure
                files_items = [d.items()[0] for d in files_def]
            elif isinstance(files_def[0], basestring):
                files_items = [(path, {}) for path in files_def]
            else:
                raise Exception("invalid structure for 'files'")
        else:
            raise Exception("invalid structure for 'files'")

        #XXX: shouldn't we use the "path" defined for the whole entity if any?
        # section_def.get('path')
        files = []
        for path, kwargs in files_items:
            kwargs['newnames'] = \
                merge_dicts(invert_dict(kwargs.pop('oldnames', {})),
                            kwargs.get('newnames', {}))
            f = CSV(complete_path(localdir, path),
                    **merge_dicts(default_args, kwargs))
            files.append(f)
        id_periods = union1d(f.as_array(required_fields) for f in files)

        print(" * reading files...")
        # 2) load all fields
        if fields is None:
            target_fields = merge_items(*[f.fields for f in files])
            fields_per_file = [None for f in files]
        else:
            target_fields = required_fields + fields
            fields_per_file = [[(name, type_) for name, type_ in target_fields
                                if name in f.field_names]
                               for f in files]
            total_fields = set.union(*[set(f.field_names) for f in files])
            missing = set(name for name, _ in target_fields) - total_fields
            if missing:
                raise Exception("the following fields were not found in any "
                                "file: %s" % ", ".join(missing))

        total_lines = len(id_periods)

        # allocate main array
        target = np.empty(total_lines, dtype=np.dtype(target_fields))
        # fill with default values
        target[:] = tuple(missing_values[ftype] for _, ftype in target_fields)
        target['period'] = id_periods['period']
        target['id'] = id_periods['id']

        arrays = [f.as_array(fields_to_load)
                  for f, fields_to_load in zip(files, fields_per_file)]

        # close all files
        for f in files:
            f.close()

        #FIXME: interpolation currently only interpolates missing data points,
        # not data points with their value equal to the missing value
        # corresponding to the field type. This can only be fixed once
        # booleans are loaded as int8.
        if interpolate_def is not None:
            if any(v != 'previous_value' for v in interpolate_def.itervalues()):
                raise Exception("currently, only 'previous_value' "
                                "interpolation is supported")
            to_interpolate = [k for k, v in interpolate_def.iteritems()
                              if v == 'previous_value']
        else:
            to_interpolate = []

        interpolate(target, arrays, id_periods, to_interpolate)

        return 'table', (target_fields, total_lines, iter(target), None)
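
# Illustrative sketch (not part of the original importer): judging from the
# invert_dict call above, an 'oldnames' section presumably maps new field names
# to the old CSV column names; inverting it and merging it with 'newnames'
# (old -> new) yields a single old-column -> new-field rename map. Assuming a
# one-to-one invert_dict and a right-biased merge_dicts:
def _invert_dict_sketch(d):
    return {v: k for k, v in d.items()}

def _merge_dicts_sketch(d1, d2):
    merged = dict(d1)
    merged.update(d2)
    return merged

_oldnames = {"income": "inc"}      # new field name -> old CSV column
_newnames = {"per": "period"}      # old CSV column -> new field name
_rename = _merge_dicts_sketch(_invert_dict_sketch(_oldnames), _newnames)
assert _rename == {"inc": "income", "per": "period"}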