def get_distance(seta, setb):
    """Summarise nearest-neighbour distances from ``setb`` to ``seta``.

    Both inputs are first passed through ``util.vectorize``.  A
    one-neighbour ``NearestNeighbors`` index is fitted on the vectorized
    ``seta`` and queried with the vectorized ``setb``.

    Returns:
        A 3-tuple ``(mean, 25th percentile, 75th percentile)`` of the
        distances from each query sample to its closest reference sample.
    """
    reference = util.vectorize(seta)
    queries = util.vectorize(setb)

    index = NearestNeighbors(n_neighbors=1).fit(reference)
    # kneighbors yields (distances, indices); the indices are not needed.
    neighbour_dists, _ = index.kneighbors(queries)
    # With a single neighbour requested, column 0 holds the nearest distance.
    nearest = neighbour_dists[:, 0]

    summary = (
        np.mean(nearest),
        np.percentile(nearest, 25),
        np.percentile(nearest, 75),
    )
    return summary
def lofarGenUVW(corrMatrix, ants, obs, sbs, ts):
    """Generate UVW coordinates from antenna positions, timestamps/subbands

    corrMatrix: [Nsubbands, Nints, nantpol, nantpol] array, correlation matrix
        for each subband, time integration; even/odd rows and columns are
        de-interleaved into the four polarization products
    ants: [Nantennas, Npol, 3] array, antenna positions in XYZ; only
        polarization index 0 is read (pols are assumed to be at the same
        position)
    obs: ephem.Observer() of station (only obs.lat is read, for the log line)
    sbs: [Nsubbands] array, subband IDs (only its length is used)
    ts: datetime 2D array [Nsubbands, Nints], timestamp for each correlation
        matrix

    returns:
        vis: visibilities [4, Nsamples*Nints, Nsubbands]
        uvw: UVW coordinates [Nsamples*Nints, 3, Nsubbands]
        LSTangle: sidereal time (radians) at the reference position for the
            last (subband, integration) processed, or None when there was
            nothing to process

    Nsamples is Nantennas*(Nantennas+1)/2.
    """
    nants = ants.shape[0]
    # Floor division keeps ncorrs an integer; under true division this would
    # be a float, which np.zeros does not accept as a dimension.
    ncorrs = nants * (nants + 1) // 2
    nints = ts.shape[1]
    nsbs = len(sbs)
    uvw = np.zeros((nints, ncorrs, 3, nsbs), dtype=float)
    vis = np.zeros((4, nints, ncorrs, nsbs), dtype=complex)  # 4 polarizations: xx, xy, yx, yy

    # Compute baselines in XYZ. They depend only on the antenna positions, so
    # this is done once instead of once per (subband, integration).
    # ants is of the form [nants, npol, 3], assume pols are at the same position
    antPosRep = np.repeat(ants[:, 0, :], nants, axis=0).reshape((nants, nants, 3))
    xyz = util.vectorize(antPosRep - np.transpose(antPosRep, (1, 0, 2)))

    # Rotation matrices for XYZ -> UVW transform.
    # Set the north pole to be dec 90, thus the dec rotation matrix below is
    # not really needed. It is constant, so it is also built only once.
    dec = float(np.pi / 2.)
    decRotMat = np.array([
        [1., 0., 0.],
        [0., np.sin(dec), np.cos(dec)],
        [0., -1. * np.cos(dec), np.sin(dec)]])  # rotate about x-axis

    # (row offset, column offset) into the interleaved correlation matrix for
    # each of the four polarization products stored in vis[0..3].
    polOffsets = ((0, 0), (1, 0), (0, 1), (1, 1))

    # Defined up front so the return below cannot hit an unbound name when
    # there are no subbands or no integrations.
    LSTangle = None

    for sbIdx in range(nsbs):
        for tIdx in range(nints):
            # TODO: using a reference Obs empirically works, but I can't quite justify it yet
            # TODO: using a reference Obs probably breaks the FT script, check if there is another rotation needed
            # Create an observatory at (lat,long)=(0,-90) to get the sidereal
            # time at the reference position, this is along the Y axis I believe
            refObs = lofarObserver(0., -90., 0., ts[sbIdx, tIdx])
            LSTangle = refObs.sidereal_time()  # sidereal time at reference location, radians
            # Alternative using the station observer itself:
            #   obs.epoch = ts[sbIdx, tIdx]
            #   obs.date = ts[sbIdx, tIdx]
            #   LSTangle = obs.sidereal_time()  # radians
            # Single formatted string so the output is the same whether print
            # is a statement or a function.
            print('LST: %s Dec: %s' % (LSTangle, obs.lat))

            # Hour Angle in reference to longitude/RA=0
            ha = float(LSTangle) - 0.
            # Use this instead if refObs is at (0,0):
            #   ha = float(LSTangle) - 0. - (np.pi/2.)
            haRotMat = np.array([
                [np.sin(ha), np.cos(ha), 0.],
                [-1. * np.cos(ha), np.sin(ha), 0.],
                [0., 0., 1.]])  # rotate about z-axis
            rotMatrix = np.dot(decRotMat, haRotMat)
            uvw[tIdx, :, :, sbIdx] = np.dot(rotMatrix, xyz.T).T

            # split up polarizations, vectorize the correlation matrix, and drop the lower triangle
            for polIdx, (rowOff, colOff) in enumerate(polOffsets):
                vis[polIdx, tIdx, :, sbIdx] = util.vectorize(
                    corrMatrix[sbIdx, tIdx, rowOff::2, colOff::2])

    # Merge the integration and baseline axes into a single sample axis.
    vis = np.reshape(vis, (vis.shape[0], vis.shape[1] * vis.shape[2], vis.shape[3]))
    uvw = np.reshape(uvw, (uvw.shape[0] * uvw.shape[1], uvw.shape[2], uvw.shape[3]))
    # TODO: i don't think we need to return the LST angle
    return vis, uvw, LSTangle
def shape_map_all(self, pred):
    """Return every shape estimate for ``pred`` in one list.

    The list is ordered ``[max, max_pca, mean, mean_pca]``: the two base
    shapes come from ``get_index`` / ``get_index_mean`` (vectorized and
    multiplied by 4), and each ``*_pca`` entry is the corresponding base
    shape passed through ``self.pca_noise_reduction``.  When ``p.debug`` is
    set, each shape is also displayed over ``self.img`` in that same order.
    """
    # NOTE(review): the factor 4 presumably rescales prediction coordinates
    # to image coordinates -- confirm against the model's output stride.
    from_max = vectorize(get_index(pred)) * 4
    from_max_pca = self.pca_noise_reduction(from_max)
    from_mean = vectorize(get_index_mean(pred)) * 4
    from_mean_pca = self.pca_noise_reduction(from_mean)

    shapes = [from_max, from_max_pca, from_mean, from_mean_pca]

    if p.debug:
        for candidate in shapes:
            show_predict(self.img, candidate)

    return shapes
def shape_map(self, pred, method):
    '''Convert a prediction into a shape using the requested method.

    method can be one of ['max', 'mean', 'max_pca', 'mean_pca']:
      - 'max':      vectorize(get_index(pred)) * 4
      - 'mean':     vectorize(get_index_mean(pred)) * 4
      - 'max_pca':  the 'max' shape passed through self.pca_noise_reduction
      - 'mean_pca': the 'mean' shape passed through self.pca_noise_reduction

    For 'mean_pca', when p.debug is set, both the raw and the PCA-reduced
    shape are displayed over self.img.

    Any other method prints a message and terminates the process via
    exit(0).

    Returns the computed shape.
    '''
    # NOTE(review): the factor 4 presumably rescales prediction coordinates
    # to image coordinates -- confirm against the model's output stride.
    if method == 'max':
        shape = vectorize(get_index(pred)) * 4
    elif method == 'mean':
        shape = vectorize(get_index_mean(pred)) * 4
    elif method == 'max_pca':
        shape = self.pca_noise_reduction(vectorize(get_index(pred)) * 4)
    elif method == 'mean_pca':
        raw_shape = vectorize(get_index_mean(pred)) * 4
        # The PCA-reduced shape is what gets returned, matching 'max_pca'.
        # Previously it was stored in a separate variable and the raw shape
        # was returned instead.
        shape = self.pca_noise_reduction(raw_shape)
        if p.debug:
            show_predict(self.img, raw_shape)
            show_predict(self.img, shape)
    else:
        # Two spaces after the colon reproduce the output of the original
        # comma-separated print statement.
        print('Unknown method:  %s' % (method,))
        # NOTE(review): exit status 0 reports success to the caller's shell
        # even though this is an error path -- consider a non-zero status.
        exit(0)
    return shape
def generate_answers(sess, model, dataset, rev_vocab):
    """
    Run the model over the dev or test dataset and collect its answers.

    The result maps each question uuid to the answer produced by
    ``model.formulate_answer``, i.e. answers[uuid] = "real answer".
    The answer must be a string of words, not a list of words and not a
    (start, end) index pair: main() dumps this dict to a JSON file, and
    evaluate.py compares that file against the original JSON to compute
    F1 and EM.

    :param sess: active TF session
    :param model: a built QASystem model
    :param dataset: 3-tuple (context, question, question_uuid_data)
    :param rev_vocab: this is a list of vocabulary that maps index to actual words
    :return: dict of uuid -> predicted answer
    """
    raw_contexts, raw_questions, uuids = dataset

    context_rows = convert_data_to_list(raw_contexts)
    question_rows = convert_data_to_list(raw_questions)
    padded_contexts, contexts_mask = pad_sequence(context_rows, FLAGS.max_context_len)
    padded_questions, questions_mask = pad_sequence(question_rows, FLAGS.max_question_len)

    input_data = vectorize(padded_contexts, contexts_mask,
                           padded_questions, questions_mask, uuids)

    batch_size = 32
    # Progress-bar target: number of full batches plus one.
    expected_batches = len(input_data) // batch_size + 1
    progress = Progbar(target=expected_batches)

    answers = {}
    for batch_no, batch in enumerate(minibatches(input_data, batch_size), 1):
        starts, ends = model.answer(sess, batch)
        progress.update(batch_no)

        # batch[0] carries the contexts and batch[4] the question uuids.
        for begin, end, passage, question_uuid in zip(starts, ends, batch[0], batch[4]):
            # Put the span in ascending order if the model returned it reversed.
            if begin > end:
                begin, end = end, begin
            answers[question_uuid] = model.formulate_answer(passage, rev_vocab, begin, end)

    return answers
def do_train(train_bodies, train_stances, dimension, embedding_path, config,
             max_headline_len=None, max_body_len=None, verbose=False,
             include_stopwords=True, similarity_metric_feature=None,
             weight_embeddings=False, idf=False):
    """Train an FNCModel and write its dev-set predictions to disk.

    Steps, in order: load and split the data, build a word-to-index map
    over the entire dataset, load the embedding matrix, vectorize the
    training and dev splits, build and fit the model, then write one
    tab-separated line per dev example to ``model.config.eval_output`` and
    every misclassified example to ``model.config.error_output``.

    Args:
        train_bodies, train_stances: forwarded to
            ``util.load_and_preprocess_fnc_data`` (presumably the training
            bodies/stances inputs -- confirm against that helper).
        dimension: embedding dimension, forwarded to
            ``util.load_embeddings`` and ``util.sentence_embeddings``.
        embedding_path: forwarded to ``util.load_embeddings``.
        config: model configuration; ``config.method`` selects the
            "arora" and "vanilla_bag_of_words" code paths.
        max_headline_len, max_body_len: maximum sequence lengths; when
            None, the maxima of the training split are used.  The dev split
            is always vectorized with these same lengths.
        verbose: forwarded to ``FNCModel``.
        include_stopwords: forwarded to
            ``util.load_and_preprocess_fnc_data``.
        similarity_metric_feature: forwarded to the data loader; when
            truthy, the ``sim_scores`` of each split are appended as an
            extra column of the training/dev data.
        weight_embeddings: forwarded to ``util.load_embeddings``.
        idf: when true (and the method is "vanilla_bag_of_words"), the
            embeddings are passed through ``util.idf_embeddings`` before
            sentence embeddings are computed.

    Returns:
        None.
    """
    logging.info("Loading training and dev data ...")
    fnc_data, fnc_data_train, fnc_data_dev = util.load_and_preprocess_fnc_data(
        train_bodies, train_stances, include_stopwords,
        similarity_metric_feature)
    logging.info("%d training examples", len(fnc_data_train.headlines))
    logging.info("%d dev examples", len(fnc_data_dev.headlines))

    if max_headline_len is None:
        max_headline_len = fnc_data_train.max_headline_len
    if max_body_len is None:
        max_body_len = fnc_data_train.max_body_len
    logging.info("Max headline length: %d", max_headline_len)
    logging.info("Max body length: %d", max_body_len)

    # For convenience, create the word indices map over the entire dataset
    logging.info("Building word-to-index map ...")
    corpus = ([w for bod in fnc_data.bodies for w in bod] +
              [w for headline in fnc_data.headlines for w in headline])
    word_indices = util.process_corpus(corpus)

    logging.info("Building embedding matrix ...")
    embeddings, known_words = util.load_embeddings(
        word_indices=word_indices, dimension=dimension,
        embedding_path=embedding_path, weight_embeddings=weight_embeddings)

    logging.info("Vectorizing data ...")
    # Vectorize and assemble the training data
    headline_vectors = util.vectorize(fnc_data_train.headlines, word_indices,
                                      known_words, max_headline_len)
    body_vectors = util.vectorize(fnc_data_train.bodies, word_indices,
                                  known_words, max_body_len)

    # Principal components are only computed for the "arora" method.
    headlines_pc = bodies_pc = None
    if config.method == "arora":
        headlines_pc = util.arora_embeddings_pc(headline_vectors, embeddings)
        bodies_pc = util.arora_embeddings_pc(body_vectors, embeddings)

    if config.method == "vanilla_bag_of_words":
        logging.info("Precomputing training sentence embeddings ...")
        train_emb = embeddings
        if idf:
            train_emb = util.idf_embeddings(
                word_indices, headline_vectors + body_vectors, train_emb)
        headlines_emb = util.sentence_embeddings(
            headline_vectors, dimension, max_headline_len, train_emb)
        bodies_emb = util.sentence_embeddings(
            body_vectors, dimension, max_body_len, train_emb)
        training_data = [headlines_emb, bodies_emb, fnc_data_train.stances]
    else:
        training_data = [headline_vectors, body_vectors,
                         fnc_data_train.stances]
    if similarity_metric_feature:
        training_data.append(fnc_data_train.sim_scores)
    # Materialise as a list: a bare zip object is a one-shot iterator on
    # Python 3 and would be empty after the first pass over the data.
    training_data = list(zip(*training_data))

    # Vectorize and assemble the dev data; note that we use the training
    # maximum length
    dev_headline_vectors = util.vectorize(fnc_data_dev.headlines, word_indices,
                                          known_words, max_headline_len)
    dev_body_vectors = util.vectorize(fnc_data_dev.bodies, word_indices,
                                      known_words, max_body_len)

    if config.method == "vanilla_bag_of_words":
        logging.info("Precomputing dev sentence embeddings ...")
        test_emb = embeddings
        if idf:
            # TODO(akshayka): Experiment with using whole corpus as
            # documents vs just training vs just testing
            # (This previously referenced a misspelt `headline_vecotrs`,
            # which raised NameError whenever this branch ran.)
            test_emb = util.idf_embeddings(
                word_indices,
                headline_vectors + dev_headline_vectors +
                body_vectors + dev_body_vectors,
                test_emb)
        dev_headlines_emb = util.sentence_embeddings(
            dev_headline_vectors, dimension, max_headline_len, test_emb)
        dev_bodies_emb = util.sentence_embeddings(
            dev_body_vectors, dimension, max_body_len, test_emb)
        dev_data = [dev_headlines_emb, dev_bodies_emb, fnc_data_dev.stances]
    else:
        dev_data = [dev_headline_vectors, dev_body_vectors,
                    fnc_data_dev.stances]
    if similarity_metric_feature:
        dev_data.append(fnc_data_dev.sim_scores)
    # dev_data is consumed by both model.fit and model.output below, so it
    # must be re-iterable.
    dev_data = list(zip(*dev_data))

    with tf.Graph().as_default():
        logger.info("Building model...")
        start = time.time()
        model = FNCModel(config, max_headline_len, max_body_len, embeddings,
                         headlines_pc=headlines_pc, bodies_pc=bodies_pc,
                         verbose=verbose)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()

        with tf.Session() as session:
            session.run(init)
            logging.info('Fitting ...')
            model.fit(session, saver, training_data, dev_data)

            logging.info('Outputting ...')
            output = model.output(session, dev_data)

            indices_to_words = {word_indices[w]: w for w in word_indices}
            # TODO(akshayka): Please code-review this. In particular,
            # please validate whether dev_headline_vectors is an equivalent
            # representation of output[0][0], and dev_body_vectors for output[0][1]
            headlines = [' '.join(
                util.word_indices_to_words(h, indices_to_words))
                for h in dev_headline_vectors]
            bodies = [' '.join(
                util.word_indices_to_words(b, indices_to_words))
                for b in dev_body_vectors]
            # output[1] and output[2] are used as the gold labels and the
            # predictions respectively.
            output = zip(headlines, bodies, output[1], output[2])

            with open(model.config.eval_output, 'w') as f, open(
                    model.config.error_output, "w") as g:
                for headline, body, label, prediction in output:
                    f.write("%s\t%s\tgold:%d\tpred:%d\n\n" % (
                        headline, body, label, prediction))
                    # Misclassified examples are duplicated into the error file.
                    if label != prediction:
                        g.write("%s\t%s\tgold:%d\tpred:%d\n\n" % (
                            headline, body, label, prediction))