def get_distance(seta, setb):
    """Summarise nearest-neighbour distances from ``setb`` to ``seta``.

    Both inputs are passed through ``util.vectorize``. A 1-nearest-neighbour
    model is fitted on the vectorized ``seta`` and then queried with the
    vectorized ``setb``.

    Returns:
        A tuple ``(mean, 25th percentile, 75th percentile)`` of the distance
        from each ``setb`` sample to its closest ``seta`` sample.
    """
    reference = util.vectorize(seta)
    queries = util.vectorize(setb)

    model = NearestNeighbors(n_neighbors=1).fit(reference)
    all_distances, _ = model.kneighbors(queries)

    # Only one neighbour is requested, so column 0 holds every distance.
    nearest = all_distances[:, 0]
    lower_quartile = np.percentile(nearest, 25)
    upper_quartile = np.percentile(nearest, 75)
    return np.mean(nearest), lower_quartile, upper_quartile
Esempio n. 2
0
def lofarGenUVW(corrMatrix, ants, obs, sbs, ts):
    """Generate UVW coordinates and per-polarization visibilities from antenna positions, timestamps/subbands
    corrMatrix: [Nsubbands, Nints, nantpol, nantpol] array, correlation matrix for each subband, time integration
    ants: [Nantennas, Npol, 3] array, antenna positions in XYZ; only ants[:, 0, :] is used (pols assumed co-located)
    obs: ephem.Observer() of station (only obs.lat is read here, for the diagnostic print)
    sbs: [Nsubbands] array, subband IDs
    ts: datetime 2D array [Nsubbands, Nints], timestamp for each correlation matrix

    returns:
        vis: visibilities [4, Nints*Ncorrs, Nsubbands], polarization order xx, xy, yx, yy
        uvw: UVW coordinates [Nints*Ncorrs, 3, Nsubbands]
        LSTangle: sidereal time of the reference observer for the last
            (subband, integration) processed, or None if none was processed
    with Ncorrs = Nantennas*(Nantennas+1)/2
    """
    nants = ants.shape[0]
    # Floor division keeps ncorrs an integer on Python 3 as well; a float
    # here would be rejected as an array dimension by np.zeros.
    ncorrs = nants * (nants + 1) // 2
    nints = ts.shape[1]
    nsbs = len(sbs)
    uvw = np.zeros((nints, ncorrs, 3, nsbs), dtype=float)
    vis = np.zeros((4, nints, ncorrs, nsbs), dtype=complex) # 4 polarizations: xx, xy, yx, yy

    # Compute baselines in XYZ. They depend only on the antenna layout, so
    # this is done once instead of once per (subband, integration).
    antPosRep = np.repeat(ants[:,0,:], nants, axis=0).reshape((nants, nants, 3)) # ants is of the form [nants, npol, 3], assume pols are at the same position
    xyz = util.vectorize(antPosRep - np.transpose(antPosRep, (1, 0, 2)))

    # Declination rotation for the XYZ -> UVW transform; also loop-invariant.
    dec = float(np.pi/2.) # set the north pole to be dec 90, thus the dec rotation matrix below is not really needed
    decRotMat = np.array([  [1.,              0.,          0.],
                            [0.,     np.sin(dec), np.cos(dec)],
                            [0., -1.*np.cos(dec), np.sin(dec)]]) #rotate about x-axis

    # Defined up front so the return below cannot hit an unbound name when
    # there are no subbands or no integrations.
    LSTangle = None

    for sbIdx in range(nsbs):
        for tIdx in range(nints):
            #TODO: using a reference Obs empirically works, but I can't quite justify it yet
            #TODO: using a reference Obs probably breaks the FT script, check if there is another rotation needed
            refObs = lofarObserver(0., -90., 0., ts[sbIdx, tIdx]) # create an observatory at (lat,long)=(0,-90) to get the sidereal time at the reference position, this is along the Y axis I believe
            LSTangle = refObs.sidereal_time() # sidereal time at reference location, radians

            #obs.epoch = ts[sbIdx, tIdx]
            #obs.date = ts[sbIdx, tIdx]

            #LSTangle = obs.sidereal_time() # radians
            # Single pre-formatted string: prints the same text under both
            # the Python 2 print statement and the Python 3 print function.
            print('LST: %s Dec: %s' % (LSTangle, obs.lat))

            # Hour-angle rotation matrix for the XYZ -> UVW transform
            ha = float(LSTangle) - 0. # Hour Angle in reference to longitude/RA=0
            #ha = float(LSTangle) - 0. - (np.pi/2.) # Hour Angle in reference to longitude/RA=0, use if refObs at (0,0)
            haRotMat = np.array([   [    np.sin(ha), np.cos(ha), 0.],
                                    [-1.*np.cos(ha), np.sin(ha), 0.],
                                    [0.,             0.,         1.]]) #rotate about z-axis
            rotMatrix = np.dot(decRotMat, haRotMat)

            uvw[tIdx, :, :, sbIdx] = np.dot(rotMatrix, xyz.T).T

            # split up polarizations, vectorize the correlation matrix, and drop the lower triangle
            vis[0, tIdx, :, sbIdx] = util.vectorize(corrMatrix[sbIdx, tIdx, 0::2, 0::2])
            vis[1, tIdx, :, sbIdx] = util.vectorize(corrMatrix[sbIdx, tIdx, 1::2, 0::2])
            vis[2, tIdx, :, sbIdx] = util.vectorize(corrMatrix[sbIdx, tIdx, 0::2, 1::2])
            vis[3, tIdx, :, sbIdx] = util.vectorize(corrMatrix[sbIdx, tIdx, 1::2, 1::2])

    # Merge the integration and correlation axes into one sample axis.
    vis = np.reshape(vis, (vis.shape[0], vis.shape[1]*vis.shape[2], vis.shape[3]))
    uvw = np.reshape(uvw, (uvw.shape[0]*uvw.shape[1], uvw.shape[2], uvw.shape[3]))

    #TODO: i don't think we need to return the LST angle
    return vis, uvw, LSTangle
Esempio n. 3
0
def lofarGenUVW(corrMatrix, ants, obs, sbs, ts):
    """Generate UVW coordinates and per-polarization visibilities from antenna positions, timestamps/subbands
    corrMatrix: [Nsubbands, Nints, nantpol, nantpol] array, correlation matrix for each subband, time integration
    ants: [Nantennas, Npol, 3] array, antenna positions in XYZ; only ants[:, 0, :] is used (pols assumed co-located)
    obs: ephem.Observer() of station (only obs.lat is read here, for the diagnostic print)
    sbs: [Nsubbands] array, subband IDs
    ts: datetime 2D array [Nsubbands, Nints], timestamp for each correlation matrix

    returns:
        vis: visibilities [4, Nints*Ncorrs, Nsubbands], polarization order xx, xy, yx, yy
        uvw: UVW coordinates [Nints*Ncorrs, 3, Nsubbands]
        LSTangle: sidereal time of the reference observer for the last
            (subband, integration) processed, or None if none was processed
    with Ncorrs = Nantennas*(Nantennas+1)/2
    """
    nants = ants.shape[0]
    # Floor division keeps ncorrs an integer on Python 3 as well; a float
    # here would be rejected as an array dimension by np.zeros.
    ncorrs = nants * (nants + 1) // 2
    nints = ts.shape[1]
    nsbs = len(sbs)
    uvw = np.zeros((nints, ncorrs, 3, nsbs), dtype=float)
    vis = np.zeros((4, nints, ncorrs, nsbs), dtype=complex) # 4 polarizations: xx, xy, yx, yy

    # Compute baselines in XYZ. They depend only on the antenna layout, so
    # this is done once instead of once per (subband, integration).
    antPosRep = np.repeat(ants[:,0,:], nants, axis=0).reshape((nants, nants, 3)) # ants is of the form [nants, npol, 3], assume pols are at the same position
    xyz = util.vectorize(antPosRep - np.transpose(antPosRep, (1, 0, 2)))

    # Declination rotation for the XYZ -> UVW transform; also loop-invariant.
    dec = float(np.pi/2.) # set the north pole to be dec 90, thus the dec rotation matrix below is not really needed
    decRotMat = np.array([  [1.,              0.,          0.],
                            [0.,     np.sin(dec), np.cos(dec)],
                            [0., -1.*np.cos(dec), np.sin(dec)]]) #rotate about x-axis

    # Defined up front so the return below cannot hit an unbound name when
    # there are no subbands or no integrations.
    LSTangle = None

    for sbIdx in range(nsbs):
        for tIdx in range(nints):
            #TODO: using a reference Obs empirically works, but I can't quite justify it yet
            #TODO: using a reference Obs probably breaks the FT script, check if there is another rotation needed
            refObs = lofarObserver(0., -90., 0., ts[sbIdx, tIdx]) # create an observatory at (lat,long)=(0,-90) to get the sidereal time at the reference position, this is along the Y axis I believe
            LSTangle = refObs.sidereal_time() # sidereal time at reference location, radians

            #obs.epoch = ts[sbIdx, tIdx]
            #obs.date = ts[sbIdx, tIdx]

            #LSTangle = obs.sidereal_time() # radians
            # Single pre-formatted string: prints the same text under both
            # the Python 2 print statement and the Python 3 print function.
            print('LST: %s Dec: %s' % (LSTangle, obs.lat))

            # Hour-angle rotation matrix for the XYZ -> UVW transform
            ha = float(LSTangle) - 0. # Hour Angle in reference to longitude/RA=0
            #ha = float(LSTangle) - 0. - (np.pi/2.) # Hour Angle in reference to longitude/RA=0, use if refObs at (0,0)
            haRotMat = np.array([   [    np.sin(ha), np.cos(ha), 0.],
                                    [-1.*np.cos(ha), np.sin(ha), 0.],
                                    [0.,             0.,         1.]]) #rotate about z-axis
            rotMatrix = np.dot(decRotMat, haRotMat)

            uvw[tIdx, :, :, sbIdx] = np.dot(rotMatrix, xyz.T).T

            # split up polarizations, vectorize the correlation matrix, and drop the lower triangle
            vis[0, tIdx, :, sbIdx] = util.vectorize(corrMatrix[sbIdx, tIdx, 0::2, 0::2])
            vis[1, tIdx, :, sbIdx] = util.vectorize(corrMatrix[sbIdx, tIdx, 1::2, 0::2])
            vis[2, tIdx, :, sbIdx] = util.vectorize(corrMatrix[sbIdx, tIdx, 0::2, 1::2])
            vis[3, tIdx, :, sbIdx] = util.vectorize(corrMatrix[sbIdx, tIdx, 1::2, 1::2])

    # Merge the integration and correlation axes into one sample axis.
    vis = np.reshape(vis, (vis.shape[0], vis.shape[1]*vis.shape[2], vis.shape[3]))
    uvw = np.reshape(uvw, (uvw.shape[0]*uvw.shape[1], uvw.shape[2], uvw.shape[3]))

    #TODO: i don't think we need to return the LST angle
    return vis, uvw, LSTangle
Esempio n. 4
0
 def shape_map_all(self, pred):
     '''Build every shape variant for a prediction.

     Returns a list ordered as [max, max_pca, mean, mean_pca]: the
     "max" shape comes from get_index(pred) and the "mean" shape from
     get_index_mean(pred), each vectorized and multiplied by 4; the
     "_pca" entries are those shapes passed through
     self.pca_noise_reduction. When p.debug is set, each variant is
     displayed over self.img in the same order.
     '''
     max_shape = vectorize(get_index(pred)) * 4
     max_shape_pca = self.pca_noise_reduction(max_shape)

     mean_shape = vectorize(get_index_mean(pred)) * 4
     mean_shape_pca = self.pca_noise_reduction(mean_shape)

     shapes = [max_shape, max_shape_pca, mean_shape, mean_shape_pca]

     if p.debug:
         for candidate in shapes:
             show_predict(self.img, candidate)
     return shapes
Esempio n. 5
0
 def shape_map_all(self, pred):
     '''Build every shape variant for a prediction.

     Returns a list ordered as [max, max_pca, mean, mean_pca]: the
     "max" shape comes from get_index(pred) and the "mean" shape from
     get_index_mean(pred), each vectorized and multiplied by 4; the
     "_pca" entries are those shapes passed through
     self.pca_noise_reduction. When p.debug is set, each variant is
     displayed over self.img in the same order.
     '''
     max_shape = vectorize(get_index(pred)) * 4
     max_shape_pca = self.pca_noise_reduction(max_shape)

     mean_shape = vectorize(get_index_mean(pred)) * 4
     mean_shape_pca = self.pca_noise_reduction(mean_shape)

     shapes = [max_shape, max_shape_pca, mean_shape, mean_shape_pca]

     if p.debug:
         for candidate in shapes:
             show_predict(self.img, candidate)
     return shapes
Esempio n. 6
0
 def shape_map(self, pred, method):
     '''Map a prediction to a shape using the requested method.

     method can be one of ['max', 'mean', 'max_pca', 'mean_pca']:
       'max'      - vectorize(get_index(pred)) multiplied by 4
       'mean'     - vectorize(get_index_mean(pred)) multiplied by 4
       'max_pca'  - the 'max' shape passed through self.pca_noise_reduction
       'mean_pca' - the 'mean' shape passed through self.pca_noise_reduction

     Any other method prints a message and exits the process.
     '''
     if method == 'max':
         shape = vectorize(get_index(pred)) * 4
     elif method == 'mean':
         shape = vectorize(get_index_mean(pred)) * 4
     elif method == 'max_pca':
         shape = self.pca_noise_reduction(vectorize(get_index(pred)) * 4)
     elif method == 'mean_pca':
         raw_shape = vectorize(get_index_mean(pred)) * 4
         # Return the PCA-reduced shape. Previously the reduced result was
         # computed but discarded, so 'mean_pca' returned the same value
         # as 'mean'.
         shape = self.pca_noise_reduction(raw_shape)
         if p.debug:
             # Show the shape before and after noise reduction.
             show_predict(self.img, raw_shape)
             show_predict(self.img, shape)
     else:
         # Pre-formatted so the text is identical under the Python 2 print
         # statement and the Python 3 print function.
         print("Unknown method:  %s" % (method,))
         # NOTE(review): exits with status 0 on an error path; kept as-is
         # because callers may depend on it - consider raising ValueError.
         exit(0)
     return shape
Esempio n. 7
0
 def shape_map(self, pred, method):
     '''Map a prediction to a shape using the requested method.

     method can be one of ['max', 'mean', 'max_pca', 'mean_pca']:
       'max'      - vectorize(get_index(pred)) multiplied by 4
       'mean'     - vectorize(get_index_mean(pred)) multiplied by 4
       'max_pca'  - the 'max' shape passed through self.pca_noise_reduction
       'mean_pca' - the 'mean' shape passed through self.pca_noise_reduction

     Any other method prints a message and exits the process.
     '''
     if method == 'max':
         shape = vectorize(get_index(pred)) * 4
     elif method == 'mean':
         shape = vectorize(get_index_mean(pred)) * 4
     elif method == 'max_pca':
         shape = self.pca_noise_reduction(vectorize(get_index(pred)) * 4)
     elif method == 'mean_pca':
         raw_shape = vectorize(get_index_mean(pred)) * 4
         # Return the PCA-reduced shape. Previously the reduced result was
         # computed but discarded, so 'mean_pca' returned the same value
         # as 'mean'.
         shape = self.pca_noise_reduction(raw_shape)
         if p.debug:
             # Show the shape before and after noise reduction.
             show_predict(self.img, raw_shape)
             show_predict(self.img, shape)
     else:
         # Pre-formatted so the text is identical under the Python 2 print
         # statement and the Python 3 print function.
         print("Unknown method:  %s" % (method,))
         # NOTE(review): exits with status 0 on an error path; kept as-is
         # because callers may depend on it - consider raising ValueError.
         exit(0)
     return shape
def generate_answers(sess, model, dataset, rev_vocab):
    """
    Loop over the dev or test dataset and generate answer.

    Note: output format must be answers[uuid] = "real answer"
    You must provide a string of words instead of just a list, or start and end index

    In main() function we are dumping onto a JSON file

    evaluate.py will take the output JSON along with the original JSON file
    and output a F1 and EM

    You must implement this function in order to submit to Leaderboard.

    :param sess: active TF session
    :param model: a built QASystem model
    :param dataset: tuple ``(context, question, question_uuid_data)``
    :param rev_vocab: this is a list of vocabulary that maps index to actual words
    :return: dict mapping each question uuid to the answer produced by
        ``model.formulate_answer``
    """
    answers = {}
    (context, question, question_uuid_data) = dataset
    context_data = convert_data_to_list(context)
    question_data = convert_data_to_list(question)
    context_padded, context_mask = pad_sequence(context_data,
                                                FLAGS.max_context_len)
    question_padded, question_mask = pad_sequence(question_data,
                                                  FLAGS.max_question_len)
    input_data = vectorize(context_padded, context_mask, question_padded,
                           question_mask, question_uuid_data)

    batch_size = 32
    # Ceiling division: the previous ``int(len / batch_size) + 1`` counted one
    # batch too many whenever the data size was an exact multiple of
    # batch_size, so the progress bar never reached its target. The floor of
    # one preserves the old target for empty input.
    # (assumes minibatches yields a final partial batch - TODO confirm)
    num_batches = max(1, (len(input_data) + batch_size - 1) // batch_size)
    prog = Progbar(target=num_batches)
    for i, batch in enumerate(minibatches(input_data, batch_size)):
        a_s_vec, a_e_vec = model.answer(sess, batch)
        prog.update(i + 1)
        # batch[0] and batch[4] are presumably the padded contexts and the
        # question uuids, following the argument order given to vectorize.
        for a_s, a_e, batch_context, question_uuid in zip(
                a_s_vec, a_e_vec, batch[0], batch[4]):
            # Make sure the span start never comes after the span end.
            if a_s > a_e:
                a_s, a_e = a_e, a_s
            answers[question_uuid] = model.formulate_answer(
                batch_context, rev_vocab, a_s, a_e)

    return answers
Esempio n. 9
0
def do_train(train_bodies, train_stances, dimension, embedding_path, config, 
    max_headline_len=None, max_body_len=None, verbose=False, 
    include_stopwords=True, similarity_metric_feature=None, 
    weight_embeddings=False, idf=False):
    """Train an FNCModel and write its dev-set predictions to disk.

    Loads and preprocesses the data, builds the word index and embedding
    matrix, vectorizes the train and dev splits, fits the model, and writes
    every dev example to ``model.config.eval_output`` and every
    misclassified one to ``model.config.error_output``.

    Args:
        train_bodies, train_stances: forwarded to
            ``util.load_and_preprocess_fnc_data``.
        dimension: embedding dimension, forwarded to ``util.load_embeddings``
            and ``util.sentence_embeddings``.
        embedding_path: forwarded to ``util.load_embeddings``.
        config: model configuration; ``config.method`` selects the
            "arora" and "vanilla_bag_of_words" code paths.
        max_headline_len, max_body_len: vectorization lengths; default to
            the maxima reported by the training split.
        verbose: forwarded to ``FNCModel``.
        include_stopwords: forwarded to the data loader.
        similarity_metric_feature: forwarded to the data loader; when
            truthy, the splits' ``sim_scores`` are appended to each example.
        weight_embeddings: forwarded to ``util.load_embeddings``.
        idf: when true and the method is "vanilla_bag_of_words", sentence
            embeddings are built from ``util.idf_embeddings``.
    """
    logging.info("Loading training and dev data ...")
    fnc_data, fnc_data_train, fnc_data_dev = util.load_and_preprocess_fnc_data(
        train_bodies, train_stances, include_stopwords, 
        similarity_metric_feature)
    logging.info("%d training examples", len(fnc_data_train.headlines))
    logging.info("%d dev examples", len(fnc_data_dev.headlines))
    if max_headline_len is None:
        max_headline_len = fnc_data_train.max_headline_len
    if max_body_len is None:
        max_body_len = fnc_data_train.max_body_len
    logging.info("Max headline length: %d", max_headline_len)
    logging.info("Max body length: %d", max_body_len)

    # For convenience, create the word indices map over the entire dataset
    logging.info("Building word-to-index map ...")
    corpus = ([w for bod in fnc_data.bodies for w in bod] +
        [w for headline in fnc_data.headlines for w in headline])
    word_indices = util.process_corpus(corpus)
    logging.info("Building embedding matrix ...")
    embeddings, known_words = util.load_embeddings(word_indices=word_indices,
        dimension=dimension, embedding_path=embedding_path,
        weight_embeddings=weight_embeddings)

    logging.info("Vectorizing data ...")
    # Vectorize and assemble the training data
    headline_vectors = util.vectorize(fnc_data_train.headlines, word_indices,
        known_words, max_headline_len)
    body_vectors = util.vectorize(fnc_data_train.bodies, word_indices,
        known_words, max_body_len)

    # Principal components are only needed by the "arora" method.
    headlines_pc = bodies_pc = None
    if config.method == "arora":
        headlines_pc = util.arora_embeddings_pc(headline_vectors,
            embeddings)
        bodies_pc = util.arora_embeddings_pc(body_vectors,
            embeddings)

    if config.method == "vanilla_bag_of_words":
        logging.info("Precomputing training sentence embeddings ...")
        train_emb = embeddings
        if idf:
            train_emb = util.idf_embeddings(word_indices,
                headline_vectors + body_vectors, train_emb)
        headlines_emb = util.sentence_embeddings(headline_vectors, dimension,
            max_headline_len, train_emb)
        bodies_emb = util.sentence_embeddings(body_vectors, dimension,
            max_body_len, train_emb)
        training_data = [headlines_emb, bodies_emb, fnc_data_train.stances]
    else:
        training_data = [headline_vectors, body_vectors, fnc_data_train.stances]

    if similarity_metric_feature:
        training_data.append(fnc_data_train.sim_scores)
    # Materialize as a list: on Python 3 zip() is a one-shot iterator.
    training_data = list(zip(*training_data))

    # Vectorize and assemble the dev data; note that we use the training
    # maximum length
    dev_headline_vectors = util.vectorize(fnc_data_dev.headlines, word_indices,
        known_words, max_headline_len)
    dev_body_vectors = util.vectorize(fnc_data_dev.bodies, word_indices,
        known_words, max_body_len)

    if config.method == "vanilla_bag_of_words":
        logging.info("Precomputing dev sentence embeddings ...")
        test_emb = embeddings
        if idf:
            # TODO(akshayka): Experiment with using whole corpus as
            # documents vs just training vs just testing
            # (``headline_vectors`` was previously misspelled here, which
            # raised a NameError on this path.)
            test_emb = util.idf_embeddings(word_indices,
                headline_vectors + dev_headline_vectors + body_vectors +
                dev_body_vectors, test_emb)
        dev_headlines_emb = util.sentence_embeddings(dev_headline_vectors,
            dimension, max_headline_len, test_emb)
        dev_bodies_emb = util.sentence_embeddings(dev_body_vectors,
            dimension, max_body_len, test_emb)
        dev_data = [dev_headlines_emb, dev_bodies_emb, fnc_data_dev.stances]
    else:
        dev_data = [dev_headline_vectors, dev_body_vectors,
            fnc_data_dev.stances]

    if similarity_metric_feature:
        dev_data.append(fnc_data_dev.sim_scores)
    # dev_data is consumed twice below (fit and output), so it must be a
    # list rather than a one-shot iterator.
    dev_data = list(zip(*dev_data))

    with tf.Graph().as_default():
        logger.info("Building model...",)
        start = time.time()
        model = FNCModel(config, max_headline_len, max_body_len, embeddings,
            headlines_pc=headlines_pc, bodies_pc=bodies_pc, verbose=verbose)
        logger.info("took %.2f seconds", time.time() - start)

        init = tf.global_variables_initializer()
        saver = tf.train.Saver()
        with tf.Session() as session:
            session.run(init)
            logging.info('Fitting ...')
            model.fit(session, saver, training_data, dev_data)
            logging.info('Outputting ...')
            output = model.output(session, dev_data)

    indices_to_words = {word_indices[w] : w for w in word_indices}
    # TODO(akshayka): Please code-review this. In particular,
    # please validate whether dev_headline_vectors is an equivalent 
    # representation of output[0][0], and dev_body_vectors for output[0][1]
    headlines = [' '.join(
        util.word_indices_to_words(h, indices_to_words))
        for h in dev_headline_vectors]
    bodies = [' '.join(
        util.word_indices_to_words(b, indices_to_words))
        for b in dev_body_vectors]
    output = zip(headlines, bodies, output[1], output[2])

    with open(model.config.eval_output, 'w') as f, open(
        model.config.error_output, "w") as g:
        for headline, body, label, prediction in output:
            f.write("%s\t%s\tgold:%d\tpred:%d\n\n" % (
                headline, body, label, prediction))
            if label != prediction:
                g.write("%s\t%s\tgold:%d\tpred:%d\n\n" % (
                    headline, body, label, prediction))