Example no. 1
def sparql_search_endpoint():
    sparql_query = request.args.get('query')

    response = jsonify(search.search(sparql_query, utils.get_uri2rank(), utils.get_clusters()))

    print('Successfully searched')
    return response
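A minimal sketch of how this view might be served, assuming a standard Flask setup; the '/search' URL path is an assumption (it is not shown in the excerpt), and `search` and `utils` are the project-local modules the view calls.

# Hedged sketch: wiring the view above into a Flask app.
# The route path '/search' is an assumption, not taken from the source.
from flask import Flask, request, jsonify
import search, utils  # project-local modules used by the view

app = Flask(__name__)
app.add_url_rule('/search', view_func=sparql_search_endpoint)

if __name__ == '__main__':
    app.run()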
Example no. 2
# Python 2 code (note dict.iteritems); a method of a chunk/world class.
def get_mob_spawner_clusters(self):
    # Positions of all MobSpawner block entities.
    points = [p for p, b in self.block_entities.iteritems() if
              b._type == 'MobSpawner']
    results = []
    # Group spawner positions that fall within a distance threshold of 16.
    for cluster in utils.get_clusters(points, 16):
        cluster.mob_types = [
            self.block_entities.get(p, {}).get('EntityId') for
            p in cluster.points]
        results.append(cluster)
    return results
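The `utils.get_clusters(points, 16)` call above suggests proximity-based grouping under a distance threshold; the real implementation is not shown, so the following is only a minimal sketch matching that call signature.

# Hedged sketch (assumption): group points whose pairwise Euclidean distance
# is within `radius`, using a simple union-find. The project's actual
# get_clusters may differ; only the signature is taken from the snippet above.
import math

class Cluster(object):
    """Bag of points; callers may attach extra attributes (e.g. mob_types)."""
    def __init__(self, points):
        self.points = points

def get_clusters(points, radius):
    parent = list(range(len(points)))

    def find(i):
        # Find the set representative, with path halving.
        while parent[i] != i:
            parent[i] = parent[parent[i]]
            i = parent[i]
        return i

    # Union every pair of points closer than `radius`.
    for i in range(len(points)):
        for j in range(i + 1, len(points)):
            if math.dist(points[i], points[j]) <= radius:
                parent[find(i)] = find(j)

    # Collect points by their set representative.
    groups = {}
    for i in range(len(points)):
        groups.setdefault(find(i), []).append(points[i])
    return [Cluster(pts) for pts in groups.values()]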
Example no. 3
# Relies on: os, cv2, matplotlib.pyplot as plt, plus the project-local
# GraphEmbedding, get_clusters, FordFulkerson and compute_mask.
def main(args):
    graph = GraphEmbedding(
        # The original excerpt passed a bare `sigma`; presumably args.sigma.
        path_img=args.ImgPath, sigma=args.sigma, resize=args.resize_factor
    )
    cluster_centers = get_clusters(graph)
    graph.compute_graph(cluster_centers)
    embeddings = graph.embeddings_matrix
    # The last two rows presumably hold the appended source and sink nodes.
    source = len(embeddings) - 2
    sink = len(embeddings) - 1

    cut_edges = FordFulkerson(embeddings, source, sink)
    mask = compute_mask(cut_edges, graph.height, graph.width, source, sink)
    # cv2.resize expects dsize as (width, height), hence the reversed shape.
    mask_reshape = cv2.resize(mask, graph.original_size[::-1])
    path, file_ = os.path.split(args.ImgPath)
    filename, file_extension = os.path.splitext(file_)
    save_dir = os.path.join(path, filename + "-mask" + file_extension)

    plt.imsave(save_dir, mask_reshape, cmap="gray")
    print("Saved image in ", save_dir)
Example no. 4
def full_train(n_epochs=1, batch_size=200, save_prefix=None):
    """
    Runs the complete training process.
    """

    # Load initial data
    print("Loading data...")
    data = load_data()

    # Estimate the GPS clusters
    print("Estimating clusters...")
    clusters = get_clusters(data.train_labels)

    # Set up callbacks
    callbacks = []
    if save_prefix is not None:
        # Save the model's intermediate weights to disk after each epoch.
        # Note the mix of %-formatting (save_prefix) and Keras placeholders
        # ({epoch}, {val_loss}), which ModelCheckpoint fills in at save time.
        file_path = "cache/%s-{epoch:03d}-{val_loss:.4f}.hdf5" % save_prefix
        callbacks.append(ModelCheckpoint(file_path, monitor='val_loss', mode='min',
                                         save_weights_only=True, verbose=1))

    # Create model
    print("Creating model...")
    start_new_session()
    model = create_model(data.metadata, clusters)

    # Run the training
    print("Start training...")
    history = model.fit(
        process_features(data.train), data.train_labels,
        nb_epoch=n_epochs, batch_size=batch_size,  # Keras 1 API (later renamed to epochs=)
        validation_data=(process_features(data.validation), data.validation_labels),
        callbacks=callbacks)

    if save_prefix is not None:
        # Save the training history to disk
        file_path = 'cache/%s-history.pickle' % save_prefix
        with open(file_path, 'wb') as handle:
            pickle.dump(history.history, handle, protocol=pickle.HIGHEST_PROTOCOL)

    return history
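A hedged usage example: the 'mymodel' prefix matches the checkpoint file name loaded in the next excerpt, while the epoch count here is an assumption.

# Hypothetical invocation: writes cache/mymodel-{epoch}-{val_loss}.hdf5 after
# each epoch and cache/mymodel-history.pickle at the end.
history = full_train(n_epochs=100, batch_size=200, save_prefix='mymodel')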
Example no. 5
# (These plotting lines sat inside a triple-quoted block in the original
# script, i.e. they were commented out.)
plt.xticks(fontsize=9)
plt.axes().xaxis.set_major_locator(MultipleLocator(10))
plt.legend(['train', 'validation', 'smoothed validation'], loc='upper right')
plt.show()

###########################################################################################################
## I selected the weights learned at epoch 70 for my final model, which can now be loaded again with Keras:

start_new_session()

# load data and generate clusters
np.random.seed(42)
os.chdir('C:/ENEA_CAS_WORK/Taxi_destination_predictions')
data = load_data()
clusters = get_clusters(data.train_labels)

# load the model of the run #1
os.chdir('C:\\ENEA_CAS_WORK\\Taxi_destination_predictions\\cache')
model = create_model(data.metadata, clusters)
model.load_weights('mymodel-001-2.2026.hdf5')

WWW = model.weights
print(WWW[1].shape)
# Out[139]: TensorShape([7, 10])  # 7 features for each of the 10 (lat, lon) values (first and last coordinates)

processed = process_features(data.validation)
print(len(processed))
print(processed[6].shape)
# Out[155]: (16444, 20)  # lat, lon
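With the weights loaded, predictions are a plain Keras call; a hedged sketch, where the output shape is an assumption (one predicted (lat, lon) per trip).

# Hedged sketch: destination predictions for the processed validation set.
preds = model.predict(processed)
print(preds.shape)  # expected (16444, 2) -- assumption: one (lat, lon) per trip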
Example no. 6
    def evaluate(self, test_docs):
        # doc_name: <cluster assignments> pairs for all test documents
        logging.info("Evaluating...")
        all_test_preds = {}

        # [MUC score]
        # The MUC score counts the minimum number of links between mentions
        # to be inserted or deleted when mapping a system response to a gold standard key set
        # [B3 score]
        # B3 computes precision and recall for all mentions in the document,
        # which are then combined to produce the final precision and recall numbers for the entire output
        # [CEAF score]
        # CEAF applies a similarity metric (either mention-based or entity-based) to each pair of entities
        # (i.e. sets of mentions) to measure the goodness of each possible alignment.
        # The best mapping is used for calculating CEAF precision, recall and F-measure.
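        # Worked example (added illustration, not from the original code): with
        # gold clusters {1: {'a', 'b', 'c'}} and predictions {1: {'a', 'b'},
        # 2: {'c'}}, the size-3 gold cluster is split into 2 parts, so MUC
        # recall = (3 - 2) / (3 - 1) = 1/2, while the single predicted link
        # 'a'-'b' is correct, so MUC precision = 1/1.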
        muc_score = metrics.Score()
        b3_score = metrics.Score()
        ceaf_score = metrics.Score()

        for curr_doc in tqdm(test_docs):

            test_preds, _ = self._train_doc(curr_doc, eval_mode=True)
            test_clusters = get_clusters(test_preds)

            # Save predicted clusters for this document id
            all_test_preds[curr_doc.doc_id] = test_clusters

            # input into metric functions should be formatted as dictionary of {int -> set(str)},
            # where keys (ints) are clusters and values (string sets) are mentions in a cluster. Example:
            # {
            #  1: {'rc_1', 'rc_2', ...}
            #  2: {'rc_5', 'rc_8', ...}
            #  3: ...
            # }

            # gt = ground truth, pr = predicted by model
            gt_clusters = {k: set(v) for k, v in enumerate(curr_doc.clusters)}
            pr_clusters = {}
            for (pr_ment, pr_clst) in test_clusters.items():
                if pr_clst not in pr_clusters:
                    pr_clusters[pr_clst] = set()
                pr_clusters[pr_clst].add(pr_ment)

            muc_score.add(metrics.muc(gt_clusters, pr_clusters))
            b3_score.add(metrics.b_cubed(gt_clusters, pr_clusters))
            ceaf_score.add(metrics.ceaf_e(gt_clusters, pr_clusters))

        avg_score = metrics.conll_12(muc_score, b3_score, ceaf_score)
        logging.info(f"----------------------------------------------")
        logging.info(f"**Test scores**")
        logging.info(f"**MUC:      {muc_score}**")
        logging.info(f"**BCubed:   {b3_score}**")
        logging.info(f"**CEAFe:    {ceaf_score}**")
        logging.info(f"**CoNLL-12: {avg_score}**")
        logging.info(f"----------------------------------------------")

        # Save test predictions and scores to file for further debugging
        with open(self.path_pred_scores, "w", encoding="utf-8") as f:
            f.writelines([
                f"Database: {self.dataset_name}\n\n",
                f"Test scores:\n",
                f"MUC:      {muc_score}\n",
                f"BCubed:   {b3_score}\n",
                f"CEAFe:    {ceaf_score}\n",
                f"CoNLL-12: {metrics.conll_12(muc_score, b3_score, ceaf_score)}\n",
            ])
        with open(self.path_pred_clusters, "w", encoding="utf-8") as f:
            f.writelines(["Predictions:\n"])
            for doc_id, clusters in all_test_preds.items():
                f.writelines([f"Document '{doc_id}':\n", str(clusters), "\n"])

        return {
            "muc": muc_score,
            "b3": b3_score,
            "ceafe": ceaf_score,
            "avg": avg_score
        }
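A short usage sketch; `coref_model` and `test_docs` are placeholders for however the project constructs the scorer and the held-out documents.

# Hedged usage sketch (names are placeholders, not from the source):
results = coref_model.evaluate(test_docs)
print(results["avg"])  # CoNLL-12 average of MUC, B^3 and CEAF-e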
Example no. 7
# (Excerpt: the imports and sys.argv[1..5] are handled in earlier lines of
# the script that are not shown here.)
use_elms_file = sys.argv[6]
suffix = sys.argv[7]

use_elms = {}
with open(use_elms_file) as f:
    for line in f:
        (elm, stuff) = line.strip().split('\t')
        use_elms[elm] = True

do_clustering = True
if distance_file == 'NA':
    do_clustering = False

if do_clustering:
    dis_file = os.path.join(results_dir, distance_file)
    mapping = utils.get_clusters(dis_file, dis_cutoff_init,
                                 dis_cutoff_meta)
else:
    mapping = {}
    
counts = utils.count_host_elmSeqs(global_settings.TEST_GENOMES,
                                  do_clustering, mapping,
                                  results_dir, use_elms, suffix)

ls = []
for host in counts:
    ls.append(counts[host])
all_elmSeqs = {}
#all_elmSeqs = utils_graph.intersectLists(ls)
for host in counts:
    for elmSeq in counts[host]:
        all_elmSeqs[elmSeq] = True
Example no. 8
import sys, os, utils, global_settings, utils_graph
from collections import defaultdict

cluster_distance_file = sys.argv[1] # NA for skip
elm_count_dir = sys.argv[2] # results/roundup_all/

do_clustering = True
if cluster_distance_file == 'NA':
    do_clustering = False
# this comes from my scratch experiments
#human_distance_file = '../../scratch/human_flu_distances'
#chicken_distance_file = '../../scratch/chicken_flu_distances'
#both_distance_file = 'working/runs/Jun24/closest_dis'
if do_clustering:
    f = os.path.join(elm_count_dir, cluster_distance_file)
    mapping = utils.get_clusters(f, 2.5, 2.5)  # initial and meta distance cutoffs
else:
    mapping = {}
hosts = global_settings.TEST_GENOMES
#all_elmSeqs = {}
flus = ('human',)
flu_counts = {}
seen_seqs = {}
seen_seqs_ls = []

for flu in flus:
    flu_elm_file = os.path.join('results/',
                                flu + '.H5N1.elms')
    utils.count_flu_sampled(flu, flu_elm_file, flu_counts,
                            seen_seqs, mapping, do_clustering)
    seen_seqs_ls.append(seen_seqs[flu])