Example #1
def rate_solution(solution, problem):
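    # Score a QAP-style assignment: for every pair of locations, multiply the
    # distance between the locations by the flow between the facilities the
    # solution places there, and sum the products.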
    distances = problem[1]
    flows = problem[3]
    result = 0
    for pair in utils.generate_pairs(len(solution)):
        distance = distances[pair]
        flow = flows[utils.correct_pair((solution[pair[0]], solution[pair[1]]))]
        result += distance * flow
    return result
Example #2
def validate_problem(problem):
    locations = problem[0]
    distances = problem[1]
    facilities = problem[2]
    flows = problem[3]
    size = len(locations)
    if len(facilities) != size:
        return False
    pairs = utils.generate_pairs(size)
    if len(pairs) != len(distances):
        return False
    if len(distances) != len(flows):
        return False
    distances_keys = list(distances.keys())
    flows_keys = list(flows.keys())
    for i, pair in enumerate(utils.generate_pairs(size)):
        if pair != distances_keys[i] or pair != flows_keys[i]:
            return False
    return True
Example #3
def generate_random_problem(size):
    locations = []
    distances = {}
    facilities = []
    flows = {}
    location_names = ["Warsaw","Cracow","Lodz","Wroclaw","Poznan","Gdansk","Szczecin","Bydgoszcz","Lublin","Bialystok","Katowice","Gdynia"]
    facility_names = ["Phone factory","Refinery","Coal mine","Hospital","Car factory","Missile silo","Centrifuge","Shipyard","Port","Power plant","5G tower","Airport"]
    rand.shuffle(location_names)
    rand.shuffle(facility_names)
    #size = rand.randint(3, 10)
    for i in range(size):
        locations.append(location_names[i])
        facilities.append(facility_names[i])
    for pair in utils.generate_pairs(size):
        distances[pair] = rand.randint(50,1000)
        flows[pair] = rand.randint(0,100)
    return [locations, distances, facilities, flows]
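Examples #1-#3 appear to come from the same quadratic-assignment-problem project and lean on a shared `utils` module plus a `rand` alias for `random`. The sketch below shows one way they could compose into a tiny end-to-end run, assuming the three functions above are pasted into the same script; the `utils` stand-in (index pairs with i < j, and a pair normalizer that sorts its argument) is an assumption inferred from how the dictionaries are keyed, not the project's real helper.

import itertools
import random as rand

class utils:
    """Assumed stand-ins for the project's helpers, not the real module."""

    @staticmethod
    def generate_pairs(size):
        # all index pairs (i, j) with i < j, in a stable order
        return list(itertools.combinations(range(size), 2))

    @staticmethod
    def correct_pair(pair):
        # normalize a pair so it matches the (i, j), i < j, dictionary keys
        return tuple(sorted(pair))

problem = generate_random_problem(5)
assert validate_problem(problem)

solution = list(range(5))   # solution[location_index] = facility_index
rand.shuffle(solution)
print(rate_solution(solution, problem))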
Example #4
def get_output_for_writing_problem_to_file(problem):
    locations = problem[0]
    distances = problem[1]
    facilities = problem[2]
    flows = problem[3]
    size = len(problem[0])
    output = []
    output.append(str(size)+'\n')
    pairs = utils.generate_pairs(size)
    for i in range(size):
        output.append(locations[i]+'\n')
    for pair in pairs:
        output.append(str(distances[pair])+'\n')
    for i in range(size):
        output.append(facilities[i]+'\n')
    for pair in pairs:
        output.append(str(flows[pair])+'\n')
    return output
Example #5
def read_problem_from_file(filename):
    locations = []
    distances = {}
    facilities = []
    flows = {}
    with open(filename, "r") as file:
        lines = file.read().splitlines()
    size = int(lines[0])
    cnt = 1
    pairs = utils.generate_pairs(size)
    for i in range(size):
        locations.append(lines[cnt])
        cnt += 1
    for pair in pairs:
        distances[pair] = int(lines[cnt])
        cnt += 1
    for i in range(size):
        facilities.append(lines[cnt])
        cnt += 1
    for pair in pairs:
        flows[pair] = int(lines[cnt])
        cnt += 1
    return [locations, distances, facilities, flows]
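Example #4 and Example #5 are the writer and reader for the same line-oriented file format: the size, then the location names, the pairwise distances, the facility names, and the pairwise flows, one value per line. Assuming the two functions above (and the same `utils.generate_pairs` helper) are in scope, a quick round-trip check might look like this; the file name is purely illustrative.

problem = generate_random_problem(4)

with open("problem.txt", "w") as f:
    f.writelines(get_output_for_writing_problem_to_file(problem))

restored = read_problem_from_file("problem.txt")
assert restored[0] == problem[0]   # locations
assert restored[1] == problem[1]   # distances
assert restored[2] == problem[2]   # facilities
assert restored[3] == problem[3]   # flows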
Example #6
################ Get FACE projectors ################
svd_dims = [3, 10, 50]
proj_face = []
for subspace_d in svd_dims:
    proj = compl_svd_projector(all_names_embed, svd=subspace_d)
    proj_face.append(proj)

################ Get EXPLORE metric ################
np.random.seed(1)

# Comparable
n_pairs_comp = 50000
unique_names_idx = np.unique(names_from_df, return_index=True)[1]
pairs_idx = generate_pairs(len(unique_names_idx),
                           len(unique_names_idx),
                           n_pairs=n_pairs_comp)
comparable_pairs = (all_names_embed[unique_names_idx[pairs_idx[0]]]
                    - all_names_embed[unique_names_idx[pairs_idx[1]]])

# Incomparable
n_pairs_incomp = 50000
pos_idx = np.where(y == 1)[0]
neg_idx = np.where(y == -1)[0]
pairs_idx = generate_pairs(len(pos_idx), len(neg_idx), n_pairs=n_pairs_incomp)
incomp_pairs = X[pos_idx[pairs_idx[0]]] - X[neg_idx[pairs_idx[1]]]

# Pairs data
X_pairs = np.vstack((comparable_pairs, incomp_pairs))
Y_pairs = np.zeros(n_pairs_comp + n_pairs_incomp)
Y_pairs[:n_pairs_comp] = 1
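In this snippet `generate_pairs(n1, n2, n_pairs=...)` is expected to return two index arrays that pick `n_pairs` random rows from each of two sets, so the caller can write `pairs_idx[0]` and `pairs_idx[1]`. A minimal NumPy stand-in with that contract (an assumption, not the project's actual helper) could be:

import numpy as np

def generate_pairs(n1, n2, n_pairs=1000):
    # Hypothetical stand-in: draw n_pairs random (index-into-first-set,
    # index-into-second-set) pairs, returned as two arrays so callers can
    # index pairs_idx[0] and pairs_idx[1].
    idx_a = np.random.randint(0, n1, size=n_pairs)
    idx_b = np.random.randint(0, n2, size=n_pairs)
    return idx_a, idx_b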
Example #7
"""
This is a temporary script.
"""

from sklearn.preprocessing import normalize
from utils import readfile, roc_report, generate_pairs
from metric_learn import distances_pairs
from scipy.spatial import distance
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.decomposition import PCA

X_train_facile, Y_train_facile = readfile('data_train_facile',test=False)

X_train_facile = normalize(X_train_facile)
pairs_idx, pairs_label = generate_pairs(Y_train_facile, 1000, 0.1)
#X_train_facile = normalize(X_train_facile[:10000])
#pairs_idx, pairs_label = generate_pairs(Y_train_facile[:10000], 1000, 0.1)

scores = []
possible_distances = [("Cosine", distance.cosine),
                      ("BrayCurtis", distance.braycurtis),
                      ("Euclidean", distance.euclidean),
                      ("Manhattan", distance.cityblock),
                      ("Chebyshev", distance.chebyshev),
                      ("Hamming", distance.hamming),
                      ("Correlation", distance.correlation)]

for (name, func) in possible_distances:
    print(name)
    dist = distances_pairs(X_train_facile, pairs_idx, func)
    #print(dist)
    score_facile, score_difficile = roc_report(pairs_label, dist, name)
    scores.append((name, score_facile, score_difficile))
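Here `metric_learn` looks like a project-local module (the published metric-learn package has no `distances_pairs`), and `distances_pairs(X, pairs_idx, func)` presumably applies the chosen scipy distance to every index pair. A hypothetical version of such a helper:

import numpy as np

def distances_pairs(X, pairs_idx, func):
    # Hypothetical helper: apply the distance function to each (i, j) pair of rows.
    return np.array([func(X[i], X[j]) for i, j in pairs_idx])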
Example #8
def process_doc_chains(M, doc_chains):
    #Cx, Cxy are total unigram and bigram counts over gigaword (subsection), respectively
    #DOCx, DOCxy are total document frequencies over gigaword for unigrams and bigrams, respectively
    DOCx_found = False
    DOCxy_found = False
    Cx_found = False
    Cxy_found = False

    total_key = M.total_key
    args = M.args
    Cx = M.Cx
    Cxy = M.Cxy
    DOCx = M.DOCx
    DOCxy = M.DOCxy
    Cx_baseline = M.Cx_baseline

    docset_x = set()
    docset_xy = set()

    #if only looking at nsubj and dobj dependencies (as in skipgram paper)
    if args.subjobj:
        # build a list (not a lazy map) so doc_chains can be iterated again below
        doc_chains = [
            [x for x in z if x.endswith('->nsubj') or x.endswith('->dobj')]
            for z in doc_chains
        ]

    for seq in doc_chains:
        for vdep in seq:
            Cx_baseline[vdep] += 1

    #filter for long or longest chains if option is enabled
    if args.coref == 'longest':
        #only retain the longest coref chain(s)
        doc_chains = [
            chain for chain in doc_chains
            if len(chain) == len(max(doc_chains, key=lambda x: len(x)))
        ]
    elif args.coref == 'long':
        #select all chains with five or more events
        doc_chains = [chain for chain in doc_chains if len(chain) >= 5]
    else:
        assert args.coref == 'all'
        #all coref chains are included in counting

    for seq in doc_chains:

        #update DOCx (i.e. unigram document frequencies)
        if not DOCx_found:
            for vdep in seq:
                if vdep not in docset_x:
                    DOCx[vdep] += 1
                    docset_x.add(vdep)

        #make unigram updates to Cx, DOCx separately if using naive counts
        #naive: a a b a c => Cx[a] = 3
        if args.naive_uni:
            if not Cx_found:
                for vdep in seq:
                    #increment total count for vdep
                    Cx[vdep] += 1
                    Cx[total_key] += 1

        #make bigram updates to Cxy, DOCxy
        #also make updates to Cx, DOCx if marginalizing/non-naive counts
        #non-naive: a a b a c => Cx[a] = 12 = Cxy[a,*]+Cxy[*,a]
        if (not Cxy_found) or (not DOCxy_found) or (not args.naive_uni
                                                    and not Cx_found):
            vdep_pairs = utils.generate_pairs(seq, M.args)
            for vdep_pair in vdep_pairs:
                if not args.naive_uni:
                    if not Cx_found:
                        Cx[vdep_pair[0]] += 1
                        Cx[vdep_pair[1]] += 1
                        Cx[total_key] += 2
                #increment total count for vdep_pair
                if not Cxy_found:
                    Cxy[vdep_pair] += 1
                    Cxy[total_key] += 1
                #increment doc count for vdep_pair
                if (vdep_pair not in docset_xy) and (not DOCxy_found):
                    DOCxy[vdep_pair] += 1
                    docset_xy.add(vdep_pair)
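In this example `utils.generate_pairs(seq, M.args)` turns one coreference chain of verb-dependency events into the bigrams counted in `Cxy` and `DOCxy`. A rough, hypothetical sketch of such a pair generator (the real helper presumably consults `args`, e.g. for ordering or a window size):

def generate_pairs(seq, args=None):
    # Hypothetical stand-in: yield ordered (earlier, later) event pairs from one chain.
    for i in range(len(seq)):
        for j in range(i + 1, len(seq)):
            yield (seq[i], seq[j])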
Example #10
def get_connectivity_metrics(gt_graph_file,
                             osm_diff_graph_file,
                             post_graph_file,
                             verbose=False,
                             num_pairs=1000):
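    # Measure how well the OSM-diff and post-processed graphs reproduce the
    # ground-truth graph's connectivity over randomly sampled start/end pairs.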
    s_points, e_points = generate_pairs(n=num_pairs)

    gt_graph = GraphSuper(gt_graph_file)
    post_graph = GraphSuper(post_graph_file)
    osm_diff_graph = GraphSuper(osm_diff_graph_file)

    osm_diff_metric = Metric(num_pairs)
    post_metric = Metric(num_pairs)
    gt_metric = Metric(num_pairs)

    for start_points, end_points in zip(s_points, e_points):

        gt_val = find_path(gt_graph,
                           start_points,
                           end_points,
                           gt_metric,
                           length_key='weight')
        if gt_val == -1:
            # osm_diff_metric.reduce_total_paths()
            # post_metric.reduce_total_paths()
            if verbose:
                print("couldn't find path in gt", start_points, end_points)

            # continue

        osm_val = find_path(osm_diff_graph,
                            start_points,
                            end_points,
                            osm_diff_metric,
                            length_key='weight')
        if osm_val == -1:
            if verbose:
                print("couldn't find path in osm", start_points, end_points)
            osm_diff_metric.update_fn(gt_val)
        else:
            osm_diff_metric.update_correct(gt_val, osm_val)

        post_val = find_path(post_graph,
                             start_points,
                             end_points,
                             post_metric,
                             length_key='weight')
        if post_val == -1:
            if verbose:
                print("couldn't find path in post", start_points, end_points)
            post_metric.update_fn(gt_val)
        else:
            post_metric.update_correct(gt_val, post_val)

    if verbose:
        print('\n osm diff')
        osm_diff_metric.print_all()
        print('\n post')
        post_metric.print_all()
        print('\n gt')
        gt_metric.print_all()

    return osm_diff_metric.get_all(), post_metric.get_all()