def __make_sources(self, xx=0.5, yy=0.5, rad=None, domain='rect'):
    from scipy.spatial import cKDTree as kdt
    from scipy.spatial import Delaunay as triag
    from iutils.random import darts
    from iutils.random import darts_rect

    if rad is None:
        rad = self.init_rad

    if domain == 'circ':
        sources = darts(self.init_num, xx, yy, self.init_rad, self.source_dst)
    elif domain == 'rect':
        sources = darts_rect(self.init_num, xx, yy, 2 * rad, 2 * rad, self.source_dst)
    else:
        raise ValueError('domain must be "rect" or "circ".')

    tree = kdt(sources)
    self.sources = sources
    self.tree = tree
    self.tri = triag(self.sources, incremental=False, qhull_options='QJ Qc')
    self.num_sources = len(self.sources)

    return len(sources)
def step(self, dbg=False):
    self.i += 1

    n = self.n
    stp = self.stp

    self.angle += (1 - 2 * random()) * self.angle_stp
    angle = self.angle + (1 - 2 * random(n)) * self.angle_local_stp

    xy = self.xy[:n, :]
    p = self.p[:n, :]
    tree = kdt(xy)

    new_p = xy + column_stack([cos(angle), sin(angle)]) * stp

    if len(new_p) > 0:
        ind = tree.query_ball_point(new_p, stp)
        mask = [i for i, v in enumerate(ind) if not v]

        if mask:
            new_num = len(mask)
            self.p[n:n + new_num] = reshape(mask, (-1, 1))
            self.xy[n:n + new_num, :] = new_p[mask, :]

            inside = n + (logical_and(self.xy[n:n + new_num, :] < 1.0,
                                      self.xy[n:n + new_num, :] > 0.0)
                          .sum(axis=1) == 2).nonzero()[0]
            li = len(inside)
            self.xy[n:n + li, :] = self.xy[inside, :]
            self.p[n:n + li] = self.p[inside]
            self.n = n + li

    return True
def _calc_kdtree(self, selection='interpolated'):
    if self.condition in 'controlControlCONTROL':
        return None
    data = self._select_data(selection)
    # materialise the (x, y, z) triples; cKDTree needs a sequence, not a zip iterator
    zdata = list(zip(data.x, data.y, data.z))
    return kdt(zdata)
def __init__(self, nodes, coords=None):
    # spatial.cKDTree is reported to be 200-1000 times faster;
    # however, query_ball_point is only included in very recent scipy
    # packages, not yet shipped with Ubuntu 12.10 (and a manual scipy
    # installation can be messy)
    self.lookup = list(nodes)
    if coords is None:
        self.coords = np.array([x.coord_scaled for x in nodes])
    else:
        # the ordering of coords and nodes must be the same!!!
        self.coords = coords
    self.tree = kdt(self.coords)
    return
def build_pos_index(paths):
    num = len(paths)
    xs = zeros((2 * num, 2), 'float')
    x_path = zeros(2 * num, 'int')
    for i, (start, stop) in enumerate(paths):
        xs[i, :] = start
        xs[num + i, :] = stop
        x_path[i] = i
        x_path[num + i] = i
    tree = kdt(xs)
    unsorted = set(range(2 * num))
    return tree, xs, x_path, unsorted
def matchPoints(patch1, patch2, coord1, coord2, thresh=0.4):
    """
    Match keypoints between two images based on their feature descriptors
    (image patches), keeping only matches that pass the nearest-neighbor
    ratio test with the given threshold.
    """
    tree = kdt(patch2)
    dists, idx = tree.query(patch1, k=2)
    ratios = dists[:, 0] / dists[:, 1]
    patch2Idx = idx[ratios < thresh][:, 0]
    patch1Idx = np.arange(len(patch1))[ratios < thresh]
    matched1 = coord1[patch1Idx]
    matched2 = coord2[patch2Idx]
    return matched1, matched2
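# A minimal usage sketch for matchPoints (illustrative only). It assumes the
# descriptors are stacked as rows and the coordinates are (x, y) pairs; the
# random arrays below are stand-ins, so the match lists may well be empty.
import numpy as np

desc1 = np.random.rand(100, 64)   # descriptors from image 1
desc2 = np.random.rand(120, 64)   # descriptors from image 2
kp1 = np.random.rand(100, 2)      # keypoint coordinates, image 1
kp2 = np.random.rand(120, 2)      # keypoint coordinates, image 2

m1, m2 = matchPoints(desc1, desc2, kp1, kp2, thresh=0.4)
# m1[i] and m2[i] are the coordinates of the i-th accepted correspondence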
def _append_tmp_sources(self):
    from scipy.spatial import cKDTree as kdt
    from scipy.spatial import Delaunay as triag

    sources = row_stack([self.sources] + self.tmp_sources)
    tree = kdt(sources)

    self.sources = sources
    self.tree = tree
    self.tmp_sources = []
    self.tri = triag(self.sources, incremental=False, qhull_options='QJ Qc')
    self.num_sources = len(self.sources)

    return len(sources)
def equ_dist_ts(arrival_time, eq_dist_array, data):
    """Create a time series with constant time steps.

    The nearest point of the original time series is used for the
    corresponding time of the equi-distant time series.

    @parameter: arrival_time, type = np.array
    @parameter: eq_dist_array, type = np.array
    @parameter: data, type = np.array
    """
    # indices of non-NaN samples, keeping arrival times and data aligned
    mask = ~np.isnan(data)
    valid = np.where(mask)[0]
    tt = kdt(list(zip(arrival_time[valid], np.zeros(valid.size))))
    eq_tt = list(zip(eq_dist_array, np.zeros(eq_dist_array.size)))
    eq_tt = tt.query(eq_tt)[1]
    eq_data = data[valid][eq_tt]
    return eq_data
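# Illustrative call only: resample an irregular series onto a 1-second grid
# using the nearest valid (non-NaN) sample; the toy arrays are made up.
import numpy as np

arrival_time = np.array([0.0, 0.4, 1.1, 2.7, 3.2])
data = np.array([1.0, np.nan, 2.0, 3.0, 4.0])
grid = np.arange(0.0, 4.0, 1.0)

print(equ_dist_ts(arrival_time, grid, data))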
def darts_rect(n, xx, yy, w=1, h=1, dst=0):
    ## remove new nodes that are too close to other new nodes
    visited = set()
    dartsxy = random_points_in_rectangle(n, xx, yy, w, h)
    tree = kdt(dartsxy)
    near = tree.query_ball_point(dartsxy, dst)
    jj = []
    for j, n in enumerate(near):
        if len(visited.intersection(n)) < 1:
            jj.append(j)
            visited.add(j)
    res = dartsxy[jj, :]
    return res
def spatial_sort_dots_2d(vertices, init_rad=0.01):
    from numpy import array
    from numpy import arange
    from numpy.linalg import norm
    from scipy.spatial import cKDTree as kdt

    num = len(vertices)
    res = []
    unsorted = set(arange(num).astype('int'))
    tree = kdt(vertices)

    count = 0
    pos = array([0, 0], 'float')

    while count < num:
        rad = init_rad
        while True:
            near = tree.query_ball_point(pos, rad)
            cands = list(set(near).intersection(unsorted))
            if not cands:
                rad *= 2.0
                continue
            dst = norm(pos - vertices[cands, :], axis=1)
            cp = dst.argmin()
            uns = cands[cp]
            break

        path = vertices[uns]
        res.append(path)
        pos = vertices[uns, :]
        unsorted.remove(uns)
        count += 1

    return res
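# A small illustrative call, assuming spatial_sort_dots_2d is in scope; the
# points are random stand-ins.
import numpy as np

dots = np.random.rand(50, 2)
ordered = spatial_sort_dots_2d(dots, init_rad=0.05)
# 'ordered' holds the same 50 points, visited greedily starting from (0, 0)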
def __make_sources(self, xx=0.5, yy=0.5, rad=None, domain='rect'):
    from scipy.spatial import cKDTree as kdt
    from scipy.spatial import Delaunay as triag
    from dddUtils.random import darts
    from dddUtils.random import darts_rect

    if rad is None:
        rad = self.init_rad

    if domain == 'circ':
        sources = darts(self.init_num, xx, yy, self.init_rad, self.source_dst)
    elif domain == 'rect':
        sources = darts_rect(self.init_num, xx, yy, 2 * rad, 2 * rad, self.source_dst)
    else:
        raise ValueError('domain must be "rect" or "circ".')

    tree = kdt(sources)
    self.sources = sources
    self.tree = tree
    self.tri = triag(self.sources, incremental=False, qhull_options='QJ Qc')
    self.num_sources = len(self.sources)

    return len(sources)
def darts(n, xx, yy, rr, dst):
    """
    Get at most n random, uniformly distributed points in a circle centered
    at (xx, yy) with radius rr. Points are no closer to each other than dst.
    """
    ## remove new nodes that are too close to other new nodes
    visited = set()
    dartsxy = random_points_in_circle(n, xx, yy, rr)
    tree = kdt(dartsxy)
    near = tree.query_ball_point(dartsxy, dst)
    jj = []
    for j, n in enumerate(near):
        if len(visited.intersection(n)) < 1:
            jj.append(j)
            visited.add(j)
    res = dartsxy[jj, :]
    return res
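# Sketch of how darts might be called (illustrative). random_points_in_circle
# is assumed to return an (n, 2) array of uniform points inside the circle, so
# a throwaway version is defined here purely to make the example self-contained;
# it must live in the same module as darts for the call to resolve.
import numpy as np

def random_points_in_circle(n, xx, yy, rr):
    t = np.random.uniform(0, 2 * np.pi, n)
    r = rr * np.sqrt(np.random.uniform(0, 1, n))
    return np.column_stack([xx + r * np.cos(t), yy + r * np.sin(t)])

pts = darts(1000, 0.5, 0.5, 0.4, dst=0.03)
# no two of the kept points are closer to each other than dst
print(len(pts))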
def main():
    parser = argparse.ArgumentParser(
        description="Evaluate the n matrix embedding experiment",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--dictionaries", "-d", nargs='+', type=argparse.FileType('r'),
                        default=[sys.stdin],
                        help="vocabulary dictionaries of the form word vec with a header")
    parser.add_argument("--infile", "-i", type=argparse.FileType('r'), default=sys.stdin,
                        help="evaluation instruction of the form word1 lang1 lang2 [word2]. "
                             "If word2 is absent it is only predicted, not evaluated")
    parser.add_argument("--modelfiles", "-m", nargs='+', default=[],
                        help="all models input files")
    parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('w'), default=sys.stdout,
                        help="results file of the form word1 lang1 lang2 word2 [pos wordlist], "
                             "where the first three fields are identical to eval and the last "
                             "field is the 1-best prediction. If truth is known, ordinal position "
                             "of correct answer (-1 if not found) followed by the n-best list in order")
    parser.add_argument("--nbest", "-n", type=int, default=10,
                        help="nbest neighbors generated for purposes of evaluation")
    parser.add_argument("--pickle", "-p", action='store_true', default=False,
                        help="dictionaries are pickled with pickle_vocab")
    parser.add_argument("--hidewords", action='store_true', default=False,
                        help="don't actually print nbest words")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    infile = args.infile
    dictionaries = ([pickle.load(d) for d in args.dictionaries]
                    if args.pickle else [d for d in args.dictionaries])
    dicts_by_lang = dd(list)
    langdims = dict()
    outfile = args.outfile

    for d in dictionaries:
        if args.pickle:
            lang = d['lang']
            dims = int(d['dim'])
        else:
            info = d.readline().strip().split()
            dims = int(info[1])
            lang = info[2]
        if lang in langdims:
            if dims != langdims[lang]:
                raise ValueError("Multiple dimensions seen for %s: %d and %d"
                                 % (lang, dims, langdims[lang]))
        else:
            langdims[lang] = dims
        dicts_by_lang[lang].append(d)

    inmats = {}
    outmats = {}
    vocab = dd(lambda: dict())  # for kdt lookup
    targets = dd(list)
    targetvoc = dd(list)
    models = [np.load(x) for x in args.modelfiles]

    for l in list(langdims.keys()):
        inmats[l] = [np.matrix(x['%s_in' % l]) for x in models]
        outmats[l] = [np.matrix(x['%s_out' % l]) for x in models]
        fdim = langdims[l]
        for dfile in dicts_by_lang[l]:
            if args.pickle:
                print("Unpickling for " + l)
                vocab[l].update(dfile['vocab'])
                targets[l].extend(dfile['targets'])
                targetvoc[l].extend(dfile['targetvoc'])
            else:
                print("processing " + dfile.name)
                try:
                    for ln, line in enumerate(dfile):
                        entry = line.strip().split(' ')
                        if len(entry) < fdim + 1:
                            sys.stderr.write("skipping line %d in %s because it only has %d fields; "
                                             "first field is %s\n"
                                             % (ln, dfile.name, len(entry), entry[0]))
                            continue
                        word = ' '.join(entry[:-fdim])
                        vec = np.array(entry[-fdim:]).astype(float)
                        # print "Adding "+l+" -> "+word
                        vocab[l][word] = vec
                        targets[l].append(vec)
                        targetvoc[l].append(word)
                except:
                    print(dfile.name)
                    print(line)
                    print(len(entry))
                    print(word)
                    print(ln)
                    raise
        # normalize for euclidean distance nearest neighbor => cosine with constant
        targets[l] = kdt(normalize(np.array(targets[l]), axis=1, norm='l2'))

    print("loaded vocabularies")

    for line in infile:
        inst = line.strip().split()
        inword = inst[0]
        inlang = inst[1]
        outlang = inst[2]
        outword = inst[3] if len(inst) > 3 else None
        if inword not in vocab[inlang]:
            sys.stderr.write("Warning: Couldn't find %s -> %s\n" % (inlang, inword))
            continue
        report = inst[:4]
        invec = np.matrix(vocab[inlang][inword])
        for smat, tmat in zip(inmats[inlang], outmats[outlang]):
            xform = np.asarray(invec * smat * tmat)[0]
            neighbors = []
            cosines, cands = targets[outlang].query(xform, args.nbest)
            for cos, cand in zip(cosines, cands):
                neighbors.append((cos, targetvoc[outlang][cand]))
            nb_words = [x[1] for x in neighbors]
            xbest = str(cosine(xform, vocab[outlang][nb_words[0]]))
            if outword is not None:
                truth = vocab[outlang][outword]
                xtruth = str(cosine(xform, truth))
                truthbest = str(cosine(truth, vocab[outlang][nb_words[0]]))
                rank = nb_words.index(outword) if outword in nb_words else -1
                report.append(str(rank))
                report.extend([xtruth, xbest, truthbest])
            else:
                report.append(xbest)
            if not args.hidewords:
                report.extend(nb_words)
        outfile.write('\t'.join(report) + "\n")
def run(self, npts=100, r=20, dtransform=None):
    r"""
    Performs the 2-point correlation calculation.

    This method works by selecting a set of **query** points in the void
    space then finding all neighboring points within a specified distance
    of each **query** point that lie in the void space or the solid phase.
    The fraction of points that lie in the void space as a function of
    distance from the query point is returned.

    Parameters
    ----------
    npts : int
        The number of points against which the neighboring points should be
        queried. The **query** points are randomly selected, so repeated
        calls to run will not necessarily generate identical results. If
        the results differ too much then ``npts`` should be increased.
    r : scalar or vector
        Controls the radial distance from the query points that are
        considered. If a scalar is received then a list of sizes between 1
        and ``r`` is generated with a spacing of 1 voxel, otherwise the
        given ``r`` values are used. It is useful to provide ``r`` values
        to limit the number of points and speed up the calculation.

    TODO: The methods in here could clearly benefit from proper use of
    itertools, nditer, and other numpy functions. I can't quite figure how
    to convert meshgrid to vector form.
    """
    if sp.size(r) == 1:
        rmax = r
        sizes = sp.arange(1, rmax)
    else:
        sizes = r
        rmax = r[-1]

    # Extract size metrics from input image
    [Lx, Ly, Lz] = sp.shape(self.image)
    ind = sp.where(self.image == 1)
    temp = sp.random.randint(0, sp.shape(ind)[1], npts)
    i_query = (ind[0][temp], ind[1][temp], ind[2][temp])
    i_void = sp.where(self.image == 1)
    i_solid = sp.where(self.image == 0)

    # Reduce points to only those within rmax of query points
    if dtransform is None:
        imtemp = sp.ones((Lx, Ly, Lz), dtype=bool)
        imtemp[i_query] = False
        dtransform = spim.distance_transform_edt(imtemp)
    mask = dtransform <= rmax
    i_void = sp.where((self.image * mask) == 1)
    i_solid = sp.where(((~self.image) * mask) == 1)

    # Convert matrix into index notation for void and solid phases
    ind_void = sp.vstack((i_void[0].flatten(),
                          i_void[1].flatten(),
                          i_void[2].flatten())).T
    ind_solid = sp.vstack((i_solid[0].flatten(),
                           i_solid[1].flatten(),
                           i_solid[2].flatten())).T
    ind_query = sp.vstack((i_query[0].flatten(),
                           i_query[1].flatten(),
                           i_query[2].flatten())).T

    # Generate kdtrees for void, solid and query points
    dtree_void = kdt(ind_void)
    dtree_solid = kdt(ind_solid)
    dtree_pts = kdt(ind_query)

    # Perform 2-point correlation calculation for range of radii
    print('Checking correlations vs increasing radii')
    print('0%|' + '-' * len(sizes) + '|100%')
    print(' |', end='')
    hits = []
    for r in sizes:
        print('.', end='')
        sys.stdout.flush()
        hits_void = dtree_pts.count_neighbors(other=dtree_void, r=r)
        hits_solid = dtree_pts.count_neighbors(other=dtree_solid, r=r)
        hits.append(hits_void / (hits_solid + hits_void))
    print('|')

    # Store results in namedtuple
    vals = namedtuple('TwoPointCorrelation', ('distance', 'probability'))
    vals.distance = sizes
    vals.probability = hits
    return vals
def main():
    parser = argparse.ArgumentParser(
        description="Show l2norm of all pairwise languages in a trained model",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter,
    )
    parser.add_argument(
        "--dictionaries", "-d", nargs="+", type=argparse.FileType("r"), default=[sys.stdin],
        help="vocabulary dictionaries of the form word vec with a header",
    )
    parser.add_argument(
        "--infile", "-i", type=argparse.FileType("r"), default=sys.stdin,
        help="evaluation instruction of the form word1 lang1 word2 lang2 ... wordn langn.",
    )
    parser.add_argument("--modelfiles", "-m", nargs="+", default=[], help="all models input files")
    parser.add_argument(
        "--outfile", "-o", nargs="?", type=argparse.FileType("w"), default=sys.stdout,
        help="for each model, for each pairwise language, the l2norm",
    )
    parser.add_argument(
        "--pickle", "-p", action="store_true", default=False,
        help="dictionaries are pickled with pickle_vocab",
    )

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    infile = args.infile
    dictionaries = ([pickle.load(d) for d in args.dictionaries]
                    if args.pickle else [d for d in args.dictionaries])
    dicts_by_lang = dd(list)
    langdims = dict()
    outfile = args.outfile

    for d in dictionaries:
        if args.pickle:
            lang = d["lang"]
            dims = int(d["dim"])
        else:
            info = d.readline().strip().split()
            dims = int(info[1])
            lang = info[2]
        if lang in langdims:
            if dims != langdims[lang]:
                raise ValueError("Multiple dimensions seen for %s: %d and %d" % (lang, dims, langdims[lang]))
        else:
            langdims[lang] = dims
        dicts_by_lang[lang].append(d)

    inmats = {}
    outmats = {}
    vocab = dd(lambda: dict())  # for kdt lookup
    targets = dd(list)
    targetvoc = dd(list)
    models = [np.load(x) for x in args.modelfiles]

    for l in list(langdims.keys()):
        inmats[l] = [np.matrix(x["%s_in" % l]) for x in models]
        outmats[l] = [np.matrix(x["%s_out" % l]) for x in models]
        fdim = langdims[l]
        for dfile in dicts_by_lang[l]:
            if args.pickle:
                print("Unpickling for " + l)
                vocab[l].update(dfile["vocab"])
                targets[l].extend(dfile["targets"])
                targetvoc[l].extend(dfile["targetvoc"])
            else:
                print("processing " + dfile.name)
                try:
                    for ln, line in enumerate(dfile):
                        entry = line.strip().split(" ")
                        if len(entry) < fdim + 1:
                            sys.stderr.write(
                                "skipping line %d in %s because it only has %d fields; first field is %s\n"
                                % (ln, dfile.name, len(entry), entry[0])
                            )
                            continue
                        word = " ".join(entry[:-fdim])
                        vec = np.array(entry[-fdim:]).astype(float)
                        # print "Adding "+l+" -> "+word
                        vocab[l][word] = vec
                        targets[l].append(vec)
                        targetvoc[l].append(word)
                except:
                    print(dfile.name)
                    print(line)
                    print(len(entry))
                    print(word)
                    print(ln)
                    raise
        # normalize for euclidean distance nearest neighbor => cosine with constant
        targets[l] = kdt(normalize(np.array(targets[l]), axis=1, norm="l2"))

    print("loaded vocabularies")

    data = dd(list)
    langmap = {}
    for line in infile:
        linedata = line.strip().split()
        for dset, (word, lang) in enumerate(zip(linedata[::2], linedata[1::2])):
            if word not in vocab[lang]:
                sys.stderr.write("Warning: Couldn't find %s -> %s\n" % (lang, word))
                continue
            if dset not in langmap:
                langmap[dset] = lang
            elif langmap[dset] != lang:
                sys.stderr.write("Language collision at %d: %s vs %s\n" % (dset, lang, langmap[dset]))
                sys.exit(1)
            data[dset].append(vocab[lang][word])

    for lang in langmap:
        data[lang] = np.matrix(data[lang])

    langs = len(langmap.keys())
    for m, mfile in enumerate(args.modelfiles):
        for d1 in range(langs):
            l1 = langmap[d1]
            d1xform = data[d1] * inmats[l1][m]
            for d2 in range(d1 + 1, langs):
                l2 = langmap[d2]
                if l2 in inmats:  # i-i calculation
                    d2xform = data[d2] * inmats[l2][m]
                    delta = d1xform - d2xform
                    delnorm = LA.norm(delta, ord=2)
                    l2n2 = delnorm * delnorm
                    outfile.write("%s\tii\t%s\t%s\t%f\n" % (mfile, l1, l2, l2n2))
                if l2 in outmats:
                    xform = d1xform * outmats[l2][m]
                    delta = xform - data[d2]
                    delnorm = LA.norm(delta, ord=2)
                    l2n2 = delnorm * delnorm
                    outfile.write("%s\tio\t%s\t%s\t%f\n" % (mfile, l1, l2, l2n2))
                            continue
                        word = ' '.join(entry[:-fdim])
                        vec = np.array(entry[-fdim:]).astype(float)
                        # print "Adding "+l+" -> "+word
                        vocab[l][word] = vec
                        targets[l].append(vec)
                        targetvoc[l].append(word)
                except:
                    print dfile.name
                    print line
                    print len(entry)
                    print word
                    print ln
                    raise
        # normalize for euclidean distance nearest neighbor => cosine with constant
        targets[l] = kdt(normalize(np.array(targets[l]), axis=1, norm='l2'))

    print "loaded vocabularies"

    for line in infile:
        inst = line.strip().split()
        inword = inst[0]
        inlang = inst[1]
        outlang = inst[2]
        outword = inst[3] if len(inst) > 3 else None
        if inword not in vocab[inlang]:
            # sys.stderr.write("Warning: Couldn't find %s -> %s\n" % (inlang, inword))
            continue
        report = inst[:4]
        invec = np.matrix(vocab[inlang][inword])
        for smat, tmat in zip(inmats[inlang], outmats[outlang]):
            xform = np.asarray(invec * smat * tmat)[0]
def main():
    parser = argparse.ArgumentParser(
        description="Evaluate the 1 matrix no interlingua embedding experiment",
        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    parser.add_argument("--sourcedictionary", "-S", type=argparse.FileType('r'),
                        help="source vocabulary dictionary of the form lang word vec; headed by row col")
    parser.add_argument("--targetdictionary", "-T", type=argparse.FileType('r'),
                        help="target vocabulary dictionary of the form lang word vec; headed by row col")
    parser.add_argument("--infile", "-i", type=argparse.FileType('r'), default=sys.stdin,
                        help="evaluation instruction of the form word1 lang1 lang2 [word2]. "
                             "If word2 is absent it is only predicted, not evaluated")
    parser.add_argument("--modelfile", "-m", help="all models input file")
    parser.add_argument("--outfile", "-o", nargs='?', type=argparse.FileType('w'), default=sys.stdout,
                        help="results file of the form word1 lang1 lang2 word2 [pos wordlist], "
                             "where the first three fields are identical to eval and the last "
                             "field is the 1-best prediction. If truth is known, ordinal position "
                             "of correct answer (-1 if not found) followed by the n-best list in order")
    parser.add_argument("--nbest", "-n", type=int, default=10,
                        help="nbest neighbors generated for purposes of evaluation")
    parser.add_argument("--pickle", "-p", action='store_true', default=False,
                        help="dictionaries are pickled with pickle_vocab")

    try:
        args = parser.parse_args()
    except IOError as msg:
        parser.error(str(msg))

    infile = args.infile
    outfile = args.outfile
    sourcedictionary = pickle.load(args.sourcedictionary) if args.pickle else args.sourcedictionary
    targetdictionary = pickle.load(args.targetdictionary) if args.pickle else args.targetdictionary

    dims = {}
    if args.pickle:
        sourcelang = sourcedictionary['lang']
        dims[sourcelang] = int(sourcedictionary['dim'])
        targetlang = targetdictionary['lang']
        dims[targetlang] = int(targetdictionary['dim'])
    else:
        sourceinfo = sourcedictionary.readline().strip().split()
        targetinfo = targetdictionary.readline().strip().split()
        sourcelang = sourceinfo[2]
        targetlang = targetinfo[2]
        dims[sourcelang] = int(sourceinfo[1])
        dims[targetlang] = int(targetinfo[1])

    dicts_by_lang = {}
    dicts_by_lang[sourcelang] = sourcedictionary
    dicts_by_lang[targetlang] = targetdictionary
    sourcedim = dims[sourcelang]
    targetdim = dims[targetlang]
    print(sourcedim, targetdim)

    mat = np.matrix(np.load(args.modelfile)['arr_0'])
    print(mat.shape)

    vocab = dd(lambda: dict())
    if args.pickle:
        print("Unpickling")
        targets = dicts_by_lang[targetlang]['targets']
        targetvoc = dicts_by_lang[targetlang]['targetvoc']
        for lang in (sourcelang, targetlang):
            vocab[lang] = dicts_by_lang[lang]['vocab']
    else:
        print("Loading vocab from text files")
        targets = []
        targetvoc = []
        # load transformation matrices
        # TODO: would be cool if this could exist on-disk in some binary format
        # so only the instructions need be passed in
        # Kludgy: store source and target in different structures
        for lang in (sourcelang, targetlang):
            istarget = lang == targetlang
            fdim = dims[lang]
            dfile = dicts_by_lang[lang]
            try:
                for ln, line in enumerate(dfile):
                    entry = line.strip().split(' ')
                    if len(entry) < fdim + 1:
                        sys.stderr.write("skipping line %d in %s because it only has %d fields; "
                                         "first field is %s\n"
                                         % (ln, dfile.name, len(entry), entry[0]))
                        continue
                    word = ' '.join(entry[:-fdim])
                    vec = np.array(entry[-fdim:]).astype(float)
                    vocab[lang][word] = vec
                    if istarget:
                        targets.append(vec)
                        targetvoc.append(word)
            except:
                print(dfile.name)
                print(line)
                print(len(entry))
                print(word)
                print(ln)
                raise

    # normalize for euclidean distance nearest neighbor => cosine with constant
    targets = kdt(normalize(np.array(targets), axis=1, norm='l2'))
    print("loaded vocabularies")

    for line in infile:
        inst = line.strip().split()
        inword = inst[0]
        inlang = inst[1]
        outlang = inst[2]
        outword = inst[3] if len(inst) > 3 else None
        if inword not in vocab[inlang]:
            sys.stderr.write("Warning: Couldn't find %s -> %s\n" % (inlang, inword))
            continue
        invec = np.matrix(vocab[inlang][inword])
        xform = np.asarray(invec * mat)[0]
        neighbors = []
        cosines, cands = targets.query(xform, args.nbest)
        for cos, cand in zip(cosines, cands):
            neighbors.append((cos, targetvoc[cand]))
        report = inst[:3]
        nb_words = [x[1] for x in neighbors]
        xbest = str(cosine(xform, vocab[outlang][nb_words[0]]))
        if outword is not None and outword in vocab[outlang]:
            report.append(inst[3])
            # cosines: xform to truth, xform to 1best, truth to 1best
            truth = vocab[outlang][outword]
            xtruth = str(cosine(xform, truth))
            truthbest = str(cosine(truth, vocab[outlang][nb_words[0]]))
            rank = nb_words.index(outword) if outword in nb_words else -1
            report.append(str(rank))
            report.extend([xtruth, xbest, truthbest])
        else:
            report.append(xbest)
        report.extend(nb_words)
        outfile.write('\t'.join(report) + "\n")
def spatial_sort(paths, init_rad=0.01):
    from numpy import array
    from numpy import zeros
    from numpy.linalg import norm
    from scipy.spatial import cKDTree as kdt

    num = len(paths)
    res = []
    unsorted = set(range(2 * num))

    xs = zeros((2 * num, 2), 'float')
    x_path = zeros(2 * num, 'int')

    for i, path in enumerate(paths):
        xs[i, :] = path[0, :]
        xs[num + i, :] = path[-1, :]
        x_path[i] = i
        x_path[num + i] = i

    tree = kdt(xs)

    count = 0
    pos = array([0, 0], 'float')
    order = []

    while count < num:
        rad = init_rad
        while True:
            near = tree.query_ball_point(pos, rad)
            cands = list(set(near).intersection(unsorted))
            if not cands:
                rad *= 2.0
                continue
            dst = norm(pos - xs[cands, :], axis=1)
            cp = dst.argmin()
            uns = cands[cp]
            break

        path_ind = x_path[uns]
        path = paths[path_ind]

        if uns >= num:
            res.append(path[::-1])
            pos = paths[path_ind][0, :]
            unsorted.remove(uns)
            unsorted.remove(uns - num)
        else:
            res.append(path)
            pos = paths[path_ind][-1, :]
            unsorted.remove(uns)
            unsorted.remove(uns + num)

        order.append(path_ind)
        count += 1

    return res, order
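# A small illustrative call, assuming spatial_sort is in scope and each path
# is a (k, 2) float array; the three segments below are arbitrary.
import numpy as np

paths = [
    np.array([[0.9, 0.9], [0.8, 0.8]]),
    np.array([[0.1, 0.1], [0.2, 0.2]]),
    np.array([[0.5, 0.5], [0.4, 0.4]]),
]
sorted_paths, order = spatial_sort(paths, init_rad=0.05)
print(order)  # original path indices in greedy nearest-first order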
    sourceinfo = map(int, sourcedictionary.readline().strip().split())
    sourcedim = sourceinfo[1]

    smat = np.matrix(np.load(args.modelfile)['s'])
    tmat = np.matrix(np.load(args.modelfile)['t'])
    print smat.shape
    print tmat.shape

    # load transformation matrices
    # TODO: would be cool if this could exist on-disk in some binary format
    # so only the instructions need be passed in
    # Kludgy: store source and target in different structures
    vocab = dd(lambda: dict())  # for kdt lookup
    pretargets, targetvoc = cPickle.load(args.targetdictionary)
    targets = kdt(pretargets)
    invtargetvoc = dict()
    for key, word in enumerate(targetvoc):
        invtargetvoc[word] = key
    print len(targetvoc)

    try:
        for ln, line in enumerate(sourcedictionary):
            entry = line.strip().split(' ')
            if len(entry) < sourcedim + 2:
                sys.stderr.write("skipping line %d in %s because it only has %d fields; "
                                 "first field is %s\n"
                                 % (ln, sourcedictionary.name, len(entry), entry[0]))
                continue
            lang = entry[0]
            word = ' '.join(entry[1:-sourcedim])
            vec = np.array(entry[-sourcedim:]).astype(float)
            vocab[lang][word] = vec
def spatial_sort_2d(paths, init_rad=0.01):
    from numpy import array
    from numpy import zeros
    from numpy.linalg import norm
    from scipy.spatial import cKDTree as kdt

    num = len(paths)
    res = []
    unsorted = set(range(2 * num))

    xs = zeros((2 * num, 2), 'float')
    x_path = zeros(2 * num, 'int')

    for i, path in enumerate(paths):
        xs[i, :] = path[0, :]
        xs[num + i, :] = path[-1, :]
        x_path[i] = i
        x_path[num + i] = i

    tree = kdt(xs)

    count = 0
    pos = array([0, 0], 'float')

    while count < num:
        rad = init_rad
        while True:
            near = tree.query_ball_point(pos, rad)
            cands = list(set(near).intersection(unsorted))
            if not cands:
                rad *= 2.0
                continue
            dst = norm(pos - xs[cands, :], axis=1)
            cp = dst.argmin()
            uns = cands[cp]
            break

        path_ind = x_path[uns]
        path = paths[path_ind]

        if uns >= num:
            res.append(path[::-1])
            pos = paths[path_ind][0, :]
            unsorted.remove(uns)
            unsorted.remove(uns - num)
        else:
            res.append(path)
            pos = paths[path_ind][-1, :]
            unsorted.remove(uns)
            unsorted.remove(uns + num)

        count += 1

    return res
def __setstate__(self, state):
    # restore the saved coordinates before rebuilding the tree
    self.lookup, self.coords = state
    self.tree = kdt(self.coords)
                        continue
                    word = ' '.join(entry[:-fdim])
                    vec = np.array(entry[-fdim:]).astype(float)
                    vocab[lang][word] = vec
                    if istarget:
                        targets.append(vec)
                        targetvoc.append(word)
            except:
                print dfile.name
                print line
                print len(entry)
                print word
                print ln
                raise

    targets = kdt(normalize(np.array(targets), axis=1, norm='l2'))
    print "loaded vocabularies"

    for line in infile:
        inst = line.strip().split()
        inword = inst[0]
        inlang = inst[1]
        outlang = inst[2]
        outword = inst[3] if len(inst) > 3 else None
        if inword not in vocab[inlang]:
            # sys.stderr.write("Warning: Couldn't find %s -> %s\n" % (inlang, inword))
            continue
        invec = np.matrix(vocab[inlang][inword])
        xform = np.asarray(invec * mat)[0]
        neighbors = []
        cosines, cands = targets.query(xform, args.nbest)
def __init__(self, mesh, leaf_size=10):
    self.__mesh = mesh
    self.__tree = kdt(mesh.vertices, leaf_size)