def __error(self, R, P, Q, K, beta):
    """
    Calculates the regularized squared error of the factorization.
    :param R: user x item rating matrix
    :param P: user x feature matrix
    :param Q: feature x item matrix
    :param K: number of latent features
    :param beta: regularization penalty
    :return: total error over all observed ratings
    """
    e = 0
    for i in range(len(R)):
        for j in range(len(R[i])):
            if R[i][j] > 0:
                # loss function error sum( (y-y_hat)^2 )
                e = e + pow(R[i][j] - numpy.dot(P[i, :], Q[:, j]), 2)
                # add regularization
                for k in range(K):
                    # error + ||P||^2 + ||Q||^2
                    e = e + (beta / 2) * (pow(P[i][k], 2) + pow(Q[k][j], 2))
    return e
def generate_table_from_xlsx(path):
    book = xlrd.open_workbook(path)
    sheet = book.sheet_by_index(1)
    data = {}
    for row_index in range(0, sheet.nrows):
        row_values = [sheet.cell(row_index, col_index).value
                      for col_index in range(0, sheet.ncols)]
        if row_index == 2:
            time_w = list(filter(None, row_values))
            print(time_w)
        if row_index == 3:
            idx_w = [index for index, value in enumerate(row_values) if value == "WEIGHT"]
            idx_c = [index for index, value in enumerate(row_values) if value == "FAMACHA"]
        if row_index > 4:
            if row_values[1] == '':
                continue
            chunks = []
            for i in range(0, len(idx_w)):
                s = "40101310%s" % row_values[1]
                serial = int(s.split('.')[0])
                chunks.append([time_w[i], row_values[idx_c[i]], serial])
            if len(chunks) != 0:
                data[serial] = chunks
    print(data)
    return data
def precook(s, n=4, out=False):
    """Takes a string as input and returns an object that can be given to
    either cook_refs or cook_test. This is optional: cook_refs and cook_test
    can take string arguments as well."""
    words = s.split()
    counts = defaultdict(int)
    for k in range(1, n + 1):
        for i in range(len(words) - k + 1):
            ngram = tuple(words[i:i + k])
            counts[ngram] += 1
    return (len(words), counts)
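# Usage sketch (not from the original source): a minimal check of what precook
# returns for a short sentence with the default n=4. Assumes defaultdict has
# been imported from collections, as the function itself requires.
length, counts = precook("the cat sat on the mat")
# length == 6; counts maps n-gram tuples to frequencies,
# e.g. counts[("the",)] == 2 and counts[("the", "cat")] == 1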
def averagePixels(self):
    r, g, b = 0, 0, 0
    count = 0
    for x in range(self.pic.size[0]):
        for y in range(self.pic.size[1]):
            tempr, tempg, tempb = self.imgData[x, y]
            r += tempr
            g += tempg
            b += tempb
            count += 1
    # calculate averages
    return (r / count), (g / count), (b / count)
def shrink_shap(data, rows, cols):
    shrunk = np.zeros((rows, cols))
    for i in range(0, rows):
        for j in range(0, cols):
            row_sp = int(data.shape[0] / rows)
            col_sp = int(data.shape[1] / cols)
            zz = data[i * row_sp: i * row_sp + row_sp, j * col_sp: j * col_sp + col_sp]
            shrunk[i, j] = np.sum(zz)
    # flatten the normalised, shrunk grid into a 1-D array
    values = np.array([item for sublist in shrunk / np.max(shrunk) for item in sublist])
    shap_values = []
    for i in range(5):
        # values is already flat, so wrap a copy per class instead of flattening it again
        shap_values.append([list(values)])
    shap_values = np.array(shap_values)
    shap_values = shap_values / np.max(shap_values)
    return shap_values
def analyze_centrality(graph):
    centrality_dict = OrderedDict()

    print('Analyzing degree centrality...')
    score_list = graph.degree()
    centrality_dict['degree'] = sorted([(graph.vs[i], score_list[i]) for i in range(0, len(score_list))],
                                       key=lambda x: x[1], reverse=True)
    print('Done!')
    print()

    print('Analyzing betweenness centrality...')
    score_list = graph.betweenness()
    centrality_dict['betweenness'] = sorted([(graph.vs[i], score_list[i]) for i in range(0, len(score_list))],
                                            key=lambda x: x[1], reverse=True)
    print('Done!')
    print()

    print('Analyzing closeness centrality...')
    score_list = graph.closeness()
    centrality_dict['closeness'] = sorted([(graph.vs[i], score_list[i]) for i in range(0, len(score_list))],
                                          key=lambda x: x[1], reverse=True)
    print('Done!')
    print()

    print('Analyzing eigenvector centrality...')
    score_list = graph.evcent()
    centrality_dict['eigenvector'] = sorted([(graph.vs[i], score_list[i]) for i in range(0, len(score_list))],
                                            key=lambda x: x[1], reverse=True)
    print('Done!')
    print()

    print('Analyzing pagerank centrality...')
    score_list = graph.pagerank()
    centrality_dict['pagerank'] = sorted([(graph.vs[i], score_list[i]) for i in range(0, len(score_list))],
                                         key=lambda x: x[1], reverse=True)
    print("Done!")
    print()

    return centrality_dict
def cook_test(test, reflen_refmaxcounts, eff=None, n=4):
    '''Takes a test sentence and returns an object that encapsulates
    everything that BLEU needs to know about it.'''
    testlen, counts = precook(test, n, True)
    result = {}

    # Calculate effective reference sentence length.
    if eff == "closest":
        result["reflen"] = min((abs(l - testlen), l) for l in reflen_refmaxcounts[0])[1]
    else:  # i.e., "average" or "shortest" or None
        result["reflen"] = reflen_refmaxcounts[0]

    result["testlen"] = testlen
    result["guess"] = [max(0, testlen - k + 1) for k in range(1, n + 1)]
    result['correct'] = [0] * n
    for (ngram, count) in counts.items():
        result["correct"][len(ngram) - 1] += min(reflen_refmaxcounts[1].get(ngram, 0), count)
    return result
def generate_data_table_from_xlsx(path):
    book = xlrd.open_workbook(path)
    sheet = book.sheet_by_index(1)
    data = []
    print("reading file...")
    for row_index in range(0, sheet.nrows):
        if row_index == 0:
            continue
        row = [sheet.cell(row_index, col_index).value for col_index in range(0, sheet.ncols)]
        postal_code = format_postcode(row[0].strip())
        geo_data = get_geoloc_data(postal_code)
        row.extend(geo_data)
        data.append(tuple(row))
    print("finished reading. start appending SQL database...")
    insert_record_to_sql_table("final_data", data)
    sql_db_flush()
def computeKernelMatrix(self, data1, data2, symmetric=False):
    """ Computes the kernel matrix """
    logging.debug("Starting RBF Kernel Matrix Computation...")
    self._data1 = mat(data1)
    self._data2 = mat(data2)
    assert self._data1.shape[1] == (self._data2.T).shape[0]
    self._dim1 = len(data1)
    self._dim2 = len(data2)
    self._symmetric = symmetric
    self.__km = None
    try:
        if self._symmetric:
            linearkm = self._data1 * self._data2.T
            trnorms = mat(np.diag(linearkm)).T
            trace_matrix = trnorms * mat(np.ones((1, self._dim1), dtype=float64))
            self.__km = trace_matrix + trace_matrix.T
            self.__km = self.__km - 2 * linearkm
            self.__km = -self.__sigma_squared_inv * self.__km
            self.__km = np.exp(self.__km)
            return self.__km
        else:
            m = self._data1.shape[0]
            n = self._data2.shape[0]
            assert self._data1.shape[1] == self._data2.shape[1]
            linkm = mat(self._data1 * self._data2.T)
            trnorms1 = []
            for i in range(m):
                trnorms1.append((self._data1[i] * self._data1[i].T)[0, 0])
            trnorms1 = mat(trnorms1).T
            trnorms2 = []
            for i in range(n):
                trnorms2.append((self._data2[i] * self._data2[i].T)[0, 0])
            trnorms2 = mat(trnorms2).T
            self.__km = trnorms1 * mat(np.ones((n, 1), dtype=float64)).T
            self.__km = self.__km + mat(np.ones((m, 1), dtype=float64)) * trnorms2.T
            self.__km = self.__km - 2 * linkm
            self.__km = -self.__sigma_squared_inv * self.__km
            self.__km = np.exp(self.__km)
            return self.__km
    except Exception as e:
        logging.error("Error while computing kernel matrix: " + str(e))
        sys.exit()
def discount_rewards(r):
    """ take 1D float array of rewards and compute discounted reward """
    discounted_r = np.zeros_like(r)
    running_add = 0
    for t in reversed(range(0, r.size)):
        if r[t] != 0:
            running_add = 0  # reset the sum, since this was a game boundary (pong specific!)
        running_add = running_add * gamma + r[t]
        discounted_r[t] = running_add
    return discounted_r
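# Usage sketch (not from the original source): gamma is a module-level constant
# in the original code; the value 0.99 below is assumed purely for illustration.
import numpy as np
gamma = 0.99  # assumed discount factor, not from the original
r = np.array([0., 0., 1.])  # reward arrives only at the end of the episode
print(discount_rewards(r))  # prints approximately [0.9801 0.99 1.]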
def EM(So: float, mu: float, sigma: float, N: int, M=1):
    b = Brownian(N)[1]
    dt = M * (1 / N)  # EM step size
    L = N / M
    wi = [So]
    for i in range(0, int(L)):
        Winc = np.sum(b[(M * (i - 1) + M):(M * i + M)])
        w_i_new = wi[i] + mu * wi[i] * dt + sigma * wi[i] * Winc
        wi.append(w_i_new)
    return wi, dt
def GBM(So: float, mu: float, sigma: float, N: float, T=1.) -> list:
    W = Brownian(N)[0]
    t = np.linspace(0., 1., int(N) + 1)
    S = []
    S.append(So)
    for i in range(1, int(N + 1)):
        drift = (mu - 0.5 * sigma**2) * t[i]
        diffusion = sigma * W[i - 1]
        S_temp = So * np.exp(drift + diffusion)
        S.append(S_temp)
    return S, t
def show(dataSet, k, centroids, clusterAssment):
    # inspect the dimensions of the matrix/array: number of rows and columns
    numSamples, dim = dataSet.shape
    mark = ['or', 'ob', 'og', 'ok', '^r', '+r', 'sr', 'dr', '<r', 'pr']
    for i in range(numSamples):
        markIndex = int(clusterAssment[i, 0])
        plt.plot(dataSet[i, 0], dataSet[i, 1], mark[markIndex])
    mark = ['Dr', 'Db', 'Dg', 'Dk', '^b', '+b', 'sb', 'db', '<b', 'pb']
    for i in range(k):
        plt.plot(centroids[i, 0], centroids[i, 1], mark[i], markersize=12)
    plt.show()
def check_possibility(nums):
    """
    :type nums: List[int]
    :rtype: bool
    """
    p = None
    for i in range(len(nums) - 1):
        if nums[i] > nums[i + 1]:
            if p is not None:
                return False
            p = i
    return (p is None or p == 0 or p == len(nums) - 2
            or nums[p - 1] <= nums[p + 1] or nums[p] <= nums[p + 2])
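# Usage sketch (not from the original source): the function answers whether the
# array can be made non-decreasing by modifying at most one element.
print(check_possibility([4, 2, 3]))  # -> True  (lower the 4)
print(check_possibility([4, 2, 1]))  # -> False (two changes needed)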
def computeKernelMatrix(self, data1, data2, symmetric=False):
    """ Computes the kernel matrix """
    logging.debug("Starting Linear Kernel Matrix Computation...")
    self._data1 = data1
    self._data2 = data2
    self._dim1 = len(data1)
    self._dim2 = len(data2)
    self._symmetric = symmetric
    self.__km = None
    try:
        km = mat(zeros((self._dim1, self._dim2), dtype=float64))
        if self._symmetric:
            for i in range(self._dim1):
                message = 'Kernel Matrix Progress: %dx%d/%dx%d' % (i, self._dim2, self._dim1, self._dim2)
                logging.debug(message)
                for j in range(i, self._dim2):
                    val = self.getKernelValue(self._data1[i], self._data2[j])
                    km[i, j] = val
                    km[j, i] = val
            return km
        else:
            for i in range(self._dim1):
                message = 'Kernel Matrix Progress: %dx%d/%dx%d' % (i, self._dim2, self._dim1, self._dim2)
                logging.debug(message)
                for j in range(0, self._dim2):
                    val = self.getKernelValue(self._data1[i], self._data2[j])
                    km[i, j] = val
            return km
    except Exception as e:
        logging.error("Error while computing kernel matrix: " + str(e))
        sys.exit()
    logging.debug("Kernel Matrix computed...")
def main(args):
    parser = argparse.ArgumentParser(
        description="""Generate one or more JSON objects containing random data
given an input json-schema.org compatible schema specification""")
    parser.add_argument('schemaFile', metavar='file',
                        help='json-schema.org schema file to use')
    parser.add_argument('--count', default=1, type=int,
                        help='number of objects to create (default: 1)')
    parser.add_argument('--mode', choices=['pure', 'mongo', 'full'], default="mongo",
                        help="""\
format of non-string data to emit. pure is normal JSON; dates emitted as
ISO8601 strings and numbers are just numbers. mongo is mongoDB-compatible
where dates are emitted as a special map {"$date", millis}. mongoimport is
sensitive to maps with $ keys and will process the content as the indicated
type. full is a superset of mongoDB types. Integers are emitted as
{"$int", value}, floats as {"$float", value}. mongoimport does not permit
this -- but pymonimport does.""")
    parser.add_argument('--defaultStringIpsum',
                        choices=['word', 'sentence', 'paragraph', 'fname'], default="word",
                        help="""\
default style of string to emit when presented with type:string.""")
    rargs = parser.parse_args()

    fname = rargs.schemaFile
    count = rargs.count

    fp = open(fname)
    try:
        schema = json.load(fp)
        params = {
            "mode": rargs.mode,
            "defaultStringIpsum": rargs.defaultStringIpsum
        }
        q = Ipsum.Ipsum(params)
        for i in range(count):
            z = q.createItem(schema)
            print(json.dumps(z))
    except ValueError:
        tb = traceback.format_exc()
        print(tb)
def makeIpsum(self, ipsum):
    style = self.dsi  # default
    s = None
    if ipsum is not None:
        style = ipsum
    if style == "sentence":
        n = self.randomInt(10, 20)
        s = ' '.join([self.randomFrom(self.bleck) for num in range(n)])
    elif style == "paragraph":
        n = self.randomInt(10, len(self.bleck))
        s = ' '.join([self.randomFrom(self.bleck) for num in range(n)])
    elif style == "word":
        s = self.randomFrom(self.bleck)
    elif style == "fname":
        s = self.randomFrom(self.fnames)
    elif style == "id":
        s = str(uuid.uuid4())
    elif style == "bson:ObjectId" or style == "bson:7":
        v = self.generateMongoOID()
        if self.mode == self.PURE_JSON:
            s = v
        else:
            # oooo not a string, but a dict!
            s = {"$oid": v}
    else:
        s = "unknown_ipsum \"" + style + "\""
    return s
def centroid_centrality(graph: Graph):
    adjm = graph.get_adjacency().data
    # build the pairwise shortest-path matrix for adjacent vertex pairs
    pathm = []
    for v1 in range(0, len(graph.vs)):
        temp = []
        for v2 in range(0, len(graph.vs)):
            if adjm[v1][v2] > 0:
                temp.append(graph.shortest_paths(source=v1, target=v2)[0][0])
            else:
                temp.append(0)
        pathm.append(temp)
    pathm = np.array(pathm)

    fm = np.zeros((len(graph.vs), len(graph.vs)))
    for v1 in range(0, len(graph.vs)):
        for v2 in range(0, len(graph.vs)):
            fm[v1][v2] = (len([x for x in zip(pathm[v1], pathm[v2]) if x[0] < x[1]])
                          - len([x for x in zip(pathm[v1], pathm[v2]) if x[0] > x[1]]))

    score_dict = {}
    for v in graph.vs:
        score_dict[v] = min(fm[v.index])
    return sorted(score_dict.items(), key=lambda x: x[1], reverse=True)
def __optimize(self):
    logging.debug("Starting optimization with BFGS ...")
    self.__needed_function_calls = 0
    # starting point
    c_current = zeros(self.__dim, float64)
    c_current[self.__dim - 1] = self.__b
    # Annealing sequence.
    for i in range(len(self.__lam_Uvec)):
        self.__lamU = self.__lam_Uvec[i]
        # crop one dimension (in case the offset b is fixed)
        c_current = c_current[:self.__dim - 1]
        c_current = self.__localSearch(c_current)
        # reappend it if needed
        c_current = np.append(c_current, self.__b)
    f_opt = self.__getFitness(c_current)
    return c_current, f_opt
def longestcommonsubsequence(A, B):
    x = len(A)
    y = len(B)
    P = [[None] * (y + 1) for i in range(x + 1)]
    for i in range(x + 1):
        for j in range(y + 1):
            if i == 0 or j == 0:
                P[i][j] = 0
            elif A[i - 1] == B[j - 1]:
                P[i][j] = P[i - 1][j - 1] + 1
            else:
                P[i][j] = max(P[i - 1][j], P[i][j - 1])
    return P[x][y]
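# Usage sketch (not from the original source): P[i][j] holds the LCS length of
# A[:i] and B[:j], so the bottom-right cell is the answer for the full strings.
print(longestcommonsubsequence("AGGTAB", "GXTXAYB"))  # -> 4 ("GTAB")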
def GBM(So: float, mu: float, sigma: float, N: float) -> list:
    """Simulates a geometric Brownian motion price path.

    Arguments:
        So {float} -- initial stock price
        mu {float} -- mean of historical daily returns
        sigma {float} -- standard deviation of historical daily returns
        N {float} -- number of time points in the prediction time horizon

    Returns:
        simulated prices S (list) and the corresponding time grid t (numpy array)
    """
    W = Brownian(N)[0]
    t = np.linspace(0., 1., int(N) + 1)
    S = []
    S.append(So)
    for i in range(1, int(N + 1)):
        drift = (mu - 0.5 * sigma**2) * t[i]
        diffusion = sigma * W[i - 1]
        S_temp = So * np.exp(drift + diffusion)
        S.append(S_temp)
    return S, t
def chunks(lst, n):
    """Yield successive n-sized chunks from lst."""
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
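# Usage sketch (not from the original source): chunks() is a generator, so wrap
# it in list() to materialise the slices.
print(list(chunks([1, 2, 3, 4, 5, 6, 7], 3)))  # -> [[1, 2, 3], [4, 5, 6], [7]]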
def makeThing(self, path, info):
    type = info["type"]
    if type == "null":
        o = "null"

    elif type == "string":
        fmt = None
        v = None
        if "format" in info:
            fmt = info['format']

        # date-time is special. It is good to have optimizations
        # because running the parser over and over again on the
        # string rep of a date is very very slow...
        if "enum" in info:
            if fmt == "date-time":
                if '_dateEnums' not in info:
                    info['_dateEnums'] = [self.str2Epoch(ss) for ss in info['enum']]
                v = self.randomFrom(info['_dateEnums'])  # pick
            else:
                v = self.randomFrom(info['enum'])
            # v is no longer None

        if fmt == "date-time":
            if v is not None:
                # must have been an enum; already parsed to an epoch
                epoch = v  # self.str2Epoch(v)
            else:
                # if we have ipsum AND we have inc then OK!
                if 'ipsum' in info and 'inc' in info['ipsum']:
                    q2 = info['ipsum']['inc']  # i.e. { "start": 0, "val": 1 }
                    if path not in self.counters:
                        epoch = self.str2Epoch(q2['start'])  # expensive
                        self.counters[path] = epoch
                    else:
                        if 'secs' in q2:
                            v2 = q2['secs']
                        if 'mins' in q2:
                            v2 = q2['mins'] * 60
                        if 'hrs' in q2:
                            v2 = q2['hrs'] * 60 * 60
                        if 'days' in q2:
                            v2 = q2['days'] * 60 * 60 * 24
                        v2 *= self.millisAdj
                        self.counters[path] += v2
                        epoch = self.counters[path]
                # was no ipsum or no ipsum.inc...
                else:
                    # try for min and max. To avoid running the expensive
                    # parse over and over, look for _min and _max (which only
                    # we can create). Don't believe it? Try commenting out
                    # the assignments below (lines ending with #tag1) and
                    # rerun. It's almost 3x (300%) faster when you parse and
                    # save the value....
                    mmin = self.lowDateEpoch
                    mmax = self.highDateEpoch
                    if '_min' in info:
                        mmin = info['_min']
                    else:
                        if 'minimum' in info:
                            mmin = self.str2Epoch(info['minimum'])  # expensive
                            info['_min'] = mmin  # tag1
                    if '_max' in info:
                        mmax = info['_max']
                    else:
                        if 'maximum' in info:
                            mmax = self.str2Epoch(info['maximum'])  # expensive
                            info['_max'] = mmax  # tag1
                    epoch = self.randomLong(mmin, mmax)

            if self.mode == self.FULL_EXT_JSON or self.mode == self.MONGO_JSON:
                o = {"$date": epoch}
            elif self.mode == self.RAW:
                o = datetime.datetime.fromtimestamp(epoch)
            else:
                o = datetime.datetime.fromtimestamp(epoch).strftime('%Y-%m-%dT%H:%M:%S')

        elif v is None:
            # not date-time and not enum
            if fmt is not None:
                # format takes precedence over ipsum field:
                o = self.makeFormattedString(fmt)
            else:
                t = info['ipsum'] if 'ipsum' in info else None
                o = self.makeIpsum(t)
        else:
            o = v

    elif type == "object":
        ss = info["properties"]
        nn = {}
        self.processObject(nn, path, ss)
        o = nn

    elif type == "array":
        ss = info["items"]
        mmin = ss['minItems'] if 'minItems' in ss else self.DEF_MIN_ARR_LEN
        mmax = ss['maxItems'] if 'maxItems' in ss else self.DEF_MAX_ARR_LEN
        # List comprehensions front and center....
        o = [self.makeThing(path + "." + str(i), ss)
             for i in range(self.randomInt(mmin, mmax))]

    elif type == "oneOf":
        ll = info["items"]  # A list, not a dict!
        x = self.randomFrom(ll)  # pick one and go!
        o = self.makeThing(path, x)

    elif type == "number" or type == "integer":
        v = None
        if "enum" in info:
            v = self.randomFrom(info['enum'])
            # v is no longer None
        elif "ipsum" in info:
            q = info['ipsum']
            if 'inc' in q:
                q2 = q['inc']  # i.e. { "start": 0, "val": 1 }
                if path not in self.counters:
                    self.counters[path] = q2['start']
                else:
                    self.counters[path] += q2['val']
                v = self.counters[path]
        if v is None:
            mmin = info['minimum'] if 'minimum' in info else -100
            mmax = info['maximum'] if 'maximum' in info else 100
            if type == "number":
                v = self.randomDouble(mmin, mmax)
            if type == "integer":
                v = self.randomInt(mmin, mmax)

        # At this point, we have SOME kind of v!
        if self.mode == self.FULL_EXT_JSON:
            if type == "number":
                o = {"$float": v}
            if type == "integer":
                o = {"$int": v}
        else:
            o = v

    elif type == "boolean":
        v = None
        if "enum" in info:
            q = str(self.randomFrom(info['enum']))  # Force to str....
            v = q.lower() in ("yes", "true", "t", "1")
            # v is no longer None but a bool
        if v is None:
            v = True if self.randomDouble(0, 1) > .5 else False
        o = v

    return o
def __factor_matrix(self, R, K, alpha, steps, beta, error_limit):
    """
    R = user x product matrix
    K = latent features count (how many features we think the model should derive)
    alpha = learning rate
    beta = regularization penalty (minimize over/under fitting)
    steps = maximum number of gradient descent steps
    error_limit = algo finishes when error reaches this level

    Returns:
    P = User x features matrix. (How strongly a user is associated with a feature)
    Q = Product x feature matrix. (How strongly a product is associated with a feature)

    To predict, use dot product of P, Q
    """
    # Transform regular array to numpy array
    R = numpy.array(R)

    # Generate P - N x K
    # Use random values to start. Best performance
    N = len(R)
    M = len(R[0])
    P = numpy.random.rand(N, K)

    # Generate Q - M x K
    # Use random values to start. Best performance
    Q = numpy.random.rand(M, K)
    Q = Q.T

    error = 0

    # iterate through max # of steps
    for step in range(steps):

        # iterate each cell in R
        for i in range(len(R)):
            for j in range(len(R[i])):
                if R[i][j] > 0:

                    # get the eij (error) side of the equation
                    eij = R[i][j] - numpy.dot(P[i, :], Q[:, j])

                    for k in range(K):
                        # (*update_rule) update pik_hat
                        P[i][k] = P[i][k] + alpha * (2 * eij * Q[k][j] - beta * P[i][k])
                        # (*update_rule) update qkj_hat
                        Q[k][j] = Q[k][j] + alpha * (2 * eij * P[i][k] - beta * Q[k][j])

        # Measure error
        error = self.__error(R, P, Q, K, beta)

        # Terminate when we converge
        if error < error_limit:
            break

    # track Q, P (learned params)
    # Q = Products x feature strength
    # P = Users x feature strength
    self.Q = Q.T
    self.P = P

    self.__print_fit_stats(error, N, M)
with tf.name_scope("train_op_"): train_op = tf.train.AdamOptimizer(learning_rate=2.**-5).minimize(loss) with tf.name_scope("mse_"): mse = tf.reduce_mean(tf.squared_difference(y, bernoulli.mean())) with tf.name_scope("init_op_"): init_op = tf.global_variables_initializer() # Run graph 1000 times. with tf.name_scope("train_100_"): num_steps = 2000 loss_ = np.zeros(num_steps) # Style: `_` to indicate sess.run result. mse_ = np.zeros(num_steps) with tf.Session() as sess: sess.run(init_op) for it in xrange(loss_.size): _, loss_[it], mse_[it] = sess.run([train_op, loss, mse]) if it % 200 == 0 or it == loss_.size - 1: print("iteration:{} loss:{} mse:{}".format(it, loss_[it], mse_[it])) # sess.run(tf.global_variables_initializer()) writer = tf.summary.FileWriter(LOGDIR) writer.add_graph(sess.graph) saver = tf.train.Saver() # ==> iteration:0 loss:0.635675370693 mse:0.222526371479 # iteration:200 loss:0.440077394247 mse:0.143687799573 # iteration:400 loss:0.440077394247 mse:0.143687844276
def compute_svd_pca():
    global W, H
    face_width, face_height = (W, H)

    # Create a vector for all faces
    face_vector = np.array([
        cv2.imread(os.path.join(face_database_gray_file, filename), 0).flatten()
        for filename in f_list
    ])

    # Compute average face
    fave = np.mean(face_vector, 0)

    # Subtract the average face from each image before performing SVD and PCA
    X = face_vector - fave

    print("Finding SVD of data matrix")
    # Decompose the mean-centered matrix into three parts
    U, S, Vt = np.linalg.svd(X.transpose(), full_matrices=False)
    V = Vt.T

    # Sort principal components by descending order of the singular values
    ind = np.argsort(S)[::-1]
    U, S, V = U[:, ind], S[ind], V[:, ind]
    eigenfaces = U

    # Print dimensions
    print("face_vector:", face_vector.shape)
    print("U:", U.shape)
    print("Sigma:", S.shape)
    print("V^T:", Vt.shape)

    # Weights is an n x n matrix
    weights = np.dot(X, eigenfaces)  # TODO: Maybe swap + .T to eigenfaces

    # Some intermediate saves:
    save_average_face = True
    if save_average_face:
        # Save average face
        average_face = fave.reshape(face_width, face_height)
        cv2.imwrite(os.path.join(intermediate_file, 'average_face.jpg'), average_face)

    save_eigenvectors = False
    if save_eigenvectors:
        print("Saving eigenvectors...")
        for i in range(n):
            f_name = os.path.join(intermediate_file, 'eigenvector_%s.png' % i)
            im = U[:, i].reshape(face_width, face_height)
            cv2.imwrite(f_name, im)

    save_reconstructed = True
    if save_reconstructed:
        k = 30
        print('\n', 'Save the reconstructed images based on only "%s" eigenfaces' % k)
        for img_id in range(n):
            # for k ranging from 1 to total + 1:
            reconstructed_face = fave + np.dot(weights[img_id, :k], eigenfaces[:, :k].T)
            reconstructed_face.shape = (face_width, face_height)  # transform vector to initial image size
            cv2.imwrite(
                os.path.join(intermediate_file, 'img_reconstr_%s_k=%s.png' % (f_list[img_id], k)),
                reconstructed_face)

    # Projected training images into PCA subspace as yn=weights or Yn = E.T * (Xn - average_face)
    training_proj = weights

    average_face_flatten = fave
    return training_proj, eigenfaces, average_face_flatten
def genQRToken(self, qrsig):
    e = 0
    for i in range(0, len(qrsig)):
        e += (e << 5) + ord(qrsig[i])
    qrtoken = (e & 2147483647)
    return str(qrtoken)
def compute_score(self, option=None, verbose=0):
    n = self.n
    small = 1e-9
    tiny = 1e-15  # so that if guess is 0 still return 0
    bleu_list = [[] for _ in range(n)]

    if self._score is not None:
        return self._score

    if option is None:
        option = "average" if len(self.crefs) == 1 else "closest"

    self._testlen = 0
    self._reflen = 0
    totalcomps = {
        'testlen': 0,
        'reflen': 0,
        'guess': [0] * n,
        'correct': [0] * n
    }

    # for each sentence
    for comps in self.ctest:
        testlen = comps['testlen']
        self._testlen += testlen

        if self.special_reflen is None:  # need computation
            reflen = self._single_reflen(comps['reflen'], option, testlen)
        else:
            reflen = self.special_reflen

        self._reflen += reflen

        for key in ['guess', 'correct']:
            for k in range(n):
                totalcomps[key][k] += comps[key][k]

        # append per image bleu score
        bleu = 1.
        for k in range(n):
            bleu *= (float(comps['correct'][k]) + tiny) \
                / (float(comps['guess'][k]) + small)
            bleu_list[k].append(bleu ** (1. / (k + 1)))
        ratio = (testlen + tiny) / (reflen + small)  # N.B.: avoid zero division
        if ratio < 1:
            for k in range(n):
                bleu_list[k][-1] *= math.exp(1 - 1 / ratio)

        if verbose > 1:
            print(comps, reflen)

    totalcomps['reflen'] = self._reflen
    totalcomps['testlen'] = self._testlen

    bleus = []
    bleu = 1.
    for k in range(n):
        bleu *= float(totalcomps['correct'][k] + tiny) \
            / (totalcomps['guess'][k] + small)
        bleus.append(bleu ** (1. / (k + 1)))
    ratio = (self._testlen + tiny) / (self._reflen + small)  # N.B.: avoid zero division
    if ratio < 1:
        for k in range(n):
            bleus[k] *= math.exp(1 - 1 / ratio)

    if verbose > 0:
        print(totalcomps)
        print("ratio:", ratio)

    self._score = bleus
    return self._score, bleu_list
def genBKN(self, skey):
    b = 5381
    for i in range(0, len(skey)):
        b += (b << 5) + ord(skey[i])
    bkn = (b & 2147483647)
    return str(bkn)
def generate_student_migation_data_table_from_xlsx(path):
    print('generate_student_migation_data_table_from_xlsx')
    book = xlrd.open_workbook(path)
    sheet = book.sheet_by_index(0)
    data = []
    region_of_he_provider_list = []
    domicile_list = []
    geolocator = Nominatim(user_agent=__name__)
    print("reading file...")
    for row_index in range(0, sheet.nrows):
        if row_index < 18:
            continue
        row = [sheet.cell(row_index, col_index).value for col_index in range(0, sheet.ncols)]
        a_way_domicile = row[0]
        domicile = row[1]
        domicile_list.append(clean(domicile))
        level_of_study = row[2]
        mode_of_study = row[3]
        academic_year = int(row[4].split('/')[0])
        region_of_he_provider = row[5]
        if 'Total England' in region_of_he_provider:
            region_of_he_provider = 'England'
        if 'Total United Kingdom' in region_of_he_provider:
            region_of_he_provider = 'United Kingdom'
        region_of_he_provider_list.append(region_of_he_provider)
        number = int(row[6])
        data.append([a_way_domicile, domicile, level_of_study, mode_of_study,
                     academic_year, region_of_he_provider, number])

    region_of_he_provider_list = list(set(region_of_he_provider_list))
    region_of_he_provider_list.sort()
    domicile_list = list(set(domicile_list))
    domicile_list.sort()

    with open("geolocator.data") as f:
        content = f.readlines()
    latitudes_longitudes_dom = [json.loads(x.strip()) for x in content][0]

    for place in domicile_list:
        print(place)
        if place not in latitudes_longitudes_dom:
            location = geolocator.geocode(place, timeout=None)
            latitudes_longitudes_dom[place] = {'lat': location.latitude, 'long': location.longitude}
            print('cache...')
            with open('geolocator.data', 'w') as outfile:
                outfile.write(json.dumps(latitudes_longitudes_dom))
            time.sleep(1.1)
        else:
            print(place, latitudes_longitudes_dom[place])
            # get_geoloc_data_migration(place)

    latitudes_longitudes_he = {}
    for place in region_of_he_provider_list:
        if 'Total England' in place:
            place = 'England'
        if 'Total United Kingdom' in place:
            place = 'United Kingdom'
        # get_geoloc_data_migration(place)
        location = geolocator.geocode(place + ' , UK', timeout=None)
        print(place, '-', location)
        latitudes_longitudes_he[place] = {'lat': location.latitude, 'long': location.longitude}
        time.sleep(1.1)

    final_data = []
    for item in data:
        place = item[5]
        if 'Total England' in place:
            place = 'England'
        if 'Total United Kingdom' in place:
            place = 'United Kingdom'
        dom_c = latitudes_longitudes_dom[clean(item[1])]
        item.append(dom_c['lat'])
        item.append(dom_c['long'])
        he_c = latitudes_longitudes_he[clean(place)]
        item.append(he_c['lat'])
        item.append(he_c['long'])
        item[1] = clean(item[1])
        if 'Caribbean' in item[1]:
            print(item)
        final_data.append(tuple(item))

    print("finished reading. start appending SQL database...")
    insert_record_to_sql_table_student_migration("student_migration", final_data)
    sql_db_flush()