def get_features(self, business, naics, ADD_SYNONYMS=False):
    """
    :param business: business dictionary from challenge set
    :param naics: list of naics dictionaries to check against
    :param ADD_SYNONYMS: boolean whether to add synonyms to titles and descriptions
    :return: dictionary of the 8 similarity combinations to their score
    """
    # Every text field goes through the same preprocessing step, chosen once.
    preprocess = util.add_synonyms_to_text if ADD_SYNONYMS else util.clean_paragraph

    # Append the Google place type (when known) to the name BEFORE preprocessing,
    # matching how the raw fields are combined.
    raw_name = business['name']
    google_type = self.google_types.get(business['unique_id'])
    if google_type:
        raw_name += ' ' + google_type
    business_desc = preprocess(business['description'])
    business_name = preprocess(raw_name)

    codes_to_features = {}
    for naic in naics:
        naic_desc = preprocess(naic['description'])
        naic_title = preprocess(naic['title'])

        # The 4 (business text, naics text) pairings; keys use d=description, t=title/name.
        pairings = [
            ('d_d', business_desc, naic_desc),
            ('t_t', business_name, naic_title),
            ('d_t', business_desc, naic_title),
            ('t_d', business_name, naic_desc),
        ]
        features = {}
        for tag, left, right in pairings:
            # Bag-of-words cosine similarity.
            features[tag + '_sim'] = util.cosine_sim(left, right)
            # word2vec similarity; scrub NaNs (e.g. out-of-vocabulary texts).
            features[tag + '_w2vsim'] = util.removeNans(
                util.word2vec_sim(left, right, self.model))
        codes_to_features[naic['code']] = features
    return codes_to_features
def get_all_business_types(): businesses = loader.get_challengeset() idtoloc = loader.get_idtoloc() business_types_dict = loader.get_business_types() print "Done {} of {}".format(len(business_types_dict), len(businesses)) for business in businesses: unique_id = business['unique_id'] if unique_id not in business_types_dict.keys(): print business['name'] closest_place, best_sim = None, 0 lat, lon = idtoloc[unique_id] for place in get_places(lat, lon): sim = cosine_sim(place['name'], business['name']) if sim > best_sim: closest_place = place best_sim = sim if closest_place: types = filter(lambda x: not x in ['point_of_interest', 'establishment', 'sublocality', 'route', 'real', 'political', 'of', 'or', 'local', 'locality', 'intersection', '1'], closest_place['types']) types = " ".join(types).replace("_", " ") else: types = None print types business_types_dict[unique_id] = types loader.dump_business_dict(business_types_dict)
# Manually assign parameter gradients from the test-set error signal.
# d2 is the error back-propagated through the output nonlinearity
# ("gradient of the currents"); gradients are averaged over idx_tst.
d2 = (W.T@err)*nonlinearity.deriv(z)  # gradient of the currents
if two_layers:
    W2.grad = -(d2@z1.T)/len(idx_tst)
    b2.grad = -d2.mean(1, keepdim=True)
    # Propagate the delta one layer further down.
    d1 = (W2@d2)*nonlinearity.deriv(z1)
    W1.grad = -(d1@inputs[idx_tst,:])/len(idx_tst)
    # BUG FIX: original wrote `b1.gad`, which silently creates an unused
    # attribute and leaves b1's gradient unset (cf. b2.grad/W1.grad above).
    b1.grad = -d1.mean(1, keepdim=True)
else:
    W1.grad = -(d2@inputs[idx_tst,:])/len(idx_tst)
    # BUG FIX: `b1.gad` -> `b1.grad` (same typo as the two-layer branch).
    b1.grad = -d2.mean(1, keepdim=True)

# Per-condition mean gradient of the currents; pairwise cosine similarity
# of the (mean-centred) condition gradients is logged for analysis.
conds = inp_condition[idx_tst]
cond_grad = np.array([d2[:,conds==i].mean(1).detach().numpy()
                      for i in np.unique(conds)])
gradz_sim.append(util.cosine_sim(cond_grad-cond_grad.mean(0),
                                 cond_grad-cond_grad.mean(0)))
# Same analysis on the pre-nonlinearity ("linear") error signal W.T@err.
cond_grad = np.array([(W.T@err)[:,conds==i].mean(1).detach().numpy()
                      for i in np.unique(conds)])
gradlin_sim.append(util.cosine_sim(cond_grad-cond_grad.mean(0),
                                   cond_grad-cond_grad.mean(0)))
# cond_grad = np.array([((d2[:,conds==i]@z[:,conds==i].T)/np.sum(conds==i)).mean(1).detach().numpy() \
#                       for i in np.unique(conds)])
# gradw_sim.append(util.cosine_sim(cond_grad,cond_grad))

# do learning
for j, btch in enumerate(dl):
    optimizer.zero_grad()
    inps, outs = btch
    if two_layers:
        z1 = nonlinearity(torch.matmul(W1,inps.T) + b1)
        z = nonlinearity(torch.matmul(W2,z1) + b2)
if epoch in [0, nepoch - 1]: errb = (targets[idx_tst, :].T - nn.Sigmoid()(pred)) # bernoulli errg = (targets[idx_tst, :].T - pred) # gaussian err = ppp * errb + (1 - ppp) * errg # convex sum, in case you want that d2 = (W.T @ err) * nonlinearity.deriv(z) # gradient of the currents conds = abstract_conds[idx_tst] cond_grad = np.array([ d2[:, conds == i].mean(1).detach().numpy() for i in np.unique(conds) ]) gradz_sim.append( util.cosine_sim(cond_grad - cond_grad.mean(0), cond_grad - cond_grad.mean(0))) # cond_grad = np.array([(W.T@err)[:,conds==i].mean(1).detach().numpy() for i in np.unique(conds)]) cond_grad = np.array([ (d2[:, conds == i] @ inputs[idx_tst, :][conds == i, :]).detach().numpy().T for i in np.unique(conds) ]) gradlin_sim.append( util.cosine_sim(cond_grad - cond_grad.mean(0), cond_grad - cond_grad.mean(0))) # cond_grad = np.array([((d2[:,conds==i]@z[:,conds==i].T)/np.sum(conds==i)).mean(1).detach().numpy() \ # for i in np.unique(conds)]) # gradw_sim.append(util.cosine_sim(cond_grad,cond_grad)) # do learning