def __is_fname(self, arr_or_fname):
    if isinstance(arr_or_fname, str):
        embed = load_embedding(arr_or_fname)
    else:
        embed = arr_or_fname
    return embed
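A minimal, self-contained sketch of the same either-array-or-path dispatch, assuming embeddings are stored as .npy files; `as_embedding` and `emb.npy` are hypothetical names, and the original helper delegates to the project's `load_embedding` rather than `np.load`:

import numpy as np

def as_embedding(arr_or_fname):
    # Accept either an in-memory array or a filename on disk (assumed .npy here).
    if isinstance(arr_or_fname, str):
        return np.load(arr_or_fname)
    return arr_or_fname

H = as_embedding(np.random.rand(10, 4))   # passed through unchanged
# H = as_embedding('emb.npy')             # hypothetical path, loaded from disk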
def __init__(self, config, mode):
    self.config = config
    self.mode = mode
    # if model is bcnn, scope: bcnn; if model is abcnn, scope: abcnn
    self.scope = self.__class__.__name__
    self.vocab_table, _, self.vocab_size = load_vocab(self.config.vocab_file)
    self.word_embedding = load_embedding(self.config.vocab_file,
                                         self.config.embedding_file)
    self._build_graph()
def get_features(self, input_data):
    features = input_data[[
        'numBedrooms', 'numBathrooms', 'sqft', 'region_coor', 'priceSqft',
        'lat', 'lon'
    ]]
    embed = self.get_embedding()
    if embed is not None:
        embed_df = pd.DataFrame(embed, index=self.idx_coor_map.values())
        embed_features = pd.merge(features, embed_df, left_on='region_coor',
                                  right_index=True, how='inner')
        embed_features.drop('region_coor', axis=1, inplace=True)
        X = embed_features.drop(['priceSqft', 'lat', 'lon'], axis=1).values
        y = embed_features['priceSqft'].values
    else:
        embed = load_embedding(self.config['embedding_file'])
        embed_df = pd.DataFrame(embed, index=self.idx_coor_map.values())
        embed_features = pd.merge(features, embed_df, left_on='region_coor',
                                  right_index=True, how='inner')
        embed_features = pd.get_dummies(embed_features, columns=['region_coor'])
        keepcols = [
            str(c) for c in list(embed_features.columns)
            if 'region_coor' in str(c)
        ] + ['numBedrooms', 'numBathrooms', 'sqft']
        X = embed_features[keepcols].values
        y = embed_features['priceSqft'].values
    print(X.shape)
    self.X = X
    self.y = y
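To make the join explicit: a small self-contained sketch of the merge-on-`region_coor` pattern with toy data; the column names mirror the snippet above, but the values and the 4-dimensional embedding are made up:

import numpy as np
import pandas as pd

# Toy tabular features keyed by region coordinate.
features = pd.DataFrame({
    'numBedrooms': [2, 3],
    'sqft': [850, 1200],
    'priceSqft': [3.1, 2.7],
    'region_coor': ['(0, 0)', '(0, 1)'],
})

# Toy embedding matrix: one row per region, indexed by the same coordinates.
embed_df = pd.DataFrame(np.random.rand(2, 4), index=['(0, 0)', '(0, 1)'])

# Inner join attaches the embedding dimensions (columns 0..3) to each listing.
embed_features = pd.merge(features, embed_df, left_on='region_coor',
                          right_index=True, how='inner')
X = embed_features.drop(['priceSqft', 'region_coor'], axis=1).values
y = embed_features['priceSqft'].values
print(X.shape, y.shape)   # (2, 6) (2,)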
def get_features(self, input_data):
    input_data = input_data[~np.isnan(input_data.traffic)]
    # features = input_data[['region_coor', 'hour', 'Direction', 'SHAPE_Leng', 'traffic']]
    features = input_data[[
        'region_coor', 'Direction', 'SHAPE_Leng', 'traffic'
    ]]
    # features = pd.get_dummies(features, columns=['hour', 'Direction'])
    features = pd.get_dummies(features, columns=['Direction'])
    embed = self.get_embedding()
    if embed is not None:
        embed_df = pd.DataFrame(embed, index=self.idx_coor_map.values())
        embed_features = pd.merge(features, embed_df, left_on='region_coor',
                                  right_index=True, how='inner')
        embed_features.drop('region_coor', axis=1, inplace=True)
    else:
        embed = load_embedding(self.config['embedding_file'])
        embed_df = pd.DataFrame(embed, index=self.idx_coor_map.values())
        embed_features = pd.merge(features, embed_df, left_on='region_coor',
                                  right_index=True, how='inner')
        h_dim = int(self.config['hidden_dim_size'])
        drop_cols = list(range(h_dim)) + ['region_coor']
        embed_features.drop(drop_cols, axis=1, inplace=True)
    X = embed_features.drop(['traffic'], axis=1).values
    y = embed_features['traffic'].values
    print(X.shape)
    self.X = X
    self.y = y
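The baseline branch above leans on the fact that a DataFrame built from a bare NumPy array gets integer column labels 0..h_dim-1, so dropping `list(range(h_dim))` after the merge removes exactly the embedding dimensions. A small sketch with made-up dimensions:

import numpy as np
import pandas as pd

h_dim = 4
embed_df = pd.DataFrame(np.random.rand(3, h_dim))
print(list(embed_df.columns))            # [0, 1, 2, 3]

merged = embed_df.copy()
merged['region_coor'] = ['(0, 0)', '(0, 1)', '(1, 0)']
merged['traffic'] = [10.0, 12.5, 7.3]

# Dropping the integer-labelled columns strips every embedding dimension,
# leaving only the hand-crafted features for the no-embedding baseline.
baseline = merged.drop(list(range(h_dim)) + ['region_coor'], axis=1)
print(list(baseline.columns))            # ['traffic']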
    sys.exit()
else:
    print('constructing coding table')
    train_features, train_labels, f_map, _, c_map = \
        utils.generate_corpus_char(lines, if_shrink_c_feature=True,
                                   c_thresholds=args.mini_count,
                                   if_shrink_w_feature=False)
    f_set = {v for v in f_map}
    f_map = utils.shrink_features(f_map, train_features, args.mini_count)
    dt_f_set = functools.reduce(lambda x, y: x | y,
                                map(lambda t: set(t), dev_features), f_set)
    dt_f_set = functools.reduce(lambda x, y: x | y,
                                map(lambda t: set(t), test_features), dt_f_set)
    f_map, embedding_tensor, in_doc_words = utils.load_embedding(
        args.emb_file, ' ', f_map, dt_f_set, args.unk, args.word_embedding_dim,
        shrink_to_corpus=args.shrink_embedding)
    l_set = functools.reduce(lambda x, y: x | y,
                             map(lambda t: set(t), dev_labels))
    l_set = functools.reduce(lambda x, y: x | y,
                             map(lambda t: set(t), test_labels), l_set)

print('constructing dataset')
dataset, dataset_onlycrf = utils.construct_bucket_mean_vb_wc(
    train_features, train_labels, CRF_l_map, SCRF_l_map, c_map, f_map,
    SCRF_stop_tag=SCRF_l_map['<STOP>'], ALLOW_SPANLEN=args.allowspan,
    train_set=True)
dev_dataset = utils.construct_bucket_mean_vb_wc(
    dev_features, dev_labels, CRF_l_map, SCRF_l_map, c_map, f_map,
    SCRF_stop_tag=SCRF_l_map['<STOP>'], train_set=False)
test_dataset = utils.construct_bucket_mean_vb_wc(
    test_features, test_labels, CRF_l_map, SCRF_l_map, c_map, f_map,
    SCRF_stop_tag=SCRF_l_map['<STOP>'], train_set=False)
dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size, shuffle=True, drop_last=False)
                  for tup in dataset]
dataset_loader_crf = [torch.utils.data.DataLoader(tup, 3, shuffle=True, drop_last=False)
                      for tup in dataset_onlycrf] if dataset_onlycrf else None
dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False)
                      for tup in dev_dataset]
test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False)
                       for tup in test_dataset]

print('building model')
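The repeated `functools.reduce(lambda x, y: x | y, ...)` calls above simply take the union of a seed set with every sentence's token (or label) set; a tiny illustration with made-up sentences:

import functools

f_set = {'the', 'a'}
dev_features = [['the', 'cat'], ['a', 'dog']]

# Union of the seed set with each sentence's token set.
dt_f_set = functools.reduce(lambda x, y: x | y,
                            map(lambda t: set(t), dev_features), f_set)
print(dt_f_set)   # {'the', 'a', 'cat', 'dog'} (set order may vary)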
    plt.subplots_adjust(bottom=.6)
    plt.bar(range(len(d)), list(d.values()), align='center', color=colors)
    plt.xticks(range(len(d)), list(d.keys()), rotation=90, fontsize=14)
    plt.yticks(fontsize=8)
    plt.ylim(y_lim)
    plt.legend(loc='best')
    plt.savefig(fname)
    plt.clf()
    plt.close()


c = get_config()
region_grid = RegionGrid(config=c)
k = 5
H = load_embedding(c['embedding_file'])
X = region_grid.feature_matrix

nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(H)
distances, indices = nbrs.kneighbors(H)

for id, r in region_grid.regions.items():
    r_latent_nbrs = indices[r.index, 1:]
    all_nbrs_disconnected = 0
    for nbr in r_latent_nbrs:
        nbr_coor = region_grid.idx_coor_map[nbr]
        r_nbr = region_grid.regions[nbr_coor]
        # print("r: {}, nbr: {}".format(id, nbr_coor))
        # print("Is graph adjacent? {}".format(nbr_coor in r.adjacent))
        all_nbrs_disconnected += nbr_coor in r.adjacent
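For reference, a minimal scikit-learn sketch of the neighbor lookup above; with `kneighbors(H)` each row's first hit is the query point itself, which is why the loop slices `indices[r.index, 1:]`. The data here is random:

import numpy as np
from sklearn.neighbors import NearestNeighbors

H = np.random.rand(20, 8)                        # toy embedding, one row per region
nbrs = NearestNeighbors(n_neighbors=5, algorithm='ball_tree').fit(H)
distances, indices = nbrs.kneighbors(H)

print(indices[0, 0])           # 0 -- the nearest "neighbor" of a point is itself
latent_nbrs = indices[0, 1:]   # the 4 nearest other regions in latent space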
# In[ ]:


# ### emb

# In[12]:


import time

t0 = time.time()
path = '/ids/datasets/glove_vectors/glove.6B.300d.txt'  # can use 300d
word_to_idx, words, emb = utils.load_embedding(path)
print(time.time() - t0)

emb.shape, len(word_to_idx), len(words)  # last two: UNK, PAD


# In[ ]:


# ### test section: optional

# In[ ]:
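`utils.load_embedding` here is project-specific; a plain GloVe text file can be parsed along these lines (a sketch only, assuming one token followed by its vector per line; the original helper also appends the UNK and PAD rows mentioned above, which this sketch omits):

import numpy as np

def load_glove(path):
    # Parse a GloVe text file: each line is "word v1 v2 ... vD".
    words, vectors = [], []
    with open(path, encoding='utf-8') as f:
        for line in f:
            parts = line.rstrip().split(' ')
            words.append(parts[0])
            vectors.append(np.asarray(parts[1:], dtype=np.float32))
    word_to_idx = {w: i for i, w in enumerate(words)}
    return word_to_idx, words, np.vstack(vectors)

# word_to_idx, words, emb = load_glove('/ids/datasets/glove_vectors/glove.6B.300d.txt')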
f_map = utils.shrink_features(f_map, train_features, args.mini_count)
# union the f_set with dev_features and test_features.
dt_f_set = functools.reduce(lambda x, y: x | y,
                            map(lambda t: set(t), dev_features), f_set)
dt_f_set = functools.reduce(lambda x, y: x | y,
                            map(lambda t: set(t), test_features), dt_f_set)
f_map, embedding_tensor, in_doc_words = utils.load_embedding(
    args.emb_file, ' ', f_map, dt_f_set, args.unk, args.word_embedding_dim,
    shrink_to_corpus=args.shrink_embedding,
    embsave_filePath=nn_config['embsave_filePath'])
# print('in_doc_words: \n', in_doc_words)
l_set = functools.reduce(lambda x, y: x | y,
                         map(lambda t: set(t), dev_labels))
l_set = functools.reduce(lambda x, y: x | y,
                         map(lambda t: set(t), test_labels), l_set)
# end else

print('constructing dataset')
dataset, dataset_onlycrf = utils.construct_bucket_mean_vb_wc(
mse, mse_std, mae, mae_std, err_euclidean = mod_euclidean.cv_ols()
results.append(['euclidean', mse, mse_std, mae, mae_std])

# Run with taxi flow as weighted edges
W = region_grid.weighted_mtx
W = W[y_is_valid, :]
W = W[:, y_is_valid]
mod_flow = SimilarityModel(y_house, W)
mse, mse_std, mae, mae_std, err_flow = mod_flow.cv_ols()
results.append(['flow', mse, mse_std, mae, mae_std])

# Matrix factorization as similarity
nmf = load_embedding(c['nmf_file'])
W_nmf = np.matmul(nmf, np.transpose(nmf))
W_nmf = W_nmf[y_is_valid, :]
W_nmf = W_nmf[:, y_is_valid]
mod_nmf = SimilarityModel(y_house, W_nmf)
mse, mse_std, mae, mae_std, err_nmf = mod_nmf.cv_ols()
results.append(['matrix factorization', mse, mse_std, mae, mae_std])

# Run with DeepWalk as similarity measure
deepwalk = load_embedding(c['deepwalk_file'])
W_deepwalk = np.matmul(deepwalk, np.transpose(deepwalk))
W_deepwalk = W_deepwalk[y_is_valid, :]
W_deepwalk = W_deepwalk[:, y_is_valid]
mod_deepwalk = SimilarityModel(y_house, W_deepwalk)
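The embedding-based similarity matrices above are plain Gram matrices of the learned vectors; a small NumPy sketch of the construction and the boolean-mask subsetting, with toy shapes:

import numpy as np

n_regions, h_dim = 6, 4
emb = np.random.rand(n_regions, h_dim)   # toy embedding, one row per region
W = np.matmul(emb, emb.T)                # region-by-region similarity (Gram matrix)

y_is_valid = np.array([True, True, False, True, False, True])
W = W[y_is_valid, :]                     # keep rows with an observed target
W = W[:, y_is_valid]                     # keep the matching columns
print(W.shape)                           # (4, 4)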