    def __is_fname(self, arr_or_fname):
        # Accept either a path to a saved embedding or an already-loaded array.
        if isinstance(arr_or_fname, str):
            embed = load_embedding(arr_or_fname)
        else:
            embed = arr_or_fname

        return embed
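    # A minimal usage sketch of the str-or-array pattern above (the call sites
    # are assumptions; within the class this is simply):
    #
    #     embed = self.__is_fname('embeddings.txt')   # loaded via load_embedding
    #     embed = self.__is_fname(existing_ndarray)   # returned unchanged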
Example #2
    def __init__(self, config, mode):
        self.config = config
        self.mode = mode
        # The variable scope follows the concrete class name: 'BCNN' for the
        # BCNN model, 'ABCNN' for ABCNN.
        self.scope = self.__class__.__name__

        self.vocab_table, _, self.vocab_size = load_vocab(
            self.config.vocab_file)
        self.word_embedding = load_embedding(self.config.vocab_file,
                                             self.config.embedding_file)

        self._build_graph()
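    # A hedged instantiation sketch (the class name follows the scope comment
    # above; the call itself is an assumption, not shown in the source):
    #
    #     model = BCNN(config, mode='train')   # model.scope -> 'BCNN'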
    def get_features(self, input_data):
        features = input_data[[
            'numBedrooms', 'numBathrooms', 'sqft', 'region_coor', 'priceSqft',
            'lat', 'lon'
        ]]

        embed = self.get_embedding()
        if embed is not None:
            # Embedding already in memory: join it to the features on the
            # region coordinate.
            embed_df = pd.DataFrame(embed, index=self.idx_coor_map.values())
            embed_features = pd.merge(features,
                                      embed_df,
                                      left_on='region_coor',
                                      right_index=True,
                                      how='inner')
            embed_features.drop('region_coor', axis=1, inplace=True)

            X = embed_features.drop(['priceSqft', 'lat', 'lon'], axis=1).values
            y = embed_features['priceSqft'].values

        else:
            # No in-memory embedding: load it from disk. The merge below only
            # filters to regions covered by the embedding; the model then uses
            # one-hot region indicators rather than the embedding columns.
            embed = load_embedding(self.config['embedding_file'])
            embed_df = pd.DataFrame(embed, index=self.idx_coor_map.values())
            embed_features = pd.merge(features,
                                      embed_df,
                                      left_on='region_coor',
                                      right_index=True,
                                      how='inner')
            embed_features = pd.get_dummies(embed_features,
                                            columns=['region_coor'])
            keepcols = [
                str(c) for c in list(embed_features.columns)
                if 'region_coor' in str(c)
            ] + ['numBedrooms', 'numBathrooms', 'sqft']

            X = embed_features[keepcols].values
            y = embed_features['priceSqft'].values

        print(X.shape)
        self.X = X
        self.y = y
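    # A hedged usage sketch: get_features stores its outputs on the instance
    # rather than returning them (the model/listings names are hypothetical):
    #
    #     model.get_features(listings_df)
    #     X, y = model.X, model.y   # design matrix and price-per-sqft target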
    def get_features(self, input_data):
        input_data = input_data[~np.isnan(input_data.traffic)]  # drop rows with missing traffic
        #features = input_data[['region_coor', 'hour', 'Direction', 'SHAPE_Leng','traffic']]
        features = input_data[[
            'region_coor', 'Direction', 'SHAPE_Leng', 'traffic'
        ]]
        #features = pd.get_dummies(features, columns=['hour', 'Direction'])
        features = pd.get_dummies(features, columns=['Direction'])

        embed = self.get_embedding()
        if embed is not None:
            # Embedding already in memory: join it on the region coordinate.
            embed_df = pd.DataFrame(embed, index=self.idx_coor_map.values())
            embed_features = pd.merge(features,
                                      embed_df,
                                      left_on='region_coor',
                                      right_index=True,
                                      how='inner')
            embed_features.drop('region_coor', axis=1, inplace=True)

        else:
            # Load the embedding from disk; the inner merge again filters to
            # regions that have an embedding.
            embed = load_embedding(self.config['embedding_file'])
            embed_df = pd.DataFrame(embed, index=self.idx_coor_map.values())
            embed_features = pd.merge(features,
                                      embed_df,
                                      left_on='region_coor',
                                      right_index=True,
                                      how='inner')
            h_dim = int(self.config['hidden_dim_size'])
            # Drop the h_dim integer-labeled embedding columns (0..h_dim-1)
            # along with the raw region coordinate.
            drop_cols = list(range(h_dim)) + ['region_coor']
            embed_features.drop(drop_cols, axis=1, inplace=True)

        X = embed_features.drop(['traffic'], axis=1).values
        y = embed_features['traffic'].values

        print(X.shape)
        self.X = X
        self.y = y
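# A minimal, runnable sketch of the one-hot step used above (toy data; the
# column values are hypothetical):
import pandas as pd

toy = pd.DataFrame({'Direction': ['N', 'S', 'N'], 'traffic': [10.0, 12.0, 9.0]})
print(pd.get_dummies(toy, columns=['Direction']))
# -> columns: traffic, Direction_N, Direction_S; the original column is replaced.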
Example #5
            sys.exit()
    else:
        print('constructing coding table')

        train_features, train_labels, f_map, _, c_map = \
            utils.generate_corpus_char(lines, if_shrink_c_feature=True,
                                       c_thresholds=args.mini_count,
                                       if_shrink_w_feature=False)

        f_set = set(f_map)  # all word features currently in the map

        f_map = utils.shrink_features(f_map, train_features, args.mini_count)
        dt_f_set = functools.reduce(lambda x, y: x | y,
                                    map(lambda t: set(t), dev_features), f_set)
        dt_f_set = functools.reduce(lambda x, y: x | y,
                                    map(lambda t: set(t), test_features),
                                    dt_f_set)

        f_map, embedding_tensor, in_doc_words = utils.load_embedding(
            args.emb_file, ' ', f_map, dt_f_set, args.unk,
            args.word_embedding_dim, shrink_to_corpus=args.shrink_embedding)

        l_set = functools.reduce(lambda x, y: x | y,
                                 map(lambda t: set(t), dev_labels))
        l_set = functools.reduce(lambda x, y: x | y,
                                 map(lambda t: set(t), test_labels), l_set)

    print('constructing dataset')
    dataset, dataset_onlycrf = utils.construct_bucket_mean_vb_wc(
        train_features, train_labels, CRF_l_map, SCRF_l_map, c_map, f_map,
        SCRF_stop_tag=SCRF_l_map['<STOP>'], ALLOW_SPANLEN=args.allowspan,
        train_set=True)
    dev_dataset = utils.construct_bucket_mean_vb_wc(
        dev_features, dev_labels, CRF_l_map, SCRF_l_map, c_map, f_map,
        SCRF_stop_tag=SCRF_l_map['<STOP>'], train_set=False)
    test_dataset = utils.construct_bucket_mean_vb_wc(
        test_features, test_labels, CRF_l_map, SCRF_l_map, c_map, f_map,
        SCRF_stop_tag=SCRF_l_map['<STOP>'], train_set=False)

    dataset_loader = [torch.utils.data.DataLoader(tup, args.batch_size, shuffle=True, drop_last=False)
                      for tup in dataset]
    dataset_loader_crf = ([torch.utils.data.DataLoader(tup, 3, shuffle=True, drop_last=False)
                           for tup in dataset_onlycrf] if dataset_onlycrf else None)
    dev_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False)
                          for tup in dev_dataset]
    test_dataset_loader = [torch.utils.data.DataLoader(tup, 50, shuffle=False, drop_last=False)
                           for tup in test_dataset]
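    # A hedged iteration sketch: each bucket gets its own DataLoader, so a
    # training epoch loops over loaders and then over batches (the batch tuple
    # layout is defined by construct_bucket_mean_vb_wc and is not shown here):
    #
    #     for loader in dataset_loader:
    #         for batch in loader:
    #             ...  # unpack tensors, run forward/backward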

    print('building model')
Example #6
    plt.subplots_adjust(bottom=.6)
    plt.bar(range(len(d)), list(d.values()), align='center', color=colors)
    plt.xticks(range(len(d)), list(d.keys()), rotation=90, fontsize=14)
    plt.yticks(fontsize=8)
    plt.ylim(y_lim)
    plt.legend(loc='best')
    plt.savefig(fname)
    plt.clf()
    plt.close()


c = get_config()
region_grid = RegionGrid(config=c)
k = 5

H = load_embedding(c['embedding_file'])
X = region_grid.feature_matrix
nbrs = NearestNeighbors(n_neighbors=k, algorithm='ball_tree').fit(H)
distances, indices = nbrs.kneighbors(H)

for id, r in region_grid.regions.items():
    # k nearest neighbors in the latent space (index 0 is the region itself).
    r_latent_nbrs = indices[r.index, 1:]

    n_adjacent_nbrs = 0
    for nbr in r_latent_nbrs:
        nbr_coor = region_grid.idx_coor_map[nbr]
        r_nbr = region_grid.regions[nbr_coor]
        #print("r: {}, nbr: {}".format(id, nbr_coor))
        #print("Is graph adjacent? {}".format(nbr_coor in r.adjacent))
        # Count latent neighbors that are also adjacent in the region graph.
        n_adjacent_nbrs += nbr_coor in r.adjacent
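    # A hedged follow-up sketch (the notebook appears truncated here): report
    # the share of latent neighbors that are also graph-adjacent. This
    # aggregation is an assumption, not part of the original:
    #
    #     frac_adjacent = n_adjacent_nbrs / (k - 1)
    #     print("region {}: {:.0%} of latent neighbors adjacent".format(id, frac_adjacent))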


# ### emb

import time
t0 = time.time()
path = '/ids/datasets/glove_vectors/glove.6B.300d.txt' # can use 300d
word_to_idx, words, emb = utils.load_embedding(path)
print(time.time() - t0)
emb.shape, len(word_to_idx), len(words)
# The last two vocabulary entries are the special UNK and PAD tokens.
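# A minimal lookup sketch (assumes word_to_idx maps token -> row index of emb,
# consistent with the shapes inspected above):
vec = emb[word_to_idx['the']]
print(vec.shape)   # (300,) for glove.6B.300d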




# ### test section: optional
Example #8
            f_map = utils.shrink_features(f_map, train_features,
                                          args.mini_count)
            # Union f_set with the feature sets of dev_features and test_features.
            dt_f_set = functools.reduce(lambda x, y: x | y,
                                        map(lambda t: set(t), dev_features),
                                        f_set)
            dt_f_set = functools.reduce(lambda x, y: x | y,
                                        map(lambda t: set(t), test_features),
                                        dt_f_set)

            f_map, embedding_tensor, in_doc_words = utils.load_embedding(
                args.emb_file,
                ' ',
                f_map,
                dt_f_set,
                args.unk,
                args.word_embedding_dim,
                shrink_to_corpus=args.shrink_embedding,
                embsave_filePath=nn_config['embsave_filePath'])

            # print('in_doc_words: \n', in_doc_words)

            l_set = functools.reduce(lambda x, y: x | y,
                                     map(lambda t: set(t), dev_labels))
            l_set = functools.reduce(lambda x, y: x | y,
                                     map(lambda t: set(t), test_labels), l_set)
        #end else

        print('constructing dataset')
        dataset, dataset_onlycrf = utils.construct_bucket_mean_vb_wc(
            train_features, train_labels, CRF_l_map, SCRF_l_map, c_map, f_map,
            SCRF_stop_tag=SCRF_l_map['<STOP>'], ALLOW_SPANLEN=args.allowspan,
            train_set=True)

    mse, mse_std, mae, mae_std, err_euclidean = mod_euclidean.cv_ols()
    results.append(['euclidean', mse, mse_std, mae, mae_std])

    # Run with Taxi flow as weighted edges
    W = region_grid.weighted_mtx
    W = W[y_is_valid, :]
    W = W[:, y_is_valid]

    mod_flow = SimilarityModel(y_house, W)

    mse, mse_std, mae, mae_std, err_flow = mod_flow.cv_ols()
    results.append(['flow', mse, mse_std, mae, mae_std])

    # Matrix Factorization as similarity
    nmf = load_embedding(c['nmf_file'])
    W_nmf = np.matmul(nmf, np.transpose(nmf))
    W_nmf = W_nmf[y_is_valid, :]
    W_nmf = W_nmf[:, y_is_valid]
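    # Note: W_nmf[i, j] is the inner product of the NMF embedding rows for
    # regions i and j, i.e. an embedding-based similarity; equivalently
    # W_nmf = nmf @ nmf.T before the validity filtering above.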

    mod_nmf = SimilarityModel(y_house, W_nmf)
    mse, mse_std, mae, mae_std, err_nmf = mod_nmf.cv_ols()
    results.append(['matrix factorization', mse, mse_std, mae, mae_std])

    # Run with deepwalk as similarity measure
    deepwalk = load_embedding(c['deepwalk_file'])
    W_deepwalk = np.matmul(deepwalk, np.transpose(deepwalk))
    W_deepwalk = W_deepwalk[y_is_valid, :]
    W_deepwalk = W_deepwalk[:, y_is_valid]

    mod_deepwalk = SimilarityModel(y_house, W_deepwalk)