def get_people(self):
    yield chair_info(CHAIR_URL)

    for row in csv_reader(
            COUNCIL_PAGE, header=True,
            headers={'Cookie': 'incap_ses_168_68279=7jCHCh608QQSFVti3dtUAviu/1IAAAAAIRf6OsZL0NttnlzANkVb6w=='}):
        p = Legislator(
            name='%(FirstName0)s %(LastName0)s' % row,
            post_id='%(MUNIC)s Ward %(WARDNUM)s' % row,
            role='Councillor',
        )
        p.add_contact('email', row['email0'], None)
        p.add_contact('voice', row['Phone0'], 'legislature')
        p.add_extra('boundary_url', '/boundaries/%s-wards/ward-%s/' % (row['MUNIC'].lower(), row['WARDNUM']))
        p.add_source(COUNCIL_PAGE)
        yield p

        if row['FirstName1'].strip():
            p = Legislator(
                name='%s %s' % (row['FirstName1'], row['LastName1']),
                post_id='%(MUNIC)s Ward %(WARDNUM)s' % row,
                role='Councillor',
            )
            p.add_contact('email', row['email1'], None)
            p.add_contact('voice', row['Phone1'], 'legislature')
            p.add_extra('boundary_url', '/boundaries/%s-wards/ward-%s/' % (row['MUNIC'].lower(), row['WARDNUM']))
            p.add_source(COUNCIL_PAGE)
            yield p
def get_people(self):
    reader = csv_reader(COUNCIL_PAGE, header=True)
    for row in reader:
        kwargs = {'role': 'candidate'}
        email = None
        links = []
        extra = {}
        offices = []

        for k, v in row.items():
            v = v.strip()
            if not v:
                continue
            k = k.strip()
            match = re.search(r'\AOffice (\d): ', k)
            if match:
                index = int(match.group(1))
                while index > len(offices):
                    offices.append({})
                if k[10:] == 'Type':
                    offices[index - 1]['note'] = v
                elif k[10:] in CONTACT_TYPE_KEYS:
                    offices[index - 1][CONTACT_TYPE_KEYS[k[10:]]] = v
                else:
                    raise Exception(k)
            elif k == 'Party Name':
                kwargs['party'] = PARTY_MAP[v]
            elif k in KEYS:
                kwargs[KEYS[k]] = v
            elif k == 'Email':
                email = v
            elif k in LINKS_KEYS:
                links.append({'url': v, 'note': k})
            elif k in IGNORE_KEYS:
                continue
            elif k in EXTRA_KEYS:
                extra[re.sub(r'[^a-z0-9_]', '', k.lower().replace(' ', '_'))] = v
            else:
                raise Exception(k)

        contacts = []
        for office in offices:
            for _, contact_type in CONTACT_TYPE_KEYS.items():
                if office.get(contact_type):
                    # Python lists use append (the original used the JavaScript-style push),
                    # and the contact type must be stored under a quoted 'type' key.
                    contacts.append({'note': office['note'], 'type': contact_type, 'value': office[contact_type]})

        if 'name' in kwargs:
            p = Legislator(**kwargs)
            p.add_source(COUNCIL_PAGE)
            if email:
                p.add_contact('email', email, None)
            for link in links:
                p.add_link(**link)  # pass each link dict, not the whole list
            for contact in contacts:
                p.add_contact(**contact)
            for k, v in extra.items():
                p.add_extra(k, v)
            yield p
def get_files(folds_dir, split, fold):
    splits = ['train', 'valid', 'test']
    csv_dir = join(folds_dir, 'split_{}'.format(split), 'fold_{}'.format(fold))
    csv_files = [
        join(csv_dir, '{}_s_{}_f_{}.csv'.format(s, split, fold))
        for s in splits
    ]
    split_files = [csv_reader(csv_file) for csv_file in csv_files]
    return split_files
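# Hypothetical usage of get_files above: the folds directory name and the
# split/fold indices are illustrative only, and csv_reader is assumed to
# return the parsed rows of each CSV file.
train_rows, valid_rows, test_rows = get_files('folds', split=0, fold=1)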
def load_src():
    count = 0
    for row in csv_reader('nounsword.csv'):
        count += 1
        if count < start_row:
            continue
        ret = bd_trans(row[0])
        print(ret)
        trans = ret.get('trans_result')
        write_row_csv('trans_word.csv', trans[0].values())
def csv():
    """
    HW with CSV
    :return: template
    """
    av_height, av_weight = utils.csv_reader()
    return render_template('csv.html', title='CSV', av_height=av_height, av_weight=av_weight)
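# Minimal sketch of what the utils.csv_reader() helper used above might look
# like: it is expected to return average height and weight. The CSV path and
# the 'height'/'weight' column names are assumptions, not the project's code.
import csv as csv_module

def csv_reader(path='data/people.csv'):
    heights, weights = [], []
    with open(path, newline='') as f:
        for row in csv_module.DictReader(f):
            heights.append(float(row['height']))
            weights.append(float(row['weight']))
    # Averages consumed by the csv.html template above.
    return sum(heights) / len(heights), sum(weights) / len(weights)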
def get_spike_detection():
    """
    Get all of the available hashtags
    """
    word_type = request.get_json()["word_type"]
    time_frame = request.get_json()["time_frame"]
    file_name = "./files/{}/{}_totalcounts_{}.csv".format(
        time_frame, word_type, time_frame)
    # print(file_name)
    data = utils.csv_reader(file_name)
    res = {"data": data}
    return jsonify(res), 201
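# Example client call for the spike-detection view above. The endpoint URL and
# the payload values are assumptions (the Flask route decorator is not shown),
# and the response 'data' is assumed to be a JSON list.
import requests

resp = requests.post(
    'http://localhost:5000/spike_detection',
    json={'word_type': 'hashtag', 'time_frame': 'daily'},
)
print(resp.status_code, resp.json()['data'][:5])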
def get_people(self):
    for row in csv_reader(COUNCIL_PAGE, header=True):
        p = Legislator(
            name='%(First Name)s %(Last Name)s' % row,
            post_id='Vancouver',
            role=row['Elected Office'],
            gender=row['Gender'],
            image=row['Photo URL'],
        )
        p.add_contact('email', row['Email'], None)
        p.add_contact('voice', row['Phone'], 'legislature')
        p.add_contact('fax', row['Fax'], 'legislature')
        p.add_contact('address', '%(Address line 1)s\n%(Locality)s %(Province)s %(Postal Code)s' % row, 'legislature')
        p.add_source(COUNCIL_PAGE)
        p.add_source(row['URL'])
        yield p
def main(args):
    '''Main function for imputation with GINN

    Args:
        - from_id: start index into the file list
        - to_id: end index into the file list
        - fold_size: fold size, starting from index 1

    Returns:
        - writes the imputed data to disk
    '''
    # Input parameters
    from_id = args.from_id
    to_id = args.to_id
    fold_size = args.fold_size

    # Initial parameters
    missingness_flag = [0, 10, 20, 30, 40, 50]  # t% missing data
    seed = 42

    # Main program
    for i_file in range(from_id, to_id):
        file_name = file_list[i_file]
        print(datetime.datetime.now(), "File {}: {}".format(i_file, file_name))
        for i in tqdm(range(1, fold_size)):
            for missingness in missingness_flag:
                (D_miss_train, D_miss_test) = csv_reader(data_K_Fold, file_name, i,
                                                         method='data_missing', missingness=missingness)
                x_train = D_miss_train[:, :(D_miss_train.shape[1] - 1)]
                y_train = D_miss_train[:, -1]
                x_test = D_miss_test[:, :(D_miss_test.shape[1] - 1)]
                y_test = D_miss_test[:, -1]

                missing_train, missing_train_mask = mask_generation(x_train)
                missing_test, missing_test_mask = mask_generation(x_test)

                cx_train = np.c_[missing_train, y_train]
                cx_test = np.c_[missing_test, y_test]
                mask_train = np.c_[missing_train_mask, np.ones(y_train.shape)]
                mask_test = np.c_[missing_test_mask, np.ones(y_test.shape)]

                # Here we preprocess the data by applying a one-hot encoding to the categorical
                # variables. We get the encoded dataset, three masks that indicate the missing
                # features and whether each feature is categorical or numerical, plus the new
                # columns for the categorical variables with their one-hot range.
                numerical_columns = dictionary_datasets['{}'.format(file_name)]['numerical']
                categorical_columns = dictionary_datasets['{}'.format(file_name)]['categorical']
                [oh_data, oh_mask, oh_numerical_mask, oh_categorical_mask,
                 oh_categorical_columns, classes_dictionary] = data2onehot(
                    np.r_[cx_train, cx_test], np.r_[mask_train, mask_test],
                    numerical_columns, categorical_columns)

                # We scale the features with a min-max scaler that preserves the one-hot encoding.
                oh_data_train = oh_data[:x_train.shape[0], :]
                oh_data_test = oh_data[x_train.shape[0]:, :]

                oh_mask_train = oh_mask[:x_train.shape[0], :]
                # Slice the numerical/categorical masks returned by data2onehot
                # (the original sliced oh_mask for all three masks).
                oh_num_mask_train = oh_numerical_mask[:x_train.shape[0], :]
                oh_cat_mask_train = oh_categorical_mask[:x_train.shape[0], :]

                oh_mask_test = oh_mask[x_train.shape[0]:, :]
                oh_num_mask_test = oh_numerical_mask[x_train.shape[0]:, :]
                oh_cat_mask_test = oh_categorical_mask[x_train.shape[0]:, :]

                # Scaler
                scaler_train = preprocessing.MinMaxScaler()
                oh_data_train = scaler_train.fit_transform(oh_data_train)
                scaler_test = preprocessing.MinMaxScaler()
                oh_data_test = scaler_test.fit_transform(oh_data_test)

                # Now we are ready to impute the missing values on the training set!
                imputer_train = GINN(
                    oh_data_train,
                    oh_mask_train,
                    oh_num_mask_train,
                    oh_cat_mask_train,
                    oh_categorical_columns,
                    numerical_columns,
                    categorical_columns
                )

                # Transform
                imputer_train.fit(epochs=1)
                imputed_train = scaler_train.inverse_transform(imputer_train.transform())

                # Impute the test set
                imputer_train.add_data(
                    oh_data_test,
                    oh_mask_test,
                    oh_num_mask_test,
                    oh_cat_mask_test
                )
                imputed_test = imputer_train.transform()
                imputed_test = scaler_test.inverse_transform(imputed_test[x_train.shape[0]:])
                # print(imputed_train[0])

                # Rebuild the original column layout
                if categorical_columns != []:
                    # Rebuild train
                    D_inverse_tr = inverse_onehot(cx_train.shape, imputed_train,
                                                  oh_categorical_columns, classes_dictionary)
                    imputed_train = order_by_address(D_inverse_tr,
                                                     num_cols=numerical_columns,
                                                     cat_cols=categorical_columns)
                    # Rebuild test
                    D_inverse_te = inverse_onehot(cx_test.shape, imputed_test,
                                                  oh_categorical_columns, classes_dictionary)
                    imputed_test = order_by_address(D_inverse_te,
                                                    num_cols=numerical_columns,
                                                    cat_cols=categorical_columns)

                # Check the approximation of each element
                imputed_train_checked = check_approximation(imputed_train, cx_train)
                imputed_test_checked = check_approximation(imputed_test, cx_test)

                # Write result
                imputed_path = os.path.join(imputed_dataset, file_name)
                write_file(imputed_train_checked, imputed_test_checked, imputed_path,
                           'GINN', missingness, i)
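# Sketch of the mask_generation helper assumed by the GINN pipeline above: it
# is expected to return the feature matrix with missing entries filled plus a
# binary mask (1 = observed, 0 = missing). The NaN-marker convention and the
# zero fill value are assumptions.
import numpy as np

def mask_generation(x):
    mask = (~np.isnan(x)).astype(int)    # 1 where a value is present
    missing = np.nan_to_num(x, nan=0.0)  # missing entries replaced by 0
    return missing, mask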
levels = MaxNLocator(nbins=15).bin_boundaries(z.min(), z.max())
cmap = plt.get_cmap('PiYG')
plt.contourf(x[:-1, :-1] + dx / 2., y[:-1, :-1] + dy / 2., z,
             levels=levels, cmap=cmap)
plt.colorbar()
plt.title('Density estimation by SOINN')
plt.show()


if __name__ == '__main__':
    from utils import csv_reader
    r = csv_reader('reg_intro.csv')
    X, y = r.separate_label()
    the_reg = ISOINNregressor(smooth=-0.4, K=15)
    the_reg.fit(X, y)
    # the_reg.draw_density()
    test_x = []
    draw_x = []
    for i in range(50):
        test_x.append(array([i / 50.0]))
        draw_x.append(i / 50.0)
    test_y = the_reg.predict(test_x)

    import matplotlib.pyplot as plt
    fig = plt.figure()
    ax = fig.add_subplot(111)
    ax.plot(draw_x, test_y, 'k-')
    plt.axis('off')
def main(args):
    '''Main function for preparing and processing the data

    Args:
        - from_id: start index into the file list
        - to_id: end index into the file list
        - review_missing_flag: if True, create the missing-data versions
        - review_imputed_flag: if True, impute the missing values

    Returns:
        - writes the missing-data files
        - writes the imputed values
    '''
    # Flags
    review_missing_flag = args.review_missing_flag
    review_imputed_flag = args.review_imputed_flag

    # Parameters
    from_id = args.from_id
    to_id = args.to_id
    n_iterations = args.n_iterations
    fold_size = 2 * args.n_iterations + 1  # fold_size starts from index 1
    random.seed(0)
    missingness_flag = [0, 10, 20, 30, 40, 50]  # t% missing data
    binary_flag = [1, 0, 0, 0, 1, 1]  # 1 activates an imputation algorithm
    imputation_flag = [i for i, impf in enumerate(binary_flag) if impf == 1]

    # Load data and introduce missingness
    for i_file in range(from_id, to_id):
        file_name = file_list[i_file]
        print(datetime.datetime.now(), "File {}: {}".format(i_file, file_name))

        # Data processing
        if review_missing_flag:
            # Data loader
            D_train = np.loadtxt(data_folder + '/train1/' + file_name + '_train1.dat', delimiter=',')
            D_val = np.loadtxt(data_folder + '/val/' + file_name + '_val.dat', delimiter=',')
            D_test = np.loadtxt(data_folder + '/test/' + file_name + '_test.dat', delimiter=',')
            X_full = np.concatenate((D_train, D_val, D_test), axis=0)

            # K-Fold cross-validation, first pass
            kf_1 = KFold(n_splits=n_iterations, shuffle=True)
            kf_1.split(X_full)
            # K-Fold cross-validation, second pass
            kf_2 = KFold(n_splits=n_iterations, shuffle=True)
            kf_2.split(X_full)

            # Save CSV files train(i)-test(i) for i in <1, iterations>
            K_Fold_cross_validation(kf_1, X_full, data_K_Fold, file_name, 0)
            # Save CSV files train(i)-test(i) for i in <iterations, 2 x iterations>
            K_Fold_cross_validation(kf_2, X_full, data_K_Fold, file_name, n_iterations)

            # Load the K-Fold data and generate the missing-data versions
            for i in tqdm(range(1, fold_size)):
                (D_train, D_test) = csv_reader(data_K_Fold, file_name, i,
                                               method='original_data', missingness=None)
                for missingness in missingness_flag:
                    D_train_missing = missing_data_generation(D_train, missingness)
                    D_test_missing = missing_data_generation(D_test, missingness)
                    write_file(D_train_missing, D_test_missing, data_K_Fold,
                               file_name, missingness, i)

        # Load the processed data and build the imputed datasets
        if review_imputed_flag:
            for i in tqdm(range(1, fold_size)):
                for missingness in missingness_flag:
                    (D_missing_train, D_missing_test) = csv_reader(data_K_Fold, file_name, i,
                                                                   method='data_missing',
                                                                   missingness=missingness)
                    for imp_flag in imputation_flag:
                        imputed_train, imputed_test, imp_name = imputation_method(
                            D_missing_train, D_missing_test, imp_flag, missingness)
                        imputation_path = os.path.join(file_name, imp_name)
                        write_file(imputed_train, imputed_test, imputed_dataset,
                                   imputation_path, missingness, i)
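# Sketch of the missing_data_generation step used above, under the assumption
# that roughly t% of the feature values are removed completely at random and
# marked with NaN, while the label column (assumed to be last) is kept intact.
# The helper name matches the call above, but this body is illustrative only.
import numpy as np

def missing_data_generation(data, missingness):
    out = data.astype(float).copy()
    n_rows, n_cols = out.shape
    n_features = n_cols - 1                                 # last column is the label
    n_missing = int(n_rows * n_features * missingness / 100)
    rows = np.random.randint(0, n_rows, n_missing)
    cols = np.random.randint(0, n_features, n_missing)
    out[rows, cols] = np.nan                                # mark removed entries
    return out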
def main(args): """ Main function for classification with imputed dataset Args: - from_id: start index to file list - to_id: end index to file list - fold_size: fold_size start from index 1 Returns: - """ # Input parameters from_id = args.from_id to_id = args.to_id fold_size = args.fold_size # Initial parameters binary_classifiers = [1, 1, 1, 1] # 1: Activate or 0: Deactivate classfication_flag = [ i for i, clsf in enumerate(binary_classifiers) if clsf == 1 ] missingness_flag = [0, 10, 20, 30, 40, 50] # t% missing data # Loading data for i_file in range(from_id, to_id): file_name = file_list[i_file] print(datetime.datetime.now(), "File {}: {}".format(i_file, file_name)) file_data_path = os.path.join(imputed_dataset, file_name) result_data_path = os.path.join(result_path, file_name) for name_imputation in os.listdir(file_data_path): for missing in missingness_flag: for clf_flag in classfication_flag: dict_eval = { 'accuracy': [], 'p_macro': [], 'r_macro': [], 'f1_macro': [], 'p_micro': [], 'r_micro': [], 'f1_micro': [] } for i in range(1, fold_size): D_train, D_test = csv_reader(file_data_path, name_imputation, i, method='data_missing', missingness=missing) features_D_train = D_train[:, :-1] labels_D_train = D_train[:, -1].astype(np.int32) features_D_test = D_test[:, :-1] labels_D_test = D_test[:, -1].astype(np.int32) classes = np.unique(labels_D_test) n_classes = len(classes) labels_predicted, name_classification_algo = model_prediction( features_D_train, features_D_test, labels_D_train, clf_flag, n_classes) accuracy, p_macro, r_macro, f1_macro, p_micro, r_micro, f1_micro = evaluation_report( labels_predicted, labels_D_test) dict_eval['accuracy'].append(accuracy) dict_eval['p_macro'].append(p_macro) dict_eval['r_macro'].append(r_macro) dict_eval['f1_macro'].append(f1_macro) dict_eval['p_micro'].append(p_micro) dict_eval['r_micro'].append(r_micro) dict_eval['f1_micro'].append(f1_micro) write_report(dict_eval, result_data_path, name_imputation, missing, name_classification_algo)