def main(argv):
    DATA_PATH = Path(FLAGS.DATA_PATH)

    # --------------------------------------------------------------
    # DOWNLOAD DATA
    # --------------------------------------------------------------
    for url in URLS:
        download_data(url=url, fname=DATA_PATH / url2name(url))

    # --------------------------------------------------------------
    # UNTAR GZ FILES
    # --------------------------------------------------------------
    files = list(sorted(DATA_PATH.rglob('*.gz')))
    for f in files:
        untar_file(f, DATA_PATH)

    # --------------------------------------------------------------
    # CONVERT THE RAW DATA INTO TF-RECORDS
    # --------------------------------------------------------------
    files = list(sorted(DATA_PATH.rglob('*batch_[0-9]'))) + \
            list(sorted(DATA_PATH.rglob('*test_batch')))
    for f in files:
        convert_to_tf_records(f, overwrite=True)

    # --------------------------------------------------------------
    # SYNC DATA TO BUCKET
    # --------------------------------------------------------------
    sync_data_to_bucket(FLAGS.dest_dir)
def main(_args):
    '''
    Download colocation data.

    Parameters
    ----------
    _args : list
        Arg list: secret_key, username and pass dir, csv file specifying
        download countries and ids, outdir.

    Returns
    -------
    None.
    '''
    # prompt for credentials; this block was redacted in the source, so the
    # password prompt and keys construction below are assumed to mirror the
    # [username, password] credential list used by download_data
    username = input("Username: ")
    password = input("Password: ")
    keys = [username, password]

    update = input("Update datasets? (y/n): ")
    if update == 'y':
        update = True
    elif update == 'n':
        update = False
    else:
        sys.exit('Unknown update input. Choose "y", "n". Exiting.')

    # read target datasets
    data_target = pd.read_csv(_args[1])

    for i, dataset_id in enumerate(data_target['id']):
        country_output = _args[len(_args) - 1] + "/" + data_target.loc[i, 'country'] + '_mobility'
        base_url = ('https://www.facebook.com/geoinsights-portal/downloads/vector/?id='
                    + str(dataset_id) + '&ds=')
        earliest_date = datetime(int(data_target.loc[i, 'year']),
                                 int(data_target.loc[i, 'month']),
                                 int(data_target.loc[i, 'day']),
                                 int(data_target.loc[i, 'hour']))
        data_dates = get_file_dates(earliest_date)

        if update:
            data_dates = list(compress(data_dates,
                                       [x > get_update_date(country_output) for x in data_dates]))
            if len(data_dates) == 0:
                sys.exit('No datasets to download. Exiting.')

        urls = get_urls(base_url, data_dates)
        download_data(urls, keys)
        move_most_recent_files(country_output, urls)

    print('Success.')
def check_predictor(self):
    """ Check if the predictor exists. If not, download it. """
    if not os.path.exists(self.predictor_path):
        print('Downloading missing predictor.')
        url = self.configuration.get('landmarks_predictor_download_url')
        download_data(url, self.predictor_path + '.bz2', 64040097)

        print(f'Decompressing downloaded file into {self.predictor_path}')
        with bz2.BZ2File(self.predictor_path + '.bz2') as fr, open(
                self.predictor_path, 'wb') as fw:
            shutil.copyfileobj(fr, fw)
def main(_args):
    '''
    Download colocation data.

    Parameters
    ----------
    _args : list
        Arg list: secret_key, username and pass dir, csv file specifying
        download countries and ids, outdir.

    Returns
    -------
    None.
    '''
    with open(_args[1], 'rb') as file:
        fernet = Fernet(file.read())
    with open(_args[2] + '/username.key', 'rb') as file:
        username = file.read()
    with open(_args[2] + '/password.key', 'rb') as file:
        password = file.read()
    keys = [
        fernet.decrypt(username).decode("utf-8"),
        fernet.decrypt(password).decode("utf-8")
    ]

    # read target datasets
    data_target = pd.read_csv(_args[3])

    for i, dataset_id in enumerate(data_target['id']):
        base_url = 'https://www.facebook.com/geoinsights-portal/downloads/vector/?id=' + str(
            dataset_id) + '&ds='
        earliest_date = datetime(int(data_target.loc[i, 'year']),
                                 int(data_target.loc[i, 'month']),
                                 int(data_target.loc[i, 'day']),
                                 int(data_target.loc[i, 'hour']))
        data_dates = get_file_dates(earliest_date)
        urls = get_urls(base_url, data_dates)
        download_data(urls, keys)
        move_most_recent_files(
            _args[len(_args) - 1] + "/" + data_target.loc[i, 'country'] + '_mobility',
            urls)

    print('Success.')
def aggregate_and_evaluate(args):
    print('Merging generated SMILES into a single file...')
    smiles = []
    for rank in range(args['num_processes']):
        with open(
                os.path.join(args['log_dir'], str(rank), 'generated_smiles.txt'),
                'r') as f:
            rank_smiles = f.read().splitlines()
        smiles.extend(rank_smiles)

    with open(os.path.join(args['log_dir'], 'generated_smiles.txt'), 'w') as f:
        for s in smiles:
            f.write(s + '\n')

    print('Removing temporary dirs...')
    remove_worker_tmp_dir(args)

    # Summarize training molecules
    print('Summarizing training molecules...')
    train_file = '_'.join([args['dataset'], 'DGMG_train.txt'])
    if not os.path.exists(train_file):
        download_data(args['dataset'], train_file)
    with open(train_file, 'r') as f:
        train_smiles = f.read().splitlines()
    train_summary = summarize_molecules(train_smiles, args['num_processes'])
    with open(os.path.join(args['log_dir'], 'train_summary.pickle'), 'wb') as f:
        pickle.dump(train_summary, f)

    # Summarize generated molecules
    print('Summarizing generated molecules...')
    generation_summary = summarize_molecules(smiles, args['num_processes'])
    with open(os.path.join(args['log_dir'], 'generation_summary.pickle'), 'wb') as f:
        pickle.dump(generation_summary, f)

    # Stats computation
    print('Preparing generation statistics...')
    valid_generated_smiles = generation_summary['smile']
    unique_generated_smiles = get_unique_smiles(valid_generated_smiles)
    unique_train_smiles = get_unique_smiles(train_summary['smile'])
    novel_generated_smiles = get_novel_smiles(unique_generated_smiles,
                                              unique_train_smiles)

    with open(os.path.join(args['log_dir'], 'generation_stats.txt'), 'w') as f:
        f.write('Total number of generated molecules: {:d}\n'.format(
            len(smiles)))
        f.write('Validity among all: {:.4f}\n'.format(
            len(valid_generated_smiles) / len(smiles)))
        f.write('Uniqueness among valid ones: {:.4f}\n'.format(
            len(unique_generated_smiles) / len(valid_generated_smiles)))
        f.write('Novelty among unique ones: {:.4f}\n'.format(
            len(novel_generated_smiles) / len(unique_generated_smiles)))
def main():
    if not os.path.exists(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    download_data(data_dir=DATA_DIR, labels_file=LABELS_FILE, base_url=BASE_URL)

    model_save_path = os.path.join(MODEL_DIR, MODEL_FILE)
    classes_file_path = os.path.join(MODEL_DIR, CLASSES_FILE)
    model = train_model(DATA_DIR, model_save_path, classes_file_path)

    print('>>> All done! Model saved to <%s>' % model_save_path)
def _create_z_values(model, ticker, stats_data=None,
                     auto_update_tolerances=False, *args, **kwargs):
    freq_range, frequencies = _create_freq()
    try:
        timezone = kwargs['timezone']
    except KeyError:
        timezone = None
    try:
        start = kwargs['start']
    except KeyError:
        start = None
    _, means, stds = _get_price_moves_and_stats(ticker=ticker,
                                                stats_data=stats_data,
                                                timezone=timezone,
                                                start=start)
    if auto_update_tolerances:
        utils.print_issue('STATS-INFO', 'Auto update of tolerances!')
        current_value = utils.download_data(
            tickers=ticker,
            start=(pd.Timestamp.today() - pd.Timedelta('1 days')),
            value='Close').values[-1]
        current_tols = model.break_values[ticker] - current_value
        utils.print_issue('STATS-INFO', 'Current value: {}!'.format(current_value))
        utils.print_issue('STATS-INFO', 'New tolerances: {}!'.format(current_tols))
        tol_unten = np.sort(current_tols)[0]
        tol_oben = np.sort(current_tols)[1]
    else:
        tol_unten = np.sort(model.tolerances[ticker])[0]
        tol_oben = np.sort(model.tolerances[ticker])[1]
    z_values_unten = (tol_unten - means) / stds
    z_values_oben = (tol_oben - means) / stds
    return np.array([z_values_unten, z_values_oben]), np.array([tol_unten, tol_oben]), means
def get_ureport_by_hash(self, master_hash, source=None):
    json_result = None
    if isinstance(self.url, dict):
        if source is not None:
            json_result = download_data(url=source, data=master_hash)
    return json_result
def _get_price_moves_and_stats(ticker, stats_data=None, timezone=None, start=None):
    if timezone is None:
        timezone = 'Europe/London'
    if start is None:
        start = pd.Timestamp(2019, 1, 1, 0)
    if stats_data is None:
        stats_data = utils.download_data(tickers=ticker, start=start,
                                         interval='60m', value='Close')
    freq_range, frequencies = _create_freq()
    price_movements = dict.fromkeys(frequencies)
    means = np.zeros(freq_range.shape)
    stds = np.zeros(freq_range.shape)
    for index, freq in enumerate(frequencies):
        current_time = start
        current_rng = pd.date_range(start=current_time, end=pd.Timestamp.today(),
                                    freq=freq, tz=timezone, name='Datetime')
        current_moves = stats_data[current_rng].pct_change()
        current_moves = current_moves[~np.isnan(current_moves)]
        means[index] = np.mean(current_moves)
        stds[index] = np.std(current_moves)
        price_movements[freq] = current_moves
    hours = np.flip(np.arange(1, 25, 1))
    df = pd.DataFrame()
    df['hours'] = pd.Series(hours)
    df['pct_change'] = pd.Series(np.flip(means))
    first_key = list(price_movements.values())[1]
    return price_movements, means, stds
def meetingSummary(self, isTape=False):
    table = self._create_table()
    row = ET.Element("tr")
    row.append(self._create_cell(""))

    # Site names
    if isTape:
        row.append(self._create_cell("T0_CH_CERN", header=True))
    self._add_site_names(table, row)

    # Pledges
    row = ET.Element("tr")
    row.append(self._create_cell("Pledges [TB]", header=True))
    total = 0
    if isTape:
        row.append(self._create_cell(to_3f(T0CERNPLEDGE), color=True))
        total += T0CERNPLEDGE
    for site in self.sites:
        data = PLEDGES[site.name] if site.name in PLEDGES else 0
        total += data
        row.append(self._create_cell(to_3f(data), color=True))
    row.append(self._create_cell(to_3f(total), color=True))
    table.append(row)

    # Used (PhEDEx)
    url = "https://cmsweb.cern.ch/phedex/datasvc/json/prod/nodeusage"
    interests = re.compile(r".*_node_bytes$")
    row = ET.Element("tr")
    row.append(self._create_cell("Used (PhEDEx) [TB]", header=True))
    grand_total = 0
    if isTape:
        row.append(self._create_cell(to_TB(T0CERNUSED)))
        grand_total += T0CERNUSED
    for site in self.sites:
        params = {"node": "{site}".format(site=site.name)}
        data = json.loads(utils.download_data(url, params))["phedex"]["node"][0]
        total = 0
        for key, value in data.items():
            if interests.match(key):
                total += int(value)
        if site.name == FNALTAPE:
            row.append(self._create_cell(to_TB(T1FNALMSSUSED)))
            grand_total += T1FNALMSSUSED
        else:
            row.append(self._create_cell(to_TB(total)))
            grand_total += total
    row.append(self._create_cell(to_TB(grand_total)))
    table.append(row)

    return table
def sync_world_statuses():
    log('Checking for updates', COLLECTION_NAME, DATA_URL)
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_file = '{}/{}'.format(tmp_dir, 'world.csv')
        filename = download_data(DATA_URL, temp_file)
        statuses = read_csv(filename)
        sync_with_db(statuses)
def sync_ontario_cases():
    log('Checking for updates', COLLECTION_NAME, DATA_URL)
    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_file = '{}/ontario_cases.csv'.format(tmp_dir)
        filename = download_data(DATA_URL, temp_file)
        cases = read_csv(filename)
        sync_with_db(cases)
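# The download_data(url, dest) helper used by the two sync functions above is not
# shown here. A minimal sketch of such a helper, assuming it streams the remote CSV
# to the given path with requests and returns that path; only the name and signature
# come from the call sites above, the body is an assumption, not the project's code.
import requests


def download_data(url, dest):
    """Stream the resource at `url` to the local file `dest` and return its path."""
    with requests.get(url, stream=True, timeout=60) as response:
        response.raise_for_status()
        with open(dest, 'wb') as fh:
            for chunk in response.iter_content(chunk_size=8192):
                fh.write(chunk)
    return dest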
def __iter__(self):
    n_batches = np.ceil(self.metadata.shape[0] / self.batch_size).astype(int)
    for i in range(n_batches):
        names = self.metadata.iloc[i * (self.batch_size):(i + 1) * self.batch_size].Id
        tensor = torch.from_numpy(download_data(names)).permute(1, 0, 2, 3).float()
        yield tensor
def get_data(self, value='Close', filter_date_range=None, *args, **kwargs):
    '''
    Function to get asset historical prices.
    '''
    self.data = utils.download_data(tickers=self.tickers, value=value, *args, **kwargs)
    # if only one ticker in self.tickers: download_data returns a Series!
    if not isinstance(self.data, pd.core.frame.DataFrame):
        self.data = self.data.to_frame(name=self.tickers[0])
    if filter_date_range is not None:
        self.apply_date_filter(filter_date_range=filter_date_range)
def download_date(self):
    self._browser_create()
    self._search_loc()
    self._select_loc()
    DataSource.setSource(self._rp5source)
    self._rp5source = DataSource.getSource()
    self._select_data_source()

    if self._rp5source.lower() == "metar":
        downTabElem = self._browser.find_element_by_id('tabMetarDLoad')
    elif self._rp5source.lower() == "archive":
        downTabElem = self._browser.find_element_by_id('tabSynopDLoad')
    else:
        self._exit_msg = "Rp5 Source is not from available options, pls retry, exiting!"
        self._browser_exit(1)
    downTabElem.click()
    time.sleep(3)

    beginDateElem = self._browser.find_element_by_id('calender_dload')
    beginDateElem.clear()
    Log.info("Start Date is - {}".format(self._beginDate))
    beginDateElem.send_keys(self._beginDate)

    endDateElem = self._browser.find_element_by_id('calender_dload2')
    endDateElem.clear()
    Log.info("End Date is - {}".format(self._endDate))
    endDateElem.send_keys(self._endDate)

    generateDownloadElem = self._browser.find_elements_by_class_name('archButton')[1]
    generateDownloadElem.click()
    time.sleep(5)

    try:
        downloadElem = self._browser.find_element_by_link_text('Download')
        fileUrl = downloadElem.get_attribute('href')
        Log.info("File URL is - " + fileUrl)
    except:
        self._exit_msg = "Download link not found, exiting ..."
        self._browser_exit(1)

    fullPath = download_data(fileUrl, self._fileName, self._dirPath)
    Log.info("Downloading Data to {} ...".format(fullPath))
    Log.info("rp5 data downloaded successfully for {} location from {} to {}".format(
        self._loc, self._beginDate, self._endDate))
    self._browser.close()
    return fullPath
def main(args, config):
    start_time = time.time()

    # 1) Read data from database
    print(20 * '=')
    print('1. Downloading data...')
    data = download_data(user=args.user, password=args.password, tb_name='sketch.train_data_2')
    oh = download_data(user=args.user, password=args.password, tb_name='sketch.ode_school')

    # 2) Data preparation
    print(20 * '=')
    print('2. Data processing...')
    data = data_preparation(data, oh)
    # TODO: args can be the filter of which vars to use
    train_data, test_data = train_val_test_split(data, config['min_train_cohort'],
                                                 config['min_test_cohort'])

    # 3) Model
    print(20 * '=')
    print('3. Training model...')
    model = model_dict[config['model']]
    clf = model(train_data, args=config['hyperparameters'])

    # 4) Compute metric on the validation set
    print(20 * '=')
    print('4. Evaluating model on the validation set...')
    metric = metric_dict[config['metric']](clf, test_data)
    print('{}: {}'.format(config['metric'], metric))

    # Print train and test accuracy
    train_accuracy = accuracy(clf, train_data)
    print('Train accuracy: ', train_accuracy)
    test_accuracy = accuracy(clf, test_data)
    print('Test accuracy: ', test_accuracy)

    # 5) Upload result to postgres
    print(20 * '=')
    print('5. Uploading result to database...')
    upload_result(config['model_name'], config['metric'], metric, args.user, args.password)

    print(20 * '=')
    print('Finished in {} seconds'.format(time.time() - start_time))
def pull_population(outdir, keys, country, dl_variables, update, population_type):
    '''
    Parameters
    ----------
    outdir : str
        Output directory.
    keys : list
        User credentials [username, password].
    country : str
        Country name - must match .config file exactly (names with spaces
        must replace ' ' with '_').
    dl_variables : dict
        Download-specific variables in a dict: 'id' = dataset id,
        'origin' = dataset origin datetime.datetime object.
    update : boolean
        Whether an existing dataset is being updated.

    Returns
    -------
    None.
    '''
    country_output = outdir + "/" + country + '_' + population_type
    base_url = 'https://www.facebook.com/geoinsights-portal/downloads/raster/?id=' + str(
        dl_variables['id']) + '&ds='
    earliest_date = dl_variables['origin']
    data_dates = get_file_dates(earliest_date)

    if update:
        data_dates = list(
            compress(data_dates,
                     [x > get_update_date(country_output) for x in data_dates]))
        if len(data_dates) == 0:
            sys.exit('No datasets to download. Exiting.')

    urls = get_urls(base_url, data_dates)
    start_time = download_data(urls, keys)
    move_most_recent_files(country_output, urls, start_time)
    remove_empty_files(country_output)
    print('Success.')
def main(args):
    INPUT_PATH.mkdir(parents=True, exist_ok=True)
    OUTPUT_PATH.mkdir(exist_ok=True)

    if args.clear_cache:
        try:
            shutil.rmtree(TMP_PATH)
            logger.info("Cache was cleared!")
        except Exception as err:
            logger.info("There is no cache to clear!")
    TMP_PATH.mkdir(exist_ok=True)

    if not INPUT_DATA.exists() or args.clear_cache:
        try:
            download_data(
                username=KAGGLE_USER,
                key=KAGGLE_KEY,
                dataset=KAGGLE_DATASET,
                download_path=INPUT_PATH,
            )
        except Exception as err:
            logger.exception("Failed to download the data!")
            raise

    asyncio.run(execute_crawler())
def __iter__(self):
    n_batches = np.ceil(self.metadata.shape[0] / self.batch_size).astype(int)
    for i in range(n_batches):
        labels = self.metadata.iloc[i * (self.batch_size):(i + 1) * self.batch_size].iloc[:, 1:].values
        labels = torch.from_numpy(labels).int()
        names = self.metadata.iloc[i * (self.batch_size):(i + 1) * self.batch_size].Id
        tensor = torch.from_numpy(download_data(names)).permute(0, 3, 1, 2).float()
        yield tensor, labels
def download_single_file(self, date):
    url = self.url_base + "/" + f"{date}.csv"
    # url = os.path.join(self.url_base, f"{date}.csv")
    data = download_data(url=url)
    if data is None:
        logging.info(f"{date}.csv doesn't exist or failed to be downloaded!")
        return None
    data.loc[:, 'date_today'] = datetime.strptime(date, "%m-%d-%Y")
    data = data.rename(columns={"Province_State": "state",
                                "Lat": "latitude",
                                "Long_": "longitude",
                                'Confirmed': "confirmed",
                                'Deaths': "deaths",
                                'Recovered': "recovered",
                                'Active': "active",
                                'FIPS': "fips",
                                "People_Hospitalized": "hospitalization"}) \
               .dropna(subset=['fips'])
    data.loc[:, "fips"] = data['fips'].astype(int)
    data = data[self.common_columns].fillna(0)
    return data
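# In download_single_file above, download_data(url=url) is expected to return a pandas
# DataFrame, or None when the daily CSV is missing. A minimal sketch under that
# assumption (not the repository's actual helper; requests and pandas are assumed):
import io
import logging

import pandas as pd
import requests


def download_data(url):
    """Read a remote CSV into a DataFrame; return None if the request fails."""
    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()
    except requests.RequestException as err:
        logging.warning("Failed to fetch %s: %s", url, err)
        return None
    return pd.read_csv(io.StringIO(response.text))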
def _add_phedex_totals(self, table):
    url = "https://cmsweb.cern.ch/phedex/datasvc/json/prod/nodeusage"
    interests = re.compile(r".*_node_bytes$")
    row = ET.Element("tr")
    row.append(self._create_cell("phedex"))
    grand_total = 0
    for site in self.sites:
        params = {"node": "{site}".format(site=site.name)}
        data = json.loads(utils.download_data(url, params))["phedex"]["node"][0]
        total = 0
        for key, value in data.items():
            if interests.match(key):
                total += int(value)
                grand_total += value
        row.append(self._create_cell(to_TB(total)))
    row.append(self._create_cell(to_TB(grand_total)))
    table.append(row)
def _get_price_moves_and_stats(ticker, stats_data=None,
                               timezone=None, start=None):
    if timezone is None:
        timezone = 'Europe/London'
    if start is None:
        start = pd.Timestamp(2019, 1, 1, 0)
    if stats_data is None:
        stats_data = utils.download_data(tickers=ticker, start=start,
                                         interval='60m', value='Close')
    freq_range, frequencies = _create_freq()
    price_movements = dict.fromkeys(frequencies)
    means = np.zeros(freq_range.shape)
    stds = np.zeros(freq_range.shape)
    for index, freq in enumerate(frequencies):
        current_time = start
        current_rng = pd.date_range(start=current_time, end=pd.Timestamp.today(),
                                    freq=freq, tz=timezone, name='Datetime')
        current_moves = np.diff(stats_data[current_rng])
        current_moves = current_moves[~np.isnan(current_moves)]
        means[index] = np.mean(current_moves)
        stds[index] = np.std(current_moves)
        price_movements[freq] = current_moves
    return price_movements, means, stds
def get_site(block):
    url = "https://cmsweb-testbed.cern.ch/phedex/datasvc/json/prod/data"
    params = {"block": block}
    data = json.loads(utils.download_data(url, params))["phedex"]
    return data["dbs"][0]["dataset"][0]["block"][0]["file"][0]["node"]
from posts_users_analyse import PostsUsersAnalyser
from utils import download_data

if __name__ == "__main__":
    posts_url = 'https://jsonplaceholder.typicode.com/posts'
    users_url = 'https://jsonplaceholder.typicode.com/users'

    posts = download_data(posts_url)
    users = download_data(users_url)

    analyser = PostsUsersAnalyser()
    posts_number = analyser.create_post_number_list(posts, users)
    nonunique_titles = analyser.find_nonunique_titles(posts)
    neighbours = analyser.find_neighbours(users)
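# The snippet above expects download_data(url) to return the parsed JSON payload
# (a list of post/user dicts from JSONPlaceholder). A minimal sketch of such a helper,
# assuming a plain requests-based GET; only the call sites above are given, the body
# here is an assumption:
import requests


def download_data(url):
    """Fetch `url` and return the decoded JSON payload."""
    response = requests.get(url, timeout=30)
    response.raise_for_status()
    return response.json()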
import numpy as np
import pandas as pd
import featuretools as ft

import utils

utils.download_data()
data_path = 'data/train_FD004.txt'
data = utils.load_data(data_path)
data.head()
args = parser.parse_args()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
path_to_text = os.path.join(args.path_to_data, 'shakespeare.txt')

if not os.path.exists(args.path_to_model):
    os.makedirs(args.path_to_model)
if not os.path.exists(args.path_to_logdir):
    os.makedirs(args.path_to_logdir)
if not os.path.exists(args.path_to_data):
    os.makedirs(args.path_to_data)

if args.download:
    download_data(args.path_to_data, uurl)

text_as_int, char2idx, idx2char = read_and_preprocess_text(path_to_text)
vocab_size = len(char2idx)

dataset = CharacheterDataset(text_as_int, args.seq_len)
train_size = int(len(dataset) * args.train_size)
trainset, valset = random_split(dataset, [train_size, len(dataset) - train_size])
train_loader = DataLoader(trainset, shuffle=True, batch_size=args.bz)
val_loader = DataLoader(valset, shuffle=False, batch_size=args.bz)

model = TextGenerator(embedding_dim=args.emb_dim,
                      hidden_dim=args.hid_dim,
                      vocab_size=vocab_size)
#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse

from utils import download_data

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='download the pre-processed MNIST dataset')
    parser.add_argument('gzfile', type=str, help='the dataset path')
    parser.add_argument('url', type=str, help='dataset url')
    args = parser.parse_args()
    download_data(args.gzfile, args.url)
from utils import download_data

data_dir = "hymenoptera_data_version_0"
download_url = f"https://pl-flash-data.s3.amazonaws.com/{data_dir}.zip"
download_data(download_url, "./data")
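# download_data(download_url, "./data") above points at a .zip archive, so the helper
# presumably downloads the archive and extracts it under the target directory. A hedged
# sketch under that assumption (the names come from the call above; this is not the
# library's actual implementation):
import io
import os
import zipfile

import requests


def download_data(url, target_dir):
    """Download a zip archive from `url` and extract it into `target_dir`."""
    os.makedirs(target_dir, exist_ok=True)
    response = requests.get(url, timeout=120)
    response.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(response.content)) as archive:
        archive.extractall(target_dir)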
def run(self):
    if not os.path.exists(os.getenv("DATA_DIR")):
        os.makedirs(os.getenv("DATA_DIR"))
    local_path = u.download_data(self.year_month)
#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements. See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership. The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License. You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#
import argparse

from utils import download_data

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='download the pre-processed MNIST dataset')
    parser.add_argument('gzfile', type=str, help='the dataset path')
    parser.add_argument('url', type=str, help='dataset url')
    args = parser.parse_args()
    download_data(args.gzfile, args.url)
def get_site(block):
    url = "https://cmsweb-testbed.cern.ch/phedex/datasvc/json/prod/data"
    params = {"block": block}
    data = json.loads(utils.download_data(url, params))["phedex"]
    return data["dbs"][0]["dataset"][0]["block"][0]["file"][0]["node"]
import os

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'

import numpy as np

from model import Model, save_freeze_tensorflow_model_for_inference, convert_to_tensor_rt, inference_from_tensor_rt_graph
from preprocessing import preprocess
from utils import init_configuration, download_data, get_log_dir, get_arguments_as_dict

# read the parameters from the config file
all_params = init_configuration(config_file='config/config.yaml')

# getting log directory to save the model and results
log_dir = get_log_dir(all_params)

print('downloading data')
train_path, test_path = download_data(reload=True)

print('preprocessing data')
(dataset_train, dataset_test, dataset_train_lengths, dataset_test_lengths,
 dataset_test_for_predict, dataset_test_lengths_for_predict, x_test) = preprocess(
    train_path, test_path, all_params)

print('initialize and train the model')
model = Model(log_dir, all_params)
model.train(dataset_train, dataset_test, dataset_train_lengths, dataset_test_lengths)
model.predict(dataset_test_for_predict, dataset_test_lengths_for_predict)

frozen_graph, your_outputs = save_freeze_tensorflow_model_for_inference(log_dir)
'''
@author: ionut
'''
import sys
import utils


def print_help_exit():
    print('usage: python downloader.py START_YEAR STOP_YEAR')
    print('\texample: python downloader.py 1893 2015')
    sys.exit(1)


if len(sys.argv) < 2:
    print_help_exit()

start_year = 0
stop_year = 0
try:
    start_year = int(sys.argv[1])
    stop_year = int(sys.argv[2])
    if not 1763 <= start_year <= stop_year <= 2016:
        raise Exception('invalid year')
except:
    print_help_exit()

for i in range(start_year, stop_year):
    filename = '%d.csv.gz' % i
    utils.download_data(filename)
def main(config, download_resources=False, process_data=False, test_size=0.4,
         model_train=False, model_path=None):
    """
    :param config:
    :param download_resources:
    :param process_data:
    :param test_size:
    :param model_train:
    :param model_path:
    :return:
    """
    if download_resources:
        utils.download_data()

    # Get data
    if config['domain'] == "NER":
        (train_chr, valid_chr, test_chr, train_word, valid_word, test_word,
         train_label, valid_label, test_label, chr_id_mappings) = data_loader.prepare_ner_data(
            process_data, test_size)
    else:
        (train_chr, valid_chr, test_chr, train_word, valid_word, test_word,
         train_label, valid_label, test_label, chr_id_mappings) = data_loader.prepare_wjs_data(
            process_data, test_size)

    # Update config
    config['n_classes'] = train_label.shape[2]
    config['char_vocab_dim'] = len(chr_id_mappings) + 1
    config['train_examples'] = train_chr.shape[0]
    config['validation_examples'] = valid_chr.shape[0]
    config['test_examples'] = test_chr.shape[0]

    logging.info("CONFIG:")
    logging.info("\n".join([k + ": " + str(v) for k, v in config.items()]))

    model = models.CNN_BILSTM_CRF(config)

    if model_train:
        train.train(train_word=train_word, valid_word=valid_word,
                    train_chr=train_chr, valid_chr=valid_chr,
                    train_label=train_label, valid_label=valid_label,
                    num_epochs=config['train_epochs'], model=model,
                    batch_size=config['batch_size'], config=config)
        # Evaluate at the end
        logging.info("Evaluating at the TEST set")
        train.eval(model, test_chr, test_word, test_label, config['batch_size'])
    else:
        if model_path:
            saver = tf.train.Saver()
            saver.restore(model.sess, model_path)
            # Test the model on the test set
            logging.info("Evaluating at the TEST set")
            train.eval(model, test_chr, test_word, test_label, config['batch_size'])
        else:
            print("No trained models exist! You have to train the model first.")
from model import FullyConnectedNetwork
from utils import download_data
import matplotlib.pyplot as plt

# add foundations.set_tensorboard_logdir() code here

# download data
train_df, test_df = download_data()

# prepare data
input_size = len(train_df.columns) - 1  # don't include the target when counting inputs
numeric_columns = ['machine_hours_current_meter', 'age_in_years', 'target']
categorical_sizes = {col: train_df[col].nunique() for col in train_df.columns
                     if col not in numeric_columns}

# define hyperparameters
# replace following with foundations.load_parameters()
hyperparameters = {'n_epochs': 5,
                   'batch_size': 128,
                   'validation_percentage': 0.1,
                   'dense_blocks': [{'size': 256, 'dropout_rate': 0}],
                   'embedding_factor': 0.5,
                   'learning_rate': 0.0001,
                   'lr_plateau_factor': 0.1,
                   'lr_plateau_patience': 3,
                   'early_stopping_min_delta': 0.001,
                   'early_stopping_patience': 5}

# train
model = FullyConnectedNetwork(input_size, hyperparameters, categorical_sizes)
hist = model.train(train_df)