def main(argv):
    DATA_PATH = Path(FLAGS.DATA_PATH)

    # --------------------------------------------------------------
    # DOWNLOAD DATA
    # --------------------------------------------------------------
    for url in URLS:
        download_data(url=url, fname=DATA_PATH/url2name(url))

    # --------------------------------------------------------------
    # UNTAR GZ FILES
    # --------------------------------------------------------------
    files = list(sorted(DATA_PATH.rglob('*.gz')))
    for f in files:
        untar_file(f, DATA_PATH)

    # --------------------------------------------------------------
    # CONVERT THE RAW DATA INTO TF-RECORDS
    # --------------------------------------------------------------
    files = list(sorted(DATA_PATH.rglob('*batch_[0-9]'))) + \
        list(sorted(DATA_PATH.rglob('*test_batch')))
    for f in files:
        convert_to_tf_records(f, overwrite=True)

    # --------------------------------------------------------------
    # SYNC DATA TO BUCKET
    # --------------------------------------------------------------
    sync_data_to_bucket(FLAGS.dest_dir)
def main(_args):
    '''
    download colocation data
    
    Parameters
    ----------
    _args : list
        Argument list: CSV file specifying download countries and ids, output directory.

    Returns
    -------
    None.

    '''
    
    username = input("Username: "******"Update datasets? (y/n): ")
    
    if update == 'y':
        update = True
    elif update == 'n':
        update = False
    else:
        sys.exit('Unknown update input. Choose "y" or "n". Exiting.')
    
    #read target datasets
    data_target = pd.read_csv(_args[1])
    
    for i, dataset_id in enumerate(data_target['id']):
        
        country_output = _args[len(_args) - 1] + "/" + data_target.loc[i, 'country'] + '_mobility'
            
        base_url = 'https://www.facebook.com/geoinsights-portal/downloads/vector/?id=' + str(dataset_id) + '&ds='
    
        earliest_date = datetime(int(data_target.loc[i, 'year']), int(data_target.loc[i, 'month']), int(data_target.loc[i, 'day']), int(data_target.loc[i, 'hour']))    
        
        data_dates = get_file_dates(earliest_date)
                
        if update:
            data_dates = list(compress(data_dates, [x > get_update_date(country_output) for x in data_dates]))
        
        if len(data_dates) == 0:
            sys.exit('No datasets to download. Exiting.')
            
        urls = get_urls(base_url, data_dates)
        
        download_data(urls, keys)
        
    
        move_most_recent_files(country_output, urls)
    
    print('Success.')
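A minimal invocation sketch for this variant, which prompts for credentials at runtime; the script name, CSV file, and output directory below are illustrative assumptions, not taken from the source:

# Hypothetical call: _args[1] is the target CSV, _args[-1] the output directory.
main(['download_colocation.py', 'targets.csv', 'colocation_output'])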
Example #3
 def check_predictor(self):
     """ Check if predictor exists. If not downloads it. """
     if not os.path.exists(self.predictor_path):
         print('Downloading missing predictor.')
         url = self.configuration.get('landmarks_predictor_download_url')
         download_data(url, self.predictor_path + '.bz2', 64040097)
         print(f'Decompressing downloaded file into {self.predictor_path}')
         with bz2.BZ2File(self.predictor_path + '.bz2') as fr, open(
                 self.predictor_path, 'wb') as fw:
             shutil.copyfileobj(fr, fw)
def main(_args):
    '''
    download colocation data
    
    Parameters
    ----------
    _args : list
        Argument list: Fernet secret key file, directory holding the username and password key files, CSV file specifying download countries and ids, output directory.

    Returns
    -------
    None.

    '''

    with open(_args[1], 'rb') as file:
        fernet = Fernet(file.read())

    with open(_args[2] + '/username.key', 'rb') as file:
        username = file.read()

    with open(_args[2] + '/password.key', 'rb') as file:
        password = file.read()

    keys = [
        fernet.decrypt(username).decode("utf-8"),
        fernet.decrypt(password).decode("utf-8")
    ]

    #read target datasets
    data_target = pd.read_csv(_args[3])

    for i, dataset_id in enumerate(data_target['id']):

        base_url = 'https://www.facebook.com/geoinsights-portal/downloads/vector/?id=' + str(
            dataset_id) + '&ds='

        earliest_date = datetime(int(data_target.loc[i, 'year']),
                                 int(data_target.loc[i, 'month']),
                                 int(data_target.loc[i, 'day']),
                                 int(data_target.loc[i, 'hour']))

        data_dates = get_file_dates(earliest_date)
        urls = get_urls(base_url, data_dates)

        download_data(urls, keys)

        move_most_recent_files(
            _args[len(_args) - 1] + "/" + data_target.loc[i, 'country'] +
            '_mobility', urls)

    print('Success.')
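A minimal invocation sketch matching the argument layout described in the docstring; the file and directory names are illustrative assumptions, not from the source:

# Hypothetical call:
#   _args[1]  -> Fernet secret key file
#   _args[2]  -> directory containing username.key and password.key
#   _args[3]  -> CSV with 'id', 'country', 'year', 'month', 'day', 'hour' columns
#   _args[-1] -> output directory
main(['download_colocation.py', 'secret.key', 'credentials', 'targets.csv', 'colocation_output'])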
Example #5
def aggregate_and_evaluate(args):
    print('Merging generated SMILES into a single file...')
    smiles = []
    for rank in range(args['num_processes']):
        with open(
                os.path.join(args['log_dir'], str(rank),
                             'generated_smiles.txt'), 'r') as f:
            rank_smiles = f.read().splitlines()
        smiles.extend(rank_smiles)

    with open(os.path.join(args['log_dir'], 'generated_smiles.txt'), 'w') as f:
        for s in smiles:
            f.write(s + '\n')

    print('Removing temporary dirs...')
    remove_worker_tmp_dir(args)

    # Summarize training molecules
    print('Summarizing training molecules...')
    train_file = '_'.join([args['dataset'], 'DGMG_train.txt'])
    if not os.path.exists(train_file):
        download_data(args['dataset'], train_file)
    with open(train_file, 'r') as f:
        train_smiles = f.read().splitlines()
    train_summary = summarize_molecules(train_smiles, args['num_processes'])
    with open(os.path.join(args['log_dir'], 'train_summary.pickle'),
              'wb') as f:
        pickle.dump(train_summary, f)

    # Summarize generated molecules
    print('Summarizing generated molecules...')
    generation_summary = summarize_molecules(smiles, args['num_processes'])
    with open(os.path.join(args['log_dir'], 'generation_summary.pickle'),
              'wb') as f:
        pickle.dump(generation_summary, f)

    # Stats computation
    print('Preparing generation statistics...')
    valid_generated_smiles = generation_summary['smile']
    unique_generated_smiles = get_unique_smiles(valid_generated_smiles)
    unique_train_smiles = get_unique_smiles(train_summary['smile'])
    novel_generated_smiles = get_novel_smiles(unique_generated_smiles,
                                              unique_train_smiles)
    with open(os.path.join(args['log_dir'], 'generation_stats.txt'), 'w') as f:
        f.write('Total number of generated molecules: {:d}\n'.format(
            len(smiles)))
        f.write('Validity among all: {:.4f}\n'.format(
            len(valid_generated_smiles) / len(smiles)))
        f.write('Uniqueness among valid ones: {:.4f}\n'.format(
            len(unique_generated_smiles) / len(valid_generated_smiles)))
        f.write('Novelty among unique ones: {:.4f}\n'.format(
            len(novel_generated_smiles) / len(unique_generated_smiles)))
def main():
    if not os.path.exists(MODEL_DIR):
        os.mkdir(MODEL_DIR)

    download_data(data_dir=DATA_DIR,
                  labels_file=LABELS_FILE,
                  base_url=BASE_URL)

    model_save_path = os.path.join(MODEL_DIR, MODEL_FILE)
    classes_file_path = os.path.join(MODEL_DIR, CLASSES_FILE)
    model = train_model(DATA_DIR, model_save_path, classes_file_path)

    print('>>> All done! Model saved to <%s>' % model_save_path)
Example #7
def _create_z_values(model, ticker, stats_data=None, \
                     auto_update_tolerances=False, *args, **kwargs):
    freq_range, frequencies = _create_freq()
    try:
        timezone = kwargs['timezone']
    except KeyError:
        timezone = None
    try:
        start = kwargs['start']
    except KeyError:
        start = None
    _, means, stds = _get_price_moves_and_stats(ticker=ticker, \
                                                stats_data=stats_data,
                                                timezone=timezone, \
                                                start=start)
    if auto_update_tolerances:
        utils.print_issue('STATS-INFO', 'Auto update of tolerances!')
        current_value = utils.download_data(tickers=ticker, \
                                            start=(pd.Timestamp.today() - pd.Timedelta('1 days')), \
                                            value='Close').values[-1]
        current_tols = model.break_values[ticker] - current_value
        utils.print_issue('STATS-INFO',
                          'Current value: {}!'.format(current_value))
        utils.print_issue('STATS-INFO',
                          'New tolerances: {}!'.format(current_tols))
        tol_unten = np.sort(current_tols)[0]
        tol_oben = np.sort(current_tols)[1]
    else:
        tol_unten = np.sort(model.tolerances[ticker])[0]
        tol_oben = np.sort(model.tolerances[ticker])[1]
    z_values_unten = (tol_unten - means) / stds
    z_values_oben = (tol_oben - means) / stds
    return np.array([z_values_unten,
                     z_values_oben]), np.array([tol_unten, tol_oben]), means
Example #8
    def get_ureport_by_hash(self, master_hash, source=None):
        json_result = None

        if isinstance(self.url, dict):
            if source is not None:
                json_result = download_data(url=source, data=master_hash)
        return json_result
Example #10
def _get_price_moves_and_stats(ticker, stats_data=None,
                               timezone=None, start=None):
    if timezone is None:
        timezone = 'Europe/London'
    if start is None:
        start = pd.Timestamp(2019, 1, 1, 0)
    if stats_data is None:
        stats_data = utils.download_data(tickers=ticker, start=start,
                                         interval='60m', value='Close')
    freq_range, frequencies = _create_freq()
    price_movements = dict.fromkeys(frequencies)
    means = np.zeros(freq_range.shape)
    stds = np.zeros(freq_range.shape)
    for index, freq in enumerate(frequencies):
        current_time = start
        current_rng = pd.date_range(start=current_time, end=pd.Timestamp.today(),
                                    freq=freq, tz=timezone, name='Datetime')
        current_moves = stats_data[current_rng].pct_change()
        current_moves = current_moves[~np.isnan(current_moves)]
        means[index] = np.mean(current_moves)
        stds[index] = np.std(current_moves)
        price_movements[freq] = current_moves
        hours = np.flip(np.arange(1,25,1))
        df = pd.DataFrame()
        df['hours'] = pd.Series(hours)
        df['pct_change'] = pd.Series(np.flip(means))
        first_key = list(price_movements.values())[1]
    return price_movements, means, stds
Example #11
    def meetingSummary(self, isTape=False):
        table = self._create_table()
        row = ET.Element("tr")
        row.append(self._create_cell(""))
        
        # Site names
        if isTape:
            row.append(self._create_cell("T0_CH_CERN", header=True))
        
        self._add_site_names(table, row)

        # Pledges
        row = ET.Element("tr")
        row.append(self._create_cell("Pledges [TB]", header=True))
        
        
        total = 0
        if isTape:
            row.append(self._create_cell(to_3f(T0CERNPLEDGE), color=True))
            total += T0CERNPLEDGE
            
        for site in self.sites:
            data = PLEDGES[site.name] if site.name in PLEDGES else 0
            total += data
            row.append(self._create_cell(to_3f(data), color=True))
            
        row.append(self._create_cell(to_3f(total), color=True))
        table.append(row)

        # Used (PhEDEx)
        url = "https://cmsweb.cern.ch/phedex/datasvc/json/prod/nodeusage"
        interests = re.compile(r".*_node_bytes$")
        row = ET.Element("tr")
        row.append(self._create_cell("Used (PhEDEx) [TB]", header=True))
        grand_total = 0
        
        if isTape:
            row.append(self._create_cell(to_TB(T0CERNUSED)))
            grand_total += T0CERNUSED
        
        for site in self.sites:
            params = {"node":"{site}".format(site=site.name)}
            data = json.loads(utils.download_data(url, params))["phedex"]["node"][0]
            total = 0
            for key, value in data.items():
                if interests.match(key):
                    total += int(value)
                    
            if site.name == FNALTAPE:
                row.append(self._create_cell(to_TB(T1FNALMSSUSED)))
                grand_total += T1FNALMSSUSED
            else:
                row.append(self._create_cell(to_TB(total)))
                grand_total += total
                
        row.append(self._create_cell(to_TB(grand_total)))
        table.append(row)
        
        return table
Example #12
def sync_world_statuses():
    log('Checking for updates', COLLECTION_NAME, DATA_URL)

    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_file = '{}/{}'.format(tmp_dir, 'world.csv')
        filename = download_data(DATA_URL, temp_file)
        statuses = read_csv(filename)

    sync_with_db(statuses)
def sync_ontario_cases():
    log('Checking for updates', COLLECTION_NAME, DATA_URL)

    with tempfile.TemporaryDirectory() as tmp_dir:
        temp_file = '{}/ontario_cases.csv'.format(tmp_dir)
        filename = download_data(DATA_URL, temp_file)
        cases = read_csv(filename)

    sync_with_db(cases)
Example #14
 def __iter__(self):
     n_batches = np.ceil(self.metadata.shape[0] /
                         self.batch_size).astype(int)
     for i in range(n_batches):
         names = self.metadata.iloc[i * (self.batch_size):(i + 1) *
                                    self.batch_size].Id
         tensor = torch.from_numpy(download_data(names)).permute(
             1, 0, 2, 3).float()
         yield tensor
Example #15
    def get_data(self, value='Close', filter_date_range=None, *args, **kwargs):
        '''
        Function to get asset historical prices.
        '''
        self.data = utils.download_data(tickers=self.tickers,
                                        value=value,
                                        *args, **kwargs)
        #if only one ticker in self.tickers: download_data returns series!
        if not isinstance(self.data, pd.core.frame.DataFrame):
            self.data = self.data.to_frame(name=self.tickers[0])

        if filter_date_range is not None:
            self.apply_date_filter(filter_date_range=filter_date_range)
Example #16
    def download_date(self):
        self._browser_create()
        self._search_loc()
        self._select_loc()

        DataSource.setSource(self._rp5source)
        self._rp5source = DataSource.getSource()

        self._select_data_source()

        if (self._rp5source.lower() == "metar"):
            downTabElem = self._browser.find_element_by_id('tabMetarDLoad')
        elif (self._rp5source.lower() == "archive"):
            downTabElem = self._browser.find_element_by_id('tabSynopDLoad')
        else:
            self._exit_msg = "Rp5 Source is not from available options, pls retry, exiting!"
            self._browser_exit(1)
        downTabElem.click()
        time.sleep(3)

        beginDateElem = self._browser.find_element_by_id('calender_dload')
        beginDateElem.clear()
        Log.info("Start Date is - {}".format(self._beginDate))
        beginDateElem.send_keys(self._beginDate)

        endDateElem = self._browser.find_element_by_id('calender_dload2')
        endDateElem.clear()
        Log.info("End Date is - {}".format(self._endDate))
        endDateElem.send_keys(self._endDate)

        generateDownloadElem = self._browser.find_elements_by_class_name(
            'archButton')[1]
        generateDownloadElem.click()
        time.sleep(5)

        try:
            downloadElem = self._browser.find_element_by_link_text('Download')
            fileUrl = downloadElem.get_attribute('href')
            Log.info("File URL is - " + fileUrl)
        except:
            self._exit_msg = "Download link not found, exiting ..."
            self._browser_exit(1)

        fullPath = download_data(fileUrl, self._fileName, self._dirPath)
        Log.info("Downloading Data to {} ...".format(fullPath))

        Log.info(
            "rp5 data downloaded successfully for {} location from {} to {}".
            format(self._loc, self._beginDate, self._endDate))
        self._browser.close()
        return fullPath
Example #17
def main(args, config):
    start_time = time.time()
    # 1) Read data from database
    print(20*'=')
    print('1. Downloading data...')
    data = download_data(user=args.user, password=args.password, tb_name='sketch.train_data_2')
    oh = download_data(user=args.user, password=args.password, tb_name='sketch.ode_school')
    # 2) Data preparation
    print(20*'=')
    print('2. Data processing...')
    data = data_preparation(data, oh) # TODO: args can be the filter of which vars to use
    train_data, test_data = train_val_test_split(data, config['min_train_cohort'], config['min_test_cohort'])

    # 3) Model
    print(20*'=')
    print('3. Training model...')
    model = model_dict[config['model']]
    clf = model(train_data, args=config['hyperparameters'])

    # 4) Compute metric in validation set
    print(20*'=')
    print('4. Evaluation model in validation set...')
    metric = metric_dict[config['metric']](clf, test_data)
    print('{}: {}'.format(config['metric'], metric))

    # We print test and train accuracy
    train_accuracy = accuracy(clf, train_data)
    print('Train accuracy: ', train_accuracy)
    test_accuracy = accuracy(clf, test_data)
    print('Test accuracy: ', test_accuracy)

    # 5) Upload result to postgres
    print(20*'=')
    print('5. Uploading result to database...')
    upload_result(config['model_name'], config['metric'], metric, args.user, args.password)

    print(20*'=')
    print('Finished in {} seconds'.format(time.time()-start_time))
def pull_population(outdir, keys, country, dl_variables, update,
                    population_type):
    '''
    Download population data for a given country.

    Parameters
    ----------
    outdir : str
        Output directory.
    keys : list
        user credentials [username, password].
    country : str
        Country name - must match .config file exactly (names with spaces must replace ' ' with '_').
    dl_variables : dict
        download specific variables in a dict, 'id' = dataset id, 'origin' = dataset origin datetime.datetime object.
    update : boolean
        Whether an existing dataset is being updated.
    population_type : str
        Population dataset type; appended to the country name to form the output folder.

    Returns
    -------
    None.

    '''

    country_output = outdir + "/" + country + '_' + population_type

    base_url = 'https://www.facebook.com/geoinsights-portal/downloads/raster/?id=' + str(
        dl_variables['id']) + '&ds='

    earliest_date = dl_variables['origin']

    data_dates = get_file_dates(earliest_date)

    if update:
        data_dates = list(
            compress(data_dates,
                     [x > get_update_date(country_output)
                      for x in data_dates]))

    if len(data_dates) == 0:
        sys.exit('No datasets to download. Exiting.')

    urls = get_urls(base_url, data_dates)

    start_time = download_data(urls, keys)

    move_most_recent_files(country_output, urls, start_time)

    remove_empty_files(country_output)

    print('Success.')
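A minimal usage sketch for pull_population under the documented parameters; the country, dataset id, and origin date are placeholder assumptions, not values from the source:

from datetime import datetime

# Hypothetical call; dl_variables carries the dataset id and its origin timestamp.
pull_population(outdir='population_output',
                keys=['username', 'password'],
                country='United_Kingdom',
                dl_variables={'id': 123456789012345, 'origin': datetime(2020, 4, 1, 0)},
                update=False,
                population_type='population')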
Example #19
def main(args):
    INPUT_PATH.mkdir(parents=True, exist_ok=True)
    OUTPUT_PATH.mkdir(exist_ok=True)
    if args.clear_cache:
        try:
            shutil.rmtree(TMP_PATH)
            logger.info("Cache was cleared!")
        except Exception as err:
            logger.info("There is no cache to clear!")
    TMP_PATH.mkdir(exist_ok=True)

    if not INPUT_DATA.exists() or args.clear_cache:
        try:
            download_data(
                username=KAGGLE_USER,
                key=KAGGLE_KEY,
                dataset=KAGGLE_DATASET,
                download_path=INPUT_PATH,
            )
        except Exception as err:
            logger.exception("Failed to download the data!")
            raise
    asyncio.run(execute_crawler())
Example #20
    def __iter__(self):
        n_batches = np.ceil(self.metadata.shape[0] /
                            self.batch_size).astype(int)
        for i in range(n_batches):

            labels = self.metadata.iloc[i * (self.batch_size):(i + 1) *
                                        self.batch_size].iloc[:, 1:].values
            labels = torch.from_numpy(labels).int()

            names = self.metadata.iloc[i * (self.batch_size):(i + 1) *
                                       self.batch_size].Id
            tensor = torch.from_numpy(download_data(names)).permute(
                0, 3, 1, 2).float()

            yield tensor, labels
Example #21
 def download_single_file(self, date):
     url = self.url_base + "/" + f"{date}.csv"
     #url = os.path.join(self.url_base, f"{date}.csv")
     data = download_data(url=url)
     if data is None:
         logging.info(
             f"{date}.csv doesn't not exists or failed to be downloaded!")
         return None
     data.loc[:, 'date_today'] = datetime.strptime(date, "%m-%d-%Y")
     data = data.rename(columns={"Province_State": "state", "Lat": "latitude", "Long_": "longitude",
                                 'Confirmed': "confirmed", 'Deaths': "deaths", 'Recovered': "recovered",
                                 'Active': "active", 'FIPS': "fips", "People_Hospitalized": "hospitalization"}) \
         .dropna(subset=['fips'])
     data.loc[:, "fips"] = data['fips'].astype(int)
     data = data[self.common_columns].fillna(0)
     return data
Example #22
 def _add_phedex_totals(self, table):
     url = "https://cmsweb.cern.ch/phedex/datasvc/json/prod/nodeusage"
     interests = re.compile(r".*_node_bytes$")
     row = ET.Element("tr")
     row.append(self._create_cell("phedex"))
     
     grand_total = 0
     for site in self.sites:
         params = {"node":"{site}".format(site=site.name)}
         data = json.loads(utils.download_data(url, params))["phedex"]["node"][0]
         total = 0
         for key, value in data.items():
             if interests.match(key):
                 total += int(value)
                 grand_total += int(value)
         row.append(self._create_cell(to_TB(total)))
     row.append(self._create_cell(to_TB(grand_total)))
     table.append(row)
Example #23
def _get_price_moves_and_stats(ticker, stats_data=None, \
                               timezone=None, start=None):
    if timezone is None:
        timezone = 'Europe/London'
    if start is None:
        start = pd.Timestamp(2019, 1, 1, 0)
    if stats_data is None:
        stats_data = utils.download_data(tickers=ticker, start=start, \
                                         interval='60m', value='Close')
    freq_range, frequencies = _create_freq()
    price_movements = dict.fromkeys(frequencies)
    means = np.zeros(freq_range.shape)
    stds = np.zeros(freq_range.shape)
    for index, freq in enumerate(frequencies):
        current_time = start
        current_rng = pd.date_range(start=current_time, end=pd.Timestamp.today(), \
                                    freq=freq, tz=timezone, name='Datetime')
        current_moves = np.diff(stats_data[current_rng])
        current_moves = current_moves[~np.isnan(current_moves)]
        means[index] = np.mean(current_moves)
        stds[index] = np.std(current_moves)
        price_movements[freq] = current_moves
    return price_movements, means, stds
Example #24
def get_site(block):
    url = "https://cmsweb-testbed.cern.ch/phedex/datasvc/json/prod/data"
    params = {"block": block}
    data = json.loads(utils.download_data(url, params))["phedex"]
    return data["dbs"][0]["dataset"][0]["block"][0]["file"][0]["node"]
Example #25
from posts_users_analyse import PostsUsersAnalyser
from utils import download_data

if __name__ == "__main__":
    posts_url = 'https://jsonplaceholder.typicode.com/posts'
    users_url = 'https://jsonplaceholder.typicode.com/users'

    posts = download_data(posts_url)
    users = download_data(users_url)

    analyser = PostsUsersAnalyser()

    posts_number = analyser.create_post_number_list(posts, users)
    nonunique_titles = analyser.find_nonunique_titles(posts)
    neighbours = analyser.find_neighbours(users)
Example #26
import numpy as np
import pandas as pd
import featuretools as ft
import utils

utils.download_data()

data_path = 'data/train_FD004.txt'
data = utils.load_data(data_path)
data.head()
Example #27
    args = parser.parse_args()

    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
    path_to_text = os.path.join(args.path_to_data, 'shakespeare.txt')

    if not os.path.exists(args.path_to_model):
        os.makedirs(args.path_to_model)

    if not os.path.exists(args.path_to_logdir):
        os.makedirs(args.path_to_logdir)

    if not os.path.exists(args.path_to_data):
        os.makedirs(args.path_to_data)

    if args.download:
        download_data(args.path_to_data, uurl)

    text_as_int, char2idx, idx2char = read_and_preprocess_text(path_to_text)
    vocab_size = len(char2idx)

    dataset = CharacheterDataset(text_as_int, args.seq_len)
    train_size = int(len(dataset) * args.train_size)
    trainset, valset = random_split(
        dataset, [train_size, len(dataset) - train_size])

    train_loader = DataLoader(trainset, shuffle=True, batch_size=args.bz)
    val_loader = DataLoader(valset, shuffle=False, batch_size=args.bz)

    model = TextGenerator(embedding_dim=args.emb_dim,
                          hidden_dim=args.hid_dim,
                          vocab_size=vocab_size)
#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
# 
#     http://www.apache.org/licenses/LICENSE-2.0
# 
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# 

import argparse
from utils import download_data

if __name__ == '__main__':
	parser = argparse.ArgumentParser(description='download the pre-processed MNIST dataset')
	parser.add_argument('gzfile', type=str, help='the dataset path')
	parser.add_argument('url', type=str, help='dataset url')	
	args = parser.parse_args()
	download_data(args.gzfile, args.url)
from utils import download_data

data_dir = "hymenoptera_data_version_0"
download_url = f"https://pl-flash-data.s3.amazonaws.com/{data_dir}.zip"
download_data(download_url, "./data")
Example #30
 def run(self):
     if not os.path.exists(os.getenv("DATA_DIR")):
         os.makedirs(os.getenv("DATA_DIR"))
     local_path = u.download_data(self.year_month)
Example #31
#!/usr/bin/env python
#
# Licensed to the Apache Software Foundation (ASF) under one
# or more contributor license agreements.  See the NOTICE file
# distributed with this work for additional information
# regarding copyright ownership.  The ASF licenses this file
# to you under the Apache License, Version 2.0 (the
# "License"); you may not use this file except in compliance
# with the License.  You may obtain a copy of the License at
#
#     http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#

import argparse
from utils import download_data

if __name__ == '__main__':
    parser = argparse.ArgumentParser(
        description='download the pre-processed MNIST dataset')
    parser.add_argument('gzfile', type=str, help='the dataset path')
    parser.add_argument('url', type=str, help='dataset url')
    args = parser.parse_args()
    download_data(args.gzfile, args.url)
def get_site(block):
    url = "https://cmsweb-testbed.cern.ch/phedex/datasvc/json/prod/data"
    params = {"block":block}
    data = json.loads(utils.download_data(url, params))["phedex"]
    return data["dbs"][0]["dataset"][0]["block"][0]["file"][0]["node"]
Example #33
import os

os.environ['KMP_DUPLICATE_LIB_OK'] = 'True'
import numpy as np
from model import Model, save_freeze_tensorflow_model_for_inference, convert_to_tensor_rt, inference_from_tensor_rt_graph
from preprocessing import preprocess
from utils import init_configuration, download_data, get_log_dir, get_arguments_as_dict

# read the parameters from the config file
all_params = init_configuration(config_file='config/config.yaml')

# getting log directory to save the model and results
log_dir = get_log_dir(all_params)

print('downloading data')
train_path, test_path = download_data(reload=True)

print('preprocessing data')
dataset_train, dataset_test, dataset_train_lengths, dataset_test_lengths, dataset_test_for_predict, dataset_test_lengths_for_predict, x_test = preprocess(
    train_path, test_path, all_params)

print('initialize and train the model')
model = Model(log_dir, all_params)
model.train(dataset_train, dataset_test, dataset_train_lengths,
            dataset_test_lengths)

model.predict(dataset_test_for_predict, dataset_test_lengths_for_predict)

frozen_graph, your_outputs = save_freeze_tensorflow_model_for_inference(
    log_dir)
'''
@author: ionut
'''

import sys
import utils


def print_help_exit():
    print('usage: python downloader.py START_YEAR STOP_YEAR')
    print('\texample: python downloader.py 1893 2015')
    sys.exit(1)
    

if len(sys.argv) < 2:
    print_help_exit()

start_year = 0
stop_year = 0
try:
    start_year = int(sys.argv[1])
    stop_year = int(sys.argv[2])
    if not 1763 <= start_year <= stop_year <= 2016:
        raise Exception('invalid year')
except:
    print_help_exit()
    

for i in range(start_year, stop_year):
    filename = '%d.csv.gz' % i
    utils.download_data(filename)
Example #35
def main(config, download_resources=False,
         process_data=False, test_size=0.4,
         model_train=False, model_path=None):
    """

    :param config:
    :param download_resources:
    :param process_data:
    :param test_size:
    :param model_train:
    :param model_path:
    :return:
    """
    if download_resources:
        utils.download_data()

    # Get data
    if config['domain'] == "NER":
        train_chr, valid_chr, test_chr, train_word, valid_word, test_word, train_label, \
        valid_label, test_label, chr_id_mappings, = data_loader.prepare_ner_data(
            process_data, test_size)
    else:
        train_chr, valid_chr, test_chr, train_word, valid_word, test_word, train_label, \
        valid_label, test_label, chr_id_mappings, = data_loader.prepare_wjs_data(
            process_data, test_size)

    # Update config
    config['n_classes'] = train_label.shape[2]
    config['char_vocab_dim'] = len(chr_id_mappings) + 1
    config['train_examples'] = train_chr.shape[0]
    config['validation_examples'] = valid_chr.shape[0]
    config['test_examples'] = test_chr.shape[0]

    logging.info("CONFIG:")
    logging.info("\n".join([k + ": " + str(v) for k, v in config.items()]))

    model = models.CNN_BILSTM_CRF(config)

    if model_train:
        train.train(train_word=train_word,
                    valid_word=valid_word,
                    train_chr=train_chr,
                    valid_chr=valid_chr,
                    train_label=train_label,
                    valid_label=valid_label,
                    num_epochs=config['train_epochs'],
                    model=model,
                    batch_size=config['batch_size'],
                    config=config)
        # Evaluate at the end
        logging.info("Evaluating at the TEST set")
        train.eval(model, test_chr, test_word, test_label, config['batch_size'])

    else:
        if model_path:
            saver = tf.train.Saver()
            saver.restore(model.sess, model_path)
            # Test the model on the test set
            logging.info("Evaluating at the TEST set")
            train.eval(model, test_chr, test_word, test_label,
                       config['batch_size'])
        else:
            print("No trained models exist! You have to train the model first.")
from model import FullyConnectedNetwork
from utils import download_data
import matplotlib.pyplot as plt

# add foundations.set_tensorboard_logdir() code here

# download data
train_df, test_df = download_data()

# prepare data
input_size = len(train_df.columns) - 1 # don't include the target when counting inputs
numeric_columns = ['machine_hours_current_meter', 'age_in_years', 'target']
categorical_sizes = {col: train_df[col].nunique() for col in train_df.columns if col not in numeric_columns}

# define hyperparameters
# replace following with foundations.load_parameters()
hyperparameters = {'n_epochs': 5,
                   'batch_size': 128,
                   'validation_percentage': 0.1,
                   'dense_blocks': [{'size': 256, 'dropout_rate': 0}],
                   'embedding_factor': 0.5,
                   'learning_rate':0.0001,
                   'lr_plateau_factor':0.1,
                   'lr_plateau_patience':3,
                   'early_stopping_min_delta':0.001,
                   'early_stopping_patience':5}

# train
model = FullyConnectedNetwork(input_size, hyperparameters, categorical_sizes)
hist = model.train(train_df)