Example #1
def form_population_data():
    '''
    Specialized function for turning state-wise population information into features.
    Returns: None. Saves a dictionary with key = state name and
    value = [population, population density, population over 65 in %].
    '''

    path1 = '../dataset/population_density_usa.csv'
    path2 = '../dataset/population_old_usa.csv'

    df1 = pd.read_csv(path1)
    df2 = pd.read_csv(path2)
    column1 = ['State', 'Population', 'Density']
    column2 = ['State', 'Population65+%']
    pop_info = dict()

    for values in df1[column1].values:
        state, pop, density = values
        pop_info[state] = [
            float(pop.replace(',', '')),
            float(density.replace(',', ''))
        ]

    for values in df2[column2].values:
        state, pop65 = values
        pop_info[state].append(float(pop65))

    save_pickle(pop_info, '../dataset/generated/usa/pop_info')
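Every example on this page calls project-specific save_pickle / load_pickle helpers rather than the pickle module directly, and the helpers themselves are never shown. Below is a minimal sketch of what such a pair typically looks like, using the (obj, path) argument order of Example #1; note that other examples (e.g. #5, #6, #17) pass (path, obj) instead, so each project defines its own variant. This is an illustrative assumption, not the actual implementation behind any of these snippets.

import os
import pickle


def save_pickle(obj, path):
    # Illustrative helper (assumed, not from the source projects): create parent
    # directories if needed and dump obj with the highest pickle protocol.
    dirname = os.path.dirname(str(path))
    if dirname:
        os.makedirs(dirname, exist_ok=True)
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


def load_pickle(path):
    # Illustrative helper (assumed): return the unpickled object, or None if the
    # file does not exist, matching how callers such as Example #2 treat a missing cache.
    if not os.path.exists(path):
        return None
    with open(path, 'rb') as f:
        return pickle.load(f)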
Example #2
def main():
    # Load a dictionary mapping Michael's quotes to their season and episode
    print("Attempting to load quotes from file")
    quotes = load_quotes()
    if quotes is None:
        print("Scraping the web for new quotes")
        quotes = scrape()

    print("Creating sentence encoder")
    encoder = Encoder()

    print("Attempting to load quote embeddings from file")
    quote_embeddings = load_quote_embeddings()
    if quote_embeddings is None:
        print("Generating new quote embeddings")
        quote_embeddings = generate_quote_embeddings(encoder, quotes)
        print("Saving new quote embeddings to {0}".format(embeddings_file))
        save_pickle(quote_embeddings, embeddings_file)

    print("Creating predictor")
    predictor = Predictor(encoder, quote_embeddings)

    while True:
        input_sentence = query_input()
        prediction = predictor.predict_output(input_sentence)
        output_quote = prediction[0]
        output_season = prediction[1]['season']
        output_episode = prediction[1]['episode']
        print("Michael says: \"{0}\" in season {1}, episode {2}".format(
            output_quote, output_season, output_episode))
Example #3
 def write_stats_pickle(self, base_path: Union[str, Path]):
     """
     write the stats dictionary as a pickle
     :return:
     """
     filename = os.path.join(base_path, 'graph_stats', self.dataset,
                             self.model,
                             f'gs_{self.trial}_{self.iteration}.pkl.gz')
     CP.print_blue(f'Stats pickle stored at {filename}')
     save_pickle(self.stats, filename)
     return
Example #4
def load_preprocessing():

    # cfg.preprocessing = proc.ComposeProcessColumn([
    #     prep.Resize(224, apply_to_target=False),
    #     prep.MinMaxNorm(background=-1),
    #     prep.LocalMedian(background=-1),
    #     prep.EqualizeHist(background=-1),
    # ])
    cfg.preprocessing = proc.Processor()

    u.save_pickle(cfg.preprocessing, join(cfg.tensorboard_path, 'preprocessing.pkl'))
Example #5
    def _do_masking(self):
        try:
            # get key and dir's abs_path
            key_str = self._get_key()
            file_path = self._get_path()
            # print(key_str, file_path)
            wb_read = load_workbook(file_path, read_only=True)
            wb_write = Workbook(write_only=True)
            hash_bytes = load_pickle('mapping.pkl')
            # get the count of all data rows
            row_count = 0
            current_count = 1
            for sheetname in wb_read.sheetnames:
                sheet_read = wb_read[sheetname]
                row_count += sheet_read.max_row
            # read data and do masking, and then save the masked rows
            for sheetname in wb_read.sheetnames:
                print('processing sheet {}:'.format(sheetname))
                sheet_read = wb_read[sheetname]
                # sheet_row_count = sheet_read.max_row
                # print(sheet_row_count)
                sheet_write = wb_write.create_sheet(title=sheetname)
                rows_read = sheet_read.rows
                for row in rows_read:
                    row_values = []
                    for cell in row:
                        row_values.append(cell.value)
                    # do masking
                    if current_count > 1:
                        masked_row, hash_bytes_added = mask_row(
                            key_str, sheetname, row_values)
                        hash_bytes.update(hash_bytes_added)
                        sheet_write.append(masked_row)
                    else:
                        sheet_write.append(row_values)
                    current_count += 1
                    if current_count % 100 == 0 or current_count == row_count:
                        self._set_processBar(current_count / row_count * 100)
                        print('Completed {}%'.format(current_count / row_count * 100))
            save_pickle('mapping.pkl', hash_bytes)

            QMessageBox.information(QWidget(), "Information",
                                    "Data masking finished. Click OK, then save the file.")
            write_path = QFileDialog.getSaveFileName(caption="Save as .xlsx file",
                                                     directory="./")[0]
            # write_path = './encrypted_data.xlsx'
            # print(write_path)
            wb_write.save(write_path)
            QMessageBox.information(QWidget(), "Information", "Save complete")
        except Exception as e:
            QMessageBox.warning(QWidget(), "warning", str(e))
            print(e)
Example #6
    def extract_features(self, model, model_path, model_tag, used_set,
                         loaders_dic):
        """
        inputs:
            model : The loaded model containing the feature extractor
            loaders_dic : Dictionnary containing training and testing loaders
            model_path : Where was the model loaded from
            model_tag : Which model ('final' or 'best') to load
            used_set : Set used between 'test' and 'val'
            n_ways : Number of ways for the task

        returns :
            extracted_features_dic : Dictionnary containing all extracted features and labels
        """

        # Load features from memory if previously saved ...
        save_dir = os.path.join(model_path, model_tag, used_set)
        filepath = os.path.join(save_dir, 'output.plk')
        if os.path.isfile(filepath):
            extracted_features_dic = load_pickle(filepath)
            print(" ==> Features loaded from {}".format(filepath))
            return extracted_features_dic

        # ... otherwise just extract them
        else:
            print(" ==> Beginning feature extraction")
            if not os.path.isdir(save_dir):
                os.makedirs(save_dir)

        model.eval()
        with torch.no_grad():

            all_features = []
            all_labels = []
            for i, (inputs, labels,
                    _) in enumerate(warp_tqdm(loaders_dic['test'], False)):
                inputs = inputs.to(self.device)
                outputs, _ = model(inputs, True)
                all_features.append(outputs.cpu())
                all_labels.append(labels)
            all_features = torch.cat(all_features, 0)
            all_labels = torch.cat(all_labels, 0)
            extracted_features_dic = {
                'concat_features': all_features,
                'concat_labels': all_labels
            }
        print(" ==> Saving features to {}".format(filepath))
        save_pickle(filepath, extracted_features_dic)
        return extracted_features_dic
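The structure of extract_features above, return the cached pickle if it exists and otherwise compute, save, and return, is a generic caching pattern. A stripped-down sketch of just that pattern follows, using the plain pickle module so it stands alone; cache_path and compute_fn are hypothetical stand-ins for the feature file path and the extraction code, not part of the original class.

import os
import pickle


def cached(cache_path, compute_fn, *args, **kwargs):
    # Load-if-cached-else-compute-and-save, mirroring extract_features (illustrative sketch).
    if os.path.isfile(cache_path):
        with open(cache_path, 'rb') as f:
            print(" ==> Loaded cached result from {}".format(cache_path))
            return pickle.load(f)

    result = compute_fn(*args, **kwargs)
    os.makedirs(os.path.dirname(cache_path) or '.', exist_ok=True)
    with open(cache_path, 'wb') as f:
        pickle.dump(result, f)
    print(" ==> Saved result to {}".format(cache_path))
    return result

Under this sketch, extract_features would reduce to cached(filepath, run_extraction, model, loaders_dic), with the tensor concatenation kept inside a hypothetical run_extraction helper.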
Example #7
def train(args, device_id):
    init_logger(args.log_file)

    device = "cpu" if args.visible_gpus == '-1' else "cuda"
    logger.info('Device ID %d' % device_id)
    logger.info('Device %s' % device)
    torch.manual_seed(args.seed)
    random.seed(args.seed)
    torch.backends.cudnn.deterministic = True

    if device_id >= 0:
        torch.cuda.set_device(device_id)
        torch.cuda.manual_seed(args.seed)

    def train_iter_fct():
        return data_loader.Dataloader(args,
                                      load_dataset(args, 'train',
                                                   shuffle=True),
                                      args.batch_size,
                                      device,
                                      shuffle=True,
                                      is_test=False)

    model = Summarizer(args, device, load_pretrained_bert=True)
    if args.train_from != '':
        logger.info('Loading checkpoint from %s' % args.train_from)
        checkpoint = torch.load(args.train_from,
                                map_location=lambda storage, loc: storage)
        opt = vars(checkpoint['opt'])
        for k in opt.keys():
            if (k in model_flags):
                setattr(args, k, opt[k])
        model.load_cp(checkpoint)
        optim = model_builder.build_optim(args, model, checkpoint)
    else:
        optim = model_builder.build_optim(args, model, None)

    logger.info(model)
    trainer = build_trainer(args, device_id, model, optim)
    losses, n_docs = trainer.train(train_iter_fct, args.train_steps)

    save_pickle(losses, 'losses_classifier')
    save_pickle(n_docs, 'docs_classifier')
Example #8
def valid(_cfg, model, all_exam=False):
    cfg = copy.deepcopy(_cfg)
    if all_exam:
        cfg["dataset"]["param"][
            "posexam_only"] = False  # validation for all slices
    assert cfg["output"]
    assert not os.path.exists(cfg["output"])
    criterion = factory.get_criterion(cfg)

    path = os.path.join(output_dir, 'fold%d_ep0.pt' % (cfg['fold']))
    print(f'best path: {str(path)}')
    utils.load_model(str(path), model)

    loader_valid = factory.get_loader_valid(cfg)
    with torch.no_grad():
        results = run_nn(cfg,
                         'valid',
                         model,
                         loader_valid,
                         criterion=criterion)
    utils.save_pickle(results, cfg["output"])
    log('saved to %s' % cfg["output"])
Example #9
def create_flight_data(df):
    '''
    Args:
        df (Pandas dataframe): Travel data

    Returns: None. Saves a dictionary where key = frozenset({state1, state2})
    and value = count of flights between the two states.
    '''

    columns_of_interest = ['ORIGIN_STATE_NM', 'DEST_STATE_NM']

    # Loop through the columns and increase the count
    flight_dict = dict()
    for s1, s2 in df[columns_of_interest].values:
        # Assuming an undirected graph, hence using a frozen set
        if s1 != s2:
            pair = frozenset([s1, s2])
            if pair in flight_dict:
                flight_dict[pair] += 1
            else:
                flight_dict[pair] = 1

    save_path = '../dataset/generated/flightdict'
    save_pickle(flight_dict, save_path)
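create_flight_data counts unordered state pairs by hand with an if/else on dictionary membership; collections.Counter over frozenset keys expresses the same counting more compactly. A minimal sketch (the column names are taken from the example, the dataframe itself is assumed):

from collections import Counter

import pandas as pd


def count_flights(df: pd.DataFrame) -> Counter:
    # Count flights per unordered state pair, skipping intra-state flights,
    # mirroring the loop in create_flight_data (illustrative sketch).
    pairs = (frozenset([s1, s2])
             for s1, s2 in df[['ORIGIN_STATE_NM', 'DEST_STATE_NM']].values
             if s1 != s2)
    return Counter(pairs)

Counter is a dict subclass, so the result pickles and indexes exactly like the hand-built flight_dict.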
Example #10
def main():
    with Timer('Loading config'):
        cfg = load_config()

    with Timer('Loading tweets'):
        tweets = load_raw_data(cfg['RAW_DATA_PATH'])

    with Timer('Cleaning sentences'):
        tweet_text = cleanse_sentences(list(tweets['text']))

    with Timer('Mapping characters to integers'):
        tweet_enc, map_char_to_int, map_int_to_char = map_tweets_to_int(tweet_text)

    with Timer('Producing dataset'):
        tweet_train, tweet_label = produce_dataset(tweet_enc)

    with Timer('Save dataset and mapping tables'):
        save_pickle(tweet_train, cfg['PROCESSED_DATA_DIR'] + '/train.pkl')
        save_pickle(tweet_label, cfg['PROCESSED_DATA_DIR'] + '/label.pkl')
        save_pickle(map_char_to_int, cfg['PROCESSED_DATA_DIR'] + '/map_char_to_int.pkl')
        save_pickle(map_int_to_char, cfg['PROCESSED_DATA_DIR'] + '/map_int_to_char.pkl')
Example #11
def main():
    """
    Load raw ECG data from disc and transform it to cleansed training data.
    """
    cfg = load_config()

    with Timer('Getting label list'):
        labels, file_list = get_labels(cfg['RAW_DATA_PATH'] +
                                       '/Diagnostics.xlsx')

    with Timer('Loading & Downsampling files'):
        ecg_data = get_ecg_data(cfg['RAW_DATA_PATH'] + '/ECGDataDenoised',
                                file_list, cfg['DOWNSAMPLE_THRESHOLD'],
                                cfg['DATA_SLICE'], cfg['NUM_WORKERS'])

    with Timer('Imputing missing values'):
        ecg_data = impute_nans(ecg_data)

    with Timer('Splitting into Train & Test Set'):
        x_train, x_test, y_train, y_test = train_test_split(ecg_data,
                                                            labels,
                                                            test_size=0.2,
                                                            shuffle=True,
                                                            stratify=labels,
                                                            random_state=42)

        print('Final Training set has {} samples'.format(len(x_train)))
        print('Final Test set has {} samples'.format(len(x_test)))
        print('Distribution of labels in Training: {}'.format(
            Counter(y_train)))
        print('Distribution of labels in Testing: {}'.format(Counter(y_test)))

    with Timer('Normalizing data'):
        x_train, x_test = normalize_data(x_train, x_test)

    with Timer('Saving generated arrays'):
        save_pickle(x_train, cfg['PROCESSED_DATA_DIR'] + '/train_data.pkl')
        save_pickle(y_train, cfg['PROCESSED_DATA_DIR'] + '/train_label.pkl')
        save_pickle(x_test, cfg['PROCESSED_DATA_DIR'] + '/test_data.pkl')
        save_pickle(y_test, cfg['PROCESSED_DATA_DIR'] + '/test_label.pkl')
Example #12
        for data in train_loader:
            data = data.to(device)
            # print(data.x.size())
            optimizer.zero_grad()
            output = model(data)
            label = data.y.to(device).reshape(-1, 1)
            loss = cost(output, label)
            loss.backward()
            total_loss = loss
            optimizer.step()

    # for actual, predicted in zip(label, output):
    #     print(actual, predicted)
    # print("actual = ", label, "predicted = ", output, "loss =", total_loss)

    save_pickle(output.cpu().detach().numpy().reshape(-1, ), SOURCE_PATH / 'dataset/timeseries/data/outputs.pkl')

elif reply == 3:
    epochs = 12
    lr = 0.01
    cases = load_pickle(SOURCE_PATH / 'dataset/timeseries/data/features.pkl')
    distances = load_pickle(SOURCE_PATH / 'dataset/timeseries/data/dist_matrix.pkl')
    n_flights = load_pickle(SOURCE_PATH / 'dataset/timeseries/data/travel_matrix.pkl')
    distances = 1 - distances

    norm = np.max(cases)
    cases = cases / np.max(cases)
    edges = distances / np.max(distances) + n_flights / np.max(n_flights)
    edges /= np.max(edges)

    labels = torch.FloatTensor(cases[:, 0])
Example #13
def main(args):

    cfg.args = args
    cfg.BATCH_SIZE = args['BATCH_SIZE']
    cfg.EPOCHS = args['EPOCHS'] if not cfg.DEBUG else 2
    cfg.TRAIN_TEST_SPLIT = args['TRAIN_TEST_SPLIT']
    cfg.MODEL_NAME = args['MODEL_NAME']
    cfg.MODEL_ARGS = args['MODEL_ARGS']

    assert type(cfg.BATCH_SIZE) == int, "Batch size must be int"
    assert type(cfg.EPOCHS) == int, "Epochs must be int"

    cfg.res = pd.DataFrame([args])

    cfg.background = cfg.MODEL_ARGS.get('background', None)
    

    now = datetime.now()
    day, hour = now.strftime("%d/%m/%Y %H:%M:%S").split(' ')
    cfg.res['day'] = [day]
    cfg.res['hour'] = [hour]
    # Get tensorboard path to save results
    if not cfg.DEBUG:
        cfg.tensorboard_path = du.get_save_path(
            path_dirs=all_paths['tensorboard_classification'],
            path_to_manager=all_paths['manager_classification'],
            args=args,
        )
        cfg.res['tensorboard_path'] = cfg.tensorboard_path

        u.save_pickle(args, join(cfg.tensorboard_path, 'cur_args.pkl'))
        u.save_yaml(args, join(cfg.tensorboard_path, 'cur_args.yaml'))
        do_save_code.save_in_final_file()
    else:
        cfg.tensorboard_path = None

    # Data loading
    # load_preprocessing.main()
    load_data.main_train()

    # Model creation
    load_model.main()
    
    # Training
    print('==================')
    print('Training ...')
    tr.train(
        cfg.model,
        cfg.optimizer,
        cfg.loss,
        cfg.observables,
        defreezer=cfg.defreezer,
        number_of_epochs=cfg.EPOCHS,
        trainloader=cfg.trainloader,
        valloader=cfg.testloader,
        grad_input=True,
        retain_graph=True,
        grad_in_eval=True,
    #    interval=1,
        output_dir_tensorboard=cfg.tensorboard_path,
        device=cfg.device,
        verbose=VERBOSE_TRAIN,
    )
    print('Done.')

    cfg.res['Batch_Epoch_weights'] = [cfg.observables[0].best_weights_batch_epoch]

    if not cfg.DEBUG:
        cfg.model.load_state_dict(
            torch.load(join(cfg.tensorboard_path, 'best_weights.pt'))
        )
        print('weights loaded from', join(cfg.tensorboard_path, 'best_weights.pt'), 'Epoch Batch: ', cfg.res['Batch_Epoch_weights'])

    print('==================')
    print('Evaluating on train ...')
    # Evaluation on train set
    loss_train, metric_train = te.evaluate_model(
        cfg.model,
        cfg.trainloader_for_test,
        cfg.criterion,
        cfg.metrics,
        # 20,
        device=cfg.device,
    )
    print('Done.')

    title_train = {
        'loss': loss_train.item(),
        'metric': u.round_dict_array(metric_train)
    }
    cfg.res['loss_train'] = [title_train['loss']]
    for key, metric in metric_train.items():
        cfg.res['metric_train_{}'.format(key)] = [metric.cpu().numpy()]

    # Evaluation on test set
    print('==================')
    print('Evaluating on test ...')
    loss_test, metric_test = te.evaluate_model(
        cfg.model,
        cfg.testloader,
        cfg.criterion,
        cfg.metrics,
        # 20,
        device=cfg.device,
    )    
    print('Loss Test: {}'.format(loss_test))
    print('Metric Test: {}'.format(metric_test))
    print('Done.')
    


    title_test = {
        'loss': loss_test.item(),
        'metric': u.round_dict_array(metric_test)
    }
    cfg.res['loss_test'] = [title_test['loss']]
    for key, metric in metric_test.items():
        cur_metric = metric.cpu().numpy()
        if cur_metric.shape == ():
            cfg.res['metric_test_{}'.format(key)] = [cur_metric]
        else:
            for idx_met, met in enumerate(cur_metric):
                cfg.res['metric_test_{}_{}'.format(key, idx_met)] = [met]


    return cfg.res
Example #14
 def save(self):
     save_pickle(reduce_mem_usage(self.train), self.train_path)
     save_pickle(reduce_mem_usage(self.test), self.test_path)
Example #15
    # ===============================
    # === Make submission
    # ===============================

    sample_submission = pd.read_csv(input_dir / "sample_submission.csv")
    submission_df = make_submission(test_preds, sample_submission)

    # ===============================
    # === Save
    # ===============================

    config["eval_results"] = dict()
    for k, v in evals_results.items():
        config["eval_results"][k] = v
    save_path = output_dir / "output.json"
    save_json(config, save_path)

    plot_feature_importance(feature_importance,
                            output_dir / "feature_importance.png")

    np.save(output_dir / "oof_preds.npy", oof_preds)

    np.save(output_dir / "test_preds.npy", test_preds)

    submission_df.to_csv(output_dir / "submission.csv", index=False)

    save_pickle(models, output_dir / "model.pkl")

    slack_notify(config_name + " is done\n" + str(config))
Example #16
def scrape():
    episode_dict = build_episode_dict()
    quote_dict = build_quote_dict(episode_dict)
    print("Saving new quotes")
    save_pickle(quote_dict, quotes_file)
    return quote_dict
Example #17
def index_words(corpus_path, output_directory, min_count):
    regex = re.compile(r'<s>\s?|\s?</s>|\r\n|\n', re.MULTILINE)
    word2index = {PAD_STR: PAD, BOS_STR: BOS, EOS_STR: EOS, UNK_STR: UNK}
    index2word = {PAD: PAD_STR, BOS: BOS_STR, EOS: EOS_STR, UNK: UNK_STR}
    index2count = Counter()
    word2count = Counter()
    data = []
    with open(corpus_path, 'r') as f:
        total = sum(1 for _ in f)
    with open(corpus_path, 'r') as f:
        for sentence in tqdm(f, total=total, desc='Reading corpus file'):
            sentence = re.sub(regex, '', sentence)
            words = sentence.split()
            for word in words:
                word2count[word] += 1
            data.append(words)
    unk_cnt = 0
    for word, count in tqdm(word2count.most_common(),
                            desc='Filtering words using min_count'):
        if count >= min_count:
            ind = len(word2index)
            word2index[word] = ind
            index2word[ind] = word
            index2count[ind] = count
        else:
            unk_cnt += 1
    index2count[PAD] = 0
    index2count[UNK] = unk_cnt
    index2count[BOS] = 1  # Laplace
    index2count[EOS] = 1  # Laplace

    del word2count

    if not os.path.exists(output_directory):
        print("{} doesn't exist, creating".format(output_directory))
        os.mkdir(output_directory)

    with open(os.path.join(output_directory, VOCAB_SIZE_FNAME), 'w') as f:
        f.write(str(len(word2index)))

    print("Saving word2index...")
    save_pickle(os.path.join(output_directory, WORD2INDEX_FNAME), word2index)

    print("Saving index2word...")
    save_pickle(os.path.join(output_directory, INDEX2WORD_FNAME), index2word)
    del index2word

    print("Saving index2count...")
    save_pickle(os.path.join(output_directory, INDEX2COUNT_FNAME), index2count)
    del index2count

    def pad(l, pad_token, length):
        return l + [pad_token] * (length - len(l))

    dataset = []
    for sentence in tqdm(data, desc="Creating dataset"):
        seq = [word2index[w] if w in word2index else UNK for w in sentence]
        seq = [BOS] + seq + [EOS]
        if len(seq) < MAX_SENTENCE_LENGTH:
            dataset.append(pad(seq, PAD, MAX_SENTENCE_LENGTH))

    print("Freeing memory...")
    del data
    del word2index
    gc.collect()

    print("Creating pandas dataframe with dataset")
    df = pd.DataFrame(dataset, dtype=np.int32)
    df = df.sample(frac=1).reset_index(drop=True)
    train, test = train_test_split(df, test_size=0.2)
    train = train.reset_index(drop=True)
    test = test.reset_index(drop=True)

    print("Saving train dataset")
    train.to_csv(os.path.join(output_directory, TRAIN_DATASET_FNAME),
                 index=False,
                 header=False)

    print("Saving test dataset")
    test.to_csv(os.path.join(output_directory, TEST_DATASET_FNAME),
                index=False,
                header=False)
Example #18
 def save(self, filename):
     save_dict = {'log_pattern_scores': self.log_pattern_scores, 
                  'pattern_count': self.pattern_count}
     save_pickle(filename, save_dict)
Example #19
    def run(self, use_pickle: bool) -> None:
        """
        New runner - uses list of graphs
        :param use_pickle:
        :return:
        """
        pickle_ext = '.pkl.gz'
        self.graphs = []

        if use_pickle:
            if check_file_exists(self.graphs_pickle_path +
                                 pickle_ext):  # the whole pickle exists
                graphs = load_pickle(self.graphs_pickle_path + pickle_ext)
                assert len(graphs) == self.num_generations + 1, \
                    f'Expected {self.num_generations + 1} graphs, found {len(graphs)}'
                CP.print_green(
                    f'Using completed pickle at {self.graphs_pickle_path + pickle_ext!r}. Loaded {len(graphs)} graphs'
                )
                return
            else:
                temp_file_pattern = re.compile(
                    rf'list_(\d+)_{self.trial}_temp_(\d+)\.pkl\.gz')
                dir_name = '/'.join(self.graphs_pickle_path.split('/')[:-1])

                input_files = [
                    f for f in os.listdir(dir_name)
                    if re.match(temp_file_pattern, f)
                ]
                if len(input_files) > 0:
                    assert len(input_files) == 1, \
                        f'More than one match found: {input_files}'

                    input_file = input_files[0]
                    total_generations, progress = map(
                        int,
                        temp_file_pattern.fullmatch(input_file).groups())
                    graphs = load_pickle(join(dir_name, input_file))
                    assert len(graphs) == progress + 1, \
                        f'Found {len(graphs)} graphs, expected {progress + 1}'
                    CP.print_blue(
                        f'Partial pickle found at {input_file!r} trial: {self.trial} progress: {progress}/{total_generations}'
                    )
                    self.graphs = graphs

        remaining_generations = self.num_generations - len(self.graphs)

        tqdm.write(
            f'Running Infinity Mirror on {self.initial_graph.name!r} {self.initial_graph.order(), self.initial_graph.size()} {self.model.model_name!r} {remaining_generations} generations'
        )
        pbar = tqdm(total=remaining_generations,
                    bar_format='{l_bar}{bar}|[{elapsed}<{remaining}]',
                    ncols=50)

        if len(self.graphs) == 0:
            self.initial_graph.level = 0
            self.graphs = [self.initial_graph]
            self.features = [None]

        completed_trial = False
        for i in range(len(self.graphs) - 1, self.num_generations):
            if i == len(self.graphs) - 1:
                curr_graph = self.graphs[-1]  # use the last graph

            level = i + 1
            try:
                fit_time_start = time.perf_counter()
                self.model.update(
                    new_input_graph=curr_graph)  # update the model
                fit_time = time.perf_counter() - fit_time_start
            except Exception as e:
                fit_time = np.nan
                print(f'Model fit failed {e}')
                break

            try:
                gen_time_start = time.perf_counter()
                generated_graphs = self.model.generate(
                    num_graphs=self.num_graphs,
                    gen_id=level)  # generate a new set of graphs
                gen_time = time.perf_counter() - gen_time_start
            except Exception as e:
                gen_time = np.nan
                print(f'Generation failed {e}')
                break

            if self.features:
                self.features.append(self.model.params)
            curr_graph = generated_graphs[
                0]  # we are only generating one graph
            curr_graph.name = f'{self.initial_graph.name}_{level}_{self.trial}'
            curr_graph.gen = level
            self.graphs.append(curr_graph)

            temp_pickle_path = self.graphs_pickle_path + f'_temp_{level}{pickle_ext}'
            prev_temp_pickle_path = self.graphs_pickle_path + f'_temp_{level-1}{pickle_ext}'

            temp_features_path = self.graphs_features_path + f'_temp_{level}{pickle_ext}'
            prev_temp_features_path = self.graphs_features_path + f'_temp_{level-1}{pickle_ext}'

            save_pickle(obj=self.graphs, path=temp_pickle_path)
            save_pickle(obj=self.features, path=temp_features_path)

            delete_files(prev_temp_pickle_path)
            delete_files(prev_temp_features_path)

            self.write_timing_csv(iter_=level,
                                  fit_time=fit_time,
                                  gen_time=gen_time)

            if level == self.num_generations:
                completed_trial = True
            pbar.update(1)
        pbar.close()

        if completed_trial:  # only delete the temp pickle if the trial finishes successfully
            delete_files(
                temp_pickle_path
            )  # delete the temp file if the loop finishes normally
            delete_files(
                temp_features_path
            )  # delete the temp file if the loop finishes normally
            CP.print_green(
                f'List of {len(self.graphs)} Graphs is pickled at "{self.graphs_pickle_path + pickle_ext}"'
            )
            save_pickle(obj=self.graphs,
                        path=self.graphs_pickle_path + pickle_ext)
            save_pickle(obj=self.features,
                        path=self.graphs_features_path + pickle_ext)
        return
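run() above checkpoints after every generation: it pickles the accumulated graph list to a _temp_<level> file, deletes the previous temp file, and only writes the final pickle (and removes the last temp file) once all generations finish, which is what makes the partial-pickle resume at the top of the method possible. Stripped of the model-fitting details, the pattern looks roughly like the sketch below; every name in it is illustrative rather than part of the class above.

import os
import pickle


def run_with_checkpoints(state, total_steps, step_fn, base_path):
    # Resume-friendly loop in the spirit of run(): state is the list built so far
    # (state[0] is the initial item, like self.graphs), step_fn produces the next
    # item from the last one, base_path is the final pickle path without extension.
    steps_done = len(state) - 1
    temp = f'{base_path}_temp_{steps_done}.pkl' if steps_done else None

    for step in range(steps_done + 1, total_steps + 1):
        state.append(step_fn(state[-1]))

        prev, temp = temp, f'{base_path}_temp_{step}.pkl'
        with open(temp, 'wb') as f:              # checkpoint everything done so far
            pickle.dump(state, f)
        if prev and os.path.exists(prev):        # keep only the newest checkpoint
            os.remove(prev)

    with open(f'{base_path}.pkl', 'wb') as f:    # completed run: write the final pickle
        pickle.dump(state, f)
    if temp and os.path.exists(temp):
        os.remove(temp)
    return state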
Example #20
    def fit(self, input_df: XDataFrame) -> None:
        """Fit to data frame

        Args:
            input_df (XDataFrame): Input data frame.
        Returns:
            XDataFrame : Output data frame.
        """
        org_cols = input_df.columns.tolist()

        input_df = (input_df.to_pandas()
                    if isinstance(input_df, cudf.DataFrame) else input_df)

        seen_cols_pairs = (load_pickle(self.save_path /
                                       "seen_feats_pairs.pkl") if
                           (self.save_path / "seen_feats_pairs.pkl").exists()
                           else defaultdict(list))
        removed_cols_pairs = (load_pickle(self.save_path /
                                          "removed_feats_pairs.pkl") if
                              (self.save_path /
                               "removed_feats_pairs.pkl").exists() else
                              defaultdict(list))
        removed_cols = sum(removed_cols_pairs.values(), [])
        if self.dry_run:
            self._selected_cols = [
                col for col in org_cols if col not in set(removed_cols)
            ]
            return

        org_cols = [col for col in org_cols if col not in removed_cols]
        counter = 0
        for i in tqdm(range(len(org_cols) - 1)):
            feat_a_name = org_cols[i]
            if feat_a_name in removed_cols:
                continue

            feat_a = input_df[feat_a_name]

            for j in range(i + 1, len(org_cols)):
                feat_b_name = org_cols[j]

                if self._has_seen(feat_a_name, feat_b_name, seen_cols_pairs):
                    continue
                else:
                    seen_cols_pairs[feat_a_name].append(feat_b_name)
                    seen_cols_pairs[feat_b_name].append(feat_a_name)

                if self._has_removed(feat_a_name, feat_b_name, removed_cols):
                    continue

                feat_b = input_df[feat_b_name]
                c = np.corrcoef(feat_a, feat_b)[0][1]

                if abs(c) > self._threshold:
                    counter += 1
                    removed_cols.append(feat_b_name)
                    removed_cols_pairs[feat_a_name].append(feat_b_name)
                    print("{}: FEAT_A: {} FEAT_B: {} - Correlation: {}".format(
                        counter, feat_a_name, feat_b_name, c))

        save_pickle(removed_cols_pairs,
                    self.save_path / "removed_feats_pairs.pkl")
        save_pickle(seen_cols_pairs, self.save_path / "seen_feats_pairs.pkl")
        self._selected_cols = [
            col for col in org_cols if col not in set(removed_cols)
        ]
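fit() above greedily removes the second feature of every pair whose absolute Pearson correlation exceeds the threshold, and pickles the seen/removed pair bookkeeping so later runs can skip work. Without the caching and the cudf handling, the core selection rule reduces to roughly the following sketch; the function name and default threshold are illustrative.

import numpy as np
import pandas as pd


def drop_correlated_features(df: pd.DataFrame, threshold: float = 0.95) -> list:
    # Greedy pairwise filter: walk the columns in order and drop the later column
    # of any pair with |Pearson correlation| above threshold (illustrative sketch).
    cols = df.columns.tolist()
    removed = set()
    for i, col_a in enumerate(cols):
        if col_a in removed:
            continue
        for col_b in cols[i + 1:]:
            if col_b in removed:
                continue
            corr = np.corrcoef(df[col_a], df[col_b])[0, 1]
            if abs(corr) > threshold:
                removed.add(col_b)
    return [c for c in cols if c not in removed]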
Example #21
    df2 = df2.set_index('State')
    df1.sort_index(inplace=True)
    df2.sort_index(inplace=True)

    df1['Population65+%'] = df2['Population65+%']

    print(df1[['Population', 'Density', 'Population65+%']])
    return df1[['Population', 'Density', 'Population65+%']].to_numpy()


if __name__ == "__main__":
    df = pd.read_csv(SOURCE_PATH / 'data/COVID19.csv')
    travel_df = pd.read_csv(SOURCE_PATH / 'data/travel_data.csv')
    selected_cols = ['Province', 'Country', 'Lat', 'Long', 'Date', 'Value']
    list_of_states = load_pickle(SOURCE_PATH / 'data/us_states_list.pkl')
    df = df[selected_cols]
    features, df = get_US_states_data(list_of_states, df)
    dist_matrix = create_dist_matrix(df)
    travel_matrix = create_flight_matrix(list_of_states, travel_df)
    age_df = pd.read_csv(SOURCE_PATH / 'data/population_old_usa.csv')
    pop_df = pd.read_csv(SOURCE_PATH / 'data/population_density_usa.csv')
    pop_age_df = get_us_pop_data(pop_df, age_df)
    features = np.append(features, pop_age_df, axis=1)
    print(travel_matrix)
    save_pickle(features, SOURCE_PATH / 'data/features.pkl')
    save_pickle(dist_matrix, SOURCE_PATH / 'data/dist_matrix.pkl')
    save_pickle(travel_matrix, SOURCE_PATH / 'data/travel_matrix.pkl')
    a = load_pickle('data/pop_info')
    print(a)