Example #1
def load_whole(mdir):
    obj_path = os.path.join(mdir, 'args.pkl')
    assert os.path.exists(obj_path)
    args = utils.load_obj(obj_path)
    cdir = args.cdir

    ft_names = [
        '1-gram', '2-gram', '3-gram', '4-gram', 'unicode-block', 'word'
    ]
    ft_extractors = {name: None for name in ft_names}
    for name in ft_extractors:
        cache_path = os.path.join(cdir, f'{name}.pkl')
        assert os.path.exists(cache_path)
        ft_extractors[name] = utils.load_obj(cache_path)

    cache_path = os.path.join(cdir, 'lang.pkl')
    assert os.path.exists(cache_path)
    LANG = utils.load_obj(cache_path)

    mdl = FeedforwardNetwork(args, ft_extractors, LANG)
    fmdl = os.path.join(mdir, 'mdl.pkl')
    mdl.load_state_dict(torch.load(fmdl, map_location=torch.device('cpu')))
    mdl.eval()
    utils.log(f'Loaded model from {fmdl}')

    iso_639_4 = pd.read_csv('ISO-639-4.csv', sep='\t')
    lang2label = {row.iso: row.label for idx, row in iso_639_4.iterrows()}
    utils.log(f'Loaded ISO-639-4')

    return ft_extractors, LANG, cdir, mdl, lang2label
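
Most of these examples treat load_obj and save_obj as thin pickle wrappers: load_obj takes a path (sometimes without the .pkl extension) and returns the unpickled object. Each project ships its own helper, so the following is only a minimal sketch under that assumption; the names and signatures are illustrative, not taken from any of the projects above.

import pickle


def save_obj(obj, path):
    # Serialize obj to path with pickle.
    with open(path, 'wb') as f:
        pickle.dump(obj, f, protocol=pickle.HIGHEST_PROTOCOL)


def load_obj(path):
    # Load and return the object pickled at path.
    with open(path, 'rb') as f:
        return pickle.load(f)
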
Example #2
def Q1_3():
    hashtags = [
        '#gohawks', '#nfl', '#sb49', '#gopatriots', '#patriots', '#superbowl'
    ]
    for tag in hashtags:
        X = load_obj(tag + '_Q13')[:-1, :]
        y = load_obj(tag + '_numTweetsInHour')[1:]
        model = stats_api.OLS(y, X)
        res = model.fit()
        y_pred = res.predict(X)
        y_resid = y - y_pred
        sum_err = np.sum(np.square(y_resid))  # sum of squared residuals
        print(res.summary())
        #     print(sum_err)
        rmse = sqrt(sum_err / len(y_resid))
        print('%s has RMSE of %.3f' % (tag, rmse))

        features = [
            'mentionCount', 'rankScore', 'passitivity',
            'co-occurrence_of_tags', 'unique_author'
        ]
        for i in [0, 2, 3]:
            x_plt = X[:, i]
            ys = [[y, 'Predictant']]
            x_label = features[i]
            y_label = 'number of tweets for next hour'
            title = tag + ', ' + x_label
            make_plot(x_plt,
                      ys,
                      scatter=True,
                      xlabel=x_label,
                      ylabel=y_label,
                      title=title)
        print('=============================')
Example #3
    def configure_optimizers(self):
        if 'decoder_lr' in self.cfg.optimizer.params.keys():
            params = [
                {
                    'params': self.model.decoder.parameters(),
                    'lr': self.cfg.optimizer.params.lr
                },
                {
                    'params': self.model.encoder.parameters(),
                    'lr': self.cfg.optimizer.params.decoder_lr
                },
            ]
            optimizer = load_obj(self.cfg.optimizer.class_name)(params)

        else:
            optimizer = load_obj(self.cfg.optimizer.class_name)(
                self.model.parameters(), **self.cfg.optimizer.params)
        scheduler = load_obj(self.cfg.scheduler.class_name)(
            optimizer, **self.cfg.scheduler.params)

        return [optimizer], [{
            "scheduler": scheduler,
            "interval": self.cfg.scheduler.step,
            "monitor": self.cfg.scheduler.monitor
        }]
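
In the config-driven examples (#3, #5, #9, #17, #41, #42), load_obj is clearly not a pickle loader: it is given a dotted import path such as cfg.optimizer.class_name and returns the class or function at that path, which is then instantiated with the configured params. A minimal sketch of such a resolver, assuming the usual 'package.module.Name' convention (the exact helper in those projects may differ):

import importlib


def load_obj(obj_path, default_obj_path=''):
    # Resolve a class or function from a dotted import path,
    # e.g. load_obj('torch.optim.Adam') returns the Adam class itself.
    module_path, _, obj_name = obj_path.rpartition('.')
    module = importlib.import_module(module_path or default_obj_path)
    if not hasattr(module, obj_name):
        raise AttributeError(f'Object `{obj_name}` not found in `{module_path}`')
    return getattr(module, obj_name)


# usage sketch: optimizer = load_obj('torch.optim.Adam')(model.parameters(), lr=1e-3)
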
Example #4
def get_datasets(period_params, symbols_list_name, thresholds_lst,
                 target_shift,
                 mode='all', datasets=None):
    print("Initializing datasets for periods: %s" % period_params)
    if not datasets:
        datasets = {}
    for thresholds in thresholds_lst:
        for resample_period, magic_number in period_params:
            normal_name, z_name = get_datasets_name(resample_period,
                                                    symbols_list_name,
                                                    thresholds,
                                                    target_shift)

            normal_file = os.path.join(DATA_PATH, normal_name)
            z_file = os.path.join(DATA_PATH, z_name)

            if exists_obj(normal_file) and exists_obj(z_file):
                print("Loading from cache:\n * %s\n * %s" % (
                    normal_file, z_file))
                dfn = load_obj(normal_file)
                dfz = load_obj(z_file)
            else:
                dfn, dfz = get_data(resample_period=resample_period,
                                    symbols_list_name=symbols_list_name,
                                    thresholds=thresholds,
                                    target_shift=target_shift)

            if mode == 'all' or mode == 'normal':
                datasets[normal_name] = (dfn, magic_number, thresholds)
            if mode == 'all' or mode == 'z-score':
                datasets[z_name] = (dfz, magic_number, thresholds)

    return datasets
Example #5
    def configure_optimizers(self):
        """TODO Add missing docstring."""
        if "decoder_lr" in self.cfg.optimizer.params.keys():
            params = [
                {
                    "params": self.model.decoder.parameters(),
                    "lr": self.cfg.optimizer.params.lr,
                },
                {
                    "params": self.model.encoder.parameters(),
                    "lr": self.cfg.optimizer.params.decoder_lr,
                },
            ]
            optimizer = load_obj(self.cfg.optimizer.class_name)(params)
        else:
            optimizer = load_obj(self.cfg.optimizer.class_name)(
                self.model.parameters(), **self.cfg.optimizer.params)

        # Build the scheduler for whichever optimizer was created above.
        scheduler = load_obj(self.cfg.scheduler.class_name)(
            optimizer, **self.cfg.scheduler.params)

        return (
            [optimizer],
            [{
                "scheduler": scheduler,
                "interval": self.cfg.scheduler.step,
                "monitor": self.cfg.scheduler.monitor,
            }],
        )
Example #6
    def configure_optimizers(self, *args, **kwargs):
        opt = self.conf.optimizer.class_name
        self.optimizer = load_obj(opt)(self.net.parameters(),
                                       **self.conf.optimizer.params)
        if self.conf.scheduler.class_name is None:
            return [self.optimizer]

        else:
            schedps = self.conf.scheduler
            __scheduler = load_obj(schedps.class_name)(self.optimizer,
                                                       **schedps.params)
            if not self.conf.scheduler.monitor:
                self.scheduler = {
                    "scheduler": __scheduler,
                    "interval": schedps.interval,
                    "frequency": schedps.frequency,
                }
            else:
                self.scheduler = {
                    "scheduler": __scheduler,
                    "interval": schedps.interval,
                    "frequency": schedps.frequency,
                    "monitor": schedps.monitor,
                }

            return [self.optimizer], [self.scheduler]
Example #7
 def load_index(self, fn):
     """
     Loads a pre-computed index (or indices) so we can answer queries.
     Input:
         fn - file name of the pickled index to read from disk.
     """
     utils.load_obj(fn)
Example #8
def merge_index(config, files_num):
    """
    The function loads all the temporary index files that was made by the parse_and_index function and merge them into
    a united index.
    The function deals with the capital letters rule, where all the occurences of a term are starting with capital
    letters, it will be save in all capital. Otherwise it will be saved in the lower version.
    The function also merge the entites into the inverted index in case they appear in the corpus more than once.
    The function save the merged index to the disk for future use.
    :param config: config class that contains info about where to retrieve the saved files
    :param files_num: How many temporary files to merge in each category
    :return: Number of total terms in the index
    """
    merged_index = {}

    # Just merge all the terms in the index into one index
    file_prefix = config.get_save_files_dir() + "/tmp/inverted_idx_"
    for i in range(files_num):
        current_index = utils.load_obj(file_prefix + str(i))
        for term, apperances in current_index.items():
            if term not in merged_index.keys():
                merged_index[term] = apperances
            else:
                merged_index[term] += apperances

    # Handle the capital restriction
    merged_index_after_cap = {}
    for term, value in merged_index.items():
        if term[0].islower():
            if term not in merged_index_after_cap.keys():
                merged_index_after_cap[term] = value
            else:
                merged_index_after_cap[term] += value
        else:  # case it contains uppercase
            if term.lower() in merged_index:  # the same term appears somewhere in the corpus in lowercase
                if term.lower() not in merged_index_after_cap.keys():
                    merged_index_after_cap[term.lower()] = value
                else:
                    merged_index_after_cap[term.lower()] += value
            else:  # case it is actually capital only
                merged_index_after_cap[term.upper()] = value

    # If an entity appears more than once in the corpus, it is added to the index
    entities_idxs_prefix = config.get_save_files_dir() + "/tmp/entities_idx_"
    for i in range(files_num):
        current_entities = utils.load_obj(entities_idxs_prefix + str(i))
        for term, apperances in current_entities.items():
            if apperances > 1:
                merged_index_after_cap[term] = apperances

    total_terms = len(merged_index)
    #print("Total num of terms: {}".format(total_terms))
    # Save the merged index to disk
    saving_dir = config.get_save_files_dir()
    utils.save_obj(merged_index_after_cap, saving_dir + "/inverted_index")

    return total_terms
Example #9
def get_training_dataset(cfg: DictConfig = None) -> Dict[str, Dataset]:
    """
    Get training and validation datasets.

    Parameters
    ----------
    cfg : DictConfig, optional
        Project configuration, by default None

    Returns
    -------
    Dict[str, Dataset]
        {"train": train_dataset, "valid": valid_dataset}
    """
    images_dir = to_absolute_path(cfg.data.images_folder_path)

    data = pd.read_csv(to_absolute_path(cfg.data.dataset_path))
    data["x1"] = data["x"] + data["w"]
    data["y1"] = data["y"] + data["h"]
    data["area"] = data["w"] * data["h"]

    train_ids, valid_ids = train_test_split(
        data["image_id"].unique(),
        test_size=cfg.data.validation_split,
        random_state=cfg.training.seed,
    )

    # for fast training
    if cfg.training.debug:
        train_ids = train_ids[:10]
        valid_ids = valid_ids[:10]

    train_df = data.loc[data["image_id"].isin(train_ids)]
    valid_df = data.loc[data["image_id"].isin(valid_ids)]

    train_augs_list = [
        load_obj(i["class_name"])(**i["params"])
        for i in cfg["augmentation"]["train"]["augs"]
    ]
    train_bbox_params = OmegaConf.to_container(
        (cfg["augmentation"]["train"]["bbox_params"])
    )
    train_augs = Compose(train_augs_list, bbox_params=train_bbox_params)

    valid_augs_list = [
        load_obj(i["class_name"])(**i["params"])
        for i in cfg["augmentation"]["valid"]["augs"]
    ]
    valid_bbox_params = OmegaConf.to_container(
        (cfg["augmentation"]["valid"]["bbox_params"])
    )
    valid_augs = Compose(valid_augs_list, bbox_params=valid_bbox_params)

    train_dataset = XrayDataset(train_df, "train", images_dir, cfg, train_augs)
    valid_dataset = XrayDataset(valid_df, "valid", images_dir, cfg, valid_augs)

    return {"train": train_dataset, "valid": valid_dataset}
Example #10
    def __init__(self, args):
        device = torch.device(args.gpu if args.gpu != -1 else 'cpu')
        self.device = device
        ftrain = args.ftrain
        fvalid = args.fvalid
        ftest = args.ftest
        futable = args.futable
        bsz = args.bsz
        cdir = args.cdir
        train, valid, test = self.load_data(ftrain), \
                             self.load_data(fvalid), \
                             self.load_data(ftest)
        ft_extractors = {f'{n}-gram': NgramFeature(n, vsize) for n, vsize in \
                         zip([1, 2, 3, 4], [args.vsizes[VSIZE_1GRAM],
                                            args.vsizes[VSIZE_2GRAM],
                                            args.vsizes[VSIZE_3GRAM],
                                            args.vsizes[VSIZE_4GRAM]])}
        ft_extractors['unicode-block'] = UnicodeBlockFeature()
        ft_extractors['word'] = WordFeature(args.vsizes[VSIZE_WORD])
        for name in ft_extractors:
            cache_path = os.path.join(cdir, f'{name}.pkl')
            if os.path.exists(cache_path):
                ft_extractors[name] = utils.load_obj(cache_path)
            else:
                utils.log(f'Building feature {name}')
                if 'gram' in name:
                    ft_extractors[name].build(train.txt)
                elif name == 'unicode-block':
                    ft_extractors[name].build(futable)
                elif name == 'word':
                    ft_extractors[name].build(train.txt)
                else:
                    raise NotImplementedError
                utils.save_obj(ft_extractors[name], cache_path)

        cache_path = os.path.join(cdir, 'lang.pkl')
        LANG = Lang()
        if os.path.exists(cache_path):
            LANG = utils.load_obj(cache_path)
        else:
            utils.log('Building LANG')
            LANG.build(train.lang)
            utils.save_obj(LANG, cache_path)

        utils.log('Building batches')
        self.train_iter, _ = self.build_batches(train, cdir, 'train',
                                                ft_extractors, bsz, LANG, True,
                                                device)
        self.valid_iter, _ = self.build_batches(valid, cdir, 'valid',
                                                ft_extractors, bsz, LANG,
                                                False, device)
        self.test_iter, _ = self.build_batches(test, cdir, 'test',
                                               ft_extractors, bsz, LANG, False,
                                               device)
        self.ft_extractors = ft_extractors
        self.LANG = LANG
Example #11
 def expand_query(self, query_as_list):
     new_query_list = []
     embedding_dict = utils.load_obj("embedding_dict")
     new_embedding_dict = utils.load_obj("new_embedding_dict")
     for term in query_as_list:
         if term in embedding_dict.keys():
             new_query_list.extend(
                 find_closest_embeddings(embedding_dict[term], 4,
                                         new_embedding_dict))
     return new_query_list
Example #12
    def init_embeddings(self):
        if self.one_hot_embed:
            embed_arr = utils.load_obj('datasets/context/embeddings/one_hot_33_dim')
        else:
            embed_arr = utils.load_obj('datasets/context/embeddings/norm_embed_arr_' + str(self.embed_dim))

        num_classes = embed_arr.shape[0]
        self.embeddings = torch.nn.Embedding(num_classes, self.embed_dim)
        self.embeddings.weight.requires_grad = False
        self.embeddings.weight.data.copy_(torch.from_numpy(embed_arr))
Example #13
 def set_precomputed_ct(self, base_obj_path, ancestor_dict_path,
                        sample_idx_vec_path, point_num):
     self.ct = load_obj(base_obj_path)
     self.ct.__init__()
     self.point_num = point_num
     self.ct.point_num = self.point_num
     self.ct.ancestor_dict = load_obj(ancestor_dict_path)
     self.ct.sample_idx_vec_dict = load_obj(sample_idx_vec_path)
     self.ct.fidx_vec = np.array(
         [fidx for fidx in self.ct.sample_idx_vec_dict.keys()])
Example #14
 def Init_model(self):
     #init dataloader
     self.data_loader = DataLoader_test(self.save_dir)
     # init model
     self.ort_session = onnxruntime.InferenceSession(self.save_dir +
                                                     self.model_nm)
     # init dict
     self.idx2lbl = load_obj(self.save_dir + "idx2lbl.json")
     self.idx2cls = load_obj(self.save_dir + "idx2cls.json")
     # get valid slot for a specific intent
     self.idx_mask = load_obj(self.save_dir + "idx_mask_onnx.json")
Example #15
    def initialize(self, services):
        self.services = services
        self.valid_actions_getter = MyValidActionsGetter(
            self.services.parser, self.services.perception)
        self.uncompleted_goals = self.services.goal_tracking.uncompleted_goals

        if os.path.exists(self.env_name + "_transitions"):
            self.transitions = load_obj(self.env_name + "_transitions")
        if os.path.exists(self.env_name + "_state_action_transition_count"):
            self.state_action_transition_count = load_obj(
                self.env_name + "_state_action_transition_count")
Example #16
 def get_val_slides(self, resample_round):
     patients_train = load_obj(
         'train_img_paths_DX_round_{}'.format(resample_round),
         self.DATA_SPLIT_DIR + 'train/')
     patients_train = list(
         set([p.split('/')[-1].split('.')[0][:15] for p in patients_train]))
     patients_val = load_obj(
         'val_img_paths_DX_round_{}'.format(resample_round),
         self.DATA_SPLIT_DIR + 'val/')
     patients_val = list(
         set([p.split('/')[-1].split('.')[0][:15] for p in patients_val]))
     return patients_val, patients_train
Example #17
def train(cfg: DictConfig) -> None:
    """
    Run model training.

    Parameters
    ----------
    cfg : DictConfig
        Project configuration object
    """
    model = load_obj(cfg.model.backbone.class_name)
    model = model(**cfg.model.backbone.params)

    # get number of input features for the classifier
    in_features = model.roi_heads.box_predictor.cls_score.in_features

    head = load_obj(cfg.model.head.class_name)

    # replace the pre-trained head with a new one
    model.roi_heads.box_predictor = head(in_features,
                                         cfg.model.head.params.num_classes)

    set_seed(cfg.training.seed)
    hparams = flatten_omegaconf(cfg)
    xray_detection = XrayDetection(hparams=hparams, cfg=cfg, model=model)

    callbacks = xray_detection.get_callbacks()
    loggers = xray_detection.get_loggers()

    trainer = pl.Trainer(
        logger=loggers,
        early_stop_callback=callbacks["early_stopping"],
        checkpoint_callback=callbacks["model_checkpoint"],
        **cfg.trainer,
    )
    trainer.fit(xray_detection)

    # Load the best checkpoint
    get_logger().info("Saving model from the best checkpoint...")
    checkpoints = [
        ckpt for ckpt in os.listdir("./")
        if ckpt.endswith(".ckpt") and ckpt != "last.ckpt"
    ]
    best_checkpoint_path = checkpoints[0]

    model = XrayDetection.load_from_checkpoint(best_checkpoint_path,
                                               hparams=hparams,
                                               cfg=cfg,
                                               model=model)

    save_best(model, cfg)
Example #18
    def merge_files(self, out, letter,
                    file_name_letter_idx):  # temp_letter_dict):

        permanent_file_name = out + letter
        file_name_letter_idx = utils.load_obj(out + file_name_letter_idx)
        permanent_dict_file = utils.load_obj(permanent_file_name)

        for key in file_name_letter_idx:
            if key in permanent_dict_file:
                permanent_dict_file[key].extend(file_name_letter_idx[key])
            else:
                permanent_dict_file[key] = file_name_letter_idx[key]

        utils.save_obj(permanent_dict_file, permanent_file_name)
Example #19
    def __init__(self, save_dir):
        self.save_dir = save_dir
        self.word2idx = load_obj(self.save_dir + "dict.json")
        self.config = load_obj(self.save_dir + "Config.json")
        self.max_len = self.config["max_len"]

        self.WORD = {int(k): v for k, v in self.config["WORD"].items()}
        self.BOS = self.config["BOS"]
        self.UNK = self.config["UNK"]
        self.PAD = self.config["PAD"]

        assert self.BOS == self.word2idx[self.WORD[self.BOS]]
        assert self.UNK == self.word2idx[self.WORD[self.UNK]]
        assert self.PAD == self.word2idx[self.WORD[self.PAD]]
Example #20
    def __init__(self, args_dict, set, w2i_tit, w2i, transform=None):
        """
        Args:
            set: 'train', 'val', 'test'
            w2i_tit: word to index for titles
            w2i: word to index for comments
            transform: data transform
        """
        self.args_dict = args_dict
        self.set = set

        # Load Data
        if self.set == 'train':
            textfile = args_dict.csvtrain
            self.mismtch = 0.8
        elif self.set == 'val':
            textfile = args_dict.csvval
            self.mismtch = 0
        elif self.set == 'test':
            textfile = args_dict.csvtest
            self.mismtch = 0
        df = pd.read_csv(textfile, delimiter='\t')
        self.imageurls = list(df['IMAGE_FILE'])
        self.comment_map = get_mapped_text(df, w2i, field='DESCRIPTION')
        self.titles_map = get_mapped_text(df, w2i_tit, field='TITLE')

        # Parameters
        self.numpairs = len(df) / (1 - self.mismtch)
        self.comw2i = w2i
        self.titw2i = w2i_tit
        # self.titw2i = dict([(w, i) for i, w in enumerate(titvocab)])
        self.imagefolder = args_dict.dir_images
        self.transform = transform

        # tfidf weights and vectors
        if os.path.exists(args_dict.dir_data + args_dict.tfidf_coms_file):
            self.tfidf_coms = load_obj(args_dict.dir_data +
                                       args_dict.tfidf_coms_file)
        else:
            self.tfidf_coms = self.get_tfidf(self.comment_map, self.comw2i)
            save_obj(self.tfidf_coms,
                     args_dict.dir_data + args_dict.tfidf_coms_file)

        if os.path.exists(args_dict.dir_data + args_dict.tfidf_tits_file):
            self.tfidf_tits = load_obj(args_dict.dir_data +
                                       args_dict.tfidf_tits_file)
        else:
            self.tfidf_tits = self.get_tfidf(self.titles_map, self.titw2i)
            save_obj(self.tfidf_tits,
                     args_dict.dir_data + args_dict.tfidf_tits_file)
Example #21
def merge_posting_letter(saving_dir, prefix, files_num, inverted_idx):
    """
    Merge one posting file, by it's prefix. (This task is dispatched to several processes so it runs in parallel)
    It reads all the posting dict and the entities candidate_dicts with the relevant prefix and merge them into one.
    It also makes sure that entities and capital letters are aligned with the way we dealt with it in the inverted idx
    :param saving_dir: Where to save the output and find the temp files
    :param prefix: Which posting prefix this task is being applied to
    :param files_num: How many temp files to read
    :param inverted_idx: The inverted index of the corpus that contains all the final version ok keys
    :return: Which prefix this task worked on
    """

    #print("merging posting of prefix {}, files_num: {}".format(prefix, files_num))
    loading_dir = saving_dir + '/tmp'
    file_prefix = loading_dir + "/postingDict_" + prefix + "_"
    entities_prefix = loading_dir + "/entitiesDict_" + prefix + "_"
    merged_letter_posting = {}

    # Merge all the posting entries
    for i in range(files_num):
        try:
            current_letter_posting = utils.load_obj(file_prefix + str(i))
            for term, apperances in current_letter_posting.items():
                if term in merged_letter_posting.keys():  # already found term
                    merged_letter_posting[term] += apperances
                else:
                    if term in inverted_idx:  # a valid capitalized term, or a lowercase one
                        merged_letter_posting[term] = apperances
                    else:  # a capital-term candidate that didn't make it; save it lowercased
                        merged_letter_posting[term.lower()] = apperances

            # load entities_posting and merge it
            curent_entity_posting = utils.load_obj(entities_prefix + str(i))
            for term, apperances in curent_entity_posting.items():
                if term in inverted_idx.keys():  # Valid entity
                    merged_letter_posting[term] = apperances
        except:
            pass

    # Sort every posting entry by its doc_id
    for postings_entry in merged_letter_posting.values():
        postings_entry.sort(key=lambda x: x[0])

    # Save relevant posting dict
    utils.save_obj(merged_letter_posting,
                   saving_dir + "/postingDict_" + prefix)
    #print("saved {} posting dict".format(prefix))
    return prefix
Example #22
def load_tweet_dict():
    """
    Read the tweet vector files and insert the vectors into the tweet dictionary.
    :return: the tweet dictionary including the GloVe vector data
    """
    tweet_dict = utils.load_obj("docDictionary")
    buckets = []
    for i in range(tweet_dict["metadata"]["tweet_vector_buckets"]):
        buckets.append(utils.load_obj("avgVector" + str(i)))
    for tweet_id in tweet_dict.keys():
        if tweet_id == "metadata":
            continue
        address = tweet_dict[tweet_id][5]
        tweet_dict[tweet_id][5] = buckets[address[0]][address[1]]
    return tweet_dict
Example #23
 def load_blacklist(self):
     filename = self.blacklist_filename()
     if not os.path.exists(filename):
         blacklist = set()
     else:
         blacklist = utils.load_obj(filename)
     return blacklist
Example #24
 def load_index(self, fn):
     """
     Loads a pre-computed index (or indices) so we can answer queries.
     Input:
         fn - file name of pickled index.
     """
     self.inverted_idx, self.documents = utils.load_obj(fn)
Example #25
def reconstruct_from_postings(output_path, stemming):
    postings = glob(output_path + "\\{}\\*.pkl".format("WithStem" if stemming else "WithoutStem"), recursive=True)

    reconstructed = set()
    corpus_size = 0
    total_length = 0
    for posting in postings:

        if "inverted_idx" not in posting:

            splited_path = os_path_splitext(posting)
            print(splited_path)
            file = utils.load_obj(splited_path[0])

            for doc_list in file.values():

                for doc in doc_list:

                    doc_id = doc[0]
                    doc_length = doc[4]

                    if doc_id not in reconstructed:
                        reconstructed.add(doc_id)
                        total_length += doc_length
                        corpus_size += 1

    return corpus_size, float(total_length) / corpus_size
Example #26
def train(df, attrs, clf_class, clf_name, model_params, mode, magic_number,
          dates, dataset_name, trading_params):
    trade_freq = trading_params['trade_frequency']
    name = '%s-%s-attr%s-%s-%s-%s-%s-%s_' % (
        clf_name, dataset_name, len(attrs), dict_to_str(model_params).replace(
            ' ', '_').replace(':', ''), mode, magic_number,
        pd.to_datetime(dates[0], format=DATE_FORMAT).date(),
        pd.to_datetime(dates[1], format=DATE_FORMAT).date())
    cached_file = os.path.join(CACHE_PATH + '/models/', name)

    start_date, final_date = dates
    idx = 0

    indices = sorted([
        day for day in list(set(df.index.values))
        if start_date <= day <= final_date
    ])

    print("Model and params: %s %s " % (clf_name, model_params))
    # magic number is by default 53, 52 weeks for training 1 for prediction
    while idx + magic_number < len(indices) and indices[idx + magic_number] <= \
            indices[-1]:

        if mode == CLASSIFICATION:
            train_x, train_y, test_x, test_y = \
                get_classification_data(clf_name, df, attrs, indices, idx,
                                        magic_number)
        elif mode == REGRESSION:
            # get regression datasets (target is float y -> ratio of increase)
            train_x, train_y, test_x, test_y = \
                get_regression_data(clf_name, df, attrs, indices, idx,
                                    magic_number)

        print(
            "Training %s/%s with %s instances." %
            (idx // trade_freq, len(indices) // trade_freq, train_x.shape[0]))
        sys.stdout.flush()

        clf_cached_file = cached_file + str(indices[idx])[:10]

        if not CHECKPOINTING:
            clf = clf_class(**model_params).fit(train_x, train_y)
        else:
            try:
                clf = load_obj(clf_cached_file)
            except:
                clf = clf_class(**model_params).fit(train_x, train_y)
                save_obj(clf, clf_cached_file)

        pred = clf.predict(test_x)

        # import ipdb
        # ipdb.set_trace()
        df.loc[indices[idx + magic_number], clf_name] = pred

        idx += trade_freq
    df_trade = df.dropna(axis=0)

    print("Finished training for %s" % (clf_name))
    return df_trade
Example #27
 def set_default_ct(self):
     self.ct = load_obj("cell_tracker_with_lineage")
     self.ct.__init__()
     self.point_num = 1000
     self.ct.point_num = self.point_num
     self.ct.fidx_vec = np.array(
         [fidx for fidx in self.ct.sample_idx_vec_dict.keys()])
Example #28
def classify(k, text):

    target_vec = lda_all.get_document_topics(
        dictionary_all.doc2bow(utils.tokenize(text)),
        per_word_topics=True)[0]

    closest_points = []

    with open('./data/corpus-labels.csv') as labels:
        labelreader = csv.reader(labels)

        if not os.path.exists('./data/ldaspace-titles-abstracts.pkl'):
            print "data/ldaspace-titles-abstract.pkl not found. Generating file (this may take a while)"
            save_pointcloud('./data/ldaspace-titles-abstracts')

        ldaspace = utils.load_obj('./data/ldaspace-titles-abstracts')

        for l, current_vec in zip(labelreader, ldaspace):

            dist = get_distance(current_vec, target_vec)
            if len(closest_points) >= k:
                if dist < closest_points[k - 1][1]:  # compare against the farthest kept distance
                    closest_points.pop(k - 1)
                    closest_points.append((l, dist))
            else:
                closest_points.append((l, dist))

            closest_points.sort(key=lambda point: point[1])

    category_counter = Counter()
    for x in closest_points:
        category_counter.update(x[0])

    return category_counter
Example #29
def test(args):
    # see if we already ran this experiment
    code_root = os.path.dirname(os.path.realpath(__file__))
    exp_dir = utils.get_path_from_args(
        args) if not args.output_dir else args.output_dir
    path = "{}/results/{}".format(code_root, exp_dir)
    assert os.path.isdir(path)
    task_family_test = tasks_sine.RegressionTasksSinusoidal(
        "test", args.skew_task_distribution)
    best_valid_model = utils.load_obj(os.path.join(path,
                                                   "logs")).best_valid_model
    k_shots = [5, 10, 20, 40]
    df = []
    for k_shot in k_shots:
        losses = np.array(
            eval(
                args,
                copy.copy(best_valid_model),
                task_family=task_family_test,
                num_updates=10,
                lr_inner=0.01,
                n_tasks=1000,
                k_shot=k_shot,
            ))
        for grad_step, task_losses in enumerate(losses.T, 1):
            new_rows = [[k_shot, grad_step, tl] for tl in task_losses]
            df.extend(new_rows)

    df = pd.DataFrame(df, columns=["k_shot", "grad_steps", "loss"])
    df.to_pickle(os.path.join(path, "res.pkl"))
    utils.plot_df(df, path)
Example #30
    def __init__(self, model_variables, variables, database):
        self.MV = model_variables
        self.Vars = variables
        self.DB = database
        self.model = None
        self.weight_save_path = "saved_weights"
        self.outputs_path = "outputs"
        self.model_name = self.Vars["name"]

        if self.Vars["class"] == "age":
            self.model_class = "age"
            self.class_count = self.DB.age_class_count
            self.class_labels = self.DB.age_labels
            self.db_train_path = self.DB.db_age_train_folder_path
            self.db_test_path = self.DB.db_age_test_folder_path
            self.mean_image = myutils.load_image(self.DB.age_mean_image_path)
        elif self.Vars["class"] == "sex":
            self.model_class = "sex"
            self.class_count = self.DB.sex_class_count
            self.class_labels = self.DB.sex_labels
            self.db_train_path = self.DB.db_sex_train_folder_path
            self.db_test_path = self.DB.db_sex_test_folder_path
            self.mean_image = myutils.load_image(self.DB.sex_mean_image_path)

        self.class_weights = myutils.load_obj(self.DB.db_new_path + "/" +
                                              self.model_class)
Example #31
def show_examples(args):

    # Load reason instances and embeddings
    fileembds = os.path.join(args.data_dir, args.embsfile)
    embeddings = utils.load_obj(fileembds)
    allreasons = read_data(args)
    allinds = list(range(len(allreasons)))
    assert len(allreasons) == embeddings.shape[0]
    numReasons = len(allreasons)

    # Get a random instance, compute scores and show most similar
    thisidx = random.sample(allinds, 1)[0]
    thisreason = allreasons[thisidx]
    print('-' * 25)
    print("REASON: {}".format(thisidx))
    print(thisreason)
    print('.')

    # Compute scores and sort
    allscores = sklearn.metrics.pairwise.cosine_similarity(embeddings)
    thisscores = allscores[thisidx, :]
    ranking = np.argsort(thisscores)[::-1].tolist()
    sortedscores = np.sort(thisscores)[::-1].tolist()

    # show the top 10 matches
    numshow = 10
    print("MATCHES")
    for k in list(range(numshow)):
        kidx = ranking[k]
        score = sortedscores[k]
        reason = allreasons[kidx]
        print("sample %d, score %.03f: %s" % (kidx, score, reason))

    return allscores
Example #32
    def load_dicts(self, variant):
        filename = self.cache_filename(variant)
        if not os.path.exists(filename):
            cache = self.default_cache()
        else:
            cache = utils.load_obj(filename)

        return cache
Example #33
def main(args):

    if len(args) != 2:
        print "Usage: mds.py clustering.pkl"
        print "     C is the cluster in clustering.pkl to display"
        sys.exit(0)

    path = args[1]

    print "Loading"
    clusters = clustering = utils.load_obj(path)

    #map(lambda c: c.set_label(), clustering)
    for i in [5]:  
        clusters = reclusterWithOPTICS(clusters, i)
    
        _docs = reduce(lambda x,y: x+y, map(lambda c: c.members, clusters))
    
    
        confirm = BaseCONFIRM(_docs)
        confirm.clusters = clusters
    
        print "Original Number of Clusters:", len(clustering)
        print "Final Number of Clusters:", len(clusters)
    
        '''print reps
    
        imgs = []
        
        for idx in reps:
            if idx == 0:
                imgs.append(clustering[i].center)
            else:
                idx = idx -1
                imgs.append(clustering[i].members[idx])
                
        
        display(imgs)'''
    #print  len(selectWithHac(clustering))

    #print streamSelector(clustering)

    #print entropy(clustering)
    
    #print "Analyzing"
        analyzer = metric.KnownClusterAnalyzer(confirm)
        analyzer.print_all()

        print "User Queries:", QueryCount
Example #34
 def get(self, filename):
     with self._disk_lock:
         if type(filename) == types.UnicodeType:
             filename = filename.encode('utf-8')        
         data = super(LifoCache, self).get(filename)
         if not data:
             self.disk_read_count += 1
             if filename in self.disk_cache:
                 if os.path.exists(self.disk_cache_dir + filename):
                     self.disk_read_hit += 1
                     data = utils.load_obj(self.disk_cache_dir + filename)
                 del self.disk_cache[filename]
                 if data:
                     self.disk_cache[filename] = True
                     super(LifoCache, self).set(filename, data)
     return data
Example #35
def main(args):

    if(len(args) != 3):
        print "Usage: clusterFrame.py C clustering.pkl"
        print "     C is the cluster in clustering.pkl to display"
        sys.exit(0)

    C = int(args[1])
    path = args[2]

    clustering = utils.load_obj(path)

    root = Tk()
    frame = ClusterFrame(root, clustering[C])
    frame.grid()
    root.mainloop()  
Example #36
 def __load_imagenet_weights(self):
     variables = tf.get_collection(tf.GraphKeys.GLOBAL_VARIABLES)
     try:
         print("Loading ImageNet pretrained weights...")
         dict = load_obj(self.args.pretrained_path)
         run_list = []
         for variable in variables:
             for key, value in dict.items():
                 # Adding ':' means that we are interested in the variable itself and not the variable parameters
                 # that are used in adaptive optimizers
                 if key + ":" in variable.name:
                     run_list.append(tf.assign(variable, value))
         self.sess.run(run_list)
         print("Weights loaded\n\n")
     except KeyboardInterrupt:
         print("No pretrained ImageNet weights exist. Skipping...\n\n")
Example #37
def main(args):

    if(len(args) != 3):
        print "Usage: clusterFrame.py C clustering.pkl"
        print "     C is the cluster in clustering.pkl to display"
        sys.exit(0)

    C = int(args[1])
    path = args[2]

    print "Loading"
    clustering = utils.load_obj(path)
    #clustering  = doc.get_docs_nested(driver.get_data_dir("very_small"))
        
    hierarchy = Hierarchy.createHierarchy(clustering)

    print "Starting GUI"
    root = Tk()
    frame = GraphFrame(root, hierarchy)
    frame.pack(fill=BOTH,expand=1)
    root.mainloop()  
Example #38
 def _load(self, filename):
     # no need of a lock here
     items = utils.load_obj(filename)
     for d in items:
         self.put(*d)
Example #39
def default_jobs():
    return {
        "match_queue": job_queue.JobQueue(),
        "split_queue": job_queue.JobQueue(),
        "number_of_match_job": 0,
        "number_of_split_job": 0,
    }


if __name__ == "__main__":
    try:
        cache_dir = "match_and_split_text_layer"
        if not os.path.exists(os.path.expanduser("~/cache/" + cache_dir)):
            os.mkdir(os.path.expanduser("~/cache/" + cache_dir))
        # qdel send a SIGUSR2 if -notify is used when starting the job.
        # signal.signal(signal.SIGUSR2, on_exit)
        try:
            jobs = utils.load_obj("wsdaemon.jobs")
        except:
            jobs = default_jobs()

        thread.start_new_thread(job_thread, (jobs["match_queue"], do_match))
        thread.start_new_thread(job_thread, (jobs["split_queue"], do_split))
        bot_listening()
    except KeyboardInterrupt:
        pywikibot.stopme()
        os._exit(1)
    finally:
        pywikibot.stopme()
Example #40
 def __init__(self, filename):
     self.base_path = "/".join(filename.split('/')[:-1])
     self.index = utils.load_obj(filename + '.index')
     self.fd_data = open(filename)
Example #41
 def load_thrift_app(self):
     return utils.load_obj(self.app_uri)
Example #42
 def load(self):
     self.chdir()
     self.tfactory = utils.load_obj(self.cfg.thrift_transport_factory)()
     self.pfactory = utils.load_obj(self.cfg.thrift_protocol_factory)()
     self.thrift_app = self.load_thrift_app()
     return lambda: 1