Example #1
    def create_train_and_val_data(
        self,
        save_file_name: Union[None, str] = None
    ) -> Tuple[pd.DataFrame, pd.DataFrame, pd.DataFrame, pd.DataFrame]:
        """Creates the training and validation sets and labels

        The sets contain the top 20 ms2ds matches of each spectrum and a
        collection of different scores and data of these matches in a
        pd.DataFrame. The labels contain a dataframe with the tanimoto scores.
        Args
        ----
        save_file_name:
            File name to which the result will be stored. The result is stored
            as a pickled file of a tuple containing the training_set, the
            training_labels, the validation_set and the validation_labels in
            that order.
            """
        training_set, training_labels = \
            self.get_matches_info_and_tanimoto(self.training_spectra)
        validation_set, validation_labels = \
            self.get_matches_info_and_tanimoto(self.validation_spectra)

        if save_file_name:
            with open(save_file_name, "wb") as new_file:
                pickle.dump((training_set, training_labels, validation_set,
                             validation_labels), new_file)
        return training_set, training_labels, validation_set, validation_labels
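A minimal loader sketch (not part of the original class) for reading the pickled tuple back in the documented order:

import pickle

def load_train_and_val_data(save_file_name):
    # Hypothetical helper: unpickles the tuple written above in the documented
    # order (training_set, training_labels, validation_set, validation_labels).
    with open(save_file_name, "rb") as saved_file:
        return pickle.load(saved_file)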
Example #2
    def dump(self, rv: Any) -> None:
        """Dump data to the cache as a pickle.

        :param rv: The arbitrary python object to dump
        """
        with open(self.path, "wb") as file:
            pickle.dump(rv, file, protocol=pickle.HIGHEST_PROTOCOL)
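For completeness, a hedged sketch of the matching read side; load_cached is an illustrative name, not part of the original cache class:

import pickle
from typing import Any

def load_cached(path: str) -> Any:
    # Hypothetical read side of the cache: returns whatever dump() pickled.
    with open(path, "rb") as file:
        return pickle.load(file)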
Example #3
 def save(self, path):  # FilePath -> IO ()
     # ensure the parent directory exists, then write a gzipped pickle
     par = os.path.split(path)[0]
     if par and not os.path.exists(par):
         os.makedirs(par)
     with gzip.open(path, 'wb') as f:
         pickle.dump(self.db, f, -1)
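The gzip-compressed pickle can be read back the same way; a minimal sketch with a hypothetical load_db helper:

import gzip
import pickle

def load_db(path):
    # Hypothetical loader for the gzipped pickle written by save() above.
    with gzip.open(path, 'rb') as f:
        return pickle.load(f)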
Example #4
    def get_kNN_ml_data(self, name2save=None):
        train_jet_particle, test_jet_particle = self.prepare_ml_data()
        ## compute persistence information for b0 features based on kNN graphs
        train_b0_pair = {}
        for key in train_jet_particle:
            train_b0_pair[key] = topology.ML_JetPersistance().get_kNN_ml_inputs(
                utils.get_p4(train_jet_particle[key]), k=self.k, p=self.p)

        test_b0_pair = {}
        for key in test_jet_particle:
            test_b0_pair[key] = topology.ML_JetPersistance().get_kNN_ml_inputs(
                utils.get_p4(test_jet_particle[key]), k=self.k, p=self.p)

        ml_data = {
            'train_b0': train_b0_pair,
            'test_b0': test_b0_pair,
        }

        if not name2save:
            return ml_data
        else:
            with open(name2save, 'wb') as handle:
                pickle.dump(ml_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
Example #5
def init_mnist():
    download_mnist()
    dataset = _convert_numpy()
    print("Creating pickle file ...")
    with open(save_file, 'wb') as f:
        pickle.dump(dataset, f, -1)
    print("Done!")
Example #6
    def get_DT_ml_data(self, name2save=None):
        train_jet_particle, test_jet_particle = self.prepare_ml_data()
        ## compute persistence information
        train_b0_pair = {}
        train_b1_pair = {}
        for key in train_jet_particle:
            pers_pairs = topology.ML_JetPersistance().get_ml_inputs(
                utils.get_p4(train_jet_particle[key]),
                zeta_type=self.zeta_type,
                R=self.R)
            train_b0_pair[key] = pers_pairs['b0']
            train_b1_pair[key] = pers_pairs['b1']

        test_b0_pair = {}
        test_b1_pair = {}
        for key in test_jet_particle:
            pers_pairs = topology.ML_JetPersistance().get_ml_inputs(
                utils.get_p4(test_jet_particle[key]),
                zeta_type=self.zeta_type,
                R=self.R)
            test_b0_pair[key] = pers_pairs['b0']
            test_b1_pair[key] = pers_pairs['b1']

        ml_data = {
            'train_b0': train_b0_pair,
            'train_b1': train_b1_pair,
            'test_b0': test_b0_pair,
            'test_b1': test_b1_pair
        }

        if not name2save:
            return ml_data
        else:
            with open(name2save, 'wb') as handle:
                pickle.dump(ml_data, handle, protocol=pickle.HIGHEST_PROTOCOL)
Example #7
 def get_jet_obs(self, name2save=None):
     '''
     compute N-Subjettiness for n=1-6
     '''
     train_jet_particle, test_jet_particle = self.prepare_ml_data()
     train_obs, test_obs = {}, {}
     Ns = [1, 2, 3, 4, 5, 6]
     for key in train_jet_particle:
         taus = []
         for N in Ns:
             taus.append(utils.JetObs().Njettiness(
                 utils.get_p4(train_jet_particle[key]), N=N, beta=0.2))
         train_obs[key] = np.vstack(taus).T
     for key in test_jet_particle:
         taus = []
         for N in Ns:
             taus.append(utils.JetObs().Njettiness(
                 utils.get_p4(test_jet_particle[key]), N=N, beta=0.2))
         test_obs[key] = np.vstack(taus).T
     obs = {'train': train_obs, 'test': test_obs}
     if not name2save:
         return obs
     else:
         with open(name2save, 'wb') as handle:
             pickle.dump(obs, handle, protocol=pickle.HIGHEST_PROTOCOL)
Example #8
def training_model(number_topics=10, number_words=1):
    print("Get Files")
    data = data_loader.get_processed_papers()

    data = pd.DataFrame(data,
                        columns=['paper_id', 'title', 'abstract', 'body_text'])

    text_data = []

    for tokens in data['body_text']:
        for token in tokens:
            text_data.append(token)

    dictionary = corpora.Dictionary(text_data)
    corpus = [dictionary.doc2bow(text) for text in text_data]

    pickle.dump(corpus, open('corpus.pkl', 'wb'))
    dictionary.save('dictionary.gensim')

    model = models.LdaModel(corpus,
                            num_topics=number_topics,
                            id2word=dictionary,
                            passes=15)
    model.save('model5.gensim')

    topics = model.print_topics(num_words=number_words)

    for topic in topics:
        print(topic)
Example #9
    def dump_list(self, file, *args):
        """dump_list(file, *args)

        Dumps each list in args to the file (pickle protocol 2).
        """
        for lst in args:
            pickle.dump(lst, file, 2)
Example #10
def make_dataset(input_path,
                 output_path,
                 size=128,
                 nb_points=10000,
                 number_models=None,
                 overwrite=False,
                 nb_samples_per_model=20,
                 gl_tries=5,
                 fast_skip=False):

    dataset.utils.make_dir(output_path)
    objects_path = os.path.join(input_path, "*/models/*.obj")
    objects_path = glob.glob(objects_path)
    if number_models is not None:
        objects_path = objects_path[:number_models]
    for path in tqdm.tqdm(objects_path):
        name = path.split(os.path.sep)[-3]
        object_dir = os.path.join(output_path, name)
        if fast_skip and os.path.isdir(object_dir):
            continue
        dataset.utils.make_dir(object_dir)
        pts_path = os.path.join(object_dir, "pts.pkl")
        pts = dataset.geometry.sample_points(path, nb=nb_points)
        pts = np.array(pts, dtype=np.float32)
        with open(pts_path, 'wb') as handle:
            pickle.dump(pts, handle, protocol=pickle.HIGHEST_PROTOCOL)

        for i in range(nb_samples_per_model):
            render_name = f"render_{str(i).zfill(5)}"
            mat_name = f"mat_{str(i).zfill(5)}"
            image_path = os.path.join(object_dir, render_name + ".jpg")
            mat_path = os.path.join(object_dir, render_name + ".pkl")
            mat = dataset.geometry.random_camera()

            if not overwrite and os.path.isfile(mat_path):
                continue

            for attempt in range(gl_tries):
                try:
                    color = dataset.rendering.render(path, mat, im_size=size)
                    im = Image.fromarray(color)
                    im.save(image_path)
                    with open(mat_path, 'wb') as handle:
                        pickle.dump(mat,
                                    handle,
                                    protocol=pickle.HIGHEST_PROTOCOL)
                    break
                except OpenGL.error.GLError:
                    print(f"GL Error occured, try {i}, trying again.")
                except ValueError:
                    print(f"Value error with {name}")
                    break
                except RuntimeError:
                    print(f"Runtime error with {name}")
                    break
                except TypeError:
                    print(f"Type error with {name}")
                    break
            else:
                pass
Example #11
    def login_handler(self, remember=True, cookies=True):

        if self.driver.current_url == self.dash_board_url:
            pass

        else:
            self.driver.get(self.login_url)
            try:
                with open("cookies", "rb") as f:
                    saved_cookies = pickle.load(f)
                    for cookie in saved_cookies:
                        self.driver.add_cookie(cookie)
                self.driver.refresh()

            except Exception as e:

                user_email = self.__get_xpath_elem(
                    user["user-email-field"]).send_keys(self.user_mail)
                user_password = self.__get_xpath_elem(
                    user["user-password-field"]).send_keys(self.user_pswd)

                if remember:
                    user_remember_me = self._click(user["user-remember-me"])

                self._click(user["user-login-btn"])
                self._random_wait(3, 5)

                if cookies:
                    if self.driver.current_url == self.dash_board_url:
                        with open("cookies", "wb") as f:
                            pickle.dump(self.driver.get_cookies(), f)
Example #12
def update_skills_pickle(
        from_table: str = "skills.tab", pickle_out: str = "skills.pickle"
        ) -> None:
    """
    First import, then serialize and store skills data, so that it is not hard
    coded here.

    """
    if os.path.exists(os.path.join(_TABLES_LOCATION, pickle_out)):
        if os.path.getmtime(
            os.path.join(_TABLES_LOCATION, from_table)
        ) < os.path.getmtime(os.path.join(_TABLES_LOCATION, pickle_out)):
            return

    skills_d = {'groups':[]}
    path = os.path.join(_TABLES_LOCATION, from_table)
    with open(path) as handle:
        table = handle.read().splitlines()
        for row in table:
            if not row.strip() or row.lstrip()[0] == '#':
                continue
            row = row.split('\t')
            name = row[0]
            group = row[1] if len(row) > 1 else None
            group = group if group != "" else None
            difficulty = int(row[2]) if len(row) > 2 else 0
            skills_d[name] = {'group':group,'difficulty':difficulty}
            skills_d['groups'].append(group)

    with open(os.path.join(_TABLES_LOCATION, pickle_out), "wb") as handle:
        pickle.dump(skills_d, handle, protocol=pickle.HIGHEST_PROTOCOL)
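A hedged counterpart that reads the serialized skills data back; the dict layout follows the writer above, load_skills_pickle is an illustrative name, and the module-level _TABLES_LOCATION plus os and pickle imports are assumed:

def load_skills_pickle(pickle_in: str = "skills.pickle") -> dict:
    # Hypothetical reader for the dict written by update_skills_pickle():
    # {'groups': [...], '<skill name>': {'group': ..., 'difficulty': ...}, ...}
    with open(os.path.join(_TABLES_LOCATION, pickle_in), "rb") as handle:
        return pickle.load(handle)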
Example #13
def get_credentials(logger: lg.Logger = None):
    """Get the proper credentials needed to write to the Google spreadsheet."""
    creds = None
    if osp.exists(GGL_SHEETS_TOKEN):
        if logger: logger.info(F"osp.exists({GGL_SHEETS_TOKEN})")
        with open(GGL_SHEETS_TOKEN, "rb") as token:
            creds = pickle.load(token)

    # if there are no (valid) credentials available, let the user log in.
    if not creds or not creds.valid:
        if logger: logger.info("creds is None or not creds.valid")
        if creds and creds.expired and creds.refresh_token:
            creds.refresh(Request())
            if logger: logger.debug("creds.refresh(Request())")
        else:
            flow = InstalledAppFlow.from_client_secrets_file(
                CREDENTIALS_FILE, SHEETS_RW_SCOPE)
            creds = flow.run_local_server()
            if logger: logger.debug("creds = flow.run_local_server()")
        # save the credentials for the next run
        with open(GGL_SHEETS_TOKEN, "wb") as token:
            if logger: logger.debug("pickle.dump()")
            pickle.dump(creds, token, pickle.HIGHEST_PROTOCOL)

    return creds
Example #14
def all_double_transforms_combo_60_66(testing_flag, transformation_indices, lr, batch_size, epoch_number):
    '''Part of a sweep over all 49 combinations of 2 transformations; this
    function covers transforms 60 to 66 and saves the f1 macro and micro
    scores (plus confidence data) to a pickle file.'''
    transformation_metrics = {}
    transformation_confidences = {}
    transformation_percentages = {}
    transformation_macro_at_50_percent = {}
    start_time = datetime.datetime.now()
    start_time_str = start_time.strftime("%Y%m%d-%H%M%S")

    path_to_embeddings = os.path.join(os.getcwd(), "embeddings", "chapman", "cardiac", "simclr")
    try:
        if not os.path.exists(path_to_embeddings):
            os.makedirs(path_to_embeddings)
    except OSError as err:
        print(err)

    path_to_training_percentages = os.path.join(os.getcwd(), "training_percentage", "chapman", "cardiac", "simclr")
    try:
        if not os.path.exists(path_to_training_percentages):
            os.makedirs(path_to_training_percentages)
    except OSError as err:
        print(err)

    for i in [6]:
        for j in [0, 1, 2, 3, 4, 5, 6]:
            transformation_indices = [i, j]
            string_indices = "".join([str(num) for num in transformation_indices])
            save_name = f'{start_time_str}_testing-{testing_flag}_bs-{batch_size}_transformations-{string_indices}_lr-{lr}'

            np.random.seed(7)
            user_datasets, patient_to_rhythm_dict, test_train_split_dict, working_directory = get_datasets_from_paths(testing_flag)
            np_train_data, train_labels, train_labels_dict, np_test_data, test_labels, test_labels_dict = create_train_test_datasets(user_datasets, patient_to_rhythm_dict, test_train_split_dict, working_directory)

            trained_simclr_model, epoch_losses = train_simclr(testing_flag, np_train_data, transformation_indices=transformation_indices, lr=lr, batch_size=batch_size, epoch_number=epoch_number)
            
            metrics, middle_ovo, half_ovo, mean_ovo, std_ovo, middle_ovr, half_ovr, mean_ovr, std_ovr, percentages, auc_ovo_scores, auc_ovr_scores, embeddings_data = downstream_evaluation(trained_simclr_model, np_train_data, np_test_data, train_labels, test_labels)
            print(metrics)
            print_auc_intervals(middle_ovo, half_ovo, mean_ovo, std_ovo, middle_ovr, half_ovr, mean_ovr, std_ovr)

            # store to dictionary 
            transformation_metrics[string_indices] = metrics 
            transformation_confidences[string_indices] = [middle_ovo, half_ovo, mean_ovo, std_ovo, middle_ovr, half_ovr, mean_ovr, std_ovr]
            transformation_percentages[string_indices] = [percentages, auc_ovo_scores, auc_ovr_scores]
            macro_50_percent_data = auc_ovr_scores[4]
            if macro_50_percent_data:
                transformation_macro_at_50_percent[string_indices] = macro_50_percent_data
            else:
                transformation_macro_at_50_percent[string_indices] = mean_ovr

        save_data = [transformation_metrics, transformation_confidences, transformation_percentages, transformation_macro_at_50_percent]
        
        path_to_double_transforms = os.path.join(os.getcwd(), "embeddings", "double_transforms")
        if not os.path.exists(path_to_double_transforms):
            os.makedirs(path_to_double_transforms)
        save_name = f'{start_time_str}_testing-{testing_flag}_bs-{batch_size}_lr-{lr}_chapman_cardiac-00-16.pickle'
        save_path = os.path.join(path_to_double_transforms, save_name)
        print(f'all double transform path: {save_path}')
        with open(save_path, 'wb') as f:
            pickle.dump(save_data, f)
Example #15
 def save(self, file=None, v=1):
     '''save this object'''
     if not file: file = os.path.join(self.path, self.name + '.pkl')
     with open(file, 'wb') as out:
         pickle5.dump(self, out, pickle5.HIGHEST_PROTOCOL)
     if v:
         print(colors.green + "object saved\n" + colors.yellow + file +
               colors.black)
Example #16
 def save_subscriptions(self, subs):
     """Save subscribers to file.
     """
     sub_info = {
         'last_update': datetime.now(timezone.utc),
         'subscriptions': subs}
     with open(self.settings.subs_file, 'wb') as fp:
         pickle.dump(sub_info, fp, pickle.HIGHEST_PROTOCOL)
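The matching read side is symmetrical; a minimal sketch with a hypothetical standalone load_subscriptions helper:

import pickle

def load_subscriptions(subs_file):
    # Hypothetical reader for the dict written by save_subscriptions():
    # {'last_update': <datetime>, 'subscriptions': <subs>}
    with open(subs_file, 'rb') as fp:
        return pickle.load(fp)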
Example #17
 def save_last_run(self, found_videos):
     """Save 'last run', which is just the current time to file.
     """
     last_run = {
         'last_run': datetime.now(timezone.utc),
         'found_videos': found_videos}
     with open(self.settings.last_run_file, 'wb') as fp:
         pickle.dump(last_run, fp, pickle.HIGHEST_PROTOCOL)
Example #18
def get_data(train=True):
    # feats = cPickle.load(open(coco_inception_features_path, "rb"), encoding="latin1")
    with open('../data/coco_train_v3.pik', "rb") as f:
        feats = cPickle.load(f, encoding="latin1")
    with open('../data/coco_val_ins.pik', "rb") as f:
        feats.update(cPickle.load(f, encoding="latin1"))

    sents = []
    final_feats = []
    filenames = []
    js = json.load(open(coco_dataset_path, "r"))
    for i, img in enumerate(js["images"]):
        if train and img["extrasplit"] == "val":
            continue
        if (not train) and img["extrasplit"] != "val":
            continue
        if img["filename"] not in feats:
            continue
        if train:
            for sen in img["sentences"]:
                sents.append(sen["rm_style_tokens"])
                final_feats.append(feats[img["filename"]])
                filenames.append(img["filename"])
        else:
            sents.append(img["sentences"][0]["rm_style_tokens"])
            final_feats.append(feats[img["filename"]])
            filenames.append(img["filename"])

    final_feats = np.array(final_feats)
    data_file = 'cleaned_sents_train.pkl' if train else 'cleaned_test_train.pkl'
    if os.path.exists(data_file):
        with open(data_file, 'rb') as f:
            sents = cPickle.load(f)
    else:
        m = []
        sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=3)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        # term_index is the column of the term and count_index is the
        # column of the term frequency
        sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
        for i in tqdm(sents, position=0):
            l = []
            for j in range(len(i)):
                t = correct_spell(
                    i[j].replace('NOUNNOUNNOUN', '').replace(
                        "PARTPARTPART", "").replace("FRAMENET", "").replace(
                            "ADJADJADJ", "").replace('INTJINTJINTJ',
                                                     '').lower(), sym_spell)
                l.append(t)
            m.append(l)
        sents = m

        with open(data_file, 'wb') as f:
            cPickle.dump(sents, f)
    return final_feats, filenames, sents
Example #19
 def save_dest_playlist(self, pl_id, pl_name):
     """Save playslists to file.
     """
     pl_info = {
         'last_update': datetime.now(timezone.utc),
         'name': pl_name,
         'id': pl_id}
     with open(self.settings.dest_pl_file, 'wb') as fp:
         pickle.dump(pl_info, fp, pickle.HIGHEST_PROTOCOL)
Example #20
 def dump(self, filename):
     # dump state of machine into a file
     d = dict()
     d["memory"] = self.memory
     d["stack"] = self.stack
     d["register"] = self.register
     d["xptr"] = self.exec_ptr
     with open(filename + '.pkl', 'wb+') as f:
         pickle.dump(d, f, pickle.HIGHEST_PROTOCOL)
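Restoring the machine state is the mirror image; a minimal sketch with a hypothetical load_machine_state helper:

import pickle

def load_machine_state(filename):
    # Hypothetical inverse of dump(): returns the dict with the keys
    # "memory", "stack", "register" and "xptr" written above.
    with open(filename + '.pkl', 'rb') as f:
        return pickle.load(f)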
Example #21
def downstream_evaluation_ms(byol_encoder, test_dataset, train_dataset, segment_number, path_to_embeddings, save_name):
    train_loader = DataLoader(train_dataset, batch_size=len(train_dataset), num_workers=28)
    test_loader = DataLoader(test_dataset, batch_size=len(test_dataset), num_workers=28)

    for data_label in train_loader:
        data, label = data_label
        segment_size = data.size()[2] // segment_number
        split = torch.split(data, segment_size, dim=2)
        split = split[:segment_number]
        encoded_split_numpy = [byol_encoder(split_data.float()).detach().numpy() for split_data in split]
        # take mean of encoded segments 
        byol_train_numpy_x = np.mean(np.array(encoded_split_numpy), axis=0)
        byol_train_numpy_y = label.detach().numpy()

    X_train, y_train = byol_train_numpy_x, byol_train_numpy_y

    for data_label in test_loader:
        data, label = data_label
        segment_size = data.size()[2] // segment_number
        split = torch.split(data, segment_size, dim=2)
        split = split[:segment_number]
        encoded_split_numpy = [byol_encoder(split_data.float()).detach().numpy() for split_data in split]
        # take mean of encoded segments 
        byol_test_numpy_x = np.mean(np.array(encoded_split_numpy), axis=0)
        byol_test_numpy_y = label.detach().numpy()

    X_test, y_test = byol_test_numpy_x, byol_test_numpy_y

    save_name = f'{save_name}-train_test.pickle'
    save_path = os.path.join(path_to_embeddings, save_name)
    with open(save_path, 'wb') as f:
        data = [X_train, X_test, y_train, y_test]
        pickle.dump(data, f)

    log_reg_clf = LogisticRegression(multi_class='multinomial', solver='lbfgs')
    log_reg_clf.fit(X_train, y_train)
    y_pred = log_reg_clf.predict(X_test)

    averages = ['micro', 'macro']
    metrics = {}
    for average in averages:
        f1 = f1_score(y_test, y_pred, average=average)
        precision = precision_score(y_test, y_pred, average=average)
        recall = recall_score(y_test, y_pred, average=average)
        metrics[f'f1_{average}'] = f1
        metrics[f'precision_{average}'] = precision
        metrics[f'recall_{average}'] = recall
    
    accuracy = accuracy_score(y_test, y_pred)
    metrics['accuracy'] = accuracy

    middle_macro, half_macro, mean_macro, std_macro, middle_micro, half_micro, mean_micro, std_micro = get_confidence_interval_f1_micro_macro(y_test, y_pred)    

    percentages, f1_macro_scores, f1_micro_scores = different_percentage_training(X_train, X_test, y_train, y_test)

    return metrics, middle_macro, half_macro, mean_macro, std_macro, middle_micro, half_micro, mean_micro, std_micro
Example #22
def to_bytes_gz(graph: BELGraph,
                protocol: int = pickle.HIGHEST_PROTOCOL) -> bytes:
    """Convert a graph to gzipped bytes with pickle.

    :param graph: A BEL graph
    :param protocol: Pickling protocol to use. Defaults to ``HIGHEST_PROTOCOL``.
    """
    io = BytesIO()
    with gzip.open(io, mode='wb') as file:
        pickle.dump(graph, file, protocol=protocol)
    return io.getvalue()
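The inverse direction only needs the standard library; a minimal sketch (the from_bytes_gz name is illustrative, not confirmed as part of the library's API):

import gzip
import pickle
from io import BytesIO

def from_bytes_gz(data: bytes):
    """Unpickle a graph from gzipped bytes produced by to_bytes_gz above."""
    with gzip.open(BytesIO(data), mode='rb') as file:
        return pickle.load(file)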
Example #23
    def save_model(self, path: str) -> None:
        weights_to_save = {}
        for variable in self.sess.graph.get_collection(
                tf.GraphKeys.GLOBAL_VARIABLES):
            assert variable.name not in weights_to_save
            weights_to_save[variable.name] = self.sess.run(variable)

        data_to_save = {"params": self.params, "weights": weights_to_save}

        with open(path, 'wb') as out_file:
            pickle.dump(data_to_save, out_file, pickle.HIGHEST_PROTOCOL)
Example #24
def save_train_test_embeddings(X_train, X_test, y_train, y_test,
                               path_to_embeddings, save_name):
    start_time = datetime.datetime.now()
    start_time_str = start_time.strftime("%Y%m%d-%H%M%S")
    print(f'Starting to save train test embeddings: {start_time_str}')
    save_name = f'{save_name}-{start_time_str}.pickle'
    save_path = os.path.join(path_to_embeddings, save_name)
    print(f'path to embeddings: {save_path}')
    with open(save_path, 'wb') as f:
        data = [X_train, X_test, y_train, y_test]
        pickle.dump(data, f)
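Reading the embeddings back follows the list layout used when saving; a hedged sketch with a hypothetical loader:

import pickle

def load_train_test_embeddings(save_path):
    # Unpickles the [X_train, X_test, y_train, y_test] list saved above.
    with open(save_path, 'rb') as f:
        X_train, X_test, y_train, y_test = pickle.load(f)
    return X_train, X_test, y_train, y_test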
Example #25
 def exit_handler(self):
     print(f"R/W? - {self.read_write_mode}")
     if self.read_write_mode == READ_WRITE:
         # create directory if it doesn't exist
         self.lib_file_path.parent.mkdir(parents=True, exist_ok=True)
         print(f'PICKLING before EXIT: {self.lib_file_path}')
         print(f"SIZE: {len(self.media_files.keys())}")
         pprint(self.media_files.keys())
         print(" - - ")
         with open(self.lib_file_path, 'wb') as f:
             pickle.dump(self.media_files, f, pickle.HIGHEST_PROTOCOL)
Example #26
    def save(self, path: str) -> None:
        """
        Allow to dump tree to pickle.
        .pkl file extension should be specified

        tree.save("path.pkl")
        """
        assert (self.tree), "No tree created for this class instance"

        with open(path, 'wb') as handle:
            pickle.dump(self.tree, handle)
Example #27
def create_model(X, y, model_filename, classifier_type):
    """
    Builds the classifier model

    :param X: Features
    :param y: Labels
    :param model_filename: the file where to save the model
    :param classifier_type: svm/rf
    :return:
    """
    # split dataset
    X_train, X_test, y_train, y_test = train_test_split(X,
                                                        y,
                                                        test_size=0.33,
                                                        random_state=42)

    print('Training Classifier..')
    if classifier_type == 'svm':
        # creates SVM model and fits it on training samples
        clf = make_pipeline(StandardScaler(), SVC(gamma='auto'))
        clf.fit(X_train, y_train)
        model = clf
        # stores the classifier in pickle file
        with open(model_filename, 'wb') as model_file:
            pickle.dump(model, model_file)

        # predict for test samples
        y_pred = clf.predict(X_test)
        # predict for training samples
        y_pred_train = clf.predict(X_train)

    else:
        # creates Random Forest model and fits it on training samples
        rf = RandomForestClassifier(n_estimators=50,
                                    criterion='gini',
                                    max_depth=40,
                                    max_features=11)
        rf.fit(X_train, y_train)
        model = rf
        # stores the classifier in pickle file
        with open(model_filename, 'wb') as model_file:
            pickle.dump(model, model_file)

        # predict for test samples
        y_pred = rf.predict(X_test)
        # predict for training samples
        y_pred_train = rf.predict(X_train)

    # Print accuracy score
    print("Test Accuracy score:", metrics.accuracy_score(y_test, y_pred))
    print("Train Accuracy score:",
          metrics.accuracy_score(y_train, y_pred_train))
Example #28
    def build_vocab(self, min_freq=0, max_freq=sys.maxsize):
        """
		build vocab + add eos
		encode sentence
		"""
        with open(os.path.join(self.data_dir, 'train.txt'), 'r') as fn:
            data = fn.readlines()

        if 'lambada' in self.data_dir:
            with open(os.path.join(self.data_dir, 'test.txt'), 'r') as fn:
                data.extend(fn.readlines())

            with open(os.path.join(self.data_dir, 'valid.txt'), 'r') as fn:
                data.extend(fn.readlines())

        print('building vocab ...')
        self.vocab = defaultdict(int)
        self.tok2id = {}
        self.id2tok = []

        for line in tqdm(data):
            line = line.strip().split()
            for tok in line:
                self.vocab[tok] += 1

        self.vocab = {
            a: self.vocab[a]
            for a in self.vocab
            if self.vocab[a] >= min_freq and self.vocab[a] <= max_freq
        }
        # sort vocab in case of using adaptive softmax
        self.vocab = list(
            sorted(self.vocab.items(), key=lambda a: a[1], reverse=True))
        print(self.vocab[:10])

        if 'lambada' in self.data_dir:
            self.vocab = self.vocab[:60000]
            self.vocab.append(('<unk>', 0))

        self.id2tok = ['<pad>'] + ['<eos>'] + [a[0] for a in self.vocab]
        self.tok2id = {a: i for i, a in enumerate(self.id2tok)}
        self.vocab_size = len(self.id2tok)

        print('end building vocab ...')
        print('vocab size', len(self.tok2id))
        with open(os.path.join(self.data_dir, 'vocab.pkl'), 'wb') as fn:
            pickle.dump(
                {
                    'id2tok': self.id2tok,
                    'tok2id': self.tok2id,
                    'vocab_size': self.vocab_size
                }, fn)
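A hedged sketch of loading the vocab back outside the class; the key names follow the dict pickled above:

import os
import pickle

def load_vocab(data_dir):
    # Reads the dict written by build_vocab(): 'id2tok', 'tok2id', 'vocab_size'.
    with open(os.path.join(data_dir, 'vocab.pkl'), 'rb') as fn:
        vocab = pickle.load(fn)
    return vocab['tok2id'], vocab['id2tok'], vocab['vocab_size']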
Example #29
 def build_fasttext(self, word_dict):
     # create word_vec with fastText vectors
     if not os.path.exists('word_vec.pkl'):
         ft = fasttext.load_model('cc.sv.300.bin')
         for word in tqdm(word_dict):
             vec = np.array(ft.get_word_vector(word))
             vec = vec / np.sqrt(np.sum(np.power(vec, 2)))
             self.word_vec[word] = vec
         with open('word_vec.pkl', 'wb') as f:
             pickle.dump(self.word_vec, f, pickle.HIGHEST_PROTOCOL)
     else:
         with open('word_vec.pkl', 'rb') as f:
             self.word_vec = pickle.load(f)
Example #30
    def make_api_calls(self):
        """
        Make API calls to get and store data ahead of the game. 
        """

        # Get Tweet counts for each user
        with open(f"{self.parent_dir}/data/tweet_counts_{self.uuid}.txt",
                  "wb") as data_file:
            # Initialize an empty dictionary
            counts_data = {}
            # Get number of Tweets for each user
            search_counts = Search_Counts(self.auth)
            for i, user in enumerate(self.users):
                # Make API call
                response = search_counts(f"from:{user[1:]} -is:retweet")
                if response.status_code == 200:
                    response = response.text
                    parsed = json.loads(response)
                    counts_data[user] = parsed["totalCount"]
                else:
                    return False, response.status_code
            pickle.dump(counts_data, data_file)

        # Get recent Tweets for each user
        with open(f"{self.parent_dir}/data/recent_search_{self.uuid}.txt",
                  "wb") as data_file:
            # Initialize an empty dictionary
            tweet_data = {}
            # Get Tweets for each user
            recent_search_data = Recent_Search_Data(self.auth)
            for i, user in enumerate(self.users):
                # Make API call
                response = recent_search_data(f"from:{user[1:]} -is:retweet")
                if response.status_code == 200:
                    response = response.text
                    parsed = json.loads(response)
                    # Twitter Labs endpoint subject to change
                    try:
                        tweet_data[user] = [
                            tweet["text"] for tweet in parsed["data"]
                        ]
                    except Exception:
                        tweet_data[user] = None
                else:
                    return False, response.status_code

            pickle.dump(tweet_data, data_file)

        # Store paths to wordcloud images
        self.wordcloud_paths = self.make_wordclouds()
        return True, 200