def parse_train_data(self, filepath_or_buffer):
        """
        Load the training data and derive the state built on top of it.

        Instances and labels are read with ``self._load_train_data``, checked
        for consistency, and every instance is converted to a sample with
        ``self._to_sample``. Afterwards the split position is computed from
        ``self._proportion`` and the tail of the data is kept for answers.

        :param filepath_or_buffer: source of the training data; passed
            unchanged to ``self._load_train_data``.
        """
        self._list_of_instances, self._list_of_labels = self._load_train_data(
            filepath_or_buffer)

        # Every instance must come with exactly one label.
        checks.check_equality(len(self._list_of_instances),
                              len(self._list_of_labels),
                              message="The number of read instances is not "
                              "equal to the number of labels.")
        # Sanity check on an arbitrary label: converting to the interim
        # representation and back must give the original value.
        checks.check_equality(self.to_final_label(
            self.to_interim_label([24, 42, 42])), [24, 42, 42],
                              message="Processing data methods are "
                              "not mutually inverse.")

        self._list_of_samples = [
            self._to_sample(instance) for instance in self._list_of_instances
        ]

        if self._debug:
            # Show the data size and the first few items of each list.
            print(len(self._list_of_instances))
            print(self._list_of_instances[:3])
            print(self._list_of_labels[:3])
            print(self._list_of_samples[:3])

        # Index at which the data is split: the first
        # ``_proportion * len(labels)`` items (rounded down) come before it.
        self._train_samples_num = int(self._proportion *
                                      len(self._list_of_labels))
        # Only the items from the split position onward are kept here
        # (presumably the held-out part -- confirm against the callers).
        self._chknums = self._chknums[self._train_samples_num:]
        self._answers_for_train = [
            sorted(x) for x in self._list_of_labels[self._train_samples_num:]
        ]
Example #2
0
    def train(self, train_samples, train_labels, **kwargs):
        """
        Accumulate the labels of every person and process the result.

        :param train_samples: sequence of samples in the format
            ``[[person_id, month, day], [person_id, month, day], ...]``.
        :param train_labels: sequence of labels, one per sample; labels of
            the same person are summed as numpy arrays.
        :param kwargs: must contain ``most_popular_goods``,
            ``most_popular_good_ids`` and ``max_good_id``.
        """
        checks.check_equality(
            len(train_samples), len(train_labels),
            message="Samples and labels have different sizes")

        self.most_popular_goods = kwargs["most_popular_goods"]
        self.most_popular_good_ids = kwargs["most_popular_good_ids"]

        # At least ``num_popular_ids`` popular good ids are required.
        checks.check_value(
            len(self.most_popular_good_ids),
            lower=self.num_popular_ids,
            strict_less=False,
            var_name="most_popular_good_ids")

        self.max_good_id = kwargs["max_good_id"]

        # The person id is the first field of every sample.
        owners = [sample[0] for sample in train_samples]
        for owner, label in zip(owners, train_labels):
            increment = np.array(label)
            if self.orders.get(owner) is None:
                # First label seen for this person: start the running sum.
                self.orders[owner] = increment
                continue
            self.orders[owner] += increment

        self.process_orders()
Example #3
0
    def train(self, train_samples, train_labels, **kwargs):
        """
        Remember the most popular goods and one label per person.

        :param train_samples: sequence of samples in the format
            ``[[person_id, month, day], [person_id, month, day], ...]``.
        :param train_labels: sequence of labels, one per sample.
        :param kwargs: must contain ``most_popular_goods``.
        """
        checks.check_equality(
            len(train_samples), len(train_labels),
            message="Samples and labels have different sizes")

        self.most_popular_goods = kwargs["most_popular_goods"]

        # The person id is the first field of every sample. When a person
        # occurs more than once, the label of the last occurrence is kept.
        self.latest_orders = {
            sample[0]: label
            for sample, label in zip(train_samples, train_labels)
        }
Example #4
0
    def train(self, train_samples, train_labels, **kwargs):
        """
        Fit the wrapped model on the given training data.

        :param train_samples: array-like, sparse matrix.
            Samples the model is fitted on.

        :param train_labels: array-like, sparse matrix.
            Target values, one per sample.

        :param kwargs: dict, optional(default={}).
            Extra keyword arguments forwarded to ``self.model.fit``.
        """
        samples_count = len(train_samples)
        labels_count = len(train_labels)
        checks.check_equality(
            samples_count, labels_count,
            message="Samples and labels have different sizes")

        self.model.fit(train_samples, train_labels, **kwargs)
Example #5
0
    def fit(self, train_samples, train_labels, **kwargs):
        """
        Cluster the accumulated orders and mark the goods of the largest
        cluster.

        Labels are summed per person into ``self.orders``, the resulting
        table is clustered with ``self.model.fit_predict``, and the center
        of the most frequent cluster is thresholded by
        ``self.CLUSTER_BORDER`` into ``self.largest_cluster_goods``.

        :param train_samples: sequence of samples; the first element of
            every sample is used as the person id.
        :param train_labels: sequence of labels, one per sample; labels of
            the same person are summed as numpy arrays.
        :param kwargs: not used by this method.
        """
        checks.check_equality(len(train_samples),
                              len(train_labels),
                              message="Samples and labels have different "
                              "sizes")

        persons_ids = [person_data[0] for person_data in train_samples]
        for persons_id, label in zip(persons_ids, train_labels):
            if self.orders.get(persons_id) is None:
                self.orders[persons_id] = np.array(label)
            else:
                self.orders[persons_id] += np.array(label)

        # One row per person (dict keys become the index); the predicted
        # cluster labels are stored in a single-column table.
        # NOTE(review): no index is passed for the table, so its rows are
        # presumably positional rather than keyed by person id -- confirm.
        self.clustering_table = pd.DataFrame(
            self.model.fit_predict(
                pd.DataFrame.from_dict(self.orders, orient="index")),
            columns=[self.COL_NAME])

        # value_counts() sorts by frequency in descending order, so the
        # first index entry is the label of the most populated cluster.
        cluster_id = self.clustering_table[self.COL_NAME]\
            .value_counts().index[0]
        larg_clust_center = self.model.cluster_centers_[cluster_id]
        # ``np.int`` was only an alias of the builtin ``int`` and has been
        # removed from NumPy (1.24+), so the builtin is used directly.
        self.largest_cluster_goods = (larg_clust_center >=
                                      self.CLUSTER_BORDER).astype(int)
Example #6
0
 def train(self, train_samples, train_labels, **kwargs):
     checks.check_equality(len(train_samples),
                           len(train_labels),
                           message="Samples and labels have different "
                           "sizes")
     self.most_popular_goods = kwargs["most_popular_goods"]