Code Example #1
class ApplyDimensionReductionModel(gokart.TaskOnKart):
    task_namespace = 'redshells.word_item_similarity'
    item2embedding_task = gokart.TaskInstanceParameter(
        description=
        'A task outputs item2embedding data with type = Dict[Any, np.ndarray].'
    )
    dimension_reduction_model_task = gokart.TaskInstanceParameter(
        description=
        'A task outputs a model instance of `DimensionReductionModel`.'
    )
    l2_normalize = luigi.BoolParameter()  # type: bool
    output_file_path = luigi.Parameter(
        default='app/word_item_similarity/dimension_reduction_model.pkl'
    )  # type: str

    def requires(self):
        return dict(item2embedding=self.item2embedding_task,
                    model=self.dimension_reduction_model_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        item2embedding = self.load(
            'item2embedding')  # type: Dict[Any, np.ndarray]
        model = self.load('model')
        items = list(item2embedding.keys())
        embeddings = model.apply(np.array(list(item2embedding.values())))
        if self.l2_normalize:
            embeddings = sklearn.preprocessing.normalize(embeddings,
                                                         axis=1,
                                                         norm='l2')
        self.dump(dict(zip(items, list(embeddings))))
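
A minimal wiring sketch for this task; `MakeItem2Embedding` and `TrainDimensionReductionModel` are hypothetical upstream tasks standing in for whatever produces the required outputs, and `gokart.build` is one way to run the resulting pipeline:

# Sketch only: the two upstream task classes below are hypothetical stand-ins.
import gokart

task = ApplyDimensionReductionModel(
    item2embedding_task=MakeItem2Embedding(),  # outputs Dict[Any, np.ndarray]
    dimension_reduction_model_task=TrainDimensionReductionModel(),  # outputs a DimensionReductionModel
    l2_normalize=True)
item2reduced = gokart.build(task)  # runs the dependencies and returns the dumped dict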
Code Example #2
class TrainLdaModel(gokart.TaskOnKart):
    output_file_path = luigi.Parameter(
        default='model/lda_model.pkl')  # type: str
    tokenized_text_data_task = gokart.TaskInstanceParameter(
        description=
        'A task outputs tokenized texts with type "List[List[str]]".')
    dictionary_task = gokart.TaskInstanceParameter(
        description='A task outputs gensim.corpora.Dictionary.')
    lda_model_kwargs = luigi.DictParameter(
        default=dict(n_topics=100,
                     chunksize=16,
                     decay=0.5,
                     offset=16,
                     iterations=3,
                     eta=1.e-16),
        description='Arguments for redshells.model.LdaModel.'
    )  # type: Dict[str, Any]

    def requires(self):
        return dict(tokenized_texts=self.tokenized_text_data_task,
                    dictionary=self.dictionary_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        tokenized_texts = self.load('tokenized_texts')  # type: List[List[str]]
        dictionary = self.load('dictionary')  # type: gensim.corpora.Dictionary
        model = redshells.model.LdaModel(**self.lda_model_kwargs)
        model.fit(texts=tokenized_texts, dictionary=dictionary)
        self.dump(model)
Code Example #3
class CalculateSimilarityWithMatrixFactorization(gokart.TaskOnKart):
    """Calculate similarity between items using latent factors calculated by matrix factorization.
    """
    task_namespace = 'redshells.word_item_similarity'
    target_item_task = gokart.TaskInstanceParameter(
        description='A task outputs item ids as type List.')
    matrix_factorization_task = gokart.TaskInstanceParameter(
        description='A task instance of `TrainMatrixFactorization`.')
    normalize = luigi.BoolParameter(
        description='Normalize item factors with l2 norm.')  # type: bool
    batch_size = luigi.IntParameter(default=1000, significant=False)
    output_file_path = luigi.Parameter(
        default=
        'app/word_item_similarity/calculate_similarity_with_matrix_factorization.zip'
    )  # type: str

    def requires(self):
        assert type(self.matrix_factorization_task) == redshells.train.TrainMatrixFactorization,\
            f'matrix_factorization_task must be an instance of TrainMatrixFactorization, but actually {type(self.matrix_factorization_task)} is passed.'
        return dict(data=self.target_item_task,
                    model=self.matrix_factorization_task)

    def output(self):
        return self.make_large_data_frame_target(self.output_file_path)

    def run(self):
        tf.reset_default_graph()
        data = self.load('data')  # type: List
        model = self.load('model')  # type: redshells.model.MatrixFactorization

        data = list(set(data))
        item_ids = model.get_valid_item_ids(data)
        factors = model.get_item_factors(item_ids, normalize=self.normalize)
        # Usually, the size of item_ids is too large to calculate similarities at once. So I split the data.
        split_size = factors.shape[0] // self.batch_size + 1
        factors_sets = np.array_split(factors, split_size)
        item_ids_sets = np.array_split(item_ids, split_size)

        def _calculate(x, y, x_ids, y_ids):
            if np.array_equal(x_ids, y_ids):
                indices = np.triu_indices(x_ids.shape[0], k=1)
            else:
                indices_ = np.indices([x_ids.shape[0], y_ids.shape[0]])
                indices = (indices_[0].flatten(), indices_[1].flatten())

            df = pd.DataFrame({
                'item_id_0': list(x_ids[indices[0]]),
                'item_id_1': list(y_ids[indices[1]]),
                'similarity': list(np.dot(x, y.T)[indices])
            })
            return df

        results = pd.concat([
            _calculate(factors_sets[i], factors_sets[j], item_ids_sets[i],
                       item_ids_sets[j]) for i, j in tqdm(
                           list(
                               itertools.combinations_with_replacement(
                                   range(split_size), 2)))
        ])
        self.dump(results)
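
The `_calculate` helper enumerates index pairs differently for the diagonal blocks: `np.triu_indices(n, k=1)` yields only the strictly upper-triangular pairs, so an item is never compared with itself and each unordered pair appears once. A self-contained illustration with toy data:

import numpy as np

ids = np.array(['a', 'b', 'c'])
factors = np.random.rand(3, 4)
rows, cols = np.triu_indices(len(ids), k=1)  # index pairs (0, 1), (0, 2), (1, 2)
similarities = np.dot(factors, factors.T)[rows, cols]
print(list(zip(ids[rows], ids[cols], similarities)))  # ('a', 'b', ...), ('a', 'c', ...), ('b', 'c', ...)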
Code Example #4
class CalculateDocumentEmbedding(gokart.TaskOnKart):
    """
    Calculate document embeddings with SCDV.
    """
    task_namespace = 'redshells.word_item_similarity'
    document_task = gokart.TaskInstanceParameter()
    scdv_task = gokart.TaskInstanceParameter()
    item_id_column_name = luigi.Parameter()  # type: str
    document_column_name = luigi.Parameter()  # type: str
    l2_normalize = luigi.BoolParameter()  # type: bool
    output_file_path = luigi.Parameter(
        default='app/word_item_similarity/calculate_document_embedding.pkl'
    )  # type: str

    def requires(self):
        return dict(document=self.document_task, scdv=self.scdv_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        scdv = self.load('scdv')
        document = self.load_data_frame('document',
                                        required_columns={
                                            self.item_id_column_name,
                                            self.document_column_name
                                        })

        documents = document[self.document_column_name].tolist()
        embeddings = scdv.infer_vector(documents,
                                       l2_normalize=self.l2_normalize)
        self.dump(
            dict(
                zip(document[self.item_id_column_name].tolist(),
                    list(embeddings))))
Code Example #5
class FilterItemByWordSimilarity(gokart.TaskOnKart):
    word2items_task = gokart.TaskInstanceParameter()
    word2embedding_task = gokart.TaskInstanceParameter()
    item2title_embedding_task = gokart.TaskInstanceParameter()
    no_below = luigi.FloatParameter()
    output_file_path = luigi.Parameter(
        default='app/word_item_similarity/filter_item_by_word_similarity.pkl'
    )  # type: str

    def requires(self):
        return dict(word2items=self.word2items_task,
                    word2embedding=self.word2embedding_task,
                    item2title_embedding=self.item2title_embedding_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        word2items = self.load('word2items')
        word2embedding = self.load('word2embedding')
        item2title_embedding = self.load('item2title_embedding')

        filtered_word2items = defaultdict(list)
        for word, items in word2items.items():
            word_embedding = word2embedding[word]
            for item in items:
                title_embedding = item2title_embedding[item]
                if np.inner(word_embedding, title_embedding) > self.no_below:
                    filtered_word2items[word].append(item)

        self.dump(dict(filtered_word2items))
Code Example #6
class MergeData(gokart.TaskOnKart):
    task_namespace = 'm5-forecasting'

    calendar_data_task = gokart.TaskInstanceParameter()
    selling_price_data_task = gokart.TaskInstanceParameter()
    sales_data_task = gokart.TaskInstanceParameter()

    def requires(self):
        return dict(calendar=self.calendar_data_task, selling_price=self.selling_price_data_task,
                    sales=self.sales_data_task)

    def run(self):
        calendar = self.load_data_frame('calendar')
        selling_price = self.load_data_frame('selling_price')
        sales = self.load_data_frame('sales')
        output = self._run(calendar, selling_price, sales)
        self.dump(output)

    @staticmethod
    def _run(calendar, selling_price, sales):
        sales = sales.merge(calendar, how="left", on="d")
        gc.collect()

        sales = sales.merge(selling_price, how="left", on=["store_id", "item_id", "wm_yr_wk"])
        sales.drop(["wm_yr_wk"], axis=1, inplace=True)
        del selling_price
        gc.collect()

        return sales
Code Example #7
class CalculateWordEmbedding(gokart.TaskOnKart):
    task_namespace = 'redshells.word_item_similarity'
    word_task = gokart.TaskInstanceParameter()
    word2item_task = gokart.TaskInstanceParameter()
    item2embedding_task = gokart.TaskInstanceParameter()
    output_file_path = luigi.Parameter(default='app/word_item_similarity/calculate_word_embedding.pkl')  # type: str

    def requires(self):
        return dict(word=self.word_task, word2item=self.word2item_task, item2embedding=self.item2embedding_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        word_data = self.load('word')
        word2item = self.load('word2item')
        item2embedding = self.load('item2embedding')

        results = {word: self._calculate(word2item[word], item2embedding) for word in word_data if word in word2item}
        self.dump(results)

    def _calculate(self, items, item2embedding):
        embeddings = [item2embedding[item] for item in items if item in item2embedding]
        if not embeddings:
            return None
        return sklearn.preprocessing.normalize([np.sum(embeddings, axis=0)], norm='l2', axis=1)[0]
Code Example #8
class TaskD(gokart.TaskOnKart):
    foo = gokart.TaskInstanceParameter()
    bar = gokart.TaskInstanceParameter()

    def run(self):
        x = self.load('foo')
        y = self.load('bar')
        self.dump(x + y + ['D'])
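
`TaskD` defines no explicit `requires`; in recent gokart versions the default `requires` collects `TaskInstanceParameter` fields as dependencies keyed by parameter name, so `self.load('foo')` reads the output of whatever task was passed as `foo`. A composition sketch, assuming hypothetical `TaskA` and `TaskB` tasks that each dump a list:

# Sketch only: TaskA and TaskB are hypothetical upstream tasks dumping lists.
import gokart

task = TaskD(foo=TaskA(), bar=TaskB())
result = gokart.build(task)  # e.g. the concatenation of foo's list, bar's list, and ['D']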
Code Example #9
class TrainSCDV(gokart.TaskOnKart):
    task_namespace = 'redshells'
    tokenized_text_data_task = gokart.TaskInstanceParameter(
        description=
        'A task outputs tokenized texts with type "List[List[str]]".')
    dictionary_task = gokart.TaskInstanceParameter(
        description='A task outputs gensim.corpora.Dictionary.')
    word2vec_task = gokart.TaskInstanceParameter(
        description=
        'A task outputs gensim.models.Word2Vec, gensim.models.FastText or models with the same interface.'
    )
    cluster_size = luigi.IntParameter(
        default=60,
        description='A cluster size of Gaussian mixture model in SCDV.'
    )  # type: int
    sparsity_percentage = luigi.FloatParameter(
        default=0.04,
        description='A percentage of sparsity in SCDV')  # type: float
    gaussian_mixture_kwargs = luigi.DictParameter(
        default=dict(),
        description=
        'Arguments for Gaussian mixture model except for cluster size.'
    )  # type: Dict[str, Any]
    output_file_path = luigi.Parameter(default='model/scdv.pkl')  # type: str
    text_sample_size = luigi.IntParameter(
        default=10000,
        description=
        'SCDV uses texts only to calculate the sparsity threshold, so not all text data is required.'
    )  # type: int

    def requires(self):
        return dict(text=self.tokenized_text_data_task,
                    dictionary=self.dictionary_task,
                    word2vec=self.word2vec_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        texts = self.load('text')  # type: List
        dictionary = self.load('dictionary')  # type: gensim.corpora.Dictionary
        word2vec = self.load('word2vec')  # type: gensim.models.Word2Vec

        if len(texts) > self.text_sample_size:
            texts = np.random.choice(texts, size=self.text_sample_size)

        if isinstance(texts[0], str):
            texts = redshells.train.utils.TokenIterator(texts=texts)

        model = redshells.model.SCDV(
            documents=texts,
            cluster_size=self.cluster_size,
            sparsity_percentage=self.sparsity_percentage,
            gaussian_mixture_kwargs=self.gaussian_mixture_kwargs,
            dictionary=dictionary,
            w2v=word2vec)
        self.dump(model)
Code Example #10
class _DoubleLoadSubTask(gokart.TaskOnKart):
    task_namespace = __name__
    sub1 = gokart.TaskInstanceParameter()
    sub2 = gokart.TaskInstanceParameter()

    def output(self):
        return self.make_target('sub_task.txt')

    def run(self):
        self.dump(f'task uid = {self.make_unique_id()}')
Code Example #11
class FindItemKeywordByMatching(gokart.TaskOnKart):
    """
    Find items which include keywords in its value of 'item_keyword_column_name'.
    Output pd.DataFrame with columns [item_id, keyword].
    """
    task_namespace = 'redshells.word_item_similarity'
    target_keyword_task = gokart.TaskInstanceParameter(
        description='A task outputs keywords as type `List[Any]` or `Set[Any]`.'
    )
    item_task = gokart.TaskInstanceParameter(
        description=
        'A task outputs item data as type `pd.DataFrame` which has `item_id_column_name`.'
    )
    tfidf_task = gokart.TaskInstanceParameter(
        description='A task instance of TrainTfidf.')
    keep_top_rate = luigi.FloatParameter(
        description='A rate to filter words in texts.')  # type: float
    item_id_column_name = luigi.Parameter()  # type: str
    item_keyword_column_name = luigi.Parameter()  # type: str
    output_file_path = luigi.Parameter(
        default='app/word_item_similarity/find_item_by_keyword_matching.pkl'
    )  # type: str

    def requires(self):
        return dict(keyword=self.target_keyword_task,
                    item=self.item_task,
                    tfidf=self.tfidf_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        keywords = set(self.load('keyword'))
        items = self.load_data_frame('item',
                                     required_columns={
                                         self.item_id_column_name,
                                         self.item_keyword_column_name
                                     })
        tfidf = self.load('tfidf')  # type: redshells.model.Tfidf
        tokens = items[self.item_keyword_column_name].tolist()
        top_tokens = [
            list(zip(*values))[0]
            for values in tfidf.apply(tokens=tokens,
                                      keep_top_rate=self.keep_top_rate)
        ]

        item_ids = items[self.item_id_column_name].tolist()
        match_keywords = [set(t) & keywords for t in top_tokens]
        result = pd.DataFrame(
            dict(item_id=list(
                itertools.chain.from_iterable(
                    [[item_id] * len(matched)
                     for item_id, matched in zip(item_ids, match_keywords)])),
                 keyword=list(itertools.chain.from_iterable(match_keywords))))
        self.dump(result)
Code Example #12
class CalculateWordItemSimilarity(gokart.TaskOnKart):
    """
    Calculate similarity between words and items.
    """
    task_namespace = 'redshells.word_item_similarity'
    word2embedding_task = gokart.TaskInstanceParameter()
    item2embedding_task = gokart.TaskInstanceParameter()
    similarity_model_task = gokart.TaskInstanceParameter()
    prequery_return_size = luigi.IntParameter()  # type: int
    return_size = luigi.IntParameter()  # type: int
    output_file_path = luigi.Parameter(
        default='app/word_item_similarity/calculate_word_item_similarity.pkl')  # type: str

    def requires(self):
        return dict(
            word2embedding=self.word2embedding_task,
            item2embedding=self.item2embedding_task,
            model=self.similarity_model_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        word2embedding = self.load('word2embedding')  # type: Dict[Any, np.ndarray]
        item2embedding = self.load('item2embedding')  # type: Dict[Any, np.ndarray]
        model = self.load('model')

        item_embeddings = np.array(list(item2embedding.values()))
        items = np.array(list(item2embedding.keys()))
        results = pd.concat([
            self._find_top_similarity(model, word, embedding, items, item_embeddings)
            for word, embedding in tqdm(word2embedding.items())
        ])
        self.dump(results.reset_index(drop=True))

    def _find_top_similarity(self, model, word, word_embedding: np.ndarray, items: np.ndarray,
                             item_embeddings: np.ndarray) -> pd.DataFrame:
        if word_embedding is None:
            logger.info(f'word {word} is not registered.')
            return pd.DataFrame(columns=['word', 'item', 'similarity'])
        filtered_indices = self._filter(word_embedding, item_embeddings)
        similarities = self._predict(model, word_embedding, item_embeddings[filtered_indices, :])
        top_indices = similarities.argsort()[-self.return_size:][::-1]
        return pd.DataFrame(
            dict(word=word, item=items[filtered_indices[top_indices]], similarity=similarities[top_indices]))

    def _predict(self, model, word_embedding: np.ndarray, item_embeddings: np.ndarray) -> np.ndarray:
        i = list(model.classes_).index(1)
        return model.predict_proba(item_embeddings * word_embedding)[:, i]

    def _filter(self, word_embedding: np.ndarray, item_embeddings: np.ndarray) -> np.ndarray:
        similarities = np.dot(item_embeddings, word_embedding.reshape([-1, 1])).flatten()
        top_indices = similarities.argsort()[-self.prequery_return_size:][::-1]
        return top_indices
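
`_predict` selects the `predict_proba` column for the positive label via `model.classes_`, because scikit-learn orders probability columns by the sorted class labels, not by position. A toy version of that lookup, using a stand-in classifier (the task works with any model exposing `classes_` and `predict_proba`):

import numpy as np
from sklearn.linear_model import LogisticRegression

x = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 1, 0, 1])
model = LogisticRegression().fit(x, y)
i = list(model.classes_).index(1)              # column index of the positive label
positive_proba = model.predict_proba(x)[:, i]  # P(label == 1) for each row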
Code Example #13
class TrainGraphConvolutionalMatrixCompletion(gokart.TaskOnKart):
    task_namespace = 'redshells'
    train_data_task = gokart.TaskInstanceParameter(
        description='A task outputs a pd.DataFrame with columns={`user_column_name`, `item_column_name`, `rating_column_name`}.')
    user_column_name = luigi.Parameter(default='user', description='The column name of user id.')  # type: str
    item_column_name = luigi.Parameter(default='item', description='The column name of item id.')  # type: str
    rating_column_name = luigi.Parameter(default='rating', description='The target column name to predict.')  # type: str
    user_feature_task = gokart.TaskInstanceParameter(default=NoneTask())
    item_feature_task = gokart.TaskInstanceParameter(default=NoneTask())
    model_kwargs = luigi.DictParameter(default=dict(), description='Arguments of the model.')  # type: Dict[str, Any]
    max_data_size = luigi.IntParameter(default=50000000)  # type: int
    output_file_path = luigi.Parameter(default='model/graph_convolutional_matrix_completion.zip')  # type: str
    try_count = luigi.IntParameter(default=10)  # type: int
    decay_speed = luigi.FloatParameter(default=2.0)  # type: float
    test_size = luigi.FloatParameter(default=0.2)  # type: float
    # data parameters
    min_user_click_count = luigi.IntParameter(default=5)  # type: int
    max_user_click_count = luigi.IntParameter(default=200)  # type: int

    def requires(self):
        return dict(train_data=self.train_data_task, user_features=self.user_feature_task, item_features=self.item_feature_task)

    def output(self):
        return dict(model=self.make_model_target(self.output_file_path,
                                                 save_function=GraphConvolutionalMatrixCompletion.save,
                                                 load_function=GraphConvolutionalMatrixCompletion.load),
                    report=self.make_target('model_report/report.txt'))

    def run(self):
        tf.reset_default_graph()
        df = self.load_data_frame('train_data', required_columns={self.user_column_name, self.item_column_name, self.rating_column_name})
        user_features = self.load('user_features')
        item_features = self.load('item_features')

        df.drop_duplicates(subset=[self.user_column_name, self.item_column_name], inplace=True)
        df = sklearn.utils.shuffle(df)
        df = df.head(n=int(self.max_data_size))

        user_ids = df[self.user_column_name].values
        item_ids = df[self.item_column_name].values
        ratings = df[self.rating_column_name].values

        dataset = GcmcDataset(user_ids=user_ids, item_ids=item_ids, ratings=ratings, user_features=user_features, item_features=item_features)
        graph_dataset = GcmcGraphDataset(dataset=dataset,
                                         test_size=self.test_size,
                                         min_user_click_count=self.min_user_click_count,
                                         max_user_click_count=self.max_user_click_count)
        model = GraphConvolutionalMatrixCompletion(graph_dataset=graph_dataset, **self.model_kwargs)
        self.task_log['report'] = [str(self.model_kwargs)] + model.fit(try_count=self.try_count, decay_speed=self.decay_speed)
        self.dump(self.task_log['report'], 'report')
        self.dump(model, 'model')
Code Example #14
class _PairwiseSimilarityModelTask(gokart.TaskOnKart):
    item2embedding_task = gokart.TaskInstanceParameter(
        description='A task outputs a mapping from item to embedding. The output must have type=Dict[Any, np.ndarray].')
    similarity_data_task = gokart.TaskInstanceParameter(
        description=
        'A task outputs a pd.DataFrame with columns={`item0_column_name`, `item1_column_name`, `similarity_column_name`}. '
        '`similarity_column_name` must be binary data.')
    item0_column_name = luigi.Parameter()  # type: str
    item1_column_name = luigi.Parameter()  # type: str
    similarity_column_name = luigi.Parameter()  # type: str
    model_name = luigi.Parameter(
        default='XGBClassifier',
        description='A model name which has "fit" interface, and must be registered by "register_prediction_model".'
    )  # type: str
    model_kwargs = luigi.DictParameter(
        default=dict(), description='Arguments of the model which are created with model_name.')  # type: Dict[str, Any]
    output_file_path = luigi.Parameter(default='model/pairwise_similarity_model.pkl')  # type: str

    def requires(self):
        return dict(item2embedding=self.item2embedding_task, similarity_data=self.similarity_data_task)

    def output(self):
        return self.make_target(self.output_file_path)

    def create_model(self):
        return redshells.factory.create_prediction_model(self.model_name, **self.model_kwargs)

    def create_train_data(self):
        logger.info('loading input data...')
        item2embedding = self.load('item2embedding')  # type: Dict[Any, np.ndarray]
        similarity_data = self.load_data_frame(
            'similarity_data',
            required_columns={self.item0_column_name, self.item1_column_name, self.similarity_column_name})
        logger.info(f'similarity_data size={similarity_data.shape}')
        similarity_data = sklearn.utils.shuffle(similarity_data)
        logger.info('making features...')
        similarity_data[self.similarity_column_name] = similarity_data[self.similarity_column_name].astype(int)
        similarity_data = similarity_data[similarity_data[self.item0_column_name].isin(item2embedding)]
        similarity_data = similarity_data[similarity_data[self.item1_column_name].isin(item2embedding)]
        x = np.array([
            np.multiply(item2embedding[i1], item2embedding[i2]) for i1, i2 in zip(
                similarity_data[self.item0_column_name].tolist(), similarity_data[self.item1_column_name].tolist())
        ])

        y = similarity_data[self.similarity_column_name].tolist()

        logger.info('done making train data.')
        logger.info(f'size of x={len(x)}, {len(x[0])}')
        return x, y
Code Example #15
class TrainDictionary(gokart.TaskOnKart):
    task_namespace = 'redshells'
    tokenized_text_data_task = gokart.TaskInstanceParameter(
        description=
        'The task outputs tokenized texts with type "List[List[str]]".')
    output_file_path = luigi.Parameter(
        default='model/dictionary.pkl')  # type: str
    dictionary_filter_kwargs = luigi.DictParameter(
        default=dict(no_below=5, no_above=0.5, keep_n=100000,
                     keep_tokens=None),
        description=
        'Arguments for gensim.corpora.Dictionary.filter_extremes. Please see the gensim documentation for more details.'
    )  # type: Dict[str, Any]

    def requires(self):
        return self.tokenized_text_data_task

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        texts = self.load()  # type: List
        if isinstance(texts[0], str):
            texts = redshells.train.utils.TokenIterator(texts=texts)
        dictionary = gensim.corpora.Dictionary(texts)
        if len(self.dictionary_filter_kwargs):
            dictionary.filter_extremes(**self.dictionary_filter_kwargs)
        self.dump(dictionary)
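
With the default kwargs above, `filter_extremes` drops tokens that appear in fewer than `no_below` documents or in more than `no_above` (a fraction) of all documents, then keeps at most `keep_n` of the remaining tokens. A toy run (gensim API; the corpus and counts are illustrative only):

import gensim

texts = [['apple', 'banana'], ['apple', 'banana', 'cherry'],
         ['banana', 'durian'], ['cherry', 'durian']]
dictionary = gensim.corpora.Dictionary(texts)
dictionary.filter_extremes(no_below=2, no_above=0.5, keep_n=100000)
print(dictionary.token2id)  # 'banana' (3 of 4 documents) exceeds no_above and is dropped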
Code Example #16
class TaskC(gokart.TaskOnKart):
    foo = gokart.TaskInstanceParameter()
    text = luigi.Parameter()

    def run(self):
        x = self.load('foo')
        self.dump(x + [self.text])
Code Example #17
class _FactorizationMachineTask(gokart.TaskOnKart):
    train_data_task = gokart.TaskInstanceParameter(
        description=
        'A task outputs a pd.DataFrame with columns={`target_column_name`}.')
    target_column_name = luigi.Parameter(
        default='category', description='The category column name.')  # type: str
    model_name = luigi.Parameter(
        default='XGBClassifier',
        description=
        'A model name which has "fit" interface, and must be registered by "register_prediction_model".'
    )  # type: str
    model_kwargs = luigi.DictParameter(
        default=dict(),
        description='Arguments of the model which are created with model_name.'
    )  # type: Dict[str, Any]

    def requires(self):
        return self.train_data_task

    def create_model(self):
        return redshells.factory.create_prediction_model(
            self.model_name, **self.model_kwargs)

    def create_train_data(self):
        data = self.load_data_frame(required_columns={self.target_column_name})
        data = sklearn.utils.shuffle(data)
        y = data[self.target_column_name].astype(int)

        x = data.drop(self.target_column_name, axis=1)
        return x, y
Code Example #18
class MakePairedData(gokart.TaskOnKart):
    task_namespace = 'novelty_enhanced_bpr'

    click_task = gokart.TaskInstanceParameter()
    positive_sample_weight: int = luigi.IntParameter()
    distance_threshold: float = luigi.FloatParameter()

    def requires(self):
        return self.click_task

    def run(self):
        data = self.load()
        clicks = data['clicks_train']
        item_distance = data['item_distance']
        paired_data = self._run(clicks, item_distance, self.positive_sample_weight, self.distance_threshold)
        self.dump(paired_data)

    @staticmethod
    def _run(clicks: pd.DataFrame, item_distance: pd.DataFrame, positive_sample_weight: int, distance_threshold: float) -> pd.DataFrame:
        clicked_data = clicks[clicks['click'].astype(bool)].rename(columns={'item_id': 'positive_item_id'})
        not_clicked_data = clicks[~clicks['click'].astype(bool)].rename(columns={'item_id': 'negative_item_id'})

        not_clicked_data = not_clicked_data.groupby('user_id').apply(
            lambda x: x.sample(positive_sample_weight)).reset_index(drop=True)

        paired_data = pd.merge(clicked_data[['user_id', 'positive_item_id']],
                               not_clicked_data[['user_id', 'negative_item_id']],
                               on='user_id', how='inner')

        paired_data = pd.merge(paired_data, item_distance, left_on=['positive_item_id', 'negative_item_id'],
                               right_on=['item_id_x', 'item_id_y'], how='inner')
        if distance_threshold:
            paired_data = paired_data[paired_data['distance'] < distance_threshold]

        return paired_data[['user_id', 'positive_item_id', 'negative_item_id']]
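
A toy invocation of `_run`, assuming the click-log and distance-table layouts implied above:

import pandas as pd

clicks = pd.DataFrame({'user_id': [1, 1, 1],
                       'item_id': [10, 20, 30],
                       'click':   [1, 0, 0]})
item_distance = pd.DataFrame({'item_id_x': [10, 10],
                              'item_id_y': [20, 30],
                              'distance': [0.3, 0.9]})
pairs = MakePairedData._run(clicks, item_distance,
                            positive_sample_weight=2, distance_threshold=0.5)
# One row survives: (user_id=1, positive_item_id=10, negative_item_id=20);
# the (10, 30) pair is dropped because its distance 0.9 exceeds the threshold.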
Code Example #19
class TrainFastText(gokart.TaskOnKart):
    task_namespace = 'redshells'
    tokenized_text_data_task = gokart.TaskInstanceParameter(
        description=
        'The task outputs tokenized texts with type `List[List[str]]` or `List[str]` separated with space.'
    )
    fasttext_kwargs = luigi.DictParameter(
        default=dict(),
        description=
        'Arguments for FastText except "sentences". Please see gensim.models.FastText for more details.'
    )  # type: Dict[str, Any]
    output_file_path = luigi.Parameter(
        default='model/fasttext.zip')  # type: str

    def requires(self):
        return self.tokenized_text_data_task

    def output(self):
        return self.make_model_target(
            self.output_file_path,
            save_function=gensim.models.FastText.save,
            load_function=gensim.models.FastText.load)

    def run(self):
        texts = self.load()
        assert len(texts) > 0
        shuffle(texts)

        if isinstance(texts[0], str):
            texts = TokenIterator(texts=texts)

        logger.info('training FastText...')
        model = gensim.models.FastText(sentences=texts, **self.fasttext_kwargs)
        self.dump(model)
Code Example #20
class MakeFeature(gokart.TaskOnKart):
    task_namespace = 'm5-forecasting'

    merged_data_task = gokart.TaskInstanceParameter()
    is_train: bool = luigi.BoolParameter()
    is_small: bool = luigi.BoolParameter()

    def requires(self):
        return dict(data=self.merged_data_task)

    def run(self):
        data = self.load_data_frame('data')
        output = self._run(data, self.is_train)
        self.dump(output)

    @classmethod
    def _run(cls, data, is_train: bool):
        data = cls._label_encode(data)
        data = data.dropna(subset={'sell_price'}) if is_train else data
        return data

    @staticmethod
    def _label_encode(data):
        for v in tqdm(["item_id", "dept_id", "store_id", "cat_id", "state_id"]):
            data[v] = OrdinalEncoder(dtype="int").fit_transform(data[[v]]).astype("int16") + 1
        return data
Code Example #21
class PreprocessCriteo(gokart.TaskOnKart):
    data_task = gokart.TaskInstanceParameter()

    def requires(self):
        return self.data_task

    def output(self):
        return self.make_target('criteo/train_data.pkl')

    def run(self):
        logger.info('loading...')
        df = self.load_data_frame()

        logger.info('preprocess for integer columns...')
        for c in tqdm(_get_integer_columns()):
            values = df[c].copy()
            m = values[values.notnull()].min()
            values[values.notnull()] += -m + 2
            values[values.isnull()] = 1
            df[c] = np.log(values)

        logger.info('preprocess for category columns...')
        for c in _get_categorical_columns():
            df[c] = df[c].astype('category')

        logger.info('dumping...')
        self.dump(df)
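
The integer-column loop shifts each column so that its minimum maps to 2, encodes missing values as 1, and then takes the log; the same transform on a toy Series:

import numpy as np
import pandas as pd

values = pd.Series([-3.0, 0.0, 5.0, np.nan])
m = values[values.notnull()].min()   # -3
values[values.notnull()] += -m + 2   # non-null values become 2, 5, 10
values[values.isnull()] = 1          # missing values become 1
print(np.log(values))                # log(2), log(5), log(10), log(1) = 0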
Code Example #22
class GetItemDistance(gokart.TaskOnKart):
    task_namespace = 'novelty_enhanced_bpr'
    item_embed_vector_task = gokart.TaskInstanceParameter()

    def requires(self):
        return self.item_embed_vector_task

    def run(self):
        item_embed_vector = self.load()
        item_embed_vector_x = item_embed_vector.rename(
            columns={
                'item_id': 'item_id_x',
                'item_vector': 'item_vector_x'
            })
        item_embed_vector_y = item_embed_vector.rename(
            columns={
                'item_id': 'item_id_y',
                'item_vector': 'item_vector_y'
            })
        item_distance_df = cross_join(item_embed_vector_x, item_embed_vector_y)

        def func(vector1, vector2):
            return np.linalg.norm(vector1 - vector2)

        item_distance_df['distance'] = item_distance_df.apply(
            lambda x: func(x['item_vector_x'], x['item_vector_y']), axis=1)
        self.dump(item_distance_df[['item_id_x', 'item_id_y', 'distance']])
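
`cross_join` is a project helper not shown here; assuming it produces the Cartesian product of the two DataFrames, a minimal equivalent is a merge on a constant key:

import pandas as pd

def cross_join(left: pd.DataFrame, right: pd.DataFrame) -> pd.DataFrame:
    """Cartesian product of two DataFrames (sketch of the assumed helper)."""
    return (left.assign(_key=1)
                .merge(right.assign(_key=1), on='_key')
                .drop(columns='_key'))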
Code Example #23
class TrainDoc2Vec(gokart.TaskOnKart):
    task_namespace = 'redshells'
    tokenized_text_data_task = gokart.TaskInstanceParameter(
        description=
        'The task outputs tokenized texts with type "List[List[str]]".')
    output_file_path = luigi.Parameter(
        default='model/doc2vec.zip')  # type: str
    doc2vec_kwargs = luigi.DictParameter(
        default=dict(),
        description=
        'Arguments for Doc2Vec except "documents". Please see gensim.models.Doc2Vec for more details.'
    )  # type: Dict[str, Any]

    def requires(self):
        return self.tokenized_text_data_task

    def output(self):
        return self.make_model_target(self.output_file_path,
                                      save_function=gensim.models.Doc2Vec.save,
                                      load_function=gensim.models.Doc2Vec.load)

    def run(self):
        texts = self.load()  # type: List[List[str]]
        shuffle(texts)
        documents = [
            gensim.models.doc2vec.TaggedDocument(doc, [i])
            for i, doc in enumerate(texts)
        ]
        model = gensim.models.Doc2Vec(documents=documents,
                                      **self.doc2vec_kwargs)
        model.delete_temporary_training_data(keep_doctags_vectors=True,
                                             keep_inference=True)
        self.dump(model)
Code Example #24
class ExtractColumnAsDict(gokart.TaskOnKart):
    """
    Extract column data of pd.DataFrame as dict, and keep the first value when values of `key_column_name` are duplicate.
    """
    task_namespace = 'redshells.data_frame_utils'
    data_task = gokart.TaskInstanceParameter(
        description='A task outputs pd.DataFrame.')
    key_column_name = luigi.Parameter()  # type: str
    value_column_name = luigi.Parameter()  # type: str
    output_file_path = luigi.Parameter(
        default='data/extract_column_as_dict.pkl')  # type: str

    def requires(self):
        return self.data_task

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        data = self.load_data_frame(
            required_columns={self.key_column_name, self.value_column_name})
        data.drop_duplicates(self.key_column_name, keep='first', inplace=True)
        self.dump(
            dict(
                zip(data[self.key_column_name].tolist(),
                    data[self.value_column_name].tolist())))
Code Example #25
class ConvertToOneHot(gokart.TaskOnKart):
    """
    Convert column values of `categorical_column_names` to one-hot.
    """
    task_namespace = 'redshells.data_frame_utils'
    data_task = gokart.TaskInstanceParameter(
        description='A task outputs pd.DataFrame.')
    categorical_column_names = luigi.ListParameter()  # type: List[str]
    output_file_path = luigi.Parameter(
        default='data/convert_to_one_hot.pkl')  # type: str

    def requires(self):
        return self.data_task

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        categorical_column_names = list(self.categorical_column_names)
        data = self.load_data_frame(
            required_columns=set(categorical_column_names))
        result = pd.get_dummies(data[categorical_column_names])
        result = result.merge(data.drop(categorical_column_names, axis=1),
                              left_index=True,
                              right_index=True)
        self.dump(result)
Code Example #26
class GroupByColumnAsDict(gokart.TaskOnKart):
    """
    Group by column names of pd.DataFrame and return map from `key_column_name` to a list of `value_column_name`.
    
    **This always drops na values.**
    """
    task_namespace = 'redshells.data_frame_utils'
    data_task = gokart.TaskInstanceParameter(
        description='A task outputs pd.DataFrame.')
    key_column_name = luigi.Parameter()  # type: str
    value_column_name = luigi.Parameter()  # type: str
    output_file_path = luigi.Parameter(
        default='data/group_by_column_as_dict.pkl')  # type: str

    def requires(self):
        return self.data_task

    def output(self):
        return self.make_target(self.output_file_path)

    def run(self):
        data = self.load_data_frame(
            required_columns={self.key_column_name, self.value_column_name})
        data.dropna(subset={self.key_column_name, self.value_column_name},
                    inplace=True)
        result = data.groupby(by=self.key_column_name)[
            self.value_column_name].apply(list).to_dict()
        self.dump(result)
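
The groupby-to-dict step on a toy frame:

import pandas as pd

data = pd.DataFrame({'key': ['a', 'a', 'b'], 'value': [1, 2, 3]})
print(data.groupby(by='key')['value'].apply(list).to_dict())  # {'a': [1, 2], 'b': [3]}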
Code Example #27
class _DummyTask(gokart.TaskOnKart):
    task_namespace = __name__
    sub_task = gokart.TaskInstanceParameter()

    def output(self):
        return self.make_target('test.txt')

    def run(self):
        self.dump('test')
Code Example #28
class TaskB(TaskBase):
    task = gokart.TaskInstanceParameter()

    def requires(self):
        return self.task

    def run(self):
        params = self.load()
        params.update({'trained': True})  # training model
        self.dump(params)
Code Example #29
class LoadDataOfTask(gokart.TaskOnKart):
    task_namespace = 'redshells'
    data_task = gokart.TaskInstanceParameter()
    target_name = luigi.Parameter()

    def requires(self):
        return self.data_task

    def output(self):
        return self.input()[self.target_name]
Code Example #30
class TrainMatrixFactorization(gokart.TaskOnKart):
    task_namespace = 'redshells'
    train_data_task = gokart.TaskInstanceParameter(
        description=
        'A task outputs a pd.DataFrame with columns={`user_column_name`, `item_column_name`, `service_column_name`, `target_column_name`}.'
    )
    user_column_name = luigi.Parameter(
        default='user', description='The column name of user id.')  # type: str
    item_column_name = luigi.Parameter(
        default='item', description='The column name of item id.')  # type: str
    service_column_name = luigi.Parameter(
        default='service',
        description='The column name of service id.')  # type: str
    rating_column_name = luigi.Parameter(
        default='rating',
        description='The target column name to predict.')  # type: str
    model_kwargs = luigi.DictParameter(
        default=dict(),
        description='Arguments of the model.')  # type: Dict[str, Any]
    max_data_size = luigi.IntParameter(default=50000000)
    output_file_path = luigi.Parameter(
        default='model/matrix_factorization.zip')  # type: str

    def requires(self):
        return self.train_data_task

    def output(self):
        return self.make_model_target(self.output_file_path,
                                      save_function=MatrixFactorization.save,
                                      load_function=MatrixFactorization.load)

    def run(self):
        tf.reset_default_graph()
        df = self.load_data_frame(
            required_columns={
                self.user_column_name, self.item_column_name,
                self.service_column_name, self.rating_column_name
            })

        df.drop_duplicates(
            subset=[self.user_column_name, self.item_column_name],
            inplace=True)
        df = sklearn.utils.shuffle(df)
        df = df.head(n=self.max_data_size)

        user_ids = df[self.user_column_name]
        item_ids = df[self.item_column_name]
        service_ids = df[self.service_column_name]
        ratings = df[self.rating_column_name]
        model = MatrixFactorization(**self.model_kwargs)
        model.fit(user_ids=user_ids,
                  item_ids=item_ids,
                  service_ids=service_ids,
                  ratings=ratings)
        self.dump(model)