class TestBaseTokenizer(unittest.TestCase):
    """Unit tests for BaseTokenizer: separator, tokenization, pre-rules, stop words."""

    def setUp(self):
        # Fresh default tokenizer for each test case.
        self.tok = BaseTokenizer()

    def test_init(self):
        # Default separator is a single space.
        self.assertEqual(self.tok.sep, ' ')

    def test_tokenize(self):
        tokens = self.tok.tokenize('a b c')
        self.assertListEqual(tokens, ['a', 'b', 'c'])

    def test_batch_tokenize(self):
        token_list = self.tok.batch_tokenize(['a b c', 'd e f'])
        self.assertListEqual(token_list, [['a', 'b', 'c'], ['d', 'e', 'f']])

    def test_default_rules(self):
        # Per the expected output, DEFAULT_PRE_RULES lowercase the text and
        # strip markup tags, punctuation, and digits before splitting.
        tok = BaseTokenizer(pre_rules=DEFAULT_PRE_RULES)
        token_list = tok.tokenize('<t>a</t> B |{ C ]?&$ d123 E')
        self.assertListEqual(token_list, ['a', 'b', 'c', 'd', 'e'])

    def test_stopwords(self):
        text = 'this is a nice house'
        tok = BaseTokenizer(stop_words='english')
        self.assertListEqual(tok.tokenize(text), ['nice', 'house'])
        tok = BaseTokenizer(stop_words=['is', 'a'])
        self.assertListEqual(tok.tokenize(text), ['this', 'nice', 'house'])
        # BUGFIX: the previous `try: ... except ValueError: assert True`
        # passed even when no exception was raised at all. assertRaises
        # fails the test unless ValueError actually occurs.
        with self.assertRaises(ValueError):
            BaseTokenizer(stop_words='vietnamese')
def test_stopwords(self):
    """Stop-word handling: built-in 'english' list, custom list, invalid name."""
    text = 'this is a nice house'
    tok = BaseTokenizer(stop_words='english')
    self.assertListEqual(tok.tokenize(text), ['nice', 'house'])
    tok = BaseTokenizer(stop_words=['is', 'a'])
    self.assertListEqual(tok.tokenize(text), ['this', 'nice', 'house'])
    # BUGFIX: the previous `try: ... except ValueError: assert True` passed
    # silently when no exception was raised; assertRaises enforces that an
    # unknown stop-word set name really raises ValueError.
    with self.assertRaises(ValueError):
        BaseTokenizer(stop_words='vietnamese')
"""
import cornac
from cornac.data import Reader
from cornac.datasets import citeulike
from cornac.eval_methods import RatioSplit
from cornac.data import TextModule
from cornac.data.text import BaseTokenizer

# Load the CiteULike article texts and the rating data restricted to items
# that have text available.
docs, item_ids = citeulike.load_text()
data = citeulike.load_data(reader=Reader(item_set=item_ids))

# build text module
# Tab-separated tokenization; vocabulary capped at 8000 terms, terms appearing
# in more than half the documents are dropped, English stop words removed.
item_text_module = TextModule(corpus=docs, ids=item_ids,
                              tokenizer=BaseTokenizer('\t'),
                              max_vocab=8000, max_doc_freq=0.5,
                              stop_words='english')

# 80/20 train/test split; items unseen in training are excluded.
ratio_split = RatioSplit(data=data, test_size=0.2, exclude_unknowns=True,
                         item_text=item_text_module, verbose=True,
                         seed=123, rating_threshold=0.5)

cdr = cornac.models.CDR(k=50, autoencoder_structure=[200], max_iter=100,
                        # NOTE(review): chunk truncated here — the remaining
                        # CDR keyword arguments are outside this view.
"""Example for HFT with MovieLens 1M dataset."""
import cornac
from cornac.data import Reader
from cornac.datasets import movielens
from cornac.eval_methods import RatioSplit
from cornac.data import TextModality
from cornac.data.text import BaseTokenizer

# Load movie plot texts and the 1M ratings restricted to movies with a plot.
plots, movie_ids = movielens.load_plot()
ml_1m = movielens.load_1m(reader=Reader(item_set=movie_ids))

# build text module
# Tab-separated tokenization with English stop words removed; vocabulary
# capped at 5000 terms, terms in more than half the documents dropped.
item_text_modality = TextModality(corpus=plots, ids=movie_ids,
                                  tokenizer=BaseTokenizer(sep='\t',
                                                          stop_words='english'),
                                  max_vocab=5000, max_doc_freq=0.5)

# 80/20 train/test split; items unseen in training are excluded.
ratio_split = RatioSplit(data=ml_1m, test_size=0.2, exclude_unknowns=True,
                         item_text=item_text_modality, verbose=True, seed=123)

hft = cornac.models.HFT(k=10, max_iter=40, grad_iter=5, l2_reg=0.001,
                        lambda_text=0.01,
                        # NOTE(review): chunk truncated here — the remaining
                        # HFT keyword arguments are outside this view.
import cornac
from cornac.data import Reader
from cornac.datasets import citeulike
from cornac.eval_methods import RatioSplit
from cornac.data import TextModality
from cornac.data.text import BaseTokenizer

# Load the CiteULike article texts and the rating data restricted to items
# that have text available.
docs, item_ids = citeulike.load_text()
data = citeulike.load_data(reader=Reader(item_set=item_ids))

# build text module
# Default separator with English stop words removed; vocabulary capped at
# 8000 terms, terms in more than half the documents dropped.
item_text_modality = TextModality(
    corpus=docs, ids=item_ids,
    tokenizer=BaseTokenizer(stop_words='english'),
    max_vocab=8000, max_doc_freq=0.5)

# 80/20 train/test split; items unseen in training are excluded.
ratio_split = RatioSplit(data=data, test_size=0.2, exclude_unknowns=True,
                         item_text=item_text_modality, verbose=True,
                         seed=123, rating_threshold=0.5)

cdr = cornac.models.CDR(k=50, autoencoder_structure=[200], max_iter=100,
                        batch_size=128,
                        # NOTE(review): chunk truncated here — the remaining
                        # CDR keyword arguments are outside this view.
from cornac.datasets import citeulike
from cornac.eval_methods import RatioSplit
from cornac.data import TextModality
from cornac.data.text import BaseTokenizer

# CDR composes an autoencoder with a ranking collaborative model to represent item texts and user-item interactions
# The necessary data can be loaded as follows
# NOTE(review): `Reader` is used below but imported outside this chunk —
# presumably `from cornac.data import Reader` earlier in the file.
docs, item_ids = citeulike.load_text()
feedback = citeulike.load_feedback(reader=Reader(item_set=item_ids))

# Instantiate a TextModality, it makes it convenient to work with text auxiliary information
# For more details, please refer to the tutorial on how to work with auxiliary data
item_text_modality = TextModality(
    corpus=docs,
    ids=item_ids,
    tokenizer=BaseTokenizer(stop_words="english"),
    max_vocab=8000,
    max_doc_freq=0.5,
)

# Define an evaluation method to split feedback into train and test sets
ratio_split = RatioSplit(
    data=feedback,
    test_size=0.2,
    exclude_unknowns=True,
    item_text=item_text_modality,
    verbose=True,
    seed=123,
    rating_threshold=0.5,
)
from cornac.eval_methods import RatioSplit
from cornac.data import TextModality
from cornac.data.text import BaseTokenizer

# HFT jointly models the user-item preferences and item texts (e.g., product reviews) with shared item factors
# Below we fit HFT to the MovieLens 1M dataset. We need both the ratings and movie plots information
# NOTE(review): `movielens` and `Reader` are used below but imported outside
# this chunk — presumably earlier in the file.
plots, movie_ids = movielens.load_plot()
ml_1m = movielens.load_feedback(variant="1M", reader=Reader(item_set=movie_ids))

# Instantiate a TextModality, it makes it convenient to work with text auxiliary information
# For more details, please refer to the tutorial on how to work with auxiliary data
item_text_modality = TextModality(
    corpus=plots,
    ids=movie_ids,
    tokenizer=BaseTokenizer(sep="\t", stop_words="english"),
    max_vocab=5000,
    max_doc_freq=0.5,
)

# Define an evaluation method to split feedback into train and test sets
ratio_split = RatioSplit(
    data=ml_1m,
    test_size=0.2,
    exclude_unknowns=True,
    item_text=item_text_modality,
    verbose=True,
    seed=123,
)

# Instantiate HFT model
@author: Tran Thanh Binh
"""
import cornac
from cornac.data import Reader
from cornac.datasets import citeulike
from cornac.eval_methods import RatioSplit
from cornac.data import TextModule
from cornac.data.text import BaseTokenizer

# Load the CiteULike article texts and the rating data restricted to items
# that have text available.
docs, item_ids = citeulike.load_text()
data = citeulike.load_data(reader=Reader(item_set=item_ids))

# build text module
# Space-separated tokenization; vocabulary capped at 8000 terms, terms in
# more than half the documents dropped, English stop words removed.
item_text_module = TextModule(corpus=docs, ids=item_ids,
                              tokenizer=BaseTokenizer(sep=' '),
                              max_vocab=8000, max_doc_freq=0.5,
                              stop_words='english')

# 80/20 train/test split; items unseen in training are excluded.
ratio_split = RatioSplit(data=data, test_size=0.2, exclude_unknowns=True,
                         item_text=item_text_module, verbose=True,
                         seed=123, rating_threshold=0.5)

# Collaborative Deep Learning model with a single 200-unit autoencoder layer.
cdl = cornac.models.CDL(k=50, autoencoder_structure=[200], max_iter=30,
                        lambda_u=0.1, lambda_v=1, lambda_w=0.1, lambda_n=1000)

# Evaluate with Recall@300 and run the experiment.
rec_300 = cornac.metrics.Recall(k=300)
exp = cornac.Experiment(eval_method=ratio_split, models=[cdl],
                        metrics=[rec_300])
exp.run()
def test_default_rules(self):
    """Default pre-rules lowercase the input and drop markup, punctuation,
    and digits, as witnessed by the expected token list."""
    tokenizer = BaseTokenizer(pre_rules=DEFAULT_PRE_RULES)
    expected = ['a', 'b', 'c', 'd', 'e']
    self.assertListEqual(
        tokenizer.tokenize('<t>a</t> B |{ C ]?&$ d123 E'), expected)
def setUp(self):
    """Create a fresh default BaseTokenizer before each test."""
    self.tok = BaseTokenizer()