def test_categories_to_reviews(self):
    """Check the pairs emitted by add_categories_to_reviews_reducer.

    The reducer is called with BIZ_ID as its key and a list holding one
    ('categories', ...) entry and one ('review', ...) entry; everything it
    yields is collected and compared with the expected list of pairs.
    """
    reducer_input = [('categories', [CATEGORY]), ('review', TEXT)]
    expected = [('all', {CATEGORY: 1}), (CATEGORY, TEXT)]

    predictor = CategoryPredictor()
    emitted = list(
        predictor.add_categories_to_reviews_reducer(BIZ_ID, reducer_input))

    self.assertEqual(emitted, expected)
def test_tokenize_reviews(self):
    """Check what tokenize_reviews_mapper yields for the 'all' key.

    The mapper is given the key 'all' with a {CATEGORY: 1} value, and its
    collected output must be the single pair ('all', {CATEGORY: 1}).
    """
    mapper_value = {CATEGORY: 1}
    expected = [('all', {CATEGORY: 1})]

    predictor = CategoryPredictor()
    emitted = list(predictor.tokenize_reviews_mapper('all', mapper_value))

    self.assertEqual(emitted, expected)
def test_review_category(self):
    """Check review_category_mapper on one review and one business record.

    Both records are rendered from the module templates and decoded from
    JSON before being passed to the mapper with a None key. The review
    must come out tagged 'review' and the business tagged 'categories',
    each keyed by BIZ_ID.
    """
    business_record = json.loads(BUSINESS_TEMPLATE % (CATEGORY, BIZ_ID))
    review_record = json.loads(REVIEW_TEMPLATE % (TEXT, BIZ_ID))

    predictor = CategoryPredictor()
    from_review = list(
        predictor.review_category_mapper(None, review_record))
    from_business = list(
        predictor.review_category_mapper(None, business_record))

    self.assertEqual(from_review, [(BIZ_ID, ('review', TEXT))])
    self.assertEqual(from_business, [(BIZ_ID, ('categories', [CATEGORY]))])
def test_categories_to_reviews(self):
    """Verify the output of add_categories_to_reviews_reducer.

    Passes the reducer BIZ_ID plus a 'categories' entry and a 'review'
    entry, then asserts on the full list of pairs it yields.
    """
    tagged_values = [('categories', [CATEGORY]), ('review', TEXT)]

    reducer = CategoryPredictor().add_categories_to_reviews_reducer
    produced = list(reducer(BIZ_ID, tagged_values))

    wanted = [('all', {CATEGORY: 1}), (CATEGORY, TEXT)]
    self.assertEqual(produced, wanted)
def test_review_category(self):
    """Verify how review_category_mapper tags reviews and businesses.

    A review and a business are built from the module templates, parsed
    as JSON and mapped with a None key; each result list is compared with
    the single (BIZ_ID, (tag, payload)) pair expected for it.
    """
    business_json = BUSINESS_TEMPLATE % (CATEGORY, BIZ_ID)
    review_json = REVIEW_TEMPLATE % (TEXT, BIZ_ID)

    mapper = CategoryPredictor().review_category_mapper
    review_pairs = list(mapper(None, json.loads(review_json)))
    business_pairs = list(mapper(None, json.loads(business_json)))

    self.assertEqual(review_pairs, [(BIZ_ID, ('review', TEXT))])
    self.assertEqual(business_pairs, [(BIZ_ID, ('categories', [CATEGORY]))])
def test_smoke(self):
    """Do a complete run of the job on mock data and check its output.

    One business record and one review record are rendered from the
    module templates, concatenated, and supplied to the job as stdin.
    The job is run through its runner and the value of every output line
    is collected; the first value must equal {CATEGORY: 1}.
    """
    business = BUSINESS_TEMPLATE % (CATEGORY, BIZ_ID)
    review = REVIEW_TEMPLATE % (LONG_TEXT, BIZ_ID)
    static_stdin = StringIO(business + review)

    job = CategoryPredictor(['-r', 'inline', '--no-conf', '-'])
    job.sandbox(stdin=static_stdin)

    with job.make_runner() as runner:
        runner.run()
        # Output must be read while the runner is still open. Only the
        # values are checked, so the parsed keys are discarded.
        parsed = (job.parse_output_line(line)
                  for line in runner.stream_output())
        results = [value for _, value in parsed]

    # Results should be the probability of that category being chosen.
    # This assertion must stay on its own lines, outside the comment:
    # when folded into the comment above it silently never runs.
    result = {CATEGORY: 1}
    self.assertEqual(results[0], result)
filtered_counts['UNK'] = 0.01 # emit the result yield category, filtered_counts def steps(self): return [ self.mr(mapper=self.review_category_mapper, reducer=self.add_categories_to_reviews_reducer), self.mr(mapper=self.tokenize_reviews_mapper, reducer=self.sum_counts) ] if __name__ == "__main__": CategoryPredictor().run() ########NEW FILE######## __FILENAME__ = predict # Copyright 2011 Yelp and Contributors # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # # http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and