def getMaxValues(self, rdd):
    """Count how many times each word occurs across movie titles.

    For every input line the first ``::title::`` segment made only of
    letters, digits, parentheses and spaces is extracted. The title is split
    on single spaces, and each word of at least three characters that does
    not start with a parenthesised four-digit year is counted. Alongside the
    count, the distinct titles (still wrapped in ``::``) containing the word
    are collected.

    Lines in which no such title segment is found are skipped; previously
    they raised ``AttributeError`` inside the Spark task because
    ``re.search`` returned ``None``.

    :param rdd: RDD of text lines, each expected to contain ``::title::``.
    :return: whatever ``SetMovies.setWithMaxValues`` returns for the
        ``(word, (count, titles))`` pairs -- presumably the word(s) with the
        highest count; confirm against ``SetMovies``.
    """
    setMovies = SetMovies()
    regexMoviesTitle = '::([a-z]|[A-Z]|[0-9]|[(]|[)]|[ ])*::'
    regexYear = '[(][0-9][0-9][0-9][0-9][)]'

    def extractTitle(line):
        # flatMap with an empty list drops lines without a usable title
        # instead of failing on ``None.group()``.
        found = re.search(regexMoviesTitle, line)
        return [found.group()] if found is not None else []

    def wordsOfTitle(title):
        # One (word, (1, [title])) record per occurrence, so a word repeated
        # inside a single title is counted once per repetition.
        return [
            (word, (1, [title]))
            for word in title.replace('::', '').split(" ")
            if len(word) >= 3 and not re.match(regexYear, word)
        ]

    wordRecords = rdd.flatMap(extractTitle).flatMap(wordsOfTitle)
    # Sum the counts and keep each title only once per word.
    wordTotals = wordRecords.reduceByKey(
        lambda firstValue, secondValue: (
            firstValue[0] + secondValue[0],
            list(set(firstValue[1] + secondValue[1])),
        )
    )
    # Re-key as (count, (word, titles)) for the max-value selection.
    return setMovies.setWithMaxValues(
        wordTotals, lambda value: (value[1][0], (value[0], value[1][1]))
    )
class TestCalculator(unittest.TestCase):
    """Exercises ``SetMovies.setWithMaxValues`` on small in-memory RDDs.

    Each test runs against its own SparkContext using the ``local[*]``
    master, created in ``setUp`` and stopped in ``tearDown``.
    """

    # Title fixtures shared by the word-count tests.
    _TOY_TITLE = '::Toy Story Toy (1995)::'
    _TOYA_TITLE = '::ToyA StoryA ToyA (1995)::'

    def setUp(self):
        spark_conf = SparkConf().setAppName("appTest").setMaster("local[*]")
        self.sc = SparkContext(conf=spark_conf)
        self.setMovies = SetMovies()

    def tearDown(self):
        self.sc.stop()

    def _max_values_of(self, records, reverse_tuple):
        # Distribute the records and run the method under test on them.
        distributed = self.sc.parallelize(records)
        return self.setMovies.setWithMaxValues(distributed, reverse_tuple)

    def test_when_calculate_set_word_most_repeater(self):
        # 'ToyA' and 'StoryA' tie at a count of 3, so both are expected.
        entry = [
            ('Toy', (1, [self._TOY_TITLE])),
            ('ToyA', (3, [self._TOYA_TITLE])),
            ('Story', (1, [self._TOY_TITLE])),
            ('StoryA', (3, [self._TOYA_TITLE])),
        ]
        expected = (
            ('ToyA', [self._TOYA_TITLE]),
            ('StoryA', [self._TOYA_TITLE]),
        )
        count_first = lambda value: (value[1][0], (value[0], value[1][1]))
        self.assertEqual(self._max_values_of(entry, count_first), expected)

    def test_when_calculate_set_word_most_repeater_one(self):
        # Only 'ToyA' reaches a count of 3; the expected value is that single
        # (word, titles) pair, not a tuple of pairs.
        entry = [
            ('Toy', (1, [self._TOY_TITLE])),
            ('ToyA', (3, [self._TOYA_TITLE])),
            ('Story', (1, [self._TOY_TITLE])),
            ('StoryA', (1, [self._TOYA_TITLE])),
        ]
        expected = ('ToyA', [self._TOYA_TITLE])
        count_first = lambda value: (value[1][0], (value[0], value[1][1]))
        self.assertEqual(self._max_values_of(entry, count_first), expected)

    def test_when_calculate_maximum_year(self):
        # Two years share the top count of 2.
        entry = [('(1996)', 2), ('(1998)', 2), ('(1997)', 1)]
        expected = ('(1996)', '(1998)')
        count_first = lambda value: (value[1], value[0])
        self.assertEqual(self._max_values_of(entry, count_first), expected)

    def test_when_calculate_maximum_year_with_only_one(self):
        # A single winner is expected back as a plain string.
        entry = [('(1996)', 2), ('(1998)', 1), ('(1997)', 1), ('(1999)', 1)]
        expected = '(1996)'
        count_first = lambda value: (value[1], value[0])
        self.assertEqual(self._max_values_of(entry, count_first), expected)
def setUp(self):
    """Create the SparkContext and the SetMovies instance used by a test."""
    # Application name first, then the "local[*]" master, as before.
    spark_conf = SparkConf()
    spark_conf = spark_conf.setAppName("appTest")
    spark_conf = spark_conf.setMaster("local[*]")
    self.sc = SparkContext(conf=spark_conf)
    self.setMovies = SetMovies()
def getMaxValues(self, rdd):
    """Count how many input lines mention each parenthesised year.

    The first ``(dddd)`` group (four digits in parentheses) found in a line
    is taken as that line's year, parentheses included, and the occurrences
    are summed per year.

    Lines without such a group are skipped; previously they raised
    ``AttributeError`` inside the Spark task because ``re.search`` returned
    ``None``.

    :param rdd: RDD of text lines, each expected to contain a ``(dddd)`` year.
    :return: whatever ``SetMovies.setWithMaxValues`` returns for the
        ``(year, count)`` pairs -- presumably the year(s) with the highest
        count; confirm against ``SetMovies``.
    """
    setMovies = SetMovies()
    regexYearWithParenthesis = '[(][0-9][0-9][0-9][0-9][)]'

    def yearOccurrence(line):
        # flatMap with an empty list drops lines that carry no year instead
        # of failing on ``None.group()``.
        found = re.search(regexYearWithParenthesis, line)
        return [(found.group(), 1)] if found is not None else []

    occurrencesByYear = rdd.flatMap(yearOccurrence).reduceByKey(
        lambda firstValue, secondValue: firstValue + secondValue
    )
    # Re-key as (count, year) for the max-value selection.
    return setMovies.setWithMaxValues(
        occurrencesByYear, lambda value: (value[1], value[0])
    )