def testSchemaFor_ThroughJythonUDF(self):
    """Verify schemaFor on an alias whose column comes from a Jython UDF.

    The script registers 'tests/udfs.py' through jython and derives the
    ``doublequery`` column with ``udfs.concat``.  The schema string that
    the proxy reports for the ``queries`` alias is then compared against
    the expected two-chararray schema.
    """
    load_statement = (
        "data = LOAD '%s' AS (query:CHARARRAY, count:INT);" % self.INPUT_FILE
    )
    statements = (
        "Register 'tests/udfs.py' using jython as udfs;",
        load_statement,
        "queries = FOREACH data GENERATE query, udfs.concat(query,query) AS doublequery;",
        "STORE queries INTO 'top_3_queries';",
    )
    pig = PigProxy('\n'.join(statements))

    reported_schema = pig.schemaFor('queries')
    expected_schema = '(query: chararray,doublequery: chararray)'
    self.assertEqual(reported_schema, expected_schema)
def testInlinePigScript(self):
    """Run an inline "top 3 queries by count" script and check its output.

    The script groups the input by query, sums the counts, orders by the
    sum descending and keeps three rows.  The proxy and the expected rows
    are handed to ``assertLastOutput`` for comparison.
    """
    load_statement = (
        "data = LOAD '%s' AS (query:CHARARRAY, count:INT);" % self.INPUT_FILE
    )
    statements = (
        load_statement,
        "queries_group = GROUP data BY query PARALLEL 1;",
        "queries_sum = FOREACH queries_group GENERATE group AS query, SUM(data.count) AS count;",
        "queries_ordered = ORDER queries_sum BY count DESC PARALLEL 1;",
        "queries_limit = LIMIT queries_ordered 3;",
        "STORE queries_limit INTO 'top_3_queries';",
    )
    pig = PigProxy('\n'.join(statements))

    expected_rows = ["(yahoo,25)", "(facebook,15)", "(twitter,7)"]
    self.assertLastOutput(pig, expected_rows)
def testWithUdf(self):
    """Run a script that applies the TOKENIZE UDF and check its output.

    Each query is paired with ``TOKENIZE(query)``; the rows are ordered by
    query descending and limited to three.  The proxy and the expected
    rows are handed to ``assertLastOutput`` for comparison.
    """
    load_statement = (
        "data = LOAD '%s' AS (query:CHARARRAY, count:INT);" % self.INPUT_FILE
    )
    statements = (
        # A jar could be registered first if the UDF needed one:
        # "REGISTER myIfNeeded.jar;",
        "DEFINE TOKENIZE TOKENIZE();",
        load_statement,
        "queries = FOREACH data GENERATE query, TOKENIZE(query) AS query_tokens;",
        "queries_ordered = ORDER queries BY query DESC PARALLEL 1;",
        "queries_limit = LIMIT queries_ordered 3;",
        "STORE queries_limit INTO 'top_3_queries';",
    )
    pig = PigProxy('\n'.join(statements))

    expected_rows = [
        "(yahoo,{(yahoo)})",
        "(yahoo,{(yahoo)})",
        "(twitter,{(twitter)})",
    ]
    self.assertLastOutput(pig, expected_rows)
def testGetLastAlias(self):
    """Check that get_alias yields the rows of the final ``queries_limit`` alias.

    The script groups the input by query, sums the counts, orders by the
    sum descending and keeps three rows.  Every item returned by
    ``proxy.get_alias("queries_limit")`` is converted with ``str`` and the
    results are joined with newlines before being compared with the
    expected text.
    """
    script = '\n'.join([
        "data = LOAD '%s' AS (query:CHARARRAY, count:INT);" % self.INPUT_FILE,
        "queries_group = GROUP data BY query PARALLEL 1;",
        "queries_sum = FOREACH queries_group GENERATE group AS query, SUM(data.count) AS count;",
        "queries_ordered = ORDER queries_sum BY count DESC PARALLEL 1;",
        "queries_limit = LIMIT queries_ordered 3;",
        "STORE queries_limit INTO 'top_3_queries';",
    ])
    proxy = PigProxy(script)

    expected = '\n'.join([
        "(yahoo,25)",
        "(facebook,15)",
        "(twitter,7)",
    ])
    actual = '\n'.join(str(row) for row in proxy.get_alias("queries_limit"))

    # assertEqual rather than the deprecated assertEquals alias, which was
    # removed from unittest in Python 3.12.
    self.assertEqual(expected, actual)