Example #1
0
 def testSchemaFor_ThroughJythonUDF(self):
     # Checks the schema string reported for the 'queries' alias when one of
     # its columns comes from udfs.concat, registered from tests/udfs.py
     # through the jython scripting engine.
     load_stmt = (
         "data = LOAD '%s' AS (query:CHARARRAY, count:INT);" % self.INPUT_FILE
     )
     statements = (
         "Register 'tests/udfs.py' using jython as udfs;",
         load_stmt,
         "queries = FOREACH data GENERATE query, udfs.concat(query,query) AS doublequery;",
         "STORE queries INTO 'top_3_queries';",
     )
     pig = PigProxy('\n'.join(statements))
     # The schema is compared in its textual form, so spacing matters.
     self.assertEqual(
         pig.schemaFor('queries'),
         '(query: chararray,doublequery: chararray)')
Example #2
0
 def testInlinePigScript(self):
     # Builds an inline Pig script that sums 'count' per query, orders the
     # sums in descending order and keeps three rows, then hands the proxy
     # and the expected tuples to assertLastOutput.
     load_stmt = (
         "data = LOAD '%s' AS (query:CHARARRAY, count:INT);" % self.INPUT_FILE
     )
     pipeline = [
         load_stmt,
         "queries_group = GROUP data BY query PARALLEL 1;",
         "queries_sum = FOREACH queries_group GENERATE group AS query, SUM(data.count) AS count;",
         "queries_ordered = ORDER queries_sum BY count DESC PARALLEL 1;",
         "queries_limit = LIMIT queries_ordered 3;",
         "STORE queries_limit INTO 'top_3_queries';",
     ]
     expected_rows = ["(yahoo,25)", "(facebook,15)", "(twitter,7)"]
     # NOTE(review): assertLastOutput is defined outside this snippet;
     # presumably it checks the output of the script's final alias - confirm.
     self.assertLastOutput(PigProxy('\n'.join(pipeline)), expected_rows)
Example #3
0
 def testWithUdf(self):
     # Runs a script that applies the TOKENIZE UDF (declared via DEFINE) to
     # each query, orders by query descending and keeps three rows.
     load_stmt = (
         "data = LOAD '%s' AS (query:CHARARRAY, count:INT);" % self.INPUT_FILE
     )
     pipeline = [
         # Optional first statement, left disabled as in the original:
         # "REGISTER myIfNeeded.jar;",
         "DEFINE TOKENIZE TOKENIZE();",
         load_stmt,
         "queries = FOREACH data GENERATE query, TOKENIZE(query) AS query_tokens;",
         "queries_ordered = ORDER queries BY query DESC PARALLEL 1;",
         "queries_limit = LIMIT queries_ordered 3;",
         "STORE queries_limit INTO 'top_3_queries';",
     ]
     # Each expected row pairs the query with the bag of tokens produced
     # from it; 'yahoo' is expected twice.
     expected_rows = [
         "(yahoo,{(yahoo)})",
         "(yahoo,{(yahoo)})",
         "(twitter,{(twitter)})",
     ]
     self.assertLastOutput(PigProxy('\n'.join(pipeline)), expected_rows)
Example #4
0
 def testGetLastAlias(self):
     # Runs the per-query aggregation script and checks the rows read back
     # from the 'queries_limit' alias via PigProxy.get_alias(). Rows are
     # compared through their str() form, joined one per line.
     script = '\n'.join([
         "data = LOAD '%s' AS (query:CHARARRAY, count:INT);" %
         self.INPUT_FILE,
         "queries_group = GROUP data BY query PARALLEL 1;",
         "queries_sum = FOREACH queries_group GENERATE group AS query, SUM(data.count) AS count;",
         "queries_ordered = ORDER queries_sum BY count DESC PARALLEL 1;",
         "queries_limit = LIMIT queries_ordered 3;",
         "STORE queries_limit INTO 'top_3_queries';",
     ])
     proxy = PigProxy(script)
     expected = '\n'.join([
         "(yahoo,25)",
         "(facebook,15)",
         "(twitter,7)",
     ])
     actual = '\n'.join(str(row) for row in proxy.get_alias("queries_limit"))
     # Use assertEqual: the assertEquals alias is deprecated and no longer
     # exists in Python 3.12+.
     self.assertEqual(expected, actual)