Ejemplo n.º 1
0
def test_query_field_from_path():
    """Selections on path-derived fields are applied before decoding.

    A SelectionOp that references only fields parsed from the directory
    structure should be rewritten so that the file list is filtered
    without opening/decoding the files.
    """
    settings = dict(root_dir="/",
                    pattern="{department}",
                    filename_column="path",
                    decode="auto",
                    schema=TEST_SCHEMA)
    adapter = DirAdapter(employees=settings)

    query = SelectionOp(LoadOp('employees'),
                        EqOp(Var('department'), Const('sales')))
    start = query_zipper(query).leftmost_descendant()

    rewritten = adapter.evaluate(start)
    relation = adapter.get_relation('employees')
    actual = rewritten.root()

    # Expected plan, built inside-out:
    # files -> extract_path -> selection on department -> decode.
    listing = Function('files', Const(relation.root_dir))
    with_fields = Function('extract_path', listing,
                           Const(relation.root_dir + "{department}"))
    filtered = SelectionOp(with_fields,
                           EqOp(Var('department'), Const('sales')))
    expected = Function('decode', filtered, Const('auto'),
                        Const(TEST_SCHEMA), Const('path'))

    compare(actual, expected)
Ejemplo n.º 2
0
def test_query_field_in_payload():
    """Selections on payload fields stay outside the decode step.

    Querying a field that lives inside the file contents should result in
    the LoadOp being rewritten as

        SelectionOp(Function('decode',
                             Function('extract_path', Function('files'))))
    """
    settings = dict(root_dir="/",
                    pattern="{department}",
                    filename_column="path",
                    decode="auto",
                    schema=TEST_SCHEMA)
    adapter = DirAdapter(employees=settings)

    query = SelectionOp(LoadOp('employees'),
                        GeOp(Var('salary'), Const(40000)))
    start = query_zipper(query).leftmost_descendant()

    rewritten = adapter.evaluate(start)
    relation = adapter.get_relation('employees')
    actual = rewritten.root()

    # Expected plan, built inside-out:
    # files -> extract_path -> decode -> selection on salary.
    listing = Function('files', Const(relation.root_dir))
    with_fields = Function('extract_path', listing,
                           Const(relation.root_dir + "{department}"))
    decoded = Function('decode', with_fields, Const('auto'),
                       Const(TEST_SCHEMA), Const('path'))
    expected = SelectionOp(decoded, GeOp(Var('salary'), Const(40000)))

    compare(actual, expected)
Ejemplo n.º 3
0
def compare(op1, op2):
  """Walk two operation trees in lock-step and require matching nodes.

  Both trees are traversed from their leftmost descendant using
  ``postorder_next``. At every step the two current nodes must either
  satisfy ``compare_relation`` or be equal.

  Raises:
    NodeDiffException: on the first pair of nodes that do not match.
    AssertionError: if one traversal reaches its end before the other.
  """
  left = query_zipper(op1).leftmost_descendant()
  right = query_zipper(op2).leftmost_descendant()

  while True:
    left_node, right_node = left.node(), right.node()

    # A pair is acceptable if it matches as a relation or is plainly equal.
    if not compare_relation(left_node, right_node) and not (left_node == right_node):
      raise NodeDiffException(left_node, right_node)

    left_done = left.at_end()
    right_done = right.at_end()
    if left_done or right_done:
      # if either is at the end, they both should be
      assert left_done == right_done
      return

    left = left.postorder_next()
    right = right.postorder_next()
Ejemplo n.º 4
0
def test_query_field_from_path_and_contents():
    """Mixed selections are split around the decode step.

    A SelectionOp that references both fields parsed from the directory
    structure and fields from the file contents should be rewritten so
    that the file list is filtered before opening/decoding the files, and
    the decoded rows are then filtered by the content field.
    """
    # NOTE(review): the expected plan below references TEST_SCHEMA, so this
    # inline schema is presumably identical to it -- confirm.
    schema = dict(fields=[
        dict(type='STRING', name='department'),
        dict(type='INTEGER', name='id'),
        dict(type='STRING', name='full_name'),
        dict(type='INTEGER', name='salary'),
        dict(type='INTEGER', name='manager_id'),
    ])
    settings = dict(root_dir="/",
                    pattern="{department}",
                    filename_column="path",
                    decode="auto",
                    schema=schema)
    adapter = DirAdapter(employees=settings)

    condition = And(
        EqOp(Var('department'), Const('sales')),
        GeOp(Var('salary'), Const(40000)),
    )
    query = SelectionOp(LoadOp('employees'), condition)
    start = query_zipper(query).leftmost_descendant()

    rewritten = adapter.evaluate(start)
    relation = adapter.get_relation('employees')
    actual = rewritten.root()

    # Expected plan, built inside-out:
    # files -> extract_path -> selection on department -> decode
    # -> selection on salary.
    listing = Function('files', Const(relation.root_dir))
    with_fields = Function('extract_path', listing,
                           Const(relation.root_dir + "{department}"))
    by_department = SelectionOp(with_fields,
                                EqOp(Var('department'), Const('sales')))
    decoded = Function('decode', by_department, Const('auto'),
                       Const(TEST_SCHEMA), Const('path'))
    expected = SelectionOp(decoded, GeOp(Var('salary'), Const(40000)))

    compare(actual, expected)
Ejemplo n.º 5
0
def test_evaluate():
    """A bare LoadOp becomes ``extract_path`` over a ``files`` listing."""
    settings = dict(
        root_dir=path,
        pattern="{artist}/{album}/{track}.{ext}",
        filename_column="path",
    )
    adapter = DirAdapter(songs=settings)
    relation = adapter.get_relation('songs')

    start = query_zipper(LoadOp('songs')).leftmost_descendant()
    rewritten = adapter.evaluate(start)
    actual = rewritten.root()

    # Expected plan: list the files under the relation's root directory,
    # then extract fields using the pattern prefixed with the root path.
    listing = Function('files', Const(relation.root_dir))
    expected = Function('extract_path', listing,
                        Const(path + "/{artist}/{album}/{track}.{ext}"))

    compare(actual, expected)
Ejemplo n.º 6
0
def test_evaluate():
    """A bare LoadOp on an S3 relation is rewritten to an ``s3_keys`` call.

    The rewritten root is expected to be ``Function('s3_keys', ...)`` whose
    arguments are the relation's bucket name and prefix.
    """
    adapter = S3Adapter(
        logs=dict(
            bucket="aws-publicdatasets",
            anon=True,
            prefix="/common-crawl/",
            pattern="{timestamp}/{server}"
        )
    )

    relation = adapter.get_relation('logs')

    op = LoadOp('logs')
    loc = query_zipper(op).leftmost_descendant()

    res = adapter.evaluate(loc)

    # The leftover ``pdb.set_trace()`` breakpoint was removed here: it stops
    # the test at a debugger prompt, so it cannot complete unattended.
    eq_(
        res.root(),
        Function('s3_keys', Const(relation.bucket_name), Const(relation.prefix))
    )