Beispiel #1
0
def extract_path_schema(relation, pattern, path_column="path"):
    """Return the relation's schema extended with the pattern's columns.

    The result is a ``Schema`` whose fields are the fields of
    ``relation.schema`` followed by one ``STRING`` field for every column
    name reported by ``pattern_regex(pattern)``. The schema is named
    ``extract_path(<pattern>)``.

    ``path_column`` is accepted but not used by this function.
    """
    _, captured_names = pattern_regex(pattern)
    existing_fields = relation.schema.fields

    # Every captured path component is exposed as a STRING column.
    captured_fields = [
        {'name': column, 'type': 'STRING'} for column in captured_names
    ]
    schema_name = "extract_path({})".format(pattern)
    return Schema(existing_fields + captured_fields, name=schema_name)
Beispiel #2
0
def test_pattern_regex():
  """The regex built from a ``{name}`` pattern captures each path part."""
  compiled, _ = pattern_regex("{artist}/{album}/{track}.{ext}")

  found = compiled.match("nirvana/nevermind/02.ogg")
  expected_groups = ("nirvana", "nevermind", "02", "ogg")

  eq_(found.groups(), expected_groups)
Beispiel #3
0
  def __init__(self, name, root_dir, **options):
    """
    Initialise the instance from a name, a root directory and options.

    ``root_dir`` is stored with a trailing ``/`` appended when missing.

    Recognised keyword options:

    pattern
      Optional path pattern. Leading slashes are removed, the pattern is
      prefixed with ``root_dir`` and handed to ``pattern_regex``; the
      returned pair is stored as ``pattern_regex`` / ``pattern_columns``.
      When absent or empty these are ``None`` and ``[]``.
    content_column, filename_column
      Stored as given; default ``None``.
    decode
      Stored as given; default ``"none"``.
    schema
      Mapping of keyword arguments used to build a ``Schema``; when not
      supplied ``_schema`` is ``None``.

    Raises ``ValueError`` if any other option is passed.
    """
    self.name = name

    # Keep the root in a canonical form that always ends with a slash.
    self.root_dir = root_dir if root_dir.endswith('/') else root_dir + '/'

    path_pattern = options.pop('pattern', None)
    if not path_pattern:
      self.pattern_regex = None
      self.pattern_columns = []
    else:
      # Drop leading slashes -- presumably because root_dir already
      # supplies the separator before the pattern is appended.
      relative_pattern = path_pattern.lstrip('/')
      compiled, column_names = pattern_regex(self.root_dir + relative_pattern)
      self.pattern_regex = compiled
      self.pattern_columns = column_names

    self.content_column = options.pop('content_column', None)
    self.filename_column = options.pop('filename_column', None)
    self.decode = options.pop('decode', "none")

    schema_given = 'schema' in options
    self._schema = Schema(**options.pop('schema')) if schema_given else None

    # Anything still left in the dict was not consumed above.
    if options:
      raise ValueError("Unrecognized options {}".format(options.keys()))
Beispiel #4
0
def extract_path_schema(relation, pattern, path_column="path"):
  """
  Build the schema: the relation's fields plus the pattern's columns.

  Takes the fields of ``relation.schema`` and appends a ``STRING`` field
  for each column name returned by ``pattern_regex(pattern)``. The
  resulting ``Schema`` is named ``extract_path(<pattern>)``.

  ``path_column`` is part of the signature but is not read here.
  """
  _, pattern_columns = pattern_regex(pattern)
  base_fields = relation.schema.fields

  extra_fields = []
  for column_name in pattern_columns:
    extra_fields.append(dict(name=column_name, type='STRING'))

  return Schema(
    base_fields + extra_fields,
    name="extract_path({})".format(pattern)
  )
Beispiel #5
0
def extract_path(ctx, files_relation, pattern, path_column="path"):
    """Yield rows whose path matches ``pattern``, extended with its groups.

    The value in ``path_column`` of every record produced by
    ``files_relation.records(ctx)`` is matched against the regular
    expression returned by ``pattern_regex(pattern)``. For each match the
    row is yielded with the captured groups appended; rows that do not
    match are skipped.

    Rows must support concatenation with a tuple (``row + m.groups()``).

    Example, with pattern ``"{artist}/{album}/{track}.{ext}"``::

        ("nirvana/nevermind/02.ogg",)
          -> ("nirvana/nevermind/02.ogg", "nirvana", "nevermind", "02", "ogg")
    """
    path_index = files_relation.schema.field_position(path_column)
    matcher, _ = pattern_regex(pattern)

    for record in files_relation.records(ctx):
        found = matcher.match(record[path_index])
        if not found:
            # Paths that do not fit the pattern are dropped.
            continue
        yield record + found.groups()
Beispiel #6
0
def extract_path(ctx, files_relation, pattern, path_column="path"):
  """
  Extract pattern groups out of file paths and URLs.

  Generator over ``files_relation.records(ctx)``. The ``path_column``
  value of each row is matched against the regular expression built by
  ``pattern_regex(pattern)``. Matching rows are yielded with the captured
  groups appended; non-matching rows are left out.

  Rows must support concatenation with a tuple (``row + m.groups()``).

  Example, with pattern ``"{artist}/{album}/{track}.{ext}"``::

      ("nirvana/nevermind/02.ogg",)
        -> ("nirvana/nevermind/02.ogg", "nirvana", "nevermind", "02", "ogg")
  """
  column_index = files_relation.schema.field_position(path_column)
  compiled, _ = pattern_regex(pattern)

  for record in files_relation.records(ctx):
    hit = compiled.match(record[column_index])
    if not hit:
      continue
    captured = hit.groups()
    yield record + captured
Beispiel #7
0
def test_pattern_regex():
    """Each ``{name}`` placeholder in the pattern becomes a capture group."""
    sample_path = "nirvana/nevermind/02.ogg"
    compiled, _ = pattern_regex("{artist}/{album}/{track}.{ext}")

    captured = compiled.match(sample_path).groups()

    eq_(captured, ("nirvana", "nevermind", "02", "ogg"))