def extract_path_schema(relation, pattern, path_column="path"):
    """Return the schema of ``relation`` extended with the pattern's columns.

    The result keeps every field of ``relation.schema`` and appends one
    ``STRING`` field per column name reported by ``pattern_regex(pattern)``.
    The new schema is named ``extract_path(<pattern>)``.

    ``path_column`` is accepted for signature compatibility but is not
    consulted when building the schema.
    """
    # Only the column names matter here; the compiled regex is discarded.
    _, column_names = pattern_regex(pattern)

    base_fields = relation.schema.fields
    extracted_fields = [
        {'name': column, 'type': 'STRING'} for column in column_names
    ]
    schema_name = "extract_path({})".format(pattern)
    return Schema(base_fields + extracted_fields, name=schema_name)
def test_pattern_regex():
    """A four-placeholder pattern yields one captured group per placeholder."""
    regex, _ = pattern_regex("{artist}/{album}/{track}.{ext}")

    match = regex.match("nirvana/nevermind/02.ogg")
    expected_groups = ("nirvana", "nevermind", "02", "ogg")

    eq_(match.groups(), expected_groups)
def __init__(self, name, root_dir, **options):
    """Configure the instance from a name, a root directory and options.

    ``root_dir`` is stored with a guaranteed trailing ``/``.

    Recognized keyword options (each is removed from ``options`` as it is
    read):

    * ``pattern`` -- optional path pattern. Leading slashes are stripped and
      the result is appended to ``root_dir`` before being handed to
      ``pattern_regex``; the returned regex and column names are stored as
      ``pattern_regex`` / ``pattern_columns``. When absent or falsy these
      are ``None`` and ``[]``.
    * ``content_column`` -- stored as-is, defaults to ``None``.
    * ``filename_column`` -- stored as-is, defaults to ``None``.
    * ``decode`` -- stored as-is, defaults to ``"none"``.
    * ``schema`` -- mapping expanded as keyword arguments to ``Schema``;
      ``_schema`` stays ``None`` when the option is not supplied.

    Raises:
        ValueError: if any option other than the ones above is passed.
    """
    self.name = name
    self.root_dir = root_dir if root_dir.endswith('/') else root_dir + '/'

    pattern = options.pop('pattern', None)
    if not pattern:
        self.pattern_regex = None
        self.pattern_columns = []
    else:
        # The pattern is relative to root_dir, which already ends in '/',
        # so any leading slashes on the pattern are dropped first.
        relative_pattern = pattern.lstrip('/')
        self.pattern_regex, self.pattern_columns = pattern_regex(
            self.root_dir + relative_pattern
        )

    self.content_column = options.pop('content_column', None)
    self.filename_column = options.pop('filename_column', None)
    self.decode = options.pop('decode', "none")

    self._schema = None
    if 'schema' in options:
        schema_kwargs = options.pop('schema')
        self._schema = Schema(**schema_kwargs)

    # Everything recognized has been popped; whatever remains is unknown.
    if options:
        leftover = options.keys()
        raise ValueError("Unrecognized options {}".format(leftover))
def extract_path_schema(relation, pattern, path_column="path"):
    """Return the schema of ``relation`` extended with the pattern's columns.

    The result keeps every field of ``relation.schema`` and appends one
    ``STRING`` field per column name reported by ``pattern_regex(pattern)``.
    The new schema is named ``extract_path(<pattern>)``.

    ``path_column`` is accepted for signature compatibility but is not
    consulted when building the schema.
    """
    # Only the column names matter here; the compiled regex is discarded.
    _, column_names = pattern_regex(pattern)

    base_fields = relation.schema.fields
    extracted_fields = [
        {'name': column, 'type': 'STRING'} for column in column_names
    ]
    schema_name = "extract_path({})".format(pattern)
    return Schema(base_fields + extracted_fields, name=schema_name)
def extract_path(ctx, files_relation, pattern, path_column="path"):
    """Extract pattern groups from file paths or URLs.

    Each record of ``files_relation`` has its ``path_column`` value matched
    against the regular expression built from ``pattern``. Records that
    match are yielded with the captured groups appended; records that do
    not match are skipped.

    Example (for a pattern capturing the two path segments):
        ['/some/path'] -> ['/some/path', 'some', 'path']
    """
    path_index = files_relation.schema.field_position(path_column)
    # Only the compiled regex is needed; the column names are unused here.
    matcher, _ = pattern_regex(pattern)

    for record in files_relation.records(ctx):
        matched = matcher.match(record[path_index])
        if not matched:
            continue
        # NOTE(review): the groups are concatenated directly onto the
        # record, so records are presumably tuples -- confirm with callers.
        yield record + matched.groups()
def test_pattern_regex():
    """A four-placeholder pattern yields one captured group per placeholder."""
    regex, _ = pattern_regex("{artist}/{album}/{track}.{ext}")

    match = regex.match("nirvana/nevermind/02.ogg")
    expected_groups = ("nirvana", "nevermind", "02", "ogg")

    eq_(match.groups(), expected_groups)