class FileHandler(Configurable):
    """Abstract component factory for file-related components.

    Args:
        fs (str): service name to use for filesystem.
        path (str): path of the file within the provided filesystem.
        eol (str): character used to separate lines.
        mode (str): mode passed to the filesystem when opening the file.
        encoding (str): encoding passed to the filesystem when opening the file.
    """

    path = Option(
        filesystem_path,
        required=True,
        positional=True,
        __doc__="Path to use within the provided filesystem.",
    )  # type: str
    eol = Option(
        str,
        default="\n",
        __doc__="Character to use as line separator.",
    )  # type: str
    mode = Option(
        str,
        __doc__="What mode to use for open() call.",
    )  # type: str
    encoding = Option(
        str,
        default="utf-8",
        __doc__="Encoding.",
    )  # type: str

    fs = Service(
        "fs",
        __doc__="The filesystem instance to use.",
    )  # type: str

    @ContextProcessor
    def file(self, context, *, fs):
        """Open the configured file and yield the resulting handle.

        The handle is yielded from inside a ``with`` block, so it is closed
        once the generator is resumed or closed.
        """
        with self.open(fs) as handle:
            yield handle

    def open(self, fs):
        """Return ``fs.open`` for ``self.path`` with the configured mode and encoding."""
        return fs.open(self.path, self.mode, encoding=self.encoding)
class OdooReader(OdooBase):
    """Run ``config.search_read`` on ``self.model`` and yield the records.

    Options:
        domain: search domain, passed as the first positional argument.
        fields: field names used as output fields when no output type is set.
        limit: optional maximum number of records, forwarded as ``limit=``.
    """

    domain = Option(type=list, default=[])
    fields = Option(type=list, default=[])
    limit = Option(type=int, required=False)

    def read(self, context, *args, config, **kwargs):
        """Yield search results, as tuples when ``self.fields`` is set.

        When the context has no output fields the raw results are yielded
        unchanged. Otherwise each record is projected onto the output fields
        (missing values become ``False``); the projection is yielded only if
        ``self.fields`` is truthy, else the original record is yielded.
        """
        search_args = [self.domain, *args]
        search_kwargs = dict(kwargs)
        if self.limit:
            search_kwargs['limit'] = self.limit

        if self.fields and not context.output_type:
            context.set_output_fields(self.fields)
        output_fields = context.get_output_fields()

        records = config.search_read(self.model, *search_args, **search_kwargs)
        if not output_fields:
            yield from records
            return

        for record in records:
            projected = tuple(record.get(name, False) for name in output_fields)
            yield projected if self.fields else record

    __call__ = read
class ExtractKeyedValue(Configurable):
    '''
    Given a `dict` representing some object, extract the `key` member (a dict).
    When `include_parent` is true (the default), add a 'parent_data' key to the
    extracted dictionary whose value is the original dictionary.
    Yield the extracted dictionary.
    '''

    key = Option(str, required=True)
    include_parent = Option(bool, default=True)

    def __init__(self, *v, **kw):
        '''
        Sets the __name__ property to include the relevant options so that when the
        bonobo graph is serialized as a GraphViz document, different objects can be
        visually differentiated.
        '''
        super().__init__(*v, **kw)
        self.__name__ = f'{type(self).__name__} ({self.key})'

    def __call__(self, data, *args, **kwargs):
        '''
        Yield a shallow copy of `data[self.key]`, or nothing when that value is
        missing or falsy.
        '''
        extracted = data.get(self.key)
        if not extracted:
            return

        child = dict(extracted.items())
        # Honor the `include_parent` option; it was previously declared but
        # ignored, so the parent was attached unconditionally.
        if self.include_parent:
            child['parent_data'] = data
        yield child
class GroupKeys(Configurable):
    '''
    Move groups of top-level keys of a `dict` into nested dictionaries.

    `mapping` maps each new key to a dict with a 'properties' entry (the keys to
    move) and an optional 'postprocess' entry (a callable, or an iterable of
    callables, each called as `f(group, data)` and returning the new group).
    '''

    mapping = Option(dict)
    drop_empty = Option(bool, default=True)

    def __call__(self, data):
        '''Group the configured properties in `data` (in place) and return it.'''
        consumed = set()
        for group_key, spec in self.mapping.items():
            hooks = spec.get('postprocess')
            group = {}
            for prop in spec['properties']:
                value = data.get(prop)
                consumed.add(prop)
                # With `drop_empty`, falsy values are not copied into the group.
                if not (self.drop_empty and not value):
                    group[prop] = value

            if hooks:
                if callable(hooks):
                    hooks = [hooks]
                for hook in hooks:
                    group = hook(group, data)
            data[group_key] = group

        # Source properties are removed only after every group has been built.
        for prop in consumed:
            with suppress(KeyError):
                del data[prop]
        return data
class Trace(Configurable):
    '''
    Print snapshots of selected records as they pass through this node.

    Each record gets a '__trace_id' (taken from the `trace_counter` service the
    first time it is seen) and a '__trace_seq' counter. Records whose id is in
    `ordinals` are pretty-printed; with `diff` set, a diff against the snapshot
    stored for the previous sequence number is printed when one exists.
    '''

    name = Option()
    diff = Option(default=False)
    ordinals = Option(default=(0,))
    trace_counter = Service('trace_counter')

    def __call__(self, thing: dict, trace_counter):
        '''Update the trace bookkeeping on `thing`, print if selected, and return it.'''
        id_key = '__trace_id'
        seq_key = '__trace_seq'

        if id_key in thing:
            thing[seq_key] += 1
        else:
            thing[id_key] = next(trace_counter)
            thing[seq_key] = 1

        trace_id = thing[id_key]
        seq = thing[seq_key]
        if trace_id not in self.ordinals:
            return thing

        visible = {k: v for k, v in thing.items() if not k.startswith('__trace_')}
        formatted = pprint.pformat(visible)
        if formatted[0] == '{' and formatted[-1] == '}':
            # adding newlines and a trailing comma helps with making a sensible diff
            formatted = '{\n ' + formatted[1:-1] + ',\n}\n'

        if not self.diff:
            print(formatted)
        else:
            previous = thing.get('__trace_%d_%d' % (trace_id, seq - 1))
            print('===========> %s #%d: sequence %d' % (self.name, trace_id, seq))
            if previous:
                delta = difflib.ndiff(
                    previous.splitlines(keepends=True),
                    formatted.splitlines(keepends=True),
                )
                sys.stdout.writelines(delta)
            else:
                print(formatted)

        # Keep this snapshot on the record so a later Trace node can diff against it.
        thing['__trace_%d_%d' % (trace_id, seq)] = formatted
        return thing
class GroupRepeatingKeys(Configurable):
    '''
    Collect numbered top-level keys (`<prefix>_1`, `<prefix>_2`, ...) of a `dict`
    into a list of dictionaries stored under a new key.

    `mapping` maps each new key to a dict with a 'prefixes' entry (the key
    prefixes to collect) and an optional 'postprocess' entry (a callable, or an
    iterable of callables, each called as `f(group, data)`).
    '''

    mapping = Option(dict)
    drop_empty = Option(bool, default=True)

    def __call__(self, data):
        '''Group the numbered keys of `data` (in place) and return it.'''
        for key, mapping in self.mapping.items():
            property_prefixes = mapping['prefixes']
            postprocess = mapping.get('postprocess')
            data[key] = []
            if not property_prefixes:
                # Without prefixes no lookup can raise the KeyError that ends
                # the counting loop below, so it would never terminate.
                continue

            to_delete = set()
            # The first missing `<prefix>_<i>` key raises KeyError, which ends
            # the enumeration for this mapping entry.
            with suppress(KeyError):
                for i in itertools.count(1):
                    subd = {}
                    for prefix in property_prefixes:
                        numbered = f'{prefix}_{i}'
                        subd[prefix] = data[numbered]
                        to_delete.add(numbered)

                    # Skip groups in which every value is falsy.
                    if self.drop_empty and not any(subd.values()):
                        continue

                    if postprocess and subd:
                        if callable(postprocess):
                            postprocess = [postprocess]
                        for p in postprocess:
                            subd = p(subd, data)
                            if not subd:
                                break

                    if subd:
                        data[key].append(subd)

            for k in to_delete:
                del data[k]
        return data
class MethodBasedConfigurable(Configurable):
    """Configurable with one ``Method`` and two options; calling it invokes the handler."""

    handler = Method()
    foo = Option(positional=True)
    bar = Option()

    def __call__(self, *args, **kwargs):
        """Forward all arguments to ``self.handler``; its result is discarded."""
        self.handler(*args, **kwargs)
class OpendatasoftExtract(Configurable):
    """Resolve the export URL and a version string for an Opendatasoft dataset.

    Options:
        portal: base URL of the portal.
        dataset_id: identifier of the dataset on that portal.
        format: ``rel`` value of the wanted export link.
    """

    portal = Option(str, required=True, positional=True)
    dataset_id = Option(str, required=True, positional=True)
    format = Option(str, required=True, positional=True)
    http = Service('http')

    def __call__(self, http):
        """Yield one dict with the export ``url`` and the dataset ``version``."""
        exports_url, str_date = self.get_metadata(http, self.portal, self.dataset_id)
        version = self.date2version(str_date)
        url = self.get_export_url(http, exports_url, self.format)
        yield {
            'url': url,
            'version': version,
        }

    def get_metadata(self, http, portal, dataset_id):
        """Fetch the dataset metadata.

        Returns:
            ``[exports_url, str_date]``: the href of the ``exports`` link and
            the ``data_processed`` value of the default metas.

        Raises:
            RuntimeError: the HTTP request did not succeed.
            ValueError: the body is not JSON, or an expected key or the
                ``exports`` link is missing.
        """
        url = f'{portal}/api/v2/catalog/datasets/{dataset_id}'
        result = http.get(url)
        if not result.ok:
            raise RuntimeError(f'Fails fetch metedata content from {url}')

        try:
            metadata = result.json()
        except ValueError as e:
            raise ValueError(f'Fails parse json metedata from {url}') from e

        try:
            str_date = metadata['dataset']['metas']['default']['data_processed']
            link = next((d for d in metadata['links'] if d['rel'] == 'exports'), None)
            if link is None:
                # A bare next() would raise StopIteration, which turns into a
                # RuntimeError inside the generator __call__; report a missing
                # link the same way as any other missing key.
                raise KeyError('exports')
            exports_url = link['href']
        except KeyError as e:
            raise ValueError(f'Fails use metedata from {url}') from e

        return [exports_url, str_date]

    def date2version(self, str_date):
        """Turn the first ten characters of an ISO date into ``YYYY.MM.DD``."""
        # ISO date, just cut it
        return str_date[:10].replace('-', '.')

    def get_export_url(self, http, exports_url, format):
        """Return the href of the export link whose ``rel`` equals ``format``.

        Raises:
            RuntimeError: the HTTP request did not succeed, or no link with
                the requested format exists.
            ValueError: the body is not JSON or lacks an expected key.
        """
        result = http.get(exports_url)
        if not result.ok:
            raise RuntimeError(f'Fails fetch export list from {exports_url}')

        try:
            exports = result.json()
        except ValueError as e:
            raise ValueError(f'Fails parse json export list from {exports_url}') from e

        try:
            # Default of None keeps the "not available" branch below reachable.
            link = next((d for d in exports['links'] if d['rel'] == format), None)
        except KeyError as e:
            raise ValueError(f'Fails retrive export format {format} from {exports_url}') from e

        if not link:
            raise RuntimeError(f'Export format {format} from {exports_url} not available')

        return link['href']
class DateRangeNode(Configurable):
    """Source node yielding every item of ``date_range(start_date, end_date)``."""

    start_date = Option(positional=True, required=True)
    end_date = Option(positional=True, required=True)

    def __call__(self):
        """Yield the items produced by ``date_range`` for the configured bounds."""
        yield from date_range(self.start_date, self.end_date)
class FileReader(Reader, FileHandler):
    """Component factory for file-like readers.

    Used directly, it reads a file and yields one row per line, with trailing
    "eol" characters removed. More specific readers (json, csv, ...) are
    usually built by extending it.
    """

    mode = Option(
        str,
        default='r',
        __doc__=''' What mode to use for open() call. ''',
    )  # type: str

    output_fields = Option(
        ensure_tuple,
        required=False,
        __doc__=''' Specify the field names of output lines. Mutually exclusive with "output_type". ''',
    )
    output_type = Option(
        required=False,
        __doc__=''' Specify the type of output lines. Mutually exclusive with "output_fields". ''',
    )

    @ContextProcessor
    def output(self, context, *args, **kwargs):
        """Apply the ``output_fields`` XOR ``output_type`` option to the context.

        Raises:
            UnrecoverableError: both options are set.
        """
        fields, row_type = self.output_fields, self.output_type

        if fields and row_type:
            raise UnrecoverableError('Cannot specify both output_fields and output_type option.')

        if row_type:
            context.set_output_type(row_type)
        if fields:
            context.set_output_fields(fields)

        yield

    def read(self, file, *, fs):
        """Yield each line of ``file`` with trailing ``eol`` characters stripped."""
        separator = self.eol
        for raw_line in file:
            yield raw_line.rstrip(separator)

    __call__ = read
class FilterXPathEqual(Configurable):
    '''
    Pass an element through only if some node matched by `xpath` has text equal
    to `value`.
    '''

    xpath = Option(str, required=True)
    value = Option(str)

    def __call__(self, e):
        '''Return `NOT_MODIFIED` on the first matching node, otherwise `None`.'''
        wanted = self.value
        if any(node.text == wanted for node in e.xpath(self.xpath)):
            return NOT_MODIFIED
        return None
class AddFieldNames(Configurable):
    '''
    Turn a row of positional values into a `dict` keyed by field names.

    `field_names` is either the sequence of names itself, or a dict from which
    the names are looked up with `key` (defaulting to no names).
    '''

    key = Option(required=False)
    field_names = Option()

    def __call__(self, *data):
        '''Return `dict(zip(names, values))` for the supplied row.'''
        # A single tuple/list argument is treated as the row itself.
        if len(data) == 1 and type(data[0]) in (tuple, list):
            data = data[0]

        if isinstance(self.field_names, dict):
            names = self.field_names.get(self.key, [])
        else:
            names = self.field_names
        return dict(zip(names, data))
class OpenDataSoftAPI(Configurable):
    """Page through the records of an OpenDataSoft dataset.

    Options:
        dataset: dataset identifier sent as the ``dataset`` query parameter.
        endpoint: URL template formatted with ``scheme``, ``netloc``, ``path``.
        rows: page size requested per call.
        limit: optional upper bound on the ``start`` offset.
        timezone: sent as the ``timezone`` query parameter.
        kwargs: extra query parameters, appended sorted by key.
    """

    dataset = Option(str, required=True)
    endpoint = Option(str, default='{scheme}://{netloc}{path}')
    scheme = Option(str, default='https')
    netloc = Option(str, default='data.opendatasoft.com')
    path = Option(path_str, default='/api/records/1.0/search/')
    rows = Option(int, default=500)
    limit = Option(int, default=None)
    timezone = Option(str, default='Europe/Paris')
    kwargs = Option(dict, default=dict)

    @ContextProcessor
    def compute_path(self, context):
        """Yield the base URL: the formatted endpoint plus the encoded query string."""
        fixed_params = (('dataset', self.dataset), ('timezone', self.timezone))
        params = fixed_params + tuple(sorted(self.kwargs.items()))
        location = self.endpoint.format(scheme=self.scheme, netloc=self.netloc, path=self.path)
        yield location + '?' + urlencode(params)

    @ContextProcessor
    def start(self, context, base_url):
        """Yield a ``ValueHolder`` starting at 0 that tracks the current offset."""
        yield ValueHolder(0)

    def __call__(self, base_url, start, *args, **kwargs):
        """Fetch pages until one is empty or the limit is reached.

        Each record is yielded as its ``fields`` dict plus a ``geometry`` key.
        """
        while (not self.limit) or (self.limit > start):
            if self.limit:
                # Never ask for more rows than remain before the limit.
                page_size = min(self.rows, self.limit - start)
            else:
                page_size = self.rows

            url = '{}&start={start}&rows={rows}'.format(base_url, start=start.value, rows=page_size)
            records = requests.get(url).json().get('records', [])

            if not len(records):
                break

            for row in records:
                yield {**row.get('fields', {}), 'geometry': row.get('geometry', {})}

            start.value += self.rows
class HTTPGetExtract(Configurable):
    """Source node that GETs ``url`` through the ``http`` service.

    NOTE(review): the ``content`` option is declared but not read by
    ``__call__`` — confirm whether it is still needed.
    """

    url = Option(str, required=True, positional=True)
    content = Option(str, required=False, default='content')
    http = Service('http')

    def __call__(self, http):
        """Yield the response body.

        Raises:
            RuntimeError: the response is not ``ok``; its text is logged first.
        """
        response = http.get(self.url)
        if response.ok:
            yield response.content
            return

        logger.error(response.text)
        raise RuntimeError(f'Request fails: {self.url}')
class CurriedCSVReader(Configurable):
    '''
    This reader takes CSV filenames as input, and for each parses the CSV
    content and yields one item per row: a `dict` keyed by `field_names` when
    that option is set, otherwise the row as returned by `csv.reader`.
    '''

    fs = Service(
        'fs',
        __doc__='''The filesystem instance to use.''',
    )  # type: str
    mode = Option(
        str,
        default='r',
        __doc__='''What mode to use for open() call.''',
    )  # type: str
    encoding = Option(
        str,
        default='utf-8',
        __doc__='''Encoding.''',
    )  # type: str
    limit = Option(
        int,
        __doc__='''Limit the number of rows read (to allow early pipeline termination).''',
    )
    field_names = Option()

    def __init__(self, *args, **kwargs):
        '''Initialize the options and the running row counter.'''
        super().__init__(*args, **kwargs)
        # Rows read so far across all files handled by this instance.
        self.count = 0

    def read(self, path, *, fs):
        '''
        Yield the rows of the CSV file at `path`, stopping once `limit` rows
        have been read in total by this instance.
        '''
        limit = self.limit
        count = self.count
        names = self.field_names

        if limit and count >= limit:
            # The row budget was already used up by earlier files.
            return

        sys.stderr.write('============================== %s\n' % (path, ))
        # Pass the configured mode and encoding; they were previously declared
        # as options but never handed to fs.open().
        with fs.open(path, self.mode, encoding=self.encoding, newline='') as csvfile:
            for row in csv.reader(csvfile):
                if limit and count >= limit:
                    break
                count += 1
                if names:
                    # Indexing (rather than zip) keeps the IndexError on rows
                    # that are shorter than the list of field names.
                    yield {name: row[i] for i, name in enumerate(names)}
                else:
                    yield row
        self.count = count

    __call__ = read
class Bobby(Configurable):
    """Configurable with two ``Method`` slots, two options and a context processor."""

    handler = Method()
    handler2 = Method()
    foo = Option(positional=True)
    bar = Option(required=False)

    @ContextProcessor
    def think(self, context):
        """Yield the constant string ``'different'``."""
        yield 'different'

    def __call__(self, think, *args, **kwargs):
        """Invoke both handlers in order, prefixing '1' and '2' to the arguments."""
        for method, tag in ((self.handler, '1'), (self.handler2, '2')):
            method(tag, *args, **kwargs)
class HTTPGet(Configurable):
    """Fetch the URL stored in a record and store the response body back on it.

    Options:
        url: key of the record holding the URL to request.
        content: key of the record that receives the response content.
    """

    url = Option(str, required=False, default='url')
    content = Option(str, required=False, default='content')
    http = Service('http')

    def __call__(self, properties, http):
        """Yield ``properties`` with the response content added.

        Raises:
            RuntimeError: the response is not ``ok``; its text is logged first.
        """
        target = properties[self.url]
        response = http.get(target)
        if response.ok:
            properties[self.content] = response.content
            yield properties
            return

        logger.error(response.text)
        raise RuntimeError(f'Request fails: {target}')
class OverpassExtract(Configurable):
    """Source node that POSTs ``query`` to ``overpass_url`` via the ``http`` service."""

    query = Option(str, required=True, positional=True)
    overpass_url = Option(str, required=False, positional=False, default=OVERPASS_URL)
    http = Service('http')

    def __call__(self, http):
        """Yield the response body.

        Raises:
            RuntimeError: the response is not ``ok``; its text is logged first.
        """
        response = http.post(self.overpass_url, data=self.query)
        if response.ok:
            yield response.content
            return

        logger.error(response.text)
        raise RuntimeError('Overpass query fails')
class CurriedXMLReader(Configurable):
    '''
    Similar to XMLReader, this reader takes XML filenames as input, and for each
    parses the XML content and yields lxml.etree Element objects matching the
    given XPath expression.
    '''

    xpath = Option(str, required=True)
    fs = Service(
        'fs',
        __doc__='''The filesystem instance to use.''',
    )  # type: str
    mode = Option(
        str,
        default='r',
        __doc__='''What mode to use for open() call.''',
    )  # type: str
    encoding = Option(
        str,
        default='utf-8',
        __doc__='''Encoding.''',
    )  # type: str
    limit = Option(
        int,
        __doc__='''Limit the number of rows read (to allow early pipeline termination).''',
    )
    verbose = Option(bool, default=False)

    def __init__(self, *args, **kwargs):
        '''Initialize the options and the running element counter.'''
        super().__init__(*args, **kwargs)
        # Elements yielded so far across all files handled by this instance.
        self.count = 0

    def read(self, path, *, fs):
        '''
        Yield the elements of the XML file at `path` that match `self.xpath`,
        stopping once `limit` elements have been yielded in total by this
        instance.
        '''
        limit = self.limit
        count = self.count

        if limit and count >= limit:
            # The element budget was already used up by earlier files.
            return

        if self.verbose:
            sys.stderr.write('============================== %s\n' % (path, ))

        # The context manager closes the file even if parsing raises; the
        # handle is no longer needed once the tree has been built.
        with fs.open(path, self.mode, encoding=self.encoding) as file:
            root = lxml.etree.parse(file)

        for e in root.xpath(self.xpath):
            if limit and count >= limit:
                break
            count += 1
            yield e
        self.count = count

    __call__ = read
class PreserveCSVFields(Configurable):
    '''
    Store a textual rendering of a record ("key: value" lines) under `key`.

    Keys are rendered in `order` when it is set, otherwise in sorted order;
    keys missing from the record are rendered with an empty value.
    '''

    key = Option(str, default='csv_line')
    order = Option(list, default=None)

    def __call__(self, data: dict):
        '''Add the rendered text to `data` and yield it.'''
        field_order = self.order or sorted(data.keys())
        rendered = ''.join(f"{name}: {data.get(name, '')}\n" for name in field_order)
        data[self.key] = rendered
        yield data
class MongoReader(Configurable):
    """Look up a collection on the ``mongodb.client`` service.

    NOTE(review): ``__call__`` only resolves the collection; it neither reads
    from it nor returns anything. The class looks unfinished — confirm intent.
    """

    database = Option(
        str,
        positional=True,
        default='scopus',
        __doc__='the mongodb database name',
    )
    collection = Option(
        str,
        positional=True,
        default='',
        __doc__='the mongodb collection name',
    )
    client = Service('mongodb.client')

    def __call__(self, args, *, client):
        """Resolve ``client[database][collection]``; the result is not used."""
        collection = client[self.database][self.collection]
class RecordCounter(Configurable):
    '''
    Count the records passing through this node in the shared `counts` service,
    under the key `name`, and report progress on stderr every `self.mod` records.

    NOTE(review): the `verbose` option is declared but not read here.
    '''

    counts = Service('counts')
    verbose = Option(bool, default=False)
    name = Option()

    def __init__(self, *args, **kwargs):
        '''Initialize the options and the progress reporting interval.'''
        # `self` must not be passed explicitly: super() already binds it, and
        # doing so handed the instance to the base class as an extra positional
        # argument.
        super().__init__(*args, **kwargs)
        self.mod = 100

    def __call__(self, data, counts):
        '''Increment the counter for `self.name` and return `data` unchanged.'''
        counts[self.name] += 1
        count = counts[self.name]
        if count % self.mod == 0:
            # '\r' rewrites the same terminal line on every progress update.
            print(f'\r{count} {self.name}', end='', file=sys.stderr)
        return data
class MongoWriter(Configurable):
    """Insert each incoming record into a collection of the ``mongodb.client`` service."""

    database = Option(
        str,
        positional=True,
        default='scopus',
        __doc__='the mongo database',
    )
    collection = Option(
        str,
        positional=True,
        default='',
        __doc__='the mongo collection',
    )
    client = Service('mongodb.client')

    def __call__(self, args, *, client):
        """Pass ``args`` through ``fix_keys`` and insert the result; returns nothing."""
        target = client[self.database][self.collection]
        document = fix_keys(args)
        target.insert_one(document)
class CleanDateToSpan(Configurable):
    '''
    Supplied with a key name, attempt to parse the value in `input[key]` as a
    date or date range, and create a new `TimeSpan` object for the parsed
    date(s). Store the resulting timespan in `input[key + '_span']`.
    '''

    key = Option(str, required=True)
    optional = Option(bool, default=True)

    def __init__(self, *v, **kw):
        '''
        Sets the __name__ property to include the relevant options so that when the
        bonobo graph is serialized as a GraphViz document, different objects can be
        visually differentiated.
        '''
        super().__init__(*v, **kw)
        self.__name__ = f'{type(self).__name__} ({self.key})'

    @staticmethod
    def string_to_span(value):
        '''
        Parse a string value and attempt to create a corresponding
        `model.TimeSpan` object. Returns `None` (after printing a message) if
        anything raises along the way.
        '''
        timestamp_format = "%Y-%m-%dT%H:%M:%SZ"
        try:
            begin, end = date_cleaner(value)
            span = model.TimeSpan()
            if begin is not None:
                span.begin_of_the_begin = begin.strftime(timestamp_format)
            if end is not None:
                span.end_of_the_end = end.strftime(timestamp_format)
        except Exception as e:
            print('*** Unknown date format %r: %s' % (value, e))
            return None
        return span

    def __call__(self, data, *args, **kwargs):
        '''
        Return `data` with the `<key>_span` entry added when parsing succeeds,
        otherwise `NOT_MODIFIED`. A missing key triggers a warning and a dump of
        the record to stderr unless the option `optional` is set.
        '''
        if self.key not in data:
            if not self.optional:
                warnings.warn('*** key %r is not in the data object:' % (self.key, ))
                pprint.pprint(data, stream=sys.stderr)
            return NOT_MODIFIED

        span = self.string_to_span(data[self.key])
        if span is None:
            return NOT_MODIFIED

        data['%s_span' % self.key] = span
        return data
class IOFormatEnabled(Configurable):
    """Mixin that converts node input and output according to ``ioformat``."""

    ioformat = Option(default=settings.IOFORMAT.get)

    def get_input(self, *args, **kwargs):
        """Return the input row for the configured format.

        ARG0 returns the single positional argument; KWARGS returns the
        keyword arguments as a dict.

        Raises:
            ValueError: the call shape does not match the configured format.
            NotImplementedError: the configured format is neither ARG0 nor KWARGS.
        """
        fmt = self.ioformat

        if fmt == settings.IOFORMAT_ARG0:
            if len(args) != 1 or kwargs:
                raise ValueError(
                    'Wrong input formating: IOFORMAT=ARG0 implies one arg and no kwargs, got args={!r} and kwargs={!r}.'
                    .format(args, kwargs)
                )
            (row,) = args
            return row

        if fmt == settings.IOFORMAT_KWARGS:
            if args or not kwargs:
                raise ValueError(
                    'Wrong input formating: IOFORMAT=KWARGS ioformat implies no arg, got args={!r} and kwargs={!r}.'
                    .format(args, kwargs)
                )
            return kwargs

        raise NotImplementedError('Unsupported format.')

    def get_output(self, row):
        """Return ``row`` as is (ARG0) or wrapped as ``Bag(**row)`` (KWARGS).

        Raises:
            NotImplementedError: the configured format is neither ARG0 nor KWARGS.
        """
        fmt = self.ioformat
        if fmt == settings.IOFORMAT_ARG0:
            return row
        if fmt == settings.IOFORMAT_KWARGS:
            return Bag(**row)
        raise NotImplementedError('Unsupported format.')
class AddAuctionCatalog(Configurable):
    '''Attach a catalog model object to each record under the `_catalog` key.'''

    helper = Option(required=True)
    non_auctions = Service('non_auctions')

    def __call__(self, data: dict, non_auctions):
        '''Add modeling for auction catalogs as linguistic objects'''
        catalog_number = data['catalog_number']

        # this information may either come from `data` (for the auction events branch of the pipeline)
        # or from `non_auctions` (for the catalogs branch, which lacks this information,
        # but will have access to the `non_auctions` service which was shared from the events branch)
        flag = non_auctions.get(catalog_number, data.get('non_auction_flag'))
        if flag:
            non_auctions[catalog_number] = flag
            sale_type = flag
        else:
            sale_type = 'Auction'

        catalog = self.helper.catalog_text(catalog_number, sale_type)
        catalog_data = {'uri': catalog.id}

        persistent_id = data.get('persistent_puid')
        if persistent_id:
            identifier = self.helper.gri_number_id(persistent_id)
            catalog.identified_by = identifier
            catalog_data['identifiers'] = [identifier]

        data['_catalog'] = add_crom_data(data=catalog_data, what=catalog)
        yield data
class RemoveKeys(Configurable):
    '''Delete the configured `keys` from each record, ignoring absent ones.'''

    keys = Option(set)

    def __call__(self, data: dict):
        '''Remove every configured key from `data` (in place) and return it.'''
        for unwanted in self.keys:
            # pop() with a default is a no-op for keys that are not present.
            data.pop(unwanted, None)
        return data
class AddDataDependentArchesModel(Configurable):
    '''
    Set the `_ARCHES_MODEL` key in the supplied `dict` to the appropriate arches
    model UUID and return it.

    The model is looked up in `models` by the class name of `data['_LOD_OBJECT']`.
    Records without a `_LOD_OBJECT` get the 'LinguisticObject' model; records
    whose class name has no entry get the placeholder `XXX-<class name>`.
    '''

    models = Option()

    def __call__(self, data, *args, **kwargs):
        '''Set `data['_ARCHES_MODEL']` and return `data`.'''
        if '_LOD_OBJECT' not in data:
            data['_ARCHES_MODEL'] = self.models['LinguisticObject']
            return data

        typename = type(data['_LOD_OBJECT']).__name__
        if typename in self.models:
            data['_ARCHES_MODEL'] = self.models[typename]
        else:
            # The previous code repeated the same membership test here, in a
            # branch that could never be taken; only the fallback remains.
            print(f'*** No Arches model available for {typename}')
            data['_ARCHES_MODEL'] = f'XXX-{typename}'
        return data
class PickleReader(FileReader, PickleHandler):
    """
    Reads a Python pickle object and yields its items as tuples.
    """

    mode = Option(str, default='rb')

    def read(self, file, context, *, fs):
        """Unpickle ``file`` and yield one tuple per row.

        When the context has no output type, the output fields are
        ``self.fields`` or, failing that, the first item of the data.

        Raises:
            ValueError: a row's length differs from the number of output fields.
        """
        # NOTE(review): pickle.load executes arbitrary code from the stream;
        # only use this reader on trusted files.
        data = pickle.load(file)

        is_dict = isinstance(data, dict)
        if is_dict:
            iterator = iter(data.items())
        else:
            # if the data is not iterable, then wrap the object in a list so it may be iterated
            try:
                iterator = iter(data)
            except TypeError:
                iterator = iter([data])

        if not context.output_type:
            context.set_output_fields(self.fields or next(iterator))
        expected_length = len(context.get_output_fields())

        for row in iterator:
            if len(row) != expected_length:
                raise ValueError(
                    'Received an object with {} items, expected {}.'.format(len(row), expected_length)
                )
            # NOTE(review): on the dict path `row` is a (key, value) tuple from
            # items(), which has no .values() — confirm the intended behaviour.
            yield tuple(row.values() if is_dict else row)

    __call__ = read
class Serializer(Configurable):
    '''
    Serialize `data['_LOD_OBJECT']` with the factory stored in
    `data['_CROM_FACTORY']` and store the result in `data['_OUTPUT']`.
    '''

    compact = Option(default=True)

    def __call__(self, data: dict):
        '''Add the serialized object to `data` under `_OUTPUT` and return `data`.'''
        serializer = data['_CROM_FACTORY']
        data['_OUTPUT'] = serializer.toString(data['_LOD_OBJECT'], self.compact)
        return data