class AddAuctionEvent(Configurable):
    '''Model an auction event from properties of the supplied `data` dict.'''
    helper = Option(required=True)
    event_properties = Service('event_properties')
    date_modifiers = Service('date_modifiers')

    def __call__(self, data: dict, event_properties, date_modifiers):
        '''Add modeling for an auction event based on properties of the supplied `data` dict.

        Records the event's timespan and date label in the shared
        `event_properties` service (keyed by catalog number), then attaches
        the sale event model object to `data`.
        '''
        record = get_crom_object(data['_catalog'])
        cno = data['catalog_number']
        sale_type = data.get('non_auction_flag', 'Auction')
        ts, begin, end = timespan_from_bound_components(
            data, date_modifiers, 'sale_begin_', 'begin', 'sale_end_', 'eoe')
        event_properties['auction_dates'][cno] = (ts, begin, end)
        # Use the timespan label directly instead of writing it to the shared
        # dict and immediately reading it back (as the original did).
        event_date_label = ts._label
        event_properties['auction_date_label'][cno] = event_date_label
        auction, uid, uri = self.helper.sale_event_for_catalog_number(
            cno, sale_type, date_label=event_date_label)
        auction.referred_to_by = record
        auction.identified_by = model.Name(ident='', content=auction._label)
        data['uid'] = uid
        data['uri'] = uri
        add_crom_data(data=data, what=auction)
        # FIX: removed an unused duplicate `catalog = get_crom_object(data['_catalog'])`
        # lookup that appeared here in the original.
        data['_record'] = data['_catalog']
        return data
class AddPhysicalCatalogEntry(Configurable):
    '''Model the STAR database entry describing a physical auction catalog.'''
    helper = Option(required=True)
    non_auctions = Service('non_auctions')

    def __call__(self, data: dict, non_auctions):
        '''Add modeling for the entry describing a physical auction catalog in the PSCP dataset.'''
        cno = data['catalog_number']
        owner_code = data['owner_code']
        copy_number = data['copy_number']
        star_no = data['star_record_no']

        # The shared `non_auctions` service (populated by the events branch)
        # takes precedence over this row's own flag.
        sale_type = non_auctions.get(cno, data.get('non_auction_flag', 'Auction'))

        uri_parts = [value for value in [cno, owner_code, copy_number] if value]
        record_uri = self.helper.make_proj_uri('ENTRY', 'PHYS-CAT', *uri_parts)

        catalog_label = self.helper.physical_catalog_label(cno, sale_type, owner_code, copy_number)
        row_name = f'STAR Entry for Physical {catalog_label}'
        row = vocab.EntryTextForm(ident=record_uri, content=data['star_csv_data'], label=row_name)
        row.part_of = self.helper.static_instances.get_instance('LinguisticObject', 'db-sales_catalogs')

        creation = model.Creation(ident='')
        creation.carried_out_by = self.helper.static_instances.get_instance('Group', 'gpi')
        row.created_by = creation

        row.identified_by = self.helper.gpi_number_id(star_no, vocab.StarNumber)
        row.identified_by = vocab.PrimaryName(ident='', content=row_name)

        data['_catalog_record'] = add_crom_data({'uri': record_uri}, row)
        yield data
class AddPhysicalCatalogObjects(Configurable):
    '''Model physical copies of an auction catalog.'''
    helper = Option(required=True)
    non_auctions = Service('non_auctions')

    def __call__(self, data: dict, non_auctions):
        '''Add modeling for physical copies of an auction catalog'''
        catalog_text = get_crom_object(data['_catalog'])
        entry_record = get_crom_object(data['_catalog_record'])
        cno = data['catalog_number']
        sale_type = non_auctions.get(cno, 'Auction')
        phys_catalog = self.helper.physical_catalog(
            cno, sale_type, data['owner_code'], data['copy_number'], add_name=True)
        phys_catalog.referred_to_by = entry_record
        data['uri'] = phys_catalog.id
        annotation = data.get('annotation_info')
        if annotation:
            # Free-text annotation about this particular copy, if any.
            phys_catalog.referred_to_by = vocab.Note(ident='', content=annotation)
        phys_catalog.carries = catalog_text
        add_crom_data(data=data, what=phys_catalog)
        return data
class SubmittingCentreExtractor(Configurable):
    """Get unique submitting centre names from the full database."""

    # Injected S3 client service.
    s3client = Service("s3client")

    @ContextProcessor
    def acc(self, context, **kwargs):
        # Accumulate centre names over the pipeline run; once the pipeline is
        # drained, print them sorted (and optionally mirror them to a file).
        centres = yield ValueHolder(set())
        for centre in sorted(centres.get()):
            print(centre)
        # NOTE(review): NO_OUTPUT_FILE is a module-level flag defined elsewhere
        # in this file — confirm its default before relying on the file output.
        if not NO_OUTPUT_FILE:
            with open("/tmp/message.txt", "w") as f:
                for centre in sorted(centres.get()):
                    print(centre, file=f)

    @use_raw_input
    def __call__(self, centres, *args, **kwargs):
        """The accumulator function run by the pipeline.

        Parameters
        ----------
        centres : ValueHolder(set())
            Accumulator for the centre names
        *args : tuple
            A ``(task, key, _)`` triple: a task name (only "process" is
            acted on here) and the key of a clinical data file from which
            to extract the submitting centre.
        **kwargs : dict
            Keyword arguments; must include the ``s3client`` service.
        """
        task, key, _ = args
        s3client = kwargs["s3client"]
        # Only JSON files from "process" tasks carry a submitting centre.
        if task == "process" and Path(key).suffix.lower() == ".json":
            centre = helpers.get_submitting_centre_from_key(s3client, key)
            if centre is not None:
                centres.add(centre)
class AddAuctionCatalog(Configurable):
    '''Model auction catalogs as linguistic objects.'''
    helper = Option(required=True)
    non_auctions = Service('non_auctions')

    def __call__(self, data: dict, non_auctions):
        '''Add modeling for auction catalogs as linguistic objects'''
        cno = data['catalog_number']
        # this information may either come from `data` (for the auction events branch of the pipeline)
        # or from `non_auctions` (for the catalogs branch, which lacks this information,
        # but will have access to the `non_auctions` service which was shared from the events branch)
        sale_type = non_auctions.get(cno, data.get('non_auction_flag'))
        if sale_type:
            non_auctions[cno] = sale_type
        catalog = self.helper.catalog_text(cno, sale_type or 'Auction')
        cdata = {'uri': catalog.id}
        puid = data.get('persistent_puid')
        if puid:
            puid_id = self.helper.gri_number_id(puid)
            catalog.identified_by = puid_id
            cdata['identifiers'] = [puid_id]
        data['_catalog'] = add_crom_data(data=cdata, what=catalog)
        yield data
class AddAuctionHouses(Configurable):
    '''Model the auction house organization(s) associated with an auction event.'''
    helper = Option(required=True)
    event_properties = Service('event_properties')

    def __call__(self, data: dict, event_properties):
        '''
        Add modeling data for the auction house organization(s) associated with
        an auction event.

        Returns a copy of `data` with an `_organizers` list, and appends the
        houses' dicts to the shared `event_properties['auction_houses']` entry
        for this catalog number.
        '''
        auction = get_crom_object(data)
        # FIX: the original fetched `event_record` twice; once is enough.
        event_record = get_crom_object(data['_record'])
        catalog = data['_catalog']['_LOD_OBJECT']
        d = data.copy()
        houses = data.get('auction_house', [])
        cno = data['catalog_number']
        house_dicts = []
        d['_organizers'] = []
        for i, h in enumerate(houses):
            house_dict = self.helper.copy_source_information(h, data)
            # Keep an unmodified copy for the shared service; only its URI is
            # updated after the helper fills in the house data.
            house_dict_copy = house_dict.copy()
            h['_catalog'] = catalog
            self.helper.add_auction_house_data(house_dict, sequence=i, event_record=event_record)
            house_dict_copy['uri'] = house_dict['uri']
            house_dicts.append(house_dict_copy)
            house = get_crom_object(h)
            act = vocab.AuctionHouseActivity(ident='', label=f'Activity of {house._label}')
            act.carried_out_by = house
            auction.part = act
            d['_organizers'].append(h)
        event_properties['auction_houses'][cno] += house_dicts
        return d
class AddFieldNamesService(Configurable):
    '''Zip positional row data with the header names configured for `key`.'''
    key = Option(
        required=False
    )  # This is passed into __init__ as a kwarg but not into __call__
    field_names = Service(
        'header_names'
    )  # This is passed into __call__ as a kwarg not at __init__
    # ... go figure

    def __init__(self, *args, **kwargs):
        '''
        Sets the __name__ property to include the relevant options
        so that when the bonobo graph is serialized as a GraphViz document,
        different objects can be visually differentiated.
        '''
        # BUG FIX: the original called super().__init__(self, *args, **kwargs),
        # passing `self` twice (once implicitly, once explicitly).
        super().__init__(*args, **kwargs)
        self.__name__ = f'{type(self).__name__} ({self.key})'

    def __call__(self, *data, field_names=None):
        # Avoid a mutable default argument; behavior is unchanged since the
        # original default {} was never mutated.
        if field_names is None:
            field_names = {}
        # A single tuple/list argument is treated as the row itself.
        if len(data) == 1 and type(data[0]) in (tuple, list):
            data = data[0]
        names = field_names.get(self.key, []) if isinstance(field_names, dict) else field_names
        return dict(zip(names, data))
class FileHandler(Configurable):
    """Abstract component factory for file-related components.

    Args:
        fs (str): service name to use for filesystem.
        path (str): which path to use within the provided filesystem.
        eol (str): which character to use to separate lines.
        mode (str): which mode to use when opening the file.
        encoding (str): which encoding to use when opening the file.
    """

    path = Option(
        filesystem_path, required=True, positional=True,
        __doc__="Path to use within the provided filesystem.")  # type: str
    eol = Option(str, default="\n", __doc__="Character to use as line separator.")  # type: str
    mode = Option(str, __doc__="What mode to use for open() call.")  # type: str
    encoding = Option(str, default="utf-8", __doc__="Encoding.")  # type: str
    fs = Service("fs", __doc__="The filesystem instance to use.")  # type: str

    @ContextProcessor
    def file(self, context, *, fs):
        # Open the file once per execution context and hand the handle to
        # each call; the `with` guarantees it is closed when the context ends.
        handle = self.open(fs)
        with handle as file:
            yield file

    def open(self, fs):
        opened = fs.open(self.path, self.mode, encoding=self.encoding)
        return opened
class MessageCountWriter(Configurable):
    """Persist a per-channel message count for the configured day."""
    date = Option(required=True, positional=True)
    message_count_database = Service("message_count")

    def __call__(self, _, channel, count, *, message_count_database):
        # Storage is fully delegated to the message-count database service.
        message_count_database.set_day_channel(self.date, channel, count)
class ChannelsSource(Configurable):
    """Emit every channel that has message counts recorded for the configured day."""
    date = Option(positional=True, required=True)
    message_count_database = Service("message_count")

    def __call__(self, message_count_database):
        channels = message_count_database.get_channels_for_day(self.date)
        for channel in channels:
            yield channel
class Trace(Configurable):
    '''
    Debugging node: pretty-print (and optionally diff against the previous
    snapshot) the contents of the dicts flowing through the pipeline, for
    the trace ids listed in `ordinals`.
    '''
    name = Option()
    diff = Option(default=False)
    ordinals = Option(default=(0,))
    trace_counter = Service('trace_counter')

    def __call__(self, thing: dict, trace_counter):
        key = '__trace_id'
        skey = '__trace_seq'
        # Idiom fix: `key not in thing` (was `not key in thing`).
        if key not in thing:
            thing[key] = next(trace_counter)
            thing[skey] = 1
        else:
            thing[skey] += 1
        # Renamed from `id` to avoid shadowing the builtin.
        trace_id = thing[key]
        seq = thing[skey]
        if trace_id in self.ordinals:
            formatted = pprint.pformat({k: v for k, v in thing.items() if not k.startswith('__trace_')})
            if formatted[0] == '{' and formatted[-1] == '}':
                # adding newlines and a trailing comma helps with making a sensible diff
                formatted = '{\n ' + formatted[1:-1] + ',\n}\n'
            if self.diff:
                previous = thing.get('__trace_%d_%d' % (trace_id, seq - 1))
                print('===========> %s #%d: sequence %d' % (self.name, trace_id, seq))
                if previous:
                    lines = difflib.ndiff(
                        previous.splitlines(keepends=True),
                        formatted.splitlines(keepends=True))
                    sys.stdout.writelines(lines)
                else:
                    print(formatted)
            else:
                print(formatted)
            # Stash this snapshot so the next sequence step can diff against it.
            thing['__trace_%d_%d' % (trace_id, seq)] = formatted
        return thing
class MyServiceDependantConfigurable(Configurable):
    """A configurable whose work is delegated entirely to an injected printer service."""
    printer = Service(
        PrinterInterface,
    )

    def __call__(self, printer: PrinterInterface, *args):
        result = printer.print(*args)
        return result
class JsonRawThreadsWriter(_FileLdJsonWriter):
    """Line-delimited JSON writer targeting the raw-threads file for one day."""
    date = Option(required=True, positional=True)
    database = Service("database")

    def open(self, database):
        # The database service owns file placement; we only ask for write mode.
        fp = database.open_raw_threads_file(self.date, mode="w")
        return fp
class OpendatasoftExtract(Configurable):
    '''
    Extract a dataset export URL, and a version string derived from the
    dataset's processing date, from an Opendatasoft portal catalog API.
    '''
    portal = Option(str, required=True, positional=True)
    dataset_id = Option(str, required=True, positional=True)
    format = Option(str, required=True, positional=True)
    http = Service('http')

    def __call__(self, http):
        exports_url, str_date = self.get_metadata(http, self.portal, self.dataset_id)
        version = self.date2version(str_date)
        url = self.get_export_url(http, exports_url, self.format)
        yield {
            'url': url,
            'version': version,
        }

    def get_metadata(self, http, portal, dataset_id):
        '''Return (exports_url, data_processed_date) for the dataset.

        Raises RuntimeError on HTTP failure and ValueError on malformed metadata.
        '''
        url = f'{portal}/api/v2/catalog/datasets/{dataset_id}'
        result = http.get(url)
        if not result.ok:
            raise RuntimeError(f'Failed to fetch metadata content from {url}')
        try:
            metadata = result.json()
        except ValueError as e:
            raise ValueError(f'Failed to parse json metadata from {url}') from e
        try:
            str_date = metadata['dataset']['metas']['default']['data_processed']
            # BUG FIX: next() raises StopIteration (not KeyError) when no link
            # matches, which the original never caught; use a default and check.
            link = next(filter(lambda d: d['rel'] == 'exports', metadata['links']), None)
            if link is None:
                raise ValueError(f'Failed to use metadata from {url}')
            exports_url = link['href']
        except KeyError as e:
            raise ValueError(f'Failed to use metadata from {url}') from e
        return exports_url, str_date

    def date2version(self, str_date):
        '''Convert an ISO datetime string to a dotted date version (YYYY.MM.DD).'''
        # ISO date, just cut it
        return str_date[:10].replace('-', '.')

    def get_export_url(self, http, exports_url, format):
        '''Return the export href matching `format`.

        Raises RuntimeError on HTTP failure or unavailable format, ValueError
        on malformed export metadata.
        '''
        result = http.get(exports_url)
        if not result.ok:
            raise RuntimeError(f'Failed to fetch export list from {exports_url}')
        try:
            exports = result.json()
        except ValueError as e:
            raise ValueError(f'Failed to parse json export list from {exports_url}') from e
        try:
            # BUG FIX: supply a default so a missing format reaches the
            # RuntimeError below instead of raising uncaught StopIteration
            # (which also made the original `if not link` check unreachable).
            link = next(filter(lambda d: d['rel'] == format, exports['links']), None)
        except KeyError as e:
            raise ValueError(f'Failed to retrieve export format {format} from {exports_url}') from e
        if not link:
            raise RuntimeError(f'Export format {format} from {exports_url} not available')
        return link['href']
class AddPhysicalCatalogOwners(Configurable):
    '''Model the ownership of a physical copy of an auction catalog.'''
    helper = Option(required=True)
    location_codes = Service('location_codes')
    unique_catalogs = Service('unique_catalogs')

    def __call__(self, data: dict, location_codes, unique_catalogs):
        '''Add information about the ownership of a physical copy of an auction catalog'''
        # Add the URI of this physical catalog to `unique_catalogs`. This data will be used
        # later to figure out which catalogs can be uniquely identified by a catalog number
        # and owner code (e.g. for owners who do not have multiple copies of a catalog).
        cno = data['catalog_number']
        owner_code = data['owner_code']
        copy_number = data.get('copy_number', '')
        owner_name = None
        entry_record = get_crom_object(data.get('_catalog'))
        with suppress(KeyError):
            owner_name = location_codes[owner_code]
        owner_uri = self.helper.make_proj_uri('ORGANIZATION', 'LOCATION-CODE', owner_code)
        # NOTE(review): if owner_code is absent from location_codes, owner_name
        # stays None and becomes the Name content below — confirm that's intended.
        data['_owner'] = {
            'label': owner_name,
            'uri': owner_uri,
            'referred_to_by': [entry_record],
            'identifiers': [
                model.Name(ident='', content=owner_name),
                model.Identifier(ident='', content=str(owner_code))
            ],
        }
        owner = model.Group(ident=owner_uri)
        owner.referred_to_by = entry_record
        if not owner_code:
            warnings.warn(f'Setting empty identifier on {owner.id}')
        # BUG FIX: add_crom_data was called twice with identical arguments in
        # the original; a single call suffices.
        add_crom_data(data=data['_owner'], what=owner)
        catalog = get_crom_object(data)
        catalog.current_owner = owner
        # A None copy number produces a key that stands in for all the copies
        # belonging to a single owner (renamed from the original's second
        # `owner_uri`, which shadowed the organization URI above).
        all_copies_uri = self.helper.physical_catalog_uri(cno, owner_code, None)
        copy_uri = self.helper.physical_catalog_uri(cno, owner_code, copy_number)
        unique_catalogs[all_copies_uri].add(copy_uri)
        return data
class JsonRawThreadsReader(Configurable):
    """Yield each JSON record from the raw-threads file for a given date."""
    database = Service("database")

    def __call__(self, _, date, *, database):
        with database.open_raw_threads_file(date) as fp:
            for raw_line in fp:
                # Skip blank lines; everything else is one JSON document.
                if not raw_line.strip():
                    continue
                yield json.loads(raw_line)
class MakeAATAAbstract(Configurable):
    helper = Option(required=True)
    language_code_map = Service('language_code_map')

    def __call__(self, data, language_code_map):
        '''
        Given a `dict` representing an "article," extract the abstract records,
        yielding a new `dict` for each such record.

        The resulting abstract `dict` will contain these keys:

        * `_LOD_OBJECT`: A `model.LinguisticObject` object representing the abstract
        * `content`: The text content of the abstract
        * `language`: A model object representing the declared language of the
          abstract (only when the code resolves to a language object)
        * `_authors`: The authorship information from the input article `dict`
          (when present)
        * `uri`: The URI minted for this abstract
        * `parent_data`: The `dict` representing the corresponding article

        plus any keys (other than `language`) copied through from the input
        abstract record (e.g. `author_abstract_flag`, identifiers).
        '''
        lod_object = get_crom_object(data)
        for a in data.get('_abstracts', []):
            # Copy everything except `language`, which is replaced below with
            # a model object resolved from the language code.
            abstract_dict = {
                k: v for k, v in a.items() if k not in ('language', )
            }
            abstract_uri = self.helper.make_proj_uri(
                'Abstract', data['_aata_record_id'], a['_aata_record_abstract_seq'])
            content = a.get('content')
            abstract = vocab.Abstract(ident=abstract_uri, content=content)
            abstract.refers_to = lod_object
            langcode = a.get('language')
            if langcode is not None:
                language = self.helper.language_object_from_code(
                    langcode, language_code_map)
                if language is not None:
                    abstract.language = language
                    abstract_dict['language'] = language
            if '_authors' in data:
                abstract_dict['_authors'] = data['_authors']
            # The URI above is based on the AATA record id and the sequence
            # number of the abstract within that record.
            abstract_dict.update({
                'parent_data': data,
                'uri': abstract_uri,
            })
            add_crom_data(data=abstract_dict, what=abstract)
            yield abstract_dict
class RecentlyActiveChannelSource(Configurable):
    """Yield the channels with recorded messages in the 3 working days up to `date`."""
    message_count_database = Service("message_count")
    date = Option(required=True, positional=True)

    def __call__(self, message_count_database):
        seen = set()
        window_start = nworking_days_before(self.date, 3)
        for day in date_range(window_start, self.date):
            seen.update(message_count_database.get_channels_for_day(day))
        yield from seen
class HTTPGetExtract(Configurable):
    """Fetch the configured URL once and emit the raw response body."""
    url = Option(str, required=True, positional=True)
    content = Option(str, required=False, default='content')
    http = Service('http')

    def __call__(self, http):
        response = http.get(self.url)
        if response.ok:
            yield response.content
        else:
            # Log the server's error body before failing the pipeline node.
            logger.error(response.text)
            raise RuntimeError(f'Request fails: {self.url}')
class JsonEnrichedMessagesReader(Configurable):
    """Stream parsed JSON records from the enriched-messages file."""
    database = Service("database")

    @ContextProcessor
    def fp(self, _, *, database):
        # Open once per execution context; closed automatically afterwards.
        with database.open_enriched_messages_file() as fp:
            yield fp

    def __call__(self, fp, _, *, database):
        yield from map(json.loads, fp)
class CurriedCSVReader(Configurable):
    '''
    This reader takes CSV filenames as input, and for each parses the CSV
    content and yields a tuple of strings for each row (or a dict when
    `field_names` is configured).
    '''
    fs = Service(
        'fs',
        __doc__='''The filesystem instance to use.''',
    )  # type: str
    mode = Option(
        str,
        default='r',
        __doc__='''What mode to use for open() call.''',
    )  # type: str
    encoding = Option(
        str,
        default='utf-8',
        __doc__='''Encoding.''',
    )  # type: str
    limit = Option(
        int,
        __doc__='''Limit the number of rows read (to allow early pipeline termination).''',
    )
    field_names = Option()

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.count = 0

    def read(self, path, *, fs):
        # Simplified guard (was `not (limit) or (limit and count < limit)`).
        if self.limit and self.count >= self.limit:
            return
        sys.stderr.write('============================== %s\n' % (path, ))
        names = self.field_names
        # `with` guarantees the file closes even if the consumer abandons
        # this generator; the original left the close to the fs object.
        with fs.open(path, newline='') as csvfile:
            for row in csv.reader(csvfile):
                if self.limit and self.count >= self.limit:
                    break
                # Incremental update (the original wrote self.count back only
                # after the loop, losing the count if the generator was
                # abandoned mid-file).
                self.count += 1
                if names:
                    # Idiom fix: dict(zip(...)) instead of an index loop. Note
                    # zip also tolerates short rows instead of raising IndexError.
                    yield dict(zip(names, row))
                else:
                    yield row

    __call__ = read
class AddAuctionCatalogEntry(Configurable):
    '''Model the sale record and the catalog page it appears on.'''
    helper = Option(required=True)
    non_auctions = Service('non_auctions')

    def __call__(self, data: dict, non_auctions):
        '''Add modeling for auction catalogs as linguistic objects'''
        cno = data['auction_of_lot']['catalog_number']
        rec_num = data['pi_record_no']
        record_uri = self.helper.make_proj_uri('CATALOG', cno, 'RECORD', rec_num)
        record = vocab.ParagraphText(
            ident=record_uri,
            label=f'Sale recorded in catalog (record number {rec_num})')
        data['_sale_record'] = add_crom_data({'uri': record_uri}, record)

        page_id = data.get('pg')
        pdf_page_id = data.get('ppg')
        # Without a page number there is nothing further to model.
        if not page_id:
            yield data
            return

        # Prefer the sale type shared from the events branch, falling back to
        # this record's own flag; remember it for later records.
        sale_type = non_auctions.get(cno, data.get('non_auction_flag'))
        if sale_type:
            non_auctions[cno] = sale_type
        catalog = self.helper.catalog_text(cno, sale_type or 'Auction')
        cdata = add_crom_data(data={'uri': catalog.id}, what=catalog)

        idents = [vocab.PageNumber(ident='', content=page_id)]
        if pdf_page_id:
            page_order = vocab.make_multitype_obj(
                vocab.PageNumber, vocab.OrderNumber,
                ident='', content=pdf_page_id, label='Page Order')
            idents.append(page_order)

        data['_text_page'] = {
            'uri': self.helper.make_proj_uri('CATALOG', cno, 'Page', page_id),
            'object_type': vocab.PageTextForm,
            'label': f'Sale Catalog {cno}, Page {page_id}',
            'identifiers': idents,
            'referred_to_by': [],
            'part_of': [cdata],
            'part': [],
        }
        mlo = MakeLinkedArtLinguisticObject()
        mlo(data['_text_page'])
        yield data
class HTTPGet(Configurable):
    """GET the URL stored under `self.url` in the incoming properties dict,
    storing the response body under `self.content`."""
    url = Option(str, required=False, default='url')
    content = Option(str, required=False, default='content')
    http = Service('http')

    def __call__(self, properties, http):
        target = properties[self.url]
        response = http.get(target)
        if not response.ok:
            logger.error(response.text)
            raise RuntimeError(f'Request fails: {target}')
        properties[self.content] = response.content
        yield properties
class CurriedXMLReader(Configurable):
    '''
    Similar to XMLReader, this reader takes XML filenames as input, and for
    each parses the XML content and yields lxml.etree Element objects matching
    the given XPath expression.
    '''
    xpath = Option(str, required=True)
    fs = Service(
        'fs',
        __doc__='''The filesystem instance to use.''',
    )  # type: str
    mode = Option(
        str,
        default='r',
        __doc__='''What mode to use for open() call.''',
    )  # type: str
    encoding = Option(
        str,
        default='utf-8',
        __doc__='''Encoding.''',
    )  # type: str
    limit = Option(
        int,
        __doc__='''Limit the number of rows read (to allow early pipeline termination).''',
    )
    verbose = Option(bool, default=False)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.count = 0

    def read(self, path, *, fs):
        # Simplified guard (was `not (limit) or (limit and count < limit)`).
        if self.limit and self.count >= self.limit:
            return
        if self.verbose:
            sys.stderr.write('============================== %s\n' % (path, ))
        # BUG FIX: the original only closed the file after the yield loop
        # finished normally, leaking the handle if the consumer abandoned the
        # generator or xpath evaluation raised; `with` closes it in all cases.
        with fs.open(path, self.mode, encoding=self.encoding) as file:
            root = lxml.etree.parse(file)
            for e in root.xpath(self.xpath):
                if self.limit and self.count >= self.limit:
                    break
                # Incremental update so the count survives generator abandonment.
                self.count += 1
                yield e

    __call__ = read
class OverpassExtract(Configurable):
    """POST an Overpass QL query and emit the raw response body."""
    query = Option(str, required=True, positional=True)
    overpass_url = Option(str, required=False, positional=False, default=OVERPASS_URL)
    http = Service('http')

    def __call__(self, http):
        response = http.post(self.overpass_url, data=self.query)
        if response.ok:
            yield response.content
        else:
            logger.error(response.text)
            raise RuntimeError('Overpass query fails')
class MongoReader(Configurable):
    '''Read documents from a mongodb collection.'''
    database = Option(str, positional=True, default='scopus', __doc__='the mongodb database name')
    collection = Option(str, positional=True, default='', __doc__='the mongodb collection name')
    client = Service('mongodb.client')

    def __call__(self, args, *, client):
        db = client[self.database]
        collection = db[self.collection]
        # BUG FIX: the original resolved `collection` and then did nothing with
        # it, so this "reader" emitted no output at all.
        # NOTE(review): emitting every document in the collection; confirm
        # whether a filter derived from `args` was intended (the sibling
        # MongoWriter inserts `args` directly).
        yield from collection.find()
class RecordCounter(Configurable):
    '''Count records flowing through this node, printing progress to stderr
    every `mod` records.'''
    counts = Service('counts')
    verbose = Option(bool, default=False)
    name = Option()

    def __init__(self, *args, **kwargs):
        # BUG FIX: the original called super().__init__(self, *args, **kwargs),
        # passing `self` twice (once implicitly, once explicitly).
        super().__init__(*args, **kwargs)
        self.mod = 100  # progress-reporting interval

    def __call__(self, data, counts):
        counts[self.name] += 1
        count = counts[self.name]
        if count % self.mod == 0:
            # \r rewrites the progress line in place on stderr.
            print(f'\r{count} {self.name}', end='', file=sys.stderr)
        return data
class MongoWriter(Configurable):
    """Insert each incoming record into a mongodb collection."""
    database = Option(str, positional=True, default='scopus', __doc__='the mongo database')
    collection = Option(str, positional=True, default='', __doc__='the mongo collection')
    client = Service('mongodb.client')

    def __call__(self, args, *, client):
        target = client[self.database][self.collection]
        # Keys are sanitized by fix_keys before insertion.
        target.insert_one(fix_keys(args))
class JsonRawMessagesDateReader(Configurable):
    """Stream parsed JSON records from the raw-messages file for the configured date."""
    date = Option(required=True, positional=True)
    database = Service("database")

    @ContextProcessor
    def fp(self, _, *, database):
        # Open once per execution context; closed automatically afterwards.
        with database.open_raw_messages_file(self.date) as fp:
            yield fp

    def __call__(self, fp, _, *, database):
        for raw in fp:
            # Blank lines are not records.
            if not raw.strip():
                continue
            yield json.loads(raw)
class PyfilesLoad(Configurable):
    '''Store the incoming content into pyfiles storage under the configured
    namespace/filename, using the version carried in the properties dict.'''
    namespace = Option(str, required=True, positional=True)
    filename = Option(str, required=True, positional=True)
    content = Option(str, required=False, default='content')
    version = Option(str, required=False, default='version')
    pyfile_storage = Service('pyfile_storage')

    def __call__(self, properties, pyfile_storage):
        # BUG FIX: the original created a new event loop per call and never
        # closed it, leaking the loop (and its selector) on every record.
        event_loop = asyncio.new_event_loop()
        try:
            event_loop.run_until_complete(
                pyfile_storage.store(
                    stream=properties[self.content],
                    namespace=self.namespace,
                    filename=self.filename,
                    version=properties[self.version]))
        finally:
            event_loop.close()
        # Pass the input through unchanged.
        yield NOT_MODIFIED