class AddAuctionEvent(Configurable):
    '''Model an auction event from properties of the supplied `data` dict.'''
    helper = Option(required=True)
    event_properties = Service('event_properties')
    date_modifiers = Service('date_modifiers')

    def __call__(self, data: dict, event_properties, date_modifiers):
        '''Add modeling for an auction event based on properties of the supplied `data` dict.

        Records the event's timespan and date label in the shared
        `event_properties` service (keyed by catalog number), then attaches
        the sale event model object to `data`.
        '''
        record = get_crom_object(data['_catalog'])
        cno = data['catalog_number']
        sale_type = data.get('non_auction_flag', 'Auction')
        ts, begin, end = timespan_from_bound_components(
            data, date_modifiers, 'sale_begin_', 'begin', 'sale_end_', 'eoe')
        event_properties['auction_dates'][cno] = (ts, begin, end)
        # Use the timespan label directly instead of writing it to the shared
        # dict and immediately reading it back (as the original did).
        event_date_label = ts._label
        event_properties['auction_date_label'][cno] = event_date_label
        auction, uid, uri = self.helper.sale_event_for_catalog_number(
            cno, sale_type, date_label=event_date_label)
        auction.referred_to_by = record
        auction.identified_by = model.Name(ident='', content=auction._label)
        data['uid'] = uid
        data['uri'] = uri
        add_crom_data(data=data, what=auction)
        # FIX: removed an unused duplicate `catalog = get_crom_object(data['_catalog'])`
        # lookup that appeared here in the original.
        data['_record'] = data['_catalog']
        return data
class AddPhysicalCatalogEntry(Configurable):
    '''Model the STAR database entry describing a physical auction catalog.'''
    helper = Option(required=True)
    non_auctions = Service('non_auctions')

    def __call__(self, data: dict, non_auctions):
        '''Add modeling for the entry describing a physical auction catalog in the PSCP dataset.'''
        cno = data['catalog_number']
        owner_code = data['owner_code']
        copy_number = data['copy_number']
        star_no = data['star_record_no']

        # The shared `non_auctions` service (populated by the events branch)
        # takes precedence over this row's own flag.
        sale_type = non_auctions.get(cno, data.get('non_auction_flag', 'Auction'))

        uri_parts = [value for value in [cno, owner_code, copy_number] if value]
        record_uri = self.helper.make_proj_uri('ENTRY', 'PHYS-CAT', *uri_parts)

        catalog_label = self.helper.physical_catalog_label(cno, sale_type, owner_code, copy_number)
        row_name = f'STAR Entry for Physical {catalog_label}'
        row = vocab.EntryTextForm(ident=record_uri, content=data['star_csv_data'], label=row_name)
        row.part_of = self.helper.static_instances.get_instance('LinguisticObject', 'db-sales_catalogs')

        creation = model.Creation(ident='')
        creation.carried_out_by = self.helper.static_instances.get_instance('Group', 'gpi')
        row.created_by = creation

        row.identified_by = self.helper.gpi_number_id(star_no, vocab.StarNumber)
        row.identified_by = vocab.PrimaryName(ident='', content=row_name)

        data['_catalog_record'] = add_crom_data({'uri': record_uri}, row)
        yield data
class AddPhysicalCatalogObjects(Configurable):
    '''Model physical copies of an auction catalog.'''
    helper = Option(required=True)
    non_auctions = Service('non_auctions')

    def __call__(self, data: dict, non_auctions):
        '''Add modeling for physical copies of an auction catalog'''
        catalog_text = get_crom_object(data['_catalog'])
        entry_record = get_crom_object(data['_catalog_record'])
        cno = data['catalog_number']
        sale_type = non_auctions.get(cno, 'Auction')
        phys_catalog = self.helper.physical_catalog(
            cno, sale_type, data['owner_code'], data['copy_number'], add_name=True)
        phys_catalog.referred_to_by = entry_record
        data['uri'] = phys_catalog.id
        annotation = data.get('annotation_info')
        if annotation:
            # Free-text annotation about this particular copy, if any.
            phys_catalog.referred_to_by = vocab.Note(ident='', content=annotation)
        phys_catalog.carries = catalog_text
        add_crom_data(data=data, what=phys_catalog)
        return data
class SubmittingCentreExtractor(Configurable):
    """Get unique submitting centre names from the full database."""

    # Injected S3 client service.
    s3client = Service("s3client")

    @ContextProcessor
    def acc(self, context, **kwargs):
        # Accumulate centre names over the pipeline run; once the pipeline is
        # drained, print them sorted (and optionally mirror them to a file).
        centres = yield ValueHolder(set())
        for centre in sorted(centres.get()):
            print(centre)
        # NOTE(review): NO_OUTPUT_FILE is a module-level flag defined elsewhere
        # in this file — confirm its default before relying on the file output.
        if not NO_OUTPUT_FILE:
            with open("/tmp/message.txt", "w") as f:
                for centre in sorted(centres.get()):
                    print(centre, file=f)

    @use_raw_input
    def __call__(self, centres, *args, **kwargs):
        """The accumulator function run by the pipeline.

        Parameters
        ----------
        centres : ValueHolder(set())
            Accumulator for the centre names
        *args : tuple
            A ``(task, key, _)`` triple: a task name (only "process" is
            acted on here) and the key of a clinical data file from which
            to extract the submitting centre.
        **kwargs : dict
            Keyword arguments; must include the ``s3client`` service.
        """
        task, key, _ = args
        s3client = kwargs["s3client"]
        # Only JSON files from "process" tasks carry a submitting centre.
        if task == "process" and Path(key).suffix.lower() == ".json":
            centre = helpers.get_submitting_centre_from_key(s3client, key)
            if centre is not None:
                centres.add(centre)
class AddAuctionCatalog(Configurable):
    '''Model auction catalogs as linguistic objects.'''
    helper = Option(required=True)
    non_auctions = Service('non_auctions')

    def __call__(self, data: dict, non_auctions):
        '''Add modeling for auction catalogs as linguistic objects'''
        cno = data['catalog_number']
        # this information may either come from `data` (for the auction events branch of the pipeline)
        # or from `non_auctions` (for the catalogs branch, which lacks this information,
        # but will have access to the `non_auctions` service which was shared from the events branch)
        sale_type = non_auctions.get(cno, data.get('non_auction_flag'))
        if sale_type:
            non_auctions[cno] = sale_type
        catalog = self.helper.catalog_text(cno, sale_type or 'Auction')
        cdata = {'uri': catalog.id}
        puid = data.get('persistent_puid')
        if puid:
            puid_id = self.helper.gri_number_id(puid)
            catalog.identified_by = puid_id
            cdata['identifiers'] = [puid_id]
        data['_catalog'] = add_crom_data(data=cdata, what=catalog)
        yield data
class AddAuctionHouses(Configurable):
    '''Model the auction house organization(s) associated with an auction event.'''
    helper = Option(required=True)
    event_properties = Service('event_properties')

    def __call__(self, data: dict, event_properties):
        '''
        Add modeling data for the auction house organization(s) associated with
        an auction event.

        Returns a copy of `data` with an `_organizers` list, and appends the
        houses' dicts to the shared `event_properties['auction_houses']` entry
        for this catalog number.
        '''
        auction = get_crom_object(data)
        # FIX: the original fetched `event_record` twice; once is enough.
        event_record = get_crom_object(data['_record'])
        catalog = data['_catalog']['_LOD_OBJECT']
        d = data.copy()
        houses = data.get('auction_house', [])
        cno = data['catalog_number']
        house_dicts = []
        d['_organizers'] = []
        for i, h in enumerate(houses):
            house_dict = self.helper.copy_source_information(h, data)
            # Keep an unmodified copy for the shared service; only its URI is
            # updated after the helper fills in the house data.
            house_dict_copy = house_dict.copy()
            h['_catalog'] = catalog
            self.helper.add_auction_house_data(house_dict, sequence=i, event_record=event_record)
            house_dict_copy['uri'] = house_dict['uri']
            house_dicts.append(house_dict_copy)
            house = get_crom_object(h)
            act = vocab.AuctionHouseActivity(ident='', label=f'Activity of {house._label}')
            act.carried_out_by = house
            auction.part = act
            d['_organizers'].append(h)
        event_properties['auction_houses'][cno] += house_dicts
        return d
class AddFieldNamesService(Configurable):
    '''Zip positional row data with the header names configured for `key`.'''
    key = Option(
        required=False
    )  # This is passed into __init__ as a kwarg but not into __call__
    field_names = Service(
        'header_names'
    )  # This is passed into __call__ as a kwarg not at __init__
    # ... go figure

    def __init__(self, *args, **kwargs):
        '''
        Sets the __name__ property to include the relevant options
        so that when the bonobo graph is serialized as a GraphViz document,
        different objects can be visually differentiated.
        '''
        # BUG FIX: the original called super().__init__(self, *args, **kwargs),
        # passing `self` twice (once implicitly, once explicitly).
        super().__init__(*args, **kwargs)
        self.__name__ = f'{type(self).__name__} ({self.key})'

    def __call__(self, *data, field_names=None):
        # Avoid a mutable default argument; behavior is unchanged since the
        # original default {} was never mutated.
        if field_names is None:
            field_names = {}
        # A single tuple/list argument is treated as the row itself.
        if len(data) == 1 and type(data[0]) in (tuple, list):
            data = data[0]
        names = field_names.get(self.key, []) if isinstance(field_names, dict) else field_names
        return dict(zip(names, data))
class FileHandler(Configurable):
    """Abstract component factory for file-related components.

    Args:
        fs (str): service name to use for filesystem.
        path (str): which path to use within the provided filesystem.
        eol (str): which character to use to separate lines.
        mode (str): which mode to use when opening the file.
        encoding (str): which encoding to use when opening the file.
    """

    path = Option(
        filesystem_path, required=True, positional=True,
        __doc__="Path to use within the provided filesystem.")  # type: str
    eol = Option(str, default="\n", __doc__="Character to use as line separator.")  # type: str
    mode = Option(str, __doc__="What mode to use for open() call.")  # type: str
    encoding = Option(str, default="utf-8", __doc__="Encoding.")  # type: str
    fs = Service("fs", __doc__="The filesystem instance to use.")  # type: str

    @ContextProcessor
    def file(self, context, *, fs):
        # Open the file once per execution context and hand the handle to
        # each call; the `with` guarantees it is closed when the context ends.
        handle = self.open(fs)
        with handle as file:
            yield file

    def open(self, fs):
        opened = fs.open(self.path, self.mode, encoding=self.encoding)
        return opened
class MessageCountWriter(Configurable):
    """Persist a per-channel message count for the configured day."""
    date = Option(required=True, positional=True)
    message_count_database = Service("message_count")

    def __call__(self, _, channel, count, *, message_count_database):
        # Storage is fully delegated to the message-count database service.
        message_count_database.set_day_channel(self.date, channel, count)
class ChannelsSource(Configurable):
    """Emit every channel that has message counts recorded for the configured day."""
    date = Option(positional=True, required=True)
    message_count_database = Service("message_count")

    def __call__(self, message_count_database):
        channels = message_count_database.get_channels_for_day(self.date)
        for channel in channels:
            yield channel
class Trace(Configurable):
    '''
    Debugging node: pretty-print (and optionally diff against the previous
    snapshot) the contents of the dicts flowing through the pipeline, for
    the trace ids listed in `ordinals`.
    '''
    name = Option()
    diff = Option(default=False)
    ordinals = Option(default=(0,))
    trace_counter = Service('trace_counter')

    def __call__(self, thing: dict, trace_counter):
        key = '__trace_id'
        skey = '__trace_seq'
        # Idiom fix: `key not in thing` (was `not key in thing`).
        if key not in thing:
            thing[key] = next(trace_counter)
            thing[skey] = 1
        else:
            thing[skey] += 1
        # Renamed from `id` to avoid shadowing the builtin.
        trace_id = thing[key]
        seq = thing[skey]
        if trace_id in self.ordinals:
            formatted = pprint.pformat({k: v for k, v in thing.items() if not k.startswith('__trace_')})
            if formatted[0] == '{' and formatted[-1] == '}':
                # adding newlines and a trailing comma helps with making a sensible diff
                formatted = '{\n ' + formatted[1:-1] + ',\n}\n'
            if self.diff:
                previous = thing.get('__trace_%d_%d' % (trace_id, seq - 1))
                print('===========> %s #%d: sequence %d' % (self.name, trace_id, seq))
                if previous:
                    lines = difflib.ndiff(
                        previous.splitlines(keepends=True),
                        formatted.splitlines(keepends=True))
                    sys.stdout.writelines(lines)
                else:
                    print(formatted)
            else:
                print(formatted)
            # Stash this snapshot so the next sequence step can diff against it.
            thing['__trace_%d_%d' % (trace_id, seq)] = formatted
        return thing
class MyServiceDependantConfigurable(Configurable):
    """A configurable whose work is delegated entirely to an injected printer service."""
    printer = Service(
        PrinterInterface,
    )

    def __call__(self, printer: PrinterInterface, *args):
        result = printer.print(*args)
        return result
class JsonRawThreadsWriter(_FileLdJsonWriter):
    """Line-delimited JSON writer targeting the raw-threads file for one day."""
    date = Option(required=True, positional=True)
    database = Service("database")

    def open(self, database):
        # The database service owns file placement; we only ask for write mode.
        fp = database.open_raw_threads_file(self.date, mode="w")
        return fp
class OpendatasoftExtract(Configurable):
    '''
    Extract a dataset export URL, and a version string derived from the
    dataset's processing date, from an Opendatasoft portal catalog API.
    '''
    portal = Option(str, required=True, positional=True)
    dataset_id = Option(str, required=True, positional=True)
    format = Option(str, required=True, positional=True)
    http = Service('http')

    def __call__(self, http):
        exports_url, str_date = self.get_metadata(http, self.portal, self.dataset_id)
        version = self.date2version(str_date)
        url = self.get_export_url(http, exports_url, self.format)
        yield {
            'url': url,
            'version': version,
        }

    def get_metadata(self, http, portal, dataset_id):
        '''Return (exports_url, data_processed_date) for the dataset.

        Raises RuntimeError on HTTP failure and ValueError on malformed metadata.
        '''
        url = f'{portal}/api/v2/catalog/datasets/{dataset_id}'
        result = http.get(url)
        if not result.ok:
            raise RuntimeError(f'Failed to fetch metadata content from {url}')
        try:
            metadata = result.json()
        except ValueError as e:
            raise ValueError(f'Failed to parse json metadata from {url}') from e
        try:
            str_date = metadata['dataset']['metas']['default']['data_processed']
            # BUG FIX: next() raises StopIteration (not KeyError) when no link
            # matches, which the original never caught; use a default and check.
            link = next(filter(lambda d: d['rel'] == 'exports', metadata['links']), None)
            if link is None:
                raise ValueError(f'Failed to use metadata from {url}')
            exports_url = link['href']
        except KeyError as e:
            raise ValueError(f'Failed to use metadata from {url}') from e
        return exports_url, str_date

    def date2version(self, str_date):
        '''Convert an ISO datetime string to a dotted date version (YYYY.MM.DD).'''
        # ISO date, just cut it
        return str_date[:10].replace('-', '.')

    def get_export_url(self, http, exports_url, format):
        '''Return the export href matching `format`.

        Raises RuntimeError on HTTP failure or unavailable format, ValueError
        on malformed export metadata.
        '''
        result = http.get(exports_url)
        if not result.ok:
            raise RuntimeError(f'Failed to fetch export list from {exports_url}')
        try:
            exports = result.json()
        except ValueError as e:
            raise ValueError(f'Failed to parse json export list from {exports_url}') from e
        try:
            # BUG FIX: supply a default so a missing format reaches the
            # RuntimeError below instead of raising uncaught StopIteration
            # (which also made the original `if not link` check unreachable).
            link = next(filter(lambda d: d['rel'] == format, exports['links']), None)
        except KeyError as e:
            raise ValueError(f'Failed to retrieve export format {format} from {exports_url}') from e
        if not link:
            raise RuntimeError(f'Export format {format} from {exports_url} not available')
        return link['href']
class AddPhysicalCatalogOwners(Configurable):
    '''Model the ownership of a physical copy of an auction catalog.'''
    helper = Option(required=True)
    location_codes = Service('location_codes')
    unique_catalogs = Service('unique_catalogs')

    def __call__(self, data: dict, location_codes, unique_catalogs):
        '''Add information about the ownership of a physical copy of an auction catalog'''
        # Add the URI of this physical catalog to `unique_catalogs`. This data will be used
        # later to figure out which catalogs can be uniquely identified by a catalog number
        # and owner code (e.g. for owners who do not have multiple copies of a catalog).
        cno = data['catalog_number']
        owner_code = data['owner_code']
        copy_number = data.get('copy_number', '')
        owner_name = None
        entry_record = get_crom_object(data.get('_catalog'))
        with suppress(KeyError):
            owner_name = location_codes[owner_code]
        owner_uri = self.helper.make_proj_uri('ORGANIZATION', 'LOCATION-CODE', owner_code)
        # NOTE(review): if owner_code is absent from location_codes, owner_name
        # stays None and becomes the Name content below — confirm that's intended.
        data['_owner'] = {
            'label': owner_name,
            'uri': owner_uri,
            'referred_to_by': [entry_record],
            'identifiers': [
                model.Name(ident='', content=owner_name),
                model.Identifier(ident='', content=str(owner_code))
            ],
        }
        owner = model.Group(ident=owner_uri)
        owner.referred_to_by = entry_record
        if not owner_code:
            warnings.warn(f'Setting empty identifier on {owner.id}')
        # BUG FIX: add_crom_data was called twice with identical arguments in
        # the original; a single call suffices.
        add_crom_data(data=data['_owner'], what=owner)
        catalog = get_crom_object(data)
        catalog.current_owner = owner
        # A None copy number produces a key that stands in for all the copies
        # belonging to a single owner (renamed from the original's second
        # `owner_uri`, which shadowed the organization URI above).
        all_copies_uri = self.helper.physical_catalog_uri(cno, owner_code, None)
        copy_uri = self.helper.physical_catalog_uri(cno, owner_code, copy_number)
        unique_catalogs[all_copies_uri].add(copy_uri)
        return data
class JsonRawThreadsReader(Configurable):
    """Yield each JSON record from the raw-threads file for a given date."""
    database = Service("database")

    def __call__(self, _, date, *, database):
        with database.open_raw_threads_file(date) as fp:
            for raw_line in fp:
                # Skip blank lines; everything else is one JSON document.
                if not raw_line.strip():
                    continue
                yield json.loads(raw_line)
class MakeAATAAbstract(Configurable):
    helper = Option(required=True)
    language_code_map = Service('language_code_map')

    def __call__(self, data, language_code_map):
        '''
        Given a `dict` representing an "article," extract the abstract records,
        yielding a new `dict` for each such record.

        The resulting abstract `dict` will contain these keys:

        * `_LOD_OBJECT`: A `model.LinguisticObject` object representing the abstract
        * `content`: The text content of the abstract
        * `language`: A model object representing the declared language of the
          abstract (only when the code resolves to a language object)
        * `_authors`: The authorship information from the input article `dict`
          (when present)
        * `uri`: The URI minted for this abstract
        * `parent_data`: The `dict` representing the corresponding article

        plus any keys (other than `language`) copied through from the input
        abstract record (e.g. `author_abstract_flag`, identifiers).
        '''
        lod_object = get_crom_object(data)
        for a in data.get('_abstracts', []):
            # Copy everything except `language`, which is replaced below with
            # a model object resolved from the language code.
            abstract_dict = {
                k: v for k, v in a.items() if k not in ('language', )
            }
            abstract_uri = self.helper.make_proj_uri(
                'Abstract', data['_aata_record_id'], a['_aata_record_abstract_seq'])
            content = a.get('content')
            abstract = vocab.Abstract(ident=abstract_uri, content=content)
            abstract.refers_to = lod_object
            langcode = a.get('language')
            if langcode is not None:
                language = self.helper.language_object_from_code(
                    langcode, language_code_map)
                if language is not None:
                    abstract.language = language
                    abstract_dict['language'] = language
            if '_authors' in data:
                abstract_dict['_authors'] = data['_authors']
            # The URI above is based on the AATA record id and the sequence
            # number of the abstract within that record.
            abstract_dict.update({
                'parent_data': data,
                'uri': abstract_uri,
            })
            add_crom_data(data=abstract_dict, what=abstract)
            yield abstract_dict
class RecentlyActiveChannelSource(Configurable):
    """Yield the channels with recorded messages in the 3 working days up to `date`."""
    message_count_database = Service("message_count")
    date = Option(required=True, positional=True)

    def __call__(self, message_count_database):
        seen = set()
        window_start = nworking_days_before(self.date, 3)
        for day in date_range(window_start, self.date):
            seen.update(message_count_database.get_channels_for_day(day))
        yield from seen
class HTTPGetExtract(Configurable):
    """Fetch the configured URL once and emit the raw response body."""
    url = Option(str, required=True, positional=True)
    content = Option(str, required=False, default='content')
    http = Service('http')

    def __call__(self, http):
        response = http.get(self.url)
        if response.ok:
            yield response.content
        else:
            # Log the server's error body before failing the pipeline node.
            logger.error(response.text)
            raise RuntimeError(f'Request fails: {self.url}')
class JsonEnrichedMessagesReader(Configurable):
    """Stream parsed JSON records from the enriched-messages file."""
    database = Service("database")

    @ContextProcessor
    def fp(self, _, *, database):
        # Open once per execution context; closed automatically afterwards.
        with database.open_enriched_messages_file() as fp:
            yield fp

    def __call__(self, fp, _, *, database):
        yield from map(json.loads, fp)
class CurriedCSVReader(Configurable):
    '''
    This reader takes CSV filenames as input, and for each parses the CSV
    content and yields a tuple of strings for each row (or a dict when
    `field_names` is configured).
    '''
    fs = Service(
        'fs',
        __doc__='''The filesystem instance to use.''',
    )  # type: str
    mode = Option(
        str,
        default='r',
        __doc__='''What mode to use for open() call.''',
    )  # type: str
    encoding = Option(
        str,
        default='utf-8',
        __doc__='''Encoding.''',
    )  # type: str
    limit = Option(
        int,
        __doc__='''Limit the number of rows read (to allow early pipeline termination).''',
    )
    field_names = Option()

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.count = 0

    def read(self, path, *, fs):
        # Simplified guard (was `not (limit) or (limit and count < limit)`).
        if self.limit and self.count >= self.limit:
            return
        sys.stderr.write('============================== %s\n' % (path, ))
        names = self.field_names
        # `with` guarantees the file closes even if the consumer abandons
        # this generator; the original left the close to the fs object.
        with fs.open(path, newline='') as csvfile:
            for row in csv.reader(csvfile):
                if self.limit and self.count >= self.limit:
                    break
                # Incremental update (the original wrote self.count back only
                # after the loop, losing the count if the generator was
                # abandoned mid-file).
                self.count += 1
                if names:
                    # Idiom fix: dict(zip(...)) instead of an index loop. Note
                    # zip also tolerates short rows instead of raising IndexError.
                    yield dict(zip(names, row))
                else:
                    yield row

    __call__ = read
class AddAuctionCatalogEntry(Configurable):
    '''Model the sale record and the catalog page it appears on.'''
    helper = Option(required=True)
    non_auctions = Service('non_auctions')

    def __call__(self, data: dict, non_auctions):
        '''Add modeling for auction catalogs as linguistic objects'''
        cno = data['auction_of_lot']['catalog_number']
        rec_num = data['pi_record_no']
        record_uri = self.helper.make_proj_uri('CATALOG', cno, 'RECORD', rec_num)
        record = vocab.ParagraphText(
            ident=record_uri,
            label=f'Sale recorded in catalog (record number {rec_num})')
        data['_sale_record'] = add_crom_data({'uri': record_uri}, record)

        page_id = data.get('pg')
        pdf_page_id = data.get('ppg')
        # Without a page number there is nothing further to model.
        if not page_id:
            yield data
            return

        # Prefer the sale type shared from the events branch, falling back to
        # this record's own flag; remember it for later records.
        sale_type = non_auctions.get(cno, data.get('non_auction_flag'))
        if sale_type:
            non_auctions[cno] = sale_type
        catalog = self.helper.catalog_text(cno, sale_type or 'Auction')
        cdata = add_crom_data(data={'uri': catalog.id}, what=catalog)

        idents = [vocab.PageNumber(ident='', content=page_id)]
        if pdf_page_id:
            page_order = vocab.make_multitype_obj(
                vocab.PageNumber, vocab.OrderNumber,
                ident='', content=pdf_page_id, label='Page Order')
            idents.append(page_order)

        data['_text_page'] = {
            'uri': self.helper.make_proj_uri('CATALOG', cno, 'Page', page_id),
            'object_type': vocab.PageTextForm,
            'label': f'Sale Catalog {cno}, Page {page_id}',
            'identifiers': idents,
            'referred_to_by': [],
            'part_of': [cdata],
            'part': [],
        }
        mlo = MakeLinkedArtLinguisticObject()
        mlo(data['_text_page'])
        yield data
class HTTPGet(Configurable):
    """GET the URL stored under `self.url` in the incoming properties dict,
    storing the response body under `self.content`."""
    url = Option(str, required=False, default='url')
    content = Option(str, required=False, default='content')
    http = Service('http')

    def __call__(self, properties, http):
        target = properties[self.url]
        response = http.get(target)
        if not response.ok:
            logger.error(response.text)
            raise RuntimeError(f'Request fails: {target}')
        properties[self.content] = response.content
        yield properties
class CurriedXMLReader(Configurable):
    '''
    Similar to XMLReader, this reader takes XML filenames as input, and for
    each parses the XML content and yields lxml.etree Element objects matching
    the given XPath expression.
    '''
    xpath = Option(str, required=True)
    fs = Service(
        'fs',
        __doc__='''The filesystem instance to use.''',
    )  # type: str
    mode = Option(
        str,
        default='r',
        __doc__='''What mode to use for open() call.''',
    )  # type: str
    encoding = Option(
        str,
        default='utf-8',
        __doc__='''Encoding.''',
    )  # type: str
    limit = Option(
        int,
        __doc__='''Limit the number of rows read (to allow early pipeline termination).''',
    )
    verbose = Option(bool, default=False)

    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.count = 0

    def read(self, path, *, fs):
        # Simplified guard (was `not (limit) or (limit and count < limit)`).
        if self.limit and self.count >= self.limit:
            return
        if self.verbose:
            sys.stderr.write('============================== %s\n' % (path, ))
        # BUG FIX: the original only closed the file after the yield loop
        # finished normally, leaking the handle if the consumer abandoned the
        # generator or xpath evaluation raised; `with` closes it in all cases.
        with fs.open(path, self.mode, encoding=self.encoding) as file:
            root = lxml.etree.parse(file)
            for e in root.xpath(self.xpath):
                if self.limit and self.count >= self.limit:
                    break
                # Incremental update so the count survives generator abandonment.
                self.count += 1
                yield e

    __call__ = read
class OverpassExtract(Configurable):
    """POST an Overpass QL query and emit the raw response body."""
    query = Option(str, required=True, positional=True)
    overpass_url = Option(str, required=False, positional=False, default=OVERPASS_URL)
    http = Service('http')

    def __call__(self, http):
        response = http.post(self.overpass_url, data=self.query)
        if response.ok:
            yield response.content
        else:
            logger.error(response.text)
            raise RuntimeError('Overpass query fails')
class MongoReader(Configurable):
    '''Read documents from a mongodb collection.'''
    database = Option(str, positional=True, default='scopus', __doc__='the mongodb database name')
    collection = Option(str, positional=True, default='', __doc__='the mongodb collection name')
    client = Service('mongodb.client')

    def __call__(self, args, *, client):
        db = client[self.database]
        collection = db[self.collection]
        # BUG FIX: the original resolved `collection` and then did nothing with
        # it, so this "reader" emitted no output at all.
        # NOTE(review): emitting every document in the collection; confirm
        # whether a filter derived from `args` was intended (the sibling
        # MongoWriter inserts `args` directly).
        yield from collection.find()
class RecordCounter(Configurable):
    '''Count records flowing through this node, printing progress to stderr
    every `mod` records.'''
    counts = Service('counts')
    verbose = Option(bool, default=False)
    name = Option()

    def __init__(self, *args, **kwargs):
        # BUG FIX: the original called super().__init__(self, *args, **kwargs),
        # passing `self` twice (once implicitly, once explicitly).
        super().__init__(*args, **kwargs)
        self.mod = 100  # progress-reporting interval

    def __call__(self, data, counts):
        counts[self.name] += 1
        count = counts[self.name]
        if count % self.mod == 0:
            # \r rewrites the progress line in place on stderr.
            print(f'\r{count} {self.name}', end='', file=sys.stderr)
        return data
class MongoWriter(Configurable):
    """Insert each incoming record into a mongodb collection."""
    database = Option(str, positional=True, default='scopus', __doc__='the mongo database')
    collection = Option(str, positional=True, default='', __doc__='the mongo collection')
    client = Service('mongodb.client')

    def __call__(self, args, *, client):
        target = client[self.database][self.collection]
        # Keys are sanitized by fix_keys before insertion.
        target.insert_one(fix_keys(args))
class JsonRawMessagesDateReader(Configurable):
    """Stream parsed JSON records from the raw-messages file for the configured date."""
    date = Option(required=True, positional=True)
    database = Service("database")

    @ContextProcessor
    def fp(self, _, *, database):
        # Open once per execution context; closed automatically afterwards.
        with database.open_raw_messages_file(self.date) as fp:
            yield fp

    def __call__(self, fp, _, *, database):
        for raw in fp:
            # Blank lines are not records.
            if not raw.strip():
                continue
            yield json.loads(raw)
class PyfilesLoad(Configurable):
    '''Store the incoming content into pyfiles storage under the configured
    namespace/filename, using the version carried in the properties dict.'''
    namespace = Option(str, required=True, positional=True)
    filename = Option(str, required=True, positional=True)
    content = Option(str, required=False, default='content')
    version = Option(str, required=False, default='version')
    pyfile_storage = Service('pyfile_storage')

    def __call__(self, properties, pyfile_storage):
        # BUG FIX: the original created a new event loop per call and never
        # closed it, leaking the loop (and its selector) on every record.
        event_loop = asyncio.new_event_loop()
        try:
            event_loop.run_until_complete(
                pyfile_storage.store(
                    stream=properties[self.content],
                    namespace=self.namespace,
                    filename=self.filename,
                    version=properties[self.version]))
        finally:
            event_loop.close()
        # Pass the input through unchanged.
        yield NOT_MODIFIED