def test_grouper():
    my_iterable = list(range(100))
    assert len(list(grouper(my_iterable, 10))) == 10

    my_iterable = list(range(100)) + [None]
    my_groups = list(grouper(my_iterable, 10))
    assert len(my_groups) == 11
    assert len(my_groups[10]) == 1
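# The grouper helper itself never appears in these examples. Below is a
# minimal sketch of two plausible variants, both hypothetical
# reconstructions: the test above implies the islice-based variant (the
# final chunk keeps only the leftover items, hence len(my_groups[10]) == 1),
# while the snippets that call grouper(keys, n, None) assume the classic
# itertools recipe that pads the final chunk with a fillvalue.
from itertools import islice, zip_longest


def grouper(iterable, n):
    """Yield lists of up to n items; the final chunk is not padded."""
    it = iter(iterable)
    return iter(lambda: list(islice(it, n)), [])


def grouper_padded(iterable, n, fillvalue=None):
    """itertools recipe: fixed-length tuples, final chunk padded with fillvalue."""
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)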
def get_items(self):
    """
    Custom get_items to allow for incremental building
    for a whole set of stores
    """
    self.logger.info("Starting Website Builder")
    self.ensure_indexes()

    keys = self.get_keys()
    self.logger.info("Processing {} items".format(len(keys)))
    self.total = len(keys)

    # Chunk keys by chunk size for good data IO
    for chunked_keys in grouper(keys, self.chunk_size, None):
        # Strip the fillvalue padding from the final chunk
        chunked_keys = list(filter(None.__ne__, chunked_keys))
        docs = {
            d[self.materials.key]: d
            for d in self.materials.query(
                criteria={self.materials.key: {"$in": chunked_keys}}
            )
        }

        self.add_thermo_docs(docs)
        self.add_aux_docs(docs)

        for d in docs.values():
            yield d
def get_groups_from_keys(self, keys) -> Set[Tuple]:
    """
    Get the groups by grouping_keys for these documents
    """
    grouping_keys = self.grouping_keys
    groups: Set[Tuple] = set()

    for chunked_keys in grouper(keys, self.chunk_size):
        docs = list(
            self.source.query(
                criteria={self.source.key: {"$in": chunked_keys}},
                properties=grouping_keys,
            )
        )

        sub_groups = set(
            tuple(get(d, prop, None) for prop in grouping_keys) for d in docs
        )
        self.logger.debug(f"Found {len(sub_groups)} subgroups to process")

        groups |= sub_groups

    self.logger.info(f"Found {len(groups)} groups to process")
    return groups
def run(self, log_level=logging.DEBUG):
    """
    Run the builder serially.

    This is only intended for diagnostic purposes
    """
    # Set up logging
    root = logging.getLogger()
    root.setLevel(log_level)
    ch = TqdmLoggingHandler()
    formatter = logging.Formatter(
        "%(asctime)s - %(name)s - %(levelname)s - %(message)s"
    )
    ch.setFormatter(formatter)
    root.addHandler(ch)

    self.connect()
    cursor = self.get_items()

    for chunk in grouper(tqdm(cursor), self.chunk_size):
        self.logger.info("Processing batch of {} items".format(self.chunk_size))
        processed_chunk = [self.process_item(item) for item in chunk]
        processed_items = [item for item in processed_chunk if item is not None]
        self.update_targets(processed_items)

    self.finalize()
def get_items(self):
    self.logger.info("Starting {} Builder".format(self.__class__.__name__))

    self.ensure_indexes()

    if self.incremental:
        keys = source_keys_updated(
            source=self.source, target=self.target, query=self.query
        )
    else:
        keys = self.source.distinct(self.source.key, self.query)

    self.logger.info("Processing {} items".format(len(keys)))

    if self.projection:
        projection = list(
            set(self.projection + [self.source.key, self.source.lu_field])
        )
    else:
        projection = None

    self.total = len(keys)
    for chunked_keys in grouper(keys, self.chunk_size, None):
        # Strip the fillvalue padding from the final chunk
        chunked_keys = list(filter(None.__ne__, chunked_keys))
        for doc in list(
            self.source.query(
                criteria={self.source.key: {"$in": chunked_keys}},
                properties=projection,
            )
        ):
            yield doc
def prechunk(self, number_splits: int) -> Iterator[Dict]:
    """
    Generic prechunk for map builder to perform domain-decomposition
    by the key field
    """
    self.ensure_indexes()

    keys = self.target.newer_in(self.source, criteria=self.query, exhaustive=True)

    N = ceil(len(keys) / number_splits)
    for split in grouper(keys, N):
        yield {"query": {self.source.key: {"$in": list(split)}}}
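# A runnable illustration of how this prechunk splits work, using
# hypothetical key values, a hypothetical "task_id" key field, and the
# standalone islice-based grouper: with 5 keys and number_splits=2,
# N = ceil(5/2) = 3, so the keys decompose into two "$in" queries.
from itertools import islice
from math import ceil


def grouper(iterable, n):
    it = iter(iterable)
    return iter(lambda: list(islice(it, n)), [])


keys = ["mp-1", "mp-2", "mp-3", "mp-4", "mp-5"]
N = ceil(len(keys) / 2)
for split in grouper(keys, N):
    print({"query": {"task_id": {"$in": list(split)}}})
# {'query': {'task_id': {'$in': ['mp-1', 'mp-2', 'mp-3']}}}
# {'query': {'task_id': {'$in': ['mp-4', 'mp-5']}}}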
def prechunk(self, number_splits: int) -> Iterator[Dict]:
    """
    Generic prechunk for group builder to perform domain-decomposition
    by the grouping keys
    """
    self.ensure_indexes()

    keys = self.get_ids_to_process()
    groups = self.get_groups_from_keys(keys)

    N = ceil(len(groups) / number_splits)
    # Split the groups (not the raw keys), since N is sized from the groups
    # and each group tuple pairs off against the grouping keys
    for split in grouper(list(groups), N):
        yield {
            "query": {
                "$or": [
                    dict(zip(self.grouping_keys, group))
                    for group in split
                    if group is not None
                ]
            }
        }
def get_items(self):
    # Borrowed from MapBuilder
    keys = self.propnet_store.distinct('task_id', criteria=self.criteria)
    containers = self.props + ['inputs']
    self.total = len(keys)

    for chunked_keys in grouper(keys, self.chunk_size, None):
        chunked_keys = list(filter(None.__ne__, chunked_keys))
        for doc in list(
            self.propnet_store.query(
                criteria={'task_id': {"$in": chunked_keys}},
                properties=containers + ['task_id'],
            )
        ):
            yield doc
def run(self):
    """
    Run the builder serially
    """
    self.connect()

    cursor = self.get_items()

    for chunk in grouper(cursor, self.chunk_size):
        self.logger.info("Processing batch of {} items".format(self.chunk_size))
        processed_items = [
            self.process_item(item) for item in chunk if item is not None
        ]
        self.update_targets(processed_items)

    self.finalize(cursor)
def process(self, builder_id):
    """
    Run the builder using the builtin multiprocessing.
    Adapted from pymatgen-db

    Args:
        builder_id (int): the index of the builder in the builders list
    """
    builder = self.builders[builder_id]
    chunk_size = builder.chunk_size
    processing_builder = reload_msonable_object(builder)

    process_pool = Pool(self.num_workers, maxtasksperchild=chunk_size)
    cursor = builder.get_items()
    for items in grouper(
        process_pool.imap(processing_builder.process_item, cursor), chunk_size
    ):
        self.logger.info("Completed {} items".format(chunk_size))
        builder.update_targets(items)

    builder.finalize(cursor)
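# A self-contained sketch of the pattern above: stream results out of
# Pool.imap and regroup them into chunks for batched writes. square() and
# the sizes are hypothetical stand-ins for process_item and chunk_size.
from itertools import islice
from multiprocessing import Pool


def grouper(iterable, n):
    it = iter(iterable)
    return iter(lambda: list(islice(it, n)), [])


def square(x):
    return x * x


if __name__ == "__main__":
    with Pool(4) as pool:
        # imap yields results lazily in input order; grouper batches them
        for batch in grouper(pool.imap(square, range(10)), 4):
            print(batch)
# [0, 1, 4, 9]
# [16, 25, 36, 49]
# [64, 81]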
def remove_docs(self, criteria: Dict, remove_s3_object: bool = False):
    """
    Remove docs matching the query dictionary

    Args:
        criteria: query dictionary to match
        remove_s3_object: whether to remove the actual S3 object or not
    """
    if not remove_s3_object:
        self.index.remove_docs(criteria=criteria)
    else:
        to_remove = self.index.distinct(self.key, criteria=criteria)
        self.index.remove_docs(criteria=criteria)

        # Can remove up to 1000 items at a time via boto
        to_remove_chunks = list(grouper(to_remove, n=1000))
        for chunk_to_remove in to_remove_chunks:
            objlist = [{"Key": self.sub_dir + obj} for obj in chunk_to_remove]
            self.s3_bucket.delete_objects(Delete={"Objects": objlist})
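# The 1000-object cap comes from S3's DeleteObjects API. A minimal
# standalone sketch of the same chunked deletion, assuming a hypothetical
# bucket name and key prefix:
from itertools import islice

import boto3


def grouper(iterable, n):
    it = iter(iterable)
    return iter(lambda: list(islice(it, n)), [])


bucket = boto3.resource("s3").Bucket("my-example-bucket")
keys_to_remove = ["docs/{}.json".format(i) for i in range(2500)]

for chunk in grouper(keys_to_remove, 1000):
    bucket.delete_objects(Delete={"Objects": [{"Key": k} for k in chunk]})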
def process(self, builder_id):
    """
    Run the builder serially

    Args:
        builder_id (int): the index of the builder in the builders list
    """
    builder = self.builders[builder_id]
    chunk_size = builder.chunk_size

    # establish connection to the sources and targets
    builder.connect()

    cursor = builder.get_items()

    for chunk in grouper(cursor, chunk_size):
        self.logger.info("Processing batch of {} items".format(chunk_size))
        processed_items = [
            builder.process_item(item) for item in filter(None, chunk)
        ]
        builder.update_targets(processed_items)
def get_items(self):
    """
    Generic get_items for Map Builder designed to perform incremental building
    """
    self.logger.info("Starting {} Builder".format(self.__class__.__name__))

    self.ensure_indexes()

    keys = self.target.newer_in(self.source, criteria=self.query, exhaustive=True)

    if self.retry_failed:
        # Re-queue documents that previously failed processing
        failed_keys = self.target.distinct(
            self.target.key, criteria={"state": "failed"}
        )
        keys = list(set(keys + failed_keys))

    self.logger.info("Processing {} items".format(len(keys)))

    if self.projection:
        projection = list(
            set(self.projection + [self.source.key, self.source.last_updated_field])
        )
    else:
        projection = None

    self.total = len(keys)
    for chunked_keys in grouper(keys, self.chunk_size):
        chunked_keys = list(chunked_keys)
        for doc in list(
            self.source.query(
                criteria={self.source.key: {"$in": chunked_keys}},
                properties=projection,
            )
        ):
            yield doc
def _request_with_fewer_props(self, n, k, reduce_batch_on_fail=False):
    """
    Requests the nth page of k results from the AFLUX API, using batches
    of properties. The algorithm divides the number of properties into x
    chunks, starting with x = 2, and requests each chunk. If one of the
    chunks fails, optionally, the batch size is reduced according to
    `_request_with_smaller_batch()`. If the chunk continues to fail, x is
    increased by 1, the properties are re-chunked and re-requested. This
    proceeds until each chunk contains only one property. If the query
    still fails, an error is raised.

    Args:
        n (int): page number of the results to return.
        k (int): number of datasets per page.
        reduce_batch_on_fail (bool): True causes batch size to decrease if
            a query fails to produce results prior to decreasing the chunk
            size. False does not decrease the batch size. Default: False

    Returns:
        dict: cumulative response from API
    """
    if len(self.responses) == 0:
        # We are making the very first request, finalize the query.
        self.finalize()

    collected_responses = defaultdict(dict)
    props = self.selects
    chunks = 2

    # Split up current query matchbook to recover filters
    matchbook_splitter = re.compile(r"(?!\'),(?<!\')")
    filter_identifier = re.compile(r"\(.+\)")

    current_matchbook = self._matchbook
    split_matchbook = matchbook_splitter.split(current_matchbook)
    orderby_kw = split_matchbook[0]  # Preserves orderby keyword
    filters = []
    for item in split_matchbook[1:]:
        if filter_identifier.search(item):
            filters.append(item)

    while len(props) // chunks >= 1:
        if len(props) / chunks < 2:
            chunks = len(props) + 1
        query_error = False
        for chunk in grouper(props, (len(props) // chunks) + 1):
            logger.debug(
                'Requesting property chunk {} with {} records'.format(chunks, k)
            )
            props_to_request = list(set(c for c in chunk if c is not None))

            # Exclude orderby keyword if it is not requested in this chunk.
            # If it is included, remove from requested properties to avoid
            # duplication in URI
            orderby_prop = None
            orderby_str = None
            for prop in props_to_request:
                if orderby_kw.startswith(prop):
                    if orderby_kw.startswith('$'):
                        orderby_str = orderby_kw[1:]
                    else:
                        orderby_str = orderby_kw
                    orderby_prop = prop
                    break
            if orderby_prop:
                props_to_request.remove(orderby_prop)
            else:
                if orderby_kw.startswith('$'):
                    orderby_str = orderby_kw
                else:
                    orderby_str = '$' + orderby_kw

            matchbook_list = [orderby_str] + filters + props_to_request

            query = AflowAPIQuery(
                catalog=self.catalog, batch_reduction=reduce_batch_on_fail
            )
            query.finalize()
            query._matchbook = ",".join(matchbook_list)

            try:
                query._request(n, k)
            except ValueError:  # pragma: no cover
                query_error = True

            if not query_error:
                response = query.responses[n]
                for record_key, record in response.items():
                    collected_responses[record_key].update(record)
            else:  # pragma: no cover
                break

        if query_error:  # pragma: no cover
            chunks += 1
        else:
            return collected_responses

    raise ValueError(  # pragma: no cover
        "The API failed to complete the request "
        "and reducing the number of properties failed to fix it."
    )
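# The re-chunking arithmetic above is easy to check in isolation. With the
# padded itertools grouper and hypothetical numbers (10 properties,
# chunks = 3), (len(props) // chunks) + 1 = 4, giving chunks of 4, 4, 2
# after the fillvalue padding is filtered out:
from itertools import zip_longest


def grouper(iterable, n, fillvalue=None):
    args = [iter(iterable)] * n
    return zip_longest(*args, fillvalue=fillvalue)


props = ["prop{}".format(i) for i in range(10)]
chunks = 3
for chunk in grouper(props, (len(props) // chunks) + 1):
    print([c for c in chunk if c is not None])
# ['prop0', 'prop1', 'prop2', 'prop3']
# ['prop4', 'prop5', 'prop6', 'prop7']
# ['prop8', 'prop9']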
def serial(builder: Builder):
    """
    Runs the builders using a single process
    """

    logger = logging.getLogger("SerialProcessor")

    builder.connect()

    cursor = builder.get_items()

    total = None
    if isinstance(cursor, GeneratorType):
        try:
            cursor = primed(cursor)
            if hasattr(builder, "total"):
                total = builder.total
        except StopIteration:
            pass
    elif hasattr(cursor, "__len__"):
        total = len(cursor)  # type: ignore
    elif hasattr(cursor, "count"):
        total = cursor.count()  # type: ignore

    logger.info(
        f"Starting serial processing: {builder.__class__.__name__}",
        extra={
            "maggma": {
                "event": "BUILD_STARTED",
                "total": total,
                "builder": builder.__class__.__name__,
                "sources": [source.name for source in builder.sources],
                "targets": [target.name for target in builder.targets],
            }
        },
    )

    for chunk in grouper(tqdm(cursor, total=total), builder.chunk_size):
        logger.info(
            "Processing batch of {} items".format(builder.chunk_size),
            extra={
                "maggma": {
                    "event": "UPDATE",
                    "items": len(chunk),
                    "builder": builder.__class__.__name__,
                }
            },
        )
        processed_chunk = [builder.process_item(item) for item in chunk]
        processed_items = [item for item in processed_chunk if item is not None]
        builder.update_targets(processed_items)

    logger.info(
        f"Ended serial processing: {builder.__class__.__name__}",
        extra={
            "maggma": {
                "event": "BUILD_ENDED",
                "builder": builder.__class__.__name__,
            }
        },
    )
    builder.finalize()
def get_items(self):
    """
    Retrieves AFLOW data using the AFLUX API according to the
    specifications in the query configurations.

    Yields:
        tuple: The first item is an `aflow.entries.Entry` containing the
            material data and the second item is a list of targets for the
            data ('data' and/or 'auid')
    """
    kws = self.keywords.copy()
    for kw in ('auid', 'aurl', 'compound', 'files'):
        try:
            kws.remove(kw)
        except KeyError:
            pass

    for config_ in self.query_configs:
        logger.debug(
            "Catalog {} selecting {}".format(
                config_['catalog'],
                'all' if not config_['select'] else config_['select'],
            )
        )
        if config_['select']:
            kws_to_chunk = config_['select']
        else:
            kws_to_chunk = self.keywords

        k = config_['k']
        filter_vals = config_['filter']

        chunk_idx = 0
        chunk_size = 5
        total_chunks = len(kws_to_chunk) // chunk_size + 1

        for chunk in grouper(kws_to_chunk, chunk_size):
            chunk_idx += 1
            logger.debug("Property chunk {} of {}".format(chunk_idx, total_chunks))
            props = [getattr(AFLOW_KWS, c) for c in chunk if c is not None]
            if len(props) == 0:
                continue

            data_query = self._get_query_obj(
                config_['catalog'], k, config_['exclude'], filter_vals
            )
            data_query.select(*props)

            success = False
            while not success:
                try:
                    for entry in data_query:
                        yield entry, config_['targets']
                    success = True
                except ValueError:
                    if data_query.N == 0:  # Empty query
                        raise ValueError(
                            "Query returned no results. Query config:\n{}".format(
                                config_
                            )
                        )
                    else:  # pragma: no cover
                        logger.warning(
                            'Server error. '
                            + 'Resting...starting {}'.format(datetime.datetime.now())
                        )
                        time.sleep(120)
def get_items(self) -> Iterable:
    """
    Gets items from source_stores for processing.
    Items are retrieved in chunks based on a subset of
    key values set by chunk_size but are unsorted.

    Returns:
        generator of items to process
    """
    self.logger.info("Starting {} get_items...".format(self.__class__.__name__))

    # get distinct key values
    if len(self.query_by_key) > 0:
        keys = self.query_by_key
    else:
        unique_keys = set()  # type: Set

        for store in self.sources:
            store_keys = store.distinct(field=store.key)
            unique_keys.update(store_keys)
            if None in store_keys:
                self.logger.debug(
                    "None found as a key value for store {} with key {}".format(
                        store.collection_name, store.key
                    )
                )

        keys = list(unique_keys)
        self.logger.info("{} distinct key values found".format(len(keys)))
        self.logger.debug("None found in key values? {}".format(None in keys))

    # for every key (in chunks), query from each store and
    # project fields specified by projection_mapping
    for chunked_keys in grouper(keys, self.chunk_size):
        chunked_keys = [k for k in chunked_keys if k is not None]
        self.logger.debug("Querying by chunked_keys: {}".format(chunked_keys))

        unsorted_items_to_process = []
        for store, projection in zip(self.sources, self.projection_mapping):
            # project all fields from store if corresponding element
            # in projection_mapping is an empty dict,
            # else only project the specified fields
            properties: Union[List, None]
            if projection == {}:  # all fields are projected
                properties = None
                self.logger.debug(
                    "For store {} getting all properties".format(
                        store.collection_name
                    )
                )
            else:  # only specified fields are projected
                properties = [v for v in projection.values()]
                self.logger.debug(
                    "For {} store getting properties: {}".format(
                        store.collection_name, properties
                    )
                )

            # get docs from store for given chunk of key values,
            # rename fields if specified by projection mapping,
            # and put in list of unsorted items to be processed
            docs = store.query(
                criteria={store.key: {"$in": chunked_keys}}, properties=properties
            )
            for d in docs:
                if properties is None:  # all fields are projected as is
                    item = deepcopy(d)
                else:  # specified fields are renamed
                    item = dict()
                    for k, v in projection.items():
                        item[k] = get(d, v)

                # remove unneeded fields and add key value to each item;
                # key value stored under target_key is used for sorting
                # items during the process_items step
                for k in ["_id", store.last_updated_field]:
                    if k in item.keys():
                        del item[k]
                item[self.target.key] = d[store.key]

                unsorted_items_to_process.append(item)

            self.logger.debug(
                "Example fields of one output item from {} store sent to process_items: {}".format(
                    store.collection_name, item.keys()
                )
            )

        yield unsorted_items_to_process
def get_items(self) -> Tuple[List[Dict], List[Dict]]:
    """
    Gets all materials to associate with SNLs

    Returns:
        generator of materials and SNLs that could match
    """
    self.logger.info("Provenance Builder Started")

    self.logger.info("Setting indexes")
    self.ensure_indicies()

    # Find all formulas for materials that have been updated since this
    # builder was last run
    q = {**self.query, "property_name": ProvenanceDoc.property_name}
    updated_materials = self.provenance.newer_in(
        self.materials,
        criteria=q,
        exhaustive=True,
    )
    forms_to_update = set(
        self.materials.distinct(
            "formula_pretty", {"material_id": {"$in": updated_materials}}
        )
    )

    # Find all new SNL formulas since the builder was last run
    for source in self.source_snls:
        new_snls = self.provenance.newer_in(source)
        forms_to_update |= set(source.distinct("formula_pretty", new_snls))

    # Now reduce to the set of formulas we actually have
    forms_avail = set(self.materials.distinct("formula_pretty", self.query))
    forms_to_update = forms_to_update & forms_avail

    self.logger.info(f"Found {len(forms_to_update)} new/updated systems to process")

    self.total = len(forms_to_update)

    for formulas in grouper(forms_to_update, self.chunk_size):
        snls = []
        for source in self.source_snls:
            snls.extend(
                source.query(criteria={"formula_pretty": {"$in": formulas}})
            )

        mats = list(
            self.materials.query(
                properties=[
                    "material_id",
                    "last_updated",
                    "structure",
                    "initial_structures",
                    "formula_pretty",
                ],
                criteria={"formula_pretty": {"$in": formulas}},
            )
        )

        form_groups = defaultdict(list)
        for snl in snls:
            form_groups[snl["formula_pretty"]].append(snl)

        mat_groups = defaultdict(list)
        for mat in mats:
            mat_groups[mat["formula_pretty"]].append(mat)

        for formula, snl_group in form_groups.items():
            mat_group = mat_groups[formula]
            self.logger.debug(
                f"Found {len(snl_group)} snls and {len(mat_group)} mats"
            )
            yield mat_group, snl_group