def run(self, *args, **kwargs):
    self.login()
    api_collection = Collection(self.client)
    local_collection = Collection()

    self.logger.info('Reading existing spec data from API')
    api_collection.load_all('maturities')
    api_collection.load_all('specifications')

    cache = "using cache" if self.use_cache else "no cache"
    self.logger.info('Reading spec data from MDN templates (%s)', cache)
    specname = self.specname_template()
    self.parse_specname(specname, api_collection, local_collection)
    spec2 = self.spec2_template()
    self.parse_spec2(spec2, local_collection)

    # Load API sections into local collection
    local_collection.override_ids_to_match(api_collection)
    local_specs = local_collection.get_resources('specifications')
    for local_spec in local_specs:
        api_spec = api_collection.get('specifications', local_spec.id.id)
        if api_spec:
            local_spec.sections = api_spec.sections.ids

    return self.sync_changes(
        api_collection, local_collection, self.skip_deletes)
def run(self, *args, **kwargs):
    self.login()
    api_collection = Collection(self.client)
    local_collection = Collection()
    resources = [
        'browsers', 'versions', 'features', 'supports',
        'specifications', 'maturities', 'sections', 'references']

    self.logger.info('Reading existing data from API')
    for resource in resources:
        count = api_collection.load_all(resource)
        self.logger.info('Downloaded %d %s.', count, resource)

    self.logger.info('Reading upload data from disk')
    for resource in resources:
        filepath = self.data_file('{}.json'.format(resource))
        if not exists(filepath):
            continue
        with codecs.open(filepath, 'r', 'utf8') as f:
            data = json.load(f, object_pairs_hook=OrderedDict)
        resource_class = local_collection.resource_by_type[resource]
        for item in data[resource]:
            obj = resource_class()
            obj.from_json_api({resource: item})
            local_collection.add(obj)

    return self.sync_changes(api_collection, local_collection)
def run(self, *args, **kwargs):
    self.login()
    api_collection = Collection(self.client)
    local_collection = Collection()
    compat_data = loads(
        self.cached_download(
            "data-human.json",
            "https://raw.githubusercontent.com/webplatform/compatibility-data"
            "/master/data-human.json"))

    self.logger.info('Reading existing feature data from API')
    for resource in ['browsers', 'versions', 'features', 'supports']:
        api_collection.load_all(resource)

    self.logger.info('Loading feature data from webplatform repository')
    self.parse_compat_data(compat_data, local_collection)

    if not self.all_data:
        self.logger.info('Selecting subset of data')
        self.select_subset(local_collection)

    return self.sync_changes(api_collection, local_collection)
def run(self, rate=5, *args, **kwargs):
    self.login()
    collection = Collection(self.client)
    collection.load_all('features')

    # Collect feature URIs
    uris = []
    for feature in collection.get_resources('features'):
        if feature.mdn_uri and feature.mdn_uri.get('en'):
            uris.append((feature.mdn_uri['en'], feature.id.id))
    total = len(uris)
    if self.reparse:
        action = 'Reparsing cached pages...'
    else:
        action = 'Importing latest pages...'
    self.logger.info('Features loaded, %d MDN pages found. %s', total, action)

    # Import from MDN, with rate limiting
    start_time = time.time()
    mdn_requests = 0
    counts = {'new': 0, 'reset': 0, 'reparsed': 0, 'unchanged': 0}
    log_at = 15
    logged_at = time.time()
    for uri, feature_id in uris:
        attempt = 1
        max_attempt = 3
        while attempt <= max_attempt:
            if attempt > 1:
                self.logger.info(
                    'Attempt %d of %d for %s', attempt, max_attempt, uri)
            try:
                import_type = self.import_page(uri, feature_id)
            except RequestException as exception:
                if attempt < max_attempt:
                    self.logger.error('%s', exception)
                    self.logger.info('Pausing 5 seconds...')
                    time.sleep(5)
                else:
                    self.logger.info('Giving up on %s.', uri)
                attempt += 1
            else:
                counts[import_type] += 1
                if import_type in ('new', 'reset'):
                    mdn_requests += 1
                break

        # Pause for rate limiting?
        current_time = time.time()
        if rate:
            elapsed = current_time - start_time
            target = start_time + float(mdn_requests) / rate
            current_rate = float(mdn_requests) / elapsed
            if current_time < target:
                rest = int(target - current_time) + 1
                self.logger.warning(
                    '%d pages fetched, %0.2f per second, target rate %d'
                    ' per second. Resting %d seconds.',
                    mdn_requests, current_rate, rate, rest)
                time.sleep(rest)
                logged_at = time.time()
                current_time = time.time()

        # Log progress?
        if (logged_at + log_at) < current_time:
            processed = sum(counts.values())
            percent = int(100 * (float(processed) / total))
            self.logger.info(
                ' Processed %d of %d MDN pages (%d%%)...',
                processed, total, percent)
            logged_at = current_time

    return counts
def run(self, *args, **kwargs):
    self.login()
    api_collection = Collection(self.client)
    self.logger.info('Reading existing features from API')
    api_collection.load_all('features')

    # Copy features to a working local collection
    local_collection = Collection()
    local_collection.load_collection(api_collection)
    features = self.known_features(local_collection)
    feature_by_uri = dict((k[0], k[2]) for k in features)
    feature_by_slug = dict((k[1], k[2]) for k in features)
    slugs = set(feature_by_slug.keys())

    cache_state = "using cache" if self.use_cache else "no cache"
    self.logger.info('Reading pages from MDN (%s)', cache_state)
    mdn_uris = self.current_mdn_uris()

    # Find new / updated pages
    new_page, needs_url, existing_page = 0, 0, 0
    seen_uris = set()
    for path, parent_path, raw_slug, title in mdn_uris:
        uri = self.base_mdn_domain + path
        seen_uris.add(uri)
        feature = feature_by_uri.get(uri)
        if feature:
            existing_page += 1
            feature.name = self.to_trans(title)
        else:
            slug = self.unique_slugify(raw_slug, slugs)
            feature = feature_by_slug.get(slug)
            if feature:
                # Need to add URI to feature
                feature.mdn_uri = {'en': uri}
                feature_by_uri[uri] = feature
                needs_url += 1
            else:
                # New page; link it to its parent page's feature, if any
                if parent_path:
                    parent_uri = self.base_mdn_domain + parent_path
                    parent_feature = feature_by_uri.get(parent_uri)
                    assert parent_feature,\
                        'No feature for parent page {}'.format(parent_uri)
                    parent_id = parent_feature.id.id
                else:
                    parent_id = None
                feature = Feature(
                    id='_' + slug, slug=slug, mdn_uri={'en': uri},
                    name=self.to_trans(title), parent=parent_id)
                local_collection.add(feature)
                feature_by_uri[uri] = feature
                feature_by_slug[slug] = feature
                new_page += 1
    self.logger.info(
        'MDN URIs gathered, %d found (%d new, %d needs url, %d existing).',
        len(mdn_uris), new_page, needs_url, existing_page)

    # Find deleted pages
    for uri, feature in feature_by_uri.items():
        if uri and uri not in seen_uris:
            local_collection.remove(feature)

    return self.sync_changes(
        api_collection, local_collection, self.skip_deletes)