def run(self, *args, **kwargs):
        self.login()
        api_collection = Collection(self.client)
        local_collection = Collection()
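        # api_collection mirrors the current server state; local_collection is built from MDN data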

        self.logger.info('Reading existing spec data from API')
        api_collection.load_all('maturities')
        api_collection.load_all('specifications')

        cache = "using cache" if self.use_cache else "no cache"
        self.logger.info('Reading spec data from MDN templates (%s)', cache)
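        # The SpecName template lists the specifications; Spec2 maps them to maturity levels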
        specname = self.specname_template()
        self.parse_specname(specname, api_collection, local_collection)
        spec2 = self.spec2_template()
        self.parse_spec2(spec2, local_collection)

        # Copy section IDs from API specifications onto the matching local specs
        local_collection.override_ids_to_match(api_collection)
        local_specs = local_collection.get_resources('specifications')
        for local_spec in local_specs:
            api_spec = api_collection.get('specifications', local_spec.id.id)
            if api_spec:
                local_spec.sections = api_spec.sections.ids
        return self.sync_changes(
            api_collection, local_collection, self.skip_deletes)
Example #2
    def run(self, *args, **kwargs):
        self.login()
        api_collection = Collection(self.client)
        local_collection = Collection()
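        # Resource types to download from the API and to load from the on-disk upload files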
        resources = [
            'browsers', 'versions', 'features', 'supports', 'specifications',
            'maturities', 'sections', 'references'
        ]

        self.logger.info('Reading existing data from API')
        for resource in resources:
            count = api_collection.load_all(resource)
            self.logger.info('Downloaded %d %s.', count, resource)

        self.logger.info('Reading upload data from disk')
        for resource in resources:
            filepath = self.data_file('{}.json'.format(resource))
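            # Skip resource types that have no upload file on disk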
            if not exists(filepath):
                continue
            with codecs.open(filepath, 'r', 'utf8') as f:
                data = json.load(f, object_pairs_hook=OrderedDict)
            resource_class = local_collection.resource_by_type[resource]
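            # Instantiate each JSON item as a resource and add it to the local collection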
            for item in data[resource]:
                obj = resource_class()
                obj.from_json_api({resource: item})
                local_collection.add(obj)

        return self.sync_changes(api_collection, local_collection)
Example #3
    def run(self, *args, **kwargs):
        self.login()

        api_collection = Collection(self.client)
        local_collection = Collection()
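        # Fetch the upstream webplatform.org compatibility data (cached_download keeps a local copy)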
        compat_data = loads(
            self.cached_download(
                "data-human.json",
                "https://raw.githubusercontent.com/webplatform/compatibility-data"
                "/master/data-human.json"))

        self.logger.info('Reading existing feature data from API')
        for resource in ['browsers', 'versions', 'features', 'supports']:
            api_collection.load_all(resource)
        self.logger.info('Loading feature data from webplatform repository')
        self.parse_compat_data(compat_data, local_collection)

        if not self.all_data:
            self.logger.info('Selecting subset of data')
            self.select_subset(local_collection)

        return self.sync_changes(api_collection, local_collection)
Example #4
    def run(self, rate=5, *args, **kwargs):
        self.login()
        collection = Collection(self.client)
        collection.load_all('features')

        # Collect feature URIs
        uris = []
        for feature in collection.get_resources('features'):
            if feature.mdn_uri and feature.mdn_uri.get('en'):
                uris.append((feature.mdn_uri['en'], feature.id.id))
        total = len(uris)
        if self.reparse:
            action = 'Reparsing cached pages...'
        else:
            action = 'Importing latest pages...'
        self.logger.info('Features loaded, %d MDN pages found. %s', total,
                         action)

        # Import from MDN, with rate limiting
        start_time = time.time()
        mdn_requests = 0
        counts = {'new': 0, 'reset': 0, 'reparsed': 0, 'unchanged': 0}
        log_at = 15
        logged_at = time.time()
        for uri, feature_id in uris:
            attempt = 1
            max_attempt = 3
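            # Retry the import up to max_attempt times on request errors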
            while attempt <= max_attempt:
                if attempt > 1:
                    self.logger.info('Attempt %d of %d for %s', attempt,
                                     max_attempt, uri)
                try:
                    import_type = self.import_page(uri, feature_id)
                except RequestException as exception:
                    if attempt < max_attempt:
                        self.logger.error('%s', exception)
                        self.logger.info('Pausing 5 seconds...')
                        time.sleep(5)
                    else:
                        self.logger.info('Giving up on %s.', uri)
                    attempt += 1
                else:
                    counts[import_type] += 1
                    if import_type in ('new', 'reset'):
                        mdn_requests += 1
                    break

            # Pause for rate limiting?
            current_time = time.time()
            if rate:
                elapsed = current_time - start_time
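                # target is the earliest time at which mdn_requests fetches are allowed at the requested rate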
                target = start_time + float(mdn_requests) / rate
                current_rate = float(mdn_requests) / elapsed
                if current_time < target:
                    rest = int(target - current_time) + 1
                    self.logger.warning(
                        '%d pages fetched, %0.2f per second, target rate %d'
                        ' per second.  Resting %d seconds.', mdn_requests,
                        current_rate, rate, rest)
                    time.sleep(rest)
                    logged_at = time.time()
                    current_time = time.time()

            # Log progress?
            if (logged_at + log_at) < current_time:
                processed = sum(counts.values())
                percent = int(100 * (float(processed) / total))
                self.logger.info('  Processed %d of %d MDN pages (%d%%)...',
                                 processed, total, percent)
                logged_at = current_time

        return counts
Example #5
    def run(self, *args, **kwargs):
        self.login()
        api_collection = Collection(self.client)
        self.logger.info('Reading existing features from API')
        api_collection.load_all('features')

        # Copy feature to working local collection
        local_collection = Collection()
        local_collection.load_collection(api_collection)
        features = self.known_features(local_collection)
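        # Index known features by MDN URI and by slug for quick lookup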
        feature_by_uri = dict((k[0], k[2]) for k in features)
        feature_by_slug = dict((k[1], k[2]) for k in features)
        slugs = set(feature_by_slug.keys())

        cache_state = "using cache" if self.use_cache else "no cache"
        self.logger.info('Reading pages from MDN (%s)', cache_state)
        mdn_uris = self.current_mdn_uris()

        # Find new / updated pages
        new_page, needs_url, existing_page = 0, 0, 0
        seen_uris = set()
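        # Track URIs seen in this crawl so features for deleted pages can be removed below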
        for path, parent_path, raw_slug, title in mdn_uris:
            uri = self.base_mdn_domain + path
            seen_uris.add(uri)
            feature = feature_by_uri.get(uri)
            if feature:
                existing_page += 1
                feature.name = self.to_trans(title)
            else:
                slug = self.unique_slugify(raw_slug, slugs)
                feature = feature_by_slug.get(slug)
                if feature:
                    # Need to add URI to feature
                    feature.mdn_uri = {'en': uri}
                    feature_by_uri[uri] = feature
                    needs_url += 1
                else:
                    if parent_path:
                        parent_uri = self.base_mdn_domain + parent_path
                        parent_feature = feature_by_uri.get(parent_uri)
                        assert parent_feature,\
                            'No feature for parent page {}'.format(parent_uri)
                        parent_id = parent_feature.id.id
                    else:
                        parent_id = None
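                    # Create a new feature for a page not yet in the API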
                    feature = Feature(id='_' + slug,
                                      slug=slug,
                                      mdn_uri={'en': uri},
                                      name=self.to_trans(title),
                                      parent=parent_id)
                    local_collection.add(feature)
                    feature_by_uri[uri] = feature
                    feature_by_slug[slug] = feature
                    new_page += 1
        self.logger.info(
            'MDN URIs gathered, %d found (%d new, %d needs url, %d existing).',
            len(mdn_uris), new_page, needs_url, existing_page)

        # Find deleted pages
        for uri, feature in feature_by_uri.items():
            if uri and uri not in seen_uris:
                local_collection.remove(feature)

        return self.sync_changes(api_collection, local_collection,
                                 self.skip_deletes)