Example #1
    def _iterator(self, start_date):
        self.stdout.write("Working on %s to now" % start_date)
        # Fetch all the bugs that have been created or had the crash_signature
        # field changed since start_date
        payload = BUG_QUERY_PARAMS.copy()
        payload["chfieldfrom"] = start_date

        # Use a 30-second timeout because Bugzilla is slow sometimes
        session = session_with_retries(default_timeout=30.0)
        headers = {}
        if settings.BZAPI_TOKEN:
            headers["X-BUGZILLA-API-KEY"] = settings.BZAPI_TOKEN
            self.stdout.write(
                "using BZAPI_TOKEN (%s)" % (settings.BZAPI_TOKEN[:-8] + "xxxxxxxx")
            )
        else:
            self.stdout.write("Warning: No BZAPI_TOKEN specified!")
        r = session.get(
            settings.BZAPI_BASE_URL + "/bug", headers=headers, params=payload
        )
        if r.status_code < 200 or r.status_code >= 300:
            r.raise_for_status()
        results = r.json()

        # Yield each one as a (bug_id, set of signatures)
        for report in results["bugs"]:
            yield (
                int(report["id"]),
                find_signatures(report.get("cf_crash_signature", "")),
            )
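
All of the examples in this section go through the same session_with_retries helper, whose implementation is not included here. The following is a minimal sketch of what such a helper could look like, inferred from the parameters used in these snippets (default_timeout, total_retries, status_forcelist, and an optional API URL passed as the first argument). The adapter name and the default values below are assumptions, not the project's actual code.

# Sketch of a possible session_with_retries helper. This is an assumption
# based on how the examples in this section call it; it is not the project's
# actual implementation.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry


class TimeoutHTTPAdapter(HTTPAdapter):
    """HTTPAdapter that applies a default timeout when the caller passes none."""

    def __init__(self, *args, default_timeout=None, **kwargs):
        self.default_timeout = default_timeout
        super().__init__(*args, **kwargs)

    def send(self, request, **kwargs):
        if kwargs.get("timeout") is None:
            kwargs["timeout"] = self.default_timeout
        return super().send(request, **kwargs)


def session_with_retries(
    base_url=None,
    total_retries=3,
    backoff_factor=0.2,
    status_forcelist=(429, 500, 502, 503, 504),
    default_timeout=None,
):
    """Return a requests Session that retries failed requests.

    ``base_url`` is accepted for parity with the calls above that pass an API
    URL as the first argument; this sketch ignores it.
    """
    retries = Retry(
        total=total_retries,
        backoff_factor=backoff_factor,
        status_forcelist=status_forcelist,
    )
    adapter = TimeoutHTTPAdapter(max_retries=retries, default_timeout=default_timeout)
    session = requests.Session()
    # Mount on both schemes so every request goes through the retrying adapter
    session.mount("https://", adapter)
    session.mount("http://", adapter)
    return session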
Example #2
    def __init__(self, config):
        super().__init__(config)
        self.cache = ExpiringCache(max_size=self.CACHE_MAX_SIZE, default_ttl=self.SHORT_CACHE_TTL)
        self.metrics = markus.get_metrics('processor.betaversionrule')

        # For looking up version strings
        self.version_string_api = config.version_string_api
        self.session = session_with_retries()
Example #4
    def __init__(self, *args, **kwargs):
        super().__init__(*args, **kwargs)
        self.database = self.config.database_class(self.config)

        # NOTE(willkg): If archive.mozilla.org is timing out after 5 seconds,
        # then it has issues and we should try again some other time
        self.session = session_with_retries(default_timeout=5.0)
        self.successful_inserts = 0
Example #5
def main(argv=None):
    parser = argparse.ArgumentParser(formatter_class=WrappedTextHelpFormatter,
                                     description=DESCRIPTION.strip())
    parser.add_argument(
        "--sleep",
        help="how long in seconds to sleep before submitting the next group",
        type=int,
        default=SLEEP_DEFAULT,
    )
    parser.add_argument("--host",
                        help="host for system to reprocess in",
                        default=DEFAULT_HOST)
    parser.add_argument(
        "crashid",
        help="one or more crash ids to fetch data for",
        nargs="*",
        action=FallbackToPipeAction,
    )

    if argv is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    api_token = os.environ.get("SOCORRO_REPROCESS_API_TOKEN")
    if not api_token:
        print("You need to set SOCORRO_REPROCESS_API_TOKEN in the environment")
        return 1

    url = args.host.rstrip("/") + "/api/Reprocessing/"
    print("Sending reprocessing requests to: %s" % url)
    session = session_with_retries()

    crash_ids = args.crashid
    print("Reprocessing %s crashes sleeping %s seconds between groups..." %
          (len(crash_ids), args.sleep))

    groups = list(chunked(crash_ids, CHUNK_SIZE))
    for i, group in enumerate(groups):
        print("Processing group ending with %s ... (%s/%s)" %
              (group[-1], i + 1, len(groups)))
        resp = session.post(url,
                            data={"crash_ids": group},
                            headers={"Auth-Token": api_token})
        if resp.status_code != 200:
            print("Got back non-200 status code: %s %s" %
                  (resp.status_code, resp.content))
            continue

        # NOTE(willkg): We sleep here because the webapp has a bunch of rate limiting and we don't
        # want to trigger that. It'd be nice if we didn't have to do this.
        time.sleep(args.sleep)

    print("Done!")
Example #6
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=WrappedTextHelpFormatter,
        description=DESCRIPTION.strip(),
    )
    parser.add_argument(
        '--sleep',
        help='how long in seconds to sleep before submitting the next group',
        type=int,
        default=SLEEP_DEFAULT
    )
    parser.add_argument('--host', help='host for system to reprocess in', default=DEFAULT_HOST)
    parser.add_argument('crashid', help='one or more crash ids to fetch data for',
                        nargs='*', action=FallbackToPipeAction)

    if argv is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    api_token = os.environ.get('SOCORRO_REPROCESS_API_TOKEN')
    if not api_token:
        print('You need to set SOCORRO_REPROCESS_API_TOKEN in the environment')
        return 1

    url = args.host.rstrip('/') + '/api/Reprocessing/'
    print('Sending reprocessing requests to: %s' % url)
    session = session_with_retries()

    crash_ids = args.crashid
    print('Reprocessing %s crashes sleeping %s seconds between groups...' % (
        len(crash_ids), args.sleep
    ))

    groups = list(chunked(crash_ids, CHUNK_SIZE))
    for i, group in enumerate(groups):
        print('Processing group ending with %s ... (%s/%s)' % (group[-1], i + 1, len(groups)))
        resp = session.post(
            url,
            data={'crash_ids': group},
            headers={
                'Auth-Token': api_token
            }
        )
        if resp.status_code != 200:
            print('Got back non-200 status code: %s %s' % (resp.status_code, resp.content))
            continue

        # NOTE(willkg): We sleep here because the webapp has a bunch of rate limiting and we don't
        # want to trigger that. It'd be nice if we didn't have to do this.
        time.sleep(args.sleep)

    print('Done!')
Example #7
    def _iterator(self, from_date):
        payload = BUGZILLA_PARAMS.copy()
        payload['chfieldfrom'] = from_date
        session = session_with_retries()
        r = session.get(BUGZILLA_BASE_URL, params=payload)
        if r.status_code < 200 or r.status_code >= 300:
            r.raise_for_status()
        results = r.json()

        for report in results['bugs']:
            yield (int(report['id']),
                   find_signatures(report.get('cf_crash_signature', '')))
Example #8
def fetch_crashids(host, params, num_results):
    """Generator that returns crash ids

    :arg str host: the host to query
    :arg dict params: dict of super search parameters to base the query on
    :arg num_results: number of results to get or INFINITY

    :returns: generator of crash ids

    """
    url = host + '/api/SuperSearch/'

    session = session_with_retries()

    # Set up first page
    params['_results_offset'] = 0
    params['_results_number'] = min(MAX_PAGE, num_results)

    # Fetch pages of crash ids until we've gotten as many as we want or there aren't any more to get
    crashids_count = 0
    while True:
        resp = session.get(url, params=params)
        if resp.status_code != 200:
            raise Exception('Bad response: %s %s' % (resp.status_code, resp.content))

        hits = resp.json()['hits']

        for hit in hits:
            crashids_count += 1
            yield hit['uuid']

            # If we've gotten as many crashids as we need, we return
            if crashids_count >= num_results:
                return

        # If there are no more crash ids to get, we return
        total = resp.json()['total']
        if not hits or crashids_count >= total:
            return

        # Get the next page, but only as many results as we need
        params['_results_offset'] += MAX_PAGE
        params['_results_number'] = min(
            # MAX_PAGE is the maximum we can request
            MAX_PAGE,

            # The number of results Super Search can return to us that it hasn't returned so far
            total - crashids_count,

            # The number of results we want that we haven't gotten yet
            num_results - crashids_count
        )
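
As a usage sketch for the generator above: the host URL and search parameters below are placeholders for illustration, not values taken from the source.

# Hypothetical usage of fetch_crashids(); the host and search parameters
# below are placeholders, not values from the source.
host = 'https://crash-stats.example.com'
search_params = {'product': 'Firefox'}

for crash_id in fetch_crashids(host, search_params, num_results=100):
    print(crash_id)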
Example #9
    def _iterator(self, from_date):
        # Fetch all the bugs that have been created or had the crash_signature
        # field changed since from_date
        payload = BUGZILLA_PARAMS.copy()
        payload['chfieldfrom'] = from_date
        session = session_with_retries()
        r = session.get(BUGZILLA_BASE_URL, params=payload)
        if r.status_code < 200 or r.status_code >= 300:
            r.raise_for_status()
        results = r.json()

        # Yield each one as a (bug_id, set of signatures)
        for report in results['bugs']:
            yield (int(report['id']),
                   find_signatures(report.get('cf_crash_signature', '')))
    def get(self, bugs):
        if isinstance(bugs, six.string_types):
            bugs = [bugs]
        fields = ('summary', 'status', 'id', 'resolution')
        results = []
        missing = []
        for bug in bugs:
            cache_key = self.make_cache_key(bug)
            cached = cache.get(cache_key)
            if cached is None:
                missing.append(bug)
            else:
                results.append(cached)
        if missing:
            params = {
                'bugs': ','.join(missing),
                'fields': ','.join(fields),
            }
            headers = {
                'Accept': 'application/json',
                'Content-Type': 'application/json'
            }
            url = settings.BZAPI_BASE_URL + (
                '/bug?id=%(bugs)s&include_fields=%(fields)s' % params
            )
            session = session_with_retries(
                # BZAPI isn't super reliable, so be extra patient
                total_retries=5,
                # 502 = Bad Gateway
                # 504 = Gateway Time-out
                status_forcelist=(500, 502, 504)
            )
            response = session.get(
                url,
                headers=headers,
                timeout=self.BUGZILLA_REST_TIMEOUT,
            )
            if response.status_code != 200:
                raise BugzillaRestHTTPUnexpectedError(response.status_code)

            for each in response.json()['bugs']:
                cache_key = self.make_cache_key(each['id'])
                cache.set(cache_key, each, self.BUG_CACHE_SECONDS)
                results.append(each)
        return {'bugs': results}
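
The make_cache_key helper is referenced but not shown in these snippets. A hypothetical version only needs to produce a stable, namespaced key per bug id; the prefix below is an assumption, not the project's actual helper.

    def make_cache_key(self, bug_id):
        # Hypothetical helper, not the project's actual code: produce a
        # stable, namespaced cache key so bug entries don't collide with
        # other cached values. The 'bugzilla-bug' prefix is an assumption.
        return 'bugzilla-bug-%s' % bug_id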
Example #11
    def get(self, bugs):
        if isinstance(bugs, str):
            bugs = [bugs]
        fields = ('summary', 'status', 'id', 'resolution')
        results = []
        missing = []
        for bug in bugs:
            cache_key = self.make_cache_key(bug)
            cached = cache.get(cache_key)
            if cached is None:
                missing.append(bug)
            else:
                results.append(cached)
        if missing:
            params = {
                'bugs': ','.join(missing),
                'fields': ','.join(fields),
            }
            headers = {
                'Accept': 'application/json',
                'Content-Type': 'application/json'
            }
            url = settings.BZAPI_BASE_URL + (
                '/bug?id=%(bugs)s&include_fields=%(fields)s' % params
            )
            session = session_with_retries(
                # BZAPI isn't super reliable, so be extra patient
                total_retries=5,
                # 502 = Bad Gateway
                # 504 = Gateway Time-out
                status_forcelist=(500, 502, 504)
            )
            response = session.get(
                url,
                headers=headers,
                timeout=self.BUGZILLA_REST_TIMEOUT,
            )
            if response.status_code != 200:
                raise BugzillaRestHTTPUnexpectedError(response.status_code)

            for each in response.json()['bugs']:
                cache_key = self.make_cache_key(each['id'])
                cache.set(cache_key, each, self.BUG_CACHE_SECONDS)
                results.append(each)
        return {'bugs': results}
Example #12
    def _iterator(self, start_date):
        self.stdout.write('Working on %s to now' % start_date)
        # Fetch all the bugs that have been created or had the crash_signature
        # field changed since start_date
        payload = BUGZILLA_PARAMS.copy()
        payload['chfieldfrom'] = start_date

        # Use a 30-second timeout because Bugzilla is slow sometimes
        session = session_with_retries(default_timeout=30.0)
        r = session.get(BUGZILLA_BASE_URL, params=payload)
        if r.status_code < 200 or r.status_code >= 300:
            r.raise_for_status()
        results = r.json()

        # Yield each one as a (bug_id, set of signatures)
        for report in results['bugs']:
            yield (int(report['id']),
                   find_signatures(report.get('cf_crash_signature', '')))
Example #13
    def _iterator(self, from_date):
        # Fetch all the bugs that have been created or had the crash_signature
        # field changed since from_date
        payload = BUGZILLA_PARAMS.copy()
        payload['chfieldfrom'] = from_date
        # Use a 30-second timeout because Bugzilla is slow sometimes
        session = session_with_retries(default_timeout=30.0)
        r = session.get(BUGZILLA_BASE_URL, params=payload)
        if r.status_code < 200 or r.status_code >= 300:
            r.raise_for_status()
        results = r.json()

        # Yield each one as a (bug_id, set of signatures)
        for report in results['bugs']:
            yield (
                int(report['id']),
                find_signatures(report.get('cf_crash_signature', ''))
            )
Example #14
    def handle(self, **options):
        debug_mode = options.get("debug")

        # Request file
        session = session_with_retries()

        resp = session.get(PCI_IDS_URL)

        # Let's raise an error if there's an error and let it alert us in Sentry for now
        resp.raise_for_status()

        # If we got the file successfully, then process it
        self.stdout.write(f"Fetch successful, {len(resp.text)} bytes...")
        devices = utils.pci_ids__parse_graphics_devices_iterable(
            resp.text.splitlines(), debug=debug_mode)

        total_created = 0
        total_updated = 0
        total_skipped = 0

        for item in devices:
            obj, created = GraphicsDevice.objects.get_or_create(
                vendor_hex=item["vendor_hex"], adapter_hex=item["adapter_hex"])
            if (obj.vendor_name == item["vendor_name"]
                    and obj.adapter_name == item["adapter_name"]):
                total_skipped += 1
                continue

            obj.vendor_name = item["vendor_name"]
            obj.adapter_name = item["adapter_name"]
            obj.save()

            if created:
                total_created += 1
            else:
                total_updated += 1

        self.stdout.write(f"Done. "
                          f"Created: {total_created}; "
                          f"Updated: {total_updated}; "
                          f"Skipped: {total_skipped}")
Example #15
    def get(self, bugs):
        if isinstance(bugs, str):
            bugs = [bugs]
        fields = ("summary", "status", "id", "resolution")
        results = []
        missing = []
        for bug in bugs:
            cache_key = self.make_cache_key(bug)
            cached = cache.get(cache_key)
            if cached is None:
                missing.append(bug)
            else:
                results.append(cached)
        if missing:
            params = {"bugs": ",".join(missing), "fields": ",".join(fields)}
            headers = {
                "Accept": "application/json",
                "Content-Type": "application/json"
            }
            if settings.BZAPI_TOKEN:
                headers["X-BUGZILLA-API-KEY"] = settings.BZAPI_TOKEN
            url = settings.BZAPI_BASE_URL + (
                "/bug?id=%(bugs)s&include_fields=%(fields)s" % params)
            session = session_with_retries(
                # BZAPI isn't super reliable, so be extra patient
                total_retries=5,
                # 502 = Bad Gateway
                # 504 = Gateway Time-out
                status_forcelist=(500, 502, 504),
            )
            response = session.get(url,
                                   headers=headers,
                                   timeout=self.BUGZILLA_REST_TIMEOUT)
            if response.status_code != 200:
                raise BugzillaRestHTTPUnexpectedError(response.status_code)

            for each in response.json()["bugs"]:
                cache_key = self.make_cache_key(each["id"])
                cache.set(cache_key, each, self.BUG_CACHE_SECONDS)
                results.append(each)
        return {"bugs": results}
Example #16
    def run(self, connection):
        # The @with_single_postgres_transaction decorator makes
        # sure this cursor is committed or rolled back and cleaned up.
        cursor = connection.cursor()

        session = session_with_retries()

        for product in self.config.products:
            url = self.config.api_endpoint_url.format(product=product)
            response = session.get(url)
            if response.status_code != 200:
                raise DownloadError('{} ({})'.format(
                    url,
                    response.status_code,
                ))
            versions = response.json()
            self._set_featured_versions(
                cursor,
                product,
                versions,
            )
Example #17
def fetch_crash(fetchdumps, outputdir, api_token, crash_id):
    """Fetch crash data and save to correct place on the file system

    http://antenna.readthedocs.io/en/latest/architecture.html#aws-s3-file-hierarchy

    """
    if api_token:
        headers = {
            'Auth-Token': api_token
        }
    else:
        headers = {}

    # Fetch raw crash metadata
    session = session_with_retries()
    resp = session.get(
        HOST + '/api/RawCrash/',
        params={
            'crash_id': crash_id,
            'format': 'meta',
        },
        headers=headers,
    )

    # Handle 404 and 403 so we can provide the user more context
    if resp.status_code == 404:
        raise CrashDoesNotExist(crash_id)
    if api_token and resp.status_code == 403:
        raise BadAPIToken(resp.json().get('error', 'No error provided'))

    # Raise an error for any other non-200 response
    resp.raise_for_status()

    # Save raw crash to file system
    raw_crash = resp.json()
    fn = os.path.join(outputdir, 'v2', 'raw_crash', crash_id[0:3], '20' + crash_id[-6:], crash_id)
    create_dir_if_needed(os.path.dirname(fn))
    with open(fn, 'w') as fp:
        json.dump(raw_crash, fp, cls=JsonDTEncoder, indent=2, sort_keys=True)

    if fetchdumps:
        # Fetch dumps
        dumps = {}
        dump_names = raw_crash.get('dump_checksums', {}).keys()
        for dump_name in dump_names:
            print('Fetching %s -> %s' % (crash_id, dump_name))

            # We store "upload_file_minidump" as "dump", so we need to use that
            # name when requesting from the RawCrash api
            file_name = dump_name
            if file_name == 'upload_file_minidump':
                file_name = 'dump'

            resp = session.get(
                HOST + '/api/RawCrash/',
                params={
                    'crash_id': crash_id,
                    'format': 'raw',
                    'name': file_name
                },
                headers=headers,
            )

            if resp.status_code != 200:
                raise Exception('Something unexpected happened. status_code %s, content %s' % (
                    resp.status_code, resp.content)
                )

            dumps[dump_name] = resp.content

        # Save dump_names to file system
        fn = os.path.join(outputdir, 'v1', 'dump_names', crash_id)
        create_dir_if_needed(os.path.dirname(fn))
        with open(fn, 'w') as fp:
            json.dump(list(dumps.keys()), fp)

        # Save dumps to file system
        for dump_name, data in dumps.items():
            if dump_name == 'upload_file_minidump':
                dump_name = 'dump'

            fn = os.path.join(outputdir, 'v1', dump_name, crash_id)
            create_dir_if_needed(os.path.dirname(fn))
            with open(fn, 'wb') as fp:
                fp.write(data)
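
A hypothetical invocation of the function above might look like the following; the crash id, output directory, and environment variable name are placeholders.

# Hypothetical invocation of fetch_crash(); the crash id, output directory,
# and environment variable name are placeholders, not values from the source.
import os

fetch_crash(
    fetchdumps=True,
    outputdir='./crashdata',
    api_token=os.environ.get('SOCORRO_API_TOKEN'),
    crash_id='00000000-0000-0000-0000-000000000000',
)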
Example #18
    def get_data(product):
        # Get 10 Firefox versions from product_versions to compare
        with connection.cursor() as cursor:
            cursor.execute(
                """
            SELECT
                distinct pv.product_name, pv.build_type, pvb.build_id, pv.release_version,
                pv.version_string
            FROM product_versions AS pv, product_version_builds AS pvb
            WHERE
                pv.product_version_id = pvb.product_version_id
                AND pv.build_type = 'beta'
                AND pv.product_name = %s
            ORDER BY
                pvb.build_id DESC, pv.product_name, pv.build_type, pv.release_version,
                pv.version_string
            LIMIT 15
            """, (product, ))
            pv_data = cursor.fetchall()

        pv_data = [{
            'product': pv[0],
            'channel': pv[1],
            'build_id': pv[2],
            'release_version': pv[3],
            'version_string': pv[4],
        } for pv in pv_data]

        session = session_with_retries(buildhub_api)

        for pv in pv_data:
            # Lookup (product, buildid, channel) and exclude versions with "rc" in
            # them because we're looking at the beta channel
            es_query = {
                'query': {
                    'bool': {
                        'must': {
                            'match_all': {}
                        },
                        'filter': [
                            {
                                'term': {
                                    'source.product': product.lower()
                                }
                            },
                            {
                                'term': {
                                    'build.id': str(pv['build_id'])
                                }
                            },
                            {
                                'term': {
                                    'target.channel': pv['channel']
                                }
                            },
                        ],
                        'must_not': {
                            'wildcard': {
                                'target.version': '*rc*'
                            }
                        }
                    }
                },
                'size': 1
            }

            resp = session.post(buildhub_api, data=json.dumps(es_query))

            if resp.status_code != 200:
                pv['buildhub_resp'] = 'HTTP %s' % resp.status_code
                continue

            data = resp.json()
            hits = data.get('hits', {}).get('hits', [])

            if not hits:
                pv['buildhub_resp'] = 'no hits--might be release'
                continue

            pv['buildhub_resp'] = hits[0]['_source']['target']['version']
        return pv_data
    def _get_version_data(self, product, build_id, channel):
        """Return the real version number of a specified product, build, channel

        For example, beta builds of Firefox declare their version number as the
        major version (i.e. version 54.0b3 would say its version is 54.0). This
        database call returns the actual version number of said build (i.e.
        54.0b3 for the previous example).

        :arg product: the product
        :arg build_id: the build_id as a string
        :arg channel: the release channel

        :returns: ``None`` or the version string that should be used

        :raises requests.RequestException: raised if it has connection issues with
            the host specified in ``buildhub_api``

        """
        # NOTE(willkg): AURORA LIVES!
        #
        # But seriously, if this is for Firefox/aurora and the build id is after
        # 20170601, then we ask Buildhub about devedition/aurora instead because
        # devedition is the aurora channel
        if (product, channel) == ('firefox',
                                  'aurora') and build_id > '20170601':
            product = 'devedition'

        key = '%s:%s:%s' % (product, build_id, channel)
        if key in self.cache:
            return self.cache[key]

        session = session_with_retries(self.buildhub_api)

        query = {
            'source.product': product,
            'build.id': '"%s"' % build_id,
            'target.channel': channel,
            '_limit': 1
        }
        resp = session.get(self.buildhub_api, params=query)

        if resp.status_code == 200:
            hits = resp.json()['data']

            # Add a random shimmy to the ttl to distribute cache misses over time
            # and keep HTTP requests from bunching up.
            shimmy = random.randint(1, 120)

            if hits:
                # If we got an answer we should keep it around for a while because it's
                # a real answer and it's not going to change so use the long ttl plus
                # a fudge factor.
                real_version = hits[0]['target']['version']
                self.cache.set(key,
                               value=real_version,
                               ttl=self.LONG_CACHE_TTL + shimmy)
                return real_version

            # We didn't get an answer which could mean that this is a weird
            # build and there is no answer or it could mean that Buildhub
            # doesn't know, yet. Maybe in the future we get a better answer
            # so we use the short ttl plus a fudge factor.
            self.cache.set(key, value=None, ttl=self.SHORT_CACHE_TTL + shimmy)

        return None
Example #20
def fetch_crash(host, fetchraw, fetchdumps, fetchprocessed, outputdir,
                api_token, crash_id):
    """Fetch crash data and save to correct place on the file system

    http://antenna.readthedocs.io/en/latest/architecture.html#aws-s3-file-hierarchy

    """
    if api_token:
        headers = {"Auth-Token": api_token}
    else:
        headers = {}

    session = session_with_retries()

    if fetchraw:
        # Fetch raw crash metadata
        print("Fetching raw %s" % crash_id)
        resp = session.get(
            host + "/api/RawCrash/",
            params={
                "crash_id": crash_id,
                "format": "meta"
            },
            headers=headers,
        )

        # Handle 404 and 403 so we can provide the user more context
        if resp.status_code == 404:
            raise CrashDoesNotExist(crash_id)
        if api_token and resp.status_code == 403:
            raise BadAPIToken(resp.json().get("error", "No error provided"))

        # Raise an error for any other non-200 response
        resp.raise_for_status()

        # Save raw crash to file system
        raw_crash = resp.json()
        fn = os.path.join(outputdir, "v2", "raw_crash", crash_id[0:3],
                          "20" + crash_id[-6:], crash_id)
        create_dir_if_needed(os.path.dirname(fn))
        with open(fn, "w") as fp:
            json.dump(raw_crash,
                      fp,
                      cls=JsonDTEncoder,
                      indent=2,
                      sort_keys=True)

    if fetchdumps:
        # Fetch dumps
        dumps = {}
        dump_names = raw_crash.get("dump_checksums", {}).keys()
        for dump_name in dump_names:
            print("Fetching dump %s/%s" % (crash_id, dump_name))

            # We store "upload_file_minidump" as "dump", so we need to use that
            # name when requesting from the RawCrash api
            file_name = dump_name
            if file_name == "upload_file_minidump":
                file_name = "dump"

            resp = session.get(
                host + "/api/RawCrash/",
                params={
                    "crash_id": crash_id,
                    "format": "raw",
                    "name": file_name
                },
                headers=headers,
            )

            if resp.status_code != 200:
                raise Exception(
                    "Something unexpected happened. status_code %s, content %s"
                    % (resp.status_code, resp.content))

            dumps[dump_name] = resp.content

        # Save dump_names to file system
        fn = os.path.join(outputdir, "v1", "dump_names", crash_id)
        create_dir_if_needed(os.path.dirname(fn))
        with open(fn, "w") as fp:
            json.dump(list(dumps.keys()), fp)

        # Save dumps to file system
        for dump_name, data in dumps.items():
            if dump_name == "upload_file_minidump":
                dump_name = "dump"

            fn = os.path.join(outputdir, "v1", dump_name, crash_id)
            create_dir_if_needed(os.path.dirname(fn))
            with open(fn, "wb") as fp:
                fp.write(data)

    if fetchprocessed:
        # Fetch processed crash data
        print("Fetching processed %s" % crash_id)
        resp = session.get(
            host + "/api/ProcessedCrash/",
            params={
                "crash_id": crash_id,
                "format": "meta"
            },
            headers=headers,
        )

        # Handle 404 and 403 so we can provide the user more context
        if resp.status_code == 404:
            raise CrashDoesNotExist(crash_id)
        if api_token and resp.status_code == 403:
            raise BadAPIToken(resp.json().get("error", "No error provided"))

        # Raise an error for any other non-200 response
        resp.raise_for_status()

        # Save processed crash to file system
        processed_crash = resp.json()
        fn = os.path.join(outputdir, "v1", "processed_crash", crash_id)
        create_dir_if_needed(os.path.dirname(fn))
        with open(fn, "w") as fp:
            json.dump(processed_crash,
                      fp,
                      cls=JsonDTEncoder,
                      indent=2,
                      sort_keys=True)
Example #21
    def __init__(self, *args, **kwargs):
        super(ArchiveScraperCronApp, self).__init__(*args, **kwargs)
        # NOTE(willkg): If archive.mozilla.org is timing out after 5 seconds,
        # then it has issues and we should try again some other time
        self.session = session_with_retries(default_timeout=5.0)
        self.successful_inserts = 0
def main(argv=None):
    parser = argparse.ArgumentParser(
        formatter_class=WrappedTextHelpFormatter,
        description=DESCRIPTION.strip(),
    )
    parser.add_argument(
        '--sleep',
        help='how long in seconds to sleep before submitting the next group',
        type=int,
        default=SLEEP_DEFAULT)
    parser.add_argument('--host',
                        help='host for system to reprocess in',
                        default=DEFAULT_HOST)
    parser.add_argument('crashid',
                        nargs='*',
                        help='one or more crash ids to fetch data for')

    if argv is None:
        args = parser.parse_args()
    else:
        args = parser.parse_args(argv)

    api_token = os.environ.get('SOCORRO_REPROCESS_API_TOKEN')
    if not api_token:
        print('You need to set SOCORRO_REPROCESS_API_TOKEN in the environment')
        return 1

    url = args.host.rstrip('/') + '/api/Reprocessing/'

    if args.crashid:
        crash_ids = args.crashid
    elif not sys.stdin.isatty():
        # If a script is piping to this script, then isatty() returns False. If there is no script
        # piping to this script, then isatty() returns True and if we do list(sys.stdin), it'll
        # block waiting for input.
        crash_ids = list(sys.stdin)
    else:
        crash_ids = []

    # If there are no crashids, then print help and exit
    if not crash_ids:
        parser.print_help()
        return 0

    crash_ids = [item.strip() for item in crash_ids]

    print('Sending reprocessing requests to: %s' % url)
    session = session_with_retries()

    print('Reprocessing %s crashes sleeping %s seconds between groups...' %
          (len(crash_ids), args.sleep))

    groups = list(chunked(crash_ids, CHUNK_SIZE))
    for i, group in enumerate(groups):
        print('Processing group ending with %s ... (%s/%s)' %
              (group[-1], i + 1, len(groups)))
        resp = session.post(url,
                            data={'crash_ids': group},
                            headers={'Auth-Token': api_token})
        if resp.status_code != 200:
            print('Got back non-200 status code: %s %s' %
                  (resp.status_code, resp.content))
            continue

        # NOTE(willkg): We sleep here because the webapp has a bunch of rate limiting and we don't
        # want to trigger that. It'd be nice if we didn't have to do this.
        time.sleep(args.sleep)

    print('Done!')
Example #23
    def _get_version_data(self, product, version, build_id):
        """Return the real version number of a specific product, version and build

        For example, beta builds of Firefox declare their version number as the
        major version (i.e. version 54.0b3 would say its version is 54.0). This
        database call returns the actual version number of said build (i.e.
        54.0b3 for the previous example).

        :arg product: the product
        :arg version: the version as a string. e.g. "56.0"
        :arg build_id: the build_id as a string.

        :returns: ``None`` or the version string that should be used

        :raises requests.RequestException: raised if it has connection issues with
            the host specified in ``version_string_api``

        """
        if not (product and version and build_id):
            return None

        key = '%s:%s:%s' % (product, version, build_id)
        if key in self.cache:
            return self.cache[key]

        session = session_with_retries(self.version_string_api)

        resp = session.get(self.version_string_api,
                           params={
                               'product': product,
                               'version': version,
                               'build_id': build_id
                           })

        if resp.status_code == 200:
            hits = resp.json()['hits']

            # Add a random shimmy to the ttl to distribute cache misses over time
            # and keep HTTP requests from bunching up.
            shimmy = random.randint(1, 120)

            if hits:
                # If we got an answer we should keep it around for a while because it's
                # a real answer and it's not going to change so use the long ttl plus
                # a fudge factor.
                real_version = hits[0]
                self.cache.set(key,
                               value=real_version,
                               ttl=self.LONG_CACHE_TTL + shimmy)
                return real_version
            else:
                # We didn't get an answer which could mean that this is a weird build and there
                # is no answer or it could mean that ftpscraper hasn't picked up the relevant
                # build information or it could mean we're getting cached answers from the webapp.
                # Regardless, maybe in the future we get a better answer so we use the short
                # ttl plus a fudge factor.
                self.cache.set(key,
                               value=None,
                               ttl=self.SHORT_CACHE_TTL + shimmy)

        return None
Example #24
def get_session():
    """Return a retryable requests session."""
    # NOTE(willkg): If archive.mozilla.org is timing out after 5 seconds, then
    # it has issues and we should try again some other time
    return session_with_retries(default_timeout=5.0)
Example #25
    def __init__(self, *args, **kwargs):
        super(FTPScraperCronApp, self).__init__(*args, **kwargs)
        self.session = session_with_retries()
Example #26
def fetch_crash(fetchdumps, outputdir, api_token, crash_id):
    """Fetch crash data and save to correct place on the file system

    http://antenna.readthedocs.io/en/latest/architecture.html#aws-s3-file-hierarchy

    """
    if api_token:
        headers = {'Auth-Token': api_token}
    else:
        headers = {}

    # Fetch raw crash metadata
    session = session_with_retries()
    resp = session.get(
        HOST + '/api/RawCrash/',
        params={
            'crash_id': crash_id,
            'format': 'meta',
        },
        headers=headers,
    )

    # Handle 404 and 403 so we can provide the user more context
    if resp.status_code == 404:
        raise CrashDoesNotExist(crash_id)
    if api_token and resp.status_code == 403:
        raise BadAPIToken(resp.json().get('error', 'No error provided'))

    # Raise an error for any other non-200 response
    resp.raise_for_status()

    # Save raw crash to file system
    raw_crash = resp.json()
    fn = os.path.join(outputdir, 'v2', 'raw_crash', crash_id[0:3],
                      '20' + crash_id[-6:], crash_id)
    create_dir_if_needed(os.path.dirname(fn))
    with open(fn, 'w') as fp:
        json.dump(raw_crash, fp, cls=JsonDTEncoder, indent=2, sort_keys=True)

    if fetchdumps:
        # Fetch dumps
        dumps = {}
        dump_names = raw_crash.get('dump_checksums', {}).keys()
        for dump_name in dump_names:
            print('Fetching %s -> %s' % (crash_id, dump_name))

            # We store "upload_file_minidump" as "dump", so we need to use that
            # name when requesting from the RawCrash api
            file_name = dump_name
            if file_name == 'upload_file_minidump':
                file_name = 'dump'

            resp = session.get(
                HOST + '/api/RawCrash/',
                params={
                    'crash_id': crash_id,
                    'format': 'raw',
                    'name': file_name
                },
                headers=headers,
            )

            if resp.status_code != 200:
                raise Exception(
                    'Something unexpected happened. status_code %s, content %s'
                    % (resp.status_code, resp.content))

            dumps[dump_name] = resp.content

        # Save dump_names to file system
        fn = os.path.join(outputdir, 'v1', 'dump_names', crash_id)
        create_dir_if_needed(os.path.dirname(fn))
        with open(fn, 'w') as fp:
            json.dump(list(dumps.keys()), fp)

        # Save dumps to file system
        for dump_name, data in dumps.items():
            if dump_name == 'upload_file_minidump':
                dump_name = 'dump'

            fn = os.path.join(outputdir, 'v1', dump_name, crash_id)
            create_dir_if_needed(os.path.dirname(fn))
            with open(fn, 'wb') as fp:
                fp.write(data)