def update_pages(self):
    """Fetch, render and store the markdown pages listed in bower.json.

    Reads the 'pages' mapping from the stored 'bower' Content entity and, for
    each listed path, fetches the file from GitHub at self.sha, converts the
    markdown to HTML via the GitHub markdown API, and stores the result as a
    'page-<path>' Content entity under self.version_key.

    Missing bower content is a no-op; a 404 for an individual page is skipped
    silently; any other HTTP error aborts and schedules a retry.
    """
    bower = Content.get_by_id('bower', parent=self.version_key)
    if bower is None:
        return
    bower_json = bower.get_json()
    # NOTE: .iteritems() implies Python 2; the page's key (display name) is unused here.
    for _, path in bower_json.get('pages', {}).iteritems():
        response = util.github_get('repos', self.owner, self.repo, 'contents/' + path, params={'ref': self.sha})
        if response.status_code == 200:
            response_json = json.loads(response.content)
            markdown = None
            # Ensure a file was returned (the contents API returns a list for directories).
            if isinstance(response_json, dict) and response_json.get('type') == 'file':
                markdown = base64.b64decode(response_json.get('content'))
        elif response.status_code == 404:
            markdown = None
        else:
            # Transient GitHub failure: retry the whole task.
            return self.retry('error fetching page %s (%d)' % (path, response.status_code))
        if markdown is not None:
            response = util.github_markdown(markdown)
            if response.status_code == 200:
                Content(parent=self.version_key, id='page-' + path, content=response.content, status=Status.ready,
                        etag=response.headers.get('ETag', None)).put()
            else:
                return self.retry('error converting page to markdown %s (%d)' % (path, response.status_code))
def update_readme(self):
    """Fetch the repo README from GitHub, store it raw and as rendered HTML.

    Stores the raw markdown as the 'readme' Content entity and the converted
    HTML as 'readme.html'. A 404 (no README) is not an error; any other HTTP
    failure schedules a retry. A README that cannot be stored as a utf-8
    string marks the version as errored (ErrorCodes.Version_utf).
    """
    response = util.github_get('repos', self.owner, self.repo, 'readme', params={"ref": self.sha})
    if response.status_code == 200:
        # The contents API returns the file body base64-encoded in JSON.
        readme = base64.b64decode(json.loads(response.content)['content'])
        try:
            Content(parent=self.version_key, id='readme', content=readme, status=Status.ready,
                    etag=response.headers.get('ETag', None)).put()
        except db.BadValueError:
            # Datastore text properties must be valid utf-8.
            return self.error("Could not store README.md as a utf-8 string", ErrorCodes.Version_utf)
    elif response.status_code == 404:
        readme = None
    else:
        return self.retry('error fetching readme (%d)' % response.status_code)
    if readme is not None:
        response = util.github_markdown(readme)
        if response.status_code == 200:
            Content(parent=self.version_key, id='readme.html', content=response.content, status=Status.ready,
                    etag=response.headers.get('ETag', None)).put()
        else:
            return self.retry('error converting readme to markdown (%d)' % response.status_code)
def update_readme(self, is_npm_package):
    """Fetch the package README (from unpkg or GitHub), store it raw and rendered.

    For npm packages the readme filename comes from the stored registry
    metadata ('readmeFilename', default 'README.md') and the file is fetched
    from unpkg; for GitHub packages the readme endpoint is used at self.sha.
    The raw markdown is stored as the 'readme' Content entity and the HTML
    rendering as 'readme.html'. A 404 is not an error; other HTTP failures
    schedule a retry; a non-utf-8 README errors the version.
    """
    if is_npm_package:
        # Load registry metadata to fetch readme path.
        library = Library.get_by_id(Library.id(self.owner, self.repo))
        registry_metadata = json.loads(library.registry_metadata) if library.registry_metadata else None
        # FIX: previously dereferenced registry_metadata unconditionally and
        # raised AttributeError when no registry metadata was stored.
        readme_path = registry_metadata.get('readmeFilename', 'README.md') if registry_metadata else 'README.md'
        response = util.unpkg_get(self.owner, self.repo, self.version, readme_path)
        # FIX: previously stored the response body even on error statuses;
        # mirror the GitHub branch's 200/404/other handling.
        if response.status_code == 200:
            readme = response.content
        elif response.status_code == 404:
            readme = None
        else:
            return self.retry('error fetching readme (%d)' % response.status_code)
    else:
        # Load readme from GitHub endpoint.
        response = util.github_get('repos', self.owner, self.repo, 'readme', params={"ref": self.sha})
        if response.status_code == 200:
            # The contents API returns the file body base64-encoded in JSON.
            readme = base64.b64decode(json.loads(response.content)['content'])
        elif response.status_code == 404:
            readme = None
        else:
            return self.retry('error fetching readme (%d)' % response.status_code)
    if readme is not None:
        # Store the raw readme markdown.
        try:
            Content(parent=self.version_key, id='readme', content=readme, status=Status.ready,
                    etag=response.headers.get('ETag', None)).put()
        except db.BadValueError:
            return self.error("Could not store README.md as a utf-8 string", ErrorCodes.Version_utf)
        # Convert markdown to HTML and store the result.
        response = util.github_markdown(readme)
        if response.status_code == 200:
            Content(parent=self.version_key, id='readme.html', content=response.content, status=Status.ready,
                    etag=response.headers.get('ETag', None)).put()
        else:
            return self.retry('error converting readme to markdown (%d)' % response.status_code)
def trigger_analysis(self, tag, sha, transactional=False):
    """Queue an analysis task for the given version tag.

    Creates (or resets) the version's 'analysis' Content entity as pending
    unless a non-errored record already exists, then enqueues the ingest
    task on the 'analysis' queue. Collections pass the sha through so the
    analyzer can pin dependency resolution.
    """
    analysis_sha = sha if self.library.kind == 'collection' else None
    version_key = ndb.Key(Library, self.library.key.id(), Version, tag)
    existing = Content.get_by_id('analysis', parent=version_key)
    # Only (re)create the pending marker when there is no record yet or the
    # previous attempt errored; a ready/pending record is left untouched.
    if existing is None or existing.status == Status.error:
        Content(id='analysis', parent=version_key, status=Status.pending).put()
    util.new_task(util.ingest_analysis_task(self.scope, self.package, tag, analysis_sha),
                  target='analysis', transactional=transactional, queue_name='analysis')
def test_compressed(self):
    """Analyzer data stored via the compressed json property round-trips through the docs API."""
    library_key = Library(id='owner/repo').put()
    version_key = Version(id='v1.1.1', parent=library_key, sha='sha', status='ready').put()
    analysis = Content(id='analysis', parent=version_key, status=Status.pending)
    analysis.json = {"analyzerData": "some data"}
    analysis.status = Status.ready
    analysis.put()
    response = self.app.get('/api/docs/owner/repo/v1.1.1?use_analyzer_data')
    self.assertEqual(response.status_int, 200)
    body = json.loads(response.normal_body)
    self.assertEqual(body.get('analysis'), "some data")
def test_update_indexes(self):
    """Updating indexes for a ready collection triggers ingestion of its
    dependencies, creates CollectionReference entities and writes a search
    document for the collection."""
    metadata = """{ "full_name": "full-name" }"""
    collection_library_key = Library(id='my/collection', status=Status.ready, kind='collection', metadata=metadata).put()
    collection_version_key = Version(id='v1.0.0', parent=collection_library_key, sha='sha', status=Status.ready).put()
    Content(id='bower', parent=collection_version_key, content="""{"dependencies": { "a": "org/element-1#1.0.0", "b": "org/element-2#1.0.0" }}""").put()
    VersionCache.update(collection_library_key)
    response = self.app.get(util.update_indexes_task('my', 'collection'), headers={'X-AppEngine-QueueName': 'default'})
    self.assertEqual(response.status_int, 200)
    # Triggers ingestions
    tasks = self.tasks.get_filtered_tasks()
    self.assertEqual([
        util.ensure_library_task('org', 'element-1'),
        util.ensure_library_task('org', 'element-2'),
    ], [task.url for task in tasks])
    # Ensures collection references
    ref1 = CollectionReference.get_by_id(id="my/collection/v1.0.0", parent=ndb.Key(Library, "org/element-1"))
    self.assertIsNotNone(ref1)
    ref2 = CollectionReference.get_by_id(id="my/collection/v1.0.0", parent=ndb.Key(Library, "org/element-2"))
    self.assertIsNotNone(ref2)
    # Validate search index
    index = search.Index('repo')
    document = index.get('my/collection')
    self.assertIsNotNone(document)
    self.assertTrue(len(document.fields) > 0)
def get(self, owner, repo, ver=None):
    """Serve the analysis content for a library version as JSON.

    An ndb tasklet-style handler (uses `yield` on async calls). Falls back to
    the latest version when `ver` is omitted. 404s when the version or its
    analysis record is missing; responds 400 when analysis is not ready.
    """
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    owner = owner.lower()
    repo = repo.lower()
    library_key = ndb.Key(Library, Library.id(owner, repo))
    if ver is None:
        ver = yield Library.latest_version_for_key_async(library_key)
    if ver is None:
        self.response.set_status(404)
        return
    version_key = ndb.Key(Library, Library.id(owner, repo), Version, ver)
    # Eventual consistency is acceptable for read-only doc serving.
    analysis = Content.get_by_id('analysis', parent=version_key, read_policy=ndb.EVENTUAL_CONSISTENCY)
    if analysis is None:
        self.response.set_status(404)
        return
    self.response.headers['Content-Type'] = 'application/json'
    result = {}
    result['status'] = analysis.status
    if analysis.status == Status.ready:
        result['content'] = json.loads(analysis.content)
    if analysis.status == Status.error:
        result['error'] = analysis.error
    if result['status'] != Status.ready:
        self.response.set_status(400)
    # NOTE(review): Content-Type was already set above; this second assignment is redundant.
    self.response.headers['Content-Type'] = 'application/json'
    self.response.write(json.dumps(result))
def handle_post(self):
    """Ingest an analysis result delivered via a Pub/Sub push message.

    Decodes the base64 payload, updates the version's 'analysis' Content
    entity (json body plus ready/error status), and — if this version is the
    library's default — enqueues an index update task. Silently drops
    messages for versions whose analysis record no longer exists.
    """
    # Pub/Sub push bodies arrive urlencoded with trailing '=' padding.
    message_json = json.loads( urllib.unquote(self.request.body).rstrip('='))
    message = message_json['message']
    data = base64.b64decode(str(message['data']))
    attributes = message['attributes']
    owner = attributes['owner']
    repo = attributes['repo']
    version = attributes['version']
    error = attributes.get('error', None)
    version_key = ndb.Key(Library, Library.id(owner, repo), Version, version)
    content = Content.get_by_id('analysis', parent=version_key)
    if content is None:
        return
    if data == '':
        content.set_json(None)
    else:
        content.set_json(json.loads(data))
    if error is None:
        content.status = Status.ready
        content.error = None
    else:
        content.status = Status.error
        content.error = error
    content.put()
    # Refresh the search index only when the default (displayed) version changed.
    if version_key.id() == Library.default_version_for_key_async( version_key.parent()).get_result():
        task_url = util.update_indexes_task(owner, repo)
        util.new_task(task_url, target='manage')
def update_search_index(self, owner, repo, version_key, library, bower):
    """Build and store the search document for a library.

    Combines GitHub metadata, bower.json, npm registry metadata and (when
    ready) analysis output into searchable fields. Prefers 'analyzerData'
    (new analyzer format) over the legacy hydrolysis fields. Repo name and
    element/behavior names are repeated in a 'weighted_fields' text field to
    boost their relevance. Rank is seconds since 2016-01-01 of the library's
    last update, so fresher libraries rank higher.
    """
    metadata = json.loads(library.metadata)
    registry_metadata = json.loads(library.registry_metadata) if library.registry_metadata else None
    npm_description = registry_metadata.get('description', '') if registry_metadata else ''
    npm_keywords = registry_metadata.get('keywords', []) if registry_metadata else []
    fields = [
        search.AtomField(name='owner', value=owner),
        search.TextField(name='repo', value=repo),
        search.AtomField(name='kind', value=library.kind),
        search.AtomField(name='version', value=version_key.id()),
        search.TextField(name='github_description', value=metadata.get('description', '')),
        search.TextField(name='bower_description', value=bower.get('description', '')),
        search.TextField(name='npm_description', value=npm_description),
        search.TextField(name='bower_keywords', value=' '.join(bower.get('keywords', []))),
        search.TextField(name='npm_keywords', value=' '.join(npm_keywords)),
        search.TextField(name='prefix_matches', value=' '.join(util.generate_prefixes_from_list(
            util.safe_split_strip(metadata.get('description')) +
            util.safe_split_strip(bower.get('description')) +
            util.safe_split_strip(repo)))),
    ]
    # Generate weighting field
    weights = [(repo, 10)]
    analysis = Content.get_by_id('analysis', parent=version_key)
    if analysis is not None and analysis.status == Status.ready:
        data = analysis.get_json()
        if data.get('analyzerData', None) is not None:
            # Use analyzer data for search index
            element_objects = data.get('analyzerData', {}).get('elements', [])
            # An element is identified by tag name, falling back to class name.
            elements = [element.get('tagname', '') or element.get('classname', '') for element in element_objects]
            if elements != []:
                fields.append(search.TextField(name='element', value=' '.join(elements)))
                weights.append((' '.join(elements), 5))
            behavior_objects = data.get('analyzerData', {}).get('metadata', {}).get('polymer', {}).get('behaviors', [])
            behaviors = [behavior.get('name', '') for behavior in behavior_objects]
            if behaviors != []:
                fields.append(search.TextField(name='behavior', value=' '.join(behaviors)))
                weights.append((' '.join(behaviors), 5))
        else:
            # Use hydrolysis data for search index
            elements = data.get('elementsByTagName', {}).keys()
            if elements != []:
                fields.append(search.TextField(name='element', value=' '.join(elements)))
                weights.append((' '.join(elements), 5))
            behaviors = data.get('behaviorsByName', {}).keys()
            if behaviors != []:
                fields.append(search.TextField(name='behavior', value=' '.join(behaviors)))
                weights.append((' '.join(behaviors), 5))
    # Repeat each weighted value `weight` times so the search scorer sees it more often.
    weighted = []
    for value, weight in weights:
        for _ in range(0, weight):
            weighted.append(value)
    fields.append(search.TextField(name='weighted_fields', value=' '.join(weighted)))
    rank = int((library.updated - datetime.datetime(2016, 1, 1)).total_seconds())
    document = search.Document(doc_id=Library.id(owner, repo), fields=fields, rank=rank)
    index = search.Index('repo')
    index.put(document)
def get(self, owner, repo, version=None):
    """Serve brief metadata for the resolvable bower dependencies of a version.

    An ndb tasklet-style handler. For each dependency spec in the stored
    bower.json, loads the dependency library's version list, walks back from
    the newest version to the newest one matching the spec, and collects a
    brief-metadata future for it. Unresolvable specs are skipped.
    """
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    self.response.headers['Content-Type'] = 'application/json'
    library_key = ndb.Key(Library, Library.id(owner, repo))
    if version is None:
        version = yield Library.default_version_for_key_async(library_key)
    if version is None:
        self.response.set_status(404)
        return
    version_key = ndb.Key(Library, library_key.id(), Version, version)
    bower = yield Content.get_by_id_async('bower', parent=version_key)
    if bower is None:
        self.response.set_status(404)
        return
    bower_json = bower.get_json()
    bower_dependencies = bower_json.get('dependencies', {})
    dependencies = []
    version_futures = []
    # Kick off all version-list fetches in parallel before resolving any.
    for name in bower_dependencies.keys():
        dependency = Dependency.from_string(bower_dependencies[name])
        if dependency is None:
            continue
        dependencies.append(dependency)
        dependency_library_key = ndb.Key(Library, Library.id(dependency.owner, dependency.repo))
        version_futures.append(Library.versions_for_key_async(dependency_library_key))
    dependency_futures = []
    for i, dependency in enumerate(dependencies):
        versions = yield version_futures[i]
        def matches(version, spec):
            try:
                return versiontag.match(version, spec)
            except ValueError:
                # FIXME: What other cases do we need to support here?
                return False
        # Pop from the end (newest first) until a version satisfies the spec.
        while len(versions) > 0 and not matches(versions[-1], dependency.version):
            versions.pop()
        if len(versions) > 0:
            dependency_library_key = ndb.Key(Library, Library.id(dependency.owner.lower(), dependency.repo.lower()))
            dependency_futures.append(LibraryMetadata.brief_async(dependency_library_key, versions[-1]))
    results = []
    for future in dependency_futures:
        dependency_result = yield future
        if dependency_result is not None:
            results.append(dependency_result)
    result = {
        'results': results,
        'count': len(results),
    }
    self.response.write(json.dumps(result))
def test_analyze_leaves_existing_content_when_reanalyzing(self):
    """Re-running /task/analyze must not clobber already-ready analysis content."""
    library_key = Library(id='owner/repo').put()
    version_key = Version(id='v1.1.1', parent=library_key, sha='sha', status='ready').put()
    seeded = Content(id='analysis', parent=version_key, status=Status.pending)
    seeded.content = 'existing data'
    seeded.status = Status.ready
    seeded.put()
    response = self.app.get('/task/analyze/owner/repo', headers={'X-AppEngine-QueueName': 'default'})
    self.assertEqual(response.status_int, 200)
    stored = Content.get_by_id('analysis', parent=version_key)
    self.assertEqual(stored.content, 'existing data')
    self.assertEqual(stored.status, Status.ready)
    queued_urls = [task.url for task in self.tasks.get_filtered_tasks()]
    self.assertEqual(queued_urls, [util.ingest_analysis_task('owner', 'repo', 'v1.1.1')])
def trigger_analysis(self, tag, sha, transactional=False):
    """Reset the version's analysis record to pending and enqueue an analysis task.

    Collections pass the sha through so the analyzer can pin dependency
    resolution; other kinds send None.
    """
    analysis_sha = sha if self.library.kind == 'collection' else None
    version_key = ndb.Key(Library, self.library.key.id(), Version, tag)
    Content(id='analysis', parent=version_key, status=Status.pending).put()
    util.new_task(util.ingest_analysis_task(self.owner, self.repo, tag, analysis_sha),
                  target='analysis', transactional=transactional)
def get(self, owner, repo, tag, name=None, path=None):
    """Resolve and serve a bower dependency file for a library version.

    Builds a dependency-name -> owner/repo/version map from the stored
    analysis ('bowerDependencies'), then either lists all mappings (debug
    mode, when name/path are omitted), redirects non-HTML files to rawgit,
    or proxies HTML files through this origin so their relative URLs resolve.
    """
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    self.response.headers['Content-Type'] = 'application/json'
    owner = owner.lower()
    repo = repo.lower()
    version_key = ndb.Key(Library, '%s/%s' % (owner, repo), Version, tag)
    analysis = Content.get_by_id('analysis', parent=version_key, read_policy=ndb.EVENTUAL_CONSISTENCY)
    if analysis is None:
        self.response.set_status(404)
        return
    dependencies = json.loads(analysis.content).get('bowerDependencies', None)
    if dependencies is None:
        self.response.set_status(404)
        return
    config_map = {}
    for dependency in dependencies:
        config_map[dependency['name']] = '%s/%s/%s' % (dependency['owner'], dependency['repo'], dependency['version'])
    # Ensure the repo serves its own version.
    config_map[repo] = '%s/%s/%s' % (owner, repo, tag)
    def resolve(name, path):
        # Returns the rawgit CDN URL for a mapped dependency, or None if unknown.
        return 'https://cdn.rawgit.com/%s%s' % (config_map[name], path) if name in config_map else None
    # debug mode
    if name is None or path is None:
        for k in config_map:
            self.response.write('/%s/%s/%s/components/%s/... -> %s\n' % (owner, repo, tag, k, resolve(k, '/...')))
        self.response.write('\n')
        return
    resolved = resolve(name, path)
    if resolved is None:
        self.response.write('%s is not a valid dependency for %s/%s#%s' % (name, owner, repo, tag))
        self.response.set_status(400)
        return
    # TODO: Figure out what other types this is necessary for. eg. do we need it for CSS @import?
    # We need to serve html files from the same origin, so that relative urls load correctly.
    if path.endswith('.html'):
        # TODO: Decide whether this should be memcached. Appengine's urlfetch already does caching.
        response = urlfetch.fetch(resolved)
        if response.status_code == 200:
            self.response.write(response.content)
            self.response.headers['cache-control'] = response.headers.get('cache-control', 'max-age=315569000')
            self.response.headers['content-type'] = response.headers.get('content-type', 'text/html')
        else:
            self.response.write('could not fetch: %s' % resolved)
            self.response.set_status(400)
    else:
        # Non-HTML assets can be redirected straight to the CDN.
        self.response.set_status(301)
        self.response.headers['Location'] = str(resolved)
        self.response.headers['cache-control'] = 'max-age=315569000'
def test_analyze_resets_error_content_when_reanalyzing(self):
    """Re-running /task/analyze resets an errored analysis record back to pending."""
    library_key = Library(id='owner/repo').put()
    version_key = Version(id='v1.1.1', parent=library_key, sha='sha', status='ready').put()
    seeded = Content(id='analysis', parent=version_key, status=Status.pending)
    seeded.status = Status.error
    seeded.put()
    response = self.app.get('/task/analyze/owner/repo', headers={'X-AppEngine-QueueName': 'default'})
    self.assertEqual(response.status_int, 200)
    stored = Content.get_by_id('analysis', parent=version_key)
    self.assertEqual(stored.status, Status.pending)
    queued_urls = [task.url for task in self.tasks.get_filtered_tasks()]
    self.assertEqual(queued_urls, [util.ingest_analysis_task('owner', 'repo', 'v1.1.1')])
def update_search_index(self, owner, repo, version_key, library, bower):
    """Build and store the search document for a library (hydrolysis variant).

    Combines GitHub metadata and bower.json into searchable fields, adds
    element/behavior names from ready hydrolysis analysis, and repeats
    high-value terms in a 'weighted_fields' text field to boost relevance.
    Rank is seconds since 2016-01-01 of the library's last update.
    """
    metadata = json.loads(library.metadata)
    fields = [
        search.AtomField(name='owner', value=owner),
        search.TextField(name='repo', value=repo),
        search.AtomField(name='kind', value=library.kind),
        search.AtomField(name='version', value=version_key.id()),
        search.TextField(name='github_description', value=metadata.get('description', '')),
        search.TextField(name='bower_description', value=bower.get('description', '')),
        search.TextField(name='bower_keywords', value=' '.join(bower.get('keywords', []))),
        search.TextField(
            name='prefix_matches',
            value=' '.join(
                util.generate_prefixes_from_list(
                    util.safe_split_strip(metadata.get('description')) +
                    util.safe_split_strip(bower.get('description')) +
                    util.safe_split_strip(repo)))),
    ]
    # Generate weighting field
    weights = [(repo, 10)]
    analysis = Content.get_by_id('analysis', parent=version_key)
    if analysis is not None and analysis.status == Status.ready:
        # Rebinds `analysis` from the entity to its parsed JSON payload.
        analysis = json.loads(analysis.content)
        elements = analysis.get('elementsByTagName', {}).keys()
        if elements != []:
            fields.append(
                search.TextField(name='element', value=' '.join(elements)))
            weights.append((' '.join(elements), 5))
        behaviors = analysis.get('behaviorsByName', {}).keys()
        if behaviors != []:
            fields.append(
                search.TextField(name='behavior', value=' '.join(behaviors)))
            weights.append((' '.join(behaviors), 5))
    # Repeat each weighted value `weight` times so the scorer sees it more often.
    weighted = []
    for value, weight in weights:
        for _ in range(0, weight):
            weighted.append(value)
    fields.append(
        search.TextField(name='weighted_fields', value=' '.join(weighted)))
    rank = int(
        (library.updated - datetime.datetime(2016, 1, 1)).total_seconds())
    document = search.Document(doc_id=Library.id(owner, repo), fields=fields, rank=rank)
    index = search.Index('repo')
    index.put(document)
def trigger_analysis(self, tag, sha, transactional=False):
    """Queue an analysis task for `tag`, creating a pending record if needed.

    A ready or already-pending analysis record is left untouched; only a
    missing or errored record is (re)created as pending. Collections pass
    the sha through so the analyzer can pin dependency resolution.
    """
    analysis_sha = sha if self.library.kind == 'collection' else None
    version_key = ndb.Key(Library, self.library.key.id(), Version, tag)
    existing = Content.get_by_id('analysis', parent=version_key)
    if existing is None or existing.status == Status.error:
        Content(id='analysis', parent=version_key, status=Status.pending).put()
    task_url = util.ingest_analysis_task(self.owner, self.repo, tag, analysis_sha)
    util.new_task(task_url, target='analysis',
                  transactional=transactional, queue_name='analysis')
def test_analyzer_index_empty(self):
    """A ready analysis with an empty 'analyzerData' payload still indexes the
    library, but produces no element/behavior search fields."""
    metadata = """{ "full_name": "full-name" }"""
    library_key = Library(id='owner/repo', metadata=metadata).put()
    version_key = Version(id='v1.1.1', parent=library_key, sha='sha', status='ready').put()
    content = Content(id='analysis', parent=version_key, status=Status.pending)
    data = {"analyzerData": {}}
    content.json = data
    content.status = Status.ready
    content.put()
    VersionCache.update(library_key)
    response = self.app.get(util.update_indexes_task('owner', 'repo'), headers={'X-AppEngine-QueueName': 'default'})
    self.assertEqual(response.status_int, 200)
    index = search.Index('repo')
    document = index.get('owner/repo')
    self.assertIsNotNone(document)
    self.assertTrue(len(document.fields) > 0)
    # Empty analyzer data must not emit element/behavior fields.
    elements = [field for field in document.fields if field.name == 'element']
    self.assertEqual(len(elements), 0)
    behaviors = [field for field in document.fields if field.name == 'behavior']
    self.assertEqual(len(behaviors), 0)
def update_bower(self):
    """Fetch, parse and store bower.json for this version.

    On success stores the parsed JSON as the 'bower' Content entity and
    returns the parsed dict. A 404 or unparseable body marks the version as
    errored; any other HTTP status schedules a retry. In both failure cases
    the error/retry helper's return value is propagated.
    """
    url = util.content_url(self.owner, self.repo, self.sha, 'bower.json')
    response = urlfetch.fetch(url, validate_certificate=True)
    if response.status_code == 404:
        return self.error("missing bower.json", ErrorCodes.Version_missing_bower)
    if response.status_code != 200:
        return self.retry('could not access bower.json (%d)' % response.status_code)
    try:
        bower_json = json.loads(response.content)
    except ValueError:
        return self.error("could not parse bower.json", ErrorCodes.Version_parse_bower)
    Content(parent=self.version_key, id='bower', json=bower_json, status=Status.ready,
            etag=response.headers.get('ETag', None)).put()
    return bower_json
def post(self):
    """Ingest raw analysis output delivered via a Pub/Sub push message.

    Decodes the base64 payload and stores it as the version's 'analysis'
    Content entity. Data for versions that no longer exist in the datastore
    is dropped. Always responds 200 so Pub/Sub does not redeliver.
    """
    message_json = json.loads(urllib.unquote(self.request.body).rstrip('='))
    message = message_json['message']
    data = base64.b64decode(str(message['data']))
    attributes = message['attributes']
    owner = attributes['owner']
    repo = attributes['repo']
    version = attributes['version']
    logging.info('Ingesting analysis data %s/%s/%s', owner, repo, version)
    parent = Version.get_by_id(version, parent=ndb.Key(Library, '%s/%s' % (owner, repo)))
    # Don't accept the analysis data unless the version still exists in the datastore
    if parent is not None:
        content = Content(parent=parent.key, id='analysis', content=data)
        try:
            content.put()
        except Exception:
            # FIX: was a bare `except:` (which also swallows SystemExit /
            # KeyboardInterrupt) logging only sys.exc_info()[0], which
            # discarded the traceback. logging.exception records both the
            # message and the full traceback. Likely cause: oversized or
            # non-utf-8 content rejected by the datastore.
            logging.exception('failed to store analysis for %s/%s/%s', owner, repo, version)
    self.response.set_status(200)
def get(self, owner, repo, ver=None):
    """Serve analysis docs for a version, preferring analyzer data on request.

    An ndb tasklet-style handler. With `?use_analyzer_data` and analyzer data
    present, responds with {'analysis': ...}; otherwise responds with the
    legacy hydrolysis payload under 'content' (stripping 'analyzerData').
    404 for missing version/analysis; 400 when analysis is not ready.
    """
    use_analyzer_data = self.request.get('use_analyzer_data', None) is not None
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    owner = owner.lower()
    repo = repo.lower()
    library_key = ndb.Key(Library, Library.id(owner, repo))
    if ver is None:
        ver = yield Library.default_version_for_key_async(library_key)
    if ver is None:
        self.response.set_status(404)
        return
    version_key = ndb.Key(Library, Library.id(owner, repo), Version, ver)
    analysis = Content.get_by_id('analysis', parent=version_key, read_policy=ndb.EVENTUAL_CONSISTENCY)
    if analysis is None:
        self.response.set_status(404)
        return
    self.response.headers['Content-Type'] = 'application/json'
    result = {}
    result['status'] = analysis.status
    if analysis.status == Status.ready:
        content = analysis.get_json()
        has_analyzer_data = content.get('analyzerData', None) is not None
        if use_analyzer_data and has_analyzer_data:
            # Use the analyzer data fields
            result['analysis'] = content['analyzerData']
        else:
            # Use the hydrolysis fields and delete the analyzer ones
            if has_analyzer_data:
                del content['analyzerData']
            result['content'] = content
    if analysis.status == Status.error:
        result['error'] = analysis.error
    if result['status'] != Status.ready:
        self.response.set_status(400)
    # NOTE(review): Content-Type was already set above; this is redundant.
    self.response.headers['Content-Type'] = 'application/json'
    self.response.write(json.dumps(result))
def test_analyze_latest(self):
    """Analyzing with latest=True only queues the newest version of the library."""
    library_key = Library(id='owner/repo').put()
    Version(id='v1.1.1', parent=library_key, sha='sha', status='ready').put()
    version_key = Version(id='v1.1.2', parent=library_key, sha='sha', status='ready').put()
    VersionCache.update(library_key)
    response = self.app.get('/task/analyze/owner/repo/True', headers={'X-AppEngine-QueueName': 'default'})
    self.assertEqual(response.status_int, 200)
    stored = Content.get_by_id('analysis', parent=version_key)
    self.assertEqual(stored.get_json(), None)
    self.assertEqual(stored.status, Status.pending)
    queued_urls = [task.url for task in self.tasks.get_filtered_tasks()]
    self.assertEqual(queued_urls, [util.ingest_analysis_task('owner', 'repo', 'v1.1.2')])
def get(self, owner, repo, ver, path):
    """Serve the stored rendered HTML for a documentation page.

    Looks up the 'page-<path>' Content entity under the requested version and
    writes its content; 404s when the page is not stored.
    """
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    version_key = ndb.Key(Library, Library.id(owner, repo), Version, ver)
    # FIX: removed the dead `if version_key is None` branch — ndb.Key(...)
    # always returns a key object, so the 'Invalid repo/version' response
    # was unreachable. A bad repo/version still 404s below because no page
    # entity exists under a nonexistent key.
    page = Content.get_by_id('page-' + path, parent=version_key, read_policy=ndb.EVENTUAL_CONSISTENCY)
    if page is None:
        self.response.set_status(404)
        self.response.write('Cannot find page %s' % path)
        return
    self.response.write(page.content)
def get(self, owner, repo, ver=None):
    """Serve the bower dependency list extracted by hydrolyzer for a version.

    404s when no hydrolyzer content or no 'bowerDependencies' key is present.
    NOTE(review): `ver` defaults to None but is passed straight into
    ndb.Key, which rejects a None id — presumably the route always supplies
    a version; confirm against the URL mapping.
    """
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    owner = owner.lower()
    repo = repo.lower()
    version_key = ndb.Key(Library, '%s/%s' % (owner, repo), Version, ver)
    hydrolyzer = Content.get_by_id('hydrolyzer', parent=version_key, read_policy=ndb.EVENTUAL_CONSISTENCY)
    if hydrolyzer is None:
        self.response.set_status(404)
        return
    dependencies = json.loads(hydrolyzer.content).get('bowerDependencies', None)
    if dependencies is None:
        self.response.set_status(404)
        return
    self.response.headers['Content-Type'] = 'application/json'
    self.response.write(json.dumps(dependencies))
def update_search_index(self, owner, repo, version_key, library, bower):
    """Build and store the search document for a library (compact hydrolysis variant).

    Same behavior as the expanded hydrolysis variant elsewhere in this file:
    GitHub + bower metadata fields, element/behavior names from ready
    analysis, a repetition-based 'weighted_fields' boost, and a recency rank
    measured in seconds since 2016-01-01.
    """
    metadata = json.loads(library.metadata)
    fields = [
        search.AtomField(name='owner', value=owner),
        search.TextField(name='repo', value=repo),
        search.AtomField(name='kind', value=library.kind),
        search.AtomField(name='version', value=version_key.id()),
        search.TextField(name='github_description', value=metadata.get('description', '')),
        search.TextField(name='bower_description', value=bower.get('description', '')),
        search.TextField(name='bower_keywords', value=' '.join(bower.get('keywords', []))),
        search.TextField(name='prefix_matches', value=' '.join(util.generate_prefixes_from_list(
            util.safe_split_strip(metadata.get('description')) +
            util.safe_split_strip(bower.get('description')) +
            util.safe_split_strip(repo)))),
    ]
    # Generate weighting field
    weights = [(repo, 10)]
    analysis = Content.get_by_id('analysis', parent=version_key)
    if analysis is not None and analysis.status == Status.ready:
        # Rebinds `analysis` from the entity to its parsed JSON payload.
        analysis = json.loads(analysis.content)
        elements = analysis.get('elementsByTagName', {}).keys()
        if elements != []:
            fields.append(search.TextField(name='element', value=' '.join(elements)))
            weights.append((' '.join(elements), 5))
        behaviors = analysis.get('behaviorsByName', {}).keys()
        if behaviors != []:
            fields.append(search.TextField(name='behavior', value=' '.join(behaviors)))
            weights.append((' '.join(behaviors), 5))
    # Repeat each weighted value `weight` times so the scorer sees it more often.
    weighted = []
    for value, weight in weights:
        for _ in range(0, weight):
            weighted.append(value)
    fields.append(search.TextField(name='weighted_fields', value=' '.join(weighted)))
    rank = int((library.updated - datetime.datetime(2016, 1, 1)).total_seconds())
    document = search.Document(doc_id=Library.id(owner, repo), fields=fields, rank=rank)
    index = search.Index('repo')
    index.put(document)
def update_search_index(self, owner, repo, version_key, library, bower):
    """Build and store the search document for a library (early variant).

    Unlike the newer variants in this file, this one has no weighted-fields
    boost and no rank: documents are indexed with field data only. Element
    and behavior names come from ready hydrolysis analysis.
    """
    metadata = json.loads(library.metadata)
    fields = [
        search.AtomField(name='owner', value=owner),
        search.TextField(name='repo', value=repo),
        search.AtomField(name='kind', value=library.kind),
        search.AtomField(name='version', value=version_key.id()),
        search.TextField(name='github_description', value=metadata.get('description', '')),
        search.TextField(name='bower_description', value=bower.get('description', '')),
        search.TextField(name='bower_keywords', value=' '.join(bower.get('keywords', []))),
        search.TextField(
            name='prefix_matches',
            value=' '.join(
                util.generate_prefixes_from_list(
                    # Includes the repo name split on '-'/'_' so partial name
                    # matches hit.
                    [repo] +
                    util.safesplit(metadata.get('description')) +
                    util.safesplit(bower.get('description')) +
                    repo.replace("_", " ").replace("-", " ").split()))),
    ]
    analysis = Content.get_by_id('analysis', parent=version_key)
    if analysis is not None and analysis.status == Status.ready:
        # Rebinds `analysis` from the entity to its parsed JSON payload.
        analysis = json.loads(analysis.content)
        elements = analysis.get('elementsByTagName', {}).keys()
        if elements != []:
            fields.append(
                search.TextField(name='element', value=' '.join(elements)))
        behaviors = analysis.get('behaviorsByName', {}).keys()
        if behaviors != []:
            fields.append(
                search.TextField(name='behavior', value=' '.join(behaviors)))
    document = search.Document(doc_id=Library.id(owner, repo), fields=fields)
    index = search.Index('repo')
    index.put(document)
def get(self, owner, repo, ver=None):
    """Serve the raw hydrolyzer analysis content for a library version.

    When `ver` is omitted, picks the highest version by semver comparison.
    404s when the library has no versions or no hydrolyzer content.
    """
    # TODO: Share all of this boilerplate between GetDataMeta and GetHydroData
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    owner = owner.lower()
    repo = repo.lower()
    library_key = ndb.Key(Library, '%s/%s' % (owner, repo))
    # TODO: version shouldn't be optional here
    if ver is None:
        versions = Version.query(ancestor=library_key).map(lambda v: v.key.id())
        # NOTE: cmp-function sort — Python 2 only.
        versions.sort(versiontag.compare)
        if versions == []:
            self.response.set_status(404)
            return
        ver = versions[-1]
    version_key = ndb.Key(Library, '%s/%s' % (owner, repo), Version, ver)
    hydro = Content.get_by_id('hydrolyzer', parent=version_key, read_policy=ndb.EVENTUAL_CONSISTENCY)
    if hydro is None:
        self.response.set_status(404)
        return
    self.response.headers['Content-Type'] = 'application/json'
    self.response.write(hydro.content)
def handle_post(self):
    """Ingest an analysis result from Pub/Sub, with payload-size and attribute guards.

    Drops payloads over ~5MB (datastore entity limit headroom) and messages
    with no attributes. Otherwise updates the version's 'analysis' Content
    entity (json body plus ready/error status) and, when the version is the
    library's default, enqueues an index update task.
    """
    # Ignore payloads larger than 5 MB.
    if len(self.request.body) > 1048487 * 5:
        return
    # Pub/Sub push bodies arrive urlencoded with trailing '=' padding.
    message_json = json.loads(urllib.unquote(self.request.body).rstrip('='))
    message = message_json['message']
    data = base64.b64decode(str(message['data']))
    attributes = message['attributes']
    if len(attributes) == 0:
        logging.error(message)
        return
    owner = attributes['owner']
    repo = attributes['repo']
    version = attributes['version']
    error = attributes.get('error', None)
    version_key = ndb.Key(Library, Library.id(owner, repo), Version, version)
    content = Content.get_by_id('analysis', parent=version_key)
    if content is None:
        return
    if data == '':
        content.set_json(None)
    else:
        content.set_json(json.loads(data))
    if error is None:
        content.status = Status.ready
        content.error = None
    else:
        content.status = Status.error
        content.error = error
    content.put()
    # Refresh the search index only when the default (displayed) version changed.
    if version_key.id() == Library.default_version_for_key_async(version_key.parent()).get_result():
        task_url = util.update_indexes_task(owner, repo)
        util.new_task(task_url, target='manage')
def handle_post(self):
    """Ingest an analysis result from Pub/Sub, storing raw content with a size cap.

    Unlike the json-based variant, this stores the decoded payload directly
    in Content.content. Payloads over 500KB are rejected (datastore entity
    limit is 1MB) and converted into an error status on the record.
    """
    # Pub/Sub push bodies arrive urlencoded with trailing '=' padding.
    message_json = json.loads(urllib.unquote(self.request.body).rstrip('='))
    message = message_json['message']
    data = base64.b64decode(str(message['data']))
    attributes = message['attributes']
    owner = attributes['owner']
    repo = attributes['repo']
    version = attributes['version']
    error = attributes.get('error', None)
    version_key = ndb.Key(Library, Library.id(owner, repo), Version, version)
    content = Content.get_by_id('analysis', parent=version_key)
    if content is None:
        return
    if data == '':
        content.content = None
    elif len(data) > 500000:
        # Max entity size is only 1MB.
        logging.error('content was too large: %d %s %s', len(data), Library.id(owner, repo), version)
        error = 'content was too large: %d' % len(data)
    else:
        content.content = data
    if error is None:
        content.status = Status.ready
        content.error = None
    else:
        content.status = Status.error
        content.error = error
    content.put()
    # Refresh the search index only when the default (displayed) version changed.
    if version_key.id() == Library.default_version_for_key_async(version_key.parent()).get_result():
        task_url = util.update_indexes_task(owner, repo)
        util.new_task(task_url, target='manage')
def test_compressed(self):
    """Compressed-json analyzer data is served back by the docs endpoint."""
    library_key = Library(id='owner/repo').put()
    version_key = Version(id='v1.1.1', parent=library_key, sha='sha', status='ready').put()
    record = Content(id='analysis', parent=version_key, status=Status.pending)
    record.json = {"analyzerData": "some data"}
    record.status = Status.ready
    record.put()
    response = self.app.get('/api/docs/owner/repo/v1.1.1?use_analyzer_data')
    self.assertEqual(response.status_int, 200)
    parsed = json.loads(response.normal_body)
    self.assertEqual(parsed.get('analysis'), "some data")
def get(self, owner, repo, version):
    """Ingest a single version: readme, bower.json and (if latest) search index.

    Fetches README.md and bower.json from the CDN content URL, stores them as
    Content children of the Version entity, and — when this is the newest
    version — adds/updates the library's search document. Always responds 200
    except when GitHub API quota cannot be reserved.

    NOTE(review): urlfetch status codes are never checked here — a 404 body
    would be stored verbatim as the readme; verify upstream guarantees.
    """
    logging.info('ingesting version %s/%s/%s', owner, repo, version)
    github = quota.GitHub()
    # Reserve one GitHub API call (used below for markdown rendering);
    # 500 makes the task queue retry later when quota is exhausted.
    if not github.reserve(1):
        self.response.set_status(500)
        return
    key = ndb.Key(Library, '%s/%s' % (owner, repo), Version, version)
    response = urlfetch.fetch(util.content_url(owner, repo, version, 'README.md'))
    readme = response.content
    try:
        content = Content(parent=key, id='readme', content=readme)
        content.etag = response.headers.get('ETag', None)
        content.put()
    except db.BadValueError:
        # Non-utf8 readme: record the error on the Version and stop (200 so
        # the task is not retried — the content will never become valid).
        ver = key.get()
        ver.error = "Could not store README.md as a utf-8 string"
        ver.put()
        self.response.set_status(200)
        return
    # Render the readme to HTML via the GitHub markdown API.
    response = github.markdown(readme)
    content = Content(parent=key, id='readme.html', content=response.content)
    content.put()
    response = urlfetch.fetch(util.content_url(owner, repo, version, 'bower.json'))
    try:
        # Parse only to validate; the raw bytes are what gets stored.
        json.loads(response.content)
    except ValueError:
        ver = key.get()
        ver.error = "This version has a missing or broken bower.json"
        ver.put()
        self.response.set_status(200)
        return
    content = Content(parent=key, id='bower', content=response.content)
    content.etag = response.headers.get('ETag', None)
    content.put()
    versions = Library.versions_for_key(key.parent())
    # Only the newest version feeds the search index. Assumes
    # versions_for_key returns ascending order — TODO confirm.
    if versions[-1] == version:
        library = key.parent().get()
        if library.kind == "collection":
            task_url = util.ingest_dependencies_task(owner, repo, version)
            util.new_task(task_url)
        bower = json.loads(response.content)
        metadata = json.loads(library.metadata)
        logging.info('adding search index for %s', version)
        # Prefer the bower description, falling back to the GitHub one.
        description = bower.get("description", metadata.get("description", ""))
        document = search.Document(doc_id='%s/%s' % (owner, repo), fields=[
            search.AtomField(name='full_name', value=metadata['full_name']),
            search.TextField(name='owner', value=owner),
            search.TextField(name='repo', value=repo),
            search.TextField(name='version', value=version),
            # Split hyphenated repo names so each part is searchable.
            search.TextField(name='repoparts', value=' '.join(repo.split('-'))),
            search.TextField(name='description', value=description),
            search.TextField(name='keywords', value=' '.join(bower.get('keywords', []))),
            # NOTE(review): .get() may yield None here; NumberField with a
            # None value would raise — assumes GitHub metadata always carries
            # these counters. TODO confirm.
            search.NumberField(name='stars', value=metadata.get('stargazers_count')),
            search.NumberField(name='subscribers', value=metadata.get('subscribers_count')),
            search.NumberField(name='forks', value=metadata.get('forks')),
            search.NumberField(name='contributors', value=library.contributor_count),
            search.DateField(name='updated_at', value=datetime.datetime.strptime(metadata.get('updated_at'), TIME_FORMAT))
        ])
        index = search.Index('repo')
        index.put(document)
    self.response.set_status(200)
def update_search_index(self, owner, repo, version_key, library, bower):
    """Build and store the search document for a library version.

    Combines GitHub metadata, bower.json, npm registry metadata and (when
    available) analyzer/hydrolysis output into one search.Document, ranked
    by library update recency.

    Args:
        owner/repo: lowercase identifiers used as the document id.
        version_key: ndb.Key of the Version whose analysis Content is read.
        library: Library entity (metadata, registry_metadata, updated, ...).
        bower: parsed bower.json dict for the version.
    """
    metadata = json.loads(library.metadata)
    registry_metadata = json.loads(library.registry_metadata) if library.registry_metadata else None
    npm_description = registry_metadata.get('description', '') if registry_metadata else ''
    npm_keywords = registry_metadata.get('keywords', []) if registry_metadata else []
    fields = [
        search.AtomField(name='owner', value=owner),
        search.AtomField(name='github_owner', value=library.github_owner),
        search.TextField(name='repo', value=repo),
        search.AtomField(name='kind', value=library.kind),
        search.AtomField(name='version', value=version_key.id()),
        search.TextField(name='github_description', value=metadata.get('description', '')),
        search.TextField(name='bower_description', value=bower.get('description', '')),
        search.TextField(name='npm_description', value=npm_description),
        search.TextField(name='bower_keywords', value=' '.join(bower.get('keywords', []))),
        search.TextField(name='npm_keywords', value=' '.join(npm_keywords)),
        search.TextField(name='prefix_matches', value=' '.join(util.generate_prefixes_from_list(
            util.safe_split_strip(metadata.get('description')) +
            util.safe_split_strip(bower.get('description')) +
            util.safe_split_strip(repo)))),
    ]
    # Weighted terms: (value, weight) pairs, expanded below into a single
    # 'weighted_fields' field so frequent terms rank higher.
    weights = [(repo, 10)]

    def add_weighted_terms(field_name, terms):
        # Append a search field plus a weight entry for a non-empty term list.
        if terms:
            joined = ' '.join(terms)
            fields.append(search.TextField(name=field_name, value=joined))
            weights.append((joined, 5))

    analysis = Content.get_by_id('analysis', parent=version_key)
    if analysis is not None and analysis.status == Status.ready:
        # Bug fix: get_json() can legitimately return None (an empty analysis
        # payload is stored as set_json(None)); previously this crashed on
        # data.get(...).
        data = analysis.get_json() or {}
        if data.get('analyzerData', None) is not None:
            # Use analyzer data for search index
            analyzer_data = data.get('analyzerData', {})
            element_objects = analyzer_data.get('elements', [])
            add_weighted_terms('element', [element.get('tagname', '') or element.get('classname', '')
                                           for element in element_objects])
            behavior_objects = analyzer_data.get('metadata', {}).get('polymer', {}).get('behaviors', [])
            add_weighted_terms('behavior', [behavior.get('name', '') for behavior in behavior_objects])
        else:
            # Use hydrolysis data for search index
            add_weighted_terms('element', data.get('elementsByTagName', {}).keys())
            add_weighted_terms('behavior', data.get('behaviorsByName', {}).keys())
    weighted = []
    for value, weight in weights:
        weighted.extend([value] * weight)
    fields.append(search.TextField(name='weighted_fields', value=' '.join(weighted)))
    # Rank newer libraries higher: seconds elapsed since 2016-01-01.
    rank = int((library.updated - datetime.datetime(2016, 1, 1)).total_seconds())
    document = search.Document(doc_id=Library.id(owner, repo), fields=fields, rank=rank)
    index = search.Index('repo')
    index.put(document)
def full_async(library_key, tag=None, brief=False, assume_latest=False):
    """ndb tasklet: assemble the full (or brief) JSON-able dict for a library.

    Args:
        library_key: ndb.Key of the Library.
        tag: explicit version id; when None, the default version is resolved.
        brief: skip readme/activity/contributor payloads when True.
        assume_latest: with an explicit tag, skip re-fetching the version
            list (caller asserts the tag is the latest).

    Returns (via ndb.Return): dict of library/version data, or None when the
    library is missing/suppressed or the version cannot be loaded.
    """
    if assume_latest:
        assert tag is not None
    # Kick off all datastore reads before the first yield so they overlap.
    library_future = library_key.get_async()
    if tag is None or not brief or not assume_latest:
        versions_future = Library.versions_for_key_async(library_key)
    if tag is None:
        versions = yield versions_future
        default_version = versiontag.default_version(versions)
        version_key = None if len(versions) == 0 else ndb.Key(Library, library_key.id(), Version, default_version)
    else:
        version_key = ndb.Key(Library, library_key.id(), Version, tag)
    if version_key is not None:
        version_future = version_key.get_async()
        bower_future = Content.get_by_id_async('bower', parent=version_key)
        if not brief:
            readme_future = Content.get_by_id_async('readme.html', parent=version_key)
    library = yield library_future
    if library is None or library.status == Status.suppressed:
        raise ndb.Return(None)
    result = {}
    # Add NPM package fields. Keys starting with '@' are npm packages; the
    # '@@npm' pseudo-scope marks unscoped packages.
    key = library_key.string_id()
    if key.startswith('@'):
        parts = key.split('/')
        if parts[0] != '@@npm':
            result['npmScope'] = parts[0]
            result['npmFullPackage'] = key
        else:
            result['npmFullPackage'] = parts[1]
        result['npmPackage'] = parts[1]
        if library.migrated_from_bower:
            result['migratedFromBower'] = True
    elif library.npm_package:
        result['migratedToNpm'] = library.npm_package
    result['apiKey'] = key
    result['kind'] = library.kind
    result['status'] = library.status
    # Non-ready library: return early with just status (and error detail).
    if library.status != Status.ready:
        if library.status == Status.error:
            result['error'] = library.error
        raise ndb.Return(result)
    version = None
    if version_key is not None:
        version = yield version_future
    if version is None:
        raise ndb.Return(None)
    result['spdx_identifier'] = library.spdx_identifier
    result['version'] = version.key.id()
    # Non-ready version: surface its status/error instead of full data.
    if version.status != Status.ready:
        result['status'] = version.status
        if version.status == Status.error:
            result['error'] = version.error
        raise ndb.Return(result)
    if not brief or not assume_latest:
        versions = yield versions_future
        result['versions'] = versions
        if len(versions) > 0:
            result['default_version'] = versiontag.default_version(versions)
            # Remove latest_version once deployed clients all use default_version
            result['latest_version'] = versiontag.default_version(versions)
    if not brief and library.participation is not None:
        result['activity'] = json.loads(library.participation).get('all', [])
    if not brief and library.contributors is not None:
        contributors = []
        raw = json.loads(library.contributors)
        for contributor in raw:
            # Expose only the public fields of each contributor record.
            contributors.append({
                'login': contributor['login'],
                'avatar_url': contributor['avatar_url'],
                'contributions': contributor['contributions'],
            })
        result['contributors'] = contributors
    if library.metadata is not None:
        metadata = json.loads(library.metadata)
        result['description'] = metadata.get('description', '')
        result['subscribers'] = metadata.get('subscribers_count', 0)
        result['stars'] = metadata.get('stargazers_count', 0)
        result['forks'] = metadata.get('forks', 0)
        result['open_issues'] = metadata.get('open_issues', 0)
        result['updated_at'] = metadata.get('updated_at', 0)
        result['owner'] = metadata['owner']['login']
        result['avatar_url'] = metadata['owner'].get('avatar_url', '')
        result['repo'] = metadata['name']
        # Only expose http(s) homepages.
        if metadata.get('homepage') and re.match(r'https?', metadata.get('homepage')):
            result['homepage'] = metadata['homepage']
        result['default_branch'] = metadata.get('default_branch', 'master')
    if not brief:
        readme = yield readme_future
        result['readme'] = None if readme is None else readme.content
    # bower_future is created whenever version_key exists (even for brief),
    # so it is consumed unconditionally here.
    bower = yield bower_future
    if bower is not None:
        bower_json = bower.get_json()
        dependencies = bower_json.get('dependencies', {})
        result['dependency_count'] = len(dependencies)
        result['bower'] = {
            'license': bower_json.get('license', ''),
            'dependencies': dependencies,
            'keywords': bower_json.get('keywords', []),
            'demos': bower_json.get('demos', {}),
            'pages': bower_json.get('pages', {}),
        }
        # Fall back to the bower description when GitHub provided none.
        if result.get('description', '') == '':
            result['description'] = bower_json.get('description', '')
    raise ndb.Return(result)
def get(self, owner, repo, tag, name=None, path=None):
    """Resolve and serve a bower dependency file for a library version.

    Uses the analysis Content's bowerDependencies to map a dependency name
    to its pinned owner/repo/version, then proxies HTML files (so relative
    urls resolve same-origin) or 301-redirects other file types to the CDN.
    With name/path omitted, writes the resolution table for debugging.
    """
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    self.response.headers['Content-Type'] = 'application/json'
    if name is None and path is None:
        self.response.write(
            'Invalid request. Try using a relative path if you are using an absolute path.'
        )
        self.response.set_status(400)
        return
    owner = owner.lower()
    repo = repo.lower()
    version_key = ndb.Key(Library, '%s/%s' % (owner, repo), Version, tag)
    # Bug fix: path may be None here (debug mode allows name without path),
    # so guard before dereferencing it. Previously this raised AttributeError.
    if path is not None and path.endswith('/'):
        path = path + 'index.html'
    analysis = Content.get_by_id('analysis', parent=version_key, read_policy=ndb.EVENTUAL_CONSISTENCY)
    if analysis is None or analysis.status != Status.ready:
        self.response.write('could not find analysis for %s in %s/%s' % (tag, owner, repo))
        self.response.set_status(404)
        return
    # get_json() may return None when the stored payload was empty.
    analysis_json = analysis.get_json() or {}
    dependencies = analysis_json.get('bowerDependencies', None)
    if dependencies is None:
        self.response.write('could not find dependencies for %s in %s/%s' % (tag, owner, repo))
        self.response.set_status(404)
        return
    # Map dependency name -> 'owner/repo/version' used to build CDN urls.
    config_map = {}
    for dependency in dependencies:
        if dependency['owner'] == owner and dependency['repo'] == repo:
            continue
        config_map[dependency['name']] = '%s/%s/%s' % (
            dependency['owner'], dependency['repo'], dependency['version'])
    # Ensure the repo serves its own version.
    config_map[repo] = '%s/%s/%s' % (owner, repo, tag)

    def resolve(name, path):
        return 'https://cdn.rawgit.com/%s%s' % (
            config_map[name], path) if name in config_map else None

    # debug mode
    if name is None or path is None:
        for k in config_map:
            self.response.write('/%s/%s/%s/components/%s/... -> %s\n' % (owner, repo, tag, k, resolve(k, '/...')))
        self.response.write('\n')
        return
    resolved = resolve(name, path)
    if resolved is None:
        self.response.write('%s is not a valid dependency for %s/%s#%s' % (name, owner, repo, tag))
        self.response.set_status(400)
        return
    # TODO: Figure out what other types this is necessary for. eg. do we need it for CSS @import?
    # We need to serve html files from the same origin, so that relative urls load correctly.
    if path.endswith('.html'):
        # TODO: Decide whether this should be memcached. Appengine's urlfetch already does caching.
        response = urlfetch.fetch(resolved, validate_certificate=True)
        if response.status_code == 200:
            self.response.write(response.content)
            self.response.headers['cache-control'] = response.headers.get(
                'cache-control', 'max-age=315569000')
            self.response.headers['content-type'] = response.headers.get(
                'content-type', 'text/html')
        else:
            self.response.write('could not fetch: %s' % resolved)
            self.response.set_status(400)
    else:
        self.response.set_status(301)
        self.response.headers['Location'] = str(resolved)
        self.response.headers['cache-control'] = 'max-age=315569000'
def full_async(library_key, tag=None, brief=False, assume_latest=False):
    """ndb tasklet: assemble the full (or brief) JSON-able dict for a library.

    Args:
        library_key: ndb.Key of the Library.
        tag: explicit version id; when None, the default version is resolved.
        brief: skip readme/activity/contributor payloads when True.
        assume_latest: with an explicit tag, skip re-fetching the version list.

    Returns (via ndb.Return): dict of library/version data, or None when the
    library is missing/suppressed or the version cannot be loaded.

    Fix: GitHub metadata fields are now read with .get() and defaults
    (matching the newer full_async variant) instead of bare indexing, so
    sparse metadata no longer raises KeyError mid-request.
    """
    if assume_latest:
        assert tag is not None
    # Kick off all datastore reads before the first yield so they overlap.
    library_future = library_key.get_async()
    if tag is None or not brief or not assume_latest:
        versions_future = Library.versions_for_key_async(library_key)
    if tag is None:
        versions = yield versions_future
        default_version = versiontag.default_version(versions)
        version_key = None if len(versions) == 0 else ndb.Key(
            Library, library_key.id(), Version, default_version)
    else:
        version_key = ndb.Key(Library, library_key.id(), Version, tag)
    if version_key is not None:
        version_future = version_key.get_async()
        bower_future = Content.get_by_id_async('bower', parent=version_key)
        if not brief:
            readme_future = Content.get_by_id_async('readme.html', parent=version_key)
    library = yield library_future
    if library is None or library.status == Status.suppressed:
        raise ndb.Return(None)
    result = {}
    result['kind'] = library.kind
    result['status'] = library.status
    # Non-ready library: return early with just status (and error detail).
    if library.status != Status.ready:
        if library.status == Status.error:
            result['error'] = library.error
        raise ndb.Return(result)
    version = None
    if version_key is not None:
        version = yield version_future
    if version is None:
        raise ndb.Return(None)
    result['spdx_identifier'] = library.spdx_identifier
    result['version'] = version.key.id()
    # Non-ready version: surface its status/error instead of full data.
    if version.status != Status.ready:
        result['status'] = version.status
        if version.status == Status.error:
            result['error'] = version.error
        raise ndb.Return(result)
    if not brief or not assume_latest:
        versions = yield versions_future
        result['versions'] = versions
        if len(versions) > 0:
            result['default_version'] = versiontag.default_version(versions)
            # Remove latest_version once deployed clients all use default_version
            result['latest_version'] = versiontag.default_version(versions)
    if not brief and library.participation is not None:
        result['activity'] = json.loads(library.participation).get('all', [])
    if not brief and library.contributors is not None:
        contributors = []
        raw = json.loads(library.contributors)
        for contributor in raw:
            # Expose only the public fields of each contributor record.
            contributors.append({
                'login': contributor['login'],
                'avatar_url': contributor['avatar_url'],
                'contributions': contributor['contributions'],
            })
        result['contributors'] = contributors
    if library.metadata is not None:
        metadata = json.loads(library.metadata)
        result['description'] = metadata.get('description', '')
        result['subscribers'] = metadata.get('subscribers_count', 0)
        result['stars'] = metadata.get('stargazers_count', 0)
        result['forks'] = metadata.get('forks', 0)
        result['open_issues'] = metadata.get('open_issues', 0)
        result['updated_at'] = metadata.get('updated_at', 0)
        result['owner'] = metadata['owner']['login']
        result['avatar_url'] = metadata['owner'].get('avatar_url', '')
        result['repo'] = metadata['name']
        result['homepage'] = metadata.get('homepage')
        result['default_branch'] = metadata.get('default_branch', 'master')
    if not brief:
        readme = yield readme_future
        result['readme'] = None if readme is None else readme.content
    # bower_future is created whenever version_key exists (even for brief),
    # so it is consumed unconditionally here.
    bower = yield bower_future
    if bower is not None:
        bower_json = json.loads(bower.content)
        # Default to {} (not []) for consistency with the newer variant;
        # len() and iteration behave the same when the key is absent.
        dependencies = bower_json.get('dependencies', {})
        result['dependency_count'] = len(dependencies)
        result['bower'] = {
            'license': bower_json.get('license', ''),
            'dependencies': dependencies,
            'keywords': bower_json.get('keywords', []),
        }
        # Fall back to the bower description when GitHub provided none.
        if result.get('description', '') == '':
            result['description'] = bower_json.get('description', '')
    raise ndb.Return(result)
def get(self, owner, repo, ver=None):
    """Serve library metadata JSON for one version (defaulting to the latest).

    Assembles GitHub metadata, the stored bower.json and rendered readme,
    matching collections, and — for collections — resolved dependency
    metadata, then writes the result as JSON with CORS enabled.
    Responds 404 when the library or version is missing or errored.
    """
    owner = owner.lower()
    repo = repo.lower()
    library = Library.get_by_id('%s/%s' % (owner, repo), read_policy=ndb.EVENTUAL_CONSISTENCY)
    if library is None or library.error is not None:
        self.response.write(str(library))
        self.response.set_status(404)
        return
    versions = library.versions()
    # No explicit version: use the last entry. Assumes versions() is sorted
    # ascending so [-1] is the latest — TODO confirm.
    if ver is None:
        ver = versions[-1]
    version = Version.get_by_id(ver, parent=library.key, read_policy=ndb.EVENTUAL_CONSISTENCY)
    if version is None or version.error is not None:
        self.response.write(str(version))
        self.response.set_status(404)
        return
    metadata = json.loads(library.metadata)
    dependencies = []
    bower = Content.get_by_id('bower', parent=version.key, read_policy=ndb.EVENTUAL_CONSISTENCY)
    if bower is not None:
        try:
            bower_json = json.loads(bower.content)
        # TODO: Which exception is this for?
        # pylint: disable=bare-except
        # NOTE(review): broad except deliberately tolerates malformed stored
        # bower content; tighten to ValueError once confirmed safe.
        except:
            bower_json = {}
    readme = Content.get_by_id('readme.html', parent=version.key, read_policy=ndb.EVENTUAL_CONSISTENCY)
    # Split 'owner/repo' out of the GitHub full_name.
    # NOTE(review): metadata['full_name'] and the count fields below are
    # indexed directly and would raise KeyError on sparse metadata.
    full_name_match = re.match(r'(.*)/(.*)', metadata['full_name'])
    result = {
        'version': ver,
        'versions': versions,
        'readme': None if readme is None else readme.content,
        'subscribers': metadata['subscribers_count'],
        'stars': metadata['stargazers_count'],
        'forks': metadata['forks'],
        'contributors': library.contributor_count,
        'open_issues': metadata['open_issues'],
        'updated_at': metadata['updated_at'],
        'owner': full_name_match.groups()[0],
        'repo': full_name_match.groups()[1],
        'bower': None if bower is None else {
            'description': bower_json.get('description', ''),
            'license': bower_json.get('license', ''),
            'dependencies': bower_json.get('dependencies', []),
            'keywords': bower_json.get('keywords', []),
        },
        'collections': []
    }
    # Attach every collection whose semver range matches this version.
    for collection in library.collections:
        if not versiontag.match(ver, collection.semver):
            continue
        collection_version = collection.version.id()
        collection_library = collection.version.parent().get()
        collection_metadata = json.loads(collection_library.metadata)
        collection_name_match = re.match(r'(.*)/(.*)', collection_metadata['full_name'])
        result['collections'].append({
            'owner': collection_name_match.groups()[0],
            'repo': collection_name_match.groups()[1],
            'version': collection_version
        })
    if library.kind == 'collection':
        dependencies = []
        version_futures = []
        # First pass: start all version-list fetches so they run in parallel.
        for dep in version.dependencies:
            parsed_dep = Dependency.fromString(dep)
            dep_key = ndb.Key(Library, "%s/%s" % (parsed_dep.owner.lower(), parsed_dep.repo.lower()))
            version_futures.append(Library.versions_for_key_async(dep_key))
        # Second pass: pick the newest version satisfying each dep's semver.
        for i, dep in enumerate(version.dependencies):
            parsed_dep = Dependency.fromString(dep)
            versions = version_futures[i].get_result()
            versions.reverse()
            while len(versions) > 0 and not versiontag.match(versions[0], parsed_dep.version):
                versions.pop()
            if len(versions) == 0:
                dependencies.append({
                    'error': 'unsatisfyable dependency',
                    'owner': parsed_dep.owner,
                    'repo': parsed_dep.repo,
                    'versionSpec': parsed_dep.version
                })
            else:
                dependencies.append(brief_metadata_from_datastore(parsed_dep.owner, parsed_dep.repo, versions[0]))
        result['dependencies'] = dependencies
    self.response.headers['Access-Control-Allow-Origin'] = '*'
    self.response.headers['Content-Type'] = 'application/json'
    self.response.write(json.dumps(result))