def feed_backend(url, clean, fetch_archive, backend_name, backend_params,
                 es_index=None, es_index_enrich=None, project=None, arthur=False):
    """Feed Ocean (the raw ElasticSearch index) with data from a perceval backend.

    :param url: URL of the ElasticSearch server
    :param clean: if True, remove the index before writing (forced to False
        when es_index is given, because a named index could be shared)
    :param fetch_archive: fetch items from a perceval archive instead of the origin
    :param backend_name: name of the perceval backend (e.g. "git")
    :param backend_params: CLI-style argument list for the backend command
    :param es_index: name of the raw index to write to
    :param es_index_enrich: name of the enriched index (unused here; kept for API symmetry)
    :param project: project tag to attach to the stored items
    :param arthur: if True, collect the items through King Arthur instead of directly

    Errors are logged (not raised) so a failing repository does not stop a batch run.
    """
    backend = None
    repo = {'backend_name': backend_name,
            'backend_params': backend_params}  # repository data to be stored in conf

    if es_index:
        clean = False  # don't remove index, it could be shared

    # Resolve the connector once; the previous code looked it up twice.
    connector = get_connector_from_name(backend_name)
    if not connector:
        raise RuntimeError("Unknown backend %s" % backend_name)
    klass = connector[3]  # BackendCmd for the connector

    try:
        logger.info("Feeding Ocean from %s (%s)", backend_name, es_index)

        if not es_index:
            logger.error("Raw index not defined for %s", backend_name)

        repo['repo_update_start'] = datetime.now().isoformat()

        # perceval backends fetch params
        offset = None
        from_date = None
        category = None
        latest_items = None

        backend_cmd = klass(*backend_params)

        parsed_args = vars(backend_cmd.parsed_args)
        init_args = find_signature_parameters(backend_cmd.BACKEND, parsed_args)

        if backend_cmd.archive_manager and fetch_archive:
            archive = Archive(parsed_args['archive_path'])
        else:
            archive = backend_cmd.archive_manager.create_archive() if backend_cmd.archive_manager else None

        init_args['archive'] = archive
        backend_cmd.backend = backend_cmd.BACKEND(**init_args)
        backend = backend_cmd.backend

        ocean_backend = connector[1](backend, fetch_archive=fetch_archive, project=project)
        elastic_ocean = get_elastic(url, es_index, clean, ocean_backend)
        ocean_backend.set_elastic(elastic_ocean)

        # Inspect the fetch signature to know which params this backend supports.
        if fetch_archive:
            signature = inspect.signature(backend.fetch_from_archive)
        else:
            signature = inspect.signature(backend.fetch)

        if 'from_date' in signature.parameters:
            try:
                # Support perceval pre and post BackendCommand refactoring
                from_date = backend_cmd.from_date
            except AttributeError:
                from_date = backend_cmd.parsed_args.from_date

        if 'offset' in signature.parameters:
            try:
                offset = backend_cmd.offset
            except AttributeError:
                offset = backend_cmd.parsed_args.offset

        if 'category' in signature.parameters:
            try:
                category = backend_cmd.category
            except AttributeError:
                try:
                    category = backend_cmd.parsed_args.category
                except AttributeError:
                    pass

        if 'latest_items' in signature.parameters:
            try:
                latest_items = backend_cmd.latest_items
            except AttributeError:
                latest_items = backend_cmd.parsed_args.latest_items

        # fetch params support: the branches are mutually exclusive, in
        # priority order arthur > latest_items > offset > from_date > category.
        if arthur:
            # If using arthur just provide the items generator to be used
            # to collect the items and upload to Elasticsearch
            aitems = feed_backend_arthur(backend_name, backend_params)
            ocean_backend.feed(arthur_items=aitems)
        elif latest_items:
            if category:
                ocean_backend.feed(latest_items=latest_items, category=category)
            else:
                ocean_backend.feed(latest_items=latest_items)
        elif offset:
            if category:
                ocean_backend.feed(from_offset=offset, category=category)
            else:
                ocean_backend.feed(from_offset=offset)
        elif from_date and from_date.replace(tzinfo=None) != parser.parse("1970-01-01"):
            # Epoch means "no from_date given"; only honour a real date.
            if category:
                ocean_backend.feed(from_date, category=category)
            else:
                ocean_backend.feed(from_date)
        elif category:
            ocean_backend.feed(category=category)
        else:
            ocean_backend.feed()
    except Exception as ex:
        if backend:
            logger.error("Error feeding ocean from %s (%s): %s",
                         backend_name, backend.origin, ex)
            # this print makes blackbird fails
            traceback.print_exc()
        else:
            # Lazy logging args instead of eager % formatting.
            logger.error("Error feeding ocean %s", ex)
            traceback.print_exc()

    logger.info("Done %s ", backend_name)
def feed_backend(url, clean, fetch_archive, backend_name, backend_params,
                 es_index=None, es_index_enrich=None, project=None, es_aliases=None,
                 projects_json_repo=None, repo_labels=None, anonymize=False):
    """Feed Ocean (the raw ElasticSearch index) with data from a perceval backend.

    :param url: URL of the ElasticSearch server
    :param clean: if True, remove the index before writing (forced to False
        when es_index is given, because a named index could be shared)
    :param fetch_archive: fetch items from a perceval archive instead of the origin
    :param backend_name: name of the perceval backend (e.g. "git")
    :param backend_params: CLI-style argument list for the backend command
    :param es_index: name of the raw index to write to
    :param es_index_enrich: name of the enriched index (unused here; kept for API symmetry)
    :param project: project tag to attach to the stored items
    :param es_aliases: aliases to attach to the raw index
    :param projects_json_repo: repo entry in projects.json these items belong to
    :param repo_labels: labels to attach to the stored items
    :param anonymize: if True, anonymize personal data in the raw items
    :returns: an error message string, or None on success (errors are logged,
        not raised, so a failing repository does not stop a batch run)
    """
    error_msg = None
    backend = None
    repo = {'backend_name': backend_name,
            'backend_params': backend_params}  # repository data to be stored in conf

    if es_index:
        clean = False  # don't remove index, it could be shared

    # Resolve the connector once; the previous code looked it up twice.
    connector = get_connector_from_name(backend_name)
    if not connector:
        raise RuntimeError("Unknown backend {}".format(backend_name))
    klass = connector[3]  # BackendCmd for the connector

    try:
        logger.debug("Feeding raw from {} ({})".format(backend_name, es_index))

        if not es_index:
            logger.error("Raw index not defined for {}".format(backend_name))

        repo['repo_update_start'] = datetime.now().isoformat()

        # perceval backends fetch params
        offset = None
        from_date = None
        category = None
        branches = None
        latest_items = None
        filter_classified = None

        backend_cmd = klass(*backend_params)

        parsed_args = vars(backend_cmd.parsed_args)
        init_args = find_signature_parameters(backend_cmd.BACKEND, parsed_args)

        if backend_cmd.archive_manager and fetch_archive:
            archive = Archive(parsed_args['archive_path'])
        else:
            archive = backend_cmd.archive_manager.create_archive() if backend_cmd.archive_manager else None

        init_args['archive'] = archive
        backend_cmd.backend = backend_cmd.BACKEND(**init_args)
        backend = backend_cmd.backend

        ocean_backend = connector[1](backend, fetch_archive=fetch_archive,
                                     project=project, anonymize=anonymize)
        elastic_ocean = get_elastic(url, es_index, clean, ocean_backend, es_aliases)
        ocean_backend.set_elastic(elastic_ocean)
        ocean_backend.set_repo_labels(repo_labels)
        ocean_backend.set_projects_json_repo(projects_json_repo)

        # Inspect the fetch signature to know which params this backend supports.
        if fetch_archive:
            signature = inspect.signature(backend.fetch_from_archive)
        else:
            signature = inspect.signature(backend.fetch)

        if 'from_date' in signature.parameters:
            try:
                # Support perceval pre and post BackendCommand refactoring
                from_date = backend_cmd.from_date
            except AttributeError:
                from_date = backend_cmd.parsed_args.from_date

        if 'offset' in signature.parameters:
            try:
                offset = backend_cmd.offset
            except AttributeError:
                offset = backend_cmd.parsed_args.offset

        if 'category' in signature.parameters:
            try:
                category = backend_cmd.category
            except AttributeError:
                try:
                    category = backend_cmd.parsed_args.category
                except AttributeError:
                    pass

        if 'branches' in signature.parameters:
            try:
                branches = backend_cmd.branches
            except AttributeError:
                try:
                    branches = backend_cmd.parsed_args.branches
                except AttributeError:
                    pass

        if 'filter_classified' in signature.parameters:
            try:
                filter_classified = backend_cmd.parsed_args.filter_classified
            except AttributeError:
                pass

        if 'latest_items' in signature.parameters:
            try:
                latest_items = backend_cmd.latest_items
            except AttributeError:
                latest_items = backend_cmd.parsed_args.latest_items

        # Build the feed kwargs from whatever the backend supports.
        params = {}

        if latest_items:
            params['latest_items'] = latest_items
        if category:
            params['category'] = category
        if branches:
            params['branches'] = branches
        if filter_classified:
            params['filter_classified'] = filter_classified
        if from_date and (from_date.replace(tzinfo=None) != parser.parse("1970-01-01")):
            # Epoch means "no from_date given"; only honour a real date.
            params['from_date'] = from_date
        if offset:
            params['from_offset'] = offset

        ocean_backend.feed(**params)
    except RateLimitError as ex:
        # `backend` may still be None if the rate limit was hit while the
        # backend command was being constructed; don't dereference it blindly.
        origin = backend.origin if backend else backend_name
        logger.error("Error feeding raw from {} ({}): rate limit exceeded".format(
            backend_name, origin))
        error_msg = "RateLimitError: seconds to reset {}".format(ex.seconds_to_reset)
    except Exception as ex:
        if backend:
            error_msg = "Error feeding raw from {} ({}): {}".format(
                backend_name, backend.origin, ex)
            logger.error(error_msg, exc_info=True)
        else:
            error_msg = "Error feeding raw from {}".format(ex)
            logger.error(error_msg, exc_info=True)

    # Bug fix: previously this line read backend.origin unconditionally, so a
    # failure before `backend` was assigned raised AttributeError here and the
    # error_msg return never happened.
    logger.info("[{}] Done collection for {}".format(
        backend_name, backend.origin if backend else backend_name))

    return error_msg
def feed_backend(url, clean, fetch_archive, backend_name, backend_params,
                 es_index=None, es_index_enrich=None, project=None, es_aliases=None,
                 projects_json_repo=None, repo_labels=None):
    """Feed Ocean (the raw ElasticSearch index) with data from a perceval backend.

    :param url: URL of the ElasticSearch server
    :param clean: if True, remove the index before writing (forced to False
        when es_index is given, because a named index could be shared)
    :param fetch_archive: fetch items from a perceval archive instead of the origin
    :param backend_name: name of the perceval backend (e.g. "github")
    :param backend_params: CLI-style argument list for the backend command
    :param es_index: name of the raw index to write to (e.g. "github_test-raw")
    :param es_index_enrich: name of the enriched index (unused here; kept for API symmetry)
    :param project: project tag to attach to the stored items
    :param es_aliases: aliases to attach to the raw index
    :param projects_json_repo: repo entry in projects.json these items belong to
    :param repo_labels: labels to attach to the stored items
    :returns: an error message string, or None on success (errors are logged,
        not raised, so a failing repository does not stop a batch run)
    """
    error_msg = None
    backend = None
    repo = {
        'backend_name': backend_name,
        'backend_params': backend_params
    }  # repository data to be stored in conf

    if es_index:
        clean = False  # don't remove index, it could be shared

    # The connector is a 4-tuple of classes, e.g. for GitHub:
    # [GitHub, GitHubOcean, GitHubEnrich, GitHubCommand] — see utils for details.
    # Resolve it once; the previous code looked it up twice.
    connector = get_connector_from_name(backend_name)
    if not connector:
        raise RuntimeError("Unknown backend {}".format(backend_name))

    # klass is the perceval command class (e.g. GitHubCommand)
    klass = connector[3]  # BackendCmd for the connector

    try:
        # e.g. "Feeding raw from github (github_test-raw)"
        logger.debug("Feeding raw from {} ({})".format(backend_name, es_index))

        if not es_index:
            logger.error("Raw index not defined for {}".format(backend_name))

        repo['repo_update_start'] = datetime.now().isoformat()

        # perceval backends fetch params
        offset = None
        from_date = None
        category = None
        branches = None
        latest_items = None
        filter_classified = None

        # backend_cmd is an instance of the perceval command class
        # (e.g. perceval.backends.core.github.GitHubCommand)
        backend_cmd = klass(*backend_params)

        # perceval arguments parsed from backend_params
        parsed_args = vars(backend_cmd.parsed_args)
        init_args = find_signature_parameters(backend_cmd.BACKEND, parsed_args)

        if backend_cmd.archive_manager and fetch_archive:
            archive = Archive(parsed_args['archive_path'])
        else:
            archive = backend_cmd.archive_manager.create_archive(
            ) if backend_cmd.archive_manager else None

        init_args['archive'] = archive

        # BACKEND is the perceval backend class (e.g. GitHub), so `backend`
        # is an instance of that class
        backend_cmd.backend = backend_cmd.BACKEND(**init_args)
        backend = backend_cmd.backend

        # connector[1] is the Ocean class (e.g. GitHubOcean), which inherits
        # from ElasticOcean, which inherits from ElasticItems; most of the
        # initialization happens in ElasticItems
        ocean_backend = connector[1](backend,
                                     fetch_archive=fetch_archive, project=project)

        # Returns an ElasticSearch client instance used to talk to ES
        elastic_ocean = get_elastic(url, es_index, clean, ocean_backend, es_aliases)
        ocean_backend.set_elastic(elastic_ocean)
        ocean_backend.set_repo_labels(repo_labels)
        ocean_backend.set_projects_json_repo(projects_json_repo)

        # Inspect the fetch signature to know which params this backend supports.
        if fetch_archive:
            signature = inspect.signature(backend.fetch_from_archive)
        else:
            signature = inspect.signature(backend.fetch)

        if 'from_date' in signature.parameters:
            try:
                # Support perceval pre and post BackendCommand refactoring
                from_date = backend_cmd.from_date
            except AttributeError:
                from_date = backend_cmd.parsed_args.from_date

        if 'offset' in signature.parameters:
            try:
                offset = backend_cmd.offset
            except AttributeError:
                offset = backend_cmd.parsed_args.offset

        if 'category' in signature.parameters:
            try:
                category = backend_cmd.category
            except AttributeError:
                try:
                    category = backend_cmd.parsed_args.category
                except AttributeError:
                    pass

        if 'branches' in signature.parameters:
            try:
                branches = backend_cmd.branches
            except AttributeError:
                try:
                    branches = backend_cmd.parsed_args.branches
                except AttributeError:
                    pass

        if 'filter_classified' in signature.parameters:
            try:
                filter_classified = backend_cmd.parsed_args.filter_classified
            except AttributeError:
                pass

        if 'latest_items' in signature.parameters:
            try:
                latest_items = backend_cmd.latest_items
            except AttributeError:
                latest_items = backend_cmd.parsed_args.latest_items

        # Build the feed kwargs from whatever the backend supports.
        params = {}

        if latest_items:
            params['latest_items'] = latest_items
        if category:
            params['category'] = category
        if branches:
            params['branches'] = branches
        if filter_classified:
            params['filter_classified'] = filter_classified
        if from_date and (from_date.replace(tzinfo=None) != parser.parse("1970-01-01")):
            # Epoch means "no from_date given"; only honour a real date.
            params['from_date'] = from_date
        if offset:
            params['from_offset'] = offset

        # The main call: fetch items from the origin through perceval and
        # feed them into ElasticSearch
        ocean_backend.feed(**params)
    except RateLimitError as ex:
        # `backend` may still be None if the rate limit was hit while the
        # backend command was being constructed; don't dereference it blindly.
        origin = backend.origin if backend else backend_name
        logger.error(
            "Error feeding raw from {} ({}): rate limit exceeded".format(
                backend_name, origin))
        error_msg = "RateLimitError: seconds to reset {}".format(
            ex.seconds_to_reset)
    except Exception as ex:
        if backend:
            error_msg = "Error feeding raw from {} ({}): {}".format(
                backend_name, backend.origin, ex)
            logger.error(error_msg, exc_info=True)
        else:
            error_msg = "Error feeding raw from {}".format(ex)
            logger.error(error_msg, exc_info=True)

    # Bug fix: previously this line read backend.origin unconditionally, so a
    # failure before `backend` was assigned raised AttributeError here and the
    # error_msg return never happened.
    logger.info("[{}] Done collection for {}".format(
        backend_name, backend.origin if backend else backend_name))

    return error_msg