Exemple #1
0
def feed_backend(url, clean, fetch_archive, backend_name, backend_params,
                 es_index=None, es_index_enrich=None, project=None, arthur=False):
    """Feed Ocean (the raw index) with the data of a backend.

    The backend command class registered for `backend_name` is built from
    `backend_params`, its backend is wrapped in the matching Ocean connector
    and the connector's `feed` method is called with the fetch parameters
    (`latest_items`, `offset`, `from_date`, `category`) that the backend's
    fetch method declares in its signature.

    Any exception raised while preparing or feeding is logged together with
    its traceback and swallowed; the function then returns normally.

    :param url: Elasticsearch URL handed to `get_elastic`
    :param clean: handed to `get_elastic`; forced to False when `es_index`
        is given because the index could be shared
    :param fetch_archive: when true the archive at the parsed `archive_path`
        is used and the signature of `fetch_from_archive` (instead of
        `fetch`) decides which parameters are supported
    :param backend_name: name used to look up the connector
    :param backend_params: positional arguments for the backend command
    :param es_index: name of the raw index
    :param es_index_enrich: not used by this function
    :param project: handed to the Ocean connector
    :param arthur: when true, items come from `feed_backend_arthur` instead
        of the backend fetch parameters

    :raises RuntimeError: when no connector exists for `backend_name`
    """
    backend = None
    # Repository data to be stored in conf.
    # NOTE(review): nothing in this function reads `repo` after it is filled.
    repo = {'backend_name': backend_name, 'backend_params': backend_params}

    if es_index:
        clean = False  # don't remove index, it could be shared

    # Look the connector up once and reuse the result.
    connector = get_connector_from_name(backend_name)
    if not connector:
        raise RuntimeError("Unknown backend %s" % backend_name)
    klass = connector[3]  # BackendCmd for the connector

    try:
        logger.info("Feeding Ocean from %s (%s)", backend_name, es_index)

        if not es_index:
            # Only reported; the feeding is still attempted.
            logger.error("Raw index not defined for %s", backend_name)

        repo['repo_update_start'] = datetime.now().isoformat()

        # perceval backends fetch params
        offset = None
        from_date = None
        category = None
        latest_items = None

        backend_cmd = klass(*backend_params)

        parsed_args = vars(backend_cmd.parsed_args)
        init_args = find_signature_parameters(backend_cmd.BACKEND,
                                              parsed_args)

        if backend_cmd.archive_manager and fetch_archive:
            archive = Archive(parsed_args['archive_path'])
        elif backend_cmd.archive_manager:
            archive = backend_cmd.archive_manager.create_archive()
        else:
            archive = None

        init_args['archive'] = archive
        backend_cmd.backend = backend_cmd.BACKEND(**init_args)
        backend = backend_cmd.backend

        ocean_backend = connector[1](backend, fetch_archive=fetch_archive, project=project)
        elastic_ocean = get_elastic(url, es_index, clean, ocean_backend)
        ocean_backend.set_elastic(elastic_ocean)

        # The signature of the method that will actually be used tells which
        # fetch parameters this backend supports.
        if fetch_archive:
            signature = inspect.signature(backend.fetch_from_archive)
        else:
            signature = inspect.signature(backend.fetch)

        if 'from_date' in signature.parameters:
            try:
                # Support perceval pre and post BackendCommand refactoring
                from_date = backend_cmd.from_date
            except AttributeError:
                from_date = backend_cmd.parsed_args.from_date

        if 'offset' in signature.parameters:
            try:
                offset = backend_cmd.offset
            except AttributeError:
                offset = backend_cmd.parsed_args.offset

        if 'category' in signature.parameters:
            try:
                category = backend_cmd.category
            except AttributeError:
                try:
                    category = backend_cmd.parsed_args.category
                except AttributeError:
                    # Category is optional: keep None when it is not available
                    pass

        if 'latest_items' in signature.parameters:
            try:
                latest_items = backend_cmd.latest_items
            except AttributeError:
                latest_items = backend_cmd.parsed_args.latest_items

        # The category, when set, goes along with whichever mode is selected
        # below (except arthur, which only receives the items generator).
        category_kwargs = {'category': category} if category else {}

        # fetch params support
        if arthur:
            # If using arthur just provide the items generator to be used
            # to collect the items and upload to Elasticsearch
            aitems = feed_backend_arthur(backend_name, backend_params)
            ocean_backend.feed(arthur_items=aitems)
        elif latest_items:
            ocean_backend.feed(latest_items=latest_items, **category_kwargs)
        elif offset:
            ocean_backend.feed(from_offset=offset, **category_kwargs)
        elif from_date and from_date.replace(tzinfo=None) != parser.parse("1970-01-01"):
            # A from_date equal to 1970-01-01 is treated as "no date given"
            ocean_backend.feed(from_date, **category_kwargs)
        else:
            ocean_backend.feed(**category_kwargs)

    except Exception as ex:
        # Report through the logger (traceback included) instead of printing
        # the traceback directly to stderr.
        if backend:
            logger.error("Error feeding ocean from %s (%s): %s",
                         backend_name, backend.origin, ex, exc_info=True)
        else:
            logger.error("Error feeding ocean %s", ex, exc_info=True)

    logger.info("Done %s ", backend_name)
Exemple #2
0
def feed_backend(url, clean, fetch_archive, backend_name, backend_params,
                 es_index=None, es_index_enrich=None, project=None,
                 es_aliases=None, projects_json_repo=None, repo_labels=None,
                 anonymize=False):
    """Feed Ocean (the raw index) with the data of a backend.

    The backend command class registered for `backend_name` is built from
    `backend_params`, its backend is wrapped in the matching Ocean connector
    and the connector's `feed` method is called with the fetch parameters
    (`latest_items`, `category`, `branches`, `filter_classified`,
    `from_date`, `offset`) that the backend's fetch method declares in its
    signature and that have a truthy value.

    Exceptions raised while preparing or feeding are logged and turned into
    an error message instead of being propagated.

    :param url: Elasticsearch URL handed to `get_elastic`
    :param clean: handed to `get_elastic`; forced to False when `es_index`
        is given because the index could be shared
    :param fetch_archive: when true the archive at the parsed `archive_path`
        is used and the signature of `fetch_from_archive` (instead of
        `fetch`) decides which parameters are supported
    :param backend_name: name used to look up the connector
    :param backend_params: positional arguments for the backend command
    :param es_index: name of the raw index
    :param es_index_enrich: not used by this function
    :param project: handed to the Ocean connector
    :param es_aliases: handed to `get_elastic`
    :param projects_json_repo: handed to the connector's
        `set_projects_json_repo`
    :param repo_labels: handed to the connector's `set_repo_labels`
    :param anonymize: handed to the Ocean connector

    :returns: None when feeding finished without errors, otherwise a string
        describing the error
    :raises RuntimeError: when no connector exists for `backend_name`
    """
    error_msg = None
    backend = None
    # Repository data to be stored in conf.
    # NOTE(review): nothing in this function reads `repo` after it is filled.
    repo = {'backend_name': backend_name, 'backend_params': backend_params}

    if es_index:
        clean = False  # don't remove index, it could be shared

    # Look the connector up once and reuse the result.
    connector = get_connector_from_name(backend_name)
    if not connector:
        raise RuntimeError("Unknown backend {}".format(backend_name))
    klass = connector[3]  # BackendCmd for the connector

    try:
        logger.debug("Feeding raw from {} ({})".format(backend_name, es_index))

        if not es_index:
            # Only reported; the feeding is still attempted.
            logger.error("Raw index not defined for {}".format(backend_name))

        repo['repo_update_start'] = datetime.now().isoformat()

        # perceval backends fetch params
        offset = None
        from_date = None
        category = None
        branches = None
        latest_items = None
        filter_classified = None

        backend_cmd = klass(*backend_params)

        parsed_args = vars(backend_cmd.parsed_args)
        init_args = find_signature_parameters(backend_cmd.BACKEND,
                                              parsed_args)

        if backend_cmd.archive_manager and fetch_archive:
            archive = Archive(parsed_args['archive_path'])
        elif backend_cmd.archive_manager:
            archive = backend_cmd.archive_manager.create_archive()
        else:
            archive = None

        init_args['archive'] = archive
        backend_cmd.backend = backend_cmd.BACKEND(**init_args)
        backend = backend_cmd.backend

        ocean_backend = connector[1](backend, fetch_archive=fetch_archive, project=project, anonymize=anonymize)
        elastic_ocean = get_elastic(url, es_index, clean, ocean_backend, es_aliases)
        ocean_backend.set_elastic(elastic_ocean)
        ocean_backend.set_repo_labels(repo_labels)
        ocean_backend.set_projects_json_repo(projects_json_repo)

        # The signature of the method that will actually be used tells which
        # fetch parameters this backend supports.
        if fetch_archive:
            signature = inspect.signature(backend.fetch_from_archive)
        else:
            signature = inspect.signature(backend.fetch)

        if 'from_date' in signature.parameters:
            try:
                # Support perceval pre and post BackendCommand refactoring
                from_date = backend_cmd.from_date
            except AttributeError:
                from_date = backend_cmd.parsed_args.from_date

        if 'offset' in signature.parameters:
            try:
                offset = backend_cmd.offset
            except AttributeError:
                offset = backend_cmd.parsed_args.offset

        if 'category' in signature.parameters:
            try:
                category = backend_cmd.category
            except AttributeError:
                try:
                    category = backend_cmd.parsed_args.category
                except AttributeError:
                    # Optional: keep None when it is not available
                    pass

        if 'branches' in signature.parameters:
            try:
                branches = backend_cmd.branches
            except AttributeError:
                try:
                    branches = backend_cmd.parsed_args.branches
                except AttributeError:
                    # Optional: keep None when it is not available
                    pass

        if 'filter_classified' in signature.parameters:
            try:
                filter_classified = backend_cmd.parsed_args.filter_classified
            except AttributeError:
                # Optional: keep None when it is not available
                pass

        if 'latest_items' in signature.parameters:
            try:
                latest_items = backend_cmd.latest_items
            except AttributeError:
                latest_items = backend_cmd.parsed_args.latest_items

        # Only parameters with a truthy value are forwarded to feed()
        params = {}
        if latest_items:
            params['latest_items'] = latest_items
        if category:
            params['category'] = category
        if branches:
            params['branches'] = branches
        if filter_classified:
            params['filter_classified'] = filter_classified
        # A from_date equal to 1970-01-01 is treated as "no date given"
        if from_date and (from_date.replace(tzinfo=None) != parser.parse("1970-01-01")):
            params['from_date'] = from_date
        if offset:
            params['from_offset'] = offset

        ocean_backend.feed(**params)

    except RateLimitError as ex:
        # `backend` is still None when the error happens before the backend
        # has been created, so its origin cannot be read unconditionally.
        origin = getattr(backend, 'origin', None)
        logger.error("Error feeding raw from {} ({}): rate limit exceeded".format(backend_name, origin))
        error_msg = "RateLimitError: seconds to reset {}".format(ex.seconds_to_reset)
    except Exception as ex:
        if backend:
            error_msg = "Error feeding raw from {} ({}): {}".format(backend_name, backend.origin, ex)
        else:
            error_msg = "Error feeding raw from {}".format(ex)
        logger.error(error_msg, exc_info=True)

    # Reading backend.origin directly here would raise AttributeError when the
    # backend was never created, and the caller would lose `error_msg`.
    origin = getattr(backend, 'origin', None)
    logger.info("[{}] Done collection for {}".format(backend_name, origin))
    return error_msg
Exemple #3
0
def feed_backend(url,
                 clean,
                 fetch_archive,
                 backend_name,
                 backend_params,
                 es_index=None,
                 es_index_enrich=None,
                 project=None,
                 es_aliases=None,
                 projects_json_repo=None,
                 repo_labels=None):
    """Feed Ocean (the raw index) with the data of a backend.

    The backend command class registered for `backend_name` is built from
    `backend_params`, its backend is wrapped in the matching Ocean connector
    and the connector's `feed` method is called with the fetch parameters
    (`latest_items`, `category`, `branches`, `filter_classified`,
    `from_date`, `offset`) that the backend's fetch method declares in its
    signature and that have a truthy value.

    Exceptions raised while preparing or feeding are logged and turned into
    an error message instead of being propagated.

    :param url: Elasticsearch URL handed to `get_elastic`
    :param clean: handed to `get_elastic`; forced to False when `es_index`
        is given because the index could be shared
    :param fetch_archive: when true the archive at the parsed `archive_path`
        is used and the signature of `fetch_from_archive` (instead of
        `fetch`) decides which parameters are supported
    :param backend_name: name used to look up the connector
    :param backend_params: positional arguments for the backend command
    :param es_index: name of the raw index
    :param es_index_enrich: not used by this function
    :param project: handed to the Ocean connector
    :param es_aliases: handed to `get_elastic`
    :param projects_json_repo: handed to the connector's
        `set_projects_json_repo`
    :param repo_labels: handed to the connector's `set_repo_labels`

    :returns: None when feeding finished without errors, otherwise a string
        describing the error
    :raises RuntimeError: when no connector exists for `backend_name`
    """

    error_msg = None
    backend = None
    # Repository data to be stored in conf.
    # NOTE(review): nothing in this function reads `repo` after it is filled.
    repo = {
        'backend_name': backend_name,
        'backend_params': backend_params
    }

    if es_index:
        clean = False  # don't remove index, it could be shared

    # Look the connector up once and reuse the result.
    # For the github backend this returns
    # [GitHub, GitHubOcean, GitHubEnrich, GitHubCommand]; every entry is a
    # class, see the code in utils for the details.
    connector = get_connector_from_name(backend_name)
    if not connector:
        raise RuntimeError("Unknown backend {}".format(backend_name))
    # e.g. klass = the GitHubCommand class
    klass = connector[3]  # BackendCmd for the connector

    try:
        # Example log: Feeding raw from github (github_test-raw)
        logger.debug("Feeding raw from {} ({})".format(backend_name, es_index))

        if not es_index:
            # Only reported; the feeding is still attempted.
            logger.error("Raw index not defined for {}".format(backend_name))

        repo['repo_update_start'] = datetime.now().isoformat()

        # perceval backends fetch params
        offset = None
        from_date = None
        category = None
        branches = None
        latest_items = None
        filter_classified = None

        # See the GitHubCommand class in perceval.backends.core.github:
        # backend_cmd is an instance of that command class.
        backend_cmd = klass(*backend_params)
        # The perceval arguments that were parsed
        parsed_args = vars(backend_cmd.parsed_args)
        init_args = find_signature_parameters(backend_cmd.BACKEND, parsed_args)

        if backend_cmd.archive_manager and fetch_archive:
            archive = Archive(parsed_args['archive_path'])
        elif backend_cmd.archive_manager:
            archive = backend_cmd.archive_manager.create_archive()
        else:
            archive = None

        init_args['archive'] = archive
        # e.g. BACKEND = the GitHub class, so backend is a GitHub instance
        backend_cmd.backend = backend_cmd.BACKEND(**init_args)
        backend = backend_cmd.backend
        # e.g. connector[1] = GitHubOcean, so ocean_backend is a GitHubOcean
        # instance. GitHubOcean inherits from ElasticOcean, which inherits
        # from ElasticItems; the initialization is done in ElasticItems.
        ocean_backend = connector[1](backend,
                                     fetch_archive=fetch_archive,
                                     project=project)
        # Returns an ElasticSearch instance; it is set as the elastic client
        # used to talk to ES.
        elastic_ocean = get_elastic(url, es_index, clean, ocean_backend,
                                    es_aliases)
        ocean_backend.set_elastic(elastic_ocean)
        ocean_backend.set_repo_labels(repo_labels)
        ocean_backend.set_projects_json_repo(projects_json_repo)

        # The signature of the method that will actually be used tells which
        # fetch parameters this backend supports.
        if fetch_archive:
            signature = inspect.signature(backend.fetch_from_archive)
        else:
            signature = inspect.signature(backend.fetch)

        if 'from_date' in signature.parameters:
            try:
                # Support perceval pre and post BackendCommand refactoring
                from_date = backend_cmd.from_date
            except AttributeError:
                from_date = backend_cmd.parsed_args.from_date

        if 'offset' in signature.parameters:
            try:
                offset = backend_cmd.offset
            except AttributeError:
                offset = backend_cmd.parsed_args.offset

        if 'category' in signature.parameters:
            try:
                category = backend_cmd.category
            except AttributeError:
                try:
                    category = backend_cmd.parsed_args.category
                except AttributeError:
                    # Optional: keep None when it is not available
                    pass

        if 'branches' in signature.parameters:
            try:
                branches = backend_cmd.branches
            except AttributeError:
                try:
                    branches = backend_cmd.parsed_args.branches
                except AttributeError:
                    # Optional: keep None when it is not available
                    pass

        if 'filter_classified' in signature.parameters:
            try:
                filter_classified = backend_cmd.parsed_args.filter_classified
            except AttributeError:
                # Optional: keep None when it is not available
                pass

        if 'latest_items' in signature.parameters:
            try:
                latest_items = backend_cmd.latest_items
            except AttributeError:
                latest_items = backend_cmd.parsed_args.latest_items

        # Only parameters with a truthy value are forwarded to feed()
        params = {}
        if latest_items:
            params['latest_items'] = latest_items
        if category:
            params['category'] = category
        if branches:
            params['branches'] = branches
        if filter_classified:
            params['filter_classified'] = filter_classified
        # A from_date equal to 1970-01-01 is treated as "no date given"
        if from_date and (from_date.replace(tzinfo=None) !=
                          parser.parse("1970-01-01")):
            params['from_date'] = from_date
        if offset:
            params['from_offset'] = offset

        # The main call: feed data in Elastic from Perceval; fetching the
        # data from GitHub also happens inside it.
        ocean_backend.feed(**params)

    except RateLimitError as ex:
        # `backend` is still None when the error happens before the backend
        # has been created, so its origin cannot be read unconditionally.
        origin = getattr(backend, 'origin', None)
        logger.error(
            "Error feeding raw from {} ({}): rate limit exceeded".format(
                backend_name, origin))
        error_msg = "RateLimitError: seconds to reset {}".format(
            ex.seconds_to_reset)
    except Exception as ex:
        if backend:
            error_msg = "Error feeding raw from {} ({}): {}".format(
                backend_name, backend.origin, ex)
        else:
            error_msg = "Error feeding raw from {}".format(ex)
        logger.error(error_msg, exc_info=True)

    # Reading backend.origin directly here would raise AttributeError when the
    # backend was never created, and the caller would lose `error_msg`.
    origin = getattr(backend, 'origin', None)
    logger.info("[{}] Done collection for {}".format(backend_name, origin))
    return error_msg