Example #1
0
def _build_vectorizer(init_args, conf, pipeline_list):
    """
    Appends a vectorizer step (named 'vect') to *pipeline_list* and merges
    the vectorizer's constructor keywords into *init_args*. Note this
    function modifies (appends to / updates) its input arguments.

    The vectorizer class is named by the *vectorizer.class* field of the
    main configuration and must implement *fit*, *transform* and
    *fit_transform* as per scikit-learn; it converts a raw-text corpus into
    a term-frequency matrix returned as a numpy *coo_matrix*. Keyword
    arguments are gathered from the *feature extraction* section, the
    *vectorizer* section and the top level of the configuration (e.g.
    *debug_level*): names matching the class's *__init__* parameters are
    kept, non-matching ones are simply ignored.
    """
    vect_class = get_named_object(conf['vectorizer']['class'])

    # Collect constructor arguments from each relevant config section; only
    # keywords the class declares explicitly in __init__ are picked up.
    # todo KmeansVectorizer does not declare its parameters explicitly so intersection doesnt work
    # instead its constructor should take **kwargs, and we can pass in whatever we want with no need to manually check
    # which parameters are valid for that object
    for section in (conf['feature_extraction'], conf['vectorizer'], conf):
        init_args.update(get_intersection_of_parameters(vect_class, section, 'vect'))
    pipeline_list.append(('vect', vect_class()))
Example #2
0
def _build_classifiers(classifiers_conf):
    """
    Yields an instance of every enabled classifier in *classifiers_conf*.

    Each key of *classifiers_conf* is a fully qualified classifier name and
    its value is that classifier's configuration section. Sections that are
    falsy (missing/empty) or whose 'run' flag is false are skipped.
    Constructor keyword arguments are taken from the section, restricted to
    names the classifier's *__init__* actually declares.
    """
    # enumerate() was used here before, but the index was never read
    for clf_name in classifiers_conf:
        conf = classifiers_conf[clf_name]
        # ignore disabled classifiers: empty section or run == False
        if not conf:
            continue
        if not conf['run']:
            # lazy %-args: the message is only rendered if DEBUG is enabled
            logging.debug('Ignoring classifier %s', clf_name)
            continue
        clf = get_named_object(clf_name)
        init_args = get_intersection_of_parameters(clf, conf)
        yield clf(**init_args)
Example #3
0
def _build_feature_selector(init_args, feature_selection_conf, pipeline_list):
    """
    If feature selection is enabled ('run' flag), appends a selector step
    (named 'fs') to *pipeline_list* and merges the selector's constructor
    keywords into *init_args*. Note this function modifies (appends to) its
    input arguments.
    """
    if not feature_selection_conf['run']:
        return

    selector_class = get_named_object(feature_selection_conf['method'])
    scoring_name = feature_selection_conf.get('scoring_function')
    logging.info('Scoring function is %s', scoring_name)
    # resolve the scoring function by name; None when not configured
    scoring_func = get_named_object(scoring_name) if scoring_name else None

    # the parameters for steps in the Pipeline are defined as
    # <component_name>__<arg_name> - the Pipeline (which is actually a
    # BaseEstimator) takes care of passing the correct arguments down
    # along the pipeline, provided there are no name clashes between the
    # keyword arguments of two consecutive transformers.
    init_args.update(get_intersection_of_parameters(selector_class, feature_selection_conf, 'fs'))
    logging.info('FS method is %s', selector_class)
    pipeline_list.append(('fs', selector_class(scoring_func)))