Example #1
def resolver(path='.'):
    return pipe(
        globs,
        mapcat(lambda n: path_f(path).glob(n)),
        map(lambda p: p.read_text()),
        tuple,
    )
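
A minimal self-contained sketch of the same idea, assuming `globs` is a sequence of glob patterns and `path_f` just wraps the directory in a `pathlib.Path` (both are defined elsewhere in the original source):

from pathlib import Path
from toolz.curried import pipe, mapcat, map

def read_matching(path='.', globs=('*.txt',)):
    # mapcat flattens the per-pattern match iterators into one stream of paths
    return pipe(
        globs,
        mapcat(lambda pattern: Path(path).glob(pattern)),
        map(lambda p: p.read_text()),
        tuple,
    )
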
Example #2
    def __or__(self, function):
        if __.is_str(function):
            # function = globals()[function]
            log.debug(f'function space: {pprint.pformat(self.function_space)}')
            function = eval(function, self.function_space)

        if is_traversal_function(function):
            nids, _neighbors = zip(*self.selection)

            return _.pipe(
                # self.selection,
                # _.mapcat(_.second),
                nids,
                # _.concat,
                set,
                # _.do(print),
                _.map(function(self.graph)),
                _.mapcat(tuple),
                set,
                lambda node_ids: self.graph(*node_ids),
            )
        elif is_introspect_function(function):
            node_ids, _nb = zip(*self.selection)
            return _.pipe(
                node_ids,
                set,
                _.map(function(self.graph)),
                ValuePipe,
            )
        else:
            node_ids, _nb = zip(*self.selection)
            return ValuePipe(function(node_ids))
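
The `zip(*self.selection)` idiom above just splits a sequence of (node_id, neighbors) pairs into two parallel tuples; a small illustration:

pairs = [(1, ('a',)), (2, ('b',))]
node_ids, neighbors = zip(*pairs)
# node_ids  -> (1, 2)
# neighbors -> (('a',), ('b',))
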
Example #3
def fetch(content, prefix):
    return {
        "parts":
        pipe(
            parse("$..layers").find(content),
            mapcat(lambda m: m.value),
            filter(lambda v: v["exportOptions"]["exportFormats"]),
            filter(lambda v: re.match(prefix, v["name"])),
            map(lambda v: glom(
                v,
                {
                    "key":
                    "name",
                    "layout": (
                        "frame",
                        {
                            "left": ("x", round),
                            "top": ("y", round),
                            "width": ("width", round),
                            "height": ("height", round),
                        },
                    ),
                },
            )),
            sorted(key=lambda p: p["key"]),
            list,
        )
    }
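
Note that `sorted(key=...)` only works as a pipeline step here if it is the curried `sorted` from `toolz.curried`, which waits for the iterable (the plain builtin would raise a TypeError). A quick check of that behaviour:

from toolz.curried import pipe, sorted
pipe([{"key": "b"}, {"key": "a"}], sorted(key=lambda p: p["key"]))
# -> [{'key': 'a'}, {'key': 'b'}]
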
def visit_loop_expression(node):
    return mapcat(
        visit_node,
        iter_unlooped_nodes(
            loop_variables_nodes=node['loop_variables'],
            node=node['expression'],
        ))
def iter_json_file_names(*pathnames):
    for json_file_path in sorted(mapcat(
                lambda pathname: glob.iglob(os.path.join(ast_dir_path, pathname)),
                pathnames,
                )):
        json_file_name = os.path.basename(json_file_path)
        yield json_file_name
Example #6
def test_flowly_kv_transform__mapcat(executor):
    actual = executor(
        kv_transform(mapcat(lambda i: [10 * i, 20 * i])),
        [(i % 2, i) for i in range(20)],
        npartitions=10,
    )
    assert sorted(actual) == sorted(
        it.chain([(i % 2, 10 * i) for i in range(20)],
                 [(i % 2, 20 * i) for i in range(20)]))
Example #7
def test_toolz_mapcat(executor):
    actual = executor(
        mapcat(lambda s: s.upper()),
        ["ab", "cde"],
        npartitions=2,
    )
    expected = ['A', 'B', 'C', 'D', 'E']

    assert list(actual) == expected
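
The flattening happens because `mapcat` concatenates whatever each call returns, and a string is itself an iterable of characters; the same result without the executor:

from toolz import mapcat
list(mapcat(lambda s: s.upper(), ["ab", "cde"]))
# -> ['A', 'B', 'C', 'D', 'E']
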
Example #8
def groupby_many(f, it):
    return toolz.pipe(
        it,
        curried.mapcat(
            toolz.compose_left(
                lambda element: (f(element), [element]),
                functional.star(itertools.product),
            )),
        edges_to_graph,
    )
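
The `mapcat` step in `groupby_many` turns one element into one (key, element) edge per key returned by `f`, via `itertools.product` (assuming `functional.star` simply unpacks a tuple into positional arguments); concretely, for a single element:

import itertools
keys = ('a', 'b')   # what f(element) might return
list(itertools.product(keys, ['x']))
# -> [('a', 'x'), ('b', 'x')]
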
def iter_ast_json_file_names(filenames):
    json_file_paths = pipe(
        filenames,
        map(lambda pathname: os.path.join(args.json_dir, 'ast', pathname)),
        mapcat(glob.iglob),
        sorted,
        )
    for json_file_path in json_file_paths:
        json_file_name = os.path.basename(json_file_path)
        file_name_head = os.path.splitext(json_file_name)[0]
        yield json_file_name
Example #10
def runlist(yamls):
    nodelist = list(
        mapcat(
            lambda f: map(lambda i: (f[0], i), f[1].get('deps', [PHONY_DEP])),
            yamls.items()))
    return pipe(nodelist, nx.DiGraph,
                          nx.topological_sort,
                          list,
                          reversed,
                          filter(lambda r: r != PHONY_DEP),
                          list) # noqa yapf: disable
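
A sketch of the intermediate edge list that feeds `nx.DiGraph` (here `PHONY_DEP` is only an illustrative stand-in for whatever sentinel the surrounding module defines):

from toolz.curried import mapcat
PHONY_DEP = 'phony'   # assumed sentinel for targets without dependencies
yamls = {'a': {'deps': ['b']}, 'b': {}}
list(mapcat(lambda f: map(lambda i: (f[0], i), f[1].get('deps', [PHONY_DEP])),
            yamls.items()))
# -> [('a', 'b'), ('b', 'phony')]
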
Example #11
def recursive_get_child_ids(frame: pd.DataFrame, ids: list):
    return pipe(
        ids,
        c.mapcat(lambda an_id: [
            an_id,
            *pipe(
                get_in([0, "Ids"], frame.loc[an_id].Relationships, []),
                Block.recursive_get_child_ids(frame),
            ),
        ]),
        list,
    )
Example #12
def code_section(course: Course, course_root: str, path: str, lines=()):
    path = resolve_path(course_root, path)

    log.debug(f'Inserting code from {path}')
    if lines:
        lines_str = pipe(
            lines,
            mapcat(lambda v: [v] if is_int(v) else v),
            ' '.join,
            lambda t: 'hl_lines="{' + t + '}"',
        )
        return f'```\n#!python3 {lines_str}\n{path.read_text()}\n```'

    else:
        return f'```python3\n{path.read_text()}\n```'
Example #13
def Paragraphs(text):
  """
  Creates p tags from each blank line separated block of text.

  Any block of text separated from its neighbours by a blank line (a `\n\n`
  sequence) is split out and given its own `p` tag. This is the minimum
  formatting you'd probably expect from any static content generator.

  Subscribes to the entire registry.
  """
  return t.pipe(re.split(r'(?m)(?:\n|^)(\[\|\|\d+\|\|\])', text),
                tc.mapcat(lambda x: x.split('\n\n')),
                tc.filter(lambda x: not not x),
                tc.map(lambda x: x if re.match(r'^\[\|\|\d+\|\|\]$',x) else ['p', x.rstrip()]),
                tc.cons('div'),
                list)
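
For plain text without registry markers the function behaves as the docstring describes, turning blank-line-separated blocks into `p` nodes under a `div` (assuming `t` and `tc` are `toolz` and `toolz.curried`):

# Paragraphs("First para.\n\nSecond para.")
# -> ['div', ['p', 'First para.'], ['p', 'Second para.']]
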
def parse_violations(do_request):
    """"""
    logger.info('Parsing violations')

    return toolz.compose(
        # filter out meaningless values
        curried.filter(lambda x: x not in ('IME PREDPISA', '')),
        # extract data from each row
        curried.map(lambda tr: pq(tr).find('td').eq(1).text()),
        # get all rows in tables
        curried.mapcat(lambda page: page('table.MsoNormalTable tr')),
        # get all subpages
        curried.map(do_request),
        # let's skip empty urls/strings
        curried.filter(lambda a: a),
        # get menu links
        curried.map(lambda a: pq(a).attr('href')),
        # get menu elements
        lambda doc: doc('.moduletable_menu a'),
        # get main page
        do_request,
    )(VIOLATION_URL + '/index.php')
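
Because `toolz.compose` applies its functions right to left, the pipeline above reads bottom-up: it starts with `do_request` on the main page and ends with the filter over cell texts. The ordering in miniature:

from toolz import compose
compose(str.upper, str.strip)('  hi  ')
# -> 'HI'   (strip runs first, then upper)
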
Example #15
    def _make_samples(meta, shuffle):
        def _to_sample(person, images):
            # Random images needed for representation interpolation (3.5)
            x1 = _get_random_image()
            x2 = _get_random_image()
            return m(id=person["id_class"] - 1,
                     images=freeze(list(images)),
                     x1=freeze(x1),
                     x2=freeze(x2))

        samples = pipe(
            meta["persons"],
            tz.take(limit) if limit is not None else tz.identity,
            tz.map(lambda p: m(p=p,
                               i=tz.partition(
                                   args.N_images,
                                   _shuffled(p["images"])
                                   if shuffle else p["images"]))),
            tz.mapcat(lambda s: [_to_sample(s.p, i) for i in s.i]),
            tz.take(limit) if limit is not None else tz.identity, list)
        if shuffle:
            random.shuffle(samples)
        return samples
def parse_sessions(do_request):
    """"""
    logger.info('Parsing sessions')

    def get_votings(voting_page):
        # parse transcripts for a session
        transcript_urls = voting_page(':contains("Zapisi seje")')\
            .closest('td')\
            .find('a')\
            .map(lambda i, r: pq(r).attr('href'))
        # TODO: parse transcript_urls

        # parse votings in a session
        epas_and_votes_urls = toolz.compose(
            lambda p: p('table.dataTableExHov > tbody tr')
                     # we're only interested in rows with more than one link
                     .filter(lambda i, r: len(pq(r).find('a')) > 1)
                     .map(lambda i, r: {'epa_url': pq(r).find('td').eq(0).find('a').attr('href'),
                                        'vote_url': pq(r).find('td').eq(3).find('a').attr('href')}),
        )(voting_page)
        # arrow.get('(23.09.2015)', 'DD.MM.YYYY')
        import pdb; pdb.set_trace()
        return {}

    return toolz.compose(
        # parse voting from session url
        curried.map(get_votings),
        # paginate all votings per session
        curried.mapcat(partial(paginate_url, do_request=do_request)),
        # get all session urls
        curried.map(lambda r: pq(r).attr('href')),
        # get all anchor elements per page
        curried.map(lambda p: p('table.dataTableExHov tbody a')),
        # get a list of all pages
        partial(paginate_url, do_request=do_request),
    )(DZ_RS_SESSIONS_URL)
def visit_boolean_expression(node):
    return mapcat(visit_node, node['operands'])
Example #18
    return map(
        toolz.first,
        graph_traverse(source=(source, 0),
                       get_neighbors=get_neighbors_limiting_radius),
    )


edges_to_graph = toolz.compose(
    curried.valmap(toolz.compose(frozenset, curried.map(toolz.second))),
    curried.groupby(toolz.first),
)

graph_to_edges = toolz.compose_left(
    curried.keymap(lambda x: (x, )),
    dict.items,
    curried.mapcat(functional.star(itertools.product)),
)

reverse_graph = toolz.compose_left(
    graph_to_edges, curried.map(toolz.compose_left(reversed, tuple)),
    edges_to_graph)

cliques_to_graph = toolz.compose_left(
    curried.mapcat(lambda clique: itertools.permutations(clique, r=2)),
    edges_to_graph)
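
A quick concrete check of the helpers above (same curried `toolz` imports assumed):

cliques_to_graph([{1, 2, 3}])
# -> {1: frozenset({2, 3}), 2: frozenset({1, 3}), 3: frozenset({1, 2})}
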


def get_connectivity_components(graph: Dict) -> Iterable[FrozenSet]:
    """Graph is assumed to undirected, so each edge must appear both ways."""
    nodes_left = frozenset(graph)
    while nodes_left:
Example #19
            9: 10
        }),
    ),
    "itemfilter": (
        chained(dict,
                curried.itemfilter(lambda i: i[0] % 2 == 0 and i[1] < 4)),
        dict.items({
            1: 2,
            2: 3,
            3: 4,
            4: 5
        }),
    ),
    # example taken from toolz docs
    "mapcat": (
        chained(curried.mapcat(lambda s: [c.upper() for c in s]), list),
        [["a", "b"], ["c", "d", "e"]],
    ),
    "reduce": (curried.reduce(op.add), range(20)),
    "reduceby": (curried.reduceby(lambda x: x % 2 == 0, op.add), range(20)),
    "topk": (chained(curried.topk(5), list), range(20)),
    "curried.unique": (chained(curried.unique, sorted), [1, 1, 2, 3, 4, 4]),
    "unique": (chained(toolz.unique, sorted), [1, 1, 2, 3, 4, 4]),
}


def params(spec, m):
    return pytest.mark.parametrize(spec, [v for (_, v) in sorted(m.items())],
                                   ids=sorted(m.keys()))

def visit_boolean_expression(node):
    return mapcat(visit_node, node['operands'])
Example #21
def glob(self, root, recursive=True):
    # os.walk yields (root, dirs, files): join root (tup[0]) with each
    # file (tup[2]) and each dir (tup[1])
    return pipe(os.walk(expanduser(root)),
                mapcat(lambda tup: map(lambda f: join(tup[0], f))
                       (concat([tup[2], tup[1]]))), list)  # noqa
def visit_comparaison(node):
    return mapcat(visit_node, (node['left_operand'], node['right_operand']))
def visit_regle(node):
    return mapcat(
        lambda node1: visit_node(node1)
        if node1['type'] == 'pour_formula' else [visit_node(node1)],
        node['formulas'],
    )
def load_regles_file(json_file_name):
    return pipe(
        read_ast_json_file(json_file_name),
        filter(lambda node: 'batch' in node['applications']),
        mapcat(python_source_visitors.visit_node),
        )
def subcomponent_multi(graph, vertices, mode="out"):
    """Return concatenated subcomponents generated by the given list of
    vertices.
    """
    return tlz.mapcat(lambda vertex: graph.subcomponent(vertex, mode=mode),
                      vertices)
def visit_product_expression(node):
    return mapcat(visit_node, node['operands'])
def visit_loop_expression(node):
    return mapcat(visit_node, iter_unlooped_nodes(
        loop_variables_nodes=node['loop_variables'],
        node=node['expression'],
        ))
def visit_function_call(node):
    return mapcat(visit_node, node['arguments'])
Example #32
def sync_slides(course_dir, loglevel):
    '''Sync course slides from directory containing slide markdown files

    COURSE-DIR: Path of section-specific course directory
    (e.g. "~/courses/cs101/section-01") . Will search given directory
    recursively for some number of course.yml files.

    '''
    setup_logging(loglevel)

    api = canvas.api.get_api_from_config()
    courses = canvas.course.courses_from_path(api, course_dir)

    if not courses:
        exit_with_msg('Could not find course information.')

    log.info(f'{len(courses)} courses found.')
    _.pipe(
        courses,
        _.map(lambda c: c.data['name']),
        tuple,
        yaml.dump,
        lambda s: '\n' + s,
        log.info
    )

    course_root = common.find_course_root(course_dir)
    if not course_root:
        exit_with_msg('Could not find course root given course'
                      f' dir: {course_dir}')

    log.info(f'Found course root: {course_root}')

    content_paths = common.find_content_paths(course_root)
    slides_path = content_paths['slide']
    log.info(f'Found slides path: {slides_path}')
    slide_md_paths = _.pipe(
        slides_path.glob('slide-*.md'),
        sorted,
    )
    if not slide_md_paths:
        exit_with_msg(
            f'Could not find any slides in {slides_path}'
        )

    _.pipe(
        slide_md_paths,
        _.map(str),
        sorted,
        yaml.dump,
        lambda s: 'Slides found:\n' + s,
        log.info,
    )

    renderers = _.pipe(
        courses,
        _.map(lambda c: templates.slide.render_remark_slides(c, course_root)),
        tuple,
    )

    def render_path(course, md_path):
        for renderer in renderers:
            html_path = Path(md_path.parent, f'{md_path.stem}.html')
            html_content = renderer(md_path)
            log.info(
                f'Writing {len(html_content)} bytes to {html_path}'
            )
            html_path.write_text(html_content)
            log.info(
                f'Uploading {html_path} to course {course.data["name"]}:'
            )
            file_ep = canvas.file.upload_course_file(course, html_path)
            log.info(
                f'  ...done {html_path} -> {course.data["name"]}'
            )
            return (course, md_path, html_path, file_ep)

    all_content = _.pipe(
        itertools.product(courses, slide_md_paths),
        parallel.thread_map(lcommon.vcall(render_path), max_workers=10),
        tuple,
    )

    pages = _.pipe(
        courses,
        _.mapcat(canvas.page.pages),
    )

    for course, md_path, html_path, file_ep in all_content:
        page_content = templates.slide.slide_page(
            course, course_root, md_path, html_path, file_ep,
        )
        page_path = Path(
            content_paths['page'],
            f'page-{md_path.name}',
        )
        log.info(f'Writing page for {md_path}  -->  {page_path}')
        page_path.write_text(page_content)
def visit_product_expression(node):
    return mapcat(visit_node, node['operands'])
def parse_content_into_count(max_num_words, allowed_parts_of_speech, list_of_content):
    """Return a dictionary of tokens (as keys) and counts (as values)"""
    def is_english(s):
        """Predicate that estimates whether a given string is in English"""
        try: 
            return langdetect.detect(s) == 'en'
        except:
            print("Couldn't detect the language of: {}".format(s))
            return True
    @tz.curry
    def tokenize_and_filter_perc_func(allowed_parts_of_speech, given_text):
        """Return the tokens in the given text that are the allowed parts
        of speech

        This version uses the faster PerceptronTagger"""
        return tz.pipe(
            given_text,
            lambda x: TextBlob(x, pos_tagger=PerceptronTagger()),
            lambda x: x.tags,
            tz.filter(lambda x: x[1] in allowed_parts_of_speech), 
                # limit to allowed parts of speech
            tz.map(lambda x: x[0]), # return only the token
            list, 
        )
    @tz.curry
    def tokenize_and_filter_nltk_func(allowed_parts_of_speech, given_text):
        """Return the tokens in the given text that are the allowed parts
        of speech

        This version uses the recommended tagger from NLTK, it is relatively
        slow."""
        return tz.pipe(
            given_text,
            nltk.word_tokenize,
            lambda x: nltk.pos_tag(x),
            tz.filter(lambda x: x[1] in allowed_parts_of_speech), 
                # limit to allowed parts of speech
            tz.map(lambda x: x[0]), # return only the token
            list, 
            print_and_pass)
    if allowed_parts_of_speech == "all":
        # Don't even tag parts of speech, just use everything
        tokenize_func = lambda x: nltk.word_tokenize(x)
    else: 
        tokenize_func = tokenize_and_filter_perc_func(allowed_parts_of_speech)
    wordnet_lemmatizer = nltk.stem.WordNetLemmatizer()
    lemma_fun = lambda x: wordnet_lemmatizer.lemmatize(x)
    exclusion_list = ['//platform.twitter.com/widgets.js', 'align=', 'aligncenter', 'id=', 'width=', '/caption', 'pdf.pdf', u'//t.c\xe2rt', 'http']
        # Yeah, this is a bit of a hack
    return tz.pipe(
        list_of_content, # given content
        tz.map(lambda x: BeautifulSoup(x, 'html.parser').get_text()), # remove html in string
        tz.filter(is_english), # limit to English entries
        tz.map(lambda x: re.sub(r'http.*?(?=\s)', "", x)), # remove urls
        chunk_string(500), # this is done to speedup the part of speech tagging
        tz.mapcat(tokenize_func), # tokenize, and maybe filter by part of speech
        tz.filter(lambda x: x not in exclusion_list), # filter out specific tokens
        tz.filter(lambda x: re.sub(r'\W', "", x) != ''), # filter out punctuation-only strings
        tz.map(lambda s: s.lower()), # convert to lower case
        tz.map(lemma_fun), # convert tokens to a more standard lemma
        tz.countby(tz.identity)) # count occurrences
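
The final `tz.countby(tz.identity)` step is what turns the token stream into the token -> count dictionary; in isolation:

import toolz as tz
tz.countby(tz.identity, ['be', 'or', 'not', 'be'])
# -> {'be': 2, 'or': 1, 'not': 1}
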
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-d', '--debug', action='store_true', default=False, help='Display debug messages')
    parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Increase output verbosity')
    parser.add_argument('json_dir', help='Directory containing the JSON AST and data files')
    global args
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.debug else (logging.INFO if args.verbose else logging.WARNING),
        stream=sys.stdout,
        )

    if not os.path.exists(args.json_dir):
        parser.error('json_dir {!r} does not exist'.format(args.json_dir))

    if not os.path.isdir(generated_dir_path):
        os.mkdir(generated_dir_path)

    # Initialize a variables_definitions object and set global variable in visitors

    variables_definitions = python_source_visitors.variables_definitions = VariablesDefinitions()

    # Transpile verification functions

    verif_sources = list(
        mapcat(load_verifs_file, iter_ast_json_file_names(filenames=['coc*.json', 'coi*.json']))
        )
    verifs_source = Template("""\
from ..formulas_helpers import arr, cached, inf, interval, null, positif, positif_ou_nul, present, somme


def get_errors(formulas, saisie_variables):
    errors = []

$verifs
    return errors or None
""").substitute(verifs=textwrap.indent('\n'.join(verif_sources), prefix=4 * ' '))
    write_source_file(
        file_name='verifs.py',
        source=verifs_source,
        )

    # Transpile formulas

    constants = loaders.load_constants()
    source_by_formula_name = dict(list(mapcat(
        load_regles_file,
        iter_ast_json_file_names(filenames=['chap-*.json', 'res-ser*.json']),
        )))

    def get_formula_source(variable_name):
        source = source_by_formula_name.get(variable_name)
        if source is not None:
            return source
        if variables_definitions.is_saisie(variable_name):
            return python_source_visitors.make_formula_source(
                expression='saisie_variables.get({!r}, 0)'.format(variable_name),
                formula_name=variable_name,
                )
        if variable_name in constants:
            return python_source_visitors.make_formula_source(
                expression='constants[{!r}]'.format(variable_name),
                formula_name=variable_name,
                )
        if variables_definitions.is_calculee(variable_name):
            if not variables_definitions.is_calculee(variable_name, kind='base'):
                log.debug('Variable {!r} is declared in tgvH file but has no formula'.format(variable_name))
            return python_source_visitors.make_formula_source(
                expression='0',
                formula_name=variable_name,
                )
        assert False, variable_name

    # Merge variable names coming from dependencies graph and variables definitions
    # because some variables are missing in tgvH file;
    # or some constants are declared in tgvH but are not used in formulas, only in verifs.
    dependencies_by_formula_name = loaders.load_formulas_dependencies()
    all_variable_names = set(concatv(
        dependencies_by_formula_name.keys(),
        concat(dependencies_by_formula_name.values()),
        variables_definitions.definition_by_variable_name.keys(),
        constants.keys(),
        ))
    write_source_file(
        file_name='formulas.py',
        source=Template("""\
from __future__ import division

import inspect

from ..formulas_helpers import arr, cached, inf, interval, null, positif, positif_ou_nul, present, somme


def get_formulas(cache, constants, saisie_variables):
    formulas = {}

$formulas
    return formulas
""").substitute(
            formulas=textwrap.indent(
                '\n'.join(map(get_formula_source, sorted(all_variable_names))),
                prefix=4 * ' ',
                ),
            ),
        )

    return 0
def main():
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('-d', '--debug', action='store_true', default=False, help='Display debug messages')
    parser.add_argument('-v', '--verbose', action='store_true', default=False, help='Increase output verbosity')
    global args
    args = parser.parse_args()
    logging.basicConfig(
        level=logging.DEBUG if args.debug else (logging.INFO if args.verbose else logging.WARNING),
        stream=sys.stdout,
        )

    if not os.path.isdir(json_dir_path):
        os.mkdir(json_dir_path)
    if not os.path.isdir(ast_dir_path):
        os.mkdir(ast_dir_path)

    # Load variables definitions

    tgvh_infos = list(load_tgvH_file())

    # Write constants

    constant_by_name = pipe(
         tgvh_infos,
         filter(lambda val: val['type'] == 'variable_const'),
         map(lambda d: (d['name'], d['value'])),
         dict,
         )
    write_json_file(data=constant_by_name, file_name='constants.json')

    # Write variables dependencies

    regles_nodes = list(mapcat(load_regles_nodes, iter_json_file_names('chap-*.json', 'res-ser*.json')))
    dependencies_by_formula_name = dict(list(mapcat(dependencies_visitors.visit_node, regles_nodes)))
    write_json_file(data=dependencies_by_formula_name, file_name='formulas_dependencies.json')

    # Write variables definitions

    ast_infos_by_variable_name = {}
    for regle_node in regles_nodes:
        regle_infos = {
            'regle_applications': regle_node['applications'],
            'regle_linecol': regle_node['linecol'],
            'regle_name': regle_node['name'],
            'source_file_name': regle_node['source_file_name'],
            }
        regle_tags = list(pluck('value', regle_node.get('tags', [])))
        if regle_tags:
            regle_infos['regle_tags'] = regle_tags
        for formula_node in regle_node['formulas']:
            if formula_node['type'] == 'formula':
                ast_infos_by_variable_name[formula_node['name']] = assoc(
                    regle_infos, 'formula_linecol', formula_node['linecol'])
            elif formula_node['type'] == 'pour_formula':
                for unlooped_formula_node in unloop_helpers.iter_unlooped_nodes(
                        loop_variables_nodes=formula_node['loop_variables'],
                        node=formula_node['formula'],
                        unloop_keys=['name'],
                        ):
                    pour_formula_infos = merge(regle_infos, {
                        'pour_formula_linecol': formula_node['formula']['linecol'],
                        'pour_formula_name': formula_node['formula']['name'],
                        })
                    ast_infos_by_variable_name[unlooped_formula_node['name']] = pour_formula_infos
            else:
                assert False, 'Unhandled formula_node type: {}'.format(formula_node)

    def rename_key(d, key_name, key_new_name):
        return assoc(dissoc(d, key_name), key_new_name, d[key_name])

    tgvh_infos_by_variable_name = pipe(
        tgvh_infos,
        filter(lambda d: d['type'] in ('variable_calculee', 'variable_saisie')),
        map(lambda d: rename_key(d, 'linecol', 'tgvh_linecol')),
        map(lambda d: (d['name'], d)),  # Index by name
        dict,
        )

    definition_by_variable_name = merge_with(merge, ast_infos_by_variable_name, tgvh_infos_by_variable_name)

    write_json_file(data=definition_by_variable_name, file_name='variables_definitions.json')

    return 0
Example #37
def backward_subset_feature_selection(train_data: pd.DataFrame,
                                      param_train_fn: TuningLearnerFnType,
                                      features_sets: Dict[str, List[str]],
                                      split_fn: SplitterFnType,
                                      eval_fn: EvalFnType,
                                      extractor: ExtractorFnType,
                                      metric_name: str,
                                      threshold: float = 0.005,
                                      num_removed_by_step: int = 3,
                                      early_stop: int = 2,
                                      iter_limit: int = 50,
                                      min_remaining_features: int = 50,
                                      save_intermediary_fn: SaveIntermediaryFnType = None,
                                      n_jobs: int = 1) -> ListLogListType:
    """
        Performs train-evaluation iterations while testing the subsets of features
        to compute statistics about the importance of each feature category

        Parameters
        ----------
        train_data : pandas.DataFrame
            A Pandas' DataFrame with training data

        param_train_fn : function (pandas.DataFrame, list of str) -> prediction_function, predictions_dataset, logs
            A partially defined learning function that takes a training set and a feature list and
            returns a predict function, a dataset with training predictions and training
            logs.

        features_sets: dict of string -> list
            Each string key in the dict names a subset of columns from the dataset; the function
            will analyse the influence of each group of features on the model performance

        split_fn : function pandas.DataFrame ->  list of tuple
            Partially defined split function that takes a dataset and returns
            a list of folds. Each fold is a Tuple of arrays. The first array in
            each tuple contains training indexes while the second array
            contains validation indexes.

        eval_fn : function pandas.DataFrame -> dict
            A partially defined evaluation function that takes a dataset with prediction and
            returns the evaluation logs.

        extractor: function str -> float
            An extractor that takes a string and returns the value of that string in a dict

        metric_name: str
            String with the name of the column that refers to the metric column to be extracted

        num_removed_by_step: int (default 3)
            Number of features removed at each iteration

        threshold: float (default 0.005)
            Threshold for model performance comparison

        early_stop: int (default 2)
            Number of rounds without improvement before stopping process

        iter_limit: int (default 50)
            Maximum number of iterations before stopping

        min_remaining_features: int (default 50)
            Minimum number of features that should remain in the model,
            combining num_removed_by_step and iter_limit accomplishes the same
            functionality as this parameter.

        save_intermediary_fn : function(log) -> save to file
            Partially defined saver function that receives a log result from a
            tuning step and appends it into a file
            Example: save_intermediary_result(save_path='tuning.pkl')

        n_jobs : int
            Number of parallel processes to spawn.

        Returns
        ----------
        logs: list of list of dict
            A list of log-like lists of evaluation dictionaries. Each element of the
            list is a validation step of the algorithm.

    """

    selector_fn = remove_features_subsets(extractor=extractor,
                                          metric_name=metric_name,
                                          num_removed_by_step=num_removed_by_step)

    stop_fn = aggregate_stop_funcs(
        stop_by_no_improvement_parallel(extractor=extractor, metric_name=metric_name, early_stop=early_stop,
                                        threshold=threshold),
        stop_by_iter_num(iter_limit=iter_limit),
        stop_by_num_features_parallel(extractor=extractor, metric_name=metric_name,
                                      min_num_features=min_remaining_features)
    )

    used_subsets = [features_sets.keys()]

    used_features = [list(mapcat(lambda key: features_sets[key], subset)) for subset in used_subsets]

    trainers = [lambda df, feat=feat: param_train_fn(df, feat) for feat in used_features]  # bind feat per lambda (avoid late-binding capture)

    first_val_logs = [parallel_validator(train_data, split_fn, train_func, eval_fn, n_jobs) for train_func in trainers]
    logs = [[dict(log, **{"used_subsets": list(subset)}) for log, subset in zip(first_val_logs, used_subsets)]]

    while not stop_fn(logs):
        curr_log = first(logs)

        new_subsets = selector_fn(curr_log)
        new_features = [list(mapcat(lambda key: features_sets[key], subset)) for subset in new_subsets]

        trainers = [lambda df, feat=feat: param_train_fn(df, feat) for feat in new_features]  # same per-lambda binding as above

        val_logs = [parallel_validator(train_data, split_fn, train_func, eval_fn, n_jobs) for train_func in trainers]

        new_logs = [dict(log, **{"used_subsets": subset}) for log, subset in zip(val_logs, new_subsets)]

        if save_intermediary_fn is not None:
            save_intermediary_fn(new_logs)

        logs = [new_logs] + logs

    return logs
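
The two `mapcat` list comprehensions above simply flatten the selected feature groups into flat column lists; for example, with hypothetical feature groups:

from toolz import mapcat
features_sets = {"ids": ["user_id"], "numeric": ["age", "income"]}
list(mapcat(lambda key: features_sets[key], ["ids", "numeric"]))
# -> ['user_id', 'age', 'income']
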
Example #38
def get_gifts(people):
    # ``pipe(data, f, g, h)`` is equivalent to ``h(g(f(data)))``
    return pipe(people,
        filter(lambda v: v['age'] < 18 and v['well_behaved']),
        mapcat(get(['name'])),
        list)
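
A small usage sketch (assuming the curried `toolz` imports `pipe`, `filter`, `mapcat`, `get`); note that `get(['name'])` returns a one-element tuple per person, which `mapcat` then flattens:

people = [
    {'name': 'Alice', 'age': 10, 'well_behaved': True},
    {'name': 'Bob', 'age': 30, 'well_behaved': True},
]
get_gifts(people)
# -> ['Alice']
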