Example #1
# imports assumed from usage (curried map/filter/get, as elsewhere on this page)
from cytoolz.curried import filter, get, get_in, map, pipe


def _get_wf_call_failures(metadata, opts):
    calls = []
    if 'calls' in opts:
        calls = opts['calls'].split(',')
    else:
        calls = metadata['calls'].keys()

    jobids = None
    if 'jobids' in opts:
        jobids = set(opts['jobids'].split(','))

    fails = {}

    for c in calls:
        tasks = metadata['calls'][c]
        failures = pipe(
            tasks,
            filter(lambda x: get('executionStatus', x) == 'Failed'),
            filter(lambda x: _valid_job_id(jobids, get('jobId', x))),
            map(
                lambda x: {
                    'jobId': get('jobId', x),
                    # 'inputs': get('inputs', x),
                    'stderr': get('stderr', x),
                    'shard': get('shardIndex', x),
                    'err_msg': get_in(['failures', 0, 'message'], x, 'NA'),
                    # 'jes': get('jes', x),
                    # 'runtime': get('runtimeAttributes', x),
                    'rc': get('returnCode', x, 'NA'),
                }),
            list)
        fails[c] = failures

    return fails
Example #2
    def find_domain_urls(self, domain: str) -> List[str]:
        """
        Get all known urls for domain.

        Returns
        -------
        all_urls : list of str
        """
        def _urlkey_to_url(urlkey):
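            # CDX urlkeys are in SURT form, e.g. 'com,example)/path':
            # split off the path, then reverse the comma-separated host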
            try:
                # very rare bugged urlkeys appear
                domain, path = urlkey.split(')/', 1)
            except ValueError:
                return
            domain = domain.split(',')
            domain.reverse()
            domain = '.'.join(domain)
            if path:
                return '/'.join([domain, path])
            return domain

        urls_by_index = map(
            lambda ind: self.__get_domain_urls_in_index(ind, domain),
            self.indexes)
        all_urls = pipe(urls_by_index, concat, map(bytes.decode),
                        map(_urlkey_to_url), filter(None), map(unquote),
                        map(lambda x: x.strip()), unique, list)
        return all_urls
Example #3
    def __call__(self, epoch):
        cyclic = 1.0
        phase = epoch % self.period
        turn_phase, ratio = self.turning_point
        turn_cyclic = self.min_factor + self.range * ratio

        if phase <= turn_phase:
            cyclic = (
                self.min_factor +
                (turn_cyclic - self.min_factor) *
                phase / turn_phase
            )
        else:
            cyclic = (
                turn_cyclic +
                (self.max_factor - turn_cyclic) *
                (phase - turn_phase) / (self.period - turn_phase)
            )

        gamma = pipe(
            self.milestones,
            filter(lambda x: x[0] <= epoch),
            map(lambda x: x[1]),
            last
        )
        return cyclic * gamma
Example #4
def get_hashtag_string(given_item):
    """Return a string of hashtags associated with the given item"""
    return tz.pipe(
        tz.get_in(['entities', 'hashtags'], given_item, default=[]),
        tz.map(lambda x: tz.get_in(['text'], x, default=None)),
        tz.filter(lambda x: x is not None),
        lambda x: ", ".join(x))
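A quick sanity check, assuming tz is toolz.curried and using a hand-built tweet dict (hypothetical data):

sample_tweet = {'entities': {'hashtags': [{'text': 'python'}, {'text': 'toolz'}]}}
assert get_hashtag_string(sample_tweet) == 'python, toolz'
assert get_hashtag_string({}) == ''  # missing keys fall back to the empty default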
Example #5
    def serde_with_class(cls):
        from_fields = list(
            map(lambda a: (a, get_in([from_key], a.metadata, [a.name])),
                fields(cls)))

        to_fields = pipe(
            fields(cls),
            map(lambda a: (a, get_in([to_key], a.metadata))),
            filter(lambda f: f[1]),
            list,
        )

        def from_dict(d):
            return cls(**dict(
                map(
                    lambda f: (f[0].name, get_in(f[1], d, f[0].default)),
                    from_fields,
                )))

        def to_dict(self):
            d = asdict(self)
            return reduce(
                lambda acc, f: update_in(acc, f[1], lambda _: d[f[0].name]),
                to_fields,
                {},
            )

        cls.from_dict = staticmethod(from_dict)
        cls.to_dict = to_dict
        return cls
Example #6
        def to_dict(self,
                    convert_values: bool = False) -> MutableMapping[str, Any]:
            to_fields = curried.pipe(
                fields(self.__class__),
                curried.map(lambda a:
                            (a, curried.get_in([to_key], a.metadata))),
                curried.filter(lambda f: f[1]),
                list,
            )

            if convert_values:
                d = asdict(self)
            else:
                d = {
                    a.name: getattr(self, a.name)
                    for a in fields(self.__class__)
                }

            if not to_fields:
                return d

            return curried.reduce(
                lambda acc, f: curried.update_in(acc, f[1],
                                                 lambda _: d[f[0].name]),
                to_fields,
                {},
            )
Example #7
def get_categories(given_dict):
    """Return a string of the categories associated with a post"""
    return tz.pipe(
        tz.get_in(['object', 'tags'], given_dict, default=[]),
        tz.filter(lambda x: tz.get_in(['objectType'], x, default=None) == 'category'),
        tz.map(lambda x: tz.get_in(['displayName'], x, default=None)),
        lambda x: ", ".join(x)
    )
Example #8
# imports assumed from usage (curried thread_last pipeline)
from cytoolz.curried import filter, map, mapcat, thread_last


def parse_format_assignments(txt):
    assignments = thread_last(
        txt.split(';'),
        filter(lambda x: x.strip().lower().startswith('format')),
        mapcat(lambda x: x.lower().split('.')),
        map(lambda x: x.split()),  # break out vars and format
        (mapcat, lambda y: [(k, y[-1]) for k in y]),  # tuple of var, fmt
        dict
    )
    return assignments
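Worked on a SAS-style format statement (hypothetical input): every token in the statement maps to the trailing format name, so the 'format' keyword itself also lands in the dict:

assert parse_format_assignments("format q1 q2 yn.;") == \
    {'format': 'yn', 'q1': 'yn', 'q2': 'yn', 'yn': 'yn'}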
Example #9
    def __init__(
        self,
        id,
        dataset_dir,
        output_dir,
        n_splits,
        base_train_config,
        folds,
    ):
        params = locals()
        torch.manual_seed(0)

        ids = pipe(range(n_splits), filter(lambda x: x in folds), list)

        train_df_path = delayed(load_train_df)(
            dataset_dir=join(dataset_dir, 'train'),
            output=join(output_dir, 'train.pqt'))

        train_df = delayed(pd.read_parquet)(train_df_path)

        kfolded = delayed(kfold)(train_df, n_splits)

        train_sets = pipe(ids, map(lambda x: delayed(lambda i: i[x])(kfolded)),
                          list)

        model_paths = pipe(
            zip(ids, train_sets),
            map(lambda x: delayed(train_fusion)(
                **base_train_config,
                model_path=join(output_dir, f"{id}-fold-{x[0]}-base-model.pt"),
                sets=x[1],
                log_dir=f'{config["TENSORBORAD_LOG_DIR"]}/{id}/{x[0]}/base',
            )), list)

        test_df_path = load_test_df(dataset_dir='/store/tellus/test',
                                    output=join(output_dir, 'test.pqt'))
        test_df = delayed(pd.read_parquet)(test_df_path)
        test_dataset = delayed(TellusDataset)(
            test_df,
            has_y=False,
        )

        submission_df_path = delayed(predict)(
            model_paths=model_paths,
            log_dir=f'{config["TENSORBORAD_LOG_DIR"]}/{id}/sub',
            dataset=test_dataset,
            log_interval=10,
            out_path=f'{output_dir}/{id}_submission.tsv',
        )

        self.output = delayed(lambda x: x)((
            model_paths,
            submission_df_path,
        ))
Example #10
import re

# imports assumed from usage (curried thread_last pipeline)
from cytoolz.curried import filter, map, mapcat, thread_last


def parse_questions(txt):
    rqt = re.compile(r'[\"\']')  # match quote chars
    assignments = thread_last(
        txt.split(';'),
        filter(lambda x: x.strip().lower().startswith('label')),
        mapcat(lambda x: x.lower().split('\n')),
        map(lambda x: x.split('=')),  # break out vars and format
        (map, lambda y: (y[0].strip().lower(), rqt.sub('', y[1].strip()))),  # tuple of var, fmt
        dict
    )
    return assignments
Example #11
def test_kfold():
    output = load_train_df(
        dataset_dir='/store/tellus/train',
        output='/store/tmp/train.pqt'
    )
    df = pd.read_parquet(output)
    sets = kfold(df, n_splits=10)
    for s in sets:
        assert pipe(
            s['train_pos'],
            take(100),
            map(lambda x: x['label']),
            filter(lambda x: x == 0),
            list,
            len
        ) == 0
        assert pipe(
            s['val_pos'],
            take(100),
            map(lambda x: x['label']),
            filter(lambda x: x == 0),
            list,
            len
        ) == 0
        assert pipe(
            s['train_neg'],
            take(100),
            map(lambda x: x['label']),
            filter(lambda x: x == 1),
            list,
            len
        ) == 0
        assert pipe(
            s['val_neg'],
            take(100),
            map(lambda x: x['label']),
            filter(lambda x: x == 1),
            list,
            len
        ) == 0
        assert len(s) == 4
Example #12
def parse_variable_labels(txt, repl, lbls_to_lower=True):
    b2d = curry(block2dict)(repl=repl, to_lower=lbls_to_lower)
    labels = thread_last(
        txt.split(';'),
        filter(lambda x: x.strip().lower().startswith('value')),
        map(lambda x: x.strip().split('\n')),
        map(lambda x: (x[0].split()[1].lower(), b2d(x[1:]))),
        dict
    )
    logger.info('parsed varlabels from format txt',
                nlabeled=len(labels.keys()), nrepl=len(repl.keys()))
    return labels
Example #13
import re

# imports assumed from usage (curried functions plus identity/curry)
from cytoolz.curried import curry, filter, identity, map, mapcat, pipe, thread_last


def block2dict(lines, repl, to_lower=False):
    f_lwr = str.lower if to_lower else identity
    f_repl = curry(lambda k, r: r[k] if k in r else k)(r=repl)
    rqt = re.compile(r'[\"\']')  # match quote chars
    rws = re.compile(r'\s')  # match whitespace
    # keep only alnum and a few unreserved symbols
    ruri = re.compile(r'(?![\w\s\-\_\.\'\$\-\+\(\)\/]|\.).')
    d = thread_last(
        lines,
        map(lambda x: x.replace('\x92', "'")),
        map(lambda x: rqt.sub('', x.strip()).split('=')),
        map(lambda x: (rws.sub('', x[0].strip()), ruri.sub('', x[1].strip()))),
        filter(lambda x: x[0].find('-') == -1),  # no support for ranges
        (mapcat, lambda x: map(lambda y: (y, x[1]), x[0].split(','))),
        filter(lambda x: x[0].isnumeric()),  # remove non-numeric codes
        map(lambda x: (int(x[0]),  # cat codes are ints
                       pipe(x[1], f_lwr, f_repl))),
        dict
    )
    # d[-1] = np.nan #use NA as a marker for unmapped vals
    return d
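For illustration, a hand-written value block (hypothetical input) maps numeric codes to cleaned labels:

lines = ["1='Yes'", "2='No'"]
assert block2dict(lines, repl={}, to_lower=True) == {1: 'yes', 2: 'no'}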
Example #14
def connect_to_twitter_filtered_stream(stream_key, saving_function):
    """Connect to & consume a filtered Twitter stream, where Twitter does
    some of the filtering"""
    stream = tz.pipe(
        ## Connect
        start_stream_twitter(**CONFIG['twitter_filter']),
        tz.map(print_twitter_stall_warning),
        ## Filter
        tz.filter(is_tweet),  # filter to tweets
        ## Parse
        tz.map(parse_tweet),  # parse into a flat dictionary
    )

    ## Collect
    saving_function(stream_key, stream)
Example #15
def connect_to_twitter_stream(stream_key, saving_function):
    """Connect to & consume a Twitter stream"""
    stream = tz.pipe(
        ## Connect
        start_stream_twitter(),  # public sampled stream
        tz.map(print_twitter_stall_warning),
        ## Filter
        tz.filter(is_tweet),  # filter to tweets
        # tz.filter(is_user_lang_tweet(["en", "en-AU", "en-au", "en-GB", "en-gb"])),  # filter to English
        ## Parse
        tz.map(parse_tweet),  # parse into a flat dictionary
    )

    ## Collect
    saving_function(stream_key, stream)
Example #16
    def get_url_location(self, url: str) -> Optional[Dict]:
        """
        Get html location in index for url.
        """
        params = {
            'url': url,
            'output': 'json',
            'closest': self._cur_ts(),
            'filter': '!status:404',
            'fl': 'filename,length,offset,status,timestamp'
        }
        locations = pipe(self.indexes,
                         map(lambda index: self.__locate_url(index, params)),
                         filter(None), concat, list)
        if locations:
            location = self.__locate_most_relevant_location(locations)
            return location
        return None
Example #17
# imports assumed from usage (curried cytoolz functions, as elsewhere on this page)
from cytoolz.curried import frequencies, get, map, merge, pipe, valmap


def _get_wf_call_statuses(metadata):
    calls = metadata['calls'].keys()
    states = set()
    call_stats = {}

    for c in calls:
        tasks = metadata['calls'][c]
        counts = pipe(tasks, map(get('executionStatus')), frequencies)
        states.update(counts.keys())
        call_stats[c] = counts

    base_states = {s: 0 for s in states}

    final_stats = valmap(lambda d: merge(base_states, d), call_stats)
    return (calls, sorted(states), final_stats)
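To make the return shape concrete, a minimal sketch with hand-built Cromwell-style metadata (hypothetical values):

metadata = {'calls': {'wf.align': [{'executionStatus': 'Done'},
                                   {'executionStatus': 'Failed'}],
                      'wf.sort': [{'executionStatus': 'Done'}]}}
calls, states, stats = _get_wf_call_statuses(metadata)
# states == ['Done', 'Failed']; every call gets a zero-filled count for each state:
# stats == {'wf.align': {'Done': 1, 'Failed': 1},
#           'wf.sort': {'Done': 1, 'Failed': 0}}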
Example #18
    def __call__(self, epoch):
        phase = epoch % self.period
        turn_cyclic = self.min_factor + self.range

        cyclic = (
            self.min_factor +
            (turn_cyclic - self.min_factor) *
            phase
        )

        gamma = pipe(
            self.milestones,
            filter(lambda x: x[0] <= epoch),
            map(lambda x: x[1]),
            last
        )
        return cyclic * gamma
Example #19
def test_esampler():

    output = load_train_df(
        dataset_dir='/store/tellus/train',
        output='/store/tmp/train.pqt'
    )
    df = pd.read_parquet(output)
    dataset = TellusDataset(
        df=df,
        has_y=True,
    )
    subset = Subset(
        dataset,
        list(range(1500, 1600))
    )

    epoch_size = 10
    s = ChunkSampler(
        epoch_size=epoch_size,
        len_indices=len(subset),
        shuffle=True,
    )

    batch_size = 2
    train_loader = DataLoader(
        subset,
        sampler=s,
        batch_size=batch_size,
        pin_memory=True,
    )
    for i in range(11):
        samples = pipe(
            train_loader,
            map(lambda x: x['id']),
            filter(lambda x: len(x) == batch_size),
            list
        )
        assert len(samples) == epoch_size//batch_size
Example #20
from cytoolz.curried import (compose, filter, get, groupby, map, pipe, pluck,
                             valmap)

accounts = [
    (1, 'Alice', 100, 'F'),  # id, name, balance, gender
    (2, 'Bob', 200, 'M'),
    (3, 'Charlie', 150, 'M'),
    (4, 'Dennis', 50, 'M'),
    (5, 'Edith', 300, 'F')
]

# I. SELECTING WITH `MAP()` AND `FILTER()`
# SELECT name, balance FROM accounts WHERE balance > 150

# Functional version with pipeline and curry
acc1 = pipe(accounts, filter(lambda account: account[2] > 150),
            map(get([1, 2])), list)
print(acc1)

# List comprehensions version (more Pythonic):
acc2 = [(name, balance) for (id, name, balance, gender) in accounts
        if balance > 150]
print(acc2)

# II. SPLIT-APPLY-COMBINE WITH `GROUPBY` AND `REDUCEBY`:
# 1. Split the dataset into groups by some property
# 2. Reduce each of the groups with some synopsis function

# In Memory Split-Apply-Combine
# SELECT gender, SUM(balance) FROM accounts GROUP BY gender;
print(groupby(get(3), accounts))
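The excerpt stops after the split step; a minimal sketch of the combine step using reduceby (key, binop, seq, init):

# SELECT gender, SUM(balance) FROM accounts GROUP BY gender;
from cytoolz.curried import reduceby
balances = reduceby(get(3),                         # split on gender
                    lambda acc, row: acc + row[2],  # sum balances
                    accounts, 0)
print(balances)  # {'F': 400, 'M': 400}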
Example #21
#!/usr/bin/env python

import cytoolz.curried as cc
from pprint import pprint as pp
import sys

data_input = cc.pipe(sys.stdin.readlines(),
                     cc.map(lambda x: x.replace('\n', '')), list)


def has_no_duplicate(x):
    return len(set(x)) == len(x)


answer = cc.pipe(data_input, cc.map(str.split), cc.filter(has_no_duplicate),
                 list, len)

pp(answer)
Example #22
import us
import pandas as pd
import numpy as np
from cytoolz.itertoolz import unique
from cytoolz.functoolz import thread_last, identity
from cytoolz.curried import map, filter, curry
from survey_stats import pdutil
# import sys
# import traceback as tb

from survey_stats import log

logger = log.getLogger(__name__)

US_STATES_FIPS_INTS = thread_last(us.STATES_AND_TERRITORIES,
                                  map(lambda x: x.fips),
                                  filter(lambda x: x is not None),
                                  map(lambda x: int(x)), list)

SITECODE_TRANSLATORS = {
    'fips':
    lambda x: (us.states.lookup('%.2d' % x).abbr
               if int(x) in US_STATES_FIPS_INTS else 'NA'),
    'codes':
    identity
}

SVYDESIGN_COLS = ['sitecode', 'strata', 'psu', 'weight']


def convert_cat_codes(s, fmt):
    unq_lvls = list(unique([fmt[k] for k in sorted(fmt.keys())]))
Example #23
    cc.map(lambda x: x.split('->')),
    cc.map(lambda x: (x[0], [] if len(x) == 1 else cc.pipe(
        x[1], lambda x: x.split(','), cc.map(str.strip), list))), list)

tree_val_dict = cc.pipe(
    data_input, cc.map(cc.first),
    cc.map(lambda x: [tree_val_re.match(x).group(y) for y in (1, 2)]), dict,
    cc.valmap(int))

tree_mapping_dict = cc.pipe(
    data_input, cc.map(lambda x: (tree_val_re.match(x[0]).group(1), x[1])),
    dict)

root = cc.pipe(
    tree_mapping_dict.keys(),
    cc.filter(lambda x: x not in cc.concat(tree_mapping_dict.values())),
    cc.first)

tree = Tree(root, tree_mapping_dict, tree_val_dict)

unbalanced = tree.find_unbalanced()
unbalanced_self_weight = unbalanced.weight - sum(x.weight
                                                 for x in unbalanced.children)
unbalanced_grouped_siblings = unbalanced.grouped('siblings')

balanced_weight = cc.first(
    cc.valfilter(lambda x: len(x) > 1, unbalanced_grouped_siblings).keys())
unbalanced_weight = cc.first(
    cc.valfilter(lambda x: len(x) == 1, unbalanced_grouped_siblings).keys())
weight_offset = balanced_weight - unbalanced_weight
Example #24
    def find_mismatched_levels(self):
        return pipe(self.meta.qns[ID_COLUMN], set, map(self.compare_levels),
                    filter(lambda x: set(x['surveys']) != set(x['socrata'])))
Example #25
File: graph.py Project: 908kre/aplf
    def __init__(
        self,
        id,
        dataset_dir,
        output_dir,
        n_splits,
        base_train_config,
        fine_train_config,
        top_num,
        folds,
    ):
        params = locals()

        ids = pipe(range(n_splits), list)

        dataset_df = delayed(load_dataset_df)(dataset_dir, 'train.csv')
        dataset = delayed(TgsSaltDataset)(
            dataset_df,
            has_y=True,
        )

        kfolded = delayed(kfold)(dataset, n_splits)

        train_sets = pipe(
            range(n_splits),
            map(lambda idx: delayed(lambda x: x[idx][0])(kfolded)),
            map(lambda x: delayed(Subset)(dataset, x)), list)

        seg_sets = pipe(
            train_sets, map(delayed(lambda x: x.indices)),
            map(lambda x: delayed(get_segment_indices)(dataset, x)),
            map(lambda x: delayed(Subset)(dataset, x)), list)

        val_sets = pipe(range(n_splits),
                        map(lambda idx: delayed(lambda x: x[idx][1])(kfolded)),
                        map(lambda x: delayed(Subset)(dataset, x)), list)

        predict_dataset_df = delayed(load_dataset_df)(dataset_dir,
                                                      'sample_submission.csv')

        predict_set = delayed(TgsSaltDataset)(predict_dataset_df, has_y=False)
        trains = pipe(zip(ids, train_sets, seg_sets, val_sets),
                      filter(lambda x: x[0] in folds), list)

        model_paths = pipe(
            trains,
            map(lambda x: delayed(base_train)(
                **base_train_config,
                model_path=f"{output_dir}/id-{id}-fold-{x[0]}-base-model.pt",
                train_set=x[1],
                seg_set=x[2],
                val_set=x[3],
                no_lable_set=predict_set,
                log_dir=f'{config["TENSORBORAD_LOG_DIR"]}/{id}/{x[0]}/base',
            )), list)

        #  model_paths = pipe(
        #      zip(trains, model_paths),
        #      map(lambda x: delayed(fine_train)(
        #          **fine_train_config,
        #          base_model_path=x[1],
        #          model_path=f"{output_dir}/id-{id}-fold-{x[0][0]}-fine-model.pt",
        #          train_set=x[0][1],
        #          seg_set=x[0][2],
        #          val_set=x[0][3],
        #          no_lable_set=predict_set,
        #          log_dir=f'{config["TENSORBORAD_LOG_DIR"]}/{id}/{x[0][0]}/fine',
        #      )),
        #      list
        #  )
        #
        submission_df = delayed(predict)(
            model_paths=model_paths,
            log_dir=f'{config["TENSORBORAD_LOG_DIR"]}/{id}/sub',
            dataset=predict_set,
            log_interval=10,
            hdf5_path=f'{output_dir}/{id}.hdf5')
        #
        submission_df = delayed(lambda df: df[['rle_mask']])(submission_df)
        submission_file = delayed(
            lambda df: df.to_csv(f"{output_dir}/id-{id}-submission.csv"))(
                submission_df, )

        self.output = delayed(lambda x: x)((
            model_paths,
            submission_df,
            submission_file,
        ))
Example #26
import pytest
from mlboard_api import models as ms
from mlboard_api import query as qry
from mlboard_api import create_app
import uuid
from cytoolz.curried import pipe, map, filter
from dateutil.parser import parse
import datetime
from .fixture import app


@pytest.fixture(params=pipe(
    dir(ms),
    map(lambda x: getattr(ms, x)),
    filter(lambda x: type(x).__name__ == 'DeclarativeMeta'),
    list
))
def target(request):
    return request.param


def test_all_table(app, target):
    payload = {
        "target": target.__name__,
        'entities': [],
        "methods": [
            {"name": "limit", "args": [1], "kwargs":{}},
            {"name": "all", "args": [], "kwargs":{}}
        ],
    }
Example #27
#!/usr/bin/env python

import cytoolz.curried as cc
import itertools as it
from pprint import pprint as pp
import sys

data_input = sys.stdin.read().replace('\n', '')
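# pair each digit with the one halfway around the circular input:
# cycle the sequence and drop the first half to get the offset stream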
data_input_midpt = cc.pipe(data_input, it.cycle,
                           cc.drop(int(len(data_input) / 2)))

answer = cc.pipe(zip(data_input, data_input_midpt),
                 cc.filter(lambda x: x[0] == x[1]),
                 cc.map(lambda x: int(x[0])), sum)

pp(answer)
Example #28
#!/usr/bin/env python

import cytoolz.curried as cc
from pprint import pprint as pp
import sys

data_input = sys.stdin.read().replace('\n', '')
data_input += data_input[0]

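# pair each digit with its successor; the first char appended above handles the wrap-around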
answer = cc.pipe(
    ((x for x in data_input), (x for x in cc.drop(1, data_input))),
    lambda x: zip(*x), cc.filter(lambda x: x[0] == x[1]),
    cc.map(lambda x: int(x[0])), sum)

pp(answer)