Example #1
def test_introspect_curry_py3():
    if not PY3:
        return
    f = cytoolz.curry(make_func(''))
    assert num_required_args(f) == 0
    assert is_arity(0, f)
    assert has_varargs(f) is False
    assert has_keywords(f) is False

    f = cytoolz.curry(make_func('x'))
    assert num_required_args(f) == 0
    assert is_arity(0, f) is False
    assert is_arity(1, f) is False
    assert has_varargs(f) is False
    assert has_keywords(f)  # A side-effect of being curried

    f = cytoolz.curry(make_func('x, y, z=0'))
    assert num_required_args(f) == 0
    assert is_arity(0, f) is False
    assert is_arity(1, f) is False
    assert is_arity(2, f) is False
    assert is_arity(3, f) is False
    assert has_varargs(f) is False
    assert has_keywords(f)

    f = cytoolz.curry(make_func('*args, **kwargs'))
    assert num_required_args(f) == 0
    assert has_varargs(f)
    assert has_keywords(f)
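
A minimal sketch (not part of the scraped test) of the currying behaviour these introspection assertions exercise; `add3` stands in for the functions built by `make_func`, and only `cytoolz` is assumed to be installed.

import cytoolz

@cytoolz.curry
def add3(x, y, z=0):
    return x + y + z

step = add3(1)            # partially applied: still a curry object
assert step(2) == 3       # last required positional supplied, so the call completes
assert add3(1, 2, z=10) == 13
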
Example #2
def test_introspect_curry_py3():
    f = cytoolz.curry(make_func(''))
    assert num_required_args(f) == 0
    assert is_arity(0, f)
    assert has_varargs(f) is False
    assert has_keywords(f) is False

    f = cytoolz.curry(make_func('x'))
    assert num_required_args(f) == 0
    assert is_arity(0, f) is False
    assert is_arity(1, f) is False
    assert has_varargs(f) is False
    assert has_keywords(f)  # A side-effect of being curried

    f = cytoolz.curry(make_func('x, y, z=0'))
    assert num_required_args(f) == 0
    assert is_arity(0, f) is False
    assert is_arity(1, f) is False
    assert is_arity(2, f) is False
    assert is_arity(3, f) is False
    assert has_varargs(f) is False
    assert has_keywords(f)

    f = cytoolz.curry(make_func('*args, **kwargs'))
    assert num_required_args(f) == 0
    assert has_varargs(f)
    assert has_keywords(f)
Example #3
def test_introspect_curry_py3():
    if not PY3:
        return
    f = cytoolz.curry(make_func(""))
    assert num_required_args(f) == 0
    assert is_arity(0, f)
    assert has_varargs(f) is False
    assert has_keywords(f) is False

    f = cytoolz.curry(make_func("x"))
    assert num_required_args(f) == 0
    assert is_arity(0, f) is False
    assert is_arity(1, f) is False
    assert has_varargs(f) is False
    assert has_keywords(f)  # A side-effect of being curried

    f = cytoolz.curry(make_func("x, y, z=0"))
    assert num_required_args(f) == 0
    assert is_arity(0, f) is False
    assert is_arity(1, f) is False
    assert is_arity(2, f) is False
    assert is_arity(3, f) is False
    assert has_varargs(f) is False
    assert has_keywords(f)

    f = cytoolz.curry(make_func("*args, **kwargs"))
    assert num_required_args(f) == 0
    assert has_varargs(f)
    assert has_keywords(f)
Example #4
def main(opts):
    print(opts)
    dataset, split = opts.annotation.split('_')
    if split == 'dev':
        txt_db = 'val_txt_db'
    elif split == 'pretrain':
        txt_db = 'train_txt_db'
    else:
        txt_db = f'{split}_txt_db'
    opts.output = os.path.join(
        '/txt', intermediate_dir(opts.pretrained_model_name_or_path),
        getattr(opts, txt_db))

    os.makedirs(opts.output)

    # train_db_dir = os.path.join(os.path.dirname(opts.output), f'{source}_{split}.db')
    # meta = vars(opts)
    # meta['tokenizer'] = opts.toker
    tokenizer = AutoTokenizer.from_pretrained(
        opts.pretrained_model_name_or_path, use_fast=True)

    open_db = curry(open_lmdb, opts.output, readonly=False)
    with open_db() as db:
        id2lens = process(opts, db, tokenizer)

    with open(f'{opts.output}/id2len.json', 'w') as f:
        json.dump(id2lens, f)
Example #5
def main(opts):
    if not exists(opts.output):
        os.makedirs(opts.output)
    else:
        raise ValueError("Found existing DB. Please explicitly remove "
                         "for re-processing")
    meta = vars(opts)
    meta["tokenizer"] = opts.toker
    toker = BertTokenizer.from_pretrained(opts.toker,
                                          do_lower_case="uncased"
                                          in opts.toker)
    tokenizer = bert_tokenize(toker)
    meta["UNK"] = toker.convert_tokens_to_ids(["[UNK]"])[0]
    meta["CLS"] = toker.convert_tokens_to_ids(["[CLS]"])[0]
    meta["SEP"] = toker.convert_tokens_to_ids(["[SEP]"])[0]
    meta["MASK"] = toker.convert_tokens_to_ids(["[MASK]"])[0]
    meta["v_range"] = (toker.convert_tokens_to_ids("!")[0], len(toker.vocab))
    with open(f"{opts.output}/meta.json", "w") as f:
        json.dump(vars(opts), f, indent=4)

    open_db = curry(open_lmdb, opts.output, readonly=False)
    with open_db() as db:
        with open(opts.annotation) as ann:
            if opts.missing_imgs is not None:
                missing_imgs = set(json.load(open(opts.missing_imgs)))
            else:
                missing_imgs = None
            id2lens, txt2img = process_nlvr2(ann, db, tokenizer, missing_imgs)

    with open(f"{opts.output}/id2len.json", "w") as f:
        json.dump(id2lens, f)
    with open(f"{opts.output}/txt2img.json", "w") as f:
        json.dump(txt2img, f)
Example #6
def main(opts):
    if not exists(opts.output):
        os.makedirs(opts.output)
    else:
        raise ValueError('Found existing DB. Please explicitly remove '
                         'for re-processing')
    meta = vars(opts)
    meta['tokenizer'] = opts.toker
    toker = BertTokenizer.from_pretrained(opts.toker,
                                          do_lower_case='uncased'
                                          in opts.toker)

    tokenizer = bert_tokenize(toker)
    meta['UNK'] = toker.convert_tokens_to_ids(['[UNK]'])[0]
    meta['CLS'] = toker.convert_tokens_to_ids(['[CLS]'])[0]
    meta['SEP'] = toker.convert_tokens_to_ids(['[SEP]'])[0]
    meta['MASK'] = toker.convert_tokens_to_ids(['[MASK]'])[0]
    meta['v_range'] = (toker.convert_tokens_to_ids('!')[0], len(toker.vocab))
    with open(f'{opts.output}/meta.json', 'w') as f:
        json.dump(vars(opts), f, indent=4)

    open_db = curry(open_lmdb, opts.output, readonly=False)
    with open_db() as db:
        with open(opts.annotation) as ann:
            if opts.missing_imgs is not None:
                missing_imgs = set(json.load(open(opts.missing_imgs)))
            else:
                missing_imgs = None
            id2lens, txt2img = process_nlvr2(ann, db, tokenizer, missing_imgs)

    with open(f'{opts.output}/id2len.json', 'w') as f:
        json.dump(id2lens, f)
    with open(f'{opts.output}/txt2img.json', 'w') as f:
        json.dump(txt2img, f)
Example #7
    def X_feature_onehot(self, dataframe: DataFrame) -> DataFrame:
        # fieldgroups[basename] = [ fieldname ]
        # noinspection PyArgumentList
        fieldgroups = groupby(
            curry(re.sub)(r'\d+(st|nd|rd)?$')(''),  # basename
            self.params['X_feature_onehot']  # fieldnames
        )
        encodings = {}
        for basename, fieldnames in fieldgroups.items():
            # NOTE: in theory, unique_values should be hardcoded based on data_description.txt
            #       for Kaggle, we can cheat and just take unique_values from self.data['combined']
            # BUGFIX: running to_X() separately on test/train/validate datasets results in column name mismatches
            unique_values = np.unique(
                self.data['combined'][fieldnames].dropna().values)
            category_dtype = CategoricalDtype(categories=unique_values)

            for fieldname in fieldnames:
                dataframe[fieldname] = dataframe[fieldname].astype(
                    category_dtype)
                onehot = pd.get_dummies(dataframe[fieldname],
                                        prefix=basename,
                                        prefix_sep='_')
                if basename not in encodings:
                    encodings[basename] = onehot
                else:
                    # combine one-hot frames that share a basename (bitwise AND)
                    encodings[basename] = onehot & encodings[basename]

        # Add additional onehot columns to dataframe
        for basename, onehot in encodings.items():
            dataframe = dataframe.join(onehot)

        # Mark original categorical columns for exclusion
        self.params['X_feature_exclude'] += self.params['X_feature_onehot']
        return dataframe
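
A small illustrative sketch of the grouping trick above: a curried `re.sub` with pattern and replacement pre-bound acts as the `groupby` key that strips trailing ordinal suffixes. The field names are placeholders in the spirit of the Kaggle columns, not taken from the original data.

import re
from cytoolz import curry, groupby

basename = curry(re.sub)(r'\d+(st|nd|rd)?$', '')   # strip a trailing "1st"/"2nd"/...
groups = groupby(basename, ['Exterior1st', 'Exterior2nd', 'PoolQC'])
assert groups == {'Exterior': ['Exterior1st', 'Exterior2nd'], 'PoolQC': ['PoolQC']}
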
Example #8
def curry_namespace(ns):
    return dict(
        (
            name,
            curry(f) if should_curry(f) else f,
        )
        for name, f in ns.items()
        if '__' not in name
    )
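
A hedged sketch of the namespace-currying idiom above; `should_curry` and `ns` are illustrative stand-ins, since the real predicate and namespace are not shown here.

import cytoolz

def should_curry(f):
    # illustrative predicate only; the real one is more selective
    return callable(f)

ns = {'scale': lambda x, factor: x * factor, '__doc__': None}
curried = {name: cytoolz.curry(f) if should_curry(f) else f
           for name, f in ns.items() if '__' not in name}
assert curried['scale'](factor=3)(4) == 12
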
Example #9
def find_background_illumination(fns,
                                 radius=None,
                                 input_bitdepth=None,
                                 quantile=0.5,
                                 stretch_quantile=0.):
    """Use a set of related images to find uneven background illumination.

    Parameters
    ----------
    fns : list of string
        A list of image file names
    radius : int, optional
        The radius of the structuring element used to find background.
        default: The width or height of the input images divided by 4,
        whichever is smaller.
    input_bitdepth : int, optional
        The bit-depth of the input images. Should be specified if non-standard
        bitdepth images are used in a 16-bit image file, e.g. 12-bit images.
        Default is the dtype of the input image.
    quantile : float in [0, 1], optional
        The desired quantile to find background. default: 0.5 (median)
    stretch_quantile : float in [0, 1], optional
        Stretch image to full dtype limit, saturating above this quantile.

    Returns
    -------
    illum : np.ndarray, float, shape (M, N)
        The estimated illumination over the image field.

    See Also
    --------
    `correct_image_illumination`, `correct_multiimage_illumination`.
    """
    # this function follows the "PyToolz" streaming data model to
    # obtain the illumination estimate.
    # first, define the functions for each individual step:
    in_range = ('image' if input_bitdepth is None else
                (0, 2**input_bitdepth - 1))
    rescale = tz.curry(exposure.rescale_intensity)
    normalize = (tz.partial(stretchlim, bottom=stretch_quantile)
                 if stretch_quantile > 0 else skimage.img_as_float)

    # produce a stream of properly-scaled images
    ims = (tz.pipe(fn, io.imread, rescale(in_range=in_range), normalize)
           for fn in fns)

    # take the mean of that stream
    mean_image = mean(ims)

    # return the median filter of that mean
    radius = radius or min(mean_image.shape) // 4
    illum = ndi.percentile_filter(mean_image,
                                  percentile=(quantile * 100),
                                  footprint=morphology.disk(radius))
    return illum
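
A minimal sketch of the `tz.pipe` streaming pattern the comments describe, with toy list-based stand-ins for the real image-processing steps; only toolz/cytoolz is assumed.

import cytoolz as tz

rescale = tz.curry(lambda image, in_range: [v - in_range[0] for v in image])
normalize = lambda image: [v / max(image) for v in image]

# pipe threads the value through each callable; the curried step has its
# keyword argument bound before any data arrives
result = tz.pipe([2, 4, 6], rescale(in_range=(2, 6)), normalize)
assert result == [0.0, 0.5, 1.0]
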
Example #10
def test_introspect_curry_valid_py3(check_valid=is_valid_args,
                                    incomplete=False):
    if not PY3:
        return
    orig_check_valid = check_valid
    check_valid = lambda _func, *args, **kwargs: orig_check_valid(
        _func, args, kwargs)

    f = cytoolz.curry(make_func("x, y, z=0"))
    assert check_valid(f)
    assert check_valid(f, 1)
    assert check_valid(f, 1, 2)
    assert check_valid(f, 1, 2, 3)
    assert check_valid(f, 1, 2, 3, 4) is False
    assert check_valid(f, invalid_keyword=True) is False
    assert check_valid(f(1))
    assert check_valid(f(1), 2)
    assert check_valid(f(1), 2, 3)
    assert check_valid(f(1), 2, 3, 4) is False
    assert check_valid(f(1), x=2) is False
    assert check_valid(f(1), y=2)
    assert check_valid(f(x=1), 2) is False
    assert check_valid(f(x=1), y=2)
    assert check_valid(f(y=2), 1)
    assert check_valid(f(y=2), 1, z=3)
    assert check_valid(f(y=2), 1, 3) is False

    f = cytoolz.curry(make_func("x, y, z=0"), 1, x=1)
    assert check_valid(f) is False
    assert check_valid(f, z=3) is False

    f = cytoolz.curry(make_func("x, y, *args, z"))
    assert check_valid(f)
    assert check_valid(f, 0)
    assert check_valid(f(1), 0)
    assert check_valid(f(1, 2), 0)
    assert check_valid(f(1, 2, 3), 0)
    assert check_valid(f(1, 2, 3, 4), 0)
    assert check_valid(f(1, 2, 3, 4), z=4)
    assert check_valid(f(x=1))
    assert check_valid(f(x=1), 1) is False
    assert check_valid(f(x=1), y=2)
Example #11
def find_background_illumination(fns, radius=None, input_bitdepth=None,
                                 quantile=0.5, stretch_quantile=0.):
    """Use a set of related images to find uneven background illumination.

    Parameters
    ----------
    fns : list of string
        A list of image file names
    radius : int, optional
        The radius of the structuring element used to find background.
        default: The width or height of the input images divided by 4,
        whichever is smaller.
    input_bitdepth : int, optional
        The bit-depth of the input images. Should be specified if non-standard
        bitdepth images are used in a 16-bit image file, e.g. 12-bit images.
        Default is the dtype of the input image.
    quantile : float in [0, 1], optional
        The desired quantile to find background. default: 0.5 (median)
    stretch_quantile : float in [0, 1], optional
        Stretch image to full dtype limit, saturating above this quantile.

    Returns
    -------
    illum : np.ndarray, float, shape (M, N)
        The estimated illumination over the image field.

    See Also
    --------
    `correct_image_illumination`, `correct_multiimage_illumination`.
    """
    # this function follows the "PyToolz" streaming data model to
    # obtain the illumination estimate.
    # first, define the functions for each individual step:
    in_range = ('image' if input_bitdepth is None
                else (0, 2**input_bitdepth - 1))
    rescale = tz.curry(exposure.rescale_intensity)
    normalize = (tz.partial(stretchlim, bottom=stretch_quantile)
                 if stretch_quantile > 0
                 else skimage.img_as_float)

    # produce a stream of properly-scaled images
    ims = (tz.pipe(fn, io.imread, rescale(in_range=in_range), normalize)
           for fn in fns)

    # take the mean of that stream
    mean_image = mean(ims)

    # return the median filter of that mean
    radius = radius or min(mean_image.shape) // 4

    mean_image = img_as_ubyte(stretchlim(mean_image))
    illum = imfilter.rank.median(mean_image, selem=morphology.disk(radius))
    return illum
Example #12
def main(opts):
    if not exists(opts.output):
        os.makedirs(opts.output)
    else:
        print(opts.output)
        raise ValueError('Found existing DB. Please explicitly remove '
                         'for re-processing')
    meta = vars(opts)
    meta['tokenizer'] = opts.toker
    toker = RobertaTokenizer.from_pretrained(opts.toker)
    tokenizer = roberta_tokenize(toker)
    meta['BOS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['EOS'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['SEP'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['CLS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['PAD'] = toker.convert_tokens_to_ids(['<pad>'])[0]
    meta['MASK'] = toker.convert_tokens_to_ids(['<mask>'])[0]
    meta['UNK'] = toker.convert_tokens_to_ids(['<unk>'])[0]
    meta['v_range'] = (toker.convert_tokens_to_ids(['.'])[0],
                       toker.convert_tokens_to_ids(['<|endoftext|>'])[0] + 1)
    with open(f'{opts.output}/meta.json', 'w') as f:
        json.dump(vars(opts), f, indent=4)

    open_cap_db = curry(open_lmdb, f"{opts.output}/cap.db", readonly=False)
    open_clip_db = curry(open_lmdb, f"{opts.output}/clip.db", readonly=False)
    with open_cap_db() as cap_db, open_clip_db() as clip_db:
        with open(opts.annotation) as ann, open(opts.subtitles) as sub:
            (id2lens, cap2vid, clip2vid, vid2caps,
             vid2clips) = process_tvc(ann, sub, cap_db, clip_db, tokenizer)

    with open(f'{opts.output}/cap.db/id2len.json', 'w') as f:
        json.dump(id2lens, f)
    with open(f'{opts.output}/cap.db/cap2vid.json', 'w') as f:
        json.dump(cap2vid, f)
    with open(f'{opts.output}/clip.db/clip2vid.json', 'w') as f:
        json.dump(clip2vid, f)
    with open(f'{opts.output}/cap.db/vid2caps.json', 'w') as f:
        json.dump(vid2caps, f)
    with open(f'{opts.output}/clip.db/vid2clips.json', 'w') as f:
        json.dump(vid2clips, f)
Example #13
def test_introspect_curry_valid_py3(check_valid=is_valid_args, incomplete=False):
    if not PY3:
        return
    orig_check_valid = check_valid
    check_valid = lambda _func, *args, **kwargs: orig_check_valid(_func, args, kwargs)

    f = cytoolz.curry(make_func('x, y, z=0'))
    assert check_valid(f)
    assert check_valid(f, 1)
    assert check_valid(f, 1, 2)
    assert check_valid(f, 1, 2, 3)
    assert check_valid(f, 1, 2, 3, 4) is False
    assert check_valid(f, invalid_keyword=True) is False
    assert check_valid(f(1))
    assert check_valid(f(1), 2)
    assert check_valid(f(1), 2, 3)
    assert check_valid(f(1), 2, 3, 4) is False
    assert check_valid(f(1), x=2) is False
    assert check_valid(f(1), y=2)
    assert check_valid(f(x=1), 2) is False
    assert check_valid(f(x=1), y=2)
    assert check_valid(f(y=2), 1)
    assert check_valid(f(y=2), 1, z=3)
    assert check_valid(f(y=2), 1, 3) is False

    f = cytoolz.curry(make_func('x, y, z=0'), 1, x=1)
    assert check_valid(f) is False
    assert check_valid(f, z=3) is False

    f = cytoolz.curry(make_func('x, y, *args, z'))
    assert check_valid(f)
    assert check_valid(f, 0)
    assert check_valid(f(1), 0)
    assert check_valid(f(1, 2), 0)
    assert check_valid(f(1, 2, 3), 0)
    assert check_valid(f(1, 2, 3, 4), 0)
    assert check_valid(f(1, 2, 3, 4), z=4)
    assert check_valid(f(x=1))
    assert check_valid(f(x=1), 1) is False
    assert check_valid(f(x=1), y=2)
Example #14
def test_funcname_cytoolz():
    @curry
    def foo(a, b, c):
        pass

    assert funcname(foo) == "foo"
    assert funcname(foo(1)) == "foo"

    def bar(a, b):
        return a + b

    c_bar = curry(bar, 1)
    assert funcname(c_bar) == "bar"
Example #15
def main(opts):
    if not exists(opts.output):
        os.makedirs(opts.output)
    # else:
    #     raise ValueError('Found existing DB. Please explicitly remove '
    #                      'for re-processing')
    meta = vars(opts)
    meta['tokenizer'] = opts.toker
    toker = BertTokenizer.from_pretrained(opts.toker,
                                          do_lower_case='uncased'
                                          in opts.toker)
    tokenizer = bert_tokenize(toker)
    meta['UNK'] = toker.convert_tokens_to_ids(['[UNK]'])[0]
    meta['CLS'] = toker.convert_tokens_to_ids(['[CLS]'])[0]
    meta['SEP'] = toker.convert_tokens_to_ids(['[SEP]'])[0]
    meta['MASK'] = toker.convert_tokens_to_ids(['[MASK]'])[0]
    meta['v_range'] = (toker.convert_tokens_to_ids('!')[0], len(toker.vocab))
    with open(f'{opts.output}/meta.json', 'w') as f:
        json.dump(vars(opts), f, indent=4)

    if opts.dataset == "nvlr2":
        open_db = curry(open_lmdb, opts.output, readonly=False)
        with open_db() as db:
            with open(opts.annotation) as ann:
                if opts.missing_imgs is not None:
                    missing_imgs = set(json.load(open(opts.missing_imgs)))
                else:
                    missing_imgs = None
                id2lens, txt2img = process_nlvr2(ann, db, tokenizer,
                                                 missing_imgs)

        with open(f'{opts.output}/id2len.json', 'w') as f:
            json.dump(id2lens, f)
        with open(f'{opts.output}/txt2img.json', 'w') as f:
            json.dump(txt2img, f)
    else:
        train_ann_path = os.path.join(opts.annotation, "train.json")
        train_img_dir = os.path.join(opts.img_dir, "train")
        train_output_dir = f'{opts.output}/train/'

        with open(train_ann_path, "r") as ann_file:
            ann = json.load(ann_file)
            process_vizwiz(ann, tokenizer, train_img_dir, train_output_dir)

        val_ann_path = os.path.join(opts.annotation, "val.json")
        val_img_dir = os.path.join(opts.img_dir, "val")
        val_output_dir = f'{opts.output}/val/'

        with open(val_ann_path) as ann_file:
            ann = json.load(ann_file)
            process_vizwiz(ann, tokenizer, val_img_dir, val_output_dir)
Example #16
def reduce(function, initval=None):
    """
    Curried version of the built-in reduce.

    >>> reduce(lambda x,y: x+y)( [1, 2, 3, 4, 5] )
    15
    >>> reduce(lambda x,y: x+y, initval=10)( [1, 2, 3, 4, 5] )
    25
    """
    if initval is None:
        return cytoolz.curry(__builtin__.reduce)(function)
    else:
        # TODO: Port to cytoolz
        return lambda s: __builtin__.reduce(function, s, initval)
Example #17
def other_than(groups, bools):
    """
    Construct a Series that has booleans indicating the presence of
    something- or someone-else with a certain property within a group.

    Parameters
    ----------
    groups : pandas.Series
        A column with the same index as `bools` that defines the grouping
        of `bools`. The `bools` Series will be used to index `groups` and
        then the grouped values will be counted.
    bools : pandas.Series
        A boolean Series indicating where the property of interest is present.
        Should have the same index as `groups`.

    Returns
    -------
    others : pandas.Series
        A boolean Series with the same index as `groups` and `bools`
        indicating whether there is something- or something-else within
        a group with some property (as indicated by `bools`).

    """
    counts = groups[bools].value_counts()
    merge_col = groups.to_frame(name='right')
    pipeline = tz.compose(
        tz.curry(pd.Series.fillna, value=False), itemgetter('left'),
        tz.curry(pd.DataFrame.merge,
                 right=merge_col,
                 how='right',
                 left_index=True,
                 right_on='right'), tz.curry(pd.Series.to_frame, name='left'))
    gt0 = pipeline(counts > 0)
    gt1 = pipeline(counts > 1)

    return gt1.where(bools, other=gt0)
Example #18
def main(opts):
    if not exists(opts.output):
        os.makedirs(opts.output)
    else:
        raise ValueError('Found existing DB. Please explicitly remove '
                         'for re-processing')
    meta = vars(opts)
    meta['tokenizer'] = opts.toker
    toker = BertTokenizer.from_pretrained(
        opts.toker, do_lower_case='uncased' in opts.toker)
    tokenizer = bert_tokenize(toker)
    meta['UNK'] = toker.convert_tokens_to_ids(['[UNK]'])[0]
    meta['CLS'] = toker.convert_tokens_to_ids(['[CLS]'])[0]
    meta['SEP'] = toker.convert_tokens_to_ids(['[SEP]'])[0]
    meta['MASK'] = toker.convert_tokens_to_ids(['[MASK]'])[0]
    meta['v_range'] = (toker.convert_tokens_to_ids('!')[0],
                       len(toker.vocab))
    with open(f'{opts.output}/meta.json', 'w') as f:
        json.dump(vars(opts), f, indent=4)

    open_db = curry(open_lmdb, opts.output, readonly=False)
    output_field_name = ['id2len', 'txt2img']
    with open_db() as db:
        if opts.task == 'nlvr':
            with open(opts.annotations[0]) as ann:
                if opts.missing_imgs is not None:
                    missing_imgs = set(json.load(open(opts.missing_imgs)))
                else:
                    missing_imgs = None
                jsons = process_nlvr2(
                    ann, db, tokenizer, missing_imgs)
        elif opts.task == 're':
            data = pickle.load(open(opts.annotations[0], 'rb'))
            instances = json.load(open(opts.annotations[1], 'r'))
            iid_to_ann_ids = json.load(
                open(opts.annotations[2], 'r'))['iid_to_ann_ids']
            # dirs/refcoco_testA_bert-base-cased.db -> testA
            img_split = opts.output.split('/')[-1].split('.')[0].split('_')[1]
            jsons = process_referring_expressions(
                data, instances, iid_to_ann_ids,
                db, tokenizer, img_split)
            output_field_name = [
                'id2len', 'images', 'annotations',
                'categories', 'refs']

    for dump, name in zip(jsons, output_field_name):
        with open(f'{opts.output}/{name}.json', 'w') as f:
            json.dump(dump, f)
Example #19
def main(opts):
    if not exists(opts.output):
        os.makedirs(opts.output)
    else:
        raise ValueError('Found existing DB. Please explicitly remove '
                         'for re-processing')
    meta = vars(opts)
    meta['tokenizer'] = opts.toker
    toker = RobertaTokenizer.from_pretrained(
        opts.toker)
    tokenizer = roberta_tokenize(toker)
    meta['BOS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['EOS'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['SEP'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['CLS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['PAD'] = toker.convert_tokens_to_ids(['<pad>'])[0]
    meta['MASK'] = toker.convert_tokens_to_ids(['<mask>'])[0]
    meta['UNK'] = toker.convert_tokens_to_ids(['<unk>'])[0]
    meta['v_range'] = (toker.convert_tokens_to_ids(['.'])[0],
                       toker.convert_tokens_to_ids(['<|endoftext|>'])[0]+1)
    save_json(vars(opts), f'{opts.output}/meta.json', save_pretty=True)

    open_db = curry(open_lmdb, opts.output, readonly=False)
    with open_db() as db:
        with open(opts.annotation, "r") as ann:
            if opts.task == "tvr":
                id2lens, query2video, query_data = process_tvr(
                    ann, db, tokenizer)
            elif opts.task == "tvqa":
                id2lens, query2video, query_data = process_tvqa(
                    ann, db, tokenizer)
            elif opts.task == "violin":
                id2lens, query2video, query_data = process_violin(
                    ann, db, tokenizer)
            else:
                raise NotImplementedError(
                    f"prepro for {opts.task} not implemented")

    save_json(id2lens, f'{opts.output}/id2len.json')
    save_json(query2video, f'{opts.output}/query2video.json')
    save_jsonl(query_data, f'{opts.output}/query_data.jsonl')
Example #20
def main(opts):
    if not exists(opts.output):
        os.makedirs(opts.output)
    else:
        raise ValueError('Found existing DB. Please explicitly remove '
                         'for re-processing')
    meta = vars(opts)
    meta['tokenizer'] = opts.toker
    toker = RobertaTokenizer.from_pretrained(opts.toker)
    tokenizer = roberta_tokenize(toker)
    meta['BOS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['EOS'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['SEP'] = toker.convert_tokens_to_ids(['</s>'])[0]
    meta['CLS'] = toker.convert_tokens_to_ids(['<s>'])[0]
    meta['PAD'] = toker.convert_tokens_to_ids(['<pad>'])[0]
    meta['MASK'] = toker.convert_tokens_to_ids(['<mask>'])[0]
    meta['UNK'] = toker.convert_tokens_to_ids(['<unk>'])[0]
    meta['v_range'] = (toker.convert_tokens_to_ids(['.'])[0],
                       toker.convert_tokens_to_ids(['<|endoftext|>'])[0] + 1)
    save_json(vars(opts), f'{opts.output}/meta.json', save_pretty=True)

    open_db = curry(open_lmdb, opts.output, readonly=False)
    with open_db() as db:
        sub_info_cache_path = f'{opts.output}/sub_info.json'
        try:
            vid2nframe = load_json(opts.vid2nframe)
        except Exception:
            vid2nframe = None
        if not os.path.exists(sub_info_cache_path):
            video2sub_info = load_process_sub_meta(
                opts.annotation, vid2nframe, frame_length=args.frame_length)
            save_json(video2sub_info, sub_info_cache_path)
        else:
            video2sub_info = load_json(sub_info_cache_path)
        with open(opts.annotation) as ann:
            vid2len, vid2max_frame_sub_len = process_tv_subtitles(
                ann, video2sub_info, db, tokenizer, meta['SEP'])

        save_json(vid2len, f'{opts.output}/vid2len.json')
        save_json(vid2max_frame_sub_len,
                  f'{opts.output}/vid2max_frame_sub_len.json')
Example #21
def compare_bytecode(left, right):
    unprefixed_left = remove_0x_prefix(left)
    unprefixed_right = remove_0x_prefix(right)

    sub = curry(re.sub)
    norm_pipeline = compose(
        sub(EMBEDDED_SWARM_HASH_REGEX, SWARM_HASH_REPLACEMENT),
        sub(EMBEDDED_ADDRESS_REGEX, ADDRESS_REPLACEMENT))
    norm_left = norm_pipeline(unprefixed_left)
    norm_right = norm_pipeline(unprefixed_right)

    if len(norm_left) != len(unprefixed_left) or len(norm_right) != len(
            unprefixed_right):
        raise ValueError(
            "Invariant.  Normalized bytecodes are not the correct lengths:" +
            "\n- left  (original)  :" + left + "\n- left  (unprefixed):" +
            unprefixed_left + "\n- left  (normalized):" + norm_left +
            "\n- right (original)  :" + right + "\n- right (unprefixed):" +
            unprefixed_right + "\n- right (normalized):" + norm_right)

    return norm_left == norm_right
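
A hedged sketch of the curried `re.sub` plus `compose` normalisation pipeline used above; the regexes here are simple placeholders, not the real swarm-hash or address patterns.

import re
from cytoolz import compose, curry

sub = curry(re.sub)
# compose applies right-to-left: whitespace is collapsed first, digits masked second
normalize = compose(sub(r'\d+', 'N'), sub(r'\s+', ' '))
assert normalize('a  1  b 22') == 'a N b N'
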
Example #22
# -*- coding: utf-8 -*-
__title__ = 'text2math'
__author__ = 'Steven Cutting'
__author_email__ = '*****@*****.**'
__created_on__ = '02/13/2016'
__copyright__ = "text2math Copyright (C) 2016  Steven Cutting"


import sys

import cytoolz as tlz
c_map = tlz.curry(tlz.map)


# --
# Specific imports

# Parsing
from xml.dom import minidom
from bs4 import BeautifulSoup

# Encoding issues
import cchardet as chardet
from unidecode import unidecode
import ftfy


if sys.version_info[0] < 3:
    _STRINGTYPES = (basestring,)
else:
    # temp fix, so that 2.7 support won't break
Example #23
import cytoolz
from cytoolz import *
from cytoolz.curried_exceptions import *

# Here is the recipe used to create the list below
# (and "cytoolz/tests/test_curried_toolzlike.py" verifies the list is correct):
#
# import toolz
# import toolz.curried
#
# for item in sorted(key for key, val in toolz.curried.__dict__.items()
#                    if isinstance(val, toolz.curry)):
#      print '%s = cytoolz.curry(%s)' % (item, item)

accumulate = cytoolz.curry(accumulate)
assoc = cytoolz.curry(assoc)
cons = cytoolz.curry(cons)
countby = cytoolz.curry(countby)
dissoc = cytoolz.curry(dissoc)
do = cytoolz.curry(do)
drop = cytoolz.curry(drop)
filter = cytoolz.curry(filter)
get = cytoolz.curry(get)
get_in = cytoolz.curry(get_in)
groupby = cytoolz.curry(groupby)
interleave = cytoolz.curry(interleave)
interpose = cytoolz.curry(interpose)
itemfilter = cytoolz.curry(itemfilter)
itemmap = cytoolz.curry(itemmap)
iterate = cytoolz.curry(iterate)
Example #24
    from itertools import groupby, accumulate, count
    from functools import reduce
    import operator

    # curried versions
    from cytoolz.curried import filter as cfilter
    from cytoolz.curried import map as cmap
    from cytoolz.curried import sorted as csorted
    from cytoolz.curried import groupby as cgroupby
    from cytoolz.curried import accumulate as caccumulate
    from cytoolz.curried import count as ccount
    from cytoolz.curried import reduce as creduce

    from cytoolz import curry

    cmax = curry(max)
    cmin = curry(min)

    czip = lambda xs: zip(*xs)


# def contains(val):
#    return lambda x: val in x


with catch_exc(print_error=False):
    import pyspark.sql.functions as F
    from pyspark.sql.types import *
    from pyspark.sql.window import Window
Example #25
def curry_namespace(ns):
    return dict(
        (name, cytoolz.curry(f) if should_curry(f) else f)
        for name, f in ns.items() if '__' not in name
    )
Example #26
# -*- coding: utf-8 -*-
__title__ = 'text2math'
__author__ = 'Steven Cutting'
__author_email__ = '*****@*****.**'
__created_on__ = '02/13/2016'
__copyright__ = "text2math Copyright (C) 2016  Steven Cutting"

from operator import eq

import pytest
import cytoolz as tlz
c_eq = tlz.curry(eq)

from text2math import raw2text

from utils import osx_xfail

# TODO (steven_c) Find out why km/h test fails only on OSX.


@pytest.mark.parametrize("string,expected",
                         [("<p>foo<\p><li>bar<\li>", "foobar"),
                          ])
def test__remove_html_bits(string, expected):
    assert(tlz.pipe(string,
                    raw2text.remove_html_bits,
                    c_eq(expected)))


def test__verify_unicode_fail():
    with pytest.raises(AssertionError):
Example #27
            return self[key + '.each_event']
        k, w = key[:i], key[i + 1:]
        if w == 'each_file':
            return (f[self.map(k)].value for f in self.files)
        elif w == 'each_event':
            return concat(f[self.map(k)] for f in self.files)
        else:
            raise ValueError("Key '{}' is invalid!".format(key))


try:
    from dbpy import (read_hightagnumber as __read_hightagnumber,
                      read_taglist_byrun as __read_taglist_byrun,
                      read_syncdatalist_float)
    from stpy import StorageReader, StorageBuffer
    read_hightagnumber = curry(memoize(__read_hightagnumber))
    read_taglist_byrun = curry(memoize(__read_taglist_byrun))

    class _ReadonlyBuffer:
        def __init__(self, buffer):
            self.__buffer = buffer

        @property
        def data(self):
            return self.__buffer.read_det_data(0)

        @property
        def info(self):
            return self.__buffer.read_det_info(0)

    class StorageWrapper:
Example #28
    to_bytes,
    to_canonical_address,
    to_checksum_address,
    to_dict,
    to_hex,
    to_int,
    to_list,
    to_normalized_address,
    to_ordered_dict,
    to_set,
    to_text,
    to_tuple,
    to_wei,
)

apply_formatter_at_index = curry(apply_formatter_at_index)
apply_formatter_if = curry(apply_formatter_if)
apply_formatter_to_array = curry(apply_formatter_to_array)
apply_formatters_to_dict = curry(apply_formatters_to_dict)
apply_key_map = curry(apply_key_map)
apply_one_of_formatters = curry(apply_one_of_formatters)
flatten_return = curry(flatten_return)
force_bytes = curry(force_bytes)
force_text = curry(force_text)
from_wei = curry(from_wei)
hexstr_if_str = curry(hexstr_if_str)
is_same_address = curry(is_same_address)
reversed_return = curry(reversed_return)
sort_return = curry(sort_return)
text_if_str = curry(text_if_str)
to_wei = curry(to_wei)
Example #29
from __future__ import absolute_import

import operator

from cytoolz import curry


# We use a blacklist instead of whitelist because:
#   1. We have more things to include than exclude.
#   2. This gives us access to things like matmul iff we are in Python >=3.5.
no_curry = frozenset((
    'abs',
    'index',
    'inv',
    'invert',
    'neg',
    'not_',
    'pos',
    'truth',
))

locals().update(
    dict((name, curry(f) if name not in no_curry else f)
         for name, f in vars(operator).items() if callable(f)),
)

# Clean up the namespace.
del curry
del no_curry
del operator
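
A brief usage sketch of the curried operator namespace built above, assuming it is importable as `cytoolz.curried.operator`: binary operators come back curried, while blacklisted unary names stay plain.

from cytoolz.curried import operator as cop   # assumed import path

add1 = cop.add(1)          # curried binary operator
assert add1(41) == 42
assert cop.abs(-5) == 5    # 'abs' is in the no_curry blacklist, so it stays a plain callable
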
Example #30
import cytoolz
from cytoolz import *
from cytoolz.curried_exceptions import *


# Here is the recipe used to create the list below
# (and "cytoolz/tests/test_curried_toolzlike.py" verifies the list is correct):
#
# import toolz
# import toolz.curried
#
# for item in sorted(key for key, val in toolz.curried.__dict__.items()
#                    if isinstance(val, toolz.curry)):
#      print '%s = cytoolz.curry(%s)' % (item, item)

accumulate = cytoolz.curry(accumulate)
assoc = cytoolz.curry(assoc)
cons = cytoolz.curry(cons)
countby = cytoolz.curry(countby)
dissoc = cytoolz.curry(dissoc)
do = cytoolz.curry(do)
drop = cytoolz.curry(drop)
filter = cytoolz.curry(filter)
get = cytoolz.curry(get)
get_in = cytoolz.curry(get_in)
groupby = cytoolz.curry(groupby)
interleave = cytoolz.curry(interleave)
interpose = cytoolz.curry(interpose)
itemfilter = cytoolz.curry(itemfilter)
itemmap = cytoolz.curry(itemmap)
iterate = cytoolz.curry(iterate)
Example #31
    interleave,
    isdistinct,
    isiterable,
    juxt,
    last,
    memoize,
    merge_sorted,
    peek,
    pipe,
    second,
    thread_first,
    thread_last,
)
from .exceptions import merge, merge_with

accumulate = cytoolz.curry(cytoolz.accumulate)
assoc = cytoolz.curry(cytoolz.assoc)
assoc_in = cytoolz.curry(cytoolz.assoc_in)
cons = cytoolz.curry(cytoolz.cons)
countby = cytoolz.curry(cytoolz.countby)
dissoc = cytoolz.curry(cytoolz.dissoc)
do = cytoolz.curry(cytoolz.do)
drop = cytoolz.curry(cytoolz.drop)
excepts = cytoolz.curry(cytoolz.excepts)
filter = cytoolz.curry(cytoolz.filter)
get = cytoolz.curry(cytoolz.get)
get_in = cytoolz.curry(cytoolz.get_in)
groupby = cytoolz.curry(cytoolz.groupby)
interpose = cytoolz.curry(cytoolz.interpose)
itemfilter = cytoolz.curry(cytoolz.itemfilter)
itemmap = cytoolz.curry(cytoolz.itemmap)
Example #32
def img(self, i):
    x, y = self.x_edges, self.y_edges
    spline = RectBivariateSpline(x, y, i)
    self.intensity = curry(spline, grid=False)
Example #33
def intensity(self):
    x, y = self.x_centers, self.y_centers
    dx, dy = self.x_diffs, self.y_diffs
    spline = RectBivariateSpline(x, y, self.hist / outer(dx, dy))
    return curry(spline, grid=False)
Example #34
def _curry_namespace(ns):
    return dict((name, cytoolz.curry(f) if f in _curry_set else f)
                for name, f in ns.items() if '__' not in name)
Example #35
def _curry_namespace(ns):
    return dict(
        (name, cytoolz.curry(f) if f in _curry_set else f)
        for name, f in ns.items() if '__' not in name
    )
Example #36
def curry_namespace(ns):
    return {
        name: cytoolz.curry(f) if should_curry(f) else f
        for name, f in ns.items() if "__" not in name
    }
Example #37
from __future__ import absolute_import

import operator

from cytoolz import curry

# We use a blacklist instead of whitelist because:
#   1. We have more things to include than exclude.
#   2. This gives us access to things like matmul iff we are in Python >=3.5.
no_curry = frozenset((
    'abs',
    'index',
    'inv',
    'invert',
    'neg',
    'not_',
    'pos',
    'truth',
))

locals().update(
    dict((name, curry(f) if name not in no_curry else f)
         for name, f in vars(operator).items() if callable(f)), )

# Clean up the namespace.
del curry
del no_curry
del operator