def build_vocabulary(this_wordcount, extra_words=EXTRA_WORDS,
                     is_reset=True, truncate_to_most_frequent=0):
    """
    Builds vocabulary from wordcount.
    It also adds extra words to the vocabulary.

    In:
        this_wordcount - dictionary of wordcounts, e.g. {'cpu': 3}
        extra_words - additional words to build the vocabulary;
            dictionary of {word: id}; by default EXTRA_WORDS
        is_reset - if True we restart the vocabulary counting;
            by default True
        truncate_to_most_frequent - if positive then the vocabulary is
            truncated to 'truncate_to_most_frequent' words; by default 0

    Out:
        word2index - mapping from words to indices
        index2word - mapping from indices to words
    """
    if is_reset:
        # _myinc numbers words via a function attribute; restart numbering
        # just past the ids reserved for EXTRA_WORDS.
        _myinc.counter = len(EXTRA_WORDS)
    if truncate_to_most_frequent > 0:
        # Keep only the most frequent words.
        this_wordcount = dict(sorted(
            this_wordcount.items(),
            key=lambda item: item[1],
            reverse=True)[:truncate_to_most_frequent])
    word2index = itemmap(_myinc, this_wordcount)
    if extra_words:
        # Reserved extra-word ids must not collide with generated indices.
        assert all(el not in word2index.values()
                   for el in extra_words.values())
        word2index.update(extra_words)
    index2word = itemmap(reversed, word2index)
    return word2index, index2word
def fmap(keys: Collection, funcs: Collection[Callable], data_dict: Dict,
         val_as_args: bool = False) -> Dict:
    """
    Apply per-key functions to the values of a data dictionary.

    :param keys: a collection, should support __contains__ and __getitem__
    :param funcs: a iterable of callables
    :param data_dict: a data dictionary
    :param val_as_args: bool; if True, each value is unpacked as positional
        arguments to its function
    :return: a data dictionary
    """
    # Pair each key with its function; unmatched positions fall back to an
    # identity function, mirroring zip_longest's fillvalue semantics.
    identity = lambda x: x
    key_to_func = dict(zip_longest(keys, funcs, fillvalue=identity))

    transformed = {}
    for key, value in data_dict.items():
        if key in key_to_func:
            func = key_to_func[key]
            transformed[key] = func(*value) if val_as_args else func(value)
        else:
            # Keys without a registered function pass through untouched.
            transformed[key] = value
    return transformed
def get_required_fields(link: ParseResult) -> Dict[str, str]:
    # The netloc segment packs the required fields as colon-separated values.
    fields = link.netloc.split(':')
    # REQUIRED_FIELDS_POSITION presumably maps field name -> position within
    # `fields`; decode_field is expected to return a (key, value) pair for
    # itemmap, collected into an OrderedDict — TODO confirm against
    # decode_field's definition.
    return itemmap(
        lambda i: decode_field(i[0], fields[i[1]]),
        REQUIRED_FIELDS_POSITION,
        OrderedDict
    )
def invert(d):
    """Inverts a dictionary from key->value mapping to a value->key mapping.

    The values being switched to keys must be hashable.

    >>> invert({'ashley': 6, 'timothy': 15})
    {6: 'ashley', 15: 'timothy'}
    """
    # Dict comprehension performs the swap directly; equivalent to
    # dict(toolz.itemmap(reversed, d)) without the toolz dependency.
    return {value: key for key, value in d.items()}
def get_table_lines(dicts, max_width: int or dict = 50, keys=None):
    """Return list of lines formatted as table."""
    # Restrict each row to the requested keys (or all of its own keys).
    rows = [{key: record.get(key) for key in (keys or record)}
            for record in dicts]
    # Truncate cell values so columns stay within max_width.
    truncate = partial(cut_values, max_width=max_width)
    rows = [toolz.itemmap(truncate, row) for row in rows]
    rendered = tabulate.tabulate(rows, headers="keys", tablefmt="github")
    return get_unique_lines(rendered)
def invert_with(f, d):
    """Inverts a dictionary from key->value mapping to a value->key mapping
    with some transform on the old values.

    The new keys must be hashable, per dictionary requirements.
    The transforming function needs to accept only the value as an argument.

    >>> invert_with(sum, {'ashley': [1, 2, 3], 'timothy': [4, 5, 6]})
    {6: 'ashley', 15: 'timothy'}
    """
    # {f(value): key, ...} — if f maps two values to the same result, the
    # later key wins, matching standard dict-construction semantics.
    return {f(value): key for key, value in d.items()}
def build_vocabulary(this_wordcount, extra_words=EXTRA_WORDS,
                     is_reset=True, truncate_to_most_frequent=0):
    """
    Builds vocabulary from wordcount.
    It also adds extra words to the vocabulary.

    In:
        this_wordcount - dictionary of wordcounts, e.g. {'cpu': 3}
        extra_words - additional words to build the vocabulary;
            dictionary of {word: id}; by default EXTRA_WORDS
        is_reset - if True we restart the vocabulary counting;
            by default True
        truncate_to_most_frequent - if positive then the vocabulary is
            truncated to 'truncate_to_most_frequent' words; by default 0

    Out:
        word2index - mapping from words to indices
        index2word - mapping from indices to words
    """
    if is_reset:
        # _myinc numbers words via a function attribute; restart numbering
        # just past the ids reserved for EXTRA_WORDS.
        _myinc.counter = len(EXTRA_WORDS)
    if truncate_to_most_frequent > 0:
        # Keep only the most frequent words.
        this_wordcount = dict(sorted(
            this_wordcount.items(),
            key=lambda item: item[1],
            reverse=True)[:truncate_to_most_frequent])
    word2index = itemmap(_myinc, this_wordcount)
    if extra_words:
        # Reserved extra-word ids must not collide with generated indices.
        assert all(el not in word2index.values()
                   for el in extra_words.values())
        word2index.update(extra_words)
    index2word = itemmap(reversed, word2index)
    return word2index, index2word
def build_graph(rules: List[str]) -> Tuple[MultiDiGraph, dict]:
    """
    Rather than building the graph so that every individual bag is
    represented by a node, and colored, and doing the calculations based on
    colors in the graph, I did it so that every type of bag was one node,
    and the connections between the types of bag were represented by
    multiple edges. This is probably not the best way to do it, but it does
    work for these use cases.
    """
    # Node index -> bag color, taken from the head of each parsed rule.
    nodemap = dict(enumerate(parse(rule)[0] for rule in rules))
    # Inverse lookup: bag color -> node index.
    color_to_node = {color: index for index, color in nodemap.items()}
    graph = add_nodes(MultiDiGraph(), nodemap)
    graph = reduce(partial(parse_edges, color_to_node), rules, graph)
    return (graph, color_to_node)
def frename(keys: Collection, data_dict: Dict) -> Dict:
    """
    Rename keys according to a mapping

    :param keys: mapping of old key -> new key; must support
        __contains__ and __getitem__
    :param data_dict: dictionary whose keys may be renamed
    :return: a new dictionary with keys renamed where a mapping exists
    """
    renamed = {}
    for old_key, value in data_dict.items():
        # Keys absent from the mapping are carried over unchanged.
        new_key = keys[old_key] if old_key in keys else old_key
        renamed[new_key] = value
    return renamed
# dtype-name -> ClickHouse column type (presumably pandas/numpy dtype
# names — verify against callers).
MAPPING = {'object': 'String', 'uint64': 'UInt64', 'uint32': 'UInt32',
           'uint16': 'UInt16', 'uint8': 'UInt8', 'float64': 'Float64',
           'float32': 'Float32', 'int64': 'Int64', 'int32': 'Int32',
           'int16': 'Int16', 'int8': 'Int8', 'datetime64[D]': 'Date',
           'datetime64[ns]': 'DateTime'}
# numpy dtype object -> ClickHouse type.
PD2CH = keymap(np.dtype, MAPPING)
# ClickHouse type -> dtype name (inverse of MAPPING), plus special cases.
CH2PD = itemmap(reversed, MAPPING)
CH2PD['Null'] = 'object'
CH2PD['Nothing'] = 'object'
# ClickHouse types that may appear wrapped as Nullable(...).
NULLABLE_COLS = ['UInt64', 'UInt32', 'UInt16', 'UInt8', 'Float64',
                 'Float32', 'Int64', 'Int32', 'Int16', 'Int8', 'String',
                 'DateTime']
for col in NULLABLE_COLS:
    # Nullable columns decode to the same dtype as their base type.
    CH2PD['Nullable({})'.format(col)] = CH2PD[col]
PY3 = sys.version_info[0] == 3


def normalize(df, index=True):
    # NOTE(review): body appears truncated in this view — the function
    # continues beyond what is shown here.
    if index:
        df = df.reset_index()
__all__ = [ 'build_vocabulary', 'index_sequence', 'encode_questions_index', 'encode_questions_one_hot', 'encode_answers_one_hot' ] ### ### # Constants ### PADDING = '<pad>' UNKNOWN = '<unk>' EOA = '<eoa>' # end of answer EOQ = '<eoq>' # end of question EXTRA_WORDS_NAMES = [PADDING, UNKNOWN, EOA, EOQ] EXTRA_WORDS = {PADDING: 0, UNKNOWN: 1, EOA: 2, EOQ: 3} EXTRA_WORDS_ID = itemmap(reversed, EXTRA_WORDS) ### # Functions ### def static_vars(**kwargs): def decorate(func): for k in kwargs: setattr(func, k, kwargs[k]) return func return decorate @static_vars(counter=len(EXTRA_WORDS))
__all__ = ['build_vocabulary', 'index_sequence', 'encode_questions_index','encode_questions_one_hot', 'encode_answers_one_hot'] ### ### # Constants ### PADDING = '<pad>' UNKNOWN = '<unk>' EOA = '<eoa>' # end of answer EOQ = '<eoq>' # end of question EXTRA_WORDS_NAMES = [PADDING, UNKNOWN, EOA, EOQ] EXTRA_WORDS = {PADDING:0, UNKNOWN:1, EOA:2, EOQ:3} EXTRA_WORDS_ID = itemmap(reversed, EXTRA_WORDS) ### # Functions ### def static_vars(**kwargs): def decorate(func): for k in kwargs: setattr(func, k, kwargs[k]) return func return decorate @static_vars(counter=len(EXTRA_WORDS)) def _myinc(d): """
def get_extra_params(link: ParseResult) -> Dict[str, str]:
    # parse_qs yields {param: [values]}; `first` keeps only the first value
    # per parameter. decode_field(key, value) is expected to return a
    # (key, value) pair for itemmap — TODO confirm against its definition.
    return itemmap(
        lambda i: decode_field(*i),
        valmap(first, parse_qs(link.query))
    )
def json2list(filter_fn, map_fn, dictionary):
    """Filter and map dictionary in succession to obtain list."""
    # `star` presumably adapts a multi-argument callable to accept the
    # (key, value) item tuple — confirm against its definition.
    filtered = toolz.itemfilter(star(filter_fn), dictionary)
    # factory=list requests a list result instead of rebuilding a dict.
    # NOTE(review): toolz's itemmap builds via factory().update(...);
    # confirm a plain list works here or that a list-like with update()
    # is intended.
    return toolz.itemmap(star(map_fn), filtered, factory=list)