Example #1
def build(obj, *applicators):
    """
    Run the provided object through the series of applicator functions.

    If ``obj`` is an instance of :class:`~eth.chains.base.BaseChain`, the
    applicators will be run on a copy of the chain and thus will not mutate the
    provided chain instance.
    """
    if isinstance(obj, BaseChain):
        return pipe(obj, copy(), *applicators)
    else:
        return pipe(obj, *applicators)
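A minimal, self-contained sketch of the pattern above, with toy applicators in place of the chain-building helpers (double and increment below are illustrative only): pipe threads the object through each applicator from left to right.

from toolz import pipe

def double(x):
    return x * 2

def increment(x):
    return x + 1

# pipe(obj, f, g) is g(f(obj)): each applicator receives the previous result
assert pipe(3, double, increment) == 7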
Example #2
def test_chain_builder_initialize_chain_default(chain_class):
    chain = pipe(
        chain_class,
        genesis(),
    )

    header = chain.get_canonical_head()
    assert header == chain.get_canonical_block_by_number(0).header

    assert header.parent_hash == constants.GENESIS_PARENT_HASH
    assert header.uncles_hash == constants.EMPTY_UNCLE_HASH
    assert header.coinbase == constants.GENESIS_COINBASE
    assert header.state_root == constants.BLANK_ROOT_HASH
    assert header.transaction_root == constants.BLANK_ROOT_HASH
    assert header.receipt_root == constants.BLANK_ROOT_HASH
    assert header.bloom == 0
    assert header.difficulty == 1
    assert header.block_number == constants.GENESIS_BLOCK_NUMBER
    assert header.gas_limit == constants.GENESIS_GAS_LIMIT
    assert header.gas_used == 0
    # account for runtime: the test should finish within a couple of seconds,
    # so the timestamp should be effectively "now"
    assert abs(header.timestamp - time.time()) < 2
    assert header.extra_data == constants.GENESIS_EXTRA_DATA
    assert header.mix_hash == constants.GENESIS_MIX_HASH
    assert header.nonce == constants.GENESIS_NONCE
Example #3
def guess_language(doc, output="best"):
    """Guess the language of a document.

    This function applies a statistical method to determine the language of a
    document. Depending on the ``output`` argument, it may either return a
    single language code, or a ranking of languages that a document may be
    written in, sorted by probability.

    Uses the langid library.

    Parameters
    ----------
    doc : document

    output : string
        Either "best" to get a pair (code, prob) giving the two-letter code
        of the most probable language and its probability, or "rank" for a
        list of such pairs for all languages in the model.
    """
    from langid import classify, rank

    try:
        func = {"best": classify, "rank": rank}[output]
    except KeyError:
        raise ValueError("invalid parameter value output=%r" % output)

    return pipe(doc, fetch, func)
Example #4
def tokenize(doc):
    """Tokenize text.

    Uses the NLTK function word_tokenize.
    """
    nltk_download('punkt')
    return pipe(doc, fetch, nltk.word_tokenize)
Example #5
def movie_review_emotions(doc, **kwargs):
    """Emotion (fine-grained sentiment) tagger for movie reviews.

    The training data for this function is that of Buitinck et al., with the
    training and test data concatenated. The algorithm is SVMs in a binary
    relevance (one-vs-rest) combination. You may use the training data
    (and this function) for academic/research purposes only. Add a parameter
    for_academic_research=True if you accept the license.

    Returns
    -------
    tagged : list of (string, list of string)
        A list of (sentence, labels) pairs. Each sentence may have zero or
        more labels.

    References
    ----------
    L. Buitinck, J. van Amerongen, E. Tan and M. de Rijke (2015).
    Multi-emotion detection in user-generated reviews. Proc. ECIR.
    https://staff.fnwi.uva.nl/m.derijke/wp-content/papercite-data/pdf/buitinck-multi-emotion-2015.pdf
    """
    if not (kwargs.get('for_academic_research', False)
            or kwargs.get('unittest', False)):
        raise RuntimeError(
            "This functionality is only available for"
            " academic research. Please use movie_review_emotions(doc,"
            " for_academic_research=True) to use this function for"
            " that purpose.")

    from ._emotion import classify
    nltk_download('punkt')
    sentences = pipe(doc, fetch, nltk.sent_tokenize)
    return list(zip(sentences, classify(sentences)))
Example #6
def alpino(doc, output="raw"):
    """Wrapper around the Alpino (dependency) parser for Dutch.

    Expects an environment variable ALPINO_HOME to point at
    the Alpino installation dir.

    The script uses the 'dependencies' end_hook to generate lemmata and
    the dependency structure.

    Parameters
    ----------
    output : string
        If 'raw', returns the raw output from Alpino itself.
        If 'saf', returns a SAF dictionary.

    References
    ----------
    `Alpino homepage <http://www.let.rug.nl/vannoord/alp/Alpino/>`_.
    """
    from ._alpino import tokenize, parse_raw, interpret_parse

    try:
        transf = {"raw": identity, "saf": interpret_parse}[output]
    except KeyError:
        raise ValueError("Unknown output format %r" % output)

    return pipe(doc, fetch, tokenize, parse_raw, transf)
Example #7
def test_chain_builder_initialize_chain_with_params(chain_class):
    chain = pipe(chain_class, genesis(params={'difficulty': 12345}, ))

    header = chain.get_canonical_head()
    assert header == chain.get_canonical_block_by_number(0).header

    assert header.difficulty == 12345
Example #8
    def x_hashing_pre(self, line):

        #Remove links, hashtags, at-mentions, mark-up, and "RT"
        line = re.sub(r"http\S+", "", line)
        line = re.sub(r"@\S+", "", line)
        line = re.sub(r"#\S+", "", line)
        line = re.sub("<[^>]*>", "", line)
        line = line.replace(" RT", "").replace("RT ", "")

        #Remove emojis
        line = re.sub(self.myre, "", line)

        #Remove punctuation and extra spaces
        line = ct.pipe(line, preprocessing.strip_tags,
                       preprocessing.strip_punctuation,
                       preprocessing.strip_numeric,
                       preprocessing.strip_non_alphanum,
                       preprocessing.strip_multiple_whitespaces)

        #Strip and reduce to max training length
        line = line.lower().strip().lstrip()

        #Truncate samples for LID
        if self.type == "LID":
            line = line[0:self.sample_size]

        return line
Example #9
def find_matching_fn_abi(abi, fn_identifier=None, args=None, kwargs=None):
    filters = []

    if fn_identifier:
        if fn_identifier is FallbackFn:
            return get_fallback_func_abi(abi)
        elif is_text(fn_identifier):
            filters.append(functools.partial(filter_by_name, fn_identifier))
        else:
            raise TypeError("Unsupported function identifier")

    if args is not None or kwargs is not None:
        if args is None:
            args = tuple()
        if kwargs is None:
            kwargs = {}

        num_arguments = len(args) + len(kwargs)
        filters.extend([
            functools.partial(filter_by_argument_count, num_arguments),
            functools.partial(filter_by_encodability, args, kwargs),
        ])

    function_candidates = pipe(abi, *filters)

    if len(function_candidates) == 1:
        return function_candidates[0]
    if not function_candidates:
        raise ValueError("No matching functions found")
    else:
        raise ValueError("Multiple functions found")
Example #10
def find_matching_fn_abi(abi, fn_name=None, args=None, kwargs=None):
    filters = []

    if fn_name:
        filters.append(functools.partial(filter_by_name, fn_name))

    if args is not None or kwargs is not None:
        if args is None:
            args = tuple()
        if kwargs is None:
            kwargs = {}

        num_arguments = len(args) + len(kwargs)
        filters.extend([
            functools.partial(filter_by_argument_count, num_arguments),
            functools.partial(filter_by_encodability, args, kwargs),
        ])

    function_candidates = filter_by_type('function', abi)

    function_candidates = pipe(function_candidates, *filters)

    if len(function_candidates) == 1:
        return function_candidates[0]
    if not function_candidates:
        raise ValueError("No matching functions found")
    else:
        raise ValueError("Multiple functions found")
Example #12
    def fetch_candidate_head(self):
        # Try to return a log that has the score that we are checking for,
        # checking in order of oldest to most recent.
        unchecked_logs = pipe(
            self.unchecked_logs,
            enumerate,
            tuple,
            reversed,
            tuple,
        )
        current_score = self.current_score

        for idx, log_entry in unchecked_logs:
            if log_entry['score'] == current_score:
                return self.unchecked_logs.pop(idx)
        # If no further recorded but unchecked logs exist, go to the next
        # is_new_head = true log
        while True:
            try:
                log_entry = self.get_next_log()
            # TODO: currently we just raise when there are no more logs
            except NextLogUnavailable:
                # TODO: should this return the genesis collation instead, or just leave it?
                raise NoCandidateHead("No candidate head available")
            if log_entry['is_new_head']:
                break
            self.unchecked_logs.append(log_entry)
        self.current_score = log_entry['score']
        return log_entry
Example #13
def map_abi_data(normalizers, types, data):
    '''
    This function will apply normalizers to your data, in the
    context of the relevant types. Each normalizer is in the format:

    def normalizer(datatype, data):
        # Conditionally modify data
        return (datatype, data)

    Where datatype is a valid ABI type string, like "uint".

    In case of an array, like "bool[2]", normalizer will receive `data`
    as an iterable of typed data, like `[("bool", True), ("bool", False)]`.

    Internals
    ---

    This is accomplished by:

    1. Decorating the data tree with types
    2. Recursively mapping each of the normalizers to the data
    3. Stripping the types back out of the tree
    '''
    pipeline = itertools.chain(
        [abi_data_tree(types)],
        map(data_tree_map, normalizers),
        [partial(recursive_map, strip_abi_type)],
    )

    return pipe(data, *pipeline)
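A rough, self-contained sketch of the three steps the docstring describes (decorate, map, strip), using flat lists instead of the real tree helpers; abi_data_tree, data_tree_map and strip_abi_type are only approximated here.

from functools import partial
from toolz import pipe

def decorate(types, values):
    # step 1: pair every value with its declared type
    return list(zip(types, values))

def tree_map(normalizer, tree):
    # step 2: apply a normalizer to every (type, value) pair
    return [normalizer(t, v) for t, v in tree]

def strip_types(tree):
    # step 3: drop the type annotations again
    return [v for _, v in tree]

def hexstr_to_int(abi_type, value):
    # example normalizer: decode hex strings declared as "uint"
    if abi_type == "uint" and isinstance(value, str):
        return abi_type, int(value, 16)
    return abi_type, value

pipeline = [partial(decorate, ["uint", "bool"]),
            partial(tree_map, hexstr_to_int),
            strip_types]
assert pipe(["0x2a", True], *pipeline) == [42, True]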
Example #14
def montage_stream(ims, montage_order=None, channel_order=[0, 1, 2],
                   clear_none=True):
    """From a sequence of single-channel field images, montage multichannels.

    Suppose the input is a list:

    ```
    ims = [green1a, blue1a, red1a, green1b, blue1b, red1b,
           green2a, blue2a, red2a, green2b, blue2b, red2b]
    ```

    with channel order ``[2, 0, 1]`` and montage order ``[1, 0]``, then
    the output will be:

    ```
    [rgb1_ba, rgb2_ba]
    ```

    Parameters
    ----------
    ims : iterator of array, shape (M, N)
        A list of images in which consecutive images represent single
        channels of the same image. (See example.)
    montage_order : array-like of int, optional
        The order of the montage images (in 1D or 2D).
    channel_order : list of int, optional
        The order in which the channels appear.

    Returns
    -------
    montaged_stream : iterator of arrays
        An iterator of the images composed into multi-channel montages.

    Examples
    --------
    >>> images = (i * np.ones((4, 5), dtype=np.uint8) for i in range(24))
    >>> montaged = list(montage_stream(images, [[0, 1], [2, 3]], [2, 0, 1]))
    >>> len(montaged)
    2
    >>> montaged[0].shape
    (8, 10, 3)
    >>> montaged[0][0, 0, :]
    array([2, 0, 1], dtype=uint8)
    >>> montaged[0][4, 5, :]
    array([11,  9, 10], dtype=uint8)
    >>> montaged[1][4, 5, :]
    array([23, 21, 22], dtype=uint8)
    """
    if montage_order is None:
        montage_order = cellomics.SPIRAL_CLOCKWISE_RIGHT_25
    montage_order = np.array(montage_order)
    ntiles = montage_order.size
    if clear_none:
        nchannels = len([i for i in channel_order if i is not None])
    else:
        nchannels = len(channel_order)
    return tz.pipe(ims, c.partition(nchannels),
                        c.map(stack_channels(order=channel_order)),
                        c.partition(ntiles),
                        c.map(montage(order=montage_order)))
Example #15
File: single.py Project: NLeSC/xtas
def movie_review_emotions(doc, **kwargs):
    """Emotion (fine-grained sentiment) tagger for movie reviews.

    The training data for this function is that of Buitinck et al., with the
    training and test data concatenated. The algorithm is SVMs in a binary
    relevance (one-vs-rest) combination. You may use the training data
    (and this function) for academic/research purposes only. Add a parameter
    for_academic_research=True if you accept the license.

    Returns
    -------
    tagged : list of (string, list of string)
        A list of (sentence, labels) pairs. Each sentence may have zero or
        more labels.

    References
    ----------
    L. Buitinck, J. van Amerongen, E. Tan and M. de Rijke (2015).
    Multi-emotion detection in user-generated reviews. Proc. ECIR.
    https://staff.fnwi.uva.nl/m.derijke/wp-content/papercite-data/pdf/buitinck-multi-emotion-2015.pdf
    """
    if not (kwargs.get("for_academic_research", False) or kwargs.get("unittest", False)):
        raise RuntimeError(
            "This functionality is only available for"
            " academic research. Please use movie_review_emotions(doc,"
            " for_academic_research=True) to use this function for"
            " that purpose."
        )

    from ._emotion import classify

    nltk_download("punkt")
    sentences = pipe(doc, fetch, nltk.sent_tokenize)
    return list(zip(sentences, classify(sentences)))
Example #18
def get_text_from_xml_file(filename):
    """
    This is set up for extracting text from the Stack Overflow posts data dump,
    which is stored in an XML file.

    Returns a stream of Post bodies (just the text).
    """

    @tlz.curry
    def _get_xml_attr(key, xml_element):
        return xml_element.attributes[key].value

    @tlz.curry
    def _try_to_get_xml_attr(key, xml_element, default=''):
        try:
            return _get_xml_attr(key, xml_element)
        except KeyError:
            return default

    return tlz.pipe(filename,
                    minidom.parse,  # Not pure
                    lambda layer0: layer0.getElementsByTagName("posts")[0],
                    lambda layer1: layer1.getElementsByTagName("row"),
                    c_map(tlz.juxt(_try_to_get_xml_attr("Title"),
                                   _get_xml_attr("Body"))),
                    c_map(lambda titleAndBody: '\n\n\n'.join(titleAndBody)))
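A quick end-to-end run of the same pipeline shape on an inline snippet (minidom.parseString stands in for minidom.parse on a file, c_map is assumed to be toolz.curried.map, and the data is made up):

from xml.dom import minidom
import toolz as tlz
from toolz.curried import map as c_map

xml = '<posts><row Title="A title" Body="A body"/><row Body="No title"/></posts>'

def get_attr(key, el, default=''):
    # like the example's helpers: read an attribute, fall back to a default
    return el.attributes[key].value if el.hasAttribute(key) else default

texts = tlz.pipe(xml,
                 minidom.parseString,
                 lambda doc: doc.getElementsByTagName("row"),
                 c_map(tlz.juxt(lambda el: get_attr("Title", el),
                                lambda el: get_attr("Body", el))),
                 c_map('\n\n\n'.join),
                 list)
print(texts)  # ['A title\n\n\nA body', '\n\n\nNo title']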
Example #19
File: single.py Project: NLeSC/xtas
def tokenize(doc):
    """Tokenize text.

    Uses the NLTK function word_tokenize.
    """
    nltk_download("punkt")
    return pipe(doc, fetch, nltk.word_tokenize)
Example #20
def token_vectors_pipeline(input_col: str,
                           output_col: str,
                           df: DataFrame,
                           stemmer_func=None):
    """Convert a string into an array of integer token ids"""
    filled_col = input_col + "_filled"
    tokenised_col = input_col + "_tokenised"
    tf_vectors = input_col + "_tf_vectors"

    transforms = [
        # note that the tokenizer completely breaks given null input values
        partial(fill_nulls_with_empty_string, input_col, filled_col),
        partial(tokenize_words, filled_col, tokenised_col),
    ]

    # optionally stem the tokens
    if stemmer_func:
        transforms += [partial(stemmer_func, tokenised_col, tokenised_col)]

    transforms += [
        partial(rm_empty_strings_from_tokens, tokenised_col, tokenised_col),
        partial(term_frequency_vectors, tokenised_col, tf_vectors),
        partial(sparse_vector_indices, tf_vectors, output_col),
        partial(drop_cols, [filled_col, tokenised_col, tf_vectors]),
    ]
    return pipe(df, *transforms)
Example #21
def get_aggregation_bitfield(attestation_participants, target_committee_size):
    bitfield = get_empty_bitfield(target_committee_size)
    bitfield = pipe(
        bitfield,
        *(set_voted(index=committee_index)
          for committee_index in attestation_participants))
    return bitfield
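The same unpack-a-generator-of-setters trick, with toy stand-ins for get_empty_bitfield and set_voted so it runs on its own:

from functools import partial
from toolz import pipe

def get_empty_bitfield(size):
    return (False,) * size

def set_voted(index, bitfield):
    # return a new tuple with one position flipped to True
    return bitfield[:index] + (True,) + bitfield[index + 1:]

participants = (1, 5)
bitfield = pipe(get_empty_bitfield(8),
                *(partial(set_voted, index) for index in participants))
print(bitfield)  # (False, True, False, False, False, True, False, False)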
Example #23
    def fetch_candidate_head(self):
        # Try to return a log that has the score that we are checking for,
        # checking in order of oldest to most recent.
        unchecked_logs = pipe(
            self.unchecked_logs,
            enumerate,
            tuple,
            reversed,
            tuple,
        )
        current_score = self.current_score

        for idx, logs_entry in unchecked_logs:
            if logs_entry['score'] == current_score:
                return self.unchecked_logs.pop(idx)
        # If no further recorded but unchecked logs exist, go to the next
        # is_new_head = true log
        while True:
            # TODO: currently we just raise when there are no more logs
            log_entry = self.get_next_log()
            if log_entry['is_new_head']:
                break
            self.unchecked_logs.append(log_entry)
        self.current_score = log_entry['score']
        return log_entry
Example #24
def int_to_bytes32(value):
    if not isinstance(value, int) or isinstance(value, bool):
        raise ValueError(
            "Value must be an integer: Got: {0}".format(
                type(value),
            )
        )
    if value < 0:
        raise ValueError(
            "Value cannot be negative: Got: {0}".format(
                value,
            )
        )
    if value > UINT_256_MAX:
        raise ValueError(
            "Value exeeds maximum UINT256 size.  Got: {0}".format(
                value,
            )
        )
    value_bytes = pipe(
        value,
        int_to_big_endian,
        pad32,
    )
    return value_bytes
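For reference, a standard-library sanity check of the same conversion, assuming int_to_big_endian followed by pad32 amounts to a left-padded 32-byte big-endian encoding:

def int_to_bytes32_stdlib(value):
    # same result via int.to_bytes; raises OverflowError rather than the
    # ValueError messages used above
    return value.to_bytes(32, "big")

assert int_to_bytes32_stdlib(1) == b"\x00" * 31 + b"\x01"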
Example #25
File: single.py Project: NLeSC/xtas
def guess_language(doc, output="best"):
    """Guess the language of a document.

    This function applies a statistical method to determine the language of a
    document. Depending on the ``output`` argument, it may either return a
    single language code, or a ranking of languages that a document may be
    written in, sorted by probability.

    Uses the langid library.

    Parameters
    ----------
    doc : document

    output : string
        Either "best" to get a pair (code, prob) giving the two-letter code
        of the most probable language and its probability, or "rank" for a
        list of such pairs for all languages in the model.
    """
    from langid import classify, rank

    try:
        func = {"best": classify, "rank": rank}[output]
    except KeyError:
        raise ValueError("invalid parameter value output=%r" % output)

    return pipe(doc, fetch, func)
Example #26
File: mnist.py Project: woohp/mnist
def main():
    mnist = fetch_mldata('MNIST original')
    X = mnist.data.astype(np.float32).reshape(
        (len(mnist.data), 28, 28, 1)) / 255.
    label_binarizer = LabelBinarizer()
    Y = label_binarizer.fit_transform(mnist.target)
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, train_size=60000)

    layers = [
        Conv2D(20, (3, 3), padding='same', activation='relu'),
        MaxPooling2D(),
        Conv2D(50, (3, 3), padding='same', activation='relu'),
        MaxPooling2D(),
        Flatten(),
        Dense(500, activation='relu'),
        Dense(10, activation='softmax')
    ]

    input = Input((28, 28, 1))
    output = toolz.pipe(input, *layers)
    model = Model(input, output)
    model.compile(optimizer='rmsprop',
                  loss='categorical_crossentropy',
                  metrics=['accuracy'])
    model.fit(X_train,
              Y_train,
              batch_size=128,
              epochs=10,
              validation_data=[X_test, Y_test])
Example #27
File: single.py Project: NLeSC/xtas
def nlner_conll(doc, **kwargs):
    """Baseline NER tagger for Dutch, based on the CoNLL'02 dataset.

    See http://www.clips.uantwerpen.be/conll2002/ner/ for the dataset and
    its license. Add a parameter conll2002_project=True if you accept the
    license.

    See also
    --------
    frog: NER tagger and dependency parser for Dutch.

    stanford_ner_tag: NER tagger for English.
    """

    if not (kwargs.get("conll2002_project", False) or kwargs.get("unittest", False)):
        raise RuntimeError(
            "This functionality is only available to the"
            " CoNLL'02 project. Please use nlner_conll(doc,"
            " conll2002_project=True) if you are doing research"
            " in the context of the shared CoNLL-2002 shared task."
        )

    from ._nl_conll_ner import ner

    return pipe(doc, fetch, _tokenize_if_needed, ner)
Example #28
def load_seg_models(models, loaderdict=_LOADERDICT):
    """
    models - [(kind1, filename1), (kind2, filename2), ...]
    """
    return tlz.pipe(
        models,
        tlzc.map(lambda model: load_seg_model(
            model[0], model[1], loaderdict=loaderdict)), tuple)
Example #29
def decode_and_fix(text, encoding='utf-8'):
    """
    First applies a liberal decode method to the text, then cleans and
    normalizes the resulting Unicode.
    """
    return tlz.pipe(text,
                    adv_decode(encoding=encoding),
                    clean_unicode,
                    normize_text)
Example #30
def _ecpairing(data):
    exponent = bn128.FQ12.one()

    processing_pipeline = (_process_point(data[start_idx:start_idx + 192])
                           for start_idx in range(0, len(data), 192))
    exponent = pipe(bn128.FQ12.one(), *processing_pipeline)

    result = bn128.final_exponentiate(exponent) == bn128.FQ12.one()
    return result
Example #31
def std_decode(text, encoding='utf-8', errors='strict'):
    """
    Standardized interface to standard python string decode method.

    Only accepts byte strings.
    """
    return tlz.pipe(text,
                    verify_bytestring,
                    lambda txt: txt.decode(encoding=encoding, errors=errors))
Example #32
def tags_at(run: int,
            *other_runs: int,
            beamline: int = None) -> Tuple[int, Sequence[int]]:
    """
    Example:
        hightag, tags = tags_at(509700, beamline=3)  # from single run
        hightag, tags = tags_at(509700, 509701, 509702, beamline=3)  # from multiple runs
    """
    if beamline is None:
        raise ValueError("Keyword argument 'beamline' must be given!")
    runs = run, *other_runs
    hightag_at_the_beamline = partial(hightag, beamline)
    taglist_at_the_beamline = partial(taglist, beamline)
    hightags: ndarray = pipe(runs, partial(map, hightag_at_the_beamline),
                             partial(fromiter, dtype='int'))
    if not (hightags == hightags[0]).all():
        raise ValueError('Not all the runs have a single hightag!')
    tags = pipe(runs, partial(map, taglist_at_the_beamline), concat, tuple)
    return hightags[0], tags
Example #33
def segment_text(segmentfunc, txt, flatten=False):
    """
    Splits the text into tokens and then segments the tokens
    using ``segmentfunc``.

    Curried.
    """
    return tlz.pipe(txt, tpu.split_and_clean, tlzc.map(segmentfunc),
                    mseg.should_flatten(flatten), list)
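The overall shape of this pipe, with stand-ins for tpu.split_and_clean and mseg.should_flatten (both assumed, since they are not shown in this example):

import toolz as tlz
import toolz.curried as tlzc

split_and_clean = str.split                                      # stand-in tokenizer
segmentfunc = lambda tok: [tok[:2], tok[2:]] if len(tok) > 2 else [tok]
should_flatten = lambda flag: tlz.concat if flag else (lambda x: x)

print(tlz.pipe("rainbows are real",
               split_and_clean,
               tlzc.map(segmentfunc),
               should_flatten(True),
               list))
# ['ra', 'inbows', 'ar', 'e', 're', 'al']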
Example #34
def segment_text(model, txt, flatten=True):
    """
    Splits the text into tokens and then segments the tokens.

    Uses a Flatcat model.

    Curried.
    """
    return tlz.pipe(txt, tpu.split_and_clean, tlzc.map(mk_segmenter(model)),
                    mseg.should_flatten(flatten), list)
Example #35
def test_chain_builder_initialize_chain_with_state_simple(chain_class):
    chain = pipe(chain_class, genesis(state=((ADDRESS_A, 'balance', 1), ), ))

    header = chain.get_canonical_head()
    assert header == chain.get_canonical_block_by_number(0).header

    assert header.state_root != constants.BLANK_ROOT_HASH

    account_db = chain.get_vm().state.account_db
    assert account_db.get_balance(ADDRESS_A) == 1
Example #36
def find_background_illumination(fns,
                                 radius=None,
                                 input_bitdepth=None,
                                 quantile=0.5,
                                 stretch_quantile=0.):
    """Use a set of related images to find uneven background illumination.

    Parameters
    ----------
    fns : list of string
        A list of image file names
    radius : int, optional
        The radius of the structuring element used to find background.
        default: The width or height of the input images divided by 4,
        whichever is smaller.
    input_bitdepth : int, optional
        The bit-depth of the input images. Should be specified if non-standard
        bitdepth images are used in a 16-bit image file, e.g. 12-bit images.
        Default is the dtype of the input image.
    quantile : float in [0, 1], optional
        The desired quantile to find background. default: 0.5 (median)
    stretch_quantile : float in [0, 1], optional
        Stretch image to full dtype limit, saturating above this quantile.

    Returns
    -------
    illum : np.ndarray, float, shape (M, N)
        The estimated illumination over the image field.

    See Also
    --------
    `correct_image_illumination`, `correct_multiimage_illumination`.
    """
    # this function follows the "PyToolz" streaming data model to
    # obtain the illumination estimate.
    # first, define the functions for each individual step:
    in_range = ('image' if input_bitdepth is None else
                (0, 2**input_bitdepth - 1))
    rescale = tz.curry(exposure.rescale_intensity)
    normalize = (tz.partial(stretchlim, bottom=stretch_quantile)
                 if stretch_quantile > 0 else skimage.img_as_float)

    # produce a stream of properly-scaled images
    ims = (tz.pipe(fn, io.imread, rescale(in_range=in_range), normalize)
           for fn in fns)

    # take the mean of that stream
    mean_image = mean(ims)

    # return the median filter of that mean
    radius = radius or min(mean_image.shape) // 4
    illum = ndi.percentile_filter(mean_image,
                                  percentile=(quantile * 100),
                                  footprint=morphology.disk(radius))
    return illum
Example #37
def apply_all_link_refs(bytecode: bytes, link_refs: List[Dict[str, Any]],
                        attr_dict: Dict[str, str]) -> bytes:
    """
    Applies all link references corresponding to a valid attr_dict to the bytecode.
    """
    if link_refs is None:
        return bytecode
    link_fns = (apply_link_ref(offset, ref["length"], attr_dict[ref["name"]])
                for ref in link_refs for offset in ref["offsets"])
    linked_bytecode = cytoolz.pipe(bytecode, *link_fns)
    return linked_bytecode
Example #38
 def middleware(method, params):
     # TODO send call to eth-tester without gas, and remove guess_gas entirely
     if method == 'eth_call':
         filled_transaction = pipe(
             params[0],
             fill_default_from,
             fill_default_gas,
         )
         return make_request(method, [filled_transaction] + params[1:])
     elif method in (
             'eth_estimateGas',
             'eth_sendTransaction',
     ):
         filled_transaction = pipe(
             params[0],
             fill_default_from,
         )
         return make_request(method, [filled_transaction] + params[1:])
     else:
         return make_request(method, params)
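fill_default_from and fill_default_gas are not shown here; a toy version of the same pipe (using toolz.assoc and made-up defaults) shows how each stage adds a missing field without mutating the original dict:

from toolz import pipe, assoc

fill_default_from = lambda tx: tx if 'from' in tx else assoc(tx, 'from', '0xdeadbeef')
fill_default_gas = lambda tx: tx if 'gas' in tx else assoc(tx, 'gas', 21000)

params = [{'to': '0x01'}, 'latest']
filled_transaction = pipe(params[0], fill_default_from, fill_default_gas)
print([filled_transaction] + params[1:])
# [{'to': '0x01', 'from': '0xdeadbeef', 'gas': 21000}, 'latest']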
Example #39
 def middleware(method, params):
     # TODO send call to eth-tester without gas, and remove guess_gas entirely
     if method == 'eth_call':
         filled_transaction = pipe(
             params[0],
             fill_default_from,
             fill_default_gas,
         )
         return make_request(method, [filled_transaction] + params[1:])
     elif method in (
         'eth_estimateGas',
         'eth_sendTransaction',
     ):
         filled_transaction = pipe(
             params[0],
             fill_default_from,
         )
         return make_request(method, [filled_transaction] + params[1:])
     else:
         return make_request(method, params)
Example #40
 def hash(self):
     '''
     :returns: the hash of the encoded bytestring
     :rtype: ~hexbytes.main.HexBytes
     '''
     return pipe(
         self,
         rlp.encode,
         keccak,
         HexBytes,
     )
Example #41
def remove_punctuation(line):
    """
    Removes markup, punctuation, digits, and extra whitespace from the corpus.
    :param line: a single line of text from the corpus
    :return: the cleaned line
    """
    return ct.pipe(line, preprocessing.strip_tags,
                   preprocessing.strip_punctuation,
                   preprocessing.strip_numeric,
                   preprocessing.strip_non_alphanum,
                   preprocessing.strip_multiple_whitespaces)
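Example input and output for the chain above, assuming preprocessing is gensim.parsing.preprocessing and ct is cytoolz (the exact output depends on the gensim version, so treat the result as approximate):

import cytoolz as ct
from gensim.parsing import preprocessing

cleaned = ct.pipe("Hello, <b>brave</b> new world 42!!",
                  preprocessing.strip_tags,
                  preprocessing.strip_punctuation,
                  preprocessing.strip_numeric,
                  preprocessing.strip_non_alphanum,
                  preprocessing.strip_multiple_whitespaces)
print(cleaned)  # roughly "Hello brave new world " -- markup, punctuation and digits removed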
Example #42
def get_interactions():
    dates = sorted(set(map(_g('date'), data['interactions'])))
    d = t.pipe(data['interactions'],
               tc.groupby(lambda i: i.student),
               tc.valmap(lambda x: t.pipe(t.groupby(lambda i: i.date,x),
                                          tc.valmap(lambda v: [v[0].time_in, v[0].time_out]))))

    mat = [['student'] + dates]
    for student, attendance in d.items():
        record = [student]
        for dt in dates:
            if dt in attendance:
                record.append(attendance[dt])
            elif dt in data['students'][student].absences:
                record.append(('',''))
            else:
                record.append((None,None))
        mat.append(record)

    return {'interactions': mat}
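A cut-down, runnable version of the nested groupby/valmap pipe with plain dicts (the _g('date') accessor and record objects from the example are replaced by dict access):

import toolz as t
import toolz.curried as tc

interactions = [{'student': 'ann', 'date': 'd1', 'time_in': 9, 'time_out': 10},
                {'student': 'ann', 'date': 'd2', 'time_in': 9, 'time_out': 11},
                {'student': 'bob', 'date': 'd1', 'time_in': 8, 'time_out': 12}]

d = t.pipe(interactions,
           tc.groupby(lambda i: i['student']),
           tc.valmap(lambda xs: t.pipe(t.groupby(lambda i: i['date'], xs),
                                       tc.valmap(lambda v: [v[0]['time_in'], v[0]['time_out']]))))
print(d)
# {'ann': {'d1': [9, 10], 'd2': [9, 11]}, 'bob': {'d1': [8, 12]}}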
Example #43
def normize_text(text):
    """
    Normalizes characters and converts everything to the
    best-matching ASCII representation.

    Expects text to be Unicode.
    (returned text is still Unicode)
    """
    return tlz.pipe(text,
                    verify_unicode,
                    unidecode,
                    unicode)
Example #44
def _ecpairing(data):
    exponent = bn128.FQ12.one()

    processing_pipeline = (
        _process_point(data[start_idx:start_idx + 192])
        for start_idx
        in range(0, len(data), 192)
    )
    exponent = pipe(bn128.FQ12.one(), *processing_pipeline)

    result = bn128.final_exponentiate(exponent) == bn128.FQ12.one()
    return result
Example #45
def find_background_illumination(fns, radius=None, input_bitdepth=None,
                                 quantile=0.5, stretch_quantile=0.):
    """Use a set of related images to find uneven background illumination.

    Parameters
    ----------
    fns : list of string
        A list of image file names
    radius : int, optional
        The radius of the structuring element used to find background.
        default: The width or height of the input images divided by 4,
        whichever is smaller.
    input_bitdepth : int, optional
        The bit-depth of the input images. Should be specified if non-standard
        bitdepth images are used in a 16-bit image file, e.g. 12-bit images.
        Default is the dtype of the input image.
    quantile : float in [0, 1], optional
        The desired quantile to find background. default: 0.5 (median)
    stretch_quantile : float in [0, 1], optional
        Stretch image to full dtype limit, saturating above this quantile.

    Returns
    -------
    illum : np.ndarray, float, shape (M, N)
        The estimated illumination over the image field.

    See Also
    --------
    `correct_image_illumination`, `correct_multiimage_illumination`.
    """
    # this function follows the "PyToolz" streaming data model to
    # obtain the illumination estimate.
    # first, define the functions for each individual step:
    in_range = ('image' if input_bitdepth is None
                else (0, 2**input_bitdepth - 1))
    rescale = tz.curry(exposure.rescale_intensity)
    normalize = (tz.partial(stretchlim, bottom=stretch_quantile)
                 if stretch_quantile > 0
                 else skimage.img_as_float)

    # produce a stream of properly-scaled images
    ims = (tz.pipe(fn, io.imread, rescale(in_range=in_range), normalize)
           for fn in fns)

    # take the mean of that stream
    mean_image = mean(ims)

    # return the median filter of that mean
    radius = radius or min(mean_image.shape) // 4

    mean_image = img_as_ubyte(stretchlim(mean_image))
    illum = imfilter.rank.median(mean_image, selem=morphology.disk(radius))
    return illum
Example #46
def nlner_conll(doc):
    """Baseline NER tagger for Dutch, based on the CoNLL'02 dataset.

    See http://www.clips.uantwerpen.be/conll2002/ner/ for the dataset and
    its license.

    See also
    --------
    frog: NER tagger and dependency parser for Dutch.

    stanford_ner_tag: NER tagger for English.
    """
    from ._nl_conll_ner import ner
    return pipe(doc, fetch, _tokenize_if_needed, ner)
Example #47
File: single.py Project: NLeSC/xtas
def corenlp_lemmatize(doc, output="raw"):
    """Wrapper around the Stanford CoreNLP lemmatizer.

    CoreNLP is downloaded automatically.

    Parameters
    ----------
    output : string
        If 'raw', returns the raw output lines from CoreNLP.
        If 'saf', returns a SAF dictionary.
    """
    from ._corenlp import parse, stanford_to_saf

    return pipe(doc, fetch, parse, _output_func(output, stanford_to_saf))
Example #48
def stem_snowball(doc, language):
    """Stem words in doc using the Snowball stemmer.

    Set the parameter ``language`` to a language code such as "de", "en", "nl", or
    the special string "porter" to get Porter's classic stemming algorithm for
    English.

    See also
    --------
    morphy: smarter approach to stemming (lemmatization), but only for English.
    """
    from Stemmer import Stemmer
    # Build the Stemmer before fetching to force an exception for invalid
    # languages.
    stem = Stemmer(language).stemWords
    return pipe(doc, fetch, _tokenize_if_needed, stem)
Example #49
def serializable_unsigned_transaction_from_dict(web3, transaction_dict):
    '''
    if web3 is None, fill out transaction as much as possible without calling client
    '''
    filled_transaction = pipe(
        transaction_dict,
        dict,
        fill_transaction_defaults(web3),
        chain_id_to_v,
        apply_formatters_to_dict(TRANSACTION_FORMATTERS),
    )
    if 'v' in filled_transaction:
        serializer = Transaction
    else:
        serializer = UnsignedTransaction
    return serializer.from_dict(filled_transaction)
Example #50
File: single.py Project: NLeSC/xtas
def corenlp(doc, output="raw"):
    """Wrapper around the Stanford CoreNLP parser.

    CoreNLP is downloaded automatically.

    If run with all annotators, it requires around 3G of memory,
    and it will keep the process in memory indefinitely.

    Parameters
    ----------
    output : string
        If 'raw', returns the raw output lines from CoreNLP.
        If 'saf', returns a SAF dictionary.
    """
    from ._corenlp import parse, stanford_to_saf

    return pipe(doc, fetch, parse, _output_func(output, stanford_to_saf))
Example #51
def sentiwords_tag(doc, output="bag"):
    """Tag doc with SentiWords polarity priors.

    Performs left-to-right, longest-match annotation of token spans with
    polarities from SentiWords.

    Uses no part-of-speech information; when a span has multiple possible
    taggings in SentiWords, the mean is returned.

    Parameters
    ----------
    doc : document or list of strings

    output : string, optional
        Output format. Either "bag" for a histogram (dict) of annotated token
        span frequencies, or "tokens" for a mixed list of strings and (list of
        strings, polarity) pairs.


    See also
    --------
    movie_review_emotions: per-sentence fine-grained sentiment tagger

    movie_review_polarity: figure out if a movie review is positive or negative
    """
    from ._sentiwords import tag

    tagged = pipe(doc, fetch, _tokenize_if_needed, tag)
    if output == "bag":
        counts = {}
        for ngram, polarity in tagged:
            if polarity == 0:
                continue
            if ngram in counts:
                counts[ngram][1] += 1
            else:
                counts[ngram] = [polarity, 1]
        return counts

    elif output == "tokens":
        return [ngram if polarity == 0 else (ngram, polarity)
                for ngram, polarity in tagged]

    else:
        raise ValueError("unknown output format %r" % output)
Example #52
def frog(doc, output='raw'):
    """Wrapper around the Frog lemmatizer/POS tagger/NER/dependency parser.

    Expects Frog to be running in server mode, listening on
    ``localhost:${XTAS_FROG_PORT}`` or port 9887 if the environment variable
    ``XTAS_FROG_PORT`` is not set. It is *not* started for you.

    Currently, the module is only tested with all frog modules active except
    for the NER and parser.

    The following line starts Frog in the correct way:

    ``frog -S ${XTAS_FROG_PORT:-9887}``

    Parameters
    ----------
    output : string
        If 'raw', returns the raw output lines from Frog itself.
        If 'tokens', returns dictionaries for the tokens.
        If 'saf', returns a SAF dictionary.

    References
    ----------
    `Frog homepage <http://ilk.uvt.nl/frog/>`_

    See also
    --------
    nlner_conll: simple NER tagger for Dutch.
    """
    from ._frog import call_frog, parse_frog, frog_to_saf
    if output not in ('raw', 'tokens', 'saf'):
        raise ValueError("Unknown output: {output}, "
                         "please choose either raw, tokens, or saf"
                         .format(**locals()))
    result = pipe(doc, fetch, call_frog)
    if output == 'raw':
        return list(result)
    if output in ('tokens', 'saf'):
        result = parse_frog(result)
        if output == 'tokens':
            return list(result)
        return frog_to_saf(result)
Example #53
def movie_review_polarity(doc):
    """Movie review polarity classifier.

    Determines whether the film review ``doc`` is positive or negative. Might
    be applicable to other types of document as well, but uses a statistical
    model trained on a corpus of user reviews of movies, all in English.

    Returns
    -------
    p : float
        The probability that the movie review ``doc`` is positive.

    See also
    --------
    movie_review_emotions: per-sentence fine-grained sentiment tagger

    sentiwords_tag: more generic sentiment expression tagger
    """
    from ._polarity import classify
    return pipe(doc, fetch, classify)
Example #54
File: single.py Project: NLeSC/xtas
def morphy(doc):
    """Lemmatize tokens using morphy, WordNet's lemmatizer.

    Finds the morphological root of all words in ``doc``, which is assumed to
    be written in English.

    Returns
    -------
    lemmas : list
        List of lemmas.

    See also
    --------
    stem_snowball: simpler approach to lemmatization (stemming).
    """
    # XXX Results will be better if we do POS tagging first, but then we
    # need to map Penn Treebank tags to WordNet tags.
    nltk_download("wordnet")
    tokens = pipe(doc, fetch, _tokenize_if_needed)
    return map(nltk.WordNetLemmatizer().lemmatize, tokens)
Example #55
def corenlp_lemmatize(doc, output='raw'):
    """Wrapper around the CoreNLP lemmatizer.

    Expects ``$CORENLP_HOME`` to point to the CoreNLP installation dir.

    Tested with `CoreNLP 2014-01-04
    <http://nlp.stanford.edu/software/stanford-corenlp-full-2014-01-04.zip>`_.

    Parameters
    ----------
    output : string
        If 'raw', returns the raw output lines from CoreNLP.
        If 'saf', returns a SAF dictionary.
    """
    from ._corenlp import parse, stanford_to_saf

    try:
        transf = {"raw": identity, "saf": stanford_to_saf}[output]
    except KeyError:
        raise ValueError("Unknown output format %r" % output)

    return pipe(doc, fetch, parse, transf)
Example #56
	def load(self, line, word_classes = False):

		#Tokenize zho
		if self.language == "zho" and self.zho_split == True:
								
			line = [x for x in self.tk.cut(line, cut_all = True, HMM = True) if x != ""]
			line = " ".join(line)

		#Remove links, hashtags, at-mentions, mark-up, and "RT"
		line = re.sub(r"http\S+", "", line)
		line = re.sub(r"@\S+", "", line)
		line = re.sub(r"#\S+", "", line)
		line = re.sub("<[^>]*>", "", line)
		line = line.replace(" RT", "").replace("RT ", "")
								
		#Remove emojis
		line = re.sub(self.myre, "", line)
									
		#Remove punctuation and extra spaces
		line = ct.pipe(line, 
						preprocessing.strip_tags, 
						preprocessing.strip_punctuation, 
						preprocessing.split_alphanum,
						preprocessing.strip_non_alphanum,
						preprocessing.strip_multiple_whitespaces
						)
									
		#Strip and reduce to max training length
		line = line.lower().strip().lstrip()

		if word_classes == False:
			line = self.r.tagRawSentenceHash(rawLine = line, DICT = self.DICT, word_dict = self.domain_dict)
			#Array of tuples (LEX, POS, CAT)

		#For training word embeddings, just return the list
		else:
			line = self.r.tagRawSentenceGenSim(rawLine = line, DICT = self.DICT)

		return np.array(line)
Example #57
def find_matching_event_abi(abi, event_name=None, argument_names=None):

    filters = [
        functools.partial(filter_by_type, 'event'),
    ]

    if event_name is not None:
        filters.append(functools.partial(filter_by_name, event_name))

    if argument_names is not None:
        filters.append(
            functools.partial(filter_by_argument_name, argument_names)
        )

    event_abi_candidates = pipe(abi, *filters)

    if len(event_abi_candidates) == 1:
        return event_abi_candidates[0]
    elif not event_abi_candidates:
        raise ValueError("No matching events found")
    else:
        raise ValueError("Multiple events found")
Example #58
def movie_review_emotions(doc):
    """Emotion (fine-grained sentiment) tagger for movie reviews.

    The training data for this function is that of Buitinck et al., with the
    training and test data concatenated. The algorithm is SVMs in a binary
    relevance (one-vs-rest) combination.

    Returns
    -------
    tagged : list of (string, list of string)
        A list of (sentence, labels) pairs. Each sentence may have zero or
        more labels.

    References
    ----------
    L. Buitinck, J. van Amerongen, E. Tan and M. de Rijke (2015).
    Multi-emotion detection in user-generated reviews. Proc. ECIR.
    https://staff.fnwi.uva.nl/m.derijke/wp-content/papercite-data/pdf/buitinck-multi-emotion-2015.pdf
    """
    from ._emotion import classify
    nltk_download('punkt')
    sentences = pipe(doc, fetch, nltk.sent_tokenize)
    return list(zip(sentences, classify(sentences)))