Exemple #1
0
def main():
    """
    This function is where you build your own pipeline from the parts that are
    available in this module. Please see the example below for how to do this
    or RTFM
    """

    # load your data -- the context manager guarantees the handle is closed
    # even if reading raises (the original leaked the file descriptor)
    fp = "data/test.txt"
    with open(fp, "r") as handle:
        text = handle.read().splitlines()

    # compose your cleaning functions
    clean = compose(model.parse, data.parse)
    target = [clean(line) for line in text]

    # print each original line next to its cleaned form
    for o, t in zip(text, target):
        print(f"{o:>30s}: {t}")
Exemple #2
0
def roots(tokens: Doc) -> List[str]:
    """Return the tokens whose dependency label is 'ROOT'.

    NOTE(review): despite the ``List[str]`` annotation this returns the
    token objects themselves (whatever iterating *tokens* yields), not
    strings -- the annotation looks inaccurate; confirm against callers
    before tightening it.
    """
    # direct comprehension replaces the redundant
    # [i for i in filter(lambda ...)] wrapping
    return [token for token in tokens if token.dep_ == 'ROOT']


#+TODO: fst is here for when you don't want to/cannot do NER; there is
#       currently no NER so it is the default target selector
def fst(xs: List[str]) -> str:
    """Select the first element of *xs* (raises IndexError when empty)."""
    head = xs[0]
    return head


def lemma(x: Doc) -> str:
    """Return the lemmatised text of a parsed document."""
    lemmatised = x.lemma_
    return lemmatised


def process(x: str) -> Doc:
    """Run the module-level language pipeline ``proc`` over raw text."""
    # thin wrapper so the pipeline can sit inside a compose() chain
    return proc(x)


#+TODO: fix this - sadly you can't create a type synonym until after you load a
# language model so it just breaks the tidiness of the code but that's pretty
# minimal in the grand scheme of things
def vocab(m: spacy.lang.en.English, x: str) -> str:
    """Keep *x* only when it appears in the model vocabulary, else ''."""
    if x in m.vocab:
        return x
    return ''


# partially apply the loaded model so `vocab` becomes a unary String -> String
# function; NOTE: this deliberately rebinds (shadows) the two-argument `vocab`
# defined above
vocab = functools.partial(vocab, proc)

# rewrite me
# full cleaning pipeline; presumably applied right-to-left (process first,
# vocab last) -- confirm against the compose() implementation
parse = compose(vocab, lemma, fst, roots, process)
Exemple #3
0
def regexes(r: dict, x: str) -> str:
    """Apply every pattern -> replacement rule in *r* to *x*.

    Each (pattern, replacement) item becomes a partially-applied
    ``re.sub``; the partials are composed and run over *x*.
    """
    # spacing normalised (r: dict, not r : dict) for PEP 8 consistency
    # with the other definition of this function in the codebase
    return compose(*[functools.partial(re.sub, i, j) for i, j in r.items()])(x)
Exemple #4
0
#-- Definitions -----------------------------------------------------------------
#-- cleaning
#NOTE: all functions are endomorphic String -> String so their composition does
#      not need to be tested and they can be composed in any order

# read the config files for the operations
# NOTE(review): paths are relative to the working directory -- running from
# elsewhere raises FileNotFoundError; confirm how the app is launched
with open('etc/regex') as f:
    # TOML table of regex pattern -> replacement rules
    replace = toml.load(f)

with open('etc/encoding') as f:
    # TOML table of encoding-fixup pattern -> replacement rules
    encode = toml.load(f)

def regexes(r: dict, x: str) -> str:
    """Apply every pattern -> replacement rule in *r* to *x*.

    Each (pattern, replacement) item becomes a partially-applied
    ``re.sub``; the partials are composed and run over *x*.
    """
    # spacing normalised (r: dict, not r : dict) for PEP 8 consistency
    # with the other definition of this function in the codebase
    return compose(*[functools.partial(re.sub, i, j) for i, j in r.items()])(x)

# specialise regexes() on each rule table, giving unary String -> String
# cleaners that slot straight into compose()
replacements = functools.partial(regexes, replace)
encoding     = functools.partial(regexes, encode)


def unlines(x: str) -> str:
    """Delete every newline character from *x*."""
    return ''.join(x.split('\n'))


# Composition -----------------------------------------------------------------
# single-string cleaner; all parts are String -> String so order is safe to
# change -- presumably applied right-to-left (strip first), confirm against
# the compose() implementation
parse = compose( encoding
               , replacements
               , unlines
               , str.strip
               )
Exemple #5
0
# standard library
import functools
import re

# third party
import toml

# project
from woffle.functions.compose import compose

#-- Definitions -----------------------------------------------------------------
#-- cleaning
#NOTE: all functions are endomorphic String -> String so their composition does
#      not need to be tested and they can be composed in any order

# read the config files for the operations
# NOTE(review): paths are relative to the working directory -- running from
# elsewhere raises FileNotFoundError; confirm how the app is launched
with open('etc/regex') as f:
    # TOML table of regex pattern -> replacement rules
    replace = toml.load(f)

with open('etc/encoding') as f:
    # TOML table of encoding-fixup pattern -> replacement rules
    encode = toml.load(f)


def regexes(r: dict, x: str) -> str:
    """Apply every pattern -> replacement rule in *r* to *x*.

    Each (pattern, replacement) item is turned into a partially-applied
    ``re.sub``; the resulting substitutions are composed and run over *x*.
    """
    substitutions = [functools.partial(re.sub, pat, rep) for pat, rep in r.items()]
    return compose(*substitutions)(x)


# specialise regexes() on each rule table, giving unary String -> String
# cleaners that slot straight into compose()
replacements = functools.partial(regexes, replace)
encoding = functools.partial(regexes, encode)

# Composition -----------------------------------------------------------------
# single-string cleaner; presumably applied right-to-left (strip first) --
# confirm against the compose() implementation
parse_ = compose(encoding, replacements, str.strip)

# lazily map the cleaner over an iterable of strings; NOTE: this returns a
# map object, so callers must iterate it (e.g. list(parse(lines)))
parse = functools.partial(map, parse_)