def main():
    """
    Build your own pipeline from the parts that are available in this
    module.  The body below is the worked example — or RTFM.
    """
    # load your data
    fp = "data/test.txt"
    # context manager closes the handle (the original leaked an open file);
    # the identity comprehension around splitlines() was redundant
    with open(fp, "r") as handle:
        text = handle.read().splitlines()

    # compose your cleaning functions
    clean = compose(model.parse, data.parse)
    target = [clean(line) for line in text]

    # print original vs cleaned, side by side (zip pairs them directly —
    # no need for an intermediate list of tuples)
    for o, t in zip(text, target):
        print(f"{o:>30s}: {t}")
def roots(tokens: Doc) -> List[str]:
    """Return the tokens of *tokens* whose dependency label is ROOT.

    NOTE(review): despite the annotation this returns spaCy Token objects,
    not str — the hint is kept for interface compatibility, confirm upstream.
    """
    # plain comprehension instead of [i for i in filter(lambda ...)]
    return [tok for tok in tokens if tok.dep_ == 'ROOT']


#+TODO: first is here for when you don't want to/cannot do NER, currently there
# no NER so it is the default target selector
def fst(xs: List[str]) -> str:
    """Default target selector: take the first element (raises IndexError on [])."""
    return xs[0]


def lemma(x: Doc) -> str:
    """Lemmatised form of *x* (a token-like object exposing .lemma_)."""
    return x.lemma_


def process(x: str) -> Doc:
    """Run the loaded language model over raw text."""
    return proc(x)


#+TODO: fix this - sadly you can't create a type synonym until after you load a
# language model so it just breaks the tidiness of the code but that's pretty
# minimal in the grand scheme of things
def vocab(m: spacy.lang.en.English, x: str) -> str:
    """Return *x* if it is in the model vocabulary, else the empty string."""
    return x if x in m.vocab else ''


# bind the loaded model as the first argument so vocab becomes str -> str
vocab = functools.partial(vocab, proc)

# rewrite me
parse = compose(vocab, lemma, fst, roots, process)
def regexes(r: dict, x: str) -> str:
    """Apply every pattern -> replacement pair in *r* to *x*.

    Each item of *r* becomes one re.sub step; the steps are composed and
    the whole chain is applied to *x*.
    """
    substitutions = [
        functools.partial(re.sub, pattern, replacement)
        for pattern, replacement in r.items()
    ]
    return compose(*substitutions)(x)
#-- Definitions -----------------------------------------------------------------

#-- cleaning
#NOTE: all functions are endomorphic String -> String so their composition does
#      not need to be tested and they can be composed in any order

# read the config files for the operations
with open('etc/regex') as f:
    replace = toml.load(f)

with open('etc/encoding') as f:
    encode = toml.load(f)


def regexes(r: dict, x: str) -> str:
    """Apply each pattern -> replacement pair in *r* to *x* in turn."""
    steps = (
        functools.partial(re.sub, pattern, replacement)
        for pattern, replacement in r.items()
    )
    return compose(*steps)(x)


# specialise regexes on the two config tables loaded above
replacements = functools.partial(regexes, replace)
encoding = functools.partial(regexes, encode)


def unlines(x: str) -> str:
    """Remove embedded newline characters from *x*."""
    return x.replace('\n', '')


# Composition -----------------------------------------------------------------
parse = compose(
    encoding,
    replacements,
    unlines,
    str.strip,
)
# third party
import toml

# project
from woffle.functions.compose import compose


#-- Definitions -----------------------------------------------------------------

#-- cleaning
#NOTE: all functions are endomorphic String -> String so their composition does
#      not need to be tested and they can be composed in any order

# read the config files for the operations
with open('etc/regex') as f:
    replace = toml.load(f)

with open('etc/encoding') as f:
    encode = toml.load(f)


def regexes(r: dict, x: str) -> str:
    """Apply each pattern -> replacement pair in *r* to *x*."""
    steps = [
        functools.partial(re.sub, pattern, replacement)
        for pattern, replacement in r.items()
    ]
    return compose(*steps)(x)


# specialise on the two config tables loaded above
replacements = functools.partial(regexes, replace)
encoding = functools.partial(regexes, encode)

# Composition -----------------------------------------------------------------
# parse_ cleans a single string; parse lifts it over an iterable of strings
parse_ = compose(encoding, replacements, str.strip)
parse = functools.partial(map, parse_)