def sentences(lines):
    #@typecheck([str], [(str, [object])], n=int)
    def parseloop(lines, n=0):
        # Split at each line whose indentation equals the current level n; the
        # head of each group is the item itself, the tail is its deeper material.
        return [(clean(group[0]),
                 parseloop(group[1:], n=n+1) if group[1:] else [])
                for group in splitby(lambda line: n == indent(line), lines, True)]
    # Split the transcript at '<sent>' markers, key each chunk by its speaker
    # code, and parse the chunk's remaining useful lines with parseloop
    # (keeping the first parsed item).
    return dct.collapse(filter(None, splitby(elem('<sent>'), lines, first=True)),
                        pipe(car, speaker_code),
                        pipe(cdr, cur(filter, useful), parseloop, car))
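
# The nesting that parseloop builds can be hard to see through the combinators.
# Below is a plain-Python sketch of the same idea, assuming only that leading
# spaces mark nesting depth; splitby, clean, dct.collapse and friends are this
# project's own helpers and are not reproduced. nest and plain_indent are
# illustrative names, not part of the codebase.
def plain_indent(line):
    "Number of leading spaces."
    return len(line) - len(line.lstrip(' '))

def nest(lines, level=0):
    "[str] -> [(str, [nested])]: each line at `level` owns the deeper lines after it"
    out, i = [], 0
    while i < len(lines):
        j = i + 1
        while j < len(lines) and plain_indent(lines[j]) != level:
            j += 1
        out.append((lines[i].strip(), nest(lines[i + 1:j], level + 1)))
        i = j
    return out

# nest(["a", " b", "  c", " d", "e"])
# => [('a', [('b', [('c', [])]), ('d', [])]), ('e', [])]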

def group_words(csv):
    "[[str]] -> {str:{str:{str:[float]}}} i.e. {Word:{Segment:{Feature:[Value]}}}"
    # The first column of each data row is word + segment + feature: a run of
    # lowercase letters (the word), then a segment label ending at its first
    # digit, then the feature name; the remaining columns are float values.
    segment_name = lambda s: s[:re.search('[0-9]', s).end()]
    segment = fnc.pipe(car, dropwhile(str.islower), segment_name)
    feature = lambda s: s[re.search('[0-9]', s).end():]
    fillsegments = curried(dct.map_items)(makesegment)
    features = carcdr(lambda title, data: (feature(title), map(float, data)))
    phones = lambda l: dct.map(dict, dct.collapse(l, segment, features))
    words = dct.collapse(cdr(csv), fnc.pipe(car, takewhile(str.islower)), fnc.ident)
    return dct.map(fnc.pipe(phones, fillsegments), words)
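
# A de-sugared sketch of the same grouping without the dct/fnc helpers, assuming
# each data row starts with a title of the form word + segment + feature (e.g.
# "flickaA1dur") followed by float columns; group_words_plain and the sample
# rows below are illustrative only.
import itertools
import re

def group_words_plain(rows):
    "[[str]] -> {word:{segment:{feature:[float]}}}"
    out = {}
    for title, *data in rows[1:]:                      # skip the header row
        word = ''.join(itertools.takewhile(str.islower, title))
        rest = title[len(word):]                       # e.g. "A1dur"
        cut = re.search('[0-9]', rest).end()           # the first digit closes the segment label
        segment, feature = rest[:cut], rest[cut:]
        out.setdefault(word, {}).setdefault(segment, {})[feature] = [float(x) for x in data]
    return out

# group_words_plain([["title", "t1", "t2"],
#                    ["flickaA1dur", "0.10", "0.12"],
#                    ["flickaA1int", "63", "61"]])
# => {'flicka': {'A1': {'dur': [0.1, 0.12], 'int': [63.0, 61.0]}}}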

def run_compare_to_base(fs):
    "[str] -> [{utf-8-char: set<lev.Rule>}]"
    return map(pipe(cur(compare, fs[0]), classify), fs)
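
# The same "compare every transcription against the first one" pattern without
# the pipe/cur combinators; compare and classify are passed in explicitly here
# since they belong to the project's lev-based helpers (a sketch, not a drop-in
# replacement).
def run_compare_to_base_plain(fs, compare, classify):
    base = fs[0]
    return [classify(compare(base, f)) for f in fs]    # the base is compared with itself too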

def corpusSize(path, regions):
    "path * {region:[filename]} -> [(region, total size)] sorted by total"
    numbers = dct.map(lambda files: lap(pipe(read(path), len), files), regions)
    return sorted(dct.map(sum, numbers).items(), key=snd)
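
# corpusSize spelled out without the combinators, assuming (as the curried use
# above suggests) that read(path) returns a function from filename to file
# contents; corpus_size_plain is illustrative only.
def corpus_size_plain(path, regions):
    "path * {region:[filename]} -> [(region, total size)] sorted by total"
    totals = {region: sum(len(read(path)(f)) for f in files)
              for region, files in regions.items()}
    return sorted(totals.items(), key=lambda kv: kv[1])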

def groupedRegions(path, regions):
    "path * {region:[site]} -> {region:[filename]}"
    return dct.map(pipe(cur(groupedSites)(path), dict.values, concat), regions)

swewords = [w for r in regions for w in r.splitlines()]    # one word per line in each region's text
swevocab = set(swewords)
swecount = dct.count(swewords)
sharedVocab = swevocab & talvocab                          # word types common to both corpora
sharedTokensTal = sum(talcount[w] for w in sharedVocab)    # tal tokens whose word type is shared
sharedTokensSwe = sum(swecount[w] for w in sharedVocab)    # swe tokens whose word type is shared
print(len(talvocab))                                       # tal vocabulary size
print(len(swevocab))                                       # swe vocabulary size
print(len(swevocab & talvocab))                            # shared vocabulary
print(len(swevocab - talvocab))                            # word types only in the swe corpus
print(len(talvocab - swevocab) + len(swevocab - talvocab) + len(swevocab & talvocab))   # total distinct word types
print(sharedTokensTal)
print(sharedTokensSwe)
print(len(talwords) + len(swewords) - sharedTokensTal - sharedTokensSwe)   # tokens of unshared word types

lap = pipe(map, list)        # Python 3 sucks
lilter = pipe(filter, list)  # (iterators as imperative laziness are at fault here)

def printCorpusSize(path, regions):
    # Per-file sizes are computed here, not taken from corpusSize, so that they
    # can be printed next to each region's total.
    numbers = dct.map(lambda files: lap(pipe(read(path), len), files), regions)
    for region, total in sorted(dct.map(sum, numbers).items(), key=snd):
        print(region, ":\t", total, '\t', numbers[region])

def swediaProvinceSize():
    "Example use of corpusSize"
    return corpusSize(consts.swpath, groupedRegions(consts.swpath, consts.swediaProvinces))

def swediaSiteSize():
    "Another example use"