Exemple #1
0
def sentences(lines):
    """Split `lines` at '<sent>' markers and collapse the chunks by speaker.

    Each non-empty chunk produced by `splitby(elem('<sent>'), ...)` is handed
    to `dct.collapse` with two extractors: the key is `speaker_code` applied
    to the chunk's first line, and the value is the first entry of the nested
    parse of the chunk's remaining `useful` lines.
    NOTE(review): the exact grouping semantics depend on `splitby` and
    `dct.collapse`, which are defined elsewhere -- confirm there.
    """
    # Intended types (from the disabled typecheck decorator):
    #   parseloop: [str] -> [(str, [object])], n=int
    def parseloop(block, n=0):
        # Lines whose indent equals the current depth start a new group.
        def at_depth(line):
            return n == indent(line)

        parsed = []
        for group in splitby(at_depth, block, True):
            head = clean(group[0])
            rest = group[1:]
            # Deeper lines are parsed recursively, one level further in.
            children = parseloop(rest, n=n+1) if rest else []
            parsed.append((head, children))
        return parsed

    chunks = filter(None, splitby(elem('<sent>'), lines, first=True))
    speaker_of = pipe(car, speaker_code)
    parse_of = pipe(cdr, cur(filter, useful), parseloop, car)
    return dct.collapse(chunks, speaker_of, parse_of)
Exemple #2
0
def group_words(csv):
    """[[str]] -> {str:{str:{str:[float]}}} ie {Word:{Segment:{Feature:[Value]}}}

    The first row of `csv` is dropped (`cdr`). The remaining rows are grouped
    by the leading lowercase run of their first cell (the word), then within
    each word by segment name, with each row contributing one
    (feature, values) pair. Every word's segment mapping is finally passed
    through `dct.map_items` with `makesegment`.
    NOTE(review): grouping behaviour is inferred from how `dct.collapse` is
    called here; confirm against its definition.
    """
    def digit_end(s):
        # Index just past the first digit in s; the title is split there.
        return re.search('[0-9]', s).end()

    def segment_name(s):
        return s[:digit_end(s)]

    segment = fnc.pipe(car, dropwhile(str.islower), segment_name)

    def feature(s):
        return s[digit_end(s):]

    fillsegments = curried(dct.map_items)(makesegment)

    def title_and_values(title, data):
        return (feature(title), map(float, data))

    features = carcdr(title_and_values)

    def phones(rows):
        return dct.map(dict, dct.collapse(rows, segment, features))

    body_rows = cdr(csv)
    word_of = fnc.pipe(car, takewhile(str.islower))
    words = dct.collapse(body_rows, word_of, fnc.ident)
    return dct.map(fnc.pipe(phones, fillsegments), words)
Exemple #3
0
def run_compare_to_base(fs):
    """[str] -> [{utf-8-char:set<lev.Rule>}]

    Compare every entry of `fs` (including the first one itself) against
    `fs[0]` and run `classify` on each comparison result.
    """
    base = fs[0]
    compare_to_base = cur(compare, base)
    return map(pipe(compare_to_base, classify), fs)
Exemple #4
0
def corpusSize(path, regions):
    """path*{region:[filename]} -> [(region, total)] sorted by total.

    For every region, the length of each file's contents (as returned by
    `read(path)` applied to the filename) is summed; the (region, total)
    pairs are returned sorted on the total.
    """
    def file_sizes(files):
        size_of = pipe(read(path), len)
        return lap(size_of, files)

    sizes_by_region = dct.map(file_sizes, regions)
    totals = dct.map(sum, sizes_by_region)
    return sorted(totals.items(), key=snd)
Exemple #5
0
def groupedRegions(path, regions):
    """path*{region:[site]} -> {region:[filename]}

    Each region's site list is passed to `groupedSites` (with `path` already
    applied); the values of the resulting dict are then joined with `concat`.
    """
    sites_to_files = cur(groupedSites)(path)
    filenames_for = pipe(sites_to_files, dict.values, concat)
    return dct.map(filenames_for, regions)
Exemple #6
0
    # Flatten all regions into one word list: every line of every region is
    # treated as one word (presumably each region is a newline-separated
    # string -- TODO confirm against the caller).
    swewords = [w for r in regions for w in r.splitlines()]
    swevocab = set(swewords)
    swecount = dct.count(swewords)
    # NOTE(review): despite the "unshared" names, both sums iterate over
    # sharedVocab (defined outside this excerpt, as are talcount, talvocab
    # and talwords) -- confirm which vocabulary is intended.
    unsharedTokensTal = sum(talcount[w] for w in sharedVocab)
    unsharedTokensSwe = sum(swecount[w] for w in sharedVocab)
    # Vocabulary sizes: each corpus, their intersection, and the words found
    # only in swevocab.
    print(len(talvocab))
    print(len(swevocab))
    print(len(swevocab & talvocab))
    print(len(swevocab - talvocab))
    # The three disjoint parts add up to the size of the combined vocabulary.
    print(len(talvocab - swevocab)
          + len(swevocab - talvocab)
          + len(swevocab & talvocab))
    print(unsharedTokensTal)
    print(unsharedTokensSwe)
    # Total token count of both corpora minus the two sums above.
    print(len(talwords) + len(swewords) - unsharedTokensTal - unsharedTokensSwe)
# List-returning variants of map and filter. Under Python 3 the builtins
# return lazy iterators, so their result is passed on to list()
# (presumably pipe composes left to right -- confirm against its definition).
lap = pipe(map, list)
lilter = pipe(filter, list)
# Rationale: a lazy iterator can only be consumed once, which is easy to trip
# over in code that expects a list.
def corpusSize(path, regions):
    """path*{region:[filename]} -> [(region, total)] sorted by total.

    For every region, the length of each file's contents (as returned by
    `read(path)` applied to the filename) is summed; the (region, total)
    pairs are returned sorted on the total.
    """
    def file_sizes(files):
        size_of = pipe(read(path), len)
        return lap(size_of, files)

    sizes_by_region = dct.map(file_sizes, regions)
    totals = dct.map(sum, sizes_by_region)
    return sorted(totals.items(), key=snd)
def printCorpusSize(path, regions):
    """Print one line per region: name, total size, and per-file sizes.

    path    -- passed to `read` to obtain the per-filename reader.
    regions -- {region:[filename]}.

    Regions are printed in ascending order of total size, matching the
    ordering that `corpusSize` produces.
    """
    # The per-file sizes are built here rather than taken from corpusSize:
    # that function only returns the totals, and its local `numbers` is not
    # visible from this scope (referencing it raised NameError).
    numbers = dct.map(lambda files: lap(pipe(read(path), len), files), regions)
    totals = dct.map(sum, numbers)
    for region, total in sorted(totals.items(), key=snd):
        print(region, ":\t", total, '\t', numbers[region])
def swediaProvinceSize():
    """Example use of corpusSize: per-province sizes for the path in consts.swpath.

    Returns the sorted (region, total) list from `corpusSize`; previously the
    result was computed and then discarded.
    """
    provinces = groupedRegions(consts.swpath, consts.swediaProvinces)
    return corpusSize(consts.swpath, provinces)
def swediaSiteSize():
    "Another example use"