def bag_of_words(defs, stem_flag, quiet):
    '''convert dictionary definitions into bags of words

    Mutates defs in place: each definition string becomes a list of
    standardized (optionally stemmed) words, hapax legomena are removed,
    and lemmata left with no words are deleted.  Returns defs.
    '''
    if not quiet:
        print("Converting defs to bags of words")
    tally = {}
    empty_keys = set()
    pr = ProgressBar(maxval = len(defs))
    for lemma in defs:
        pr.update(pr.currval + 1)
        # split the raw definition and standardize each token
        words = [
            tesslang.standardize('any', w)
            for w in pat.clean['any'].split(defs[lemma])
            if not w.isspace() and w != ''
        ]
        if stem_flag:
            words = [stem(w) for w in words]
        defs[lemma] = words
        if not words:
            empty_keys.add(lemma)
            continue
        # count corpus-wide occurrences of every definition word
        for w in words:
            tally[w] = tally.get(w, 0) + 1
    pr.finish()
    if not quiet:
        print("Removing hapax legomena")
    pr = ProgressBar(maxval = len(defs))
    for lemma in defs:
        pr.update(pr.currval + 1)
        # drop words that occur only once across all definitions
        kept = [w for w in defs[lemma] if tally[w] > 1]
        defs[lemma] = kept
        if not kept:
            empty_keys.add(lemma)
    pr.finish()
    if not quiet:
        print('Lost {0} empty definitions'.format(len(empty_keys)))
    for k in empty_keys:
        del defs[k]
    return defs
def parse_stop_list(lang, name, quiet): '''read frequency table''' # open stoplist file filename = None if name == '*': filename = os.path.join(fs['data'], 'common', lang + '.stem.freq') else: filename = os.path.join(fs['data'], 'v3', lang, name, name + '.freq_stop_stem') if not quiet: print 'Reading stoplist {0}'.format(filename) pr = progressbar.ProgressBar(os.stat(filename).st_size, quiet) try: f = codecs.open(filename, encoding='utf_8') except IOError as err: print "Can't read {0}: {1}".format(filename, str(err)) sys.exit(1) # read stoplist header to get total token count head = f.readline() m = re.compile('#\s+count:\s+(\d+)', re.U).match(head) if m is None: print "Can't find header in {0}".format(filename) sys.exit(1) total = int(m.group(1)) pr.advance(len(head.encode('utf-8'))) # read the individual token counts, divide by total rank = {} n = 1 for line in f: lemma, count = line.split('\t') lemma = tesslang.standardize(lang, lemma) lemma = number.sub('', lemma) rank[lemma] = math.log(n) n += 1 pr.advance(len(line.encode('utf-8'))) return (rank)
def parse_stop_list(lang, name, quiet): '''read frequency table''' # open stoplist file filename = None if name == '*': filename = os.path.join(fs['data'], 'common', lang + '.stem.freq') else: filename = os.path.join(fs['data'], 'v3', lang, name, name + '.freq_stop_stem') if not quiet: print 'Reading stoplist {0}'.format(filename) pr = progressbar.ProgressBar(os.stat(filename).st_size, quiet) try: f = codecs.open(filename, encoding='utf_8') except IOError as err: print "Can't read {0}: {1}".format(filename, str(err)) sys.exit(1) # read stoplist header to get total token count head = f.readline() m = re.compile('#\s+count:\s+(\d+)', re.U).match(head) if m is None: print "Can't find header in {0}".format(filename) sys.exit(1) total = int(m.group(1)) pr.advance(len(head.encode('utf-8'))) # read the individual token counts, divide by total rank = {} n = 1 for line in f: lemma, count = line.split('\t') lemma = tesslang.standardize(lang, lemma) lemma = number.sub('', lemma) rank[lemma] = math.log(n) n += 1 pr.advance(len(line.encode('utf-8'))) return(rank)
def parse_stop_list(lang, name, quiet):
    '''read frequency table

    Returns a dict mapping each standardized lemma to its relative
    frequency (per-line count divided by the total token count from
    the file header).  A name of '*' selects the language-wide stem
    frequency table; otherwise a per-text table is read.  Exits the
    program if the file is missing or its header is malformed.
    '''
    # open stoplist file
    if name == '*':
        filename = os.path.join(basedir, "data", lang + '.stem.freq')
    else:
        filename = os.path.join(basedir, name + '.freq_stop_stem')
    if not quiet:
        print('Reading stoplist {0}'.format(filename))
    # progress bar sized by bytes so updates can track raw reads
    pr = ProgressBar(maxval = os.stat(filename).st_size)
    try:
        f = open(filename, "r", encoding="utf_8")
    except IOError as err:
        print("Can't read {0}: {1}".format(filename, str(err)))
        sys.exit(1)
    # BUG FIX: the original never closed the file; 'with' guarantees it
    with f:
        # read stoplist header to get total token count
        head = f.readline()
        # raw string: '\s'/'\d' in a plain literal are invalid escapes
        m = re.compile(r'#\s+count:\s+(\d+)').match(head)
        if m is None:
            print("Can't find header in {0}".format(filename))
            sys.exit(1)
        total = int(m.group(1))
        pr.update(pr.currval + len(head.encode('utf-8')))
        # read the individual token counts, divide by total
        freq = dict()
        for line in f:
            lemma, count = line.split('\t')
            lemma = tesslang.standardize(lang, lemma)
            lemma = pat.number.sub('', lemma)
            freq[lemma] = float(count)/total
            pr.update(pr.currval + len(line.encode('utf-8')))
    pr.finish()
    return freq
def bag_of_words(defs, stem_flag, quiet=False): '''convert dictionary definitions into bags of words''' # convert to bag of words, count words if not quiet: print "Converting defs to bags of words" count = {} pr = progressbar.ProgressBar(len(defs), quiet) empty_keys = set() for lemma in defs: pr.advance() defs[lemma] = [ tesslang.standardize('any', w) for w in pat.clean['any'].split(defs[lemma]) if not w.isspace() and w != '' ] if len(defs[lemma]) > 0: for d in defs[lemma]: if d in count: count[d] += 1 else: count[d] = 1 else: empty_keys.add(lemma) if not quiet: print "Removing hapax legomena" pr = progressbar.ProgressBar(len(defs), quiet) for lemma in defs: pr.advance() defs[lemma] = [w for w in defs[lemma] if count[w] > 1] if defs[lemma] == []: empty_keys.add(lemma) if not quiet: print 'Lost {0} empty definitions'.format(len(empty_keys)) for k in empty_keys: del defs[k] return (defs)
def parse_stem_dict(lang, quiet): '''parse the csv stem dictionaries of Helma Dik''' filename = os.path.join(fs['data'], 'common', lang + '.lexicon.csv') f = open(filename, 'r') if not quiet: print 'Reading lexicon {0}'.format(filename) pr = progressbar.ProgressBar(os.stat(filename).st_size, quiet) try: f = codecs.open(filename, encoding='utf_8') except IOError as err: print "Can't read {0}: {1}".format(filename, str(err)) sys.exit(1) pos = dict() heads = dict() for line in f: pr.advance(len(line.encode('utf-8'))) line = line.strip().lower().replace('"', '') try: token, code, lemma = line.split(',') except ValueError: continue lemma = tesslang.standardize(lang, lemma) lemma = pat.number.sub('', lemma) if len(code) == 10: if lemma in pos: pos[lemma].append(code[:2]) else: pos[lemma] = [code[:2]] heads[lemma] = 1 success = 0 for lemma in heads: if lemma in pos: success += 1 print 'pos success; {0}%'.format(100 * success / len(heads)) return(pos)
def bag_of_words(defs, stem_flag, quiet): '''convert dictionary definitions into bags of words''' # convert to bag of words, count words if not quiet: print "Converting defs to bags of words" count = {} pr = progressbar.ProgressBar(len(defs), quiet) empty_keys = set() for lemma in defs: pr.advance() defs[lemma] = [tesslang.standardize('any', w) for w in pat.clean['any'].split(defs[lemma]) if not w.isspace() and w != ''] if len(defs[lemma]) > 0: for d in defs[lemma]: if d in count: count[d] += 1 else: count[d] = 1 else: empty_keys.add(lemma) if not quiet: print "Removing hapax legomena" pr = progressbar.ProgressBar(len(defs), quiet) for lemma in defs: pr.advance() defs[lemma] = [w for w in defs[lemma] if count[w] > 1] if defs[lemma] == []: empty_keys.add(lemma) if not quiet: print 'Lost {0} empty definitions'.format(len(empty_keys)) for k in empty_keys: del defs[k] return(defs)
def parse_XML_dictionaries(langs, quiet):
    '''Create a dictionary of english translations for each lemma

    For each language in langs, reads <lang>.lexicon.xml line by line,
    extracts headword/definition pairs, then flattens entries with
    multiple definitions into a single '; '-joined string.  Lemmata
    with no usable definition are dropped.  Exits the program if a
    lexicon file cannot be read.
    '''
    defs = dict()
    # process latin, greek lexica in turn
    for lang in langs:
        filename = os.path.join(basedir, 'data', lang + '.lexicon.xml')
        if not quiet:
            print('Reading lexicon {0}'.format(filename))
        pr = ProgressBar(maxval = os.stat(filename).st_size)
        try:
            f = open(filename, "r", encoding="utf_8")
        except IOError as err:
            print("Can't read {0}: {1}".format(filename, str(err)))
            sys.exit(1)
        #
        # Each line in the lexicon is one entry.
        # Process one at a time to extract headword, definition.
        #
        # BUG FIX: the original leaked one handle per language;
        # 'with' guarantees the file is closed.
        with f:
            for line in f:
                pr.update(pr.currval + len(line.encode('utf-8')))
                # skip lines that don't conform with the expected entry structure
                m = pat.entry.search(line)
                if m is None:
                    continue
                lemma, entry = m.group(1, 2)
                # standardize the headword
                lemma = pat.clean[lang].sub('', lemma)
                lemma = pat.number.sub('', lemma)
                lemma = tesslang.standardize(lang, lemma)
                # remove elements on the stoplist
                for stop in pat.stop:
                    entry = stop.sub('', entry)
                # transliterate betacode to unicode chars in foreign tags
                entry = pat.foreign.sub(mo_beta2uni, entry)
                # extract strings marked as translations of the headword,
                # dropping whitespace-only matches.  findall() always
                # returns a list, never None, so the original's
                # 'if def_strings is None: continue' was dead code;
                # empty lists are pruned in the flatten pass below.
                def_strings = [d for d in pat.definition[lang].findall(entry)
                               if not d.isspace()]
                if lemma in defs and defs[lemma] is not None:
                    defs[lemma].extend(def_strings)
                else:
                    defs[lemma] = def_strings
        pr.finish()
    if not quiet:
        print('Read {0} entries'.format(len(defs)))
        print('Flattening entries with multiple definitions')
    pr = ProgressBar(maxval = len(defs))
    empty_keys = set()
    for lemma in defs:
        pr.update(pr.currval + 1)
        if defs[lemma] is None or defs[lemma] == []:
            empty_keys.add(lemma)
            continue
        defs[lemma] = '; '.join(defs[lemma])
    pr.finish()
    if not quiet:
        print('Lost {0} empty definitions'.format(len(empty_keys)))
    for k in empty_keys:
        del defs[k]
    return defs
def parse_XML_dictionaries(langs, quiet=False):
    '''Create a dictionary of english translations for each lemma'''
    defs = dict()
    # process latin, greek lexica in turn
    for lang in langs:
        filename = os.path.join(fs['data'], 'common', lang + '.lexicon.xml')
        if not quiet:
            print 'Reading lexicon {0}'.format(filename)
        # progress bar sized by bytes so pr.advance can track raw reads
        pr = progressbar.ProgressBar(os.stat(filename).st_size, quiet)
        try:
            f = codecs.open(filename, encoding='utf_8')
        except IOError as err:
            print "Can't read {0}: {1}".format(filename, str(err))
            sys.exit(1)
        #
        # Each line in the lexicon is one entry.
        # Process one at a time to extract headword, definition.
        #
        # NOTE(review): f is never closed -- one handle leaks per
        # language processed.
        for line in f:
            pr.advance(len(line.encode('utf-8')))
            # skip lines that don't conform with the expected entry structure
            m = pat.entry.search(line)
            if m is None:
                continue
            lemma, entry = m.group(1, 2)
            # standardize the headword
            lemma = pat.clean[lang].sub('', lemma)
            lemma = pat.number.sub('', lemma)
            lemma = tesslang.standardize(lang, lemma)
            # remove elements on the stoplist
            for stop in pat.stop:
                entry = stop.sub('', entry)
            # transliterate betacode to unicode chars
            # in foreign tags
            entry = pat.foreign.sub(mo_beta2uni, entry)
            # extract strings marked as translations of the headword
            def_strings = pat.definition[lang].findall(entry)
            # drop empty defs
            def_strings = [d for d in def_strings if not d.isspace()]
            # skip lemmata for which no translation can be extracted
            # NOTE(review): findall() returns a list, never None, so
            # this test can never fire; empty lists are pruned in the
            # flatten pass below instead.
            if def_strings is None:
                continue
            # merge with any definitions seen for this lemma earlier
            if lemma in defs and defs[lemma] is not None:
                defs[lemma].extend(def_strings)
            else:
                defs[lemma] = def_strings
    if not quiet:
        print 'Read {0} entries'.format(len(defs))
        print 'Flattening entries with multiple definitions'
    pr = progressbar.ProgressBar(len(defs), quiet)
    # lemmata whose definition list ended up empty; deleted at the end
    empty_keys = set()
    for lemma in defs:
        pr.advance()
        if defs[lemma] is None or defs[lemma] == []:
            empty_keys.add(lemma)
            continue
        # join multiple definitions into a single string
        defs[lemma] = '; '.join(defs[lemma])
    if not quiet:
        print 'Lost {0} empty definitions'.format(len(empty_keys))
    for k in empty_keys:
        del defs[k]
    # a headword that standardized down to nothing is meaningless
    if "" in defs:
        del defs[""]
    return (defs)