def hsk2voc(file):
    voc = pie.Vocabulary(language="Mandarin")
    for line in file:
        pieces = line.split(",")
        word = pie.Word()
        # This is for "HSK Level,Word,Pronunciation,Definition," format
        try:
            level = int(pieces[0])
        except ValueError:
            continue
        word.mainform = pieces[1].decode("utf-8", "ignore")
        word.rom = pieces[2].decode("utf-8", "ignore")
        word.glossed = pieces[3].decode("utf-8", "ignore")
        glosses = word.glossed.split(";")
        glosses = [x.strip() for x in glosses]
        word.glosses = glosses
        if len(glosses) > 1 and glosses[0].startswith("("):
            word.gloss = glosses[1]
        else:
            word.gloss = glosses[0]
        word.tally = 1000 / level
        word.phonetic = rom2bopo(word.rom)
        word.language = "Mandarin"
        voc.add(word)
    return voc
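# Hedged usage sketch (illustrative; "hsk_list.csv" is a hypothetical file name,
# and pie/rom2bopo are assumed to come from this module): hsk2voc() takes an open
# file of comma-separated "HSK Level,Word,Pronunciation,Definition," lines, skips
# rows whose first field is not an integer level (e.g. a header), and weights
# easier words more heavily via tally = 1000 / level.
#
#     mandarin_voc = hsk2voc(open("hsk_list.csv"))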
def __init__(self, text, vocab):
    self.rawwords = False
    self.words = False
    self.text = text
    self.vocab = vocab
    if self.vocab is False:
        self.vocab = pie.Vocabulary()
    if text:
        print "Processing..."
        self.process(text)
def list2voc(path="C:\\Code\\koreanvocab2.txt"):
    import pie
    vocab = pie.Vocabulary(filter=False, language="Korean")
    text = open(path).read()
    text = text.decode("utf-8", "ignore")
    lines = text.split("\n")
    lines = [tuple(x.split("\t")) for x in lines if "\t" in x]
    for line in lines:
        rank = line[0]
        print rank.encode('utf-8', 'ignore')
        if rank:
            try:
                tally = 1000000 / int(rank)
            except (ValueError, ZeroDivisionError):
                tally = 0
        else:
            tally = 0
        word = line[1]
        newword = pie.Word(text=word)
        newword.tally = tally
        vocab.allwords.add(newword)
    return vocab
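# Hedged usage sketch (illustrative): list2voc() reads a UTF-8 file of
# tab-separated "rank<TAB>word" lines and gives more frequent (lower-rank)
# words a larger tally (1000000 / rank); lines without a tab are ignored.
#
#     korean_voc = list2voc("C:\\Code\\koreanvocab2.txt")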
def process_aggtext(aggtext, stems, wordlist):
    pieces = [
        tuple([x.strip() for x in y.split("</url>")])
        for y in aggtext.split("<url>")[1:]
    ]
    for p in pieces:
        url = p[0]
        text = p[1]
        dummyvoc = pie.Vocabulary()
        sentences = get_sentences(text)
        sentences = [x for x in sentences if len(x.split(" ")) < 10]  # eliminate long ones
        sentences = [Parser(text=x, vocab=dummyvoc) for x in sentences]
        for s in sentences:
            sentems = [x[1] for x in s.stems]
            matches = set(sentems).intersection(set(stems.keys()))
            if not matches:
                continue
            for m in matches:
                stems[m].addsample(s.text, wordlist, href=url.encode("utf-8", "ignore"))
    return stems
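# Hedged usage sketch (illustrative; the stub class below is a stand-in, not part
# of the original module): process_aggtext() expects `aggtext` to be a string of
# <url>...</url> blocks, each URL followed by the page text, and `stems` to map
# stem strings to objects exposing addsample(text, wordlist, href=...). It also
# relies on get_sentences() and Parser() from this module.
#
#     class StubStem(object):
#         def __init__(self):
#             self.samples = []
#         def addsample(self, text, wordlist, href=""):
#             self.samples.append((text, href))
#
#     stems = {"h.a.#---d.a.#": StubStem()}
#     aggtext = "<url>http://example.com</url> some scraped page text ..."
#     stems = process_aggtext(aggtext, stems, wordlist=[])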
def process_word(word, vocab=False, noun=[], firstpass=True, restrict=False, thisform=""):
    # romanized word
    # Forms appear to be romanized syllable by syllable and joined with "---",
    # with "#" standing in for an empty onset or coda (so "#---g.a.#" is the
    # subject particle, "---d.a.#" the dictionary-form ending, and so on).
    if not thisform:
        thisform = word
    stemmed = thisform
    verbs = dict(verbage)
    notdone = False
    afterparticles = ""
    nounish = False
    if vocab is False:
        vocab = pie.Vocabulary()
    if len(word.split("---")) > 1:  # syllable count
        # Strip particles and other suffixes from multi-syllable forms.
        if thisform.endswith("#---r.eu.l") and firstpass:  # direct object
            if thisform[:-2] not in verbs.keys():  # ...
                stemmed = thisform[:-9]
            else:
                stemmed = thisform[:-1] + "#---d.a.#"
        elif thisform.endswith("---#.eu.l") and firstpass:
            if not thisform.endswith("#---#.eu.l"):
                stemmed = word[:-9]
        elif thisform.endswith("#---g.a.#"):  # subject
            if thisform.endswith("---d.a.#---g.a.#"):
                stemmed = word[:-16] + "---d.a.#"
            else:
                stemmed = word[:-8]
        elif thisform.endswith("---#.i.#"):
            if not thisform.endswith("#---#.i.#"):
                stemmed = thisform[:-8]
        elif thisform.endswith("#---n.eu.n") and firstpass:  # topic
            if thisform[:-9] not in verbs.keys() and thisform[:-10] + "l" not in verbs.keys() and firstpass:
                stemmed = thisform[:-9]
            elif thisform[:-9] in verbs.keys():
                stemmed = thisform[:-9]
            elif thisform[:-10] + "l" in verbs.keys():
                stemmed = thisform[:-10] + "l"
        elif thisform.endswith("---#.eu.n") and firstpass:
            if not thisform.endswith("#---#.eu.n"):
                stemmed = thisform[:-9]
                notdone = True
        elif thisform.endswith("---r.o.#"):
            stemmed = word[:-8]
            if thisform.endswith("---#.eu.#---r.o.#"):
                if not thisform.endswith("#---#.eu.#---r.o.#"):
                    stemmed = thisform[:-17]
        elif thisform.endswith("#---r.o.#---s.eo.#"):
            stemmed = thisform[:-17]
            if thisform.endswith("---#.eu.#---r.o.#---s.eo.#"):
                if not thisform.endswith("#---#.eu.#---r.o.#---s.eo.#"):
                    stemmed = thisform[:-26]
        elif thisform.endswith("#---r.o.#---ss.eo.#"):
            stemmed = thisform[:-18]
            if thisform.endswith("---#.eu.#---r.o.#---ss.eo.#"):
                if not thisform.endswith("#---#.eu.#---r.o.#---ss.eo.#"):
                    stemmed = thisform[:-27]
        elif thisform.endswith("---#.e.#"):
            stemmed = thisform[:-8]
        elif thisform.endswith("---#.e.#---g.e.#"):
            stemmed = thisform[:-16]
        elif thisform.endswith("---#.e.#---g.e.#---s.eo.#"):
            stemmed = thisform[:-25]
        elif thisform.endswith("---kk.e.#"):
            stemmed = thisform[:-9]
        elif thisform.endswith("---#.e.#---s.eo.#"):
            stemmed = thisform[:-17]
        elif thisform.endswith("---kk.e.#---s.eo.#"):
            stemmed = thisform[:-18]
        elif thisform.endswith("---kk.a.#---j.i.#"):
            stemmed = thisform[:-17]
        elif thisform.endswith("---b.u.#---t.eo.#"):
            stemmed = thisform[:-17]
        elif thisform.endswith("---#.ui.#") and firstpass:  # correct for stuff like ..
            stemmed = thisform[:-9]
        elif thisform.endswith("---d.eu.l"):
            stemmed = thisform[:-9]
        elif thisform.endswith("---#.i.n"):
            stemmed = thisform[:-8]
        elif thisform.endswith("#---#.wa.#"):
            stemmed = thisform[:-9]
        elif thisform.endswith("---g.wa.#") and not thisform.endswith("#---g.wa.#"):
            stemmed = word[:-9]
        elif thisform.endswith("---d.o.#"):
            stemmed = thisform[:-8]
        elif thisform.endswith("---ch.eo.#---r.eo.m"):
            stemmed = thisform[:-19]
        elif thisform.endswith("---m.a.n"):
            stemmed = thisform[:-8]
        elif thisform.endswith("---m.a.l---#.ya.#"):
            stemmed = thisform[:-17]
        if stemmed.endswith("---d.eu.l"):
            if stemmed[:-1] + "#" not in verbs.keys():
                stemmed = stemmed[:-9]
        afterparticles = stemmed
        if afterparticles != thisform:
            nounish = True
    if stemmed not in vocab.quickfind.keys():
        if stemmed.endswith("d.oe.#"):  # cleanup
            stemmed = stemmed[:-9]
        elif not nounish:
            if stemmed:
                stemmed = suggestoverb(stemmed, verbs, vocab)
            else:
                stemmed = suggestoverb(thisform, verbs, vocab)
        else:
            thelist = ["---g.i.#", "m"]
            stemmed = suggestoverb(stemmed, verbs, vocab, thelist)
    if stemmed in verbs.keys():
        if stemmed != afterparticles or len(stemmed.split("---")) >= 2:
            stemmed = verbs[stemmed]
    elif stemmed.endswith("#.i.#---d.a.#") and len(stemmed) > 16:
        # assuming not a verb in its own right, attached copula
        stemmed = stemmed[:-16]
    return stemmed