Example #1
0
def hsk2voc(file):
    """Build a Mandarin ``pie.Vocabulary`` from HSK word-list lines.

    Each line is expected to look like
    ``HSK Level,Word,Pronunciation,Definition,``.  A line is skipped when its
    first field is not an integer (for instance a header row), when it has
    fewer than four comma-separated fields, or when its level is 0 (the tally
    is computed by dividing by the level).

    The fields are decoded with ``.decode("utf-8", "ignore")``, so the lines
    must be byte strings (Python 2 ``str``).

    Args:
        file: Iterable of lines, e.g. an open file object.

    Returns:
        The ``pie.Vocabulary`` holding one ``pie.Word`` per accepted line.
    """
    voc = pie.Vocabulary(language="Mandarin")
    for line in file:
        pieces = line.split(",")
        # This is for "HSK Level,Word,Pronunciation,Definition," format
        try:
            level = int(pieces[0])
        except ValueError:
            continue
        # Guard the indexing and the division below instead of crashing on a
        # truncated row or a zero level.
        if level == 0 or len(pieces) < 4:
            continue

        # Only build the Word once the row is known to be usable.
        word = pie.Word()
        word.mainform = pieces[1].decode("utf-8", "ignore")
        word.rom = pieces[2].decode("utf-8", "ignore")
        word.glossed = pieces[3].decode("utf-8", "ignore")

        # Individual glosses are separated by ";" inside the definition field.
        glosses = [x.strip() for x in word.glossed.split(";")]
        word.glosses = glosses
        # A leading parenthesised gloss is passed over in favour of the next
        # one when there is a next one.
        if len(glosses) > 1 and glosses[0].startswith("("):
            word.gloss = glosses[1]
        else:
            word.gloss = glosses[0]

        # Lower levels get a larger tally.
        word.tally = 1000 / level
        word.phonetic = rom2bopo(word.rom)
        word.language = "Mandarin"
        voc.add(word)
    return voc
Example #2
0
 def __init__(self, text, vocab):
     """Store the text and vocabulary, then process the text if it is non-empty.

     Args:
         text: Text to process.  When falsy, ``self.process`` is not called.
         vocab: Vocabulary to use, or the sentinel ``False`` to have a fresh
             ``pie.Vocabulary()`` created.
     """
     self.rawwords = False
     self.words = False
     self.text = text
     self.vocab = vocab
     # Identity check: only the literal ``False`` sentinel asks for a new
     # vocabulary.  ``== False`` would also match 0 and would invoke any
     # custom ``__eq__`` defined on the vocabulary object.
     if vocab is False:
         self.vocab = pie.Vocabulary()
     if text:
         # Parenthesised form prints the same text under the Python 2 print
         # statement and is also valid as a function call.
         print("Processing...")
         self.process(text)
Example #3
0
def list2voc(path="C:\\Code\\koreanvocab2.txt"):
    """Build a Korean ``pie.Vocabulary`` from a tab-separated rank/word file.

    The file is read whole and decoded as UTF-8 (undecodable bytes are
    dropped).  Only lines containing a tab are used: the first field is the
    rank and the second is the word.  Each word's tally is ``1000000 / rank``;
    a rank that is empty, non-numeric or zero yields a tally of 0.

    Each rank is also printed as it is processed.

    Args:
        path: Location of the word-list file.

    Returns:
        The ``pie.Vocabulary`` with one ``pie.Word`` added to ``allwords`` per
        usable line.
    """
    import pie
    vocab = pie.Vocabulary(filter=False, language="Korean")

    # Context manager so the handle is closed even if reading fails.
    with open(path) as handle:
        text = handle.read()
    text = text.decode("utf-8", "ignore")

    for row in text.split("\n"):
        if "\t" not in row:
            continue
        fields = row.split("\t")
        rank = fields[0]
        print(rank.encode('utf-8', 'ignore'))
        # An empty rank makes int() raise ValueError, so it lands in the same
        # fallback as a non-numeric rank; a rank of 0 raises
        # ZeroDivisionError.  Anything else is allowed to propagate.
        try:
            tally = 1000000 / int(rank)
        except (ValueError, ZeroDivisionError):
            tally = 0
        newword = pie.Word(text=fields[1])
        newword.tally = tally
        vocab.allwords.add(newword)
    return vocab
Example #4
0
def process_aggtext(aggtext, stems, wordlist):
    """Attach short sample sentences from aggregated text to matching stems.

    ``aggtext`` is cut into chunks at every ``<url>`` marker (anything before
    the first marker is ignored).  Within a chunk, the part before
    ``</url>`` is the address and the part after it is the body text.  The
    body is split with ``get_sentences``; sentences of fewer than ten
    space-separated tokens are parsed, and every parsed stem that is a key of
    ``stems`` receives the sentence through ``addsample``.

    A chunk with no ``</url>`` has no body text and is skipped.

    Args:
        aggtext: Text containing ``<url>...</url>body`` chunks.
        stems: Mapping from stem to an object providing
            ``addsample(text, wordlist, href=...)``.
        wordlist: Passed through unchanged to ``addsample``.

    Returns:
        The same ``stems`` mapping that was passed in.
    """
    for chunk in aggtext.split("<url>")[1:]:
        parts = [x.strip() for x in chunk.split("</url>")]
        if len(parts) < 2:
            # Missing closing tag: nothing to use as body text.
            continue
        url, text = parts[0], parts[1]

        # One throwaway vocabulary is shared by every sentence of the chunk.
        dummyvoc = pie.Vocabulary()
        short = [x for x in get_sentences(text)
                 if len(x.split(" ")) < 10]  # eliminate long ones
        parsed = [Parser(text=x, vocab=dummyvoc) for x in short]

        for s in parsed:
            # The second element of each entry in s.stems is the stem
            # itself.  Membership is tested directly against the mapping
            # rather than rebuilding a set of all its keys per sentence.
            matches = set(x[1] for x in s.stems if x[1] in stems)
            for m in matches:
                stems[m].addsample(s.text,
                                   wordlist,
                                   href=url.encode("utf-8", "ignore"))
    return stems
Example #5
0
def process_word(word,
                 vocab=False,
                 noun=[],
                 firstpass=True,
                 restrict=False,
                 thisform=""):  # romanized word
    """Reduce a romanized word to a stem by stripping a trailing suffix.

    Words are strings whose syllables are joined by ``"---"``.  For a word of
    more than one syllable, the first matching rule in an ordered list of
    suffix patterns removes that suffix (the order matters: only the first
    match applies).  If the result is not in ``vocab.quickfind``, the
    candidate is handed to ``suggestoverb``.  Finally, a result that is a key
    of the verb table is mapped through it, or a trailing
    ``"#.i.#---d.a.#"`` is cut off.

    NOTE(review): within a syllable, ``"#"`` appears to stand for an empty
    position -- confirm against the romanizer.

    Args:
        word: Romanized word; decides whether the suffix rules run at all.
        vocab: Vocabulary whose ``quickfind`` is consulted, or ``False`` to
            create a fresh ``pie.Vocabulary()``.
        noun: Not used by this function.  The default is left as-is so the
            signature stays unchanged; it is never mutated here.
        firstpass: When falsy, the rules marked ``and firstpass`` below are
            disabled.
        restrict: Not used by this function.
        thisform: Form to match the suffixes against; defaults to ``word``.

    Returns:
        The stem (or the verb-table entry for it).
    """
    if not thisform: thisform = word
    stemmed = thisform
    # Private copy of the module-level verb table; it is passed on to
    # suggestoverb below.
    verbs = dict(verbage)
    afterparticles = ""
    nounish = False
    if vocab is False:
        vocab = pie.Vocabulary()
    if len(word.split("---")) > 1:  # syllable count
        # NOTE(review): several rules slice ``word`` rather than
        # ``thisform``.  The two are the same string unless a caller passes
        # ``thisform`` explicitly -- confirm that is intended.
        if thisform.endswith("#---r.eu.l") and firstpass:  #direct object
            if thisform[:-2] not in verbs:
                stemmed = thisform[:-9]
            else:
                stemmed = thisform[:-1] + "#---d.a.#"
        elif thisform.endswith("---#.eu.l") and firstpass:
            if not thisform.endswith("#---#.eu.l"):
                stemmed = word[:-9]
        elif thisform.endswith("#---g.a.#"):  #subject
            if thisform.endswith("---d.a.#---g.a.#"):
                stemmed = word[:-16] + "---d.a.#"
            else:
                stemmed = word[:-8]
        elif thisform.endswith("---#.i.#"):
            if not thisform.endswith("#---#.i.#"):
                stemmed = thisform[:-8]
        elif thisform.endswith("#---n.eu.n") and firstpass:  #topic
            # Prefer the plain cut; use the variant ending in "l" only when
            # the plain cut is not a known verb but the variant is.
            if (thisform[:-9] not in verbs
                    and thisform[:-10] + "l" in verbs):
                stemmed = thisform[:-10] + "l"
            else:
                stemmed = thisform[:-9]
        elif thisform.endswith("---#.eu.n") and firstpass:
            if not thisform.endswith("#---#.eu.n"):
                stemmed = thisform[:-9]
        elif thisform.endswith("---r.o.#"):
            stemmed = word[:-8]
            if (thisform.endswith("---#.eu.#---r.o.#")
                    and not thisform.endswith("#---#.eu.#---r.o.#")):
                stemmed = thisform[:-17]
        elif thisform.endswith("#---r.o.#---s.eo.#"):
            stemmed = thisform[:-17]
            if (thisform.endswith("---#.eu.#---r.o.#---s.eo.#")
                    and not thisform.endswith("#---#.eu.#---r.o.#---s.eo.#")):
                stemmed = thisform[:-26]
        elif thisform.endswith("#---r.o.#---ss.eo.#"):
            stemmed = thisform[:-18]
            if (thisform.endswith("---#.eu.#---r.o.#---ss.eo.#")
                    and not thisform.endswith("#---#.eu.#---r.o.#---ss.eo.#")):
                stemmed = thisform[:-27]
        elif thisform.endswith("---#.e.#"):
            stemmed = thisform[:-8]
        elif thisform.endswith("---#.e.#---g.e.#"):
            stemmed = thisform[:-16]
        elif thisform.endswith("---#.e.#---g.e.#---s.eo.#"):
            stemmed = thisform[:-25]
        elif thisform.endswith("---kk.e.#"):
            stemmed = thisform[:-9]
        elif thisform.endswith("---#.e.#---s.eo.#"):
            stemmed = thisform[:-17]
        elif thisform.endswith("---kk.e.#---s.eo.#"):
            stemmed = thisform[:-18]
        elif thisform.endswith("---kk.a.#---j.i.#"):
            stemmed = thisform[:-17]
        elif thisform.endswith("---b.u.#---t.eo.#"):
            stemmed = thisform[:-17]
        elif thisform.endswith("---#.ui.#") and firstpass:
            stemmed = thisform[:-9]
        elif thisform.endswith("---d.eu.l"):
            stemmed = thisform[:-9]
        elif thisform.endswith("---#.i.n"):
            stemmed = thisform[:-8]
        elif thisform.endswith("#---#.wa.#"):
            stemmed = thisform[:-9]
        elif thisform.endswith(
                "---g.wa.#") and not thisform.endswith("#---g.wa.#"):
            stemmed = word[:-9]
        elif thisform.endswith("---d.o.#"):
            stemmed = thisform[:-8]
        elif thisform.endswith("---ch.eo.#---r.eo.m"):
            stemmed = thisform[:-19]
        elif thisform.endswith("---m.a.n"):
            stemmed = thisform[:-8]
        elif thisform.endswith("---m.a.l---#.ya.#"):
            stemmed = thisform[:-17]

        # Second pass: a "---d.eu.l" left over after the first cut is removed
        # unless swapping its final "l" for "#" gives a known verb.
        if stemmed.endswith("---d.eu.l"):
            if stemmed[:-1] + "#" not in verbs:
                stemmed = stemmed[:-9]

        # If any rule changed the form, treat the word as noun-like when
        # asking for suggestions below.
        afterparticles = stemmed
        if afterparticles != thisform:
            nounish = True

        if stemmed not in vocab.quickfind.keys():
            if stemmed.endswith("d.oe.#"):  #cleanup
                stemmed = stemmed[:-9]
            elif not nounish:
                # Fall back to the unmodified form if the stem is empty.
                stemmed = suggestoverb(stemmed or thisform, verbs, vocab)
            else:
                stemmed = suggestoverb(stemmed, verbs, vocab,
                                       ["---g.i.#", "m"])

    if stemmed in verbs:
        # Skip the verb-table mapping only for a single-syllable stem that is
        # exactly what the suffix rules produced.
        if stemmed != afterparticles or len(stemmed.split("---")) >= 2:
            stemmed = verbs[stemmed]
    elif stemmed.endswith("#.i.#---d.a.#") and len(
            stemmed
    ) > 16:  # assuming not a verb in its own right, attached copula
        stemmed = stemmed[:-16]
    return stemmed