def startElement(self, name, attributes):
  """SAX callback: handles the opening tag of each doclist XML element."""
  if name == 'doclist':
    self.in_token_ = False
    self.doclist_ = documents.Doclist()
  elif name == 'doc':
    self.in_token_ = False
    self.doc_ = documents.Doc()
  elif name == 'lang':
    self.in_token_ = False
    self.lang_ = tokens.Lang()
    try:
      self.lang_.SetId(attributes['id'])
    except KeyError:
      pass
  elif name == 'token':
    # Start accumulating character data for this token; missing
    # attributes fall back to sensible defaults.
    self.token_string_ = ''
    self.in_token_ = True
    try:
      self.count_ = int(attributes['count'])
    except KeyError:
      self.count_ = 1
    try:
      self.morphs_ = attributes['morphs']
    except KeyError:
      self.morphs_ = ''
    try:
      self.prons_ = attributes['prons']
    except KeyError:
      self.prons_ = ''
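# A minimal usage sketch for the handler above, assuming it belongs to an
# xml.sax.ContentHandler subclass (called DoclistHandler here; the name is
# hypothetical) that also implements characters() and endElement().
import xml.sax

def LoadDoclistXml(path):
  handler = DoclistHandler()  # hypothetical handler class from above
  parser = xml.sax.make_parser()
  parser.setContentHandler(handler)
  parser.parse(path)
  return handler.doclist_     # populated when <doclist> is opened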
def LoadData():
  t_extr = thai_extractor.ThaiExtractor()
  e_extr = extractor.NameExtractor()
  doclist = documents.Doclist()
  doc = documents.Doc()
  doclist.AddDoc(doc)
  #### Thai
  lang = tokens.Lang()
  lang.SetId('th')
  doc.AddLang(lang)
  t_extr.FileExtract(THAI_)
  lang.SetTokens(t_extr.Tokens())
  lang.CompactTokens()
  for t in lang.Tokens():
    pronouncer_ = pronouncer.UnitranPronouncer(t)
    pronouncer_.Pronounce()
  #### English
  lang = tokens.Lang()
  lang.SetId('en')
  doc.AddLang(lang)
  e_extr.FileExtract(ENGLISH_)
  lang.SetTokens(e_extr.Tokens())
  lang.CompactTokens()
  for t in lang.Tokens():
    pronouncer_ = pronouncer.EnglishPronouncer(t)
    pronouncer_.Pronounce()
  return doclist
def LoadData():
  t_extr = chinese_extractor.ChineseExtractor()
  e_extr = extractor.NameExtractor()
  doclist = documents.Doclist()
  doc = documents.Doc()
  doclist.AddDoc(doc)
  #### Chinese
  lang = tokens.Lang()
  lang.SetId('zh')
  doc.AddLang(lang)
  t_extr.FileExtract(CHINESE_)
  lang.SetTokens(t_extr.Tokens())
  lang.CompactTokens()
  for t in lang.Tokens():
    pronouncer_ = pronouncer.HanziPronouncer(t)
    pronouncer_.Pronounce()
  #### English
  lang = tokens.Lang()
  lang.SetId('en')
  doc.AddLang(lang)
  e_extr.FileExtract(ENGLISH_)
  lang.SetTokens(e_extr.Tokens())
  lang.CompactTokens()
  for t in lang.Tokens():
    pronouncer_ = pronouncer.EnglishPronouncer(t)
    pronouncer_.Pronounce()
  return doclist
def __init__(self, doclist=None):
  if doclist is None:
    self.doclist_ = documents.Doclist()
  else:
    self.doclist_ = doclist
  self.n_ = len(self.doclist_.Docs())
  self.tokstats_ = {}
def LoadData(filelist, base='.', extractor_=extractor.NameExtractor,
             xdump=None, mincnt=DEF_MINCNT_):
  lastgroup = -1
  lastlanguage = ''
  doc = None
  lang = None
  doclist = documents.Doclist()
  xtractr = extractor_()
  sys.stderr.write('Extracting terms...\n')
  fp = open(filelist)
  for line in fp:
    toks = line.split()
    group = int(toks[0])
    language = toks[1]
    files = toks[2:]
    if group != lastgroup:
      if lastgroup > 0:
        assert group == lastgroup + 1, \
            'Failed sanity check: group %d != group %d + 1' % (group, lastgroup)
      doc = documents.Doc()
      doclist.AddDoc(doc)
    if language != lastlanguage:
      if lang:
        lang.CompactTokens()
      lang = tokens.Lang()
      lang.SetId(language)
      doc.AddLang(lang)
    for file_ in files:  # avoid shadowing the 'file' builtin
      path = base + '/' + file_
      xtractr.InitData()
      xtractr.FileExtract(path)
      for t in xtractr.Tokens():
        lang.AddToken(t)
    lastgroup = group
    lastlanguage = language
  fp.close()
  if lang:
    lang.CompactTokens()  # compact the final language block too
  if mincnt > 0:
    sys.stderr.write(
        'Filtering to remove terms less frequent than %d...\n' % mincnt)
    filter_ = filter.FrequencyFilter(doclist)
    filter_.SetMinCount(mincnt)
    filter_.Filter()
  if xdump:
    sys.stderr.write('Dumping doclist to %s...\n' % xdump)
    doclist.XmlDump(xdump, utf8=True)
  return doclist
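# An illustrative call to the loader above, assuming a file list whose lines
# have the form "<group> <language> <file> [<file> ...]", with group numbers
# increasing by one as the sanity check enforces. All names and paths here
# are hypothetical:
#
#   1 eng doc1.en.txt doc1b.en.txt
#   1 zho doc1.zh.txt
#   2 eng doc2.en.txt
#
# doclist = LoadData('filelist.txt', base='data', mincnt=2,
#                    xdump='doclist.xml')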
def LoadData():
  # The three inputs are line-aligned: Chinese text, its English
  # translation, and a per-line alignment confidence score.
  mp = open(CHINESE_)
  ep = open(ENGLISH_)
  cp = open(CONFIDENCE_)
  doclist = documents.Doclist()
  while True:
    eline = ep.readline()
    mline = mp.readline()
    cline = cp.readline()
    if not cline:
      break
    # Skip sentence pairs whose alignment confidence is too low.
    if float(cline.strip()) < MINCONFIDENCE_:
      continue
    doc = documents.Doc()
    ### Chinese
    extractor_ = chinese_extractor.ChineseExtractor()
    extractor_.InitData()
    extractor_.LineSegment(mline)
    lang = tokens.Lang()
    lang.SetId('zho')
    for t in extractor_.Tokens():
      lang.AddToken(t)
    lang.CompactTokens()  ## Combine duplicates
    for t in lang.Tokens():
      pronouncer_ = pronouncer.HanziPronouncer(t)
      pronouncer_.Pronounce()
    doc.AddLang(lang)
    ### English
    extractor_ = extractor.NameExtractor()
    extractor_.InitData()
    extractor_.LineSegment(eline)
    lang = tokens.Lang()
    lang.SetId('eng')
    for t in extractor_.Tokens():
      lang.AddToken(t)
    lang.CompactTokens()  ## Combine duplicates
    for t in lang.Tokens():
      pronouncer_ = pronouncer.EnglishPronouncer(t)
      pronouncer_.Pronounce()
      # Fall back to the Latin pronouncer if the English pronouncer
      # produced nothing.
      if not t.Pronunciations():
        pronouncer_ = pronouncer.LatinPronouncer(t)
        pronouncer_.Pronounce()
    doc.AddLang(lang)
    doclist.AddDoc(doc)
  mp.close()
  ep.close()
  cp.close()
  return doclist
def CreateDoclist():
  doclist = documents.Doclist()
  # First document: English and Chinese.
  doc = documents.Doc()
  lang = tokens.Lang()
  lang.SetId('eng')
  token_ = tokens.Token('Bush')
  token_.SetCount(1)
  token_.AddPronunciation('b U S')
  token_.SetMorphs(['Bush', "'s"])
  lang.AddToken(token_)
  token_ = tokens.Token('Clinton')
  token_.SetCount(3)
  token_.AddPronunciation('k l I n t & n')
  token_.AddPronunciation('k l I n t > n')
  token_.SetMorphs(['Clinton'])
  lang.AddToken(token_)
  token_ = tokens.Token('Bush')
  token_.SetCount(3)
  token_.AddPronunciation('b U S')
  token_.SetMorphs(['Bush', "'s"])
  lang.AddToken(token_)
  lang.CompactTokens()  # merge the two 'Bush' entries
  doc.AddLang(lang)
  lang = tokens.Lang()
  lang.SetId('zho')
  token_ = tokens.Token('克林頓')
  token_.SetCount(3)
  token_.AddPronunciation('kh & l i n t u n')
  token_.SetMorphs(['克林頓'])
  lang.AddToken(token_)
  token_ = tokens.Token('高島屋')
  token_.SetCount(1)
  token_.AddPronunciation('k a u t a u u')
  token_.AddPronunciation('t A k A s i m A j a')
  lang.AddToken(token_)
  doc.AddLang(lang)
  doclist.AddDoc(doc)
  # Second document: English and Arabic.
  doc = documents.Doc()
  lang = tokens.Lang()
  lang.SetId('eng')
  token_ = tokens.Token('Clinton')
  token_.SetCount(2)
  token_.AddPronunciation('k l I n t & n')
  token_.SetMorphs(['Clinton'])
  lang.AddToken(token_)
  token_ = tokens.Token('Bush')
  token_.SetCount(3)
  token_.AddPronunciation('b U S')
  token_.SetMorphs(['Bush', "'s"])
  lang.AddToken(token_)
  doc.AddLang(lang)
  lang = tokens.Lang()
  lang.SetId('ara')
  token_ = tokens.Token('كلينتون')
  token_.SetCount(3)
  token_.AddPronunciation('k l j n t w n')
  token_.SetMorphs(['كلينتون'])
  lang.AddToken(token_)
  doc.AddLang(lang)
  doclist.AddDoc(doc)
  return doclist
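# A quick sanity check of the fixture above, reusing the XmlDump() call seen
# in LoadData() earlier; the output path is illustrative only.
if __name__ == '__main__':
  doclist = CreateDoclist()
  doclist.XmlDump('test_doclist.xml', utf8=True)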