Code Example #1
def LoadData():
    t_extr = thai_extractor.ThaiExtractor()
    e_extr = extractor.NameExtractor()
    doclist = documents.Doclist()
    doc = documents.Doc()
    doclist.AddDoc(doc)
    #### Thai
    lang = tokens.Lang()
    lang.SetId('th')
    doc.AddLang(lang)
    t_extr.FileExtract(THAI_)
    lang.SetTokens(t_extr.Tokens())
    lang.CompactTokens()
    for t in lang.Tokens():
        pronouncer_ = pronouncer.UnitranPronouncer(t)
        pronouncer_.Pronounce()
    #### English
    lang = tokens.Lang()
    lang.SetId('en')
    doc.AddLang(lang)
    e_extr.FileExtract(ENGLISH_)
    lang.SetTokens(e_extr.Tokens())
    lang.CompactTokens()
    for t in lang.Tokens():
        pronouncer_ = pronouncer.EnglishPronouncer(t)
        pronouncer_.Pronounce()
    return doclist
Code Example #2
def LoadData():
    t_extr = chinese_extractor.ChineseExtractor()
    e_extr = extractor.NameExtractor()
    doclist = documents.Doclist()
    doc = documents.Doc()
    doclist.AddDoc(doc)
    #### Chinese
    lang = tokens.Lang()
    lang.SetId('zh')
    doc.AddLang(lang)
    t_extr.FileExtract(CHINESE_)
    lang.SetTokens(t_extr.Tokens())
    lang.CompactTokens()
    for t in lang.Tokens():
        pronouncer_ = pronouncer.HanziPronouncer(t)
        pronouncer_.Pronounce()
    #### English
    lang = tokens.Lang()
    lang.SetId('en')
    doc.AddLang(lang)
    e_extr.FileExtract(ENGLISH_)
    lang.SetTokens(e_extr.Tokens())
    lang.CompactTokens()
    for t in lang.Tokens():
        pronouncer_ = pronouncer.EnglishPronouncer(t)
        pronouncer_.Pronounce()
    return doclist
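
Examples #1 and #2 differ only in the extractor, the pronouncer class, the language id, and the input path. A hypothetical refactoring into one parameterized helper (load_language is not in the project; the flat import paths are an assumption about the repo layout, while the module and constant names are taken from the snippets above):

import documents
import extractor
import pronouncer
import thai_extractor
import tokens

def load_language(doc, lang_id, extr, path, pronouncer_cls):
    # Build one Lang: extract tokens from `path`, merge duplicates,
    # then attach a pronunciation to every surviving token.
    lang = tokens.Lang()
    lang.SetId(lang_id)
    doc.AddLang(lang)
    extr.FileExtract(path)
    lang.SetTokens(extr.Tokens())
    lang.CompactTokens()
    for t in lang.Tokens():
        pronouncer_cls(t).Pronounce()

def LoadData():
    doclist = documents.Doclist()
    doc = documents.Doc()
    doclist.AddDoc(doc)
    load_language(doc, 'th', thai_extractor.ThaiExtractor(), THAI_,
                  pronouncer.UnitranPronouncer)
    load_language(doc, 'en', extractor.NameExtractor(), ENGLISH_,
                  pronouncer.EnglishPronouncer)
    return doclist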
Code Example #3
File: xmlhandler.py Project: sushengyang/NLP-project
def startElement(self, name, attributes):
    # SAX callback: reset parser state at each opening tag and pull the
    # optional attributes off <lang> and <token> elements.
    if name == 'doclist':
        self.in_token_ = False
        self.doclist_ = documents.Doclist()
    elif name == 'doc':
        self.in_token_ = False
        self.doc_ = documents.Doc()
    elif name == 'lang':
        self.in_token_ = False
        self.lang_ = tokens.Lang()
        try:
            self.lang_.SetId(attributes['id'])
        except KeyError:
            pass
    elif name == 'token':
        self.token_string_ = ''
        self.in_token_ = True
        try:
            self.count_ = int(attributes['count'])
        except KeyError:
            self.count_ = 1
        try:
            self.morphs_ = attributes['morphs']
        except KeyError:
            self.morphs_ = ''
        try:
            self.prons_ = attributes['prons']
        except KeyError:
            self.prons_ = ''
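
For context, a minimal sketch of driving this handler with the standard library's SAX parser. It assumes the class in xmlhandler.py subclasses xml.sax.handler.ContentHandler and also defines the characters() and endElement() callbacks (not shown above); the class name XmlHandler and the flat import are guesses.

import xml.sax
import xmlhandler  # assumed flat module layout

def parse_doclist(path):
    handler = xmlhandler.XmlHandler()  # hypothetical class name
    xml.sax.parse(path, handler)
    return handler.doclist_  # set in startElement('doclist', ...)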
Code Example #4
def LoadData():
  mp = open(CHINESE_)
  ep = open(ENGLISH_)
  cp = open(CONFIDENCE_)
  doclist = documents.Doclist()
  while True:
    eline = ep.readline()
    mline = mp.readline()
    cline = cp.readline()
    if not cline: break
    if float(cline.strip()) < MINCONFIDENCE_: continue
    doc = documents.Doc()
    ### Chinese
    extractor_ = chinese_extractor.ChineseExtractor()
    extractor_.InitData()
    extractor_.LineSegment(mline)
    lang = tokens.Lang()
    lang.SetId('zho')
    for t in extractor_.Tokens():
      lang.AddToken(t)
    lang.CompactTokens() ## Combine duplicates
    for t in lang.Tokens():
      pronouncer_ = pronouncer.HanziPronouncer(t)
      pronouncer_.Pronounce()
    doc.AddLang(lang)
    ### English
    extractor_ = extractor.NameExtractor()
    extractor_.InitData()
    extractor_.LineSegment(eline)
    lang = tokens.Lang()
    lang.SetId('eng')
    for t in extractor_.Tokens():
      lang.AddToken(t)
    lang.CompactTokens() ## Combine duplicates
    for t in lang.Tokens():
      pronouncer_ = pronouncer.EnglishPronouncer(t)
      pronouncer_.Pronounce()
      if not t.Pronunciations():
        pronouncer_ = pronouncer.LatinPronouncer(t)
        pronouncer_.Pronounce()
    doc.AddLang(lang)
    doclist.AddDoc(doc)
  mp.close()
  ep.close()
  cp.close()
  return doclist
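
A hypothetical tightening of the three-way read loop above with zip(). One behavioral nuance: zip() stops at the shortest of the three files, whereas the original loop stops only when the confidence file is exhausted. The constant names come from the snippet.

with open(ENGLISH_) as ep, open(CHINESE_) as mp, open(CONFIDENCE_) as cp:
    for eline, mline, cline in zip(ep, mp, cp):
        if float(cline.strip()) < MINCONFIDENCE_:
            continue  # skip low-confidence sentence pairs
        # ... build a Doc from (eline, mline) exactly as in LoadData() ...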
Code Example #5
File: miner.py Project: sushengyang/NLP-project
def LoadData(filelist,
             base='.',
             extractor_=extractor.NameExtractor,
             xdump=None,
             mincnt=DEF_MINCNT_):
    lastgroup = -1
    lastlanguage = ''
    doc = None
    lang = None
    doclist = documents.Doclist()
    xtractr = extractor_()
    sys.stderr.write('Extracting terms...\n')
    fp = open(filelist)
    for line in fp:
        toks = line.split()
        group = int(toks[0])
        language = toks[1]
        files = toks[2:]
        if group != lastgroup:
            if lastgroup > 0:
                assert group == lastgroup + 1,\
                    'Failed sanity check: group %d != group %d + 1' % (group, lastgroup)
            doc = documents.Doc()
            doclist.AddDoc(doc)
        if language != lastlanguage:
            if lang:
                lang.CompactTokens()
            lang = tokens.Lang()
            lang.SetId(language)
            doc.AddLang(lang)
        for path in files:
            path = base + '/' + path
            xtractr.InitData()
            xtractr.FileExtract(path)
            for t in xtractr.Tokens():
                lang.AddToken(t)
        lastgroup = group
        lastlanguage = language
    fp.close()
    if lang:
        # Inside the loop CompactTokens() only runs when the language id
        # changes, so the final language block must be compacted here.
        lang.CompactTokens()
    if mincnt > 0:
        sys.stderr.write(
            'Filtering to remove terms less frequent than %d...\n' % mincnt)
        filter_ = filter.FrequencyFilter(doclist)
        filter_.SetMinCount(mincnt)
        filter_.Filter()
    if xdump:
        sys.stderr.write('Dumping doclist to %s...\n' % xdump)
        doclist.XmlDump(xdump, utf8=True)
    return doclist
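
Hypothetical invocation. The parsing loop implies a whitespace-delimited filelist where each line reads "<group> <language> <file> [<file> ...]" and group numbers increase consecutively (the assert enforces this):

# filelist.txt might look like:
#   1 eng news/e01.txt news/e02.txt
#   1 zho news/c01.txt
#   2 eng news/e03.txt
doclist = LoadData('filelist.txt', base='corpus',
                   xdump='doclist.xml', mincnt=2)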
Code Example #6
def AddTokens(self, toks, langid):
    # Merge `toks` into an existing Lang with a matching id;
    # otherwise create a new Lang for `langid`.
    success = False
    for lang in self.langs_:
        if lang.Id() == langid:
            for t in toks:
                lang.AddToken(t)
            lang.CompactTokens()
            success = True
    if not success:
        lang = tokens.Lang()
        lang.SetId(langid)
        lang.SetTokens(toks)
        lang.CompactTokens()
        self.AddLang(lang)
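
A minimal usage sketch: because AddTokens() merges into an existing Lang with a matching id and only creates a new one otherwise, repeated calls with the same langid accumulate into a single Lang. Token construction follows the tokens.Token usage in Example #7.

doc = documents.Doc()
tok = tokens.Token('Bush')
tok.SetCount(2)
doc.AddTokens([tok], 'eng')  # no 'eng' Lang yet: creates one
doc.AddTokens([tok], 'eng')  # merges into the existing 'eng' Lang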
Code Example #7
def CreateDoclist():
    doclist = documents.Doclist()
    doc = documents.Doc()
    lang = tokens.Lang()
    lang.SetId('eng')
    token_ = tokens.Token('Bush')
    token_.SetCount(1)
    token_.AddPronunciation('b U S')
    token_.SetMorphs(['Bush', "'s"])
    lang.AddToken(token_)
    token_ = tokens.Token('Clinton')
    token_.SetCount(3)
    token_.AddPronunciation('k l I n t & n')
    token_.AddPronunciation('k l I n t > n')
    token_.SetMorphs(['Clinton'])
    lang.AddToken(token_)
    token_ = tokens.Token('Bush')
    token_.SetCount(3)
    token_.AddPronunciation('b U S')
    token_.SetMorphs([
        'Bush',
        "'s",
    ])
    lang.AddToken(token_)
    lang.CompactTokens()
    doc.AddLang(lang)
    lang = tokens.Lang()
    lang.SetId('zho')
    token_ = tokens.Token('克林頓')
    token_.SetCount(3)
    token_.AddPronunciation('kh & l i n t u n')
    token_.SetMorphs(['克林頓'])
    lang.AddToken(token_)
    token_ = tokens.Token('高島屋')
    token_.SetCount(1)
    token_.AddPronunciation('k a u t a u u')
    token_.AddPronunciation('t A k A s i m A j a')
    lang.AddToken(token_)
    doc.AddLang(lang)
    doclist.AddDoc(doc)
    doc = documents.Doc()
    lang = tokens.Lang()
    lang.SetId('eng')
    token_ = tokens.Token('Clinton')
    token_.SetCount(2)
    token_.AddPronunciation('k l I n t & n')
    token_.SetMorphs(['Clinton'])
    lang.AddToken(token_)
    token_ = tokens.Token('Bush')
    token_.SetCount(3)
    token_.AddPronunciation('b U S')
    token_.SetMorphs(['Bush', "'s"])
    lang.AddToken(token_)
    doc.AddLang(lang)
    lang = tokens.Lang()
    lang.SetId('ara')
    token_ = tokens.Token('كلينتون')
    token_.SetCount(3)
    token_.AddPronunciation('k l j n t w n')
    token_.SetMorphs(['كلينتون'])
    lang.AddToken(token_)
    doc.AddLang(lang)
    doclist.AddDoc(doc)
    return doclist
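
Presumably CompactTokens() is what collapses the two 'Bush' entries (counts 1 and 3) in the first document's English Lang into a single token with count 4. Under that assumption, the fixture could be sanity-checked roughly as below; the Docs()/Langs()/String()/Count() accessors are guesses in the project's naming style, not confirmed API.

doclist = CreateDoclist()
eng = doclist.Docs()[0].Langs()[0]  # hypothetical accessors
for t in eng.Tokens():
    print(t.String(), t.Count())  # expect a single 'Bush' with count 4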