text = str(text).replace('\x00 ','').replace('\xef\xbf\xbd','')
          text = str(text).replace('\xf7','').replace('\xc3\xba','').replace('\xb6','').replace('\xa9','').replace('\xe2\x99\xaa','')
          text = str(text).replace('\xc3\xaf','').replace('\x5c','').replace('\xf1','').replace('\xe1','').replace('\xe7','').replace('\xfa','')
          text = str(text).replace('\xf3','').replace('\xed','').replace('\xe9','').replace('\xe0','').replace('\xae','').replace('\xc2','')
          text = str(text).replace('\xc3','').replace('\xa2','').replace('\xbf','')
          if text.isupper(): text = text.lower()
#         print text
      except IndexError:
          print line
          continue

# G. Remove clearly wrong bytes from the raw line before echoing it.
#    Two byte sequences are stripped (Python 2 byte-string literals):
#      '\x00 '          -- a NUL byte followed by one space
#      '\xef\xbf\xbd'   -- the UTF-8 encoding of U+FFFD REPLACEMENT CHARACTER
#    NOTE(review): the original comment called the second one a BOM, but the
#    UTF-8 BOM is EF BB BF; EF BF BD is the replacement character.
      line = str(line).replace('\x00 ','').replace('\xef\xbf\xbd','')
      # Trailing comma: Python 2 print does not append its own newline here
      # (presumably `line` still carries its line ending -- TODO confirm).
      print line,

# H. Parts of speech with MBSP -- resplit the text if needed
#    `text` and `field` are assigned before this point, outside this excerpt.
      try:
         # Run the MBSP chunker on the cleaned text, passing tokenize=True and
         # lemmata=True.
         pos = MBSP.chunk(text, tokenize=True, lemmata=True)
         # One output record per line of the chunker result. The loop variable
         # reuses the name `pos`; this is safe because splitlines() is
         # evaluated once, before the name is rebound.
         for pos in pos.splitlines():
             # Turn the space-separated result into pipe-separated fields.
             pos = str(pos).replace(' ','|')
             # Record layout: field[0]|field[1]|POS_01|<pipe-separated result>
             print "".join([field[0],"|",field[1],"|POS_01","|",pos])
      except (UnicodeDecodeError, UnicodeEncodeError, IndexError, AssertionError):
         # Tag failed UTF-8 lines NA to enable repair
         # Same record layout as above, with the literal NA as the payload.
         print "".join([field[0],"|",field[1],"|POS_01","|NA"])
         # Move on to the next iteration of the enclosing loop (its header is
         # above this excerpt).
         continue

# I. Close the file
#    `fp` is opened earlier in the script, outside this excerpt.
fp.close()

# EOF
Exemple #2
0
 def _chunk_MBSP(self, txt):
     """Run ``MBSP.chunk`` on ``txt`` and return the result as ``unicode``."""
     # Coerce whatever MBSP.chunk hands back into a plain unicode object.
     return unicode(MBSP.chunk(txt))