text = str(text).replace('\x00 ','').replace('\xef\xbf\xbd','') text = str(text).replace('\xf7','').replace('\xc3\xba','').replace('\xb6','').replace('\xa9','').replace('\xe2\x99\xaa','') text = str(text).replace('\xc3\xaf','').replace('\x5c','').replace('\xf1','').replace('\xe1','').replace('\xe7','').replace('\xfa','') text = str(text).replace('\xf3','').replace('\xed','').replace('\xe9','').replace('\xe0','').replace('\xae','').replace('\xc2','') text = str(text).replace('\xc3','').replace('\xa2','').replace('\xbf','') if text.isupper(): text = text.lower() # print text except IndexError: print line continue # G. Remove clearly wrong unicode characters -- BOM, NULL (only utf8 hex works) line = str(line).replace('\x00 ','').replace('\xef\xbf\xbd','') print line, # H. Parts of speech with MBSP -- resplit the text if needed try: pos = MBSP.chunk(text, tokenize=True, lemmata=True) for pos in pos.splitlines(): pos = str(pos).replace(' ','|') print "".join([field[0],"|",field[1],"|POS_01","|",pos]) except (UnicodeDecodeError, UnicodeEncodeError, IndexError, AssertionError): # Tag failed UTF-8 lines NA to enable repair print "".join([field[0],"|",field[1],"|POS_01","|NA"]) continue # I. Close the file fp.close() # EOF
def _chunk_MBSP(self, txt):
    """Chunk ``txt`` with MBSP and return the result as a ``unicode`` object.

    ``txt`` is handed to ``MBSP.chunk`` with no extra keyword arguments,
    so whatever defaults MBSP applies are used. The value that comes back
    is passed through ``unicode()`` before being returned.

    NOTE(review): ``unicode()`` on a byte string uses the default codec --
    presumably ``MBSP.chunk`` already returns a unicode-compatible object;
    confirm against the MBSP version in use.
    """
    return unicode(MBSP.chunk(txt))