def filepath_to_unicode_strings(self, filepath):
    """Read text out of an input file.

    The default just reads the text, converts to unicode and yields one
    unicode string. Subclasses can override this function in order to
    preprocess, and can yield any number of strings.

    Args:
      filepath: a string

    Yields:
      unicode strings.
    """
    # Read inside a context manager so the handle is closed even if the
    # read raises or the generator is abandoned before completion.
    with tf.gfile.Open(filepath) as f:
        contents = f.read()
    yield text_encoder.to_unicode_ignore_errors(contents)
def filepath_to_unicode_strings(self, filepath):
    """Overrides the base class to clean up the xml dump before tokenizing.

    The file is read in full, converted to unicode, and split into pages
    with `_dump_to_pages`. Every page that is kept contributes one entry
    of the form `title: "<title>" length: <n>` followed by its cleaned
    text; all entries are concatenated into a single string.

    A page is skipped when its title contains a colon, or when its
    cleaned text is 140 characters or shorter.

    Args:
      filepath: a string

    Yields:
      exactly one unicode string holding every kept page (empty if no
      page qualifies).
    """
    # Close the handle deterministically instead of leaking it.
    with tf.gfile.Open(filepath) as f:
        dump = text_encoder.to_unicode_ignore_errors(f.read())

    # Collect entries and join once; repeated `+=` on a growing string is
    # quadratic in the size of the dump.
    entries = []
    for page in _dump_to_pages(dump):
        title = _page_to_title(page)
        if u":" in title:
            # not a regular article; checked before the text cleanup so
            # that skipped pages do not pay for it.
            continue
        text = _remove_triple_quotes(
            _remove_double_brackets(_remove_references(_page_to_text(page))))
        if len(text) <= 140:
            # Probably a redirect or something like that. Skip it.
            continue
        entries.append(
            u"title: \"%s\" length: %d\n%s\n" % (title, len(text), text))
    yield u"".join(entries)