def load_trigrams(self, filename):
    """Load model data from a text file and add it to ``self.lut``.

    Each line of the file is formatted thus:

        <number of occurrences><space><the three bytes><end of line>

    The format of the trigram is specific to the trigram mode, but
    no attempt is made to ensure that it is right.  Each count is
    added to whatever ``self.lut`` already holds for that trigram, so
    loading several files accumulates their counts.

    Raises ValueError if a line contains no space separator or if its
    count is not an integer.  The file is closed in either case.
    """
    f = open_maybe_gzip(filename)
    try:
        for line in f:
            # Split on the first space only, and strip just the newline:
            # the trigram itself may contain spaces.
            count, tg = line.rstrip('\n').split(' ', 1)
            # NOTE(review): relies on self.lut supplying a default for
            # unseen keys (e.g. a defaultdict) -- confirm.
            self.lut[tg] += int(count)
    finally:
        # Close even when a malformed line aborts the load.
        f.close()
def save_trigrams(self, filename):
    """Save the trigram data to a file.

    The format of the file is specific to the trigram mode.  Each line
    is written as ``<count> <trigram>``, ordered from the highest count
    to the lowest (ties broken by trigram, also descending).

    It is much quicker to load the trigrams from a trigram file than
    to regenerate the model from raw text.
    """
    # Dict keys are unique, so no two (count, trigram) tuples compare
    # equal and a reverse sort gives the same order as sort-then-reverse.
    values = sorted(((v, k) for k, v in self.lut.iteritems()),
                    reverse=True)
    f = open_maybe_gzip(filename, 'w')
    try:
        for count, tg in values:
            print >> f, count, tg
    finally:
        # Close (and flush) even if a write fails part-way through.
        f.close()
def import_text(self, fn):
    """Update the model with the character trigrams in the named file.

    The whole file is read, normalised to UTF-8 and handed to
    ``self.update_lut`` together with ``self.lut``; the value that call
    returns is added to ``self.trigrams``.
    """
    f = open_maybe_gzip(fn)
    try:
        s = f.read()
    finally:
        # The handle is not needed once the contents are in memory, so
        # release it straight away, including when the read fails.
        f.close()

    # Normalise as canonical utf8.  Try utf8 first; iso-8859-1 maps
    # every byte to a code point, so it serves as the fallback.
    for encoding in ('utf8', 'iso-8859-1'):
        try:
            s = s.decode(encoding)
            break
        except UnicodeDecodeError:
            pass
    s = s.encode('utf8')

    self.trigrams += self.update_lut(self.lut, s)
def hose_filter(self, infile):
    """Read the drink_the_hose json lines in infile and yield similarity.

    ``infile`` is either an object with a ``next`` attribute, which is
    iterated directly, or anything else, which is passed to
    ``open_maybe_gzip`` to obtain the lines.

    Each line is parsed as JSON and must provide ``text`` and
    ``screen_name``.  For every line a dict is yielded with:

        'score'        result of ``self.probable_similarity`` on the
                       UTF-8 encoded text
        'text'         the UTF-8 encoded text
        'screen_name'  the UTF-8 encoded screen name

    A file opened here is closed when the generator finishes, fails, or
    is closed early by its consumer; a caller-supplied iterator is left
    open for the caller to manage.
    """
    if hasattr(infile, 'next'):
        f = infile
    else:
        f = open_maybe_gzip(infile)
    try:
        for line in f:
            j = json.loads(line)
            s = j['text'].encode('utf8')
            p = self.probable_similarity(s)
            yield {'score': p,
                   'text': s,
                   #'id': j['id'],
                   'screen_name': j["screen_name"].encode('utf8')
                   }
    finally:
        # Only close what this method opened itself.
        if f is not infile:
            f.close()
def get_tokens_from_file(self, filename):
    """Read the json lines from specified file and yield tokens.

    Every line is parsed as JSON and dispatched on its ``doc_type``:

        "raw_tweet"            screen name from ``username``, tokens from
                               ``self.get_tokens_from_raw_tweet``
        "csharp_munged_tweet"  screen name from ``screen_name``, tokens
                               from ``self.get_tokens_from_csharp_munged_tweet``

    Yields ``(screen_name, token)`` tuples, where the screen name is
    prefixed with "@" and lower-cased.

    Raises Exception for any other ``doc_type``.  The file is closed
    when the generator finishes, fails, or is closed early.
    """
    f = open_maybe_gzip(filename)
    try:
        for line in f:
            # Hand json.loads a byte string when the reader gives unicode.
            if isinstance(line, unicode):
                line = line.encode("utf-8")
            j = json.loads(line)
            doc_type = j["doc_type"]
            if doc_type == "raw_tweet":
                # Lower-case once per document rather than once per token.
                screen_name = ("@" + j["username"]).lower()
                for token in self.get_tokens_from_raw_tweet(j):
                    yield (screen_name, token)
            elif doc_type == "csharp_munged_tweet":
                screen_name = ("@" + j["screen_name"]).lower()
                for token in self.get_tokens_from_csharp_munged_tweet(j):
                    yield (screen_name, token)
            else:
                raise Exception("unexpected doc type ", doc_type)
    finally:
        f.close()