Example #1
0
    def load_trigrams(self, filename):
        """Load model data from a text file in which each line is
        formatted thus:

        <number of occurrences><space><the three bytes><end of line>

        The counts are added to whatever self.lut already holds for
        each trigram, so loading more than one file accumulates totals.

        The format of the trigram is specific to the trigram mode, but
        no attempt is made to ensure that it is right.  A line with no
        space in it, or whose count is not an integer, raises
        ValueError.
        """
        f = open_maybe_gzip(filename)
        # try/finally so the handle is released even when a malformed
        # line raises part-way through the file.
        try:
            for line in f:
                # Strip only the newline and split only once: the
                # trigram itself may contain spaces.
                count, tg = line.rstrip('\n').split(' ', 1)
                self.lut[tg] += int(count)
        finally:
            f.close()
Example #2
0
    def save_trigrams(self, filename):
        """Save the trigram data to a file.

        One line is written per entry in self.lut, as the count, a
        space, then the trigram.  Lines are ordered by descending
        count, with ties broken by descending trigram.

        The format of the file is specific to the trigram mode.

        It is much quicker to load the trigrams from a trigram file
        than to regenerate the model from raw text."""
        # (count, trigram) pairs so that the sort is primarily by count.
        values = sorted(((v, k) for k, v in self.lut.iteritems()),
                        reverse=True)
        f = open_maybe_gzip(filename, 'w')
        # try/finally so a failed write does not leave the handle open.
        try:
            for count, tg in values:
                print >> f, count, tg
        finally:
            f.close()
Example #3
0
    def import_text(self, fn):
        """Update the model with the character trigrams in the named
        file.

        The whole file is read into memory, normalised to utf8, and
        handed to self.update_lut along with self.lut; whatever
        update_lut returns is added to self.trigrams.
        """
        f = open_maybe_gzip(fn)
        # Close as soon as the data is in memory, and also when the
        # read itself fails.
        try:
            s = f.read()
        finally:
            f.close()

        # Normalise as canonical utf8: try utf8 first, then fall back
        # to iso-8859-1, which assigns a character to every byte value
        # and so always decodes.
        for encoding in ('utf8', 'iso-8859-1'):
            try:
                s = s.decode(encoding)
                break
            except UnicodeDecodeError:
                pass
        s = s.encode('utf8')

        self.trigrams += self.update_lut(self.lut, s)
Example #4
0
 def hose_filter(self, infile):
     """Read the drink_the_hose json lines in infile and yield
     similarity.

     infile is either an iterator of lines (anything with a `next`
     attribute), which is consumed directly and left open, or a
     filename, which is opened with open_maybe_gzip and closed when
     this generator finishes or is discarded.

     Each line must be a json object with "text" and "screen_name"
     keys.  For each line a dict is yielded with keys 'score' (the
     result of self.probable_similarity on the utf8-encoded text),
     'text' and 'screen_name' (both utf8-encoded).
     """
     if hasattr(infile, 'next'):
         f = infile
     else:
         f = open_maybe_gzip(infile)
     # try/finally: as a generator this may be abandoned before the
     # input is exhausted, or a bad line may raise; either way a file
     # opened here must still be closed.
     try:
         for line in f:
             j = json.loads(line)
             s = j['text'].encode('utf8')
             p = self.probable_similarity(s)
             yield {'score': p,
                    'text': s,
                    'screen_name': j["screen_name"].encode('utf8')
                    }
     finally:
         # Never close a stream that belongs to the caller.
         if f is not infile:
             f.close()
Example #5
0
    def get_tokens_from_file(self, filename):
        """Read the json lines from specified file and yield tokens.

        Each line must be a json object with a "doc_type" key.  For
        "raw_tweet" documents the screen name is taken from "username"
        and tokens come from self.get_tokens_from_raw_tweet; for
        "csharp_munged_tweet" documents the screen name is taken from
        "screen_name" and tokens come from
        self.get_tokens_from_csharp_munged_tweet.

        Yields (screen_name, token) pairs, where screen_name is
        lower-cased and prefixed with "@".

        Raises Exception on any other doc_type.
        """
        f = open_maybe_gzip(filename)
        # try/finally: this is a generator, so the file must be closed
        # when it is abandoned early or a line raises, not only after
        # the last line.
        try:
            for line in f:
                if isinstance(line, unicode):
                    line = line.encode("utf-8")
                j = json.loads(line)
                doc_type = j["doc_type"]

                if doc_type == "raw_tweet":
                    screen_name = "@" + j["username"]
                    tokens = self.get_tokens_from_raw_tweet(j)
                elif doc_type == "csharp_munged_tweet":
                    screen_name = "@" + j["screen_name"]
                    tokens = self.get_tokens_from_csharp_munged_tweet(j)
                else:
                    raise Exception("unexpected doc type %r" % (doc_type,))

                # Lower-case once per document rather than per token.
                screen_name = screen_name.lower()
                for token in tokens:
                    yield (screen_name, token)
        finally:
            f.close()