def __init__(self, data=None):
        """Set up the lookup cache and HTML unescaper, then resolve ``data``.

        Args:
            data: Items handed to ``self.resolve`` so that anything not yet
                cached gets resolved and stored. ``None`` (the default) is
                treated as an empty list.
        """
        # A ``[]`` default would be one list object shared by every call;
        # build a fresh list per instance so nothing leaks between them.
        if data is None:
            data = []

        # Lookup cache (constantly rerunning the tagger takes time)
        self.cache = Cache('ark_tweet')

        # Parser instance kept around for unescaping data
        self.h = HTMLParser()

        # Resolve and cache all currently uncached tweets
        self.resolve(data)
    def __init__(self):
        """Create the spell checker, load the abbreviation table, open the cache.

        The abbreviation table is read from ``abbrv.txt`` under the path
        stored in ``enabled_modules['spell']``. Every non-blank line is
        expected to look like ``<abbreviation> || <full form>``; a line that
        does not split into exactly two parts raises ``ValueError``.
        """
        # Global spell checker
        #self.d = enchant.DictWithPWL("en_US", '/data1/wboag/ml/twitvec/twitvec/spelling/output.txt')
        self.d = enchant.Dict("en_US")

        # Common abbreviations and mistakes, keyed by the short form
        self.common = {}
        abbrev_path = os.path.join(enabled_modules['spell'], 'abbrv.txt')
        with open(abbrev_path, 'r') as handle:
            for entry in handle:
                # Blank lines carry no mapping, so only parse the others
                if entry != '\n':
                    short, expansion = entry.strip('\n').split(' || ')
                    self.common[short] = expansion

        # Load cache of spell-corrected words
        self.cache = Cache('B-enchant')
Esempio n. 3
0
    def __init__(self, sids=None, data=None):
        """Open the tweet cache and resolve the given ids and data into it.

        Args:
            sids: First argument forwarded to ``self.resolve``. ``None``
                (the default) is treated as an empty list.
            data: Second argument forwarded to ``self.resolve``. ``None``
                (the default) is treated as an empty list.
        """
        # ``[]`` defaults would be shared by every call; create fresh lists
        # per instance so nothing leaks between them.
        if sids is None:
            sids = []
        if data is None:
            data = []

        # Tweet cache
        self.cache = Cache('twitter_data')

        # Cache all given data
        self.resolve(sids, data)