def __init__(self, data=None):
    """
    Set up the lookup cache and HTML unescaper, then resolve `data`.

    @param data. Optional list of items handed straight to self.resolve()
                 (presumably raw tweet texts -- confirm against resolve()).
                 Omitting it is equivalent to passing an empty list.
    """
    # Build a fresh list per call. A literal `data=[]` default is a single
    # object shared by every call, so any mutation would leak across instances.
    if data is None:
        data = []

    # Lookup cache (constantly rerunning tagger takes time)
    self.cache = Cache('ark_tweet')

    # Unescape data
    self.h = HTMLParser()

    # Resolve and cache all currently uncached tweets
    self.resolve(data)
def __init__(self):
    """
    Build the spell checker: an en_US enchant dictionary, a table of common
    abbreviations loaded from 'abbrv.txt', and a cache of corrected words.

    The abbreviation file lives in the directory given by
    enabled_modules['spell'] and holds one 'abbrev || full' entry per line.
    Blank lines are ignored.

    Raises ValueError if a non-blank line is not exactly 'abbrev || full'.
    """
    # Global spell checker
    #self.d = enchant.DictWithPWL("en_US", '/data1/wboag/ml/twitvec/twitvec/spelling/output.txt')
    self.d = enchant.Dict("en_US")

    # Common abbreviations and mistakes
    self.common = {}
    abbrevs = os.path.join(enabled_modules['spell'], 'abbrv.txt')
    with open(abbrevs, 'r') as f:
        # Iterate the file lazily instead of materializing every line.
        for lineno, line in enumerate(f, 1):
            # Drop the line terminator, including a stray '\r' from CRLF
            # files, so it never ends up inside the stored expansion.
            entry = line.rstrip('\r\n')

            # Skip blank (or whitespace-only) separator lines.
            if not entry.strip():
                continue

            fields = entry.split(' || ')
            if len(fields) != 2:
                # Same exception type as a failed tuple unpack, but the
                # message now points at the offending file and line.
                raise ValueError(
                    '%s:%d: expected "abbrev || full", got %r'
                    % (abbrevs, lineno, entry)
                )

            abbrev, full = fields
            self.common[abbrev] = full

    # Load cache of spell-corrected words
    self.cache = Cache('B-enchant')
def __init__(self, sids=None, data=None):
    """
    Open the tweet cache and cache whatever was supplied up front.

    @param sids. Optional list passed as the first argument to
                 self.resolve() (presumably tweet/status ids -- confirm
                 against resolve()). Omitting it is equivalent to [].
    @param data. Optional list passed as the second argument to
                 self.resolve(). Omitting it is equivalent to [].
    """
    # Build fresh lists per call. Literal `[]` defaults are single objects
    # shared by every call, so any mutation would leak across instances.
    if sids is None:
        sids = []
    if data is None:
        data = []

    # Tweet cache
    self.cache = Cache('twitter_data')

    # Cache all given data
    self.resolve(sids, data)