import os
import re
import sys
from collections import Counter
from operator import itemgetter
from progressbar import ProgressBar, Percentage, Bar

# Indexer, FeatureConverter, loadContext, removeStopList and
# UserNotFoundError are defined elsewhere in this repository.


class FollowingFeatureConverter(FeatureConverter):
    def __init__(self, getFollowing=None):
        assert getFollowing is not None
        self.getFollowing = getFollowing
        self.indexer = Indexer()

    def passIndexer(self, feature_arr):
        '''
        @param feature_arr This is a list of (predicate, weight).
        '''
        def _indexed(feature_weight):
            feature, weight = feature_weight
            return (self.indexer.index(feature), weight)
        return map(_indexed, feature_arr)

    def convertSingle(self, user_id):
        user_id = str(user_id)
        followings = self.getFollowing(user_id)
        if followings is None:
            return None
        # Every followed account becomes a binary (weight 1) feature.
        feature_arr = map(lambda x: (x, 1), followings)
        feature_arr = self.passIndexer(feature_arr)
        return sorted(feature_arr, key=itemgetter(0))
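# Illustrative usage sketch (not part of the original source): exercises
# FollowingFeatureConverter with a hypothetical in-memory getFollowing;
# the real project presumably looks followings up in its crawl store.
def _demoFollowingConverter():
    def getFollowing(user_id):
        # made-up mapping from user id to followed accounts
        return {'42': ['nytimes', 'bbc', 'espn']}.get(user_id)
    converter = FollowingFeatureConverter(getFollowing=getFollowing)
    # prints something like [(0, 1), (1, 1), (2, 1)] once the indexer
    # has assigned an integer id to each followed account
    print converter.convertSingle(42)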
class IndexedFeature(object):
    def __init__(self):
        self.indexer = Indexer()

    def freeze(self):
        self.indexer.freeze()

    def vectorize(self, text):
        # Filter out self-descriptions of gender; a better filter may be
        # needed.
        texts = text.split("\n")
        texts = filter(
            lambda t: not re.search(
                r"\b(I am|I'm|Im) a (man|woman)\b", t, re.I),
            texts)
        text = "\n".join(texts)
        # Tokenize and remove stoplist words.
        words_arr = re.findall(r"\w+", text)
        words_arr = removeStopList(words_arr)
        # Normalize counts by the total number of tokens.
        length = len(words_arr)
        if length == 0:  # guard against empty text after filtering
            return []
        context = Counter(words_arr).most_common()
        context = map(lambda x: (x[0], x[1] / float(length)), context)
        return context

    def number(self, context):
        '''
        Convert a (word, weight) context into an indexed feature array.
        '''
        idx_context = map(lambda t: (self.indexer.index(t[0]), t[1]), context)
        idx_context = filter(lambda t: t[0] >= 0, idx_context)
        idx_context = sorted(idx_context, key=itemgetter(0))
        return idx_context

    def text2feature(self, text):
        return self.number(self.vectorize(text))
# Variant of IndexedFeature above: this one filters lines stating the
# user's age instead of gender self-descriptions.
class IndexedFeature(object):
    def __init__(self):
        self.indexer = Indexer()

    def freeze(self):
        self.indexer.freeze()

    def vectorize(self, text):
        # Filter out lines stating the user's age; a better filter may be
        # needed.
        texts = text.split('\n')
        #texts = filter(lambda t: not re.search('year', t), texts)
        #texts = filter(lambda t: not re.search('old', t), texts)
        texts = filter(lambda t: not re.search(r'\d{2} years old', t), texts)
        text = '\n'.join(texts)
        # Replace Twitter-specific patterns (currently disabled).
        #text = re.sub(r'https?:\S+', ' ', text)
        #text = re.sub(r'\b\d{2} years old', ' ', text)
        #text = re.sub(r'old', ' ', text)
        #text = re.sub(r'years', ' ', text)
        #text = re.sub(r'\d+', ' ', text)
        #text = re.sub(r'#\w+', ' ', text)
        #text = re.sub(r'@\w+', ' ', text)
        # Tokenize and remove stoplist words.
        words_arr = re.findall(r'\w+', text)
        words_arr = removeStopList(words_arr)
        # Normalize counts by the total number of tokens.
        length = len(words_arr)
        if length == 0:  # guard against empty text after filtering
            return []
        context = Counter(words_arr).most_common()
        context = map(lambda x: (x[0], x[1] / float(length)), context)
        return context

    def number(self, context):
        '''
        Convert a (word, weight) context into an indexed feature array.
        '''
        idx_context = map(lambda t: (self.indexer.index(t[0]), t[1]), context)
        idx_context = filter(lambda t: t[0] >= 0, idx_context)
        idx_context = sorted(idx_context, key=itemgetter(0))
        return idx_context

    def text2feature(self, text):
        return self.number(self.vectorize(text))
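# Illustrative usage sketch (not part of the original source): runs the
# age-filtering IndexedFeature variant above on made-up profile text;
# removeStopList and Indexer are assumed to come from elsewhere in the
# repository.
def _demoIndexedFeature():
    feat = IndexedFeature()
    text = 'i love coffee\ncoffee and books\n25 years old'
    # vectorize drops the age line, tokenizes, and normalizes counts,
    # yielding pairs like ('coffee', 0.4)
    print feat.vectorize(text)
    # text2feature additionally maps each word to its integer index
    print feat.text2feature(text)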
class IndexedContext(object):
    def __init__(self):
        self.indexer = Indexer()
        self.count = 0

    def freeze(self):
        self.indexer.freeze()

    def getIndexer(self):
        return self.indexer

    def setIndexer(self, indexer):
        self.indexer = indexer

    def loadIndexedContext(self, user_id):
        context = loadContext(user_id)
        idx_context = map(lambda t: (self.indexer.index(t[0]), t[1]), context)
        # Drop words the indexer does not know; the >= 0 check mirrors
        # IndexedFeature.number, where plain truthiness would also discard
        # the legitimate feature index 0.
        idx_context = filter(lambda t: t[0] >= 0, idx_context)
        idx_context = sorted(idx_context, key=itemgetter(0))
        return idx_context

    def loadIndexedContexts(self, users):
        contexts = map(lambda u: self.loadIndexedContext(u), users)
        return contexts

    def libsvmString(self, context, label):
        '''
        Convert the loaded context into a libsvm row:
        '<label> <feature>:<value> ...'.
        '''
        row = str(label)
        for f, v in context:  # (feature, value)
            row += ' %s:%s' % (f, v)
        row += '\n'
        return row

    def loadLibsvmString(self, user_id, label):
        return self.libsvmString(self.loadIndexedContext(user_id), label)

    def writeToFile(self, output_file, text):
        fout = open(output_file, 'a')
        fout.write(text)
        fout.close()

    def writeId2File(self, output_uid, user_id):
        fout = open(output_uid, 'a')
        fout.write("%s\n" % user_id)
        fout.close()

    def tick_counter(self):
        self.count += 1
        if self.count % 1000 == 0:
            print "%d users processed" % self.count
        elif self.count % 100 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()

    def processUser(self, arg):
        '''
        arg = (user_id, label)
        '''
        user_id, label = arg
        output_file = self.output_libsvm
        output_uid = self.output_uid
        try:
            row = self.loadLibsvmString(user_id, label)
        except UserNotFoundError:
            return False
        finally:
            # Advance the progress bar whether or not the user was found.
            #self.tick_counter()
            self.pbar.update(self.pbar_idx)
            self.pbar_idx += 1
        self.writeToFile(output_file, row)
        self.writeId2File(output_uid, user_id)
        return True

    #def processFile(self, input_file, output_file):#{{{
    #    assert(os.path.exists(input_file))
    #    fout = open(output_file, 'w')
    #    fout.close()
    #    # Load users to process
    #    params = []
    #    for line in open(input_file):
    #        user_id, label = line.rstrip('\n').split('\t')
    #        params.append((user_id, label, output_file))
    #    # Run!!
    #    results = map(lambda p: self.processUser(p), params)
    #    print 'finished processing %d out of %d' % (
    #        sum(results), len(results))#}}}

    def processPairs(self, pairs, output_libsvm, output_uid):
        # Convert the format.
        params = [(p[0], p[1]) for p in pairs]
        self.output_libsvm = output_libsvm
        self.output_uid = output_uid
        # Run!!
        self.pbar = ProgressBar(widgets=[Percentage(), Bar()],
                                maxval=len(params)).start()
        self.pbar_idx = 0
        ## Main loop
        results = map(lambda p: self.processUser(p), params)
        self.pbar.finish()
        print 'finished processing %d out of %d' % (
            sum(results), len(results))
# Variant of IndexedContext above: processing is driven from a
# tab-separated input file, and progress is reported with tick_counter
# instead of a ProgressBar.
class IndexedContext(object):
    def __init__(self):
        self.indexer = Indexer()
        self.count = 0

    def freeze(self):
        self.indexer.freeze()

    def getIndexer(self):
        return self.indexer

    def setIndexer(self, indexer):
        self.indexer = indexer

    def loadIndexedContext(self, user_id):
        context = loadContext(user_id)
        idx_context = map(lambda t: (self.indexer.index(t[0]), t[1]), context)
        # Drop words the indexer does not know; see the note in the
        # variant above.
        idx_context = filter(lambda t: t[0] >= 0, idx_context)
        idx_context = sorted(idx_context, key=itemgetter(0))
        return idx_context

    def loadIndexedContexts(self, users):
        contexts = map(lambda u: self.loadIndexedContext(u), users)
        return contexts

    def libsvmString(self, context, label):
        '''
        Convert the loaded context into a libsvm row:
        '<label> <feature>:<value> ...'.
        '''
        row = str(label)
        for f, v in context:  # (feature, value)
            row += ' %s:%s' % (f, v)
        row += '\n'
        return row

    def loadLibsvmString(self, user_id, label):
        return self.libsvmString(self.loadIndexedContext(user_id), label)

    def writeToFile(self, output_file, text):
        fout = open(output_file, 'a')
        fout.write(text)
        fout.close()

    def tick_counter(self):
        self.count += 1
        if self.count % 1000 == 0:
            print "%d users processed" % self.count
        elif self.count % 100 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()

    def processUser(self, arg):
        '''
        arg = (user_id, label, output_file)
        '''
        user_id, label, output_file = arg
        try:
            row = self.loadLibsvmString(user_id, label)
        except UserNotFoundError:
            return False
        finally:
            # Tick the progress counter whether or not the user was found.
            self.tick_counter()
        self.writeToFile(output_file, row)
        return True

    def processFile(self, input_file, output_file):
        assert os.path.exists(input_file)
        # Truncate the output file before rows are appended to it.
        fout = open(output_file, 'w')
        fout.close()
        # Load users to process.
        params = []
        for line in open(input_file):
            user_id, label = line.rstrip('\n').split('\t')
            params.append((user_id, label, output_file))
        # Run!!
        results = map(lambda p: self.processUser(p), params)
        print 'finished processing %d out of %d' % (
            sum(results), len(results))
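# Illustrative usage sketch (not part of the original source): converts a
# made-up tab-separated '<user_id>\t<label>' file into libsvm rows of the
# form '<label> <feature>:<value> ...'; loadContext, Indexer and
# UserNotFoundError are assumed repository internals, and the file names
# here are hypothetical.
def _demoIndexedContext():
    ctx = IndexedContext()
    ctx.processFile('user_list.tsv', 'train.libsvm')
    # a produced row looks like: '1 3:0.05 17:0.012 42:0.2'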
class TextFeatureConverter(FeatureConverter):
    '''
    The indexer used to map a given word to a feature number is kept
    across conversions of a series of users, so successive calls to
    'convert' are expected to return a consistent feature mapping.
    '''
    def __init__(self, getText=None, stopfile=None):
        '''
        @param getText A function which takes user_id as a parameter and
            returns a tuple (text, length), where length is the number of
            crawled statuses, used later to normalize the text.
        @param stopfile A file which contains a list of stop words. A
            default file is used if none is given.
        '''
        super(TextFeatureConverter, self).__init__()  # if any
        self.getText = getText
        self.indexer = Indexer()
        # Process the stop list.
        self.stoplist = {}
        if stopfile is None:
            stopfile = '../../data/stoplist.txt'
        stop = open(stopfile)
        for line in stop:
            line = line.rstrip('\n')
            self.stoplist[line] = True

    def writeIndexed(self, fout_name):
        fout = open(fout_name, 'w')
        fout.write(str(self.indexer))
        fout.close()

    def textToSparseTokens(self, text, pass_indexer=True):
        '''
        Given a long text, convert it to a list of words (or features) of
        the form [(word, count), (word, count), ...], i.e., an
        unnormalized feature list.
        '''
        def _removeStoplist(words_arr):
            new_words_arr = []
            for word in words_arr:
                if not word.lower() in self.stoplist:
                    new_words_arr.append(word)
            return new_words_arr

        def _replacePattern(text):
            #text = re.sub(r'#\w+', 'tag_pattern', text)
            #text = re.sub(r'\b\d+\b', ' ', text)
            #text = re.sub(r'@\w+', 'mention_pattern', text)
            #text = re.sub(r'https?:\S+', 'url_pattern', text)
            return text

        words_arr = re.findall(r'\w+', _replacePattern(text))
        words_arr = map(lambda x: x.lower(), words_arr)
        words_arr = _removeStoplist(words_arr)
        #if pass_indexer:  # this conditioning added to handle a special
        #                  # case needed by the passWeightCoeff module
        #    words_arr = map(self.indexer.index, words_arr)
        c = Counter(words_arr)
        return c.most_common()

    def normalizeWordsArr(self, words_arr, length):
        def _single(pair):
            return (pair[0], float(pair[1]) / length)
        return map(_single, words_arr)

    def additionalPass(self, words_arr, user_id, semi_label):
        '''
        A subclass may override this method to add functionality, such as
        changing the weights of features.
        '''
        return words_arr

    def passIndexer(self, words_arr):
        '''
        @param words_arr This is a list of (word, weight).
        '''
        def _indexed(word_weight):
            word, weight = word_weight
            return (self.indexer.index(word), weight)
        return map(_indexed, words_arr)

    def convertSingle(self, user_id, semi_label=None):
        '''
        This method is called repeatedly by the 'writeFeatures' method for
        each user_id encountered.
        '''
        user_id = str(user_id)
        mytext, length = self.getText(user_id)
        if mytext is None:  # when something's wrong
            return None
        words_arr = self.textToSparseTokens(mytext)
        words_arr = self.normalizeWordsArr(words_arr, length)
        words_arr = self.additionalPass(words_arr, user_id, semi_label)  # hook
        words_arr = self.passIndexer(words_arr)
        return sorted(words_arr, key=itemgetter(0))
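# Illustrative usage sketch (not part of the original source): feeds
# TextFeatureConverter a hypothetical getText stub; the real getText reads
# a user's crawled statuses, and the default stoplist file
# '../../data/stoplist.txt' must exist for the constructor to succeed.
def _demoTextConverter():
    def getText(user_id):
        # returns (concatenated text, number of statuses crawled)
        return ('good morning twitter\ngood night twitter', 2)
    conv = TextFeatureConverter(getText=getText)
    # each entry is (feature_index, count / num_statuses), sorted by index
    print conv.convertSingle(42)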