class IndexedFeature(object):
    """Convert raw text into a sparse, index-sorted feature vector.

    Relies on the module-level ``Indexer`` class and ``removeStopList``
    helper defined elsewhere in this file.
    """

    def __init__(self):
        self.indexer = Indexer()

    def freeze(self):
        """Stop the indexer from growing on unseen tokens."""
        self.indexer.freeze()

    def vectorize(self, text):
        """Tokenize *text* and return [(word, relative_frequency), ...].

        Lines containing explicit gender statements ("I am a man/woman")
        are dropped first — presumably so the features do not leak the
        prediction target.
        """
        # better filter maybe
        texts = text.split("\n")
        texts = filter(
            lambda t: not re.search(
                r"\b(I am|I'm|Im) a (man|woman)\b", t, re.I),
            texts)
        text = "\n".join(texts)
        # tokenize and remove stoplist
        words_arr = re.findall(r"\w+", text)
        words_arr = removeStopList(words_arr)
        # normalize by total token count
        length = len(words_arr)
        if length == 0:
            # Guard: empty or fully-filtered input would otherwise raise
            # ZeroDivisionError below.
            return []
        context = Counter(words_arr).most_common()
        context = map(lambda x: (x[0], x[1] / float(length)), context)
        return context

    def number(self, context):
        """Convert a (word, value) context to an index-sorted feature array.

        Words unknown to the (frozen) indexer map to a negative index and
        are dropped.
        """
        idx_context = map(lambda t: (self.indexer.index(t[0]), t[1]), context)
        idx_context = filter(lambda t: t[0] >= 0, idx_context)
        idx_context = sorted(idx_context, key=itemgetter(0))
        return idx_context

    def text2feature(self, text):
        """Full pipeline: raw text -> indexed feature array."""
        return self.number(self.vectorize(text))
class IndexedFeature(object):
    """Convert raw text into a sparse, index-sorted feature vector.

    Redefinition of ``IndexedFeature`` with a different pre-filter: lines
    that state the author's age ("NN years old") are removed instead of
    gender statements — presumably to avoid leaking the target label.
    """

    def __init__(self):
        self.indexer = Indexer()

    def freeze(self):
        """Stop the indexer from growing on unseen tokens."""
        self.indexer.freeze()

    def vectorize(self, text):
        """Tokenize *text* and return [(word, relative_frequency), ...]."""
        # better filter maybe: drop age-revealing lines
        texts = text.split('\n')
        texts = filter(lambda t: not re.search(r'\d{2} years old', t), texts)
        text = '\n'.join(texts)
        # tokenize and remove stoplist
        words_arr = re.findall(r'\w+', text)
        words_arr = removeStopList(words_arr)
        # normalize by total token count
        length = len(words_arr)
        if length == 0:
            # Guard: empty or fully-filtered input would otherwise raise
            # ZeroDivisionError below.
            return []
        context = Counter(words_arr).most_common()
        context = map(lambda x: (x[0], x[1] / float(length)), context)
        return context

    def number(self, context):
        """Convert a (word, value) context to an index-sorted feature array.

        Words unknown to the (frozen) indexer map to a negative index and
        are dropped.
        """
        idx_context = map(lambda t: (self.indexer.index(t[0]), t[1]), context)
        idx_context = filter(lambda t: t[0] >= 0, idx_context)
        idx_context = sorted(idx_context, key=itemgetter(0))
        return idx_context

    def text2feature(self, text):
        """Full pipeline: raw text -> indexed feature array."""
        return self.number(self.vectorize(text))
class IndexedContext(object):
    """Load per-user contexts, index them, and write a libsvm-format corpus.

    Output paths (``output_libsvm``/``output_uid``) are configured by
    ``processPairs``; progress is reported via a ``ProgressBar`` (the
    third-party ``progressbar`` package).  Relies on the module-level
    ``Indexer``, ``loadContext`` and ``UserNotFoundError`` definitions.
    """

    def __init__(self):
        self.indexer = Indexer()
        self.count = 0

    def freeze(self):
        """Stop the indexer from growing on unseen tokens."""
        self.indexer.freeze()

    def getIndexer(self):
        return self.indexer

    def setIndexer(self, indexer):
        self.indexer = indexer

    def loadIndexedContext(self, user_id):
        """Load one user's context and map words to sorted feature indices."""
        context = loadContext(user_id)
        idx_context = map(lambda t: (self.indexer.index(t[0]), t[1]), context)
        # Keep only words known to the indexer.  The previous truthiness
        # test (t[0]) kept the unknown-word index (-1) and dropped valid
        # feature 0; use the same >= 0 test as IndexedFeature.number.
        idx_context = filter(lambda t: t[0] >= 0, idx_context)
        idx_context = sorted(idx_context, key=itemgetter(0))
        return idx_context

    def loadIndexedContexts(self, users):
        """Load indexed contexts for every user in *users*."""
        contexts = map(lambda u: self.loadIndexedContext(u), users)
        return contexts

    def libsvmString(self, context, label):
        ''' convert the loaded context into libsvm context. '''
        row = str(label)
        for f, v in context:  # (feature, value)
            row += ' %s:%s' % (f, v)
        row += '\n'
        return row

    def loadLibsvmString(self, user_id, label):
        """Load a user's context and render it as one libsvm row."""
        return self.libsvmString(self.loadIndexedContext(user_id), label)

    def writeToFile(self, output_file, text):
        """Append *text* to *output_file*."""
        fout = open(output_file, 'a')
        fout.write(text)
        fout.close()

    def writeId2File(self, output_uid, user_id):
        """Append one user id per line to *output_uid*."""
        fout = open(output_uid, 'a')
        fout.write("%s\n" % user_id)
        fout.close()

    def tick_counter(self):
        """Legacy dotted progress counter (superseded by the progress bar)."""
        self.count += 1
        if self.count % 1000 == 0:
            print("%d numbers processed" % self.count)
        elif self.count % 100 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()

    def processUser(self, arg):
        ''' arg = (user_id, label) '''
        user_id, label = arg
        output_file = self.output_libsvm
        output_uid = self.output_uid
        try:
            row = self.loadLibsvmString(user_id, label)
        except UserNotFoundError:
            # Missing users are expected: skip, but still tick progress.
            return False
        finally:
            # Advance the progress bar whether or not the user was found.
            self.pbar.update(self.pbar_idx)
            self.pbar_idx += 1
        self.writeToFile(output_file, row)
        self.writeId2File(output_uid, user_id)
        return True

    def processPairs(self, pairs, output_libsvm, output_uid):
        """Write libsvm rows (and matching user ids) for (user, label) pairs."""
        # convert the format
        params = [(p[0], p[1]) for p in pairs]
        self.output_libsvm = output_libsvm
        self.output_uid = output_uid
        # Run!!
        self.pbar = ProgressBar(widgets=[Percentage(), Bar()],
                                maxval=len(params)).start()
        self.pbar_idx = 0
        ## Main Loop
        results = map(lambda p: self.processUser(p), params)
        self.pbar.finish()
        print('finished processing %d out of %d' % (
            sum(results), len(results)))
class IndexedContext(object):
    """Load per-user contexts, index them, and write a libsvm-format corpus.

    File-driven variant: ``processFile`` reads tab-separated
    ``user_id<TAB>label`` lines and appends one libsvm row per found user
    to *output_file*.  Relies on the module-level ``Indexer``,
    ``loadContext`` and ``UserNotFoundError`` definitions.
    """

    def __init__(self):
        self.indexer = Indexer()
        self.count = 0

    def freeze(self):
        """Stop the indexer from growing on unseen tokens."""
        self.indexer.freeze()

    def getIndexer(self):
        return self.indexer

    def setIndexer(self, indexer):
        self.indexer = indexer

    def loadIndexedContext(self, user_id):
        """Load one user's context and map words to sorted feature indices."""
        context = loadContext(user_id)
        idx_context = map(lambda t: (self.indexer.index(t[0]), t[1]), context)
        # Keep only words known to the indexer.  The previous truthiness
        # test (t[0]) kept the unknown-word index (-1) and dropped valid
        # feature 0; use the same >= 0 test as IndexedFeature.number.
        idx_context = filter(lambda t: t[0] >= 0, idx_context)
        idx_context = sorted(idx_context, key=itemgetter(0))
        return idx_context

    def loadIndexedContexts(self, users):
        """Load indexed contexts for every user in *users*."""
        contexts = map(lambda u: self.loadIndexedContext(u), users)
        return contexts

    def libsvmString(self, context, label):
        ''' convert the loaded context into libsvm context. '''
        row = str(label)
        for f, v in context:  # (feature, value)
            row += ' %s:%s' % (f, v)
        row += '\n'
        return row

    def loadLibsvmString(self, user_id, label):
        """Load a user's context and render it as one libsvm row."""
        return self.libsvmString(self.loadIndexedContext(user_id), label)

    def writeToFile(self, output_file, text):
        """Append *text* to *output_file*."""
        fout = open(output_file, 'a')
        fout.write(text)
        fout.close()

    def tick_counter(self):
        """Print a dot every 100 users and a summary line every 1000."""
        self.count += 1
        if self.count % 1000 == 0:
            print("%d numbers processed" % self.count)
        elif self.count % 100 == 0:
            sys.stdout.write('.')
            sys.stdout.flush()

    def processUser(self, arg):
        ''' arg = (user_id, label, output_file) '''
        user_id, label, output_file = arg
        try:
            row = self.loadLibsvmString(user_id, label)
        except UserNotFoundError:
            # Missing users are expected: skip, but still tick progress.
            return False
        finally:
            self.tick_counter()
        self.writeToFile(output_file, row)
        return True

    def processFile(self, input_file, output_file):
        """Convert every ``user_id<TAB>label`` line of *input_file* to libsvm."""
        assert(os.path.exists(input_file))
        # Truncate the output first, since writeToFile appends.
        fout = open(output_file, 'w')
        fout.close()
        # Load users to process
        params = []
        for line in open(input_file):
            user_id, label = line.rstrip('\n').split('\t')
            params.append((user_id, label, output_file))
        # Run!!
        results = map(lambda p: self.processUser(p), params)
        print('finished processing %d out of %d' % (
            sum(results), len(results)))