# -*- coding: utf-8 -*-
from collections import namedtuple
import random
import urllib2
from urllib2 import HTTPError

import redis
import simplejson

# Assumed module-level dependencies, defined elsewhere in this project:
#   engine       - SQLAlchemy engine for the MySQL database
#   client       - Weibo API client
#   get_top_list - returns (word, score) pairs for the feature vocabulary
#   url_re       - compiled regex matching URLs
#   seg          - word segmenter exposing a cut() method
#   baned_list   - list of banned keywords
#   predict      - text classifier (a sketch is given at the end of this file)
#   open_pic     - opens raw image bytes and returns an image object

# Row shape shared by the sample/entry/draft tables.
Draft = namedtuple('Draft', 'id, sid, pic, snum, lnum, author, text, utime, ctime, status')


def get_training_data():
    print 'Fetching samples...'
    conn = engine.connect()

    # Rows with status=0 are the negative (trash) samples.
    rs = conn.execute('select * from sample where status=0')
    trash_tweets = map(Draft._make, rs)
    count_trash = len(trash_tweets)
    print 'Trash sample count:', count_trash

    # Every row in entry is a positive (street-snap) sample.
    rs = conn.execute('select * from entry')
    snap_tweets = map(Draft._make, rs)
    tweets = trash_tweets + snap_tweets

    print 'Shuffling samples...'
    random.shuffle(tweets)

    print 'Building the feature vocabulary...'
    top = get_top_list(tweets=(t.text for t in tweets))
    words = [w for w, v in top]

    print 'Storing the feature vocabulary in redis...'
    db = redis.StrictRedis()
    db.set('features', simplejson.dumps(words))

    print 'Writing the feature vocabulary to a local file...'
    with open('feature_words.txt', 'w') as f:
        f.writelines(w + '\n' for w in words)

    def build_x(text):
        # Strip URLs, segment the text, and emit a 0/1 vector over `words`.
        text = url_re.sub('', text)
        w_list = [w.encode('utf-8') for w in seg.cut(text.strip())]
        return [1 if w in w_list else 0 for w in words]

    fx = []  # feature vectors
    fy = []  # labels: 1 = snap, -1 = trash
    fd = []  # (pic, sid) metadata per sample
    print 'Building fx, fy, fd...'
    for t in tweets:
        fx.append(build_x(t.text))
        fy.append(1 if t.status else -1)
        fd.append((t.pic, str(t.sid)))
    return fy, fx, fd
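
# A minimal sketch of how the output of get_training_data() might feed a
# classifier. This is an illustration, not part of the original module: it
# assumes scikit-learn is available and that a linear-kernel SVM is the
# intended model (the 1/-1 labels and the "svm predict" comment below
# suggest an SVM-style setup).
def train_classifier_sketch():
    from sklearn import svm

    fy, fx, fd = get_training_data()
    clf = svm.SVC(kernel='linear')
    clf.fit(fx, fy)  # fx: 0/1 feature vectors, fy: 1/-1 labels
    return clf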
def download_snap_timeline():
    page = 1
    all_tweets = []
    # '街拍' is the "street snap" trend keyword on Weibo.
    tweets = client.trends__statuses(trend_name='街拍', page=page)
    while tweets:
        for i in tweets:
            id = i['id']
            pic = i.get('bmiddle_pic')
            author = i['user']['id']
            text = i['text'].encode('utf-8')
            # For retweets, use the original status instead.
            retweet = i.get('retweeted_status')
            if retweet:
                id = retweet['id']
                text = retweet['text'].encode('utf-8')
                author = retweet['user']['id']
            if pic:
                text = text.replace("'", '"')
                pic = pic.encode('utf-8')
                all_tweets.append((id, pic, author, text))
        page += 1
        tweets = []
        try:
            tweets = client.trends__statuses(trend_name='街拍', page=page)
        except HTTPError as e:
            if str(e.code) in ('400', '401', '403'):
                # bad request, unauthorized or forbidden
                print 'Weibo API error %s' % e.code
            else:
                raise
        if page > 5:
            break

    # MySQL connection
    conn = engine.connect()

    # SVM prediction and filtering
    dup = 0
    selected = set()
    for t in all_tweets:
        # Skip tweets containing any banned keyword.
        if any(ban in t[3] for ban in baned_list):
            continue

        # Skip tweets already saved to the draft table.
        rs = conn.execute('select * from draft where sid=%s' % t[0])
        if len(map(Draft._make, rs)):
            dup += 1
            continue

        text = t[3]
        if predict(text):
            id, pic, author, text = t
            passed = True
            try:
                r = urllib2.urlopen(pic.replace('bmiddle', 'thumbnail'))
                im = open_pic(r.read())
                width, height = im.size
                # Reject extreme aspect ratios. Use float division: in
                # Python 2, height / width on ints would truncate to 0 or 1.
                if float(height) / width > 1.7777 or float(width) / height > 1.7777:
                    passed = False
            except Exception:
                pass
                # print 'Error fetching or analysing the picture'
                # import traceback; traceback.print_exc()
            if passed:
                selected.add(t)

    # Save the selected tweets to the draft table.
    count = 0
    for line in selected:
        id, pic, author, text = line
        try:
            conn.execute('insert into draft (sid, pic, author, text, create_time) '
                         'values(%s,"%s",%s,"%s", now())' % line)
            print 'Saved to draft:', id, pic, author, text
            count += 1
        except Exception:
            pass
            # print 'Duplicate insert:', id, pic, author, text
    conn.close()
    return len(all_tweets), len(selected), dup
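
# A minimal sketch of the predict() helper that download_snap_timeline()
# relies on; its actual definition is not shown in this file. The sketch
# assumes the feature vocabulary was stored in redis by get_training_data()
# and that a trained classifier `clf` (e.g. the sklearn SVM sketched above)
# is passed in. It mirrors build_x() so that prediction-time vectors match
# the training-time feature layout.
def predict_sketch(text, clf):
    db = redis.StrictRedis()
    # simplejson yields unicode; re-encode to match the utf-8 tokens below.
    words = [w.encode('utf-8') for w in simplejson.loads(db.get('features'))]

    # Build the same 0/1 vector used at training time.
    text = url_re.sub('', text)
    w_list = [w.encode('utf-8') for w in seg.cut(text.strip())]
    fs = [1 if w in w_list else 0 for w in words]

    # SVC.predict expects a 2-D array and returns one label per row;
    # label 1 means "street snap", -1 means "trash".
    return clf.predict([fs])[0] == 1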