def database():
    # load MongoDB connection settings from the project config
    server = config.server
    port = config.port
    db = config.db
    name = config.name
    passwd = config.passwd
    col = config.col
    conn = MongoDBPipeline(server, port, db, name, passwd, col)
    # fetch the 1-day documents for the smtusdt symbol and wrap them in a DataFrame
    dd = list(conn.getIds({"symbol": {"$regex": u"smtusdt"}, "period": "1day"}, col))
    data = pd.DataFrame(dd)
    return data
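# A minimal usage sketch (assumption, not part of the original module): it presumes
# database() is importable together with its config / MongoDBPipeline dependencies,
# and that each returned document carries a 'close' field -- that column name is
# illustrative only.
if __name__ == '__main__':
    data = database()
    print data.shape            # number of smtusdt 1-day documents fetched
    print data.head()           # peek at the raw documents
    if 'close' in data.columns:         # 'close' is an assumed column name
        print data['close'].astype(float).describe()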
def get(self):
    url = request.args.get('key')
    if url is None or url == '':
        return json.dumps({'success': False, 'msg': 'url can\'t be empty'})
    logger.info('url is ' + url)
    try:
        if not url.startswith('http://mp.weixin.qq.com/') and not url.startswith('https://mp.weixin.qq.com/'):
            return {'success': False, 'msg': 'url pattern is not correct'}
        d0 = time.time()
        if url.startswith('https'):
            # https links cannot be parsed; converting to http returns the same page
            url = url.replace('https', 'http')
        # split the query string and rebuild it with only the minimal set of parameters
        url = url.replace(base_url, '')
        parts = str(url).split('&')
        params = ''
        for param in parts:
            if param.startswith('__biz') or param.startswith('mid') or param.startswith('idx') or param.startswith('sn'):
                params = params + param + '&'
        if params.endswith('&'):
            params = params[:-1]
        url = base_url + params
        DB = app.config.get('DB')
        md = MongoDBPipeline(DB['db'], DB['col'], DB['address'], DB['replicaSet'])
        md5 = hashlib.new("md5", url).hexdigest()
        logger.info('url is ' + url + ' and md5 is ' + md5)
        item = md.find({'md5': md5})
        if item is not None and len(item) > 0:
            # cache hit: the article was parsed before, return the stored document
            logger.info('fetch data from mongodb with key [' + md5 + ']')
            del item['_id']
            item['success'] = True
            return item
        else:
            logger.info('request from url [' + url + ']')
            item = {}
            TINYURL = app.config.get('TINYURL')
            apiUrl = TINYURL['tinyurl']  # tiny url service
            logger.info('tinyurl config is :[' + apiUrl + ']')
            tempUrl = url.replace('#', '%23').replace('&', '%26')
            try:
                # generate tinyurl
                f = urllib2.urlopen(apiUrl + tempUrl, timeout=5)
                s = f.read()
                logger.info('tinyurl is [' + s + ']')
                item['tinyurl'] = s
            except Exception as ex:
                logger.error('generate tinyurl error')
                logger.error(ex)
                item['tinyurl'] = tempUrl
            # warm up a session so the SNUID/SUV cookies are available
            s = requests.Session()
            headers = {"User-Agent": UA}
            s.headers.update(headers)
            url2 = BASE_URL + '/weixin?query=123'
            r = s.get(url2)
            if 'SNUID' not in s.cookies:
                p = re.compile(r'(?<=SNUID=)\w+')
                s.cookies['SNUID'] = p.findall(r.text)[0]
            suv = ''.join([str(int(time.time() * 1000000) + random.randint(0, 1000))])
            s.cookies['SUV'] = suv
            # read page information
            s = requests.Session()
            s.headers.update({"User-Agent": UA})
            try:
                r = s.get(url)
                html = r.text
                soup = BeautifulSoup(html, 'lxml')
                p = re.compile(r'\?wx_fmt.+?\"')
                content = str(soup.select("#js_content")[0]).replace('data-src', 'src')
                d = pq(content)
                item[u'title'] = soup.select('title')[0].text
                print item['title']
                item[u'author'] = soup.select('#post-user')[0].text
                item['datetime'] = soup.select('#post-date')[0].text
                item['contenturl'] = url
                item['md5'] = md5
                imgsrc = d.find('img[data-type]').attr('src')
                if imgsrc != '' and imgsrc is not None:
                    # the article has a cover picture: upload it and store the hosted url
                    logger.info('has picture in article')
                    logger.info(imgsrc)
                    try:
                        pic_data = base64.b64encode(urllib2.urlopen(imgsrc, timeout=5).read())
                        pic_data_md5 = hashlib.new("md5", pic_data).hexdigest()
                        data = {}  # upload img data
                        data['uid'] = 2634258
                        data['verifystr'] = pic_data_md5
                        # data['topic_id'] = 100
                        data['source'] = 16
                        data['data'] = pic_data
                        data['apptoken'] = 'dmaitoken01'
                        UPLOADING = app.config.get('UPLOADING')
                        upload_url = UPLOADING['url']  # upload img url
                        logger.info('uploading url is [' + upload_url + ']')
                        uploading_req = urllib2.urlopen(upload_url, json.dumps(data))
                        upload_result_content = uploading_req.read()
                        upload_result = json.loads(upload_result_content)
                        logger.info('uploading_result is ')
                        logger.info(upload_result)
                        if upload_result['success'] == 1:
                            logger.info('uploading img ' + imgsrc + ' successfully')
                            item['img'] = upload_result['file_url']
                        else:
                            logger.info('uploading img ' + imgsrc + ' failed')
                            item['img'] = ''
                    except Exception as ex:
                        logger.error('uploading img error and set img to original img path')
                        logger.error(ex)
                        item['img'] = imgsrc
                # take the first non-empty <span> inside a <p> as the digest
                span_length = d('p').find('span').length
                for i in range(0, span_length):
                    if d('p').find('span').eq(i).text() != '':
                        digest = d('p').find('span').eq(i).text()
                        try:
                            # mojibake check: re-decode if the text was mis-encoded
                            digest = digest.encode("latin1").decode("utf8")
                        except Exception as e:
                            logger.error('normal utf-8 string')
                        item['digest'] = digest
                        break
                if 'digest' not in item:
                    item['digest'] = ''
                md.save(item)
                item['success'] = True
                print item
                logger.info('save item to mongodb ')
                # assumed: return the freshly parsed item to the caller
                return item
            except Exception as ex:
                # assumed error handling: log the parse failure and report it
                logger.error(ex)
                return {'success': False, 'msg': 'parse article error'}
    except Exception as ex:
        # assumed error handling for the outer validation/lookup block
        logger.error(ex)
        return {'success': False, 'msg': 'parse article error'}
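# A minimal client-side sketch (assumption): it presumes get() is served by a Flask
# resource reachable at /parse on a local dev server -- the route and port are
# placeholders, and the WeChat article URL below is a dummy value for illustration.
import requests

article = 'http://mp.weixin.qq.com/s?__biz=XXXX&mid=111&idx=1&sn=222&other=ignored'
resp = requests.get('http://127.0.0.1:5000/parse', params={'key': article})
result = resp.json()
if result.get('success'):
    print result['title'], result['tinyurl']
else:
    print result.get('msg')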
def pa(self, url, start, limit):
    md = MongoDBPipeline(DB["db"], DB["col"], DB["address"])
    while start <= limit:
        # rebuild the list url with the current paging offset (25 topics per page)
        url = url.split("=").pop(0) + "="
        url += str(start)
        start += 25
        print url
        s = requests.session()
        headers = {"User-Agent": UA}
        s.headers.update(headers)
        r = s.get(url, timeout=(2, 4))
        html = r.text
        d = pq(html)
        aArray = d.find(".olt").find("tr")
        items = []
        for i in aArray:
            it = pq(i)
            clazz = it.attr("class")
            if clazz is None:
                # tr tags whose class is None
                print it
            elif clazz == "":
                # tr tags whose class is "" hold the topic rows
                trItems = it.children()
                item = {}
                for j in range(0, 4):
                    jItem = pq(trItems[j])
                    href = item.get("href")
                    if href is not None:
                        # blank the md5 for topics that are already stored, so they get skipped
                        item["md5"] = hashlib.new("md5", item["href"]).hexdigest()
                        tmp = md.find({"md5": item["md5"]})
                        if tmp is not None:
                            item["md5"] = ""
                    if j == 0:
                        item["title"] = jItem.find("a").attr("title")
                        item["href"] = jItem.find("a").attr("href")
                    elif j == 1:
                        item["author"] = jItem.text()
                        item["author_link"] = jItem.find("a").attr("href")
                    elif j == 2:
                        item["reply_count"] = jItem.text()
                    else:
                        item["last_reply"] = jItem.text()
                if item.get("md5") is not None and item.get("md5") != "":
                    items.append(item)
        print len(items)
        if len(items) > 0:
            self.loop_article(items, s, pq)
            times = 0
            for i in items:
                times += 1
                md.save(i)
            print ("saved " + str(times) + " records")
        time.sleep(5)
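# A minimal invocation sketch (assumption): the crawler class name and the group
# discussion URL are placeholders; pa() only requires that the URL end with a single
# 'start=' paging parameter, since it rewrites everything after the first '='.
if __name__ == '__main__':
    crawler = GroupCrawler()   # hypothetical class that defines pa() and loop_article()
    crawler.pa('https://www.douban.com/group/XXXX/discussion?start=0', 0, 100)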