# -*- coding: utf-8 -*-
# __author__ = 'jufu'
from util import initLogger
from util import initDB

logger = initLogger('log.conf', 'dlmLogger')

if __name__ == '__main__':
    table = initDB('fintech', 'ygdai_detail')
    newTable = initDB('fintech', 'new_ygdai_detail')
    # Re-shape every crawled ygdai record: each entry of u'信息' is a block of
    # "key：value" lines separated by '\n'; flatten them into a single dict.
    for user in table.find():
        data = {}
        for userString in user[u'信息']:
            for s in userString.split('\n'):
                parts = s.split(u'\uff1a')  # split on the fullwidth colon '：'
                if len(parts) < 2:          # skip lines without a key/value pair
                    continue
                data[parts[0].strip()] = parts[1].strip()
        data[u'抓取时间'] = user[u'抓取时间']
        data[u'总逾期借款笔数'] = user[u'总逾期借款笔数']
        newTable.insert_one(data)
# -*- coding: utf-8 -*-
# __author__ = 'jufu'
import requests
from bs4 import BeautifulSoup
from util import initLogger
from util import initDB

logger = initLogger('log.conf', 'dlmLogger')
table = initDB('fintech', 'ygdai_new')

URL = 'http://www.ygdai.com/s/blacklist.html'
domain = 'http://www.ygdai.com'
baseUrl = 'http://www.ygdai.com/s/blacklist/page/'


def getTotalPages(url):
    """Read the pager on the first blacklist page and return the last page number."""
    logger.info('start to get total pages...')
    logger.info(url)
    html = requests.get(url)
    content = html.text
    soup = BeautifulSoup(content, 'lxml')
    link = soup.select('#yw0 > li.last > a')
    hrefStr = link[0]['href']
    logger.info('getTotalPages end...')
    # The href looks like .../s/blacklist/page/N.html, so take the last path segment.
    return hrefStr.split('/')[-1].split('.')[0]


def genPageUrls(totalPages):
    """Build the full list of paginated blacklist URLs."""
    urls = [baseUrl + str(i) + '.html' for i in range(1, int(totalPages) + 1)]
    return urls
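# The ygdai list crawler above stops after genPageUrls. A minimal driver could
# wire the two helpers together and walk every paginated blacklist page; this is
# only a sketch, and the link filter ('/s/blacklist/' in the href) and the two
# stored fields ('user', 'url') are assumptions. The field names do match what the
# ygdai detail crawler later reads back from 'ygdai_new'.
if __name__ == '__main__':
    totalPages = getTotalPages(URL)
    for pageUrl in genPageUrls(totalPages):
        logger.info('crawling list page:\t' + pageUrl)
        soup = BeautifulSoup(requests.get(pageUrl).text, 'lxml')
        for a in soup.findAll('a'):
            href = a.get('href', '')
            if '/s/blacklist/' not in href:   # placeholder filter for detail links
                continue
            table.insert_one({'user': a.get_text(strip=True),
                              'url': domain + href})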
def main():
    conn = createDBconnection()
    initDB(conn)
    conn.close()
    server()
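# createDBconnection() and server() are not defined in the fragment above. A
# plausible createDBconnection() for the MongoDB setup used by the other scripts
# in this repo might look like the sketch below; the host, port and the idea of
# returning a raw MongoClient are assumptions, not taken from the original code.
from pymongo import MongoClient

def createDBconnection(host='localhost', port=27017):
    # Open a client connection; callers such as main() are responsible for closing it.
    return MongoClient(host, port)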
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import sys
import urllib2
from util import initLogger
from util import initDB

baseUrl = 'http://www.p2p12580.com/blacklist.asp?'
logger = initLogger('log.conf', 'dlmLogger')
table = initDB('fintech', 'p2p12580_new')


def getDetail(url):
    pass


def getTotalPages(startUrl):
    # Example list URL: http://www.p2p12580.com/blacklist.asp?page=1&id=1&strKeyWord=
    # The site is GBK-encoded, so decode the raw response explicitly.
    html = urllib2.urlopen(startUrl).read().decode('gbk')
    soup = BeautifulSoup(html, 'lxml')
    # Select the table cell that is expected to hold the pager information.
    print soup.select('#__01 > tbody > tr:nth-of-type(6) > td:nth-of-type(2) > table > tbody > tr:nth-of-type(39) > td')


if __name__ == '__main__':
    print 'please open http://www.p2p12580.com/blacklist.asp'
    id = raw_input('please enter id:')
    if id.isdigit():
        startUrl = baseUrl + 'page=1&id=' + str(id) + '&strKeyWord='
        getTotalPages(startUrl)
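# getTotalPages() above only prints the selected pager cell. A sketch of turning
# that cell into a page count follows; assuming the pager text contains the page
# count as its first run of digits (e.g. u'共12页'), which the original script
# does not confirm. Usage: parseTotalPages(cells[0].get_text()) on the selected cell.
import re

def parseTotalPages(cellText):
    # Pick the first run of digits out of the pager text; fall back to one page.
    match = re.search(u'(\d+)', cellText)
    return int(match.group(1)) if match else 1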
# -*- coding: utf-8 -*-
# __author__ = 'jufu'
import requests
from bs4 import BeautifulSoup
from util import initLogger
from util import initDB

domainUrl = 'http://www.dailianmeng.com'
URL = 'http://www.dailianmeng.com/p2pblacklist/index.html'
baseUrl = 'http://www.dailianmeng.com/p2pblacklist/index.html?ajax=yw0&P2pBlacklist_page='

logger = initLogger('log.conf', 'dlmLogger')
table = initDB('fintech', 'dailianmeng_new')


def getTotalPages(url):
    """Read the pager on the blacklist index and return the last page number."""
    logger.info('start to get total pages...')
    logger.info(url)
    html = requests.get(url)
    content = html.text
    soup = BeautifulSoup(content, 'lxml')
    link = soup.select('#yw1 > li.last > a')
    hrefStr = link[0]['href']
    logger.info('getTotalPages end...')
    # The href ends with ...P2pBlacklist_page=N, so take the part after the last '='.
    return hrefStr.split('=')[-1]


def genPageUrls(totalPages):
    """Build the full list of paginated blacklist URLs."""
    urls = [baseUrl + str(i) for i in range(1, int(totalPages) + 1)]
    return urls
    u'上传时间': 'Case_Exposed_Date',
    u'信息来源更新时间': 'Case_Exposed_Date',
    u'信息来源': 'Info_Source',
    u'信息来源URL': 'Info_source_URL',
    u'信息来源网址': 'Info_source_URL',
    u'抓取时间': 'Info_Captured_TS',
    u'创建日期': 'Created_Date',
    u'修改日期': 'Last_updated_date',
}

if __name__ == '__main__':
    tableName = raw_input('please enter the table you want to convert.....\n')
    logger.info('connect to table ' + tableName)
    table = initDB('fintech', tableName)
    jsonTable = initDB('fintech', tableName + '_json')
    # Rename each document's Chinese field names to the English column names in
    # `data`, skipping the Mongo _id and two explicitly excluded fields.
    for t in table.find():
        jsonData = {}
        for key in t.keys():
            if key != '_id' and key != u'妻子电话' and key != u'丈夫':
                jsonKey = data[key]
                jsonValue = t[key]
                print jsonKey, jsonValue
                jsonData[jsonKey] = jsonValue
        jsonTable.insert_one(jsonData)
import requests
from bs4 import BeautifulSoup
from util import initLogger
from util import initDB

logger = initLogger('log.conf', 'dlmLogger')
table = initDB('fintech', 'cxhDetail_new')


def getUserDetail(url):
    html = requests.get(url)
    content = html.text
    soup = BeautifulSoup(content, 'lxml')
    print soup


if __name__ == '__main__':
    url = 'http://www.chengxinhei.com/search/result/id/2388.html'
    getUserDetail(url)
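# getUserDetail() above stops at printing the parsed page. Below is a sketch of
# pulling the detail page into the cxhDetail_new collection; the assumption that
# the page holds one <table> of name/value rows is mine, not the original's.
def saveUserDetail(url):
    soup = BeautifulSoup(requests.get(url).text, 'lxml')
    detailTable = soup.find('table')
    if detailTable is None:
        logger.info('no detail table found:\t' + url)
        return
    data = {}
    for tr in detailTable.findAll('tr'):
        cells = tr.findAll(['th', 'td'])
        if len(cells) >= 2:
            # first cell = field name, second cell = value (assumed layout)
            data[cells[0].get_text(strip=True)] = cells[1].get_text(strip=True)
    if data:
        table.insert_one(data)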
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import time
from util import initLogger
from util import initDB

logger = initLogger('log.conf', 'dlmLogger')
mongoTable = initDB('fintech', 'dlmDetail_new')


def getUserDetail(userInfo):
    # userInfo is a document from the list crawl with 'user' and 'url' fields.
    url = userInfo['url']
    userName = userInfo['user']
    logger.info('start to get user detail:\t' + url)
    print 'crawl user:\t' + userName
    html = requests.get(url)
    content = html.text
    soup = BeautifulSoup(content, 'lxml')
    table = soup.table
    trs = table.findAll('tr')
    data = {u'姓名': userName}
    for tr in trs:
# -*- coding: utf-8 -*-
# __author__ = 'jufu'
import requests
from bs4 import BeautifulSoup
from util import initLogger
from util import initDB

logger = initLogger('log.conf', 'dlmLogger')
table = initDB('fintech', 'chengxinhei_new')

baseUrl = 'http://www.chengxinhei.com/search/list/page/'
URL = 'http://www.chengxinhei.com'


def getTotalPages(url):
    html = requests.get(url)
    content = html.text
    soup = BeautifulSoup(content, 'lxml')
    lastNumber = soup.find('li', class_='last')
    hrefStr = lastNumber.a['href']
    # The href looks like /search/list/page/N.html; element 4 of the split is 'N.html'.
    return hrefStr.split('/')[4].split('.')[0]


def genPageUrls(totalPages):
    print 'total pages is ' + totalPages
    urls = [baseUrl + str(i) + '.html' for i in range(1, int(totalPages) + 1)]
    return urls


def getPageDetail(urls):
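    # The original fragment ends at the def line above; this body is only a
    # sketch. It assumes detail links contain '/search/result/' (the pattern seen
    # in the chengxinhei detail script) and that storing 'user' and 'url' fields
    # is what the detail crawler expects; both are assumptions.
    for pageUrl in urls:
        logger.info('crawling list page:\t' + pageUrl)
        soup = BeautifulSoup(requests.get(pageUrl).text, 'lxml')
        for a in soup.findAll('a'):
            href = a.get('href', '')
            if '/search/result/' in href:
                table.insert_one({'user': a.get_text(strip=True),
                                  'url': URL + href})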
# -*- coding: utf-8 -*-
import requests
from bs4 import BeautifulSoup
import time
from util import initLogger
from util import initDB

logger = initLogger('log.conf', 'dlmLogger')
ygdaiTable = initDB('fintech', 'ygdai_new')
ygdaiDetailTable = initDB('fintech', 'ygdaiDetail_new')


def getUserDetail(userInfo):
    # userInfo is a document from the ygdai list crawl with 'user' and 'url' fields.
    print userInfo['user'], userInfo['url']
    userName = userInfo['user']
    data = {u'姓名': userName}
    url = userInfo['url']
    logger.info('start to get user detail:\t' + url)
    html = requests.get(url)
    content = html.text
    soup = BeautifulSoup(content, 'lxml')
    info = soup.select('#div-content > div > div.mt20 > div.w500.fl.ml20')
    money = soup.select('div.mt5')
    pres = info[0].findAll('pre')
    infoList = []
    if pres:
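        # The original fragment ends at the if-statement above; the rest is a
        # sketch. It assumes each <pre> block holds the "key：value" text that the
        # ygdai_detail conversion script later splits on the fullwidth colon, and
        # that div.mt5 carries an amount; the u'金额' field name and the timestamp
        # format are assumptions.
        for pre in pres:
            infoList.append(pre.get_text())
        data[u'信息'] = infoList
        if money:
            data[u'金额'] = money[0].get_text(strip=True)
    data[u'抓取时间'] = time.strftime('%Y-%m-%d %H:%M:%S')
    ygdaiDetailTable.insert_one(data)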