Example #1
# -*- coding: utf-8 -*-
# __author__ = 'jufu'

from util import initLogger
from util import initDB

logger = initLogger('log.conf', 'dlmLogger')
#logger.info('dlm logger error')

if __name__ == '__main__':

    table = initDB('fintech', 'ygdai_detail')
    newTable = initDB('fintech', 'new_ygdai_detail')
    for user in table.find():

        data = {}
        for userString in user[u'信息']:

            for s in userString.split('\n'):
                # each line looks like "<key>：<value>"; split on the
                # full-width colon once, and skip malformed lines instead
                # of letting them raise IndexError
                parts = s.split(u'\uff1a', 1)
                if len(parts) != 2:
                    continue
                key, value = parts[0].strip(), parts[1].strip()
                data[key] = value

        data[u'抓取时间'] = user[u'抓取时间']
        data[u'总逾期借款笔数'] = user[u'总逾期借款笔数']
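
    # NOTE: the original snippet is truncated here; presumably each
    # flattened record is written to the new collection, as the commented
    # insert above suggests.
    newTable.insert_one(data)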
Example #2
# -*- coding: utf-8 -*-
# __author__ = 'jufu'

import requests
from bs4 import BeautifulSoup
from util import initLogger
from util import initDB

logger = initLogger('log.conf', 'dlmLogger')
table = initDB('fintech', 'ygdai_new')

URL = 'http://www.ygdai.com/s/blacklist.html'
domain = 'http://www.ygdai.com'
baseUrl = 'http://www.ygdai.com/s/blacklist/page/'


def getTotalPages(url):
    logger.info('start to get total pages...')
    logger.info(url)
    html = requests.get(url)
    content = html.text
    soup = BeautifulSoup(content, 'lxml')
    # the "last" pagination link ends in .../page/<N>.html; pull out <N>
    link = soup.select('#yw0 > li.last > a')
    hrefStr = link[0]['href']
    logger.info('getTotalPages end...')
    return hrefStr.split('/')[-1].split('.')[0]


def genPageUrls(totalPages):
    urls = [baseUrl + str(i) + '.html' for i in range(1, int(totalPages) + 1)]
    return urls
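

# A minimal driver (not in the original snippet) showing how the two
# helpers above compose.
if __name__ == '__main__':
    totalPages = getTotalPages(URL)
    for pageUrl in genPageUrls(totalPages):
        logger.info('page url: ' + pageUrl)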
Example #3
def main():
    # createDBconnection() and server() are defined elsewhere in this module
    conn = createDBconnection()
    initDB(conn)
    conn.close()

    server()
Example #4
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
import sys
import urllib2

from util import initLogger
from util import initDB

baseUrl = 'http://www.p2p12580.com/blacklist.asp?'

logger = initLogger('log.conf', 'dlmLogger')
table = initDB('fintech', 'p2p12580_new')

def getDetail(url):
    pass


def gettotalPages(startUrl):
    # example: http://www.p2p12580.com/blacklist.asp?page=1&id=1&strKeyWord=
    # the site is GBK-encoded, so decode the raw bytes explicitly
    html = urllib2.urlopen(startUrl).read().decode('gbk')
    soup = BeautifulSoup(html, 'lxml')
    # the pagination info sits deep inside the page's nested table layout
    print soup.select('#__01 > tbody > tr:nth-of-type(6) > td:nth-of-type(2) > table > tbody > tr:nth-of-type(39) > td')

if __name__ == '__main__':
    print 'please open http://www.p2p12580.com/blacklist.asp'
    id = raw_input('please enter id:')
    if id.isdigit():
        startUrl = baseUrl + 'page=1&id=' + id + '&strKeyWord='
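        # NOTE: the original snippet is truncated here; presumably the
        # scrape starts from this URL.
        gettotalPages(startUrl)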
Example #5
# -*- coding: utf-8 -*-
# __author__ = 'jufu'

import requests
from bs4 import BeautifulSoup

from util import initLogger
from util import initDB

domainUrl = 'http://www.dailianmeng.com'
URL = 'http://www.dailianmeng.com/p2pblacklist/index.html'
baseUrl = 'http://www.dailianmeng.com/p2pblacklist/index.html?ajax=yw0&P2pBlacklist_page='

logger = initLogger('log.conf', 'dlmLogger')
table = initDB('fintech', 'dailianmeng_new')


def getTotalPages(url):
    logger.info('start to get total pages...')
    logger.info(url)
    html = requests.get(url)
    content = html.text
    soup = BeautifulSoup(content, 'lxml')
    # the "last" pagination link ends in ...P2pBlacklist_page=<N>
    link = soup.select('#yw1 > li.last > a')
    hrefStr = link[0]['href']
    logger.info('getTotalPages end...')
    return hrefStr.split('=')[-1]


def genPageUrls(totalPages):
    urls = [baseUrl + str(i) for i in range(1, int(totalPages) + 1)]
    return urls
Example #6

# NOTE: the original snippet starts mid-dict; the opening line is
# reconstructed from the data[key] lookups in the main block below.
# Maps Chinese source field names to English column names.
data = {
    u'上传时间': 'Case_Exposed_Date',
    u'信息来源更新时间': 'Case_Exposed_Date',
    u'信息来源': 'Info_Source',
    u'信息来源URL': 'Info_source_URL',
    u'信息来源网址': 'Info_source_URL',
    u'抓取时间': 'Info_Captured_TS',
    u'创建日期': 'Created_Date',
    u'修改日期': 'Last_updated_date',
}

if __name__ == '__main__':
    tableName = raw_input('please enter the table you want to convert.....\n')

    logger.info('connect to table ' + tableName)
    table = initDB('fintech', tableName)
    jsonTable = initDB('fintech', tableName + '_json')

    for t in table.find():
        jsonData = {}
        for key in t.keys():
            # skip the Mongo id and the two fields with no English mapping
            if key not in ('_id', u'妻子电话', u'丈夫'):
                jsonKey = data[key]
                jsonValue = t[key]
                print jsonKey, jsonValue
                jsonData[jsonKey] = jsonValue

        jsonTable.insert_one(jsonData)
Example #7
import requests
from bs4 import BeautifulSoup

from util import initLogger
from util import initDB

logger = initLogger('log.conf', 'dlmLogger')
table = initDB('fintech', 'cxhDetail_new')


def getUserDetail(url):
    html = requests.get(url)
    content = html.text
    soup = BeautifulSoup(content, 'lxml')
    print soup


if __name__ == '__main__':
    url = 'http://www.chengxinhei.com/search/result/id/2388.html'
    getUserDetail(url)
Example #8
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
import urllib2
import time

from util import initLogger
from util import initDB

logger = initLogger('log.conf', 'dlmLogger')
mongoTable = initDB('fintech', 'dlmDetail_new')


def getUserDetail(userInfo):
    url = userInfo['url']
    userName = userInfo['user']
    logger.info('start to get user detail:\t' + url)

    # NOTE: the next few lines were garbled in the original
    # ("print 'crawl user:'******'lxml')"); the fetch/parse steps are
    # reconstructed to match the other examples in this file.
    print 'crawl user:', userName
    html = requests.get(url)
    content = html.text
    soup = BeautifulSoup(content, 'lxml')

    # the detail page renders each field as a row of its first table
    table = soup.table
    trs = table.findAll('tr')
    data = {u'姓名': userName}
    for tr in trs:
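        # NOTE: truncated in the original; a plausible body following the
        # key/value pattern used elsewhere in this file.
        tds = tr.findAll('td')
        if len(tds) >= 2:
            key = tds[0].get_text().strip()
            value = tds[1].get_text().strip()
            data[key] = value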
Example #9
# -*- coding: utf-8 -*-
# __author__ = 'jufu'
import requests
from bs4 import BeautifulSoup

from util import initLogger
from util import initDB

logger = initLogger('log.conf', 'dlmLogger')
table = initDB('fintech', 'chengxinhei_new')

baseUrl = 'http://www.chengxinhei.com/search/list/page/'
URL = 'http://www.chengxinhei.com'


def getTotalPages(url):
    html = requests.get(url)
    content = html.text
    soup = BeautifulSoup(content, 'lxml')
    # the "last" pagination link looks like .../search/list/page/<N>.html;
    # take the final path segment so the index doesn't depend on whether
    # the href is absolute or relative
    lastNumber = soup.find('li', class_='last')
    hrefStr = lastNumber.a['href']
    return hrefStr.split('/')[-1].split('.')[0]


def genPageUrls(totalPages):
    print 'total pages is ' + totalPages
    urls = [baseUrl + str(i) + '.html' for i in range(1, int(totalPages) + 1)]
    return urls


def getPageDetail(urls):
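    # NOTE: the original snippet is truncated here; a minimal sketch of the
    # per-page loop, modeled on the other crawlers in this file.
    for url in urls:
        html = requests.get(url)
        soup = BeautifulSoup(html.text, 'lxml')
        logger.info('fetched page: ' + url)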
Example #10
# -*- coding: utf-8 -*-

import requests
from bs4 import BeautifulSoup
import time

from util import initLogger
from util import initDB

logger = initLogger('log.conf', 'dlmLogger')
ygdaiTable = initDB('fintech', 'ygdai_new')
ygdaiDetailTable = initDB('fintech', 'ygdaiDetail_new')


def getUserDetail(userInfo):
    print userInfo['user'], userInfo['url']
    userName = userInfo['user']
    data = {u'姓名': userName}

    url = userInfo['url']
    logger.info('start to get user detail:\t' + url)

    html = requests.get(url)
    content = html.text
    soup = BeautifulSoup(content, 'lxml')
    info = soup.select('#div-content > div > div.mt20 > div.w500.fl.ml20')
    money = soup.select('div.mt5')
    pres = info[0].findAll('pre')
    # findAll returns a (possibly empty) list, so a truthiness check is the
    # idiomatic guard; `items` avoids shadowing the builtin list
    items = []
    if pres:
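        # NOTE: truncated in the original; a plausible continuation that
        # collects the text of each <pre> block.
        for pre in pres:
            items.append(pre.get_text().strip())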