def companyMsg(url): try: co_soup = html.getHtmlSoup(url, 'gb2312') contactdiv = co_soup.find('div', {'class': 'codebuy'}) if contactdiv is None or str(contactdiv).__len__() < 20: file.writefile('cn716_com_error.log', 'a+', url) return dic = { u'公 司 名:': None, u'公司地址:': None, u'所 在 地:': None, u'联系电话:': None, u'联 系 人:': None, u'手 机:': None, } tds = contactdiv.find_all('td') for i in xrange(0, len(tds)): title = tds[i].string if title is not None: title = title.strip() # print title if dic.has_key(title): tda = tds[i + 1].find_all('a') if len(tda) > 0: values = '' for a in tda: values += a.string dic[title] = values else: dic[title] = tds[i + 1].string # for key, value in dic.items(): # print key, value return dic except BaseException, e: print 'company page error' print url file.writefile('cn716_com_error.log', 'a+', str(url)) print e
def getCompanyMsg(url): try: soup = html.getHtmlSoup(url) dic = { u'公司名称:': None, u'公司地址:': None, u'所在地区:': None, u'公司电话:': None, u'公司传真:': None, u'电子邮件:': None, u'联 系 人:': None, u'部门(职位):': None, u'手机号码:': None } for tr in soup.find('div', {'class': 'px13 lh18'}).find_all('tr'): title = tr.find_all('td')[0].string if dic.has_key(title): dic[title] = tr.find_all('td')[1].string for key, value in dic.items(): print key, value return dic except BaseException, e: print e print 'company page error'
# -*-coding:utf-8-*- import myUtil.dbutil as database import myUtil.htmlUtil as html import myUtil.fileutil as file,time db = database.DB('192.168.1.193', 'skt', pwd='root', tb='company_jiayouzhuan') db.delete() # db.create("name", "area", "address", "trade", "mobile", "tele", "qq", "email") base = 'https://www.atobo.com.cn' urls = file.readfile('area_url.txt') for url in urls: print url file.writefile('area_url_log.txt', 'w', url) soup = html.getHtmlSoup(url) pages = soup.find('li', {'class': 'spagelist'}) cpage = pages.find_all('strong')[0].string page = pages.find_all('strong')[1].string print page lastpage = 0 for x in range(49, int(page) + 1): nurl = url[:-1] + '-y%d' % x nsoup = html.getHtmlSoup(nurl) div = nsoup.find('div',{'class':'product_contextlist bplist'}) for li in div.find_all('li',{'class':'pp_name'}): time.sleep(3) u1 = li.find_all('a')[0]['href']+'/WebSite/bexd122859-c13.html' u2 = li.find_all('a')[1]['href'] print u1,u2 detail = html.getHtmlSoup('http:'+u1) gs = html.getHtmlSoup('http:'+u2) title = gs.find('div',{'class':'cur_post'})
# -*-coding:utf-8-*- import myUtil.dbutil as database import myUtil.htmlUtil as html import myUtil.fileutil as file, time db = database.DB('192.168.1.105', 'skb', tb='company_qy6_823') db.create('name', 'scop', 'com_mode', 'business_mode', 'com_size', 'trade', 'regMoney', 'faRen' , 'regDate', 'person','phone', 'fax', 'mphone', 'address') lurl = file.readfile('qy6.txt') for url in lurl: try: urls = url.split(' ') hangye = urls[0] hurl = urls[1] soup = html.getHtmlSoup(hurl) tb = soup.select('body')[0].select('center')[1] td = tb.find_all('td') size = len(td) page = int(td[size - 1].find_all('strong')[3].string) for p in xrange(1, page + 1): purl = str(hurl).rstrip('qyC0101.html') + 'pqyC0101_p%d.html' % p psoup = html.getHtmlSoup(purl) tbs = psoup.select('body')[0].select('center')[1] tds = tbs.find_all('td') sizea = len(tds) for x in xrange(50, sizea): a = tds[x].find_all('a', {'target': '_blank'}) if len(a) > 0: url1 = a[0]['href'] hrefs = str(url1).split('comp') url2 = hrefs[0] + 'about' + hrefs[1]
# -*-coding:utf-8-*- import myUtil.htmlUtil as html import myUtil.dbutil as database db = database.DB('192.168.1.105', 'skb', tb='company_wy0823') db.create('companyName', 'address', 'phone', 'mphone', 'person', 'fax', 'categroy', 'area') try: strpage = str( html.getHtmlSoup('http://www.wyw.cn/companylist').find( 'div', {'id': 'fenye'})).strip() page = int(str(strpage[:-6]).strip()[-16:].strip().lstrip('共').rstrip('页')) print page for p in xrange(0, page): url = 'http://www.wyw.cn/companylist/Default.aspx?start=%d' % (p * 99) print url soup = html.getHtmlSoup(url) for li in soup.find_all('div', {'class': 'zuobox_contect'})[0].find_all('li'): curl = 'http://www.wyw.cn' + li.find('a')['href'] print curl csoup = html.getHtmlSoup(curl) ul = csoup.find('ul', {'class': 'lh20'}) if ul is None: continue else: uls = ul.find_all('li') # print uls companyname = uls[0].find('a').string address = uls[1].string.lstrip(u'公司地址: ')
# -*-coding:utf-8-*- import myUtil.dbutil as database import myUtil.htmlUtil as html import myUtil.fileutil as file, time # db = database.DB('192.168.1.105', 'skb', pwd='', tb='qiluwang') url = 'http://www.qy6.com/qyml' hsoup = html.getHtmlSoup(url) for a in hsoup.find_all('a', {'target': '_blank'}): hangye = a.string hurl = a['href'] if str(hurl).startswith('http://'): continue urls = hangye+' http://www.qy6.com'+hurl print urls file.writefile('qy6.txt', 'a+', urls.encode('utf8'))
# -*-coding:utf-8-*- import myUtil.dbutil as database import myUtil.htmlUtil as html import myUtil.fileutil as file, time db = database.DB('192.168.1.105', 'skb', tb='company_byw_824') db.create('name', 'address', 'area', 'phone', 'fax', 'person', 'mphone', 'com_mode', 'com_size', 'regMoney', 'regDate', 'business_mode', 'scop', 'trade') url = 'http://www.byf.com/b2b/dianqihangye/' soup = html.getHtmlSoup(url) hyangdiv = soup.find('div', {'class': 'clist'}) try: for a in hyangdiv.find_all('a'): hangye = a.string url = a['href'] while True: soup = html.getHtmlSoup(url) comli = soup.find('div', {'class': 'list'}) for li in comli.find_all('li'): print 'page' contacturl = li.find('div', {'class': 'dz'}).a['href'] crediturl = str(contacturl).replace('contact', 'credit') consoup = html.getHtmlSoup(contacturl) div = consoup.find('div', {'class': 'm-content'}) # ul = div.find_all('ul') dict = {u'公司地址:': None, u'公司电话:': None, u'公司传真:': None, u'联 系 人:': None, u'手机号码:': None} for ul in div.find_all('ul'): t = ul.find('li', {'class': 'cl'}).string if dict.has_key(t): dict[t] = ul.find('li', {'class': 'cr'}).string
# -*-coding:utf-8-*- import myUtil.htmlUtil as html # soup = html.getHtmlSoup('http://www.qy6.com/qyml/compzrzg14233863.html') soup = html.getHtmlSoup('http://www.qy6.com/qyml/compsdlmjq1234.html') ll = soup.select('body')[0].select('center') if len(ll) > 1: li = ll[1].find_all('td', {'align': 'center'})[1] l1 = li.find_all('tbody')[0] l2 = li.find_all('tbody')[1] l1s = l1.find_all('td') l2s = l2.find_all('td') else: hrefs = str('http://www.qy6.com/qyml/compsdlmjq1234.html').split('comp') url2 = hrefs[0] + 'about' + hrefs[1] url3 = hrefs[0] + 'con' + hrefs[1] soup = html.getHtmlSoup(url2).select('body')[0].select( 'center')[0].find_all('tbody')[0] l1s = soup.find_all('td') soup1 = html.getHtmlSoup(url3).select('body')[0].select( 'center')[0].find_all('tbody')[0] l2s = soup1.find_all('td') print str(l1s[0]).split('<br/>')[1].rstrip('</td>').strip() print str(l1s[1]).split('<br/>')[1].rstrip('</td>').strip() print str(l1s[2]).split('<br/>')[1].rstrip('</td>').strip() print str(l1s[3]).split('<br/>')[1].rstrip('</td>').strip() print str(l1s[4]).split('<br/>')[1].rstrip('</td>').strip() print str(l1s[5]).split('<br/>')[1].rstrip('</td>').strip() print str(l1s[6]).split('<br/>')[1].rstrip('</td>').strip() print str(l1s[7]).split('<br/>')[1].rstrip('</td>').strip()
# Crawler driver for cn716.com: reads "<trade> <category-url>" lines from
# cn716_category_url1.txt and scrapes every company in each category via
# company.companyMsg().
# NOTE(review): snippet appears truncated -- both try blocks lack a
# visible except clause, and `database` is used below but never imported
# in this view (run would raise NameError); confirm against the full file.
import myUtil.htmlUtil as html
import myUtil.fileutil as file, time
import sys, company, re

reload(sys)
sys.setdefaultencoding('gb2312')  # the site serves gb2312-encoded pages
db = database.DB('192.168.1.105', 'skb', tb='company_cn716_827')
db.create('name', 'scop', 'trade', 'area', 'person', 'phone', 'mphone',
          'address')
urltxt = file.readfile('cn716_category_url1.txt')
for txt in urltxt:
    # Each line: "<trade-name> <category-url>" separated by a space.
    trade = txt.split(' ')[0]
    urls = txt.split(' ')[1]
    soup = html.getHtmlSoup(urls, 'gb2312')
    sell = soup.find('div', class_='sell_1_b1_page')
    try:
        # The last 'read' span of the pager holds the final page number.
        page = int(sell.find_all('span', class_='read')[-1].string) + 1
        for p in xrange(1, page):
            purl = urls + '_%d' % p  # page url pattern: "<category>_<n>"
            print purl
            try:
                for li in html.getHtmlSoup(purl, 'gb2312').find_all(
                        'ul', class_='sell_new'):
                    url = 'http://www.cn716.com/' + li.find('li').a.get('href')
                    # Main-products text: slice the raw span HTML up to its
                    # first nested <a> tag.
                    manpros = li.find('li').find_all('span')[2]
                    scop = str(manpros).split('<a')[0].split('>')[1].strip()
                    dic = company.companyMsg(url)
# -*-coding:utf-8-*- import myUtil.dbutil as database import myUtil.htmlUtil as html import myUtil.fileutil as file, time import sys reload(sys) sys.setdefaultencoding("utf-8") for p in xrange(1, 33): url = 'http://www.cn716.com/company%d' % p soup = html.getHtmlSoup(url, 'GB2312') divs = soup.find_all('span', {'class': 'class2_1x'}) for dd in divs: cate = dd.a.string curl = dd.find('a')['href'] if str(cate).startswith('未分类') or str(cate).startswith('其他'): file.writefile('cn716_error_url.txt', 'a+', str(cate.lstrip('.') + ' ' + 'http://www.cn716.com/' + curl)) continue print cate, 'http://www.cn716.com/' + curl file.writefile('cn716_category_url1.txt', 'a+', str(cate.lstrip('.') + ' ' + 'http://www.cn716.com/' + curl))
# -*-coding:utf-8-*- import myUtil.dbutil as database import myUtil.htmlUtil as html import myUtil.fileutil as file, time # db = database.DB('192.168.1.105', 'skb', pwd='', tb='qiluwang') for area in xrange(1, 32): url = 'http://www.76330.com/list-%d-1.html' % area psoup = html.getHtmlSoup(url) pl = psoup.find('ul', {'class': 'pagelist'}) page = int(pl.find('b').string) / 10 + 1 print page for p in xrange(0, page): urlp = 'http://www.76330.com/list-%d-%d.html' % (area, p) print url soup = html.getHtmlSoup(urlp) try: for li in soup.find_all('a', {'class': 'title'}): curl = li['href'] companyname = li.string csoup = html.getHtmlSoup(curl) div = csoup.find('div', {'class': 'base0910'}) print '' except: print '' break break
# -*-coding:utf-8-*- import myUtil.dbutil as database import myUtil.htmlUtil as html import myUtil.fileutil as file, time import sys, company reload(sys) sys.setdefaultencoding("utf-8") db = database.DB('192.168.1.105', 'skb', tb='company_qy6_823') db.create('name', 'address', 'area', 'phone', 'fax', 'trade', 'email', 'person', 'mphone', 'position') url = 'http://www.tonbao.com/company/' soup = html.getHtmlSoup(url) divs = soup.find('div', { 'class': 'left_box' }).find('div', {'class': 'catalog'}) trade = '' purl = '' for td in divs.find_all('td'): try: lurl = td.find('p').find('a')['href'] trade = td.find('p').a.strong.span.string print lurl lsoup = html.getHtmlSoup(lurl) page = int( str(lsoup.find('div', { 'class': 'pages' }).cite.string).split('/')[1].strip(u'页')) for p in xrange(1, page + 1):
# Crawler for qiye.net: iterates city codes 001002..001034 and pages each
# city's company list until the list container disappears.
# NOTE(review): snippet appears truncated mid-statement at the end, and
# `database`, `html` and `files` are used without visible imports in this
# view -- confirm against the full file.
import sys

reload(sys)
sys.setdefaultencoding('utf8')
db = database.DB('192.168.1.193', 'skt', pwd='root', tb='company_qiye0712')
db.create('company_name', 'address', 'hangye', 'area', 'person', 'mobile')
for i in xrange(1002, 1035):
    m = "%06d" % i  # zero-padded city code, e.g. "001002"
    x = 1
    while True:
        # NOTE(review): '%x' renders the page number in HEX -- pages 10+
        # become 'a', 'b', ...; '%d' was probably intended. Confirm.
        url = 'http://www.qiye.net/company_pr%s-p%x' % (m, x)
        print url
        x += 1
        try:
            soup = html.getHtmlSoup(url)
            lis = soup.find('ul', {'class': 'companyList'})
            if lis is None:
                # No list container: past the last page for this city.
                files.writefile('city.log', 'w', m)
                break
            # Area name is the text after the second </em> in the crumbs.
            areas = soup.find('div', {'class': 'crumbs'})
            area = str(areas).split('</em>')[2].rstrip(' </div>')
            for li in lis.find_all('li'):
                name = li.find('strong').find('a')['title']
                address = str(li.find_all('dl')[0].find_all('dd')
                              [3].string).strip(u'企业地址:')
                hangye = str(li.find_all('dl')[1].find_all('dd')
                             [1].string).strip(u'主营行业:')
                # NOTE(review): str(...) never yields None, so this guard
                # can never fire; 'None' string check was likely intended.
                if hangye is None:
                    continue
                urls = 'http://www.qiye.net' + li.find_all('dl')[1].find(
# -*-coding:utf-8-*- import myUtil.htmlUtil as html soup = html.getHtmlSoup('http://www.wyw.cn/companylist/345345/') print soup.find('ul', {'class': 'lh20'})