def companyMsg(url): try: co_soup = html.getHtmlSoup(url, 'gb2312') contactdiv = co_soup.find('div', {'class': 'codebuy'}) if contactdiv is None or str(contactdiv).__len__() < 20: file.writefile('cn716_com_error.log', 'a+', url) return dic = { u'公 司 名:': None, u'公司地址:': None, u'所 在 地:': None, u'联系电话:': None, u'联 系 人:': None, u'手 机:': None, } tds = contactdiv.find_all('td') for i in xrange(0, len(tds)): title = tds[i].string if title is not None: title = title.strip() # print title if dic.has_key(title): tda = tds[i + 1].find_all('a') if len(tda) > 0: values = '' for a in tda: values += a.string dic[title] = values else: dic[title] = tds[i + 1].string # for key, value in dic.items(): # print key, value return dic except BaseException, e: print 'company page error' print url file.writefile('cn716_com_error.log', 'a+', str(url)) print e
def getCompanyMsg(url): try: soup = html.getHtmlSoup(url) dic = { u'公司名称:': None, u'公司地址:': None, u'所在地区:': None, u'公司电话:': None, u'公司传真:': None, u'电子邮件:': None, u'联 系 人:': None, u'部门(职位):': None, u'手机号码:': None } for tr in soup.find('div', {'class': 'px13 lh18'}).find_all('tr'): title = tr.find_all('td')[0].string if dic.has_key(title): dic[title] = tr.find_all('td')[1].string for key, value in dic.items(): print key, value return dic except BaseException, e: print e print 'company page error'
# -*-coding:utf-8-*- import myUtil.dbutil as database import myUtil.htmlUtil as html import myUtil.fileutil as file,time db = database.DB('192.168.1.193', 'skt', pwd='root', tb='company_jiayouzhuan') db.delete() # db.create("name", "area", "address", "trade", "mobile", "tele", "qq", "email") base = 'https://www.atobo.com.cn' urls = file.readfile('area_url.txt') for url in urls: print url file.writefile('area_url_log.txt', 'w', url) soup = html.getHtmlSoup(url) pages = soup.find('li', {'class': 'spagelist'}) cpage = pages.find_all('strong')[0].string page = pages.find_all('strong')[1].string print page lastpage = 0 for x in range(49, int(page) + 1): nurl = url[:-1] + '-y%d' % x nsoup = html.getHtmlSoup(nurl) div = nsoup.find('div',{'class':'product_contextlist bplist'}) for li in div.find_all('li',{'class':'pp_name'}): time.sleep(3) u1 = li.find_all('a')[0]['href']+'/WebSite/bexd122859-c13.html' u2 = li.find_all('a')[1]['href'] print u1,u2 detail = html.getHtmlSoup('http:'+u1) gs = html.getHtmlSoup('http:'+u2) title = gs.find('div',{'class':'cur_post'})
# -*-coding:utf-8-*- import myUtil.dbutil as database import myUtil.htmlUtil as html import myUtil.fileutil as file, time db = database.DB('192.168.1.105', 'skb', tb='company_qy6_823') db.create('name', 'scop', 'com_mode', 'business_mode', 'com_size', 'trade', 'regMoney', 'faRen' , 'regDate', 'person','phone', 'fax', 'mphone', 'address') lurl = file.readfile('qy6.txt') for url in lurl: try: urls = url.split(' ') hangye = urls[0] hurl = urls[1] soup = html.getHtmlSoup(hurl) tb = soup.select('body')[0].select('center')[1] td = tb.find_all('td') size = len(td) page = int(td[size - 1].find_all('strong')[3].string) for p in xrange(1, page + 1): purl = str(hurl).rstrip('qyC0101.html') + 'pqyC0101_p%d.html' % p psoup = html.getHtmlSoup(purl) tbs = psoup.select('body')[0].select('center')[1] tds = tbs.find_all('td') sizea = len(tds) for x in xrange(50, sizea): a = tds[x].find_all('a', {'target': '_blank'}) if len(a) > 0: url1 = a[0]['href'] hrefs = str(url1).split('comp') url2 = hrefs[0] + 'about' + hrefs[1]
# -*-coding:utf-8-*- import myUtil.htmlUtil as html import myUtil.dbutil as database db = database.DB('192.168.1.105', 'skb', tb='company_wy0823') db.create('companyName', 'address', 'phone', 'mphone', 'person', 'fax', 'categroy', 'area') try: strpage = str( html.getHtmlSoup('http://www.wyw.cn/companylist').find( 'div', {'id': 'fenye'})).strip() page = int(str(strpage[:-6]).strip()[-16:].strip().lstrip('共').rstrip('页')) print page for p in xrange(0, page): url = 'http://www.wyw.cn/companylist/Default.aspx?start=%d' % (p * 99) print url soup = html.getHtmlSoup(url) for li in soup.find_all('div', {'class': 'zuobox_contect'})[0].find_all('li'): curl = 'http://www.wyw.cn' + li.find('a')['href'] print curl csoup = html.getHtmlSoup(curl) ul = csoup.find('ul', {'class': 'lh20'}) if ul is None: continue else: uls = ul.find_all('li') # print uls companyname = uls[0].find('a').string address = uls[1].string.lstrip(u'公司地址: ')
# -*-coding:utf-8-*- import myUtil.dbutil as database import myUtil.htmlUtil as html import myUtil.fileutil as file, time # db = database.DB('192.168.1.105', 'skb', pwd='', tb='qiluwang') url = 'http://www.qy6.com/qyml' hsoup = html.getHtmlSoup(url) for a in hsoup.find_all('a', {'target': '_blank'}): hangye = a.string hurl = a['href'] if str(hurl).startswith('http://'): continue urls = hangye+' http://www.qy6.com'+hurl print urls file.writefile('qy6.txt', 'a+', urls.encode('utf8'))
# -*-coding:utf-8-*- import myUtil.dbutil as database import myUtil.htmlUtil as html import myUtil.fileutil as file, time db = database.DB('192.168.1.105', 'skb', tb='company_byw_824') db.create('name', 'address', 'area', 'phone', 'fax', 'person', 'mphone', 'com_mode', 'com_size', 'regMoney', 'regDate', 'business_mode', 'scop', 'trade') url = 'http://www.byf.com/b2b/dianqihangye/' soup = html.getHtmlSoup(url) hyangdiv = soup.find('div', {'class': 'clist'}) try: for a in hyangdiv.find_all('a'): hangye = a.string url = a['href'] while True: soup = html.getHtmlSoup(url) comli = soup.find('div', {'class': 'list'}) for li in comli.find_all('li'): print 'page' contacturl = li.find('div', {'class': 'dz'}).a['href'] crediturl = str(contacturl).replace('contact', 'credit') consoup = html.getHtmlSoup(contacturl) div = consoup.find('div', {'class': 'm-content'}) # ul = div.find_all('ul') dict = {u'公司地址:': None, u'公司电话:': None, u'公司传真:': None, u'联 系 人:': None, u'手机号码:': None} for ul in div.find_all('ul'): t = ul.find('li', {'class': 'cl'}).string if dict.has_key(t): dict[t] = ul.find('li', {'class': 'cr'}).string
# -*-coding:utf-8-*- import myUtil.htmlUtil as html # soup = html.getHtmlSoup('http://www.qy6.com/qyml/compzrzg14233863.html') soup = html.getHtmlSoup('http://www.qy6.com/qyml/compsdlmjq1234.html') ll = soup.select('body')[0].select('center') if len(ll) > 1: li = ll[1].find_all('td', {'align': 'center'})[1] l1 = li.find_all('tbody')[0] l2 = li.find_all('tbody')[1] l1s = l1.find_all('td') l2s = l2.find_all('td') else: hrefs = str('http://www.qy6.com/qyml/compsdlmjq1234.html').split('comp') url2 = hrefs[0] + 'about' + hrefs[1] url3 = hrefs[0] + 'con' + hrefs[1] soup = html.getHtmlSoup(url2).select('body')[0].select( 'center')[0].find_all('tbody')[0] l1s = soup.find_all('td') soup1 = html.getHtmlSoup(url3).select('body')[0].select( 'center')[0].find_all('tbody')[0] l2s = soup1.find_all('td') print str(l1s[0]).split('<br/>')[1].rstrip('</td>').strip() print str(l1s[1]).split('<br/>')[1].rstrip('</td>').strip() print str(l1s[2]).split('<br/>')[1].rstrip('</td>').strip() print str(l1s[3]).split('<br/>')[1].rstrip('</td>').strip() print str(l1s[4]).split('<br/>')[1].rstrip('</td>').strip() print str(l1s[5]).split('<br/>')[1].rstrip('</td>').strip() print str(l1s[6]).split('<br/>')[1].rstrip('</td>').strip() print str(l1s[7]).split('<br/>')[1].rstrip('</td>').strip()
# Crawler driver for cn716.com: reads "<trade> <category-url>" lines from
# cn716_category_url1.txt and scrapes every company in each category via
# company.companyMsg().
# NOTE(review): snippet appears truncated -- both try blocks lack a
# visible except clause, and `database` is used below but never imported
# in this view (run would raise NameError); confirm against the full file.
import myUtil.htmlUtil as html
import myUtil.fileutil as file, time
import sys, company, re

reload(sys)
sys.setdefaultencoding('gb2312')  # the site serves gb2312-encoded pages
db = database.DB('192.168.1.105', 'skb', tb='company_cn716_827')
db.create('name', 'scop', 'trade', 'area', 'person', 'phone', 'mphone',
          'address')
urltxt = file.readfile('cn716_category_url1.txt')
for txt in urltxt:
    # Each line: "<trade-name> <category-url>" separated by a space.
    trade = txt.split(' ')[0]
    urls = txt.split(' ')[1]
    soup = html.getHtmlSoup(urls, 'gb2312')
    sell = soup.find('div', class_='sell_1_b1_page')
    try:
        # The last 'read' span of the pager holds the final page number.
        page = int(sell.find_all('span', class_='read')[-1].string) + 1
        for p in xrange(1, page):
            purl = urls + '_%d' % p  # page url pattern: "<category>_<n>"
            print purl
            try:
                for li in html.getHtmlSoup(purl, 'gb2312').find_all(
                        'ul', class_='sell_new'):
                    url = 'http://www.cn716.com/' + li.find('li').a.get('href')
                    # Main-products text: slice the raw span HTML up to its
                    # first nested <a> tag.
                    manpros = li.find('li').find_all('span')[2]
                    scop = str(manpros).split('<a')[0].split('>')[1].strip()
                    dic = company.companyMsg(url)
# -*-coding:utf-8-*- import myUtil.dbutil as database import myUtil.htmlUtil as html import myUtil.fileutil as file, time import sys reload(sys) sys.setdefaultencoding("utf-8") for p in xrange(1, 33): url = 'http://www.cn716.com/company%d' % p soup = html.getHtmlSoup(url, 'GB2312') divs = soup.find_all('span', {'class': 'class2_1x'}) for dd in divs: cate = dd.a.string curl = dd.find('a')['href'] if str(cate).startswith('未分类') or str(cate).startswith('其他'): file.writefile('cn716_error_url.txt', 'a+', str(cate.lstrip('.') + ' ' + 'http://www.cn716.com/' + curl)) continue print cate, 'http://www.cn716.com/' + curl file.writefile('cn716_category_url1.txt', 'a+', str(cate.lstrip('.') + ' ' + 'http://www.cn716.com/' + curl))
# -*-coding:utf-8-*- import myUtil.dbutil as database import myUtil.htmlUtil as html import myUtil.fileutil as file, time # db = database.DB('192.168.1.105', 'skb', pwd='', tb='qiluwang') for area in xrange(1, 32): url = 'http://www.76330.com/list-%d-1.html' % area psoup = html.getHtmlSoup(url) pl = psoup.find('ul', {'class': 'pagelist'}) page = int(pl.find('b').string) / 10 + 1 print page for p in xrange(0, page): urlp = 'http://www.76330.com/list-%d-%d.html' % (area, p) print url soup = html.getHtmlSoup(urlp) try: for li in soup.find_all('a', {'class': 'title'}): curl = li['href'] companyname = li.string csoup = html.getHtmlSoup(curl) div = csoup.find('div', {'class': 'base0910'}) print '' except: print '' break break
# -*-coding:utf-8-*- import myUtil.dbutil as database import myUtil.htmlUtil as html import myUtil.fileutil as file, time import sys, company reload(sys) sys.setdefaultencoding("utf-8") db = database.DB('192.168.1.105', 'skb', tb='company_qy6_823') db.create('name', 'address', 'area', 'phone', 'fax', 'trade', 'email', 'person', 'mphone', 'position') url = 'http://www.tonbao.com/company/' soup = html.getHtmlSoup(url) divs = soup.find('div', { 'class': 'left_box' }).find('div', {'class': 'catalog'}) trade = '' purl = '' for td in divs.find_all('td'): try: lurl = td.find('p').find('a')['href'] trade = td.find('p').a.strong.span.string print lurl lsoup = html.getHtmlSoup(lurl) page = int( str(lsoup.find('div', { 'class': 'pages' }).cite.string).split('/')[1].strip(u'页')) for p in xrange(1, page + 1):
# Crawler for qiye.net: iterates city codes 001002..001034 and pages each
# city's company list until the list container disappears.
# NOTE(review): snippet appears truncated mid-statement at the end, and
# `database`, `html` and `files` are used without visible imports in this
# view -- confirm against the full file.
import sys

reload(sys)
sys.setdefaultencoding('utf8')
db = database.DB('192.168.1.193', 'skt', pwd='root', tb='company_qiye0712')
db.create('company_name', 'address', 'hangye', 'area', 'person', 'mobile')
for i in xrange(1002, 1035):
    m = "%06d" % i  # zero-padded city code, e.g. "001002"
    x = 1
    while True:
        # NOTE(review): '%x' renders the page number in HEX -- pages 10+
        # become 'a', 'b', ...; '%d' was probably intended. Confirm.
        url = 'http://www.qiye.net/company_pr%s-p%x' % (m, x)
        print url
        x += 1
        try:
            soup = html.getHtmlSoup(url)
            lis = soup.find('ul', {'class': 'companyList'})
            if lis is None:
                # No list container: past the last page for this city.
                files.writefile('city.log', 'w', m)
                break
            # Area name is the text after the second </em> in the crumbs.
            areas = soup.find('div', {'class': 'crumbs'})
            area = str(areas).split('</em>')[2].rstrip(' </div>')
            for li in lis.find_all('li'):
                name = li.find('strong').find('a')['title']
                address = str(li.find_all('dl')[0].find_all('dd')
                              [3].string).strip(u'企业地址:')
                hangye = str(li.find_all('dl')[1].find_all('dd')
                             [1].string).strip(u'主营行业:')
                # NOTE(review): str(...) never yields None, so this guard
                # can never fire; 'None' string check was likely intended.
                if hangye is None:
                    continue
                urls = 'http://www.qiye.net' + li.find_all('dl')[1].find(
# -*-coding:utf-8-*- import myUtil.htmlUtil as html soup = html.getHtmlSoup('http://www.wyw.cn/companylist/345345/') print soup.find('ul', {'class': 'lh20'})