def fetch(head):
    # Fetch every note linked from a paginated douban page and write them to <title>.txt.
    soup = getSoup(head)
    t = soup.find('h1')
    title = t.find(text=True)
    print title
    ans = []
    for i in xrange(0, 10000, 10):
        link = head + '?start=' + str(i)
        soup = getSoup(link)
        notes = []
        for x in soup.findAll('span', {'class': 'rec'}):
            if not x.has_key('id'):
                continue
            if x['id'][:5] != 'Note-':
                continue
            notes.append(x['id'][5:])
        if notes == []:
            break
        for note in notes:
            soup = getSoup('http://www.douban.com/note/' + note)
            note_title = soup.find('title').find(text=True)
            article = soup.find('div', {'class': "note", 'id': "link-report"})
            content = note_title + '\n'.join(map(clean, article.findAll(text=True))) + '\n\n'
            ans.append(content)
    g = open(title + '.txt', 'w')
    g.write('\n'.join(ans))
    g.close()

def runoob():
    url = 'http://www.runoob.com/python3/python3-tutorial.html'
    soup = getSoup.getSoup(url)
    urls = soup.select('div#leftcolumn a')
    for url in urls:
        url = url.get('href')
        url = 'http://www.runoob.com' + url
        print(url)
        soup = getSoup.getSoup(url)
        articleBody = soup.select('.article-intro')
        for articleBody in articleBody:
            articleBody = articleBody.get_text()
            saveDoc.saveDocs('runoob菜鸟教程.doc', articleBody)

def addDom(link, loc, year):
    # Parse domestic results from an RSSSF page and append rows to the module-level ans list.
    soup = getSoup(link)
    for x in soup.findAll('pre'):
        time = None
        time1 = None
        time2 = None
        x = str(x).split('\n')
        for y in x:
            y = y.strip()
            if date(y):
                time = y[1:-1]
            elif data(y):
                t1, t2, p1, p2 = data(y)
                ans.append([t1, p1 - p2, year, time, t2, loc, 'home', 'domestic'])
                ans.append([t2, p2 - p1, year, time, t1, loc, 'away', 'domestic'])
            if date2(y):
                time1, time2 = date2(y)
            elif data2(y):
                t1, t2, p1, p2, p3, p4 = data2(y)
                ans.append([t1, p1 - p2, year, time1, t2, loc, 'home', 'domestic'])
                ans.append([t2, p2 - p1, year, time1, t1, loc, 'away', 'domestic'])
                ans.append([t1, p3 - p4, year, time2, t2, loc, 'home', 'domestic'])
                ans.append([t2, p4 - p3, year, time2, t1, loc, 'away', 'domestic'])

def addInt(link, year):
    # Parse international results (two-leg ties) and append rows to the module-level ans list.
    soup = getSoup(link)
    for x in soup.findAll('pre'):
        time1 = None
        time2 = None
        x = str(x).split('\n')
        for y in x:
            y = y.strip()
            if date2(y):
                time1, time2 = date2(y)
            elif data3(y):
                t1, t2, p1, p2, p3, p4 = data3(y)
                ans.append([t1, p1 - p2, year, time1, t2, 'international', 'home', 'international'])
                ans.append([t2, p2 - p1, year, time1, t1, 'international', 'away', 'international'])
                ans.append([t1, p3 - p4, year, time2, t2, 'international', 'home', 'international'])
                ans.append([t2, p4 - p3, year, time2, t1, 'international', 'away', 'international'])

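# The line-parsing helpers used by addDom and addInt (date, data, date2, data2, data3)
# are not defined in this snippet. As a rough illustration only, data(y) might parse a
# single-match line of the form "Team1 - Team2 2-1"; the regex and return shape below
# are assumptions about the RSSSF text layout, not the original implementation.
import re


def data(y):
    # Hypothetical parser: "Arsenal - Chelsea 2-1" -> ('Arsenal', 'Chelsea', 2, 1),
    # or None when the line is not a single-match result.
    m = re.match(r'^(.+?)\s+-\s+(.+?)\s+(\d+)-(\d+)\b', y)
    if not m:
        return None
    t1, t2, p1, p2 = m.groups()
    return t1.strip(), t2.strip(), int(p1), int(p2)
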
def makeExcel(sites):
    # Workbook() takes one, non-optional, argument,
    # which is the filename that we want to create.
    workbook = xlsxwriter.Workbook('ShopifyContacts.xlsx')
    # The workbook object is then used to add a new
    # worksheet via the add_worksheet() method.
    worksheet = workbook.add_worksheet()
    row = 1
    worksheet.write('A' + str(row), 'Website')
    worksheet.write('B' + str(row), 'Email')
    worksheet.write('C' + str(row), 'Instagram')
    worksheet.write('D' + str(row), 'Facebook')
    worksheet.write('E' + str(row), 'Twitter')
    worksheet.write('F' + str(row), 'Youtube')
    worksheet.write('G' + str(row), 'Pinterest')
    i = 0
    while len(sites) > i:
        row += 1
        worksheet.write('A' + str(row), sites[i])
        data = getSoup(sites[i])
        # data = removeDups(data)  # get rid of duplicate entries
        y = re.findall(r'[\w\.-]+@[\w\.-]+', data)  # get all emails
        if y:
            worksheet.write('B' + str(row), y[0])
            print('---Email---')
            print(y[0])
        else:
            worksheet.write('B' + str(row), 'N/A')
        x = re.findall(r'(https?://[^\s]+)', data)  # get all URLs
        x = removeDups(x)
        searchString(x, 'instagram.com', row, 'C', worksheet)
        searchString(x, 'facebook.com', row, 'D', worksheet)
        searchString(x, 'twitter.com', row, 'E', worksheet)
        searchString(x, 'youtube.com', row, 'F', worksheet)
        searchString(x, 'pinterest.com', row, 'G', worksheet)
        i += 1
    print('making excel file')
    workbook.close()

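# removeDups and searchString are called above but are not defined in this snippet.
# A minimal sketch of what they might look like, assuming searchString writes the first
# URL containing the given substring into the given column (both bodies are assumptions,
# not the original implementations):

def removeDups(items):
    # Hypothetical helper: drop duplicates while preserving first-seen order.
    seen = set()
    unique = []
    for item in items:
        if item not in seen:
            seen.add(item)
            unique.append(item)
    return unique


def searchString(urls, needle, row, col, worksheet):
    # Hypothetical helper: write the first URL containing `needle` into column `col`
    # of the current row, or 'N/A' when nothing matches.
    for url in urls:
        if needle in url:
            worksheet.write(col + str(row), url)
            return
    worksheet.write(col + str(row), 'N/A')
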
def get_info(url):
    # Generator: first yields the book title, then (chapter url, chapter title) pairs.
    index_page = getSoup(url, encode='gbk')
    book_title = index_page.find('title').find(text=True)[:-10]
    yield book_title
    for ch in index_page.findAll('a'):
        if not ch.has_key('href') or not ch['href'].startswith(Ch_Start):
            continue
        ch_url = ch['href'][20:-2]
        ch_title = title_clean(ch.find(text=True))
        yield ch_url, ch_title

def getplaylistids(url):
    soup = getSoup.getSoup(url)
    # print(soup)
    playlistids = []
    playlists = soup.select('a')
    print(playlists)
    for playlist in playlists:
        playlist = playlist.get('href')
        playlistids.append(playlist)
    return playlistids

def fetchNote((filename, info)):
    ans = []
    for url, name in info:
        soup = getSoup(url)
        note = soup.find('div', {'class': 'note-content'})
        if not note:
            note = soup.find('div', {'class': 'note', 'id': "link-report"})
        ans.append(soupToTxt(note, title=name))
    f = open(filename + '.txt', 'w')
    f.write(('\n\n\n\n' + '-' * 30 + '\n\n').join(ans))
    f.close()

def fetchInfo(url):
    filename = ''
    info = []
    for n in xrange(N):
        soup = getSoup(url + '?start=' + str(M * n))
        if not filename:
            filename = soup.find('title').find(text=True)
        titles = soup.findAll('div', {'class': "title"})
        if not titles:
            break  # no more
        for title in titles:
            link = title.find('a')
            info.append((link['href'], link.find(text=True).strip()))
    return (filename, info)

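# fetchInfo returns a (filename, info) tuple in exactly the shape fetchNote expects,
# so the two presumably compose as below; listing_url is a placeholder, not a URL
# taken from the original code.
listing_url = 'http://www.douban.com/people/example/notes'  # hypothetical entry page
fetchNote(fetchInfo(listing_url))
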
def fetchInfo(url):
    soup = getSoup(url)
    title = soup.find('title').find(text=True)
    info = []
    for img_field in soup.findAll('img'):
        if img_field.has_key('src') and img_field.has_key('alt'):
            name = img_field['alt']
            if len(name) > 13 or len(name) < 8:
                continue
            name = img_field.findParent('p').findPreviousSibling('p')
            name = ''.join(name.findAll(text=True))
            info.append((name, img_field['src']))
    return (title, info)

def getMp3Info(albumid):
    url = 'http://www.kugou.com/yy/album/single/' + str(albumid) + '.html'
    soup = getSoup.getSoup(url)
    hashs = soup.select('.songList a')
    loadMp3Hash = []
    for hashss in hashs:
        hash = hashss.get('data')
        # Split the string on '|' to extract the hash.
        mp3Hash = hash.split('|')[0]
        # print(hash.split('|')[0])
        loadMp3Hash.append(mp3Hash)
        # print(mp3Hash)
    return loadMp3Hash

def fetch_book(info):
    # Consume the get_info generator: the first item is the book title, the rest are chapters.
    book_title = info.next()
    old_title = ''
    content = []
    for url, ch_title in info:
        if ch_title == old_title:
            ch_title = ''
        else:
            old_title = ch_title
        ch = getSoup(url, encode='gbk').find('div', {'id': "content"})
        content.append(soupToTxt(ch, title=ch_title))
    f = open(book_title + '.txt', 'w')
    f.write(''.join(content))
    f.close()

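# get_info and fetch_book are built to compose: the generator's first yield is the book
# title and the remaining yields are (chapter url, chapter title) pairs. A likely call
# pattern, with a placeholder index URL rather than one from the original code:
fetch_book(get_info('http://example.com/book/index.html'))
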
def fetchInfo(homeUrl, type=None):
    homeSoup = getSoup(homeUrl)
    pageSoup = homeSoup
    info = []
    count = homeSoup.find('span', {'class': 'count'})
    if count:
        count = count.find(text=True)[2:-2]
        count = int(count)
    else:
        count = N  # only one page
    ind = len(homeSoup.findAll('h1')) - 1
    if ind > 1:
        ind = 1
    album_name = homeSoup.findAll('h1')[ind].find(text=True)
    if '-' in album_name:
        album_name = album_name.split('-')[1]
    album_name = album_name.replace("*", '')
    album_name = album_name.replace("/", '')
    album_name = album_name.split()[0]
    start = 0
    while True:
        photos = pageSoup.findAll('div', {'class': 'photo_wrap'})
        if len(photos) > N:
            print 'warning on photo number!'
        for photo in photos:
            aTag = photo.find('a', {'class': "photolst_photo"})
            if not aTag:
                continue
            name = aTag['title']
            url = photo.find('img')['src']
            url = url.replace('thumb', 'large')
            info.append((name, url))
        start += N
        if start > count:
            break
        page = getWebpage(homeUrl + '?start=' + str(start))
        pageSoup = BeautifulSoup(page)
    photos = homeSoup.findAll('span', {'class': "img"})
    if not photos:
        photos = homeSoup.findAll('a', {'class': "pic"})
    for photo in photos:
        img = photo.find('img')
        if not img:
            continue
        if not img.has_key('alt'):
            continue
        name = img['alt']
        if img.has_key('data-src'):
            url = img['data-src']
        else:
            url = img['src']
        url = url.replace('head', 'original')
        info.append((url, name))
    return (album_name, info)

def fetchInfo(url):
    soup = getSoup(url, coo=coo)
    title = soup.find('title').find(text=True).split()[-1]
    info = []
    for img_field in soup.findAll('a', {'class': "pic"}):
        img = img_field.find('img')
        if not img:
            continue
        if not img.has_key('alt'):
            continue
        name = img['alt']
        if img.has_key('data-src'):
            url = img['data-src']
        else:
            url = img['src']
        url = url.replace('head', 'original')
        info.append((name, url))
    return (title, info)

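# The fetchInfo variant above returns (title, info) where info holds (name, url) pairs.
# A plausible download step using the Python 2 urllib module, in the same style as the
# rest of this code; download_album and its file-naming scheme are illustrative only.
import os
import urllib


def download_album((title, info)):
    # Hypothetical consumer: save each image into a folder named after the album title.
    if not os.path.isdir(title):
        os.mkdir(title)
    for i, (name, url) in enumerate(info):
        ext = url.split('.')[-1]
        urllib.urlretrieve(url, os.path.join(title, '%03d_%s.%s' % (i, name, ext)))
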
def checkMessage():
    global last_check_time
    update = False
    page = getSoup(board_url, coo=coo)
    for message in page.findAll('li', {'class': "mbtrdot comment-item"}):
        pname_url = message.find('a')['href']
        pname = message.find('a').find(text=True)
        ptext = message.findAll(text=True)[1][1:].strip()
        ptime = message.find('span', {'class': 'pl'}).find(text=True)
        t = parser.parse(ptime)
        t = (t - datetime.datetime(1970, 1, 1)).total_seconds()
        if t < last_check_time or (('by' in ptext) and (u'小黄鸡' in ptext)):
            return
        last_check_time = t
        reply(pname, pname_url, ptext)
        update = True
    return update

def getBook(n):
    bookstore = []
    for i in range(1, 6):
        url = 'http://book.jd.com/booktop/0-0-0.html?category=3287-0-0-0-10003-' + str(i) + '#comfort'
        print(url)
        soup = getSoup.getSoup(url)
        books = soup.select('a.p-name')
        # print(books)
        for book in books:
            title = book.get("title")
            # title = book.get_text
            # print(title)
            bookstore.append(title.strip())
    # print(bookstore)
    return bookstore

def checkContact():
    page = getSoup(contact_url, coo=coo)
    contacts = page.findAll('li', {'class': 'clearfix'})
    update = False
    for contact in contacts:
        name = contact.find('a')['href']
        if name == 'http://www.douban.com/people/39500150/':
            break
        if contact.find('span', {'class': "user-cs"}):
            continue
        page_info = urllib2.build_opener()
        page_info.addheaders.append(('Cookie', coo))
        postData = 'ck=' + ck + '&people=' + contact['id'][1:]
        postData = postData.encode('utf8')
        req = urllib2.Request(addcontact_url, postData)
        page_info.open(req)
        postData = 'ck=' + ck + '&tag=195082&people=' + contact['id'][1:]
        postData = postData.encode('utf8')
        req = urllib2.Request(addtotag_url, postData)
        page_info.open(req)
        update = True
    return update

domPage = getSoup('http://www.rsssf.com/resultsp00.html')
for loc in ['England', 'Italy', 'Spain']:
    for x in domPage.findAll('a'):
        text = x.find(text=True)
        text = text.split()
        if len(text) != 2:
            continue
        if text[0] != loc:
            continue
        link = 'http://www.rsssf.com/' + x['href']
        addDom(link, loc, text[1])

# Database connection.
connectDB = connect_dataBase.ConnectDatabase()
get_conf = connectDB.get_conf('databases_conf.json')
conn, cur = connectDB.connect_db(get_conf["brazilCup"]["host"],
                                 get_conf["brazilCup"]["user"],
                                 get_conf["brazilCup"]["password"],
                                 get_conf["brazilCup"]["database"],
                                 get_conf["brazilCup"]["port"])

# url = 'http://worldcup.2014.163.com/playerrank/total/attPenGoal/'  # player totals
# url = 'http://worldcup.2014.163.com/playerrank/avg/attPenGoal/'  # player per-game averages
# url = 'http://worldcup.2014.163.com/teamrank/total/goals/'  # team totals
url = 'http://worldcup.2014.163.com/teamrank/avg/goals/'  # team per-game averages
soup = getSoup.getSoup(url)
trs = soup.select('tbody tr')
# print(tds)
length = len(trs)
# print(length)
players = []
for tr in trs:
    # print(row)
    player = []
    # print(len(tr))
    for td in tr:
        # Quote each cell value so it can be dropped into a SQL statement.
        tds = '\'' + str(td.string.strip()) + '\''
        # print(tds)
        # player.append(str(td.string.strip()))
        # if '' in player:

import json

import getSoup
import saveDoc

# times = time.strftime('%m%d')
# def getArticle():
# urls = open('urls.json', 'r', encoding='utf-8')
# print(urls)
with open('urls.json', 'r', encoding='utf-8') as f:
    urls = json.load(f)
# print(urls['sites'])
urls = urls['sites']
# print(urls)
ii = 1
for url in urls:
    url = url['url'].strip()
    # print(url)
    soup = getSoup.getSoup(url)
    # file = time.strftime('%m%d') + str(ii) + '.doc'
    # print(file)
    if url == 'http://www.cnblogs.com/':
        article = soup.select('#editor_pick_lnk')
        # for article in article:
        #     articleUrl = article.get('href')
        #     print(articleUrl)
        #     # articleReponse = requests.get(articleUrl)
        #     # articleReponse.raise_for_status()
        #     articleSoup = getSoup.getSoup(articleUrl)
        #     articleTitle = articleSoup.select('#cb_post_title_url')
        #     for articleTitle in articleTitle:
        #         articleTitle = articleTitle.get_text()
        #         print(articleTitle)
        #         file = articleTitle + '.doc'
