def catchEco(cursor, conn):
    for href in href_list:
        currenthref = href
        # Start looping through the pages
        count = 0
        while currenthref != '' \
                and currenthref != 'javascript:void(0);' \
                and currenthref != "javascript:alert('没有了');" \
                and count < 200:
            htmlcode = GetHTML.getHtml(currenthref)
            soup = BeautifulSoup(htmlcode, "html.parser")
            detail_list = soup.find_all('h3')
            for h in detail_list:
                GetWebContent.catch(unicode(h.a.get('href')), 'Economy', cursor, conn)
            div_nextpage = soup.find_all('a', class_="a_ts02")
            if len(div_nextpage) != 0:
                currenthref = div_nextpage[0].get('href')
            else:
                currenthref = ''
            count += 1
    # Also pick up economy articles linked from the home pages
    for href in href_home:
        htmlcode = GetHTML.getHtml(href)
        soup = BeautifulSoup(htmlcode, "html.parser")
        list1 = soup.find_all('a', href=re.compile('http://finance.ifeng.com/a/'))
        for item in list1:
            if unicode(item.get("href"))[0] == 'h':
                GetWebContent.catch(unicode(item.get('href')), 'Economy', cursor, conn)
def GetLinkEpisode(soup):
    ep = soup.find("a", {"class": "active"})
    AnimeID = soup.find("input", {"id": "movie_id"})['value']
    # Ask the gogocdn ajax endpoint for the full episode list of this anime
    link = f"https://ajax.gogocdn.net/ajax/load-list-episode?ep_start=0&ep_end=10000&id={AnimeID}"
    soup = GetHTML(link)
    AnimeList = ["https://www19.gogoanime.io" + i['href'].lstrip() for i in soup.find_all("a")]
    return AnimeID, AnimeList
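# Several functions in this section call a GetHTML(link) helper that is not defined here;
# judging from how the return value is used (soup.find / soup.find_all), it fetches a page
# and returns a BeautifulSoup tree. A minimal sketch of that variant, assuming it simply
# wraps requests.get -- the real helper may add headers, retries, or caching. (Other parts
# of this section use different helpers of the same name, e.g. GetHTML.getHtml and
# GetHTML(url, "href", "a").get_links(), which this sketch does not cover.)
import requests
from bs4 import BeautifulSoup

def GetHTML(link):
    # Download the page and parse it; "html.parser" is an assumption, the project may use lxml
    response = requests.get(link, timeout=10)
    response.raise_for_status()
    return BeautifulSoup(response.text, "html.parser")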
def get_thread_URL(self):
    """Get the thread URLs listed on the catalog page."""
    thread_links = GetHTML(self.url_catalog, "href", "a").get_links()
    # Keep only links that point to thread pages ("res" in the path)
    for link in thread_links[:]:
        if "res" not in link:
            thread_links.remove(link)
    return thread_links
def GetEpisodeMethodeDICO(link):
    soup = GetHTML(link)
    # Find the inline <script> that defines the "multilinks" variable
    scripts = soup.find_all("script")
    scripts = [str(s) for s in scripts if "multilinks" in str(s)]
    embed = scripts[0]
    # Strip the script tags and the assignment, keeping only the JSON literal
    embed = embed.replace('<script type="text/javascript">', '')
    embed = embed.replace('</script>', '')
    embed = embed.replace('var multilinks = ', '')
    embed = embed.split(";")[0]
    Dico = json.loads(embed)[0]
    return Dico
def catchSports(cursor, conn):
    for href in href_list:
        currenthref = href
        # Start looping through the pages
        count = 0
        while currenthref != '' \
                and currenthref != 'javascript:void(0);' \
                and currenthref != "javascript:alert('没有了');" \
                and count < 50:
            htmlcode = GetHTML.getHtml(currenthref)
            soup = BeautifulSoup(htmlcode, "html.parser")
            title_list = soup.find_all('div', class_='box_list clearfix')
            for t in title_list:
                # "wemedia" articles use a different page layout, so tag them separately
                if unicode(t.a.get('href')).endswith('wemedia.shtml'):
                    GetWebContent.catch(unicode(t.a.get('href')), 'Sports_wemedia', cursor, conn)
                else:
                    GetWebContent.catch(unicode(t.a.get('href')), 'Sports', cursor, conn)
            div_nextpage = soup.find_all('a', id='pagenext')
            if len(div_nextpage) != 0:
                currenthref = div_nextpage[0].get('href')
            else:
                currenthref = ''
            count += 1
def catchSociety(cursor, conn):
    try:
        for href in href_list:
            currenthref = href
            # Start looping through the pages
            count = 0
            while currenthref != '' \
                    and currenthref != 'javascript:void(0);' \
                    and currenthref != "javascript:alert('没有了');" \
                    and count < 100:
                htmlcode = GetHTML.getHtml(currenthref)
                soup = BeautifulSoup(htmlcode, "html.parser")
                li_list = soup.find_all('div', class_='newsList')[0].find_all('li')
                for li in li_list:
                    GetWebContent.catch(unicode(li.a.get('href')), 'Society', cursor, conn)
                div_nextpage = soup.find_all('div', id='backDay')
                if len(div_nextpage) != 0:
                    currenthref = div_nextpage[0].a.get('href')
                else:
                    currenthref = ''
                count += 1
    except BaseException as e:
        print e
def get_saison_anime_dbanimes(PGname):
    link = "https://dbanimes.com/anime/" + escape(PGname)
    soup = GetHTML(link)
    AnimeList = dbanimes.GetEpisodeDB(soup)
    result = dbanimes.GetEpisodeMethodeDB(AnimeList)
    result = sorted(result, key=lambda anime: anime['index'])
    return jsonify({"Page_name": PGname, "Link": link, "Episode": result})
def GetEpisodeAsync(link, key, Dico):
    soup = GetHTML(link)
    embed = soup.find("div", {"class": "anime_muti_link"})
    embed = embed.find_all('li')
    title = soup.find("div", {"class": "title_name"}).text.strip()
    methode = []
    for i in embed:
        link = i.a['data-video']
        # Protocol-relative URLs ("//host/...") need an explicit scheme
        if link[:2] == "//":
            link = "http:" + link
        methode.append({
            "link": link,
            "title": i.a.text.replace("Choose this server", "")
        })
    Dico.append({
        "index": key,
        "title": title,
        "methode": methode
    })
def asyncGet(key, link, Dico):
    soup = GetHTML(link)
    title = soup.find("li", {"class": "breadcrumb-item active"}).text
    methodeDIC = []
    methode = soup.find_all("div", {"class": "player"})
    for i in methode:
        dataURL = i['data-url']
        # Skip empty or placeholder data-url values
        if dataURL not in ("", " ", "None"):
            linkMethode = BeautifulSoup(dataURL, "html.parser").find("iframe")['src']
            # Protocol-relative URLs ("//host/...") need an explicit scheme
            if linkMethode[:2] == "//":
                linkMethode = "http:" + linkMethode
        else:
            linkMethode = "link/methode/not_find"
        methodeDIC.append({
            "link": linkMethode,
            "title": linkMethode.split("/")[2]
        })
    Dico.append({"index": key, "title": title, "methode": methodeDIC})
def catchTech(cursor, conn):
    currenthref = 'http://tech.ifeng.com/listpage/803/1/list.shtml'
    # Start looping through the pages
    count = 0
    while currenthref != '' \
            and currenthref != 'javascript:void(0);' \
            and currenthref != "javascript:alert('没有了');" \
            and count < 70:
        htmlcode = GetHTML.getHtml(currenthref)
        soup = BeautifulSoup(htmlcode, "html.parser")
        title_list = soup.find_all('div', class_='zheng_list pl10 box')
        for t in title_list:
            GetWebContent.catch(unicode(t.a.get('href')), 'Tech', cursor, conn)
        div_nextpage = soup.find_all('a', id='pagenext')
        if len(div_nextpage) != 0:
            currenthref = div_nextpage[0].get('href')
        else:
            currenthref = ''
        count += 1

    currenthref = 'http://tech.ifeng.com/listpage/26334/1/list.shtml'
    # Start looping through the pages of the second tech list
    count = 0
    while currenthref != '' \
            and currenthref != 'javascript:void(0);' \
            and currenthref != "javascript:alert('没有了');" \
            and count < 50:
        htmlcode = GetHTML.getHtml(currenthref)
        soup = BeautifulSoup(htmlcode, "html.parser")
        title_list = soup.find_all('div', class_='box_list clearfix')
        for t in title_list:
            GetWebContent.catch(unicode(t.a.get('href')), 'Tech', cursor, conn)
        div_nextpage = soup.find_all('a', id='pagenext')
        if len(div_nextpage) != 0:
            currenthref = div_nextpage[0].get('href')
        else:
            currenthref = ''
        count += 1
def catch(url, kind, cursor, conn):
    """Download one article page and save its text under the kind's directory."""
    try:
        if url == '':
            return
        htmlcode = GetHTML.getHtml(url)
        soup = BeautifulSoup(htmlcode, "html.parser")
        title_soup = soup.title
        topic = ''
        if title_soup is not None:
            topic = unicode(title_soup.string)
            rstr = r"[\/\\\:\*\?\"\<\>\|]"
            topic = re.sub(rstr, "", topic)  # strip characters that are illegal in file names
        else:
            return
        kind_name = getkindChineseName(kind)
        out_path = 'fenghaungnews'
        filepath = u"../data/%s/%s/" % (out_path, kind_name)
        if not os.path.exists(filepath):
            os.makedirs(filepath)
        content = ''
        if kind == 'Sports_wemedia':
            # "wemedia" sports pages keep the article text inside div.yc_con_txt
            for child in soup.find_all('div', class_="yc_con_txt")[0].descendants:
                if child.name == 'p' and child.string != None:
                    content = content + unicode(child.string)
        else:
            content_list = soup.find_all('div', id='main_content')
            if content_list == []:
                return
            for child in content_list[0].descendants:
                if child.string != None \
                        and unicode(child)[0] != '<' \
                        and type(child.string) != bs4.element.Comment \
                        and unicode(child.string) != '\n':
                    content = content + unicode(child.string) + u'\n'
        # cursor.execute("INSERT INTO web_page(article_class, article_content) VALUES(%s ,%s);",
        #                (kind_name.encode('utf-8'), content.encode('utf-8')))
        # conn.commit()
        with open(filepath + topic + '.txt', 'w') as f:
            f.write(content.encode('utf-8'))
        print kind_name, ' : ', url
        print
    except BaseException as e:
        dealException1(url, kind, e)
def get_thumbnail_URL(self, thread_links):
    """Get a (thread URL, thumbnail URL) pair for each thread."""
    t_url = []
    for link in thread_links[:]:
        link = "http://img.2chan.net/b/" + link
        thumbnail_links = GetHTML(link, "href", "a").get_links()
        # Keep only image links (jpg / png)
        for i in thumbnail_links[:]:
            if "jpg" not in i and "png" not in i:
                thumbnail_links.remove(i)
        # Pair the thread URL with its first thumbnail, once per thumbnail found
        for idx, thumbnail in enumerate(thumbnail_links):
            tpl = (link, "http://img.2chan.net" + thumbnail_links[0])
            t_url.insert(idx, tpl)
    return t_url
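# get_thread_URL and get_thumbnail_URL use yet another GetHTML helper: a class constructed
# with (url, attribute, tag) that exposes get_links(). A minimal sketch, assuming it returns
# the given attribute of every matching tag -- the name and behaviour are inferred from the
# call sites, not confirmed by the source:
import requests
from bs4 import BeautifulSoup

class GetHTML(object):
    def __init__(self, url, attribute, tag):
        self.url = url
        self.attribute = attribute
        self.tag = tag

    def get_links(self):
        # Fetch the page and collect the requested attribute of every matching tag
        soup = BeautifulSoup(requests.get(self.url, timeout=10).text, "html.parser")
        return [t.get(self.attribute) for t in soup.find_all(self.tag) if t.get(self.attribute)]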
def catchGames(cursor, conn):
    for href in href_home:
        htmlcode = GetHTML.getHtml(href)
        soup = BeautifulSoup(htmlcode, "html.parser")
        list1 = soup.find_all('a', href=re.compile('http://games.ifeng.com/a/'))
        for item in list1:
            if unicode(item.get("href"))[0] == 'h':
                GetWebContent.catch(unicode(item.get('href')), 'Games', cursor, conn)
def get_saison_anime_animeultime(PGname):
    link = "https://v5.anime-ultime.net/" + PGname + ".html"
    soup = GetHTML(link)
    AnimeID, ListID = animeultime.GetIdEpisode(soup)
    result = animeultime.jsonRequest(AnimeID, ListID)
    result = sorted(result, key=lambda anime: anime['index'])
    return jsonify({
        "Page_name": PGname,
        "AnimeID": AnimeID,
        "Link": link,
        "Episode": result
    })
def get_saison_anime_gogoanime(PGname):
    link = "https://www19.gogoanime.io/category/" + escape(PGname)
    soup = GetHTML(link)
    AnimeID, AnimeList = gogoanime.GetLinkEpisode(soup)
    result = gogoanime.GetEpisodeMethodeGOGO(AnimeList)
    result = sorted(result, key=lambda anime: anime['index'])
    return jsonify({
        "Page_name": PGname,
        "AnimeID": AnimeID,
        "Link": link,
        "Episode": result
    })
def index():
    username = None
    password = None
    email = None
    form = loginForm()
    if form.validate_on_submit():
        try:
            course_info = GetHTML.htmlHandle(form.username.data, form.password.data)
            timesold = course_info[2]
            names = course_info[1]
            course_info[2] = Formatter.formatTimes(course_info[2])
            sdates = {'M': '26', 'T': '20', 'W': '21', 'R': '22', 'F': '23'}
            # Map each course name to the number of time slots it occupies
            snames = {}
            for i in range(len(timesold)):
                snames[names[i]] = len(timesold[i])
            # Expand that mapping so every individual slot index points at its course name
            dnames = {}
            k = 0
            for j in range(7):
                for i in range(snames[names[j]]):
                    dnames[k] = names[j]
                    k += 1
            filename = 'OliniCalendar.ics'
            iCalCreation.iCalWrite(course_info[2], "201501", "20150430T000000", sdates, dnames, filename)
            ical = open('OliniCalendar.ics', 'r')
            email = form.email.data
            Emailer.iCalCreator(email, ical)
            os.remove('OliniCalendar.ics')
            # return render_template(html_sched)
            form.username.data = ''
            form.password.data = ''
            form.email.data = ''
        except Exception:
            # "Oops" -- errors are silently ignored
            pass
    return render_template('index.html', form=form)
def GetFreeVPNPass(FreeVPNdomain):
    # Import modules
    import os
    import GetHTML
    from bs4 import BeautifulSoup

    # Assign variables
    d = str(FreeVPNdomain)
    urlcat = "https://freevpn." + d + "/accounts/"
    url = GetHTML.GetSource(urlcat)

    # Write files for temporary data storage
    source = open('freevpn.html', 'w')
    source.write(str(url))
    source.close()
    f = open('creds.txt', 'w')

    # Parse the downloaded HTML source code
    with open("freevpn.html") as fp:
        soup = BeautifulSoup(fp, "lxml")

    # Search the parsed code for the login credentials
    for tag in soup.find_all('b'):
        f.write(str(tag.text) + "\n")
        f.write(str(tag.next_sibling) + "\n")

    # Delete the HTML source code
    os.remove('freevpn.html')
    f.close()

    # Read the credentials back and extract the password field
    CredList = open('creds.txt', 'r').readlines()
    VPN_Pass = CredList[5]

    # Clean up the credential list file
    os.remove('creds.txt')

    # Return the password
    return str(VPN_Pass.strip())
def catchArmy(cursor, conn):
    for href in href_list:
        currenthref = href
        # Start looping through the pages
        count = 0
        while currenthref != '' \
                and currenthref != 'javascript:void(0);' \
                and currenthref != "javascript:alert('没有了');" \
                and count < 50:
            htmlcode = GetHTML.getHtml(currenthref)
            soup = BeautifulSoup(htmlcode, "html.parser")
            title_list = soup.find_all('div', class_='comListBox')
            for t in title_list:
                GetWebContent.catch(unicode(t.a.get('href')), 'Army', cursor, conn)
            div_nextpage = soup.find_all('a', id='pagenext')
            if len(div_nextpage) != 0:
                currenthref = div_nextpage[0].get('href')
            else:
                currenthref = ''
            count += 1
def catchInterNation(cursor, conn):
    for href in href_list:
        currenthref = href
        # Start looping through the pages
        count = 0
        while currenthref != '' \
                and currenthref != 'javascript:void(0);' \
                and currenthref != "javascript:alert('没有了');" \
                and count < 100:
            htmlcode = GetHTML.getHtml(currenthref)
            soup = BeautifulSoup(htmlcode, "html.parser")
            li_list = soup.find_all('div', class_='newsList')[0].find_all('li')
            for li in li_list:
                GetWebContent.catch(unicode(li.a.get('href')), 'InterNation', cursor, conn)
            div_nextpage = soup.find_all('div', id='backDay')
            if len(div_nextpage) != 0:
                currenthref = div_nextpage[0].a.get('href')
            else:
                currenthref = ''
            count += 1
def catchEntertain(cursor, conn):
    for href in href_list:
        currenthref = href
        # Start looping through the pages
        count = 0
        while currenthref != '' \
                and currenthref != 'javascript:void(0);' \
                and currenthref != "javascript:alert('没有了');" \
                and count < 500:
            htmlcode = GetHTML.getHtml(currenthref)
            soup = BeautifulSoup(htmlcode, "html.parser")
            h2_list = soup.find_all('h2')
            for h2 in h2_list:
                print 'Entertain:', h2.a.get('href')
                GetWebContent.catch(unicode(h2.a.get('href')), 'Entertain', cursor, conn)
            div_nextpage = soup.find_all('a', id='pagenext')
            if len(div_nextpage) != 0:
                currenthref = div_nextpage[0].get('href')
            else:
                currenthref = ''
            count += 1
def GetEpisodeLink(link):
    soup = GetHTML(link)
    embed = soup.find_all('a', {"class": "btn-default"})
    return embed
def GetSaison(link):
    soup = GetHTML(link)
    title = soup.find_all('div', {"class": "module-title"})
    titleLink = soup.find_all('a', {"title": "Regarder en VOSTFR"})
    return title, titleLink
import os

filename = 'OliniCalendar.ics'
# This is the start of the semester:
yearmonth = '201501'
# This is the end of the semester:
endyearmonthdaytime = '20150430T000000'
USERNAME = str(raw_input("Enter my.olin.edu Username: "))
# ****** (elided in source)
# ..."time slots" are made instead. This just maps each class to its respective slots.
snames = {}
def get_saison_anime_vostfree(PGname):
    link = "https://vostfree.com/" + escape(PGname) + ".html"
    soup = GetHTML(link)
    result = vostfree.GetAllPlayer(soup)
    result = sorted(result, key=lambda anime: anime['index'])
    return jsonify({"Page_name": PGname, "Link": link, "Episode": result})
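# The get_saison_anime_* functions above return flask.jsonify() responses, so they are
# presumably registered as Flask routes elsewhere in the project. A hypothetical
# registration sketch -- the app object and the URL rules below are assumptions, not
# taken from the source:
from flask import Flask

app = Flask(__name__)
app.add_url_rule("/vostfree/<PGname>", view_func=get_saison_anime_vostfree)
app.add_url_rule("/gogoanime/<PGname>", view_func=get_saison_anime_gogoanime)
app.add_url_rule("/dbanimes/<PGname>", view_func=get_saison_anime_dbanimes)
app.add_url_rule("/animeultime/<path:PGname>", view_func=get_saison_anime_animeultime)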