Example #1
def get_article_url(page_list, url_list, title_list, pop_list, index_list):
	
	url_all = []
	title_all = []
	pop_all = []
	rs = requests.session()
	res = rs.post(ask, verify = False, data = payload)
	res = rs.get(url, verify = False)
	s = bsp(res.text, "html.parser")

	page_handle(s, url_all, title_all, pop_all)
	url_all.reverse()
	url_list.extend(url_all)
	title_all.reverse()
	title_list.extend(title_all)
	pop_all.reverse()
	pop_list.extend(pop_all)
	link = get_prev(s)
	page_list.append(link)
	
	
	for i in page_list:
		
		url_all = []
		title_all = []
		pop_all = []
		rs = requests.session()
		res = rs.post(ask, verify = False, data = payload)
		res = rs.get(i, verify = False)
		s = bsp(res.text, "html.parser")
		page_handle(s, url_all, title_all, pop_all)
		url_all.reverse()
		url_list.extend(url_all)
		title_all.reverse()
		title_list.extend(title_all)
		pop_all.reverse()
		pop_list.extend(pop_all)

		print('=============',i,'==============')

		for j in s.select('.btn-group-paging'):
			page_link_result = j.findAll('a', class_='btn wide')
			page_link = page_link_result[1]
			page_link = page_link['href']
			link = 'https://www.ptt.cc' + page_link

		time.sleep(0.3)
		print('Fetching ... ')
		if(check_today(s) == 1):
			page_list.append(link)
	
	index_url(url_list, index_list)
	select_article(url_list, title_list, pop_list, index_list)
	#print_list(pop_list, title_list, url_list, index_list)
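
# A minimal sketch (not part of the original) of the module-level names the
# function above relies on; the PTT age-gate endpoint and payload values are
# assumptions, and page_handle/get_prev/check_today/index_url/select_article
# are helpers defined elsewhere in the project.
import time
import requests
from bs4 import BeautifulSoup as bsp

url = 'https://www.ptt.cc/bbs/Gossiping/index.html'   # board index page (assumed)
ask = 'https://www.ptt.cc/ask/over18'                 # age-verification form (assumed)
payload = {'from': '/bbs/Gossiping/index.html', 'yes': 'yes'}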

	"""
Example #2
def ReadAsin():
    # AsinList = csv.DictReader(open(os.path.join(os.path.dirname(__file__),"Asinfeed.csv")))
    driver = webdriver.Chrome(executable_path=r'g:/chromedriver.exe')
    my_url="https://www.amazon.com/s?k=pants&ref=nb_sb_noss_2"
    driver.get(my_url)
    html=driver.page_source
    page=bsp(html,"html.parser")
    AsinList = []
    containers=page.findAll(class_="sg-col-4-of-24 sg-col-4-of-12 sg-col-4-of-36 s-result-item sg-col-4-of-28 sg-col-4-of-16 sg-col sg-col-4-of-20 sg-col-4-of-32")
    for i in containers:
        if i["data-asin"]:
            AsinList.append(i["data-asin"])
    driver.quit()

    extracted_data = []
    extracted_data1=[]
    for i in AsinList:
        url = "http://www.amazon.com/dp/"+i
        print ("Processing: "+url)
        extracted_data.append(AmzonParser(url,i))
        sleep(1)
    with open('data2.json', 'w') as f:
        json.dump(extracted_data, f, indent=4)
    for i in AsinList:
        url = "http://www.amazon.com/dp/"+i
        print ("Processing: "+url)
        extracted_data1.append(AmzonParser1(url,i,extracted_data1))
        sleep(1)
    with open('data.json', 'w') as f:
        json.dump(extracted_data1, f, indent=4)
Example #3
def Quora(self,r_url):
    html_=urllib2.urlopen(r_url)
    soup = bsp(html_)
    question['title'] = soup.title.string
    question['url'] = r_url
    details = soup.find_all('div',class_='question_details_text')
    for detail in details:
        question['details'] = detail.text
    topics = soup.find_all('div',class_='topic_list_item')
    for topic in topics:
        question['topics'] = [topic.text]

    ans_count = soup.find('div',class_='answer_header_text').text.split()
    count = int(ans_count[0])
    question['answer_count'] = count
    answers = soup.find_all('div',class_='pagedlist_item')
    if count < 6:
        count = len(answers)-1
    else:
        count = 6

    for i in range(count):
        if answers[i].find('div',class_='answer_content'):
            self.response.write(answers[i].find('div',class_='answer_content').text)
            self.response.write('-----------------------------------------------------------------')
Example #4
async def extract_abstract(ix):
    url = 'https://www.cw.com.tw/article/articleLogin.action?id=%s' % ix
    async with aiohttp.ClientSession(connector=aiohttp.TCPConnector(
            ssl=False)) as session:
        async with session.get(url, headers=headers) as res:
            try:
                text = await res.text()
            except:
                print('%s failed' % ix)
                with open('failed.txt', 'a') as f:
                    f.write('%s\n' % ix)
                return
            soup = bsp(text, "lxml")
            try:
                img_url = soup.select(
                    '.main article .st_email_large')[0]['st_image']
                if len(img_url) == 0: return
                img_url = re.sub(r':\d+', '', img_url)
                print(img_url)
                print('--------')
                async with session.get(img_url, headers=headers) as img_res:
                    img = await img_res.read()
                    with open('%s/%s.jpg' % (output_dir, ix),
                              'wb') as out_file:
                        #shutil.copyfileobj(img, out_file)
                        out_file.write(img)
            except IndexError:
                return
Example #5
def fetch_images(query):
    # addr = "https://www.google.com/search?tbm=isch&q=" + query
    addr = "https://www.google.co.in/search?q={}&source=lnms&tbm=isch".format(
        query)
    s = rq.session()
    r = s.get(addr)
    soup = bsp(r.text, 'html.parser')
    # print(soup)
    imgs = []
    for im in soup.find_all('a', href=True):
        if im.find('img'):
            imgs.append(im)
    # pp = pprint.PrettyPrinter(indent=4)
    # pp.pprint(imgs)
    try:
        os.mkdir('images')
    except OSError:
        shutil.rmtree('images')
        os.mkdir('images')
    count = 0
    prs = Presentation()
    blank_slide_layout = prs.slide_layouts[6]
    left = top = Inches(1)
    for im in imgs:
        for child in im.find_all('img'):  # only the <img> tags, not text nodes
            filename = wget.download(child['src'],
                                     out='images/image_{}'.format(count))
            count += 1
    for i in range(count):
        slide = prs.slides.add_slide(blank_slide_layout)
        pic = slide.shapes.add_picture('images/image_{}'.format(i), left, top)

    title = '{}_image.pptx'.format(query)
    prs.save(title)
    return title
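
# A minimal usage sketch (not part of the original); it assumes the imports the
# function relies on (requests as rq, bs4, wget, python-pptx, os, shutil) are in
# place and that the results page still exposes plain <img src=...> thumbnails.
pptx_name = fetch_images('golden retriever')
print('slideshow saved as', pptx_name)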
Example #6
def gujianew(code, riqi):
    url = 'http://www.aigaogao.com/tools/history.html?s=' + code
    detail_url = requests.get(url)
    soup = bsp(detail_url.content, 'lxml')
    detail_content = soup.find_all('table')
    #print(detail_content)
    table = detail_content[-1]
    #print(table)
    trs = table.find_all('tr')
    # print(trs)
    trs_list = []
    for i in trs:
        # print(i)
        tds = i.find_all('td')
        # print(tds)
        tr_list = []
        for j in tds:
            tr_list.append(j.text)
        trs_list.append(tr_list)
    #print(trs_list[0])
    result = DataFrame(trs_list[1:-1], columns=trs_list[0])
    #print(result)
    result['日期'] = pd.to_datetime(result['日期'])
    #print(riqi)
    #print(result['日期'])
    finally_result = result[result['日期']>riqi]
    #finally_result = [[time_to_timestr(j) for j in i] for i in finally_result.values]
    print(finally_result, type(finally_result))
    s = finally_result.values  # rows newer than the given date, as a 2-D array
    print(s)
    fw = open('./002401.txt', 'a', encoding='utf-8')
    for l in s[0]:
        fw.write(str(l) + '\t')
    fw.close()
Example #7
 def addCounties(self, cities):
     'Add county-level cities'
     starttime  = time.time()
     insert = 0
     mycursor = self.__mydb.cursor()
     try:
         for city in cities:
             citynumber, cityname = tuple(city.split(':'))
             rqtApi = self.rrhighApi + citynumber + '.html'
             try:
                 htmlhandle = urllib.request.urlopen(rqtApi)
             except Exception as e:
                 self.log.write(time.asctime() + u'请求文档错误:' + str(e) + '\n')
             else:
                 print('---下载%s数据成功---' % (cityname))
                 htmldoc = htmlhandle.read().decode('utf-8')
                 htmlhandle.close()
                 btsp = bsp(htmldoc, 'html.parser')
                 
                 counties = btsp.find_all('a', href="#highschool_anchor")
                 mycursor.execute(self.queryCityIdSql, (cityname,))
                 cityid = mycursor.fetchone()['id']
                 for county in counties:
                     mycursor.execute(self.queryCountySql, (county.string, cityid))
                     if mycursor.fetchone()['num'] == 0:
                         insert += 1
                         print('插入%s-->%s' % (cityname, county.string))
                         mycursor.execute(self.insertCountySql, (county.string, cityid))
                 self.__mydb.commit()
     except Exception as e:
         self.log.write(time.asctime() + str(e) + '\n')
     mycursor.execute(self.countCountySql)
     countnum = mycursor.fetchone()['countnum']
     endtime = time.time()
     self.printExeResult(insert, endtime - starttime, countnum, '县区')
Example #8
 def tencent(self):
     result = requests.get(self.url).content
     result = bsp(result, 'html5lib')
     self.title = result.head.title.text.split("-")[0]
     result = result.find(attrs={"class": "mod_episode"})
     text = result.text.replace('\t', '').splitlines()
     while '' in text:
         text.remove('')
     from bs4.element import Tag
     # keep only Tag children so their indices line up with the text entries
     contents = [i for i in result.contents if isinstance(i, Tag)]
     num = len(text)
     # The original reversed the lists and searched from the front; indexing with
     # text[-1] and iterating with __reversed__ gives the same result without the
     # extra reversal.
     # text.reverse()
     # contents.reverse()
     if text[-1].find('展开更多') >= 0:  # the "show more" button is present
         self.tencent2()
         return
     else:
         for i in range(num).__reversed__():
             if text[i].isdigit():
                 self.ep = text[i]
                 self.link = contents[i].find('a').get('href')
                 return
     self.tencent2()
Example #9
def getRankData(partition,rankTime):
    #url = 'https://www.bilibili.com/ranking?spm_id_from=333.334.banner_link.1'
    url = urlConstructor('全站',partition,rankTime) 

    result = bsp(session.get(url,headers = HostReferer).content,'lxml')
    # parse the response into a BeautifulSoup object

    items = result.find_all('div',class_ = 'info')
    
    rankData = ''
    count = 1
    for item in items:
        detail = item.find_all('span')
        pts = item.find_all('div',class_ = 'pts')
        if detail:
            # link to this video
            videoHref = item.a['href']
            # fetch the view count and danmaku count from the link
            data = getVideoView('http:'+videoHref)
            rankData += str(count)+u'\t标题: '+item.a.text + '\n'
            rankData += u'\t播放量: '+str(data['view'])+u'\t弹幕数: '+str(data['danmaku']) + '\t'+u'UP主: '+detail[1].text+'\t' + u'综合得分: '+pts[0].div.text+'\n\n'
            count += 1
        
        #break  # debug: only run one iteration
    return rankData
Example #10
def get_zj(url):

    res = rqs.get(url)
    ques=[] #test input
    str_ques=""
    ans=[] #test output
    str_ans=""
    tmp=[]
    html = bsp(res.text, "html.parser")
    data = html.findAll("div", {"class" : "problembox"})
    data2 = html.findAll("pre")

    for i in data2:
        if i.text!="\r" and i.text!="\r\n":
            tmp.append(i.text+"\n")

    for i in range(0,len(tmp),2):
        ques.append(tmp[i])
        ans.append(tmp[i+1])

    for i in ques:
            str_ques+=i
    for i in ans:
            str_ans+=i
            
    str_ques=str_ques.replace("\n\n","\n")
    str_ans=str_ans.replace("\n\n","\n")

    str_ques=str_ques.replace("\r","")
    str_ans=str_ans.replace("\r","")

    return str_ques,str_ans
Example #11
 def info(self):
     self.idget = requests.get(d.url + d.loc[5] + d.loc2[0],
                               cookies=self.kk).text
     # the commented-out block below used bs4 for this, which was unnecessary
     # soup = bsp(idget, "html5lib")
     # self.id = soup.legend.contents[0].split(d.seg[0])[1].strip()
     # print(legend)
     # info = soup('script')
     # info = info[len(info) - 1].string
     # self.id = idget.split(d.seg[0])[1].split(d.seg[1])[0].strip()
     info = json.loads(self.idget.split(d.seg[2])[1].split(d.seg[3])[0])
     # print(info)
     self.n = info['nick']
     self.s = info['school']
     self.c = info['class']
     # gender 0:♂ 1:♀
     self.g = info['gender']
     self.e = info['email']
     self.b = info['birthday']
     userinfoPage = requests.get(d.url + d.loc[8], params={
         'user': self.id
     }).text
     userinfoPage = bsp(userinfoPage, "html5lib")
     tag = userinfoPage.find('div', id="yijiejue")
     # it=tag.text
     # print(it)
     # print(type(it))
     # print(re.findall(re.compile(r'\d\d\d\d'), it))
     self.headled = re.findall(re.compile(r'\d\d\d\d'), tag.text)
Example #12
def get_message():
    url="http://www.freebuf.com"
    header={"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:56.0) Gecko/20100101 Firefox/56.0"}
    res=requests.get(url,headers=header)
    soup=bsp(res.text,"html.parser")
    a=soup.find_all('div',class_="news-img")
    
    for i in a:
        url1.append(i.a['href'])
        title.append(i.img['title'])
    for i in range(len(title)):
        dic={title[i]:url1[i]}
        dict.update(dic)
    #word=re.compile(u'\u62db\u8058|\u62db|\u62db\u4eba|\u8df3\u69fd|\u5b9e\u4e60|\u6316\u4eba|\u8bda\u62db')
    word=re.compile(u'招聘|诚聘|招人|招|聘|实习生|挖人')
    for key in dict:
        match=word.search(key)
        if match:
            pass
        else:
            dic={key:dict[key]}
            result.update(dic)
    for key in result:
        file.write('<a href=')
        file.write('"')
        file.write(result[key])
        file.write('">')
        file.write(key)
        file.write('</a>')
        file.write('<br>')
        
        
    file.close()
Example #13
def get_page(artist_name):
    artist_name = artist_name.replace(' ', '%20')
    base_url = 'http://www.lyrics.com'
    full_address = base_url + '/artist/' + artist_name
    page = rq.get(full_address)
    soup = bsp(page.content, 'html.parser')
    return soup
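
# A minimal usage sketch (not part of the original); the artist name is a
# placeholder and only the page title is printed.
soup = get_page('Led Zeppelin')
print(soup.title.string if soup.title else 'no <title> found')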
Example #14
def Quora(self, r_url):
    html_ = urllib2.urlopen(r_url)
    soup = bsp(html_)
    question["title"] = soup.title.string
    question["url"] = r_url
    details = soup.find_all("div", class_="question_details_text")
    for detail in details:
        question["details"] = detail.text
    topics = soup.find_all("div", class_="topic_list_item")
    for topic in topics:
        question["topics"] = [topic.text]

    ans_count = soup.find("div", class_="answer_header_text").text.split()
    count = int(ans_count[0])
    question["answer_count"] = count
    answers = soup.find_all("div", class_="pagedlist_item")
    if count < 6:
        count = len(answers) - 1
    else:
        count = 6

    for i in range(count):
        if answers[i].find("div", class_="answer_content"):
            self.response.write(answers[i].find("div", class_="answer_content").text)
            self.response.write("--" * 30)
Example #15
def Quora(self, r_url):
    html_ = urllib2.urlopen(r_url)
    soup = bsp(html_)
    question['title'] = soup.title.string
    question['url'] = r_url
    details = soup.find_all('div', class_='question_details_text')
    for detail in details:
        question['details'] = detail.text
    topics = soup.find_all('div', class_='topic_list_item')
    for topic in topics:
        question['topics'] = [topic.text]

    ans_count = soup.find('div', class_='answer_header_text').text.split()
    count = int(ans_count[0])
    question['answer_count'] = count
    answers = soup.find_all('div', class_='pagedlist_item')
    if count < 6:
        count = len(answers) - 1
    else:
        count = 6

    for i in range(count):
        if answers[i].find('div', class_='answer_content'):
            self.response.write(answers[i].find('div',
                                                class_='answer_content').text)
            self.response.write(
                '-----------------------------------------------------------------'
            )
Example #16
def scrapTodayYest():
    import requests
    from bs4 import BeautifulSoup as bsp
    a2m = lambda ahi: {
        'game': ahi.parent.h2.text,
        'time': ahi.previousSibling.previousSibling.text,
        'url': ahi['href'],
        'tvalue': ahi.parent.parent.h3.text
    }
    rh = requests.get('https://satta-king-fast.com/')
    sp = bsp(rh.text, 'html.parser')
    ahrefs = filter(lambda a: a.text == "Record Chart", sp.findAll('a'))
    temps = [[d['url'], d['game'], d['time'], d['tvalue']]
             for d in map(a2m, ahrefs)]
    rt = dict()
    for url, game, time, tv in temps:
        rt[game] = rt.get(game, {'url': urlpath(url), 'dt-val-ts': []})
        rt[game]['dt-val-ts'].append([tv, time])
    dates = [
        t.findAll('h1')[0].text.split()[-2].replace(',', '')
        for t in sp.findAll('table')[:2]
    ]
    for game in rt:
        rt[game]['dt-val-ts'] = list(
            map(lambda xy: [xy[0]] + xy[1], zip(dates, rt[game]['dt-val-ts'])))
    return rt
Example #17
def parse_html(html):
    soup = bsp(html)
    position_soup = soup.find('ul', class_='item_con_list')

    for position_li in position_soup.find_all('li'):
        position_attr = position_li.attrs
        print position_attr.get('data-salary')
        print position_attr.get('data-company')
Example #18
def organization_details():
    response = requests.get(myconstants.orgs_url)
    html = response.content
    soup = bsp(html, "html.parser")
    get_organizations = soup.findAll("li", {'class': 'organization-card__container'})
    final_result = fetch_details(get_organizations)
    print(final_result)
    return json.dumps(final_result)
Example #19
 def get_article(cls, page_url):
     print 'self.get_article()'
     response = requests.get(page_url, headers=cls.headers)
     soup = bsp(response.text, 'html.parser')
     info = soup.findAll('a', class_='question_link')
     article_names = [i.text.strip() for i in info]
     article_links = [cls.base_url + i['href'] for i in info]
     return zip(article_names, article_links)
Example #20
def scraper(url):
    try:
        head = {'User-Agent': 'Mozilla/5.0'}
        req = Request(url, headers=head)
        html = urlopen(req)
        bsObj = bsp(html, "html.parser")
        
        #scrape raw tags first
        imgs = bsObj.findAll('img', {'class':'th'})
        headlines = bsObj.findAll('h3', {'class':'r'})
        links = bsObj.findAll('span', {'class':'f'})
        dates = bsObj.findAll('div', {'class':'slp'})
        stories = bsObj.findAll('div', {'class':'st'})
        
        # print(len(imgs))
        # print(len(headlines))
        # print(len(links))
        # print(len(dates))
        # print(len(stories))
        
        
        
        #extract from scraped data
        #each data array may not be the same length; 
        #try to match domains to align arrays
        thumbs = [t['src'] for t in imgs]
        
        # if s is a hashed news headline link: s.split("=")[1].split("&")[0]
        hlinks = [h.a['href'].split("=")[1].split("&")[0] for h in headlines]
        
        datesources = [re.sub('\u200e', '', str(d.contents[0].contents[0])) for d in dates]
        
        hlines = [tagCleaner(h.a.contents) for h in headlines]
        
        for i in range(len(thumbs)):
            print(thumbs[i])
            print(hlinks[i])
            print(hlines[i])
            print(datesources[i])
        
        
        # for h in headlines:
        #     print(h.a['href'])
        #     print(tagCleaner(h.a.contents))
        #     print(type(h.a.contents[1])==bs4.NavigableString)
        #     print(str(h.a.contents[1]))
        
        # f = open(fname, 'w')
        
        # for s in stories:
        #     f.write(str(s) + "\n")
        
        # f.close()
        
    except Exception as e:
        print(e)
    finally:
        print("*"*25)
Example #21
def findUrls(href, driver):
    driver.get(href)
    time.sleep(1)
    soup = bsp(driver.page_source, features='lxml')
    list_urls = soup.find_all('a', {"class": "bookTitle"}, itemprop='url')
    urls = []
    for url in list_urls:
        urls.append(url.get("href"))
    return urls
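
# A minimal usage sketch (not part of the original); it assumes selenium with a
# chromedriver on PATH, and the list URL is a placeholder.
from selenium import webdriver

driver = webdriver.Chrome()
book_urls = findUrls('https://www.goodreads.com/list/show/1.Best_Books_Ever', driver)
print(len(book_urls), 'book links found')
driver.quit()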
Example #22
def get_sling_networks():
    data = "https://www.cnet.com/news/sling-tv-everything-you-need-to-know/"
    response = requests.get(data, timeout=5)
    content = bsp(response.content, "html.parser")
    data_container = content.find("div", class_="chartWrapper")

    target = data_container.findAll("th")
    for i in target:
        print i.getText().strip()
Example #23
def scrapeHomePage():
    rh = requests.get('https://sattakingdarbar.com/')
    sp = bsp(rh.text, 'html.parser')
    ahrefs = filter(lambda a: a.text == "Record Chart", sp.findAll('a'))
    temps = {
        d['url']: [d['game'], d['time'], d['tvalue']]
        for d in map(a2m, ahrefs)
    }
    return temps
Example #24
def cleanText(txt):
    text =''
    txt = unicode(str(txt),errors='ignore')
    try:
        soup = bsp(txt,"html.parser")
        for e in soup.findAll(['script', 'style','form','meta','head']):
            e.extract()

        soup = bsp(str(soup),"html.parser")        
        text = soup.get_text()

    except:
        print sys.exc_info()[0]

    text = re.sub('  +',' ',text)
    text = re.sub('\t\t+','\t',text)
    text = re.sub('\n\n+','\n',text)
    return text
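
# A minimal usage sketch (not part of the original); Python 2 is assumed, to
# match the `unicode` and `print` syntax above.
raw = '<html><head><script>x = 1;</script></head><body>Hello   <b>world</b></body></html>'
print cleanText(raw)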
Example #25
def get_answer_url(userid):
    base="https://www.zhihu.com/people/"
    url=base+userid
    try:
        data=get_homepage_url_content(url)
        
    except:
        return []

    beautiful=bsp(data,"html.parser")
    url1=beautiful.find_all("a",{"class":"question_link","target":"_blank"})
    url2=beautiful.find_all("div",{"class":"zm-profile-section-item zm-item clearfix","data-type-detail":"member_voteup_answer"})
    try:     
        last_datatime=url2[-1].get("data-time")
    except:
        return []
#    print url1
    url3=beautiful.find_all("div",{"class":"zh-profile-account-status"})
    if url3==[]:  
        isbanned = False
    else:
        isbanned = True
    max_len=100
 
    ans_url=[]
    while(url2!=[] and len(ans_url)<=max_len):
        for content in url1:
            if content.get("href").find("answer")>0:
                ans_url=ans_url+[content.get("href")]
            else:
                continue
        try:
            data=get_all_profile(userid,last_datatime)
        except:
            return ans_url
        beautiful=bsp(data,"html.parser")            
        url1=beautiful.find_all("a",{"class":"question_link","target":"_blank"})
        url2=beautiful.find_all("div",{"class":"zm-profile-section-item zm-item clearfix","data-type-detail":"member_voteup_answer"})
        try:
            last_datatime=url2[-1].get("data-time") 
        except:
            return ans_url

    return ans_url
Example #26
def get_votersprofile_id(url):
    try:
        url1=get_homepage_url_content(url)
        url2=bsp(url1,"html.parser").find_all("a",{"class":"zg-anchor-hidden"})
    except:
        url2=[]
    if url2==[]:
        return 10
    voter_id=url2[0].get("name")[7:]
    return voter_id
Example #27
def scraper(url):

    out = {}

    try:
        head = {'User-Agent': 'Mozilla/5.0'}
        req = Request(url, headers=head)
        html = urlopen(req)
        bsObj = bsp(html, "html.parser")

        #scrape raw tags first
        imgs = bsObj.findAll('img', {'class': 'th'})
        headlines = bsObj.findAll('h3', {'class': 'r'})
        links = bsObj.findAll('span', {'class': 'f'})
        dates = bsObj.findAll('div', {'class': 'slp'})
        stories = bsObj.findAll('div', {'class': 'st'})

        # print(len(imgs))
        # print(len(headlines))
        # print(len(links))
        # print(len(dates))
        # print(len(stories))

        #extract from scraped data
        #each data array may not be the same length;
        #try to match domains to align arrays
        thumbs = [t['src'] for t in imgs]

        # if s is a hashed news headline link: s.split("=")[1].split("&")[0]
        hlinks = [h.a['href'].split("=")[1].split("&")[0] for h in headlines]
        datesources = [
            re.sub('\u200e', '', str(d.contents[0].contents[0])) for d in dates
        ]
        hlines = [tagCleaner(h.a.contents) for h in headlines]

        # for i in range(len(thumbs)):
        #     print(thumbs[i])
        #     print(hlinks[i])
        #     print(hlines[i])
        #     print(datesources[i])

        out = {
            'thumbs': thumbs,
            "hlinks": hlinks,
            "hlines": hlines,
            "dates": datesources
        }

    except Exception as e:
        print(e)
        out = {"Exception": e}
    finally:
        print("*" * 25)

    return out
Example #28
def get_lyrics(url_list):
    list_lyrics = []
    for url in url_list:
        page = rq.get(url)
        soup = bsp(page.content, 'html.parser')
        container = soup.find('div', {"class": "lyric clearfix"})
        lyric_tags = container.findAll('pre', attrs={'id': 'lyric-body-text'})
        for tag in lyric_tags:
            lyric = tag.text
            list_lyrics.append(lyric)
    return list_lyrics
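
# A minimal usage sketch (not part of the original); the URLs are placeholders
# for lyrics.com song pages whose markup matches the selectors above.
song_urls = ['https://www.lyrics.com/lyric/1000001', 'https://www.lyrics.com/lyric/1000002']
for lyric in get_lyrics(song_urls):
    print(lyric[:80])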
Example #29
def save_all_img_src(url):
    # the argument is the URL of the first page
    try:
        html = requests.get(url=url)
        html.encoding = 'utf-8'
        sp = bsp(html.text, 'html.parser')
        title = sp.find('h1', {'class': 'title2'}).text
        with open(cur_path + 'list/' + title + '.txt', 'w') as src_file:
            while (True):
                save_onepage_img_src(sp, src_file)
                # if there is a next page, update url
                if next_page_url(sp):
                    url = next_page_url(sp)
                    html = requests.get(url=url)
                    html.encoding = 'utf-8'
                    sp = bsp(html.text, 'html.parser')
                else:
                    break
    except BaseException:
        log_file.write(url + '\t' + '下一页获取失败' + '\n')
Example #30
def main():
    root_url = 'http://www.asahi.com'
    url = 'http://www.asahi.com/politics/list/'
    res = requests.get(url)

    soup = bsp(res.text, 'lxml')

    urls = get_articles_urls(soup, (
        ('ul', 'List'),
        ('li', ''),
    ),
                             link_class="SW")

    article_datas = []
    for u in urls:
        time.sleep(2)  # avoid hitting the server too frequently

        if not u.startswith(('http://', 'https://')):
            u = root_url + u

        res = requests.get(u)
        soup = bsp(res.text, 'lxml')

        article_data = ArticleData()
        article_data.title = get_title(soup, (
            ('div', 'ArticleTitle'),
            ('div', 'Title'),
            ('h1', ''),
        ))
        article_data.description = get_description(soup, (
            ('div', 'ArticleText'),
            ('p', ''),
        ))

        article_datas.append(article_data)
        print('fetched from {}'.format(u))

    for a in article_datas:
        print(a.title)
        print(a.description)
        print('\n-+-+-+-+-+-+-+-\n')
Example #31
 def get_max_page(cls):
     # print 'self.get_max_page()'
     try:
         response = requests.get(cls.oa_url, headers=cls.headers)
         soup = bsp(response.text, 'html.parser')
         max_page = int(soup.find_all('div', class_='w4_5')[-1].span.find_all('a')[-1].text)
          # cast to int
         cls.max_page = max_page
         # print type(max_page)
         return max_page
     except:
         return -1
Example #32
def get_html(url):
    tag = ("section", "comments")

    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/55.0.2883.95 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    page_html = bsp(r.text, 'html.parser')

    html = page_html.find(tag[0], id=tag[1])
    return (html, url)
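
# A minimal usage sketch (not part of the original); the URL is a placeholder
# for a page that actually has a <section id="comments"> element.
comments, page_url = get_html('https://example.com/some-post')
if comments is not None:
    print(comments.get_text(strip=True)[:200])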
Example #33
def scpeImage():
    browser_image = init_browser()
    url_image = 'https://www.jpl.nasa.gov/spaceimages/?search=&category=Mars'
    browser_image.visit(url_image)
    html_image = browser_image.html
    soup_image = bsp(html_image, 'html.parser')
    image = (soup_image.find_all(
        'div', class_='carousel_items')[0].a.get('data-fancybox-href'))
    images = 'https://www.jpl.nasa.gov' + image
    data_web['featImage'] = images
    browser_image.quit()
    return data_web
Example #34
    def get_urls(self):
        # collect the URLs of all sub-pages from the main page
        html = get_conn(self.base_url)
        soup = bsp(html, 'lxml')
        slider = soup.find('div', {'class': 'slider'})
        new_home_box = soup.find_all('div', {'class': 'new-home-box'})
        urls_obj = []
        for item in [slider] + new_home_box:
            urls_obj.extend(item.find_all('a'))

        permerfor_urls = map(lambda item: item.get('href'), urls_obj)
        return permerfor_urls
Example #35
def get_xc_item2(url):
    headers = {
        'User-Agent':
        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_4) AppleWebKit/537.36 (KHTML, like Gecko)'
        ' Chrome/55.0.2883.95 Safari/537.36'
    }
    r = requests.get(url, headers=headers)
    r.encoding = 'gb2312'

    # HTML of the question-bank list page
    content = bsp(r.text, 'html.parser')
    return content
Example #36
def get_soup(url=BASE_URL):
    # raw content, retried until the request succeeds
    content = None
    while content is None:
        try:
            content = requests.get(url).content
        except Exception:
            sleep(1)
    # soup
    return bsp(content, "lxml")
Example #37
def get_text(url):
    html = requests.get(url).text
    print(html)
    soup = bsp(html, 'lxml')
    title = soup.find('h1', id='articleTitle').get_text()
    autor = soup.find('div', class_="content-th-info").find('a').get_text()
    article_content = soup.find('div', class_="document").find_all('p')
    all_p = [
        i.get_text() for i in article_content
        if not i.find('img') and not i.find('a')
    ]  # drop paragraphs that contain images or links
    article = '\n'.join(all_p)
    yield {"title": title, "autor": autor, "article": article}
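
# A minimal usage sketch (not part of the original); get_text is a generator,
# so the caller iterates it, and the URL is a placeholder for a page matching
# the selectors above.
for item in get_text('https://example.com/some-article'):
    print(item['title'], '-', item['autor'])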
Example #38
def test():
    proxy = urllib2.ProxyHandler({'http':'177.43.212.44'})
    opener = urllib2.build_opener(proxy)
    urllib2.install_opener(opener)
    response = urllib2.urlopen('https://en.wikipedia.org/wiki/apple')
    if response is not None:
        html = response.read()
        print html
    else:
        print 'None'
    #
    soup = bsp(open("wiki_files/123.html"))
    print soup.title
    print soup.find_all('h2')
Example #39
 def getTitle(self, title, content):
     soup = bsp(content)
     hs = soup.findAll("h1")
     hs += soup.findAll("h2")
     lines = []
     for h in hs:
         lines.append(h.string)
     if len(lines) != 0:
         ret = self.getTitleFromLinesInHTags(title, lines)
         if ret and ret != "":
             return ret
     content, number = re.subn("<[\s\S]*?>", "", content)
     lines = content.split("\n")
     return self.getTitleFromLinesInAllContext(title, lines)
Example #40
def get_totalpage(typeparam) :
	bspurl = 'http://number.sungoin.cn/number/template/agents.do?param=findXuanHao'+typeparam;
	s = requests.Session();
	s.mount("http://number.sungoin.cn",HTTPAdapter(
    	max_retries=Retry(total=10, status_forcelist=[500, 503])
    ));
	response=s.get(bspurl);
	soup = bsp(response.content,"html.parser");
	s = soup.find_all('script');
	patt = 'val\("[0-9]+"\)';
	re_pat=re.compile(patt);
	ll = s[2].text.replace('\n',"");
	search_ret=re_pat.search(ll);
	if search_ret : 
		return int(search_ret.group().replace('val("','').replace('")',''));
Example #41
def parse_voter_A(html):
    try:
        list = ""
        s = bsp(html, 'html.parser')
        list += s.a['href'][8:]
        list += ','+s.a['title']
        list += ','+s.span.text
        all = s.find_all('li')
        list += ',' + all[0].span.text.split(' ')[0]
        list += ',' + all[1].span.text.split(' ')[0]
        list += ',' + all[2].a.text.split(' ')[0] 
        list += ',' + all[3].a.text.split(' ')[0]
    except:
#        print(html.encode('utf-8'))
        pass
    return list.split(",")    
Example #42
 def addTechnicalSchools(self, cities):
     'Insert technical-school data'
     starttime = time.time()
     insert = 0
     mycursor = self.__mydb.cursor()
     try:
         for city in cities:
             citynumber, cityname = tuple(city.split(':'))
             rqtApi = self.rrtechnicalApi + citynumber + '.html'
             try:
                 htmlhandle = urllib.request.urlopen(rqtApi)
             except Exception as e:
                  self.log.write(time.asctime() + u'请求技校文档错误:' + str(e) + '\n')
             else:
                 print('---下载%s数据成功---' % (cityname))
                 htmldoc = htmlhandle.read().decode('utf-8')
                 htmlhandle.close()
                 btsp = bsp(htmldoc, 'html.parser')
                 countieshtml = btsp.find_all('a', href="#highschool_anchor")
                 counties = []
                 for countyhtml in countieshtml:
                      counties.append([countyhtml.string.strip(),  re.search(r'[0-9]{4,}', countyhtml['onclick']).group()])
                 mycursor.execute(self.queryCityIdSql, (cityname,))
                 cityid = mycursor.fetchone()['id']
                 for county in counties:
                     mycursor.execute(self.queryCountyIdSql, (county[0], cityid))
                     try:
                         countyid = mycursor.fetchone()['id']
                     except Exception as e:
                         self.log.write('没有找到%s-->%s的id\n' % (cityname, county[0]))
                     else:
                         techshtml = btsp.select('ul[id$=' + county[1] +']')
                         techshtml = techshtml[0].find_all('a') if isinstance(techshtml, list) and len(techshtml) else []
                         for tech in techshtml:
                             if isinstance(tech.string, str) and tech.string.strip():
                                 mycursor.execute(self.queryTechnicalSql, (tech.string.strip(),))
                                 if mycursor.fetchone()['num'] == 0:
                                     insert += 1
                                     print('插入技校%s--%s--%s' % (cityname, county[0], tech.string))
                                     mycursor.execute(self.insertTechnicalSql, (tech.string.strip(), countyid))
                         self.__mydb.commit()
     except Exception  as e:
         self.log.write(time.asctime() + str(e) + '\n')
     mycursor.execute(self.countTechnicalSql)
     countnum = mycursor.fetchone()['countnum']
     endtime = time.time()
     self.printExeResult(insert, endtime - starttime, countnum, '技校')
Example #43
def get_answer_url(userid):
    base="https://www.zhihu.com/people/"
    url=base+userid
    try:
        data=get_homepage_url_content(url)
        
    except:
        return []
    beautiful=bsp(data,"html.parser")
    url1=beautiful.find_all("a",{"class":"question_link","target":"_blank"})
    ans_url=[]
    for content in url1:
        if content.get("href").find("answer")>0:
            ans_url=ans_url+[content.get("href")]
        else:
            continue
    return ans_url
Example #44
def hanle_request(typeparam,pageNo) :
	arr = [];
	bspurl = 'http://number.sungoin.cn/number/template/agents.do?param=findXuanHao&type='+typeparam+"&pageNo="+str(pageNo)+"&pageSize=80";
	# bspurl = 'http://number.sungoin.cn/number/template/agents.do?param=findXuanHao&type=无规则&pageNo="+str(pageNo)+"&pageSize=80‘;
	print bspurl
	s = requests.Session();
	# s.mount(bspurl,HTTPAdapter(max_retries=10));
	s.mount("http://number.sungoin.cn",HTTPAdapter(
    	max_retries=Retry(total=10, status_forcelist=[500, 503])
    ));
	response=s.get(bspurl);
	# response = s.get(bspurl);
	soup = bsp(response.content,"html.parser");
	tds = soup.find_all('td',attrs={"width": "110px"});
	for j in range(0,len(tds)) : 
		td = tds[j];
		s = typeparam+","+td.text.strip()+","+td.find_next_sibling().text.strip();
		# arr.append(s)	
		levelparam=get_level(td.find_next_sibling().text.strip(),td.text.strip(),typeparam);
		(level,selfrule)=levelparam.split(":");
		update_sql_lite(td.text.strip(),level,selfrule,typeparam);
Example #45
def filter_user(userid):
    base="https://www.zhihu.com/people/"
    url=base+userid
    asks="/people/"+userid+"/asks"
    answers="/people/"+userid+"/answers"
    posts="/people/"+userid+"/posts"
    collections="/people/"+userid+"/collections"
    logs="/people/"+userid+"/logs"
    
    try:
        data=get_homepage_url_content(url)
    except:
        print "user:"******"need to reconnect."
    beautiful=bsp(data,"html.parser")
    url1=beautiful.find_all("a",{"class":"item","href":asks})[0].span.string
    url2=beautiful.find_all("a",{"class":"item","href":answers})[0].span.string
    url3=beautiful.find_all("a",{"class":"item","href":posts})[0].span.string
    url4=beautiful.find_all("a",{"class":"item","href":collections})[0].span.string
    url5=beautiful.find_all("a",{"class":"item","href":logs})[0].span.string
    
    
    return [int(url1),int(url2),int(url3),int(url4),int(url5)]
Example #46
def extractInfo(Link, domain):
    """Gets the link of the page, and returns Page title and Description of the Link
	works for - 
	khan academy
	EdX
	udacity
	Udemy
	"""
    LinkInfo = fs.Links()
    LinkInfo.link = Link
    try:
        response = urlopen(Link)
        html = response.read()
        sp = bsp(html)
        if domain != "mitocw":
            desc = sp.findAll(attrs={"name": "description"})
        else:
            desc = sp.findAll(attrs={"name": "Description"})
        LinkInfo.pagetitle = sp.title.text.decode("utf-8")
        LinkInfo.desc = desc[0]["content"].decode("utf-8")
    except Exception:
        LinkInfo.pagetitle = "pagetitle"
        LinkInfo.desc = "desc[:498]"
    return LinkInfo
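
# A minimal usage sketch (not part of the original); `fs.Links` comes from the
# surrounding project, so this only shows the call shape with a placeholder URL.
info = extractInfo('https://www.edx.org/course/introduction-to-computer-science', 'edx')
print(info.pagetitle)
print(info.desc)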
Example #47
		'wiki', '.ps', '.exe', '.txt', '.pps', 'drupal', 
		'lab401']

url_list.append(url)

for i in url_list:
	print(i)	
	print('Find',len(mail_list),'results')
	
	if(len(mail_list) >= 1000):
		break
	try :
		req = urllib.request.urlopen(i)
		#print(i)
		valid_url.append(i)
		s = bsp(req,"html.parser")
		
		#find <a href tag in the page
		link = s.findAll('a',href=True)
		for tmp in link:
			#print(tmp['href'])
			buf = tmp['href']
			
			# mail
			if('mailto:' in buf):

				if(buf not in mail_list):
					if( len(buf) < 40 and '\n' not in buf):
						mail_list.append(buf)
						#print(buf)
				
Example #48
def get_soup(url=BASE_URL):
    # raw content
    content = requests.get(url).content
    # soup
    return bsp(content,"lxml")
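
# A minimal usage sketch (not part of the original); BASE_URL is assumed to be
# a module-level constant defined elsewhere, so an explicit URL is passed here.
soup = get_soup('https://example.com')
print(soup.title.string if soup.title else 'no <title> found')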
Example #49
			sqlitecu.execute("update four_hunder_number set is_sync = 1")
		sqliteconn.commit();
		mysqlconn.commit()
		# close the cursors and release resources
		mysqlcurs.close()
		# close the connections
		mysqlconn.close()
		sqlitecu.close()
		sqliteconn.close()


arr = [];
types= [];
bspurl = 'http://number.sungoin.cn/number/template/agents.do?param=findXuanHao';
response = requests.get(bspurl);
soup = bsp(response.content,"html.parser");

numertypes = soup.find_all('div',attrs={"class":"guize_tit"});
# print len(numertypes)
for m in range(0,len(numertypes)) :
	numbertype=numertypes[m];
	aLinks=numbertype.find_next_sibling().find_all('a');
	for h in range(0,len(aLinks)) :
		link = aLinks[h];
		typeparam = link['href'].replace("javascript:sub('","").replace("')","");
		if not typeparam.startswith('end'):
			continue;
		types.append(typeparam);
		total_page=get_totalpage("&type="+typeparam)
		for p in range(1,total_page):
			hanle_request(typeparam,p);