Example #1
def get_info(url):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('#rankWrap > div.pc_temp_songlist > ul > li > a')
    times = soup.select('div.pc_temp_songlist > ul > li > span.pc_temp_tips_r > span')
    for rank, title, time in zip(ranks, titles, times):
        # print(rank.get_text())
        # print(title.get_text().strip())
        # print(time.get_text().strip())
        if title.get_text().find('-') > 0:
            data = {
                'rank': rank.get_text().strip(),
                'singer': title.get_text().split('-')[0].strip(),
                'song': title.get_text().split('-')[1].strip(),
                'time': time.get_text().strip(),
            }
        else:
            data = {
                'rank': rank.get_text().strip(),
                'singer': title.get_text().split('-')[0].strip(),
                'song': '',
                'time': time.get_text().strip(),
            }
        print(data)
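For the snippet above to run, `headers` and a list of chart URLs must exist at module level. A minimal scaffolding sketch, assuming the KuGou Top 500 is paginated under 8888.html-style URLs (the URL pattern and User-Agent are assumptions, not taken from the source):

import requests
from bs4 import BeautifulSoup  # required by get_info() above

# Assumed request headers; any realistic User-Agent works here.
headers = {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}

if __name__ == '__main__':
    # Assumed pagination scheme: 23 pages of chart entries.
    urls = ['https://www.kugou.com/yy/rank/home/{}-8888.html'.format(i)
            for i in range(1, 24)]
    for url in urls:
        get_info(url)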
Example #2
def get_movie_data(soup):
    for child in soup.find_all('div', class_="showtime_box"):
        print(child.div.a.get_text(strip=True))

        for length in child.find_all('div', class_="showtime_poster"):
            print(length.get_text(strip=True))

        for time in child.find_all('li'):
            print(time.get_text(strip=True))
        print()
Example #3
def get_items_info(item_link):
    wb_data = requests.get(item_link, headers=headers)
    if wb_data.status_code == 200:  # check that the page exists and our IP is not blocked
        soup = BeautifulSoup(wb_data.text, 'lxml')
        titles = soup.select('.title-name')
        times = soup.select('.pr-5')
        types = soup.select('ul.det-infor > li:nth-of-type(1) > span > a')
        prices = soup.select('i.f22')
        adrs = soup.select('ul.det-infor > li:nth-of-type(3)')
        cates = soup.select('div.h-crumbs')
        qualities = soup.select(
            'div.leftBox > div:nth-of-type(4) > div.det-summary > div > div')
        # the zip order must match the unpacking order, otherwise
        # 'cate' and 'quality' end up swapped
        for title, time, type, price, adr, cate, quality in zip(
                titles, times, types, prices, adrs, cates, qualities):
            items_data = {
                'title': title.get_text(),
                'times': time.get_text().split(),
                'type': type.get_text(),
                'price': price.get_text(),
                'adr': list(adr.stripped_strings),
                'qualities': list(quality.stripped_strings),
                'cate': cate.get_text()
            }
            items_info9.insert_one(items_data)
            print(items_data)
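`items_info9` is not defined in this snippet; from the `insert_one` call it behaves like a pymongo collection. A hedged setup sketch (host, database, and collection names are assumptions):

import pymongo

# Assumed MongoDB connection; only the collection variable name comes from the example.
client = pymongo.MongoClient('localhost', 27017)
items_info9 = client['ganji']['items_info9']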
Example #4
def get_message(url):
    # print(url)
    wb_data = requests.get(url)
    id = (url.split('/')[-1])[:14]  # take the first 14 characters of the last URL segment as the item ID
    views = get_totols(id)

    soup = BeautifulSoup(wb_data.text, 'lxml')

    categories = soup.select('#header > div.breadCrumb.f12 > span:nth-of-type(3) > a')  # category
    titles = soup.select('#content > div.person_add_top.no_ident_top >div.per_ad_left > div.col_sub.mainTitle > h1')  # title
    times = soup.select('#index_show > ul.mtit_con_left.fl > li.time')  # publish time
    prices = soup.select('#content > div.person_add_top.no_ident_top >div.per_ad_left > div.col_sub.sumary > ul >'
                         ' li:nth-of-type(1) > div.su_con > span')  # price
    olds = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left > div.col_sub.sumary > ul >'
                       ' li:nth-of-type(2) > div.su_con > span')  # condition
    areas1 = soup.select('#content > div.person_add_top.no_ident_top >'
                         ' div.per_ad_left > div.col_sub.sumary > ul >'
                         ' li:nth-of-type(3) > div.su_con > span >'
                         ' a:nth-of-type(1)')  # area, part 1
    areas2 = soup.select('#content > div.person_add_top.no_ident_top > div.per_ad_left >'
                         ' div.col_sub.sumary > ul > li:nth-of-type(3) > div.su_con > span > a:nth-of-type(2)')  # area, part 2
    for category,title,time,price,old,area1,area2 in zip(categories,titles,times,prices,olds,areas1,areas2):
        data = {
            '类目':category.get_text(),
            '标题':title.get_text(),
            '发布时间':time.get_text(),
            '价格':price.get_text(),
            '成色':old.get_text().strip(),
            '区域':area1.get_text()+'-'+area2.get_text(),
            '浏览量':views
        }
        print(data)
        return None
Example #5
def kg_spider(url):
    '''
    Fetch the KuGou Music Top 500 and save it to MongoDB.
    :param url: request URL
    :return:
    '''
    res = requests.get(url, headers=headers)
    # print(res.text)
    soup = BeautifulSoup(res.text, 'lxml')
    ranks = soup.select('.pc_temp_num')
    titles = soup.select('.pc_temp_songlist  > ul > li > a')
    times = soup.select('.pc_temp_time')
    for rank, title, time in zip(ranks, titles, times):
        rank = rank.get_text().strip()
        # print(rank)
        song = title.get_text().split(' - ')[-1]
        # print(song)
        singer = title.get_text().split(' - ')[0]
        # print(singer)
        song_time = time.get_text().strip()
        # print(song_time)
        print(rank, song, singer, song_time)
        data = {
            'rank': rank,
            'song': song,
            'singer': singer,
            'song_time': song_time
        }
        storage_mongdb(data)
        print("---" * 20)
Example #6
def check_results():
    URL = 'https://footballdatabase.com/ranking/'

    headers = {
        "User-Agent":
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36 Edge/18.18362'
    }

    page = requests.get(URL, headers=headers)

    soup = BeautifulSoup(page.content, 'html.parser')

    linksPaginacao = soup.find(class_="pagination pagination-sm")
    for link in linksPaginacao.find_all("a"):
        URL = 'https://footballdatabase.com' + link["href"]
        print(URL)
        page = requests.get(URL, headers=headers)
        soup = BeautifulSoup(page.content, 'html.parser')
        tabelaDeDados = soup.find(class_="table-responsive")
        for linha in tabelaDeDados.find_all("tr"):
            rank = linha.find_all(class_="rank")
            time = linha.find(class_="limittext")
            pais = linha.find(class_="sm_logo-name")
            if time is not None:
                print(rank[0].get_text() + " - " + time.get_text() + " (" +
                      pais.get_text() + ")" + " ~ " + rank[1].get_text())
Example #7
def kg_spider(url):
    rsp = requests.get(url, headers=headers)
    soup = BeautifulSoup(rsp.text, 'lxml')

    nums = soup.select('.pc_temp_num')
    titles = soup.select('.pc_temp_songlist > ul > li > a')
    times = soup.select('.pc_temp_time')
    for num, title, time in zip(nums, titles, times):
        data = {
            # chart position
            'num': num.get_text().strip(),
            # song title
            'song': title.get_text().split('-')[-1].strip(),
            # singer
            'singer': title.get_text().split('-')[0].strip(),
            # track length
            'time': time.get_text().strip()
        }
        songs_id = songs.insert_one(data).inserted_id
        print(songs_id)
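`songs` (also used in Example #12 below) behaves like a pymongo collection. A possible setup, with assumed connection details:

import pymongo

# Assumed MongoDB setup; 'songs' must support insert_one() as called above.
client = pymongo.MongoClient('localhost', 27017)
songs = client['music']['songs']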
Example #8
def get_zhaopin(page_num):
    url = "http://sou.zhaopin.com/jobs/searchresult.ashx"
    querystring = {
        "jl": "%e5%8c%97%e4%ba%ac",
        "kw": "python",
        "sm": "0",
        "sg": "2d8d7bd1731e4c06a4fbbb6aa50d7eb6",
        "p": page_num
    }
    print("第{}页".format(page_num))
    response = requests.request("GET",
                                url,
                                headers=headers,
                                params=querystring).content

    soup = BeautifulSoup(response, 'lxml')

    job_name = soup.select("div#newlist_list_content_table td.zwmc a")
    salarys = soup.select("div#newlist_list_content_table td.zwyx")
    locations = soup.select("div#newlist_list_content_table td.gzdd")
    times = soup.select("div#newlist_list_content_table td.gxsj span")

    for name, salary, location, time in zip(job_name, salarys, locations,
                                            times):
        data = {
            '职位名称': name.get_text(),
            '薪资范围': salary.get_text(),
            '工作地点': location.get_text(),
            '更新时间': time.get_text(),
        }
        print(data)
        # one JSON object per line so the file stays parseable
        with open("Python招聘职位信息.json", "a", encoding="utf-8") as f:
            f.write(json.dumps(data, ensure_ascii=False) + "\n")
Example #9
def forbes_fetch_links(string):
    url = "http://www.forbes.com"
    pages = []
    to_return = []
    pages.append(url + "/search/?q=" + '"' + string.replace(' ', '+') + '"')
    soup = crawl(pages[0])
    pages.pop()
    for page in soup.find_all('li', attrs={'class': 'page'}):
        page = page.findChild('a')
        try:
            pages.append(url + page.get('href'))
        except (AttributeError, TypeError):
            pass
    pages.reverse()
    while pages:
        soup = crawl(pages.pop())
        links = soup.find_all('h2')
        times = soup.find_all('time', attrs={'class': 'date'})
        for link, time in zip(links, times):
            link = link.findChild('a')
            try:
                # keep only articles dated 2008 or earlier
                if int(time.get_text()[-4:]) <= 2008:
                    to_return.append(link.get('href'))
            except (ValueError, AttributeError):
                pass
    return to_return
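`crawl` is an external helper that evidently fetches a URL and returns a parsed page. A plausible implementation (the User-Agent and timeout are assumptions):

import requests
from bs4 import BeautifulSoup

def crawl(url):
    # Fetch the page and hand back a parsed tree for the caller to query.
    headers = {'User-Agent': 'Mozilla/5.0'}
    resp = requests.get(url, headers=headers, timeout=10)
    return BeautifulSoup(resp.text, 'lxml')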
Example #10
def get_attractions(url):
    import time
    wb_data = requests.get(url)
    time.sleep(2)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    titles = soup.select("body > div.main.auto-width > ul > li > a > p.g-name")
    persons = soup.select(
        "body > div.main.auto-width > ul > li > a > p.performerName")
    prices = soup.select(
        "body > div.main.auto-width > ul > li > a > p.g-price > b")
    times = soup.select("body > div.main.auto-width > ul > li > a > p.g-time")
    places = soup.select(
        "body > div.main.auto-width > ul > li > a > p.g-place.a-link")
    images = soup.select(
        "body > div.main.auto-width > ul > li > a > div > img")
    print(images)
    for title, person, price, time, place, img in zip(titles, persons, prices,
                                                      times, places, images):
        data = {
            'title': title.get_text().replace("\t", "").replace("\n", "")
                     .replace("独家", "").replace("【秀动呈献】", ""),
            'person': person.get_text().replace("\t", "").replace("\n", ""),
            'price': price.get_text(),
            'time': time.get_text().replace("\t", "").replace("\n", ""),
            'place': place.get_text(),
            'img': img.get('original')
        }
        print(data)
Example #11
def get_info(url):
    wb_data = requests.get(url,headers=headers)
    print(wb_data)
    wb_data.encoding = wb_data.apparent_encoding
    soup = BeautifulSoup(wb_data.text,'lxml')
    # rankWrap > div.pc_temp_songlist > ul > li:nth-child(1) > span.pc_temp_num
    # Comparing the copied selectors shows they share the same tail,
    # so a partial selector is enough
    ranks = soup.select('span.pc_temp_num')
    # for each in ranks:
    #     print(each.text.strip())  # strip() removes the surrounding whitespace and newlines
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    # for each in titles:
    #     print(each.get('title'))
    times = soup.select('span.pc_temp_tips_r > span')
    # for each in times:
    #     print(each.text.strip())
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank':rank.text.strip(),
            'singer':title.text.split('-')[0],
            'song':title.text.split('-')[1],
            'time':time.get_text().strip(),
        }
        print(data)
Example #12
def kugou_spider(url):
    '''Fetch the KuGou Music TOP500 and save it to MongoDB.'''
    rsp = requests.get(url, headers=headers)
    soup = BeautifulSoup(rsp.text, 'lxml')
    # chart ranks
    ranks = soup.select('.pc_temp_num')
    titles = soup.select('.pc_temp_songlist > ul > li > a')
    times = soup.select('.pc_temp_time')

    for rank, title, time in zip(ranks, titles, times):
        # song rank
        rank = rank.get_text().strip()
        # song title
        song = title.get_text().split('-')[-1].strip()
        # singer
        singer = title.get_text().split('-')[0].strip()
        # track length
        song_time = time.get_text().strip()

        data = {
            'rank': rank,
            'song': song,
            'singer': singer,
            'song_time': song_time
        }
        songs_id = songs.insert_one(data).inserted_id
        print(songs_id)
Example #13
def get_position_results(url):
    ab_urls = 'https://www.zhipin.com'
    headers = {
        'Cookie':
        'lastCity=101210100; JSESSIONID=""; __c=1533020202; __g=-; __l=l=%2Fwww.zhipin.com%2F&r=https%3A%2F%2Fwww.google.com%2F; Hm_lvt_194df3105ad7148dcf2b98a91b5e727a=1532073970,1532323266,1532326685,1533020202; Hm_lpvt_194df3105ad7148dcf2b98a91b5e727a=1533021779; __a=15341501.1528789420.1532323266.1533020202.44.5.24.44; toUrl=https%3A%2F%2Fwww.zhipin.com%2Fc101210100%2Fh_101210100%2F%3Fquery%3D%25E6%2595%25B0%25E6%258D%25AE%25E6%258C%2596%25E6%258E%2598%26page%3D2',
        'user-agent':
        'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/67.0.3396.99 Safari/537.36'
    }
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    job_title = soup.select('div[class="job-title"]')
    job_salary = soup.select('span[class="red"]')
    job_info = soup.select('div.info-primary > p')
    company = soup.select('div.company-text > h3.name > a')
    company_info = soup.select('div.company-text > p')
    publish_time = soup.select('div.info-publis > p')
    details = soup.select('div.info-primary > h3.name > a')

    position_results = []
    for title, salary, info, comp, comp_info, time, detail in zip(
            job_title, job_salary, job_info, company, company_info,
            publish_time, details):
        title = title.get_text()
        salary = salary.get_text()
        job_info = info.get_text()
        company = comp.get_text()
        company_info = comp_info.get_text()
        publish_time = time.get_text()
        job_url = ab_urls + detail.get('href')
        lst = [
            title, salary, job_info, company, company_info, publish_time,
            job_url
        ]
        position_results.append(lst)
    return position_results
Example #14
def get_zhaopin_1(page):
    url = 'http://sou.zhaopin.com/jobs/searchresult.ashx?jl=深圳&kw=python&p={0}&kt=3'.format(
        page)
    print("第{0}页".format(page))
    wbdata = requests.get(url).content
    soup = BeautifulSoup(wbdata, 'lxml')

    job_name = soup.select("table.newlist > tr > td.zwmc > div > a")
    salarys = soup.select("table.newlist > tr > td.zwyx")
    locations = soup.select("table.newlist > tr > td.gzdd")
    times = soup.select("table.newlist > tr > td.gxsj > span")

    for name, salary, location, time in zip(job_name, salarys, locations,
                                            times):

        url = name['href']
        wbdata = requests.get(url).content
        soup = BeautifulSoup(wbdata, 'lxml')
        gsmc = soup.select('div.inner-left > h2')
        if len(gsmc) > 0:
            company = gsmc[0].get_text()
        else:
            company = ''

        data = {
            'name': name.get_text(),
            'company': company,
            'salary': salary.get_text(),
            'location': location.get_text(),
            'time': time.get_text(),
            'url': name['href']
        }
        print(data)
Example #15
def get_info(url):
    db = pymysql.connect(host='127.0.0.1',
                         user='******',
                         password='******',
                         port=3306,
                         db='spiders')
    cursor = db.cursor()
    sql = 'create table if not exists kugoulist(ranks VARCHAR(255) NOT NULL, singer VARCHAR(255) NOT NULL, song VARCHAR(255) NOT NULL, time VARCHAR(255) NOT NULL)'
    cursor.execute(sql)
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('#rankWrap > div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'ranks': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': time.get_text().strip()
        }

        table = 'kugoulist'
        keys = ','.join(data.keys())
        values = ','.join(['%s'] * len(data))
        sql = 'INSERT INTO {table}({keys}) VALUES ({values})'.format(
            table=table, keys=keys, values=values)
        try:
            if cursor.execute(sql, tuple(data.values())):
                print('Successful')
                db.commit()
        except Exception:
            print('Failed')
            db.rollback()
    db.close()
Example #16
def get_page(html):
    soup = BeautifulSoup(html, 'lxml')
    for content, time in zip(soup.find_all('span', class_="ctt"), soup.find_all('span', class_="ct")):
        print(content.get_text(), time.get_text())
        # use a parameterized query so quotes in the scraped text cannot break the SQL
        query = 'insert into content(微博内容, 发布时间) values(%s, %s)'
        cur.execute(query, (content.get_text(), time.get_text()))
    conn.commit()
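`cur` and `conn` come from module scope in the original. A hedged pymysql setup matching the `content` table used above (credentials, database name, and column types are assumptions):

import pymysql

# Assumed connection parameters and schema; adjust to the real database.
conn = pymysql.connect(host='127.0.0.1', user='root', password='',
                       db='weibo', charset='utf8mb4')
cur = conn.cursor()
cur.execute('create table if not exists content('
            '微博内容 text, 发布时间 varchar(255))')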
Example #17
def kg_spider(url):
    """
    获取酷狗音乐top500, 保存至mongodb
    :param url:  请求地址
    :return:
    """
    data = {}
    res = requests.get(url, headers=headers)
    # print(res.text)
    soup = BeautifulSoup(res.text, "lxml")
    ranks = soup.select(".pc_temp_num")
    # print(ranks)
    titles = soup.select(".pc_temp_songlist > ul > li > a")
    # print(titles)
    times = soup.select(".pc_temp_time")
    # print(times)
    for rank, title, time in zip(ranks, titles, times):
        rank = rank.get_text().strip()
        # song title
        song = title.get_text().split("-")[-1].strip()

        # singer
        singer = title.get_text().split("-")[0].strip()

        href = title["href"]
        req = requests.get(href, headers=headers)
        soup = BeautifulSoup(req.text, "lxml")
        mp3s = soup.select(".mainPage")
        print(mp3s)
        # select() returns a list, so index the first hit before reading 'src'
        mp3 = mp3s[0]["src"] if mp3s else None
        print(mp3)
        song_time = time.get_text().strip()
        # print(rank, song, singer, song_time)

        data = {
            "rank": rank,
            "singer": singer,
            "song": song,
            "time": song_time,
            "href": href,
        }

        # songs_id = songs.insert_one(data)
    return data
Example #18
def get_info(url):  # fetch the page info
    web_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(web_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')

    for rank, title, time in zip(ranks, titles, times):
        data = {'rank': rank.get_text().strip(),
                'singer': title.get_text().split('-')[0],
                'song': title.get_text().split('-')[1],
                'time': time.get_text().strip()}
        print(data)
Example #19
def get_info(url):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'title': title.get_text().strip(),
            'time': time.get_text().strip()
        }
        print(data)
Example #20
def get_info(url):
    wb_data = requests.get(url, headers=headers)  # headers must be a keyword argument; the second positional argument is params
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],  # split() separates singer and song
            'time': time.get_text().strip()
        }
        print(data)  # print each scraped record as a dict
Example #21
def get_info(url):
    wb_data = requests.get(url)
    soup = BeautifulSoup(wb_data.text, 'html.parser')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': time.get_text().strip()
        }
        print(data)
Example #22
def get_info(url):
    wb_data = requests.get(url, headers=comm.headers)
    soup = BeautifulSoup(wb_data.text, "lxml")
    ranks = soup.select("span.pc_temp_num")
    titles = soup.select("div.pc_temp_songlist > ul > li >a")
    times = soup.select("span.pc_temp_tips_r > span")
    for rank, title, time in zip(ranks, titles, times):
        data = {
            "rank": rank.get_text().strip(),
            "singer": get_song(title.get_text().split('-'), 0),
            "song": get_song(title.get_text().split('-'), 1),
            "time": time.get_text().strip()
        }
        print("结果:", data)
Example #23
def get_info(url):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select("span.pc_temp_num")
    titles = soup.select("#rankWrap > div.pc_temp_songlist > ul > li")
    times = soup.select("span.pc_temp_tips_r > span")
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get("title").split("-")[0],
            'song': title.get("title").split("-")[1],
            'time': time.get_text().strip()
        }
        print(data)
Example #24
def get_info(url, counts):
    """Scrape the target elements from one page and write them out."""
    try:
        # fetch the page; requests falls back to ISO-8859-1 when the
        # response headers declare no charset
        wb_data = requests.get(url, headers=headers)
        if wb_data.encoding == 'ISO-8859-1':
            r = wb_data.text.encode('ISO-8859-1').decode(
                requests.utils.get_encodings_from_content(
                    wb_data.text)[0])  # re-decode using the charset declared in the page body
            soup = BeautifulSoup(r, 'lxml')
        else:
            soup = BeautifulSoup(wb_data.text, 'lxml')  # UTF-8
        # grab the target elements
        titles = soup.select(
            'body > section > div > div > div > header > h1 > a')
        genre_films = soup.select(
            'body > section > div > div > div > header > ul > li > a')
        times = soup.select(
            'body > section > div > div > div > header > ul > li:nth-child(2)')
        imgs = soup.select(
            'body > section > div > div > div > article > div.video_box > div.video_img > img'
        )
        url_paths = soup.select(
            'body > section > div > div > div > header > h1 > a')
        video_infos = soup.select(
            'body > section > div > div > div > article > div.video_box > div.video_info'
        )
        for title, genre_film, time, img, url_path, video_info in zip(
                titles, genre_films, times, imgs, url_paths, video_infos):
            data = [
                title.get_text().strip(),
                genre_film.get_text().strip(),
                time.get_text(),
                img.get("src"),
                url_path.get("href"),
                video_info.get_text().replace('\n', ',').replace(' / ', '/')
            ]
            content = "{}{}".format(counts, str(data))  # stringify the record for writing
            out_put_file(counts, content)  # call the file-writing helper
    except Exception:
        print('Error,pass,get_info')
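`out_put_file` is defined elsewhere in the original script. A minimal sketch that appends each numbered record to a text file (the filename is a guess):

def out_put_file(counts, content):
    # Append one record per line; 'output.txt' is an assumed filename.
    with open('output.txt', 'a', encoding='utf-8') as f:
        f.write(content + '\n')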
Example #25
def get_info(url):
    response = requests.get(url, headers=headers)
    soup = BeautifulSoup(response.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    print(titles)
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': time.get_text().strip()
        }
        print(data)
Example #26
def get_info(links):
    for link in links:
        r = requests.get(link, headers=headers)
        soup = BeautifulSoup(r.text, 'lxml')
        titles = soup.select('div.pc_temp_songlist > ul > li > a')
        ranks = soup.select('span.pc_temp_num')
        times = soup.select('span.pc_temp_time')
        for rank, title, time in zip(ranks, titles, times):
            data = {
                'rank': rank.get_text().strip(),
                'singer': title.get_text().strip().split('-')[0],
                'name': title.get_text().strip().split('-')[1],
                'time': time.get_text().strip()
            }
            print(data)
Example #27
def get_info(url):
    wb_data = requests.get(url, headers=headers)  # fetch the page
    soup = BeautifulSoup(wb_data.text, 'lxml')  # parse it so elements can be filtered and extracted later
    ranks = soup.select('span.pc_temp_num')  # locate the rank elements
    titles = soup.select(
        'div.pc_temp_songlist > ul > li > a')  # locate the song title elements
    times = soup.select('span.pc_temp_tips_r > span')
    for rank, title, time in zip(ranks, titles, times):
        data = {
            "rank": rank.get_text().strip(),
            "singer": title.get_text().split('-')[0],
            "song": title.get_text().split('-')[1],
            "time": time.get_text().strip()
        }
        print(data)
Example #28
def get_rank_info(self, url):
    res = requests.get(url, headers=self.headers)
    res.encoding = self.encoding  # keep the read encoding consistent with the write encoding
    soup = BeautifulSoup(res.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('a.pc_temp_songname')
    times = soup.select('span.pc_temp_time')
    ids = [song['href'].split('/')[-1].split('.')[0]
           for song in soup.find_all('a', 'pc_temp_songname')]
    song_list = [(rank.get_text().strip(),
                  title.get_text().strip().split('-')[0].strip(),
                  title.get_text().strip().split('-')[1].strip(),
                  time.get_text().strip(),
                  song_id) for rank, title, time, song_id in zip(ranks, titles, times, ids)]
    return song_list
Example #29
def top(url):
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, 'lxml')
    nums = soup.select('.pc_temp_num')
    titles = soup.select('.pc_temp_songname')
    hrefs = soup.select('.pc_temp_songname')
    times = soup.select('.pc_temp_time')
    # singular loop names so they do not shadow the lists above
    for no, title, time, href in zip(nums, titles, times, hrefs):
        data = {
            'NO': no.get_text().strip(),
            'titles': title.get_text(),
            'time': time.get_text().strip(),
            'href': href.get('href')
        }
        print(data)
Example #30
def get_info(url):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')

    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')
    # zip all three lists; the original looped over ranks alone and left
    # 'title' and 'time' undefined
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': time.get_text().strip()
        }
        print(data)
Example #31
def get_info(url, file):
    res = requests.get(url, headers=headers)
    res.encoding = file.encoding  # keep the read encoding consistent with the output file's
    soup = BeautifulSoup(res.text, 'lxml')
    ranks = soup.select('span.pc_temp_num')
    titles = soup.select('a.pc_temp_songname')
    times = soup.select('span.pc_temp_time')
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'title': title.get_text().strip(),
            'time': time.get_text().strip()
        }
        string = "{: <10}{: <30}{: <10}\n".format(data['rank'], data['title'],
                                                  data['time'])  # fixed-width columns
        file.write(string)
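A possible caller for this version: open the output file once and pass it to every page (the URL pattern is an assumption carried over from the other KuGou examples; `headers` must also exist at module level):

# Hedged usage sketch.
with open('kugou_top500.txt', 'w', encoding='utf-8') as f:
    for page in range(1, 24):
        url = 'https://www.kugou.com/yy/rank/home/{}-8888.html'.format(page)
        get_info(url, f)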
Example #32
def get_info(url):
    wb_data = requests.get(url, headers=headers)
    soup = BeautifulSoup(wb_data.text, 'lxml')
    ranks = soup.select('div.pc_temp_songlist > ul > li > span.pc_temp_num')
    titles = soup.select('div.pc_temp_songlist > ul > li > a')
    times = soup.select('span.pc_temp_tips_r > span')  # this span shares a parent
    # with the previous span, so the selector tail alone is enough
    for rank, title, time in zip(ranks, titles, times):
        data = {
            'rank': rank.get_text().strip(),
            'singer': title.get_text().split('-')[0],
            'song': title.get_text().split('-')[1],
            'time': time.get_text().strip()
        }
        print(data)
        writer.writerow((data['rank'], data['singer'], data['song'], data['time']))
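`writer` is presumably a csv.writer created at module level. A sketch of that setup (the filename and header row are assumptions):

import csv

# Assumed CSV output file; write a header row once before scraping.
fp = open('kugou_top500.csv', 'w', newline='', encoding='utf-8')
writer = csv.writer(fp)
writer.writerow(('rank', 'singer', 'song', 'time'))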
Example #33
def get(url, HEADERS, csv_write, out, proxy_info):
    row = []
    result_req = requests.get(url, headers=HEADERS, proxies=proxy_info)
    resultsoup = BeautifulSoup(result_req.text, features='lxml')
    title = resultsoup.find('h2', {"class": "citation__title"})

    # note the dict (not set) syntax for the attrs argument
    name_find = resultsoup.find('div', {'class': 'accordion-tabbed'})
    abstract_a = resultsoup.find('div',
                                 {"class": "article-section__content en main"})
    time = resultsoup.find('span', {"class": "epub-date"})

    if title is None:
        title = 'null'
        row.append(title)
    else:
        row.append(title.get_text())

    if name_find is None:
        author_list = 'null'
        row.append(author_list)
    else:
        span = name_find.find_all('span')
        author_list = ''
        for i in span:
            name = i.get_text()
            if re.search('E-mail', name) is None:
                if author_list == '':
                    author_list = name
                else:
                    author_list = author_list + ';' + name
            else:
                pass
        row.append(author_list)
    if time is None:
        timeresult = 'null'
        row.append(timeresult)
    else:
        row.append(time.get_text())
    if abstract_a is None:
        abstract_b = 'null'
        row.append(abstract_b)
    else:
        abstract_b = abstract_a.find('p')
        row.append(abstract_b.get_text())
    print(row)
    csv_write.writerow(row)
    out.flush()
Example #34
def get(url, HEADERS, csv_write, out, proxy_info):
    row = []
    result_req = requests.get(url, headers=HEADERS, proxies=proxy_info)
    resultsoup = BeautifulSoup(result_req.text, features='lxml')
    title = resultsoup.find('header', {"class": "publicationContentTitle"})

    name_find = resultsoup.find_all('span', {"class": "contrib-author"})
    abstract_a = resultsoup.find('div',
                                 {"class": "abstractSection abstractInFull"})
    time = resultsoup.find('div',
                           {"class": "publicationContentEpubDate dates"})

    if title is None:
        title = 'null'
        row.append(title)
    else:
        title_a = title.find('h3')
        row.append(title_a.get_text())

    if not name_find:  # find_all() returns an empty list, never None
        author_list = 'null'
        row.append(author_list)
    else:
        author_list = ''
        for i in name_find:
            name = i.find_all('a')
            for j in name:
                if j.get_text() is None:
                    pass
                else:
                    author_list = author_list + ' ' + j.get_text()
        row.append(author_list)
    if time is None:
        timeresult = 'null'
        row.append(timeresult)
    else:
        row.append(time.get_text())
    if abstract_a is None:
        abstract_b = 'null'
        row.append(abstract_b)
    else:
        abstract_b = abstract_a.find('div')
        row.append(abstract_b.get_text())
    print(row)
    csv_write.writerow(row)
    out.flush()
Example #35
    def sniffingThread(self, button, devstore, filter, count, time):
        modele = devstore.get_model()
        est_actif = devstore.get_active()
        if est_actif < 0:
            # no device selected; abort instead of indexing with -1
            button.set_sensitive(True)
            return
        dev = modele[est_actif][0]

        self.log.info(_("Launching sniff process on dev {0} with : count={1}, timeout={2}, filter=\"{3}\"").format(dev, count.get_text(), time.get_text(), filter.get_text()))

        sniffer = pcapy.open_live(dev, 1024, False, int(time.get_text()))

        try:
            sniffer.setfilter(filter.get_text())
        except Exception:
            self.log.warn(_("The provided filter is not valid (it should respect the BPF format)"))
            button.set_sensitive(True)
            return

        sniffer.loop(int(count.get_text()), self.packetHandler)
        button.set_sensitive(True)
# 'soup' is assumed to be the parsed cinema-listings page from earlier in the script
movies = soup.find_all("div", attrs={"class": "single-item single-film"})

for movie in movies:
    movieTitle = movie.find("a", attrs={"class": "filmInfoLink"}).get_text()
    try:
        movieRuntime = movie.find("span", attrs={"class": "runtime"}).get_text().strip()
    except AttributeError:
        movieRuntime = ""
    movieLength = movie.find("span", attrs={"class": "length"}).get_text().strip()
    movieGenre = movie.find("span", attrs={"class": "genre"}).get_text().strip()
    print("Movie: " + movieTitle)
    print("Runtime: " + movieRuntime)
    print("Length: " + movieLength)
    print("Genre: " + movieGenre)

    # Times
    runningTimesTable = movie.find("table", attrs={"class": "times times-single-day"})
    runningTimes = runningTimesTable.find_all("tr")

    print("Running Times:")
    print("****************************")
    for row in runningTimes:
        movie2d3d = row.find("th").get_text().strip()
        print(movie2d3d)
        for time in row.find_all("a", attrs={"class": "btn-runningtime"}):
            movieTime = time.get_text().strip()
            print(movieTime)
        print("++++++++++++++++++++++++++++")
    print("============================\n")