Example #1
def get_yike_detail(category, id):
    url = const.v2_base_categories['yike']['detail'] + id
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    content_list = []
    content = soup.find('div', attrs={'id': 'content'})
    content_soup = BeautifulSoup(str(content), 'html.parser')
    items = content_soup.find_all(name=['p', 'img'])
    for item in items:
        txt = ''
        img = ''
        txt_cen = ''
        if item.name == 'p':
            # Captions carry the "img_desc" class; any other <p> is body text.
            if item.get('class') == ['img_desc']:
                txt_cen = item.text
            else:
                txt = item.text
        elif item.name == 'img':
            img = item['src']
        if txt != '' or img != '' or txt_cen != '':
            info = const.get_default_content_item()
            info['txt'] = txt
            info['img'] = img
            info['txt_cen'] = txt_cen
            content_list.append(info)

    result = const.get_default_detail_item()
    result['content'] = content_list
    return result
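All of these examples lean on `requests`, `BeautifulSoup`, `urlopen`, `json`, and a shared `const` module whose imports are not shown in the snippets. The sketch below is a hypothetical reconstruction of those pieces, inferred only from how the functions use them; the base URL and template fields are assumptions, with placeholder values marked as such.

# Imports the detail scrapers rely on (not shown in the snippets).
import json
from urllib.request import urlopen

import requests
from bs4 import BeautifulSoup

# const.py (hypothetical sketch): per-source base URLs and empty result templates.
v2_base_categories = {
    'yike': {'detail': 'https://example.com/yike/article/'},  # placeholder URL, not the real endpoint
    # ... one entry per source ('juzi', '36kr', 'qdaily', 'zhihu', 'sspai', 'mark', 'tmt', 'one')
}

def get_default_content_item():
    # One block of parsed article content: heading, text, centered text, image, list item.
    return {'h': '', 'txt': '', 'txt_cen': '', 'img': '', 'li': ''}

def get_default_detail_item():
    # The envelope every get_*_detail() function fills in and returns.
    return {'title': '', 'subtitle': '', 'author': '', 'date': '', 'content': []}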
Example #2
def get_juzi_detail(category, id):
    url = const.v2_base_categories['juzi']['detail'] + id + ".html"
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')

    content_list = []
    content = soup.find('article', attrs={'id': 'juzi_info'})
    content_soup = BeautifulSoup(str(content), 'html.parser')
    items = content_soup.find_all(name=['p', 'img', 'h1'])
    for item in items:
        txt = ''
        img = ''
        h = ''
        if item.name == 'p':
            txt = item.text.strip()
        elif item.name == 'img':
            img = item['data-original']
        elif item.name == 'h1':
            h = item.text
        if txt != '' or img != '' or h != '':
            info = const.get_default_content_item()
            info['h'] = h
            info['txt'] = txt
            info['img'] = img
            content_list.append(info)
    result = const.get_default_detail_item()
    result['content'] = content_list
    return result
Example #3
def get_36kr_detail(category, id):
    url = const.v2_base_categories['36kr']['detail'] + id
    response = requests.get(url).json()['data']
    title = response['catch_title']
    content = response['content']
    content_list = []
    content_soup = BeautifulSoup(str(content), 'html.parser')
    items = content_soup.find_all(name=['p', 'img', 'h3'])
    for item in items:
        h = ''
        txt = ''
        img = ''
        if item.name == 'h3':
            h = item.text
        elif item.name == 'p':
            txt = item.text
        elif item.name == 'img':
            img = item['src']
        if h != '' or txt != '' or img != '':
            info = const.get_default_content_item()
            info['h'] = h
            info['txt'] = txt
            info['img'] = img
            content_list.append(info)

    result = const.get_default_detail_item()
    result['title'] = title
    result['content'] = content_list
    return result
Example #4
def get_qdaily_detail(category, id):
    url = const.v2_base_categories['qdaily']['detail'] + id + '.html'
    response = requests.get(url)
    response.encoding = 'utf-8'
    soup = BeautifulSoup(response.text, 'html.parser')
    head = soup.find('div', attrs={'class': 'article-detail-hd'})
    head_soup = BeautifulSoup(str(head), 'html.parser')
    title = head_soup.find('h1', attrs={'class': 'title'}).text
    author = head_soup.find('div', attrs={'class': 'author'}).text
    date = head_soup.find('span', attrs={'class': 'date'}).text
    content = soup.find('div', attrs={'class': 'article-detail-bd'})
    content_soup = BeautifulSoup(str(content), 'html.parser')
    items = content_soup.find_all(name=['p', 'img', 'h3', 'h2', 'li'])

    content_list = []
    for item in items:
        h = ''
        txt = ''
        img = ''
        li = ''
        if item.name == 'h3' or item.name == 'h2':
            h = item.text
        elif item.name == 'li':
            li = item.text.strip()
        elif item.name == 'p':
            txt = item.text
        elif item.name == 'img':
            img = item['data-src']
        # Drop stray style rules (text starting with "p.p1") that sometimes leak into the article body.
        if txt.strip()[:4] == 'p.p1':
            txt = ''
        if h != '' or txt != '' or img != '' or li != '':
            info = const.get_default_content_item()
            info['h'] = h
            info['txt'] = txt
            info['img'] = img
            info['li'] = li
            content_list.append(info)

    result = const.get_default_detail_item()
    result['title'] = title
    result['author'] = author
    result['date'] = date
    result['content'] = content_list
    return result
Example #5
def get_zhihu_detail(category, id):
    url = const.v2_base_categories['zhihu']['detail'] + id
    response = urlopen(url)
    data = json.loads(response.read().decode())
    content = data['body']
    soup = BeautifulSoup(content, 'html.parser')

    content_list = []
    titles = soup.find_all('h2', attrs={'class': 'question-title'})
    for title_tag in titles:
        title = title_tag.text
        if title == '':
            continue
        info = const.get_default_content_item()
        info['h'] = title
        content_list.append(info)

    contents = soup.find_all('div', attrs={'class': 'content'})
    for content in contents:
        content_soup = BeautifulSoup(str(content), 'html.parser')
        items = content_soup.find_all(name=['p', 'img', 'h2', 'blockquote'])
        for item in items:
            txt = ''
            img = ''
            h = ''
            if item.name == 'p' or item.name == 'blockquote':
                txt = item.text
            elif item.name == 'h2':
                h = item.text
            elif item.name == 'img':
                img = item['src']
            if txt != '' or img != '' or h != '':
                info = const.get_default_content_item()
                info['h'] = h
                info['txt'] = txt
                info['img'] = img
                content_list.append(info)

    result = const.get_default_detail_item()
    result['content'] = content_list
    return result
Example #6
def get_sspai_detail(category, id):
    url = const.v2_base_categories['sspai']['detail'] + id
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.find('article')
    content_soup = BeautifulSoup(str(content), 'html.parser')
    title = content.h1.text
    head_content = content_soup.find('div', attrs={'class': 'meta'})
    author = head_content.h4.text
    date = head_content.time.text

    content_list = []
    content = content_soup.find('div', attrs={'id': 'article-content'})
    content_soup = BeautifulSoup(str(content.div.div), 'html.parser')
    items = content_soup.find_all(name=['p', 'img', 'h2', 'li', 'h4'])
    for item in items:
        txt = ''
        img = ''
        h = ''
        li = ''
        if item.name == 'h2':
            h = item.text.strip()
        elif item.name == 'p' or item.name == 'h4':
            txt = item.text.strip()
        elif item.name == 'img':
            img = item['src']
        elif item.name == 'li':
            li = item.text.strip()
        if h != '' or txt != '' or img != '' or li != '':
            info = const.get_default_content_item()
            info['txt'] = txt
            info['h'] = h
            info['img'] = img
            info['li'] = li
            content_list.append(info)
    result = const.get_default_detail_item()
    result['title'] = title
    result['author'] = author
    result['date'] = date
    result['content'] = content_list
    return result
Example #7
def get_mark_detail(category, id):
    body = {'uid': '580287', 'muid': 'BLug54qV6Mut0iU9VuPABA==', 'id': str(id)}
    response = requests.post(const.v2_base_categories['mark']['detail'],
                             data=body)
    content = response.json()['data']['content']
    content_soup = BeautifulSoup(content, 'html.parser')

    content_list = []
    content = content_soup.find('div', attrs={'class': 'content'})
    content_soup = BeautifulSoup(str(content), 'html.parser')
    contents = content_soup.find_all(name=['p', 'img'])
    for item in contents:
        img = ''
        txt = ''
        txt_cen = ''
        if item.name == 'img':
            # Images carry the real URL in "data-ke-src"; skip tags without it.
            img = item.get('data-ke-src', '')
        elif item.name == 'p':
            # Centered captions use this exact inline style; everything else is body text.
            if item.get('style') == 'line-height: 25.6px; text-align: center;':
                txt_cen = item.text
            else:
                txt = item.text.strip()
        if img != '' or txt != '' or txt_cen != '':
            info = const.get_default_content_item()
            info['img'] = img
            info['txt'] = txt
            info['txt_cen'] = txt_cen
            content_list.append(info)
    result = const.get_default_detail_item()
    result['content'] = content_list
    return result
Example #8
def get_tmt_detail(category, id):
    url = const.v2_base_categories['tmt']['detail'] + id + '.html'
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    content = soup.find('div', attrs={'class': 'inner'})
    content_soup = BeautifulSoup(str(content), 'html.parser')

    content_list = []
    items = content_soup.find_all(name=['p', 'img', 'h2'])
    for item in items:
        txt = ''
        img = ''
        h = ''
        if item.name == 'p':
            txt = item.text.strip()
            # Skip paragraphs that merely repeat the previous entry's text.
            if content_list and txt == content_list[-1]['txt']:
                txt = ''
        elif item.name == 'h2':
            h = item.text
            # If the previous paragraph duplicated this heading, drop it.
            if content_list and h == content_list[-1]['txt']:
                content_list.pop()
        elif item.name == 'img':
            img = item['src']
        if txt != '' or h != '' or img != '':
            info = const.get_default_content_item()
            info['h'] = h
            info['img'] = img
            info['txt'] = txt
            content_list.append(info)
    result = const.get_default_detail_item()
    result['content'] = content_list
    return result
Example #9
def get_one_detail(category, id):
    if category == '1':
        url = const.v2_base_categories['one']['detail_article'] + id
    elif category == '2':
        url = const.v2_base_categories['one']['detail_serial'] + id
    elif category == '3':
        url = const.v2_base_categories['one']['detail_question'] + id
    elif category == '4':
        url = const.v2_base_categories['one']['detail_music'] + id
    elif category == '5':
        url = const.v2_base_categories['one']['detail_movie'] + id
    else:
        url = ''
    response = requests.get(url)
    soup = BeautifulSoup(response.text, 'html.parser')
    title = soup.find(name=['p', 'div'], attrs={
        'class': 'text-title'
    }).text.strip()
    try:
        subtitle = soup.find('p', attrs={
            'class': 'text-subtitle'
        }).text.strip()
    except AttributeError:
        subtitle = ''
    try:
        author = soup.find(name=['p', 'div'],
                           attrs={
                               'class': ['text-simple-author', 'text-author']
                           }).text.strip()
    except AttributeError:
        author = ''

    contents = soup.find_all('div', attrs={'class': 'text-content'})
    editors = soup.find_all('p', attrs={'class': 'text-editor'})
    content_list = []
    music_info = soup.find('div', attrs={'class': 'text-music-info'})
    if music_info is not None:
        info = const.get_default_content_item()
        info['txt_cen'] = music_info.text.strip()
        content_list.append(info)

    if len(contents) > 1:
        content = contents[1]
        asker = soup.find('div', attrs={'class': 'text-askers'}).text.strip()
        info = const.get_default_content_item()
        info['txt_cen'] = asker
        content_list.append(info)
        question = contents[0].text
        info = const.get_default_content_item()
        info['txt'] = question
        content_list.append(info)
        answer = soup.find('div', attrs={'class': 'text-answers'}).text.strip()
        info = const.get_default_content_item()
        info['txt_cen'] = answer
        content_list.append(info)
    else:
        content = contents[0]

    content_soup = BeautifulSoup(str(content), 'html.parser')
    items = content_soup.find_all(name=['p', 'img'])[1:]
    for item in items:
        txt = ''
        img = ''
        txt_cen = ''
        if item.name == 'p':
            # Centered paragraphs become txt_cen; unstyled paragraphs are body text;
            # other styled paragraphs are skipped.
            style = item.get('style')
            if style == 'text-align: center;':
                txt_cen = item.text
            elif style is None:
                txt = item.text.strip()
        elif item.name == 'img':
            img = item['src']
        # Skip the site's music copyright banner images.
        if img in ('http://image.wufazhuce.com/music_copyright_2_2.png',
                   'http://image.wufazhuce.com/music_copyright_1.png'):
            continue
        if txt != '' or img != '' or txt_cen != '':
            info = const.get_default_content_item()
            info['txt'] = txt
            info['img'] = img
            info['txt_cen'] = txt_cen
            content_list.append(info)
    for editor_tag in editors:
        editor = editor_tag.text
        info = const.get_default_content_item()
        info['txt_cen'] = editor
        content_list.append(info)
    result = const.get_default_detail_item()
    result['title'] = title
    result['subtitle'] = subtitle
    result['author'] = author
    result['content'] = content_list
    return result
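Every function above returns the same detail envelope, so callers can treat the scrapers uniformly. A hypothetical invocation (the category and id values below are made up for illustration) might look like:

detail = get_qdaily_detail(category='0', id='12345')  # hypothetical id
print(detail['title'], detail['author'], detail['date'])
for block in detail['content']:
    # Each appended block carries one non-empty field: 'h', 'txt', 'txt_cen', 'img', or 'li'.
    print(block)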