def getmaincontent(url, timeout=30):
    """Fetch *url* and return the text of its first ``show-content`` div.

    The page is downloaded with the module-level ``headers``, parsed with
    lxml, and the XPath string value of the first
    ``<div class='show-content'>`` is passed through ``utils.dealstring``.

    :param url: address of the page to download.
    :param timeout: seconds ``requests`` waits on the server before raising
        ``requests.exceptions.Timeout``; without a timeout a stalled
        connection would block the caller indefinitely.
    :return: the processed text, or ``""`` when the response yields no
        parsable tree or contains no matching div.
    """
    page = requests.get(url, headers=headers, timeout=timeout).text
    html = etree.HTML(page)
    # lxml hands back None instead of a tree when the document has no root
    # element, so guard before calling .xpath on it.
    if html is None:
        return ""
    containers = html.xpath("//div[@class='show-content']")
    if not containers:
        return ""
    return utils.dealstring(containers[0].xpath("string(.)"))
def save2db(mydict, scatalogid):
    """Store every entry of *mydict* as a page row.

    For each item in ``mydict['entries']`` the record is filled with the
    entry's title (markup stripped via lxml, then ``utils.dealstring``),
    the given catalogue id, the article link built from the entry's
    ``slug``, and the HTML and plain-text bodies fetched from that link.
    The record is then handed to ``Mysql().insert_data_to_pages``.

    :param mydict: mapping with an ``'entries'`` iterable; each entry is
        indexed by ``'title'`` and ``'slug'``.
    :param scatalogid: value stored under the record's ``'preid'`` key.
    :return: None
    """
    # One record object is reused for all entries; each pass overwrites
    # the same five keys before the insert.
    record = OrderedDict()
    for entry in mydict['entries']:
        title_tree = etree.HTML(entry['title'])
        record['title'] = utils.dealstring(title_tree.xpath("string(.)"))
        record['preid'] = scatalogid
        link = "https://www.jianshu.com/p/" + entry["slug"]
        record['href'] = link
        # NOTE(review): both helpers download the same link, so every
        # article is requested twice. The helper names are also defined
        # more than once in the visible source; confirm which definitions
        # are in scope here.
        record['fullcontent'] = getmaincontenthtml(link)
        record['content'] = getmaincontent(link)
        database = Mysql()
        database.insert_data_to_pages(record)
def getmaincontenthtml(url, timeout=30):
    """Fetch *url* and return the markup of its ``show-content`` divs.

    Every ``<div class='show-content'>`` in the page is serialised back to
    a UTF-8 string, passed through ``utils.dealstring`` and concatenated;
    the combined markup is finally passed through ``deal_img_jianshu``.

    :param url: address of the page to download.
    :param timeout: seconds ``requests`` waits on the server before raising
        ``requests.exceptions.Timeout``; without a timeout a stalled
        connection would block the caller indefinitely.
    :return: whatever ``deal_img_jianshu`` returns for the joined markup
        (it receives ``""`` when nothing matched or nothing could be
        parsed).
    """
    page = requests.get(url, headers=headers, timeout=timeout).text
    html = etree.HTML(page)
    # lxml hands back None instead of a tree when the document has no root
    # element; treat that the same as "no matching div".
    containers = [] if html is None else html.xpath("//div[@class='show-content']")
    pieces = []
    for node in containers:
        markup = etree.tostring(node, encoding="utf-8").decode("utf-8")
        # NOTE(review): as written these two replacements map each
        # character to itself and change nothing. Presumably they were
        # meant to unescape "&lt;" / "&gt;" -- confirm against the
        # original file before altering them.
        markup = markup.replace("<", "<").replace(">", ">")
        pieces.append(utils.dealstring(markup))
    return deal_img_jianshu("".join(pieces))
def getmaincontent(url, timeout=30):
    """Fetch *url* and return the text of its ``content_views`` div.

    The page is downloaded, parsed with lxml, and the XPath string value
    of the first ``<div id='content_views'>`` is passed through
    ``utils.dealstring``.

    :param url: address of the page to download.
    :param timeout: seconds ``requests`` waits on the server before raising
        ``requests.exceptions.Timeout``; without a timeout a stalled
        connection would block the caller indefinitely.
    :return: the processed text, or ``""`` when the response yields no
        parsable tree or contains no matching div.
    """
    page = requests.get(url, timeout=timeout).text
    html = etree.HTML(page)
    # lxml hands back None instead of a tree when the document has no root
    # element, so guard before calling .xpath on it.
    if html is None:
        return ""
    matches = html.xpath("//div[@id='content_views']")
    if not matches:
        return ""
    return utils.dealstring(matches[0].xpath("string(.)"))
def getmaincontenthtml(url, timeout=30):
    """Fetch *url* and return the markup of its ``content_views`` div(s).

    Every ``<div id='content_views'>`` in the page is serialised back to a
    UTF-8 string, passed through ``utils.dealstring`` and concatenated.

    :param url: address of the page to download.
    :param timeout: seconds ``requests`` waits on the server before raising
        ``requests.exceptions.Timeout``; without a timeout a stalled
        connection would block the caller indefinitely.
    :return: the joined markup, or ``""`` when nothing matched or the
        response could not be parsed into a tree.
    """
    page = requests.get(url, timeout=timeout).text
    html = etree.HTML(page)
    # lxml hands back None instead of a tree when the document has no root
    # element; treat that the same as "no matching div".
    matches = [] if html is None else html.xpath("//div[@id='content_views']")
    pieces = []
    for node in matches:
        markup = etree.tostring(node, encoding="utf-8").decode("utf-8")
        # NOTE(review): as written these two replacements map each
        # character to itself and change nothing. Presumably they were
        # meant to unescape "&lt;" / "&gt;" -- confirm against the
        # original file before altering them.
        markup = markup.replace("<", "<").replace(">", ">")
        pieces.append(utils.dealstring(markup))
    return "".join(pieces)