Beispiel #1
0
def _firstText(nodes):
    """Return ``.text`` of the first element in *nodes*, or ``""``.

    Both an empty result list and an element whose ``.text`` is ``None``
    yield the empty string, so the value can always be ``.strip()``-ed.
    """
    if not nodes:
        return ""
    return nodes[0].text or ""


def getGpListNews(type, url):
    """Scrape one news listing page and pass every entry to ``getDetailInfo``.

    The page at *url* is fetched with ``UrlUtil.parse_url`` and parsed with
    ``etree.HTML``.  Every ``ul/li`` below the first
    ``<div class="repeatList">`` is turned into a dict with the keys
    ``href``, ``icon``, ``title``, ``desc``, ``time`` and ``type``.  All
    entries are collected first; only afterwards is ``getDetailInfo`` called
    once per entry, in page order.

    Entries without a title paragraph or without a link inside it are
    skipped, because there is no detail URL to follow for them.  Missing or
    text-less info/time paragraphs produce ``""`` instead of aborting the
    whole listing.

    Args:
        type: Category label stored unchanged under ``"type"`` in every
            entry.  The name shadows the builtin but is kept so existing
            keyword callers continue to work.
        url: Address of the listing page.

    Returns:
        None.  Nothing is processed when the page has no ``repeatList``
        container.
    """
    htmlContent = etree.HTML(UrlUtil.parse_url(url))

    # Run the container query once; an empty result means there is no list.
    containers = htmlContent.xpath(".//div[@class='repeatList']")
    if not containers:
        return

    orglists = []
    for item in containers[0].xpath('.//ul/li'):
        titleParagraphs = item.xpath('.//p[@class="title"]')
        if not titleParagraphs:
            continue
        titleParagraph = titleParagraphs[0]

        links = titleParagraph.xpath('.//a/@href')
        if not links:
            # Without a link there is nothing for getDetailInfo to fetch.
            continue

        icon = ""
        iconSources = item.xpath('.//div/a/img/@src')
        if iconSources:
            icon = str(iconSources[0])
            # Only add the scheme when the source does not already carry
            # one; otherwise the result would be "http:http://...".
            if not icon.startswith(("http://", "https://")):
                icon = 'http:' + icon

        orglists.append({
            "href": str(links[0]),
            "icon": icon,
            "title": _firstText(titleParagraph.xpath('.//a')),
            "desc": _firstText(item.xpath('.//p[@class="info"]')),
            # Stored under "time"; the local name avoids shadowing the
            # ``time`` module.
            "time": _firstText(item.xpath('.//p[@class="time"]')),
            "type": type,
        })

    for org in orglists:
        getDetailInfo(org)
Beispiel #2
0
def getDetailInfo(org):
    """Fetch the article behind ``org["href"]`` and store it in the database.

    The detail page is fetched with ``UrlUtil.parse_url`` and parsed with
    ``etree.HTML``.  When the page contains a ``<div class="Body">``, its
    string value is taken as the article text and, together with the
    stripped metadata from *org*, handed to ``GpDbUtli.insertGpNews``.
    Pages without such a div are ignored.

    Args:
        org: Mapping with the keys ``href``, ``title``, ``icon``, ``type``,
            ``desc`` and ``time``.  A value of ``None`` (for example from an
            element without text) is treated as the empty string.

    Returns:
        None.

    Raises:
        KeyError: If one of the expected keys is missing from *org*.
    """
    # Fixed 2-second pause before every request (presumably to avoid
    # hammering the remote site).
    time.sleep(2)

    detailContent = etree.HTML(UrlUtil.parse_url(org["href"]))
    if not detailContent.xpath('.//div[@class="Body"]'):
        return

    def clean(key):
        # ``.text`` of a parsed element can be None; calling .strip() on it
        # would raise AttributeError and lose the article.
        return (org[key] or "").strip()

    content = str(detailContent.xpath("string(.//div[@class='Body'])"))
    GpDbUtli.insertGpNews(clean("title"), clean("icon"), clean("type"),
                          clean("desc"), content.strip(), clean("time"))
Beispiel #3
0
def getNewsType():
    """Read the news categories from ``baseUrl`` and scrape each listing.

    The page at the module-level ``baseUrl`` is fetched with
    ``UrlUtil.parse_url`` and parsed with ``etree.HTML``.  Every direct child
    of the first ``<ul id="daodu_header">`` that carries a ``data-href``
    attribute yields one category: the child's text is the label and the
    attribute value is the listing URL.  All categories are collected first
    and then passed, in page order, to ``getGpListNews``.

    Children without a ``data-href`` attribute are skipped, since there is
    no listing URL to visit for them.

    Returns:
        None.  Nothing is processed when the header list is absent.
    """
    htmlContent = etree.HTML(UrlUtil.parse_url(baseUrl))

    headers = htmlContent.xpath('.//ul[@id="daodu_header"]')
    if not headers:
        return

    typs = []
    for li in headers[0]:
        hrefs = li.xpath('./@data-href')
        if not hrefs:
            # Indexing [0] here would raise IndexError and abort the run.
            continue
        typs.append({"type": li.text, "href": str(hrefs[0])})

    for entry in typs:
        getGpListNews(entry["type"], entry["href"])
Beispiel #4
0
def getNewsType(types):
    """Collect the picture-box headlines from ``baseUrl`` and process them.

    The page at the module-level ``baseUrl`` is fetched with
    ``UrlUtil.parse_url`` and parsed with ``etree.HTML``.  For every
    ``ul/li`` below the first ``<div id="box_pic">`` a record with the keys
    ``title``, ``href``, ``type`` and ``icon`` is built from the ``<a>``
    child's ``title`` and ``href`` attributes and its ``<img>`` ``src``.
    Each record is then passed to ``getTopDetailInfo``.

    Args:
        types: Value stored unchanged under ``"type"`` in every record.

    Returns:
        None.  Nothing is processed when the page has no ``box_pic`` div.

    Raises:
        IndexError: If a list item lacks one of the three attributes.
    """
    page = etree.HTML(UrlUtil.parse_url(baseUrl))

    boxes = page.xpath('.//div[@id="box_pic"]')
    if not boxes:
        return

    # Every record is built before the first getTopDetailInfo call, so a
    # malformed item fails before any headline has been processed.
    headlines = [
        {
            "title": str(item.xpath('./a/@title')[0]),
            "href": str(item.xpath('./a/@href')[0]),
            "type": types,
            "icon": str(item.xpath('./a/img/@src')[0]),
        }
        for item in boxes[0].xpath('.//ul/li')
    ]

    for headline in headlines:
        getTopDetailInfo(headline)