Code example #1
0
        # NOTE(review): this excerpt begins inside a loop/try whose opening
        # lines are not visible here; `i` is presumably a listing element
        # yielded by an outer loop — confirm against the full file.
        # Follow the article link on the current listing item.
        curitem = i.find('a')
        url = 'http://www.espn.in' + curitem['href']
        newresponse = requests.get(url)
        soup = BeautifulSoup(newresponse.content, 'html.parser')
        # Headline and subtitle are read from fixed positions on the page:
        # the <h1> inside the article header, and the first <p> of the body.
        headline = soup.find(
            "header", "article-header").find("h1").get_text().strip()
        subtitle = soup.find(
            "div", "article-body").find("p").get_text().strip()
        para = soup.find("div", "article-body").find_all("p")
        story = ""
        save = ""  # accumulates comma-separated saved image filenames
        image = (soup.find_all('div', class_='img-wrap'))
        # Download every article image, naming each file by the current
        # timestamp, and record the filenames in `save`.
        # NOTE(review): this inner loop reuses the name `i` from the outer
        # loop; harmless since `i` is not read afterwards, but worth renaming.
        for i in image:
            curitem = i.find('picture')
            curitem = curitem.find('source')
            cur = curitem['srcset']
            curtime = str(time())
            fullfilename = os.path.join('../site/static/', curtime + ".jpg")
            urllib.request.urlretrieve(cur, fullfilename)
            save = save + str(curtime + ".jpg")
            save = save + ","
        # Keep only paragraphs that have direct text content.
        for x in para:
            if x.string is not None:
                story = story + "<p> " + x.get_text().strip() + " </p>"
        # An article with no images or no text counts as a failed scrape.
        if save == "" or story == "":
            raise Exception('No image')
        print('espn')
        push_to_database(headline, subtitle, story, 1, url, 'sports', save)
    except BaseException:
        # Best-effort scraping: any failure on one article is swallowed so
        # the remaining articles can still be processed.
        pass
Code example #2
0
File: huffpost.py  Project: h-sinha/dashfeed
        # NOTE(review): this excerpt begins inside a try-block whose `try:`
        # line is outside this view; the matching handler is at the end.
        # Headline and subtitle come from the page's headline elements.
        headline = (soup.find('h1',
                              class_='headline__title')).get_text().strip()
        subtitle = (soup.find('h2',
                              class_='headline__subtitle')).get_text().strip()
        para = soup.find("div", "post-contents").find_all("p")
        story = ""
        save = ""  # accumulates comma-separated saved image filenames
        # The "eyebrow" span carries the article's section/category label.
        tags = soup.find('span', class_='entry-eyebrow').get_text().strip()
        tags = tags.lower()
        image = (soup.find_all('img', class_='image__src'))
        # Download every inline image, naming each file by the current
        # timestamp, and record the filenames in `save`.
        for i in image:
            cur = i['src']
            curtime = str(time())
            fullfilename = os.path.join('../site/static/', curtime + ".jpg")
            urllib.request.urlretrieve(cur, fullfilename)
            save = save + str(curtime + ".jpg")
            save = save + ","
        # Keep only paragraphs that have direct text content.
        for x in para:
            if x.string is not None:
                story = story + "<p> " + x.get_text().strip() + " </p>"
        # Normalize HuffPost section names to the feed's category names.
        if tags == 'news':
            tags = 'world'
        elif tags == 'tech':
            tags = 'technology'
        # An article with no images or no text counts as a failed scrape.
        if save == "" or story == "":
            raise Exception('No image')
        print('huffingtonpost')
        push_to_database(headline, subtitle, story, 1, url, tags, save)
    except BaseException:
        # Best-effort scraping: swallow any failure for this article.
        pass
Code example #3
0
File: iiit.py  Project: h-sinha/dashfeed
    # Every <h4> heading on the listing page wraps a link to one article.
    links = ret.find_all('h4')
    for heading in links:
        try:
            # Fetch and parse the linked article page.
            anchor = heading.find('a')
            url = anchor['href']
            page = requests.get(url)
            soup = BeautifulSoup(page.content, 'html.parser')
            headline = soup.find('h1').get_text().strip()
            paragraphs = soup.find('div', class_='entry-content').find_all('p')
            subtitle = soup.find('p').get_text().strip()
            images = soup.find('div', class_='feature-image').find_all('img')
            save = ""
            # Download each feature image into the static folder, named by
            # the current timestamp; collect the filenames comma-separated.
            for img in images:
                src = img['src']
                stamp = str(time())
                filename = stamp + ".jpg"
                urllib.request.urlretrieve(
                    src, os.path.join('../site/static', filename))
                save += filename + ","
            # Only paragraphs with direct text content contribute to the body.
            story = "".join(
                "<p> " + p.get_text().strip() + " </p>"
                for p in paragraphs
                if p.string is not None)
            # An article with no images or no text counts as a failed scrape.
            if save == "" or story == "":
                raise Exception('No image')
            print('iiit')
            push_to_database(headline, subtitle, story, 1, url, 'iiit', save)
        except BaseException:
            # Best-effort scraping: a failure on one article is swallowed so
            # the rest of the listing can still be processed.
            pass