# Scrape one ESPN article linked from list item `i` (bound by the enclosing
# loop — TODO confirm against the surrounding function) and push it to the
# database; any per-article failure is skipped (best-effort scraping).
try:
    curitem = i.find('a')
    url = 'http://www.espn.in' + curitem['href']
    newresponse = requests.get(url)
    soup = BeautifulSoup(newresponse.content, 'html.parser')
    headline = soup.find('header', 'article-header').find('h1').get_text().strip()
    subtitle = soup.find('div', 'article-body').find('p').get_text().strip()
    para = soup.find('div', 'article-body').find_all('p')
    # Download every article image into ../site/static/, named by download
    # timestamp; `save` is the comma-terminated list of saved filenames.
    saved_names = []
    # Renamed loop variable: the original `for i in image` shadowed the
    # enclosing loop variable `i`.
    for wrap in soup.find_all('div', class_='img-wrap'):
        src = wrap.find('picture').find('source')['srcset']
        filename = str(time()) + '.jpg'
        urllib.request.urlretrieve(src, os.path.join('../site/static/', filename))
        saved_names.append(filename)
    save = ''.join(name + ',' for name in saved_names)
    # Keep only paragraphs with a direct string child, matching the original
    # `x.string is not None` filter.
    story = ''.join(
        '<p> ' + x.get_text().strip() + ' </p>'
        for x in para
        if x.string is not None
    )
    if save == '' or story == '':
        raise Exception('No image')
    print('espn')
    push_to_database(headline, subtitle, story, 1, url, 'sports', save)
except Exception:
    # Was `except BaseException`, which also swallowed KeyboardInterrupt and
    # SystemExit; deliberate best-effort skip of unparsable articles.
    pass
# Scrape one HuffPost article from the already-fetched `soup` / `url`
# (bound by earlier code outside this fragment — TODO confirm) and push it
# to the database; any per-article failure is skipped (best-effort scraping).
try:
    headline = soup.find('h1', class_='headline__title').get_text().strip()
    subtitle = soup.find('h2', class_='headline__subtitle').get_text().strip()
    para = soup.find('div', 'post-contents').find_all('p')
    tags = soup.find('span', class_='entry-eyebrow').get_text().strip().lower()
    # Download every article image into ../site/static/, named by download
    # timestamp; `save` is the comma-terminated list of saved filenames.
    saved_names = []
    for img in soup.find_all('img', class_='image__src'):
        filename = str(time()) + '.jpg'
        urllib.request.urlretrieve(img['src'], os.path.join('../site/static/', filename))
        saved_names.append(filename)
    save = ''.join(name + ',' for name in saved_names)
    story = ''.join(
        '<p> ' + x.get_text().strip() + ' </p>'
        for x in para
        if x.string is not None
    )
    # Map the site's section names onto this project's category names
    # (dict lookup replaces the original if/elif chain; unknown sections
    # pass through unchanged, as before).
    tags = {'news': 'world', 'tech': 'technology'}.get(tags, tags)
    if save == '' or story == '':
        raise Exception('No image')
    print('huffingtonpost')
    push_to_database(headline, subtitle, story, 1, url, tags, save)
except Exception:
    # Was `except BaseException`, which also swallowed KeyboardInterrupt and
    # SystemExit; deliberate best-effort skip of unparsable articles.
    pass
# Scrape every article linked from an <h4> heading on the IIIT page `ret`
# (fetched by earlier code outside this fragment — TODO confirm) and push
# each to the database; per-article failures are skipped (best effort).
links = ret.find_all('h4')
# Renamed loop variable from `i` for clarity and to avoid being shadowed by
# the inner image loop (the original reused `i` for both loops).
for link in links:
    try:
        url = link.find('a')['href']
        newresponse = requests.get(url)
        soup = BeautifulSoup(newresponse.content, 'html.parser')
        headline = soup.find('h1').get_text().strip()
        para = soup.find('div', class_='entry-content').find_all('p')
        subtitle = soup.find('p').get_text().strip()
        # Download every feature image into ../site/static, named by download
        # timestamp; `save` is the comma-terminated list of saved filenames.
        # (Original initialized `save = ""` twice; one init suffices.)
        saved_names = []
        for img in soup.find('div', class_='feature-image').find_all('img'):
            filename = str(time()) + '.jpg'
            urllib.request.urlretrieve(img['src'], os.path.join('../site/static', filename))
            saved_names.append(filename)
        save = ''.join(name + ',' for name in saved_names)
        story = ''.join(
            '<p> ' + x.get_text().strip() + ' </p>'
            for x in para
            if x.string is not None
        )
        if save == '' or story == '':
            raise Exception('No image')
        print('iiit')
        push_to_database(headline, subtitle, story, 1, url, 'iiit', save)
    except Exception:
        # Was `except BaseException`, which also swallowed KeyboardInterrupt
        # and SystemExit; deliberate best-effort skip of unparsable articles.
        pass