def get_headline_details(obj):
    """Build a headline record from a Times of India anchor tag.

    Args:
        obj: a BeautifulSoup ``<a>`` tag; its grandparent is searched for
            the ``time-dt`` span carrying the publication timestamp.

    Returns:
        dict with keys ``content``, ``link``, ``scraped_at``,
        ``published_at`` and ``title``.

    Raises:
        KeyError: if *obj* has no ``href`` attribute.

    FIX: the original wrapped the body in ``try/except KeyError`` and
    dropped into ``pdb.set_trace()`` — an interactive debugger must not
    ship in scraper code, so the exception now propagates to the caller.
    """
    from datetime import datetime

    timestamp_tag = obj.parent.parent.find("span", {"class": "time-dt"})
    if timestamp_tag is None:
        # No timestamp in the markup — fall back to "now" (treated as
        # IST, like parsed values, since it is run through ist_to_utc).
        timestamp = datetime.now()
    else:
        content = timestamp_tag.contents[0].strip()
        timestamp = datetime.strptime(content, "%b %d, %Y %H:%M")
    return {
        "content": "NA",
        # Drop the query string (tracking parameters) from the link.
        "link": obj["href"].split("?")[0],
        "scraped_at": datetime.utcnow().isoformat(),
        "published_at": ist_to_utc(timestamp).isoformat(),
        # Title = the tag's direct string children, stripped, non-empty,
        # joined with newlines.
        "title": "\n".join(
            filter(str_is_set, map(str.strip, filter(is_string, obj.children)))
        ),
    }
def get_chronological_headlines(url):
    """Scrape the chronological headline listing at *url* (Times of India).

    Returns a list of headline dicts (link/content/scraped_at/
    published_at/title) on a 200 response; returns None otherwise.
    Entries whose listing carries no ``rodate`` attribute are skipped.
    """
    response = requests.get(url)
    if response.status_code != 200:
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    # Remove the widget box so its links don't pollute the listing.
    soup.find("div", id="c_articlelist_widgets_1").decompose()

    headlines = []
    title_spans = soup.find("div", {"class": "main-content"}).find_all(
        "span", {"class": "w_tle"}
    )
    for span in title_spans:
        raw_date = span.find_next("span").find("span").get("rodate")
        if raw_date is None:
            continue
        published = ist_to_utc(
            datetime.strptime(raw_date, "%d %b %Y, %H:%M")
        ).isoformat()
        anchor = span.find("a")
        headlines.append({
            "link": "https://timesofindia.indiatimes.com" + anchor.get("href"),
            "content": "NA",
            "scraped_at": datetime.utcnow().isoformat(),
            "published_at": published,
            "title": anchor.get("title"),
        })
    return headlines
def get_content(obj):
    """Fetch the article at ``obj["link"]`` and fill *obj* in place.

    Sleeps 0.7 s before the request as crude rate limiting.  On a 200
    response, sets ``obj["published_at"]`` (UTC ISO string parsed from
    the "First Published" script snippet) and ``obj["content"]`` (the
    article body text with <style> blocks and the tag list removed).

    Always returns the string "NA".
    """
    from time import sleep

    sleep(0.7)  # be polite to the server between article fetches
    response = requests.get(obj["link"])
    if response.status_code != 200:
        return "NA"

    soup = BeautifulSoup(response.text, "html.parser")
    # The publication time is embedded in an inline script inside the
    # pub_date span, after a "First Published:" marker.
    pub_tag = soup.find("span", id="pub_date")
    quoted = pub_tag.find("script").text.split("'")[1]
    str_time = quoted.split("<strong>First Published:</strong> ")[1]
    obj["published_at"] = ist_to_utc(
        datetime.strptime(str_time, "%B %d, %Y, %I:%M %p %Z")
    ).isoformat()

    body = soup.find("div", id="article_body")
    for style_tag in body.find_all("style"):
        style_tag.decompose()
    body.find("div", {"class": "tag"}).decompose()
    obj["content"] = body.text
    return "NA"
def get_headline_details(obj):
    """Build a headline record from a Deccan Chronicle listing tag.

    Args:
        obj: a BeautifulSoup tag containing an ``h3``/``h2`` title and,
            optionally, a ``SunChDt2`` span with the publication time.

    Returns:
        dict with keys ``content``, ``link``, ``scraped_at``,
        ``published_at`` and ``title``.

    Raises:
        KeyError: if *obj* has no ``href`` attribute.

    FIX: the original caught KeyError and called ``pdb.set_trace()``;
    a debugger must not ship in scraper code, so the exception now
    propagates to the caller.
    """
    from datetime import datetime

    timestamp_tag = obj.find("span", {"class": "SunChDt2"})
    if timestamp_tag is None:
        # No timestamp in the markup — fall back to "now" (treated as
        # IST, like parsed values, since it is run through ist_to_utc).
        timestamp = datetime.now()
    else:
        content = timestamp_tag.contents[0].strip()
        timestamp = datetime.strptime(content, "%d %b %Y %I:%M %p")
    return {
        "content": "NA",
        "link": "https://www.deccanchronicle.com" + obj["href"],
        "scraped_at": datetime.utcnow().isoformat(),
        "published_at": ist_to_utc(timestamp).isoformat(),
        "title": obj.find(['h3', 'h2']).contents[0].strip(),
    }
def get_trending_headline_details(obj):
    """Build a headline record from a DD News trending-list tag.

    Args:
        obj: a BeautifulSoup tag containing an ``<a>`` link/title and,
            optionally, a ``<p class="date">`` with the publication time.

    Returns:
        dict with keys ``content``, ``link``, ``scraped_at``,
        ``published_at`` and ``title``.

    Raises:
        KeyError: if the inner ``<a>`` tag has no ``href`` attribute.

    FIX: the original caught KeyError and called ``pdb.set_trace()``;
    a debugger must not ship in scraper code, so the exception now
    propagates to the caller.
    """
    from datetime import datetime

    timestamp_tag = obj.find("p", {"class": "date"})
    if timestamp_tag is None:
        # No timestamp in the markup — fall back to "now" (treated as
        # IST, like parsed values, since it is run through ist_to_utc).
        timestamp = datetime.now()
    else:
        content = timestamp_tag.contents[0].strip()
        timestamp = datetime.strptime(content, "%d-%m-%Y | %I:%M %p")
    return {
        "content": "NA",
        "link": "http://ddnews.gov.in" + obj.find("a")["href"],
        "scraped_at": datetime.utcnow().isoformat(),
        "published_at": ist_to_utc(timestamp).isoformat(),
        "title": obj.find("a").contents[0].strip(),
    }
def get_headline_details(obj):
    """Build a headline record from an anchor whose ``title`` attribute
    embeds the publication time as ``... Published: <date> IST``.

    Args:
        obj: a BeautifulSoup ``<a>`` tag with ``title`` and ``href``
            attributes.

    Returns:
        dict with keys ``content``, ``link``, ``scraped_at``,
        ``published_at`` and ``title``.

    Raises:
        KeyError: if *obj* lacks a ``title`` or ``href`` attribute.

    FIX: the original caught KeyError and called ``pdb.set_trace()``;
    a debugger must not ship in scraper code, so the exception now
    propagates to the caller.
    """
    # Extract the text between "Published: " and " IST" from the
    # title attribute, e.g. "... Published: March 05, 2020 14:30 IST".
    timestamp = datetime.strptime(
        obj["title"].split("Published: ")[1].split(" IST")[0],
        "%B %d, %Y %H:%M",
    )
    return {
        "content": "NA",
        "link": obj["href"],
        "scraped_at": datetime.utcnow().isoformat(),
        "published_at": ist_to_utc(timestamp).isoformat(),
        # Title = the tag's direct string children, stripped, non-empty,
        # joined with newlines.
        "title": "\n".join(
            filter(str_is_set, map(str.strip, filter(is_string, obj.children)))
        ),
    }
def get_content(url, obj):
    """Fetch the story page at *url*, update ``obj["published_at"]`` in
    place, and return the story text.

    On a 200 response returns the concatenated story fragments (each
    followed by a single space); otherwise returns the string "NA".
    """
    response = requests.get(url)
    if response.status_code != 200:
        return "NA"

    html_content = BeautifulSoup(response.text, "html.parser")

    # The first <li> of the "rowUl" list carries the publication time;
    # fall back to "now" when the list has no items.
    timestamp_tag = html_content.find('ul', {'class': 'rowUl'}).find('li')
    if timestamp_tag is None:
        timestamp = datetime.now()
    else:
        raw = timestamp_tag.contents[2].strip()
        timestamp = datetime.strptime(raw, "%d.%m.%y, %I:%M %p")
    obj["published_at"] = ist_to_utc(timestamp).isoformat()

    story = html_content.find('div', {'class': 'padiingDetails story-advertise'})
    # Join every stripped text fragment; each fragment keeps a trailing
    # space, matching the original accumulation exactly.
    return "".join(fragment + ' ' for fragment in story.stripped_strings)
def get_headline_details(obj):
    """Build a headline record from an anchor whose grandparent holds a
    ``nstory_dateline`` div with a date (no time of day).

    Args:
        obj: a BeautifulSoup ``<a>`` tag.

    Returns:
        dict with keys ``content``, ``link``, ``scraped_at``,
        ``published_at`` and ``title``.

    Raises:
        KeyError: if *obj* has no ``href`` attribute.

    FIX: the original caught KeyError and called ``pdb.set_trace()``;
    a debugger must not ship in scraper code, so the exception now
    propagates to the caller.
    """
    from datetime import datetime

    timestamp_tag = obj.parent.parent.find("div", {"class": "nstory_dateline"})
    if timestamp_tag is None:
        # No dateline in the markup — fall back to "now" (treated as
        # IST, like parsed values, since it is run through ist_to_utc).
        timestamp = datetime.now()
    else:
        # Dateline looks like "<city> | <Weekday> <Month> <day>, <year>"
        # possibly with a trailing location after the year.
        content = timestamp_tag.contents[-1].strip()
        date = content.split("| ")[-1].split(", ")
        if date[-1].isdigit():
            # Last comma-separated part is the year — keep everything.
            date = " ".join(date)
        else:
            # Trailing non-numeric parts (e.g. a city) follow the year;
            # find the year from the right and drop what comes after it.
            # NOTE(review): reconstructed from collapsed source — the
            # placement of `i -= 1` relative to the loop is the only
            # reading in which the decrement has an effect; verify
            # against the original formatting.
            for i in range(1, 10):
                if date[-i].isdigit():
                    break
            i -= 1
            date = " ".join(date[:-i])
        # The dateline has no time; 05:30 IST maps to midnight UTC
        # after ist_to_utc — presumably intentional, TODO confirm.
        timestamp = datetime.strptime(date + " 05:30", "%A %B %d %Y %H:%M")
    return {
        "content": "NA",
        # Drop the query string (tracking parameters) from the link.
        "link": obj["href"].split("?")[0],
        "scraped_at": datetime.utcnow().isoformat(),
        "published_at": ist_to_utc(timestamp).isoformat(),
        # Title = the tag's direct string children, stripped, non-empty,
        # joined with newlines.
        "title": "\n".join(
            filter(str_is_set, map(str.strip, filter(is_string, obj.children)))
        ),
    }