async def read_article(self, url, parse_headline=True, parse_date=True):
    """Fetch and parse a Benzinga article.

    Returns ("benzinga", headline, date, text, full_url), or None when the
    page cannot be parsed into a usable article.
    """
    article_html = await self._get(url)
    headline = ""
    date = ""
    text = []
    if parse_headline:
        headline_match = re.search(r">([^<]+)<\/h1>", article_html)
        if not headline_match:
            return None
        headline = clean_html_text(headline_match.group(1))
    if parse_date:
        date_match = re.search(
            r'date">\s+(\w+ \d+, \d+ \d+:\d+\w\w\s+)<\/span>', article_html)
        # Fix: a page without the date span previously raised AttributeError
        # on date_match.group(1); treat it as unparsable like the headline.
        if not date_match:
            return None
        date = text_to_datetime(date_match.group(1))
    for p_match in re.finditer(r"<p>([\s\S]+?)<\/p>", article_html):
        # Strip the login nag that Benzinga embeds inside paragraphs.
        paragraph = clean_html_text(p_match.group(1)).replace(
            "Already have an account? Login", "")
        # Skip near-empty paragraphs and boilerplate.
        if paragraph.count(" ") <= 1 or string_contains(
                paragraph, IGNORE_TEXT):
            continue
        text.append(paragraph)
    if len(text) == 0:
        return None
    return ("benzinga", headline, date, "\n\n\n".join(text), self.url + url)
async def read_article(self, url):
    """Fetch a TheStreet article and return (source, headline, date, text, url).

    Returns None for video pages or pages that fail to parse.
    """
    if "video/" in url:
        return None
    html = await self._get(url)
    h1 = re.search(r">([^<]+)<\/h1>", html)
    if h1 is None:
        return None
    headline = clean_html_text(h1.group(1))
    published = re.search(r'Publish Date" datetime="([\d\-:T+]+)">', html)
    if published is None:
        return None
    date = text_to_datetime(published.group(1))
    paragraphs = []
    for m in re.finditer(r"<p>([\s\S]+?)<\/p>", html):
        cleaned = clean_html_text(m.group(1))
        # Keep only substantial, non-boilerplate paragraphs.
        if cleaned.count(" ") > 2 and not string_contains(cleaned, IGNORE_TEXT):
            paragraphs.append(cleaned)
    if not paragraphs:
        return None
    return ("thestreet", headline, date, "\n\n\n".join(paragraphs),
            self.url + url)
async def read_article(self, url, parse_headline=True, parse_date=True):
    """Fetch and parse a Reuters article.

    Returns ("reuters", headline, date, text, full_url), or None when the
    page cannot be parsed.
    """
    article_html = await self._get(url)
    text = []
    headline = ""
    date = ""
    if parse_headline or parse_date:
        date_match = re.search(r"(\w+ \d+, \d{4}) \/\s+(\d+:\d+ \w+) ",
                               article_html)
        headline_match = re.search(r'ArticleHeader_headline">([^<]+)<\/h1>',
                                   article_html)
        if not date_match or not headline_match:
            return None
        headline = clean_html_text(headline_match.group(1))
        date = text_to_datetime(
            date_match.group(1) + " " + date_match.group(2))
    # Fix: a page missing the body marker previously raised an unhandled
    # ValueError from str.index (only the end marker was guarded); treat
    # such a page as unparsable instead.
    try:
        start_idx = article_html.index("StandardArticleBody_body")
    except ValueError:
        return None
    try:
        end_idx = article_html.index("Attribution_container")
    except ValueError:
        end_idx = len(article_html)
    content_html = article_html[start_idx:end_idx]
    for paragraph_match in re.finditer(r"<p>([^<]+)<\/p>", content_html):
        paragraph = clean_html_text(paragraph_match.group(1))
        if paragraph.count(" ") > 1:
            text.append(paragraph)
    if len(text) == 0:
        return None
    return ("reuters", headline, date, "\n\n\n".join(text), self.url + url)
async def read_article(self, url, parse_headline=True, parse_date=True):
    """Fetch and parse a Yahoo Finance article.

    Returns ("yahoo", headline, date, text, full_url), or None on failure.
    """
    html = await self._get(url)
    headline = ""
    date = ""
    if parse_headline:
        h1 = re.search(r">([^<]+)<\/h1>", html)
        if h1 is None:
            return None
        headline = clean_html_text(h1.group(1))
    if parse_date:
        stamp = re.search(
            r'datetime="([\d\-T:Z\.]+)" itemprop="datePublished"', html)
        if stamp is None:
            return None
        date = text_to_datetime(stamp.group(1))
    body = []
    for m in re.finditer(r"<(span|p) [^>]+>([\s\S]+?)<\/(span|p)>", html):
        chunk = clean_html_text(m.group(2))
        # Drop short/boilerplate chunks and stray ")"-prefixed fragments.
        if (chunk.count(" ") <= 2 or string_contains(chunk, IGNORE_TEXT)
                or chunk[0] == ")"):
            continue
        # Everything after this marker is non-article UI content.
        if "list is empty" in chunk:
            break
        body.append(chunk)
    if not body:
        return None
    return ("yahoo", headline, date, "\n\n\n".join(body), self.url + url)
async def read_latest_headlines(self):
    """Collect Yahoo Finance headlines.

    Combines the stock-market-news index page with search-API results for a
    few fixed key terms, de-duplicating by URL.
    Returns ("yahoo", [(url, headline), ...]).
    """
    from urllib.parse import quote

    headlines = []
    urls = set()
    index_html = await self._get("/topic/stock-market-news")
    for match in re.finditer(r'"url":"([^"]+?)",[",\w:]+?"title":"([^"]+?)"',
                             index_html):
        url = match.group(1).replace(r"\u002F", "/")
        if url in urls:
            continue
        urls.add(url)
        headlines.append((url, clean_html_text(match.group(2))))
    for kterm in ["Edited Transcript of", "StreetEvents", "Bloomberg",
                  "Zacks"]:
        # Fix: .format() previously bound only to the LAST concatenated
        # string literal (which contains no "{}"), so the search term was
        # never substituted; the query also contained mangled HTML entities
        # ("&quot;"/"&reg;" residue). The term is URL-encoded since it may
        # contain spaces.
        query_path = (
            "/v1/finance/search?q={}&lang=en-US&region=US&quotesCount=6"
            "&newsCount=4&enableFuzzyQuery=false"
            "&quotesQueryId=tss_match_phrase_query"
            "&multiQuoteQueryId=multi_quote_single_token_query"
            "&newsQueryId=news_cie_vespa&enableCb=true&enableNavLinks=true"
            "&enableEnhancedTrivialQuery=true").format(quote(kterm))
        term_resp = await self._get(
            query_path, site="https://query1.finance.yahoo.com")
        try:
            term_json = json.loads(term_resp)
            for item in term_json["news"]:
                url = item["link"]
                if url in urls:
                    continue
                urls.add(url)
                headlines.append((url, clean_html_text(item["title"])))
        except (ValueError, KeyError):
            # Best-effort: skip a term whose response is not usable JSON.
            pass
    return "yahoo", headlines
async def read_article(self, url, parse_headline=True, parse_date=True):
    """Fetch and parse a Barron's article.

    Returns ("barrons", headline, date, text, full_url), or None on failure.
    """
    html = await self._get(url)
    headline = ""
    date = ""
    if parse_headline:
        h1 = re.search(r'itemprop="headline">([^<]+)<\/h1>', html)
        if h1 is None:
            return None
        headline = clean_html_text(h1.group(1))
    if parse_date:
        stamp = re.search(r"(\w+ \d+, \w+ \d+:\d+ \w+ \w+)\s+<\/time>", html)
        if stamp is None:
            return None
        date = text_to_datetime(stamp.group(1))
    paragraphs = []
    for m in re.finditer(r"<p>([\s\S]+?)<\/p>", html):
        cleaned = clean_html_text(m.group(1))
        # Keep non-empty, non-boilerplate paragraphs.
        if cleaned and not string_contains(cleaned, IGNORE_TEXT):
            paragraphs.append(cleaned)
    if not paragraphs:
        return None
    return ("barrons", headline, date, "\n\n\n".join(paragraphs),
            self.url + url)
async def read_latest_headlines(self):
    """Scrape TheStreet front page.

    Returns ("thestreet", [(url, headline, teaser), ...]).
    """
    front_page = await self._get("/")
    results = []
    pattern = (r'Title" href="([^"]+?)"[ <>h2clas="m\-eiptxrdongv]+?>'
               r'([^<]+?)<[ /<>h2clas="m\-eiptxrdongvby]+>([^<]+?)<')
    for hit in re.finditer(pattern, front_page):
        results.append((
            self.url + hit.group(1),
            clean_html_text(hit.group(2)),
            clean_html_text(hit.group(3)),
        ))
    return "thestreet", results
async def read_prs_with_regex(self,
                              regex,
                              url_path,
                              type_to_group=None,
                              full_url_path=False,
                              article_url_base=None,
                              **kwargs):
    """Scrape press releases from a page using a caller-supplied regex.

    regex: pattern whose groups hold the fields named by type_to_group.
    url_path: path appended to self._url (or used verbatim as the full URL
        when full_url_path is True).
    type_to_group: maps "date"/"url"/"title" to regex group indices;
        defaults to {"date": 1, "url": 2, "title": 3}. A date index of -1
        means the page carries no date and the current time is used.
    article_url_base: optional base prepended to matched URLs instead of
        self._url.
    Returns (self.SYMBOL, self.NAME, [(name, title, date, "", url), ...]).
    """
    # Fix: the default was a shared mutable dict (classic Python pitfall);
    # use None as the sentinel and build a fresh dict per call.
    if type_to_group is None:
        type_to_group = {"date": 1, "url": 2, "title": 3}
    req_url = self._url + url_path
    if full_url_path:
        req_url = url_path
    resp = await self._get(req_url, **kwargs)
    releases = []
    for match in re.finditer(regex, resp):
        if type_to_group["date"] != -1:
            date = text_to_datetime(
                match.group(type_to_group["date"]).strip())
        else:
            date = pendulum.now()
        if article_url_base is None:
            url = self._url + match.group(type_to_group["url"]).strip()
        else:
            url = article_url_base + match.group(
                type_to_group["url"]).strip()
        # Crude percent-encoding of spaces so the URL is fetchable.
        url = url.replace(" ", "%20")
        title = clean_html_text(match.group(type_to_group["title"]))
        if len(title) == 0:
            continue
        releases.append((self.NAME.lower(), title, date, "", url))
    return self.SYMBOL, self.NAME, releases
async def read_prs(self):
    """Fetch press releases from the Selecta Biosciences JSON endpoint.

    Returns (self.SYMBOL, self.NAME, [(name, title, date, "", url), ...]).
    """
    resp = await self._get(
        self._url + "/phpSide/index.php",
        method="POST",
        form_params={"url": "News"},
        headers={
            "referer":
            "https://www.selectabio.com/investors&media/news&events/",
            "sec-fetch-dest": "empty",
            "sec-fetch-mode": "cors",
            "sec-fetch-site": "same-site",
            "user-agent":
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36" +
            " (KHTML, like Gecko) Chrome/81.0.4044.113 Safari/537.36",
        },
    )
    releases = []
    # Fix: a bare "except:" also swallowed SystemExit/KeyboardInterrupt;
    # catch only the errors a bad or differently-shaped payload can raise.
    try:
        items = json.loads(resp)["data"]
    except (ValueError, KeyError, TypeError):
        items = []
    for item in items:
        date = text_to_datetime(item["releaseDate"]["dateUTC"])
        url = item["link"]["hostedUrl"]
        title = clean_html_text(item["title"])
        releases.append((self.NAME.lower(), title, date, "", url))
    return self.SYMBOL, self.NAME, releases
async def read_latest_headlines(self):
    """Collect (url, headline) pairs from the Benzinga front page and the
    press-release feed.

    Returns ("benzinga", [(url, headline), ...]).
    """
    results = []
    home_html = await self._get("/")
    article_re = r'href="(\/[\w\-]+\/[\w\-]+\/\d+\/\d+\/\d+\/[^"]+?)">([^<]+?)<'
    results.extend(
        (self.url + m.group(1), clean_html_text(m.group(2)))
        for m in re.finditer(article_re, home_html))
    prs_html = await self._get("/pressreleases/")
    pr_re = r'href="(\/pressreleases\/\d+[^"]+?)">([^<]+?)<'
    results.extend(
        (self.url + m.group(1), clean_html_text(m.group(2)))
        for m in re.finditer(pr_re, prs_html))
    return "benzinga", results
async def resolve_url_to_content(self, url):
    """Extract plain article text for a URL on this site.

    Returns None for foreign URLs or articles shorter than 100 characters.
    """
    if not url.startswith(self.url):
        return None
    article = url_to_n3karticle(url)
    body = clean_html_text(article.text)
    return body if len(body) >= 100 else None
async def read_latest_headlines(self):
    """Scrape the Barron's real-time feed.

    Returns ("barrons", [(url, headline), ...]).
    """
    feed_html = await self._get("/real-time?mod=hp_LATEST&mod=hp_LATEST")
    pattern = r'headline-link--[\w\d ]+" href="([^"]+?)">([^<]+?)<'
    found = [(m.group(1), clean_html_text(m.group(2)))
             for m in re.finditer(pattern, feed_html)]
    return "barrons", found
async def read_latest_headlines(self):
    """Merge FT company-news and US-news headlines, de-duplicating by URL.

    Returns ("financialtimes", [(url, headline), ...]).
    """
    companies_page = await self._get("/companies")
    us_page = await self._get("/world/us")
    results = []
    seen = set()
    for m in re.finditer(HEADLINE_REGEX, companies_page):
        link = self.url + m.group(1)
        seen.add(link)
        results.append((link, clean_html_text(m.group(2))))
    for m in re.finditer(HEADLINE_REGEX, us_page):
        link = self.url + m.group(1)
        if link not in seen:
            results.append((link, clean_html_text(m.group(2))))
    return "financialtimes", results
async def read_latest_headlines(self):
    """Scrape the news feed page.

    Note: unlike sibling scrapers, this returns only the list of
    (url, headline) pairs, without a site-name tag.
    """
    feed = await self._get("/news.ashx")
    return [(m.group(1), clean_html_text(m.group(2)))
            for m in re.finditer(HEADLINES_REGEX, feed)]
async def read_latest_headlines(self):
    """Scrape the AlphaStockNews front page.

    Returns ("alphastocknews", [(url, headline), ...]).
    """
    front = await self._get("/")
    pairs = [(m.group(1), clean_html_text(m.group(2)))
             for m in re.finditer(
                 r'title"><a\s*href="([^"]+?)"[^>]+?>([^<]+?)<', front)]
    return "alphastocknews", pairs
async def read_article(self, url, parse_headline=True, parse_date=True,
                       parse_text=True):
    """Fetch and parse a Seeking Alpha article.

    Returns ("seekingalpha", headline, date, text, full_url), or None when
    the page cannot be parsed or the headline matches IGNORE_HEADLINE.
    """
    article_html = await self._get(url)
    text = []
    headline = ""
    date = None
    if parse_headline:
        headline_match = re.search(r'itemprop="headline">([^<]+)<',
                                   article_html)
        if not headline_match:
            return None
        headline = clean_html_text(headline_match.group(1))
        if string_contains(headline, IGNORE_HEADLINE):
            return None
    if parse_date:
        date_match = re.search(r'content="([\d\-T:Z]+)" itemprop="datePub',
                               article_html)
        # Fix: a page missing the datePublished tag previously raised an
        # AttributeError on date_match.group(1); treat it as unparsable.
        if not date_match:
            return None
        date = text_to_datetime(date_match.group(1))
    if parse_text:
        # Summary bullets first, then body paragraphs.
        for bullet_match in re.finditer(
                r'<p class="bullets_li">([^<]+?)<\/p>', article_html):
            bullet_text = clean_html_text(bullet_match.group(1))
            if len(bullet_text) == 0 or string_contains(
                    bullet_text, IGNORE_TEXT):
                continue
            text.append(bullet_text)
        for p_match in re.finditer(r'<p class="p p1">([^<]+?)<\/p>',
                                   article_html):
            p_text = clean_html_text(p_match.group(1))
            if len(p_text) == 0 or string_contains(p_text, IGNORE_TEXT):
                continue
            text.append(p_text)
    if len(text) < 2:
        return None
    return ("seekingalpha", headline, date, "\n\n\n".join(text),
            self.url + url)
async def resolve_url_to_content(self, url):
    """Extract article text for a same-site URL, dropping photo-credit lines.

    Returns None for URLs not on this site.
    """
    if not url.startswith(self.url):
        return None
    article = url_to_n3karticle(url)
    lines = clean_html_text(article.text).split("\n")
    kept = [ln for ln in lines if not ln.startswith("Photo:")]
    return "\n".join(kept)
async def read_latest_headlines(self):
    """Scrape the StreetInsider front page.

    Returns ("streetinsider", [(url, headline), ...]).
    """
    index_html = await self._get("/")
    headlines = []
    # Fix: the "." before "html" was an unescaped regex wildcard that could
    # match any character; escape it so only real ".html" links match.
    for match in re.finditer(r'a href="([A-Z][^"]+?\.html)">([^<]+?)<',
                             index_html):
        url = self.url + "/" + match.group(1)
        headline = clean_html_text(match.group(2))
        headlines.append((url, headline))
    return "streetinsider", headlines
async def read_latest_headlines(self):
    """Scrape the PharmiWeb press-release index.

    Returns ("pharmiweb", [(url, headline), ...]).
    """
    listing = await self._get("/press-releases")
    found = [(self.url + m.group(1), clean_html_text(m.group(2)))
             for m in re.finditer(
                 r'<a href="(\/press-release\/[^"]+?)">([^<]+?)<', listing)]
    return "pharmiweb", found
async def read_latest_headlines(self):
    """Scrape the BLS "TED" yearly index page.

    Returns ("usbls", [(url, headline), ...]).
    """
    page = await self._get("/opub/ted/year.htm")
    entries = [(self.url + m.group(1), clean_html_text(m.group(2)))
               for m in re.finditer(r'<a href="([^"]+?)" title="([^"]+?)"',
                                    page)]
    return "usbls", entries
async def read_latest_headlines(self):
    """Scrape the MiningNewsFeed front page.

    Returns ("miningnewsfeed", [(url, headline), ...]).
    """
    page = await self._get("/")
    pattern = r'hl_News_\d+" href="([^"]+?)" target="_blank">([^<]+?)<'
    items = [(m.group(1), clean_html_text(m.group(2)))
             for m in re.finditer(pattern, page)]
    return "miningnewsfeed", items
async def read_latest_headlines(self):
    """Scrape the IBTimes front page.

    Returns ("ibtimes", [(url, headline), ...]).
    """
    page = await self._get("/")
    pattern = (r'<div class="title"><a href="(\/[\w\-]+?)">([^<]+?)'
               r'<\/a><\/div>\s*<\/article>')
    items = [(self.url + m.group(1), clean_html_text(m.group(2)))
             for m in re.finditer(pattern, page)]
    return "ibtimes", items
async def read_latest_headlines(self):
    """Scrape the Seeking Alpha market-news page.

    Returns ("seekingalpha", [(url, headline), ...]).
    """
    page = await self._get("/market-news")
    pattern = (r'href="([^"]+?)" class="[\w-]+" '
               r'sasource="market_news\w+">([^<]+?)<')
    items = [(self.url + m.group(1), clean_html_text(m.group(2)))
             for m in re.finditer(pattern, page)]
    return "seekingalpha", items
async def read_latest_headlines(self):
    """Scrape CNN /business/: JSON-embedded headlines plus card headlines,
    de-duplicating by URL.

    Returns ("cnn", [(url, headline), ...]).
    """
    page = await self._get("/business/")
    seen = set()
    results = []
    for m in re.finditer(r'"uri":"([^"]+?)","headline":"([^"]+?)"', page):
        link = self.url + m.group(1)
        seen.add(link)
        results.append((link, clean_html_text(m.group(2))))
    card_re = (r'<a href="(\/\d+\/\d+\/[^"]+?)"\s*>'
               r'<span class="cd__headline-text">([^<]+?)<\/span>')
    for m in re.finditer(card_re, page):
        link = self.url + m.group(1)
        if link in seen:
            continue
        results.append((link, clean_html_text(m.group(2))))
    return "cnn", results
async def read_latest_headlines(self):
    """Scrape the CNBC front-page "Latest News" module.

    Returns ("cnbc", [(url, headline), ...]).
    """
    page = await self._get("/")
    pattern = (r'LatestNews-headline"><a href="(http[^"]+?)" '
               r'title="">([^<]+?)<')
    items = [(m.group(1), clean_html_text(m.group(2)))
             for m in re.finditer(pattern, page)]
    return "cnbc", items
async def read_latest_headlines(self):
    """Scrape the PR Newswire release listing.

    Returns ("prnewswire", [(url, headline), ...]).
    """
    page = await self._get("/news-releases/news-releases-list/")
    pattern = r'news-release" href="([^"]+?)" title="[^"]*?">([^<]+?)<'
    items = [(self.url + m.group(1), clean_html_text(m.group(2)))
             for m in re.finditer(pattern, page)]
    return "prnewswire", items
async def read_latest_headlines(self):
    """Scrape the STAT latest-news page, skipping pagination links.

    Returns ("stat", [(url, headline), ...]).
    """
    page = await self._get("/latest/")
    items = []
    for m in re.finditer(HEADLINE_REGEX, page):
        link = m.group(1)
        if "/page/" not in link:
            items.append((link, clean_html_text(m.group(2))))
    return "stat", items
async def read_latest_headlines(self):
    """Scrape the Accesswire newsroom API page.

    Returns ("accesswire", [(url, headline), ...]).
    """
    page = await self._get("/api/newsroom.ashx")
    pattern = (r'headlinelink"><a href="([^"]+?)" '
               r'class="headlinelink">([^"]+?)<')
    items = [(m.group(1), clean_html_text(m.group(2)))
             for m in re.finditer(pattern, page)]
    return "accesswire", items
async def resolve_url_to_content(self, url):
    """Extract article text for a same-site URL, dropping lines that match
    IGNORE_ITEMS.

    Returns None for URLs not on this site.
    """
    if not url.startswith(self.url):
        return None
    article = url_to_n3karticle(url)
    kept = [ln for ln in clean_html_text(article.text).split("\n")
            if not string_contains(ln, IGNORE_ITEMS)]
    return "\n".join(kept)
async def read_latest_headlines(self):
    """Scrape the MarketWatch latest-news feed.

    Returns ("marketwatch", [(url, headline), ...]); the newsviewer click
    tracker is stripped from each URL.
    """
    index_html = await self._get("/latest-news")
    headlines = []
    # Fix: the dots in "www.marketwatch.com" were unescaped regex wildcards;
    # escape them so only genuine marketwatch.com links match.
    for match in re.finditer(
            r'headline"><a[ "=\w]+?href="(https:\/\/www\.marketwatch\.com[^"]+?)">([^<]+?)<',
            index_html):
        url = match.group(1).replace("?mod=newsviewer_click", "")
        headline = clean_html_text(match.group(2))
        headlines.append((url, headline))
    return "marketwatch", headlines