import json
import os
import re
import time
import urllib.parse
from urllib.parse import unquote

import html2text
from bs4 import BeautifulSoup

from google.modules.utils import get_html

# Helpers referenced below but defined elsewhere in the project:
# filterer, link_check, get_root_path, get_pdf, parse, GoogleResult,
# ShoppingResult, _get_search_url, _get_shopping_url, _get_currency_req_url,
# _parse_currency_response, _get_name, _get_link, _get_google_link,
# _get_description, _get_thumb, _get_cached, and the globals date,
# html_directory, pdf_directory.


def markdown(site):
    """Fetch the closest Wayback Machine snapshot of `site` and convert it to Markdown."""
    handler = html2text.HTML2Text()
    handler.ignore_links = True
    url = f"https://archive.org/wayback/available?url={site}"
    jhtml = json.loads(get_html(url).decode('utf-8'))
    html = get_html(jhtml["archived_snapshots"]["closest"]["url"])
    html = html.decode('utf-8')
    md = handler.handle(html)
    return md

def markdown(site):
    """Revised version: tolerant decoding plus site-specific Markdown filters."""
    handler = html2text.HTML2Text()
    handler.ignore_links = True
    url = f"https://archive.org/wayback/available?url={site}"
    jhtml = json.loads(get_html(url).decode('utf-8'))
    html = get_html(jhtml["archived_snapshots"]["closest"]["url"])
    html = html.decode('utf-8', errors='ignore')
    md = handler.handle(html)
    md = filterer.applyGenericFilter(md)
    if site is not None:
        if 'folha' in site:
            md = filterer.applyFolhaFilter(md)
        if 'estadao' in site:
            md = filterer.applyEstadaoFilter(md)
    return md
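
# A usage sketch for markdown(); hedged: the Wayback availability API
# returns {"archived_snapshots": {}} when no snapshot exists, so indexing
# ["closest"] raises a KeyError. safe_markdown() is a hypothetical wrapper,
# not part of the original code.
def safe_markdown(site):
    meta = json.loads(get_html(f"https://archive.org/wayback/available?url={site}").decode('utf-8'))
    if not meta.get("archived_snapshots", {}).get("closest"):
        return None  # no archived copy to convert
    return markdown(site)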

def update_SQL_through_flask(df, sql_ip):
    """Update the SQL DB that tracks the source URL used for redirecting.

    input:
        - df: a DataFrame with the data to be saved in SQL
        - sql_ip: the IP address of the SQL DB (e.g. "10.71.0.111")
    output: none
    """
    for i, row in df.iterrows():
        solr_id = row['solr_id'].replace(' ', '%20')
        web_url = row['web_url'].replace("'", '').replace(';', '').replace('"', '')
        # ':' and '/' would break the Flask route, so they are escaped
        # with '**' and '!!' and restored server-side.
        redirect_url = web_url.replace(':', '**').replace('/', '!!')
        flask_url = 'http://' + sql_ip + '/link/' + solr_id + '/' + redirect_url  # old server
        try:
            get_html(flask_url)
        except Exception:
            print('was not able to access the url', flask_url, 'to save as an html file')
    print('successful update of SQL DB:', sql_ip)
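
# A minimal sketch of the escaping used above and its inverse. The decode
# side is an assumption about what the Flask '/link/' endpoint does to
# restore the original URL; it is not taken from the server code.
def encode_redirect(web_url):
    return web_url.replace(':', '**').replace('/', '!!')

def decode_redirect(encoded):
    return encoded.replace('!!', '/').replace('**', ':')

assert decode_redirect(encode_redirect('http://example.com/a/b')) == 'http://example.com/a/b'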

def get_PR_summary(url, proxies, path, html_name, version):
    """Download a press-release page, save its HTML to disk, and return its text."""
    html_PR = None  # initialised so a failed fetch below does not raise NameError
    try:
        # html_PR = get_html(url, proxy)  # proxies are currently unused
        html_PR = get_html(url)
    except Exception:
        print('was not able to access the url', url, 'to save as an html file')
    if not html_PR:
        return None
    if b'PDF' in html_PR[:6]:  # '%PDF' magic bytes: don't save a PDF as html
        return None
    # Save the html of the page to a text file (binary mode keeps raw bytes).
    with open(path + html_name, "wb") as file:
        file.write(html_PR)
    soup = BeautifulSoup(html_PR, "html.parser")
    summary = ""  # initialize a string
    summary_cache = []  # guards against duplicate text from different nodes
    if version == 'RSS':
        for child in soup.body.descendants:
            if child.string and child.string != '\n' and child.string not in summary_cache:
                summary = summary + child.string + "\n"
                summary_cache.append(child.string)
        for p in soup.find_all('p'):
            text = p.text.strip()
            if text not in summary_cache:
                summary = summary + text + " "
                summary_cache.append(text)
        return summary
    for p in soup.find_all('p'):
        summary = summary + p.text.strip() + " "
    return summary
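
# A self-contained check of the RSS-mode deduplication above: repeated
# paragraph text is emitted only once. The sample HTML is made up for the
# example.
_sample = "<html><body><p>Hello</p><p>Hello</p><p>World</p></body></html>"
_seen, _parts = [], []
for _p in BeautifulSoup(_sample, "html.parser").find_all('p'):
    _t = _p.text.strip()
    if _t not in _seen:
        _parts.append(_t)
        _seen.append(_t)
print(" ".join(_parts))  # -> "Hello World"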

def shopping(query, pages=1):
    """Original version: scrapes Google Shopping products from "div.g" blocks."""
    results = []
    for i in range(pages):
        url = _get_shopping_url(query, i)
        html = get_html(url)
        if html:
            j = 0
            soup = BeautifulSoup(html, "html.parser")
            products = soup.find_all("div", "g")
            for prod in products:
                res = ShoppingResult()
                divs = prod.find_all("div")
                for div in divs:
                    match = re.search("from (?P<count>[0-9]+) stores", div.text.strip())
                    if match:
                        res.store_count = match.group("count")
                        break
                h3 = prod.find("h3", "r")
                if h3:
                    a = h3.find("a")
                    if a:
                        res.compare_url = a["href"]
                    res.name = h3.text.strip()
                psliimg = prod.find("div", "psliimg")
                if psliimg:
                    img = psliimg.find("img")
                    if img:
                        res.thumb = img["src"]
                f = prod.find("div", "f")
                if f:
                    res.subtext = f.text.strip()
                price = prod.find("div", "psliprice")
                if price:
                    res.min_price = price.text.strip()
                results.append(res)
                j = j + 1
    return results

def shopping(query, pages=1):
    """Revised version: products are now "li" elements with class "g"."""
    results = []
    for i in range(pages):
        url = _get_shopping_url(query, i)
        html = get_html(url)
        if html:
            j = 0
            soup = BeautifulSoup(html, "html.parser")
            products = soup.find_all("li", "g")
            for prod in products:
                res = ShoppingResult()
                divs = prod.find_all("div")
                for div in divs:
                    match = re.search("from (?P<count>[0-9]+) stores", div.text.strip())
                    if match:
                        res.store_count = match.group("count")
                        break
                h3 = prod.find("h3", "r")
                if h3:
                    a = h3.find("a")
                    if a:
                        res.compare_url = a["href"]
                    res.name = h3.text.strip()
                psliimg = prod.find("div", "psliimg")
                if psliimg:
                    img = psliimg.find("img")
                    if img:
                        res.thumb = img["src"]
                f = prod.find("div", "f")
                if f:
                    res.subtext = f.text.strip()
                price = prod.find("div", "psliprice")
                if price:
                    res.min_price = price.text.strip()
                results.append(res)
                j = j + 1
    return results
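
# Hypothetical usage of shopping(); Google's result markup changes often
# (note the "div.g" vs "li.g" difference between the two versions above),
# so an empty list may just mean the selectors are stale.
for item in shopping("usb cable", pages=1):
    print(item.name, item.min_price, item.store_count)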

def convert(amount, from_currency, to_currency):
    """Method to convert currency.

    Args:
        amount: numeric amount to convert
        from_currency: currency denomination of the amount to convert
        to_currency: target currency denomination to convert to
    """
    # same currency, no conversion
    if from_currency == to_currency:
        return amount * 1.0
    req_url = _get_currency_req_url(amount, from_currency, to_currency)
    response = get_html(req_url)
    rate = _parse_currency_response(response, to_currency)
    return rate
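
# Worked example; the rate is fetched live, so the second value printed is
# illustrative only. The same-currency case short-circuits to 100.0 without
# a network call.
print(convert(100, "USD", "USD"))  # 100.0
print(convert(100, "USD", "EUR"))  # whatever the live rate returns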

def search(self, query, area='com', ncr=False, void=True,
           time_period=False, sort_by_date=False, first_page=0):
    for i in range(first_page, first_page + self.pages):
        url = _get_search_url(query, i, lang=self.lang, area=area, ncr=ncr,
                              time_period=time_period, sort_by_date=sort_by_date)
        print(f'Search URL: {url}&tbm=nws')
        html = parse(get_html(url + "&tbm=nws"))
        links = html.xpath('//div[@id="rso"]/descendant::a/@href')
        for link in links:
            if link[0] != '/':  # a leading '/' means the URL stays on Google
                self.results.append(link)
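
# The trailing check above keeps only links that lead out of Google: on a
# result page, hrefs starting with '/' point back into Google itself. A
# standalone illustration with made-up hrefs:
_links = ["/search?q=next+page", "https://example.com/story", "/preferences"]
print([l for l in _links if l and l[0] != '/'])  # ['https://example.com/story']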

def search(query, pages=1, lang='en', void=True):
    """Returns a list of GoogleResult.

    Args:
        query: String to search in google.
        pages: Number of pages where results must be taken.

    Returns:
        A list of GoogleResult objects."""
    results = []
    for i in range(pages):
        url = _get_search_url(query, i, lang=lang)
        html = get_html(url)
        if html:
            soup = BeautifulSoup(html, "html.parser")
            lis = soup.find_all("div", attrs={"class": "g"})
            j = 0
            for li in lis:
                res = GoogleResult()
                res.page = i
                res.index = j
                res.name = _get_name(li)
                res.link = _get_link(li)
                res.google_link = _get_google_link(li)
                res.description = _get_description(li)
                res.thumb = _get_thumb()
                res.cached = _get_cached(li)
                if void is True and res.description is None:
                    continue
                results.append(res)
                j += 1
    return results
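
# Typical call (a sketch): one page of results; with void=True, entries
# without a description are skipped, so indexes may be sparse.
for r in search("python web scraping", pages=1):
    print(r.page, r.index, r.name, r.link)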

from google.modules.utils import get_html
import html2text

handler = html2text.HTML2Text()
handler.ignore_links = True
url = "https://link.estadao.com.br/noticias/inovacao,as-pessoas-estao-repensando-suas-casas-diz-presidente-do-quinto-andar,70003324028"
html = get_html(url)
html = html.decode('utf-8')
md = handler.handle(html)
print(md)
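
# The strict decode above raises UnicodeDecodeError when a page is not
# valid UTF-8. A tolerant variant of the same steps (a sketch, mirroring
# the errors='ignore' used in the revised markdown() above):
raw = get_html(url)
if raw:
    print(handler.handle(raw.decode('utf-8', errors='ignore'))[:200])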

def get_PR(A, row, f, k, col, version):
    # A is a table containing the new PR links.
    # row is a row of A, f is a text file, k is the iteration loop over A.
    # col is the maximum number of PR links.
    summary_cache = []  # to check for summary duplicates coming from different urls
    if version == 'PR':
        start = 2
    else:
        start = 1
    if k > 0:  # skip the first row because it is the header
        for x in range(start, col):  # col is 100
            flag = False
            if not row[x]:
                continue
            # Normalise relative links against the company root (row[1]).
            if row[x].startswith('//'):
                url = 'http://' + row[x].lstrip('/')
            elif row[x].startswith('./'):
                root_path = get_root_path(row[1], 1)
                url = root_path + row[x].lstrip('.')
            elif row[x].startswith('/'):
                root_path = get_root_path(row[1], 0)  # fixed @V2.0 - YMR
                url = root_path + row[x]  # fixed @V2.0 - YMR
            else:
                url = row[x]
            try:
                # apps2.shareholder is third-party noise that serves stock data;
                # .php links redirect to share widgets (FB, LinkedIn, Twitter).
                if 'apps2.shareholder' not in url and ".php" not in url:
                    url2 = urllib.parse.quote_plus(url, '/:!#$%^&*()_-+=[]{}?', 'utf-8')
                    url2_path = urllib.parse.urlparse(url).path
                    ext = os.path.splitext(url2_path)[1]  # sometimes .jpg, .png, ...
                    name = row[0] + "_PR" + str(x - 1) + "_" + date
                    response = get_html(url2)
                    # Download only if the socket connection opened without errors.
                    if response:
                        print("this is the modified_url of", url2)
                        if b'PDF' not in response[:6]:
                            if not ext or 'html' in ext:
                                ext = '.html'
                            print('this is my extension {0}'.format(ext))
                            with open(html_directory + "/" + name + ext, 'wb') as out_file:
                                print(html_directory + "/" + name + ext)
                                out_file.write(response)
                            flag = True  # fixed @V2.0 - YMR
                        else:
                            print("\n------we will attempt to download a PDF instead\n")
                            pdf_name = row[0] + "_PR" + str(x - 1) + "_" + date  # fixed @V2.0 - YMR
                            pdf_response = get_pdf(url2)
                            if pdf_response:
                                with open(pdf_directory + "/" + pdf_name + '.pdf', 'wb') as out_file:
                                    out_file.write(pdf_response)
                                flag = True
            except Exception as e:
                print('error in get_PR function:', e)
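
# A standalone sketch of the link-normalisation rules above. get_root_path
# is the project's helper, so what it returns is an assumption here; this
# is illustrative, not a drop-in replacement.
def normalise_pr_link(href, page_url):
    if href.startswith('//'):
        return 'http://' + href.lstrip('/')                   # protocol-relative
    if href.startswith('./'):
        return get_root_path(page_url, 1) + href.lstrip('.')  # relative to current dir
    if href.startswith('/'):
        return get_root_path(page_url, 0) + href              # relative to site root
    return href                                               # already absolute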

def get_PR_link_cache(data, row, fail):
    time_start = time.perf_counter()  # time.clock() was removed in Python 3.8
    print('start caching PRESS RELEASES links')
    url = row[2]
    # If the company has no PR website of its own, a search on
    # PRNewsWire.com is done instead; nothing to cache here.
    if url == '':
        return None
    html_res = get_html(url)
    # Each list of links must stay under ~32,000 characters to fit in an
    # Excel cell, so links are spread over up to ten caches.
    link_caches = [[] for _ in range(10)]
    if not html_res:
        fail.append(url)
        print("no html results")
        return None
    # Collect all the links within the web page.
    soup = BeautifulSoup(html_res, "html.parser")
    anchors = soup.find_all('a')
    # No <a href> tags usually means the site denied access and served an
    # error page; return None so the caller's loop moves to the next company.
    if len(anchors) == 0:
        print("divs a href is empty!!")
        return None
    for a in anchors:
        f = 0
        f_link_check = 0
        link = ''
        try:
            href = a["href"]
            if href.startswith("/url?"):
                # Google redirect wrapper: extract the real target URL.
                m = re.match(r'/url\?(url|q)=(.+?)&', href)  # fixed: was matched against the empty `link`
                if m and len(m.groups()) == 2:
                    link = unquote(m.group(2))
                    f_link_check = link_check(link)
                    f = 1
            elif href.startswith(("http://", "https://", "./", "/")):
                link = href
                f_link_check = link_check(link)
                f = 1
            else:
                link = '/' + href
                f_link_check = link_check(link)
                f = 1
            # Keep the number of characters per list under the Excel cell limit.
            if f == 1 and f_link_check == 0:
                for cache in link_caches:
                    if sum(len(i) for i in cache) < 28000:
                        cache.append(link)
                        break
        except Exception:
            print('no href found')
    if link_caches[0]:
        print('PR link cache found and saved')
    for cache in link_caches:
        if cache:
            cache.sort()
            data[data.index(row)].append(cache)
    time_end = time.perf_counter()
    delta = time_end - time_start  # elapsed time, kept for parity with the original
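
# A standalone sketch of the Excel chunking rule above: Excel caps a cell
# at 32,767 characters, so each cache is checked against 28,000 to leave
# margin. Note that, as in the original, a chunk can overshoot the limit
# by one link because the check happens before the append.
def chunk_links(links, limit=28000, max_chunks=10):
    chunks = [[] for _ in range(max_chunks)]
    for link in links:
        for chunk in chunks:
            if sum(len(i) for i in chunk) < limit:
                chunk.append(link)
                break
    return [c for c in chunks if c]

print([len(c) for c in chunk_links(['a' * 20000] * 4)])  # -> [2, 2]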