# Assumed imports for the scrapers below; `st` refers to the repo's shared
# scraping-utilities module (its exact import name is not shown in this snippet).
import re
from datetime import datetime
from pprint import pprint
from urllib.request import urlopen
from warnings import warn

import numpy as np
import pandas as pd
from bs4 import BeautifulSoup


def get_section_articles(url, title=None, verbose=1, demo=False):
    # get article links from the section page
    themarker = url == r"https://www.themarker.com/"
    base_url = url if themarker else r"https://www.haaretz.co.il"
    source = BeautifulSoup(urlopen(url), 'lxml')
    articles = st.get_all_links_with_strings(
        source, (url + '/', url[len(base_url):] + '/'),
        str_requirement_fun=any)
    if demo:
        articles = articles[:3]
    urls = [a.get('href') for a in articles]
    if themarker:
        urls = [
            u for u in urls
            if u and (base_url in u or 'http' not in u)
            and re.findall(r'1\.[5-8][0-9]', u)
        ]
    urls = st.relative_to_absolute_url(urls, base_url)
    # remove duplications
    urls, ids, duplications = st.remove_duplications(urls)
    titles = [None for _ in ids]
    # summary
    st.print_scrapped_articles_summary(urls, verbose=verbose, header=title,
                                       duplications=duplications)
    return (urls, titles)

def get_all_sections(home, verbose=1): # get sections sections = st.get_all_links_with_strings( home, ('href="/home/', '"bananasDataLayerRprt')) urls = [s.get('href') for s in sections] titles = [ s.get('onclick')[len("bananasDataLayerRprt('"):-2] for s in sections ] # complete relative urls urls = st.relative_to_absolute_url(urls, r"https://www.ynet.co.il") # filter undesired sections desired_sections = ('חדשות', 'כלכלה', 'ספורט', 'תרבות', 'דיגיטל', 'בריאות וכושר', 'צרכנות', 'נדל"ן', 'חופש', 'אוכל', 'מדע', 'יחסים', 'דעות', 'קריירה') ids = [ i for i, tit in enumerate(titles) if tit.strip() in desired_sections ] titles = [titles[i] for i in ids] urls = [urls[i] for i in ids] # remove duplications urls, ids, duplications = st.remove_duplications(urls) titles = [titles[i] for i in ids] # return if verbose >= 1: print(f"{len(titles):d} Sections:") pprint(titles) return (urls, titles)
def get_articles_data(sections, save_to=None,
                      update_freq=10, verbose=2, demo=False):
    return st.get_articles_data_from_sections(
        sections, get_article_data, save_to=save_to,
        sheet_name='mako_demo' if demo else 'mako',
        update_freq=update_freq, verbose=verbose
    )

def get_section_articles(source, title=None, verbose=1, demo=False):
    # get article links from the section page
    articles = st.get_all_links_with_strings(source, ('/Article-',), True)
    if demo:
        articles = articles[:3]
    urls = [a.get('href') for a in articles]
    urls = st.relative_to_absolute_url(urls, r"https://www.mako.co.il")
    titles = [a.text.strip() for a in articles]
    titles = [tit[:tit.find('\r\n')] if tit.find('\r\n') != -1 else tit
              for tit in titles]
    # remove duplications
    urls, ids, duplications = st.remove_duplications(urls)
    titles = [titles[i] for i in ids]
    # summary
    st.print_scrapped_articles_summary(urls, titles, verbose, header=title,
                                       duplications=duplications)
    return (urls, titles)

def get_article_data(url):
    if 'https://www.themarker.com/' in url:
        return get_tm_article_data(url)
    # get page
    try:
        soup = st.url2html(url, error_on_failure=False, set_user_agent=True)
    except:
        warn(f'Bad URL: {url:s}')
        return 'BAD_URL'
    # get data from page
    try:
        title = soup.find_all('title')[0].text
        # join body paragraphs, skipping the site's reading-list promo text
        # ("the reading list allows you...", "clicking the 'Save' button...",
        #  "note: to use the reading list...")
        text = '\n'.join([
            par.text.strip()
            for par in soup.find_all('p', class_='t-body-text')
            if 'רשימת הקריאה מאפשרת לך' not in par.text
            and 'לחיצה על כפתור "שמור"' not in par.text
            and 'שים לב: על מנת להשתמש ברשימת הקריאה' not in par.text
        ])
    except:
        warn(f'Could not get title and body: {url:s}')
        return 'IRRELEVANT_PAGE'
    if len(text) < 30:
        return 'IRRELEVANT_PAGE'
    try:
        subtitle = [
            s.get('content').strip() for s in soup.find_all('meta')
            if s.get('name') == 'description'
        ][0]
    except:
        subtitle = None
    try:
        author = [
            a for a in soup.find_all('a')
            if a.get('data-statutil-writer') is not None
        ][0].text.strip()
    except:
        author = None
    try:
        date = [
            a for a in soup.find_all('time')
            if a.get('itemprop') == 'datePublished'
        ][0].text
        date = re.findall(r'[0-3][0-9]\.[0-1][0-9]\.20[0-2][0-9]', date)[0]
        date = datetime.strptime(date, '%d.%m.%Y').date()
    except:
        date = None
    return {
        'url': url, 'title': title, 'subtitle': subtitle,
        'author': author, 'date': date, 'text': text
    }

def get_all_sections(home, verbose=1):
    # get section links from the homepage
    sections = st.get_all_links_with_strings(home, ('?partner=NavBar',), True)
    urls = ['https://www.mako.co.il/news?partner=NavBar'] + \
           [s.get('href') for s in sections]
    titles = ['חדשות'] + [s.text.strip() for s in sections]
    # complete relative urls
    urls = st.relative_to_absolute_url(urls, r"https://www.mako.co.il")
    # keep only the desired sections
    # (news, money, sport, culture, health, law, food, studies & career)
    desired_sections = ('חדשות', 'כסף', 'ספורט', 'תרבות',
                        'בריאות', 'משפט', 'אוכל', 'לימודים וקריירה')
    ids = [i for i, tit in enumerate(titles)
           if tit.strip() in desired_sections]
    titles = [titles[i] for i in ids]
    urls = [urls[i] for i in ids]
    # remove duplications
    urls, ids, duplications = st.remove_duplications(urls)
    titles = [titles[i] for i in ids]
    # return
    if verbose >= 1:
        print(f"{len(titles):d} Sections:")
        pprint(titles)
    return (urls, titles)

def load_data(path, sheets=('ynet', 'mako', 'haaretz'),
              filter_str=('source', 'title', 'text'),
              force_string=('title', 'subtitle', 'text', 'url', 'link_title',
                            'author', 'section', 'source'),
              verbose=1):
    df = st.load_data_frame(path, sheets=sheets, verbose=verbose)
    # keep only rows whose key fields are non-empty strings
    for h in filter_str:
        df = df[[(isinstance(t, str) and len(t) > 0) for t in df[h].values]]
    pd.options.mode.chained_assignment = None
    # force string type on text columns (replace non-string values with '')
    for col in force_string:
        df.loc[[not isinstance(s, str) for s in df[col]], col] = ''
    # haaretz articles behind the paywall end with a truncated body ('...')
    df['blocked'] = [src == 'haaretz' and txt.endswith('...')
                     for src, txt in zip(df['source'], df['text'])]
    return df

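# Usage sketch (hypothetical): load a previously scraped spreadsheet and drop
# articles whose body was truncated by the haaretz paywall. 'articles.xlsx' is
# an example path, not one referenced elsewhere in this code.
def load_unblocked(path='articles.xlsx'):
    df = load_data(path)
    return df[~df['blocked']].reset_index(drop=True)
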
def get_article_data(url):
    # get page
    try:
        soup = st.url2html(url, 3, False)
    except:
        warn(f'Bad URL: {url:s}')
        return 'BAD_URL'
    # get data from page
    try:
        title = soup.find_all('h1', class_=None)[0].text
        heb_letters = tuple(range(ord('א'), ord('ת') + 1))
        # keep body paragraphs that are long enough and mostly Hebrew
        text = '\n'.join([
            par.text.strip()
            for par in soup.find('div', class_='article').find_all('p', class_=None)
            if len(par.text.strip()) > 30
            and np.mean([ord(c) in heb_letters for c in par.text.strip()]) > 0.5
        ])
        if len(text) < 30:
            raise ValueError()
    except:
        warn(f"Could not process URL: {url:s}")
        return 'IRRELEVANT_PAGE'
    try:
        subtitle = soup.find_all('h2', class_=None)[0].text
    except:
        subtitle = None
    try:
        author = soup.find_all('span', class_='katav')[0].text
    except:
        author = None
    try:
        date = soup.find_all('span', class_='displayDate')[0].text
        date = re.findall(r'[0-3][0-9]/[0-1][0-9]/[0-2][0-9]', date)[0]
        date = datetime.strptime(date, '%d/%m/%y').date()
    except:
        date = None
    return {'url': url, 'title': title, 'subtitle': subtitle,
            'author': author, 'date': date, 'text': text}

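# Usage sketch (hypothetical): fetch a single article and handle the sentinel
# return values ('BAD_URL' / 'IRRELEVANT_PAGE') documented above. The default
# URL is a placeholder in the site's address format, not a real article.
def print_article_demo(url='https://www.ynet.co.il/articles/0,7340,L-0000000,00.html'):
    data = get_article_data(url)
    if data in ('BAD_URL', 'IRRELEVANT_PAGE'):
        print(f'Skipping ({data}): {url}')
        return
    print(data['date'], '|', data['author'], '|', data['title'])
    print(data['text'][:200])
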
def get_homepage(url="https://www.ynet.co.il"):
    return st.get_homepage(url)

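# Usage sketch (hypothetical): discover a site's desired sections and print
# them. Assumes get_homepage and get_all_sections are the matching variants
# for one site and that the `st` helper module is importable.
if __name__ == '__main__':
    home = get_homepage()                  # parsed homepage HTML
    urls, titles = get_all_sections(home)  # filtered to desired_sections
    for section_url, section_title in zip(urls, titles):
        print(f'{section_title}: {section_url}')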