Example 1
# Imports required by the snippets below; `st` refers to the repo's shared
# scraping-helpers module, imported elsewhere in the source files.
import re
from urllib.request import urlopen
from bs4 import BeautifulSoup

def get_section_articles(url, title=None, verbose=1, demo=False):
    # get articles
    themarker = url == r"https://www.themarker.com/"
    base_url = url if themarker else r"https://www.haaretz.co.il"
    source = BeautifulSoup(urlopen(url), 'lxml')
    articles = st.get_all_links_with_strings(
        source, (url + '/', url[len(base_url):] + '/'),
        str_requirement_fun=any)
    if demo:
        articles = articles[:3]
    urls = [a.get('href') for a in articles]
    if themarker:
        # keep only TheMarker links whose article IDs match the 1.5xxx-1.8xxx pattern
        urls = [
            u for u in urls if u and (base_url in u or 'http' not in u)
            and re.findall(r'1\.[5-8][0-9]', u)
        ]
    urls = st.relative_to_absolute_url(urls, base_url)
    # remove duplications
    urls, ids, duplications = st.remove_duplications(urls)
    titles = [None for _ in ids]
    # summary
    st.print_scrapped_articles_summary(urls,
                                       verbose=verbose,
                                       header=title,
                                       duplications=duplications)
    return (urls, titles)
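A minimal usage sketch for the function above, assuming the imports shown and network access; the argument values are illustrative:

# Hypothetical call: scrape a few TheMarker homepage articles in demo mode.
urls, titles = get_section_articles(
    r"https://www.themarker.com/",  # the exact homepage URL triggers the TheMarker branch
    title='TheMarker',              # used only as the header of the printed summary
    verbose=1,
    demo=True)                      # demo mode keeps just the first 3 links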
Example 2
def get_all_sections(home, verbose=1):
    # get sections
    sections = st.get_all_links_with_strings(
        home, ('href="/home/', '"bananasDataLayerRprt'))
    urls = [s.get('href') for s in sections]
    # the section title is embedded in the onclick handler: bananasDataLayerRprt('<title>')
    titles = [
        s.get('onclick')[len("bananasDataLayerRprt('"):-2] for s in sections
    ]
    # complete relative urls
    urls = st.relative_to_absolute_url(urls, r"https://www.ynet.co.il")
    # filter undesired sections
    # News, Economy, Sports, Culture, Digital, Health & Fitness, Consumer affairs,
    # Real estate, Vacation, Food, Science, Relationships, Opinions, Career
    desired_sections = ('חדשות', 'כלכלה', 'ספורט', 'תרבות', 'דיגיטל',
                        'בריאות וכושר', 'צרכנות', 'נדל"ן', 'חופש', 'אוכל',
                        'מדע', 'יחסים', 'דעות', 'קריירה')
    ids = [
        i for i, tit in enumerate(titles) if tit.strip() in desired_sections
    ]
    titles = [titles[i] for i in ids]
    urls = [urls[i] for i in ids]
    # remove duplications
    urls, ids, duplications = st.remove_duplications(urls)
    titles = [titles[i] for i in ids]
    # return
    if verbose >= 1:
        print(f"{len(titles):d} Sections:")
        pprint(titles)
    return (urls, titles)
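The slice applied to the onclick attribute is easiest to verify on a concrete value; the handler string below is an assumption reconstructed from the prefix and the trailing "')" that the code strips:

# Illustrative onclick value (assumed format): bananasDataLayerRprt('<section name>')
onclick = "bananasDataLayerRprt('חדשות')"
title = onclick[len("bananasDataLayerRprt('"):-2]  # drop the prefix and the closing "')"
print(title)  # -> חדשות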
Example 3
File: mako.py Project: ido90/News
def get_articles_data(sections, save_to=None,
                      update_freq=10, verbose=2, demo=False):
    return st.get_articles_data_from_sections(
        sections, get_article_data, save_to=save_to,
        sheet_name='mako_demo' if demo else 'mako',
        update_freq=update_freq, verbose=verbose
    )
Example 4
File: mako.py Project: ido90/News
def get_section_articles(source, title=None, verbose=1, demo=False):
    # get articles
    articles = st.get_all_links_with_strings(source, ('/Article-',), True)
    if demo:
        articles = articles[:3]
    urls = [a.get('href') for a in articles]
    urls = st.relative_to_absolute_url(urls, r"https://www.mako.co.il")
    titles = [a.text.strip() for a in articles]
    # keep only the first line of each link's text
    titles = [tit.split('\r\n')[0] for tit in titles]
    # remove duplications
    urls, ids, duplications = st.remove_duplications(urls)
    titles = [titles[i] for i in ids]
    # summary
    st.print_scrapped_articles_summary(urls, titles, verbose,
                                       header=title, duplications=duplications)
    return (urls, titles)
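st.remove_duplications is a repo helper whose source is not shown here. From its call sites (the returned ids re-index titles and duplications is reported in the summary), a plausible reconstruction looks like this; it is a hypothetical sketch, not the repo's actual code:

def remove_duplications(urls):
    # keep the first occurrence of each URL and remember the surviving indices
    seen, kept_ids = set(), []
    for i, u in enumerate(urls):
        if u not in seen:
            seen.add(u)
            kept_ids.append(i)
    unique_urls = [urls[i] for i in kept_ids]
    duplications = len(urls) - len(unique_urls)  # how many links were dropped
    return unique_urls, kept_ids, duplications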
Example 5
def get_article_data(url):
    if 'https://www.themarker.com/' in url:
        return get_tm_article_data(url)
    # get page
    try:
        soup = st.url2html(url, error_on_failure=False, set_user_agent=True)
    except Exception:
        warn(f'Bad URL: {url:s}')
        return 'BAD_URL'
    # get data from page
    try:
        title = soup.find_all('title')[0].text
        text = '\n'.join([
            par.text.strip()
            for par in soup.find_all('p', class_='t-body-text')
            # skip Haaretz "reading list" boilerplate paragraphs
            if 'רשימת הקריאה מאפשרת לך' not in par.text
            and 'לחיצה על כפתור "שמור"' not in par.text
            and 'שים לב: על מנת להשתמש ברשימת הקריאה' not in par.text
        ])
    except Exception:
        warn(f'Could not get title and body: {url:s}')
        return 'IRRELEVANT_PAGE'
    if len(text) < 30:
        return 'IRRELEVANT_PAGE'
    try:
        subtitle = [
            s.get('content').strip() for s in soup.find_all('meta')
            if s.get('name') == 'description'
        ][0]
    except Exception:
        subtitle = None
    try:
        author = [
            a for a in soup.find_all('a')
            if a.get('data-statutil-writer') is not None
        ][0].text.strip()
    except Exception:
        author = None
    try:
        date = [
            a for a in soup.find_all('time')
            if a.get('itemprop') == 'datePublished'
        ][0].text
        date = re.findall(r'[0-3][0-9]\.[0-1][0-9]\.20[0-2][0-9]', date)[0]
        date = datetime.strptime(date, '%d.%m.%Y').date()
    except Exception:
        date = None

    return {
        'url': url,
        'title': title,
        'subtitle': subtitle,
        'author': author,
        'date': date,
        'text': text
    }
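The date field above is recovered in two steps: a regex pins down a dd.mm.yyyy substring, then strptime parses it. The same pattern in isolation (the sample string is illustrative):

import re
from datetime import datetime

raw = 'עודכן ב: 05.03.2019 12:34'  # illustrative timestamp text
match = re.findall(r'[0-3][0-9]\.[0-1][0-9]\.20[0-2][0-9]', raw)[0]
print(datetime.strptime(match, '%d.%m.%Y').date())  # -> 2019-03-05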
Example 6
File: mako.py Project: ido90/News
def get_all_sections(home, verbose=1):
    # get sections
    sections = st.get_all_links_with_strings(home, ('?partner=NavBar',), True)
    urls = ['https://www.mako.co.il/news?partner=NavBar'] +\
           [s.get('href') for s in sections]
    titles = ['חדשות'] + [s.text.strip() for s in sections]
    # complete relative urls
    urls = st.relative_to_absolute_url(urls, r"https://www.mako.co.il")
    # filter undesired sections
    # News, Money, Sports, Culture, Health, Law, Food, Studies & Career
    desired_sections = ('חדשות', 'כסף', 'ספורט', 'תרבות',
                        'בריאות', 'משפט', 'אוכל', 'לימודים וקריירה')
    ids = [i for i, tit in enumerate(titles) if tit.strip() in desired_sections]
    titles = [titles[i] for i in ids]
    urls = [urls[i] for i in ids]
    # remove duplications
    urls, ids, duplications = st.remove_duplications(urls)
    titles = [titles[i] for i in ids]
    # return
    if verbose >= 1:
        print(f"{len(titles):d} Sections:")
        pprint(titles)
    return (urls, titles)
Example 7
def load_data(path,
              sheets=('ynet', 'mako', 'haaretz'),
              filter_str=('source', 'title', 'text'),
              force_string=('title', 'subtitle', 'text', 'url', 'link_title',
                            'author', 'section', 'source'),
              verbose=1):
    df = st.load_data_frame(path, sheets=sheets, verbose=verbose)
    # drop rows whose key fields are missing or empty
    for h in filter_str:
        df = df[[(isinstance(t, str) and len(t) > 0) for t in df[h].values]]
    # silence pandas' SettingWithCopyWarning for the in-place fixes below
    pd.options.mode.chained_assignment = None
    # coerce non-string entries in text columns to empty strings
    for col in force_string:
        df.loc[[not isinstance(s, str) for s in df[col]], col] = ''
    # Haaretz truncates paywalled articles with a trailing ellipsis
    df['blocked'] = [src == 'haaretz' and txt.endswith('...')
                     for src, txt in zip(df['source'], df['text'])]
    return df
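The blocked flag marks paywalled Haaretz rows, whose scraped text ends with an ellipsis. A toy illustration on a two-row frame:

import pandas as pd

df = pd.DataFrame({'source': ['haaretz', 'ynet'],
                   'text': ['paywalled preview...', 'full article text']})
df['blocked'] = [src == 'haaretz' and txt.endswith('...')
                 for src, txt in zip(df['source'], df['text'])]
print(df['blocked'].tolist())  # -> [True, False]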
Example 8
File: mako.py Project: ido90/News
def get_article_data(url):
    # get page
    try:
        soup = st.url2html(url, 3, False)
    except Exception:
        warn(f'Bad URL: {url:s}')
        return 'BAD_URL'
    # get data from page
    try:
        title = soup.find_all('h1', class_=None)[0].text
        # code points of the Hebrew alphabet (א..ת)
        heb_letters = tuple(range(ord('א'), ord('ת') + 1))
        # keep paragraphs that are long enough and mostly Hebrew
        text = '\n'.join(
            [par.text.strip()
             for par in soup.find('div', class_='article').find_all('p', class_=None)
             if len(par.text.strip()) > 30
             and np.mean([ord(c) in heb_letters for c in par.text.strip()]) > 0.5]
        )
        if len(text) < 30:
            raise ValueError()
    except Exception:
        warn(f"Could not process URL: {url:s}")
        return 'IRRELEVANT_PAGE'
    try:
        subtitle = soup.find_all('h2', class_=None)[0].text
    except Exception:
        subtitle = None
    try:
        author = soup.find_all('span', class_='katav')[0].text
    except Exception:
        author = None
    try:
        date = soup.find_all('span', class_='displayDate')[0].text
        date = re.findall('[0-3][0-9]/[0-1][0-9]/[0-2][0-9]', date)[0]
        date = datetime.strptime(date, '%d/%m/%y').date()
    except Exception:
        date = None

    return {'url': url, 'title': title, 'subtitle': subtitle,
            'author': author, 'date': date, 'text': text}
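The paragraph filter above keeps text that is mostly Hebrew; np.mean over the boolean list is the fraction of characters inside the Hebrew Unicode block. The test on its own:

import numpy as np

heb_letters = tuple(range(ord('א'), ord('ת') + 1))  # Hebrew block code points

def mostly_hebrew(s, threshold=0.5):
    # fraction of characters that are Hebrew letters
    return np.mean([ord(c) in heb_letters for c in s]) > threshold

print(mostly_hebrew('שלום עולם'))   # -> True
print(mostly_hebrew('hello world'))  # -> False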
Example 9
def get_homepage(url="https://www.ynet.co.il"):
    return st.get_homepage(url)