def indiaTodayNews(url, depth, CITY_LIST):
    # Crawl the given URL, recursing only while depth remains and the
    # overall time budget has not been exhausted.
    final_time = time.time()
    if depth > 0 and (final_time - initial_time) < TOTAL_TIME:
        # Skip URLs that have already been stored.
        if not db.IsUrlExists(url):

            # Fetch the page source.
            pgsrc, index = wp.read_webpage(url)
            # Proceed only if the page was opened and read successfully.
            if pgsrc:
                url_soup = wp.html_parser(pgsrc)

                # Inspect the og:type meta tag to decide whether this
                # is an article or a non-article page.
                url_type = url_soup.find('meta', {'property': 'og:type'})

                # If the page is an article, extract the required details
                # before continuing the crawl.
                if url_type and (url_type.get('content') or '').lower() in ['story', 'article']:
                    try:
                        article = newspaper.Article(url)
                        article.download()
                        article.parse()
                        location = None
                        for city in CITY_LIST:
                            if city in article.url:
                                location = city
                                break
                                
                        if location:
                            extractToiNews(article.title, article.text, article.publish_date, article.url, location)

                    except Exception as e:
                        print(str(e))

                # Extract all anchor tags on this page and send each URL for further processing.
                returned_url_list = url_soup.find_all('a')

                for return_url in returned_url_list:

                    link = return_url.get('href')
                    if link and link != URL:
                        # Resolve relative links (those not starting with http) against the base URL.
                        if not link.startswith('http'):
                            link = return_link(URL, link)
                            # Recurse into links that mention one of the target cities.
                            if any(item in link for item in CITY_LIST):
                                indiaTodayNews(link, depth - 1, CITY_LIST)

                        elif 'indiatoday.in' in link:
                            # Recurse into same-site links that mention one of the target cities.
                            if any(item in link for item in CITY_LIST):
                                indiaTodayNews(link, depth - 1, CITY_LIST)
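
# The return_link helper used above is not shown in this snippet. A minimal
# sketch of what it presumably does -- resolving a relative href against the
# base URL -- using urllib.parse.urljoin (an assumption, not the original
# implementation):

from urllib.parse import urljoin

def return_link(base_url, relative_link):
    # Join a relative href (e.g. "/cities") onto the base URL to
    # produce an absolute URL the crawler can fetch.
    return urljoin(base_url, relative_link)
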
def theHinduNews(url, depth, CITY_LIST):
    # Crawl the given URL, recursing only while depth remains and the
    # overall time budget has not been exhausted.
    final_time = time.time()
    if depth > 0 and (final_time - initial_time) < TOTAL_TIME:
        # Skip URLs that have already been stored.
        if not db.IsUrlExists(url):
            # Fetch the page source.
            pgsrc, index = wp.read_webpage(url)
            # Proceed only if the page was opened and read successfully.
            if pgsrc:
                url_soup = wp.html_parser(pgsrc)
                # Inspect the og:type meta tag to decide whether this
                # is an article or a non-article page.
                url_type = url_soup.find('meta', {'property': 'og:type'})
                # If the page is an article, extract the required details
                # before continuing the crawl.
                if url_type and (url_type.get('content') or '').lower() in ['article', 'story']:
                    try:
                        article = newspaper.Article(url)
                        article.download()
                        article.parse()
                        publish_date = url_soup.find('meta', {'name': 'publish-date'})
                        location = url_soup.find('meta', {'property': 'article:section'})
                        if publish_date:
                            publish_date = publish_date.get('content')

                        if location:
                            location = location.get('content')

                        # Store the article only if its section matches one of the target cities.
                        if location and any(item.lower() in location.lower() for item in CITY_LIST):
                            extractToiNews(article.title, article.text, publish_date, article.url, location)

                    except Exception as e:
                        print(str(e))

                # Extract all anchor tags on this page and send each URL for further processing.
                returned_url_list = url_soup.find_all('a')

                for return_url in returned_url_list:

                    link = return_url.get('href')
                    if link and link != URL:
                        # Resolve relative links (those not starting with http) against the base URL.
                        if not link.startswith('http'):
                            link = return_link(URL, link)
                            # Recurse into links that mention one of the target cities.
                            if any(item in link for item in CITY_LIST):
                                theHinduNews(link, depth - 1, CITY_LIST)

                        elif 'thehindu.com' in link:
                            # Recurse into same-site links that mention one of the target cities.
                            if any(item in link for item in CITY_LIST):
                                theHinduNews(link, depth - 1, CITY_LIST)
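
# These crawlers all repeat the same og:type check. A small helper capturing
# that pattern might look like this (a sketch; the name is_article_page is
# hypothetical and not part of the original code):

def is_article_page(url_soup):
    # Treat a page as an article when its og:type meta tag says
    # "article" or "story".
    url_type = url_soup.find('meta', {'property': 'og:type'})
    return bool(url_type) and (url_type.get('content') or '').lower() in ['article', 'story']
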
def HindustanTimesNewsSiteCrawler(url, depth, CITY_LIST):
    print("Opening the Website: ", url)
    # reading given url page
    pgsrc, index = wp.read_webpage(url)

    # if source page read is not None
    if pgsrc:

        # parsing the page
        soup = wp.html_parser(pgsrc)

        # read all anchor tags
        a_tags = soup.find_all('a')
        # print(a_tags)
        for tag in a_tags:

            link = tag.get('href')
            if not link:
                continue
            # Resolve relative links against the base URL.
            if not link.startswith('http'):
                link = return_link(url, link)

            hindustanTimesNews(link, depth, CITY_LIST)
    else:
        print("Could not read the page")
def TheHinduNewsSiteCrawler(url, depth, CITY_LIST):
    print("Opening the Website: ", url)
    
    # reading given url page
    pgsrc, index = wp.read_webpage(url)

    # if source page read is not None
    if pgsrc:

        # parsing the page
        soup = wp.html_parser(pgsrc)

        # read all anchor tags
        a_tags = soup.find_all('a')

        for tag in a_tags:

            link = tag.get('href')
            if not link:
                continue
            # Resolve relative links against the base URL.
            if not link.startswith('http'):
                link = return_link(url, link)

            theHinduNews(link, depth, CITY_LIST)
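
# db.IsUrlExists is the only part of the db module visible in this snippet.
# A minimal in-memory stand-in (a sketch; whether the real implementation
# also records the URL on lookup is not shown -- this version does, so a
# page is never crawled twice):

_seen_urls = set()

def IsUrlExists(url):
    # Report whether this URL was already seen, remembering it as a side
    # effect so the crawlers skip it on the next encounter.
    if url in _seen_urls:
        return True
    _seen_urls.add(url)
    return False
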
def hindustanTimesNews(url, depth, CITY_LIST):
    # Crawl the given URL, recursing only while depth remains and the
    # overall time budget has not been exhausted.
    final_time = time.time()
    if depth > 0 and (final_time - initial_time) < TOTAL_TIME:
        # Skip URLs that have already been stored.
        if not db.IsUrlExists(url):
            # Fetch the page source.
            pgsrc, index = wp.read_webpage(url)
            # Proceed only if the page was opened and read successfully.
            if pgsrc:
                url_soup = wp.html_parser(pgsrc)
                # Inspect the og:type meta tag to decide whether this
                # is an article or a non-article page.
                tag = url_soup.find('meta', {'property': 'og:type'})

                if tag and tag.get('content') == 'article':
                    print("Article")
                    try:
                        # if the url is article type then extract required details and do further processing.
                        article = newspaper.Article(url)
                        article.download()
                        article.parse()
                        location = url_soup.find('meta', {'name': 'section'})
                        # Take the first token of the section name
                        # (split on whitespace, then on '-').
                        location = location.get('content').split()[0].split('-')[0]
                        if any(item.lower() in location.lower()
                               for item in CITY_LIST):
                            extract_details(article.title, article.text,
                                            article.publish_date, article.url,
                                            location)

                    except Exception as e:
                        print("Exception occurred: ", e)

                else:
                    print("Non Article")

                # Extract all anchor tags on this page and send each URL for further processing.
                returned_url_list = url_soup.find_all('a')
                # print(returned_url_list)
                for return_url in returned_url_list:

                    link = return_url.get('href')
                    if link and link != URL:
                        # Resolve relative links (those not starting with http) against the base URL.
                        if not link.startswith('http'):
                            link = return_link(URL, link)
                            # Recurse into links that mention one of the target cities.
                            if any(item in link for item in CITY_LIST):
                                hindustanTimesNews(link, depth - 1, CITY_LIST)

                        elif 'hindustantimes.com' in link:
                            # Recurse into same-site links that mention one of the target cities.
                            if any(item in link for item in CITY_LIST):
                                hindustanTimesNews(link, depth - 1, CITY_LIST)
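
# extract_details and extractToiNews are each called with the same five
# fields but are not shown. A hypothetical stub illustrating the assumed
# signature (the real functions presumably persist the article via the
# db module rather than printing it):

def extract_details(title, text, publish_date, url, location):
    # Placeholder body: echo the extracted fields.
    print(location, '|', publish_date, '|', url)
    print(title)
    print(text[:200])
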
def news18(url, depth, CITY_LIST):
    # Crawl the given URL, recursing only while depth remains and the
    # overall time budget has not been exhausted.
    final_time = time.time()
    if depth > 0 and (final_time - initial_time) < TOTAL_TIME:
        # Skip URLs that have already been stored.
        if not db.IsUrlExists(url):
            # Fetch the page source.
            pgsrc, index = wp.read_webpage(url)
            # Proceed only if the page was opened and read successfully.
            if pgsrc:
                url_soup = wp.html_parser(pgsrc)
                # Inspect the og:type meta tag to decide whether this
                # is an article or a non-article page.
                url_type = url_soup.find('meta', {'property': 'og:type'})
                # If the page is an article, extract the required details
                # before continuing the crawl.
                if url_type and (url_type.get('content') or '').lower() in ['article', 'story']:
                    try:
                        article = newspaper.Article(url)
                        article.download()
                        article.parse()

                        # Pull the publication date from the page's JSON-LD
                        # metadata; json.loads (requires an import json at
                        # module level) is safer than eval on page content.
                        articleDate = None
                        date_tags = url_soup.find_all(
                            'script', {'type': 'application/ld+json'})
                        for date_tag in date_tags:
                            contentDict = json.loads(date_tag.get_text())
                            if 'datePublished' in contentDict:
                                articleDate = contentDict['datePublished']
                                break

                        location = None
                        for city in CITY_LIST:
                            if city in article.url:
                                location = city
                                break

                        if location:
                            extractToiNews(article.title, article.text,
                                           articleDate, article.url, location)

                    except Exception as e:
                        print(str(e))

                # Extract all anchor tags on this page and send each URL for further processing.
                returned_url_list = url_soup.find_all('a')

                for return_url in returned_url_list:

                    link = return_url.get('href')
                    if link and link != URL:
                        # Resolve relative links (those not starting with http) against the base URL.
                        if not link.startswith('http'):
                            link = return_link(URL, link)
                            # Recurse into links that mention one of the target cities.
                            if any(item in link for item in CITY_LIST):
                                news18(link, depth - 1, CITY_LIST)

                        elif 'news18.com' in link:
                            # Recurse into same-site links that mention one of the target cities.
                            if any(item in link for item in CITY_LIST):
                                news18(link, depth - 1, CITY_LIST)
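
# A sketch of how one of these crawlers might be launched. The seed URL,
# depth, time budget, and city list below are illustrative assumptions;
# initial_time, TOTAL_TIME, and URL are the module-level globals the
# functions above depend on.

if __name__ == '__main__':
    URL = 'https://www.hindustantimes.com'   # hypothetical seed URL
    TOTAL_TIME = 3600                        # assumed crawl budget, in seconds
    CITY_LIST = ['mumbai', 'delhi', 'pune']  # assumed cities of interest
    initial_time = time.time()
    HindustanTimesNewsSiteCrawler(URL, 3, CITY_LIST)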