Example 1
def load_previous(data_base):
    previous = []
    try:
        # Read the saved records; each article occupies ten lines.
        with open("./documents/web/news.bank", "r", encoding='utf8') as file:
            for line in file:
                previous.append(line)

        i = 0
        while i < len(previous):

            url = regex.get_data(r'>\s(.+?)\s<', previous[i + 4])[0]
            key = regex.get_data(r'>\s(.+?)\s<', previous[i + 1])[0]
            #date = regex.get_data(r'>\s(.+?)\s<', previous[i + 5])[0]

            data_base[key].append(url)

            #data_base[url][key] = date
            #data_base[url] = defaultdict(str)
            #data_base[id]['id'] = previous[i]
            #data_base[key]['key'] = previous[i]
            #data_base[url]['title'] = previous[i+1]
            #data_base[url]['source'] = previous[i+2]
            #data_base[url]['url'] = previous[i+3]
            #data_base[url]['date'] = previous[i+4]
            #data_base[url]['author'] = previous[i+5]
            #data_base[url]['content1'] = previous[i+6]
            #data_base[url]['content2'] = previous[i+7]

            i += 10

    except FileNotFoundError:
        pass
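
A minimal usage sketch for the loader above, assuming data_base is a collections.defaultdict(list) mapping keywords to URL lists (as the data_base[key].append(url) call implies) and that regex.get_data returns the captured groups; the keyword contents are hypothetical:

from collections import defaultdict

data_base = defaultdict(list)    # keyword -> list of already-saved article URLs
load_previous(data_base)
for key, saved in data_base.items():
    print(key, '->', len(saved), 'articles already on disk')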
Example 2
def update_key(data_base, url, kkey):
    keys_saved = regex.get_data(r'<key>\s(.+?)\s<', data_base[url]['key'])

    if kkey not in keys_saved:
        # Drop the trailing newline, append the new tag, and restore it.
        data_base[url]['key'] = data_base[url]['key'][:-1]
        data_base[url]['key'] += ' <key> ' + kkey + ' <\\key>\n'
        return True

    return False
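
The record format handled above closes tags with <\key> rather than </key>. A standard-library sketch of the same append logic, run on a hypothetical record string:

import re

record = ' <key> tax_reform <\\key>\n'
saved = re.findall(r'<key>\s(.+?)\s<', record)    # ['tax_reform']
if 'health_care' not in saved:
    # Drop the trailing newline, append the new tag, restore the newline.
    record = record[:-1] + ' <key> health_care <\\key>\n'
print(record)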
Example 3
def check_last_update(url, date):
    count = 0
    for u in url:
        # Each URL embeds its publication date as /YYYY/MM/DD/.
        d = regex.get_data(r'\S+/(\d+/\d+/\d+)\S+', u)[0]
        d = int(re.sub(r'/', '', d))
        if d < date:
            return count

        count += 1

    return -1
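
check_last_update relies on each URL embedding its publication date as /YYYY/MM/DD/: stripping the slashes yields an integer that orders chronologically. A stdlib-only illustration with hypothetical URLs:

import re

urls = [
    'https://example.com/opinions/2018/03/02/newer-piece/',
    'https://example.com/opinions/2018/02/20/older-piece/',
]
for u in urls:
    d = re.findall(r'\S+/(\d+/\d+/\d+)\S+', u)[0]
    print(u, '->', int(d.replace('/', '')))    # e.g. 20180302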
Example 4
def washington_post(data_base, data_print, key, date):

    hp.login("*****@*****.**", "qazxdr12")

    kkey = fmt.file_name(key,'_')

    kkkey = fmt.file_name(key,'+')

    print("----- "+"washington_post."+kkey+" -----")
    print("Start loading Urls...")


    #case for exact keyword search
    url1='https://sitesearchapp.washingtonpost.com/sitesearch-api/v2/search.json?count=20&datefilter=displaydatetime:%5B*+TO+NOW%2FDAY%2B1DAY%5D&facets.fields=%7B!ex%3Dinclude%7Dcontenttype,%7B!ex%3Dinclude%7Dname&filter=%7B!tag%3Dinclude%7Dcontenttype:("Article"+OR+(contenttype:"Blog"+AND+name:("Opinions")))&highlight.fields=headline,body&highlight.on=true&highlight.snippets=1&query="'
    url2='"&sort=&startat='
    url3='&callback=angular.callbacks._0'
    baseurl = url1 + kkkey + url2 + '0' + url3

    try:
        page = hp.getHtml(baseurl)
    except urllib.error.URLError:
        print("Washington Post website is not correct, please check the code!")
        return -1

    article_number = regex.get_data(r'"total"\S(\S+?),"documents', page)[0]
    if int(article_number) == 0:

        url1='https://sitesearchapp.washingtonpost.com/sitesearch-api/v2/search.json?count=20&datefilter=displaydatetime:%5B*+TO+NOW%2FDAY%2B1DAY%5D&facets.fields=%7B!ex%3Dinclude%7Dcontenttype,%7B!ex%3Dinclude%7Dname&filter=%7B!tag%3Dinclude%7Dcontenttype:("Article"+OR+(contenttype:"Blog"+AND+name:("Opinions")))&highlight.fields=headline,body&highlight.on=true&highlight.snippets=1&query='
        url2='&sort=&startat='
        url3='&callback=angular.callbacks._0'
        baseurl = url1 + kkkey + url2 + '0' + url3

        try:
            page = hp.getHtml(baseurl)
        except urllib.error.URLError:
            print("Washington Post website is not correct, please check the code!")
            return -1

        article_number = regex.get_data(r'"total"\S(\S+?),"documents', page)[0]

        if int(article_number) == 0:
            print("No Washington Post article was found for this keyword")
            return -1

    #get all urls
    count = 0
    index = 0
    urls = []
    page_total = int(int(article_number) / 20 + 1)
    while count < page_total:

        # Use the encoded keyword (kkkey), matching baseurl above.
        currenturl = url1 + kkkey + url2 + str(index) + url3
        try:
            page = hp.getHtml(currenturl)
        except urllib.error.URLError:
            continue

        url = regex.get_data('"contenturl"\S"(https:\/\/www.washingtonpost.com\/opinions/\S+?)"\S"',page)

        if date != 0:
            a_num = check.check_last_update(url, date)
            if a_num != -1:
                # Keep only the articles newer than the last update.
                urls += url[:a_num]
                break

        urls += url

        index += 20
        count += 1
    

    print(str(len(urls)) + " URLs loaded...")

    print("There are " + str(len(data_base) + len(data_print)) + " loaded files...")

    print("Now starting updating...")
    count = 0
    #count2 = 0

    for url in urls:

        if url in data_base and kkey in data_base[url]:
            #if check.update_key(data_base, url, kkey):
            #    count2 += 1
            continue

        try:
            html = hp.getHtml(url)
        except urllib.error.URLError:
            continue


        title = regex.get_data(r'"headline":"(.*?)",', html)
        #<meta content="Julian Zelizer, CNN Political Analyst" name="author">
        author = regex.get_data(r'this.props.author="(.*?)";', html)
        #<meta content="2018-02-17T00:19:47Z" name="pubdate">
        date = regex.get_data(r'"datePublished":"(\S+?)T', html)

        text2 = regex.get_data(r'<article.*?>(.*?)<\/p>\s<\/article>', html)

        if text2 != []:
            text = regex.get_data(r'<p.*?>(.*?)<\/p>', text2[0])
        else:
            text = regex.get_data(r'<p.*?>(.*?)<\/p>', html)

        if text == [] or title == []:
            continue    

        data_print[url] = defaultdict(str)
        # line 1
        data_print[url]['ID'] = fmt.formatted_id(len(data_base)+len(data_print)-1)
        data_print[url]['key'] = fmt.formatted_key(kkey)
        # line 2
        data_print[url]['title'] = fmt.formatted_title(title)
        # line 3
        data_print[url]['source'] = fmt.formatted_source("Washington Post")
        # line 4
        data_print[url]['url'] = fmt.formatted_url(url)
        # line 5
        data_print[url]['date'] = fmt.formatted_date(date)
        # line 6
        data_print[url]['author'] = fmt.formatted_author(author,';')
        # line 7
        data_print[url]['content1'] = fmt.formatted_content_with_symbol(text)
        # line 8
        data_print[url]['content2'] = fmt.formatted_content(text)

        count += 1



    print("Updated "+str(count)+" articles...")
    #if count2 > 0:
    #    print("Updated "+str(count2)+" keys...")
    print("There are "+str(len(data_base)+len(data_print))+" articles...")
Example 5
def cbs(data_base, data_print, key, date_, previous_len):

    kkey = fmt.file_name(key, '_')

    kkkey = fmt.file_name(key, '+')

    print("----- " + "cbs." + kkey + " -----")
    print("Start loading Urls...")

    # Percent-encode the query by hand: escape '%' first so the '%2F'
    # introduced for '/' is not re-escaped to '%252F' afterwards.
    kkkey = re.sub(r'%', '%25', kkkey)
    kkkey = re.sub(r'/', '%2F', kkkey)
    #kkkey = re.sub(r'\+', '%2B', kkkey)

    #case for exact keyword search
    url1 = 'https://www.cbsnews.com/search/?q='
    url2 = '&o=1&p='
    url3 = '&t=opinion'
    baseurl = url1 + kkkey + url2 + '1' + url3
    article_number = '0'

    try:
        page = hp.getHtml(baseurl)
    except urllib.error.URLError:
        print("CBS website is not correct, please check the code!")
        return -1

    try:
        article_number = regex.get_text(r'<h1\sclass="result-title">(\d+)\s',
                                        page)[0]
    except IndexError:
        article_number = '0'

    if int(article_number) == 0:
        print("No CBS article was found for this keyword")
        return -1

    #get all urls
    count = 0
    index = 0
    page_num = 1
    urls = defaultdict(str)
    page_total = int(int(article_number) / 10 + 1)

    reach_updated = False

    print("There are " + article_number + " articles...")
    print("Start loading and Updating...")
    while (count < page_total):

        # Use the encoded keyword (kkkey), matching baseurl above.
        currenturl = url1 + kkkey + url2 + str(page_num) + url3
        try:
            page = hp.getHtml(currenturl)
        except urllib.error.URLError:
            continue

        url = regex.get_text(r'<a\shref="(\S+?)"><h3\sclass="title"', page)
        date = regex.get_text(
            r'<span\sclass="date">(\S+?\s\d+,\s\d+?)\s\S+\s\S+?\s\S+<\/span>',
            page)

        cnt = -1
        for cnt in range(0, len(date)):
            date[cnt] = fmt.convert_date(date[cnt])
            if date_ > date[cnt]:
                reach_updated = True
                break

        # Keep articles strictly newer than the last update; cnt stays -1
        # when no dates were found on the page.
        for i in range(0, cnt if reach_updated else cnt + 1):
            try:
                urls['https://www.cbsnews.com' +
                     url[i]] = str(date[i])[0:4] + '-' + str(
                         date[i])[4:6] + '-' + str(date[i])[6:8]
            except IndexError:
                break

        if reach_updated:
            break

        index += 10
        page_num += 1
        count += 1

    print(str(len(urls)) + " URLs loaded...")
    print("Updating database...")

    for url in urls:
        if url in data_base[kkey]:
            continue

        try:
            html = hp.getHtml(url)
        except urllib.error.URLError:
            continue

        title = regex.get_data(r'<title>([^<]+?)\s-\s[^<]+?<\/title>', html)
        if title == 'Noun':
            title = regex.get_data(r'<title>([^<]+?)<\/title>', html)
        author = regex.get_data(
            r'"author":{".type":"[^"]+?","name":"([^"]+?)"}', html)

        text1 = []
        text1.append(
            regex.get_data(r'<div\sdata-page=[^>]+?><[^>]*?>\n?([^\n]+?)<.?p>',
                           html))
        text2 = regex.get_text(r'<p>([^\n]+?)<\/p>', html)
        text = text1 + text2

        if text == [] or title == "Noun":
            continue

        data_base[kkey].append(url)

        data_print[url] = defaultdict(str)
        # line 1
        data_print[url]['ID'] = fmt.formatted_id(
            len(data_base[kkey]) - 1 + previous_len)
        # line 2
        data_print[url]['key'] = fmt.formatted_key(kkey)
        # line 3
        data_print[url]['title'] = fmt.formatted_title(title)
        # line 4
        data_print[url]['source'] = fmt.formatted_source("CBS")
        # line 5
        data_print[url]['url'] = fmt.formatted_url(url)
        # line 6
        data_print[url]['date'] = fmt.formatted_date(urls[url])
        # line 7
        # Drop the role/organisation suffix after the last comma; fall back
        # to the 'Noun Noun' placeholder when no author was found.
        if len(author) != 0:
            aa = author.split(',')
            if len(aa) > 1:
                author = ','.join(aa[:-1])
        else:
            author = 'Noun Noun'

        data_print[url]['author'] = fmt.formatted_author(author, ',')
        # line 8
        data_print[url]['content'] = fmt.formatted_content(text)
        # line 9
        #data_print[url[i]]['content2'] = fmt.formatted_content(text)

        print('■', end='', flush=True)

    print("\nThere are " + str(len(data_print) + previous_len) +
          " articles...")
    print("Updated " + str(len(data_print)) + " articles...")
Example 6
def cnn(data_base, data_print, key, date_, previous_len):

    kkey = fmt.file_name(key, '_')

    kkkey = fmt.file_name(key, '+')

    print("----- " + "cnn." + kkey + " -----")

    #case for exact keyword search
    url1 = 'https://search.api.cnn.io/content?size=10&q=%22'
    url2 = '%22&category=opinion'
    baseurl = url1 + key + url2
    article_number = '0'

    try:
        page = hp.getHtml(baseurl)
    except urllib.error.URLError:
        print("CNN website is not correct, please update the scraper!")
        return -1

    article_number = regex.get_text(r'"meta":{\S+?"of":(\d+?),"maxScore',
                                    page)[0]

    if int(article_number) == 0:
        print("No CNN article was found for this keyword")
        return -1

    #get all urls
    count = 0
    index = 0
    page_num = 1
    urls = defaultdict(str)
    page_total = int(int(article_number) / 10 + 1)

    reach_updated = False

    print("There are " + article_number + " articles...")
    print("Start loading URLs...")

    while (count < page_total):

        currenturl = url1 + key + url2 + '&from=' + str(
            index) + '&page=' + str(page_num)
        try:
            page = hp.getHtml(currenturl)
        except urllib.error.URLError:
            continue

        url = regex.get_text('"url":"([^,]+?.html)"\S"', page)
        #title =  regex.get_text('"headline":"([^{]*?)"',page)
        #author = regex.get_text('"byLine":(.*?),',page)

        for i in range(0, len(url)):
            try:
                d = regex.get_data(r'/(\d+?/\d+?/\d+?)/', url[i])
            except IndexError:
                break
            d_int = int(re.sub(r'/', '', d))

            if date_ > d_int:
                reach_updated = True
                break

            urls[url[i]] = re.sub(r'/', '-', d)

        if reach_updated:
            break

        index += 10
        page_num += 1
        count += 1

    print(str(len(urls)) + " URLs loaded...")
    print("Updating database...")

    for url in urls:
        if url in data_base[kkey]:
            continue

        try:
            html = hp.getHtml(url)
        except urllib.error.URLError:
            continue

        title = regex.get_data(r'<title>([^<]+?)\s-\s\w+?<\/title>', html)
        if title == 'Noun':
            title = regex.get_data(r'<title>([^<]+?)<\/title>', html)
        author = regex.get_data(r'<meta\scontent\S"([^"]+?)"\sname="author">',
                                html)

        text2 = []
        text2.append(
            regex.get_data(
                r'<cite\sclass="el-editorial-source">\s\S\S\S\S\S</cite>([^=]*?)<\/p><\/div>',
                html))
        text1 = regex.get_text(
            r'<div\sclass="zn-body__paragraph\s*?\w*?">([^=]+?)</div>?', html)

        text = text2 + text1

        if text == [] or title == "Noun":
            continue

        data_base[kkey].append(url)

        data_print[url] = defaultdict(str)
        # line 1
        data_print[url]['ID'] = fmt.formatted_id(
            len(data_base[kkey]) - 1 + previous_len)
        # line 2
        data_print[url]['key'] = fmt.formatted_key(kkey)
        # line 3
        data_print[url]['title'] = fmt.formatted_title(title)
        # line 4
        data_print[url]['source'] = fmt.formatted_source("CNN")
        # line 5
        data_print[url]['url'] = fmt.formatted_url(url)
        # line 6
        data_print[url]['date'] = fmt.formatted_date(urls[url])
        # line 7
        # Strip a leading "By " and the role suffix after the last comma.
        if len(author) > 5:
            if author[0:3] == "By ":
                author = author[3:]
            aa = author.split(',')
            if len(aa) > 1:
                author = ','.join(aa[:-1])
        else:
            author = 'Noun Noun'

        data_print[url]['author'] = fmt.formatted_author(author, ',')
        # line 8
        data_print[url]['content'] = fmt.formatted_content(text)
        # line 9
        #data_print[url[i]]['content2'] = fmt.formatted_content(text)
        print('■', end='', flush=True)

    print("\nThere are " + str(len(data_print) + previous_len) +
          " articles...")
    print("Updated " + str(len(data_print)) + " articles...")
Example 7
def politico(data_base, data_print, key, date_, previous_len):

    kkey = fmt.file_name(key, '_')

    kkkey = fmt.file_name(key, '+')

    print("----- " + "politico." + kkey + " -----")

    # https://www.politico.com/search/2?s=newest&q=tax%20reform&adv=true&c=0000014b-324d-d4f3-a3cb-f3ff415e0035&pv=0000014e-a307-d012-a3fe-bb8793910000
    url1 = 'https://www.politico.com/search/'
    url2 = '?s=newest&q="'
    url3 = '"&adv=true&c=0000014b-324d-d4f3-a3cb-f3ff415e0035&pv=0000014e-a307-d012-a3fe-bb8793910000'
    baseurl = url1 + '1' + url2 + key + url3
    article_number = '0'

    try:
        page = hp.getHtml(baseurl)
    except urllib.error.URLError:
        print("Politico website is not correct, please update the scraper!")
        return -1

    article_number = regex.get_text(
        r'<h1>Results[^<]+?<\/h1>[^<]+?<p>(\d+?)\sSearch\sResults<\/p>',
        page)[0]

    if int(article_number) == 0:
        print("No Politico article was found for this keyword")
        return -1

    #get all urls
    count = 0
    page_num = 1
    urls = defaultdict(str)
    page_total = int(int(article_number) / 20 + 1)

    reach_updated = False

    print("There are " + article_number + " articles...")
    print("Start loading URLs...")

    while (count < page_total):

        currenturl = url1 + str(page_num) + url2 + key + url3
        try:
            page = hp.getHtml(currenturl)
        except urllib.error.URLError:
            continue

        url = regex.get_text(r'<a\shref="([^"]+?)"\s[^<]+?<\/a><\/h3>', page)
        date = regex.get_text(
            r'<time datetime=.(\d+?-\d+?-\d+?)T\S+.>[^<]+?<\/time><\/p>',
            page)

        #title =  regex.get_data('"title":"([^{]*?)",',page)
        cnt = -1
        for cnt in range(0, len(date)):
            date[cnt] = int(re.sub(r'-', '', date[cnt]))
            if date_ > date[cnt]:
                reach_updated = True
                break

        # Keep articles strictly newer than the last update; cnt stays -1
        # when no dates were found on the page.
        for i in range(0, cnt if reach_updated else cnt + 1):
            try:
                urls[url[i]] = str(date[i])[0:4] + '-' + str(
                    date[i])[4:6] + '-' + str(date[i])[6:8]
            except IndexError:
                break

        if reach_updated:
            break

        page_num += 1
        count += 1

    print(str(len(urls)) + " URLs loaded...")
    print("Updating database...")

    for url in urls:
        if url in data_base[kkey]:
            continue

        try:
            html = hp.getHtml(url)
        except urllib.error.URLError:
            continue

        title = regex.get_data(r'<title>([^-]+?)\s-\s[^<]+?<\/title>', html)
        if title == 'Noun':
            title = regex.get_data(r'<title>([^<]+?)<\/title>', html)

        author = regex.get_data(
            r'<div\sitemprop="author"[^>]+?>[^<]+?<meta\s[^<]+?\s[^"]+?="([^>]+?)"\/>',
            html)

        text = regex.get_text(r'<p>([^\n]*?)</p>', html)

        if text != []:
            text = text[:-1]

        if text == [] or title == "Noun":
            continue

        data_base[kkey].append(url)

        data_print[url] = defaultdict(str)
        # line 1
        data_print[url]['ID'] = fmt.formatted_id(
            len(data_base[kkey]) - 1 + previous_len)
        # line 2
        data_print[url]['key'] = fmt.formatted_key(kkey)
        # line 3
        data_print[url]['title'] = fmt.formatted_title(title)
        # line 4
        data_print[url]['source'] = fmt.formatted_source("Politico")
        # line 5
        data_print[url]['url'] = fmt.formatted_url(url)
        # line 6
        data_print[url]['date'] = fmt.formatted_date(urls[url])
        # line 7
        data_print[url]['author'] = fmt.formatted_author(author, ',')
        # line 8
        data_print[url]['content'] = fmt.formatted_content(text)
        # line 9
        #data_print[url[i]]['content2'] = fmt.formatted_content(text)

        print('■', end='', flush=True)

    print("\nThere are " + str(len(data_print) + previous_len) +
          " articles...")
    print("Updated " + str(len(data_print)) + " articles...")
Example 8
def foxnews(data_base, data_print, key, date_, previous_len):

    kkey = fmt.file_name(key, '_')

    print("----- " + "foxnews." + kkey + " -----")
    print("Start loading Urls...")

    #case for exact keyword search
    url1 = 'http://api.foxnews.com/v1/content/search?q="'
    url2 = '"&fields=date,description,title,url,image,type,taxonomy&sort=latest&section.path=fnc/opinion&type=article&start='
    url3 = '&callback=angular.callbacks._0'
    baseurl = url1 + key + url2 + '0' + url3
    article_number = '0'

    try:
        page = hp.getHtml(baseurl)
    except urllib.error.URLError:
        print("Foxnews website is not correct, please update the scraper!")
        return -1

    article_number = regex.get_text(r'"response"\S\S"numFound":(\S+),"docs":\S',
                                    page)[0]

    if int(article_number) == 0:
        print("No Foxnews article was found for this keyword")
        return -1

    #get all urls
    count = 0
    index = 0
    urls = defaultdict(str)
    page_total = int(int(article_number) / 10 + 1)

    reach_updated = False

    print("There are " + article_number + " articles...")
    print("Start loading and Updating...")

    while (count < page_total):

        currenturl = url1 + key + url2 + str(index) + url3
        try:
            page = hp.getHtml(currenturl)
        except urllib.error.URLError:
            continue

        url = regex.get_text(r'url":\S"(\S+?)"\S', page)
        #title =  regex.get_data('"title":"([^{]*?)",',page)

        for i in range(0, len(url)):
            try:
                d = regex.get_data(r'/(\d+/\d+/\d+)', url[i])
            except IndexError:
                break
            d_int = int(re.sub(r'/', '', d))

            if date_ > d_int:
                reach_updated = True
                break

            urls[url[i]] = re.sub(r'/', '-', d)

        if reach_updated:
            break

        index += 10
        count += 1

    print(str(len(urls)) + " URLs loaded...")
    print("Updating database...")

    for url in urls:
        if url in data_base[kkey]:
            continue

        try:
            html = hp.getHtml(url)
        except urllib.error.URLError:
            continue

        title = regex.get_data(r'<meta\sname="dc.title"\scontent="([^=]+?)">',
                               html)
        author = regex.get_data(
            r'<meta\sname="dc.creator"\scontent="([^"]+?)">', html)

        text = regex.get_text(r'<p[^>]*?>([^\n]*?)</p>[^<]*?<[^/]', html)
        if text != []:
            text = text[:-1]

        if text == [] or title == "Noun":
            continue

        data_base[kkey].append(url)

        data_print[url] = defaultdict(str)
        # line 1
        data_print[url]['ID'] = fmt.formatted_id(
            len(data_base[kkey]) - 1 + previous_len)
        # line 2
        data_print[url]['key'] = fmt.formatted_key(kkey)
        # line 3
        data_print[url]['title'] = fmt.formatted_title(title)
        # line 4
        data_print[url]['source'] = fmt.formatted_source("Foxnews")
        # line 5
        data_print[url]['url'] = fmt.formatted_url(url)
        # line 6
        data_print[url]['date'] = fmt.formatted_date(urls[url])
        # line 7
        data_print[url]['author'] = fmt.formatted_author(author, ',')
        # line 8
        data_print[url]['content'] = fmt.formatted_content(text)
        # line 9
        #data_print[url[i]]['content2'] = fmt.formatted_content(text)

        print('■', end='', flush=True)

    print("\nThere are " + str(len(data_print) + previous_len) +
          " articles...")
    print("Updated " + str(len(data_print)) + " articles...")