Example 1
    def parse(self, response):
        url = response.url
        full_url = response.urljoin(url)
        news = newspaper(full_url)

        dict_to_return = news.get_dict
        all_paras = []

        for i in response.xpath("//div[@class='fs-17 pt-2 noto-regular']/p"):
            para = i.xpath(".//text()").get()
            all_paras.append(para)

        tags_list = []
        for i in response.xpath(
                "//div[@class='pb-3 text-center fs-12 uk-text-69 noto-regular listed_topics']/a"
        ):
            tag = i.xpath(".//text()").get()
            tags_list.append(tag)

        full_text = ' \n'.join(all_paras)

        dict_to_return['text_by_para'] = full_text

        dict_to_return['tags_list'] = tags_list

        yield dict_to_return
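Example 1 is a parse callback that apparently lives inside a Scrapy spider and hands each crawled page to newsfetch. A minimal sketch of the surrounding spider it assumes, with a hypothetical class name, spider name, and placeholder start URL:

import scrapy
from newsfetch.news import newspaper

class NewsSpider(scrapy.Spider):
    # Hypothetical wrapper: only the parse() logic shown above comes from the example.
    name = 'news_example'
    start_urls = ['https://example.com/some-article']  # placeholder

    def parse(self, response):
        # Delegate extraction to newsfetch and yield its dictionary of fields.
        yield newspaper(response.url).get_dict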
Example 2
def write_articles(topic):
    # link_file_name = 'data/links/news_' + topic + '.csv'
    link_file_name = 'data/links/moneycontrol_' + topic + '.csv'
    if not os.path.exists(link_file_name) or os.stat(
            link_file_name).st_size == 0:
        print('skipping, link file does not exist')
        return

    with open(link_file_name, 'r') as link_file:
        csv_reader = csv.reader(link_file)
        link_count = 0
        for row in csv_reader:
            link = row[0]
            link_count += 1
            print(link)
            news = newspaper(link)
            news_dict = news.__dict__
            # print(news_dict['get_dict'])
            if news_dict['get_dict']['headline'] == '':
                return
            # article_file_name = 'data/articles/articles_' + topic + '.csv'
            article_file_name = 'data/articles/moneycontrol_' + topic + '.csv'
            # if os.path.exists(link_file_name) and os.stat(link_file_name).st_size != 0:
            #     print('skipping, article file already exist')
            #     continue

            with open(article_file_name, 'a+') as article_file:
                csv_writer = csv.writer(article_file)
                if link_count == 1:
                    csv_writer.writerow(news_dict['get_dict'].keys())

                csv_writer.writerow(news_dict['get_dict'].values())

        print('total_links', link_count)
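write_articles relies on csv, os, and newsfetch imports that sit outside this excerpt. A minimal sketch of that module header plus a driver loop, with write_articles defined as above; the topic names are placeholders, not from the original project:

import csv
import os

from newsfetch.news import newspaper

if __name__ == '__main__':
    # Hypothetical topics; each maps to data/links/moneycontrol_<topic>.csv
    for topic in ['business', 'markets', 'technology']:
        write_articles(topic)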
Example 3
def run(input, client, output_path, index_name):
    INPUT = str(input)
    CLIENT = lib.create_connection(client)
    OUTPUT_PATH = output_path
    INDEX_NAME = index_name
    MODE = "url"

    if not lib.check_index(client=CLIENT, index=INDEX_NAME):
        lib.logger.debug(f"{INDEX_NAME} not found.")
        return Exception(f"{INDEX_NAME} not found.")

    website = newspaper(INPUT)
    fulltext = website.article

    try:
        nltk_download('punkt')
    except:
        pass
    sentences = tokenize.sent_tokenize(fulltext.strip())

    scores_sentences = lib.get_scores(CLIENT, INDEX_NAME, sentences)
    format_scores_sentences = lib.format_scores(sentences, scores_sentences)
    result = lib.save_result(fulltext, INDEX_NAME, INPUT,
                             format_scores_sentences, OUTPUT_PATH, MODE)
    return result
Example 4
def update_sentiment(n_clicks, input_value):
    news = newspaper(input_value)
    publish_date = news.date_publish
    headline = news.headline
    body = news.article
    sentiment = round(sid.polarity_scores(body)['compound'], 2)
    summary = news.summary
    keywords = ', '.join(news.keywords)
    authors = ', '.join(news.authors)
    return sentiment, publish_date, headline, body, summary, keywords, authors
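update_sentiment uses a module-level sid object that the excerpt never defines; the call to polarity_scores()['compound'] suggests NLTK's VADER analyzer. A sketch of that setup, assuming this is indeed what sid refers to:

import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer

# Assumed setup for the sid scorer used in update_sentiment above.
nltk.download('vader_lexicon')
sid = SentimentIntensityAnalyzer()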
Example 5
def prediction():
    news = []
    text = newspaper(data["data"])
    news.append(text.article)
    news = lstm['tokenizer'].texts_to_sequences(news)
    news = tf.keras.preprocessing.sequence.pad_sequences(
        news, padding='post', maxlen=256)
    pred = lstm['model'].predict(news)
    print('fake' if pred < 0 else 'true')
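prediction depends on a module-level lstm dict holding a fitted tokenizer and a trained Keras model, plus a data payload, none of which is shown. One way that state could be loaded; the file names, dict keys, and URL are assumptions:

import pickle
import tensorflow as tf

# Hypothetical loading step: the example only shows lstm['tokenizer'] and lstm['model'] being used.
with open('tokenizer.pickle', 'rb') as handle:
    tokenizer = pickle.load(handle)                              # assumed pickled Keras Tokenizer
lstm = {'tokenizer': tokenizer,
        'model': tf.keras.models.load_model('lstm_model.h5')}    # assumed saved-model path
data = {'data': 'https://example.com/some-article'}              # placeholder article URL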
Example 6
                print(dir)
                print(f"ATTEMPTING : {url} ({external_link})")
                r = requests.get(url, timeout=1000, allow_redirects=True)

                if r.status_code != 200:
                    print(f"WEB ARCHIVE ERROR: {r.status_code}")
                    f2.write(dir)
                    f2.write("\n")
                    continue

                out['request_response_url'] = r.url
                post['archived_link'] = out['request_response_url']
                post['original_link'] = external_link

            # print("NEWSPAPER")
            n = newspaper(external_link)
            # pprint(n.get_dict)

            out['summary_title'] = n.headline
            out['summary_description'] = n.description
            out['summary_summary'] = n.summary
            out['summary_article'] = n.article
            out['summary_dict'] = n.get_dict

            # print("CPZ")

            f.write(json.dumps(out))
            f.write("\n")

            post['title'] = out['summary_title']
Example 7
                        # print(driver.save_screenshot("ERROR.png"))
                        pass

                    # Wayback Machine
                    print("WAYBACK MACHINE")
                    wayback = waybackpy.Url(res['external_link'])
                    try:
                        res['archived_url'] = wayback.newest().archive_url
                    except:
                        print("SAVING ON WAYBACK")
                        wayback.save()
                        res['archived_url'] = wayback.newest().archive_url

                    # Newspaper Metadata
                    print("NEWSFETCH")
                    n = newspaper(res['external_link']).get_dict
                    res['headline'] = n['headline']
                    res['summary'] = n['summary']
                    res['article'] = n['article']
                    res['description'] = n['description']
                    res['publication'] = n['publication']
                    res['date'] = n['date_publish']
                    res['url'] = n['url']
                    res['original_url'] = n['url']

                    f.write(json.dumps(res))
                    f.write("\n")
                    f.flush()

                    out.append(res)
Example 8
from PIL import Image
# from answers import POSITIVE, NEGATIVE
import pytesseract
import json, sys
import base64
import pathlib
import string
from newsfetch.news import newspaper
PATH = "C:/Users/Joao/Documents/Projetos/TCC/fake-news-detector/genuino/src/"
pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

# POSITIVE = ["sim", "s", "certo", "correto", "isso", "isso mesmo", "ok"]
# NEGATIVE = ["não", "n", "errado", "incorreto", "não é isso", "está errado", "ok"]

news = newspaper('http://www.agendadopoder.com/lstarticle.aspx?id=3623')
print(news.article)

file = open(PATH + "tokens/base64.json")
data = json.load(file)
data['img'] = data['img'].replace("data:image/jpeg;base64,", "")
img = base64.b64decode(data['img'])
filename = PATH + "assets/img/received/img.jpg"
with open(filename, 'wb') as f:
    f.write(img)
print(pytesseract.image_to_string(Image.open(filename), lang='por'))

# print("O texto está correto?")
# res = input()

# if res in POSITIVE:
#     print("Estou analisando a notícia, aguarde um momento...")
Example 9
def grab_data():
    asli = []
    
    for j in websites:
        response = requests.get(j)
            
        soup = BeautifulSoup(response.text,'html.parser')
        
        url = soup.find_all('a')
        
        
        # Keep only the href targets of the anchors found on the page.
        hrefs = []
        for anchor in url:
            try:
                hrefs.append(anchor['href'])
            except (KeyError, TypeError):
                pass
        url = hrefs
        
        # Deduplicate the collected links while preserving order.
        var = []
        for link in url:
            if link not in var:
                var.append(link)

        url = var
    
        try:
            f  = open('urlparsed.txt','r')
            already_parsed = f.read().split('\n')
            f.close()
        except:
            f = open('urlparsed.txt','w')
            for i in url:
                try:
                    i =i['href']
                except:
                    pass
                f.write(str(i))
                f.write('\n')
            f.close()
           
        try:
            for i in already_parsed:
                try:
                    url.remove(i)
                except:
                    pass
            
            for i in url:
                already_parsed.append(i)
            
            f = open('urlparsed.txt','w')
            for i in already_parsed:
                    try:
                        i =i['href']
                    except:
                        pass
                    f.write(str(i))
                    f.write('\n')
            f.close()
        except:
            pass
            
        for i in url:    
            try :
                try:
                    i =i['href']
                except:
                    pass
                
                
                if 'https' not in i:
                    if 'http' not in i:
                        i = j+i
                
                #print('\n',i)
                    
                response = requests.get(i,timeout=10)
                details = newspaper(i)
                count = len(details.article)
                publish_date = details.date_publish
                cr_date = details.date_download
                description = details.description
                summary = details.summary
                category = details.category
                if count > 1500:
                    if len(description) > 10 or len(summary) > 10:
                
                        #print("Appended")
                        asli.append(i)
                        
                    else:
                        pass
                else:
                    pass
            except:
                pass
    
    
    
    
    headline=[]
    timestamp=[]
    AUTHORS =[]
    SUMMARY=[]
    date_crawled = []
    news_source = [] 
    full = []
    img_url = []
    keywords=[]
    url_news=[]
    types = []
    for i in asli:
        try:
            chrome_options = Options()
            chrome_options.add_argument('--headless')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--disable-dev-shm-usage')
            driver = webdriver.Chrome('/usr/bin/chromedriver',chrome_options = chrome_options)
            driver.get(i)
            details = newspaper(i)
            
            if 'bbc' in i:
                news_source.append('bbc')
            elif 'techcrunch' in i:
                news_source.append('techcrunch')
            elif 'theguardian' in i:
                news_source.append('theguardian')
            elif 'voanews' in i:
                news_source.append('voanews')
            elif 'abc.net' in i:
                news_source.append('abc')
                
            headline.append(details.headline)
            timestamp.append(details.date_publish)
            url_news.append(i)
            types.append('newspaper')
            # Join author and keyword lists into comma-separated strings.
            AUTHORS.append(', '.join(details.authors))

            keywords.append(', '.join(details.keywords))
            
            if len(details.summary) > 10:
                SUMMARY.append(details.summary)
            else:
                SUMMARY.append(details.description)
            
            date_crawled.append(details.date_download)
            
            
            full.append(details.article)
            # Grab the first .jpg image on the page, if any.
            try:
                im = None
                for img_tag in driver.find_elements_by_tag_name('img'):
                    src = img_tag.get_attribute('src')
                    if src and '.jpg' in src:
                        im = src
                        break
                img_url.append(im)
            except:
                img_url.append(None)
           # print('Done inside')
            
            driver.close()
        except:
            try:
                driver.close()
            except:
                pass
            pass
    
    final = pd.DataFrame({'Title':headline,'Author':AUTHORS,'Summary':SUMMARY,
                              'full_text':full,'date_published':timestamp, 'date_crawled':date_crawled,
                              'news_source':news_source,'img':img_url,'keywords':keywords,'url_news':url_news,'Types':types})
        
    for i in final.index:
        try:
            t = pd.DataFrame()
            t =t.append(final.loc[i])
            t.reset_index(drop=True, inplace=True)
            try:
                count = search(t.loc[0]['Title'],t.loc[0]['news_source'])
                #print(count)
                if count is None or count < 25:
                    test =t.loc[0].to_json()
                    send_data(test,t.loc[0]['news_source'])
                    #print('Data sent')
                else:
                    pass
                    #print('Skipped')
            except:
                test =t.loc[0].to_json()
                send_data(test,t.loc[0]['news_source'])
                
        except Exception as e:
            pass
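grab_data leans on several module-level pieces the excerpt never shows: a websites list, the scraping and Selenium imports, and the project-specific search and send_data helpers. A sketch of the implied imports and a placeholder websites list; the URLs are illustrative, and search/send_data remain project-specific and are not reconstructed here:

import requests
import pandas as pd
from bs4 import BeautifulSoup
from newsfetch.news import newspaper
from selenium import webdriver
from selenium.webdriver.chrome.options import Options

# Assumed landing pages matching the news_source checks inside grab_data; adjust as needed.
websites = [
    'https://www.bbc.com/news',
    'https://techcrunch.com',
    'https://www.theguardian.com/international',
]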
Example 10
def get_title(url):
    return newspaper(url).headline
Example 11
def summarize(url, queue):
    news = newspaper(url)
    queue.put(summarizer.summarize(news.article, words=100))
Example 12
def get_keywords(url, queue):
    news = newspaper(url)
    queue.put(news.keywords)
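summarize and get_keywords both push their result onto a queue, which suggests they are meant to run concurrently. A hedged usage sketch with multiprocessing; the URL is a placeholder, and the two functions are assumed to be importable as defined above:

from multiprocessing import Process, Queue

if __name__ == '__main__':
    url = 'https://example.com/some-article'  # placeholder
    summary_q, keywords_q = Queue(), Queue()
    jobs = [Process(target=summarize, args=(url, summary_q)),
            Process(target=get_keywords, args=(url, keywords_q))]
    for job in jobs:
        job.start()
    for job in jobs:
        job.join()
    print(summary_q.get(), keywords_q.get())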
Example 13
## This Python code lets you fetch the details of a news article using its link URL
from newsfetch.news import newspaper
import json

news = newspaper(
    'https://edition.cnn.com/travel/article/disney-world-trip-planning-2020/index.html'
)

# Collect the extracted fields so the result can be serialized to JSON
data = {
    'Article': [{
        'headline': news.headline,
        'author': news.authors,
        'publish_date': news.date_publish,
        'modify_date': news.date_modify,
        'download_date': news.date_download,
        'image_url': news.image_url,
        'filename': news.filename,
        'description': news.description,
        'publication': news.publication,
        'category': news.category,
        'source_domain': news.source_domain,
        'article': news.article,
        'summary': news.summary,
        'keyword': news.keywords,
        'title_page': news.title_page,
        'title_rss': news.title_rss,
        'url': news.uri
    }]
}
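The snippet above imports json but never serializes the dict it builds; a minimal completion that writes the result to disk (the output file name is an assumption):

# Serialize the assembled dictionary; default=str covers datetime fields such as publish_date.
with open('article.json', 'w') as out_file:
    json.dump(data, out_file, indent=4, default=str)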
Example 14
def get_sentences(worp,n=15):
    wordorphrase=worp
    mode = 1
    sentence_list=[]
    link_read=[]
    repeats=0
    while len(sentence_list)<n:
        
        if mode==1:
            
            try:
                google_news = GNews(max_results=75)
                # google_news = GNews(max_results=2)
                temp = google_news.get_news(wordorphrase)
                for i in range(len(temp)):
                    # print(temp[i]['url'])
                    link=temp[i]['url']
                    print("No of sentences collected:",len(sentence_list)," "*10,end='\r')
                    
                    if repeats==15 or len(sentence_list)>=n:
                        repeats=0
                        raise Exception
                    if link in link_read:
                        
                        repeats=repeats+1
                        continue
                    # Skip links that are not reachable within the timeout.
                    try:
                        requests.get(link, timeout=3)
                    except:
                        continue
                    link_read.append(link)
                    news=newspaper(link)
                    article_data=news.article
                    if wordorphrase in article_data:
                        sent=sentences(wordorphrase,article_data)
                        sentence_list.append(str(len(sentence_list)+1)+") "+sent+"<br>")
            except:
             
                mode=2
                
                continue
            mode=2

        elif mode==2:
            
            try:
                url = bing_news.get_search_url(wordorphrase)
                next_page_url=url
                while next_page_url is not None:
                    resp = requests.get(next_page_url)
                    html = resp.text
                    results, next_page_url = bing_news.extract_search_results(html, url)
                    # print(len(results))
                    for result in results:
                        # print(result['url'])
                        print("No of sentences collected:",len(sentence_list)," "*10,end='\r')
                        
                        if repeats==15 or len(sentence_list)>=n:
                            repeats=0
                            raise Exception
                        if result['url'] in link_read:
                            
                            repeats=repeats+1
                            continue
                        # Skip links that are not reachable within the timeout.
                        try:
                            requests.get(result['url'], timeout=3)
                        except:
                            continue
                        link_read.append(result['url'])
                        news=newspaper(result['url'])
                        article_data=news.article
                        if wordorphrase in article_data:
                            try:
                                sent=sentences(wordorphrase, article_data)
                                sentence_list.append(str(len(sentence_list)+1)+") "+sent+"<br>")
                                
                            except:
                                continue
                    time.sleep(10)
            except:
                
                mode=3
                
                continue
            mode=3

        elif mode==3:
            
            try:
                url = yahoo_news.get_search_url(wordorphrase)
                next_page_url=url
                while next_page_url is not None:
                    resp = requests.get(next_page_url)
                    html = resp.text
                    results, next_page_url = yahoo_news.extract_search_results(html, url)
                    # print(len(results))
                    for result in results:
                        # print(result['url'])
                        print("No of sentences collected:",len(sentence_list)," "*10,end='\r')
                        if repeats==10 or len(sentence_list)>=n:
                            repeats=0
                            raise Exception
                        if result['url'] in link_read:
                            repeats = repeats + 1
                            continue
                        # Skip links that are not reachable within the timeout.
                        try:
                            requests.get(result['url'], timeout=3)
                        except:
                            continue
                        link_read.append(result['url'])
                        news=newspaper(result['url'])
                        article_data=news.article
                        if wordorphrase in article_data:
                            try:
                                sent=sentences(wordorphrase, article_data)
                                sentence_list.append(str(len(sentence_list)+1)+") "+sent+"<br>")
                            except:
                                continue
                    time.sleep(10)
            except:
                break
            break
        # print("Curr Mode: ",mode," "*15)
        time.sleep(10)
    print(sentence_list)
    return " ".join(sentence_list)