Example #1
def main():
    maxInt = sys.maxsize

    while True:
        # decrease the maxInt value by factor 10
        # as long as the OverflowError occurs.

        try:
            csv.field_size_limit(maxInt)
            break
        except OverflowError:
            maxInt = int(maxInt / 10)

    create_file("news.csv")
    create_file("papers.csv")
    create_file("others.csv")
    articles = read_articles()
    x = 0
    for item in articles:
        x = x + 1
        # if x < 30050:
        #     continue
        if x % 50 == 0:
            print(str(x) + ": " + item[2])
        real_url = get_page_url(item[2])
        if real_url:
            a = Article(real_url)
            if a.is_valid_url():
                write_csv("news.csv", item)
            elif ("introduction" in item[1].lower()
                    and "result" in item[1].lower()):
                write_csv("papers.csv", item)
            else:
                write_csv("others.csv", item)
Example #2
def get_valid_articles(tweet_df, db_article_collection):
    articles = {}
    tweet_url_list = tweet_df.collected_urls.to_list()
    all_urls = [
        url_info["url"] for url_list in tweet_url_list for url_info in url_list
    ]
    already_in_db_urls = [
        el["url"]
        for el in db_article_collection.find({"url": {
            "$in": all_urls
        }}, {"url"})
    ]

    for idx, url_list in enumerate(tweet_url_list):
        for url_info in url_list:
            if url_info["url"] in articles:
                articles[url_info["url"]]["id"].append(url_info["tweet_id"])
                articles[url_info["url"]]["object_id"].append(
                    tweet_df.iloc[idx].object_id)
                continue
            potential_article = Article(url_info["url"])
            if potential_article.is_valid_url():
                article_features = {}
                if url_info["url"] not in already_in_db_urls:
                    article_features = load_article_content(potential_article)
                    # we assume that articles without author and publish_date are not proper newspaper articles
                    if (article_features["authors"] == []
                            and article_features["publish_date"] is None):
                        continue
                article_features["id"] = [url_info["tweet_id"]]
                article_features["object_id"] = [tweet_df.iloc[idx].object_id]
                article_features["url"] = url_info["url"]
                articles[article_features["url"]] = article_features
    return articles
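
Example #2 deduplicates articles by URL while accumulating the tweet ids that point at each one. The same aggregation pattern, stripped of the pandas and MongoDB dependencies, can be sketched as follows (the url_records input shape is an assumption for illustration):

def group_tweets_by_url(url_records):
    # url_records: iterable of dicts shaped like {"url": ..., "tweet_id": ...},
    # a hypothetical simplification of url_info in Example #2.
    articles = {}
    for record in url_records:
        entry = articles.setdefault(record["url"], {"url": record["url"], "id": []})
        entry["id"].append(record["tweet_id"])
    return articles

# group_tweets_by_url([{"url": "https://example.com/a", "tweet_id": 1},
#                      {"url": "https://example.com/a", "tweet_id": 2}])
# -> {"https://example.com/a": {"url": "https://example.com/a", "id": [1, 2]}}
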
Example #3
def predict():

    article_data = {}

    url = request.get_data(as_text=True)[5:]
    url = urllib.parse.unquote(url)
    article = Article(str(url))

    if article.is_valid_url():
        article.download()
        article.parse()
        article.nlp()

        article_data = {
            'title': article.title,
            'authors': article.authors,
            'summary': article.summary,
            'publish_date': article.publish_date,
            'images': article.images,
            'videos': article.movies,
            'url': article.url
        }

        # Pass the news article to the model and return whether it is Fake or Real
        article_data['pred'] = model.predict([article_data['summary']])
        article_data['pred_probability'] = model.predict_proba(
            [article_data['summary']])

    else:
        article_data['error'] = 'Something seems wrong with the link provided.'

    return render_template('index.html', article_data=article_data)
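
Example #3 is a Flask view that relies on module-level request, render_template, model, and Article objects that are not shown. A minimal sketch of the assumed surrounding wiring (the app object, the /predict route, and the model.pkl path are all hypothetical; Article is assumed to come from newspaper3k, as the surrounding calls suggest):

from flask import Flask, render_template, request
import urllib.parse

import joblib
from newspaper import Article

app = Flask(__name__)
model = joblib.load("model.pkl")  # assumed: a fitted text-classification pipeline

@app.route("/predict", methods=["POST"])
def predict():
    ...  # body as in Example #3
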
Example #4
 def __init__(self, domain: Domain.Domain):
     url = domain.getArticleUrl()
     self.articleUrl = url
     article = Article(url, language='it')
     self.isValidUrl = article.is_valid_url()
     if self.isValidUrl:
         article.download()
         article.parse()
         self.title = article.title
         self.author = article.authors
         self.publish_date = article.publish_date
         self.text = article.text
         self.tags = article.tags
         article.nlp()
         self.summary = article.summary
         self.keywords = article.keywords
Example #5
def get_news_data(url, num_words=None):
    """Retrieves information about the news article"""
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()

    metadata = article.meta_data
    if num_words is None:
        summary_ = summarize(article.text)
    else:
        summary_ = summarize(article.text, words=num_words)
    authors = [metadata['author']]
    for author in article.authors:
        if author not in authors:
            authors.append(author)

    return News(article.title, authors, metadata['description'], article.text,
                article.summary, summary_,
                article.is_valid_body() and article.is_valid_url(),
                metadata['og']['site_name'], metadata['generator'])
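
A quick way to exercise Example #5, assuming News, summarize, and Article are importable in the same module (the URL below is a placeholder):

if __name__ == "__main__":
    news_item = get_news_data("https://example.com/some-story", num_words=60)
    print(news_item)
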
Example #6
    def run(self, set_local=None, pickle=False, verbose=False):
        if not set_local:
            set_local = self.settings.Options.Set_Local

        if verbose:
            print('Starting Scraper')

        start_time = datetime.now()

        reddit = self.settings.Reddit_Params
        art_ignore = self.settings.Article.None_Article_Links

        API = Parameters()
        API.loader('dat/praw.secret')
        API = API.loaded.API_Script_Keys

        api = praw.Reddit(client_id=API.client_id,
                          client_secret=API.client_secret,
                          password=API.password,
                          user_agent=API.user_agent,
                          username=API.username)

        posts_dict = {
            "post title": [],
            "subreddit": [],
            "score": [],
            "is article": [],
            "article title": [],
            "title polarity": [],
            "title objectivity": [],
            "keywords": [],
            "domain": [],
            "link": [],
            "author": [],
            "text": [],
            "comments": [],
            "date": [],
            "target": [],
        }

        article_count = 0
        invalid_links = 0
        failed_links_c = 0
        failed_links = []
        red_sub = 0
        blue_sub = 0

        if verbose:
            print("Pulling Articles")

        for sub in reddit.Red_List + reddit.Blue_List:
            submissions = (x for x in api.subreddit(sub).hot(
                limit=reddit.Scraper_Depth_Limit) if not x.stickied)

            for post in submissions:

                if sub in reddit.Red_List:
                    posts_dict["target"].append(True)
                    red_sub += 1
                if sub in reddit.Blue_List:
                    blue_sub += 1
                    posts_dict["target"].append(False)

                posts_dict["post title"].append(
                    post.title)  # praw reddit scraping to dict
                posts_dict["link"].append(post.url)
                posts_dict["score"].append(int(post.score))
                posts_dict["subreddit"].append(sub)
                posts_dict["date"].append(
                    datetime.fromtimestamp(post.created_utc))

                comments = []  # Comments parsing and scoring
                for comment in post.comments:
                    try:
                        if comment.author != 'AutoModerator':
                            comments.append(
                                (round(comment.score / (post.num_comments),
                                       2), comment.body))
                    except Exception:
                        pass
                posts_dict["comments"].append(comments)

                parsed_url = urlparse(post.url)  # Parse URL for domain
                posts_dict['domain'].append(parsed_url.netloc)

                post_blob = TextBlob(post.title)  # TextBlob NLP - VERY SIMPLE
                posts_dict["title polarity"].append(post_blob.sentiment[0])
                posts_dict["title objectivity"].append(post_blob.sentiment[1])
                posts_dict["keywords"].append(post_blob.noun_phrases)

                article = Article(post.url)  # Instantiate newspaper3k library
                if (article.is_valid_url()
                        and parsed_url.netloc not in art_ignore):

                    try:  # Try to download and parse article
                        article.download()
                        article.parse()

                        article_count += 1
                        posts_dict["is article"].append(True)

                        if article.title:  # Title parsed?
                            posts_dict["article title"].append(article.title)
                        else:
                            posts_dict["article title"].append(np.nan)

                        if article.authors:  # Author parsed?
                            posts_dict["author"].append(article.authors)
                        else:
                            posts_dict["author"].append(np.nan)

                        if article.text:  # Text parsed?
                            posts_dict['text'].append(article.text)
                        else:
                            posts_dict["text"].append(np.nan)

                    except Exception:
                        posts_dict["is article"].append(False)
                        posts_dict["article title"].append(np.nan)
                        posts_dict["author"].append(np.nan)
                        posts_dict["text"].append(np.nan)
                        failed_links_c += 1
                        failed_links.append(post.url)

                else:
                    invalid_links += 1
                    posts_dict["is article"].append(False)
                    posts_dict["article title"].append(np.nan)
                    posts_dict["author"].append(np.nan)
                    posts_dict["text"].append(np.nan)

        if set_local:
            time_now = self.utc_to_pacific(datetime.now())
        else:
            time_now = datetime.now()  # Set local Time
        log_date = time_now.strftime('%m%d%y_%H%M')

        if verbose:
            print("Generating DataFrame")

        posts_df = pd.DataFrame(posts_dict)  # Make it a dataframe
        posts_df = posts_df[[
            "subreddit", "post title", "title polarity", "title objectivity",
            "score", "keywords", "comments", "domain", "link", "is article",
            "article title", "author", "text", "date", "target"
        ]]

        if pickle:
            posts_df.to_pickle(f'log/{log_date}.pickle')

        z = datetime.now() - start_time
        self.scrape_time = f"{(z.seconds//60)%60}min, {z.seconds%60}sec"

        log = Parameters()
        log.loader('log/scraper.log', 'loaded', default=True)

        log.loaded.SCRAPERLOG.Date = time_now.ctime()
        log.loaded.SCRAPERLOG.Scraper_Timer = self.scrape_time
        log.loaded.SCRAPERLOG.Article_Count = article_count
        log.loaded.SCRAPERLOG.Invalid_Links = invalid_links
        log.loaded.SCRAPERLOG.Failed_Links = failed_links
        log.loaded.SCRAPERLOG.Failed_Links_Count = failed_links_c
        log.loaded.SCRAPERLOG.Red_Sub_Count = red_sub
        log.loaded.SCRAPERLOG.Blue_Sub_Count = blue_sub

        log.writer('log/scraper.log', log.loaded, append=True)
        log.writer('log/scraper.log', self.settings, append=True)

        self.scraper_df = posts_df