def main():
    maxInt = sys.maxsize
    while True:
        # Decrease the maxInt value by a factor of 10
        # as long as the OverflowError occurs.
        try:
            csv.field_size_limit(maxInt)
            break
        except OverflowError:
            maxInt = int(maxInt / 10)

    create_file("news.csv")
    create_file("papers.csv")
    create_file("others.csv")

    articles = read_articles()
    x = 0
    for item in articles:
        x = x + 1
        # if x < 30050:
        #     continue
        if x % 50 == 0:
            print(str(x) + ": " + item[2])
        real_url = get_page_url(item[2])
        if real_url:
            a = Article(real_url)
            if a.is_valid_url():
                write_csv("news.csv", item)
            elif item[1].lower().find("introduction") != -1 and item[
                    1].lower().find("result") != -1:
                write_csv("papers.csv", item)
            else:
                write_csv("others.csv", item)
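# main() above assumes `import sys`, `import csv`, and
# `from newspaper import Article` at module level, plus helpers
# (create_file, write_csv, read_articles, get_page_url) that are not shown.
# A minimal sketch of the CSV helpers, given as an illustrative assumption
# rather than the original implementations:
import csv


def create_file(path):
    # Truncate (or create) the output file so each run starts clean.
    open(path, "w", encoding="utf-8").close()


def write_csv(path, row):
    # Append one row (an iterable of fields) to the given CSV file.
    with open(path, "a", newline="", encoding="utf-8") as f:
        csv.writer(f).writerow(row)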
def get_valid_articles(tweet_df, db_article_collection):
    articles = {}
    tweet_url_list = tweet_df.collected_urls.to_list()
    all_urls = [
        url_info["url"] for url_list in tweet_url_list for url_info in url_list
    ]
    already_in_db_urls = [
        el["url"]
        for el in db_article_collection.find({"url": {"$in": all_urls}}, {"url"})
    ]
    for idx, url_list in enumerate(tweet_url_list):
        for url_info in url_list:
            if url_info["url"] in articles.keys():
                articles[url_info["url"]]["id"].append(url_info["tweet_id"])
                articles[url_info["url"]]["object_id"].append(
                    tweet_df.iloc[idx].object_id)
                continue
            potential_article = Article(url_info["url"])
            if potential_article.is_valid_url():
                article_features = {}
                if not url_info["url"] in already_in_db_urls:
                    article_features = load_article_content(potential_article)
                    # We assume that articles without author and publish_date
                    # are not proper newspaper articles.
                    if (article_features["authors"] == []
                            and article_features["publish_date"] is None):
                        continue
                article_features["id"] = [url_info["tweet_id"]]
                article_features["object_id"] = [tweet_df.iloc[idx].object_id]
                article_features["url"] = url_info["url"]
                articles[article_features["url"]] = article_features
    return articles
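# get_valid_articles() expects a pandas DataFrame with an `object_id` column
# and a `collected_urls` column holding a list of {"url", "tweet_id"} dicts per
# row, plus a MongoDB collection of already-stored articles. A minimal sketch
# of that setup, assuming pandas and pymongo (connection details and sample
# values are illustrative assumptions):
import pandas as pd
from pymongo import MongoClient

tweet_df = pd.DataFrame({
    "object_id": ["abc123"],
    "collected_urls": [[{"url": "https://example.com/story", "tweet_id": 1}]],
})

client = MongoClient("mongodb://localhost:27017")  # assumed local instance
db_article_collection = client["news"]["articles"]

# articles = get_valid_articles(tweet_df, db_article_collection)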
def predict():
    article_data = {}
    url = request.get_data(as_text=True)[5:]
    url = urllib.parse.unquote(url)
    article = Article(str(url))
    if article.is_valid_url():
        article.download()
        article.parse()
        article.nlp()
        article_data = {
            'title': article.title,
            'authors': article.authors,
            'summary': article.summary,
            'publish_date': article.publish_date,
            'images': article.images,
            'videos': article.movies,
            'url': article.url
        }
        # Pass the article summary to the model and return whether it is Fake or Real.
        article_data['pred'] = model.predict([article_data['summary']])
        article_data['pred_probability'] = model.predict_proba(
            [article_data['summary']])
    else:
        article_data['error'] = 'Something seems wrong with the link provided.'
    return render_template('index.html', article_data=article_data)
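# The view above assumes a Flask app, a trained `model` with predict/
# predict_proba, and the newspaper library are already wired up. A minimal
# sketch of that wiring follows; the file name 'model.pkl' and the '/predict'
# route are assumptions for illustration, not taken from the original source.
import pickle
import urllib.parse

from flask import Flask, request, render_template
from newspaper import Article

app = Flask(__name__)

# Hypothetical: a pickled text-classification pipeline trained elsewhere.
with open('model.pkl', 'rb') as f:
    model = pickle.load(f)

# Hypothetical route registration; the original snippet only shows the view body.
# app.add_url_rule('/predict', view_func=predict, methods=['POST'])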
def __init__(self, domain: Domain.Domain):
    url = domain.getArticleUrl()
    self.articleUrl = url
    article = Article(url, language='it')
    self.isValidUrl = article.is_valid_url()
    if self.isValidUrl:
        article.download()
        article.parse()
        self.title = article.title
        self.author = article.authors
        self.publish_date = article.publish_date
        self.text = article.text
        self.tags = article.tags
        article.nlp()
        self.summary = article.summary
        self.keywords = article.keywords
def get_news_data(url, num_words=None):
    """Retrieves information about the news article."""
    article = Article(url)
    article.download()
    article.parse()
    article.nlp()
    metadata = article.meta_data

    if num_words is None:
        summary_ = summarize(article.text)
    else:
        summary_ = summarize(article.text, words=num_words)

    authors = [metadata['author']]
    for author in article.authors:
        if author not in authors:
            authors.append(author)

    return News(article.title, authors, metadata['description'], article.text,
                article.summary, summary_,
                article.is_valid_body() and article.is_valid_url(),
                metadata['og']['site_name'], metadata['generator'])
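# get_news_data() depends on a `News` container and a `summarize` text
# summarizer that are defined elsewhere and not shown. A minimal sketch of how
# the pieces could fit together, assuming `News` is a simple namedtuple (the
# field names below are illustrative assumptions, not the original definition):
from collections import namedtuple

News = namedtuple('News', [
    'title', 'authors', 'description', 'text', 'nlp_summary', 'summary',
    'is_valid', 'site_name', 'generator'
])

# Hypothetical usage:
# news = get_news_data('https://example.com/some-article', num_words=100)
# print(news.title, news.is_valid)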
def run(self, set_local=None, pickle=False, verbose=False):
    if not set_local:
        set_local = self.settings.Options.Set_Local
    if verbose:
        print('Starting Scraper')
    start_time = datetime.now()
    reddit = self.settings.Reddit_Params
    art_ignore = self.settings.Article.None_Article_Links

    API = Parameters()
    API.loader('dat/praw.secret')
    API = API.loaded.API_Script_Keys
    api = praw.Reddit(client_id=API.client_id,
                      client_secret=API.client_secret,
                      password=API.password,
                      user_agent=API.user_agent,
                      username=API.username)

    posts_dict = {
        "post title": [],
        "subreddit": [],
        "score": [],
        "is article": [],
        "article title": [],
        "title polarity": [],
        "title objectivity": [],
        "keywords": [],
        "domain": [],
        "link": [],
        "author": [],
        "text": [],
        "comments": [],
        "date": [],
        "target": [],
    }

    article_count = 0
    invalid_links = 0
    failed_links_c = 0
    failed_links = []
    red_sub = 0
    blue_sub = 0

    if verbose:
        print("Pulling Articles")
    for sub in reddit.Red_List + reddit.Blue_List:
        submissions = (x for x in api.subreddit(sub).hot(
            limit=reddit.Scraper_Depth_Limit) if not x.stickied)
        for post in submissions:
            if sub in reddit.Red_List:
                posts_dict["target"].append(True)
                red_sub += 1
            if sub in reddit.Blue_List:
                blue_sub += 1
                posts_dict["target"].append(False)

            posts_dict["post title"].append(post.title)  # praw reddit scraping to dict
            posts_dict["link"].append(post.url)
            posts_dict["score"].append(int(post.score))
            posts_dict["subreddit"].append(sub)
            posts_dict["date"].append(
                datetime.fromtimestamp(post.created_utc))

            comments = []  # Comments parsing and scoring
            for comment in post.comments:
                try:
                    if comment.author != 'AutoModerator':
                        comments.append(
                            (round(comment.score / (post.num_comments), 2),
                             comment.body))
                except Exception:
                    pass
            posts_dict["comments"].append(comments)

            parsed_url = urlparse(post.url)  # Parse URL for domain
            posts_dict['domain'].append(parsed_url.netloc)

            post_blob = TextBlob(post.title)  # TextBlob NLP - VERY SIMPLE
            posts_dict["title polarity"].append(post_blob.sentiment[0])
            posts_dict["title objectivity"].append(post_blob.sentiment[1])
            posts_dict["keywords"].append(post_blob.noun_phrases)

            article = Article(post.url)  # Instantiate newspaper3k library
            if article.is_valid_url() and parsed_url.netloc not in art_ignore:
                try:  # Try to download and parse article
                    article.download()
                    article.parse()
                    article_count += 1
                    posts_dict["is article"].append(True)
                    if article.title:  # Title parsed?
                        posts_dict["article title"].append(article.title)
                    else:
                        posts_dict["article title"].append(np.nan)
                    if article.authors:  # Author parsed?
                        posts_dict["author"].append(article.authors)
                    else:
                        posts_dict["author"].append(np.nan)
                    if article.text:  # Text parsed?
                        posts_dict['text'].append(article.text)
                    else:
                        posts_dict["text"].append(np.nan)
                except Exception:
                    posts_dict["is article"].append(False)
                    posts_dict["article title"].append(np.nan)
                    posts_dict["author"].append(np.nan)
                    posts_dict["text"].append(np.nan)
                    failed_links_c += 1
                    failed_links.append(post.url)
            else:
                invalid_links += 1
                posts_dict["is article"].append(False)
                posts_dict["article title"].append(np.nan)
                posts_dict["author"].append(np.nan)
                posts_dict["text"].append(np.nan)

    if set_local:
        time_now = self.utc_to_pacific(datetime.now())
    else:
        time_now = datetime.now()  # Set local time
    log_date = time_now.strftime('%m%d%y_%H%M')

    if verbose:
        print("Generating DataFrame")
    posts_df = pd.DataFrame(posts_dict)  # Make it a dataframe
    posts_df = posts_df[[
        "subreddit", "post title", "title polarity", "title objectivity",
        "score", "keywords", "comments", "domain", "link", "is article",
        "article title", "author", "text", "date", "target"
    ]]
    if pickle:
        posts_df.to_pickle(f'log/{log_date}.pickle')

    z = datetime.now() - start_time
    self.scrape_time = f"{(z.seconds//60)%60}min, {z.seconds%60}sec"

    log = Parameters()
    log.loader('log/scraper.log', 'loaded', default=True)
    log.loaded.SCRAPERLOG.Date = time_now.ctime()
    log.loaded.SCRAPERLOG.Scraper_Timer = self.scrape_time
    log.loaded.SCRAPERLOG.Article_Count = article_count
    log.loaded.SCRAPERLOG.Invalid_Links = invalid_links
    log.loaded.SCRAPERLOG.Failed_Links = failed_links
    log.loaded.SCRAPERLOG.Failed_Links_Count = failed_links_c
    log.loaded.SCRAPERLOG.Red_Sub_Count = red_sub
    log.loaded.SCRAPERLOG.Blue_Sub_Count = blue_sub
    log.writer('log/scraper.log', log.loaded, append=True)
    log.writer('log/scraper.log', self.settings, append=True)

    self.scraper_df = posts_df