def parallel_arct(arg):
    # Worker function for a process pool: builds a NewsArticle from a parsed
    # record, using a shared lock and counter to hand out unique ids.
    arcticle = arg
    local_id = 0
    with lock:
        local_id = id.value
        id.value += 1
    newarticle = None  # return None if construction fails instead of raising NameError
    try:
        newarticle = NewsArticle(local_id, arcticle[0], arcticle[1], arcticle[2],
                                 arcticle[3], arcticle[4], countries)
        newarticle.extract_metadata()
    except Exception:
        print("Constructor")
    return newarticle
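# A minimal driver sketch for parallel_arct, assuming it runs inside a
# multiprocessing.Pool where `lock`, `id` and `countries` are module globals
# shared through a pool initializer. The names `init_worker` and
# `parsed_articles` are hypothetical, not part of the original snippet.
import multiprocessing

def init_worker(shared_lock, shared_id, country_list):
    # Expose the shared primitives under the global names parallel_arct expects.
    global lock, id, countries
    lock = shared_lock
    id = shared_id
    countries = country_list

if __name__ == '__main__':
    shared_lock = multiprocessing.Lock()
    shared_id = multiprocessing.Value('i', 0)
    with multiprocessing.Pool(initializer=init_worker,
                              initargs=(shared_lock, shared_id, ['US', 'UK'])) as pool:
        articles = pool.map(parallel_arct, parsed_articles)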
def get_article(art):
    # Download, parse and run NLP on an article, then insert the resulting
    # document into a MongoDB collection named after the source domain.
    # Prints '*' for a failed download and ',' for a failed parse.
    art.download()
    if art.is_downloaded:
        art.parse()
        if art.is_parsed:
            art.nlp()
            na = NewsArticle(art)
            p_uri = urlparse(art.source_url)
            p_domain = '{uri.netloc}'.format(uri=p_uri)
            loop.run_until_complete(
                do_insert(db[p_domain], na.GetMongoDocument()))
        else:
            print(',', end="", flush=True)
    else:
        print('*', end="", flush=True)
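# Sketch of the surrounding setup get_article assumes but does not show: an
# asyncio event loop, a Mongo handle `db`, and a coroutine `do_insert`.
# Illustrated here with the motor async driver; the connection string and
# database name are placeholders, not taken from the original code.
import asyncio
from motor.motor_asyncio import AsyncIOMotorClient

client = AsyncIOMotorClient('mongodb://localhost:27017')
db = client['news']
loop = asyncio.get_event_loop()

async def do_insert(collection, document):
    # Insert one article document into the per-domain collection.
    await collection.insert_one(document)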
def retrieve_homepage_articles(soup):
    homepage_headline_class_tag = "storylink"
    headlines = soup.find_all("a", {"class": homepage_headline_class_tag})
    articles = list()
    for result in headlines:
        news_article = NewsArticle()
        news_article.title = result.string.strip()
        news_article.url = result['href']
        news_article.source = "HackerNews"
        articles.append(news_article)
    return articles
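# Example invocation, assuming requests and BeautifulSoup are available; the
# "storylink" class above matches the classic news.ycombinator.com front-page
# markup.
import requests
from bs4 import BeautifulSoup

response = requests.get("https://news.ycombinator.com/")
soup = BeautifulSoup(response.text, "html.parser")
for article in retrieve_homepage_articles(soup):
    print(article.title, article.url)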
def retrieve_homepage_articles(soup):
    homepage_headline_class_tag = "block-link__overlay-link"
    headlines = soup.find_all("a", {"class": homepage_headline_class_tag})
    articles = list()
    # Slice instead of indexing a fixed range(20) so pages with fewer than
    # 20 headlines do not raise an IndexError.
    for result in headlines[:20]:
        news_article = NewsArticle()
        news_article.title = result.string.strip()
        news_article.url = result['href']
        news_article.source = "BBC"
        articles.append(news_article)
    return articles
def get_search_result(soup):
    search_result_class_tag = "search-results"
    headlines_class_tag = "headline"
    footer_date_tag = "flags btm"
    date_class_tag = "display-date"
    search_results = soup.find("ol", {
        "class": search_result_class_tag
    }).find_all("li")
    articles = list()
    for result in search_results:
        news_article = NewsArticle()
        result = result.find("div")
        result_headline = result.find("h1", {"itemprop": headlines_class_tag})
        # date under tags: footer -> dl -> dd -> time
        result_date = result.find("footer").find("dl", {
            "class": footer_date_tag
        }).find("dd").find("time", {"class": date_class_tag})
        news_article.title = result_headline.find("a").string.strip()
        news_article.url = result_headline.find("a")['href']
        # TODO: put date in correct format
        news_article.date = result_date.string.strip()
        news_article.source = "BBC"
        articles.append(news_article)
    return articles
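# One way to address the TODO above: normalise the scraped display date to an
# ISO string. This is only a sketch that assumes BBC renders dates like
# "7 January 2019"; adjust the format string if the markup differs.
from datetime import datetime

def normalize_bbc_date(raw_date):
    # Convert e.g. "7 January 2019" to "2019-01-07"; fall back to the raw text.
    try:
        return datetime.strptime(raw_date.strip(), "%d %B %Y").date().isoformat()
    except ValueError:
        return raw_date.strip()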
def get_homepage_articles():
    headlines = list()
    subreddit = "all"
    url = "https://www.reddit.com/r/" + subreddit + "/hot.json"
    json_str = requests.get(url, headers={'User-agent': 'your bot 0.2'}).text
    data = json.loads(json_str)
    for i in data["data"]["children"]:
        the_data = i["data"]
        if "title" in the_data.keys():
            new_article = NewsArticle(title=the_data["title"], source='Reddit')
            headlines.append(new_article)
    return headlines
def get_headlines(num_headlines=None, browser=None):
    url = "https://www.cnn.com/"
    soup = get_url_soup(url, browser=browser)
    counter = 0
    headlines = list()
    urls = list()
    for h3_soup in soup.find_all("h3", {"class": "cd__headline"}):
        counter += 1
        headline = h3_soup.find("span", {
            "class": "cd__headline-text"
        }).get_text()
        article = NewsArticle()
        article.title = headline
        print(article.title)
        headlines.append(article)
        url = h3_soup.find("a")["href"]
        # Resolve relative links against the site root.
        if "https://www.cnn.com" not in url:
            url = "https://www.cnn.com" + url
        urls.append(url)  # note: urls is collected but not returned by this variant
        if num_headlines is not None and counter >= num_headlines:
            break
    return headlines
def get_headlines(num_headlines=None):
    url = "https://www.cnet.com/"
    soup = get_url_soup(url)
    #print(soup.prettify())
    counter = 0
    headlines = list()
    for headline_soup in soup.find_all(
            "a", {"section": lambda x: x is not None and "pebble" in x}):
        h3_soup = headline_soup.find("h3")
        if h3_soup is None:
            continue
        counter += 1
        title = h3_soup.get_text().strip()
        headlines.append(NewsArticle(aTitle=title))
        if num_headlines is not None and counter >= num_headlines:
            break
    return headlines
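# Both the CNN and CNET snippets above rely on a get_url_soup helper that is
# not shown. A plausible minimal version, assuming requests and BeautifulSoup;
# the optional `browser` argument in the CNN variant would be e.g. a Selenium
# WebDriver. This is a sketch, not the original helper.
import requests
from bs4 import BeautifulSoup

def get_url_soup(url, browser=None):
    # Fetch the page, via the browser object when given, and parse it.
    if browser is not None:
        browser.get(url)
        html = browser.page_source
    else:
        html = requests.get(url).text
    return BeautifulSoup(html, "html.parser")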
def convert_to_class(item):
    news_article = NewsArticle()
    news_article.authors = item['authors']
    news_article.date_download = ExtractedInformationStorage.datestring_to_date(
        item['date_download'])
    news_article.date_modify = ExtractedInformationStorage.datestring_to_date(
        item['date_modify'])
    news_article.date_publish = ExtractedInformationStorage.datestring_to_date(
        item['date_publish'])
    news_article.description = item['description']
    news_article.filename = item['filename']
    news_article.image_url = item['image_url']
    news_article.language = item['language']
    news_article.localpath = item['localpath']
    news_article.title = item['title']
    news_article.title_page = item['title_page']
    news_article.title_rss = item['title_rss']
    news_article.source_domain = item['source_domain']
    news_article.text = item['text']
    news_article.url = item['url']
    return news_article
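# convert_to_class depends on ExtractedInformationStorage.datestring_to_date,
# which is not shown above. A plausible implementation, assuming the three
# dates were serialised as "%Y-%m-%d %H:%M:%S" strings; treat the format as
# an assumption and match it to however the documents were written.
from datetime import datetime

class ExtractedInformationStorage:
    @staticmethod
    def datestring_to_date(text):
        # Return None for missing dates rather than raising on empty input.
        if text:
            return datetime.strptime(text, '%Y-%m-%d %H:%M:%S')
        return None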
def put_article_in_db(self):
    # Walk the configured subreddits, pull the 30 hottest submissions from
    # each, and store any article whose URL is not already in the database.
    counter = 0
    try:
        for x in self.sublist:
            submissions = self.reddit.get_subreddit(x).get_hot(limit=30)
            for submission in submissions:
                # Strip non-ASCII characters; decode back to str so the URL
                # works with get_tld and MongoDB under Python 3.
                story_url = submission.url.encode('ascii', 'ignore').decode('ascii')
                if not self.db.in_set({'url': story_url}):
                    print(str(story_url))
                    current_article = NewsArticle(story_url)
                    #publish date for article : datetime object
                    article_published = current_article.date_made()
                    #title of article : String
                    article_title = current_article.get_title()
                    #print article_title
                    current_article.goodArticle()
                    #keywords in article: Array of Strings
                    article_key_words = current_article.getKeywords()
                    #videos in story : Array of Strings (url to videos)
                    article_videos = current_article.get_videos()
                    #summary of article : String
                    article_summary = current_article.getSummary()
                    #authors of article: Array of Strings
                    article_authors = current_article.getAuthors()
                    #image for article : String (url to image)
                    article_thumbnaillink = current_article.thumbnail_url()
                    mydb = pymongo.MongoClient()
                    res = get_tld(story_url, as_object=True)
                    new_entry = {}
                    new_entry['title'] = article_title
                    new_entry['sum'] = article_summary
                    new_entry['author'] = article_authors
                    new_entry['thumb'] = article_thumbnaillink
                    new_entry['pub'] = article_published
                    new_entry['keywords'] = article_key_words
                    new_entry['vids'] = article_videos
                    new_entry['likes'] = 0
                    new_entry['dislikes'] = 0
                    new_entry['comments'] = []
                    new_entry['url'] = story_url
                    new_entry['_id'] = uuid.uuid4().hex
                    new_entry['postnum'] = mydb.lyket.articles.count()
                    # 'creationtime' was assigned twice in the original (local
                    # time, then UTC); keep the UTC value that ended up stored.
                    new_entry['creationtime'] = datetime.datetime.utcnow()
                    new_entry['publisher'] = res.domain
                    new_entry['companycreator'] = res.domain
                    self.db.CollectionSubmitOne(new_entry)
                    print("Done with article " + str(mydb.lyket.articles.count()))
                else:
                    print("Already have it " + str(counter))
                    counter = counter + 1
    except Exception as e:
        print("------")
        print("its f****d emma")
        print(e)
        print("------")
progressBar = ProgressBar(int(len(xmlfiles)))
supportBar = SupportBar()
#create file for results
results = open('../output/results.txt', 'w+')
debug = open('../output/debug.txt', 'w+')
id = -1
for filename in xmlfiles:
    larct = parse("../filesXML/" + filename)
    sys.stdout.write("(" + str(len(larct)) + "/")
    sys.stdout.flush()
    for arcticle in larct:
        id += 1
        try:
            newarticle = NewsArticle(id, arcticle[0], arcticle[1], arcticle[2],
                                     arcticle[3], arcticle[4])
            newarticle.extract_metadata()
            aggr.add_article(newarticle)
            #Update StatusBar
            supportBar.increase()
            size = len(str(supportBar.get()))
            spaces = ' ' * (4 - size)
            sys.stdout.write("{0}){1}\b\b\b\b\b".format(supportBar.get(), spaces))
            sys.stdout.flush()
        except KeyboardInterrupt:
            print("\nProgram Closed Successfully!")
            sys.exit(1)
        except Exception as e:
            # The handler body is cut off in the original snippet; at minimum,
            # record the failure so the loop can continue with the next article.
            debug.write(filename + ": " + str(e) + "\n")
def put_article_in_db(self, story_url):
    # Store a single article in the database if its URL is not already known.
    try:
        if not self.db.in_set({'url': story_url}):
            current_article = NewsArticle(story_url)
            #publish date for article : datetime object
            article_published = current_article.date_made()
            #title of article : String
            article_title = current_article.get_title()
            #print article_title
            current_article.goodArticle()
            #keywords in article: Array of Strings
            article_key_words = current_article.getKeywords()
            #videos in story : Array of Strings (url to videos)
            article_videos = current_article.get_videos()
            #summary of article : String
            article_summary = current_article.getSummary()
            #authors of article: Array of Strings
            article_authors = current_article.getAuthors()
            #image for article : String (url to image)
            article_thumbnaillink = current_article.thumbnail_url()
            article_url = current_article.get_url()
            res = get_tld(article_url, as_object=True)
            new_entry = {}
            new_entry['title'] = article_title
            new_entry['sum'] = article_summary
            new_entry['auth'] = article_authors
            new_entry['thumb'] = article_thumbnaillink
            new_entry['pub'] = article_published
            new_entry['keywords'] = article_key_words
            new_entry['vids'] = article_videos
            new_entry['likes'] = 0
            new_entry['dislikes'] = 0
            new_entry['comments'] = []
            new_entry['url'] = article_url
            # 'creationtime' was assigned twice in the original (local time,
            # then UTC); keep the UTC value that ended up stored.
            new_entry['creationtime'] = datetime.datetime.utcnow()
            new_entry['publisher'] = res.domain
            new_entry['companycreator'] = res.domain
            self.db.CollectionSubmitOne(new_entry)
    except Exception as e:
        print("------")
        print("its f****d emma")
        print(e)
        print("------")