def process(source, limit):
    """
    Grabs up to `limit` articles from source, then parses and analyses them.
    source must be an RSS feed URL (string).
    limit must be an integer.
    Returns: list of dictionaries, each containing the tags and parsed
    information of one article.
    """
    # Get article links from the feed
    feedLinks = parsing.get_links(source, limit)

    # Download articles and gather them into a list
    articleList = []
    for link in feedLinks:
        article = parsing.get_article(link)
        articleList.append(article)

    # Analyse each article
    analysedList = []
    for article in articleList:
        analysed = parsing.analyse_article(article, ARTICLE_ELEMENTS)
        analysedList.append(analysed)

    vprint(verbose, str(len(articleList)) + " articles processed.")
    return analysedList
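# Example usage of process() (a minimal sketch: the feed URL is a placeholder,
# the module-level `verbose` flag and ARTICLE_ELEMENTS list are assumed to be
# configured elsewhere, and "title" is assumed to be one of the configured tags):
#
#     articles = process("https://example.com/rss.xml", 5)
#     for entry in articles:
#         print(entry["title"])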
def get_article(url):
    """
    Downloads an article from url.
    url must be a string.
    Returns: instance of newspaper.Article (downloaded but not yet parsed).
    """
    vprint(verbose, "Downloading article from " + url + " ...")
    article = Article(url, language="en")
    article.download()
    vprint(verbose, "Article downloaded.\n")
    return article
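# Example usage of get_article() (a sketch with a placeholder URL). The
# returned newspaper.Article has only been downloaded, so it is normally
# passed on to analyse_article() for parsing and NLP:
#
#     article = get_article("https://example.com/some-story.html")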
def get_feed_list(feeds):
    """
    Gets RSS feed URLs from the specified text file.
    feeds must be a string giving the path to a .txt file, one feed URL per line.
    Returns: list of RSS feed URLs.
    """
    # Read the RSS feed URLs, one per line
    vprint(verbose, "Getting link list from source file.")
    feedList = []
    with open(feeds, "r") as feedsFile:
        for link in feedsFile:
            feedList.append(link.strip("\n"))
    vprint(verbose, str(len(feedList)) + " RSS links acquired.")
    return feedList
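# Example usage of get_feed_list() (a sketch: "feeds.txt" is a hypothetical
# file holding one RSS feed URL per line, and process() is assumed to be
# importable in the same scope):
#
#     for feed in get_feed_list("feeds.txt"):
#         articles = process(feed, 10)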
def get_links(RSSFeed, numLinks):
    """
    Scrapes up to numLinks article URLs from an RSS feed.
    RSSFeed must be a string holding an RSS feed URL.
    numLinks must be an integer.
    Returns: list of article URLs.
    """
    vprint(verbose, "Getting " + str(numLinks) + " article links from the feed at "
           + RSSFeed + "...")
    parsedFeed = feedparser.parse(RSSFeed)

    # Collect links until numLinks is reached or the feed runs out of entries
    articleLinks = []
    for entry in parsedFeed.entries[:numLinks]:
        articleLinks.append(entry.link)
    vprint(verbose, "Article links acquired.\n")
    return articleLinks
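# Example usage of get_links() (a sketch with a placeholder feed URL; fewer
# than numLinks URLs are returned if the feed has fewer entries):
#
#     links = get_links("https://example.com/rss.xml", 3)
#     for link in links:
#         article = get_article(link)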
def analyse_article(article, elems):
    """
    Parses article for information that matches the tags specified in elems,
    then performs NLP on that information.
    article must be an object of type newspaper.Article.
    elems must be a list of tags (attribute names) to scrape from article.
    Returns: dictionary mapping each tag to its matching data.
    """
    vprint(verbose, "Parsing and analysing article...")

    # Parse the article and run NLP using the methods supplied by newspaper
    article.parse()
    article.nlp()

    # Pull each requested attribute off the parsed article
    breakdown = {}
    for elem in elems:
        breakdown[elem] = getattr(article, elem)

    vprint(verbose, "Parsing and analysis complete.")
    return breakdown
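# Example usage of analyse_article() (a sketch: the URL is a placeholder and
# the element names below are standard newspaper.Article attributes, whereas
# the project's actual tag list, ARTICLE_ELEMENTS, is defined elsewhere):
#
#     article = get_article("https://example.com/some-story.html")
#     breakdown = analyse_article(article, ["title", "summary", "keywords"])
#     print(breakdown["title"])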