def _get_reddit_posts_as_documents(self, n, keyword="data"):
    """Fetch the ``n`` hottest posts of a subreddit as Document objects.

    Args:
        n: number of hot posts to retrieve.
        keyword: subreddit name to query (default ``"data"``).

    Returns:
        list of Documents built via ``Document.factory("Reddit", post)``.
    """
    # NOTE(review): `reddit` is a module-level praw client configured
    # elsewhere in this file — confirm it is initialized before use.
    hot_posts = reddit.subreddit(keyword).hot(limit=n)
    return [Document.factory("Reddit", post) for post in hot_posts]
def _get_arxiv_publications_as_documents(self, n, keyword="data"):
    """Fetch ``n`` publications matching ``keyword`` from the arXiv API.

    Args:
        n: maximum number of publications to request (``max_results``).
        keyword: search term passed to the arXiv ``all:`` query
            (default ``"data"``).

    Returns:
        list of Documents built via ``Document.factory("Arxiv", entry)``;
        empty if the feed contains no results.
    """
    from urllib.parse import quote

    # URL-encode the keyword so multi-word or special-character queries
    # do not produce a malformed request URL.
    url = (
        'http://export.arxiv.org/api/query?search_query=all:'
        + quote(keyword)
        + '&start=0&max_results='
        + str(n)
    )
    # Close the HTTP response deterministically instead of leaking it.
    with urllib.request.urlopen(url) as response:
        data = xmltodict.parse(response.read())

    # The Atom feed omits "entry" entirely when there are zero results.
    entries = data["feed"].get("entry", [])
    # xmltodict yields a bare dict (not a list) when the feed holds a
    # single entry; normalize on the actual shape rather than on n == 1,
    # which breaks if the API returns fewer results than requested.
    if not isinstance(entries, list):
        entries = [entries]
    return [Document.factory("Arxiv", pub) for pub in entries]