################################## Corpus creation ##################################
# Build a "Corona" corpus from two sources: hot Reddit posts and arXiv papers.

corpus = Corpus("Corona")

# --- Source 1: Reddit (hot posts from r/Coronavirus) --------------------------
# SECURITY NOTE(review): client_id / client_secret are hard-coded credentials
# committed to source. They should be loaded from environment variables or a
# config file excluded from version control; values kept as-is here to
# preserve behavior.
reddit = praw.Reddit(client_id='A7Cy6zC5PKFqoQ',
                     client_secret='nLLooEBnPAnYNP2yonryN_97foY',
                     user_agent='Reddit WebScraping')
hot_posts = reddit.subreddit('Coronavirus').hot(limit=10)
for post in hot_posts:
    # post.created is a Unix timestamp (seconds since epoch).
    datet = dt.datetime.fromtimestamp(post.created)
    # Title + body text, flattened onto a single line.
    txt = post.title + ". " + post.selftext
    txt = txt.replace('\n', ' ')
    txt = txt.replace('\r', ' ')
    doc = Document(datet, post.title, post.author_fullname, txt, post.url)
    corpus.add_doc(doc)

# --- Source 2: arXiv API (papers matching "covid") ----------------------------
url = 'http://export.arxiv.org/api/query?search_query=all:covid&start=0&max_results=10'
data = urllib.request.urlopen(url).read().decode()
docs = xmltodict.parse(data)['feed']['entry']
for i in docs:
    # Atom publication date, e.g. '2020-03-01T12:00:00Z'.
    datet = dt.datetime.strptime(i['published'], '%Y-%m-%dT%H:%M:%SZ')
    # xmltodict yields 'author' either as a list of {'name': ...} dicts
    # (several authors) or a single dict (one author). Keep only the first
    # author's name. Narrowed from a bare `except:` — iterating a single
    # dict yields its string keys, so aut['name'] raises TypeError (or
    # KeyError for an unexpected shape); nothing else should be swallowed.
    try:
        author = [aut['name'] for aut in i['author']][0]
    except (TypeError, KeyError):
        author = i['author']['name']
    # Title + abstract, flattened onto a single line.
    txt = i['title'] + ". " + i['summary']
    txt = txt.replace('\n', ' ')
    txt = txt.replace('\r', ' ')