################################## Corpus Creation ##################################

# Required imports (Corpus and Document are the project's own classes, assumed to be defined elsewhere in the module).
import datetime as dt
import urllib.request

import praw
import xmltodict

corpus = Corpus("Corona")

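# Retrieve the 10 hottest posts from the r/Coronavirus subreddit through the Reddit API (PRAW).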
reddit = praw.Reddit(client_id='A7Cy6zC5PKFqoQ',
                     client_secret='nLLooEBnPAnYNP2yonryN_97foY',
                     user_agent='Reddit WebScraping')
hot_posts = reddit.subreddit('Coronavirus').hot(limit=10)
for post in hot_posts:
    # Convert the post's creation timestamp into a datetime object.
    datet = dt.datetime.fromtimestamp(post.created)
    # Merge title and body into a single text, flattening line breaks.
    txt = post.title + ". " + post.selftext
    txt = txt.replace('\n', ' ')
    txt = txt.replace('\r', ' ')
    # post.author_fullname is Reddit's internal author id (t2_...).
    doc = Document(datet, post.title, post.author_fullname, txt, post.url)
    corpus.add_doc(doc)

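# Query the arXiv API for the first 10 papers matching "covid" and parse the Atom feed.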
url = 'http://export.arxiv.org/api/query?search_query=all:covid&start=0&max_results=10'
data = urllib.request.urlopen(url).read().decode()
docs = xmltodict.parse(data)['feed']['entry']

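# Extract date, author, and text from each arXiv entry.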
for i in docs:
    datet = dt.datetime.strptime(i['published'], '%Y-%m-%dT%H:%M:%SZ')
    # arXiv returns a list of author dicts when there are several authors,
    # and a single dict otherwise; keep only the first author's name.
    try:
        author = [aut['name'] for aut in i['author']][0]
    except TypeError:
        author = i['author']['name']
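    # Merge title and abstract into a single text, flattening line breaks.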
    txt = i['title'] + ". " + i['summary']
    txt = txt.replace('\n', ' ')
    txt = txt.replace('\r', ' ')