#!/usr/bin/env python
"""Download a batch of URLs with news-please and write them as JSON lines.

Reads URLs from ``URLs260k<i>.txt``, fetches each article, and appends one
JSON object per successfully downloaded article (with non-trivial text) to
``output<i>.json``.  Finally prints how many articles were written.

NOTE(review): input/output paths are machine-specific; adjust before running.
"""
import json

from newsplease import NewsPlease

# Batch index: selects which URL split file to process and names the output.
i = 1
articles = NewsPlease.from_file(
    '/Users/wyw/Box/6350/Project/URLsplit/URLs260k' + str(i) + '.txt')

count = 0
with open('/Users/wyw/Box/6350/Project/OutputSet/output' + str(i) + '.json',
          'w') as outfile:
    for url in articles:
        article = articles[url]
        # Skip failed downloads (text is None) and near-empty pages; the
        # 10-character floor filters out boilerplate-only extractions.
        if article.text is not None and len(article.text) > 10:
            count += 1
            # One JSON object per line; default=str stringifies fields that
            # are not natively JSON-serializable (e.g. datetime values).
            json.dump(article.__dict__, outfile, default=str, sort_keys=True)
            outfile.write('\n')

print(count)
#!/usr/bin/env python
"""Download article information for each URL listed in a text file.

This script reads in URLs from a text file ``name`` and downloads article
information for each URL.  The results are stored as one JSON file per
article in a sub-folder.  You need to adapt the variables ``name`` and
``basepath`` in order to use the script.
"""
import json
import os

from newsplease import NewsPlease

name = "trump-in-saudi-arabia.txt"
basepath = "/Users/felix/Downloads/"
download_dir = basepath + "dir" + name + "/"

# exist_ok=True: re-running the script must not crash with FileExistsError
# when the output folder was already created by a previous run.
os.makedirs(download_dir, exist_ok=True)

articles = NewsPlease.from_file(basepath + name)
for url in articles:
    article = articles[url]
    # NOTE(review): assumes the article supports dict-style access with a
    # "filename" key, as in older news-please releases where from_file
    # returned plain dicts -- confirm against the installed version.
    with open(download_dir + article["filename"] + ".json", "w") as outfile:
        json.dump(article, outfile, indent=4, sort_keys=True)