import os
import threading
import time

import numpy as np

# `st` (the project's database/storage helper module, used below) is assumed
# to be imported elsewhere in this file; its import statement is not shown.


def content_scraper(table):
    """Spawn a worker thread to fetch full text for each doc missing it."""
    docs = table_to_list(table)
    for i, doc in enumerate(docs):
        if 'content' in doc:
            continue
        thread = threading.Thread(name=str(i), target=content_adder_thread,
                                  args=(table, doc, i))
        thread.start()
        # Stagger requests: random delay between 0.3 and ~0.63 seconds.
        time.sleep(np.random.random() / 3 + 0.3)
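# content_adder_thread is referenced above but not defined in this excerpt.
# Below is a minimal sketch of what such a worker might look like, assuming
# the doc carries its URL under 'web_url' and that full text is written back
# under 'content' via pymongo's update_one. The requests/BeautifulSoup
# parsing is illustrative, not the original implementation.
def content_adder_thread(table, doc, i):
    import requests
    from bs4 import BeautifulSoup

    response = requests.get(doc['web_url'], timeout=10)
    if response.status_code != 200:
        print('thread {}: got status {}'.format(i, response.status_code))
        return
    soup = BeautifulSoup(response.text, 'html.parser')
    # Join paragraph text as a rough stand-in for real article extraction.
    content = '\n'.join(p.get_text() for p in soup.find_all('p'))
    table.update_one({'_id': doc['_id']}, {'$set': {'content': content}})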
# __init__ for the NYT scraper class (the class statement itself is not part
# of this excerpt).
def __init__(self):
    self.i = 0
    self.table = st.open_database_collection('nyt')
    # Record URLs already stored so duplicate articles can be skipped later;
    # guard against docs that lack a 'web_url' field.
    docs = st.table_to_list(self.table)
    self.seen_urls = {doc['web_url'] for doc in docs if 'web_url' in doc}
    self.link = 'http://api.nytimes.com/svc/search/v2/articlesearch.json'
    NYT_API_KEY = os.environ['NYT_API_KEY']
    self.payload = {'api-key': NYT_API_KEY}
    self._set_filters()
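# _set_filters is called above but not defined in this excerpt. A minimal
# sketch, assuming it seeds self.payload with Article Search query
# parameters; the NYT API accepts 'q', 'fq', 'begin_date', 'end_date', and
# 'sort', but the particular filter values below are placeholders.
def _set_filters(self):
    self.payload.update({
        'fq': 'document_type:("article")',  # Lucene-style filter query
        'begin_date': '20150101',           # YYYYMMDD, placeholder value
        'sort': 'newest',
    })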
def remove_dups(table):
    """Delete documents whose URL duplicates another document's URL."""
    docs = st.table_to_list(table)
    # NYT documents store the URL under 'web_url'; normalize to 'link'.
    if 'web_url' in docs[0]:
        for i, _ in enumerate(docs):
            docs[i]['link'] = docs[i]['web_url']
    # dict() keeps only the last _id seen for each link, so the _ids of
    # earlier duplicates fall out of id_keepers and are deleted below.
    pairs = [(doc['link'], doc['_id']) for doc in docs]
    pair_dict = dict(pairs)
    id_keepers = set(pair_dict.values())
    id_all = {doc['_id'] for doc in docs}
    kill_ids = id_all.difference(id_keepers)
    for _id in kill_ids:
        table.delete_one({'_id': _id})
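# Example usage, assuming st.open_database_collection returns a pymongo
# Collection as it does in __init__ above:
if __name__ == '__main__':
    nyt_table = st.open_database_collection('nyt')
    remove_dups(nyt_table)
    content_scraper(nyt_table)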