def main(src_dir, dst_fp):
    """Insert the hits of every JSON file found under ``src_dir``.

    Args:
        src_dir: Passed to ``yield_filelist`` to enumerate the input files.
        dst_fp: Passed to ``get_engine_and_session`` to obtain the engine
            and session used for the inserts and the final commit.

    For each file the value under its ``'hits'`` key is handed to
    ``insert_hits`` together with the object IDs fetched up front, and a
    ``done/total`` progress line is printed.
    """
    engine, session = get_engine_and_session(dst_fp)
    known_ids = get_objectIDs(session)
    print('get_objectIDs done')

    # Materialise the listing so the total is known before the loop starts.
    paths = list(yield_filelist(src_dir))
    path_count = len(paths)

    for done, path in enumerate(paths, 1):
        payload = load_json(path)
        insert_hits(payload['hits'], known_ids, engine)
        print('{0}/{1}'.format(done, path_count))

    # NOTE(review): the flattened original does not show whether this commit
    # ran per file or once at the end; a single final commit is assumed.
    commit(session)
def main(src_dir, dst_fp):
    """Remove JSON files under ``src_dir`` that ``is_this_json_in_db`` accepts.

    Args:
        src_dir: Passed to ``yield_filelist`` to enumerate the candidate files.
        dst_fp: Passed to ``get_engine_and_session`` to obtain the session
            from which the known object IDs are read.

    For each file the value under its ``'hits'`` key is checked with
    ``is_this_json_in_db(hits, objectIDs)``; when that returns a truthy
    value the file is deleted with ``os.remove``. A ``done/total`` progress
    line is printed after every file.
    """
    # Only the session is used here; the engine half of the pair is ignored.
    _engine, session = get_engine_and_session(dst_fp)
    objectIDs = get_objectIDs(session)
    print('get_objectIDs done')

    filelist = list(yield_filelist(src_dir))
    total = len(filelist)

    # Count from 1 so the progress line ends at total/total instead of
    # stopping one short (the zero-based index was printed before).
    for n, fp in enumerate(filelist, 1):
        hits = load_json(fp)['hits']
        if is_this_json_in_db(hits, objectIDs):
            os.remove(fp)
        print('{0}/{1}'.format(n, total))
timestamp = get_json_data(tag, timestamp, out_dir) if timestamp == 'end' or not timestamp: break n += 1 print n, timestamp except Exception as e: print timestamp print e, 'error' pass if __name__ == '__main__': BASE_URL = ('http://hn.algolia.com/api/v1/search_by_date?' 'tags={tag}&hitsPerPage={hits_per_page}' '&numericFilters=created_at_i{big_or_small}{timestamp}') BIG_OR_SMALL = '>' # get news that created after db's latest timestamp for dst, flag, db_fp in out_dirs_and_flag: engine, session = get_engine_and_session(db_fp) timestamp = get_latest_timestamp(session) if not timestamp: BIG_OR_SMALL = '<' # start from scratch d = datetime.utcnow() timestamp = calendar.timegm(d.utctimetuple()) make_folder_if_not_exists(dst) main(timestamp, flag, dst) print('{} is done!'.format(flag))