Ejemplo n.º 1
0
def main(src_dir, dst_fp):
    """Feed the 'hits' of every file found under ``src_dir`` to ``insert_hits``.

    An engine/session pair is obtained for ``dst_fp`` and the object IDs
    returned by ``get_objectIDs`` are fetched once up front. Each file
    yielded by ``yield_filelist(src_dir)`` is then loaded with ``load_json``
    and its ``'hits'`` entry is handed to ``insert_hits`` together with those
    object IDs and the engine. A ``done/total`` progress line is printed
    after each file, and ``commit(session)`` is called once at the end.

    :param src_dir: directory passed to ``yield_filelist``.
    :param dst_fp: destination passed to ``get_engine_and_session``
        (presumably a database file path -- confirm against that helper).
    """
    engine, session = get_engine_and_session(dst_fp)
    objectIDs = get_objectIDs(session)
    print('get_objectIDs done')

    # Materialise the listing so the total is known before processing starts.
    json_paths = list(yield_filelist(src_dir))
    file_count = len(json_paths)

    # Count from 1 so the last progress line reads "total/total".
    for position, json_path in enumerate(json_paths, 1):
        payload = load_json(json_path)
        insert_hits(payload['hits'], objectIDs, engine)
        print('{0}/{1}'.format(position, file_count))

    commit(session)
def main(src_dir, dst_fp):
    """Delete every file under ``src_dir`` whose hits are reported as stored.

    The object IDs returned by ``get_objectIDs`` are fetched once for the
    session opened on ``dst_fp``. Each file yielded by
    ``yield_filelist(src_dir)`` is loaded with ``load_json``; when
    ``is_this_json_in_db`` returns a truthy value for its ``'hits'`` entry
    and those object IDs, the file is removed with ``os.remove``. A
    ``done/total`` progress line is printed after each file.

    :param src_dir: directory passed to ``yield_filelist``.
    :param dst_fp: destination passed to ``get_engine_and_session``
        (presumably a database file path -- confirm against that helper).
    :raises OSError: propagated from ``os.remove`` if a file cannot be
        deleted.
    """
    # Only the session is needed here; the engine is not used by this variant.
    _, session = get_engine_and_session(dst_fp)
    objectIDs = get_objectIDs(session)
    print('get_objectIDs done')

    # Materialise the listing so the total is known before processing starts.
    filelist = list(yield_filelist(src_dir))
    total = len(filelist)

    # Count from 1 so progress runs "1/total" .. "total/total" instead of
    # starting at "0/total" and never reaching the total.
    for n, fp in enumerate(filelist, 1):
        hits = load_json(fp)['hits']
        if is_this_json_in_db(hits, objectIDs):
            os.remove(fp)
        print('{0}/{1}'.format(n, total))
            timestamp = get_json_data(tag, timestamp, out_dir)
            if timestamp == 'end' or not timestamp:
                break

            n += 1
            print n, timestamp
        except Exception as e:
            print timestamp
            print e, 'error'
            pass

if __name__ == '__main__':
    # URL template for the HN Algolia "search_by_date" endpoint. The
    # placeholders are filled in elsewhere; {big_or_small} is the comparison
    # operator applied to created_at_i.
    BASE_URL = ('http://hn.algolia.com/api/v1/search_by_date?'
                'tags={tag}&hitsPerPage={hits_per_page}'
                '&numericFilters=created_at_i{big_or_small}{timestamp}')

    # Default direction: get news created after the db's latest timestamp.
    # Kept before the loop so the name exists even if there is nothing to
    # iterate over. NOTE(review): presumably read as a module global by the
    # fetching code when it formats BASE_URL -- confirm.
    BIG_OR_SMALL = '>'

    for dst, flag, db_fp in out_dirs_and_flag:
        engine, session = get_engine_and_session(db_fp)
        timestamp = get_latest_timestamp(session)

        # Choose the direction for *this* database on every iteration.
        # Previously a single empty database flipped the operator to '<' for
        # all databases processed after it, even those with a timestamp.
        if timestamp:
            BIG_OR_SMALL = '>'  # resume after the db's latest timestamp
        else:
            BIG_OR_SMALL = '<'  # start from scratch
            # No stored timestamp: start from the current UTC time, expressed
            # as seconds since the epoch.
            d = datetime.utcnow()
            timestamp = calendar.timegm(d.utctimetuple())

        make_folder_if_not_exists(dst)
        main(timestamp, flag, dst)
        print('{} is done!'.format(flag))