def parse_areas_of_study_and_write_to_database(input_arguments):
    db = connect_to_database(input_arguments.db_host, input_arguments.db)
    if input_arguments.erase:
        db.areas.drop()
        db.courses.drop()

    html_univ_files = get_univ_files(input_arguments.path)
    areas = dict()
    for univ_file in html_univ_files:
        with open(univ_file, 'rb') as file:
            q = file.read()
        local_areas = get_area_course_info(q)
        for key, value in local_areas.items():
            if key not in areas:
                areas[key] = list()
            for course in value:
                if course not in areas[key]:
                    areas[key].append(course)
    print(areas)

    areas_dao = [{'area_id_old': generate_id(title), 'area_title': title}
                 for title in areas.keys()]
    courses_dao = list()
    for title, course_list in areas.items():
        curr_courses = [{'area_id_old': generate_id(title), 'course_title': c}
                        for c in course_list]
        courses_dao.extend(curr_courses)
    db.areas.insert_many(areas_dao)
    db.courses.insert_many(courses_dao)
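# The helpers connect_to_database, get_univ_files, get_area_course_info and
# generate_id are defined elsewhere in the project. For orientation only, a
# minimal sketch of generate_id, assuming a hash-of-title scheme (hypothetical,
# the real implementation may differ):
from hashlib import md5

def _generate_id_sketch(title):
    # Derive a deterministic id from the area title so repeated parses map the
    # same title to the same 'area_id_old'. Illustration only, not the real helper.
    return md5(title.strip().lower().encode('utf-8')).hexdigest()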
def parse_univ_pages_and_write_to_database(input_arguments):
    db = connect_to_database(input_arguments.db_host, input_arguments.db)
    if input_arguments.erase:
        db.univs.drop()

    html_univ_files = get_univ_files(input_arguments.path)
    univs_to_insert = dict()
    for univ_file in html_univ_files:
        with open(univ_file, 'rb') as file:
            q = file.read()
        univ = get_univ_info_from_page_2017(q)
        # (commented out) skip colleges, technical schools and vocational schools:
        # tlower = univ['univ_title'].lower()
        # if 'коледж' in tlower or 'технікум' in tlower or 'училищ' in tlower:
        #     continue
        # drop parenthesised parts and collapse repeated spaces in the title
        univ['univ_title'] = sub(r'\([^)]*\)', '', univ['univ_title'].strip())
        while '  ' in univ['univ_title']:
            univ['univ_title'] = univ['univ_title'].replace('  ', ' ')
        univ['univ_location'] = get_city_name_from_address(
            univ['univ_address'])
        # id is taken from the file name rather than generate_id(univ['univ_title'])
        univ['univ_id'] = int(univ_file[univ_file.rindex('i') + 1:-5])
        univs_to_insert[univ['univ_id']] = univ
        if len(univs_to_insert) % 10 == 0:
            print(len(univs_to_insert))
    db.univs.insert_many(univs_to_insert.values())
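# Illustration of the id-from-filename slice above, assuming (not confirmed
# here) that crawled pages are saved as 'i<ID>.html':
def _univ_id_from_filename_example():
    univ_file = '/path/to/univs/i4532.html'  # hypothetical file name
    return int(univ_file[univ_file.rindex('i') + 1:-5])  # -> 4532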
def _create_database():
    # Table identifiers are Russian: Авторы = Authors, Книги = Books,
    # Пользователи = Users.
    with connect_to_database() as db:
        cursor = db.cursor()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS Авторы(
                `id` INTEGER PRIMARY KEY,
                `имя` TEXT,          -- name
                `страна` TEXT,       -- country
                `годы жизни` TEXT)   -- years of life
        ''')
        db.commit()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS Книги(
                `id` INTEGER PRIMARY KEY,
                `id автора` INTEGER,            -- author id
                `название` TEXT,                -- title
                `количество страниц` INTEGER,   -- number of pages
                `издательство` TEXT,            -- publisher
                `год издания` INTEGER,          -- year of publication
                FOREIGN KEY(`id автора`) REFERENCES Авторы(id))
        ''')
        db.commit()
        cursor.execute('''
            CREATE TABLE IF NOT EXISTS Пользователи(
                `id` INTEGER PRIMARY KEY,
                `логин` TEXT,    -- login
                `пароль` TEXT)   -- password
        ''')
        db.commit()
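# connect_to_database() is defined elsewhere; it is used above as a context
# manager exposing cursor() and commit(), which matches a plain sqlite3
# connection. A minimal sketch under that assumption (the path is hypothetical):
import sqlite3

def _connect_to_database_sketch(path='library.db'):
    conn = sqlite3.connect(path)
    conn.execute('PRAGMA foreign_keys = ON')  # enforce the FOREIGN KEY above
    return conn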
def _fill_database():
    authors = ("1|L.N.Tolstoi |Russia |1828-1910\n"
               "2|F.M.Dostoyevsky|Russia |1821-1881\n"
               "3|B.Vian |France |1920-1959\n"
               "4|A.Camus |France |1913-1960\n"
               "5|F.Kafka |Austria|1883-1924")
    books = ("1|1|War and Peace |1225|The Russian Messenger|1869\n"
             "2|1|Resurrection |483 |Niva |1899\n"
             "3|2|The Idiot |678 |The Russian Messenger|1868\n"
             "4|2|The Gambler |241 |The Moscow Renaissance|1867\n"
             "5|3|The Foam of days|219 |Gallimard |1947\n"
             "6|4|The Stranger |159 |Hamish Hamilton |1946\n"
             "7|4|The Rebel |238 |Gallimard |1951\n"
             "8|5|The Trial |395 |Verlag Die Schmiede |1925\n"
             "9|5|Amerika |351 |Routledge |1938")
    users = "1|admin|d4d1c9e67f05a7785990dea88020f20a"

    with connect_to_database() as db:
        cursor = db.cursor()

        def fill(table_string, table_name):
            # Each pipe-delimited row becomes one INSERT; fields are stripped
            # of padding and quoted with repr() before substitution.
            for row_string in table_string.split('\n'):
                values = ','.join(
                    repr(field.strip()) for field in row_string.split('|'))
                cursor.execute('''
                    INSERT OR IGNORE INTO {} VALUES ({})
                '''.format(table_name, values))
            db.commit()

        fill(authors, 'Авторы')
        fill(books, 'Книги')
        fill(users, 'Пользователи')
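# Illustration of what fill() builds for a single row of the seed data above:
def _fill_values_example():
    row = "1|L.N.Tolstoi |Russia |1828-1910"
    values = ','.join(repr(field.strip()) for field in row.split('|'))
    # values == "'1','L.N.Tolstoi','Russia','1828-1910'", executed as
    # INSERT OR IGNORE INTO Авторы VALUES ('1','L.N.Tolstoi','Russia','1828-1910')
    return values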
def start_parsing_pages_and_write_result_to_database(input_arguments):
    if input_arguments.erase:
        db = connect_to_database(input_arguments.db_host, input_arguments.db)
        db.requests.drop()

    path_to_data = input_arguments.path
    files_to_read_queue, file_cache, is_all_files_read = create_queues()
    add_files_to_queue(path_to_data, files_to_read_queue)
    read_data_from_files(files_to_read_queue, file_cache, is_all_files_read)
    print(f'Total files to process: {files_to_read_queue.qsize()}')
    print()

    pool = multiprocessing.Pool(input_arguments.workers)  # create pool
    pool.starmap(main_worker,
                 [(file_cache, is_all_files_read, input_arguments)]
                 * input_arguments.workers)  # and process queue using pool
    print('exit')
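# create_queues, add_files_to_queue and read_data_from_files live elsewhere in
# the project. Because file_cache and is_all_files_read are passed through
# Pool.starmap, they must be picklable proxies; a bare multiprocessing.Queue
# cannot be shipped as a Pool argument. A minimal sketch of create_queues
# under that assumption (hypothetical, the real helper may differ):
import multiprocessing

def _create_queues_sketch():
    manager = multiprocessing.Manager()
    files_to_read_queue = manager.Queue()  # file paths waiting to be read
    file_cache = manager.Queue()           # (filename, file contents) pairs
    is_all_files_read = manager.Event()    # set once every file has been read
    return files_to_read_queue, file_cache, is_all_files_read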
def main_worker(files_cache, is_all_files_read, input_arguments):
    db = connect_to_database(input_arguments.db_host, input_arguments.db)
    results = []
    while not is_all_files_read.is_set() or not files_cache.empty():
        try:
            filename, file_string = files_cache.get(timeout=FILE_CACHE_DELAY)
        except queue.Empty:
            continue
        result = process_page_with_admission_requests(filename, file_string)
        if result is None or len(result) == 0:
            continue
        for r in result:
            logger.info(r)
        results.extend(result)
        if len(results) > NUM_RESULTS_TO_SAVE:
            save_results_to_db(db, results)
            results = []
    if results:  # ensure everything is saved
        save_results_to_db(db, results)
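# save_results_to_db is defined elsewhere; given that the other writers call
# insert_many on collections such as db.requests, it presumably bulk-inserts
# each batch. A minimal sketch under that assumption (illustration only):
def _save_results_to_db_sketch(db, results):
    if results:  # insert_many rejects an empty document list
        db.requests.insert_many(results)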