def scrap_user(user_id):
    # Return False in case of error, otherwise True.
    print('User scraping started.')
    if user_id is None:
        print('All users processed!')
        return None  # Nothing left to scrape; falsy, so the caller will not mark it.
    client = PreAuthVKClient()
    scrapper = VKScrapper(client)

    # Scrap profile.
    file_path = os.path.join(USER_PROFILES_DIR, user_id + '.json')
    profile_storage = FileStorage(file_path)
    scrap_profile_result = scrapper.scrap_profile(user_id, profile_storage)
    if scrap_profile_result == ScrapProfileResult.Failed:
        print(f'User scraping failed: {user_id}.')
        return False
    elif scrap_profile_result == ScrapProfileResult.Skip:
        print(f'User scraping skipped: {user_id}.')
        return True

    # Scrap wall.
    file_path = os.path.join(USER_WALLS_DIR, user_id + '.json')
    wall_storage = FileStorage(file_path)
    success = scrapper.scrap_wall(user_id, wall_storage, 1000)
    if not success:
        print('Failed to request wall posts!')
        return False

    print(f'User scraping finished: {user_id}.')
    return True
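# scrap_user above compares against ScrapProfileResult.Failed and .Skip. The
# project's actual enum is not included in these snippets; this is a minimal
# sketch that would satisfy those comparisons (the Ok member and the numeric
# values are assumptions):
from enum import Enum

class ScrapProfileResult(Enum):
    Ok = 0      # Profile fetched and written to storage.
    Skip = 1    # Profile deliberately skipped (e.g. a closed or deleted account).
    Failed = 2  # Request or storage error.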
def gather_process():
    logger.info("gather")
    storage_authors = FileStorage(SCRAPPED_AUTHORS)
    storage_author_info = FileStorage(SCRAPPED_AUTHOR_INFO)
    scrapper = Scrapper()
    scrapper.scrap_process(storage_authors, storage_author_info)
def convert_data_to_table_format():
    logger.info("transform")
    storage = FileStorage(SCRAPPED_FILE)
    result = []
    parser = DataParser([])
    for line in storage.read_data():
        result += parser.parse(line)
    df = pd.DataFrame(result)
    df.to_csv(TABLE_FORMAT_FILE, encoding='utf-8')
def convert_data_to_table_format():
    logger.info("transform")
    # Transform the gathered data (one JSON object per line) into a pandas
    # DataFrame and save it as csv.
    storage = FileStorage(SCRAPPED_JSON)
    objects = []
    for row in storage.read_data():
        objects.append(json.loads(row))
    df = pd.DataFrame(objects)
    df.to_csv(TABLE_FORMAT_FILE, encoding='utf-8')
def scrap_rand_user():
    # Return False in case of error, otherwise True.
    fetcher = UserIDFetcher(FileStorage(USER_IDS_FILE_NAME),
                            FileStorage(PROCESSED_USER_IDS_FILE_NAME))
    user_id = fetcher.get_not_processed_user_id()
    result = scrap_user(user_id)
    if result:
        fetcher.mark_user_id_as_processed(user_id)
    return result
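# scrap_rand_user relies on a UserIDFetcher that compares the ids in one
# storage against those already recorded in another. Its real implementation
# is not shown; this sketch assumes each storage holds one id per line, and
# the append_data method is hypothetical:
class UserIDFetcher:
    def __init__(self, ids_storage, processed_storage):
        self._ids_storage = ids_storage
        self._processed_storage = processed_storage

    def get_not_processed_user_id(self):
        # Return the first id that has not been processed yet, or None.
        processed = set(self._processed_storage.read_data())
        for user_id in self._ids_storage.read_data():
            if user_id not in processed:
                return user_id
        return None

    def mark_user_id_as_processed(self, user_id):
        self._processed_storage.append_data(user_id)  # Hypothetical method.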
def convert_data_to_table_format():
    logger.info("transform")
    # Note: despite the function name, this variant writes JSON, not csv.
    storage = FileStorage(SCRAPPED_FILE)
    data = []
    parser = DotaFileParser(data)
    for line in storage.read_data():
        data = data + parser.parse(line)
    with open(TABLE_FORMAT_FILE, 'w') as outfile:
        json.dump(data, outfile)
def convert_data_to_table_format():
    logger.info("transform")
    storage = FileStorage(SCRAPPED_FILE)
    data = storage.read_data()
    parser = VacancyParser(COLUMNS)
    # Transform the gathered vacancies into a pandas DataFrame and save as csv.
    # DataFrame.append was removed in pandas 2.0, so collect the per-vacancy
    # frames and concatenate them once instead.
    frames = [pd.DataFrame(parser.parse(vacancy), columns=COLUMNS)
              for vacancy in data]
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=COLUMNS)
    df.to_csv(TABLE_FORMAT_FILE, encoding='utf-8')
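# The accumulate-and-concat pattern above is the supported replacement for the
# removed DataFrame.append. A self-contained illustration with toy data (the
# column names here are made up, not the project's COLUMNS):
import pandas as pd

frames = [pd.DataFrame([[n, n * n]], columns=['n', 'n_squared']) for n in range(3)]
df = pd.concat(frames, ignore_index=True)
print(df)  # Three rows: (0, 0), (1, 1), (2, 4).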
def convert_data_to_table_format():
    logger.info("transform")
    # Transform the gathered data from the txt file into a pandas DataFrame
    # and save it as csv.
    storage = FileStorage(SCRAPPED_FILE)
    parser = JSONParser()
    data = storage.read_data()
    df = parser.parse(data)
    df.to_csv(TABLE_FORMAT_FILE)
def gather_process():
    logger.info("gather")
    storage = FileStorage(SCRAPPED_FILE)
    # You can also pass a storage
    scrapper = Scrapper()
    scrapper.scrap_process(storage)
def convert_data_to_table_format():
    logger.info("transform")
    storage = FileStorage(SCRAPPED_FILE)
    # Transform the gathered data from the json file into a pandas DataFrame
    # and save it as csv.
    parser = Parser(storage)
    parser.parse(TABLE_FORMAT_FILE)
def gather_process(obj_count):
    logger.info("gather")
    storage = FileStorage(SCRAPPED_JSON)
    # You can also pass a storage
    scrapper = Scrapper(obj_count)
    scrapper.scrap_process(storage)
def gather_process():
    logger.info("gather")
    storage = FileStorage(SCRAPPING_FILE)
    # You can also pass a storage
    scrapper = Scrapper()
    scrapper.scrap_process(storage, SCRAPPING_DIRECTORY, SCRAPPING_PAGES_COUNT)
def scrap_group(group_name):
    file_path = os.path.join(GROUP_MEMBERS_DIR, group_name + '.csv')
    storage = FileStorage(file_path)
    client = PreAuthVKClient()
    scrapper = VKScrapper(client)
    total = scrapper.scrap_group(group_name, storage, 10000)
    print(f'Scraped {total} members of group "{group_name}".')
def gather_process():
    logger.info("gather")
    storage = FileStorage(SCRAPPED_FILE)
    # You can also pass a storage
    scrapper = Scrapper(limit=VACANCIES_LIMIT,
                        per_page=VACANCIES_PER_PAGE,
                        area=VACANCIES_SEARCH_AREA,
                        specialization=SPECIALIZATION)
    scrapper.scrap_process(storage)
def gather_process(use_proxy):
    logger.info("gather")
    storage = FileStorage(SCRAPPED_STORAGE)
    proxy = ProxyScrapper() if use_proxy else None
    scrapper = BookingScrapper(proxy, storage)
    if scrapper.scrap_process(limit=LIMIT):
        logger.info('Scraped booking.com hotels successfully')
    else:
        logger.error('Failed to scrape booking.com hotels')
def gather_process_test():
    logger.info("gather")
    storage = FileStorage(SCRAPPED_FILE)
    parser = html_parser.HtmlParser(fields=[])
    # You can also pass a storage
    scrapper = Scrapper()
    url = START_SCRAPING_PAGE.replace('###p=1###', 'p=1')
    response = scrapper.scrap_process(url=url)
    data = parser.get_parsed_data_test(response.text, DOMEN)
def convert_data_to_table_format():
    logger.info("transform")
    storage = FileStorage(SCRAPPED_STORAGE)
    hotels = storage.keys()
    columns = ['name', 'stars', 'rating', 'reviews_count', 'has_free_wifi',
               'gallery_images_count', 'address', 'start_year', 'good_district']
    # DataFrame.append was removed in pandas 2.0; build the rows first and
    # construct the DataFrame once.
    rows = []
    for hotel in hotels:
        parser = BookingHotelParser(storage.get(hotel))
        rows.append([
            parser.title(),
            parser.stars(),
            parser.rating(),
            parser.reviews_count(),
            parser.has_free_wifi(),
            len(parser.gallery_images()),
            parser.address(),
            parser.the_year_of_the_beginning_on_the_booking(),
            parser.geo_summary() is not None,
        ])
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(TABLE_FORMAT_FILE, encoding='utf-8')
def gather_process():
    logger.info("gather")
    storage = FileStorage(SCRAPED_FILE)
    scraper = Scraper()
    scraper.scrape_process(storage)
def convert_data_to_table_format():
    logger.info("transform")
    transformer = Transformer()
    data = FileStorage(SCRAPPED_FILE).read_data()
    transformer.transform(data)
def gather_process():
    logger.info("gather")
    storage = FileStorage(SCRAPPED_FILE)

    def clean(text):
        # Strip decorative symbols and CSV-unsafe characters.
        for char in ('\u2665', '\u2605', '\u260e', ':', ','):
            text = text.replace(char, '')
        return text

    session = vk.Session()
    vk_api = vk.API(session)
    members = vk_api.groups.getMembers(group_id='bolshe_buketa', v=5)
    i = 0
    with open('list_to_csv.csv', 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        for vk_user_id in members['users']:
            time.sleep(1)  # Throttle requests to stay within the API rate limit.
            user = vk_api.users.get(
                user_id=vk_user_id,
                v=5,
                fields='name, online,bdate,city,sex,about,connections,contacts'
            )[0]
            if 'home_phone' in user:
                user['home_phone'] = clean(user['home_phone'])
            if 'about' in user:
                user['about'] = clean(user['about'])
            if 'city' in user:
                city = vk_api.database.getCitiesById(city_ids=user['city'], v=5)
                if user['city'] != 0:
                    user['city_name'] = city[0]['title'].replace(':', '')
                else:
                    user['city_name'] = ''
                del user['city']
            i += 1
            print(i)
            print(user)
            try:
                csv_writer.writerow([user])
            except Exception:
                user['about'] = 'Failed to decode and write'
                try:
                    csv_writer.writerow([user])
                except Exception:
                    user['home_phone'] = 'Failed to decode and write'
                    csv_writer.writerow([user])
    print('Done')
    # You can also pass a storage
    scrapper = Scrapper()
    scrapper.scrap_process(storage)
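# Every snippet above depends on a project-local FileStorage whose source is
# not included here. This sketch only mirrors the calls the snippets make
# (read_data, plus the keys/get pair used in the booking transform) and
# assumes a line-oriented file with an optional "key<TAB>json" layout -- both
# assumptions, not the project's actual code:
import json

class FileStorage:
    def __init__(self, file_path):
        self._file_path = file_path

    def read_data(self):
        # Yield the stored records one line at a time.
        with open(self._file_path, encoding='utf-8') as f:
            for line in f:
                yield line.rstrip('\n')

    def keys(self):
        # Assumed layout for the keyed variant: one "key\tjson" record per line.
        return [line.split('\t', 1)[0] for line in self.read_data()]

    def get(self, key):
        # Return the JSON payload stored under key, or None if absent.
        for line in self.read_data():
            k, _, value = line.partition('\t')
            if k == key:
                return json.loads(value)
        return None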