Example #1
def scrap_user(user_id):
    # Returns True on success, False on error, and None when there are no more users to process.
    print('User scraping started.')

    if user_id is None:
        print('All users processed!')
        return None

    client = PreAuthVKClient()
    scrapper = VKScrapper(client)

    # Scrape the profile.
    file_path = os.path.join(USER_PROFILES_DIR, user_id + '.json')
    profile_storage = FileStorage(file_path)
    scrap_profile_result = scrapper.scrap_profile(user_id, profile_storage)
    if scrap_profile_result == ScrapProfileResult.Failed:
        print(f'User scraping failed: {user_id}.')
        return False
    elif scrap_profile_result == ScrapProfileResult.Skip:
        print(f'User scraping skipped: {user_id}.')
        return True

    # Scrape the wall.
    file_path = os.path.join(USER_WALLS_DIR, user_id + '.json')
    wall_storage = FileStorage(file_path)
    success = scrapper.scrap_wall(user_id, wall_storage, 1000)
    if not success:
        print('Failed to request wall posts!')
        return False

    print(f'User scraping finished: {user_id}.')

    return True
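Example #1 branches on ScrapProfileResult, which is not defined in any of these snippets. A minimal sketch of what it presumably looks like, inferred from the usage above (Failed and Skip are the members the example checks; the Ok member and the string values are assumptions):

from enum import Enum


class ScrapProfileResult(Enum):
    # Failed and Skip are checked in Example #1; Ok is an assumed third state.
    Ok = 'ok'
    Failed = 'failed'
    Skip = 'skip'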
Example #2
def gather_process():
    logger.info("gather")
    storage_authors = FileStorage(SCRAPPED_AUTHORS)
    storage_author_info = FileStorage(SCRAPPED_AUTHOR_INFO)

    scrapper = Scrapper()
    scrapper.scrap_process(storage_authors, storage_author_info)
Example #3
def convert_data_to_table_format():
    logger.info("transform")
    storage = FileStorage(SCRAPPED_FILE)
    result = []
    parser = DataParser([])
    for line in storage.read_data():
        result.extend(parser.parse(line))

    df = pd.DataFrame(result)
    df.to_csv(TABLE_FORMAT_FILE, encoding='utf8')
Example #4
def convert_data_to_table_format():
    logger.info("transform")
    # Transform gathered JSON lines into a pandas DataFrame and save as CSV.
    storage = FileStorage(SCRAPPED_JSON)
    objects = []
    for row in storage.read_data():
        objects.append(json.loads(row))

    df = pd.DataFrame(objects)
    df.to_csv(TABLE_FORMAT_FILE, encoding='utf-8')
Example #5
def scrap_rand_user():
    # Returns True when the user was scraped successfully, otherwise a falsy value.
    fetcher = UserIDFetcher(FileStorage(USER_IDS_FILE_NAME),
                            FileStorage(PROCESSED_USER_IDS_FILE_NAME))
    user_id = fetcher.get_not_processed_user_id()
    result = scrap_user(user_id)

    if result:
        fetcher.mark_user_id_as_processed(user_id)

    return result
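UserIDFetcher is also external to these snippets. A hedged sketch consistent with how Example #5 uses it (both method names come from the example; the two-storage layout of pending and processed IDs is an assumption, and append_data is a hypothetical FileStorage helper, see the sketch after Example #17):

class UserIDFetcher:
    def __init__(self, ids_storage, processed_storage):
        self._ids_storage = ids_storage
        self._processed_storage = processed_storage

    def get_not_processed_user_id(self):
        # Return the first user ID not yet marked as processed, or None when done.
        processed = set(self._processed_storage.read_data())
        for user_id in self._ids_storage.read_data():
            if user_id not in processed:
                return user_id
        return None

    def mark_user_id_as_processed(self, user_id):
        # append_data is assumed, not shown in the examples.
        self._processed_storage.append_data(user_id)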
Example #6
def convert_data_to_table_format():
    logger.info("transform")
    storage = FileStorage(SCRAPPED_FILE)

    data = []

    parser = DotaFileParser(data)
    for line in storage.read_data():
        data.extend(parser.parse(line))  # accumulate parsed rows in place

    with open(TABLE_FORMAT_FILE, 'w') as outfile:
        json.dump(data, outfile)
Example #7
def convert_data_to_table_format():
    logger.info("transform")
    storage = FileStorage(SCRAPPED_FILE)
    data = storage.read_data()
    parser = VacancyParser(COLUMNS)
    # Transform the gathered vacancies into a pandas DataFrame and save as CSV.
    frames = [pd.DataFrame(parser.parse(vacancy), columns=COLUMNS) for vacancy in data]
    df = pd.concat(frames, ignore_index=True) if frames else pd.DataFrame(columns=COLUMNS)
    df.to_csv(TABLE_FORMAT_FILE, encoding='utf-8')
Example #8
def convert_data_to_table_format():
    logger.info("transform")

    # Transform gathered data from the txt file into a pandas DataFrame and save as CSV.

    storage = FileStorage(SCRAPPED_FILE)
    parser = JSONParser()

    data = storage.read_data()

    df = parser.parse(data)

    df.to_csv(TABLE_FORMAT_FILE)
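JSONParser in Example #8 is not shown either. A minimal sketch under the assumption that the scrapped file holds one JSON object per line (the class name and parse signature come from the example; the record layout is an assumption):

import json

import pandas as pd


class JSONParser:
    def parse(self, lines):
        # Assumed layout: one JSON object per line of the scrapped file.
        return pd.DataFrame([json.loads(line) for line in lines])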
Example #9
def gather_process():
    logger.info("gather")
    storage = FileStorage(SCRAPPED_FILE)

    # You can also pass a storage
    scrapper = Scrapper()
    scrapper.scrap_process(storage)
Example #10
def convert_data_to_table_format():
    logger.info("transform")
    storage = FileStorage(SCRAPPED_FILE)

    # transform gathered data from json file to pandas DataFrame and save as csv
    parser = Parser(storage)
    parser.parse(TABLE_FORMAT_FILE)
Example #11
def gather_process(obj_count):
    logger.info("gather")
    storage = FileStorage(SCRAPPED_JSON)

    # You can also pass a storage
    scrapper = Scrapper(obj_count)
    scrapper.scrap_process(storage)
Example #12
def gather_process():
    logger.info("gather")
    storage = FileStorage(SCRAPPING_FILE)

    # You can also pass a storage
    scrapper = Scrapper()
    scrapper.scrap_process(storage, SCRAPPING_DIRECTORY, SCRAPPING_PAGES_COUNT)
Example #13
def scrap_group(group_name):
    file_path = os.path.join(GROUP_MEMBERS_DIR, group_name + '.csv')
    storage = FileStorage(file_path)
    client = PreAuthVKClient()
    scrapper = VKScrapper(client)
    total = scrapper.scrap_group(group_name, storage, 10000)
    print(f'Scraped {total} members of group {group_name}.')
Example #14
def gather_process():
    logger.info("gather")
    storage = FileStorage(SCRAPPED_FILE)

    # You can also pass a storage
    scrapper = Scrapper(limit=VACANCIES_LIMIT, per_page=VACANCIES_PER_PAGE, area=VACANCIES_SEARCH_AREA,
                        specialization=SPECIALIZATION)
    scrapper.scrap_process(storage)
Example #15
def gather_process(use_proxy):
    logger.info("gather")
    storage = FileStorage(SCRAPPED_STORAGE)
    proxy = ProxyScrapper() if use_proxy else None
    scrapper = BookingScrapper(proxy, storage)

    if scrapper.scrap_process(limit=LIMIT):
        logger.info('Successfully scraped booking.com hotels')
    else:
        logger.error('Failed to scrape booking.com hotels')
Example #16
def gather_process_test():
    logger.info("gather")
    storage = FileStorage(SCRAPPED_FILE)
    parser = html_parser.HtmlParser(fields=[])
    # You can also pass a storage
    scrapper = Scrapper()
    url = START_SCRAPING_PAGE.replace('###p=1###', 'p=1')
    response = scrapper.scrap_process(url=url)
    data = parser.get_parsed_data_test(response.text, DOMEN)
Example #17
def convert_data_to_table_format():
    logger.info("transform")
    storage = FileStorage(SCRAPPED_STORAGE)
    hotels = storage.keys()
    columns = [
        'name',
        'stars',
        'rating',
        'reviews_count',
        'has_free_wifi',
        'gallery_images_count',
        'address',
        'start_year',
        'good_district',
    ]

    rows = []
    for hotel in hotels:
        parser = BookingHotelParser(storage.get(hotel))
        rows.append([
            parser.title(),
            parser.stars(),
            parser.rating(),
            parser.reviews_count(),
            parser.has_free_wifi(),
            len(parser.gallery_images()),
            parser.address(),
            parser.the_year_of_the_beginning_on_the_booking(),
            parser.geo_summary() is not None,
        ])

    # Build the frame once instead of appending row by row (DataFrame.append is deprecated).
    df = pd.DataFrame(rows, columns=columns)
    df.to_csv(TABLE_FORMAT_FILE, encoding='utf-8')
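FileStorage itself is never shown, yet every example constructs one. The snippets use it in two ways: as a line-oriented store (read_data() iterated in Examples #3, #4, and #6) and as a keyed store (keys() and get() in Example #17). A minimal sketch covering both, assuming one record per line of a UTF-8 text file and, for the keyed variant, one JSON object per line; write_data and append_data are hypothetical helpers (append_data backs the UserIDFetcher sketch after Example #5):

import json
import os


class FileStorage:
    def __init__(self, file_path):
        self._file_path = file_path

    def read_data(self):
        # Yield records one line at a time, as the examples iterate them.
        with open(self._file_path, encoding='utf-8') as f:
            for line in f:
                yield line.rstrip('\n')

    def write_data(self, lines):
        # Hypothetical helper: overwrite the file with the given records.
        with open(self._file_path, 'w', encoding='utf-8') as f:
            f.writelines(line + '\n' for line in lines)

    def append_data(self, line):
        # Hypothetical helper: append a single record.
        with open(self._file_path, 'a', encoding='utf-8') as f:
            f.write(line + '\n')

    def _load_mapping(self):
        # Assumed layout for the keyed usage in Example #17: one JSON
        # object per line of the form {"key": ..., "value": ...}.
        mapping = {}
        if os.path.exists(self._file_path):
            for line in self.read_data():
                record = json.loads(line)
                mapping[record['key']] = record['value']
        return mapping

    def keys(self):
        return list(self._load_mapping())

    def get(self, key):
        return self._load_mapping().get(key)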
Example #18
def gather_process():
    logger.info("gather")
    storage = FileStorage(SCRAPED_FILE)
    scraper = Scraper()
    scraper.scrape_process(storage)
Example #19
def convert_data_to_table_format():
    logger.info("transform")

    transformer = Transformer()
    data = FileStorage(SCRAPPED_FILE).read_data()
    transformer.transform(data)
Example #20
def gather_process():
    logger.info("gather")
    storage = FileStorage(SCRAPPED_FILE)

    print("Hello")

    session = vk.Session()
    vk_api = vk.API(session)

    members = vk_api.groups.getMembers(group_id='bolshe_buketa', v=5)

    def clean(text):
        # Strip decorative symbols and the ':' / ',' characters that break the CSV row.
        for char in ('\u2665', '\u2605', '\u260e', ':', ','):
            text = text.replace(char, '')
        return text

    i = 0
    with open('list_to_csv.csv', 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        for vk_user_id in members['users']:

            time.sleep(1)  # throttle requests to respect the VK API rate limit

            user = vk_api.users.get(
                user_id=vk_user_id,
                v=5,
                fields='name,online,bdate,city,sex,about,connections,contacts'
            )[0]

            if 'home_phone' in user:
                user['home_phone'] = clean(user['home_phone'])

            if 'about' in user:
                user['about'] = clean(user['about'])

            if 'city' in user:
                if user['city'] != 0:
                    city = vk_api.database.getCitiesById(city_ids=user['city'], v=5)
                    user['city_name'] = city[0]['title'].replace(':', '')
                else:
                    user['city_name'] = ''
                del user['city']

            i += 1
            print(i)
            print(user)
            try:
                csv_writer.writerow([user])
            except Exception:
                user['about'] = 'Failed to decode and write'
                try:
                    csv_writer.writerow([user])
                except Exception:
                    user['home_phone'] = 'Failed to decode and write'
                    csv_writer.writerow([user])

    print('Done')

    # You can also pass a storage
    scrapper = Scrapper()
    scrapper.scrap_process(storage)
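All of the examples follow the same gather/transform split, so a typical entry point just dispatches on a subcommand. A minimal sketch, assuming it lives in the same module as one gather_process/convert_data_to_table_format pair from above (the argparse wiring is an assumption, not shown in any example):

import argparse


def main():
    # Dispatch to the gather or transform stage used throughout the examples.
    arg_parser = argparse.ArgumentParser(description='scraping pipeline')
    arg_parser.add_argument('stage', choices=['gather', 'transform'])
    args = arg_parser.parse_args()

    if args.stage == 'gather':
        gather_process()
    else:
        convert_data_to_table_format()


if __name__ == '__main__':
    main()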