Code example #1
def gather_process():
    logger.info("gather")
    storage = FileStorage(SCRAPPED_FILE)

    # You can also pass a storage
    scrapper = Scrapper()
    scrapper.scrap_process(storage)
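
All of these examples call a Scrapper whose scrap_process method hands its result to a storage object. For orientation, a minimal sketch of the interface they appear to assume is shown below; the class internals, the write_data method, and the file format are illustrative assumptions, not code taken from any of the listed projects.

class FileStorage:
    """Hypothetical storage that writes scraped lines to a text file."""

    def __init__(self, file_name):
        self.file_name = file_name

    def write_data(self, data):
        # Persist an iterable of strings, one string per line.
        with open(self.file_name, 'w', encoding='utf-8') as f:
            f.writelines(line + '\n' for line in data)


class Scrapper:
    """Hypothetical scraper that fetches data and delegates persistence."""

    def scrap_process(self, storage):
        # The real projects perform HTTP requests and parsing here;
        # placeholder rows are enough to show the call contract.
        storage.write_data(['example row 1', 'example row 2'])
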
Code example #2
def gather_process():
    logger.info("gather")
    storage = FileStorage(SCRAPPING_FILE)

    # You can also pass a storage
    scrapper = Scrapper()
    scrapper.scrap_process(storage, SCRAPPING_DIRECTORY, SCRAPPING_PAGES_COUNT)
Code example #3
File: gathering.py Project: erlong15/data_gathering
def gather_process(obj_count):
    logger.info("gather")
    storage = FileStorage(SCRAPPED_JSON)

    # You can also pass a storage
    scrapper = Scrapper(obj_count)
    scrapper.scrap_process(storage)
Code example #4
def gather_process():
    logger.info("gather")
    storage_authors = FileStorage(SCRAPPED_AUTHORS)
    storage_author_info = FileStorage(SCRAPPED_AUTHOR_INFO)

    scrapper = Scrapper()
    scrapper.scrap_process(storage_authors, storage_author_info)
Code example #5
File: gathering.py Project: damu4/data_gathering
def gather_process():
    logger.info("gather")
    storage = FileStorage(SCRAPPED_FILE)

    # You can also pass a storage
    scrapper = Scrapper(limit=VACANCIES_LIMIT, per_page=VACANCIES_PER_PAGE, area=VACANCIES_SEARCH_AREA,
                        specialization=SPECIALIZATION)
    scrapper.scrap_process(storage)
Code example #6
def parse_tickers(sect, base_url):
    # url = 'https://www.estimize.com/sectors/{sector}?per={max_tickers}'.format(sector=sect, max_tickers=MAX_TICKERS)
    url = base_url.format(sector=sect, max_tickers=MAX_TICKERS)
    scraper = Scrapper()
    soup = BeautifulSoup(scraper.scrap_process(url), 'lxml')  # lxml parser
    tickers_table_dict = {}
    # determine the reference year
    season_txt = soup.find('div', {'class': 'season'}).find('strong').text
    year_txt = season_txt.split(' ')[1]
    # download the table of tickers
    tickers_html = soup.find('div', {'class': 'linked-table'})
    items = tickers_html.find_all(
        'a', {'class': ['linked-row opened', 'linked-row closed']})
    for item in items:
        line_dict = {}
        ticker_nm = item.find('div', {
            'class': 'td symbol'
        }).text.replace('\n', '')
        # sometimes there are 2 reports in the 1st quarter; keep the earliest one
        if ticker_nm not in tickers_table_dict:
            date_str = item.find('div', {
                'class': 'td reports'
            }).text.replace('\n', '')[:-3] + ' ' + year_txt
            line_dict['Report'] = pd.to_datetime(date_str, format='%b %d %Y')
            line_dict['Time'] = item.find('div', {
                'class': 'td reports'
            }).text.replace('\n', '')[-3:]
            line_dict['Quarter'] = item.find('div', {
                'class': 'td quarter'
            }).text.replace('\n', '')
            line_dict['Sector'] = sect
            tickers_table_dict[ticker_nm] = line_dict
    df_out = pd.DataFrame(tickers_table_dict).T
    df_out.index.rename('tic', inplace=True)
    return df_out
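
A hypothetical call to parse_tickers could look like the snippet below. The sector slug, the URL template (taken from the commented-out URL inside the function), and MAX_TICKERS being a module-level constant are assumptions for illustration.

MAX_TICKERS = 100  # assumed module-level constant read inside parse_tickers

base_url = 'https://www.estimize.com/sectors/{sector}?per={max_tickers}'
tickers_df = parse_tickers('information-technology', base_url)
print(tickers_df.head())
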
Code example #7
def gather_process_test():
    logger.info("gather")
    storage = FileStorage(SCRAPPED_FILE)
    parser = html_parser.HtmlParser(fields=[])
    # You can also pass a storage
    # data_dict = {'marka' : [], 'model' : [], 'price' : [], 'year' : [], 'probeg' : [], 'owners' : [], 'photo_len' : []}
    scrapper = Scrapper()
    url = START_SCRAPING_PAGE.replace('###p=1###', 'p=1')
    response = scrapper.scrap_process(url=url)
    data = parser.get_parsed_data_test(response.text, DOMEN)
Code example #8
def gather_process():
    logger.info("gather")
    storage = JSONStorage(SCRAPPED_FILE)
    quotes_storage = JSONStorage(QUOTES_FILE)

    # You can also pass a storage

    # Search vacancies using popular job title name in data analytics
    # https://blog.udacity.com/2018/01/4-types-data-science-jobs.html
    search_texts = ['Data Analyst', 'Machine Learning Engineer', 'Data Engineer', 'Data Scientist']
    search_params = {'page': 0, 'no_magic': False,
                     'period': 30, 'only_with_salary': False}


    # Scrape information about vacancies
    scrapper = Scrapper()
    scrapper.scrap_process(storage, search_texts, search_params)

    # Scrape information about current quotes for currencies vs RUR
    quotes_scrapper = QuotesScrapper()
    quotes_scrapper.scrap_process(quotes_storage, 'RUR')
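
The parameter names in search_params (no_magic, period, only_with_salary) match the public HeadHunter vacancy-search API, so the scraper presumably queries api.hh.ru under the hood; that is an assumption, not something shown in the example. A minimal sketch of such a request, independent of the Scrapper class, might be:

import requests

def fetch_vacancies(search_text, search_params):
    # Query the HeadHunter API for vacancies matching one search text.
    # How the original Scrapper builds its requests is not shown above.
    params = dict(search_params, text=search_text)
    response = requests.get('https://api.hh.ru/vacancies', params=params)
    response.raise_for_status()
    return response.json()['items']

vacancies = fetch_vacancies('Data Analyst', {'page': 0, 'period': 30})
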
Code example #9
File: gathering.py Project: shaihulud68/Otus.Lesson1
def gather_process():
    logger.info("gather")
    storage = FileStorage(SCRAPPED_FILE)

    print("Hello")

    session = vk.Session()
    vk_api = vk.API(session)

    members = vk_api.groups.getMembers(group_id='bolshe_buketa', v=5)

    i = 0
    with open('list_to_csv.csv', 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        for vk_user_id in members['users']:

            # pause between requests to stay within VK API rate limits
            time.sleep(1)

            user = vk_api.users.get(
                user_id=vk_user_id,
                v=5,
                fields='name, online,bdate,city,sex,about,connections,contacts'
            )[0]

            if 'home_phone' in user:
                user['home_phone'] = user['home_phone'].replace(
                    '\u2665',
                    '').replace('\u2605',
                                '').replace('\u260e',
                                            '').replace(':',
                                                        '').replace(',', '')

            if 'about' in user:
                user['about'] = user['about'].replace('\u2665', '').replace(
                    '\u2605', '').replace('\u260e',
                                          '').replace(':',
                                                      '').replace(',', '')

            if 'city' in user:

                city = vk_api.database.getCitiesById(city_ids=user['city'],
                                                     v=5)

                if user['city'] != 0:
                    user['city_name'] = city[0]['title'].replace(':', '')
                else:
                    user['city_name'] = ''

                del user['city']
            i = i + 1
            print(i)
            print(user)
            try:
                csv_writer.writerow([user])
            except Exception:
                # if the row cannot be encoded, replace the free-text fields
                # with a placeholder ("failed to decode and write") and retry
                user['about'] = 'Не удалось декодировать и записать'
                try:
                    csv_writer.writerow([user])
                except Exception:
                    user['home_phone'] = 'Не удалось декодировать и записать'
                    csv_writer.writerow([user])

    print('Done')

    # You can also pass a storage
    scrapper = Scrapper()
    scrapper.scrap_process(storage)
Code example #10
def gather_process(pageCounts):
    logger.info("gather")
    scrapper = Scrapper(pageCounts)
    scrapper.scrap_process()
Code example #11
def gather_process():
    logger.info("gather")
    scrapper = Scrapper()
    scrapper.scrap_process(SCRAPPED_FILE)