def parse_tickers(sect, base_url):
    """Scrape an Estimize sector page and build a table of tickers.

    Parameters
    ----------
    sect : str
        Sector slug substituted into ``base_url``.
    base_url : str
        URL template with ``{sector}`` and ``{max_tickers}`` placeholders,
        e.g. 'https://www.estimize.com/sectors/{sector}?per={max_tickers}'.

    Returns
    -------
    pandas.DataFrame
        Indexed by ticker symbol ('tic'); columns: Report (Timestamp),
        Time, Qurter (sic — key kept for backward compatibility), Sector.
    """
    url = base_url.format(sector=sect, max_tickers=MAX_TICKERS)
    scraper = Scrapper()
    soup = BeautifulSoup(scraper.scrap_process(url), 'lxml')  # lxml parser

    tickers_table_dict = {}

    # Determine the reference year from the season header.
    season_txt = soup.find('div', {'class': 'season'}).find('strong').text
    year_txt = season_txt.split(' ')[1]

    # Download the ticker table.
    tickers_html = soup.find('div', {'class': 'linked-table'})
    items = tickers_html.find_all(
        'a', {'class': ['linked-row opened', 'linked-row closed']})

    for item in items:
        ticker_nm = item.find('div', {
            'class': 'td symbol'
        }).text.replace('\n', '')
        # A ticker can report twice in one quarter; keep only the first
        # (earliest) occurrence.
        if ticker_nm in tickers_table_dict:
            continue
        # Look the 'td reports' cell up once (the original parsed it twice).
        reports_txt = item.find('div', {
            'class': 'td reports'
        }).text.replace('\n', '')
        line_dict = {
            # e.g. 'Apr 25' + ' 2020' -> Timestamp
            'Report': pd.to_datetime(reports_txt[:-3] + ' ' + year_txt,
                                     format='%b %d %Y'),
            # Trailing 3-char marker (presumably an AMC/BMO-style time
            # tag — TODO confirm against the site markup).
            'Time': reports_txt[-3:],
            # NOTE(review): 'Qurter' is a typo but is kept verbatim since
            # downstream code may rely on this exact column name.
            'Qurter': item.find('div', {
                'class': 'td quarter'
            }).text.replace('\n', ''),
            'Sector': sect,
        }
        tickers_table_dict[ticker_nm] = line_dict

    df_out = pd.DataFrame(tickers_table_dict).T
    df_out.index.rename('tic', inplace=True)
    return df_out
def gather_process():
    """Run the scraping pipeline, persisting results to SCRAPPED_FILE."""
    logger.info("gather")
    # The scrapper writes its results through the storage abstraction;
    # a different storage implementation could be passed instead.
    file_storage = FileStorage(SCRAPPED_FILE)
    Scrapper().scrap_process(file_storage)
def gather_process(obj_count):
    """Scrape up to ``obj_count`` objects into SCRAPPED_JSON."""
    logger.info("gather")
    # A different storage implementation could be passed instead.
    json_storage = FileStorage(SCRAPPED_JSON)
    Scrapper(obj_count).scrap_process(json_storage)
def gather_process():
    """Scrape SCRAPPING_PAGES_COUNT pages into SCRAPPING_DIRECTORY."""
    logger.info("gather")
    # A different storage implementation could be passed instead.
    page_storage = FileStorage(SCRAPPING_FILE)
    Scrapper().scrap_process(page_storage, SCRAPPING_DIRECTORY,
                             SCRAPPING_PAGES_COUNT)
def gather_process():
    """Scrape authors and per-author details into separate storages."""
    logger.info("gather")
    authors = FileStorage(SCRAPPED_AUTHORS)
    author_info = FileStorage(SCRAPPED_AUTHOR_INFO)
    Scrapper().scrap_process(authors, author_info)
def gather_process():
    """Scrape vacancies using the configured search limits into SCRAPPED_FILE."""
    logger.info("gather")
    # A different storage implementation could be passed instead.
    target = FileStorage(SCRAPPED_FILE)
    vacancy_scrapper = Scrapper(
        limit=VACANCIES_LIMIT,
        per_page=VACANCIES_PER_PAGE,
        area=VACANCIES_SEARCH_AREA,
        specialization=SPECIALIZATION,
    )
    vacancy_scrapper.scrap_process(target)
def gather_process_test():
    """Scrape the first listing page and return its parsed records.

    Returns
    -------
    The value produced by ``HtmlParser.get_parsed_data_test`` (the
    original computed it and silently discarded it).
    """
    logger.info("gather")
    # NOTE(review): the storage is never used below; it is kept because
    # constructing FileStorage may prepare the output file — confirm.
    storage = FileStorage(SCRAPPED_FILE)
    parser = html_parser.HtmlParser(fields=[])
    scrapper = Scrapper()
    # Force pagination to the first page.
    url = START_SCRAPING_PAGE.replace('###p=1###', 'p=1')
    response = scrapper.scrap_process(url=url)
    data = parser.get_parsed_data_test(response.text, DOMEN)
    # Return the result so callers/tests can inspect it.
    return data
def parse_ticker_data(tic_str, q_shift, date_str_lst, metric_str_lst,
                      base_url):
    """Scrape estimate data for ONE ticker.

    Builds one URL per (quarter, metric) pair, scrapes each data table,
    converts the tables to pandas Series and concatenates everything
    into a single DataFrame for the ticker.

    Parameters
    ----------
    tic_str : str
        Ticker symbol.
    q_shift : passed through to ``transform_df_to_concat``.
    date_str_lst : iterable of quarter strings (converted via ``quarter2ez``).
    metric_str_lst : iterable of metric names (e.g. EPS, revenue).
    base_url : str
        URL template with ``{ticker}``, ``{quarter}`` and ``{metric_name}``.

    Returns
    -------
    pandas.DataFrame
        Column MultiIndex with level_0 == ticker.
    """
    urls = [
        base_url.format(ticker=tic_str,
                        quarter=quarter2ez(q_str),
                        metric_name=met_str)
        for q_str in date_str_lst
        for met_str in metric_str_lst
    ]
    scraper = Scrapper()
    # Download the data tables as text.
    # <table class="rel-chart-tbl"> is the table selector;
    # class="release-header-information-breadcrumb" is the sector selector.
    table_txt_list = scraper.scrap_slenium(urls, LOCATOR_PATHS)

    # Lists of per-quarter Series grouped by metric name (EPS, revenue, ...).
    concat_dict = {}
    for tb_txt in table_txt_list:
        if tb_txt['metric_table'] is not None:
            mtrc, df = parse_table_txt(tb_txt['metric_table'])
            s = transform_df_to_concat(tic_str=tic_str,
                                       mertic_str=mtrc,
                                       q_shift=q_shift,
                                       df=df)
            # setdefault replaces the original try/except-KeyError dance.
            concat_dict.setdefault(mtrc, []).append(s)

    concat_lst = []  # per-metric Series, to be glued horizontally below
    for mtrc, s_lst in concat_dict.items():
        _s = pd.concat(s_lst, axis=0).sort_index()
        # Duplicates can appear at range edges when the site shows fewer
        # than 3 years of data.
        _s = _s[~_s.index.duplicated()]
        concat_lst.append(_s)  # Series for one metric, glued vertically

    # Glue metrics horizontally.
    df_out = pd.concat(concat_lst, axis=1,
                       sort=True).reset_index(level='qrt').sort_index(
                           level=['calendar_qrt', 'sources'])
    # Add a column MultiIndex: level_0 => ticker.
    df_out.columns = pd.MultiIndex.from_product([[tic_str], df_out.columns])
    return df_out
def gather_process():
    """Scrape vacancies for common analytics job titles, then RUR quotes."""
    logger.info("gather")
    vacancy_storage = JSONStorage(SCRAPPED_FILE)
    currency_storage = JSONStorage(QUOTES_FILE)

    # Search vacancies using popular job title names in data analytics:
    # https://blog.udacity.com/2018/01/4-types-data-science-jobs.html
    search_texts = ['Data Analyst', 'Machine Learning Engineer',
                    'Data Engineer', 'Data Scientist']
    search_params = {'page': 0,
                     'no_magic': False,
                     'period': 30,
                     'only_with_salary': False}

    # Scrape information about vacancies.
    Scrapper().scrap_process(vacancy_storage, search_texts, search_params)
    # Scrape current quotes for currencies vs RUR.
    QuotesScrapper().scrap_process(currency_storage, 'RUR')
def gather_process():
    """Scrape every model's listing pages and write all rows to data.csv."""
    scrapper = Scrapper()
    page_part = '?p='
    # Collect per-page frames and concatenate ONCE at the end; the
    # original re-concatenated the accumulator on every page (quadratic).
    frames = []
    for car_model in MODELS:
        initial_url = BASE_URL + car_model + page_part + '1'
        total_pages = scrapper.get_total_pages(scrapper.get_html(initial_url))
        for page in range(1, total_pages + 1):
            time.sleep(5)  # be polite to the server
            url_gen = BASE_URL + car_model + page_part + str(page)
            logger.info('processing ' + str(page) + ' ' + url_gen)
            frames.append(scrapper.get_page_data(scrapper.get_html(url_gen)))
    # Empty frames list -> write an empty CSV, matching the original's
    # behaviour when no pages were found.
    res = pd.concat(frames) if frames else pd.DataFrame()
    res.to_csv('data.csv', index=False)
def __init__(self, url: str, type_recipe: str):
    """Initialise the BBC recipe scrapper.

    Both arguments are delegated to the Scrapper base class.
    """
    # Prefer super() over the explicit Scrapper.__init__ call so
    # cooperative multiple inheritance resolves through the MRO.
    super().__init__(url, type_recipe)
    print('BBC')
def gather_process():
    """Dump VK group members' profiles to list_to_csv.csv, then run the scrapper.

    Fetches every member of the 'bolshe_buketa' group, sanitises a few
    free-text fields, resolves the city id to a name, and writes one row
    per user. Finally runs the regular scrap process into SCRAPPED_FILE.
    """
    logger.info("gather")
    storage = FileStorage(SCRAPPED_FILE)  # You can also pass a storage
    print("Hello")

    def _clean(text):
        # Strip decorative unicode chars and CSV-hostile separators
        # (was duplicated inline for both home_phone and about).
        return (text.replace('\u2665', '').replace('\u2605', '')
                .replace('\u260e', '').replace(':', '').replace(',', ''))

    session = vk.Session()
    vk_api = vk.API(session)
    members = vk_api.groups.getMembers(group_id='bolshe_buketa', v=5)
    i = 0
    with open('list_to_csv.csv', 'w', newline='') as csv_file:
        csv_writer = csv.writer(csv_file)
        for vk_user_id in members['users']:
            time.sleep(1)  # throttle to respect VK API rate limits
            user = vk_api.users.get(
                user_id=vk_user_id,
                v=5,
                fields='name, online,bdate,city,sex,about,connections,contacts'
            )[0]
            if 'home_phone' in user:
                user['home_phone'] = _clean(user['home_phone'])
            if 'about' in user:
                user['about'] = _clean(user['about'])
            if 'city' in user:
                city = vk_api.database.getCitiesById(city_ids=user['city'],
                                                     v=5)
                if user['city'] != 0:
                    user['city_name'] = city[0]['title'].replace(':', '')
                else:
                    user['city_name'] = ''
                del user['city']
            i = i + 1
            print(i)
            print(user)
            # Some profiles contain characters the writer cannot encode;
            # progressively replace the offending fields with placeholder
            # text instead of aborting the whole dump.
            try:
                csv_writer.writerow([user])
            except Exception:  # was a bare except: — narrowed
                user['about'] = 'Не удалось декодировать и записать'
                try:
                    csv_writer.writerow([user])
                except Exception:  # was a bare except: — narrowed
                    user['home_phone'] = 'Не удалось декодировать и записать'
                    csv_writer.writerow([user])
    print('Done')

    scrapper = Scrapper()
    scrapper.scrap_process(storage)
def gather_process(pageCounts):
    """Scrape ``pageCounts`` pages; the scrapper handles its own storage."""
    logger.info("gather")
    Scrapper(pageCounts).scrap_process()
def gather_process():
    """Scrape directly into the SCRAPPED_FILE path."""
    logger.info("gather")
    page_scrapper = Scrapper()
    page_scrapper.scrap_process(SCRAPPED_FILE)