import time

from gray.common.data_utils import write_entries, time_measure
from gray.common.node_utils import Node, Provider

# Load the General Assembly catalog page, rendered through PhantomJS so that
# JavaScript-populated listings are present in the DOM.
doc = Node("https://generalassemb.ly/education", Provider.PHANTOMJS)

entries = []
# Every direct child of the results container except the date-divider
# separators — each remaining element is one catalog listing.
rows = doc.select_list("#catalog-results > div > *:not(.date-divider)")
for idx, row in enumerate(rows):
    row_start = time.time()
    record = {}
    record["link"] = row.select("a").attr("href")

    details = row.select(".item-details")
    record["title"] = details.select(".medium.item-title").text()
    record["desc"] = details.children(1).text()
    record["instructor"] = details.select(".instructor").text()

    record["series"] = row.select(".series-info").text()
    # NOTE(review): "date" and "time" both read .date-details — looks like a
    # copy-paste; confirm the intended selector for "time".
    record["date"] = row.select(".date-details").text()
    record["time"] = row.select(".date-details").text()
    record["topics"] = row.select_list("li.topic-icon-item").attrs("title")
    record["promo"] = row.select_list(".cyber-monday-promo").texts()

    # Report per-row scrape time (3-decimal precision).
    time_measure(str(idx) + " row", row_start, 3)
    entries.append(record)

write_entries(entries, "generalassembly")
opp_link_el = cell_els[i_col].select("a:contains('Details')") if opp_link_el: entry["Opportunity_link"] = opp_link_el.attr("href") else: entry["Opportunity_link"] = "NA" na_cells += 1 else: entry[column] = cell_els[i_col].text() page_entries.append(entry) if na_cells: print("--", str(na_cells)) return page_entries start_time = time.time() saved_pages = [x for x in os.listdir(rainking_out_path) if x.endswith(".html")] people_pages = [ x for x in saved_pages if x.startswith("People") ] # split saved_pages for separate processing because their columns are different companies_pages = [x for x in saved_pages if x.startswith("Companies")] scoops_pages = [x for x in saved_pages if x.startswith("Scoops")] for i_cat, category_pages in enumerate( [people_pages, companies_pages, scoops_pages]): category_entries = [] for i_page, saved_page in enumerate(category_pages): print(i_cat, i_page, "" if i_page % 10 else str(round(time.time() - start_time, 2))) category_entries += process_page(saved_page) write_entries(category_entries, i_cat)
location = first_match(location_regex, cur_cell_text) entry["Location"] = re.sub("^,\s*", "", location) opportunity_regex = "(?<=Opportunity:).+?(?=\n|$|View Details)".format(company_name) entry["Opportunity"] = first_match(opportunity_regex, cur_cell_text) opp_link_el = cell_els[i_col].select("a:contains('Details')") if opp_link_el: entry["Opportunity_link"] = opp_link_el.attr("href") else: entry["Opportunity_link"] = "NA" na_cells += 1 else: entry[column] = cell_els[i_col].text() page_entries.append(entry) if na_cells: print("--", str(na_cells)) return page_entries start_time = time.time() saved_pages = [x for x in os.listdir(rainking_out_path) if x.endswith(".html")] people_pages = [x for x in saved_pages if x.startswith("People")] # split saved_pages for separate processing because their columns are different companies_pages = [x for x in saved_pages if x.startswith("Companies")] scoops_pages = [x for x in saved_pages if x.startswith("Scoops")] for i_cat, category_pages in enumerate([people_pages, companies_pages, scoops_pages]): category_entries = [] for i_page, saved_page in enumerate(category_pages): print(i_cat, i_page, "" if i_page % 10 else str(round(time.time() - start_time, 2))) category_entries += process_page(saved_page) write_entries(category_entries, i_cat)