Beispiel #1
0
import time

from gray.common.data_utils import write_entries, time_measure
from gray.common.node_utils import Node, Provider

# Load the General Assembly education catalog page and turn each result row
# into a dict of scraped fields, then hand the collected rows to write_entries.
doc = Node("https://generalassemb.ly/education", Provider.PHANTOMJS)
entries = []
# Rows are the grandchildren of #catalog-results, minus the date-divider nodes.
row_els = doc.select_list("#catalog-results > div > *:not(.date-divider)")
for row_idx, row_el in enumerate(row_els):
    # Per-row start timestamp, passed to time_measure once the row is scraped.
    start_time = time.time()
    entry = {"link": row_el.select("a").attr("href")}

    # Title, description and instructor all live under the item-details node.
    details_el = row_el.select(".item-details")
    entry.update(
        title=details_el.select(".medium.item-title").text(),
        desc=details_el.children(1).text(),
        instructor=details_el.select(".instructor").text(),
    )

    # NOTE(review): "date" and "time" are read from the same ".date-details"
    # selector and therefore hold the same text -- confirm this is intended.
    entry.update(
        series=row_el.select(".series-info").text(),
        date=row_el.select(".date-details").text(),
        time=row_el.select(".date-details").text(),
        topics=row_el.select_list("li.topic-icon-item").attrs("title"),
        promo=row_el.select_list(".cyber-monday-promo").texts(),
    )
    time_measure("{} row".format(row_idx), start_time, 3)
    entries.append(entry)

write_entries(entries, "generalassembly")
Beispiel #2
0
                opp_link_el = cell_els[i_col].select("a:contains('Details')")
                if opp_link_el:
                    entry["Opportunity_link"] = opp_link_el.attr("href")
                else:
                    entry["Opportunity_link"] = "NA"
                    na_cells += 1
            else:
                entry[column] = cell_els[i_col].text()
        page_entries.append(entry)
    if na_cells:
        print("--", str(na_cells))
    return page_entries


# Gather the saved .html pages found in rainking_out_path and process them
# one category at a time, writing one set of entries per category.
start_time = time.time()
saved_pages = [
    page_name
    for page_name in os.listdir(rainking_out_path)
    if page_name.endswith(".html")
]
# Split saved_pages by filename prefix for separate processing, because the
# columns of each category are different.
people_pages, companies_pages, scoops_pages = (
    [page_name for page_name in saved_pages if page_name.startswith(prefix)]
    for prefix in ("People", "Companies", "Scoops")
)

for i_cat, category_pages in enumerate(
    (people_pages, companies_pages, scoops_pages)
):
    category_entries = []
    for i_page, saved_page in enumerate(category_pages):
        # Progress line; the elapsed seconds are shown only on every 10th page.
        if i_page % 10:
            print(i_cat, i_page, "")
        else:
            print(i_cat, i_page, str(round(time.time() - start_time, 2)))
        category_entries.extend(process_page(saved_page))
    # The category index (0, 1, 2) is passed as the second argument.
    write_entries(category_entries, i_cat)
                location = first_match(location_regex, cur_cell_text)
                entry["Location"] = re.sub("^,\s*", "", location)
                opportunity_regex = "(?<=Opportunity:).+?(?=\n|$|View Details)".format(company_name)
                entry["Opportunity"] = first_match(opportunity_regex, cur_cell_text)
                opp_link_el = cell_els[i_col].select("a:contains('Details')")
                if opp_link_el:
                    entry["Opportunity_link"] = opp_link_el.attr("href")
                else:
                    entry["Opportunity_link"] = "NA"
                    na_cells += 1
            else:
                entry[column] = cell_els[i_col].text()
        page_entries.append(entry)
    if na_cells:
        print("--", str(na_cells))
    return page_entries


# Driver: list the saved .html pages in rainking_out_path, bucket them by
# filename prefix, and run process_page over each bucket in turn.
start_time = time.time()
saved_pages = [
    file_name
    for file_name in os.listdir(rainking_out_path)
    if file_name.endswith(".html")
]
# The three categories are handled separately because their columns differ.
people_pages, companies_pages, scoops_pages = (
    [file_name for file_name in saved_pages if file_name.startswith(prefix)]
    for prefix in ("People", "Companies", "Scoops")
)

for i_cat, category_pages in enumerate(
    (people_pages, companies_pages, scoops_pages)
):
    category_entries = []
    for i_page, saved_page in enumerate(category_pages):
        # Elapsed time since start_time is printed only when i_page is a
        # multiple of 10; other pages print an empty third field.
        if i_page % 10:
            print(i_cat, i_page, "")
        else:
            print(i_cat, i_page, str(round(time.time() - start_time, 2)))
        category_entries.extend(process_page(saved_page))
    # One write per category, identified by its index in the list above.
    write_entries(category_entries, i_cat)