Esempio n. 1
0
def process_page(saved_page):
    """Parse one saved results page into a list of per-row dictionaries.

    The page is loaded through ``Node`` from ``rainking_out_path + saved_page``
    and every ``tbody > tr`` row of the results table becomes one dict keyed
    by the table's column headers.

    Args:
        saved_page: File name of the saved page. The filter name and the page
            index are extracted from it with regexes.

    Returns:
        A list with one dict per table row. Every dict carries
        ``"filter_name"`` and ``"page#"``; columns with an empty header or the
        header ``"Status"`` are skipped; an ``"Inside Scoop"`` cell is split
        into ``"Company"``, ``"Location"``, ``"Opportunity"`` and
        ``"Opportunity_link"``.
    """
    filter_name = first_match(r"(?<=_)\w+(?=_)", saved_page)
    i_page = first_match(r"(?<=_)\d*(?=\.)",
                         saved_page)  # i_ prefix means "index"

    node = Node(rainking_out_path + saved_page)  # utils wrapper class
    table_el = node.select(
        "#searchForm .grid-content.expanding-content > table")
    column_names = [th.text() for th in table_el.select_list("thead th")]
    row_els = table_el.select_list("tbody > tr")
    page_entries = []
    na_cells = 0  # we will set na for cells with exceptions. It is important for unstructured "Inside scoops" column

    for row_el in row_els:
        entry = {"filter_name": filter_name, "page#": i_page}
        cell_els = row_el.select_list("td")
        for i_col, column in enumerate(column_names):
            if column in ["", "Status"]:
                continue
            elif column == "Company":
                entry[column] = "Accounts::::" + cell_els[i_col].text()
            elif column == "Inside Scoop":
                # example for regex:
                # Topic: Security, Staffing.
                # Company: Virgin Media Inc., New York, NY    Find the Best Contacts
                # Opportunity: Seeking a Security Data Engineer (England,Peterborough,PE3).    View Details
                cell_el = cell_els[i_col]
                cur_cell_text = cell_el.text()
                company_el = cell_el.select("a")
                if company_el is None:
                    company_name = first_match(r"(?<=Company:)[^,]*",
                                               cur_cell_text)
                    if not company_name:
                        # Fall back to the whole cell text and count it.
                        company_name = cur_cell_text
                        na_cells += 1
                else:
                    company_name = company_el.text()
                entry["Company"] = "Accounts::::" + company_name
                # The company name is data, not a pattern: escape it so that
                # characters such as ".", "(" or "+" are matched literally and
                # cannot produce an invalid (variable-width) look-behind.
                location_regex = (
                    r"(?<={0}).+?(?=\n|$|Find the Best Contacts)".format(
                        re.escape(company_name)))
                location = first_match(location_regex, cur_cell_text)
                # No match (e.g. the whole-cell fallback above) yields an
                # empty location instead of failing inside re.sub.
                entry["Location"] = re.sub(r"^,\s*", "", location or "")
                opportunity_regex = r"(?<=Opportunity:).+?(?=\n|$|View Details)"
                entry["Opportunity"] = first_match(opportunity_regex,
                                                   cur_cell_text)
                opp_link_el = cell_el.select("a:contains('Details')")
                if opp_link_el:
                    entry["Opportunity_link"] = opp_link_el.attr("href")
                else:
                    entry["Opportunity_link"] = "NA"
                    na_cells += 1
            else:
                entry[column] = cell_els[i_col].text()
        page_entries.append(entry)
    if na_cells:
        print("--", str(na_cells))
    return page_entries
def _parse_inside_scoop(cell_el):
    """Split one "Inside Scoop" cell into its structured fields.

    Args:
        cell_el: Node wrapping the ``td`` element of the cell.

    Returns:
        A ``(fields, na_count)`` tuple. ``fields`` maps "Company", "Location",
        "Opportunity" and "Opportunity_link" to the extracted values;
        ``na_count`` is how many fallback values had to be used.
    """
    # example for regex:
    # Topic: Security, Staffing.
    # Company: Virgin Media Inc., New York, NY    Find the Best Contacts
    # Opportunity: Seeking a Security Data Engineer (England,Peterborough,PE3).    View Details
    na_count = 0
    cell_text = cell_el.text()
    company_el = cell_el.select("a")
    if company_el is None:
        company_name = first_match(r"(?<=Company:)[^,]*", cell_text)
        if not company_name:
            # Nothing recognisable: keep the raw cell text and count it.
            company_name = cell_text
            na_count += 1
    else:
        company_name = company_el.text()

    # Escape the company name so regex metacharacters in it ("Inc.", "(UK)",
    # "A+") are matched literally and keep the look-behind fixed-width.
    location_regex = r"(?<={0}).+?(?=\n|$|Find the Best Contacts)".format(
        re.escape(company_name))
    location = first_match(location_regex, cell_text)
    # A missing location becomes "" rather than a failure inside re.sub.
    location = re.sub(r"^,\s*", "", location or "")

    opportunity = first_match(r"(?<=Opportunity:).+?(?=\n|$|View Details)",
                              cell_text)

    opp_link_el = cell_el.select("a:contains('Details')")
    if opp_link_el:
        opportunity_link = opp_link_el.attr("href")
    else:
        opportunity_link = "NA"
        na_count += 1

    fields = {
        "Company": "Accounts::::" + company_name,
        "Location": location,
        "Opportunity": opportunity,
        "Opportunity_link": opportunity_link,
    }
    return fields, na_count


def process_page(saved_page):
    """Convert a saved results page into a list of row dictionaries.

    The file ``rainking_out_path + saved_page`` is opened through ``Node``;
    each ``tbody > tr`` row of the results table yields one dict keyed by the
    column headers found in ``thead th``.

    Args:
        saved_page: File name of the saved page; the filter name and page
            index stored in every entry are extracted from it by regex.

    Returns:
        A list of dicts, one per row. Each contains ``"filter_name"`` and
        ``"page#"``. Columns headed ``""`` or ``"Status"`` are omitted, and an
        ``"Inside Scoop"`` cell is expanded into ``"Company"``,
        ``"Location"``, ``"Opportunity"`` and ``"Opportunity_link"``.
    """
    filter_name = first_match(r"(?<=_)\w+(?=_)", saved_page)
    i_page = first_match(r"(?<=_)\d*(?=\.)", saved_page)  # i_ prefix means "index"

    node = Node(rainking_out_path + saved_page)  # utils wrapper class
    table_el = node.select("#searchForm .grid-content.expanding-content > table")
    column_names = [th.text() for th in table_el.select_list("thead th")]
    row_els = table_el.select_list("tbody > tr")
    page_entries = []
    na_cells = 0  # we will set na for cells with exceptions. It is important for unstructured "Inside scoops" column

    for row_el in row_els:
        entry = {"filter_name": filter_name, "page#": i_page}
        cell_els = row_el.select_list("td")
        for i_col, column in enumerate(column_names):
            if column in ["", "Status"]:
                continue
            if column == "Company":
                entry[column] = "Accounts::::" + cell_els[i_col].text()
            elif column == "Inside Scoop":
                scoop_fields, scoop_na = _parse_inside_scoop(cell_els[i_col])
                entry.update(scoop_fields)
                na_cells += scoop_na
            else:
                entry[column] = cell_els[i_col].text()
        page_entries.append(entry)
    if na_cells:
        print("--", str(na_cells))
    return page_entries
Esempio n. 3
0
 def select(self, query, is_browser_css_query=True):
     """Return the first element matching ``query``, wrapped as a node.

     When the instance has a ``browser`` attribute the lookup goes through
     ``self.el.find_element``; a simple ``tag:contains('text')`` or
     ``.cls:contains('text')`` query is rewritten to XPath first. Otherwise
     the query is compiled with ``CSSSelector`` and applied to ``self.el``.

     Args:
         query: CSS selector, or an XPath expression when
             ``is_browser_css_query`` is false (browser mode only).
         is_browser_css_query: In browser mode, whether ``query`` is CSS
             (default) or XPath. Ignored for ``:contains`` queries.

     Returns:
         ``self.__create_node__(element)``, where ``element`` is ``None`` when
         ``self.el`` is ``None`` or nothing was found.

     Raises:
         ValueError: In browser mode, if a ``:contains`` query is too complex
             to be rewritten (descendant/child combinators, other
             pseudo-classes, or a ``tag.class`` prefix).
     """
     # "{0}:contains('{1}')" "//{0}[contains(., '{1}')]"
     if self.el is None:
         return self.__create_node__(None)

     if not hasattr(self, 'browser'):
         selector = CSSSelector(query)
         result_els = __safe_execution__(selector,
                                         self.el)  # check for None
         result_el = result_els[0] if result_els else None
         return self.__create_node__(result_el)

     if ":contains" in query:
         # "h1:contains('job is private')""
         possible_tag = re.sub(r":contains\(.+?\)", "", query)
         is_simple = not any(ch in possible_tag for ch in (" ", ":", ">"))
         is_class_only = possible_tag.startswith(".")
         if not is_simple or (not is_class_only and "." in possible_tag):
             raise ValueError("could not rewrite ':contains' in xpath")
         contains_text = first_match(r"(?<=:contains\(.)[^'\"]+", query)
         by = By.XPATH
         # The leading "." keeps the search inside self.el; a bare "//"
         # would make find_element scan the whole document.
         if is_class_only:
             # Match the class as a whitespace-separated token, like the CSS
             # ".cls" it replaces, not as the exact attribute value.
             query = (".//*[contains(concat(' ', normalize-space(@class), ' '),"
                      " ' {0} ') and contains(., '{1}')]").format(
                          possible_tag[1:], contains_text)
         else:
             # A bare ":contains(...)" has no tag; "*" keeps the XPath valid.
             query = ".//{0}[contains(., '{1}')]".format(
                 possible_tag or "*", contains_text)
     else:
         by = By.CSS_SELECTOR if is_browser_css_query else By.XPATH
     result_el = __safe_execution__(self.el.find_element, by, query)
     return self.__create_node__(result_el)
Esempio n. 4
0
 def number(self, pattern=None, prec=0, attr_name=None):
     """Read a number from this node's text or from one of its attributes.

     Args:
         pattern: Optional regex; when given, only its first match in the
             source string is parsed.
         prec: Passed through to ``parse_number`` as its second argument.
         attr_name: When given, the value is read via ``self.attr(attr_name)``
             instead of ``self.text``.

     Returns:
         The result of ``parse_number`` for the extracted string, or ``None``
         when that string is empty or missing.
     """
     if not attr_name:
         raw_value = self.text(pattern)
     elif pattern:
         raw_value = first_match(pattern, self.attr(attr_name))
     else:
         raw_value = self.attr(attr_name)

     if not raw_value:
         return None
     return parse_number(raw_value, prec)
Esempio n. 5
0
 def text(self, pattern=None, is_replacement=False, safe=True):
     """Return the text of the wrapped element, optionally filtered by regex.

     Args:
         pattern: Optional regex applied to the element text.
         is_replacement: With a pattern, remove every match from the text
             when true; otherwise return the ``first_match`` of the pattern.
         safe: When true, a node without an element yields ``""`` instead of
             failing on the attribute access.

     Returns:
         The resulting text, returned as-is when the instance has a
         ``browser`` attribute and passed through ``clear_text`` otherwise.
     """
     if self.el is None and safe:
         return ""

     raw_text = self.el.text  # coincidence of selenium and lxml
     if not pattern:
         result = raw_text
     elif is_replacement:
         result = re.sub(pattern, "", raw_text)
     else:
         result = first_match(pattern, raw_text)

     if hasattr(self, 'browser'):
         return result
     return clear_text(result)
Esempio n. 6
0
 def date(self, need_parse=True, pattern=None, attr_name=None):
     """Read a date from this node's text or from one of its attributes.

     Args:
         need_parse: When true the extracted string goes through
             ``parse_date``; otherwise it is converted with ``parse_number``
             and handed to ``time.gmtime``.
         pattern: Optional regex; when given, only its first match in the
             source string is used.
         attr_name: When given, the value is read via ``self.attr(attr_name)``
             instead of ``self.text``.

     Returns:
         The parsed date, or ``None`` when the extracted string is empty or
         missing.
     """
     if not attr_name:
         raw_value = self.text(pattern)
     elif pattern:
         raw_value = first_match(pattern, self.attr(attr_name))
     else:
         raw_value = self.attr(attr_name)

     if not raw_value:
         return None
     if need_parse:
         return parse_date(raw_value)
     return time.gmtime(parse_number(raw_value))
Esempio n. 7
0
def get_api_token():
    """Run the Upwork OAuth authorization flow in a browser and return the token pair.

    Credentials come from the environment: ``upkey`` / ``upsecret`` for the
    API client, ``email`` and ``up`` for the login form. The authorize URL is
    opened through ``Node(authorize_url, "url", "chrome")``, the login form is
    filled in and submitted, and the ``oauth_verifier`` value found in the
    text of ``#main > div`` is exchanged for the access token.

    Returns:
        An ``(oauth_token, oauth_token_secret)`` tuple of UTF-8 decoded
        strings.

    Raises:
        KeyError: If ``upkey`` or ``upsecret`` is not set in the environment.
    """
    client = upwork.Client(os.environ["upkey"], os.environ["upsecret"])
    authorize_url = client.auth.get_authorize_url()
    doc = Node(authorize_url, "url", "chrome")
    print("Navigating authorize url...")
    doc.select("#login_username").send_keys(os.getenv("email"))
    doc.select("#login_password").send_keys(os.getenv("up") + "#u")
    doc.select("#layout form").submit()

    print("Navigating token url...")
    verifier_el = doc.select("#main > div")
    # Node.text is a method; call it so the regex runs on the element text
    # rather than on the bound method object.
    verifier = first_match("(?<=oauth_verifier=).+", verifier_el.text())

    oauth_token, oauth_token_secret = client.auth.get_token(verifier)
    oauth_token = oauth_token.decode("utf-8")
    oauth_token_secret = oauth_token_secret.decode("utf-8")
    # NOTE(review): this writes the token and its secret to stdout; confirm
    # that is acceptable wherever this output is captured or logged.
    print(oauth_token, oauth_token_secret)
    return oauth_token, oauth_token_secret