def process_page(saved_page):
    """Parse one saved result page into a list of per-row dictionaries.

    The page is loaded from ``rainking_out_path + saved_page`` and the table
    under ``#searchForm`` is walked row by row. Every row becomes a dict that
    starts with ``filter_name`` and ``page#`` (both extracted from the file
    name) and gets one key per table column. Columns named ``""`` or
    ``"Status"`` are skipped; ``"Company"`` values are prefixed with
    ``"Accounts::::"``; the unstructured ``"Inside Scoop"`` column is split
    into ``Company``, ``Location``, ``Opportunity`` and ``Opportunity_link``.

    Args:
        saved_page: File name of the saved page. The filter name is the
            ``\\w+`` run enclosed by underscores and the page index is the
            digit run between an underscore and a dot (presumably names like
            ``<prefix>_<filter>_<page>.html`` -- confirm against the writer).

    Returns:
        A list with one dict per ``tbody`` row.

    Side effects:
        Prints ``-- <count>`` when at least one cell had to fall back to a
        placeholder value.
    """
    filter_name = first_match(r"(?<=_)\w+(?=_)", saved_page)
    i_page = first_match(r"(?<=_)\d*(?=\.)", saved_page)  # i_ prefix means "index"
    node = Node(rainking_out_path + saved_page)  # utils wrapper class
    table_el = node.select("#searchForm .grid-content.expanding-content > table")
    column_names = [th.text() for th in table_el.select_list("thead th")]
    page_entries = []
    # Counts cells that needed a fallback value. This matters mostly for the
    # unstructured "Inside Scoop" column.
    na_cells = 0
    for row_el in table_el.select_list("tbody > tr"):
        entry = {"filter_name": filter_name, "page#": i_page}
        cell_els = row_el.select_list("td")
        for i_col, column in enumerate(column_names):
            if column in ("", "Status"):
                continue
            cell_el = cell_els[i_col]
            if column == "Company":
                entry[column] = "Accounts::::" + cell_el.text()
            elif column == "Inside Scoop":
                # Example cell text the regexes below are written for:
                #   Topic: Security, Staffing.
                #   Company: Virgin Media Inc., New York, NY Find the Best Contacts
                #   Opportunity: Seeking a Security Data Engineer (England,Peterborough,PE3).
                #   View Details
                cur_cell_text = cell_el.text()
                company_el = cell_el.select("a")
                # select() may hand back an empty wrapper instead of None, so
                # decide on the extracted text rather than on the wrapper.
                company_name = company_el.text() if company_el is not None else ""
                if not company_name:
                    company_name = first_match(r"(?<=Company:)[^,]*", cur_cell_text)
                if not company_name:
                    company_name = cur_cell_text
                    na_cells += 1
                entry["Company"] = "Accounts::::" + company_name
                # The company name is data, not a pattern: escape it so that
                # ".", "(", "+" etc. are matched literally in the lookbehind.
                location_regex = r"(?<={0}).+?(?=\n|$|Find the Best Contacts)".format(
                    re.escape(company_name))
                location = first_match(location_regex, cur_cell_text)
                # No match must not crash re.sub; store an empty location.
                entry["Location"] = re.sub(r"^,\s*", "", location or "")
                entry["Opportunity"] = first_match(
                    r"(?<=Opportunity:).+?(?=\n|$|View Details)", cur_cell_text)
                opp_link_el = cell_el.select("a:contains('Details')")
                opp_link = opp_link_el.attr("href") if opp_link_el else None
                if opp_link:
                    entry["Opportunity_link"] = opp_link
                else:
                    entry["Opportunity_link"] = "NA"
                    na_cells += 1
            else:
                entry[column] = cell_el.text()
        page_entries.append(entry)
    if na_cells:
        print("--", str(na_cells))
    return page_entries
def process_page(saved_page):
    """Parse one saved result page into a list of per-row dictionaries.

    The page is loaded from ``rainking_out_path + saved_page`` and the table
    under ``#searchForm`` is walked row by row. Every row becomes a dict that
    starts with ``filter_name`` and ``page#`` (both extracted from the file
    name) and gets one key per table column. Columns named ``""`` or
    ``"Status"`` are skipped; ``"Company"`` values are prefixed with
    ``"Accounts::::"``; the unstructured ``"Inside Scoop"`` column is split
    into ``Company``, ``Location``, ``Opportunity`` and ``Opportunity_link``.

    Args:
        saved_page: File name of the saved page. The filter name is the
            ``\\w+`` run enclosed by underscores and the page index is the
            digit run between an underscore and a dot (presumably names like
            ``<prefix>_<filter>_<page>.html`` -- confirm against the writer).

    Returns:
        A list with one dict per ``tbody`` row.

    Side effects:
        Prints ``-- <count>`` when at least one cell had to fall back to a
        placeholder value.
    """
    filter_name = first_match(r"(?<=_)\w+(?=_)", saved_page)
    i_page = first_match(r"(?<=_)\d*(?=\.)", saved_page)  # i_ prefix means "index"
    node = Node(rainking_out_path + saved_page)  # utils wrapper class
    table_el = node.select("#searchForm .grid-content.expanding-content > table")
    column_names = [th.text() for th in table_el.select_list("thead th")]
    page_entries = []
    # Counts cells that needed a fallback value. This matters mostly for the
    # unstructured "Inside Scoop" column.
    na_cells = 0
    for row_el in table_el.select_list("tbody > tr"):
        entry = {"filter_name": filter_name, "page#": i_page}
        cell_els = row_el.select_list("td")
        for i_col, column in enumerate(column_names):
            if column in ("", "Status"):
                continue
            cell_el = cell_els[i_col]
            if column == "Company":
                entry[column] = "Accounts::::" + cell_el.text()
            elif column == "Inside Scoop":
                # Example cell text the regexes below are written for:
                #   Topic: Security, Staffing.
                #   Company: Virgin Media Inc., New York, NY Find the Best Contacts
                #   Opportunity: Seeking a Security Data Engineer (England,Peterborough,PE3).
                #   View Details
                cur_cell_text = cell_el.text()
                company_el = cell_el.select("a")
                # select() may hand back an empty wrapper instead of None, so
                # decide on the extracted text rather than on the wrapper.
                company_name = company_el.text() if company_el is not None else ""
                if not company_name:
                    company_name = first_match(r"(?<=Company:)[^,]*", cur_cell_text)
                if not company_name:
                    company_name = cur_cell_text
                    na_cells += 1
                entry["Company"] = "Accounts::::" + company_name
                # The company name is data, not a pattern: escape it so that
                # ".", "(", "+" etc. are matched literally in the lookbehind.
                location_regex = r"(?<={0}).+?(?=\n|$|Find the Best Contacts)".format(
                    re.escape(company_name))
                location = first_match(location_regex, cur_cell_text)
                # No match must not crash re.sub; store an empty location.
                entry["Location"] = re.sub(r"^,\s*", "", location or "")
                entry["Opportunity"] = first_match(
                    r"(?<=Opportunity:).+?(?=\n|$|View Details)", cur_cell_text)
                opp_link_el = cell_el.select("a:contains('Details')")
                opp_link = opp_link_el.attr("href") if opp_link_el else None
                if opp_link:
                    entry["Opportunity_link"] = opp_link
                else:
                    entry["Opportunity_link"] = "NA"
                    na_cells += 1
            else:
                entry[column] = cell_el.text()
        page_entries.append(entry)
    if na_cells:
        print("--", str(na_cells))
    return page_entries
def select(self, query, is_browser_css_query=True):
    """Return the first element matching *query*, wrapped in a new node.

    When this node has a ``browser`` attribute the lookup goes through
    ``self.el.find_element``. A CSS query containing the non-standard
    ``:contains('text')`` pseudo-class is rewritten to XPath first:

        "{0}:contains('{1}')"   ->  "//{0}[contains(., '{1}')]"
        ".{0}:contains('{1}')"  ->  "//*[@class='{0}' and contains(., '{1}')]"

    Without a ``browser`` attribute the query is compiled with
    ``CSSSelector`` and applied to ``self.el``.

    Args:
        query: CSS selector, or an XPath expression when
            ``is_browser_css_query`` is false.
        is_browser_css_query: Browser mode only; selects ``By.CSS_SELECTOR``
            (default) or ``By.XPATH`` for queries without ``:contains``.

    Returns:
        ``self.__create_node__(element)``; the element is ``None`` when this
        node is empty or nothing matched.

    Raises:
        ValueError: In browser mode, if the part of the query before
            ``:contains`` is not a bare tag name or a single ``.class``.
    """
    if self.el is None:
        return self.__create_node__(None)

    if not hasattr(self, 'browser'):
        selector = CSSSelector(query)
        # The helper's result is checked for None/empty before indexing.
        result_els = __safe_execution__(selector, self.el)
        return self.__create_node__(result_els[0] if result_els else None)

    if ":contains" in query:  # e.g. "h1:contains('job is private')"
        possible_tag = re.sub(r":contains\(.+?\)", "", query)
        is_class_query = possible_tag.startswith(".")
        has_combinator = any(ch in possible_tag for ch in " :>")
        # Only "tag" and ".class" prefixes can be translated.
        if has_combinator or ("." in possible_tag and not is_class_query):
            raise ValueError("could not rewrite ':contains' in xpath")
        contains_text = first_match(r"(?<=:contains\(.)[^'\"]+", query)
        by = By.XPATH
        if is_class_query:
            query = "//*[@class='{0}' and contains(., '{1}')]".format(
                possible_tag[1:], contains_text)
        else:
            query = "//{0}[contains(., '{1}')]".format(possible_tag, contains_text)
    else:
        by = By.CSS_SELECTOR if is_browser_css_query else By.XPATH

    result_el = __safe_execution__(self.el.find_element, by, query)
    return self.__create_node__(result_el)
def number(self, pattern=None, prec=0, attr_name=None):
    """Extract a number from this node's text or from one of its attributes.

    Args:
        pattern: Optional regex. For an attribute it is applied with
            ``first_match``; for text it is forwarded to ``self.text``.
        prec: Forwarded unchanged to ``parse_number`` as its second argument.
        attr_name: When truthy, read this attribute instead of the text.

    Returns:
        ``parse_number(raw, prec)``, or ``None`` when the raw value is falsy.
    """
    if not attr_name:
        raw_value = self.text(pattern)
    else:
        raw_value = self.attr(attr_name)
        if pattern:
            raw_value = first_match(pattern, raw_value)

    if not raw_value:
        return None
    return parse_number(raw_value, prec)
def text(self, pattern=None, is_replacement=False, safe=True):
    """Return the text of the wrapped element, optionally filtered by a regex.

    Args:
        pattern: Optional regex applied to the element text.
        is_replacement: With a pattern, ``True`` removes every match from the
            text; ``False`` returns ``first_match(pattern, text)`` instead.
        safe: When true, an empty node (``self.el is None``) yields ``""``
            instead of failing on the attribute access.

    Returns:
        The resulting text. It is returned as is when this node has a
        ``browser`` attribute and is passed through ``clear_text`` otherwise.
    """
    if self.el is None and safe:
        return ""

    # Both backends expose the content as ``.text`` (selenium and lxml).
    raw_text = self.el.text
    if not pattern:
        result = raw_text
    elif is_replacement:
        result = re.sub(pattern, "", raw_text)
    else:
        result = first_match(pattern, raw_text)

    if hasattr(self, 'browser'):
        return result
    return clear_text(result)
def date(self, need_parse=True, pattern=None, attr_name=None):
    """Extract a date from this node's text or from one of its attributes.

    Args:
        need_parse: When true the raw string goes through ``parse_date``.
            Otherwise it is converted with ``parse_number`` and the result is
            handed to ``time.gmtime``.
        pattern: Optional regex. For an attribute it is applied with
            ``first_match``; for text it is forwarded to ``self.text``.
        attr_name: When truthy, read this attribute instead of the text.

    Returns:
        The parsed date, or ``None`` when the raw value is falsy.
    """
    if not attr_name:
        raw_value = self.text(pattern)
    else:
        raw_value = self.attr(attr_name)
        if pattern:
            raw_value = first_match(pattern, raw_value)

    if not raw_value:
        return None
    if need_parse:
        return parse_date(raw_value)
    return time.gmtime(parse_number(raw_value))
def get_api_token():
    """Run the Upwork OAuth login flow in a browser and return the tokens.

    Credentials come from the environment: ``upkey`` / ``upsecret`` for the
    API client, ``email`` and ``up`` for the login form (``"#u"`` is appended
    to the ``up`` value before it is typed in).

    Returns:
        Tuple ``(oauth_token, oauth_token_secret)``, both decoded as UTF-8.

    Raises:
        KeyError: If ``upkey`` or ``upsecret`` is not set.
    """
    client = upwork.Client(os.environ["upkey"], os.environ["upsecret"])
    authorize_url = client.auth.get_authorize_url()
    doc = Node(authorize_url, "url", "chrome")

    print("Navigating authorize url...")
    doc.select("#login_username").send_keys(os.getenv("email"))
    doc.select("#login_password").send_keys(os.getenv("up") + "#u")
    doc.select("#layout form").submit()

    print("Navigating token url...")
    verifier_el = doc.select("#main > div")
    # text is a method on the node wrapper; it must be called so that the
    # regex receives the string rather than the bound method.
    verifier = first_match("(?<=oauth_verifier=).+", verifier_el.text())

    oauth_token, oauth_token_secret = client.auth.get_token(verifier)
    oauth_token = oauth_token.decode("utf-8")
    oauth_token_secret = oauth_token_secret.decode("utf-8")
    # NOTE(review): this writes the secrets to stdout. Kept because callers
    # may rely on the output, but confirm it is intended.
    print(oauth_token, oauth_token_secret)
    return oauth_token, oauth_token_secret