def __nodes_containing_attrs(self, root="", dict_entries=None, elem="", attr=None, lang='en'):
    ''' Return the nodes under `root` that have an attribute value matching
    one of the given dictionary entries.

    The search runs in two passes: an XPath query selects every `elem`
    descendant of `root` with any attribute containing one of the dictionary
    words (lower-cased or title-cased), then each hit is confirmed against
    the regex built by `regexofdictentries`.

    root         -- XPath prefix the search starts from ("" means the whole tree)
    dict_entries -- keys looked up in `lang_dicts` (for `lang`, plus English
                    when `lang` is not 'en')
    elem         -- element name to match; empty or None means any element
    attr         -- accepted for interface compatibility only; the query
                    always inspects all attributes ("@*")
    lang         -- language code used to pick the word dictionary

    Returns None when `dict_entries` is empty, otherwise a list of nodes
    (possibly empty).
    '''
    if not dict_entries:
        return None
    tree = self.page_dict.get("tree")
    # The default is "" (not None), so test truthiness: an empty element name
    # would otherwise produce the invalid expression "//[...]".
    if not elem:
        elem = "*"

    # Tolerate a language (or an entry) missing from the dictionaries
    # instead of failing on None.
    lang_dict = lang_dicts.get(lang) or {}
    en_dict = lang_dicts.get('en') or {}
    words = set()
    for entry in dict_entries:
        words.update(lang_dict.get(entry) or ())
        if lang != 'en':
            words.update(en_dict.get(entry) or ())
    if not words:
        # Nothing to search for; an empty predicate would not be valid XPath.
        return []

    def quote(text):
        # XPath 1.0 string literals have no escape syntax, so delimit with
        # whichever quote character the text does not contain.
        # NOTE(review): a word containing both quote kinds is still unsupported.
        if "'" in text:
            return '"%s"' % text
        return "'%s'" % text

    # Sorted only to make the generated expression deterministic.
    forms = sorted(set(form for word in words
                       for form in (word.lower(), word.title())))
    predicate = " or ".join("contains(.,%s)" % quote(form) for form in forms)
    xpath = "%s//%s[@*[%s]]" % (root, elem, predicate)

    nodes = tree.xpath(xpath, namespaces=namespaces)
    if not nodes:
        return nodes
    # Second pass: keep nodes with at least one attribute value matching the
    # dictionary regex.
    pattern = regexofdictentries(dict_entries, lang=lang)
    return [node for node in nodes
            if any(re.search(pattern, value) for value in node.values())]
def find(self,params):
    ''' Count occurrences of the "PRICE" dictionary words in the page's text
    nodes and attribute values.

    Text nodes and attribute values are taken from `self.page_dict` when
    already cached there, otherwise extracted from the tree and cached.
    Sets `self.features` to [text matches, attribute matches] and
    `self.certainty` to 1 when the two counts sum to more than 10, else 0.
    Returns the certainty. `params` is not used.
    '''
    cache = self.page_dict
    tree = cache.get("tree")
    lang = cache.get("lang")

    # text strings of the page (reused from the cache when present)
    if "text_nodes" in cache:
        texts, _paths = cache["text_nodes"], cache["text_nodes_paths"]
    else:
        texts, _paths = tree2textnodeslist(tree)
        cache["text_nodes"] = texts
        cache["text_nodes_paths"] = _paths

    # attribute values of the page (reused from the cache when present)
    if "attr_vals" in cache:
        values = cache["attr_vals"]
    else:
        values = tree2attributevalslist(tree)
        cache["attr_vals"] = values

    pattern = regexofdictentries(["PRICE"], lang)

    text_hits = sum(1 for text in texts if re.search(pattern, text))
    attr_hits = sum(1 for value in values if re.search(pattern, value))

    self.features = [text_hits, attr_hits]
    self.certainty = 1 if text_hits + attr_hits > 10 else 0
    return self.certainty
def find(self, params):
    ''' Count occurrences of the "PRICE" dictionary words in the page's text
    nodes and attribute values.

    Text nodes and attribute values are taken from `self.page_dict` when
    already cached there, otherwise extracted from the tree and cached.
    Sets `self.features` to [text matches, attribute matches] and
    `self.certainty` to 1 when the two counts sum to more than 10, else 0.
    Returns the certainty. `params` is not used.
    '''
    cache = self.page_dict
    tree = cache.get("tree")
    lang = cache.get("lang")

    # text strings of the page (reused from the cache when present)
    if "text_nodes" in cache:
        texts, _paths = cache["text_nodes"], cache["text_nodes_paths"]
    else:
        texts, _paths = tree2textnodeslist(tree)
        cache["text_nodes"] = texts
        cache["text_nodes_paths"] = _paths

    # attribute values of the page (reused from the cache when present)
    if "attr_vals" in cache:
        values = cache["attr_vals"]
    else:
        values = tree2attributevalslist(tree)
        cache["attr_vals"] = values

    pattern = regexofdictentries(["PRICE"], lang)

    text_hits = sum(1 for text in texts if re.search(pattern, text))
    attr_hits = sum(1 for value in values if re.search(pattern, value))

    self.features = [text_hits, attr_hits]
    self.certainty = 1 if text_hits + attr_hits > 10 else 0
    return self.certainty
def __nodes_containing_attrs(self,root="",dict_entries=None,elem="",attr=None,lang='en'):
    ''' Return the nodes under `root` that have an attribute value matching
    one of the given dictionary entries.

    The search runs in two passes: an XPath query selects every `elem`
    descendant of `root` with any attribute containing one of the dictionary
    words (lower-cased or title-cased), then each hit is confirmed against
    the regex built by `regexofdictentries`.

    root         -- XPath prefix the search starts from ("" means the whole tree)
    dict_entries -- keys looked up in `lang_dicts` (for `lang`, plus English
                    when `lang` is not 'en')
    elem         -- element name to match; empty or None means any element
    attr         -- accepted for interface compatibility only; the query
                    always inspects all attributes ("@*")
    lang         -- language code used to pick the word dictionary

    Returns None when `dict_entries` is empty, otherwise a list of nodes
    (possibly empty).
    '''
    if not dict_entries:
        return None
    tree = self.page_dict.get("tree")
    # The default is "" (not None), so test truthiness: an empty element name
    # would otherwise produce the invalid expression "//[...]".
    if not elem:
        elem = "*"

    # Tolerate a language (or an entry) missing from the dictionaries
    # instead of failing on None.
    lang_dict = lang_dicts.get(lang) or {}
    en_dict = lang_dicts.get('en') or {}
    words = set()
    for entry in dict_entries:
        words.update(lang_dict.get(entry) or ())
        if lang != 'en':
            words.update(en_dict.get(entry) or ())
    if not words:
        # Nothing to search for; an empty predicate would not be valid XPath.
        return []

    def quote(text):
        # XPath 1.0 string literals have no escape syntax, so delimit with
        # whichever quote character the text does not contain.
        # NOTE(review): a word containing both quote kinds is still unsupported.
        if "'" in text:
            return '"%s"' % text
        return "'%s'" % text

    # Sorted only to make the generated expression deterministic.
    forms = sorted(set(form for word in words
                       for form in (word.lower(), word.title())))
    predicate = " or ".join("contains(.,%s)" % quote(form) for form in forms)
    xpath = "%s//%s[@*[%s]]" % (root, elem, predicate)

    nodes = tree.xpath(xpath, namespaces=namespaces)
    if not nodes:
        return nodes
    # Second pass: keep nodes with at least one attribute value matching the
    # dictionary regex.
    pattern = regexofdictentries(dict_entries, lang=lang)
    return [node for node in nodes
            if any(re.search(pattern, value) for value in node.values())]
def find(self, params):
    ''' Count anchor text nodes that match the "WISHLIST" dictionary words.

    Anchor texts are taken from `self.page_dict` when already cached there,
    otherwise extracted with `tree2textnodeslist(tree, element="a")` and
    cached. Sets `self.features` to [match count] and `self.certainty` to
    1 for more than one match, 0.5 for exactly one, 0 for none.
    Returns the certainty. `params` is not used.
    '''
    cache = self.page_dict
    tree = cache.get("tree")
    lang = cache.get("lang")

    # text strings found inside anchors (reused from the cache when present)
    if "a_text_nodes" in cache:
        anchor_texts, _paths = cache["a_text_nodes"], cache["a_text_nodes_paths"]
    else:
        anchor_texts, _paths = tree2textnodeslist(tree, element="a")
        cache["a_text_nodes"] = anchor_texts
        cache["a_text_nodes_paths"] = _paths

    pattern = regexofdictentries(["WISHLIST"], lang)
    hits = sum(1 for text in anchor_texts if re.search(pattern, text))

    self.features = [hits]
    if hits > 1:
        certainty = 1
    elif hits == 1:
        certainty = 0.5
    else:
        certainty = 0
    self.certainty = certainty
    return certainty
def find(self, params):
    ''' Count anchor text nodes that mention shipping or returns.

    Anchor texts are taken from `self.page_dict` when already cached there,
    otherwise extracted with `tree2textnodeslist(tree, element="a")` and
    cached. Sets `self.features` to [match count] and `self.certainty` to
    1 for more than three matches, 0.5 for two or three, 0 otherwise.
    Returns the certainty. `params` is not used.
    '''
    cache = self.page_dict
    tree = cache.get("tree")
    lang = cache.get("lang")

    # The anchor-only extraction is cached under the "a_text_nodes" keys.
    # Storing it under "text_nodes" would collide with the cache slot other
    # code fills with a plain tree2textnodeslist(tree) call, making the count
    # depend on which finder ran first.
    if "a_text_nodes" in cache:
        anchor_texts = cache["a_text_nodes"]
    else:
        anchor_texts, anchor_paths = tree2textnodeslist(tree, element="a")
        cache["a_text_nodes"] = anchor_texts
        cache["a_text_nodes_paths"] = anchor_paths

    pattern = regexofdictentries(["SHIPPING", "RETURNS"], lang)
    hits = sum(1 for text in anchor_texts if re.search(pattern, text))

    self.features = [hits]
    if hits > 3:
        certainty = 1
    elif hits > 1:
        certainty = 0.5
    else:
        certainty = 0
    self.certainty = certainty
    return certainty
def find(self,params):
    ''' Count anchor text nodes that match the "WISHLIST" dictionary words.

    Anchor texts are taken from `self.page_dict` when already cached there,
    otherwise extracted with `tree2textnodeslist(tree, element="a")` and
    cached. Sets `self.features` to [match count] and `self.certainty` to
    1 for more than one match, 0.5 for exactly one, 0 for none.
    Returns the certainty. `params` is not used.
    '''
    cache = self.page_dict
    tree = cache.get("tree")
    lang = cache.get("lang")

    # text strings found inside anchors (reused from the cache when present)
    if "a_text_nodes" in cache:
        anchor_texts, _paths = cache["a_text_nodes"], cache["a_text_nodes_paths"]
    else:
        anchor_texts, _paths = tree2textnodeslist(tree,element="a")
        cache["a_text_nodes"] = anchor_texts
        cache["a_text_nodes_paths"] = _paths

    pattern = regexofdictentries(["WISHLIST"], lang)
    hits = sum(1 for text in anchor_texts if re.search(pattern, text))

    self.features = [hits]
    if hits > 1:
        certainty = 1
    elif hits == 1:
        certainty = 0.5
    else:
        certainty = 0
    self.certainty = certainty
    return certainty
def find(self,params):
    ''' Count anchor text nodes that mention shipping or returns.

    Anchor texts are taken from `self.page_dict` when already cached there,
    otherwise extracted with `tree2textnodeslist(tree, element="a")` and
    cached. Sets `self.features` to [match count] and `self.certainty` to
    1 for more than three matches, 0.5 for two or three, 0 otherwise.
    Returns the certainty. `params` is not used.
    '''
    cache = self.page_dict
    tree = cache.get("tree")
    lang = cache.get("lang")

    # The anchor-only extraction is cached under the "a_text_nodes" keys.
    # Storing it under "text_nodes" would collide with the cache slot other
    # code fills with a plain tree2textnodeslist(tree) call, making the count
    # depend on which finder ran first.
    if "a_text_nodes" in cache:
        anchor_texts = cache["a_text_nodes"]
    else:
        anchor_texts, anchor_paths = tree2textnodeslist(tree, element="a")
        cache["a_text_nodes"] = anchor_texts
        cache["a_text_nodes_paths"] = anchor_paths

    pattern = regexofdictentries(["SHIPPING", "RETURNS"], lang)
    hits = sum(1 for text in anchor_texts if re.search(pattern, text))

    self.features = [hits]
    if hits > 3:
        certainty = 1
    elif hits > 1:
        certainty = 0.5
    else:
        certainty = 0
    self.certainty = certainty
    return certainty
def find(self,params):
    ''' Pick the node most likely to be the shopping bag / cart link.

    Candidates are the nodes whose attributes match the "ECOM_CART" or
    "ECOM_CHECKOUT" dictionary entries. They are filtered (steps 2a-2e),
    scored (steps 3-10) and the highest scoring one is kept.

    Sets `self.features` to [best score], `self.nodepath` to the best
    candidate's path (None when its score is below 2 or there is no
    candidate) and `self.certainty` to score / 5 capped at 1.0 (0 when there
    is no candidate). Returns `self.nodepath`. `params` is not used.
    '''
    tree = self.page_dict.get("tree")
    lang = self.page_dict.get("lang")
    cart_entries = ["ECOM_CART", "ECOM_CHECKOUT"]

    # 1. Find all nodes with matching attributes
    nodes = self.__nodes_containing_attrs(elem='*', lang=lang, dict_entries=cart_entries)
    candidates = []
    for node in nodes or []:
        cand = BagLinkCandidate()
        cand.node = node
        cand.nodepath = tree.getpath(node)
        cand.points = 0
        candidates.append(cand)

    # 2a. Remove ones that have descendants five levels down
    candidates = [cand for cand in candidates
                  if not tree.xpath(cand.nodepath + "/*/*/*/*/*")]
    # 2b. Keep only ones with fewer than 20 descendant elements
    candidates = [cand for cand in candidates
                  if tree.xpath("count(" + cand.nodepath + "//*)") < 20]
    # 2c. Remove the ones that look like add-to-cart buttons
    addclassif = AddToBasketButtonClassifier()
    candidates = [cand for cand in candidates
                  if not addclassif.classifygivennode(self.page_dict, cand.nodepath)]
    # 2d. Keep only ones with fewer than 10 words in the text nodes inside
    candidates = [cand for cand in candidates
                  if sum(len(text.split())
                         for text in tree.xpath(cand.nodepath + "//text()")) < 10]
    # 2e. Remove ones that are children of others found
    candidates = [cand for cand in candidates
                  if self.__isbagroot(cand, candidates)]

    if not candidates:
        # Reset every output so a previous run cannot leak a stale path or
        # certainty into this result.
        self.features = [0]
        self.nodepath = None
        self.certainty = 0
        return self.nodepath

    # The text patterns do not depend on the candidate, so build them once.
    text_patterns = (
        # 8. descendants with cart/checkout text
        regexofdictentries(entries=cart_entries, lang=lang),
        # 9. descendants with item text
        regexofdictentries(entries=["ECOM_ITEM"], lang=lang),
        # 10. descendants with digits in the text
        r"\d+",
    )

    for cand in candidates:
        # 3. If element is anchor, add points
        if cand.node.tag == 'a':
            cand.points += points[0]
        # 4. If element has anchor descendants, add points
        if tree.xpath(cand.nodepath + "//a", namespaces=namespaces):
            cand.points += points[1]
        # 5. If element has image descendants, add points
        if tree.xpath(cand.nodepath + "//img", namespaces=namespaces):
            cand.points += points[2]
        # 6. If element has descendants with attrs, add points
        if self.__nodes_containing_attrs(root=cand.nodepath, elem="*",
                                         lang=lang, dict_entries=cart_entries):
            cand.points += points[3]
        # 7. If element has anchor descendants with attrs, add points
        if self.__nodes_containing_attrs(root=cand.nodepath, elem="a",
                                         lang=lang, dict_entries=cart_entries):
            cand.points += points[4]

        # 8-10. One point per text pattern found in any non-blank text node
        texts = [text.strip() for text in tree.xpath(cand.nodepath + "//text()")
                 if text.strip()]
        for pattern in text_patterns:
            if any(re.search(pattern, text) for text in texts):
                cand.points += 1

    # max() returns the first highest-scoring candidate, the same one a
    # stable descending sort would put first.
    best = max(candidates, key=lambda cand: cand.points)
    self.features = [best.points]
    self.nodepath = best.nodepath if best.points >= 2 else None
    self.certainty = 1.0 if best.points > 5 else best.points / 5.0
    return self.nodepath
def find(self, params):
    ''' Pick the node most likely to be the shopping bag / cart link.

    Candidates are the nodes whose attributes match the "ECOM_CART" or
    "ECOM_CHECKOUT" dictionary entries. They are filtered (steps 2a-2e),
    scored (steps 3-10) and the highest scoring one is kept.

    Sets `self.features` to [best score], `self.nodepath` to the best
    candidate's path (None when its score is below 2 or there is no
    candidate) and `self.certainty` to score / 5 capped at 1.0 (0 when there
    is no candidate). Returns `self.nodepath`. `params` is not used.
    '''
    tree = self.page_dict.get("tree")
    lang = self.page_dict.get("lang")
    cart_entries = ["ECOM_CART", "ECOM_CHECKOUT"]

    # 1. Find all nodes with matching attributes
    nodes = self.__nodes_containing_attrs(
        elem='*', lang=lang, dict_entries=cart_entries)
    candidates = []
    for node in nodes or []:
        cand = BagLinkCandidate()
        cand.node = node
        cand.nodepath = tree.getpath(node)
        cand.points = 0
        candidates.append(cand)

    # 2a. Remove ones that have descendants five levels down
    candidates = [
        cand for cand in candidates
        if not tree.xpath(cand.nodepath + "/*/*/*/*/*")
    ]
    # 2b. Keep only ones with fewer than 20 descendant elements
    candidates = [
        cand for cand in candidates
        if tree.xpath("count(" + cand.nodepath + "//*)") < 20
    ]
    # 2c. Remove the ones that look like add-to-cart buttons
    addclassif = AddToBasketButtonClassifier()
    candidates = [
        cand for cand in candidates
        if not addclassif.classifygivennode(self.page_dict, cand.nodepath)
    ]
    # 2d. Keep only ones with fewer than 10 words in the text nodes inside
    candidates = [
        cand for cand in candidates
        if sum(len(text.split())
               for text in tree.xpath(cand.nodepath + "//text()")) < 10
    ]
    # 2e. Remove ones that are children of others found
    candidates = [
        cand for cand in candidates
        if self.__isbagroot(cand, candidates)
    ]

    if not candidates:
        # Reset every output so a previous run cannot leak a stale path or
        # certainty into this result.
        self.features = [0]
        self.nodepath = None
        self.certainty = 0
        return self.nodepath

    # The text patterns do not depend on the candidate, so build them once.
    text_patterns = (
        # 8. descendants with cart/checkout text
        regexofdictentries(entries=cart_entries, lang=lang),
        # 9. descendants with item text
        regexofdictentries(entries=["ECOM_ITEM"], lang=lang),
        # 10. descendants with digits in the text
        r"\d+",
    )

    for cand in candidates:
        # 3. If element is anchor, add points
        if cand.node.tag == 'a':
            cand.points += points[0]
        # 4. If element has anchor descendants, add points
        if tree.xpath(cand.nodepath + "//a", namespaces=namespaces):
            cand.points += points[1]
        # 5. If element has image descendants, add points
        if tree.xpath(cand.nodepath + "//img", namespaces=namespaces):
            cand.points += points[2]
        # 6. If element has descendants with attrs, add points
        if self.__nodes_containing_attrs(
                root=cand.nodepath, elem="*", lang=lang,
                dict_entries=cart_entries):
            cand.points += points[3]
        # 7. If element has anchor descendants with attrs, add points
        if self.__nodes_containing_attrs(
                root=cand.nodepath, elem="a", lang=lang,
                dict_entries=cart_entries):
            cand.points += points[4]

        # 8-10. One point per text pattern found in any non-blank text node
        texts = [
            text.strip()
            for text in tree.xpath(cand.nodepath + "//text()")
            if text.strip()
        ]
        for pattern in text_patterns:
            if any(re.search(pattern, text) for text in texts):
                cand.points += 1

    # max() returns the first highest-scoring candidate, the same one a
    # stable descending sort would put first.
    best = max(candidates, key=lambda cand: cand.points)
    self.features = [best.points]
    self.nodepath = best.nodepath if best.points >= 2 else None
    self.certainty = 1.0 if best.points > 5 else best.points / 5.0
    return self.nodepath