def test_definition(self, pattern, text, title, link, description):
    """Preview the items a candidate channel definition would produce.

    Stores the cleaned pattern on the channel, extracts the matching
    snippets from the text, parses at most the first three of them and
    wraps each parsed result in an RSSItem rendered with toJSON().

    Parameters:
    pattern (string): an item pattern; it must contain "{" and "}"
    text (string): the text to scrape for items
    title (string): item title definition passed through to RSSItem
    link (string): item link definition passed through to RSSItem
    description (string): item description definition passed through
        to RSSItem

    Returns:
    A list with the toJSON() output of each previewed item, or None when
    the pattern is missing, has no "{" / "}" placeholder, or the link is
    the w3.org address checked below.
    """
    if pattern is None:
        return None
    # NOTE(review): this specific URL is skipped without explanation in
    # the code; presumably it is a default/placeholder link - confirm.
    if link == "https://www.w3.org/about":
        return None

    self.item_pattern = clean_input(pattern)
    if Debug:
        print("Item Pattern: '" + self.item_pattern + "'")

    first = self.item_pattern.find("{")
    if first < 0:
        return None
    second = self.item_pattern.rfind("}")
    if second < 0:
        return None

    # Everything before the first placeholder and after the last one
    # delimits a single item inside the scraped text.
    start_pattern = self.item_pattern[:first]
    stop_pattern = self.item_pattern[second + 1:]
    if Debug:
        print("Start pattern: '" + start_pattern + "'")
        print("Stop pattern: '" + stop_pattern + "'")

    data = self.get_item_text(clean_input(text), start_pattern, stop_pattern)
    # Guarded so that an empty result cannot raise IndexError while
    # debugging.
    if Debug and data:
        print(data[0])

    # Only the first three snippets are parsed for the preview.
    item_info = self.parse_items(data[:3])
    if Debug:
        print(item_info)

    items = [
        RSSItem(
            item, title=title, link=link, description=description
        ).toJSON()
        for item in item_info
    ]
    if Debug:
        print(items)
        print("Test complete")
    return items
def generate_items(self, text, test=False):
    """Scrape the page text for items and rebuild this channel's item list.

    Parameters:
    text (string): the text to scrape for items
    test (bool): when true, return the parsed item data instead of
        creating items from it

    Returns:
    The value returned by parse_items when test is true, otherwise None.
    None is also returned, without scraping, when the channel has no item
    pattern, the pattern has no "{" / "}" placeholder, or the channel link
    is the w3.org address checked below.

    Note that any existing items are discarded before the pattern's
    placeholders are validated, and also when running in test mode.
    """
    # The None check has to come before the debug print: concatenating
    # None into the message would raise TypeError.
    if self.item_pattern is None:
        return None
    if Debug:
        print("Item Pattern: '" + self.item_pattern + "'")
    # NOTE(review): this specific URL is skipped without explanation in
    # the code; presumably it is a default/placeholder link - confirm.
    if self.link == "https://www.w3.org/about":
        return None

    if self.items:
        self.items = []

    start = self.item_pattern.find("{")
    stop = self.item_pattern.rfind("}")
    if start == -1 or stop == -1:
        return None

    # Everything before the first placeholder and after the last one
    # delimits a single item inside the scraped text.
    start_pattern = self.item_pattern[:start]
    stop_pattern = self.item_pattern[stop + 1:]
    if Debug:
        print("Start pattern: '" + start_pattern + "'")
        print("Stop pattern: '" + stop_pattern + "'")

    data = self.get_item_text(clean_input(text), start_pattern, stop_pattern)
    item_info = self.parse_items(data)
    if test:
        return item_info

    for item in item_info:
        self.items.append(self.create_item(item))

    # One timestamp for both fields so they describe the same rebuild.
    build_time = datetime.datetime.now()
    self.lastBuildDate = build_time
    self.pubDate = build_time
    return None
def test_pattern(self, pattern, text):
    """Preview what an item pattern extracts from the given text.

    The cleaned pattern is stored as this channel's item pattern, then
    generate_items is run in test mode and its result is returned as is.

    Parameters:
    pattern (string): an item pattern
    text (string): the text to scrape for items
    """
    cleaned_pattern = clean_input(pattern)
    self.item_pattern = cleaned_pattern
    return self.generate_items(text, test=True)
def parse_item_text(self, item_text, pattern=None):
    """Extract the "{%}" fields of an item pattern from one text snippet.

    The pattern is read as literal text interleaved with placeholders of
    the form "{c}". For each placeholder, the literal on its left is
    located in item_text and everything up to the literal on its right is
    taken as the placeholder's value. Only placeholders whose first
    character is "%" add a value to the result; any other placeholder is
    matched and skipped over.

    Parameters:
    item_text (string): A snippet of text from the source code that
        matches the item pattern
    pattern (string): An item pattern; self.item_pattern is used when
        this is omitted

    Returns:
    A list of captured field strings in pattern order. A field whose
    left-hand literal cannot be found is reported as "". A field whose
    right-hand literal cannot be found captures the rest of the text.
    The list is empty when the pattern has no "{%}" field. None is
    returned when neither pattern nor self.item_pattern is set.
    """
    def trace(label, value):
        # Debug-only helper: prints "<label>: '<value>'".
        if Debug:
            print(label + ": '" + str(value) + "'")

    if Debug:
        print("Parsing Item Text")
    if pattern is None:
        if self.item_pattern is None:
            return None
        item_pattern = self.item_pattern
    else:
        item_pattern = pattern

    output = []
    pattern_pos = 0  # The position in the pattern
    text_pos = 0  # The position in the text
    num_fields_total = item_pattern.count("{%}")
    num_fields_captured = 0
    trace("Total Fields", num_fields_total)
    trace("Item Text", item_text)
    trace("Item Pattern", item_pattern)

    # Bounding the loop by the field count means a pattern without any
    # "{%}" field returns [] instead of looping without end.
    while num_fields_captured < num_fields_total:
        if Debug:
            print("==========================================")

        open_index = item_pattern.find("{", pattern_pos)
        close_index = -1
        if open_index >= 0:
            close_index = item_pattern.find("}", open_index)
        if close_index < 0:
            # No complete placeholder is left in the pattern.
            break

        left_literal = item_pattern[pattern_pos:open_index]
        right_start = close_index + 1
        next_open = item_pattern.find("{", right_start)
        if next_open >= 0:
            right_literal = item_pattern[right_start:next_open]
        else:
            right_literal = item_pattern[right_start:]
        capture_character = item_pattern[open_index + 1]
        trace("Left Capture Pattern", left_literal)
        trace("Right Capture Pattern", right_literal)
        trace("Capture Character", capture_character)
        trace("Capture Search Start Index", text_pos)

        left_found = item_text.find(left_literal, text_pos)
        capture_start = -1
        capture_end = -1
        if left_found >= 0:
            capture_start = left_found + len(left_literal)
            capture_end = item_text.find(right_literal, capture_start)
            if capture_end >= 0:
                # Both delimiters matched: the next field is searched
                # for from the end of this capture.
                text_pos = capture_end
            else:
                # Right delimiter missing: take the remainder of the
                # text rather than slicing with -1, which would drop
                # the final character.
                capture_end = len(item_text)
        trace("Capture Start Index", capture_start)
        trace("Capture End Index", capture_end)

        if capture_character == "%":
            if left_found >= 0:
                captured = clean_input(item_text[capture_start:capture_end])
                trace("Captured", captured)
                output.append(captured)
            else:
                output.append("")
            num_fields_captured += 1
            if Debug:
                print(str(num_fields_captured) + " of "
                      + str(num_fields_total) + " fields captured")

        # The right-hand literal of this placeholder is the left-hand
        # literal of the next one.
        pattern_pos = right_start

    if Debug:
        print("==========================================")
    return output
def __init__(self, data=None, chrome_instance=None):
    """Generate an RSS Channel.

    Parameters:
    data (string list): the variables of a channel, one "name:value"
        entry per line, in the format:
            item_title:{%6}
            language:en-ca
            link:https://google.com
            title:Google.com Feed
            ttl:30
        Only the text before the first ":" is treated as the name, so
        values may contain further colons. Lines without a ":" and lines
        with an unrecognised name are ignored. When a name occurs more
        than once the last occurrence wins.
    chrome_instance: accepted for callers but not referenced in this
        method

    Raises:
    ValueError: if the value of a "delay" entry is not an integer
    """
    self.items = []
    if data is None:
        if Debug:
            self.print()
        return

    # Names whose cleaned value is stored unchanged on the attribute of
    # the same name.
    text_fields = frozenset((
        'copyright', 'description',
        'enclosure_length', 'enclosure_type', 'enclosure_url',
        'image_link', 'image_title', 'image_url',
        'item_author', 'item_category', 'item_comments',
        'item_description', 'item_guid', 'item_link', 'item_pattern',
        'item_pubDate', 'item_source', 'item_title',
        'language', 'link', 'managingEditor', 'title', 'ttl',
        'webMaster', 'username', 'website', 'password',
    ))

    for line in data:
        prefix, separator, raw_value = line.partition(":")
        if not separator:
            # Without a ":" there is no name/value pair. Slicing with
            # the -1 that find() returns would drop the last character
            # and could make the line look like a known name.
            continue

        if prefix in text_fields:
            setattr(self, prefix, clean_input(raw_value))
        elif prefix == 'category':
            # Categories are given as a comma-separated list.
            cats = clean_input(raw_value).split(",")
            self.category = [cat.strip() for cat in cats]
        elif prefix == 'delay':
            self.delay = int(clean_input(raw_value))

    if Debug:
        self.print()