예제 #1
0
 def test_definition(self, pattern, text, title, link, description):
     """Try out an item pattern and item field definitions against some text.

     Stores the cleaned pattern on ``self.item_pattern``, extracts the item
     snippets that lie between the pattern's leading and trailing literal
     text, and builds an ``RSSItem`` from each parsed item.

     Parameters:

     pattern (string): an item pattern; it must contain at least one "{" and one "}"

     text (string): the text to scrape for items

     title, link, description: passed through unchanged to every ``RSSItem``

     Returns:

     A list with the ``toJSON()`` result of each item built, or None when
     the pattern is missing, contains no capture braces, or ``link`` is the
     "https://www.w3.org/about" URL.
     """
     if pattern is None:
         return None
     # NOTE(review): this URL looks like a placeholder/default link that
     # should never be scraped - confirm against the caller.
     if link == "https://www.w3.org/about":
         return None
     self.item_pattern = clean_input(pattern)
     if Debug: print("Item Pattern: '" + self.item_pattern + "'")
     first = self.item_pattern.find("{")
     if first < 0:
         return None
     second = self.item_pattern.rfind("}")
     if second < 0:
         return None
     # Literal text before the first capture and after the last capture
     # delimits one item inside the scraped text.
     start_pattern = self.item_pattern[:first]
     stop_pattern = self.item_pattern[second+1:]
     if Debug: print("Start pattern: '" + start_pattern + "'")
     if Debug: print("Stop pattern: '" + stop_pattern + "'")
     data = self.get_item_text(clean_input(text), start_pattern, stop_pattern)
     # Guard the debug print: indexing an empty result would raise.
     if Debug and data: print(data[0])
     # Only the first three entries are parsed for this trial run.
     item_info = self.parse_items(data[:3])
     if Debug: print(item_info)
     items = [
         RSSItem(
             item,
             title=title,
             link=link,
             description=description).toJSON()
         for item in item_info
     ]
     if Debug: print(items)
     if Debug: print("Test complete")
     return items
예제 #2
0
 def generate_items(self, text, test=False):
     """Scrape the page text for items using ``self.item_pattern``.

     Parameters:

     text (string): the text to scrape for items

     test (bool): when truthy, return the parsed item data instead of
     storing items on the channel

     Returns:

     The result of ``self.parse_items`` when ``test`` is truthy, otherwise
     None. In the normal mode ``self.items`` is rebuilt and
     ``self.lastBuildDate`` / ``self.pubDate`` are set to the current time.
     Nothing is changed when there is no pattern or when ``self.link`` is
     the "https://www.w3.org/about" URL.
     """
     # Check for a missing pattern before the debug print below, which
     # concatenates the pattern into a string and would raise on None.
     if self.item_pattern is None:
         return None
     if Debug: print("Item Pattern: '" + self.item_pattern + "'")
     # NOTE(review): this URL looks like a placeholder/default link that
     # should never be scraped - confirm against the caller.
     if self.link == "https://www.w3.org/about":
         return None
     if self.items:
         self.items = []
     start = self.item_pattern.find("{")
     stop = self.item_pattern.rfind("}")
     if start == -1 or stop == -1:
         return None
     # Literal text before the first capture and after the last capture
     # delimits one item inside the scraped text.
     start_pattern = self.item_pattern[:start]
     stop_pattern = self.item_pattern[stop+1:]
     if Debug: print("Start pattern: '" + start_pattern + "'")
     if Debug: print("Stop pattern: '" + stop_pattern + "'")
     data = self.get_item_text(clean_input(text), start_pattern, stop_pattern)
     item_info = self.parse_items(data)
     if test:
         return item_info
     for item in item_info:
         self.items.append(self.create_item(item))
     # Use a single timestamp so both dates are identical.
     now = datetime.datetime.now()
     self.lastBuildDate = now
     self.pubDate = now
예제 #3
0
    def test_pattern(self, pattern, text):
        """Try an item pattern against some text and return the parsed items.

        The cleaned pattern replaces ``self.item_pattern``; the text is then
        run through ``generate_items`` in test mode.

        Parameters:

        pattern (string): an item pattern

        text (string): the text to scrape for items
        """
        cleaned_pattern = clean_input(pattern)
        self.item_pattern = cleaned_pattern
        parsed = self.generate_items(text, test=True)
        return parsed
예제 #4
0
    def parse_item_text(self, item_text, pattern = None):
        """Extract the captured fields of one item from a snippet of text.

        The pattern is read as literal text interleaved with capture groups
        written as "{" + capture character + ... + "}". For each group, the
        literal before it is searched for in the text starting at the current
        text position, then the literal after it. Groups whose capture
        character is "%" contribute one entry to the result; other groups
        only move the search position forward.

        Parameters:

        item_text (string): A snippet of text from the source code that matches the item pattern

        pattern (string): An item pattern; defaults to ``self.item_pattern``

        Returns:

        A list of strings, one per "%" group processed. A group whose
        surrounding literals cannot both be found yields "". Returns None
        when no pattern is given and ``self.item_pattern`` is None.
        """
        if (Debug): print("Parsing Item Text")
        if (pattern is None):
            if (self.item_pattern is None):
                return
            item_pattern = self.item_pattern
        else:
            item_pattern = pattern
        output = []
        pattern_cursor = 0  # The position in the pattern
        text_cursor = 0  # The position in the text

        num_fields_total = item_pattern.count("{%}")
        num_fields_captured = 0
        if (Debug): print("Total Fields: '" + str(num_fields_total) + "'")
        if (Debug): print("Item Text: '" + item_text + "'")
        if (Debug): print("Item Pattern: '" + item_pattern + "'")
        while True:
            if (Debug): print("==========================================")
            if (Debug): print("Left Capture Pattern Start Index: '" + str(pattern_cursor) + "'")
            group_open = item_pattern.find("{", pattern_cursor)
            if (Debug): print("Left Capture Pattern Stop Index: '" + str(group_open) + "'")
            # Stop when the pattern has no further complete "{...}" group.
            # Without this a malformed pattern would restart from the
            # beginning of the pattern and never terminate.
            if (group_open < 0):
                break
            group_close = item_pattern.find("}", group_open)
            if (group_close < 0):
                break
            left_literal = item_pattern[pattern_cursor:group_open]
            if (Debug): print("Left Capture Pattern: '" + left_literal + "'")

            right_literal_start = group_close + 1
            if (Debug): print("Right Capture Pattern Start Index: '" + str(right_literal_start) + "'")
            right_literal_stop = item_pattern.find("{", right_literal_start)
            if (Debug): print("Right Capture Pattern Stop Index: : '" + str(right_literal_stop) + "'")

            # The right literal runs up to the next group, or to the end of
            # the pattern when this is the last group.
            if (right_literal_stop >= 0):
                right_literal = item_pattern[right_literal_start:right_literal_stop]
            else:
                right_literal = item_pattern[right_literal_start:]

            if (Debug): print("Right Capture Pattern: '" + right_literal + "'")

            capture_character = item_pattern[group_open+1]
            if (Debug): print("Capture Character: '" + capture_character + "'")

            if (Debug): print("Capture Search Start Index: '" + str(text_cursor) + "'")

            left_found = item_text.find(left_literal, text_cursor)
            if (left_found >= 0):
                capture_start = left_found + len(left_literal)
                capture_end = item_text.find(right_literal, capture_start)
            else:
                capture_start = -1
                capture_end = -1
            if (Debug): print("Capture Start Index: '" + str(capture_start) + "'")
            if (Debug): print("Capture End Index: '" + str(capture_end) + "'")

            # Logical "and", not bitwise "&": "&" binds tighter than ">=",
            # which made the old test ignore whether the right literal
            # was found at all.
            matched = left_found >= 0 and capture_end >= 0
            if (matched):
                text_cursor = capture_end
            if (capture_character == "%"):
                if (matched):
                    captured = clean_input(item_text[capture_start:capture_end])
                    if (Debug): print("Captured: '" + captured + "'")
                    output.append(captured)
                else:
                    # Keep one entry per field so positions stay aligned.
                    output.append("")
                num_fields_captured += 1
            if (Debug): print(str(num_fields_captured) + " of " + str(num_fields_total) + " fields captured")
            if (Debug): print("==========================================")
            if (num_fields_captured == num_fields_total):
                break
            pattern_cursor = right_literal_start
        return output
예제 #5
0
    def __init__(self, data=None, chrome_instance=None):
        """generates an RSS Channel

        Each line is split at its first ":" into a key and a value. Known
        keys are stored on the instance as attributes of the same name after
        the value has gone through ``clean_input``. Unknown keys and lines
        without a ":" are ignored.

        Parameters:

        data (string list): the variables of a channel in the format:
            item_title:{%6}
            language:en-ca
            link:https://google.com
            title:Google.com Feed
            ttl:30

        chrome_instance: accepted but not used by this constructor.

        Special cases: ``category`` is stored as a list of the
        comma-separated, stripped entries; ``delay`` is converted with
        ``int`` (a non-numeric value raises ValueError).
        """
        self.items = []

        if data is None:
            if (Debug): self.print()
            return

        # Keys whose cleaned value is stored as-is under the same attribute name.
        text_fields = frozenset((
            'copyright', 'description',
            'enclosure_length', 'enclosure_type', 'enclosure_url',
            'image_link', 'image_title', 'image_url',
            'item_author', 'item_category', 'item_comments',
            'item_description', 'item_guid', 'item_link', 'item_pattern',
            'item_pubDate', 'item_source', 'item_title',
            'language', 'link', 'managingEditor', 'title', 'ttl', 'webMaster',
            'username', 'website', 'password',
        ))

        for line in data:
            # Split on the first ":" only, so values such as URLs keep
            # their own colons.
            prefix, separator, value = line.partition(":")
            if not separator:
                # No ":" at all: there is no key to match, skip the line.
                continue

            if prefix in text_fields:
                setattr(self, prefix, clean_input(value))
            elif prefix == 'category':
                cats = clean_input(value).split(",")
                self.category = [cat.strip() for cat in cats]
            elif prefix == 'delay':
                self.delay = int(clean_input(value))

        if (Debug): self.print()