class WebParser(InteractiveImporter):
    """Interactive importer that flattens an HTML page into labelled text.

    The page is parsed into ``self.soup``; ``parse`` walks the tree and
    builds ``self.parsed``, a list of ``(text, label)`` tuples.  Subclasses
    can override ``preparse`` to fill ``self.preparsed_elements`` with
    ``(tag, label)`` pairs so that known elements are labelled up front.
    """

    # Tags that get a newline before and after their content.
    BREAK_AROUND = [
        'p', 'title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'table', 'p',
        'blockquote', 'title', 'div', 'section', 'header', 'footer', 'nav'
    ]
    # Tags that are rendered as a single newline.
    IS_BREAK = ['br']
    # Tags indented according to how many of the listed ancestors they have.
    NESTED = {
        'tr': ['table'],
        'li': ['ol', 'ul'],
        'dd': ['dl'],
    }
    # Tags preceded by one TAB.
    TAB_BEFORE = ['td', 'dt']
    # Tags skipped entirely (together with their children).
    IGNORE = ['script', 'meta', 'select', 'link', 'img', 'style']
    TAB = ' '
    # Labels whose consecutive chunks are merged into one parsed entry.
    JOINABLE = [
        'instructions', 'notes', 'recipe', 'ignore', 'ingredients',
        'include', None
    ]
    # Node types that never contribute visible text.
    INVISIBLE_TYPES = [CData, Comment, Declaration, ProcessingInstruction]
    do_postparse = True
    # This could be a list of compiled regexps which would be used to
    # search image URL strings for potential ads, etc.
    imageexcluders = None

    # Compiled once for the class.  The previous per-instance cache tested
    # hasattr(self, '__whitespace_regexp'), which never matches the
    # name-mangled attribute, so the pattern was rebuilt on every call.
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self, url, data, content_type):
        """Parse *data* (the page fetched from *url*) and collect its images.

        *content_type* is accepted for interface compatibility; it is not
        used by this class.
        """
        self.ignore_unparsed = False
        self.url = url
        self.soup = BeautifulSoup.BeautifulSoup(
            data,
            convertEntities=BeautifulSoup.BeautifulStoneSoup.XHTML_ENTITIES,
        )
        InteractiveImporter.__init__(self)
        self.preparse()
        self.get_images()
        self.text_parser = RecipeParser()

    def commit_rec(self):
        """Commit the current recipe, defaulting its link to the page URL."""
        if not self.rec.get('link', ''):
            self.rec['link'] = self.url
        gourmet.importers.importer.Importer.commit_rec(self)

    def preparse(self):
        """Reset the list of pre-identified ``(tag, label)`` pairs.

        The base implementation identifies nothing.
        """
        self.preparsed_elements = []

    def identify_match(self, tag):
        """Return the label pre-assigned to *tag*, or None if there is none."""
        for candidate, label in self.preparsed_elements:
            if tag == candidate:
                return label
        return None

    def get_images(self):
        """Fill ``self.images`` with absolute URLs of the page's images.

        ``<img>`` tags without ``src`` are skipped, as are URLs matched by
        any regexp in ``self.imageexcluders``.
        """
        # urllib.basejoin only existed in Python 2; urljoin is its
        # Python 3 equivalent.
        from urllib.parse import urljoin

        self.images = []
        for img in self.soup('img'):
            try:
                src = img['src']
            except KeyError:
                continue
            img_url = urljoin(self.url, src)
            if self.imageexcluders and any(
                    exc.search(img_url) for exc in self.imageexcluders):
                continue
            self.images.append(img_url)

    def parse(self, tag=None):
        """Crawl *tag* (default: the whole document) and return the chunks.

        Returns ``self.parsed``, a list of ``(text, label)`` tuples.
        """
        if not tag:
            tag = self.soup
        self.parsed = []
        self.buffer = ''
        self.last_label = None
        self.crawl(tag)
        if self.buffer:
            self.add_buffer_to_parsed()
        return self.parsed

    def crawl(self, tag, parent_label=None):
        """Recursively append the text under *tag* to the buffer.

        A node without a label of its own inherits *parent_label*; failing
        that it becomes 'ignore' when ``self.ignore_unparsed`` is set.
        Returns the label used, or None when the node is skipped.
        """
        formatting = self.format_tag_whitespace(tag)
        if formatting == -1:
            # Special case: lets the formatting method auto-skip scripts
            # and the like.
            return
        start_ws, end_ws = formatting
        self.buffer += start_ws
        label = self.identify_match(tag)
        if not label and parent_label:
            label = parent_label  # inherit
        elif self.ignore_unparsed and not label:
            label = 'ignore'
        if hasattr(tag, 'contents') and tag.contents:
            for child in tag.contents:
                self.crawl(child, label)
        else:
            # Leaf node: flush the buffer when the label changes or when
            # the previous label is not allowed to join with its neighbour.
            if (label != self.last_label
                    or self.last_label not in self.JOINABLE):
                if self.buffer:
                    self.add_buffer_to_parsed()
                self.last_label = label
            if hasattr(tag, 'string'):
                self.buffer += self.reduce_whitespace(tag.string or '')
        if end_ws:
            self.buffer += end_ws
        return label

    def reduce_whitespace(self, s):
        """Return *s* with every run of whitespace collapsed to one space."""
        return self._WHITESPACE_RE.sub(' ', s)

    def cut_extra_whitespace(self, s):
        """Return *s* with all but its last two newlines removed."""
        newlines = s.count('\n')
        if newlines > 2:
            s = s.replace('\n', '', newlines - 2)
        return s

    def add_buffer_to_parsed(self):
        """Move the buffered text into ``self.parsed`` under ``last_label``.

        Trailing whitespace stays in the buffer for the next chunk; leading
        whitespace is emitted as its own unlabelled chunk.  A buffer holding
        only whitespace is left untouched.
        """
        if not self.buffer.strip():
            return
        tws = 0  # number of trailing whitespace characters
        while tws + 1 < len(self.buffer) and self.buffer[-(tws + 1)].isspace():
            tws += 1
        if not tws:
            to_add = self.buffer
            self.buffer = ''
        else:
            to_add = self.buffer[:-tws]
            self.buffer = self.buffer[-tws:]
        self.buffer = self.cut_extra_whitespace(self.buffer)
        lws = 0  # number of leading whitespace characters
        while lws + 1 < len(to_add) and to_add[lws].isspace():
            lws += 1
        if lws:
            # Add the leading white space separately, with no label.
            pre_add = self.cut_extra_whitespace(to_add[:lws])
            to_add = to_add[lws:]
            self.parsed.append((pre_add, None))
        # Do extra substitution of MS characters -- shouldn't be necessary.
        # NOTE(review): entries are assumed to map a one-byte key to a
        # (name, hex codepoint) pair -- confirm against the BeautifulSoup
        # version in use.
        for char, tup in list(BeautifulSoup.UnicodeDammit.MS_CHARS.items()):
            # The buffer is str, so the search key must be str as well;
            # re-encoding it to bytes made the lookup raise TypeError.
            if isinstance(char, bytes):
                char = char.decode('iso-8859-1')
            if char in to_add:
                try:
                    to_add = to_add.replace(char, chr(int(tup[1], 16)))
                except ValueError:
                    print("ValueError caught in add_buffer_to_parsed")
        self.parsed.append((to_add, self.last_label))

    def format_tag_whitespace(self, tag):
        """Return ``(before, after)`` whitespace for *tag*.

        Returns -1 instead when the node should not be considered for text.
        """
        if any(isinstance(tag, klass) for klass in self.INVISIBLE_TYPES):
            return -1
        if not hasattr(tag, 'name'):
            return '', ''
        if tag.name in self.IGNORE:
            return -1
        if tag.name in self.IS_BREAK:
            return '\n', ''
        if tag.name in self.NESTED:
            # Indent one TAB per enclosing container of the listed types.
            parents = 0
            for typ in self.NESTED[tag.name]:
                parents += len(tag.fetchParents(typ))
            return '\n' + self.TAB * parents, ''
        if tag.name in self.TAB_BEFORE:
            return self.TAB, ''
        if tag.name in self.BREAK_AROUND:
            return '\n', '\n'
        return '', ''

    def postparse(self, parsed):
        """Do purely text-based parsing of content.

        Chunks that are unlabelled or labelled 'recipe' are handed to
        ``self.text_parser``; all others are passed through with blank-line
        runs collapsed.
        """
        new_parse = []
        for p, attr in parsed:
            p = re.sub(r'(\n\s*\n)+', '\n\n', p)  # Take out extra newlines
            if attr is None or attr == 'recipe':
                new_parse.extend(self.text_parser.parse(p))
            else:
                new_parse.append((p, attr))
        return new_parse

    def parse_webpage(self):
        """Run preparse, parse and (optionally) postparse; return the chunks.

        If anything was pre-labelled 'include', all unlabelled content is
        marked to be ignored.
        """
        self.preparse()
        tags = [pp[1] for pp in self.preparsed_elements]
        if 'include' in tags:
            self.ignore_unparsed = True
        parsed = self.parse()
        if self.do_postparse:
            return self.postparse(parsed)
        return parsed

    def do_run(self):
        """Parse the page, load the result into the editor and show it."""
        parsed = self.parse_webpage()
        self.set_parsed(parsed)
        return InteractiveImporter.do_run(self)
class InteractiveImporter(ConvenientImporter, NotThreadSafe):
    """GTK window in which the user labels parts of a text as recipe fields.

    The text lives in a ``Gtk.TextBuffer`` (``self.tb``).  Each labelled
    range is tracked by a pair of text marks stored in ``self.labelled``;
    ``commit_changes`` turns those ranges into recipe data.
    """

    NEW_REC_TEXT = _('New Recipe')

    def __init__(self,
                 custom_parser=None,
                 tags=DEFAULT_TAGS,
                 tag_labels=DEFAULT_TAG_LABELS,
                 modal=True,
                 title=_('Import recipe')):
        """Build the window and initialise the markup bookkeeping.

        custom_parser -- parser used by ``set_text``; a ``RecipeParser`` is
                         created when none is given.
        tags          -- stored as ``self.tags``.
        tag_labels    -- mapping of tag -> user-visible label.
        modal         -- when true, ``do_run`` enters ``Gtk.main`` and
                         ``commit_changes`` leaves it again.
        title         -- window title.
        """
        self.title = title
        if custom_parser:
            self.parser = custom_parser
        else:
            self.parser = RecipeParser()
        self.labels_by_tag = tag_labels
        # Reverse mapping (label -> tag), plus the special new-recipe label.
        self.tags_by_label = {self.NEW_REC_TEXT: 'newrec'}
        for tag, tag_label in self.labels_by_tag.items():
            self.tags_by_label[tag_label] = tag
        self.tags = tags
        self.setup_window()
        self.setup_action_area()
        self.markup_marks = {}     # midno -> (start mark, end mark)
        self.markup_partners = {}  # midno of a bracket -> midno of its pair
        self.anchors = []
        self.midno = 0  # an ID counter for markup marks we insert
        self.labelled = []
        self.label_counts = {}
        self.modal = modal  # If we're in an embedded gtk mainloop...
        ConvenientImporter.__init__(self)

    def setup_window(self):
        """Create the window, the text view and the (empty) action area."""
        # set our parent...
        from gourmet.threadManager import get_thread_manager_gui
        import gourmet.GourmetRecipeManager
        # Result is unused here; the call is kept in case it initialises
        # the thread manager GUI -- TODO confirm.
        get_thread_manager_gui()
        self.w = Gtk.Window()
        self.w.set_title(self.title)
        main_app = gourmet.GourmetRecipeManager.get_application()
        self.w.set_transient_for(main_app.window)
        self.w.set_destroy_with_parent(False)
        self.hb = Gtk.HBox()
        self.w.add(self.hb)
        self.tv = Gtk.TextView()
        self.tv.set_size_request(600, 500)
        self.tv.set_wrap_mode(Gtk.WrapMode.WORD)
        self.action_area = Gtk.VBox()
        sw = Gtk.ScrolledWindow()
        sw.add(self.tv)
        sw.set_policy(Gtk.PolicyType.NEVER, Gtk.PolicyType.AUTOMATIC)
        self.hb.add(sw)
        sw.show()
        self.tv.show()
        self.hb.pack_end(self.action_area, expand=False, fill=False,
                         padding=0)
        self.action_area.show()
        self.tb = self.tv.get_buffer()
        self.setup_tags()

    def setup_action_area(self):
        """Build the labelling buttons and the import button."""
        # Set up hard-coded functional buttons...
        self.new_recipe_button = Gtk.Button.new_with_mnemonic(
            _('_New Recipe'))
        self.new_recipe_button.connect('clicked', self.new_recipe_cb)
        self.remove_markup_button = Gtk.Button.new_with_mnemonic(
            _('Clear _Tags'))
        self.remove_markup_button.connect('clicked', self.clear_tags)
        # Set up ActionModel (for drop-down menu version of these commands)
        self.action_model = Gtk.ListStore(str, str)
        action_table = Gtk.Table()
        self.action_area.pack_start(action_table, expand=False, fill=False,
                                    padding=0)
        # Get our UI layout from UI_TAG_ORDER
        r = 0  # row number
        for label, rows in UI_TAG_ORDER:
            if r != 0:
                # Blank spacer row between groups.
                blank = Gtk.Label(label='')
                action_table.attach(blank, 0, 2, r, r + 1)
                blank.show()
            r += 1
            glabel = Gtk.Label()
            glabel.set_markup('<b>' + label + '</b>')
            glabel.set_alignment(0.0, 0.5)
            action_table.attach(glabel, 0, 2, r, r + 1)
            glabel.show()
            r += 1
            for row in rows:
                for c, t in enumerate(row):  # column number, tag
                    if t == 'clear':
                        tag_btn = self.remove_markup_button
                    elif t == 'newrec':
                        tag_btn = self.new_recipe_button
                    else:
                        tag_label = self.labels_by_tag[t]
                        tag_btn = Gtk.Button.new_with_mnemonic(
                            '_' + tag_label)
                        self.action_model.append([tag_label, t])
                        tag_btn.connect('clicked', self.label_callback,
                                        tag_label)
                    action_table.attach(tag_btn, c, c + 1, r, r + 1,
                                        xpadding=12)
                r += 1
        action_table.set_margin_top(3)
        action_table.set_margin_bottom(3)
        action_table.set_margin_start(3)
        action_table.set_margin_end(3)
        self.import_button = Gtk.Button(label=_('Import Recipe'))
        self.import_button.connect('clicked',
                                   lambda *args: self.commit_changes())
        self.action_area.pack_end(self.import_button, fill=False,
                                  expand=False, padding=0)
        self.action_area.show_all()

    def setup_tags(self):
        """Create the 'markup' and 'ignore' text tags and register them."""
        self.markup_tag = Gtk.TextTag.new('markup')
        self.markup_tag.set_property('editable', False)
        # see https://developer.gnome.org/pango/stable/pango-Text-Attributes.html#PANGO-SCALE-XX-SMALL:CAPS # noqa
        # for magic number meaning
        self.markup_tag.set_property('scale', 0.8333333333333)
        self.markup_tag.set_property('rise', 15)
        self.markup_tag.set_property('foreground', '#f00')
        self.ignore_tag = Gtk.TextTag.new('ignore')
        self.ignore_tag.set_property('invisible', True)
        self.ignore_tag.set_property('editable', False)
        self.tb.get_tag_table().add(self.markup_tag)
        self.tb.get_tag_table().add(self.ignore_tag)

    def label_callback(self, button, label):
        """Button handler: apply *label* to the current selection."""
        self.label_selection(label)

    def label_selection(self, label: str):
        """Label the selection, or the cursor's line if nothing is selected."""
        cursel = self.tb.get_selection_bounds()
        if cursel:
            start, end = cursel
        else:
            # Otherwise, there's no clear sane default... we'll just
            # select the current whole line
            cur_mark = self.tb.get_insert()
            cur_pos = self.tb.get_iter_at_mark(cur_mark)
            cur_pos.backward_chars(cur_pos.get_line_offset())
            start = cur_pos.copy()
            cur_pos.forward_line()
            end = cur_pos
        self.label_range(start, end, label)

    def insert_with_label(self, st, text, label):
        """Insert *text* at iter *st* and label the inserted range."""
        start_offset = st.get_offset()
        self.tb.insert(st, text)
        end_offset = start_offset + len(text)
        # Iters are re-fetched by offset after the insert.
        self.label_range(self.tb.get_iter_at_offset(start_offset),
                         self.tb.get_iter_at_offset(end_offset),
                         label)

    def unhide_area(self, midno):
        """Make the range hidden under *midno* visible again."""
        st, end = self.markup_marks[midno]
        self.tb.remove_tag(self.ignore_tag,
                           self.tb.get_iter_at_mark(st),
                           self.tb.get_iter_at_mark(end))

    def hide_range(self, st, end):
        """Hide text between start and end.

        Return midno that can be used to unhide the range.
        """
        midno = self.midno
        self.midno += 1
        start_mark = Gtk.TextMark.new(f'start-markup-{midno}', False)
        end_mark = Gtk.TextMark.new(f'end-markup-{midno}', True)
        self.tb.apply_tag(self.ignore_tag, st, end)
        self.tb.add_mark(start_mark, st)
        self.tb.add_mark(end_mark, end)
        self.markup_marks[midno] = (start_mark, end_mark)
        return midno

    def label_range(self, st, end, label):
        """Mark the text between iters *st* and *end* with *label*.

        For the 'ignore' tag the range is hidden and a reveal button is
        inserted.  Otherwise the range is recorded in ``self.labelled`` and
        decorated with brackets, a combo box to change the label and a
        button to remove it.
        """
        if self.tags_by_label.get(label, '') == 'ignore':
            midno = self.hide_range(st, end)
            b = Gtk.Button(label='Ignored text: Reveal hidden text')
            anchor = self.insert_widget(end, b)

            def unhide_text(*args):
                self.unhide_area(midno)
                self.remove_widget(anchor)

            b.connect('clicked', unhide_text)
            b.show()
            return
        # Per-label counter; it makes the mark names unique.
        count = self.label_counts.get(label, 0)
        self.label_counts[label] = count + 1
        smark = Gtk.TextMark.new(f'{label}-{count}-start', True)
        emark = Gtk.TextMark.new(f'{label}-{count}-end', False)
        self.tb.add_mark(smark, st)
        self.tb.add_mark(emark, end)
        self.labelled.append((smark, emark))
        # Now we add the labels...
        start_txt = '['
        start_id = self.insert_markup_text(st, start_txt, self.markup_tag)
        # Now move the mark back up...
        new_pos = self.tb.get_iter_at_mark(smark)
        new_pos.forward_chars(len(start_txt))
        self.tb.move_mark(smark, new_pos)
        # Create a "Remove me" button
        b = Gtk.Button()
        img = Gtk.Image.new_from_icon_name(Gtk.STOCK_REMOVE,
                                           Gtk.IconSize.MENU)
        b.add(img)
        img.show()
        itr = self.tb.get_iter_at_mark(emark)
        anchor = self.insert_widget(itr, b)
        # Set up combo button...
        labelbutton = Gtk.ComboBoxText.new()
        labelbutton.set_model(self.action_model)
        cb.cb_set_active_text(labelbutton, label)
        anchor2 = self.insert_widget(self.tb.get_iter_at_mark(smark),
                                     labelbutton)
        # Add final bracket for end of markup
        end_bracket_itr = self.tb.get_iter_at_mark(emark)
        end_id = self.insert_markup_text(end_bracket_itr, ']',
                                         self.markup_tag)
        self.markup_partners[start_id] = end_id
        self.markup_partners[end_id] = start_id
        # Now back up our itr one character (it got advanced by adding
        # the right bracket and the button)
        eitr = self.tb.get_iter_at_mark(emark)
        eitr.backward_chars(2)
        self.tb.move_mark(emark, eitr)

        # Define callback to remove our text when button is clicked
        def remove_markup(*args):
            self.labelled.remove((smark, emark))
            self.remove_markup_text(start_id)
            self.remove_markup_text(end_id)
            self.remove_widget(anchor)
            self.remove_widget(anchor2)

        def change_mark(combo):
            # copy marks for safekeeping...
            new_text = combo.get_active_text()
            sm = Gtk.TextMark.new(None, True)
            self.tb.add_mark(sm, self.tb.get_iter_at_mark(smark))
            em = Gtk.TextMark.new(None, False)
            self.tb.add_mark(em, self.tb.get_iter_at_mark(emark))
            # remove old marks...
            remove_markup()
            # And relabel!
            self.label_range(self.tb.get_iter_at_mark(sm),
                             self.tb.get_iter_at_mark(em),
                             new_text)

        labelbutton.connect('changed', change_mark)
        b.connect('clicked', remove_markup)

    def new_recipe_cb(self, *args):
        """Insert a 'New Recipe' marker (an empty range) at the cursor."""
        # Start a new recipe at cursor
        itr = self.tb.get_iter_at_mark(self.tb.get_insert())
        self.label_range(itr, itr, self.NEW_REC_TEXT)

    def insert_markup_text(self, itr, text, *tags):
        """Insert markup text into the buffer.

        We do this in such a way that we can remove it easily later.
        Returns the midno under which the inserted range is registered in
        ``self.markup_marks``.
        """
        midno = self.midno
        self.midno += 1
        start_mark = Gtk.TextMark.new(f'start-markup-{midno}', False)
        end_mark = Gtk.TextMark.new(f'end-markup-{midno}', True)
        start_offset = itr.get_offset()
        if tags:
            self.tb.insert_with_tags(itr, text, *tags)
        else:
            self.tb.insert(itr, text)
        self.tb.add_mark(start_mark,
                         self.tb.get_iter_at_offset(start_offset))
        end_offset = start_offset + len(text)
        end_itr = self.tb.get_iter_at_offset(end_offset)
        self.tb.add_mark(end_mark, end_itr)
        self.markup_marks[midno] = (start_mark, end_mark)
        return midno

    def insert_widget(self, itr, widget):
        """Embed *widget* at *itr*; return the child anchor created for it."""
        anchor = self.tb.create_child_anchor(itr)
        self.anchors.append(anchor)
        self.tv.add_child_at_anchor(widget, anchor)
        # Tag the anchor character itself as markup.
        widgetstart = self.tb.get_iter_at_child_anchor(anchor)
        widgetend = widgetstart.copy()
        widgetend.forward_char()
        self.tb.apply_tag(self.markup_tag, widgetstart, widgetend)
        widget.show()
        return anchor

    def remove_widget(self, anchor):
        """Delete the one buffer character that holds *anchor*."""
        anchor_iter = self.tb.get_iter_at_child_anchor(anchor)
        delete_to = anchor_iter.copy()
        delete_to.forward_char()
        self.tb.delete(anchor_iter, delete_to)

    def remove_markup_text(self, idno):
        """Delete the buffer text between the marks registered as *idno*."""
        smark, emark = self.markup_marks[idno]
        sitr = self.tb.get_iter_at_mark(smark)
        eitr = self.tb.get_iter_at_mark(emark)
        self.tb.delete(sitr, eitr)

    def clear_tags(self, *args):
        """Clear all markup in current selection, or whole buffer if
        there is no selection
        """
        cursel = self.tb.get_selection_bounds()
        if cursel:
            st, end = cursel
        else:
            st, end = self.tb.get_bounds()
        st_offset = st.get_offset()
        e_offset = end.get_offset()

        def in_range(mark):
            # Strictly inside the (st, end) offsets, as in each check below.
            return (e_offset
                    > self.tb.get_iter_at_mark(mark).get_offset()
                    > st_offset)

        for idno, (lst, lend) in list(self.markup_marks.items()):
            if in_range(lst) or in_range(lend):
                self.remove_markup_text(idno)
                if idno in self.markup_partners:
                    self.remove_markup_text(self.markup_partners[idno])
        for lst, lend in self.labelled[:]:
            if in_range(lst) or in_range(lend):
                self.labelled.remove((lst, lend))
        for anchor in self.anchors[:]:
            anchor_iter = self.tb.get_iter_at_child_anchor(anchor)
            if e_offset > anchor_iter.get_offset() > st_offset:
                self.anchors.remove(anchor)
                self.remove_widget(anchor)

    def commit_changes(self):
        """Turn the labelled ranges into recipes and commit them.

        Ranges are processed in buffer order.  A 'newrec' marker commits
        the recipe collected so far (if it has any content) and starts a
        new one.  Does nothing when no range is labelled.
        """
        self.labelled.sort(
            key=lambda x: self.tb.get_iter_at_mark(x[0]).get_offset())
        if not self.labelled:
            return
        self.start_rec()
        started = False
        for smark, emark in self.labelled:
            siter = self.tb.get_iter_at_mark(smark)
            eiter = self.tb.get_iter_at_mark(emark)
            text = siter.get_text(eiter)
            name = smark.get_name()
            # Mark names are '<label>-<count>-start'; split from the right
            # so labels that themselves contain '-' are recovered intact.
            label = name.rsplit('-', 2)[0]
            tag = self.tags_by_label[label]
            if tag == 'newrec':
                # New-recipe markers cover an empty range, so they must be
                # handled before the empty-text check below.
                if started:
                    # Commit old recipe...
                    self.commit_rec()
                    started = False
                    # Start new one...
                    self.start_rec()
                continue
            if not text:
                continue
            if tag in gglobals.TEXT_ATTR_DIC:
                self.add_text(tag, text)
                started = True
            elif tag in gglobals.REC_ATTR_DIC:
                self.add_attribute(tag, text)
            elif tag == 'ingredient':
                self.add_ing_from_text(text)
                started = True
            elif tag == 'ingredients':
                self.add_ings_from_text(text)
                started = True
            elif tag == 'inggroup':
                self.add_ing_group(text)
                started = True
            elif tag == 'ignore':
                continue
            elif tag == 'servings':
                self.add_attribute('yields', text)
                self.add_attribute('yield_unit', 'servings')
            else:
                print('UNKNOWN TAG', tag, text, label)
        if started:
            self.commit_rec()
        if hasattr(self, 'images') and self.images:
            # Let the user pick an image for every recipe that was added.
            for rec in self.added_recs:
                browser = ImageBrowser(self.w, self.images)
                response = browser.run()
                if response == Gtk.ResponseType.OK:
                    thumb = browser.image.copy()
                    thumb.thumbnail((40, 40))
                    self.rd.modify_rec(
                        rec,
                        {
                            'image': image_to_bytes(browser.image),
                            'thumb': image_to_bytes(thumb)
                        })
                browser.destroy()
        if self.modal:
            self.w.hide()
            Gtk.main_quit()

    def set_text(self, txt):
        """Parse plain text with ``self.parser`` and load the result."""
        txt = str(txt)  # convert to unicode for good measure
        txt = re.sub(r'(\n\s*\n)+', '\n\n', txt)  # Take out extra newlines
        txt = self.parser.parse(txt)  # Parse
        self.set_parsed(txt)

    def set_parsed(self, parsed):
        """Append ``(chunk, tag)`` pairs to the buffer, labelling tagged ones.

        Tags are translated to their user-visible label when one exists.
        """
        for chunk, tag in parsed:
            if tag is None:
                self.tb.insert(self.tb.get_end_iter(), chunk)
            else:
                self.insert_with_label(self.tb.get_end_iter(),
                                       chunk,
                                       self.labels_by_tag.get(tag, tag))

    def do_run(self):
        """Show the window; in modal mode, block in ``Gtk.main``."""
        self.w.show_all()
        if self.modal:
            self.w.connect('delete-event', Gtk.main_quit)
            Gtk.main()
        else:
            self.w.connect('delete-event', lambda *args: self.w.hide())
class WebParser(InteractiveImporter):
    """Interactive importer that flattens an HTML page into labelled text.

    The page is parsed into ``self.soup``; ``parse`` walks the tree and
    builds ``self.parsed``, a list of ``(text, label)`` tuples.  Subclasses
    can override ``preparse`` to fill ``self.preparsed_elements`` with
    ``(tag, label)`` pairs so that known elements are labelled up front.
    """

    # Tags that get a newline before and after their content.
    BREAK_AROUND = ['p', 'title', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6',
                    'table', 'p', 'blockquote', 'title', 'div', 'section',
                    'header', 'footer', 'nav']
    # Tags that are rendered as a single newline.
    IS_BREAK = ['br']
    # Tags indented according to how many of the listed ancestors they have.
    NESTED = {'tr': ['table'],
              'li': ['ol', 'ul'],
              'dd': ['dl'],
              }
    # Tags preceded by one TAB.
    TAB_BEFORE = ['td', 'dt']
    # Tags skipped entirely (together with their children).
    IGNORE = ['script', 'meta', 'select', 'link', 'img', 'style']
    TAB = ' '
    # Labels whose consecutive chunks are merged into one parsed entry.
    JOINABLE = ['instructions', 'notes', 'recipe', 'ignore', 'ingredients',
                'include', None]
    # Node types that never contribute visible text.
    INVISIBLE_TYPES = [BeautifulSoup.CData,
                       BeautifulSoup.Comment,
                       BeautifulSoup.Declaration,
                       BeautifulSoup.ProcessingInstruction]
    do_postparse = True
    # This could be a list of compiled regexps which would be used to
    # search image URL strings for potential ads, etc.
    imageexcluders = None

    # Compiled once for the class.  The previous per-instance cache tested
    # hasattr(self, '__whitespace_regexp'), which never matches the
    # name-mangled attribute, so the pattern was rebuilt on every call.
    _WHITESPACE_RE = re.compile(r'\s+')

    def __init__(self, url, data, content_type):
        """Parse *data* (the page fetched from *url*) and collect its images.

        *content_type* is accepted for interface compatibility; it is not
        used by this class.
        """
        self.ignore_unparsed = False
        self.url = url
        self.soup = BeautifulSoup.BeautifulSoup(
            data,
            convertEntities=BeautifulSoup.BeautifulStoneSoup.XHTML_ENTITIES,
        )
        InteractiveImporter.__init__(self)
        self.preparse()
        self.get_images()
        self.text_parser = RecipeParser()

    def commit_rec(self):
        """Commit the current recipe, defaulting its link to the page URL."""
        if not self.rec.get('link', ''):
            self.rec['link'] = self.url
        gourmet.importers.importer.Importer.commit_rec(self)

    def preparse(self):
        """Reset the list of pre-identified ``(tag, label)`` pairs.

        The base implementation identifies nothing.
        """
        self.preparsed_elements = []

    def identify_match(self, tag):
        """Return the label pre-assigned to *tag*, or None if there is none."""
        for candidate, label in self.preparsed_elements:
            if tag == candidate:
                return label
        return None

    def get_images(self):
        """Fill ``self.images`` with absolute URLs of the page's images.

        ``<img>`` tags without ``src`` are skipped, as are URLs matched by
        any regexp in ``self.imageexcluders``.
        """
        # urllib.basejoin only existed in Python 2; urljoin is its
        # Python 3 equivalent.
        from urllib.parse import urljoin

        self.images = []
        for img in self.soup('img'):
            try:
                src = img['src']
            except KeyError:
                continue
            img_url = urljoin(self.url, src)
            if self.imageexcluders and any(
                    exc.search(img_url) for exc in self.imageexcluders):
                continue
            self.images.append(img_url)

    def parse(self, tag=None):
        """Crawl *tag* (default: the whole document) and return the chunks.

        Returns ``self.parsed``, a list of ``(text, label)`` tuples.
        """
        if not tag:
            tag = self.soup
        self.parsed = []
        self.buffer = ''
        self.last_label = None
        self.crawl(tag)
        if self.buffer:
            self.add_buffer_to_parsed()
        return self.parsed

    def crawl(self, tag, parent_label=None):
        """Recursively append the text under *tag* to the buffer.

        A node without a label of its own inherits *parent_label*; failing
        that it becomes 'ignore' when ``self.ignore_unparsed`` is set.
        Returns the label used, or None when the node is skipped.
        """
        formatting = self.format_tag_whitespace(tag)
        if formatting == -1:
            # Special case: lets the formatting method auto-skip scripts
            # and the like.
            return
        start_ws, end_ws = formatting
        self.buffer += start_ws
        label = self.identify_match(tag)
        if not label and parent_label:
            label = parent_label  # inherit
        elif self.ignore_unparsed and not label:
            label = 'ignore'
        if hasattr(tag, 'contents') and tag.contents:
            for child in tag.contents:
                self.crawl(child, label)
        else:
            # Leaf node: flush the buffer when the label changes or when
            # the previous label is not allowed to join with its neighbour.
            if (label != self.last_label
                    or self.last_label not in self.JOINABLE):
                if self.buffer:
                    self.add_buffer_to_parsed()
                self.last_label = label
            if hasattr(tag, 'string'):
                self.buffer += self.reduce_whitespace(tag.string or '')
        if end_ws:
            self.buffer += end_ws
        return label

    def reduce_whitespace(self, s):
        """Return *s* with every run of whitespace collapsed to one space."""
        return self._WHITESPACE_RE.sub(' ', s)

    def cut_extra_whitespace(self, s):
        """Return *s* with all but its last two newlines removed."""
        newlines = s.count('\n')
        if newlines > 2:
            s = s.replace('\n', '', newlines - 2)
        return s

    def add_buffer_to_parsed(self):
        """Move the buffered text into ``self.parsed`` under ``last_label``.

        Trailing whitespace stays in the buffer for the next chunk; leading
        whitespace is emitted as its own unlabelled chunk.  A buffer holding
        only whitespace is left untouched.
        """
        if not self.buffer.strip():
            return
        tws = 0  # number of trailing whitespace characters
        while tws + 1 < len(self.buffer) and self.buffer[-(tws + 1)].isspace():
            tws += 1
        if not tws:
            to_add = self.buffer
            self.buffer = ''
        else:
            to_add = self.buffer[:-tws]
            self.buffer = self.buffer[-tws:]
        self.buffer = self.cut_extra_whitespace(self.buffer)
        lws = 0  # number of leading whitespace characters
        while lws + 1 < len(to_add) and to_add[lws].isspace():
            lws += 1
        if lws:
            # Add the leading white space separately, with no label.
            pre_add = self.cut_extra_whitespace(to_add[:lws])
            to_add = to_add[lws:]
            self.parsed.append((pre_add, None))
        # Do extra substitution of MS characters -- shouldn't be necessary.
        # NOTE(review): entries are assumed to map a one-byte key to a
        # (name, hex codepoint) pair -- confirm against the BeautifulSoup
        # version in use.
        for char, tup in BeautifulSoup.UnicodeDammit.MS_CHARS.items():
            # The buffer is str, so the search key must be str as well;
            # re-encoding it to bytes made the lookup raise TypeError.
            if isinstance(char, bytes):
                char = char.decode('iso-8859-1')
            if char in to_add:
                try:
                    # chr/int replace the Python 2-only unichr/long.
                    to_add = to_add.replace(char, chr(int(tup[1], 16)))
                except ValueError:
                    print("ValueError caught in add_buffer_to_parsed")
        self.parsed.append((to_add, self.last_label))

    def format_tag_whitespace(self, tag):
        """Return ``(before, after)`` whitespace for *tag*.

        Returns -1 instead when the node should not be considered for text.
        """
        if any(isinstance(tag, klass) for klass in self.INVISIBLE_TYPES):
            return -1
        if not hasattr(tag, 'name'):
            return '', ''
        if tag.name in self.IGNORE:
            return -1
        if tag.name in self.IS_BREAK:
            return '\n', ''
        if tag.name in self.NESTED:
            # Indent one TAB per enclosing container of the listed types.
            parents = 0
            for typ in self.NESTED[tag.name]:
                parents += len(tag.fetchParents(typ))
            return '\n' + self.TAB * parents, ''
        if tag.name in self.TAB_BEFORE:
            return self.TAB, ''
        if tag.name in self.BREAK_AROUND:
            return '\n', '\n'
        return '', ''

    def postparse(self, parsed):
        """Do purely text-based parsing of content.

        Chunks that are unlabelled or labelled 'recipe' are handed to
        ``self.text_parser``; all others are passed through with blank-line
        runs collapsed.
        """
        new_parse = []
        for p, attr in parsed:
            p = re.sub(r'(\n\s*\n)+', '\n\n', p)  # Take out extra newlines
            if attr is None or attr == 'recipe':
                new_parse.extend(self.text_parser.parse(p))
            else:
                new_parse.append((p, attr))
        return new_parse

    def parse_webpage(self):
        """Run preparse, parse and (optionally) postparse; return the chunks.

        If anything was pre-labelled 'include', all unlabelled content is
        marked to be ignored.
        """
        self.preparse()
        tags = [pp[1] for pp in self.preparsed_elements]
        if 'include' in tags:
            self.ignore_unparsed = True
        parsed = self.parse()
        if self.do_postparse:
            return self.postparse(parsed)
        return parsed

    def do_run(self):
        """Parse the page, load the result into the editor and show it."""
        parsed = self.parse_webpage()
        self.set_parsed(parsed)
        return InteractiveImporter.do_run(self)