def extend(best_candidate, candidates):
    """Enrich the best candidate by tacking on good-ranking siblings"""
    threshold = max([10, best_candidate['score'] * 0.5])
    # Snapshot the siblings first: a bare BeautifulSoup() has no <body>, and
    # moving the element into the new soup re-parents it, so iterating
    # best_candidate['el'].parent.children after the move would be wrong.
    siblings = list(best_candidate['el'].parent.children)
    soup = BeautifulSoup('<body></body>', 'html.parser')
    soup.body.append(best_candidate['el'])
    for sibling in siblings:
        if type(sibling) != Tag or sibling == best_candidate['el']:
            continue
        append = False
        if sibling in candidates and candidates[sibling]['score'] >= threshold:
            append = True
        if sibling.name == "p":
            density = link_density(sibling)
            content = sibling.get_text(strip=True) or ""
            length = len(content)
            if length > 80 and density < 0.25:
                append = True
            elif length < 80 and density == 0 and patterns.punctuation.search(content):
                append = True
        if append:
            soup.body.append(sibling)
    return soup

def mkv_metadata(video):
    root = BeautifulSoup(features='xml')
    root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = root.new_tag("Tags")
    tag = root.new_tag("Tag")
    tags.append(tag)
    root.append(tags)
    keep = ('title', 'description', 'url', 'genre')
    targets = root.new_tag("Targets")
    ttv = root.new_tag("TargetTypeValue")
    ttv.string = str(50)
    targets.append(ttv)
    tag.append(targets)
    for key in video:
        if key not in keep:
            continue
        simple = root.new_tag('Simple')
        name = root.new_tag('Name')
        name.string = key.upper()
        simple.append(name)
        sstring = root.new_tag('String')
        sstring.string = video[key]
        simple.append(sstring)
        tag.append(simple)
    return str(root)

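# Hedged usage sketch for mkv_metadata above, not from the original source:
# the sample_video dict is hypothetical, and it assumes bs4's BeautifulSoup
# and Doctype are imported where mkv_metadata is defined. Only keys listed
# in `keep` are serialized; everything else is skipped.
sample_video = {
    'title': 'Holiday 2019',
    'description': 'Family trip footage',
    'url': 'https://example.com/holiday',
    'genre': 'Home video',
    'codec': 'h264',  # not in `keep`, so mkv_metadata ignores it
}
print(mkv_metadata(sample_video))  # Matroska tags XML as a string
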
def _process_toc(self):
    """Creates a toc based on the headings, h1 to h6.
    """
    # Create soups
    soup = BeautifulSoup(self.html_str)
    toc_soup = BeautifulSoup()
    # Create the new tags for toc
    div_tag = toc_soup.new_tag('div')
    div_tag['class'] = 'toc'
    h2_tag = toc_soup.new_tag('h2')
    a_tag = toc_soup.new_tag('a')
    a_tag['href'] = '#top'
    a_tag.string = 'Contents'
    ul_tag = toc_soup.new_tag('ul')
    h2_tag.append(a_tag)
    toc_soup.append(div_tag)
    div_tag.append(h2_tag)
    div_tag.append(ul_tag)
    # For each heading, add an li
    for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
        li_tag = toc_soup.new_tag('li')
        li_tag['class'] = heading.name
        a_tag = toc_soup.new_tag('a')
        a_tag.string = heading.string
        a_tag['href'] = '#' + heading['id']
        li_tag.append(a_tag)
        ul_tag.append(li_tag)
    # Append the original content after the toc
    toc_soup.append(soup)
    self.html_str = str(toc_soup)

def indexHTML(links):
    from bs4 import BeautifulSoup, Tag
    soup = BeautifulSoup()
    html = soup.new_tag('html')
    head = soup.new_tag('head')
    meta = soup.new_tag('meta', charset="utf-8")
    title = soup.new_tag('title')
    body = soup.new_tag('body')
    soup.append(html)
    html.append(head)
    # meta and title belong inside <head>, not directly under <html>
    head.append(meta)
    head.append(title)
    title.append('Ryosuke Ogata | Final Project')
    html.append(body)
    # create img tags with image links as 'src'
    for srcs in links:
        w = "window.innerWidth"  # unused in the original
        img = soup.new_tag('img', src=srcs, width='20%')
        # images go inside <body>, not after it
        body.append(img)
    with open('index.html', 'w') as h:
        h.write(soup.prettify())

def parse_begin_xxx(self, m, root):
    symbol = m.group(1)
    if symbol in ['html', 'HTML']:
        new_tag = BeautifulSoup(m.group(2), 'html.parser').contents[0]
    elif symbol in ['example', 'EXAMPLE']:
        new_tag = self.soup.new_tag('pre')
        new_tag['class'] = 'example'
        new_tag.string = m.group(2)
    elif symbol in ['quote', 'QUOTE']:
        new_tag = self.soup.new_tag('blockquote')
        # new_tag.string = m.group(2)
        for part in re.split('\n{2,}', m.group(2)):
            new_p_tag = self.soup.new_tag('p')
            new_p_tag.string = part
            new_tag.append(new_p_tag)
    elif symbol in ['verse', 'VERSE']:
        new_tag = self.soup.new_tag('p')
        new_tag['class'] = 'verse'
        new_tag.string = m.group(2)
    elif symbol in ['center', 'CENTER']:
        new_tag = self.soup.new_tag('div')
        new_tag['class'] = 'center'
        new_tag.string = m.group(2)
    else:
        raise RuntimeError('Not supported begin symbol: %s' % symbol)
    root.append(new_tag)

def from_directory(cls, directory):
    """Build an AdventureDoc by processing a directory.

    Arguments:
        directory (str): Path to the directory containing
            the ORDER file along with the sections as
            markdown files.

    Returns:
        AdventureDoc:

    """
    ordered_section_file_names = cls.get_order(directory)
    all_sections_soup = BeautifulSoup('', 'html.parser')
    for file_name in ordered_section_file_names:
        # The ORDER file specifies each filename relative to itself,
        # so we must prepend the directory these files are in to read them.
        file_path = os.path.join(directory, file_name)
        with open(file_path) as f:
            file_contents = f.read()
        section_soup = cls.build_section(file_contents, file_name,
                                         ordered_section_file_names)
        all_sections_soup.append(section_soup)
    cls.put_in_nice_bowl(all_sections_soup)
    return AdventureDoc(all_sections_soup)

def ToHtml(self, soup):
    if self.title:
        chapter = BeautifulSoup('<div><h1 class="chapter"></h1></div>')
        chapter.h1.append(self.title)
        chapter.append(self.contents)
        return chapter
    return self.contents

def index(request):
    # return HttpResponse('Hello from Python!')
    # return render(request, 'index.html' )
    resultsParser = ResultsParser()
    resultsModel = resultsParser.parse('http://cfrsolo2.com/2016/04-17-16-brooksville_fin.htm')
    # return render(request, 'adrian0.html')
    # r = requests.get('http://httpbin.org/status/418')
    # print r.text
    # return HttpResponse('<pre>' + r.text + '</pre>')
    soup = BeautifulSoup()
    new_img_tag = soup.new_tag(
        "img",
        style='position: absolute; top: 0; right: 0; border: 0;',
        src="https://camo.githubusercontent.com/e7bbb0521b397edbd5fe43e7f760759336b5e05f/68747470733a2f2f73332e616d617a6f6e6177732e636f6d2f6769746875622f726962626f6e732f666f726b6d655f72696768745f677265656e5f3030373230302e706e67")
    new_a_tag = soup.new_tag("a", href='https://github.com/orozcoadrian/race-graphs')
    new_a_tag.append(new_img_tag)
    soup.append(new_a_tag)
    years = get_years_from_homepage()
    for year in years:
        new_a_tag = soup.new_tag("a", href=year)
        new_a_tag.string = year
        soup.append(new_a_tag)
        new_a_tag.append(soup.new_tag('br'))
    # self.wfile.write(soup.prettify())
    return HttpResponse(soup.prettify())

def parse_existing_html_code(self):
    # find_element_by_id returns a WebElement, so take its HTML source;
    # BeautifulSoup needs a string to parse
    html_doc = self.driver.find_element_by_id('Some_id_from_form') \
        .get_attribute('outerHTML')
    # ex = open('example.html', 'r')
    # html_doc = ex.read()
    # ex.close()
    # -------------- ^^^^^^^^^^
    soup = BeautifulSoup(html_doc, 'html.parser')
    elements = soup.find_all()
    # select only root elements
    elements = [el for el in elements if el.parent == soup]
    upper_elements = BeautifulSoup()
    for el in elements:
        upper_elements.append(el)
        if el.text.lower().startswith('enjoy'):
            break
    # try to find main type of flyers
    # <b id="main_flyer_type"><!-- Summer Flyers --></b>
    soup = BeautifulSoup(html_doc, 'html.parser')
    main_type_tag = soup.find(id="main_flyer_type")
    if main_type_tag:
        upper_elements.append(main_type_tag)
        main_flyer_type = main_type_tag.string.strip().lower()\
            .replace(' ', '_')
    else:
        main_flyer_type = None
    upper_part = upper_elements.prettify(formatter='html')
    return upper_part, main_flyer_type

def create_one_zip_file(zip_archive_number):
    inMemoryOutputFile = StringIO()
    zipFile = ZipFile(inMemoryOutputFile, 'w')
    for xml_file_number in xrange(1, XML_FILES_COUNT + 1):
        soup = BeautifulSoup(features='xml')
        soup.append(soup.new_tag("root"))
        var_id = soup.new_tag('var', value=str(uuid4()))
        var_id['name'] = 'id'
        soup.root.append(var_id)
        # attribute values must be strings, so wrap the randint
        var_level = soup.new_tag('var', value=str(random.randint(1, 100)))
        var_level['name'] = 'level'
        soup.root.append(var_level)
        soup.root.append(soup.new_tag('objects'))
        for i in xrange(1, random.randint(1, 10)):
            new_object = soup.new_tag('object')
            new_object['name'] = str(uuid4())
            soup.root.objects.append(new_object)
        zipFile.writestr('%s.xml' % xml_file_number, str(soup))
    zipFile.close()
    inMemoryOutputFile.seek(0)
    with open('%s/%s.zip' % (GENERATED_FILES_DIR, zip_archive_number), 'w') as fd:
        fd.write(inMemoryOutputFile.getvalue())

def build_rss(url, list_selector, item_selector, ignored_qp, output, pretty=False):
    # new_tag(name, text) below is a local helper (not bs4's Tag factory),
    # assumed to build a tag containing the given string
    try:
        soup = BeautifulSoup('<rss version="2.0" />', "xml")
        rss = soup.rss
        has_lxml = True
    except FeatureNotFound:
        rss = BeautifulSoup('<rss version="2.0" />').rss
        has_lxml = False
    r = requests.get(url)
    list_html = (BeautifulSoup(r.text, "lxml") if has_lxml else BeautifulSoup(r.text)).html
    channel = Tag(name="channel")
    rss.append(channel)
    channel.append(new_tag("title", list_html.head.title.string))
    channel.append(new_tag("link", url))
    channel.append(new_tag("description", "--"))
    channel.append(new_tag("lastBuildDate",
                           time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())))
    channel.append(new_tag("generator", "RSS Builder"))
    item_urls = list_html.select(list_selector)
    for item_url in map(lambda i: i["href"], item_urls):
        item_url = urlparse.urljoin(url, item_url)
        parsed = urlparse.urlparse(item_url)
        query_params = urlparse.parse_qsl(parsed.query)
        item_url = urlparse.urlunparse(
            (
                parsed.scheme,
                parsed.netloc,
                parsed.path,
                parsed.params,
                "&".join([k + "=" + v for k, v in query_params if k not in ignored_qp]),
                parsed.fragment,
            )
        )
        r = requests.get(item_url)
        item_html = (BeautifulSoup(r.text, "lxml") if has_lxml else BeautifulSoup(r.text)).html
        item = Tag(name="item")
        item.append(new_tag("title", item_html.head.title.string))
        item.append(new_tag("link", item_url))
        item.append(new_tag("description", str(item_html.select(item_selector)[0])))
        channel.append(item)
    out_func = lambda x: (x.prettify() if pretty else unicode(x)).encode("utf-8")
    if output == "-":
        out_file = sys.stdout
        close_file = lambda: None
    else:
        out_file = open(output, "w")
        close_file = out_file.close
    if has_lxml:
        out_file.write(out_func(soup))
    else:
        out_file.write('<?xml version="1.0" encoding="UTF-8" ?>\n')
        out_file.write(out_func(rss))
    out_file.write("\n")
    close_file()

def get_new_body(self):
    new_soup = BeautifulSoup('<html><head></head><body></body></html>')
    thumb = self.get_thumbnail()
    if thumb:
        hdr = new_soup.new_tag('img')
        hdr['src'] = './img/{}'.format(self.id + '.jpg')
        new_soup.body.append(hdr)
    # Title
    title = self.get_title()
    hdr = new_soup.new_tag('title')
    hdr.append(title)
    new_soup.head.append(hdr)
    hdr = new_soup.new_tag('h1')
    hdr.append(title)
    new_soup.body.append(hdr)
    # source
    source = self.soup.find(id='cphMiddle_cphMain_hlSource')
    if source:
        new_soup.body.append(source)
    # ingredients
    hdr = new_soup.new_tag('h3')
    hdr.append('Ingredients')
    new_soup.body.append(hdr)
    item = self.soup.find('ul', {'class': 'inggroups'})
    if item:
        new_soup.body.append(item)
    else:
        new_soup.body.append('No ingredients listed')
    # instructions
    hdr = new_soup.new_tag('h3')
    hdr.append('Instructions')
    new_soup.body.append(hdr)
    item = self.soup.find('ol', {'class': 'dirgroupitems'})
    if item:
        new_soup.body.append(item)
    else:
        new_soup.body.append('No instructions listed')
    # Notes
    hdr = new_soup.new_tag('h3')
    hdr.append('Notes')
    new_soup.body.append(hdr)
    notes = self.soup.find(id="cphMiddle_cphMain_lblNotes")
    if notes:
        hdr = new_soup.new_tag('pre')
        hdr.append(notes.get_text())
        # append inside <body>, not after </html>
        new_soup.body.append(hdr)
    return new_soup.prettify('latin-1')

def extract_body_from_html(html_soup):
    """Return an XML beautiful soup object with the <body> of the input HTML file"""
    body = html_soup.body.extract()
    xml_soup = BeautifulSoup('', 'xml')
    xml_soup.append(body)
    return xml_soup

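# Hedged usage sketch for extract_body_from_html above; the markup is made up
# and lxml must be installed for the 'xml' features. It shows the <body>
# being lifted out of an HTML soup into a fresh XML soup.
from bs4 import BeautifulSoup

html_soup = BeautifulSoup('<html><body><p>Hello</p></body></html>', 'html.parser')
xml_soup = extract_body_from_html(html_soup)
print(xml_soup)  # XML declaration followed by <body><p>Hello</p></body>
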
class CimXML():
    '''Class representing component data in the CIM standard'''

    def __init__(self, scene):
        self.scene = scene
        self.cim_xml = BeautifulSoup()
        self.cim_xml.append(self.cim_xml.new_tag("Node"))
        self.cim_xml.find('Node').append(self.cim_xml.new_tag("Breaker"))
        for item in scene.items():
            if isinstance(item, Node):
                if item.myItemType == item.Religador:
                    tag_id = self.cim_xml.new_tag(str(item.id))
                    self.cim_xml.find("Breaker").append(tag_id)
                    tag_rc = self.cim_xml.new_tag("ratedCurrent")
                    tag_rc.append(str(item.chave.ratedCurrent))
                    tag_id.append(tag_rc)
                    tag_itt = self.cim_xml.new_tag("inTransitTime")
                    tag_itt.append(str(item.chave.inTransitTime))
                    tag_id.append(tag_itt)
                    tag_bc = self.cim_xml.new_tag("breakingCapacity")
                    tag_bc.append(str(item.chave.breakingCapacity))
                    tag_id.append(tag_bc)
                    tag_rs = self.cim_xml.new_tag("recloseSequences")
                    tag_rs.append(str(item.chave.recloseSequences))
                    tag_id.append(tag_rs)
                    tag_state = self.cim_xml.new_tag("state")
                    tag_state.append(str(item.chave.estado))
                    tag_id.append(tag_state)
                    # self.cim_xml.find(str(item.id)).append(self.cim_xml.new_tag("ratedCurrent"))
                    # self.cim_xml.find(str(item.id)).append(self.cim_xml.new_tag("inTransitTime"))
                    # self.cim_xml.find(str(item.id)).append(self.cim_xml.new_tag("breakingCapacity"))
                    # self.cim_xml.find(str(item.id)).append(self.cim_xml.new_tag("recloseSequences"))
                    # self.cim_xml.find(str(item.id)).append(self.cim_xml.new_tag("state"))

    def write_xml(self, path):
        '''Creates the XML file at the location given by the path argument'''
        f = open(path, 'w')
        f.write(self.cim_xml.prettify())
        f.close()

def insertEarlyIn(soup: BeautifulSoup, tag: Tag):
    if soup.body is not None and soup.body.find() is not None:
        soup.body.find().insert_before(tag)
    elif soup.title is not None:
        soup.title.insert_after(tag)
    elif soup.find() is not None:
        soup.find().insert_after(tag)  # after first element
    else:
        soup.append(tag)

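# Hedged usage sketch for insertEarlyIn above; the markup is illustrative.
# The fallback chain targets: before the first element in <body>, after
# <title>, after the first element anywhere, or appended to an empty soup.
from bs4 import BeautifulSoup

doc = BeautifulSoup('<html><body><p>First</p></body></html>', 'html.parser')
banner = doc.new_tag('div', id='banner')
insertEarlyIn(doc, banner)
print(doc)  # the div lands just before the <p>
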
def add_tracker(email_html, contactid, messageid):
    tracker_url = str(os.getenv('TRACKER_URL'))
    tracker_url += "?contactid={}&messageid={}".format(contactid, messageid)
    soup = Soup(email_html, 'html.parser')
    div = soup.new_tag('div')
    img = soup.new_tag('img', attrs={'height': '0', 'width': '0', 'src': tracker_url})
    div.append(img)
    soup.append(div)
    return str(soup)

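# Hedged usage sketch for add_tracker above; the ids are made up, `Soup` is
# assumed to be bs4's BeautifulSoup, and TRACKER_URL is assumed to be set in
# the environment (otherwise the src starts with the string 'None').
html = '<p>Hello!</p>'
tracked = add_tracker(html, contactid='c-123', messageid='m-456')
# tracked now ends with a hidden <div><img height="0" width="0" src="..."/></div>
print(tracked)
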
def to_xml(self):
    el = BeautifulSoup().new_tag('resource')
    el['class'] = 'package'
    el['provider'] = self.package_provider.get_key()
    for pkg in self.packages:
        pkg_el = BeautifulSoup().new_tag('package')
        pkg_el['name'] = pkg
        el.append(pkg_el)
    return el

def generate_score(num_measures, measure_length, key_number, rest_prob,
                   treble_tp_key_choices=('complex', ),
                   bass_tp_key_choices=('complex', ),
                   treble_cp_key_choices=('complex', ),
                   bass_cp_key_choices=('complex', )):
    # generates a score num_measures measures long
    # measure_length is the number of sixteenth notes in a measure
    soup = BeautifulSoup('', 'xml')
    score_partwise = soup.new_tag('score-partwise', version='3.1')
    work = soup.new_tag('work')
    work_title = soup.new_tag('work-title')
    alpha = list(
        'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890 ')
    text = list(np.random.choice(alpha, size=np.random.randint(8, 25)))
    text = ''.join(text)
    work_title.string = text
    score_partwise.append(work)
    work.append(work_title)
    part_list = soup.new_tag('part-list')
    score_part = soup.new_tag('score-part', id='P1')
    part_name = soup.new_tag('part-name')
    soup.append(score_partwise)
    score_partwise.append(part_list)
    part_list.append(score_part)
    score_part.append(part_name)
    part_name.append('Piano')
    part = soup.new_tag('part', id='P1')
    score_partwise.append(part)
    attributes = generate_attributes(measure_length, key_number)
    for i in range(num_measures):
        n = np.random.choice(len(treble_tp_key_choices))
        treble_tp_key = treble_tp_key_choices[n]
        n = np.random.choice(len(bass_tp_key_choices))
        bass_tp_key = bass_tp_key_choices[n]
        n = np.random.choice(len(treble_cp_key_choices))
        treble_cp_key = treble_cp_key_choices[n]
        n = np.random.choice(len(bass_cp_key_choices))
        bass_cp_key = bass_cp_key_choices[n]
        measure = generate_measure_for_score(measure_length, key_number, rest_prob,
                                             treble_tp_key, bass_tp_key,
                                             treble_cp_key, bass_cp_key, i + 1)
        if i == 0:
            measure.insert(0, attributes)
        part.append(measure)
    return soup

# with open('sample_score.musicxml', 'w+') as f:
#     f.write(str(generate_score(64, 16, 0, 0.2, treble_tp_key_choices=('quarters',))))

def create_raw_descs(link_inside):
    def _remove_all_attrs(text):
        # removing tag attributes
        for tag in text.find_all(True):
            tag.attrs = {}
        return text

    # FORMING THE DESCRIPTIONS
    page = requests.get(link_inside)  # getting the object from url
    soup = BeautifulSoup(page.content, 'html.parser')  # loading it into the soup
    desc_divs = []  # a list for all descs' divs
    # the code below is a sample, do NOT paste it in your project as is
    """
    main_heading = soup.find("h1")
    main_heading.name = "div"  # change the name for uniformity
    if main_heading:
        desc_divs.append(main_heading)
    else:
        pass
    main_desc = soup.find("div", class_="product-main-text")  # main desc
    if main_desc:
        desc_divs.append(main_desc)
    else:
        pass
    features_table_desc = soup.find("div", class_="title")  # features (table heading)
    if features_table_desc:
        desc_divs.append(features_table_desc)
    else:
        pass
    features_table = soup.find("table", class_="table-striped")  # features table
    if features_table:
        desc_divs.append(features_table)
    else:
        pass
    catalog_detail_block = soup.find("div", class_="catalog_detail_info")  # text after features table
    if catalog_detail_block:
        desc_divs.append(catalog_detail_block)
    else:
        pass
    """
    soup.clear()  # clearing the old soup
    for desc_div in desc_divs:  # loading the new soup with objects from the list
        soup.append(desc_div)
    soup_without_attrs = _remove_all_attrs(soup)  # removing all unnecessary attrs
    return soup_without_attrs

def create_musicxml(path, measure_length, key_number):
    """
    This function takes the path to an uploaded file, its measure_length,
    and its key number (usually info inputted by user) and passes the image
    through the first neural net to extract the measures, then passes each
    measure through the second neural net to convert it to xml.
    The handle_page function covers the first part and the run_model
    function covers the second.
    """
    handle_page(path, measure_length, key_number,
                os.path.join(MEDIA_ROOT, 'current_measures'))
    measures = []
    # initialize the xml output
    soup = BeautifulSoup(features='xml')
    score_partwise = soup.new_tag('score-partwise', version='3.1')
    part_list = soup.new_tag('part-list')
    score_part = soup.new_tag('score-part', id='P1')
    part_name = soup.new_tag('part-name')
    soup.append(score_partwise)
    score_partwise.append(part_list)
    part_list.append(score_part)
    score_part.append(part_name)
    part_name.append('Piano')
    part = soup.new_tag('part', id='P1')
    score_partwise.append(part)
    # loop through each extracted measure and convert it to xml
    # if the conversion fails, return a blank measure
    for i in range(
            len(os.listdir(os.path.join(MEDIA_ROOT, 'current_measures')))):
        print('handling measure ', i + 1)
        measure_soup = run_model(
            os.path.join(MEDIA_ROOT, 'current_measures', f'subimage{i}.png'),
            measure_length, key_number)
        if measure_soup:
            measure = measure_soup.find('measure')
            # only need the key and time sig info on the first measure
            if i != 0:
                attributes = measure.find('attributes')
                attributes.extract()
            measures.append(measure)
            print(f'measure {i+1} successful')
        else:
            blank_measure = get_blank_measure(measure_length)
            measures.append(blank_measure)
            print('error in measure ', i + 1)
    for measure in measures:
        part.append(measure)
    # pick a random filename for the output
    filename = np.random.choice(list('abcdefghijklmnopqrstuvwxyz0123456789'),
                                size=16)
    filename = ''.join(filename)
    with open(os.path.join(MEDIA_ROOT, f'{filename}.musicxml'), 'w+') as f:
        f.write(str(soup))
    return filename

def get_content_html(self, images_url):
    """Returns the html content of the tab, inside a <div> with an id
    attribute.
    """
    # Create wrapper
    soup = BeautifulSoup()
    div_tag = soup.new_tag('div')
    div_tag['id'] = self.html_id
    div_tag.append(BeautifulSoup(self.html_content.get_html(images_url)))
    soup.append(div_tag)
    return str(soup)

def create_body(flair_output):
    soup = BeautifulSoup()
    soup.append(soup.new_tag('text'))
    soup.find('text').append(soup.new_tag('body'))
    soup.body.append(soup.new_tag('div'))
    for paragraph in flair_output:
        paragraph_tag = soup.new_tag('p')
        markup = create_markup_with_entities(paragraph, paragraph_tag, soup)
        soup.div.append(paragraph_tag)
    return soup

def tag(tagname, attrs=None, text=None, dtrs=None):
    """Return a soup Tag element."""
    attrs = {} if attrs is None else attrs
    dtrs = [] if dtrs is None else dtrs
    newtag = BeautifulSoup('', features='lxml').new_tag(tagname, attrs=attrs)
    if text is not None:
        newtag.append(text)
    for dtr in dtrs:
        newtag.append(dtr)
    return newtag

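# Hedged usage sketch for the tag() helper above (it needs lxml installed,
# since it builds tags through the 'lxml' parser). The attribute values and
# tag names here are hypothetical: nested children go in dtrs, string
# content goes in text.
link = tag('a', attrs={'href': '#top'}, text='Contents')
item = tag('li', attrs={'class': 'toc-entry'}, dtrs=[link])
print(item)  # <li class="toc-entry"><a href="#top">Contents</a></li>
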
def add_css_in_page(url, css):
    print('dentro')
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    outros_css = '''<link rel="stylesheet" href="pd-material-viewer.3b3f13a5.css">
<link rel="stylesheet" href="pd-material-viewer~pd-profile~pd-search~pd-subject.f4b332e0.css">'''
    link_ref = BeautifulSoup('<link rel="stylesheet" href="' + css + '">', 'html.parser')
    # note: this appends the stylesheet at the end of the document rather
    # than inside <head>; browsers still apply it
    soup.append(link_ref)
    return soup

def main():
    print("")
    print("######################################")
    print("#                                    #")
    print("# FedEx CBA 2015 Parser Version 1.00 #")
    print("#        Updated 27 June 2020        #")
    print("#                                    #")
    print("######################################")
    print("")
    DEFAULT_INPUT_FILENAME = "fdx_2015_working.html"
    DEFAULT_OUTPUT_FILENAME = "fdx_2015_parsed.html"
    input_filename = input("Enter input filename: ")
    if input_filename == "":
        input_filename = DEFAULT_INPUT_FILENAME
    soup = ""
    try:
        with open(input_filename, "r") as file:
            soup = BeautifulSoup(file, features="html.parser")
    except OSError:
        print("")
        print("File not found. Exiting...")
        print("")
        exit()
    nodes = soup.findAll(["p", "h1", "h2", "h3", "h4", "h5"])
    find_paragraph_id_and_set_node_id(nodes)
    newest_soup = BeautifulSoup()
    for node in nodes:
        nodeId = ""
        if node.has_attr('id'):
            nodeId = node['id']
        node.attrs = {}
        if nodeId != "":
            node['id'] = nodeId
        newest_soup.append(node)
    output_filename = input("Enter output filename: ")
    if output_filename == "":
        output_filename = DEFAULT_OUTPUT_FILENAME
    with open(output_filename, "w") as file:
        file.write(str(newest_soup))

def split_infobox_value(tag: BeautifulSoup) -> List[str]:
    if tag is None:
        return []
    groups = []
    curr = []
    seps = [',', '、', '\n', ';', ';']
    single_seps = ['及', '等', '和', '或']
    counter = PunctuationCounter()
    for content in tag.contents:
        if isinstance(content, NavigableString):
            if content.strip() in single_seps:
                if len(curr) > 0:
                    groups.append(curr)
                    curr = []
            else:
                start_id = 0
                for id, char in enumerate(content):
                    if char in seps and counter.splittable():
                        if id - start_id > 0:
                            curr.append(NavigableString(content[start_id:id]))
                        start_id = id + 1
                        if len(curr) > 0:
                            groups.append(curr)
                            curr = []
                    counter.count(char)
                if counter.splittable() and start_id < len(content):
                    end_id = len(content)
                    if content.endswith('等等'):
                        end_id -= 2
                    elif content.endswith('等'):
                        end_id -= 1
                    if start_id < end_id:
                        curr.append(NavigableString(content[start_id:end_id]))
        elif isinstance(content, Tag):
            if content.name == 'br' and len(curr) > 0:
                groups.append(curr)
                curr = []
            else:
                curr.append(content)
    if len(curr) > 0:
        groups.append(curr)
    tags = []
    for g in groups:
        # Tag() does not take a parser-name string; the name alone is enough
        tag = Tag(name='div')
        for sub in g:
            tag.append(sub)
        tags.append(format_str(tag))
    return tags

def path(path_html):
    if path_html == "habr":
        habr_path = "https://habr.com/"
    else:
        habr_path = "https://habr.com/" + path_html
    if request.headers['Accept'].split(",")[0] == "text/html":
        print("+++++HABR_PATH+++++: ", habr_path)
        page = requests.get(habr_path)
        print("!====!Get Habr_Page!====!")
        page = BeautifulSoup(page.text.encode('utf-8'), "html.parser")
        body_without_scripts = page.body
        scripts = []
        for x in body_without_scripts.find_all("script"):
            scripts.append(x.extract())
        # find six-letter words anywhere in body_without_scripts
        new_rows = [
            re.sub("[^\w]", "", word)
            for word in re.findall(r"\s\b\w{6}\b", body_without_scripts.get_text())
        ]
        myList = sorted(set(new_rows))
        print("!====!Write Word_List!====!")
        with open("templates/my_list_word.txt", "w") as file:
            for word in myList:
                file.write(word + " ")
        # replace the words in body_without_scripts
        for word_in_list in myList:
            # print(word_in_list)
            body_without_scripts = re.sub(
                r"\b" + word_in_list + r"\b",
                " " + word_in_list + "™",
                str(body_without_scripts)
            )
        print("!====!Replace Word_in_Body!====!")
        my_html = BeautifulSoup(body_without_scripts, 'html.parser')
        # add the scripts back into the modified body tag
        for script in scripts:
            my_html.append(script)
        print("!====!Append Script!====!")
        my_html = replace_habr_href_in_body(my_html)
        body_page = str(page.body)
        page_html = str(page).replace(body_page, str(my_html))
        print("!====!Page_Html_is_Done!====!")
        return page_html

def format_recipe(old_soup):
    new_soup = BeautifulSoup('<html><head></head><body></body></html>')
    thumb = old_soup.find(id='cphMiddle_cphMain_imgRecipeThumb')
    if thumb:
        hdr = new_soup.new_tag('img')
        m = re.search(r'recipes/(.+\.jpg)', thumb['src'])
        hdr['src'] = './img/{}'.format(m.group(1))
        new_soup.body.append(hdr)
    # this must come from old_soup; the bare `soup` name was undefined
    source = old_soup.find(id='cphMiddle_cphMain_hlSource')
    title = old_soup.find(id='cphMiddle_cphMain_lblTitle').get_text().strip()
    hdr = new_soup.new_tag('title')
    hdr.append(title)
    new_soup.head.append(hdr)
    hdr = new_soup.new_tag('h1')
    hdr.append(title)
    new_soup.body.append(hdr)
    if source:
        new_soup.body.append(source)
    hdr = new_soup.new_tag('h3')
    hdr.append('Ingredients')
    new_soup.body.append(hdr)
    item = old_soup.find('ul', {'class': 'inggroups'})
    if item:
        new_soup.body.append(item)
    else:
        new_soup.body.append('No ingredients listed')
    hdr = new_soup.new_tag('h3')
    hdr.append('Instructions')
    new_soup.body.append(hdr)
    item = old_soup.find('ol', {'class': 'dirgroupitems'})
    if item:
        new_soup.body.append(item)
    else:
        new_soup.body.append('No instructions listed')
    hdr = new_soup.new_tag('h3')
    hdr.append('Notes')
    new_soup.body.append(hdr)
    notes = old_soup.find(id="cphMiddle_cphMain_lblNotes")
    if notes:
        hdr = new_soup.new_tag('pre')
        hdr.append(notes.get_text())
        # keep the notes inside <body>, not after </html>
        new_soup.body.append(hdr)
    return new_soup

def get_xml(base_xxx, db_package):
    # psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
    # psycopg2.extensions.register_type(psycopg2.extensions.UNICODEARRAY)
    initiate_threaded_connection_pool(db_package)
    with getconnection() as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT id, nom FROM optin_list WHERE abreviation = %s",
                       (str(base_xxx), ))
        records = cursor.fetchone()
        if records:
            optin_id = records[0]
            nom = records[1]
        else:
            optin_id = '0'
            nom = ""
        cursor.execute(
            "SELECT xml FROM criteo_xml WHERE optin_id = %s AND usage = %s",
            (str(optin_id), 'header'))
        records = cursor.fetchone()[0]
        if records:
            header = records
        else:
            header = ""
        cursor.execute(
            "SELECT xml FROM criteo_xml WHERE optin_id = %s AND usage = %s",
            (str(optin_id), 'footer'))
        records = cursor.fetchone()[0]
        if records:
            footer = records
        else:
            footer = ""
    conn_pool.closeall()
    post_dict = {}
    post_dict['id'] = '1'
    post_dict['nom'] = nom
    post_dict['header'] = header
    post_dict['footer'] = footer
    xml_doc = BeautifulSoup(features='xml')
    xml_doc.append(xml_doc.new_tag("bases"))
    xml_doc.bases.append(xml_doc.new_tag("base"))
    cpt_content = 0
    for key, value in post_dict.iteritems():
        xml_doc.bases.base.append(xml_doc.new_tag(str(key)))
        xml_container = xml_doc.bases.base.contents[cpt_content]
        if key == 'footer':
            xml_formatted_value = "<![CDATA[" + value + "]]>"
        else:
            xml_formatted_value = value
        xml_container.append(xml_doc.new_string(xml_formatted_value))
        cpt_content += 1
    xml_feed = xml_doc.prettify()
    # prettify escapes the CDATA markers, so unescape them back
    xml_feed = xml_feed.replace("&lt;", "<").replace("&gt;", ">")
    # .replace("<p>", "").replace("</p>", "")
    return xml_feed

def get_button_html(self):
    """Return the tab button, an <a> inside an <li>.
    """
    soup = BeautifulSoup()
    li_tag = soup.new_tag('li')
    a_tag = soup.new_tag('a')
    a_tag['href'] = '#' + self.html_id
    a_tag.string = self.name
    soup.append(li_tag)
    li_tag.append(a_tag)
    return str(soup)

def _img(html):
    soup = BeautifulSoup(html, 'html.parser').find_all('img', src=True)
    if soup == []:
        soup.append('none')
        return soup  # the original fell through and returned None here
    else:
        img = []
        for key in soup:
            key = key.get('src')
            img.append(unquote(key))
        return img

class ERD:
    def __init__(self, xml=None):
        self._config = Config()
        if xml is not None:
            self.soup = BeautifulSoup(xml, 'xml')
        else:
            self.soup = BeautifulSoup(features='xml')
            self.soup.append(
                self.soup.new_tag(
                    'erModel',
                    **self._config.XML['ErRootAttributes']
                )
            )
        XMLObject.soup = self.soup
        self.entities: Dict[int, Entity] = {}
        self.relations: List[Relation] = []
        self._parse_xml()

    def add_entity(self, entity: Entity):
        assert entity._id not in self.entities
        self.entities[entity._id] = entity

    def add_relation(self, relation: Relation):
        self.relations.append(relation)

    def _parse_xml(self):
        for tag in self.soup.find_all('entity'):
            entity = Entity.from_xml(tag)
            self.add_entity(entity)
        self.relations = [
            Relation.from_xml(tag)
            for tag in self.soup.find_all('relation')
        ]

    def iter_relations(self, filter_):
        for relation in self.relations:
            if filter_(relation):
                yield relation
            elif len(relation) == 2 and filter_(relation.invert()):
                yield relation.invert()

    def to_xml(self):
        soup = BeautifulSoup(features='xml')
        soup.append(
            soup.new_tag('erModel', **self._config.XML['ErRootAttributes'])
        )
        root = soup.find('erModel')
        XMLObject.soup = self.soup
        for entity in self.entities.values():
            root.append(entity.to_xml())
        for relation in self.relations:
            root.append(relation.to_xml())
        return soup

def handle_page_table(filepath):
    soup = BeautifulSoup('<html></html>', 'lxml')
    all_pc_children = get_all_pc_children(filepath)
    table_cells = get_table_cells(all_pc_children)
    for cells in table_cells:
        cells_dict = create_cells_dict(cells)
        table_coordinate = create_table_coordinate(cells_dict)
        html_table = create_html_table(cells_dict, table_coordinate)
        soup.append(html_table)
    return soup

def convertToOSM(lst):
    # this literal template is immediately superseded by the soup below
    ret = """<?xml version='1.0' encoding='UTF-8'?>
<osm version='0.6' upload='false' generator='punktyadresowe_import.php'>
"""
    ret = BeautifulSoup("", "xml")
    osm = ret.new_tag('osm', version='0.6', upload='false',
                      generator='punktyadresowe_import.py')
    ret.append(osm)
    for (node_id, val) in enumerate(lst):
        osm.append(val.asOsmSoup(-1 * (node_id + 1)))
    return ret.prettify()

def getHs(soupH, h, hText):
    Text = BeautifulSoup("", "lxml")
    if h in ValidH:
        allH = soupH.find_all(h)
        for H in allH:
            # match against the heading's text; `hText in H` only worked
            # when the heading had a single text child equal to hText
            if hText in H.get_text():
                # print hText + " found"
                nextSib = H.find_next(True)
                while nextSib is not None and h not in (nextSib.name or ''):
                    # grab the following sibling first: append() moves the
                    # node and would otherwise break the sibling chain
                    following = nextSib.nextSibling
                    Text.append(nextSib)
                    nextSib = following
    return Text

def to_xml(self):
    soup = BeautifulSoup(features='xml')
    soup.append(
        soup.new_tag('erModel', **self._config.XML['ErRootAttributes'])
    )
    root = soup.find('erModel')
    XMLObject.soup = self.soup
    for entity in self.entities.values():
        root.append(entity.to_xml())
    for relation in self.relations:
        root.append(relation.to_xml())
    return soup

def to_xml(self):
    el = BeautifulSoup().new_tag('resource_set')
    el['name'] = self.name
    el['rollaback'] = self.rollback_mode
    el['executed'] = self._executed
    if self._items:
        for item in self._items:
            el.append(item.to_xml())
            el.append('\n')
    return el

def initialize(self):
    # load the file
    with open(self.html, "r") as f:
        soup = BeautifulSoup(f.read(), 'html.parser')
    # set up html file
    soup.append(dom2soup(html(head(), body())))
    soup.head.append(dom2soup(link(rel='stylesheet',
                                   href="https://fonts.googleapis.com/css?family=Open+Sans")))
    soup.head.append(dom2soup(link(rel='stylesheet', href=self.css)))
    soup.body.append(dom2soup(table()))
    return soup

def _script(html):
    soup = BeautifulSoup(html, 'html.parser').find_all('script', src=True)
    if soup == []:
        soup.append('none')
        return soup  # the original fell through and returned None here
    else:
        script = []
        for link in soup:
            link = link.get('src')
            script.append(unquote(link))
        return script

def write_sorting(sorting: Union[MultiSortingExtractor, SortingExtractor],
                  save_path: PathType):
    save_path = Path(save_path)
    if save_path.suffix == '':
        sorting_name = save_path.name
    else:
        sorting_name = save_path.stem
    xml_name = sorting_name
    save_xml_filepath = save_path / (str(xml_name) + '.xml')
    assert not save_path.is_file(), "'save_path' should be a folder"
    if not save_path.is_dir():
        os.makedirs(save_path)
    if save_xml_filepath.is_file():
        raise FileExistsError(f'{save_xml_filepath} already exists!')
    soup = BeautifulSoup("", 'xml')
    new_tag = soup.new_tag('samplingrate')
    new_tag.string = str(sorting.get_sampling_frequency())
    soup.append(new_tag)
    # write parameters file
    with open(save_xml_filepath, "w") as f:
        f.write(str(soup))
    if isinstance(sorting, MultiSortingExtractor):
        counter = 1
        for sort in sorting.sortings:
            # Create and save .res.%i and .clu.%i files from the current sorting object
            save_res = save_path / f'{sorting_name}.res.{counter}'
            save_clu = save_path / f'{sorting_name}.clu.{counter}'
            counter += 1
            res, clu = _extract_res_clu_arrays(sort)
            np.savetxt(save_res, res, fmt='%i')
            np.savetxt(save_clu, clu, fmt='%i')
    elif isinstance(sorting, SortingExtractor):
        # assert units have group property
        assert 'group' in sorting.get_shared_unit_property_names()
        sortings, groups = get_sub_extractors_by_property(sorting, 'group',
                                                          return_property_list=True)
        for (sort, group) in zip(sortings, groups):
            # Create and save .res.%i and .clu.%i files from the current sorting object
            save_res = save_path / f'{sorting_name}.res.{group}'
            save_clu = save_path / f'{sorting_name}.clu.{group}'
            res, clu = _extract_res_clu_arrays(sort)
            np.savetxt(save_res, res, fmt='%i')
            np.savetxt(save_clu, clu, fmt='%i')

def block():
    with open('templates/map1.html') as inf:
        txt = inf.read()
        soup = BeautifulSoup(txt, "html.parser")
    block_cont = '{% block content %}'
    end_cont = '{% endblock %}'
    soup.append(block_cont)
    soup.append(end_cont)
    # write back to the html file
    with open('templates/map1.html', "w") as outf:
        outf.write(str(soup))

def header_content_extraction(soup, headers_list):
    # BS is assumed to be the imported BeautifulSoup class
    # (e.g. `from bs4 import BeautifulSoup as BS`)
    for x in soup.find_all():
        if len(x.text) == 0:
            x.extract()
    section_dict = {}
    section_dict_bullets = {}
    section_dict_bold = {}
    for header in range(len(headers_list)):
        header_tag = soup.find(headers_list[header])
        if header_tag is None:
            break
        header_tag_list = header_tag.parent.findChildren(headers_list[header])
        if len(header_tag_list) == 0:
            break
        for component_tag in header_tag_list:
            header_tag_siblings = component_tag.nextSiblingGenerator()
            header_tag_sibling_list = []
            header_tag_sibling_tag_list = []
            within_para_bold_tag_list = []
            for header_tag_sibling in header_tag_siblings:
                if header_tag_sibling.name in (headers_list[:(header + 1)]):
                    if header_tag_sibling_list:
                        section_dict[component_tag.get_text() + '[Full Contents]'] = \
                            ' '.join(header_tag_sibling_list)
                    if within_para_bold_tag_list:
                        section_dict_bold[component_tag.get_text() + '[Bold Text]'] = \
                            ' '.join(within_para_bold_tag_list)
                    new_tag = BS('').new_tag('kghtmlextractiontag')
                    for bullet_tag in header_tag_sibling_tag_list:
                        new_tag.append(bullet_tag)
                    bundled_bullet_tag_list = new_tag.find_all('p', class_='list_Paragraph')
                    bundled_bullet_text_list = []
                    within_para_bold_tag_list = []
                    try:
                        bundled_bullet_text_list = [j.get_text() for j in bundled_bullet_tag_list]
                    except AttributeError:
                        pass
                    if bundled_bullet_text_list:
                        section_dict_bullets[component_tag.get_text() + '[Bullets Only]'] = \
                            ' '.join(bundled_bullet_text_list)
                    del new_tag
                    break
                try:
                    header_tag_sibling_tag_list.append(header_tag_sibling)
                    header_tag_sibling_list.append(header_tag_sibling.get_text())
                    within_para_bold_tag_list += [bold_tag.get_text()
                                                  for bold_tag in header_tag_sibling.find_all('b')]
                except AttributeError:
                    pass
    full_content_dict = {**section_dict, **section_dict_bullets, **section_dict_bold}
    return full_content_dict

def _link(html):
    soup = BeautifulSoup(html, 'html.parser').find_all('link', href=True)
    if soup == []:
        soup.append('none')
        return soup  # the original fell through and returned None here
    else:
        link = []
        for key in soup:
            key = key.get('href')
            link.append(unquote(key))
        return link

def postprocess(self, text):
    # Hack for unescape special chars
    for key, value in markdown2.g_escape_table.iteritems():
        text = text.replace(value, key)
    urls = set(URL_RE.findall(text))
    # Treat images as gallery
    post = BeautifulSoup(text, 'html.parser')
    imgs = post.find_all('img')
    gallery = Tag(name='div', attrs={
        'class': "gallery",
        'style': 'display: none;',
        'id': hashlib.md5(text.encode('utf-8')).hexdigest(),
    })
    img_urls = [img['src'] for img in imgs]
    for img in imgs:
        img.extract()
        img.attrs.update({
            'data-image': img['src'],
            'data-description': img['alt'],
        })
        gallery.append(img)
    # Add url as web rich object
    wros = ''
    for url in urls:
        if HTMLParser().unescape(url) in img_urls:
            continue
        try:
            wro = WebRichObject.objects.create_or_update_from_url(url)
            if wro.type != 'image':
                wros += wro.get_widget(video_width="100%", video_height='320px')\
                           .decode('utf8')
            else:
                img = Tag(name='img', attrs={
                    'alt': wro.description or '',
                    'src': wro.url,
                    'data-image': wro.url,
                    'data-description': wro.description or ''
                })
                gallery.append(img)
        except IOError as err:
            print err
    post.append(gallery)
    text = urlize_html(unicode(post))
    text += wros
    return text

class WindowsEventData(object):
    def __init__(self):
        self.data_list = []
        self.soup = BeautifulSoup(features='lxml')

    @property
    def get(self):
        return self.soup

    def add(self, name, text):
        new_tag = self.soup.new_tag(name)
        self.soup.append(new_tag)
        new_tag.string = text

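# Hedged usage sketch for WindowsEventData above; the field names and values
# are made up. new_tag() preserves the case given, since the names never pass
# through the parser.
evt = WindowsEventData()
evt.add('Channel', 'Security')
evt.add('EventID', '4624')
print(evt.get)  # <Channel>Security</Channel><EventID>4624</EventID>
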
def search(query):
    query = query.replace(" ", "+")
    url = "https://www.youtube.com/results?search_query=" + query
    h3 = BeautifulSoup(requests.get(url).text, "html.parser").find(
        'h3', {'class': 'yt-lockup-title'})
    x = BeautifulSoup('', "html.parser")
    child = h3.findChildren()
    for c in child:
        x.append(c)
    watch_url = ''
    for link in x.findAll('a', {'rel': 'spf-prefetch'}):
        watch_url = link.get('href')
    yt_url = "https://www.youtube.com" + watch_url
    return yt_url

def getTextAndImg(page):
    wxsoup = BeautifulSoup(page, 'html.parser')
    bodyElem = wxsoup.body
    # print bodyElem.prettify()
    # print bodyElem.contents[0]
    # loop through the children
    newsoup = BeautifulSoup('')
    btagnew = newsoup.new_tag('div')
    newsoup.append(btagnew)
    # processChildren2(bodyElem, btagnew, newsoup)
    processChildren(bodyElem, btagnew, newsoup)
    return newsoup.prettify()

def start(xmlfilename, prsfilename):
    # features must be passed by keyword, otherwise "lxml" is parsed as markup
    data = BeautifulSoup(features="lxml")
    # new_tag is an instance method; call it on the soup itself
    data.append(data.new_tag("body"))
    prslines = []
    with open(prsfilename, "r", encoding='utf-8') as file:
        prslines = file.read().split('\n')
    res = sentstoxml(readtable(prslines), data.body)
    with open(xmlfilename, "wb") as file:
        file.write(res.prettify("utf-8"))

def export_to_xml(roots, version):
    """ Converts the intermediate structure to a soup and saves the xml """
    for root in roots:
        with open(f'{US_XML_PATH}/{root["itempath"][1:]}_{version}.xml', "wb") as f:
            soup = BeautifulSoup("", "lxml")
            soup.append(doc_to_soup(root, soup, 0, version, root=True))
            remove_unnecessary_subseqitems(soup)
            add_keys_to_items(soup, f'{root["itempathcomponents"][0]}_{version}')
            f.write(soup.encode("utf-8"))

def wrap_ul(tags):
    all_li = tags.find_all("li")
    if len(all_li) > 0 and all_li[0].parent.name != "ul":
        for item in tags:
            if item.find("li") and item.name == "p":
                item.attrs = {"class": "p_ul"}
            else:
                break
        ul = BeautifulSoup(features="html.parser").new_tag('ul')
        all_li[0].insert_before(ul)
        for li in all_li:
            if li.parent.name != "ul":
                ul.append(li)

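# Hedged usage sketch for wrap_ul above; the markup is illustrative. Loose
# <li> elements whose parent is not already a <ul> get collected under a
# newly created <ul> in place.
from bs4 import BeautifulSoup

frag = BeautifulSoup('<div><li>a</li><li>b</li></div>', 'html.parser')
wrap_ul(frag.div)
print(frag)  # <div><ul><li>a</li><li>b</li></ul></div>
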
def generate_xml(data):
    soup = BeautifulSoup(features='xml')
    soup.append(soup.new_tag('condition'))
    for i in data.keys():
        item = soup.new_tag('item')
        name = soup.new_tag('name')
        name.string = i
        item.append(name)
        value = soup.new_tag('value')
        value.string = data[i]
        item.append(value)
        soup.condition.append(item)
    return soup.decode(eventual_encoding='GBK')

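# Hedged usage sketch for generate_xml above; the data dict is made up and
# its values must be strings, and lxml must be installed for the 'xml'
# features. Each key/value pair becomes an <item> with <name> and <value>
# children; the declaration advertises GBK per the decode() call.
xml_text = generate_xml({'temperature': '23', 'humidity': '40%'})
print(xml_text)  # <?xml ... encoding="GBK"?><condition><item>...</item></condition>
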
def getLinks(links, useTitleAsKey=False):
    linksDict = {}
    soup = BeautifulSoup('', features='html.parser')
    for link in links:
        soup.append(link)
    for a in soup.find_all('a'):
        if useTitleAsKey and a.has_attr('title') and a.has_attr('href'):
            linksDict[a['title']] = a['href']
        else:
            if a.string != None and a.has_attr('href') \
                    and a.string in string.ascii_letters:
                linksDict[a.string] = 'https:' + a['href']
    return linksDict

def display_people(graph, iri):
    types_of = list(set([o for o in graph.objects(subject=iri,
                                                  predicate=rdflib.RDF.type)]))
    if BF.Person in types_of:
        return ''
    output = BeautifulSoup()
    for year_iri in KNOWLEDGE_GRAPH.subjects(predicate=SCHEMA.organizer, object=iri):
        year_label = KNOWLEDGE_GRAPH.value(subject=year_iri,
                                           predicate=rdflib.RDFS.label)
        div = output.new_tag("div")
        h2 = output.new_tag("h2")
        h2.string = year_label
        div.append(h2)
        people = dict()
        h3 = output.new_tag("h3")
        h3.string = "People"
        div.append(h3)
        ul = output.new_tag("ul")
        for pred, obj in KNOWLEDGE_GRAPH.predicate_objects(subject=year_iri):
            if isinstance(obj, rdflib.URIRef):
                pred_label = get_label(KNOWLEDGE_GRAPH, pred)
                obj_label = get_label(KNOWLEDGE_GRAPH, obj)
                for type_ in KNOWLEDGE_GRAPH.objects(subject=obj,
                                                     predicate=rdflib.RDF.type):
                    if type_ == BF.Person:
                        li = output.new_tag("li")
                        person_a = output.new_tag("a", href=str(obj))
                        if obj_label is None:
                            person_a.string = str(obj)
                        else:
                            person_a.string = obj_label
                        li.append(person_a)
                        if pred_label is not None:
                            title = output.new_tag('span')
                            title.string = ", {}".format(pred_label)
                            li.append(title)
                        ul.append(li)
                        if pred in people:
                            people[pred]["persons"].append({"iri": obj,
                                                            "name": obj_label})
                        else:
                            people[pred] = {"persons": [{"iri": obj,
                                                         "name": obj_label}],
                                            "label": pred_label}
        div.append(ul)
        output.append(div)
    return output.decode(pretty_print=True)

def get_span(self, start, end):
    """Given indices (start, end) in the pure-text version of the htmlString
    this object is initialized with, returns the html string that corresponds
    to the specified text string
    """
    # we need to copy so that we don't destroy self._top_level_ranges
    # converting to a string and reparsing is much faster than doing a deepcopy
    top_level_ranges = [{'el': BeautifulSoup(unicode(r['el'])), 'range': r['range']}
                        for r in self._get_applicable_ranges(self._top_level_ranges,
                                                             start, end)]
    # create a new top-level soup so that we can modify elements in place
    result = BeautifulSoup()
    for r in top_level_ranges:
        result.append(r['el'])
    if len(top_level_ranges) == 0:
        return u''
    elif len(top_level_ranges) == 1:
        range_offset = top_level_ranges[0]['range'][0]
        inner_start = start - range_offset
        inner_end = end - range_offset
        self._modify_ranges(self._get_applicable_ranges(
            self._get_text_indices(top_level_ranges[0]['el']),
            inner_start, inner_end), inner_start, inner_end)
    else:
        range_offset = top_level_ranges[0]['range'][0]
        inner_start = start - range_offset
        inner_end = end - range_offset
        self._modify_ranges(
            self._get_applicable_ranges(
                self._get_text_indices(top_level_ranges[0]['el']),
                inner_start, inner_end),
            start=inner_start)
        range_offset = top_level_ranges[-1]['range'][0]
        inner_start = start - range_offset
        inner_end = end - range_offset
        self._modify_ranges(
            self._get_applicable_ranges(
                self._get_text_indices(top_level_ranges[-1]['el']),
                inner_start, inner_end),
            end=inner_end)
    return unicode(result)

def merget_element(self, soup):
    _s = BeautifulSoup("")
    l = ['span']  # nested spans
    for e in soup.find_all(l):
        es = e.find_all(True, recursive=False)
        if es:
            continue
        _ = to_unicode(' '.join(e.get_text().strip().split())).strip()
        if _:
            e.string = _
    for e in soup.find_all(l):
        e.unwrap()
    for e in soup.find_all('p'):
        _s.append(e)
    return _s

def create_note(note_data, soup):
    """Create an ENEX note element"""
    note = soup.new_tag('note')
    title = soup.new_tag('title')
    title.string = note_data.title
    note.append(title)
    content_inside = BeautifulSoup(features="xml")
    content_inside.append(Doctype('en-export SYSTEM "http://xml.evernote.com/pub/evernote-export3.dtd"'))
    content_inside_note = soup.new_tag('en-note')
    content_inside_note.string = note_data.content
    content_inside.append(content_inside_note)
    # Holy crap this is super hacky and horrible but I don't want to fight with
    # BeautifulSoup to make it not convert all the text to HTML entities, so
    # manually convert everything back to < and >
    content_inside_str = str(content_inside).replace('&lt;', '<').replace('&gt;', '>')
    content = soup.new_tag('content')
    content.string = CData(content_inside_str)
    note.append(content)
    created = soup.new_tag('created')
    created.string = str(note_data.created)
    note.append(created)
    updated = soup.new_tag('updated')
    updated.string = str(note_data.updated)
    note.append(updated)
    for single_tag in note_data.tags:
        if single_tag is not None:
            tag = soup.new_tag('tag')
            tag.string = single_tag
            note.append(tag)
    attributes = soup.new_tag('note-attributes')
    author = soup.new_tag('author')
    author.string = "Andrew Heiss"
    attributes.append(author)
    note.append(attributes)
    return note

def convert_to_dynetml(self, is_entire_file=False):
    """
    Converts the graph to dynetml and returns a BeautifulSoup tag

    :param is_entire_file: if True, wraps value as a soup. If False, returns the top tag
    :type is_entire_file: bool
    :return: bs4.element.Tag
    :raise TypeError: if is_entire_file isn't a bool
    """
    dmlpu.check_type(is_entire_file, 'is_entire_file', bool)
    bs = BeautifulSoup(features='xml')
    bs.append(bs.new_tag('MetaNetwork'))
    for attr in self.attributes:
        bs.MetaNetwork[attr] = dmlpu.unformat_prop(self.attributes[attr])
    bs.MetaNetwork.append(dmlpu.get_property_identities_tag(self.propertyIdentities))
    bs.MetaNetwork.append(bs.new_tag('properties'))
    for key in self.properties:
        prop_tag = bs.new_tag('property')
        prop_tag['id'] = key
        prop_tag['value'] = dmlpu.unformat_prop(self.properties[key])
        bs.MetaNetwork.properties.append(prop_tag)
    bs.MetaNetwork.append(bs.new_tag('nodes'))
    for class_type in self.__node_tree:
        for class_id in self.__node_tree[class_type]:
            nodeclass_tag = bs.new_tag('nodeclass', type=class_type, id=class_id)
            nodeclass_tag.append(dmlpu.get_property_identities_tag(
                self.__node_tree[class_type][class_id][0]))
            for key in self.__node_tree[class_type][class_id][1]:
                node_tag = bs.new_tag('node', id=key)
                for attr in self.__node_tree[class_type][class_id][1][key][0]:
                    node_tag[attr] = dmlpu.unformat_prop(
                        self.__node_tree[class_type][class_id][1][key][0][attr])
                node_tag.append(dmlpu.get_properties_tag(
                    self.__node_tree[class_type][class_id][1][key][1]))
                nodeclass_tag.append(node_tag)
            bs.MetaNetwork.nodes.append(nodeclass_tag)
    networks_tag = self._get_networks_tag()
    # no <networks> element exists yet, so attach the tag directly to MetaNetwork
    bs.MetaNetwork.append(networks_tag)
    if not is_entire_file:
        bs = bs.MetaNetwork
    return bs

def extract_toc(content):
    if isinstance(content, contents.Static):
        return
    soup = BeautifulSoup(content._content, 'html.parser')
    filename = content.source_path
    extension = path.splitext(filename)[1][1:]
    toc = None
    # if it is a Markdown file
    if extension in readers.MarkdownReader.file_extensions:
        toc = soup.find('div', class_='toc')
        if toc:
            toc.extract()
    # else if it is a reST file
    elif extension in readers.RstReader.file_extensions:
        toc = soup.find('div', class_='contents topic')
        if toc:
            toc.extract()
        if toc:
            tag = BeautifulSoup(str(toc))
            tag.div['class'] = 'toc'
            tag.div['id'] = ''
            p = tag.find('p', class_='topic-title first')
            if p:
                p.extract()
            toc = tag
    elif extension in ['org']:
        toc = soup.find('div', id="table-of-contents")
        if toc:
            toc.extract()
            tag = BeautifulSoup(str(toc))
            tag.div['class'] = 'toc'
            tag.div['id'] = ''
            p = tag.find('p', class_='topic-title first')
            if p:
                p.extract()
            h2 = tag.find('h2')  # 'Table of Contents'
            if h2:
                h2.extract()
            orgfile = path.basename(content.source_path)
            tag.append(BeautifulSoup('<a href="%s">Org source</a>' % orgfile))
            toc = tag
    elif not toc:
        # Pandoc reader
        toc = soup.find('nav', id='TOC')
    if toc:
        toc.extract()
        content._content = soup.decode()
        content.toc = toc.decode()
        if content.toc.startswith('<html>'):
            content.toc = content.toc[12:-14]

def description_short(self):
    # get up to 50 words
    soup = BeautifulSoup(self.description)
    truncated = self.keep_first_nwords(soup, 50)
    if truncated:
        more_info_link = soup.new_tag(
            'a', href=urlresolvers.reverse('event', kwargs={'pk': str(self.id)}))
        more_info_link['class'] = 'more_info_link'
        more_info_link.append('[...]')
        soup.append(more_info_link)
    # get rid of paragraphs
    for p_tag in soup.findAll('p'):
        # ensure paragraph ends with a space before we flatten it
        p_tag.append(' ')
        p_tag.unwrap()
    return safestring.mark_safe(soup.decode(formatter='html'))

def get_html(self, images_url):
    """Get the html for this image tag.

    When you click on the image, it links to a big version of the image.
    """
    img_original_url = images_url + self.original_name
    img_resized_url = images_url + self.new_name
    # Create the new image tag
    a_img_soup = BeautifulSoup()
    a_tag = a_img_soup.new_tag('a')
    a_tag['href'] = img_original_url
    img_tag = a_img_soup.new_tag('img')
    img_tag['src'] = img_resized_url
    a_img_soup.append(a_tag)
    a_tag.append(img_tag)
    new_tag = str(a_img_soup)
    # Return
    return new_tag