Example #1
def extend(best_candidate, candidates):
    """Enrich the best candidate by tacking on good-ranking siblings"""

    threshold = max([10, best_candidate['score'] * 0.5])
    soup = BeautifulSoup()
    soup.append(best_candidate['el'])
    for sibling in best_candidate['el'].parent.children:
        if type(sibling) != Tag or sibling == best_candidate['el']:
            continue
        append = False 
        if sibling in candidates and candidates[sibling]['score'] >= threshold:
            append = True
        
        if sibling.name == "p":
            density = link_density(sibling)
            content = sibling.get_text(strip=True) or ""
            length = len(content)

            if length > 80 and density < 0.25:
                append = True
            elif length < 80 and density == 0 and patterns.punctuation.search(content):
                append = True
        if append:
            soup.body.append(sibling)
    return soup
Example #2
def mkv_metadata(video):
    root = BeautifulSoup(features='xml')
    root.append(Doctype('Tags SYSTEM "matroskatags.dtd"'))
    tags = root.new_tag("Tags")
    tag = root.new_tag("Tag")
    tags.append(tag)
    root.append(tags)
    keep = ('title', 'description', 'url', 'genre')
    targets = root.new_tag("Targets")
    ttv = root.new_tag("TargetTypeValue")
    ttv.string = str(50)
    targets.append(ttv)
    tag.append(targets)
    for key in video:
        if not key in keep:
            continue
        simple = root.new_tag('Simple')
        name = root.new_tag('Name')
        name.string = key.upper()
        simple.append(name)
        sstring = root.new_tag('String')
        sstring.string = video[key]
        simple.append(sstring)
        tag.append(simple)
    return str(root)
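A minimal usage sketch for the function above; the video dict below is illustrative, not taken from the source project, and the 'xml' tree builder it relies on requires lxml:
from bs4 import BeautifulSoup
from bs4.element import Doctype

video = {'title': 'Pilot', 'description': 'First episode', 'url': 'https://example.com/ep1', 'genre': 'Drama'}
print(mkv_metadata(video))  # Matroska Tags XML built entirely with new_tag()/append()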
Example #3
 def _process_toc(self):
     """Creates a toc based on the headings, h1 to h6.
     """
     # Create soups
     soup = BeautifulSoup(self.html_str)
     toc_soup = BeautifulSoup()
     # Create the new tags for toc
     div_tag = toc_soup.new_tag('div')
     div_tag['class'] = 'toc'
     h2_tag = toc_soup.new_tag('h2')
     a_tag = toc_soup.new_tag('a')
     a_tag['href'] = '#top'
     a_tag.string = 'Contents'
     ul_tag = toc_soup.new_tag('ul')
     h2_tag.append(a_tag)
     toc_soup.append(div_tag)
     div_tag.append(h2_tag)
     div_tag.append(ul_tag)
     # For each heading, add an li
     for heading in soup.find_all(['h1', 'h2', 'h3', 'h4', 'h5', 'h6']):
         li_tag = toc_soup.new_tag('li')
         li_tag['class'] = heading.name
         a_tag = toc_soup.new_tag('a')
         a_tag.string = heading.string
         a_tag['href'] = '#' + heading['id']
         li_tag.append(a_tag)
         ul_tag.append(li_tag)
     #Add the toc to the end of the content
     toc_soup.append(soup)
     self.html_str = str(toc_soup)
Example #4
def indexHTML(links):
	from bs4 import BeautifulSoup
	soup = BeautifulSoup()

	html	= soup.new_tag('html')
	head	= soup.new_tag('head')
	meta	= soup.new_tag('meta', charset="utf-8")
	title	= soup.new_tag('title')
	body	= soup.new_tag('body')

	soup.append(html)
	html.append(head)
	head.append(meta)
	head.append(title)
	title.append('Ryosuke Ogata | Final Project')
	html.append(body)

	# create img tags with image links as 'src'
	for srcs in links:
		img = soup.new_tag('img', src=srcs, width='20%')
		body.append(img)

	with open('index.html', 'w') as h:
		h.write(soup.prettify())
Example #5
    def parse_begin_xxx(self, m, root):
        symbol = m.group(1)
        if symbol in ['html', 'HTML']:
            new_tag = BeautifulSoup(m.group(2), 'html.parser').contents[0]
        elif symbol in ['example', 'EXAMPLE']:
            new_tag = self.soup.new_tag('pre')
            new_tag['class'] = 'example'
            new_tag.string = m.group(2)
        elif symbol in ['quote', 'QUOTE']:
            new_tag = self.soup.new_tag('blockquote')
            # new_tag.string = m.group(2)
            for part in re.split('\n{2,}', m.group(2)):
                new_p_tag = self.soup.new_tag('p')
                new_p_tag.string = part
                new_tag.append(new_p_tag)
        elif symbol in ['verse', 'VERSE']:
            new_tag = self.soup.new_tag('p')
            new_tag['class'] = 'verse'
            new_tag.string = m.group(2)
        elif symbol in ['center', 'CENTER']:
            new_tag = self.soup.new_tag('div')
            new_tag['class'] = 'center'
            new_tag.string = m.group(2)
        else:
            raise RuntimeError('Not supported begin symbol: %s' % symbol)

        root.append(new_tag)
Example #6
    def from_directory(cls, directory):
        """Build an AdventureDoc by processing a directory.

        Arguments:
            directory (str): Path to the directory containing
                the ORDER file along with the sections as
                markdown files.

        Returns:
            AdventureDoc:

        """

        ordered_section_file_names = cls.get_order(directory)
        all_sections_soup = BeautifulSoup('', 'html.parser')

        for file_name in ordered_section_file_names:
            # The ORDER file specifies each filename relative to itself, so
            # we must prepend the directory these files are in to read them.
            file_path = os.path.join(directory, file_name)

            with open(file_path) as f:
                file_contents = f.read()

            section_soup = cls.build_section(file_contents, file_name,
                                             ordered_section_file_names)
            all_sections_soup.append(section_soup)

        cls.put_in_nice_bowl(all_sections_soup)

        return AdventureDoc(all_sections_soup)
Example #7
 def ToHtml(self, soup):
     if self.title:
         chapter = BeautifulSoup('<div><h1 class="chapter"></h1></div>')
         chapter.h1.append(self.title)
         chapter.append(self.contents)
         return chapter
     return self.contents
Example #8
def index(request):
    # return HttpResponse('Hello from Python!')
    # return render(request, 'index.html' )
    resultsParser = ResultsParser()
    resultsModel = resultsParser.parse('http://cfrsolo2.com/2016/04-17-16-brooksville_fin.htm')
    # return render(request, 'adrian0.html')
    # r = requests.get('http://httpbin.org/status/418')
    # print r.text
    # return HttpResponse('<pre>' + r.text + '</pre>')
    soup = BeautifulSoup()

    new_img_tag = soup.new_tag("img", style='position: absolute; top: 0; right: 0; border: 0;', src="https://camo.githubusercontent.com/e7bbb0521b397edbd5fe43e7f760759336b5e05f/68747470733a2f2f73332e616d617a6f6e6177732e636f6d2f6769746875622f726962626f6e732f666f726b6d655f72696768745f677265656e5f3030373230302e706e67")
    new_a_tag = soup.new_tag("a", href='https://github.com/orozcoadrian/race-graphs')
    new_a_tag.append(new_img_tag)
    soup.append(new_a_tag)

    years = get_years_from_homepage()

    for year in years:
        new_a_tag = soup.new_tag("a", href=year)
        new_a_tag.string = year
        soup.append(new_a_tag)
        new_a_tag.append(soup.new_tag('br'))
    # self.wfile.write(soup.prettify())
    return HttpResponse(soup.prettify())
Example #9
 def parse_existing_html_code(self):
     html_doc = self.driver.find_element_by_id('Some_id_from_form')
     # ex = open('example.html', 'r')
     # html_doc = ex.read()
     # ex.close()
     # -------------- ^^^^^^^^^^
     soup = BeautifulSoup(html_doc, 'html.parser')
     elements = soup.find_all()
     # select only root elements
     elements = [el for el in elements if el.parent == soup]
     upper_elements = BeautifulSoup()
     for el in elements:
         upper_elements.append(el)
         if el.text.lower().startswith('enjoy'):
             break
     # try to find main type of flyers
     # <b id="main_flyer_type"><!-- Summer Flyers --></b>
     soup = BeautifulSoup(html_doc, 'html.parser')
     main_type_tag = soup.find(id="main_flyer_type")
     if main_type_tag:
         upper_elements.append(main_type_tag)
         main_flyer_type = main_type_tag.string.strip().lower()\
             .replace(' ', '_')
     else:
         main_flyer_type = None
     upper_part = upper_elements.prettify(formatter='html')
     return upper_part, main_flyer_type
Example #10
def create_one_zip_file(zip_archive_number):
    inMemoryOutputFile = StringIO()
    zipFile = ZipFile(inMemoryOutputFile, 'w') 

    for xml_file_number in xrange(1, XML_FILES_COUNT + 1):
        soup = BeautifulSoup(features='xml')
        soup.append(soup.new_tag("root"))
        var_id = soup.new_tag('var', value=str(uuid4()))
        var_id['name'] = 'id'
        soup.root.append(var_id)
        var_level = soup.new_tag('var', value=str(random.randint(1, 100)))
        var_level['name'] = 'level'
        soup.root.append(var_level)
        soup.root.append(soup.new_tag('objects'))
        for i in xrange(1, random.randint(1, 10)):
            new_object = soup.new_tag('object')
            new_object['name'] = str(uuid4())
            soup.root.objects.append(new_object)

        zipFile.writestr('%s.xml' % xml_file_number, str(soup))

    zipFile.close()
    inMemoryOutputFile.seek(0)

    with open('%s/%s.zip' % (GENERATED_FILES_DIR, zip_archive_number), 'w') as fd:
        fd.write(inMemoryOutputFile.getvalue())
Example #11
def build_rss(url, list_selector, item_selector, ignored_qp, output, pretty=False):
    try:
        soup = BeautifulSoup('<rss version="2.0" />', "xml")
        rss = soup.rss
        has_lxml = True
    except FeatureNotFound:
        rss = BeautifulSoup('<rss version="2.0" />').rss
        has_lxml = False

    r = requests.get(url)
    list_html = (BeautifulSoup(r.text, "lxml") if has_lxml else BeautifulSoup(r.text)).html

    channel = Tag(name="channel")
    rss.append(channel)
    channel.append(new_tag("title", list_html.head.title.string))
    channel.append(new_tag("link", url))
    channel.append(new_tag("description", "--"))
    channel.append(new_tag("lastBuildDate", time.strftime("%a, %d %b %Y %H:%M:%S +0000", time.gmtime())))
    channel.append(new_tag("generator", "RSS Builder"))

    item_urls = list_html.select(list_selector)
    for item_url in map(lambda i: i["href"], item_urls):
        item_url = urlparse.urljoin(url, item_url)
        parsed = urlparse.urlparse(item_url)
        query_params = urlparse.parse_qsl(parsed.query)
        item_url = urlparse.urlunparse(
            (
                parsed.scheme,
                parsed.netloc,
                parsed.path,
                parsed.params,
                "&".join([k + "=" + v for k, v in query_params if k not in ignored_qp]),
                parsed.fragment,
            )
        )

        r = requests.get(item_url)
        item_html = (BeautifulSoup(r.text, "lxml") if has_lxml else BeautifulSoup(r.text)).html

        item = Tag(name="item")
        item.append(new_tag("title", item_html.head.title.string))
        item.append(new_tag("link", item_url))
        item.append(new_tag("description", str(item_html.select(item_selector)[0])))
        channel.append(item)

    out_func = lambda x: (x.prettify() if pretty else unicode(x)).encode("utf-8")
    if output == "-":
        out_file = sys.stdout
        close_file = lambda: None
    else:
        out_file = open(output, "w")
        close_file = out_file.close

    if has_lxml:
        out_file.write(out_func(soup))
    else:
        out_file.write('<?xml version="1.0" encoding="UTF-8" ?>\n')
        out_file.write(out_func(rss))
    out_file.write("\n")
    close_file()
Example #12
    def get_new_body(self):
        new_soup = BeautifulSoup('<html><head></head><body></body></html>')

        thumb = self.get_thumbnail()
        if thumb:
            hdr = new_soup.new_tag('img')

            hdr['src'] = './img/{}'.format(self.id + '.jpg')
            new_soup.body.append(hdr)

        #Title
        title = self.get_title()
        hdr = new_soup.new_tag('title')
        hdr.append(title)
        new_soup.head.append(hdr)
    
        hdr = new_soup.new_tag('h1')
        hdr.append(title)
        new_soup.body.append(hdr)

        #source
        source = self.soup.find(id='cphMiddle_cphMain_hlSource')
        if source:
            new_soup.body.append(source)

        #ingredients
        hdr = new_soup.new_tag('h3')
        hdr.append('Ingredients')
        new_soup.body.append(hdr)
    
        item = self.soup.find('ul', {'class':'inggroups'})
        if item:
            new_soup.body.append(item)
        else:
            new_soup.body.append('No ingredients listed')

        #instructions 
        hdr = new_soup.new_tag('h3')
        hdr.append('Instructions')
        new_soup.body.append(hdr)
    
        item = self.soup.find('ol', {'class':'dirgroupitems'})
        if item:
            new_soup.body.append(item)
        else:
            new_soup.body.append('No instructions listed')

        #Notes 
        hdr = new_soup.new_tag('h3')
        hdr.append('Notes')
        new_soup.body.append(hdr)
    
        notes = self.soup.find(id="cphMiddle_cphMain_lblNotes")
        if notes:
            hdr = new_soup.new_tag('pre')
            hdr.append(notes.get_text())
            new_soup.append(hdr)
    
        return new_soup.prettify('latin-1')
Example #13
def extract_body_from_html(html_soup):
	"""Return an XML beautiful soup object with the <body> of the input HTML file"""

	body = html_soup.body.extract()
	xml_soup = BeautifulSoup('', 'xml')
	xml_soup.append(body)

	return xml_soup
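A quick sketch of calling the helper above; the markup is made up for illustration:
from bs4 import BeautifulSoup

html_soup = BeautifulSoup('<html><head><title>t</title></head><body><p>hello</p></body></html>', 'html.parser')
print(extract_body_from_html(html_soup).prettify())  # only the <body> subtree, re-rooted in a fresh XML soup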
Example #14
class CimXML():

    '''Class that represents the component data in the CIM standard'''

    def __init__(self, scene):
        self.scene = scene

        self.cim_xml = BeautifulSoup()
        self.cim_xml.append(self.cim_xml.new_tag("Node"))
        self.cim_xml.find('Node').append(self.cim_xml.new_tag("Breaker"))

        for item in scene.items():
            if isinstance(item, Node):

                if item.myItemType == item.Religador:
                    tag_id = self.cim_xml.new_tag(str(item.id))
                    self.cim_xml.find("Breaker").append(tag_id)

                    tag_rc = self.cim_xml.new_tag("ratedCurrent")
                    tag_rc.append(str(item.chave.ratedCurrent))
                    tag_id.append(tag_rc)

                    tag_itt = self.cim_xml.new_tag("inTransitTime")
                    tag_itt.append(str(item.chave.inTransitTime))
                    tag_id.append(tag_itt)

                    tag_bc = self.cim_xml.new_tag("breakingCapacity")
                    tag_bc.append(str(item.chave.breakingCapacity))
                    tag_id.append(tag_bc)

                    tag_rs = self.cim_xml.new_tag("recloseSequences")
                    tag_rs.append(str(item.chave.recloseSequences))
                    tag_id.append(tag_rs)

                    tag_state = self.cim_xml.new_tag("state")
                    tag_state.append(str(item.chave.estado))
                    tag_id.append(tag_state)


                    # self.cim_xml.find(str(item.id)).append(self.cim_xml.new_tag("ratedCurrent"))


                    # self.cim_xml.find(str(item.id)).append(self.cim_xml.new_tag("inTransitTime"))

                    # self.cim_xml.find(str(item.id)).append(self.cim_xml.new_tag("breakingCapacity"))

                    # self.cim_xml.find(str(item.id)).append(self.cim_xml.new_tag("recloseSequences"))

                    # self.cim_xml.find(str(item.id)).append(self.cim_xml.new_tag("state"))

    def write_xml(self, path):
        '''
            Creates the XML file at the location
            indicated by the path argument
        '''
        f = open(path, 'w')
        f.write(self.cim_xml.prettify())
        f.close()
Example #15
def insertEarlyIn(soup: BeautifulSoup, tag: Tag):
    if soup.body is not None and soup.body.find() is not None:
        soup.body.find().insert_before(tag)
    elif soup.title is not None:
        soup.title.insert_after(tag)
    elif soup.find() is not None:
        soup.find().insert_after(tag)  # after first element
    else:
        soup.append(tag)
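A small sketch of the fallback order above, using made-up markup and a hypothetical banner tag:
from bs4 import BeautifulSoup

doc = BeautifulSoup('<html><head><title>t</title></head><body><p>first</p></body></html>', 'html.parser')
banner = doc.new_tag('div', id='banner')  # hypothetical element to insert
insertEarlyIn(doc, banner)
print(doc)  # the div is inserted just before the first element inside <body>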
Example #16
def add_tracker(email_html, contactid, messageid):
    tracker_url = str(os.getenv('TRACKER_URL'))
    tracker_url += "?contactid={}&messageid={}".format(contactid, messageid)
    soup = Soup(email_html, 'html.parser')
    div = soup.new_tag('div')
    img = soup.new_tag('img', attrs={'height': '0', 'width': '0', 'src': tracker_url})
    div.append(img)
    soup.append(div)
    return str(soup)
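A usage sketch, assuming TRACKER_URL is set in the environment; the value below is a placeholder:
import os
from bs4 import BeautifulSoup as Soup

os.environ['TRACKER_URL'] = 'https://tracker.example.com/pixel.gif'  # placeholder endpoint
print(add_tracker('<p>Hello!</p>', contactid=42, messageid=7))  # zero-size tracking <img> wrapped in a <div> and appended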
Example #17
 def to_xml(self):
     el = BeautifulSoup().new_tag('resource')
     el['class'] = 'package'
     el['provider'] = self.package_provider.get_key()
     for pkg in self.packages:
         pkg_el = BeautifulSoup().new_tag('package')
         pkg_el['name'] = pkg
         el.append(pkg_el)
     return el
Example #18
def generate_score(num_measures,
                   measure_length,
                   key_number,
                   rest_prob,
                   treble_tp_key_choices=('complex', ),
                   bass_tp_key_choices=('complex', ),
                   treble_cp_key_choices=('complex', ),
                   bass_cp_key_choices=('complex', )):
    # generates a score num_measures measures long
    # measure_length is the number of sixteenth notes in a measure
    soup = BeautifulSoup('', 'xml')
    score_partwise = soup.new_tag('score-partwise', version='3.1')
    work = soup.new_tag('work')
    work_title = soup.new_tag('work-title')
    alpha = list(
        'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ1234567890      ')
    text = list(np.random.choice(alpha, size=np.random.randint(8, 25)))
    text = ''.join(text)
    work_title.string = text
    score_partwise.append(work)
    work.append(work_title)
    part_list = soup.new_tag('part-list')
    score_part = soup.new_tag('score-part', id='P1')
    part_name = soup.new_tag('part-name')
    soup.append(score_partwise)
    score_partwise.append(part_list)
    part_list.append(score_part)
    score_part.append(part_name)
    part_name.append('Piano')
    part = soup.new_tag('part', id='P1')
    score_partwise.append(part)

    attributes = generate_attributes(measure_length, key_number)

    for i in range(num_measures):
        n = np.random.choice(len(treble_tp_key_choices))
        treble_tp_key = treble_tp_key_choices[n]
        n = np.random.choice(len(bass_tp_key_choices))
        bass_tp_key = bass_tp_key_choices[n]
        n = np.random.choice(len(treble_cp_key_choices))
        treble_cp_key = treble_cp_key_choices[n]
        n = np.random.choice(len(bass_cp_key_choices))
        bass_cp_key = bass_cp_key_choices[n]
        measure = generate_measure_for_score(measure_length, key_number,
                                             rest_prob, treble_tp_key,
                                             bass_tp_key, treble_cp_key,
                                             bass_cp_key, i + 1)
        if i == 0:
            measure.insert(0, attributes)
        part.append(measure)

    return soup


# with open('sample_score.musicxml', 'w+') as f:
#     f.write(str(generate_score(64, 16, 0, 0.2, treble_tp_key_choices=('quarters',))))
Example #19
    def create_raw_descs(link_inside):
        def _remove_all_attrs(text):  # removing tag attributes
            for tag in text.find_all(True):
                tag.attrs = {}
            return text

        # FORMING THE DESCRIPTIONS

        page = requests.get(link_inside)  # getting the object from url
        soup = BeautifulSoup(page.content,
                             'html.parser')  # loading it into the soup
        desc_divs = []  # a list for all descs' divs

        # the code below is a sample, do NOT paste it in your project as is
        """
        main_heading = soup.find("h1")
        main_heading.name = "div"  # change the name for uniformity
        if main_heading:
            desc_divs.append(main_heading)
        else:
            pass

        main_desc = soup.find("div", class_="product-main-text")  # main desc
        if main_desc:
            desc_divs.append(main_desc)
        else:
            pass

        features_table_desc = soup.find("div", class_="title")  # features (table heading)
        if features_table_desc:
            desc_divs.append(features_table_desc)
        else:
            pass

        features_table = soup.find("table", class_="table-striped")  # features table
        if features_table:
            desc_divs.append(features_table)
        else:
            pass

        catalog_detail_block = soup.find("div", class_="catalog_detail_info")  # text after features table
        if catalog_detail_block:
            desc_divs.append(catalog_detail_block)
        else:
            pass
        """

        soup.clear()  # clearing the old soup

        for desc_div in desc_divs:  # loading the new soup with objects from the list
            soup.append(desc_div)

        soup_without_attrs = _remove_all_attrs(
            soup)  # removing all unnecessary attrs

        return soup_without_attrs
Example #20
def create_musicxml(path, measure_length, key_number):
    """
    This function takes the path to an uploaded file, its measure_length, and its key number
    (usually info inputted by user) and passes the image through the first neural net to extract the measures,
    then passes each measure through the second neural net to convert it to xml.
    
    The handle_page function covers the first part and the run_model function covers the second.
    """
    handle_page(path, measure_length, key_number,
                os.path.join(MEDIA_ROOT, 'current_measures'))
    measures = []

    # initialize the xml output
    soup = BeautifulSoup(features='xml')
    score_partwise = soup.new_tag('score-partwise', version='3.1')
    part_list = soup.new_tag('part-list')
    score_part = soup.new_tag('score-part', id='P1')
    part_name = soup.new_tag('part-name')
    soup.append(score_partwise)
    score_partwise.append(part_list)
    part_list.append(score_part)
    score_part.append(part_name)
    part_name.append('Piano')
    part = soup.new_tag('part', id='P1')
    score_partwise.append(part)

    # loop through each extracted measure and convert it to xml
    # if the conversion fails, return a blank measure
    for i in range(
            len(os.listdir(os.path.join(MEDIA_ROOT, 'current_measures')))):
        print('handling measure ', i + 1)
        measure_soup = run_model(
            os.path.join(MEDIA_ROOT, 'current_measures', f'subimage{i}.png'),
            measure_length, key_number)
        if measure_soup:
            measure = measure_soup.find('measure')
            # only need the key and time sig info on the first measure
            if i != 0:
                attributes = measure.find('attributes')
                attributes.extract()
            measures.append(measure)
            print(f'measure {i+1} successful')
        else:
            blank_measure = get_blank_measure(measure_length)
            measures.append(blank_measure)
            print('error in measure ', i + 1)
    for measure in measures:
        part.append(measure)

    # pick a random filename for the output
    filename = np.random.choice(list('abcdefghijklmnopqrstuvwxyz0123456789'),
                                size=16)
    filename = ''.join(filename)
    with open(os.path.join(MEDIA_ROOT, f'{filename}.musicxml'), 'w+') as f:
        f.write(str(soup))
    return filename
Example #21
 def get_content_html(self, images_url):
     """Returns the html content of the tab, inside a <div> with an id attibute.
     """
     # Create wrapper
     soup = BeautifulSoup()
     div_tag = soup.new_tag('div')
     div_tag['id'] = self.html_id
     div_tag.append(BeautifulSoup(self.html_content.get_html(images_url)))
     soup.append(div_tag)
     return str(soup)
Example #22
def create_body(flair_output):
    soup = BeautifulSoup()
    soup.append(soup.new_tag('text'))
    soup.find('text').append(soup.new_tag('body'))
    soup.body.append(soup.new_tag('div'))
    for paragraph in flair_output:
        paragraph_tag = soup.new_tag('p')
        markup = create_markup_with_entities(paragraph, paragraph_tag, soup)
        soup.div.append(paragraph_tag)
    return soup
Example #23
def tag(tagname, attrs=None, text=None, dtrs=None):
    """Return a soup Tag element."""
    attrs = {} if attrs is None else attrs
    dtrs = [] if dtrs is None else dtrs
    newtag = BeautifulSoup('', features='lxml').new_tag(tagname, attrs=attrs)
    if text is not None:
        newtag.append(text)
    for dtr in dtrs:
        newtag.append(dtr)
    return newtag
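The helper above composes nicely; a minimal sketch with arbitrary tag names and attributes:
items = [tag('li', text=str(n)) for n in range(3)]
listing = tag('ul', attrs={'class': 'numbers'}, dtrs=items)
print(listing)  # e.g. <ul class="numbers"><li>0</li><li>1</li><li>2</li></ul>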
Example #24
def add_css_in_page(url, css):
    print('dentro')
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    outros_css = '''<link rel="stylesheet" href="pd-material-viewer.3b3f13a5.css">
<link rel="stylesheet" href="pd-material-viewer~pd-profile~pd-search~pd-subject.f4b332e0.css">'''
    link_ref = BeautifulSoup('<link rel="stylesheet" href="' + css + '">',
                             'html.parser')
    soup.append(link_ref)
    return soup
Example #25
def main():
    print("")
    print("######################################")
    print("#                                    #")
    print("# FedEx CBA 2015 Parser Version 1.00 #")
    print("# Updated 27 June 2020               #")
    print("#                                    #")
    print("######################################")

    print("")

    DEFAULT_INPUT_FILENAME = "fdx_2015_working.html"
    DEFAULT_OUTPUT_FILENAME = "fdx_2015_parsed.html"

    input_filename = input("Enter input filename: ")

    if input_filename == "":
        input_filename = DEFAULT_INPUT_FILENAME

    soup = ""


    try:
        with open(input_filename, "r") as file:
            soup = BeautifulSoup(file, features="html.parser")

    except FileNotFoundError:
        print("")
        print("File not found. Exiting...")
        print("")
        exit()

    nodes = soup.findAll(["p","h1","h2","h3","h4","h5"])
    find_paragraph_id_and_set_node_id(nodes)
    newest_soup = BeautifulSoup()
    for node in nodes: 
        nodeId = ""
        if node.has_attr('id'):
            nodeId = node['id']
        
        node.attrs = {}

        if nodeId != "": 
            node['id'] = nodeId
        newest_soup.append(node)
        


    output_filename = input("Enter output filename: ")

    if output_filename == "":
        output_filename = DEFAULT_OUTPUT_FILENAME

    with open(output_filename, "w") as file:
        file.write(str(newest_soup))
Example #26
def split_infobox_value(tag: BeautifulSoup) -> List[str]:
    if tag is None: return []

    groups = []
    curr = []
    seps = [',', '、', '\n', ';', ';']
    single_seps = ['及', '等', '和', '或']

    counter = PunctuationCounter()
    for content in tag.contents:
        if isinstance(content, NavigableString):
            if content.strip() in single_seps:
                if len(curr) > 0:
                    groups.append(curr)
                    curr = []
            else:

                start_id = 0
                for id, char in enumerate(content):
                    if char in seps and counter.splittable():
                        if id - start_id > 0:
                            curr.append(NavigableString(content[start_id:id]))
                        start_id = id + 1
                        if len(curr) > 0:
                            groups.append(curr)
                            curr = []
                    counter.count(char)

                if counter.splittable() and start_id < len(content):
                    end_id = len(content)
                    if content.endswith('等等'):
                        end_id -= 2
                    elif content.endswith('等'):
                        end_id -= 1

                    if start_id < end_id:
                        curr.append(NavigableString(content[start_id:end_id]))

        elif isinstance(content, Tag):
            if content.name == 'br' and len(curr) > 0:
                groups.append(curr)
                curr = []
            else:
                curr.append(content)

    if len(curr) > 0:
        groups.append(curr)

    tags = []
    for g in groups:
        tag = Tag(name='div', parser='html.parser')
        for sub in g:
            tag.append(sub)
        tags.append(format_str(tag))
    return tags
Example #27
def path(path_html):
    if path_html == "habr":
        habr_path = "https://habr.com/"
    else:
        habr_path = "https://habr.com/"+path_html
    

    if request.headers['Accept'].split(",")[0] == "text/html":

        print("+++++HABR_PATH+++++: ", habr_path)

        page = requests.get(habr_path)
        print("!====!Get Habr_Page!====!")

        page = BeautifulSoup(page.text.encode('utf-8'), "html.parser")
        body_without_scripts = page.body
        scripts = []
        for x in body_without_scripts.find_all("script"):
            scripts.append(x.extract())

        # find all 6-letter words in the text of body_without_scripts
        new_rows = [
            re.sub("[^\w]", "", word) \
            for word in re.findall(r"\s\b\w{6}\b", 
            body_without_scripts.get_text())
            ]
        myList = sorted(set(new_rows))
        print("!====!Write Word_List!====!")
        with open("templates/my_list_word.txt","w") as file:
            for word in myList:
                file.write(word + " ")  

        # replace the matched words in body_without_scripts
        for word_in_list in myList:
            # print(word_in_list)
            body_without_scripts = re.sub(
                r"\b" + word_in_list + r"\b", 
                " " + word_in_list + "™", 
                str(body_without_scripts)
                )
        

        print("!====!Replace Word_in_Body!====!")

        my_html = BeautifulSoup(body_without_scripts, 'html.parser')
        # append the scripts back into the modified body tag
        for script in scripts:
            my_html.append(script)
        print("!====!Append Script!====!")

        my_html = replace_habr_href_in_body(my_html)
        body_page = str(page.body)
        page_html = str(page).replace(body_page, str(my_html))
        print("!====!Page_Html_is_Done!====!")
        return page_html
Example #28
def format_recipe(old_soup):
    new_soup = BeautifulSoup('<html><head></head><body></body></html>')

    thumb = old_soup.find(id='cphMiddle_cphMain_imgRecipeThumb')
    if thumb:
        hdr = new_soup.new_tag('img')

        m = re.search('recipes/(.+\.jpg)', thumb['src'])

        hdr['src'] = './img/{}'.format(m.group(1))
        new_soup.body.append(hdr)

    source = old_soup.find(id='cphMiddle_cphMain_hlSource')
    title = old_soup.find(id='cphMiddle_cphMain_lblTitle').get_text().strip()
    hdr = new_soup.new_tag('title')
    hdr.append(title)
    new_soup.head.append(hdr)

    hdr = new_soup.new_tag('h1')
    hdr.append(title)
    new_soup.body.append(hdr)
    if source:
        new_soup.body.append(source)
    hdr = new_soup.new_tag('h3')
    hdr.append('Ingredients')
    new_soup.body.append(hdr)

    item = old_soup.find('ul', {'class':'inggroups'})
    if item:
        new_soup.body.append(item)
    else:
        new_soup.body.append('No ingredients listed')

    hdr = new_soup.new_tag('h3')
    hdr.append('Instructions')
    new_soup.body.append(hdr)

    item = old_soup.find('ol', {'class':'dirgroupitems'})
    if item:
        new_soup.body.append(item)
    else:
        new_soup.body.append('No instructions listed')

    hdr = new_soup.new_tag('h3')
    hdr.append('Notes')
    new_soup.body.append(hdr)

    notes = old_soup.find(id="cphMiddle_cphMain_lblNotes")
    if notes:
        hdr = new_soup.new_tag('pre')
        hdr.append(notes.get_text())
        new_soup.append(hdr)

    return new_soup
Example #29
def get_xml(base_xxx, db_package):
    #psycopg2.extensions.register_type(psycopg2.extensions.UNICODE)
    #psycopg2.extensions.register_type(psycopg2.extensions.UNICODEARRAY)
    initiate_threaded_connection_pool(db_package)
    with getconnection() as conn:
        cursor = conn.cursor()
        cursor.execute("SELECT id, nom FROM optin_list WHERE abreviation = %s",
                       (str(base_xxx), ))
        records = cursor.fetchone()
        if records:
            optin_id = records[0]
            nom = records[1]
        else:
            optin_id = '0'
            nom = ""
        cursor.execute(
            "SELECT xml FROM criteo_xml WHERE optin_id = %s AND usage = %s",
            (str(optin_id), 'header'))
        records = cursor.fetchone()[0]
        if records:
            header = records
        else:
            header = ""
        cursor.execute(
            "SELECT xml FROM criteo_xml WHERE optin_id = %s AND usage = %s",
            (str(optin_id), 'footer'))
        records = cursor.fetchone()[0]
        if records:
            footer = records
        else:
            footer = ""
    conn_pool.closeall()
    post_dict = {}
    post_dict['id'] = '1'
    post_dict['nom'] = nom
    post_dict['header'] = header
    post_dict['footer'] = footer
    xml_doc = BeautifulSoup(features='xml')
    xml_doc.append(xml_doc.new_tag("bases"))
    xml_doc.bases.append(xml_doc.new_tag("base"))
    cpt_content = 0
    for key, value in post_dict.iteritems():
        xml_doc.bases.base.append(xml_doc.new_tag(str(key)))
        xml_container = xml_doc.bases.base.contents[cpt_content]
        if key == 'footer':
            xml_formatted_value = "<![CDATA[" + value + "]]>"
        else:
            xml_formatted_value = value
        xml_container.append(xml_doc.new_string(xml_formatted_value))
        cpt_content += 1
    xml_feed = xml_doc.prettify()
    xml_feed = xml_feed.replace("&lt;", "<").replace(
        "&gt;", ">")  #.replace("&lt;p&gt;", "").replace("&lt;/p&gt;", "")
    return xml_feed
Example #30
 def get_button_html(self):
     """Return the tab button, an <a> inside an <li>.
     """
     soup = BeautifulSoup()
     li_tag = soup.new_tag('li')
     a_tag = soup.new_tag('a')
     a_tag['href'] = '#' + self.html_id
     a_tag.string = self.name
     soup.append(li_tag)
     li_tag.append(a_tag)
     return str(soup)
Example #31
        def _img(html):
            soup = BeautifulSoup(html, 'html.parser').find_all('img', src=True)

            if soup == []:
                soup.append('none')
                return soup
            else:
                img = []
                for key in soup:
                    key = key.get('src')
                    img.append(unquote(key))
                return img
Example #32
class ERD:
    def __init__(self, xml=None):
        self._config = Config()
        if xml is not None:
            self.soup = BeautifulSoup(xml, 'xml')
        else:
            self.soup = BeautifulSoup(features='xml')
            self.soup.append(
                self.soup.new_tag(
                    'erModel', **self._config.XML['ErRootAttributes']
                )
            )
        XMLObject.soup = self.soup

        self.entities: Dict[int, Entity] = {}
        self.relations: List[Relation] = []

        self._parse_xml()

    def add_entity(self, entity: Entity):
        assert entity._id not in self.entities
        self.entities[entity._id] = entity

    def add_relation(self, relation: Relation):
        self.relations.append(relation)

    def _parse_xml(self):
        for tag in self.soup.find_all('entity'):
            entity = Entity.from_xml(tag)
            self.add_entity(entity)

        self.relations = [
            Relation.from_xml(tag) for tag in self.soup.find_all('relation')
        ]

    def iter_relations(self, filter_):
        for relation in self.relations:
            if filter_(relation):
                yield relation
            elif len(relation) == 2 and filter_(relation.invert()):
                yield relation.invert()

    def to_xml(self):
        soup = BeautifulSoup(features='xml')
        soup.append(
            soup.new_tag('erModel', **self._config.XML['ErRootAttributes'])
        )
        root = soup.find('erModel')
        XMLObject.soup = self.soup
        for entity in self.entities.values():
            root.append(entity.to_xml())
        for relation in self.relations:
            root.append(relation.to_xml())
        return soup
Example #33
def handle_page_table(filepath):

    soup = BeautifulSoup('<html></html>', 'lxml')
    all_pc_children = get_all_pc_children(filepath)
    table_cells = get_table_cells(all_pc_children)
    for cells in table_cells:
        cells_dict = create_cells_dict(cells)
        table_coordinate = create_table_coordinate(cells_dict)
        html_table = create_html_table(cells_dict, table_coordinate)
        soup.append(html_table)
    return soup
Example #34
def convertToOSM(lst):
    ret = """<?xml version='1.0' encoding='UTF-8'?>
<osm version='0.6' upload='false' generator='punktyadresowe_import.php'>
"""
    ret = BeautifulSoup("", "xml")
    osm = ret.new_tag('osm', version='0.6', upload='false', generator='punktyadresowe_import.py')
    ret.append(osm)

    for (node_id, val) in enumerate(lst):
        osm.append(val.asOsmSoup(-1 * (node_id + 1)))

    return ret.prettify()
Example #35
def getHs(soupH, h, hText):
    Text = BeautifulSoup("", "lxml")
    if (h in ValidH):
        allH = soupH.find_all(h)
        for H in allH:
            if hText in H:
                #print hText+" found"
                nextSib = H.find_next(True)
                while nextSib is not None and h not in nextSib.name:
                    Text.append(nextSib)
                    nextSib = nextSib.nextSibling
    return Text
Example #36
 def to_xml(self):
     soup = BeautifulSoup(features='xml')
     soup.append(
         soup.new_tag('erModel', **self._config.XML['ErRootAttributes'])
     )
     root = soup.find('erModel')
     XMLObject.soup = self.soup
     for entity in self.entities.values():
         root.append(entity.to_xml())
     for relation in self.relations:
         root.append(relation.to_xml())
     return soup
Example #37
    def to_xml(self):
        el = BeautifulSoup().new_tag('resource_set', )
        el['name'] = self.name
        el['rollaback'] = self.rollback_mode
        el['executed'] = self._executed

        if self._items:
            for item in self._items:
                el.append(item.to_xml())
                el.append('\n')

        return el
Example #38
    def initialize(self):
        # load the file
        with open(self.html, "r") as f:
            soup = BeautifulSoup(f.read(), 'html.parser')

        # set up html file
        soup.append(dom2soup(html(head(), body())))
        soup.head.append(dom2soup(link(rel='stylesheet', href="https://fonts.googleapis.com/css?family=Open+Sans")))
        soup.head.append(dom2soup(link(rel='stylesheet', href=self.css)))
        soup.body.append(dom2soup(table()))

        return soup
Example #39
        def _script(html):
            soup = BeautifulSoup(html, 'html.parser').find_all('script',
                                                               src=True)

            if soup == []:
                soup.append('none')
                return soup
            else:
                script = []
                for link in soup:
                    link = link.get('src')
                    script.append(unquote(link))
                return script
Example #40
    def write_sorting(sorting: Union[MultiSortingExtractor, SortingExtractor], save_path: PathType):
        save_path = Path(save_path)
        if save_path.suffix == '':
            sorting_name = save_path.name
        else:
            sorting_name = save_path.stem
        xml_name = sorting_name
        save_xml_filepath = save_path / (str(xml_name) + '.xml')

        assert not save_path.is_file(), "'save_path' should be a folder"
        if not save_path.is_dir():
            os.makedirs(save_path)

        if save_xml_filepath.is_file():
            raise FileExistsError(f'{save_xml_filepath} already exists!')

        soup = BeautifulSoup("", 'xml')

        new_tag = soup.new_tag('samplingrate')
        new_tag.string = str(sorting.get_sampling_frequency())
        soup.append(new_tag)

        # write parameters file
        with open(save_xml_filepath, "w") as f:
            f.write(str(soup))

        if isinstance(sorting, MultiSortingExtractor):
            counter = 1
            for sort in sorting.sortings:
                # Create and save .res.%i and .clu.%i files from the current sorting object
                save_res = save_path / f'{sorting_name}.res.{counter}'
                save_clu = save_path / f'{sorting_name}.clu.{counter}'
                counter += 1

                res, clu = _extract_res_clu_arrays(sort)

                np.savetxt(save_res, res, fmt='%i')
                np.savetxt(save_clu, clu, fmt='%i')

        elif isinstance(sorting, SortingExtractor):
            # assert units have group property
            assert 'group' in sorting.get_shared_unit_property_names()
            sortings, groups = get_sub_extractors_by_property(sorting, 'group', return_property_list=True)

            for (sort, group) in zip(sortings, groups):
                # Create and save .res.%i and .clu.%i files from the current sorting object
                save_res = save_path / f'{sorting_name}.res.{group}'
                save_clu = save_path / f'{sorting_name}.clu.{group}'

                res, clu = _extract_res_clu_arrays(sort)

                np.savetxt(save_res, res, fmt='%i')
                np.savetxt(save_clu, clu, fmt='%i')
Example #41
def block():
    with open('templates/map1.html') as inf:
        txt = inf.read()
        soup = BeautifulSoup(txt, "html.parser")
    block_cont = '{% block content %}'
    end_cont = '{% endblock %}'

    soup.append(block_cont)
    soup.append(end_cont)
    # write the result back to the html file
    with open('templates/map1.html', "w") as outf:
        outf.write(str(soup))
Example #42
def header_content_extraction(soup, headers_list):
    for x in soup.find_all():
        if len(x.text) == 0:
            x.extract()


    section_dict = {}
    section_dict_bullets = {}
    section_dict_bold = {}
    for header in range(len(headers_list)):
        header_tag = None
        header_tag = soup.find(headers_list[header])
        if header_tag is None:
            break
        header_tag_list = []
        header_tag_list = header_tag.parent.findChildren(headers_list[header])
        if len(header_tag_list) == 0:
            break
        for component_tag in header_tag_list:
            header_tag_siblings = component_tag.nextSiblingGenerator()
            header_tag_sibling_list = []
            header_tag_sibling_tag_list = []
            within_para_bold_tag_list = []
            for header_tag_sibling in header_tag_siblings:
                if header_tag_sibling.name in (headers_list[:(header + 1)]):
                    if header_tag_sibling_list:
                        section_dict[component_tag.get_text() + '[Full Contents]'] = ' '.join(header_tag_sibling_list)
                    if within_para_bold_tag_list:
                        section_dict_bold[component_tag.get_text() + '[Bold Text]'] = ' '.join(within_para_bold_tag_list)
                    new_tag = BS('').new_tag('kghtmlextractiontag')
                    for bullet_tag in header_tag_sibling_tag_list:
                        new_tag.append(bullet_tag)
                    bundled_bullet_tag_list = []
                    bundled_bullet_tag_list = new_tag.find_all('p', class_ = 'list_Paragraph')
                    bundled_bullet_text_list = []
                    within_para_bold_tag_list = []
                    try:
                        bundled_bullet_text_list = [j.get_text() for j in bundled_bullet_tag_list]
                    except AttributeError:
                        pass
                    if bundled_bullet_text_list:
                        section_dict_bullets[component_tag.get_text() + '[Bullets Only]'] = ' '.join(bundled_bullet_text_list)
                    del new_tag
                    break
                try:
                    header_tag_sibling_tag_list.append(header_tag_sibling)
                    header_tag_sibling_list.append(header_tag_sibling.get_text())
                    within_para_bold_tag_list += [bold_tag.get_text() for bold_tag in header_tag_sibling.find_all('b')]
                except AttributeError:
                    pass
    
    full_content_dict = {**section_dict, **section_dict_bullets, **section_dict_bold}
    return full_content_dict
Example #43
        def _link(html):
            soup = BeautifulSoup(html, 'html.parser').find_all('link',
                                                               href=True)

            if soup == []:
                soup.append('none')
                return soup
            else:
                link = []
                for key in soup:
                    key = key.get('href')
                    link.append(unquote(key))
                return link
Example #44
    def postprocess(self, text):
        # Hack for unescape special chars
        for key, value in markdown2.g_escape_table.iteritems():
            text = text.replace(value, key)

        urls = set(URL_RE.findall(text))

        # Treat images as gallery
        post = BeautifulSoup(text, 'html.parser')
        imgs = post.find_all('img')

        gallery = Tag(name='div',
                      attrs={
                          'class': "gallery",
                          'style': 'display: none;',
                          'id': hashlib.md5(text.encode('utf-8')).hexdigest(),
                      })
        img_urls = [img['src'] for img in imgs]
        for img in imgs:
            img.extract()
            img.attrs.update({
                'data-image': img['src'],
                'data-description': img['alt'],
            })
            gallery.append(img)

        # Add url as web rich object
        wros = ''
        for url in urls:
            if HTMLParser().unescape(url) in img_urls:
                continue
            try:
                wro = WebRichObject.objects.create_or_update_from_url(url)
                if wro.type != 'image':
                    wros += wro.get_widget(video_width="100%", video_height='320px')\
                        .decode('utf8')
                else:
                    img = Tag(name='img',
                              attrs={
                                  'alt': wro.description or '',
                                  'src': wro.url,
                                  'data-image': wro.url,
                                  'data-description': wro.description or ''
                              })
                    gallery.append(img)
            except IOError as err:
                print err

        post.append(gallery)
        text = urlize_html(unicode(post))
        text += wros
        return text
Example #45
class WindowsEventData(object):
    def __init__(self):
        self.data_list = []
        self.soup = BeautifulSoup(features='lxml')

    @property
    def get(self):
        return self.soup

    def add(self, name, text):
        new_tag = self.soup.new_tag(name)
        self.soup.append(new_tag)
        new_tag.string = text
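A short usage sketch; the field names are illustrative rather than a fixed schema:
event_data = WindowsEventData()
event_data.add('SubjectUserName', 'alice')
event_data.add('LogonType', '2')
print(event_data.get)  # each add() appends a new tag with its text to the soup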
Example #46
def search(query):
    query = query.replace(" ","+")
    url = "https://www.youtube.com/results?search_query="+query
    h3 = BeautifulSoup(requests.get(url).text, "html.parser").find('h3',{'class' : 'yt-lockup-title'})
    x = BeautifulSoup('',"html.parser")
    child = h3.findChildren()
    for c in child:
        x.append(c)
    watch_url = ''
    for link in x.findAll('a',{'rel':'spf-prefetch'}):
        watch_url = link.get('href')
    yt_url = "https://www.youtube.com"+watch_url
    return yt_url
Example #47
def getTextAndImg(page):
    wxsoup = BeautifulSoup(page,'html.parser')
    bodyElem = wxsoup.body
    #print bodyElem.prettify()
    #print bodyElem.contents[0]
    #loop through the children
    newsoup = BeautifulSoup('')
    btagnew = newsoup.new_tag('div')
    newsoup.append(btagnew) 
    #processChildren2(bodyElem,btagnew,newsoup)
    processChildren(bodyElem,btagnew,newsoup)

    return newsoup.prettify()
Example #48
def start(xmlfilename, prsfilename):
    data = BeautifulSoup("", "lxml")
    data.append(data.new_tag("body"))

    prslines = []

    with open(prsfilename, "r", encoding='utf-8') as file:
        prslines = file.read().split('\n')

    res = sentstoxml(readtable(prslines), data.body)

    with open(xmlfilename, "wb") as file:
        file.write(res.prettify("utf-8"))
Example #49
def export_to_xml(roots, version):
    """
    Converts the intermediate structure to a soup and saves the xml
    """
    for root in roots:
        with open(f'{US_XML_PATH}/{root["itempath"][1:]}_{version}.xml',
                  "wb") as f:
            soup = BeautifulSoup("", "lxml")
            soup.append(doc_to_soup(root, soup, 0, version, root=True))
            remove_unnecessary_subseqitems(soup)
            add_keys_to_items(soup,
                              f'{root["itempathcomponents"][0]}_{version}')
            f.write(soup.encode("utf-8"))
Example #50
def wrap_ul(tags):
    all_li = tags.find_all("li")
    if len(all_li) > 0 and all_li[0].parent.name != "ul":
        for item in tags:
            if item.find("li") and item.name == "p":
                item.attrs = {"class": "p_ul"}
            else:
                break
        ul = BeautifulSoup(features="html.parser").new_tag('ul')
        all_li[0].insert_before(ul)
        for li in all_li:
            if li.parent.name != "ul":
                ul.append(li)
Example #51
def generate_xml(data):
    soup = BeautifulSoup(features='xml')
    soup.append(soup.new_tag('condition'))
    for i in data.keys():
        item = soup.new_tag('item')
        name = soup.new_tag('name')
        name.string = i
        item.append(name)
        value = soup.new_tag('value')
        value.string = data[i]
        item.append(value)
        soup.condition.append(item)
    return soup.decode(eventual_encoding='GBK')
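A usage sketch with made-up keys and values:
xml_text = generate_xml({'orderId': '12345', 'status': 'shipped'})
print(xml_text)  # a <condition> root holding one <item> (name/value pair) per dict entry, declared as GBK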
Example #52
def getLinks(links, useTitleAsKey=False):
    linksDict = {}
    soup = BeautifulSoup('', features='html.parser')
    for link in links:
        soup.append(link)
    for a in soup.find_all('a'):
        if useTitleAsKey and a.has_attr('title') and a.has_attr('href'):
            linksDict[a['title']] = a['href']
        else:
            if a.string != None and a.has_attr(
                    'href') and a.string in string.ascii_letters:
                linksDict[a.string] = 'https:' + a['href']
    return linksDict
Example #53
def display_people(graph, iri):
    types_of = list(set([o for o in graph.objects(subject=iri,
                                                  predicate=rdflib.RDF.type)]))
    if BF.Person in types_of:
        return ''
    output = BeautifulSoup()
    for year_iri in KNOWLEDGE_GRAPH.subjects(predicate=SCHEMA.organizer,
        object=iri):
        year_label = KNOWLEDGE_GRAPH.value(subject=year_iri,
            predicate=rdflib.RDFS.label)
        div = output.new_tag("div")
        h2 = output.new_tag("h2")
        h2.string = year_label
        div.append(h2)
        people = dict()
        h3 = output.new_tag("h3")
        h3.string = "People"
        div.append(h3)
        ul = output.new_tag("ul")
        for pred, obj in KNOWLEDGE_GRAPH.predicate_objects(
            subject=year_iri):
            if isinstance(obj, rdflib.URIRef):
                pred_label = get_label(KNOWLEDGE_GRAPH, pred)
                obj_label = get_label(KNOWLEDGE_GRAPH, obj)
                for type_ in KNOWLEDGE_GRAPH.objects(subject=obj,
                    predicate=rdflib.RDF.type):
                    if type_ == BF.Person:
                        li = output.new_tag("li")
                        person_a = output.new_tag("a", href=str(obj))
                        if obj_label is None:
                            person_a.string = str(obj)
                        else:
                            person_a.string = obj_label
                        li.append(person_a)
                        if pred_label is not None:
                            title = output.new_tag('span')
                            title.string = ", {}".format(pred_label)
                            li.append(title)
                        ul.append(li)
                        if pred in people:
                            people[pred]["persons"].append({"iri": obj,
                                                            "name": obj_label})
                        else:
                            people[pred] = {"persons": [{"iri": obj,
                                                         "name": obj_label}],
                                            "label": pred_label}
        div.append(ul)
        output.append(div)
    return output.decode(pretty_print=True)
Example #54
	def get_span(self, start, end):
		"""Given indices (start, end) in the pure-text version of the
		htmlString this object is initialized with, returns the html string
		that corresponds to the specified text string
		"""

		# we need to copy so that we don't destroy self._top_level_ranges
		# converting to a string and reparsing is much faster than doing a deepcopy
		top_level_ranges = [{'el': BeautifulSoup(unicode(r['el'])), 'range': r['range']}
			for r in self._get_applicable_ranges(self._top_level_ranges, start, end)]

		# create a new top-level soup so that we can modify elements in place
		result = BeautifulSoup()
		for r in top_level_ranges:
			result.append(r['el'])

		if len(top_level_ranges) == 0:
			return u''
		elif len(top_level_ranges) == 1:
			range_offset = top_level_ranges[0]['range'][0]
			inner_start = start - range_offset
			inner_end = end - range_offset

			self._modify_ranges(self._get_applicable_ranges(
					self._get_text_indices(top_level_ranges[0]['el']),
					inner_start, inner_end),
				inner_start, inner_end)
		else:
			range_offset = top_level_ranges[0]['range'][0]
			inner_start = start - range_offset
			inner_end = end - range_offset

			self._modify_ranges(
				self._get_applicable_ranges(
					self._get_text_indices(top_level_ranges[0]['el']),
					inner_start, inner_end),
				start = inner_start)

			range_offset = top_level_ranges[-1]['range'][0]
			inner_start = start - range_offset
			inner_end = end - range_offset

			self._modify_ranges(
				self._get_applicable_ranges(
					self._get_text_indices(top_level_ranges[-1]['el']),
					inner_start, inner_end),
				end = inner_end)

		return unicode(result)
Example #55
 def merget_element(self, soup):
     _s = BeautifulSoup("")
     l = ['span']
     # nested <span> tags
     for e in soup.find_all(l):
         es = e.find_all(True, recursive=False)
         if es :
             continue
         _ = to_unicode(' '.join(e.get_text().strip().split())).strip()
         if _:
             e.string = _
     for e in soup.find_all(l):
         e.unwrap()
     for e in soup.find_all('p'):
         _s.append(e)
     return _s
Example #56
def create_note(note_data, soup):
    """Create an ENEX note element"""

    note = soup.new_tag('note')

    title = soup.new_tag('title')
    title.string = note_data.title
    note.append(title)

    content_inside = BeautifulSoup(features="xml")
    content_inside.append(Doctype('en-export SYSTEM "http://xml.evernote.com/pub/evernote-export3.dtd"'))

    content_inside_note = soup.new_tag('en-note')
    content_inside_note.string = note_data.content
    content_inside.append(content_inside_note)

    # Holy crap this is super hacky and horrible but I don't want to fight with
    # BeautifulSoup to make it not convert all the text to HTML entities, so
    # manually convert everything to < and >
    content_inside_str = str(content_inside).replace('&lt;', '<').replace('&gt;', '>')

    content = soup.new_tag('content')
    content.string = CData(content_inside_str)
    note.append(content)

    created = soup.new_tag('created')
    created.string = str(note_data.created)
    note.append(created)

    updated = soup.new_tag('updated')
    updated.string = str(note_data.updated)
    note.append(updated)

    for single_tag in note_data.tags:
        if single_tag is not None:
            tag = soup.new_tag('tag')
            tag.string = single_tag
            note.append(tag)

    attributes = soup.new_tag('note-attributes')
    author = soup.new_tag('author')
    author.string = "Andrew Heiss"

    attributes.append(author)
    note.append(attributes)

    return note
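A sketch of driving the function above; note_data is a stand-in object carrying the attributes the function reads (title, content, created, updated, tags), with invented values:
from types import SimpleNamespace
from bs4 import BeautifulSoup

enex = BeautifulSoup(features='xml')
note_data = SimpleNamespace(title='Test note',
                            content='<div>Hello</div>',
                            created='20200101T000000Z',
                            updated='20200102T000000Z',
                            tags=['demo'])
enex.append(create_note(note_data, enex))
print(enex)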
Example #57
    def convert_to_dynetml(self, is_entire_file=False):
        """
        Converts the graph to dynetml and returns a BeautifulSoup tag
        :param is_entire_file: if True, wraps value as a soup. If False, returns the top tag
        :type is_entire_file: bool
        :return: bs4.element.Tag
        :raise TypeError: if is_entire_file isn't a bool
        """
        dmlpu.check_type(is_entire_file, 'is_entire_file', bool)

        bs = BeautifulSoup(features='xml')
        bs.append(bs.new_tag('MetaNetwork'))

        for attr in self.attributes:
            bs.MetaNetwork[attr] = dmlpu.unformat_prop(self.attributes[attr])

        bs.MetaNetwork.append(dmlpu.get_property_identities_tag(self.propertyIdentities))

        bs.MetaNetwork.append(bs.new_tag('properties'))
        for key in self.properties:
            prop_tag = bs.new_tag('property')
            prop_tag['id'] = key
            prop_tag['value'] = dmlpu.unformat_prop(self.properties[key])
            bs.MetaNetwork.properties.append(prop_tag)

        bs.MetaNetwork.append(bs.new_tag('nodes'))
        for class_type in self.__node_tree:
            for class_id in self.__node_tree[class_type]:
                nodeclass_tag = bs.new_tag('nodeclass', type=class_type, id=class_id)
                nodeclass_tag.append(dmlpu.get_property_identities_tag(self.__node_tree[class_type][class_id][0]))

                for key in self.__node_tree[class_type][class_id][1]:
                    node_tag = bs.new_tag('node', id=key)
                    for attr in self.__node_tree[class_type][class_id][1][key][0]:
                        node_tag[attr] = dmlpu.unformat_prop(self.__node_tree[class_type][class_id][1][key][0][attr])
                    node_tag.append(dmlpu.get_properties_tag(self.__node_tree[class_type][class_id][1][key][1]))
                    nodeclass_tag.append(node_tag)

                bs.MetaNetwork.nodes.append(nodeclass_tag)

        networks_tag = self._get_networks_tag()
        bs.MetaNetwork.networks.append(networks_tag)

        if not is_entire_file:
            bs = bs.MetaNetwork

        return bs
Example #58
def extract_toc(content):
    if isinstance(content, contents.Static):
        return

    soup = BeautifulSoup(content._content,'html.parser')
    filename = content.source_path
    extension = path.splitext(filename)[1][1:]
    toc = None

    # if it is a Markdown file
    if extension in readers.MarkdownReader.file_extensions:
        toc = soup.find('div', class_='toc')
        if toc: toc.extract()
    # else if it is a reST file
    elif extension in readers.RstReader.file_extensions:
        toc = soup.find('div', class_='contents topic')
        if toc: toc.extract()
        if toc:
            tag=BeautifulSoup(str(toc))
            tag.div['class']='toc'
            tag.div['id']=''
            p=tag.find('p', class_='topic-title first')
            if p:p.extract()
            toc=tag
    elif extension in ['org']:
        toc = soup.find('div', id="table-of-contents")
        if toc:
            toc.extract()
            tag=BeautifulSoup(str(toc))
            tag.div['class']='toc'
            tag.div['id']=''
            p=tag.find('p', class_='topic-title first')
            if p:p.extract()
            h2=tag.find('h2')   # 'Table of Contents'
            if h2: h2.extract()
            orgfile = path.basename(content.source_path)
            tag.append(BeautifulSoup('<a href="%s">Org source</a>'%orgfile))
            toc=tag

    elif not toc:  # Pandoc reader
        toc = soup.find('nav', id='TOC')
    if toc:
        toc.extract()
        content._content = soup.decode()
        content.toc = toc.decode()
        if content.toc.startswith('<html>'):
            content.toc = content.toc[12:-14]
Example #59
    def description_short(self):
        # keep only the first 50 words
        soup = BeautifulSoup(self.description)
        truncated = self.keep_first_nwords(soup, 50)

        if truncated:
            more_info_link = soup.new_tag('a', href=urlresolvers.reverse('event', kwargs = {'pk' : str(self.id)}))
            more_info_link['class'] = 'more_info_link'
            more_info_link.append('[...]')
            soup.append(more_info_link)

        # get rid of paragraphs
        for p_tag in soup.findAll('p'):
            p_tag.append(' ')  # ensure paragraph ends with a space before we flatten it
            p_tag.unwrap()

        return safestring.mark_safe(soup.decode(formatter='html'))
Example #60
 def get_html(self, images_url):
     """Get the html for this image tag. When you click on the image, it links to a big version
     of the image.
     """
     img_original_url = images_url + self.original_name
     img_resized_url = images_url + self.new_name 
     # Create the new image tag
     a_img_soup = BeautifulSoup()
     a_tag = a_img_soup.new_tag('a')
     a_tag['href'] = img_original_url
     img_tag = a_img_soup.new_tag('img')
     img_tag['src'] = img_resized_url
     a_img_soup.append(a_tag)
     a_tag.append(img_tag)
     new_tag = str(a_img_soup)
     # Return
     return new_tag