def process_file(all_contents):
    print("roadrunner, num of contents", len(all_contents))

    content1 = all_contents[0]
    content2 = all_contents[1]

    # create trees from contents
    parsed1 = BeautifulSoup(content1, 'html.parser')
    parsed2 = BeautifulSoup(content2, 'html.parser')

    print("prettify")
    print(parsed1.body.prettify())
    print("------------")

    for tag in parsed1.recursiveChildGenerator():
        if hasattr(tag, 'attrs'):
            tag.attrs = {}

    for tag in parsed2.recursiveChildGenerator():
        if hasattr(tag, 'attrs'):
            tag.attrs = {}

    prepared1 = parsed1.body.contents
    prepared2 = parsed2.body.contents

    # print(parsed1.body)
    # print("----------------")
    # print(parsed2.body)
    # print("----------------")

    compare_contents(prepared1, prepared2)
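Note: recent Beautiful Soup 4 releases deprecate recursiveChildGenerator() in favor of the equivalent .descendants property, which yields the same nodes. A minimal sketch of the attribute-clearing loop above in the newer style:

for tag in parsed1.descendants:
    if hasattr(tag, 'attrs'):
        tag.attrs = {}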
Example #2
def extract(html_str, filter_list):
    soup = BeautifulSoup(html_str, 'lxml')

    for tag in soup.recursiveChildGenerator():
        if hasattr(tag, 'name') and tag.name in filter_list:
            tag.extract()
            print(tag)
Example #3
    def extract(self, html):
        soup = BeautifulSoup(html, 'html.parser')

        [s.extract() for s in soup('script')]
        [s.extract() for s in soup('style')]
        for br in soup.find_all("br"):
            if len(br.findChildren()) == 0:
                br.replace_with(" ")

        document = ''
        n_strs = [
            i for i in soup.recursiveChildGenerator()
            if type(i) == NavigableString
        ]
        for n_str in n_strs:
            content = n_str.strip()
            if not content:
                continue

            # Append the stripped text, one line per string.
            document += content
            document += '\n'

        names = self.extract_names(document)
        # for n in names:
        #   print n
        return names
Example #4
def sanitizeHTML(value, mode='none'):
    """ Удаляет из value html-теги.
        Если mode==none - все теги
        Если mode==strict - все теги кроме разрешенных
    """
    if mode == 'strict':
        valid_tags = 'p i em strong b u a h1 h2 h3 h4 pre br div span ul ol li img ' \
                     'blockquote object param embed iframe ' \
                     'table thead tbody tr td'.split()
    else:
        valid_tags = []

    valid_attrs = 'href src pic user page class text title alt style colspan rowspan rel'.split()
    # attributes used by embedded video players
    valid_attrs += 'width height classid codebase id name value flashvars webkitallowfullscreen mozallowfullscreen ' \
                   'allowfullscreen allowscriptaccess ' \
                   'quality src type bgcolor base seamlesstabbing swLiveConnect pluginspage data frameborder'.split()

    soup = BeautifulSoup(value.encode('utf8'), 'html.parser', from_encoding='utf8')
    for tag in soup.recursiveChildGenerator():
        if isinstance(tag, element.Tag):
            if tag.name in valid_tags:
                tag.attrs = dict((attr, val) for attr, val in tag.attrs.items() if attr in valid_attrs)
            else:
                tag.hidden = True

        elif isinstance(tag, element.Comment):
            tag.extract()

    return soup.renderContents().decode('utf8')
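A quick illustration of sanitizeHTML (the input string is made up); roughly:

html = '<p class="intro" onclick="evil()">Hello <font color="red">world</font></p>'
print(sanitizeHTML(html))            # 'Hello world' -- every tag is hidden
print(sanitizeHTML(html, 'strict'))  # '<p class="intro">Hello world</p>' -- onclick and <font> are dropped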
Example #5
def ReadHtmlKegTable(CommonArea):
    print(
        "Trying to read the online kegg html table to translate OrgId to OrgName : http://www.genome.jp/kegg/catalog/org_list.html"
    )
    from bs4 import BeautifulSoup
    import urllib2

    dOrgIdOrgName = dict()
    OrgId = None  # set by the 'show_organism' branch below before it is used
    f1 = urllib2.urlopen(CommonArea['ikeggtrans'])
    soup1 = BeautifulSoup(f1)

    for child in soup1.recursiveChildGenerator():
        name = getattr(child, "name", None)
        try:
            xchild = str(child)
            xname = str(name)
            if xname.startswith("a") and xchild.startswith(
                    '<a href="/dbget-bin/www_') and OrgId is not None:
                OrgName = xchild.split(">")[1].split('<')[0]
                dOrgIdOrgName[OrgId] = OrgName

            if xname.startswith("a") and xchild.startswith(
                    '<a href="/kegg-bin/show_organism?org='):
                OrgId = xchild.split(">")[1].split('<')[0]
        except:
            pass
    CommonArea['dOrgIdOrgName'] = dOrgIdOrgName
    return CommonArea
Example #6
def __extract_all(html, is_doc):
    """Extracting the text from the HTML of the WSO2 documentation and removing the code segments"""
    # Adding '.' to the end of each list in HTML body
    modified_html = re.sub(r'(<\/(li|h\d|td|th)>)', r'.\1', html)
    soup = BeautifulSoup(modified_html, 'lxml')
    if is_doc:
        [
            c.extract() for c in soup.findAll(
                'div',
                attrs={'class': 'code panel pdl conf-macro output-block'})
        ]

    data = list(soup.recursiveChildGenerator())
    visit_to_a = False
    # URLs are saved separately for future use
    urls = []
    output = ''

    # Working with the hyperlink
    for value in data:
        if value.name == 'a':
            visit_to_a = True
            if value.has_attr('href'):
                attr = value.get('href')
                urls.append(attr)
                if value.text != attr:
                    output += ' ' + value.text
        elif value.name is None and not visit_to_a:
            output += ' ' + value
        else:
            visit_to_a = False

    # Converting HTML entities into Unicode characters
    output = str(output)
    return output, urls
Example #7
def parse(path_to_file):
    with open(path_to_file, encoding='utf8') as html_file:
        soup = BeautifulSoup(html_file.read(), 'lxml').find(id='bodyContent')

        # Count images whose width is at least 200.
        imgs = len([img for img in soup.find_all('img', width=True) if int(img['width']) >= 200])

        # Count h1-h6 headers whose text starts with E, T, or C.
        headers = len([header for header in soup.find_all(('h1', 'h2', 'h3', 'h4', 'h5', 'h6'))
                       if header.text[:1] in ('E', 'T', 'C')])

        # Find the longest run of consecutive 'a' sibling tags.
        linkslen = 0
        for tag in soup.recursiveChildGenerator():
            if tag.name == 'a':
                cnt = 1
                next_tags = tag.find_next_siblings()
                if next_tags:
                    for i in next_tags:
                        if i.name == 'a':
                            cnt += 1
                        else:
                            break
                    if linkslen < cnt:
                        linkslen = cnt

        # Find ul, ol if one is not nested in other
        lists = sum(
            1 for tag in soup.find_all(['ol', 'ul']) if not tag.find_parent(['ol', 'ul']))

        return [imgs, headers, linkslen, lists]
Example #8
def extract_q_without_code(question):
    """Question body is extracted without the code and blockquote sections"""
    soup = BeautifulSoup(question, 'lxml')

    # Removing the code segments and the blockquotes from the body
    [c.extract() for c in soup('code')]
    [e.extract() for e in soup('blockquote')]

    data = list(soup.recursiveChildGenerator())
    visit_to_a = False
    output = ''

    # Not adding the hyperlinks to the output
    for value in data:
        if value.name == 'a':
            visit_to_a = True
            if value.has_attr('href') and value.text != value['href']:
                output += value.text

        elif value.name is None and not visit_to_a:
            output += value
        else:
            visit_to_a = False

    # Converting HTML entities into Unicode characters
    output = str(output)
    return output
Example #10
    def get_phones(self, url):
        for _ in range(3):
            response = requests.get(url)
            if response.ok:
                break
        else:
            raise HttpError

        log.debug('%02d: %s downloaded', self.number, url)

        soup = BeautifulSoup(response.content, 'lxml')

        for tag in soup.recursiveChildGenerator():
            if isinstance(tag, element.Tag):
                tag.hidden = True

            elif isinstance(tag, element.Comment):
                tag.extract()

        phones = set()
        text = soup.renderContents().decode('utf8')
        for match in re.findall(PHONE_RE, text):
            numbers = re.sub(r'\D', '', match)

            phone = self.canonize(numbers)
            if phone:
                phones.add(phone)

        return phones
Example #11
def extractall(html):
    """Extracting the text from the HTML and removing the code segments"""
    soup = BeautifulSoup(html, 'lxml')
    [
        c.extract() for c in soup.findAll(
            'div', attrs={'class': 'code panel pdl conf-macro output-block'})
    ]

    data = list(soup.recursiveChildGenerator())
    visit_to_a = False
    # URLs are saved separately for future use
    urls = []
    output = ''

    # Working with the hyperlink
    for value in data:
        if value.name == 'a':
            visit_to_a = True
            if value.has_attr('href'):
                urls.append(value['href'])
                if value.text != value['href']:
                    output += value.text
        elif value.name is None and not visit_to_a:
            output += ' ' + value
        else:
            visit_to_a = False

    # Converting HTML entities into Unicode characters
    output = str(output)

    return output, urls
Example #12
def parse_text(text):
    data = []
    soup = BeautifulSoup(text, 'lxml')
    for child in soup.recursiveChildGenerator():
        if child.name and child.name not in data:
            data.append(child.name)
    return data
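For instance (lxml adds the html/body wrappers around fragments):

print(parse_text('<div><p>Hi <b>there</b></p><p>again</p></div>'))
# ['html', 'body', 'div', 'p', 'b'] -- each tag name is recorded once, in document order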
Example #13
def wholeDaySchedule(day, who, ID):
    page = requests.get("http://rasp.guap.ru/?" + who + "=" + ID)
    raspGuap = BeautifulSoup(page.text, 'lxml')
    weekType = raspGuap.find('p').em.get('class')
    gettingSchedule = []
    firstDayGone = 0
    for child in raspGuap.recursiveChildGenerator():
        if firstDayGone == 1:
            if child.name == 'h4':
                gettingSchedule.append('\n' + child.get_text())
            elif child.name == 'span':
                gettingSchedule.append(child.get_text())
        if child.name == 'h3':
            if child.get_text() == day and firstDayGone == 0:
                firstDayGone = 1
            elif firstDayGone == 1:
                break
    if gettingSchedule == []:
        return 'Занятий нет'  # "No classes"
    else:
        retStr = ''
        if weekType == ['up']:
            retStr += 'Сейчас верхняя неделя ▲\n'  # "It is the upper week now"
        else:
            retStr += 'Сейчас нижняя неделя ▼\n'  # "It is the lower week now"
        for line in gettingSchedule:
            retStr += line + '\n'
        return retStr
Example #14
def find_all_links_on_page(path, start_page, cnt, visited, all_links):
    try:
        with open(f'{path}{start_page}', encoding='utf8') as f:
            soup = BeautifulSoup(f.read(), 'lxml')
            links = [
                link for link in BeautifulSoup.recursiveChildGenerator(soup)
                if link.name == 'a'
            ]
            temp = set()
            # Add the matching links to the dictionary
            for link in links:
                try:
                    pattern = r'^([\w/]*wiki/(\w*\D*\w+\D\w*))$'
                    href = link.get('href')
                    file_name = re.findall(pattern, href)
                    if file_name:
                        if file_name[0][1] not in visited:
                            temp.add(file_name[0][1])
                            if cnt in all_links:
                                all_links[cnt].add(file_name[0][1])
                            else:
                                all_links[cnt] = {file_name[0][1]}
                except:
                    continue
            return temp
    except:
        return
Example #15
def test_xml_7():
    from bs4 import BeautifulSoup

    with open('test/test_folder/srcml_manager/file_1.xml', 'r') as f:

        contents = f.read()

        soup = BeautifulSoup(contents, 'lxml')

        items = ["function", "if"]
        # items=["if"]
        result = list()
        index = 0
        line = 0

        # for child in soup.recursiveChildGenerator():
        #     if str(type(child)) == "<class 'bs4.element.NavigableString'>":
        #         print(index, child)
        #         index+=1
        #         if child=="\n":
        #             print("AAA")

        for child in soup.recursiveChildGenerator():
            print(child)
def reduce_xml(path):
    with open(path) as f:
        soup = BeautifulSoup(f, 'xml')

    tags_to_keep = soup.find_all([
        'score-partwise', 'part-list', 'score-part', 'part-name', 'part',
        'measure', 'attributes', 'divisions', 'key', 'fifths', 'time', 'beats',
        'beat-type', 'clef', 'sign', 'line', 'note', 'pitch', 'step', 'alter',
        'octave', 'duration', 'type', 'rest', 'dot', 'staff', 'notations',
        'slur', 'direction', 'direction-type', 'dynamics', 'ff', 'f', 'mf',
        'mp', 'p', 'pp', 'backup', 'chord'
    ])

    attributes_to_keep = [
        'version', 'encoding', 'id', 'number', 'type', 'placement'
    ]

    for x in soup.find_all():
        if x not in tags_to_keep:
            x.extract()

    for tag in soup.recursiveChildGenerator():
        if hasattr(tag, 'attrs'):
            tag.attrs = {
                key: tag.attrs[key]
                for key in tag.attrs if key in attributes_to_keep
            }

    return soup
Example #18
def getPreview(links, bookkeeping):
    titles = []
    urls = []
    previews = []

    # Loop through all the links returned by the query and parse through their HTML responses
    for link in links:
        with open('/webpages_raw/' + link, encoding='utf-8') as rawData:
            soup = BeautifulSoup(rawData.read(), "lxml")
            tags = ['h1', 'h2', 'h3', 'h4', 'p', 'li', 'table', 'address']
            snippet = ''
            for child in soup.recursiveChildGenerator():
                if child.name in tags:
                    # join lines of text so they look nice
                    segment = " ".join(
                        word for word in child.get_text().strip().split()
                        if word)
                    if segment:
                        snippet += segment if snippet == '' else ' ' + segment

                # Break when we have a large enough snippet
                if len(snippet) > 200:
                    break

            # If there was a title in the HTML text, grab it, else pull the first 50 characters from the snippet
            if soup.title and soup.title.string:
                titles.append(soup.title.string[:100])
            else:
                titles.append(snippet[:50] + '...')

            urls.append(bookkeeping[link])
            previews.append(snippet[:200] + '...')

    return titles, urls, previews
Example #19
def fetch_html(url):
    data = []  # collect extracted items locally so the function is self-contained
    res = requests.get(url)
    # check if url is json data
    # if url.endswith('.json'):
    #     Jdata = res.json()
    #     urlAppend = {'URL': url}
    #     Jdata.insert(0, urlAppend)
    #     print(data)
    #     return Jdata

    if not html_status(res):
        print('still down')
    else:
        soup = BeautifulSoup(res.content, 'html.parser')
        for x in soup.recursiveChildGenerator():
            if x.name is None and not x.isspace():
                item = {
                    'tag': x.parent.name,
                    'text': x.strip(),
                    'attrs': x.parent.attrs
                }
                print("Crawling...")
                data.append(item)

        urlAppend = {'URL': url}
        data.insert(0, urlAppend)
    return data
    def parseBioPages(self, response):
        bio_item = BioItem()
        homepage_xpath = "//div[@class='panel-pane pane-entity-field pane-node-field-up2-webpage']/a/@href"
        bio_xpath = "//td/p  | //div[contains(@class, 'panel-pane pane-entity-field pane-node-field-up2-bio')]/p"
        title_xpath = "//div/h2[@class='a-subtitle--primary o-card__subtitle o-card__subtitle--primary']/text()"
        full_name_xpath = "//h1/text()"
        email_xpath = "//div/div[@class='panel-pane pane-entity-field pane-node-field-up2-email']/a/text()"
        image_url_xpath = "//div[last()]/img/@src"
        bio_item['image_url'] = response.xpath(image_url_xpath).extract()
        bio_item['department'] = "Human-Computer Interaction Institute"
        bio_item['department_url'] = response.url
        bio_item['email'] = response.xpath(email_xpath).extract()
        bio_item['full_name'] = response.xpath(full_name_xpath).extract()
        bio_item['full_name'] = bio_item['full_name'][0].strip()
        bio_item['title'] = response.xpath(title_xpath).extract()
        if len(bio_item['title']) == 1:
            bio_item['title'] = bio_item['title'][0].strip()
        bio_item['homepage'] = response.xpath(homepage_xpath).extract()
        bio_item['biography'] = response.xpath(bio_xpath).extract()
        bio_item['biography'] = ' '.join(bio_item['biography'])
        soup = BeautifulSoup(bio_item['biography'], 'html.parser')
        for tag in soup.recursiveChildGenerator():
            try:
                print(tag.attrs)
                tag.attrs = dict((key, value) for key, value in tag.attrs.items() if key not in self.attributes_blacklist)
            except AttributeError:
                pass
        
        bio_item['biography'] = soup.prettify()

        return bio_item
        
def get_image(markdown_source_path, url_root):
    main_page = render_markdown_from_file(markdown_source_path, url_root)
    borscht = BeautifulSoup(main_page, 'html5lib')
    # Looking for an image tag with the correct ID
    for tag in borscht.recursiveChildGenerator():
        if tag.name is None:
            continue
        if tag.name == 'img' and tag.attrs.get('id') == 'preview':
            return tag.attrs['src']
    # Looking for an image before first h2 tag
    for tag in borscht.recursiveChildGenerator():
        if tag.name is None:
            continue
        if tag.name.startswith('h2'):
            return
        if tag.name == 'img':
            return tag.attrs['src']
Example #22
def html_tags(html_txt: str):
    """Count html tags in a html text and return a dictionary with html tags as keys and counts as values."""
    s = BeautifulSoup(html_txt, features='html.parser')
    tags = []
    for child in s.recursiveChildGenerator():
        if child.name:
            tags.append(child.name)
    return tags
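If per-tag counts are wanted (as the original docstring suggested), the returned list can be fed to collections.Counter:

from collections import Counter

counts = Counter(html_tags('<ul><li>a</li><li>b</li></ul>'))
print(counts)  # Counter({'li': 2, 'ul': 1}) -- html.parser adds no wrapper tags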
Example #23
def remove_style(conteudo):
    if 'style' not in conteudo:
        return conteudo  # shortcut that greatly speeds up inputs without any style

    soup = BeautifulSoup(conteudo, 'html.parser')
    for tag in soup.recursiveChildGenerator():
        if hasattr(tag, 'attrs'):
            tag.attrs = {k: v for k, v in tag.attrs.items() if k != 'style'}
    return str(soup)
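An illustrative call:

print(remove_style('<p style="color:red" class="x">hi</p>'))
# '<p class="x">hi</p>' -- only the style attribute is removed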
Example #24
def main():
    inp = input("Please write URL: ")
    r = requests.get(inp)
    soup = BeautifulSoup(r.text, 'html.parser')
    text = soup.body.getText()
    get_words_frequency(text)
    tags_frequency = get_tags_frequency(soup.recursiveChildGenerator())
    print(f'''links count : {tags_frequency.get('link', 0)}''')
    print(f'''images count : {tags_frequency.get('img', 0)}''')
Example #25
def parse_it(url):
    text = ''
    tags = ['p', 'a', 'h1', 'h2', 'h3', 'h4', 'h5', 'h6', 'ul', 'ol', 'li', 'span']
    page = requests.get(url)
    soup = BeautifulSoup(page.content, 'html.parser')
    for t in soup.recursiveChildGenerator():
        if t.name in tags:
            text += t.text
    return text
Example #26
    def traverseTree(self, markup):
        soup = markup
        if not isinstance(soup, BeautifulSoup):
            soup = BeautifulSoup(markup, "lxml")
        keys = self.events.keys()
        outputs = []
        for node in soup.recursiveChildGenerator():
            if node.name in keys:
                outputs.append(self.events[node.name](node))
        return outputs
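A minimal, self-contained sketch of the event-callback pattern used above (the events mapping is illustrative):

from bs4 import BeautifulSoup

events = {'a': lambda node: node.get('href')}
soup = BeautifulSoup('<p><a href="/x">x</a></p>', 'lxml')
outputs = [events[node.name](node)
           for node in soup.recursiveChildGenerator()
           if node.name in events]
print(outputs)  # ['/x']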
Example #27
    def _get_text(cls, html):
        """
        Extract text from html content by removing markup tags & code.
        :param html: String
        :return List of strings in the given content
        """
        soup = BeautifulSoup(html)
        [s.extract() for s in soup('code')]
        return [i for i in soup.recursiveChildGenerator()
                if isinstance(i, NavigableString)]
def parse_html(content):
    soup = BeautifulSoup(content, 'lxml')
    for child in soup.recursiveChildGenerator():
        if child.name:
            for tag in soup.find_all(child.name):
                html_text = tag.text
                if not html_text:
                    return 'empty'
                else:
                    return 'non empty'
def deep_search_sample(df):
    '''
    Builds a dataframe of Zillow deep-search results for each row of df
    '''
    api_url_base = 'http://www.zillow.com/webservice/GetDeepSearchResults.htm'

    columns = [
        'address', 'amount', 'zipcode', 'city', 'state', 'latitude',
        'longitude', 'usecode', 'bedrooms', 'last-updated'
    ]

    # lst that will be used to create dataframe
    for i in range(df.shape[0]):

        # grabs data from df
        address_param = df.loc[i, 'address']
        citystatezip_param = df.loc[i, 'city'] + ' ' + df.loc[i, 'state']

        # upload data as param
        payload = {'zws-id':os.environ['ZWID_API_KEY'], 'address': address_param, 'citystatezip':citystatezip_param,\
                'rentzestimate': 'true'}

        # uploads api
        current_house_info = single_query(api_url_base, payload)

        # api to dataframe
        html_soup = BeautifulSoup(current_house_info, features='html.parser')

        record = {}
        # build a record dictionary from the matching child tags
        for child in html_soup.recursiveChildGenerator():
            if child.name in columns:
                record[child.name] = html_soup.find(child.name).text

        if len(html_soup.find_all('amount')) == 2:
            rental_val = html_soup.find_all('amount')[1].text
            record['rent'] = rental_val

        if i == 0:
            deep_search_df = pd.DataFrame(record, index=[0])
        else:
            deep_search_df = deep_search_df.append(record, ignore_index=True)

    deep_search_df = clean_api_dataframe(deep_search_df)
    deep_search_df = deep_search_df[deep_search_df['bedrooms'] > 1]
    deep_search_df['rentPerUnit'] = deep_search_df['rent'] / (
        deep_search_df['bedrooms'] - 1)
    deep_search_df['totalMonthlyIncome'] = deep_search_df[
        'rentPerUnit'] * deep_search_df['bedrooms']

    mortgage_details(deep_search_df)
    one_year_nwroi(deep_search_df)
    deep_search_df = deep_search_df.dropna()

    return deep_search_df
Example #30
def reform(r):
    col.init()
    soup = BeautifulSoup(r.content, 'html.parser')
    res = ''

    for child in soup.recursiveChildGenerator():
        if child.name in ['p', 'ul', 'ol', 'li']:
            res += col.Style.RESET_ALL + child.text + '\n'
        elif child.name == 'a':
            res += col.Fore.BLUE + child.text + '\n'
    return re.sub(r'\n\n', '', res)
Example #31
        def fetch_database(dbData):
            triesLeft = self.setting_retryCount
            while (triesLeft > 0):
                try:
                    page = requests.get(
                        dbData.NAR_href,
                        timeout=self.setting_singleRequestTimeout)
                    dbData.response = page.status_code
                    self.done += 1
                    print(self.done / self.total * 100, "%", sep="")
                    if page.status_code >= 200 and page.status_code < 300:
                        dbData.status = "GOOD"
                    else:
                        dbData.status = "BAD"

                    soup_main = BeautifulSoup(page.text, 'html.parser')
                    text = [
                        i for i in soup_main.recursiveChildGenerator()
                        if type(i) == NavigableString
                    ]
                    currentYear = datetime.datetime.now().year
                    yearRange = range(1900, currentYear + 1)
                    for t in text:
                        for y in yearRange:
                            if str(y) in t:
                                if dbData.lastYear == -1 or dbData.lastYear < y:
                                    dbData.lastYear = y
                                if dbData.firstYear == -1 or dbData.firstYear > y:
                                    dbData.firstYear = y
                        if "last updated" in t:
                            result = CheckDate(t)
                            if result is not None:
                                if result.day is not None:
                                    dbData.update_day = result.day
                                if result.month is not None:
                                    dbData.update_month = result.month
                                if result.year is not None:
                                    dbData.update_year = result.year
                    break
                except (ConnectionError, ConnectionResetError,
                        urllib3.exceptions.ProtocolError,
                        requests.exceptions.ConnectionError,
                        requests.exceptions.ReadTimeout,
                        requests.exceptions.ChunkedEncodingError,
                        requests.exceptions.InvalidSchema,
                        socket.timeout, http.client.IncompleteRead,
                        requests.exceptions.ContentDecodingError):
                    pass
                except UnicodeEncodeError:
                    break  # unable to read website
                triesLeft -= 1
                time.sleep(self.setting_retrySleep)
def extract_entries(path):
    with open(path) as f:
        soup = BeautifulSoup(f, 'lxml')
        clean(soup)
        for child in soup.recursiveChildGenerator():
            name = getattr(child, 'name', None)
            if name is not None:
                if name.startswith('h'):
                    headline = get_headline(child)
                    if headline:
                        yield headline, child
Example #33
def lab(file_path="poc/input1.html", out_put_file_name="poc/output.html"):

    # Reading the HTML File
    with open(file_path, 'r') as j:
        content = j.read()

    out_file = open(out_put_file_name, "w")

    # content = html.unescape(fileContent)
    soup = BeautifulSoup(content, "html5lib")

    # kill all script and style elements
    for script in soup(["script", "style"]):
        script.extract()  # rip it out

    # Collect the text and comment children
    children = []
    for child in soup.recursiveChildGenerator():
        if isinstance(child, (NavigableString, Comment)):
            children.append(child)

    # Remove the collected children
    for child in children:
        child.extract()

    # print(soup)

    # Removing Attributes
    for tag in soup.recursiveChildGenerator():
        try:
            tag.attrs = {}
        except AttributeError:
            pass

    text = str(soup.body)
    percentage = div_snatcher(soup.body, text, out_put_file_name)
    return percentage
Example #34
class DataFormator():

    data = None
    soup = None
    # nonTextTags = ["img","a"]

    def __init__(self, content):
        self.data = content
        self.soup = BeautifulSoup(self.data)
        
    def getFormattedData(self):
        
        article = BlogArticle()
        parenttagname = ""
        tagname = ""
        tagChanged = False
        text = ""
        for child in self.soup.recursiveChildGenerator():
            
            name = getattr(child, "name", None)
            if name is not None:
                parenttagname = tagname
                tagname = name
                if name in Text2ContentMapper:
                    if text != "":
                        article.addContent("text", "", text.rstrip())
                        text = ""
                    self.getNonTextData(child, article)
                    
                tagChanged = True
            elif not child.isspace():  # leaf node, don't print spaces
                tagChanged = not tagChanged
                tagg = None
                if (tagChanged):
                    tagg = parenttagname
                else:
                    tagg = tagname
                if tagg not in Text2ContentMapper2:
                    text = text + child
        return article
                
                    
    def getNonTextData(self, node, barticle):
        mapattr = Text2ContentMapper[node.name]
        if mapattr == 'h':
            barticle.addContent(node.name, '', node.get_text())
        elif mapattr != '':
            atrb = node[mapattr] if node.has_attr(mapattr) else ''
            barticle.addContent(node.name, atrb, node.get_text())
        else:
            barticle.addContent(node.name, "", "")
def strip_tags(html_text):
    """
    Get the text in html tags
    :param html_text:  text containing possible html tags
    :return:
    """
    try:
        soup = BeautifulSoup(html_text, "html.parser")
        cleaned_text = ''.join([e for e in soup.recursiveChildGenerator() if isinstance(e, str)])
        return cleaned_text
    except:
        logging.error("Failed to strip html tags for %s" % (html_text))
        return html_text
Example #37
class Parser:
    def __init__(self, html_file):
        self.soup = BeautifulSoup(open(html_file), "lxml")

    def text_gen(self):
        for item in self.soup.recursiveChildGenerator():
            if item.string is not None:

                s = item.string.encode("utf-8", "ignore")

                # Many contracts have lines that are just one whitespace character;
                # this could be extended to skip empty lines as well
                if s != b" ":
                    yield item.string.encode("utf-8", "ignore")
Example #38
def strip_html_bs(text):
    """
    Use BeautifulSoup to strip off HTML but in such a way that <BR> and
    <P> tags get rendered as new lines
    """
    soup = BeautifulSoup(text, "html.parser")
    fragments = []
    for element in soup.recursiveChildGenerator():
        if isinstance(element, str):
            fragments.append(element.strip())
        elif element.name == 'br':
            fragments.append("\n")
        elif element.name == 'p':
            fragments.append("\n")
    result = "".join(fragments).strip()
    return result
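Example behaviour of the function above:

print(strip_html_bs('<p>one<br>two</p>'))
# one
# two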
Example #39
def extractall(html):
    soup = BeautifulSoup(html, 'lxml')
    data = list(soup.recursiveChildGenerator())
    visit_to_a = False
    output = ''

    for value in data:
        if value.name == 'a':
            visit_to_a = True
            output += value.text
            if value.has_attr('href') and value.text != value['href']:
                output += ' [' + value['href'] + '] '
        elif value.name is None and not visit_to_a:
            output += value
        else:
            visit_to_a = False

    return output
Example #40
def extractall(html):
    """Extracting the text from the HTML and checking whether there are codes or blockquotes"""
    soup = BeautifulSoup(html, 'lxml')

    # Obtaining the code segments from the body
    codes = [c.get_text() for c in soup('code')]
    errors = [e.get_text() for e in soup('blockquote')]

    has_codes = False
    for code in codes:
        if list(code).count('\n') > 1:
            has_codes = True

    has_errors = False
    for error in errors:
        if list(error).count('\n') > 1:
            has_errors = True

    # Check whether the HTML has codes or blockquotes
    has_codes_errors = has_codes or has_errors

    data = list(soup.recursiveChildGenerator())
    visit_to_a = False
    output = ''

    # Working with the hyperlink and images with links
    for value in data:
        if value.name == 'a':
            visit_to_a = True
            output += value.text
            if value.has_attr('href') and value.text != value['href']:
                output += ' [' + value['href'] + '] '

        elif value.name is None and not visit_to_a:
            output += value
        else:
            visit_to_a = False

    # Converting HTML entities into Unicode characters
    output = str(output)

    return output, has_codes_errors
Example #41
def external_urls(html, root_url):
    """
    Finds external links in an HTML fragment and returns an iterator
    with their URLs.

    root_url defines a root outside of which links are considered external.
    """
    s, root_host, root_path, q, f = urlsplit(root_url)

    def is_external(url):
        schema, host, path, query, fragment = urlsplit(url)
        return schema in ("", "http", "https") and host != "" and (host != root_host or not path.startswith(root_path))

    soup = BeautifulSoup(html.encode("utf8"), "html.parser", from_encoding="utf8")

    for tag in soup.recursiveChildGenerator():
        if isinstance(tag, element.Tag) and tag.name == "a":
            link = tag.attrs.get("href")
            if link and is_external(link):
                yield link
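A usage sketch (the HTML fragment and root URL are illustrative; requires `from bs4 import element`):

links = external_urls(u'<a href="http://other.com/a">x</a><a href="/local">y</a>',
                      'http://mysite.com/blog/')
print(list(links))  # ['http://other.com/a'] -- the relative link has no host, so it is not external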
Example #42
def parse_links(value):
    if isinstance(value, RichText):
        soup = BeautifulSoup(expand_db_html(value.source), 'html.parser')
    else:
        soup = BeautifulSoup(expand_db_html(value), 'html.parser')

    # This removes style tags <style>
    for s in soup('style'):
        s.decompose()

    # This removes all inline style attr's
    for tag in soup.recursiveChildGenerator():
        try:
            del tag['style']
        except TypeError:
            # NavigableString objects do not support attribute deletion
            pass

    # This adds the link markup
    link_tags = get_link_tags(soup)
    add_link_markup(link_tags)

    return soup
Example #43
class HTMLPARSER:
    
    headers = ['h1','h2','h3','h4','h5', 'h6']	
    keepTags = ['a','ul','ol','figure','strong','img','p','table','caption','th','tr','td','li','dl','dt']   
    imagesLocation = 'www.dev.applications.ene.gov.on.ca/dams/images/'
    docsLocation = 'http://www.dev.applications.ene.gov.on.ca/dams/Docs/'

    def __init__(self, file_name):
        
        self.file = file_name
        #self.soup = BeautifulSoup(open(file_name), "lxml")
        self.soup = BeautifulSoup(open(file_name), "html.parser") 
        #print self.soup.prettify("utf-8")
        self.string = ''
        self.toc = None
        self.id = file_name.split('/')[-1].split('-')[0]
        self.lastHeaderIndex = 0
        self.lastHeaderObj = {'h1': None, 'h2': None, 'h3': None, 'h4': None, 'h5': None, 'h6': None}

 
    def prettify(self):
        html = self.soup.prettify("utf-8")
        with open(self.file, "wb") as file:
            file.write(html)

    def header(self):
        title = self.soup.find('h1')
        self.string += str(title)
        self.lastHeaderIndex = self.headers.index(title.name)
        print self.lastHeaderIndex

        # print title.contents
        # print title.prettify()
        self.get_contents(title.next_sibling)

        while title.parent != None and title.parent.next_sibling != None:
            title = title.parent.next_sibling
            inside = title
            # print inside.contents

            while inside.next_sibling != None:
                try:
                    print "1 <br />" + str(title)
                    for child in inside.contents:
                        print "child ...." + str(child) + "<br />"
                        self.get_contents(child)
                except:
                    self.string += str(title)
                    print "in except <br />"
                inside = inside.next_sibling  # advance, otherwise this loop never ends

        # print vars(title)

    def get_contents(self, contents):
        contents = contents.next_sibling
        while contents != None:
            print "=====================" + str(contents) + "~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~"
            self.string += str(contents)
            contents = contents.next_sibling

    def DFS(self):
        parent = None
        current = None
        toc = None

        # change anchor links to documents
        self.change_doc_ref()

        # change image tag links
        self.change_image_source()

        for child in self.soup.recursiveChildGenerator():
            name = getattr(child, "name", None)
            if name is not None:
                if name in self.keepTags and self.check_parents(child):
                    self.string += child.prettify('utf-8')
                # tag is one of the header tags
                if name in self.headers:
                    toc = TOC(child, current)
                    self.lastHeaderObj[name] = toc
                    self.lastHeaderIndex = self.headers.index(name)
                    if name != 'h1':
                        current.add_content(self.string)
                        self.string = ''
                        self.lastHeaderObj[self.headers[self.headers.index(name) - 1]].insert(toc)
                        parent = current
                    current = toc
            elif not child.isspace():  # leaf node, don't print spaces
                if child.parent.name not in self.headers and self.check_parents(child) and self.lastHeaderObj['h1'] != None:
                    self.string += child.encode('utf-8')
        toc.add_content(self.string)
        self.toc = toc

    def build_TOC(self):
        tocStr = ''
        for child in self.soup.recursiveChildGenerator():
            name = getattr(child, "name", None)
            if name is not None and name in self.headers:
                tocStr += child.prettify('utf-8')
        return tocStr

    def aTag(self, tag):
        for x in tag.parents:
            if x.name in self.headers:
                return False
        return True

    def check_parents(self, tag):
        for x in tag.parents:
            if x.name in self.keepTags:
                return False
        return True

    def print_inorder(self, index, pid=0):
        tags = ["<" + x + ">" for x in self.keepTags]
        tags = ' '.join(tags)
        data = {'assetId': self.id, 'title': index.title, 'data': index.content, 'pid': str(pid), 'keepTags': tags}
        req = HTTP()
        req.post_data(data)
        if pid == 0:
            req.delete_existing()

        pid = int(req.send_request())

        for x in index.child:
            self.print_inorder(x, pid)

    def change_doc_ref(self):
        for x in self.soup.findAll('a'):
            if x.has_attr('href') and x['href'].count("Docs/"):
                x['href'] = self.docsLocation + x['href'].split('/')[-1]

    def change_image_source(self):
        for x in self.soup.findAll('img'):
            x['src'] = self.imagesLocation + x['src'].split('/')[-1]
def index():
    req = urllib2.urlopen("http://www.cinepolis.com/_CARTELERA/cartelera.aspx?ic=31") #Monterrey = 31, DF = 13

    content = req.read()
    encoding=req.headers['content-type'].split('charset=')[-1]
    soup = BeautifulSoup(unicode(content, encoding))
    
    all_data = {}

    theatres = {}
    movies = {}

    cT_hash = None
    cM_hash = None
    currentTheatre = None
    currentMovie = None

    monterrey_cinepolis = []
    images_to_movieTitles = {}
    titles_to_images = {}

    tags_used = []

    images = []

    movie_endings = ["Esp", "Sub", "Dig", "3D", "4DX", "IMAX", "XE"]

    def parseMovie(m):
        g = m.split(" ")
        i = 0
        while len(movie_endings) > i and (g[-(i+1)] in movie_endings):
            i += 1
        if i > 0:
            return " ".join(g[0:-i]), g[-i:]
        else:
            return m, []

    for tag in soup.recursiveChildGenerator():
        try:
            if tag.name == "span":
                if 'class' in tag.attrs and ("TitulosBlanco" in tag['class']   ):
                    currentTheatre = unicode(tag.string)
                    cT_hash = re.sub('[^0-9a-zA-Z]+', '_', unidecode(currentTheatre))
                    theatres[currentTheatre] = cT_hash
            if tag.name == "a":
                if 'class' in tag.attrs and  "peliculaCartelera" in tag['class'] :
                    currentMovie = unicode(tag.string)
                    cM_hash = re.sub('[^0-9a-zA-Z]+', '_', unidecode(currentMovie))

                    title, tags = parseMovie(currentMovie)
                    title_hash = re.sub('[^0-9a-zA-Z]+', '_', unidecode(title))

                    for t in tags:
                        if t not in tags_used:
                            tags_used.append(t)

                    if title not in movies:
                        movies[title] = title_hash

                    if currentMovie not in all_data:
                        all_data[currentMovie] = {"title":title, "title_hash":title_hash, "hash":cM_hash, "tags":tags, "theatres":{}}

                if 'class' in tag.attrs and "horariosCarteleraUnderline" in tag['class']:
                    time = unicode(tag.string)
                    time_href = unicode(tag['href'])

                    if currentTheatre not in all_data[currentMovie]["theatres"]:
                        all_data[currentMovie]["theatres"][currentTheatre] = {"times":[{"time":time, "link":time_href}]}
                    else:
                        all_data[currentMovie]["theatres"][currentTheatre]["times"].append({"time":time, "link":time_href})

            if tag.name =="img":
                image_url = unicode(tag['src']);
                if "http://www.cinepolis.com.mx/Imagenes/Peliculas" in image_url:
                    if "image" not in all_data[currentMovie]:
                        all_data[currentMovie]["image"] = image_url

        except:
            pass

    return template('cinepolis', theatres=theatres, movies=movies, tags=tags_used, data=all_data)
Example #45
    def _remove_tag(self):
        soup = BeautifulSoup(self.data, "lxml")
        self.outtext = ''.join([element for element in soup.recursiveChildGenerator() if isinstance(element, unicode)])
def index():
    req = urllib2.urlopen("http://www.cinepolis.com/_CARTELERA/cartelera.aspx?ic=31") #Monterrey = 31, DF = 13

    content = req.read()
    encoding=req.headers['content-type'].split('charset=')[-1]
    soup = BeautifulSoup(unicode(content, encoding))
    theatres = []
    movies = []


    currentTheatre = None
    currentMovie = None

    monterrey_cinepolis = []
    images_to_movieTitles = {}
    titles_to_images = {}

    tags_used = []

    images = []

    movie_endings = ["Esp", "Sub", "Dig", "3D", "4DX", "IMAX", "XE"]

    def parseMovie(m):
        g = m.split(" ")
        i = 0
        while len(movie_endings) > i and (g[-(i+1)] in movie_endings):
            i += 1
        if i > 0:
            return " ".join(g[0:-i]), g[-i:]
        else:
            return m, []

    for tag in soup.recursiveChildGenerator():
        try:
            if tag.name == "span":
                if 'class' in tag.attrs and ("TitulosBlanco" in tag['class']   ):
                    currentTheatre = unicode(tag.string)
                    theatres.append(currentTheatre)
            if tag.name == "a":
                if 'class' in tag.attrs and  "peliculaCartelera" in tag['class'] :
                    currentMovie = unicode(tag.string)

                    title, tags = parseMovie(currentMovie)

                    for t in tags:
                        if t not in tags_used:
                            tags_used.append(t)

                    if title not in movies:
                        movies.append(title)

                if 'class' in tag.attrs and "horariosCarteleraUnderline" in tag['class']:
                    time = unicode(tag.string)
                    time_href = unicode(tag['href'])
                    monterrey_cinepolis.append({"currentMovie": currentMovie, "title": title, "tags": tags, "theatre": currentTheatre, "time": time, "cineLink": time_href})
            if tag.name == "img":
                image_url = unicode(tag['src'])
                if "http://www.cinepolis.com.mx/Imagenes/Peliculas" in image_url:

                    imdat = {"title":title, "img_src":image_url}
                    if imdat not in images:
                        images.append(imdat)
                    if title not in titles_to_images:
                        titles_to_images[title] = image_url

                    if image_url not in images_to_movieTitles:
                        images_to_movieTitles[image_url] = [title]
                    else:
                        if title not in images_to_movieTitles[image_url]:
                            images_to_movieTitles[image_url].append(title)

        except:
            pass

    movieDATA = {"theatres":theatres, "movies":movies, "images":images, "tags":tags_used, "data":monterrey_cinepolis}
    test = {"jokes":['on','you'],"dummy":"json-file","are":["your",{"favourite":"right?","or":"so","i":"though"}]}
    return template('cinepolis', theatres=theatres, movies=movies, tags=tags, data=monterrey_cinepolis, images=titles_to_images) #json.dumps(movieDATA,indent=True, ensure_ascii=True, encoding="utf8")
Example #47
def strip_tags(html):
    soup = BeautifulSoup(html, "html.parser")
    return ''.join([e for e in soup.recursiveChildGenerator() if isinstance(e, str)])
Example #48
def xkcdify(content):
    """
    Replace text within a string as specified by the xkcd Substitutions comics.

    This takes an HTML fragment and replaces the text accordingly, wrapping the
    resulting substitutions in span tags.

    :param content: Original content with text to be replaced.
    :returns: Resulting content after xkcd substitutions.
    """

    def sub(matchobj):
        match = matchobj.group()
        key = match.lower().replace("-", " ")
        key1 = re.escape(key)
        key2 = re.escape(key.rstrip("'s"))

        # First, check if the match has a substitution.
        # If it doesn't, check as if the match were plural or possessive.
        if key1 in subs:
            result = subs[key1]
        elif key2 in subs:
            result = subs[key2]
            # If the pattern encountered a match that's the plural or
            # possessive form of a key, modify the return value accordingly.
            if match.endswith("s"):
                result = result + "s"
            elif match.endswith("'"):
                result = result + "'"
        else:
            return ""

        return result

    # Get all the plain text strings in the document without their tags.
    soup = BeautifulSoup(content, 'html.parser')
    content_strings = [element for element in soup.recursiveChildGenerator() \
                       if type(element) == NavigableString]

    for string in content_strings:
        # Use index to track where the current substring of plain text starts.
        index = 0

        # Use wrapper to string together plain text and span elements.
        wrapper_tag = soup.new_tag('span')

        # Upon each match, write to the wrapper the substitution result and the
        # plain text preceding it. Then update index to the position after the
        # matched substring to mark the start of the next plain text substring.
        for match in pattern.finditer(string):
            wrapper_tag.append(soup.new_string(string[index:match.start()]))
            replacement = soup.new_tag('span',
                                       **{
                                           'class': 'substitution',
                                           'data-tooltip': match.group()
                                       })
            replacement.string = sub(match)
            if replacement.string:
                wrapper_tag.append(replacement)
            else:
                wrapper_tag.append(soup.new_string(match.group()))
            index = match.end()

        # Keep the original plain text unless substitutions were made.
        if wrapper_tag.contents:
            # Only append the rest of the string if substitutions were made,
            # because we would otherwise be left with the full original string.
            wrapper_tag.append(string[index:])
            string.replace_with(wrapper_tag)
            wrapper_tag.unwrap()

    return str(soup)
Example #49
def make_sections(root):
    sections = []

    # unwrap unnecessary inline elements (other than b or strong)
    # sometimes people put crazy tags in body elements (like script) that
    # shouldn't be there -- must delete these from the document

    unwrap = {"i", "u", "em", "a", "span", "font"}
    delete = {"script", "style", "head", "meta", "link", "title", "noscript", "select", "form", "input", "nav", "iframe"}

    for tag in unwrap: 
        try:
            for match in root(tag):
                match.unwrap()
        except:
            pass

    for tag in delete: 
        try:
            for match in root(tag):
                match.extract()
        except:
            pass

    root = BeautifulSoup(" ".join(str(root).split()), "html.parser")

    for child in root.recursiveChildGenerator():
        if type(child) is bs4.element.NavigableString:

            # find all of a child node's parent names

            parents = set()
            element = child
            while element.name != "[document]":
                if element.name != None:
                    parents.add(element.name)
                element = element.parent

            # eliminate all h* headers

            h_header = False
            for parentTag in parents:
                if parentTag[0] == "h" and parentTag[1].isdigit():
                    h_header = True
                    break

            # get the body text of the child, stripping whitespace

            text = child.string.strip()

            # record h* or bold-type headers (if requested)

            if h_header or "b" in parents or "strong" in parents:
                if text != "":
                    sections.append(Heading(text))
                continue

            # record list items

            elif "li" in parents:
                if len(text) > 0:
                    if len(sections) == 0 or type(sections[-1]) is not List:
                        sections.append(List([text]))
                    else:
                        sections[-1].add_item(text)
                continue

            # record non-list paragraphs

            if len(text) > 10 and ("." in text or text[-1] == ":"):
                if text.count("}") + text.count("{") < 5 and text.count("=") < 2:
                    if text[0] == "." or text[0] == ":":
                        text = text[1:].strip()
                    sections.append(Paragraph(text))

    return sections
Example #50
def extract(html, remove_codes_blockquotes=False, num=1):
    """Extracting the text from the HTML text generated by the SO and checking whether there are codes or block-quotes

    :type html: str
    :param html: Text with the HTML tags
    :type remove_codes_blockquotes: bool
    :param remove_codes_blockquotes: Whether the codes and the block-quotes has to be removed or not
    :type num: int
    :param num: Lines with more than 'num' no. of new-lines are considered as codes or block-quotes
    :returns: Plain text
    """
    soup = BeautifulSoup(html, 'lxml')

    unimp_tags = ['style', 'script', '[document]', 'head', 'title']

    # Remove the unimportant tags themselves (the original looped over every
    # tag and deleted these names as attributes, which never removed anything)
    for tag in soup(unimp_tags):
        tag.extract()

    has_codes_errors = False
    if remove_codes_blockquotes:
        # Obtaining the code segments from the body
        codes = [c.get_text() for c in soup('code')]
        errors = [e.get_text() for e in soup('blockquote')]

        has_codes = False
        for code in codes:
            if list(code).count('\n') > num:
                has_codes = True

        has_errors = False
        for error in errors:
            if list(error).count('\n') > num:
                has_errors = True

        # Check whether the HTML has codes or blockquotes
        has_codes_errors = has_codes or has_errors

    data = list(soup.recursiveChildGenerator())
    visit_to_a = False
    output = ''

    # Working with the hyperlink and images with links
    for value in data:
        if value.name == 'a':
            visit_to_a = True
            output += value.text
            if value.has_attr('href') and value.text != value['href']:
                output += ' [' + value['href'] + '] '

        elif value.name is None and not visit_to_a:
            output += value
        else:
            visit_to_a = False

    # Converting HTML entities into Unicode characters
    output = str(output)

    if remove_codes_blockquotes:
        return output, has_codes_errors
    else:
        return output
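A usage sketch for extract() above (the HTML string is illustrative):

text, flagged = extract('<p>See <a href="http://x.y">docs</a></p><code>a\nb\nc</code>',
                        remove_codes_blockquotes=True)
# flagged is True: the <code> text contains more than num=1 newlines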
Example #51
def cleave(x):

   x = re.sub("[.,?!:;]+","",x)
   x = re.sub("\s+"," ",x)

   return x.strip()

ofile = open("data/ham_unspooled3.txt","w")
bs = BeautifulSoup(open("data/ham.xml").read())

bs = bs.find("body")
act = "0"
scene = "0"
lineno = "0"

for s in bs.recursiveChildGenerator():

   try:
      s.name
   except: continue

   if s.name == "sp":

      g = s.find("speaker")
      sp = g.string
      sp = sp.encode('utf-8')
      sp = re.sub(" ","_",sp)

      ofile.write(act+" "+scene+" "+lineno+" -1 "+sp+" -1 -1 -1 -1 -1 NEWLINE NEWLINE\n")
      ofile.write(act+" "+scene+" "+lineno+" -1 "+sp+" -1 -1 -1 -1 -1 SPEAKER "+sp.upper()+"\n")
      ofile.write(act+" "+scene+" "+lineno+" -1 "+sp+" -1 -1 -1 -1 -1 NEWLINE NEWLINE\n")
Example #52
			soup = BeautifulSoup(followedlink)
			for tag in soup.recursiveChildGenerator():
				name = getattr(tag, "name", None)
				if name is not None:
					unused += 1
				elif not tag.isspace():
					print tag
		time.sleep(6)