def handle_data(self, data):
    """Handle a text node: forward it to the base parser, then scan the
    text for a ``TRANSLATED_TEXT=value`` pair and print the value.

    The payload is expected to be ``;``-separated ``key=value`` pairs.
    """
    HTMLParser.handle_data(self, data)
    for line in data.split(";"):
        (key, _, value) = line.partition("=")
        if key == "TRANSLATED_TEXT":
            # print() call form behaves identically on Python 2 and 3
            # for a single argument (fixes the Py2-only print statement).
            print(value)
            break
def handle_data(self, data):
    """Handle a text node: delegate to the base parser, then look for the
    ``TRANSLATED_TEXT`` key among ``;``-separated ``key=value`` pairs and
    print its value (stopping at the first hit).
    """
    HTMLParser.handle_data(self, data)
    for line in data.split(';'):
        (key, _, value) = line.partition('=')
        if key == 'TRANSLATED_TEXT':
            # Single-argument print() works the same on Python 2 and 3;
            # replaces the Python-2-only print statement.
            print(value)
            break
def handle_data(self, data):
    # Accumulate this text node's content for every tag-pattern slot that
    # is currently "armed".
    # NOTE(review): the structure of self.tags is not visible here; it
    # appears that tags[i][2][0] is a "capture now" flag set elsewhere and
    # tags[i][1] holds per-attribute [name, matched] pairs — confirm
    # against the start-tag handler.
    HTMLParser.handle_data(self, data)
    for i in range(0,self.taglistcnt):
        if self.tags[i][2][0]:
            # Append the raw text to the collected output for slot i.
            self.ret[i]+='%s'%data
            for el in self.tags[i][1]:#reset the get flag
                el[1]=False
            # Disarm the slot until the next matching start tag.
            self.tags[i][2][0] = False
def handle_data(self, data):
    """Index the words of a text node into the module-level inverted
    index ``dic`` (word -> list of document ids), using ``tmp[1]`` as the
    id of the current document.

    Fixes: Python-2-only ``dict.has_key`` replaced with ``in``; print
    statements replaced with single-argument print() calls (identical
    output on both Python versions).
    """
    HTMLParser.handle_data(self, data)
    print(data)  # debug trace of the raw text node
    words = re.findall(r'\b[\w,.]+?\b', data)
    if words:
        print(words)
        for word in words:
            if word in dic:
                # Record each document at most once per word.
                if tmp[1] not in dic[word]:
                    dic[word].append(tmp[1])
            else:
                dic[word] = [tmp[1]]
def handle_data(self, data):
    # Parse one "key: value" line of a proxy-server listing into the
    # current record self.dicts[self.index]; the module-level marker
    # strings (str_address, str_port, str_password, str_crypt_method)
    # decide which field this text node carries.  The value is everything
    # after the first ':'.
    HTMLParser.handle_data(self, data)
    if str_address in data:
        self.dicts[self.index]['server'] = data[data.find(':') + 1:]
    elif str_port in data:
        self.dicts[self.index]['server_port'] = data[data.find(':') + 1:]
    elif str_password in data:
        self.dicts[self.index]['password'] = data[data.find(':') + 1:]
    elif str_crypt_method in data:
        self.dicts[self.index]['method'] = data[data.find(':') + 1:]
        # The crypt method is evidently the last field of a record, so
        # only here do we advance to the next entry.
        self.index += 1
def strip_tags(html):
    """Drop all HTML markup from *html*, joining the text nodes with
    single spaces."""
    texts = []
    scanner = HTMLParser()
    scanner.handle_data = texts.append
    scanner.feed(html)
    scanner.close()
    return ' '.join(texts)
def htmlstrip(html):
    # Return the plain text of an HTML chapter page: removes a
    # site-specific ad placeholder, <script> blocks, HTML comments and
    # iframes, converts <br> variants to newlines, then strips all
    # remaining tags.
    #html = html.strip()
    #html = html.strip("http://")
    # Site-specific ad marker (the Chinese text is the literal marker
    # emitted by the site; do not translate).
    html = html.replace(u"<!面页章节尾部广告>", "")
    re_script = re.compile('<\s*script[^>]*>[^<]*<\s*/\s*script\s*>', re.I) #Script
    re_script_1 = re.compile(r'<script type="text/javascript">.+</script>', re.I)
    re_script_2 = re.compile(r'<script>.+</script>', re.I)
    re_script_3 = re.compile(r'<script type="text/javascript.+</script>', re.I)
    re_comment = re.compile(r'<!--.+//-->', re.I)
    re_iframe = re.compile(r'<iframe.+</iframe>', re.I)
    html = re_script.sub('', html) # strip SCRIPT blocks
    html = re_script_1.sub('', html) #strip script
    html = re_script_2.sub('', html)
    html = re_script_3.sub('', html)
    html = re_comment.sub('', html)
    html = re_iframe.sub('', html)
    # Removes every occurrence of this space-like character entirely —
    # presumably intentional for CJK prose; confirm the exact character.
    html = html.replace(' ', '')
    # Normalise line breaks before the tag stripping below.
    html = html.replace('<br />', '\n')
    html = html.replace('<br>', '\n')
    html = html.replace('<br/>', '\n')
    html = html.replace('\n\n\n\n', '\n\n')
    #soup = BeautifulSoup(html, fromEncoding = "utf-8")
    #html = soup.prettify()
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def htmlstrip(html):
    # Plain-text extractor for a chapter page (duplicate of the other
    # htmlstrip, formatted with double quotes): strips a site-specific ad
    # marker, scripts, comments and iframes, turns <br> variants into
    # newlines, then removes all remaining tags.
    # html = html.strip()
    # html = html.strip("http://")
    # The Chinese text is the site's literal ad-marker tag; keep verbatim.
    html = html.replace(u"<!面页章节尾部广告>", "")
    re_script = re.compile("<\s*script[^>]*>[^<]*<\s*/\s*script\s*>", re.I)  # Script
    re_script_1 = re.compile(r'<script type="text/javascript">.+</script>', re.I)
    re_script_2 = re.compile(r"<script>.+</script>", re.I)
    re_script_3 = re.compile(r'<script type="text/javascript.+</script>', re.I)
    re_comment = re.compile(r"<!--.+//-->", re.I)
    re_iframe = re.compile(r"<iframe.+</iframe>", re.I)
    html = re_script.sub("", html)  # strip SCRIPT blocks
    html = re_script_1.sub("", html)  # strip script
    html = re_script_2.sub("", html)
    html = re_script_3.sub("", html)
    html = re_comment.sub("", html)
    html = re_iframe.sub("", html)
    # Removes every occurrence of this space-like character — presumably
    # intentional for CJK prose; confirm the exact character.
    html = html.replace(" ", "")
    html = html.replace("<br />", "\n")
    html = html.replace("<br>", "\n")
    html = html.replace("<br/>", "\n")
    html = html.replace("\n\n\n\n", "\n\n")
    # soup = BeautifulSoup(html, fromEncoding = "utf-8")
    # html = soup.prettify()
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return "".join(result)
def strip_tags(self, html):
    """Return *html* with every tag removed, keeping only text nodes."""
    fragments = []
    extractor = HTMLParser()
    extractor.handle_data = fragments.append
    extractor.feed(html)
    extractor.close()
    return ''.join(fragments)
def Action(url,ext='pdf',output='.'):
    # Download every PDF linked from *url* into the *output* directory.
    # NOTE(review): Python-2-only code (urllib2, print statements,
    # unicode()).  `ext` is accepted but never used — the ".pdf" pattern
    # is hard-coded; `domain` is computed but unused.
    lpdf = []
    urlpdf = []
    namepdf = []
    #domain
    index = url.rfind('/')
    domain = url[0:index+1];
    print domain
    request = urllib2.Request(url);
    response = urllib2.urlopen(request);
    #content
    content = response.read()
    #resource
    # Anchors whose visible text ends in ".pdf".
    mode = '<a[^>]+>[^<]+.pdf[^>]+a>'
    lpdf = re.compile(mode).findall(content)
    parserurl = HTMLParser()
    parsername = HTMLParser()
    print lpdf
    for x in lpdf:
        # href="..." target of the anchor.
        sta = x.find("href=\"")+6
        end = x.find("\"",sta+1)
        urlpdf.append(x[sta:end])
        # Visible link text up to and including "pdf".
        sta = x.find(">")+1;
        end = x.find("pdf",sta);
        namepdf.append(x[sta:end+3])
    #print len(namepdf),len(urlpdf)
    # Un-escape entities in the extracted URLs ('&'.join restores the
    # '&' separators the parser split on).
    for i in range(len(urlpdf)):
        tmp = []
        parserurl.handle_data = tmp.append
        parserurl.feed(urlpdf[i])
        urlpdf[i] = '&'.join(tmp);
        # NOTE(review): close() inside the loop closes the parser after
        # the first iteration — later feed() calls hit a closed parser.
        parserurl.close()
    for i in range(len(namepdf)):
        tmp = []
        parsername.handle_data = tmp.append
        parsername.feed(namepdf[i])
        namepdf[i] = '&'.join(tmp);
    for i in range(len(urlpdf)):
        print urlpdf[i]
        print namepdf[i]
        urllib.urlretrieve(urlpdf[i],output + unicode(namepdf[i],"utf8"))
def cmd_genpot(config, options):
    """Generate the gettext pot file"""
    # Collects every .py file under config.source_dir, seeds the .pot
    # with the activity name / summary / description (so those msgids
    # appear exactly once), then lets xgettext merge the code strings in.
    os.chdir(config.source_dir)
    po_path = os.path.join(config.source_dir, 'po')
    if not os.path.isdir(po_path):
        os.mkdir(po_path)
    python_files = []
    for root, dirs_dummy, files in os.walk(config.source_dir):
        for file_name in files:
            if file_name.endswith('.py'):
                # Paths are stored relative to the source dir so the
                # .pot references stay stable across checkouts.
                file_path = os.path.relpath(os.path.join(root, file_name), config.source_dir)
                python_files.append(file_path)
    python_files.sort()
    # First write out a stub .pot file containing just the translated
    # activity name, then have xgettext merge the rest of the
    # translations into that. (We can't just append the activity name
    # to the end of the .pot file afterwards, because that might
    # create a duplicate msgid.)
    pot_file = os.path.join('po', '%s.pot' % config.bundle_name)
    escaped_name = _po_escape(config.activity_name)
    f = open(pot_file, 'w')
    f.write('#: activity/activity.info:2\n')
    f.write('msgid "%s"\n' % escaped_name)
    f.write('msgstr ""\n')
    if config.summary is not None:
        escaped_summary = _po_escape(config.summary)
        f.write('#: activity/activity.info:3\n')
        f.write('msgid "%s"\n' % escaped_summary)
        f.write('msgstr ""\n')
    if config.description is not None:
        # The description may contain markup; harvest only its text
        # nodes and emit each non-blank one as its own msgid.
        parser = HTMLParser()
        strings = []
        parser.handle_data = strings.append
        parser.feed(config.description)
        for s in strings:
            s = s.strip()
            if s:
                f.write('#: activity/activity.info:4\n')
                f.write('msgid "%s"\n' % _po_escape(s))
                f.write('msgstr ""\n')
    f.close()
    args = [ 'xgettext', '--join-existing', '--language=Python',
        '--keyword=_', '--add-comments=TRANS:', '--output=%s' % pot_file ]
    args += python_files
    retcode = subprocess.call(args)
    if retcode:
        # Python-2 print statement (legacy code).
        print 'ERROR - xgettext failed with return code %i.' % retcode
def crawl_item(self, url):
    # Crawl one item page: scrape image bbcode, quality, name, id, item
    # level, slot position, raw html and attribute text into self.__data,
    # then persist via save_to_db.  Retries up to self.__retryMax times;
    # a page that answers "no data" aborts silently.
    self.__data = {}
    for i in range(1, self.__retryMax):
        self.output_log("crawling " + url + " ... retry:" + str(i))
        tmpCont = self.request_url(url)
        if not tmpCont :
            continue
        if tmpCont.readline() == 'no data':
            self.output_log("---\t no data")
            return
        tmpSoup = self.parse_web_page(tmpCont.read())
        bbCode = tmpSoup.find(id='bbcode_content')
        # Every field below is best-effort: a missing node (or any other
        # error — the excepts are deliberately bare) falls back to ''.
        try :
            self.__data['img'] = re.compile(r'\[img\](.*)\[\/img\]').findall(bbCode.prettify())[0]
        except:
            self.__data['img'] = ''
        try :
            # Quality is encoded as a digit in the <h2> class name.
            self.__data['quality'] = re.compile(r'(\d)').findall(tmpSoup.find(id='item_detail').find('h2')['class'][0])[0]
        except:
            self.__data['quality'] = ''
        try :
            self.__data['name'] = tmpSoup.find(id='item_detail').find('strong').text
        except:
            self.__data['name'] = ''
        try :
            self.__data['id'] = re.compile(r'ID:([0-9]*)').findall(tmpSoup.find(id='item_detail').find('span').text)[0]
        except:
            self.__data['id'] = ''
        try :
            self.__data['qnumber'] = tmpSoup.find(id='item_detail').find(id='ilv').text
        except:
            self.__data['qnumber'] = ''
        try :
            self.__data['position'] = tmpSoup.find(id='item_detail').find('table').find('table').find('th').text
        except:
            self.__data['position'] = ''
        try :
            self.__data['html'] = tmpSoup.find(id='main').find_all('div')[1].prettify()
        except:
            self.__data['html'] = ''
        try :
            """ strip html tag """
            parser = HTMLParser()
            tmpList = []
            parser.handle_data = tmpList.append
            parser.feed(tmpSoup.find(id='item_detail').find(id='_dps').prettify().strip("\n"))
            parser.close()
            self.__data['attribute'] = ''.join(tmpList)
        except:
            self.__data['attribute'] = ''
        """ del temporary variables"""
        del(parser,tmpList,tmpSoup,bbCode,tmpCont)
        if not self.__data:
            continue
        return self.save_to_db(self.__data)
def stripTags(html):
    """Trim surrounding whitespace (including full-width spaces) from
    *html*, then return its text with all tags removed."""
    html = html.strip().strip('\r').strip('\n').strip(u'　')
    texts = []
    tag_eater = HTMLParser()
    tag_eater.handle_data = texts.append
    tag_eater.feed(html)
    tag_eater.close()
    return ''.join(texts)
def parse(data):
    """Feed *data* through an HTMLParser and return only its character
    data, concatenated."""
    collected = []
    hp = HTMLParser()
    hp.handle_data = collected.append
    hp.feed(data)
    text = "".join(collected)
    hp.close()
    return text
def html_parser(content):
    """Strip surrounding whitespace and all HTML tags from *content*."""
    stripped = content.strip()
    out = []
    hp = HTMLParser()
    hp.handle_data = out.append
    hp.feed(stripped)
    hp.close()
    return ''.join(out)
def strip_tags(htmlStr):
    """Remove every HTML tag from *htmlStr* and return the bare text."""
    text_parts = []
    hp = HTMLParser()
    hp.handle_data = text_parts.append
    hp.feed(htmlStr.strip().strip("\n"))
    hp.close()
    return ''.join(text_parts)
def strip_tags(html):
    """Return the text nodes of *html* concatenated, markup removed."""
    prepared = html.strip().strip("\n")
    buf = []
    extractor = HTMLParser()
    extractor.handle_data = buf.append
    extractor.feed(prepared)
    extractor.close()
    return "".join(buf)
def strip_tags(html):
    """Delete every HTML tag from *html*, keeping only character data."""
    collected = []
    remover = HTMLParser()
    remover.handle_data = collected.append
    remover.feed(html.strip().strip("\n"))
    remover.close()
    return "".join(collected)
def strip_tags(file_path):
    """Read the file at *file_path* and return its contents with all
    HTML markup removed.

    Fix: the input file is now closed deterministically via a context
    manager (the original leaked the file handle).
    """
    with open(file_path) as input_file:
        raw_data = input_file.read()
    plain_text = []
    parser = HTMLParser()
    parser.handle_data = plain_text.append
    parser.feed(raw_data)
    parser.close()
    return ''.join(plain_text)
def handle_data(self, data):
    """Count, per source file, how many text nodes contain the search
    string (case-insensitive), recording hits in self.text_found_in.

    Fixes: Py2-only print statement replaced with print(); dict
    membership tested with ``not in`` instead of scanning ``.keys()``.
    """
    if self.string_to_find in data.lower():
        if self.currentHtmFile not in self.text_found_in:
            self.text_found_in[self.currentHtmFile] = 1
        else:
            self.text_found_in[self.currentHtmFile] += 1
        if dbg:
            print("found in %s" % (self.currentHtmFile, ))
    return HTMLParser.handle_data(self, data)
def strip_tags(html, length):
    """Strip HTML tags from *html* and truncate the resulting text to at
    most *length* characters."""
    body = html.strip().strip("\n")
    text_bits = []
    hp = HTMLParser()
    hp.handle_data = text_bits.append
    hp.feed(body)
    hp.close()
    return ''.join(text_bits)[:length]
def handle_data(self, data):
    # For each text node outside <script>/<link>, record the original
    # text and a modified copy into the module-level lists LST_SUB and
    # LST_SUB_MODIFY, each prefixed with the raw start-tag text; the
    # modification appends a trademark sign to every 6-letter word
    # (Latin or Cyrillic).
    tmp_data = data
    if self.start_tag not in ['script', 'link']:
        # NOTE(review): this flag dance appears intended to emit each
        # start tag's raw text only once (clearing str_start_tag when the
        # same tag is seen again) — confirm against the start-tag handler
        # that sets self.start_tag / self.is_flag.
        if not self.is_flag:
            if self.str_start_tag == self.get_starttag_text():
                self.str_start_tag = ''
            else:
                self.is_flag = True
        if self.is_flag:
            self.str_start_tag = self.get_starttag_text()
            self.is_flag = False
        str_starttag_text = '%s%s' % (self.str_start_tag, tmp_data)
        LST_SUB.append(str_starttag_text)
        # Words of exactly six letters get a '™' appended.
        regex = re.compile(u'(\\b[a-zA-Zа-яА-Я]{6}\\b)', re.U)
        tmp_data = regex.sub(r'\1™', tmp_data)
        str_modify_text = '%s%s' % (self.str_start_tag, tmp_data)
        LST_SUB_MODIFY.append(str_modify_text)
    HTMLParser.handle_data(self, tmp_data)
def strip_tags(self, htmlStr):
    """Strip every HTML tag from *htmlStr* and return the plain text."""
    cleaned = htmlStr.strip().strip("\n")
    nodes = []
    sink = HTMLParser()
    sink.handle_data = nodes.append
    sink.feed(cleaned)
    sink.close()
    return ''.join(nodes)
def cmd_genpot(config, options):
    """Generate the gettext pot file"""
    # Duplicate of the other cmd_genpot: gathers all .py files under
    # config.source_dir, seeds the .pot with activity name / summary /
    # description, then merges code strings via xgettext.
    os.chdir(config.source_dir)
    po_path = os.path.join(config.source_dir, 'po')
    if not os.path.isdir(po_path):
        os.mkdir(po_path)
    python_files = []
    for root, dirs_dummy, files in os.walk(config.source_dir):
        for file_name in files:
            if file_name.endswith('.py'):
                # Store paths relative to the source dir for stable refs.
                file_path = os.path.relpath(os.path.join(root, file_name), config.source_dir)
                python_files.append(file_path)
    python_files.sort()
    # First write out a stub .pot file containing just the translated
    # activity name, then have xgettext merge the rest of the
    # translations into that. (We can't just append the activity name
    # to the end of the .pot file afterwards, because that might
    # create a duplicate msgid.)
    pot_file = os.path.join('po', '%s.pot' % config.bundle_name)
    escaped_name = _po_escape(config.activity_name)
    f = open(pot_file, 'w')
    f.write('#: activity/activity.info:2\n')
    f.write('msgid "%s"\n' % escaped_name)
    f.write('msgstr ""\n')
    if config.summary is not None:
        escaped_summary = _po_escape(config.summary)
        f.write('#: activity/activity.info:3\n')
        f.write('msgid "%s"\n' % escaped_summary)
        f.write('msgstr ""\n')
    if config.description is not None:
        # Harvest only the text nodes of the (possibly marked-up)
        # description, one msgid per non-blank string.
        parser = HTMLParser()
        strings = []
        parser.handle_data = strings.append
        parser.feed(config.description)
        for s in strings:
            s = s.strip()
            if s:
                f.write('#: activity/activity.info:4\n')
                f.write('msgid "%s"\n' % _po_escape(s))
                f.write('msgstr ""\n')
    f.close()
    args = ['xgettext', '--join-existing', '--language=Python',
        '--keyword=_', '--add-comments=TRANS:', '--output=%s' % pot_file]
    args += python_files
    retcode = subprocess.call(args)
    if retcode:
        # Python-2 print statement (legacy code).
        print 'ERROR - xgettext failed with return code %i.' % retcode
def htmlstrip(html):
    """Return the plain text of *html*: trims whitespace, drops empty
    '</>' artifacts, removes a leading 'http://' prefix, then strips all
    tags.

    Fix: the original used str.strip("http://"), which strips any of the
    characters h/t/p/:/'/' from BOTH ends of the string rather than
    removing the protocol prefix; replaced with an explicit prefix check.
    """
    html = html.strip()
    html = html.replace('</>', '')
    if html.startswith('http://'):
        html = html[len('http://'):]
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def strip_content(self):
    """Return self.content as UTF-8-encoded plain text with all HTML
    tags removed."""
    cleaned = self.content.strip().strip('\n')
    pieces = []
    hp = HTMLParser()
    hp.handle_data = pieces.append
    hp.feed(cleaned)
    hp.close()
    return ''.join(pieces).encode('utf-8')
def strip_tags(html):
    """Strip all HTML tags from *html* after trimming surrounding
    whitespace.

    Fix: the local import now falls back to ``html.parser`` so the
    function also works on Python 3, where the HTMLParser module was
    renamed.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def htmlstrip(html):
    """Plain-text extractor: trims whitespace, removes '</>' artifacts
    and a leading 'http://' prefix, then strips all remaining tags.

    Fix: str.strip("http://") strips the characters h/t/p/:/'/' from
    both ends instead of removing the prefix; replaced with an explicit
    startswith check.
    """
    html = html.strip()
    html = html.replace('</>', '')
    if html.startswith('http://'):
        html = html[len('http://'):]
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def to_text(s):
    """Convert HTML string *s* to plain text; returns None when *s* is
    None.

    Fix: identity comparison ``s is None`` replaces the Yoda-style
    ``None == s`` (PEP 8: comparisons to None use ``is``).
    """
    if s is None:
        return None
    html = s.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def to_text(s):
    """Return the plain text of HTML *s*, or None for None input.

    Fix: ``s is None`` replaces the Py-unidiomatic ``None == s`` test.
    """
    if s is None:
        return None
    html = s.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def html_strip(html):
    """Strip surrounding whitespace and all HTML tags from *html*.

    Fix: the local import falls back to ``html.parser`` so the function
    also runs on Python 3 (the module was renamed from HTMLParser).
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
def parse_html(html):
    """Strip HTML tags AND all whitespace from *html*.

    Fix: no longer shadows the builtin ``str``; the two-step
    split/join whitespace removal is collapsed into one expression.
    """
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    text = ''.join(result)
    # split() with no args splits on any whitespace run, so the join
    # removes every space/tab/newline inside the text.
    return ''.join(text.split())
def get_text(html):
    """Return the text content of *html* with every tag removed."""
    collector = HTMLParser()
    chunks = []
    collector.handle_data = chunks.append
    collector.feed(html.strip().strip('\n'))
    collector.close()
    return "".join(chunks)
def get_text(html):
    """Strip surrounding whitespace and all markup from *html*."""
    source = html.strip().strip('\n')
    fragments = []
    reader = HTMLParser()
    reader.handle_data = fragments.append
    reader.feed(source)
    reader.close()
    return "".join(fragments)
def cleanHtmlTag(self, html):
    """Strip HTML tags from *html*, then run the result through
    self.cleanHtmlAgain for a second cleaning pass."""
    trimmed = html.strip().strip("\n")
    texts = []
    hp = HTMLParser()
    hp.handle_data = texts.append
    hp.feed(trimmed)
    hp.close()
    return self.cleanHtmlAgain(''.join(texts))
def ultimate_rip(data):
    """Helper that scrubs every tag out of *data* and returns only the
    text (translated from the original Russian docstring: "helper for
    scrubbing tags — leaves nothing").

    Fix: the Python-2-only three-argument ``MethodType`` binding is
    replaced with plain attribute assignment; behavior is unchanged and
    the function now also runs on Python 3.
    """
    ripper = HTMLParser()
    ripper.fed = []
    ripper.handle_data = ripper.fed.append
    ripper.feed(data)
    return u''.join(ripper.fed)
def strip_tags(html):
    """Best-effort tag stripper: return the text of *html*, or '' when
    anything goes wrong (the error is printed, never raised).

    Fix: the Py2-only ``print e`` statement is replaced with print(e),
    which is valid on both Python 2 and 3.
    """
    try:
        html = html.strip()
        result = []
        parse = HTMLParser()
        parse.handle_data = result.append
        parse.feed(html)
        parse.close()
        return "".join(result)
    except Exception as e:
        print(e)
        return ''
def strip_tags(self, html):
    """Strip tags from *html*; empty or None input yields "".

    Fix: the local import falls back to ``html.parser`` so the method
    also works on Python 3.
    """
    if not html:
        return ""
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip()
    html = html.strip("\n")
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(html)
    parse.close()
    return "".join(result)
def strip_tags(text):
    """Remove all HTML markup from *text* (translated from the original
    Chinese docstring: "strip HTML tags").

    Fix: the local import falls back to ``html.parser`` for Python 3
    compatibility.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    text = text.strip()
    text = text.strip('\n')
    result = []
    parse = HTMLParser()
    parse.handle_data = result.append
    parse.feed(text)
    parse.close()
    return ''.join(result)
def transform_html_text(html):
    """Strip tags from *html* and normalise whitespace: text nodes are
    joined with spaces, newlines/tabs become spaces, and runs of spaces
    collapse to a single space."""
    nodes = []
    hp = HTMLParser()
    hp.handle_data = nodes.append
    hp.feed(html.strip().strip("\n"))
    hp.close()
    flattened = " ".join(nodes).replace('\n', ' ').replace('\t', ' ')
    collapsed, _ = re.subn(' +', ' ', flattened)
    return collapsed
def _content_to_list(self, content):
    # Tokenise an HTML document into a list of lowercase,
    # punctuation-free ASCII words.
    # NOTE(review): Python-2-only — the two-argument
    # str.translate(table, deletechars) form and string.maketrans("","")
    # do not exist on Python 3, where the bare except below would make
    # this always return [].
    try:
        parser = HTMLParser()
        parser.fed = []
        parser.handle_data = lambda d: parser.fed.append(d)
        parser.feed(content)
        return (''.join(parser.fed).encode('ascii', 'ignore')
                .translate(string.maketrans("",""), string.punctuation).lower().split())
    except:
        # Deliberate best-effort: any parse/encode failure yields an
        # empty word list rather than propagating.
        return list()
def strip_tags(self, htmlStr):
    # Strip <style> blocks, un-escape HTML entities, then remove all
    # remaining tags and return the text.
    htmlStr = htmlStr.strip()
    htmlStr = htmlStr.strip("\n")
    # Remove <style>...</style> blocks (the original comment here was
    # mojibake for "remove style tags").
    re_style = re.compile('<\s*style[^>]*>[^<]*<\s*/\s*style\s*>', re.I)
    htmlStr = re_style.sub('', htmlStr)
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    # NOTE(review): HTMLParser.unescape() was deprecated and removed in
    # Python 3.9 — this code targets Python 2 / early Python 3.
    htmlStr = parser.unescape(htmlStr)
    parser.feed(htmlStr)
    parser.close()
    return ''.join(result)
def _content_to_list(self, content):
    # Duplicate of the other _content_to_list: HTML -> list of lowercase
    # punctuation-free ASCII words.
    # NOTE(review): Python-2-only — string.maketrans / two-argument
    # str.translate were removed in Python 3, where the bare except
    # makes this always return [].
    try:
        parser = HTMLParser()
        parser.fed = []
        parser.handle_data = lambda d: parser.fed.append(d)
        parser.feed(content)
        return (''.join(parser.fed).encode('ascii', 'ignore').translate(
            string.maketrans("", ""), string.punctuation).lower().split())
    except:
        # Best-effort by design: swallow any failure and return no words.
        return list()
def strip_tags(self, htmlStr):
    """Filter HTML tags out of *htmlStr* using HTMLParser.

    :param htmlStr: markup to clean
    :return: the concatenated text nodes
    """
    source = htmlStr.strip().strip("\n")
    text_nodes = []
    hp = HTMLParser()
    hp.handle_data = text_nodes.append
    hp.feed(source)
    hp.close()
    return ''.join(text_nodes)
def strip_tags(html):
    """Filter HTML tags out of *html* and return the plain text
    (translated from the original Chinese docstring).

    Example: strip_tags("<font color=red>hello</font>") -> "hello"

    Fix: the local import falls back to ``html.parser`` for Python 3
    compatibility.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return ''.join(result)
def strip_tags(self,htmlStr):
    # Remove control characters and any UTF-8 BOM (raw bytes or the
    # escaped \xef\xbb\xbf form), then strip all HTML tags and return
    # the text.
    # NOTE(review): Python-2-only — codecs.BOM_UTF8 is a bytes object,
    # so re.sub(codecs.BOM_UTF8, ...) against a str raises TypeError on
    # Python 3.
    htmlStr = re.sub("[\\x00-\\x08\\x0b-\\x0c\\x0e-\\x1f\\x7f]","",htmlStr)
    htmlStr = re.sub(codecs.BOM_UTF8,"",htmlStr)
    htmlStr = re.sub("\\xef\\xbb\\xbf","",htmlStr)
    htmlStr = htmlStr.strip()
    htmlStr = htmlStr.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(htmlStr)
    parser.close()
    return ''.join(result)
def strip_tags(html):
    """Filter HTML markup out of *html*; ``None`` input yields ''."""
    if html is None:
        return ''
    cleaned = html.strip().strip("\n")
    texts = []
    hp = HTMLParser()
    hp.handle_data = texts.append
    hp.feed(cleaned)
    hp.close()
    return ''.join(texts)
def strip_tags1(htmlStr):
    """Filter HTML tags out of *htmlStr* via HTMLParser.

    :param htmlStr: markup to clean
    """
    prepared = htmlStr.strip().strip("\n")
    pieces = []
    cleaner = HTMLParser()
    cleaner.handle_data = pieces.append
    cleaner.feed(prepared)
    cleaner.close()
    return ''.join(pieces)
def get_post_summary(self, html=''):
    """Return a plain-text summary of *html* (translated from the
    original Chinese docstring): the text nodes joined with "<br/>",
    truncated to 200 characters.

    Fix: the local import falls back to ``html.parser`` for Python 3
    compatibility.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    result = "<br/>".join(result)
    if len(result) > 200:
        result = result[0:200]
    return result
def strip_tags(html):
    """Filter HTML markup out of *html* and return the plain text.

    Example: strip_tags("<font color=red>hello</font>") -> "hello"
    """
    cleaned = html.strip().strip("\n")
    texts = []
    hp = HTMLParser()
    hp.handle_data = texts.append
    hp.feed(cleaned)
    hp.close()
    return ''.join(texts)
def strip_tags(self, html):
    """Filter HTML markup out of *html* and return the plain text.

    Example: strip_tags("<font color=red>hello</font>") -> "hello"
    """
    body = html.strip().strip("\n")
    fragments = []
    sink = HTMLParser()
    sink.handle_data = fragments.append
    sink.feed(body)
    sink.close()
    return ''.join(fragments)
def handle_data(self, data):
    """For every (case-insensitive) match of the precompiled search
    regex in this text node, bump the per-file hit counter in
    self.text_found_in.

    Fixes: Py2-only print statement replaced with print(); dict
    membership tested with ``not in`` instead of scanning ``.keys()``.
    """
    lowered = data.lower()
    if self.regexp_for_search_compiled.search(lowered) is not None:
        # finditer() yields one match object per occurrence.
        for match in self.regexp_for_search_compiled.finditer(lowered):
            if dbg:
                print(match.start(), match.end())
            if self.currentHtmFile not in self.text_found_in:
                self.text_found_in[self.currentHtmFile] = 1
            else:
                self.text_found_in[self.currentHtmFile] += 1
    # if dbg: print("found in %s" % (self.currentHtmFile,))
    return HTMLParser.handle_data(self, data)
def strip_tags(html):
    """Strip HTML tags from *html* and cap the result at 2048 chars.

    Example: strip_tags("<font color=red>hello</font>") -> "hello"

    Fix: the local import falls back to ``html.parser`` for Python 3
    compatibility.
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip()
    html = html.strip("\n")
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    r = ''.join(result)
    if len(r) >= 2048:
        r = r[:2048]
    return r
def strip_tags_parser(self, html):
    """Strip HTML tags from *html*, joining the text nodes with '$'
    (translated from the original Chinese docstring).

    Example: "<font color=red>hello</font>" -> "hello"

    Fix: the local import falls back to ``html.parser`` for Python 3
    compatibility.

    :return: str
    """
    try:
        from HTMLParser import HTMLParser  # Python 2
    except ImportError:
        from html.parser import HTMLParser  # Python 3
    html = html.strip('\n')
    html = html.strip('\t')
    html = html.strip(' ')
    html = html.strip()
    result = []
    parser = HTMLParser()
    parser.handle_data = result.append
    parser.feed(html)
    parser.close()
    return '$'.join(result)
def prepareString(string, lenght=0, strip_html=True):
    # Prepare user-supplied markup for display: either strip ALL tags
    # (strip_html=True) or sanitise to a small tag whitelist via the
    # project helper html2safehtml; optionally truncate to `lenght`
    # characters with an "..." suffix, then HTML-escape the result.
    # NOTE(review): the parameter `string` shadows the stdlib module
    # name, `lenght` is a historical typo kept for keyword callers, and
    # cgi.escape was removed in Python 3.8 — legacy Python 2 code.
    string = string.strip()
    if (strip_html):
        result = []
        parser = HTMLParser()
        parser.handle_data = result.append
        parser.feed(string)
        parser.close()
        string = ''.join(result)
    else:
        string = html2safehtml(string, valid_tags=("b", "a", "i", "br", "ul", "li", "strong"))
    if lenght > 0:
        string = string[0:lenght]
        string += "..."
    string = cgi.escape(string)
    return string