def url_tamil_text_filter( url ):
    """Fetch *url*, extract its text nodes and pass them to print_tamil_words.

    The page is parsed with BeautifulSoup ("html.parser" backend when
    USE_BS4 is set). Text nodes whose parent tag is one of style, script,
    head, title, meta or the document node itself are discarded; the rest
    are stripped, joined with single spaces and handed to
    print_tamil_words.

    :param url: address passed unchanged to urlopen.
    :return: None.
    """
    # Function-scope import so the block does not depend on the file header.
    from contextlib import closing

    # Parent tag names whose text is not page content.  Ref: SO 1936466
    skipped_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')

    # closing() guarantees the HTTP response is released even if parsing
    # raises; it only needs a .close() method, so it works with both the
    # Python 2 and Python 3 urlopen results.
    with closing(urlopen(url)) as response:
        if USE_BS4:
            tapage = bs4.BeautifulSoup(response, "html.parser")
        else:
            tapage = bs4.BeautifulSoup(response)

    tatext = tapage.findAll(text=True)
    tatext = [x for x in tatext if x.parent.name not in skipped_parents]
    tatext = u" ".join([txt.strip() for txt in tatext])
    print_tamil_words( tatext )
def url_tamil_text_filter( url ):
    """Fetch *url*, extract its text nodes and pass them to print_tamil_words.

    The page is parsed with BeautifulSoup ("html.parser" backend when
    USE_BS4 is set). Text nodes whose parent tag is one of style, script,
    head, title, meta or the document node itself are dropped; the
    remaining nodes are stripped, joined with single spaces and handed to
    print_tamil_words.

    :param url: address passed unchanged to urlopen.
    :return: None.
    """
    # Function-scope import so the block does not depend on the file header.
    from contextlib import closing

    # Parent tag names whose text is not page content.  Ref: SO 1936466
    excluded = ('style', 'script', 'head', 'title', 'meta', '[document]')

    # Close the HTTP response deterministically instead of leaving it to
    # garbage collection; closing() only requires a .close() method.
    with closing(urlopen(url)) as response:
        if USE_BS4:
            tapage = bs4.BeautifulSoup(response, "html.parser")
        else:
            tapage = bs4.BeautifulSoup(response)

    tatext = tapage.findAll(text=True)
    tatext = [x for x in tatext if x.parent.name not in excluded]
    tatext = " ".join([txt.strip() for txt in tatext])
    print_tamil_words( tatext )
def url_tamil_text_filter( url ):
    """Fetch *url* and pass the text of its <body> to print_tamil_words.

    :param url: address passed unchanged to urlopen.
    :return: None.
    """
    # Function-scope import so the block does not depend on the file header.
    from contextlib import closing

    # Close the HTTP response deterministically instead of leaving it to
    # garbage collection; closing() only requires a .close() method.
    with closing(urlopen(url)) as response:
        tapage = bs4.BeautifulSoup(response)

    tatext = tapage.body.text
    print_tamil_words( tatext )
#
# This file is part of 'open-tamil' package tests
#
# Command-line script: every file named on the command line is opened as
# UTF-8 and fed, line by line, to tamil.utf8.print_tamil_words.
#
import sys

# Resolve a reload() callable for whichever interpreter is running.
try:
    reload  # Python 2.7
except NameError:
    try:
        from importlib import reload  # Python 3.4+
    except ImportError:
        from imp import reload  # Python 3.0 - 3.3

# Call the reload resolved above rather than imp.reload: the 'imp' module
# was removed in Python 3.12, so importing it unconditionally made the
# script fail at start-up on current interpreters.
reload(sys)
# sys.setdefaultencoding('utf-8')
import codecs
from tamil.utf8 import print_tamil_words
from transliterate import *
import operator

if __name__ == "__main__":
    # With no file arguments, show the usage text; the loop below then has
    # nothing to iterate over.
    if len(sys.argv) < 2:
        print("Usage: tamilwordfilter.py <filename-1> <filename-2> ... \n")
    for filename in sys.argv[1:]:
        with codecs.open(filename, "r", "UTF-8") as fp:
            for line in fp:  # SO:6475328 - read file line by line
                print_tamil_words(line)
#!python
# -*- coding: utf-8 -*-
# (C) 2013-2018 Muthiah Annamalai
#
# This file is part of 'open-tamil' package tests
#
# Command-line script: each file named on the command line is read in full
# as UTF-8 and its contents are passed to tamil.utf8.print_tamil_words.
#
from __future__ import print_function
import sys
import codecs
from tamil.utf8 import print_tamil_words
from transliterate import *
import operator

if __name__ == u"__main__":
    # Everything after the script name is treated as an input file path.
    input_paths = sys.argv[1:]
    if not input_paths:
        print(u"Usage: tamilwordfilter.py <filename-1> <filename-2> ... \n")
    for input_path in input_paths:
        with codecs.open(input_path,"r","UTF-8") as handle:
            # The whole file is decoded in one read before filtering.
            contents = handle.read()
            print_tamil_words( contents )
# (C) 2013-2019 Muthiah Annamalai
#
# This file is part of 'open-tamil' package tests
#
# Command-line script: every file named on the command line is opened as
# UTF-8 and fed, line by line, to tamil.utf8.print_tamil_words.
#
from __future__ import print_function
import sys

# Resolve a reload() callable for whichever interpreter is running.
try:
    reload  # Python 2.7
except NameError:
    try:
        from importlib import reload  # Python 3.4+
    except ImportError:
        from imp import reload  # Python 3.0 - 3.3
reload(sys)
# sys.setdefaultencoding is absent on Python 3, so calling it
# unconditionally raised AttributeError before any file was processed.
# Only call it where the attribute is actually present.
if hasattr(sys, 'setdefaultencoding'):
    sys.setdefaultencoding('utf-8')
import codecs
from tamil.utf8 import print_tamil_words
from transliterate import *
import operator

if __name__ == u"__main__":
    # With no file arguments, show the usage text; the loop below then has
    # nothing to iterate over.
    if len(sys.argv) < 2:
        print(u"Usage: tamilwordfilter.py <filename-1> <filename-2> ... \n")
    for filename in sys.argv[1:]:
        with codecs.open(filename,"r","UTF-8") as fp:
            for line in fp: #SO:6475328 - read file line by line
                print_tamil_words( line )