def url_tamil_text_filter( url ):
    """Fetch the page at *url* and pass its visible text to ``print_tamil_words``.

    The page is parsed with BeautifulSoup.  Text nodes whose parent is one of
    style, script, head, title, meta or the document node itself are dropped;
    the remaining nodes are stripped and joined with single spaces.

    :param url: address handed unchanged to ``urlopen``.
    :returns: None; all output comes from ``print_tamil_words``.
    """
    # Imported locally so the function does not depend on the file's import
    # block; ``closing`` also works for response objects that are not
    # themselves context managers.
    from contextlib import closing

    skipped_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')

    # The parser reads the response inside the constructor, so the connection
    # can be released immediately afterwards instead of being leaked.
    with closing(urlopen(url)) as response:
        if USE_BS4:
            tapage = bs4.BeautifulSoup(response, "html.parser")
        else:
            tapage = bs4.BeautifulSoup(response)

    # Ref: SO 1936466
    visible_nodes = [node for node in tapage.findAll(text=True)
                     if node.parent.name not in skipped_parents]
    tatext = u" ".join([node.strip() for node in visible_nodes])
    print_tamil_words( tatext )
Esempio n. 2
0
def url_tamil_text_filter( url ):
    """Download *url*, extract its visible text and give it to ``print_tamil_words``.

    Text nodes that sit directly under style, script, head, title, meta or
    the document node are ignored.  Every other text node is stripped of
    surrounding whitespace and the pieces are joined with one space.

    :param url: address handed unchanged to ``urlopen``.
    :returns: None; all output comes from ``print_tamil_words``.
    """
    # Local import keeps this function independent of the file's import block.
    from contextlib import closing

    hidden_parents = ('style', 'script', 'head', 'title', 'meta', '[document]')

    # BeautifulSoup consumes the response while constructing the tree, so
    # close the connection right after parsing rather than leaving it open.
    with closing(urlopen(url)) as response:
        if USE_BS4:
            tapage = bs4.BeautifulSoup(response, "html.parser")
        else:
            tapage = bs4.BeautifulSoup(response)

    # Ref: SO 1936466
    pieces = [node.strip() for node in tapage.findAll(text=True)
              if node.parent.name not in hidden_parents]
    tatext = " ".join(pieces)
    print_tamil_words( tatext )
Esempio n. 3
0
def url_tamil_text_filter( url ):
    """Fetch *url* and pass the text of its ``<body>`` to ``print_tamil_words``.

    When the parsed page has no ``<body>`` element, the text of the whole
    document is used instead of failing with ``AttributeError``.

    :param url: address handed unchanged to ``urlopen``.
    :returns: None; all output comes from ``print_tamil_words``.
    """
    # Local import keeps this function independent of the file's import block.
    from contextlib import closing

    # The response is read while the tree is built, so it can be closed as
    # soon as parsing finishes instead of leaking the connection.
    with closing(urlopen(url)) as response:
        tapage = bs4.BeautifulSoup(response)

    # Some parsers do not synthesise a <body> for fragments; ``body`` is then
    # None.  Compare with ``is None`` rather than relying on truthiness.
    root = tapage.body
    if root is None:
        root = tapage
    tatext = root.text
    print_tamil_words( tatext )
Esempio n. 4
0
#
# This file is part of 'open-tamil' package tests
#

import sys
import imp

# Resolve a ``reload`` callable for whichever interpreter is running.
try:
    reload  # Python 2.7
except NameError:
    try:
        from importlib import reload  # Python 3.4+
    except ImportError:
        from imp import reload  # Python 3.0 - 3.3

# Call the ``reload`` resolved above instead of the deprecated ``imp.reload``,
# so the compatibility shim is actually the code path in use.
reload(sys)
# sys.setdefaultencoding('utf-8')

import codecs
from tamil.utf8 import print_tamil_words
from transliterate import *
import operator

# Script entry point: every line of each file named on the command line is
# handed to print_tamil_words.
if __name__ == "__main__":
    input_paths = sys.argv[1:]
    if not input_paths:
        # No files given: show usage; the loop below then has nothing to do.
        print("Usage: tamilwordfilter.py <filename-1> <filename-2> ... \n")
    for path in input_paths:
        with codecs.open(path, mode="r", encoding="UTF-8") as handle:
            # SO:6475328 - read file line by line
            for text_line in handle:
                print_tamil_words(text_line)
Esempio n. 5
0
#!python
# -*- coding: utf-8 -*-
# (C) 2013-2018 Muthiah Annamalai
# 
# This file is part of 'open-tamil' package tests
# 
from __future__ import print_function
import sys
import codecs
from tamil.utf8 import print_tamil_words
from transliterate import *
import operator

# Script entry point: the full contents of each file named on the command
# line are handed to print_tamil_words in a single call.
if __name__ == u"__main__":
    cli_files = sys.argv[1:]
    if not cli_files:
        # No files given: show usage; the loop below then has nothing to do.
        print(u"Usage: tamilwordfilter.py <filename-1> <filename-2> ... \n")
    for source_path in cli_files:
        with codecs.open(source_path, mode="r", encoding="UTF-8") as reader:
            contents = reader.read()
            print_tamil_words( contents )
# (C) 2013-2019 Muthiah Annamalai
# 
# This file is part of 'open-tamil' package tests
# 
from __future__ import print_function
import sys
# Resolve a ``reload`` callable for whichever interpreter is running.
try:
    reload  # Python 2.7
except NameError:
    try:
        from importlib import reload  # Python 3.4+
    except ImportError:
        from imp import reload  # Python 3.0 - 3.3


# Python 2 only: ``sys.setdefaultencoding`` is removed from ``sys`` at
# startup and reloading the module brings it back.  Python 3 has no such
# function, so calling it unconditionally raised AttributeError at import.
if sys.version_info[0] < 3:
    reload(sys)
    sys.setdefaultencoding('utf-8')

import codecs
from tamil.utf8 import print_tamil_words
from transliterate import *
import operator

# Script entry point: stream each file named on the command line through
# print_tamil_words, one line at a time.
if __name__ == u"__main__":
    requested = sys.argv[1:]
    if not requested:
        # No files given: show usage; the loop below then has nothing to do.
        print(u"Usage: tamilwordfilter.py <filename-1> <filename-2> ... \n")
    for name in requested:
        with codecs.open(name, mode="r", encoding="UTF-8") as stream:
            # SO:6475328 - read file line by line
            for row in stream:
                print_tamil_words( row )