Example #1
def getArticle(url):
    # Times of India pages need their own extractor; everything else
    # goes through the generic one.
    htmltext = gethtml.getHtmlText(url)
    if "timesofindia" in url:
        return getArticleTextTOI(htmltext)
    return getArticleText(htmltext)
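Every example in this listing calls into a project-local gethtml module that is not shown. A minimal sketch of what its getHtmlText helper presumably does, assuming it simply fetches a URL and returns the decoded body (the implementation below is hypothetical, not the project's actual code):

# Hypothetical stand-in for the project-local gethtml module; the real
# implementation is not included in this listing.
import urllib.request

def getHtmlText(url):
    # Fetch the page and return its body decoded as text.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8', errors='replace')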
Example #2
import re
from urllib.parse import urlparse

def getURLs2(rss):
    htmltext = gethtml.getHtmlText(rss)
    # Pull every <link> element out of the raw feed text.
    links = re.findall(r'<link>(.+?)</link>', htmltext)
    # bool(urlparse(link)) is always True (a ParseResult is a non-empty
    # tuple), so validate by checking the scheme and host instead.
    goodlinks = [
        link for link in links
        if urlparse(link).scheme and urlparse(link).netloc
    ]
    return goodlinks
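A quick usage sketch for getURLs2; the feed URL below is illustrative, not taken from the original project:

# Illustrative feed URL; substitute any RSS feed.
for link in getURLs2('https://timesofindia.indiatimes.com/rssfeedstopstories.cms'):
    print(link)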
Example #3
from bs4 import BeautifulSoup

def getURLs(rss):
    titles = []
    soup = BeautifulSoup(gethtml.getHtmlText(rss), 'lxml')
    for item in soup.find_all('item'):
        # The HTML parser closes <link> immediately (it is a void element
        # in HTML), so only titles can be read here; see the XML sketch below.
        for i in item.find_all('title'):
            try:
                titles.append(i.contents[0])
            except Exception as ex:
                template = "An exception of type {0} occurred. Arguments:\n{1!r}"
                print(template.format(type(ex).__name__, ex.args))
    return titles
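The truncated-<link> problem noted above comes from HTML parsers treating <link> as a void element. Parsing the feed as XML sidesteps this; a sketch, assuming lxml is installed (BeautifulSoup's 'xml' feature requires it) and using a hypothetical helper name:

from bs4 import BeautifulSoup

def getLinksXML(rss):
    # The XML parser keeps <link> as a normal element, so its text
    # content (the article URL) survives.
    soup = BeautifulSoup(gethtml.getHtmlText(rss), 'xml')
    return [item.link.text for item in soup.find_all('item') if item.link]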
Example #4
def getArticle(url):
    htmltext = gethtml.getHtmlText(url)
    return getArticleText(htmltext)
Example #5
from bs4 import BeautifulSoup

def getArticleText(web_text):
    # Scrape the NDTV story list: for each story collect the title,
    # image, place, date, intro, and the full article body.
    soup = BeautifulSoup(web_text, 'lxml')
    article_all = soup.find('div', {'id': 'ins_storylist'})
    all_data = list()
    for article in article_all.ul.find_all('li'):
        data = dict()
        try:
            a_tag = article.find('a')
            title = a_tag['title']
            n_url = a_tag['href']
            image = a_tag.find('img')['src']
            # The dateline looks like "... | Day, Date, Place", so split
            # on the pipe, then on commas.
            place_date = article.find('div', {'class': 'nstory_dateline'})
            kk = place_date.text.split(' | ')[1].split(',')
            day = kk[0] + ", " + kk[1]
            place = " ".join(kk[2:])
            short_desc = article.find('div', {'class': 'nstory_intro'}).text
            data['title'] = title
            data['image'] = image
            data['place'] = place
            data['day'] = day
            data['short_desc'] = short_desc
            # Fetch the story page itself for the full body text.
            full = gethtml.getHtmlText(n_url)
            full_soup = BeautifulSoup(full, 'lxml')
            data['full_desc'] = full_soup.find(
                'div', {'id': 'ins_storybody'}).text
            all_data.append(data)
        except Exception as ex:
            # Skip stories whose markup does not match the expected layout.
            print("skipping story: {0!r}".format(ex))
            continue
    return all_data
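A usage sketch for getArticleText; the NDTV URL is illustrative and assumes the page still carries the ins_storylist markup the scraper expects:

web_text = gethtml.getHtmlText('https://www.ndtv.com/latest')  # illustrative URL
for story in getArticleText(web_text):
    print(story['title'])
    print(story['day'], '|', story['place'])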
Example #6
import re

def getURLs3(rss):
    htmltext = gethtml.getHtmlText(rss)
    # Despite the name, this collects the <pubDate> of every feed entry.
    dates = re.findall(r'<pubDate>(.+?)</pubDate>', htmltext)
    return dates
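RSS <pubDate> values follow the RFC 822 date format, so the strings getURLs3 returns can be parsed with the standard library; the feed URL below is illustrative:

from email.utils import parsedate_to_datetime

# Illustrative feed URL; substitute any RSS feed.
for raw in getURLs3('https://timesofindia.indiatimes.com/rssfeedstopstories.cms'):
    # e.g. "Tue, 10 Jun 2003 04:00:00 GMT" -> 2003-06-10T04:00:00+00:00
    print(parsedate_to_datetime(raw).isoformat())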