Example #1
def getArticle(url):
    # Times of India pages need their own extractor; everything else
    # goes through the generic one.
    htmltext = gethtml.getHtmlText(url)
    if "timesofindia" in url:
        return getArticleTextTOI(htmltext)
    return getArticleText(htmltext)
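Every example in this listing calls into a project-local gethtml module that is not shown. A minimal sketch of what its getHtmlText helper presumably does, assuming it simply fetches a URL and returns the decoded body (the implementation below is hypothetical, not the project's actual code):

# Hypothetical stand-in for the project-local gethtml module; the real
# implementation is not included in this listing.
import urllib.request

def getHtmlText(url):
    # Fetch the page and return its body decoded as text.
    with urllib.request.urlopen(url) as response:
        return response.read().decode('utf-8', errors='replace')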
Example #2
import re
from urllib.parse import urlparse

def getURLs2(rss):
    htmltext = gethtml.getHtmlText(rss)
    # Pull every <link> element out of the raw feed text.
    links = re.findall(r'<link>(.+?)</link>', htmltext)
    # bool(urlparse(link)) is always True (a ParseResult is a non-empty
    # tuple), so validate by checking the scheme and host instead.
    goodlinks = [
        link for link in links
        if urlparse(link).scheme and urlparse(link).netloc
    ]
    return goodlinks
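A quick usage sketch for getURLs2; the feed URL below is illustrative, not taken from the original project:

# Illustrative feed URL; substitute any RSS feed.
for link in getURLs2('https://timesofindia.indiatimes.com/rssfeedstopstories.cms'):
    print(link)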
Example #3
from bs4 import BeautifulSoup

def getURLs(rss):
    titles = []
    soup = BeautifulSoup(gethtml.getHtmlText(rss), 'lxml')
    for item in soup.find_all('item'):
        # The HTML parser closes <link> immediately (it is a void element
        # in HTML), so only titles can be read here; see the XML sketch below.
        for i in item.find_all('title'):
            try:
                titles.append(i.contents[0])
            except Exception as ex:
                template = "An exception of type {0} occurred. Arguments:\n{1!r}"
                print(template.format(type(ex).__name__, ex.args))
    return titles
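The truncated-<link> problem noted above comes from HTML parsers treating <link> as a void element. Parsing the feed as XML sidesteps this; a sketch, assuming lxml is installed (BeautifulSoup's 'xml' feature requires it) and using a hypothetical helper name:

from bs4 import BeautifulSoup

def getLinksXML(rss):
    # The XML parser keeps <link> as a normal element, so its text
    # content (the article URL) survives.
    soup = BeautifulSoup(gethtml.getHtmlText(rss), 'xml')
    return [item.link.text for item in soup.find_all('item') if item.link]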
Example #4
def getArticle(url):
    htmltext = gethtml.getHtmlText(url)
    return getArticleText(htmltext)
Example #5
from bs4 import BeautifulSoup

def getArticleText(web_text):
    # Scrape the NDTV story list: for each story collect the title,
    # image, place, date, intro, and the full article body.
    soup = BeautifulSoup(web_text, 'lxml')
    article_all = soup.find('div', {'id': 'ins_storylist'})
    all_data = list()
    for article in article_all.ul.find_all('li'):
        data = dict()
        try:
            a_tag = article.find('a')
            title = a_tag['title']
            n_url = a_tag['href']
            image = a_tag.find('img')['src']
            # The dateline looks like "... | Day, Date, Place", so split
            # on the pipe, then on commas.
            place_date = article.find('div', {'class': 'nstory_dateline'})
            kk = place_date.text.split(' | ')[1].split(',')
            day = kk[0] + ", " + kk[1]
            place = " ".join(kk[2:])
            short_desc = article.find('div', {'class': 'nstory_intro'}).text
            data['title'] = title
            data['image'] = image
            data['place'] = place
            data['day'] = day
            data['short_desc'] = short_desc
            # Fetch the story page itself for the full body text.
            full = gethtml.getHtmlText(n_url)
            full_soup = BeautifulSoup(full, 'lxml')
            data['full_desc'] = full_soup.find(
                'div', {'id': 'ins_storybody'}).text
            all_data.append(data)
        except Exception as ex:
            # Skip stories whose markup does not match the expected layout.
            print("skipping story: {0!r}".format(ex))
            continue
    return all_data
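A usage sketch for getArticleText; the NDTV URL is illustrative and assumes the page still carries the ins_storylist markup the scraper expects:

web_text = gethtml.getHtmlText('https://www.ndtv.com/latest')  # illustrative URL
for story in getArticleText(web_text):
    print(story['title'])
    print(story['day'], '|', story['place'])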
Example #6
import re

def getURLs3(rss):
    htmltext = gethtml.getHtmlText(rss)
    # Despite the name, this collects the <pubDate> of every feed entry.
    dates = re.findall(r'<pubDate>(.+?)</pubDate>', htmltext)
    return dates
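RSS <pubDate> values follow the RFC 822 date format, so the strings getURLs3 returns can be parsed with the standard library; the feed URL below is illustrative:

from email.utils import parsedate_to_datetime

# Illustrative feed URL; substitute any RSS feed.
for raw in getURLs3('https://timesofindia.indiatimes.com/rssfeedstopstories.cms'):
    # e.g. "Tue, 10 Jun 2003 04:00:00 GMT" -> 2003-06-10T04:00:00+00:00
    print(parsedate_to_datetime(raw).isoformat())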