Example #1
    def parse_url(self):
        db = database.get_db_connection()
        image_path = ''
        for item in item_list:
            news = {}
            title = item.find('title').text
            href_link = item.find('link').text
            publish_date = item.find('pubdate').text
            description = item.find('description').text


            match = re.search(r'(.*)\.<img', description)
            if match:
                description = match.group(1)
            else:
                description = ''


            news['title'] = title
            #print title
            news['description'] = description
            news['link'] = href_link
            news['pubDate'] = publish_date
            news['created_date'] = datetime.datetime.utcnow()

            req = request.get(href_link, headers=appconfig.get_config(appconfig.REQ_HEADERS_MOZILLA), allow_redirects=True)
            soup = Soup(req.content, "html.parser")


            tag = soup.find("div", {"id": "hcenter"})
            size = (256, 256)
            if tag:
                image_link = tag.find('img')['src']
                req = request.get(image_link)
                image_name = regex.remove_white_spaces.sub('', title[:10] + ".jpg")
                path = appconfig.get_config(appconfig.IMAGE_PATH)
                path = path['thumbnail_path'].strip()
                now = datetime.datetime.now()
                folder_name = str(now.strftime("%Y_%m_%d"))
                path = path + folder_name + "/"
                if not os.path.exists(path):
                    os.makedirs(path)

                try:
                    if image_link:
                        # Build the thumbnail first so its path is stored with the document.
                        image_path = image.create_thumbnail(size, req.content, image_name, path)
                        news['thumbnail_path'] = image_path
                    db.news.insert(news)
                except DuplicateKeyError:
                    print('duplicate title found')




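Example #1 relies on a project-local image.create_thumbnail(size, content, image_name, path) helper that is not shown on this page. Below is a minimal sketch of what such a helper might look like, built with Pillow; only the function name and argument order are taken from the call above, everything else is an assumption.

# Hypothetical sketch of the image module used in Example #1 (not the original source).
# Assumes Pillow is installed; the signature is taken from the create_thumbnail() call above.
import os
from io import BytesIO

from PIL import Image


def create_thumbnail(size, content, image_name, path):
    """Write a thumbnail of the downloaded image bytes and return its file path."""
    img = Image.open(BytesIO(content))
    img = img.convert("RGB")          # JPEG cannot store an alpha channel
    img.thumbnail(size)               # shrink in place, preserving aspect ratio
    full_path = os.path.join(path, image_name)
    img.save(full_path, "JPEG")
    return full_path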
Example #2
import pymongo
from pymongo.errors import ConnectionFailure

import appconfig


def get_db_connection():
    try:
        app_config_dict = appconfig.get_config(appconfig.DATA_BASE_CONFIG)
        host = app_config_dict['host']
        port = app_config_dict['port']
        db_name = app_config_dict['name']
        client = pymongo.MongoClient(host, int(port))
        db = client[db_name]
        return db
    except ConnectionFailure:
        # Propagate connection problems to the caller instead of raising the class itself.
        raise
Example #3
def get_appconfig(key):
    return appconfig.get_config(key)
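All of the examples on this page resolve their settings through the project's appconfig.get_config helper, which is not included here. The following is a minimal sketch of such a module, assuming the configuration is a plain dict keyed by the constants used in the examples; the constant names come from the calls above, while the stored values are illustrative placeholders only.

# Hypothetical appconfig module (assumption, not the original source).
# Only the get_config() signature and the key constants are taken from the examples;
# the values below are placeholders.

REQ_HEADERS_MOZILLA = 'req_headers_mozilla'
DATA_BASE_CONFIG = 'database'
IMAGE_PATH = 'image_path'

_CONFIG = {
    REQ_HEADERS_MOZILLA: {'User-Agent': 'Mozilla/5.0'},
    DATA_BASE_CONFIG: {'host': 'localhost', 'port': '27017', 'name': 'news'},
    IMAGE_PATH: {'thumbnail_path': '/tmp/thumbnails/'},
}


def get_config(key):
    """Return the configuration section registered under *key*."""
    return _CONFIG[key]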
Example #4
import requests as request
import re
from bs4 import BeautifulSoup as Soup
import datetime, time
from xtractor import Xtractor
from httpservice import HttpRequest
import database
import appconfig
import image
import regex
import os
from pymongo.errors import DuplicateKeyError  # caught in parse_url() when a duplicate title is inserted


URL = 'http://www.thehindu.com/news/cities/Mangalore/?service=rss'
http_request = HttpRequest()
http_response = http_request.get_response(URL, headers=appconfig.get_config(appconfig.REQ_HEADERS_MOZILLA))
soup = Soup(http_response.content, "html.parser")
item_list = soup.find_all('item')
#print item_list

class HinduExtractor(Xtractor):

    def parse_url(self):
        db = database.get_db_connection()
        image_path = ''
        for item in item_list:
            news = {}
            title = item.find('title').text
            href_link = item.find('link').text
            publish_date = item.find('pubdate').text
            description = item.find('description').text