Ejemplo n.º 1
0
    def get_parameters_for_poem_content_page(self, category, poem_name,
                                             author_name):
        """Assemble the template parameters for a single poem's content page.

        Args:
            category: topic key used to look up the page logo.
            poem_name: poem title; also appended to the page title.
            author_name: poet name used to locate the poem row.

        Returns:
            dict with 'logo_path', 'title' and 'main_content' keys.
        """
        helper = DBHelper()
        # Dict literal evaluates top-to-bottom, so the DB calls happen in the
        # same order as before: logo lookup first, then poem content.
        return {
            'logo_path': self.img_path + helper.get_logo_for_category(category),
            'title': self.title_prefix + poem_name,
            'main_content': helper.get_poem_content(poem_name, author_name),
        }
Ejemplo n.º 2
0
    def get_parameters_for_paper_content_page(self, paper_title, author_name):
        """Assemble the template parameters for a research-paper content page.

        Args:
            paper_title: paper title; also appended to the page title.
            author_name: author used to locate the paper row.

        Returns:
            dict with 'logo_path', 'title' and 'main_content'; the paper's
            'link' field is rewritten into a full PDF URL.
        """
        helper = DBHelper()
        params = {}
        # The research-paper section always uses the fixed 'yanjiulunwen' logo.
        params['logo_path'] = self.img_path + helper.get_logo_for_category('yanjiulunwen')
        params['title'] = self.title_prefix + paper_title
        content = helper.get_paper_content(paper_title, author_name)
        # Prefix the relative PDF link in place before exposing the dict.
        content['link'] = self.pdf_path + content['link']
        params['main_content'] = content
        return params
Ejemplo n.º 3
0
class CreateJsonHelper:
    """Serializes GIF records from the database into the static JSON files
    served from ../../gif_api/.

    Fixes over the original: Python 2 ``print x`` statements replaced with the
    ``print(...)`` form already used elsewhere in this project, file handles
    managed with ``with`` (guaranteed close/flush), and a dead ``pass`` removed.
    """

    def __init__(self):
        # DB handle used by writeJson() to page through stored GIF items.
        self.db = DBHelper()

    def writeGifJson(self, gif_list):
        """Write gif_list as ``{'status': '200', 'data': [...]}`` to
        ../../gif_api/gif_main.json, replacing any existing file.

        Args:
            gif_list: list of JSON-serializable GIF records.
        """
        print(len(gif_list))

        response_result = {'status': '200', 'data': gif_list}
        json_content = json.dumps(response_result)

        name = '../../gif_api/gif_main.json'
        # Recreate the file rather than truncating in place (matches the
        # original behavior, e.g. if the path is a symlink).
        if os.path.exists(name):
            os.remove(name)
        print(name)
        with open(name, 'w') as output:
            output.write(json_content)

    def writeOkResultJson(self):
        """Write a bare ``{'status': '200'}`` success marker.

        NOTE(review): 'sucess.json' looks like a typo for 'success.json', but
        consumers may already fetch this exact path — left unchanged.
        """
        name = '../../gif_api/sucess.json'
        with open(name, 'w') as output:
            output.write(json.dumps({'status': '200'}))

    def writeJson(self):
        """Export DB rows 4500-4800 to the main GIF JSON file."""
        items = self.db.getGifItemsLimit(4500, 4800)
        self.writeGifJson(items)
Ejemplo n.º 4
0
    def get_parameters_for_blog_from_db(self, category):
        """Assemble the template parameters for a category's blog page.

        Args:
            category: topic key used to look up logo, Chinese name and blog row.

        Returns:
            dict with 'logo_path', 'title' and 'blog_dict'; the blog image
            path (when present) is rewritten into a full image URL.
        """
        helper = DBHelper()
        params = {}
        params['logo_path'] = self.img_path + helper.get_logo_for_category(category)

        localized_name = helper.get_chn_name_for_category(category)
        params['title'] = self.title_prefix + localized_name

        blog = helper.get_blog_dict_of_shishe(category)
        # Only prefix when an image is actually set; falsy values pass through.
        if blog['blog_img']:
            blog['blog_img'] = self.img_path + blog['blog_img']
        params['blog_dict'] = blog

        return params
Ejemplo n.º 5
0
    def get_parameters_for_shirenjianjie_from_db(self, category, pinyin):
        """Build template parameters for the poet-biography (shirenjianjie) page.

        Args:
            category: topic key used to look up the logo and Chinese name.
            pinyin: pinyin initial selecting which poets are listed.

        Returns:
            dict with 'logo_path', 'title', 'sliders' (list of slider dicts
            with image paths rewritten, or '' when the topic has none) and
            'main_content' (poet info list).
        """
        db = DBHelper()
        para_dict = {}

        # retrieve logo_url for page from topic db
        logo_path = db.get_logo_for_category(category)
        para_dict['logo_path'] = self.img_path + logo_path

        # retrieve chn_name for page from topic db
        chn_category = db.get_chn_name_for_category(category)
        para_dict['title'] = self.title_prefix + chn_category

        # retrieve slider for page from topic db
        slider_info = db.get_slider_info_for_category(chn_category)
        if slider_info:
            # Rewrite relative image paths only when sliders exist. The
            # original also looped over the '' sentinel, which only worked by
            # accident (iterating an empty string yields nothing) and would
            # crash on any non-empty string.
            for slider_dict in slider_info:
                slider_dict['path'] = self.img_path + slider_dict['path']
            para_dict['sliders'] = slider_info
        else:
            # Keep the historical '' sentinel so templates testing for
            # emptiness keep working.
            para_dict['sliders'] = ""

        # retrieve all poet info for the specific pinyin
        para_dict['main_content'] = db.get_all_poet_info_list(pinyin)

        return para_dict
Ejemplo n.º 6
0
    def get_parameters_for_topic_from_db(self, category, type):
        """Build template parameters for a topic landing page.

        Args:
            category: topic key used to look up the logo and Chinese name.
            type: one of "poem", "paper", "video" selecting the main content.
                NOTE: the name shadows the builtin ``type`` but is kept for
                caller compatibility.

        Returns:
            dict with 'logo_path', 'title', 'sliders' and — for recognized
            types — 'main_content'. For an unknown type only a message is
            printed and 'main_content' is absent (matching original behavior).
        """
        db = DBHelper()
        para_dict = {}

        # retrieve logo_url for page from topic db
        logo_path = db.get_logo_for_category(category)
        para_dict['logo_path'] = self.img_path + logo_path

        # retrieve chn_name for page from topic db
        chn_category = db.get_chn_name_for_category(category)
        para_dict['title'] = self.title_prefix + chn_category

        # retrieve slider for page from topic db
        slider_info = db.get_slider_info_for_category(chn_category)
        if slider_info:
            # Rewrite relative image paths only when sliders exist; the
            # original looped over the '' sentinel too, which only worked
            # because iterating an empty string yields nothing.
            for slider_dict in slider_info:
                slider_dict['path'] = self.img_path + slider_dict['path']
            para_dict['sliders'] = slider_info
        else:
            para_dict['sliders'] = ""

        if type == "poem":
            # get poet_poem_list
            para_dict['main_content'] = db.get_poet_info_list_for_a_category(chn_category)
        elif type == "paper":
            # get author_paper_list
            para_dict['main_content'] = db.get_author_info_list_for_paper()
        elif type == "video":
            # get author_video_list
            para_dict['main_content'] = db.get_author_info_list_for_video()
        else:
            # Deliberately best-effort: log and fall through without
            # 'main_content', as the original did.
            print("Wrong Type!")
        return para_dict
Ejemplo n.º 7
0
# One-off maintenance script for poemDB: every commented block below is a
# historical schema change or data fix that was run once through db.execute()
# and then commented out, leaving a record of past migrations.
from db.DBHelper import DBHelper

db = DBHelper()
# sql_query = "CREATE TABLE Paper (id int, title varchar(255), author varchar(255), link varchar(1000));"
# # sql_query = "ALTER TABLE Poet MODIFY description TEXT;"
# db.execute(sql_query)

# sql_query = "ALTER TABLE Poet ADD COLUMN source VARCHAR(255) AFTER description;"
# db.execute(sql_query)

# sql_query = "INSERT INTO Topic (id, name, chn_name, logo_url) VALUES (10, 'lishishijian', '歷史事件', 'logo-lishishijian.png');"
# # sql_query = "UPDATE `Topic` SET `blog_title` = '新嘉坡風土記(1936)' WHERE `id` = 11;"
# db.execute(sql_query)

# sql_query = "ALTER TABLE Topic ADD blog_content TEXT"
# db.execute(sql_query)

# sql_query = "show tables"
# [{'Tables_in_poemDB': 'Poem'}, {'Tables_in_poemDB': 'Poet'}, {'Tables_in_poemDB': 'Topic'}]
# sql_query = "describe Topic"
# sql_query = "SELECT id, name, chn_name, logo_url, slider from Topic WHERE id = 8;"
# sql_query = "SELECT * FROM Topic;"
# sql_query = "INSERT INTO Topic (id, name, chn_name, logo_url) VALUES (24, 'wenyihuodong', '文藝活動', 'logo-wenyihuodong.png');"
# sql_query = "UPDATE Poem SET category = replace(category, '詩作品選', '潘受作品選')"
# db.execute(sql_query)
# The block below is a saved result of "describe Topic":
# [{'Field': 'id', 'Type': 'int(11)', 'Null': 'NO', 'Key': 'PRI', 'Default': None, 'Extra': ''},
#  {'Field': 'name', 'Type': 'varchar(32)', 'Null': 'YES', 'Key': '', 'Default': None, 'Extra': ''},
#  {'Field': 'chn_name', 'Type': 'varchar(32)', 'Null': 'YES', 'Key': '', 'Default': None, 'Extra': ''},
#  {'Field': 'logo_url', 'Type': 'varchar(256)', 'Null': 'YES', 'Key': '', 'Default': None, 'Extra': ''},
#  {'Field': 'slider', 'Type': 'text', 'Null': 'YES', 'Key': '', 'Default': None, 'Extra': ''}
#  ]
Ejemplo n.º 8
0
from urllib import parse
from items import PwItem, StarItem, GenreItem, LinkItem
from db.DBHelper import DBHelper

# existmag cookie: 'mag' = only entries that have (magnet) links, 'all' = everything.

PageCount = 0
magkey = 'all'
# Mobile Safari user agent so the site serves the mobile layout the spider parses.
headers = {
    'user-agent':
    'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
}
# Presumably scoring weights for ranking entries (Chinese+HD > Chinese > HD)
# — TODO confirm against the code that consumes them.
ChineseHDPoint = 2
ChinesePoint = 1.5
HDPoint = 1.2
# Shared DB handle; True presumably selects a debug/alternate mode — TODO
# confirm against DBHelper's constructor.
db = DBHelper(True)


class PWSpider(scrapy.Spider):
    """Spider that crawls one javbus star page per key listed in star.json."""

    name = "pw"

    def start_requests(self):
        """Yield one request per configured star key, restricted to entries
        with links via the existmag=mag cookie."""
        base_url = 'https://www.javbus.com/star/'
        for star_key in self.getConfig('star.json'):
            yield scrapy.Request(url=base_url + star_key,
                                 headers=headers,
                                 cookies={'existmag': 'mag'},
                                 callback=self.parseStar)

    # yield scrapy.Request(url='https://www.javbus.com/genre', headers=headers, callback=self.parseGenre)
Ejemplo n.º 9
0
 def __init__(self):
     """Create the helper and open a database handle via DBHelper.

     Fix: removed the dead ``pass`` statement that followed the assignment.
     """
     self.db = DBHelper()
Ejemplo n.º 10
0
# Don't forget to add your pipeline to the ITEM_PIPELINES setting
# See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html

# from scrapy.pipelines.images import ImagesPipeline
from scrapy import Request
from scrapy.exceptions import DropItem
from items import PwItem
import shutil
from db.DBHelper import DBHelper
from scrapy.pipelines.images import ImagesPipeline

from items import PwItem, StarItem, GenreItem, LinkItem
from scrapy.utils.project import get_project_settings
import os

# Module-level DB handle shared by the pipeline classes below; False
# presumably selects a non-debug mode — TODO confirm against DBHelper.
db = DBHelper(False)


class PwImagePipeline(ImagesPipeline):
    """Image pipeline that persists link/genre/star items straight to the DB
    and only downloads media for the remaining (image-bearing) items."""

    def get_media_requests(self, item, info):
        """Store non-image items via the module-level db handle; for any other
        item yield a download request for its imageUrl.

        Generator semantics match the original: a stored item produces an
        empty request iterable.
        """
        # Ordered dispatch: first matching item type wins, same check order
        # as the original isinstance chain.
        # NOTE(review): 'inserGenreItem' looks like a typo for
        # 'insertGenreItem' but must match DBHelper's actual API — unchanged.
        persisters = (
            (LinkItem, db.insertAVLink),
            (GenreItem, db.inserGenreItem),
            (StarItem, db.insertStarItem),
        )
        for item_type, save in persisters:
            if isinstance(item, item_type):
                save(item)
                return None

        yield Request(item['imageUrl'], meta={'parseStar': item['parseStar']})
Ejemplo n.º 11
0
# One-off script that seeded the Topic table's slider data: the commented
# dicts below are the slider payloads (slider_type 0 = plain image list,
# slider_type 1 = images with caption lists) kept as a record after the
# insert was run once.
from db.DBHelper import DBHelper

db = DBHelper()
#
# # Create dict to JSON list
# chunlian = {"slider_type": 0, "urls": ['slider-chunlian.jpeg']}
# dazhuan = {"slider_type": 0, "urls": ['slider-dazhuan.png']}
# nus = {"slider_type": 0, "urls": ['slider-nus.png']}
# yinglian = {"slider_type": 0, "urls": ['slider-sgyinglian.png']}
# shuanglin = {"slider_type": 0, "urls": ['slider-shuanglin.jpg']}
# xinzhou = {"slider_type": 0, "urls": ['slider-xinzhou.png']}
# mingsheng = {
#     "slider_type": 1,
#     "urls_dict": {
#         "slider-mingshengguji-1.jpg": ["濱海灣", "魚尾獅", "1965-2019"],
#         "slider-shuanglin.jpg": ["184 Jalan Toa Payoh, Singapore 319944", "雙林寺"],
#     }
# }
# fengsu = {
#     "slider_type": 1,
#     "urls_dict": {
#         "slider-yusheng.jpeg": ["南洋文化傳統", "撈魚生", "2019"],
#         "slider-durian.jpg": ["南洋水果文化", "榴蓮"],
#     }
# }
# yuyan = {
#     "slider_type": 1,
#     "urls_dict": {
#         "slider-nanyang.jpg": ["百花齐放", "南洋方言", "2019"],
#         "slider-minnan.png": ["Hokkien", "闽南语"],
#     }
0
 def __init__(self):
     """Configure the jb51 funny-GIF spider (GB2312-encoded pages) and its DB helper."""
     start_url = "http://sc.jb51.net/gaoxiaotupian/"
     self.pageUrl = start_url
     self.domain = 'http://sc.jb51.net'
     BaseGifSpider.__init__(self, start_url, charset='gb2312')
     self.dbHelper = DBHelper()
Ejemplo n.º 13
0
class ScJbSpider(BaseGifSpider):
    """Spider for sc.jb51.net's funny-GIF section: walks the paginated index,
    each category page and its sub-pages, and saves every reachable .gif with
    its caption via DBHelper.

    NOTE(review): this is Python 2 code (``print full_url`` statement, byte
    strings fed to ``str.translate``/``string.maketrans``); left byte-identical
    because a port would change string semantics throughout.
    """

    def __init__(self):
        # Pages are GB2312-encoded; the base class handles decoding.
        self.pageUrl = "http://sc.jb51.net/gaoxiaotupian/"
        self.domain = 'http://sc.jb51.net'
        BaseGifSpider.__init__(self, self.pageUrl, charset='gb2312')
        self.dbHelper = DBHelper()

    def startRequest(self):
        """Crawl every index page -> category page -> child page -> detail page."""
        request_list = self.getTotalPageList()
        for item in request_list:
            # NOTE(review): 'list' shadows the builtin and 'item' is reused by
            # the inner loop; harmless here but worth renaming someday.
            list = self.startRequestCategoryList(item)
            for item in list:
                detail_list = self.requestChildPageList(item)
                for url in detail_list:
                    self.requestDetailPage(url)

    def startRequestCategoryList(self, url):
        """Return absolute category-page URLs linked from one index page.

        Returns None (implicitly) when the page could not be fetched.
        """
        tree = super(ScJbSpider, self).request(url)
        if tree is not None:
            target_list = []
            xhs = tree.xpath('//div[@class="all_list"]/ul/li')
            if xhs:
                length = len(xhs)
                # li positions are 1-based in XPath, hence 1..length.
                for index in range(1, length + 1):
                    hrefXhs = tree.xpath('//div[@class="all_list"]/ul/li[%d]/a/@href' % index)
                    if hrefXhs is not None and len(hrefXhs) > 0:
                        target_list.append(self.domain + hrefXhs[0])
            return target_list

    def formatChinese(self, title):
        # Encode an lxml text node to UTF-8 bytes (Python 2 workflow).
        return title.encode('utf-8')

    def getTotalPageList(self):
        """Return the first index URL plus list_<n>.html for every further page.

        The total page count is read from the pagination's <strong> text.
        """
        page_list = [self.pageUrl]
        tree = super(ScJbSpider, self).request()
        if tree is not None:
            nextPagesXhs = tree.xpath('//div[@class="pagelist"]//span[@class="pageinfo"]/strong[1]/text()')
            if nextPagesXhs:
                pageInfo = self.formatChinese(nextPagesXhs[0])
                prefix = self.pageUrl + 'list_%s.html'
                if re.match("\d+", pageInfo):
                    for index in range(2, 1 + int(pageInfo)):
                        url = prefix % (str(index))
                        page_list.append(url)
        return page_list

    def requestDetailPage(self, page_url):
        """Pair each .gif on a detail page with its caption and persist it.

        Captions are non-empty <p> texts in document order; a gif is saved only
        when a caption exists at the same position and the URL is reachable.
        """
        tree = super(ScJbSpider, self).request(page_url)
        if tree is not None:
            xhs = tree.xpath('//div[@class="content-c2"]')
            if xhs:
                srcXhs = tree.xpath('//div[@class="content-c2"]//p[count(*)=1]/img/@src')  # img src under <p> elements with exactly one child
                #titleXhs = tree.xpath('//div[@class="content-c2"]//p[not(self::text()[not(normalize-space())])]/text()')
                titleXhs = tree.xpath('//div[@class="content-c2"]//p[string-length(text()) > 0]/text()')
                title_list = []
                if titleXhs is not None and len(titleXhs):
                      for i in range(0,len(titleXhs)):
                           # Collapse whitespace control chars to spaces (byte-string translate).
                           title =self.formatChinese(titleXhs[i]).translate(string.maketrans("\n\t\r", "   ")).strip()
                           if  len(title) > 0:
                               title_list.append(title)

                if srcXhs is not None and len(srcXhs) > 0:
                    if  len(title_list) > 0:
                        titleLength = len(title_list)
                        for index in range(0, len(srcXhs)):
                            href = srcXhs[index]
                            if not href.startswith("http:"):
                                href = self.domain + href

                            if href.endswith('.gif'):
                                if index < titleLength:
                                    if self.isGifUrlAvailable(href):
                                        item = GifItem()
                                        item.gif_url = href
                                        item.gif_title = title_list[index]
                                        self.dbHelper.saveGifItem(item)

    def isGifUrlAvailable(self,gifUrl):
        # Delegates availability probing to the shared base implementation.
        return super(ScJbSpider,self).isGifUrlAvailable(gifUrl)

    def requestChildPageList(self, page_url):
        """Return page_url plus its numbered sub-pages (foo_2.htm, foo_3.htm, ...).

        The sub-page count is parsed from the first pagination link's text.
        """
        child_list = [page_url]
        tree = super(ScJbSpider, self).request(page_url)
        if tree is not None:
            nextPagesXhs = tree.xpath('//div[@class="content-c2"]/div/ul[1]/li[1]/a/text()')
            if nextPagesXhs:
                url = self.formatChinese(nextPagesXhs[0])
                regex = re.compile(r'\S+(\d+)\S+')
                result = regex.findall(url)
                if result:
                    pageCount = int(result[0])
                    url = page_url.split('.htm')[0]
                    for index in range(2, 1 + pageCount):
                       full_url = url + '_' + str(index) + '.htm'
                       child_list.append(full_url)
                       print full_url
        return child_list
Ejemplo n.º 14
0
 def __init__(self):
     """Configure the gaoxiaogif.com spider (GB2312-encoded pages) and its DB helper."""
     start_url = "http://www.gaoxiaogif.com/all/"
     self.pageUrl = start_url
     self.domain = 'http://www.gaoxiaogif.com'
     BaseGifSpider.__init__(self, start_url, charset='gb2312')
     self.dbHelper = DBHelper()
Ejemplo n.º 15
0
class GaoXiaoGifSpider(BaseGifSpider):
    """Spider for www.gaoxiaogif.com: walks the paginated /all/ index, visits
    each GIF detail page and saves every reachable .gif via DBHelper."""

    def __init__(self):
        """Set the start URL/domain (pages are GB2312-encoded) and DB helper."""
        self.pageUrl = "http://www.gaoxiaogif.com/all/"
        self.domain = 'http://www.gaoxiaogif.com'
        BaseGifSpider.__init__(self, self.pageUrl, charset='gb2312')
        self.dbHelper = DBHelper()

    def getTotalPageList(self):
        """Return the first index page plus every numbered index page.

        The last pagination link looks like 'index_<N>.html'; N is taken as
        the total page count.
        """
        pages = [self.pageUrl]
        tree = super(GaoXiaoGifSpider, self).request()
        links = tree.xpath('//div[@class="page"]/a/@href')
        if links is not None and len(links) > 0:
            last_link = links[-1]
            if last_link:
                parts = last_link.split("_")
                if len(parts) > 1:
                    total = int(parts[1].split(".")[0])
                    template = 'http://www.gaoxiaogif.com/all/index_%d.html'
                    for page_no in range(2, 1 + total):
                        pages.append(template % (page_no))

        return pages

    def parseCategoryPage(self, url):
        """Collect absolute detail-page URLs from one index page.

        Entries are read from li positions 2..N (li[1] is skipped, matching
        the site's list layout).
        """
        tree = super(GaoXiaoGifSpider, self).request(url)
        detail_urls = []
        if tree is not None:
            entries = tree.xpath('//div[@class="likepage"]/ul/li')
            if entries is not None and len(entries) > 0:
                for pos in range(2, 1 + len(entries)):
                    hrefs = tree.xpath(
                        '//div[@class="likepage"]/ul/li[%d]//a[@title]/@href' %
                        pos)
                    if hrefs is not None and len(hrefs) > 0:
                        link = hrefs[0]
                        if link.endswith('.html'):
                            target = link if link.startswith('http:') else self.domain + link
                            detail_urls.append(target)
        return detail_urls

    def parseGifDetailPage(self, url):
        """Save every reachable .gif on the detail page under its <h1> title."""
        tree = super(GaoXiaoGifSpider, self).request(url)
        headings = tree.xpath('//div[@class="listgif-title"]//h1/text()')
        if headings is not None and len(headings) > 0:
            page_title = self.formatChinese(headings[0])
            sources = tree.xpath(
                '//div[@class="listgif-giftu content_pic"]//img/@src')
            if sources is not None and len(sources) > 0:
                for src in sources:
                    gif_url = src if src.startswith("http:") else self.domain + src

                    if gif_url.endswith('.gif'):
                        item = GifItem()
                        if self.isGifUrlAvailable(gif_url):
                            item.gif_url = gif_url
                            item.gif_title = page_title
                            self.dbHelper.saveGifItem(item)

    def isGifUrlAvailable(self, gifUrl):
        """Delegate availability probing to the shared base implementation."""
        return super(GaoXiaoGifSpider, self).isGifUrlAvailable(gifUrl)

    def formatChinese(self, title):
        """Encode an lxml text node to UTF-8 bytes (Python 2 workflow)."""
        return title.encode('utf-8')

    def startRequest(self, reverse=False):
        """Crawl every index page and each detail page beneath it.

        Args:
            reverse: when True, walk both levels back-to-front without
                throttling; the forward path sleeps 10s between requests.
        """
        index_pages = self.getTotalPageList()
        if reverse:
            # Crawl back-to-front (same visit order as the original's manual
            # index arithmetic).
            for page in reversed(index_pages):
                detail_urls = self.parseCategoryPage(page)
                for detail in reversed(detail_urls):
                    self.parseGifDetailPage(detail)
        else:
            for page in index_pages:
                detail_urls = self.parseCategoryPage(page)
                time.sleep(10)
                for detail in detail_urls:
                    time.sleep(10)
                    self.parseGifDetailPage(detail)