def get_parameters_for_poem_content_page(self, category, poem_name, author_name):
    """Assemble the template parameters for a single poem's content page.

    Returns a dict with 'logo_path', 'title' and 'main_content' keys.
    """
    helper = DBHelper()
    # Keys are evaluated in the same order the original DB calls were made.
    return {
        'logo_path': self.img_path + helper.get_logo_for_category(category),
        'title': self.title_prefix + poem_name,
        'main_content': helper.get_poem_content(poem_name, author_name),
    }
def get_parameters_for_paper_content_page(self, paper_title, author_name):
    """Assemble the template parameters for a research-paper content page.

    The paper page always uses the 'yanjiulunwen' category logo, and the
    paper's relative PDF link is expanded with the configured pdf_path.
    """
    helper = DBHelper()
    params = {
        'logo_path': self.img_path + helper.get_logo_for_category('yanjiulunwen'),
        'title': self.title_prefix + paper_title,
        'main_content': helper.get_paper_content(paper_title, author_name),
    }
    # Turn the stored relative link into a full PDF URL.
    params['main_content']['link'] = self.pdf_path + params['main_content']['link']
    return params
class CreateJsonHelper:
    """Serialise GIF records from the database into JSON files for the gif API."""

    def __init__(self):
        self.db = DBHelper()

    def writeGifJson(self, gif_list):
        """Dump gif_list as {'status': '200', 'data': [...]} to gif_main.json.

        Any stale output file is removed first so the write starts clean.
        """
        print(len(gif_list))
        response_result = {'status': '200'}
        response_result['data'] = gif_list
        json_content = json.dumps(response_result)
        name = '../../gif_api/gif_main.json'
        if os.path.exists(name):
            os.remove(name)
        print(name)
        # 'with' guarantees the handle is flushed and closed even on error
        # (the original leaked the handle if write() raised).
        with open(name, 'w') as output:
            output.write(json_content)

    def writeOkResultJson(self):
        """Write a minimal {'status': '200'} marker file.

        NOTE(review): 'sucess.json' looks like a typo for 'success.json',
        but consumers may depend on the existing name -- left unchanged.
        """
        response_result = {'status': '200'}
        name = '../../gif_api/sucess.json'
        with open(name, 'w') as output:
            output.write(json.dumps(response_result))

    def writeJson(self):
        """Export the [4500, 4800) slice of GIF rows to the API file."""
        items = self.db.getGifItemsLimit(4500, 4800)
        self.writeGifJson(items)
def get_parameters_for_blog_from_db(self, category):
    """Collect template parameters for the blog page of *category*."""
    helper = DBHelper()
    params = {}
    # Page chrome: logo and localized title come from the Topic table.
    params['logo_path'] = self.img_path + helper.get_logo_for_category(category)
    params['title'] = self.title_prefix + helper.get_chn_name_for_category(category)
    blog_dict = helper.get_blog_dict_of_shishe(category)
    # Expand the blog image into a full path only when one is present.
    if blog_dict['blog_img']:
        blog_dict['blog_img'] = self.img_path + blog_dict['blog_img']
    params['blog_dict'] = blog_dict
    return params
def get_parameters_for_shirenjianjie_from_db(self, category, pinyin):
    """Template parameters for the poet-biography index filtered by *pinyin*."""
    helper = DBHelper()
    params = {}
    # Logo and Chinese title for the page come from the Topic table.
    params['logo_path'] = self.img_path + helper.get_logo_for_category(category)
    chn_category = helper.get_chn_name_for_category(category)
    params['title'] = self.title_prefix + chn_category
    # Sliders fall back to "" when none are configured; iterating an empty
    # string below is simply a no-op, matching the original contract.
    sliders = helper.get_slider_info_for_category(chn_category)  # slider_list
    params['sliders'] = sliders if sliders else ""
    for slider in params['sliders']:
        slider['path'] = self.img_path + slider['path']
    # Main content: every poet whose name starts with this pinyin letter.
    params['main_content'] = helper.get_all_poet_info_list(pinyin)
    return params
def get_parameters_for_topic_from_db(self, category, type):
    """Template parameters for a topic landing page.

    *type* selects which content list fills 'main_content':
    "poem", "paper" or "video"; anything else leaves it unset.
    """
    helper = DBHelper()
    params = {}
    # Logo and Chinese title for the page come from the Topic table.
    params['logo_path'] = self.img_path + helper.get_logo_for_category(category)
    chn_category = helper.get_chn_name_for_category(category)
    params['title'] = self.title_prefix + chn_category
    # Sliders fall back to ""; iterating an empty string is a no-op.
    sliders = helper.get_slider_info_for_category(chn_category)  # slider_list
    params['sliders'] = sliders if sliders else ""
    for slider in params['sliders']:
        slider['path'] = self.img_path + slider['path']
    # Dispatch on the requested content type.
    if type == "poem":
        params['main_content'] = helper.get_poet_info_list_for_a_category(chn_category)
    elif type == "paper":
        params['main_content'] = helper.get_author_info_list_for_paper()
    elif type == "video":
        params['main_content'] = helper.get_author_info_list_for_video()
    else:
        print("Wrong Type!")
    return params
# One-off maintenance scratchpad: ad-hoc SQL statements run against poemDB
# through DBHelper. Past statements are kept commented out as a change log;
# to rerun one, uncomment it and pass it to db.execute().
from db.DBHelper import DBHelper

db = DBHelper()
# sql_query = "CREATE TABLE Paper (id int, title varchar(255), author varchar(255), link varchar(1000));"
# # sql_query = "ALTER TABLE Poet MODIFY description TEXT;"
# db.execute(sql_query)
# sql_query = "ALTER TABLE Poet ADD COLUMN source VARCHAR(255) AFTER description;"
# db.execute(sql_query)
# sql_query = "INSERT INTO Topic (id, name, chn_name, logo_url) VALUES (10, 'lishishijian', '歷史事件', 'logo-lishishijian.png');"
# # sql_query = "UPDATE `Topic` SET `blog_title` = '新嘉坡風土記(1936)' WHERE `id` = 11;"
# db.execute(sql_query)
# sql_query = "ALTER TABLE Topic ADD blog_content TEXT"
# db.execute(sql_query)
# sql_query = "show tables"
# [{'Tables_in_poemDB': 'Poem'}, {'Tables_in_poemDB': 'Poet'}, {'Tables_in_poemDB': 'Topic'}]
# sql_query = "describe Topic"
# sql_query = "SELECT id, name, chn_name, logo_url, slider from Topic WHERE id = 8;"
# sql_query = "SELECT * FROM Topic;"
# sql_query = "INSERT INTO Topic (id, name, chn_name, logo_url) VALUES (24, 'wenyihuodong', '文藝活動', 'logo-wenyihuodong.png');"
# sql_query = "UPDATE Poem SET category = replace(category, '詩作品選', '潘受作品選')"
# db.execute(sql_query)
# Recorded output of `describe Topic` for reference:
# [{'Field': 'id', 'Type': 'int(11)', 'Null': 'NO', 'Key': 'PRI', 'Default': None, 'Extra': ''},
# {'Field': 'name', 'Type': 'varchar(32)', 'Null': 'YES', 'Key': '', 'Default': None, 'Extra': ''},
# {'Field': 'chn_name', 'Type': 'varchar(32)', 'Null': 'YES', 'Key': '', 'Default': None, 'Extra': ''},
# {'Field': 'logo_url', 'Type': 'varchar(256)', 'Null': 'YES', 'Key': '', 'Default': None, 'Extra': ''},
# {'Field': 'slider', 'Type': 'text', 'Null': 'YES', 'Key': '', 'Default': None, 'Extra': ''}
# ]
from urllib import parse
from items import PwItem, StarItem, GenreItem, LinkItem
from db.DBHelper import DBHelper

# existmag cookie: 'mag' selects only entries that have magnet links,
# 'all' selects everything.
PageCount = 0
magkey = 'all'
# Mobile Safari UA so the site serves its mobile layout.
headers = {
    'user-agent': 'Mozilla/5.0 (iPhone; CPU iPhone OS 11_0 like Mac OS X) AppleWebKit/604.1.38 (KHTML, like Gecko) Version/11.0 Mobile/15A372 Safari/604.1'
}
# Scoring weights used elsewhere in this spider -- presumably higher for
# Chinese-subtitled / HD releases; TODO confirm against the parse methods.
ChineseHDPoint = 2
ChinesePoint = 1.5
HDPoint = 1.2
db = DBHelper(True)


class PWSpider(scrapy.Spider):
    # Scrapy spider: crawls javbus star pages listed in star.json.
    name = "pw"

    def start_requests(self):
        # getConfig is defined elsewhere in this spider; it yields star keys.
        keys = self.getConfig('star.json')
        for key in keys:
            url = 'https://www.javbus.com/star/' + key
            yield scrapy.Request(url=url, headers=headers, cookies={'existmag': 'mag'}, callback=self.parseStar)
        # yield scrapy.Request(url='https://www.javbus.com/genre', headers=headers, callback=self.parseGenre)
def __init__(self):
    """Create the database helper this instance uses for persistence."""
    self.db = DBHelper()
    # (removed a redundant trailing `pass` -- dead code after real statements)
# Don't forget to add your pipeline to the ITEM_PIPELINES setting # See: https://doc.scrapy.org/en/latest/topics/item-pipeline.html # from scrapy.pipelines.images import ImagesPipeline from scrapy import Request from scrapy.exceptions import DropItem from items import PwItem import shutil from db.DBHelper import DBHelper from scrapy.pipelines.images import ImagesPipeline from items import PwItem, StarItem, GenreItem, LinkItem from scrapy.utils.project import get_project_settings import os db = DBHelper(False) class PwImagePipeline(ImagesPipeline): def get_media_requests(self, item, info): if isinstance(item, LinkItem): db.insertAVLink(item) return None if isinstance(item, GenreItem): db.inserGenreItem(item) return None if isinstance(item, StarItem): db.insertStarItem(item) return None yield Request(item['imageUrl'], meta={'parseStar': item['parseStar']})
# One-off script that seeded the Topic table's slider column; the commented
# dicts below are the slider payloads (type 0 = plain image list, type 1 =
# images with caption lines) that were serialised to JSON and stored.
from db.DBHelper import DBHelper

db = DBHelper()
# # # Create dict to JSON list
# chunlian = {"slider_type": 0, "urls": ['slider-chunlian.jpeg']}
# dazhuan = {"slider_type": 0, "urls": ['slider-dazhuan.png']}
# nus = {"slider_type": 0, "urls": ['slider-nus.png']}
# yinglian = {"slider_type": 0, "urls": ['slider-sgyinglian.png']}
# shuanglin = {"slider_type": 0, "urls": ['slider-shuanglin.jpg']}
# xinzhou = {"slider_type": 0, "urls": ['slider-xinzhou.png']}
# mingsheng = {
#     "slider_type": 1,
#     "urls_dict": {
#         "slider-mingshengguji-1.jpg": ["濱海灣", "魚尾獅", "1965-2019"],
#         "slider-shuanglin.jpg": ["184 Jalan Toa Payoh, Singapore 319944", "雙林寺"],
#     }
# }
# fengsu = {
#     "slider_type": 1,
#     "urls_dict": {
#         "slider-yusheng.jpeg": ["南洋文化傳統", "撈魚生", "2019"],
#         "slider-durian.jpg": ["南洋水果文化", "榴蓮"],
#     }
# }
# yuyan = {
#     "slider_type": 1,
#     "urls_dict": {
#         "slider-nanyang.jpg": ["百花齐放", "南洋方言", "2019"],
#         "slider-minnan.png": ["Hokkien", "闽南语"],
#     }
def __init__(self):
    """Point the spider at jb51's funny-GIF listing and set up DB access."""
    start_url = "http://sc.jb51.net/gaoxiaotupian/"
    self.pageUrl = start_url
    self.domain = 'http://sc.jb51.net'
    # The site serves gb2312-encoded pages.
    BaseGifSpider.__init__(self, start_url, charset='gb2312')
    self.dbHelper = DBHelper()
class ScJbSpider(BaseGifSpider):
    # Python 2 spider: crawls sc.jb51.net's funny-GIF section, extracting
    # GIF URLs and their caption texts, and persists them via DBHelper.

    def __init__(self):
        self.pageUrl = "http://sc.jb51.net/gaoxiaotupian/"
        self.domain = 'http://sc.jb51.net'
        # The site serves gb2312-encoded pages.
        BaseGifSpider.__init__(self, self.pageUrl, charset='gb2312')
        self.dbHelper = DBHelper()

    def startRequest(self):
        # Full crawl: listing pages -> category entries -> paginated child
        # pages -> per-page GIF extraction.
        request_list = self.getTotalPageList()
        for item in request_list:
            list = self.startRequestCategoryList(item)
            for item in list:
                detail_list = self.requestChildPageList(item)
                for url in detail_list:
                    self.requestDetailPage(url)

    def startRequestCategoryList(self, url):
        # Collect absolute hrefs of every entry in the page's "all_list";
        # returns None implicitly if the page could not be fetched.
        tree = super(ScJbSpider, self).request(url)
        if tree is not None:
            target_list = []
            xhs = tree.xpath('//div[@class="all_list"]/ul/li')
            if xhs:
                length = len(xhs)
                # XPath positions are 1-based, hence range(1, length + 1).
                for index in range(1, length + 1):
                    hrefXhs = tree.xpath('//div[@class="all_list"]/ul/li[%d]/a/@href' % index)
                    if hrefXhs is not None and len(hrefXhs) > 0:
                        target_list.append(self.domain + hrefXhs[0])
            return target_list

    def formatChinese(self, title):
        # Py2: encode the unicode text to a UTF-8 byte string.
        return title.encode('utf-8')

    def getTotalPageList(self):
        # Read the total page count from the pager and build the list of
        # "list_N.html" listing URLs (page 1 is the bare pageUrl).
        page_list = [self.pageUrl]
        tree = super(ScJbSpider, self).request()
        if tree is not None:
            nextPagesXhs = tree.xpath('//div[@class="pagelist"]//span[@class="pageinfo"]/strong[1]/text()')
            if nextPagesXhs:
                pageInfo = self.formatChinese(nextPagesXhs[0])
                prefix = self.pageUrl + 'list_%s.html'
                if re.match("\d+", pageInfo):
                    for index in range(2, 1 + int(pageInfo)):
                        url = prefix % (str(index))
                        page_list.append(url)
        return page_list

    def requestDetailPage(self, page_url):
        # Extract GIF URLs and caption texts from a detail page and save
        # each (url, title) pair whose GIF actually resolves.
        tree = super(ScJbSpider, self).request(page_url)
        if tree is not None:
            xhs = tree.xpath('//div[@class="content-c2"]')
            if xhs:
                # Select img src under every <p> that has exactly one child.
                srcXhs = tree.xpath('//div[@class="content-c2"]//p[count(*)=1]/img/@src')
                #titleXhs = tree.xpath('//div[@class="content-c2"]//p[not(self::text()[not(normalize-space())])]/text()')
                titleXhs = tree.xpath('//div[@class="content-c2"]//p[string-length(text()) > 0]/text()')
                title_list = []
                if titleXhs is not None and len(titleXhs):
                    for i in range(0,len(titleXhs)):
                        # NOTE(review): string.maketrans requires equal-length
                        # arguments; "\n\t\r" vs " " would raise ValueError --
                        # verify the original replacement string was 3 spaces.
                        title =self.formatChinese(titleXhs[i]).translate(string.maketrans("\n\t\r", " ")).strip()
                        if len(title) > 0:
                            title_list.append(title)
                if srcXhs is not None and len(srcXhs) > 0:
                    if len(title_list) > 0:
                        titleLength = len(title_list)
                        for index in range(0, len(srcXhs)):
                            href = srcXhs[index]
                            if not href.startswith("http:"):
                                href = self.domain + href
                            if href.endswith('.gif'):
                                # Pair each GIF with the caption at the same
                                # index; skip GIFs beyond the caption count.
                                if index < titleLength:
                                    if self.isGifUrlAvailable(href):
                                        item = GifItem()
                                        item.gif_url = href
                                        item.gif_title = title_list[index]
                                        self.dbHelper.saveGifItem(item)

    def isGifUrlAvailable(self,gifUrl):
        # Delegate URL liveness checking to the base spider.
        return super(ScJbSpider,self).isGifUrlAvailable(gifUrl)

    def requestChildPageList(self, page_url):
        # Expand a detail page into its pagination siblings
        # ("..._2.htm", "..._3.htm", ...) based on the pager's page count.
        child_list = [page_url]
        tree = super(ScJbSpider, self).request(page_url)
        if tree is not None:
            nextPagesXhs = tree.xpath('//div[@class="content-c2"]/div/ul[1]/li[1]/a/text()')
            if nextPagesXhs:
                url = self.formatChinese(nextPagesXhs[0])
                regex = re.compile(r'\S+(\d+)\S+')
                result = regex.findall(url)
                if result:
                    pageCount = int(result[0])
                    url = page_url.split('.htm')[0]
                    for index in range(2, 1 + pageCount):
                        full_url = url + '_' + str(index) + '.htm'
                        child_list.append(full_url)
                        print full_url
        return child_list
def __init__(self):
    """Point the spider at gaoxiaogif.com's full listing and set up DB access."""
    start_url = "http://www.gaoxiaogif.com/all/"
    self.pageUrl = start_url
    self.domain = 'http://www.gaoxiaogif.com'
    # The site serves gb2312-encoded pages.
    BaseGifSpider.__init__(self, start_url, charset='gb2312')
    self.dbHelper = DBHelper()
class GaoXiaoGifSpider(BaseGifSpider):
    # Python 2 spider: crawls gaoxiaogif.com's "/all/" listing, follows
    # each detail page and saves every resolvable GIF with its title.

    def __init__(self):
        self.pageUrl = "http://www.gaoxiaogif.com/all/"
        self.domain = 'http://www.gaoxiaogif.com'
        # The site serves gb2312-encoded pages.
        BaseGifSpider.__init__(self, self.pageUrl, charset='gb2312')
        self.dbHelper = DBHelper()

    def getTotalPageList(self):
        # Derive the page count from the last pager link ("index_N.html")
        # and build the full list of listing URLs (page 1 is pageUrl).
        page_list = [self.pageUrl]
        tree = super(GaoXiaoGifSpider, self).request()
        hrefXhs = tree.xpath('//div[@class="page"]/a/@href')
        if hrefXhs is not None and len(hrefXhs) > 0:
            lasthref = hrefXhs[len(hrefXhs) - 1]
            if lasthref:
                array = lasthref.split("_")
                if len(array) > 1:
                    pageInfo = array[1].split(".")[0]
                    count = int(pageInfo)
                    url_template = 'http://www.gaoxiaogif.com/all/index_%d.html'
                    for index in range(2, 1 + count):
                        url = url_template % (index)
                        page_list.append(url)
        return page_list

    def parseCategoryPage(self, url):
        # Collect absolute .html detail-page URLs from a listing page.
        tree = super(GaoXiaoGifSpider, self).request(url)
        detail_page_url_list = []
        if tree is not None:
            xhs = tree.xpath('//div[@class="likepage"]/ul/li')
            if xhs is not None and len(xhs) > 0:
                # Starts at 2: presumably li[1] is a non-entry element -- TODO confirm.
                for index in range(2, 1 + len(xhs)):
                    srcXhs = tree.xpath(
                        '//div[@class="likepage"]/ul/li[%d]//a[@title]/@href' % index)
                    if srcXhs is not None and len(srcXhs) > 0:
                        href = srcXhs[0]
                        if href.endswith('.html'):
                            url = href
                            if not href.startswith('http:'):
                                url = self.domain + href
                            detail_page_url_list.append(url)
        return detail_page_url_list

    def parseGifDetailPage(self, url):
        # Save every resolvable .gif on the detail page under the page title.
        tree = super(GaoXiaoGifSpider, self).request(url)
        titleXhs = tree.xpath('//div[@class="listgif-title"]//h1/text()')
        if titleXhs is not None and len(titleXhs) > 0:
            title = self.formatChinese(titleXhs[0])
            srcXhs = tree.xpath(
                '//div[@class="listgif-giftu content_pic"]//img/@src')
            if srcXhs is not None and len(srcXhs) > 0:
                for index in range(0, len(srcXhs)):
                    gif_url = srcXhs[index]
                    if not gif_url.startswith("http:"):
                        gif_url = self.domain + gif_url
                    if gif_url.endswith('.gif'):
                        item = GifItem()
                        if self.isGifUrlAvailable(gif_url):
                            item.gif_url = gif_url
                            item.gif_title = title
                            self.dbHelper.saveGifItem(item)

    def isGifUrlAvailable(self, gifUrl):
        # Delegate URL liveness checking to the base spider.
        return super(GaoXiaoGifSpider, self).isGifUrlAvailable(gifUrl)

    def formatChinese(self, title):
        # Py2: encode the unicode text to a UTF-8 byte string.
        return title.encode('utf-8')

    def startRequest(self, reverse=False):
        # Full crawl; reverse=True walks listings and details back-to-front.
        category_list = self.getTotalPageList()
        if reverse:
            # Crawl in reverse order.
            for i in range(len(category_list)):
                item = category_list[len(category_list) - i - 1]
                detail_urls = self.parseCategoryPage(item)
                for index in range(len(detail_urls)):
                    url = detail_urls[len(detail_urls) - index - 1]
                    self.parseGifDetailPage(url)
        else:
            for item in category_list:
                detail_urls = self.parseCategoryPage(item)
                # Throttle requests to avoid hammering the site.
                time.sleep(10)
                for url in detail_urls:
                    time.sleep(10)
                    self.parseGifDetailPage(url)