def fetchTextData(self, url, channel): try: soup = self.fetchUrl(url) datalist = soup.findAll("tr", {"class": "tr3 t_one"}) objs = [] sortType = dateutil.y_m_d() print url, len(datalist) for item in datalist: ahref = item.first("a") if ahref != None and item.first("h3") != None: try: if ahref.get('href').count("read-htm") > 0: continue obj = {} obj['fileDate'] = '' name = item.first("h3").text obj['name'] = name.replace("'", "") print name obj['url'] = ahref.get('href') obj['baseurl'] = baseurl obj['channel'] = channel obj['updateTime'] = datetime.datetime.now() # self.t_queue.put(TextItemContentParse(ahref.get('href'))) ret = self.fetchText(ahref.get('href')) if ret == None: print '没有文章数据', ahref.get('href') continue obj['sortType'] = sortType objs.append(obj) except Exception as e: print common.format_exception(e) return objs except Exception as e: print common.format_exception(e)
def fetchTextData(self, url, channel): try: soup = self.fetchUrl(url) div = soup.first("div", {"class": "box list channel"}) if div == None: print '没有数据', url return [] datalist = div.findAll("li") objs = [] sortType = dateutil.y_m_d() for item in datalist: ahref = item.first("a") if ahref != None: try: obj = {} obj['fileDate'] = ahref.first('span').text obj['name'] = ahref.text.replace(obj['fileDate'], '') print name obj['url'] = ahref.get('href') obj['baseurl'] = baseurl obj['channel'] = channel obj['updateTime'] = datetime.datetime.now() # self.t_queue.put(TextItemContentParse(ahref.get('href'))) ret = self.fetchText(ahref.get('href')) if ret == None: print '没有文章数据', ahref.get('href') continue obj['sortType'] = sortType objs.append(obj) except Exception as e: print common.format_exception(e) return objs except Exception as e: print common.format_exception(e)
def run(self): dbVPN = db.DbVPN() ops = db_ops.DbOps(dbVPN) for i in range(1, maxImgPage): objs = self.fetchGirlChannelData(i) print "解析 Girl channel图片ok----channel=", self.t_obj[ 'url'], ' size=', len(objs) for obj in objs: try: sortType = dateutil.y_m_d() obj['sortType'] = sortType ops.inertImgItems(obj) print 'items :', obj['url'], obj[ 'channel'], " piclen=", len(obj['picList']) for picItem in obj['picList']: item = {} item['itemUrl'] = obj['url'] item['picUrl'] = picItem item['origUrl'] = picItem ops.inertImgItems_item(item) print 'items_item :', obj dbVPN.commit() except Exception as e: print common.format_exception(e) dbVPN.commit() dbVPN.close()
def fetchImgItemsData(self, url, channel): try: lis = self.fetchDataHead(url) print url, ";itemsLen=", len(lis) objs = [] sortType = dateutil.y_m_d() for item in lis: obj = {} obj['name'] = item.first("div",{"class":"float-left"}).text print obj['name'] obj['url'] = item.get('href') obj['fileDate'] = item.first("div",{"class":"float-right"}).text obj['baseurl'] = baseurlImg obj['channel'] = channel obj['updateTime'] = datetime.datetime.now() pics = self.fetchImgs(item.get('href')) if len(pics) == 0: print '没有 图片文件--', item, '---', url continue obj['picList'] = pics obj['pics'] = len(pics) obj['sortType'] = sortType obj['showType'] = 3 print 'url=', obj['url'], 'filedate=', obj['fileDate'], ' 图片数量=', len(pics) objs.append(obj) return objs except Exception as e: print common.format_exception(e)
def fetchImgItemsData(self, url, channel): soup = self.fetchUrl(baseurl6,url) datalist = soup.findAll("li",{"class":"yun1 yun-large1 border-gray"}) objs = [] sortType = dateutil.y_m_d() for item in datalist: ahref = item.first("a") if ahref!=None: try: obj = {} obj['fileDate'] = "" name = ahref.first("p").text obj['name'] = name obj['url'] = ahref.get('href') obj['baseurl'] = baseurl6 obj['channel'] = channel obj['updateTime'] = datetime.datetime.now() pics = self.fetchImgs(obj['url']) if len(pics) == 0: print '没有 图片文件--', obj['url'], '---', url continue obj['picList'] = pics obj['showType'] = 3 obj['pics'] = len(pics) obj['sortType'] = sortType print name,pics[0],' url=', obj['url'], ' 图片数量=', len(pics) objs.append(obj) except Exception as e: print common.format_exception(e) return objs
def fetchImgItemsData(self, url, channel): objs = [] try: lis = self.fetchDataHead(url) sortType = dateutil.y_m_d() for item in lis: ahref = item.first("a") if ahref != None: obj = {} name = item.first("span").text obj['name'] = name print name aurl = ahref.get("href") if aurl.count("http") == 0: aurl = '/' + aurl else: aurl = aurl.replace(baseurl, '') obj['url'] = aurl obj['baseurl'] = baseurl obj['channel'] = channel obj['updateTime'] = item.first("b", {"class": "b1"}).text pics = self.fetchImgs(baseurl + obj['url']) if len(pics) == 0: print '没有 图片文件--', obj['url'], '---', url continue obj['picList'] = pics obj['showType'] = 3 obj['pics'] = len(pics) obj['sortType'] = sortType print 'url=', obj['url'], ' 图片数量=', len(pics) objs.append(obj) return objs except Exception as e: print common.format_exception(e) return objs
def fetchTextData(self, url, channel): try: soup = self.fetchUrl(url) datalist = soup.findAll("li",{"class":"col-xs-12 clearfix news-box"}) objs = [] sortType = dateutil.y_m_d() for item in datalist: ahref = item.first("a") if ahref!=None: try: obj = {} obj['fileDate'] = '' name = ahref.get("title") obj['name'] = name print name obj['url'] = ahref.get('href') obj['baseurl'] = baseurl obj['channel'] = channel obj['updateTime'] = datetime.datetime.now() ret = self.fetchText(ahref.get('href')) if ret==None: print '没有文章数据',ahref.get('href') continue obj['sortType'] = sortType objs.append(obj) except Exception as e: print common.format_exception(e) return objs except Exception as e: print common.format_exception(e)
def fetchTextData(self, url, channel): try: soup = self.fetchUrl(url) div = soup.first("div", {"class": "text-list-html"}) if div == None: print '没有数据', url return [] datalist = div.findAll("li") objs = [] sortType = dateutil.y_m_d() for item in datalist: ahrefs = item.findAll("a") for ahref in ahrefs: obj = {} span = ahref.first('span') if span != None: obj['fileDate'] = span.text else: obj['fileDate'] = '' name = ahref.text.replace(obj['fileDate'], '') obj['name'] = name print name obj['url'] = ahref.get('href') obj['baseurl'] = baseurl obj['channel'] = channel obj['updateTime'] = datetime.datetime.now() self.t_queue.put(TextItemContentParse(ahref.get('href'))) obj['sortType'] = sortType objs.append(obj) return objs except Exception as e: print common.format_exception(e)
def fetchImgItemsData(self, url, channel): soup = self.fetchUrl(baseurl10, url) div = soup.findAll( "li", {"class": "col-md-14 col-sm-16 col-xs-12 clearfix news-box"}) objs = [] sortType = dateutil.y_m_d() for item in div: ahref = item.first("a") if ahref != None: try: obj = {} obj['fileDate'] = '' obj['name'] = ahref.get("title") obj['url'] = ahref.get('href') obj['baseurl'] = baseurl10 obj['channel'] = channel obj['updateTime'] = datetime.datetime.now() pics = self.fetchImgs(obj['url']) if len(pics) == 0: print '没有 图片文件--', obj['url'], '---', url continue obj['picList'] = pics obj['showType'] = 3 obj['pics'] = len(pics) obj['sortType'] = sortType print name, pics[0], ' url=', obj['url'], ' 图片数量=', len( pics) objs.append(obj) except Exception as e: print common.format_exception(e) return objs
def fetchImgItemsData(self, url, channel): objs = [] try: divs = self.fetchDataHead(url) sortType = dateutil.y_m_d() for item in divs: imgDiv = item.first("div", {"class": "media-image"}) if imgDiv != None: obj = {} name = item.first("div", { "class": "block-layer block-inner" }).first("a").text obj['name'] = name obj['url'] = imgDiv.first("a").get("href") obj['baseurl'] = baseurl obj['channel'] = channel obj['updateTime'] = datetime.datetime.now() pics = self.fetchImgs(obj['url']) if len(pics) == 0: print '没有 图片文件--', obj['url'], '---', url continue obj['picList'] = pics obj['pics'] = len(pics) obj['sortType'] = sortType obj['showType'] = 3 print 'url=', obj['url'], ' 图片数量=', len(pics) objs.append(obj) return objs except Exception as e: print common.format_exception(e) return objs
def fetchImgGrilChannel(self, url): soup = self.fetchUrl(url) objs = [] table = soup.find('div', {"class": "box movie_list"}) if table == None: print '没有 channel:', url return None aList = table.findAll('a') for item in aList: obj = {} obj['url'] = item.get('href') obj['baseurl'] = baseurl img = item.find('img') if img != None: obj['pic'] = img.get('data-original') else: obj['pic'] = None obj['updateTime'] = dateutil.y_m_d() obj['rate'] = 1.4 obj['showType'] = 3 obj['channel'] = 'porn_sex' obj['channelType'] = 'porn_sex' obj['name'] = self.fetchImgGrilChannelName(item.get('href')) print obj objs.append(obj) return objs
def fetchFileData(self, url, channel): try: soup = self.fetchUrl(url) data = soup.first("div", {"class": "text-list-html"}) objs = [] sortType = dateutil.y_m_d() if data!=None: item = data.first("ul") if item!=None: ahrefs = item.findAll("a") for ahref in ahrefs: obj = {} span = ahref.first('span') if span != None: obj['fileDate'] = span.text else: obj['fileDate'] = '' name = ahref.get("title") obj['name'] = name obj['url'] = ahref.get('href') obj['baseurl'] = baseurl obj['channel'] = channel obj['updateTime'] = datetime.datetime.now() obj['sortType'] = sortType mp3 = self.fetchMp3(ahref.get('href')) if mp3 == None: print '没有mp3文件--', ahref, '---', url continue print name,mp3 obj['file'] = mp3 objs.append(obj) return objs except Exception as e: print common.format_exception(e)
def fetchImgItemsData(self, url, channel): try: lis = self.fetchDataHead(url) objs = [] sortType = dateutil.y_m_d() for item in lis: ahrefs = item.findAll("a") for ahref in ahrefs: obj = {} span = ahref.first('span') if span != None: obj['fileDate'] = span.text else: obj['fileDate'] = '' name = ahref.text.replace(obj['fileDate'], '') obj['name'] = name print name obj['url'] = ahref.get('href') obj['baseurl'] = baseurl obj['channel'] = channel obj['updateTime'] = datetime.datetime.now() pics = self.fetchImgs(ahref.get('href')) if len(pics) == 0: print '没有 图片文件--', ahref, '---', url continue obj['picList'] = pics obj['pics'] = len(pics) obj['sortType'] = sortType obj['showType'] = 3 print 'url=', obj['url'], ' 图片数量=', len(pics) objs.append(obj) return objs except Exception as e: print common.format_exception(e)
def fetchImgItemsData(self, url, channel): soup = self.fetchUrl(url) div = soup.first("div", {"class": "list_art"}) if div == None: print '没有数据', url return [] datalist = div.findAll("li") objs = [] sortType = dateutil.y_m_d() for item in datalist: ahref = item.first("a") if ahref != None: try: obj = {} name = ahref.text obj['name'] = name print name obj['url'] = ahref.get('href') obj['baseurl'] = baseurl obj['channel'] = channel obj['updateTime'] = datetime.datetime.now() obj['fileDate'] = item.first('span').text pics = self.fetchImgs(obj['url']) if len(pics) == 0: print '没有 图片文件--', obj['url'], '---', url continue obj['picList'] = pics obj['showType'] = 3 obj['pics'] = len(pics) obj['sortType'] = sortType print 'url=', obj['url'], ' 图片数量=', len(pics) objs.append(obj) except Exception as e: print common.format_exception(e) return objs
def fetchImgItemsData(self, url, channel): objs = [] try: soup = self.fetchUrl(url) sortType = dateutil.y_m_d() div = soup.first('div', {'class': 'box list channel'}) if div != None: lis = div.findAll('li') for item in lis: ahref = item.first("a") if ahref != None: obj = {} udate = ahref.first('span').text name = ahref.text.replace(udate, '') obj['name'] = name obj['url'] = ahref.get("href") obj['baseurl'] = baseurl obj['channel'] = channel obj['updateTime'] = datetime.datetime.now() pics = self.fetchImgs(obj['url']) if len(pics) == 0: print '没有 图片文件--', obj['url'], '---', url continue obj['picList'] = pics obj['pics'] = len(pics) obj['sortType'] = sortType obj['showType'] = 3 print 'url=', obj['url'], ' 图片数量=', len(pics) objs.append(obj) return objs except Exception as e: print common.format_exception(e) return objs
def fetchImgItemsData(self, url, channel): try: lis = self.fetchDataHead(url) print url, ";itemsLen=", len(lis) objs = [] sortType = dateutil.y_m_d() for li in lis: ahref = li.first("a") if ahref != None: obj = {} obj['name'] = ahref.get("title") print obj['name'] obj['url'] = ahref.get('href') obj['fileDate'] = ahref.first("span").text obj['baseurl'] = baseurl obj['channel'] = channel obj['updateTime'] = datetime.datetime.now() pics = self.fetchImgs(ahref.get('href')) if len(pics) == 0: print '没有 图片文件--', ahref, '---', url continue obj['picList'] = pics obj['pics'] = len(pics) obj['sortType'] = sortType obj['showType'] = 3 print 'url=', obj['url'], 'filedate=', obj[ 'fileDate'], ' 图片数量=', len(pics) objs.append(obj) return objs except Exception as e: print common.format_exception(e)
def fetchImgItemsData(self, url, channel): soup = self.fetchUrl(baseurl7, url) datalist = soup.findAll("div", {"class": "x3 margin-top"}) objs = [] sortType = dateutil.y_m_d() for item in datalist: ahref = item.first("a") if ahref != None: try: obj = {} obj['fileDate'] = item.first("span", { "class": "icon-heart text-small text-gray float-right" }).text name = item.first("img").get("alt") obj['name'] = name obj['url'] = ahref.get('href') obj['baseurl'] = baseurl7 obj['channel'] = channel obj['updateTime'] = datetime.datetime.now() pics = self.fetchImgs(obj['url']) if len(pics) == 0: print '没有 图片文件--', obj['url'], '---', url continue obj['picList'] = pics obj['showType'] = 3 obj['pics'] = len(pics) obj['sortType'] = sortType print name, pics[0], ' url=', obj['url'], ' 图片数量=', len( pics) objs.append(obj) except Exception as e: print common.format_exception(e) return objs
def fix3(): dbVPN = db.DbVPN() ops = db_ops.DbOps(dbVPN) sortType = dateutil.y_m_d() items = ops.getImgItems_itemBySortType(sortType) dbVPN.close() for obj in items: ext = os.path.splitext(obj['picUrl'])[1] out = fileOrige + str(obj['id']) + ext path = fileCompress + str(obj['id']) + ext os.system("wget -O %s %s " % (out, obj['picUrl'])) os.system("mogrify -resize 80%x80% " + out) os.system("convert -resize 50%x50% " + out + ' ' + path) print 'sync imgok url=', obj['picUrl']
def run(self): channels = self.parseChannel() dbVPN = db.DbVPN() ops = db_ops.DbOps(dbVPN) sortType = dateutil.y_m_d() for obj in channels: channel = obj['url'] url = obj['baseurl'] ops.inertImgChannel(obj) dbVPN.commit() imgitem = {} imgitem['name'] = '搞笑gif动态' imgitem['url'] = 'hugao8.com/category/gao-gif/' imgitem['baseurl'] = baseurl2 imgitem['channel'] = channel imgitem['updateTime'] = datetime.datetime.now() imgitem['fileDate'] = '' imgitem['showType'] = 3 imgitem['sortType'] = sortType pics = [] for i in range(1, maxImgPage): if i != 1: url = "%s%s%s%s" % (obj['baseurl'], "page/", i, "/") imgs = self.fetchImgs(url) print len(imgs), url pics.extend(imgs) if len(imgs) == 0: break print(i % 2) if i % 2 == 0: imgitem['picList'] = pics imgitem['pics'] = len(pics) imgitem['pic'] = pics[0] imgitem['url'] = '%s%s' % ('hugao8.com/category/gao-gif/', i) ops.inertImgItems(imgitem) dbVPN.commit() print '一次提交', imgitem['url'], len(pics) try: for picItem in imgitem['picList']: item = {} item['itemUrl'] = imgitem['url'] item['picUrl'] = picItem ops.inertImgItems_item(item) dbVPN.commit() except Exception as e: print common.format_exception(e) pics = [] dbVPN.commit()
def run(self): channels = self.parseChannel() dbVPN = db.DbVPN() ops = db_ops.DbOps(dbVPN) sortType = dateutil.y_m_d() for obj in channels: channel = obj['url'] url = obj['baseurl'] ops.inertImgChannel(obj) dbVPN.commit() imgitem = {} imgitem['name'] = 'gif动态' imgitem['url'] = 'forum-47-1.html' imgitem['baseurl'] = baseurl5 imgitem['channel'] = channel imgitem['updateTime'] = datetime.datetime.now() imgitem['fileDate'] = '' imgitem['showType'] = 3 imgitem['sortType'] = sortType pics = [] for i in range(1, maxImgPage): url = "%s%s%s" % (obj['baseurl'].replace("1.html", ''), i, ".html") imgs = self.fetchImgs(url) print len(imgs), url pics.extend(imgs) if i % 5 == 0: imgitem['picList'] = pics imgitem['pics'] = len(pics) imgitem['pic'] = pics[0] imgitem['url'] = '%s%s' % ('xng666.com/a/gif/', i) ops.inertImgItems(imgitem) dbVPN.commit() print '一次提交', imgitem['url'], len(pics) try: for picItem in imgitem['picList']: item = {} item['itemUrl'] = imgitem['url'] item['picUrl'] = picItem ops.inertImgItems_item(item) dbVPN.commit() except Exception as e: print common.format_exception(e) pics = [] dbVPN.commit()
def run(self): dbVPN = db.DbVPN() ops = db_ops.DbOps(dbVPN) ops.inertImgChannel(self.t_obj) dbVPN.commit() # 有分页 sortType = dateutil.y_m_d() # channel = self.t_obj['url'] # channel = urlparse(self.t_obj['baseurl']).netloc for name, url in img_channels.items(): obj = {} obj['name'] = name obj['channel'] = self.t_obj['url'] obj['updateTime'] = datetime.datetime.now() obj['fileDate'] = '' obj['baseurl'] = baseurl obj['showType'] = 3 # obj['url'] = url.replace("&", "") obj['url'] = urlparse(self.t_obj['baseurl']).path print obj['url'] # obj['pics'] = len(pics) obj['sortType'] = sortType pics = [] for i in range(1, 3): url = url + str(i) alist = self.fetchDataHead(url) print '解析', i, "页--", len(alist) for item in alist: pic = self.fetchImgItemData(item.get("href")) if pic == None: continue pics.append(pic) obj['picList'] = pics obj['pics'] = len(pics) ops.inertImgItems(obj) for picItem in obj['picList']: item = {} item['itemUrl'] = obj['url'] item['picUrl'] = picItem ops.inertImgItems_item(item) dbVPN.commit()
def fetchImgItemsData(self, url, channel): try: trs = self.fetchDataHead(url) print url, ";itemsLen=", len(trs) objs = [] sortType = dateutil.y_m_d() for item in trs: ahrefs = item.findAll("a") if ahrefs == None: continue for ahref in ahrefs: match = img_channel_title.search(ahref.text) if match == None: continue obj = {} match = img_channel_date.search(ahref.text) if match != None: obj['fileDate'] = match.group(0) else: obj['fileDate'] = '' name = ahref.text.replace(obj['fileDate'], '') obj['name'] = name obj['url'] = ahref.get('href') obj['baseurl'] = baseurl obj['channel'] = channel obj['updateTime'] = datetime.datetime.now() pics = self.fetchImgs(ahref.get('href')) if len(pics) == 0: print '没有 图片文件--', ahref, '---', url continue obj['picList'] = pics obj['showType'] = 3 obj['pics'] = len(pics) obj['sortType'] = sortType obj['showType'] = 3 print 'url=', obj['url'], 'filedate=', obj[ 'fileDate'], ' 图片数量=', len(pics) objs.append(obj) return objs except Exception as e: print common.format_exception(e)
def run(self): try: dbVPN = db.DbVPN() ops = db_ops.DbOps(dbVPN) sortType = dateutil.y_m_d() # sortType = "2017-07-12" for i in range(0, 20000): # ret = ops.getTextChannelItems(self.t_item["url"], i) ret = ops.getTextChannelItemsById(i, sortType) if len(ret) == 0: print '写入完毕' break print '开始写入 channel :', self.t_item["url"], cloase = False for item in ret: # path = filePATH + str(item['id']) + ".txt" # if os.path.exists(path) == False: # output = open(path, 'w') # output.write(item['file']) # output.close() # print '写完文件:' + path # path = filePATHWeb + str(item['id']) + ".txt" # if os.path.exists(path) == False: # output = open(path, 'w') # output.write(html_parse.filter_tags(item['file'])) # output.close() # print '写完文件:' + path path = filePATHHtml + str(item['id']) + ".html" # if os.path.exists(path) == False: output = open(path, 'w') output.write( html_parse.txtToHtml( html_parse.filter_tags(item['file']))) output.close() print '写完文件:' + path print '写完页', i print 'channel :', self.t_item["url"], '同步完成 len=', len(ret) dbVPN.close() except Exception as e: print common.format_exception(e)
def fetchgirlChannelItemsOne(self, item): obj = {} obj['url'] = item.get("href") strName = item.text.replace("[if lt IE 9 ]>", "").replace("<![endif]", "") obj['name'] = html_parse.filter_tags(strName) span = item.first('span') if span != None: obj['fileDate'] = html_parse.filter_tags( span.text.replace("[if lt IE 9 ]>", "").replace("<![endif]", "")) obj['name'] = obj['name'].replace(obj['fileDate'], '') else: obj['fileDate'] = '' obj['channel'] = self.t_obj['url'].replace("/?m=", '') obj['updateTime'] = dateutil.y_m_d() obj['baseurl'] = baseurl pics = self.fetchImgs(item.get("href")) obj['pics'] = len(pics) obj['picList'] = pics obj['showType'] = 3 print obj['url'], '解析完毕', obj['channel'], len(pics), obj['name'] return obj
def fetchTextData(self, url, channel): try: soup = self.fetchUrl(baseurl+url) div = soup.first("div", {"class": "novelList"}) if div == None: print '没有数据', url return [] datalist = div.findAll("a") objs = [] sortType = dateutil.y_m_d() for item in datalist: try: obj = {} span = item.first('div',{"class":"pull-right date "}) if span != None: obj['fileDate'] = span.text else: obj['fileDate'] = '' name = item.first("div",{"class":"pull-left"}).text obj['name'] = name.replace("【完】","") print name obj['url'] = item.get('href') obj['baseurl'] = baseurl obj['channel'] = channel obj['updateTime'] = datetime.datetime.now() ret = self.fetchText(item.get('href')) if ret==None: print '没有文章数据',item.get('href') continue obj['sortType'] = sortType objs.append(obj) except Exception as e: print common.format_exception(e) return objs except Exception as e: print common.format_exception(e)