def extract_data(text):
    global total_data
    pq = PyQuery(text)
    data = pq.find('p.data').text()
    total_data = total_data + data
    nextState = pq.find('.nextState').attr('value')
    return nextState
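# A minimal usage sketch of extract_data, assuming the module-level
# total_data accumulator it appends to (the markup below is fabricated).
from pyquery import PyQuery

total_data = ''

_sample = '<div><p class="data">hello</p><input class="nextState" value="s2"/></div>'
print(extract_data(_sample))  # 's2'
print(total_data)             # 'hello'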
def detail_page(self, response):
    t = response.text.replace(' ', '')
    d = PyQuery(t)
    base = response.save
    base_url = response.url
    fenbu = dict(map(
        lambda x: (x.find('.field-righttit').text(), x.find('ul').text()),
        list(d.find(".right-border div").items())
    ))
    basic_info = dict(map(
        lambda x: (x.text().replace(u':', "").strip(),
                   x.parent().text().replace(x.text(), "").strip()),
        list(d.find('.fc-gray').items())
    ))
    other_info = dict(map(
        lambda x: (x.text().replace(u':', ''), x.next().text()),
        list(d.find('.xiaoqu-otherinfo dt').items())
    ))
    info_temp = {
        'base': base,
        'sell_rent_info': fenbu,
        'basic_info': basic_info,
        'other_info': other_info
    }
    url = base_url + 'amenities/'
    self.crawl(url, callback=self.amenities_page, save=info_temp, retries=100)
    return [
        2,
        response.url,
        json.dumps(info_temp),
        time.strftime('%Y-%m-%d %X', time.localtime())
    ]
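# A sketch of the dict(map(...)) pattern detail_page uses for fenbu,
# run against a fabricated .right-border fragment:
from pyquery import PyQuery

_frag = PyQuery(
    '<body><div class="right-border">'
    '<div><span class="field-righttit">sale</span><ul>12</ul></div>'
    '<div><span class="field-righttit">rent</span><ul>3</ul></div>'
    '</div></body>')
_fenbu = dict(map(
    lambda x: (x.find('.field-righttit').text(), x.find('ul').text()),
    list(_frag.find('.right-border div').items())))
print(_fenbu)  # {'sale': '12', 'rent': '3'}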
def __getPageAllLink(self, p):
    if self.kind == "1" or self.kind == "2":
        lis = PyQuery(p)("div.house")
    else:
        lis = PyQuery(p)("div.qiuzu li")
    links = []
    for li in lis:
        if self.kind == "2" or self.kind == "1":
            tm = PyQuery(li)("p.time").text()
            tm = tm and tm.replace("个人", "") or ""
            link = self.baseurl + PyQuery(li)("p.housetitle a").attr("href")
        else:
            tm = PyQuery(li)("span.li5").text()
            link = self.baseurl + PyQuery(li)("span.li2 a").attr("href")
        if self.kind == "4":
            if PyQuery(li)("span.li1").text() == "合租 ":
                continue
        if u"天" in tm:
            s = tm.find(u"天")
            tm = tm[:s]
            if int(tm) < 8:
                links.append(link)
            else:
                break
        elif u"小时" in tm:
            links.append(link)
        elif u"分钟" in tm:
            links.append(link)
        else:
            continue
        if 1:  # not checkPath(homepath, self.folder, link):
            LinkLog.info("%s|%s" % (self.kind, link))
            try:
                getContent(link, self.citycode, self.kind)
            except Exception, e:
                print "ganji getContent Exception %s" % e
            time.sleep(int(self.st))
    if self.kind == "1" or self.kind == "2":
        return len(links) == 30
    else:
        return len(links) == 35
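# The time filter above keeps listings posted within "N天" only when N < 8,
# and always keeps "小时"/"分钟" entries; the same rule as a standalone
# predicate (a sketch mirroring that logic, not part of the original source):
def is_recent(tm):
    if u"天" in tm:
        return int(tm[:tm.find(u"天")]) < 8
    return u"小时" in tm or u"分钟" in tm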
def parse_html_page(self):
    pq = PyQuery(self.html_page)
    main_table = pq('#mainBody > table.coltable')

    def find_row(text):
        for c in main_table.find('td:first-child').items():
            if c.text() == text:
                return c.nextAll().items().next()

    def find_row_text(text, default=''):
        row = find_row(text)
        if row:
            return row.text()
        return default

    def find_row_html(text, default=''):
        row = find_row(text)
        if row:
            return row.html()
        return default

    self.info_hash = find_row_text('Info hash')
    self.title = pq.find('#mainBody > h1').text()
    self.category, self.subcategory = find_row_text('Type').split(' - ', 1)
    self.language = find_row_text('Language')
    self.cover_url = find_row('Picture:').find('img').attr('src')
    self.small_description = find_row_html('Small Description')
    self.description = find_row_html('Description')
    self.torrent_url = find_row('Download').find('a#dlNormal').attr('href')
    size_string = find_row_text('Size')
    match = re.match(r'.* \((?P<size>\d+(,\d\d\d)*) bytes\)', size_string)
    self.torrent_size = int(match.group('size').replace(',', ''))
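# The Size row reads like "700.00 MB (734,003,200 bytes)"; a quick check of
# the byte-count extraction used in parse_html_page:
import re

_m = re.match(r'.* \((?P<size>\d+(,\d\d\d)*) bytes\)',
              '700.00 MB (734,003,200 bytes)')
print(int(_m.group('size').replace(',', '')))  # 734003200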
def onSuccess(self, tid, context, response, headers):
    resp = PyQuery(response)
    for h3 in resp.find("h3 a"):
        url = "http://dev.open.taobao.com/bbs/" + h3.attrib['href']
        print h3.text
        Spider.executeSql(
            self,
            "insert into task (task_type,url,status,http_code,task_context) "
            "values('topbbs文章',%s,0,-1,%s)",
            (url, h3.text))
    Spider.onSuccess(self, tid, context, response, headers)
def page_parse(content, url):
    d = PyQuery(content)
    shop_name = d.find('.shop-name>a').text()
    shop_years = d.find('.shop-time>em').text()
    open_time = d.find('.store-time>em').text()
    contact_person = d.find('.contactName').text()
    contact_block = d.find('.box.block.clear-block').html()
    contact_detail = re.findall(pattern_contact_info, contact_block)
    crawl_time = time.strftime('%Y-%m-%d %X', time.localtime())
    return [
        url.replace('contactinfo/', '').replace('.html', ''),
        json.dumps(dict([
            ('shop_name', shop_name),
            ('contact_url', url),
            ('shop_years', shop_years),
            ('open_time', open_time),
            ('contact_person', contact_person)
        ] + contact_detail)),
        crawl_time
    ]
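# page_parse relies on a module-level pattern_contact_info; a hypothetical
# stand-in (an assumption, not the original definition) that yields
# (key, value) pairs compatible with the dict([...] + contact_detail) merge:
import re

pattern_contact_info = re.compile(r'<dt>([^<:]+):</dt>\s*<dd>([^<]+)</dd>')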
def _parse(self, response):
    d = PyQuery(response)
    # page turning
    __url = map(lambda x: x.attr('href'), d.find(self.__css).items())
    if config_dictionary.get(self.__url_start).get('basejoin'):
        new_url = map(lambda u: urlparse.urljoin(self.__url_base, u), __url)
    else:
        new_url = __url
    self.__url_pool = self.__url_pool.union(set(new_url))
    # IP address extracting
    rst = ':'.join(d.text().split(' '))
    proxy_list = re.findall(pattern_ip_address, rst)
    proxy_port_queue.put((proxy_list, self.__url_base))
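# _parse expects a module-level pattern_ip_address; after the ':'.join()
# normalization every "ip port" pair reads "ip:port", so a plausible
# stand-in (an assumption, not the original definition) is:
import re

pattern_ip_address = re.compile(r'((?:\d{1,3}\.){3}\d{1,3}):(\d{2,5})')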
def serializeArray(form):
    form = PyQuery(form)
    if not form.is_('form'):
        return []
    source = form.find('input, select, textarea')
    data = []
    for input in source:
        input = PyQuery(input)
        if input.is_('[disabled]') or not input.is_('[name]'):
            continue
        if input.is_('[type=checkbox]') and not input.is_('[checked]'):
            continue
        data.append((input.attr('name'), input.val()))
    return data
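# A usage sketch of serializeArray on a fabricated form: disabled inputs
# and unchecked checkboxes are skipped, mirroring jQuery's serializeArray().
from pyquery import PyQuery

_form = PyQuery(
    '<form>'
    '<input name="user" value="alice"/>'
    '<input name="off" disabled="disabled" value="x"/>'
    '<input type="checkbox" name="opt" checked="checked" value="yes"/>'
    '</form>')
print(serializeArray(_form))  # [('user', 'alice'), ('opt', 'yes')]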
def rent(self, url):
    hc = urlparse(url)[1].replace('.58.com', "")
    hc2 = citynameDict_sf.get(hc)
    if hc2:
        self.fd['house_city'] = hc2
    else:
        self.fd['house_city'] = hc
    self.fd['house_flag'] = 2
    request = urllib2.Request(url, None, self.header)
    response = urllib2.urlopen(request).read()
    if self.mayGetIt(response):
        self.fd = {}
        return
    soup = BeautifulSoup(response)
    detail_mer = soup.find('ul', {'class': 'info'})
    detail_mer_str = re.sub("\n|\t|\r|&nbsp;", "", str(detail_mer))
    # not an individual listing: return
    if re.search(self.agencyname_regex, response):
        agencyname = re.search(self.agencyname_regex, response).group(1)
        if agencyname != '个人房源':
            return
    else:
        return
    if re.search(self.username_regex, response):
        username = re.search(self.username_regex, response).group(1)
        self.fd['owner_name'] = username
    else:
        self.fd['owner_name'] = ""
    owner_phone = soup('img')
    self.fd['owner_phone_pic'] = ''
    for phone in owner_phone:
        if phone['src'].find('58.com/showphone.aspx') != -1:
            self.fd['owner_phone_pic'] = phone['src']
    # no contact info: return
    if not self.fd['owner_phone_pic']:
        return
    if soup.find('div', {"class": 'other'}):
        posttime = soup.find('div', {"class": 'other'}).contents[0]
        posttime = re.sub('\n|\r| |\t', '', posttime)
        posttime = posttime.replace('发布时间:', '').replace(' 浏览', '')
    else:
        posttime = ''
    if not posttime:
        return
    elif posttime.find('-') != -1:
        s = datetime.datetime(int(posttime.split('-')[0]),
                              int(posttime.split('-')[1]),
                              int(posttime.split('-')[2]))
        posttime = int(time.mktime(s.timetuple()))
    elif posttime.find('分钟') != -1:
        n = int(posttime.replace('分钟前', '')) * 60
        posttime = int(time.time() - n)
    elif posttime.find('小时') != -1:
        n = int(posttime.replace('小时前', '')) * 60 * 60
        posttime = int(time.time() - n)
    self.fd['house_posttime'] = posttime
    # skip posts older than 7 days
    if (time.time() - self.fd['house_posttime']) > 3600 * 24 * 7:
        return
    if re.search(self.house_floor_regex, detail_mer_str):
        house_floor = re.search(self.house_floor_regex, detail_mer_str).group(1)
        self.fd['house_floor'] = int(house_floor)
    else:
        self.fd['house_floor'] = 0
    if re.search(self.house_topfloor_regex, detail_mer_str):
        house_topfloor = re.search(self.house_topfloor_regex, detail_mer_str).group(1)
        self.fd['house_topfloor'] = int(house_topfloor)
    else:
        self.fd['house_topfloor'] = 0
    # total area in square meters
    if re.search(self.house_totalarea_regex, detail_mer_str):
        house_totalarea = re.search(self.house_totalarea_regex, detail_mer_str).group(1)
        self.fd['house_totalarea'] = int(house_totalarea)
    else:
        self.fd['house_totalarea'] = 0
    # type
    self.fd['house_type'] = housetype(detail_mer_str)
    self.fd['house_price'] = str(detail_mer.em.string)
    if re.search(self.house_room_regex, detail_mer_str):
        house_room = re.search(self.house_room_regex, detail_mer_str).group(1)
        self.fd['house_room'] = int(house_room)
    else:
        self.fd['house_room'] = 0
    if re.search(self.house_hall_regex, detail_mer_str):
        house_hall = re.search(self.house_hall_regex, detail_mer_str).group(1)
        self.fd['house_hall'] = int(house_hall)
    else:
        self.fd['house_hall'] = 0
    if re.search(self.house_toilet_regex, detail_mer_str):
        house_toilet = re.search(self.house_toilet_regex, detail_mer_str).group(1)
        self.fd['house_toilet'] = int(house_toilet)
    else:
        self.fd['house_toilet'] = 0
    if re.search(self.house_veranda_regex, response):
        house_veranda = re.search(self.house_veranda_regex, response).group(1)
        self.fd['house_veranda'] = int(house_veranda)
    else:
        self.fd['house_veranda'] = 0
    if re.search(self.house_title_regex, response):
        house_title = re.search(self.house_title_regex, response).group(1)
        self.fd['house_title'] = house_title.replace("(求购)", "").replace("(求租)", "").replace("(出售)", "")
    else:
        self.fd['house_title'] = ''
    # description
    detail_box = soup.find('div', {'class': 'maincon'})
    if detail_box:
        house_desc = str(detail_box)
        self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!", "", house_desc)
    else:
        self.fd['house_desc'] = None
    # neighborhood name
    if re.search(self.borough_name_regex, detail_mer_str):
        borough_name = re.search(self.borough_name_regex, detail_mer_str).group(1)
        try:
            self.fd['borough_name'] = re.sub("\(.*\)|<.*?>", "", borough_name)
        except:
            self.fd['borough_name'] = borough_name
    else:
        self.fd['borough_name'] = ''
    # address
    lis = PyQuery(unicode(detail_mer_str, "UTF-8"))("li")
    for li in lis:
        lit = PyQuery(li).text()
        if u"地址:" in lit:
            self.fd['house_addr'] = lit[lit.find(u":") + 1:lit.find(u"(")]
            break
    # district / section
    area = detail_mer.find(text=u"区域:")
    if area:
        area_box = area.parent.parent
        area_a = area_box('a')
        if area_a and len(area_a) > 1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = str(area_a[1].string)
        elif area_a and len(area_a) == 1:
            self.fd['house_region'] = str(area_a[0].string)
            self.fd['house_section'] = ""
        else:
            self.fd['house_region'] = ""
            self.fd['house_section'] = ""
    else:
        self.fd['cityarea'] = ""
        self.fd['section'] = ""
    if re.search(self.house_age_regex, response):
        house_age = re.search(self.house_age_regex, response).group(1)
        Y = int(time.strftime('%Y', time.localtime()))
        self.fd['house_age'] = Y - int(house_age)
    else:
        self.fd['house_age'] = 0
    # orientation, fitment, deposit
    self.fd['house_toward'] = toward(detail_mer_str)
    self.fd['house_fitment'] = fitment(detail_mer_str)
    self.fd['house_deposit'] = deposit(detail_mer_str)
    request = None
    response = None
    soup = None
    del request
    del response
    del soup
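# A standalone sketch of the post-time normalization rent() performs:
# absolute "Y-M-D" dates become epoch seconds, relative "N分钟前"/"N小时前"
# values are offsets from the current time.
import datetime
import time

def parse_posttime(posttime):
    if '-' in posttime:
        y, m, d = (int(x) for x in posttime.split('-'))
        return int(time.mktime(datetime.datetime(y, m, d).timetuple()))
    if u'分钟' in posttime:
        return int(time.time() - int(posttime.replace(u'分钟前', u'')) * 60)
    if u'小时' in posttime:
        return int(time.time() - int(posttime.replace(u'小时前', u'')) * 3600)
    return 0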
def extract_upload_errors(html):
    pq = PyQuery(html)
    result = []
    for e in pq.find('.thin > p[style="color: red; text-align: center;"]'):
        result.append(PyQuery(e).text())
    return result
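# A quick sketch exercising extract_upload_errors on a fabricated error page:
_page = '''
<div class="thin">
  <p style="color: red; text-align: center;">Torrent file too large</p>
  <p style="color: red; text-align: center;">Missing category</p>
</div>
'''
print(extract_upload_errors(_page))
# ['Torrent file too large', 'Missing category']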
class Classifier(object):
    """classify various licenses.

    >>> c = Classifier()
    >>> c.segments
    [SoftwareLicenses, DocumentationLicenses, OtherLicenses]
    >>> c.segments[0].categories
    [GPLCompatibleLicenses, GPLIncompatibleLicenses, NonFreeSoftwareLicenses]
    >>> c.segments[0].categories[0].licenses
    [GNUGPLv3, GPLv2, LGPLv3, LGPLv2.1, AGPLv3.0, ...
    """

    default_data = 'lic_check/license.html'

    def __init__(self):
        """initialize."""
        with open(self.default_data) as fobj:
            data = fobj.read()
        self.html = PyQuery(data)
        self.segments = self._parse()

    def _parse(self):
        """parse license html."""
        segments = []
        for segment in self._segments():
            segment.categories = self.categories(segment)
            for category in segment.categories:
                category.licenses = self.licenses(category)
            segments.append(segment)
        return segments

    def _segments(self):
        """segments."""
        return (Segment(i) for i in self.html.find('.big-section h3')
                .filter(lambda i: i != 0))

    def categories(self, segment=None):
        """categories.

        >>> c = Classifier()
        >>> c.categories(c.segments[0])
        [GPLCompatibleLicenses, GPLIncompatibleLicenses, NonFreeSoftware...
        >>> c.categories(c.segments[1])
        [FreeDocumentationLicenses, NonFreeDocumentationLicenses]
        >>> c.categories(c.segments[2])
        [OtherLicenses, Fonts, OpinionLicenses, Designs]
        >>> c.categories().get('SoftwareLicenses')
        [GPLCompatibleLicenses, GPLIncompatibleLicenses, NonFreeSoftware...
        >>> c.categories().get('DocumentationLicenses')
        [FreeDocumentationLicenses, NonFreeDocumentationLicenses]
        """
        if segment:
            return [Category(i, segment)
                    for i in self.__retrieve_cat_elem(segment)]
        else:
            return {'{0}'.format(_seg): self.categories(_seg)
                    for _seg in self.segments}

    def __retrieve_cat_elem(self, segment):
        return (self.html.find('.toc ul li a')
                .filter(lambda i, this: PyQuery(this)
                        .attr('href') == '#{0}'.format(segment))
                .siblings('ul').find('a'))

    def licenses(self, category=None):
        """licenses.

        >>> c = Classifier()
        >>> sw_lic = c.segments[0]
        >>> gpl_compat_lic = c.categories(sw_lic)[0]
        >>> gpl_compat_lics = c.licenses(gpl_compat_lic)
        >>> len(gpl_compat_lics)
        50
        >>> gpl_compat_lics[0]
        GNUGPLv3
        >>> gpl_compat_lics[0].category
        GPLCompatibleLicenses
        >>> gpl_compat_lics[0].segment
        SoftwareLicenses
        >>> gpl_incompat_lic = c.categories(c.segments[0])[1]
        >>> c.licenses(gpl_incompat_lic)
        [AGPLv1.0, AcademicFreeLicense, apache1.1, ...
        >>> nonfree_lic = c.categories(sw_lic)[2]
        >>> c.licenses(nonfree_lic)
        [NoLicense, Aladdin, apsl1, ...
        >>> c.licenses().get('GPLCompatibleLicenses')
        [GNUGPLv3, GPLv2, LGPLv3, LGPLv2.1, AGPLv3.0, ...
        """
        if category:
            return [License(i, category)
                    for i in self.__retrieve_lic_elem(category)
                    if i.get('id') and i.text]
        else:
            categories = []
            for i in self.categories().values():
                categories += i
            return {'{0}'.format(cat): self.licenses(cat)
                    for cat in categories}

    def __retrieve_lic_elem(self, category):
        return (self.html.find('.big-subsection h4#{0}'.format(category))
                .parent().next_all('dl').eq(0).children('dt a'))
def brand_list():
    res = requests.get('http://list.jd.com/list.html?cat=1319%2C1523%2C7052&go=0')
    d = PyQuery(res.content)
    return map(lambda a: a.text().split(u'(')[0],
               list(d.find('#brandsArea li a').items()))
def brand_list(url):
    # Return the brand list by scraping a JD listing URL.
    # Deprecated: the brand list is now read from a local file.
    res = requests.get(url)
    d = PyQuery(res.content)
    return map(lambda a: a.text().split(u'(')[0],
               list(d.find('#brandsArea li a').items()))
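# The live JD listing page changes over time; the same parsing logic run
# against a saved snippet (fabricated markup; brand names are split on the
# fullwidth parenthesis that precedes the product count):
from pyquery import PyQuery

_snippet = (u'<div id="brandsArea"><ul><li><a>Apple(123)</a></li>'
            u'<li><a>华为(45)</a></li></ul></div>')
_d = PyQuery(_snippet)
print([a.text().split(u'(')[0] for a in _d.find('#brandsArea li a').items()])
# ['Apple', '华为']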
def auto_save_img(html, skip_domain=None, img_url_base=''):
    """Automatically save remote images referenced by the html."""
    from web.flask.globals import g
    from web.flask.helpers import url_for
    from pyquery.pyquery import PyQuery
    from runkit.http_utility import domain
    from runkit.utility import build_date_folder_file
    from config.globals import PHOTOS_PATH
    from manage.models.material import Material, MaterialService

    if not html:
        return html
    pq = PyQuery(html)
    img_list = pq.find("img")
    replace_list = {}
    for img in img_list:
        if 'src' in img.attrib:
            img_src = img.attrib['src']
            if img_src.find('http') != -1:
                img_domain = domain(img_src)
                if img_domain != skip_domain and img_src not in replace_list:
                    new_img_file = img_src.split('/')[-1]
                    name, ext = os.path.splitext(new_img_file)
                    ext = ext[1:]
                    folder_name, file_name = build_date_folder_file()
                    file_name += new_img_file
                    directory = '%s%s' % (PHOTOS_PATH, folder_name)
                    # create the target directory if it does not exist
                    if not os.path.exists(directory):
                        os.makedirs(directory)
                    local_file = '%s/%s' % (directory, file_name)
                    new_img_src = '%s%s/%s' % (img_url_base, folder_name, file_name)
                    # 1. download the data
                    # 2. compute its md5
                    # 3. check whether it already exists in the material library
                    try:
                        sock = urllib2.urlopen(img_src)
                        rcv = sock.read()
                        sock.close()
                        m = hashlib.md5()
                        m.update(rcv)
                        material = MaterialService.get_by_file_signature(m.hexdigest())
                        if not material:
                            f = open(local_file, 'wb')
                            f.write(rcv)
                            size = f.tell()
                            f.close()
                            material = Material()
                            material.added_user_id = g.user.id
                            material.file_name = file_name
                            material.file_ext = ext
                            material.file_path = folder_name
                            material.file_type = ext
                            material.file_size = size
                            material.file_signature = m.hexdigest()
                            material.thumbnail_file = ''
                            material.url = new_img_src
                            if 'alt' in img.attrib:
                                material.title = img.attrib['alt']
                            MaterialService.add_or_update(material)
                        new_img_src = url_for('misc.photo', id=material.id, ext=ext)
                    except Exception, e:
                        raise e
                    replace_list[img_src] = new_img_src
        else:
            raise Exception(u'内部错误')  # "internal error"
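# The dedup step above keys saved files by the MD5 of their raw bytes;
# the same signature computation in isolation:
import hashlib

def file_signature(raw_bytes):
    m = hashlib.md5()
    m.update(raw_bytes)
    return m.hexdigest()

print(file_signature(b'example'))  # 32-character hex digest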