def parse(message):
    """Parse an incoming XML message into a flat dict of its fields.

    Every CDATA wrapper is first rewritten from ``<![CDATA[...]]>`` to
    ``![CDATA[...]]`` (through ``cdatarepl``) so that it lines up with the
    literal ``![CDATA[`` / ``]]`` markers used in the scrapemark template
    below.  The rewritten text is then scraped and the captured ``message``
    mapping is post-processed.

    Args:
        message: Raw XML text of the message.

    Returns:
        A dict with every captured field whose value is truthy, plus
        ``'message'`` (the original, unmodified input) and ``'createTime'``
        converted to ``int``.

    Raises:
        KeyError: If no non-empty ``createTime`` was captured.
        ValueError: If the captured ``createTime`` is not an integer string.
    """
    # The template text is kept byte-for-byte; only its layout in the source
    # changed (adjacent literals are concatenated by the compiler).
    # NOTE(review): the key ``localtionY`` is spelled this way on purpose to
    # stay compatible with existing callers -- it looks like a typo for
    # ``locationY``; confirm before renaming.
    # NOTE(review): ``<Scale>20</Scale>`` is literal template text; confirm
    # whether location messages with another scale are expected to match.
    pattern = scrapemark.compile(
        " <xml>"
        " <ToUserName>![CDATA[{{ message.toUser }}]]</ToUserName>"
        " <FromUserName>![CDATA[{{ message.fromUser }}]]</FromUserName>"
        " <CreateTime>{{ message.createTime }}</CreateTime>"
        " <MsgType>![CDATA[{{ message.msgType }}]]</MsgType>"
        " {* <Content>![CDATA[{{ message.content }}]]</Content> *}"
        " {* <Location_X>{{ message.locationX }}</Location_X>"
        " <Location_Y>{{ message.localtionY }}</Location_Y>"
        " <Scale>20</Scale>"
        " <Label>![CDATA[{{ message.label }}]]</Label> *}"
        " {* <PicUrl>![CDATA[{{ message.picUrl }}]]</PicUrl> *}"
        " {* <Title>![CDATA[{{ message.title }}]]</Title>"
        " <Description>![CDATA[{{ message.description }}]]</Description>"
        " <Url>![CDATA[{{ message.url }}]]</Url> *}"
        " {* <Event>![CDATA[{{ message.event }}]]</Event>"
        " <EventKey>![CDATA[{{ message.eventkey }}]]</EventKey> *}"
        " {* <MsgId>{{ message.msgId }}</MsgId> *}"
        " </xml>"
        " "
    )

    # Non-greedy ``.*?`` keeps each CDATA section separate: a greedy ``.*``
    # would run from the first ``<![CDATA[`` to the last ``]]>`` on a line and
    # unwrap only the outermost pair.  ``(?s)`` lets a CDATA body span
    # several lines.
    unwrapped = re.sub(r'(?s)<(!\[CDATA\[.*?\]\])>', cdatarepl, message)

    captured = pattern.scrape(html=unwrapped)['message']

    # Drop fields that were not captured or came back empty.
    msg = dict((key, value) for key, value in captured.items() if value)
    msg['message'] = message
    msg['createTime'] = int(msg['createTime'])
    return msg
# NOTE(review): the next line is the tail of a function whose ``def`` line lies
# outside this excerpt (presumably ``fetch_beautified_html``, which is installed
# as ``scrapemark.fetch_html`` just below -- confirm).  Its nesting depth cannot
# be recovered from here, so the statements are left exactly as found.
headers['User-Agent'] = user_agent if verbose: print 'fetching', url, '...' request = urllib2.Request(url, post, headers) opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cookie_jar)) res = opener.open(request).read() if verbose: print 'DONE fetching.' bs=BeautifulSoup(res).prettify() return bs

# Swap scrapemark's HTML fetcher for the local one, so pages are passed
# through BeautifulSoup's prettify() before scrapemark sees them.
scrapemark.fetch_html=fetch_beautified_html

# Repeating template: each match appends an entry to the ``fishes`` list with
# an ``image`` (the <img> src) and a ``data`` (text of the following cell).
pattern = scrapemark.compile(""" {* <td valign="top" width="140"><img src="{{ [fishes].image }}" /></td><td>{{ [fishes].data }}</td> *} """)


# Generator: scrapes the price page (handing ``post={'page':7}`` to scrapemark)
# and yields the entries of the resulting ``fishes`` list one at a time.
def scrape():
    for fish in pattern.scrape(url='http://www.tcfishery.com/price/default.asp', post={'page':7})['fishes']:
        yield fish


# Script entry point: turns each scraped entry into name/price/date fields and
# emits it through ``logging.error``.
if __name__ == "__main__":
    for fish in scrape():
        # Split on single spaces and pick fields by fixed position.
        # NOTE(review): indices 1, 5 and 8 presumably depend on the exact
        # whitespace layout of the prettified HTML -- confirm against a page.
        data=fish['data'].split(' ')
        fish['name']=data[1]
        fish['price']=float(data[5])
        fish['date']=data[8]
        # The raw text is no longer needed once the fields are extracted.
        del fish['data']
        logging.error(fish)
from application import scrapemark
import logging
import re

# Scrapemark template for the message envelope.  The optional ``{* ... *}``
# groups cover the text, location and picture fields.  Adjacent literals are
# concatenated at compile time, so the template text itself is unchanged.
pattern = scrapemark.compile(
    " <xml>"
    " <ToUserName>![CDATA[{{ message.toUser }}]]</ToUserName>"
    " <FromUserName>![CDATA[{{ message.fromUser }}]]</FromUserName>"
    " <CreateTime>{{ message.createTime }}</CreateTime>"
    " <MsgType>![CDATA[{{ message.msgType }}]]</MsgType>"
    " {* <Content>![CDATA[{{ message.content }}]]</Content> *}"
    " {* <Location_X>{{ message.locationX }}</Location_X>"
    " <Location_Y>{{ message.localtionY }}</Location_Y>"
    " <Scale>20</Scale>"
    " <Label>![CDATA[{{ message.label }}]]</Label>*}"
    " {* <PicUrl>![CDATA[{{ message.picUrl }}]]</PicUrl> *}"
    " </xml>"
    " "
)


def cdatarepl(matchobj):
    """Return the text captured by group 1 of ``matchobj``.

    Shaped as a replacement callback for ``re.sub``: the whole match is
    replaced by whatever its first capture group matched.

    Args:
        matchobj: A regular-expression match object with at least one group.

    Returns:
        The string captured by the first group.
    """
    captured = matchobj.group(1)
    return captured