コード例 #1
0
ファイル: solution.py プロジェクト: gleitz/code-kata
def extract_data(text):
    """Accumulate the page's 'p.data' text into the module-global
    `total_data` and return the '.nextState' value for the next request."""
    global total_data
    document = PyQuery(text)
    total_data = total_data + document.find('p.data').text()
    return document.find('.nextState').attr('value')
コード例 #2
0
    def detail_page(self, response):
        """Parse a community detail page, queue its amenities sub-page,
        and return a row [flag, url, json_payload, crawl_time].

        Collects the sell/rent distribution table, the gray-label basic
        info pairs and the dt/dd other-info pairs, then schedules a crawl
        of <url>amenities/ carrying the collected dict as `save`.
        """
        doc = PyQuery(response.text.replace(' ', ''))
        page_url = response.url

        # distribution blocks: section title -> list text
        fenbu = {
            block.find('.field-righttit').text(): block.find('ul').text()
            for block in doc.find(".right-border div").items()
        }
        # gray labels: label text (colon stripped) -> rest of parent text
        basic_info = {}
        for label in doc.find('.fc-gray').items():
            key = label.text().replace(u'：', "").strip()
            value = label.parent().text().replace(label.text(), "").strip()
            basic_info[key] = value
        # definition terms: dt text -> next sibling's text
        other_info = {
            dt.text().replace(u'：', ''): dt.next().text()
            for dt in doc.find('.xiaoqu-otherinfo dt').items()
        }

        info_temp = {
            'base': response.save,
            'sell_rent_info': fenbu,
            'basic_info': basic_info,
            'other_info': other_info
        }
        # carry the collected info into the amenities-page crawl
        self.crawl(page_url + 'amenities/', callback=self.amenities_page,
                   save=info_temp, retries=100)

        return [
            2,
            response.url,
            json.dumps(info_temp),
            time.strftime('%Y-%m-%d %X', time.localtime())
        ]
コード例 #3
0
ファイル: soufun.py プロジェクト: ptphp/PyLib
    def __getPageAllLink(self,p):        
        """Collect fresh (posted within 7 days) listing links from one
        result page and dispatch each to getContent().

        `p` is the raw page HTML. Returns True when the page was full
        (30 items for kinds 1/2, 35 otherwise), i.e. another page likely
        exists; False otherwise.
        """
#        if self.kind=="1":
#            lis=PyQuery(p)("div.qiuzu li")
#        elif self.kind=="2":
#            lis=PyQuery(p)("div.qiuzu li")
        # kinds 1/2 use the "div.house" layout; others use "div.qiuzu li"
        if self.kind=="1" or self.kind=="2":
            lis=PyQuery(p)("div.house")
        else:
            lis=PyQuery(p)("div.qiuzu li")
        links=[]
        for li in lis:
#            if self.kind=="3":
#                tm=PyQuery(li)("p.time span").eq(1).text()
#                link=self.baseurl+PyQuery(li)("p.housetitle a").attr("href")
            if self.kind=="2" or self.kind=="1":
                tm=PyQuery(li)("p.time").text()
                tm=tm and tm.replace("个人","") or ""
                link=self.baseurl+PyQuery(li)("p.housetitle a").attr("href")
            else: 
                tm=PyQuery(li)("span.li5").text()
                link=self.baseurl+PyQuery(li)("span.li2 a").attr("href")
            # kind 4: skip shared-rental rows (the "合租 " marker)
            if self.kind=="4": 
                if PyQuery(li)("span.li1").text()=="合租 ":
                    continue
#            tm=PyQuery(li)("span.li5").text()
#            link=self.baseurl+PyQuery(li)("span.li2 a").attr("href")
            #link=self.baseurl+PyQuery(li)("span.li2 a").attr("href")
#            print link
            # Freshness filter on the post-time text: "天" = days ago
            # (stop scanning at >= 8 days, assuming newest-first order);
            # "小时" (hours) and "分钟" (minutes) are always fresh.
            if u"天" in tm:
                s=tm.find(u"天")
                tm=tm[:s]
                if int(tm)<8:
                    links.append(link)
                else:
                    break
            elif u"小时" in tm:
                links.append(link)
            elif u"分钟" in tm:
                links.append(link)
            else:
                continue
            if 1:#not checkPath(homepath,self.folder,link):
                LinkLog.info("%s|%s"%(self.kind,link))
                try:
                    getContent(link,self.citycode,self.kind)
                except Exception,e:print "ganji getContent Exception %s"%e
            # throttle between item fetches
            time.sleep(int(self.st))
#            fetch_quere.put({"mod":"soufang","link":link,"citycode":self.citycode,"kind":self.kind})
#        self.clinks.extend(links)
       
        # a full page implies there is probably a next page to crawl
        if self.kind=="1" or self.kind=="2":
            if len(links)!=30:
                return False
            else:
                return True
        else:
            if len(links)!=35:
                return False
            else:
                return True
コード例 #4
0
ファイル: models.py プロジェクト: point-source/WhatManager2
    def parse_html_page(self):
        """Parse the stored torrent-detail HTML page into model fields.

        Reads ``self.html_page`` and populates info_hash, title,
        category/subcategory, language, cover_url, small_description,
        description, torrent_url and torrent_size (bytes).

        Raises AttributeError when an expected row ('Picture:',
        'Download') is missing, and ValueError when 'Type' lacks the
        'Category - Subcategory' form.
        """
        pq = PyQuery(self.html_page)
        main_table = pq('#mainBody > table.coltable')

        def find_row(text):
            # Cell following the first-column cell whose text equals
            # `text`; implicitly None when no row matches.
            for c in main_table.find('td:first-child').items():
                if c.text() == text:
                    return c.nextAll().items().next()

        def find_row_text(text, default=''):
            # Text of the matching row, or `default` when absent.
            row = find_row(text)
            if row:
                return row.text()
            return default

        def find_row_html(text, default=''):
            # Inner HTML of the matching row, or `default` when absent.
            row = find_row(text)
            if row:
                return row.html()
            return default

        self.info_hash = find_row_text('Info hash')
        self.title = pq.find('#mainBody > h1').text()
        self.category, self.subcategory = find_row_text('Type').split(' - ', 1)
        self.language = find_row_text('Language')
        self.cover_url = find_row('Picture:').find('img').attr('src')
        self.small_description = find_row_html('Small Description')
        self.description = find_row_html('Description')
        self.torrent_url = find_row('Download').find('a#dlNormal').attr('href')
        size_string = find_row_text('Size')
        # Raw string: the pattern contains \( and \d escapes, which are
        # invalid escape sequences in a plain string literal. Matches
        # e.g. "1.40 GB (1,500,000,000 bytes)".
        match = re.match(r'.* \((?P<size>\d+(,\d\d\d)*) bytes\)', size_string)
        self.torrent_size = int(match.group('size').replace(',', ''))
コード例 #5
0
ファイル: TopBBS.py プロジェクト: liuyun96/python
 def onSuccess(self, tid, context, response,headers):
     """Parse a BBS index page: insert one task row per article link found
     in the <h3><a> elements, then delegate to Spider.onSuccess."""
     resp = PyQuery(response)
     for h3 in resp.find("h3 a"):
         url="http://dev.open.taobao.com/bbs/"+h3.attrib['href']
         print h3.text
         # parameterized INSERT: url and title are bound, not interpolated
         Spider.executeSql(self,"insert into task (task_type,url,status,http_code,task_context) values('topbbs文章',%s,0,-1,%s)",(url,h3.text))
     Spider.onSuccess(self,tid, context,response,headers);
コード例 #6
0
ファイル: models.py プロジェクト: ChaosTherum/WhatManager2
    def parse_html_page(self):
        """Parse the stored torrent-detail HTML page into model fields.

        Reads ``self.html_page`` and populates info_hash, title,
        category/subcategory, language, cover_url, small_description,
        description, torrent_url and torrent_size (bytes).

        Raises AttributeError when an expected row ('Picture:',
        'Download') is missing, and ValueError when 'Type' lacks the
        'Category - Subcategory' form.
        """
        pq = PyQuery(self.html_page)
        main_table = pq('#mainBody > table.coltable')

        def find_row(text):
            # Cell following the first-column cell whose text equals
            # `text`; implicitly None when no row matches.
            for c in main_table.find('td:first-child').items():
                if c.text() == text:
                    return c.nextAll().items().next()

        def find_row_text(text, default=''):
            # Text of the matching row, or `default` when absent.
            row = find_row(text)
            if row:
                return row.text()
            return default

        def find_row_html(text, default=''):
            # Inner HTML of the matching row, or `default` when absent.
            row = find_row(text)
            if row:
                return row.html()
            return default

        self.info_hash = find_row_text('Info hash')
        self.title = pq.find('#mainBody > h1').text()
        self.category, self.subcategory = find_row_text('Type').split(' - ', 1)
        self.language = find_row_text('Language')
        self.cover_url = find_row('Picture:').find('img').attr('src')
        self.small_description = find_row_html('Small Description')
        self.description = find_row_html('Description')
        self.torrent_url = find_row('Download').find('a#dlNormal').attr('href')
        size_string = find_row_text('Size')
        # Raw string: the pattern contains \( and \d escapes, which are
        # invalid escape sequences in a plain string literal. Matches
        # e.g. "1.40 GB (1,500,000,000 bytes)".
        match = re.match(r'.* \((?P<size>\d+(,\d\d\d)*) bytes\)', size_string)
        self.torrent_size = int(match.group('size').replace(',', ''))
コード例 #7
0
def page_parse(content, url):
    """Extract shop contact information from a contactinfo page.

    Returns [shop_id, json_payload, crawl_time] where shop_id is the url
    with 'contactinfo/' and '.html' stripped, and the payload merges the
    named page fields with regex-extracted contact details.
    """
    doc = PyQuery(content)
    # print content[:200].encode('utf8')
    contact_block = doc.find('.box.block.clear-block').html()
    fields = [
        ('shop_name', doc.find('.shop-name>a').text()),
        ('contact_url', url),
        ('shop_years', doc.find('.shop-time>em').text()),
        ('open_time', doc.find('.store-time>em').text()),
        ('contact_person', doc.find('.contactName').text())
    ]
    # regex yields additional (key, value) pairs from the contact block
    fields += re.findall(pattern_contact_info, contact_block)
    return [
        url.replace('contactinfo/', '').replace('.html', ''),
        json.dumps(dict(fields)),
        time.strftime('%Y-%m-%d %X', time.localtime())
    ]
コード例 #8
0
 def _parse(self, response):
     """Extract pagination URLs and proxy IP:port pairs from a page.

     New page URLs (joined against the base URL when the site's config
     asks for 'basejoin') are merged into the instance URL pool, and the
     extracted proxy list is pushed onto proxy_port_queue together with
     the source base URL.
     """
     d = PyQuery(response)
     # page_turning
     __url = map(lambda x: x.attr('href'),
                 d.find(self.__css).items()
                 )
     if config_dictionary.get(self.__url_start).get('basejoin'):
         new_url = map(lambda u: urlparse.urljoin(self.__url_base, u), __url)
     else:
         new_url = __url
     self.__url_pool = self.__url_pool.union(set(new_url))
     # IP address extracting
     # join whitespace-separated tokens with ':' so "ip port" text becomes
     # "ip:port" before matching the address pattern
     rst = ':'.join(d.text().split(' '))
     proxy_list = re.findall(pattern_ip_address, rst)
     proxy_port_queue.put((proxy_list, self.__url_base))
コード例 #9
0
ファイル: utils.py プロジェクト: ivanp/emailsopener
def serializeArray(form):
    """Mimic jQuery's serializeArray: collect (name, value) pairs from a form.

    Returns [] when the element is not a <form>. Disabled or unnamed
    controls are skipped, as are unchecked checkboxes.
    """
    form = PyQuery(form)
    if not form.is_('form'):
        return []

    pairs = []
    for element in form.find('input, select, textarea'):
        field = PyQuery(element)
        skip = (field.is_('[disabled]')
                or not field.is_('[name]')
                or (field.is_('[type=checkbox]') and not field.is_('[checked]')))
        if skip:
            continue
        pairs.append((field.attr('name'), field.val()))

    return pairs
コード例 #10
0
ファイル: tongcheng58.py プロジェクト: aviatorBeijing/ptpy
    def rent(self,url):
        """Scrape a 58.com rental listing page into self.fd.

        Bails out early for non-individual listings, listings without a
        phone image, and listings older than 7 days. Fields are pulled
        with the instance's precompiled regexes against the whitespace-
        stripped detail block, plus BeautifulSoup/PyQuery lookups.
        """
        # derive the city code from the URL host, mapped via citynameDict_sf
        hc= urlparse(url)[1].replace('.58.com',"") 
        hc2=citynameDict_sf.get(hc)
        if hc2:
            self.fd['house_city']=hc2
        else:
            self.fd['house_city']=hc  
        self.fd['house_flag'] = 2
        request = urllib2.Request(url, None, self.header)
        response = urllib2.urlopen(request).read()
        if self.mayGetIt(response):
            self.fd={}
            return 
#        tree = etree.HTML(response)  
        soup =BeautifulSoup(response)
        detail_mer = soup.find('ul',{'class':'info'})
        # collapse whitespace so field regexes can match across lines
        detail_mer_str =re.sub("\n|\t\r| ","",str(detail_mer))
        #print detail_mer_str
        # non-individual listing -> bail out
        #print re.search(self.agencyname_regex, response).group(1)
        if re.search(self.agencyname_regex, response):
            agencyname=re.search(self.agencyname_regex, response).group(1)
            if agencyname != '个人房源':return            
        else:
            return
                
        if re.search(self.username_regex, response):
            username=re.search(self.username_regex, response).group(1)
            self.fd['owner_name'] = username
        else:             
            self.fd['owner_name'] = ""

        owner_phone = soup('img')
#        print owner_phone
        self.fd['owner_phone_pic'] = ''
        for phone in owner_phone:
            if phone['src'].find('58.com/showphone.aspx') != -1:
                self.fd['owner_phone_pic'] = phone['src']
        # no contact info -> bail out
        if not self.fd['owner_phone_pic']:return 
        
        if soup.find('div',{"class":'other'}):
            posttime = soup.find('div',{"class":'other'}).contents[0]                            
            posttime = re.sub('\n|\r| |\t','',posttime)
            posttime = posttime.replace('发布时间:','').replace(' 浏览','')
        else:
            posttime = ''
                            
        # NOTE(review): if posttime matches none of the branches below it
        # stays a string and the age subtraction further down would raise —
        # presumably the site always uses one of these formats; verify.
        if not posttime:
            return                            
        elif posttime.find('-') !=-1:
            s = datetime.datetime(int(posttime.split('-')[0]),int(posttime.split('-')[1],),int(posttime.split('-')[2]))
            posttime = int(time.mktime(s.timetuple()))
        elif posttime.find('分钟') !=-1:
            n = int(posttime.replace('分钟前',''))*60
            posttime = int(time.time() - n)
        elif posttime.find('小时') !=-1:
            n = int(posttime.replace('小时前',''))*60*60
            posttime = int(time.time() - n)
        self.fd['house_posttime'] = posttime
                            
        # skip listings older than one week
        if (time.time() - self.fd['house_posttime']) > 3600*24*7: 
            return
#            print "++++++++++++++++"                 
#        print time.strftime('%Y %m %d', time.localtime(self.fd['posttime']))    
        
        if re.search(self.house_floor_regex, detail_mer_str):
            house_floor=re.search(self.house_floor_regex, detail_mer_str).group(1)
            self.fd['house_floor']  = int(house_floor)
        else:
            self.fd['house_floor'] = 0
            
        if re.search(self.house_topfloor_regex, detail_mer_str):
            house_topfloor=re.search(self.house_topfloor_regex, detail_mer_str).group(1)
            self.fd['house_topfloor'] = int(house_topfloor)
        else:
            self.fd['house_topfloor'] = 0   
        
        # NOTE(review): the total area is stored under 'house_addr' here and
        # is overwritten by the real address below when a "地址:" row exists —
        # looks like a copy/paste slip ('house_totalarea' intended?); confirm.
        if re.search(self.house_totalarea_regex, detail_mer_str):
            house_totalarea=re.search(self.house_totalarea_regex, detail_mer_str).group(1)
            self.fd['house_addr'] = int(house_totalarea)
        else:
            self.fd['house_addr'] = 0
            
        # property type 
        self.fd['house_type'] = housetype(detail_mer_str) 
            
        self.fd['house_price'] = str(detail_mer.em.string)  
            
        if re.search(self.house_room_regex, detail_mer_str):
            house_room=re.search(self.house_room_regex, detail_mer_str).group(1)
            self.fd['house_room'] =int(house_room)
        else:
            self.fd['house_room'] = 0
            
        if re.search(self.house_hall_regex, detail_mer_str):
            house_hall=re.search(self.house_hall_regex, detail_mer_str).group(1)
            self.fd['house_hall'] = int(house_hall)
        else:
            self.fd['house_hall'] = 0
        
        if re.search(self.house_toilet_regex, detail_mer_str):
            house_toilet=re.search(self.house_toilet_regex, detail_mer_str).group(1)
            self.fd['house_toilet'] = int(house_toilet)
        else:
            self.fd['house_toilet'] = 0
            
        if re.search(self.house_veranda_regex, response):
            house_veranda=re.search(self.house_veranda_regex, response).group(1)
            self.fd['house_veranda'] = int(house_veranda)
        else:
            self.fd['house_veranda'] = 0
        
        if re.search(self.house_title_regex, response):
            house_title=re.search(self.house_title_regex, response).group(1)
            self.fd['house_title'] = house_title.replace("(求购)","").replace("(求租)","").replace("(出售)","")
        else:
            self.fd['house_title'] = ''
        
        # description        
        detail_box = soup.find('div',{'class':'maincon'})
        if detail_box:
            house_desc = str(detail_box)
            self.fd['house_desc'] = re.sub("<.*?>|\n|\r|\t|联系我时,请说是在58同城上看到的,谢谢!","",house_desc)
        else:
            self.fd['house_desc'] = None

        # neighborhood (borough) name
        if re.search(self.borough_name_regex, detail_mer_str):
            borough_name=re.search(self.borough_name_regex, detail_mer_str).group(1)
            try:
                self.fd['borough_name'] = re.sub("\(.*\)|<.*?>","",borough_name)
            except:
                self.fd['borough_name'] =borough_name
        else:
            self.fd['borough_name'] = ''
        # pull the address from the <li> whose text contains "地址:"
        lis=PyQuery(unicode(detail_mer_str,"UTF-8"))("li")
        for li in lis:
            lit= PyQuery(li).text()
            if "地址:" in lit:
                self.fd['house_addr']=lit[lit.find(":")+1:lit.find(u"(")]
                break
        # district / area     
        area=detail_mer.find(text=u"区域:")
        if area:
            area_box = area.parent.parent
            area_a = area_box('a')
            if area_a and len(area_a)>1:
                self.fd['house_region'] = str(area_a[0].string)
                self.fd['house_section'] = str(area_a[1].string)
            elif area_a and len(area_a)==1:
                self.fd['house_region'] = str(area_a[0].string)
                self.fd['house_section'] = ""
            else:
                self.fd['house_region'] = ""
                self.fd['house_section'] = ""
        else:
                self.fd['cityarea'] = ""
                self.fd['section'] = ""
        
        if re.search(self.house_age_regex, response):
            house_age=re.search(self.house_age_regex, response).group(1)
            Y=int(time.strftime('%Y', time.localtime()))
            house_age=Y-int(house_age)
            self.fd['house_age'] = house_age
        else:
            self.fd['house_age'] = 0
            
        # orientation / fitment / deposit via helper classifiers
        self.fd['house_toward'] = toward(detail_mer_str)    
        self.fd['house_fitment'] = fitment(detail_mer_str)        
        self.fd['house_deposit'] = deposit(detail_mer_str)
        # drop references to the large response objects promptly
        request = None
        response = None
        soup=None
        del request
        del response
        del soup
コード例 #11
0
def extract_upload_errors(html):
    """Collect the red, centered error paragraphs from an upload page."""
    pq = PyQuery(html)
    selector = '.thin > p[style="color: red; text-align: center;"]'
    return [PyQuery(node).text() for node in pq.find(selector)]
コード例 #12
0
def extract_upload_errors(html):
    """Return the text of each red, centered error paragraph on the page."""
    document = PyQuery(html)
    matches = document.find('.thin > p[style="color: red; text-align: center;"]')
    return [PyQuery(paragraph).text() for paragraph in matches]
コード例 #13
0
ファイル: classifier.py プロジェクト: mkouhei/lic-check
class Classifier(object):
    """classify verious licences.

    >>> c = Classifier()
    >>> c.segments
    [SoftwareLicenses, DocumentationLicenses, OtherLicenses]
    >>> c.segments[0].categories
    [GPLCompatibleLicenses, GPLIncompatibleLicenses, NonFreeSoftwareLicenses]
    >>> c.segments[0].categories[0].licenses
    [GNUGPLv3, GPLv2, LGPLv3, LGPLv2.1, AGPLv3.0, ...
    """

    # bundled copy of the license-list HTML parsed at init time
    default_data = 'lic_check/license.html'

    def __init__(self):
        """initialize."""
        with open(self.default_data) as fobj:
            data = fobj.read()
        self.html = PyQuery(data)
        self.segments = self._parse()

    def _parse(self):
        """parse license html."""
        # build the segment -> category -> license tree top-down
        segments = []
        for segment in self._segments():
            segment.categories = self.categories(segment)
            for category in segment.categories:
                category.licenses = self.licenses(category)
            segments.append(segment)
        return segments

    def _segments(self):
        """segments."""
        # one-arg pyquery .filter receives the element index; `i != 0`
        # appears to drop the first h3 — presumably the page's own
        # heading; confirm against license.html.
        return (Segment(i) for i in self.html.find('.big-section h3')
                .filter(lambda i: i != 0))

    def categories(self, segment=None):
        """categories.

        With a segment: its list of Category objects. Without: a dict
        mapping segment name to that list.

        >>> c = Classifier()
        >>> c.categories(c.segments[0])
        [GPLCompatibleLicenses, GPLIncompatibleLicenses, NonFreeSoftware...
        >>> c.categories(c.segments[1])
        [FreeDocumentationLicenses, NonFreeDocumentationLicenses]
        >>> c.categories(c.segments[2])
        [OtherLicenses, Fonts, OpinionLicenses, Designs]
        >>> c.categories().get('SoftwareLicenses')
        [GPLCompatibleLicenses, GPLIncompatibleLicenses, NonFreeSoftware...
        >>> c.categories().get('DocumentationLicenses')
        [FreeDocumentationLicenses, NonFreeDocumentationLicenses]
        """
        if segment:
            return [Category(i, segment)
                    for i in self.__retrieve_cat_elem(segment)]
        else:
            return {'{0}'.format(_seg): self.categories(_seg)
                    for _seg in self.segments}

    def __retrieve_cat_elem(self, segment):
        # Find the TOC link whose href is "#<segment>" (two-arg filter:
        # index + DOM node) and return the anchors of its sibling <ul>.
        return (self.html.find('.toc ul li a')
                .filter(lambda i, this: PyQuery(this)
                        .attr('href') == '#{0}'.format(segment))
                .siblings('ul').find('a'))

    def licenses(self, category=None):
        """licenses.

        With a category: its list of License objects. Without: a dict
        mapping category name to that list.

        >>> c = Classifier()
        >>> sw_lic = c.segments[0]
        >>> gpl_compat_lic = c.categories(sw_lic)[0]
        >>> gpl_compat_lics = c.licenses(gpl_compat_lic)
        >>> len(gpl_compat_lics)
        50
        >>> gpl_compat_lics[0]
        GNUGPLv3
        >>> gpl_compat_lics[0].category
        GPLCompatibleLicenses
        >>> gpl_compat_lics[0].segment
        SoftwareLicenses
        >>> gpl_incompat_lic = c.categories(c.segments[0])[1]
        >>> c.licenses(gpl_incompat_lic)
        [AGPLv1.0, AcademicFreeLicense, apache1.1, ...
        >>> nonfree_lic = c.categories(sw_lic)[2]
        >>> c.licenses(nonfree_lic)
        [NoLicense, Aladdin, apsl1, ...
        >>> c.licenses().get('GPLCompatibleLicenses')
        [GNUGPLv3, GPLv2, LGPLv3, LGPLv2.1, AGPLv3.0, ...
        """
        if category:
            # only anchors carrying an id and text are real license entries
            return [License(i, category)
                    for i in self.__retrieve_lic_elem(category)
                    if i.get('id') and i.text]
        else:
            categories = []
            for i in self.categories().values():
                categories += i
            return {'{0}'.format(cat): self.licenses(cat)
                    for cat in categories}

    def __retrieve_lic_elem(self, category):
        # The first <dl> after the category's h4 heading holds the license
        # entries; take its <dt><a> anchors.
        return (self.html.find('.big-subsection h4#{0}'.format(category))
                .parent().next_all('dl').eq(0).children('dt a'))
コード例 #14
0
def brand_list():
    """Return phone brand names scraped from the JD category listing page
    (text before the fullwidth '(' of each brand anchor)."""
    res = requests.get('http://list.jd.com/list.html?cat=1319%2C1523%2C7052&go=0')
    doc = PyQuery(res.content)
    anchors = list(doc.find('#brandsArea li a').items())
    return map(lambda anchor: anchor.text().split(u'(')[0], anchors)
コード例 #15
0
def brand_list(url):
    """Return the brand list parsed from a JD listing page at `url`.

    Deprecated: the brand list is now served from a local file instead.
    """
    page = requests.get(url)
    doc = PyQuery(page.content)
    anchors = list(doc.find('#brandsArea li a').items())
    return map(lambda anchor: anchor.text().split(u'(')[0], anchors)
コード例 #16
0
def auto_save_img(html, skip_domain=None, img_url_base=''):
    """Download remote <img> sources found in `html`, store them locally
    and register them as Material records, building replace_list as a
    map of original src -> new local URL.

    NOTE(review): replace_list is built but never applied to `html`, and
    the function falls off the end (returns None) after the loop —
    presumably the substitution happens in the caller or the tail of this
    function was lost; confirm.
    """
    from web.flask.globals import g
    from web.flask.helpers import url_for
    from pyquery.pyquery import PyQuery
    from runkit.http_utility import domain
    from runkit.utility import build_date_folder_file
    #from config.globals import PHOTOS_PATH
    #import Image, ImageEnhance
    #from manage.models.material import Material, MaterialService
    """
    自动保存远端的图片
    """

    if not html:
        return html

    pq = PyQuery(html)
    img_list = pq.find("img")

    # original remote src -> new local src
    replace_list = {}

    for img in img_list:
        if 'src' in img.attrib:
            img_src = img.attrib['src']
            if img_src.find('http') != -1:
                img_domain = domain(img_src)

                if img_domain != skip_domain and img_src not in replace_list:
                    #print img_domain, img_src
                    new_img_file = img_src.split('/')[-1]
                    name, ext = os.path.splitext(new_img_file)
                    ext = ext[1:]

                    folder_name, file_name = build_date_folder_file()
                    file_name += new_img_file

                    directory = '%s%s' % (PHOTOS_PATH, folder_name)

                    # create the date-based directory when missing
                    if not os.path.exists(directory):
                        os.makedirs(directory)

                    local_file = '%s/%s' % (directory, file_name)
                    new_img_src = '%s%s/%s' % (img_url_base, folder_name,
                                               file_name)

                    #print local_file, new_img_src

                    # 1. download the data
                    # 2. compute its md5
                    # 3. check the material library for an existing copy
                    try:
                        #urllib.urlretrieve(img_src, local_file)
                        sock = urllib2.urlopen(img_src)
                        rcv = sock.read()
                        sock.close()

                        m = hashlib.md5()
                        m.update(rcv)

                        material = MaterialService.get_by_file_signature(
                            m.hexdigest())

                        if not material:
                            f = open(local_file, 'wb')
                            f.write(rcv)
                            size = f.tell()
                            f.close()

                            material = Material()
                            material.added_user_id = g.user.id
                            material.file_name = file_name
                            material.file_ext = ext
                            material.file_path = folder_name
                            material.file_type = ext
                            material.file_size = size
                            material.file_signature = m.hexdigest()
                            material.thumbnail_file = ''
                            material.url = new_img_src

                            if 'alt' in img.attrib:
                                material.title = img.attrib['alt']
                            MaterialService.add_or_update(material)

                        new_img_src = url_for('misc.photo',
                                              id=material.id,
                                              ext=ext)

                    except Exception, e:
                        # NOTE(review): py2 `raise e` discards the original
                        # traceback; a bare `raise` would preserve it.
                        raise e

                    replace_list[img_src] = new_img_src
        else:
            raise Exception(u'内部错误')
コード例 #17
0
ファイル: save_img.py プロジェクト: liushaochan/cn486
def auto_save_img(html, skip_domain=None, img_url_base=''):
    """Download remote <img> sources found in `html`, store them locally
    and register them as Material records, building replace_list as a
    map of original src -> new local URL.

    NOTE(review): replace_list is built but never applied to `html`, and
    the function falls off the end (returns None) after the loop —
    presumably the substitution happens in the caller or the tail of this
    function was lost; confirm.
    """
    from web.flask.globals import g
    from web.flask.helpers import url_for
    from pyquery.pyquery import PyQuery
    from runkit.http_utility import domain
    from runkit.utility import build_date_folder_file
    #from config.globals import PHOTOS_PATH
    #import Image, ImageEnhance
    #from manage.models.material import Material, MaterialService
    """
    自动保存远端的图片
    """

    if not html:
        return html

    pq = PyQuery(html)
    img_list = pq.find("img")

    # original remote src -> new local src
    replace_list = {}

    for img in img_list:
        if 'src' in img.attrib:
            img_src = img.attrib['src']
            if img_src.find('http') != -1:
                img_domain = domain(img_src)

                if img_domain != skip_domain and img_src not in replace_list:
                    #print img_domain, img_src
                    new_img_file = img_src.split('/')[-1]
                    name, ext = os.path.splitext(new_img_file)
                    ext = ext[1:]

                    folder_name, file_name = build_date_folder_file()
                    file_name += new_img_file

                    directory = '%s%s' % (PHOTOS_PATH, folder_name)

                    # create the date-based directory when missing
                    if not os.path.exists(directory):
                        os.makedirs(directory)

                    local_file = '%s/%s' % (directory, file_name)
                    new_img_src = '%s%s/%s' % (img_url_base, folder_name, file_name)

                    #print local_file, new_img_src

                    # 1. download the data
                    # 2. compute its md5
                    # 3. check the material library for an existing copy
                    try:
                        #urllib.urlretrieve(img_src, local_file)
                        sock = urllib2.urlopen(img_src)
                        rcv = sock.read()
                        sock.close()

                        m = hashlib.md5()
                        m.update(rcv)

                        material = MaterialService.get_by_file_signature(m.hexdigest())

                        if not material:
                            f = open(local_file, 'wb')
                            f.write(rcv)
                            size = f.tell()
                            f.close()

                            material = Material()
                            material.added_user_id = g.user.id
                            material.file_name = file_name
                            material.file_ext = ext
                            material.file_path = folder_name
                            material.file_type = ext
                            material.file_size = size
                            material.file_signature = m.hexdigest()
                            material.thumbnail_file = ''
                            material.url = new_img_src

                            if 'alt' in img.attrib:
                                material.title = img.attrib['alt']
                            MaterialService.add_or_update(material)

                        new_img_src = url_for('misc.photo', id=material.id, ext=ext)

                    except Exception, e:
                        # NOTE(review): py2 `raise e` discards the original
                        # traceback; a bare `raise` would preserve it.
                        raise e

                    replace_list[img_src] = new_img_src
        else:
            raise Exception(u'内部错误')