Example #1
def parsing(cls,
            url,
            configs=None,
            outers=None,
            document=None,
            check=None):
    """Parsing interface (a classmethod excerpted from its class in the
    original source; Matcher, Formatter and download_page come from the
    project's own modules).

    :param url: the URL to parse
    :type url: str
    :param configs: general configuration
    :type configs: dict
    :param outers: external-link configuration
    :type outers: dict
    :param document: the content of the URL
    :type document: str
    :param check: fields to verify; if any of them was not extracted,
        an empty result is returned
    :type check: list
    :return: whether a usable domain configuration was found (bool),
        and the parsing result (dict)
    :rtype: bool, dict
    """
    if check is None:
        check = ["content"]
    if not document:
        document = download_page(url)
    flag = False
    for config in Matcher.match(url, configs=configs, outers=outers):
        flag = True
        result = cls.extract(url, document, config)
        # Accept the first match whose required fields are all non-empty.
        if all(result[field] for field in check):
            return flag, result
    return flag, Formatter.format_result()
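A minimal usage sketch, assuming the method lives on a hypothetical Parser class and that configs maps domains to extraction rules; Parser, the config layout, and the URL are all illustrative placeholders, not names from the original source:

# Hypothetical usage; Parser and the configs layout are placeholders.
configs = {"news.example.com": {"content": "//div[@id='article']"}}
matched, result = Parser.parsing("http://news.example.com/1.html", configs=configs)
if matched:
    print(result.get("content"))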
Example #2
import logging
import re

from download import download_page  # assumed project helper (see Example #5)


def retrieve_incomes(movie_id, full_week=False, use_cumes=False):
    '''Retrieve income data for a movie_id from BoxOfficeMojo.'''

    def parse_incomes(page, full_week, use_cumes):
        '''Parse a BoxOfficeMojo page for movie incomes.'''
        if use_cumes:
            if full_week:
                pattern = r'<font color="#800080" size="2">\$([0-9,]*?) / ([0-9]*?)</font>'
            else:
                pattern = r'<td align="right"><font size="2">\$([0-9,]*?)</font></td><td align="center"><font size="2">([0-9]*?)</font></td></tr>'
            incomes = [g[0] for g in re.findall(pattern, page)]
        else:
            if full_week:
                pattern = r'<font color="#000080">\$([0-9,]*?)</font>'
                incomes = re.findall(pattern, page)
            else:
                pattern = r'<td align="center"><font size="2">([0-9\-]*?)</font></td><td align="right"><font size="2">\$([0-9,]*?)</font></td>'
                incomes = [g[1] for g in re.findall(pattern, page)]
        # Strip thousands separators and convert to integers.
        return [int(g.replace(",", "")) for g in incomes]

    url_template = "http://www.boxofficemojo.com/movies/?page=%s&id=%s.htm"
    if full_week:
        url = url_template % ("daily", movie_id)
    else:
        url = url_template % ("weekend", movie_id)
    resp = download_page(url)
    if resp is None:
        logging.warning("BoxOfficeMojo movie not found: %s", url)
        return None
    return {'id': movie_id, 'values': parse_incomes(resp, full_week, use_cumes)}
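A quick usage sketch; the movie id below is a hypothetical stand-in for the slug that appears after id= in a BoxOfficeMojo movie URL:

# Hypothetical id; replace with the slug from an actual BoxOfficeMojo URL.
data = retrieve_incomes("titanic", full_week=False, use_cumes=False)
if data is not None:
    print(data["id"], data["values"][:5])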
Example #3
import logging
import re

from download import download_page  # assumed project helper (see Example #5)

# friends_list_pattern and page_count_pattern are module-level regexes, and
# is_artist_pattern a marker substring, all defined elsewhere in the source.


def parse_friends_URL(url, only_id=True):
    '''Return the list of friends and, unless only_id, the page count and artist flag.'''

    def parse_friends_seed_name(friends_page):
        # Unused option to load a profile name from the friends page.
        # Unfortunately it depends on the page language, and works only by adding
        # a cookie manager to open us.myspace.com, or else by directly setting the
        # cookie MSCulture with IPCulture=en-US using ClientCookie in download.py.
        seed_name_pattern = '<span class="feature_headtext">&nbsp;amigos de(.*?)</span>'
        try:
            return re.search(seed_name_pattern, friends_page).group(1)
        except AttributeError:
            return False

    def parse_friends_list(friends_page):
        '''Return the list of friend IDs in the page.'''
        friends = re.findall(friends_list_pattern, friends_page)
        # Skip deleted accounts, whose third capture group carries no title.
        matches = [re.search(' title="(.*?)"', f[2]) for f in friends]
        return [{"id": int(friends[i][0]), "name": m.group(1), "url": friends[i][1]}
                for i, m in enumerate(matches) if m is not None]

    def parse_friends_page_count(friends_page):
        '''Return the number of friends pages in the page.'''
        match = re.search(page_count_pattern, friends_page)
        try:
            return int(re.sub(r"\D", "", match.group(1)))
        except (AttributeError, ValueError):
            return 1

    def parse_friends_is_artist(friends_page):
        '''Return True if the page belongs to a musician.'''
        return friends_page.find(is_artist_pattern) > 0

    resp = download_page(url)
    friends = count_pages = is_artist = None
    if resp is None:
        logging.debug("URL error on: %s", url)
    else:
        friends = parse_friends_list(resp)
        if not only_id:
            count_pages = parse_friends_page_count(resp)
            is_artist = parse_friends_is_artist(resp)
            # name = parse_friends_seed_name(resp)
    if only_id:
        return friends
    return friends, count_pages, is_artist
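A hedged usage sketch; the URL below is only a placeholder for a MySpace friends page, not a verified endpoint:

# Hypothetical call; with only_id=False a three-element tuple comes back.
friends, count_pages, is_artist = parse_friends_URL(
    "http://example.myspace.com/friends?friendID=123456", only_id=False)
if friends is not None:
    print(len(friends), "friends over", count_pages, "pages; artist:", is_artist)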
Example #4
import logging
import re

from download import download_page  # assumed project helper (see Example #5)

# profile_id_pattern, profile_name_pattern and profile_url_pattern are
# module-level regexes defined elsewhere in the source.


def parse_profile_URL(url, only_artists=True):
    '''Return the details of a profile. For now, implemented ONLY for artists.'''

    def parse_profile_id(profile_page):
        '''Return the ID of a profile from the profile page.'''
        match = re.search(profile_id_pattern, profile_page)
        try:
            return int(re.sub(r"\D", "", match.group(1)))
        except (AttributeError, ValueError):
            return None

    def parse_profile_name(profile_page):
        '''Return the name of a profile from the profile page.'''
        try:
            return re.search(profile_name_pattern, profile_page).group(1)
        except AttributeError:
            return None

    def parse_profile_suffix(profile_page):
        '''Return the URL suffix from the profile page.'''
        # If it's not a musician, the link looks like:
        # <span class="urlLink"><a href="http://www.myspace.com/bellatopa" title="Perfil MySpace para Antonella" class="url">www.myspace.com/bellatopa</a></span>
        try:
            return re.search(profile_url_pattern, profile_page).group(1)
        except AttributeError:
            return None

    resp = download_page(url)
    if resp is None:
        logging.debug("URL error on: %s", url)
        return None
    profile_id = parse_profile_id(resp)  # avoid shadowing the builtin id()
    if profile_id is None:  # TO ADD: and only_artists:
        logging.debug("Profile is not an artist %s", url)
        return None
    name = parse_profile_name(resp)
    suffix = parse_profile_suffix(resp)
    return {"id": profile_id, "name": name, "url": suffix}
Example #5
# -*- coding: utf-8 -*-
import re

import download

url = 'http://www.163.com/'
html = download.download_page(url)
# Extract (href, anchor text) pairs from every link on the page.
urls = re.findall('<a href="(.*?)">(.*?)</a>', html)
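A short continuation, assuming download.download_page returned the page as a decoded string:

# Purely illustrative: print the first few (href, anchor text) pairs.
for href, text in urls[:10]:
    print(href, text)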