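# Standard-library imports assumed by this snippet (Python 2); project-specific
# names used below (CrawlerSettings, Log, ExtendMapHandler, UrlFilter,
# query_domain_from_url) are expected to be imported from the le_crawler
# package elsewhere in the original module.
import re
import sys
import md5
import urlparse
import threading
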
class UrlNormalize(object):
  #def init_once(self, *args, **kwargs):
  def __init__(self, module_path = 'le_crawler.base.url_normalize_settings'):
    __import__(module_path)
    self.__settings = CrawlerSettings(settings_module = sys.modules[module_path])
    self.__load_settings()

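  # Compile each {id: [pattern, ...]} entry from the settings into a list of
  # case-insensitive regex objects keyed by the same id.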
  def __convert_map_regs(self, regmap):
    if not regmap:
      return {}
    tmpres = {}
    for (id, reglist) in regmap.items():
      regstmp = []
      for r in reglist:
        #print 're....%s' % r
        regstmp.append(re.compile(r, re.IGNORECASE))
      if regstmp:
        tmpres[id] = regstmp
    return tmpres

  def __accept_reg(self, id_reglist, item):
    if not item:
      return None
    for (id, tmpres) in id_reglist.items():
      for r in tmpres:
        if r.search(item):
          return id
    return None

  def __get_keep_para_list(self, mapdict, id):
    if mapdict.has_key(id):
      return mapdict[id]
    return []

  def __get_keep_query_lst(self, id):
    return self.__get_keep_para_list(self.__keep_query, id)

  def __keep_fragment(self, id):
    if not self.__keep_fragments or not self.__keep_fragments.has_key(id):
      return False
    return self.__keep_fragments[id]

  def __update_paras_with_extra(self, input_dict, id):
    if not self.__extra_para.has_key(id):
      return input_dict
    input_dict.update(self.__extra_para[id])
    return input_dict

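  # Resolve the normalization rule id for a url: exact domain lookup first,
  # then regex matching against the full url.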
  def get_mapping_id(self, url = None, domain = None):
    # first mapping domain
    if self.__id_mapping_domain.has_key(domain):
      return self.__id_mapping_domain[domain]
    # second try match reg
    return self.__accept_reg(self.__id_mapping_reg, url)

  def __set_query_dict(self, org_dict, id):
    if org_dict is None or id is None:
      return {}
    domain_k_p = self.__get_keep_query_lst(id)
    retdict = {}
    for (k,ef) in domain_k_p:
      if org_dict.has_key(k) and org_dict[k] != '':
        retdict[k] = org_dict[k]
      elif ef:
        retdict[k] = ''
    return retdict

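  # Serialize a parse_qs-style dict back into a query string; keys are sorted
  # in reverse order and only the first value of each parameter is kept.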
  def __join_query(self, inputd):
    if not inputd:
      return ''
    query_str = None
    reslist = sorted(inputd.items(), key = lambda d: d[0], reverse = True)
    for (k, v) in reslist:
      if query_str:
        query_str += '&%s=%s' % (k, v[0])
      else:
        query_str = '%s=%s' % (k, v[0])
    return query_str

  def __load_settings(self):
    self.__id_mapping_reg = self.__convert_map_regs(self.__settings.getdict('ID_MAPPING_REG', {}))
    self.__id_mapping_domain = self.__convert_map_regs(self.__settings.getdict('ID_MAPPING_DOMAIN', {}))
    self.__keep_query = self.__settings.getdict('KEEP_QUERY', {})
    self.__keep_fragments = self.__settings.getdict('KEEP_FRAGEMENT', {})
    self.__extra_para = self.__settings.getdict('ADD_EXTRA_PARA', {})

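  # Canonicalize a url according to the rules for its mapping id: keep only the
  # whitelisted query parameters, append any configured extra parameters, and
  # keep the fragment only for ids configured to do so.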
  def get_unique_url(self,
      url,
      scheme = None,
      netloc = None,
      domain = None,
      no_conf_no_oper = False):
    id = self.get_mapping_id(url = url, domain = domain)
    if id is None:
      if not no_conf_no_oper:
        id = 'DEFAULT'
      else:
        return url
    if id is None or url is None:
      raise Exception('Failed get mapping id for: %s, %s' % (domain, url))
    urlp = urlparse.urlsplit(url.strip(), allow_fragments = self.__keep_fragment(id))
    if not urlp:
      raise Exception('Failed convert urlparse %s' % url)
    nscheme = urlp.scheme or scheme
    nnetloc = urlp.netloc or netloc
    qdict = urlparse.parse_qs(urlp.query)
    fqdict = self.__set_query_dict(qdict, id)
    self.__update_paras_with_extra(fqdict, id)
    nquery = self.__join_query(fqdict)
    return urlparse.urlunsplit((nscheme, nnetloc, urlp.path, nquery,
      urlp.fragment)).strip()


class HeadLineAlbumExtractor(object):
  _instance = None
  _instance_lock = threading.Lock()
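  # Thread-safe lazy construction of the shared extractor instance.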
  @staticmethod
  def get_instance(
      start_url_loader,
      setting_module_path = 'le_crawler.common.headline_album_settings',
      *kargs,
      **kwargs):
    # create the singleton under the lock; 'with' ensures the lock is released
    # even if the constructor raises
    with HeadLineAlbumExtractor._instance_lock:
      if not HeadLineAlbumExtractor._instance:
        loger = Log('album_crawler', '../log/album_crawler.log')
        HeadLineAlbumExtractor._instance = \
            HeadLineAlbumExtractor(start_url_loader,
                loger,
                setting_module_path,
                *kargs,
                **kwargs)
    return HeadLineAlbumExtractor._instance

  def __init__(self,
      start_url_loader,
      loger,
      setting_module_path = 'le_crawler.common.headline_album_settings',
      *kargs,
      **kwargs):
    __import__(setting_module_path)
    self.__settings = CrawlerSettings(
        settings_module = sys.modules[setting_module_path])
    self.loger = loger
    # {extended album page url: global album id}
    self.album_ids = {}
    self.__init_regs()
    if kwargs.has_key('extend_map_handler'):
      # an explicitly passed handler takes precedence
      self.__extend_map_handler = kwargs['extend_map_handler']
    elif kwargs.has_key('extract_setting'):
      self.__extend_map_handler = \
          ExtendMapHandler.get_instance(start_url_loader,
              kwargs['extract_setting'])
    else:
      self.__extend_map_handler = \
          ExtendMapHandler.get_instance(start_url_loader)
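    # UrlNormalize.get_instance() is assumed to be provided by a singleton
    # helper in the original project (see the commented-out init_once hook in
    # UrlNormalize above); the class as shown here does not define it.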
    self.__url_normalize = UrlNormalize.get_instance()
    self.album_infos = {}
    self.url_filter = UrlFilter().get_instance()

  def get_category_id(self, refer_url):
    ca = self.__get_category_name(refer_url)
    if 'joke' == ca:
      return 109
    elif 'ent' == ca:
      return 104
    else:
      return -1

  def __get_category_name(self, refer_url):
    return self.__extend_map_handler.settings.get_category_name(refer_url) \
        if self.__extend_map_handler else 'UNKNOWN_CATEGORY'

  def __init_regs(self):
    # load glob id url reg
    self.local_id_reg = []
    for r in self.__settings.getlist('LOCAL_ID_REG', []):
      self.local_id_reg.append(re.compile(r, re.I | re.S))
    # load album id url
    self.album_id_match_regs = {}
    for k, v in self.__settings.getdict('ALBUM_ID_URL', {}).items():
      for r in v:
        self.album_id_match_regs.setdefault(k, []).append(re.compile(r, re.I |
          re.S))
    # load extend url dict
    self.extend_album_pages = self.__settings.getdict('ALBUM_PAGE_URL', {})
    # load global album id reg
    self.global_albumid_reg = self.__settings.getdict('GLOBAL_ALBUMID_REG', {})

    # href tags
    self.href_tags = self.__settings.getlist('HREF_TAGs', [])
    self.loger.log.info('load href url tags: %s' % len(self.href_tags))

  def __get_global_albumid(self, localid, site_album_id):
    url = self.global_albumid_reg[localid].replace('(*albumid*)', site_album_id)
    from le_crawler.core.docid_generator import gen_docid
    return gen_docid(url)

  def __get_localid(self, url):
    return query_domain_from_url(url) or self.url_filter.get_domain_from_url(url)
  # deprecated: superseded by the version above
  #def __get_localid(self, url):
  #  for r in self.local_id_reg:
  #    sg = r.search(url)
  #    if not sg:
  #      continue
  #    g = sg.groups()
  #    if g:
  #      return g[0]
  #  return None

  def __parser_urls(self, sels):
    returls = []
    for urls in sels.xpath('//a'):
      for attr in self.href_tags:
        u = urls.xpath('./@%s' % attr)
        if u:
          returls.append(u.extract()[0].encode('utf8'))
    return returls

  # return [idlist]
  def __get_site_album_id(self, localid, urls):
    retlist = set()
    for u in urls:
      for r in self.album_id_match_regs[localid]:
        sg = r.search(u)
        if sg:
          g = sg.groups()
          if g:
            retlist.add(g[0])
            break
    return list(retlist)

  # return album video urls
  def __get_album_pages(self, localid, idlist, refer_url):
    returls = []
    category = self.__get_category_name(refer_url)
    postfix = ' %s|channel' % category
    if self.extend_album_pages.has_key(localid):
      # import once instead of inside the inner loop
      from le_crawler.base.url_extend import extend_url
      for pageurl in self.extend_album_pages[localid]:
        for id in idlist:
          glid = self.__get_global_albumid(localid, id)
          strtmp = pageurl.replace('(*albumid*)', id).replace('(*pagenum*)',
              '(*)') + postfix
          sta, extedurls = extend_url(strtmp, '1', '8', 0)
          if not sta:
            continue
          for eu in extedurls:
            # preprocess url
            self.album_ids[eu.split(' ')[0]] = glid
          returls.extend(extedurls)
    return returls

  def get_global_albumid_by_refer(self, refer_url):
    if self.album_ids.has_key(refer_url):
      return self.album_ids[refer_url]
    else:
      print 'Error: can not find global id:', refer_url

  # parse an enter page and return the extended album page urls
  # dict = {'enter_page': [], 'album_pages':[], 'album_infos_pages' : []}
  def parser_enter(self, url, pages):
    localid = self.__get_localid(url)
    if not localid:
      return []
    from scrapy.selector import Selector
    sel = Selector(text = pages, type = 'html')
    if not sel:
      return []
    urls = self.__parser_urls(sel)
    albumids = self.__get_site_album_id(localid, urls)
    albumurls = self.__get_album_pages(localid, albumids, url)
    # hook the start urls into extend_map_handler
    self.__extend_map_handler.settings.add_start_urls(albumurls)
    return albumurls

  # input: an album page (one of the urls produced by parser_enter)
  # return: the extracted extend-map urls (album info pages)
  def parser_album_info_pages(self, body, url, refer_url):
    sta = False
    albumid = self.get_global_albumid_by_refer(url)
    if not self.album_infos.has_key(albumid):
      sta, items = self.__extend_map_handler.settings.extract_custom_map(
          body = body,
          pageurl = url)
      if not sta:
        return []
      cateid = self.get_category_id(refer_url)
      self.album_infos[albumid] = items
      self.album_infos[albumid]['album_cid'] = cateid
      self.album_infos[albumid]['album_id'] = albumid
      self.album_infos[albumid]['album_url'] = \
        self.__url_normalize.get_unique_url(url)
    # second extract urls
    status, extend_url = self.__extend_map_handler.extract_extend_map(body = body,
        pageurl = url, ignore_empty_property = True)
    if status:
      ldict = self.__extend_map_handler.get_inlink_location_dict()
      if not ldict.has_key(extend_url[0]):
        self.loger.log.error('Failed found inlink location for %s' %
            extend_url[0])
        assert False, 'Failed found inlink location, %s' %  extend_url[0]
      else:
        locationstr = ldict[extend_url[0]]
      self.album_infos[albumid].setdefault('album_vids', {})[locationstr] = \
            [self._get_store_key(i) for i in extend_url]
      video_url = extend_url[0] if extend_url else None
      # take the album cover from the extend map of the first video, if present
      album_pic = None
      if video_url:
        extmap = self.__extend_map_handler.lookup_extend_map(video_url,
            type = 'dict')
        if extmap and extmap.has_key('cover'):
          album_pic = extmap['cover']
      if album_pic:
        self.album_infos[albumid]['album_pic'] = album_pic

    return extend_url

  # Note: this docid should be the same as the one used by today_tv_writer
  def _get_store_key(self, url):
    return md5.new(url).hexdigest()

  def parser_enter_page(self, url, sels):
    glbid = self.get_global_albumid_by_refer(url)
    if not glbid:
      return

  def parser_ablum_pages(self, sels):
    pass

  def ignore_crawl_link(self, url):
    return self.__extend_map_handler.settings.ignore_link_to_crawler(url)

  def get_album_info(self, albumid):
    return self.album_infos[albumid] \
        if self.album_infos.has_key(albumid) else {}

  def get_album_infos(self):
    return self.album_infos

  def debug_album_infos(self):
    for k, v in self.album_infos.items():
      print '-------', 'albuminfo', '-------'
      for k1, v1 in v.items():
        print k1, v1
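

# Minimal usage sketch (not part of the original module; assumes the
# le_crawler settings modules referenced above are importable and that
# UrlNormalize can be constructed directly rather than through the singleton
# accessor). The url below is a made-up example.
if __name__ == '__main__':
  normalizer = UrlNormalize()
  print normalizer.get_unique_url(
      'http://video.example.com/play?vid=123&from=feed#t=10',
      domain = 'video.example.com')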