Example #1
import urllib2

from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3 API (fetchText/findAll)
from django.utils import timezone

# Orbit_URL is assumed to be an already-imported, project-specific Django model
# used to store the scraped hotel URLs.


def url_orbit(city_list):
    """Scrape the Orbitz hotel search for each city and save every hotel URL found."""
    for i in city_list:
        # City names arrive '+'-joined; keep a human-readable copy for the database.
        citys = i.replace('+', ' ')
        page = 1
        url = "http://www.orbitz.com/shop/home?type=hotel&hotel.type=keyword&hotel.coord=&hotel.keyword.key="+i+"&hotel.locId=&hotel.chkin=&hotel.chkout=&hotel.rooms[0].adlts=2&hotel.rooms[0].chlds=0&hotel.rooms[0].chldAge[0]=&hotel.rooms[0].chldAge[1]=&hotel.rooms[0].chldAge[2]=&hotel.rooms[0].chldAge[3]=&hotel.rooms[0].chldAge[4]=&hotel.rating=&hotel.chain=&hotel.hname=&hotel.couponCode=&search=Search&hsv.page="
        hotel_pattern = 'hotelName'  # CSS class of the <h2> elements that link to hotels
        response = urllib2.urlopen(url + str(page))
        parser = BeautifulSoup(response.read())
        # Orbitz shows this message when the destination cannot be resolved.
        tag = parser.fetchText(' Sorry, but we cannot find your destination. Please re-enter a city name or airport code.')
        if tag:
            print "No such place could be found for '%s'." % i
            return
        tag = parser.fetchText('We need more information about your trip')
        if tag:
            print "We need more information for '%s'." % i
        else:
            print "We have found '%s', let's start." % i
            while True:
                page_url = url + str(page)
                print page_url
                response = urllib2.urlopen(page_url)
                parser = BeautifulSoup(response.read())
                hotels = parser.findAll('h2', {'class': hotel_pattern})
                if not hotels:
                    # No more results: we have walked past the last page.
                    break
                for j in hotels:
                    hotel_url = j.a.get('href')
                    # Store the hotel URL together with the readable city name.
                    hotel_db = Orbit_URL(url=hotel_url, city=citys, date=timezone.now())
                    hotel_db.save()
                    print hotel_url
                page += 1
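A minimal usage sketch (not from the original project): the city names are passed in the "+"-joined form that the Orbitz keyword search expects, and the surrounding Django project is assumed to be configured so that the Orbit_URL model can be saved.

if __name__ == '__main__':
    # Hypothetical city list; each entry uses '+' in place of spaces.
    cities = ['New+York', 'San+Francisco']
    url_orbit(cities)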
Example #2
import re
import urllib2

from BeautifulSoup import BeautifulSoup  # BeautifulSoup 3 API (fetchText)

# socks_monkey is assumed to be a project-specific helper that routes
# urllib2 traffic through a Tor SOCKS proxy while enabled.


def scrape_siteexplorer_inlinks(license_uri):
    '''Scrape the Yahoo! Site Explorer page for the Inlinks count.'''
    base_url = 'http://siteexplorer.search.yahoo.com/search?p='
    # Strip the leading "http://" and the trailing slash from the license URI.
    url = base_url + urllib2.quote(license_uri[7:-1])
    # Fetch the page through Tor, then restore the normal network path.
    socks_monkey.enable_tor()
    page = urllib2.urlopen(url).read()
    socks_monkey.disable_tor()
    soup = BeautifulSoup(page)
    regex = re.compile(r'Inlinks \((.*)\)')
    inlink_part = soup.fetchText(regex)
    if inlink_part:
        # inlink_part[0] is the text node that matched, e.g. "Inlinks (1,234)".
        inlink_count = regex.search(inlink_part[0]).group(1)
        return int(inlink_count.replace(',', ''))
    else:
        # If we can't find an Inlinks part and there was no exception,
        # assume that there were no inlinks and return 0.
        return 0
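A minimal usage sketch, assuming socks_monkey is importable and a local Tor SOCKS proxy is running; because of the [7:-1] slice, the license URI is expected to start with "http://" and end with a trailing slash.

if __name__ == '__main__':
    # Hypothetical Creative Commons license URI in the expected shape.
    count = scrape_siteexplorer_inlinks('http://creativecommons.org/licenses/by/3.0/')
    print 'Inlinks:', count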
Example #3
from BeautifulSoup import BeautifulSoup, Comment  # BeautifulSoup 3 API (findAll/fetchText)


class xmlparser(object):

  def __init__(self, configFile, defaultConfig):
    self.defaultDict = {}
    self.configDict = {}
    with open(configFile) as f:
      content = f.read()
      self.configFile = BeautifulSoup(content)
      # Strip XML comments so they are not picked up as config entries.
      comments = self.configFile.findAll(text=lambda text: isinstance(text, Comment))
      for comment in comments:
        comment.extract()
    with open(defaultConfig) as f:
      content = f.read()
      self.defaultConfig = BeautifulSoup(content)
      comments = self.defaultConfig.findAll(text=lambda text: isinstance(text, Comment))
      for comment in comments:
        comment.extract()
    self.allOptions = self.getAllOptions()

  def getAllOptions(self):
    # Build the default-value dictionary from the <options> block of the default
    # config.  Plain-string children fall back to str.find (which returns -1),
    # hence the explicit -1 checks.
    for option in self.defaultConfig.options:
      label = option.find('label')
      default = option.find('default')
      if label != -1 and default != -1:
        self.defaultDict[label.string] = default.string
    first = True
    ignoreCount = 0

    # Walk every tag in the user config (with no text argument, fetchText()
    # falls through to findAll() and yields the document's tags).
    for entry in self.configFile.fetchText():
      if first:
        # Skip the first tag (the document's root element).
        first = False
        continue

      if ignoreCount > 0:
        # Skip children that were already consumed as part of a nested entry.
        ignoreCount -= 1
        continue
      if len(entry.contents) > 0:
        currentValue = entry.contents[0]
      else:
        currentValue = ""
        
      entriesDict = {}
      if len(entry.contents) > 1:
        # Nested element: collect its children into a dictionary of their own
        # and make the outer loop skip them via ignoreCount.
        for singleEntry in entry.contents:
          if singleEntry == '\n':
            continue
          ignoreCount += 1
          entriesDict[singleEntry.name] = singleEntry.contents[0]
        currentValue = entriesDict
          
      if entry.name in self.configDict:
        # Repeated tag name: accumulate the values in a list.
        existingEntry = self.configDict[entry.name]
        if isinstance(existingEntry, list):
          existingEntry.append(currentValue)
        else:
          self.configDict[entry.name] = [existingEntry, currentValue]
      else:
        self.configDict[entry.name] = currentValue

  def getConfigValueRecurse(self, configDictionary, query):
    # Resolve a dotted query ("a.b.c") through nested entry dictionaries.
    querylist = query.split(".")
    if len(querylist) == 1:
      return configDictionary[querylist[0]]

    return self.getConfigValueRecurse(configDictionary[querylist[0]], ".".join(querylist[1:]))

  def getConfigValue(self, query):
    return self.getConfigValueRecurse(self.configDict, query)

  def getValue(self, key):
    # Look the key up in the user config first, then fall back to the defaults.
    try:
      returnVal = self.configDict[key]
    except KeyError:
      returnVal = None

    if returnVal is None:
      try:
        returnVal = self.defaultDict[key]
      except KeyError:
        returnVal = None

    return returnVal
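A minimal usage sketch, assuming two XML files of the shape the parser expects: a user config whose root element contains one tag per setting, and a default config with an <options> block of <label>/<default> pairs. The file names and keys below are hypothetical.

if __name__ == '__main__':
    # Hypothetical file names; both must exist on disk.
    parser = xmlparser('config.xml', 'default_config.xml')
    print parser.getValue('timeout')            # user value, or the default as fallback
    print parser.getConfigValue('server.port')  # dotted query into a nested entry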