def _fetch_orbit_page(page_url):
    """Download page_url and return its body parsed with BeautifulSoup."""
    response = urllib2.urlopen(page_url)
    try:
        return BeautifulSoup(response.read())
    finally:
        # Release the connection even if reading or parsing fails.
        response.close()


def url_orbit(city_list):
    """Collect hotel links from Orbitz search results for each city.

    For every entry of city_list the keyword search is requested page by
    page.  Each hotel link found under an <h2 class="hotelName"> heading is
    stored as an Orbit_URL row (url, city, date) and printed.

    city_list: iterable of search keywords; each keyword is inserted into
        the query string as given, and '+' is replaced by a space for the
        stored city name.

    Returns None.  A city that Orbitz cannot find, or for which it asks for
    more information, is reported and skipped; the remaining cities are
    still processed.
    """
    hotel_pattern = 'hotelName'
    for i in city_list:
        citys = i.replace('+', ' ')
        url = (
            "http://www.orbitz.com/shop/home?type=hotel&hotel.type=keyword"
            "&hotel.coord=&hotel.keyword.key=" + i +
            "&hotel.locId=&hotel.chkin=&hotel.chkout="
            "&hotel.rooms[0].adlts=2&hotel.rooms[0].chlds=0"
            "&hotel.rooms[0].chldAge[0]=&hotel.rooms[0].chldAge[1]="
            "&hotel.rooms[0].chldAge[2]=&hotel.rooms[0].chldAge[3]="
            "&hotel.rooms[0].chldAge[4]=&hotel.rating=&hotel.chain="
            "&hotel.hname=&hotel.couponCode=&search=Search&hsv.page="
        )
        page = 1
        page_url = url + str(page)
        parser = _fetch_orbit_page(page_url)

        not_found = parser.fetchText(
            ' Sorry, but we cannot find your destination. Please re-enter'
            ' a city name or airport code.')
        if not_found:
            print("'%s', such place cannot be found." % (i))
            # Skip only this city; returning here would silently drop
            # every city that follows it in city_list.
            continue

        if parser.fetchText('We need more information about your trip'):
            print("we need more information for such place '%s'." % (i))
            continue

        print("we have found '%s',let's start." % (i))
        # The first result page is already parsed above, so it is reused
        # instead of being downloaded a second time.
        while True:
            print(page_url)
            hotels = parser.findAll('h2', {'class': hotel_pattern})
            if not hotels:
                # A page without hotel headings marks the end of results.
                break
            for j in hotels:
                anchor = j.a
                hotel_url = anchor.get('href') if anchor is not None else None
                if not hotel_url:
                    # Heading without a usable link: nothing to store.
                    continue
                hotel_db = Orbit_URL(url=hotel_url, city=citys,
                                     date=timezone.now())
                hotel_db.save()
                print(hotel_url)
            page += 1
            page_url = url + str(page)
            parser = _fetch_orbit_page(page_url)
def scrape_siteexplorer_inlinks(license_uri):
    '''Scrape the Yahoo! SiteExplorer page for Inlinks count.

    license_uri: URI to look up.  The first seven characters and the last
        character are dropped before the query is built (presumably the
        "http://" prefix and a trailing "/" -- confirm against callers).

    Returns the number shown in the page's "Inlinks (N)" text as an int,
    or 0 when no such text is present.  Errors raised while fetching the
    page propagate to the caller.
    '''
    base_url = 'http://siteexplorer.search.yahoo.com/search?p='
    url = base_url + urllib2.quote(license_uri[7:-1])

    socks_monkey.enable_tor()
    try:
        response = urllib2.urlopen(url)
        try:
            page = response.read()
        finally:
            response.close()
    finally:
        # Always switch Tor back off, even when the request fails;
        # otherwise the failure would leave it enabled for later code.
        socks_monkey.disable_tor()

    soup = BeautifulSoup(page)
    regex = re.compile(r'Inlinks \((.*)\)')
    inlink_part = soup.fetchText(regex)
    if not inlink_part:
        # if we can't find an Inlinks part and we have no exception,
        # then just assume that there were no inlinks and return 0.
        return 0

    # search() rather than match(): the text node may have whitespace or
    # other text in front of "Inlinks (", and match() only succeeds at
    # position 0, which would make .group() fail on None.
    found = regex.search(inlink_part[0])
    if found is None:
        return 0
    return int(found.group(1).replace(',', ''))
class xmlparser(object):
    """Load an XML config file and a defaults file into dictionaries.

    After construction:
      configDict   maps tag names from the config file to their value: a
                   text value, a dict of child-tag values, or a list of
                   such values when a tag name occurs more than once.
      defaultDict  maps each option label from the defaults file to its
                   default value.
    """

    def __init__(self, configFile, defaultConfig):
        """Parse both files and fill configDict / defaultDict.

        configFile: path of the configuration file.
        defaultConfig: path of the file describing default options.
        """
        self.defaultDict = {}
        self.configDict = {}
        self.configFile = self._loadWithoutComments(configFile)
        self.defaultConfig = self._loadWithoutComments(defaultConfig)
        # getAllOptions() works by filling the two dictionaries and has
        # no return value, so allOptions is always None.  The attribute
        # is kept because it existed before.
        self.allOptions = self.getAllOptions()

    @staticmethod
    def _loadWithoutComments(path):
        """Parse the file at path and remove every comment node from it."""
        with open(path) as f:
            content = f.read()
        soup = BeautifulSoup(content)
        isComment = lambda text: isinstance(text, Comment)
        for comment in soup.findAll(text=isComment):
            comment.extract()
        return soup

    def getAllOptions(self):
        """Populate defaultDict and configDict from the parsed documents.

        Returns None; results are stored on the instance.
        """
        options = self.defaultConfig.options
        if options is None:
            # No <options> element in the defaults file: no defaults.
            options = []
        for option in options:
            label = option.find('label')
            default = option.find('default')
            # The -1 comparison presumably guards against text-node
            # children, where find() would be the string method.  None is
            # what a tag lookup yields when the child tag is absent.
            hasBoth = (label is not None and default is not None
                       and label != -1 and default != -1)
            if hasBoth:
                self.defaultDict[label.string] = default.string

        first = True
        ignoreCount = 0
        for entry in self.configFile.fetchText():
            if first:
                # The first element returned is skipped (presumably the
                # document root -- confirm).
                first = False
                continue
            if ignoreCount > 0:
                # This element was already consumed as a child of an
                # earlier entry.
                ignoreCount -= 1
                continue

            if len(entry.contents) > 0:
                currentValue = entry.contents[0]
            else:
                currentValue = ""

            if len(entry.contents) > 1:
                # Process as a separate dictionary
                entriesDict = {}
                for singleEntry in entry.contents:
                    if singleEntry == '\n':
                        continue
                    ignoreCount += 1
                    if len(singleEntry.contents) > 0:
                        childValue = singleEntry.contents[0]
                    else:
                        # Empty child tag: same fallback as an empty
                        # top-level entry.
                        childValue = ""
                    entriesDict[singleEntry.name] = childValue
                currentValue = entriesDict

            if entry.name in self.configDict:
                existingEntry = self.configDict[entry.name]
                if isinstance(existingEntry, list):
                    # Append to the stored list itself; appending to a
                    # copy would drop the third and later duplicates.
                    existingEntry.append(currentValue)
                else:
                    self.configDict[entry.name] = [existingEntry,
                                                   currentValue]
            else:
                self.configDict[entry.name] = currentValue

    def getConfigValueRecurse(self, configDictionary, query):
        """Resolve a dotted query such as "a.b.c" inside configDictionary.

        Raises KeyError when any component of the query is missing.
        """
        # For multiple layers of XML
        head, separator, rest = query.partition(".")
        if not separator:
            return configDictionary[head]
        # The result of the nested lookup must be returned, otherwise
        # every multi-level query evaluates to None.
        return self.getConfigValueRecurse(configDictionary[head], rest)

    def getConfigValue(self, query):
        """Look up a (possibly dotted) query in configDict."""
        return self.getConfigValueRecurse(self.configDict, query)

    def getValue(self, key):
        """Return the configured value for key, else its default, else None.

        A value of None stored in configDict is treated as missing, so the
        default is consulted in that case as well.
        """
        for source in (self.configDict, self.defaultDict):
            try:
                value = source[key]
            except (KeyError, TypeError):
                # Missing key, or a key that cannot be hashed.
                continue
            if value is not None:
                return value
        return None