Example #1
0
def update_airports():
  """Get the city from Latte and update airport."""
  # build bulk
  from ghost_spider.elastic import LocationAirportEs
  from ghost_spider import progressbar
  import requests
  import time
  size = 100
  page = 0
  progress = None
  total = 0
  query = {"query": {"match_all": {}}, "size": size, "from": 0}
  query["sort"] = [{"name.untouched": "asc"}]
  while True:
    query["from"] = page * size
    results = LocationAirportEs.search(query)
    if not progress:
      total = results["hits"]["total"]
      print "total %s" % total
      progress = progressbar.AnimatedProgressBar(end=total, width=100)
      progress.show_progress()

    if not results["hits"]["hits"]:
      break
    
    page += 1
    bulk = ""
    for result in results["hits"]["hits"]:
      location = result["_source"]
      data_id = result["_id"]
      if not location.get('place_id'):
        ret_json = []
        url = u'https://latte.la/travel/json?term=%s&kind=area1-area2-area3-city1-city2-city3' % location["area"]
        ret = requests.get(url, params={})
        ret_json = ret.json()
        found_place = None
        progress + 1
        progress.show_progress()

        for place in ret_json:
          if place.get('value') == location["area"]:
            found_place = place
            break
        if found_place:
          location["place_id"] = found_place["placeid"]
          location["place_url_key"] = found_place["url_key"]
      else:
        location["place_url_key"] = location["url_key"]
        location["place_id"] = long(location.get('place_id') or 0)
        location.pop('url_key')

      bulk += LocationAirportEs.bulk_data(location, action="update", data_id=data_id)

    if bulk:
      LocationAirportEs.send(bulk)
  if progress:
    progress + total
    progress.show_progress()
  print " "
Example #2
0
def dump_airports(filename):
  # build bulk
  from ghost_spider.elastic import LocationAirportEs
  from ghost_spider.util import CsvWriter
  from ghost_spider import progressbar
  import requests
  import time
  import os
  size = 100
  page = 0
  progress = None
  total = 0
  query = {"query": {"match_all": {}}, "size": size, "from": 0}
  query["sort"] = [{"country.untouched": "asc"}]
  first_row = ["name", "name_eng", "IATA", "NTGA", "country", "city", "placeid", "place_url"]
  if os.path.exists(filename):
    os.remove(filename)
  while True:
    query["from"] = page * size
    results = LocationAirportEs.search(query)
    if not progress:
      total = results["hits"]["total"]
      print "total %s" % total
      progress = progressbar.AnimatedProgressBar(end=total, width=100)
      progress.show_progress()

    if not results["hits"]["hits"]:
      break
    
    page += 1
    bulk = ""
    for result in results["hits"]["hits"]:
      location = result["_source"]
      data_id = result["_id"]
      progress + 1
      progress.show_progress()
      row = []
      row.append(location.get('name'))
      row.append(location.get('name_eng'))
      row.append(location.get('code') or '')
      row.append(location.get('code2') or '')
      row.append(location.get('country') or '')
      row.append(location.get('area') or '')
      row.append(unicode(location.get('place_id') or ''))
      row.append(location.get('url_key') or '')

      CsvWriter.write_to_csv(filename, row, firs_row=first_row)

  if progress:
    progress + total
    progress.show_progress()
  print " "
  def parse(self, response):
    sel = Selector(response)
    links = sel.xpath(AirportSelectors.LIST_AIRPORTS).extract()
    current_page = long(response.meta.get('current_page') or 1)
    print u'links: %s, %s' % (len(links), response.url)

    if links:
      for link in links:
        canonical = u'%s%s' % (self.base_url, link.split('?')[0])
        if LocationAirportEs.check_by_url(canonical):
          continue
        request = Request(canonical, callback=self.parse_airport, errback=self.parse_err)
        request.meta['page_kind'] = 'airport'
        yield request
      if len(links) >= 100:
        next_page = u'%s?pageid=%s' % (response.url.split('?')[0], current_page + 1)
        request = Request(next_page, callback=self.parse, errback=self.parse_err)
        request.meta['page_kind'] = 'list'
        request.meta['current_page'] = current_page + 1
        yield request