Example #1
def crawler(u, n):
    # Crawl a Douban user's status timeline and save each status (plus its
    # attached images) under <n>/status.
    url = 'https://m.douban.com/rexxar/api/v2/status/user_timeline/'
    path = n + os.sep + 'status'
    if not os.path.isdir(path):
        os.makedirs(path)
    url += u
    query = ''
    while True:
        print(url + query)
        status = json.loads(util.open_url(url + query))
        if not status['items']:
            break
        for item in status['items']:
            item = item['status']
            jpath = os.path.join(path, item['id'] + '.json')
            if os.path.isfile(jpath):
                # Already downloaded: stop paging here (replace with
                # `continue` to skip just this status instead).
                break
            with open(jpath, 'wb') as f:
                f.write(json.dumps(item))
            # Save every attached image next to the status JSON.
            idx = 0
            for image in item['images']:
                img_url = image['large']['url']
                suffix = img_url[img_url.rfind('/') + 1:]
                img_path = os.path.join(
                    path, item['id'] + '_' + str(idx) + '.' + suffix)
                with open(img_path, 'wb') as f:
                    f.write(util.open_url(img_url))
                idx += 1
        # Page backwards from the oldest status returned in this batch.
        query = '?max_id=' + status['items'][-1]['status']['id']
Example #2
    def get_game_datetime(self):
        url = self.url_provider.get_page_url('overview')
        res = util.open_url(self.browser, url)
        soup = BeautifulSoup(res.read(), "lxml")

        datetime_data = soup.find("li", {"class": "OGameClock"}).text
        game_datetime = datetime.datetime.strptime(datetime_data, "%d.%m.%Y %H:%M:%S")
        return game_datetime
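The OGameClock text is parsed with a fixed day-first format; a quick illustration of what that format string accepts (the timestamp below is an invented sample value):

>>> import datetime
>>> datetime.datetime.strptime("22.03.2016 18:45:12", "%d.%m.%Y %H:%M:%S")
datetime.datetime(2016, 3, 22, 18, 45, 12)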
Example #3
    def get_resources(self, planet):
        self.logger.info('Getting resources data for planet %s', planet.name)
        url = self.url_provider.get_page_url('resources', planet)
        res = util.open_url(self.browser, url)
        soup = BeautifulSoup(res.read(), "lxml")

        # OGame renders the counts with '.' as the thousands separator.
        metal = int(soup.find(id='resources_metal').text.replace('.', ''))
        crystal = int(soup.find(id='resources_crystal').text.replace('.', ''))
        deuterium = int(soup.find(id='resources_deuterium').text.replace('.', ''))
        energy = int(soup.find(id='resources_energy').text.replace('.', ''))

        return Resources(metal, crystal, deuterium, energy)
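Resources is defined elsewhere in the project and not shown in this excerpt; a minimal stand-in, assuming it is just a value container for the four readings:

from collections import namedtuple

# Hypothetical stand-in; the project's real Resources class may add behaviour
# such as arithmetic or formatting helpers.
Resources = namedtuple('Resources', ['metal', 'crystal', 'deuterium', 'energy'])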
Example #4
    def get_planets(self):
        self.logger.info('Getting planets')
        url = self.url_provider.get_page_url('resources')
        res = util.open_url(self.browser, url)
        soup = BeautifulSoup(res.read(), "lxml")

        planets = []
        # The currently selected planet is described by meta tags on the page.
        current_planet_id = soup.find("meta", {"name": "ogame-planet-id"})['content']
        current_planet_name = soup.find("meta", {"name": "ogame-planet-name"})['content']
        current_planet_koords = soup.find("meta", {"name": "ogame-planet-coordinates"})['content']
        current_planet = Planet(current_planet_name, current_planet_id, current_planet_koords)
        planets.append(current_planet)

        # The other planets come from the planet list links.
        links = soup.find_all("a", {"class": "planetlink tooltipRight js_hideTipOnMobile"})
        other_planets = [Planet(str(link.find("span", {"class": "planet-name  "}).contents[0]),
                                urlparse.parse_qs(link['href'])['cp'][0],
                                self.parse_coordinates(str(link.find("span", {"class": "planet-koords  "}).contents[0])))
                         for link in links]
        if len(other_planets) > 1:
            planets.extend(other_planets)
        return planets
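Planet and parse_coordinates also come from elsewhere in the project; minimal stand-ins, assuming Planet is a plain record and the coordinate label looks like '[1:234:5]':

import re
from collections import namedtuple

# Hypothetical stand-ins; the project's real definitions may differ.
Planet = namedtuple('Planet', ['name', 'id', 'coordinates'])

def parse_coordinates(text):
    # Extract '1:234:5' from a label such as '[1:234:5]'.
    match = re.search(r'\d+:\d+:\d+', text)
    return match.group(0) if match else None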
Example #5
    def log_index_page(self):
        """Logs the index page, used for test purposes"""
        url = self.url_provider.get_page_url('overview')
        res = util.open_url(self.browser, url)
        self.logger.info(res.read())
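The OGame examples above pass a browser object into util.open_url and call .read() on the result; a minimal sketch of that variant, assuming a mechanize-style browser whose open() returns a file-like response (the real helper may add login handling or retries):

def open_url(browser, url):
    # Assumed behaviour: navigate the shared browser session to the URL and
    # return the response object so callers can .read() the HTML.
    return browser.open(url)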
Example #6
def import_json_from_url(url, description, batch_size, key=None):
  try:
    if batch_size is not None:
      log.info("Batch downloading {0} list from {1} ... ".format(description, url))
      sys.stdout.flush()

      start = int(time())
      done = 0
      failed = 0
      last_elapsed = 0

      batch = []
      encoded = ''
      stream = util.open_url(url)
      if stream is None:
        return
      while True:
        line = util.read_stream_line(stream)
        if not line:
          break
        m = _re_json_line.match(line)
        if m is None:
          continue
        try:
          obj = json.loads(m.group(1))
        except ValueError:
          log.debug("Line failed JSON parse: {0}".format(line))
          failed += 1
          continue
        batch.append(obj)
        # Flush a full batch to the caller and log progress every ~30 seconds.
        if len(batch) >= batch_size:
          for obj in batch:
            yield obj
          done += len(batch)
          elapsed = int(time()) - start
          if elapsed - last_elapsed >= 30:
            log.info("Loaded {0} row(s) of {1} data to DB...".format(done, description))
            last_elapsed = elapsed
          batch = []
      # Yield whatever is left in the final partial batch.
      done += len(batch)
      for obj in batch:
        yield obj
      if failed:
        log.info("Lines failing JSON parse: {0}".format(failed))
      log.info("Loaded {0} row(s) of {1} data to DB...".format(done, description))
      log.info("Done.")
    else:
      log.info("Downloading {0} list from {1} ... ".format(description, url))
      sys.stdout.flush()
      encoded = util.read_from_url(url)
      log.info("Done.")
      log.info("Loading {0} data...".format(description))
      sys.stdout.flush()
      obj = json.loads(encoded)
      log.info("Done.")
      log.info("Adding {0} data to DB...".format(description))
      sys.stdout.flush()
      if key is not None:
        obj = obj[key]
      for o in obj:
        yield o
      log.info("Done.")
    # Force GC collection to try to avoid memory errors
    encoded = None
    obj = None
    batch = None
    gc.collect()
  except MemoryError:
    encoded = None
    obj = None
    batch = None
    gc.collect()
    raise
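Because import_json_from_url is a generator, the caller drives the download and decides what to do with each decoded object; a hedged sketch of one way it might be consumed, assuming a JSON-lines dump at a placeholder URL and a caller-supplied insert function:

def load_into_db(insert_row, url='https://example.com/systems.jsonl'):
    # Placeholder URL and description; batch_size controls how many decoded
    # objects are buffered before they are handed back to this loop.
    total = 0
    for obj in import_json_from_url(url, 'system', batch_size=1024):
        insert_row(obj)
        total += 1
    return total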