Ejemplo n.º 1
0
def downloadKidSources(authURL):
    """Download the HTML page of every Netflix Kids category to disk.

    The function asks the pathEvaluator endpoint for the first 41 entries
    of ``genreList``, collects a category identifier from each usable
    entry, fetches ``/Kids/category/<identifier>`` for each one and writes
    the gzip-decoded body to ``htm_sources/kid_<identifier>``.

    Args:
        authURL: Token embedded in the pathEvaluator request body.  When it
            is the empty string, a token is scraped from the ``/Kids`` page.

    Returns:
        The list of category identifiers that were downloaded.

    Note:
        ``open`` is called on a path under ``htm_sources/``, so that
        directory must already exist.  The genre request is retried
        without limit until the response body parses as JSON.
    """
    kidCategories = []
    if authURL == "":
        # No token supplied: pull one out of the Kids landing page.
        conn = httplib.HTTPSConnection("www.netflix.com")
        conn.request(method="GET", url="/Kids", headers=kidHeaderDict)
        response = conn.getresponse()
        data = response.read()
        showHTM = GzipFile(fileobj=StringIO(data)).read()
        # 11 == len('"authURL":"'); take everything up to the closing quote.
        authURL = showHTM[(showHTM.find('"authURL":"')+11):].split('"')[0]

    jsonPut = '{"paths":[["genreList",{"from":0,"to":40},["id","menuName"]],["genreList","summary"]],"authURL":"'+authURL+'"}'

    while True:
        conn = httplib.HTTPSConnection("www.netflix.com")
        conn.request(method="POST", url="/api/shakti/e6f64e0c/pathEvaluator?withSize=true&materialize=true&model=harris", body=jsonPut, headers=jsonHeaderDict)
        response = conn.getresponse()
        data = response.read()
        jsonString = GzipFile(fileobj=StringIO(data)).read()

        try:
            jsonData = json.loads(jsonString)
            break
        except ValueError:
            # Body was not valid JSON; ask again.  Only ValueError is
            # caught so Ctrl-C and genuine bugs are not swallowed.
            continue

    genreList = jsonData['value']['genreList']
    for x in range(0, 41):
        entry = genreList[str(x)]
        # Only two-element entries are used; their second element becomes
        # the category path component (presumably a reference of the form
        # [collection, id] -- TODO confirm against the API response).
        if len(entry) == 2:
            kidCategories.append(entry[1])

    for kidCategory in kidCategories:
        conn = httplib.HTTPSConnection("www.netflix.com")
        conn.request(method="GET", url="/Kids/category/"+kidCategory, headers=kidHeaderDict)
        response = conn.getresponse()
        data = response.read()
        categoryHTM = GzipFile(fileobj=StringIO(data)).read()
        # Report the category being fetched (the original concatenated the
        # whole list here, which raises TypeError).
        print("Getting Kids Category " + kidCategory),
        print(response.status, response.reason)

        # The context manager closes the file even if write() fails.
        with open("htm_sources/kid_" + kidCategory, 'wt') as outFile:
            outFile.write(categoryHTM)
    return kidCategories
Ejemplo n.º 2
0
    def readIDX(self):
        """Load the index file into parallel word and offset lists.

        The file named by ``self.__idxFileName`` is read as a sequence of
        records.  Each record is a NUL-terminated UTF-8 word followed by
        ``2 * __maxOffsetLen`` bytes that are unpacked with ``"!II"`` (two
        big-endian unsigned 32-bit integers).  A ``.gz`` file name is
        decompressed fully into memory; any other file is memory-mapped
        read-only.

        On return ``self.__wordList`` holds the words in file order and
        ``self.__posList`` holds the first integer of every record plus one
        trailing value: the sum of the last record's two integers
        (presumably offset + size, i.e. the end of the data -- TODO
        confirm).  An index without records yields ``[]`` and ``[0]``.

        Raises:
            struct.error: if a record's numeric part is not exactly the
                8 bytes that ``"!II"`` requires.
        """
        self.logger.debug("loading idx file ...")
        leng = self.__class__.__maxOffsetLen * 2
        # NOTE(review): "!II" consumes exactly 8 bytes, so this only works
        # when __maxOffsetLen is 4 -- confirm no other width is expected.
        w, p = [], []
        f = None
        fmap = None
        cur = 0
        # Default for an index with no records; without it the terminating
        # append below would read an unassigned local.
        pos = (0, 0)

        try:
            # Opening happens inside the try block so the finally clause
            # also releases the file when mmap() itself fails.
            if self.__idxFileName.lower().endswith(".gz"):
                self.logger.debug("idx file is gzip format!")
                with GzipFile(self.__idxFileName, "rb") as gzFile:
                    fmap = gzFile.read()
            else:
                f = open(self.__idxFileName, "rb")
                fmap = mmap.mmap(f.fileno(), 0, access=mmap.ACCESS_READ)

            while True:
                # Inlined to avoid calling self.__readUntilZeroEx
                idx = fmap.find(b"\0", cur)
                if idx != -1:
                    cur, wordStr = idx + 1, fmap[cur:idx].decode("utf-8")
                else:
                    wordStr = ""

                if not wordStr:
                    # End of data (or an empty word): close the offset list
                    # with the end position of the last record.
                    p.append(pos[0] + pos[1])
                    break

                w.append(wordStr)
                # Inlined to avoid calling self.__readNumbers
                cur, pos = cur + leng, struct.unpack("!II", fmap[cur : cur + leng])
                p.append(pos[0])
        finally:
            if hasattr(fmap, "close"):
                fmap.close()
            else:  # a large bytes object (or None): just drop the reference
                del fmap
            if f is not None:
                f.close()

        self.logger.debug("len(w)=%d len(p)=%d, %d", len(w), len(p), p[-1])
        self.logger.debug("sizeof w is %d, sizeof p is %d", sys.getsizeof(w), sys.getsizeof(p))
        self.__wordList, self.__posList = w, p
        self.logger.debug("idx file loaded.")
Ejemplo n.º 3
0
def crawlShow(showName, showLink, authURL, videoDict, final_results):
  """Record the watch URL of a show, or of each of its episodes.

  The show's season list is requested from the pathEvaluator endpoint.  A
  show without seasons is recorded as a single entry; otherwise every
  season's episodes are requested and one entry per episode is recorded.
  Each entry is written to ``final_results`` as a UTF-8 encoded
  ``"<title>\\t<url>\\n"`` line and stored in ``videoDict`` under the title.

  Args:
    showName: Display name used as the entry title (or its prefix).
    showLink: Identifier of the show, inserted into request paths and URLs.
    authURL: Token for the request body; when empty, one is scraped from
      the show's ``/Kids/title/`` page.
    videoDict: Mapping updated in place with title -> encoded watch URL.
    final_results: Object with a ``write`` method that receives the lines.

  Returns:
    None.  The function gives up silently if the season list cannot be
    fetched, and skips any season whose episode list cannot be fetched.
  """
  if authURL == "":
    conn = httplib.HTTPSConnection("www.netflix.com")
    conn.request(method="GET", url="/Kids/title/" + showLink,headers=kidHeaderDict)
    response = conn.getresponse()
    data = response.read()
    showHTM = GzipFile(fileobj=StringIO(data)).read()
    # 11 == len('"authURL":"'); take everything up to the closing quote.
    authURL = showHTM[(showHTM.find('"authURL":"')+11):].split('"')[0]

  jsonPut = '{"paths":[["videos",'+showLink+',"seasonList",{"from":0,"to":'+str(maxSeasons)+'},"summary"],["videos",'+showLink+',"seasonList","summary"],["videos",'+showLink+',"seasonList","current","episodes",{"from":-1,"to":'+str(maxEpisodes)+'},["summary","synopsis","title","runtime","bookmarkPosition"]],["videos",'+showLink+',"seasonList","current","episodes",{"from":-1,"to":'+str(maxEpisodes)+'},"interestingMoment","_342x192","jpg"],["videos",'+showLink+',"seasonList","current","episodes","summary"],["videos",'+showLink+',"seasonList","current","episodes","current","summary"]],"authURL":"'+authURL+'"}'

  jsonData = _postPathEvaluator(jsonPut)
  if jsonData is None:
    return

  seasonEntries = jsonData['value']['videos'][showLink]["seasonList"]
  seasonList = []
  for x in range(0, maxSeasons+1):
    # Only two-element entries are used; the second element is the
    # season identifier passed to the per-season request below.
    if len(seasonEntries[str(x)]) == 2:
      seasonList.append(seasonEntries[str(x)][1])

  if not seasonList:
    # No seasons: the show itself is the only watchable item.
    showURL = "http://www.netflix.com/watch/" + showLink
    final_results.write((showName + "\t" + showURL + '\n').encode('utf8'))
    videoDict[showName] = showURL.encode('utf8')
    return

  for seasonLink in seasonList:
    jsonPut = '{"paths":[["seasons",'+seasonLink+',"episodes",{"from":-1,"to":'+str(maxEpisodes)+'},["summary","synopsis","title","runtime","bookmarkPosition"]],["seasons",'+seasonLink+',"episodes",{"from":-1,"to":'+str(maxEpisodes)+'},"interestingMoment","_342x192","jpg"],["seasons",'+seasonLink+',"episodes","summary"],["seasons",'+seasonLink+',"episodes","current","summary"]],"authURL":"'+authURL+'"}'

    # Same bounded retry as the season-list request above; the original
    # retried here forever.  A season that never parses is skipped.
    jsonData = _postPathEvaluator(jsonPut)
    if jsonData is None:
      continue

    episodeEntries = jsonData['value']['seasons'][seasonLink]["episodes"]
    episodeList = []
    for x in range(-1, maxEpisodes+1):
      if len(episodeEntries[str(x)]) == 2:
        episodeList.append(episodeEntries[str(x)][1])

    for episodeLink in episodeList:
      summary = jsonData['value']['videos'][episodeLink]["summary"]
      seasonNum  = str(summary["season"])
      episodeNum = str(summary["episode"])
      title = showName + "/Season " + seasonNum + " : Episode " + episodeNum
      episodeURL = "http://www.netflix.com/watch/" + episodeLink
      final_results.write((title + "\t" + episodeURL + '\n').encode('utf8'))
      videoDict[title] = episodeURL.encode('utf8')


def _postPathEvaluator(jsonPut, attempts=5):
  """POST jsonPut to the pathEvaluator endpoint and return the parsed JSON.

  The gzip-decoded response body is parsed with ``json.loads``.  The request
  is repeated up to ``attempts`` times while the body is not valid JSON;
  ``None`` is returned if no attempt produces a parseable body.
  """
  for _ in range(attempts):
    conn = httplib.HTTPSConnection("www.netflix.com")
    conn.request(method="POST", url="/api/shakti/e6f64e0c/pathEvaluator?withSize=true&materialize=true&model=harris", body=jsonPut, headers=jsonHeaderDict)
    response = conn.getresponse()
    data = response.read()
    jsonString = GzipFile(fileobj=StringIO(data)).read()

    try:
      return json.loads(jsonString)
    except ValueError:
      # Not valid JSON; try again.  Only ValueError is caught so Ctrl-C
      # and genuine bugs are not swallowed.
      continue
  return None