Beispiel #1
0
    def __getLocation(self,name):
        """
        Query the locquery endpoint for `name` and return the "locid" field
        of the decoded JSON response.

        Raises KeyError if the response has no "locid" entry.
        """
        queryString = urllib.urlencode({
            "okc_api"   :   1,
            "func"      :   "query",
            "query"     :   name
        })
        locationUrl = "http://www.okcupid.com/locquery?%s" % queryString

        response = SessionManager.getSession().get(locationUrl)
        return json.loads(response.text)["locid"]
Beispiel #2
0
    def __fillUserProfile(self):
        """
        Fetch the profile JSON for `self.getUserName()` and store its "age"
        value (formatted as a string) via `self.__config.set("User", "Age", ...)`.
        """
        queryString = urllib.urlencode({"okc_api": 1})

        # Obtain the session before building the URL so the calls happen in
        # the same order as a single chained expression would evaluate them.
        session = SessionManager.getSession()
        profileUrl = "http://www.okcupid.com/profile/%s?%s" % (self.getUserName(), queryString)
        profile = json.loads(session.get(profileUrl).text)

        ageText = "%s" % profile["age"]
        self.__config.set("User", "Age", ageText)
Beispiel #3
0
def doSearchJSON(url):
    """
    Run a paged search starting from `url` and collect every result page.

    Each request appends "&timekey=...&count=...&low=...&okc_api=1" to `url`
    (so `url` presumably already carries a query string - confirm with callers).
    Pages are requested 200 results at a time.  The first request uses a
    timekey of 1; every later request reuses the "cache_timekey" value
    returned by the previous page.

    Connection errors and non-200 responses are retried, without limit,
    after a 30 second pause.  Paging stops at the first page whose
    "amateur_results" list is empty.

    Returns a list of decoded JSON dicts, one per non-empty page, each with
    an added "url" key holding the URL that page was fetched from.

    TODO -  this currently does one search too many, but we can not depend on the result numbers since
            blocked profiles are not returned.   So gotta make that more efficient.
    """
    session = SessionManager.getSession()
    pageSize = 200
    rv = []

    i = 0
    timeKey = 1
    while True:
        # "low" is the 1-based index of the first result on the page.
        newURL = url + "&timekey=%s&count=%s&low=%s&okc_api=1" % (timeKey, pageSize, (1 + i * pageSize))
        if i == 0:
            # Only the very first request carries the "#Search" fragment.
            newURL += "#Search"

        try:
            logging.info(newURL)
            page = session.get(newURL)
        except requests.exceptions.ConnectionError:
            logging.warning("Connection error, sleeping 30 seconds")
            time.sleep(30)
            continue

        if page.status_code != 200:
            # The message states the pause that is actually taken below.
            logging.warning("Page Error [%s:%s] sleeping 30 seconds" % (page.status_code, page.reason))
            time.sleep(30)
            continue

        data = json.loads(page.text)
        data["url"] = newURL
        matches = data["amateur_results"]
        logging.info("Search total_matches [%s] matches [%s]" % (data["total_matches"], len(matches)))
        if not matches:
            # An empty page marks the end of the results; report the grand total.
            total = sum(len(v["amateur_results"]) for v in rv)
            logging.info("\tTotal [%s]" % total)
            return rv

        rv.append(data)
        timeKey = data["cache_timekey"]
        i += 1
        # Pause between successful page requests.
        time.sleep(10)
Beispiel #4
0
    def crawlProfiles(self,names):
        """
        Download and save profile data for the user names in `names`.

        For each profile, three kinds of data are fetched and written to disk:
          * the profile JSON          -> "<userid>.json" under the profile path
          * the profile HTML page     -> "<userid>.html" under the profile path
          * every page of questions   -> "<userid>.<low>.json" under the answer path

        The loop ends when `names` is exhausted or once `self.getMaxSample()`
        attempts have been made.  A failed profile-JSON request is retried on
        the next pass and counts as another attempt.

        NOTE: once the profile JSON has been read, a failure while fetching
        the HTML page moves on to the next name; the HTML and question pages
        for that profile are not retried.
        """
        count   =   0
        idx     =   0
        session =   SessionManager.getSession()
        while True:

            if idx >= len(names):
                return

            if count >= self.getMaxSample():
                return

            name    =   names[idx]
            logging.info("[%s]" % name)
            # Counted per attempt, so retries of the same name use up the budget too.
            count += 1
            #-----------------------------------------------------------------------------------------
            # Profile JSON
            url = "http://www.okcupid.com/profile/%s?okc_api=1" % name
            try:
                logging.info(url)
                page = session.get(url)
            except requests.exceptions.ConnectionError:
                logging.warning("Connection error, sleeping 30 seconds")
                time.sleep(30)
                continue
            if page.status_code != 200:
                logging.warning("Page Error [%s:%s] sleeping 30 seconds" % (page.status_code,page.reason))
                time.sleep(30)
                continue

            data    =       json.loads(page.text)
            # Advance only after the JSON was fetched; earlier failures retry the same name.
            idx     +=  1
            if int(data["status"]) > 100:
                logging.warning("Profile returned status [%s:%s]" % (data["status"],data["status_str"]))
                continue
            userId      =   data["userid"]
            fileName    =   os.path.join(self.__profilePath,"%s.json" % userId)
            with open(fileName,'wb') as fp:
                json.dump(data,fp,indent=4, separators=(',', ': '))
            #-----------------------------------------------------------------------------------------
            # Profile HTML
            url = "http://www.okcupid.com/profile/%s" % name
            try:
                logging.info(url)
                page = session.get(url)
            except requests.exceptions.ConnectionError:
                logging.warning("Connection error, sleeping 30 seconds")
                time.sleep(30)
                continue
            if page.status_code != 200:
                logging.warning("Page Error [%s:%s] sleeping 30 seconds" % (page.status_code,page.reason))
                time.sleep(30)
                continue

            fileName    =   os.path.join(self.__profilePath,"%s.html" % userId)
            with open(fileName,'wb') as fp:
                fp.write(page.text.encode("UTF-8"))
            #-----------------------------------------------------------------------------------------
            # Question pages: "low" starts at 1 and advances by 10 per page.
            low =   1
            while True:
                url = "http://www.okcupid.com/profile/%s/questions?okc_api=1&low=%d" % (name,low)
                # Exactly one request per page, made inside the try so that
                # connection errors are handled by the retry path below.
                try:
                    logging.info(url)
                    page = session.get(url)
                except requests.exceptions.ConnectionError:
                    logging.warning("Connection error, sleeping 30 seconds")
                    time.sleep(30)
                    continue
                if page.status_code != 200:
                    logging.warning("Page Error [%s:%s] sleeping 30 seconds" % (page.status_code,page.reason))
                    time.sleep(30)
                    continue

                data    =       json.loads(page.text)
                fileName   =  os.path.join(self.__answerPath,"%s.%s.json" % (userId,low))
                with open(fileName,'wb') as fp:
                    json.dump(data,fp,indent=4, separators=(',', ': '))

                # The last page is reached when the current page's end equals the overall end.
                if data["pagination"]["cur_last"] == data["pagination"]["last"]:
                    break
                else:
                    low += 10
                time.sleep(2)

            # Pause between profiles.
            time.sleep(10)