コード例 #1
0
ファイル: crawl.py プロジェクト: vanxining/Wuxia
    if beg != -1:
        beg += 22
        end = raw.index("</span>", beg)
    else:
        beg = raw.find('<div id="content">')
        if beg != -1:
            beg += 18
            end = raw.index("</div>", beg)
        else:
            beg = raw.index('zzz="') + 5
            beg = raw.index('"', beg) + 2
            end = raw.index("<!", beg)

    raw = raw[beg:end]
    raw = raw.replace("<BR>", "\n").replace("<br />", "\n")
    raw = format(raw.decode("gbk")).encode("gbk")
    sio = StringIO(raw)

    txt += title + "\n\n"

    for line in sio:
        txt += line.strip() + '\n'

    txt += '\n'

    print "Done crawling", title
    common.random_sleep(2)

with open(book_title + ".txt", "w") as outf:
    outf.write(txt)
コード例 #2
0
def main():
    config = ConfigParser.RawConfigParser()
    config.read("config/users.ini")

    use_proxy = config.getboolean("default", "proxy")
    common.prepare(use_proxy=use_proxy)

    client = MongoClient()
    db = client.topcoder

    print "Crawling users..."
    print "Current:", db.users.count()

    invalid = set()

    if os.path.exists("config/invalid_handles"):
        for line in open("config/invalid_handles"):
            line = line.strip()
            if line:
                invalid.add(line)

    handles = set()

    for challenge in db.challenges.find():
        for reg in challenge["registrants"]:
            handle = reg["handle"].lower()

            if u' ' in handle or u'/' in handle or u'\\' in handle:
                continue

            if handle in invalid:
                continue

            if handle in handles:
                continue

            if db.users.find_one({u"handle": handle}):
                continue

            handles.add(handle)

    print len(handles), "users to be crawled."
    print "-----"

    for handle in handles:
        print handle

        while True:
            try:
                request = common.make_request(u"/v3.0.0/members/" + quote(handle))
                s = urllib2.urlopen(request).read().decode("utf-8")

                d = common.to_json(s)[u"result"][u"content"]
                refine_user(d)

                user_skills(d)

                db.users.insert_one(d)

                common.random_sleep(1)
                break

            except urllib2.HTTPError, e:
                if e.code == 404 or e.code == 403:
                    invalid.add(handle)

                    with open("config/invalid_handles", "w") as fp:
                        for h in sorted(invalid):
                            fp.write(h + '\n')

                    common.random_sleep(1)
                    break
                else:
                    print "HTTP Error", e.code, e.msg
                    print e.geturl()
                    print e.fp.read()
            except Exception, e:
                print "An unknown exception occurred."
                print e

            common.random_sleep(20)
コード例 #3
0
 def __random_sleep(self):
     """Sleep for a random interval of 1..random_sleep_max seconds, if a cap is set."""
     if self.random_sleep_max is None:
         return
     co.random_sleep(1, self.random_sleep_max)
コード例 #4
0
def main():
    common.prepare(use_proxy=g_config.use_proxy)

    client = MongoClient()
    db = client.topcoder

    print "Crawling users..."
    print "Current:", db.users.count()

    if g_config.recrawl_all:
        print "Recrawl all users"

    if g_config.recheck_invalid_handles:
        print "Recheck invalid handles"

    invalid = set()

    def add_invalid_handle(hdl):
        invalid.add(hdl)

        with open(INVALID_HANDLES_FPATH, "w") as fp:
            for h in sorted(invalid):
                try:
                    fp.write(h.encode("utf-8") + '\n')
                except UnicodeDecodeError:
                    pass

    if os.path.exists(INVALID_HANDLES_FPATH):
        for line in open(INVALID_HANDLES_FPATH):
            line = line.strip()
            if line:
                invalid.add(line.decode("utf-8"))

    handles = set()

    query = {u"handle": None}
    field = {u"_id": 1}

    nb_challeges = db.challenges.count()
    for index, challenge in enumerate(db.challenges.find()):
        if (index + 1) % 100 == 0:
            print "Challenges: %d/%d" % (index + 1, nb_challeges)

        for reg in challenge[u"registrants"]:
            handle = reg[u"handle"].lower()

            for ch in ur" \/":
                if ch in handle:
                    continue

            if handle in invalid:
                continue

            if handle in handles:
                continue

            if not g_config.recrawl_all:
                query[u"handle"] = handle
                if db.users.find_one(query, field) is not None:
                    continue

            handles.add(handle)

    if g_config.recheck_invalid_handles or g_config.recrawl_all:
        handles.update(invalid)
        invalid = set()

        if os.path.exists(INVALID_HANDLES_FPATH):
            os.rename(INVALID_HANDLES_FPATH, INVALID_HANDLES_FPATH + ".bak")

    print len(handles), "users to be crawled"
    print "-----"

    for index, handle in enumerate(handles):
        print "[%d/%d]" % (index + 1, len(handles)), handle

        while True:
            try:
                try:
                    quoted = quote_handle(handle)
                except KeyError:
                    add_invalid_handle(handle)

                    break

                request = common.make_request(u"/v3/members/" + quoted)
                s = common.open_request_and_read(request).decode("utf-8")
                d = common.to_json(s)[u"result"][u"content"]

                try:
                    refine_user(d)
                    user_skills(d)
                    user_stats(d)
                    user_external_accounts(d)
                except:
                    traceback.print_exc()

                    add_invalid_handle(handle)

                    common.random_sleep(DOZE)
                    break

                db.users.insert_one(d)

                common.random_sleep(DOZE)
                break
            except urllib2.HTTPError, e:
                if e.code in (
                        404,
                        403,
                ):
                    add_invalid_handle(handle)

                    common.random_sleep(DOZE)
                    break
                else:
                    print "HTTP Error", e.code, e.msg
                    print e.geturl()
                    print e.fp.read()
            except KeyboardInterrupt:
                return
            except:
コード例 #5
0
ファイル: crawl.py プロジェクト: vanxining/Wuxia
    if beg != -1:
        beg += 22
        end = raw.index("</span>", beg)
    else:
        beg = raw.find('<div id="content">')
        if beg != -1:
            beg += 18
            end = raw.index("</div>", beg)
        else:
            beg = raw.index('zzz="') + 5
            beg = raw.index('"', beg) + 2
            end = raw.index("<!", beg)

    raw = raw[beg:end]
    raw = raw.replace("<BR>", "\n").replace("<br />", "\n")
    raw = format(raw.decode("gbk")).encode("gbk")
    sio = StringIO(raw)

    txt += title + "\n\n"

    for line in sio:
        txt += line.strip() + '\n'

    txt += '\n'

    print "Done crawling", title
    common.random_sleep(2)

with open(book_title + ".txt", "w") as outf:
    outf.write(txt)
コード例 #6
0
                        404,
                        403,
                ):
                    add_invalid_handle(handle)

                    common.random_sleep(DOZE)
                    break
                else:
                    print "HTTP Error", e.code, e.msg
                    print e.geturl()
                    print e.fp.read()
            except KeyboardInterrupt:
                return
            except:
                traceback.print_exc()

            common.random_sleep(ERROR_WAIT)


if __name__ == "__main__":
    # Keep restarting main() after unexpected failures; stop on success
    # or when the user hits Ctrl-C.
    while True:
        try:
            main()
        except KeyboardInterrupt:
            break
        # noinspection PyBroadException
        except:
            traceback.print_exc()
            continue

        break
コード例 #7
0
def main():
    client = MongoClient()
    db = client.topcoder

    config = ConfigParser.RawConfigParser()
    config.read("config/challenges.ini")

    init = config.getboolean("default", "init")

    if init:
        index = config.getint("default", "page_index")
    else:
        index = 1

    use_proxy = config.getboolean("default", "use_proxy")
    common.prepare(use_proxy=use_proxy)

    while True:
        path = "/v2/challenges/past?type=develop&pageIndex=%d&pageSize=10" % index
        raw = common.guarded_read(path)

        if '"data": []' in raw:
            return

        print "Page", index

        lists = json.loads(raw)

        for challenge in lists["data"]:
            cid = challenge["challengeId"]

            if filter_out(cid):
                continue

            if db.challenges.find_one({"challengeId": cid}):
                if init:
                    continue
                else:
                    return

            common.random_sleep(1)

            print ' ', challenge["challengeName"]

            path = "/v2/challenges/" + str(cid)
            d = common.to_json(common.guarded_read(path))

            path = "/v2/challenges/registrants/" + str(cid)
            raw = '{"registrants": %s}' % common.guarded_read(path)
            registrants = common.to_json(raw)

            path = "/v2/challenges/submissions/" + str(cid)
            submissions = common.to_json(common.guarded_read(path))

            d.update(registrants)
            d.update(submissions)
            format_challenge(d)

            db.challenges.insert_one(d)

        index += 1

        if init:
            config.set("default", "page_index", index)
            with open("config/challenges.ini", "wb") as fp:
                config.write(fp)

        common.random_sleep(10)
コード例 #8
0
ファイル: lists.py プロジェクト: July-shisan/TopcoderCrawler
def main():
    client = MongoClient()
    db = client.topcoder

    config = ConfigParser.RawConfigParser()
    config.read("config/challenges.ini")

    init = config.getboolean("default", "init")

    if init:
        index = config.getint("default", "page_index")
    else:
        index = 1

    use_proxy = config.getboolean("default", "use_proxy")
    common.prepare(use_proxy=use_proxy)

    while True:
        path = "/v2/challenges/past?type=develop&pageIndex=%d&pageSize=10" % index
        raw = common.guarded_read(path)

        if '"data": []' in raw:
            return

        print "Page", index

        lists = json.loads(raw)

        for challenge in lists["data"]:
            cid = challenge["challengeId"]

            if filter_out(cid):
                continue

            if db.challenges.find_one({"challengeId": cid}):
                if init:
                    continue
                else:
                    return

            common.random_sleep(1)

            print ' ', challenge["challengeName"]

            path = "/v2/challenges/" + str(cid)
            d = common.to_json(common.guarded_read(path))

            path = "/v2/challenges/registrants/" + str(cid)
            raw = '{"registrants": %s}' % common.guarded_read(path)
            registrants = common.to_json(raw)

            path = "/v2/challenges/submissions/" + str(cid)
            submissions = common.to_json(common.guarded_read(path))

            d.update(registrants)
            d.update(submissions)
            format_challenge(d)

            db.challenges.insert_one(d)

        index += 1

        if init:
            config.set("default", "page_index", index)
            with open("config/challenges.ini", "wb") as fp:
                config.write(fp)

        common.random_sleep(10)
コード例 #9
0
ファイル: users.py プロジェクト: vanxining/TopcoderCrawler
def main():
    common.prepare(use_proxy=g_config.use_proxy)

    client = MongoClient()
    db = client.topcoder

    print "Crawling users..."
    print "Current:", db.users.count()

    if g_config.recrawl_all:
        print "Recrawl all users"

    if g_config.recheck_invalid_handles:
        print "Recheck invalid handles"

    invalid = set()

    def add_invalid_handle(hdl):
        invalid.add(hdl)

        with open(INVALID_HANDLES_FPATH, "w") as fp:
            for h in sorted(invalid):
                try:
                    fp.write(h.encode("utf-8") + '\n')
                except UnicodeDecodeError:
                    pass

    if os.path.exists(INVALID_HANDLES_FPATH):
        for line in open(INVALID_HANDLES_FPATH):
            line = line.strip()
            if line:
                invalid.add(line.decode("utf-8"))

    handles = set()

    query = {u"handle": None}
    field = {u"_id": 1}

    nb_challeges = db.challenges.count()
    for index, challenge in enumerate(db.challenges.find()):
        if (index + 1) % 100 == 0:
            print "Challenges: %d/%d" % (index + 1, nb_challeges)

        for reg in challenge[u"registrants"]:
            handle = reg[u"handle"].lower()

            for ch in ur" \/":
                if ch in handle:
                    continue

            if handle in invalid:
                continue

            if handle in handles:
                continue

            if not g_config.recrawl_all:
                query[u"handle"] = handle
                if db.users.find_one(query, field) is not None:
                    continue

            handles.add(handle)

    if g_config.recheck_invalid_handles or g_config.recrawl_all:
        handles.update(invalid)
        invalid = set()

        if os.path.exists(INVALID_HANDLES_FPATH):
            os.rename(INVALID_HANDLES_FPATH, INVALID_HANDLES_FPATH + ".bak")

    print len(handles), "users to be crawled"
    print "-----"

    for index, handle in enumerate(handles):
        print "[%d/%d]" % (index + 1, len(handles)), handle

        while True:
            try:
                try:
                    quoted = quote_handle(handle)
                except KeyError:
                    add_invalid_handle(handle)

                    break

                request = common.make_request(u"/v3/members/" + quoted)
                s = common.open_request_and_read(request).decode("utf-8")
                d = common.to_json(s)[u"result"][u"content"]

                try:
                    refine_user(d)
                    user_skills(d)
                    user_stats(d)
                    user_external_accounts(d)
                except:
                    traceback.print_exc()

                    add_invalid_handle(handle)

                    common.random_sleep(DOZE)
                    break

                db.users.insert_one(d)

                common.random_sleep(DOZE)
                break
            except urllib2.HTTPError, e:
                if e.code in (404, 403,):
                    add_invalid_handle(handle)

                    common.random_sleep(DOZE)
                    break
                else:
                    print "HTTP Error", e.code, e.msg
                    print e.geturl()
                    print e.fp.read()
            except KeyboardInterrupt:
                return
            except:
コード例 #10
0
ファイル: users.py プロジェクト: vanxining/TopcoderCrawler
                break
            except urllib2.HTTPError, e:
                if e.code in (404, 403,):
                    add_invalid_handle(handle)

                    common.random_sleep(DOZE)
                    break
                else:
                    print "HTTP Error", e.code, e.msg
                    print e.geturl()
                    print e.fp.read()
            except KeyboardInterrupt:
                return
            except:
                traceback.print_exc()

            common.random_sleep(ERROR_WAIT)


if __name__ == "__main__":
    # Run main() in a retry loop: a clean return or Ctrl-C ends the program,
    # any other exception is logged and main() is started over.
    done = False
    while not done:
        # noinspection PyBroadException
        try:
            main()

            done = True
        except KeyboardInterrupt:
            done = True
        except:
            traceback.print_exc()