Esempio n. 1
0
def main():
    """Run the multi-threaded Douban Top250 crawl and persist the results.

    Starts ``THREAD_NUM`` daemon ``Workers`` threads that are handed the
    shared queue ``Q_SHARE``, enqueues the page indices
    ``0 .. PAGE_SIZE - 1`` and blocks until the queue reports every item as
    processed.  The entries found in ``MY_DIC`` are then ordered by the
    integer value of their key and passed to ``mjson.RWfile(OUTPUT)`` and to
    the ``TB_NAME`` table of ``DB_NAME``.

    The database handle is closed even if the insert raises.
    """
    print("""
        ###############################

             Douban Top250 Movies
            (Multi-Threads Version)
               Author: Ke Yi

        ###############################
    """)
    print("Douban Movie Crawler Begins...")
    # Workers are started before any work is queued; they are daemons so a
    # stuck worker cannot keep the interpreter alive after main() returns.
    for _ in range(THREAD_NUM):
        worker = Workers(Q_SHARE)
        worker.daemon = True
        worker.start()
    for index in range(PAGE_SIZE):
        Q_SHARE.put(index)
    Q_SHARE.join()
    print("Douban Movie Crawler Ends.")

    # Keys must be int()-convertible; sort numerically rather than
    # lexicographically, then keep only the values.
    ranked = sorted(MY_DIC.items(), key=lambda pair: int(pair[0]))
    records = [record for _, record in ranked]

    my_file = mjson.RWfile(OUTPUT)
    my_file.write_in(records)

    my_db = mdatabase.DB(DB_NAME, TB_NAME)
    try:
        my_db.db_insert(records)
    finally:
        # Release the connection even when the insert fails.
        my_db.db_close()
Esempio n. 2
0
def main():
    """Crawl the Weibo page of ``UID`` and dump the collected entries.

    Builds a ``WeiBoSpider`` for ``UID``, fetches its page and lets the
    spider extract the content.  The items of the module-level ``DIC``
    (presumably filled by ``retrieve_content`` -- confirm in the spider) are
    ordered by the integer value of their key and their values are handed
    to ``mjson.RWfile(OUTPUT).write_in``.
    """
    spider = WeiBoSpider(UID)
    page = spider.retrieve_page()
    spider.retrieve_content(page)

    # Keys must be int()-convertible; order numerically, not as strings.
    entries = list(DIC.items())
    entries.sort(key=lambda pair: int(pair[0]))
    records = [record for _, record in entries]

    mjson.RWfile(OUTPUT).write_in(records)
Esempio n. 3
0
def Workers(item):
    """Crawl every page of one genre sequentially and persist the results.

    Args:
        item: Mapping that provides ``'name'``, ``'order'`` and
            ``'page_size'``.  ``'page_size'`` is the number of pages to
            fetch; ``'name'`` and ``'order'`` are forwarded to
            ``GenreSpider.retrieve_page`` and ``'name'`` is also used for
            the output file name and as the second argument of
            ``mdatabase.DB``.

    The collected entries are ordered by the integer value of their key,
    passed to ``mjson.RWfile('<name lowercased>.json')`` and inserted into
    the database.  The database handle is closed even if the insert raises.
    """
    collected = collections.OrderedDict()
    for page in range(item['page_size']):
        # A fresh spider per page, exactly as before; retrieve_content is
        # given the shared dict to record what it extracts.
        spider = model.GenreSpider()
        soup = spider.retrieve_page(item['name'], item['order'], page)
        spider.retrieve_content(soup, collected)

    # Keys must be int()-convertible; sort numerically and keep the values.
    ranked = sorted(collected.items(), key=lambda pair: int(pair[0]))
    records = [record for _, record in ranked]

    my_file = mjson.RWfile(item['name'].lower() + '.json')
    my_file.write_in(records)

    my_db = mdatabase.DB(DB_NAME, item['name'])
    try:
        my_db.db_insert(records)
    finally:
        # Release the connection even when the insert fails.
        my_db.db_close()
Esempio n. 4
0
def Workers(item):
    """Crawl every page of one genre with gevent and persist the results.

    Args:
        item: Mapping that provides ``'name'`` and ``'page_size'`` (other
            keys may be read by ``Subworker``).  One greenlet running
            ``Subworker(page, item, collected)`` is spawned per page index
            in ``range(item['page_size'])``.

    After all greenlets have finished, the collected entries are ordered by
    the integer value of their key, passed to
    ``mjson.RWfile('<name lowercased>.json')`` and inserted into the
    database.  The database handle is closed even if the insert raises.
    """
    collected = collections.OrderedDict()
    greenlets = [
        gevent.spawn(Subworker, page, item, collected)
        for page in range(item['page_size'])
    ]
    # Block until every page has been processed before reading the results.
    gevent.joinall(greenlets)

    # Keys must be int()-convertible; sort numerically and keep the values.
    ranked = sorted(collected.items(), key=lambda pair: int(pair[0]))
    records = [record for _, record in ranked]

    my_file = mjson.RWfile(item['name'].lower() + '.json')
    my_file.write_in(records)

    my_db = mdatabase.DB(DB_NAME, item['name'])
    try:
        my_db.db_insert(records)
    finally:
        # Release the connection even when the insert fails.
        my_db.db_close()
Esempio n. 5
0
def main():
    """Run the IMDB Top250 crawl and persist the results.

    Fetches page ``0`` with an ``IMDBSpider`` and lets the spider extract
    its content.  The items of the module-level ``MY_DIC`` (presumably
    filled by ``retrieve_content`` -- confirm in the spider) are ordered by
    the integer value of their key and passed to ``mjson.RWfile(OUTPUT)``
    and to the ``TB_NAME`` table of ``DB_NAME``.

    The database handle is closed even if the insert raises.
    """
    print("""
        ###############################

             IMDB Top250 Movies
               Author: Ke Yi

        ###############################
    """)
    print("IMDB Movie Crawler Begins...")
    my_spider = IMDBSpider()
    my_soup = my_spider.retrieve_page(0)
    my_spider.retrieve_content(my_soup)
    print("IMDB Movie Crawler Ends...")

    # Keys must be int()-convertible; sort numerically rather than
    # lexicographically, then keep only the values.
    ranked = sorted(MY_DIC.items(), key=lambda pair: int(pair[0]))
    records = [record for _, record in ranked]

    my_file = mjson.RWfile(OUTPUT)
    my_file.write_in(records)

    my_db = mdatabase.DB(DB_NAME, TB_NAME)
    try:
        my_db.db_insert(records)
    finally:
        # Release the connection even when the insert fails.
        my_db.db_close()
Esempio n. 6
0
    # NOTE(review): this fragment starts mid-block; `numbers`, `images`,
    # `name`, `following` and `driver` are defined outside the visible code.
    follower = []
    weibo = []
    img = []

    # `numbers` is consumed in groups of three: the text of element 3k goes
    # to `following`, 3k+1 to `follower` and 3k+2 to `weibo`.
    for i in range(len(numbers)):
        number = numbers[i].text
        if i % 3 == 0:
            following.append(number)
        elif i % 3 == 1:
            follower.append(number)
        else:
            weibo.append(number)

    # Each image contributes its 'alt' attribute as a name and its 'src'
    # attribute as the image reference.
    for image in images:
        name.append(image.get_attribute('alt'))
        img.append(image.get_attribute('src'))
    DIC = {}
    OUTPUT = 'output_selphan.json'
    # Build one ordered record per name, keyed by its position as a string.
    # Assumes the five lists are at least as long as `name`; otherwise the
    # indexing below raises IndexError -- TODO confirm against the page layout.
    for i in range(len(name)):
        content = collections.OrderedDict([("Name", name[i]),
                                           ("Following", following[i]),
                                           ("Follower", follower[i]),
                                           ("Weibo", weibo[i]),
                                           ("Image", img[i])])
        DIC[str(i)] = content
    # Sort by the integer value of the key so that "10" follows "9".
    ol = sorted(list(DIC.items()), key=lambda x: int(x[0]))  # ordered list
    ol = [s[1] for s in ol]
    # Hand the ordered records to the project's RWfile helper for OUTPUT.
    my_file = mjson.RWfile(OUTPUT)
    my_file.write_in(ol)
# Runs at the outer indentation level, after the block above.
driver.quit()