import math


def crawl_avnum(genreid, avnum_queue, expt_queue, proc_num):
    # Get AV numbers from the javbus website for the given genreid
    # and put them into avnum_queue.
    next_page = 1
    max_page = 1
    while next_page <= max_page:
        url = get_page_url(next_page, genreid)
        if next_page == 1:
            next_page_soup = get_list_page_soup(url, genreid, proc_num)
            movie_sum = int(
                next_page_soup.find_all('a', {
                    'class': 'mypointer',
                    'id': 'resultshowmag'
                })[0].get_text().split()[-1])
            max_page = math.ceil(movie_sum / 30)
            print('genre {}: {} pages in total'.format(genreid, max_page))
        else:
            try:
                next_page_soup = get_list_page_soup(url, genreid, proc_num)
            except AvExpt as e:
                print(e)
                expt_queue.put(e)
                continue  # retry the same page
        movie_links = get_movie_list(next_page_soup)
        for i in movie_links:
            # get the AV number from the soup
            av_num = get_av_num(i[0])
            # skip movies that already exist in the database
            if database.check_existence(av_num):
                # print('* %s already exists, skip crawling *' % av_num)
                continue
            else:
                avnum_queue.put(i)
                # print('* {} queued *'.format(av_num))
        next_page += 1
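# AvExpt is raised by the fetch helpers and shuttled through expt_queue, but it
# is not defined in this section. Below is a minimal sketch inferred solely from
# the five-argument constructor calls in crawl_movie
# (proc_num, category, url, message, stage); every field name is an assumption.


class AvExpt(Exception):
    """Crawler exception carrying enough context to log or retry a failure."""

    def __init__(self, proc_num, category, url, message, stage):
        super().__init__(message)
        self.proc_num = proc_num   # worker process number
        self.category = category   # e.g. 'database_insert'
        self.url = url             # page being processed when the error hit
        self.message = message     # stringified underlying exception
        self.stage = stage         # e.g. 'movie_insert', 'm_s_insert'

    def __str__(self):
        return 'Process {}: {} failed at stage {} ({}): {}'.format(
            self.proc_num, self.category, self.stage, self.url, self.message)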
engine.declare(KnowledgeEngine.SocialFact(futureStrategy=AnswerArrayCP[42]))
engine.declare(KnowledgeEngine.SocialFact(sensibleResolvePossible=AnswerArrayCP[43]))
engine.run()  # run the first engine on the questionnaire and contextual answers

InputArray = []
Username = "******"
InputArray.append(Username)
InputArray.append(individualDepressionLevel)
InputArray.append(individualAnxietyLevel)
InputArray.append(individualStressLevel)
for x in range(26, 44):
    InputArray.append(AnswerArrayCP[x])
InputArray.append(maxIndValueOfDAS)

# If this is the first execution, do not compare "old" values with "new" ones
if database.check_existence(Username) is None:
    print("No previous data found")
# Otherwise, compare the values and also run Knowledge Engine Two after Knowledge Engine One
else:
    dbOldFactsArray = database.read_from_db(Username)  # TODO: confirm this really returns an array

    # start the second knowledge engine
    engineTwo = KnowledgeEngineTwo.ComparingOldInputWithNew()
    engineTwo.reset()  # prepare the engine for execution

    # declare the new facts for engineTwo
    engineTwo.declare(KnowledgeEngineTwo.NewFinancialFact(financialDistress=AnswerArrayCP[26]))
    engineTwo.declare(KnowledgeEngineTwo.NewFinancialFact(employment=AnswerArrayCP[27]))
    engineTwo.declare(KnowledgeEngineTwo.NewFamilyFact(isCaretaker=AnswerArrayCP[28]))
    engineTwo.declare(KnowledgeEngineTwo.NewFamilyFact(getsEnoughSupport=AnswerArrayCP[29]))
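# The reset/declare/run lifecycle above follows the experta rule-engine pattern.
# Here is a minimal, self-contained illustration of that lifecycle; the fact and
# rule names below are invented for the example and are not part of this project.

from experta import Fact, KnowledgeEngine, MATCH, Rule


class MoodFact(Fact):
    """Illustrative fact type, not taken from this project."""


class MiniEngine(KnowledgeEngine):
    @Rule(MoodFact(stressLevel=MATCH.level))
    def report(self, level):
        # fires once for every MoodFact whose stressLevel is bound
        print('declared stress level:', level)


mini = MiniEngine()
mini.reset()                                 # seed the agenda before declaring
mini.declare(MoodFact(stressLevel='high'))   # add a fact to working memory
mini.run()                                   # fire all matching rules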
import os


def parse_page(url, thread_num, counter):
    """Parse function for each listing page."""
    # get the main page soup
    main_page_soup = parser.get_main_page_soup(url)
    # request the website and get the elements
    movie_links = parser.get_movie_page_list(main_page_soup)
    # get the next page URL
    next_page = parser.get_next_page_url(main_page_soup)
    # loop through each movie box on the main page
    for i in movie_links:
        # get the AV number from the soup
        av_num = parser.get_av_num(i)
        # skip movies that already exist in the database
        if database.check_existence(av_num):
            print('* %s already exists, skip crawling *' % av_num)
            continue
        # get the view page soup
        soup = parser.get_link_soup(i)
        # show the current working status
        print('Thread {} crawling: page {}, avnum: {}'.format(
            str(thread_num), str(os.path.basename(url)), av_num))
        # get the movie object info
        movie = parser.get_movie(soup, av_num)
        # show the movie object
        # print(movie)
        stars = parser.get_star_list(soup)
        links = parser.get_download_link(soup, url, av_num)
        images = parser.get_sample_img_list(soup)
        # store movie info in the database
        database.insert_movie(movie)
        # store star info in the database
        for s in stars:
            database.insert_star(s, av_num)
        # store magnet links in the database
        for l in links:
            database.insert_magnet(l)
        # store sample image URLs in the database
        for g in images:
            database.insert_img(g, av_num)
        counter.increment_parse()
    print('page ' + str(os.path.basename(url)) + ' finished')
    print('-------------------------')
    return next_page
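# parse_page returns the next page's URL, which suggests each worker thread
# follows the next-page chain until it runs out. A sketch of such a driver,
# assuming get_next_page_url returns a falsy value on the last page
# (crawl_worker and start_urls are hypothetical names):

import threading


def crawl_worker(start_url, thread_num, counter):
    # follow the next-page chain produced by parse_page
    url = start_url
    while url:
        url = parse_page(url, thread_num, counter)


# Example wiring: one thread per starting listing URL.
# threads = [threading.Thread(target=crawl_worker, args=(u, n, counter))
#            for n, u in enumerate(start_urls)]
# for t in threads:
#     t.start()
# for t in threads:
#     t.join()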
def test_db_check_existence(self):
    username = database.check_existence("Anna")
    self.assertIsNone(username)
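# check_existence is exercised throughout this section but never shown. Its
# observed contract is: return the stored record (truthy) when the key exists,
# otherwise None. A minimal sqlite3 sketch honoring that contract; the database
# path, table, and column names are assumptions, not the project's schema.

import sqlite3

DB_PATH = 'app.db'  # assumed location


def check_existence(key):
    """Return the stored row for `key`, or None if no such row exists."""
    conn = sqlite3.connect(DB_PATH)
    try:
        cur = conn.execute(
            'SELECT * FROM records WHERE record_key = ?', (key,))
        return cur.fetchone()  # a row tuple, or None when absent
    finally:
        conn.close()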
import os


def crawl_movie(av_info, expt_queue, proc_num):
    # Get movie, genre, star, and sample-image data from the javbus
    # website for the given AV number.
    av_num = get_av_num(av_info[0])
    # skip movies that already exist (check_existence returns a record or
    # None, so test truthiness rather than `is True`)
    if database.check_existence(av_num):
        return
    print('Process {} got avnum: {}'.format(proc_num, av_num))
    # get the movie soup
    soup = get_movie_soup(av_info, proc_num)
    # get the movie class
    try:
        movie = get_movie_class(soup, av_num, av_info[1])
    except AvExpt as e:
        print('Process {} hit an exception while building the movie class:'.format(proc_num))
        print(e)
        expt_queue.put(e)
        return
    print('movie class:', movie)
    # get the star ID list of the movie
    star_id_iter = get_star_iter(soup)
    # get the genre list of the movie
    genres = get_genre_iter(soup)
    # get the download links of the movie
    try:
        link_iter = get_download_iter(soup, av_info[0], av_num, proc_num)
    except AvExpt as e:
        print('Process {} hit an exception while crawling links:'.format(proc_num))
        print(e)
        expt_queue.put(e)
        return
    # get the sample images of the movie
    images = get_sample_img_iter(soup)
    # store movie info in the database
    try:
        database.insert_movie(movie)
    except Exception as e:
        print('Process {} hit an exception while inserting movie:'.format(proc_num))
        print(e)
        expt_queue.put(
            AvExpt(proc_num, 'database_insert', av_info[0], str(e),
                   'movie_insert'))
    # store movie and star info in the database
    for s in star_id_iter:
        if database.check_stars(s[0]):
            # print('Process {}, avnum: {}, starid: {}'.format(proc_num, av_num, s[0]))
            try:
                database.insert_m_s(av_num, s[0])
            except Exception as e:
                print('Process {} hit an exception while inserting m_s:'.format(proc_num))
                print(e)
                expt_queue.put(
                    AvExpt(proc_num, 'database_insert', av_info[0], str(e),
                           'm_s_insert'))
        else:
            try:
                p = get_star(s, proc_num)
            except AvExpt as e:
                print('Process {} hit an exception while crawling star:'.format(proc_num))
                print(e)
                expt_queue.put(e)
                continue
            try:
                database.insert_star(p)
            except Exception as e:
                print('Process {} hit an exception while inserting star:'.format(proc_num))
                print(e)
                expt_queue.put(
                    AvExpt(proc_num, 'database_insert', av_info[0], str(e),
                           'star_insert'))
            try:
                database.insert_m_s(av_num, s[0])
            except Exception as e:
                print('Process {} hit an exception while inserting m_s:'.format(proc_num))
                print(e)
                expt_queue.put(
                    AvExpt(proc_num, 'database_insert', av_info[0], str(e),
                           'm_s_insert'))
    # store movie and genre info in the database
    for g in genres:
        if database.check_genres(g[0]):
            try:
                database.insert_m_g(av_num, g[0])
            except Exception as e:
                print('Process {} hit an exception while inserting m_g:'.format(proc_num))
                print(e)
                expt_queue.put(
                    AvExpt(proc_num, 'database_insert', av_info[0], str(e),
                           'm_g_insert'))
        else:
            try:
                p = crawl_genre(g, proc_num)
            except AvExpt as e:
                print('Process {} hit an exception while crawling genre:'.format(proc_num))
                print(e)
                expt_queue.put(e)
                continue
            try:
                database.insert_genre(p)
            except Exception as e:
                print('Process {} hit an exception while inserting genre:'.format(proc_num))
                print(e)
                expt_queue.put(
                    AvExpt(proc_num, 'database_insert', av_info[0], str(e),
                           'genre_insert'))
            try:
                database.insert_m_g(av_num, g[0])
            except Exception as e:
                print('Process {} hit an exception while inserting m_g:'.format(proc_num))
                print(e)
                expt_queue.put(
                    AvExpt(proc_num, 'database_insert', av_info[0], str(e),
                           'm_g_insert'))
    # store magnet links in the database
    for li in link_iter:
        try:
            database.insert_magnet(li)
        except Exception as e:
            print('Process {} hit an exception while inserting magnet:'.format(proc_num))
            print(e)
            expt_queue.put(
                AvExpt(proc_num, 'database_insert', av_info[0], str(e),
                       'magnet_insert'))
    # store sample image URLs in the database
    for im in images:
        try:
            database.insert_img(im, av_num)
        except Exception as e:
            print('Process {} hit an exception while inserting img:'.format(proc_num))
            print(e)
            expt_queue.put(
                AvExpt(proc_num, 'database_insert', av_info[0], str(e),
                       'img_insert'))
    print('Process {} finished crawling: page {}, avnum: {}'.format(
        str(proc_num), str(os.path.basename(av_info[0])), av_num))
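# crawl_avnum and crawl_movie communicate through avnum_queue and expt_queue,
# which points to a multiprocessing producer/consumer pipeline. A sketch of how
# the two stages might be wired together; the worker count, sentinel protocol,
# and genreid value are arbitrary choices for illustration, not the project's.

import multiprocessing as mp


def movie_worker(avnum_queue, expt_queue, proc_num):
    # drain av_info tuples produced by crawl_avnum; a None sentinel stops us
    while True:
        av_info = avnum_queue.get()
        if av_info is None:
            break
        crawl_movie(av_info, expt_queue, proc_num)


if __name__ == '__main__':
    avnum_queue = mp.Queue()
    expt_queue = mp.Queue()
    workers = [mp.Process(target=movie_worker,
                          args=(avnum_queue, expt_queue, n))
               for n in range(4)]
    for w in workers:
        w.start()
    crawl_avnum('1', avnum_queue, expt_queue, 0)  # '1' is a placeholder genreid
    for _ in workers:
        avnum_queue.put(None)  # one sentinel per worker
    for w in workers:
        w.join()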