def progress_crawl():
    print("Running progress crawl")
    url, keyword = readUrls(join(dir_path, "urls_to_index.txt"))
    url = url[0]
    keyword = keyword[0]

    def generate():
        # netloc = urlparse(url).netloc
        all_links = [url]
        stack = spider.get_links(url, 200)
        indexed = 0
        while len(stack) > 0:
            all_links.append(stack[0])
            print("Processing", stack[0])
            new_page = mk_page_vector.compute_vectors(stack[0], keyword)
            if new_page:
                stack.pop(0)
                indexed += 1
                # Report the number of pages indexed so far as a server-sent event.
                yield "data:" + str(indexed) + "\n\n"
            else:
                stack.pop(0)
        pod_from_file(keyword)
        yield "data:" + "Finished!" + "\n\n"

    return Response(generate(), mimetype='text/event-stream')
def generate():
    urls, keywords = readUrls(join(dir_path, "urls_to_index.txt"))
    for c in range(len(urls)):
        mk_page_vector.compute_vectors(urls[c], keywords[c])
        pod_from_file(keywords[c])
        # Increment before yielding so progress reflects the URL just processed.
        c += 1
        yield "data:" + str(int(c / len(urls) * 100)) + "\n\n"
def progress_crawl():
    print("Running progress crawl")
    url, keyword = readUrls(join(dir_path, "urls_to_index.txt"))
    url = url[0]
    keyword = keyword[0]

    def generate():
        netloc = urlparse(url).netloc
        all_links = [url]
        links = extract_links(url)
        #stack = list(set([link for link in links if urlparse(link).netloc == netloc]))
        stack = list(set([link for link in links if url in link and '#' not in link]))
        indexed = 0
        while len(stack) > 0:
            all_links.append(stack[0])
            print("Processing", stack[0])
            new_page = mk_page_vector.compute_vectors(stack[0], keyword)
            if new_page:
                new_links = extract_links(stack[0])
                #new_site_links = list(set([link for link in links if urlparse(link).netloc == netloc and link not in all_links and '#' not in link]))
                # Filter the freshly extracted links (new_links), not the original ones.
                new_site_links = list(set([link for link in new_links if url in link and link not in all_links and '#' not in link]))
                stack.pop(0)
                stack = list(set(stack + new_site_links))
                indexed += 1
                yield "data:" + str(indexed) + "\n\n"
            else:
                stack.pop(0)
        pod_from_file(keyword)
        yield "data:" + "Finished!" + "\n\n"

    return Response(generate(), mimetype='text/event-stream')
def generate():
    urls, keywords = readUrls(join(dir_path, "urls_to_index.txt"))
    for c in range(len(urls)):
        success = mk_page_vector.compute_vectors(urls[c], keywords[c])
        if success:
            pod_from_file(keywords[c])
        else:
            print("Error accessing the URL.")
        c += 1
        yield "data:" + str(int(c / len(urls) * 100)) + "\n\n"
def generate():
    urls, keywords, errors = readUrls(join(dir_path, "urls_to_index.txt"))
    if errors:
        logging.error('Some URLs could not be processed')
    if not urls or not keywords:
        logging.error('Invalid file format')
        yield "data: 0 \n\n"
        return  # Nothing to index, so stop the stream here.
    c = 0
    for url, kwd in zip(urls, keywords):
        success = mk_page_vector.compute_vectors(url, kwd)
        if success:
            pod_from_file(kwd)
        else:
            logging.error("Error accessing the URL")
        c += 1
        yield "data:" + str(int(c / len(urls) * 100)) + "\n\n"
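# The generate() variants above are meant to be streamed as server-sent events,
# as the progress_crawl functions do with Response(..., mimetype='text/event-stream').
# Below is a minimal sketch of how such a generator could be exposed from a Flask
# route; the app object and the '/index_progress' route path are assumptions for
# illustration, not part of the original code.
from flask import Flask, Response

app = Flask(__name__)

@app.route('/index_progress')
def index_progress():
    # Each "data:..." line yielded by generate() becomes one event that the
    # browser can consume with an EventSource to update a progress bar.
    return Response(generate(), mimetype='text/event-stream')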