def generate():
    c = 0
    urls = []
    pod_name = ""
    # Read the pod export and collect every URL not yet in the database.
    with open(join(dir_path, "app", "static", "pods", "urls_from_pod.csv"), 'r', encoding="utf-8") as f:
        for l in f:
            if "#Pod name" in l:
                pod_name = l.rstrip('\n').replace("#Pod name:", "")
            if len(l.rstrip('\n').split(',')) == 7:
                url, title, snippet, vector, freqs, cc = index_pod_file.parse_line(l)
                if not db.session.query(Urls).filter_by(url=url).all():
                    u = Urls(url=url, title=title, snippet=snippet, pod=pod_name,
                             vector=vector, freqs=freqs, cc=cc)
                    urls.append(u)
    if len(urls) == 0:
        print("All URLs already known.")
        yield "data:" + "no news" + "\n\n"
    else:
        # Commit each new URL and stream the percentage completed so far.
        for u in urls:
            db.session.add(u)
            db.session.commit()
            c += 1
            yield "data:" + str(int(c / len(urls) * 100)) + "\n\n"
        pod_from_file(pod_name)
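# These generate() functions stream Server-Sent Events ("data: ...\n\n"
# frames) to the browser. A minimal sketch of how such a generator is
# typically served from a Flask route; the app object and route name
# here are assumptions for illustration, not part of the code above.
from flask import Flask, Response, stream_with_context

app = Flask(__name__)

@app.route("/indexing_progress")
def indexing_progress():
    # stream_with_context keeps the request context alive while the
    # generator yields progress events to the client.
    return Response(stream_with_context(generate()), mimetype="text/event-stream")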
def generate():
    urls, keywords = readUrls(join(dir_path, "urls_to_index.txt"))
    for c in range(len(urls)):
        mk_page_vector.compute_vectors(urls[c], keywords[c])
        pod_from_file(keywords[c])
        # c is 0-based, so c + 1 URLs have been processed at this point.
        yield "data:" + str(int((c + 1) / len(urls) * 100)) + "\n\n"
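# readUrls is not shown here. A minimal sketch of the two-value variant
# used above, assuming urls_to_index.txt holds one "URL;keyword" pair
# per line; the delimiter and file layout are assumptions, not
# confirmed by the code.
def readUrls(path):
    urls, keywords = [], []
    with open(path, 'r', encoding="utf-8") as f:
        for line in f:
            line = line.strip()
            if not line or line.startswith('#'):
                continue
            url, keyword = line.split(';', 1)
            urls.append(url.strip())
            keywords.append(keyword.strip())
    return urls, keywords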
def generate():
    urls, keywords = readUrls(join(dir_path, "urls_to_index.txt"))
    for c in range(len(urls)):
        success = mk_page_vector.compute_vectors(urls[c], keywords[c])
        if success:
            pod_from_file(keywords[c])
        else:
            print("Error accessing the URL.")
        # Report progress whether or not the URL could be indexed.
        yield "data:" + str(int((c + 1) / len(urls) * 100)) + "\n\n"
def generate():
    # netloc = urlparse(url).netloc
    all_links = [url]
    # Delegate link discovery to the spider (200 presumably caps the
    # number of links collected).
    stack = spider.get_links(url, 200)
    indexed = 0
    while len(stack) > 0:
        all_links.append(stack[0])
        print("Processing", stack[0])
        new_page = mk_page_vector.compute_vectors(stack[0], keyword)
        if new_page:
            indexed += 1
            yield "data:" + str(indexed) + "\n\n"
        stack.pop(0)
    pod_from_file(keyword)
    yield "data:" + "Finished!" + "\n\n"
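# Unlike the earlier versions, the crawler variants read url and
# keyword as free variables, which suggests generate() is defined
# inside a route that closes over them. A sketch of that wiring,
# reusing the hypothetical app object from the sketch further up;
# the route and parameter names are assumptions.
from flask import Response, request, stream_with_context

@app.route("/crawl")
def crawl():
    url = request.args.get("url")
    keyword = request.args.get("keyword")

    def generate():
        ...  # crawler body as above, closing over url and keyword

    return Response(stream_with_context(generate()), mimetype="text/event-stream")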
def generate():
    urls, keywords, errors = readUrls(join(dir_path, "urls_to_index.txt"))
    if errors:
        logging.error('Some URLs could not be processed')
    if not urls or not keywords:
        logging.error('Invalid file format')
        yield "data: 0 \n\n"
        return  # nothing to index, end the stream here
    c = 0
    for url, kwd in zip(urls, keywords):
        success = mk_page_vector.compute_vectors(url, kwd)
        if success:
            pod_from_file(kwd)
        else:
            logging.error("Error accessing the URL")
        c += 1
        yield "data:" + str(int(c / len(urls) * 100)) + "\n\n"
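# A quick way to sanity-check the progress stream from Python, assuming
# the generator is served at /indexing_progress as in the earlier
# sketch; host and port are assumptions.
import requests

with requests.get("http://localhost:5000/indexing_progress", stream=True) as r:
    for raw in r.iter_lines(decode_unicode=True):
        if raw and raw.startswith("data:"):
            print("progress:", raw[len("data:"):].strip())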
def generate():
    netloc = urlparse(url).netloc
    all_links = [url]
    links = extract_links(url)
    #stack = list(set([link for link in links if urlparse(link).netloc == netloc]))
    stack = list(set([link for link in links if url in link and '#' not in link]))
    indexed = 0
    while len(stack) > 0:
        all_links.append(stack[0])
        print("Processing", stack[0])
        new_page = mk_page_vector.compute_vectors(stack[0], keyword)
        if new_page:
            new_links = extract_links(stack[0])
            # Keep only unseen same-site links from the page just indexed.
            #new_site_links = list(set([link for link in links if urlparse(link).netloc == netloc and link not in all_links and '#' not in link]))
            new_site_links = list(set([link for link in new_links if url in link and link not in all_links and '#' not in link]))
            stack.pop(0)
            stack = list(set(stack + new_site_links))
            indexed += 1
            yield "data:" + str(indexed) + "\n\n"
        else:
            stack.pop(0)
    pod_from_file(keyword)
    yield "data:" + "Finished!" + "\n\n"
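# extract_links is not shown here. A minimal sketch under the
# assumption that it fetches a page and returns absolute URLs for every
# <a href> found on it; error handling beyond a failed request and
# politeness (robots.txt, rate limiting) are left out.
import requests
from urllib.parse import urljoin
from bs4 import BeautifulSoup

def extract_links(url):
    try:
        r = requests.get(url, timeout=10)
    except requests.RequestException:
        return []
    soup = BeautifulSoup(r.text, "html.parser")
    return [urljoin(url, a["href"]) for a in soup.find_all("a", href=True)]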