def __init__(self, seeds, logger, thread_num=5, max_depth=9, ranks=None, index=None):
    self.init_seeds_num = len(seeds)
    self.tocrawl = {}                    # {url: current_depth, ...}
    for seed in seeds:
        self.tocrawl[seed] = 0
    self.crawled = {}                    # {url1: None, url2: None, ...}
    self.max_depth = max_depth           # traversal depth
    self.logger = logger
    self.ranks = ranks
    self.down_url = get_url.get_url(logger)
    self.indexing = indexing.indexing()
    if index:
        self.indexing.index.update(index)
    self.threadpool = thread_pool.thread_pool(thread_num)
    self.lock = threading.Lock()
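# Hedged usage sketch, not from the original source: the enclosing class name
# is not shown in this snippet, so "crawler" below is an assumption, as are
# the get_url / indexing / thread_pool modules being importable.
#
#   import logging
#   logger = logging.getLogger("crawler")
#   c = crawler(["http://www.example.com"], logger, thread_num=4, max_depth=3)
#   # c.tocrawl == {"http://www.example.com": 0}; c.crawled == {}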
if __name__ == "__main__":
    url = raw_input("enter the url (http://www.example.com): ")
    crawl_obj = webcrawling(url)
    print "inside main", crawl_obj.domain

    # crawl every link collected from the seed page
    for everyurl in hreflist:
        print "call hreflist", everyurl
        crawl_obj.crawl(everyurl)
    print "Error list is as follows", errorlist

    # persist the domain summary and the error list
    info_list = [crawl_obj.domain, len(hreflist)]
    fh = open(path + crawl_obj.domain + "/domaininfo.txt", "w")
    cPickle.dump(info_list, fh)
    fh.close()
    fh = open(path + crawl_obj.domain + "/errorlist.txt", "w")
    cPickle.dump(errorlist, fh)
    fh.close()

    # text preprocessing
    fh = open(path + crawl_obj.domain + "/domaininfo.txt", "r")
    domain_info = cPickle.load(fh)
    fh.close()
    txtpp_obj = textpreprocessing(domain_info[1])

    # indexing
    indexing_obj = indexing.indexing(path + domain_info[0] + "/")
    indexing_obj.index_start(domain_info[1])

    crawl_obj.urlpickle()
cof.write("\n")
vof.close()
oof.close()
cof.close()
print("converting finished")
print("calling the C++ program, good luck~")

# split [args.start, args.end) evenly across worker processes
total = args.end - args.start
count = total // args.procs  # work items per process
procs = []
for i in range(args.procs):
    start = args.start + i * count
    # the last process absorbs the remainder
    end = args.end if i == args.procs - 1 else start + count
    p = Process(target=worker,
                args=(args.ospray_renderer, args.vti_path, view_path,
                      opacity_path, color_path, args.outdir, args.var,
                      args.rounds, str(up_vector[0]), str(up_vector[1]),
                      str(up_vector[2]), str(start), str(end)))
    procs.append(p)
    p.start()

# wait for all workers to finish
for p in procs:
    p.join()

if args.index:
    from indexing import indexing
    indexing(args.outdir, view, opacity, color)
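# Worked example of the partitioning above (assumption: worker treats
# [start, end) as a half-open range). For args.start=0, args.end=10,
# args.procs=3, count = 10 // 3 = 3:
#   i=0 -> [0, 3), i=1 -> [3, 6), i=2 -> [6, 10)   # last proc takes the remainder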
import pdfkit

from indexing import indexing
from sampleData import sample, join

# get input from the user
url = "https://github.com/" + input("the url for the github repo -> ")
ignores = input("list of ignores, separated by commas -> ").split(",")
output_file_name = input("output file name -> ")


# This function takes all the indexes and joins them together.
# Literal "\n" sequences are stripped because of a weird bug where the final
# result would otherwise contain a bunch of them.
def sanitize_input_for_pdfkit(indexes):
    return join(sample(indexes)).replace("\\n", "")


# get the indexes
indexes = indexing(url, ignores)

# have pdfkit create the pdf
pdfkit.from_string(sanitize_input_for_pdfkit(indexes), output_file_name)
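# Minimal standalone illustration of the "\n" cleanup above (not repo code):
# str.replace("\\n", "") removes *literal* backslash-n pairs from the text,
# leaving real newlines untouched.
text = "line1\\nline2\nline3"
print(text.replace("\\n", ""))  # prints "line1line2" then "line3" on a new line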
from distutils.core import setup
from Cython.Build import cythonize

# build the Cython extension in place, then import and exercise it
setup(name="indexing",
      ext_modules=cythonize('indexing.pyx'),
      script_name='setup.py',
      script_args=['build_ext', '--inplace'])

import indexing
import numpy as np

print(indexing.indexing())
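# Usage sketch (assumption: indexing.pyx defines a module-level indexing()):
# because script_args is hard-coded above, a plain
#   $ python setup.py
# behaves like the usual "python setup.py build_ext --inplace", so this one
# file both builds and smoke-tests the compiled extension.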
processed_dir = "../processed_data/"
if not os.path.exists(processed_dir):
    os.makedirs(processed_dir)

origin_dir = "../webpages/"
if not os.path.exists(origin_dir):
    os.makedirs(origin_dir)

# download the crawled pages, then preprocess them
file_path = "../doc_ku/crawled.txt"
download_html(file_path, origin_dir)
print("Download html finished.")

url_dic = get_url_dic()
file_num = len(url_dic)
processing(origin_dir, processed_dir)
print("Finish processing html files.")

# build the index
start = time.time()
indexing(url_dic, processed_dir)
end = time.time()
print("Finish indexing.")
print(end - start)

# prepare the vector-space model
start = time.time()
vs_model_pre(file_num)
end = time.time()
print(end - start)
print("Finish vs model preparation.")
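# Optional refactor sketch (hypothetical helper, not part of the pipeline
# above): the repeated time/print pattern could live in one place.
def timed(label, fn, *args):
    t0 = time.time()
    result = fn(*args)
    print("%s finished in %.2f s" % (label, time.time() - t0))
    return result

# e.g. timed("indexing", indexing, url_dic, processed_dir)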