def __init__(self, seeds, logger, thread_num=5, max_depth=9, ranks=None, index=None):
    self.init_seeds_num = len(seeds)
    self.tocrawl = {}
    for seed in seeds:
        self.tocrawl[seed] = 0      # {url: current_depth, ...}
    self.crawled = {}               # {url1: None, url2: None, ...}
    self.max_depth = max_depth      # traversal depth limit
    self.logger = logger
    self.ranks = ranks              # optional precomputed ranks
    self.down_url = get_url.get_url(logger)         # page downloader helper
    self.indexing = indexing.indexing()             # index builder
    if index:                                       # resume from an existing index
        self.indexing.index.update(index)
    self.threadpool = thread_pool.thread_pool(thread_num)
    self.lock = threading.Lock()
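
A minimal, dependency-free sketch of the bookkeeping this constructor sets up: tocrawl maps each URL to the depth at which it was discovered, crawled records finished URLs, and traversal stops once max_depth is reached. The fetch_links callable and the toy link graph below are illustrative stand-ins for the real downloader, not part of the original class.

def bfs_crawl(seeds, fetch_links, max_depth=2):
    tocrawl = {seed: 0 for seed in seeds}   # {url: depth discovered at}
    crawled = {}                            # {url: None} once processed
    while tocrawl:
        url, depth = tocrawl.popitem()
        crawled[url] = None
        if depth >= max_depth:
            continue
        for link in fetch_links(url):       # stand-in for the real downloader
            if link not in crawled and link not in tocrawl:
                tocrawl[link] = depth + 1
    return crawled

graph = {"a": ["b", "c"], "b": ["c"], "c": []}
print(bfs_crawl(["a"], lambda u: graph.get(u, []), max_depth=2))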
Example #2


# cPickle, indexing, webcrawling, textpreprocessing, hreflist, errorlist,
# path and domain are defined or imported at module level earlier in the
# original script (not shown in this snippet).
if __name__ == "__main__":
    url = raw_input("enter the url(http://www.example.com):")
    # url='http://www.vrsiddhartha.ac.in'
    crawl_obj = webcrawling(url)
    print "inside main", crawl_obj.domain
    for everyurl in hreflist:
        print "call hreflist", everyurl
        crawl_obj.crawl(everyurl)
    print "Error list is as follows", errorlist

    # persist the domain info and the error list with cPickle
    info_list = [crawl_obj.domain, len(hreflist)]
    fh = open(path + crawl_obj.domain + "/domaininfo.txt", "w")
    cPickle.dump(info_list, fh)
    fh.close()
    fh = open(path + crawl_obj.domain + "/errorlist.txt", "w")
    cPickle.dump(errorlist, fh)
    fh.close()

    # text preprocessing
    print domain
    fh = open(path + domain + "/domaininfo.txt", "r")
    info_list = cPickle.load(fh)
    fh.close()
    txtpp_obj = textpreprocessing(info_list[1])

    # indexing
    indexing_obj = indexing.indexing(path + info_list[0] + "/")
    indexing_obj.index_start(info_list[1])
    crawl_obj.urlpickle()
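
The dump/load round-trip used above, reduced to a standalone sketch; pickle is the Python 3 replacement for cPickle, and the file name and values here are illustrative only.

import pickle  # cPickle on Python 2

info_list = ["example.com", 42]            # [domain, number of crawled links]
with open("domaininfo.txt", "wb") as fh:   # pickle wants a binary file handle
    pickle.dump(info_list, fh)

with open("domaininfo.txt", "rb") as fh:
    restored = pickle.load(fh)
print(restored)                            # ['example.com', 42]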
Example #3
        cof.write("\n")

    # tail of the conversion routine (its opening is not shown): close the output files
    vof.close()
    oof.close()
    cof.close()
    print("converting finished")


print("call the c++ program, Good Luck~")

total = args.end - args.start
count = total // args.procs  # number of
procs = []
for i in range(args.procs):
    start = args.start + i * count
    end = args.end if i == args.procs - 1 else args.start + i * count + count
    p = Process(target=worker,
                args=(args.ospray_renderer, args.vti_path, view_path, opacity_path, color_path,
                      args.outdir, args.var, args.rounds, str(up_vector[0]), str(up_vector[1]), str(up_vector[2]), str(start), str(end)))
    procs.append(p)
    p.start()


for p in procs:
    p.join()

# all done? meow meow meow
if args.index:
    from indexing import indexing
    indexing(args.outdir, view, opacity, color)
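
The start/end partitioning above, reduced to a self-contained sketch: the range is split into near-equal chunks and the last process absorbs the remainder left by the integer division. The demo_worker here only reports its slice and is not the renderer worker from the original script.

from multiprocessing import Process

def demo_worker(start, end):
    print("handling items", start, "to", end)

def split_range(start, end, procs):
    count = (end - start) // procs
    for i in range(procs):
        lo = start + i * count
        hi = end if i == procs - 1 else lo + count
        yield lo, hi

if __name__ == "__main__":
    workers = [Process(target=demo_worker, args=(lo, hi))
               for lo, hi in split_range(0, 10, 3)]
    for p in workers:
        p.start()
    for p in workers:
        p.join()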
Example #4
import pdfkit

from indexing import indexing
from sampleData import sample, join

# get input from user
url = "https://github.com/" + input("the url for the github repo -> ")
ignores = input("list of ignores, separated by commas -> ").split(",")
output_file_name = input("output file name -> ")


# this function takes all the indexes and joins them together
# "\n" is replaced because of a weird bug where a bunch of "\n" would otherwise end up in the final result
def sanitize_input_for_pdfkit(indexes):
    return join(sample(indexes)).replace("\\n", "")


# get the indexes
indexes = indexing(url, ignores)
# make pdfkit create a pdf
pdfkit.from_string(sanitize_input_for_pdfkit(indexes), output_file_name)
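
A note on the replace call in sanitize_input_for_pdfkit: the pattern "\\n" is the literal two-character sequence backslash + n, so the cleanup strips escaped newline markers left in the text rather than real line breaks. A quick standalone check:

raw = "first part\\nsecond part\nthird part"   # one escaped \n marker, one real newline
cleaned = raw.replace("\\n", "")
print(cleaned)   # the escaped marker is gone, the real newline survives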
Example #5
from distutils.core import setup
from Cython.Build import cythonize

# compile indexing.pyx in place (equivalent to: python setup.py build_ext --inplace)
setup(name="indexing",
      ext_modules=cythonize('indexing.pyx'),
      script_name='setup.py',
      script_args=['build_ext', '--inplace'])

# the freshly built extension module can then be imported and called directly
import indexing
import numpy as np
print(indexing.indexing())
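
For the build above to have something to compile, an indexing.pyx must sit next to setup.py. The real module's contents are not shown in this example, so the stub below is only a hypothetical placeholder that makes the build-then-import round-trip concrete.

# indexing.pyx (hypothetical stub, not the original module)
def indexing():
    # the real project code would build and return its index here
    return "index built"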
Example #6
# os, time and the helper functions download_html, get_url_dic, processing,
# indexing and vs_model_pre are imported from the project's own modules
# earlier in the original script (not shown).
processed_dir = "../processed_data/"
isExists = os.path.exists(processed_dir)
if not isExists:
    os.makedirs(processed_dir)

origin_dir = "../webpages/"
isExists = os.path.exists(origin_dir)
if not isExists:
    os.makedirs(origin_dir)

file_path = "../doc_ku/crawled.txt"

# step 1: download the pages listed in crawled.txt into the webpages directory
download_html(file_path, origin_dir)
print("Download html finished.")

# step 2: process the raw html into the processed directory
url_dic = get_url_dic()
file_num = len(url_dic)
processing(origin_dir, processed_dir)
print("Finish processing html files.")

# step 3: build the index and time it
start = time.time()
indexing(url_dic, processed_dir)
end = time.time()
print("Finish indexing.")
print(end - start)

# step 4: prepare the vector-space model and time it
start = time.time()
vs_model_pre(file_num)
end = time.time()
print(end - start)
print("Finish vs model preparation.")