# Django view (Python 2 codebase) that kicks off a crawl from the submitted form.
# `render` comes from Django; Crawl, BeautifulSoup, initializeParams, returnJsonGraph,
# getNodes, getEdges, getWfs and pathSourcetoSink are assumed to be imported from
# project modules.
from django.shortcuts import render


def crawlingController(request):
    crawling_spec = {}
    if request.method == 'POST':
        print request.FILES
        # Collect the crawl parameters from the uploaded files and POST fields.
        crawling_spec["login_script"] = request.FILES.get('login-script', None)
        crawling_spec["login_url"] = request.POST.get('login-url', "")
        crawling_spec["form_values_script"] = request.FILES.get('form-values-script', None)
        crawling_spec["base_address"] = request.POST.get('base-address', "")
        crawling_spec["start_url"] = request.POST.get('start-url', "")
        crawling_spec["black_list_urls"] = request.POST.get('black-list-urls', "")
        crawling_spec["scope_urls"] = request.POST.get('scope-urls', "")
        crawling_spec["wait_time"] = request.POST.get('wait-time', "")
        crawling_spec["depth"] = request.POST.get('depth', "100")
        #crawling_spec["proxy_address"] = request.POST.get('proxy-address', "")
        #print crawling_spec

        # Read the uploaded login and form-values scripts into plain strings.
        login_data = ""
        form_data = ""
        lines = crawling_spec['login_script'].readlines()
        for line in lines:
            login_data = login_data + line.strip()
        lines = crawling_spec['form_values_script'].readlines()
        for line in lines:
            form_data = form_data + line.strip()
        bs = BeautifulSoup(form_data)
        print bs
        #print bs.findAll("tr")

        # Persist the crawl specification.
        obj = Crawl(login_script=crawling_spec["login_script"],
                    login_url=crawling_spec["login_url"],
                    form_values_script=crawling_spec["form_values_script"],
                    base_address=crawling_spec["base_address"],
                    start_url=crawling_spec["start_url"],
                    black_list_urls=crawling_spec["black_list_urls"],
                    scope_urls=crawling_spec["scope_urls"],
                    wait_time=crawling_spec["wait_time"],
                    depth=crawling_spec["depth"])
                    #proxy_address=crawling_spec["proxy_address"])
        #print login_script, login_url, form_values_script, base_address, start_url, black_list_urls, scope_urls, wait_time
        obj.save()

        # Build the FSM from the parsed scripts and render its graph.
        crawling_spec["login_script"] = login_data
        crawling_spec["form_values_script"] = form_data
        fsm = initializeParams(crawling_spec)
        #pathSourcetoSink(fsm, crawl, crawl.login_url)
        #print graph
        if fsm:
            returnJsonGraph(fsm.graph)
            number_of_nodes = fsm.graph.number_of_nodes()
            number_of_edges = len(fsm.graph.edges())
            nodes = getNodes(fsm.graph)
            edges = getEdges(fsm.graph)
            crawl = Crawl.objects.latest("id")
            pathSourcetoSink(fsm, crawl)
            print crawl.id
            workflows = getWfs(crawl.id)
            print "workflows"
            print workflows
            #print edges
            #print nodes
            return render(request, 'run.html',
                          {'num_nodes': number_of_nodes, 'num_edges': number_of_edges,
                           'nodes': nodes, 'edges': edges, 'workflows': workflows})
    else:
        return render(request, "error.html")
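# Hypothetical URL wiring for the view above (an assumption, not part of the original
# source): a minimal Django 1.x urls.py sketch, with the route pattern, module name
# and route name chosen only for illustration.
from django.conf.urls import url

from . import views

urlpatterns = [
    url(r'^crawl/$', views.crawlingController, name='crawl'),
]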
# Python 2 crawler task; `logging`, `traceback` and `urllib` are standard library,
# while EbaySpider and Crawl are assumed to be imported from project modules.
import logging
import traceback
import urllib


def crawl_brands():
    """Get all officially recorded brands from eBay."""
    logging.info("Crawl cell phone brands from eBay")
    site = EbaySpider.get_site()

    # Record the crawl attempt; status stays 'SUCCESS' unless an exception occurs.
    crawl = Crawl()
    crawl.site = EbaySpider.name
    crawl.action = "crawl brands"
    crawl.status = 'SUCCESS'
    try:
        brands = EbaySpider.extract_data_from_ajax_request(EbaySpider.URLS['data']['brands'])
        for name in brands:
            # Replace the raw count with a dict holding the count and the models.
            count = brands[name]
            brands[name] = {}
            brands[name]['count'] = count
            url = EbaySpider.URLS['data']['model'] + "&" + urllib.urlencode({'Brand': name})
            print(url)
            models = EbaySpider.extract_data_from_ajax_request(url)
            brands[name]['models'] = models
        site.brands = brands
    except Exception:
        logging.info(">>>> Failed to crawl brands from eBay: %s" % traceback.format_exc())
        msg = traceback.format_exc()
        crawl.status = msg[:1024]
    site.save()
    crawl.save()
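# Hypothetical entry point (not part of the original project): runs the brand crawl
# once with basic logging configured, e.g. from a cron job or a manual shell run.
if __name__ == '__main__':
    logging.basicConfig(level=logging.INFO)
    crawl_brands()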
# Helper for recording a crawl; `Session`, `Crawl` and `remove_crawl` are assumed to
# come from the project's SQLAlchemy setup (see the sketch below).
from datetime import datetime


def save_crawl(url):
    # Drop any existing record for this URL, then insert a fresh one.
    remove_crawl(url)
    session = Session()
    crawl = Crawl(url=url, ts=datetime.now())
    session.add(crawl)
    session.commit()
    crawl_id = crawl.id  # read the generated primary key before closing the session
    session.close()
    return crawl_id
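# A minimal sketch of the SQLAlchemy pieces save_crawl() appears to rely on. These
# definitions are illustrative assumptions, not the project's actual models: the
# table name, column types, and connection string are all hypothetical.
from datetime import datetime

from sqlalchemy import Column, DateTime, Integer, String, create_engine
from sqlalchemy.ext.declarative import declarative_base
from sqlalchemy.orm import sessionmaker

Base = declarative_base()


class Crawl(Base):
    __tablename__ = 'crawls'  # hypothetical table name

    id = Column(Integer, primary_key=True)       # generated id returned by save_crawl()
    url = Column(String(2048))                   # crawled URL
    ts = Column(DateTime, default=datetime.now)  # timestamp set by the caller


engine = create_engine('sqlite:///crawls.db')    # hypothetical connection string
Base.metadata.create_all(engine)
Session = sessionmaker(bind=engine)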