def build_rec_process(img_dir, train=False, num_thread=1):
    rec_dir = os.path.abspath(os.path.join(img_dir, '../rec'))
    makedirs(rec_dir)
    prefix = 'train' if train else 'val'
    print('Building ImageRecord file for ' + prefix + ' ...')
    to_path = rec_dir
    # download lst file and im2rec script
    script_path = os.path.join(rec_dir, 'im2rec.py')
    script_url = 'https://raw.githubusercontent.com/apache/incubator-mxnet/master/tools/im2rec.py'
    download(script_url, script_path)
    lst_path = os.path.join(rec_dir, prefix + '.lst')
    lst_url = 'http://data.mxnet.io/models/imagenet/resnet/' + prefix + '.lst'
    download(lst_url, lst_path)
    # execution
    import sys
    cmd = [
        sys.executable,
        script_path,
        rec_dir,
        img_dir,
        '--recursive',
        '--pass-through',
        '--pack-label',
        '--num-thread',
        str(num_thread),
    ]
    subprocess.call(cmd)
    os.remove(script_path)
    os.remove(lst_path)
    print('ImageRecord file for ' + prefix + ' has been built!')
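# Usage sketch (an assumption, not part of the original source): build .rec files for a
# hypothetical ImageNet-style layout where images live under <root>/train and <root>/val.
# Assumes the same module-level helpers the function already relies on (os, subprocess,
# download, makedirs) are importable.
if __name__ == '__main__':
    build_rec_process('/data/imagenet/train', train=True, num_thread=8)
    build_rec_process('/data/imagenet/val', train=False, num_thread=8)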
def get_resnet_file(name, root='~/.torch/models'):
    file_name = '{name}-{short_hash}'.format(name=name, short_hash=short_hash(name))
    root = os.path.expanduser(root)
    file_path = os.path.join(root, file_name + '.pth')
    sha1_hash = _model_sha1[name]
    if os.path.exists(file_path):
        if check_sha1(file_path, sha1_hash):
            return file_path
        else:
            print('Mismatch in the content of model file {} detected. '
                  'Downloading again.'.format(file_path))
    else:
        print('Model file {} is not found. Downloading.'.format(file_path))
    if not os.path.exists(root):
        os.makedirs(root)
    zip_file_path = os.path.join(root, file_name + '.zip')
    repo_url = os.environ.get('ENCODING_REPO', encoding_repo_url)
    if repo_url[-1] != '/':
        repo_url = repo_url + '/'
    download(_url_format.format(repo_url=repo_url, file_name=file_name),
             path=zip_file_path,
             overwrite=True)
    with zipfile.ZipFile(zip_file_path) as zf:
        zf.extractall(root)
    os.remove(zip_file_path)
    if check_sha1(file_path, sha1_hash):
        return file_path
    else:
        raise ValueError('Downloaded file has different hash. Please try again.')
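# Usage sketch (hypothetical model name; assumes _model_sha1 and the encoding repo URL are
# defined in the surrounding module): resolve the local path to a cached checkpoint,
# downloading and unzipping it only when missing or corrupted.
# weights_path = get_resnet_file('resnet50', root='~/.torch/models')
# state_dict = torch.load(weights_path)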
def download_file(local_archive_path, url):
    if not exists(local_archive_path) or args.force:
        print(f'Downloading {local_archive_path}...')
        local_archive_tmp_path = local_archive_path + ".downloading"
        download(url, local_archive_tmp_path)
        print(f'Downloaded {local_archive_path}')
        rename(local_archive_tmp_path, local_archive_path)
def download_zhwiki():
    path = settings.ZHWIKI_PATH
    if os.path.exists(path):
        logger.info(f'zhwiki already downloaded at {path}')
        return
    url = settings.ZHWIKI_URL
    logger.info(f'zhwiki downloading from {url} ...')
    download(url, path)
    logger.info(f'zhwiki downloaded: {path}')
def download(self, force=False):
    if os.path.exists(join(self.root_dir, 'VOC' + self.year, 'JPEGImages')) \
            and os.path.exists(join(self.root_dir, 'VOC' + self.year, 'Annotations')) \
            and os.path.exists(join(self.root_dir, 'VOC' + self.year, 'ImageSets')):
        if not force:
            print('Files already downloaded and verified')
            return
        else:
            shutil.rmtree(join(self.root_dir, 'VOC' + self.year))
    # make the dirs and start the downloads
    os.makedirs(self.root_dir, exist_ok=True)
    if self.year == '2012':
        filenames = ['VOCtrainval_11-May-2012']
        md5s = ['6cd6e144f989b92b3379bac3b3de84fd']
    elif self.year == '2011':
        filenames = ['VOCtrainval_25-May-2011']
        md5s = ['6c3384ef61512963050cb5d687e5bf1e']
    elif self.year == '2010':
        filenames = ['VOCtrainval_03-May-2010']
        md5s = ['da459979d0c395079b5c75ee67908abb']
    elif self.year == '2009':
        filenames = ['VOCtrainval_11-May-2009']
        md5s = ['59065e4b188729180974ef6572f6a212']
    elif self.year == '2008':
        filenames = ['VOCtrainval_14-Jul-2008']
        md5s = ['2629fa636546599198acfcfbfcf1904a']
    elif self.year == '2007':
        filenames = ['VOCtrainval_06-Nov-2007', 'VOCtest_06-Nov-2007']
        md5s = ['c52e279531787c972589f7e41ab4ae64',
                '41a8d6e12baa5ab18ee7f8f8029b9e11805b4ef1']
    for filename in filenames:
        tar_filename = filename + '.tar'
        url = join(self.download_url_prefix, 'voc' + self.year, tar_filename)
        # download_url(url, self.root_dir, tar_filename, None)
        download(url, path=self.root_dir, overwrite=True)
        with tarfile.open(join(self.root_dir, tar_filename), 'r') as tar_file:
            tar_file.extractall(self.root_dir)
    shutil.move(os.path.join(self.root_dir, 'VOCdevkit', 'VOC' + self.year),
                os.path.join(self.root_dir, 'VOC' + self.year))
    shutil.rmtree(os.path.join(self.root_dir, 'VOCdevkit'))
    for filename in filenames:
        tar_filename = filename + '.tar'
        os.remove(join(self.root_dir, tar_filename))
def download_annotations(output_folder):
    print("Downloading annotations...")
    file_name = os.path.basename(annotations_url)
    target_path = os.path.join(output_folder, file_name)
    temp_path = target_path + "_"
    if os.path.isfile(target_path):
        print(f"[*] {target_path} already downloaded.")
    else:
        print(f"Downloading 241MB to \"{target_path}\"...")
        download(annotations_url, temp_path)
        shutil.move(temp_path, target_path)
def run(self):
    count = 0
    while True:
        try:
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                break
            resp = download(tbd_url, self.config, self.logger)
            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")
            scraped_urls = scraper(tbd_url, resp, self.mostCommonWords,
                                   self.icsSubDomains, self.longestPage,
                                   self.similarURLs)
            for scraped_url in scraped_urls:
                # For each scraped url, add only if not discovered
                if scraped_url not in self.discoveredURLs:
                    self.discoveredURLs.add(scraped_url)
                    self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)
            count += 1
            print("\n", count, "\n")
        except:
            print("IT BLEW UPPPPPPPP")
            pass
def download_items():
    counter = 0
    step = 100
    while True:
        PARAMS["limit"] = step
        PARAMS["offset"] = counter
        url = url_concat(BASE_URL, PARAMS)
        response = yield download(url)
        list_of_items = response.json_body.get("cargoquery", [])
        for item in list_of_items:
            # implicit
            raw_html_implicit_stat = item['title']['implicit stat text']
            sanitised_implicit_stat = sanitise_text(raw_html_implicit_stat)
            item['title']['implicit stat text'] = sanitised_implicit_stat
            # explicit
            raw_html_explicit_stat = item['title']['explicit stat text']
            sanitised_explicit_stat = sanitise_text(raw_html_explicit_stat)
            item['title']['explicit stat text'] = sanitised_explicit_stat
            # TODO: Store into elastic search
        if len(list_of_items) == 0:
            break
        counter += step
def run(self):
    i = 0
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        try:
            resp = download(tbd_url, self.config, self.logger)
            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")
            scraped_urls = scraper(tbd_url, resp, self.state)
            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
        except HTTPError as err:
            self.logger.error(f"Downloaded {tbd_url}, hitting error {err}")
        self.frontier.mark_url_complete(tbd_url)
        if i % 1000 == 0:
            print(self.state['longest_page'])
            print_freq(self.state['word_rank'], 50)
            for domain, count in self.state['sub_domains'].items():
                print(domain, count)
            self.frontier.print_saved()
        i += 1
def run(self):
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        # Politeness. Check if diff is less than 500 milliseconds.
        current_time = int(round(time.time() * 1000))
        parsed = urlparse(tbd_url, allow_fragments=False)
        if parsed.netloc in self.time_visited:
            if current_time - self.time_visited[parsed.netloc] < 500:
                # print("sleeping for ", (500 - (current_time - time_visited[parsed.netloc]) - 1) * .001)
                time.sleep(
                    ((500 - (current_time - self.time_visited[parsed.netloc])) + 10) * .001)
                current_time = int(round(time.time() * 1000))
        self.time_visited[parsed.netloc] = current_time
        resp = download(tbd_url, self.config, self.logger)
        if resp == False:
            continue
        else:
            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")
            scraped_urls = scraper(tbd_url, resp)
            # print("scraped_urls: ", scraped_urls)
            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)
def run(self):
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        delta = datetime.timedelta(seconds=.5)
        split = urlparse(tbd_url).netloc.split('.')
        # extract domain from url (does not account for toay.blah./blah/blah/)
        domain = split[-3] + '.' + split[-2] + '.' + split[-1]
        print("DOMAIN: " + domain)
        # if we've accessed the tbd_url domain within 500ms, then sleep for 500ms
        if domain in last_seen and (datetime.datetime.now() - last_seen[domain] < delta):
            print("====SLEEPING====")
            time.sleep(.5)
        # store the time at which tbd_url's domain was accessed
        last_seen[domain] = datetime.datetime.now()
        resp = download(tbd_url, self.config, self.logger)
        self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                         f"using cache {self.config.cache_server}.")
        scraped_urls = scraper(tbd_url, resp)
        for scraped_url in scraped_urls:
            self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
        time.sleep(self.config.time_delay)
def run(self):
    while True:
        tbd_url = self.frontier.get_tbd_url()
        # if there is a url to download on the frontier
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        # put the resp = download(...) into a try/except, in case there is a timeout
        # and resp doesn't equal anything
        resp = download(tbd_url, self.config, self.logger)
        self.logger.info(
            f"Downloaded {tbd_url}, status <{resp.status}>, "
            f"using cache {self.config.cache_server}.")
        # after getting the response from the webpage, the function will
        # store the information in the self.frontier
        self.frontier.store_page_text_content(resp, tbd_url)
        scraped_urls = scraper(tbd_url, resp)
        for scraped_url in scraped_urls:
            self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
        time.sleep(self.config.time_delay)
    self.frontier.close_files()
    print("number of unique pages is:", unique_pages(self.frontier.discovered_urls))
    print("longest page is:", longest_page(self.frontier.site_content))
    print("fifty most common words are here:",
          fifty_most_common_words(self.frontier.word_frequencies))
    print(ics_subdomain_frequencies(self.frontier.discovered_urls))
def run(self):
    while True:
        try:
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
                self.logger.info("Frontier is empty. Stopping Crawler.")
                final_report()
                break
            resp = download(tbd_url, self.config, self.logger)
            self.logger.info(
                f"Downloaded {tbd_url}, status <{resp.status}>, "
                f"using cache {self.config.cache_server}.")
            scraped_urls = scraper(tbd_url, resp, self.logger)
            # Report the stats every so often
            self.sites_crawled += 1
            if self.sites_crawled >= 100:
                self.sites_crawled = 0
                self.logger.info("Downloaded 100 sites. Generating a report-so-far")
                final_report()
            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)
        except Exception:
            # If the crawler runs into any exception, spit out the final report
            # before re-raising the exception
            self.logger.info(
                "Worker caught an exception. Generating final report before exit.")
            final_report()
            raise
def run(self):
    # this is where we are gonna call the function to get the robots.txt
    record = Recorder()
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        if not self.can_fetch(tbd_url):
            continue
        resp = download(tbd_url, self.config, self.logger)
        self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                         f"using cache {self.config.cache_server}.")
        scraped_urls = scraper(tbd_url, resp)
        # adding data to recorder
        record.add_url(tbd_url)
        if not (resp.raw_response is None and is_valid(tbd_url)):
            record.add_words(resp.raw_response.content, tbd_url)
            record.save()
        for scraped_url in scraped_urls:
            self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
        time.sleep(self.config.time_delay)
    record.finish_crawl_report()
def run(self):
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            printStats()
            break
        # We will ignore any network exceptions and retry.
        startTime = time.time()
        resp = None
        hasFailed = False
        while resp is None:
            try:
                resp = download(tbd_url, self.config, self.logger)
            except Exception as ex:
                hasFailed = True
                print(f"{ex}\nRetrying in 60 sec.")
                time.sleep(60)
        if hasFailed:
            with open("server-outages.rtf", "a+") as err:
                err.write(f"Server outage from: {startTime} to: {time.time()} "
                          f"duration: {round(time.time() - startTime)} sec.\n")
        self.logger.info(
            f"Downloaded {tbd_url}, status <{resp.status}>, "
            f"using cache {self.config.cache_server}.")
        scraped_urls = scraper(tbd_url, resp)
        if scraped_urls is not None:
            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
        time.sleep(self.config.time_delay)
def download_file(self, remote_path):
    try:
        filename = os.path.basename(remote_path)
        os.system("mkdir -p loot/%s/other" % str(self.addr[0]))
        local_path = "loot/%s/other/%s" % (str(self.addr[0]), filename)
        cmd = {'command': 'upload %s' % remote_path}
        self.send_cmd(cmd)
        download(self, local_path)
        response = self.recv_output()['output']
    except Exception as e:
        return ("error", str(e))
    msg = "Downloaded %s to %s" % (filename, local_path)
    return ("success", msg)
def screenshot(self):
    try:
        os.system("mkdir -p loot/%s/screenshots" % str(self.addr[0]))
        local_path = "loot/%s/screenshots/%s.png" % (str(self.addr[0]), get_timestamp())
        cmd = {'command': 'screenshot'}
        self.send_cmd(cmd)
        download(self, local_path)
        assert self.recv_output()['output']
    except AssertionError:
        return ("error",
                "Received a false response. Screenshot may still be saved on remote host")
    except Exception as e:
        return ("error", str(e))
    else:
        msg = "Screenshot saved to %s" % local_path
        return ("success", msg)
def run(self):
    while True:
        tbd_url = self.frontier.get_tbd_url()
        report = self.report
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        resp = download(tbd_url, self.config, self.logger)
        self.logger.info(
            f"Downloaded {tbd_url}, status <{resp.status}>, "
            f"using cache {self.config.cache_server}.")
        scraped_urls = scraper(tbd_url, resp, report)
        for scraped_url in scraped_urls:
            self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
        time.sleep(self.config.time_delay)
    # print the result of report.py here
    self.report.print()
def buildDB(filename, downloadDir, rebuild=False):
    if os.path.exists(filename) and not rebuild:
        print("NCI Cancer Gene Index SQLite database already exists, not rebuilding")
        return
    initDB(filename)
    if downloadDir is None:
        downloadDir = settings.CGI_DOWNLOAD_PATH
    # Add gene-drug associations
    drugFile = download.download(settings.CGI_GENE_COMPOUND_FILE, downloadDir)
    zf = zipfile.ZipFile(drugFile, "r")
    item = os.path.basename(drugFile).split(".")[0] + ".xml"
    processGeneEntries(zf.open(item), filename)
    zf.close()
    # Add gene-disease associations
    diseaseFile = download.download(settings.CGI_GENE_DISEASE_FILE, downloadDir)
    zf = zipfile.ZipFile(diseaseFile, "r")
    item = os.path.basename(diseaseFile).split(".")[0] + ".xml"
    processGeneEntries(zf.open(item), filename)
    zf.close()
    makeCountTables(filename)
def run(self):
    try:
        while True:
            self._lock()
            try:
                tbd_url = self.frontier.get_tbd_url()
                if tbd_url:
                    self.threads_in_processing.add(self.worker_id)
                elif tbd_url is None and len(self.threads_in_processing) == 0:
                    self.logger.info(
                        f"Frontier is empty. Stopping the Worker: {self.worker_id}")
                    break
            finally:
                self._unlock()
            if tbd_url is None or tbd_url == "":
                time.sleep(0.05)
                continue
            self.logger.info(f"Worker: {self.worker_id} Downloading: {tbd_url}")
            resp = download(tbd_url, self.config, self.logger)
            if resp.raw_response is None and resp.error.startswith("EMPTYCONTENT"):
                self.logger.error(f"{resp.error}, status <{resp.status}>")
            self.logger.info(
                f"Worker: {self.worker_id} Downloaded : {tbd_url}, status <{resp.status}>")
            scraped_urls = scraper(tbd_url, resp)
            new_urls_added = 0
            self._lock()
            try:
                for scraped_url in scraped_urls:
                    if self.frontier.add_url(scraped_url):
                        new_urls_added += 1
                self.frontier.mark_url_complete(tbd_url)
            finally:
                self._unlock()
            self.threads_in_processing.remove(self.worker_id)
            self.logger.info(
                f"Worker: {self.worker_id}, Added: {new_urls_added}, "
                f"Remaining: {self.frontier.count_tbd_urls()}")
    except BaseException:
        self.logger.exception(f"Unexpected exception in Worker: {self.worker_id}")
    finally:
        if self.worker_id in self.threads_in_processing:
            self.threads_in_processing.remove(self.worker_id)
        self.logger.info(f"Worker: {self.worker_id} Stopped")
def _load_data(self, filename, offset):
    """
    Load the data in the given file. Automatically downloads the file
    if it does not already exist in the data_dir.

    :param filename: Name of the data-file.
    :param offset: Start offset in bytes when reading the data-file.

    :return: The data as a numpy array.
    """
    # Download the file from the internet if it does not exist locally.
    download(base_url=base_url, filename=filename, download_dir=self.data_dir)
    # Read the data-file.
    path = os.path.join(self.data_dir, filename)
    with gzip.open(path, 'rb') as f:
        data = np.frombuffer(f.read(), np.uint8, offset=offset)
    return data
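# Usage sketch (assumes an MNIST-style dataset class exposing _load_data as above): the
# offsets follow the IDX file format, 16 header bytes for image files and 8 for label files.
# images = dataset._load_data('train-images-idx3-ubyte.gz', offset=16)
# labels = dataset._load_data('train-labels-idx1-ubyte.gz', offset=8)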
def robot(self, url):
    parsed = urlparse(url)
    if parsed.netloc not in self.robot_dict:
        robot_obj = robotparser.RobotFileParser()
        robot_resp = download(parsed.scheme + "://" + parsed.netloc + "/robots.txt",
                              self.config, self.logger)
        if robot_resp.raw_response is not None:
            robot_obj.parse(robot_resp.raw_response.content.decode().split("\n"))
        self.robot_dict[parsed.netloc] = robot_obj
    return self.robot_dict[parsed.netloc]
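# Usage sketch (an assumption; self.config.user_agent is a hypothetical attribute): consult
# the cached RobotFileParser before queueing a URL, so robots.txt is fetched at most once
# per host.
# rp = self.robot(tbd_url)
# if rp.can_fetch(self.config.user_agent, tbd_url):
#     self.frontier.add_url(tbd_url)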
def run(self):
    self.run_command('bdist_app')
    # Get dmgbuild
    from utils.download import download
    dmgbuild = download(
        'https://github.com/morinted/dmgbuild/releases/download/v1.2.1%2Bplover/dmgbuild-1.2.1.pex',
        '548a5c3336fd30b966060b84d86faa9a697b7f94')
    dmg = 'dist/%s.dmg' % PACKAGE
    # Use Apple's built-in python2 to run dmgbuild
    cmd = '/usr/bin/python %s -s osx/dmg_resources/settings.py Plover %s' % (dmgbuild, dmg)
    log.info('running %s', cmd)
    subprocess.check_call(cmd.split())
def download_missing_images(output_folder, annotations):
    print("Downloading images...")
    train, val = annotations
    missing_train = get_missing_files_from_annotations(output_folder, train)
    missing_val = get_missing_files_from_annotations(output_folder, val)
    missing = missing_train + missing_val
    count = 0
    total = len(missing)
    for file_name, url in missing:
        show_progress(count, 1, total)
        try:
            temp_path = file_name + "_"
            if not os.path.isfile(file_name):
                download(url, temp_path)
                shutil.move(temp_path, file_name)
        except:
            print("Issue with ", file_name)
        count += 1
    show_progress(count, 1, total)
def run(self):
    stp_words = list()
    with open('stopwords.txt') as file:
        for line in file:
            line = line.strip()
            stp_words.append(line)
    spider = WebScraper(stp_words)
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            with open('ReportText.txt', 'w+') as f:
                common_dict = spider.most_common_words()
                f.write('Unique Pages Count: ' + str(spider.get_unique_pages_count()) + '\n')
                f.write('\n')
                f.write('Longest Page: \n')
                for key, value in spider.get_longest_page().items():
                    f.write(str(key) + ' -> ' + str(value) + ' words \n')
                f.write('\n')
                count = 0
                f.write('50 Most Common Words: \n')
                for item in common_dict:
                    if count == 50:
                        break
                    else:
                        f.write(str(item[0]) + ' -> ' + str(item[1]) + '\n')
                        count += 1
                f.write('\n')
                f.write('Subdomains in ics.uci.edu: \n')
                for key, value in spider.get_subdomains().items():
                    f.write(str(key) + ' -> ' + str(value) + '\n')
            break
        if self.frontier.check_url_completed(tbd_url):
            print("URL Already marked complete")
            print(tbd_url)
            print("Loading next url")
            continue
        resp = download(tbd_url, self.config, self.logger)
        if resp is None:
            self.logger.info(f"{tbd_url} Timeout")
            continue
        self.logger.info(
            f"Downloaded {tbd_url}, status <{resp.status}>, "
            f"using cache {self.config.cache_server}.")
        scraped_urls = spider.scraper(tbd_url, resp)
        check_robots = self.parse_robots_txt(scraped_urls)
        for scraped_url in check_robots:
            self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
        time.sleep(self.config.time_delay)
def get_pip(args=None):
    # Download `get-pip.py`.
    script = download('https://bootstrap.pypa.io/get-pip.py?test=arg',
                      '3d45cef22b043b2b333baa63abaa99544e9c031d')
    # Make sure wheels cache directory exists to avoid warning.
    if not os.path.exists(WHEELS_CACHE):
        os.makedirs(WHEELS_CACHE)
    # Install pip/wheel...
    get_pip_cmd = [sys.executable, script, '-f', WHEELS_CACHE]
    if args is not None:
        get_pip_cmd.extend(args)
    subprocess.check_call(get_pip_cmd)
    # ...and cache them for the next iteration.
    install_wheels(['--no-install', 'pip', 'wheel'])
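# Usage sketch (an assumption; relies on WHEELS_CACHE, install_wheels, and the download
# helper defined in the surrounding module): bootstrap pip into the current interpreter,
# forwarding extra arguments to get-pip.py.
# get_pip(['--no-warn-script-location'])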
def run(self):
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        resp = download(tbd_url, self.config, self.logger)
        self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                         f"using cache {self.config.cache_server}.")
        scraped_urls, urlInfo, token_list = scraper(tbd_url, resp)
        for scraped_url in scraped_urls:
            self.frontier.add_url(scraped_url)
        self.reporter.add_words(tbd_url, token_list)
        self.frontier.mark_url_complete(tbd_url, urlInfo)
def run(self):
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            print("************* REPORT ****************")
            print()
            print("Team Members:")
            print(" Kamaniya Sathish Kumar (56361951)")
            print(" Samhitha Tarra (69706915)")
            print(" Vani Anilkumar (36335618)")
            print()
            print("Number of Unique URLs:", scraper.count_unique_url)
            print()
            print("Longest URL:", scraper.longest_page)
            print("Number of Tokens in Longest URL:", scraper.num_words_longest_page)
            print()
            print("50 Most Common Words:")
            counter = 1
            for key, value in sorted(scraper.master_freq_dict.items(),
                                     key=lambda x: x[1], reverse=True):
                if counter <= 50:
                    print(str(counter) + ". " + key + " (" + str(value) + ")")
                    counter += 1
                else:
                    break
            print()
            print("Subdomains in ics.uci.edu:")
            for tup, val in sorted(scraper.master_subdomain_dict.items(),
                                   key=lambda x: x[0]):
                url_string = ""
                url_string += tup[1] + "://" + tup[0] + ".ics.uci.edu,"
                print(url_string, val)
            print()
            print("************* REPORT ****************")
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        resp = download(tbd_url, self.config, self.logger)
        self.logger.info(
            f"Downloaded {tbd_url}, status <{resp.status}>, "
            f"using cache {self.config.cache_server}.")
        scraped_urls = scraper.scraper(tbd_url, resp)
        for scraped_url in scraped_urls:
            self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
        time.sleep(self.config.time_delay)
def run(self):
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            break
        resp = download(tbd_url, self.config, self.logger)
        self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                         f"using cache {self.config.cache_server}.")
        if resp.status in range(200, 300):
            scraped_urls = scraper(tbd_url, resp)
            for scraped_url in scraped_urls:
                self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
        time.sleep(self.config.time_delay)
def add_robot(self, base_url):
    resp = download(base_url, self.config, self.logger)
    if resp.raw_response is not None:
        robot_list = resp.raw_response.content.decode().split("\n")
    # Adds the robots.txt to a global dictionary, returning the parsed robots.txt
    if base_url not in self.robots:
        robots_file = RobotFileParser()
        if resp.raw_response is not None and resp.status != 404:
            robots_file.parse(robot_list)
        self.robots[base_url] = robots_file
    return self.robots[base_url]
def run(self):
    while True:
        if len(self.myBackupList) == 0:
            # No more worker urls, search frontier
            tbd_url = self.frontier.get_tbd_url()
            if not tbd_url:
                # if no workers are running, we want to have everyone stop running
                checkAll = False
                for worker in self.workers:
                    if len(worker.myBackupList) != 0:
                        # Some worker is still running
                        checkAll = True
                if checkAll == False:
                    time.sleep(2)
                if checkAll == False:
                    for worker in self.workers:
                        if len(worker.myBackupList) != 0:
                            # Some worker is still running! :O
                            checkAll = True
                    if checkAll != True:
                        time.sleep(2)
                if checkAll == False:
                    break
            else:
                wID = self.urlID(tbd_url)
                if wID != self.worker_id:
                    # Not my url, give to someone else
                    self.workers[wID].addToMine(tbd_url)
                else:
                    self.myBackupList.append(tbd_url)
        else:
            # Take url belonging to this worker
            tbd_url = self.myBackupList.pop(len(self.myBackupList) - 1)
            try:
                resp = download(tbd_url, self.config, self.logger)
                self.logger.info(
                    f"Downloaded {tbd_url}, status <{resp.status}>, "
                    f"using cache {self.config.cache_server}.")
                scraped_urls = scraper(tbd_url, resp, self.wordCounts,
                                       self.uniqueURLs, self.uniqueFP)
                for scraped_url in scraped_urls:
                    self.frontier.add_url(scraped_url)
            except:
                # print("Timeout error (5 seconds):", tbd_url)
                pass
            self.frontier.mark_url_complete(tbd_url)
            time.sleep(self.config.time_delay)
def run(self):
    while True:
        tbd_url = self.frontier.get_tbd_url()
        if not tbd_url:
            self.logger.info("Frontier is empty. Stopping Crawler.")
            build_report(scraper.uniqueURLs, scraper.subDomains,
                         scraper.words, scraper.longest_page)
            break
        resp = download(tbd_url, self.config, self.logger)
        self.logger.info(f"Downloaded {tbd_url}, status <{resp.status}>, "
                         f"using cache {self.config.cache_server}.")
        scraped_urls = scraper.scraper(tbd_url, resp)
        for scraped_url in scraped_urls:
            self.frontier.add_url(scraped_url)
        self.frontier.mark_url_complete(tbd_url)
        time.sleep(self.config.time_delay)
def rewrite_rules(path=DEFAULT_PATH):
    """Deploys rewrite rules."""
    remote_path = os.path.join(path, ".htaccess")
    geo_config = [
        "#BEGIN RevelIP\n",
        "<IfModule mod_rewrite.c>\n",
        "RewriteEngine On\n",
        "RewriteBase /\n",
        "RewriteRule ^index\\.php$ /geo.php [L]\n",
        "</IfModule>\n",
        "#End RevelIP\n",
    ]
    with download(remote_path) as local_file_name:
        with open(local_file_name, "r") as f:
            content = f.readlines()
        result_config = []
        skip_mode = False
        for l in content:
            if "BEGIN RevelIP" in l:
                skip_mode = True
                continue
            if "End RevelIP" in l:
                skip_mode = False
                continue
            if skip_mode:
                continue
            if "BEGIN WordPress" in l:
                result_config.extend(geo_config)
            result_config.append(l)
        with open(local_file_name, "w") as f:
            f.write("".join(result_config))
        put(local_file_name, remote_path)
        sudo("chown %s:%s %s" % (WWW_USER, WWW_USER, remote_path))
        local("cat %s" % local_file_name)
def _download(self, url, checksum):
    from utils.download import DOWNLOADS_DIR, download
    if not self.dry_run:
        return download(url, checksum)
    return os.path.join(DOWNLOADS_DIR, os.path.basename(url))
def run(self):
    # Download helper.
    from utils.download import download
    # Run command helper.
    def run(*args):
        if self.verbose:
            log.info('running %s', ' '.join(a for a in args))
        subprocess.check_call(args)
    # First things first: create Plover wheel.
    wheel_cmd = self.get_finalized_command('bdist_wheel')
    wheel_cmd.run()
    plover_wheel = glob.glob(os.path.join(wheel_cmd.dist_dir,
                                          wheel_cmd.wheel_dist_name) + '*.whl')[0]
    # Setup embedded Python distribution.
    # Note: python35.zip is decompressed to prevent errors when 2to3
    # is used (including indirectly by setuptools `build_py` command).
    py_embedded = download('https://www.python.org/ftp/python/3.5.2/python-3.5.2-embed-win32.zip',
                           'a62675cd88736688bb87999e8b86d13ef2656312')
    dist_dir = os.path.join(wheel_cmd.dist_dir, PACKAGE + '-win32')
    data_dir = os.path.join(dist_dir, 'data')
    stdlib = os.path.join(data_dir, 'python35.zip')
    if os.path.exists(dist_dir):
        shutil.rmtree(dist_dir)
    os.makedirs(data_dir)
    for path in (py_embedded, stdlib):
        with zipfile.ZipFile(path) as zip:
            zip.extractall(data_dir)
    os.unlink(stdlib)
    dist_py = os.path.join(data_dir, 'python.exe')
    # Install pip/wheel.
    run(dist_py, '-m', 'utils.get_pip')
    # Install Plover and dependencies.
    # Note: do not use the embedded Python executable with `setup.py
    # install` to prevent setuptools from installing extra development
    # dependencies...
    run(dist_py, '-m', 'utils.install_wheels',
        '-r', 'requirements_distribution.txt')
    run(dist_py, '-m', 'utils.install_wheels',
        '--ignore-installed', '--no-deps', plover_wheel)
    # List installed packages.
    if self.verbose:
        run(dist_py, '-m', 'pip', 'list', '--format=columns')
    # Trim the fat...
    if self.trim:
        from utils.trim import trim
        trim(data_dir, 'windows/dist_blacklist.txt', verbose=self.verbose)
    # Add miscellaneous files: icon, license, ...
    for src, target_dir in (
        ('LICENSE.txt', '.'),
        ('plover/assets/plover.ico', 'data'),
    ):
        dst = os.path.join(dist_dir, target_dir, os.path.basename(src))
        shutil.copyfile(src, dst)
    # Create launchers.
    for entrypoint, gui in (
        ('plover = plover.main:main', True),
        ('plover_console = plover.main:main', False),
    ):
        run(dist_py, '-c', textwrap.dedent(
            '''
            from pip._vendor.distlib.scripts import ScriptMaker
            sm = ScriptMaker(source_dir='{dist_dir}', target_dir='{dist_dir}')
            sm.executable = 'data\\python.exe'
            sm.variants = set(('',))
            sm.make('{entrypoint}', options={{'gui': {gui}}})
            '''.rstrip()).format(dist_dir=dist_dir, entrypoint=entrypoint, gui=gui))
    # Make distribution source-less.
    run(dist_py, '-m', 'utils.source_less',
        # Don't touch pip._vendor.distlib sources,
        # or `pip install` will not be usable...
        data_dir, '*/pip/_vendor/distlib/*')
    # Zip results.
    if self.zipdir:
        from utils.zipdir import zipdir
        if self.verbose:
            log.info('zipping %s', dist_dir)
        zipdir(dist_dir)
    # Create an installer.
    if self.installer:
        installer_exe = '%s.setup.exe' % dist_dir
        # Compute install size for "Add/Remove Programs" entry.
        install_size = sum(os.path.getsize(os.path.join(dirpath, f))
                           for dirpath, dirnames, filenames in os.walk(dist_dir)
                           for f in filenames)
        run('makensis.exe', '-NOCD',
            '-Dsrcdir=' + dist_dir,
            '-Dversion=' + __version__,
            '-Dinstall_size=' + str(install_size // 1024),
            'windows/installer.nsi',
            '-XOutFile ' + installer_exe)