def jobVerification(download_dir):
    """Check whether the current job's directory already exists under download_dir.

    Side effect: sets the module-level resume_job flag.

    Returns True when it is OK to proceed (the job is new, or the user
    chose to resume it); False when the job exists and the user declined.
    """
    global resume_job
    # NOTE(review): job_name is a module-level global assigned elsewhere
    # (main's option parsing) -- confirm it is set before this is called.
    if not os.path.exists(download_dir + '/' + job_name):
        # Fresh job: nothing to resume, safe to proceed.
        resume_job = False
        return True
    # Job directory already present: ask the user whether to resume.
    resume_yn = consoleutils.readKeyboard(
        "The job already exists, do you want to resume the download? (y/n):",
        '^y$|^n$', 'Select y for Yes or n for No', None)
    # 'y' -> resume (proceed); 'n' -> do not resume (abort).
    resume_job = (resume_yn == 'y')
    return resume_job
def main(argv): #This is where is all begins... print 'Anaconda v1 [socialray.org]' print '' #if the scaper directory does not exist, create one. download_dir = os.getcwd() + '/downloads' if not os.path.exists(download_dir): print download_dir + ' directory was not found. Creating....' os.mkdir(download_dir) #standard text for invalid entry invalid = 'Invalid entry, re-enter.' #read options from the command-line options, args = getopt.getopt(args, '', ['name=', 'resume', 'downloadall', 'depth=', 'starturl=']) #if no options are provided, print out the help text. if len(options) == 0: print "Usage is python anaconda.py name=some_job_name" continue_if_exists = False downloadcategories = True download_all = False depth = 0 for opt, val in options: if opt in ("-n", "--name"): job_name = value if opt in ("-r", "--resume"): resume_if_exists = True if opt in ("-a", "--downloadall"): downloadcategories = True if opt in ("-d", "--depth"): depth = val if (Job.exists(job_name)): if not resume_if_exists: if os.path.exists(self.job_dir): #job exists, we cannot proceed print "The specified job already exists. To resume a job, attach the '--resume' option" sys.exit(1) #EXIT! job = Job(job_name, download_dir, depth) job.start() if not resume_job: dload_target = consoleutils.readKeyboard('What do I download? Categories=c, Categories and Articles=a (default is c):', '^c$|^a$', invalid, 'c') #wiki will ban us if delay < 1 second fetch_delay = string.atoi(consoleutils.readKeyboard("Input Fetch Delay Seconds n (default is 2):", '^\d+$', 'Enter a number, or press enter to accept the default', '2')) scan_depth = string.atoi(consoleutils.readKeyboard("Max depth while crawling? 
(4):", '^\d+$', 'Enter a number, or press enter to accept the default', '4')) #the download starts from this page start_url = consoleutils.readKeyboard('Url to begin downloading from:', None, invalid, None) #save to the job settings file job_settings_file = open(job_settings_filename, 'w') job_settings_file.writelines([dload_target + '\n', str(fetch_delay) + '\n', str(scan_depth) + '\n']) job_settings_file.close() else: #read from the previously saved settings file job_settings_file = open(job_settings_filename, 'r') dload_target, fetch_delay, scan_depth = withoutNewlines(job_settings_file) fetch_delay = string.atoi(fetch_delay) scan_depth = string.atoi(scan_depth) job_settings_file.close() try: cat_dload_queue = Queue.Queue() entry_dload_queue = Queue.Queue() cat_download_count = entry_download_count = 0 if not resume_job: openJobFiles('w') addToCategoryQueue(start_url) print "Started download at " + str(datetime.datetime.utcnow()) else: ok = rebuildState() openJobFiles('a') #open for append if ok: print "Resumed download at " + str(datetime.datetime.utcnow()) else: sys.exit() if dload_target == 'c': downloadCategories() if dload_target == 'a': downloadCategories() downloadEntries() print "Download completed at " + str(datetime.datetime.utcnow()) except: print sys.exc_info() closeJobFiles() print "Download aborted at " + str(datetime.datetime.utcnow())