def import_ia(url, *, from_date=None, to_date=None, maintainers=None,
              tags=None, skip_unchanged='resolved-response'):
    skip_responses = skip_unchanged == 'response'
    with ia.WaybackClient() as wayback:
        # Pulling on this generator does the work.
        versions = (wayback.timestamped_uri_to_version(version.date,
                                                       version.raw_url,
                                                       url=version.url,
                                                       maintainers=maintainers,
                                                       tags=tags,
                                                       view_url=version.view_url)
                    for version in wayback.list_versions(url,
                                                         from_date=from_date,
                                                         to_date=to_date,
                                                         skip_repeats=skip_responses))
        if skip_unchanged == 'resolved-response':
            versions = _filter_unchanged_versions(versions)
        _add_and_monitor(versions)
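# Hedged usage sketch for import_ia. The URL, dates, maintainers, and tags
# below are hypothetical examples; this assumes import_ia and its dependencies
# (ia, plus the DB client behind _add_and_monitor) are configured in this module.
def _example_import_ia():
    from datetime import datetime
    import_ia('https://www.epa.gov/cleanpowerplan',
              from_date=datetime(2017, 1, 1),
              to_date=datetime(2017, 6, 1),
              maintainers=['EPA'],
              tags=['example-import'],
              skip_unchanged='resolved-response')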
def wayback_exist(url, dates):
    # Return True if the Wayback Machine has any capture of `url` in the
    # requested date range, False otherwise.
    try:
        with internetarchive.WaybackClient() as client:
            # list_versions calls the CDX API (internetarchive.py in the
            # web-monitoring repo) and returns ALL instances within the date
            # range that the page has been documented in the Archive.
            dump = client.list_versions(
                url,
                from_date=datetime(dates[0], dates[1], dates[2]),
                to_date=datetime(dates[3], dates[4], dates[5]))
            # Materializing the generator raises if the Archive contains no
            # data in the requested range.
            try:
                list(dump)
                return True
            except Exception:
                return False
    except Exception:
        return False
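# Hedged usage sketch for wayback_exist: `dates` is the six-element
# [start_year, start_month, start_day, end_year, end_month, end_day] list the
# function expects. The URL is illustrative.
def _example_wayback_exist():
    dates = [2017, 1, 1, 2017, 12, 31]
    if wayback_exist('https://www.epa.gov/climatechange', dates):
        print('The Wayback Machine has captures in this range.')
    else:
        print('No captures found in this range.')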
def __init__(self, records, results_queue, maintainers, tags, cancel,
             failure_queue=None, session_options=None, unplaybackable=None):
    super().__init__()
    self.summary = self.create_summary()
    self.results_queue = results_queue
    self.failure_queue = failure_queue
    self.cancel = cancel
    self.records = records
    self.maintainers = maintainers
    self.tags = tags
    self.unplaybackable = unplaybackable
    session_options = session_options or dict(retries=3, backoff=2,
                                              timeout=(30.5, 2))
    session = ia.WaybackSession(**session_options)
    self.wayback = ia.WaybackClient(session=session)
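# Hedged construction sketch for this worker (assumed here to be the
# WaybackRecordsWorker class referenced by import_ia_urls below). The queue
# types and cancel event mirror what import_ia_urls passes in; the retry,
# backoff, and timeout values are illustrative overrides of the defaults above.
def _example_worker_setup():
    import threading
    records = utils.FiniteQueue()
    results = utils.FiniteQueue()
    cancel = threading.Event()
    worker = WaybackRecordsWorker(
        records, results, maintainers=['EPA'], tags=['example'], cancel=cancel,
        session_options=dict(retries=5, backoff=4, timeout=(30.5, 5)))
    return worker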
def _list_ia_versions_for_urls(url_patterns, from_date, to_date,
                               skip_repeats=True, version_filter=None,
                               client=None, stop=None):
    version_filter = version_filter or _is_page
    skipped = 0

    with client or ia.WaybackClient() as client:
        for url in url_patterns:
            if stop and stop.is_set():
                break

            ia_versions = client.list_versions(url,
                                               from_date=from_date,
                                               to_date=to_date,
                                               skip_repeats=skip_repeats)
            try:
                for version in ia_versions:
                    if stop and stop.is_set():
                        break
                    if version_filter(version):
                        yield version
                    else:
                        skipped += 1
                        logger.debug('Skipping URL "%s"', version.url)
            except ia.BlockedByRobotsError as error:
                logger.warning(f'CDX search error: {error!r}')
            except ValueError as error:
                # NOTE: this isn't really an exceptional case; list_versions()
                # raises ValueError when Wayback has no matching records.
                # TODO: there should probably be no exception in this case.
                if 'does not have archived versions' not in str(error):
                    logger.warning(repr(error))
            except ia.WaybackException as error:
                logger.error(f'Error getting CDX data for {url}: {error!r}')
            except Exception:
                # Need to handle the exception here to let iteration continue
                # and allow other threads that might be running to be joined.
                logger.exception(f'Error processing versions of {url}')

    if skipped > 0:
        logger.info('Skipped %s URLs that did not match filters', skipped)
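# Hedged sketch of a version_filter callable for _list_ia_versions_for_urls:
# any predicate that takes a CDX version record and returns True to keep it.
# The version.url attribute is used by the generator above; the filtering rule
# itself (skipping static-asset URLs) is purely illustrative.
def _example_html_only_filter(version):
    # Keep only captures whose URL does not look like a static asset.
    return not version.url.lower().endswith(('.css', '.js', '.png', '.jpg', '.pdf'))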
def import_ia_urls(urls, *, from_date=None, to_date=None,
                   maintainers=None, tags=None,
                   skip_unchanged='resolved-response',
                   version_filter=None, worker_count=0,
                   create_pages=True, unplaybackable_path=None,
                   dry_run=False):
    skip_responses = skip_unchanged == 'response'
    worker_count = worker_count if worker_count > 0 else PARALLEL_REQUESTS
    unplaybackable = load_unplaybackable_mementos(unplaybackable_path)

    with utils.QuitSignal((signal.SIGINT, signal.SIGTERM)) as stop_event:
        cdx_records = utils.FiniteQueue()
        cdx_thread = threading.Thread(target=lambda: utils.iterate_into_queue(
            cdx_records,
            _list_ia_versions_for_urls(
                urls,
                from_date,
                to_date,
                skip_responses,
                version_filter,
                # Use a custom session to make sure CDX calls are extra robust.
                client=ia.WaybackClient(ia.WaybackSession(retries=10, backoff=4)),
                stop=stop_event)))
        cdx_thread.start()

        summary = {}
        versions_queue = utils.FiniteQueue()
        memento_thread = threading.Thread(
            target=lambda: WaybackRecordsWorker.parallel_with_retries(
                worker_count,
                summary,
                cdx_records,
                versions_queue,
                maintainers,
                tags,
                stop_event,
                unplaybackable,
                tries=(None,
                       dict(retries=3, backoff=4, timeout=(30.5, 2)),
                       dict(retries=7, backoff=4, timeout=60.5))))
        memento_thread.start()

        uploadable_versions = versions_queue
        if skip_unchanged == 'resolved-response':
            uploadable_versions = _filter_unchanged_versions(versions_queue)
        if dry_run:
            uploader = threading.Thread(
                target=lambda: _log_adds(uploadable_versions))
        else:
            uploader = threading.Thread(target=lambda: _add_and_monitor(
                uploadable_versions, create_pages, stop_event))
        uploader.start()

        cdx_thread.join()
        memento_thread.join()

        print('\nLoaded {total} CDX records:\n'
              '  {success:6} successes ({success_pct:.2f}%),\n'
              '  {playback:6} could not be played back ({playback_pct:.2f}%),\n'
              '  {missing:6} had no actual memento ({missing_pct:.2f}%),\n'
              '  {unknown:6} unknown errors ({unknown_pct:.2f}%).'.format(
                  **summary))

        uploader.join()

        if not dry_run:
            print('Saving list of non-playbackable URLs...')
            save_unplaybackable_mementos(unplaybackable_path, unplaybackable)
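# Hedged usage sketch for import_ia_urls: the URL pattern, dates, and worker
# count are illustrative, and dry_run=True keeps the sketch from writing
# anything to the database.
def _example_import_ia_urls():
    from datetime import datetime
    import_ia_urls(
        ['https://www.epa.gov/climatechange*'],
        from_date=datetime(2017, 1, 1),
        to_date=datetime(2017, 6, 1),
        tags=['example-import'],
        worker_count=4,
        dry_run=True)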
def counter(file, terms, dates):
    # Counts a set of one- or two-word terms during a single timeframe.
    # dates should be in the following form:
    #   [starting year, starting month, starting day, ending year, ending month, ending day]
    # terms should be in the format ["term"], as a phrase: ["climate", "change"],
    # or as a set of terms and/or phrases: ["climate", ["climate", "change"]]

    # Read the URLs.
    with open(file) as csvfile:
        read = csv.reader(csvfile)
        data = list(read)

    # Start the matrix that we'll put term counts into.
    row_count = len(data)
    column_count = len(terms)
    matrix = numpy.zeros((row_count, column_count), dtype=numpy.int16)
    print(row_count, column_count)

    for pos, row in enumerate(data):
        thisPage = row[0]
        try:
            with internetarchive.WaybackClient() as client:
                # list_versions calls the CDX API from internetarchive.py in
                # the web-monitoring repo.
                dump = client.list_versions(
                    thisPage,
                    from_date=datetime(dates[0], dates[1], dates[2]),
                    to_date=datetime(dates[3], dates[4], dates[5]))
                versions = reversed(list(dump))
                for version in versions:  # for each version in all the snapshots
                    if version.status_code == '200' or version.status_code == '-':
                        # The IA snapshot was viable.
                        url = version.raw_url
                        contents = requests.get(url).content.decode()  # decode the url's HTML
                        contents = BeautifulSoup(contents, 'lxml')
                        body = contents.find('body')
                        # Remove portions of the webpage we don't want to count.
                        d = [s.extract() for s in body('footer')]
                        d = [s.extract() for s in body('header')]
                        d = [s.extract() for s in body('nav')]
                        d = [s.extract() for s in body('script')]
                        d = [s.extract() for s in body('style')]
                        del d
                        body = [text for text in body.stripped_strings]

                        # Count terms:
                        for p, t in enumerate(terms):
                            if type(t) is list:
                                page_sum = two_count(t, body)
                            else:
                                page_sum = count(t, body)
                            # Put the count of the term in the right spot in the matrix.
                            matrix[pos][p] = page_sum

                        keywords[url] = keyword_function(body)
                        final_urls[thisPage] = [url, row[3]]
                        print(pos)
                        break
                    else:
                        pass
        except Exception:
            print("fail")
            final_urls[thisPage] = ["", thisPage]
            matrix[pos] = 999

    unique, counts = numpy.unique(matrix, return_counts=True)
    results = dict(zip(unique, counts))
    print(results)

    # For writing the term count to a CSV. You will then need to convert
    # delimited text to columns and replace the first column with the list of URLs.
    with open('outputs/counts.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=' ', quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        for row in matrix:
            writer.writerow(row)

    # Print out urls in a separate file.
    with open('outputs/urls.csv', 'w') as output:
        writer = csv.writer(output)
        for key, value in final_urls.items():
            writer.writerow([key, value[0], value[1]])

    # Print out top three keywords in a separate file.
    with open("outputs/keywords.csv", "w", encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        for key, value in keywords.items():
            try:
                writer.writerow([key, value[0], value[1], value[2]])
            except IndexError:
                writer.writerow([key, "ERROR"])

    print("The program is finished!")
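# Hedged usage sketch for counter(): the CSV path, terms, and date list are
# illustrative. Note that counter() also relies on module-level `keywords` and
# `final_urls` dicts plus the count(), two_count(), and keyword_function()
# helpers defined elsewhere in this script.
def _example_counter_run():
    terms = ['climate', ['climate', 'change'], 'regulation']
    dates = [2017, 1, 1, 2017, 12, 31]
    counter('inputs/urls.csv', terms, dates)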
def linker(file, domain, datesA, datesB=[]):
    # Currently only accepts looking at how a set of URLs point to each other
    # (a square matrix) and is meant to look at links within a single domain,
    # e.g. "http://www.epa.gov". File should be a csv of links like: "/cleanpowerplan/page"
    # datesA should be in the following form:
    #   [starting year, starting month, starting day, ending year, ending month, ending day]
    # datesB is optional (for comparing two time periods) and should be in the same format as datesA.
    dates = {'first': datesA, 'second': datesB}
    finalURLs = {}

    # Build the outgoing link matrix.
    with open(file) as csvfile:
        read = csv.reader(csvfile)
        data = list(read)  # put the csv data in an array

    row_count = len(data)
    matrix = numpy.zeros((row_count, row_count), dtype=numpy.int8)  # create matrix
    urls = []
    for row in data:
        finalURLs[domain + row[0]] = []
        urls.append(row[0])  # compile list of all urls to check against later

    times = 1
    if len(dates['second']) > 0:
        times = 2
    position = 1

    # Loop through data, call the CDX API, and populate the matrix.
    while position <= times:
        if position == 1:
            theseDates = dates['first']
            # These are the numeric codes used to ID link status in
            # timeframe A, B, and combined (A+B).
            connection = 1
            decoding_error = 8
            WM_error = 9
        else:
            theseDates = dates['second']
            connection = 3
            decoding_error = 14
            WM_error = 16

        for pos, row in enumerate(data):
            thisPage = domain + row[0]  # for urls_shortened.csv use: 'http://www.epa.gov' + row[0]
            try:
                with internetarchive.WaybackClient() as client:
                    # list_versions calls the CDX API from internetarchive.py
                    # in the web-monitoring repo.
                    dump = client.list_versions(
                        thisPage,
                        from_date=datetime(theseDates[0], theseDates[1], theseDates[2]),
                        to_date=datetime(theseDates[3], theseDates[4], theseDates[5]))
                    versions = reversed(list(dump))  # start from the most recent snapshots
                    for version in versions:  # for each version in all the snapshots
                        if version.status_code == '200':
                            # If the IA snapshot was a redirect or page not found,
                            # move to the next snapshot version.
                            try:
                                contents = requests.get(version.raw_url).content.decode()  # decode the url's HTML
                                contents = BeautifulSoup(contents, 'lxml')
                                # Remove portions of the webpage we don't want to count.
                                d = [s.extract() for s in contents('script')]
                                d = [s.extract() for s in contents('style')]
                                del d
                                contents = contents.find("body")
                                links = contents.find_all('a')  # find all outgoing links
                                thisPageLinksTo = []
                                for link in links:
                                    # For each outgoing link, strip away the name etc. to just the href.
                                    thisPageLinksTo.append(link['href'])
                                # Use keys/columns and check against links: is x key/column in links?
                                # Does this page link to another? If so, record the connection code
                                # at the right position, matrix[row][column].
                                for i, url in enumerate(urls):
                                    if url in thisPageLinksTo:  # if this page links to another domain url
                                        matrix[pos][i] = connection
                                finalURLs[thisPage].append(version.raw_url)
                                print(pos)
                                break
                            except Exception:
                                finalURLs[thisPage].append("decoding error")
                                matrix[pos] = decoding_error  # code for indicating a decoding error
                                break
                        else:
                            pass
            except Exception:
                finalURLs[thisPage].append("WM error")
                matrix[pos] = WM_error  # code for indicating an IA/WM error

        if position == 1:
            matrixA = matrix
            matrix = numpy.zeros((row_count, row_count), dtype=numpy.int8)  # reset matrix
        else:
            matrixB = matrix
        position = position + 1

    if len(dates['second']) > 0:
        final_matrix = numpy.add(matrixA, matrixB)
    else:
        final_matrix = matrixA

    with open('outputs/urls.csv', 'w') as output:
        writer = csv.writer(output)
        for key, value in finalURLs.items():
            try:
                writer.writerow([key, value[0], value[1]])
            except IndexError:
                writer.writerow([key, "ERROR"])

    with open('outputs/links.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=' ', quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        for row in final_matrix:
            writer.writerow(row)

    with open('outputs/linksA.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=' ', quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        for row in matrixA:
            writer.writerow(row)

    # Only write linksB.csv when a second time period was provided; otherwise
    # matrixB was never created.
    if len(dates['second']) > 0:
        with open('outputs/linksB.csv', 'w', newline='') as csvfile:
            writer = csv.writer(csvfile, delimiter=' ', quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
            for row in matrixB:
                writer.writerow(row)

    print("The program is finished!")
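# Hedged usage sketch for linker(): the file of relative paths, the domain, and
# the two date ranges are illustrative. Passing an empty datesB analyzes a
# single time period only.
def _example_linker_run():
    datesA = [2016, 1, 1, 2016, 12, 31]
    datesB = [2017, 1, 1, 2017, 12, 31]
    linker('inputs/epa_paths.csv', 'http://www.epa.gov', datesA, datesB)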
def counter(file, terms, dates):
    # Example inputs:
    # terms = ['adaptation', ['Agency', 'Mission'], ['air', 'quality'], 'anthropogenic',
    #          'benefits', 'Brownfield', ['clean', 'energy'], 'Climate', ['climate', 'change'],
    #          'Compliance', 'Cost-effective', 'Costs', 'Deregulatory', 'deregulation',
    #          'droughts', ['economic', 'certainty'], ['economic', 'impacts'], 'economic',
    #          'Efficiency', 'Emissions', ['endangered', 'species'], ['energy', 'independence'],
    #          'Enforcement', ['environmental', 'justice'], ['federal', 'customer'],
    #          ['fossil', 'fuels'], 'Fracking', ['global', 'warming'], 'glyphosate',
    #          ['greenhouse', 'gases'], ['horizontal', 'drilling'], ['hydraulic', 'fracturing'],
    #          'Impacts', 'Innovation', 'Jobs', 'Mercury', 'Methane', 'pesticides', 'pollution',
    #          'Precautionary', ['regulatory', 'certainty'], 'regulation', 'Resilience', 'Risk',
    #          'Safe', 'Safety', ['sensible', 'regulations'], 'state', 'storms', 'sustainability',
    #          'Toxic', 'transparency', ['Unconventional', 'gas'], ['unconventional', 'oil'],
    #          ['Water', 'quality'], 'wildfires']
    # file = 'all Versionista URLs 10-16-18.csv'

    with open(file) as csvfile:
        read = csv.reader(csvfile)
        data = list(read)

    row_count = len(data)
    column_count = len(terms)
    matrix = numpy.zeros((row_count, column_count), dtype=numpy.int16)
    print(row_count, column_count)

    for pos, row in enumerate(data):
        thisPage = row[0]  # change for specific CSVs
        try:
            with internetarchive.WaybackClient() as client:
                # list_versions calls the CDX API from internetarchive.py in
                # the web-monitoring repo.
                dump = client.list_versions(
                    thisPage,
                    from_date=datetime(dates[0], dates[1], dates[2]),
                    to_date=datetime(dates[3], dates[4], dates[5]))
                versions = reversed(list(dump))
                for version in versions:  # for each version in all the snapshots
                    if version.status_code == '200' or version.status_code == '-':
                        # The IA snapshot was viable.
                        url = version.raw_url
                        contents = requests.get(url).content.decode()  # decode the url's HTML
                        contents = BeautifulSoup(contents, 'lxml')
                        body = contents.find('body')
                        # Remove portions of the webpage we don't want to count.
                        d = [s.extract() for s in body('footer')]
                        d = [s.extract() for s in body('header')]
                        d = [s.extract() for s in body('nav')]
                        d = [s.extract() for s in body('script')]
                        d = [s.extract() for s in body('style')]
                        d = [s.extract() for s in body.select('div > #menuh')]  # FWS
                        d = [s.extract() for s in body.select('div > #siteFooter')]  # FWS
                        d = [s.extract() for s in body.select('div.primary-nav')]  # DOE
                        d = [s.extract() for s in body.select('div > #nav-homepage-header')]  # OSHA
                        d = [s.extract() for s in body.select('div > #footer-two')]  # OSHA
                        del d
                        body = [text for text in body.stripped_strings]

                        for p, t in enumerate(terms):
                            if type(t) is list:
                                page_sum = two_count(t, body)
                            else:
                                page_sum = count(t, body)
                            matrix[pos][p] = page_sum  # put the count of the term in the matrix

                        keywords[url] = keyword_function(body)
                        final_urls[thisPage] = [url, row[3]]
                        print(pos)
                        break
                    else:
                        pass
        except Exception:
            print("fail")
            final_urls[thisPage] = ["", row[3]]
            matrix[pos] = 999

    unique, counts = numpy.unique(matrix, return_counts=True)
    results = dict(zip(unique, counts))
    print(results)

    # For writing term counts to a csv. You will need to convert delimited text
    # to columns and replace the first column with the list of URLs.
    with open('outputs/counts.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile, delimiter=' ', quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        for row in matrix:
            writer.writerow(row)

    # Print out urls in a separate file.
    with open('outputs/urls.csv', 'w') as output:
        writer = csv.writer(output)
        for key, value in final_urls.items():
            writer.writerow([key, value[0], value[1]])

    # Print out keywords in a separate file.
    with open("outputs/keywords.csv", "w", encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        for key, value in keywords.items():
            try:
                writer.writerow([key, value[0], value[1], value[2]])
            except IndexError:
                writer.writerow([key, "ERROR"])

    print("The program is finished!")
        thisPage = elm[0]  # grab the url

        # Save the url to the Wayback Machine now.
        if now_indic == 1:
            try:
                r = requests.get('https://web.archive.org/save/' + thisPage)
            except Exception:
                continue

        with open(counts_file_name, 'a', newline='') as output:
            writer = csv.writer(output)
            writer.writerow("")

        try:
            with internetarchive.WaybackClient() as client:
                # dump returns ALL instances within the date range that the page
                # has been documented in the Archive. list_versions calls the CDX
                # API from internetarchive.py in the web-monitoring repo.
                dump = client.list_versions(
                    thisPage,
                    from_date=datetime(dates[0], dates[1], dates[2]),
                    to_date=datetime(dates[3], dates[4], dates[5]))
                results.append("\n" + thisPage + "\n")
                sys.stdout.write("\n" + thisPage + "\n")
                # Indicator variable to tell whether the Archive has pages in
                # the requested date range.
                achive_indicator = 0
                try:
                    # Get the versions if the Archive contains data in the requested range.