def test_get_memento_uri_default(input_uri_r, input_datetime, expected_uri_m):
    """A default-configured client resolves the expected closest URI-M."""
    client = MementoClient()
    info = client.get_memento_info(input_uri_r, input_datetime)
    closest = info.get("mementos").get("closest")
    assert closest.get("uri")[0] == expected_uri_m
def test_get_native_timegate_uri(input_uri_r, input_datetime, expected_uri_g):
    """Discovery of a site's own (native) TimeGate matches the expected URI-G."""
    client = MementoClient(check_native_timegate=True)
    discovered = client.get_native_timegate_uri(input_uri_r, input_datetime)
    assert discovered == expected_uri_g
def test_get_memento_uri_specified_timegate_direct_timegate_query(input_uri_r, input_datetime, input_timegate, expected_uri_m):
    """Querying a user-specified TimeGate directly yields the expected URI-M."""
    client = MementoClient(timegate_uri=input_timegate,
                           check_native_timegate=False)
    info = client.get_memento_info(input_uri_r, input_datetime,
                                   include_uri_checks=False)
    assert info.get("mementos").get("closest").get("uri")[0] == expected_uri_m
def url_list(request):
    """List stored URLs; on POST, fetch the submitted URL, record its title,
    final destination, status code, and closest memento, then redirect to
    the detail page.

    Fixes over the previous version: the bare ``except:`` is narrowed to
    ``except Exception`` (so SystemExit/KeyboardInterrupt pass through), the
    user-facing placeholder typo "Does not exit" is corrected to
    "Does not exist", and the template now receives a form *instance* (the
    bound one after an invalid POST, a fresh one on GET) instead of the
    ``URLForm`` class.

    NOTE(review): ``requests.get(post)`` passes the unsaved model instance
    rather than a URL string -- this only works if the model's ``__str__``
    returns the URL; confirm against the model definition.
    """
    urls = URL.objects.all()
    if request.method == "POST":
        form = URLForm(request.POST)
        if form.is_valid():
            post = form.save(commit=False)
            try:
                response = requests.get(post)
                soup = BeautifulSoup(response.content, "lxml")
                post.title = soup.title.string
                post.finalDestination = response.url
                post.statusCode = response.status_code
                dt = datetime.datetime.now()
                mc = MementoClient()
                closest = mc.get_memento_info(
                    post.finalDestination, dt).get("mementos").get("closest")
                post.uri = closest.get('uri')[0]
                post.datetime = str(closest.get('datetime'))
            except Exception:
                # Best-effort fallback: record placeholders rather than
                # failing the whole request when fetch/lookup fails.
                post.statusCode = "None"
                post.finalDestination = "Does not exist"
                post.title = "No title"
            finally:
                post.save()
                return redirect('url_detail', pk=post.pk)
    else:
        form = URLForm()
    return render(request, 'lab1/url_list.html', {'urls': urls, 'form': form})
def url_list(request):
    """Render the URL list; on POST, fetch the submitted URL, record its
    metadata and closest memento, then redirect to the detail view."""
    urls = URL.objects.all()
    if request.method == "POST":
        form = URLForm(request.POST)
        if form.is_valid():
            post = form.save(commit=False)
            try:
                # NOTE(review): the unsaved model instance is passed to
                # requests.get(); presumably the model's __str__ returns the
                # URL -- confirm against the model definition.
                response = requests.get(post)
                temp = BeautifulSoup(response.content, "lxml")
                post.title = temp.title.string
                post.finalDestination = response.url
                post.statusCode = response.status_code
                dt = datetime.datetime.now()
                mc = MementoClient()
                uri = post.finalDestination
                memento_uri = mc.get_memento_info(
                    uri, dt).get("mementos").get("closest")
                post.uri = memento_uri.get('uri')[0]
                post.datetime = str(memento_uri.get('datetime'))
            except:
                # Best-effort fallback values on any failure.
                # NOTE(review): bare except hides real errors, and
                # "Does not exit" is likely a typo for "Does not exist".
                post.statusCode = "None"
                post.finalDestination = "Does not exit"
                post.title = "No title"
                pass
            finally:
                # Save whatever we gathered and jump to the detail page.
                post.save()
                return redirect('url_detail', pk=post.pk)
    else:
        # NOTE(review): this assigns the form *class*, not an instance,
        # and the class is what the template receives below.
        form = URLForm
    return render(request, 'lab1/url_list.html', {
        'urls': urls,
        'form': URLForm
    })
def test_mementos_not_in_archive_uri(input_uri_r, input_datetime, input_uri_g):
    """Even with no memento in the archive, original_uri echoes the URI-R."""
    client = MementoClient(timegate_uri=input_uri_g)
    epoch = datetime.datetime.strptime("Thu, 01 Jan 1970 00:00:00 GMT",
                                       "%a, %d %b %Y %H:%M:%S GMT")
    info = client.get_memento_info(input_uri_r, epoch)
    assert info.get("original_uri") == input_uri_r
def test_get_memento_uri_default(input_uri_r, input_datetime, expected_uri_m):
    """The default client returns the expected closest memento URI."""
    response = MementoClient().get_memento_info(input_uri_r, input_datetime)
    actual = response.get("mementos").get("closest").get("uri")[0]
    assert actual == expected_uri_m
def url_list(request):
    """On POST: expand the submitted URL, record its title/status/final URL,
    look up its closest Wayback memento, render a screenshot via the
    PhantomJScloud API, upload it to S3, and redirect to the detail page.
    On GET: render the list of previously expanded URLs."""
    if request.method == "POST":
        form = SearchForm(request.POST)
        if form.is_valid():
            new_url = form.save(commit = False)
            new_url.date = timezone.now()
            # Runs when URL is correct
            try:
                # NOTE(review): the model instance is passed to requests.get();
                # presumably its __str__ returns the URL -- confirm.
                response = requests.get(new_url)
                page = BeautifulSoup(response.content, "lxml")
                if page.title is not None:
                    title = page.title.string
                else:
                    title = "No Title Available"
                new_url.status = response.status_code
                new_url.final_url = response.url
                new_url.title = title
                # Wayback storing
                current_date = datetime.datetime.now()
                memento = MementoClient()
                wayback_res = memento.get_memento_info(
                    response.url, current_date).get("mementos").get("closest")
                new_url.wayback = wayback_res.get("uri")[0]
                if wayback_res.get("datetime") is not None:
                    new_url.wayback_date = str(wayback_res.get("datetime"))
                else:
                    # No memento datetime available: fall back to "now".
                    new_url.wayback_date = str(current_date)
                # Picture archiving
                # Connecting to S3
                s3_connection = boto3.resource("s3")
                # For image capture with PhahtomJS
                data = json.dumps({"url":response.url,
                                   "renderType":"jpeg"}).encode("utf-8")
                headers = {"content-type": "application/json"}
                # NOTE(review): api_key is a module-level name not defined in
                # this function -- confirm it is configured elsewhere.
                api_url = ("http://PhantomJScloud.com/api/browser/v2/"
                           + api_key + "/")
                req = urllibreq.Request(url=api_url, data=data, headers=headers)
                res = urllibreq.urlopen(req)
                result = res.read()
                # Puts the generated image on S3
                s3_connection.Bucket("lab3pics").put_object(
                    Key=str(current_date) + ".jpg", Body=result,
                    ACL="public-read", ContentType="image/jpeg")
                # Generates a publicly accessible link to the image
                pic_url = ("http://s3.amazonaws.com/lab3pics/"
                           + str(current_date) + ".jpg")
                new_url.archive_link = pic_url
            # Sets up error message
            except Exception as e:
                # Best-effort: record placeholders (and the raw exception in
                # archive_link) instead of failing the request.
                new_url.status = "None"
                new_url.final_url = "Does not exist"
                new_url.title = "This webpage does not exist"
                new_url.wayback = "Not available"
                new_url.wayback_date = "Not available"
                new_url.archive_link = e
            # Redirects to details page
            finally:
                new_url.save()
                return redirect('url_detail', pk = new_url.pk)
    else:
        # GET: show all URLs expanded so far, newest first.
        urls = URL.objects.filter(date__lte =
                                  timezone.now()).order_by('-date')
        form = SearchForm
        return render(request, 'urlexpander/url_list.html',
                      {'urls': urls, 'form': SearchForm})
def good_url_slash_at_end():
    """A trailing-slash URI-R resolves to the expected LoC web archive memento."""
    target = datetime.datetime.strptime("Tue, 11 Sep 2001 08:45:45 GMT",
                                        "%a, %d %b %Y %H:%M:%S GMT")
    client = MementoClient()
    info = client.get_memento_info("http://www.cnn.com/", target)
    memento = info.get("mementos").get("closest").get("uri")[0]
    assert memento == 'http://webarchive.loc.gov/all/20010911181528/http://www2.cnn.com/'
def test_bad_timegate_linux():
    """On Linux, a non-Memento TimeGate endpoint raises ConnectionError."""
    when = datetime.datetime.strptime("Thu, 01 Jan 1970 00:00:00 GMT",
                                      "%a, %d %b %Y %H:%M:%S GMT")
    client = MementoClient(timegate_uri="http://www.example.com")
    with pytest.raises(requests.ConnectionError):
        client.get_memento_info("http://www.cnn.com", when).get("original_uri")
def test_mementos_not_in_archive_uri(input_uri_r, input_datetime, input_uri_g):
    """original_uri is echoed back even when the archive holds no memento."""
    epoch_gmt = "Thu, 01 Jan 1970 00:00:00 GMT"
    when = datetime.datetime.strptime(epoch_gmt, "%a, %d %b %Y %H:%M:%S GMT")
    client = MementoClient(timegate_uri=input_uri_g)
    result = client.get_memento_info(input_uri_r, when)
    assert result.get("original_uri") == input_uri_r
def test_get_memento_uri_specified_timegate(input_uri_r, input_datetime, input_timegate, expected_uri_m):
    """A client pinned to a specific TimeGate resolves the expected URI-M."""
    client = MementoClient(timegate_uri=input_timegate,
                           check_native_timegate=False)
    closest = (client.get_memento_info(input_uri_r, input_datetime)
               .get("mementos").get("closest"))
    assert closest.get("uri")[0] == expected_uri_m
def test_bad_timegate_osx():
    """On OS X, a reachable-but-non-Memento TimeGate still echoes the URI-R."""
    when = datetime.datetime.strptime("Thu, 01 Jan 1970 00:00:00 GMT",
                                      "%a, %d %b %Y %H:%M:%S GMT")
    client = MementoClient(timegate_uri="http://www.example.com")
    info = client.get_memento_info("http://www.cnn.com", when)
    assert info.get("original_uri") == "http://www.cnn.com"
def test_get_memento_data_non_compliant(input_uri_m):
    """A non-compliant URI-M still reports itself as the original URI."""
    # TODO: pytest did not seem to split this into arguments
    uri_m = input_uri_m[0]
    when = datetime.datetime.strptime("Thu, 01 Jan 1970 00:00:00 GMT",
                                      "%a, %d %b %Y %H:%M:%S GMT")
    client = MementoClient()
    assert client.get_memento_info(uri_m, when).get("original_uri") == uri_m
def test_good_url_slash_at_end():
    """A URI-R with a trailing slash resolves to the expected LoC memento."""
    expected = 'http://webarchive.loc.gov/all/20010911181528/http://www2.cnn.com/'
    when = datetime.datetime.strptime("Tue, 11 Sep 2001 08:45:45 GMT",
                                      "%a, %d %b %Y %H:%M:%S GMT")
    client = MementoClient()
    info = client.get_memento_info("http://www.cnn.com/", when)
    assert info.get("mementos").get("closest").get("uri")[0] == expected
def test_bad_timegate():
    """An unresolvable TimeGate host must raise requests.ConnectionError."""
    print("'HTTP_PROXY' in os.environ: {}".format('HTTP_PROXY' in os.environ))
    when = datetime.datetime.strptime("Thu, 01 Jan 1970 00:00:00 GMT",
                                      "%a, %d %b %Y %H:%M:%S GMT")
    client = MementoClient(timegate_uri="http://www.example.moc")
    with pytest.raises(requests.ConnectionError):
        client.get_memento_info("http://www.cnn.com", when).get("original_uri")
def main():
    """Entry function: read URLs from a CSV, look up mementos for each, and
    append the results to an output CSV.

    Improvements: the resume offset previously hard-coded as ``df[9487:]``
    is now a ``--start-row`` flag whose default preserves the old behavior;
    the ``not x is None`` anti-idiom is fixed; the repeated inline
    memento-URI extraction is factored into ``_first_uri`` so both branches
    of the loop share one writerow call.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_csv_path_file',
                        help="specify the csv file to read")
    parser.add_argument('output_csv_path_file',
                        help="specify the csv file to write results")
    parser.add_argument('url_field',
                        help=" specify the field name to get the URL")
    parser.add_argument('datetime', help="Memento Datetime")
    # Backward compatible: default matches the previously hard-coded slice.
    parser.add_argument('--start-row', type=int, default=9487,
                        help="row index to resume processing from")
    args = parser.parse_args()

    df = pd.read_csv(args.input_csv_path_file)
    dt = datetime.datetime.strptime(args.datetime, '%Y%m%d')
    mc = MementoClient(check_native_timegate=False)
    with open(args.output_csv_path_file, 'a') as csvfile:
        fieldnames = ['original-uri', 'memento-closest', 'memento-first',
                      'memento-last', 'timegate_uri']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for index, row in df[args.start_row:].iterrows():
            memento = mc.get_memento_info(
                row[args.url_field], dt, include_uri_checks=False)
            mementos = memento.get("mementos")
            if mementos is not None:
                writer.writerow({
                    'original-uri': memento.get("original_uri"),
                    'memento-closest': _first_uri(mementos.get("closest")),
                    'memento-first': _first_uri(mementos.get("first")),
                    'memento-last': _first_uri(mementos.get("last")),
                    'timegate_uri': memento.get("timegate_uri")})
            else:
                writer.writerow({
                    'original-uri': memento.get("original_uri"),
                    'memento-closest': '',
                    'memento-first': '',
                    'memento-last': '',
                    'timegate_uri': memento.get("timegate_uri")})
            # Flush per row so a crash mid-run loses at most one record.
            csvfile.flush()


def _first_uri(entry):
    """Return the first URI of a memento entry dict, or '' if it is missing."""
    return '' if entry is None else entry.get("uri")[0]
def test_nonexistent_urirs(input_uri_r):
    """Unknown URI-Rs still yield original_uri and the aggregator TimeGate."""
    uri_r = input_uri_r[0]
    when = datetime.datetime.strptime("Thu, 01 Jan 1970 00:00:00 GMT",
                                      "%a, %d %b %Y %H:%M:%S GMT")
    info = MementoClient().get_memento_info(uri_r, when)
    assert info.get("original_uri") == uri_r
    gate = 'http://timetravel.mementoweb.org/timegate/{}'.format(uri_r)
    assert info.get("timegate_uri") == gate
def test_get_memento_data_non_compliant(input_uri_m):
    """Non-compliant endpoints: original_uri must round-trip the input."""
    # TODO: pytest did not seem to split this into arguments
    target_uri = input_uri_m[0]
    epoch = datetime.datetime.strptime("Thu, 01 Jan 1970 00:00:00 GMT",
                                       "%a, %d %b %Y %H:%M:%S GMT")
    result = MementoClient().get_memento_info(target_uri, epoch)
    assert result.get("original_uri") == target_uri
def test_nonexistent_urirs(input_uri_r):
    """Unknown URI-Rs fall back to the timetravel aggregator TimeGate."""
    target = input_uri_r[0]
    epoch = datetime.datetime.strptime("Thu, 01 Jan 1970 00:00:00 GMT",
                                       "%a, %d %b %Y %H:%M:%S GMT")
    result = MementoClient().get_memento_info(target, epoch)
    assert result.get("original_uri") == target
    expected_gate = ('http://timetravel.mementoweb.org/timegate/{}'
                     .format(target))
    assert result.get("timegate_uri") == expected_gate
def test_bad_timegate():
    """A bogus TimeGate raises ConnectionError or, in newer library
    versions, the library's own MementoClientException."""
    print("'HTTP_PROXY' in os.environ: {}".format('HTTP_PROXY' in os.environ))
    when = datetime.datetime.strptime("Thu, 01 Jan 1970 00:00:00 GMT",
                                      "%a, %d %b %Y %H:%M:%S GMT")
    client = MementoClient(timegate_uri="http://www.example.moc")
    expected_errors = (requests.exceptions.ConnectionError,
                       memento_client.memento_client.MementoClientException)
    with pytest.raises(expected_errors):
        client.get_memento_info("http://www.cnn.com", when).get("original_uri")
def test_determine_if_memento(input_uri_m):
    """A known URI-M should be classified as a memento."""
    # TODO: pytest did not seem to split this into arguments
    uri_m = input_uri_m[0]
    assert MementoClient.is_memento(uri_m) == True
def run(
    self,
    filename,
    file,
    dependency_results=None,
    follow_redirects: bool = True,
):
    """
    Find links in any text file and check if they are archived.

    Link is considered valid if the link has been archived by any services
    in memento_client.

    This bear can automatically fix redirects.

    Warning: This bear will make HEAD requests to all URLs mentioned in
    your codebase, which can potentially be destructive. As an example,
    this bear would naively just visit the URL from a line that goes like
    `do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
    all your data.

    :param dependency_results: Results given by URLHeadBear.
    :param follow_redirects:   Set to true to check all redirect urls.
    """
    # The previous default was a mutable ``dict()`` shared across calls;
    # use None as the sentinel and normalize here instead.
    if dependency_results is None:
        dependency_results = {}
    self._mc = MementoClient()

    for result in dependency_results.get(URLHeadBear.name, []):
        line_number, link, code, context = result.contents

        # Only consider links that actually resolved (2xx/3xx).
        if not (code and 200 <= code < 400):
            continue

        status = MementoBear.check_archive(self._mc, link)
        if not status:
            yield Result.from_values(
                self,
                ('This link is not archived yet, visit '
                 'https://web.archive.org/save/%s to get it archived.'
                 % link),
                file=filename,
                line=line_number,
                severity=RESULT_SEVERITY.INFO)

        if follow_redirects and 300 <= code < 400:  # HTTP status 30x
            # Check every URL along the redirect chain as well.
            redirect_urls = MementoBear.get_redirect_urls(link)
            for url in redirect_urls:
                status = MementoBear.check_archive(self._mc, url)
                if not status:
                    yield Result.from_values(
                        self,
                        ('This link redirects to %s and not archived yet, '
                         'visit https://web.archive.org/save/%s to get it '
                         'archived.'
                         % (url, url)),
                        file=filename,
                        line=line_number,
                        severity=RESULT_SEVERITY.INFO)
def get_via_mementos(uri, dt): mc = MementoClient(timegate_uri=timegate, check_native_timegate=False) # mc = MementoClient() print("Getting mementos for %s ..." % uri) try: mementos = mc.get_memento_info(uri, dt).get("mementos") if mementos: print("Got mementos for %s ..." % uri) if 'closest' in mementos: uri = mementos.get("closest").get("uri")[0] elif 'memento' in mementos: uri = mementos.get("closest").get("uri")[0] # Need to patch the id_ into the url: uri = re.sub(r"\/(\d{14})\/", r"/\1id_/", uri) except Exception as e: print(e) pass return uri
def test_close_with_user_supplied_session(mock_session):
    """A caller-owned session must not be closed by the context manager."""
    class FakeResponse():
        def __init__(self):
            self.headers = {"header": "nodata"}
    mock_session.head.return_value = FakeResponse()
    with MementoClient(session=mock_session) as client:
        client.get_original_uri('http://www.cnn.com')
    mock_session.close.assert_not_called()
def test_close_session_on_default(mock_session):
    """A session owned by the client itself is closed on context exit."""
    class FakeResponse():
        def __init__(self):
            self.headers = {"header": "nodata"}
    mock_session.head.return_value = FakeResponse()
    with MementoClient() as client:
        client.session = mock_session
        client.get_original_uri('http://www.cnn.com')
    mock_session.close.assert_called_with()
def run(self, filename, file,
        network_timeout: typed_dict(str, int, DEFAULT_TIMEOUT) = dict(),
        link_ignore_regex: str = r'([.\/]example\.com|\{|\$)',
        link_ignore_list: typed_list(str) = DEFAULT_IGNORE,
        follow_redirects: bool = True):
    """
    Find links in any text file and check if they are archived.

    Link is considered valid if the link has been archived by any services
    in memento_client.

    This bear can automatically fix redirects.

    Warning: This bear will make HEAD requests to all URLs mentioned in
    your codebase, which can potentially be destructive. As an example,
    this bear would naively just visit the URL from a line that goes like
    `do_not_ever_open = 'https://api.acme.inc/delete-all-data'` wiping out
    all your data.

    :param network_timeout:    A dict mapping URLs and timeout to be
                               used for that URL. All the URLs that have
                               the same host as that of URLs provided
                               will be passed that timeout. It can also
                               contain a wildcard timeout entry with key
                               '*'. The timeout of all the websites not
                               in the dict will be the value of the key
                               '*'.
    :param link_ignore_regex:  A regex for urls to ignore.
    :param link_ignore_list:   Comma separated url globs to ignore.
    :param follow_redirects:   Set to true to check all redirect urls.
    """
    self._mc = MementoClient()

    # Rebind (never mutate) the argument: map hosts -> timeout, keeping '*'
    # as the wildcard key.
    network_timeout = {
        urlparse(url).netloc if not url == '*' else '*': timeout
        for url, timeout in network_timeout.items()
    }

    if link_ignore_list != self.DEFAULT_IGNORE:
        # Previously this extended the caller's list in place, growing it on
        # every run; build a fresh list instead.
        link_ignore_list = list(link_ignore_list) + list(self.DEFAULT_IGNORE)

    for (line_number, link, code, context) in self.analyze_links_in_file(
            file, network_timeout, link_ignore_regex, link_ignore_list):
        status = MementoBear.check_archive(self._mc, link)
        if not status:
            yield Result.from_values(
                self,
                ('This link is not archived yet, visit '
                 'https://web.archive.org/save/%s to get it archived.'
                 % link),
                file=filename,
                line=line_number,
                severity=RESULT_SEVERITY.INFO)

        if follow_redirects and 300 <= code < 400:  # HTTP status 30x
            redirect_urls = MementoBear.get_redirect_urls(link)
            for url in redirect_urls:
                status = MementoBear.check_archive(self._mc, url)
                if not status:
                    yield Result.from_values(
                        self,
                        ('This link redirects to %s and not archived yet, '
                         'visit https://web.archive.org/save/%s to get it '
                         'archived.'
                         % (url, url)),
                        file=filename,
                        line=line_number,
                        severity=RESULT_SEVERITY.INFO)
def link_handler(link):
    """Extract a URL from a message, look up (or create) its archive.fo
    snapshot, and return either the archive URI, a (uri, keyboard) pair for
    archive.is results, or an error string for the user."""
    try:
        # Messages arrive as "<command> <url>"; keep only the URL part.
        link = link.split(' ')[1]
    except IndexError:
        pass
    #print(str_link)
    # Django-style URL validation regex.
    uri_regex = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$', re.IGNORECASE)
    uri_rec = uri_regex.search(link)
    #print(uri_rec)
    #print(uri_regex)
    #print(link)
    if uri_rec:
        print('Url found')
        uri = uri_rec.group(0)
        print(uri)
        timegate = 'https://archive.fo/timegate/'
        mc = MementoClient(timegate_uri=timegate,
                           check_native_timegate=False)
        try:
            # Most recent archived copy, if one exists.
            archive_uri = mc.get_memento_info(uri).get("mementos").get(
                "last").get("uri")[0]
            # print(uri)
            # print(archive_uri)
            print('Archive is ' + archive_uri)
        except AttributeError:
            # No memento found -> ask archive.fo to create one.
            # NOTE(review): archive_create is defined elsewhere; the
            # NameError arm below appears to guard against it missing.
            archive_uri = archive_create(uri)
            return archive_uri
        except NameError:
            print('Sum happen')
            return ('Something went wrong, let @raku_cat know')
        else:
            pass
    else:
        return 'No valid URL found'
    if 'archive.fo' in archive_uri:
        # print(archive_uri)
        return archive_uri
    elif 'archive.is' in archive_uri:
        # archive.is result: attach the inline navigation keyboard.
        keyboard = InlineKeyboardMarkup(inline_keyboard=[
            [
                InlineKeyboardButton(text='Force save page',
                                     callback_data='save')
            ],
            [
                InlineKeyboardButton(text='← Prior', callback_data='back'),
                InlineKeyboardButton(text='Next →', callback_data='next')
            ],
            [
                InlineKeyboardButton(text='History',
                                     switch_inline_query_current_chat=uri)
            ],
        ])
        return archive_uri, keyboard
    elif 'trans' in archive_uri:
        # NOTE(review): fallback for an odd API response shape -- returns
        # the timegate URI instead of a memento; confirm this is intended.
        archive_uri = mc.get_memento_info(uri).get("timegate_uri")
        print('Sent weird api deal')
        return (archive_uri)
    else:
        print('^No it wasn\'t')
        return 'Something went wrong, let @raku_cat know'
def on_callback_query(msg): query_id, chat_id, query_data = telepot.glance(msg, flavor='callback_query') # print(msg) # print(query_data) print('Recieved query ' + query_id) url = msg['message']['reply_to_message']['text'].split(' ')[1] msg_idf = telepot.message_identifier(msg['message']) callback_text = '' global delay if query_data == 'save': if delay != '': if datetime.datetime.now() > delay: r = requests.get('https://archive.fo/') html = r.text soup = BeautifulSoup(html, 'lxml') submitid = soup.find('input').get('value') headers = { 'User-agent': 'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36' } values = {'submitid': submitid, 'url': url, 'anyway': '1'} r = requests.post('https://archive.fo/submit/', data=values, headers=headers) uri = r.text archive_uri = uri.split('"')[1] delay = datetime.datetime.now() + datetime.timedelta(minutes=3) if 'archive.fo' in archive_uri: pass else: callback_text = 'Something went wrong, let @raku_cat know' else: callback_text = 'Saving on cooldown, please try again in a few miniutes.' else: uri = msg['message']['text'] foo, keyboard = link_handler(url) dt = uri.split('/')[3] dt = datetime.datetime.strptime(dt, '%Y%m%d%H%M%S') timegate = 'https://archive.fo/timegate/' mc = MementoClient(timegate_uri=timegate, check_native_timegate=False) if query_data == 'back': try: archive_uri = mc.get_memento_info( url, dt).get('mementos').get('prev').get('uri')[0] except AttributeError: callback_text = 'No older archives or something went wrong.' elif query_data == 'next': try: archive_uri = mc.get_memento_info( uri, dt).get('mementos').get('next').get('uri')[0] except AttributeError: callback_text = 'No newer archives or something went wrong.' try: bot.editMessageText(msg_idf, archive_uri) except: pass try: bot.editMessageText(msg_idf, archive_uri, reply_markup=keyboard) except: pass bot.answerCallbackQuery(query_id, text=callback_text) print('Responding to callback ' + query_id)