def test_get_memento_uri_default(input_uri_r, input_datetime, expected_uri_m):
    """Check the closest memento URI found by a default-configured client.

    A ``MementoClient`` built with no arguments is asked for memento info on
    ``input_uri_r`` at ``input_datetime``; the first URI listed for the
    "closest" memento must equal ``expected_uri_m``.
    """
    client = MementoClient()
    memento_info = client.get_memento_info(input_uri_r, input_datetime)
    closest = memento_info.get("mementos").get("closest")
    actual_uri_m = closest.get("uri")[0]

    assert expected_uri_m == actual_uri_m
def test_get_memento_uri_specified_timegate_direct_timegate_query(input_uri_r, input_datetime, input_timegate, expected_uri_m):
    """Check the closest memento URI when a given TimeGate is queried directly.

    The client is created with ``timegate_uri=input_timegate`` and
    ``check_native_timegate=False``, and the lookup itself is made with
    ``include_uri_checks=False``.  The first URI listed for the "closest"
    memento must equal ``expected_uri_m``.
    """
    client = MementoClient(timegate_uri=input_timegate,
                           check_native_timegate=False)
    memento_info = client.get_memento_info(input_uri_r, input_datetime,
                                           include_uri_checks=False)
    closest = memento_info.get("mementos").get("closest")
    actual_uri_m = closest.get("uri")[0]

    assert expected_uri_m == actual_uri_m
Ejemplo n.º 3
0
def url_list(request):
    """List the stored URLs and handle submission of a new one.

    On a valid POST the submitted entry is fetched with ``requests``, its
    page title, ``response.url`` and status code are recorded, and the
    closest memento (relative to the current time) is looked up through
    ``MementoClient``.  If any of those steps fails, placeholder values are
    stored instead.  The entry is then saved and the user is redirected to
    the ``url_detail`` view.

    On GET, or on a POST whose form does not validate, the list template is
    rendered with every ``URL`` object and the form (bound, so validation
    errors are shown, in the invalid-POST case).
    """
    urls = URL.objects.all()
    if request.method == "POST":
        form = URLForm(request.POST)
        if form.is_valid():
            post = form.save(commit=False)
            try:
                # NOTE(review): `post` is the unsaved model instance, not a
                # plain string; this presumably relies on the model's string
                # form being the URL -- confirm against the model.
                response = requests.get(post)
                temp = BeautifulSoup(response.content, "lxml")
                post.title = temp.title.string
                post.finalDestination = response.url
                post.statusCode = response.status_code
                dt = datetime.datetime.now()
                mc = MementoClient()
                uri = post.finalDestination
                memento_uri = mc.get_memento_info(uri, dt).get("mementos").get("closest")
                post.uri = memento_uri.get('uri')[0]
                post.datetime = str(memento_uri.get('datetime'))
            except Exception:
                # Deliberate best-effort: any failure while fetching or
                # parsing is recorded as placeholder values, not raised.
                post.statusCode = "None"
                post.finalDestination = "Does not exit"
                post.title = "No title"
            # Saving and redirecting happen after the try block rather than
            # in a `finally` with `return`, which would silently discard any
            # exception still propagating (e.g. KeyboardInterrupt).
            post.save()
            return redirect('url_detail', pk=post.pk)
    else:
        form = URLForm()
    # Pass the form instance so a bound, invalid form keeps its errors.
    return render(request, 'lab1/url_list.html', {'urls': urls, 'form': form})
Ejemplo n.º 4
0
def url_list(request):
    """Show all stored URLs and process the "add URL" form.

    A valid POST triggers a fetch of the submitted entry via ``requests``;
    the page title, ``response.url`` and the HTTP status code are stored on
    the new object, followed by the URI and datetime of the closest memento
    reported by ``MementoClient`` for the current time.  When any step
    raises, placeholder values are stored instead.  The object is saved and
    the response is a redirect to ``url_detail``.

    A GET request, or a POST that fails validation, renders the list
    template with all ``URL`` objects and the form (the bound form, with its
    errors, in the invalid-POST case).
    """
    urls = URL.objects.all()
    if request.method == "POST":
        form = URLForm(request.POST)
        if form.is_valid():
            post = form.save(commit=False)
            try:
                # NOTE(review): `post` is a model instance rather than a URL
                # string; presumably its string form is the URL -- confirm.
                response = requests.get(post)
                temp = BeautifulSoup(response.content, "lxml")
                post.title = temp.title.string
                post.finalDestination = response.url
                post.statusCode = response.status_code
                dt = datetime.datetime.now()
                mc = MementoClient()
                uri = post.finalDestination
                memento_uri = mc.get_memento_info(
                    uri, dt).get("mementos").get("closest")
                post.uri = memento_uri.get('uri')[0]
                post.datetime = str(memento_uri.get('datetime'))
            except Exception:
                # Best-effort by design: a failed fetch/parse/lookup is
                # stored as placeholder values instead of erroring out.
                post.statusCode = "None"
                post.finalDestination = "Does not exit"
                post.title = "No title"
            # Kept outside any `finally`: a `return` inside `finally` would
            # swallow exceptions that are still propagating.
            post.save()
            return redirect('url_detail', pk=post.pk)
    else:
        form = URLForm()
    # Render the form instance so validation errors are not lost.
    return render(request, 'lab1/url_list.html', {
        'urls': urls,
        'form': form,
    })
Ejemplo n.º 5
0
Archivo: views.py Proyecto: sjsn/lab2
def url_list(request):
    """List stored URLs and expand/archive a newly submitted one.

    On a valid POST the submitted entry is fetched with ``requests`` and the
    following are recorded on the new object: HTTP status, ``response.url``,
    page title (or a fallback when the page has no ``<title>``), the closest
    memento URI/datetime from ``MementoClient``, and a link to a JPEG capture
    that is requested from the PhantomJScloud API and uploaded to the
    ``lab3pics`` S3 bucket.  If any step raises, placeholder values are
    stored and the error text is put in ``archive_link``.  The object is
    saved and the user is redirected to ``url_detail``.

    On GET, or on a POST that fails validation, the list template is
    rendered with the URLs dated up to now (newest first) and the form.
    """
    if request.method == "POST":
        form = SearchForm(request.POST)
        if form.is_valid():
            new_url = form.save(commit=False)
            new_url.date = timezone.now()
            try:
                # NOTE(review): `new_url` is a model instance, not a string;
                # presumably its string form is the URL -- confirm.
                response = requests.get(new_url)
                page = BeautifulSoup(response.content, "lxml")
                if page.title is not None:
                    title = page.title.string
                else:
                    title = "No Title Available"
                new_url.status = response.status_code
                new_url.final_url = response.url
                new_url.title = title

                # Wayback storing
                current_date = datetime.datetime.now()
                memento = MementoClient()
                wayback_res = memento.get_memento_info(response.url, current_date).get("mementos").get("closest")
                new_url.wayback = wayback_res.get("uri")[0]
                if wayback_res.get("datetime") is not None:
                    new_url.wayback_date = str(wayback_res.get("datetime"))
                else:
                    new_url.wayback_date = str(current_date)

                # Picture archiving: render the page through PhantomJScloud
                # and store the resulting JPEG on S3.
                s3_connection = boto3.resource("s3")
                data = json.dumps({"url": response.url, "renderType": "jpeg"}).encode("utf-8")
                headers = {"content-type": "application/json"}
                api_url = "http://PhantomJScloud.com/api/browser/v2/" + api_key + "/"
                req = urllibreq.Request(url=api_url, data=data, headers=headers)
                res = urllibreq.urlopen(req)
                try:
                    result = res.read()
                finally:
                    # Release the connection even if reading fails.
                    res.close()
                # The same timestamp-based key is used for the upload and
                # for the public link.
                image_key = str(current_date) + ".jpg"
                s3_connection.Bucket("lab3pics").put_object(Key=image_key, Body=result, ACL="public-read", ContentType="image/jpeg")
                new_url.archive_link = "http://s3.amazonaws.com/lab3pics/" + image_key
            except Exception as e:
                # Best-effort by design: record placeholders plus the error
                # text (as a string, not the exception object).
                new_url.status = "None"
                new_url.final_url = "Does not exist"
                new_url.title = "This webpage does not exist"
                new_url.wayback = "Not available"
                new_url.wayback_date = "Not available"
                new_url.archive_link = str(e)
            # Done after the try block: a `return` inside `finally` would
            # swallow any exception still propagating.
            new_url.save()
            return redirect('url_detail', pk=new_url.pk)
    else:
        form = SearchForm()
    # Computed for every rendering path; previously an invalid POST reached
    # render() with `urls` never assigned.
    urls = URL.objects.filter(date__lte=timezone.now()).order_by('-date')
    return render(request, 'urlexpander/url_list.html', {'urls': urls, 'form': form})
def test_get_memento_uri_default(input_uri_r, input_datetime, expected_uri_m):
    """Verify the "closest" memento URI returned by a default client.

    Looks up ``input_uri_r`` at ``input_datetime`` with a ``MementoClient``
    constructed without arguments and compares the first "closest" URI with
    ``expected_uri_m``.
    """
    mementos = MementoClient().get_memento_info(
        input_uri_r, input_datetime).get("mementos")
    closest_uris = mementos.get("closest").get("uri")
    actual_uri_m = closest_uris[0]

    assert expected_uri_m == actual_uri_m
def test_mementos_not_in_archive_uri(input_uri_r, input_datetime, input_uri_g):
    """Check that ``original_uri`` echoes the requested URI.

    Uses a client bound to the TimeGate ``input_uri_g`` and a fixed accept
    datetime of 1 January 1970.  ``input_datetime`` is accepted but not used
    by the body.
    """
    client = MementoClient(timegate_uri=input_uri_g)

    http_date_format = "%a, %d %b %Y %H:%M:%S GMT"
    accept_datetime = datetime.datetime.strptime("Thu, 01 Jan 1970 00:00:00 GMT", http_date_format)

    memento_info = client.get_memento_info(input_uri_r, accept_datetime)
    original_uri = memento_info.get("original_uri")

    assert input_uri_r == original_uri
def test_bad_timegate_linux():
    """Expect ``requests.ConnectionError`` when the TimeGate URI is bad.

    The client is pointed at ``http://www.example.com`` as its TimeGate and
    asked about ``http://www.cnn.com`` with an accept datetime of
    1 January 1970.
    """
    http_date_format = "%a, %d %b %Y %H:%M:%S GMT"
    input_uri_r = "http://www.cnn.com"
    bad_uri_g = "http://www.example.com"
    accept_datetime = datetime.datetime.strptime("Thu, 01 Jan 1970 00:00:00 GMT", http_date_format)

    client = MementoClient(timegate_uri=bad_uri_g)

    with pytest.raises(requests.ConnectionError):
        # The result is irrelevant; the call is expected to raise.
        client.get_memento_info(input_uri_r, accept_datetime).get("original_uri")
def good_url_slash_at_end():
    """Check the closest memento for a URI that ends with a slash.

    Looks up ``http://www.cnn.com/`` at 11 Sep 2001 08:45:45 GMT with a
    default client and compares the first "closest" URI to a fixed
    webarchive.loc.gov address.

    NOTE(review): the name has no ``test_`` prefix, so pytest's default
    collection rules will not run it -- confirm whether that is intended.
    """
    input_uri_r = "http://www.cnn.com/"

    client = MementoClient()
    http_date_format = "%a, %d %b %Y %H:%M:%S GMT"
    dt = datetime.datetime.strptime("Tue, 11 Sep 2001 08:45:45 GMT", http_date_format)

    closest = client.get_memento_info(input_uri_r, dt).get("mementos").get("closest")
    uri_m = closest.get("uri")[0]

    assert uri_m == 'http://webarchive.loc.gov/all/20010911181528/http://www2.cnn.com/'
def test_bad_timegate_osx():
    """Check ``original_uri`` when the configured TimeGate URI is bad.

    Unlike the ``_linux`` variant, no exception is expected here: the client
    pointed at ``http://www.example.com`` must still report
    ``http://www.cnn.com`` as the original URI.
    """
    http_date_format = "%a, %d %b %Y %H:%M:%S GMT"
    input_uri_r = "http://www.cnn.com"
    bad_uri_g = "http://www.example.com"
    accept_datetime = datetime.datetime.strptime("Thu, 01 Jan 1970 00:00:00 GMT", http_date_format)

    client = MementoClient(timegate_uri=bad_uri_g)

    memento_info = client.get_memento_info(input_uri_r, accept_datetime)
    original_uri = memento_info.get("original_uri")

    assert input_uri_r == original_uri
Ejemplo n.º 11
0
def test_get_memento_uri_specified_timegate(input_uri_r, input_datetime,
                                            input_timegate, expected_uri_m):
    """Check the closest memento URI when a TimeGate is given explicitly.

    The client is created with ``timegate_uri=input_timegate`` and
    ``check_native_timegate=False``; the first URI listed for the "closest"
    memento of ``input_uri_r`` at ``input_datetime`` must equal
    ``expected_uri_m``.
    """
    client = MementoClient(timegate_uri=input_timegate,
                           check_native_timegate=False)

    memento_info = client.get_memento_info(input_uri_r, input_datetime)
    closest = memento_info.get("mementos").get("closest")
    actual_uri_m = closest.get("uri")[0]

    assert expected_uri_m == actual_uri_m
Ejemplo n.º 12
0
def test_mementos_not_in_archive_uri(input_uri_r, input_datetime, input_uri_g):
    """Verify the reported ``original_uri`` matches the URI that was asked for.

    The lookup goes through the TimeGate ``input_uri_g`` with an accept
    datetime fixed at 1 January 1970; ``input_datetime`` is not used by the
    body.
    """
    client = MementoClient(timegate_uri=input_uri_g)

    epoch_text = "Thu, 01 Jan 1970 00:00:00 GMT"
    accept_datetime = datetime.datetime.strptime(
        epoch_text, "%a, %d %b %Y %H:%M:%S GMT")

    info = client.get_memento_info(input_uri_r, accept_datetime)
    original_uri = info.get("original_uri")

    assert input_uri_r == original_uri
def test_bad_timegate():
    """Expect ``requests.ConnectionError`` for an unusable TimeGate host.

    Prints whether ``HTTP_PROXY`` is set in the environment, then queries
    ``http://www.cnn.com`` through the TimeGate ``http://www.example.moc``.
    """
    proxy_configured = 'HTTP_PROXY' in os.environ
    print("'HTTP_PROXY' in os.environ:  {}".format(proxy_configured))

    http_date_format = "%a, %d %b %Y %H:%M:%S GMT"
    input_uri_r = "http://www.cnn.com"
    bad_uri_g = "http://www.example.moc"
    accept_datetime = datetime.datetime.strptime("Thu, 01 Jan 1970 00:00:00 GMT", http_date_format)

    client = MementoClient(timegate_uri=bad_uri_g)

    with pytest.raises(requests.ConnectionError):
        # Only the raised exception matters; the value is discarded.
        client.get_memento_info(input_uri_r, accept_datetime).get("original_uri")
Ejemplo n.º 14
0
def test_good_url_slash_at_end():
    """Check the closest memento for a URI with a trailing slash.

    Looks up ``http://www.cnn.com/`` at 11 Sep 2001 08:45:45 GMT using a
    default client and expects a specific webarchive.loc.gov memento URI.
    """
    input_uri_r = "http://www.cnn.com/"
    expected = 'http://webarchive.loc.gov/all/20010911181528/http://www2.cnn.com/'

    client = MementoClient()
    http_date_format = "%a, %d %b %Y %H:%M:%S GMT"
    dt = datetime.datetime.strptime("Tue, 11 Sep 2001 08:45:45 GMT",
                                    http_date_format)

    memento_info = client.get_memento_info(input_uri_r, dt)
    uri_m = memento_info.get("mementos").get("closest").get("uri")[0]

    assert uri_m == expected
Ejemplo n.º 15
0
def _memento_uri(mementos, relation):
    """Return the first URI stored under *relation* in *mementos*, or ''."""
    entry = mementos.get(relation)
    return '' if entry is None else entry.get("uri")[0]


def main():
    """Entry function.

    Reads URLs from one column of an input CSV file, asks a
    ``MementoClient`` (native TimeGate check disabled) for memento
    information at the given datetime, and appends one row per URL to the
    output CSV file with the original URI, the closest/first/last memento
    URIs (empty when absent) and the TimeGate URI.

    Command-line arguments:
        input_csv_path_file:  CSV file to read.
        output_csv_path_file: CSV file to append results to.
        url_field:            name of the column holding the URL.
        datetime:             memento datetime, formatted ``YYYYMMDD``.
        --start-row:          position of the first input row to process
                              (defaults to 9487, the value that used to be
                              hard-coded).

    The output file is opened in append mode and a header row is written on
    every run.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument('input_csv_path_file',
                        help="specify the csv file to read")
    parser.add_argument('output_csv_path_file',
                        help="specify the csv file to write results")
    parser.add_argument('url_field',
                        help=" specify the field name to get the URL")
    parser.add_argument('datetime', help="Memento Datetime")
    parser.add_argument('--start-row', type=int, default=9487,
                        help="position of the first input row to process "
                             "(default: %(default)s)")

    args = parser.parse_args()

    df = pd.read_csv(args.input_csv_path_file)

    dt = datetime.datetime.strptime(args.datetime, '%Y%m%d')
    mc = MementoClient(check_native_timegate=False)
    # newline='' lets the csv module control line endings itself, as the csv
    # documentation requires for files handed to a writer.
    with open(args.output_csv_path_file, 'a', newline='') as csvfile:
        fieldnames = ['original-uri', 'memento-closest',
                      'memento-first', 'memento-last',
                      'timegate_uri']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writeheader()
        for _, row in df[args.start_row:].iterrows():
            memento = mc.get_memento_info(
                row[args.url_field], dt, include_uri_checks=False)

            # Start from the "nothing found" record and fill in whatever
            # memento relations are present.
            record = {'original-uri': memento.get("original_uri"),
                      'memento-closest': '',
                      'memento-first': '',
                      'memento-last': '',
                      'timegate_uri': memento.get("timegate_uri")}
            mementos = memento.get("mementos")
            if mementos is not None:
                record['memento-closest'] = _memento_uri(mementos, "closest")
                record['memento-first'] = _memento_uri(mementos, "first")
                record['memento-last'] = _memento_uri(mementos, "last")
            writer.writerow(record)
            # Flush after every row so partial results reach the file.
            csvfile.flush()
def test_get_memento_data_non_compliant(input_uri_m):
    """Check that ``original_uri`` echoes the memento URI that was queried.

    ``input_uri_m`` arrives as a sequence; only its first element is used.
    The lookup uses a default client and an accept datetime of
    1 January 1970.
    """
    # TODO: pytest did not seem to split this into arguments
    uri_m = input_uri_m[0]

    client = MementoClient()

    http_date_format = "%a, %d %b %Y %H:%M:%S GMT"
    accept_datetime = datetime.datetime.strptime("Thu, 01 Jan 1970 00:00:00 GMT", http_date_format)

    memento_info = client.get_memento_info(uri_m, accept_datetime)
    original_uri = memento_info.get("original_uri")

    assert uri_m == original_uri
def test_nonexistent_urirs(input_uri_r):
    """Check the original and TimeGate URIs reported for the given URI-R.

    ``input_uri_r`` arrives as a sequence; only its first element is used.
    The TimeGate URI is expected to be the timetravel.mementoweb.org
    TimeGate prefix followed by the URI-R.
    """
    uri_r = input_uri_r[0]

    http_date_format = "%a, %d %b %Y %H:%M:%S GMT"
    accept_datetime = datetime.datetime.strptime("Thu, 01 Jan 1970 00:00:00 GMT", http_date_format)

    client = MementoClient()

    memento_info = client.get_memento_info(uri_r, accept_datetime)
    expected_timegate = 'http://timetravel.mementoweb.org/timegate/{}'.format(uri_r)

    assert memento_info.get("original_uri") == uri_r

    assert memento_info.get("timegate_uri") == expected_timegate
Ejemplo n.º 18
0
def test_get_memento_data_non_compliant(input_uri_m):
    """Verify ``original_uri`` equals the queried memento URI.

    Only the first element of ``input_uri_m`` is used.  A default client is
    queried with an accept datetime fixed at 1 January 1970.
    """
    # TODO: pytest did not seem to split this into arguments
    queried_uri = input_uri_m[0]

    client = MementoClient()

    epoch_text = "Thu, 01 Jan 1970 00:00:00 GMT"
    accept_datetime = datetime.datetime.strptime(
        epoch_text, "%a, %d %b %Y %H:%M:%S GMT")

    info = client.get_memento_info(queried_uri, accept_datetime)
    original_uri = info.get("original_uri")

    assert queried_uri == original_uri
Ejemplo n.º 19
0
def test_nonexistent_urirs(input_uri_r):
    """Verify the original and TimeGate URIs reported for the given URI-R.

    Only the first element of ``input_uri_r`` is used.  The reported
    TimeGate URI must be the timetravel.mementoweb.org TimeGate prefix with
    the URI-R appended.
    """
    queried_uri = input_uri_r[0]

    epoch_text = "Thu, 01 Jan 1970 00:00:00 GMT"
    accept_datetime = datetime.datetime.strptime(
        epoch_text, "%a, %d %b %Y %H:%M:%S GMT")

    client = MementoClient()

    info = client.get_memento_info(queried_uri, accept_datetime)

    assert info.get("original_uri") == queried_uri

    expected_timegate = (
        'http://timetravel.mementoweb.org/timegate/{}'.format(queried_uri))
    assert info.get("timegate_uri") == expected_timegate
Ejemplo n.º 20
0
def test_bad_timegate():
    """Expect a connection or client error for an unusable TimeGate host.

    Prints whether ``HTTP_PROXY`` is set in the environment, then queries
    ``http://www.cnn.com`` through the TimeGate ``http://www.example.moc``.
    Either ``requests.exceptions.ConnectionError`` or
    ``MementoClientException`` is accepted.
    """
    proxy_configured = 'HTTP_PROXY' in os.environ
    print("'HTTP_PROXY' in os.environ:  {}".format(proxy_configured))

    input_uri_r = "http://www.cnn.com"
    bad_uri_g = "http://www.example.moc"
    epoch_text = "Thu, 01 Jan 1970 00:00:00 GMT"
    accept_datetime = datetime.datetime.strptime(
        epoch_text, "%a, %d %b %Y %H:%M:%S GMT")

    client = MementoClient(timegate_uri=bad_uri_g)

    acceptable_errors = (
        requests.exceptions.ConnectionError,
        memento_client.memento_client.MementoClientException,
    )
    with pytest.raises(acceptable_errors):
        # Only the raised exception matters; the value is discarded.
        client.get_memento_info(input_uri_r,
                                accept_datetime).get("original_uri")
Ejemplo n.º 21
0
def get_via_mementos(uri, dt):
    """Return an archived (memento) URI for *uri* near *dt*, if one is found.

    Queries the module-level ``timegate`` through ``MementoClient`` (native
    TimeGate check disabled).  When mementos are reported, the first URI of
    the "closest" entry is used, falling back to the "memento" entry, and
    the 14-digit timestamp path segment is rewritten to carry an ``id_``
    suffix.

    Args:
        uri: the URI to look up.
        dt: the datetime passed to the memento lookup.

    Returns:
        The rewritten memento URI, or the input ``uri`` unchanged when no
        mementos are reported or the lookup fails (the error is printed,
        not raised).
    """
    mc = MementoClient(timegate_uri=timegate, check_native_timegate=False)
    print("Getting mementos for %s ..." % uri)
    try:
        mementos = mc.get_memento_info(uri, dt).get("mementos")
        if mementos:
            print("Got mementos for %s ..." % uri)
            if 'closest' in mementos:
                uri = mementos.get("closest").get("uri")[0]
            elif 'memento' in mementos:
                # Read the key that was just tested; this branch previously
                # read "closest", which is known to be absent here.
                uri = mementos.get("memento").get("uri")[0]
            # Need to patch the id_ into the url:
            uri = re.sub(r"\/(\d{14})\/", r"/\1id_/", uri)
    except Exception as e:
        # Deliberate best-effort: report the problem and fall back to the
        # URI we currently have.
        print(e)

    return uri
Ejemplo n.º 22
0
def on_callback_query(msg):
    """Handle an inline-keyboard callback query from Telegram.

    The original URL is taken from the second space-separated word of the
    message being replied to.  Depending on the callback data:

    * ``'save'``: when the module-level ``delay`` is set and has passed,
      submit the URL to archive.fo, take the archive address from the
      response and start a new three-minute cooldown.
    * anything else: rebuild the keyboard via ``link_handler`` and, for
      ``'back'`` / ``'next'``, look up the previous / next memento relative
      to the timestamp embedded in the currently displayed archive URI.

    If a new archive URI was obtained, the message is edited to show it
    (and edited again with the keyboard when one is available).  The
    callback query is always answered, with an explanatory text when
    something went wrong.

    Args:
        msg: the callback-query message dict delivered by telepot.
    """
    global delay

    query_id, chat_id, query_data = telepot.glance(msg,
                                                   flavor='callback_query')
    print('Recieved query ' + query_id)
    url = msg['message']['reply_to_message']['text'].split(' ')[1]
    msg_idf = telepot.message_identifier(msg['message'])
    callback_text = ''
    # Stay None when no replacement text / keyboard was produced, so the
    # edit step below can be skipped explicitly instead of relying on a
    # NameError being swallowed.
    archive_uri = None
    keyboard = None

    if query_data == 'save':
        if delay != '':
            if datetime.datetime.now() > delay:
                r = requests.get('https://archive.fo/')
                html = r.text
                soup = BeautifulSoup(html, 'lxml')
                submitid = soup.find('input').get('value')
                headers = {
                    'User-agent':
                    'Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/54.0.2840.99 Safari/537.36'
                }
                values = {'submitid': submitid, 'url': url, 'anyway': '1'}
                r = requests.post('https://archive.fo/submit/',
                                  data=values,
                                  headers=headers)
                uri = r.text
                # The archive address is the first double-quoted string in
                # the response body.
                archive_uri = uri.split('"')[1]
                delay = datetime.datetime.now() + datetime.timedelta(minutes=3)
                if 'archive.fo' not in archive_uri:
                    callback_text = 'Something went wrong, let @raku_cat know'
            else:
                callback_text = 'Saving on cooldown, please try again in a few miniutes.'
    else:
        uri = msg['message']['text']
        # NOTE(review): link_handler returns a plain string on several
        # paths, in which case this unpacking raises -- confirm the
        # callers guarantee the (uri, keyboard) form here.
        _, keyboard = link_handler(url)
        # The fourth '/'-separated piece of the displayed archive URI is
        # parsed as a YYYYMMDDHHMMSS timestamp.
        dt = uri.split('/')[3]
        dt = datetime.datetime.strptime(dt, '%Y%m%d%H%M%S')
        timegate = 'https://archive.fo/timegate/'
        mc = MementoClient(timegate_uri=timegate, check_native_timegate=False)
        if query_data == 'back':
            try:
                archive_uri = mc.get_memento_info(
                    url, dt).get('mementos').get('prev').get('uri')[0]
            except AttributeError:
                callback_text = 'No older archives or something went wrong.'
        elif query_data == 'next':
            try:
                # NOTE(review): 'back' queries with `url` (the original
                # link) while 'next' queries with `uri` (the displayed
                # archive address) -- confirm this asymmetry is intended.
                archive_uri = mc.get_memento_info(
                    uri, dt).get('mementos').get('next').get('uri')[0]
            except AttributeError:
                callback_text = 'No newer archives or something went wrong.'

    if archive_uri is not None:
        try:
            bot.editMessageText(msg_idf, archive_uri)
        except Exception:
            # Best-effort: a failed edit must not prevent answering the
            # callback query below.
            pass
        if keyboard is not None:
            try:
                bot.editMessageText(msg_idf, archive_uri,
                                    reply_markup=keyboard)
            except Exception:
                # Best-effort, as above.
                pass
    bot.answerCallbackQuery(query_id, text=callback_text)
    print('Responding to callback ' + query_id)
Ejemplo n.º 23
0
def link_handler(link):
    """Find the latest archive.fo memento for the URL contained in *link*.

    If *link* has more than one space-separated word, the second word is
    taken as the candidate URL; otherwise the whole text is.  The candidate
    must match an http(s)/ftp(s) URL pattern.

    Returns:
        * ``'No valid URL found'`` when the candidate is not a URL;
        * the result of ``archive_create(uri)`` when the memento lookup
          raises ``AttributeError``;
        * the archive URI string when it contains ``'archive.fo'``;
        * a ``(archive_uri, keyboard)`` tuple when it contains
          ``'archive.is'``;
        * the TimeGate URI when it contains ``'trans'``;
        * an error message string otherwise.
    """
    words = link.split(' ')
    if len(words) > 1:
        link = words[1]

    pattern = re.compile(
        r'^(?:http|ftp)s?://'  # http:// or https://
        r'(?:(?:[A-Z0-9](?:[A-Z0-9-]{0,61}[A-Z0-9])?\.)+(?:[A-Z]{2,6}\.?|[A-Z0-9-]{2,}\.?)|'  # domain...
        r'localhost|'  # localhost...
        r'\d{1,3}\.\d{1,3}\.\d{1,3}\.\d{1,3}|'  # ...or ipv4
        r'\[?[A-F0-9]*:[A-F0-9:]+\]?)'  # ...or ipv6
        r'(?::\d+)?'  # optional port
        r'(?:/?|[/?]\S+)$',
        re.IGNORECASE)
    found = pattern.search(link)
    if not found:
        return 'No valid URL found'

    print('Url found')
    uri = found.group(0)
    print(uri)
    mc = MementoClient(timegate_uri='https://archive.fo/timegate/',
                       check_native_timegate=False)
    try:
        last_memento = mc.get_memento_info(uri).get("mementos").get("last")
        archive_uri = last_memento.get("uri")[0]
        print('Archive is ' + archive_uri)
    except AttributeError:
        # A missing piece of the memento info leads to creating an archive.
        return archive_create(uri)
    except NameError:
        print('Sum happen')
        return ('Something went wrong, let @raku_cat know')

    if 'archive.fo' in archive_uri:
        return archive_uri

    if 'archive.is' in archive_uri:
        save_row = [
            InlineKeyboardButton(text='Force save page',
                                 callback_data='save')
        ]
        navigation_row = [
            InlineKeyboardButton(text='← Prior', callback_data='back'),
            InlineKeyboardButton(text='Next →', callback_data='next')
        ]
        history_row = [
            InlineKeyboardButton(text='History',
                                 switch_inline_query_current_chat=uri)
        ]
        keyboard = InlineKeyboardMarkup(
            inline_keyboard=[save_row, navigation_row, history_row])
        return archive_uri, keyboard

    if 'trans' in archive_uri:
        timegate_uri = mc.get_memento_info(uri).get("timegate_uri")
        print('Sent weird api deal')
        return timegate_uri

    print('^No it wasn\'t')
    return 'Something went wrong, let @raku_cat know'