def archive_is(url):
    """ Function for pushing to archive.is """
    print("[*] Pushing to archive.is...")
    archiveis_result = archiveis.capture(url).replace("http://", "https://")
    print(archiveis_result)

def capture(url):
    """ Capture a URL in archive.is """
    # Easiest way to do it for now, archive.is API sucks
    # FIXME replace this lib
    return archiveis.capture(url)

def save_in_archive(url):
    """ Saves a URL in archive.is

    :param url: URL to save
    :return: URL in archive.is
    """
    return archiveis.capture(url)

def handle_msg_all(self, msg):
    if (msg['msg_type_id'] == 1 or msg['msg_type_id'] == 4) and msg['content']['type'] == 7:
        self.send_msg_by_uid(
            archiveis.capture(msg['content']['data']['url']),
            msg['user']['id'])
        # "Please copy the link above and open it outside the (fire)wall"
        self.send_msg_by_uid(u'请复制上述链接在墙外打开', msg['user']['id'])

def generate_text(quote, submission):
    text = quote[0].decode('string-escape')  # in case quote has \n in it
    text += '\n\n'
    text += '[Here\'s]('
    text += archiveis.capture(submission.url)
    text += ') an archived version of '
    urls = None
    if submission.is_self and submission.selftext:
        urls = get_urls(submission.selftext)
        archive_urls = map(archiveis.capture, urls)
    if submission.is_self:
        text += 'this thread'
        if urls:
            text += '[,' if quote[1] else ','
        else:
            text += '[.' if quote[1] else '.'
    else:
        text += 'the linked post'
        text += '[.' if quote[1] else '.'
    if quote[1]:
        text += ']('
        text += quote[1]
        text += ')'
    if urls:
        text += ' and the links:'
        for link in zip(urls, archive_urls):
            text += '\n\n'
            text += '[' + link[0] + '](' + link[1] + ')'
    return text

async def archive(self, ctx, url):
    # "%s archived %s"
    await bot_log(_("%s가 %s를(을) 아카이브 했습니다.\n") % (ctx.message.author, url))
    try:
        if not "http" in url:
            url = "http://" + url
        archive_url = archiveis.capture(url, self.proxyString)
        # "Archiving... please wait a moment!"
        await self.bot.send_message(ctx.message.channel, _("아카이브 중입니다...\n조금만 기다려 주세요!"))
        self.driver.get(url)
        wait = WebDriverWait(self.driver, 2)
        wait.until(EC.presence_of_element_located((By.XPATH, 'html')))
        self.driver.maximize_window()
        self.driver.find_element_by_tag_name('html').screenshot('screenshot.png')
        await self.bot.send_file(ctx.message.channel, 'screenshot.png')
        await self.bot.send_message(ctx.message.channel, archive_url)
        # "Archive URL: %s"
        await self.bot.log(_("아카이브 주소:%s\n") % (url))
        os.remove('screenshot.png')
    except:
        try:
            self.driver.close()
        except:
            pass
        # "An error occurred!"
        await self.bot.send_message(ctx.message.channel, _("오류가 발생했어요!"))
        raise

def archive_url(original_url, username, running_locally):
    ''' Return error_message if failed; otherwise None. '''
    error_message = None
    print('adjust_url(' + original_url + ')')
    url = url_util.adjust_url(original_url)
    if url is not None:
        # Pop from pending
        dynamodb.pop_account_archive_request_by(list_name=dynamodb.ACCOUNT_TABLE_ARCHIVE_PENDING_REQUEST_LIST,
                                                username=username,
                                                original_url=original_url)
        # Record the current datetime
        utc_datetime = datetime.datetime.utcnow()
        utc_datetime_str = str(utc_datetime)
        # Save it on archive website
        initial_archive_md_url = None
        if running_locally:
            try:
                print('archiveis.capture(' + url + ')')
                initial_archive_md_url = archiveis.capture(url)
            except Exception as e:
                print('Unexpected exception: ' + str(e))
        # Screenshot the url webpage
        print('take_url_webpage_snapshot(' + url + ')')
        url_webpage_png, _url_inner_html = webpage_snapshot.take_url_webpage_snapshot(url=url,
                                                                                      running_locally=running_locally)
        # Create new archive entry on DynamoDB
        dynamodb.create_new_archive(url=url,
                                    datetime=utc_datetime_str,
                                    username=username,
                                    archive_md_url=initial_archive_md_url)
        # Store the screenshot on S3
        archive_id, _, _ = dynamodb.get_archive_info(url=url, datetime=utc_datetime_str)
        url_webpage_png_s3_key = s3.WEBPAGE_SCREENSHOT_DIR + archive_id + '.png'
        s3.upload_file_bytes_object(key=url_webpage_png_s3_key, file_bytes=url_webpage_png)
        # Store the text of the webpage on S3
        # url_webpage_text = clean_text(extract_text(url_inner_html)).encode()
        # url_weboage_text_s3_key = s3.WEBPAGE_TEXT_DIR + archive_id + '.txt'
        # s3.upload_file_bytes_object(key=url_weboage_text_s3_key, file_bytes=url_webpage_text)
        # Early-exit for success
        return None
    else:
        error_message = 'Invalid URL: ' + original_url
    # All success must early-exit
    assert error_message
    # Pop from pending
    dynamodb.pop_account_archive_request_by(list_name=dynamodb.ACCOUNT_TABLE_ARCHIVE_PENDING_REQUEST_LIST,
                                            username=username,
                                            original_url=original_url)
    # Add into failed
    dynamodb.push_account_archive_request(list_name=dynamodb.ACCOUNT_TABLE_ARCHIVE_FAILED_REQUEST_LIST,
                                          username=username,
                                          original_url=original_url)
    return error_message

def archive_target(self, target):
    """ Returns log of what was archived """
    archive_url = archiveis.capture(target)
    message = f"target {target} has been archived"
    # This logs to the docker logs
    self.logger.info(message)
    return archive_url

def archive(self, url, num):
    ar = archiveis.capture(url)
    if (num + 1) % 5 == 0:
        print("pausing to let the archiver catch up [ 20 seconds ]")
        time.sleep(20)
    print("[ %d ] " % num + ar)
    return str(ar)

def get_all_tweets(screen_name):
    if (consumer_key == ""):
        print "You need to set up the script first. Edit it and add your keys."
        return

    # Twitter only allows access to a user's most recent 3240 tweets with this method

    # authorize twitter, initialize tweepy
    auth = tweepy.OAuthHandler(consumer_key, consumer_secret)
    auth.set_access_token(access_key, access_secret)
    api = tweepy.API(auth)

    # initialize a list to hold all the tweepy Tweets
    alltweets = []

    # make initial request for most recent tweets (200 is the maximum allowed count)
    new_tweets = api.user_timeline(screen_name=screen_name, count=200)

    # save most recent tweets
    alltweets.extend(new_tweets)

    # save the id of the oldest tweet less one
    oldest = alltweets[-1].id - 1

    # keep grabbing tweets until there are no tweets left to grab
    while len(new_tweets) > 0:
        print "getting tweets before {0}".format(oldest)

        # all subsequent requests use the max_id param to prevent duplicates
        new_tweets = api.user_timeline(screen_name=screen_name, count=200, max_id=oldest,
                                       include_entities=True, tweet_mode='extended')

        # save most recent tweets
        alltweets.extend(new_tweets)

        # update the id of the oldest tweet less one
        oldest = alltweets[-1].id - 1

        print "...{0} tweets downloaded so far".format(len(alltweets))

    for tweet in alltweets:
        tweetID = tweet.id_str
        tweetURL = "https://twitter.com/{0}/status/{1}".format(screen_name, tweetID)
        print "Archiving {0}...".format(tweetURL)
        archive_url = archiveis.capture(tweetURL)
        archiveorg_url = savepagenow.capture_or_cache(tweetURL)
        print "Tweet archived! archive.is: {0} ||| archive.org: {1}".format(archive_url, archiveorg_url[0])

    print "All tweets successfully archived."

def arquivar_tweets():
    lista_ids = database.recupera_ids_sem_arquivo()
    for par in lista_ids:
        url = "https://twitter.com/" + str(par[1]) + "/status/" + str(par[0])
        try:
            url_arquivo = archiveis.capture(url)
            print(url_arquivo)
            database.adiciona_arquivo(par[0], url_arquivo)
        except Exception as E:
            traceback.print_exc()
            print("Problema no arquivador principal")
            exit()

def archive(self, url, num):
    base = "https://archive.today/download/"
    ar = archiveis.capture(url)
    if (num + 1) % 5 == 0:
        print("pausing to let the archiver catch up [ 20 seconds ]")
        time.sleep(20)
    print("[ %d ] " % num + ar)
    return str(ar)

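The two archive() helpers above throttle themselves with a fixed pause every fifth capture. A minimal alternative sketch, using only archiveis and the standard library, retries a failed capture with an increasing delay instead of sleeping on a fixed schedule; the retry_capture name and its parameters are illustrative and not taken from any of the projects quoted here.

import time

import archiveis


def retry_capture(url, attempts=3, base_delay=5):
    """Try archiveis.capture() up to `attempts` times, sleeping longer after each failure."""
    last_error = None
    for attempt in range(attempts):
        try:
            return archiveis.capture(url)
        except Exception as error:  # archive.is can time out or rate-limit
            last_error = error
            time.sleep(base_delay * (attempt + 1))
    raise last_error
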
async def replace_url(self, link, msg):
    await msg.delete()
    if patterns.youtube.search(msg.content):
        new_url = patterns.domain.sub(r"\1hooktube.com\3", link.group(0))
    else:
        new_url = archive.capture(link.group(0))
        new_url = new_url.replace("http", "https")
    logger.report(new_url)
    new_content = patterns.url.sub(new_url, msg.content)
    await msg.channel.send(msg.author.nick + ": " + new_content)
    return

def job_from_facebook(idx, store_urls, db):
    driver = get_selenium_driver()
    news_collection = db[Constants.NEWS_COLLECTION]
    urls = store_urls[idx]
    t = open("./fb_result.txt", 'a')
    for id_url in urls:
        id = id_url[0]
        url = id_url[1]
        try:
            # url = url[0]
            if "archive" not in url:
                archive_url = archiveis.capture(url)
                time.sleep(30)
            else:
                archive_url = url
            print(archive_url)
            # if "wip/" in archive_url:
        except Exception as e:
            print(str(e))
            t.write(url + "\n")
            t.flush()
            print("Factcheck ERROR in {}".format(url))
            continue
        if "wip/" in archive_url:
            archive_url = archive_url.replace("wip/", "")
        if "wip/" not in archive_url:
            try:
                return_dic = {}
                return_dic['ref_source'] = get_news_source_article(archive_url, driver)
                return_dic['news_id'] = id
                return_dic['ref_source_url'] = return_dic['ref_source']['url']
                print(id)
                news_collection.find_one_and_update(
                    {"news_id": return_dic['news_id']},
                    {"$set": return_dic},
                    upsert=True)
            except:
                print("Problem in {}".format(archive_url))
                continue
        else:
            news_collection.find_one_and_update(
                {"news_id": id},
                {"$set": {
                    'archive_wip': archive_url.replace("/wip", ""),
                    'news_id': id
                }},
                upsert=True)
    t.close()

def is_memento(clip_id):
    """ Archive a clip with archive.is. """
    clip = Clip.objects.get(id=clip_id)
    logger.debug("Archiving {} with archive.is".format(clip.url))
    try:
        is_url = archiveis.capture(clip.url)
        is_memento = Memento.objects.create(url=is_url, archive="archive.is")
        logger.debug("Created {}".format(is_memento))
        clip.mementos.add(is_memento)
    except Exception as e:
        logger.debug("archive.is failed")
        logger.debug(e)

def echo_all(message):
    reply = archiveis.capture(message.text)
    try:
        bot.reply_to(message, reply)
    except Exception as e:
        bot.reply_to(message, 'oooops, please send the url again.')
    html = requests.get(message.text)
    Title = BeautifulSoup(html.text, "html.parser").title.text.encode('utf-8').strip()
    with open('archive.csv', 'a') as f1:
        f1.write(time.ctime() + ',' + message.text + ',' + reply + ',')
        f1.write(Title)
        f1.write('\n')

def handle_submission(self, submission):
    logging.debug('[submissions] Processing new submission %s', submission.id)
    if submission.selftext == '':
        urls = re.findall(self.regex, submission.url)
    else:
        urls = re.findall(self.regex, submission.selftext)
    if urls:
        logging.info('[submissions] New submission %s with bild.de URLs found', submission.id)
        archive_urls = []
        bildplus = 0
        for url in urls:
            parsed_url = urlparse(url)
            if parsed_url.path.startswith('/bild-plus/'):
                logging.info('[submissions] Skipping %s because it is probably a BILD+ link', url)
                bildplus += 1
                continue
            logging.info('[submissions] Capturing %s', url)
            archive_url = archiveis.capture(url)
            if archive_url:
                archive_urls.append(archive_url)
                logging.info('[submissions] Captured: %s', archive_url)
            else:
                logging.warning('[submissions] Got an empty archive.is URL back. Something is wrong')
        if len(urls) != len(archive_urls) + bildplus:
            logging.warning('[submissions] Found %d bild.de URLs, but got only %d archive.is links',
                            len(urls), len(archive_urls))
        if archive_urls:
            links = "\n- ".join(archive_urls)
            body = ("> Diese Zeitung ist ein Organ der Niedertracht. Es ist falsch, sie zu lesen.\n"
                    "> Jemand, der zu dieser Zeitung beiträgt, ist gesellschaftlich absolut inakzeptabel.\n"
                    "> Es wäre verfehlt, zu einem ihrer Redakteure freundlich oder auch nur höflich zu sein.\n"
                    "> Man muß so unfreundlich zu ihnen sein, wie es das Gesetz gerade noch zuläßt.\n"
                    "> Es sind schlechte Menschen, die Falsches tun.\n\n"
                    "[Max Goldt](https://de.wikipedia.org/wiki/Max_Goldt), deutscher Schriftsteller\n\n"
                    "Du kannst diesen Artikel auf archive.is lesen, wenn du nicht auf bild.de gehen willst:\n\n- "
                    + links +
                    "\n\n"
                    "----\n\n"
                    "^^[Info](https://www.reddit.com/r/MaxGoldtBot) | "
                    "[Autor](https://www.reddit.com/u/pille1842) | "
                    "[GitHub](https://github.com/pille1842/MaxGoldtBot) | "
                    "[Warum die Bild schlecht ist]"
                    "(http://www.bildblog.de/62600/warum-wir-gegen-die-bild-zeitung-kaempfen/)")
            submission.reply(body)
            logging.info('[submissions] Replied to %s with %d links', submission.id, len(archive_urls))
        else:
            logging.warning('[submissions] No reply to %s: %d bild.de links found, none archived',
                            submission.id, len(urls))
    else:
        logging.debug('[submissions] No relevant URLs found in %s', submission.id)

def echo_all(message):
    reply = archiveis.capture(message.text)
    bot.reply_to(message, reply)
    html = requests.get(message.text)
    Title = BeautifulSoup(html.text, "html.parser").title.text.encode('utf-8').strip()
    with open('archive.csv', 'a') as f1:
        f1.write(time.ctime() + ',' + message.text + ',' + reply + ',')
        f1.write(Title)
        f1.write('\n')
    with open('archive.txt', 'a') as f2:
        f2.write(time.ctime() + '\n' + message.text + '\n' + reply + '\n')
        f2.write(Title)
        f2.write('\n' + '\n')

def on_data(self, data):
    # convert from JSON to a dictionary
    tweet = json.loads(data)

    # grab the tweet's screen name, ID, etc
    tweet_id = tweet.get('id_str')
    screen_name = tweet.get('user', {}).get('screen_name')
    tweet_text = tweet.get('text')

    # grab the reply tweet information
    reply_tweet_id = tweet.get('in_reply_to_status_id_str')
    reply_tweet_screen_name = tweet.get('in_reply_to_screen_name')

    if reply_tweet_id is not None:
        # make the URL of the tweet to archive
        tweet_to_archive = "https://twitter.com/%s/status/%s" % (reply_tweet_screen_name, reply_tweet_id)

        # print confirmation of finding tweet
        print "[*] Given tweet to archive: %s" % tweet_to_archive

        # archive the tweet
        internet_archive_url = internet_archive(tweet_to_archive)

        # push to archive.is
        print "[*] Pushing to archive.is..."
        archiveis_result = archiveis.capture(tweet_to_archive).replace("http://", "https://")

        print "[!] Archived %s" % tweet_to_archive
        print internet_archive_url
        print archiveis_result

        # sleep, so the bot doesn't immediately reply and potentially trigger bot alerts
        time.sleep(10)

        # content of tweet to send to requester
        message = "Sure thing, here are the archive links: %s, %s" % (internet_archive_url, archiveis_result)

        # post a reply to the tweet
        api.update_status(message, in_reply_to_status_id=tweet_id, auto_populate_reply_metadata=True)
        print "[!] Posted a reply"

        # sleep to avoid rate limiting
        time.sleep(300)

    return True

def main():
    output_file = open(args.log, 'w')
    for request in range(args.start, args.end + 1):
        if FLAG:
            connection_attempt = 1
            # todo: figure out non-naive way to do this. archive human readable urls instead?
            linkToArchive = "https://www.righttoknow.org.au/request/" + str(request)

            # print link being currently archived
            print "\n[*] Given FOI request URL to archive: %s" % linkToArchive

            # archive the URL
            internet_archive_url = internet_archive(linkToArchive, connection_attempt, args.retries, FLAG)

            # push to archive.is
            print "[+] Uploading to archive.is..."
            archiveis_result = archiveis.capture(linkToArchive).replace("http", "https")

            print "[+] FOI Request Archived %s" % linkToArchive
            print "[+] Wayback Machine: %s" % str(internet_archive_url)
            print "[+] archive.is: %s \n" % str(archiveis_result)

            # save links to file
            if args.log:
                output_file.write(str(internet_archive_url))
        else:
            continue

        # sleep to avoid bot triggers
        time.sleep(0.3)

    output_file.close()

    # kill Tor process on completion
    tor_process.kill()

    print "[*] %d FOI requests archived on the Wayback Machine" % (args.end - args.start + 1)
    print "[*] Links saved to file: %s\n" % args.log
    print "[*] Killed Tor process"
    print "[*] Exiting..."
    return True

def save_with_archiveis(url):
    """saves the page to archive.is """
    print("Saving url: {} with archive.is...".format(url))
    try:
        archiveis_location = archiveis.capture(url)
        print("archive.is saved on: {}".format(archiveis_location))
        archiveis_download = 'https://archive.today/download/{}.zip'.format(
            re.sub('^.+/', '', archiveis_location))
        msg = '{} downloadable at {}'.format(archiveis_location, archiveis_download)
    except Exception as e:
        print(" sorry, something went wrong :(\n {}".format(e))
        print("Impossible to save the URL to archive.is")
        print("ERROR: {}".format(str(e)))
        msg = 'FAILED'
    return msg

def arquivar_tweets():
    print("Arquivando tweets...")
    lista_ids = database.recupera_ids_sem_arquivo2()
    for par in lista_ids:
        url = "https://twitter.com/" + str(par[1]) + "/status/" + str(par[0])
        print(url)
        try:
            url_arquivo = archiveis.capture(url)
            database.adiciona_arquivo(par[0], url_arquivo)
        except Exception as E:
            print(E)
            print("Problema no arquivador principal")
            try:
                url_arquivo = savepagenow.capture(url)
                database.adiciona_arquivo(par[0], url_arquivo)
                time.sleep(20)
            except Exception as E2:
                print(E2)
                print("Problema no arquivador reserva.")

def test_capture(self):
    archive_url_1 = archiveis.capture("http://www.example.com/")
    self.assertTrue(archive_url_1.startswith("http://archive.vn/"))

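The test above hits archive.is for real, so it can fail for network or rate-limiting reasons alone. A hedged sketch of an offline variant: patch archiveis.capture and exercise a thin wrapper such as save_in_archive from earlier in this collection. The module name `archiver` is an assumption; adjust the patch target to wherever your code imports archiveis.

from unittest import TestCase, mock

# Hypothetical module holding the save_in_archive wrapper shown earlier.
import archiver


class SaveInArchiveTest(TestCase):
    @mock.patch("archiver.archiveis.capture", return_value="http://archive.vn/abc12")
    def test_save_in_archive_offline(self, mock_capture):
        # No network traffic: the patched capture returns a canned memento URL.
        result = archiver.save_in_archive("http://www.example.com/")
        mock_capture.assert_called_once_with("http://www.example.com/")
        self.assertEqual(result, "http://archive.vn/abc12")
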
                site=site,
                update=update,
                archive='archive.org',
                url=ia_memento,
            )
        else:
            logger.info("Internet Archive returned a cached memento")
    except Exception:
        logger.info("Adding Internet Archive memento failed")

    # Archive.is mementos were turned on
    if site.has_archiveis_mementos:
        logger.info("Adding archive.is memento for %s" % site.url)
        try:
            is_memento = archiveis.capture(
                site.url,
                user_agent="pastpages.org ([email protected])"
            )
            is_created = Memento.objects.filter(url=is_memento).count() == 0
            if is_created:
                memento = Memento.objects.create(
                    site=site,
                    update=update,
                    archive='archive.is',
                    url=is_memento,
                )
            else:
                logger.info("archive.is returned a cached memento")
        except Exception:
            logger.info("Adding archive.is memento failed")

    # webcitation mementos were turned on

print "[*] Pushing to Perma.cc..." perma_json = {} perma_json['url'] = '%s' % input # remember to put your Perma.cc API key in here response = requests.post("https://api.perma.cc/v1/archives/?api_key=YOUR_PERMA_API_KEY_HERE", data=perma_json) if response.status_code == 201: result = json.loads(response.content) page_id = result['guid'] perma_url = "https://perma.cc/%s" % page_id return perma_url else: print "[*] Connection error" # push to The Internet Archive internet_archive_result = internet_archive(input) print internet_archive_result # push to archive.is print "[*] Pushing to archive.is..." archiveis_result = archiveis.capture(input) print archiveis_result # push to perma.cc perma_result = perma(input) print perma_result
def get_foo(url):
    print(url)
    archive_url = archiveis.capture(url)
    foo = archive_url.split('/')[-1]
    return foo

import archiveis
import savepagenow
import time

if __name__ == '__main__':
    gevent.monkey.patch_all()
    print("Arquivando tweets...")
    while True:
        lista_ids = database.recupera_ids_sem_arquivo()
        for par in lista_ids:
            url = "https://twitter.com/" + str(par[1]) + "/status/" + str(par[0])
            print(url)
            try:
                url_arquivo = archiveis.capture(url)
                database.adiciona_arquivo(par[0], url_arquivo)
            except Exception as E:
                print(E)
                print("Problema no arquivador principal")
                try:
                    url_arquivo = savepagenow.capture(url)
                    database.adiciona_arquivo(par[0], url_arquivo)
                    time.sleep(20)
                except Exception as E2:
                    print(E2)
                    print("Problema no arquivador reserva.")


def arquivar_tweets():
    print("Arquivando tweets...")

def archive_is(url):
    return archiveis.capture(url)

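Several projects reduce the integration to a one-line wrapper like archive_is() above, while others guard against exceptions and empty return values (see the bild.de bot and the clip archiver earlier). A minimal defensive sketch combining both ideas; the safe_capture name is illustrative and not taken from any of the quoted projects.

import logging

import archiveis

logger = logging.getLogger(__name__)


def safe_capture(url):
    """Like archive_is() above, but swallow failures and treat an empty result as a miss."""
    try:
        memento_url = archiveis.capture(url)
    except Exception as error:  # network errors, rate limiting, etc.
        logger.warning("archive.is capture of %s failed: %s", url, error)
        return None
    return memento_url or None
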
    perma_json = {}
    perma_json['url'] = '%s' % input
    # remember to put your Perma.cc API key in here
    response = requests.post(
        "https://api.perma.cc/v1/archives/?api_key=YOUR_PERMA_API_KEY_HERE",
        data=perma_json)
    if response.status_code == 201:
        result = json.loads(response.content)
        page_id = result['guid']
        perma_url = "https://perma.cc/%s" % page_id
        return perma_url
    else:
        print "[*] Connection error"

# push to The Internet Archive
internet_archive_result = internet_archive(input)
print internet_archive_result

# push to archive.is
print "[*] Pushing to archive.is..."
archiveis_result = archiveis.capture(input).replace("http://", "https://")
print archiveis_result

# push to perma.cc
perma_result = perma(input)
print perma_result

def start(self):
    return archiveis.capture(self._url)

def bot_login(identifiers):
    login = praw.Reddit(**identifiers)
    return login


with open("identifiers.csv") as id_csv:
    reader = csv.reader(id_csv)
    imported_id = {row[0]: row[1] for row in reader}

reddit = bot_login(imported_id)
subreddit = reddit.subreddit("badmathematics")
submission_stream = subreddit.stream.submissions(skip_existing=True, pause_after=0)

while True:
    submission = next(submission_stream)
    if submission and not submission.is_self:
        url = submission.url
        if url.startswith("https://www.reddit.com"):
            url = url[0:8] + 'old' + url[11:]
        archive_url = archiveis.capture(url)
        comment_text = f"[Here's]({archive_url}) an archived version of this thread. \n" \
                       "[^^Source](https://github.com/kitegi/discount-gv)"
        submission.reply(comment_text)
        print("Reply sent")
    else:
        sleep(50)
    sleep(10)