import time

import requests
from archivenow import archivenow
from bs4 import BeautifulSoup


def echo_all(message):
    # `bot` is the Telegram bot instance, defined elsewhere (see the wiring
    # sketch below).
    if 'weibo.com' in message.text or 'weibo.cn' in message.text:
        try:
            bot.reply_to(message, "archive doesn't support weibo")
        except Exception:
            bot.reply_to(message, 'oooops, please send the url again.')
    else:
        # Some WeChat links carry a unique personal id in their query string;
        # keep only the first five parameters to strip it.
        if message.text.startswith('https://mp.weixin.qq.com/') and '__biz=' in message.text:
            url = '&'.join(message.text.split('&', 5)[:5])
        else:
            url = message.text
        try:
            # Push to archive.today ('is') and reply with the memento URL.
            reply = archivenow.push(url, 'is')[0]
            bot.reply_to(message, reply)
        except Exception:
            bot.reply_to(message, 'oooops, please send the url again.')
            return  # no memento URL to log
        html = requests.get(url)
        soup = BeautifulSoup(html.text, "html.parser")
        # WeChat articles keep their title in <h2>; fall back to <title>.
        if message.text.startswith('https://mp.weixin.qq.com/s'):
            Title = soup.h2.text.strip()
        else:
            Title = soup.title.text.strip()
        with open('archive.csv', 'a') as f1:
            f1.write(time.ctime() + ',' + message.text + ',' + reply + ',')
            f1.write(Title)
            f1.write('\n')
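# A minimal wiring sketch for the handlers above and below (assumption, not
# part of the original snippets): `bot` is presumably a pyTelegramBotAPI
# instance and `echo_all` is registered as a catch-all message handler.
# 'BOT_TOKEN' is a placeholder.
import telebot

bot = telebot.TeleBot('BOT_TOKEN')

@bot.message_handler(func=lambda message: True)
def echo_all(message):
    ...  # body as above

bot.polling()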
import datetime
import os
import random
import time

import requests
from archivenow import archivenow
from bs4 import BeautifulSoup


def echo_all(message):
    if 'weibo.com' in message.text or 'weibo.cn' in message.text:
        try:
            bot.reply_to(message, "archive doesn't support weibo")
        except Exception:
            bot.reply_to(message, 'oooops, please send the url again.')
    else:
        # Strip the personal '__biz' tracking parameters from WeChat links.
        if message.text.startswith('https://mp.weixin.qq.com/') and '__biz=' in message.text:
            url = '&'.join(message.text.split('&', 5)[:5])
        else:
            url = message.text
        headers = {
            "User-Agent": "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-us",
            "Connection": "keep-alive",
            "Accept-Charset": "GB2312,utf-8;q=0.7,*;q=0.7",
        }
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.text, "html.parser")
        # Derive a file name: prefer the page title, fall back to a
        # date-plus-random slug when the page could not be parsed.
        if len(soup) == 0:
            Title = datetime.datetime.now().strftime("%Y-%m-%d-") + str(random.randrange(2, 50000000))
        else:
            if message.text.startswith('https://mp.weixin.qq.com/s'):
                Title = soup.h2.text.strip()
            elif "zhihu.com" in message.text:
                Title = soup.title.text.strip() + "-" + str(random.randrange(2, 50000000))
            else:
                Title = soup.title.text.strip()
        # Sanitize the title for use as a file name.
        Title = Title.replace('\n', '')
        for ch in '|<>,':
            Title = Title.replace(ch, ' ')
        Title = Title.replace(' ', '_')
        # Save a single-file copy of the page with monolith (must be installed).
        # Note: url is interpolated unquoted here; the list form of
        # subprocess.call (used in a later variant) avoids shell-quoting issues.
        cmd = 'monolith ' + url + ' -o /srv/web/mono/' + Title + '.html'
        os.system(cmd)
        reply_url = 'http://206.189.252.32:8083/' + Title + '.html'
        bot.reply_to(message, reply_url)
        # Push to the Internet Archive and reply with the memento URL.
        try:
            reply_ia = archivenow.push(url, 'ia')[0]
            bot.reply_to(message, reply_ia)
        except Exception:
            bot.reply_to(message, 'oooops, please send the url again.')
            return  # no memento URL to log
        with open('archive.csv', 'a') as f1:
            print(reply_ia)
            f1.write(time.ctime() + ',' + message.text + ',' + reply_ia + ',')
            f1.write(Title)
            f1.write('\n')
def archive_url(url):
    # Convert the link to old.reddit.com
    if is_reddit_link(url):
        url = "https://old." + url[url.find("reddit"):]
    archived_url = archivenow.push(url, "ia")[0]
    # If archiving fails, return the original link
    if not archived_url.startswith("https://"):
        archived_url = url
    return archived_url
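# `archive_url` relies on an `is_reddit_link` helper that is not shown here.
# A hypothetical stand-in consistent with how it is used above:
def is_reddit_link(url):
    # Matches www.reddit.com as well as bare reddit.com URLs.
    return "reddit.com" in url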
import argparse
import errno
import os
import re
import subprocess
import unicodedata
import urllib.request
from http.client import IncompleteRead

import requests
from bs4 import BeautifulSoup


def main():
    parser = argparse.ArgumentParser(
        description='Follower Count History. Given a Twitter username, '
                    'collect follower counts from the Internet Archive.')
    parser.add_argument('-g', dest='graph', action='store_true',
                        help='Generate a graph with data points')
    parser.add_argument('-e', dest='allMemento', action='store_false',
                        help='Collect every memento, not just one per month')
    parser.add_argument('uname', help='Twitter username without @')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-p', dest='push', action='store_true',
                       help='Push to Internet Archive')
    group.add_argument('-P', dest='pushAll', action='store_true',
                       help='Push to all Archives available through ArchiveNow')
    args = parser.parse_args()

    # Optional dependencies
    if args.push or args.pushAll:
        from archivenow import archivenow
        import datetime

    def slugify(value):
        """
        Convert to ASCII. Convert spaces to hyphens. Remove characters that
        aren't alphanumerics, underscores, or hyphens. Convert to lowercase.
        Also strip leading and trailing whitespace.
        """
        value = str(value)
        value = unicodedata.normalize('NFKD', value).encode('ascii', 'ignore').decode('ascii')
        value = str(re.sub(r'[^\w\s-]', '', value).strip().lower())
        value = str(re.sub(r'[-\s]+', '-', value))
        return value

    archivelink = 'http://web.archive.org/web/timemap/link/http://twitter.com/' + args.uname
    print(args.uname)
    print(archivelink)
    r = requests.get(archivelink)
    linkslist = []
    safeuname = slugify(args.uname)
    try:
        os.makedirs('./' + safeuname)
    except OSError as err:
        if err.errno != errno.EEXIST:
            raise
    writefile = './' + safeuname + '/' + safeuname + ".csv"
    errorfile = './' + safeuname + '/' + safeuname + "-Error.csv"
    w = open(writefile, "a+")
    e = open(errorfile, "a+")
    olddates = []
    if os.stat(writefile).st_size == 0:
        # ensure the header line does not get rewritten
        w.write('date,count,URL-M' + '\n')
    else:
        w.seek(0)
        # read in old data points
        for line in w.readlines():
            row = line.split(",")
            if row[0] != "date":
                olddates.append(row[0].replace("-", ""))
        # readlines() leaves the file pointer back at the end

    # On each line of the TimeMap, ignore it unless rel="memento" exists;
    # if it does, extract the link and add it to the list of links.
    lastline = ''
    for line in r.iter_lines():
        if ('rel="memento"'.encode('utf-8') in line
                or 'rel="first memento"'.encode('utf-8') in line):
            if line != lastline:
                lastline = line
                linkslist.append(line[1:line.find('>;'.encode('utf-8'))])
    print(str(len(linkslist)) + " archive points found")

    lastdate = ''
    date = '0'
    for line in linkslist:
        line = line.decode('utf-8')
        dateloc = line.find("/web/")
        date = line[dateloc + 5:dateloc + 19]  # get the 14-digit timestamp from the link
        day = '-'.join([date[:4], date[4:6], date[6:8]])
        if args.allMemento:
            # default mode: get one entry per month
            if date[:6] == lastdate:  # if the new month is the same as the previous, skip
                e.write(date + ",duplicate month," + line + "\n")
                continue
        if date[:8] in olddates:  # if the date is in the old data -> skip
            continue
        if not args.allMemento:
            # if not all mementos and the month is in the old data -> skip
            # (any() so the skip applies to the memento loop, not the scan loop)
            if any(x[:6] == date[:6] for x in olddates):
                e.write(date + ",duplicate month," + line + "\n")
                continue
        print(date)
        try:
            res = urllib.request.urlopen(line)
        except Exception:
            e.write(day + ",URI-M not loaded," + line + "\n")
            continue
        try:
            html = res.read()
        except IncompleteRead:
            e.write(date + ",partial read," + line + "\n")
            continue
        soup = BeautifulSoup(html, "lxml")  # html.parser -> lxml
        # get rid of scripts (javascript especially)
        for elem in soup.findAll(['script', 'style']):
            elem.extract()
        # Make sure this isn't a redirected memento
        realURL = res.geturl()
        realdateloc = realURL.find("/web/")
        realdate = realURL[realdateloc + 5:realdateloc + 19]
        if date != realdate:
            e.write(day + ",redirect," + line + "\n")
            continue
        if int(date) < 10120700000000:
            e.write(day + ",before 10120700000000," + line + "\n")
            continue
        # try/excepts that find the follower counts for the different
        # versions of Twitter since 2008
        try:
            result = soup.select(".ProfileNav-item--followers")[0]
            try:
                result = result.find("a")['title']
            except Exception:
                result = result.find("a")['data-original-title']
        except Exception:
            try:
                result = soup.select(".js-mini-profile-stat")[-1]['title']
            except Exception:
                try:
                    result = soup.select(".stats li")[-1].find("strong")['title']
                except Exception:
                    try:
                        result = soup.select(".stats li")[-1].find("strong").text
                    except Exception:
                        try:
                            result = soup.select("#follower_count")[0].text
                        except Exception:
                            try:
                                result = soup.select("#followers_count")[0].text
                            except Exception:
                                try:
                                    result = soup.select(".user-stats-followers")[0].text
                                    # result = result[:result.find("Followers")]
                                except Exception:
                                    try:
                                        result = soup.select(".stats_count")[1].text
                                    except Exception:
                                        try:
                                            result = soup.select("follower_stats")
                                            if not result:
                                                raise ValueError('Empty')
                                        except Exception:
                                            e.write(day + ",followers not found," + line + "\n")
                                            continue
        result = re.sub(r'\D', '', str(result))  # remove everything that's not a number
        if result == '':
            e.write(day + ",followers not numbers," + line + "\n")
            continue
        try:
            # Make sure it is a number; also translates other numeral
            # systems where possible.
            result = str(int(result))
            print(result)
            w.write(day + ',' + result + ',' + realURL + '\n')
            lastdate = date[:6]
        except Exception:
            e.write(day + ",followers not arabic numerals," + line + "\n")
            continue
    w.close()
    e.close()

    if args.push or args.pushAll:
        # Send to an archive unless the last memento is within the current month
        now = datetime.datetime.now().strftime("%Y%m")
        if int(date[:6]) < int(now):
            if args.pushAll:
                print("Pushing to Archives")
                archivenow.push("http://twitter.com/" + args.uname, "all")
            else:
                print("Pushing to Internet Archive")
                archivenow.push("http://twitter.com/" + args.uname, "ia")
        else:
            print("Not Pushing to Archive. Last Memento Within Current Month.")

    if args.graph:
        # Call the R script to create a line chart with the numbers collected
        Rcall = "Rscript --vanilla follower_count_linechart.R " + safeuname
        subprocess.call(Rcall, shell=True)
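# Example invocation (script name hypothetical): collect one follower count
# per month for @jack and push a fresh snapshot to the Internet Archive;
# adding -g also renders the R line chart:
#
#     python follower_count.py -g -p jack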
import time

from archivenow import archivenow
from tqdm import tqdm

with open("listpdf.txt", "r", encoding="utf-8-sig") as f:
    listpdf_temp = [i.strip() for i in f.readlines()]

print("Save to web.archive.org")
for i in tqdm(listpdf_temp):
    print(archivenow.push(
        "https://ddc.moph.go.th/viralpneumonia/file/situation/" + i, "ia"))
    time.sleep(20)  # throttle requests to the Internet Archive
import sys

from archivenow import archivenow

seedfile = sys.argv[1]
outputfile = sys.argv[2]

with open(seedfile) as f:
    with open(outputfile, 'w') as g:
        for line in f:
            line = line.strip()
            print("working on URI-R {}".format(line))
            # Push to the Internet Archive, archive.today, and WebCite;
            # push() returns a list, so take the first memento URL.
            urim = archivenow.push(line, "ia")[0]
            g.write('{}\t{}\n'.format(line, urim))
            urim = archivenow.push(line, "is")[0]
            g.write('{}\t{}\n'.format(line, urim))
            urim = archivenow.push(line, "wc")[0]
            g.write('{}\t{}\n'.format(line, urim))
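# The three push/write pairs above could equivalently be collapsed into a
# loop over archivenow's archive identifiers; a minimal sketch of the loop
# body:
#
#     for archive_id in ("ia", "is", "wc"):
#         urim = archivenow.push(line, archive_id)[0]
#         g.write('{}\t{}\n'.format(line, urim))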
# Excerpt from a larger scraping loop: `data`, `collection`, `title`,
# `article`, `url`, `f`, and the i/i2/e counters come from the surrounding
# code, and the two `if` conditions below are elided in the original.
try:
    if ...:  # condition elided in this excerpt
        if ...:  # condition elided in this excerpt
            # "ที่มา" means "Source"
            all_data = title + "\n\n" + article + "\n\nที่มา : " + url
            if collection not in data:
                data[collection] = 1
            with codecs.open(
                    os.path.join(f, collection + "_" + str(data[collection]) + ".txt"),
                    "w", "utf-8") as temp:
                temp.write(all_data)  # the with block closes temp automatically
            data[collection] += 1
            i2 += 1
            e = 0
            try:
                # push a copy to the Internet Archive, throttled
                archivenow.push(url, "ia")
                time.sleep(8)
            except Exception:
                pass
            i += 1
            i_backup = i
        else:
            e += 1
            i += 1
    else:
        e += 1
except Exception as ex:
    e += 1
    print(ex)
# print(e)
# print(i)
import csv
import random
import subprocess
import time
import urllib.parse
from datetime import datetime

import requests
from archivenow import archivenow
from bs4 import BeautifulSoup
from slimit import ast
from slimit.parser import Parser
from slimit.visitors import nodevisitor


def echo_all(message):
    # Some WeChat links carry a unique personal id ('__biz'), which is a
    # privacy risk; keep only the first five query parameters.
    if message.text.startswith('https://mp.weixin.qq.com/') and '__biz=' in message.text:
        url = '&'.join(message.text.split('&', 5)[:5])
    else:
        url = message.text

    # Fetch the page
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-us",
        "Connection": "keep-alive",
        "Accept-Charset": "utf-8;q=0.7,*;q=0.7",
    }
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, "html.parser")

    if url.startswith('https://mp.weixin.qq.com/'):
        # Extract the Official Account name
        Official_Account = soup.find("a", {"id": "js_name"}).text.strip()
        # Extract the creation time (ct: create_time) from the inline JavaScript
        script = soup.find("script", text=lambda text: text and "var ct" in text)
        parser = Parser()
        tree = parser.parse(script.text)
        for node in nodevisitor.visit(tree):
            if isinstance(node, ast.VarDecl) and node.identifier.value == 'ct':
                ct = node.initializer.value
        ct = int(ct.strip('"'))
        create_date = datetime.utcfromtimestamp(ct).strftime('%Y-%m-%d')

        # Extract the article title, author, and description
        def get_meta(soup, meta):
            raw = soup.find("meta", property=meta)
            return raw['content'] if raw else ""

        title = get_meta(soup, "og:title")
        author = get_meta(soup, "og:article:author")
        description = get_meta(soup, "og:description")
    elif "zhihu.com" in message.text:
        Official_Account = "小肚鸡肠的知乎"
        create_date = ""
        title = soup.title.text.strip() + "-" + str(random.randrange(2, 50000000))
        author = "知乎小管家去死"
        description = "知乎删贴还不让别人存档。"
    elif "weibo.com" in message.text:
        # Weibo pages are not supported; ask the user to screenshot instead.
        bot.reply_to(message, "暂不支持微博页面抓取,请截图后保存至 https://t.me/sm_ms_bot")
        return
    else:
        Official_Account = ""
        create_date = ""
        title = soup.title.text.strip()
        author = ""
        description = ""

    # Save the page with the monolith command; monolith must be installed.
    subprocess.call(["monolith", url, '-o', '/srv/web/mono/' + title + '.html'])
    # Reply with the saved URL; percent-encode the Chinese title, otherwise
    # special characters break the link.
    reply_url = 'http://206.189.252.32:8083/' + urllib.parse.quote(title) + '.html'
    bot.reply_to(message, reply_url)

    # Push to archive.org and archive.today
    try:
        reply_ia = archivenow.push(url, 'ia')[0]
        bot.reply_to(message, reply_ia)
        reply_is = archivenow.push(url, 'is')[0]
        bot.reply_to(message, reply_is)
    except Exception:
        bot.reply_to(message, 'oooops, please send the url again.')
        bot.reply_to(message, 'http://206.189.252.32:8085/')
        return  # no memento URLs to log

    reply_ia_link = '<a href="' + reply_ia + '" target="_blank">' + '备份3' + '</a>'
    reply_is_link = '<a href="' + reply_is + '" target="_blank">' + '备份2' + '</a>'
    monolith_link = '<a href="' + reply_url + '" target="_blank">' + '备份1' + '</a>'
    message_link = '<a href="' + url + '" target="_blank">' + 'url' + '</a>'

    with open('/srv/web/archive_web3/data/archive.csv', 'a') as csvfile:
        # Field names (in order): submission time, account, title, publish
        # date, description, original link, 2049bbs, archive.today, archive.org
        fieldnames = ['提交时间', '帐号', '标题', '发布日期', '描述', '原始链接',
                      '2049bbs', 'archive.today', 'archive.org']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerow({
            '提交时间': time.ctime(),
            '帐号': Official_Account,
            '标题': title,
            '发布日期': create_date,
            '描述': description,
            '原始链接': message_link,
            '2049bbs': monolith_link,
            'archive.today': reply_is_link,
            'archive.org': reply_ia_link,
        })
import time
from random import randint

import requests
from requests.adapters import HTTPAdapter
from requests.exceptions import RequestException
from urllib3.util.retry import Retry

from archivenow import archivenow

import hypercane.errors

# module_logger and __useragent__ are defined at module level in hypercane.


def find_or_create_mementos(urirs, session, accept_datetime=None,
                            timegates=[
                                "https://timetravel.mementoweb.org/timegate/",
                                "https://web.archive.org/web/"
                            ]):
    urims = []

    req_headers = {}
    if accept_datetime is not None:
        req_headers['accept-datetime'] = \
            accept_datetime.strftime("%a, %d %b %Y %H:%M:%S GMT")

    retry = Retry(total=10, read=10, connect=10, backoff_factor=0.3,
                  status_forcelist=(500, 502, 504))
    adapter = HTTPAdapter(max_retries=retry)

    for urir in urirs:
        # check for a URI-M first and just take it if it exists
        for urig in timegates:
            module_logger.info("checking if {} exists via {}".format(urir, urig))
            available = False
            urig = urig[:-1] if urig[-1] == '/' else urig

            try:
                urig = "{}/{}".format(urig, urir)

                # no caching for datetime negotiation
                dt_neg_session = requests.Session()
                dt_neg_session.mount('http://', adapter)
                dt_neg_session.mount('https://', adapter)
                dt_neg_session.headers.update({'user-agent': __useragent__})

                r = dt_neg_session.get(urig, headers=req_headers)

                if r.status_code != 200:
                    module_logger.info(
                        "got a status of {} for {} -- could not find a memento for {} via {}".format(
                            r.status_code, r.url, urir, urig))
                    available = False
                else:
                    available = 'memento-datetime' in r.headers
            except RequestException:
                module_logger.exception("Failed to find memento for {}".format(urir))
                available = False

            # a "save" URL means we were redirected to the archive's save
            # page, not an existing memento (guarded on `available` so a
            # failed request does not leave `r` unbound here)
            if available and r.url[0:29] == "https://web.archive.org/save/":
                available = False

            # module_logger.info("a candidate memento for {} was found: {}".format(urir, available))

            if available is True:
                candidate_urim = r.url
                module_logger.info("adding available URI-M {}".format(candidate_urim))
                urims.append(candidate_urim)
            else:
                numsecs = randint(3, 10)
                module_logger.info(
                    "sleeping {} seconds before pushing into web archive...".format(numsecs))
                time.sleep(numsecs)

                module_logger.info("pushing {} into Internet Archive".format(urir))

                create_memento_session = requests.Session()
                create_memento_session.mount('http://', adapter)
                create_memento_session.mount('https://', adapter)
                create_memento_session.headers.update({'user-agent': __useragent__})

                candidate_urim = archivenow.push(urir, "ia", session=create_memento_session)[0]
                module_logger.info(
                    "received candidate URI-M {} from the Internet Archive".format(candidate_urim))

                if candidate_urim[0:5] == "Error" or \
                        candidate_urim[0:29] == "https://web.archive.org/save/":
                    # for now, skip on error
                    # TODO: try with other archives; archive.is is not used
                    # because new mementos there do not immediately have
                    # Memento headers
                    # candidate_urim = archivenow.push(urir, "is")[0]
                    module_logger.warning(
                        "Failed to push {} into the Internet Archive, skipping...".format(urir))
                    hypercane.errors.errorstore.add(
                        urir, "Failed to create URI-M for {}".format(urir))
                else:
                    module_logger.info("adding newly minted URI-M {}".format(candidate_urim))
                    urims.append(candidate_urim)

    return urims
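# Illustrative call (URL and datetime are examples only). Note that the
# `session` argument is accepted but the function builds its own sessions
# for datetime negotiation and pushing:
#
#     import datetime
#     import requests
#
#     urims = find_or_create_mementos(
#         ["https://example.com/"],
#         requests.Session(),
#         accept_datetime=datetime.datetime(2020, 1, 1),
#     )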
def archive_page(page_url):
    # Push to WebCite, the Internet Archive, and archive.today;
    # push() returns a list, so take the first memento URL from each.
    results = [page_url]
    results.append(archivenow.push(page_url, "wc")[0])
    results.append(archivenow.push(page_url, "ia")[0])
    results.append(archivenow.push(page_url, "is")[0])
    return results
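# Illustrative usage; prints the original URL followed by one memento URL
# per archive:
#
#     for link in archive_page("https://example.com/"):
#         print(link)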