def echo_all(message):

    if 'weibo.com' in message.text or 'weibo.cn' in message.text:
        try:
            bot.reply_to(message, "archive doesn't support weibo")
        except Exception as e:
            bot.reply_to(message, 'oooops, please send the url again.')
    else:
        if message.text.startswith(
                'https://mp.weixin.qq.com/') and '__biz=' in message.text:
            url = '&'.join(message.text.split('&', 5)[:5])
        else:
            url = message.text
        try:
            reply = archivenow.push(url, 'is')[0]
            bot.reply_to(message, reply)
        except Exception:
            bot.reply_to(message, 'oooops, please send the url again.')
            return  # reply would be undefined below

        html = requests.get(url)
        soup = BeautifulSoup(html.text, "html.parser")
        if message.text.startswith('https://mp.weixin.qq.com/s'):
            Title = soup.h2.text.strip()
        else:
            Title = soup.title.text.strip()

        with open('archive.csv', 'a') as f1:
            f1.write(time.ctime() + ',' + message.text + ',' + reply + ',')
            f1.write(Title)
            f1.write('\n')
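This echo_all handler (and the variants in the later examples) is written against pyTelegramBotAPI, but the surrounding wiring is omitted. A minimal sketch of that wiring, assuming a placeholder token and the modules the body relies on (none of this boilerplate appears in the original snippet):

# Hypothetical wiring for the handler above; the token and exact imports are assumptions.
import time

import requests
import telebot
from archivenow import archivenow
from bs4 import BeautifulSoup

bot = telebot.TeleBot("YOUR_TELEGRAM_TOKEN")  # placeholder token


@bot.message_handler(func=lambda message: True)  # route every incoming message here
def echo_all(message):
    ...  # body as in the example above


bot.polling()  # start long-polling for Telegram updates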
Example 2
def echo_all(message):

    if 'weibo.com' in message.text or 'weibo.cn' in message.text:
        try:
            bot.reply_to(message, "archive doesn't support weibo")
        except Exception as e:
            bot.reply_to(message, 'oooops, please send the url again.')
    else:
        if message.text.startswith(
                'https://mp.weixin.qq.com/') and '__biz=' in message.text:
            url = '&'.join(message.text.split('&', 5)[:5])
        else:
            url = message.text
        headers = {
            "User-Agent":
            "Mozilla/5.0 (Windows; U; Windows NT 5.1; zh-CN; rv:1.9.1.6) ",
            "Accept":
            "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "en-us",
            "Connection": "keep-alive",
            "Accept-Charset": "GB2312,utf-8;q=0.7,*;q=0.7"
        }
        html = requests.get(url, headers=headers)
        soup = BeautifulSoup(html.text, "html.parser")
        if len(soup) == 0:
            Title = datetime.datetime.now().strftime("%Y-%m-%d-") + str(
                random.randrange(2, 50000000))
        else:
            if message.text.startswith('https://mp.weixin.qq.com/s'):
                Title = soup.h2.text.strip()
            elif "zhihu.com" in message.text:
                Title = soup.title.text.strip() + "-" + str(
                    random.randrange(2, 50000000))
            else:
                Title = soup.title.text.strip()
        Title = Title.replace('\n', '')
        Title = Title.replace('|', ' ')
        Title = Title.replace('<', ' ')
        Title = Title.replace('>', ' ')
        Title = Title.replace(',', ' ')
        Title = Title.replace(' ', '_')

        # Save a self-contained copy with monolith (must already be installed); url and Title are passed to the shell unquoted
        cmd = 'monolith ' + url + ' -o /srv/web/mono/' + Title + '.html'
        os.system(cmd)
        reply_url = 'http://206.189.252.32:8083/' + Title + '.html'
        bot.reply_to(message, reply_url)

        try:
            reply_ia = archivenow.push(url, 'ia')[0]
            bot.reply_to(message, reply_ia)
        except Exception:
            bot.reply_to(message, 'oooops, please send the url again.')
            return  # reply_ia would be undefined below

        with open('archive.csv', 'a') as f1:
            print(reply_ia)
            f1.write(time.ctime() + ',' + message.text + ',' + reply_ia + ',')
            f1.write(Title)
            f1.write('\n')
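The chain of str.replace calls in Example 2 strips newlines and maps |, <, >, commas, and spaces to underscores before the title is used as a filename. An equivalent one-liner with re, as a sketch rather than the original code:

import re

# Equivalent to the replace chain above: drop newlines, then map | < > , and spaces to '_'.
Title = re.sub(r'[|<>, ]', '_', Title.replace('\n', ''))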
Example 3
def archive_url(url):
    # Convert link to old.reddit.com
    if is_reddit_link(url):
        url = "https://old." + url[url.find("reddit") :]
    archived_url = archivenow.push(url, "ia")[0]

    # If archiving fails, return the original link
    if not archived_url.startswith("https://"):
        archived_url = url
    return archived_url
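Example 3 depends on an is_reddit_link helper that is not shown. A minimal sketch of what such a check might look like (the real project may test the hostname differently):

# Hypothetical helper assumed by archive_url above.
def is_reddit_link(url):
    """Return True when the URL appears to point at reddit.com."""
    return "reddit.com" in url


# Example use: prints the URI-M on success, or the original URL when archiving fails.
print(archive_url("https://www.reddit.com/r/Python/"))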
Example 4
def main():
    parser = argparse.ArgumentParser(
        description=
        'Follower Count History. Given a Twitter username, collect follower counts from the Internet Archive.'
    )

    parser.add_argument('-g',
                        dest='graph',
                        action='store_true',
                        help='Generate a graph with data points')
    parser.add_argument('-e',
                        dest='allMemento',
                        action='store_false',
                        help='Collect every memento, not just one per month')
    parser.add_argument('uname', help='Twitter username without @')
    group = parser.add_mutually_exclusive_group()
    group.add_argument('-p',
                       dest='push',
                       action='store_true',
                       help='Push to Internet Archive')
    group.add_argument(
        '-P',
        dest='pushAll',
        action='store_true',
        help='Push to all Archives available through ArchiveNow')

    args = parser.parse_args()

    # Optional dependencies, only needed when pushing to an archive
    if args.push or args.pushAll:
        from archivenow import archivenow
        import datetime

    def slugify(value):
        """
		Convert to ASCII. Convert spaces to hyphens.
		Remove characters that aren't alphanumerics, underscores, or hyphens.
		Convert to lowercase. Also strip leading and trailing whitespace.
		"""
        value = str(value)
        value = unicodedata.normalize('NFKD',
                                      value).encode('ascii',
                                                    'ignore').decode('ascii')
        value = str(re.sub(r'[^\w\s-]', '', value).strip().lower())
        value = str(re.sub(r'[-\s]+', '-', value))
        return value

    archivelink = 'http://web.archive.org/web/timemap/link/http://twitter.com/' + args.uname
    print(args.uname)
    print(archivelink)
    r = requests.get(archivelink)
    linkslist = []
    safeuname = slugify(args.uname)
    try:
        os.makedirs('./' + safeuname)
    except OSError as e:
        if e.errno != errno.EEXIST:
            raise

    writefile = './' + safeuname + '/' + safeuname + ".csv"
    errorfile = './' + safeuname + '/' + safeuname + "-Error.csv"
    w = open(writefile, "a+")
    e = open(errorfile, "a+")
    olddates = []
    if (os.stat(writefile).st_size == 0):
        #ensure the header line does not get rewritten
        w.write('date,count,URL-M' + '\n')
    else:
        w.seek(0)
        #read in old data points
        for line in w.readlines():
            row = line.split(",")
            if (row[0] != "date"):
                olddates.append(row[0].replace("-", ""))
        #reset to the end
    lastline = ''
    for line in r.iter_lines():
        # if rel="memento" is absent, ignore the line; otherwise extract the link and add it to linkslist
        #print(line)
        if ('rel="memento"'.encode('utf-8') in line
                or 'rel="first memento"'.encode('utf-8') in line):
            if (line != lastline):
                lastline = line
                linkslist.append(line[1:line.find('>;'.encode('utf-8'))])
    print(str(len(linkslist)) + " archive points found")

    lastdate = ''
    date = '0'
    #with open("test.txt", "r") as f:
    for line in linkslist:
        line = line.decode('utf-8')
        dateloc = line.find("/web/")
        date = line[dateloc + 5:dateloc + 19]  #get the timestamp from the link

        if (args.allMemento):
            #get one entry per month
            if (date[:6] == lastdate
                ):  #if new month is the same as previous, skip
                e.write(date + ",duplicate month," + line + "\n")
                continue

        if (date[:8] in olddates):  #if date is in old data-> skip
            continue

        if not args.allMemento:  #If not all mementos AND if month is in old data-> skip
            duplicate_month = False
            for x in olddates:
                if x[:6] == date[:6]:
                    e.write(date + ",duplicate month," + line + "\n")
                    duplicate_month = True
                    break
            if duplicate_month:
                continue

        print(date)
        try:
            res = urllib.urlopen(line)
        except:
            e.write(date + ",URI-M not loaded," + line + "\n")
            continue

        try:
            html = res.read()
        except IncompleteRead as err:
            e.write(date + ",partial read," + line + "\n")
            continue

        soup = BeautifulSoup(html, "lxml")  #html.parser -> lxml
        #get rid of scripts(javascript especially)
        for elem in soup.findAll(['script', 'style']):
            elem.extract()

        #Make sure this isn't a redirected Momento
        realURL = res.geturl()
        realdateloc = realURL.find("/web/")
        realdate = realURL[realdateloc + 5:realdateloc + 19]  #get the timestamp from the post-redirect link
        day = '-'.join([date[:4], date[4:6], date[6:8]])
        if (date != realdate):
            e.write(day + ",redirect," + line + "\n")
            continue

        if int(date) < 10120700000000:
            e.write(day + ",before 10120700000000," + line + "\n")
            continue
        else:
            #try excepts that find the follower counts for different versions of Twitter since its 2008
            try:
                result = soup.select(".ProfileNav-item--followers")[0]
                try:
                    result = result.find("a")['title']
                except:
                    result = result.find("a")['data-original-title']
            except:
                try:
                    result = soup.select(".js-mini-profile-stat")[-1]['title']
                except:
                    try:
                        result = soup.select(".stats li")[-1].find(
                            "strong")['title']
                    except:
                        try:
                            result = soup.select(".stats li")[-1].find(
                                "strong").text
                        except:
                            try:
                                result = soup.select("#follower_count")[0].text
                            except:
                                try:
                                    result = soup.select(
                                        "#followers_count")[0].text
                                except:
                                    try:
                                        result = soup.select(
                                            ".user-stats-followers")[0].text
                                        #result = result[:result.find("Followers")]
                                    except:
                                        try:
                                            result = soup.select(
                                                ".stats_count")[1].text
                                        except:
                                            try:
                                                result = soup.select(
                                                    "follower_stats")
                                                if not result:
                                                    raise ValueError('Empty')
                                            except:
                                                e.write(
                                                    day +
                                                    ",followers not found," +
                                                    line + "\n")
                                                continue

        result = re.sub(r'\D', '',
                        str(result))  #remove everything that's not a number
        if (result == ''):
            e.write(day + ",followers not numbers," + line + "\n")
            continue
        try:
            result = str(
                int(result)
            )  #Make sure a number. Also translates other languages if possible.
            print(result)

            w.write(day + ',' + result + ',' + realURL + '\n')
            lastdate = date[:6]
        except:
            e.write(day + ",followers not arabic numerals," + line + "\n")
            continue
    w.close()
    e.close()

    if args.push or args.pushAll:
        #Send to archive
        now = datetime.datetime.now().strftime("%Y%m")

        if (int(date[:6]) < int(now)):
            if args.pushAll:
                print("Pushing to Archives")
                archivenow.push("http://twitter.com/" + args.uname, "all")
            else:
                print("Pushing to Internet Archive")
                archivenow.push("http://twitter.com/" + args.uname, "ia")
        else:
            print("Not Pushing to Archive. Last Memento Within Current Month.")
    if (args.graph):
        #Call the Rscript to create a linechart with the numbers collected
        Rcall = "Rscript --vanilla follower_count_linechart.R " + safeuname
        subprocess.call(Rcall, shell=True)
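A usage note for Example 4: assuming the script is saved as follower_count_history.py (the filename is an assumption, it is not given in the snippet), running python follower_count_history.py -g jack collects one memento per month for @jack, writes ./jack/jack.csv, and draws the R line chart; adding -e collects every memento instead, and -p (or -P) pushes the live Twitter profile to the Internet Archive (or to every archive ArchiveNow supports) when the newest memento is older than the current month.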
Example 5
from archivenow import archivenow
import time
from tqdm import tqdm

with open("listpdf.txt", "r", encoding="utf-8-sig") as f:
    listpdf_temp = [i.strip() for i in f.readlines()]
print("Save to web.archive.org")
for i in tqdm(listpdf_temp):
    print(
        archivenow.push(
            "https://ddc.moph.go.th/viralpneumonia/file/situation/" + i, "ia"))
    time.sleep(20)
Example 6
import sys
import csv

from archivenow import archivenow

seedfile = sys.argv[1]
outputfile = sys.argv[2]

with open(seedfile) as f:

    with open(outputfile, 'w') as g:

        for line in f:
    
            line = line.strip()
            print("working on URI-R {}".format(line))
    
            urim = archivenow.push(line, "ia")[0]
            g.write('{}\t{}\n'.format(line, urim))

            urim = archivenow.push(line, "is")[0]
            g.write('{}\t{}\n'.format(line, urim))

            urim = archivenow.push(line, "wc")[0]
            g.write('{}\t{}\n'.format(line, urim))
Example 7
                all_data = title + "\n\n" + article + "\n\nที่มา : " + url

                if collection not in data:
                    data[collection] = 1
                with codecs.open(
                        os.path.join(
                            f,
                            collection + "_" + str(data[collection]) + ".txt"),
                        "w", "utf-8") as temp:
                    temp.write(all_data)
                temp.close()
                data[collection] += 1
                i2 += 1
                e = 0
                try:
                    archivenow.push(url, "ia")
                    time.sleep(8)
                except:
                    pass
                i += 1
                i_backup = i
            else:
                e += 1
                i += 1
        else:
            e += 1
    except Exception as ex:
        e += 1
        print(ex)
Example 8
def echo_all(message):

    # Some WeChat links contain a unique personal id, which risks leaking private information; keep only the first few query parameters
    if message.text.startswith('https://mp.weixin.qq.com/') and '__biz=' in message.text:
        url = '&'.join(message.text.split('&', 5)[:5])
    else:
        url = message.text

    # Fetch the page
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/77.0.3835.0 Safari/537.36",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "en-us",
        "Connection": "keep-alive",
        "Accept-Charset": "utf-8;q=0.7,*;q=0.7"
    }
    html = requests.get(url, headers=headers)
    soup = BeautifulSoup(html.text, "html.parser")


    if url.startswith('https://mp.weixin.qq.com/'):

        # Extract the Official Account name
        Official_Account = soup.find("a", {"id": "js_name"}).text.strip()

        # Extract the creation time (ct = create_time) from the inline JavaScript
        script = soup.find("script", text=lambda text: text and "var ct" in text)
        parser = Parser()
        tree = parser.parse(script.text)
        for node in nodevisitor.visit(tree):
            if isinstance(node, ast.VarDecl) and node.identifier.value == 'ct':
                ct = node.initializer.value
        ct = int(ct.strip('"'))
        create_date = datetime.utcfromtimestamp(ct).strftime('%Y-%m-%d')

        # Extract the article title, author, and description from the meta tags
        def get_meta(soup, meta):
            raw = soup.find("meta", property=meta)
            meta = raw['content'] if raw else ""
            return meta

        title = get_meta(soup, "og:title")
        author = get_meta(soup, "og:article:author")
        description = get_meta(soup, "og:description")

    elif "zhihu.com" in message.text:
        Official_Account = "小肚鸡肠的知乎"
        create_date = ""
        title = soup.title.text.strip() + "-" + str(random.randrange(2, 50000000))
        author = "知乎小管家去死"
        description = "知乎删贴还不让别人存档。"
     
    elif "weibo.com" in message.text:
        # Weibo pages are not supported for scraping; ask the user to screenshot and save via https://t.me/sm_ms_bot instead
        bot.reply_to(message, "暂不支持微博页面抓取,请截图后保存至 https://t.me/sm_ms_bot")
        return

    else:
        Official_Account = ""
        create_date = ""
        title = soup.title.text.strip()
        author = ""
        description = ""


    # Call the system command monolith to save the page; monolith must already be installed on the system.
    subprocess.call(["monolith", url, '-o', '/srv/web/mono/' + title + '.html'])

    # Reply with the saved URL; the Chinese title must be percent-encoded, otherwise special characters break the link
    reply_url = 'http://206.189.252.32:8083/'  + urllib.parse.quote(title) + '.html'
    bot.reply_to(message, reply_url)

    # Save to archive.org and archive.today
    try:
        reply_ia = archivenow.push(url, 'ia')[0]
        bot.reply_to(message, reply_ia)
        reply_is = archivenow.push(url, 'is')[0]
        bot.reply_to(message, reply_is)
    except Exception:
        bot.reply_to(message, 'oooops, please send the url again.')
        return  # reply_ia / reply_is would be undefined below

    bot.reply_to(message, 'http://206.189.252.32:8085/')

    reply_ia_link = '<a href="' + reply_ia + '" target="_blank">' + '备份3' + '</a>'
    reply_is_link = '<a href="' + reply_is + '" target="_blank">' + '备份2' + '</a>'
    monolith_link = '<a href="' + reply_url + '" target="_blank">' + '备份1' + '</a>'
    message_link = '<a href="' + url + '" target="_blank">' + 'url' + '</a>'

    with open('/srv/web/archive_web3/data/archive.csv', 'a') as csvfile:
        fieldnames = ['提交时间', '帐号', '标题', '发布日期', '描述', '原始链接', '2049bbs','archive.today', 'archive.org']
        writer = csv.DictWriter(csvfile, fieldnames=fieldnames)
        writer.writerow({'提交时间': time.ctime(),
        '帐号': Official_Account,
        '标题': title,
        '发布日期': create_date,
        '描述': description,
        '原始链接': message_link,
        '2049bbs': monolith_link,
        'archive.today': reply_is_link,
        'archive.org': reply_ia_link
        })
Example 9
def find_or_create_mementos(urirs,
                            session,
                            accept_datetime=None,
                            timegates=[
                                "https://timetravel.mementoweb.org/timegate/",
                                "https://web.archive.org/web/"
                            ]):

    urims = []

    req_headers = {}

    if accept_datetime is not None:
        req_headers['accept-datetime'] = \
            accept_datetime.strftime( "%a, %d %b %Y %H:%M:%S GMT" )

    retry = Retry(total=10,
                  read=10,
                  connect=10,
                  backoff_factor=0.3,
                  status_forcelist=(500, 502, 504))
    adapter = HTTPAdapter(max_retries=retry)

    for urir in urirs:
        # check for URI-M first and just take it if it exists

        for urig in timegates:

            module_logger.info("checking if {} exists via {}".format(
                urir, urig))
            available = False

            urig = urig[:-1] if urig[-1] == '/' else urig

            try:

                urig = "{}/{}".format(urig, urir)

                # no caching for datetime negotiation
                dt_neg_session = requests.Session()
                dt_neg_session.mount('http://', adapter)
                dt_neg_session.mount('https://', adapter)
                dt_neg_session.headers.update({'user-agent': __useragent__})

                r = dt_neg_session.get(urig, headers=req_headers)

                if r.status_code != 200:
                    module_logger.info(
                        "got a status of {} for {} -- could not find a memento for {} via {}"
                        .format(r.status_code, r.url, urir, urig))
                    available = False
                else:
                    if 'memento-datetime' in r.headers:
                        available = True
                    else:
                        available = False

            except RequestException:
                module_logger.exception(
                    "Failed to find memento for {}".format(urir))
                available = False

            if available is True:
                # take the memento from the first TimeGate that has one
                break

        if available and r.url[0:29] == "https://web.archive.org/save/":
            # a redirect to the SavePageNow endpoint is not a real memento
            available = False

        # module_logger.info("a candidate memento for {} was found: {}".format(urir, available))

        if available is True:
            candidate_urim = r.url
            module_logger.info(
                "adding available URI-M {}".format(candidate_urim))
            urims.append(candidate_urim)
        else:
            numsecs = randint(3, 10)
            module_logger.info(
                "sleeping {} seconds before pushing into web archive...".
                format(numsecs))
            time.sleep(numsecs)

            module_logger.info("pushing {} into Internet Archive".format(urir))
            create_memento_session = requests.Session()
            create_memento_session.mount('http://', adapter)
            create_memento_session.mount('https://', adapter)
            create_memento_session.headers.update(
                {'user-agent': __useragent__})

            candidate_urim = archivenow.push(urir,
                                             "ia",
                                             session=create_memento_session)[0]

            module_logger.info(
                "received candidate URI-M {} from the Internet Archive".format(
                    candidate_urim))

            if candidate_urim[0:5] == "Error" or candidate_urim[
                    0:29] == "https://web.archive.org/save/":
                # for now, skip if error
                # TODO: try with other archives, we don't use archive.is because new mementos don't immediately have Memento headers
                # candidate_urim = archivenow.push(urir, "is")[0]
                module_logger.warning(
                    "Failed to push {} into the Internet Archive, skipping...".
                    format(urir))
                hypercane.errors.errorstore.add(
                    urir, "Failed to create URI-M for {}".format(urir))
            else:
                module_logger.info(
                    "adding newly minted URI-M {}".format(candidate_urim))
                urims.append(candidate_urim)

    return urims
Example 10
def archive_page(page_url):
    results = [page_url]
    results.append(archivenow.push(page_url, "wc"))
    results.append(archivenow.push(page_url, "ia"))
    results.append(archivenow.push(page_url, "is"))
    return results
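As the other examples suggest, each archivenow.push call returns a list whose first element is either the new URI-M or an error string (Example 9 checks for the "Error" prefix). A minimal usage sketch for Example 10's archive_page, with a placeholder URL:

# Hypothetical caller for archive_page above; the URL is a placeholder.
results = archive_page("https://example.com/")
original_url = results[0]
for pushed in results[1:]:
    # each push() call returned a list; its first element is the URI-M or an error message
    if pushed and not str(pushed[0]).startswith("Error"):
        print("archived:", pushed[0])
    else:
        print("push failed for", original_url, "->", pushed)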