Example no. 1
def import_ia(url,
              *,
              from_date=None,
              to_date=None,
              maintainers=None,
              tags=None,
              skip_unchanged='resolved-response'):
    skip_responses = skip_unchanged == 'response'
    with ia.WaybackClient() as wayback:
        # Pulling on this generator does the work.
        versions = (
            wayback.timestamped_uri_to_version(version.date,
                                               version.raw_url,
                                               url=version.url,
                                               maintainers=maintainers,
                                               tags=tags,
                                               view_url=version.view_url)
            for version in wayback.list_versions(url,
                                                 from_date=from_date,
                                                 to_date=to_date,
                                                 skip_repeats=skip_responses))

        if skip_unchanged == 'resolved-response':
            versions = _filter_unchanged_versions(versions)

        _add_and_monitor(versions)
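
A minimal usage sketch for import_ia; the URL, maintainers, and tags below are placeholders, and `from datetime import datetime` is assumed, matching how dates are constructed elsewhere in these examples.

from datetime import datetime

# Hypothetical invocation: import a year of snapshots for one page,
# skipping versions whose resolved responses are unchanged.
import_ia('https://www.epa.gov/cleanpowerplan',
          from_date=datetime(2017, 1, 1),
          to_date=datetime(2017, 12, 31),
          maintainers=['EPA'],
          tags=['example-import'],
          skip_unchanged='resolved-response')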
Example no. 2
def wayback_exist(url, dates):
    # Returns True if the Wayback Machine has captures of `url` in the
    # requested date range, and False otherwise.
    try:
        with internetarchive.WaybackClient() as client:
            # list_versions calls the CDX API (internetarchive.py in the
            # web-monitoring repo) and returns ALL instances of the page
            # documented in the Archive within the date range.
            dump = client.list_versions(
                url,
                from_date=datetime(dates[0], dates[1], dates[2]),
                to_date=datetime(dates[3], dates[4], dates[5]))

            # Materialize the generator; this fails if the Archive has no
            # data in the requested range.
            try:
                versions = list(dump)
                return True
            except Exception:
                return False

    except Exception:
        return False
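
A quick sketch of calling wayback_exist, assuming the [start year, month, day, end year, month, day] date convention used throughout these examples; the URL is a placeholder.

# Illustrative only: checks whether the Wayback Machine captured the page
# at any point during 2017.
if wayback_exist('https://www.epa.gov/climatechange', [2017, 1, 1, 2017, 12, 31]):
    print('Archived versions exist in the requested range')
else:
    print('No archived versions found')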
Example no. 3
def __init__(self, records, results_queue, maintainers, tags, cancel,
             failure_queue=None, session_options=None,
             unplaybackable=None):
    super().__init__()
    self.summary = self.create_summary()
    self.results_queue = results_queue
    self.failure_queue = failure_queue
    self.cancel = cancel
    self.records = records
    self.maintainers = maintainers
    self.tags = tags
    self.unplaybackable = unplaybackable
    session_options = session_options or dict(retries=3, backoff=2,
                                              timeout=(30.5, 2))
    session = ia.WaybackSession(**session_options)
    self.wayback = ia.WaybackClient(session=session)
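
When session_options is omitted, the constructor above falls back to retries=3, backoff=2, timeout=(30.5, 2). A hedged sketch of supplying custom options, using the same ia.WaybackSession/ia.WaybackClient pair the worker builds internally:

# Assumption: `ia` is the same wayback/internetarchive client module imported
# by the surrounding code.
options = dict(retries=5, backoff=2, timeout=(30.5, 5))
session = ia.WaybackSession(**options)
client = ia.WaybackClient(session=session)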
Example no. 4
def _list_ia_versions_for_urls(url_patterns,
                               from_date,
                               to_date,
                               skip_repeats=True,
                               version_filter=None,
                               client=None,
                               stop=None):
    version_filter = version_filter or _is_page
    skipped = 0

    with client or ia.WaybackClient() as client:
        for url in url_patterns:
            if stop and stop.is_set():
                break

            ia_versions = client.list_versions(url,
                                               from_date=from_date,
                                               to_date=to_date,
                                               skip_repeats=skip_repeats)
            try:
                for version in ia_versions:
                    if stop and stop.is_set():
                        break
                    if version_filter(version):
                        yield version
                    else:
                        skipped += 1
                        logger.debug('Skipping URL "%s"', version.url)
            except ia.BlockedByRobotsError as error:
                logger.warning(f'CDX search error: {error!r}')
            except ValueError as error:
                # NOTE: this isn't really an exceptional case; list_versions()
                # raises ValueError when Wayback has no matching records.
                # TODO: there should probably be no exception in this case.
                if 'does not have archived versions' not in str(error):
                    logger.warning(repr(error))
            except ia.WaybackException as error:
                logger.error(f'Error getting CDX data for {url}: {error!r}')
            except Exception:
                # Need to handle the exception here to let iteration continue
                # and allow other threads that might be running to be joined.
                logger.exception(f'Error processing versions of {url}')

    if skipped > 0:
        logger.info('Skipped %s URLs that did not match filters', skipped)
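
A sketch of consuming the generator above; the URL pattern and dates are placeholders, and the default version_filter (_is_page) and the module-level logger are assumed to be defined alongside the function.

from datetime import datetime

# Illustrative only: iterate CDX records for one URL pattern over one month.
for version in _list_ia_versions_for_urls(['https://www.epa.gov/*'],
                                          datetime(2017, 1, 1),
                                          datetime(2017, 2, 1)):
    print(version.date, version.raw_url)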
Example no. 5
def import_ia_urls(urls,
                   *,
                   from_date=None,
                   to_date=None,
                   maintainers=None,
                   tags=None,
                   skip_unchanged='resolved-response',
                   version_filter=None,
                   worker_count=0,
                   create_pages=True,
                   unplaybackable_path=None,
                   dry_run=False):
    skip_responses = skip_unchanged == 'response'
    worker_count = worker_count if worker_count > 0 else PARALLEL_REQUESTS
    unplaybackable = load_unplaybackable_mementos(unplaybackable_path)

    with utils.QuitSignal((signal.SIGINT, signal.SIGTERM)) as stop_event:
        cdx_records = utils.FiniteQueue()
        cdx_thread = threading.Thread(target=lambda: utils.iterate_into_queue(
            cdx_records,
            _list_ia_versions_for_urls(
                urls,
                from_date,
                to_date,
                skip_responses,
                version_filter,
                # Use a custom session to make sure CDX calls are extra robust.
                client=ia.WaybackClient(
                    ia.WaybackSession(retries=10, backoff=4)),
                stop=stop_event)))
        cdx_thread.start()

        summary = {}
        versions_queue = utils.FiniteQueue()
        memento_thread = threading.Thread(
            target=lambda: WaybackRecordsWorker.parallel_with_retries(
                worker_count,
                summary,
                cdx_records,
                versions_queue,
                maintainers,
                tags,
                stop_event,
                unplaybackable,
                tries=(None, dict(retries=3, backoff=4, timeout=(30.5, 2)),
                       dict(retries=7, backoff=4, timeout=60.5))))
        memento_thread.start()

        uploadable_versions = versions_queue
        if skip_unchanged == 'resolved-response':
            uploadable_versions = _filter_unchanged_versions(versions_queue)
        if dry_run:
            uploader = threading.Thread(
                target=lambda: _log_adds(uploadable_versions))
        else:
            uploader = threading.Thread(target=lambda: _add_and_monitor(
                uploadable_versions, create_pages, stop_event))
        uploader.start()

        cdx_thread.join()
        memento_thread.join()

        print(
            '\nLoaded {total} CDX records:\n'
            '  {success:6} successes ({success_pct:.2f}%),\n'
            '  {playback:6} could not be played back ({playback_pct:.2f}%),\n'
            '  {missing:6} had no actual memento ({missing_pct:.2f}%),\n'
            '  {unknown:6} unknown errors ({unknown_pct:.2f}%).'.format(
                **summary))

        uploader.join()

        if not dry_run:
            print('Saving list of non-playbackable URLs...')
            save_unplaybackable_mementos(unplaybackable_path, unplaybackable)
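
A hedged sketch of a dry run with import_ia_urls, which logs what would be imported instead of uploading; all argument values here are placeholders.

from datetime import datetime

import_ia_urls(['https://www.epa.gov/climatechange'],
               from_date=datetime(2017, 1, 1),
               to_date=datetime(2017, 6, 1),
               maintainers=['EPA'],
               tags=['dry-run-demo'],
               worker_count=2,
               dry_run=True)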
Example no. 6
def counter(file, terms, dates):
    # counts a set of one- or two-word terms during a single timeframe
    # dates should be in the form: [starting year, starting month, starting day, ending year, ending month, ending day]
    # terms should be a single term ["term"], a phrase [["climate", "change"]], or a mix of terms and phrases: ["climate", ["climate", "change"]]

    # read the URLs
    with open(file) as csvfile:
        read = csv.reader(csvfile)
        data = list(read)

    #Start the matrix that we'll put term counts into
    row_count = len(data)
    column_count = len(terms)
    matrix = numpy.zeros((row_count, column_count), dtype=numpy.int16)
    print(row_count, column_count)

    for pos, row in enumerate(data):
        thisPage = row[0]
        try:
            with internetarchive.WaybackClient() as client:
                dump = client.list_versions(
                    thisPage,
                    from_date=datetime(dates[0], dates[1], dates[2]),
                    to_date=datetime(dates[3], dates[4], dates[5])
                )  # list_versions calls the CDX API from internetarchive.py from the webmonitoring repo
                versions = reversed(list(dump))
                for version in versions:  # for each version in all the snapshots
                    if version.status_code == '200' or version.status_code == '-':  # if the IA snapshot was viable
                        url = version.raw_url
                        # fetch and decode the snapshot's HTML
                        contents = requests.get(url).content.decode()
                        contents = BeautifulSoup(contents, 'lxml')
                        body = contents.find('body')
                        # remove portions of the webpage we don't want to count
                        d = [s.extract() for s in body('footer')]
                        d = [s.extract() for s in body('header')]
                        d = [s.extract() for s in body('nav')]
                        d = [s.extract() for s in body('script')]
                        d = [s.extract() for s in body('style')]
                        del d
                        body = [text for text in body.stripped_strings]
                        # Count terms:
                        for p, t in enumerate(terms):
                            if type(t) is list:
                                page_sum = two_count(t, body)
                            else:
                                page_sum = count(t, body)
                            # put the count of the term in the right spot in the matrix
                            matrix[pos][p] = page_sum
                        keywords[url] = keyword_function(body)
                        final_urls[thisPage] = [url, row[3]]
                        print(pos)
                        break
                    else:
                        pass
        except Exception:
            print("fail")
            final_urls[thisPage] = ["", thisPage]
            matrix[pos] = 999

    unique, counts = numpy.unique(matrix, return_counts=True)
    results = dict(zip(unique, counts))
    print(results)

    #for writing the term count to a CSV. You will then need to convert delimited text to columns and replace the first column with the list of URLs
    with open('outputs/counts.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile,
                            delimiter=' ',
                            quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        for row in matrix:
            writer.writerow(row)

    #print out urls in separate file
    with open('outputs/urls.csv', 'w') as output:
        writer = csv.writer(output)
        for key, value in final_urls.items():
            writer.writerow([key, value[0], value[1]])

    #print out top three keywords in separate file
    with open("outputs/keywords.csv", "w", encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        for key, value in keywords.items():
            try:
                writer.writerow([key, value[0], value[1], value[2]])
            except IndexError:
                writer.writerow([key, "ERROR"])

    print("The program is finished!")
Example no. 7
def linker(file, domain, datesA, datesB=None):
    # Currently only accepts looking at how a set of URLs point to each other (a square matrix).
    # Meant to look at links within a single domain, e.g. "http://www.epa.gov"; file should be a
    # CSV of paths like "/cleanpowerplan/page".
    # datesA should be in the form: [starting year, starting month, starting day, ending year, ending month, ending day]
    # datesB is optional (for comparing two time periods) and uses the same format as datesA.

    dates = {'first': datesA, 'second': datesB or []}

    finalURLs = {}

    # build outgoing link matrix
    with open(file) as csvfile:
        read = csv.reader(csvfile)
        data = list(read)  # put the csv data in an array
        row_count = len(data)
        matrix = numpy.zeros((row_count, row_count), dtype=numpy.int8)  # create matrix
        urls = []
        for row in data:
            finalURLs[domain + row[0]] = []
            urls.append(row[0])  # compile list of all urls to check against later

    times = 1
    if len(dates['second']) > 0:
        times = 2
    position = 1

    #Loop through data, call CDX API, populate matrix
    while position <= times:
        if position == 1:
            # numeric codes used to ID link status in timeframe A, B, and combined (A+B)
            theseDates = dates['first']
            connection = 1
            decoding_error = 8
            WM_error = 9
        else:
            theseDates = dates['second']
            connection = 3
            decoding_error = 14
            WM_error = 16
        for pos, row in enumerate(data):
            # for urls_shortened.csv use: 'http://www.epa.gov' + row[0]
            thisPage = domain + row[0]
            try:
                with internetarchive.WaybackClient() as client:
                    dump = client.list_versions(
                        thisPage,
                        from_date=datetime(theseDates[0], theseDates[1],
                                           theseDates[2]),
                        to_date=datetime(theseDates[3], theseDates[4],
                                         theseDates[5])
                    )  # list_versions calls the CDX API from internetarchive.py from the webmonitoring repo
                    versions = reversed(list(dump))  # start from the most recent snapshots
                    for version in versions:  # for each version in all the snapshots
                        if version.status_code == '200':  # only use 200 snapshots; redirects and errors fall through to the next version
                            try:
                                # fetch and decode the snapshot's HTML
                                contents = requests.get(version.raw_url).content.decode()
                                contents = BeautifulSoup(contents, 'lxml')
                                # remove portions of the webpage we don't want to count
                                d = [s.extract() for s in contents('script')]
                                d = [s.extract() for s in contents('style')]
                                del d
                                contents = contents.find("body")
                                links = contents.find_all('a')  # find all outgoing links
                                # for each outgoing link, keep just the href
                                thisPageLinksTo = [link['href'] for link in links]
                                # check the known urls against the links: if this page links to
                                # another url in the set, record the connection code
                                for i, url in enumerate(urls):
                                    if url in thisPageLinksTo:
                                        matrix[pos][i] = connection  # matrix[row][column]
                                finalURLs[thisPage].append(version.raw_url)
                                print(pos)
                                break
                            except Exception:
                                finalURLs[thisPage].append("decoding error")
                                matrix[pos] = decoding_error  # code for indicating decoding error
                                break
                        else:
                            pass
            except Exception:
                finalURLs[thisPage].append("WM error")
                matrix[pos] = WM_error  # code for indicating IA/WM error
        if position == 1:
            matrixA = matrix
            matrix = numpy.zeros((row_count, row_count),
                                 dtype=numpy.int8)  #reset matrix
        else:
            matrixB = matrix
        position = position + 1

    if len(dates['second']) > 0:
        final_matrix = numpy.add(matrixA, matrixB)
    else:
        final_matrix = matrixA

    with open('outputs/urls.csv', 'w') as output:
        writer = csv.writer(output)
        for key, value in finalURLs.items():
            try:
                writer.writerow([key, value[0], value[1]])
            except IndexError:
                writer.writerow([key, "ERROR"])
    with open('outputs/links.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile,
                            delimiter=' ',
                            quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        for row in final_matrix:
            writer.writerow(row)
    with open('outputs/linksA.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile,
                            delimiter=' ',
                            quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        for row in matrixA:
            writer.writerow(row)
    # only write the second-timeframe matrix if a second date range was given
    if len(dates['second']) > 0:
        with open('outputs/linksB.csv', 'w', newline='') as csvfile:
            writer = csv.writer(csvfile,
                                delimiter=' ',
                                quotechar='|',
                                quoting=csv.QUOTE_MINIMAL)
            for row in matrixB:
                writer.writerow(row)
    print("The program is finished!")
Example no. 8
def counter(file, terms, dates):
    #terms = ['adaptation', ['Agency', 'Mission'], ['air', 'quality'], 'anthropogenic', 'benefits', 'Brownfield', ['clean', 'energy'], 'Climate', ['climate', 'change'], 'Compliance', 'Cost-effective', 'Costs', 'Deregulatory', 'deregulation', 'droughts', ['economic', 'certainty'], ['economic', 'impacts'], 'economic', 'Efficiency', 'Emissions', ['endangered', 'species'], ['energy', 'independence'], 'Enforcement', ['environmental', 'justice'], ['federal', 'customer'], ['fossil', 'fuels'], 'Fracking', ['global', 'warming'], 'glyphosate', ['greenhouse', 'gases'], ['horizontal', 'drilling'], ['hydraulic', 'fracturing'], 'Impacts', 'Innovation', 'Jobs', 'Mercury', 'Methane', 'pesticides', 'pollution', 'Precautionary', ['regulatory', 'certainty'], 'regulation', 'Resilience', 'Risk', 'Safe', 'Safety', ['sensible', 'regulations'], 'state', 'storms', 'sustainability', 'Toxic', 'transparency', ['Unconventional', 'gas'], ['unconventional', 'oil'], ['Water', 'quality'], 'wildfires']
    #file = 'all Versionista URLs 10-16-18.csv'

    with open(file) as csvfile:
        read = csv.reader(csvfile)
        data = list(read)

    row_count = len(data)
    column_count = len(terms)
    matrix = numpy.zeros((row_count, column_count), dtype=numpy.int16)
    print(row_count, column_count)

    for pos, row in enumerate(data):
        thisPage = row[0]  #change for specific CSVs
        try:
            with internetarchive.WaybackClient() as client:
                dump = client.list_versions(
                    thisPage,
                    from_date=datetime(dates[0], dates[1], dates[2]),
                    to_date=datetime(dates[3], dates[4], dates[5])
                )  # list_versions calls the CDX API from internetarchive.py from the webmonitoring repo
                versions = reversed(list(dump))
                for version in versions:  # for each version in all the snapshots
                    if version.status_code == '200' or version.status_code == '-':  # if the IA snapshot was viable
                        url = version.raw_url
                        # fetch and decode the snapshot's HTML
                        contents = requests.get(url).content.decode()
                        contents = BeautifulSoup(contents, 'lxml')
                        body = contents.find('body')
                        d = [s.extract() for s in body('footer')]
                        d = [s.extract() for s in body('header')]
                        d = [s.extract() for s in body('nav')]
                        d = [s.extract() for s in body('script')]
                        d = [s.extract() for s in body('style')]
                        d = [s.extract() for s in body.select('div > #menuh')]  # FWS
                        d = [s.extract() for s in body.select('div > #siteFooter')]  # FWS
                        d = [s.extract() for s in body.select('div.primary-nav')]  # DOE
                        d = [s.extract() for s in body.select('div > #nav-homepage-header')]  # OSHA
                        d = [s.extract() for s in body.select('div > #footer-two')]  # OSHA
                        del d
                        body = [text for text in body.stripped_strings]
                        for p, t in enumerate(terms):
                            if type(t) is list:
                                page_sum = two_count(t, body)
                            else:
                                page_sum = count(t, body)
                            # put the count of the term in the matrix
                            matrix[pos][p] = page_sum
                        keywords[url] = keyword_function(body)
                        final_urls[thisPage] = [url, row[3]]
                        print(pos)
                        break
                    else:
                        pass
        except Exception:
            print("fail")
            final_urls[thisPage] = ["", row[3]]
            matrix[pos] = 999

    unique, counts = numpy.unique(matrix, return_counts=True)
    results = dict(zip(unique, counts))
    print(results)

    #for writing term counts to a csv. you will need to convert delimited text to columns and replace the first column with the list of URLs
    with open('outputs/counts.csv', 'w', newline='') as csvfile:
        writer = csv.writer(csvfile,
                            delimiter=' ',
                            quotechar='|',
                            quoting=csv.QUOTE_MINIMAL)
        for row in matrix:
            writer.writerow(row)

    #print out urls in separate file
    with open('outputs/urls.csv', 'w') as output:
        writer = csv.writer(output)
        for key, value in final_urls.items():
            writer.writerow([key, value[0], value[1]])

    #print out keywords in separate file
    with open("outputs/keywords.csv", "w", encoding='utf-8') as outfile:
        writer = csv.writer(outfile)
        for key, value in keywords.items():
            try:
                writer.writerow([key, value[0], value[1], value[2]])
            except IndexError:
                writer.writerow([key, "ERROR"])

    print("The program is finished!")
Example no. 9
        thisPage = elm[0]  # grab the url

        # save the url to wayback now
        if now_indic == 1:
            try:
                r = requests.get('https://web.archive.org/save/' + thisPage)
            except Exception:
                continue

        # append a blank separator row to the counts file
        with open(counts_file_name, 'a', newline='') as output:
            writer = csv.writer(output)
            writer.writerow("")

        try:
            with internetarchive.WaybackClient() as client:
                # dump returns ALL instances within the date-range that page has been documented in the Archive
                dump = client.list_versions(
                    thisPage,
                    from_date=datetime(dates[0], dates[1], dates[2]),
                    to_date=datetime(dates[3], dates[4], dates[5])
                )  # list_versions calls the CDX API from internetarchive.py from the webmonitoring repo
                #print("\n"+thisPage)
                results.append("\n" + thisPage + "\n")
                sys.stdout.write("\n" + thisPage + "\n")
                #sys.stdout.flush()

                # indicator variable: tells whether the Archive has pages in the requested date range
                achive_indicator = 0

                try:  # get the versions if Archive contains data in the requested range