Esempio n. 1
0
    def run(self):
        """Run metadata extraction over this document and report the outcome.

        Rejected documents are flagged and persisted; accepted ones are
        POSTed to W3ACT first. The final state is always written to the
        status file, and optionally forwarded to Monitrix.
        """
        # Load the Target definitions and attempt metadata extraction:
        target_defs = json.load(self.input()['targets'].open('r'))
        doc = DocumentMDEx(target_defs, self.doc.get_wrapped().copy(), self.source).mdex()

        if doc is None:
            # Extraction rejected this document - keep a flagged copy instead:
            logger.critical("The document %s has been REJECTED!" % self.doc['document_url'])
            doc = self.doc.get_wrapped().copy()
            doc['status'] = 'REJECTED'
        else:
            # Accepted - inform W3ACT that it is available:
            doc['status'] = 'ACCEPTED'
            logger.debug("Sending doc: %s" % doc)
            client = w3act(act().url, act().username, act().password)
            response = client.post_document(doc)
            if response.status_code == 200:
                logger.info("Document POSTed to W3ACT: %s" % doc['document_url'])
            else:
                failure = "Failed with %s %s\n%s" % (response.status_code, response.reason, response.text)
                logger.error(failure)
                raise Exception(failure)

        # Record the final document state in the status file:
        with self.output().open('w') as out_file:
            out_file.write('{}'.format(json.dumps(doc, indent=4)))

        # Optionally forward the record to Monitrix:
        if systems().elasticsearch_host:
            yield RecordDocumentInMonitrix(self.job, self.launch_id, doc, self.source)
Esempio n. 2
0
    def run(self):
        """Extract Target metadata for this document and push the result to W3ACT.

        Writes the document (status ACCEPTED or REJECTED) to the status
        file, and yields a Monitrix task when Elasticsearch is configured.
        """
        targets = json.load(self.input()['targets'].open('r'))
        doc = DocumentMDEx(targets,
                           self.doc.get_wrapped().copy(), self.source).mdex()

        if doc is not None:
            # Accepted: tell W3ACT the document is available.
            doc['status'] = 'ACCEPTED'
            logger.debug("Sending doc: %s" % doc)
            act_client = w3act(act().url, act().username, act().password)
            resp = act_client.post_document(doc)
            if resp.status_code != 200:
                failure = "Failed with %s %s\n%s" % (resp.status_code,
                                                     resp.reason, resp.text)
                logger.error(failure)
                raise Exception(failure)
            logger.info("Document POSTed to W3ACT: %s" %
                        doc['document_url'])
        else:
            # Rejected: persist a flagged copy of the original instead.
            logger.critical("The document %s has been REJECTED!" %
                            self.doc['document_url'])
            doc = self.doc.get_wrapped().copy()
            doc['status'] = 'REJECTED'

        # Persist the final state of the document:
        with self.output().open('w') as out_file:
            out_file.write('{}'.format(json.dumps(doc, indent=4)))

        # Optionally index the outcome in Monitrix:
        if systems().elasticsearch_host:
            yield RecordDocumentInMonitrix(self.job, self.launch_id, doc,
                                           self.source)
 def run(self):
     """Fetch the LD Target export for this frequency and save it as JSON."""
     # Authenticated W3ACT client:
     act_client = w3act(act().url, act().username, act().password)
     # Pull the Target export for our crawl frequency:
     ld_targets = act_client.get_ld_export(self.frequency)
     # Serialise to the output target as pretty-printed JSON:
     with self.output().open('w') as f:
         f.write(json.dumps(ld_targets, indent=4))
Esempio n. 4
0
 def run(self):
     """Export the LD targets for this crawl frequency into the output file."""
     # Connect to W3ACT and request the export:
     client = w3act(act().url, act().username, act().password)
     export = client.get_ld_export(self.frequency)
     # Dump the result to disk:
     with self.output().open('w') as out:
         out.write(json.dumps(export, indent=4))
Esempio n. 5
0
def main():
	"""Command-line client for the W3ACT API.

	Parses an 'action' argument plus positional sub-arguments and
	dispatches the matching w3act client call, printing the HTTP
	status code and body of each response.

	NOTE(review): this is Python 2 syntax (print statements) — confirm
	the intended runtime before reuse.
	"""
	parser = argparse.ArgumentParser('Interrogate the W3ACT API.')
	# NOTE(review): this option looks garbled — the default is a redacted
	# URL-with-credentials while the help text says "user email", and no
	# '-u/--w3act-user' option is defined even though 'args.w3act_user'
	# is read below. Confirm against the original source.
	parser.add_argument('-w', '--w3act-url', dest='w3act_url', 
					type=str, default="http://*****:*****@bl.uk",
					help="W3ACT user email to login with [default: %(default)s]" )
	parser.add_argument('-p', '--w3act-pw', dest='w3act_pw', 
					type=str, default="sysAdmin", 
					help="W3ACT user password [default: %(default)s]" )
	parser.add_argument('action', metavar='action', help="The action to perform (one of 'add-target', 'list-targets', 'get-target').")
	
	# Anything beyond the known options is treated as per-action arguments:
	args, subargs = parser.parse_known_args()
	
	# Connect
	act = w3act(args.w3act_url,args.w3act_user,args.w3act_pw)
	
	if args.action == "list-targets":
		# NOTE(review): this assignment shadows any module-level 'json' import.
		json = act.get_json("api/targets")
		print json
	elif args.action == 'add-target':
		r = act.post_target(subargs[0], subargs[1])
		print r.status_code
		print r.text		
	elif args.action == 'update-target-schedule':
		r = act.update_target_schedule(int(subargs[0]), subargs[1], subargs[2])
		print r.status_code
		print r.text
	elif args.action == 'set-selector':
		r = act.update_target_selector(int(subargs[0]))
		print r.status_code
		print r.text
	elif args.action == 'watch-target':
		r = act.watch_target(int(subargs[0]))
		print r.status_code
		print r.text
	elif args.action == 'unwatch-target':
		r = act.unwatch_target(int(subargs[0]))
		print r.status_code
		print r.text
	elif args.action == 'add-document':
		# Build a document record from the positional sub-arguments:
		# target id, wayback timestamp, document URL, landing-page URL.
		doc = {}
		wtid = subargs[0]
		doc['target_id'] = int(wtid)
		doc['wayback_timestamp'] = subargs[1]
		doc['document_url'] = subargs[2]
		doc['landing_page_url'] = subargs[3]
		# Derive the filename from the document URL's path component:
		doc['filename'] = os.path.basename( urlparse(doc['document_url']).path )
		doc['size'] = ""
		logger.debug("Sending doc: %s" % doc)
		r = act.post_document(doc)
		print r.status_code
		print r.text
Esempio n. 6
0
def uri_of_doc(self, **kwargs):
    """Push a crawled document's details to W3ACT.

    kwargs describe the document to post (passed straight through to
    send_document_to_w3act). On any failure a Celery retry is scheduled
    10 seconds later, carrying the original exception.
    """
    try:
        logger.info("Got doc to send to W3ACT for: %s" % kwargs)

        # Set up connection to W3ACT:
        w = w3act(cfg.get('act','url'),cfg.get('act','username'),cfg.get('act','password'))
        # And post this document up:
        send_document_to_w3act(kwargs,cfg.get('wayback','endpoint'),w)

    except BaseException as e:
        logger.exception(e)
        # BUG FIX: Celery's Task.retry takes 'exc', not 'exe' — the typo was
        # silently absorbed by **options, so the original exception was never
        # attached to the retry.
        raise self.retry(countdown=10, exc=e)
Esempio n. 7
0
def uri_of_doc(self, **kwargs):
    """Push a crawled document's details to W3ACT.

    kwargs describe the document to post (passed straight through to
    send_document_to_w3act). On any failure a Celery retry is scheduled
    10 seconds later, carrying the original exception.
    """
    try:
        logger.info("Got doc to send to W3ACT for: %s" % kwargs)

        # Set up connection to W3ACT:
        w = w3act(cfg.get('act', 'url'), cfg.get('act', 'username'),
                  cfg.get('act', 'password'))
        # And post this document up:
        send_document_to_w3act(kwargs, cfg.get('wayback', 'endpoint'), w)

    except BaseException as e:
        logger.exception(e)
        # BUG FIX: Celery's Task.retry takes 'exc', not 'exe' — the typo was
        # silently absorbed by **options, so the original exception was never
        # attached to the retry.
        raise self.retry(countdown=10, exc=e)
Esempio n. 8
0
def stop_start_job(self, frequency, start=None, restart=True):
    """
    Restarts the job for a particular frequency.

    :param frequency: the crawl frequency (also the H3 job name) to act on.
    :param start: reference time for the operation; defaults to the current
                  UTC time at call time.
    :param restart: when True, relaunch the job after stopping it.
    :returns: a human-readable status string.

    On any failure the task is retried after 10 seconds.
    """
    try:
        # BUG FIX: the previous default 'start=datetime.utcnow()' was
        # evaluated once at import time, so every call shared the same stale
        # timestamp. Resolve "now" per call instead:
        if start is None:
            start = datetime.utcnow()
        logger.info("Stopping/starting %s at %s" % (frequency, start))

        # Set up connection to W3ACT:
        w = w3act(cfg.get('act','url'),cfg.get('act','username'),cfg.get('act','password'))
        # Set up connection to H3:
        h = hapyx.HapyX("https://%s:%s" % (cfg.get('h3','host'), cfg.get('h3','port')), username=cfg.get('h3','username'), password=cfg.get('h3','password'))

        # Stop job if currently running:
        if frequency in h.list_jobs() and h.status(frequency) != "":
            # Stops a running job, notifies RabbitMQ and cleans up the directory.
            launch_id = h.get_launch_id(frequency)
            job = W3actJob.from_directory(w, "%s/%s" % (HERITRIX_JOBS, frequency), heritrix=h)
            job.stop()
            remove_action_files(frequency)
            crawl.status.update_job_status.delay(job.name, "%s/%s" % (job.name, launch_id), "STOPPED")

            # Pass on to the next step in the chain:
            logger.info("Requesting assembly of output for: %s/%s" % (frequency, launch_id))
            assemble_job_output.delay(frequency,launch_id)
        else:
            job = None

        # Start job if requested:
        if restart:
            targets = w.get_ld_export(frequency)
            logger.debug("Found %s Targets in date range." % len(targets))
            job = W3actJob(w, targets, frequency, heritrix=h)
            logger.info("Starting job %s..." % job.name)
            job.start()
            launch_id = h.get_launch_id(frequency)
            crawl.status.update_job_status.delay(job.name, "%s/%s" % (job.name, launch_id), "LAUNCHED" )
            logger.info("Launched job %s/%s with %s seeds." % (job.name, launch_id, len(job.seeds)))
            return "Launched job %s/%s with %s seeds." % (job.name, launch_id, len(job.seeds))
        else:
            if job:
                logger.info("Stopped job %s/%s without restarting..." % (job.name, launch_id))
                return "Stopped job %s/%s without restarting..." % (job.name, launch_id)
            else:
                logger.warning("No running '%s' job to stop!" % frequency)
                return "No running '%s' job to stop!" % frequency
    except BaseException as e:
        logger.exception(e)
        # BUG FIX: Celery's Task.retry takes 'exc', not 'exe'.
        raise self.retry(countdown=10, exc=e)
def surts_from_w3act(allSurts):
    """Fetch Open Access seeds from W3ACT and add their SURT forms to allSurts.

    Also writes every raw seed URL to w3actURLsFile as a record of what
    was exported.
    """
    total = 0
    # Pull the full Open Access export from W3ACT:
    client = w3act(args.act_url, args.act_username, args.act_password)
    export = client.get_oa_export("all")

    # Record the raw seed URLs while converting each one to a SURT:
    with open(w3actURLsFile, 'w') as record:
        for entry in export:
            for seed in entry["seeds"]:
                record.write("%s\n" % seed)

                surt = generate_surt(seed)
                allSurts.add(surt)
                total += 1
                logger.debug("ACT seed [%s] surt [%s]" % (seed, surt))

    logger.info("%s surts from ACT generated" % total)
Esempio n. 10
0
def stop_start_job(self, frequency, start=None, restart=True):
    """
    Restarts the job for a particular frequency.

    :param frequency: the crawl frequency (also the H3 job name) to act on.
    :param start: reference time for the operation; defaults to the current
                  UTC time at call time.
    :param restart: when True, relaunch the job after stopping it.
    :returns: a human-readable status string.

    On any failure the task is retried after 10 seconds.
    """
    try:
        # BUG FIX: the previous default 'start=datetime.utcnow()' was
        # evaluated once at import time, so every call shared the same stale
        # timestamp. Resolve "now" per call instead:
        if start is None:
            start = datetime.utcnow()
        logger.info("Stopping/starting %s at %s" % (frequency, start))

        # Set up connection to W3ACT:
        w = w3act(cfg.get('act', 'url'), cfg.get('act', 'username'),
                  cfg.get('act', 'password'))
        # Set up connection to H3:
        h = hapyx.HapyX("https://%s:%s" %
                        (cfg.get('h3', 'host'), cfg.get('h3', 'port')),
                        username=cfg.get('h3', 'username'),
                        password=cfg.get('h3', 'password'))

        # Stop job if currently running:
        if frequency in h.list_jobs() and h.status(frequency) != "":
            # Stops a running job, notifies RabbitMQ and cleans up the directory.
            launch_id = h.get_launch_id(frequency)
            job = W3actJob.from_directory(w,
                                          "%s/%s" % (HERITRIX_JOBS, frequency),
                                          heritrix=h)
            job.stop()
            remove_action_files(frequency)
            crawl.status.update_job_status.delay(
                job.name, "%s/%s" % (job.name, launch_id), "STOPPED")

            # Pass on to the next step in the chain:
            logger.info("Requesting assembly of output for: %s/%s" %
                        (frequency, launch_id))
            assemble_job_output.delay(frequency, launch_id)
        else:
            job = None

        # Start job if requested:
        if restart:
            targets = w.get_ld_export(frequency)
            logger.debug("Found %s Targets in date range." % len(targets))
            job = W3actJob(w, targets, frequency, heritrix=h)
            logger.info("Starting job %s..." % job.name)
            job.start()
            launch_id = h.get_launch_id(frequency)
            crawl.status.update_job_status.delay(
                job.name, "%s/%s" % (job.name, launch_id), "LAUNCHED")
            logger.info("Launched job %s/%s with %s seeds." %
                        (job.name, launch_id, len(job.seeds)))
            return "Launched job %s/%s with %s seeds." % (job.name, launch_id,
                                                          len(job.seeds))
        else:
            if job:
                logger.info("Stopped job %s/%s without restarting..." %
                            (job.name, launch_id))
                return "Stopped job %s/%s without restarting..." % (job.name,
                                                                    launch_id)
            else:
                logger.warning("No running '%s' job to stop!" % frequency)
                return "No running '%s' job to stop!" % frequency
    except BaseException as e:
        logger.exception(e)
        # BUG FIX: Celery's Task.retry takes 'exc', not 'exe'.
        raise self.retry(countdown=10, exc=e)
Esempio n. 11
0
                    type=str, default="http://*****:*****@bl.uk", 
                    help="W3ACT user email to login with [default: %(default)s]" )
    parser.add_argument('-p', '--w3act-pw', dest='w3act_pw', 
                    type=str, default="sysAdmin", 
                    help="W3ACT user password [default: %(default)s]" )
    parser.add_argument('-W', '--wb-url', dest='wb_url', 
                    type=str, default="http://localhost:8080/wayback", 
                    help="Wayback endpoint to check URL availability [default: %(default)s]" )
    
    args = parser.parse_args()

    # Set up connection to ACT:
    act = w3act.w3act(args.w3act_url,args.w3act_user,args.w3act_pw)

    # Non-matching Target test
    run_doc_mdex_test('https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/567676/east_dulwich_community_nursery_association.pdf',
                    'https://www.gov.uk/government/publications/east-dulwich-community-nursery-association-inquiry-report',
                    'https://www.gov.uk/government/publications?departments[]=department-for-transport',
                    None,"East Dulwich Community Nursery Association")

    # Title-only extraction tests:
    run_doc_mdex_test_extraction(
        "https://www.euromod.ac.uk/sites/default/files/working-papers/em2-01.pdf",
        "https://www.euromod.ac.uk/publications/date/2001/type/EUROMOD%20Working%20Paper%20Series",
        "https://www.euromod.ac.uk/", "Towards a multi purpose framework for tax benefit microsimulation")

    run_doc_mdex_test_extraction(
        "https://www.gov.uk/government/uploads/system/uploads/attachment_data/file/128968/competency-guidance.pdf",
Esempio n. 12
0
import argparse
import logging

from crawl.w3act.w3act import w3act
from crawl.h3.utils import url_to_surt

# Console logging configuration for this script (DEBUG and above).
LOGGING_FORMAT="[%(asctime)s] %(levelname)s: %(message)s"
logging.basicConfig(format=LOGGING_FORMAT, level=logging.DEBUG)
log = logging.getLogger(__name__)
log.setLevel(logging.DEBUG)

if __name__ == "__main__":
    # Grab all Open Access targets from W3ACT and write their seeds out in
    # SURT prefix form, one per line.
    parser = argparse.ArgumentParser(
        'Grab Open Access targets and output to a file in SURT form.')
    parser.add_argument('--act-url', dest='act_url', type=str,
                        default="https://www.webarchive.org.uk/act/",
                        help="ACT endpoint to use. [default: %(default)s]")
    parser.add_argument('--act-username', dest='act_username', type=str,
                        help="ACT username to use. [default: %(default)s]")
    parser.add_argument('--act-password', dest='act_password', type=str,
                        help="ACT password to use. [default: %(default)s]")
    parser.add_argument('output_file', metavar='output file', default="/wayback/ldhosts.txt",
                        help="Output file to create, e.g. '/wayback/ldhosts.txt''.")

    args = parser.parse_args()

    # Express every seed of every OA target as a SURT prefix:
    w = w3act(args.act_url, args.act_username, args.act_password)
    items = w.get_oa_export("all")
    surts = ["http://(%s" % url_to_surt(u) for t in items for u in t["seeds"]]
    # BUG FIX: open in text mode — surts holds str values, and writing str to
    # a file opened "wb" raises TypeError on Python 3.
    with open(args.output_file, "w") as o:
        o.write("\n".join(surts))