def addMemento(self, urim):
    """Fetch the memento at `urim` (and its raw counterpart) through the
    caching session, recording `urim` on success or the error otherwise."""
    try:
        response = self.session.get(urim)

        # A non-empty redirect history means the memento actually lives at
        # response.url, so derive the raw URI-M from the final location.
        if response.history:
            raw_urim = otmt.generate_raw_urim(response.url)
        else:
            raw_urim = otmt.generate_raw_urim(urim)

        # Warm the session cache with the raw memento as well.
        self.session.get(raw_urim)

        self.urimlist.append(urim)

    except (ConnectionError, TooManyRedirects, RequestException) as e:
        self.addMementoError(urim, repr(e))
def generate_raw_urim_archiveorg_happy_path(self):
    """An Internet Archive URI-M should gain the `id_` modifier before the URI-R."""
    urim = "https://web.archive.org/web/20070207050545/http://www.cnn.com:80/"
    raw_urim = "https://web.archive.org/web/20070207050545id_/http://www.cnn.com:80/"
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(raw_urim, generate_raw_urim(urim))
def generate_raw_urim_archiveit_happy_path(self):
    """An Archive-It URI-M should gain the `id_` modifier before the URI-R."""
    urim = "http://wayback.archive-it.org/1068/20110317183254/http://www.amnistia.org.mx/"
    raw_urim = "http://wayback.archive-it.org/1068/20110317183254id_/http://www.amnistia.org.mx/"
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(raw_urim, generate_raw_urim(urim))
def get_newspaper_publication_date(urim, cache_storage):
    """Return the publication date of the memento at `urim`.

    The MongoDB cache at `cache_storage` is consulted first; on a miss the
    date is derived by running newspaper's article NLP over the raw memento
    content and the result is written back to the cache.

    Raises requests.HTTPError if the raw memento cannot be fetched.
    """
    import otmt

    dbconn = MongoClient(cache_storage)
    session = get_web_session(cache_storage)
    db = dbconn.get_default_database()

    try:
        # KeyError: document exists but lacks this field;
        # TypeError: find_one returned None (no document at all).
        return db.derivedvalues.find_one({"urim": urim})["newspaper publication date"]
    except (KeyError, TypeError):
        raw_urim = otmt.generate_raw_urim(urim)

        r = session.get(raw_urim)
        r.raise_for_status()

        article = Article(urim)
        article.download(r.text)
        article.parse()
        article.nlp()

        pd = article.publish_date

        if pd is None:
            # Fall back to the memento's archival datetime header.
            pd = r.headers['memento-datetime']
        else:
            pd = pd.strftime("%a, %d %b %Y %H:%M:%S GMT")

        # BUG FIX: without upsert=True this update matched nothing when the
        # document did not yet exist, so the cache was never populated.
        # Sibling cache writers (e.g. get_raw_simhash) already pass upsert=True.
        db.derivedvalues.update(
            {"urim": urim},
            {"$set": {"newspaper publication date": str(pd)}},
            upsert=True
        )

        return pd
def getMementoContent(self, urim):
    """Returns the HTTP entity of memento at `urim` provided that
    it was previously stored via `addMemento`.

    If no data was stored via `addMemento` for `urim`, then
    `CollectionModelNoSuchMementoException` is thrown.

    If data was stored via `addMementoError` for `urim`, then
    `CollectionModelMementoErrorException` is thrown.
    """
    # The session cache already holds the raw memento, so this get is a
    # cache read rather than a fresh network fetch.
    return self.session.get(otmt.generate_raw_urim(urim)).text
def get_raw_simhash(urim, cache_storage):
    """Return (as a string) the Simhash of the raw memento behind `urim`,
    consulting and updating the MongoDB cache at `cache_storage`."""
    import otmt

    dbconn = MongoClient(cache_storage)
    session = get_web_session(cache_storage)
    db = dbconn.get_default_database()

    # 1 if lang of urim in cache, return it
    try:
        return db.derivedvalues.find_one({"urim": urim})["raw simhash"]
    except (KeyError, TypeError):
        initial_response = session.get(urim)

        # On a redirect, the memento actually lives at the final URL, so
        # derive the raw URI-M from there instead of the requested one.
        if initial_response.history:
            raw_urim = otmt.generate_raw_urim(initial_response.url)
        else:
            raw_urim = otmt.generate_raw_urim(urim)

        raw_response = session.get(raw_urim)
        raw_response.raise_for_status()

        if 'text/html' not in raw_response.headers['content-type']:
            raise Exception(
                "Hypercane currently only operates with HTML resources, refusing to compute Simhash on {}"
                .format(urim))

        simhash_value = Simhash(raw_response.text).value

        db.derivedvalues.update(
            {"urim": urim},
            {"$set": {"raw simhash": str(simhash_value)}},
            upsert=True
        )

        return str(simhash_value)
def synthesize_warc(urim, session, output_directory):
    """Write the raw memento at `urim` into a new gzipped WARC file inside
    `output_directory`, skipping work if a WARC for this URI-M already exists.

    The WARC record's target URI is taken from the memento's `original` link
    relation and its WARC-Date from the Memento-Datetime header; if either is
    missing, a warning is logged and nothing is written.
    """
    import otmt
    import glob
    from warcio.warcwriter import WARCWriter
    from warcio.statusandheaders import StatusAndHeaders
    from hashlib import md5
    from datetime import datetime
    import traceback

    # Output filenames are keyed on an MD5 of the URI-M (used as an
    # identifier, not for security), so reruns can detect prior output.
    m = md5()
    m.update(urim.encode('utf8'))
    urlhash = m.hexdigest()

    # Skip if any WARC for this URI-M hash was already produced.
    if len( glob.glob('{}/{}*.warc.gz'.format(output_directory, urlhash)) ) > 0:
        module_logger.warning("Detected existing WARC for URI-M, skipping {}".format(urim))
        return

    # stream=True keeps the body unread so resp.raw can be consumed later.
    resp = session.get(urim, stream=True)
    resp.raise_for_status()

    # Record headers come from the (non-raw) memento response.
    headers_list = resp.raw.headers.items()

    # we use response.url instead of urim to (hopefully) avoid raw redirects
    raw_urim = otmt.generate_raw_urim(resp.url)

    # The raw memento supplies the payload written into the WARC record.
    raw_response = session.get(raw_urim, stream=True)

    warc_target_uri = None

    # we have to implement this construct in case the archive combines original with other relations
    for link in resp.links:
        if 'original' in link:
            warc_target_uri = resp.links[link]['url']

    if warc_target_uri is None:
        module_logger.warning("could not find this memento's original resource, skipping {}".format(urim))
        return

    try:
        mdt = resp.headers['Memento-Datetime']
    except KeyError:
        module_logger.warning("could not find this memento's memento-datetime, skipping {}".format(urim))
        return

    http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0')

    module_logger.debug("mdt formatted by strptime and converted by strftime: {}".format(
        datetime.strptime( mdt, "%a, %d %b %Y %H:%M:%S GMT" ).strftime('%Y-%m-%dT%H:%M:%SZ')
    ))

    # WARC-Date must be ISO 8601; convert from the RFC 1123 Memento-Datetime.
    warc_headers_dict = {}
    warc_headers_dict['WARC-Date'] = datetime.strptime(
        mdt, "%a, %d %b %Y %H:%M:%S GMT" ).strftime('%Y-%m-%dT%H:%M:%SZ')

    # Timestamp in the filename keeps multiple runs from colliding.
    with open("{}/{}-{}.warc.gz".format(
            output_directory, urlhash,
            datetime.now().strftime('%Y%m%d%H%M%S')), 'wb') as output:
        writer = WARCWriter(output, gzip=True)

        record = writer.create_warc_record(
            warc_target_uri, 'response',
            payload=raw_response.raw,
            http_headers=http_headers,
            warc_headers_dict=warc_headers_dict
        )

        writer.write_record(record)
def addManyMementos(self, urims):
    """Fetch all `urims` (and then their raw counterparts) concurrently via a
    FuturesSession, recording successes in `self.urimlist` and failures via
    `self.addMementoError`."""
    module_logger.info("started with {} URI-Ms for processing...".format(len(urims)))

    # protect the function from duplicates in the urims list
    urims = list(set(urims))

    module_logger.info("found duplicates, now using {} URI-Ms for processing...".format(len(urims)))

    futuressession = FuturesSession(session=self.session)
    retry = Retry(
        total=10,
        read=10,
        connect=10,
        backoff_factor=0.3,
        status_forcelist=(500, 502, 504)
    )
    adapter = HTTPAdapter(max_retries=retry)
    futuressession.mount('http://', adapter)
    futuressession.mount('https://', adapter)

    futures = {}
    raw_futures = {}
    working_urim_list = []
    raw_urims = []

    # Kick off all fetches up front; results are collected as they complete.
    for uri in urims:
        working_urim_list.append(uri)
        futures[uri] = futuressession.get(uri)

    working_starting_size = len(working_urim_list)

    def uri_generator(urilist):
        # Yield random entries until the caller has emptied the list.
        while len(urilist) > 0:
            uchoice = random.choice(urilist)
            yield uchoice

    for uri in uri_generator(working_urim_list):
        if futures[uri].done():
            module_logger.debug("URI-M {} is done, processing...".format(uri))

            if len(working_urim_list) % 100 == 0:
                module_logger.info("{}/{} mementos left to process".format(
                    len(working_urim_list), working_starting_size))

            try:
                r = futures[uri].result()

                # Derive the raw URI-M from the final URL when redirected.
                if len(r.history) == 0:
                    raw_urim = otmt.generate_raw_urim(uri)
                else:
                    raw_urim = otmt.generate_raw_urim(r.url)

                raw_urims.append( raw_urim )

                if 'memento-datetime' not in r.headers:
                    self.addMementoError(uri, "URI-M {} does not produce a memento".format(uri))
                else:
                    # the content should be cached by the session
                    # we just need to keep track of the URI-Ms for this run
                    self.urimlist.append(uri)

            except Exception as e:
                self.addMementoError(uri, repr(e))

            working_urim_list.remove(uri)
            del futures[uri]

    module_logger.info("done adding {} mementos, now adding corresponding {} raw mementos...".format(
        len(urims), len(raw_urims)
    ))

    working_raw_urim_list = []

    for raw_urim in list(set(raw_urims)):
        working_raw_urim_list.append(raw_urim)
        raw_futures[raw_urim] = futuressession.get(raw_urim)

    working_rawurims_starting_size = len(working_raw_urim_list)

    while len(working_raw_urim_list) > 0:
        raw_urim = random.choice(working_raw_urim_list)

        module_logger.debug("fetching results for raw URI-M {}".format(raw_urim))
        module_logger.debug("raw mementos working list size: {}".format(len(working_raw_urim_list)))
        module_logger.debug("raw mementos futures keys size: {}".format(len(raw_futures)))

        if raw_futures[raw_urim].done():
            module_logger.debug("raw URI-M {} is done, processing...".format(raw_urim))

            if len(working_raw_urim_list) % 100 == 0:
                module_logger.info("{}/{} raw mementos left to process".format(
                    len(working_raw_urim_list), working_rawurims_starting_size))

            try:
                r = raw_futures[raw_urim].result()

                if 'memento-datetime' not in r.headers:
                    # BUG FIX: this previously reported the error against the
                    # stale `uri` variable from the first loop instead of the
                    # raw URI-M being processed here.
                    self.addMementoError(raw_urim, "raw URI-M {} does not produce a memento".format(raw_urim))
                else:
                    # the content should be cached by the session
                    # we just need to keep track of the raw URI-Ms for this run
                    self.urimlist.append(raw_urim)

            except Exception as e:
                self.addMementoError(raw_urim, repr(e))

            working_raw_urim_list.remove(raw_urim)
            del raw_futures[raw_urim]

        time.sleep(1)
def generate_raw_urim_archiveorg_raw_already(self):
    """An Internet Archive URI-M that already has `id_` should pass through unchanged."""
    raw_urim = "https://web.archive.org/web/20070207050545id_/http://www.cnn.com:80/"
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(raw_urim, generate_raw_urim(raw_urim))
def generate_raw_urim_archiveit_raw_already(self):
    """An Archive-It URI-M that already has `id_` should pass through unchanged."""
    raw_urim = "http://wayback.archive-it.org/1068/20110317183254id_/http://www.amnistia.org.mx/"
    # assertEquals is a deprecated alias (removed in Python 3.12); use assertEqual.
    self.assertEqual(raw_urim, generate_raw_urim(raw_urim))
def get_boilerplate_free_content(urim, cache_storage="", dbconn=None, session=None):
    """Return the boilerplate-free content of the memento at `urim` as bytes.

    The MongoDB cache is consulted first; on a miss the raw memento is
    fetched and boilerpy3's ArticleExtractor is applied, with the result
    cached. Returns empty bytes for non-HTML resources or extraction failure.
    """
    import otmt
    from boilerpy3 import extractors

    if dbconn is None:
        dbconn = MongoClient(cache_storage)

    if session is None:
        session = get_web_session(cache_storage)

    db = dbconn.get_default_database()

    # 1. if boilerplate free content in cache, return it
    try:
        bpfree = db.derivedvalues.find_one({"urim": urim})["boilerplate free content"]
        # BUG FIX: this log line previously fired (with a typo) before the
        # lookup, wrongly reporting a cache hit even on every cache miss.
        module_logger.info(
            "returning boilerplate free content from cache for {}".format(urim))
        return bytes(bpfree, "utf8")
    except (KeyError, TypeError):
        module_logger.info(
            "generating boilerplate free content for {}".format(urim))

        r = session.get(urim)

        # On a redirect, derive the raw URI-M from the final URL.
        if len(r.history) == 0:
            raw_urim = otmt.generate_raw_urim(urim)
        else:
            raw_urim = otmt.generate_raw_urim(r.url)

        r2 = session.get(raw_urim)
        r2.raise_for_status()

        module_logger.info("content-type is {}".format(
            r2.headers['content-type']))

        if 'text/html' not in r2.headers['content-type']:
            module_logger.warning(
                "we can only remove boilerplate from HTML, returning zero bytes"
            )
            return bytes()

        module_logger.debug(
            "attempting to extract boilerplate free content from {}".format(
                urim))

        extractor = extractors.ArticleExtractor()

        try:
            bpfree = extractor.get_content(r2.text)

            module_logger.info(
                "storing boilerplate free content in cache {}".format(urim))

            db.derivedvalues.update(
                {"urim": urim},
                {"$set": {"boilerplate free content": bpfree}},
                upsert=True)
        except Exception:
            # Best-effort: record the failure and fall back to empty bytes
            # rather than aborting the caller's whole run.
            module_logger.exception(
                "failed to extract boilerplate from {}, setting value to empty string"
                .format(urim))
            hypercane.errors.errorstore.add(urim, traceback.format_exc())
            return bytes()

        return bytes(bpfree, "utf8")