Example 1
 def _parse_line(self, line=""):
     """Parses single line of a CDX file and returns selected and derived attributes in a namedtuple."""
     segs = line.strip().split(" ")
     if len(segs) > 3:
         url = urlparse(segs[2])
         dom = tldextract.extract(segs[2])
         Segments = namedtuple("Segments", "scheme, host, domain, tld, surt, uri, time, mime")
         return Segments(url.scheme, url.netloc, surt(dom.registered_domain), surt(dom.suffix), surt(segs[0]), segs[0], segs[1], segs[3])
Example 2
 def _parse_line(self, line=""):
     """Parses single line of a CDX file and returns selected and derived attributes in a namedtuple."""
     segs = line.strip().split(" ")
     if len(segs) > 3:
         url = urlparse(segs[2])
         dom = tldextract.extract(segs[2])
         Segments = namedtuple(
             "Segments", "scheme, host, domain, tld, surt, uri, time, mime")
         return Segments(url.scheme, url.netloc,
                         surt(dom.registered_domain), surt(dom.suffix),
                         surt(segs[0]), segs[0], segs[1], segs[3])
Example 3
def test_surt_return_type(burl):
    """surt.surt() returns the same type of string object (i.e. returns unicode
    string for unicode string input, and byets for bytes)

    Note this behavior may change in the future versions. This test is for
    testing compatibility until that happens.
    """
    assert isinstance(burl, bytes)

    b = surt.surt(burl)
    assert type(b) is type(burl)

    uurl = burl.decode('ascii')
    u = surt.surt(uurl)
    assert type(u) is type(uurl)
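
A minimal standalone sketch of the type-preservation property the test above exercises; the example.com input and the expected "com,example)/" key are illustrative assumptions rather than values taken from the test suite.

import surt

# surt.surt() is expected to mirror the type of its input (see the test above):
# bytes in -> bytes out, str (unicode) in -> str out.
b_key = surt.surt(b"http://example.com/")   # e.g. b"com,example)/"
u_key = surt.surt(u"http://example.com/")   # e.g. u"com,example)/"
assert isinstance(b_key, bytes)
assert isinstance(u_key, str)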
Example 4
def timestamp_simhash(redis_db, url, timestamp):
    """Get stored simhash data from Redis for URL and timestamp
    """
    try:
        if url and timestamp:
            results = redis_db.hget(surt(url), timestamp)
            if results:
                return {'simhash': results}
            results = redis_db.hget(surt(url), timestamp[:4])
            if results:
                return {'status': 'error', 'message': 'NO_CAPTURES'}
            return {'status': 'error', 'message': 'CAPTURE_NOT_FOUND'}
    except RedisError as exc:
        logging.error('error loading simhash data for url %s timestamp %s (%s)',
                      url, timestamp, exc)
Example 5
def getCDXJLinesWithURIR(urir, indexPath=ipwbConfig.getIPWBReplayIndexPath()):
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    cdxjLinesWithURIR = []

    cdxjLineIndex = getCDXJLine_binarySearch(s, indexPath, True, True)  # get i

    if cdxjLineIndex is None:
        return []

    cdxjLines = []
    with open(indexPath, 'r') as f:
        cdxjLines = f.read().split('\n')
        baseCDXJLine = cdxjLines[cdxjLineIndex]  # via binsearch

        cdxjLinesWithURIR.append(baseCDXJLine)

    # Get lines before pivot that match surt
    sI = cdxjLineIndex - 1
    while sI >= 0:
        if cdxjLines[sI].split(' ')[0] == s:
            cdxjLinesWithURIR.append(cdxjLines[sI])
        sI -= 1
    # Get lines after pivot that match surt
    sI = cdxjLineIndex + 1
    while sI < len(cdxjLines):
        if cdxjLines[sI].split(' ')[0] == s:
            cdxjLinesWithURIR.append(cdxjLines[sI])
        sI += 1
    return cdxjLinesWithURIR
Example 6
def showTimeMap(urir, format):
    urir = getCompleteURI(urir)
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    indexPath = ipwbUtils.getIPWBReplayIndexPath()

    cdxjLinesWithURIR = getCDXJLinesWithURIR(urir, indexPath)
    tmContentType = ''

    hostAndPort = ipwbUtils.getIPWBReplayConfig()

    tgURI = 'http://{0}:{1}/timegate/{2}'.format(
        hostAndPort[0],
        hostAndPort[1], urir)

    tm = ''  # Initialize for usage beyond below conditionals
    if format == 'link':
        tm = generateLinkTimeMapFromCDXJLines(
            cdxjLinesWithURIR, s, request.url, tgURI)
        tmContentType = 'application/link-format'
    elif format == 'cdxj':
        tm = generateCDXJTimeMapFromCDXJLines(
            cdxjLinesWithURIR, s, request.url, tgURI)
        tmContentType = 'application/cdxj+ors'

    resp = Response(tm)
    resp.headers['Content-Type'] = tmContentType

    return resp
Example 7
def verify_block(urim):
    surim = surt.surt(urim)
    status = "FAILED"
    blkct = 0
    # Initialize timings so the return dict below is valid even when no block
    # contains a record for this URI-M.
    t2_diff = t3_diff = t4_diff = 0
    t2_b = timeit.default_timer()
    for blk in blkfs:
        blkct += 1
        rec = lookup_in_block(surim, blk)
        if not rec:
            continue
        t3_b = timeit.default_timer()
        t2_diff = t3_b - t2_b
        mfp = generate_current(urim)
        t4_b = timeit.default_timer()
        t3_diff = t4_b - t3_b
        mf = json.load(open(mfp))
        if rec["hash"] == mf["hash"]:
            status = "VERIFIED"
        t4_diff = timeit.default_timer() - t4_b
        break
    return {
        "status": status,
        "blkct": blkct,
        "lookupt": t2_diff,
        "gent": t3_diff,
        "verift": t4_diff
    }
Example 8
def showTimeMap(urir, format):
    urir = getCompleteURI(urir)
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    indexPath = ipwbUtils.getIPWBReplayIndexPath()

    cdxjLinesWithURIR = getCDXJLinesWithURIR(urir, indexPath)
    tmContentType = ''

    hostAndPort = ipwbUtils.getIPWBReplayConfig()

    tgURI = 'http://{0}:{1}/timegate/{2}'.format(
        hostAndPort[0],
        hostAndPort[1], urir)

    tm = ''  # Initialize for usage beyond below conditionals
    if format == 'link':
        tm = generateLinkTimeMapFromCDXJLines(
            cdxjLinesWithURIR, s, request.url, tgURI)
        tmContentType = 'application/link-format'
    elif format == 'cdxj':
        tm = generateCDXJTimeMapFromCDXJLines(
            cdxjLinesWithURIR, s, request.url, tgURI)
        tmContentType = 'application/cdxj+ors'

    resp = Response(tm)
    resp.headers['Content-Type'] = tmContentType

    return resp
Example 9
    def __init__(self, runres, qrel, qtitles):
        with codecs.open(qtitles, "r", encoding='utf-8', errors='ignore') as f:
            # with open(qtitles, 'rb') as f:
            csv_reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
            for row in csv_reader:
                # for utf8_row in csv_reader:
                # row = [x.decode('utf8') for x in utf8_row]
                if len(row) < 2: continue
                self.qid_title_dict[row[0]] = row[1]
            f.close()
        with codecs.open(qrel, "r", encoding='utf-8', errors='ignore') as f:
            # with codecs.open(qrel, "rb") as f:
            csv_reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
            for row in csv_reader:
                # for utf8_row in csv_reader:
                # only select relevant document-query pairs
                # row = [x.decode('utf8') for x in utf8_row]
                # if row[3] == '1':
                # self.doc_query_pairs.append((row[2], row[0]))
                # if int(row[1]) <= 100:
                if len(row) < 5: continue
                self.doc_query_pairs.append((surt(row[4]), row[0]))

                # TODO: for now one-to-one query-url pair
                # self.doc_query_dict[surt(row[4])] = row[0]

            f.close()
        self._runRes = runres
Example 10
 def get_pseudo_rel_qd_bing(self, top_k):
     """
     query-doc relevance grade
     query_id \t rank \t title \t description \t url \t query_id (tab delimiter)
     :param top_k: top k documents
     :return:
     """
     d = []
     q = []
     with codecs.open(self._runRes, "r", encoding='utf-8',
                      errors='ignore') as f:
         # with open(self._runRes, "rb") as f:
         next(f)  # skip the first line
         csv_reader = csv.reader(f, delimiter='\t', quoting=csv.QUOTE_NONE)
         for row in csv_reader:
             # for utf8_row in csv_reader:
             # row = [x.decode('utf8') for x in utf8_row]
             if len(row) <= 1:
                 print(row)
                 continue
             if int(row[1]) <= top_k:
                 # urls in Bing is not normalized yet
                 d.append(surt(row[4]))
                 # get Bing rank as label
                 # qid - rank
                 q.append([row[0], row[1]])
         f.close()
     return d, q
Example 11
def resolveMemento(urir, datetime):
    """ Request a URI-R at a supplied datetime from the CDXJ """
    urir = getCompleteURI(urir)

    if ipwbUtils.isLocalHosty(urir):
        urir = urir.split('/', 4)[4]
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    indexPath = ipwbUtils.getIPWBReplayIndexPath()

    print('Getting CDXJ Lines with the URI-R {0} from {1}'
          .format(urir, indexPath))
    cdxjLinesWithURIR = getCDXJLinesWithURIR(urir, indexPath)

    closestLine = getCDXJLineClosestTo(datetime, cdxjLinesWithURIR)

    if closestLine is None:
        msg = '<h1>ERROR 404</h1>'
        msg += 'No capture found for {0} at {1}.'.format(urir, datetime)

        return Response(msg, status=404)

    uri = unsurt(closestLine.split(' ')[0])
    newDatetime = closestLine.split(' ')[1]

    linkHeader = getLinkHeaderAbbreviatedTimeMap(urir, newDatetime)

    return (newDatetime, linkHeader, uri)
Example 12
def memento_data_for_url(request, url, qs=None, hash=None):
    from perma.models import Link  #noqa
    try:
        canonicalized = surt.surt(url)
    except ValueError:
        return {}
    mementos = [{
        'uri': memento_url(request, link),
        'datetime': link.creation_timestamp,
    } for link in Link.objects.visible_to_memento().filter(
        submitted_url_surt=canonicalized).order_by('creation_timestamp')]
    if not mementos:
        return {}
    return {
        'self': request.build_absolute_uri(),
        'original_uri': url,
        'timegate_uri': timegate_url(request, url),
        'timemap_uri': {
            'json_format': timemap_url(request, url, 'json'),
            'link_format': timemap_url(request, url, 'link'),
            'html_format': timemap_url(request, url, 'html'),
        },
        'mementos': {
            'first': mementos[0],
            'last': mementos[-1],
            'list': mementos,
        }
    }
Example 13
def show_timemap(urir, format):
    urir = compile_target_uri(urir, request.query_string)

    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    index_path = ipwb_utils.get_ipwb_replay_index_path()

    cdxj_lines_with_urir = get_cdxj_lines_with_urir(urir, index_path)
    tm_content_type = ''

    host_and_port = ipwb_utils.get_ipwb_replay_config()

    tg_uri = f'http://{host_and_port[0]}:{host_and_port[1]}/timegate/{urir}'

    tm = ''  # Initialize for usage beyond below conditionals
    if format == 'link':
        tm = generate_link_timemap_from_cdxj_lines(cdxj_lines_with_urir, s,
                                                   request.url, tg_uri)
        tm_content_type = 'application/link-format'
    elif format == 'cdxj':
        tm = generate_cdxj_timemap_from_cdxj_lines(cdxj_lines_with_urir, s,
                                                   request.url, tg_uri)
        tm_content_type = 'application/cdxj+ors'

    resp = Response(tm)
    resp.headers['Content-Type'] = tm_content_type

    return resp
Example 14
def canonicalize(url, surt_ordered=True):
    """
    Canonicalize url and convert to surt
    If not in surt ordered mode, convert back to url form
    as surt conversion is currently part of canonicalization

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=True)
    'com,example)/path/file.html'

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
    'example.com/path/file.html'

    >>> canonicalize('urn:some:id')
    'urn:some:id'
    """
    try:
        key = surt.surt(url)
    except Exception as e:  #pragma: no cover
        # doesn't happen with surt from 0.3b
        # urn is already canonical, so just use as-is
        if url.startswith('urn:'):
            return url

        raise UrlCanonicalizeException('Invalid Url: ' + url)

    # if not surt, unsurt the surt to get canonicalized non-surt url
    if not surt_ordered:
        key = unsurt(key)

    return key
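
A short usage sketch of the canonicalize() helper above, repeating only the values already given in its doctests (it assumes canonicalize() is importable from the surrounding pywb-style module):

# Outputs mirror the doctests of canonicalize() above.
print(canonicalize('http://example.com/path/file.html', surt_ordered=True))
# com,example)/path/file.html
print(canonicalize('http://example.com/path/file.html', surt_ordered=False))
# example.com/path/file.html
print(canonicalize('urn:some:id'))
# urn:some:id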
Example 15
    def _assemble_entry(self, recorded_url, records):
        if recorded_url.response_recorder:
            if recorded_url.response_recorder.payload_digest.name == "sha1":
                sha1base32 = base64.b32encode(
                        recorded_url.response_recorder.payload_digest.digest()
                        ).decode("utf-8")
            else:
                self.logger.warn(
                        "digest type is %s but big captures table is indexed "
                        "by sha1",
                        recorded_url.response_recorder.payload_digest.name)
        else:
            digest = hashlib.new("sha1", records[0].content[1])
            sha1base32 = base64.b32encode(digest.digest()).decode("utf-8")

        if (recorded_url.warcprox_meta
                and "captures-bucket" in recorded_url.warcprox_meta):
            bucket = recorded_url.warcprox_meta["captures-bucket"]
        else:
            bucket = "__unspecified__"

        canon_surt = surt.surt(recorded_url.url.decode("utf-8"),
            trailing_comma=True, host_massage=False, with_scheme=True)

        entry = {
            # id only specified for rethinkdb partitioning
            "id": "{} {}".format(
                canon_surt[:20], records[0].id.decode("utf-8")[10:-1]),
            "abbr_canon_surt": canon_surt[:150],
            "canon_surt": canon_surt,
            "timestamp": recorded_url.timestamp.replace(
                tzinfo=rethinkstuff.UTC),
            "url": recorded_url.url.decode("utf-8"),
            "offset": records[0].offset,
            "filename": os.path.basename(records[0].warc_filename),
            "warc_type": records[0].type.decode("utf-8"),
            "warc_id": records[0].id.decode("utf-8"),
            "sha1base32": sha1base32,
            "content_type": recorded_url.mimetype,
            "response_code": recorded_url.status,
            "http_method": recorded_url.method,
            "bucket": bucket,
            "record_length": records[0].length, # compressed (or not) length of
                                                # warc record including record
                                                # headers
            "wire_bytes": recorded_url.size, # count of bytes transferred over
                                             # the wire, including http headers
                                             # if any
        }

        if recorded_url.warcprox_meta:
            if "dedup-ok" in recorded_url.warcprox_meta:
                entry["dedup_ok"] = recorded_url.warcprox_meta["dedup-ok"]
            if "captures-table-extra-fields" in recorded_url.warcprox_meta:
                extras = recorded_url.warcprox_meta[
                        "captures-table-extra-fields"]
                for extra_field in extras:
                    entry[extra_field] = extras[extra_field]

        return entry
Example 16
    def test_media_capture_in_iframes(self):
        settings.ENABLE_AV_CAPTURE = True
        target_folder = self.org_user.root_folder
        obj = self.successful_post(self.list_url,
                                   data={
                                       'url': self.server_url + "/test_media_outer.html",
                                       'folder': target_folder.pk,
                                   },
                                   user=self.org_user)

        # verify that all images in src and srcset were found and captured
        expected_captures = (
            # test_media_a.html
            "test.wav", "test2.wav",
            # test_media_b.html
            "test.mp4", "test2.mp4",
            # test_media_c.html
            "test.swf", "test2.swf", "test3.swf",
            "test1.jpg", "test2.png", "test_fallback.jpg",
            "wide1.png", "wide2.png", "narrow.png"
        )
        failures = []
        for expected_capture in expected_captures:
            try:
                cdxline = CDXLine.objects.get(urlkey=surt(self.server_url + "/" + expected_capture), link_id=obj['guid'])
                if cdxline.parsed['status'] != '200':
                    failures.append("%s returned HTTP status %s." % (expected_capture, cdxline.parsed['status']))
            except CDXLine.DoesNotExist:
                failures.append("%s not captured." % expected_capture)
        self.assertFalse(bool(failures), "Failures in fetching media from iframes: %s" % failures)
Example 17
def handle_results(redis_db,
                   timestamps_to_fetch,
                   url,
                   snapshots_per_page,
                   page=None):
    """Utility method used by `year_simhash`
    """
    available_simhashes = []
    if page:
        number_of_pages = ceil(len(timestamps_to_fetch) / snapshots_per_page)
        if page > number_of_pages:
            page = number_of_pages
        if number_of_pages > 0:
            timestamps_to_fetch = \
                timestamps_to_fetch[(page - 1) * snapshots_per_page:(page * snapshots_per_page)]
        else:
            number_of_pages = 1
    try:
        results = redis_db.hmget(surt(url), timestamps_to_fetch)
        for i, simhash in enumerate(results):
            available_simhashes.append([str(timestamps_to_fetch[i]), simhash])
        if page:
            available_simhashes.insert(0, ["pages", number_of_pages])
        return available_simhashes
    except RedisError as exc:
        logging.error('cannot handle results for url %s page %d (%s)', url,
                      page, exc)
Example 18
def getCDXJLinesWithURIR(urir, indexPath):
    """ Get all CDXJ records corresponding to a URI-R """
    if not indexPath:
        indexPath = ipwbUtils.getIPWBReplayIndexPath()
    indexPath = getIndexFileFullPath(indexPath)

    print('Getting CDXJ Lines with {0} in {1}'.format(urir, indexPath))
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    cdxjLinesWithURIR = []

    cdxjLineIndex = getCDXJLine_binarySearch(s, indexPath, True, True)  # get i

    if cdxjLineIndex is None:
        return []

    cdxjLines = []
    with open(indexPath, 'r') as f:
        cdxjLines = f.read().split('\n')
        baseCDXJLine = cdxjLines[cdxjLineIndex]  # via binsearch

        cdxjLinesWithURIR.append(baseCDXJLine)

    # Get lines before pivot that match surt
    sI = cdxjLineIndex - 1
    while sI >= 0:
        if cdxjLines[sI].split(' ')[0] == s:
            cdxjLinesWithURIR.append(cdxjLines[sI])
        sI -= 1
    # Get lines after pivot that match surt
    sI = cdxjLineIndex + 1
    while sI < len(cdxjLines):
        if cdxjLines[sI].split(' ')[0] == s:
            cdxjLinesWithURIR.append(cdxjLines[sI])
        sI += 1
    return cdxjLinesWithURIR
Example 19
def showMementosForURIRs(urir):
    urir = getCompleteURI(urir)

    if ipwbConfig.isLocalHosty(urir):
        urir = urir.split('/', 4)[4]
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    indexPath = ipwbConfig.getIPWBReplayIndexPath()

    print('Getting CDXJ Lines with the URI-R {0} from {1}'.format(
        urir, indexPath))
    cdxjLinesWithURIR = getCDXJLinesWithURIR(urir, indexPath)

    if len(cdxjLinesWithURIR) == 1:
        fields = cdxjLinesWithURIR[0].split(' ', 2)
        redirectURI = '/{1}/{0}'.format(unsurt(fields[0]), fields[1])
        return redirect(redirectURI, code=302)

    msg = ''
    if cdxjLinesWithURIR:
        msg += '<p>{0} capture(s) available:</p><ul>'.format(
            len(cdxjLinesWithURIR))
        for line in cdxjLinesWithURIR:
            fields = line.split(' ', 2)
            dt14 = fields[1]
            dtrfc1123 = ipwbConfig.datetimeToRFC1123(fields[1])
            msg += ('<li><a href="/{1}/{0}">{0} at {2}</a></li>'.format(
                unsurt(fields[0]), dt14, dtrfc1123))
        msg += '</ul>'
    return Response(msg)
Example 20
    def generate_surt(self, url):
        if self.RE_NONCHARS.search(url):
            logger.warn("Questionable characters found in URL [%s]" % url)
            return None

        surtVal = surt.surt(url)

        #### WA: ensure SURT has scheme of original URL ------------
        # line_scheme = RE_SCHEME.match(line)           # would allow http and https (and any others)
        line_scheme = 'http://'  # for wayback, all schemes need to be only http
        surt_scheme = self.RE_SCHEME.match(surtVal)

        if line_scheme and not surt_scheme:
            if re.match(r'\(', surtVal):
                # surtVal = line_scheme.group(0) + surtVal
                surtVal = line_scheme + surtVal
                logger.debug("Added scheme [%s] to surt [%s]" % (line_scheme, surtVal))
            else:
                # surtVal = line_scheme.group(0) + '(' + surtVal
                surtVal = line_scheme + '(' + surtVal
                # logger.debug("Added scheme [%s] and ( to surt [%s]" % (line_scheme, surtVal))

        surtVal = re.sub(r'\)/$', ',', surtVal)

        return surtVal
Example 21
    def test_media_capture_in_iframes(self):
        settings.ENABLE_AV_CAPTURE = True
        target_folder = self.org_user.root_folder
        obj = self.successful_post(self.list_url,
                                   data={
                                       'url': self.server_url + "/test_media_outer.html",
                                       'folder': target_folder.pk,
                                   },
                                   user=self.org_user)

        # verify that all images in src and srcset were found and captured
        expected_captures = (
            # test_media_a.html
            "test.wav", "test2.wav",
            # test_media_b.html
            "test.mp4", "test2.mp4",
            # test_media_c.html
            "test.swf", "test2.swf", "test3.swf",
            "test1.jpg", "test2.png", "test_fallback.jpg",
            "wide1.png", "wide2.png", "narrow.png"
        )
        failures = []
        for expected_capture in expected_captures:
            try:
                cdxline = CDXLine.objects.get(urlkey=surt(self.server_url + "/" + expected_capture), link_id=obj['guid'])
                if cdxline.parsed['status'] != '200':
                    failures.append("%s returned HTTP status %s." % (expected_capture, cdxline.parsed['status']))
            except CDXLine.DoesNotExist:
                failures.append("%s not captured." % expected_capture)
        self.assertFalse(bool(failures), "Failures in fetching media from iframes: %s" % failures)
Example 22
def resolveMemento(urir, datetime):
    """ Request a URI-R at a supplied datetime from the CDXJ """
    urir = getCompleteURI(urir)

    if ipwbUtils.isLocalHosty(urir):
        urir = urir.split('/', 4)[4]
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    indexPath = ipwbUtils.getIPWBReplayIndexPath()

    print('Getting CDXJ Lines with the URI-R {0} from {1}'
          .format(urir, indexPath))
    cdxjLinesWithURIR = getCDXJLinesWithURIR(urir, indexPath)

    closestLine = getCDXJLineClosestTo(datetime, cdxjLinesWithURIR)

    if closestLine is None:
        msg = '<h1>ERROR 404</h1>'
        msg += 'No capture found for {0} at {1}.'.format(urir, datetime)

        return Response(msg, status=404)

    uri = unsurt(closestLine.split(' ')[0])
    newDatetime = closestLine.split(' ')[1]

    linkHeader = getLinkHeaderAbbreviatedTimeMap(urir, newDatetime)

    return (newDatetime, linkHeader, uri)
Example 23
def regenerate_urlkeys(urlkey_prefix='file'):
    """
        Rewrite CDXLine urlkeys using the current version of the surt library.
    """

    from perma.models import CDXLine
    from surt import surt

    target_cdxlines = CDXLine.objects.all()
    if urlkey_prefix:
        target_cdxlines = target_cdxlines.filter(
            urlkey__startswith=urlkey_prefix)

    for i, cdxline in enumerate(target_cdxlines):
        if not (i % 1000):
            print "%s records done -- next is %s." % (i, cdxline.link_id)
        new_surt = surt(cdxline.parsed['url'])
        if new_surt != cdxline.urlkey:
            try:
                cdxline.raw = cdxline.raw.replace(cdxline.urlkey, new_surt, 1)
            except UnicodeDecodeError:
                print "Skipping unicode for %s" % cdxline.link_id
                continue
            cdxline.urlkey = new_surt
            cdxline.save()
Example 24
def getCDXJLinesWithURIR(urir, indexPath):
    """ Get all CDXJ records corresponding to a URI-R """
    if not indexPath:
        indexPath = ipwbUtils.getIPWBReplayIndexPath()
    indexPath = getIndexFileFullPath(indexPath)

    print('Getting CDXJ Lines with {0} in {1}'.format(urir, indexPath))
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    cdxjLinesWithURIR = []

    cdxjLineIndex = getCDXJLine_binarySearch(s, indexPath, True, True)  # get i

    if cdxjLineIndex is None:
        return []

    cdxjLines = []
    with open(indexPath, 'r') as f:
        cdxjLines = f.read().split('\n')
        baseCDXJLine = cdxjLines[cdxjLineIndex]  # via binsearch

        cdxjLinesWithURIR.append(baseCDXJLine)

    # Get lines before pivot that match surt
    sI = cdxjLineIndex - 1
    while sI >= 0:
        if cdxjLines[sI].split(' ')[0] == s:
            cdxjLinesWithURIR.append(cdxjLines[sI])
        sI -= 1
    # Get lines after pivot that match surt
    sI = cdxjLineIndex + 1
    while sI < len(cdxjLines):
        if cdxjLines[sI].split(' ')[0] == s:
            cdxjLinesWithURIR.append(cdxjLines[sI])
        sI += 1
    return cdxjLinesWithURIR
Example 25
 def _dsub(self, url):
     ext = tldextract.extract(url)
     reg_dom = surt(ext.registered_domain)
     if reg_dom[0].isalpha() and ")/" in reg_dom:
         subdom_len = 0
         if ext.subdomain:
             subdom_len = ext.subdomain.count(".") + 1
         return f"{reg_dom}{subdom_len}"
Example 26
    def load_cdx(self, query):
        """
            This function accepts a standard CDX request, except with a GUID instead of date, and returns a standard CDX 11 response.
        """
        guid = query.params['guid']
        url = query.url

        # We'll first check the key-value store to see if we cached the lookup for this guid on a previous request.
        # This will be common, since each playback triggers lots of requests for the same .warc file.
        cache_key = guid + '-surts'
        url_key = guid+'-url'
        surt_lookup = django_cache.get(cache_key)
        url = url or django_cache.get(url_key)
        if surt_lookup and url:
            surt_lookup = json.loads(surt_lookup)

        else:
            # nothing in cache; find requested link in database
            try:
                link = Link.objects.select_related().get(pk=guid)
            except Link.DoesNotExist:
                return []

            # cache url, which may be blank if this is the first request
            if not url:
                url = link.submitted_url
            django_cache.set(url_key, url, timeout=60*60)

            # get warc file
            for asset in link.assets.all():
                if '.warc' in asset.warc_capture:
                    warc_path = os.path.join(asset.base_storage_path, asset.warc_capture)
                    break
            else:
                return []  # no .warc file -- do something to handle this?

            # now we have to get an index of all the URLs in this .warc file
            # first try fetching it from a .cdx file on disk
            cdx_path = warc_path.replace('.gz', '').replace('.warc', '.cdx')

            if not default_storage.exists(cdx_path):
                # there isn't a .cdx file on disk either -- let's create it
                with default_storage.open(warc_path, 'rb') as warc_file, default_storage.open(cdx_path, 'wb') as cdx_file:
                    write_cdx_index(cdx_file, warc_file, warc_path, sort=True)

            # now load the URL index from disk and stick it in the cache
            cdx_lines = (line.strip() for line in default_storage.open(cdx_path, 'rb'))
            surt_lookup = dict((key, list(val)) for key, val in groupby(cdx_lines, key=lambda line: line.split(' ', 1)[0]))
            django_cache.set(cache_key, json.dumps(surt_lookup), timeout=60*60)

        # find cdx lines for url
        sorted_url = surt(url)
        if sorted_url in surt_lookup:
            return (str(i) for i in surt_lookup[sorted_url])

        # didn't find requested url in this archive
        return []
Example 27
 def _dsub(self, url):
     ext = tldextract.extract(url)
     urlseg = urlparse("http://" + url)
     reg_dom = surt(ext.registered_domain)
     if reg_dom[0].isalpha() and ")/" in reg_dom:
         subdom_len = 0
         if ext.subdomain:
             subdom_len = ext.subdomain.count(".") + 1
         return "{0}{1}".format(reg_dom, subdom_len)
Example 28
    def run(self, url, year, created):
        """Run Celery Task.
        """
        self.job_id = self.request.id
        self.url = url_fix(url)
        time_started = datetime.now()
        self._log.info('Start calculating simhashes.')
        self.download_errors = 0
        if not self.url:
            self._log.error('did not give url parameter')
            return {'status': 'error', 'info': 'URL is required.'}
        if not year:
            self._log.error('did not give year parameter')
            return {'status': 'error', 'info': 'Year is required.'}
        # fetch captures
        self.update_state(
            state='PENDING',
            meta={'info': 'Fetching %s captures for year %s' % (url, year)})
        resp = self.fetch_cdx(url, year)
        if resp.get('status') == 'error':
            return resp
        captures = resp.get('captures')
        total = len(captures)
        self.seen = dict()
        # calculate simhashes in parallel
        i = 0
        final_results = {}
        for res in self.tpool.map(self.get_calc, captures):
            if not res:
                continue
            (timestamp, simhash) = res
            if simhash:
                final_results[timestamp] = simhash
            if i % 10 == 0:
                self.update_state(state='PENDING',
                                  meta={
                                      'info':
                                      'Processed %d out of %d captures.' %
                                      (i, total)
                                  })
            i += 1

        self._log.info('%d final results for %s and year %s.',
                       len(final_results), self.url, year)
        if final_results:
            try:
                urlkey = surt(self.url)
                self.redis.hmset(urlkey, final_results)
                self.redis.expire(urlkey, self.simhash_expire)
            except RedisError as exc:
                self._log.error('cannot write simhashes to Redis for URL %s',
                                self.url,
                                exc_info=1)

        duration = (datetime.now() - time_started).seconds
        self._log.info('Simhash calculation finished in %.2fsec.', duration)
        return {'duration': str(duration)}
Example 29
def showTimeMap(urir, format):
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    indexPath = ipwbConfig.getIPWBReplayIndexPath()

    cdxjLinesWithURIR = getCDXJLinesWithURIR(urir, indexPath)

    tm = generateTimeMapFromCDXJLines(cdxjLinesWithURIR, s, request.url)

    return Response(tm)
Example 30
 def save_to_redis(self, ts, data):
     try:
         urlkey = surt(self.url)
         self._log.info('save simhash to Redis for timestamp %s urlkey %s',
                        ts, urlkey)
         self.redis_db.hset(urlkey, ts,
                            base64.b64encode(str(data).encode('ascii')))
     except RedisError as exc:
         self._log.error('cannot save simhash to Redis (%s)', exc)
Example 31
def normalize_url(url, base_url):
    absolute = urljoin(base_url, url)
    # Use SURT to do most normalization, but don't return in SURT format.
    result = surt(absolute, reverse_ipaddr=False, surt=False, with_scheme=True)
    # Use HTTPS for all web URLs. Don't translate other schemes (e.g. FTP).
    if result.startswith('http:'):
        result = f'https:{result[5:]}'

    return result
Example 32
 def _dsub(self, url):
     ext = tldextract.extract(url)
     urlseg = urlparse("http://" + url)
     reg_dom = surt(ext.registered_domain)
     if reg_dom[0].isalpha() and ")/" in reg_dom:
         subdom_len = 0
         if ext.subdomain:
             subdom_len = ext.subdomain.count(".") + 1
         return "{0}{1}".format(reg_dom, subdom_len)
Example 33
 def __call__(self, url):
     try:
         key = surt.surt(
                 url, trailing_comma=True, host_massage=False,
                 with_scheme=True)
         # logging.debug('%s -> %s', url, key)
         return key
     except Exception as e:
         raise pywb.utils.canonicalize.UrlCanonicalizeException(
                 'Invalid Url: ' + url)
Example 34
 def _update_ds(self, count, entry):
     """Update data structure after processing a line from the CDX"""
     try:
         suburis = generate_suburis(surt(entry), max_host_segments=self.max_host_segments, max_path_segments=self.max_path_segments)
         for s in suburis:
             self._update_record("suburi", s, count)
         #self._update_record("time", entry.time[0:6], entry.surt)
         #self._update_record("mediatype", entry.mime, entry.surt)
     except:
         print("Something went wrong while processing " + entry)
Example 35
 def __call__(self, url):
     try:
         key = surt.surt(
                 url, trailing_comma=True, host_massage=False,
                 with_scheme=True)
         # logging.debug('%s -> %s', url, key)
         return key
     except Exception as e:
         raise pywb.utils.canonicalize.UrlCanonicalizeException(
                 'Invalid Url: ' + url)
Example 36
def cdx(request):
    """
        This function handles WARC lookups by our warc server (running in warc_server).
        It accepts a standard CDX request, except with a GUID instead of date, and returns a standard CDX 11 response.
        If there's no warc for the requested GUID, or the requested URL isn't stored in that WARC, it returns a 404.
    """
    # find requested link and url
    try:
        link = Link.objects.select_related().get(pk=request.POST.get('guid'))
    except Link.DoesNotExist:
        print "COULDN'T FIND LINK"
        raise Http404
    url = request.POST.get('url', link.submitted_url)

    # get warc file
    for asset in link.assets.all():
        if '.warc' in asset.warc_capture:
            warc_path = os.path.join(settings.GENERATED_ASSETS_STORAGE, asset.base_storage_path, asset.warc_capture)
            break
    else:
        if settings.USE_WARC_ARCHIVE:
            print "COULDN'T FIND WARC"
            raise Http404 # no .warc file -- do something to handle this
        else:
            warc_path = os.path.join(settings.GENERATED_ASSETS_STORAGE, asset.base_storage_path, "archive.warc.gz")

    # get cdx file
    cdx_path = warc_path.replace('.gz', '').replace('.warc', '.cdx')
    try:
        cdx_file = open(cdx_path, 'rb')
    except IOError:
        # if we can't find the CDX file associated with this WARC, create it
        cdx_lines = StringIO.StringIO()
        cdx_writer.CDX_Writer(warc_path, cdx_lines).make_cdx()
        cdx_lines = cdx_lines.getvalue().split("\n")
        with open(cdx_path, 'wb') as cdx_file:
            cdx_file.write("\n".join(sorted(cdx_lines)))
        cdx_file = open(cdx_path, 'rb')

    # find cdx lines for url
    sorted_url = surt.surt(url)
    out = ""
    for line in cdx_file:
        if line.startswith(sorted_url+" "):
            out += line
        elif out:
            # file may contain multiple matching lines in a row; we want to return all of them
            # if we've already found one or more matching lines, and now they're no longer matching, we're done
            break

    if out:
        return HttpResponse(out, content_type="text/plain")

    print "COULDN'T FIND URL"
    raise Http404 # didn't find requested url in .cdx file
Example 37
 def _dpth(self, url):
     ext = tldextract.extract(url)
     urlseg = urlparse(url)
     reg_dom = surt(ext.registered_domain)
     if reg_dom[0].isalpha() and ")/" in reg_dom:
         subdom_len = path_len = 0
         if ext.subdomain:
             subdom_len = ext.subdomain.count(".") + 1
         if urlseg.path:
             path_len = urlseg.path.strip("\n\r/").count("/") + 1
         return f"{reg_dom}{subdom_len}/{path_len}"
Example 38
 def _dpth(self, url):
     ext = tldextract.extract(url)
     urlseg = urlparse("http://" + url)
     reg_dom = surt(ext.registered_domain)
     if reg_dom[0].isalpha() and ")/" in reg_dom:
         subdom_len = path_len = 0
         if ext.subdomain:
             subdom_len = ext.subdomain.count(".") + 1
         if urlseg.path:
             path_len = urlseg.path.strip("\n\r/").count("/") + 1
         return "{0}{1}/{2}".format(reg_dom, subdom_len, path_len)
Example 39
 def _dpth(self, url):
     ext = tldextract.extract(url)
     urlseg = urlparse("http://" + url)
     reg_dom = surt(ext.registered_domain)
     if reg_dom[0].isalpha() and ")/" in reg_dom:
         subdom_len = path_len = 0
         if ext.subdomain:
             subdom_len = ext.subdomain.count(".") + 1
         if urlseg.path:
             path_len = urlseg.path.strip("\n\r/").count("/") + 1
         return "{0}{1}/{2}".format(reg_dom, subdom_len, path_len)
Example 40
def getLinkHeaderAbbreviatedTimeMap(urir, pivotDatetime):
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    indexPath = ipwbUtils.getIPWBReplayIndexPath()

    cdxjLinesWithURIR = getCDXJLinesWithURIR(urir, indexPath)
    hostAndPort = ipwbUtils.getIPWBReplayConfig()

    tgURI = 'http://{0}:{1}/timegate/{2}'.format(
        hostAndPort[0],
        hostAndPort[1], urir)

    tmURI = 'http://{0}:{1}/timemap/link/{2}'.format(
        hostAndPort[0],
        hostAndPort[1], urir)
    tm = generateLinkTimeMapFromCDXJLines(cdxjLinesWithURIR, s, tmURI, tgURI)

    # Fix base TM relation when viewing abbrev version in Link resp
    tm = tm.replace('rel="self timemap"', 'rel="timemap"')

    # Only one memento in TimeMap
    if 'rel="first last memento"' in tm:
        return tm.replace('\n', ' ').strip()

    tmLines = tm.split('\n')
    for idx, line in enumerate(tmLines):
        if len(re.findall('rel=.*memento"', line)) == 0:
            continue  # Not a memento

        if pivotDatetime in line:
            addBothNextAndPrev = False
            if idx > 0 and idx < len(tmLines) - 1:
                addBothNextAndPrev = True

            if addBothNextAndPrev or idx == 0:
                tmLines[idx + 1] = \
                    tmLines[idx + 1].replace('memento"', 'next memento"')
            if addBothNextAndPrev or idx == len(tmLines) - 1:
                tmLines[idx - 1] = \
                    tmLines[idx - 1].replace('memento"', 'prev memento"')
            break

    # Remove all mementos in abbrev TM that are not:
    #   first, last, prev, next, or pivot
    for idx, line in enumerate(tmLines):
        if len(re.findall('rel=.*memento"', line)) == 0:
            continue  # Not a memento
        if pivotDatetime in line:
            continue

        if len(re.findall('rel=.*(next|prev|first|last)', line)) == 0:
            tmLines[idx] = ''

    return ' '.join(filter(None, tmLines))
Example 41
def getLinkHeaderAbbreviatedTimeMap(urir, pivotDatetime):
    s = surt.surt(urir, path_strip_trailing_slash_unless_empty=False)
    indexPath = ipwbUtils.getIPWBReplayIndexPath()

    cdxjLinesWithURIR = getCDXJLinesWithURIR(urir, indexPath)
    hostAndPort = ipwbUtils.getIPWBReplayConfig()

    tgURI = 'http://{0}:{1}/timegate/{2}'.format(
        hostAndPort[0],
        hostAndPort[1], urir)

    tmURI = 'http://{0}:{1}/timemap/link/{2}'.format(
        hostAndPort[0],
        hostAndPort[1], urir)
    tm = generateLinkTimeMapFromCDXJLines(cdxjLinesWithURIR, s, tmURI, tgURI)

    # Fix base TM relation when viewing abbrev version in Link resp
    tm = tm.replace('rel="self timemap"', 'rel="timemap"')

    # Only one memento in TimeMap
    if 'rel="first last memento"' in tm:
        return tm.replace('\n', ' ').strip()

    tmLines = tm.split('\n')
    for idx, line in enumerate(tmLines):
        if len(re.findall('rel=.*memento"', line)) == 0:
            continue  # Not a memento

        if pivotDatetime in line:
            addBothNextAndPrev = False
            if idx > 0 and idx < len(tmLines) - 1:
                addBothNextAndPrev = True

            if addBothNextAndPrev or idx == 0:
                tmLines[idx + 1] = \
                    tmLines[idx + 1].replace('memento"', 'next memento"')
            if addBothNextAndPrev or idx == len(tmLines) - 1:
                tmLines[idx - 1] = \
                    tmLines[idx - 1].replace('memento"', 'prev memento"')
            break

    # Remove all mementos in abbrev TM that are not:
    #   first, last, prev, next, or pivot
    for idx, line in enumerate(tmLines):
        if len(re.findall('rel=.*memento"', line)) == 0:
            continue  # Not a memento
        if pivotDatetime in line:
            continue

        if len(re.findall('rel=.*(next|prev|first|last)', line)) == 0:
            tmLines[idx] = ''

    return ' '.join(filter(None, tmLines))
Example 42
    def test_should_capture_all_srcset_images(self):
        target_folder = self.org_user.root_folder
        obj = self.successful_post(self.list_url,
                                   data={
                                       'url': self.server_url + "/test_media_outer.html",
                                       'folder': target_folder.pk,
                                   },
                                   user=self.org_user)

        # verify that all images in src and srcset were found and captured
        expected_captures = ("test1.jpg", "test2.png", "test_fallback.jpg", "wide1.png", "wide2.png", "narrow.png")
        for expected_capture in expected_captures:
            self.assertEqual('200', CDXLine.objects.get(urlkey=surt(self.server_url + "/" + expected_capture), link_id=obj['guid']).parsed['status'])
Example 43
    def test_should_capture_nested_audio_file(self):
        settings.ENABLE_AV_CAPTURE = True
        target_folder = self.org_user.root_folder
        obj = self.successful_post(self.list_url,
                                   data={
                                       'url': self.server_url + "/test_wav_outer.html",
                                       'folder': target_folder.pk,
                                   },
                                   user=self.org_user)

        # verify that embedded /test.* files in iframe were found and captured
        expected_captures = ("test.wav", "test2.wav", "test.mp4", "test2.mp4", "test.swf", "test2.swf", "test3.swf")
        for expected_capture in expected_captures:
            self.assertEqual('200', CDXLine.objects.get(urlkey=surt(self.server_url + "/" + expected_capture), link_id=obj['guid']).parsed['status'])
Example 44
    def get_massaged_url(self, record, use_precalculated_value=True):
        if use_precalculated_value:
            return self.surt

        if 'warcinfo' == record.type:
            return self.get_original_url(record)
        else:
            url = record.url
            if self.screenshot_mode:
                url = 'http://web.archive.org/screenshot/'+url

            try:
                return surt(url)
            except:
                return self.get_original_url(record)
Example 45
def cdx(request):
    """
        This function handles WARC lookups by our warc server (running in warc_server).
        It accepts a standard CDX request, except with a GUID instead of date, and returns a standard CDX 11 response.
        If there's no warc for the requested GUID, or the requested URL isn't stored in that WARC, it returns a 404.
    """
    # find requested link and url
    try:
        link = Link.objects.select_related().get(pk=request.GET.get('guid'))
    except Link.DoesNotExist:
        print "COULDN'T FIND LINK"
        raise Http404
    url = request.GET.get('url', link.submitted_url)

    # get warc file
    for asset in link.assets.all():
        if '.warc' in asset.warc_capture:
            warc_path = os.path.join(asset.base_storage_path, asset.warc_capture)
            break
    else:
        print "COULDN'T FIND WARC"
        raise Http404 # no .warc file -- do something to handle this?

    # get cdx file
    cdx_path = warc_path.replace('.gz', '').replace('.warc', '.cdx')
    if not default_storage.exists(cdx_path):
        # if we can't find the CDX file associated with this WARC, create it
        with default_storage.open(warc_path, 'rb') as warc_file, default_storage.open(cdx_path, 'wb') as cdx_file:
            write_cdx_index(cdx_file, warc_file, warc_path, sort=True)

    cdx_lines = default_storage.open(cdx_path, 'rb')

    # find cdx lines for url
    sorted_url = surt.surt(url)
    out = ""
    for line in cdx_lines:
        if line.startswith(sorted_url+" "):
            out += line
        elif out:
            # file may contain multiple matching lines in a row; we want to return all of them
            # if we've already found one or more matching lines, and now they're no longer matching, we're done
            break

    if out:
        return HttpResponse(out, content_type="text/plain")

    print "COULDN'T FIND URL"
    raise Http404 # didn't find requested url in .cdx file
Example 46
 def _dini(self, url):
     ext = tldextract.extract(url)
     urlseg = urlparse("http://" + url)
     reg_dom = surt(ext.registered_domain)
     if reg_dom[0].isalpha() and ")/" in reg_dom:
         subdom_len = path_len = query_len = 0
         path_init = urlseg.path.strip("\n\r/")[:1]
         if ext.subdomain:
             subdom_len = ext.subdomain.count(".") + 1
         if urlseg.path:
             path_len = urlseg.path.strip("\n\r/").count("/") + 1
         if urlseg.query:
             query_len = urlseg.query.strip("?&").count("&") + 1
         if not path_init.isalnum():
             path_init = "-"
         return "{0}{1}/{2}/{3}/{4}".format(reg_dom, subdom_len, path_len, query_len, path_init)
Example 47
    def load_cdx(self, url, params = {}, parsed_cdx = True, **kwvalues):
        # canonicalize to surt (canonicalization is part of surt conversion)
        try:
            key = surt.surt(url)
        except Exception as e:
            raise wbexceptions.BadUrlException('Bad Request Url: ' + url)

        # if not surt, unsurt the surt to get canonicalized non-surt url
        if not self.surt_ordered:
            key = utils.unsurt(key)

        match_func = binsearch.iter_exact

        params.update(**kwvalues)
        params['output'] = 'raw' if parsed_cdx else 'text'

        return cdxserve.cdx_serve(key, params, self.sources, match_func)
Example 48
def canonicalize(url, surt_ordered=True):
    """
    Canonicalize url and convert to surt
    If not in surt ordered mode, convert back to url form
    as surt conversion is currently part of canonicalization

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=True)
    'com,example)/path/file.html'

    >>> canonicalize('http://example.com/path/file.html', surt_ordered=False)
    'example.com/path/file.html'
    """
    try:
        key = surt.surt(url)
    except Exception as e:
        raise UrlCanonicalizeException('Invalid Url: ' + url)

    # if not surt, unsurt the surt to get canonicalized non-surt url
    if not surt_ordered:
        key = unsurt(key)

    return key
Example 49
def regenerate_urlkeys(urlkey_prefix='file'):
    """
        Rewrite CDXLine urlkeys using the current version of the surt library.
    """

    from perma.models import CDXLine
    from surt import surt

    target_cdxlines = CDXLine.objects.all()
    if urlkey_prefix:
        target_cdxlines = target_cdxlines.filter(urlkey__startswith=urlkey_prefix)

    for i, cdxline in enumerate(target_cdxlines):
        if not (i%1000):
            print "%s records done -- next is %s." % (i, cdxline.link_id)
        new_surt = surt(cdxline.parsed['url'])
        if new_surt != cdxline.urlkey:
            try:
                cdxline.raw = cdxline.raw.replace(cdxline.urlkey, new_surt, 1)
            except UnicodeDecodeError:
                print "Skipping unicode for %s" % cdxline.link_id
                continue
            cdxline.urlkey = new_surt
            cdxline.save()
Example 50
def getBing(url,outputArray, indexOfOutputArray,verbose=False, **kwargs):
	apiKey = []
	try:
		apiKey_env=os.getenv('CD_Bing_key')
		if apiKey_env is not None:
			logging.debug ( 'cdGetBing: Bing api key detected in environment variable, overwite local config values.')
			apiKey=apiKey_env
		else:
			fileConfig = open(os.path.dirname(__file__)+"/../config", "r")
			config = fileConfig.read()
			fileConfig.close()

			apiKey = json.loads(config)
			apiKey = apiKey['BingAPIKey']
	except:
		logging.debug ( 'cdGetBing: ', sys.exc_info() )
		return ''

	if( len(apiKey) == 0 ):
		logging.info ( 'cdGetBing: apiKey empty' )
		return ''
	elif( apiKey == 'YourBingSearchAPIKey' ):
		logging.info ( 'cdGetBing.py: please set Bing search api key in config' )
		return ''



	api_key = apiKey
	headers = {
		'User-Agent': 'Mozilla/5.0 (X11; Linux i686 (x86_64); rv:2.0b4pre) Gecko/20100812 Minefield/4.0b4pre',
		'Ocp-Apim-Subscription-Key':api_key
		}
	base_url = 'https://api.cognitive.microsoft.com/bing/v5.0/search?q='
	parsedUrl = urllib.parse.urlparse(url)
	if( len(parsedUrl.scheme)<1 ):
		url = 'http://'+url
	searchUrl=url[7:]
	converted_url=quote(url,safe='')
	url = base_url + converted_url +'&count=10'
	auth = HTTPBasicAuth(api_key,api_key)

	response = requests.get(url, headers=headers)
	json_result=response.json()
	#print json_result

	result=''
	canonical_search_url=surt(searchUrl)
	for category in json_result:
		if category == 'webPages' :
			for page in json_result[category]['value']:
				result_url=surt(page['displayUrl'])
				if result_url==canonical_search_url :
					result = page['dateLastCrawled']
					break

		elif category == 'images' :
			for page in json_result[category]['value']:
				result_url=surt(page['contentUrl'])
				if result_url==canonical_search_url :
					result = page['datePublished']
					break

		elif category == 'news' :
			for page in json_result[category]['value']:
				result_url=surt(page['url'])
				if result_url==canonical_search_url :
					result = page['datePublished']
					break

		elif category == 'videos' :
			for page in json_result[category]['value']:
				result_url=surt(page['hostPageDisplayUrl'])
				if result_url==canonical_search_url :
					result = page['datePublished']
					break
		if result != '' :
			break

	outputArray[indexOfOutputArray]=result
	kwargs['displayArray'][indexOfOutputArray] = result
	logging.debug ( 'Done Bing' )
	return result
Example 51
    def apply_filters(self, wbrequest, matcher):
        """Parse the GUID and find the CDXLine in the DB"""

        guid = matcher.group(1)
        cache_key = guid+'-cdx'
        cached_cdx = django_cache.get(cache_key)
        redirect_matcher = re.compile(r' 30[1-7] ')
        if cached_cdx is None or not wbrequest.wb_url:
            with close_database_connection():
                try:
                    # This will filter out links that have user_deleted=True
                    link = Link.objects.get(guid=guid)
                except Link.DoesNotExist:
                    raise_not_found(wbrequest.wb_url)

                if not wbrequest.wb_url:
                    # This is a bare request to /warc/1234-5678/ -- return so we can send a forward to submitted_url in PermaGUIDHandler.
                    wbrequest.custom_params['guid'] = guid
                    wbrequest.custom_params['url'] = link.submitted_url
                    return

                # Legacy archives didn't generate CDXLines during
                # capture so generate them on demand if not found, unless
                # A: the warc capture hasn't been generated OR
                # B: we know other cdx lines have already been generated
                #    and the requested line is simply missing
                lines = list(link.cdx_lines.all())
                if not lines:

                    # TEMP: remove after all legacy warcs have been exported
                    if not default_storage.exists(link.warc_storage_file()):
                        link.export_warc()

                    lines = CDXLine.objects.create_all_from_link(link)

                # build a lookup of all cdx lines for this link indexed by urlkey, like:
                # cached_cdx = {'urlkey1':['raw1','raw2'], 'urlkey2':['raw3','raw4']}
                cached_cdx = defaultdict(list)
                for line in lines:
                    cached_cdx[line.urlkey].append(str(line.raw))

                # remove any redirects if we also have a non-redirect capture for the same URL, to prevent redirect loops
                for urlkey, lines in cached_cdx.iteritems():
                    if len(lines) > 1:
                        lines_without_redirects = [line for line in lines if not redirect_matcher.search(line)]
                        if lines_without_redirects:
                            cached_cdx[urlkey] = lines_without_redirects

                django_cache.set(cache_key, cached_cdx)

        urlkey = surt(wbrequest.wb_url.url)
        cdx_lines = cached_cdx.get(urlkey)
        if not cdx_lines:
            raise_not_found(wbrequest.wb_url)

        # Store the line for use in PermaCDXSource
        # so we don't need to hit the DB again
        wbrequest.custom_params['lines'] = cdx_lines
        wbrequest.custom_params['guid'] = guid

        # Adds the Memento-Datetime header
        # Normally this is done in MementoReqMixin#_parse_extra
        # but we need the GUID to make the DB query and that
        # isn't parsed from the url until this point
        wbrequest.wb_url.set_replay_timestamp(CDXLine(raw=cdx_lines[0]).timestamp)
Example 52
def show_uri(path, datetime=None):
    global IPFS_API

    daemonAddress = '{0}:{1}'.format(IPFSAPI_HOST, IPFSAPI_PORT)
    if not ipwbUtils.isDaemonAlive(daemonAddress):
        errStr = ('IPFS daemon not running. '
                  'Start it using $ ipfs daemon on the command-line '
                  ' or from the <a href="/">'
                  'IPWB replay homepage</a>.')
        return Response(errStr, status=503)

    path = getCompleteURI(path)
    cdxjLine = ''
    try:
        surtedURI = surt.surt(
                     path, path_strip_trailing_slash_unless_empty=False)
        indexPath = ipwbUtils.getIPWBReplayIndexPath()

        searchString = surtedURI
        if datetime is not None:
            searchString = surtedURI + ' ' + datetime

        cdxjLine = getCDXJLine_binarySearch(searchString, indexPath)

    except Exception as e:
        print(sys.exc_info()[0])
        respString = ('{0} not found :(' +
                      ' <a href="http://{1}:{2}">Go home</a>').format(
            path, IPWBREPLAY_HOST, IPWBREPLAY_PORT)
        return Response(respString)
    if cdxjLine is None:  # Resource not found in archives
        return generateNoMementosInterface(path, datetime)

    cdxjParts = cdxjLine.split(" ", 2)
    jObj = json.loads(cdxjParts[2])
    datetime = cdxjParts[1]

    digests = jObj['locator'].split('/')

    class HashNotFoundError(Exception):
        pass

    payload = None
    header = None
    try:
        def handler(signum, frame):
            raise HashNotFoundError()

        if os.name != 'nt':  # Bug #310
            signal.signal(signal.SIGALRM, handler)
            signal.alarm(10)

        payload = IPFS_API.cat(digests[-1])
        header = IPFS_API.cat(digests[-2])

        if os.name != 'nt':  # Bug #310
            signal.alarm(0)

    except ipfsapi.exceptions.TimeoutError:
        print("{0} not found at {1}".format(cdxjParts[0], digests[-1]))
        respString = ('{0} not found in IPFS :(' +
                      ' <a href="http://{1}:{2}">Go home</a>').format(
            path, IPWBREPLAY_HOST, IPWBREPLAY_PORT)
        return Response(respString)
    except TypeError as e:
        print('A type error occurred')
        print(e)
        abort(500)
    except HTTPError as e:
        print("Fetching from the IPFS failed")
        print(e)
        abort(503)
    except HashNotFoundError:
        if payload is None:
            print("Hashes not found:\n\t{0}\n\t{1}".format(
                digests[-1], digests[-2]))
            abort(404)
        else:  # payload found but not header, fabricate header
            print("HTTP header not found, fabricating for resp replay")
            header = b''
    except Exception as e:
        print('Unknown exception occurred while fetching from ipfs.')
        print(e)
        abort(500)

    if 'encryption_method' in jObj:
        keyString = None
        while keyString is None:
            if 'encryption_key' in jObj:
                keyString = jObj['encryption_key']
            else:
                askForKey = ('Enter a path for a file'
                             ' containing the decryption key: \n> ')
                keyString = raw_input(askForKey)

        paddedEncryptionKey = pad(keyString, AES.block_size)
        key = base64.b64encode(paddedEncryptionKey)

        nonce = b64decode(jObj['encryption_nonce'])
        cipher = AES.new(key, AES.MODE_CTR, nonce=nonce)
        header = cipher.decrypt(base64.b64decode(header))
        payload = cipher.decrypt(base64.b64decode(payload))

    hLines = header.decode() \
                   .replace('\r', '') \
                   .replace('\n\t', '\t') \
                   .replace('\n ', ' ') \
                   .split('\n')
    hLines.pop(0)

    status = 200
    if 'status_code' in jObj:
        status = jObj['status_code']

    resp = Response(payload, status=status)

    for idx, hLine in enumerate(hLines):
        k, v = hLine.split(':', 1)

        if k.lower() == 'transfer-encoding' and \
           re.search(r'\bchunked\b', v, re.I):
            try:
                unchunkedPayload = extractResponseFromChunkedData(payload)
            except Exception as e:
                print('Error while dechunking')
                print(sys.exc_info()[0])
                continue  # Data may not actually have been chunked
            resp.set_data(unchunkedPayload)

        if k.lower() not in ["content-type", "content-encoding", "location"]:
            k = "X-Archive-Orig-" + k

        resp.headers[k] = v.strip()

    # Add ipwb header for additional SW logic
    newPayload = resp.get_data()

    lineJSON = cdxjLine.split(' ', 2)[2]
    mime = json.loads(lineJSON)['mime_type']

    if 'text/html' in mime:
        ipwbjsinject = """<script src="/ipwbassets/webui.js"></script>
                      <script>injectIPWBJS()</script>"""

        newPayload = newPayload.decode('utf-8').replace(
            '</html>', ipwbjsinject + '</html>')

        resp.set_data(newPayload)

    resp.headers['Memento-Datetime'] = ipwbUtils.digits14ToRFC1123(datetime)

    if not header:
        resp.headers['X-Headers-Generated-By'] = 'InterPlanetary Wayback'

    # Get TimeMap for Link response header
    # respWithLinkHeader = getLinkHeaderAbbreviatedTimeMap(path, datetime)
    # resp.headers['Link'] = respWithLinkHeader.replace('\n', ' ')

    if str(status)[0] == '3' and isUri(resp.headers.get('Location')):
        # Bad assumption that the URI-M will contain \d14 but works for now.
        uriBeforeURIR = request.url[:re.search(r'/\d{14}/', request.url).end()]
        newURIM = uriBeforeURIR + resp.headers['Location']
        resp.headers['Location'] = newURIM

    return resp
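A CDXJ record, as consumed above, is the SURT-formatted URI key, a 14-digit datetime, and a JSON block whose locator carries the IPFS hashes for the stored header and payload (second-to-last and last path segments, respectively). A minimal sketch of that parsing step, with a made-up sample line and placeholder hashes:

import json

sample_cdxj = ('org,example)/ 20170101120000 '
               '{"locator": "urn:ipfs/QmHeaderHashExample/QmPayloadHashExample", '
               '"mime_type": "text/html", "status_code": "200"}')

surt_uri, datetime14, json_part = sample_cdxj.split(' ', 2)
record = json.loads(json_part)
header_digest, payload_digest = record['locator'].split('/')[-2:]
print(surt_uri, datetime14, record['mime_type'], header_digest, payload_digest)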
Example no. 53
def test_surt():
    # These tests are from WaybackURLKeyMakerTest.java

    assert surt.surt(None) == '-'
    assert surt.surt('') == '-'
    assert surt.surt("filedesc:foo.arc.gz") == 'filedesc:foo.arc.gz'
    assert surt.surt("filedesc:/foo.arc.gz") == 'filedesc:/foo.arc.gz'
    assert surt.surt("filedesc://foo.arc.gz") == 'filedesc://foo.arc.gz'
    assert surt.surt("warcinfo:foo.warc.gz") == 'warcinfo:foo.warc.gz'
    assert surt.surt("dns:alexa.com") == 'dns:alexa.com'
    assert surt.surt("dns:archive.org") == 'dns:archive.org'

    assert surt.surt("http://www.archive.org/") == 'org,archive)/'
    assert surt.surt("http://archive.org/") == 'org,archive)/'
    assert surt.surt("http://archive.org/goo/") == 'org,archive)/goo'
    assert surt.surt("http://archive.org/goo/?") == 'org,archive)/goo'
    assert surt.surt("http://archive.org/goo/?b&a") == 'org,archive)/goo?a&b'
    assert surt.surt("http://archive.org/goo/?a=2&b&a=1") == 'org,archive)/goo?a=1&a=2&b'

    # trailing comma mode
    assert surt.surt("http://archive.org/goo/?a=2&b&a=1", trailing_comma=True) == 'org,archive,)/goo?a=1&a=2&b'
    assert surt.surt("dns:archive.org", trailing_comma=True) == 'dns:archive.org'
    assert surt.surt("warcinfo:foo.warc.gz", trailing_comma=True) == 'warcinfo:foo.warc.gz'

    # PHP session id:
    assert surt.surt("http://archive.org/index.php?PHPSESSID=0123456789abcdefghijklemopqrstuv&action=profile;u=4221") == 'org,archive)/index.php?action=profile;u=4221'

    # WHOIS url:
    assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il") == 'il,org,isoc,whois)/shaveh.co.il'

    # Yahoo web bug. See https://github.com/internetarchive/surt/issues/1
    assert surt.surt('http://visit.webhosting.yahoo.com/visit.gif?&r=http%3A//web.archive.org/web/20090517140029/http%3A//anthonystewarthead.electric-chi.com/&b=Netscape%205.0%20%28Windows%3B%20en-US%29&s=1366x768&o=Win32&c=24&j=true&v=1.2') == 'com,yahoo,webhosting,visit)/visit.gif?&b=netscape%205.0%20(windows;%20en-us)&c=24&j=true&o=win32&r=http://web.archive.org/web/20090517140029/http://anthonystewarthead.electric-chi.com/&s=1366x768&v=1.2'

    # Simple customization:
    assert surt.surt("http://www.example.com/", canonicalizer=lambda x, **opts: x) == 'com,example,www)/'
    assert surt.surt("mailto:[email protected]") == 'mailto:[email protected]'
    assert surt.surt("http://www.example.com/", with_scheme=True) == 'http://(com,example)/'
    assert surt.surt("http://www.example.com/", with_scheme=True, host_massage=True) == 'http://(com,example)/'
    assert surt.surt("http://www.example.com/", with_scheme=False) == 'com,example)/'
    assert surt.surt("http://www.example.com/", with_scheme=True, trailing_comma=True) == 'http://(com,example,)/'
    assert surt.surt("https://www.example.com/", with_scheme=True, trailing_comma=True) == 'https://(com,example,)/'
    assert surt.surt("ftp://www.example.com/", with_scheme=False, trailing_comma=True) == 'com,example,)/'
    assert surt.surt("ftp://www.example.com/", with_scheme=False, trailing_comma=False) == 'com,example)/'
    assert surt.surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True) == 'ftp://(com,example,)/'
    assert surt.surt("http://www.example.com/", with_scheme=True, host_massage=False) == 'http://(com,example,www)/'
    assert surt.surt("http://www.example.com/", with_scheme=False, host_massage=False) == 'com,example,www)/'
    assert surt.surt("http://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'http://(com,example,www,)/'
    assert surt.surt("https://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'https://(com,example,www,)/'
    assert surt.surt("ftp://www.example.com/", with_scheme=True, trailing_comma=True, host_massage=False) == 'ftp://(com,example,www,)/'

    assert surt.surt("mailto:[email protected]", with_scheme=True) == 'mailto:[email protected]'
    assert surt.surt("mailto:[email protected]", trailing_comma=True) == 'mailto:[email protected]'
    assert surt.surt("mailto:[email protected]", with_scheme=True, trailing_comma=True) == 'mailto:[email protected]'
    assert surt.surt("dns:archive.org", with_scheme=True) == 'dns:archive.org'
    assert surt.surt("dns:archive.org", trailing_comma=True) == 'dns:archive.org'
    assert surt.surt("dns:archive.org", with_scheme=True, trailing_comma=True) == 'dns:archive.org'
    assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il", with_scheme=True) == 'whois://(il,org,isoc,whois)/shaveh.co.il'
    assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il", trailing_comma=True) == 'il,org,isoc,whois,)/shaveh.co.il'
    assert surt.surt("whois://whois.isoc.org.il/shaveh.co.il", trailing_comma=True, with_scheme=True) == 'whois://(il,org,isoc,whois,)/shaveh.co.il'
    assert surt.surt("warcinfo:foo.warc.gz", trailing_comma=True) == 'warcinfo:foo.warc.gz'
    assert surt.surt("warcinfo:foo.warc.gz", with_scheme=True) == 'warcinfo:foo.warc.gz'
    assert surt.surt("warcinfo:foo.warc.gz", with_scheme=True, trailing_comma=True) == 'warcinfo:foo.warc.gz'
Example no. 54
    def apply_filters(self, wbrequest, matcher):
        """Parse the GUID and find the CDXLine in the DB"""

        guid = matcher.group(1)
        cache_key = Link.get_cdx_cache_key(guid)
        cached_cdx = django_cache.get(cache_key)
        redirect_matcher = re.compile(r' 30[1-7] ')
        if cached_cdx is None or not wbrequest.wb_url:
            with opbeat_trace('cdx-cache-miss'), close_database_connection():
                try:
                    # This will filter out links that have user_deleted=True
                    link = Link.objects.get(guid=guid)
                except Link.DoesNotExist:
                    raise_not_found(wbrequest.wb_url)

                if not wbrequest.wb_url:
                    # This is a bare request to /warc/1234-5678/ -- return so we can send a forward to submitted_url in PermaGUIDHandler.
                    wbrequest.custom_params['guid'] = guid
                    wbrequest.custom_params['url'] = link.safe_url
                    return

                # Legacy archives didn't generate CDXLines during
                # capture so generate them on demand if not found, unless
                # A: the warc capture hasn't been generated OR
                # B: we know other cdx lines have already been generated
                #    and the requested line is simply missing
                lines = CDXLine.objects.filter(link_id=link.guid)

                if not lines:
                    lines = CDXLine.objects.create_all_from_link(link)

                # build a lookup of all cdx lines for this link indexed by urlkey, like:
                # cached_cdx = {'urlkey1':['raw1','raw2'], 'urlkey2':['raw3','raw4']}
                cached_cdx = defaultdict(list)
                for line in lines:
                    cached_cdx[line.urlkey].append(str(line.raw))

                # remove any redirects if we also have a non-redirect capture for the same URL, to prevent redirect loops
                for urlkey, lines in cached_cdx.iteritems():
                    if len(lines) > 1:
                        lines_without_redirects = [line for line in lines if not redirect_matcher.search(line)]
                        if lines_without_redirects:
                            cached_cdx[urlkey] = lines_without_redirects

                # record whether link is private so we can enforce permissions
                cached_cdx['is_private'] = link.is_private

                django_cache.set(cache_key, cached_cdx)

        # enforce permissions
        if cached_cdx.get('is_private'):
            # if user is allowed to access this private link, they will have a cookie like GUID=<token>,
            # which can be validated with link.validate_access_token()
            cookie = Cookie.SimpleCookie(wbrequest.env.get('HTTP_COOKIE')).get(guid)
            if not cookie:
                raise CustomTemplateException(status='400 Bad Request',
                                              template_path='archive/missing-cookie.html',
                                              template_kwargs={
                                                  'content_host': settings.WARC_HOST,
                                              })
            if not Link(pk=guid).validate_access_token(cookie.value, 3600):
                raise_not_found(wbrequest.wb_url)

        # check whether archive contains the requested URL
        urlkey = surt(wbrequest.wb_url.url)
        cdx_lines = cached_cdx.get(urlkey)
        if not cdx_lines:
            raise_not_found(wbrequest.wb_url)

        # Store the line for use in PermaCDXSource
        # so we don't need to hit the DB again
        wbrequest.custom_params['lines'] = cdx_lines
        wbrequest.custom_params['guid'] = guid

        # Adds the Memento-Datetime header
        # Normally this is done in MementoReqMixin#_parse_extra
        # but we need the GUID to make the DB query and that
        # isn't parsed from the url until this point
        wbrequest.wb_url.set_replay_timestamp(CDXLine(raw=cdx_lines[0]).timestamp)
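The redirect-pruning step above can be illustrated in isolation: when a urlkey has both a 30x capture and a non-redirect capture, only the non-redirect lines are kept so replay does not loop. A self-contained sketch with two hypothetical raw CDX lines:

import re
from collections import defaultdict

redirect_matcher = re.compile(r' 30[1-7] ')

raw_lines = [
    'org,example)/ 20170101120000 http://example.org/ text/html 301 - - - 500 0 a.warc.gz',
    'org,example)/ 20170101120005 http://example.org/ text/html 200 - - - 900 500 a.warc.gz',
]

cached_cdx = defaultdict(list)
for raw in raw_lines:
    cached_cdx[raw.split(' ')[0]].append(raw)

# Drop redirect captures for a urlkey when a non-redirect capture also exists.
for urlkey, entries in cached_cdx.items():
    if len(entries) > 1:
        non_redirects = [l for l in entries if not redirect_matcher.search(l)]
        if non_redirects:
            cached_cdx[urlkey] = non_redirects

assert all(' 200 ' in l for l in cached_cdx['org,example)/'])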
Example no. 55
    def urlkey(self, url):
        """compute urlkey from `url`."""
        return surt(url, canonicalizer=self.canonicalize)
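surt() accepts a canonicalizer callable, so a class like the one above can funnel all key generation through its own normalization method. A minimal sketch using a pass-through canonicalizer (the passthrough function is an illustrative stand-in, not the method referenced above); with no massaging applied the 'www' label survives, matching the customization test shown earlier:

import surt

def passthrough(hurl, **options):
    # A canonicalizer receives a handyurl object plus keyword options; returning it
    # unchanged skips the default normalization steps.
    return hurl

assert surt.surt('http://www.example.com/', canonicalizer=passthrough) == 'com,example,www)/'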
Example no. 56
    def __init__(self, file, out_file=sys.stdout, format="N b a m s k r M S V g", use_full_path=False, file_prefix=None, all_records=False, screenshot_mode=False, exclude_list=None, stats_file=None):

        self.field_map = {'M': 'AIF meta tags',
                          'N': 'massaged url',
                          'S': 'compressed record size',
                          'V': 'compressed arc file offset',
                          'a': 'original url',
                          'b': 'date',
                          'g': 'file name',
                          'k': 'new style checksum',
                          'm': 'mime type',
                          'r': 'redirect',
                          's': 'response code',
                         }

        self.file   = file
        self.out_file = out_file
        self.format = format
        self.all_records  = all_records
        self.screenshot_mode = screenshot_mode
        self.crlf_pattern = re.compile(r'\r?\n\r?\n')
        self.response_pattern = re.compile(r'^application/http;\s*msgtype=response$', re.I)

        #similar to what the wayback uses:
        self.fake_build_version = "archive-commons.0.0.1-SNAPSHOT-20120112102659-python"

        #these fields are set for each record in the warc
        self.offset        = 0
        self.surt          = None
        self.mime_type     = None
        self.headers       = None
        self.content       = None
        self.meta_tags     = None
        self.response_code = None

        #Large html files cause lxml to segfault
        #problematic file was 154MB, we'll stop at 5MB
        self.lxml_parse_limit = 5 * 1024 * 1024

        if use_full_path:
            self.warc_path = os.path.abspath(file)
        elif file_prefix:
            self.warc_path = os.path.join(file_prefix, file)
        else:
            self.warc_path = file

        if exclude_list:
            if not os.path.exists(exclude_list):
                raise IOError, "Exclude file not found"
            self.excludes = []
            f = open(exclude_list, 'r')
            for line in f:
                if '' == line.strip():
                    continue
                url = line.split()[0]
                self.excludes.append(surt(url))
        else:
            self.excludes = None

        if stats_file:
            if os.path.exists(stats_file):
                raise IOError, "Stats file already exists"
            self.stats_file = stats_file
        else:
            self.stats_file = None
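The default format string "N b a m s k r M S V g" is the classic CDX header layout, and field_map spells out what each one-letter code means. A small sketch of how a header line for that format could be validated and rendered from such a map (build_cdx_header is an illustrative helper, not part of this class):

def build_cdx_header(format_string, field_map):
    """Render a CDX header line such as ' CDX N b a m s k r M S V g'."""
    for code in format_string.split(' '):
        # Refuse unknown one-letter field codes up front.
        assert code in field_map, 'unknown CDX field code: %s' % code
    return ' CDX ' + format_string

field_map = {'N': 'massaged url', 'b': 'date', 'a': 'original url', 'm': 'mime type',
             's': 'response code', 'k': 'new style checksum', 'r': 'redirect',
             'M': 'AIF meta tags', 'S': 'compressed record size',
             'V': 'compressed arc file offset', 'g': 'file name'}

print(build_cdx_header('N b a m s k r M S V g', field_map))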
Example no. 57
def brozzler_list_captures():
    '''
    Handy utility for looking up entries in the rethinkdb "captures" table by
    url or sha1.
    '''
    import surt
    import rethinkdb

    arg_parser = argparse.ArgumentParser(
            prog=os.path.basename(sys.argv[0]),
            formatter_class=argparse.ArgumentDefaultsHelpFormatter)
    _add_rethinkdb_options(arg_parser)
    _add_common_options(arg_parser)
    arg_parser.add_argument(
            'url_or_sha1', metavar='URL_or_SHA1',
            help='url or sha1 to look up in captures table')

    args = arg_parser.parse_args(args=sys.argv[1:])
    _configure_logging(args)

    r = rethinkstuff.Rethinker(
            args.rethinkdb_servers.split(','), args.rethinkdb_db)

    class Jsonner(json.JSONEncoder):
        def default(self, o):
            if isinstance(o, datetime.datetime):
                return o.isoformat()
            return json.JSONEncoder.default(self, o)

    if args.url_or_sha1[:5] == 'sha1:':
        raise Exception('not implemented')
        # def find_response_by_digest(self, algo, raw_digest, bucket="__unspecified__"):
        #     if algo != "sha1":
        #         raise Exception(
        #                 "digest type is %s but big captures table is indexed by "
        #                 "sha1" % algo)
        #     sha1base32 = base64.b32encode(raw_digest).decode("utf-8")
        #     results_iter = self.r.table(self.table).get_all([sha1base32, "response", bucket], index="sha1_warc_type").run()
        #     results = list(results_iter)
        #     if len(results) > 0:
        #         if len(results) > 1:
        #             self.logger.debug("expected 0 or 1 but found %s results for sha1base32=%s bucket=%s (will use first result)", len(results), sha1base32, bucket)
        #         result = results[0]
        #     else:
        #         result = None
        #     self.logger.debug("returning %s for sha1base32=%s bucket=%s",
        #                       result, sha1base32, bucket)
        #     return result
    else:
        key = surt.surt(
                args.url_or_sha1, trailing_comma=True, host_massage=False,
                with_scheme=True)
        reql = r.table('captures').between(
                [key[:150], rethinkdb.minval],
                [key[:150]+'!', rethinkdb.maxval],
                index='abbr_canon_surt_timestamp')
        reql = reql.order_by(index='abbr_canon_surt_timestamp')
        reql = reql.filter(
                lambda capture: (capture['canon_surt'] >= key)
                                 & (capture['canon_surt'] <= key))
        logging.debug('rethinkdb query: %s', reql)
        results = reql.run()
        for result in results:
            print(json.dumps(result, cls=Jsonner, indent=2))
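The lookup above works because captures are indexed by an abbreviated canonical SURT: the key is built with the same surt() options used at write time, truncated to 150 characters for the between() range, and then filtered on the full canon_surt. A short sketch of just the key construction (example.com is an arbitrary URL):

import surt

url = 'https://example.com/some/long/path?x=1'
key = surt.surt(url, trailing_comma=True, host_massage=False, with_scheme=True)
abbr_key = key[:150]  # prefix length assumed to match the abbr_canon_surt_timestamp index

print(key)       # e.g. 'https://(com,example,)/some/long/path?x=1'
print(abbr_key)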
Example no. 58
    def urlkey(self, url):
        """compute urlkey from `url`."""
        return surt(url, **dict(self.canonicalizer_options))
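Here the SURT options live in a configurable mapping rather than being hard-coded, so key style can be adjusted per deployment. A minimal sketch of how such an options mapping might feed into surt(), using option values taken from the tests earlier on this page:

import surt

canonicalizer_options = {'trailing_comma': True, 'with_scheme': False}

def urlkey(url, options=canonicalizer_options):
    """Compute a urlkey with configurable SURT options."""
    return surt.surt(url, **dict(options))

assert urlkey('ftp://www.example.com/') == 'com,example,)/'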