Code Example #1
    def run(self):

        with open(self.warcfile, 'ab') as output:
            while True:
                self.lock.acquire()
                data = self.out_queue.get()
                writer = WARCWriter(output, gzip=False)
                headers_list = data[0]
                http_headers = StatusAndHeaders('{} {}'.format(
                    data[3], data[4]),
                                                headers_list,
                                                protocol='HTTP/1.0')
                record = writer.create_warc_record(data[2],
                                                   'response',
                                                   payload=data[1],
                                                   http_headers=http_headers)
                h = hashlib.sha1()
                h.update(record.raw_stream.read(BLOCK_SIZE))
                if self.dedup.lookup(h.hexdigest()):
                    record = writer.create_warc_record(
                        data[2], 'revisit', http_headers=http_headers)
                    writer.write_record(record)
                    self.out_queue.task_done()
                    self.lock.release()
                else:
                    self.dedup.save(h.hexdigest(), data[2])
                    record.raw_stream.seek(0)
                    writer.write_record(record)
                    self.out_queue.task_done()
                    self.lock.release()
Code Example #2
File: pipeline.py  Project: ArchiveTeam/domains-grab
    def process(self, item):
        os.rename('%(item_dir)s/%(warc_file_base)s.warc.gz' % item,
                  '%(data_dir)s/%(warc_file_base)s.warc.gz' % item)
        os.rename('%(item_dir)s/%(warc_file_base)s_data.txt' % item,
                  '%(data_dir)s/%(warc_file_base)s_data.txt' % item)

        has_metadata = False

        with open('%(data_dir)s/%(warc_file_base)s.warc.gz' % item, 'rb') as f:
            for record in ArchiveIterator(f):
                if record.rec_type == 'warcinfo':
                    info_id = record.rec_headers.get_header('WARC-Record-ID')
                    for l in record.content_stream().read().split(b'\r\n'):
                        if l.startswith(b'wget-arguments'):
                            wget_arguments = l.split(b':', 1)[1].strip()
                if record.rec_type == 'resource':
                    has_metadata = True

        if not has_metadata:
            with open('%(data_dir)s/%(warc_file_base)s-tail.warc.gz' % item,
                      'wb') as f:
                writer = WARCWriter(f, gzip=True)
                record = writer.create_warc_record(
                    'metadata://gnu.org/software/wget/warc/MANIFEST.txt',
                    'resource',
                    payload=io.BytesIO(bytes(info_id, 'utf8') + b'\n'),
                    warc_headers_dict={
                        'WARC-Warcinfo-ID': info_id,
                        'Content-Type': 'text/plain'
                    })
                manifest_id = record.rec_headers.get_header('WARC-Record-ID')
                writer.write_record(record)
                record = writer.create_warc_record(
                    'metadata://gnu.org/software/wget/warc/wget_arguments.txt',
                    'resource',
                    payload=io.BytesIO(wget_arguments + b'\n'),
                    warc_headers_dict={
                        'WARC-Warcinfo-ID': info_id,
                        'WARC-Concurrent-To': manifest_id,
                        'Content-Type': 'text/plain'
                    })
                writer.write_record(record)
                with open('%(item_dir)s/wget.log' % item, 'rb') as f_log:
                    record = writer.create_warc_record(
                        'metadata://gnu.org/software/wget/warc/wget.log',
                        'resource',
                        payload=f_log,
                        warc_headers_dict={
                            'WARC-Warcinfo-ID': info_id,
                            'WARC-Concurrent-To': manifest_id,
                            'Content-Type': 'text/plain'
                        })
                    writer.write_record(record)
        else:
            open('%(data_dir)s/%(warc_file_base)s-tail.warc.gz' % item,
                 'w').close()

        shutil.rmtree('%(item_dir)s' % item)
Code Example #3
File: api.py  Project: peterk/binhydrate
def fetch_urls_to_warc(urls, warcfile_path):
    """Fetch urls and write to warc file

    :urls: list of urls to binary files
    :warcfile_path: path to a WARC file.

    """

    with open(warcfile_path, 'wb') as output:
        writer = WARCWriter(output, gzip=True)

        for url in urls:
            print(url)
            resp = requests.get(url,
                                headers={'Accept-Encoding': 'identity'},
                                stream=True)

            headers_list = resp.raw.headers.items()
            http_headers = StatusAndHeaders('200 OK',
                                            headers_list,
                                            protocol='HTTP/1.0')
            record = writer.create_warc_record(url,
                                               'response',
                                               payload=resp.raw,
                                               http_headers=http_headers)
            writer.write_record(record)
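
A minimal usage sketch for fetch_urls_to_warc; the URL list and output path below are purely illustrative:

urls = [
    'https://example.com/files/report.pdf',
    'https://example.com/images/logo.png',
]
# Each URL becomes one gzipped 'response' record in the output WARC
fetch_urls_to_warc(urls, 'binaries.warc.gz')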
Code Example #4
def warc_from_response(response, resolved_url):
    f_output = BytesIO()
    writer = WARCWriter(f_output, gzip=True)
    # Response
    response_header_items = list(response.headers.to_unicode_dict().items())
    response_headers = StatusAndHeaders("200 OK", response_header_items, protocol="HTTP/1.0")
    response_record = writer.create_warc_record(resolved_url, "response", payload=BytesIO(response.body), http_headers=response_headers)
    writer.write_record(response_record)
    # Request
    request_header_items = list(response.request.headers.to_unicode_dict().items())
    request_headers = StatusAndHeaders("200 OK", request_header_items, protocol="HTTP/1.0")
    request_record = writer.create_warc_record(resolved_url, "request", payload=BytesIO(response.request.body), http_headers=request_headers)
    request_record.rec_headers.add_header("WARC-Concurrent-To", response_record.rec_headers.get_header("WARC-Record-ID"))
    writer.write_record(request_record)
    contents = f_output.getvalue()
    f_output.close()
    return contents
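
warc_from_response returns the gzipped WARC bytes instead of writing a file. A hedged sketch of how the returned buffer could be inspected with warcio's ArchiveIterator; `response` stands for whatever Scrapy-style response object the caller already has:

from io import BytesIO
from warcio.archiveiterator import ArchiveIterator

warc_bytes = warc_from_response(response, 'https://example.com/')
for rec in ArchiveIterator(BytesIO(warc_bytes)):
    # expect a 'response' record followed by the linked 'request' record
    print(rec.rec_type, rec.rec_headers.get_header('WARC-Target-URI'))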
Code Example #5
    def write_memento(self, murl=None):
        """
        This is a function to write a memento in WARC format.

        Parameters:
            murl (str): URI-M

        Returns:
            (bool): True on Success and False on Failure
        """
        try:
            if self.lookup_memento(murl):
                return True
            else:
                response = Utils.get_murl_info(murl, self.__thandle)
                mpath = self.__memento_dir
                if not os.path.exists(mpath):
                    os.mkdir(mpath)
                mpath = os.path.join(mpath, response["handle"].lower())
                if not os.path.exists(mpath):
                    os.mkdir(mpath)
                mpath = os.path.join(mpath, response["domain"])
                if not os.path.exists(mpath):
                    os.mkdir(mpath)
                mpath = os.path.join(mpath, response["archive"])
                if not os.path.exists(mpath):
                    os.mkdir(mpath)
                mpath = os.path.join(mpath, response["wrep"] + response["lang"])
                if not os.path.exists(mpath):
                    os.mkdir(mpath)
                try:
                    mpath = os.path.join(mpath, str(response["timestamp"]) + self.__constants.WARC_EXT)
                    with open(mpath, "wb") as output:
                        writer = WARCWriter(output, gzip=True)
                        resp = requests.get(murl,
                                            headers={'Accept-Encoding': 'identity'},
                                            stream=True, timeout=120)

                        # get raw headers from urllib3
                        headers_list = resp.raw.headers.items()
                        http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.1')
                        record = writer.create_warc_record(mpath, 'response',
                                                           payload=resp.raw,
                                                           http_headers=http_headers)
                        writer.write_record(record)
                    return True
                except requests.exceptions.TooManyRedirects as err:
                    sys.stderr.write(murl + "Too Many redirects" + "\n")
                except requests.exceptions.ConnectTimeout as err:
                    sys.stderr.write(murl + "Connection Timeout" + "\n")
                except Exception as e:
                    sys.stderr.write("Memento Write Error: " + str(e) + "URL:" + murl + "\n")
        except Exception as e:
            sys.stderr.write("Memento Write Error: " + murl + " " + str(e) + "\n")
        return False
Code Example #6
File: media2warc.py  Project: gwu-libraries/twarc
    def run(self):

        with open(self.warcfile, 'ab') as output:
            while True:
                self.lock.acquire()
                data = self.out_queue.get()
                writer = WARCWriter(output, gzip=False)
                headers_list = data[0]
                http_headers = StatusAndHeaders('{} {}'.format(data[3], data[4]), headers_list, protocol='HTTP/1.0')
                record = writer.create_warc_record(data[2], 'response', payload=data[1], http_headers=http_headers)
                h = hashlib.sha1()
                h.update(record.raw_stream.read(BLOCK_SIZE))
                if self.dedup.lookup(h.hexdigest()):
                    record = writer.create_warc_record(data[2], 'revisit',
                                                       http_headers=http_headers)
                    writer.write_record(record)
                    self.out_queue.task_done()
                    self.lock.release()
                else:
                    self.dedup.save(h.hexdigest(), data[2])
                    record.raw_stream.seek(0)
                    writer.write_record(record)
                    self.out_queue.task_done()
                    self.lock.release()
Code Example #7
def convert_to_warc(website, filename):
    with open(filename + '.warc.gz', 'wb') as output:
        writer = WARCWriter(output, gzip=True)

        resp = requests.get(website,
                            headers={'Accept-Encoding': 'identity'},
                            stream=True)

        # get raw headers from urllib3
        headers_list = resp.raw.headers.items()

        http_headers = StatusAndHeaders('200 OK', headers_list, protocol='HTTP/1.0')

        record = writer.create_warc_record(website, 'response',
                                           payload=resp.raw,
                                           http_headers=http_headers)

        writer.write_record(record)
Code Example #8
File: tools.py  Project: ra2003/crocoite
def errataFix (args):
    errata = args.errata

    with args.input as infd, args.output as outfd:
        writer = WARCWriter (outfd, gzip=True)

        warcinfo = {
                'software': getSoftwareInfo (),
                'tool': 'crocoite-errata', # not the name of the cli tool
                'parameters': {'errata': [errata.uuid]},
                }
        payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
        record = writer.create_warc_record ('', 'warcinfo',
                payload=payload,
                warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
        writer.write_record (record)

        for record in ArchiveIterator (infd):
            fixedRecord = errata.applyFix (record)
            writer.write_record (fixedRecord)
    json.dump (errata.stats, sys.stdout)
    sys.stdout.write ('\n')
    sys.stdout.flush ()
Code Example #9
    def _fetch_warc(self, action_result, url, out_path):
        with open(out_path, "wb") as output:
            writer = WARCWriter(output, gzip=True)

            resp = requests.get(url,
                                headers={"Accept-Encoding": "identity"},
                                stream=True)

            # get raw headers from urllib3
            headers_list = resp.raw.headers.items()

            http_headers = StatusAndHeaders("200 OK",
                                            headers_list,
                                            protocol="HTTP/1.0")

            record = writer.create_warc_record(url,
                                               "response",
                                               payload=resp.raw,
                                               http_headers=http_headers)

            writer.write_record(record)

        return out_path
Code Example #10
class WarcDownloader:
    """
        Download URL with HTTP GET, save to a WARC file and return the decoded text
    """
    def __init__(self,
                 expected_filename,
                 _logger,
                 warcinfo_record_data=None,
                 program_name='WebArticleCurator',
                 user_agent=None,
                 overwrite_warc=True,
                 err_threshold=10,
                 known_bad_urls=None,
                 max_no_of_calls_in_period=2,
                 limit_period=1,
                 proxy_url=None,
                 allow_cookies=False,
                 verify_request=True,
                 stay_offline=False):
        # Store variables
        self._logger = _logger
        self._req_headers = {
            'Accept-Encoding': 'identity',
            'User-agent': user_agent
        }
        self._error_count = 0
        self._error_threshold = err_threshold  # Set the error threshold which causes aborting, to prevent denial

        # Setup download function
        if not stay_offline:
            self.download_url = self._download_url
        else:
            self.download_url = self._dummy_download_url

        if known_bad_urls is not None:  # Setup the list of cached bad URLs to prevent trying to download them again
            with open(known_bad_urls, encoding='UTF-8') as fh:
                self.bad_urls = {line.strip() for line in fh}
        else:
            self.bad_urls = set()

        self.good_urls = set()

        # Setup target file handle
        filename = self._set_target_filename(expected_filename, overwrite_warc)
        self._logger.log('INFO', 'Creating archivefile:', filename)
        self._output_file = open(filename, 'wb')

        self._session = Session()  # Setup session for speeding up downloads
        if proxy_url is not None:  # Set socks proxy if provided
            self._session.proxies['http'] = proxy_url
            self._session.proxies['https'] = proxy_url

        self._allow_cookies = allow_cookies
        self._verify_request = verify_request
        if not self._verify_request:
            disable_warnings(InsecureRequestWarning)

        # Setup rate limiting to prevent hammering the server
        self._requests_get = sleep_and_retry(
            limits(calls=max_no_of_calls_in_period,
                   period=limit_period)(self._http_get_w_cookie_handling))

        self._writer = WARCWriter(self._output_file,
                                  gzip=True,
                                  warc_version='WARC/1.1')
        if warcinfo_record_data is None:  # Otherwise use the parsed data, else custom headers will not be copied
            # INFO RECORD
            # Some custom information about the warc writer program and its settings
            warcinfo_record_data = {
                'software': program_name,
                'arguments': ' '.join(sys.argv[1:]),
                'format': 'WARC File Format 1.1',
                'conformsTo': 'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1-1_latestdraft.pdf'
            }
        info_record = self._writer.create_warcinfo_record(
            filename, warcinfo_record_data)
        self._writer.write_record(info_record)

    @staticmethod
    def _set_target_filename(filename, overwrite_warc):
        if not overwrite_warc:  # Find out next nonexisting warc filename
            num = 0
            while os.path.exists(filename):
                filename2, ext = os.path.splitext(
                    filename)  # Should be filename.warc.gz
                if ext == '.gz' and filename2.endswith('.warc'):
                    filename2, ext2 = os.path.splitext(
                        filename2)  # Should be filename.warc
                    ext = ext2 + ext  # Should be .warc.gz

                filename = '{0}-{1:05d}{2}'.format(filename2, num, ext)
                num += 1
        return filename

    def __del__(self):
        if hasattr(self, '_output_file'):
            # If the program opened a file, then it should gracefully close it on exit!
            self._output_file.close()

    def _http_get_w_cookie_handling(self, *args, **kwargs):
        """
            Extend requests.get with optional cookie purging
        """
        if not self._allow_cookies:
            self._session.cookies.clear()
        return self._session.get(*args, **kwargs)

    def _handle_request_exception(self, url, msg):
        self._logger.log('WARNING', url, msg, sep='\t')

        self._error_count += 1
        if self._error_count >= self._error_threshold:
            raise NameError(
                'Too many error happened! Threshold exceeded! See log for details!'
            )

    @staticmethod
    def _get_peer_name(resp):
        # Must get peer_name before the content is read
        # It has no official API for that:
        # https://github.com/kennethreitz/requests/issues/2158
        # https://github.com/urllib3/urllib3/issues/1071
        # So workaround to be compatible with windows:
        # https://stackoverflow.com/questions/22492484/how-do-i-get-the-ip-address-from-a-http-request-using-the-\
        # requests-library/22513161#22513161
        try:
            # Must get peer_name before the content is read
            peer_name = resp.raw._connection.sock.getpeername()[0]
        except AttributeError:  # On Windows there is no getpeername() attribute on the class...
            try:
                peer_name = resp.raw._connection.sock.socket.getpeername()[0]
            except AttributeError:
                peer_name = 'None'  # Socket closed and could not determine peername...
        return peer_name

    def _dummy_download_url(self, _):
        raise NotImplementedError

    def _download_url(self, url):
        if url in self.bad_urls:
            self._logger.log('DEBUG', 'Not downloading known bad URL:', url)
            return None

        if url in self.good_urls:  # This should not happen!
            self._logger.log(
                'ERROR',
                'Not downloading URL, because it is already downloaded in this session:',
                url)
            return None

        scheme, netloc, path, params, query, fragment = urlparse(url)
        # For safety, urlencode the generated URL... (The URL might be modified in this step.)
        path = quote(path, safe='/%')
        url_reparsed = urlunparse(
            (scheme, netloc, path, params, query, fragment))

        try:  # The actual request (on the reparsed URL, everything else is made on the original URL)
            resp = self._requests_get(url_reparsed,
                                      headers=self._req_headers,
                                      stream=True,
                                      verify=self._verify_request)
        # UnicodeError is originated from idna codec error, LocationParseError is originated from URLlib3 error
        except (UnicodeError, RequestException, LocationParseError) as err:
            self._handle_request_exception(
                url, 'RequestException happened during downloading: {0} \n\n'
                ' The program ignores it and jumps to the next one.'.format(
                    err))
            return None

        if resp.status_code != 200:  # Not HTTP 200 OK
            self._handle_request_exception(
                url, 'Downloading failed with status code: {0} {1}'.format(
                    resp.status_code, resp.reason))
            return None

        # REQUEST (build headers for warc)
        reqv_headers = resp.request.headers
        reqv_headers['Host'] = netloc

        proto = 'HTTP/{0}'.format(
            respv_str[resp.raw.version])  # Friendly protocol name
        reqv_http_headers = StatusAndHeaders('GET {0} {1}'.format(
            urlunparse(('', '', path, params, query, fragment)), proto),
                                             reqv_headers.items(),
                                             is_http_request=True)
        reqv_record = self._writer.create_warc_record(
            url, 'request', http_headers=reqv_http_headers)

        # RESPONSE
        # resp_status need to be stripped else warcio strips the spaces and digest verification will fail!
        resp_status = '{0} {1}'.format(resp.status_code, resp.reason).strip()
        resp_headers_list = resp.raw.headers.items()  # get raw headers from urllib3
        # Must get peer_name before the content is read
        peer_name = self._get_peer_name(resp)

        try:
            data = resp.raw.read()  # To be able to return decoded and also write warc
        except ProtocolError as err:
            self._handle_request_exception(
                url, 'RequestException happened during downloading: {0} \n\n'
                ' The program ignores it and jumps to the next one.'.format(
                    err))
            return None

        if len(data) == 0:
            err = 'Response data has zero length!'
            self._handle_request_exception(
                url, 'RequestException happened during downloading: {0} \n\n'
                ' The program ignores it and jumps to the next one.'.format(
                    err))
            return None

        # warcio hack as \r\n is the record separator and trailing ones will be split and digest will eventually fail!
        if data.endswith(b'\r\n'):  # TODO: Warcio bugreport!
            data = data.rstrip()

        enc = resp.encoding  # Get or detect encoding to decode the bytes of the text to str
        if enc is None:
            enc = detect(data)['encoding']
        try:
            text = data.decode(enc)  # Normal decode process
        except UnicodeDecodeError:
            self._logger.log('WARNING',
                             'DECODE ERROR RETRYING IN \'IGNORE\' MODE:',
                             url,
                             enc,
                             sep='\t')
            text = data.decode(enc, 'ignore')
        data_stream = BytesIO(data)  # Need the original byte stream to write the payload to the warc file

        resp_http_headers = StatusAndHeaders(resp_status,
                                             resp_headers_list,
                                             protocol=proto)
        # Add extra headers like encoding because it is not stored any other way...
        resp_record = self._writer.create_warc_record(
            url,
            'response',
            payload=data_stream,
            http_headers=resp_http_headers,
            warc_headers_dict={
                'WARC-IP-Address': peer_name,
                'WARC-X-Detected-Encoding': enc
            })
        # Everything is OK, write the two WARC records
        self.write_record(reqv_record, url)
        self.write_record(resp_record, url)

        return text

    def write_record(self, record, url):
        self.good_urls.add(url)
        self._writer.write_record(record)
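
A hedged usage sketch for WarcDownloader. The class expects a logger object exposing a log(level, *args, sep=...) method (the original project ships its own Logger), and the module-level imports of the original file (requests, warcio, ratelimit, chardet, etc.) are assumed; the stand-in logger and names below are illustrative only:

class _PrintLogger:
    # minimal stand-in for the project's logger, illustrative only
    def log(self, level, *args, sep=' '):
        print(level, sep.join(str(a) for a in args))

downloader = WarcDownloader('articles.warc.gz', _PrintLogger())
text = downloader.download_url('https://example.com/')  # decoded HTML, or None on failure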
Code Example #11
File: tools.py  Project: ra2003/crocoite
def mergeWarc (files, output):
    # stats
    unique = 0
    revisit = 0
    uniqueLength = 0
    revisitLength = 0

    payloadMap = {}
    writer = WARCWriter (output, gzip=True)

    # Add an additional warcinfo record, describing the transformations. This
    # is not ideal, since
    #   “A ‘warcinfo’ record describes the records that
    #   follow it […] until next ‘warcinfo’”
    #   -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo
    # A warcinfo record is expected at the beginning of every file. But it
    # might have been written by different software, so we don’t want to
    # strip/replace that information, but supplement it.
    warcinfo = {
            'software': getSoftwareInfo (),
            'tool': 'crocoite-merge', # not the name of the cli tool
            'parameters': {'inputs': files},
            }
    payload = BytesIO (json.dumps (warcinfo, indent=2).encode ('utf-8'))
    record = writer.create_warc_record ('', 'warcinfo',
            payload=payload,
            warc_headers_dict={'Content-Type': makeContentType (jsonMime, 'utf-8')})
    writer.write_record (record)

    for l in files:
        with open (l, 'rb') as fd:
            for record in ArchiveIterator (fd):
                if record.rec_type in {'resource', 'response'}:
                    headers = record.rec_headers
                    rid = headers.get_header('WARC-Record-ID')
                    csum = headers.get_header('WARC-Payload-Digest')
                    length = int (headers.get_header ('Content-Length'))
                    dup = payloadMap.get (csum, None)
                    if dup is None:
                        payloadMap[csum] = {'uri': headers.get_header('WARC-Target-URI'),
                                'id': rid, 'date': headers.get_header('WARC-Date')}
                        unique += 1
                        uniqueLength += length
                    else:
                        logging.debug (f'Record {rid} is duplicate of {dup["id"]}')
                        # Payload may be identical, but HTTP headers are
                        # (probably) not. Include them.
                        record = writer.create_revisit_record (
                                headers.get_header('WARC-Target-URI'), digest=csum,
                                refers_to_uri=dup['uri'], refers_to_date=dup['date'],
                                http_headers=record.http_headers)
                        record.rec_headers.add_header ('WARC-Truncated', 'length')
                        record.rec_headers.add_header ('WARC-Refers-To', dup['id'])
                        revisit += 1
                        revisitLength += length
                else:
                    unique += 1
                writer.write_record (record)
    json.dump (dict (
            unique=dict (records=unique, bytes=uniqueLength),
            revisit=dict (records=revisit, bytes=revisitLength),
            ratio=dict (
                    records=unique/(unique+revisit),
                    bytes=uniqueLength/(uniqueLength+revisitLength)
                    ),
            ),
            sys.stdout,
            cls=StrJsonEncoder)
    sys.stdout.write ('\n')
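
A hedged usage sketch: mergeWarc expects a list of input paths and an already-open binary output stream, and it assumes the surrounding crocoite module (getSoftwareInfo, makeContentType, jsonMime, StrJsonEncoder). File names are illustrative:

with open('merged.warc.gz', 'wb') as out:
    mergeWarc(['crawl-a.warc.gz', 'crawl-b.warc.gz'], out)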
Code Example #12
File: har2warc.py  Project: webrecorder/har2warc
class HarParser(object):
    logger = logging.getLogger(__name__)

    def __init__(self, reader, writer, gzip=True):
        if isinstance(reader, str):
            with codecs.open(reader, encoding='utf-8') as fh:
                self.har = json.loads(fh.read())
        elif hasattr(reader, 'read'):
            self.har = json.loads(reader.read())
        elif isinstance(reader, dict):
            self.har = reader
        else:
            raise Exception('reader is in an unknown format')

        self.fh = None
        if isinstance(writer, BaseWARCWriter):
            self.writer = writer
        elif isinstance(writer, str):
            self.fh = open(writer, 'wb')
            self.writer = WARCWriter(self.fh, gzip=gzip)
        elif hasattr(writer, 'write'):
            self.writer = WARCWriter(writer, gzip=gzip)
        else:
            raise Exception('writer is in an unknown format')

    def parse(self, out_filename=None, rec_title=None):
        out_filename = out_filename or 'har.warc.gz'
        rec_title = rec_title or 'HAR Recording'
        metadata = self.create_wr_metadata(self.har['log'], rec_title)
        self.write_warc_info(self.har['log'], out_filename, metadata)

        for entry in self.har['log']['entries']:
            self.parse_entry(entry)

        if self.fh:
            self.fh.close()

    def parse_entry(self, entry):
        url = entry['request']['url']

        response = self.parse_response(url,
                                        entry['response'],
                                        entry.get('serverIPAddress'))

        #TODO: support WARC/1.1 arbitrary precision dates!
        warc_date = entry['startedDateTime'][:19] + 'Z'

        response.rec_headers.replace_header('WARC-Date', warc_date)

        request = self.parse_request(entry['request'])

        self.writer.write_request_response_pair(request, response)


    def create_wr_metadata(self, log, rec_title):
        pagelist = []

        for page in log['pages']:
            if not page['title'].startswith(('http:', 'https:')):
                continue

            pagelist.append(dict(title=page['title'],
                                 url=page['title'],
                                 timestamp=iso_date_to_timestamp(page['startedDateTime'])))

        metadata = {"title": rec_title,
                    "type": "recording",
                   }

        if pagelist:
            metadata["pages"] = pagelist

        return metadata

    def write_warc_info(self, log, filename, metadata):
        creator = '{0} {1}'.format(log['creator']['name'],
                                   log['creator']['version'])

        source = 'HAR Format {0}'.format(log['version'])

        software = 'har2warc ' + str(__version__)

        params = OrderedDict([('software', software),
                              ('creator', creator),
                              ('source', source),
                              ('format', 'WARC File Format 1.0'),
                              ('json-metadata', json.dumps(metadata))])

        record = self.writer.create_warcinfo_record(filename, params)
        self.writer.write_record(record)

    def _get_http_version(self, entry):
        http_version = entry.get('httpVersion')
        if not http_version or http_version.upper() not in ('HTTP/1.1', 'HTTP/1.0'):
            http_version = 'HTTP/1.1'

        return http_version

    def parse_response(self, url, response, ip=None):
        headers = []
        payload = BytesIO()
        content = response['content'].get('text', '')

        if not content and not response.get('headers'):
            self.logger.info('No headers or payload for: {0}'.format(url))
            headers.append(('Content-Length', '0'))
        if response['content'].get('encoding') == 'base64':
            payload.write(base64.b64decode(content))
        else:
            payload.write(content.encode('utf-8'))

        length = payload.tell()
        payload.seek(0)

        SKIP_HEADERS = ('content-encoding', 'transfer-encoding')

        http2 = False

        for header in response['headers']:
            if header['name'].lower() not in SKIP_HEADERS:
                headers.append((header['name'], header['value']))

            #TODO: http2 detection -- write as same warc header?
            if (not http2 and
                header['name'] in (':method', ':scheme', ':path')):
                http2 = True

        status = response.get('status') or 204

        reason = response.get('statusText')
        if not reason:
            reason = http_status_names.get(status, 'No Reason')

        status_line = str(status) + ' ' + reason

        proto = self._get_http_version(response)

        http_headers = StatusAndHeaders(status_line, headers, protocol=proto)

        if not content:
            content_length = http_headers.get_header('Content-Length', '0')
            if content_length != '0':
                self.logger.info('No Content for length {0} {1}'.format(content_length, url))
                http_headers.replace_header('Content-Length', '0')
        else:
            http_headers.replace_header('Content-Length', str(length))

        warc_headers_dict = {}
        if ip:
            warc_headers_dict['WARC-IP-Address'] = ip

        record = self.writer.create_warc_record(url, 'response',
                                                http_headers=http_headers,
                                                payload=payload,
                                                length=length,
                                                warc_headers_dict=warc_headers_dict)

        return record

    def parse_request(self, request):
        parts = urlsplit(request['url'])

        path = parts.path
        query = request.get('queryString')
        if query:
            path += '?' + urlencode(dict((p['name'], p['value'])
                                    for p in query))

        headers = []
        http2 = False

        for header in request['headers']:
            headers.append((header['name'], header['value']))

            #TODO: http2 detection -- write as same warc header?
            if (not http2 and
                header['name'] in (':method', ':scheme', ':path')):
                http2 = True

        if http2:
            headers.append(('Host', parts.netloc))

        http_version = self._get_http_version(request)

        status_line = request['method'] + ' ' + path + ' ' + http_version
        http_headers = StatusAndHeaders(status_line, headers)

        payload = None
        length = 0

        if request['bodySize'] > 0:
            payload = BytesIO()
            payload.write(request['postData']['text'].encode('utf-8'))
            length = payload.tell()
            payload.seek(0)

        record = self.writer.create_warc_record(request['url'], 'request',
                                                http_headers=http_headers,
                                                payload=payload,
                                                length=length)

        return record
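
A hedged usage sketch for HarParser, assuming the rest of the har2warc module is importable; passing a string as the writer makes the class open the WARC file itself. File names and the record title are illustrative:

parser = HarParser('capture.har', 'capture.warc.gz')
parser.parse(out_filename='capture.warc.gz', rec_title='Imported HAR capture')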
Code Example #13
    def facebook_user_bio(self, username):
        """Scrapes Facebook bio and returns info
        on the information contained on the about page (e.g. https://www.facebook.com/pg/SPD/about/?ref=page_internal)
        @param username: Facebook username
        @return: a dictionary of account attributes """

        user_email_fb = self.message['credentials']['user_email_fb']
        user_password_fb = self.message['credentials']['user_password_fb']

        # ensure username is clean and can be accessed
        if username.startswith(
                "https://www.facebook.com/") or username.startswith(
                    "http://www.facebook.com/"):

            username = re.sub(r'^.+facebook\.com\/', '', username)
            # possibly also remove trailing /
            username = re.sub(r'\/$', '', username)

        # created at field
        fb_general = base_fb_url + username
        # bio info
        fb_about = base_fb_url + username + "/about/?ref=page_internal"
        # site transparency (e.g. admins)
        m_fb_general = "http://m.facebook.com/" + username

        # request the html
        r = requests.get(fb_general)
        # ensure no 404's
        if not r:
            log.debug("Couldn't access profile site: %s", fb_general)
            return

        soup = BeautifulSoup(r.content, "html.parser")

        # scrape creation date
        created_at = soup.find('div', {"class": "_3qn7"})
        created_at = created_at.select_one("span").text

        created_at = re.sub(r"(Seite erstellt)", "", created_at)

        created_at = created_at[3:]

        # scrape n of likes
        # find span with like number
        spans = soup.find('span', {"class": "_52id _50f5 _50f7"})
        # isolate likes via regex
        likes = re.search(r'^[\d]+.[^\s]+', spans.text).group()

        bio_dict = {
            "username": fb_general,
            "n_likes": likes,
            "created_at": created_at
        }

        # request about html
        r_about = requests.get(fb_about)

        # ensure no 404's
        if not r_about:
            log.debug("Couldn't access username/about site: %s", fb_about)
            return

        about_soup = BeautifulSoup(r_about.content, "html.parser")
        mission_text = about_soup.find_all('div', {'class': "_4bl9"})

        for divs in mission_text:
            describing_div = divs.find('div', {'class': '_50f4'})
            content_div = divs.find('div', {'class': '_3-8w'})

            if describing_div and content_div:
                bio_dict[describing_div.text] = content_div.text

        # photos
        # Retrieves profile and cover photo of public facebook page
        # bio going to the 'about' page, parsing html and getting
        # the links to photos from script tag, these can then be passed
        # harvest_media
        # this is not affected by the harvest_media options but will always happen
        all_scripts = about_soup.find_all('script')

        for js in all_scripts:
            for content in js.contents:
                if 'cover_photo' in content:
                    # isolate relevant links
                    links = re.findall(r'https\:\\/\\/scontent[^"]*', content)

                    # remove escaped front slashes
                    for val, link in enumerate(links):
                        links[val] = re.sub(r'\\', "", link)
                        self._harvest_media_url(links[val])

        if m_fb_general:

            user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
            site_transparency_class_selector = "._a58._a5o._9_7._2rgt._1j-g._2rgt._86-3._2rgt._1j-g._2rgt"
            site_transparency_detail_id = "u_0_d"

            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('headless')
            chrome_options.add_argument('start-maximised')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--window-size=1200x800')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument(f"user-agent={user_agent}")

            # this will connect to the selenium container starting scraping
            driver = webdriver.Remote("host.docker.internal:4444/wd/hub",
                                      {'browserName': 'chrome'})
            driver.get("http://m.facebook.com")
            driver.maximize_window()
            # accept cookies
            cookies = driver.find_element_by_id('accept-cookie-banner-label')
            # more or less random wait to replicate user behavior, ensure politeness
            time.sleep(random.uniform(3, 9))
            cookies.click()
            # Search & Enter the Email or Phone field & Enter Password
            username_fb = driver.find_element_by_id("m_login_email")
            password_fb = driver.find_element_by_id("m_login_password")
            submit = driver.find_element_by_css_selector("._56b_")
            # send keys and make sure the fields are not prepopulated
            # 2fa has to be deactivated
            username_fb.clear()
            password_fb.clear()
            username_fb.send_keys(user_email_fb)
            password_fb.send_keys(user_password_fb)
            time.sleep(random.uniform(3, 9))
            # Step 4) Click Login
            submit.click()
            time.sleep(random.uniform(3, 9))
            # navigate to site
            driver.get(m_fb_general)
            time.sleep(random.uniform(3, 9))
            driver.execute_script("window.scrollTo(0, 800)")
            # site info only loads on scroll
            # use class name and div content (todo)
            time.sleep(random.uniform(20, 25))
            element = WebDriverWait(driver, 20).until(
                ec.presence_of_element_located(
                    (By.CSS_SELECTOR, site_transparency_class_selector)))
            site_transparency = driver.find_elements_by_css_selector(
                site_transparency_class_selector)
            #site transparency should always be below about
            site_transparency[1].click()
            time.sleep(random.uniform(20, 15))
            # simply get the whole text of the transparency box of site
            # the exact info can be extracted ex-post
            element = WebDriverWait(driver, 20).until(
                ec.presence_of_element_located(
                    (By.ID, site_transparency_detail_id)))
            time.sleep(random.uniform(3, 9))
            site_transparency_text = driver.find_element_by_id(
                site_transparency_detail_id).text
            time.sleep(random.uniform(3, 9))
            driver.close()
            log.info("Finished scraping transparency box")
            bio_dict['transparency_text'] = site_transparency_text

        # ensure that only warc will be written if sites were found
        # else nothing will happen
        if r_about or r:
            # filename will later be converted to path
            # replicating pattern from https://github.com/internetarchive/warcprox/blob/f19ead00587633fe7e6ba6e3292456669755daaf/warcprox/writer.py#L69
            # create random token for filename
            random_token = ''.join(
                random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
            serial_no = '00000'
            file_name = safe_string(
                self.message["id"]) + "-" + warcprox.timestamp17(
                ) + "-" + serial_no + "-" + random_token

            with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"),
                      "wb") as result_warc_file:
                log.info("Writing json-timeline result to path %s",
                         self.warc_temp_dir)
                writer = WARCWriter(result_warc_file, gzip=True)

                def json_date_converter(o):
                    """ Converts datetime.datetime items in facebook_scraper result
                    to formate suitable for json.dumps"""
                    if isinstance(o, datetime.datetime):
                        return o.__str__()

                json_payload = json.dumps(bio_dict,
                                          default=json_date_converter,
                                          ensure_ascii=False).encode("utf-8")

                record = writer.create_warc_record(
                    "https://m.facebook.com/" + username,
                    'metadata',
                    payload=BytesIO(json_payload),
                    warc_content_type="application/json")
                writer.write_record(record)
                log.info("Writing scraped results to %s", self.warc_temp_dir)
Code Example #14
File: fixity.py  Project: oduwsdl/archival-fixity
def generate_current(urim):

    tic_all = timeit.default_timer()

    time_json = {
        'date': strftime("%Y%m%d%H%M%S", gmtime()),
        'time_in_seconds_to_download_memento': 0,
        'time_in_seconds_to_generate_fixity': 0
    }

    urimid_, mdatetime, urir = convert_to_original_link(urim)

    manif = {
        "@context": "http://manifest.ws-dl.cs.odu.edu/terms.json",
        "uri-r": urir,
        "uri-m": urim,
        "memento-datetime": datetime.datetime.strptime(
            mdatetime, '%Y%m%d%H%M%S').strftime('%a, %d %b %Y %H:%M:%S GMT')
    }

    urimh = hashlib.md5(urim.encode()).hexdigest()

    downloadtime = strftime("%Y%m%d%H%M%S", gmtime())

    manif["created"] = datetime.datetime.strptime(
        downloadtime, '%Y%m%d%H%M%S').strftime('%a, %d %b %Y %H:%M:%S GMT')

    outMainDir = '/data/Fixity/verification/' + urimh + '/' + downloadtime
    warc_file = outMainDir + '/raw.warc'

    tic0 = timeit.default_timer()

    if not os.path.exists(outMainDir):
        os.makedirs(outMainDir)

    with open(warc_file, 'wb') as poutput:
        writer = WARCWriter(poutput, gzip=False)

        headers = {
            'User-Agent': 'Web Science and Digital Libraries Group (@WebSciDL); Project/archives_fixity; Contact/Mohamed Aturban ([email protected])',
            'Accept-Encoding': None
        }

        try:
            resp = requests.get(urimid_,
                                headers=headers,
                                timeout=180,
                                allow_redirects=True,
                                stream=True)
        except:
            pass

        cont = resp.content
        headers_list = resp.headers.items()
        http_headers = StatusAndHeaders(str(resp.status_code),
                                        headers_list,
                                        protocol='HTTP/1.0')
        record = writer.create_warc_record(urimid_,
                                           'response',
                                           payload=BytesIO(cont),
                                           http_headers=http_headers)
        try:
            writer.write_record(record)
        except Exception as e:
            print(str(e))

    toc0 = timeit.default_timer()

    if os.path.exists(warc_file):
        with open(warc_file, 'rb') as stream:
            counter_raw = 0
            for record in ArchiveIterator(stream):
                if record.rec_type == 'response':
                    uri = record.rec_headers.get_header('WARC-Target-URI')
                    if uri == urimid_:
                        status_code = record.http_headers.statusline.split()[0]
                        entity = record.content_stream().read()  #.strip()
                        hdrs, hdrs_values, hdrs_keys = extrcated_headers_from_warc_record(
                            record, status_code)
                        hdrs["Preference-Applied"] = "original-links, original-content"
                        md5h = hashlib.md5(entity +
                                           hdrs_values.encode()).hexdigest()
                        sha256h = hashlib.sha256(
                            entity + hdrs_values.encode()).hexdigest()
                        hash_v = "md5:{} sha256:{}".format(md5h, sha256h)
                        hash_constructor = "(curl -s '$uri-m' && echo -n '" + hdrs_keys + "') | tee >(sha256sum) >(md5sum) >/dev/null | cut -d ' ' -f 1 | paste -d':' <(echo -e 'md5\nsha256') - | paste -d' ' - -"

    manif["http-headers"] = hdrs
    manif["hash"] = hash_v
    manif["hash-constructor"] = hash_constructor
    manif["@id"] = "http://manifest.ws-dl.cs.odu.edu/manifest/" + downloadtime + '/ /' + urim

    manif_file = json.dumps(manif, indent=4)
    self_hash = hashlib.sha256(manif_file.encode()).hexdigest()

    manif["@id"] = manif["@id"].replace("/ /", "/" + self_hash + "/")

    with open(outMainDir + '/' + self_hash + '.json', 'w') as outfile:
        json.dump(manif, outfile, indent=4)

    toc_all = timeit.default_timer()

    time_json['time_in_seconds_to_download_memento'] = toc0 - tic0
    time_json['time_in_seconds_to_generate_fixity'] = (
        toc_all - tic_all) - time_json['time_in_seconds_to_download_memento']

    with open(outMainDir + '/' + self_hash + '.json.time', 'w') as outfile:
        json.dump(time_json, outfile, indent=4)

    return outMainDir + '/' + self_hash + '.json'
Code Example #15
File: warcs.py  Project: imtoohard/hypercane
def synthesize_warc(urim, session, output_directory):

    import otmt
    import glob
    from warcio.warcwriter import WARCWriter
    from warcio.statusandheaders import StatusAndHeaders
    from hashlib import md5
    from datetime import datetime
    import traceback

    m = md5()
    m.update(urim.encode('utf8'))
    urlhash = m.hexdigest()

    if len( glob.glob('{}/{}*.warc.gz'.format(output_directory, urlhash)) ) > 0:
        module_logger.warning("Detected existing WARC for URI-M, skipping {}".format(urim))
        return

    resp = session.get(urim, stream=True)
    resp.raise_for_status()

    headers_list = resp.raw.headers.items()

    # we use response.url instead of urim to (hopefully) avoid raw redirects
    raw_urim = otmt.generate_raw_urim(resp.url)

    raw_response = session.get(raw_urim, stream=True)

    warc_target_uri = None

    # we have to implement this construct in case the archive combines original with other relations
    for link in resp.links:

        if 'original' in link:
            warc_target_uri = resp.links[link]['url']

    if warc_target_uri is None:
        module_logger.warning("could not find this memento's original resource, skipping {}".format(urim))
        return

    try:
        mdt = resp.headers['Memento-Datetime']

    except KeyError:
        module_logger.warning("could not find this memento's memento-datetime, skipping {}".format(urim))
        return

    http_headers = StatusAndHeaders('200 OK',
        headers_list, protocol='HTTP/1.0')

    module_logger.debug("mdt formatted by strptime and converted by strftime: {}".format(
        datetime.strptime(
            mdt, "%a, %d %b %Y %H:%M:%S GMT"
        ).strftime('%Y-%m-%dT%H:%M:%SZ')
    ))

    warc_headers_dict = {}
    warc_headers_dict['WARC-Date'] = datetime.strptime(
        mdt, "%a, %d %b %Y %H:%M:%S GMT"
    ).strftime('%Y-%m-%dT%H:%M:%SZ')

    with open("{}/{}-{}.warc.gz".format(output_directory, urlhash, datetime.now().strftime('%Y%m%d%H%M%S')), 'wb') as output:
        writer = WARCWriter(output, gzip=True)

        record = writer.create_warc_record(
            warc_target_uri, 'response',
            payload=raw_response.raw,
            http_headers=http_headers,
            warc_headers_dict=warc_headers_dict
            )

        writer.write_record(record)
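
A hedged usage sketch for synthesize_warc; the URI-M and output directory are illustrative, and the function's own imports (otmt, warcio) must be installed:

import requests

session = requests.Session()
synthesize_warc(
    'https://web.archive.org/web/20190101000000/https://example.com/',
    session,
    './warcs')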
Code Example #16
        url = None
        date = None
        with open(filepath, 'rb') as content_file:
            content = content_file.read()
            for line in content.split(b"\n"):
                if re.search(rb'<!-- Mirrored from .* by HTTrack Website Copier.*\[.*\],', line):
                    url = re.sub(rb'.*<!-- Mirrored from ', b'', re.sub(rb' by HTTrack Website Copier.*', b'', line))
                    date = re.sub(rb'.+by HTTrack Website.+\[.+\][^,]*, ', b'', re.sub(rb' -->.*', b'', line))
                    break
        if date is None:
            dvalue = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        else:
            try:
                dvalue = parse(date.decode("utf8")).strftime('%Y-%m-%dT%H:%M:%SZ')
            except ValueError:
                dvalue = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        if url is None:
            urlStr = "unknown"
        else:
            try:
                urlStr = url.decode("utf8")
                # sys.stderr.write("HH1 " + urlStr + "\n")
            except:
                urlStr = "unknown-encoding"
                # sys.stderr.write("HH2 " + urlStr + "\n")
        with open(filepath, 'rb') as content_file:
            record = writer.create_warc_record(urlStr, 'response',
                                               payload=content_file)

        writer.write_record(record)
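
The snippet above refers to a writer and a filepath that are defined outside the excerpt. A hedged sketch of the assumed surrounding setup; the file and directory names are illustrative:

from warcio.warcwriter import WARCWriter

output = open('httrack-mirror.warc.gz', 'wb')
writer = WARCWriter(output, gzip=True)
filepath = 'mirror/index.html'  # one HTTrack-mirrored page; the snippet runs once per such file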
Code Example #17
File: warc.py  Project: podolskyi/cocrawler
class CCWARCWriter:
    def __init__(self,
                 prefix,
                 max_size,
                 subprefix=None,
                 gzip=True,
                 get_serial=None):
        self.writer = None
        self.prefix = prefix
        self.subprefix = subprefix
        self.max_size = max_size
        self.gzip = gzip
        self.hostname = socket.gethostname()
        if get_serial is not None:
            self.external_get_serial = get_serial
        else:
            self.external_get_serial = None
            self.serial = 0

    def __del__(self):
        if self.writer is not None:
            self.f.close()

    def create_default_info(self,
                            version,
                            ip,
                            description=None,
                            creator=None,
                            operator=None):
        '''
        creator:  # person, organization, service
        operator:  # person, if creator is an organization
        isPartOf:  # name of the crawl
        '''
        info = OrderedDict()

        info['software'] = 'cocrawler/' + version
        info['hostname'] = self.hostname
        info['ip'] = ip
        if description:
            info['description'] = description
        if creator:
            info['creator'] = creator
        if operator:
            info['operator'] = operator
        info['isPartOf'] = self.prefix  # intentionally does not include subprefix
        info['format'] = 'WARC file version 1.0'
        self.info = info
        return info

    def open(self):
        filename = self.prefix
        if self.subprefix:
            filename += '-' + self.subprefix
        serial = self.get_serial(filename)
        filename += '-' + serial + '-' + self.hostname + '.warc'
        if self.gzip:
            filename += '.gz'
        self.filename = filename
        self.f = open(filename, 'wb')
        self.writer = WARCWriter(self.f, gzip=self.gzip)
        record = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(record)

    def get_serial(self, filename):
        if self.external_get_serial is not None:
            return self.external_get_serial(filename)
        self.serial += 1
        return '{:06}'.format(self.serial - 1)

    def maybe_close(self):
        '''
        TODO: always close/reopen if subprefix is not None; minimizes open filehandles
        '''
        fsize = os.fstat(self.f.fileno()).st_size
        if fsize > self.max_size:
            self.f.close()
            self.writer = None

    def write_dns(self, dns, expires, url):
        # write it out even if empty
        # TODO: we filter the addresses early, should we warc the unfiltered dns response?

        # the response object doesn't contain the query type 'A' or 'AAAA'
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'  # fixme IPV6

        ttl = int(expires - time.time())
        host = url.hostname

        if self.writer is None:
            self.open()

        payload = timestamp_now() + '\r\n'

        for r in dns:
            try:
                payload += host + '.\t' + str(
                    ttl) + '\tIN\t' + kind + '\t' + r['host'] + '\r\n'
            except Exception as e:
                LOGGER.info('problem converting dns reply for warcing', host,
                            r, e)
                pass
        payload = payload.encode('utf-8')

        record = self.writer.create_warc_record('dns:' + host,
                                                'resource',
                                                payload=BytesIO(payload),
                                                warc_content_type='text/dns',
                                                length=len(payload))

        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s',
                     p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)

    def write_request_response_pair(self,
                                    url,
                                    req_headers,
                                    resp_headers,
                                    is_truncated,
                                    payload,
                                    digest=None):
        if self.writer is None:
            self.open()

        # XXX WARC-Identified-Payload-Type set from Apache Tika? (done by Common Crawl) (how expensive?)

        req_http_headers = StatusAndHeaders(
            'GET / HTTP/1.1', headers_to_str_headers(req_headers))

        request = self.writer.create_warc_record('http://example.com/',
                                                 'request',
                                                 http_headers=req_http_headers)

        resp_http_headers = StatusAndHeaders(
            '200 OK',
            headers_to_str_headers(resp_headers),
            protocol='HTTP/1.1')

        warc_headers_dict = {}
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                LOGGER.error('Invalid is_truncation of ' + is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(
            url,
            'response',
            payload=BytesIO(payload),
            length=len(payload),
            warc_headers_dict=warc_headers_dict,
            http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s',
                     p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)
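
The class above emits dns 'resource' records and request/response pairs into rolling WARC files. A minimal read-back sketch, assuming only warcio and an illustrative file name, that iterates a finished archive and summarizes what was written:

from warcio.archiveiterator import ArchiveIterator

def summarize_warc(path):
    # Print record type, target URI and payload size for every record.
    with open(path, 'rb') as stream:
        for record in ArchiveIterator(stream):
            uri = record.rec_headers.get_header('WARC-Target-URI')
            payload = record.content_stream().read()
            print(record.rec_type, uri, len(payload))

# summarize_warc('crawl-000000.warc.gz')  # hypothetical output file name
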
Code example #18
0
        ]:
            payload[input.get('name')] = input.get('value')
    payload["ctl00$ctl00$bodyContent$mainContent$ddlYears"] = str(year)
    return payload


def post_url(url, headers):
    # Build the query string by joining each key=value pair from the headers dict
    return url + '?' + '&'.join('{}={}'.format(k, v) for k, v in headers.items())


if __name__ == "__main__":
    config.initalize_project_root()
    today = time.strftime("%Y%m%d%H%M%S", time.gmtime())
    #config.initalize_record()
    r = requests.get(config.AGENDA_URL)
    session_headers = parse_session(r.text, 2018)
    fileName = 'rec-' + today + '-psuedos-MacBook-Pro.local.warc.gz'
    with open(fileName, 'wb') as output:
        writer = WARCWriter(output, gzip=True)
        # stream=True keeps response.raw unconsumed so it can serve as the record payload
        response = requests.post(config.AGENDA_URL, data=session_headers, stream=True)
        headers_list = response.raw.headers.items()
        http_headers = StatusAndHeaders('{} {}'.format(response.status_code, response.reason),
                                        headers_list,
                                        protocol='HTTP/1.0')
        record = writer.create_warc_record(config.AGENDA_URL,
                                           'response',
                                           payload=response.raw,
                                           http_headers=http_headers)
        writer.write_record(record)
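
The script above stores only the response side of the POST. A hedged sketch of also capturing the outgoing request and writing both as a linked pair with warcio's write_request_response_pair (the helper name and the use of resp.request.headers are illustrative, not from the original script):

from io import BytesIO
import requests
from warcio.statusandheaders import StatusAndHeaders
from warcio.warcwriter import WARCWriter

def write_post_pair(writer, url, data):
    # writer is an already-open WARCWriter; stream=True keeps resp.raw unread.
    resp = requests.post(url, data=data, stream=True)
    req_headers = StatusAndHeaders('POST / HTTP/1.1',
                                   resp.request.headers.items(),
                                   is_http_request=True)
    request_rec = writer.create_warc_record(url, 'request',
                                            http_headers=req_headers)
    resp_headers = StatusAndHeaders('{} {}'.format(resp.status_code, resp.reason),
                                    resp.raw.headers.items(),
                                    protocol='HTTP/1.1')
    response_rec = writer.create_warc_record(url, 'response',
                                             payload=BytesIO(resp.raw.read()),
                                             http_headers=resp_headers)
    writer.write_request_response_pair(request_rec, response_rec)
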
Code example #19
0
with open('example.warc.wet.gz', 'wb') as output:
    writer = WARCWriter(output, gzip=True)
    with ZIMFile("data/wikipedia_en_simple_all_nopic_2020-12.zim") as reader:
        for uid in range(0, reader.article_count):
            if uid % 10_000 == 0:
                print("{} out of {}".format(uid, reader.article_count))

            article = reader.get_article_by_id(uid)
            try:
                if article.mimetype != "text/html":
                    continue
            except RuntimeError:
                continue

            if article.is_redirect:
                continue

            url = 'https://simple.wikipedia.org/wiki/{}'.format(quote(article.url))
            html = bytes(article.content).decode('utf8')
            text = html2text(html)
            payload = BytesIO(text.encode('utf8'))

            record = writer.create_warc_record(
                url,
                'conversion',
                payload=payload,
            )

            writer.write_record(record)
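
The conversion records written above can be consumed again without re-reading the ZIM file. A small sketch, assuming the same warcio API, that yields (url, text) pairs from the resulting WET file:

from warcio.archiveiterator import ArchiveIterator

def iter_wet_texts(path):
    # Yield (url, plain text) for every 'conversion' record in the WET file.
    with open(path, 'rb') as stream:
        for record in ArchiveIterator(stream):
            if record.rec_type != 'conversion':
                continue
            url = record.rec_headers.get_header('WARC-Target-URI')
            yield url, record.content_stream().read().decode('utf8')

# for url, text in iter_wet_texts('example.warc.wet.gz'):
#     print(url, len(text))
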
Code example #20
0
class WarcDownloader:
    """
        Download URL with HTTP GET, save to a WARC file and return the decoded text
    """
    def __init__(self, filename, logger_, program_name='corpusbuilder 1.0', user_agent=None, overwrite_warc=True,
                 err_threshold=10, warcinfo_record_data=None, known_bad_urls=None, max_no_of_calls_in_period=2,
                 limit_period=1, proxy_url=None, allow_cookies=False):
        if known_bad_urls is not None:  # Setup the list of cached bad URLs to prevent trying to download them again
            with open(known_bad_urls, encoding='UTF-8') as fh:
                self.bad_urls = {line.strip() for line in fh}
        else:
            self.bad_urls = set()

        if not overwrite_warc:  # Find out next nonexisting warc filename
            num = 0
            while os.path.exists(filename):
                filename2, ext = os.path.splitext(filename)  # Should be filename.warc.gz
                if ext == '.gz' and filename2.endswith('.warc'):
                    filename2, ext2 = os.path.splitext(filename2)  # Should be filename.warc
                    ext = ext2 + ext  # Should be .warc.gz

                filename = '{0}-{1:05d}{2}'.format(filename2, num, ext)
                num += 1

        logger_.log('INFO', 'Creating archivefile: {0}'.format(filename))

        self._output_file = open(filename, 'wb')
        self._logger_ = logger_
        self._req_headers = {'Accept-Encoding': 'identity', 'User-agent': user_agent}

        self._session = Session()  # Setup session for speeding up downloads

        if proxy_url is not None:  # Set socks proxy if provided
            self._session.proxies['http'] = proxy_url
            self._session.proxies['https'] = proxy_url

        self._allow_cookies = allow_cookies

        # Setup rate limiting to prevent hammering the server
        self._requests_get = sleep_and_retry(limits(calls=max_no_of_calls_in_period,
                                                    period=limit_period)(self._http_get_w_cookie_handling))
        self._error_count = 0
        self._error_threshold = err_threshold  # Set the error threshold which causes aborting to prevent denial

        self._writer = WARCWriter(self._output_file, gzip=True)
        if warcinfo_record_data is None:
            # INFO RECORD
            # Some custom information about the warc writer program and its settings
            info_headers = {'software': program_name, 'arguments': ' '.join(sys.argv[1:]),
                            'format': 'WARC File Format 1.0',
                            'conformsTo': 'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'}
            info_record = self._writer.create_warcinfo_record(filename, info_headers)
        else:  # Must recreate custom headers else they will not be copied
            custom_headers = ''.join('{0}: {1}\r\n'.format(k, v) for k, v in warcinfo_record_data[1].items()).\
                             encode('UTF-8')
            info_record = self._writer.create_warc_record('', 'warcinfo', warc_headers=warcinfo_record_data[0],
                                                          payload=BytesIO(custom_headers),
                                                          length=len(custom_headers))
        self._writer.write_record(info_record)

    def __del__(self):
        if hasattr(self, '_output_file'):  # If the program opened a file, then it should gracefully close it on exit!
            self._output_file.close()

    def _http_get_w_cookie_handling(self, *args, **kwargs):
        """
            Extend requests.get with optional cookie purging
        """
        if not self._allow_cookies:
            self._session.cookies.clear()
        return self._session.get(*args, **kwargs)

    def _handle_request_exception(self, url, msg):
        self._logger_.log('WARNING', '\t'.join((url, msg)))

        self._error_count += 1
        if self._error_count >= self._error_threshold:
            raise NameError('Too many errors happened! Threshold exceeded! See log for details!')

    def download_url(self, url):
        scheme, netloc, path, params, query, fragment = urlparse(url)
        path = quote(path)  # For safety urlencode the generated URL...
        url = urlunparse((scheme, netloc, path, params, query, fragment))

        if url in self.bad_urls:
            self._logger_.log('INFO', 'Not downloading known bad URL: {0}'.format(url))
            return None

        try:  # The actual request
            resp = self._requests_get(url, headers=self._req_headers, stream=True)
        except RequestException as err:
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        if resp.status_code != 200:  # Not HTTP 200 OK
            self._handle_request_exception(url, 'Downloading failed with status code: {0} {1}'.format(resp.status_code,
                                                                                                      resp.reason))
            return None

        # REQUEST
        reqv_headers = resp.request.headers
        reqv_headers['Host'] = netloc

        proto = 'HTTP/{0}'.format(respv_str[resp.raw.version])  # Friendly protocol name
        reqv_http_headers = StatusAndHeaders('GET {0} {1}'.format(urlunparse(('', '', path, params, query, fragment)),
                                                                  proto), reqv_headers.items(), is_http_request=True)
        reqv_record = self._writer.create_warc_record(url, 'request', http_headers=reqv_http_headers)

        # RESPONSE
        resp_status = '{0} {1}'.format(resp.status_code, resp.reason)
        resp_headers_list = resp.raw.headers.items()  # get raw headers from urllib3
        # Must get peer_name before the content is read
        # It has no official API for that:
        # https://github.com/kennethreitz/requests/issues/2158
        # https://github.com/urllib3/urllib3/issues/1071
        # So workaround to be compatible with windows:
        # https://stackoverflow.com/questions/22492484/how-do-i-get-the-ip-address-from-a-http-request-using-the-\
        # requests-library/22513161#22513161
        try:
            peer_name = resp.raw._connection.sock.getpeername()[0]  # Must get peer_name before the content is read
        except AttributeError:  # On Windows there is no getpeername() Attribute of the class...
            try:
                peer_name = resp.raw._connection.sock.socket.getpeername()[0]
            except AttributeError:
                peer_name = 'None'  # Socket closed and could not determine peername...

        try:
            data = resp.raw.read()  # To be able to return decoded and also write warc
        except ProtocolError as err:
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        if len(data) == 0:
            err = 'Response data has zero length!'
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        enc = resp.encoding  # Get or detect encoding to decode the bytes of the text to str
        if enc is None:
            enc = detect(data)['encoding']
        try:
            text = data.decode(enc)  # Normal decode process
        except UnicodeDecodeError:
            self._logger_.log('WARNING', '\t'.join(('DECODE ERROR RETRYING IN \'IGNORE\' MODE:', url, enc)))
            text = data.decode(enc, 'ignore')
        data_stream = BytesIO(data)  # Need the original byte stream to write the payload to the warc file

        resp_http_headers = StatusAndHeaders(resp_status, resp_headers_list, protocol=proto)
        # Add extra headers like encoding because it is not stored any other way...
        resp_record = self._writer.create_warc_record(url, 'response', payload=data_stream,
                                                      http_headers=resp_http_headers,
                                                      warc_headers_dict={'WARC-IP-Address': peer_name,
                                                                         'WARC-X-Detected-Encoding': enc})
        # Everything is OK, write the two WARC records
        self._writer.write_record(reqv_record)
        self._writer.write_record(resp_record)

        return text

    def write_record(self, record):
        self._writer.write_record(record)
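
WarcDownloader opens each archive by writing a warcinfo record first. A minimal standalone sketch of that opening step, assuming only warcio (the function name and header values are illustrative):

import sys
from warcio.warcwriter import WARCWriter

def open_archive(path, program_name='example-crawler 1.0'):
    # Open a gzipped WARC and write a warcinfo record first, as the class above does.
    out = open(path, 'wb')
    writer = WARCWriter(out, gzip=True)
    info_record = writer.create_warcinfo_record(path, {
        'software': program_name,
        'arguments': ' '.join(sys.argv[1:]),
        'format': 'WARC File Format 1.0',
    })
    writer.write_record(info_record)
    return out, writer
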
Code example #21
0
    def facebook_user_timeline(self,
                               seed_id,
                               username,
                               nsid,
                               sleep_between_harvests=True):
        """This function will scrape the user timeline"""
        log.debug("Harvesting user %s with seed_id %s.", username, seed_id)
        # make sure either username or nsid is present to start scraping
        assert username or nsid

        # possibly get fbid from state.json
        if not nsid:
            nsid = self.state_store.get_state(
                __name__, u"timeline.{}.fbid".format(username))
            log.info("Trying to retrieve FB-ID from state store")
            log.info("Found FB-ID from state store is %s", nsid)

        # Possibly look up fbid if not supplied and not already in state.json
        if username and not nsid:

            log.debug("No FB userid, retrieving it")

            nsid = self.get_fbid(username)
            # write id to state.json if not already there
            key = "timeline.{}.fbid".format(username)
            self.state_store.set_state(__name__, key, nsid)
            log.info("Writing fbid to state store")

        if nsid:
            # report back whether user id was found
            log.info("FB userid %s", nsid)
            # todo - need to add timeout and what to do if blocked

            incremental = self.message.get("options",
                                           {}).get("incremental", False)
            harvest_media = self.message.get("options",
                                             {}).get("harvest_media", False)
            if harvest_media:
                # disable verbose logging of the state store to avoid overlong log messages
                # (the state store holds the list of all harvested media URLs)
                self.state_store.verbose = False

            if incremental:
                # search for since_id of post
                since_id = self.state_store.get_state(
                    __name__, u"timeline.{}.since_id".format(nsid))

            scrape_result = []

            # check if blocked, usually lasts 24 hours
            if "Temporarily Blocked" in requests.get(
                    "https://m.facebook.com/" + nsid).text:
                # sleep 24 hours
                log.debug("Temporarily blocked - waiting 24 hours")
                time.sleep(86429)

            # check for cookies. otherwise don't use
            if os.path.isfile("/tmp/cookies.json"):
                fb_cookies = "/tmp/cookies.json"
            else:
                fb_cookies = None

            counter = 0

            for post in facebook_scraper.get_posts(nsid,
                                                   pages=self.pages,
                                                   options={
                                                       "allow_extra_requests":
                                                       False,
                                                       "posts_per_page": 200
                                                   },
                                                   timeout=30,
                                                   cookies=fb_cookies):
                scrape_result.append(post)
                self.result.harvest_counter["posts"] += 1
                self.result.increment_stats("posts")

                counter += 1  # in case self.result.harvest_count also contains old harvest counts

                # for very long harvests, try to avoid blocking by sleeping after a
                # certain amount of posts
                if self.result.harvest_counter["posts"] in [
                        2000, 4000, 6000, 8000
                ]:
                    log.info(
                        "Waiting a few minutes to avoid block bc of too many requests"
                    )
                    time.sleep(random.uniform(100, 650))

                if incremental and post["post_id"] == since_id and post[
                        "post_id"]:
                    log.info(
                        "Stopping, found last post that was previously harvested with id: %s",
                        post["post_id"])
                    break

            # harvesting media (images_lowquality links!)
            # doing this after post scrape to avoid potential blocks
            # last condition avoids parsing empty lists (i.e. no media)
            if harvest_media:
                img_counter = 1
                for post in scrape_result:
                    if post['images_lowquality'] and (
                            img_counter <= 1000 or
                        (1000 / img_counter) >= random.random()):
                        log.info("Harvesting media from post")
                        self.result.harvest_counter["images"] += 1
                        self.result.increment_stats("images")
                        img_counter += 1  # in case harvest_counter contains old counts
                        # get media content from links - should automatically be caught within warc stream
                        # all photos on fb are jpgs, so the list comprehension checks whether this is the case
                        # for the stream, if not (e.g. video) it will not harvest
                        [
                            self._harvest_media_url(media_url)
                            for media_url in post['images_lowquality']
                            if 'jpg' in media_url
                        ]

            # filename will later be converted to path
            # replicating pattern from https://github.com/internetarchive/warcprox/blob/f19ead00587633fe7e6ba6e3292456669755daaf/warcprox/writer.py#L69
            # create random token for filename
            random_token = ''.join(
                random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
            serial_no = '00000'
            file_name = safe_string(
                self.message["id"]) + "-" + warcprox.timestamp17(
                ) + "-" + serial_no + "-" + random_token

            with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"),
                      "wb") as result_warc_file:
                log.info("Writing json-timeline result to path %s",
                         self.warc_temp_dir)
                writer = WARCWriter(result_warc_file, gzip=True)

                def json_date_converter(o):
                    """ Converts datetime.datetime items in facebook_scraper result
                    to a format suitable for json.dumps"""
                    if isinstance(o, datetime.datetime):
                        return o.__str__()

                json_payload = json.dumps(scrape_result,
                                          default=json_date_converter,
                                          ensure_ascii=False).encode("utf-8")

                record = writer.create_warc_record(
                    username,
                    'metadata',
                    payload=BytesIO(json_payload),
                    warc_content_type="application/json")
                writer.write_record(record)
                log.info("Writing scraped results to %s", self.warc_temp_dir)

            if incremental:
                # some posts will have post["post_id"] None
                # I take the latest post without a None id
                # if no post with a post_id is found nothing will
                # be written to the state store
                latest_post = next(
                    (latest_post for latest_post in scrape_result
                     if latest_post["post_id"] is not None), None)

                if latest_post:
                    max_post_time = latest_post.get("time")
                    max_post_id = latest_post.get("post_id")

                    assert max_post_time and max_post_id
                    # write most recent post ID to state store
                    key = "timeline.{}.since_id".format(nsid)

                    self.state_store.set_state(__name__, key, max_post_id)
                    log.info(
                        "Wrote first scraped post to state_store: %s (state: %s)",
                        max_post_id, key)

        else:
            msg = "NSID not found for user {}".format(username)
            log.exception(msg)
            self.result.warnings.append(
                Msg(CODE_UID_NOT_FOUND, msg, seed_id=seed_id))

        # sleep approx. 15 min before starting next harvest to ensure politeness
        if sleep_between_harvests:
            log.info("Waiting approx. 15 min before next harvest")
            time.sleep(random.randint(850, 2500))
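
This method, like the bio and ads harvesters that follow, serializes its scrape result to JSON and wraps it in a WARC 'metadata' record. A hedged sketch of that step factored into a small helper (the helper name and file handling are illustrative, not part of the original harvester):

import datetime
import json
from io import BytesIO
from warcio.warcwriter import WARCWriter

def write_json_metadata_record(warc_path, target_uri, obj):
    # Serialize obj (datetimes become strings) and store it as a WARC metadata record.
    def _default(o):
        if isinstance(o, datetime.datetime):
            return str(o)
        raise TypeError('not JSON serializable: {!r}'.format(o))

    payload = json.dumps(obj, default=_default, ensure_ascii=False).encode('utf-8')
    with open(warc_path, 'wb') as f:
        writer = WARCWriter(f, gzip=True)
        record = writer.create_warc_record(target_uri, 'metadata',
                                           payload=BytesIO(payload),
                                           warc_content_type='application/json')
        writer.write_record(record)
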
Code example #22
0
    def facebook_user_bio(self, username):
        """Scrapes Facebook bio and returns info
        on the information contained on the about page (e.g. https://www.facebook.com/pg/SPD/about/?ref=page_internal)
        @param username: Facebook username
        @return: a dictionary of account attributes """

        # ensure username is clean and can be accessed
        if username.startswith(
                "https://www.facebook.com/") or username.startswith(
                    "http://www.facebook.com/") or username.startswith(
                        "www.facebook.com/"):
            username = re.sub(r'^.+facebook\.com\/', '', username)
            # possibly also remove trailing /
            username = re.sub(r'\/$', '', username)

        # created at field
        fb_general = base_fb_url + username
        # bio info
        fb_about = base_fb_url + username + "/about/?ref=page_internal"
        # site transparency (e.g. admins)
        m_fb_general = "https://m.facebook.com/" + username

        site_transparency_detail_id_selector = "//*[contains(text(), 'Page history')]/ancestor::div/ancestor::div"
        site_transparency_class_selector = "._a58._a5o._9_7._2rgt._1j-g._2rgt._86-3._2rgt._1j-g._2rgt"

        driver = self.initiate_selenium_webdriver()

        # check whether cookies are present, otherwise try to
        # log in
        if os.path.isfile("/tmp/cookies.json"):
            # first navigate to fb, otherwise
            # selenium does not accept the cookies
            # navigate to page
            driver.get("https://m.facebook.com/")

            # else load cookies
            with open("/tmp/cookies.json") as f:
                cookies = json.load(f)
            # add to driver
            for cookie in cookies:
                driver.add_cookie(cookie)

        # if no cookies, try to login
        else:
            self.fb_login(driver=driver)
            time.sleep(random.uniform(3, 9))
            # then write cookies
            cookies = driver.get_cookies()

            with open("/tmp/cookies.json", "w") as f:
                json.dump(cookies, f)

        time.sleep(random.uniform(3, 9))
        # navigate to site
        driver.get(m_fb_general)
        time.sleep(random.uniform(3, 9))
        # site info only loads on scroll
        driver.execute_script("window.scrollTo(0, 800)")

        # extract likes
        site_likes_xpath = "//div[@class=\"_59k _2rgt _1j-f _2rgt\"]"
        likes = driver.find_elements_by_xpath(site_likes_xpath)

        likes = [
            single_div.text for single_div in likes
            if "like this" in single_div.text
        ]

        # new fb page layout has followers instead of likes
        if len(likes) == 0:
            followers_xpath = "//*[@id=\"profile_intro_card\"]/div[1]/div/div[1]/div[2]/div/div/div/span"
            likes = driver.find_elements_by_xpath(followers_xpath)
            likes = [likes[0].text]

        # old page layout
        if "like this" in likes[0]:
            time.sleep(random.uniform(20, 25))
            element = WebDriverWait(driver, 20).until(
                ec.presence_of_element_located(
                    (By.CSS_SELECTOR, site_transparency_class_selector)))
            site_transparency = driver.find_elements_by_css_selector(
                site_transparency_class_selector)
            # site transparency should always be below about
            time.sleep(random.uniform(5, 9))
            site_transparency[1].click()
            time.sleep(random.uniform(15, 20))
            # simply get the whole text of the transparency box of site
            # the exact info can be extracted ex-post
            element = WebDriverWait(driver, 20).until(
                ec.presence_of_element_located(
                    (By.XPATH, site_transparency_detail_id_selector)))
            time.sleep(random.uniform(3, 9))
            site_transparency_text = driver.find_elements_by_xpath(
                site_transparency_detail_id_selector)[0].text
            time.sleep(random.uniform(3, 9))
            driver.quit()
        # new page layout
        elif "Followers" in likes[0]:
            # click on about
            time.sleep(random.uniform(20, 25))
            more_about_xpath = "//*[@id=\"profile_intro_card\"]/div[2]/div/div[3]/div/a"
            more_about = driver.find_element_by_xpath(more_about_xpath)
            more_about.click()
            time.sleep(random.uniform(2, 7))
            see_more_transparency_xpath = "//*[@id=\"transparency\"]/header/div/div[2]/div/div/a"
            see_more_transparency = driver.find_element_by_xpath(
                see_more_transparency_xpath)
            see_more_transparency.click()
            time.sleep(random.uniform(2, 7))
            all_divs_xpath = "//*[contains(text(), 'Page history')]/ancestor::div/ancestor::div"
            all_divs = driver.find_elements_by_xpath(all_divs_xpath)

            # account for different way of writing
            if len(all_divs) == 0:
                all_divs_xpath = "//*[contains(text(), 'Page History')]/ancestor::div/ancestor::div"
                all_divs = driver.find_elements_by_xpath(all_divs_xpath)

            site_transparency_text = all_divs[0].text
            driver.quit()

        log.info("Finished scraping transparency box")
        bio_dict = {
            "username": fb_general,
            "n_likes": likes[0],
            "transparency_text": site_transparency_text
        }

        # request about html
        # tries to add cookies, otherwise it won't work anymore
        if os.path.isfile("/tmp/cookies.json"):
            with open("/tmp/cookies.json") as f:
                cookies = json.load(f)
        else:
            cookies = None

        # pass the stored cookies (selenium export format: list of dicts) to requests
        r_about = requests.get(
            fb_about,
            cookies={c['name']: c['value'] for c in cookies} if cookies else None)

        # ensure no 404's
        if not r_about:
            log.debug("Couldn't access username/about site: %s", fb_about)
            return

        about_soup = BeautifulSoup(r_about.content, "html.parser")
        mission_text = about_soup.find_all('div', {'class': "_4bl9"})

        for divs in mission_text:
            describing_div = divs.find('div', {'class': '_50f4'})
            content_div = divs.find('div', {'class': '_3-8w'})

            if describing_div and content_div:
                bio_dict[describing_div.text] = content_div.text

        # photos
        # Retrieves the profile and cover photo of a public Facebook page
        # by going to the 'about' page, parsing the html and getting the links
        # to the photos from a script tag; these can then be passed to
        # _harvest_media_url. This is not affected by the harvest_media option
        # and will always happen.
        all_scripts = about_soup.find_all('script')

        for js in all_scripts:
            for content in js.contents:
                if 'cover_photo' in content:
                    # isolate relevant links
                    links = re.findall(r'https\:\\/\\/scontent[^"]*', content)

                    # remove escaped front slashes
                    for val, link in enumerate(links):
                        links[val] = re.sub(r'\\', "", link)
                        self._harvest_media_url(links[val])
                # only write a warc if the about page could be fetched,
                # otherwise nothing will happen
        if r_about:
            # filename will later be converted to path
            # replicating pattern from https://github.com/internetarchive/warcprox/blob/f19ead00587633fe7e6ba6e3292456669755daaf/warcprox/writer.py#L69
            # create random token for filename
            random_token = ''.join(
                random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
            serial_no = '00000'
            file_name = safe_string(
                self.message["id"]) + "-" + warcprox.timestamp17(
                ) + "-" + serial_no + "-" + random_token

            with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"),
                      "wb") as result_warc_file:
                log.info("Writing json-timeline result to path %s",
                         self.warc_temp_dir)
                writer = WARCWriter(result_warc_file, gzip=True)

                def json_date_converter(o):
                    """ Converts datetime.datetime items in facebook_scraper result
                    to a format suitable for json.dumps"""
                    if isinstance(o, datetime.datetime):
                        return o.__str__()

                json_payload = json.dumps(bio_dict,
                                          default=json_date_converter,
                                          ensure_ascii=False).encode("utf-8")

                record = writer.create_warc_record(
                    "https://m.facebook.com/" + username,
                    'metadata',
                    payload=BytesIO(json_payload),
                    warc_content_type="application/json")
                writer.write_record(record)
                log.info("Writing scraped results to %s", self.warc_temp_dir)
Code example #23
0
    def facebook_user_ads(self, username, nsid, iso2c, access_token):
        assert username or nsid

        limit_per_page = 500

        if username and not nsid:
            log.debug("No FB userid, retrieving it")

            nsid = self.get_fbid(username)

        if nsid and access_token and iso2c:
            # start scraping
            request_url = "https://graph.facebook.com/v5.0/ads_archive"
            request_params = {
                "access_token":
                access_token,
                "limit":
                limit_per_page,
                "search_page_ids":
                str(nsid),
                "ad_active_status":
                "ALL",
                "ad_reached_countries":
                iso2c,  # todo
                "fields":
                "page_name, page_id, funding_entity, ad_creation_time, ad_delivery_start_time, ad_delivery_stop_time, ad_creative_body, ad_creative_link_caption, ad_creative_link_description, ad_creative_link_title, ad_snapshot_url, demographic_distribution, region_distribution, impressions, spend, currency"
            }

            api_result = requests.get(request_url, params=request_params)

            print(api_result.text)

            random_token = ''.join(
                random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
            serial_no = '00000'
            file_name = safe_string(
                self.message["id"]) + "-" + warcprox.timestamp17(
                ) + "-" + serial_no + "-" + random_token

            # write to warc
            with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"),
                      "wb") as result_warc_file:
                log.info("Writing json-timeline result to path %s",
                         self.warc_temp_dir)
                writer = WARCWriter(result_warc_file, gzip=True)

                def json_date_converter(o):
                    """ Converts datetime.datetime items in facebook_scraper result
                    to a format suitable for json.dumps"""
                    if isinstance(o, datetime.datetime):
                        return o.__str__()

                json_payload = json.dumps(api_result.json(),
                                          default=json_date_converter,
                                          ensure_ascii=False).encode("utf-8")

                record = writer.create_warc_record(
                    "https://m.facebook.com/" + username,
                    'metadata',
                    payload=BytesIO(json_payload),
                    warc_content_type="application/json")
                writer.write_record(record)
                log.info("Writing scraped results to %s", self.warc_temp_dir)
            time.sleep(1.2)  # sleep to avoid getting blocked by api

        else:
            log.debug(
                "Something went wrong. Is some information missing? Access token is: %s, iso2c is: %s",
                str(access_token), str(iso2c))
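
The request above fetches at most one page of limit_per_page results. A hedged sketch of collecting every page, assuming the standard Graph API envelope with a 'data' list and an optional 'paging'/'next' URL (the helper name is illustrative):

import requests

def fetch_all_ads(request_url, request_params):
    # Collect every page of results by following the 'paging.next' cursor.
    results = []
    response = requests.get(request_url, params=request_params).json()
    while True:
        results.extend(response.get('data', []))
        next_url = response.get('paging', {}).get('next')
        if not next_url:
            break
        response = requests.get(next_url).json()
    return results
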
Code example #24
0
    def facebook_user_timeline(self, seed_id, username, nsid):
        """This function will scrape the user timeline"""
        log.debug("Harvesting user %s with seed_id %s.", username, seed_id)
        # make sure either username or nsid is present to start scraping
        assert username or nsid

        # Possibly look up username
        if username and not nsid:

            log.debug("No FB userid, retrieving it")

            nsid = self.get_fbid(username)

        if nsid:
            # report back whether user id was found
            log.info("FB userid %s", nsid)
            # todo - need to add timeout and what to do if blocked
            # todo - post ids will sometimes be empty, account for that for incremental

            incremental = self.message.get("options",
                                           {}).get("incremental", False)
            harvest_media = self.message.get("options",
                                             {}).get("harvest_media", False)

            if incremental:
                # search for since_id of post
                since_id = self.state_store.get_state(
                    __name__, u"timeline.{}.since_id".format(nsid))

            scrape_result = []

            for post in facebook_scraper.get_posts(nsid,
                                                   pages=self.pages,
                                                   extra_info=True,
                                                   timeout=20):
                scrape_result.append(post)
                self.result.harvest_counter["posts"] += 1
                self.result.increment_stats("posts")

                if harvest_media and post[
                        'images']:  #last condition avoids parsing empty lists (i.e. no media)
                    log.info("Harvesting media from post")
                    # get media content from links - should automatically be caught within warc stream
                    # all photos on fb are jpgs, so the list comprehension checks whether this is the case
                    # for the stream, if not (e.g. video) it will not harvest
                    [
                        self._harvest_media_url(media_url)
                        for media_url in post['images'] if 'jpg' in media_url
                    ]

                if incremental and post["post_id"] == since_id:
                    log.info(
                        "Stopping, found last post that was previously harvested with id: %s",
                        post["post_id"])
                    break

            # filename will later be converted to path
            # replicating pattern from https://github.com/internetarchive/warcprox/blob/f19ead00587633fe7e6ba6e3292456669755daaf/warcprox/writer.py#L69
            # create random token for filename
            random_token = ''.join(
                random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
            serial_no = '00000'
            file_name = safe_string(
                self.message["id"]) + "-" + warcprox.timestamp17(
                ) + "-" + serial_no + "-" + random_token

            with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"),
                      "wb") as result_warc_file:
                log.info("Writing json-timeline result to path %s",
                         self.warc_temp_dir)
                writer = WARCWriter(result_warc_file, gzip=True)

                def json_date_converter(o):
                    """ Converts datetime.datetime items in facebook_scraper result
                    to a format suitable for json.dumps"""
                    if isinstance(o, datetime.datetime):
                        return o.__str__()

                json_payload = json.dumps(scrape_result,
                                          default=json_date_converter,
                                          ensure_ascii=False).encode("utf-8")

                record = writer.create_warc_record(
                    username,
                    'metadata',
                    payload=BytesIO(json_payload),
                    warc_content_type="application/json")
                writer.write_record(record)
                log.info("Writing scraped results to %s", self.warc_temp_dir)

            # write to state store (incremental was already read from the message options above)
            key = "timeline.{}.since_id".format(nsid)
            max_post_time = scrape_result[0].get("time")
            max_post_id = scrape_result[0].get("post_id")

            assert max_post_time and max_post_id

            if incremental:
                self.state_store.set_state(__name__, key, max_post_id)
                log.info("Wrote first scraped post to state_store")

        else:
            msg = "NSID not found for user {}".format(username)
            log.exception(msg)
            self.result.warnings.append(
                Msg(CODE_UID_NOT_FOUND, msg, seed_id=seed_id))
Code example #25
0
File: site_crawler.py    Project: alvations/linguacrawl
class SiteCrawler(object):
    def __init__(self,
                 priority,
                 multi_site_crawler,
                 seed_urls,
                 domain,
                 config,
                 scout=None):
        # Multi-site crawler object that manages current crawler
        self.multi_site_crawler = multi_site_crawler

        # Concurrency lock to ensure that only one process accesses URL lists (pending, visited and attempts)
        self.url_list_concurrency_lock = Lock()
        # Concurrency lock to ensure that only one process accesses to write the status and the output WARC file
        self.file_write_concurrency_lock = Lock()
        # If verbose is True, debugging level is set to INFO; otherwise it is ERROR
        logging.basicConfig(
            level=logging.INFO if config["verbose"] else logging.ERROR)

        # Domain corresponding to the seed URLs to be crawled
        self.domain = domain
        # Accepted TLDs in the crawl
        self.tlds = config["accepted_tlds"]

        # Set of URLs that have been already crawled
        self.visited = set()
        # Map that counts the number of times a URL is visited and could not be accessed
        self.attempts = {}
        # Links that must not be re-crawled until some time has passed
        self.asleep_links = {}
        # Maximum number of attempts to visit a website and receiving an error until it is discarded
        self.max_attempts = config["max_attempts"]

        # Maximum depth of the folder tree explored during crawling
        self.max_folder_tree_depth = config["max_folder_tree_depth"]
        # Accepted content type (for example: text/html)
        self.accepted_content_type = config["accepted_content"]
        # List of regular expressions to discard URLs
        self.url_blacklist_re = config["url_blacklist"]

        # If interrupt is set to False, crawling stops
        self.interrupt = False
        self.sleep_thread = None

        # Variable that keeps the current size of the crawling
        self.crawl_size = 0.0
        # Priority of the process when added to the queue that manages all the crawlers in MultiSiteCrawler
        self.priority = priority

        # Path to the file that stores crawling state dump
        self.dumpfile = config["output_dir"] + "/" + self.domain + ".state"
        # If a path is provided, the previous crawling status is restored to resume crawling
        if config["resume_crawling"]:
            self.load_status(pickle.load(open(self.dumpfile, 'rb')))
        # Path to the file where the WARC is written
        output_file_name = config["output_dir"] + "/" + self.domain + ".warc.gz"
        metadata_output_file_name = config[
            "output_dir"] + "/" + self.domain + ".metadata.gz"
        name_counter = 1
        while os.path.isfile(output_file_name):
            output_file_name = config[
                "output_dir"] + "/" + self.domain + "." + str(
                    name_counter) + ".warc.gz"
            metadata_output_file_name = config[
                "output_dir"] + "/" + self.domain + "." + str(
                    name_counter) + ".metadata.gz"
            name_counter += 1
        f_out = open(output_file_name, 'wb')
        self.writer = WARCWriter(f_out, gzip=True)
        self.metadata_writer = gzip.open(metadata_output_file_name, "wb")

        # Scout object that will determine if the website is promising and if crawling should be interrupted
        self.scout = scout
        # The user will only keep documents in these languages
        self.langs_of_interest = config["langs_of_interest"]

        # User agent of the crawl
        self.user_agent = config["user_agent"]
        # Connection timeout
        self.conn_timeout = config["connection_timeout"]
        # Setting default crawling delay
        self.default_delay = config["crawl_delay"]
        # Init list of pending URLs from seed URLs; every URL is checked to confirm that it can be visited
        self.pending_urls = []
        # Robots parser: it is initialised from the first valid seed URL found
        self.robots = SiteRobots(self.user_agent, self.default_delay,
                                 self.conn_timeout)
        self.url_list_concurrency_lock.acquire()
        for url in seed_urls:
            if url.is_valid():
                self.add_url_to_list(url)
        self.url_list_concurrency_lock.release()

        # Maximum crawling size for this site
        if "max_size_per_site" not in config:
            self.max_size = None
        else:
            self.max_size = config["max_size_per_site"]
        # Maximum crawling time for this site
        if "max_time_per_site" not in config:
            self.max_time = None
        else:
            self.max_time = config["max_time_per_site"]
        # Starting time of the crawling; it is used to decide when max_time is reached
        self.starts = int(time.time())
        # Time of the last connection; it is used to make sure that delay is fulfilled
        self.last_connection = self.starts - self.default_delay

    def extend_url_list(self, url_list):
        self.url_list_concurrency_lock.acquire()
        for u in url_list:
            self.add_url_to_list(u)
        self.url_list_concurrency_lock.release()

    # Adding URL to the list of URLs to be visited during crawling; before doing so, checks if it was already visited or
    # if it infringes TLD restrictions
    def add_url_to_list(self, url):
        if not url.is_valid():
            logging.info('"%s" is not a valid URL', url.get_norm_url())
            return
        if url.get_norm_url() in self.visited or url in self.pending_urls:
            logging.info(
                '"%s" already used before (it may be pending of crawling)',
                url.get_norm_url())
        else:
            logging.info('"%s" added to pending URLs', url.get_norm_url())
            self.pending_urls.append(url)

    def get_pending_url(self):
        url = None
        try:
            self.url_list_concurrency_lock.acquire()
            sleeping_urls = []
            while len(self.pending_urls) > 0 and url is None:
                # Next URL is picked from the list of pending URLs and is added to the list of visited URLs
                tmp_url = self.pending_urls.pop()
                if tmp_url.wait_until is not None and tmp_url.wait_until > time.time():
                    sleeping_urls.append(tmp_url)
                else:
                    self.visited.add(tmp_url.get_norm_url())
                    url = tmp_url
            self.pending_urls.extend(sleeping_urls)
        finally:
            self.url_list_concurrency_lock.release()
        #threading.current_thread().name = "crawling: "+url.get_norm_url()
        return url

    def _process_link(self, link, url):
        logging.debug("\t\t" + threading.current_thread().name +
                      "--- going to process " + link.get_norm_url())
        # URLs longer than the limit set by RFC 7230 are discarded
        if not link.is_valid():
            return None
        # Filter url using URL blacklist_re
        for f in self.url_blacklist_re:
            if re.search(f, link.get_norm_url()):
                return None

        if self.domain == link.get_domain():
            logging.debug("\t\t" + threading.current_thread().name +
                          "--- adding URL to list " + link.get_norm_url())
            self.url_list_concurrency_lock.acquire()
            self.add_url_to_list(link)
            self.url_list_concurrency_lock.release()
            return link
        elif link.get_tld() in self.tlds:
            self.url_list_concurrency_lock.acquire()
            if link.get_norm_url() in self.visited:
                logging.info('"%s" already used to extend list of seed URLs',
                             link.get_norm_url())
                self.url_list_concurrency_lock.release()
            else:
                logging.info('"%s" used to extend list of seed URLs',
                             link.get_norm_url())
                self.visited.add(link.get_norm_url())
                self.url_list_concurrency_lock.release()
                self.multi_site_crawler.extend_seed_urls(link)
            return link
        else:
            logging.info('"%s" discarded: not in the same TLD',
                         link.get_norm_url())
            return None

    def _calc_depth(self, url):
        # calculate url depth
        return len(
            url.replace('https', 'http').replace(
                self.root_url, '').rstrip('/').split('/')) - 1

    def connect_to_server(self, url):
        res = None
        try:
            logging.info('Connecting to: %s', url.get_norm_url())
            self.last_connection = time.time()
            # Connections are done with a delay to avoid blocking the server
            if url.get_url_parts().scheme == 'http':
                try:
                    conn = http.client.HTTPConnection(
                        url.get_url_parts().netloc, timeout=self.conn_timeout)
                except:
                    conn = http.client.HTTPSConnection(
                        url.get_url_parts().netloc, timeout=self.conn_timeout)
            else:
                conn = http.client.HTTPSConnection(url.get_url_parts().netloc,
                                                   timeout=self.conn_timeout)
            logging.info('Connection obtained: %s', url.get_norm_url())

            conn.request('GET',
                         quote(url.get_url_parts().path, '?=&%/'),
                         headers={'User-Agent': self.user_agent})
            logging.info('Get request set %s', url.get_norm_url())

            res = conn.getresponse()

            logging.info('Response obtained from: %s', url.get_norm_url())
        except (http.client.HTTPException, EnvironmentError) as e:
            logging.info("HTTPException!")
            conn = None
            self.process_failed_url(url)
        except socket.timeout:
            logging.info("Socket timeout!")
            if conn is not None:
                conn.close()
            self.process_failed_url(url)
        except ssl.CertificateError:
            logging.info("CertificateError!")
            if conn is not None:
                conn.close()
            self.process_failed_url(url)
        except ConnectionResetError:
            logging.info("ConnectionResetError!")
            if conn is not None:
                conn.close()
            self.process_failed_url(url)
        except Exception as ex:
            logging.info(str(ex))
            if conn is not None:
                conn.close()
        if conn is None:
            logging.info('Connection is closed')
        else:
            logging.info('Connection is correct')
        return conn, res

    # The method returns True if the response status is 2XX and the document should be processed; otherwise it takes
    # the corresponding action (manage redirects or errors)
    def deal_with_response_status(self, url, response):
        if 200 <= response.status <= 226:
            return True
        elif 301 <= response.status <= 308:
            rlink = self._process_link(Link(response.getheader('location')),
                                       url)
            if rlink is not None:
                logging.info('%s Redirect: %s -> %s',
                             threading.current_thread().name,
                             url.get_norm_url(), rlink.get_norm_url())
        elif 400 <= response.status <= 407 or 409 <= response.status <= 412 or 414 <= response.status <= 427 or 431 <= response.status:
            self.process_failed_url(url, retry=False)
        elif response.status == 408:
            self.process_failed_url(url, retry=True)
        elif response.status == 413 or response.status == 428:
            waiting_time = response.getheader('Retry-After')
            if waiting_time is None:
                url.wait_until = time.time() + 500
            else:
                url.wait_until = time.time() + int(waiting_time)
            self.process_failed_url(url, retry=True)
        else:
            self.process_failed_url(url, retry=False)
        return False

    def crawl_one_page(self):
        self.multi_site_crawler.new_running_crawler()
        url = self.get_pending_url()
        if not self.interrupt and url is not None:
            if not self.robots.fetch(url, self.max_attempts, self.domain):
                logging.info("robots.txt forbids crawling URL: %s",
                             url.get_norm_url())
                return
            logging.debug("\t" + threading.current_thread().name +
                          " >>>> Connecting " + url.get_norm_url() + "...")
            connection, server_response = self.connect_to_server(url)
            logging.debug("\t" + threading.current_thread().name +
                          "<<<< Connected " + url.get_norm_url())

            # If response is 2XX, the web page is processed
            if server_response is not None and self.deal_with_response_status(
                    url, server_response):
                # Check content type
                content_type = server_response.getheader('Content-Type')
                logging.debug("\t" + threading.current_thread().name +
                              "<<<< Content type: " + str(content_type))
                doc = None
                if content_type is not None and not re.search(
                        self.accepted_content_type, content_type):
                    logging.info("%s discarded: wrong file type",
                                 url.get_norm_url())
                else:
                    logging.debug("\t" + threading.current_thread().name +
                                  ">>>> Extracting doc from " +
                                  url.get_norm_url())
                    doc = WebDocument(server_response, url, self.max_attempts)
                    logging.debug("\t" + threading.current_thread().name +
                                  "<<<< Document extracted " +
                                  url.get_norm_url())
                connection.close()
                logging.debug("\t" + threading.current_thread().name +
                              "<<<< Connection closed: " + url.get_norm_url())

                if doc is not None:
                    if doc.utf_text:
                        links_set = doc.get_link_set()
                        # We could shuffle the links to avoid being biased by the structure of the site
                        # random.shuffle(links_set)
                        listoflinks = []
                        for li in links_set:
                            listoflinks.append(li.get_norm_url())
                        logging.debug("\t" + threading.current_thread().name +
                                      "<<<< Processing " +
                                      str(len(links_set)) + " links... " +
                                      url.get_norm_url() + "... " +
                                      " ".join(listoflinks))
                        for link in links_set:
                            self._process_link(link, doc.url)
                        logging.debug("\t" + threading.current_thread().name +
                                      "<<<< Links processed " +
                                      url.get_norm_url())

                        if doc.get_lang() is None or not doc.get_lang(
                        ).is_reliable:
                            logging.info(
                                "%s discarded: language detection is not reliable",
                                url.get_norm_url())
                        elif doc.get_lang(
                        ).language not in self.langs_of_interest:
                            logging.info(
                                "%s discarded: language not among languages of interest (detected=%s)",
                                url.get_norm_url(),
                                doc.get_lang().language)
                        else:
                            logging.debug("\t" +
                                          threading.current_thread().name +
                                          ">>>> Running scout " +
                                          url.get_norm_url())
                            self.run_scout(doc)
                            logging.debug("\t" +
                                          threading.current_thread().name +
                                          "<<<< Scout run " +
                                          url.get_norm_url())
                            # The document is written to the warc
                            logging.debug("\t" +
                                          threading.current_thread().name +
                                          ">>>> Write document " +
                                          url.get_norm_url())
                            self.write_document(doc)
                            logging.debug("\t" +
                                          threading.current_thread().name +
                                          "<<<< Document saved " +
                                          url.get_norm_url())
                else:
                    logging.debug("\t" + threading.current_thread().name +
                                  "<<<< Document was none: " +
                                  url.get_norm_url())

            else:
                logging.debug("\t" + threading.current_thread().name +
                              "<<<< Connection was none")

                if connection is not None:
                    connection.close()

            if self.max_size is not None and self.crawl_size > self.max_size:
                self.interrupt_crawl()
            elif self.max_time is not None and time.time() - self.starts > self.max_time:
                self.interrupt_crawl()
            elif len(self.pending_urls) == 0:
                self.interrupt = True
        # If the crawler is allowed to continue crawling, wait until delay has passed and continue
        if not self.interrupt:
            self.sleep_thread = Thread(target=self._wait_and_queue)
            self.sleep_thread.daemon = False
            self.sleep_thread.name = self.sleep_thread.name + "_sleep"
            self.sleep_thread.start()
        else:
            self.multi_site_crawler.new_done_crawler()

    def _wait_and_queue(self):
        sleeptime = self.robots.get_delay() - (time.time() -
                                               self.last_connection)
        if sleeptime > 0:
            time.sleep(sleeptime)
        self.multi_site_crawler.crawler_ready(self)
        self.multi_site_crawler.new_done_crawler()

    # Scout is run until its recommendation is ready; once it is, the scout object is deleted
    def run_scout(self, doc):
        if self.scout is not None:
            self.scout.step(doc)
            if self.scout.recommendation_ready():
                if not self.scout.recommendation_keep_crawling():
                    logging.info(
                        "Website discarded after crawling %s due to infringement of scout rule",
                        doc.url.get_norm_url())
                    self.interrupt = True
                else:
                    logging.info(
                        "Scout recommends keep crawling website after downloading %s; langs of interest found: %s",
                        doc.url.get_norm_url(), str(self.scout.lang_evidence))
                self.scout = None

    def process_failed_url(self, url, retry=True):
        if not retry:
            self.url_list_concurrency_lock.acquire()
            self.visited.add(url.get_norm_url())
            self.url_list_concurrency_lock.release()
            logging.info('%s: the URL does not exist', url.get_norm_url())
        else:
            if url.get_norm_url() not in self.attempts:
                self.url_list_concurrency_lock.acquire()
                self.add_url_to_list(url)
                self.attempts[url.get_norm_url()] = 1
                self.visited.remove(url.get_norm_url())
                self.url_list_concurrency_lock.release()
                logging.info('%s: retrying (attempt 1)', url.get_norm_url())
            else:
                if self.attempts[url.get_norm_url()] <= self.max_attempts:
                    logging.info('%s: retrying (attempt %s)', url.get_norm_url(),
                                 str(self.attempts[url.get_norm_url()] + 1))
                    self.url_list_concurrency_lock.acquire()
                    self.add_url_to_list(url)
                    self.attempts[url.get_norm_url()] += 1
                    self.visited.remove(url.get_norm_url())
                    self.url_list_concurrency_lock.release()
                else:
                    self.url_list_concurrency_lock.acquire()
                    del self.attempts[url.get_norm_url()]
                    self.visited.add(url.get_norm_url())
                    self.url_list_concurrency_lock.release()
                    logging.info('%s: given up after %s attempts',
                                 url.get_norm_url(), str(self.max_attempts))

    def write_document(self, doc):
        self.file_write_concurrency_lock.acquire()
        try:
            headers_list = doc.response.getheaders()
            http_headers = StatusAndHeaders('200 OK',
                                            headers_list,
                                            protocol='HTTP/1.0')
            norm_url = doc.url.get_norm_url()
            record = self.writer.create_warc_record(norm_url,
                                                    'response',
                                                    payload=io.BytesIO(
                                                        doc.text),
                                                    http_headers=http_headers)
            self.writer.write_record(record)
            self.crawl_size += sys.getsizeof(doc.text) / 1000000.0
            if self.metadata_writer is not None:
                self.metadata_writer.write(
                    ("%s\t%s\t%s\n" % (doc.url.get_norm_url(), str(
                        doc.encoding), str(doc.get_lang()))).encode())
                self.metadata_writer.flush()
        finally:
            self.file_write_concurrency_lock.release()

    def get_status_object(self):
        targets = []
        for u in self.pending_urls:
            targets.append(u.get_norm_url())
        return {
            'visited': self.visited,
            'pendingurls': targets,
            'attempts': self.attempts
        }

    def load_status(self, status_obj):
        try:
            self.file_write_concurrency_lock.acquire()
            self.visited = status_obj['visited']
            self.pending_urls = []
            for u in status_obj['pendingurls']:
                self.pending_urls.append(Link(u))
            self.attempts = status_obj['attempts']
        finally:
            self.file_write_concurrency_lock.release()

    def save_status(self):
        try:
            self.file_write_concurrency_lock.acquire()
            if self.dumpfile is not None:
                pickle.dump(self.get_status_object(),
                            open(self.dumpfile, 'wb'))
        finally:
            self.file_write_concurrency_lock.release()

    def interrupt_crawl(self):
        try:
            self.url_list_concurrency_lock.acquire()
            self.interrupt = True
            self.save_status()
            self.metadata_writer.close()
        finally:
            self.url_list_concurrency_lock.release()

    def __hash__(self):
        return hash(self.domain)

    def one_thread_less(self):
        self.threads += 1
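
A possible resume sketch for the save_status/load_status pair above: save_status() pickles the dict returned by get_status_object() to self.dumpfile, so restoring it only requires unpickling and calling load_status(). The crawler instance itself is constructed elsewhere and is assumed here.

import pickle

# Minimal resume sketch (assumption: `crawler` is an already-constructed instance
# of the class above and its dumpfile was written by a previous save_status() call).
with open(crawler.dumpfile, 'rb') as f:
    crawler.load_status(pickle.load(f))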
Code example #26
0
File: mhtml2warc.py Project: ikreymer/mhtml-warc
class MHTML2WARC:
    logger = logging.getLogger(__name__)

    def __init__(self, writer, gzip=True):
        self.fh = None
        self.writer = None
        self.filename = 'unknown'
        self.is_first = True

        if isinstance(writer, BaseWARCWriter):
            self.writer = writer
        elif isinstance(writer, str):
            self.fh = open(writer, 'wb')
            self.filename = writer
            self.writer = WARCWriter(self.fh, gzip=gzip)
        elif hasattr(writer, 'write'):
            self.writer = WARCWriter(writer, gzip=gzip)
        else:
            raise Exception('writer is in an unknown format')

    def parse(self, input_):
        if isinstance(input_, str):
            with open(input_, 'rb') as rfh:
                message = email.message_from_binary_file(rfh, policy=email.policy.strict)
        elif hasattr(input_, 'read'):
            message = email.message_from_binary_file(input_, policy=email.policy.strict)
        else:
            raise Exception('input is in an unknown format')

        if not message.is_multipart():
            raise Exception('Invalid MHTML -- not multipart')


        main_url = message.get('Snapshot-Content-Location', '')

        warc_date = self.write_warc_info(message)

        for part in message.walk():
            if part.get_content_type() == 'multipart/related':
                continue

            self.write_resource(part, main_url, warc_date)

    def write_resource(self, part, main_url, warc_date):
        content_type = part.get_content_type()
        main_type = part.get_content_maintype()
        content = part.get_payload(decode=True)

        url = part.get('Content-Location')

        warc_headers = {'WARC-Date': warc_date,
                        'WARC-Creation-Date': self.writer._make_warc_date(),
                       }

        content_id = part.get('Content-ID')
        write_redir = False

        if content_id:
            warc_headers['Content-ID'] = content_id

            cid_url = 'cid:' + content_id[1:-1]

            # only write main page url once under url
            # there may be additional frames for same url
            # only write them under cid
            if url == main_url:
                if self.is_first:
                    self.is_first = False
                else:
                    url = None

            if not url:
                # if cid: URLs are not allowed, skip this resource
                # (allow_cid_urls is assumed to be defined elsewhere; it is not shown in this snippet)
                if not allow_cid_urls:
                    return
                url = cid_url
            else:
                write_redir = True


        record = self.writer.create_warc_record(url, 'resource',
                                  payload=BytesIO(content),
                                  length=len(content),
                                  warc_content_type=content_type,
                                  warc_headers_dict=warc_headers)

        self.writer.write_record(record)

        if write_redir and allow_cid_urls:
            self.add_cid_redirect(cid_url, url)

    def add_cid_redirect(self, cid_url, url):
        msg = b'redirect'

        headers_list = [('Content-Type', 'text/plain'),
                        ('Content-Length', str(len(msg))),
                        ('Location', url)]

        http_headers = StatusAndHeaders('302 Redirect', headers_list, protocol='HTTP/1.0')

        record = self.writer.create_warc_record(cid_url, 'response',
                                  length=len(msg),
                                  payload=BytesIO(msg),
                                  http_headers=http_headers)

        self.writer.write_record(record)

    def write_warc_info(self, message):
        creator = message.get('From', '')

        url = message.get('Snapshot-Content-Location', '')

        title = message.get('Subject', url)


        try:
            actual_date = http_date_to_datetime(message['Date'])
            timestamp = datetime_to_timestamp(actual_date)
        except Exception:
            actual_date = ''
            timestamp = ''

        source = 'MHTML Snapshot for: ' + url

        software = 'mhtml2warc ' + str(__version__)

        metadata = {'title':  source,
                    'type': 'recording',
                    'pages': [{'title': title,
                               'url': url,
                               'timestamp': timestamp}]
                   }

        params = OrderedDict([('software', software),
                              ('creator', creator),
                              ('source', source),
                              ('format', 'WARC File Format 1.0'),
                              ('subject', title),
                              ('json-metadata', json.dumps(metadata))])

        record = self.writer.create_warcinfo_record(self.filename, params)

        if actual_date:
            actual_date = datetime_to_iso_date(actual_date)

            creation_date = record.rec_headers.get('WARC-Date')
            record.rec_headers.replace_header('WARC-Date', actual_date)
            record.rec_headers.replace_header('WARC-Creation-Date', creation_date)

        self.writer.write_record(record)

        return actual_date
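
A minimal usage sketch for the converter above, assuming the class and its module-level helpers (allow_cid_urls, http_date_to_datetime, __version__, ...) are importable; the file names are placeholders.

# Sketch only: 'page.mhtml' and 'page.warc.gz' are placeholder file names.
# Passing a string makes MHTML2WARC open the output file itself.
MHTML2WARC('page.warc.gz', gzip=True).parse('page.mhtml')

# Alternatively, reuse an existing warcio writer (e.g. to append to a larger archive).
with open('combined.warc.gz', 'wb') as fh:
    MHTML2WARC(WARCWriter(fh, gzip=True)).parse('page.mhtml')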
Code example #27
0
class CCWARCWriter:
    def __init__(self,
                 prefix,
                 max_size,
                 subprefix=None,
                 gzip=True,
                 get_serial=None):
        self.writer = None
        self.prefix = prefix
        self.subprefix = subprefix
        self.max_size = max_size
        self.gzip = gzip
        self.hostname = socket.gethostname()
        if get_serial is not None:
            self.external_get_serial = get_serial
        else:
            self.external_get_serial = None
            self.serial = 0

    def __del__(self):
        if self.writer is not None:
            self.f.close()

    def create_default_info(self,
                            version,
                            warcheader_version,
                            ip,
                            description=None,
                            creator=None,
                            operator=None):
        '''
        creator:  # person, organization, service
        operator:  # person, if creator is an organization
        isPartOf:  # name of the crawl
        '''
        info = OrderedDict()

        info['software'] = ('cocrawler/' + version +
                            ' cocrawler_warcheader_version/' + warcheader_version)
        info['hostname'] = self.hostname
        info['ip'] = ip
        if description:
            info['description'] = description
        if creator:
            info['creator'] = creator
        if operator:
            info['operator'] = operator
        info['isPartOf'] = self.prefix  # intentionally does not include subprefix
        info['format'] = 'WARC file version 1.0'
        self.info = info
        return info

    def open(self):
        filename = self.prefix
        if self.subprefix:
            filename += '-' + str(self.subprefix)  # don't let yaml leave this as an int
        serial = self.get_serial(filename)
        filename += '-' + serial + '-' + self.hostname + '.warc'
        if self.gzip:
            filename += '.gz'
        self.filename = filename
        self.f = open(filename, 'wb')
        self.writer = WARCWriter(self.f, gzip=self.gzip)
        record = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(record)

    def get_serial(self, filename):
        if self.external_get_serial is not None:
            return self.external_get_serial(filename)
        self.serial += 1
        return '{:06}'.format(self.serial - 1)

    def maybe_close(self):
        '''
        TODO: always close/reopen if subprefix is not None; to minimize open filehandles?
        '''
        fsize = os.fstat(self.f.fileno()).st_size
        if fsize > self.max_size:
            self.f.close()
            self.writer = None

    def write_dns(self, dns, ttl, url):
        # write it out even if empty
        # TODO: we filter the addresses early; should we warc the unfiltered dns response?

        # the response object doesn't contain the query type 'A' or 'AAAA'
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'  # fixme IPV6

        ttl = int(ttl)
        host = url.hostname

        if self.writer is None:
            self.open()

        payload = timestamp_now() + '\r\n'

        for r in dns:
            try:
                payload += '\t'.join(
                    (host + '.', str(ttl), 'IN', kind, r['host'])) + '\r\n'
            except Exception as e:
                LOGGER.info('problem converting dns reply for warcing: host=%s reply=%r error=%s',
                            host, r, e)
        payload = payload.encode('utf-8')

        record = self.writer.create_warc_record('dns:' + host,
                                                'resource',
                                                payload=BytesIO(payload),
                                                warc_content_type='text/dns',
                                                length=len(payload))

        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s',
                     p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)

    def _fake_resp_headers(self, resp_headers, body_len, decompressed=False):
        prefix = b'X-Crawler-'
        ret = []
        for h, v in resp_headers:
            hl = h.lower()
            if hl == b'content-length':
                if not (v.isdigit() and int(v) == body_len):
                    ret.append((prefix + h, v))
                    ret.append((b'Content-Length', str(body_len).encode()))  # keep the value as bytes, matching the other entries
            elif hl == b'content-encoding':
                if decompressed:
                    ret.append((prefix + h, v))
                else:
                    ret.append((h, v))
            elif hl == b'transfer-encoding':
                if v.lower() == b'chunked':
                    # aiohttp always undoes chunking
                    ret.append((prefix + h, v))
                else:
                    ret.append((h, v))
            else:
                ret.append((h, v))
        return ret

    def write_request_response_pair(self,
                                    url,
                                    ip,
                                    req_headers,
                                    resp_headers,
                                    is_truncated,
                                    payload,
                                    digest=None,
                                    decompressed=False):
        if self.writer is None:
            self.open()

        req_http_headers = StatusAndHeaders('GET / HTTP/1.1', req_headers)

        # record the request against the actual target URI
        request = self.writer.create_warc_record(url,
                                                 'request',
                                                 http_headers=req_http_headers)

        fake_resp_headers = self._fake_resp_headers(resp_headers,
                                                    len(payload),
                                                    decompressed=decompressed)
        resp_http_headers = StatusAndHeaders('200 OK',
                                             fake_resp_headers,
                                             protocol='HTTP/1.1')

        warc_headers_dict = OrderedDict()
        if ip is not None:
            # ip should be here unless we crawl through a proxy
            warc_headers_dict['WARC-IP-Address'] = ip
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                LOGGER.error('Invalid is_truncated value of %s', is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(
            url,
            'response',
            payload=BytesIO(payload),
            length=len(payload),
            warc_headers_dict=warc_headers_dict,
            http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s',
                     p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)
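
A minimal construction sketch for the class above; the prefix, subprefix, IP and size values are placeholders, and the write_* methods additionally rely on module-level helpers from the original project (LOGGER, stats, p, timestamp_now, valid_truncations) that are assumed to be importable.

# Placeholder values throughout; create_default_info() must be called before
# open(), because open() writes the warcinfo record from self.info.
w = CCWARCWriter('CC-TEST', max_size=10**9, subprefix='segment0', gzip=True)
w.create_default_info(version='0.1', warcheader_version='1', ip='203.0.113.7',
                      description='test crawl', creator='example org')
w.open()  # creates CC-TEST-segment0-000000-<hostname>.warc.gz and writes the warcinfo record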
Code example #28
0
    else:
        record_type = 'response'
        http_headers = record.http_headers
        # Transfer-Encoding: chunked header causes error with giawarc
        http_headers.remove_header("Transfer-Encoding")
        try:
            http_headers.to_ascii_bytes()
        except UnicodeEncodeError:
            # if a header is non-ASCII, create a new header with the status code only;
            # Content-Length and Content-Type will be filled in before writing
            http_headers = StatusAndHeaders(record.http_headers.get_statuscode(), [])

    # Extract payloads (XML) from non-HTML document formats
    content_type_header = record.http_headers.get_header('Content-Type') if record.http_headers is not None else None
    if url.endswith(".pdf") or (content_type_header is not None and "application/pdf" in content_type_header):
        if options.pdfpass:
            new_record = po.create_warc_record(uri=url, record_type=record_type,
                                               warc_content_type=record.content_type,
                                               payload=BytesIO(payload), http_headers=http_headers)
            po.write_record(new_record)
            continue  # do not process the PDF any further
        if options.pdfextract:
            payloads = pdfextract(payload, extractor)
        else:
            payloads = pdf2html(payload)
    elif url.endswith((".odt", ".ods", ".odp")):
        payloads = openoffice2html(payload)
    elif url.endswith((".docx", ".pptx", ".xlsx")):
        payloads = office2html(payload)
    elif url.endswith(".epub"):
        payloads = epub2html(payload)
    else:
        payloads = [payload]
Code example #29
0
with open('example.warc.gz', 'wb') as output:
    writer = WARCWriter(output, gzip=True)

    resp = requests.get('http://example.com/',
                        headers={'Accept-Encoding': 'identity'},
                        stream=True)

    # get raw headers from urllib3
    headers_list = resp.raw.headers.items()

    http_headers = StatusAndHeaders('200 OK',
                                    headers_list,
                                    protocol='HTTP/1.0')
    record = writer.create_warc_record('http://example.com/',
                                       'response',
                                       payload=resp.raw,
                                       http_headers=http_headers)

    writer.write_record(record)

all_posts = []

for post in facebook_scraper.get_posts(442978589179108,
                                       extra_info=True,
                                       pages=1,
                                       timeout=20):
    print(post['text'][:40])
    all_posts.append(post)
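
The loop above only collects the posts. A possible continuation (not part of the original snippet) appends each one to the archive as a JSON 'resource' record; the urn:facebook:post URI scheme and the post_id key are assumptions for illustration.

import json
from io import BytesIO

# Sketch of a continuation: store each scraped post as a JSON resource record.
# Appending works because each gzipped WARC record is an independent gzip member.
with open('example.warc.gz', 'ab') as output:
    writer = WARCWriter(output, gzip=True)
    for post in all_posts:
        body = json.dumps(post, default=str).encode('utf-8')  # default=str handles datetimes
        record = writer.create_warc_record(
            'urn:facebook:post:{}'.format(post.get('post_id', 'unknown')),
            'resource',
            payload=BytesIO(body),
            warc_content_type='application/json')
        writer.write_record(record)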
Code example #30
0
class WarcHandler(EventHandler):
    __slots__ = ('logger', 'writer', 'documentRecords', 'log', 'maxLogSize',
                 'logEncoding', 'warcinfoRecordId')

    def __init__(self, fd, logger):
        self.logger = logger
        self.writer = WARCWriter(fd, gzip=True)

        self.logEncoding = 'utf-8'
        self.log = BytesIO()
        # max log buffer size (bytes)
        self.maxLogSize = 500 * 1024

        # maps document urls to WARC record ids, required for DomSnapshotEvent
        # and ScreenshotEvent
        self.documentRecords = {}
        # record id of warcinfo record
        self.warcinfoRecordId = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._flushLogEntries()

    def writeRecord(self,
                    url,
                    kind,
                    payload,
                    warc_headers_dict=None,
                    http_headers=None):
        """
        Thin wrapper around writer.create_warc_record and writer.write_record.

        Adds default WARC headers.
        """

        d = {}
        if self.warcinfoRecordId:
            d['WARC-Warcinfo-ID'] = self.warcinfoRecordId
        d.update(warc_headers_dict or {})  # tolerate the None default
        warc_headers_dict = d

        record = self.writer.create_warc_record(
            str(url),
            kind,
            payload=payload,
            warc_headers_dict=warc_headers_dict,
            http_headers=http_headers)
        self.writer.write_record(record)

        return record

    def _writeRequest(self, item):
        logger = self.logger.bind(reqId=item.id)

        req = item.request
        url = item.url

        path = url.relative().with_fragment(None)
        httpHeaders = StatusAndHeaders(f'{req.method} {path} HTTP/1.1',
                                       req.headers,
                                       protocol='HTTP/1.1',
                                       is_http_request=True)
        warcHeaders = {
            'X-Chrome-Initiator': json.dumps(req.initiator),
            'X-Chrome-Request-ID': item.id,
            'WARC-Date': datetime_to_iso_date(req.timestamp),
        }

        body = item.request.body
        if item.request.hasPostData and body is None:
            # oops, don’t know what went wrong here
            logger.error('requestBody missing',
                         uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')
            warcHeaders['WARC-Truncated'] = 'unspecified'
        else:
            warcHeaders['X-Chrome-Base64Body'] = str(type(body) is Base64Body)
            body = BytesIO(body)
        record = self.writeRecord(url,
                                  'request',
                                  payload=body,
                                  http_headers=httpHeaders,
                                  warc_headers_dict=warcHeaders)
        return record.rec_headers['WARC-Record-ID']

    def _writeResponse(self, item, concurrentTo):
        # fetch the body
        reqId = item.id

        # now the response
        resp = item.response
        warcHeaders = {
            'WARC-Concurrent-To': concurrentTo,
            'X-Chrome-Request-ID': item.id,
            'WARC-Date': datetime_to_iso_date(resp.timestamp),
        }
        # conditional WARC headers
        if item.remoteIpAddress:
            warcHeaders['WARC-IP-Address'] = item.remoteIpAddress
        if item.protocol:
            warcHeaders['X-Chrome-Protocol'] = item.protocol

        # HTTP headers
        statusText = (resp.statusText or
                      BaseHTTPRequestHandler.responses.get(
                          resp.status, ('No status text available',))[0])
        httpHeaders = StatusAndHeaders(f'{resp.status} {statusText}',
                                       resp.headers,
                                       protocol='HTTP/1.1')

        # Content is saved decompressed and decoded, remove these headers
        blacklistedHeaders = {'transfer-encoding', 'content-encoding'}
        for h in blacklistedHeaders:
            httpHeaders.remove_header(h)

        # chrome sends nothing but utf8 encoded text. Fortunately HTTP
        # headers take precedence over the document’s <meta>, thus we can
        # easily override those.
        contentType = resp.mimeType
        if contentType:
            if isinstance(resp.body, UnicodeBody):
                contentType += '; charset=utf-8'
            httpHeaders.replace_header('Content-Type', contentType)

        # response body
        body = resp.body
        if body is None:
            warcHeaders['WARC-Truncated'] = 'unspecified'
        else:
            httpHeaders.replace_header('Content-Length', str(len(body)))
            warcHeaders['X-Chrome-Base64Body'] = str(type(body) is Base64Body)
            body = BytesIO(body)

        record = self.writeRecord(item.url,
                                  'response',
                                  warc_headers_dict=warcHeaders,
                                  payload=body,
                                  http_headers=httpHeaders)

        if item.resourceType == 'Document':
            self.documentRecords[item.url] = record.rec_headers.get_header(
                'WARC-Record-ID')

    def _writeScript(self, item):
        writer = self.writer
        encoding = 'utf-8'
        path = item.path or '-'
        self.writeRecord(packageUrl(f'script/{path}'),
                         'metadata',
                         payload=BytesIO(str(item).encode(encoding)),
                         warc_headers_dict={
                             'Content-Type':
                             f'application/javascript; charset={encoding}'
                         })

    def _writeItem(self, item):
        assert item.request
        concurrentTo = self._writeRequest(item)
        # items that failed loading don’t have a response
        if item.response:
            self._writeResponse(item, concurrentTo)

    def _addRefersTo(self, headers, url):
        refersTo = self.documentRecords.get(url)
        if refersTo:
            headers['WARC-Refers-To'] = refersTo
        else:
            self.logger.error(f'No document record found for {url}')
        return headers

    def _writeDomSnapshot(self, item):
        writer = self.writer

        warcHeaders = {
            'X-DOM-Snapshot': str(True),
            'X-Chrome-Viewport': item.viewport,
            'Content-Type': 'text/html; charset=utf-8',
        }

        self._addRefersTo(warcHeaders, item.url)

        self.writeRecord(item.url,
                         'conversion',
                         payload=BytesIO(item.document),
                         warc_headers_dict=warcHeaders)

    def _writeScreenshot(self, item):
        writer = self.writer
        warcHeaders = {
            'Content-Type': 'image/png',
            'X-Crocoite-Screenshot-Y-Offset': str(item.yoff)
        }
        self._addRefersTo(warcHeaders, item.url)
        self.writeRecord(item.url,
                         'conversion',
                         payload=BytesIO(item.data),
                         warc_headers_dict=warcHeaders)

    def _writeControllerStart(self, item):
        payload = BytesIO(
            json.dumps(item.payload, indent=2,
                       cls=StrJsonEncoder).encode('utf-8'))

        writer = self.writer
        warcinfo = self.writeRecord(
            packageUrl('warcinfo'),
            'warcinfo',
            warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'},
            payload=payload)
        self.warcinfoRecordId = warcinfo.rec_headers['WARC-Record-ID']

    def _flushLogEntries(self):
        if self.log.tell() > 0:
            writer = self.writer
            self.log.seek(0)
            # XXX: we should use the 'continuation' record type here
            self.writeRecord(packageUrl('log'),
                             'resource',
                             payload=self.log,
                             warc_headers_dict={
                                 'Content-Type':
                                 f'text/plain; encoding={self.logEncoding}'
                             })
            self.log = BytesIO()

    def _writeLog(self, item):
        """ Handle log entries, called by .logger.WarcHandlerConsumer only """
        self.log.write(item.encode(self.logEncoding))
        self.log.write(b'\n')
        if self.log.tell() > self.maxLogSize:
            self._flushLogEntries()

    route = {
        Script: _writeScript,
        RequestResponsePair: _writeItem,
        DomSnapshotEvent: _writeDomSnapshot,
        ScreenshotEvent: _writeScreenshot,
        ControllerStart: _writeControllerStart,
    }

    async def push(self, item):
        for k, v in self.route.items():
            if isinstance(item, k):
                v(self, item)
                break
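
A minimal driver sketch for the handler above, assuming the class and its dependencies are importable, that a bind()-style logger is available (structlog is used here purely as a stand-in for the project's own logger), and that an iterable of already-built event objects (Script, RequestResponsePair, ...) is supplied; none of those objects are constructed in this snippet.

import asyncio
import structlog  # stand-in; the handler only needs .bind() and the usual log methods

async def archive(events, path='site.warc.gz'):
    # events: iterable of event objects routed by WarcHandler.route
    logger = structlog.get_logger()
    with open(path, 'wb') as fd, WarcHandler(fd, logger) as handler:
        for event in events:
            await handler.push(event)

# asyncio.run(archive(my_events))  # my_events built elsewhere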
Code example #31
0
                        rb'.*<!-- Mirrored from ', b'',
                        re.sub(rb' by HTTrack Website Copier.*', b'', line))
                    date = re.sub(rb'.+by HTTrack Website.+\[.+\][^,]*, ', b'',
                                  re.sub(rb' -->.*', b'', line))
                    break
        if date is None:
            dvalue = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        else:
            try:
                dvalue = parse(
                    date.decode("utf8")).strftime('%Y-%m-%dT%H:%M:%SZ')
            except ValueError:
                dvalue = datetime.now().strftime('%Y-%m-%dT%H:%M:%SZ')
        if url is None:
            urlStr = "unknown"
        else:
            try:
                urlStr = url.decode("utf8")
            except UnicodeDecodeError:
                urlStr = "unknown-encoding"
        with open(filepath, 'rb') as content_file:
            record = writer.create_warc_record(
                urlStr,
                'resource',
                warc_content_type="application/http; msgtype=response",
                payload=content_file)
            # write while the file is still open: the payload stream is read at write time
            writer.write_record(record)