Example 1
 def process(self, item):
     digests = {}
     input_filename = "%(item_dir)s/%(warc_file_base)s.warc" % item
     output_filename = "%(item_dir)s/%(warc_file_base)s-deduplicated.warc.gz" % item
     with open(input_filename, 'rb') as f_in, \
             open(output_filename, 'wb') as f_out:
         writer = WARCWriter(filebuf=f_out, gzip=True)
         for record in ArchiveIterator(f_in):
             url = record.rec_headers.get_header('WARC-Target-URI')
             if url is not None and url.startswith('<'):
                 url = re.search('^<(.+)>$', url).group(1)
                 record.rec_headers.replace_header('WARC-Target-URI', url)
             if record.rec_headers.get_header('WARC-Type') == 'response':
                 digest = record.rec_headers.get_header('WARC-Payload-Digest')
                 if digest in digests:
                     writer.write_record(
                         self._record_response_to_revisit(writer, record,
                                                          digests[digest])
                     )
                 else:
                     digests[digest] = (
                         record.rec_headers.get_header('WARC-Record-ID'),
                         record.rec_headers.get_header('WARC-Date'),
                         record.rec_headers.get_header('WARC-Target-URI')
                     )
                     writer.write_record(record)
             elif record.rec_headers.get_header('WARC-Type') == 'warcinfo':
                 record.rec_headers.replace_header('WARC-Filename', output_filename)
                 writer.write_record(record)
             else:
                 writer.write_record(record)
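
The _record_response_to_revisit helper referenced above is not included in the snippet. A minimal sketch of what it could look like, assuming warcio's create_revisit_record API and the (record-ID, date, URI) tuples stored in digests:

 def _record_response_to_revisit(self, writer, record, stored):
     # stored is the (WARC-Record-ID, WARC-Date, WARC-Target-URI) tuple kept in digests
     refers_to_id, refers_to_date, refers_to_uri = stored
     revisit = writer.create_revisit_record(
         record.rec_headers.get_header('WARC-Target-URI'),
         digest=record.rec_headers.get_header('WARC-Payload-Digest'),
         refers_to_uri=refers_to_uri,
         refers_to_date=refers_to_date,
         http_headers=record.http_headers)
     revisit.rec_headers.add_header('WARC-Refers-To', refers_to_id)
     return revisit
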
Example 2
    def run(self):

        with open(self.warcfile, 'ab') as output:
            while True:
                self.lock.acquire()
                data = self.out_queue.get()
                writer = WARCWriter(output, gzip=False)
                headers_list = data[0]
                http_headers = StatusAndHeaders('{} {}'.format(data[3], data[4]), headers_list, protocol='HTTP/1.0')
                record = writer.create_warc_record(data[2], 'response', payload=data[1], http_headers=http_headers)
                h = hashlib.sha1()
                h.update(record.raw_stream.read(BLOCK_SIZE))
                if self.dedup.lookup(h.hexdigest()):
                    record = writer.create_warc_record(data[2], 'revisit',
                                                       http_headers=http_headers)
                    writer.write_record(record)
                    self.out_queue.task_done()
                    self.lock.release()
                else:
                    self.dedup.save(h.hexdigest(), data[2])
                    record.raw_stream.seek(0)
                    writer.write_record(record)
                    self.out_queue.task_done()
                    self.lock.release()
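
The dedup object used above (with its lookup and save methods) is not shown in this snippet. A minimal in-memory stand-in, assuming only the interface the loop relies on, could look like this:

class InMemoryDedup:
    """Minimal stand-in for the deduplication store: maps payload digests to URLs."""
    def __init__(self):
        self._seen = {}

    def lookup(self, digest):
        # Return the URL previously stored for this digest, or None if unseen
        return self._seen.get(digest)

    def save(self, digest, url):
        self._seen[digest] = url
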
Example 3
    # 'writer' is a WARCWriter created earlier in this example
    resp = requests.get('http://example.com/',
                        headers={'Accept-Encoding': 'identity'},
                        stream=True)

    # get raw headers from urllib3
    headers_list = resp.raw.headers.items()

    http_headers = StatusAndHeaders('200 OK',
                                    headers_list,
                                    protocol='HTTP/1.0')
    print(resp.raw)
    record = writer.create_warc_record('http://example.com/',
                                       'response',
                                       payload=resp.raw,
                                       http_headers=http_headers)

    writer.write_record(record)

#quit()

all_posts = []

for post in facebook_scraper.get_posts(442978589179108,
                                       extra_info=True,
                                       pages=1,
                                       timeout=20):
    print(post['text'][:40])
    all_posts.append(post)

print(all_posts)

Example 4
class CCWARCWriter:
    def __init__(self,
                 prefix,
                 max_size,
                 subprefix=None,
                 gzip=True,
                 get_serial=None):
        self.writer = None
        self.prefix = prefix
        self.subprefix = subprefix
        self.max_size = max_size
        self.gzip = gzip
        self.hostname = socket.gethostname()
        if get_serial is not None:
            self.external_get_serial = get_serial
        else:
            self.external_get_serial = None
            self.serial = 0

    def __del__(self):
        if self.writer is not None:
            self.f.close()

    def create_default_info(self,
                            version,
                            warcheader_version,
                            ip,
                            description=None,
                            creator=None,
                            operator=None):
        '''
        creator:  # person, organization, service
        operator:  # person, if creator is an organization
        isPartOf:  # name of the crawl
        '''
        info = OrderedDict()

        info['software'] = ('cocrawler/' + version +
                            ' cocrawler_warcheader_version/' + warcheader_version)
        info['hostname'] = self.hostname
        info['ip'] = ip
        if description:
            info['description'] = description
        if creator:
            info['creator'] = creator
        if operator:
            info['operator'] = operator
        info['isPartOf'] = self.prefix  # intentionally does not include subprefix
        info['format'] = 'WARC file version 1.0'
        self.info = info
        return info

    def open(self):
        filename = self.prefix
        if self.subprefix:
            filename += '-' + str(self.subprefix)  # don't let yaml leave this as an int
        serial = self.get_serial(filename)
        filename += '-' + serial + '-' + self.hostname + '.warc'
        if self.gzip:
            filename += '.gz'
        self.filename = filename
        self.f = open(filename, 'wb')
        self.writer = WARCWriter(self.f, gzip=self.gzip)
        record = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(record)

    def get_serial(self, filename):
        if self.external_get_serial is not None:
            return self.external_get_serial(filename)
        self.serial += 1
        return '{:06}'.format(self.serial - 1)

    def maybe_close(self):
        '''
        TODO: always close/reopen if subprefix is not None; to minimize open filehandles?
        '''
        fsize = os.fstat(self.f.fileno()).st_size
        if fsize > self.max_size:
            self.f.close()
            self.writer = None

    def write_dns(self, dns, ttl, url):
        # write it out even if empty
        # TODO: we filter the addresses early, should we warc the unfiltered dns response?

        # the response object doesn't contain the query type 'A' or 'AAAA'
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'  # fixme IPV6

        ttl = int(ttl)
        host = url.hostname

        if self.writer is None:
            self.open()

        payload = timestamp_now() + '\r\n'

        for r in dns:
            try:
                payload += '\t'.join(
                    (host + '.', str(ttl), 'IN', kind, r['host'])) + '\r\n'
            except Exception as e:
                LOGGER.info('problem converting dns reply for warcing host %s: %r (%s)',
                            host, r, e)
        payload = payload.encode('utf-8')

        record = self.writer.create_warc_record('dns:' + host,
                                                'resource',
                                                payload=BytesIO(payload),
                                                warc_content_type='text/dns',
                                                length=len(payload))

        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s',
                     p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)

    def _fake_resp_headers(self, resp_headers, body_len, decompressed=False):
        prefix = b'X-Crawler-'
        ret = []
        for h, v in resp_headers:
            hl = h.lower()
            if hl == b'content-length':
                if not (v.isdigit() and int(v) == body_len):
                    ret.append((prefix + h, v))
                    ret.append((b'Content-Length', str(body_len)))
            elif hl == b'content-encoding':
                if decompressed:
                    ret.append((prefix + h, v))
                else:
                    ret.append((h, v))
            elif hl == b'transfer-encoding':
                if v.lower() == b'chunked':
                    # aiohttp always undoes chunking
                    ret.append((prefix + h, v))
                else:
                    ret.append((h, v))
            else:
                ret.append((h, v))
        return ret

    def write_request_response_pair(self,
                                    url,
                                    ip,
                                    req_headers,
                                    resp_headers,
                                    is_truncated,
                                    payload,
                                    digest=None,
                                    decompressed=False):
        if self.writer is None:
            self.open()

        req_http_headers = StatusAndHeaders('GET / HTTP/1.1', req_headers)

        request = self.writer.create_warc_record('http://example.com/',
                                                 'request',
                                                 http_headers=req_http_headers)

        fake_resp_headers = self._fake_resp_headers(resp_headers,
                                                    len(payload),
                                                    decompressed=decompressed)
        resp_http_headers = StatusAndHeaders('200 OK',
                                             fake_resp_headers,
                                             protocol='HTTP/1.1')

        warc_headers_dict = OrderedDict()
        if ip is not None:
            # ip should be here unless we crawl through a proxy
            warc_headers_dict['WARC-IP-Address'] = ip
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                LOGGER.error('Invalid is_truncated value of ' + is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(
            url,
            'response',
            payload=BytesIO(payload),
            length=len(payload),
            warc_headers_dict=warc_headers_dict,
            http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s',
                     p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)
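
For reference, a minimal self-contained sketch of the underlying warcio calls that write_request_response_pair builds on; the file name, URL, headers and body here are purely illustrative:

from io import BytesIO
from warcio.warcwriter import WARCWriter
from warcio.statusandheaders import StatusAndHeaders

with open('example-pair.warc.gz', 'wb') as f:  # illustrative file name
    writer = WARCWriter(f, gzip=True)

    req_http_headers = StatusAndHeaders('GET / HTTP/1.1', [('Host', 'example.com')])
    request = writer.create_warc_record('http://example.com/', 'request',
                                        http_headers=req_http_headers)

    body = b'<html>hello</html>'
    resp_http_headers = StatusAndHeaders('200 OK',
                                         [('Content-Type', 'text/html'),
                                          ('Content-Length', str(len(body)))],
                                         protocol='HTTP/1.1')
    response = writer.create_warc_record('http://example.com/', 'response',
                                         payload=BytesIO(body),
                                         length=len(body),
                                         http_headers=resp_http_headers)

    # warcio links the two records via WARC-Concurrent-To and writes them adjacently
    writer.write_request_response_pair(request, response)
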
Example 5
cleaner = Cleaner(style=True,
                  links=True,
                  add_nofollow=True,
                  page_structure=False,
                  safe_attrs_only=False)

if options.output == sys.stdout:
    filename = options.input
else:
    filename = options.output

fo.write_record(
    fo.create_warcinfo_record(filename=filename,
                              info={
                                  'software':
                                  'bitextor/bitextor-warc2htmlwarc.py',
                                  'format': 'WARC File Format 1.0'
                              }))

for record in f:
    # Initial checks
    if record.rec_type != 'response' and record.rec_type != 'resource':
        continue
    url = record.rec_headers.get_header('WARC-Target-URI')
    if url[0] == '<' and url[-1] == '>':
        url = url[1:-1]
    if url == "unknown":
Example 6
class WarcDownloader:
    """
        Download URL with HTTP GET, save to a WARC file and return the decoded text
    """
    def __init__(self, filename, logger_, program_name='corpusbuilder 1.0', user_agent=None, overwrite_warc=True,
                 err_threshold=10, warcinfo_record_data=None, known_bad_urls=None, max_no_of_calls_in_period=2,
                 limit_period=1, proxy_url=None, allow_cookies=False):
        if known_bad_urls is not None:  # Setup the list of cached bad URLs to prevent trying to download them again
            with open(known_bad_urls, encoding='UTF-8') as fh:
                self.bad_urls = {line.strip() for line in fh}
        else:
            self.bad_urls = set()

        if not overwrite_warc:  # Find the next nonexistent warc filename
            num = 0
            while os.path.exists(filename):
                filename2, ext = os.path.splitext(filename)  # Should be filename.warc.gz
                if ext == '.gz' and filename2.endswith('.warc'):
                    filename2, ext2 = os.path.splitext(filename2)  # Should be filename.warc
                    ext = ext2 + ext  # Should be .warc.gz

                filename = '{0}-{1:05d}{2}'.format(filename2, num, ext)
                num += 1

        logger_.log('INFO', 'Creating archivefile: {0}'.format(filename))

        self._output_file = open(filename, 'wb')
        self._logger_ = logger_
        self._req_headers = {'Accept-Encoding': 'identity', 'User-agent': user_agent}

        self._session = Session()  # Setup session for speeding up downloads

        if proxy_url is not None:  # Set socks proxy if provided
            self._session.proxies['http'] = proxy_url
            self._session.proxies['https'] = proxy_url

        self._allow_cookies = allow_cookies

        # Setup rate limiting to prevent hammering the server
        self._requests_get = sleep_and_retry(limits(calls=max_no_of_calls_in_period,
                                                    period=limit_period)(self._http_get_w_cookie_handling))
        self._error_count = 0
        self._error_threshold = err_threshold  # Set the error threshold which causes aborting, to prevent denial of service

        self._writer = WARCWriter(self._output_file, gzip=True)
        if warcinfo_record_data is None:
            # INFO RECORD
            # Some custom information about the warc writer program and its settings
            info_headers = {'software': program_name, 'arguments': ' '.join(sys.argv[1:]),
                            'format': 'WARC File Format 1.0',
                            'conformsTo': 'http://bibnum.bnf.fr/WARC/WARC_ISO_28500_version1_latestdraft.pdf'}
            info_record = self._writer.create_warcinfo_record(filename, info_headers)
        else:  # Must recreate custom headers else they will not be copied
            custom_headers = ''.join('{0}: {1}\r\n'.format(k, v) for k, v in warcinfo_record_data[1].items()).\
                             encode('UTF-8')
            info_record = self._writer.create_warc_record('', 'warcinfo', warc_headers=warcinfo_record_data[0],
                                                          payload=BytesIO(custom_headers),
                                                          length=len(custom_headers))
        self._writer.write_record(info_record)

    def __del__(self):
        if hasattr(self, '_output_file'):  # If the program opened a file, then it should gracefully close it on exit!
            self._output_file.close()

    def _http_get_w_cookie_handling(self, *args, **kwargs):
        """
            Extend requests.get with optional cookie purging
        """
        if not self._allow_cookies:
            self._session.cookies.clear()
        return self._session.get(*args, **kwargs)

    def _handle_request_exception(self, url, msg):
        self._logger_.log('WARNING', '\t'.join((url, msg)))

        self._error_count += 1
        if self._error_count >= self._error_threshold:
            raise NameError('Too many error happened! Threshold exceeded! See log for details!')

    def download_url(self, url):
        scheme, netloc, path, params, query, fragment = urlparse(url)
        path = quote(path)  # For safety urlencode the generated URL...
        url = urlunparse((scheme, netloc, path, params, query, fragment))

        if url in self.bad_urls:
            self._logger_.log('INFO', 'Not downloading known bad URL: {0}'.format(url))
            return None

        try:  # The actual request
            resp = self._requests_get(url, headers=self._req_headers, stream=True)
        except RequestException as err:
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        if resp.status_code != 200:  # Not HTTP 200 OK
            self._handle_request_exception(url, 'Downloading failed with status code: {0} {1}'.format(resp.status_code,
                                                                                                      resp.reason))
            return None

        # REQUEST
        reqv_headers = resp.request.headers
        reqv_headers['Host'] = netloc

        proto = 'HTTP/{0}'.format(respv_str[resp.raw.version])  # Friendly protocol name
        reqv_http_headers = StatusAndHeaders('GET {0} {1}'.format(urlunparse(('', '', path, params, query, fragment)),
                                                                  proto), reqv_headers.items(), is_http_request=True)
        reqv_record = self._writer.create_warc_record(url, 'request', http_headers=reqv_http_headers)

        # RESPONSE
        resp_status = '{0} {1}'.format(resp.status_code, resp.reason)
        resp_headers_list = resp.raw.headers.items()  # get raw headers from urllib3
        # Must get peer_name before the content is read
        # It has no official API for that:
        # https://github.com/kennethreitz/requests/issues/2158
        # https://github.com/urllib3/urllib3/issues/1071
        # So workaround to be compatible with windows:
        # https://stackoverflow.com/questions/22492484/how-do-i-get-the-ip-address-from-a-http-request-using-the-\
        # requests-library/22513161#22513161
        try:
            peer_name = resp.raw._connection.sock.getpeername()[0]  # Must get peer_name before the content is read
        except AttributeError:  # On Windows there is no getpeername() Attribute of the class...
            try:
                peer_name = resp.raw._connection.sock.socket.getpeername()[0]
            except AttributeError:
                peer_name = 'None'  # Socket closed and could not determine peername...

        try:
            data = resp.raw.read()  # To be able to return decoded and also write warc
        except ProtocolError as err:
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        if len(data) == 0:
            err = 'Response data has zero length!'
            self._handle_request_exception(url, 'RequestException happened during downloading: {0} \n\n'
                                                ' The program ignores it and jumps to the next one.'.format(err))
            return None

        enc = resp.encoding  # Get or detect encoding to decode the bytes of the text to str
        if enc is None:
            enc = detect(data)['encoding']
        try:
            text = data.decode(enc)  # Normal decode process
        except UnicodeDecodeError:
            self._logger_.log('WARNING', '\t'.join(('DECODE ERROR RETRYING IN \'IGNORE\' MODE:', url, enc)))
            text = data.decode(enc, 'ignore')
        data_stream = BytesIO(data)  # Need the original byte stream to write the payload to the warc file

        resp_http_headers = StatusAndHeaders(resp_status, resp_headers_list, protocol=proto)
        # Add extra headers like encoding because it is not stored any other way...
        resp_record = self._writer.create_warc_record(url, 'response', payload=data_stream,
                                                      http_headers=resp_http_headers,
                                                      warc_headers_dict={'WARC-IP-Address': peer_name,
                                                                         'WARC-X-Detected-Encoding': enc})
        # Everything is OK, write the two WARC records
        self._writer.write_record(reqv_record)
        self._writer.write_record(resp_record)

        return text

    def write_record(self, record):
        self._writer.write_record(record)
Example 7
class CCWARCWriter:
    def __init__(self,
                 prefix,
                 max_size,
                 subprefix=None,
                 gzip=True,
                 get_serial=None):
        self.writer = None
        self.prefix = prefix
        self.subprefix = subprefix
        self.max_size = max_size
        self.gzip = gzip
        self.hostname = socket.gethostname()
        if get_serial is not None:
            self.external_get_serial = get_serial
        else:
            self.external_get_serial = None
            self.serial = 0

    def __del__(self):
        if self.writer is not None:
            self.f.close()

    def create_default_info(self,
                            version,
                            ip,
                            description=None,
                            creator=None,
                            operator=None):
        '''
        creator:  # person, organization, service
        operator:  # person, if creator is an organization
        isPartOf:  # name of the crawl
        '''
        info = OrderedDict()

        info['software'] = 'cocrawler/' + version
        info['hostname'] = self.hostname
        info['ip'] = ip
        if description:
            info['description'] = description
        if creator:
            info['creator'] = creator
        if operator:
            info['operator'] = operator
        info['isPartOf'] = self.prefix  # intentionally does not include subprefix
        info['format'] = 'WARC file version 1.0'
        self.info = info
        return info

    def open(self):
        filename = self.prefix
        if self.subprefix:
            filename += '-' + self.subprefix
        serial = self.get_serial(filename)
        filename += '-' + serial + '-' + self.hostname + '.warc'
        if self.gzip:
            filename += '.gz'
        self.filename = filename
        self.f = open(filename, 'wb')
        self.writer = WARCWriter(self.f, gzip=self.gzip)
        record = self.writer.create_warcinfo_record(self.filename, self.info)
        self.writer.write_record(record)

    def get_serial(self, filename):
        if self.external_get_serial is not None:
            return self.external_get_serial(filename)
        self.serial += 1
        return '{:06}'.format(self.serial - 1)

    def maybe_close(self):
        '''
        TODO: always close/reopen if subprefix is not None; minimizes open filehandles
        '''
        fsize = os.fstat(self.f.fileno()).st_size
        if fsize > self.max_size:
            self.f.close()
            self.writer = None

    def write_dns(self, dns, expires, url):
        # write it out even if empty
        # TODO: we filter the addresses early, should we warc the unfiltered dns response?

        # the response object doesn't contain the query type 'A' or 'AAAA'
        # but it has family=2 AF_INET (ipv4) and flags=4 AI_NUMERICHOST -- that's 'A'
        kind = 'A'  # fixme IPV6

        ttl = int(expires - time.time())
        host = url.hostname

        if self.writer is None:
            self.open()

        payload = timestamp_now() + '\r\n'

        for r in dns:
            try:
                payload += host + '.\t' + str(
                    ttl) + '\tIN\t' + kind + '\t' + r['host'] + '\r\n'
            except Exception as e:
                LOGGER.info('problem converting dns reply for warcing host %s: %r (%s)',
                            host, r, e)
        payload = payload.encode('utf-8')

        record = self.writer.create_warc_record('dns:' + host,
                                                'resource',
                                                payload=BytesIO(payload),
                                                warc_content_type='text/dns',
                                                length=len(payload))

        self.writer.write_record(record)
        LOGGER.debug('wrote warc dns response record%s for host %s',
                     p(self.prefix), host)
        stats.stats_sum('warc dns' + p(self.prefix), 1)

    def write_request_response_pair(self,
                                    url,
                                    req_headers,
                                    resp_headers,
                                    is_truncated,
                                    payload,
                                    digest=None):
        if self.writer is None:
            self.open()

        # XXX WARC-Identified-Payload-Type set from Apache Tika? (done by Common Crawl) (how expensive?)

        req_http_headers = StatusAndHeaders(
            'GET / HTTP/1.1', headers_to_str_headers(req_headers))

        request = self.writer.create_warc_record('http://example.com/',
                                                 'request',
                                                 http_headers=req_http_headers)

        resp_http_headers = StatusAndHeaders(
            '200 OK',
            headers_to_str_headers(resp_headers),
            protocol='HTTP/1.1')

        warc_headers_dict = {}
        if digest is not None:
            warc_headers_dict['WARC-Payload-Digest'] = digest
        if is_truncated:
            if is_truncated in valid_truncations:
                warc_headers_dict['WARC-Truncated'] = is_truncated
            else:
                LOGGER.error('Invalid is_truncated value of ' + is_truncated)
                warc_headers_dict['WARC-Truncated'] = 'unspecified'

        response = self.writer.create_warc_record(
            url,
            'response',
            payload=BytesIO(payload),
            length=len(payload),
            warc_headers_dict=warc_headers_dict,
            http_headers=resp_http_headers)

        self.writer.write_request_response_pair(request, response)
        self.maybe_close()
        LOGGER.debug('wrote warc request-response pair%s for url %s',
                     p(self.prefix), url)
        stats.stats_sum('warc r/r' + p(self.prefix), 1)
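
The headers_to_str_headers helper used above is not shown in this example. A plausible stand-in, assuming it only needs to turn header pairs (possibly bytes, possibly a dict or multidict) into the list of native-string tuples that StatusAndHeaders expects:

def headers_to_str_headers(headers):
    # Accept a dict-like object or an iterable of (name, value) pairs, where
    # either side may be bytes, and return a list of (str, str) tuples.
    if hasattr(headers, 'items'):
        headers = headers.items()
    out = []
    for name, value in headers:
        if isinstance(name, bytes):
            name = name.decode('latin-1')
        if isinstance(value, bytes):
            value = value.decode('latin-1')
        out.append((name, value))
    return out
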
Example 8
class SiteCrawler(object):
    def __init__(self,
                 priority,
                 multi_site_crawler,
                 seed_urls,
                 domain,
                 config,
                 scout=None):
        # Multi-site crawler object that manages current crawler
        self.multi_site_crawler = multi_site_crawler

        # Concurrency lock to ensure that only one process accesses URL lists (pending, visited and attempts)
        self.url_list_concurrency_lock = Lock()
        # Concurrency lock to ensure that only one process writes the status and the output WARC file
        self.file_write_concurrency_lock = Lock()
        # If verbose is True, the debugging level is set to INFO; otherwise it is ERROR
        logging.basicConfig(
            level=logging.INFO if config["verbose"] else logging.ERROR)

        # Domain corresponding to the seed URLs to be crawled
        self.domain = domain
        # Accepted TLDs in the crawl
        self.tlds = config["accepted_tlds"]

        # Set of URLs that have been already crawled
        self.visited = set()
        # Map that counts the number of times a URL is visited and could not be accessed
        self.attempts = {}
        # Links that must not be re-crawled until some time has passed
        self.asleep_links = {}
        # Maximum number of attempts to visit a website and receiving an error until it is discarded
        self.max_attempts = config["max_attempts"]

        # Maximum folder tree depth allowed for crawled URLs
        self.max_folder_tree_depth = config["max_folder_tree_depth"]
        # Accepted content type (for example: text/html)
        self.accepted_content_type = config["accepted_content"]
        # List of regular expressions to discard URLs
        self.url_blacklist_re = config["url_blacklist"]

        # If interrupt is set to True, crawling stops
        self.interrupt = False
        self.sleep_thread = None

        # Variable that keeps the current size of the crawling
        self.crawl_size = 0.0
        # Priority of the process when added to the queue that manages all the crawlers in MultiSiteCrawler
        self.priority = priority

        # Path to the file that stores crawling state dump
        self.dumpfile = config["output_dir"] + "/" + self.domain + ".state"
        # If a path is provided, the previous crawling status is restored to resume crawling
        if config["resume_crawling"]:
            self.load_status(pickle.load(open(self.dumpfile, 'rb')))
        # Path to the file where the WARC is written
        output_file_name = config["output_dir"] + "/" + self.domain + ".warc.gz"
        metadata_output_file_name = config["output_dir"] + "/" + self.domain + ".metadata.gz"
        name_counter = 1
        while os.path.isfile(output_file_name):
            output_file_name = config["output_dir"] + "/" + self.domain + "." + str(name_counter) + ".warc.gz"
            metadata_output_file_name = config["output_dir"] + "/" + self.domain + "." + str(name_counter) + ".metadata.gz"
            name_counter += 1
        f_out = open(output_file_name, 'wb')
        self.writer = WARCWriter(f_out, gzip=True)
        self.metadata_writer = gzip.open(metadata_output_file_name, "wb")

        # Scout object that will determine if the website is promising and if crawling should be interrupted
        self.scout = scout
        # The user will only keep documents in these languages
        self.langs_of_interest = config["langs_of_interest"]

        # User agent of the crawl
        self.user_agent = config["user_agent"]
        # Connection timeout
        self.conn_timeout = config["connection_timeout"]
        # Setting default crawling delay
        self.default_delay = config["crawl_delay"]
        # Init list of pending URLs from seed URLs; every URL is checked to confirm that it can be visited
        self.pending_urls = []
        # Robots parser: it is initialised from the first valid seed URL found
        self.robots = SiteRobots(self.user_agent, self.default_delay,
                                 self.conn_timeout)
        self.url_list_concurrency_lock.acquire()
        for url in seed_urls:
            if url.is_valid():
                self.add_url_to_list(url)
        self.url_list_concurrency_lock.release()

        # Maximum crawling size for this site
        if "max_size_per_site" not in config:
            self.max_size = None
        else:
            self.max_size = config["max_size_per_site"]
        # Maximum crawling time for this site
        if "max_time_per_site" not in config:
            self.max_time = None
        else:
            self.max_time = config["max_time_per_site"]
        # Starting time of the crawling; it is used to decide when max_time is reached
        self.starts = int(time.time())
        # Time of the last connection; it is used to make sure that delay is fulfilled
        self.last_connection = self.starts - self.default_delay

    def extend_url_list(self, url_list):
        self.url_list_concurrency_lock.acquire()
        for u in url_list:
            self.add_url_to_list(u)
        self.url_list_concurrency_lock.release()

    # Add a URL to the list of URLs to be visited during crawling; before doing so, check whether it was already
    # visited or whether it infringes TLD restrictions
    def add_url_to_list(self, url):
        if not url.is_valid():
            logging.info('"%s" is not a valid URL', url.get_norm_url())
        if url.get_norm_url() in self.visited or url in self.pending_urls:
            logging.info(
                '"%s" already used before (it may be pending of crawling)',
                url.get_norm_url())
        else:
            logging.info('"%s" added to pending URLs', url.get_norm_url())
            self.pending_urls.append(url)

    def get_pending_url(self):
        url = None
        try:
            self.url_list_concurrency_lock.acquire()
            sleeping_urls = []
            while len(self.pending_urls) > 0 and url is None:
                # Next URL is picked from the list of pending URLs and is added to the list of visited URLs
                tmp_url = self.pending_urls.pop()
                if tmp_url.wait_until is not None and tmp_url.wait_until > time.time():
                    sleeping_urls.append(tmp_url)
                else:
                    self.visited.add(tmp_url.get_norm_url())
                    url = tmp_url
            self.pending_urls.extend(sleeping_urls)
        finally:
            self.url_list_concurrency_lock.release()
        #threading.current_thread().name = "crawling: "+url.get_norm_url()
        return url

    def _process_link(self, link, url):
        logging.debug("\t\t" + threading.current_thread().name +
                      "--- going to process " + link.get_norm_url())
        # URLs longer than the limit set by RFC 7230 are discarded
        if not link.is_valid():
            return None
        # Filter url using URL blacklist_re
        for f in self.url_blacklist_re:
            if re.search(f, link.get_norm_url()):
                return None

        if self.domain == link.get_domain():
            logging.debug("\t\t" + threading.current_thread().name +
                          "--- adding URL to list " + link.get_norm_url())
            self.url_list_concurrency_lock.acquire()
            self.add_url_to_list(link)
            self.url_list_concurrency_lock.release()
            return link
        elif link.get_tld() in self.tlds:
            self.url_list_concurrency_lock.acquire()
            if link.get_norm_url() in self.visited:
                logging.info('"%s" already used to extend list of seed URLs',
                             link.get_norm_url())
                self.url_list_concurrency_lock.release()
            else:
                logging.info('"%s" used to extend list of seed URLs',
                             link.get_norm_url())
                self.visited.add(link.get_norm_url())
                self.url_list_concurrency_lock.release()
                self.multi_site_crawler.extend_seed_urls(link)
            return link
        else:
            logging.info('"%s" discarded: not in the same TLD',
                         link.get_norm_url())
            return None

    def _calc_depth(self, url):
        # calculate url depth
        return len(
            url.replace('https', 'http').replace(
                self.root_url, '').rstrip('/').split('/')) - 1

    def connect_to_server(self, url):
        conn = None
        res = None
        try:
            logging.info('Connecting to: %s', url.get_norm_url())
            self.last_connection = time.time()
            # Connections are done with a delay to avoid blocking the server
            if url.get_url_parts().scheme == 'http':
                try:
                    conn = http.client.HTTPConnection(
                        url.get_url_parts().netloc, timeout=self.conn_timeout)
                except:
                    conn = http.client.HTTPSConnection(
                        url.get_url_parts().netloc, timeout=self.conn_timeout)
            else:
                conn = http.client.HTTPSConnection(url.get_url_parts().netloc,
                                                   timeout=self.conn_timeout)
            logging.info('Connection obtained: %s', url.get_norm_url())

            conn.request('GET',
                         quote(url.get_url_parts().path, '?=&%/'),
                         headers={'User-Agent': self.user_agent})
            logging.info('Get request set %s', url.get_norm_url())

            res = conn.getresponse()

            logging.info('Response obtained from: %s', url.get_norm_url())
        except (http.client.HTTPException, EnvironmentError) as e:
            logging.info("HTTPException!")
            conn = None
            self.process_failed_url(url)
        except socket.timeout:
            logging.info("Socket timeout!")
            if conn is not None:
                conn.close()
            self.process_failed_url(url)
        except ssl.CertificateError:
            logging.info("CertificateError!")
            if conn is not None:
                conn.close()
            self.process_failed_url(url)
        except ConnectionResetError:
            logging.info("ConnectionResetError!")
            if conn is not None:
                conn.close()
            self.process_failed_url(url)
        except Exception as ex:
            logging.info(str(ex))
            if conn is not None:
                conn.close()
        if conn is None:
            logging.info('Connection is closed')
        else:
            logging.info('Connection is correct')
        return conn, res

    # The method returns True if the response status is 2XX and the document should be processed; otherwise it takes
    # the corresponding action (manage redirects or errors)
    def deal_with_response_status(self, url, response):
        if 200 <= response.status <= 226:
            return True
        elif 301 <= response.status <= 308:
            rlink = self._process_link(Link(response.getheader('location')),
                                       url)
            if rlink is not None:
                logging.info('%s Redirect: %s -> %s',
                             threading.current_thread().name,
                             url.get_norm_url(), rlink.get_norm_url())
        elif 400 <= response.status <= 407 or 409 <= response.status <= 412 or 414 <= response.status <= 427 or 431 <= response.status:
            self.process_failed_url(url, retry=False)
        elif response.status == 408:
            self.process_failed_url(url, retry=True)
        elif response.status == 413 or response.status == 428:
            waiting_time = response.getheader('Retry-After')
            if waiting_time is None:
                url.wait_until = time.time() + 500
            else:
                url.wait_until = time.time() + int(waiting_time)
            self.process_failed_url(url, retry=True)
        else:
            self.process_failed_url(url, retry=False)
        return False

    def crawl_one_page(self):
        self.multi_site_crawler.new_running_crawler()
        url = self.get_pending_url()
        if not self.interrupt and url is not None:
            if not self.robots.fetch(url, self.max_attempts, self.domain):
                logging.info("robots.txt forbids crawling URL: %s",
                             url.get_norm_url())
                return
            logging.debug("\t" + threading.current_thread().name +
                          " >>>> Connecting " + url.get_norm_url() + "...")
            connection, server_response = self.connect_to_server(url)
            logging.debug("\t" + threading.current_thread().name +
                          "<<<< Connected " + url.get_norm_url())

            # If response is 2XX, the web page is processed
            if server_response is not None and self.deal_with_response_status(
                    url, server_response):
                # Check content type
                content_type = server_response.getheader('Content-Type')
                logging.debug("\t" + threading.current_thread().name +
                              "<<<< Content type: " + str(content_type))
                doc = None
                if content_type is not None and not re.search(
                        self.accepted_content_type, content_type):
                    logging.info("%s discarded: wrong file type",
                                 url.get_norm_url())
                else:
                    logging.debug("\t" + threading.current_thread().name +
                                  ">>>> Extracting doc from " +
                                  url.get_norm_url())
                    doc = WebDocument(server_response, url, self.max_attempts)
                    logging.debug("\t" + threading.current_thread().name +
                                  "<<<< Document extracted " +
                                  url.get_norm_url())
                connection.close()
                logging.debug("\t" + threading.current_thread().name +
                              "<<<< Connection closed: " + url.get_norm_url())

                if doc is not None:
                    if doc.utf_text:
                        links_set = doc.get_link_set()
                        # We could shuffle links to avoid being biased by the structure of the site
                        # random.shuffle(links_set)
                        listoflinks = []
                        for li in links_set:
                            listoflinks.append(li.get_norm_url())
                        logging.debug("\t" + threading.current_thread().name +
                                      "<<<< Processing " +
                                      str(len(links_set)) + " links... " +
                                      url.get_norm_url() + "... " +
                                      " ".join(listoflinks))
                        for link in links_set:
                            self._process_link(link, doc.url)
                        logging.debug("\t" + threading.current_thread().name +
                                      "<<<< Links processed " +
                                      url.get_norm_url())

                        if doc.get_lang() is None or not doc.get_lang().is_reliable:
                            logging.info(
                                "%s discarded: language detection is not reliable",
                                url.get_norm_url())
                        elif doc.get_lang().language not in self.langs_of_interest:
                            logging.info(
                                "%s discarded: language not among languages of interest (detected=%s)",
                                url.get_norm_url(),
                                doc.get_lang().language)
                        else:
                            logging.debug("\t" +
                                          threading.current_thread().name +
                                          ">>>> Running scout " +
                                          url.get_norm_url())
                            self.run_scout(doc)
                            logging.debug("\t" +
                                          threading.current_thread().name +
                                          "<<<< Scout run " +
                                          url.get_norm_url())
                            # The document is written to the warc
                            logging.debug("\t" +
                                          threading.current_thread().name +
                                          ">>>> Write document " +
                                          url.get_norm_url())
                            self.write_document(doc)
                            logging.debug("\t" +
                                          threading.current_thread().name +
                                          "<<<< Document saved " +
                                          url.get_norm_url())
                else:
                    logging.debug("\t" + threading.current_thread().name +
                                  "<<<< Document was none: " +
                                  url.get_norm_url())

            else:
                logging.debug("\t" + threading.current_thread().name +
                              "<<<< Connection was none")

                if connection is not None:
                    connection.close()

            if self.max_size is not None and self.crawl_size > self.max_size:
                self.interrupt_crawl()
            elif self.max_time is not None and time.time() - self.starts > self.max_time:
                self.interrupt_crawl()
            elif len(self.pending_urls) == 0:
                self.interrupt = True
        # If the crawler is allowed to continue crawling, wait until delay has passed and continue
        if not self.interrupt:
            self.sleep_thread = Thread(target=self._wait_and_queue)
            self.sleep_thread.daemon = False
            self.sleep_thread.name = self.sleep_thread.name + "_sleep"
            self.sleep_thread.start()
        else:
            self.multi_site_crawler.new_done_crawler()

    def _wait_and_queue(self):
        sleeptime = self.robots.get_delay() - (time.time() -
                                               self.last_connection)
        if sleeptime > 0:
            time.sleep(sleeptime)
        self.multi_site_crawler.crawler_ready(self)
        self.multi_site_crawler.new_done_crawler()

    # Scout is run until its recommendation is ready; once it is, the scout object is discarded
    def run_scout(self, doc):
        if self.scout is not None:
            self.scout.step(doc)
            if self.scout.recommendation_ready():
                if not self.scout.recommendation_keep_crawling():
                    logging.info(
                        "Website discarded after crawling %s due to infringement of scout rule",
                        doc.url.get_norm_url())
                    self.interrupt = True
                else:
                    logging.info(
                        "Scout recommends keep crawling website after downloading %s; langs of interest found: %s",
                        doc.url.get_norm_url(), str(self.scout.lang_evidence))
                self.scout = None

    def process_failed_url(self, url, retry=True):
        if not retry:
            self.url_list_concurrency_lock.acquire()
            self.visited.add(url.get_norm_url())
            self.url_list_concurrency_lock.release()
            logging.info('%s: the URL does not exist', url.get_norm_url())
        else:
            if url.get_norm_url() not in self.attempts:
                self.url_list_concurrency_lock.acquire()
                self.add_url_to_list(url)
                self.attempts[url.get_norm_url()] = 1
                self.visited.remove(url.get_norm_url())
                self.url_list_concurrency_lock.release()
                logging.info('%s: retrying (attempt 1)', url.get_norm_url())
            else:
                if self.attempts[url.get_norm_url()] <= self.max_attempts:
                    logging.info('%s: retrying (attempt %s)', url,
                                 str(self.attempts[url.get_norm_url()]))
                    self.url_list_concurrency_lock.acquire()
                    self.add_url_to_list(url)
                    self.attempts[url.get_norm_url()] += 1
                    self.visited.remove(url.get_norm_url())
                    self.url_list_concurrency_lock.release()
                else:
                    self.url_list_concurrency_lock.acquire()
                    del self.attempts[url.get_norm_url()]
                    self.visited.add(url.get_norm_url())
                    self.url_list_concurrency_lock.release()
                    logging.info('%s: given up after %s attempts',
                                 url.get_norm_url(), str(self.max_attempts))

    def write_document(self, doc):
        self.file_write_concurrency_lock.acquire()
        try:
            headers_list = doc.response.getheaders()
            http_headers = StatusAndHeaders('200 OK',
                                            headers_list,
                                            protocol='HTTP/1.0')
            norm_url = doc.url.get_norm_url()
            record = self.writer.create_warc_record(norm_url,
                                                    'response',
                                                    payload=io.BytesIO(
                                                        doc.text),
                                                    http_headers=http_headers)
            self.writer.write_record(record)
            self.crawl_size += sys.getsizeof(doc.text) / 1000000.0
            if self.metadata_writer is not None:
                self.metadata_writer.write(
                    ("%s\t%s\t%s\n" % (doc.url.get_norm_url(), str(
                        doc.encoding), str(doc.get_lang()))).encode())
                self.metadata_writer.flush()
        finally:
            self.file_write_concurrency_lock.release()

    def get_status_object(self):
        targets = []
        for u in self.pending_urls:
            targets.append(u.get_norm_url())
        return {
            'visited': self.visited,
            'pendingurls': targets,
            'attempts': self.attempts
        }

    def load_status(self, status_obj):
        try:
            self.file_write_concurrency_lock.acquire()
            self.visited = status_obj['visited']
            self.pending_urls = []
            for u in status_obj['pendingurls']:
                self.pending_urls.append(Link(u))
            self.attempts = status_obj['attempts']
        finally:
            self.file_write_concurrency_lock.release()

    def save_status(self):
        try:
            self.file_write_concurrency_lock.acquire()
            if self.dumpfile is not None:
                pickle.dump(self.get_status_object(),
                            open(self.dumpfile, 'wb'))
        finally:
            self.file_write_concurrency_lock.release()

    def interrupt_crawl(self):
        try:
            self.url_list_concurrency_lock.acquire()
            self.interrupt = True
            self.save_status()
            self.metadata_writer.close()
        finally:
            self.url_list_concurrency_lock.release()

    def __hash__(self):
        return hash(self.domain)

    def one_thread_less(self):
        self.threads += 1
Example 9
def mergeWarc(files, output):
    # stats
    unique = 0
    revisit = 0
    uniqueLength = 0
    revisitLength = 0

    payloadMap = {}
    writer = WARCWriter(output, gzip=True)

    # Add an additional warcinfo record, describing the transformations. This
    # is not ideal, since
    #   “A ‘warcinfo’ record describes the records that
    #   follow it […] until next ‘warcinfo’”
    #   -- https://iipc.github.io/warc-specifications/specifications/warc-format/warc-1.1/#warcinfo
    # A warcinfo record is expected at the beginning of every file. But it
    # might have been written by different software, so we don’t want to
    # strip/replace that information, but supplement it.
    warcinfo = {
        'software': getSoftwareInfo(),
        'tool': 'crocoite-merge',  # not the name of the cli tool
        'parameters': {
            'inputs': files
        },
    }
    payload = BytesIO(json.dumps(warcinfo, indent=2).encode('utf-8'))
    record = writer.create_warc_record(
        packageUrl('warcinfo'),
        'warcinfo',
        payload=payload,
        warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'})
    writer.write_record(record)

    for l in files:
        with open(l, 'rb') as fd:
            for record in ArchiveIterator(fd):
                if record.rec_type in {'resource', 'response'}:
                    headers = record.rec_headers
                    rid = headers.get_header('WARC-Record-ID')
                    csum = headers.get_header('WARC-Payload-Digest')
                    length = int(headers.get_header('Content-Length'))
                    dup = payloadMap.get(csum, None)
                    if dup is None:
                        payloadMap[csum] = {
                            'uri': headers.get_header('WARC-Target-URI'),
                            'id': rid,
                            'date': headers.get_header('WARC-Date')
                        }
                        unique += 1
                        uniqueLength += length
                    else:
                        logging.debug(
                            f'Record {rid} is duplicate of {dup["id"]}')
                        # Payload may be identical, but HTTP headers are
                        # (probably) not. Include them.
                        record = writer.create_revisit_record(
                            headers.get_header('WARC-Target-URI'),
                            digest=csum,
                            refers_to_uri=dup['uri'],
                            refers_to_date=dup['date'],
                            http_headers=record.http_headers)
                        record.rec_headers.add_header('WARC-Truncated',
                                                      'length')
                        record.rec_headers.add_header('WARC-Refers-To',
                                                      dup['id'])
                        revisit += 1
                        revisitLength += length
                else:
                    unique += 1
                writer.write_record(record)
    json.dump(dict(
        unique=dict(records=unique, bytes=uniqueLength),
        revisit=dict(records=revisit, bytes=revisitLength),
        ratio=dict(records=unique / (unique + revisit),
                   bytes=uniqueLength / (uniqueLength + revisitLength)),
    ),
              sys.stdout,
              cls=StrJsonEncoder)
Example 10
    def facebook_user_ads(self, username, nsid, iso2c, access_token):
        assert username or nsid

        limit_per_page = 500

        if username and not nsid:
            log.debug("No FB userid, retrieving it")

            nsid = self.get_fbid(username)

        if nsid and access_token and iso2c:
            # start scraping
            request_url = "https://graph.facebook.com/v5.0/ads_archive"
            request_params = {
                "access_token": access_token,
                "limit": limit_per_page,
                "search_page_ids": str(nsid),
                "ad_active_status": "ALL",
                "ad_reached_countries": iso2c,  # todo
                "fields": "page_name, page_id, funding_entity, ad_creation_time, ad_delivery_start_time, ad_delivery_stop_time, ad_creative_body, ad_creative_link_caption, ad_creative_link_description, ad_creative_link_title, ad_snapshot_url, demographic_distribution, region_distribution, impressions, spend, currency"
            }

            api_result = requests.get(request_url, params=request_params)

            print(api_result.text)

            random_token = ''.join(
                random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
            serial_no = '00000'
            file_name = (safe_string(self.message["id"]) + "-" + warcprox.timestamp17() +
                         "-" + serial_no + "-" + random_token)

            # write to warc
            with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"),
                      "wb") as result_warc_file:
                log.info("Writing json-timeline result to path %s",
                         self.warc_temp_dir)
                writer = WARCWriter(result_warc_file, gzip=True)

                def json_date_converter(o):
                    """ Converts datetime.datetime items in facebook_scraper result
                    to formate suitable for json.dumps"""
                    if isinstance(o, datetime.datetime):
                        return o.__str__()

                json_payload = json.dumps(api_result.json(),
                                          default=json_date_converter,
                                          ensure_ascii=False).encode("utf-8")

                record = writer.create_warc_record(
                    "https://m.facebook.com/" + username,
                    'metadata',
                    payload=BytesIO(json_payload),
                    warc_content_type="application/json")
                writer.write_record(record)
                log.info("Writing scraped results to %s", self.warc_temp_dir)
            time.sleep(1.2)  # sleep to avoid getting blocked by api

        else:
            log.debug(
                "Missing required information to query the ads archive; access token: %s, iso2c: %s",
                str(access_token), str(iso2c))
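
The records written this way can be checked with the same library. A minimal sketch (the file name is hypothetical) that reads a gzipped WARC back and decodes the JSON payload of each 'metadata' record:

# Minimal sketch: read the 'metadata' records written above back out of a WARC.
# 'example.warc.gz' is a hypothetical file name.
import json
from warcio.archiveiterator import ArchiveIterator

with open('example.warc.gz', 'rb') as f:
    for record in ArchiveIterator(f):
        if record.rec_headers.get_header('WARC-Type') == 'metadata':
            payload = json.loads(record.content_stream().read().decode('utf-8'))
            print(record.rec_headers.get_header('WARC-Target-URI'), len(payload))
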
Example no. 11
    def facebook_user_bio(self, username):
        """Scrapes Facebook bio and returns info
        on the information contained on the about page (e.g. https://www.facebook.com/pg/SPD/about/?ref=page_internal)
        @param username: Facebook username
        @return: a dictionary of account attributes """

        user_email_fb = self.message['credentials']['user_email_fb']
        user_password_fb = self.message['credentials']['user_password_fb']

        # ensure username is clean and can be accessed
        if username.startswith(
                "https://www.facebook.com/") or username.startswith(
                    "http://www.facebook.com/"):

            username = re.sub(r'^.+facebook\.com\/', '', username)
            # possibly also remove trailing /
            username = re.sub(r'\/$', '', username)

        # created at field
        fb_general = base_fb_url + username
        # bio info
        fb_about = base_fb_url + username + "/about/?ref=page_internal"
        # site transparency (e.g. admins)
        m_fb_general = "http://m.facebook.com/" + username

        # request the html
        r = requests.get(fb_general)
        # ensure no 404's
        if not r:
            log.debug("Couldn't access profile site: %s", fb_general)
            return

        soup = BeautifulSoup(r.content, "html.parser")

        # scrape creation date
        created_at = soup.find('div', {"class": "_3qn7"})
        created_at = created_at.select_one("span").text

        # strip the German "Seite erstellt" ("page created") label from the date string
        created_at = re.sub(r"(Seite erstellt)", "", created_at)

        created_at = created_at[3:]

        # scrape n of likes
        # find span with like number
        spans = soup.find('span', {"class": "_52id _50f5 _50f7"})
        # isolate likes via regex
        likes = re.search(r'^[\d]+.[^\s]+', spans.text).group()

        bio_dict = {
            "username": fb_general,
            "n_likes": likes,
            "created_at": created_at
        }

        # request about html
        r_about = requests.get(fb_about)

        # ensure no 404's
        if not r_about:
            log.debug("Couldn't access username/about site: %s", fb_about)
            return

        about_soup = BeautifulSoup(r_about.content, "html.parser")
        mission_text = about_soup.find_all('div', {'class': "_4bl9"})

        for divs in mission_text:
            describing_div = divs.find('div', {'class': '_50f4'})
            content_div = divs.find('div', {'class': '_3-8w'})

            if describing_div and content_div:
                bio_dict[describing_div.text] = content_div.text

        # photos
        # Retrieves the profile and cover photo of a public facebook page
        # by going to the 'about' page, parsing the html and getting
        # the links to the photos from a script tag; these can then be passed
        # to _harvest_media_url
        # this is not affected by the harvest_media option but will always happen
        all_scripts = about_soup.find_all('script')

        for js in all_scripts:
            for content in js.contents:
                if 'cover_photo' in content:
                    # isolate relevant links
                    links = re.findall(r'https\:\\/\\/scontent[^"]*', content)

                    # remove escaped front slashes
                    for val, link in enumerate(links):
                        links[val] = re.sub(r'\\', "", link)
                        self._harvest_media_url(links[val])

        if m_fb_general:

            user_agent = 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/60.0.3112.50 Safari/537.36'
            site_transparency_class_selector = "._a58._a5o._9_7._2rgt._1j-g._2rgt._86-3._2rgt._1j-g._2rgt"
            site_transparency_detail_id = "u_0_d"

            chrome_options = webdriver.ChromeOptions()
            chrome_options.add_argument('headless')
            chrome_options.add_argument('start-maximized')
            chrome_options.add_argument('--no-sandbox')
            chrome_options.add_argument('--window-size=1200x800')
            chrome_options.add_argument('--disable-dev-shm-usage')
            chrome_options.add_argument(f"user-agent={user_agent}")

            # this will connect to the selenium container starting scraping
            # pass the prepared chrome options on to the remote driver
            driver = webdriver.Remote("http://host.docker.internal:4444/wd/hub",
                                      desired_capabilities={'browserName': 'chrome'},
                                      options=chrome_options)
            driver.get("http://m.facebook.com")
            driver.maximize_window()
            # accept cookies
            cookies = driver.find_element_by_id('accept-cookie-banner-label')
            # more or less random wait to replicate user behavior, ensure politeness
            time.sleep(random.uniform(3, 9))
            cookies.click()
            # Search & Enter the Email or Phone field & Enter Password
            username_fb = driver.find_element_by_id("m_login_email")
            password_fb = driver.find_element_by_id("m_login_password")
            submit = driver.find_element_by_css_selector("._56b_")
            # send keys and make sure the fields are not prepopulated
            # 2FA has to be deactivated
            username_fb.clear()
            password_fb.clear()
            username_fb.send_keys(user_email_fb)
            password_fb.send_keys(user_password_fb)
            time.sleep(random.uniform(3, 9))
            # Step 4) Click Login
            submit.click()
            time.sleep(random.uniform(3, 9))
            # navigate to site
            driver.get(m_fb_general)
            time.sleep(random.uniform(3, 9))
            driver.execute_script("window.scrollTo(0, 800)")
            # site info only loads on scroll
            # use class name and div content (todo)
            time.sleep(random.uniform(20, 25))
            element = WebDriverWait(driver, 20).until(
                ec.presence_of_element_located(
                    (By.CSS_SELECTOR, site_transparency_class_selector)))
            site_transparency = driver.find_elements_by_css_selector(
                site_transparency_class_selector)
            # the site transparency box should always be below 'about'
            site_transparency[1].click()
            time.sleep(random.uniform(15, 20))
            # simply get the whole text of the transparency box of site
            # the exact info can be extracted ex-post
            element = WebDriverWait(driver, 20).until(
                ec.presence_of_element_located(
                    (By.ID, site_transparency_detail_id)))
            time.sleep(random.uniform(3, 9))
            site_transparency_text = driver.find_element_by_id(
                site_transparency_detail_id).text
            time.sleep(random.uniform(3, 9))
            driver.close()
            log.info("Finished scraping transparency box")
            bio_dict['transparency_text'] = site_transparency_text

        # only write a warc if the pages could be retrieved;
        # otherwise nothing happens
        if r_about or r:
            # filename will later be converted to path
            # replicating pattern from https://github.com/internetarchive/warcprox/blob/f19ead00587633fe7e6ba6e3292456669755daaf/warcprox/writer.py#L69
            # create random token for filename
            random_token = ''.join(
                random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
            serial_no = '00000'
            file_name = safe_string(
                self.message["id"]) + "-" + warcprox.timestamp17(
                ) + "-" + serial_no + "-" + random_token

            with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"),
                      "wb") as result_warc_file:
                log.info("Writing json-timeline result to path %s",
                         self.warc_temp_dir)
                writer = WARCWriter(result_warc_file, gzip=True)

                def json_date_converter(o):
                    """ Converts datetime.datetime items in facebook_scraper result
                    to formate suitable for json.dumps"""
                    if isinstance(o, datetime.datetime):
                        return o.__str__()

                json_payload = json.dumps(bio_dict,
                                          default=json_date_converter,
                                          ensure_ascii=False).encode("utf-8")

                record = writer.create_warc_record(
                    "https://m.facebook.com/" + username,
                    'metadata',
                    payload=BytesIO(json_payload),
                    warc_content_type="application/json")
                writer.write_record(record)
                log.info("Writing scraped results to %s", self.warc_temp_dir)
Example no. 12
    def facebook_user_timeline(self, seed_id, username, nsid):
        """This function will scrape the user timeline"""
        log.debug("Harvesting user %s with seed_id %s.", username, seed_id)
        # make sure either username or nsid is present to start scraping
        assert username or nsid

        # Possibly look up username
        if username and not nsid:

            log.debug("No FB userid, retrieving it")

            nsid = self.get_fbid(username)

        if nsid:
            # report back whether user id was found
            log.info("FB userid %s", nsid)
            # todo - need to add timeout and what to do if blocked
            # todo - post ids will sometimes be empty, account for that for incremental

            incremental = self.message.get("options",
                                           {}).get("incremental", False)
            harvest_media = self.message.get("options",
                                             {}).get("harvest_media", False)

            if incremental:
                # search for since_id of post
                since_id = self.state_store.get_state(
                    __name__, u"timeline.{}.since_id".format(nsid))

            scrape_result = []

            for post in facebook_scraper.get_posts(nsid,
                                                   pages=self.pages,
                                                   extra_info=True,
                                                   timeout=20):
                scrape_result.append(post)
                self.result.harvest_counter["posts"] += 1
                self.result.increment_stats("posts")

                # the second condition avoids parsing empty lists (i.e. no media)
                if harvest_media and post['images']:
                    log.info("Harvesting media from post")
                    # get media content from links - should automatically be caught within the warc stream
                    # all photos on fb are jpgs, so only urls containing 'jpg' are harvested;
                    # anything else (e.g. video) is skipped
                    for media_url in post['images']:
                        if 'jpg' in media_url:
                            self._harvest_media_url(media_url)

                if incremental and post["post_id"] == since_id:
                    log.info(
                        "Stopping, found last post that was previously harvested with id: %s",
                        post["post_id"])
                    break

            # filename will later be converted to path
            # replicating pattern from https://github.com/internetarchive/warcprox/blob/f19ead00587633fe7e6ba6e3292456669755daaf/warcprox/writer.py#L69
            # create random token for filename
            random_token = ''.join(
                random.sample('abcdefghijklmnopqrstuvwxyz0123456789', 8))
            serial_no = '00000'
            file_name = safe_string(
                self.message["id"]) + "-" + warcprox.timestamp17(
                ) + "-" + serial_no + "-" + random_token

            with open(os.path.join(self.warc_temp_dir, file_name + ".warc.gz"),
                      "wb") as result_warc_file:
                log.info("Writing json-timeline result to path %s",
                         self.warc_temp_dir)
                writer = WARCWriter(result_warc_file, gzip=True)

                def json_date_converter(o):
                    """ Converts datetime.datetime items in facebook_scraper result
                    to formate suitable for json.dumps"""
                    if isinstance(o, datetime.datetime):
                        return o.__str__()

                json_payload = json.dumps(scrape_result,
                                          default=json_date_converter,
                                          ensure_ascii=False).encode("utf-8")

                record = writer.create_warc_record(
                    username,
                    'metadata',
                    payload=BytesIO(json_payload),
                    warc_content_type="application/json")
                writer.write_record(record)
                log.info("Writing scraped results to %s", self.warc_temp_dir)

            # write to state store ('incremental' was already read above)

            key = "timeline.{}.since_id".format(nsid)
            max_post_time = scrape_result[0].get("time")
            max_post_id = scrape_result[0].get("post_id")

            assert max_post_time and max_post_id

            if incremental:
                self.state_store.set_state(__name__, key, max_post_id)
                log.info("Wrote first scraped post to state_store")

        else:
            msg = "NSID not found for user {}".format(username)
            log.exception(msg)
            self.result.warnings.append(
                Msg(CODE_UID_NOT_FOUND, msg, seed_id=seed_id))
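
The incremental option stops the scrape at the most recent post seen in the previous harvest and then stores the newest post id for the next run. The control flow can be illustrated in isolation with a toy in-memory state store; all names below are hypothetical.

# Toy sketch of the incremental harvest pattern: stop at the last-seen post id,
# then remember the newest id for the next run. Names are hypothetical.
state = {}  # stands in for self.state_store

def harvest(posts, key, incremental=True):
    since_id = state.get(key) if incremental else None
    collected = []
    for post in posts:
        if incremental and post['post_id'] == since_id:
            break  # everything older was harvested last time
        collected.append(post)
    if incremental and collected:
        state[key] = collected[0]['post_id']  # newest post comes first
    return collected

posts = [{'post_id': '3'}, {'post_id': '2'}, {'post_id': '1'}]
print(harvest(posts, 'timeline.42.since_id'))   # first run: all three posts
print(harvest(posts, 'timeline.42.since_id'))   # second run: nothing new
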
Example no. 13
def run(url, out_path, time_limit, agent, filetypes, warcfilename, wait):
    cmd = ""
    if time_limit:
        cmd += "timeout {} ".format(time_limit)
    waitoption = ""
    if wait is not None:
        waitoption = "--wait " + wait
    agentoption = ""
    if agent is not None:
        agentoption = "--user-agent \"" + agent + "\""

    filetypesoption = ""
    if filetypes is not None:
        filetypesoption = "-A \"" + filetypes + "\""

    warcoption = ""
    warcfilebasename = warcfilename[0:warcfilename.find(".warc.gz")]
    if warcfilename is not None:
        warcoption = "--warc-file \"" + warcfilebasename + "\""

    if check_wget_compression("wget --help | grep 'no-warc-compression'"):
        warcoption += " --no-warc-compression"

    cmd += "wget --mirror {WAIT} {FILETYPES} -q -o /dev/null {URL} -P {DOWNLOAD_PATH} {AGENT} {WARC}".format(
        WAIT=waitoption,
        FILETYPES=filetypesoption,
        URL=url,
        DOWNLOAD_PATH=out_path,
        AGENT=agentoption,
        WARC=warcoption)
    # print("cmd", cmd)
    try:
        system_check(cmd)
    except subprocess.CalledProcessError:
        sys.stderr.write(
            "Warning: Some files could not be downloaded with wget\n")

    with open(warcfilebasename + ".warc", 'rb') as f_in:
        with open(warcfilebasename + ".warc.gz", 'wb') as f_out:
            writer = WARCWriter(f_out, gzip=True)
            try:
                for record in ArchiveIterator(f_in):
                    if record.http_headers:
                        if record.http_headers.get_header(
                                'Transfer-Encoding') == "chunked":
                            continue
                        try:
                            record.http_headers.to_ascii_bytes()
                        except UnicodeEncodeError:
                            # if the header is non-ASCII, create a new header with the status code only;
                            # content length and content type will be filled in before writing
                            record.http_headers = StatusAndHeaders(
                                record.http_headers.get_statuscode(), [])
                    uri = record.rec_headers.get_header('WARC-Target-URI')
                    # ignore metadata records
                    if not uri or uri.startswith(
                            'metadata://gnu.org/software/wget/warc/'):
                        continue
                    record.length = None
                    writer.write_record(record)
            except Exception as e:
                print(e, file=sys.stderr)

    system_check("rm {WARC}".format(WARC=warcfilebasename + ".warc"))
Example no. 14
class WarcHandler(EventHandler):
    __slots__ = ('logger', 'writer', 'maxBodySize', 'documentRecords', 'log',
                 'maxLogSize', 'logEncoding', 'warcinfoRecordId')

    def __init__(self, fd, logger, maxBodySize=defaultSettings.maxBodySize):
        self.logger = logger
        self.writer = WARCWriter(fd, gzip=True)
        self.maxBodySize = maxBodySize

        self.logEncoding = 'utf-8'
        self.log = BytesIO()
        # max log buffer size (bytes)
        self.maxLogSize = 500 * 1024

        # maps document urls to WARC record ids, required for DomSnapshotEvent
        # and ScreenshotEvent
        self.documentRecords = {}
        # record id of warcinfo record
        self.warcinfoRecordId = None

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, traceback):
        self._flushLogEntries()

    def writeRecord(self,
                    url,
                    kind,
                    payload,
                    warc_headers_dict=None,
                    http_headers=None):
        """
        Thin wrapper around writer.create_warc_record and writer.write_record.

        Adds default WARC headers.
        """

        d = {}
        if self.warcinfoRecordId:
            d['WARC-Warcinfo-ID'] = self.warcinfoRecordId
        d.update(warc_headers_dict or {})
        warc_headers_dict = d

        record = self.writer.create_warc_record(
            url,
            kind,
            payload=payload,
            warc_headers_dict=warc_headers_dict,
            http_headers=http_headers)
        self.writer.write_record(record)

        return record

    def _writeRequest(self, item):
        logger = self.logger.bind(reqId=item.id)

        req = item.request
        resp = item.response
        url = urlsplit(resp['url'])

        path = url.path
        if url.query:
            path += '?' + url.query
        httpHeaders = StatusAndHeaders('{} {} HTTP/1.1'.format(
            req['method'], path),
                                       item.requestHeaders,
                                       protocol='HTTP/1.1',
                                       is_http_request=True)
        initiator = item.initiator
        warcHeaders = {
            'X-Chrome-Initiator':
            json.dumps(initiator),
            'X-Chrome-Request-ID':
            item.id,
            'WARC-Date':
            datetime_to_iso_date(
                datetime.utcfromtimestamp(item.chromeRequest['wallTime'])),
        }
        try:
            bodyTruncated = None
            payload, payloadBase64Encoded = item.requestBody
        except ValueError:
            # oops, don’t know what went wrong here
            bodyTruncated = 'unspecified'
            logger.error('requestBody missing',
                         uuid='ee9adc58-e723-4595-9feb-312a67ead6a0')

        if bodyTruncated:
            warcHeaders['WARC-Truncated'] = bodyTruncated
            payload = None

        if payload:
            payload = BytesIO(payload)
            warcHeaders['X-Chrome-Base64Body'] = str(payloadBase64Encoded)
        record = self.writeRecord(req['url'],
                                  'request',
                                  payload=payload,
                                  http_headers=httpHeaders,
                                  warc_headers_dict=warcHeaders)
        return record.rec_headers['WARC-Record-ID']

    def _writeResponse(self, item, concurrentTo):
        # fetch the body
        reqId = item.id
        rawBody = None
        base64Encoded = False
        bodyTruncated = None
        if item.isRedirect:
            # redirects reuse the same request, thus we cannot safely retrieve
            # the body (i.e. getResponseBody may return the new location’s
            # body).
            bodyTruncated = 'unspecified'
        elif item.encodedDataLength > self.maxBodySize:
            bodyTruncated = 'length'
            # check body size first, since we’re loading everything into memory
            self.logger.error('body for {} too large {} vs {}'.format(
                reqId, item.encodedDataLength, self.maxBodySize))
        else:
            try:
                rawBody, base64Encoded = item.body
            except ValueError:
                # oops, don’t know what went wrong here
                bodyTruncated = 'unspecified'

        # now the response
        resp = item.response
        warcHeaders = {
            'WARC-Concurrent-To':
            concurrentTo,
            'WARC-IP-Address':
            resp.get('remoteIPAddress', ''),
            'X-Chrome-Protocol':
            resp.get('protocol', ''),
            'X-Chrome-FromDiskCache':
            str(resp.get('fromDiskCache')),
            'X-Chrome-ConnectionReused':
            str(resp.get('connectionReused')),
            'X-Chrome-Request-ID':
            item.id,
            'WARC-Date':
            datetime_to_iso_date(
                datetime.utcfromtimestamp(item.chromeRequest['wallTime'] +
                                          (item.chromeResponse['timestamp'] -
                                           item.chromeRequest['timestamp']))),
        }
        if bodyTruncated:
            warcHeaders['WARC-Truncated'] = bodyTruncated
        else:
            warcHeaders['X-Chrome-Base64Body'] = str(base64Encoded)

        httpHeaders = StatusAndHeaders('{} {}'.format(resp['status'],
                                                      item.statusText),
                                       item.responseHeaders,
                                       protocol='HTTP/1.1')

        # Content is saved decompressed and decoded, remove these headers
        blacklistedHeaders = {'transfer-encoding', 'content-encoding'}
        for h in blacklistedHeaders:
            httpHeaders.remove_header(h)

        # chrome sends nothing but utf8 encoded text. Fortunately HTTP
        # headers take precedence over the document’s <meta>, thus we can
        # easily override those.
        contentType = resp.get('mimeType')
        if contentType:
            if not base64Encoded:
                contentType += '; charset=utf-8'
            httpHeaders.replace_header('content-type', contentType)

        if rawBody is not None:
            httpHeaders.replace_header('content-length',
                                       '{:d}'.format(len(rawBody)))
            bodyIo = BytesIO(rawBody)
        else:
            bodyIo = BytesIO()

        record = self.writeRecord(resp['url'],
                                  'response',
                                  warc_headers_dict=warcHeaders,
                                  payload=bodyIo,
                                  http_headers=httpHeaders)

        if item.resourceType == 'Document':
            self.documentRecords[item.url] = record.rec_headers.get_header(
                'WARC-Record-ID')

    def _writeScript(self, item):
        writer = self.writer
        encoding = 'utf-8'
        self.writeRecord(
            packageUrl('script/{}'.format(item.path)),
            'metadata',
            payload=BytesIO(str(item).encode(encoding)),
            warc_headers_dict={
                'Content-Type':
                'application/javascript; charset={}'.format(encoding)
            })

    def _writeItem(self, item):
        if item.failed:
            # should have been handled by the logger already
            return

        concurrentTo = self._writeRequest(item)
        self._writeResponse(item, concurrentTo)

    def _addRefersTo(self, headers, url):
        refersTo = self.documentRecords.get(url)
        if refersTo:
            headers['WARC-Refers-To'] = refersTo
        else:
            self.logger.error('No document record found for {}'.format(url))
        return headers

    def _writeDomSnapshot(self, item):
        writer = self.writer

        warcHeaders = {
            'X-DOM-Snapshot': str(True),
            'X-Chrome-Viewport': item.viewport,
            'Content-Type': 'text/html; charset=utf-8',
        }

        self._addRefersTo(warcHeaders, item.url)

        self.writeRecord(item.url,
                         'conversion',
                         payload=BytesIO(item.document),
                         warc_headers_dict=warcHeaders)

    def _writeScreenshot(self, item):
        writer = self.writer
        warcHeaders = {
            'Content-Type': 'image/png',
            'X-Crocoite-Screenshot-Y-Offset': str(item.yoff)
        }
        self._addRefersTo(warcHeaders, item.url)
        self.writeRecord(item.url,
                         'conversion',
                         payload=BytesIO(item.data),
                         warc_headers_dict=warcHeaders)

    def _writeControllerStart(self, item):
        payload = BytesIO(json.dumps(item.payload, indent=2).encode('utf-8'))

        writer = self.writer
        warcinfo = self.writeRecord(
            packageUrl('warcinfo'),
            'warcinfo',
            warc_headers_dict={'Content-Type': 'text/plain; encoding=utf-8'},
            payload=payload)
        self.warcinfoRecordId = warcinfo.rec_headers['WARC-Record-ID']

    def _flushLogEntries(self):
        writer = self.writer
        self.log.seek(0)
        # XXX: we should use the type continuation here
        self.writeRecord(packageUrl('log'),
                         'resource',
                         payload=self.log,
                         warc_headers_dict={
                             'Content-Type':
                             'text/plain; encoding={}'.format(self.logEncoding)
                         })
        self.log = BytesIO()

    def _writeLog(self, item):
        """ Handle log entries, called by .logger.WarcHandlerConsumer only """
        self.log.write(item.encode(self.logEncoding))
        self.log.write(b'\n')
        # instead of locking, check we’re running in the main thread
        if self.log.tell() > self.maxLogSize and \
                threading.current_thread() is threading.main_thread():
            self._flushLogEntries()

    route = {
        Script: _writeScript,
        Item: _writeItem,
        DomSnapshotEvent: _writeDomSnapshot,
        ScreenshotEvent: _writeScreenshot,
        ControllerStart: _writeControllerStart,
    }

    def push(self, item):
        processed = False
        for k, v in self.route.items():
            if isinstance(item, k):
                v(self, item)
                processed = True
                break

        if not processed:
            self.logger.debug('unknown event {}'.format(repr(item)))
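
The route table together with push is a small isinstance-based dispatch: each event type maps to an unbound handler function, which is why the handlers are invoked as v(self, item). The same pattern in isolation, with illustrative event classes that are not part of the original code:

# Isolated sketch of the route/push dispatch pattern used by WarcHandler above.
# The event classes and handler names here are illustrative only.
class PageEvent: ...
class LogEvent: ...

class Handler:
    def _handle_page(self, item):
        print('page event:', item)

    def _handle_log(self, item):
        print('log event:', item)

    # maps event type -> unbound function, mirroring WarcHandler.route
    route = {PageEvent: _handle_page, LogEvent: _handle_log}

    def push(self, item):
        for klass, handler in self.route.items():
            if isinstance(item, klass):
                handler(self, item)   # unbound call, same as v(self, item)
                return
        print('unknown event', repr(item))

Handler().push(PageEvent())
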