Example #1
  def map_warc_files(self, _, line):
    """Mapper function to process each WARC file.

    Args:
      line: Path to a WARC gz file to process.

    Yields:
      (key, value) tuples.
    """
    f = None
    # If we are on EC2 or running on a Hadoop cluster, pull files via S3
    if self.options.runner in ['emr', 'hadoop']:
      # Connect to Amazon S3.
      s3 = boto3.resource('s3')
      obj = s3.Object('commoncrawl', line)
      # Hack to get the raw stream out of obj:
      # http://stackoverflow.com/questions/7624900/how-can-i-use-boto-to-stream-a-file-out-of-amazon-s3-to-rackspace-cloudfiles
      f = warc.WARCFile(fileobj=GzipStreamFile(obj.get()['Body']._raw_stream))
    # If we are local, use files on the local file system
    else:
      line = Path.join(Path.abspath(Path.dirname(__file__)), line)
      print('Loading local file {}'.format(line))
      f = warc.WARCFile(fileobj=gzip.open(line))

    # For each WARC record:
    for i, record in enumerate(f):
      for key, value in self.process_warc_record(record):
        yield key, value
    self.increment_counter('commoncrawl', 'num-files', 1)
Example #2
def open_warc_file(filename, from_commoncrawl=True):
    """ Opens a WARC file from local-data or S3 for Common Crawl files """

    local_data_file = os.path.join(config["PATH_BACK"],
                                   'local-data/%s' % filename)

    if not from_commoncrawl:
        filereader = open(filename, "rb")
    elif os.path.isfile(local_data_file):
        filereader = open(local_data_file, "rb")
    else:
        conn = boto.s3.connect_to_region(
            "us-east-1",
            anon=True,
            calling_format=boto.s3.connection.OrdinaryCallingFormat(),
            is_secure=False)

        pds = conn.get_bucket('aws-publicdatasets')
        filereader = Key(pds)
        filereader.key = filename

    if filename.endswith(".warc"):
        return warc.WARCFile(fileobj=filereader)
    else:
        # TODO: investigate how we could use cloudflare's zlib
        return warc.WARCFile(fileobj=GzipStreamFile(filereader))
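A hedged usage sketch of open_warc_file (the WAT path below is only a placeholder, not a verified Common Crawl segment name):

import json

# Illustrative only: placeholder path for a Common Crawl WAT file.
wat_filename = 'crawl-data/CC-MAIN-2018-05/segments/.../example.warc.wat.gz'
wat_file = open_warc_file(wat_filename)
for record in wat_file:
    if record['Content-Type'] == 'application/json':
        metadata = json.loads(record.payload.read())
        # ... inspect metadata['Envelope'] here ...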
Example #3
    def _warc_reader_from_file(self, filereader, filepath):
        """ Creates a WARC record iterator from a file reader """

        if filepath.endswith(".warc"):
            return warc.WARCFile(fileobj=filereader)
        else:
            # TODO: investigate how we could use cloudflare's zlib
            return warc.WARCFile(fileobj=GzipStreamFile(filereader))
Example #4
    def mapper(self, _, line):
        """
        The Map of MapReduce
        If you're using Hadoop or EMR, it pulls the CommonCrawl files from S3,
        otherwise it pulls from the local filesystem. Dispatches each file to
        `process_record`.
        """
        # If we're on EC2 or running on a Hadoop cluster, pull files via S3
        if self.options.runner in ['emr', 'hadoop']:
            # Connect to Amazon S3 using anonymous credentials
            boto_config = botocore.client.Config(
                signature_version=botocore.UNSIGNED,
                read_timeout=180,
                retries={'max_attempts': 20})
            s3client = boto3.client('s3', config=boto_config)
            # Verify bucket
            try:
                s3client.head_bucket(Bucket='commoncrawl')
            except botocore.exceptions.ClientError as exception:
                LOG.error('Failed to access bucket "commoncrawl": %s',
                          exception)
                return
            # Check whether WARC/WAT/WET input exists
            try:
                s3client.head_object(Bucket='commoncrawl', Key=line)
            except botocore.exceptions.ClientError as exception:
                LOG.error('Input not found: %s', line)
                return
            # Start a connection to one of the WARC/WAT/WET files
            LOG.info('Loading s3://commoncrawl/%s', line)
            try:
                temp = TemporaryFile(mode='w+b',
                                     dir=self.options.s3_local_temp_dir)
                s3client.download_fileobj('commoncrawl', line, temp)
            except botocore.exceptions.ClientError as exception:
                LOG.error('Failed to download %s: %s', line, exception)
                return
            temp.seek(0)
            try:
                #ccfile = warc.WARCFile(fileobj=(GzipStreamFile(temp)))
                ccfile = warc.WARCFile(fileobj=(gzip.open(temp)))
            except Exception as exception:
                LOG.error('Failed to open %s at %s: %s', temp, line, exception)
                return
        # If we're local, use files on the local file system
        else:
            line = Path.join(Path.abspath(Path.dirname(__file__)), line)
            LOG.info('Loading local file %s', line)
            try:
                ccfile = warc.WARCFile(fileobj=gzip.open(line))
            except Exception as exception:
                LOG.error('Failed to open %s: %s', line, exception)
                return

        for _i, record in enumerate(ccfile):
            for key, value in self.process_record(record):
                yield key, value
            self.increment_counter('commoncrawl', 'processed_records', 1)
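process_record is referenced above but not shown. Purely as an assumption for illustration (not the original implementation), a minimal version yields one (key, value) pair per record, e.g. counting WARC record types:

    def process_record(self, record):
        # Hypothetical sketch only: emit one count per WARC record type
        # ('response', 'request', 'warcinfo', ...).
        yield record['WARC-Type'], 1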
Example #5
def get_records(id_, iterator):
    conn = boto.connect_s3(anon=True, host='s3.amazonaws.com')
    bucket = conn.get_bucket('commoncrawl')

    for uri in iterator:
        key_ = Key(bucket, uri)
        _file = warc.WARCFile(fileobj=GzipStreamFile(key_))

        for record in _file:
            if record['Content-Type'] == 'application/json':
                record = json.loads(record.payload.read())
                try:

                    def cc_filter(x):
                        return "creativecommons.org" in x['url']

                    cc_links = list(filter(
                        cc_filter,
                        record['Envelope']['Payload-Metadata']
                        ['HTTP-Response-Metadata']['HTML-Metadata']
                        ['Links']))
                    if cc_links:
                        yield record
                except KeyError:
                    pass
Example #6
def warc_to_zip():
    warcfile = sys.argv[1]
    zipout = sys.argv[2]

    file = zipfile.ZipFile(zipout, "w", zipfile.ZIP_DEFLATED, True)

    f = warc.WARCFile(warcfile, "rb")
    for record in f:
        print "------------"
        for key in record.header.keys():
            print key, record.header[key]
        if record.header.has_key('warc-target-uri'):
            u = urlparse(record['WARC-Target-URI'])
            name = "{}/{}/{}".format(u.scheme, u.netloc, u.path)
            if record['content-type'] == "application/http;msgtype=response":
                r = httpparse(record.payload)
                file.writestr(name, r.read())
            elif record['content-type'] == "application/http;msgtype=request":
                print "payload:", record.payload
                print "Skipping request record", record['WARC-Target-URI']
                file.writestr("{}-request".format(name), record.payload.read())
            else:
                print "Skipping record", record['WARC-Target-URI']
                file.writestr(name, record.payload.read())

    file.close()
Example #7
    def mapper(self, _, line):
        f = None
        """
    if self.options.runner in ['inline']:
      print self.options.runner + "lol"
      print 'Loading local file {}'.format(line)
      f = warc.WARCFile(fileobj=gzip.open(line))
    else:
    """
        conn = boto.connect_s3(anon=True)
        pds = conn.get_bucket('aws-publicdatasets')
        k = Key(pds, line)
        f = warc.WARCFile(fileobj=GzipStreamFile(k))

        for i, record in enumerate(f):
            if record['Content-Type'] == 'application/http; msgtype=response':
                payload = record.payload.read()
                headers, body = payload.split('\r\n\r\n', 1)
                email = "email"
                p = re.compile(EMAIL_REGEX)
                emails = [i for i in re.findall(p, body) if len(i) < 50]
                emails = set(emails)
                domain = urlparse(record.url).netloc
                for email in emails:
                    yield {
                        "url": record.url,
                        "date": record.date,
                        "email": email,
                        "domain": domain
                    }, 1
Example #8
def process_files(patt=PATT):
    files = list(data_dir.glob(patt))
    host_counter, server_counter = Counter(), Counter()
    for file in files:
        ccfile = warc.WARCFile(fileobj=gzip.open(file))
        for i, record in enumerate(ccfile):
            if record['Content-Type'] != 'application/json':
                continue

            payload = record.payload.read()
            data = json.loads(payload)

            if data['Envelope']['WARC-Header-Metadata'][
                    'WARC-Type'] != 'response':
                continue

            url = data["Envelope"]["WARC-Header-Metadata"].get(
                "WARC-Target-URI")
            if url:
                host = urllib.parse.urlparse(url).netloc.lower()
                host_counter.update([host])

            server = data['Envelope']['Payload-Metadata'][
                'HTTP-Response-Metadata']['Headers'].get('Server')
            if server:
                server_counter.update([server])
    return host_counter, server_counter
Example #9
    def mapWat(self, _, line):
        ''' Takes partial WARC paths and produces (hostname, {links}) pairs '''
        if self.options.localsource:
            # Stream data from local file
            # this lets us use pre-downloaded *.gz files for testing rather than
            # hammering the amazon servers.
            fpath = os.path.abspath(
                os.path.join(self.options.localsource, line))
            print('Loading local file: ' + fpath)
            rawstream = open(fpath, 'rb')
        else:
            # Stream data from common crawl servers
            conn = boto.connect_s3(anon=True, host='s3.amazonaws.com')
            pds = conn.get_bucket('commoncrawl')
            rawstream = boto.s3.key.Key(pds, line)

        # iterate through records in warc.wat.gz file
        warcstream = warc.WARCFile(fileobj=GzipStreamFile(rawstream))
        for i, record in enumerate(warcstream):
            if record['Content-Type'] == 'application/json':
                payload = record.payload.read()
                jsonPayload = json.loads(payload)
                hostlinks = self.watHostLinks(jsonPayload)
                if hostlinks: yield hostlinks
            if self.options.localsource and i % 10000 == 0:
                print('Record %5dk' % (i / 1000))
            self.increment_counter('commoncrawl', 'processed_records', 1)
        rawstream.close()
Example #10
    def mapper(self, _, line):
        f = None
        #if self.options.runner in ['inline']:
        #  print self.options.runner + "lol"
        #  print 'Loading local file {}'.format(line)
        #  f = warc.WARCFile(fileobj=gzip.open(line))
        #else:
        conn = boto.connect_s3(anon=True)
        pds = conn.get_bucket('aws-publicdatasets')
        k = Key(pds, line)
        f = warc.WARCFile(fileobj=GzipStreamFile(k))

        for i, record in enumerate(f):
            if record['Content-Type'] == 'application/http; msgtype=response':
                payload = record.payload.read()
                headers, body = payload.split('\r\n\r\n', 1)
                data = []
                #data = data + Detector().check_headers(headers)
                data = data + Detector().check_script(body)
                data = data + Detector().check_html(body)
                data = {
                    "tech": data,
                    "url": record.url,
                    "date": record.date,
                    "domain": urlparse(record.url).netloc
                }
                yield data, 1
Example #11
    def create(self, filename, fileobj=None, operator=None):
        """
        :rtype: warc.WARCFile
        """
        assert useragent.POLICY is not None

        if fileobj is None:
            fileobj = io.BytesIO()

        self.fileobj = fileobj
        self.warc = warc.WARCFile(fileobj=fileobj)

        header = warc.WARCHeader({
            "WARC-Type": "warcinfo",
            "WARC-Filename": filename,
        }, defaults=True)
        body = [
            b"software: owlbot/"+bytes(version.STR, "ascii"),
            b"format: WARC File Format 1.0",
            # policy from .OWLBOT_POLICY or os.environ["OWLBOT_POLICY"]
            b"robots: " + bytes(useragent.POLICY, "ascii"),
        ]
        if operator is not None:
            body.append(b"operator: " + operator.encode("utf-8"))

        self.warc.write_record(
            warc.WARCRecord(header, payload=b"\r\n".join(body))
        )
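A hedged sanity check of the example above ('archive' stands for a hypothetical instance of this class after create() has been called): rewind the in-memory buffer and read the warcinfo record back.

# Hypothetical usage: 'archive' is an instance of the class above.
buf = archive.fileobj
buf.seek(0)
for record in warc.WARCFile(fileobj=buf):
    print(record['WARC-Type'], record.header.get('WARC-Filename'))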
Example #12
def read_warc(path='./data/sample.warc.gz'):
    # Beautiful soup HTML to text
    with gzip.open(path, mode='rb') as gzf:
        #cleantexts = []
        #doc_ids = []
        text_and_ids = []

        for i, record in enumerate(warc.WARCFile(fileobj=gzf)):
            if i == 0:
                continue
            # cleantexts.append(BeautifulSoup(record.payload.read(), 'lxml').text)

            soup = BeautifulSoup(record.payload.read(), 'lxml')
            for script in soup(["script", "style"]):
                script.extract()  # rip it out
            text = soup.get_text()

            # break into lines and remove leading and trailing space on each
            lines = (line.strip() for line in text.splitlines())
            # break multi-headlines into a line each

            chunks = (phrase.strip() for line in lines
                      for phrase in line.split("  "))

            # drop blank lines
            text = '\n'.join(chunk for chunk in chunks if chunk)
            text = remove_html_tags(text)

            #cleantexts.append(text)
            #doc_ids.append(record.header.get('WARC-TREC-ID'))
            doc_id = record.header.get('WARC-TREC-ID')
            text_and_ids.append((text, doc_id))

    return text_and_ids  #cleantexts,doc_ids
Example #13
def clean_warc(input):

    text = v2.extract(input)
    warc_content = warc.WARCFile(fileobj=StringIO.StringIO(input))

    for record in warc_content:
        url, date = record['WARC-Target-URI'], record['WARC-Date']

    return '%s,%s\n%s' % (url, date, text)
Example #14
 def mapper(self, _, line):
     f = None
     ## If we're on EC2 or running on a Hadoop cluster, pull files via S3
     if self.options.runner in ['emr', 'hadoop']:
         # Connect to Amazon S3 using anonymous credentials
         conn = boto.connect_s3(anon=True)
         pds = conn.get_bucket('aws-publicdatasets')
         # Start a connection to one of the WARC files
         k = Key(pds, line)
         f = warc.WARCFile(fileobj=GzipStreamFile(k))
     ## If we're local, use files on the local file system
     else:
         print 'Loading local file {}'.format(line)
         f = warc.WARCFile(fileobj=gzip.open(line))
     ###
     for i, record in enumerate(f):
         for key, value in self.process_record(record):
             yield key, value
         self.increment_counter('commoncrawl', 'processed_records', 1)
Example #15
 def mapper(self, _, line):
     """
     The map will download the file from commoncrawl, parse the file into multiple records, and process each record
     """
     self.start_time = time.time()
     # Connect to Amazon S3 using anonymous credentials
     boto_config = botocore.client.Config(
         signature_version=botocore.UNSIGNED,
         read_timeout=180,
         retries={'max_attempts': 20})
     s3client = boto3.client('s3', config=boto_config)
     # Check bucket existence
     try:
         s3client.head_bucket(Bucket='commoncrawl')
     except botocore.exceptions.ClientError as exception:
         LOG.error('Failed to access bucket "commoncrawl": %s', exception)
         return
     # Check if the input exists
     try:
         s3client.head_object(Bucket='commoncrawl', Key=line)
      except botocore.exceptions.ClientError as exception:
         LOG.error('Input not found: %s', line)
         return
     # Download input
     sys.stderr.write("Downloading s3://commoncrawl/{}\n".format(line))
     sys.stderr.write(
         time.strftime(
             "Download [START]. Distance from initial time: %Hh:%Mm:%Ss\n",
             time.gmtime(time.time() - self.start_time)))
     try:
         temp = TemporaryFile(mode='w+b',
                              dir=self.options.s3_local_temp_dir)
         s3client.download_fileobj('commoncrawl', line, temp)
      except botocore.exceptions.ClientError as exception:
         LOG.error('Failed to download %s: %s', line, exception)
         return
     sys.stderr.write(
         time.strftime(
             "Download [FINISHED]. Distance from initial time: %Hh:%Mm:%Ss\n",
             time.gmtime(time.time() - self.start_time)))
     temp.seek(0)
     ccfile = warc.WARCFile(fileobj=(GzipStreamFile(temp)))
     sys.stderr.write('Attempting MapReduce Job......\n')
     sys.stderr.write(
         time.strftime(
             "Processing [START]. Distance from initial time: %Hh:%Mm:%Ss\n",
             time.gmtime(time.time() - self.start_time)))
     for _i, record in enumerate(ccfile):
         for key, value in self.process_record(record):
             yield key, value
         self.increment_counter('commoncrawl', 'processed_records', 1)
     sys.stderr.write(
         time.strftime(
             "Processing [FINISHED]. Distance from initial time: %Hh:%Mm:%Ss\n",
             time.gmtime(time.time() - self.start_time)))
Example #16
    def __init__(self, f):
        if not os.path.isfile('records.json'):
            print('No records.json file was found.')
            print('We need the records.json for deduplication!')
            if 'n' in raw_input('Continue? [y/n]').lower():
                sys.exit(1)
        else:
            self.load_records()

        self.input_filename = f
        self.input_file = warc.WARCFile(self.input_filename)
        self.input_file_size = os.path.getsize(self.input_filename)

        self.output_filename = self.input_filename[:-8] \
            + '-deduplicated.warc.gz'
        self.output_file = warc.WARCFile(self.output_filename, 'w')

        self.output_log_filename = self.input_filename[:-8] \
            + '-deduplicated.log'
        self.output_log = []
Example #17
    def double_check(cls, f):
        input_file = warc.WARCFile(f)
        input_file_size = os.path.getsize(f)
        input_file_records = 0
        output_filename = f[:-8] + '-deduplicated.warc.gz'
        output_file = warc.WARCFile(output_filename)
        output_file_size = os.path.getsize(output_filename)
        output_file_records = 0

        while input_file_size > input_file.tell():
            for record in input_file:
                input_file_records += 1

        while output_file_size > output_file.tell():
            for record in output_file:
                output_file_records += 1

        input_file.close()
        output_file.close()

        return input_file_records == output_file_records - 1
Example #18
def get_partial_warc_file(url, num_bytes=1024 * 10):
    """
    Download the first part of a WARC file and return a warc.WARCFile instance.

    url: the url of a gzipped WARC file
    num_bytes: the number of bytes to download. Default is 10KB

    return: warc.WARCFile instance
    """
    with closing(requests.get(url, stream=True)) as r:
        buf = StringIO(r.raw.read(num_bytes))
    return warc.WARCFile(fileobj=buf, compress=True)
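A usage sketch for the function above (the URL is only illustrative, not a verified Common Crawl path); iteration is wrapped in try/except because the gzip stream is cut off after num_bytes:

# Illustrative only: placeholder URL.
url = 'https://data.commoncrawl.org/crawl-data/.../example.warc.gz'
partial = get_partial_warc_file(url)
try:
    for record in partial:
        print(record['WARC-Type'])
except Exception:
    pass  # expected once the truncated stream runs out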
Example #19
 def process_record(self, filepath):
     f = warc.WARCFile(fileobj=gzip.open(filepath))
     for i, record in enumerate(f):
         if record['Content-Type'] != 'text/plain':
             continue
         page = record.payload.read()
         v, n = learn_vocab_from_train_iter(page)
         yield v, n
     self.increment_counter('commoncrawl', 'processed_pages', 1)
Example #20
def create(file_path, i):
    entries = []
    r = redis.StrictRedis(host='localhost', port=6379, db=0)
    with gzip.open(file_path, mode='rb') as gzf:
        for record in warc.WARCFile(fileobj=gzf):
            url = record['WARC-Target-URI'].strip()
            html = record.payload.read()
            soup = BeautifulSoup(html, 'lxml')
            links = [link.get('href') for link in soup.find_all('a')]
            domain = urlparse(url).netloc  # derive the domain from the target URI
            row = entityfeatureextractor.extract_row(domain, url, html)
            r.rpush(url, row)
            r.rpush(url, links)
Example #21
    def process_paths(self, id_, paths):
        '''
        connect to s3 and get the data
        '''

        conn = boto.connect_s3(anon=True, host='s3.amazonaws.com')
        bucket = conn.get_bucket('commoncrawl')

        for uri in paths:
            key_ = Key(bucket, uri)
            archive_iterator = warc.WARCFile(fileobj=GzipStreamFile(key_))
            for record in archive_iterator:
                for res in self.process_record(record):
                    yield res
Example #22
    def parse_archive(self, line):
        # Connect to Amazon S3 using anonymous credentials
        conn = boto.connect_s3(anon=True)
        pds = conn.get_bucket('aws-publicdatasets')

        # Start a connection to one of the WARC files
        k = Key(pds, line)
        f = warc.WARCFile(fileobj=GzipStreamFile(k))

        for record in f:
            if record['Content-Type'] != 'application/http; msgtype=response':
                continue
            self.doc_q.put(record.payload.read())
            self.count += 1
Example #23
 def mapper(self, _, line):
   f = None
   ## If we're on EC2 or running on a Hadoop cluster, pull files via S3
   if line.startswith("s3://"):
   
     print('Downloading ...',file=sys.stderr)
     key = None
     
     # Connect to Amazon S3 using anonymous credentials
     conn = boto.connect_s3(anon=True)
     if line.startswith("s3://"):
        pathStart = line.index('/',5)
        bucketName = line[5:pathStart]
        keyPath = line[pathStart+1:]
        print("Bucket: "+bucketName,file=sys.stderr)
        print("Key: "+keyPath,file=sys.stderr)
        bucket = conn.get_bucket(bucketName)
        key = Key(bucket,keyPath)
     else:
        print("Bucket: aws-publicdatasets",file=sys.stderr)
        print("Key: "+line,file=sys.stderr)
        bucket = conn.get_bucket("aws-publicdatasets")
        key = Key(bucket,line)
     # Start a connection to one of the WARC files
     f = warc.WARCFile(fileobj=GzipStreamFile(key))
     
   ## If we're local, use files on the local file system
   else:
     if line.startswith("file:///"):
        line = line[7:]
     print("Local: {}".format(line),file=sys.stderr)
     f = warc.WARCFile(fileobj=gzip.open(line))
   ###
   for i, record in enumerate(f):
     for key, value in self.process_record(record):
       yield key, value
     self.increment_counter('commoncrawl', 'processed_records', 1)
Example #24
    def __init__(self, filename=None, old_style=False):
        # We also support reading from archives, right? The main thing is to read everything as bytes.
        if (filename is None):
            self._file = sys.stdin.buffer
        elif (filename.endswith('.xz')):
            self._file = lzma.open(filename, 'rb')
        elif (filename.endswith('.gz')):
            self._file = gzip.open(filename, 'rb')
        else:
            self._file = open(filename, 'rb')

        # I had to dig this out of the warc library's own source code.
        # Its documentation is really quite poor.
        self._warc = warc.WARCFile(fileobj=self._file)
        self._old_style = old_style
Example #25
def parse_wet_file():
    # TODO: copy WET file from HDFS to tmp path

    gzip_fobj = gzip.open(wet_file, "r")
    warc_fobj = warc.WARCFile(fileobj=gzip_fobj, compress=False)

    while True:
        try:
            record = warc_fobj.read_record()
        except:
            continue
        if not record:
            break

        # TODO: got a warc record in record, parse it
    return
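As a rough sketch of the TODO above (an assumption, not the original logic): WET 'conversion' records carry the extracted page text as a plain-text payload, so parsing one record could look like this.

def parse_wet_record(record):
    # Hypothetical helper: pull the target URI and extracted text out of
    # a WET record; non-text records are skipped.
    if record['Content-Type'] != 'text/plain':
        return None
    uri = record.header.get('warc-target-uri')
    text = record.payload.read()
    return uri, text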
Example #26
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description=__name__)
    parser.add_argument('--source-url',
                        '-u',
                        help='Remote URL to read input WARC file from.')
    parser.add_argument('--source-file',
                        '-f',
                        help='Local path to read input WARC file from.')

    args = parser.parse_args()
    # Validate arguments
    if not (args.source_file or args.source_url):
        parser.error(
            "--source-file or --source-url argument must be provided.")

    if args.source_file is not None:
        source_string = args.source_file
        cf = open(args.source_file, 'rb')
    elif args.source_url is not None:
        source_string = args.source_url
        # Open a connection pool
        http = urllib3.PoolManager(cert_reqs='CERT_REQUIRED',
                                   ca_certs=certifi.where())
        # Open a streaming connection to the specified URL
        cf = http.request('GET', args.source_url, preload_content=False)

    # Wrap the filestream in a streamable unzipper
    f = warc.WARCFile(fileobj=GzipStreamFile(cf))
    warc_records = 0
    warc_responses = 0
    readable_pages = 0
    report_interval = 100

    start_time = arrow.utcnow()
    for record in f:
        if record['WARC-Type'] == 'response':
            warc_responses = warc_responses + 1
    end_time = arrow.utcnow()
    elapsed_time = end_time - start_time
    print("{} response records in file {} ()".format(warc_responses,
                                                     source_string,
                                                     elapsed_time))
Example #27
 def process_document(self, doc):
   if doc.status == 200:
     self.concurrency_lock.acquire()
     try:
       #print base64.b64encode(doc.text)+"\t"+doc.url+"\t"+str(time.time())
       warc_record = warc.WARCRecord(payload=doc.text,headers={"WARC-Target-URI":doc.url})
       f = warc.WARCFile(fileobj=sys.stdout.buffer)
       f.write_record(warc_record)
       self.crawlsize+=sys.getsizeof(doc.text)/1000000.0
       if self.sizelimit != None and self.crawlsize > self.sizelimit:
         self.interrupt=True
         self.save_status()
       if self.timelimit != None and time.time()-self.crawlstarts > self.timelimit:
         self.interrupt=True
         self.save_status()
     finally:
       self.concurrency_lock.release()
   else:
     pass
Example #28
def analyze_warc_file():
    host = []
    server = []
    f = warc.WARCFile(FILE, "rb")
    for num, record in enumerate(f, start=1):
        line = None
        record.payload.readline()
        line = record.payload.readline()
        print(str(line))
        if 'Host' in str(line):
            host.append(
                re.search(r':(.*)',
                          str(line)).group(1).strip().replace("\\r\\n'", ''))
        if 'Server' in str(line):
            server.append(
                re.search(r':(.*)',
                          str(line)).group(1).strip().replace("\\r\\n'", ''))
    print(server)
    print(host)
    return host, server
Example #29
def parsefile(filename):
    f = warc.WARCFile(filename, 'r')
    invindex = {}
    pagetablekey = 1 + len(PAGETABLE)
    for record in f:
        url = record.header.get('warc-target-uri', None)
        if not url:
            continue

        words = record.payload.read().split()
        PAGETABLE[pagetablekey] = (url, len(words))
        for w in set(words):
            #if regex.match(w.decode('utf8')):
            if w not in invindex:
                invindex[w] = [pagetablekey]
            else:
                invindex[w].append(pagetablekey)
        pagetablekey += 1

    f.close()
    return invindex
Example #30
    def process_record(self, record):
        if record['WARC-Type'] != 'response':
            return

        # The HTTP response is defined by a specification: first part is headers
        # (metadata) and then following two CRLFs (newlines) has the response
        payload = record.payload.read()

        http_headers, body = payload.split('\r\n\r\n', 1)
        if 'Content-Type: text/html' in http_headers and body.strip():
            if ENDPOINT_RE.search(http_headers) or INDIEWEB_RE.search(body):
                warcstr = StringIO()
                warcfile = warc.WARCFile(fileobj=warcstr, mode='w')
                warcfile.write_record(
                    warc.WARCRecord(payload=payload, header=record.header))
                warcbuf = base64.b64encode(warcstr.getvalue())
                warcfile.close()

                domain = urlparse.urlparse(
                    record['WARC-Target-URI']).netloc.lower()
                # domain = headers['Host']
                yield domain, warcbuf
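On the consuming side, a hedged sketch (assuming the same Python 2 StringIO/base64 imports as the example above) of turning such a base64-encoded buffer back into WARC records:

import base64
from StringIO import StringIO

import warc

def decode_warc_buffer(warcbuf):
    # Hypothetical counterpart to the mapper above: undo the base64
    # encoding and iterate the contained WARC record(s).
    raw = base64.b64decode(warcbuf)
    return list(warc.WARCFile(fileobj=StringIO(raw)))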