Code example #1
File: indexer.py Project: Tino92/tino-thesis
def read_and_clean_files(clueweb_file, ann_file, data_dir, ann_dir):
    """
    Read file from data_dir and ann_dir, replace entity mentions and clean records in that file
    :param clueweb_file:
    :param ann_file:
    :param data_dir: Warc files directory
    :param ann_dir: Annotations directory
    :return: {'record_id': record_id,
		'replaced_record': cleaned_replaced_record,
		'cleaned_record': cleaned_record}
    """
    annotation_input = fileinput.FileInput(os.path.join(ann_dir, ann_file), openhook=fileinput.hook_compressed)
    annotation_list = []
    for line in annotation_input:
        annotation_list.append(Annotation.parse_annotation(line))

    warc_path = os.path.join(data_dir, clueweb_file)
    warc_file = warc.open(warc_path)
    print "Replacing entity mentions for ", clueweb_file, ":", ann_file, "..."
    start = time.time()
    warc_entry = WarcEntry(warc_path, warc_file, annotation_list)
    cleaned_records = warc_entry.replace_entity_mentions()
    end = time.time()
    print "Time used: ", end - start
    warc_file.close()
    return cleaned_records
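A hedged usage sketch for read_and_clean_files; every path and file name below is a hypothetical placeholder, and the Annotation and WarcEntry helpers come from the surrounding project:

# Hypothetical ClueWeb/annotation inputs; real names will differ.
cleaned = read_and_clean_files(
    clueweb_file="0000tw-00.warc.gz",
    ann_file="0000tw-00.anns.tsv.gz",
    data_dir="/data/clueweb/warcs",
    ann_dir="/data/clueweb/annotations",
)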
Code example #2
def handle_warc_file(warc_file_name):
	global PROCESS_STDOUT_HANDLE

	if warc_file_name.find('.warc.gz') < 0:
		return

	if PROCESS_STDOUT_FILENAME is None:
		initialize_output_files(0)

	w = warc.open(warc_file_name)

	try:
		for record in w:
			try:
				record_url_parsed = urlparse.urlparse(record['WARC-Target-URI'])

				host = record_url_parsed.hostname
				queries_string = record_url_parsed.query

				queries = [x.split('=')[0] for x in queries_string.split('&')] # grabs a list of keys

				for query in queries:
					print host, query # one line per host and query combination
			except:
				sys.stderr.write('Awkward record in ' + warc_file_name + '\n')
				continue

	except IOError:
		sys.stderr.write('Awkward record found in ' + warc_file_name + '\n')
Code example #3
def validate_file(warc_path):
    """Validate a CYOC.net grab warc file"""
    wf = warc.open(warc_path)
    for record in wf:
        if record.type == 'response':
            validate_record(record)
    return
Code example #4
def run(string):
    myindex = index.open_dir("index")
    writer = myindex.writer(procs=3,  multisegment=True, limitmb=512)
    f = None
    try:
        f = warc.open(string)
    except :
        print "Can't open this file" 
        sys.exit()

    i = 0
    count = 1
    datalist = []
    print "start read record!"
    for record in f:
        i = i + 1
        
        if i >= 2:
            # parser = MyHTMLParser()
            try:    
                
                # data = striphtml(unicode(record.payload, errors='ignore'))
                # datalist.append(data)

                # if data == None :
                    # print i
                parser = MyHTMLParser()
                parser.feed(unicode(record.payload, errors='ignore'))
                data = parser.getData().decode('utf8')
                # datalist.append(data2)
                parser.kill()

                if (i % 15000) == 0:
                    print "write the data of %d" %(i)
                    # for x in datalist:
                    #     writer.add_document(docId=count, content=x)
                    #     count += 1
                    print "commit now"
                    start = time.time()
                    writer.commit()
                    stop = time.time()
                    print "commit over ", (stop - start)
                    writer = myindex.writer(procs=3,  multisegment=True, limitmb=512)
                    datalist = []
                    gc.collect()
            except Exception as e:
                print "error in the data of %d" %(i)
                print e.message, e.args
                print "------------------------"
                continue  # no usable data for this record
            writer.add_document(docId=i-1, content=data)
    # for x in datalist:
    #     writer.add_document(docId=count, content=x)
    #     count += 1
    print "final commit now"
    start = time.time()
    writer.commit()
    stop = time.time()
    print "final commit over", (stop - start)
Code example #5
File: webarchive.py Project: bakztfuture/cosr-back
def create_warc_from_corpus(documents, filename=None):
    """ Used mainly in tests to generate small .warc files """

    if filename is None:
        fd, filename = tempfile.mkstemp(suffix=".warc")
        os.close(fd)

    f = warc.open(filename, "w")

    for doc in documents:

        headers = "Connection: close\r\nContent-Type: text/html"
        if "headers" in doc:
            headers = "\r\n".join(["%s: %s" % (k, v) for k, v in doc["headers"].iteritems()])

        payload = "HTTP/1.1 200 OK\r\n" + headers + "\r\n\r\n" + doc["content"]

        record = warc.WARCRecord(payload=payload, headers={
            "Content-Type": "application/http; msgtype=response",
            "WARC-Type": "response",
            "WARC-Target-URI": doc["url"]
        })
        f.write_record(record)

    f.close()

    return filename
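A minimal usage sketch for the helper above, assuming only the url and content keys the function actually reads; the values are made-up test data:

# Made-up test document; the function returns the path of the generated file.
docs = [{
    "url": "http://example.com/",
    "content": "<html><body>Hello WARC</body></html>",
}]
warc_path = create_warc_from_corpus(docs)
print warc_path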
Code example #6
File: facc1.py Project: zxteloiv/ClueWeb09-scripts
def process_facc1_with_filename(facc1file, cluewebfile):
    facc1_obj = open(facc1file, 'rb')
    clueweb_obj = warc.open(cluewebfile, 'rb')

    process_facc1_with_fileobj(facc1_obj, clueweb_obj)

    facc1_obj.close()
    clueweb_obj.close()
Code example #7
File: nolink-crawler.py Project: clab/gv-crawl
    def _get_warc_file(self):
        """Creates a new Warc file"""
        assert self.warc_fp is None, "Current Warc file must be None"

        self.file_n += 1
        fname = "{}.{}.warc.gz".format(self.fname_prefix, self.file_n)
        self.warc_fname = os.path.join(self.outdir, fname)
        self.warc_fp = warc.open(self.warc_fname, "w")
Code example #8
File: CreateIndex.py Project: KemingChen/IR
def parser(filename):
    try:
        warcfile = warc.open(filename)
    except Exception, e:
        os.system("cls")
        print "Can't find the " + filename
        os.system("pause")
        raise e
Code example #9
File: crawler.py Project: vchahun/gv-crawl
    def _get_warc_file(self):
        '''Creates a new Warc file'''
        assert self.warc_fp is None, 'Current Warc file must be None'

        self.file_n += 1
        fname = '%s.%s.warc.gz' % (self.fname_prefix, self.file_n)
        self.warc_fname = os.path.join(self.outdir, fname)
        self.warc_fp = warc.open(self.warc_fname, 'w')
Code example #10
File: warcscrape.py Project: datafyit/warc3
def warc_records(string, path):
    """Iterates over warc records in path."""
    for filename in os.listdir(path):
        if re.search(string, filename) and ".warc" in filename:
            print("parsing", filename)
            with warc.open(path + filename) as warc_file:
                for record in warc_file:
                    yield record
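Because the filename filter uses re.search, the first argument is a regular expression, and since the function concatenates path + filename directly, the directory argument must end with a separator. A sketch with assumed names:

# Assumed pattern and directory; prints the target URI of every matched record.
for record in warc_records(r"CC-MAIN", "/data/warcs/"):
    print(record.header.get("WARC-Target-URI"))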
Code example #11
def FilterOneFile(InName,OutName,hBlack):
    In = warc.open(InName)
    Out = warc.open(OutName,'w')
    cnt = 0
    FilterCnt = 0
    while True:
        try:
            record = In.read_record()
        except (AssertionError, EOFError) as e:
            break
        cnt += 1
        if 'warc-trec-id' in record:
            if record['warc-trec-id'] in hBlack:
                FilterCnt +=  1
                continue
        Out.write_record(record)
    print '[%s] [%d/%d] filtered' %(InName,FilterCnt,cnt)
    return True
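A hypothetical call for the filter above; hBlack is expected to support membership tests on WARC-TREC-IDs, and the names below are placeholders:

# Placeholder file names and blacklist entry.
blacklist = set(['clueweb09-en0000-00-00000'])
FilterOneFile('input.warc.gz', 'filtered.warc.gz', blacklist)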
Code example #12
def handle_warc_file(warc_file):
	f = warc.open(warc_file)

	for record in f:
		if not core.is_response_record(record):
			continue
		soup = BeautifulSoup(record.payload.read())
		
		if soup.text.find('IP.Board') >= 0:
			yield record.url
Code example #13
    def open_warc(self):
        #Reset period
        self.period_start = time.gmtime()
        self.period_start_time = time.time()

        #Open the warc
        self.warc_filepath = utilities.generate_warc_filepath(self.data_dir, self.collection, warc_type=self.stream_name)
        utilities.create_warc_dir(self.warc_filepath)
        log.debug("Opening %s", self.warc_filepath)
        self.warc = warc.open(self.warc_filepath, "wb")
Code example #14
def handle_warc_file(warc_file, f):
	'''
	Perform a function over every record in a warc file.
	Args:
		- warc_file : path/to/warc.gz file
		- f         : function to run over every record.
	'''
	warc_handle = warc.open(warc_file)

	for record in warc_handle:
		yield f(record)
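Since the helper yields lazily, nothing runs until the generator is consumed; a sketch with an assumed file name:

# Count records by mapping each one to 1 and summing the lazy results.
total = sum(handle_warc_file("example.warc.gz", lambda record: 1))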
Code example #15
def get_response_warcs(warc_file):
	'''
	Returns a sequence of response records only in a warc_file
	Args:
		- warc_file : path/to/warc.gz file
	'''
	warc_handle = warc.open(warc_file)

	for record in warc_handle:
		if record.type == 'response':
			yield record
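A companion sketch that consumes the generator above; the file name is again a placeholder:

# Read the raw HTTP payload of each response record.
for response in get_response_warcs("example.warc.gz"):
    body = response.payload.read()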
Code example #16
File: FindWarc.py Project: KemingChen/IR
def main():
    targetId = 22466
    #filename = "data/ClueWeb09_English_Sample.warc"
    filename = "data/10.warc.gz"
    warcfile = warc.open(filename)
    docId = 0
    for doc in warcfile:
        if docId  == targetId:
            print unicode(doc.payload, errors="ignore")
        elif docId > targetId:
            break
        docId += 1
Code example #17
    def __iter__(self):
        for fpath in self.fpaths:
            f = warc.open(fpath)
            for record in f:
                if record.type == 'response':
                    try:
                        header = record.header
                        content = record.payload.read()
                        yield (header, content)
                    except Exception as e:
                        print(e)
            f.close()
Code example #18
def find_file(filename):
    # E.g. "clueweb12-0002wb-99-19011"
    root = filename.split("-")[1]
    folder = root[0:2]
    warcf = filename.split("-")[2]
    warc_path = "/clueweb12b/DiskB/ClueWeb12_" + folder + "/" + root + "/" + root + "-" + warcf + ".warc.gz"
    w = warc.open(warc_path)
    for reg in w:
        if "warc-trec-id" not in reg.header.keys():
            continue
        if filename == reg.header["warc-trec-id"]:
            return reg.payload.read()
    return filename + " NOT FOUND"
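The comment at the top of the function shows the expected TREC-ID format; a purely illustrative lookup (the Disk-B layout hard-coded in warc_path must exist on the machine):

# Illustrative id taken from the comment above; returns the raw record payload.
html = find_file("clueweb12-0002wb-99-19011")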
Code example #19
File: warcshotter.py Project: ersi/warcshotter
def main():
    targeturl = argv[1]
    filename = "%s-%s.warc" % (urlparse(targeturl).netloc,
                               datetime.utcnow().strftime("%Y%m%d-%H%M"))
    print "Starting snapshot of %s, writing to %s" % (targeturl, filename)
    wf = warc.open(filename, "w")

    warcinfo_record = mkwarcinfo(filename)
    if DEBUG:
        print "Writing warcinfo record"
    wf.write_record(warcinfo_record)

    record = download(targeturl)
    if len(REQUESTS):
        request_record = REQUESTS.pop(0)
        if DEBUG:
            print "Writing request record %s" % request_record['WARC-Record-ID']
        wf.write_record(request_record)
        if DEBUG:
            print "Writing response record %s" % record['WARC-Record-ID']
        wf.write_record(record)
    else:
        if DEBUG:
            print "Writing response record"
        wf.write_record(record)

    # If the parser could parse the first resource, continue to download found
    # resources. Doesn't parse again, currently. Only grabbin images, css etc
    if DEBUG:
        print "Downloading linked content"
    for target in TARGETS:
        record = download(target)

        if len(REQUESTS):
            request_record = REQUESTS.pop(0)
            if DEBUG:
                print "Writing request record %s" % request_record['WARC-Record-ID']
            wf.write_record(request_record)
            if DEBUG:
                print "Writing response record %s" % record['WARC-Record-ID']
            wf.write_record(record)
        else:
            if DEBUG:
                print "Writing response record"
            wf.write_record(record)
    if DEBUG:
        print "TARGETS ", TARGETS
    wf.close()
    print "Done."
Code example #20
def write_csv(in_name=None, out_name="warc_header.csv"):
    with open(out_name, 'wb') as csvfile:
        spamwriter = csv.writer(csvfile, delimiter=' ',
                                quotechar='|', quoting=csv.QUOTE_MINIMAL)
        with warc.open(in_name, 'rU') as f:
            for record in f:
                url = record['WARC-Target-URI']
                rec_id = record['WARC-Record-ID']
                warc_date = record['WARC-Date']
                ip = record['WARC-IP-Address']
                spamwriter.writerow([url, rec_id, ip, warc_date])
    return True
Code example #21
    def __init__(self, filepath):
        """
        :param filepath:  The filepath of the WARC file.
        """
        self.filepath = filepath
        log.info("Writing to %s", self.filepath)

        #Create the directory
        filepath_parent = os.path.dirname(self.filepath)
        if not os.path.exists(filepath_parent):
            log.debug("Creating %s directory.", filepath_parent)
            os.makedirs(filepath_parent)

        #Open warc
        self._warc_file = ia_warc.open(self.filepath, "w")
Code example #22
File: warc2db.py Project: ldmt-muri/gv-crawl
    def article_records():
        for fn in args.warcs:
            n_records = n_errors = 0
            print('Processing {}'.format(fn))
            warc_file = warc.open(fn)
            for record in warc_file:
                n_records += 1
                try:
                    yield process_article(record)
                except AssertionError as e:
                    n_errors += 1
                    logging.error('{}\t{}'.format(record.url, e))
            warc_file.close()
            print('Records processed: {} ({} errors => {} inserted)'.format(
                n_records, n_errors, n_records - n_errors))
Code example #23
    def _get_warc_file(self):
        '''Creates a new Warc file'''
        #assert self.warc_fp is None, 'Current Warc file must be None'
        if not self.hostname:
            return
        file_n = self.warc_file_n_slots.get(self.hostname)
        if not file_n:
            file_n = 0
        file_n += 1
        self.warc_file_n_slots[self.hostname] = file_n
        fname = '%s_%s.warc.gz' % (self.hostname, file_n)
        warc_fname = os.path.join(self.warc_dir, fname)
        assert os.path.exists(warc_fname) is not True
        warc_fp = warc.open(warc_fname, 'w')
        self.warc_fp_slots[self.hostname] = warc_fp
        return warc_fp
Code example #24
def FetchOneDirURL(InDir,OutName):
    lFName = WalkDir(InDir)
    out = open(OutName,'w')
    for InName in lFName:
        In = warc.open(InName)
        while True:
            try:
                record = In.read_record()
            except (AssertionError, EOFError) as e:
                break
            if ('warc-trec-id' in record) and ('warc-target-uri' in record):
                print >> out, record['warc-target-uri'] + '\t' + record['warc-trec-id']
                
        print '[%s] finished' %(InName)
    out.close()
    print 'dir [%s] finished' %(InDir)
    return True
Code example #25
def list_records(filepath, services=()):
    print "File %s" % filepath
    f = warc.open(filepath)
    try:
        for record in f:
            if record.type == 'response':
                resp_record = to_response_record(record)
                if (not services
                    or (isinstance(resp_record, ApiResponseRecord) and resp_record.service in services)
                        or (not isinstance(resp_record, ApiResponseRecord) and "other" in services)):
                    print "Record %s" % resp_record.record_id
                    print "Url: %s" % resp_record.record_url
                    print "Date: %s" % resp_record.date
                    if isinstance(resp_record, ApiResponseRecord):
                        print "Service: %s" % resp_record.service
                        print "API method: %s (%s)" % (resp_record.api_method, resp_record.api_args)
    finally:
        f.close()
Code example #26
File: features.py Project: yoPitarch/princess_git
def getDocContent(trecid):
    '''work-in-progress function'''
    princess_dir = '/osirim/sig/PROJET/PRINCESS'
    corpus_dir   = '/osirim/sig/CORPUS/CLUEWEB12/ClueWeb12-Full'
    clue_web_dir = trecid[0:9] + '_' + trecid[10:12]
    #print clue_web_dir
    clue_web_dir = clue_web_dir.replace("clueweb","ClueWeb")
    warc_dir = trecid[10:16]
    #warc_file = warc_dir + '-' + trecid[19:21] + 'warc.gz'
    warc_file = corpus_dir+'/'+clue_web_dir+'/'+ warc_dir+"/"+warc_dir + '-' + trecid[17:19] + '.warc.gz'
    #docnum = trecid[20:]
    #print warc_file
    file = warc.open(warc_file)
    #loop through warc to get doc content 
    for record in file:
        #print record.header
        if "WARC-Trec-ID" in record.header:
    #   if record['WARC-TREC-ID'] == docNum:
            if record['WARC-Trec-ID'] == trecid:
                #print record.payload.read()
                html_doc =  record.payload.read()
                #soup = BeautifulSoup(html_doc, 'html.parser') # strips the tags cleanly but crashes on </br />
                soup = BeautifulSoup(html_doc, 'html5lib') # may leave some div, a and img tags in place
                for script in soup(["script", "style","comment"]):
                    script.extract() 

                text = soup.get_text()
                # break into lines and remove leading and trailing space on each
                lines = (line.strip() for line in text.splitlines())
                # break multi-headlines into a line each
                chunks = (phrase.strip() for line in lines for phrase in line.split("  "))
                # drop blank lines
                text_clean = '\n'.join(chunk for chunk in chunks if chunk)

                #txt_unicode =  unicode(txt, 'utf-8')
                #txt_unicode = u""+txt
                #txt_unicode = txt.encode('utf8', 'replace')
                #print txt
                #return txt.encode('utf-8','ignore')
                text_clean = text_clean.replace(u"</br />","</br>")
                return text_clean
Code example #27
def run(string):
    f = None
    try:
        f = warc.open(string)
    except :
        print "Can't open this file" 
        sys.exit()

    i = 0
    error = 0
    print "start read record!"
    for record in f:
        i = i + 1
        if i > 2:
            data1 = striphtml(unicode(record.payload, errors='ignore'))
            if data1 is None:
                error = error + 1
            if (i % 1000) == 0 :
                print error
                print i
    print error
Code example #28
def get_warc(url):
    global targeturl
    targeturl = url

    temp = tempfile.NamedTemporaryFile(mode='rb', suffix = '.warc')

    filename = "%s-%s.warc" % (urlparse(targeturl).netloc,
                               datetime.utcnow().strftime("%Y%m%d-%H%M"))

    wf = warc.open(temp.name, "w")

    warcinfo_record = mkwarcinfo(filename)

    wf.write_record(warcinfo_record)

    record = download(targeturl)
    if len(REQUESTS):
        request_record = REQUESTS.pop(0)
        wf.write_record(request_record)
        wf.write_record(record)
    else:
        wf.write_record(record)

    #If the parser could parse the first resource, continue to download found
    #resources. Doesn't parse again, currently. Only grabbin images, css etc
    for target in TARGETS:
        record = download(target)

        if len(REQUESTS):
            request_record = REQUESTS.pop(0)
            wf.write_record(request_record)
            wf.write_record(record)
        else:
            wf.write_record(record)
    wf.close()
    binary = open(temp.name, 'rb').read()
    temp.close()

    return binary
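A sketch of persisting the in-memory WARC that get_warc returns; the URL and output name are assumptions:

# Placeholder URL; get_warc returns the raw bytes of the temporary .warc file.
warc_bytes = get_warc('http://example.com/')
out = open('example.warc', 'wb')
out.write(warc_bytes)
out.close()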
Code example #29
def read_simplyhired_data(filename):
  f = warc.open(filename)
  for record in f:
    # Ignore DNS records
    if record['content-type'] == 'text/dns':
      continue
    http_headers, content = parse_warc_payload(record.payload.read())
    soup = BeautifulSoup(content)
    if soup.select('div.detail .job_info'):
      # Get the job metadata and yield to iterator
      job_company = soup.select('div.detail .company')[0].text.strip().replace("Company: ", "")
      job_title = soup.select('div.detail .title')[0].text.strip()
      job_location = soup.select('div.detail .location')[0].text.strip().replace("Location: ", "")
      job_description = soup.select('div.detail .description_full')[0].get_text("\n", strip=True)
      # job_is_telecommute = soup.find_all("dt",text="Telecommute:")[0].find_next_sibling().text.strip()
      yield {
        "source": record.url,
        "company": job_company,
        "title": job_title,
        "location": job_location,
        "description": job_description
      }
Code example #30
    def open_warc(self):
        """ Opens a new WARC file """
        self.warc_file_path = os.path.join(
                config.warcs_path,
                "apicrawler.%s.warc.gz" % (
                datetime.datetime.now().strftime(config.datetime_format)))

        logger.info("Writing new WARC file: %s" % self.warc_file_path)
        self.warc_file = warc.open(self.warc_file_path, "w")
        #
        # Write WARCInfo record
        #
        warc_header = warc.WARCHeader(
                {"WARC-Type": "warcinfo",
                 "Content-Type": "application/warc-fields",
                 "WARC-Filename": os.path.basename(self.warc_file_path)},
                defaults=True)
        warc_payload = 'software: apicrawler\nhostname: ia200127'
        warc_record = warc.WARCRecord(warc_header, warc_payload)
        self.warc_file.write_record(warc_record)
        self.warcinfo_id = warc_header['WARC-RECORD-ID']
        logger.info("New WARC id: %s" % self.warcinfo_id)
Code example #31
File: savehtml.py Project: wanng-ide/NTCIRMDB2
import warc
from assessment.models import *
import mongoengine
import pymongo

#mongoengine.connect(db=ntcir)
con = pymongo.Connection('localhost', 27017)

ntcir = con.ntcir
topic = ntcir.ntcir
table = ntcir.table

table.insert({'table_id': '1', 'table': 'dog'})
table.insert({'table_id': '2', 'table': 'cat'})
table.insert({'table_id': '3', 'table': 'mouse'})

f = warc.open('/Users/Fan/Downloads/0000tw-00.warc.gz')
i = 0
for record in f:
    h = record.header
    test = record.payload.read()
    WARC_Trec_ID = h.get("WARC-Trec-ID")
    i += 1
    if i > 1:
        filepath = '/Users/Fan/Downloads/NTCIRMDB2/assessment/templates/' + str(
            WARC_Trec_ID) + '.html'
        with open(filepath, 'w') as d:
            d.write(test)
            d.close()
        topic.insert({
            'topic_id': '1',
            'topic': 'dog',
Code example #32
with open("warc.paths", "r") as paths:
    # count = 0
    # for path in paths:
    # 	path = pre + path
    # 	count += 1
    # 	print(path)
    # print(count)

    # only process the first file for practice
    path = paths.readline()
    path = pre + path
    print(path)

    with warc.open(
            "CC-MAIN-20160205193905-00000-ip-10-236-182-209.ec2.internal.warc.gz",
            "r") as records:

        recordnum = 0
        with open("warc_output.txt", "w") as fout:
            for record in records:
                # WARC records have three different types:
                #  ["application/warc-fields", "application/http; msgtype=request", "application/http; msgtype=response"]
                # We're only interested in the HTTP responses
                if record.header[
                        'content-type'] != 'application/http; msgtype=response':
                    continue

                if recordnum < 1000000:
                    try:
                        content = record.payload.read().decode("utf-8")
Code example #33
# To run: python just_text.py > text
###
from glob import glob
#
from nltk.corpus import stopwords
import warc

# List any of the WARC files found in the data folder
warc_files = glob(
    r"C:\DIC\Labs\Lab2\CC-MAIN-20190121172846-20190121194846-00444.warc.gz")

# Process each of the WARC files we found
files_processed = 0
for fn in warc_files:
    f = warc.open(fn)
    for record in f:
        url = record.header.get('warc-target-uri', None)
        if not url:
            continue
        text = record.payload.read()
        #
        print(url)
        print(text)


def construct_dictionary():
    pairs = []
    for i in range(0, 3):  # + str(i)
        with open("reducer_output_cc" + "//part-r-0000" + str(i),
                  "r",
                  encoding="utf-8") as f:
Code example #34
    count = 0
    #	filename = '../dataset/'+domain+'.com'+str(i)+'.warc.gz'
    filename = '../samples/' + domain + '_sample.warc.gz'
    print 'Load' + filename
    try:
        with gzip.open(filename, 'rb') as gfz:
            ''' Load file '''
            contents = [(warc.WARCRecord(payload=record.payload.read(),
                                         headers=record.header))
                        for record in warc.WARCFile(fileobj=gfz)]
            l = len(contents)
    except:
        continue
    ''' select records randomly '''
    print 'select'
    f_sample = warc.open(filename_sample, 'a')
    while count < per_warc:
        rand = random.randint(0, l - 1)
        sys.stdout.write("\rRecord count %i" % count)
        sys.stdout.flush()
        r = contents[rand]
        #pre = preprocessing.HTMLPreprocessing(r.payload)
        payload = r.payload
        r['Content-Length'] = str(len(payload))
        r['WARC-Record-ID'] = str(uuid.uuid4())
        f_sample.write_record(
            warc.WARCRecord(payload=payload, headers=r.header))
        count += 1
    print '\n'
    f_sample.close()
Code example #35
File: test.py Project: wanng-ide/NTCIRMDB2
import warc

f = warc.open(
    '/home/tamaki/corpus/DiskB/ClueWeb12_08/0817wb/0817wb-49.warc.gz')
i = 0
for record in f:
    h = record.header
    test = record.payload.read()
    WARC_Trec_ID = h.get("WARC-Trec-ID")
    i += 1
    if i > 1:
        filepath = '/home/fanyimeng/test/' + str(WARC_Trec_ID) + '.html'
        with open(filepath, 'w') as d:
            d.write(test)
            d.close()
Code example #36
File: warccrawl.py Project: okfde/odm-datenerfassung
filetypes = ['.CSV', '.XLS', '.XLSX', '.JSON', '.RDF', '.ZIP']
geofiletypes = ('.GEOJSON', '.GML', '.GPX', '.GJSON', '.TIFF', '.SHP', '.KML',
                '.KMZ', '.WMS', '.WFS')
filetypes.extend(geofiletypes)

csvoutfile = open(sys.argv[1] + '.data.csv', 'a+b')
datawriter = csv.writer(csvoutfile, delimiter=',')

columns = [
    'Stadt_URL', 'URL_Datei', 'URL_Text', 'URL_Dateiname', 'Format', 'geo',
    'URL_PARENT', 'Title_Parent'
]

datawriter.writerow(columns)

f = warc.open(sys.argv[2])
domain = sys.argv[1]
blacklist = ('.jpg', '.gif', '.ico', '.txt', '.pdf', '.png', 'dns:', '.css',
             '.js')

for record in f:
    if ('WARC-Target-URI' in record.header) and (
            domain in record['WARC-Target-URI']) and not any(
                x in record['WARC-Target-URI']
                for x in blacklist) and 'metadata' in record['warc-type']:
        #for item in record.__dict__['header'].items():
        #print item
        for line in record.__dict__['payload'].read().split('\n'):
            if any(ext in line.upper() for ext in filetypes):
                url = line.split(' ')[1]
                extension = url.split('.')[-1].upper()
Code example #37
    bucket = lsh.hash_tables[0].get_val(key)
    for query_object in bucket:
        candidates = lsh.query(query_object[0], distance_func='cosine')
        dedup.add(query_object[1])
        for c in candidates:
            candidate_key = c[0][
                1]  # warc id is appended as extra data in lsh.index()
            if candidate_key == query_object[1]:
                continue
            candidate_distance = c[1]
            if float(candidate_distance) >= threshold:
                dedup.add(candidate_key)
            elif candidate_key in dedup:
                dedup.remove(candidate_key)

file = warc.open(filenameIn + '_dedup.warc.gz', 'wb')
numSingle = len(dedup)
for i in range(0, max_files):
    with gzip.open(datasetPath + filenameIn + str(i) + '.warc.gz',
                   mode='rb') as gzf:
        for record in warc.WARCFile(fileobj=gzf):
            record_id = record['WARC-Record-ID']
            if record_id in dedup:
                payload = record.payload.read()
                file.write_record(
                    warc.WARCRecord(payload=payload, headers=record.header))

print 'Total pages: ' + str(doc_count)
print 'Pages after deduplication: ' + str(numSingle)

file.close()
Code example #38
File: domains_dist.py Project: serzh/prj-nlp
import matplotlib
matplotlib.use('TkAgg')
import matplotlib.pyplot as plt
import collections
import argparse
import warc
from urllib.parse import urlparse  # assumes Python 3, matching the rest of the snippet

argparser = argparse.ArgumentParser(
    description=
    'processes warc files and returns pie chart with domains distribtuion')
argparser.add_argument('--input', required=True, help='Path to the warc file')
argparser.add_argument('--output',
                       required=True,
                       help='Path to the output image of pie chart')
args = argparser.parse_args()

domains_counter = collections.Counter()
with warc.open(args.input) as f:
    for record in f:
        if 'WARC-Target-URI' not in record:
            continue
        else:
            host_with_port = urlparse(record['WARC-Target-URI']).netloc
            host = host_with_port.split(':')[0]
            domain = host.split('.')[-1]
            domains_counter.update([domain])

common = dict(domains_counter.most_common(10))
domains = list(common.keys())
total = sum(common.values())
shares = [common[domain] / total for domain in domains]

fig, ax = plt.subplots()
Code example #39
import urllib.request
import warc

if __name__ == '__main__':

    f = warc.open("test.warc.gz")
    for record in f:
        print(record['WARC-Target-URI'], record['Content-Length'])
Code example #40
def main():
    """ main routine """

    if not os.path.exists(LEXICON_PATH):
        try:
            os.mkdir(LEXICON_PATH)
        except Exception:
            print 'Error making %s, exiting.' % LEXICON_PATH
            return

    wet_files = get_wet_files()
    docid_gen = docid_generator()

    url_index = UrlIndex()
    word_index = WordIndex()

    for wet_file in wet_files:
        print wet_file
        wet_fd = warc.open(wet_file)
        doc_next_offset = 0

        lexicon_file = os.path.join(
            LEXICON_PATH,
            '.'.join(os.path.basename(wet_file).split('.')[:-1] + ['lexicon']))
        lex_fd = open(lexicon_file, 'wb')

        for wet_record in wet_fd:
            if wet_record.url:

                docid = docid_gen.next()

                url = wet_record.url
                url_lens, url_fileid, url_offset = url_index.write_url_index(
                    url)

                doc_fileid = 88
                doc_offset = doc_next_offset if doc_next_offset else 0
                doc_header_length = wet_record.payload.fileobj.tell(
                ) - doc_offset
                doc_length = doc_header_length + wet_record.header.content_length

                content_offset = doc_header_length
                content_length = wet_record.header.content_length

                print docid, url, (url_fileid, url_offset, url_lens), \
                  (doc_fileid, doc_offset, doc_length, content_offset, content_length)

                # docid(4B), url_pos[fileid(2B), offset(4B), lens(2B)],
                # doc_pos[fileid(2B), offset(4B), lens(4B), con_offset(2B), con_lens(4B)]
                url_idx_data = pack('=IHIHHIIHI', \
                                docid, url_fileid, url_offset, url_lens, doc_fileid, \
                                doc_offset, doc_length, content_offset, content_length)
                url_index.write_url_index_entry(url_idx_data)

                # generate lexicons
                saved_offset = wet_record.payload.fileobj.tell()
                page_content = wet_record.payload.fileobj.read(content_length)
                wet_record.payload.fileobj.seek(saved_offset)
                for token, start, end in split_with_offset(page_content):
                    if is_ascii(token) and len(token) > 0 and len(token) < 256:
                        word_id = word_index.add_entry(token)
                        lexicon_data = pack('iiih', word_id, docid, start, 2)
                        lex_fd.write(lexicon_data)

                doc_next_offset = wet_record.payload.fileobj.tell(
                ) + wet_record.payload.length

        lex_fd.close()
        wet_fd.close()
Code example #41
# prefix= "https://commoncrawl.s3.amazonaws.com/"
# exurl ="crawl-data/CC-MAIN-2019-13/segments/1552912202658.65/warc/CC-MAIN-20190322115048-20190322141048-00227.warc.gz"
# finalurl= prefix+exurl
# finalurl = finalurl.replace('/warc/','/wet/').replace('.warc.','.warc.wet.')
# wget.download(exurl)

keywords = ['trump', 'president', 'government', 'party', 'people', 'election', 'state',
            'house', 'political', 'politics', 'republican', 'vote', 'administration']
count = 0
# exurl ="https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-13/segments/1552912202658.65/wet/CC-MAIN-20190322115048-20190322141048-00227.warc.wet.gz"
exurl2 = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-13/segments/1552912203021.14/wet/CC-MAIN-20190323201804-20190323223804-00227.warc.wet.gz"
exurl3 = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-13/segments/1552912201672.12/wet/CC-MAIN-20190318191656-20190318213656-00398.warc.wet.gz"

# wget.download(exurl)
records = warc.open("D:\MS\\2ndSem\DIC\Lab2\\crawlData\\text.warc.wet")
newfile = open("D:\\MS\\2ndSem\\DIC\\Lab2\\crawlData\\crawl2.txt", 'a')
for record in records:
    url = record.header.get('WARC-Target-URI', None)
    if url:
        text = record.payload.read()
        try:
            if (count < 520 and detect(text.decode("utf-8")) == 'en'):
                if any(k in text.decode("utf-8").lower() for k in keywords):
                    newfile.write(str(text, 'utf-8'))
                    newfile.write('\n\n')
                    count = count + 1
                    print(count)
        except Exception:
            pass
Code example #42
from modules import NumberGenerator

docIDDigits = 4
frequancyDigits = 2
docIdGenerator = NumberGenerator.Number(digits=docIDDigits, after=args.startID)

r = redis.Redis(unix_socket_path=args.redis, db=args.redisDB)
if args.docIDwet:
    docIDwetFile = open(args.docIDwet, mode='a')
else:
    docIDwetFile = sys.stderr

for filepath in args.files:
    print("* Dealing:", filepath, file=sys.stderr)
    with warc.open(filepath, 'rb') as f:
        for (record, offset, _) in tqdm(f.browse(), unit='records'):
            URI = record.url
            if URI:
                content = record.payload.read()
                if content:
                    (lang, langConfidence) = Language.classify(content)
                    if lang in space_devided_langs:
                        words = latin_sep_words.split(str(content))
                    elif lang == 'zh' and not args.skipChinese:
                        words = jieba.cut(content, cut_all=False)
                        # words = list(words)
                        words = [word for word in words if non_latin_words_pattern.match(word)]
                    else:
                        # other languages
                        continue
Code example #43
File: ExtractEnPage.py Project: TPLink32/nlp
folderidx =sys.argv[1]

foldername = '/home/cluo/publicdata/DiskB/ClueWeb12_'+folderidx
pages = set()
for l in open('../ref/enpages.txt').readlines():
    pages.add(l.strip())


for folder in os.listdir(foldername):
    subfoldername = foldername+'/'+folder
    for filename in os.listdir(subfoldername):
        print subfoldername,filename
        os.system('7z e '+subfoldername+'/'+filename)
        warcfilename = filename.replace('.7z','')
        f = warc.open(warcfilename)
    
        for record in f:
            id = record.header.get('WARC-TREC-ID')
            if id in pages:
                fout = open('../data/parse/'+str(record.header.get('WARC-TREC-ID')+'.html'),'w')
                content = record.payload.read()
                write = False
                for l in content.split('\n'):
                    if '<!DOCTYPE' in l:
                        write=True
                    if write == True:
                        fout.write(l+'\n')
                fout.close()
        os.system('rm '+warcfilename)
Code example #44
File: warc_id.py Project: stteffen58/Seminarthesis
import warc
import uuid
import sys
import os
import gzip
os.chdir('/home/eckel/')
''' Load and preprocess data '''

print 'preprocessing'
filenameIn = sys.argv[1]
max_range = int(sys.argv[2])
for i in range(0, max_range):
    print filenameIn + str(i)
    fw = warc.open('dataset_id/' + filenameIn + str(i) + '.warc.gz', 'wb')
    with gzip.open('dataset/' + filenameIn + '.com' + str(i) + '.warc.gz',
                   mode='rb') as gzf:
        for record in warc.WARCFile(fileobj=gzf):
            record['WARC-Record-ID'] = str(uuid.uuid4())
            fw.write_record(
                warc.WARCRecord(payload=record.payload.read(),
                                headers=record.header))
    fw.close()
Code example #45
File: Wiki.py Project: ZJCODE/CodeArchive
# -*- coding: utf-8 -*-
"""
Created on Mon Nov 21 10:20:14 2016

@author: ZJun
"""

import warc
f = warc.open("00.warc.gz")
f.read_record()
i = 0
for record in f:
    i = i + 1
    print record['WARC-Target-URI'], record['Content-Length']
    if i > 10:
        break
f.close()

f = warc.open("00.warc.gz")
a = f.read_record()
b = f.read_record()
'''
a.header.items()
Out[52]: 
[('warc-type', 'warcinfo'),
 ('content-length', '219'),
 ('version', '0.18'),
 ('warc-date', '2009-03-75T00:59:24-0400'),
 ('content-type', 'application/warc-fields'),
 ('warc-record-id', '<urn:uuid:b38cd8ab-5ba6-445c-9c9c-0a5cbc3b6a41>')]
 
Code example #46
from __future__ import print_function

import warc
import sys

if len(sys.argv) != 2:
    print("Usage: {} <input_file>".format(sys.argv[0]))
    sys.exit(0)

f = warc.open(sys.argv[1])
for record in f:
    print(record.url)
    # record.header => Dictionary containing the header field names and their values
    # record.payload => Payload object, can be read with a.payload.read()


Code example #47
import warc
from textblob import TextBlob
import string
import urllib
import os
import sys

commonPath = 's3://aws-publicdatasets/'

for line in sys.stdin:
    line = line.strip()
    #download file
    url = commonPath + line
    os.system('aws s3 cp ' + url + ' ./file.warc.wet.gz')
    print 'wet file downloaded'

    #open the warc file
    f = warc.open('file.warc.wet.gz')
    txtFile = open('data/' + line[70:140] + '.txt', 'w')

    #read the warc file and write to text file
    for record in f:
        if record['Content-Type'] == 'text/plain':
            date = record['WARC-Date']
            htmlText = record.payload.read()
            text = htmlText.strip()
            text = text.translate(None, '[!@#$+:|/\#%^*()-_=~]\n')
            printableText = filter(lambda x: x in string.printable, text)
            blob = TextBlob(printableText)
            for sentence in blob.sentences:
                if len(sentence) <= 340:
                    txtFile.write(date[0:10])
                    txtFile.write('\t')
Code example #48
#https://gist.github.com/Smerity/afe7430fdb4371015466
#https://pypi.org/project/langdetect/
#!pip install warc3-wet
import warc
import wget
from langdetect import detect

exurl = "https://commoncrawl.s3.amazonaws.com/crawl-data/CC-MAIN-2019-13/segments/1552912202658.65/wet/CC-MAIN-20190322115048-20190322141048-00227.warc.wet.gz"
wget.download(exurl, out="../Data/Commoncrawl")
from sh import gunzip
gunzip(
    "../Data/Commoncrawl/CC-MAIN-20190322115048-20190322141048-00227.warc.wet.gz"
)
records = warc.open(
    "../Data/Commoncrawl/CC-MAIN-20190322115048-20190322141048-00227.warc.wet")
counter = 0
data = []
urls = set()
keywords = [
    'soccer', 'basketball', 'sports', 'baseball', 'score', 'hockey', 'mls',
    'nba', 'ncaa', 'nfl', 'knicks', 'mma', 'nhl', 'golf'
]
for record in records:
    if counter == 600:
        break
    else:
        url = record.header.get('warc-target-uri', None)
        if not url:
            continue
        if url not in urls:
            urls.add(url)
Code example #49
File: pipeline.py Project: aggroskater/twitchtv-grab
    def process(self, item):

        # assert that this item is flagged for sampling. If not,
        # return immediately. We don't want to butcher uploads that
        # have been determined to be worth saving in their original
        # state.
        #
        # Presumably, the tracker is tagging these items as something
        # appropriate. Alternately, one could create a "Phase 3" grab
        # and know for a fact that we are only receiving videos that
        # should be sampled. In which case, one may skip the item_type
        # check and proceed directly to sampling.

        item_name = item['item_name']
        item_type, item_value = item_name.split(':', 1)

        item['item_type'] = item_type
        item['item_value'] = item_value

        assert item_type in ('video-bulk', 'url-bulk')

        # Item type is not marked as "video-bulk" from tracker.
        # Carry on. Nothing to do here.
        if item_type not in ('video-bulk', 'url-bulk'):
            return

        # ok. This is an item that needs to be sampled.

        # remember where we started from so we can get back there and
        # not mess up the expectations for the rest of stages in the
        # pipeline
        original_path = os.getcwd()

        # get to item_dir ; begin work
        os.chdir(item['item_dir'])

        # we will need some data from the warcfile
        warcinfo_record_ID = ""
        metadata_record_ID = ""
        truncated_record_ID = ""

        # set up old and new warc files for reading and writing, respectively.
        # If a file ends in *.gz for writing, the warc library handles gz
        # compression transparently.
        old_warc_file = warc.open("%(warc_file_base)s.warc.gz" % item)
        new_warc_file = warc.open(
            "%(warc_file_base)s-POSTPROCESSED.warc.gz" % item, "w")

        # ------------------------ Start of main for loop -------------------#

        # and here... we... go
        for record in old_warc_file:

            # Firstly, we detect whether the record we're iterating over holds
            # data we'll need later. If so, behave appropriately. After the
            # if-elif-elif dance, we proceed to copy each record into a new
            # record in the %(warc_file_base)s-POSTPROCESSED.warc.gz file,
            # modifying as necesary (truncated long records, etc)

            # ------------------------ Check for data -------------------------#

            # Grab the lengthy payload (the flv file); if the content-length is
            # longer than ~5MiB, and the record is of the "response" type, then
            # this record *probably* has the flv file.
            if ((long(record['Content-Length']) >= 5000000)
                    and record['WARC-Type'] == "response"):

                # need the record id of the original flv record. Will refernece
                # it in truncated record.
                truncated_record_ID = record['warc-record-id']

                # add "WARC-Truncated" to this record, indicating that it has
                # been truncated due to length.
                record['warc-truncated'] = "length"

                # extract the payload
                tempfile = open("intermediate.int", 'wb')
                for line in record.payload:
                    tempfile.write(line)
                tempfile.close()

                # put the payload back; iterating through record.payload
                # invokes a generator on the payload that seems to
                # "eat up" the payload in the original file. I say so because
                # attempting to, say, write the payload out twice (to TWO files)
                # will fail, as will any attempt to read out the payload again
                # without first "putting it back." (I'd love an explanation for
                # just what's going on here; but for now, this hack works)
                # (for the record with the long content-length, we end up reading
                # the payload twice; once here, to get it to a separate file, and
                # once again, in COPY PAYLOAD, to write out a truncated version to
                # the new warc file)
                stream = StringIO(open("intermediate.dat", 'rb').read())
                stream.seek(0, os.SEEK_END)
                streamlength = stream.tell()
                stream.seek(0)
                record.payload = warc.utils.FilePart(fileobj=stream,
                                                     length=streamlength)

                # can't close the stream yet for some reason. This might
                # introduce leaks of some sort, so keep an eye on it.
                # The relevant error: "IO Operation on a closed file."
                # I suspect this operation occurs somewhere in the warc library,
                # and i'm hoping that the stream object just falls out of scope
                # at some point other than when the entire pipeline shuts down.
                # stream.close()

            # Adjust the warcinfo record to note that we also utilized ffmpeg
            elif (record['WARC-Type'] == "warcinfo"):

                # grab the record-id for later use in resource records
                warcinfo_record_ID = record['warc-record-id']

                # gotta add another "software" key to the content-block of the
                # warcinfo record that indicates the use of ffmpeg.
                warcinfo_stream = StringIO()
                for line in record.payload:
                    warcinfo_stream.write(line)

                # trailing \r\n\r\n is already present in the payload; just seek back
                # two bytes (yes, the second \r\n will get clobbered; potential unicode
                # byte-length issues here) and then tack on the additional lines you
                # need to like so:
                warcinfo_stream.seek(-2, os.SEEK_END)
                warcinfo_stream.write("software: ffmpeg/2.3.1\r\n\r\n")
                warcinfo_stream.seek(0, os.SEEK_END)
                warcinfo_stream_len = warcinfo_stream.tell()
                warcinfo_stream.seek(0)
                record.payload = warc.utils.FilePart(
                    fileobj=warcinfo_stream, length=warcinfo_stream_len)

            # Get the metadata record's warc-record-id for later resource
            # records.
            elif (record['WARC-Type'] == "metadata"):

                metadata_record_ID = record['warc-record-id']

            # End of conditionals. Proceed to write the new record to the
            # post-processed warcfile.

            # ------------------------ Copy Record -------------------------#

            # COPY HEADER

            # Should we add defaults=False ? It seems that some additional headers
            # are added in WARCHeader as well as WARCRecord. However, they don't
            # seem harmful: digests and timestamps.
            new_header = warc.WARCHeader(record.header)

            # COPY PAYLOAD

            # if the current record gets truncated, then set the content-length
            # to the new, truncated length as per spec.
            truncated_flag = None

            # SHORT record payloads
            if long(record['content-length']) < 500000:

                #print "Copying payload..."
                new_payload = StringIO()
                for line in record.payload:
                    new_payload.write(line)
                #if we don't seek back to 0, new_payload.read() is empty
                new_payload.seek(0)
                #print "Done copying payload."

            # LONG record payloads (the one that probably has video data)
            else:

                #print "Found long content-length. Truncating..."
                new_payload = StringIO()
                decrement = 25
                #Grab some lines
                #print "Gonna grab some lines. Decrement: ", decrement
                for line in record.payload:
                    #print "Grabbing a line."
                    new_payload.write(line)
                    decrement -= 1
                    #print "Decrement: ", decrement
                    if decrement == 0:
                        break
                # be kind: rewind
                new_payload.seek(0)
                truncated_flag = True

                #print "Done truncating."

            # CREATE RECORD FROM HEADER AND PAYLOAD

            new_rec = warc.WARCRecord(payload=new_payload.read(),
                                      headers=new_header,
                                      defaults=False)

            # if this record happened to be one that got truncated, then we
            # need to adjust its content-length header.
            if truncated_flag:

                #print "Adjusting content-length header"

                # From page 9 of the ISO WARC Standard:
                #
                # "The WARC-Truncated field may be used on any WARC record. The WARC
                # field Content-Length shall still report the actual truncated size of
                # the record block."

                # Get the length of the truncated content-block and set
                # Content-Length header appropriately
                new_payload.seek(0)
                new_payload.seek(0, os.SEEK_END)
                thelength = new_payload.tell()
                new_rec['content-length'] = str(thelength)
                new_payload.seek(0)

            # WRITE THE NEW RECORD OUT TO THE NEW WARCFILE

            # (the warc library handles the gz-compression and putting each record
            # in a separate gz "member" transparently; no need to much with the gzip
            # library ourselves)

            #print "Copying record to new .warc.gz"
            new_warc_file.write_record(new_rec)
            #print "Done copying record to new .warc.gz"
            #print "\n\n"

        #------------------------ END OF MAIN FOR LOOP ------------------------#

        # at this point, we have a new warcfile with copied and truncated
        # records; now, we need to sample the content and add these "conversion"
        # records to the warc file.

        # Should probably delete old warc at this point, since new warcfile has all
        # of the old records, and we've already got another copy of the main
        # payload. If we proceed to write out the full newfile with the shrunken
        # payload before deleting the old warc, we'll basically be using nearly
        # 3x the interim diskspace rather than 2x. (Don't get me wrong, I'd love
        # to have more of a generator-like setup that negates the need to use
        # twice the disk space, but it's beyond the scope of my abilities at the
        # moment and I don't think I'd be able to get up to speed before the
        # deadline for this project drops (August 27 2014) Update: LOL Twitch is
        # already deleting things on August 26; oh well, I suppose this code
        # could come in handy if the IA suddenly needs to compress lots of
        # material)

        # Now, we need to convert the flv, and add conversion records

        # Our "payload.flv" is not quite an flv yet; the payload still includes the
        # HTTP Response headers. We need to grep for "CRLFCRLF" and then chop off
        # anything prior to it, including it, leaving nothing but the flv file for
        # ffmpeg to work with.
        thefile = open("intermediate.int").read()  # NOT A FILE; just a "str"
        theflv = thefile.split('\r\n\r\n')[1]
        writetheflv = open("samplethis.flv", "w")
        writetheflv.write(theflv)
        writetheflv.close()

        # Get Snapshots
        SnapShot()

        # Get shrinked video
        ShrinkRay()

        # Clean up
        print(
            "********************* \n\n Removing temporary files; cleaning up \n\n*********************"
        )
        # remove original file intermediates: "intermediate.int" and "samplethis.flv"
        rmargs = shlex.split("rm intermediate.int samplethis.flv")
        call(rmargs)

        # And we're done!
        new_warc_file.close()
        os.chdir(original_path)
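The "put the payload back" trick described in the comments above can be isolated into a small helper; a sketch assuming Python 2's StringIO and the same warc.utils.FilePart class the pipeline already uses:

from StringIO import StringIO
import warc.utils

def reread_payload(record):
    # Read the one-shot payload, then restore it so later code can read it again.
    data = record.payload.read()
    stream = StringIO(data)
    record.payload = warc.utils.FilePart(fileobj=stream, length=len(data))
    return data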
Code example #50
File: warc-test.py Project: Willian-Zhang/not-google
import warc
import sys
from tqdm import tqdm

print("** Test 1")
print("*Pass 1")
offsets = []
with warc.open(
        'data/wet/CC-MAIN-20170919112242-20170919132242-00000.warc.wet.gz',
        'rb') as f:
    i = 0

    for record, offset, length in f.browse():
        if i > 3:
            break
        URI = record.url
        print(i, offset, length, URI)
        if URI:
            offsets.append(offset)
            i += 1

print("seeks:", offsets)
print("*Pass 2")
with warc.open(
        'data/wet/CC-MAIN-20170919112242-20170919132242-00000.warc.wet.gz',
        'rb') as f:
    i = 2
    f.seek(offsets[i])
    for record, offset, length in f.browse():
        if i > 5:
            break
Code example #51
import warc
import uuid
import os
os.chdir('/home/eckel/')

f = warc.open("samples/overstock_sample.warc.gz", "rb")
fw = warc.open("overstock_test.warc.gz", "wb")
count = 0
for record in f:
    if record[
            'WARC-Record-ID'] == '2dd726fe-5f11-43c3-a02c-47860e668cac' or record[
                'WARC-Record-ID'] == '4b3e1e5f-9ac3-4619-b784-a093a1d1ac0d':
        payload = record.payload.read()
        record_header = record.header
        fw.write_record(warc.WARCRecord(payload=payload,
                                        headers=record.header))
        fw.write_record(warc.WARCRecord(payload=payload,
                                        headers=record.header))
        fw.write_record(warc.WARCRecord(payload=payload,
                                        headers=record.header))
        fw.write_record(warc.WARCRecord(payload=payload,
                                        headers=record.header))
    #elif count < 2:
    #	payload = record.payload.read()
    #       record_header = record.header
    #	fw.write_record(warc.WARCRecord(payload=payload,headers=record.header))
    #	count += 1
f.close()
fw.close()
Code example #52
import urllib.request
import warc

if __name__ == '__main__':

    urls = [
        'https://elpais.com/', 'https://elpais.com/tag/gente/a',
        'https://politica.elpais.com/', 'https://elpais.com/internacional/'
    ]

    f = warc.open("test.warc.gz", "w")

    for u in urls:
        fp = urllib.request.urlopen(u)

        mybytes = fp.read()
        mystr = mybytes.decode("utf8")

        fp.close()

        header = warc.WARCHeader({"WARC-Type": "response"}, defaults=True)
        header['WARC-Target-URI'] = u

        record = warc.WARCRecord(header, mybytes)
        f.write_record(record)

    f.close()

    for u in urls:
        f = warc.open("test_trozos.warc.gz", "a")
        fp = urllib.request.urlopen(u)
Code example #53
        pickle.dump(text_information, writer)


warc_path_prefix = os.path.abspath(sys.argv[1])
file_path = os.path.abspath(sys.argv[2])

warc_records = {}
counter = 0
part_number = 1
for warc_path in glob.glob(warc_path_prefix + "*"):
    if not warc_path.endswith("warc") and not warc_path.endswith(
            "warc.gz") and not warc_path.endswith("warcs.tgz"):
        print("skipped", warc_path)
        continue
    try:
        f = warc.open(warc_path)
        print("reading", warc_path)

        for record in f:
            try:
                target_uri = record["WARC-Target-URI"]
                html_text = record.payload.read()
                try:
                    html_text = html_text.decode("utf-8", "ignore")
                except:
                    pass
                warc_records[target_uri] = html_text
            except:
                pass
            counter += 1
            if counter % 10000 == 0:
Code example #54
File: test.py Project: wvanderp/node-warc
import warc
f = warc.open("C:\Users\wouter\Documents\GitHub\\node-warc\warc\crawl0.warc")
for record in f:
    print record['Content-Length']


def extract_sentences(clueweb_directory, freebase_directory):
    sub_directories = os.listdir(freebase_directory)
    for directory in sub_directories:
        clueweb_subdir = os.path.join(clueweb_directory, directory)
        if not os.path.isdir(os.path.join(clueweb_subdir)):
            sys.stderr.write("%s is not a directory\n" % (directory))
            continue
        freebase_subdir = os.path.join(freebase_directory, directory)
        if not os.path.isdir(os.path.join(freebase_subdir)):
            sys.stderr.write("%s is not a directory\n" % (directory))
            continue
        leaf_files = os.listdir(freebase_subdir)
        for leaf_file in leaf_files:
            warc_file = os.path.join(
                clueweb_subdir, leaf_file.replace(".anns.tsv", ".warc.gz"))
            annotated_file = os.path.join(freebase_subdir, leaf_file)
            if not os.path.exists(warc_file):
                sys.stderr.write("Skipped: warc file does not exist: %s\n" %
                                 (warc_file))
                continue
            if not os.path.exists(annotated_file):
                sys.stderr.write("Skipped: annotated file does not exist: %s\n" %
                                 (annotated_file))
                continue

            warc_file_reader = warc.open(warc_file).__iter__()
            # records = [record for record in warc_file_reader]
            annotated_file_reader = open(annotated_file)
            current_record = None
            current_record_count = -2
            current_record_content = ""
            start_sentence = 0
            end_sentence = 0
            sys.stderr.write("%s %s\n" % (warc_file, annotated_file))
            entities_and_sentence = []
            for line in annotated_file_reader:
                line = line.rstrip()
                columns = line.split('\t')
                entity_tag = columns[-1]
                if entity_tag.startswith("/m/"):
                    record_number = int(columns[0].split("-")[-1])
                    encoding = columns[1].lower()
                    entity_name = columns[2]
                    start_offset = int(columns[3])
                    end_offset = int(columns[4])
                    score = float(columns[5])
                    # Get the current record
                    while current_record_count < record_number:
                        try:
                            current_record = warc_file_reader.next()
                        except:
                            current_record_count = -100
                            break
                        current_record_count += 1
                        if current_record_count == record_number:
                            try:
                                current_record_content = current_record.payload.decode(
                                    encoding,
                                    'replace').encode('utf-8', 'replace')
                            except:
                                current_record_count = -100
                                break
                    if current_record_count < 0:
                        break
                    if current_record_content[
                            start_offset:end_offset] != entity_name:
                        continue
                    '''if len(records) <= record_number + 1:
                        break
                    current_record_content = records[record_number + 1].payload.decode(encoding, 'replace').encode('utf-8', 'replace')'''
                    # Get the current sentence
                    if start_sentence <= start_offset and end_sentence >= end_offset:
                        entities_and_sentence.append([
                            entity_tag, score, start_offset - start_sentence,
                            end_offset - start_sentence
                        ])
                    else:
                        if entities_and_sentence != []:
                            print json.dumps(entities_and_sentence)
                            entities_and_sentence = []
                        [start_sentence,
                         end_sentence] = get_sentence(current_record_content,
                                                      start_offset, end_offset)
                        if start_sentence != -1:
                            entities_and_sentence.append(
                                current_record_content[
                                    start_sentence:end_sentence])
                            entities_and_sentence.append([
                                entity_tag, score,
                                start_offset - start_sentence,
                                end_offset - start_sentence
                            ])
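The get_sentence helper is not shown in this fragment; judging from its call site it maps an entity's character offsets to the boundaries of the enclosing sentence and signals failure with -1. A plausible sketch, with the delimiter set assumed:

def get_sentence(content, start_offset, end_offset):
    # Expand [start_offset, end_offset) to the enclosing sentence, or
    # return [-1, -1] when the offsets fall outside the content.
    if start_offset < 0 or end_offset > len(content):
        return [-1, -1]
    delimiters = ".!?\n"
    start = start_offset
    while start > 0 and content[start - 1] not in delimiters:
        start -= 1
    end = end_offset
    while end < len(content) and content[end] not in delimiters:
        end += 1
    return [start, end]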
Code example #56
File: processrandompath.py Project: serzh/prj-nlp
from bs4 import BeautifulSoup
from langdetect import detect

import warc


def plain_text(html):
    body = BeautifulSoup(html, "html.parser").find("body")
    if not body:
        return None

    # now strip HTML we don't like.
    for tag in body.findAll():
        if tag.name.lower() in blacklist:
            tag.extract()
        elif not tag.name.lower() in whitelist:
            tag.name = "span"
            tag.attrs = []

    return body.get_text()
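The blacklist and whitelist collections, like the random_file() helper used below, are defined in the elided part of the file; hypothetical stand-ins consistent with the call sites might look like:

import glob
import random

# Hypothetical tag sets; the real ones are in the elided part of the file.
blacklist = {"script", "style", "noscript", "iframe"}
whitelist = {"p", "a", "div", "ul", "ol", "li", "h1", "h2", "h3", "br"}

def random_file():
    # Hypothetical: pick a random WARC file from the working directory.
    return random.choice(glob.glob("*.warc.gz"))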


f = warc.open(random_file())
domains_stat = {}
language_stat = {}
num_undetect_lang = 0
counter = 0
for record in f:
    if record.type == 'response':
        #and record.http_headers.get_header('Content-Type') == 'text/html':
        counter += 1
        content = record.payload.read()
        text = plain_text(content)
        if text:
            try:
                lang = detect(text)
                if lang not in language_stat:
                    language_stat[lang] = 1
                else:
                    language_stat[lang] += 1
            except Exception:
                # langdetect raises LangDetectException when no language
                # can be detected.
                num_undetect_lang += 1
Code example #57
from pyspark.sql import SparkSession
from pyspark import SparkContext


spark = SparkSession.builder.master("local").getOrCreate()
sc = SparkContext.getOrCreate()

# simple Spark example
# rdd = sc.parallelize(["hello world"])
# count = rdd.flatMap(lambda x: x.split(' ')).map(lambda word: (word, 1)).reduceByKey((lambda a, b: a+b))
# output = count.collect()
# print(output)

# word count
# data = sc.textFile("./10.warc.gz")
# count = data.flatMap(lambda x: x.split(' ')).map(lambda word: (word, 1)).reduceByKey((lambda a, b: a+b))
# output = count.collect()
# output

# Read a WARC file and dump each record to a text file.
import warc

f = warc.open("10.warc.gz")
out = open("output06.txt", "a")
for record in f:
    out.write(str(record))
out.close()
f.close()
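The SparkContext set up at the top of this example is never applied to the WARC data; a sketch of combining the two, where the file name and the decode step are assumptions:

# Hypothetical: word-count the WARC payloads with the SparkContext above.
texts = []
g = warc.open("10.warc.gz")
for record in g:
    texts.append(record.payload.read().decode("utf-8", "ignore"))
g.close()

counts = (sc.parallelize(texts)
            .flatMap(lambda text: text.split())
            .map(lambda word: (word, 1))
            .reduceByKey(lambda a, b: a + b))
print(counts.take(10))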

Code example #58
File: WARCMerge.py Project: vnwala/cs851-s15
                else:
                    flagCheckFiles = 1
            if flagCheckFiles == 1:
                if quietMode == False:
                    print "\n The given list contains one or more files that are not in valid WARC format"
            if len(dirTree) == 0:
                if quietMode == False:
                    print('\n  No WARC files found in given list \n')
                sys.exit(0)

        outputPath = Ddir

        # generate new warc file name
        newFile = timeStampedFilename("WARCMerge") + '.warc'
        newFileFullPath = outputPath + '/' + newFile
        filePtr = warc.open(newFileFullPath, "w")
        outputFilesList.append(newFileFullPath)
        flag = 0

        outputFileSize = os.path.getsize(newFileFullPath) / forConvertToMB

        # Sorting files by sizes
        sortFiles(dirTree)

        if quietMode == False:
            print
            print 'Merging the following WARC files:'
            print '---------------------------------'

        for warcFile in dirTree:
            outputFileSize = os.path.getsize(newFileFullPath) / forConvertToMB
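The example is cut off inside this loop; by the surrounding setup it presumably copies each input file's records into filePtr before re-checking the output size. A minimal sketch of that copy step, reusing the fragment's names:

            # Append every record of warcFile to the merged output file.
            w = warc.open(warcFile)
            for record in w:
                filePtr.write_record(record)
            w.close()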
Code example #59
import io
import json
import nltk
import os
import time

import requests
import warc
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

nltk.download('stopwords')
nltk.download('punkt')

if not os.path.isdir("./output"): os.mkdir("./output")
punc = '''!()-[]{};:'"\, <>./?@#$%^&*_~\n='''
count = 1
output = {} 

f = warc.open("01.warc.gz")
for record in f:
    if record.type == 'response':
        print "21 record.type: ", record.type, " , record.url: " , record.url
        r = requests.get(record.url, verify=False)
        read = r.text.encode('ascii', 'ignore').lower()
        print "\n\n\n24  ",type(read)," read: ", read
        text_tokens = word_tokenize(read) 
        nltk_tokens = [ word for word in text_tokens if not word in stopwords.words()]
        print "\n\n\n27 nltk_tokens: ",nltk_tokens
        # Deduplicate the tokens while preserving their order.
        ordered_tokens = set()
        tokens_without_sw = []
        for word in nltk_tokens:
            if word not in ordered_tokens:
                ordered_tokens.add(word)
                tokens_without_sw.append(word)
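The example is cut off at this point; since it creates an ./output directory, imports json, and tracks a count, a plausible continuation (the per-record key and file naming are assumptions) is:

        # Hypothetical continuation: keep the tokens per URL and dump the
        # accumulated results under ./output.
        output[record.url] = tokens_without_sw
        with open(os.path.join("output", "tokens_%d.json" % count), "w") as out_f:
            json.dump(output, out_f)
        count += 1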
Code example #60
    q = norm.pdf(x, 1, 1)

import gzip

import warc
from scipy.stats import norm

with gzip.open(r"C:\ClueWeb09\2008-10-01", "rb") as f1:
    file_content = f1.read()
    x = f1.readline()
    trecCW9 = norm.pdf(x, 0, 2)
    q = norm.pdf(x, 1, 1)


# clue_web_01_08_df = pd.read_csv(r'C:\ClueWeb09\2008-01-01\en0000\01.warc.gz',
#                                 compression='gzip', header=1, sep='\t', quotechar='"')


# Access the processed warc files, one per month, based off of indexes from
# the Indri toolkit. Raw strings keep the backslashes in the Windows paths
# from being read as escape sequences such as "\2008".
for month in ("2008-01-01", "2008-02-01", "2008-03-01", "2008-04-01"):
    path = r"C:\Information Retrieval\ClueWeb09\%s\en0000\01.warc.gz" % month
    f = warc.open(path)
    for record in f:
        print(record['WARC-TREC-ID'], record['Content-Length'])
    f.close()