import os
import urllib
import urllib2

# Helpers such as pp, get_list_name, archive_directory, w3crawl,
# w3c_archives_exp, and mailing_list_path_expressions are defined
# elsewhere in the module.


def collect_archive_from_url(url, base_arch_dir="archives"):
    list_name = get_list_name(url)
    pp("Getting archive page for %s" % list_name)

    if w3c_archives_exp.search(url):
        return w3crawl.collect_from_url(url, base_arch_dir)

    response = urllib2.urlopen(url)
    html = response.read()

    results = []
    for exp in mailing_list_path_expressions:
        results.extend(exp.findall(html))
    pp(results)

    # directory for downloaded files
    arc_dir = archive_directory(base_arch_dir, list_name)

    # download monthly archives
    for res in results:
        result_path = os.path.join(arc_dir, res)
        # this check is redundant with urlretrieve
        if not os.path.isfile(result_path):
            gz_url = url + res
            pp('retrieving %s' % gz_url)
            resp = urllib2.urlopen(gz_url)
            if resp.getcode() == 200:
                print("200 - writing file to %s" % result_path)
                output = open(result_path, 'wb')
                output.write(resp.read())
                output.close()
            else:
                print("%s error code trying to retrieve %s"
                      % (str(resp.getcode()), gz_url))
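The mailing_list_path_expressions that the function scans the index page with are defined elsewhere in the module. As a point of reference, here is a minimal sketch of what such patterns could look like, assuming they are compiled regexes that capture the relative paths of monthly archive files from hrefs on a Mailman/Pipermail index page; the exact patterns below are illustrative, not from the source.

import re

# Hypothetical stand-ins for the module-level mailing_list_path_expressions;
# each pattern captures the relative path of one monthly archive file
# from an href on the archive index page.
mailing_list_path_expressions = [
    re.compile(r'href="(\d{4}-\w+\.txt\.gz)"'),  # gzipped mbox, e.g. 2004-June.txt.gz
    re.compile(r'href="(\d{4}-\w+\.txt)"'),      # uncompressed mbox
]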
def collect_archive_from_url(url, base_arch_dir="archives"):
    list_name = get_list_name(url)
    pp("Getting archive page for %s" % list_name)

    if w3c_archives_exp.search(url):
        return w3crawl.collect_from_url(url, base_arch_dir)

    response = urllib2.urlopen(url)
    html = response.read()

    results = []
    for exp in mailing_list_path_expressions:
        results.extend(exp.findall(html))
    pp(results)

    # directory for downloaded files
    arc_dir = archive_directory(base_arch_dir, list_name)

    # download monthly archives
    for res in results:
        result_path = os.path.join(arc_dir, res)
        # this check is redundant with urlretrieve
        if not os.path.isfile(result_path):
            gz_url = url + res
            pp('retrieving %s' % gz_url)
            info = urllib.urlretrieve(gz_url, result_path)
            print(info)
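This variant leans on the standard library to do the download and the write in one call. In Python 2, urllib.urlretrieve saves the response body to the given path and returns a (filename, headers) tuple, which is what the print above displays. A minimal standalone sketch; the URL and filename here are placeholders.

import urllib

# urlretrieve downloads the URL's body to the given local path and
# returns (local_filename, headers), mirroring the call above.
filename, headers = urllib.urlretrieve(
    "http://example.com/pipermail/list/2004-June.txt.gz",
    "2004-June.txt.gz")
print(filename)           # the local path the file was saved to
print(headers.gettype())  # content type, e.g. "application/x-gzip"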
def collect_archive_from_url(url, archive_dir="../archives", notes=None):
    """
    Collects archive files (generally tar.gz) from a mailman archive page.

    Returns True if archives were downloaded, False otherwise
    (for example, if the page lists no accessible archive files).
    """
    list_name = get_list_name(url)
    pp("Getting archive page for %s" % list_name)

    if w3c_archives_exp.search(url):
        return w3crawl.collect_from_url(url, archive_dir, notes=notes)

    response = urllib2.urlopen(url)
    html = response.read()

    results = []
    for exp in mailing_list_path_expressions:
        results.extend(exp.findall(html))
    pp(results)

    # directory for downloaded files
    arc_dir = archive_directory(archive_dir, list_name)
    populate_provenance(directory=arc_dir, list_name=list_name,
                        list_url=url, notes=notes)

    encountered_error = False

    # download monthly archives
    for res in results:
        result_path = os.path.join(arc_dir, res)
        # this check is redundant with urlretrieve
        if not os.path.isfile(result_path):
            gz_url = "/".join([url.strip("/"), res])
            pp('retrieving %s' % gz_url)
            resp = urllib2.urlopen(gz_url)
            if resp.getcode() == 200:
                print("200 - writing file to %s" % result_path)
                output = open(result_path, 'wb')
                output.write(resp.read())
                output.close()
            else:
                print("%s error code trying to retrieve %s"
                      % (str(resp.getcode()), gz_url))
                encountered_error = True

    if not encountered_error:
        # mark that all available archives were collected
        provenance = access_provenance(arc_dir)
        provenance['complete'] = True
        update_provenance(arc_dir, provenance)

    # return True if any archives were collected, False otherwise
    return len(results) > 0
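Finally, a minimal usage sketch of the last version, assuming the definitions above and their helpers are in scope; the list URL and the notes string are examples only, not from the source.

# Hypothetical call site: crawl one Mailman list's archive page and
# report whether any monthly archives were downloaded.
ok = collect_archive_from_url(
    "https://mail.python.org/pipermail/mailman-users/",
    archive_dir="../archives",
    notes="initial crawl")
if ok:
    print("Downloaded at least one monthly archive.")
else:
    print("No accessible archive files were found on the page.")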