Example #1
def collect_archive_from_url(url, base_arch_dir="archives"):
    list_name = get_list_name(url)
    pp("Getting archive page for %s" % list_name)

    if w3c_archives_exp.search(url):
        return w3crawl.collect_from_url(url, base_arch_dir)

    response = urllib2.urlopen(url)
    html = response.read()

    results = []
    for exp in mailing_list_path_expressions:
        results.extend(exp.findall(html))

    pp(results)

    # directory for downloaded files
    arc_dir = archive_directory(base_arch_dir, list_name)

    # download monthly archives
    for res in results:
        result_path = os.path.join(arc_dir, res)
        # this check is redundant with urlretrieve
        if not os.path.isfile(result_path):
            gz_url = url + res
            pp('retrieving %s' % gz_url)
            resp = urllib2.urlopen(gz_url)
            if resp.getcode() == 200:
                print("200 - writing file to %s" % (result_path))
                output = open(result_path, 'wb')
                output.write(resp.read())
                output.close()
            else:
                print("%s error code trying to retrieve %s" %
                      (str(resp.getcode()), gz_url))
Example #2
def collect_archive_from_url(url, base_arch_dir="archives"):
    list_name = get_list_name(url)
    pp("Getting archive page for %s" % list_name)

    if w3c_archives_exp.search(url):
        return w3crawl.collect_from_url(url, base_arch_dir)

    response = urllib2.urlopen(url)
    html = response.read()

    results = []
    for exp in mailing_list_path_expressions:
        results.extend(exp.findall(html))

    pp(results)

    # directory for downloaded files
    arc_dir = archive_directory(base_arch_dir, list_name)

    # download monthly archives
    for res in results:
        result_path = os.path.join(arc_dir, res)
        # this check is redundant with urlretrieve
        if not os.path.isfile(result_path):
            gz_url = url + res
            pp('retrieving %s' % gz_url)
            resp = urllib2.urlopen(gz_url)
            if resp.getcode() == 200:
                print("200 - writing file to %s" % (result_path))
                output = open(result_path, 'wb')
                output.write(resp.read())
                output.close()
            else:
                print("%s error code trying to retrieve %s" %
                      (str(resp.getcode()), gz_url))
Example #3
def collect_archive_from_url(url, base_arch_dir="archives"):
    list_name = get_list_name(url)
    pp("Getting archive page for %s" % list_name)

    if w3c_archives_exp.search(url):
        return w3crawl.collect_from_url(url, base_arch_dir)

    response = urllib2.urlopen(url)
    html = response.read()

    results = []
    for exp in mailing_list_path_expressions:
        results.extend(exp.findall(html))

    pp(results)

    # directory for downloaded files
    arc_dir = archive_directory(base_arch_dir, list_name)

    # download monthly archives
    for res in results:
        result_path = os.path.join(arc_dir, res)
        # this check is redundant with urlretrieve
        if not os.path.isfile(result_path):
            gz_url = url + res
            pp('retrieving %s' % gz_url)
            info = urllib.urlretrieve(gz_url, result_path)
            print info
Example #4
def collect_archive_from_url(url, base_arch_dir="archives"):
    list_name = get_list_name(url)
    pp("Getting archive page for %s" % list_name)

    if w3c_archives_exp.search(url):
        return w3crawl.collect_from_url(url, base_arch_dir)

    response = urllib2.urlopen(url)
    html = response.read()

    results = []
    for exp in mailing_list_path_expressions:
        results.extend(exp.findall(html))

    pp(results)

    # directory for downloaded files
    arc_dir = archive_directory(base_arch_dir, list_name)

    # download monthly archives
    for res in results:
        result_path = os.path.join(arc_dir, res)
        # this check is redundant with urlretrieve
        if not os.path.isfile(result_path):
            gz_url = url + res
            pp("retrieving %s" % gz_url)
            info = urllib.urlretrieve(gz_url, result_path)
            print info
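
A minimal sketch of the urlretrieve pattern used in Examples #3 and #4 (Python 2); the URL and paths below are placeholders, not values taken from the examples:

import os
import urllib

# Hypothetical monthly archive URL and local target path (placeholders only).
gz_url = "http://lists.example.org/pipermail/some-list/2014-January.txt.gz"
arc_dir = os.path.join("archives", "some-list")
result_path = os.path.join(arc_dir, "2014-January.txt.gz")

if not os.path.isdir(arc_dir):
    os.makedirs(arc_dir)

# urlretrieve fetches the URL and writes it to result_path in a single call;
# it returns the local filename and the response headers.
if not os.path.isfile(result_path):
    filename, headers = urllib.urlretrieve(gz_url, result_path)
    print(filename)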
Example #5
def collect_archive_from_url(url, archive_dir="../archives", notes=None):
    """
    Collects archive files (generally tar.gz) from a Mailman
    archive page.

    Returns True if archives were downloaded, False otherwise
    (for example if the page lists no accessible archive files).
    """
    list_name = get_list_name(url)
    pp("Getting archive page for %s" % list_name)

    if w3c_archives_exp.search(url):
        return w3crawl.collect_from_url(url, archive_dir, notes=notes)

    response = urllib2.urlopen(url)
    html = response.read()

    results = []
    for exp in mailing_list_path_expressions:
        results.extend(exp.findall(html))

    pp(results)

    # directory for downloaded files
    arc_dir = archive_directory(archive_dir, list_name)

    populate_provenance(directory=arc_dir,
                        list_name=list_name,
                        list_url=url,
                        notes=notes)

    encountered_error = False
    # download monthly archives
    for res in results:
        result_path = os.path.join(arc_dir, res)
        # this check is redundant with urlretrieve
        if not os.path.isfile(result_path):
            gz_url = "/".join([url.strip("/"), res])
            pp('retrieving %s' % gz_url)
            resp = urllib2.urlopen(gz_url)
            if resp.getcode() == 200:
                print("200 - writing file to %s" % (result_path))
                output = open(result_path, 'wb')
                output.write(resp.read())
                output.close()
            else:
                print("%s error code trying to retrieve %s" %
                      (str(resp.getcode()), gz_url))
                encountered_error = True

    if not encountered_error:  # mark that all available archives were collected
        provenance = access_provenance(arc_dir)
        provenance['complete'] = True
        update_provenance(arc_dir, provenance)

    # return True if any archives collected, false otherwise
    return len(results) > 0
Example #6
def collect_archive_from_url(url, archive_dir="../archives", notes=None):
    """
    Collects archive files (generally tar.gz) from a Mailman
    archive page.

    Returns True if archives were downloaded, False otherwise
    (for example if the page lists no accessible archive files).
    """
    list_name = get_list_name(url)
    pp("Getting archive page for %s" % list_name)

    if w3c_archives_exp.search(url):
        return w3crawl.collect_from_url(url, archive_dir, notes=notes)

    response = urllib2.urlopen(url)
    html = response.read()

    results = []
    for exp in mailing_list_path_expressions:
        results.extend(exp.findall(html))

    pp(results)

    # directory for downloaded files
    arc_dir = archive_directory(archive_dir, list_name)

    populate_provenance(directory=arc_dir, list_name=list_name, list_url=url, notes=notes)

    encountered_error = False
    # download monthly archives
    for res in results:
        result_path = os.path.join(arc_dir, res)
        # this check is redundant with urlretrieve
        if not os.path.isfile(result_path):
            gz_url = "/".join([url.strip("/"),res])
            pp('retrieving %s' % gz_url)
            resp = urllib2.urlopen(gz_url)
            if resp.getcode() == 200:
                print("200 - writing file to %s" % (result_path))
                output = open(result_path, 'wb')
                output.write(resp.read())
                output.close()
            else:
                print("%s error code trying to retrieve %s" %
                      (str(resp.getcode()), gz_url))
                encountered_error = True

    if not encountered_error:   # mark that all available archives were collected
        provenance = access_provenance(arc_dir)
        provenance['complete'] = True
        update_provenance(arc_dir, provenance)

    # return True if any archives collected, false otherwise
    return len(results) > 0
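
A hedged usage sketch, assuming collect_archive_from_url and its helpers are defined as in Example #5; the list URLs below are placeholders, not taken from the examples:

# Hypothetical driver loop over a few Mailman archive pages.
urls = [
    "https://lists.example.org/pipermail/list-a/",
    "https://lists.example.org/pipermail/list-b/",
]

for list_url in urls:
    collected = collect_archive_from_url(list_url,
                                         archive_dir="../archives",
                                         notes="initial crawl")
    if not collected:
        print("no archive files found for %s" % list_url)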