Example #1
0
def should_fetch_media(link_dir, link):
    """Decide whether media should be fetched for ``link``.

    Returns False when the link's URL is a static file (per
    ``is_static_file``) or when a ``media`` entry already exists inside
    ``link_dir``.  Otherwise the ``FETCH_MEDIA`` setting is returned as-is.
    """
    if is_static_file(link['url']):
        return False

    media_path = os.path.join(link_dir, 'media')
    return False if os.path.exists(media_path) else FETCH_MEDIA
Example #2
0
def should_fetch_dom(link_dir, link):
    """Decide whether the DOM dump should be fetched for ``link``.

    Returns False when the link's URL is a static file (per
    ``is_static_file``) or when ``output.html`` already exists inside
    ``link_dir``.  Otherwise the ``FETCH_DOM`` setting is returned as-is.
    """
    # Short-circuit keeps the filesystem check from running for static files.
    skip = (
        is_static_file(link['url'])
        or os.path.exists(os.path.join(link_dir, 'output.html'))
    )
    return False if skip else FETCH_DOM
Example #3
0
def should_fetch_screenshot(link_dir, link):
    """Decide whether a screenshot should be fetched for ``link``.

    Returns False when the link's URL is a static file (per
    ``is_static_file``) or when ``screenshot.png`` already exists inside
    ``link_dir``.  Otherwise the ``FETCH_SCREENSHOT`` setting is returned
    as-is.
    """
    if not is_static_file(link['url']):
        screenshot_path = os.path.join(link_dir, 'screenshot.png')
        if not os.path.exists(screenshot_path):
            return FETCH_SCREENSHOT

    return False
Example #4
0
def should_fetch_archive_dot_org(link_dir, link):
    """Decide whether ``link`` should be submitted to archive.org.

    Returns False when the link's URL is a static file (per
    ``is_static_file``) or when ``archive.org.txt`` already exists inside
    ``link_dir``.  Otherwise the ``SUBMIT_ARCHIVE_DOT_ORG`` setting is
    returned as-is.
    """
    if is_static_file(link['url']):
        return False

    marker_path = os.path.join(link_dir, 'archive.org.txt')
    # Only the file's presence is checked; its contents are not read.
    return SUBMIT_ARCHIVE_DOT_ORG if not os.path.exists(marker_path) else False
Example #5
0
def should_fetch_title(link_dir, link):
    """Decide whether the page title should be fetched for ``link``.

    Returns False when ``link['title']`` is non-empty and does not start
    with ``http`` (case-insensitively), i.e. the link already carries a
    usable title, or when the link's URL is a static file (per
    ``is_static_file``).  Otherwise the ``FETCH_TITLE`` setting is returned
    as-is.  ``link_dir`` is accepted but not used.
    """
    current_title = link['title']

    # A title that is really just a URL does not count as a valid title.
    has_valid_title = bool(current_title) and not current_title.lower().startswith('http')
    if has_valid_title:
        return False

    return False if is_static_file(link['url']) else FETCH_TITLE
Example #6
0
def should_fetch_git(link_dir, link):
    """Decide whether a git clone should be fetched for ``link``.

    Returns False when the link's URL is a static file (per
    ``is_static_file``), when a ``git`` entry already exists inside
    ``link_dir``, or when the URL is not considered clonable.  A URL is
    clonable if its ``domain(...)`` is in ``GIT_DOMAINS`` or its
    ``extension(...)`` equals ``'git'``.  Otherwise the ``FETCH_GIT``
    setting is returned as-is.
    """
    url = link['url']

    if is_static_file(url):
        return False

    git_path = os.path.join(link_dir, 'git')
    if os.path.exists(git_path):
        return False

    if domain(url) in GIT_DOMAINS or extension(url) == 'git':
        return FETCH_GIT

    return False