def should_fetch_media(link_dir, link):
    """Decide whether media (audio/video) should be downloaded for *link*.

    Skips static-file URLs and links whose 'media' output directory
    already exists; otherwise defers to the FETCH_MEDIA config flag.
    """
    # Either disqualifier short-circuits the fetch.
    if is_static_file(link['url']) or os.path.exists(os.path.join(link_dir, 'media')):
        return False
    return FETCH_MEDIA
def should_fetch_dom(link_dir, link):
    """Decide whether a rendered DOM dump should be saved for *link*.

    Skips static-file URLs and links that already have an 'output.html'
    dump; otherwise defers to the FETCH_DOM config flag.
    """
    # Either disqualifier short-circuits the fetch.
    if is_static_file(link['url']) or os.path.exists(os.path.join(link_dir, 'output.html')):
        return False
    return FETCH_DOM
def should_fetch_screenshot(link_dir, link):
    """Decide whether a screenshot should be captured for *link*.

    Skips static-file URLs and links that already have a
    'screenshot.png'; otherwise defers to the FETCH_SCREENSHOT flag.
    """
    # Either disqualifier short-circuits the fetch.
    if is_static_file(link['url']) or os.path.exists(os.path.join(link_dir, 'screenshot.png')):
        return False
    return FETCH_SCREENSHOT
def should_fetch_archive_dot_org(link_dir, link):
    """Decide whether *link* should be submitted to archive.org.

    Skips static-file URLs and links that already have an
    'archive.org.txt' record from a previous submission attempt;
    otherwise defers to the SUBMIT_ARCHIVE_DOT_ORG config flag.
    """
    if is_static_file(link['url']):
        return False
    if os.path.exists(os.path.join(link_dir, 'archive.org.txt')):
        # A record file is present — a submission was already attempted,
        # regardless of whether it succeeded.
        return False
    return SUBMIT_ARCHIVE_DOT_ORG
def should_fetch_title(link_dir, link):
    """Decide whether the page title should be fetched for *link*.

    A title that is non-empty and does not look like a bare URL
    (i.e. does not start with 'http') is treated as already valid
    and is not re-fetched. Static-file URLs are skipped entirely;
    otherwise defers to the FETCH_TITLE config flag.
    """
    existing_title = link['title']
    has_valid_title = bool(existing_title) and not existing_title.lower().startswith('http')
    if has_valid_title:
        return False
    if is_static_file(link['url']):
        return False
    return FETCH_TITLE
def should_fetch_git(link_dir, link):
    """Decide whether *link* should be cloned as a git repository.

    Skips static-file URLs, links whose 'git' output directory already
    exists, and URLs that look neither like a known git host nor a
    '.git' URL; otherwise defers to the FETCH_GIT config flag.
    """
    url = link['url']
    if is_static_file(url):
        return False
    if os.path.exists(os.path.join(link_dir, 'git')):
        return False
    # Not clonable unless the host is a known git domain or the URL
    # ends in a .git extension (De Morgan of the original check).
    if domain(url) not in GIT_DOMAINS and extension(url) != 'git':
        return False
    return FETCH_GIT