parse_srcs = (
    Composable()  # :: List[Optional[Path]]
    >> F(filter, is_not_none)  # :: List[Path]
    # >> F(filter, lambda phrase: str.startswith(phrase, '/media'))
    >> F(filter, is_media_path)
    >> F(map, tryit(split_path_parts))  # :: List[Optional[PathParts]]
    # >> mapper(tryit(split_path_parts))
    >> F(filter, is_not_none)  # :: List[PathParts]
    >> F(map, getitem(1))  # :: List[Path]
    # >> mapper(getitem(1))
)
# Combine get_img_srcs with get_css_srcs, and parse resultant paths
fetch_full_paths = (
    Composable()  # :: Location
    # >> cache(branch(get_img_srcs, get_css_srcs))  # :: (List[Path], List[Path])
    >> branch(get_img_srcs, get_css_srcs)  # :: (List[Path], List[Path])
    >> combine  # :: List[Path]
    >> unique  # :: List[Path]
)
fetch_paths = fetch_full_paths >> parse_srcs  # :: List[Path]
executor = fetch_paths >> F(map, sync_media)  # Location -> Side Effects! impure! impure!
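
# Illustrative sketch (not part of the original module): Composable and F are
# imported from elsewhere in the project and are not defined in this snippet.
# Judging only from how they are used above, a minimal left-to-right composition
# wrapper (chained with >>) and a partial-application helper (in the spirit of
# fn.py's F) might look roughly like this:
from functools import partial, reduce


class ComposableSketch:
    """Chain callables with >>; calling the chain threads a value through each."""

    def __init__(self, funcs=()):
        self.funcs = tuple(funcs)

    def __rshift__(self, func):
        return ComposableSketch(self.funcs + (func,))

    def __call__(self, value):
        return reduce(lambda acc, func: func(acc), self.funcs, value)


def F_sketch(func, *args, **kwargs):
    """Partially apply func, so F(map, fn) becomes a one-argument callable."""
    return partial(func, *args, **kwargs)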


def main(location):
    """ Pull down all images referenced in a given HTML URL or file."""
    return executor(location)

if __name__ == "__main__":
    import sys
    main(sys.argv[1])

def crop(prefix):
    """Build a function that strips `prefix` from a word, or returns None."""
    def wrapped(word):
        if word.startswith(prefix):
            return word[len(prefix):]
    return wrapped

# Strip whichever known host/CDN prefix matches; otherwise return the word as-is
remove_prefix = maybe(crop('http://opeterml1297110.njgroup.com:7000'),
                      maybe(crop('http://cdn.theatlantic.com/assets/'),
                            maybe(crop('https://cdn.theatlantic.com/assets/'),
                                  lambda word: word)))
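
# Illustrative sketch (not part of the original module): `maybe` is imported
# from elsewhere.  The nested calls above suggest it builds a function that
# tries one callable and falls back to another when the first returns None:
def maybe_sketch(func, default):
    """Return a callable that tries func(word), falling back to default(word)."""
    def wrapped(word):
        result = func(word)
        return result if result is not None else default(word)
    return wrapped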


# Read data input, with no processing
data_read = (Pipe() >> branch(get_html, read_page) >> cache)


# Composite work-horse functions
# Retrieve src-like properties from <img> tags
get_img_srcs = (
    Pipe()  # :: Location
    >> get_html  # :: ElementTree
    >> img_tags  # :: List[Element]
    >> partial(map, get_src)  # :: List[Optional[Path]]
)
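
# Illustrative sketch (not part of the original module): img_tags and get_src
# come from elsewhere in the project.  Given the ElementTree type comments
# above, minimal stand-ins could be:
def img_tags_sketch(tree):
    """Yield every <img> element in a parsed ElementTree."""
    return tree.iter('img')


def get_src_sketch(element):
    """Return the element's 'src' attribute, or None when it is absent."""
    return element.get('src')
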
# Retrieve URL-paths from CSS 'background-image:' properties
get_css_srcs = (
    Pipe()  # :: Location
    >> path_to_url  # :: URLString
    >> read_page  # :: HTMLString
    >> BACKGROUND_IMAGE_REGEX.findall  # :: List[str]
)
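
# Illustrative sketch (not part of the original module): BACKGROUND_IMAGE_REGEX
# is imported from elsewhere in the project.  One plausible pattern for pulling
# url(...) values out of 'background-image:' declarations would be:
import re

BACKGROUND_IMAGE_REGEX_SKETCH = re.compile(
    r"""background-image:\s*url\(['"]?([^'")]+)['"]?\)""")
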
# Format relative paths for sync-media
parse_srcs = (
    Chainable()  # :: List[Optional[Path]]
    >> partial(filter, is_not_none)  # :: List[Path]
    >> partial(filter, is_media_path)
    >> partial(map, tryit(split_path_parts))  # :: List[Optional[PathParts]]
    >> partial(filter, is_not_none)  # :: List[PathParts]
    >> partial(map, getitem(1))  # :: List[Path]
)
# Combine get_img_srcs with get_css_srcs, and parse resultant paths
fetch_full_paths = (
    Chainable()  # :: Location
    >> cache(branch(get_img_srcs, get_css_srcs))  # :: (List[Path], List[Path])
    >> combine  # :: List[Path]
    >> unique  # :: List[Path]
)
fetch_paths = fetch_full_paths >> parse_srcs  # :: List[Path]
executor = fetch_paths >> partial(map, sync_media)  # Location -> Side Effects! impure! impure!

def main(location):
    """ Pull down all images referenced in a given HTML URL or file."""
    return executor(location)

if __name__ == "__main__":
    import sys
    main(sys.argv[1])
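

# Illustrative sketches (not part of the original module): none of the small
# combinators used above are defined in this snippet.  The shapes below are
# guesses based purely on how they are called; the project's real helpers, as
# well as cache, split_path_parts and sync_media, may well differ.
def is_not_none_sketch(value):
    return value is not None


def branch_sketch(*funcs):
    """Apply several functions to one input, returning a tuple of their results."""
    def branched(value):
        return tuple(func(value) for func in funcs)
    return branched


def combine_sketch(sequences):
    """Flatten a tuple/list of sequences into a single list."""
    return [item for sequence in sequences for item in sequence]


def unique_sketch(items):
    """Drop duplicates while preserving first-seen order."""
    seen = set()
    return [item for item in items if not (item in seen or seen.add(item))]


def getitem_sketch(index):
    """Return an indexer: getitem(1)(parts) == parts[1]."""
    def getter(indexable):
        return indexable[index]
    return getter


def tryit_sketch(func):
    """Wrap func so any exception becomes None (letting filters discard it)."""
    def safely(*args, **kwargs):
        try:
            return func(*args, **kwargs)
        except Exception:
            return None
    return safely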