コード例 #1
0
def genRunner(url):
    '''Run the extraction pipeline for ``url`` and return its content.

    The page is obtained from ``makePage(url)`` and parsed with
    ``makeSoup(page, url, 'html5lib')``. The result of
    ``getAllContent(soup)`` is passed through ``postprocContent`` and
    returned.

    Returns None when ``makePage`` yields a falsy value or when the parsed
    document has no ``head`` or no ``body``. (Previously these paths hit an
    UnboundLocalError because the content variable was never assigned.)
    '''
    page = makePage(url)
    if not page:
        # Nothing was fetched, so there is nothing to parse or post-process.
        return None

    soup = makeSoup(page, url, 'html5lib')
    if not soup.body or not soup.head:
        print('no body or no head found...')
        return None

    return postprocContent(getAllContent(soup))
コード例 #2
0
def genRunner(url):
    '''Run the extraction pipeline for ``url``.

    The page is obtained from ``makePage(url)`` and parsed with
    ``makeSoup(page, url, 'html5lib')``; ``getAllContent(soup)`` supplies a
    ``(meta_info, CONTENT)`` pair.

    Returns a 2-tuple ``(meta_info, CONTENT)`` where CONTENT has been passed
    through ``postprocContent``. Every failure path returns ``(None, None)``
    so callers can always unpack the result: a falsy page, a document
    without ``head`` or ``body``, or a falsy ``meta_info``/``CONTENT``.
    '''
    # Why am I not using soup stuff??
    page = makePage(url)
    if not page:
        return None, None

    soup = makeSoup(page, url, 'html5lib')
    if not soup.body or not soup.head:
        print('no body or no head found...')
        return None, None

    meta_info, CONTENT = getAllContent(soup)
    if not (meta_info and CONTENT):
        # Both pieces are required; a partial result is treated as failure.
        return None, None

    return meta_info, postprocContent(CONTENT)
コード例 #3
0
import sys, os
import prepwork
import alltheMeta

def mergeOthDict(mod, o):
    '''Merge the new "other" dict ``o`` into the running "other" dict ``mod``.

    For a key present in both dicts the stored value becomes
    ``mod[key].union(o[key])``, so existing values must provide ``.union``
    (e.g. sets). A key only present in ``o`` is stored in ``mod`` as the
    very same value object from ``o``.

    ``mod`` is updated in place and also returned.
    '''
    for key, values in o.items():
        # ``in`` replaces dict.has_key(), which no longer exists in Python 3.
        if key in mod:
            mod[key] = mod[key].union(values)
        else:
            mod[key] = values
    return mod

if __name__ == "__main__":
    # Usage: <script> URL_FILE
    # URL_FILE is read as text and split on newlines; blank lines are
    # skipped and surrounding whitespace is stripped from each URL.
    if len(sys.argv) < 2:
        # Fail with a readable message instead of a bare IndexError.
        sys.exit('usage: %s URL_FILE' % sys.argv[0])

    url_file = sys.argv[1]
    mod = dict()
    if os.path.isfile(url_file):
        with open(url_file, 'r') as f1:
            urls = f1.read()
        urls = [url.strip() for url in urls.split('\n') if url.strip()]
        for url in urls:
            page = prepwork.makePage(url)
            soup = prepwork.makeSoup(page, url)
            # Only the second item of the returned pair is accumulated.
            _, o = alltheMeta.getAllTheMeta(soup)
            mod = mergeOthDict(mod, o)
    else:
        # Previously a missing file was ignored without any feedback.
        sys.stderr.write('url file not found: %s\n' % url_file)

    # Dump every accumulated key followed by its merged value.
    for key in mod:
        print(key)
        print(mod[key])
        print('\n')