Example #1
0
def expand(indir, output_file):
    """Write each seed URL plus its same-site outlinks to output_file, one per line.

    Reads every ``*.json`` file in *indir*; each line of those files must be a
    JSON object with ``'url'`` and ``'html'`` fields. Outlinks are deduplicated
    globally across all seed pages.

    Args:
        indir: directory containing JSON-lines files of crawled pages.
        output_file: path of the text file to write (one URL per line).
    """
    uniq_links = set()  # many seed urls come from the same site, so there exists duplicated outlinks from seed urls
    # with-block guarantees the output handle is closed even on error
    # (the original opened it and never closed it).
    with open(output_file, "w") as out:
        for f in os.listdir(indir):
            if f.split(".")[-1] != "json":
                # make sure this is a json file
                continue
            filename = indir + "/" + f
            with open(filename) as lines:
                for line in lines:
                    try:
                        data = json.loads(line)
                        url = URLUtility.normalize(data['url'])
                        html_content = data['html']
                        #links = HTMLParser.extract_links(url, html_content)
                        links = HTMLParser.extract_links_bs(url, html_content)
                        for link in links:
                            # keep only same-site outlinks, deduplicated globally
                            if URLUtility.is_same_site(url, link) and link not in uniq_links:
                                uniq_links.add(link)
                                out.write(link.encode('utf-8') + "\n")
                        # also emit the seed itself unless it already appeared as an outlink
                        if url not in links:
                            out.write(url.encode('utf-8') + "\n")
                    except Exception:
                        # narrowed from a bare except, which would also swallow
                        # KeyboardInterrupt/SystemExit; bad records are skipped best-effort
                        traceback.print_exc()
                        continue
def expand(indir, output_file):
    """Write each seed URL plus its same-site outlinks to output_file, one per line.

    Variant that reads the page body from the ``'text'`` field (not ``'html'``)
    and processes every file in *indir* without filtering on extension.

    NOTE(review): this redefines the expand() above with the same name — only
    this definition is visible to importers; confirm the shadowing is intended.

    Args:
        indir: directory containing JSON-lines files of crawled pages.
        output_file: path of the text file to write (one URL per line).
    """
    uniq_links = set()  # many seed urls come from the same site, so there exists duplicated outlinks from seed urls
    # with-block ensures the handle is closed even if json.loads or a missing
    # key raises mid-loop (the original only reached out.close() on success).
    with open(output_file, "w") as out:
        for f in os.listdir(indir):
            filename = indir + "/" + f
            with open(filename) as lines:
                for line in lines:
                    data = json.loads(line)
                    url = URLUtility.normalize(data['url'])
                    html_content = data['text']
                    #links = HTMLParser.extract_links(url, html_content)
                    links = HTMLParser.extract_links_bs(url, html_content)
                    for link in links:
                        # keep only same-site outlinks, deduplicated globally
                        if URLUtility.is_same_site(url, link) and link not in uniq_links:
                            uniq_links.add(link)
                            out.write(link.encode('utf-8') + "\n")
                    # also emit the seed itself unless it already appeared as an outlink
                    if url not in links:
                        out.write(url.encode('utf-8') + "\n")