Example #1
def crawl(in_file, html_dir, status_dir, agent):
    urls = set()
    url_objects = []
    with open(in_file) as lines:
        for line in lines:
            values = line.strip("\n").split("\t")
            url_object = {"url_meta": {}}
            if len(values) == 4:
                url_object["url_meta"]["topic"] = values[0]
                url_object["url_meta"]["site"] = values[1]
                url = URLUtility.normalize(values[2])
                url_object["url"] = url
                url_object["url_meta"]["subtopic"] = values[3]

            else:
                url = URLUtility.normalize(values[0])
                url_object["url"] = url
            if url not in urls:
                urls.add(url)
                url_objects.append(url_object)
    jobs = []
    for i in range(Config.PROCESS_NUMBER):
        p = Process(target=crawlprocess,
                    args=(url_objects, i, html_dir, status_dir, agent))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()
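
Every example on this page funnels URLs through URLUtility.normalize before deduplicating or storing them. That utility is not part of the listings; the sketch below is only a guess at its behavior, assuming it lowercases the scheme and host, drops the fragment, and strips a trailing slash. The project's actual URLUtility may differ.

# Hypothetical sketch of URLUtility.normalize (the real implementation is not
# shown in these examples and may behave differently).
import urlparse

def normalize(url):
    """Return a canonical form of url, or None if it cannot be parsed."""
    if not url:
        return None
    url = url.strip()
    if "://" not in url:
        # Assume scheme-less inputs such as "example.com/page" are http.
        url = "http://" + url
    try:
        parts = urlparse.urlsplit(url)
    except ValueError:
        return None
    if not parts.netloc:
        return None
    # Lowercase scheme and host, drop the fragment, strip a trailing slash.
    path = parts.path.rstrip("/")
    return urlparse.urlunsplit(
        (parts.scheme.lower(), parts.netloc.lower(), path, parts.query, ""))

Dropping the fragment and trailing slash keeps variants such as http://example.com/page/ and http://example.com/page#top from being counted as distinct seeds.
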
Example #3
def expand(indir, output_file):
    files = os.listdir(indir)
    uniq_links = set()  # many seed urls come from the same site, so their outlinks overlap
    out = open(output_file, "w")
    for f in files:
        if f.split(".")[-1] != "json":
            #make sure this is json file
            continue
        filename = indir + "/" + f
        with open(filename) as lines:
            for line in lines:
                try:
                    data = json.loads(line)
                    url = data['url']
                    url = URLUtility.normalize(url)
                    html_content = data['html'] 
                    #links = HTMLParser.extract_links(url, html_content)
                    links = HTMLParser.extract_links_bs(url, html_content)
                    for link in links:
                        if URLUtility.is_same_site(url, link):
                            if link not in uniq_links:
                                uniq_links.add(link)
                                out.write(link.encode('utf-8') + "\n")
                    if url not in links:
                        out.write(url.encode('utf-8') + "\n")
                except:
                    traceback.print_exc()
                    continue
    out.close()
def extract_links_bs(url, html):
    '''
    Extract links from html source using beautiful soup
    Args:
        - url: url of the html source, used to construct absolute url from relative url
        - html: html source
    Returns:
        - links: extracted links
    '''
    from bs4 import BeautifulSoup
    soup = BeautifulSoup(html, 'html.parser')
    links = set()
    for tag in soup.findAll('a', href=True):
        link = tag['href']
        try:
            link = urlparse.urljoin(url, link)
        except:
            continue
        link = URLUtility.validate_link(link)
        if link:
            link = URLUtility.normalize(link)
            if link:
                links.add(link)
    return list(links)
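
extract_links_bs defers to URLUtility.validate_link before normalizing each candidate. That helper is also not shown; a plausible sketch, assuming it merely filters out non-HTTP schemes and obvious non-HTML resources, is:

# Hypothetical sketch of URLUtility.validate_link: keep only http(s) links that
# do not point at common binary or media resources.
import urlparse

NON_HTML_EXTENSIONS = (".jpg", ".jpeg", ".png", ".gif", ".pdf",
                       ".zip", ".exe", ".mp3", ".mp4", ".css", ".js")

def validate_link(link):
    """Return the link if it looks crawlable, otherwise None."""
    parts = urlparse.urlparse(link)
    if parts.scheme not in ("http", "https"):
        return None
    if parts.path.lower().endswith(NON_HTML_EXTENSIONS):
        return None
    return link
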
Example #5
    def search_backlinks(self, url, limit=5):
        """
        Return a list of urls
        Args:
            limit: maximum number of results to return
        """
        urls = []
        try:
            results = self.client.links(url,
                                        scope="page_to_page",
                                        sort="page_authority",
                                        filters=["external"],
                                        limit=limit)
            #results = self.client.links(url, scope="page_to_page", sort="spam_score", filters=["external"], limit=limit)
            #results = self.client.links(url, scope="page_to_page", sort="page_authority")

            for res in results:
                if 'uu' in res:
                    url = URLUtility.normalize(res['uu'])
                    if url:
                        urls.append(url)
                else:
                    print "Error: key does not exisit"
                    print res
        except:
            traceback.print_exc()

        return urls
    def extract_external_links(self, url, html):
        '''
        Extract external outlinks, i.e. links that point to a different website
        Returns:
            - list of unique urls
        '''
        try:
            soup = BeautifulSoup(html, 'lxml')
            links = set()
            tld = URLUtility.get_tld(url)

            for tag in soup.findAll('a', href=True):
                link = tag['href']
                values = urlparse.urlparse(link)
                if values.netloc == "" or values.netloc == tld or tld in values.netloc:
                    continue
                link = URLUtility.validate_link(link)
                if link:
                    link = URLUtility.normalize(link)
                    if link:
                        links.add(link)
            return list(links)
        except:
            traceback.print_exc()
            return []
    def extract_insite_links(self, url, html):
        '''
        Returns:
            - list of insite urls that are different from the input url
        '''
        try:
            soup = BeautifulSoup(html, 'html.parser')
            #soup = BeautifulSoup(html, 'lxml') # Couldn't parse http://www.gunsinternational.com/
            links = set()
            tld = URLUtility.get_tld(url)
            for tag in soup.findAll('a', href=True):
                link = tag['href']
                try:
                    link = urlparse.urljoin(url, link)
                except:
                    traceback.print_exc()
                    continue
                values = urlparse.urlparse(link)
                if tld in values.netloc:
                    link = URLUtility.validate_link(link)
                    if link:
                        link = URLUtility.normalize(link)
                        if link and link != url:
                            links.add(link)
            return list(links)
        except:
            print "Parsing with BeautifulSoup failed"
            return []
    def extract_links_bs(self, url, html):
        '''
        Extract all outlinks from html using beautiful soup. Return list of links

        Args:
            - url: url of the html source, used to construct absolute url from relative url
            - html: html source
        Returns:
            - links: list of outlinks
       
        '''
        try:
            soup = BeautifulSoup(html, 'lxml')
        except:
            print "Parsing with beautiful soup failed"
            return []
        links = set()
        for tag in soup.findAll('a', href=True):
            link = tag['href']
            try:
                link = urlparse.urljoin(url, link)
            except:
                continue
            link = URLUtility.validate_link(link)
            if link:
                link = URLUtility.normalize(link)
                if link:
                    links.add(link)
        return list(links)
    def search(self, keyword, k):
        """
        Search for a keyword and return top matched urls
        Reference: https://developers.google.com/custom-search/json-api/v1/reference/cse/list

        Args:
            k: Number of search results to return. 
        """
        k = min(k, self.max_results) 
        urls = []
        index = 1
        while index <= k:
            try:
                # Each query returns at most 10 results
                res = self.service.cse().list(
                    q=keyword, cx=self.cse_id, num=10, start=index).execute()
                if 'items' in res:
                    res = res['items']
                    for item in res:
                        url = URLUtility.normalize(item['link'])
                        if url:
                            urls.append(url)
                    if len(res) < 10:
                        # Early stop paging
                        break
                else:
                    print res
                    break # No more results, stop paging
            except:
                traceback.print_exc()
                break

            index += 10

        return urls
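
This search method assumes self.service is a Google Custom Search client and self.cse_id is a search engine ID. One common way to construct that client with the google-api-python-client package is shown below; the key and engine ID are placeholders, and the surrounding class may build the service differently.

# Sketch of the setup assumed by search(); api_key and cse_id are placeholders.
from googleapiclient.discovery import build

api_key = "YOUR_API_KEY"
cse_id = "YOUR_ENGINE_ID"

service = build("customsearch", "v1", developerKey=api_key)
res = service.cse().list(q="example query", cx=cse_id, num=10, start=1).execute()
for item in res.get("items", []):
    print item["link"]
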
Example #10
def _read_ac_result_file(result_file):
    urls = []
    with open(result_file) as lines:
        for line in lines:
            url = line.split()[0]
            url = URLUtility.normalize(url)
            urls.append(url)
    return urls
def run(infile, outdir):
    urls = set([])
    with open(infile) as lines:
        for line in lines:
            url = line.strip().split("\t")[0]
            url = URLUtility.normalize(url)
            urls.add(url)
    urls = list(urls)
    Download.download(urls, outdir)
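
run hands the deduplicated URL list to Download.download, which these listings do not include. A rough sketch of such a helper, assuming it fetches each page over HTTP and stores the body in outdir under a filename derived from the URL, might look like this:

# Hypothetical sketch of Download.download; it only illustrates the expected
# contract (fetch each URL, write the body into outdir).
import os
import hashlib
import urllib2
import traceback

def download(urls, outdir):
    if not os.path.exists(outdir):
        os.makedirs(outdir)
    for url in urls:
        try:
            body = urllib2.urlopen(url, timeout=10).read()
            name = hashlib.md5(url).hexdigest() + ".html"
            with open(os.path.join(outdir, name), "wb") as out:
                out.write(body)
        except Exception:
            traceback.print_exc()
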
Example #12
def _read_sf_result_file(result_file):
    urls = []
    with open(result_file) as lines:
        for line in lines:
            values = line.strip().split(', ')
            url = values[-1]
            url = URLUtility.normalize(url)
            urls.append(url)
    return urls
Example #13
def deduplicate(outfile, indirs):
    writer = open(outfile, 'w')
    cached_urls = set()
    for indir in indirs:
        for fname in os.listdir(indir):
            print "Reading", indir+'/'+fname
            for line in open(indir+'/'+fname):
                data = json.loads(line)
                url = URLUtility.normalize(data['url'])
                if url in cached_urls: 
                    continue
                cached_urls.add(url)
                writer.write(line)
    writer.close()
Example #14
def _read_urls_from_json(url_file):
    urls = set()
    with open(url_file) as lines:
        for line in lines:
            try:
                jsonobj = json.loads(line)
                for url in jsonobj[1:]:
                    url = URLUtility.normalize(url)
                    urls.add(url)
            except:
                traceback.print_exc()

    print "Number of urls read from json file: ", len(urls)
    return list(urls)
Example #15
def _read_ac_result_file(result_file, max_pages):
    """
    Load all sites from the result file of ACHE 
    """
    count = 0
    sites = set()
    with open(result_file) as lines:
        for line in lines:
            count += 1
            url = line.split()[0]
            url = URLUtility.normalize(url)
            site = URLUtility.get_host(url)
            sites.add(site)
            if count == max_pages:
                break
    return sites
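
Several of the result readers reduce a URL to its site with URLUtility.get_host. Assuming "host" simply means the URL's network location, a minimal stand-in is:

# Sketch of URLUtility.get_host (assumption: the "site" of a URL is its
# lowercased network location).
import urlparse

def get_host(url):
    return urlparse.urlparse(url).netloc.lower()

print get_host("http://www.Example.com/forum/thread?id=1")  # www.example.com
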
Example #16
    def search(self, query_term, count=10):
        """
        Reference: https://docs.microsoft.com/en-us/rest/api/cognitiveservices/bing-web-api-v5-reference#query-parameters
        Args:
            count: The number of search results to return. If count is greater than 50, the results are fetched in pages of at most 50, since each query returns a maximum of 50 results.
        """
        if self.cache and self.cache.contains(query_term):
            urls = self.cache.get(query_term)
            return [url for url in urls if self.is_valid(url)]
        urls = []
        offset = 0

        while count > 0:
            params = urllib.urlencode({
                # Request parameters
                'q': query_term,
                'count': str(min(count, 50)),
                'offset': str(offset),
                'mkt': 'en-us',
                'safesearch': 'Moderate'})

            try:
                conn = httplib.HTTPSConnection('api.cognitive.microsoft.com')
                #conn.request("GET", "/bing/v5.0/search?%s" % params, "{body}", headers)
                conn.request("GET", "/bing/v7.0/search?%s" % params, "{body}", self.headers)
                response = conn.getresponse()
                data = response.read()
                obj = json.loads(data)
                if 'webPages' in obj:
                    webPages = obj['webPages']
                    values = webPages['value']
                    for value in values:
                        if self.is_valid(value['url']):
                            url = URLUtility.normalize(value['url'])
                            if url:
                                urls.append(url)
                conn.close()
            except:
                traceback.print_exc()

            count -= 50
            offset += 50  # advance past the results already fetched (offset is zero-based)

        if self.cache:
            self.cache.add(query_term, urls)
        return urls
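
The loop above pages through the Bing results in batches of at most 50, decrementing count and advancing offset by the batch size after each request. The paging arithmetic in isolation, as a standalone sketch rather than part of the original class, looks like this:

# Standalone illustration of the paging plan used above: each request asks for
# at most 50 results and skips the ones already fetched.
def page_plan(total):
    """Yield (count, offset) pairs for a query that wants `total` results."""
    offset = 0
    while total > 0:
        yield min(total, 50), offset
        offset += 50
        total -= 50

print list(page_plan(120))  # [(50, 0), (50, 50), (20, 100)]
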
Example #17
def _read_sf_result_file(result_file, max_pages):
    """
    Load all sites from the result file of SEEDFINDER 
    """
    sites = set()
    count = 0
    with open(result_file) as lines:
        for line in lines:
            count += 1
            values = line.strip().split(', ')
            url = values[-1]
            url = URLUtility.normalize(url)
            site = URLUtility.get_host(url)
            sites.add(site)
            if count == max_pages:
                break
    return sites
def extract_links(url, html):
    '''
    Extract links from html source using regular expression
    Args:
        - url: url of the html source, used to construct absolute url from relative url
        - html: html source
    Returns:
        - links: extracted (normalized and validated) links
    '''
    match = HTMLParser.LINK_PATTERN.findall(html)
    links = set([])
    for link in match:
        link = urlparse.urljoin(url, link)
        link = URLUtility.validate_link(link)
        if link:
            link = URLUtility.normalize(link)
            if link:
                links.add(link)
    return list(links)
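
extract_links relies on HTMLParser.LINK_PATTERN, a precompiled regular expression that the listings never define. A plausible definition, offered purely as an assumption rather than the project's actual pattern, captures href attribute values:

# Hypothetical definition of HTMLParser.LINK_PATTERN: capture href attribute
# values written with either quoting style.
import re

LINK_PATTERN = re.compile(r'href\s*=\s*["\']([^"\']+)["\']', re.IGNORECASE)

sample = '<a href="/about">About</a> <A HREF="http://example.com/page">x</A>'
print LINK_PATTERN.findall(sample)  # ['/about', 'http://example.com/page']
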
def custom_recrawl(status_file, supplement_status_file, html_dir, status_dir,
                   agent):
    # Read status_file, then recrawl the URLs from supplement_status_file that were not downloaded successfully

    downloaded_urls = set()
    with open(status_file) as lines:
        for line in lines:
            try:
                json_data = json.loads(line)
                if "exception" in json_data:
                    continue
                downloaded_urls.add(json_data['url'])
            except:
                print "recrawl exception"
                traceback.print_exc()
                continue

    url_objects = []
    with open(supplement_status_file) as lines:
        for line in lines:
            try:
                values = line.strip("\n").split("\t")
                url = URLUtility.normalize(values[2])
                if url not in downloaded_urls:
                    url_object = {"url_meta": {}}
                    url_object["url_meta"]["topic"] = values[0]
                    url_object["url_meta"]["site"] = values[1]
                    url_object["url_meta"]["subtopic"] = values[3]
                    url_object["url"] = url
                    url_objects.append(url_object)
            except:
                print "custom recrawl exception"
                traceback.print_exc()
                continue
    print "Number of urls to download: " + str(len(url_objects))
    jobs = []
    for i in range(Config.PROCESS_NUMBER):
        p = Process(target=crawlprocess,
                    args=(url_objects, i, html_dir, status_dir, agent))
        jobs.append(p)
        p.start()
    for p in jobs:
        p.join()
Example #21
def _read_relev_file(clf_file):
    """
    Load all sites from the classification file
    Note that all classification files from different discovery tools have the same format
    """
    sites = set()
    with open(clf_file) as lines:
        for line in lines:
            try:
                values = line.strip().split(",")
                url = ''.join(values[:-1])
                label = int(values[-1])
                url = URLUtility.normalize(url)
                site = URLUtility.get_host(url)
                if label != -1 and label != 1:
                    print "Parsed label is incorrect"
                if label == 1:
                    sites.add(site)
            except:
                traceback.print_exc()
    return sites
Example #22
def _read_clf_file(clf_file):
    url2label = {}
    site2label = {}

    with open(clf_file) as lines:
        for line in lines:
            try:
                values = line.strip().split(",")
                url = ''.join(values[:-1])
                label = int(values[-1])
                url = URLUtility.normalize(url)
                site = URLUtility.get_host(url)

                if label > 0:
                    url2label[url] = True
                    site2label[site] = True
                else:
                    url2label[url] = False
                    if site not in site2label:
                        site2label[site] = False
            except:
                print line
    return url2label, site2label
def expand(indir, output_file):
    files = os.listdir(indir)
    uniq_links = set()  # many seed urls come from the same site, so their outlinks overlap
    out = open(output_file, "w")
    for f in files:
        filename = indir + "/" + f
        with open(filename) as lines:
            for line in lines:
                data = json.loads(line)
                url = data['url']
                url = URLUtility.normalize(url)
                html_content = data['text']
                #links = HTMLParser.extract_links(url, html_content)
                links = HTMLParser.extract_links_bs(url, html_content)
                for link in links:
                    if URLUtility.is_same_site(url, link):
                        if link not in uniq_links:
                            uniq_links.add(link)
                            out.write(link.encode('utf-8') + "\n")
                if url not in links:
                    out.write(url.encode('utf-8') + "\n")

    out.close()
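
Both expand variants keep an outlink only when URLUtility.is_same_site says it belongs to the seed's website. Assuming "same site" means "same host, ignoring a leading www." (the other helpers compare hosts and top-level domains), a simple stand-in is:

# Sketch of URLUtility.is_same_site (assumption: two URLs are on the same site
# when their lowercased hosts match after stripping a leading "www.").
import urlparse

def _host(url):
    host = urlparse.urlparse(url).netloc.lower()
    return host[4:] if host.startswith("www.") else host

def is_same_site(url1, url2):
    return _host(url1) != "" and _host(url1) == _host(url2)

print is_same_site("http://www.example.com/a", "http://example.com/b")  # True
print is_same_site("http://example.com/a", "http://other.org/b")        # False
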