def export_graph_data():
    db = db_utils.getDBInstance()
    nodes = set()

    # Edges file: one "Source<TAB>Target" row per incoming link of every page.
    with open(EXPORT_EDGES_FILE, "w") as data_file:
        data_file.write("Source\tTarget\n")
        for page in db.webpages.find():
            for link in page["incoming_links"]:
                # Skip javascript: pseudo-URLs on either side of the edge.
                if link.startswith("javascript"):
                    continue
                if page["url"].startswith("javascript"):
                    continue
                # Strip commas so each value stays a single TSV field.
                link = link.replace(",", "")
                page["url"] = page["url"].replace(",", "")
                nodes.add(link)
                nodes.add(page["url"])
                data_file.write("%s\t%s\n" % (link, page["url"]))

    # Nodes file: every URL seen above, labelled with its domain.
    with open(EXPORT_NODES_FILE, "w") as nodes_file:
        nodes_file.write("Id\tLabel\n")
        for node in nodes:
            nodes_file.write("%s\t%s\n" % (node, utils.domainOf(node)))
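
The node labels come from utils.domainOf, which is not part of this excerpt. A minimal sketch of such a helper, assuming it only needs to extract the host portion of a URL (Python 2, matching the print syntax used later):

from urlparse import urlparse

def domainOf(url):
    # Hypothetical stand-in for utils.domainOf: return the host of a URL,
    # falling back to the raw string for scheme-less inputs.
    return urlparse(url).netloc or url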
Example #2
import re

def regex_domain_match(db, url):
    # Load the stored list of domain regexes (kept in a single document).
    visited_domain_patterns = db.domain_regex_patterns.find_one({'domain_regexes': 'regex'})
    regexes = []
    if visited_domain_patterns:
        regexes = visited_domain_patterns["regexes"]

    # Any stored pattern matching this URL means its domain was seen before.
    for regex in regexes:
        if re.search(regex, url, re.I | re.M):
            return True

    # No match: reduce the URL to its bare second-level domain.
    domain = utils.domainOf(url)
    if domain.startswith("www."):
        domain = domain.replace("www.", "")
    if domain.count('.') > 1:
        parts = domain.split(".")
        domain = parts[-2] + "." + parts[-1]

    # Persist the new domain so future URLs on it match, then report a miss.
    regexes.append(domain)
    if not visited_domain_patterns:
        visited_domain_patterns = {'domain_regexes': "regex"}
    visited_domain_patterns["regexes"] = list(set(regexes))
    if "_id" in visited_domain_patterns:
        visited_domain_patterns.pop("_id")
    query = {'domain_regexes': 'regex'}
    db.domain_regex_patterns.find_and_modify(query=query,
            update=visited_domain_patterns, upsert=True, new=True)
    return False
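
A usage sketch of the call pattern, assuming the collection starts empty (db is the same pymongo handle used above; the URLs are hypothetical):

print regex_domain_match(db, "http://example.com/page1")   # False; stores "example.com"
print regex_domain_match(db, "http://www.example.com/p2")  # True on the next sighting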
Example #3
def prepareURLsForNextCrawl(urls):
    new_urls = []
    for url in urls:
        url = utils.sanitize_url(url)
        # Script and binary downloads are not crawlable pages; fall back to
        # crawling their domain root instead.
        if url.endswith(".js") or url.endswith(".exe"):
            url = utils.domainOf(url)
            url = utils.sanitize_url(url)
        new_urls.append(url)
    # Drop whitelisted URLs and de-duplicate before queueing.
    new_urls = [url for url in new_urls if not isWhiteListedURL(url)]
    return list(set(new_urls))
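
For example, assuming nothing here is whitelisted (the exact output depends on utils.sanitize_url, which is not shown):

urls = ["http://example.com/app.js", "http://example.com/", "http://example.com/"]
print prepareURLsForNextCrawl(urls)
# e.g. ["http://example.com"] -- the .js link collapses to its domain and
# the duplicates merge in the final set().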
Example #4
def persistURLsForNextCrawlInDB(db, urls):
    urls = prepareURLsForNextCrawl(urls)
    crawl_queue = db.crawl_queue
    visited_urls = db.visited_urls

    for url in urls:
        # Documents are keyed by the hashes of the domain/URL themselves.
        domain = utils.domainOf(url)
        domain_hash = utils.get_url_hash(domain)
        url_hash = utils.get_url_hash(url)

        db_query = {domain_hash: domain}
        update = {url_hash: url}
        # Queue the URL only if its domain has not been visited yet.
        url_obj = visited_urls.find_one(db_query)
        if not url_obj:
            crawl_queue.insert(update)
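
utils.get_url_hash is also outside this excerpt; a minimal sketch, assuming any stable hex digest will do as a document key:

import hashlib

def get_url_hash(url):
    # Hypothetical stand-in for utils.get_url_hash: a stable key for a URL.
    return hashlib.md5(url).hexdigest()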
Example #5
def _domain(self):
    return utils.domainOf(self.url)
Example #6
            if obj:
                # Known page: merge the new referrer into its incoming links.
                print "Updating for: ", out_url
                query = {'url_hash': url_hash}
                obj["incoming_links"].append(url)
                obj["incoming_links"] = list(set(obj["incoming_links"]))
                if "_id" in obj:
                    obj.pop("_id")
                new_obj = db.webpages.find_and_modify(query=query, update=obj,
                        new=True, upsert=True)
            else:
                # First sighting: create the page with this single referrer.
                new_webpage = WebPage(out_url)
                new_webpage.incoming_links.add(url)
                db.webpages.insert(new_webpage.json())

        print "Marking as visited"
        domain = utils.domainOf(url)
        domain_hash = utils.get_url_hash(domain)
        db_query = {domain_hash: domain}
        # Store both the domain and the URL entries when marking as visited.
        update = {domain_hash: domain, url_hash: url}
        vis = db.visited_urls.find_and_modify(query=db_query,
                update=update, upsert=True, new=True)
        print vis
        print "Updating crawl queue"
        persistURLsForNextCrawlInDB(db, all_urls)
        print "Updated"


if __name__ == '__main__':
    #crawl_url("http://sugarudyog.com/index.htm?adasd=asdas&sadgas=afs", headless=True,
    #        save_into_db=True)
    print