Beispiel #1
0
# The scraper settings are read from scrape.conf in the user's home directory.
conf_file = join(expanduser("~"), "scrape.conf")

# Every option used below lives in the [info] section of that file.
config = ConfigParser.ConfigParser()
config.read(conf_file)
salt, tab_page, domain = (
    config.get('info', option) for option in ('salt', 'tab_page', 'domain')
)
delay = config.getint('info', 'delay')

# --- Disabled manual checks, kept for reference ---------------------------
# Download a single tab page and dump its comment tree:
#tdl = TabDownloader(domain, tab_page, salt)
#test_page = "http://tabs."+domain+"/m/megadeth/symphony_of_destruction_ver6_guitar_pro.htm"
#test_page = "http://tabs."+domain+"/r/ryan_clough/love_lust_power_tab.htm"
#test_page = "http://tabs."+domain+"/l/lynyrd_skynyrd/free_bird_guitar_pro.htm"
#test_page = "http://tabs."+domain+"/j/justin_bieber/all_that_matters_crd.htm"
#tab_data =  tdl.tab_download(test_page,True)
# if tab_data:
#     for comment in tab_data.comments:
#         print "level 1 comment by " + comment.author
#         if hasattr(comment, "child_comments"):
#             print "\tlevel 2 comments: " + str(comment.child_comments)
#
# Load a single user profile:
#tu = UserDownloader(domain)
#tu.load_user("Bonsaischaap")
# ---------------------------------------------------------------------------

# Step the spider through every URL it reports; the value returned by
# next_url() is not used here.
ts = TabSpider(domain, delay)
while True:
    if not ts.has_more():
        break
    ts.next_url()
Beispiel #2
0
def main(argv=None):
    """Crawl tab pages and store what is found in the graph database.

    Settings (salt, tab_page, domain, delay) are read from the [info]
    section of ~/scrape.conf.  For every tab the spider yields, a "tab"
    vertex is created and linked to:

    * one "instrument" vertex per instrument ("has_instrument" edge),
    * the vertex returned by save_comment() for each top-level comment
      ("has_comment" edge),
    * the vertex of the user who transcribed it ("tabbed" edge, pointing
      from the user to the tab).

    Instrument and user vertices are created once and then reused through
    in-memory dictionaries.

    WARNING: the graph is cleared before the crawl starts.

    Args:
        argv: Accepted for a conventional entry-point signature; not used.

    Raises:
        ConfigParser.NoSectionError / NoOptionError: if the config file is
            missing or lacks one of the required options.
        ValueError: if the configured delay is not an integer.
    """
    # Config processing
    conf_file = join(expanduser("~"), "scrape.conf")
    settings = ConfigParser.ConfigParser()
    settings.read(conf_file)
    salt = settings.get('info', 'salt')
    tab_page = settings.get('info', 'tab_page')
    domain = settings.get('info', 'domain')
    delay = int(settings.get('info', 'delay'))

    # Start graph
    g = Graph(Config(NEO4J_URI))
    g.clear()  # Destroys all existing data -- remove when using a persistent store!

    # Local indices so each instrument / user gets exactly one vertex
    users = {}
    instruments = {}

    # Make page crawlers
    tab_loader = TabDownloader(domain, tab_page, salt, delay)
    user_loader = UserDownloader(domain, delay)

    # Unofficial iterator over tab URLs
    resource = TabSpider(domain)

    # Start crawling!
    while resource.has_more():
        tab_info = tab_loader.tab_download(resource.next_url())

        # has_more() alone is not a reliable end-of-crawl signal, so a
        # falsy download result is also treated as "nothing left".
        if not tab_info:
            break

        # Store base tab
        tab_node = g.vertices.create(name=tab_info.tab_file)
        tab_node.tab_file = tab_info.tab_file
        tab_node.title = tab_info.title
        tab_node.version = tab_info.version
        tab_node.rating = tab_info.rating
        tab_node.num_ratings = tab_info.num_ratings
        tab_node.num_comments = tab_info.num_comments
        tab_node.label = "tab"
        tab_node.save()

        # Add instruments, creating each instrument vertex on first sight
        for instrument in tab_info.instruments:
            if instrument not in instruments:
                i_node = g.vertices.create(name=instrument)
                i_node.label = "instrument"
                i_node.save()
                instruments[instrument] = i_node
            g.edges.create(tab_node, "has_instrument", instruments[instrument])

        # Add comments (save_comment is handed each top-level comment)
        if tab_info.comments:
            for comment in tab_info.comments:
                g.edges.create(tab_node, "has_comment", save_comment(g, comment))

        # Nothing more to link when the tab has no known tabber
        if not tab_info.tabber:
            continue

        # Get info on the tabber if we don't have it
        if tab_info.tabber not in users:
            tabber = user_loader.load_user(tab_info.tabber)
            if not tabber:
                # Profile could not be loaded: keep the tab, skip the link
                continue
            # Create user node for tabber; a missing name is stored as ""
            u_node = g.vertices.create(name=tabber.name or "")
            u_node.registration_date = tabber.registration_date
            u_node.num_contributions = tabber.num_contributions
            u_node.rank = tabber.rank
            u_node.save()
            users[tab_info.tabber] = u_node

        # Add tab to tabber's transcriptions
        g.edges.create(users[tab_info.tabber], "tabbed", tab_node)

    print("Finished crawl! Woah!")