import ConfigParser
from os.path import expanduser, join

# Assumed import: main() below builds the graph with Config(NEO4J_URI) and
# Graph(config), which is the Bulbs Neo4j API. TabDownloader, UserDownloader,
# TabSpider and save_comment are expected to be defined elsewhere in this module.
from bulbs.neo4jserver import Graph, Config, NEO4J_URI

conf_file = join(expanduser("~"), "scrape.conf")  # Config
config = ConfigParser.ConfigParser()
config.read(conf_file)
salt = config.get('info', 'salt')
tab_page = config.get('info', 'tab_page')
domain = config.get('info', 'domain')
delay = int(config.get('info', 'delay'))

# Leftover single-page test harness
#tdl = TabDownloader(domain, tab_page, salt)
#tu = UserDownloader(domain)
#test_page = "http://tabs."+domain+"/m/megadeth/symphony_of_destruction_ver6_guitar_pro.htm"
#test_page = "http://tabs."+domain+"/r/ryan_clough/love_lust_power_tab.htm"
#test_page = "http://tabs."+domain+"/l/lynyrd_skynyrd/free_bird_guitar_pro.htm"
#test_page = "http://tabs."+domain+"/j/justin_bieber/all_that_matters_crd.htm"
#tab_data = tdl.tab_download(test_page, True)
#tu.load_user("Bonsaischaap")

# Exercise the spider: walk every URL it can find
ts = TabSpider(domain, delay)
while ts.has_more():
    ts.next_url()

# if tab_data:
#     for comment in tab_data.comments:
#         print "level 1 comment by " + comment.author
#         if hasattr(comment, "child_comments"):
#             print "\tlevel 2 comments: " + str(comment.child_comments)
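# For reference, a scrape.conf shape the reads above would accept. The
# section and key names come from the config.get() calls; every value here
# is a made-up placeholder:
#
#   [info]
#   salt = some_site_specific_salt
#   tab_page = /tab/page/
#   domain = example.com
#   delay = 2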
def main(argv=None):
    # Config processing
    conf_file = join(expanduser("~"), "scrape.conf")
    config = ConfigParser.ConfigParser()
    config.read(conf_file)
    salt = config.get('info', 'salt')
    tab_page = config.get('info', 'tab_page')
    domain = config.get('info', 'domain')
    delay = int(config.get('info', 'delay'))

    # Start graph
    config = Config(NEO4J_URI)
    g = Graph(config)
    g.clear()  # Wipes the whole graph -- change this if you're working with a persistent data store!

    # Set up local indices so we only create one vertex per user/instrument
    users = {}
    instruments = {}

    # Make page crawlers
    tab_loader = TabDownloader(domain, tab_page, salt, delay)
    user_loader = UserDownloader(domain, delay)

    # Unofficial iterator
    resource = TabSpider(domain)

    # Start crawling!
    while resource.has_more():
        # Get tab info
        tab_info = tab_loader.tab_download(resource.next_url())

        # has_more doesn't actually work because of the convoluted logic
        # needed to keep track of tabs, so stop at the first failed download
        if not tab_info:
            break

        # Store base tab
        tab_node = g.vertices.create(name=tab_info.tab_file)
        tab_node.tab_file = tab_info.tab_file
        tab_node.title = tab_info.title
        tab_node.version = tab_info.version
        tab_node.rating = tab_info.rating
        tab_node.num_ratings = tab_info.num_ratings
        tab_node.num_comments = tab_info.num_comments
        tab_node.label = "tab"
        tab_node.save()

        # Add instruments, creating each instrument vertex only once
        for instrument in tab_info.instruments:
            if instrument not in instruments:
                i_node = g.vertices.create(name=instrument)
                i_node.label = "instrument"
                i_node.save()
                instruments[instrument] = i_node
            i_node = instruments[instrument]
            g.edges.create(tab_node, "has_instrument", i_node)

        # Add comments (recursive)
        if tab_info.comments:
            for comment in tab_info.comments:
                g.edges.create(tab_node, "has_comment", save_comment(g, comment))

        # Get info on the tabber if we don't have it yet
        if tab_info.tabber:
            if tab_info.tabber not in users:
                tabber = user_loader.load_user(tab_info.tabber)
                if not tabber:
                    continue

                # Create user node for tabber
                tempname = tabber.name
                if not tempname:
                    tempname = ""
                u_node = g.vertices.create(name=tempname)
                u_node.registration_date = tabber.registration_date
                u_node.num_contributions = tabber.num_contributions
                u_node.rank = tabber.rank
                u_node.save()
                users[tab_info.tabber] = u_node

            # Add tab to tabber's transcriptions
            tabber = users[tab_info.tabber]
            g.edges.create(tabber, "tabbed", tab_node)

    print "Finished crawl! Woah!"
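# Minimal sketch of the save_comment helper the loop above relies on, in case
# it is not already defined elsewhere in this module. It assumes comment
# objects carry an author and an optional child_comments list (as the
# commented-out debug code at the top suggests); the "comment" label and
# "has_child" edge name are made up for illustration.
def save_comment(g, comment):
    # Create a vertex for this comment
    c_node = g.vertices.create(name=comment.author)
    c_node.author = comment.author
    c_node.label = "comment"
    c_node.save()
    # Recurse into replies, linking each child comment back to its parent
    if hasattr(comment, "child_comments"):
        for child in comment.child_comments:
            g.edges.create(c_node, "has_child", save_comment(g, child))
    return c_node

# Assumed entry point -- the original never shows how main() is invoked,
# so this guard is a guess at the intended usage.
if __name__ == "__main__":
    main()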