Example #1
0
##  main
###############################################################
if __name__ == "__main__":
    # Bolean declarations
    wantBasePaths = True
    
    # Get list of games that start with 100 sssp files from /data/zliu8/sssp 
    #starting_path = '/data/zliu8/sssp/'
    outputPath    = '/home/saguinag/CategoryPaths/ssspGamesDatFiles/'
    
    # List of files
    #src_pg_filenames = getFilenames(starting_path)
    print "sys.argv", sys.argv
    
    filename = sys.argv[1]
    
    print 'Processing file: ',filename
    pattern = re.compile(r'([0-9]+).txt')
    src_pg_id = pattern.search(filename)
    pgID = src_pg_id.group(1)
				
    for row in gamesWithSourceNode(pgID):
        print row
    
		## Save the new dataframe to disk
#		motherload.to_csv(outputFile, sep=',',mode='w',encoding='utf-8',index=True)

    print 'Done'


Example #2
0
		for filename in src_pg_files:
			pattern = re.compile(r'([0-9]+).txt')
			src_pg_id = pattern.search(filename)
			print  "src_pg_id.group(1)",src_pg_id.group(1)
			if src_pg_id.group(1) not in st_pg_id_nodes_proc_lst:
				print 'Processing file: ',filename
				# Grab shortest path data
				print 'Fetching shortest path data'
				shortest_path_data = Path_Data(starting_path+filename)
				sp_df = pd.DataFrame.from_dict(shortest_path_data.path_data.items())
				sp_df.columns=['key','sp']
				#print sp_df.head()

				# Show games that start with this page id
				print 'Fetching games per starting node'
				for games_plus in  gamesWithSourceNode(src_pg_id.group(1)):
					if len(games_plus) >= 3:
						path_chain_for_game_dict["%ld_to_%ld" % (games_plus[0],games_plus[2])] = games_plus[1]
					else:
						print 'games_plus is not at least 3'
				
				print 'games with starting node: ',len(path_chain_for_game_dict)
			
				gm_df = pd.DataFrame.from_dict(path_chain_for_game_dict.items())
				gm_df.columns=['key','game']
				#print gm_df.head()
			
				# Merge data frames on key
				data = pd.DataFrame()
				data = pd.merge(gm_df, sp_df, on='key')
				#print data.describe()
#               csv.writer(f).writerow(row) #f.write(row)
#           f.close()
#           print 'Done writing results to file io'
#    print 'Done.'

###############################################################################
    # Take the source node id from args.filename (the text between 'sssp_'
    # and '.txt'), list the games returned for that node, and write one row
    # to an output file for every end page whose score lookup returns None.
    # NOTE(review): re.search returns None when the file name does not match,
    # in which case the result.group(1) calls below raise AttributeError.
    result = re.search('sssp_(.*).txt', args.filename)

    print 'Script arguments:'
    print '\t'+args.filename
    
    print '\nSource node (wikipedia.game.page_id): '+result.group(1)

    print 'Games with source node (%s): ' % result.group(1)
    
    # The meaning of the second argument (-1) is defined by
    # gamesWithSourceNode, which is not visible here -- TODO confirm.
    results = gamesWithSourceNode(result.group(1),-1)
    data=  np.array(results) 
    # Label the three columns of the result array.
    df = pd.DataFrame(data, columns=['src_pg','game','end_pg'])
    print df.head()

    ### Find the shortest path score to end_page
    # NOTE(review): src_pg is assigned but not used in the visible lines.
    src_pg = result.group(1)
    # Output file name: outputFiles/<source id>_sp_score_<today as %d%b%y>.txt
    output_file='outputFiles/'+result.group(1)+'_sp_score_'+datetime.date.today().strftime("%d%b%y")+'.txt'
    # Write out to file.
    # NOTE(review): no close() for this handle is visible in this fragment.
    f = open(output_file,'w')
    for end_pg in df['end_pg']:
        end_pg_score = find_sssp_score(end_pg,args.filename)
        if end_pg_score is None:
            # No score found: emit "<source id>\t<end page>\t0".
            # The tab-joined string is passed as a one-element list, so the
            # csv writer emits it as a single field.
            row_2_print = result.group(1)+'\t'+end_pg+'\t0'
            csv.writer(f).writerow([row_2_print])
    args = parser.parse_args()
    
    #filenames = re.search('sssp_(.*).txt', getFilenames(args.dirPath))
    srcNodes = list()
    results = []
    filenames   = getFilenames(args.dirPath)

    ## extract source nodes 
    for fn in filenames:
        srcNodes.append(re.search('sssp_(.*).txt',fn).group(1))

    ## get games
    for srcnode in srcNodes: 
        if not(srcnode.isdigit()):
            continue
        results = gamesWithSourceNode(srcnode,-1)
        data=  np.array(results) 
        df = pd.DataFrame(data, columns=['src_pg','game','end_pg'])
        #print df.head()
        outFname='/home/saguinag/CategoryPaths/gamesDatafiles/'+srcnode+\
                    '_wpgame_games.dat'
        df.to_csv(outFname, sep=',',mode='w',encoding='utf-8',index=False)
        df = None 
    print 'Done.'
            
"""
    ## Find the shortest path score to end_page
    src_pg = filenames.group(1)
    # output filename 
    
    print output_file