## main ###############################################################
def pageIdFromFilename(filename):
    """Return the page id embedded in *filename*, or None if there is none.

    The page id is the run of digits immediately followed by a literal
    ``.txt`` (for example ``sssp_12345.txt`` gives ``'12345'``).
    """
    # The dot is escaped on purpose: an unescaped "." accepts any character,
    # so a name such as "v22txt" would wrongly yield the page id "2".
    match = re.search(r'([0-9]+)\.txt', filename)
    if match is None:
        return None
    return match.group(1)


if __name__ == "__main__":
    # Boolean declarations
    wantBasePaths = True

    # Get list of games that start with 100 sssp files from /data/zliu8/sssp
    #starting_path = '/data/zliu8/sssp/'
    outputPath = '/home/saguinag/CategoryPaths/ssspGamesDatFiles/'

    # List of files
    #src_pg_filenames = getFilenames(starting_path)

    # Every print below receives one pre-joined string, which produces the
    # same text as the original multi-item Python 2 print statements.
    print(' '.join(("sys.argv", str(sys.argv))))

    # Fail with a readable message instead of an IndexError when the
    # input file argument is missing.
    if len(sys.argv) < 2:
        sys.exit('usage: %s <sssp file ending in NNN.txt>' % sys.argv[0])
    filename = sys.argv[1]
    print(' '.join(('Processing file: ', filename)))

    # Fail with a readable message instead of an AttributeError when the
    # filename does not carry a page id.
    pgID = pageIdFromFilename(filename)
    if pgID is None:
        sys.exit('No page id (digits followed by .txt) found in: ' + filename)

    # Print every game row returned for this source page id.
    for row in gamesWithSourceNode(pgID):
        print(row)

    ## Save the new dataframe to disk
    # motherload.to_csv(outputFile, sep=',',mode='w',encoding='utf-8',index=True)
    print('Done')
# For every source-page file not yet processed: load its shortest-path data,
# collect the games that start at that page, and merge the two on 'key'.
#
# The pattern is loop-invariant, so it is compiled once.  The dot is escaped
# so that only a literal ".txt" extension anchors the page id digits.
pattern = re.compile(r'([0-9]+)\.txt')
for filename in src_pg_files:
    src_pg_id = pattern.search(filename)
    if src_pg_id is None:
        # A name without "<digits>.txt" used to abort the whole run with an
        # AttributeError on .group(1); report it and move on instead.
        print(' '.join(('Skipping file without a page id: ', filename)))
        continue
    page_id = src_pg_id.group(1)

    # Each print receives one pre-joined string, which produces the same
    # text as the original multi-item Python 2 print statements.
    print(' '.join(("src_pg_id.group(1)", page_id)))
    if page_id not in st_pg_id_nodes_proc_lst:
        print(' '.join(('Processing file: ', filename)))

        # Grab shortest path data
        print('Fetching shortest path data')
        shortest_path_data = Path_Data(starting_path+filename)
        sp_df = pd.DataFrame.from_dict(shortest_path_data.path_data.items())
        sp_df.columns = ['key', 'sp']
        #print sp_df.head()

        # Show games that start with this page id
        print('Fetching games per starting node')
        for games_plus in gamesWithSourceNode(page_id):
            if len(games_plus) >= 3:
                # Key is "<element 0>_to_<element 2>"; the value is element 1.
                game_key = "%ld_to_%ld" % (games_plus[0], games_plus[2])
                path_chain_for_game_dict[game_key] = games_plus[1]
            else:
                print('games_plus is not at least 3')
        # NOTE(review): path_chain_for_game_dict is not reset in this loop,
        # so the count below includes entries added for earlier files --
        # confirm that this accumulation is intended.
        print(' '.join(('games with starting node: ',
                        str(len(path_chain_for_game_dict)))))
        gm_df = pd.DataFrame.from_dict(path_chain_for_game_dict.items())
        gm_df.columns = ['key', 'game']
        #print gm_df.head()

        # Merge data frames on key (the former empty-DataFrame placeholder
        # was overwritten immediately, so it has been dropped).
        data = pd.merge(gm_df, sp_df, on='key')
        #print data.describe()
#     csv.writer(f).writerow(row)
#     f.write(row)
# f.close()
# print 'Done writing results to file io'
# print 'Done.'
###############################################################################
# Look up the games that start at the source node named in args.filename and
# write one row per end page for which no shortest-path score is found.

# The source node id is whatever sits between "sssp_" and a literal ".txt".
result = re.search(r'sssp_(.*)\.txt', args.filename)
if result is None:
    # Previously this surfaced later as an AttributeError on result.group(1).
    raise SystemExit('Cannot find a source node id (sssp_<id>.txt) in: '
                     + args.filename)
src_pg = result.group(1)

print('Script arguments:')
print('\t'+args.filename)
print('\nSource node (wikipedia.game.page_id): '+src_pg)
print('Games with source node (%s): ' % src_pg)

results = gamesWithSourceNode(src_pg, -1)
data = np.array(results)
df = pd.DataFrame(data, columns=['src_pg', 'game', 'end_pg'])
print(df.head())

### Find the shortest path score to end_page
# output filename: <source node>_sp_score_<today as DDMonYY>.txt
output_file = ('outputFiles/' + src_pg + '_sp_score_'
               + datetime.date.today().strftime("%d%b%y") + '.txt')

# write out to file
# NOTE(review): f is not closed anywhere in the visible part of this script;
# confirm that it is closed after the loop further down.
f = open(output_file, 'w')
writer = csv.writer(f)  # built once instead of once per row
for end_pg in df['end_pg']:
    end_pg_score = find_sssp_score(end_pg, args.filename)
    if end_pg_score is None:
        # No score found: emit a single tab-joined cell ending in a 0 score.
        row_2_print = src_pg+'\t'+end_pg+'\t0'
        writer.writerow([row_2_print])
# For every sssp_<id>.txt file under args.dirPath, fetch the games starting
# at that source node and save them as <id>_wpgame_games.dat.
args = parser.parse_args()
#filenames = re.search('sssp_(.*).txt', getFilenames(args.dirPath))
srcNodes = list()
results = []
filenames = getFilenames(args.dirPath)

## extract source nodes
for fn in filenames:
    # The dot is escaped so only a literal ".txt" extension is accepted.
    match = re.search(r'sssp_(.*)\.txt', fn)
    if match is None:
        # Entries that are not sssp files used to abort the run with an
        # AttributeError on .group(1); they are now ignored.
        continue
    srcNodes.append(match.group(1))

## get games
for srcnode in srcNodes:
    # Only purely numeric ids are processed.
    if not srcnode.isdigit():
        continue
    results = gamesWithSourceNode(srcnode, -1)
    data = np.array(results)
    df = pd.DataFrame(data, columns=['src_pg', 'game', 'end_pg'])
    #print df.head()
    outFname = ('/home/saguinag/CategoryPaths/gamesDatafiles/' + srcnode
                + '_wpgame_games.dat')
    df.to_csv(outFname, sep=',', mode='w', encoding='utf-8', index=False)
    df = None  # drop the reference to the frame before the next iteration
print('Done.')

# NOTE(review): the text below is disabled code that the original introduced
# with a triple-quote marker; it is kept verbatim as comments.
# """
# ## Find the shortest path score to end_page
# src_pg = filenames.group(1)
# # output filename
# print output_file