if len(sys.argv) == 2: inputDir = sys.argv[1] outputDir = inputDir elif len(sys.argv) == 3: inputDir = sys.argv[1] outputDir = sys.argv[2] else: print "Usage: python statisticWordDf.py inputDir(eg. ~/corpus/data_twitter201301/201301_clean) [outputDir default:=inputDir]" sys.exit(0) sym_names = loadSnP500("/home/yxqin/corpus/obtainSNP500/snp500_ranklist_20160801") snpSym = ["$"+item[0].lower() for item in sym_names] #print "## snpSyms:", snpSym[:5] ################################################################### [snpCompHash, windowHash] = statisticDF(inputDir, snpSym) compDFHash = {} for comp in snpCompHash: avg_df = sum(snpCompHash.get(comp).values())/float(len(windowHash)) compDFHash[comp] = avg_df #sortedList = sortHash(compDFHash, 1, True) #for item in sortedList:#[:50] for sym in snpSym: if sym not in snpCompHash: continue sorted_df = sortHash(snpCompHash.get(sym), 0, False) df_sorted = [str(dfItem[1]) for dfItem in sorted_df] print sym, "\t", compDFHash.get(sym), "\t", "\t".join(df_sorted) #print "\t".join(df_sorted)
sys.exit(0) [btySklHash, unitDFHash, unitInvolvedHash] = loadEvtseg(btySklFileName) print "### Example of skl: ", len(btySklHash), btySklHash.keys()[0:20] ########### # frmHash = loadBtyFrm(sys.argv[1]) # print frmHash.keys()[0:20] # sys.exit() ########### ############################################### ########### # output freq distri of bursty units [unitHash, windowHash] = statisticDF(dataFilePath, btySklHash) unitHash_FST = {} unitHash_Score = {} for unit in sorted(unitHash.keys()): df_hash = unitHash[unit] prob_hash = dict([(t, df_hash[t]/windowHash[t]) for t in df_hash if df_hash[t]>0]) l = len(prob_hash) probTemp = sum(prob_hash.values()) prob = probTemp/l dayStr = btySklFileName[-2:] e_st = windowHash[dayStr] * prob sigma_st = math.sqrt(e_st*(1-prob)) unitHash_Score[unit] = (df_hash.get(dayStr) - e_st)/sigma_st
sys.exit(0) [btySklHash, unitDFHash, unitInvolvedHash] = loadEvtseg(btySklFileName) print "### Example of skl: ", len(btySklHash), btySklHash.keys()[0:20] ########### # frmHash = loadBtyFrm(sys.argv[1]) # print frmHash.keys()[0:20] # sys.exit() ########### ############################################### ########### # output freq distri of bursty units [unitHash, windowHash] = statisticDF(dataFilePath, btySklHash) unitHash_FST = {} unitHash_Score = {} for unit in sorted(unitHash.keys()): df_hash = unitHash[unit] prob_hash = dict([(t, df_hash[t] / windowHash[t]) for t in df_hash if df_hash[t] > 0]) l = len(prob_hash) probTemp = sum(prob_hash.values()) prob = probTemp / l dayStr = btySklFileName[-2:] e_st = windowHash[dayStr] * prob sigma_st = math.sqrt(e_st * (1 - prob))