if len(sys.argv) == 2:
        inputDir = sys.argv[1]
        outputDir = inputDir
    elif len(sys.argv) == 3:
        inputDir = sys.argv[1]
        outputDir = sys.argv[2]
    else:
        print "Usage: python statisticWordDf.py inputDir(eg. ~/corpus/data_twitter201301/201301_clean) [outputDir default:=inputDir]"
        sys.exit(0)

    sym_names = loadSnP500("/home/yxqin/corpus/obtainSNP500/snp500_ranklist_20160801")
    snpSym = ["$"+item[0].lower() for item in sym_names]
    #print "## snpSyms:", snpSym[:5]

###################################################################
    [snpCompHash, windowHash] = statisticDF(inputDir, snpSym)
    compDFHash = {}
    for comp in snpCompHash:
        avg_df = sum(snpCompHash.get(comp).values())/float(len(windowHash))
        compDFHash[comp] = avg_df

    #sortedList = sortHash(compDFHash, 1, True)
    #for item in sortedList:#[:50]
    for sym in snpSym:
        if sym not in snpCompHash:
            continue
        sorted_df = sortHash(snpCompHash.get(sym), 0, False)
        df_sorted = [str(dfItem[1]) for dfItem in sorted_df]
        print sym, "\t", compDFHash.get(sym), "\t", "\t".join(df_sorted)
        #print "\t".join(df_sorted)
Exemple #2
0
        sys.exit(0)

    # Load the structures returned by loadEvtseg for the file named by
    # btySklFileName. Only btySklHash is used in the code visible here.
    [btySklHash, unitDFHash, unitInvolvedHash] = loadEvtseg(btySklFileName)
    # Show how many entries were loaded plus a sample of up to 20 keys.
    # Slicing keys() directly relies on it returning a list (Python 2).
    print "### Example of skl: ", len(btySklHash), btySklHash.keys()[0:20]

###########
#    frmHash = loadBtyFrm(sys.argv[1])
#    print frmHash.keys()[0:20]
#    sys.exit()
###########
    
###############################################

###########
# Output the frequency distribution of bursty units.
    [unitHash, windowHash] = statisticDF(dataFilePath, btySklHash)

    # unitHash_FST is created here but not filled in the visible code.
    unitHash_FST = {}
    # unitHash_Score maps each unit to the score computed in the loop below.
    unitHash_Score = {}
    # Units are processed in sorted key order.
    for unit in sorted(unitHash.keys()):
        df_hash = unitHash[unit]

        # Per-window ratio df / window value, restricted to windows where the
        # unit's df is positive.
        # NOTE(review): under Python 2, "/" between two ints truncates to an
        # int -- confirm that statisticDF returns floats (or that true
        # division is enabled at the top of the file).
        prob_hash = dict([(t, df_hash[t]/windowHash[t]) for t in df_hash if df_hash[t]>0])
        # prob is the mean of those ratios over the windows that contributed.
        # NOTE(review): if no window has a positive df for this unit, l is 0
        # and the division below raises ZeroDivisionError.
        l = len(prob_hash)
        probTemp = sum(prob_hash.values())
        prob = probTemp/l

        # The last two characters of the file name are used as the window key
        # (presumably the day of the month -- verify against the naming of
        # the input files). The value is the same on every iteration.
        dayStr = btySklFileName[-2:]
        # e_st and sigma_st have the form of a binomial mean N*p and standard
        # deviation sqrt(N*p*(1-p)), with N = windowHash[dayStr] and p = prob.
        e_st = windowHash[dayStr] * prob
        sigma_st = math.sqrt(e_st*(1-prob))
        # Score: how far the unit's df in window dayStr lies from e_st, in
        # units of sigma_st.
        # NOTE(review): df_hash.get(dayStr) is None when the unit has no
        # entry for dayStr (TypeError on subtraction), and sigma_st is 0 when
        # prob is 0 or 1 (ZeroDivisionError) -- neither case is handled here.
        unitHash_Score[unit] = (df_hash.get(dayStr) - e_st)/sigma_st
        sys.exit(0)

    [btySklHash, unitDFHash, unitInvolvedHash] = loadEvtseg(btySklFileName)
    print "### Example of skl: ", len(btySklHash), btySklHash.keys()[0:20]

    ###########
    #    frmHash = loadBtyFrm(sys.argv[1])
    #    print frmHash.keys()[0:20]
    #    sys.exit()
    ###########

    ###############################################

    ###########
    # output freq distri of bursty units
    [unitHash, windowHash] = statisticDF(dataFilePath, btySklHash)

    unitHash_FST = {}
    unitHash_Score = {}
    for unit in sorted(unitHash.keys()):
        df_hash = unitHash[unit]

        prob_hash = dict([(t, df_hash[t] / windowHash[t]) for t in df_hash
                          if df_hash[t] > 0])
        l = len(prob_hash)
        probTemp = sum(prob_hash.values())
        prob = probTemp / l

        dayStr = btySklFileName[-2:]
        e_st = windowHash[dayStr] * prob
        sigma_st = math.sqrt(e_st * (1 - prob))