Example #1
def divide_archived_news(trainingStart, trainingEnd, estimationStart, estimationEnd):
    archivedNewsPath = common.get_configuration("training", "GROUP_STOCK_NEWS")
    archivedNews = json.load(open(archivedNewsPath),encoding='ISO-8859-1')
    trainingPhaseNews = {}
    testPhaseNews = {}
    
    timelineBegin = time.strptime(trainingStart, "%Y-%m-%d")
    timelineEnd = time.strptime(trainingEnd, "%Y-%m-%d")
    eTimeLineBegin = time.strptime(estimationStart, "%Y-%m-%d")
    eTimeLineEnd = time.strptime(estimationEnd, "%Y-%m-%d")
    
    for stock in archivedNews:
        if stock not in trainingPhaseNews:
            trainingPhaseNews[stock] = {}
        if stock not in testPhaseNews:
            testPhaseNews[stock] = {}
        for articleId in archivedNews[stock]:
            newsDate = time.strptime(articleId[0:8],"%Y%m%d")
            if timelineBegin <= newsDate <= timelineEnd:
                trainingPhaseNews[stock][articleId] = archivedNews[stock][articleId]
            elif eTimeLineBegin <= newsDate <= eTimeLineEnd:
                testPhaseNews[stock][articleId] = archivedNews[stock][articleId]
    
    "Write Training data and Test Data to File"
    trainingFilePath = common.get_configuration("training", "TRAINING_NEWS_FILE")
    with open(trainingFilePath,"w") as output:
        output.write(json.dumps(trainingPhaseNews))  
    
    testingFilePath = common.get_configuration("training", "TESTING_NEWS_FILE")
    with open(testingFilePath,"w") as output:
        output.write(json.dumps(testPhaseNews))           
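
A minimal usage sketch for the variant above, assuming the "training" configuration entries resolve to real paths; the four arguments are plain ISO date strings bounding the training and estimation windows:

# Hypothetical invocation: 2011 as the training window, Q1 2012 for estimation.
divide_archived_news("2011-01-01", "2011-12-31", "2012-01-01", "2012-03-31")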
Example #2
def divide_archived_news(endDate):
    archivedNewsPath = common.get_configuration("model", "GROUP_STOCK_NEWS")
    archivedNews = json.load(open(archivedNewsPath), encoding='ISO-8859-1')
    trainingPhaseNews = {}
    testPhaseNews = {}

    timeLine = time.strptime(endDate, "%Y-%m-%d")
    for stock in archivedNews:
        if stock not in trainingPhaseNews:
            trainingPhaseNews[stock] = {}
        if stock not in testPhaseNews:
            testPhaseNews[stock] = {}
        for articleId in archivedNews[stock]:
            newsDate = time.strptime(articleId[0:8], "%Y%m%d")
            if newsDate < timeLine:
                trainingPhaseNews[stock][articleId] = archivedNews[stock][
                    articleId]
            else:
                testPhaseNews[stock][articleId] = archivedNews[stock][
                    articleId]

    "Write Training data and Test Data to File"
    trainingFilePath = common.get_configuration("model", "TRAINING_NEWS_FILE")
    with open(trainingFilePath, "w") as output:
        output.write(json.dumps(trainingPhaseNews))

    testingFilePath = common.get_configuration("model", "TESTING_NEWS_FILE")
    with open(testingFilePath, "w") as output:
        output.write(json.dumps(testPhaseNews))
Example #3
def create_conf(warning_threshold,news_back):
    termConFile = common.get_configuration("model", "TERM_CONTRIBUTION_PATH")
    clustConFile = common.get_configuration("model", "CLUSTER_CONTRIBUTION_PATH")
    clustProFile = common.get_configuration("model", "CLUSTER_PROBABILITY_PATH")
    keyWordsFile = common.get_configuration("training", "VOCABULARY_FILE")
    trendFile = common.get_configuration("model", "TREND_RANGE_FILE")
    
    conf = {}
    conf["1"] = {}
    conf["1"]["termContribution"] = json.load(open(termConFile))
    conf["1"]["clusterProbability"] = json.load(open(clustProFile))
    conf["1"]["clusterContribution"] = json.load(open(clustConFile))
    conf["1"]["location"] = {"BVPSBVPS":"Panama","MERVAL":"Argentina","IBOV":"Brazil","CHILE65":"Chile","COLCAP":"Colombia","CRSMBCT":"Costa Rica","MEXBOL":"Mexico","IGBVL":"Peru","IBVC":"Venezuela"}
    conf["1"]["stocks"] = ["MERVAL","IBOV","CHILE65","COLCAP","CRSMBCT","MEXBOL","BVPSBVPS","IGBVL","IBVC"]
    conf["1"]["kyewordList"] = json.load(open(keyWordsFile))
    conf["1"]["warning_threshold"] = warning_threshold
    conf["1"]["version"] = "1"
    conf["1"]["news_back"] = news_back
    
    with open("./model_test.conf","w") as o_q:
        o_q.write(json.dumps(conf))
    
    conf_trend = {}
    conf_trend["1"] = json.load(open(trendFile))
    with open("./trendRange.json","w") as o_q:
        o_q.write(json.dumps(conf_trend))
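
A quick sketch of consuming the file written above; the keys mirror exactly what create_conf stores under version "1":

import json

conf = json.load(open("./model_test.conf"))
print conf["1"]["warning_threshold"]   # the threshold passed to create_conf
print conf["1"]["location"]["MERVAL"]  # -> "Argentina"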
Example #4
def get_company_list():
    comDir = common.get_configuration("model", "COMPANY_MEMBER")
    sfile = os.listdir(comDir)
    companyList = {}
    for fi in sfile:
        with open(comDir + "/" + fi) as comFile:
            lines = comFile.readlines()
            stockIndex = lines[1].replace("\r", "").replace(
                "\n", "").split(",")[1].replace(" Index", "")
            if stockIndex not in companyList:
                companyList[stockIndex] = []
            for line in lines[2:]:
                infos = line.replace("\r", "").replace("\n", "").split(",")
                companyName = infos[2]
                tmps = companyName.split(" ")
                # for multi-word names, drop the last word (the company-type suffix)
                companyName = " ".join(tmps[:-1] if len(tmps) > 1 else tmps)
                if companyName not in companyList[stockIndex]:
                    companyList[stockIndex].append(companyName)
            companyList[stockIndex].append(stockIndex)

    desFile = common.get_configuration("model", "COMPANY_LIST")
    with open(desFile, "w") as output:
        jsStr = json.dumps(companyList)
        output.write(jsStr)
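
For illustration, a hypothetical COMPANY_MEMBER file in the CSV layout the parser above assumes: the index name sits in the second field of line 2 (with an " Index" suffix), and member rows start at line 3 with the company name in the third field:

# Hypothetical input file:
#
#   Exported,2012-08-01,,
#   Members,MERVAL Index,,
#   1,MERV,Tenaris SA,ARS
#   2,MERV,Petrobras Argentina SA,ARS
#
# get_company_list() would then record:
#   {"MERVAL": ["Tenaris", "Petrobras Argentina", "MERVAL"]}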
Example #5
def compute_term_contribution():
    "Read the Vocabulary File"
    vocabularyFilePath = common.get_configuration("model", "VOCABULARY_FILE")
    vocaLines = open(vocabularyFilePath).readlines()
    vocaList = [w.replace("\n", "") for w in vocaLines]

    stemmer = nltk.stem.snowball.SnowballStemmer('english')
    print "StartTime: ", datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S")

    finalWordContribution = {}
    "Iteratively to access each Stock Index"
    trainingFile = group_news_by_cluster()
    for index in trainingFile:
        stockNews = trainingFile[index]
        wordContribution = {}
        for cluster in stockNews:
            #computing the words count in each cluster
            articles = cluster["articles"]
            #initiate the wordFreq
            wordFreq = {}
            for term in vocaList:
                wordFreq[term] = 0
            for article in articles:
                content = article["content"]
                tokens = nltk.word_tokenize(content)
                words = [
                    w.lower() for w in tokens if w not in [
                        ",", ".", ")", "]", "(", "[", "*", ";", "...", ":",
                        "&", '"'
                    ] and not w.isdigit()
                ]
                words = [
                    w for w in words if w.encode("utf8") not in
                    nltk.corpus.stopwords.words('english')
                ]
                stemmedWords = [stemmer.stem(w) for w in words]
                fdist = nltk.FreqDist(stemmedWords)
                for term in wordFreq:
                    if term in fdist:
                        wordFreq[term] = wordFreq[term] + fdist[term]
            # compute each term's contribution with add-one (Laplace) smoothing;
            # float() guards against Python 2 integer division truncating to 0
            count = sum(wordFreq.values())
            contributions = {}
            for term in wordFreq:
                contribution = float(wordFreq[term] + 1) / (count + len(wordFreq))
                contributions[term] = "%0.4f" % contribution
                # print "term:%s, contribution:%f" % (term, contribution)

            # add the contributions to each cluster
            wordContribution[cluster["cluster"]] = contributions

        finalWordContribution[index] = wordContribution
    print "EndTime: ", datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S")

    "Write the Term Contribution To File"
    termContributionFile = common.get_configuration("model",
                                                    "TERM_CONTRIBUTION_PATH")
    jsString = json.dumps(finalWordContribution)
    with open(termContributionFile, "w") as output:
        output.write(jsString)
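
The contribution formula above is plain add-one (Laplace) smoothing. A tiny worked example with made-up counts:

# Suppose a cluster's articles contain the term "crisis" 3 times, the
# vocabulary holds 150 terms, and the cluster's total term count is 847:
#
#   contribution = (3 + 1) / (847 + 150.0) = 4 / 997.0 ~= 0.0040
#
# The +1 and the +len(vocabulary) keep unseen terms from yielding a zero
# probability (and a math domain error once logs are taken downstream).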
Example #6
def divide_archived_news(endDate):
    archivedNewsPath = common.get_configuration("model", "GROUP_STOCK_NEWS")
    archivedNews = json.load(open(archivedNewsPath),encoding='ISO-8859-1')
    trainingPhaseNews = {}
    testPhaseNews = {}
    
    timeLine = time.strptime(endDate, "%Y-%m-%d")
    for stock in archivedNews:
        if stock not in trainingPhaseNews:
            trainingPhaseNews[stock] = {}
        if stock not in testPhaseNews:
            testPhaseNews[stock] = {}
        for articleId in archivedNews[stock]:
            newsDate = time.strptime(articleId[0:8],"%Y%m%d")
            if newsDate < timeLine:
                trainingPhaseNews[stock][articleId] = archivedNews[stock][articleId]
            else:
                testPhaseNews[stock][articleId] = archivedNews[stock][articleId]
    
    "Write Training data and Test Data to File"
    trainingFilePath = common.get_configuration("model", "TRAINING_NEWS_FILE")
    with open(trainingFilePath,"w") as output:
        output.write(json.dumps(trainingPhaseNews))  
    
    testingFilePath = common.get_configuration("model", "TESTING_NEWS_FILE")
    with open(testingFilePath,"w") as output:
        output.write(json.dumps(testPhaseNews))           
Example #7
    def compute_stock_index_probability(self, predictiveDate, clusterType,
                                        stockIndex):
        try:
            "Get the clusters List"
            stockIndexFile = open(
                common.get_configuration("model", 'CLUSTER_PROBABILITY_PATH'))
            clusterProbability = json.load(stockIndexFile)
            clusterJson = {}
            clusterContributionJson = {}
            clusterJson = clusterProbability[stockIndex]
            "Get the contribution of each cluster"
            clusterContributionFile = open(
                common.get_configuration("model", 'CLUSTER_CONTRIBUTION_PATH'))
            clusterContributionJson = json.load(clusterContributionFile)
            clusterTypesHistory, stockDerived = self.get_stock_index_cluster(
                predictiveDate, stockIndex)
            stockIndexProbability = 0
            for key in clusterContributionJson[stockIndex].keys():
                if key == str(clusterType):
                    "Search the Cluster contribution Matrix to get the contribution probability"
                    contrib = clusterContributionJson[stockIndex][key]
                    stockIndexProbability = (
                        stockIndexProbability +
                        math.log(float(contrib[int(clusterTypesHistory[0]) - 1][2])) +
                        math.log(float(contrib[int(clusterTypesHistory[1]) - 1][1])) +
                        math.log(float(contrib[int(clusterTypesHistory[2]) - 1][0])) +
                        math.log(float(clusterJson[str(clusterType)])))

            return stockIndexProbability, stockDerived
        except Exception as e:
            print traceback.format_exc()
            print "Error in computing stock index probability: %s" % e.args
Example #8
def compute_term_contribution():
    "Read the Vocabulary File"
    vocabularyFilePath = common.get_configuration("training", "VOCABULARY_FILE")
    vocaLines = open(vocabularyFilePath).readlines()
    vocaList = [w.replace("\n","") for w in vocaLines]
    
    stemmer = nltk.stem.snowball.SnowballStemmer('english')
    print "StartTime: ",datetime.strftime(datetime.now(),"%Y-%m-%d %H:%M:%S")
    
    finalWordContribution = {}
    "Iteratively to access each Stock Index"
    trainingFile = group_news_by_cluster()
    
    print "Finish Group news by cluster"
    
    for index in trainingFile:
        stockNews = trainingFile[index]
        wordContribution = {}
        for cluster in stockNews:
            #computing the words count in each cluster
            print "Start Cluster ", cluster["cluster"], "For Stock ",index, "at ",datetime.strftime(datetime.now(),"%Y-%m-%d %H:%M:%S")
            articles = cluster["articles"]
            #initiate the wordFreq
            wordFreq = {}
            for term in vocaList:
                wordFreq[term] = 0
            for article in articles:
                content = article["content"]
                tokens = nltk.word_tokenize(content)
                words = [w.lower() for w in tokens if w not in [",",".",")","]","(","[","*",";","...",":","&",'"'] and not w.isdigit()]
                words = [w for w in words if w.encode("utf8") not in nltk.corpus.stopwords.words('english')]
                stemmedWords = [stemmer.stem(w) for w in words]
                fdist=nltk.FreqDist(stemmedWords)
                for term in wordFreq:
                    if term in fdist:
                        wordFreq[term] = wordFreq[term] + fdist[term]
            # compute each term's contribution with add-one (Laplace) smoothing;
            # float() guards against Python 2 integer division truncating to 0
            count = sum(wordFreq.values())
            contributions = {}
            for term in wordFreq:
                contribution = float(wordFreq[term] + 1) / (count + len(wordFreq))
                contributions[term] = "%0.4f" % contribution
                # print "term:%s, contribution:%f" % (term, contribution)
            
            # add the contributions to each cluster
            wordContribution[cluster["cluster"]] = contributions
            print "Finish Cluster ", cluster["cluster"], "For Stock ",index, "at ",datetime.strftime(datetime.now(),"%Y-%m-%d %H:%M:%S")
    
        finalWordContribution[index] = wordContribution    
    print "EndTime: ",datetime.strftime(datetime.now(),"%Y-%m-%d %H:%M:%S")
    
    "Write the Term Contribution To File"
    termContributionFile = common.get_configuration("model", "TERM_CONTRIBUTION_PATH")
    jsString = json.dumps(finalWordContribution)
    with open(termContributionFile,"w") as output:
        output.write(jsString)
Example #9
def clusterSet(trainingStart, trainingEndDate, clu_num):
    con = common.getDBConnection()
    cur = con.cursor()
    
    finalClusterRecord = []
    stockList = ["MERVAL","MEXBOL","CHILE65","BVPSBVPS","COLCAP","CRSMBCT","IBOV","IGBVL","IBVC"]
    finalOrderCluster = {}
    for stock in stockList:
        sql = "select embers_id,post_date,current_value,previous_close_value,one_day_change,change_percent,name from t_enriched_bloomberg_prices where name=? and post_date<=? and post_date>=? order by post_date asc"
        cur.execute(sql, (stock, trainingEndDate, trainingStart))
        rows = cur.fetchall()
        changes = [row[5] for row in rows]
        fdist = nltk.FreqDist(changes)
        clusterS = [(0,x) for x in fdist.keys()]
        
        c1 = KMeansClustering(clusterS)
        cluster = c1.getclusters(clu_num)
        "The sample data of cluster by the KMeans algorithm"
#        cluster = [[(0, 0.0862), (0, 0.088), (0, 0.0914), (0, 0.094), (0, 0.0957), (0, 0.097), (0, 0.1017), (0, 0.1024), (0, 0.0774), (0, 0.0882), (0, 0.0783), (0, 0.11), (0, 0.0807), (0, 0.0813), (0, 0.1367), (0, 0.0831), (0, 0.0836), (0, 0.0855), (0, 0.0879), (0, 0.0912), (0, 0.0763), (0, 0.1046), (0, 0.0784), (0, 0.0815), (0, 0.1464), (0, 0.1987), (0, 0.1053), (0, 0.1101), (0, 0.1176), (0, 0.0868), (0, 0.1342), (0, 0.1466), (0, 0.0761), (0, 0.0772)], [(0, -0.0001), (0, 0.0), (0, 0.0001), (0, -0.0002), (0, -0.0003), (0, -0.0004), (0, -0.0005), (0, -0.0006), (0, 0.0002), (0, 0.0003), (0, 0.0004), (0, 0.0005), (0, 0.0006), (0, 0.0007), (0, 0.0008), (0, 0.0009), (0, 0.001), (0, 0.0011), (0, 0.0012), (0, 0.0013), (0, 0.0014), (0, 0.0015), (0, 0.0016), (0, 0.0017), (0, 0.0018), (0, 0.0019), (0, 0.002), (0, 0.0021), (0, 0.0022), (0, 0.0023), (0, 0.0024), (0, 0.0025), (0, 0.0026), (0, 0.0027), (0, 0.0028), (0, 0.0029), (0, 0.003), (0, 0.0031), (0, 0.0032), (0, 0.0033), (0, 0.0034), (0, 0.0035), (0, 0.0036), (0, 0.0037), (0, 0.0038), (0, 0.0039), (0, 0.004), (0, 0.0041), (0, 0.0042), (0, 0.0043), (0, 0.0044), (0, 0.0045), (0, 0.0046), (0, 0.0047), (0, 0.0048), (0, 0.0049), (0, 0.005), (0, -0.0007), (0, -0.0008)], [(0, 0.0297), (0, 0.0296), (0, 0.0298), (0, 0.0299), (0, 0.0301), (0, 0.03), (0, 0.0303), (0, 0.0302), (0, 0.0304), (0, 0.0305), (0, 0.0306), (0, 0.0308), (0, 0.0307), (0, 0.0309), (0, 0.031), (0, 0.0311), (0, 0.0313), (0, 0.0314), (0, 0.0312), (0, 0.0316), (0, 0.0315), (0, 0.0317), (0, 0.0318), (0, 0.032), (0, 0.0319), (0, 0.0322), (0, 0.0321), (0, 0.0324), (0, 0.0323), (0, 0.0326), (0, 0.0325), (0, 0.0328), (0, 0.033), (0, 0.0327), (0, 0.0332), (0, 0.0331), (0, 0.0333), (0, 0.0329), (0, 0.0335), (0, 0.0336), (0, 0.0334), (0, 0.0337), (0, 0.0338), (0, 0.0339), (0, 0.034), (0, 0.0341), (0, 0.0342), (0, 0.0343), (0, 0.0344), (0, 0.0345), (0, 0.0346), (0, 0.0348), (0, 0.0349), (0, 0.035), (0, 0.0351), (0, 0.0352), (0, 0.0355), (0, 0.0356), (0, 0.0358), (0, 0.0357), (0, 0.0359), (0, 0.036), (0, 0.0361), (0, 0.0362), (0, 0.0363), (0, 0.0365)], [(0, 0.0559), (0, 0.0564), (0, 0.0568), (0, 0.0571), (0, 0.0573), (0, 0.0579), (0, 0.0578), (0, 0.0581), (0, 0.0587), (0, 0.0589), (0, 0.0595), (0, 0.0591), (0, 0.0594), (0, 0.0604), (0, 0.0598), (0, 0.06), (0, 0.0602), (0, 0.0609), (0, 0.0612), (0, 0.059), (0, 0.0606), (0, 0.0614), (0, 0.0619), (0, 0.0625), (0, 0.0628), (0, 0.0615), (0, 0.0637), (0, 0.0633), (0, 0.0634), (0, 0.0636), (0, 0.0654), (0, 0.0658), (0, 0.0659), (0, 0.0669), (0, 0.0667), (0, 0.0664), (0, 0.067), (0, 0.0675), (0, 0.0673), (0, 0.0676), (0, 0.0686), (0, 0.07), (0, 0.0697), (0, 0.0709), (0, 0.0716), (0, 0.0717), (0, 0.0738), (0, 0.0747)], [(0, -0.0133), (0, -0.0132), (0, -0.0135), (0, -0.0134), (0, -0.0137), (0, -0.0138), (0, -0.0136), (0, -0.014), (0, -0.0139), (0, -0.0142), (0, -0.0143), (0, -0.0144), (0, -0.0141), (0, -0.0145), (0, -0.0146), (0, -0.0147), (0, -0.0148), (0, -0.0149), (0, -0.015), (0, -0.0151), (0, -0.0152), (0, -0.0153), (0, -0.0154), (0, -0.0155), (0, -0.0156), (0, -0.0157), (0, -0.0158), (0, -0.0159), (0, -0.016), (0, -0.0161), (0, -0.0162), (0, -0.0163), (0, -0.0164), (0, -0.0165), (0, -0.0166), (0, -0.0167), (0, -0.0168), (0, -0.0169), (0, -0.017), (0, -0.0171), (0, -0.0172), (0, -0.0173), (0, -0.0174), (0, -0.0175), (0, -0.0176), (0, -0.0177), (0, -0.0178), (0, -0.0179), (0, -0.018), (0, -0.0181), (0, -0.0182), (0, -0.0183), (0, -0.0184), (0, -0.0185), (0, -0.0186), (0, -0.0187), (0, -0.0188), (0, -0.0189), (0, -0.019), (0, -0.0191), (0, -0.0192), (0, 
-0.0193), (0, -0.0194), (0, -0.0195)], [(0, 0.0448), (0, 0.0451), (0, 0.0452), (0, 0.0446), (0, 0.0447), (0, 0.0456), (0, 0.045), (0, 0.0455), (0, 0.0462), (0, 0.0459), (0, 0.0461), (0, 0.0466), (0, 0.046), (0, 0.0467), (0, 0.0445), (0, 0.0458), (0, 0.0464), (0, 0.0477), (0, 0.0463), (0, 0.0472), (0, 0.0478), (0, 0.0457), (0, 0.0476), (0, 0.0481), (0, 0.0484), (0, 0.0488), (0, 0.0483), (0, 0.0487), (0, 0.0471), (0, 0.0482), (0, 0.0496), (0, 0.0474), (0, 0.0495), (0, 0.0485), (0, 0.0504), (0, 0.0505), (0, 0.0506), (0, 0.0501), (0, 0.0509), (0, 0.0508), (0, 0.051), (0, 0.0515), (0, 0.0516), (0, 0.052), (0, 0.0522), (0, 0.0524), (0, 0.053), (0, 0.0531), (0, 0.0534), (0, 0.0535), (0, 0.0536), (0, 0.0537), (0, 0.0538), (0, 0.0541), (0, 0.0542), (0, 0.0545), (0, 0.0546), (0, 0.0548), (0, 0.055)], [(0, 0.0172), (0, 0.017), (0, 0.0173), (0, 0.0174), (0, 0.0171), (0, 0.0177), (0, 0.0175), (0, 0.0178), (0, 0.0179), (0, 0.0176), (0, 0.0181), (0, 0.018), (0, 0.0183), (0, 0.0182), (0, 0.0186), (0, 0.0185), (0, 0.0187), (0, 0.0184), (0, 0.0189), (0, 0.0188), (0, 0.019), (0, 0.0191), (0, 0.0192), (0, 0.0194), (0, 0.0193), (0, 0.0196), (0, 0.0195), (0, 0.0197), (0, 0.0199), (0, 0.0198), (0, 0.02), (0, 0.0201), (0, 0.0202), (0, 0.0204), (0, 0.0205), (0, 0.0206), (0, 0.0203), (0, 0.0208), (0, 0.0207), (0, 0.021), (0, 0.0209), (0, 0.0211), (0, 0.0213), (0, 0.0212), (0, 0.0214), (0, 0.0215), (0, 0.0216), (0, 0.0217), (0, 0.0218), (0, 0.0219), (0, 0.022), (0, 0.0221), (0, 0.0222), (0, 0.0223), (0, 0.0224), (0, 0.0225), (0, 0.0226), (0, 0.0227), (0, 0.0228), (0, 0.0229), (0, 0.023), (0, 0.0231)], [(0, -0.0408), (0, -0.041), (0, -0.0411), (0, -0.0412), (0, -0.0413), (0, -0.0415), (0, -0.0416), (0, -0.0417), (0, -0.0419), (0, -0.042), (0, -0.0423), (0, -0.0424), (0, -0.0418), (0, -0.0425), (0, -0.0428), (0, -0.043), (0, -0.0431), (0, -0.0432), (0, -0.0433), (0, -0.0434), (0, -0.0436), (0, -0.0438), (0, -0.0439), (0, -0.044), (0, -0.0442), (0, -0.0441), (0, -0.0446), (0, -0.0443), (0, -0.0448), (0, -0.0447), (0, -0.045), (0, -0.0449), (0, -0.0453), (0, -0.0451), (0, -0.0454), (0, -0.0455), (0, -0.0458), (0, -0.0456), (0, -0.0459), (0, -0.0463), (0, -0.0461), (0, -0.046), (0, -0.0464), (0, -0.0465), (0, -0.0467), (0, -0.0462), (0, -0.0466), (0, -0.0472), (0, -0.0469), (0, -0.0475), (0, -0.0473), (0, -0.0478), (0, -0.0477), (0, -0.0476), (0, -0.0482), (0, -0.0481), (0, -0.0483), (0, -0.0487), (0, -0.0488), (0, -0.049), (0, -0.0492), (0, -0.0494)], [(0, -0.0261), (0, -0.0262), (0, -0.0263), (0, -0.0264), (0, -0.0266), (0, -0.0265), (0, -0.0267), (0, -0.0268), (0, -0.0269), (0, -0.0271), (0, -0.027), (0, -0.0273), (0, -0.0272), (0, -0.0275), (0, -0.0274), (0, -0.0277), (0, -0.0278), (0, -0.0276), (0, -0.0279), (0, -0.0281), (0, -0.028), (0, -0.0283), (0, -0.0282), (0, -0.0284), (0, -0.0285), (0, -0.0286), (0, -0.0287), (0, -0.0288), (0, -0.0289), (0, -0.0291), (0, -0.0292), (0, -0.0293), (0, -0.029), (0, -0.0294), (0, -0.0295), (0, -0.0297), (0, -0.0296), (0, -0.0299), (0, -0.03), (0, -0.0301), (0, -0.0302), (0, -0.0298), (0, -0.0303), (0, -0.0304), (0, -0.0307), (0, -0.0305), (0, -0.0308), (0, -0.031), (0, -0.0309), (0, -0.0312), (0, -0.0311), (0, -0.0313), (0, -0.0315), (0, -0.0314), (0, -0.0316), (0, -0.0317), (0, -0.0319), (0, -0.0318), (0, -0.032), (0, -0.0321), (0, -0.0322), (0, -0.0323), (0, -0.0325), (0, -0.0326), (0, -0.0327), (0, -0.0328), (0, -0.0329)], [(0, -0.0619), (0, -0.0622), (0, -0.0627), (0, -0.064), (0, -0.0645), (0, -0.065), (0, -0.0653), (0, -0.0651), (0, -0.0659), (0, -0.0663), (0, -0.0665), (0, 
-0.066), (0, -0.0666), (0, -0.0674), (0, -0.0671), (0, -0.0684), (0, -0.0672), (0, -0.0691), (0, -0.0689), (0, -0.0692), (0, -0.0701), (0, -0.0698), (0, -0.0709), (0, -0.0715), (0, -0.0717), (0, -0.0722), (0, -0.0734), (0, -0.0741), (0, -0.0749), (0, -0.0763), (0, -0.0772), (0, -0.0758), (0, -0.0762), (0, -0.0787), (0, -0.0788), (0, -0.0759), (0, -0.0775), (0, -0.0808)], [(0, -0.0905), (0, -0.1081), (0, -0.1018), (0, -0.094), (0, -0.0937), (0, -0.0936), (0, -0.0927), (0, -0.0919), (0, -0.0863), (0, -0.1593), (0, -0.1245), (0, -0.0847), (0, -0.1215), (0, -0.1139), (0, -0.1099), (0, -0.1068), (0, -0.0868), (0, -0.0856), (0, -0.0854), (0, -0.0837), (0, -0.0822), (0, -0.0877), (0, -0.1241), (0, -0.1073), (0, -0.1065), (0, -0.1011), (0, -0.0835)], [(0, -0.0196), (0, -0.0198), (0, -0.0197), (0, -0.0199), (0, -0.02), (0, -0.0201), (0, -0.0202), (0, -0.0204), (0, -0.0203), (0, -0.0205), (0, -0.0206), (0, -0.0208), (0, -0.0207), (0, -0.021), (0, -0.0209), (0, -0.0212), (0, -0.0211), (0, -0.0214), (0, -0.0215), (0, -0.0213), (0, -0.0217), (0, -0.0216), (0, -0.0219), (0, -0.0218), (0, -0.0221), (0, -0.022), (0, -0.0223), (0, -0.0222), (0, -0.0225), (0, -0.0224), (0, -0.0227), (0, -0.0226), (0, -0.0229), (0, -0.0228), (0, -0.023), (0, -0.0231), (0, -0.0232), (0, -0.0234), (0, -0.0233), (0, -0.0236), (0, -0.0235), (0, -0.0238), (0, -0.0237), (0, -0.024), (0, -0.0239), (0, -0.0242), (0, -0.0241), (0, -0.0244), (0, -0.0243), (0, -0.0245), (0, -0.0246), (0, -0.0247), (0, -0.0248), (0, -0.0249), (0, -0.025), (0, -0.0251), (0, -0.0252), (0, -0.0253), (0, -0.0254), (0, -0.0255), (0, -0.0256), (0, -0.0257), (0, -0.0258), (0, -0.0259), (0, -0.026)], [(0, -0.05), (0, -0.0504), (0, -0.0499), (0, -0.0507), (0, -0.0501), (0, -0.0509), (0, -0.0513), (0, -0.0505), (0, -0.051), (0, -0.0508), (0, -0.0517), (0, -0.0519), (0, -0.0516), (0, -0.052), (0, -0.0524), (0, -0.0525), (0, -0.0526), (0, -0.0528), (0, -0.0529), (0, -0.0533), (0, -0.0538), (0, -0.0535), (0, -0.0532), (0, -0.0542), (0, -0.0543), (0, -0.0546), (0, -0.054), (0, -0.055), (0, -0.0556), (0, -0.0545), (0, -0.056), (0, -0.0554), (0, -0.0567), (0, -0.0563), (0, -0.0571), (0, -0.0572), (0, -0.0576), (0, -0.0579), (0, -0.058), (0, -0.0584), (0, -0.0581), (0, -0.0588), (0, -0.0589), (0, -0.0591), (0, -0.0593), (0, -0.0596), (0, -0.0595), (0, -0.0601), (0, -0.0613), (0, -0.0614)], [(0, -0.001), (0, -0.0012), (0, -0.0017), (0, -0.0016), (0, -0.0013), (0, -0.0011), (0, -0.002), (0, -0.0018), (0, -0.0015), (0, -0.0014), (0, -0.0019), (0, -0.0021), (0, -0.0022), (0, -0.0023), (0, -0.0009), (0, -0.0024), (0, -0.0025), (0, -0.0026), (0, -0.0027), (0, -0.0028), (0, -0.0029), (0, -0.003), (0, -0.0031), (0, -0.0032), (0, -0.0033), (0, -0.0034), (0, -0.0035), (0, -0.0036), (0, -0.0037), (0, -0.0038), (0, -0.0039), (0, -0.004), (0, -0.0041), (0, -0.0042), (0, -0.0043), (0, -0.0044), (0, -0.0045), (0, -0.0046), (0, -0.0047), (0, -0.0048), (0, -0.0049), (0, -0.005), (0, -0.0051), (0, -0.0052), (0, -0.0053), (0, -0.0054), (0, -0.0055), (0, -0.0056), (0, -0.0057), (0, -0.0058), (0, -0.0059), (0, -0.006), (0, -0.0061), (0, -0.0062), (0, -0.0063), (0, -0.0064), (0, -0.0065), (0, -0.0066), (0, -0.0067), (0, -0.0068), (0, -0.0069)], [(0, -0.033), (0, -0.0332), (0, -0.0331), (0, -0.0334), (0, -0.0333), (0, -0.0336), (0, -0.0337), (0, -0.0335), (0, -0.0338), (0, -0.034), (0, -0.0339), (0, -0.0342), (0, -0.0343), (0, -0.0341), (0, -0.0344), (0, -0.0345), (0, -0.0346), (0, -0.0347), (0, -0.0348), (0, -0.035), (0, -0.0349), (0, -0.0351), (0, -0.0352), (0, -0.0353), (0, -0.0354), (0, 
-0.0355), (0, -0.0357), (0, -0.0356), (0, -0.0358), (0, -0.0359), (0, -0.0361), (0, -0.036), (0, -0.0363), (0, -0.0362), (0, -0.0365), (0, -0.0366), (0, -0.0364), (0, -0.0368), (0, -0.0369), (0, -0.0372), (0, -0.0371), (0, -0.0367), (0, -0.0375), (0, -0.0373), (0, -0.0376), (0, -0.0374), (0, -0.0378), (0, -0.038), (0, -0.0379), (0, -0.0377), (0, -0.0382), (0, -0.0384), (0, -0.0383), (0, -0.0386), (0, -0.0381), (0, -0.0387), (0, -0.0389), (0, -0.0385), (0, -0.039), (0, -0.0391), (0, -0.0388), (0, -0.0392), (0, -0.0395), (0, -0.0393), (0, -0.0397), (0, -0.0398), (0, -0.0396), (0, -0.0399), (0, -0.0402), (0, -0.0401), (0, -0.0403), (0, -0.0406), (0, -0.0407)], [(0, 0.0232), (0, 0.0233), (0, 0.0234), (0, 0.0235), (0, 0.0237), (0, 0.0236), (0, 0.0238), (0, 0.0239), (0, 0.024), (0, 0.0241), (0, 0.0242), (0, 0.0243), (0, 0.0244), (0, 0.0245), (0, 0.0247), (0, 0.0248), (0, 0.0246), (0, 0.0249), (0, 0.025), (0, 0.0251), (0, 0.0253), (0, 0.0252), (0, 0.0255), (0, 0.0254), (0, 0.0257), (0, 0.0256), (0, 0.0259), (0, 0.026), (0, 0.0258), (0, 0.0261), (0, 0.0262), (0, 0.0264), (0, 0.0265), (0, 0.0263), (0, 0.0267), (0, 0.0268), (0, 0.0266), (0, 0.027), (0, 0.0269), (0, 0.0271), (0, 0.0272), (0, 0.0274), (0, 0.0273), (0, 0.0276), (0, 0.0275), (0, 0.0277), (0, 0.0278), (0, 0.0279), (0, 0.0281), (0, 0.0282), (0, 0.0283), (0, 0.0284), (0, 0.0285), (0, 0.0286), (0, 0.0287), (0, 0.0288), (0, 0.0289), (0, 0.029), (0, 0.0291), (0, 0.0292), (0, 0.0293), (0, 0.0294)], [(0, 0.011), (0, 0.0112), (0, 0.0113), (0, 0.0111), (0, 0.0115), (0, 0.0114), (0, 0.0117), (0, 0.0116), (0, 0.0118), (0, 0.0119), (0, 0.0121), (0, 0.0122), (0, 0.0123), (0, 0.0124), (0, 0.012), (0, 0.0126), (0, 0.0125), (0, 0.0128), (0, 0.0127), (0, 0.013), (0, 0.0129), (0, 0.0131), (0, 0.0133), (0, 0.0132), (0, 0.0135), (0, 0.0134), (0, 0.0136), (0, 0.0137), (0, 0.0138), (0, 0.014), (0, 0.0139), (0, 0.0142), (0, 0.0141), (0, 0.0143), (0, 0.0144), (0, 0.0145), (0, 0.0146), (0, 0.0147), (0, 0.0148), (0, 0.0149), (0, 0.015), (0, 0.0151), (0, 0.0153), (0, 0.0152), (0, 0.0154), (0, 0.0155), (0, 0.0156), (0, 0.0157), (0, 0.0158), (0, 0.0159), (0, 0.016), (0, 0.0161), (0, 0.0162), (0, 0.0163), (0, 0.0164), (0, 0.0165), (0, 0.0166), (0, 0.0167), (0, 0.0168), (0, 0.0169)], [(0, -0.007), (0, -0.0071), (0, -0.0072), (0, -0.0073), (0, -0.0074), (0, -0.0075), (0, -0.0076), (0, -0.0077), (0, -0.0078), (0, -0.0079), (0, -0.0081), (0, -0.008), (0, -0.0082), (0, -0.0083), (0, -0.0084), (0, -0.0085), (0, -0.0086), (0, -0.0087), (0, -0.0088), (0, -0.0089), (0, -0.009), (0, -0.0091), (0, -0.0092), (0, -0.0093), (0, -0.0094), (0, -0.0095), (0, -0.0096), (0, -0.0097), (0, -0.0098), (0, -0.0099), (0, -0.01), (0, -0.0101), (0, -0.0102), (0, -0.0103), (0, -0.0104), (0, -0.0105), (0, -0.0106), (0, -0.0107), (0, -0.0108), (0, -0.0109), (0, -0.011), (0, -0.0111), (0, -0.0112), (0, -0.0113), (0, -0.0114), (0, -0.0115), (0, -0.0116), (0, -0.0117), (0, -0.0118), (0, -0.0119), (0, -0.012), (0, -0.0121), (0, -0.0122), (0, -0.0123), (0, -0.0124), (0, -0.0125), (0, -0.0126), (0, -0.0127), (0, -0.0128), (0, -0.0129), (0, -0.013), (0, -0.0131)], [(0, 0.0051), (0, 0.0052), (0, 0.0053), (0, 0.0055), (0, 0.0054), (0, 0.0057), (0, 0.0056), (0, 0.0059), (0, 0.0058), (0, 0.0061), (0, 0.006), (0, 0.0062), (0, 0.0063), (0, 0.0064), (0, 0.0065), (0, 0.0066), (0, 0.0068), (0, 0.0069), (0, 0.0067), (0, 0.007), (0, 0.0072), (0, 0.0071), (0, 0.0073), (0, 0.0074), (0, 0.0075), (0, 0.0076), (0, 0.0077), (0, 0.0078), (0, 0.0079), (0, 0.008), (0, 0.0081), (0, 0.0082), (0, 0.0083), (0, 0.0084), (0, 
0.0085), (0, 0.0086), (0, 0.0087), (0, 0.0088), (0, 0.0089), (0, 0.009), (0, 0.0091), (0, 0.0092), (0, 0.0093), (0, 0.0094), (0, 0.0095), (0, 0.0096), (0, 0.0097), (0, 0.0098), (0, 0.0099), (0, 0.01), (0, 0.0101), (0, 0.0102), (0, 0.0103), (0, 0.0104), (0, 0.0105), (0, 0.0106), (0, 0.0107), (0, 0.0108), (0, 0.0109)], [(0, 0.0369), (0, 0.0371), (0, 0.0367), (0, 0.037), (0, 0.0375), (0, 0.0373), (0, 0.0376), (0, 0.0372), (0, 0.0377), (0, 0.038), (0, 0.0379), (0, 0.0374), (0, 0.0381), (0, 0.0382), (0, 0.0378), (0, 0.0384), (0, 0.0386), (0, 0.0387), (0, 0.0385), (0, 0.0389), (0, 0.0391), (0, 0.039), (0, 0.0392), (0, 0.0394), (0, 0.0395), (0, 0.0396), (0, 0.0398), (0, 0.0399), (0, 0.04), (0, 0.0401), (0, 0.0404), (0, 0.0405), (0, 0.0406), (0, 0.0407), (0, 0.0408), (0, 0.0409), (0, 0.041), (0, 0.0411), (0, 0.0412), (0, 0.0414), (0, 0.0415), (0, 0.0416), (0, 0.0417), (0, 0.0419), (0, 0.042), (0, 0.0421), (0, 0.0422), (0, 0.0426), (0, 0.0428), (0, 0.0427), (0, 0.043), (0, 0.0429), (0, 0.0431), (0, 0.0433), (0, 0.0434), (0, 0.0435), (0, 0.0436), (0, 0.0438), (0, 0.0437), (0, 0.044), (0, 0.0442), (0, 0.0444)]]
        namedCluster = {}
        i = 0
        orderCluster = {}
        for clu in cluster:
            i = i + 1
            namedCluster[i] = clu
            orderCluster[i] = [min(clu)[1],max(clu)[1]] 
        
        "The number of rows to be committed for each interval"
        committedInterval=0
        for row in rows:
            for nc in namedCluster:
                if (0,row[5]) in namedCluster[nc]:
                    newRow = list(row)
                    newRow.append(nc)
                    "update the trend type into Database"
                    UpdateEnrichedData(con, committedInterval, newRow)
                    finalClusterRecord.append(newRow)
        con.commit() 
        finalOrderCluster[stock] = orderCluster
        print stock, " Done"
        
    "Write the type range into a file"
    trendRangeFile = common.get_configuration("model", "TREND_RANGE_FILE")
    dataStr = json.dumps(finalOrderCluster)
    with open(trendRangeFile,"w") as output:
        output.write(dataStr)
    
    "Write the training data into file"
    trendSetRecordFile = common.get_configuration("training", "TRAINING_TREND_RECORDS")
    dataStr = json.dumps(finalClusterRecord)
    with open(trendSetRecordFile,"w") as output:
        output.write(dataStr)
    
    if con:
        con.close()
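
KMeansClustering here appears to be the one from the python-cluster package; a minimal standalone sketch of the same call pattern, with made-up change percents (the constant 0 first coordinate just turns each value into a 2-D point the library accepts):

from cluster import KMeansClustering

changes = [(0, 0.0012), (0, 0.0015), (0, -0.0281), (0, -0.0262), (0, 0.0861)]
c1 = KMeansClustering(changes)
for clu in c1.getclusters(3):
    print [x[1] for x in clu]   # change percents grouped into one trend type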
Example #10
def create_vocabulary(feature_num=150):
    "Read the Negative Finance Dictionary"
    negativeFilePath = common.get_configuration("training", "NEGATIVE_DIC")
    negKeywords = json.load(open(negativeFilePath))

    "Read the Positive Finance Dictionary"
    positiveFilePath = common.get_configuration("training", "POSITIVE_DIC")
    posiKeyWords = json.load(open(positiveFilePath))

    "Read the archived news to count the top words"
    BBNewsPath = common.get_configuration("training", "TRAINING_NEWS_FILE")

    keyWords = []
    for w in negKeywords:
        keyWords.append(w)

    for w in posiKeyWords:
        keyWords.append(w)

    print "Over Here"

    wordFreq = {}
    flatCount = 0
    for line in open(BBNewsPath, "r"):
        news = json.loads(line)
        flatCount = flatCount + 1
        fdist = news["content"]
        for word in keyWords:
            if word in fdist:
                if word in wordFreq:
                    wordFreq[word] = wordFreq[word] + fdist[word]
                else:
                    wordFreq[word] = fdist[word]

    # sorted_obj2 = wordFreq.iteritems()
    sorted_obj2 = sorted(wordFreq.items(), key=lambda x: x[1], reverse=True)
    print sorted_obj2
    "Write the vocabulary list to File"
    vocabularyFile = common.get_configuration("training", "VOCABULARY_FILE")
    output = open(vocabularyFile, "w")
    result_word_list = []
    i = 1
    for word in sorted_obj2:
        if i > feature_num:
            break
        else:
            result_word_list.append(word[0])
            i = i + 1
    output.write(json.dumps(result_word_list))
    output.flush()
    output.close()
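
Note that create_vocabulary reads TRAINING_NEWS_FILE one JSON object per line and treats each article's "content" as a term-frequency mapping, apparently matching the line-per-article writer in Example #13 rather than the single-object variants above. A hypothetical input line:

# {"postDate": "2011-03-02", "content": {"crisis": 3, "default": 1, "rally": 2}}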
Example #11
def group_news_by_cluster():
    "Load the Traing news File"
    trainingNewsFile = common.get_configuration("training", "TRAINING_NEWS_FILE")
    articles = json.load(open(trainingNewsFile))
    finalStockClusterNews = {}
    "Iterately read the news"
    for index in articles:
        indexNews = articles[index]
        dayNews = {}
        #group the news by date
        for articleId in indexNews:
            day = articleId[0:8]
            if day not in dayNews:
                dayNews[day] = []
            dayNews[day].append(indexNews[articleId])
        
    
        #read the day cluster file to group the date
        clusterDays = {}
        trendFilePath = common.get_configuration("training", "TRAINING_TREND_RECORDS")
        trendFile = open(trendFilePath)
        trendJson = json.load(trendFile)
        for trend in trendJson:
            if index == trend[6]:
                cluster = trend[7]
                structDate = time.strptime(trend[1],"%Y-%m-%d")
                dtDay = datetime(structDate[0],structDate[1],structDate[2])
                for i in range(1,4):
                    day = dtDay - timedelta(days=i)
                    dayStr = day.strftime("%Y%m%d")
                    if cluster not in clusterDays:
                        clusterDays[cluster] = []
                    if dayStr not in clusterDays[cluster]:
                        clusterDays[cluster].append(dayStr)
    
        clusterNews = []
        for cluster in clusterDays:
            cNews = {}
            cNews["cluster"] = cluster;
            docs = []
            for day in clusterDays[cluster]:
                if day in dayNews:
                    for doc in dayNews[day]:
                        docs.append(doc)
            cNews["articles"] =  docs;
            clusterNews.append(cNews)
        finalStockClusterNews[index] = clusterNews
           
    with open("D:/groupByCluster.json","w") as ot:
        ot.write(json.dumps(finalStockClusterNews))
    return finalStockClusterNews
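
Both variants of group_news_by_cluster return the same shape, sketched here with placeholder values:

# finalStockClusterNews, roughly:
# {
#     "MERVAL": [
#         {"cluster": 3, "articles": [{...article json...}, ...]},
#         {"cluster": 7, "articles": [...]},
#     ],
#     ...
# }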
Example #12
def group_news_by_cluster():
    "Load the Traing news File"
    trainingNewsFile = common.get_configuration("model", "TRAINING_NEWS_FILE")
    articles = json.load(open(trainingNewsFile))
    finalStockClusterNews = {}
    "Iterately read the news"
    for index in articles:
        indexNews = articles[index]
        dayNews = {}
        #group the news by date
        for articleId in indexNews:
            day = articleId[0:8]
            if day not in dayNews:
                dayNews[day] = []
            dayNews[day].append(indexNews[articleId])

        #read the day cluster file to group the date
        clusterDays = {}
        trendFilePath = common.get_configuration("model",
                                                 "TRAINING_TREND_RECORDS")
        trendFile = open(trendFilePath)
        trendJson = json.load(trendFile)
        for trend in trendJson:
            if index == trend[6]:
                cluster = trend[7]
                structDate = time.strptime(trend[2], "%Y-%m-%d")
                dtDay = datetime(structDate[0], structDate[1], structDate[2])
                for i in range(1, 4):
                    day = dtDay - timedelta(days=i)
                    dayStr = day.strftime("%Y%m%d")
                    if cluster not in clusterDays:
                        clusterDays[cluster] = []
                    if dayStr not in clusterDays[cluster]:
                        clusterDays[cluster].append(dayStr)

        clusterNews = []
        for cluster in clusterDays:
            cNews = {}
            cNews["cluster"] = cluster
            docs = []
            for day in clusterDays[cluster]:
                if day in dayNews:
                    for doc in dayNews[day]:
                        docs.append(doc)
            cNews["articles"] = docs
            clusterNews.append(cNews)
        finalStockClusterNews[index] = clusterNews

    return finalStockClusterNews
Example #13
def divide_archived_news(trainingStart, trainingEnd, estimationStart, estimationEnd):
    archivedNewsPath = common.get_configuration("training", "GROUP_STOCK_NEWS")
    
    timelineBegin = time.strptime(trainingStart, "%Y-%m-%d")
    timelineEnd = time.strptime(trainingEnd, "%Y-%m-%d")
    
    "Write Training data and Test Data to File"
    trainingFilePath = common.get_configuration("training", "TRAINING_NEWS_FILE")
    with open(trainingFilePath,"w") as output:
        for line in open(archivedNewsPath,"r"):
            news = json.loads(line)
            post_date = time.strptime(news["postDate"],"%Y-%m-%d")
            if post_date <= timelineEnd and post_date >= timelineBegin:
                output.write(json.dumps(news))
                output.write("\n")
Example #14
def group_daily_articles():

    stockArticles = {}

    archiveDir = common.get_configuration("model", "ARCHIVE_NEWS_DIR")
    dailyFileNames = os.listdir(archiveDir)
    matchRule = create_match_rule()
    pattern = re.compile(matchRule, re.I)

    "Construct company-stock object"
    comListFile = common.get_configuration("model", "COMPANY_LIST")
    comList = json.load(open(comListFile))
    comStock = {}
    for stock in comList:
        for company in comList[stock]:
            comStock[company.strip()] = stock

    i = 0
    print "StartTime: ", datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S")
    for dailyFile in dailyFileNames:
        dailyNews = json.load(open(archiveDir + "/" + dailyFile),
                              encoding='ISO-8859-1')
        for news in dailyNews:
            content = news["content"]
            matchedList = pattern.findall(content)
            matchedGroup = []
            if matchedList:
                i = i + 1
                for item in matchedList:
                    matchedGroup.append(item)
            matchedGroup = {}.fromkeys(matchedGroup).keys()

            "Group the news to matched stock"
            for item in matchedGroup:
                item = item.strip()
                if item in comStock:
                    stockIndex = comStock[item]
                    if stockIndex not in stockArticles:
                        stockArticles[stockIndex] = {}
                    articleId = news["articleId"]
                    stockArticles[stockIndex][articleId] = news
    print i

    "Write the grouped articles to file"
    groupedFile = common.get_configuration("model", "GROUP_STOCK_NEWS")
    with open(groupedFile, "w") as output:
        output.write(json.dumps(stockArticles))
    print "EndTime: ", datetime.strftime(datetime.now(), "%Y-%m-%d %H:%M:%S")
Example #15
def group_daily_articles(rule_name):
    
    stockArticles = {}
    
    archiveDir = common.get_configuration("training", "ARCHIVE_NEWS_DIR")
    dailyFileNames = os.listdir(archiveDir)
    matchRule = create_match_rule(rule_name)
    pattern = re.compile(matchRule,re.I)
    print matchRule
    "Construct company-stock object"
    comListFile = common.get_configuration("training", rule_name)
    comList = json.load(open(comListFile))
    comStock = {}
    for stock in comList:
        for company in comList[stock]:
            comStock[company.strip().lower()] = stock
            
    i = 0
    print "StartTime: ",datetime.strftime(datetime.now(),"%Y-%m-%d %H:%M:%S")
    for dailyFile in dailyFileNames:
        dailyNews = json.load(open(archiveDir+ "/" + dailyFile),encoding='ISO-8859-1')
        for news in dailyNews:
            content = news["content"].lower()
#            print content
            matchedList = pattern.findall(content)
            matchedGroup = []
            if matchedList:
                i = i + 1
                for item in matchedList:
                    matchedGroup.append(item)
            matchedGroup = {}.fromkeys(matchedGroup).keys()
            
            "Group the news to matched stock"
            for item in matchedGroup:
                item = item.strip()
                if item in comStock:
                    stockIndex = comStock[item]
                    if stockIndex not in stockArticles:
                        stockArticles[stockIndex] = {}
                    articleId = news["articleId"]
                    stockArticles[stockIndex][articleId] = news
    print i
    
    "Write the grouped articles to file"
    groupedFile = common.get_configuration("training","GROUP_STOCK_NEWS")
    with open(groupedFile,"w") as output:
        output.write(json.dumps(stockArticles))
    print "EndTime: ",datetime.strftime(datetime.now(),"%Y-%m-%d %H:%M:%S")   
Example #16
def compute_term_contribution(days_back):
    "Read the Vocabulary File"
    vocabularyFilePath = common.get_configuration("training", "VOCABULARY_FILE")
    vocaList = json.load(open(vocabularyFilePath,"r"))
    
    print "StartTime: ",datetime.strftime(datetime.now(),"%Y-%m-%d %H:%M:%S")
    
    finalWordContribution = {}
    "Iteratively to access each Stock Index"
    trainingFile = group_news_by_cluster(days_back)
    
    print "Finish Group news by cluster"
    
    for index in trainingFile:
        stockNews = trainingFile[index]
        wordContribution = {}
        for cluster in stockNews:
            #computing the words count in each cluster
            print "Start Cluster ", cluster["cluster"], "For Stock ",index, "at ",datetime.strftime(datetime.now(),"%Y-%m-%d %H:%M:%S")
            articles = cluster["articles"]
            #initiate the wordFreq
            wordFreq = {}
            for term in vocaList:
                wordFreq[term] = 0
            for article in articles:
                fdist = article["content"]
                for term in wordFreq:
                    if term in fdist:
                        wordFreq[term] = wordFreq[term] + fdist[term]
            #computing the word contribution
            count = sum(wordFreq.values())
            contributions = {}
            for term in wordFreq:
                contribution = round(1.0*(wordFreq[term]+1)/(count + len(wordFreq)),4)
                contributions[term] = contribution
            
            # add the contributions to each cluster
            wordContribution[cluster["cluster"]] = contributions
            print "Finish Cluster ", cluster["cluster"], "For Stock ",index, "at ",datetime.strftime(datetime.now(),"%Y-%m-%d %H:%M:%S")
    
        finalWordContribution[index] = wordContribution    
    print "EndTime: ",datetime.strftime(datetime.now(),"%Y-%m-%d %H:%M:%S")
    
    "Write the Term Contribution To File"
    termContributionFile = common.get_configuration("model", "TERM_CONTRIBUTION_PATH")
    jsString = json.dumps(finalWordContribution)
    with open(termContributionFile,"w") as output:
        output.write(jsString)
Example #17
def import_news_to_database():
    try:
        historyNews = open(
            common.get_configuration("model", 'GROUP_STOCK_NEWS'))
        historyNewsJson = json.load(historyNews)

        for stockIndex in historyNewsJson:
            for article in historyNewsJson[stockIndex].values():
                news = {}
                news["title"] = article["title"]
                news["author"] = article["author"]
                postTime = article["postTime"].split(".")[0]
                postTime = datetime.strptime(postTime, "%Y-%m-%d %H:%M:%S")
                news["post_time"] = postTime
                news["post_date"] = postTime.date()
                news["content"] = article["content"]
                news["stock_index"] = stockIndex
                news["source"] = "Bloomberg News"
                news["update_time"] = article["queryTime"]
                news["newsUrl"] = article["newsUrl"]
                embersId = hashlib.sha1(article["content"]).hexdigest()
                news["embers_id"] = embersId
                ifExisted = bns.check_article_existed(news)
                if not ifExisted:
                    bns.insert_news(news)
                    "Insert into Mission process"
                    bns.insert_news_mission(news)
        bns.close_db_connection()
    except lite.Error, e:
        print "Error: %s" % e.args[0]
Example #18
def import_news_to_database():
    try:
        historyNews = open(common.get_configuration("model", "GROUP_STOCK_NEWS"))
        historyNewsJson = json.load(historyNews)

        for stockIndex in historyNewsJson:
            for article in historyNewsJson[stockIndex].values():
                news = {}
                news["title"] = article["title"]
                news["author"] = article["author"]
                postTime = article["postTime"].split(".")[0]
                postTime = datetime.strptime(postTime, "%Y-%m-%d %H:%M:%S")
                news["post_time"] = postTime
                news["post_date"] = postTime.date()
                news["content"] = article["content"]
                news["stock_index"] = stockIndex
                news["source"] = "Bloomberg News"
                news["update_time"] = article["queryTime"]
                news["newsUrl"] = article["newsUrl"]
                embersId = hashlib.sha1(article["content"]).hexdigest()
                news["embers_id"] = embersId
                ifExisted = bns.check_article_existed(news)
                if not ifExisted:
                    bns.insert_news(news)
                    "Insert into Mission process"
                    bns.insert_news_mission(news)
        bns.close_db_connection()
    except lite.Error, e:
        print "Error: %s" % e.args[0]
Example #19
def import_news_to_database():
    try:
        global con
        init()
        historyNews = open(common.get_configuration( "training", 'GROUP_STOCK_NEWS'))
        historyNewsJson = json.load(historyNews)
        i = 0
        for stockIndex in historyNewsJson:
            for article in historyNewsJson[stockIndex].values():
                news = {}
                news["title"] = article["title"]
                news["author"] = article["author"]
                postTime = article["postTime"].split(".")[0]
                postTime = datetime.strptime(postTime,"%Y-%m-%d %H:%M:%S")
                news["postTime"] = postTime
                news["postDate"] = postTime.date()
                news["content"] = article["content"]
                news["stockIndex"] = stockIndex
                news["source"] = "Bloomberg News"
                news["updateTime"] = article["queryTime"]
                news["url"] = article["newsUrl"]
                embersId = hashlib.sha1(article["content"]).hexdigest()
                news["embersId"] = embersId
                ifExisted = check_article_existed(news)
                if not ifExisted:
                    insert_news(news)
                    "Insert into Mission process"
                    insert_news_mission(news)
                i = i + 1
                if i % 1000 == 0:
                    con.commit()
        con.commit()
    except lite.Error, e:
        print "Error: %s" % e.args[0]
Example #20
def get_trend_type(rawIndexData):
    """
    Computing current day's trend type, compareing change percent to the trend range,
    Choose the nearnes trend as current day's type    
    """
    "Load the trend type range file"
    rangeFilePath = common.get_configuration("model", "TREND_RANGE_FILE")
    tFile = open(rangeFilePath)
    trendsJson = json.load(tFile)
    
    "Get the indicated stock range"
    stockIndex = rawIndexData["stockIndex"]
    tJson = trendsJson[stockIndex]
    print tJson
    
    "Computing change percent"
    lastPrice = float(rawIndexData["currentValue"])
    preLastPrice = float(rawIndexData["previousCloseValue"])
    changePercent = round((lastPrice - preLastPrice)/preLastPrice,4)
    
    distance = 10000
    trendType = None
    for trend in tJson:
        tmpDistance = min(abs(changePercent - tJson[trend][0]),
                          abs(changePercent - tJson[trend][1]))
        if tmpDistance < distance:
            distance = tmpDistance
            trendType = trend
    return trendType
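
A worked example of the nearest-range rule with made-up ranges:

# Suppose tJson = {"1": [-0.005, 0.005], "2": [0.006, 0.02]} and today's
# changePercent is 0.004:
#   type "1": min(|0.004 - (-0.005)|, |0.004 - 0.005|) = 0.001
#   type "2": min(|0.004 - 0.006|,  |0.004 - 0.02|)    = 0.002
# so get_trend_type returns "1", the type whose range edge lies closest.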
Example #21
    def compute_stock_news_probability(self, predictiveDate, clusterType,
                                       stockIndex):
        try:
            termContributionFile = open(
                common.get_configuration("model", 'TERM_CONTRIBUTION_PATH'))
            termContributionJson = json.load(termContributionFile)
            terms, newsDerived = self.get_stock_news_data(
                predictiveDate, stockIndex)
            termContributionProbability = 0
            if stockIndex in termContributionJson:
                for termClusterType in termContributionJson[stockIndex].keys():
                    if termClusterType == str(clusterType):
                        stermlist = termContributionJson[stockIndex][
                            termClusterType]
                        #print stermlist
                        for word, count in terms.iteritems():
                            if word in stermlist:
                                #print word
                                # accumulate (+=) each matched term's weighted
                                # log contribution instead of overwriting it
                                termContributionProbability += count * math.log(
                                    float(termContributionJson[stockIndex]
                                          [termClusterType][word]))
                                del stermlist[word]

            return termContributionProbability, newsDerived
        except IOError:
            print "Can't open the file:stock_raw_data.json."
        except Exception as e:
            print traceback.format_exc()
            print "Error in computing stock news probability: %s" % e.message
        return None
Example #22
def clusterSet(trainingStart, trainingEndDate):
    con = common.getDBConnection()
    cur = con.cursor()
    
    finalClusterRecord = []
    stockList = ["MERVAL","MEXBOL","CHILE65","BVPSBVPS","COLCAP","CRSMBCT","IBOV","IGBVL","IBVC"]
    for stock in stockList:
        sql = "select embers_id,post_date,current_value,previous_close_value,round(current_value-previous_close_value,4),round((current_value-previous_close_value)/previous_close_value,4),name from t_bloomberg_prices where name=? and post_date<=? and post_date>=? order by post_date asc"
        cur.execute(sql, (stock, trainingEndDate, trainingStart))
        rows = cur.fetchall()
        "The number of rows to be committed for each interval"
        committedInterval = 0
        
        for row in rows:
            newRow = list(row)
            "Insert the pre-enriched stock index data into Database"
            InitiateEnrichedData(con, committedInterval, newRow)
            finalClusterRecord.append(newRow)
        con.commit()
    "Write the training data into file"
    trendSetRecordFile = common.get_configuration("training", "TRAINING_TREND_RECORDS")
    dataStr = json.dumps(finalClusterRecord)
    with open(trendSetRecordFile,"w") as output:
        output.write(dataStr)
    
    if con:
        con.close()
Example #23
def main():
    vocabularyFile = common.get_configuration("training", "VOCABULARY_FILE")
    key_list = []
    with open(vocabularyFile,"r") as rf:
        lines = rf.readlines()
        for line in lines:
            line = line.strip()
            key_list.append(line)
    
    print key_list
Example #24
 def enumberate_stock_index( self ):
     try:
         clustersFile = open( common.get_configuration( "model", 'CLUSTER_PROBABILITY_PATH' ) )
         clusterJson = json.load( clustersFile )
         stockIndexList = []
         for stockIndex in clusterJson.keys():
             stockIndexList.append( stockIndex )
         return stockIndexList 
     except Exception as e:
         log.info( traceback.format_exc())
         log.info( "Error: %s" % e.args[0])
Example #25
 def enumberate_stock_index(self):
     try:
         clustersFile = open(
             common.get_configuration("model", 'CLUSTER_PROBABILITY_PATH'))
         clusterJson = json.load(clustersFile)
         stockIndexList = []
         for stockIndex in clusterJson.keys():
             stockIndexList.append(stockIndex)
         return stockIndexList
     except Exception as e:
         print traceback.format_exc()
         print "Error: %s" % e.args[0]
Example #26
 def compute_stock_index_probability( self, predictiveDate, clusterType , stockIndex ):
     try:
         "Get the clusters List"
         stockIndexFile = open( common.get_configuration( "model", 'CLUSTER_PROBABILITY_PATH' ) )
         clusterProbability = json.load( stockIndexFile )
         clusterJson = {}
         clusterContributionJson = {}
         clusterJson = clusterProbability[stockIndex]
         "Get the contribution of each cluster"
         clusterContributionFile = open( common.get_configuration( "model", 'CLUSTER_CONTRIBUTION_PATH' ) )
         clusterContributionJson = json.load( clusterContributionFile )
         clusterTypesHistory,stockDerived = self.get_stock_index_cluster( predictiveDate, stockIndex )
         stockIndexProbability = 0
         for key in clusterContributionJson[stockIndex].keys():
             if key == str( clusterType ):
                 "Search from the Cluster contribution Matrix to get the contribution probability"
                  contrib = clusterContributionJson[stockIndex][key]
                  stockIndexProbability = ( stockIndexProbability
                      + math.log( float( contrib[int( clusterTypesHistory[0] ) - 1][2] ) )
                      + math.log( float( contrib[int( clusterTypesHistory[1] ) - 1][1] ) )
                      + math.log( float( contrib[int( clusterTypesHistory[2] ) - 1][0] ) )
                      + math.log( float( clusterJson[str( clusterType )] ) ) )
         
         return stockIndexProbability,stockDerived
     except Exception as e:
         log.info( traceback.format_exc())
         log.info( "Error in computing stock index probability: %s" % e.args)
Example #27
def get_company_list():
    comDir = common.get_configuration("model", "COMPANY_MEMBER")
    sfile = os.listdir(comDir)
    companyList = {}
    for fi in sfile:
        with open(comDir+"/"+fi) as comFile:
            lines = comFile.readlines()
            stockIndex = lines[1].replace("\r","").replace("\n","").split(",")[1].replace(" Index","")
            if stockIndex not in companyList:
                companyList[stockIndex] = []
            for line in lines[2:]:
                infos = line.replace("\r","").replace("\n","").split(",")
                companyName = infos[2]
                tmps = companyName.split(" ")
                # for multi-word names, drop the last word (the company-type suffix)
                companyName = " ".join(tmps[:-1] if len(tmps) > 1 else tmps)
                if companyName not in companyList[stockIndex]:
                    companyList[stockIndex].append(companyName)
            companyList[stockIndex].append(stockIndex)
    
    desFile = common.get_configuration("model", "COMPANY_LIST")
    with open(desFile,"w") as output:
        jsStr = json.dumps(companyList)
        output.write(jsStr)
Example #28
def create_match_rule(rule_name):
    comListFile = common.get_configuration("training", rule_name)
    comList = json.load(open(comListFile))
    rule = "("
    for stock in comList:
        for company in comList[stock]:
            company.replace("\\.","\\\\.")
            "check If the country name only contain one word, then we will add blank before and after the name to avoid the sub matching"
            if company.find(" ") < 0:
                eachRule = " " + company + " " + "|"
            else:
                eachRule = company + "|"
            rule += eachRule
    rule = rule[0:len(rule)-1] + ")"
    return rule.lower()
Example #29
def create_match_rule():
    comListFile = common.get_configuration("model", "COMPANY_LIST")
    comList = json.load(open(comListFile))
    rule = "("
    for stock in comList:
        for company in comList[stock]:
            company.replace("\\.", "\\\\.")
            "check If the company name only contain one word, then we will add blank before and after the name"
            if company.find(" ") < 0:
                eachRule = " " + company + " " + "|"
            else:
                eachRule = company + "|"
            rule += eachRule
    rule = rule[0 : len(rule) - 1] + ")"
    return rule
Example #30
0
def create_match_rule():
    comListFile = common.get_configuration("model", "COMPANY_LIST")
    comList = json.load(open(comListFile))
    rule = "("
    for stock in comList:
        for company in comList[stock]:
            #escape literal dots so the regex treats them as characters, not wildcards
            company = company.replace(".", "\\.")
            "If the company name contains only one word, add a blank before and after it to avoid substring matches"
            if company.find(" ") < 0:
                eachRule = " " + company + " " + "|"
            else:
                eachRule = company + "|"
            rule += eachRule
    rule = rule[0:len(rule) - 1] + ")"
    return rule
def import_history():
    hisFile = common.get_configuration("training", "HISTORICAL_STOCK_JSON")
    raw_price_list = []
    with open(hisFile,'r') as raw_file:
        lines = raw_file.readlines()
        for line in lines:
            raw_data = json.loads(line.replace("\n","").replace("\r",""))
            raw_price_list.append(raw_data)
    conn = common.getDBConnection()
    #process data one by one
    for raw_data in raw_price_list:
        process(conn,raw_data)
    
    if conn:
        conn.commit()
Example #32
0
 def enumberate_clusters( self , stockIndex ):
     try:
         clusterFile = open( common.get_configuration( "model", 'CLUSTER_PROBABILITY_PATH' ) )
         clusterJson = json.load( clusterFile ) 
         clustersList = []
         clusterProbability = {}
         for clusterKey in clusterJson.keys():
             if clusterKey == stockIndex:
                 clusterProbability = clusterJson[clusterKey]
                 break
         for clusterKey in clusterProbability.keys():
             clustersList.append( clusterKey )
         return clustersList 
     except Exception as e:
         log.info( traceback.format_exc())
         log.info( "Error: %s" % e.args)
Example #33
0
 def enumberate_clusters(self, stockIndex):
     try:
         clusterFile = open(
             common.get_configuration("model", 'CLUSTER_PROBABILITY_PATH'))
         clusterJson = json.load(clusterFile)
         clustersList = []
         clusterProbability = {}
         for clusterKey in clusterJson.keys():
             if clusterKey == stockIndex:
                 clusterProbability = clusterJson[clusterKey]
                 break
         for clusterKey in clusterProbability.keys():
             clustersList.append(clusterKey)
         return clustersList
     except Exception as e:
         print traceback.format_exc()
         print "Error: %s" % e.args
Example #34
0
def execute(date,cfgPath):
    init(cfgPath)
    enricheDa = ed.Enriched_Data(cfgPath)
    obj = enricheDa.enrich_all_stock(date)
    warningList = []
    for item in obj:
        warning = warningCheck(item)
        if warning is not None:
            warningList.append(warning) 
    
    #push warning to ZMQ
    port = common.get_configuration("info", "ZMQ_PORT")
    with queue.open(port, 'w', capture=True) as outq:
        for warning in warningList:
            outq.write(json.dumps(warning, encoding='utf8'))    
                
    return warningList   
Example #35
0
def get_trend_type(rawIndexData):
    """
    Computing current day's trend changeType, compareing change percent to the trend range,
    Choose the nearnes trend as current day's changeType    
    """
    "Load the trend changeType range file"
    rangeFilePath = common.get_configuration("model", "TREND_RANGE_FILE")
    tFile = open(rangeFilePath)
    trendsJson = json.load(tFile)
    tFile.close()
    
    "Get the indicated stock range"
    stockIndex = rawIndexData["name"]
    tJson = trendsJson[stockIndex]
    
    "Computing change percent"
    lastPrice = float(rawIndexData["currentValue"])
    preLastPrice = float(rawIndexData["previousCloseValue"])
    changePercent = round((lastPrice - preLastPrice)/preLastPrice,4)
    
    distance = 10000
    trendType = None
    for changeType in tJson:
        tmpDistance = min(abs(changePercent-tJson[changeType][0]),abs(changePercent-tJson[changeType][1]))
        if tmpDistance < distance:
            distance = tmpDistance
            trendType = changeType
            
    #Adjust the trend type's range according to the current change percent
    bottom = tJson[trendType][0]
    top = tJson[trendType][1]
    
    if changePercent > top:
        top = changePercent
    
    if changePercent < bottom:
        bottom = changePercent
    
    trendsJson[stockIndex][trendType][0] = bottom
    trendsJson[stockIndex][trendType][1] = top
    
    with open(rangeFilePath,"w") as rangeFile:
        rangeFile.write(json.dumps(trendsJson))
        
    return trendType
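The selection loop measures the distance from the day's change percent to both endpoints of every trend range and keeps the nearest one; the chosen range is then widened whenever the new value falls outside it. A self-contained sketch of the nearest-range step with two hypothetical ranges:

tJson = {"1": [-0.002, 0.002], "2": [0.01, 0.02]}   # hypothetical trend ranges
changePercent = 0.004

distance = 10000
trendType = None
for changeType in tJson:
    tmpDistance = min(abs(changePercent - tJson[changeType][0]),
                      abs(changePercent - tJson[changeType][1]))
    if tmpDistance < distance:
        distance = tmpDistance
        trendType = changeType
print trendType   # "1": 0.004 is 0.002 from range 1's top but 0.006 from range 2's bottom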
Example #36
0
def dailySigmaTrends(stockIndex,cluster,m30,m90,std30,std90,curValue):
    #compute the bottom and upper lines for a daily sigma event
    s4Bottom = m30 - 4*std30
    s4Upper = m30 + 4*std30
    s3Bottom = m90 - 3*std90
    s3Upper = m90 + 3*std90
    
    bottom = s4Bottom
    upper = s4Upper
    if s4Bottom >= s3Bottom:
        bottom = s3Bottom
    if s3Upper <= s4Upper:
        upper = s3Upper
    #Get the span of the input cluster
    """
    One point to change later: currently we merge the two types of extreme
    into one trend type 7; we need to split type 7 into types 7 and 11.
    """
    trendRangePath = common.get_configuration("model", "TREND_RANGE_FILE")
    clusterDis = json.load(open(trendRangePath))
    #get the span of the input trend type
    cBottom = 0.0
    cUpper = 0.0
    
    clusters = clusterDis[stockIndex]
    for clu in clusters:
        if clu == cluster:
            cBottom = clusters[clu][0] * curValue
            cUpper = clusters[clu][1] * curValue
    
    #If nothing happens, the eventType stays "0000"
    eventType = "0000"
    
    if cBottom <= bottom:
        eventType = "0412"
    if cUpper >= upper:
        eventType = "0411"
    
    #If the predicted trend is an extreme value (type 1 or 6) and the previous
    #day was not an extreme sigma day, predict that the next day will be one
#    if eventType != "0000":
#        print "eventType:%s cBottom: %0.4f, bottom:%0.4f, cUpper:%0.4f, upper:%0.4f" %(eventType,cBottom,bottom,cUpper,upper)
    return eventType,cBottom,cUpper
def get_trend_type(stockIndex,changePercent):
    """
    Compute the current day's trend type by comparing the change percent
    to each trend range and choosing the nearest range's type.
    """
    "Load the trend type range file"
    rangeFilePath = common.get_configuration("model", "TREND_RANGE_FILE")
    tFile = open(rangeFilePath)
    trendsJson = json.load(tFile)
    tFile.close()
    tJson = trendsJson[stockIndex]
    
    distance = 10000
    trendType = None
    for changeType in tJson:
        tmpDistance = min(abs(changePercent-tJson[changeType][0]),abs(changePercent-tJson[changeType][1]))
        if tmpDistance < distance:
            distance = tmpDistance
            trendType = changeType
    return trendType
Example #38
0
def get_trend_type(stockIndex, changePercent):
    """
    Compute the current day's trend type by comparing the change percent
    to each trend range and choosing the nearest range's type.
    """
    "Load the trend type range file"
    rangeFilePath = common.get_configuration("model", "TREND_RANGE_FILE")
    tFile = open(rangeFilePath)
    trendsJson = json.load(tFile)
    tFile.close()
    tJson = trendsJson[stockIndex]

    distance = 10000
    trendType = None
    for changeType in tJson:
        tmpDistance = min(abs(changePercent - tJson[changeType][0]),
                          abs(changePercent - tJson[changeType][1]))
        if tmpDistance < distance:
            distance = tmpDistance
            trendType = changeType
    return trendType
Example #39
0
    def get_stock_news_data(self, predictiveDate, stockIndex):
        con = None
        try:
            con = common.getDBConnection()
            cur = con.cursor()

            "Get past 3 day's news before Predictive Day "
            predictiveDate = datetime.strptime(predictiveDate, "%Y-%m-%d")
            startDay = (predictiveDate -
                        timedelta(days=3)).strftime("%Y-%m-%d")
            endDay = (predictiveDate - timedelta(days=1)).strftime("%Y-%m-%d")
            sqlquery = "select content,embers_id from t_daily_enrichednews where post_date>=? and post_date<=? and stock_index=?"

            cur.execute(sqlquery, ([startDay, endDay, stockIndex]))
            articleRecords = cur.fetchall()

            "Initiate the words List"
            vocabularyFile = open(
                common.get_configuration("model", 'VOCABULARY_FILE'))
            wordLines = vocabularyFile.readlines()
            termList = {}
            for line in wordLines:
                line = line.replace("\n", "").replace("\r", "")
                termList[line] = 0

            newsDerived = []
            "Merge all the term in each record"
            for record in articleRecords:
                jsonRecord = json.loads(record[0])
                newsDerived.append(record[1])
                for curWord in jsonRecord:
                    if curWord in termList:
                        termList[
                            curWord] = termList[curWord] + jsonRecord[curWord]

            return termList, newsDerived
        except sqlite.Error, e:
            print traceback.format_exc()
            print "Error: %s" % e.args[0]
def import_historical_stock():
    #get the historical stock dir
    stockFileDir = common.get_configuration("training", "HISTORICAL_STOCK")
    fileNames = os.listdir(stockFileDir)
    con = common.getDBConnection()
    cur = con.cursor()
    
    #clear the database
    clearSql = "delete from t_daily_stockindices"
    cur.execute(clearSql)
    con.commit()
    
    sql = "insert into t_daily_stockindices (sub_sequence,stock_index,date,last_price,one_day_change) values (?,?,?,?,?)";
    
    for filename in fileNames:
        fpath = stockFileDir + "/" + filename
        stock = filename.split(".")[0]
        subSequence = 0
        with open(fpath,"r") as stockFile:
            lines = stockFile.readlines()[2:]
            for line in lines:
                line = line.replace("\r","").replace("\n","")
                date = line.split(",")[0]
                lastPrice = line.split(",")[1]
                previousLastPrice = line.split(",")[2]
                
                if lastPrice == "#N/A N/A" or previousLastPrice == "#N/A N/A":
                    continue
                
                lastPrice = float(lastPrice)
                previousLastPrice = float(previousLastPrice)
                date = datetime.strptime(date,"%m/%d/%Y").strftime("%Y-%m-%d")
                oneDayChange = round(lastPrice - previousLastPrice,4)
                subSequence = subSequence + 1
                cur.execute(sql,(subSequence,stock,date,lastPrice,oneDayChange,))
                if subSequence % 300 == 0:
                    con.commit()
            con.commit() 
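The loader assumes each HISTORICAL_STOCK file has two header lines followed by comma-separated rows of date (m/d/Y), last price, and previous close, with "#N/A N/A" marking missing quotes. A hedged sketch of the per-row handling on a made-up line:

from datetime import datetime

line = "01/15/2013,1050.25,1042.00"   # hypothetical row: date,last,previous close
date, lastPrice, previousLastPrice = line.split(",")[:3]
if lastPrice != "#N/A N/A" and previousLastPrice != "#N/A N/A":
    lastPrice = float(lastPrice)
    previousLastPrice = float(previousLastPrice)
    date = datetime.strptime(date, "%m/%d/%Y").strftime("%Y-%m-%d")
    oneDayChange = round(lastPrice - previousLastPrice, 4)
    print date, lastPrice, oneDayChange   # 2013-01-15 1050.25 8.25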
def get_trend_type(stockIndex,changePercent):
    """
    Computing current day's trend type, compareing change percent to the trend range,
    Choose the nearnes trend as current day's type    
    """
    "Load the trend type range file"
    rangeFilePath = common.get_configuration("model", "TREND_RANGE_FILE")
    tFile = open(rangeFilePath)
    trendsJson = json.load(tFile)
    tFile.close()
    tJson = trendsJson[stockIndex]
    
    distance = 10000
    trendType = None
    for changeType in tJson:
        tmpDistance = min(abs(changePercent-tJson[changeType][0]),abs(changePercent-tJson[changeType][1]))
        if tmpDistance < distance:
            distance = tmpDistance
            trendType = changeType
            
    #Adjust the trend type's range according to the current change percent
    bottom = tJson[trendType][0]
    top = tJson[trendType][1]
    
    if changePercent > top:
        top = changePercent
    
    if changePercent < bottom:
        bottom = changePercent
    
    trendsJson[stockIndex][trendType][0] = bottom
    trendsJson[stockIndex][trendType][1] = top
    
    with open(rangeFilePath,"w") as rangeFile:
        rangeFile.write(json.dumps(trendsJson))
        
    return trendType 
Example #42
0
 def get_stock_news_data( self, predictiveDate , stockIndex ):
     con = None
     try:
         con = common.getDBConnection()
         cur = con.cursor()
         
         "Get past 3 day's news before Predictive Day "
         predictiveDate = datetime.strptime( predictiveDate, "%Y-%m-%d" )
         startDay = ( predictiveDate - timedelta( days = 3 ) ).strftime( "%Y-%m-%d" )
         endDay = ( predictiveDate - timedelta( days = 1 ) ).strftime( "%Y-%m-%d" )
         sqlquery = "select content,embers_id from t_daily_enrichednews where post_date>=? and post_date<=? and stock_index=?"
         
         cur.execute( sqlquery, ([startDay, endDay , stockIndex]))
         articleRecords = cur.fetchall()
         
         "Initiate the words List"
         vocabularyFile = open(common.get_configuration( "training", 'VOCABULARY_FILE'))
         wordLines = vocabularyFile.readlines()
         termList = {}
         for line in wordLines:
             line = line.replace("\n","").replace("\r","")
             termList[line] = 0
             
         newsDerived = []
         "Merge all the term in each record"
         for record in articleRecords:
             jsonRecord = json.loads(record[0])
             newsDerived.append(record[1])
             for curWord in jsonRecord:
                 if curWord in termList:
                     termList[curWord] = termList[curWord] + jsonRecord[curWord]
         
         return termList,newsDerived
     except sqlite.Error, e:
         log.info( traceback.format_exc())
         log.info( "Error: %s" % e.args[0])
Example #43
0
 def compute_stock_news_probability( self, predictiveDate, clusterType , stockIndex ):
     try:
         termContributionFile = open( common.get_configuration( "model", 'TERM_CONTRIBUTION_PATH' ) )
         termContributionJson = json.load( termContributionFile )
         terms,newsDerived = self.get_stock_news_data( predictiveDate , stockIndex )
         termContributionProbability = 0
         if stockIndex in termContributionJson:
             for termClusterType in termContributionJson[stockIndex].keys():
                 if termClusterType == str( clusterType ):    
                     stermlist = termContributionJson[stockIndex][termClusterType]
                     #print stermlist                            
                     for word, count in terms.iteritems():                    
                         if word in stermlist:
                             #accumulate each matched term's weighted log contribution
                             termContributionProbability = termContributionProbability + count * math.log( float( termContributionJson[stockIndex][termClusterType][word] ) )
                             del stermlist[word]
         
         return termContributionProbability,newsDerived
     except IOError:
         log.info( "Can't open the file:stock_raw_data.json.")
     except Exception as e:
         log.info( traceback.format_exc())
         log.info( "Error in computing stock news probability: %s" % e.message)    
     return None
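With the accumulation fix above, the news score is a bag-of-words log likelihood: the sum over matched vocabulary terms of count times the log of the term's contribution. A minimal sketch with a hypothetical term table:

import math

termContribution = {"crisis": 0.02, "rally": 0.01}   # hypothetical P(term | cluster)
terms = {"crisis": 3, "growth": 1}                   # merged counts from the 3-day news window

score = 0.0
for word, count in terms.iteritems():
    if word in termContribution:
        score += count * math.log(termContribution[word])
print "%0.4f" % score   # 3 * log(0.02) = -11.7360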
Example #44
0
def create_vocabulary():
    "Read the Negative Finance Dictionary"
    negativeFilePath = common.get_configuration("model", "NEGATIVE_DIC")
    negativeDoc = open(negativeFilePath).readlines()
    stemmer = nltk.stem.snowball.SnowballStemmer('english')
    negativeWords = []
    for l in negativeDoc:
        negativeWords.append(stemmer.stem(l.replace("\n", "")))

    fdist = nltk.FreqDist(negativeWords)
    negKeywords = []
    for k in fdist:
        negKeywords.append(k)

    "Read the Positive Finance Dictionary"
    positiveFilePath = common.get_configuration("model", "POSITIVE_DIC")
    positiveDoc = open(positiveFilePath).readlines()
    postiveWords = []
    for line in positiveDoc:
        postiveWords.append(stemmer.stem(line.replace("\n", "")))

    fdist = nltk.FreqDist(postiveWords)
    posiKeyWords = []
    for posWord in fdist:
        posiKeyWords.append(posWord)

    "Read the archived news to count the top words"
    BBNewsPath = common.get_configuration("model", "TRAINING_NEWS_FILE")
    news = open(BBNewsPath)
    jsonNews = json.load(news)
    #remove all the duplicated articles
    newsWarehouse = {}
    for stockIndex in jsonNews:
        for articleId in jsonNews[stockIndex]:
            newsWarehouse[articleId] = jsonNews[stockIndex][articleId]

    keyWords = []
    for w in negKeywords:
        keyWords.append(w)

    for w in posiKeyWords:
        keyWords.append(w)

    print "Over Here"

    wordFreq = {}
    flatCount = 0
    for news in newsWarehouse:
        flatCount = flatCount + 1
        doc = newsWarehouse[news]
        #print doc
        tokens = nltk.word_tokenize(doc["content"])
        stemmer = nltk.stem.snowball.SnowballStemmer('english')
        words = [
            w.lower() for w in tokens if w not in
            [",", ".", ")", "]", "(", "[", "*", ";", "...", ":", "&", '"']
            and not w.isdigit()
        ]
        words = [
            w for w in words
            if w.encode("utf8") not in nltk.corpus.stopwords.words('english')
        ]
        stemmedWords = [stemmer.stem(w) for w in words]
        fdist = nltk.FreqDist(stemmedWords)
        for word in keyWords:
            if word in fdist:
                if word in wordFreq:
                    wordFreq[word] = wordFreq[word] + fdist[word]
                else:
                    wordFreq[word] = fdist[word]

    print wordFreq

    #sorted_obj2 = wordFreq.iteritems()
    sorted_obj2 = sorted(wordFreq.items(), key=lambda x: x[1], reverse=True)
    print sorted_obj2[0][1]

    "Write the vocabulary list to File"
    vocabularyFile = common.get_configuration("model", "VOCABULARY_FILE")
    output = open(vocabularyFile, "w")
    i = 1
    for word in sorted_obj2:
        if i > 150:
            break
        else:
            output.write(word[0])
            output.write("\n")
            i = i + 1

    output.close()
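The per-article counting above boils down to tokenize, filter, stem, and count. A condensed sketch of that pipeline on one sentence (requires NLTK's punkt and stopwords data to be downloaded):

import nltk

stemmer = nltk.stem.snowball.SnowballStemmer('english')
text = "Shares slumped as losses deepened and analysts warned of further declines."
tokens = nltk.word_tokenize(text)
words = [w.lower() for w in tokens if w.isalpha()]
words = [w for w in words if w not in nltk.corpus.stopwords.words('english')]
print nltk.FreqDist([stemmer.stem(w) for w in words]).items()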
Example #45
0
def clusterSet(traningEndDate):
    con = common.getDBConnection()
    cur = con.cursor()

    finalClusterRecord = []
    stockList = [
        "MERVAL", "MEXBOL", "CHILE65", "BVPSBVPS", "COLCAP", "CRSMBCT", "IBOV",
        "IGBVL"
    ]
    finalOrderCluster = {}
    for stock in stockList:
        sql = "select embers_id,sub_sequence,date,last_price,one_day_change,round(one_day_change/(last_price-one_day_change),4),stock_index from t_daily_stockindex where stock_index=? and date<=?"
        cur.execute(sql, (stock, traningEndDate))
        rows = cur.fetchall()
        changes = [row[5] for row in rows]
        fdist = nltk.FreqDist(changes)
        clusterS = [(0, x) for x in fdist.keys()]

        print "StartTime: ", datetime.strftime(datetime.now(),
                                               "%Y-%m-%d %H:%M:%S")
        c1 = KMeansClustering(clusterS)
        print "MiddleTime: ", datetime.strftime(datetime.now(),
                                                "%Y-%m-%d %H:%M:%S")
        cluster = c1.getclusters(20)
#        cluster = [...]  # optional hard-coded cluster result for skipping the slow getclusters(20) call
        print "EndTime: ", datetime.strftime(datetime.now(),
                                             "%Y-%m-%d %H:%M:%S")
        namedCluster = {}
        i = 0
        orderCluster = {}
        for clu in cluster:
            i = i + 1
            namedCluster[i] = clu
            orderCluster[i] = [min(clu)[1], max(clu)[1]]

        for m in orderCluster:
            min1 = orderCluster[m][0]
            max1 = orderCluster[m][1]
            for n in orderCluster:
                min2 = orderCluster[n][0]
                max2 = orderCluster[n][1]
                if (min1 > min2 and min1 < max2) or (max1 > min2
                                                     and max1 < max2):
                    print m, " intersect with ", n, " values: ", min1, max1, min2, max2

        clusterR = []
        for row in rows:
            for nc in namedCluster:
                if (0, row[5]) in namedCluster[nc]:
                    newRow = list(row)
                    newRow.append(nc)
                    clusterR.append(newRow)
                    finalClusterRecord.append(newRow)

        #insert the clusterR into Database
        insertSql = "insert into t_daily_enrichedIndex (embers_id,derived_from,sub_sequence,stock_index,date,last_price,one_day_change,change_percent,trend_type)values (?,?,?,?,?,?,?,?,?)"
        m = 0
        for j in clusterR:
            contentStr = json.dumps(j)
            embersId = hashlib.sha1(contentStr).hexdigest()
            derivedFrom = "[" + str(j[0]) + "]"
            subsequenceId = j[1]
            postDate = j[2]
            lastPrice = j[3]
            oneDayChange = j[4]
            changePercent = j[5]
            stockIndex = j[6]
            trendType = j[7]
            cur.execute(
                insertSql,
                (embersId, derivedFrom, subsequenceId, stockIndex, postDate,
                 lastPrice, oneDayChange, changePercent, trendType))
            m = m + 1
            if m % 1000 == 0:
                con.commit()
        con.commit()
        finalOrderCluster[stock] = orderCluster

    "Write the type range into a file"
    trendRangeFile = common.get_configuration("model", "TREND_RANGE_FILE")
    dataStr = json.dumps(finalOrderCluster)
    with open(trendRangeFile, "w") as output:
        output.write(dataStr)

    "Write the training data into file"
    trendSetRecordFile = common.get_configuration("model",
                                                  "TRAINING_TREND_RECORDS")
    dataStr = json.dumps(finalClusterRecord)
    with open(trendSetRecordFile, "w") as output:
        output.write(dataStr)

    if con:
        con.close()
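KMeansClustering here appears to come from the python-cluster package: it takes a list of points (2-tuples whose first coordinate is a constant 0, so only the change percent drives the distance) and getclusters(n) returns n lists of points. A tiny standalone sketch, assuming that package is installed:

from cluster import KMeansClustering

points = [(0, -0.021), (0, -0.019), (0, 0.0), (0, 0.001), (0, 0.024), (0, 0.026)]
cl = KMeansClustering(points)
for clu in cl.getclusters(3):
    print [min(clu)[1], max(clu)[1]]   # the span of each cluster, as stored in TREND_RANGE_FILE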
Example #46
0
    obj = enricheDa.enrich_all_stock(date)
    warningList = []
    for item in obj:
        warning = warningCheck(item)
        if warning is not None:
            warningList.append(warning)
    return warningList


if __name__ == "__main__":
    if len(sys.argv) == 3:
        startDay = sys.argv[1]
        endDay = sys.argv[2]
        startD = datetime.strptime(startDay, "%Y-%m-%d")
        endD = datetime.strptime(endDay, "%Y-%m-%d")
        resultFile = common.get_configuration("model", "TESTING_RESULT_FILE")
        warningResult = open(resultFile, "w")
        while startD <= endD:
            predictiveDay = datetime.strftime(startD, "%Y-%m-%d")
            warningList = execute(predictiveDay)
            if warningList is not None:
                for warning in warningList:
                    warningResult.write(json.dumps(warning))
                    warningResult.write("\n")
            startD = startD + timedelta(days=1)
        warningResult.close()
    elif len(sys.argv) == 2:
        "The imput date format should be 'yyyy-mm-dd'"
        predictiveDay = sys.argv[1]
        warningList = execute(predictiveDay)
    elif len(sys.argv) == 1:
Example #47
0
def compute_trend_contribution():
    #read the trend segments file
    trendFileName = common.get_configuration("model", "TRAINING_TREND_RECORDS")
    trendFile = open(trendFileName)
    jsonTrend = json.load(trendFile)
    
    #Group Trend By StockIndex
    stockGroupTrend = {}
    for trend in jsonTrend:
        stockIndex = trend[6]
        if stockIndex not in stockGroupTrend:
            stockGroupTrend[stockIndex] = []
        stockGroupTrend[stockIndex].append(trend)
    
    for item in stockGroupTrend:
        #order records chronologically (ISO date at index 2) before reading off the trend-type series
        stockGroupTrend[item].sort(key=lambda w: w[2])
        stockGroupTrend[item] = [w[7] for w in stockGroupTrend[item]]
    
    finalClusterMatrix = {}
    finalClusterProbability = {}
    for item in stockGroupTrend:
        #compute each trend type's prior probability from its frequency in the series
        trendsSerial = stockGroupTrend[item]
        clusterDist = nltk.FreqDist(trendsSerial)
        clusterProbability = {}
        for cl in clusterDist:
            #float() guards against Python 2 integer division
            clusterProbability[cl] = "%0.4f" %(float(clusterDist[cl])/sum(clusterDist.values()))
        finalClusterProbability[item] = clusterProbability
        
        #Define the ultimated json object
        clusterMatrix = {}
        for cluster in range(1,21):
            #create matrix for each cluster
            matrix = [[0 for col in range(3)] for row in range(20)]
            for i in range(0,len(trendsSerial)):
                if cluster == trendsSerial[i]:
                    t1 = 0
                    t2 = 0
                    t3 = 0
                    if i - 1 >= 0:
                        t1 = trendsSerial[i-1]
                        matrix[t1-1][0] = matrix[t1-1][0] + 1
                    if i - 2 >= 0:
                        t2 = trendsSerial[i-2]
                        matrix[t2-1][1] = matrix[t2-1][1] + 1
                    if i - 3 >= 0:
                        t3 = trendsSerial[i-3]
                        matrix[t3-1][2] = matrix[t3-1][2] + 1
            
            #calculating the contribution matrix
            contributionMatrix = [[0 for col in range(3)] for row in range(20)]
            sumCol = [0,0,0]
            for col in range(3):
                for row in range(20):
                    sumCol[col] = sumCol[col] + matrix[row][col]
            
            for col in range(3):
                for row in range(20):
                    #add-one smoothing; the 1.0 also forces float division under Python 2
                    contributionMatrix[row][col] = "%0.4f" %((matrix[row][col] + 1.0)/(sumCol[col]+20))
            clusterMatrix[cluster] = contributionMatrix
            finalClusterMatrix[item] = clusterMatrix
    
    "Write the cluster contribution to File "
    clusterContributionFile = common.get_configuration("model", "CLUSTER_CONTRIBUTION_PATH")        
    with open(clusterContributionFile,"w") as output:
        jsString = json.dumps(finalClusterMatrix)
        output.write(jsString)
        
    "Write the cluster Probability to File "  
    clusterProbabilityFile = common.get_configuration("model", "CLUSTER_PROBABILITY_PATH")  
    with open(clusterProbabilityFile,"w") as output2:
        jsString = json.dumps(finalClusterProbability)
        output2.write(jsString)
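Each matrix cell is add-one (Laplace) smoothed over the 20 trend types, (count + 1) / (columnTotal + 20), so transitions never observed in training still get a small nonzero probability and the later math.log calls never see zero. A one-cell check:

count, columnTotal = 0, 180
print "%0.4f" % ((count + 1.0) / (columnTotal + 20))   # 0.0050 for an unseen transition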