Code example #1 (score: 0)
File: initialize_news.py — Project: fangjin/currency
def extract_document(file_name ,factorJson):
    #global phraseList, phraseConf
    indicator=open("/home/jf/Currency/integrate-code/Basis_enriched_version/bloomberg_news_indicator.conf","r")
    indicator1=json.load(indicator)    
   
    j = 0
    for curIndex in currency_index:
    with open(file_name) as f:
        for line in f:
            j += 1
            #print j
            article = unidecode(line)
            try:
               articleJson = json.loads(unidecode(article))
               artDate = isodate(articleJson['date'])               
            except ValueError:
               logger.debug("unable to load json line number %s" % j)
               continue
            tokens = articleJson['BasisEnrichment']['tokens']
            reqCurrencyWords = set(indicator1[currency_index])
            if any([True for k in tokens if k['value'] in reqCurrencyWords]):
                sOffsets = [k[0] for k in enumerate(tokens) if k[1]['POS'] == 'SENT']
                sOffsets.insert(0, 0)
                sOffsets.append(len(tokens) - 1)
		sentences = [[tokens[sOffsets[i]:sOffsets[i + 1]]] for i in range(0, len(sOffsets) - 1)]
		articleJson['currencyWord'] = {}
                factorJson[articleJson['embersId']]={}
		for w in factor_dic:                    
                    factorJson[articleJson['embersId']][w] = 0
                    sentArt = 0
                    #s = ' '.join([k['value'] for k in sentence[0]])
                    #print s.encode('utf-8')
                    for sentence in sentences:
                        phraseSearch.getPhrase(factor_dic[w])
                        #print phraseSearch.phraseList
                        result = phraseSearch.filterByPhrase(sentence)
                        #print result
                        if result[0]:
                            if w in articleJson['currencyWord']:
                                articleJson['currencyWord'][w].append(sentence)
                            else:
                                articleJson['currencyWord'][w] = [sentence[0]]
                        negWords = [1 for k in sentence[0] if k['value'] in sentiment['negative_word']]
                        posWords = [1 for k in sentence[0] if k['value'] in sentiment['positive_word']]
                        sentArt += sum(posWords) - sum(negWords)
                        #print sentArt
                        #factorJson['embersId'] = {w: sentArt}
                    factorJson[articleJson['embersId']][w] = sentArt
    print "factorJson" % factorJson
    return factorJson
Code example #2 (score: 0)
File: articleEnricher.py — Project: fangjin/currency
def enrich(conn, conf_file, file_name):
    """Enrich each article in *file_name* with per-currency sentiment scores.

    Loads the indicator and sentiment word lists from *conf_file*, scores
    every JSON article line against each currency index, then:
      1. persists one enriched record per article/currency pair
         (insert_enricheddata + ENRICHED_ZMQ queue), and
      2. aggregates a per-currency daily surrogate record dated by the
         latest postDate seen (insert_dailysentiment + SURROGATE_ZMQ queue).

    Parameters:
        conn      -- open DB connection passed through to the insert helpers.
        conf_file -- path to a JSON config with "indicator" and "sentiment".
        file_name -- path to a file of one JSON article per line.

    Returns the dict of daily surrogate records keyed by currency index.

    NOTE(review): relies on module globals (factor_dic, phraseSearch, queue,
    ENRICHED_ZMQ, SURROGATE_ZMQ, logger, unidecode, isodate,
    insert_enricheddata, insert_dailysentiment) -- confirm at import time.
    """
    CONFIG = json.load(open(conf_file))
    indicator = CONFIG["indicator"]
    sentiment = CONFIG["sentiment"]

    j = 0
    factorJson = {}
    with open(file_name) as f:
        for index in indicator:
            factorJson[index] = {}
        for line in f:
            j += 1
            # Transliterate to ASCII once (the original decoded twice).
            article = unidecode(line)
            try:
                articleJson = json.loads(article)
                artDate = isodate(articleJson['postDate']).strftime('%Y-%m-%d')
            except ValueError:
                logger.debug("unable to load json line number %s" % j)
                continue
            tokens = articleJson['BasisEnrichment']['tokens']
            for index in indicator:
                reqCurrencyWords = set(indicator[index])
                sentences = []
                if any(k['value'] in reqCurrencyWords for k in tokens):
                    # Split the token stream into sentences at 'SENT' POS tags.
                    sOffsets = [i for i, tok in enumerate(tokens) if tok['POS'] == 'SENT']
                    sOffsets.insert(0, 0)
                    sOffsets.append(len(tokens) - 1)
                    sentences = [[tokens[sOffsets[i]:sOffsets[i + 1]]]
                                 for i in range(len(sOffsets) - 1)]

                # Construct the enriched record for this article/currency pair.
                # The embersId is hashed over the record *before* the id and
                # factor fields are added, so it is stable for the pair.
                enriched_article = {}
                enriched_article["derivedFrom"] = {"derivedIds": [articleJson['embersId']]}
                enriched_article["currencyIndex"] = index
                enriched_article["postDate"] = artDate
                enriched_embersId = hashlib.sha1(json.dumps(enriched_article)).hexdigest()
                enriched_article["embersId"] = enriched_embersId

                # Idiomatic form of the original if/else dict insertion.
                factorJson[index].setdefault(artDate, {})[enriched_embersId] = {}
                for w in factor_dic:
                    factorJson[index][artDate][enriched_embersId][w] = 0
                    sentArt = 0
                    for sentence in sentences:
                        phraseSearch.getPhrase(factor_dic[w])
                        # result is e.g. ([('is','value'), 0], 19) or (False, None)
                        result = phraseSearch.filterByPhrase(sentence)
                        if result[0]:
                            negWords = sum(1 for k in sentence[0] if k['value'] in sentiment['negative_word'])
                            posWords = sum(1 for k in sentence[0] if k['value'] in sentiment['positive_word'])
                            sentArt += posWords - negWords
                    factorJson[index][artDate][enriched_embersId][w] = sentArt

                enriched_article["interest"] = factorJson[index][artDate][enriched_embersId]["interest"]
                enriched_article["inflation"] = factorJson[index][artDate][enriched_embersId]["inflation"]
                enriched_article["invest"] = factorJson[index][artDate][enriched_embersId]["invest"]
                insert_enricheddata(conn, enriched_article)
                with queue.open(ENRICHED_ZMQ, 'w', capture=False) as outq:
                    outq.write(enriched_article)

    # Aggregate one surrogate record per currency, dated by the max day seen.
    daily_sentiment = {}
    for index in factorJson:
        # BUG FIX: guard against currencies with no matched articles, where
        # max() over an empty dict would raise ValueError.
        if not factorJson[index]:
            continue
        daily_sentiment[index] = {}
        derivedFrom = []
        # BUG FIX: the original reassigned `sentiment` here, clobbering the
        # sentiment-word config loaded above; use a separate accumulator.
        totals = {"interest": 0, "inflation": 0, "invest": 0}
        max_day = max(factorJson[index].keys())
        for day in factorJson[index]:
            for art_id, factors in factorJson[index][day].items():
                derivedFrom.append(art_id)
                for w in factors:
                    totals[w] += factors[w]
        daily_sentiment[index]["postDate"] = max_day
        daily_sentiment[index]["interest"] = totals["interest"]
        daily_sentiment[index]["inflation"] = totals["inflation"]
        daily_sentiment[index]["invest"] = totals["invest"]
        daily_sentiment[index]["derivedFrom"] = {"derivedIds": derivedFrom}
        daily_sentiment[index]["currencyIndex"] = index
        # Hash the record before the embersId field itself is added.
        embers_id = hashlib.sha1(json.dumps(daily_sentiment[index])).hexdigest()
        daily_sentiment[index]["embersId"] = embers_id

        insert_dailysentiment(conn, daily_sentiment[index])
        with queue.open(SURROGATE_ZMQ, 'w', capture=False) as outq:
            outq.write(daily_sentiment[index])
    return daily_sentiment