def extract_document(file_name, factorJson):
    """Scan a news file and accumulate per-article sentiment scores per factor.

    For each currency index, each article line in `file_name` that mentions one
    of that index's indicator words is split into sentences; each sentence that
    matches a factor phrase contributes (positive-word count - negative-word
    count) to that article's score for the factor.

    Args:
        file_name: path to a file of one-JSON-object-per-line news articles
            (each object must carry 'date', 'embersId' and
            'BasisEnrichment.tokens' — schema assumed from usage, confirm).
        factorJson: dict to fill, keyed by article embersId -> factor -> score.
            Mutated in place.

    Returns:
        The same `factorJson` dict, populated.
    """
    # BUG FIX: close the indicator config file (was opened and never closed).
    with open("/home/jf/Currency/integrate-code/Basis_enriched_version/bloomberg_news_indicator.conf", "r") as indicator:
        indicator1 = json.load(indicator)
    j = 0
    for curIndex in currency_index:
        with open(file_name) as f:
            for line in f:
                j += 1
                article = unidecode(line)
                try:
                    articleJson = json.loads(unidecode(article))
                    artDate = isodate(articleJson['date'])
                except ValueError:
                    logger.debug("unable to load json line number %s" % j)
                    continue
                tokens = articleJson['BasisEnrichment']['tokens']
                # BUG FIX: was indicator1[currency_index] (the whole container
                # as a key) — must look up the current index's word list.
                reqCurrencyWords = set(indicator1[curIndex])
                if any(k['value'] in reqCurrencyWords for k in tokens):
                    # Sentence boundaries: offsets of tokens tagged 'SENT',
                    # with the start and the last token index as bookends.
                    sOffsets = [i for i, k in enumerate(tokens) if k['POS'] == 'SENT']
                    sOffsets.insert(0, 0)
                    sOffsets.append(len(tokens) - 1)
                    sentences = [[tokens[sOffsets[i]:sOffsets[i + 1]]]
                                 for i in range(len(sOffsets) - 1)]
                    articleJson['currencyWord'] = {}
                    factorJson[articleJson['embersId']] = {}
                    for w in factor_dic:
                        factorJson[articleJson['embersId']][w] = 0
                        sentArt = 0
                        # Score each sentence against this factor's phrases.
                        for sentence in sentences:
                            phraseSearch.getPhrase(factor_dic[w])
                            result = phraseSearch.filterByPhrase(sentence)
                            if result[0]:
                                # NOTE(review): append(sentence) vs initial
                                # [sentence[0]] is inconsistent in the original
                                # — preserved as-is, confirm intended shape.
                                if w in articleJson['currencyWord']:
                                    articleJson['currencyWord'][w].append(sentence)
                                else:
                                    articleJson['currencyWord'][w] = [sentence[0]]
                                negWords = [1 for k in sentence[0]
                                            if k['value'] in sentiment['negative_word']]
                                posWords = [1 for k in sentence[0]
                                            if k['value'] in sentiment['positive_word']]
                                sentArt += sum(posWords) - sum(negWords)
                        factorJson[articleJson['embersId']][w] = sentArt
    # BUG FIX: "factorJson" % factorJson raised TypeError (no conversion
    # specifier in the format string).
    print("factorJson %s" % factorJson)
    return factorJson
def enrich(conn, conf_file, file_name):
    """Enrich each news article with per-currency factor sentiment, persist it,
    and emit one daily sentiment surrogate per currency index.

    Args:
        conn: DB connection passed through to insert_enricheddata /
            insert_dailysentiment.
        conf_file: path to a JSON config with "indicator" (currency index ->
            indicator word list) and "sentiment" (positive/negative word lists).
        file_name: path to a file of one-JSON-object-per-line articles (each
            must carry 'postDate', 'embersId' and 'BasisEnrichment.tokens' —
            schema assumed from usage, confirm).

    Returns:
        dict keyed by currency index with the daily sentiment surrogate
        (postDate, interest, inflation, invest, derivedFrom, embersId).
    """
    # BUG FIX: json.load(open(...)) leaked the config file handle.
    with open(conf_file) as conf_fh:
        CONFIG = json.load(conf_fh)
    indicator = CONFIG["indicator"]
    # BUG FIX: was named `sentiment` and later shadowed by the surrogate
    # accumulator dict of the same name — renamed for clarity.
    sentiment_words = CONFIG["sentiment"]
    j = 0
    factorJson = {}
    with open(file_name) as f:
        for index in indicator:
            factorJson[index] = {}
        for line in f:
            j += 1
            article = unidecode(line)
            try:
                articleJson = json.loads(unidecode(article))
                artDate = isodate(articleJson['postDate']).strftime('%Y-%m-%d')
            except ValueError:
                logger.debug("unable to load json line number %s" % j)
                continue
            tokens = articleJson['BasisEnrichment']['tokens']
            for index in indicator:
                reqCurrencyWords = set(indicator[index])
                sentences = []
                if any(k['value'] in reqCurrencyWords for k in tokens):
                    # Sentence boundaries: offsets of tokens tagged 'SENT',
                    # with the start and the last token index as bookends.
                    sOffsets = [i for i, k in enumerate(tokens) if k['POS'] == 'SENT']
                    sOffsets.insert(0, 0)
                    sOffsets.append(len(tokens) - 1)
                    sentences = [[tokens[sOffsets[i]:sOffsets[i + 1]]]
                                 for i in range(len(sOffsets) - 1)]
                # Construct the enriched record for this article/index pair.
                # The embersId is hashed BEFORE it is added to the record.
                enriched_article = {}
                enriched_article["derivedFrom"] = {"derivedIds": [articleJson['embersId']]}
                enriched_article["currencyIndex"] = index
                enriched_article["postDate"] = artDate
                enriched_embersId = hashlib.sha1(json.dumps(enriched_article)).hexdigest()
                enriched_article["embersId"] = enriched_embersId
                if artDate in factorJson[index]:
                    factorJson[index][artDate][enriched_embersId] = {}
                else:
                    factorJson[index][artDate] = {enriched_embersId: {}}
                for w in factor_dic:
                    factorJson[index][artDate][enriched_embersId][w] = 0
                    sentArt = 0
                    # Score each sentence against this factor's phrases.
                    for sentence in sentences:
                        phraseSearch.getPhrase(factor_dic[w])
                        # result is ([match, offset], n) or (False, None)
                        result = phraseSearch.filterByPhrase(sentence)
                        if result[0]:
                            negWords = [1 for k in sentence[0]
                                        if k['value'] in sentiment_words['negative_word']]
                            posWords = [1 for k in sentence[0]
                                        if k['value'] in sentiment_words['positive_word']]
                            sentArt += sum(posWords) - sum(negWords)
                    factorJson[index][artDate][enriched_embersId][w] = sentArt
                enriched_article["interest"] = factorJson[index][artDate][enriched_embersId]["interest"]
                enriched_article["inflation"] = factorJson[index][artDate][enriched_embersId]["inflation"]
                enriched_article["invest"] = factorJson[index][artDate][enriched_embersId]["invest"]
                insert_enricheddata(conn, enriched_article)
                with queue.open(ENRICHED_ZMQ, 'w', capture=False) as outq:
                    outq.write(enriched_article)
    # Construct the surrogate record per currency index, dated by the latest
    # day seen in the input file.
    daily_sentiment = {}
    for index in factorJson:
        # ROBUSTNESS FIX: max() on an empty dict raised ValueError when no
        # article parsed for this index — skip it instead.
        if not factorJson[index]:
            continue
        daily_sentiment[index] = {}
        derivedFrom = []
        totals = {"interest": 0, "inflation": 0, "invest": 0}
        # Sum the sentiment of all articles, across all days, for this index.
        max_day = max(factorJson[index].keys())
        for day in factorJson[index]:
            for k, v in factorJson[index][day].items():
                derivedFrom.append(k)
                for w in v:
                    totals[w] += v[w]
        daily_sentiment[index]["postDate"] = max_day
        daily_sentiment[index]["interest"] = totals["interest"]
        daily_sentiment[index]["inflation"] = totals["inflation"]
        daily_sentiment[index]["invest"] = totals["invest"]
        daily_sentiment[index]["derivedFrom"] = {"derivedIds": derivedFrom}
        daily_sentiment[index]["currencyIndex"] = index
        # As above: hash the record before embersId is attached.
        embers_id = hashlib.sha1(json.dumps(daily_sentiment[index])).hexdigest()
        daily_sentiment[index]["embersId"] = embers_id
        insert_dailysentiment(conn, daily_sentiment[index])
        with queue.open(SURROGATE_ZMQ, 'w', capture=False) as outq:
            outq.write(daily_sentiment[index])
    return daily_sentiment