tp = sum([combin(l,2) for g in gw.values() for l in g.values() if l > 1]) fn = 0 for c in cs: l = [] for g in gw.values(): l.append(g[c]) combines = itertools.combinations(l,2) for x,y in combines: fn += x*y tn = tp_tn_fp_fn - tp_tn - fn return float((tp + tn))/tp_tn_fp_fn if __name__ == '__main__': inverse_index('bing','bing_index.json') f = open('bing_index.json') inverse_index = json.loads(f.read()) f1 = open('test_idf','w') f2 = open('test_idf2.json','w') fj = open('tf_doc.json','r') tf_doc = json.loads(fj.read()) word_map = {} doc_map = {} index_doc_map = {} index_word_map = {} i = 0 j = 0 for word in inverse_index:
Created on Jun 12, 2013 @author: Administrator ''' import json, os from part_1 import inverse_index TWEET_PATH = 'C:/Users/Administrator/workspace/670_hw_1/mars_tweets_medium.json' TWEET_TEXT_PATH = 'C:/Users/Administrator/workspace/670_hw_1/tweet/' def get_tweet_text(): if not os.path.exists(TWEET_TEXT_PATH): os.mkdir(TWEET_TEXT_PATH) wf = open(TWEET_PATH, 'r') i = 0 for line in wf: encoded_string = line.strip().decode('utf-8') tweet = json.loads(encoded_string) tweet_text = tweet['text'].encode('utf-8').lower() # print tweet_text f = open(TWEET_TEXT_PATH + '/' + str(i), 'w') f.write(tweet_text) f.close() i += 1 if __name__ == '__main__': #get_tweet_text() inverse_index('tweet', 'tweet_index.json')
tp = sum([combin(l, 2) for g in gw.values() for l in g.values() if l > 1]) fn = 0 for c in cs: l = [] for g in gw.values(): l.append(g[c]) combines = itertools.combinations(l, 2) for x, y in combines: fn += x * y tn = tp_tn_fp_fn - tp_tn - fn return float((tp + tn)) / tp_tn_fp_fn if __name__ == '__main__': inverse_index('bing', 'bing_index.json') f = open('bing_index.json') inverse_index = json.loads(f.read()) f1 = open('test_idf', 'w') f2 = open('test_idf2.json', 'w') fj = open('tf_doc.json', 'r') tf_doc = json.loads(fj.read()) word_map = {} doc_map = {} index_doc_map = {} index_word_map = {} i = 0 j = 0 for word in inverse_index:
'''
Created on Jun 12, 2013

@author: Administrator
'''
import json
import os

from part_1 import inverse_index

TWEET_PATH = 'C:/Users/Administrator/workspace/670_hw_1/mars_tweets_medium.json'
TWEET_TEXT_PATH = 'C:/Users/Administrator/workspace/670_hw_1/tweet/'


def get_tweet_text():
    '''
    Split the tweet dump at TWEET_PATH into one text file per tweet.

    Every line of TWEET_PATH is parsed as a JSON object. Its 'text' value is
    encoded to UTF-8, lower-cased and written to a file inside
    TWEET_TEXT_PATH whose name is the zero-based line number ('0', '1', ...).
    TWEET_TEXT_PATH is created first when it does not exist.

    No exception is caught here: failures from open(), json.loads() or the
    'text' lookup propagate to the caller.
    '''
    if not os.path.exists(TWEET_TEXT_PATH):
        os.mkdir(TWEET_TEXT_PATH)

    # 'with' closes the input (and each output) even when a line fails to
    # parse or a write fails; the handles were previously left open.
    with open(TWEET_PATH, 'r') as tweets_file:
        for index, line in enumerate(tweets_file):
            # NOTE(review): .decode() on a line read in text mode only exists
            # where that line is a byte string (Python 2) -- confirm the
            # target interpreter before porting.
            encoded_string = line.strip().decode('utf-8')
            tweet = json.loads(encoded_string)
            tweet_text = tweet['text'].encode('utf-8').lower()
            with open(os.path.join(TWEET_TEXT_PATH, str(index)), 'w') as out:
                out.write(tweet_text)


if __name__ == '__main__':
    # Extraction is left disabled; uncomment to regenerate the files under
    # TWEET_TEXT_PATH.
    # get_tweet_text()
    # inverse_index comes from part_1; its behavior is not visible here.
    inverse_index('tweet', 'tweet_index.json')