def read_term_file(filename):
    """Keep the frequent terms found in ``filename`` and hand them to ``write_file``.

    Every line of the file is decoded with ``cjson.decode``. Each decoded
    line replaces the previous one, so only the mapping on the last line is
    actually used. Terms whose count is greater than 100 are kept, the
    smallest and the largest kept count are printed (in that order), and the
    ``[term, count]`` pairs are passed to ``write_file`` sorted by count,
    highest first.

    Args:
        filename: Path of the file to read. Presumably every line holds a
            JSON object mapping term -> count (TODO confirm with the
            producer of this file).
    """
    term_dic = {}
    # ``with`` guarantees the handle is released even if decoding fails.
    with open(filename, "r") as f:
        for line in f:
            term_dic = cjson.decode(line)

    term_list = [
        [term, frequency]
        for term, frequency in term_dic.items()
        if frequency > 100
    ]
    term_list.sort(key=lambda pair: pair[1], reverse=True)

    # The list is sorted in descending order, so the extremes sit at its
    # ends. When no term passes the threshold there is nothing to report
    # (min()/max() on an empty sequence would raise ValueError).
    if term_list:
        print(term_list[-1][1])
        print(term_list[0][1])

    write_file(term_list)
def _format_stats_row(cells):
    """Return ``cells`` as one text line of left-aligned 15-character columns."""
    return "".join("{} ".format(cell).ljust(15) for cell in cells) + "\n"


def filter_emerging_term(filename):
    """Compute per-term distribution statistics and write them to disk.

    For every term of the nutrition vector read from ``filename`` that also
    appears in the reference term dictionary, the values of its vector are
    fed into a ``stats.histogram``. Terms for which kurtosis or skewness is
    ``None``, or whose histogram sum is 0.0, are skipped. For the remaining
    terms the mean, standard deviation, and kurtosis and skewness (both
    multiplied by the histogram maximum) are recorded.

    Two files are produced:

    * ``../term_nutrition/term_kurtosis_skewness_dic_09.json`` -- the full
      statistics dictionary as JSON.
    * ``../term_nutrition/term_kurtosis_skewness_09_sorted.txt`` -- a
      fixed-width table sorted by mean, highest first, that leaves out terms
      containing a backslash.

    The number of terms is printed twice: once after the JSON dump and once
    after the backslash terms have been removed from the in-memory dict.

    NOTE(review): the JSON file is written *before* the backslash terms are
    dropped, so it still contains them -- confirm this is intended.

    Args:
        filename: Path handed to ``read_nutrition_vector``.
    """
    term_dic = read_term_dic(
        "../term_dic/term_dic_09_reduced_filtered_truncated_without_0-9.json"
    )
    term_nutrition_dic = read_nutrition_vector(filename)

    term_kurtosis_skewness_dic = {}
    for term in term_nutrition_dic:
        if term not in term_dic:
            continue
        vector = term_nutrition_dic[term]
        mystats = stats.histogram()
        for key in vector:
            mystats.add(vector[key])
        kurtosis = mystats.kurtosis()
        skewness = mystats.skewness()
        # Skip terms whose statistics are undefined or whose vector is empty
        # of mass.
        if kurtosis is None or skewness is None or mystats.sum() == 0.0:
            continue
        peak = mystats.max()
        term_kurtosis_skewness_dic[term] = {
            "mean": mystats.avg(),
            "standard_dev": mystats.std(),
            "kurtosis": kurtosis * peak,
            "skewness": skewness * peak,
        }

    with open("../term_nutrition/term_kurtosis_skewness_dic_09.json", "w") as f:
        json.dump(term_kurtosis_skewness_dic, f)
    print(len(term_kurtosis_skewness_dic))

    columns = ("mean", "standard_dev", "kurtosis", "skewness")
    # sorted() materialises its own list, so popping from the dict inside the
    # loop below is safe.
    ranked = sorted(
        term_kurtosis_skewness_dic.items(),
        key=lambda item: item[1]["mean"],
        reverse=True,
    )
    with open("../term_nutrition/term_kurtosis_skewness_09_sorted.txt", "w") as f:
        f.write(_format_stats_row(("user",) + columns))
        for term, term_stats in ranked:
            if "\\" not in term:
                f.write(
                    _format_stats_row(
                        [term] + [term_stats[column] for column in columns]
                    )
                )
            else:
                term_kurtosis_skewness_dic.pop(term)
        print(len(term_kurtosis_skewness_dic))
def process_nutrition_term(filename):
    """Rescale every term's nutrition vector by its total and save the result.

    The vectors are read with ``read_nutrition_vector``. For each term, all
    values are added to a ``stats.histogram`` and every value is then divided
    by the histogram's ``sum()``. When that sum equals 0.0 every value of the
    vector is set to 0.0 instead. The rescaled mapping is written as JSON to
    ``../term_nutrition/term_nutrition_vector_09_2.json``.

    Args:
        filename: Path handed to ``read_nutrition_vector``.
    """
    term_nutrition_dic = read_nutrition_vector(filename)

    for term in term_nutrition_dic:
        vector = term_nutrition_dic[term]
        mystats = stats.histogram()
        for key in vector:
            mystats.add(vector[key])

        # The sum does not change while rescaling, so ask for it only once.
        total = mystats.sum()
        if total == 0.0:
            for key in vector:
                vector[key] = 0.0
        else:
            # float() keeps this a true division even if both the values and
            # the sum are integers under Python 2 (assumes sum() may return
            # an int -- TODO confirm against the stats module).
            total = float(total)
            for key in vector:
                vector[key] = vector[key] / total

    # ``with`` closes the file even if serialisation fails part-way.
    with open("../term_nutrition/term_nutrition_vector_09_2.json", "w") as f:
        json.dump(term_nutrition_dic, f)