import os
import itertools
from collections import defaultdict
from optparse import OptionParser

import pandas as pd

# project-local helpers
import hpo_helper
import phenopolis_utils


def matrix(data, freq):
    # get total and IC_max
    total = len(data)
    IC_max = hpo_helper.IC(1, total)
    result = defaultdict(float)
    method_cache = {}
    n = total
    for d in data:
        # countdown of records left to process
        print(n)
        n -= 1
        hpos = d['hpo']
        # each HPO term co-occurs with itself
        for h in hpos:
            key = '-'.join([h, h])
            method_cache[h] = method_cache.get(h, hpo_helper.IC(freq[h], total) / IC_max)
            result[key] = result.get(key, method_cache[h])
        # check each pairwise combination of the hpos
        for h in itertools.combinations(hpos, 2):
            key = '-'.join(sorted([h[0], h[1]]))
            # the normaliser conveniently keeps the result consistently smaller than
            # what IC(h0) * IC(h1) / IC_max**2 would give when h0 is a subclass of h1,
            # which matters if that product is later used for the weights
            normaliser = 2 * min(freq[h[0]], freq[h[1]])
            result[key] += (method_cache[h[0]] + method_cache[h[1]]) / normaliser
    return result
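# Hedged usage sketch (not part of the original script): `data` is assumed to be a
# list of patient records each carrying an 'hpo' list of term IDs, and `freq` the
# per-term counts over the same cohort (as hpo_helper.get_hpo_freq would produce).
# The HPO IDs and counts below are made up for illustration only.
def _example_matrix_usage():
    example_data = [
        {'hpo': ['HP:0000505', 'HP:0000510']},
        {'hpo': ['HP:0000510']},
    ]
    example_freq = {'HP:0000505': 1, 'HP:0000510': 2}
    # result keys are '-'-joined sorted term pairs, e.g. 'HP:0000505-HP:0000510'
    return matrix(example_data, example_freq)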
def asym_WAM(dbs, df, ic_df, freq, ancestors):
    print('build hpo_asym_WAM')
    weight_df = pd.DataFrame(index=ic_df.index)
    # HP:0000001 is the HPO root, so its frequency serves as the cohort total here
    max_ic = hpo_helper.IC(1, freq['HP:0000001'])
    buff = {}
    for i, h in enumerate(ic_df.index):
        # compute the asymmetric weight of every term against h, row by row
        this = ic_df.reset_index().apply(
            beta, axis=1, dbs=dbs, ic_df=ic_df, freq=freq,
            max_ic=max_ic, h2=h, buff=buff, mode='asym'
        )
        weight_df[h] = this.values
    return df.multiply(weight_df)
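# Sketch of the interface asym_WAM assumes for `beta` (the real helper lives
# elsewhere in this module and is not shown here): it is applied row-wise to
# ic_df.reset_index(), so it receives one row plus the keyword arguments below
# and returns the asymmetric weight of that row's term with respect to h2.
# The body is a placeholder for illustration, not the project's implementation.
def _beta_interface_stub(row, dbs, ic_df, freq, max_ic, h2, buff, mode='asym'):
    return 1.0  # placeholder weight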
def IC_wrapper(a):
    # relies on a module-level `t` (presumably the cohort total) being set before use
    return hpo_helper.IC(a, t)
def get_outfile(input):
    # derive the output path from the numbered input file name,
    # e.g. '<prefix>_3.json' -> 'hpofreq_3.json' under `outfolder`
    # (`outfolder` is expected to be defined at module level)
    basename = os.path.basename(input)
    num = basename.split('.')[0].split('_')[1]
    outfile = os.path.join(outfolder, 'hpofreq_' + num + '.json')
    return outfile


'''
main
'''
if __name__ == "__main__":
    usage = "usage: %prog [options] arg1 arg2"
    parser = OptionParser(usage=usage)
    parser.add_option("--input",
                      dest="input",
                      help="input sim file?")
    (options, args) = parser.parse_args()
    # get dbs
    dbs = phenopolis_utils.get_mongo_collections()
    # get input data
    input_data = hpo_helper.get_json(options.input)
    # get total and IC_max
    total = len(input_data)
    IC_max = hpo_helper.IC(1, total)
    # expand hpos
    hpo_helper.expand_hpo(input_data)
    # get hpo_freq
    hpo_freq = hpo_helper.get_hpo_freq(input_data)
    # get output file and write result
    outfile = get_outfile(options.input)
    hpo_helper.write_json(hpo_freq, outfile)
    print('done')
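# Example invocation (the script and input file names below are hypothetical;
# the input is expected to be a JSON list of patient records, each with an
# 'hpo' field, named like '<prefix>_<n>.json'):
#   python hpo_freq.py --input patient_hpo_3.json
# This writes the per-term frequencies to 'hpofreq_3.json' under `outfolder`.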