def thesaurus_similarities(neighs1, neighs2, measure=cosine,
                           weighting=k_minus_rank, maxrank=0,
                           log=logging.getLogger()):
    '''
    Produce a list of (base entry, similarity) tuples comparing the
    neighbours lists of two thesauri.

    Both thesauri are walked in one merge-style pass, so each must already
    be sorted in ascending order of base entry. Every item is indexed as
    item[0] (the base entry) and item[1] (its neighbours list).

    A base entry found in both thesauri is scored on its two neighbours
    lists. A base entry found in only one thesaurus is still scored, with
    an empty list standing in for the missing side.

    measure, weighting and maxrank are passed straight through to
    neighbours_list_similarity. Progress is reported on the 'compute'
    child of the supplied logger.

    Returns the list of tuples in ascending order of base entry.
    '''
    log = log.getChild('compute')

    if log.isEnabledFor(logging.INFO):
        # __name__ exists on both Python 2 and 3 functions (func_name is
        # Python 2 only); fall back to repr for callables without a name,
        # such as functools.partial objects.
        log.info('Calculating %s similarities between %s weighted neighbours lists.' % (
            getattr(measure, '__name__', repr(measure)),
            getattr(weighting, '__name__', repr(weighting))))

    sims = []

    def score(entry, list1, list2):
        # Single place where a similarity is computed and recorded.
        sim = neighbours_list_similarity(list1, list2, measure=measure,
                                         weighting=weighting, maxrank=maxrank)
        sims.append((entry, sim))

    len1, len2 = len(neighs1), len(neighs2)
    i, j = 0, 0
    while i < len1 and j < len2:
        entry1, entry2 = neighs1[i][0], neighs2[j][0]
        if entry1 == entry2:
            if log.isEnabledFor(logging.DEBUG):
                log.debug('entry: %s' % entry1)
            score(entry1, neighs1[i][1], neighs2[j][1])
            i += 1
            j += 1
        elif entry1 < entry2:
            # Entry only present in the first thesaurus.
            score(entry1, neighs1[i][1], [])
            i += 1
        else:
            # Entry only present in the second thesaurus.
            score(entry2, [], neighs2[j][1])
            j += 1
        if log.isEnabledFor(logging.INFO) and max(i, j) % 1000 == 0:
            log.info('Calculated %d similarities. (%.1f%% complete)' % (
                len(sims), 100.0 * (i + j) / (len1 + len2)))

    # At most one of the two thesauri still has unmatched entries left.
    for k in range(i, len1):
        score(neighs1[k][0], neighs1[k][1], [])
    for k in range(j, len2):
        score(neighs2[k][0], [], neighs2[k][1])

    if log.isEnabledFor(logging.INFO):
        log.info('Calculated %d similarities. (%.1f%% complete)' % (
            len(sims), 100.0))
    return sims
def equatorial_to_degrees(equatorial):
    r"""
    Converts RA, DEC coordinates in equatorial notation to degrees.

    Parameters
    ----------
    equatorial : list
        The coordinates in equatorial (colon-separated) notation,
        e.g. ['1:33:55.80', '+30:43:2.00'].  The first element is the
        right ascension as hours:minutes:seconds, the second is the
        declination as degrees:arcminutes:arcseconds.

    Returns
    -------
    degrees : list
        The coordinates in degrees, e.g. [23.4825, 30.717222].

    Raises
    ------
    AttributeError
        If the elements of ``equatorial`` are not strings (they have no
        ``split`` method).  A debug message is logged before re-raising.
    """
    try:
        ra_parts = equatorial[0].split(':')
        dec_parts = equatorial[1].split(':')
    except AttributeError:
        log.debug("equatorial_to_degrees needs a pair of RA DEC "
                  "coordinated in equatiorial notation as input")
        raise

    # 24 hours of right ascension span the full 360 degrees.
    ra_degrees = (float(ra_parts[0]) * (360. / 24) +
                  float(ra_parts[1]) * (360. / 24 / 60) +
                  float(ra_parts[2]) * (360. / 24 / 60 / 60))

    # Build the declination magnitude first and apply the sign afterwards.
    # The sign is read from the text rather than from the parsed number so
    # that a zero degree field keeps its sign ('-0:30:00' is -0.5 degrees)
    # and an unsigned zero ('0:30:00') is handled at all.
    dec_magnitude = (abs(float(dec_parts[0])) +
                     float(dec_parts[1]) * (1. / 60) +
                     float(dec_parts[2]) * 1. / 60 / 60)
    is_negative = dec_parts[0].strip().startswith('-')
    dec_degrees = -dec_magnitude if is_negative else dec_magnitude

    return [ra_degrees, dec_degrees]
def to_neg_binomial(data):
    """Fit negative binomial parameters (r, p) to ``data`` by bounded minimisation.

    The objective is the negated ``sum_log_neg_binomial`` (presumably the
    summed log-likelihood -- confirm against its definition), minimised with
    L-BFGS-B using ``grad_sum_log_neg_binomial`` as the Jacobian, under the
    bounds r >= 0 and 0 <= p <= 1.  The starting point is derived from the
    sample mean and variance: p0 = mean / var, r0 = mean**2 / (var - mean).

    Parameters
    ----------
    data : array-like
        Observed values; converted with ``np.asarray``.  Negative values
        only trigger a warning.

    Returns
    -------
    tuple
        ``(success, (r, p), cov_array, score)`` where ``success`` is the
        optimizer's success flag, ``(r, p)`` the fitted parameters,
        ``cov_array`` the optimizer's inverse-Hessian estimate as a dense
        array, and ``score`` the negated final objective divided by
        ``len(data) - 2``.

    Raises
    ------
    AssertionError
        If the initial guess does not satisfy r0 > 0 and p0 < 1.
    """
    log = logging.getLogger(__name__)
    arr_ks = np.asarray(data)
    if (arr_ks < 0).any():
        log.warning(
            'negative value in data set. negative binomial may not be appropriate.'
        )

    n = len(arr_ks)
    mean = arr_ks.mean()
    var = arr_ks.var()

    # Initial guess; the fit needs r > 0 and 0 < p < 1.
    p0 = mean / var
    r0 = mean**2 / (var - mean)
    # Report through the module logger like every other message here,
    # rather than through the root logger.
    log.info('r0,p0 = {:.3f}, {:.3f}'.format(r0, p0))
    assert ((r0 > 0) and p0 < 1)

    # These are the only methods that can handle bounds.  They can also all
    # handle jacobians; none of them can handle hessians.  Only L-BFGS-B
    # returns an inverse Hessian (as an "LbfgsInvHessProduct"), which the
    # todense() call below relies on.
    allowed_methods = ['L-BFGS-B', 'TNC', 'SLSQP']
    method = allowed_methods[0]

    func = lambda pars: -sum_log_neg_binomial(arr_ks, *pars)
    grad = lambda pars: -grad_sum_log_neg_binomial(arr_ks, *pars)
    opt_result = opt.minimize(func, (r0, p0), method=method, jac=grad,
                              bounds=[(0, None), (0, 1)])

    success = opt_result.success
    if not success:
        log.error('negative binomial fit did not succeed.')

    r, p = opt_result.x
    # Should be zero, or close to it, at a converged optimum.
    log.debug('jacobian = {}'.format(opt_result.jac))

    cov_array = opt_result.hess_inv.todense()  # dense array
    neg_ll = opt_result.fun
    return success, (r, p), cov_array, -neg_ll / (n - 2)
def neighbours_list_similarity(list1, list2, measure=cosine,
                               weighting=k_minus_rank, maxrank=0,
                               log=logging.getLogger()):
    '''
    Return the similarity of two neighbours lists.

    Each list is sorted, re-weighted with the weighting function, and the
    two re-weighted lists are handed to measure. When the weighting is
    k_minus_rank it is additionally given k, the largest of maxrank and the
    two list lengths. If both lists compare equal the re-weighting is done
    once and the result is used for both sides.

    When DEBUG logging is enabled on log, the before and after value of
    every item is reported; that output unpacks each item as an
    (entry, value) pair.
    '''
    weighting_args = {}
    if weighting == k_minus_rank:
        weighting_args['k'] = max(maxrank, len(list1), len(list2))

    # Re-weighting the vectors; equal inputs share one re-weighted list.
    lists_match = list1 == list2
    reweighted1 = weighting(sorted(list1), **weighting_args)
    if lists_match:
        reweighted2 = reweighted1
    else:
        reweighted2 = weighting(sorted(list2), **weighting_args)

    similarity = measure(reweighted1, reweighted2)

    if log.isEnabledFor(logging.DEBUG):
        def changes(original, reweighted):
            # One 'entry:before=>after' string per item, in sorted order.
            return ['%s:%.3f=>%.3f' % (entry, before, after)
                    for (entry, before), (_, after)
                    in zip(sorted(original), reweighted)]

        log.debug('reweighting 1 (entry:before=>after): %s'
                  % changes(list1, reweighted1))
        log.debug('reweighting 2 (entry:before=>after): %s'
                  % changes(list2, reweighted2))

    return similarity