def thesaurus_similarities(neighs1, neighs2,
        measure=cosine, weighting=k_minus_rank, maxrank=0,
        log=logging.getLogger()):
    '''
    Merge two thesauri and score the neighbours lists of every base entry.

    Both thesauri are walked in lock-step. A base entry found in both is
    scored by comparing its two neighbours lists. A base entry found in only
    one thesaurus is still scored, against an empty neighbours list, so each
    distinct entry of either thesaurus yields exactly one tuple.

    Neighbours lists are assumed to have been previously sorted
    lexicographically in ascending order of entry; the merge relies on it.

    Arguments:
    neighs1, neighs2 -- indexable sequences whose items hold the base entry
        at index 0 and its neighbours list at index 1.
    measure, weighting, maxrank -- forwarded unchanged to
        neighbours_list_similarity for every comparison.
    log -- parent logger; messages are emitted on its "compute" child.

    Returns a list of (entry, similarity) tuples in merge order.
    '''
    log = log.getChild('compute')
    if log.isEnabledFor(logging.INFO):
        # __name__ is available on both Python 2 and 3 (func_name is
        # Python 2 only); callables without a name fall back to repr().
        log.info('Calculating %s similarities between %s weighted neighbours lists.' % (
            getattr(measure, '__name__', repr(measure)),
            getattr(weighting, '__name__', repr(weighting))))

    sims = []

    def score(entry, list1, list2):
        # Single place where a pair of neighbours lists is compared and stored.
        sim = neighbours_list_similarity(list1, list2,
            measure=measure, weighting=weighting, maxrank=maxrank)
        sims.append((entry, sim))

    len1, len2 = len(neighs1), len(neighs2)
    i, j = 0, 0

    # Classic sorted merge: advance whichever side holds the smaller entry.
    while i < len1 and j < len2:
        entry1, entry2 = neighs1[i][0], neighs2[j][0]
        if entry1 == entry2:
            if log.isEnabledFor(logging.DEBUG):
                log.debug('entry: %s' % entry1)
            score(entry1, neighs1[i][1], neighs2[j][1])
            i += 1
            j += 1
        elif entry1 < entry2:
            # Entry only in the first thesaurus.
            score(entry1, neighs1[i][1], [])
            i += 1
        else:
            # Entry only in the second thesaurus.
            score(entry2, [], neighs2[j][1])
            j += 1
        if log.isEnabledFor(logging.INFO) and max(i, j) % 1000 == 0:
            log.info('Calculated %d similarities. (%.1f%% complete)' % (
                len(sims), 100.0 * (i + j) / (len1 + len2)))

    # At most one of the two tails is non-empty once the merge loop ends.
    for k in range(i, len1):
        score(neighs1[k][0], neighs1[k][1], [])
    for k in range(j, len2):
        score(neighs2[k][0], [], neighs2[k][1])

    if log.isEnabledFor(logging.INFO):
        log.info('Calculated %d similarities. (%.1f%% complete)' % (
            len(sims), 100.0))

    return sims
Example #2
0
def equatorial_to_degrees(equatorial):
    r"""
    Converts RA, DEC coordinates in equatorial notation to degrees.

    Parameters
    ----------
    equatorial : list
        The coordinates in equatorial notation, each a colon-separated
        string of three numeric fields, e.g.
        ['1:33:55.80', '+30:43:2.00']

    Returns
    -------
    degrees : list
        The coordinates in degrees, e.g. [23.4825, 30.717222].

    Raises
    ------
    AttributeError
        If the first two items of ``equatorial`` are not strings (logged at
        debug level, then re-raised).
    IndexError, ValueError
        If a coordinate has fewer than three fields or a field is not
        numeric.
    """
    try:
        ra_fields = equatorial[0].split(':')
        dec_fields = equatorial[1].split(':')
    except AttributeError:
        log.debug("equatorial_to_degrees needs a pair of RA DEC "\
                       "coordinated in equatiorial notation as input")
        raise

    # The sign must be read from the text: float('-0') is -0.0, which is
    # neither > 0 nor < 0, so declinations such as '-0:30:00' would otherwise
    # lose their sign (or, as before, produce no result at all).
    dec_sign = -1.0 if dec_fields[0].strip().startswith('-') else 1.0

    # RA is given in hours/minutes/seconds: 24 h correspond to 360 degrees.
    ra_degrees = (float(ra_fields[0]) * (360. / 24) +
                  float(ra_fields[1]) * (360. / 24 / 60) +
                  float(ra_fields[2]) * (360. / 24 / 60 / 60))

    # DEC is given in degrees/arcminutes/arcseconds; minutes and seconds
    # extend the magnitude, the sign applies to the whole value.
    dec_magnitude = (abs(float(dec_fields[0])) +
                     float(dec_fields[1]) * (1. / 60) +
                     float(dec_fields[2]) * 1. / 60 / 60)

    return [ra_degrees, dec_sign * dec_magnitude]
Example #3
0
def to_neg_binomial(data):
    """Fit negative binomial parameters ``(r, p)`` to ``data``.

    The fit minimises ``-sum_log_neg_binomial(data, r, p)`` (presumably the
    negative log-likelihood -- confirm against that helper) with the bounded
    L-BFGS-B optimiser, starting from a guess derived from the sample mean
    and variance.

    Parameters
    ----------
    data : array-like
        Observed counts. Negative values only trigger a warning.
        NOTE: there is no explicit guard against an empty data set.

    Returns
    -------
    success : bool
        ``opt_result.success`` as reported by the optimiser.
    params : tuple
        The fitted ``(r, p)``.
    cov_array : ndarray
        Dense form of the optimiser's inverse-Hessian approximation.
    score : float
        Maximised objective (sign restored) divided by ``n - 2``.

    Raises
    ------
    AssertionError
        If the initial guess violates ``r0 > 0`` or ``p0 < 1``.
    """
    log = logging.getLogger(__name__)
    arr_ks = np.asarray(data)
    if (arr_ks < 0).any():
        log.warning(
            'negative value in data set. negative binomial may not be appropriate.'
        )
    n = len(arr_ks)
    mean = arr_ks.mean()
    var = arr_ks.var()

    # Initial guess from the first two sample moments; needs r > 0, 0 < p < 1.
    p0 = mean / var
    r0 = mean**2 / (var - mean)
    # Use the module logger rather than the root-level logging.info(), which
    # would bypass this module's logger and implicitly configure the root.
    log.info('r0,p0 = {:.3f}, {:.3f}'.format(r0, p0))
    assert ((r0 > 0) and p0 < 1)

    # Only 'L-BFGS-B', 'TNC' and 'SLSQP' handle bounds (all accept a jacobian,
    # none a hessian). L-BFGS-B is used because it is the only one returning
    # an inverse-Hessian object, which the covariance below relies on.
    method = 'L-BFGS-B'

    def objective(pars):
        return -sum_log_neg_binomial(arr_ks, *pars)

    def gradient(pars):
        return -grad_sum_log_neg_binomial(arr_ks, *pars)

    opt_result = opt.minimize(objective, (r0, p0),
                              method=method,
                              jac=gradient,
                              bounds=[(0, None), (0, 1)])
    is_success = opt_result.success
    if not is_success:
        log.error('negative binomial fit did not succeed.')
    r, p = opt_result.x
    # The jacobian should be zero, or close to it, at the optimum.
    log.debug('jacobian = {}'.format(opt_result.jac))
    # hess_inv is an operator object for L-BFGS-B; convert to a dense array.
    cov_array = opt_result.hess_inv.todense()
    neg_ll = opt_result.fun
    return is_success, (r, p), cov_array, -neg_ll / (n - 2)
def neighbours_list_similarity(list1, list2,
        measure=cosine, weighting=k_minus_rank, maxrank=0,
        log=logging.getLogger()):
    '''
    Re-weight two neighbours lists and return their similarity.

    Each list is sorted, passed through ``weighting`` and the two results
    are handed to ``measure``, whose return value is returned as is. When
    ``weighting`` is k_minus_rank it additionally receives ``k``, the largest
    of ``maxrank`` and the two list lengths. Equal lists are weighted only
    once and the same weighted list is used for both sides.

    With DEBUG enabled on ``log`` the before/after weights are logged; that
    output unpacks every item as an (entry, value) pair.
    '''
    # Only the k_minus_rank scheme is given the extra ``k`` argument.
    weight_args = {}
    if weighting == k_minus_rank:
        weight_args['k'] = max(maxrank, len(list1), len(list2))

    # Decide up front whether one weighting pass can serve both sides.
    same_lists = (list1 == list2)
    wlist1 = weighting(sorted(list1), **weight_args)
    wlist2 = wlist1 if same_lists else weighting(sorted(list2), **weight_args)

    sim = measure(wlist1, wlist2)

    if log.isEnabledFor(logging.DEBUG):
        def describe(original, reweighted):
            # Pair each sorted original item with its re-weighted counterpart.
            return ['%s:%.3f=>%.3f' % (e, x, y)
                    for (e, x), (_, y) in zip(sorted(original), reweighted)]

        log.debug('reweighting 1 (entry:before=>after): %s' %
            describe(list1, wlist1))
        log.debug('reweighting 2 (entry:before=>after): %s' %
            describe(list2, wlist2))
    return sim