Example #1
def pooling(crit_fields, verbose=False):
	"""
	A sort of cross-tabulation: counts the number of docs for each combination of criteria
	
	Returns a dict with these counts and the totals
	
	Example for the criteria corpusName and pdfCharCount
	{
	  "f": {
		"corpusName:bmj AND pdfCharCount:[* TO 1999]": 24524,
		"corpusName:bmj AND pdfCharCount:[2000 TO *]": 662848,
		"corpusName:brill-journals AND pdfCharCount:[* TO 1999]": 10949,
		"corpusName:brill-journals AND pdfCharCount:[2000 TO *]": 119318,
		"corpusName:elsevier AND pdfCharCount:[* TO 1999]": 275461,
		"corpusName:elsevier AND pdfCharCount:[2000 TO *]": 5740132,
		"corpusName:nature AND pdfCharCount:[* TO 1999]": 332156,
		"corpusName:nature AND pdfCharCount:[2000 TO *]": 45139,
		"corpusName:oup AND pdfCharCount:[* TO 1999]": 58662,
		"corpusName:oup AND pdfCharCount:[2000 TO *]": 1385591,
		"corpusName:springer AND pdfCharCount:[* TO 1999]": 61973,
		"corpusName:springer AND pdfCharCount:[2000 TO *]": 2242902,
		"corpusName:wiley AND pdfCharCount:[* TO 1999]": 593998,
		"corpusName:wiley AND pdfCharCount:[2000 TO *]": 4044204
	  },
	  "totd": 15982692         # nombre de docs au total dans la base
	  "nd": 15597857,          # nombre de docs pour l'ensemble des critères
	  "nr": 15597857,          # nombre de réponses pour l'ensemble des critère 
	                           # (intéressant pour les champs "QCM")
	}

	NB: the range buckets and the facet values are
	    configurable in field_value_lists.py
	"""
	####### POOLING ########
	#
	N_reponses = 0
	N_workdocs = 0
	doc_grand_total = 0
	# dict of counts for each combo ((crit1:val_1a),(crit2:val_2a)...)
	abs_freqs = {}
	
	# ---------------------------------------------------------------
	# (1) PARTITIONING THE SEARCH SPACE IN POSSIBLE OUTCOMES --------
	print("Sending count queries for criteria pools...",file=stderr)
	## build all "field:values" pairs per criterion field
	## (list of list of strings: future lucene query chunks)
	all_possibilities = []
	
	# a little tribute to our colleague Nourdine Combo!
	n_combos = 1
	
	for my_criterion in crit_fields:
		# print("CRIT",my_criterion)
		field_outcomes = facet_vals(my_criterion)
		# print("field_outcomes",field_outcomes)
		n_combos = n_combos * len(field_outcomes)
		# lucene query chunks
		all_possibilities.append(
			[my_criterion + ':' + val for val in field_outcomes]
		)
	
	# e.g. 2 criteria will produce 2 lists in all_possibilities
	# [
	#  ['qualityIndicators.refBibsNative:T', 'qualityIndicators.refBibsNative:F'], 
	#
	#  ['corpusName:brill', 'corpusName:bmj', 'corpusName:wiley', 'corpusName:elsevier',
	#   'corpusName:ecco', 'corpusName:eebo', 'corpusName:springer', 'corpusName:nature', 
	#   'corpusName:oup', 'corpusName:journals']
	# ]
	
	## list combos (cartesian product of field_outcomes)
	# we're directly unpacking *args into itertools.product()
	# (=> we get an iterator over tuples of combinable query chunks)
	combinations = product(*all_possibilities)
	
	
	# example for -c corpusName, publicationDate
	#	[
	#	('corpusName:ecco', 'publicationDate:[* TO 1959]'),
	#	('corpusName:ecco', 'publicationDate:[1960 TO 1999]'),
	#	('corpusName:ecco', 'publicationDate:[2000 TO *]'),
	#	('corpusName:elsevier', 'publicationDate:[* TO 1959]'),
	#	('corpusName:elsevier', 'publicationDate:[1960 TO 1999]'),
	#	('corpusName:elsevier', 'publicationDate:[2000 TO *]'),
	#	(...)
	#	]
	
	# ---------------------------------------------------------------
	# (2) getting total counts for each combination of criteria ------
	
	# number of counted answers
	#  (1 doc can give several hits if a criterion was multivalued)
	N_reponses = 0
	
	# do the counting for each combo
	for i, combi in enumerate(sorted(combinations)):
		if i % 100 == 0:
			print("pool %i/%i" % (i,n_combos), file=stderr)
		
		query = " AND ".join(combi)
		
		# counting requests ++++
		freq = api.count(query)
		
		# print(freq)
		
		if verbose:
			print("pool:'% -30s': % 8i" %(query,freq),file=stderr)
		
		# storing and aggregation
		N_reponses += freq
		abs_freqs[query] = freq
	
	# number of documents sending answers (hence normalizing constant N)
	N_workdocs = api.count(" AND ".join([k+":*" for k in crit_fields]))
	
	if verbose:
		print("--------- pool totals -----------", file=stderr)
		print("#answered hits :   % 12s" % N_reponses, file=stderr)
		print("#workdocs (N) :    % 12s" % N_workdocs, file=stderr)
		# for comparison: all_docs = N + api.count(q="NOT(criterion:*)")
		doc_grand_total = api.count(q='*')
		print("#all API docs fyi: % 12s" % doc_grand_total,file=stderr)
		print("---------------------------------", file=stderr)
	
	# resulting pool info in f + various totals
	return {"f":abs_freqs, "nr":N_reponses, "nd":N_workdocs, "totd":doc_grand_total}
Example #2
def sample(size, crit_fields, constraint_query=None, index=None, 
           verbose=False, run_count = 0):
	global LOG
	global LISSAGE
	global FORBIDDEN_IDS
	
	# allows setting the default to None instead of a tricky-scoped mutable {}
	if not index:
		index = {}
		flag_previous_index = False
	else:
		flag_previous_index = True
	
	
	####### POOLING ########
	
	# instead of calling pooling(), maybe we have already cached the pools?
	# (the counts are always the same for given criteria) => cached as json
	cache_filename = pool_cache_path(crit_fields)
	print('...checking cache for %s' % cache_filename,file=stderr)
	
	if path.exists(cache_filename):
		cache = open(cache_filename, 'r')
		pool_info = load(cache)
		print('...ok cache (%i workdocs)' % pool_info['nd'],file=stderr)
	else:
		print('...no cache found',file=stderr)
		# -> run doc count foreach(combination of crit fields facets) <-
		#        ---------               
		pool_info = field_combo_count.pooling(crit_fields, verbose)
		#           -----------------
	
	# in both cases, the json has the same structure
	abs_freqs       = pool_info['f']
	N_reponses      = pool_info['nr']
	N_workdocs      = pool_info['nd']
	doc_grand_total = pool_info['totd']
	
	# cache write
	cache = open(cache_filename, 'w')
	dump(pool_info, cache, indent=1, sort_keys=True)
	cache.close()
	
	######### QUOTA ########
	#
	# quota computation = target_corpus_size * pool / N
	rel_freqs = {}
	for combi_query in abs_freqs:
		
		# experiment with N_reponses as the denominator?
		quota = round(
		  size * abs_freqs[combi_query] / N_workdocs + LISSAGE
		)
		
		if quota != 0:
			rel_freqs[combi_query] = quota
	
	# fyi: 3 lines to check for rounding surprises
	rndd_size = sum([quota for combi_query, quota in rel_freqs.items()])
	if verbose:
		print("Méthode des quotas taille avec arrondis:     % 9s" % rndd_size,
		      file=stderr)
	
	# retrieval WITH CONSTRAINT, and check of the total available (retrieved + deduplicated)
	
	# got_ids_idx keys   = set of ids,
	#             values = the criteria that led to each pick
	
	print("Retrieving random samples in each quota...", file=stderr)
	
	for combi_query in sorted(rel_freqs.keys()):
		
		json_hits = []
		
		# how many hits do we need?
		my_quota = rel_freqs[combi_query]
		
		# adding constraints
		if constraint_query:
			my_query = '('+combi_query+') AND ('+constraint_query+')'
			# for the available indices, we must recount with the constraint
			all_indices = [i for i in range(api.count(my_query))]
		
		# if there is no constraint, the indices available
		# for the random draw are simply [0:freq]
		else:
			my_query = combi_query
			all_indices = [i for i in range(abs_freqs[combi_query])]
		
		
		# we run search one by one, with the drawn indices as FROM offsets ----
		
		# here => random order
		shuffle(all_indices)
		
		# we keep only the first n (the draw)
		local_tirage = all_indices[0:my_quota]
		
		# for info
		if verbose:
			print(" ... drawing among %i docs :\n ... picked => %s" % (len(all_indices), local_tirage), file=stderr)
		
		for indice in local_tirage:
			# ----------------- api.search(...) ----------------------------
			new_hit = api.search(my_query, 
								   limit=1,
								   i_from=indice,
								   n_docs=abs_freqs[combi_query],
								   outfields=STD_MAP.keys())
				# outfields=('id','author.name','title','publicationDate','corpusName')
			
			if len(new_hit) != 1:
				# skip empty results
				# (due to a constraint)
				continue
			else:
				# record it
				json_hits.append(new_hit.pop())
			# --------------------------------------------------------------
		
		# NB: the 'id' field would be enough for the sampling itself, but we get
		#     more metadata to be able to provide an info table
		my_n_answers = len(json_hits)
		
		my_n_got = 0
		
		# for debug
		# print("HITS:",json_hits, file=stderr)
		
		
		# check unicity
		for hit in json_hits:
			idi = hit['id']
			
			if idi not in index and idi not in FORBIDDEN_IDS:
				# print(hit)
				# exit()
				my_n_got += 1
				# main index
				index[idi] = {
					'_q': combi_query,
					'co': hit['corpusName'][0:3]  # trigram, e.g. 'els'
					}
				# store info
				# £TODO: check conventions for null values
				# £TODO: add all of this to STD_MAP
				if 'publicationDate' in hit and len(hit['publicationDate']):
					index[idi]['yr'] = hit['publicationDate'][0:4]
				else:
					index[idi]['yr'] = 'XXXX'
				
				if 'title' in hit and len(hit['title']):
					index[idi]['ti'] = hit['title']
				else:
					index[idi]['ti'] = "UNTITLED"
				
				if 'author' in hit and len(hit['author'][0]['name']):
					first_auth = hit['author'][0]['name']
					his_lastname = first_auth.split()[-1]
					index[idi]['au'] = his_lastname
				else:
					index[idi]['au'] = "UNKNOWN"
				
				if 'language' in hit and len(hit['language']):
					index[idi]['lg'] = hit['language'][0]
				else:
					index[idi]['lg'] = "UNKOWN_LANG"
				
				if 'genre' in hit and len(hit['genre']):
					index[idi]['typ'] = hit['genre'][0]
				else:
					index[idi]['typ'] = "UNKOWN_GENRE"
				
				if 'categories' in hit and len(hit['categories']) and 'wos' in hit['categories'] and len(hit['categories']['wos']):
					index[idi]['cat'] = "/".join(hit['categories']['wos'])
				else:
					index[idi]['cat'] = "UNKOWN_SCI_CAT"
				
				if 'qualityIndicators' in hit:
					if 'pdfVersion' in hit['qualityIndicators']:
						index[idi]['ver'] = hit['qualityIndicators']['pdfVersion']
					else:
						index[idi]['ver'] = "UNKNOWN_PDFVER"
					if 'pdfWordCount' in hit['qualityIndicators']:
						index[idi]['wcp'] = hit['qualityIndicators']['pdfWordCount']
					else:
						index[idi]['wcp'] = "UNKNOWN_PDFWORDCOUNT"
					if 'refBibsNative' in hit['qualityIndicators']:
						index[idi]['bibnat'] = hit['qualityIndicators']['refBibsNative']
					else:
						index[idi]['bibnat'] = "UNKNOWN_REFBIBSNATIVE"
				else:
					index[idi]['ver'] = "UNKNOWN_PDFVER"
					index[idi]['wcp'] = "UNKNOWN_PDFWORDCOUNT"
					index[idi]['bibnat'] = "UNKNOWN_REFBIBSNATIVE"
		
		print ("done %-70s: %i/%i" % (
					my_query[0:64]+"...", 
					my_n_got, 
					my_quota
				), file=stderr)
		
		# if within whole sample_size scope, we may observe unmeetable
		# representativity criteria (marked 'LESS' and checked for RLAX)
		if run_count == 0 and my_n_got < (.85 * (my_quota - LISSAGE)):
			my_s = "" if my_n_got == 1 else "s"
			LOG.append("LESS: catégorie '%s' sous-représentée pour contrainte \"%s\" : %i doc%s obtenu%s sur %i quota" % (combi_query, constraint_query, my_n_got, my_s, my_s, my_quota))
			
		# print("==========my_corpus ITEMS===========")
		# print([kval for kval in my_corpus.items()])
		
	return(index)
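
# --- usage sketch (not part of the original function; illustrative only) ---
# A minimal call of sample(), assuming LOG, LISSAGE, FORBIDDEN_IDS, STD_MAP
# and the api / field_combo_count / pool_cache_path helpers referenced above
# are initialised elsewhere in the module; the constraint below is just an
# example value.
if __name__ == '__main__':
	picked = sample(500,
		['corpusName', 'publicationDate'],
		constraint_query='qualityIndicators.refBibsNative:T',
		verbose=True)
	# `picked` maps each selected doc id to its quota query and short metadata
	for doc_id, meta in sorted(picked.items())[0:5]:
		print(doc_id, meta['co'], meta['yr'], meta['au'], meta['ti'][0:40])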
Example #3
def pooling(crit_fields, verbose=False):
	####### POOLING ########
	#
	N_reponses = 0
	N_workdocs = 0
	doc_grand_total = 0
	# dict of counts for each combo ((crit1:val_1a),(crit2:val_2a)...)
	abs_freqs = {}
	
	# ---------------------------------------------------------------
	# (1) PARTITIONING THE SEARCH SPACE IN POSSIBLE OUTCOMES --------
	print("Sending count queries for criteria pools...",file=stderr)
	## build all "field:values" pairs per criterion field
	## (list of list of strings: future lucene query chunks)
	all_possibilities = []
	
	# a little tribute to our colleague Nourdine Combo!
	n_combos = 1
	
	for my_criterion in crit_fields:
		# print("CRIT",my_criterion)
		field_outcomes = facet_vals(my_criterion)
		# print("field_outcomes",field_outcomes)
		n_combos = n_combos * len(field_outcomes)
		# lucene query chunks
		all_possibilities.append(
			[my_criterion + ':' + val for val in field_outcomes]
		)
	
	# e.g. 2 criteria will produce 2 lists in all_possibilities
	# [
	#  ['qualityIndicators.refBibsNative:T', 'qualityIndicators.refBibsNative:F'], 
	#
	#  ['corpusName:brill', 'corpusName:bmj', 'corpusName:wiley', 'corpusName:elsevier',
	#   'corpusName:ecco', 'corpusName:eebo', 'corpusName:springer', 'corpusName:nature', 
	#   'corpusName:oup', 'corpusName:journals']
	# ]
	
	## list combos (cartesian product of field_outcomes)
	# we're directly unpacking *args into itertools.product()
	# (=> we get an iterator over tuples of combinable query chunks)
	combinations = product(*all_possibilities)
	
	
	# example for -c corpusName, publicationDate
	#	[
	#	('corpusName:ecco', 'publicationDate:[* TO 1959]'),
	#	('corpusName:ecco', 'publicationDate:[1960 TO 1999]'),
	#	('corpusName:ecco', 'publicationDate:[2000 TO *]'),
	#	('corpusName:elsevier', 'publicationDate:[* TO 1959]'),
	#	('corpusName:elsevier', 'publicationDate:[1960 TO 1999]'),
	#	('corpusName:elsevier', 'publicationDate:[2000 TO *]'),
	#	(...)
	#	]
	
	# ---------------------------------------------------------------
	# (2) getting total counts for each combination of criteria ------
	
	# number of counted answers
	#  (1 doc can give several hits if a criterion was multivalued)
	N_reponses = 0
	
	# do the counting for each combo
	for i, combi in enumerate(sorted(combinations)):
		if i % 100 == 0:
			print("pool %i/%i" % (i,n_combos), file=stderr)
		
		query = " AND ".join(combi)
		
		# counting requests ++++
		freq = api.count(query)
		
		# print(freq)
		
		if verbose:
			print("pool:'% -30s': % 8i" %(query,freq),file=stderr)
		
		# storing and aggregation
		N_reponses += freq
		abs_freqs[query] = freq
	
	# number of documents sending answers (hence normalizing constant N)
	N_workdocs = api.count(" AND ".join([k+":*" for k in crit_fields]))
	
	if verbose:
		print("--------- pool totals -----------", file=stderr)
		print("#answered hits :   % 12s" % N_reponses, file=stderr)
		print("#workdocs (N) :    % 12s" % N_workdocs, file=stderr)
		# for comparison: all_docs = N + api.count(q="NOT(criterion:*)")
		doc_grand_total = api.count(q='*')
		print("#all API docs fyi: % 12s" % doc_grand_total,file=stderr)
		print("---------------------------------", file=stderr)
	
	# resulting pool info in f + various totals
	return {'f':abs_freqs, 'nr':N_reponses, 'nd':N_workdocs, 'totd':doc_grand_total}
Example #4
def pooling(crit_fields, verbose=False):
    ####### POOLING ########
    #
    N_reponses = 0
    N_workdocs = 0
    doc_grand_total = 0
    # dict of counts for each combo ((crit1:val_1a),(crit2:val_2a)...)
    abs_freqs = {}

    # ---------------------------------------------------------------
    # (1) PARTITIONING THE SEARCH SPACE IN POSSIBLE OUTCOMES --------
    print("Sending count queries for criteria pools...", file=stderr)
    ## build all "field:values" pairs per criterion field
    ## (list of list of strings: future lucene query chunks)
    all_possibilities = []

    # a little tribute to our colleague Nourdine Combo!
    n_combos = 1

    for my_criterion in crit_fields:
        # print("CRIT",my_criterion)
        field_outcomes = facet_vals(my_criterion)
        # print("field_outcomes",field_outcomes)
        n_combos = n_combos * len(field_outcomes)
        # lucene query chunks
        all_possibilities.append(
            [my_criterion + ':' + val for val in field_outcomes])

    # e.g. 2 criteria will produce 2 lists in all_possibilities
    # [
    #  ['qualityIndicators.refBibsNative:T', 'qualityIndicators.refBibsNative:F'],
    #
    #  ['corpusName:brill', 'corpusName:bmj', 'corpusName:wiley', 'corpusName:elsevier',
    #   'corpusName:ecco', 'corpusName:eebo', 'corpusName:springer', 'corpusName:nature',
    #   'corpusName:oup', 'corpusName:journals']
    # ]

    ## list combos (cartesian product of field_outcomes)
    # we're directly unpacking *args into itertools.product()
    # (=> we get an iterator over tuples of combinable query chunks)
    combinations = product(*all_possibilities)

    # example for -c corpusName, publicationDate
    #	[
    #	('corpusName:ecco', 'publicationDate:[* TO 1959]'),
    #	('corpusName:ecco', 'publicationDate:[1960 TO 1999]'),
    #	('corpusName:ecco', 'publicationDate:[2000 TO *]'),
    #	('corpusName:elsevier', 'publicationDate:[* TO 1959]'),
    #	('corpusName:elsevier', 'publicationDate:[1960 TO 1999]'),
    #	('corpusName:elsevier', 'publicationDate:[2000 TO *]'),
    #	(...)
    #	]

    # ---------------------------------------------------------------
    # (2) getting total counts for each combination of criteria ------

    # number of counted answers
    #  (1 doc can give several hits if a criterion was multivalued)
    N_reponses = 0

    # do the counting for each combo
    for i, combi in enumerate(sorted(combinations)):
        if i % 100 == 0:
            print("pool %i/%i" % (i, n_combos), file=stderr)

        query = " AND ".join(combi)

        # counting requests ++++
        freq = api.count(query)

        # print(freq)

        if verbose:
            print("pool:'% -30s': % 8i" % (query, freq), file=stderr)

        # storing and aggregation
        N_reponses += freq
        abs_freqs[query] = freq

    # number of documents sending answers (hence normalizing constant N)
    N_workdocs = api.count(" AND ".join([k + ":*" for k in crit_fields]))

    if verbose:
        print("--------- pool totals -----------", file=stderr)
        print("#answered hits :   % 12s" % N_reponses, file=stderr)
        print("#workdocs (N) :    % 12s" % N_workdocs, file=stderr)
        # for comparison: all_docs = N + api.count(q="NOT(criterion:*)")
        doc_grand_total = api.count(q='*')
        print("#all API docs fyi: % 12s" % doc_grand_total, file=stderr)
        print("---------------------------------", file=stderr)

    # resulting pool info in f + various totals
    return {
        'f': abs_freqs,
        'nr': N_reponses,
        'nd': N_workdocs,
        'totd': doc_grand_total
    }
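
# --- standalone sketch of step (1) above (illustrative only) ---------------
# The partitioning is a plain cartesian product of per-field facet values;
# this self-contained snippet uses hard-coded facets instead of facet_vals()
# just to show the shape of `combinations` and of the count queries.
from itertools import product

demo_possibilities = [
    ['corpusName:elsevier', 'corpusName:wiley'],
    ['publicationDate:[* TO 1959]', 'publicationDate:[1960 TO *]'],
]
for demo_combi in product(*demo_possibilities):
    # each tuple becomes one lucene count query, e.g.
    # "corpusName:elsevier AND publicationDate:[* TO 1959]"
    print(" AND ".join(demo_combi))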
Example #5
def sample(size,
           crit_fields,
           constraint_query=None,
           index=None,
           verbose=False,
           run_count=0):
    global LOG
    global LISSAGE
    global FORBIDDEN_IDS

    # allows setting the default to None instead of a tricky-scoped mutable {}
    if not index:
        index = {}
        flag_previous_index = False
    else:
        flag_previous_index = True

    ####### POOLING ########
    #
    N_reponses = 0
    N_workdocs = 0
    doc_grand_total = 0
    # dict of counts for each combo ((crit1:val_1a),(crit2:val_2a)...)
    abs_freqs = {}

    # instead of doing steps (1) and (2), maybe we have already cached the pools?
    # (the counts are always the same for given criteria) => cached as json
    cache_filename = pool_cache_path(crit_fields)
    print('...checking cache for %s' % cache_filename, file=stderr)

    if path.exists(cache_filename):
        cache = open(cache_filename, 'r')
        pool_info = load(cache)
        abs_freqs = pool_info['f']
        N_reponses = pool_info['nr']
        N_workdocs = pool_info['nd']
        doc_grand_total = pool_info['totd']
        print('...ok cache (%i workdocs)' % N_workdocs, file=stderr)
    else:
        print('...no cache found', file=stderr)

        # (1) PARTITIONING THE SEARCH SPACE IN POSSIBLE OUTCOMES --------
        print("Sending count queries for criteria pools...", file=stderr)
        ## build all "field:values" pairs per criterion field
        ## (list of list of strings: future lucene query chunks)
        all_possibilities = []
        n_combos = 1
        for my_criterion in crit_fields:
            field_outcomes = facet_vals(my_criterion)
            n_combos = n_combos * len(field_outcomes)
            # lucene query chunks
            all_possibilities.append(
                [my_criterion + ':' + val for val in field_outcomes])

        ## list combos (cartesian product of field_outcomes)
        # we're directly unpacking *args into itertools.product()
        # (=> we get an iterator over tuples of combinable query chunks)
        combinations = product(*all_possibilities)

        # example for -c corpusName, publicationDate
        #	[
        #	('corpusName:ecco', 'publicationDate:[* TO 1959]'),
        #	('corpusName:ecco', 'publicationDate:[1960 TO 1999]'),
        #	('corpusName:ecco', 'publicationDate:[2000 TO *]'),
        #	('corpusName:elsevier', 'publicationDate:[* TO 1959]'),
        #	('corpusName:elsevier', 'publicationDate:[1960 TO 1999]'),
        #	('corpusName:elsevier', 'publicationDate:[2000 TO *]'),
        #	(...)
        #	]

        # (2) getting total counts for each combination of criteria ------

        # number of counted answers
        #  (1 doc can give several hits if a criterion was multivalued)
        N_reponses = 0

        # do the counting for each combo
        for i, combi in enumerate(sorted(combinations)):
            if i % 100 == 0:
                print("pool %i/%i" % (i, n_combos), file=stderr)

            query = " AND ".join(combi)

            # counting requests ++++
            freq = api.count(query)

            if verbose:
                print("pool:'% -30s': % 8i" % (query, freq), file=stderr)

            # storing and aggregation
            N_reponses += freq
            abs_freqs[query] = freq

        # number of documents sending answers (hence normalizing constant N)
        N_workdocs = api.count(" AND ".join([k + ":*" for k in crit_fields]))

        if verbose:
            print("--------- pool totals -----------", file=stderr)
            print("#answered hits :   % 12s" % N_reponses, file=stderr)
            print("#workdocs (N) :    % 12s" % N_workdocs, file=stderr)
            # for comparison: all_docs = N + api.count(q="NOT(criterion:*)")
            doc_grand_total = api.count(q='*')
            print("#all API docs fyi: % 12s" % doc_grand_total, file=stderr)
            print("---------------------------------", file=stderr)

        # cache write
        cache = open(cache_filename, 'w')
        pool_info = {
            'f': abs_freqs,
            'nr': N_reponses,
            'nd': N_workdocs,
            'totd': doc_grand_total
        }
        # json.dump
        dump(pool_info, cache, indent=1)
        cache.close()

    ######### QUOTA ########
    #
    # (3) quota computation and availability checking ------------------
    # quota computation
    rel_freqs = {}
    for combi_query in abs_freqs:

        # experiment with N_reponses as the denominator?
        quota = round(size * abs_freqs[combi_query] / N_workdocs + LISSAGE)

        if quota != 0:
            rel_freqs[combi_query] = quota

    # fyi: 3 lines to check for rounding surprises
    rndd_size = sum([quota for combi_query, quota in rel_freqs.items()])
    if verbose:
        print("Méthode des quotas taille avec arrondis:     % 9s" % rndd_size,
              file=stderr)

    # retrieval WITH CONSTRAINT, and check of the total available (retrieved + deduplicated)

    # got_ids_idx keys   = set of ids,
    #             values = the criteria that led to each pick

    print("Retrieving new sample chunks per pool quota...", file=stderr)

    for combi_query in sorted(rel_freqs.keys()):

        # how many hits do we need?
        my_quota = rel_freqs[combi_query]
        if not flag_previous_index and not FORBIDDEN_IDS:
            # option A: direct quota allocation to search limit
            n_needed = my_quota
        else:
            # option B: limit larger than quota by retrieved amount
            #           (provides deduplication margin if 2nd run)
            #
            # /!\ wouldn't be necessary at all if we had no (or only rare)
            #     duplicates, e.g. with random result ranking

            # supplement 1: items to skip
            n_already_retrieved = len(
                # lookup retrieved
                [
                    idi for idi, metad in index.items()
                    if search(escape(combi_query), metad['_q'])
                ])

            # supplement 2: pro-rata share of FORBIDDEN_IDS
            suppl = round(len(FORBIDDEN_IDS) * my_quota / size)
            n_already_retrieved += suppl
            n_needed = my_quota + n_already_retrieved

        # adding constraints
        if constraint_query:
            my_query = '(' + combi_query + ') AND (' + constraint_query + ')'
        else:
            my_query = combi_query

        # ----------------- api.search(...) ----------------------------
        json_hits = api.search(my_query,
                               limit=n_needed,
                               outfields=STD_MAP.keys())
        # outfields=('id','author.name','title','publicationDate','corpusName')

        # --------------------------------------------------------------

        # NB: the 'id' field would be enough for the sampling itself, but we get
        #     more metadata to be able to provide an info table or to
        #     create a human-readable filename

        # £TODO 1
        # replace api.search() with a future random_search function,
        # cf. the elasticsearch guide: "random scoring" (=> then remove
        # option B and n_needed)

        my_n_answers = len(json_hits)

        my_n_got = 0

        # for debug
        # print("HITS:",json_hits, file=stderr)

        # check unicity
        for hit in json_hits:
            idi = hit['id']

            if idi not in index and idi not in FORBIDDEN_IDS:
                # print(hit)
                # exit()
                my_n_got += 1
                # main index
                index[idi] = {
                    '_q': combi_query,
                    'co': hit['corpusName'][0:3]  # trigram, e.g. 'els'
                }
                # store info
                # £TODO: check conventions for null values
                # £TODO: add all of this to STD_MAP
                if 'publicationDate' in hit and len(hit['publicationDate']):
                    index[idi]['yr'] = hit['publicationDate'][0:4]
                else:
                    index[idi]['yr'] = 'XXXX'

                if 'title' in hit and len(hit['title']):
                    index[idi]['ti'] = hit['title']
                else:
                    index[idi]['ti'] = "UNTITLED"

                if 'author' in hit and len(hit['author'][0]['name']):
                    first_auth = hit['author'][0]['name']
                    his_lastname = first_auth.split()[-1]
                    index[idi]['au'] = his_lastname
                else:
                    index[idi]['au'] = "UNKNOWN"

                if 'language' in hit and len(hit['language']):
                    index[idi]['lg'] = hit['language'][0]
                else:
                    index[idi]['lg'] = "UNKOWN_LANG"

                if 'genre' in hit and len(hit['genre']):
                    index[idi]['typ'] = hit['genre'][0]
                else:
                    index[idi]['typ'] = "UNKOWN_GENRE"

                if 'categories' in hit and len(
                        hit['categories']
                ) and 'wos' in hit['categories'] and len(
                        hit['categories']['wos']):
                    index[idi]['cat'] = "/".join(hit['categories']['wos'])
                else:
                    index[idi]['cat'] = "UNKOWN_SCI_CAT"

                if 'qualityIndicators' in hit and 'pdfVersion' in hit[
                        'qualityIndicators']:
                    index[idi]['ver'] = hit['qualityIndicators']['pdfVersion']
                else:
                    index[idi]['ver'] = "UNKNOWN_PDFVER"

                if 'qualityIndicators' in hit and 'pdfWordCount' in hit[
                        'qualityIndicators']:
                    index[idi]['wcp'] = hit['qualityIndicators'][
                        'pdfWordCount']
                else:
                    index[idi]['wcp'] = "UNKNOWN_PDFWORDCOUNT"

            # recheck limit: needed as long as n_needed != my_quota
            # (should disappear as consequence of removing option B)
            if my_n_got == my_quota:
                break

        print("%-70s: %i(%i)/%i" %
              (my_query[0:67] + "...", my_n_got, my_n_answers, my_quota),
              file=stderr)

        # if within whole sample_size scope, we may observe unmeetable
        # representativity criteria (marked 'LESS' and checked for RLAX)
        if run_count == 0 and my_n_got < (.85 * (my_quota - LISSAGE)):
            my_s = "" if my_n_got == 1 else "s"
            LOG.append(
                "LESS: catégorie '%s' sous-représentée pour contrainte \"%s\" : %i doc%s obtenu%s sur %i quota"
                % (combi_query, constraint_query, my_n_got, my_s, my_s,
                   my_quota))

        # print("==========my_corpus ITEMS===========")
        # print([kval for kval in my_corpus.items()])

    return (index)
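
# --- arithmetic sketch of the quota logic above (illustrative only) --------
# Reproduces the two formulas used in sample() with made-up numbers; the
# LISSAGE smoothing value (0.5 here) is an assumption for the example:
#   quota    = round(size * pool_freq / N_workdocs + LISSAGE)
#   n_needed = quota + n_already_retrieved
#              + round(len(FORBIDDEN_IDS) * quota / size)   # option B margin
def demo_quota(size, pool_freq, n_workdocs, lissage=0.5):
    # smoothed proportional allocation for one pool
    return round(size * pool_freq / n_workdocs + lissage)

if __name__ == '__main__':
    # a pool of 275461 docs out of 15597857 workdocs, 1000-doc target sample
    print(demo_quota(1000, 275461, 15597857))   # => 18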
Example #6
def sample(size, crit_fields, constraint_query=None, index=None, 
           verbose=False, run_count = 0):
	global LOG
	global LISSAGE
	global FORBIDDEN_IDS
	
	# allows setting the default to None instead of a tricky-scoped mutable {}
	if not index:
		index = {}
		flag_previous_index = False
	else:
		flag_previous_index = True
	
	
	####### POOLING ########
	
	# instead of calling pooling(), maybe we have already cached the pools?
	# (the counts are always the same for given criteria) => cached as json
	cache_filename = pool_cache_path(crit_fields)
	print('...checking cache for %s' % cache_filename,file=stderr)
	
	if path.exists(cache_filename):
		cache = open(cache_filename, 'r')
		pool_info = load(cache)
		print('...ok cache (%i workdocs)' % pool_info['nd'],file=stderr)
	else:
		print('...no cache found',file=stderr)
		# -> run doc count foreach(combination of crit fields facets) <-
		#        ---------               
		pool_info = field_combo_count.pooling(crit_fields, verbose)
		#           -----------------
	
	# in both cases, the json has the same structure
	abs_freqs       = pool_info['f']
	N_reponses      = pool_info['nr']
	N_workdocs      = pool_info['nd']
	doc_grand_total = pool_info['totd']
	
	# cache write
	cache = open(cache_filename, 'w')
	dump(pool_info, cache, indent=1, sort_keys=True)
	cache.close()
	
	######### QUOTA ########
	#
	# quota computation = target_corpus_size * pool / N
	rel_freqs = {}
	for combi_query in abs_freqs:
		
		# experiment with N_reponses as the denominator?
		quota = round(
		  size * abs_freqs[combi_query] / N_workdocs + LISSAGE
		)
		
		if quota != 0:
			rel_freqs[combi_query] = quota
	
	# fyi: 3 lines to check for rounding surprises
	rndd_size = sum([quota for combi_query, quota in rel_freqs.items()])
	if verbose:
		print("Méthode des quotas taille avec arrondis:     % 9s" % rndd_size,
		      file=stderr)
	
	# retrieval WITH CONSTRAINT, and check of the total available (retrieved + deduplicated)
	
	# got_ids_idx keys   = set of ids,
	#             values = the criteria that led to each pick
	
	print("Retrieving random samples in each quota...", file=stderr)
	
	for combi_query in sorted(rel_freqs.keys()):
		
		json_hits = []
		
		# how many hits do we need?
		my_quota = rel_freqs[combi_query]
		
		# adding constraints
		if constraint_query:
			my_query = '('+combi_query+') AND ('+constraint_query+')'
			# for the available indices, we must recount with the constraint
			all_indices = [i for i in range(api.count(my_query))]
		
		# if there is no constraint, the indices available
		# for the random draw are simply [0:freq]
		else:
			my_query = combi_query
			all_indices = [i for i in range(abs_freqs[combi_query])]
		
		
		# we run search one by one, with the drawn indices as FROM offsets ----
		
		# here => random order
		shuffle(all_indices)
		
		# we keep only the first n (the draw)
		local_tirage = all_indices[0:my_quota]
		
		# for info
		if verbose:
			print(" ... drawing among %i docs :\n ... picked => %s" % (len(all_indices), local_tirage))
		
		for indice in local_tirage:
			# ----------------- api.search(...) ----------------------------
			new_hit = api.search(my_query, 
								   limit=1,
								   i_from=indice,
								   n_docs=abs_freqs[combi_query],
								   outfields=STD_MAP.keys())
				# outfields=('id','author.name','title','publicationDate','corpusName')
			
			if len(new_hit) != 1:
				# skip empty results
				# (due to a constraint)
				continue
			else:
				# record it
				json_hits.append(new_hit.pop())
			# --------------------------------------------------------------
		
		# NB: the 'id' field would be enough for the sampling itself, but we get
		#     more metadata to be able to provide an info table
		my_n_answers = len(json_hits)
		
		my_n_got = 0
		
		# for debug
		# print("HITS:",json_hits, file=stderr)
		
		
		# check unicity
		for hit in json_hits:
			idi = hit['id']
			
			if idi not in index and idi not in FORBIDDEN_IDS:
				# print(hit)
				# exit()
				my_n_got += 1
				# main index
				index[idi] = {
					'_q': combi_query,
					'co': hit['corpusName'][0:3]  # trigram, e.g. 'els'
					}
				# store info
				# £TODO: check conventions for null values
				# £TODO: add all of this to STD_MAP
				if 'publicationDate' in hit and len(hit['publicationDate']):
					index[idi]['yr'] = hit['publicationDate'][0:4]
				else:
					index[idi]['yr'] = 'XXXX'
				
				if 'title' in hit and len(hit['title']):
					index[idi]['ti'] = hit['title']
				else:
					index[idi]['ti'] = "UNTITLED"
				
				if 'author' in hit and len(hit['author'][0]['name']):
					first_auth = hit['author'][0]['name']
					his_lastname = first_auth.split()[-1]
					index[idi]['au'] = his_lastname
				else:
					index[idi]['au'] = "UNKNOWN"
				
				if 'language' in hit and len(hit['language']):
					index[idi]['lg'] = hit['language'][0]
				else:
					index[idi]['lg'] = "UNKOWN_LANG"
				
				if 'genre' in hit and len(hit['genre']):
					index[idi]['typ'] = hit['genre'][0]
				else:
					index[idi]['typ'] = "UNKOWN_GENRE"
				
				if 'categories' in hit and len(hit['categories']) and 'wos' in hit['categories'] and len(hit['categories']['wos']):
					index[idi]['cat'] = "/".join(hit['categories']['wos'])
				else:
					index[idi]['cat'] = "UNKOWN_SCI_CAT"
				
				if 'qualityIndicators' in hit:
					if 'pdfVersion' in hit['qualityIndicators']:
						index[idi]['ver'] = hit['qualityIndicators']['pdfVersion']
					else:
						index[idi]['ver'] = "UNKNOWN_PDFVER"
					if 'pdfWordCount' in hit['qualityIndicators']:
						index[idi]['wcp'] = hit['qualityIndicators']['pdfWordCount']
					else:
						index[idi]['wcp'] = "UNKNOWN_PDFWORDCOUNT"
					if 'refBibsNative' in hit['qualityIndicators']:
						index[idi]['bibnat'] = hit['qualityIndicators']['refBibsNative']
					else:
						index[idi]['bibnat'] = "UNKNOWN_REFBIBSNATIVE"
				else:
					index[idi]['ver'] = "UNKNOWN_PDFVER"
					index[idi]['wcp'] = "UNKNOWN_PDFWORDCOUNT"
					index[idi]['bibnat'] = "UNKNOWN_REFBIBSNATIVE"
		
		print ("done %-70s: %i/%i" % (
					my_query[0:64]+"...", 
					my_n_got, 
					my_quota
				), file=stderr)
		
		# if within whole sample_size scope, we may observe unmeetable
		# representativity criteria (marked 'LESS' and checked for RLAX)
		if run_count == 0 and my_n_got < (.85 * (my_quota - LISSAGE)):
			my_s = "" if my_n_got == 1 else "s"
			LOG.append("LESS: catégorie '%s' sous-représentée pour contrainte \"%s\" : %i doc%s obtenu%s sur %i quota" % (combi_query, constraint_query, my_n_got, my_s, my_s, my_quota))
			
		# print("==========my_corpus ITEMS===========")
		# print([kval for kval in my_corpus.items()])
		
	return(index)
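
# --- standalone sketch of the random-offset draw above (illustrative only) -
# The per-pool sampling shuffles the available result offsets and fetches one
# hit at each kept offset (FROM); shown here with the api.search call left as
# a comment, since only the drawing logic is illustrated.
from random import shuffle

def demo_draw(total_hits, quota):
	# all available offsets for this pool, in random order
	offsets = list(range(total_hits))
	shuffle(offsets)
	# keep only the first `quota` offsets (the draw)
	return offsets[0:quota]

if __name__ == '__main__':
	picked_offsets = demo_draw(total_hits=120, quota=5)
	for offset in picked_offsets:
		# the real code above would call api.search(my_query, limit=1, i_from=offset, ...)
		print("would fetch 1 hit at offset", offset)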