Example #1
def pooling(crit_fields, verbose=False):
	"""
	Sorte de tableau croisé: compte le nombre de docs pour chaque combinaison de critères
	
	Renvoie un dict avec ces décomptes et les totaux
	
	Exemple pour les critères corpusName et pdfCharCount
	{
	  "f": {
		"corpusName:bmj AND pdfCharCount:[* TO 1999]": 24524,
		"corpusName:bmj AND pdfCharCount:[2000 TO *]": 662848,
		"corpusName:brill-journals AND pdfCharCount:[* TO 1999]": 10949,
		"corpusName:brill-journals AND pdfCharCount:[2000 TO *]": 119318,
		"corpusName:elsevier AND pdfCharCount:[* TO 1999]": 275461,
		"corpusName:elsevier AND pdfCharCount:[2000 TO *]": 5740132,
		"corpusName:nature AND pdfCharCount:[* TO 1999]": 332156,
		"corpusName:nature AND pdfCharCount:[2000 TO *]": 45139,
		"corpusName:oup AND pdfCharCount:[* TO 1999]": 58662,
		"corpusName:oup AND pdfCharCount:[2000 TO *]": 1385591,
		"corpusName:springer AND pdfCharCount:[* TO 1999]": 61973,
		"corpusName:springer AND pdfCharCount:[2000 TO *]": 2242902,
		"corpusName:wiley AND pdfCharCount:[* TO 1999]": 593998,
		"corpusName:wiley AND pdfCharCount:[2000 TO *]": 4044204
	  },
	  "totd": 15982692         # nombre de docs au total dans la base
	  "nd": 15597857,          # nombre de docs pour l'ensemble des critères
	  "nr": 15597857,          # nombre de réponses pour l'ensemble des critère 
	                           # (intéressant pour les champs "QCM")
	}

	NB: the choice of value ranges and facet values is
	    configurable in field_value_lists.py
	"""
	####### POOLING ########
	#
	N_reponses = 0
	N_workdocs = 0
	doc_grand_total = 0
	# dict of counts for each combo ((crit1:val_1a),(crit2:val_2a)...)
	abs_freqs = {}
	
	# ---------------------------------------------------------------
	# (1) PARTITIONING THE SEARCH SPACE INTO POSSIBLE OUTCOMES --------
	print("Sending count queries for criteria pools...", file=stderr)
	## build all "field:values" pairs per criterion field
	## (list of list of strings: future lucene query chunks)
	all_possibilities = []
	
	# a little tribute to our colleague Nourdine Combo!
	n_combos = 1
	
	for my_criterion in crit_fields:
		# print("CRIT",my_criterion)
		field_outcomes = facet_vals(my_criterion)
		# print("field_outcomes",field_outcomes)
		n_combos = n_combos * len(field_outcomes)
		# lucene query chunks
		all_possibilities.append(
			[my_criterion + ':' + val for val in field_outcomes]
		)
	
	# e.g.: 2 criteria will give 2 lists in all_possibilities
	# [
	#  ['qualityIndicators.refBibsNative:T', 'qualityIndicators.refBibsNative:F'], 
	#
	#  ['corpusName:brill', 'corpusName:bmj', 'corpusName:wiley', 'corpusName:elsevier',
	#   'corpusName:ecco', 'corpusName:eebo', 'corpusName:springer', 'corpusName:nature', 
	#   'corpusName:oup', 'corpusName:journals']
	# ]
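	# (for this example: n_combos = 2 * 10 = 20 count queries will be sent)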
	
	## list combos (cartesian product of field_outcomes)
	# we're directly unpacking *args into itertools.product()
	# (=> we get an iterator over tuples of combinable query chunks)
	combinations = product(*all_possibilities)
	
	
	# example for -c corpusName, publicationDate
	#	[
	#	('corpusName:ecco', 'publicationDate:[* TO 1959]'),
	#	('corpusName:ecco', 'publicationDate:[1960 TO 1999]'),
	#	('corpusName:ecco', 'publicationDate:[2000 TO *]'),
	#	('corpusName:elsevier', 'publicationDate:[* TO 1959]'),
	#	('corpusName:elsevier', 'publicationDate:[1960 TO 1999]'),
	#	('corpusName:elsevier', 'publicationDate:[2000 TO *]'),
	#	(...)
	#	]
	
	# ---------------------------------------------------------------
	# (2) getting total counts for each criteria combination ----------
	
	# number of counted answers
	#  (1 doc can give several hits if a criterion was multivalued)
	N_reponses = 0
	
	# do the counting for each combo
	for i, combi in enumerate(sorted(combinations)):
		if i % 100 == 0:
			print("pool %i/%i" % (i,n_combos), file=stderr)
		
		query = " AND ".join(combi)
		
		# counting requests ++++
		freq = api.count(query)
		
		# print(freq)
		
		if verbose:
			print("pool:'% -30s': % 8i" %(query,freq),file=stderr)
		
		# storing and aggregation
		N_reponses += freq
		abs_freqs[query] = freq
	
	# number of documents that yield answers (hence the normalizing constant N)
	N_workdocs = api.count(" AND ".join([k+":*" for k in crit_fields]))
	
	# always counted, so that the returned "totd" total is filled in
	# even when not verbose
	# (for comparison: all_docs = N + api.count(q="NOT(criterion:*)"))
	doc_grand_total = api.count(q='*')
	
	if verbose:
		print("--------- pool totals -----------", file=stderr)
		print("#answered hits :   % 12s" % N_reponses, file=stderr)
		print("#workdocs (N) :    % 12s" % N_workdocs, file=stderr)
		print("#all API docs fyi: % 12s" % doc_grand_total, file=stderr)
		print("---------------------------------", file=stderr)
	
	# resulting pool info in f + various totals
	return {"f":abs_freqs, "nr":N_reponses, "nd":N_workdocs, "totd":doc_grand_total}
Example #2
def sample(size, crit_fields, constraint_query=None, index=None, 
           verbose=False, run_count = 0):
	global LOG
	global LISSAGE
	global FORBIDDEN_IDS
	
	# allows setting the default to None instead of a tricky-scope mutable {}
	if not index:
		index = {}
		flag_previous_index = False
	else:
		flag_previous_index = True
	
	
	####### POOLING ########
	
	# instead of calling pooling(), maybe we already have the pools cached?
	# (always same counts for given criteria) => cache to json
	cache_filename = pool_cache_path(crit_fields)
	print('...checking cache for %s' % cache_filename,file=stderr)
	
	if path.exists(cache_filename):
		cache = open(cache_filename, 'r')
		pool_info = load(cache)
		print('...ok cache (%i workdocs)' % pool_info['nd'],file=stderr)
	else:
		print('...no cache found',file=stderr)
		# -> run doc count foreach(combination of crit fields facets) <-
		#        ---------               
		pool_info = field_combo_count.pooling(crit_fields, verbose)
		#           -----------------
	
	# in both cases, the json has the same shape
	abs_freqs       = pool_info['f']
	N_reponses      = pool_info['nr']
	N_workdocs      = pool_info['nd']
	doc_grand_total = pool_info['totd']
	
	# cache write
	cache = open(cache_filename, 'w')
	dump(pool_info, cache, indent=1, sort_keys=True)
	cache.close()
	
	######### QUOTA ########
	#
	# quota computation = target_corpus_size * pool / N
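	# worked example (hypothetical size=10000, pool counts from the pooling()
	# docstring above): the combo "corpusName:bmj AND pdfCharCount:[2000 TO *]"
	# has 662848 docs out of N_workdocs=15597857, so its quota is
	# round(10000 * 662848 / 15597857 + LISSAGE), i.e. about 425 before
	# the LISSAGE smoothing term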
	rel_freqs = {}
	for combi_query in abs_freqs:
		
		# experiment with N_reponses as the denominator?
		quota = round(
		  size * abs_freqs[combi_query] / N_workdocs + LISSAGE
		)
		
		if quota != 0:
			rel_freqs[combi_query] = quota
	
	# fyi: 3 lines to check for rounding surprises
	rndd_size = sum([quota for combi_query, quota in rel_freqs.items()])
	if verbose:
		print("Méthode des quotas taille avec arrondis:     % 9s" % rndd_size,
		      file=stderr)
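	# (rndd_size can drift from the requested size because each cell's quota
	#  is rounded and smoothed independently)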
	
	# retrieval WITH CONSTRAINT and check of the total available (retrieved + deduplicated)
	
	# got_ids_idx: keys = a set of ids,
	#              values = the criteria that led to their selection
	
	print("Retrieving random samples in each quota...", file=stderr)
	
	for combi_query in sorted(rel_freqs.keys()):
		
		json_hits = []
		
		# how many hits do we need?
		my_quota = rel_freqs[combi_query]
		
		# adding constraints
		if constraint_query:
			my_query = '('+combi_query+') AND ('+constraint_query+')'
			# for the available indices, we must re-count with the constraint
			all_indices = [i for i in range(api.count(my_query))]
		
		# without a constraint, the indices available
		# for the random draw are simply [0:freq]
		else:
			my_query = combi_query
			all_indices = [i for i in range(abs_freqs[combi_query])]
		
		
		# run searches one by one, using the drawn indices as FROM offsets ----
		
		# here => random order
		shuffle(all_indices)
		
		# keep only the first n (the draw)
		local_tirage = all_indices[0:my_quota]
		
		# for information
		if verbose:
			print(" ... drawing among %i docs :\n ... picked => %s" % (len(all_indices), local_tirage), file=stderr)
		
		for indice in local_tirage:
			# ----------------- api.search(...) ----------------------------
			new_hit = api.search(my_query, 
								   limit=1,
								   i_from=indice,
								   n_docs=abs_freqs[combi_query],
								   outfields=STD_MAP.keys())
				# outfields=('id','author.name','title','publicationDate','corpusName')
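			# (STD_MAP.keys() is assumed to cover at least the fields read
			#  below: id, corpusName, publicationDate, title, author,
			#  language, genre, categories, qualityIndicators)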
			
			if len(new_hit) != 1:
				# skip empty results
				# (caused by a constraint)
				continue
			else:
				# record it
				json_hits.append(new_hit.pop())
			# --------------------------------------------------------------
		
		# NB: 'id' field would be enough for sampling itself, but we get
		#     more metadatas to be able to provide an info table
		my_n_answers = len(json_hits)
		
		my_n_got = 0
		
		# for debug
		# print("HITS:",json_hits, file=stderr)
		
		
		# check unicity
		for hit in json_hits:
			idi = hit['id']
			
			if idi not in index and idi not in FORBIDDEN_IDS:
				# print(hit)
				# exit()
				my_n_got += 1
				# main index
				index[idi] = {
					'_q': combi_query,
					'co': hit['corpusName'][0:3]  # trigram, e.g. 'els'
					}
				# store info
				# £TODO: check conventions for null values
				# £TODO: add all of this to STD_MAP
				if 'publicationDate' in hit and len(hit['publicationDate']):
					index[idi]['yr'] = hit['publicationDate'][0:4]
				else:
					index[idi]['yr'] = 'XXXX'
				
				if 'title' in hit and len(hit['title']):
					index[idi]['ti'] = hit['title']
				else:
					index[idi]['ti'] = "UNTITLED"
				
				if 'author' in hit and len(hit['author'][0]['name']):
					first_auth = hit['author'][0]['name']
					his_lastname = first_auth.split()[-1]
					index[idi]['au'] = his_lastname
				else:
					index[idi]['au'] = "UNKNOWN"
				
				if 'language' in hit and len(hit['language']):
					index[idi]['lg'] = hit['language'][0]
				else:
					index[idi]['lg'] = "UNKOWN_LANG"
				
				if 'genre' in hit and len(hit['genre']):
					index[idi]['typ'] = hit['genre'][0]
				else:
					index[idi]['typ'] = "UNKOWN_GENRE"
				
				if 'categories' in hit and len(hit['categories']) and 'wos' in hit['categories'] and len(hit['categories']['wos']):
					index[idi]['cat'] = "/".join(hit['categories']['wos'])
				else:
					index[idi]['cat'] = "UNKOWN_SCI_CAT"
				
				if 'qualityIndicators' in hit:
					if 'pdfVersion' in hit['qualityIndicators']:
						index[idi]['ver'] = hit['qualityIndicators']['pdfVersion']
					else:
						index[idi]['ver'] = "UNKNOWN_PDFVER"
					if 'pdfWordCount' in hit['qualityIndicators']:
						index[idi]['wcp'] = hit['qualityIndicators']['pdfWordCount']
					else:
						index[idi]['wcp'] = "UNKNOWN_PDFWORDCOUNT"
					if 'refBibsNative' in hit['qualityIndicators']:
						index[idi]['bibnat'] = hit['qualityIndicators']['refBibsNative']
					else:
						index[idi]['bibnat'] = "UNKNOWN_REFBIBSNATIVE"
				else:
					index[idi]['ver'] = "UNKNOWN_PDFVER"
					index[idi]['wcp'] = "UNKNOWN_PDFWORDCOUNT"
					index[idi]['bibnat'] = "UNKNOWN_REFBIBSNATIVE"
		
		print ("done %-70s: %i/%i" % (
					my_query[0:64]+"...", 
					my_n_got, 
					my_quota
				), file=stderr)
		
		# if within whole sample_size scope, we may observe unmeetable
		# representativity criteria (marked 'LESS' and checked for RLAX)
		if run_count == 0 and my_n_got < (.85 * (my_quota - LISSAGE)):
			my_s = "" if my_n_got == 1 else "s"
			LOG.append("LESS: catégorie '%s' sous-représentée pour contrainte \"%s\" : %i doc%s obtenu%s sur %i quota" % (combi_query, constraint_query, my_n_got, my_s, my_s, my_quota))
			
		# print("==========my_corpus ITEMS===========")
		# print([kval for kval in my_corpus.items()])
		
	return(index)
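A minimal usage sketch (editorial illustration with hypothetical arguments; the criteria fields and the pdfCharCount constraint follow the examples shown above):

new_index = sample(2000, ['corpusName', 'publicationDate'],
                   constraint_query='pdfCharCount:[2000 TO *]',
                   verbose=True)
for idi, info in new_index.items():
	print(idi, info['co'], info['yr'], info['au'], info['ti'], sep='\t')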
Example #5
def sample(size,
           crit_fields,
           constraint_query=None,
           index=None,
           verbose=False,
           run_count=0):
    global LOG
    global LISSAGE
    global FORBIDDEN_IDS

    # allows setting the default to None instead of a tricky-scope mutable {}
    if not index:
        index = {}
        flag_previous_index = False
    else:
        flag_previous_index = True

    ####### POOLING ########
    #
    N_reponses = 0
    N_workdocs = 0
    doc_grand_total = 0
    # dict of counts for each combo ((crit1:val_1a),(crit2:val_2a)...)
    abs_freqs = {}

    # instead of doing steps (1) and (2), maybe we already have the pools cached?
    # (always same counts for given criteria) => cache to json
    cache_filename = pool_cache_path(crit_fields)
    print('...checking cache for %s' % cache_filename, file=stderr)

    if path.exists(cache_filename):
        cache = open(cache_filename, 'r')
        pool_info = load(cache)
        abs_freqs = pool_info['f']
        N_reponses = pool_info['nr']
        N_workdocs = pool_info['nd']
        doc_grand_total = pool_info['totd']
        print('...ok cache (%i workdocs)' % N_workdocs, file=stderr)
    else:
        print('...no cache found', file=stderr)

        # (1) PARTITIONING THE SEARCH SPACE INTO POSSIBLE OUTCOMES --------
        print("Sending count queries for criteria pools...", file=stderr)
        ## build all "field:values" pairs per criterion field
        ## (list of list of strings: future lucene query chunks)
        all_possibilities = []
        n_combos = 1
        for my_criterion in crit_fields:
            field_outcomes = facet_vals(my_criterion)
            n_combos = n_combos * len(field_outcomes)
            # lucene query chunks
            all_possibilities.append(
                [my_criterion + ':' + val for val in field_outcomes])

        ## list combos (cartesian product of field_outcomes)
        # we're directly unpacking *args into itertools.product()
        # (=> we get an iterator over tuples of combinable query chunks)
        combinations = product(*all_possibilities)

        # example for -c corpusName, publicationDate
        #	[
        #	('corpusName:ecco', 'publicationDate:[* TO 1959]'),
        #	('corpusName:ecco', 'publicationDate:[1960 TO 1999]'),
        #	('corpusName:ecco', 'publicationDate:[2000 TO *]'),
        #	('corpusName:elsevier', 'publicationDate:[* TO 1959]'),
        #	('corpusName:elsevier', 'publicationDate:[1960 TO 1999]'),
        #	('corpusName:elsevier', 'publicationDate:[2000 TO *]'),
        #	(...)
        #	]

        # (2) getting total counts for each criteria combination ----------

        # number of counted answers
        #  (1 doc can give several hits if a criterion was multivalued)
        N_reponses = 0

        # do the counting for each combo
        for i, combi in enumerate(sorted(combinations)):
            if i % 100 == 0:
                print("pool %i/%i" % (i, n_combos), file=stderr)

            query = " AND ".join(combi)

            # counting requests ++++
            freq = api.count(query)

            if verbose:
                print("pool:'% -30s': % 8i" % (query, freq), file=stderr)

            # storing and aggregation
            N_reponses += freq
            abs_freqs[query] = freq

        # number of documents that yield answers (hence the normalizing constant N)
        N_workdocs = api.count(" AND ".join([k + ":*" for k in crit_fields]))

        # always counted, so that the cached "totd" total is filled in
        # even when not verbose
        # (for comparison: all_docs = N + api.count(q="NOT(criterion:*)"))
        doc_grand_total = api.count(q='*')

        if verbose:
            print("--------- pool totals -----------", file=stderr)
            print("#answered hits :   % 12s" % N_reponses, file=stderr)
            print("#workdocs (N) :    % 12s" % N_workdocs, file=stderr)
            print("#all API docs fyi: % 12s" % doc_grand_total, file=stderr)
            print("---------------------------------", file=stderr)

        # cache write
        cache = open(cache_filename, 'w')
        pool_info = {
            'f': abs_freqs,
            'nr': N_reponses,
            'nd': N_workdocs,
            'totd': doc_grand_total
        }
        # json.dump
        dump(pool_info, cache, indent=1)
        cache.close()

    ######### QUOTA ########
    #
    # (3) quota computation and availability checking ------------------
    # quota computation
    rel_freqs = {}
    for combi_query in abs_freqs:

        # experiment with N_reponses as the denominator?
        quota = round(size * abs_freqs[combi_query] / N_workdocs + LISSAGE)

        if quota != 0:
            rel_freqs[combi_query] = quota

    # fyi: 3 lines to check for rounding surprises
    rndd_size = sum([quota for combi_query, quota in rel_freqs.items()])
    if verbose:
        print("Méthode des quotas taille avec arrondis:     % 9s" % rndd_size,
              file=stderr)

    # retrieval WITH CONSTRAINT and check of the total available (retrieved + deduplicated)

    # got_ids_idx: keys = a set of ids,
    #              values = the criteria that led to their selection

    print("Retrieving new sample chunks per pool quota...", file=stderr)

    for combi_query in sorted(rel_freqs.keys()):

        # how many hits do we need?
        my_quota = rel_freqs[combi_query]
        if not flag_previous_index and not FORBIDDEN_IDS:
            # option A: direct quota allocation to search limit
            n_needed = my_quota
        else:
            # option B: limit larger than quota by retrieved amount
            #           (provides deduplication margin if 2nd run)
            #
            # /!\ this wouldn't be necessary at all if we had no (or only
            #     rare) duplicates, e.g. with random result ranking

            # supplement 1: items to skip
            n_already_retrieved = len(
                # lookup retrieved
                [
                    idi for idi, metad in index.items()
                    if search(escape(combi_query), metad['_q'])
                ])

            # supplement 2: prorated share of FORBIDDEN_IDS
            suppl = round(len(FORBIDDEN_IDS) * my_quota / size)
            n_already_retrieved += suppl
            n_needed = my_quota + n_already_retrieved
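            # e.g. (hypothetical numbers): my_quota=50, 30 docs of this cell
            # already in the index, 200 FORBIDDEN_IDS for size=1000
            #   => suppl = round(200 * 50 / 1000) = 10
            #      n_needed = 50 + (30 + 10) = 90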

        # adding constraints
        if constraint_query:
            my_query = '(' + combi_query + ') AND (' + constraint_query + ')'
        else:
            my_query = combi_query

        # ----------------- api.search(...) ----------------------------
        json_hits = api.search(my_query,
                               limit=n_needed,
                               outfields=STD_MAP.keys())
        # outfields=('id','author.name','title','publicationDate','corpusName')

        # --------------------------------------------------------------

        # NB: 'id' field would be enough for sampling itself, but we get
        #     more metadatas to be able to provide an info table or to
        #     create a human-readable filename

        # £TODO 1
        # replace api.search() with a future random_search function
        # cf. the elasticsearch guide on "random scoring" (=> then remove
        # option B with n_needed)

        my_n_answers = len(json_hits)

        my_n_got = 0

        # for debug
        # print("HITS:",json_hits, file=stderr)

        # check unicity
        for hit in json_hits:
            idi = hit['id']

            if idi not in index and idi not in FORBIDDEN_IDS:
                # print(hit)
                # exit()
                my_n_got += 1
                # main index
                index[idi] = {
                    '_q': combi_query,
                    'co': hit['corpusName'][0:3]  # trigram, e.g. 'els'
                }
                # store info
                # £TODO: check conventions for null values
                # £TODO: add all of this to STD_MAP
                if 'publicationDate' in hit and len(hit['publicationDate']):
                    index[idi]['yr'] = hit['publicationDate'][0:4]
                else:
                    index[idi]['yr'] = 'XXXX'

                if 'title' in hit and len(hit['title']):
                    index[idi]['ti'] = hit['title']
                else:
                    index[idi]['ti'] = "UNTITLED"

                if 'author' in hit and len(hit['author'][0]['name']):
                    first_auth = hit['author'][0]['name']
                    his_lastname = first_auth.split()[-1]
                    index[idi]['au'] = his_lastname
                else:
                    index[idi]['au'] = "UNKNOWN"

                if 'language' in hit and len(hit['language']):
                    index[idi]['lg'] = hit['language'][0]
                else:
                    index[idi]['lg'] = "UNKOWN_LANG"

                if 'genre' in hit and len(hit['genre']):
                    index[idi]['typ'] = hit['genre'][0]
                else:
                    index[idi]['typ'] = "UNKOWN_GENRE"

                if 'categories' in hit and len(
                        hit['categories']
                ) and 'wos' in hit['categories'] and len(
                        hit['categories']['wos']):
                    index[idi]['cat'] = "/".join(hit['categories']['wos'])
                else:
                    index[idi]['cat'] = "UNKOWN_SCI_CAT"

                if 'qualityIndicators' in hit and 'pdfVersion' in hit[
                        'qualityIndicators']:
                    index[idi]['ver'] = hit['qualityIndicators']['pdfVersion']
                else:
                    index[idi]['ver'] = "UNKNOWN_PDFVER"

                if 'qualityIndicators' in hit and 'pdfWordCount' in hit[
                        'qualityIndicators']:
                    index[idi]['wcp'] = hit['qualityIndicators'][
                        'pdfWordCount']
                else:
                    index[idi]['wcp'] = "UNKNOWN_PDFWORDCOUNT"

            # recheck limit: needed as long as n_needed != my_quota
            # (should disappear as consequence of removing option B)
            if my_n_got == my_quota:
                break

        print("%-70s: %i(%i)/%i" %
              (my_query[0:67] + "...", my_n_got, my_n_answers, my_quota),
              file=stderr)

        # if within whole sample_size scope, we may observe unmeetable
        # representativity criteria (marked 'LESS' and checked for RLAX)
        if run_count == 0 and my_n_got < (.85 * (my_quota - LISSAGE)):
            my_s = "" if my_n_got == 1 else "s"
            LOG.append(
                "LESS: catégorie '%s' sous-représentée pour contrainte \"%s\" : %i doc%s obtenu%s sur %i quota"
                % (combi_query, constraint_query, my_n_got, my_s, my_s,
                   my_quota))

        # print("==========my_corpus ITEMS===========")
        # print([kval for kval in my_corpus.items()])

    return (index)
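Regarding the £TODO above (replacing api.search() with a random_search based on Elasticsearch "random scoring"): a minimal editorial sketch of what such a query could look like, assuming direct access to an Elasticsearch endpoint; the URL, index name and seed handling below are placeholders, not part of the original project.

import json
import urllib.request

def random_search_sketch(lucene_query, limit, seed=42,
                         es_url="http://localhost:9200/istex/_search"):
    """Return up to `limit` hits for `lucene_query` in pseudo-random order,
    using a function_score query with random_score (hypothetical endpoint)."""
    body = {
        "size": limit,
        "query": {
            "function_score": {
                "query": {"query_string": {"query": lucene_query}},
                # seed + field give a reproducible pseudo-random ordering
                "random_score": {"seed": seed, "field": "_seq_no"},
                "boost_mode": "replace"
            }
        }
    }
    req = urllib.request.Request(es_url,
                                 data=json.dumps(body).encode("utf-8"),
                                 headers={"Content-Type": "application/json"})
    with urllib.request.urlopen(req) as resp:
        return json.loads(resp.read().decode("utf-8"))["hits"]["hits"]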