# Standard-library imports used by the functions below; project-level names
# (api, facet_vals, pool_cache_path, field_combo_count, STD_MAP, LOG,
#  LISSAGE, FORBIDDEN_IDS) are assumed to be defined elsewhere in the project.
from sys import stderr
from os import path
from json import load, dump
from random import shuffle
from re import search, escape
from itertools import product


def pooling(crit_fields, verbose=False):
    """
    A kind of cross-tabulation: counts the number of docs for each
    combination of criteria.
    Returns a dict with these counts and the totals.

    Example for the criteria corpusName and pdfCharCount:
    {
     "f": {
       "corpusName:bmj AND pdfCharCount:[* TO 1999]": 24524,
       "corpusName:bmj AND pdfCharCount:[2000 TO *]": 662848,
       "corpusName:brill-journals AND pdfCharCount:[* TO 1999]": 10949,
       "corpusName:brill-journals AND pdfCharCount:[2000 TO *]": 119318,
       "corpusName:elsevier AND pdfCharCount:[* TO 1999]": 275461,
       "corpusName:elsevier AND pdfCharCount:[2000 TO *]": 5740132,
       "corpusName:nature AND pdfCharCount:[* TO 1999]": 332156,
       "corpusName:nature AND pdfCharCount:[2000 TO *]": 45139,
       "corpusName:oup AND pdfCharCount:[* TO 1999]": 58662,
       "corpusName:oup AND pdfCharCount:[2000 TO *]": 1385591,
       "corpusName:springer AND pdfCharCount:[* TO 1999]": 61973,
       "corpusName:springer AND pdfCharCount:[2000 TO *]": 2242902,
       "corpusName:wiley AND pdfCharCount:[* TO 1999]": 593998,
       "corpusName:wiley AND pdfCharCount:[2000 TO *]": 4044204
     },
     "totd": 15982692,  # total number of docs in the base
     "nd": 15597857,    # number of docs matching the full set of criteria
     "nr": 15597857     # number of answers for the full set of criteria
                        # (interesting for "multiple-choice" fields)
    }

    NB: the range choices and the facet values are configurable
        in field_value_lists.py
    """
    ####### POOLING ########
    #
    N_reponses = 0
    N_workdocs = 0
    doc_grand_total = 0

    # dict of counts for each combo ((crit1:val_1a),(crit2:val_2a)...)
    abs_freqs = {}

    # ---------------------------------------------------------------
    # (1) PARTITIONING THE SEARCH SPACE IN POSSIBLE OUTCOMES --------
    print("Sending count queries for criteria pools...", file=stderr)

    ## build all "field:values" pairs per criterion field
    ## (list of list of strings: future lucene query chunks)
    all_possibilities = []   # a little tribute to our colleague Nourdine Combo!

    n_combos = 1
    for my_criterion in crit_fields:
        # print("CRIT", my_criterion)
        field_outcomes = facet_vals(my_criterion)
        # print("field_outcomes", field_outcomes)
        n_combos = n_combos * len(field_outcomes)
        # lucene query chunks
        all_possibilities.append(
            [my_criterion + ':' + val for val in field_outcomes]
        )

    # e.g. 2 criteria will give 2 lists in all_possibilities
    # [
    #  ['qualityIndicators.refBibsNative:T', 'qualityIndicators.refBibsNative:F'],
    #
    #  ['corpusName:brill', 'corpusName:bmj', 'corpusName:wiley', 'corpusName:elsevier',
    #   'corpusName:ecco', 'corpusName:eebo', 'corpusName:springer', 'corpusName:nature',
    #   'corpusName:oup', 'corpusName:journals']
    # ]

    ## list combos (cartesian product of field_outcomes)
    # we're directly unpacking *args into itertools.product()
    # (=> we get an iterator over tuples of combinable query chunks)
    combinations = product(*all_possibilities)

    # example for -c corpusName, publicationDate
    # [
    #  ('corpusName:ecco', 'publicationDate:[* TO 1959]'),
    #  ('corpusName:ecco', 'publicationDate:[1960 TO 1999]'),
    #  ('corpusName:ecco', 'publicationDate:[2000 TO *]'),
    #  ('corpusName:elsevier', 'publicationDate:[* TO 1959]'),
    #  ('corpusName:elsevier', 'publicationDate:[1960 TO 1999]'),
    #  ('corpusName:elsevier', 'publicationDate:[2000 TO *]'),
    #  (...)
    # ]

    # ---------------------------------------------------------------
    # (2) getting total counts for each criteria --------------------

    # number of counted answers
    # (1 doc can give several hits if a criterion was multivalued)
    N_reponses = 0

    # do the counting for each combo
    for i, combi in enumerate(sorted(combinations)):
        if i % 100 == 0:
            print("pool %i/%i" % (i, n_combos), file=stderr)

        query = " AND ".join(combi)

        # counting requests ++++
        freq = api.count(query)
        # print(freq)

        if verbose:
            print("pool:'% -30s': % 8i" % (query, freq), file=stderr)

        # storing and aggregation
        N_reponses += freq
        abs_freqs[query] = freq

    # number of documents sending answers (hence normalizing constant N)
    N_workdocs = api.count(" AND ".join([k + ":*" for k in crit_fields]))

    # for comparison: all_docs = N + api.count(q="NOT(criterion:*)")
    doc_grand_total = api.count(q='*')

    if verbose:
        print("--------- pool totals -----------", file=stderr)
        print("#answered hits   : % 12s" % N_reponses, file=stderr)
        print("#workdocs (N)    : % 12s" % N_workdocs, file=stderr)
        print("#all API docs fyi: % 12s" % doc_grand_total, file=stderr)
        print("---------------------------------", file=stderr)

    # resulting pool info in f + various totals
    return {"f": abs_freqs,
            "nr": N_reponses,
            "nd": N_workdocs,
            "totd": doc_grand_total}
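
# Small illustrative helper (not part of the original module): it shows how
# the dict returned by pooling() is consumed downstream to derive per-pool
# quotas, using the same formula as sample() below
# (round(size * pool / N) plus an optional smoothing constant).
# The function name and the `smoothing` parameter are ours, for illustration;
# sample() uses the module-level LISSAGE constant instead.
def quotas_from_pools(pool_info, size, smoothing=0):
    """Return {combo_query: quota} for a target sample of `size` docs."""
    n_workdocs = pool_info['nd']          # normalizing constant N
    quotas = {}
    for combo_query, pool_count in pool_info['f'].items():
        q = round(size * pool_count / n_workdocs + smoothing)
        if q != 0:
            quotas[combo_query] = q
    return quotas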
def sample(size, crit_fields, constraint_query=None, index=None,
           verbose=False, run_count=0):
    global LOG
    global LISSAGE
    global FORBIDDEN_IDS

    # allows us to default to None instead of a tricky-scope mutable {}
    if not index:
        index = {}
        flag_previous_index = False
    else:
        flag_previous_index = True

    ####### POOLING ########
    # instead of calling pooling() maybe we have cached the pools?
    # (always the same counts for given criteria) => cache to json
    cache_filename = pool_cache_path(crit_fields)

    print('...checking cache for %s' % cache_filename, file=stderr)

    if path.exists(cache_filename):
        cache = open(cache_filename, 'r')
        pool_info = load(cache)
        print('...ok cache (%i workdocs)' % pool_info['nd'], file=stderr)
    else:
        print('...no cache found', file=stderr)
        # -> run doc count foreach(combination of crit fields facets) <-
        # ---------
        pool_info = field_combo_count.pooling(crit_fields, verbose)
        # -----------------

    # in both cases, same kind of json
    abs_freqs = pool_info['f']
    N_reponses = pool_info['nr']
    N_workdocs = pool_info['nd']
    doc_grand_total = pool_info['totd']

    # cache write
    cache = open(cache_filename, 'w')
    dump(pool_info, cache, indent=1, sort_keys=True)
    cache.close()

    ######### QUOTA ########
    #
    # quota computation = target_corpus_size * pool / N
    rel_freqs = {}
    for combi_query in abs_freqs:
        # experiment with N_reponses as the denominator?
        quota = round(size * abs_freqs[combi_query] / N_workdocs + LISSAGE)
        if quota != 0:
            rel_freqs[combi_query] = quota

    # fyi: 3 lines to check for rounding surprises
    rndd_size = sum([quota for combi_query, quota in rel_freqs.items()])
    if verbose:
        print("Quota method, size after rounding: % 9s" % rndd_size,
              file=stderr)

    # retrieval WITH CONSTRAINT and check of the available total
    # (retrieved + deduplicated)
    # got_ids_idx keys   = set of ids,
    #             values = the criteria that led to their selection
    print("Retrieving random samples in each quota...", file=stderr)

    for combi_query in sorted(rel_freqs.keys()):
        json_hits = []

        # how many hits do we need?
        my_quota = rel_freqs[combi_query]

        # adding constraints
        if constraint_query:
            my_query = '(' + combi_query + ') AND (' + constraint_query + ')'
            # with a constraint, the available offsets must be recounted
            all_indices = [i for i in range(api.count(my_query))]
        # without a constraint, the offsets available
        # for the random draw are simply [0:freq]
        else:
            my_query = combi_query
            all_indices = [i for i in range(abs_freqs[combi_query])]

        # we run one search per drawn offset, passed as FROM ---------
        # here => random order
        shuffle(all_indices)

        # we keep only the first n (the draw)
        local_tirage = all_indices[0:my_quota]

        # for information
        if verbose:
            print(" ... drawing among %i docs:\n ... picked => %s"
                  % (len(all_indices), local_tirage), file=stderr)

        for indice in local_tirage:
            # ----------------- api.search(...) ----------------------------
            new_hit = api.search(my_query,
                                 limit=1,
                                 i_from=indice,
                                 n_docs=abs_freqs[combi_query],
                                 outfields=STD_MAP.keys())
            # outfields=('id','author.name','title','publicationDate','corpusName')

            if len(new_hit) != 1:
                # skip empty results (caused by a constraint)
                continue
            else:
                # record it
                json_hits.append(new_hit.pop())
        # --------------------------------------------------------------
        # NB: the 'id' field would be enough for sampling itself, but we get
        #     more metadata to be able to provide an info table

        my_n_answers = len(json_hits)

        my_n_got = 0

        # for debug
        # print("HITS:", json_hits, file=stderr)

        # check unicity
        for hit in json_hits:
            idi = hit['id']

            if idi not in index and idi not in FORBIDDEN_IDS:
                # print(hit)
                # exit()
                my_n_got += 1

                # main index
                index[idi] = {
                    '_q': combi_query,
                    'co': hit['corpusName'][0:3]   # trigram, e.g. 'els'
                }

                # store info
                # £TODO: check conventions for null values
                # £TODO: add all of this to STD_MAP
                if 'publicationDate' in hit and len(hit['publicationDate']):
                    index[idi]['yr'] = hit['publicationDate'][0:4]
                else:
                    index[idi]['yr'] = 'XXXX'

                if 'title' in hit and len(hit['title']):
                    index[idi]['ti'] = hit['title']
                else:
                    index[idi]['ti'] = "UNTITLED"

                if 'author' in hit and len(hit['author'][0]['name']):
                    first_auth = hit['author'][0]['name']
                    his_lastname = first_auth.split()[-1]
                    index[idi]['au'] = his_lastname
                else:
                    index[idi]['au'] = "UNKNOWN"

                if 'language' in hit and len(hit['language']):
                    index[idi]['lg'] = hit['language'][0]
                else:
                    index[idi]['lg'] = "UNKOWN_LANG"

                if 'genre' in hit and len(hit['genre']):
                    index[idi]['typ'] = hit['genre'][0]
                else:
                    index[idi]['typ'] = "UNKOWN_GENRE"

                if ('categories' in hit and len(hit['categories'])
                        and 'wos' in hit['categories']
                        and len(hit['categories']['wos'])):
                    index[idi]['cat'] = "/".join(hit['categories']['wos'])
                else:
                    index[idi]['cat'] = "UNKOWN_SCI_CAT"

                if 'qualityIndicators' in hit:
                    if 'pdfVersion' in hit['qualityIndicators']:
                        index[idi]['ver'] = hit['qualityIndicators']['pdfVersion']
                    else:
                        index[idi]['ver'] = "UNKNOWN_PDFVER"
                    if 'pdfWordCount' in hit['qualityIndicators']:
                        index[idi]['wcp'] = hit['qualityIndicators']['pdfWordCount']
                    else:
                        index[idi]['wcp'] = "UNKNOWN_PDFWORDCOUNT"
                    if 'refBibsNative' in hit['qualityIndicators']:
                        index[idi]['bibnat'] = hit['qualityIndicators']['refBibsNative']
                    else:
                        index[idi]['bibnat'] = "UNKNOWN_REFBIBSNATIVE"
                else:
                    index[idi]['ver'] = "UNKNOWN_PDFVER"
                    index[idi]['wcp'] = "UNKNOWN_PDFWORDCOUNT"
                    index[idi]['bibnat'] = "UNKNOWN_REFBIBSNATIVE"

        print("done %-70s: %i/%i" % (my_query[0:64] + "...", my_n_got, my_quota),
              file=stderr)

        # if within whole sample_size scope, we may observe unmeetable
        # representativity criteria (marked 'LESS' and checked for RLAX)
        if run_count == 0 and my_n_got < (.85 * (my_quota - LISSAGE)):
            my_s = "" if my_n_got == 1 else "s"
            LOG.append("LESS: category '%s' under-represented for constraint \"%s\": got %i doc%s out of a quota of %i"
                       % (combi_query, constraint_query, my_n_got, my_s, my_quota))

    # print("==========my_corpus ITEMS===========")
    # print([kval for kval in my_corpus.items()])

    return index
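
# Minimal sketch (ours, not part of the original code) of the random draw
# performed in sample() above: to pick `quota` docs out of a pool of
# `pool_size` matching docs, the offsets 0..pool_size-1 are shuffled and the
# first `quota` offsets are kept; each one is then fetched individually with
# api.search(..., i_from=offset, limit=1). The helper name is hypothetical.
def draw_offsets(pool_size, quota):
    """Return `quota` distinct random offsets in [0, pool_size), via shuffle() imported above."""
    offsets = list(range(pool_size))
    shuffle(offsets)
    return offsets[0:quota]

# e.g. draw_offsets(1385591, 30) -> 30 random FROM positions within that pool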
# Variant of sample(): here the pool counts are computed inline (steps (1)
# and (2)) rather than via pooling(), and each quota is filled with a single
# api.search() call whose limit may be enlarged ("option B") to leave room
# for deduplication against a previous index and FORBIDDEN_IDS.
def sample(size, crit_fields, constraint_query=None, index=None,
           verbose=False, run_count=0):
    global LOG
    global LISSAGE
    global FORBIDDEN_IDS

    # allows us to default to None instead of a tricky-scope mutable {}
    if not index:
        index = {}
        flag_previous_index = False
    else:
        flag_previous_index = True

    ####### POOLING ########
    #
    N_reponses = 0
    N_workdocs = 0
    doc_grand_total = 0

    # dict of counts for each combo ((crit1:val_1a),(crit2:val_2a)...)
    abs_freqs = {}

    # instead of doing steps (1) and (2) maybe we have cached the pools?
    # (always the same counts for given criteria) => cache to json
    cache_filename = pool_cache_path(crit_fields)

    print('...checking cache for %s' % cache_filename, file=stderr)

    if path.exists(cache_filename):
        cache = open(cache_filename, 'r')
        pool_info = load(cache)
        abs_freqs = pool_info['f']
        N_reponses = pool_info['nr']
        N_workdocs = pool_info['nd']
        doc_grand_total = pool_info['totd']
        print('...ok cache (%i workdocs)' % N_workdocs, file=stderr)
    else:
        print('...no cache found', file=stderr)

        # (1) PARTITIONING THE SEARCH SPACE IN POSSIBLE OUTCOMES --------
        print("Sending count queries for criteria pools...", file=stderr)

        ## build all "field:values" pairs per criterion field
        ## (list of list of strings: future lucene query chunks)
        all_possibilities = []

        n_combos = 1
        for my_criterion in crit_fields:
            field_outcomes = facet_vals(my_criterion)
            n_combos = n_combos * len(field_outcomes)
            # lucene query chunks
            all_possibilities.append(
                [my_criterion + ':' + val for val in field_outcomes]
            )

        ## list combos (cartesian product of field_outcomes)
        # we're directly unpacking *args into itertools.product()
        # (=> we get an iterator over tuples of combinable query chunks)
        combinations = product(*all_possibilities)

        # example for -c corpusName, publicationDate
        # [
        #  ('corpusName:ecco', 'publicationDate:[* TO 1959]'),
        #  ('corpusName:ecco', 'publicationDate:[1960 TO 1999]'),
        #  ('corpusName:ecco', 'publicationDate:[2000 TO *]'),
        #  ('corpusName:elsevier', 'publicationDate:[* TO 1959]'),
        #  ('corpusName:elsevier', 'publicationDate:[1960 TO 1999]'),
        #  ('corpusName:elsevier', 'publicationDate:[2000 TO *]'),
        #  (...)
        # ]

        # (2) getting total counts for each criteria --------------------

        # number of counted answers
        # (1 doc can give several hits if a criterion was multivalued)
        N_reponses = 0

        # do the counting for each combo
        for i, combi in enumerate(sorted(combinations)):
            if i % 100 == 0:
                print("pool %i/%i" % (i, n_combos), file=stderr)

            query = " AND ".join(combi)

            # counting requests ++++
            freq = api.count(query)

            if verbose:
                print("pool:'% -30s': % 8i" % (query, freq), file=stderr)

            # storing and aggregation
            N_reponses += freq
            abs_freqs[query] = freq

        # number of documents sending answers (hence normalizing constant N)
        N_workdocs = api.count(" AND ".join([k + ":*" for k in crit_fields]))

        # for comparison: all_docs = N + api.count(q="NOT(criterion:*)")
        doc_grand_total = api.count(q='*')

        if verbose:
            print("--------- pool totals -----------", file=stderr)
            print("#answered hits   : % 12s" % N_reponses, file=stderr)
            print("#workdocs (N)    : % 12s" % N_workdocs, file=stderr)
            print("#all API docs fyi: % 12s" % doc_grand_total, file=stderr)
            print("---------------------------------", file=stderr)

        # cache write
        cache = open(cache_filename, 'w')
        pool_info = {
            'f': abs_freqs,
            'nr': N_reponses,
            'nd': N_workdocs,
            'totd': doc_grand_total
        }
        # json.dump
        dump(pool_info, cache, indent=1)
        cache.close()

    ######### QUOTA ########
    #
    # (3) quota computation and availability checking ------------------
    # quota computation
    rel_freqs = {}
    for combi_query in abs_freqs:
        # experiment with N_reponses as the denominator?
        quota = round(size * abs_freqs[combi_query] / N_workdocs + LISSAGE)
        if quota != 0:
            rel_freqs[combi_query] = quota

    # fyi: 3 lines to check for rounding surprises
    rndd_size = sum([quota for combi_query, quota in rel_freqs.items()])
    if verbose:
        print("Quota method, size after rounding: % 9s" % rndd_size,
              file=stderr)

    # retrieval WITH CONSTRAINT and check of the available total
    # (retrieved + deduplicated)
    # got_ids_idx keys   = set of ids,
    #             values = the criteria that led to their selection
    print("Retrieving new sample chunks per pool quota...", file=stderr)

    for combi_query in sorted(rel_freqs.keys()):
        # how many hits do we need?
        my_quota = rel_freqs[combi_query]

        if not flag_previous_index and not FORBIDDEN_IDS:
            # option A: direct quota allocation to search limit
            n_needed = my_quota
        else:
            # option B: limit larger than quota by retrieved amount
            #           (provides deduplication margin if 2nd run)
            #
            # /!\ wouldn't be necessary at all if we had no or rare
            #     duplicates, e.g. with random result ranking

            # supplement 1: items to skip
            n_already_retrieved = len(
                # lookup retrieved
                [idi for idi, metad in index.items()
                 if search(escape(combi_query), metad['_q'])]
            )

            # supplement 2: prorated share of FORBIDDEN_IDS
            suppl = round(len(FORBIDDEN_IDS) * my_quota / size)
            n_already_retrieved += suppl

            n_needed = my_quota + n_already_retrieved

        # adding constraints
        if constraint_query:
            my_query = '(' + combi_query + ') AND (' + constraint_query + ')'
        else:
            my_query = combi_query

        # ----------------- api.search(...) ----------------------------
        json_hits = api.search(my_query,
                               limit=n_needed,
                               outfields=STD_MAP.keys())
        # outfields=('id','author.name','title','publicationDate','corpusName')
        # --------------------------------------------------------------
        # NB: the 'id' field would be enough for sampling itself, but we get
        #     more metadata to be able to provide an info table or to
        #     create a human-readable filename

        # £TODO 1
        # replace api.search() with a future random_search function,
        # cf. elasticsearch guide: "random scoring" (=> then remove
        # option B with n_needed)

        my_n_answers = len(json_hits)

        my_n_got = 0

        # for debug
        # print("HITS:", json_hits, file=stderr)

        # check unicity
        for hit in json_hits:
            idi = hit['id']

            if idi not in index and idi not in FORBIDDEN_IDS:
                # print(hit)
                # exit()
                my_n_got += 1

                # main index
                index[idi] = {
                    '_q': combi_query,
                    'co': hit['corpusName'][0:3]   # trigram, e.g. 'els'
                }

                # store info
                # £TODO: check conventions for null values
                # £TODO: add all of this to STD_MAP
                if 'publicationDate' in hit and len(hit['publicationDate']):
                    index[idi]['yr'] = hit['publicationDate'][0:4]
                else:
                    index[idi]['yr'] = 'XXXX'

                if 'title' in hit and len(hit['title']):
                    index[idi]['ti'] = hit['title']
                else:
                    index[idi]['ti'] = "UNTITLED"

                if 'author' in hit and len(hit['author'][0]['name']):
                    first_auth = hit['author'][0]['name']
                    his_lastname = first_auth.split()[-1]
                    index[idi]['au'] = his_lastname
                else:
                    index[idi]['au'] = "UNKNOWN"

                if 'language' in hit and len(hit['language']):
                    index[idi]['lg'] = hit['language'][0]
                else:
                    index[idi]['lg'] = "UNKOWN_LANG"

                if 'genre' in hit and len(hit['genre']):
                    index[idi]['typ'] = hit['genre'][0]
                else:
                    index[idi]['typ'] = "UNKOWN_GENRE"

                if ('categories' in hit and len(hit['categories'])
                        and 'wos' in hit['categories']
                        and len(hit['categories']['wos'])):
                    index[idi]['cat'] = "/".join(hit['categories']['wos'])
                else:
                    index[idi]['cat'] = "UNKOWN_SCI_CAT"

                if ('qualityIndicators' in hit
                        and 'pdfVersion' in hit['qualityIndicators']):
                    index[idi]['ver'] = hit['qualityIndicators']['pdfVersion']
                else:
                    index[idi]['ver'] = "UNKNOWN_PDFVER"

                if ('qualityIndicators' in hit
                        and 'pdfWordCount' in hit['qualityIndicators']):
                    index[idi]['wcp'] = hit['qualityIndicators']['pdfWordCount']
                else:
                    index[idi]['wcp'] = "UNKNOWN_PDFWORDCOUNT"

            # recheck limit: needed as long as n_needed != my_quota
            # (should disappear as a consequence of removing option B)
            if my_n_got == my_quota:
                break

        print("%-70s: %i(%i)/%i" % (my_query[0:67] + "...",
                                    my_n_got, my_n_answers, my_quota),
              file=stderr)

        # if within whole sample_size scope, we may observe unmeetable
        # representativity criteria (marked 'LESS' and checked for RLAX)
        if run_count == 0 and my_n_got < (.85 * (my_quota - LISSAGE)):
            my_s = "" if my_n_got == 1 else "s"
            LOG.append("LESS: category '%s' under-represented for constraint \"%s\": got %i doc%s out of a quota of %i"
                       % (combi_query, constraint_query, my_n_got, my_s, my_quota))

    # print("==========my_corpus ITEMS===========")
    # print([kval for kval in my_corpus.items()])

    return index
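
# Minimal sketch (ours) of the "option B" over-fetch margin computed in the
# variant above: when a previous index and/or FORBIDDEN_IDS exist, the search
# limit is enlarged from the bare quota by the number of docs already
# retrieved for this pool plus a prorated share of the forbidden ids, leaving
# room for deduplication. The helper name and its signature are hypothetical.
def overfetch_limit(quota, n_already_retrieved, n_forbidden, sample_size):
    """Return the enlarged per-pool search limit used when duplicates are possible."""
    prorated_forbidden = round(n_forbidden * quota / sample_size)
    return quota + n_already_retrieved + prorated_forbidden

# e.g. overfetch_limit(quota=40, n_already_retrieved=12,
#                      n_forbidden=300, sample_size=2000) == 40 + 12 + 6 == 58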