Example #1
	# (3/4) downloading the fulltexts ------------------------------------
	
	my_ids = cobj.cols['istex_id']
	my_basenames = cobj.bnames
	
	for the_shelf in ['PDF0', 'XMLN']:
		the_api_type = cobj.origin(the_shelf)
		the_ext      = cobj.filext(the_shelf)
		tgt_dir      = cobj.shelf_path(the_shelf)
		
		print("mkdir -p: %s" % tgt_dir)
		mkdir(tgt_dir)
		
		api.write_fulltexts_loop_interact(
			my_ids, my_basenames,
			tgt_dir   = tgt_dir,
			api_types = [the_api_type]
			)
		print("MAKE_SET: saved docs into CORPUS_HOME:%s" % cobj.name)
		if debug > 0:
			print("  (=> target dir:%s)" % tgt_dir)
		
		# NB: cobj.filext(the_shelf) must use the same extension as the API,
		#  otherwise api.write_fulltexts must allow changing (renaming) the extensions
	
	cobj.assert_docs('PDF0')
	cobj.assert_docs('XMLN')
	
	# persist the status of the 2 newly created shelves
	cobj.save_shelves_status()
Example #2
def make_set(corpus_name,
			from_table=None, 
			size=None, 
			constraint=None,
			debug=0):
	"""
	Initialisation d'un corpus basique et remplissage de ses fulltexts
	
	3 façons de l'appeler :
	  - soit on fournit une table de métadonnées infos.tab (chemin fs)
	  - soit on fournit une taille (sampling directement avec l'API)
	  - soit on ne fournit rien et il fait un sampling de 10 docs
	
	Métadonnées, rangées dans CORPUS_HOME/<corpus_name>/meta/
	  - basenames.ls
	  - infos.tab
	
	Données: 3 formats, rangés dans CORPUS_HOME/<corpus_name>/data/
	  - .pdf, 
	  - .xml (natif) 
	  - et .tei.xml (pub2tei)
	
	
	
	Position dans le système de fichier
	 cf sous lib/global_conf.ini
	                -> variable CORPUS_HOME 
	                     -> mise par défaut à ./corpora/
	"""
	
	# basic check before putting everything into the folder
	# (the one folder we never overwrite)
	future_dir = path.join(MY_CORPUS_HOME, corpus_name)
	if path.exists(future_dir):
		print("ERR:'%s'\nLe nom '%s' est déjà pris dans le dossier des corpus." % 
		       (future_dir, corpus_name), file=stderr)
		exit(1)
	
	# (1/4) initial sample (just the table) ------------------------------
	
	# either we already have a table
	if from_table and size:
		print("""ERR bako.make_set:
		         fournir au choix 'from_table' ou 'size', mais pas les 2.""",
		         file=stderr)
		exit(1)

	if from_table:
		if path.exists(from_table):
			fic = open(from_table)
			my_tab = fic.readlines()
			fic.close()
		else:
			print("ERR bako.make_set: je ne trouve pas le fichier '%s'" % from_table, file=stderr)
			exit(1)
	
	# otherwise: sampling
	else:
		if not size:
			size = 10
		
		if not constraint:
			ok_corpora = CONF['workshop']['PREPROC_READY_CORPORA']
			corpora_constraint = " OR ".join(['corpusName:'+corpus for corpus in ok_corpora.split(",")])
			constraint = "qualityIndicators.refBibsNative:true AND (" + corpora_constraint +")"
		if isinstance(size, int):
			my_tab = sampler.full_run(
					['-n', str(size), 
					 '--outmode', 'tab', 
					 '--with', constraint,
					 '-v'
					 ]
				)
		else:
			print("ERR bako.make_set: 'size' doit être un entier'%s'" 
			       % from_table, file=stderr)
			exit(1)
	
	# (2/4) our corpus class ---------------------------------------------
	
	# Corpus
	# initialization
	#  - tab-only mode => will create a meta/ folder and an empty data/ folder,
	#  - corpus_type is hard-coded to 'gold', which simply signals
	#    that we keep the default shelves
	cobj = Corpus(corpus_name,
					new_infos = my_tab, 
					new_home  = MY_CORPUS_HOME,
					verbose = (debug>0),
					corpus_type='gold')
	
	# (3/4) downloading the fulltexts ------------------------------------
	
	my_ids = cobj.cols['istex_id']
	my_basenames = cobj.bnames
	
	for the_shelf in ['PDF0', 'XMLN']:
		the_api_type = cobj.origin(the_shelf)
		the_ext      = cobj.filext(the_shelf)
		tgt_dir      = cobj.shelf_path(the_shelf)
		
		print("mkdir -p: %s" % tgt_dir,file=stderr)
		makedirs(tgt_dir)
		
		api.write_fulltexts_loop_interact(
			my_ids, my_basenames,
			api_conf  = CONF['istex-api'],
			tgt_dir   = tgt_dir,
			api_types = [the_api_type]
			)
		print("MAKE_SET: saved docs into CORPUS_HOME:%s" % cobj.name)
		if debug > 0:
			print("  (=> target dir:%s)" % tgt_dir)
		
		# NB: cobj.filext(the_shelf) must use the same extension as the API,
		#  otherwise api.write_fulltexts must allow changing (renaming) the extensions
	
	cobj.assert_docs('PDF0')
	cobj.assert_docs('XMLN')
	
	# persist the status of the 2 newly created shelves
	cobj.save_shelves_status()

	
	# (4/4) TEI conversion (gold biblStruct type) ------------------------
	
	# copy while rewriting the DTD pointers
	print("***DTD LINKING***")
	cobj.dtd_repair(debug_lvl = debug)
	
	print("***XML => TEI.XML CONVERSION***")
	
	# will create the C-goldxmltei folder
	cobj.pub2goldtei(debug_lvl = debug)      # conversion
	
	cobj.assert_docs('GTEI')
	
	# persist the status of the newly created shelf
	cobj.save_shelves_status()
	
	# we return the new filled corpus for further work or display
	return cobj
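
# --- Usage sketch (not part of the original example) -----------------------
# The docstring above lists three ways to call make_set(); here is a minimal,
# hypothetical illustration. The import path "bako" and the corpus names are
# assumptions for the sketch, not taken from the example itself.
from bako import make_set

# 1) sample directly via the ISTEX API (here 50 docs instead of the default 10)
corpus_a = make_set("demo_api_sample", size=50)

# 2) reuse an existing tab-separated metadata table on the filesystem
corpus_b = make_set("demo_from_table", from_table="infos.tab")

# 3) name only => default sampling of 10 docs
corpus_c = make_set("demo_default")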
Example #3
def full_run(arglist=None):
	global LOG
	global LISSAGE
	# output lines for direct use or print to STDOUT if __main__
	output_array = []
	
	# cli arguments
	args = my_parse_args(arglist)
	
	# do we need to change smoothing ?
	if args.smoothing_init and float(args.smoothing_init) > 0:
		print("Setting initial smoothing to %.2f" % args.smoothing_init, file=stderr)
		# global var change in main
		LISSAGE = args.smoothing_init
	
	# event log lines
	LOG = ['INIT: sampling %i' % args.sample_size]
	LOG.append('CRIT: fields(%s)' % ", ".join(args.criteria_list))
	if args.with_constraint_query:
		LOG.append('WITH: constraint query "%s"' % args.with_constraint_query)
	
	run_counter = 0
	
	# initial sampler run
	got_ids_idx = sample(
						args.sample_size,
						args.criteria_list,
						constraint_query = args.with_constraint_query,
						verbose = args.verbose,
						run_count = run_counter
						)
	run_counter += 1
	
	# how much is there?
	n_ids = len(got_ids_idx)
	
	# info
	print('-'*27 + " initial result: %i docs " % n_ids + '-'*27,
		  file=stderr)
	
	LOG.append("XGOT: picked %i" % n_ids)
	
	# check combopools status
	insufficient_pool_flag = False
	for sig in LOG:
		if search("^LESS:", sig):
			insufficient_pool_flag = True
			break
	
	# --------- a posteriori corrections -------------
	#
	# the initial quotas can take neither the "with_constraint arg"
	# nor "multiple choice fields" into account (unless use N_reponse?)
	
	# for that reason at this point in the process we may have more or
	# less than the requested sample_size
	
	# IF not enough => new sample run with lighter criteria
	if n_ids < args.sample_size:
		
		actual_criteria = args.criteria_list
		
		# keep trying...
		while (n_ids < args.sample_size and run_counter < MAX_RUNS):
			
			# => over "delta" (missing docs)
			remainder = args.sample_size - n_ids
			LOG.append("REDO: re-pioche sur %i docs" % remainder)
			
			# => with more help to small categories
			LISSAGE += 0.2
			LOG.append("SMOO: smoothing up to %.02f" % LISSAGE)
			
			# => and with less criteria if necessary
			# (if criteria pool insufficient under some constraints, we
			#  do need to relax at least one criterion, but which one?)
			if len(actual_criteria) > 1 and insufficient_pool_flag:
				# simplify criteria by removing the last one
				new_criteria = actual_criteria[0:-1]
				LOG.append("RLAX: abandon équilibrage champ '%s'" %
								actual_criteria[-1])
				
				# reset flag (£TODO recalculate after run ?)
				insufficient_pool_flag = False
			else:
				new_criteria = actual_criteria
			
			# -------- RE-RUN ---------
			previous_ids = got_ids_idx
			got_ids_idx = sample(
						remainder,
						new_criteria,
						constraint_query = args.with_constraint_query,
						index = previous_ids,
						verbose = args.verbose
						)
			
			# recount
			apport = len(got_ids_idx) - n_ids
			
			# update
			n_ids += apport
			run_counter += 1
			
			# warn
			LOG.append("XGOT: picked %i" % apport)
			print('-'*22 + " result after run %i: %i documents "
			    % (run_counter, n_ids) + '-'*22, file=stderr)
	
	
	# IF overflow => random pruning
	if n_ids > args.sample_size:
		deck = [did for did in got_ids_idx.keys()]
		# random removal of excess documents
		shuffle(deck)
		nd = n_ids - args.sample_size
		sacrificed = deck[0:nd]
		for did in sacrificed:
			del got_ids_idx[did]
		LOG.append("XDEL: sacrificing %i random docs" % nd)
	
	# last recount
	n_ids = len(got_ids_idx)
	print('-'*29 + " final result: %i docs " % n_ids + '-'*29, file=stderr)
	
	# -------------- OUTPUT --------------------------------------------
	
	# ***(ids)***
	if args.out_type == 'ids':
		for did, info in sorted(got_ids_idx.items(), key=lambda x: x[1]['_q']):
			output_array.append("%s" % did)
	
	# ***(tab)***
	elif args.out_type == 'tab':
		# header line
		# £TODO STD_MAP
		output_array.append("\t".join(['istex_id', 'corpus', 'pub_year', 'pub_period', 'pdfver', 'pdfwc','bibnat',
						 'author_1','lang','doctype_1','cat_sci', 'title']))
		# contents
		for did, info in sorted(got_ids_idx.items(), key=lambda x: x[1]['_q']):
			# provenance: sample() => loop over hits (l.500 ++)
			# print("INFO----------",info, file=stderr)
			# exit()
			
			period = year_to_range(info['yr'])
			
			
			output_array.append("\t".join([ did,
			                                info['co'],
			                                info['yr'],
			                                period,
			                                info['ver'],
			                                str(info['wcp']),
			                                str(info['bibnat']),
			                                info['au'],
			                                info['lg'],
			                                info['typ'],
			                                info['cat'],
			                                info['ti'],
			                                #~ info['_q']
			                                ]
			                              )
			             )
	
	# ***(docs)***
	# no output lines but writes a dir
	elif args.out_type == 'docs':
		my_dir = path.join(getcwd(),my_name)
		mkdir(my_dir)
		
		# two "parallel" lists
		ids = list(got_ids_idx.keys())
		basenames = [std_filename(one_id, got_ids_idx[one_id]) for one_id in ids]
		
		# loop with interactive authentication prompt if needed
		api.write_fulltexts_loop_interact(
			ids, basenames,
			tgt_dir=my_dir,
			api_types=['metadata/xml',
					   'fulltext/pdf',
					   'fulltext/tei']
		)
		
		LOG.append("SAVE: saved docs in %s/" % my_dir)
	
	if args.log:
		# separate logging lines
		logfile = open(my_name+'.log', 'w')
		for lline in LOG:
			print(lline, file=logfile)
		logfile.close()
	
	return output_array
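
# --- Usage sketch (not part of the original example) -----------------------
# full_run() can be called programmatically with an argv-style list, exactly as
# bako.make_set does above; a minimal sketch (the "sampler" module path and the
# constraint string are assumptions used only for illustration):
import sampler

tab_lines = sampler.full_run(
		['-n', '100',
		 '--outmode', 'tab',
		 '--with', 'qualityIndicators.refBibsNative:true',
		 '-v'])

# first element is the tab header, then one sampled document per line
print("\n".join(tab_lines))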
Example #4
    # (3/4) downloading the fulltexts ------------------------------------

    my_ids = cobj.cols['istex_id']
    my_basenames = cobj.bnames

    for the_shelf in ['PDF0', 'XMLN']:
        the_api_type = cobj.origin(the_shelf)
        the_ext = cobj.filext(the_shelf)
        tgt_dir = cobj.shelf_path(the_shelf)

        print("mkdir -p: %s" % tgt_dir)
        mkdir(tgt_dir)

        api.write_fulltexts_loop_interact(my_ids,
                                          my_basenames,
                                          tgt_dir=tgt_dir,
                                          api_types=[the_api_type])
        print("MAKE_SET: saved docs into CORPUS_HOME:%s" % cobj.name)
        if debug > 0:
            print("  (=> target dir:%s)" % tgt_dir)

        # NB: cobj.filext(the_shelf) must use the same extension as the API,
        #  otherwise api.write_fulltexts must allow changing (renaming) the extensions

    cobj.assert_docs('PDF0')
    cobj.assert_docs('XMLN')

    # persist the status of the 2 newly created shelves
    cobj.save_shelves_status()

    # (4/4) TEI conversion (gold biblStruct type) ------------------------
Example #5
def full_run(arglist=None):
    global LOG
    global LISSAGE
    # output lines for direct use or print to STDOUT if __main__
    output_array = []

    # cli arguments
    args = my_parse_args(arglist)

    # do we need to change smoothing ?
    if args.smoothing_init and float(args.smoothing_init) > 0:
        print("Setting initial smoothing to %.2f" % args.smoothing_init,
              file=stderr)
        # global var change in main
        LISSAGE = args.smoothing_init

    # event log lines
    LOG = ['INIT: sampling %i' % args.sample_size]
    LOG.append('CRIT: fields(%s)' % ", ".join(args.criteria_list))
    if args.with_constraint_query:
        LOG.append('WITH: constraint query "%s"' % args.with_constraint_query)

    run_counter = 0

    # initial sampler run
    got_ids_idx = sample(args.sample_size,
                         args.criteria_list,
                         constraint_query=args.with_constraint_query,
                         verbose=args.verbose,
                         run_count=run_counter)
    run_counter += 1

    # how much is there?
    n_ids = len(got_ids_idx)

    # info
    print('-' * 27 + " initial result: %i docs " % n_ids + '-' * 27,
          file=stderr)

    LOG.append("XGOT: picked %i" % n_ids)

    # check combopools status
    insufficient_pool_flag = False
    for sig in LOG:
        if search("^LESS:", sig):
            insufficient_pool_flag = True
            break

    # --------- a posteriori corrections -------------
    #
    # the initial quotas can take neither the "with_constraint arg"
    # nor "multiple choice fields" into account (unless use N_reponse?)

    # for that reason at this point in the process we may have more or
    # less than the requested sample_size

    # IF not enough => new sample run with lighter criteria
    if n_ids < args.sample_size:

        actual_criteria = args.criteria_list

        # keep trying...
        while (n_ids < args.sample_size and run_counter < MAX_RUNS):

            # => over "delta" (missing docs)
            remainder = args.sample_size - n_ids
            LOG.append("REDO: re-pioche sur %i docs" % remainder)

            # => with more help to small categories
            LISSAGE += 0.2
            LOG.append("SMOO: smoothing up to %.02f" % LISSAGE)

            # => and with less criteria if necessary
            # (if criteria pool insufficient under some constraints, we
            #  do need to relax at least one criterion, but which one?)
            if len(actual_criteria) > 1 and insufficient_pool_flag:
                # simplify criteria by removing the last one
                new_criteria = actual_criteria[0:-1]
                LOG.append("RLAX: abandon équilibrage champ '%s'" %
                           actual_criteria[-1])

                # reset flag (£TODO recalculate after run ?)
                insufficient_pool_flag = False
            else:
                new_criteria = actual_criteria

            # -------- RE-RUN ---------
            previous_ids = got_ids_idx
            got_ids_idx = sample(remainder,
                                 new_criteria,
                                 constraint_query=args.with_constraint_query,
                                 index=previous_ids,
                                 verbose=args.verbose)

            # recount
            apport = len(got_ids_idx) - n_ids

            # update
            n_ids += apport
            run_counter += 1

            # warn
            LOG.append("XGOT: picked %i" % apport)
            print('-' * 22 + " result after run %i: %i documents " %
                  (run_counter, n_ids) + '-' * 22,
                  file=stderr)

    # IF overflow => random pruning
    if n_ids > args.sample_size:
        deck = [did for did in got_ids_idx.keys()]
        # random removal of excess documents
        shuffle(deck)
        nd = n_ids - args.sample_size
        sacrificed = deck[0:nd]
        for did in sacrificed:
            del got_ids_idx[did]
        LOG.append("XDEL: sacrificing %i random docs" % nd)

    # last recount
    n_ids = len(got_ids_idx)
    print('-' * 29 + " final result: %i docs " % n_ids + '-' * 29, file=stderr)

    # -------------- OUTPUT --------------------------------------------

    # ***(ids)***
    if args.out_type == 'ids':
        for did, info in sorted(got_ids_idx.items(), key=lambda x: x[1]['_q']):
            output_array.append("%s" % did)

    # ***(tab)***
    elif args.out_type == 'tab':
        # header line
        # £TODO STD_MAP
        output_array.append("\t".join([
            'istex_id', 'corpus', 'pub_year', 'pub_period', 'pdfver', 'pdfwc',
            'author_1', 'lang', 'doctype_1', 'cat_sci', 'title'
        ]))
        # contents
        for did, info in sorted(got_ids_idx.items(), key=lambda x: x[1]['_q']):
            # provenance: sample() => loop over hits (l.500 ++)
            # print("INFO----------",info, file=stderr)
            # exit()

            period = year_to_range(info['yr'])

            output_array.append("\t".join([
                did,
                info['co'],
                info['yr'],
                period,
                info['ver'],
                str(info['wcp']),
                info['au'],
                info['lg'],
                info['typ'],
                info['cat'],
                info['ti'],
                #~ info['_q']
            ]))

    # ***(docs)***
    # no output lines but writes a dir
    elif args.out_type == 'docs':
        my_dir = path.join(getcwd(), my_name)
        mkdir(my_dir)

        # two "parallel" lists
        ids = list(got_ids_idx.keys())
        basenames = [
            std_filename(one_id, got_ids_idx[one_id]) for one_id in ids
        ]

        # loop with interactive authentication prompt if needed
        api.write_fulltexts_loop_interact(
            ids,
            basenames,
            tgt_dir=my_dir,
            api_types=['metadata/xml', 'fulltext/pdf'])

        LOG.append("SAVE: saved docs in %s/" % my_dir)

    return (output_array, LOG)
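
# --- Usage sketch (not part of the original example) -----------------------
# Unlike the previous variant, this version returns an (output_array, LOG) tuple
# instead of optionally writing a .log file itself; a hedged sketch of handling
# both parts (the "sampler" module path is an assumption, as above):
from sys import stderr
import sampler

lines, log = sampler.full_run(['-n', '20', '--outmode', 'ids', '-v'])
print("\n".join(lines))              # the sampled istex ids, one per line
print("\n".join(log), file=stderr)   # the event log (INIT/CRIT/XGOT/... lines)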