def load_status(project_id):
    """Load the persisted status of a project from disk.

    Reads <LIBINTEL_DATA_DIR>/out/<project_id>/status.json and rehydrates
    it into a Status object.

    :param project_id: the id of the current project
    :return: the Status deserialized from the JSON file
    """
    with app.app_context():
        location = app.config.get("LIBINTEL_DATA_DIR")
        path_to_file = location + '/out/' + project_id + '/status.json'
        # the context manager closes the file; the explicit close() the
        # original carried inside the with-block was redundant and removed
        with open(path_to_file) as json_file:
            status = json.load(json_file)
        return Status(**status)
def read_status(self):
    """Read an optional Status record from the stream.

    A leading boolean flags presence: when it is unset, None is returned.
    Otherwise the five Status fields are consumed in stream order
    (long, StatusType enum, long, long, int) and wrapped in a Status.
    """
    present = self.read_boolean()
    if not present:
        return None
    # read the fields one by one so the stream-consumption order is explicit
    first_long = self.read_long()
    status_type = self.read_enum(StatusType)
    second_long = self.read_long()
    third_long = self.read_long()
    trailing_int = self.read_int()
    return Status(first_long, status_type, second_long, third_long, trailing_int)
def data_collection_execution(project_id):
    """
    run the data collection
    :parameter project_id the id of the current project
    """
    # optional ?mode= request parameter ('' = full run, 'missed' = re-run failed EIDs)
    # fix: single lookup instead of the original double request.args.get call
    mode = request.args.get('mode') or ''
    app.logger.info('project {}: collecting data with mode {}'.format(project_id, mode))
    # load project, set status bools, and load eid list. initialize missed eid list
    project = project_service.load_project(project_id)
    project.isDataCollecting = True
    project.isDataCollected = False
    eids = eids_service.load_eid_list(project_id, mode)
    missed_eids = []
    with app.app_context():
        keys = app.config.get("LIBINTEL_SCOPUS_KEYS")
        # initialize status, set to collecting and save status
        status = Status("DATA_COLLECTING")
        status.total = len(eids)
        status_service.save_status(project_id, status)
        if status.total > 0:
            # a fresh run replaces the index; a 'missed' re-run keeps it
            if mode != 'missed':
                elasticsearch_service.delete_index(project.project_id)
            else:
                eids_service.deleteMissedEids()
            if isinstance(keys, tuple):
                # the number of threads is given by the number of available API keys
                number_of_threads = len(keys)
                app.logger.info('project {}: collecting data in {} threads'.format(
                    project_id, number_of_threads))
                # gather the individual chunks provided to each process
                length_of_chunks = math.ceil(status.total / number_of_threads)
                list_chunks = list(chunks(eids, length_of_chunks))
                # make asynchronous calls and delegate the individual collection
                # to the individual threads, one API key per thread
                for key_index, key in enumerate(keys):
                    if len(list_chunks) > key_index:
                        thread = Thread(target=collect_data,
                                        args=(list_chunks[key_index], project.project_id,
                                              project.name, key_index, key,
                                              app._get_current_object()))
                        thread.start()
                return Response('finished', status=204)
            collect_data(eids=eids, project_id=project.project_id, project_name=project.name,
                         i=0, key=keys, app=app._get_current_object())
            # if only one API-Key is given, collect data sequentially
            for idx, eid in enumerate(eids):
                # set scopus api-key to the provided key
                # fix: config key was misspelled 'APIKEy' (configparser keys are
                # case-insensitive, so this normalization is behavior-safe)
                scopus.config['Authentication']['APIKey'] = keys
                # update the progress status and save the status to disk
                status.progress = idx + 1
                status_service.save_status(project_id, status)
                # print progress
                # fix: the original concatenated an unformatted '{}' placeholder
                # and was missing a space before 'of'
                app.logger.info('project {}: processing entry {} of {} entries: {}%'.format(
                    project_id, idx, status.total, idx / status.total * 100))
                # retrieve data from scopus
                try:
                    scopus_abstract = scopus.AbstractRetrieval(identifier=eid, id_type='eid',
                                                               view="FULL", refresh=True)
                    app.logger.info('project {}: collected scopus data for EID {}'.format(
                        project_id, eid))
                except Exception as inst:
                    app.logger.error(
                        'project {}: could not collect scopus data for EID {}, exception: {}'
                        .format(project_id, eid, type(inst)))
                    missed_eids.append(eid)
                    continue
                # create new AllResponses object to hold the individual information
                response = AllResponses(eid, project.name, project.project_id)
                # add scopus abstract to AllResponses object
                response.scopus_abstract_retrieval = scopus_abstract
                # get doi and collect unpaywall data and Altmetric data
                # fix: collapsed the nested None/"" checks into one truthiness test
                doi = scopus_abstract.doi
                if doi:
                    response.unpaywall_response = Unpaywall(doi)
                    response.altmetric_response = Altmetric(doi)
                response.scival_data = Scival([])
                # send response to elastic search index
                elasticsearch_service.send_to_index(response, project.project_id)
                app.logger.info('project {}: saved EID {} to elasticsearch'.format(
                    project_id, eid))
            # persist the EIDs that could not be retrieved for a later 'missed' re-run
            eids_service.save_eid_list(project_id=project.project_id, eids=missed_eids,
                                       prefix='missed_')
        app.logger.info('project {}: all EID data collected'.format(project_id))
        status.status = "DATA_COLLECTED"
        status_service.save_status(project_id, status)
        project.isDataCollecting = False
        project.isDataCollected = True
        project_service.save_project(project)
        return Response({"status": "FINISHED"}, status=204)
def references_collection_execution(project_id):
    """
    collects the references for a given collection of publications
    :param project_id: the ID of the current project
    :return: 204 if successful
    """
    # initialize lists, read sample size from request and load eid list
    sample_size = int(request.args.get('sample_size'))
    missed_eids = []
    references_eids = []
    eids = eids_service.load_eid_list(project_id)
    # load project and set booleans
    project = project_service.load_project(project_id)
    project.isReferencesCollecting = True
    project.isReferencesCollected = False
    project_service.save_project(project)
    # prepare status
    status = Status("REFERENCES_COLLECTING")
    # fix: len(eids) instead of eids.__len__()
    status.total = len(eids)
    status_service.save_status(project_id, status)
    # if eids are given, cycle through all of them
    if status.total > 0:
        for idx, eid in enumerate(eids):
            # update the progress status and save the status to disk
            status.progress = idx + 1
            status_service.save_status(project_id, status)
            # print progress
            # fix: the original concatenated an unformatted '{}' placeholder
            # and was missing a space before 'of'
            app.logger.info('project {}: processing entry {} of {} entries: {}%'.format(
                project_id, idx, status.total, idx / status.total * 100))
            # retrieve references from scopus
            try:
                scopus_abstract = scopus.AbstractRetrieval(eid, view="FULL")
                app.logger.info('project {}: collected scopus data for EID {}'.format(
                    project_id, eid))
                if scopus_abstract.references is not None:
                    references_eids = references_eids + scopus_abstract.references
                else:
                    # fix: logger.warn is deprecated in favour of logger.warning
                    app.logger.warning(
                        'project {}: no references given in scopus export for EID {}.'
                        .format(project_id, eid))
            except IOError:
                app.logger.error('project {}: could not collect scopus data for EID {}'.
                                 format(project_id, eid))
                missed_eids.append(eid)
                continue
    # calculate the occurrences of each referenced EID and keep the
    # sample_size most common ones
    # fix: Counter accepts the list directly, the tuple conversion was unnecessary
    occurrences = Counter(references_eids)
    most_occurrences = occurrences.most_common(sample_size)
    # save the counter with the most occurrences to disk
    counter_service.save_counter(project_id, most_occurrences, 'references_')
    eids_service.save_eid_list(project_id, missed_eids, prefix='missed_')
    # set the status and save it to disk
    status.status = "DATA_COLLECTED"
    status_service.save_status(project_id, status)
    # set the project booleans and save it to disk
    project.isReferencesCollecting = False
    project.isReferencesCollected = True
    project_service.save_project(project)
    return Response({"status": "FINISHED"}, status=204)
def query_execution(project_id):
    """
    executes the defined and saved query in scopus
    :param project_id: the ID of the current project
    :return: 'finished' with a status of 204 when the query was executed successfully
    """
    app.logger.info('project {}: running queries'.format(project_id))
    # reads the saved Scopus search string from disk
    scopus_queries = query_service.load_scopus_queries(project_id)
    # retrieve the project from disk, set the booleans and save the project
    project = project_service.load_project(project_id)
    project.isEidsCollected = False
    project.isEidsCollecting = True
    project_service.save_project(project)
    # prepares the status file
    status = Status("EIDS_COLLECTING")
    status_service.save_status(project_id, status)
    # prepare EIDs list
    eids = []
    for index, search_strings in enumerate(scopus_queries.search_strings):
        individual_eids = []
        for search_string in search_strings:
            app.logger.info('project {}: executing search {} - {}'.format(
                project_id, index, search_string))
            search = scopus.ScopusSearch(search_string, refresh=True, field='eid',
                                         view='STANDARD')
            if search.results is not None:
                app.logger.info('project {}: result search {} - {} entries found'.format(
                    project_id, index, len(search.results)))
                # collect every hit; duplicates across searches are removed
                # later via the set() conversions below
                for result in search.results:
                    eids.append(result.eid)
                    individual_eids.append(result.eid)
        # persist the deduplicated per-query hit list under its search id
        eids_service.save_eid_list(project_id=project_id, eids=set(individual_eids),
                                   prefix=(str(scopus_queries.search_ids[index]) + '_'))
    # convert to set in order to remove duplicates
    eids = set(eids)
    # print the results to the command line for logging
    app.logger.info('project {}: found {} eids in Scopus'.format(project_id, len(eids)))
    # persist EIDs to file
    eids_service.save_eid_list(project_id=project_id, eids=eids)
    # set the total number of results to the relevance_measures measure and save it
    # fix: len(eids) instead of eids.__len__()
    relevance_measure = RelevanceMeasure(number_of_search_results=len(eids))
    relevance_measure_service.save_relevance_measures(project_id, relevance_measure)
    # set the total number of results to the status and save it to disk
    status.total = relevance_measure.number_of_search_results
    status_service.save_status(project_id, status)
    # set the status and save it to disk
    status = Status("EIDS_COLLECTED")
    status_service.save_status(project_id, status)
    # set the project boolean and save the project
    project.isEidslist = True
    project.isEidsCollected = True
    project.isEidsCollecting = False
    project_service.save_project(project)
    return Response({"status": "FINISHED"}, status=204)