def execute_query(project_id, query_id):
    """
    executes the defined and saved query in Scopus
    :param project_id: the ID of the current project
    :param query_id: the ID of the query to execute
    :return: 'finished' with a status of 204 when the query was executed successfully
    """
    app.logger.info('project {}: running query {}'.format(project_id, query_id))

    # read the saved Scopus search string from disk
    scopus_queries = query_service.load_scopus_queries(project_id, query_id)

    # retrieve the project from disk, set the booleans and save the project
    project = project_service.load_project(project_id)
    project.isEidsCollected = False
    project.isEidsCollecting = True
    project_service.save_project(project)

    eids = scopus_service.execute_query(scopus_queries)

    # print the results to the command line for logging
    app.logger.info('project {}: found {} entries in Scopus'.format(project_id, len(eids)))

    # persist the EIDs to file
    identifier_service.save_id_list(project_id=project_id, query_id=query_id, identifiers=eids)

    # set the project booleans and save the project
    project.isEidslist = True
    project.isEidsCollected = True
    project.isEidsCollecting = False
    project_service.save_project(project)
    return Response(json.dumps({"status": "FINISHED"}), status=204)
def set_survey_id(project_id):
    survey_id = request.form['survey_id']
    project = project_service.load_project(project_id)
    project.survey_id = survey_id
    project_service.save_project(project)
    app.logger.info('project {}: connecting with survey {}'.format(project_id, survey_id))
    return Response('survey ID saved', status=204)
def upload_sample_judgement_file(project_id):
    with app.app_context():
        location = app.config.get("LIBINTEL_DATA_DIR")
        app.logger.info('project {}: saving sample judgement file'.format(project_id))
        project = project_service.load_project(project_id)
        file = request.files['sample-judgement-file']
        path_to_save = location + '/out/' + project_id + '/'
        if not os.path.exists(path_to_save):
            os.makedirs(path_to_save)
        file.save(path_to_save + 'sample_judgement_eids_list.csv')
        project.isSampledata = True
        project_service.save_project(project)
        return Response('list saved', status=204)
def upload_test_file(project_id):
    with app.app_context():
        location = app.config.get("LIBINTEL_DATA_DIR")
        app.logger.info('project {}: saving test file'.format(project_id))
        project = project_service.load_project(project_id)
        file = request.files['test-file']
        path_to_save = location + '/out/' + project_id + '/'
        if not os.path.exists(path_to_save):
            os.makedirs(path_to_save)
        file.save(path_to_save + 'test_eids_list.txt')
        project.isTestdata = True
        project_service.save_project(project)
        return Response('list saved', status=204)
def save_project():
    # get the client-provided project
    project = get_project_from_req(request)
    response_body = {}
    if project:
        operation = "save" if "_id" in project else "create"
        project = project_service.save_project(project)
        if project:
            populate_response_with_project(response_body, project)
            now = str(datetime.now())
            print(">> %s\t%s\t%s_project()\t[%s]" %
                  (request.remote_addr, now, operation, project["_id"]))
        else:
            response_body["error"] = True
            response_body["message"] = "Error saving the project"
    else:
        response_body["error"] = True
        response_body["message"] = "No project provided to save"

    # create the response object
    response = make_response(jsonify(response_body))
    return response
def upload_xml_file(project_id):
    """
    retrieves the query xml file from the request and saves it to disk
    :param project_id: the ID of the current project
    :return: returns a status of 204 when the file could be saved
    """
    with app.app_context():
        location = app.config.get("LIBINTEL_DATA_DIR")
        app.logger.info("project {}: saving uploaded xml file".format(project_id))
        if request.method == 'POST':
            project = project_service.load_project(project_id)
            file = request.files['query_xml']
            path_to_save = location + '/out/' + project_id + '/'
            if not os.path.exists(path_to_save):
                os.makedirs(path_to_save)
            file.save(path_to_save + 'query.xml')
            project.isQueryDefined = True
            project_service.save_project(project)
        return Response("OK", status=204)
def save_query_as_xml(project_id):
    """
    saves the query as an xml document (query.xml) in the working directory.
    Creates a Scopus search string and saves it as scopus_search_string.txt.
    Sets project.isQueryDefined = True
    :param project_id: the ID of the current project
    :return: the saved query
    """
    project = project_service.load_project(project_id)
    query_json = request.get_json(silent=True)
    query = query_service.from_json(query_json)
    try:
        query_service.save_query_to_xml(project_id, query)
        app.logger.info('project {}: successfully saved query to xml'.format(project_id))
    except IOError:
        app.logger.warning('project {}: could not save query to xml'.format(project_id))
        return Response("could not save query", status=500)
    query_service.create_scopus_queries(project_id, query)
    project.isQueryDefined = True
    app.logger.info('project {}: scopus queries defined'.format(project_id))
    project_service.save_project(project)
    return json.dumps(query, default=lambda o: o.__getstate__())
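# Note: the json.dumps(query, default=lambda o: o.__getstate__()) call above requires
# the query objects to expose a __getstate__() that returns something JSON-serializable.
# A minimal sketch of such a class (hypothetical; the actual query model lives in
# query_service, and its attributes are assumptions here):
class QuerySketch:
    def __init__(self, search_terms):
        self.search_terms = search_terms

    def __getstate__(self):
        # return a plain dict so json.dumps can serialize the object
        return self.__dict__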
def data_collection_execution(project_id):
    """
    runs the data collection
    :param project_id: the ID of the current project
    """
    mode = ''
    if request.args.get('mode') is not None:
        mode = request.args.get('mode')
    app.logger.info('project {}: collecting data with mode {}'.format(project_id, mode))

    # load the project, set the status booleans and load the EID list; initialize the missed-EIDs list
    project = project_service.load_project(project_id)
    project.isDataCollecting = True
    project.isDataCollected = False
    eids = eids_service.load_eid_list(project_id, mode)
    missed_eids = []

    with app.app_context():
        keys = app.config.get("LIBINTEL_SCOPUS_KEYS")

        # initialize the status, set it to collecting and save it
        status = Status("DATA_COLLECTING")
        status.total = len(eids)
        status_service.save_status(project_id, status)

        if status.total > 0:
            if mode != 'missed':
                elasticsearch_service.delete_index(project.project_id)
            else:
                eids_service.deleteMissedEids()

            if type(keys) is tuple:
                # the number of threads is given by the number of available API keys
                number_of_threads = len(keys)
                app.logger.info('project {}: collecting data in {} threads'.format(project_id, number_of_threads))

                # gather the individual chunks provided to each thread
                length_of_chunks = math.ceil(status.total / number_of_threads)
                list_chunks = list(chunks(eids, length_of_chunks))

                # make asynchronous calls and delegate the individual collection to the individual threads
                for key_index, key in enumerate(keys):
                    if len(list_chunks) > key_index:
                        thread = Thread(target=collect_data,
                                        args=(list_chunks[key_index], project.project_id, project.name,
                                              key_index, key, app._get_current_object()))
                        thread.start()
                return Response('finished', status=204)

            # if only one API key is given, collect the data sequentially
            scopus.config['Authentication']['APIKey'] = keys
            for idx, eid in enumerate(eids):
                # update the progress and save the status to disk
                status.progress = idx + 1
                status_service.save_status(project_id, status)

                # log the progress
                app.logger.info('project {}: processing entry {} of {} entries: {:.0f}%'.format(
                    project_id, idx + 1, status.total, idx / status.total * 100))

                # retrieve the data from Scopus
                try:
                    scopus_abstract = scopus.AbstractRetrieval(identifier=eid, id_type='eid',
                                                               view="FULL", refresh=True)
                    app.logger.info('project {}: collected scopus data for EID {}'.format(project_id, eid))
                except Exception as inst:
                    app.logger.error('project {}: could not collect scopus data for EID {}, exception: {}'.format(
                        project_id, eid, type(inst)))
                    missed_eids.append(eid)
                    continue

                # create a new AllResponses object to hold the individual information
                response = AllResponses(eid, project.name, project.project_id)

                # add the scopus abstract to the AllResponses object
                response.scopus_abstract_retrieval = scopus_abstract

                # get the doi and collect the Unpaywall data and Altmetric data
                doi = scopus_abstract.doi
                if doi is not None and doi != "":
                    response.unpaywall_response = Unpaywall(doi)
                    response.altmetric_response = Altmetric(doi)
                response.scival_data = Scival([])

                # send the response to the elasticsearch index
                elasticsearch_service.send_to_index(response, project.project_id)
                app.logger.info('project {}: saved EID {} to elasticsearch'.format(project_id, eid))

            eids_service.save_eid_list(project_id=project.project_id, eids=missed_eids, prefix='missed_')
            app.logger.info('project {}: all EID data collected'.format(project_id))
            status.status = "DATA_COLLECTED"
            status_service.save_status(project_id, status)

    project.isDataCollecting = False
    project.isDataCollected = True
    project_service.save_project(project)
    return Response(json.dumps({"status": "FINISHED"}), status=204)
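# Note: chunks() and collect_data() are referenced above but defined elsewhere in the
# repository. The following is a minimal sketch inferred solely from the call sites
# above; the signatures and behavior are assumptions, not the actual implementation.
def chunks(items, n):
    # yield successive slices of length n from items, so that each API key / thread
    # receives one contiguous chunk of the EID list
    for i in range(0, len(items), n):
        yield items[i:i + n]


def collect_data(eids, project_id, project_name, i, key, app):
    # hypothetical worker signature implied by Thread(target=collect_data, args=(...)):
    # one chunk of EIDs, the project identifiers, the thread index, the API key and
    # the Flask app object; the body would mirror the sequential loop above
    ...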
def references_collection_execution(project_id):
    """
    collects the references for a given collection of publications
    :param project_id: the ID of the current project
    :return: 204 if successful
    """
    # initialize the lists, read the sample size from the request and load the EID list
    sample_size = int(request.args.get('sample_size'))
    missed_eids = []
    references_eids = []
    eids = eids_service.load_eid_list(project_id)

    # load the project and set the booleans
    project = project_service.load_project(project_id)
    project.isReferencesCollecting = True
    project.isReferencesCollected = False
    project_service.save_project(project)

    # prepare the status
    status = Status("REFERENCES_COLLECTING")
    status.total = len(eids)
    status_service.save_status(project_id, status)

    # if EIDs are given, cycle through all of them
    if status.total > 0:
        for idx, eid in enumerate(eids):
            # update the progress and save the status to disk
            status.progress = idx + 1
            status_service.save_status(project_id, status)

            # log the progress
            app.logger.info('project {}: processing entry {} of {} entries: {:.0f}%'.format(
                project_id, idx + 1, status.total, idx / status.total * 100))

            # retrieve the references from Scopus
            try:
                scopus_abstract = scopus.AbstractRetrieval(eid, view="FULL")
                app.logger.info('project {}: collected scopus data for EID {}'.format(project_id, eid))
                if scopus_abstract.references is not None:
                    references_eids = references_eids + scopus_abstract.references
                else:
                    app.logger.warning('project {}: no references given in scopus export for EID {}.'.format(
                        project_id, eid))
            except IOError:
                app.logger.error('project {}: could not collect scopus data for EID {}'.format(project_id, eid))
                missed_eids.append(eid)
                continue

    # transform the references EIDs into a tuple and count the occurrences
    references_eids_tuple = tuple(references_eids)
    occurrences = Counter(references_eids_tuple)
    most_occurrences = occurrences.most_common(sample_size)

    # save the counter with the most occurrences to disk
    counter_service.save_counter(project_id, most_occurrences, 'references_')
    eids_service.save_eid_list(project_id, missed_eids, prefix='missed_')

    # set the status and save it to disk
    status.status = "DATA_COLLECTED"
    status_service.save_status(project_id, status)

    # set the project booleans and save them to disk
    project.isReferencesCollecting = False
    project.isReferencesCollected = True
    project_service.save_project(project)
    return Response(json.dumps({"status": "FINISHED"}), status=204)
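# For reference, the Counter.most_common(n) call used above returns the n most frequent
# items as (item, count) pairs, e.g. (illustrative EID values only):
#
#   >>> from collections import Counter
#   >>> Counter(['eid1', 'eid2', 'eid1', 'eid3', 'eid1', 'eid2']).most_common(2)
#   [('eid1', 3), ('eid2', 2)]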
def query_execution(project_id):
    """
    executes the defined and saved queries in Scopus
    :param project_id: the ID of the current project
    :return: 'finished' with a status of 204 when the queries were executed successfully
    """
    app.logger.info('project {}: running queries'.format(project_id))

    # read the saved Scopus search strings from disk
    scopus_queries = query_service.load_scopus_queries(project_id)

    # retrieve the project from disk, set the booleans and save the project
    project = project_service.load_project(project_id)
    project.isEidsCollected = False
    project.isEidsCollecting = True
    project_service.save_project(project)

    # prepare the status file
    status = Status("EIDS_COLLECTING")
    status_service.save_status(project_id, status)

    # prepare the EIDs list
    eids = []
    for index, search_strings in enumerate(scopus_queries.search_strings):
        individual_eids = []
        for search_string in search_strings:
            app.logger.info('project {}: executing search {} - {}'.format(project_id, index, search_string))
            search = scopus.ScopusSearch(search_string, refresh=True, field='eid', view='STANDARD')
            if search.results is not None:
                app.logger.info('project {}: result search {} - {} entries found'.format(
                    project_id, index, len(search.results)))
                for result in search.results:
                    # collect the EIDs; duplicates (e.g. from a former search) are removed below via set()
                    eids.append(result.eid)
                    individual_eids.append(result.eid)
        eids_service.save_eid_list(project_id=project_id, eids=set(individual_eids),
                                   prefix=(str(scopus_queries.search_ids[index]) + '_'))

    # convert to a set in order to remove duplicates
    eids = set(eids)

    # print the results to the command line for logging
    app.logger.info('project {}: found {} eids in Scopus'.format(project_id, len(eids)))

    # persist the EIDs to file
    eids_service.save_eid_list(project_id=project_id, eids=eids)

    # store the total number of results in the relevance measures and save them to disk
    relevance_measure = RelevanceMeasure(number_of_search_results=len(eids))
    relevance_measure_service.save_relevance_measures(project_id, relevance_measure)

    # set the total number of results on the status and save it to disk
    status.total = relevance_measure.number_of_search_results
    status_service.save_status(project_id, status)

    # set the status and save it to disk
    status = Status("EIDS_COLLECTED")
    status_service.save_status(project_id, status)

    # set the project booleans and save the project
    project.isEidslist = True
    project.isEidsCollected = True
    project.isEidsCollecting = False
    project_service.save_project(project)
    return Response(json.dumps({"status": "FINISHED"}), status=204)