Beispiel #1
0
def check_test_eids(project_id):
    """
    Calculates the recall by comparing the list of EIDs retrieved from the
    query against the list of test EIDs as obtained from the survey.

    :param project_id: the ID of the current project
    :return: a JSON formatted relevance measure object.
    """
    test_eids = eids_service.load_eid_list(project_id, 'test_')
    app.logger.info('project {}: loaded test eids'.format(project_id))

    # load collected eids
    eids = eids_service.load_eid_list(project_id)
    relevance_measure = relevance_measure_service.load_relevance_measure(
        project_id)
    if relevance_measure is None:
        relevance_measure = RelevanceMeasure()
    relevance_measure.number_of_search_results = len(eids)
    relevance_measure.number_test_entries = len(test_eids)
    # a set makes each membership test O(1) instead of O(n)
    collected_eids = set(eids)
    relevance_measure.number_test_entries_found = sum(
        1 for test_eid in test_eids if test_eid in collected_eids)
    # guard the actual divisor: the original checked number_of_search_results,
    # which still allowed a ZeroDivisionError for an empty test list
    if relevance_measure.number_test_entries > 0:
        relevance_measure.recall = (
            relevance_measure.number_test_entries_found /
            relevance_measure.number_test_entries)
    else:
        relevance_measure.recall = 0
    relevance_measure_service.save_relevance_measures(project_id,
                                                      relevance_measure)
    app.logger.info(
        'project {}: calculated relevance measure recall'.format(project_id))
    return jsonify(relevance_measure)
Beispiel #2
0
def get_eids_list_length(project_id):
    """
    Returns the number of EIDs stored for the requested list prefix.

    :param project_id: the ID of the current project
    :return: the list length as a plain-text 200 response, or 404 when the
        underlying EID file does not exist
    """
    prefix = request.args.get('prefix')
    try:
        eids = eids_service.load_eid_list(project_id, prefix)
        # len() is the idiomatic spelling of eids.__len__()
        return Response(str(len(eids)), status=200)
    except FileNotFoundError:
        return Response("File not found", status=404)
Beispiel #3
0
def getPrecision(project_id):
    """
    Calculates the precision as the share of relevant entries among the
    judged sample entries and stores the updated relevance measure.

    :param project_id: the ID of the current project
    :return: the relevance measure as JSON
    """
    # load the collected eids and the stored relevance measure (or a new one)
    eids = eids_service.load_eid_list(project_id)
    relevance_measure = relevance_measure_service.load_relevance_measure(
        project_id)
    if relevance_measure is None:
        relevance_measure = RelevanceMeasure()
    relevance_measure.number_of_search_results = len(eids)

    judgement_list = eids_service.load_judgement_file(project_id)
    app.logger.info('project {}: loaded judgements'.format(project_id))
    relevance_measure.number_sample_entries = len(judgement_list)
    relevance_measure.number_positive_sample_entries = sum(
        1 for judgement in judgement_list if judgement.isRelevant)

    # precision = relevant sample entries / all sample entries
    if relevance_measure.number_sample_entries > 0:
        relevance_measure.precision = (
            relevance_measure.number_positive_sample_entries /
            relevance_measure.number_sample_entries)
    else:
        relevance_measure.precision = 0

    relevance_measure_service.save_relevance_measures(project_id,
                                                      relevance_measure)
    app.logger.info(
        'project {}: calculated relevance measure precision'.format(
            project_id))
    return jsonify(relevance_measure)
def get_eids_list_length(project_id):
    """
    Returns the number of EIDs stored for the requested list prefix.

    :param project_id: the ID of the current project
    :return: the list length as a plain-text 200 response, or 404 when the
        underlying EID file does not exist
    """
    prefix = request.args.get('prefix')
    try:
        number_of_eids = len(eids_service.load_eid_list(project_id, prefix))
    except FileNotFoundError:
        app.logger.error('project {}: could not send {} eid file'.format(
            project_id, prefix))
        return Response("File not found", status=404)
    return Response(str(number_of_eids), status=200)
def check_test_eids(project_id):
    """
    Calculates the recall by comparing the collected EIDs against the test
    EID list and stores the updated relevance measure.

    :param project_id: the ID of the current project
    :return: the relevance measure's __dict__ as JSON
    """
    # load test eids
    test_eids = eids_service.load_eid_list(project_id, 'test_')

    # load collected eids
    eids = eids_service.load_eid_list(project_id)
    relevance_measure = relevance_measure_service.load_relevance_measure(
        project_id)
    if relevance_measure is None:
        relevance_measure = RelevanceMeasure()
    relevance_measure.number_of_search_results = len(eids)
    relevance_measure.number_test_entries = len(test_eids)
    # reset the counter before counting: a measure loaded from disk would
    # otherwise accumulate across repeated calls
    relevance_measure.number_test_entries_found = 0
    collected_eids = set(eids)
    for test_eid in test_eids:
        if test_eid in collected_eids:
            relevance_measure.number_test_entries_found += 1
    # guard the actual divisor: the original checked number_of_search_results,
    # which still allowed a ZeroDivisionError for an empty test list
    if relevance_measure.number_test_entries > 0:
        relevance_measure.recall = (
            relevance_measure.number_test_entries_found /
            relevance_measure.number_test_entries)
    else:
        relevance_measure.recall = 0
    relevance_measure_service.save_relevance_measures(project_id,
                                                      relevance_measure)
    return jsonify(relevance_measure.__dict__)
Beispiel #6
0
def generate_sample_publication_list(project_id, sample_size, session_id):
    """
    Draws a random sample of the collected EIDs (preserving their original
    order) and saves it under the given session id.

    :param project_id: the ID of the current project
    :param sample_size: the number of EIDs to sample
    :param session_id: prefix under which the sample list is saved
    :return: the sampled EID list (all EIDs if fewer than sample_size exist)
    """
    eids = eids_service.load_eid_list(project_id)

    if len(eids) > sample_size:
        # sample over ALL indices: the original used range(1, len(eids)),
        # which could never select the very first EID
        sample_indices = set(random.sample(range(len(eids)), sample_size))
        random_sample_eids = [
            eid for index, eid in enumerate(eids) if index in sample_indices
        ]
    else:
        random_sample_eids = eids
    eids_service.save_eid_list(project_id, random_sample_eids, session_id)
    return random_sample_eids
Beispiel #7
0
def get_eids_scopus_search_string(project_id):
    """
    Builds a Scopus search string 'EID(a OR b OR ...)' from the stored EID
    list for the requested prefix.

    :param project_id: the ID of the current project
    :return: the search string as a plain-text 200 response, or 404 when the
        underlying EID file does not exist
    """
    prefix = request.args.get('prefix')
    if prefix == 'sample_':
        # a sample list is (re)built on demand before it is read back
        sample_size = int(request.args.get('sample_size'))
        build_sample_list(project_id, sample_size)
    try:
        eids = eids_service.load_eid_list(project_id, prefix)
    except FileNotFoundError:
        return Response("File not found", status=404)
    # str.join avoids the quadratic '+'-concatenation of the original loop
    search_string = 'EID({})'.format(' OR '.join(eids))
    return Response(search_string, status=200)
def check_sample_eids(project_id):
    """
    Counts the judged sample entries and the relevant ones among them, then
    stores the updated relevance measure.

    :param project_id: the ID of the current project
    :return: the relevance measure's __dict__ as JSON
    """
    # load collected eids
    eids = eids_service.load_eid_list(project_id)
    relevance_measure = relevance_measure_service.load_relevance_measure(
        project_id)
    if relevance_measure is None:
        relevance_measure = RelevanceMeasure()
    relevance_measure.number_of_search_results = len(eids)
    judgement_list = eids_service.load_judgement_file(project_id)
    # use attribute access throughout: the original mixed subscripting
    # (relevance_measure['...']) with attribute access on the same object,
    # and read judgement['isRelevant'] although getPrecision uses
    # judgement.isRelevant; the positive counter is also reset here instead
    # of being incremented from an undefined starting value
    relevance_measure.number_sample_entries = len(judgement_list)
    relevance_measure.number_positive_sample_entries = sum(
        1 for judgement in judgement_list if judgement.isRelevant)
    relevance_measure_service.save_relevance_measures(project_id,
                                                      relevance_measure)
    return jsonify(relevance_measure.__dict__)
Beispiel #9
0
def build_sample_list(project_id, sample_size=100):
    """
    Draws a random sample of the collected EIDs (preserving their original
    order) and saves it with the 'sample_' prefix. If fewer EIDs than
    sample_size exist, the whole list is saved.

    :param project_id: the ID of the current project
    :param sample_size: the number of EIDs to sample (None falls back to 100)
    """
    if sample_size is None:
        sample_size = 100
    # path to the file
    eids = eids_service.load_eid_list(project_id)

    if len(eids) > sample_size:
        # sample over ALL indices: the original used range(1, len(eids)),
        # which could never select the very first EID
        sample_indices = set(random.sample(range(len(eids)), sample_size))
        random_sample_eids = [
            eid for index, eid in enumerate(eids) if index in sample_indices
        ]
        eids_service.save_eid_list(project_id=project_id,
                                   eids=random_sample_eids,
                                   prefix='sample_')
    else:
        eids_service.save_eid_list(project_id=project_id,
                                   eids=eids,
                                   prefix='sample_')
Beispiel #10
0
def retrieve_publications_sample(project_id, query_id):
    """
    Loads (or lazily generates) the random sample EID list for the session
    and retrieves the corresponding publications from Scopus.

    :param project_id: the ID of the current project
    :param query_id: the ID of the query (unused in the lookup itself)
    :return: the sample publications as a JSON 200 response
    """
    session_id = request.args.get('session')
    if session_id is None:
        session_id = 'default_session_'
    # read the raw value before converting: int(None) raises a TypeError,
    # which made the original 'if sample_size is None' check unreachable
    sample_size_arg = request.args.get('sample_size')
    sample_size = 100 if sample_size_arg is None else int(sample_size_arg)
    try:
        random_sample_eids = eids_service.load_eid_list(project_id, session_id)
    except FileNotFoundError:
        # narrow except: load_eid_list raises FileNotFoundError (see the
        # other callers in this module); a bare except hid real errors
        random_sample_eids = generate_sample_publication_list(
            project_id, sample_size, session_id)
    search_string = utils.generate_scopus_search_from_eid_list(
        random_sample_eids)
    search = scopus.ScopusSearch(search_string,
                                 refresh=True,
                                 project_id=project_id)
    sample_publications_json = json.dumps(search.results, cls=PropertyEncoder)
    return Response(sample_publications_json,
                    status=200,
                    mimetype='application/json')
Beispiel #11
0
def add_query_ids(project_id):
    """
    Tags every indexed record with the ID(s) of the quer(ies) that found it.
    Records missing from the index are logged and skipped.

    :param project_id: the ID of the current project
    :return: 204 when finished
    """
    query_ids = query_service.load_scopus_queries(project_id).search_ids
    for query_id in query_ids:
        eids = eids_service.load_eid_list(project_id, prefix=query_id + '_')
        for eid in eids:
            try:
                record = elasticsearch_service.get_record(project_id, eid)
            except Exception:
                # the original format string had no '{}' placeholder, so the
                # EID was never interpolated into the warning
                app.logger.warning('eid not in index: {}'.format(eid))
                continue
            try:
                # skip records already tagged with this query id
                if query_id in record['query_id']:
                    continue
                if record['query_id'] == '':
                    record['query_id'] = query_id
                else:
                    record['query_id'] = record['query_id'] + '; ' + query_id
            except KeyError:
                # record has no query_id field yet
                record['query_id'] = query_id

            elasticsearch_service.append_to_index(record, eid, project_id)
            app.logger.info('set query id {} to entry {}'.format(
                query_id, eid))
    return Response({"status": "FINISHED"}, status=204)
Beispiel #12
0
def data_collection_execution(project_id):
    """
    Runs the data collection for a project: loads the EID list for the
    requested mode, resets or prunes the index, and collects Scopus data
    either in one thread per configured API key or sequentially.

    :param project_id: the id of the current project
    :return: a 204 response ('finished' when threaded, FINISHED otherwise)
    """

    # collection mode ('' by default; 'missed' re-collects previously missed EIDs)
    mode = ''

    if request.args.get('mode') is not None:
        mode = request.args.get('mode')

    app.logger.info('project {}: collecting data with mode {}'.format(
        project_id, mode))

    # load project, set status bools, and load and eid list. initialize missed eid list
    project = project_service.load_project(project_id)
    project.isDataCollecting = True
    project.isDataCollected = False
    eids = eids_service.load_eid_list(project_id, mode)
    missed_eids = []

    # read the configured Scopus API key(s); a tuple means multiple keys
    with app.app_context():
        keys = app.config.get("LIBINTEL_SCOPUS_KEYS")

    # initialize status, set to collecting and save status
    status = Status("DATA_COLLECTING")
    status.total = len(eids)
    status_service.save_status(project_id, status)

    if status.total > 0:
        if mode != 'missed':
            # full run: start from an empty index
            elasticsearch_service.delete_index(project.project_id)
        else:
            # re-run for missed EIDs only: clear the missed list instead
            eids_service.deleteMissedEids()
        if type(keys) is tuple:

            # the number of threads is given by the number of available API keys
            number_of_threads = len(keys)
            app.logger.info('project {}: collecting data in {} threads'.format(
                project_id, number_of_threads))

            # gather the individual chunks provided to each process
            length_of_chunks = math.ceil(status.total / number_of_threads)
            list_chunks = list(chunks(eids, length_of_chunks))
            # make asynchronous calls and delegate the individual collection to the individual threads
            for key_index, key in enumerate(keys):
                if len(list_chunks) > key_index:
                    thread = Thread(target=collect_data,
                                    args=(list_chunks[key_index],
                                          project.project_id, project.name,
                                          key_index, key,
                                          app._get_current_object()))
                    thread.start()
            # threaded path returns immediately; the threads update the status
            return Response('finished', status=204)

        # single-key path: delegate the whole list to collect_data once
        collect_data(eids=eids,
                     project_id=project.project_id,
                     project_name=project.name,
                     i=0,
                     key=keys,
                     app=app._get_current_object())

        # if only one API-Key is given, collect data sequentially
        # NOTE(review): this loop appears to repeat the collection that
        # collect_data above was already asked to perform — confirm whether
        # one of the two is dead code
        for idx, eid in enumerate(eids):

            # set scopus api-key to the provided key
            # NOTE(review): the key 'APIKEy' has unusual casing — confirm it
            # matches the scopus config section's expected key name
            scopus.config['Authentication']['APIKEy'] = keys

            # update the progress status and save the status to disk
            status.progress = idx + 1
            status_service.save_status(project_id, status)

            # print progress
            # NOTE(review): the '{}' below is never formatted (the message is
            # built by '+'-concatenation), so the literal '{}' is logged
            app.logger.info('project {}: processing entry ' + str(idx) +
                            'of ' + str(status.total) + ' entries: ' +
                            str(idx / status.total * 100) + '%')

            # retrieve data from scopus
            try:
                scopus_abstract = scopus.AbstractRetrieval(identifier=eid,
                                                           id_type='eid',
                                                           view="FULL",
                                                           refresh=True)
                app.logger.info(
                    'project {}: collected scopus data for EID {}'.format(
                        project_id, eid))
            except Exception as inst:
                # remember failed EIDs; they are persisted below for a
                # later 'missed' re-run
                app.logger.error(
                    'project {}: could not collect scopus data for EID {}, exception: {}'
                    .format(project_id, eid, type(inst)))
                missed_eids.append(eid)
                continue

            # create new AllResponses object to hold the individual information
            response = AllResponses(eid, project.name, project.project_id)

            # add scopus abstract to AllResponses object
            response.scopus_abstract_retrieval = scopus_abstract

            # get doi and collect unpaywall data and Altmetric data
            doi = scopus_abstract.doi
            if doi is not None:
                if doi != "":
                    response.unpaywall_response = Unpaywall(doi)
                    response.altmetric_response = Altmetric(doi)
                    response.scival_data = Scival([])

            # send response to elastic search index
            elasticsearch_service.send_to_index(response, project.project_id)
            app.logger.info('project {}: saved EID {} to elasticsearch'.format(
                project_id, eid))
    # persist missed EIDs so a 'missed' mode run can retry them
    eids_service.save_eid_list(project_id=project.project_id,
                               eids=missed_eids,
                               prefix='missed_')
    app.logger.info('project {}: all EID data collected'.format(project_id))
    status.status = "DATA_COLLECTED"
    status_service.save_status(project_id, status)
    project.isDataCollecting = False
    project.isDataCollected = True
    project_service.save_project(project)
    return Response({"status": "FINISHED"}, status=204)
Beispiel #13
0
def references_collection_execution(project_id):
    """
    Collects the references for a given collection of publications and saves
    the most frequently cited ones as a counter.

    :param project_id: the ID of the current project
    :return: 204 if successful
    """
    # read the raw sample size first: int(None) would raise a TypeError for
    # a missing request argument; fall back to 100 like the other endpoints
    sample_size_arg = request.args.get('sample_size')
    sample_size = 100 if sample_size_arg is None else int(sample_size_arg)
    missed_eids = []
    references_eids = []
    eids = eids_service.load_eid_list(project_id)

    # load project and set booleans
    project = project_service.load_project(project_id)
    project.isReferencesCollecting = True
    project.isReferencesCollected = False
    project_service.save_project(project)

    # prepare status
    status = Status("REFERENCES_COLLECTING")
    status.total = len(eids)
    status_service.save_status(project_id, status)

    # if eids are given, cycle through all of them
    if status.total > 0:
        for idx, eid in enumerate(eids):
            # update the progress status and save the status to disk
            status.progress = idx + 1
            status_service.save_status(project_id, status)

            # print progress (the original mixed '{}' with '+'-concatenation,
            # so the project id was never interpolated into the message)
            app.logger.info(
                'project {}: processing entry {} of {} entries: {}%'.format(
                    project_id, idx, status.total, idx / status.total * 100))

            # retrieve references from scopus
            try:
                scopus_abstract = scopus.AbstractRetrieval(eid, view="FULL")
                app.logger.info(
                    'project {}: collected scopus data for EID {}'.format(
                        project_id, eid))
                if scopus_abstract.references is not None:
                    references_eids = references_eids + scopus_abstract.references
                else:
                    # logger.warning: logger.warn is deprecated
                    app.logger.warning(
                        'project {}: no references given in scopus export for EID {}.'
                        .format(project_id, eid))
            except IOError:
                app.logger.error(
                    'project {}: could not collect scopus data for EID {}'.
                    format(project_id, eid))
                missed_eids.append(eid)
                continue
    # count how often each reference occurs and keep the most common ones
    occurences = Counter(references_eids)
    most_occurences = occurences.most_common(sample_size)

    # save the counter with the most occurences to disk
    counter_service.save_counter(project_id, most_occurences, 'references_')
    eids_service.save_eid_list(project_id, missed_eids, prefix='missed_')

    # set the status and save it to disk
    # NOTE(review): the status is set to "DATA_COLLECTED" although this run
    # collected references — confirm whether "REFERENCES_COLLECTED" is meant
    status.status = "DATA_COLLECTED"
    status_service.save_status(project_id, status)

    # set the project booleans and save it to disk
    project.isReferencesCollecting = False
    project.isReferencesCollected = True
    project_service.save_project(project)

    return Response({"status": "FINISHED"}, status=204)