Python LiteratureSearch.scanの例、inspirehep.modules.search.LiteratureSearch.scan Pythonの例

コード例 #1

0

ファイルを表示

ファイル: institutions.py プロジェクト: rikirenz/inspire-next

    def serialize(self, pid, record, links_factory=None):
        """Return the JSON-encoded citation summary for ``record``.

        Two searches are issued: the first one scans the literature records
        matching ``authors.affiliations.recid`` against ``get_id(record)``
        and keeps only their ids; the second one filters the literature by
        those ids and is handed over to ``build_citesummary``.

        :param pid:
            Not used by this method.

        :param record:
            Record whose id is matched against the authors' affiliations.

        :param links_factory:
            Not used by this method.

        :return:
            The output of ``build_citesummary`` dumped as a JSON string.
        """
        affiliated = LiteratureSearch().query(
            'match',
            authors__affiliations__recid=get_id(record),
        )
        # Only the control number is needed to identify each match.
        affiliated = affiliated.params(_source=['control_number'])

        literature_recids = []
        for hit in affiliated.scan():
            literature_recids.append(get_id(hit.to_dict()))

        summary_fields = [
            'authors.recid',
            'collaborations.value',
            'control_number',
            'earliest_date',
            'facet_inspire_doc_type',
            'inspire_categories',
            'titles.title',
        ]
        by_recids = LiteratureSearch().filter(
            'terms', control_number=literature_recids)
        by_recids = by_recids.params(_source=summary_fields)

        citesummary = build_citesummary(by_recids)
        return json.dumps(citesummary)

コード例 #2

0

ファイルを表示

ファイル: utils.py プロジェクト: michamos/inspire-next

def build_citesummary(search):
    """Build a citation summary for every record returned by ``search``.

    For each hit of ``search.scan()`` an entry describing the record is
    created, and its ``citations`` list is filled with one item per
    literature record returned by a ``match`` query on ``references.recid``
    for that record's id.

    :param search:
        Object exposing ``scan()``, whose items expose ``to_dict()``.

    :return:
        List of summary dicts, in the order the hits were scanned.
    """
    citing_fields = (
        'authors.recid',
        'collaboration.value',
        'collections.primary',
        'control_number',
        'earliest_date',
        'facet_inspire_doc_type',
        'inspire_categories',
        'titles.title',
    )
    summaries = []

    for hit in search.scan():
        cited = hit.to_dict()

        entry = {
            'citations': [],
            'collaboration': is_collaboration(cited),
            'core': is_core(cited),
            'date': get_date(cited),
            'document_type': get_document_type(cited),
            'id': get_id(cited),
            'subject': get_subject(cited),
            'title': get_title(cited),
        }
        summaries.append(entry)

        # Look up every literature record that references this one.
        citing_search = LiteratureSearch().query(
            'match', references__recid=get_id(cited),
        ).params(_source=list(citing_fields))

        for citing_hit in citing_search.scan():
            citing = citing_hit.to_dict()

            entry['citations'].append({
                'collaboration': is_collaboration(citing),
                'core': is_core(citing),
                'date': get_date(citing),
                'document_type': get_document_type(citing),
                'id': get_id(citing),
                'selfcite': is_selfcite(cited, citing),
                'subject': get_subject(citing),
                'title': get_title(citing),
            })

    return summaries

コード例 #3

0

ファイルを表示

ファイル: literature.py プロジェクト: michamos/inspire-next

    def serialize(self, pid, record, links_factory=None):
        """Return the JSON-encoded citation summary of a single record.

        The summary is a one-element list describing ``record``; its
        ``citations`` list holds one item per literature record returned by
        a ``match`` query on ``references.recid`` for ``get_id(record)``.

        :param pid:
            Not used by this method.

        :param record:
            Record being summarized.

        :param links_factory:
            Not used by this method.

        :return:
            JSON string of the one-element summary list.
        """
        citations = []
        citesummary = [{
            'citations': citations,
            'collaboration': is_collaboration(record),
            'core': is_core(record),
            'date': get_date(record),
            'document_type': get_document_type(record),
            'id': get_id(record),
            'subject': get_subject(record),
            'title': get_title(record),
        }]

        citing_search = LiteratureSearch().query(
            'match',
            references__recid=get_id(record),
        )
        citing_search = citing_search.params(_source=[
            'authors.recid',
            'collaboration.value',
            'collections.primary',
            'control_number',
            'earliest_date',
            'facet_inspire_doc_type',
            'inspire_categories',
            'titles.title',
        ])

        for hit in citing_search.scan():
            citing = hit.to_dict()

            citations.append({
                'collaboration': is_collaboration(citing),
                'core': is_core(citing),
                'date': get_date(citing),
                'document_type': get_document_type(citing),
                'id': get_id(citing),
                'subject': get_subject(citing),
                'selfcite': is_selfcite(record, citing),
                'title': get_title(citing),
            })

        return json.dumps(citesummary)

コード例 #4

0

ファイルを表示

    def serialize(self, pid, record, links_factory=None):
        """Return a JSON list of co-authors for a given author recid.

        Scans the literature records matching ``authors.recid`` against the
        author's pid value and counts, per co-author recid, in how many
        author signatures it appears.

        :param pid:
            Persistent identifier instance.

        :param record:
            Record instance.

        :param links_factory:
            Factory function for the link generation, which are added to
            the response.

        :return:
            JSON string with one ``{count, full_name, id, record}`` object
            per co-author.
        """
        author_pid = pid.pid_value
        coauthors = {}

        search = LiteratureSearch().query({
            "match": {
                "authors.recid": author_pid
            }
        }).params(
            _source=[
                "authors.full_name",
                "authors.recid",
                "authors.record",
            ]
        )

        for result in search.scan():
            for author in result.to_dict()['authors']:
                try:
                    recid = author['recid']

                    # Don't add the reference author.
                    # NOTE(review): this compares the indexed recid with
                    # ``pid.pid_value`` as-is; confirm both have the same
                    # type, otherwise the author is never filtered out.
                    if recid == author_pid:
                        continue

                    if recid in coauthors:
                        coauthors[recid]['count'] += 1
                    else:
                        coauthors[recid] = dict(
                            count=1,
                            full_name=author['full_name'],
                            id=recid,
                            record=author['record'],
                        )
                except KeyError:
                    # Signatures missing one of the required keys are
                    # deliberately skipped.
                    pass

        # ``dict.values()`` is a view on Python 3, which ``json.dumps``
        # cannot serialize; materialize it into a list first.
        return json.dumps(list(coauthors.values()))

コード例 #5

0

ファイルを表示

ファイル: utils.py プロジェクト: fschwenn/inspire-next

def build_citesummary(search):
    """Return the citation summary of all the records found by ``search``.

    Each scanned record yields one dict describing it, whose ``citations``
    key lists the literature records returned by a ``match`` query on
    ``references.recid`` for the id of that record.

    :param search:
        Object exposing ``scan()``, whose items expose ``to_dict()``.

    :return:
        List with one summary dict per scanned record, in scan order.
    """
    result_list = []

    for scanned in search.scan():
        source = scanned.to_dict()

        cited_by = []
        result_list.append({
            'citations': cited_by,
            'collaboration': is_collaboration(source),
            'core': is_core(source),
            'date': get_date(source),
            'document_type': get_document_type(source),
            'id': get_id(source),
            'subject': get_subject(source),
            'title': get_title(source),
        })

        # Restrict the payload to the fields read by the helpers below.
        wanted = [
            'authors.recid',
            'collaborations.value',
            'control_number',
            'earliest_date',
            'facet_inspire_doc_type',
            'inspire_categories',
            'titles.title',
        ]
        referencing = LiteratureSearch().query(
            'match', references__recid=get_id(source))
        referencing = referencing.params(_source=wanted)

        cited_by.extend(
            {
                'collaboration': is_collaboration(citer),
                'core': is_core(citer),
                'date': get_date(citer),
                'document_type': get_document_type(citer),
                'id': get_id(citer),
                'selfcite': is_selfcite(source, citer),
                'subject': get_subject(citer),
                'title': get_title(citer),
            }
            for citer in (found.to_dict() for found in referencing.scan())
        )

    return result_list

コード例 #6

0

ファイルを表示

ファイル: publications.py プロジェクト: jacenkow/inspire-next

    def serialize(self, pid, record, links_factory=None):
        """Serialize the publications of an author as a JSON list.

        :param pid:
            Persistent identifier instance; its ``pid_value`` is matched
            against ``authors.recid``.

        :param record:
            Record instance (not read by this method).

        :param links_factory:
            Factory function for the link generation (not read by this
            method).

        :return:
            JSON string with one object per matching publication. ``id``,
            ``record`` and ``title`` are always present; ``date``, ``type``,
            ``citations``, ``journal`` and ``collaborations`` only when the
            corresponding data is available.
        """
        wanted_fields = [
            "accelerator_experiments",
            "earliest_date",
            "citation_count",
            "control_number",
            "facet_inspire_doc_type",
            "publication_info",
            "self",
            "thesaurus_terms",
            "titles",
        ]
        search = LiteratureSearch().query(
            {"match": {"authors.recid": pid.pid_value}}
        ).params(_source=wanted_fields)

        publications = []

        for hit in search.scan():
            source = hit.to_dict()

            publication = {
                'id': int(source['control_number']),
                'record': source['self'],
                'title': get_title(source),
            }

            # Earliest date, when known.
            if 'earliest_date' in source:
                publication['date'] = source['earliest_date']

            # Publication type: first document type, if any.
            doc_types = source.get('facet_inspire_doc_type', [])
            try:
                publication['type'] = doc_types[0]
            except IndexError:
                pass

            # Citation count, when known.
            if 'citation_count' in source:
                publication['citations'] = source['citation_count']

            # Journal: only reported when the first publication info entry
            # has a journal title.
            try:
                first_info = source.get('publication_info', [])[0]
                journal = {'title': first_info['journal_title']}
            except (IndexError, KeyError):
                pass
            else:
                # Journal id and $self are optional extras.
                try:
                    journal['id'] = first_info['journal_recid']
                    journal['record'] = first_info['journal_record']
                except KeyError:
                    pass
                publication['journal'] = journal

            # Collaborations, deduplicated.
            collaborations = {
                experiment.get('experiment')
                for experiment in source.get('accelerator_experiments', [])
            }
            if collaborations:
                publication['collaborations'] = list(collaborations)

            publications.append(publication)

        return json.dumps(publications)

コード例 #7

0

ファイルを表示

ファイル: citations.py プロジェクト: rikirenz/inspire-next

    def serialize(self, pid, record, links_factory=None):
        """Return a JSON list of citations for a given author recid.

        For every publication matching the author, lists the records that
        reference it, flagging self-citations and published citers.

        :param pid:
            Persistent identifier instance.

        :param record:
            Record instance.

        :param links_factory:
            Factory function for the link generation, which are added to
            the response.

        :return:
            JSON string with one ``{"citee": ..., "citers": [...]}`` object
            per publication of the author.
        """
        author_pid = pid.pid_value
        citations = {}

        search = LiteratureSearch().query({
            "match": {
                "authors.recid": author_pid
            }
        }).params(_source=[
            "authors.recid",
            "control_number",
            "self",
        ])

        # For each publication co-authored by a given author...
        for result in search.scan():
            result_source = result.to_dict()

            recid = result_source['control_number']

            # Not every signature has a recid (at least for demo records),
            # so keep only the signatures that carry one instead of failing
            # on the first one that does not.
            authors = {
                author['recid']
                for author in result_source.get('authors', [])
                if 'recid' in author
            }

            nested_search = LiteratureSearch().query({
                "match": {
                    "references.recid": recid
                }
            }).params(_source=[
                "authors.recid",
                "collections",
                "control_number",
                "earliest_date",
                "self",
            ])

            citers = []
            citations[recid] = {
                # The source record that is being cited.
                'citee': dict(
                    id=recid,
                    record=result_source['self'],
                ),
                'citers': citers,
            }

            # Check all publications, which cite the parent record.
            for nested_result in nested_search.scan():
                nested_result_source = nested_result.to_dict()

                # Same as above: ignore only the signatures without a recid,
                # so that one incomplete signature does not hide a shared
                # author.
                nested_authors = {
                    author['recid']
                    for author in nested_result_source.get('authors', [])
                    if 'recid' in author
                }

                citation = dict(
                    citer=dict(id=int(nested_result_source['control_number']),
                               record=nested_result_source['self']),
                    # If at least one author is shared, it's a self-citation.
                    self_citation=bool(authors & nested_authors),
                )

                # Get the earliest date of a citer.
                try:
                    citation['date'] = nested_result_source['earliest_date']
                except KeyError:
                    pass

                # Get status if a citer is published.
                # FIXME: As discussed with Sam, we should have a boolean flag
                #        for this type of information.
                citation['published_paper'] = any(
                    collection.get('primary') == "Published"
                    for collection in nested_result_source.get(
                        'collections', [])
                )

                citers.append(citation)

        # ``dict.values()`` is a view on Python 3, which ``json.dumps``
        # cannot serialize; materialize it into a list first.
        return json.dumps(list(citations.values()))

コード例 #8

0

ファイルを表示

ファイル: publications.py プロジェクト: jacenkow/inspire-next

def get_publications():
    """Return the publications, keywords and collaborations of an author.

    The author is selected through the ``recid`` request value (``0`` when
    absent). Records without a title or a control number are skipped.

    :return:
        Result of ``jsonify`` on a dict with the ``publications``,
        ``keywords`` and ``collaborations`` keys.
    """
    recid = request.values.get('recid', 0, type=int)

    publications = []
    collaborations = set()
    keywords = set()

    search = LiteratureSearch().query({
        "match": {
            "authors.recid": recid
        }
    }).params(_source=[
        'accelerator_experiments', 'control_number', 'earliest_date',
        'facet_inspire_doc_type', 'publication_info', 'titles',
        'thesaurus_terms'
    ])
    for result in search.scan():
        try:
            result_source = result.to_dict()
            publication = {}

            # Get publication title (required).
            publication['title'] = get_title(result_source)

            # Get publication recid (required).
            publication['recid'] = result_source['control_number']
        except (IndexError, KeyError):
            continue

        # Get publication type.
        try:
            publication['type'] = result_source.get('facet_inspire_doc_type',
                                                    [])[0]
        except IndexError:
            publication['type'] = "Not defined"

        # Get journal title.
        try:
            publication['journal_title'] = result_source.get(
                'publication_info', [])[0]['journal_title']

            # Get journal recid.
            try:
                publication['journal_recid'] = result_source.get(
                    'publication_info', [])[0]['journal_recid']
            except KeyError:
                pass
        except (IndexError, KeyError):
            pass

        # Get publication year.
        try:
            publication['year'] = result_source.get('publication_info',
                                                    [])[0]['year']
        except (IndexError, KeyError):
            pass

        # Get keywords, leaving out empty values and the automatic keywords
        # marker. The marker must be compared by value: an identity test
        # against a string literal is not guaranteed to ever match.
        for keyword in result_source.get('thesaurus_terms', []):
            value = keyword.get('keyword')
            if value and value != "* Automatic Keywords *":
                keywords.add(value)

        # Get collaborations.
        for experiment in result_source.get('accelerator_experiments', []):
            collaborations.add(experiment.get('experiment'))

        # Append to the list.
        publications.append(publication)

    response = {}
    response['publications'] = publications
    response['keywords'] = list(keywords)
    response['collaborations'] = list(collaborations)

    return jsonify(response)

コード例 #9

0

ファイルを表示

ファイル: publications.py プロジェクト: bittirousku/inspire-next

def get_publications():
    """Return the publications, keywords and collaborations of an author.

    The author is selected through the ``recid`` request value (``0`` when
    absent). Records without a title or a control number are skipped.

    :return:
        Result of ``jsonify`` on a dict with the ``publications``,
        ``keywords`` and ``collaborations`` keys.
    """
    recid = request.values.get('recid', 0, type=int)

    publications = []
    collaborations = set()
    keywords = set()

    search = LiteratureSearch().query(
        {"match": {"authors.recid": recid}}
    ).params(
        _source=[
            'accelerator_experiments',
            'control_number',
            'earliest_date',
            'facet_inspire_doc_type',
            'publication_info',
            'titles',
            'keywords'
        ]
    )
    for result in search.scan():
        try:
            result_source = result.to_dict()
            publication = {}

            # Get publication title (required).
            publication['title'] = get_title(result_source)

            # Get publication recid (required).
            publication['recid'] = result_source['control_number']
        except (IndexError, KeyError):
            continue

        # Get publication type.
        try:
            publication['type'] = result_source.get(
                'facet_inspire_doc_type', [])[0]
        except IndexError:
            publication['type'] = "Not defined"

        # Get journal title.
        try:
            publication['journal_title'] = result_source.get(
                'publication_info', [])[0]['journal_title']

            # Get journal recid.
            try:
                publication['journal_recid'] = result_source.get(
                    'publication_info', [])[0]['journal_recid']
            except KeyError:
                pass
        except (IndexError, KeyError):
            pass

        # Get publication year.
        try:
            publication['year'] = result_source.get(
                'publication_info', [])[0]['year']
        except (IndexError, KeyError):
            pass

        # Get keywords, leaving out empty values and the automatic keywords
        # marker. The marker must be compared by value: an identity test
        # against a string literal is not guaranteed to ever match.
        # NOTE(review): entries of 'keywords' are read through their
        # 'keyword' key; confirm this matches the indexed schema.
        for keyword in result_source.get('keywords', []):
            value = keyword.get('keyword')
            if value and value != "* Automatic Keywords *":
                keywords.add(value)

        # Get collaborations.
        for experiment in result_source.get(
                'accelerator_experiments', []):
            collaborations.add(experiment.get('experiment'))

        # Append to the list.
        publications.append(publication)

    response = {}
    response['publications'] = publications
    response['keywords'] = list(keywords)
    response['collaborations'] = list(collaborations)

    return jsonify(response)

コード例 #10

0

ファイルを表示

ファイル: stats.py プロジェクト: jacenkow/inspire-next

    def serialize(self, pid, record, links_factory=None):
        """Return different metrics for a given author recid.

        :param pid:
            Persistent identifier instance.

        :param record:
            Record instance.

        :param links_factory:
            Factory function for the link generation, which are added to
            the response.

        :return:
            JSON string with the ``citations``, ``publications``, ``types``,
            ``hindex`` and ``i10index`` keys, plus ``fields`` and
            ``keywords`` when any were found.
        """
        author_pid = pid.pid_value

        fields = set()
        keywords = []

        statistics = {}
        statistics['citations'] = 0
        statistics['publications'] = 0
        statistics['types'] = {}

        statistics_citations = {}

        search = LiteratureSearch().query({
            "match": {
                "authors.recid": author_pid
            }
        }).params(
            _source=[
                "citation_count",
                "control_number",
                "facet_inspire_doc_type",
                "facet_inspire_subjects",
                "thesaurus_terms",
            ]
        )

        for result in search.scan():
            result_source = result.to_dict()

            # Increment the count of the total number of publications.
            statistics['publications'] += 1

            # Increment the count of citations.
            citation_count = result_source.get('citation_count', 0)

            statistics['citations'] += citation_count
            statistics_citations[int(result_source['control_number'])] = \
                citation_count

            # Count how many times certain type of publication was published.
            # The type is reset for every record: otherwise a record without
            # any type would either fail on an unbound name (first record)
            # or be counted under the type of the previous record.
            try:
                publication_type = result_source.get(
                    'facet_inspire_doc_type', [])[0]
            except IndexError:
                publication_type = None

            if publication_type:
                types = statistics['types']
                types[publication_type] = types.get(publication_type, 0) + 1

            # Get fields.
            fields.update(result_source.get('facet_inspire_subjects', []))

            # Get keywords.
            keywords.extend([
                k for k in force_force_list(
                    get_value(result_source, 'thesaurus_terms.keyword'))
                if k != '* Automatic Keywords *'])

        # Calculate h-index together with i10-index.
        statistics['hindex'] = calculate_h_index(statistics_citations)
        statistics['i10index'] = calculate_i10_index(statistics_citations)

        if fields:
            statistics['fields'] = list(fields)

        # Return the top 25 keywords.
        if keywords:
            counter = Counter(keywords)
            statistics['keywords'] = [{
                'count': count,
                'keyword': keyword
            } for keyword, count in counter.most_common(25)]

        return json.dumps(statistics)

コード例 #11

0

ファイルを表示

    def serialize(self, pid, record, links_factory=None):
        """Return different metrics for a given author recid.

        :param pid:
            Persistent identifier instance.

        :param record:
            Record instance.

        :param links_factory:
            Factory function for the link generation, which are added to
            the response.

        :return:
            JSON string with the ``citations``, ``publications``, ``types``,
            ``hindex`` and ``i10index`` keys, plus ``fields`` and
            ``keywords`` when any were found.
        """
        author_pid = pid.pid_value

        fields = set()
        keywords = []

        statistics = {}
        statistics['citations'] = 0
        statistics['publications'] = 0
        statistics['types'] = {}

        statistics_citations = {}

        search = LiteratureSearch().query({
            "match": {
                "authors.recid": author_pid
            }
        }).params(_source=[
            "citation_count",
            "control_number",
            "facet_inspire_doc_type",
            "facet_inspire_subjects",
            "keywords",
        ])

        for result in search.scan():
            result_source = result.to_dict()

            # Increment the count of the total number of publications.
            statistics['publications'] += 1

            # Increment the count of citations.
            citation_count = result_source.get('citation_count', 0)

            statistics['citations'] += citation_count
            statistics_citations[result_source['control_number']] = \
                citation_count

            # Count how many times certain type of publication was published.
            # The type is reset for every record: otherwise a record without
            # any type would either fail on an unbound name (first record)
            # or be counted under the type of the previous record.
            try:
                publication_type = result_source.get('facet_inspire_doc_type',
                                                     [])[0]
            except IndexError:
                publication_type = None

            if publication_type:
                types = statistics['types']
                types[publication_type] = types.get(publication_type, 0) + 1

            # Get fields.
            fields.update(result_source.get('facet_inspire_subjects', []))

            # Get keywords.
            keywords.extend([
                k
                for k in force_list(get_value(result_source, 'keywords.value'))
                if k != '* Automatic Keywords *'
            ])

        # Calculate h-index together with i10-index.
        statistics['hindex'] = calculate_h_index(statistics_citations)
        statistics['i10index'] = calculate_i10_index(statistics_citations)

        if fields:
            statistics['fields'] = list(fields)

        # Return the top 25 keywords.
        if keywords:
            counter = Counter(keywords)
            statistics['keywords'] = [{
                'count': count,
                'keyword': keyword
            } for keyword, count in counter.most_common(25)]

        return json.dumps(statistics)

コード例 #12

0

ファイルを表示

ファイル: citations.py プロジェクト: david-caro/inspire-next

    def serialize(self, pid, record, links_factory=None):
        """Return a JSON list of citations for a given author recid.

        For every publication matching the author, lists the records that
        reference it, flagging self-citations and published citers.

        :param pid:
            Persistent identifier instance.

        :param record:
            Record instance.

        :param links_factory:
            Factory function for the link generation, which are added to
            the response.

        :return:
            JSON string with one ``{"citee": ..., "citers": [...]}`` object
            per publication of the author.
        """
        author_pid = pid.pid_value
        citations = {}

        search = LiteratureSearch().query({
            "match": {
                "authors.recid": author_pid
            }
        }).params(
            _source=[
                "authors.recid",
                "control_number",
                "self",
            ]
        )

        # For each publication co-authored by a given author...
        for result in search.scan():
            result_source = result.to_dict()

            recid = result_source['control_number']

            # Not every signature has a recid (at least for demo records),
            # so keep only the signatures that carry one instead of failing
            # on the first one that does not.
            authors = {
                author['recid']
                for author in result_source.get('authors', [])
                if 'recid' in author
            }

            nested_search = LiteratureSearch().query({
                "match": {
                    "references.recid": recid
                }
            }).params(
                _source=[
                    "authors.recid",
                    "collections",
                    "control_number",
                    "earliest_date",
                    "self",
                ]
            )

            citers = []
            citations[recid] = {
                # The source record that is being cited.
                'citee': dict(
                    id=recid,
                    record=result_source['self'],
                ),
                'citers': citers,
            }

            # Check all publications, which cite the parent record.
            for nested_result in nested_search.scan():
                nested_result_source = nested_result.to_dict()

                # Same as above: ignore only the signatures without a recid,
                # so that one incomplete signature does not hide a shared
                # author.
                nested_authors = {
                    author['recid']
                    for author in nested_result_source.get('authors', [])
                    if 'recid' in author
                }

                citation = dict(
                    citer=dict(
                        id=int(nested_result_source['control_number']),
                        record=nested_result_source['self']
                    ),
                    # If at least one author is shared, it's a self-citation.
                    self_citation=bool(authors & nested_authors),
                )

                # Get the earliest date of a citer.
                try:
                    citation['date'] = nested_result_source['earliest_date']
                except KeyError:
                    pass

                # Get status if a citer is published.
                # FIXME: As discussed with Sam, we should have a boolean flag
                #        for this type of information.
                citation['published_paper'] = any(
                    collection.get('primary') == "Published"
                    for collection in nested_result_source.get(
                        'collections', [])
                )

                citers.append(citation)

        # ``dict.values()`` is a view on Python 3, which ``json.dumps``
        # cannot serialize; materialize it into a list first.
        return json.dumps(list(citations.values()))

コード例 #13

0

ファイルを表示

ファイル: publications.py プロジェクト: bittirousku/inspire-next

    def serialize(self, pid, record, links_factory=None):
        """Dump the publications of the given author as a JSON list.

        :param pid:
            Persistent identifier instance; its ``pid_value`` is matched
            against ``authors.recid``.

        :param record:
            Record instance (not read by this method).

        :param links_factory:
            Factory function for the link generation (not read by this
            method).

        :return:
            JSON string with one object per matching publication. ``id``,
            ``record`` and ``title`` are always set; ``date``, ``type``,
            ``citations``, ``journal`` and ``collaborations`` are set only
            when the underlying data is available.
        """
        author_query = {"match": {"authors.recid": pid.pid_value}}
        search = LiteratureSearch().query(author_query).params(_source=[
            "accelerator_experiments",
            "earliest_date",
            "citation_count",
            "control_number",
            "facet_inspire_doc_type",
            "publication_info",
            "self",
            "keywords",
            "titles",
        ])

        serialized = []

        for found in search.scan():
            data = found.to_dict()

            item = {
                'id': int(data['control_number']),
                'record': data['self'],
                'title': get_title(data),
            }

            # Optional scalar values are copied over only when present.
            if 'earliest_date' in data:
                item['date'] = data['earliest_date']

            kinds = data.get('facet_inspire_doc_type', [])
            try:
                item['type'] = kinds[0]
            except IndexError:
                pass

            if 'citation_count' in data:
                item['citations'] = data['citation_count']

            # The journal is described by the first publication info entry
            # and is dropped altogether when that entry has no title.
            try:
                info = data.get('publication_info', [])[0]
                journal = {'title': info['journal_title']}
            except (IndexError, KeyError):
                journal = None

            if journal is not None:
                # Journal id and $self come as a pair, in this order.
                try:
                    journal['id'] = info['journal_recid']
                    journal['record'] = info['journal_record']
                except KeyError:
                    pass
                item['journal'] = journal

            # Distinct experiments of the publication.
            experiments = set()
            for entry in data.get('accelerator_experiments', []):
                experiments.add(entry.get('experiment'))
            if experiments:
                item['collaborations'] = list(experiments)

            serialized.append(item)

        return json.dumps(serialized)

コード例 #14

0

ファイルを表示

    def serialize(self, pid, record, links_factory=None):
        """Serialize the publications of an author into a JSON list.

        The publications are looked up with a ``nested`` query on the
        ``authors`` path, matching ``authors.recid`` against the pid value.

        :param pid:
            Persistent identifier instance.

        :param record:
            Record instance (not read by this method).

        :param links_factory:
            Factory function for the link generation (not read by this
            method).

        :return:
            JSON string with one object per matching publication. ``id``,
            ``record`` and ``title`` are always set; ``date``, ``type``,
            ``citations``, ``journal`` and ``collaborations`` are set only
            when the underlying data is available.
        """
        author_match = Q('match', authors__recid=pid.pid_value)
        search = LiteratureSearch().query(
            'nested', path='authors', query=author_match)
        search = search.params(_source=[
            'accelerator_experiments',
            'citation_count',
            'control_number',
            'earliest_date',
            'facet_inspire_doc_type',
            'keywords',
            'publication_info',
            'self',
            'titles',
        ])

        publications = []

        for hit in search.scan():
            source = hit.to_dict()

            publication = {
                'id': int(source['control_number']),
                'record': source['self'],
                'title': LiteratureReader(source).title,
            }

            # Earliest date, when known.
            if 'earliest_date' in source:
                publication['date'] = source['earliest_date']

            # Publication type: first document type, if any.
            doc_types = source.get('facet_inspire_doc_type', [])
            try:
                publication['type'] = doc_types[0]
            except IndexError:
                pass

            # Citation count, when known.
            if 'citation_count' in source:
                publication['citations'] = source['citation_count']

            # Journal: only reported when the first publication info entry
            # has a journal title.
            try:
                first_info = source.get('publication_info', [])[0]
                journal = {'title': first_info['journal_title']}
            except (IndexError, KeyError):
                pass
            else:
                # Journal id and $self are optional extras.
                try:
                    journal['id'] = first_info['journal_recid']
                    journal['record'] = first_info['journal_record']
                except KeyError:
                    pass
                publication['journal'] = journal

            # Collaborations, deduplicated.
            collaborations = {
                experiment.get('experiment')
                for experiment in source.get('accelerator_experiments', [])
            }
            if collaborations:
                publication['collaborations'] = list(collaborations)

            publications.append(publication)

        return json.dumps(publications)