Example #1
def get_timestamps(db_record, out):
    # NOTE: `fmap` is assumed to be a module-level dict mapping output field names
    # to database column names; it is not defined in this snippet.
    out = {}  # the passed-in `out` is discarded and rebuilt here
    last_update = None
    for k, v in fmap.items():
        if v in db_record and db_record[v]:
            t = db_record[v]
            out[k] = date2solrstamp(t)
            if last_update is None or t > last_update:
                last_update = t
    if last_update:
        out['update_timestamp'] = date2solrstamp(last_update)
    return out
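Since `fmap` is not shown, here is a hypothetical usage sketch; the field names and values below are made up, and the expected output assumes `date2solrstamp` produces microsecond-precision 'Z' timestamps as in the tests of Example #3:

fmap = {'metadata_mtime': 'meta_data_updated',
        'nonbib_mtime': 'nonbib_data_updated'}

db_record = {'meta_data_updated': get_date('2021-05-01T12:00:00Z'),
             'nonbib_data_updated': None}

timestamps = get_timestamps(db_record, {})
# -> {'metadata_mtime': '2021-05-01T12:00:00.000000Z',
#     'update_timestamp': '2021-05-01T12:00:00.000000Z'}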
Example #2
File: app.py Project: adsabs/oboi
def consume(self, date, key, value):
    json_body = [{
        "measurement": "reads",
        "tags": {
            "host": "arxiv",
            "paper": key
        },
        "time": date2solrstamp(date),
        "fields": {
            "Int_value": 1,
            "String_value": value
        }
    }]
    self.client.write_points(json_body)
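`self.client` is presumably an `influxdb.InfluxDBClient`; a minimal, hypothetical setup showing the kind of write the method performs (connection settings and sample values are assumptions, not taken from adsabs/oboi):

from influxdb import InfluxDBClient  # pip install influxdb

# Hypothetical connection parameters.
client = InfluxDBClient(host='localhost', port=8086, database='arxiv_reads')
point = [{
    "measurement": "reads",
    "tags": {"host": "arxiv", "paper": "2105.01234"},
    "time": "2021-05-01T12:00:00.000000Z",
    "fields": {"Int_value": 1, "String_value": "fulltext"},
}]
client.write_points(point)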
Example #3
    def test_get_date(self):
        """Check we always work with UTC dates"""

        d = adsputils.get_date()
        self.assertTrue(d.tzname() == 'UTC')

        d1 = adsputils.get_date('2009-09-04T01:56:35.450686Z')
        self.assertTrue(d1.tzname() == 'UTC')
        self.assertEqual(d1.isoformat(), '2009-09-04T01:56:35.450686+00:00')
        self.assertEqual(adsputils.date2solrstamp(d1),
                         '2009-09-04T01:56:35.450686Z')

        d2 = adsputils.get_date('2009-09-03T20:56:35.450686-05:00')
        self.assertTrue(d2.tzname() == 'UTC')
        self.assertEqual(d2.isoformat(), '2009-09-04T01:56:35.450686+00:00')
        self.assertEqual(adsputils.date2solrstamp(d2),
                         '2009-09-04T01:56:35.450686Z')

        d3 = adsputils.get_date('2009-09-03T20:56:35.450686')
        self.assertTrue(d3.tzname() == 'UTC')
        self.assertEqual(d3.isoformat(), '2009-09-03T20:56:35.450686+00:00')
        self.assertEqual(adsputils.date2solrstamp(d3),
                         '2009-09-03T20:56:35.450686Z')
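These assertions pin down the expected output format; a minimal sketch of a `date2solrstamp` compatible with them (the actual adsputils implementation may differ):

def date2solrstamp(date):
    # Microsecond-precision UTC timestamp with a literal 'Z' suffix,
    # e.g. '2009-09-04T01:56:35.450686Z', as asserted in the tests above.
    return date.strftime('%Y-%m-%dT%H:%M:%S.%f') + 'Z'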
Example #4
def _entry_date(ADS_record):
    # Use the record's entry_date when present; otherwise fall back to the current time.
    d = ADS_record.get('entry_date', None)
    return {'entry_date': date2solrstamp(get_date(d) if d else get_date())}
Example #5
def build_record(app, citation_change, parsed_metadata, citations):
    if citation_change.content_type != CitationChangeContentType.doi:
        raise Exception("Only DOI records can be forwarded to master")
    # Extract required values
    bibcode = parsed_metadata.get('bibcode')
    if bibcode is None:
        raise Exception(
            "Only records with a bibcode can be forwarded to master")
    alternate_bibcode = parsed_metadata.get('alternate_bibcode', [])
    abstract = parsed_metadata.get('abstract', u"")
    title = parsed_metadata.get('title', u"")
    keywords = parsed_metadata.get('keywords', [])
    authors = parsed_metadata.get('authors', [])
    normalized_authors = parsed_metadata.get('normalized_authors', [])
    affiliations = parsed_metadata.get('affiliations', [u'-'] * len(authors))
    pubdate = parsed_metadata.get('pubdate', get_date().strftime("%Y-%m-%d"))
    source = parsed_metadata.get('source', u"Unknown")
    version = parsed_metadata.get('version', u"")
    doctype = parsed_metadata.get('doctype', u"software")
    # Clean abstract and title
    abstract = u''.join(BeautifulSoup(abstract, features="lxml").findAll(text=True))
    abstract = abstract.replace('\n', ' ').replace('\r', '')
    title = u''.join(BeautifulSoup(title, features="lxml").findAll(text=True))
    title = title.replace('\n', ' ').replace('\r', '')
    # Extract year
    year = pubdate.split("-")[0]
    # Build an author_facet_hier list with the following structure:
    #   "0/Blanco-Cuaresma, S",
    #   "1/Blanco-Cuaresma, S/Blanco-Cuaresma, S",
    #   "0/Soubiran, C",
    #   "1/Soubiran, C/Soubiran, C",
    author_facet_hier = list(itertools.chain.from_iterable(
        zip(["0/" + a for a in normalized_authors],
            ["1/" + a[0] + "/" + a[1] for a in zip(normalized_authors, authors)])))

    # Count
    n_keywords = len(keywords)
    n_authors = len(authors)
    n_citations = len(citations)
    doi = citation_change.content
    record_dict = {
        'abstract': abstract,
        'ack': u'',
        'aff': [u"-" if aff == "" else aff for aff in affiliations],
        'alternate_bibcode': alternate_bibcode,
        'alternate_title': [],
        'arxiv_class': [],
        'author': authors,
        'author_count': n_authors,
        'author_facet': normalized_authors,
        'author_facet_hier': author_facet_hier,
        'author_norm': normalized_authors,
        'bibcode': bibcode,
        'bibstem': [u'zndo'],
        'bibstem_facet': u'zndo',
        'copyright': [],
        'comment': [],
        'database': [u'general', u'astronomy'],
        'entry_date': date2solrstamp(citation_change.timestamp.ToDatetime()),  # date2solrstamp(get_date()),
        'year': year,
        # TODO: Why does this date have to be 30 minutes ahead? This is based on the ADSImportPipeline SolrAdapter
        'date': (citation_change.timestamp.ToDatetime() +
                 datetime.timedelta(minutes=30)).strftime('%Y-%m-%dT%H:%M:%S.%fZ'),
        'doctype': doctype,
        'doctype_facet_hier': [u"0/Non-Article", u"1/Non-Article/Software"],
        'doi': [doi],
        'eid': doi,
        'email': [u'-'] * n_authors,
        'first_author': authors[0] if n_authors > 0 else u'',
        'first_author_facet_hier': author_facet_hier[:2],
        'first_author_norm': normalized_authors[0] if n_authors > 0 else u'',
        # TODO: How is links_data different from nonbib?
        'links_data': [u'{{"access": "", "instances": "", "title": "", "type": "electr", "url": "{}"}}'.format(app.conf['DOI_URL'] + doi)],
        'identifier': [bibcode, doi] + alternate_bibcode,
        'esources': [u"PUB_HTML"],
        'citation': citations,
        'citation_count': n_citations,
        'citation_count_norm': n_citations / n_authors if n_authors > 0 else 0,
        'data_count': 1,  # Number of elements in `links_data`
        'keyword': keywords,
        'keyword_facet': keywords,
        'keyword_norm': [u"-"] * n_keywords,
        'keyword_schema': [u"-"] * n_keywords,
        'property': [u"ESOURCE", u"NONARTICLE", u"NOT REFEREED", u"PUB_OPENACCESS", u"OPENACCESS"],
        'pub': source,
        'pub_raw': source,
        'pubdate': pubdate,
        'pubnote': [],
        'read_count': 0,
        'title': [title],
        'publisher': source,
        'version': version
    }
    # Status
    if citation_change.status == Status.new:
        status = 2
    elif citation_change.status == Status.updated:
        status = 3
    elif citation_change.status == Status.deleted:
        status = 1
        # Only set this field for deletions; otherwise Solr will complain that the field
        # does not exist. If the key is absent from the dict/protobuf, the message will be
        # treated as new/updated by MasterPipeline.
        record_dict['status'] = status
    else:
        status = 0  # active
    record = DenormalizedRecord(**record_dict)
    nonbib_record = _build_nonbib_record(app, citation_change, record, status)
    return record, nonbib_record
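The `author_facet_hier` interleaving used above can be checked in isolation; a small sketch with made-up author names:

import itertools

authors = ['Blanco-Cuaresma, Sergi', 'Soubiran, Caroline']
normalized_authors = ['Blanco-Cuaresma, S', 'Soubiran, C']
author_facet_hier = list(itertools.chain.from_iterable(
    zip(['0/' + a for a in normalized_authors],
        ['1/' + a[0] + '/' + a[1] for a in zip(normalized_authors, authors)])))
# ['0/Blanco-Cuaresma, S', '1/Blanco-Cuaresma, S/Blanco-Cuaresma, Sergi',
#  '0/Soubiran, C', '1/Soubiran, C/Soubiran, Caroline']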
Example #6
def build_record(app,
                 citation_change,
                 parsed_metadata,
                 citations,
                 db_versions,
                 entry_date=None):
    if citation_change.content_type != CitationChangeContentType.doi:
        raise Exception("Only DOI records can be forwarded to master")
    # Extract required values
    bibcode = parsed_metadata.get('bibcode')
    if bibcode is None:
        raise Exception(
            "Only records with a bibcode can be forwarded to master")
    if entry_date is None:
        entry_date = citation_change.timestamp.ToDatetime()
    # Check if the DOI points to a concept record or to a specific version
    if parsed_metadata.get('version_of', None) not in (None, "", [], ''):
        is_release = True
    else:
        is_release = False
    alternate_bibcode = parsed_metadata.get('alternate_bibcode', [])
    abstract = parsed_metadata.get('abstract', "")
    title = parsed_metadata.get('title', "")
    keywords = parsed_metadata.get('keywords', [])
    authors = parsed_metadata.get('authors', [])
    normalized_authors = parsed_metadata.get('normalized_authors', [])
    affiliations = parsed_metadata.get('affiliations', ['-'] * len(authors))
    pubdate = parsed_metadata.get('pubdate', get_date().strftime("%Y-%m-%d"))
    try:
        solr_date = (datetime.datetime.strptime(pubdate, "%Y-%m-%d") +
                     datetime.timedelta(minutes=30)).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
    except ValueError:
        try:
            # If only a year is specified, assume January 1st of that year.
            logger.warn("Publication date does not conform to Y-m-d format. "
                        "Assuming only the year is specified.")
            pubdate = pubdate + "-01-01"
            solr_date = (datetime.datetime.strptime(pubdate, "%Y-%m-%d") +
                         datetime.timedelta(minutes=30)).strftime('%Y-%m-%dT%H:%M:%S.%fZ')
        except ValueError:
            # If that also fails, fall back to the entry date; running maintenance_metadata
            # could fix a bad publication date later if it is updated upstream.
            logger.warn("Cannot parse publication date. Setting to current datetime.")
            solr_date = date2solrstamp(entry_date)

    source = parsed_metadata.get('source', "Unknown")
    version = parsed_metadata.get('version', "")
    doctype = parsed_metadata.get('doctype', "software")
    # Clean abstract and title
    abstract = ''.join(BeautifulSoup(abstract, features="lxml").findAll(text=True))
    abstract = abstract.replace('\n', ' ').replace('\r', '')
    title = ''.join(BeautifulSoup(title, features="lxml").findAll(text=True))
    title = title.replace('\n', ' ').replace('\r', '')
    # Extract year
    year = pubdate.split("-")[0]
    # Build an author_facet_hier list with the following structure:
    #   "0/Blanco-Cuaresma, S",
    #   "1/Blanco-Cuaresma, S/Blanco-Cuaresma, S",
    #   "0/Soubiran, C",
    #   "1/Soubiran, C/Soubiran, C",
    author_facet_hier = list(itertools.chain.from_iterable(
        zip(["0/" + a for a in normalized_authors],
            ["1/" + a[0] + "/" + a[1] for a in zip(normalized_authors, authors)])))

    # Count
    n_keywords = len(keywords)
    n_authors = len(authors)
    n_citations = len(citations)
    doi = citation_change.content
    record_dict = {
        'abstract': abstract,
        'ack': '',
        'aff': ["-" if aff == "" else aff for aff in affiliations],
        'alternate_bibcode': alternate_bibcode,
        'alternate_title': [],
        'arxiv_class': [],
        'author': authors,
        'author_count': n_authors,
        'author_facet': normalized_authors,
        'author_facet_hier': author_facet_hier,
        'author_norm': normalized_authors,
        'bibcode': bibcode,
        'bibstem': ['zndo'],
        'bibstem_facet': 'zndo',
        'copyright': [],
        'comment': [],
        'database': ['general', 'astronomy'],
        'entry_date': date2solrstamp(entry_date),  # date2solrstamp(get_date()),
        'year': year,
        # TODO: Why does this date have to be 30 minutes ahead? This is based on the ADSImportPipeline SolrAdapter
        'date': solr_date,
        'doctype': doctype,
        'doctype_facet_hier': ["0/Non-Article", "1/Non-Article/Software"],
        'doi': [doi],
        'eid': doi,
        'email': ['-'] * n_authors,
        'first_author': authors[0] if n_authors > 0 else '',
        'first_author_facet_hier': author_facet_hier[:2],
        'first_author_norm': normalized_authors[0] if n_authors > 0 else '',
        # TODO: How is links_data different from nonbib?
        'links_data': ['{{"access": "", "instances": "", "title": "", "type": "electr", "url": "{}"}}'.format(app.conf['DOI_URL'] + doi)],
        'identifier': [bibcode, doi] + alternate_bibcode,
        'esources': ["PUB_HTML"],
        'citation': citations,
        'citation_count': n_citations,
        'citation_count_norm': n_citations / n_authors if n_authors > 0 else 0,
        'data_count': 1,  # Number of elements in `links_data`
        'keyword': keywords,
        'keyword_facet': keywords,
        'keyword_norm': ["-"] * n_keywords,
        'keyword_schema': ["-"] * n_keywords,
        'property': ["ESOURCE", "NONARTICLE", "NOT REFEREED", "PUB_OPENACCESS", "OPENACCESS"],
        'pub': source,
        'pub_raw': source,
        'pubdate': pubdate,
        'pubnote': [],
        'read_count': 0,
        'title': [title],
        'publisher': source,
        'version': version
    }
    if version is None:  # Concept DOIs may not contain version
        del record_dict['version']
    # Status
    if citation_change.status == Status.new:
        status = 2
    elif citation_change.status == Status.updated:
        status = 3
    elif citation_change.status == Status.deleted:
        status = 1
        # Only set this field for deletions; otherwise Solr will complain that the field
        # does not exist. If the key is absent from the dict/protobuf, the message will be
        # treated as new/updated by MasterPipeline.
        record_dict['status'] = status
    else:
        status = 0  # active
    if db_versions not in [{"": ""}, {}, None]:
        record_dict['property'].append('ASSOCIATED')
    if is_release:
        record_dict['property'].append('RELEASE')

    record = DenormalizedRecord(**record_dict)
    nonbib_record = _build_nonbib_record(app, citation_change, record,
                                         db_versions, status)
    return record, nonbib_record