def get_timestamps(db_record, out):
    """Collect Solr-formatted timestamps for the fields listed in ``fmap``.

    For every ``fmap`` item whose value names a truthy entry of
    ``db_record``, the entry is formatted with ``date2solrstamp`` and stored
    under the corresponding ``fmap`` key. The greatest of those raw values
    is additionally stored, formatted, under ``'update_timestamp'``.

    :param db_record: record supporting ``in`` and item access
    :param out: not read; the name is rebound to a fresh dictionary, so the
        object passed by the caller is never modified
    :return: a new dictionary with the formatted timestamps
    """
    out = {}
    newest = None
    for target_key, record_key in fmap.items():
        # Skip fields that are absent from the record or hold an empty value.
        if record_key not in db_record or not db_record[record_key]:
            continue
        stamp = db_record[record_key]
        out[target_key] = date2solrstamp(stamp)
        if newest is None or stamp > newest:
            newest = stamp
    if newest:
        out['update_timestamp'] = date2solrstamp(newest)
    return out
def consume(self, date, key, value):
    """Write one ``reads`` point through ``self.client.write_points``.

    :param date: value formatted with ``date2solrstamp`` and used as the
        point's ``time``
    :param key: stored in the ``paper`` tag
    :param value: stored in the ``String_value`` field
    """
    point = {
        "measurement": "reads",
        "tags": {
            "host": "arxiv",
            "paper": key,
        },
        "time": date2solrstamp(date),
        "fields": {
            "Int_value": 1,
            "String_value": value,
        },
    }
    # write_points receives a list, here holding a single point.
    self.client.write_points([point])
def test_get_date(self):
    """Check we always work with UTC dates

    Covers ``get_date()`` with no argument, plus three string inputs:
    an explicit ``Z`` suffix, an explicit ``-05:00`` offset, and no
    timezone information at all.
    """
    now = adsputils.get_date()
    self.assertTrue(now.tzname() == 'UTC')

    # (input string, expected isoformat(), expected date2solrstamp())
    cases = (
        ('2009-09-04T01:56:35.450686Z',
         '2009-09-04T01:56:35.450686+00:00',
         '2009-09-04T01:56:35.450686Z'),
        ('2009-09-03T20:56:35.450686-05:00',
         '2009-09-04T01:56:35.450686+00:00',
         '2009-09-04T01:56:35.450686Z'),
        ('2009-09-03T20:56:35.450686',
         '2009-09-03T20:56:35.450686+00:00',
         '2009-09-03T20:56:35.450686Z'),
    )
    for raw, expected_iso, expected_solr in cases:
        parsed = adsputils.get_date(raw)
        self.assertTrue(parsed.tzname() == 'UTC')
        self.assertEqual(parsed.isoformat(), expected_iso)
        self.assertEqual(adsputils.date2solrstamp(parsed), expected_solr)
def _entry_date(ADS_record):
    """Return ``{'entry_date': <solr timestamp>}`` for a record.

    The record's ``'entry_date'`` value is passed through ``get_date`` when
    it is truthy; if it is missing/falsy (or the parsed result is falsy),
    ``get_date()`` with no argument is used instead.

    :param ADS_record: object exposing ``.get(key, default)``
    :return: single-key dict with the ``date2solrstamp``-formatted date
    """
    raw = ADS_record.get('entry_date', None)
    parsed = get_date(raw) if raw else None
    if not parsed:
        parsed = get_date()
    return {'entry_date': date2solrstamp(parsed)}
def build_record(app, citation_change, parsed_metadata, citations):
    """Build the records forwarded to master for a DOI citation target.

    :param app: application object; only ``app.conf['DOI_URL']`` is read here
        (it is also handed to ``_build_nonbib_record``)
    :param citation_change: message providing ``content_type``, ``content``
        (used as the DOI), ``timestamp`` and ``status``
    :param parsed_metadata: dict of parsed metadata; every key except
        ``'bibcode'`` falls back to a default when absent
    :param citations: list of citations; its length is the citation count
    :return: tuple ``(record, nonbib_record)`` where ``record`` is a
        ``DenormalizedRecord`` and ``nonbib_record`` is whatever
        ``_build_nonbib_record`` returns
    :raises Exception: if the content type is not DOI or no bibcode is
        present in ``parsed_metadata``
    """
    if citation_change.content_type != CitationChangeContentType.doi:
        raise Exception("Only DOI records can be forwarded to master")

    # Extract required values
    bibcode = parsed_metadata.get('bibcode')
    if bibcode is None:
        raise Exception(
            "Only records with a bibcode can be forwarded to master")
    alternate_bibcode = parsed_metadata.get('alternate_bibcode', [])
    abstract = parsed_metadata.get('abstract', u"")
    title = parsed_metadata.get('title', u"")
    keywords = parsed_metadata.get('keywords', [])
    authors = parsed_metadata.get('authors', [])
    normalized_authors = parsed_metadata.get('normalized_authors', [])
    affiliations = parsed_metadata.get('affiliations', [u'-'] * len(authors))
    pubdate = parsed_metadata.get('pubdate', get_date().strftime("%Y-%m-%d"))
    source = parsed_metadata.get('source', u"Unknown")
    version = parsed_metadata.get('version', u"")
    doctype = parsed_metadata.get('doctype', u"software")

    # Clean abstract and title: keep only text nodes and flatten line breaks
    abstract = u''.join(
        BeautifulSoup(abstract, features="lxml").findAll(text=True)).replace(
            '\n', ' ').replace('\r', '')
    title = u''.join(
        BeautifulSoup(title, features="lxml").findAll(text=True)).replace(
            '\n', ' ').replace('\r', '')

    # Extract year
    year = pubdate.split("-")[0]

    # Build an author_facet_hier list with the following structure:
    #   "0/Blanco-Cuaresma, S",
    #   "1/Blanco-Cuaresma, S/Blanco-Cuaresma, S",
    #   "0/Soubiran, C",
    #   "1/Soubiran, C/Soubiran, C",
    author_facet_hier = list(
        itertools.chain.from_iterable(
            zip(["0/" + a for a in normalized_authors], [
                "1/" + a[0] + "/" + a[1]
                for a in zip(normalized_authors, authors)
            ])))

    # Count
    n_keywords = len(keywords)
    n_authors = len(authors)
    n_citations = len(citations)

    doi = citation_change.content
    change_datetime = citation_change.timestamp.ToDatetime()

    record_dict = {
        'abstract': abstract,
        'ack': u'',
        'aff': [u"-" if aff == "" else aff for aff in affiliations],
        'alternate_bibcode': alternate_bibcode,
        'alternate_title': [],
        'arxiv_class': [],
        'author': authors,
        'author_count': n_authors,
        'author_facet': normalized_authors,
        'author_facet_hier': author_facet_hier,
        'author_norm': normalized_authors,
        'bibcode': bibcode,
        'bibstem': [u'zndo'],
        'bibstem_facet': u'zndo',
        'copyright': [],
        'comment': [],
        'database': [u'general', u'astronomy'],
        'entry_date': date2solrstamp(change_datetime),
        'year': year,
        # TODO: Why this date has to be 30 minutes in advance? This is based
        # on ADSImportPipeline SolrAdapter
        'date': (change_datetime + datetime.timedelta(minutes=30)).strftime(
            '%Y-%m-%dT%H:%M:%S.%fZ'),
        'doctype': doctype,
        'doctype_facet_hier': [u"0/Non-Article", u"1/Non-Article/Software"],
        'doi': [doi],
        'eid': doi,
        'email': [u'-'] * n_authors,
        'first_author': authors[0] if n_authors > 0 else u'',
        'first_author_facet_hier': author_facet_hier[:2],
        # Guard on normalized_authors as well: it can be empty even when
        # authors is not, which previously raised IndexError.
        'first_author_norm': normalized_authors[0]
        if n_authors > 0 and normalized_authors else u'',
        'links_data': [
            u'{{"access": "", "instances": "", "title": "", "type": "electr", "url": "{}"}}'
            .format(app.conf['DOI_URL'] + doi)
        ],  # TODO: How is it different from nonbib?
        'identifier': [bibcode, doi] + alternate_bibcode,
        'esources': [u"PUB_HTML"],
        'citation': citations,
        'citation_count': n_citations,
        'citation_count_norm': n_citations / n_authors if n_authors > 0 else 0,
        'data_count': 1,  # Number of elements in `links_data`
        'keyword': keywords,
        'keyword_facet': keywords,
        'keyword_norm': [u"-"] * n_keywords,
        'keyword_schema': [u"-"] * n_keywords,
        'property': [
            u"ESOURCE", u"NONARTICLE", u"NOT REFEREED", u"PUB_OPENACCESS",
            u"OPENACCESS"
        ],
        'pub': source,
        'pub_raw': source,
        'pubdate': pubdate,
        'pubnote': [],
        'read_count': 0,
        'title': [title],
        'publisher': source,
        'version': version
    }

    # Status
    if citation_change.status == Status.new:
        status = 2
    elif citation_change.status == Status.updated:
        status = 3
    elif citation_change.status == Status.deleted:
        status = 1
        # Only use this field for deletions, otherwise Solr will complain the
        # field does not exist and if this key does not exist in the
        # dict/protobuf, the message will be treated as new/update by
        # MasterPipeline
        record_dict['status'] = status
    else:
        status = 0  # active

    record = DenormalizedRecord(**record_dict)
    nonbib_record = _build_nonbib_record(app, citation_change, record, status)
    return record, nonbib_record
def build_record(app, citation_change, parsed_metadata, citations,
                 db_versions, entry_date=None):
    """Build the records forwarded to master for a DOI citation target.

    :param app: application object; only ``app.conf['DOI_URL']`` is read here
        (it is also handed to ``_build_nonbib_record``)
    :param citation_change: message providing ``content_type``, ``content``
        (used as the DOI), ``timestamp`` and ``status``
    :param parsed_metadata: dict of parsed metadata; every key except
        ``'bibcode'`` falls back to a default when absent
    :param citations: list of citations; its length is the citation count
    :param db_versions: associated-versions data passed on to
        ``_build_nonbib_record``; any value other than ``{"": ""}``, ``{}``
        or ``None`` adds the ``ASSOCIATED`` property
    :param entry_date: datetime used for ``entry_date``; defaults to
        ``citation_change.timestamp.ToDatetime()``
    :return: tuple ``(record, nonbib_record)`` where ``record`` is a
        ``DenormalizedRecord`` and ``nonbib_record`` is whatever
        ``_build_nonbib_record`` returns
    :raises Exception: if the content type is not DOI or no bibcode is
        present in ``parsed_metadata``
    """
    if citation_change.content_type != CitationChangeContentType.doi:
        raise Exception("Only DOI records can be forwarded to master")

    # Extract required values
    bibcode = parsed_metadata.get('bibcode')
    if bibcode is None:
        raise Exception(
            "Only records with a bibcode can be forwarded to master")
    if entry_date is None:
        entry_date = citation_change.timestamp.ToDatetime()

    # Check if doi points to a concept record or to a specific version
    is_release = parsed_metadata.get('version_of', None) not in (None, "", [])

    alternate_bibcode = parsed_metadata.get('alternate_bibcode', [])
    abstract = parsed_metadata.get('abstract', "")
    title = parsed_metadata.get('title', "")
    keywords = parsed_metadata.get('keywords', [])
    authors = parsed_metadata.get('authors', [])
    normalized_authors = parsed_metadata.get('normalized_authors', [])
    affiliations = parsed_metadata.get('affiliations', ['-'] * len(authors))
    pubdate = parsed_metadata.get('pubdate', get_date().strftime("%Y-%m-%d"))

    # The Solr date is the publication date shifted 30 minutes forward.
    solr_format = '%Y-%m-%dT%H:%M:%S.%fZ'
    shift = datetime.timedelta(minutes=30)
    try:
        solr_date = (datetime.datetime.strptime(pubdate, "%Y-%m-%d") +
                     shift).strftime(solr_format)
    except ValueError:
        # In the event only a year is specified, the date is assumed to be
        # January 1st of the given year.
        logger.warning(
            "Publication date does not conform to Y-m-d format. Assuming only year is specified."
        )
        year_only_pubdate = pubdate + "-01" + "-01"
        try:
            solr_date = (
                datetime.datetime.strptime(year_only_pubdate, "%Y-%m-%d") +
                shift).strftime(solr_format)
        except ValueError:
            # If above fails, fall back to the entry date. Running
            # maintenance_metadata could fix the bad publication date in the
            # future if it is updated upstream. The upstream pubdate is left
            # untouched instead of being emitted with a bogus "-01-01" suffix.
            logger.warning(
                "Cannot parse publication date. Setting to current datetime.")
            solr_date = date2solrstamp(entry_date)
        else:
            # Only adopt the expanded date once it is known to be valid.
            pubdate = year_only_pubdate

    source = parsed_metadata.get('source', "Unknown")
    version = parsed_metadata.get('version', "")
    doctype = parsed_metadata.get('doctype', "software")

    # Clean abstract and title: keep only text nodes and flatten line breaks
    abstract = ''.join(
        BeautifulSoup(abstract, features="lxml").findAll(text=True)).replace(
            '\n', ' ').replace('\r', '')
    title = ''.join(
        BeautifulSoup(title, features="lxml").findAll(text=True)).replace(
            '\n', ' ').replace('\r', '')

    # Extract year
    year = pubdate.split("-")[0]

    # Build an author_facet_hier list with the following structure:
    #   "0/Blanco-Cuaresma, S",
    #   "1/Blanco-Cuaresma, S/Blanco-Cuaresma, S",
    #   "0/Soubiran, C",
    #   "1/Soubiran, C/Soubiran, C",
    author_facet_hier = list(
        itertools.chain.from_iterable(
            zip(["0/" + a for a in normalized_authors], [
                "1/" + a[0] + "/" + a[1]
                for a in zip(normalized_authors, authors)
            ])))

    # Count
    n_keywords = len(keywords)
    n_authors = len(authors)
    n_citations = len(citations)

    doi = citation_change.content

    record_dict = {
        'abstract': abstract,
        'ack': '',
        'aff': ["-" if aff == "" else aff for aff in affiliations],
        'alternate_bibcode': alternate_bibcode,
        'alternate_title': [],
        'arxiv_class': [],
        'author': authors,
        'author_count': n_authors,
        'author_facet': normalized_authors,
        'author_facet_hier': author_facet_hier,
        'author_norm': normalized_authors,
        'bibcode': bibcode,
        'bibstem': ['zndo'],
        'bibstem_facet': 'zndo',
        'copyright': [],
        'comment': [],
        'database': ['general', 'astronomy'],
        'entry_date': date2solrstamp(entry_date),
        'year': year,
        # TODO: Why this date has to be 30 minutes in advance? This is based
        # on ADSImportPipeline SolrAdapter
        'date': solr_date,
        'doctype': doctype,
        'doctype_facet_hier': ["0/Non-Article", "1/Non-Article/Software"],
        'doi': [doi],
        'eid': doi,
        'email': ['-'] * n_authors,
        'first_author': authors[0] if n_authors > 0 else '',
        'first_author_facet_hier': author_facet_hier[:2],
        # Guard on normalized_authors as well: it can be empty even when
        # authors is not, which previously raised IndexError.
        'first_author_norm': normalized_authors[0]
        if n_authors > 0 and normalized_authors else '',
        'links_data': [
            '{{"access": "", "instances": "", "title": "", "type": "electr", "url": "{}"}}'
            .format(app.conf['DOI_URL'] + doi)
        ],  # TODO: How is it different from nonbib?
        'identifier': [bibcode, doi] + alternate_bibcode,
        'esources': ["PUB_HTML"],
        'citation': citations,
        'citation_count': n_citations,
        'citation_count_norm': n_citations / n_authors if n_authors > 0 else 0,
        'data_count': 1,  # Number of elements in `links_data`
        'keyword': keywords,
        'keyword_facet': keywords,
        'keyword_norm': ["-"] * n_keywords,
        'keyword_schema': ["-"] * n_keywords,
        'property': [
            "ESOURCE", "NONARTICLE", "NOT REFEREED", "PUB_OPENACCESS",
            "OPENACCESS"
        ],
        'pub': source,
        'pub_raw': source,
        'pubdate': pubdate,
        'pubnote': [],
        'read_count': 0,
        'title': [title],
        'publisher': source,
        'version': version
    }
    if version is None:
        # Concept DOIs may not contain version
        del record_dict['version']

    # Status
    if citation_change.status == Status.new:
        status = 2
    elif citation_change.status == Status.updated:
        status = 3
    elif citation_change.status == Status.deleted:
        status = 1
        # Only use this field for deletions, otherwise Solr will complain the
        # field does not exist and if this key does not exist in the
        # dict/protobuf, the message will be treated as new/update by
        # MasterPipeline
        record_dict['status'] = status
    else:
        status = 0  # active

    if db_versions not in [{"": ""}, {}, None]:
        record_dict['property'].append('ASSOCIATED')
    if is_release:
        record_dict['property'].append('RELEASE')

    record = DenormalizedRecord(**record_dict)
    nonbib_record = _build_nonbib_record(app, citation_change, record,
                                         db_versions, status)
    return record, nonbib_record