def date_closed(self, key, value):
    """Return the normalized closing date, if the field carries one.

    Side effects:

    * subfield ``i``, when truthy, is normalized into ``deadline_date``;
    * subfield ``l`` containing ``@`` is appended to ``reference_email``;
    * otherwise, subfield ``l`` containing ``www`` or ``http`` is appended
      to ``urls`` as ``{'value': ...}``.

    Any other truthy ``l`` value is normalized and returned. ``None`` is
    returned when ``l`` is empty or was classified as an email or a URL.
    """
    def _looks_like_email(text):
        return '@' in text

    def _looks_like_url(text):
        return 'www' in text or 'http' in text

    field = force_single_element(value)

    deadline = force_single_element(field.get('i'))
    if deadline:
        self['deadline_date'] = normalize_date(deadline)

    info = force_single_element(field.get('l'))
    if not info:
        return None

    # The email check takes precedence over the URL check.
    if _looks_like_email(info):
        if 'reference_email' not in self:
            self['reference_email'] = [info]
        else:
            self['reference_email'].append(info)
        return None

    if _looks_like_url(info):
        if 'urls' not in self:
            self['urls'] = [{'value': info}]
        else:
            self['urls'].append({'value': info})
        return None

    return normalize_date(info)
def name(self, key, value):
    """Populate the ``name`` key.

    Side effects: sets ``status`` (lower-cased subfield ``g``) and, when
    subfield ``d`` is truthy, ``death_date`` and possibly ``birth_date``.

    Returns a dict with ``numeration`` (``b``), ``preferred_name`` (``q``),
    ``title`` (``c``, or ``None`` when it equals the placeholder text
    ``'title (e.g. Sir)'``) and ``value`` (``a``, falling back to the
    normalized ``q``).
    """
    raw_dates = value.get('d')
    if raw_dates:
        try:
            # A value that parses as a whole is stored as the death date.
            self['death_date'] = normalize_date(raw_dates)
        except ValueError:
            # Otherwise treat it as a "birth - death" range; if ' - ' does
            # not split it, retry with a bare '-'.
            pieces = raw_dates.split(' - ')
            if len(pieces) == 1:
                pieces = pieces[0].split('-')
            self['birth_date'] = normalize_date(pieces[0])
            self['death_date'] = normalize_date(pieces[1])

    self['status'] = force_single_element(value.get('g', '')).lower()

    numeration = force_single_element(value.get('b', ''))
    preferred_name = force_single_element(value.get('q', ''))

    title = force_single_element(value.get('c', ''))
    if title == 'title (e.g. Sir)':
        title = None

    a_name = force_single_element(value.get('a', ''))
    q_name = force_single_element(value.get('q', ''))
    full_name = a_name or normalize_name(q_name)

    return {
        'numeration': numeration,
        'preferred_name': preferred_name,
        'title': title,
        'value': full_name,
    }
def positions(self, key, value):
    """Build a position entry from one field.

    Each subfield ``z`` value is either the marker ``current`` (compared
    case-insensitively) or is turned into an institution record reference
    through ``get_record_ref(maybe_int(...), 'institutions')``. If several
    references are present, the last one wins.

    Returns a dict with the institution (``None`` when subfield ``a`` is
    falsy), current and old emails (``m`` / ``o``), the raw and normalized
    rank (``r``), normalized start and end dates (``s`` / ``t``) and the
    ``current`` flag.
    """
    is_current = False
    institution_ref = None
    for marker in force_list(value.get('z')):
        if marker.lower() != 'current':
            institution_ref = get_record_ref(maybe_int(marker), 'institutions')
        else:
            is_current = True

    institution = {
        'name': value.get('a'),
        'record': institution_ref,
        'curated_relation': institution_ref is not None,
    }

    current_emails = list(force_list(value.get('m')))
    former_emails = list(force_list(value.get('o')))

    raw_rank = value.get('r')
    normalized_rank = normalize_rank(raw_rank)

    if not institution['name']:
        institution = None

    return {
        'institution': institution,
        'emails': current_emails,
        'old_emails': former_emails,
        '_rank': raw_rank,
        'rank': normalized_rank,
        'start_date': normalize_date(value.get('s')),
        'end_date': normalize_date(value.get('t')),
        'current': is_current,
    }
def add_institution(
    self,
    institution,
    start_date=None,
    end_date=None,
    rank=None,
    record=None,
    curated=False,
    current=False,
    hidden=False,
):
    """Add an institution where the person works/worked.

    The new entry is appended to ``positions``, which is then re-sorted in
    descending order of ``self._get_institution_priority_tuple``.

    Args:
        :param institution: name of the institution.
        :type institution: string

        :param start_date: the date when the person joined the institution,
            in any format. Stored normalized; omitted when falsy.
        :type start_date: string

        :param end_date: the date when the person left the institution, in
            any format. Stored normalized; omitted when falsy.
        :type end_date: string

        :param rank: the rank of academic position of the person inside the
            institution. Omitted when falsy.
        :type rank: string

        :param record: URI for the institution record. Omitted when falsy.
        :type record: string

        :param curated: if the institution has been curated i.e. has been
            verified.
        :type curated: boolean

        :param current: if the person is currently associated with this
            institution.
        :type current: boolean

        :param hidden: stored as the ``hidden`` flag of the entry.
        :type hidden: boolean
    """
    position = {'institution': institution}

    for field, raw_date in (('start_date', start_date), ('end_date', end_date)):
        if raw_date:
            position[field] = normalize_date(raw_date)

    for field, item in (('rank', rank), ('record', record)):
        if item:
            position[field] = item

    position["hidden"] = hidden
    position['curated_relation'] = curated
    position['current'] = current

    self._append_to('positions', position)

    all_positions = self.obj['positions']
    all_positions.sort(key=self._get_institution_priority_tuple, reverse=True)
def dates(self, key, values):
    """Populate the project date keys through side effects only.

    For every field in ``values``, each truthy subfield is normalized and
    stored: ``q`` -> ``date_proposed``, ``r`` -> ``date_approved``,
    ``s`` -> ``date_started``, ``c`` -> ``date_cancelled`` and
    ``t`` -> ``date_completed``. Later fields overwrite earlier ones.

    Always raises ``IgnoreKey``; nothing is returned.
    """
    subfield_to_key = (
        ('q', 'date_proposed'),
        ('r', 'date_approved'),
        ('s', 'date_started'),
        ('c', 'date_cancelled'),
        ('t', 'date_completed'),
    )

    for field in force_list(values):
        for code, target in subfield_to_key:
            if field.get(code):
                self[target] = normalize_date(field.get(code))

    raise IgnoreKey
def _harvesting_info(self, key, value):
    """Build the harvesting-info dict from one field.

    Subfields ``a``, ``3`` and ``i`` are copied verbatim into ``coverage``,
    ``last_seen_item`` and ``method``; subfield ``c`` is normalized into
    ``date_last_harvest``.
    """
    coverage = value.get('a')
    last_harvest = normalize_date(value.get('c'))
    last_seen = value.get('3')
    method = value.get('i')

    return {
        'coverage': coverage,
        'date_last_harvest': last_harvest,
        'last_seen_item': last_seen,
        'method': method,
    }
def _desy_bookkeeping(self, key, value):
    """Populate the ``_desy_bookkeeping`` key.

    ``date`` is the normalized subfield ``d``, ``expert`` is the single
    element of subfield ``a`` and ``status`` is subfield ``s`` verbatim.
    """
    bookkeeping_date = normalize_date(value.get('d'))
    expert = force_single_element(value.get('a'))
    status = value.get('s')

    return {
        'date': bookkeeping_date,
        'expert': expert,
        'status': status,
    }
def add_imprint_date(self, imprint_date):
    """Append an imprint holding only a normalized date to ``imprints``.

    :param imprint_date: a (partial) date in any format. The date should
        contain at least a year.
    :type imprint_date: string
    """
    imprint = {'date': normalize_date(imprint_date)}
    self._append_to('imprints', imprint)
def add_preprint_date(self, preprint_date):
    """Store the normalized preprint date under ``preprint_date``.

    :param preprint_date: a (partial) date in any format. The date should
        contain at least a year.
    :type preprint_date: string
    """
    normalized = normalize_date(preprint_date)
    self.record['preprint_date'] = normalized
def add_book(self, publisher=None, place=None, date=None):
    """Append an imprint describing a book to ``imprints``.

    Only the arguments that are not ``None`` end up in the imprint. The
    imprint is appended even when it is empty. Nothing is returned.

    :param publisher: publisher name
    :type publisher: string

    :param place: place of publication
    :type place: string

    :param date: A (partial) date in any format.
        The date should contain at least a year
    :type date: string
    """
    imprint = {}

    if date is not None:
        imprint['date'] = normalize_date(date)

    for field, text in (('place', place), ('publisher', publisher)):
        if text is not None:
            imprint[field] = text

    self._append_to('imprints', imprint)
def add_project(
    self,
    name,
    record=None,
    start_date=None,
    end_date=None,
    curated=False,
    current=False,
    hidden=False,
):
    """Add an experiment that the person worked on.

    The new entry is appended to ``project_membership``, which is then
    re-sorted in descending order of ``self._get_work_priority_tuple``.

    Args:
        :param name: name of the experiment.
        :type name: string

        :param start_date: the date when the person started working on the
            experiment. Stored normalized; omitted when falsy.
        :type start_date: string

        :param end_date: the date when the person stopped working on the
            experiment. Stored normalized; omitted when falsy.
        :type end_date: string

        :param record: URI for the experiment record. Omitted when falsy.
        :type record: string

        :param curated: if the experiment has been curated i.e. has been
            verified.
        :type curated: boolean

        :param current: if the person is currently working on this
            experiment.
        :type current: boolean

        :param hidden: stored as the ``hidden`` flag of the entry.
        :type hidden: boolean
    """
    membership = {'name': name}

    for field, raw_date in (('start_date', start_date), ('end_date', end_date)):
        if raw_date:
            membership[field] = normalize_date(raw_date)

    if record:
        membership['record'] = record

    membership["hidden"] = hidden
    membership['curated_relation'] = curated
    membership['current'] = current

    self._append_to('project_membership', membership)

    memberships = self.obj['project_membership']
    memberships.sort(key=self._get_work_priority_tuple, reverse=True)
def set_opening_date(self, date=None):
    """Set the conference opening date.

    Does nothing when ``date`` is ``None``; otherwise the normalized date
    is stored under ``opening_date`` in the record.

    Args:
        date (str): conference opening date.
    """
    if date is None:
        return

    self.record['opening_date'] = normalize_date(date=date)
def positions(self, key, value):
    """Populate the positions field.

    Also populates the ``email_addresses`` field by side effect: addresses
    in subfield ``m`` are recorded with ``current`` set to ``True`` and
    those in subfield ``o`` with ``current`` set to ``False``. This happens
    even when no position is returned.

    Returns ``None`` when the field has no subfield ``a``. Otherwise returns
    a dict with the institution name, its record reference, the normalized
    rank and the normalized start and end dates. ``curated_relation`` and
    ``current`` are either ``True`` or ``None``.
    """
    collected_emails = self.get("email_addresses", [])

    is_current = None
    institution_ref = None
    for marker in force_list(value.get('z')):
        if marker.lower() != 'current':
            institution_ref = get_record_ref(maybe_int(marker), 'institutions')
        elif value.get('a'):
            # The ``current`` marker only counts when an institution is named.
            is_current = True
        else:
            is_current = None

    normalized_rank = normalize_rank(value.get('r'))

    valid_addresses = force_list(value.get('m'))
    stale_addresses = force_list(value.get('o'))

    for addresses, still_valid in ((valid_addresses, True), (stale_addresses, False)):
        for address in addresses:
            collected_emails.append({
                'value': address,
                'current': still_valid,
            })

    self['email_addresses'] = collected_emails

    if 'a' not in value:
        return None

    curated_relation = None
    if institution_ref is not None:
        curated_relation = True

    return {
        'institution': value['a'],
        'record': institution_ref,
        'curated_relation': curated_relation,
        'rank': normalized_rank,
        'start_date': normalize_date(value.get('s')),
        'end_date': normalize_date(value.get('t')),
        'current': is_current,
    }
def _dates(self, key, value):
    """Don't populate any key through the return value.

    Instead, each truthy subfield is normalized and stored through side
    effects: ``q`` -> ``date_proposed``, ``r`` -> ``date_approved``,
    ``s`` -> ``date_started``, ``c`` -> ``date_cancelled`` and
    ``t`` -> ``date_completed``.

    Always raises ``IgnoreKey``.
    """
    subfield_to_key = (
        ('q', 'date_proposed'),
        ('r', 'date_approved'),
        ('s', 'date_started'),
        ('c', 'date_cancelled'),
        ('t', 'date_completed'),
    )

    for code, target in subfield_to_key:
        if value.get(code):
            self[target] = normalize_date(value[code])

    raise IgnoreKey
def add_project(
    self,
    name,
    record=None,
    start_date=None,
    end_date=None,
    curated=False,
    current=False,
):
    """Add an experiment that the person worked on.

    The new entry is appended to ``project_membership``.

    Args:
        :param name: name of the experiment.
        :type name: string

        :param start_date: the date when the person started working on the
            experiment. Stored normalized; omitted when falsy.
        :type start_date: string

        :param end_date: the date when the person stopped working on the
            experiment. Stored normalized; omitted when falsy.
        :type end_date: string

        :param record: URI for the experiment record. Omitted when falsy.
        :type record: string

        :param curated: if the experiment has been curated i.e. has been
            verified.
        :type curated: boolean

        :param current: if the person is currently working on this
            experiment.
        :type current: boolean
    """
    membership = {'name': name}

    for field, raw_date in (('start_date', start_date), ('end_date', end_date)):
        if raw_date:
            membership[field] = normalize_date(raw_date)

    if record:
        membership['record'] = record

    membership['curated_relation'] = curated
    membership['current'] = current

    self._append_to('project_membership', membership)
def normalize_date_aggressively(date):
    """Normalize date, stripping date parts until a valid date is obtained.

    The placeholder values ``'0000'`` and ``'9999'`` yield ``None``. When
    ``normalize_date`` fails with ``IndexError``, ``TypeError`` or
    ``ValueError``, the last ``-``-separated part is dropped and the
    remainder is tried again. Once no ``-`` is left, the error is re-raised.
    """
    if date in {'0000', '9999'}:
        return None

    try:
        return normalize_date(date)
    except (IndexError, TypeError, ValueError):
        if '-' not in date:
            raise

    # Drop everything from the last '-' onwards and retry.
    shortened = date.rsplit('-', 1)[0]
    return normalize_date_aggressively(shortened)
def thesis_info(self, key, value):
    """Populate the ``thesis_info`` key.

    Starts from any ``thesis_info`` already present and overwrites its
    ``date`` (normalized subfield ``d``), ``degree_type`` (mapped from
    subfield ``b``) and ``institutions`` (subfields ``c`` and ``z``).
    """
    degree_types = {
        'RAPPORT DE STAGE': 'other',
        'INTERNSHIP REPORT': 'other',
        'DIPLOMA': 'diploma',
        'BACHELOR': 'bachelor',
        'LAUREA': 'laurea',
        'MASTER': 'master',
        'THESIS': 'other',
        'PHD': 'phd',
        'PDF': 'phd',
        'PH.D. THESIS': 'phd',
        'HABILITATION': 'habilitation',
    }

    def _degree_type_of(field):
        # No degree type at all when ``b`` is empty; unknown labels map
        # to 'other'.
        label = force_single_element(field.get('b', ''))
        if not label:
            return None
        return degree_types.get(label.upper(), 'other')

    def _institutions_of(field):
        names = force_list(field.get('c'))
        recids = force_list(field.get('z'))

        # XXX: names and recids are paired only when there are as many of
        # each, otherwise a name might be matched with the wrong recid.
        if len(names) == len(recids):
            return [{
                'curated_relation': True,
                'name': name,
                'record': get_record_ref(recid, 'institutions'),
            } for name, recid in zip(names, recids)]

        return [{'name': name} for name in names]

    result = self.get('thesis_info', {})

    result['date'] = normalize_date(force_single_element(value.get('d')))
    result['degree_type'] = _degree_type_of(value)
    result['institutions'] = _institutions_of(value)

    return result
def public_notes(self, key, value):
    """Populate the ``public_notes`` key.

    Also populates the ``curated`` and ``thesis_info`` keys through side
    effects. Each subfield ``a`` note is handled as follows:

    * a note matching ``IS_DEFENSE_DATE`` sets ``thesis_info.defense_date``;
      if its date cannot be normalized it is kept as a plain note instead;
    * a "brief/temporary entry" marker sets ``curated`` to ``False`` and is
      dropped;
    * any other note is appended with the source taken from subfield ``9``.
    """
    not_curated_markers = (
        '*Brief entry*',
        '* Brief entry *',
        '*Temporary entry*',
        '* Temporary entry *',
        '*Temporary record*',
        '* Temporary record *',
    )

    notes = self.get('public_notes', [])
    curated = self.get('curated')
    thesis_info = self.get('thesis_info', {})

    source = force_single_element(value.get('9', ''))

    def _as_note(text):
        return {
            'source': source,
            'value': text,
        }

    for field in force_list(value):
        for text in force_list(field.get('a')):
            defense = IS_DEFENSE_DATE.match(text)
            if defense:
                try:
                    thesis_info['defense_date'] = normalize_date(
                        defense.group('defense_date'))
                except ValueError:
                    notes.append(_as_note(text))
            elif text in not_curated_markers:
                curated = False
            else:
                notes.append(_as_note(text))

    self['curated'] = curated
    self['thesis_info'] = thesis_info

    return notes
def test_normalize_date_handles_year_month():
    """A year-month string comes back unchanged."""
    year_month = '1686-06'

    assert year_month == normalize_date(year_month)
def imprints(self, key, value):
    """Build an imprint from one field.

    Subfields ``a`` and ``b`` are copied verbatim into ``place`` and
    ``publisher``; subfield ``c`` is normalized into ``date``.
    """
    place = value.get('a')
    publisher = value.get('b')
    imprint_date = normalize_date(value.get('c'))

    return {
        'place': place,
        'publisher': publisher,
        'date': imprint_date,
    }
def set_imprint_date(self, date):
    """Store the normalized ``date`` in the reference's imprint.

    ``self._ensure_reference_field('imprint', {})`` is called first, then
    the result is written to ``self.obj['reference']['imprint']['date']``.
    """
    self._ensure_reference_field('imprint', {})

    # Normalize before touching the record, as the original assignment did.
    normalized = normalize_date(date)
    self.obj['reference']['imprint']['date'] = normalized
def test_normalize_date_handles_ISO():
    """A full ISO date comes back unchanged."""
    iso_date = '1686-06-30'

    assert iso_date == normalize_date(iso_date)
def preprint_date(self, key, value):
    """Return subfield ``c`` of the field, normalized as a date."""
    raw_date = value.get('c')
    return normalize_date(raw_date)
def test_normalize_date_returns_none_on_none():
    """``None`` in, ``None`` out."""
    result = normalize_date(None)

    assert result is None
rec['supervisor'] = [[re.sub('^Dr. ', '', p.text.strip())]] for div in artpage.body.find_all('div', attrs={'id': 'advisor2'}): for p in div.find_all('p'): rec['supervisor'].append([re.sub('^Dr. ', '', p.text.strip())]) if not rec.has_key('doi'): rec['doi'] = '20.2000/Connecticut/' + re.sub( '\W', '', re.sub('.*edu', '', rec['artlink'])) rec['link'] = rec['artlink'] #keywords for div in artpage.body.find_all('div', attrs={'id': 'subject_area'}): for p in div.find_all('p'): rec['keyw'] = re.split(', ', p.text.strip()) #embargo for div in artpage.body.find_all('div', attrs={'id': 'embargo_date'}): for p in div.find_all('p'): rec['embargo'] = normalize_date(p.text.strip()) if 'pdf_url' in rec.keys(): if 'embargo' in rec.keys(): if rec['embargo'] > stampoftoday: print ' embargo until %s' % (rec['embargo']) else: rec['FFT'] = rec['pdf_url'] else: rec['FFT'] = rec['pdf_url'] #depatrment for div in artpage.body.find_all('div', attrs={'id': 'department'}): for p in div.find_all('p'): department = p.text.strip() if department in boringdepartments: print ' skip "%s"' % (department) else:
def test_normalize_date_raises_on_unparseable_dates():
    """Text that is not a date at all is rejected with ``ValueError``."""
    not_a_date = 'Foo'

    with pytest.raises(ValueError):
        normalize_date(not_a_date)
def test_normalize_date_raises_on_dates_without_year():
    """A date lacking a year is rejected with ``ValueError``."""
    yearless_date = 'Fri June 30'

    with pytest.raises(ValueError):
        normalize_date(yearless_date)
def test_normalize_date_handles_human_friendly_dates():
    """A spelled-out date is converted to its ISO form."""
    human_friendly = 'Fri June 30 1686'

    assert '1686-06-30' == normalize_date(human_friendly)
def test_normalize_date_handles_default_dates():
    """The dates ``0001-01-01`` and ``0002-02-02`` come back unchanged."""
    for default_date in ('0001-01-01', '0002-02-02'):
        assert default_date == normalize_date(default_date)
def legacy_creation_date(self, key, value):
    """Return the legacy creation date, keeping one that is already set.

    An existing ``legacy_creation_date`` is returned untouched; only when
    it is absent is subfield ``x`` normalized and returned.
    """
    if 'legacy_creation_date' not in self:
        return normalize_date(value.get('x'))

    return self['legacy_creation_date']