Example #1
0
 def _process_row(self, row):
     """Turn one results-table row into an item dictionary.

     The row's ``td`` cells are read by position: 1 is the series, 2 the
     control symbol (inside a link), 3 the title plus the access-status and
     location divs, 4 the date string, 5 the digitisation link (if any)
     and 7 the barcode.

     Args:
         row: A parsed table row exposing ``findAll('td')``.

     Returns:
         A dict with the keys ``series``, ``control_symbol``, ``title``,
         ``access_status``, ``location``, ``date_range``,
         ``digitised_status``, ``digitised_pages`` and ``identifier``.
     """
     cells = row.findAll('td')
     series = cells[1].string.strip()
     control_symbol = cells[2].a.string.strip()

     # The title cell also carries the access status and location, each in
     # its own div underneath the title text.
     title_cell = cells[3]
     title = title_cell.contents[0].string.strip()
     access_text = title_cell.find('div', 'CombinedTitleBottomLeft').string
     access_status = re.search(r'Access status: (\w+)', access_text).group(1)
     location_text = title_cell.find('div', 'CombinedTitleBottomRight').string
     location = re.search(r'Location: (\w+)', location_text).group(1)

     # Keep the raw date string alongside the ISO-formatted range.
     raw_dates = cells[4].string.strip()
     parsed_dates = utilities.process_date_string(raw_dates)
     date_range = {
         'date_str': raw_dates,
         'start_date': utilities.convert_date_to_iso(parsed_dates['start_date']),
         'end_date': utilities.convert_date_to_iso(parsed_dates['end_date']),
     }

     barcode = cells[7].string.strip()
     # A link in cell 5 marks the item as digitised; only then is the page
     # count looked up.
     is_digitised = cells[5].find('a') is not None
     page_count = self.get_digitised_pages(barcode) if is_digitised else 0

     return {
         'series': series,
         'control_symbol': control_symbol,
         'title': title,
         'access_status': access_status,
         'location': location,
         'date_range': date_range,
         'digitised_status': is_digitised,
         'digitised_pages': page_count,
         'identifier': barcode,
     }
Example #2
0
 def _process_row(self, row):
     """Turn one results-table row into an item dictionary.

     The row's ``td`` cells are read by position: 1 is the series, 2 the
     control symbol (inside a link), 3 the title plus the access-status and
     location divs, 4 the date string, 5 the digitisation link (if any)
     and 7 the barcode.

     Args:
         row: A parsed table row exposing ``findAll('td')``.

     Returns:
         A dict with the keys ``series``, ``control_symbol``, ``title``,
         ``access_status``, ``location``, ``contents_dates``,
         ``digitised_status`` and ``identifier``. ``digitised_pages`` is
         included only when ``self.get_digitised`` is truthy.
     """
     cells = row.findAll('td')
     series = cells[1].string.strip()
     control_symbol = cells[2].a.string.strip()

     # The title cell also carries the access status and location, each in
     # its own div underneath the title text.
     title_cell = cells[3]
     title = title_cell.contents[0].string.strip()
     access_text = title_cell.find('div', 'CombinedTitleBottomLeft').string
     access_status = re.search(r'Access status: (\w+)', access_text).group(1)
     location_text = title_cell.find('div', 'CombinedTitleBottomRight').string
     location = re.search(r'Location: (\w+)', location_text).group(1)

     # Keep the raw date string alongside the ISO-formatted range.
     raw_dates = cells[4].string.strip()
     parsed_dates = utilities.process_date_string(raw_dates)
     contents_dates = {
         'date_str': raw_dates,
         'start_date': utilities.convert_date_to_iso(parsed_dates['start_date']),
         'end_date': utilities.convert_date_to_iso(parsed_dates['end_date']),
     }

     barcode = cells[7].string.strip()
     # A link in cell 5 marks the item as digitised.
     is_digitised = cells[5].find('a') is not None

     item = {
         'series': series,
         'control_symbol': control_symbol,
         'title': title,
         'access_status': access_status,
         'location': location,
         'contents_dates': contents_dates,
         'digitised_status': is_digitised,
     }
     # Page counts are optional: they are only reported when the harvester
     # was asked for them, and only looked up for digitised items.
     if self.get_digitised:
         item['digitised_pages'] = (
             self.get_digitised_pages(barcode) if is_digitised else 0
         )
     item['identifier'] = barcode
     return item
Example #3
0
 def _get_formatted_dates(self, label, entity_id, date_format):
     """Return the dates stored under ``label`` in the requested format.

     The raw date string is fetched with ``self._get_value(label,
     entity_id)`` and parsed by ``utilities.process_date_string``.

     Args:
         label: Passed through to ``self._get_value``.
         entity_id: Passed through to ``self._get_value``.
         date_format: ``'iso'`` to convert the start and end dates with
             ``utilities.convert_date_to_iso``; ``'obj'`` to return the
             result of ``utilities.process_date_string`` unchanged.

     Returns:
         A dict of dates. If the value lookup raises ``AttributeError``
         the result is ``{'date_str': '', 'start_date': None,
         'end_date': None}``.

     Raises:
         ValueError: If a date string was found but ``date_format`` is
             neither ``'iso'`` nor ``'obj'``.
     """
     try:
         date_str = self._get_value(label, entity_id)
     except AttributeError:
         # The lookup failed, so there is no date string to echo back.
         # Return an empty structure rather than touching the unbound
         # ``date_str`` (the previous code raised UnboundLocalError here).
         return {'date_str': '', 'start_date': None, 'end_date': None}

     dates = utilities.process_date_string(date_str)
     if date_format == 'iso':
         return {
             'date_str': date_str,
             'start_date': utilities.convert_date_to_iso(dates['start_date']),
             'end_date': utilities.convert_date_to_iso(dates['end_date']),
         }
     if date_format == 'obj':
         return dates
     # Previously an unknown format fell through to an unbound return value.
     raise ValueError(
         "Unsupported date_format {!r}; expected 'iso' or 'obj'".format(
             date_format))
Example #4
0
 def test_convert_date_to_iso(self):
     """Check ``utilities.convert_date_to_iso`` at day, month and year precision."""
     # Each pair is (date dict handed to the converter, expected ISO string).
     expectations = (
         (
             dict(date=datetime.datetime(1884, 6, 2), day=True, month=True),
             '1884-06-02',
         ),
         (
             dict(date=datetime.datetime(1778, 12, 1), day=False, month=True),
             '1778-12',
         ),
         (
             dict(date=datetime.datetime(1962, 1, 1), day=False, month=False),
             '1962',
         ),
     )
     for date_dict, iso_string in expectations:
         self.assertEqual(utilities.convert_date_to_iso(date_dict), iso_string)
Example #5
0
 def _get_relations(self, label, entity_id, date_format):
     """Collect the relations listed in the cell found for ``label``.

     Each ``li`` element of the cell returned by ``self._get_cell(label,
     entity_id)`` becomes one dict holding its dates, identifier and title.

     Args:
         label: Passed through to ``self._get_cell``.
         entity_id: Passed through to ``self._get_cell``.
         date_format: ``'iso'`` to convert the start and end dates with
             ``utilities.convert_date_to_iso``; ``'obj'`` to use the
             result of ``utilities.process_date_string`` as is.

     Returns:
         A list of dicts with the keys ``date_str``, ``start_date``,
         ``end_date``, ``identifier`` and ``title``, or ``None`` when no
         cell is found. A relation without a usable dates div gets an
         empty ``date_str`` and ``None`` for both dates.

     Raises:
         ValueError: If a relation has dates but ``date_format`` is
             neither ``'iso'`` nor ``'obj'``.
     """
     cell = self._get_cell(label, entity_id)
     if cell is None:
         return None

     relations = []
     for relation in cell.findAll('li'):
         try:
             date_str = relation.find('div', 'dates').string.strip()
         except AttributeError:
             # No dates div (or no text in it). The previous code read the
             # unbound ``date_str`` here and never set ``formatted_dates``,
             # so it either crashed or reused the previous relation's dates.
             formatted_dates = {
                 'date_str': '',
                 'start_date': None,
                 'end_date': None,
             }
         else:
             dates = utilities.process_date_string(date_str)
             if date_format == 'iso':
                 formatted_dates = {
                     'date_str': date_str,
                     'start_date': utilities.convert_date_to_iso(
                         dates['start_date']),
                     'end_date': utilities.convert_date_to_iso(
                         dates['end_date']),
                 }
             elif date_format == 'obj':
                 formatted_dates = dates
             else:
                 raise ValueError(
                     "Unsupported date_format {!r}; expected 'iso' or "
                     "'obj'".format(date_format))

         details = list(relation.find('div', 'linkagesInfo').stripped_strings)
         identifier = details[0]
         # The second string is the title with its first two characters
         # dropped; with only one string the identifier doubles as title.
         title = details[1][2:] if len(details) > 1 else identifier
         relations.append({
             'date_str': formatted_dates['date_str'],
             'start_date': formatted_dates['start_date'],
             'end_date': formatted_dates['end_date'],
             'identifier': identifier,
             'title': title
         })
     return relations
Example #6
0
 def _get_formatted_dates(self, label, entity_id, date_format):
     """Return the dates stored under ``label`` in the requested format.

     The raw date string is fetched with ``self._get_value(label,
     entity_id)`` and parsed by ``utilities.process_date_string``.

     Args:
         label: Passed through to ``self._get_value``.
         entity_id: Passed through to ``self._get_value``.
         date_format: ``'iso'`` to convert the start and end dates with
             ``utilities.convert_date_to_iso``; ``'obj'`` to return the
             result of ``utilities.process_date_string`` unchanged.

     Returns:
         A dict of dates. If the value lookup raises ``AttributeError``
         the result is ``{'date_str': '', 'start_date': None,
         'end_date': None}``.

     Raises:
         ValueError: If a date string was found but ``date_format`` is
             neither ``'iso'`` nor ``'obj'``.
     """
     try:
         date_str = self._get_value(label, entity_id)
     except AttributeError:
         # Nothing was found, so ``date_str`` is unbound on this path. The
         # previous code referenced it anyway and then returned a name it
         # had never assigned; return an empty structure instead.
         formatted_dates = {
             'date_str': '',
             'start_date': None,
             'end_date': None
         }
     else:
         dates = utilities.process_date_string(date_str)
         if date_format == 'iso':
             formatted_dates = {
                 'date_str':
                 date_str,
                 'start_date':
                 utilities.convert_date_to_iso(dates['start_date']),
                 'end_date':
                 utilities.convert_date_to_iso(dates['end_date']),
             }
         elif date_format == 'obj':
             formatted_dates = dates
         else:
             # Fail with a clear message instead of an unbound-variable
             # error at the return statement.
             raise ValueError(
                 "Unsupported date_format {!r}; expected 'iso' or "
                 "'obj'".format(date_format))
     return formatted_dates
Example #7
0
 def _get_relations(self, label, entity_id, date_format):
     """Collect the relations listed in the cell found for ``label``.

     Each ``li`` element of the cell returned by ``self._get_cell(label,
     entity_id)`` becomes one dict holding its dates, identifier and title.

     Args:
         label: Passed through to ``self._get_cell``.
         entity_id: Passed through to ``self._get_cell``.
         date_format: ``'iso'`` to convert the start and end dates with
             ``utilities.convert_date_to_iso``; ``'obj'`` to use the
             parsed dates as they are.

     Returns:
         A list of dicts with the keys ``date_str``, ``start_date``,
         ``end_date``, ``identifier`` and ``title``, or ``None`` when no
         cell is found.
     """
     cell = self._get_cell(label, entity_id)
     if cell is None:
         return None

     collected = []
     for entry in cell.findAll('li'):
         try:
             raw_dates = entry.find('div', 'dates').string.strip()
         except AttributeError:
             # No dates div (or no text in it): fall back to empty dates.
             raw_dates = ''
             parsed = {'date_str': '', 'start_date': None, 'end_date': None}
         else:
             parsed = utilities.process_date_string(raw_dates)

         # NOTE: any other date_format leaves ``formatted_dates`` unassigned.
         # In 'iso' mode the converter is also given the ``None`` dates of
         # the fallback above.
         if date_format == 'iso':
             formatted_dates = {
                 'date_str': raw_dates,
                 'start_date': utilities.convert_date_to_iso(parsed['start_date']),
                 'end_date': utilities.convert_date_to_iso(parsed['end_date']),
             }
         elif date_format == 'obj':
             formatted_dates = parsed

         parts = list(entry.find('div', 'linkagesInfo').stripped_strings)
         identifier = parts[0]
         # The second string is the title with its first two characters
         # dropped; with only one string the identifier doubles as title.
         title = parts[1][2:] if len(parts) > 1 else identifier

         record = {
             key: formatted_dates[key]
             for key in ('date_str', 'start_date', 'end_date')
         }
         record['identifier'] = identifier
         record['title'] = title
         collected.append(record)
     return collected