def test_get_paragraphs_between(self):
    """Check that the paragraphs between two marker paragraphs are returned."""

    expected = ['Very large tomatoes', 'Huge apricots', 'Mediocre marrows']

    # Parse the fixture document, then extract the section between the markers
    parsed = parsedoc.parse_docx_paragraphs(self.rawxml)
    found = parsedoc.get_paragraphs_between(parsed, 'Main items', 'Other stuff')

    self.assertEqual(expected, found)
def read_variables(self, startmarker, endmarker):
    """Read variables table from document and return as list of VariableSpecification objects.

    The paragraphs between ``startmarker`` and ``endmarker`` are scanned in order.
    A paragraph starting with ``'Variable: '`` begins a new variable: it is split on
    single spaces, the second field is used as the name and the third is looked up
    in ``VARIABLE_TYPES`` to obtain the data type. Every following non-empty
    paragraph is treated as a ``key: value`` metadata entry for the most recently
    started variable. Empty paragraphs are ignored.

    Raises:
        ValueError: if a metadata paragraph appears before any ``'Variable: '``
            paragraph, so there is no variable to attach it to.
    """

    # get relevant paragraphs
    paragraphs_variables = parsedoc.get_paragraphs_between(
        self.paragraphs_all, startmarker, endmarker)

    # build list
    variables = []
    current_variable = None
    for paragraph in paragraphs_variables:

        if paragraph.startswith('Variable: '):

            # header line: 'Variable: <name> <type>'
            fields = paragraph.split(' ')
            name = fields[1]
            dtype = VARIABLE_TYPES[fields[2]]
            current_variable = VariableSpecification(name, dtype)
            variables.append(current_variable)

        elif paragraph:

            # metadata must belong to a variable that has already been declared
            if current_variable is None:
                raise ValueError(
                    'Metadata paragraph {0!r} found before any variable definition'.format(paragraph))

            # split at the first separator only, so the value keeps any later ': '
            # (a paragraph with no separator yields the whole text as key and '' as value)
            key, _, value = paragraph.partition(': ')

            # allow arrays of integers as needed for flag meanings
            if value.startswith('['):
                value = ast.literal_eval(value)

            current_variable.add_metadata(key, value)

    # return result
    return variables
def read_name_list(self, startmarker, endmarker):
    """Return the items of the comma-separated list found between the two markers.

    Only the first paragraph between ``startmarker`` and ``endmarker`` is used.
    It is stripped of surrounding whitespace and split on ``', '``.
    """

    section = parsedoc.get_paragraphs_between(self.paragraphs_all, startmarker, endmarker)

    # The list is expected on the first paragraph of the section
    first_line = section[0]
    trimmed = first_line.strip()
    return trimmed.split(', ')
def test_get_paragraphs_containing(self):
    """Check that each listed item is mapped to its paragraph containing 'Measurement:'."""

    expected = {
        'Very large tomatoes': 'Measurement: 20cm',
        'Huge apricots': 'Measurement: 15cm',
        'Mediocre marrows': 'Measurement: 12cm',
    }

    # Item names come from the section between the two markers
    parsed = parsedoc.parse_docx_paragraphs(self.rawxml)
    item_names = parsedoc.get_paragraphs_between(parsed, 'Main items', 'Other stuff')

    # Look up the measurement paragraph for every item name
    found = parsedoc.get_paragraphs_containing(parsed, item_names, 'Measurement:')

    self.assertEqual(expected, found)
def get_dataset_list():
    """Parse and return expected array of descriptors per dataset.

    The document at ``DELIVERABLE_D4_3_RELATIVE_PATH`` (relative to
    ``eustaceconfig.SYSTEM_PATH``) is opened in binary mode and parsed into
    paragraphs. Dataset names are the paragraphs between
    ``DATASET_CONTENTS_START`` and ``DATASET_CONTENTS_END``. Each name is paired
    with the result of looking it up in the paragraphs that contain
    ``eustaceconfig.WORKSPACE_PATH``.

    Returns:
        A list with one ``CatalogueDataSet(name, path)`` per dataset name, in
        document order. ``path`` is whatever ``paths.get(name)`` yields
        (presumably ``None`` when no matching paragraph exists - confirm against
        ``get_paragraphs_containing``).
    """

    pathname = os.path.normpath(
        os.path.join(eustaceconfig.SYSTEM_PATH, DELIVERABLE_D4_3_RELATIVE_PATH))

    # Use a context manager so the file handle is closed even if parsing fails.
    # Parsing is kept inside the block in case the XML is read lazily from the file.
    with open(pathname, 'rb') as inputfile:
        rawxml = retrieve_document_xml(inputfile)
        paragraphs = parse_docx_paragraphs(rawxml, ignore_trailing_digits=True)

    names = get_paragraphs_between(paragraphs, DATASET_CONTENTS_START, DATASET_CONTENTS_END)
    paths = get_paragraphs_containing(paragraphs, names, eustaceconfig.WORKSPACE_PATH)

    return [CatalogueDataSet(name, paths.get(name)) for name in names]
def read_attributes(self, startmarker, endmarker):
    """Read attributes table from document and return as dictionary.

    Every paragraph between ``startmarker`` and ``endmarker`` is expected to have
    the form ``key: value``. The text before the first ``': '`` becomes the key
    and everything after it becomes the value, so values may themselves
    contain ``': '``. Later occurrences of a key overwrite earlier ones.

    Raises:
        IndexError: if a paragraph does not contain the ``': '`` separator.
    """

    # get relevant paragraphs
    paragraphs_attributes = parsedoc.get_paragraphs_between(
        self.paragraphs_all, startmarker, endmarker)

    # parse attributes table
    attributes = {}
    for paragraph in paragraphs_attributes:

        # split on the first separator only, otherwise a value containing ': '
        # would be silently truncated at its own separator
        column = paragraph.split(': ', 1)
        attributes[column[0]] = column[1]

    # return result
    return attributes
def read_dimensions(self, startmarker, endmarker):
    """Return a dictionary mapping each required dimension name to its size.

    Paragraphs between ``startmarker`` and ``endmarker`` are read as
    ``name: size`` entries. A size equal to ``DIMENSION_UNLIMITED`` is stored as
    None (indicating UNLIMITED); any other size is converted with ``int``.
    """

    section = parsedoc.get_paragraphs_between(self.paragraphs_all, startmarker, endmarker)

    sizes = {}

    # The first paragraph of the section is skipped (presumably a heading row - confirm)
    for entry in section[1:]:
        parts = entry.split(': ')
        size_text = parts[1]
        sizes[parts[0]] = None if size_text == DIMENSION_UNLIMITED else int(size_text)

    return sizes