def _check_empty_line(fh): """Check that the next line in `fh` is empty or whitespace-only.""" line = next(fh, None) if line is None: raise OrdinationFormatError( "Reached end of file while looking for blank line separating " "sections.") if line.strip(): raise OrdinationFormatError("Expected an empty line.")
def _parse_header(fh, header_id, num_dimensions): line = next(fh, None) if line is None: raise OrdinationFormatError( "Reached end of file while looking for %s header." % header_id) header = line.strip().split('\t') # +1 for the header ID if len(header) != num_dimensions + 1 or header[0] != header_id: raise OrdinationFormatError("%s header not found." % header_id) return header
def _parse_array_section(fh, header_id, has_ids=True): """Parse an array section of `fh` identified by `header_id`.""" # Parse the array header header = _parse_header(fh, header_id, 2) # Parse the dimensions of the array rows = int(header[1]) cols = int(header[2]) ids = None if rows == 0 and cols == 0: # The ordination method didn't generate the array data for 'header', so # set it to None data = None elif rows == 0 or cols == 0: # Both dimensions should be 0 or none of them are zero raise OrdinationFormatError("One dimension of %s is 0: %d x %d" % (header_id, rows, cols)) else: # Parse the data data = np.empty((rows, cols), dtype=np.float64) if has_ids: ids = [] for i in range(rows): # Parse the next row of data line = next(fh, None) if line is None: raise OrdinationFormatError( "Reached end of file while looking for row %d in %s " "section." % (i + 1, header_id)) vals = line.strip().split('\t') if has_ids: ids.append(vals[0]) vals = vals[1:] if len(vals) != cols: raise OrdinationFormatError( "Expected %d values, but found %d in row %d." % (cols, len(vals), i + 1)) data[i, :] = np.asarray(vals, dtype=np.float64) data = pd.DataFrame(data, index=ids) return data
def _check_length_against_eigvals(data, eigvals, label): if data is not None: num_vals = data.shape[-1] num_eigvals = eigvals.shape[-1] if num_vals != num_eigvals: raise OrdinationFormatError( "There should be as many %s as eigvals: %d != %d" % (label, num_vals, num_eigvals))
def _parse_vector_section(fh, header_id): header = _parse_header(fh, header_id, 1) # Parse how many values we are waiting for num_vals = int(header[1]) if num_vals == 0: # The ordination method didn't generate the vector, so set it to None vals = None else: # Parse the line with the vector values line = next(fh, None) if line is None: raise OrdinationFormatError( "Reached end of file while looking for line containing values " "for %s section." % header_id) vals = np.asarray(line.strip().split('\t'), dtype=np.float64) if len(vals) != num_vals: raise OrdinationFormatError( "Expected %d values in %s section, but found %d." % (num_vals, header_id, len(vals))) return vals
def _ordination_to_ordination_results(fh): eigvals = _parse_vector_section(fh, 'Eigvals') if eigvals is None: raise OrdinationFormatError("At least one eigval must be present.") _check_empty_line(fh) prop_expl = _parse_vector_section(fh, 'Proportion explained') _check_length_against_eigvals(prop_expl, eigvals, 'proportion explained values') _check_empty_line(fh) species, species_ids = _parse_array_section(fh, 'Species') _check_length_against_eigvals(species, eigvals, 'coordinates per species') _check_empty_line(fh) site, site_ids = _parse_array_section(fh, 'Site') _check_length_against_eigvals(site, eigvals, 'coordinates per site') _check_empty_line(fh) # biplot does not have ids to parse (the other arrays do) biplot, _ = _parse_array_section(fh, 'Biplot', has_ids=False) _check_empty_line(fh) cons, cons_ids = _parse_array_section(fh, 'Site constraints') if cons_ids is not None and site_ids is not None: if cons_ids != site_ids: raise OrdinationFormatError( "Site constraints ids and site ids must be equal: %s != %s" % (cons_ids, site_ids)) return OrdinationResults(eigvals=eigvals, species=species, site=site, biplot=biplot, site_constraints=cons, proportion_explained=prop_expl, species_ids=species_ids, site_ids=site_ids)
def _ordination_to_ordination_results(fh): eigvals = _parse_vector_section(fh, 'Eigvals') if eigvals is None: raise OrdinationFormatError("At least one eigval must be present.") _check_empty_line(fh) prop_expl = _parse_vector_section(fh, 'Proportion explained') _check_length_against_eigvals(prop_expl, eigvals, 'proportion explained values') _check_empty_line(fh) species = _parse_array_section(fh, 'Species') _check_length_against_eigvals(species, eigvals, 'coordinates per species') _check_empty_line(fh) site = _parse_array_section(fh, 'Site') _check_length_against_eigvals(site, eigvals, 'coordinates per site') _check_empty_line(fh) # biplot does not have ids to parse (the other arrays do) biplot = _parse_array_section(fh, 'Biplot', has_ids=False) _check_empty_line(fh) cons = _parse_array_section(fh, 'Site constraints') if cons is not None and site is not None: if not np.array_equal(cons.index, site.index): raise OrdinationFormatError( "Site constraints ids and site ids must be equal: %s != %s" % (cons.index, site.index)) return OrdinationResults( short_method_name='', long_method_name='', eigvals=eigvals, features=species, samples=site, biplot_scores=biplot, sample_constraints=cons, proportion_explained=prop_expl)