def test_numeric_exception(self):
    """
    NUMERIC sorting of ids, some of which contain no digits
    (``d`` and ``c``), must return the ids in their original order.
    """
    bad_ids = [u"b002", u"d", u"c", u"a1"]
    idsa = IDSortingAlgorithm(IDSortingAlgorithm.NUMERIC)
    sids = idsa.sort(bad_ids)
    # assertEqual (rather than assertTrue(a == b)) reports both lists
    # and their difference when the check fails
    self.assertEqual(sids, bad_ids)
def _read_unparsed(self, lines, parameters):
    """
    Read text fragments from an unparsed format text file.

    The lines are joined and parsed into a soup object; the elements
    whose ``class`` and/or ``id`` attributes match the regexes given
    in ``parameters`` are collected, their ids are ordered with the
    requested ``IDSortingAlgorithm``, and the resulting
    ``[id, [text]]`` pairs are passed to ``_create_text_fragments``.

    :param lines: the lines of the unparsed text file
    :type  lines: list of strings
    :param parameters: additional parameters for parsing
                       (e.g., class/id regex strings)
    :type  parameters: dict
    """

    def compile_filter(parameter_key, log_format):
        """
        Return the compiled regex for the given parameter key,
        or ``None`` if the parameter is missing or set to ``None``.
        """
        if parameter_key not in parameters:
            return None
        regex_string = parameters[parameter_key]
        if regex_string is None:
            return None
        self._log([log_format, regex_string])
        # the user regex must match as a whole word anywhere in the value
        return re.compile(r".*\b" + regex_string + r"\b.*")

    #
    # TODO better and/or parametric parsing,
    #      for example, removing tags but keeping text, etc.
    #
    self._log("Parsing fragments from unparsed text format")

    # get filter attributes: class first, then id
    attributes = dict()
    class_regex = compile_filter(
        gc.PPN_JOB_IS_TEXT_UNPARSED_CLASS_REGEX,
        "Regex for class: '%s'"
    )
    if class_regex is not None:
        attributes['class'] = class_regex
    id_regex = compile_filter(
        gc.PPN_JOB_IS_TEXT_UNPARSED_ID_REGEX,
        "Regex for id: '%s'"
    )
    if id_regex is not None:
        attributes['id'] = id_regex

    # get id sorting algorithm, falling back to UNSORTED
    if gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT in parameters:
        id_sort = parameters[gc.PPN_JOB_IS_TEXT_UNPARSED_ID_SORT]
    else:
        id_sort = IDSortingAlgorithm.UNSORTED
    self._log(["Sorting text fragments using '%s'", id_sort])

    # transform text in a soup object
    self._log("Creating soup")
    soup = BeautifulSoup.BeautifulSoup("\n".join(lines))

    # extract according to class_regex and id_regex
    self._log(["Finding elements matching attributes '%s'", attributes])
    text_from_id = dict()
    ids = []
    for node in soup.findAll(attrs=attributes):
        try:
            node_id = node['id']
            node_text = node.text
            text_from_id[node_id] = node_text
            ids.append(node_id)
        except KeyError:
            # a matched node without an id attribute is skipped
            self._log("KeyError while parsing a node", Logger.WARNING)

    # sort by ID as requested
    self._log("Sorting text fragments")
    sorted_ids = IDSortingAlgorithm(id_sort).sort(ids)

    # append to fragments
    self._log("Appending fragments")
    pairs = [[key, [text_from_id[key]]] for key in sorted_ids]
    self._create_text_fragments(pairs)
def test_numeric(self):
    """
    NUMERIC sorting of ``self.IDS`` must yield the ids ordered by
    the number they contain (1, 2, 3, 4 in the expected list).
    """
    expected = [u"b001", u"a2", u"c03", u"d4"]
    idsa = IDSortingAlgorithm(IDSortingAlgorithm.NUMERIC)
    sids = idsa.sort(self.IDS)
    # assertEqual (rather than assertTrue(a == b)) reports both lists
    # and their difference when the check fails
    self.assertEqual(sids, expected)
def test_lexicographic(self):
    """
    LEXICOGRAPHIC sorting of ``self.IDS`` must yield the ids in
    string order (a2, b001, c03, d4).
    """
    expected = [u"a2", u"b001", u"c03", u"d4"]
    idsa = IDSortingAlgorithm(IDSortingAlgorithm.LEXICOGRAPHIC)
    sids = idsa.sort(self.IDS)
    # assertEqual (rather than assertTrue(a == b)) reports both lists
    # and their difference when the check fails
    self.assertEqual(sids, expected)
def test_unsorted(self):
    """
    UNSORTED "sorting" of ``self.IDS`` must yield
    b001, c03, d4, a2.
    """
    expected = [u"b001", u"c03", u"d4", u"a2"]
    idsa = IDSortingAlgorithm(IDSortingAlgorithm.UNSORTED)
    sids = idsa.sort(self.IDS)
    # assertEqual (rather than assertTrue(a == b)) reports both lists
    # and their difference when the check fails
    self.assertEqual(sids, expected)
def test_invalid_algorithm(self):
    """
    Creating an ``IDSortingAlgorithm`` with an unknown algorithm
    name must raise ``ValueError``.
    """
    with self.assertRaises(ValueError):
        # only the constructor call matters: the instance is not
        # used, so it is not bound to a name
        IDSortingAlgorithm(u"foo")
def _read_unparsed(self, lines):
    """
    Read text fragments from an unparsed format text file.

    The lines are joined and parsed with BeautifulSoup (``lxml``
    parser); the elements whose ``class`` and/or ``id`` attributes
    match the regexes found in ``self.parameters`` are collected,
    their ids are ordered with the requested ``IDSortingAlgorithm``
    (``UNSORTED`` by default), and the resulting ``(id, [text])``
    pairs are passed to ``_create_text_fragments``.

    :param list lines: the lines of the unparsed text file
    """
    from bs4 import BeautifulSoup

    #
    # TODO better and/or parametric parsing,
    #      for example, removing tags but keeping text, etc.
    #
    self.log(u"Parsing fragments from unparsed text format")

    # transform text in a soup object
    self.log(u"Creating soup")
    soup = BeautifulSoup("\n".join(lines), "lxml")

    # build the bs4 attribute filters from the class/id regex parameters
    regex_sources = [
        ("class", gc.PPN_TASK_IS_TEXT_UNPARSED_CLASS_REGEX),
        ("id", gc.PPN_TASK_IS_TEXT_UNPARSED_ID_REGEX)
    ]
    attribute_filters = {}
    for attribute_name, parameter_key in regex_sources:
        if parameter_key not in self.parameters:
            continue
        pattern = self.parameters[parameter_key]
        if pattern is None:
            continue
        self.log([u"Regex for %s: '%s'", attribute_name, pattern])
        # the user regex must match as a whole word anywhere in the value
        attribute_filters[attribute_name] = re.compile(
            r".*\b" + pattern + r"\b.*"
        )

    # extract according to class_regex and id_regex
    self.log([
        u"Finding elements matching attributes '%s'",
        attribute_filters
    ])
    text_from_id = {}
    ids = []
    for node in soup.findAll(attrs=attribute_filters):
        try:
            node_id = gf.safe_unicode(node["id"])
            node_text = gf.safe_unicode(node.text)
            text_from_id[node_id] = node_text
            ids.append(node_id)
        except KeyError:
            # a matched node without an id attribute is skipped
            self.log_warn(u"KeyError while parsing a node")

    # sort by ID as requested
    id_sort = gf.safe_get(
        dictionary=self.parameters,
        key=gc.PPN_TASK_IS_TEXT_UNPARSED_ID_SORT,
        default_value=IDSortingAlgorithm.UNSORTED,
        can_return_none=False
    )
    self.log([u"Sorting text fragments using '%s'", id_sort])
    sorted_ids = IDSortingAlgorithm(id_sort).sort(ids)

    # append to fragments
    self.log(u"Appending fragments")
    pairs = [(key, [text_from_id[key]]) for key in sorted_ids]
    self._create_text_fragments(pairs)