def _get_vocabulary_group_information(self, vocabulary_group, category, language):
    """Parse the word pool of a vocabulary group.

    Extracts the header text of *vocabulary_group*, splits it by the
    configured split character, and looks up index configuration for a
    list of that length to derive the word-pool and group names.

    :param vocabulary_group: selector for one vocabulary group
        (assumed — TODO confirm against caller)
    :param category: category name (unused here beyond context)
    :param language: language name (unused here beyond context)
    :returns: tuple ``(word_pool_name, group_name, text)`` where *text*
        is the raw header text
    :raises KeyError: if no regex configuration exists for a header
        split into this many words
    """
    group_conf = conf["group_information"]
    vocabulary_group_header = self._get_vocabulary_group_header(vocabulary_group)
    text = vocabulary_group_header.xpath("td[1]//text()").extract_first()
    lis = text.split(group_conf["vocabulary_group_split_character"])
    try:
        regex_conf = group_conf["regex"][str(len(lis))]
    except KeyError:
        # Re-raise after logging: previously regex_conf stayed None and the
        # code crashed one line later with an opaque TypeError. Lazy %-args
        # avoid building the message unless the record is emitted.
        log.error("There was no configuration for lists with %d words", len(lis))
        raise
    # Start from the defaults, then let the first matching special case win.
    word_pool_name_indices = regex_conf["default"]["wordpool_name_indices"]
    group_name_indices = regex_conf["default"]["group_name_indices"]
    if "cases" in regex_conf:  # py3: dict.has_key()/itervalues() were removed
        for case in regex_conf["cases"].values():
            if re.search(case["regex"], text):
                word_pool_name_indices = case["wordpool_name_indices"]
                group_name_indices = case["group_name_indices"]
                break
    word_pool_name = self._get_word_pool_name_from_indices(word_pool_name_indices, lis)
    group_name = self._get_group_name_from_indices(group_name_indices, lis)
    return word_pool_name, group_name, text
def parse(self, response):
    """Parse the category overview page.

    Finds every unstyled table row in the category table, reads the
    category link and label from its first cell, and yields one request
    per category, tagging the request meta with the category name so
    ``parse_category_contents`` can pick it up.
    """
    log.info("Start parsing ...")
    rows = response.xpath('//div[@id="col3_content"]/table[1]/tbody/tr[not(@class)]')
    for row in rows:
        anchor = row.xpath("td[1]//a")
        url = response.urljoin(anchor.xpath("@href").extract_first())
        category_name = anchor.xpath("text()").extract_first().strip()
        # Hand the category name to the detail-page callback via request meta.
        request = scrapy.Request(url, callback=self.parse_category_contents)
        request.meta["category"] = category_name
        yield request