def update_aliases_from_asc(self, filename):
    '''Gets aliases from sample x-ray file and expands them if users settings say to do so

    :param filename: Path to a sample x-ray (.asc) SQLite database.
    Reads the `entity` table and rebuilds self._aliases as
    {full_name: [alias, ...]}.
    '''
    # Open the x-ray database; close the connection even if the query fails
    # (the original leaked the connection).
    connection = connect(filename)
    try:
        rows = connection.cursor().execute('SELECT * FROM entity').fetchall()
    finally:
        connection.close()

    # x[1] is the entity name, x[3] == 1 marks character entities;
    # seed each character's alias list with its own name.
    characters = {x[1]: [x[1]] for x in rows if x[3] == 1}

    self._aliases = {}
    for alias, fullname in auto_expand_aliases(characters).items():
        # Group expanded aliases by the character they belong to.
        self._aliases.setdefault(fullname, []).append(alias)
def get_characters(self, entity_id):
    '''Gets book's character data

    Scrapes each character page linked from the book page, collecting a
    label, description, and alias list per character.

    :param entity_id: First integer id to assign; incremented per character.
    :return: dict mapping entity id -> {'label', 'description', 'aliases'},
             or None when no page source has been loaded.
    '''
    if self._page_source is None:
        return
    characters = self._page_source.xpath(
        '//div[@class="clearFloats" and contains(., "Characters")]'
        '//div[@class="infoBoxRowItem"]//a'
    )
    character_data = {}
    for char in characters:
        # Skip links in the Characters box that are not character pages.
        if '/characters/' not in char.get('href'):
            continue
        resp = open_url(self._connection, char.get('href'))
        if not resp:
            continue
        char_page = html.fromstring(resp)
        if char_page is None:
            continue

        # Collapse runs of whitespace in the description.
        # NOTE: lxml returns str in Python 3 — the original called
        # .decode('utf-8').encode('latin-1') on it, which raises
        # AttributeError on str; the round-trip is dropped here.
        desc_nodes = char_page.xpath(
            '//div[@class="workCharacterAboutClear"]/text()')
        desc = re.sub(r'\s+', ' ', desc_nodes[0]).strip() if desc_nodes else ''
        if not desc:
            desc = 'No description found on Goodreads.'

        alias_list = char_page.xpath(
            '//div[@class="grey500BoxContent" and contains(.,"aliases")]/text()'
        )
        # Each text node may hold several comma-separated aliases.
        alias_list = [
            re.sub(r'\s+', ' ', x).strip()
            for aliases in alias_list
            for x in aliases.split(',')
            if re.sub(r'\s+', ' ', x).strip()
        ]
        character_data[entity_id] = {
            'label': char.text,
            'description': desc,
            'aliases': alias_list
        }
        entity_id += 1

    if prefs['expand_aliases']:
        # Feed {id: [label, *aliases]} to the expander, then fold the
        # generated aliases back onto the owning character.
        characters = {}
        for char, char_data in list(character_data.items()):
            characters[char] = [char_data['label']] + char_data['aliases']
        expanded_aliases = auto_expand_aliases(characters)
        for alias, ent_id in list(expanded_aliases.items()):
            character_data[ent_id]['aliases'].append(alias)
    return character_data
def get_characters(self, entity_id):
    '''Gets book's character data'''
    if self._page_source is None:
        return

    char_links = self._page_source.xpath('//div[@class="clearFloats" and contains(., "Characters")]//div[@class="infoBoxRowItem"]//a')
    character_data = {}
    for link in char_links:
        href = link.get('href')
        # Only follow actual character pages from the Characters box.
        if '/characters/' not in href:
            continue
        response = open_url(self._connection, href)
        if not response:
            continue
        page = html.fromstring(response)
        if page is None:
            continue

        # Description: collapse whitespace; fall back to a stock message.
        about = page.xpath('//div[@class="workCharacterAboutClear"]/text()')
        if about and re.sub(r'\s+', ' ', about[0]).strip():
            desc = unicode(re.sub(r'\s+', ' ', about[0]).strip().decode('utf-8').encode('latin-1'))
        else:
            desc = u'No description found on Goodreads.'

        # Aliases arrive as comma-separated text nodes; normalize each piece.
        raw_aliases = page.xpath('//div[@class="grey500BoxContent" and contains(.,"aliases")]/text()')
        cleaned = []
        for chunk in raw_aliases:
            for part in chunk.split(','):
                normalized = re.sub(r'\s+', ' ', part).strip()
                if normalized:
                    cleaned.append(normalized)

        character_data[entity_id] = {
            'label': unicode(link.text.decode('utf-8').encode('latin-1')),
            'description': desc,
            'aliases': cleaned,
        }
        entity_id += 1

    if prefs['expand_aliases']:
        # Build {id: [label, *aliases]}, expand, then attach generated
        # aliases back to the character that owns them.
        name_map = {}
        for key, data in character_data.items():
            name_map[key] = [data['label']] + data['aliases']
        for alias, ent_id in auto_expand_aliases(name_map).items():
            character_data[ent_id]['aliases'].append(alias)

    return character_data