def parse_all(self):
    """Yield one record dict for every SIL code from ``get_sil_codes()``.

    Each record holds ``'sil'``, ``'name'`` and ``'alt_names'``.  For every
    category returned by ``get_tabular_data()`` that is also a key of
    ``self.needed_keys``, the record additionally gets
    ``<mapped name>_all`` and ``<mapped name>_online`` entries.

    A code whose processing raises ``ParserException`` produces no record;
    it is remembered, and after all codes have been tried a single
    ``ParserException`` listing the failed codes is raised.
    """
    failed = []
    for sil in self.get_sil_codes():
        try:
            html = self.get_html(sil)
            name = self.get_name(html)
            record = {'sil': sil, 'name': name}
            record['alt_names'] = self.manual_filter(
                name, self.get_alternative_names(html))

            counts = self.get_tabular_data(html)
            if counts is not None:
                for category in counts:
                    if category in self.needed_keys:
                        # Unpack only wanted categories, like the filter
                        # order implies: unwanted values are never touched.
                        total, online = counts[category]
                        record[self.needed_keys[category] + '_all'] = total
                        record[self.needed_keys[category] + '_online'] = \
                            online
            yield record
        except ParserException:
            failed.append(sil)

    if failed:
        raise ParserException(
            "Error in LanguageArchiveParser for following sils: "
            + repr(failed))
def get_country(self, string):
    """Return the text that follows the first tag after the first ``<h2>``.

    The value is what lies between the first ``>`` after ``<h2>`` and the
    next ``</a``.

    Raises ``ParserException`` (naming the original exception type and
    ``self.sil``) if the expected markup is missing.
    """
    try:
        after_heading = string.split('<h2>')[1]
        after_open_tag = after_heading.split('>')[1]
        return after_open_tag.split('</a')[0]
    except Exception as e:
        message = '{0} in EthnologueParser.get_country(), sil:{1}'.format(
            type(e), self.sil)
        raise ParserException(message)
def get_title(self, string):
    """Return the contents of the ``<h1 class="title" id="page-title">`` tag.

    Raises ``ParserException`` (naming the original exception type and
    ``self.sil``) if the opening tag is not present in *string*.
    """
    try:
        after_open = string.split('<h1 class="title" id="page-title">')[1]
        return after_open.split('</h1>')[0]
    except Exception as e:
        message = '{0} in EthnologueParser.get_title() sil:{1}'.format(
            type(e), self.sil)
        raise ParserException(message)
def split_headline(self, headline):
    """Split a table header row into a list of column titles.

    Newlines and ``</th>`` closers are removed, then the text is split on
    ``<th>``.

    Raises ``ParserException`` naming the original exception type if any
    step fails.
    """
    try:
        cleaned = headline.replace('\n', '')
        cleaned = cleaned.replace('</th>', '')
        return cleaned.split('<th>')
    except Exception as e:
        message = '{0} in WikipediaListOfLanguagesParser.split_headline'
        raise ParserException(message.format(type(e)))
def get_row_dict(self, column_titles, cells):
    """Map each column title to the cell found at the same index.

    Cells beyond ``len(column_titles)`` are ignored.

    Raises ``ParserException`` naming the original exception type if the
    mapping cannot be built, e.g. when *cells* is shorter than
    *column_titles*.
    """
    try:
        # enumerate() replaces the Python-2-only xrange() index loop while
        # still indexing ``cells`` so that a short row raises IndexError.
        return dict((title, cells[index])
                    for index, title in enumerate(column_titles))
    except Exception as e:
        raise ParserException(
            '{0} in WikipediaListOfLanguagesParser.get_row_dict'
            .format(type(e)))
def get_attachment(self, string):
    """Return the markup of the "attachment-after" section of a page.

    The result is the text between the first
    ``<div class="attachment attachment-after">`` and the following
    sidebar ``<aside ...>`` opener.

    Raises ``ParserException`` (naming the original exception type and
    ``self.sil``) if the attachment opener is missing.
    """
    try:
        opener = '<div class="attachment attachment-after">'
        sidebar = ('<aside class="grid-6 region region-sidebar-second "id="'
                   'region-sidebar-second">')
        after_opener = string.split(opener)[1]
        return after_opener.split(sidebar)[0]
    except Exception as e:
        message = '{0} in EthnologueParser.get_attachment(), sil:{1}'.format(
            type(e), self.sil)
        raise ParserException(message)
def split_row(self, row):
    """Split one table row into a list of cell strings.

    Newlines are removed, the row is passed through
    ``replace_html_formatting``, and the result is split on ``<td ...>``
    openers; ``</td>`` closers are stripped from every piece.

    Raises ``ParserException`` naming the original exception type if any
    step fails.
    """
    try:
        flattened = replace_html_formatting(row.replace('\n', ''))
        cells = []
        for piece in re.split('<td.*?>', flattened):
            cells.append(piece.replace('</td>', ''))
        return cells
    except Exception as e:
        message = '{0} in WikipediaListOfLanguagesParser.split_row'
        raise ParserException(message.format(type(e)))
def get_attachment_blocks_titles(self, attachment):
    """Split an attachment into fieldset blocks and collect their titles.

    Returns a ``(blocks, titles)`` pair: ``blocks`` are the pieces of
    *attachment* following each fieldset legend opener, and ``titles`` is
    the text of each block up to its first ``</span``.

    Raises ``ParserException`` (naming the original exception type and
    ``self.sil``) if splitting fails.
    """
    try:
        legend = '<legend><span class="fieldset-legend"><span>'
        blocks = attachment.split(legend)[1:]
        titles = []
        for block in blocks:
            titles.append(block.split('</span')[0])
        return blocks, titles
    except Exception as e:
        raise ParserException(
            '{0} in EthnologueParser.get_attachment_blocks_titles(),sil:{1}'
            .format(type(e), self.sil))  # nopep8
def get_attachment_title(self, attachment):
    """Return the first ``<h3>`` text of an attachment that has a header.

    Returns ``None`` when *attachment* contains no
    ``<div class="view-header">``.

    Raises ``ParserException`` (naming the original exception type and
    ``self.sil``) if the header is present but no ``<h3>`` can be found.
    """
    try:
        if '<div class="view-header">' in attachment:
            after_open = attachment.split('<h3>')[1]
            return after_open.split('</h3>')[0]
        return None
    except Exception as e:
        message = ('{0} in EthnologueParser.get_attachment_title()'
                   ' sil:{1}')
        raise ParserException(message.format(type(e), self.sil))
def process_main_table_rows(self, string):
    """Parse every field row of the main table with ``self.parse_row``.

    *string* is split on ``<div class="field-label">``; the piece before
    the first label is dropped, and the last piece is cut at the start of
    the ``attachment-after`` section.  Returns the list of ``parse_row``
    results in page order.

    Raises ``ParserException`` (naming the original exception type and
    ``self.sil``) if anything fails, including a failure in ``parse_row``.
    """
    try:
        pieces = string.split('<div class="field-label">')
        last_row = pieces[-1].split(
            '<div class="attachment attachment-after">')[0]
        rows = pieces[1:-1] + [last_row]
        return [self.parse_row(row) for row in rows]
    except Exception as e:
        message = ('{0} in EthnologueParser.process_main_table_rows()'
                   ' sil:{1}')
        raise ParserException(message.format(type(e), self.sil))
def get_name(self, string):
    """Extract the language name from the page ``<title>``.

    Takes the text between the first ``about the`` and the following
    ``</title>``.  If that text contains ``language``, the part before it
    is returned with surrounding spaces stripped; otherwise the text is
    returned unchanged.

    Raises ``ParserException`` naming the original exception type if
    ``about the`` does not occur in *string*.
    """
    try:
        after_marker = string.split('about the')[1]
        wrapped = after_marker.split('</title>')[0]
        if 'language' not in wrapped:
            return wrapped
        return wrapped.split('language')[0].strip(' ')
    except Exception as e:
        message = '{0} in LanguageArchivesBaseParser.get_name'
        raise ParserException(message.format(type(e)))
def generate_rows(self, tabular):
    """Yield the inner markup of every table row found in *tabular*.

    A row starts at ``<tr>``, optionally followed by one newline, then
    ``<td>``; the yielded text runs from there up to (not including) the
    next ``</tr>``.

    Raises ``ParserException`` naming the original exception type if
    scanning fails.
    """
    try:
        # One left-to-right finditer() pass yields the same rows as
        # repeatedly re-searching a captured copy of the remaining text,
        # without copying that tail once per row.
        pattern = re.compile(r'<tr>[\n]{0,1}<td>(.*?)</tr>', re.DOTALL)
        for match in pattern.finditer(tabular):
            yield match.group(1)
    except Exception as e:
        raise ParserException(
            '{0} in WikipediaListOfLanguagesParser.generate_rows'
            .format(type(e)))
def parse_table(self, item):
    """Count the entries of the first ``<ol>`` list in *item*.

    Returns ``(total, online)``: the number of ``<li>`` entries, and how
    many of them contain ``<span class="online_indicator">``.

    Raises ``ParserException`` naming the original exception type if no
    ``<ol>`` is present.
    """
    try:
        listing = item.split('<ol>')[1].split('</ol>')[0]
        entries = listing.split('<li>')[1:]
        online = sum(
            1 for entry in entries
            if '<span class="online_indicator">' in entry)
        return len(entries), online
    except Exception as e:
        message = '{0} in LanguageArchivesBaseParser.parse_table'
        raise ParserException(message.format(type(e)))
def get_tabular_data(self, html):
    """Map every ``<h2>`` section title to its ``parse_table`` result.

    *html* is split on ``<h2>``; for each piece after the first, the text
    up to ``</h2>`` is the key and ``self.parse_table(piece)`` the value.

    Raises ``ParserException`` naming the original exception type if any
    section cannot be processed.
    """
    try:
        sections = html.split('<h2>')[1:]
        return dict(
            (section.split('</h2>')[0], self.parse_table(section))
            for section in sections)
    except Exception as e:
        message = '{0} in LanguageArchivesBaseParser.get_tabular_data'
        raise ParserException(message.format(type(e)))
def generate_tabulars(self, html):
    """Yield ``(header, tabular)`` for every bordered table in *html*.

    A table starts at ``<table border``.  ``header`` is the text between
    its first ``<th>`` and the next ``</tr>``; ``tabular`` starts at that
    ``</tr>`` and runs up to (not including) the closing ``</table>``.

    Raises ``ParserException`` naming the original exception type if
    scanning fails.
    """
    try:
        # One left-to-right finditer() pass finds the same tables as
        # repeatedly re-searching a captured copy of the text after
        # ``</table>``, without copying that tail once per table.
        pattern = re.compile(
            r'<table border.*?<th>(.*?)(</tr>.*?)</table>', re.DOTALL)
        for match in pattern.finditer(html):
            yield match.group(1), match.group(2)
    except Exception as e:
        raise ParserException(
            '{0} in WikipediaListOfLanguagesParser.generate_tabulars'
            .format(type(e)))
def parse_attachment_block(self, block):
    """Collect the ``<strong>`` label / ``<span>`` value pairs of a block.

    *block* is split on ``<strong class=...>`` openers; each following
    piece that starts with ``label</strong><span class=...>value</span>``
    adds ``label -> value`` to the returned dict.  Pieces that do not
    match are skipped.

    Raises ``ParserException`` (naming the original exception type and
    ``self.sil``) if processing fails.
    """
    try:
        pair_re = re.compile('(.*?)</strong><span class=.*?>(.*?)</span>')
        found = {}
        for chunk in re.split('<strong class=.*?>', block)[1:]:
            hit = pair_re.match(chunk)
            if hit is None:
                continue
            found[hit.group(1)] = hit.group(2)
        return found
    except Exception as e:
        message = ('{0} in EthnologueParser.parse_attachment_block()'
                   ' sil:{1}')
        raise ParserException(message.format(type(e), self.sil))
def parse_row(self, row):
    """Split one main-table row into a ``(key, value)`` pair.

    The key is the stripped text before the first ``</div>``.  The value
    is taken from the first ``<div class="field-item even">`` in the rest
    of the row, or from the first ``<div class="field-item">`` when no
    "even" item exists, up to the following ``</div>``; it is passed
    through ``self.strip_nonstring`` and stripped.

    Raises ``ParserException`` (naming the original exception type, the
    offending row and ``self.sil``) if the row cannot be parsed.
    """
    try:
        key_part, _, value_wrapped = row.partition('</div>')
        key = key_part.strip()

        even_marker = '<div class="field-item even">'
        if even_marker in value_wrapped:
            marker = even_marker
        else:
            marker = '<div class="field-item">'
        value_extras = value_wrapped.split(marker)[1].split('</div>')[0]

        value = self.strip_nonstring(value_extras).strip()
        return key, value
    except Exception as e:
        # Format the whole template at once.  Applying .format() to only
        # the trailing 'sil:{2}' literal left '{0}' and '{1}' unexpanded
        # in the raised message.
        message = ('{0} in EthnologueParser.parse_row(), at row\n{1} '
                   'sil:{2}')
        raise ParserException(message.format(type(e), row, self.sil))
def parse_all(self, **kwargs):
    """Yield one record dict for every code in ``kwargs["sil_codes"]``.

    For each code, ``self.sil`` is set to it and the page is fetched with
    ``get_html``.  The record holds ``'sil'``, ``'name'`` and
    ``'country'``, plus one entry per main-table row whose key is in
    ``self.needed_keys``:

    * ``'Population'`` is stored as ``[("ethnologue", "L1", population)]``
      and also sets ``'eth_ethnic_population'``;
    * ``'Language Status'`` is stored as
      ``[("ethnologue", status, None)]`` and skipped when
      ``normalize_lang_status`` returns ``None``;
    * ``"Alternate Names"`` is split on commas and stripped;
    * every other needed key is stored unchanged.

    A code whose processing raises ``ParserException`` produces no record;
    after all codes have been tried a single ``ParserException`` listing
    the failed codes is raised.
    """
    failed = set()
    for sil_code in kwargs["sil_codes"]:
        try:
            self.sil = sil_code
            html = self.get_html(self.sil)
            record = {'sil': sil_code}
            record['name'] = self.get_title(html)
            record['country'] = self.get_country(html)

            rows = self.process_main_table_rows(html)
            if rows is None:
                rows = ()
            for key, value in rows:
                if key not in self.needed_keys:
                    continue
                if key == 'Population':
                    population, ethnic_population = \
                        self.normalize_population(value)
                    record[self.needed_keys[key]] = [
                        ("ethnologue", "L1", population)]
                    record['eth_ethnic_population'] = ethnic_population
                    continue
                if key == 'Language Status':
                    status = self.normalize_lang_status(value)
                    if status is not None:
                        record[self.needed_keys[key]] = [
                            ("ethnologue", status, None)]
                    continue
                if key == "Alternate Names":
                    value = [part.strip() for part in value.split(",")]
                record[self.needed_keys[key]] = value
            yield record
        except ParserException:
            failed.add(sil_code)

    if failed:
        raise ParserException("error with sils: {0}".format(failed))
def get_html(self, sil, encoding='utf-8'):
    """Return the decoded contents of the cached page for *sil*.

    The page is read from ``<self.basedir>/<sil>`` and decoded with
    *encoding*.

    Raises ``ParserException`` if that file does not exist.
    """
    # Use the ``sil`` argument rather than ``self.sil`` so the parameter
    # is honoured; the visible caller passes ``self.sil`` anyway.
    fn = '{0}/{1}'.format(self.basedir, sil)
    if not os.path.exists(fn):
        raise ParserException('no such file: {0}'.format(fn))
    # Read bytes and decode explicitly: this works on Python 2 and 3, and
    # the ``with`` block closes the file handle deterministically.
    with open(fn, 'rb') as handle:
        return handle.read().decode(encoding)