def parse_all(self):
        """Yield one record dict per SIL code, reporting failures at the end.

        Every record carries ``sil``, ``name`` and ``alt_names``.  For each
        category of the tabular data that is also a key of
        ``self.needed_keys``, the mapped name is used to add an ``..._all``
        and an ``..._online`` entry.

        Raises:
            ParserException: once all codes have been tried, if any of them
                failed with a ParserException; the message lists those codes.
        """
        failed = []
        for sil in self.get_sil_codes():
            try:
                html = self.get_html(sil)
                name = self.get_name(html)
                alt_names = self.get_alternative_names(html)
                record = {
                    'sil': sil,
                    'name': name,
                    'alt_names': self.manual_filter(name, alt_names),
                }
                table = self.get_tabular_data(html)
                if table is not None:
                    for category in table:
                        if category in self.needed_keys:
                            total, online = table[category]
                            prefix = self.needed_keys[category]
                            record[prefix + '_all'] = total
                            record[prefix + '_online'] = online
                yield record
            except ParserException:
                # Remember the failing code and carry on with the next one.
                failed.append(sil)

        if failed:
            raise ParserException(
                "Error in LanguageArchiveParser for following sils: " +
                repr(failed))
Example #2
0
 def get_country(self, string):
     """Return the text after the first ``>`` following the first ``<h2>``.

     The text is cut at ``</a``.

     Raises:
         ParserException: if any step fails; the message names the
             original exception type and ``self.sil``.
     """
     try:
         after_heading = string.split('<h2>')[1]
         after_tag = after_heading.split('>')[1]
         return after_tag.split('</a')[0]
     except Exception as e:
         message = '{0} in EthnologueParser.get_country(), sil:{1}'.format(
             type(e), self.sil)
         raise ParserException(message)
Example #3
0
 def get_title(self, string):
     """Return the text between the page-title ``<h1>`` tag and ``</h1>``.

     Raises:
         ParserException: if the opening tag is missing or *string* cannot
             be split; the message names the original exception type and
             ``self.sil``.
     """
     opening_tag = '<h1 class="title" id="page-title">'
     try:
         after_opening = string.split(opening_tag)[1]
         return after_opening.split('</h1>')[0]
     except Exception as e:
         message = '{0} in EthnologueParser.get_title() sil:{1}'.format(
             type(e), self.sil)
         raise ParserException(message)
Example #4
0
    def split_headline(self, headline):
        """Split a table header row into its ``<th>`` cells.

        Newlines and ``</th>`` closers are removed first; the remainder is
        split on ``<th>``, so the text before the first ``<th>`` is the first
        element of the returned list.

        Raises:
            ParserException: if *headline* cannot be processed.
        """
        try:
            flattened = headline.replace('\n', '')
            without_closers = flattened.replace('</th>', '')
            return without_closers.split('<th>')
        except Exception as e:
            message = '{0} in WikipediaListOfLanguagesParser.split_headline'
            raise ParserException(message.format(type(e)))
Example #5
0
    def get_row_dict(self, column_titles, cells):
        """Map each column title to the cell at the same position.

        Cells beyond the number of titles are ignored.  Positional indexing
        (rather than ``zip``) is kept on purpose so that a row with fewer
        cells than titles still fails instead of being silently truncated.

        Raises:
            ParserException: if *cells* is shorter than *column_titles* or
                the mapping cannot be built.
        """
        try:
            # enumerate() replaces the Python-2-only xrange() index loop.
            return {title: cells[index]
                    for index, title in enumerate(column_titles)}
        except Exception as e:
            raise ParserException(
                '{0} in WikipediaListOfLanguagesParser.get_row_dict'
                .format(type(e)))
Example #6
0
 def get_attachment(self, string):
     """Return the markup after the attachment ``<div>``, cut at the sidebar.

     The result starts right after the first
     ``<div class="attachment attachment-after">`` and ends before the
     second-sidebar ``<aside>`` tag when that tag is present.

     Raises:
         ParserException: if the attachment marker is missing or *string*
             cannot be split; the message names the original exception
             type and ``self.sil``.
     """
     start_marker = '<div class="attachment attachment-after">'
     end_marker = ('<aside class="grid-6 region region-sidebar-second "id="'
                   'region-sidebar-second">')
     try:
         after_start = string.split(start_marker)[1]
         return after_start.split(end_marker)[0]
     except Exception as e:
         message = '{0} in EthnologueParser.get_attachment(), sil:{1}'
         raise ParserException(message.format(type(e), self.sil))
Example #7
0
    def split_row(self, row):
        """Split a table row into cell strings.

        Newlines are removed, the row is passed through
        ``replace_html_formatting``, and the result is split on opening
        ``<td ...>`` tags; ``</td>`` closers are removed from every piece.

        Raises:
            ParserException: if any step fails.
        """
        try:
            flattened = replace_html_formatting(row.replace('\n', ''))
            cells = []
            for piece in re.split('<td.*?>', flattened):
                cells.append(piece.replace('</td>', ''))
            return cells
        except Exception as e:
            message = '{0} in WikipediaListOfLanguagesParser.split_row'
            raise ParserException(message.format(type(e)))
Example #8
0
 def get_attachment_blocks_titles(self, attachment):
     """Split *attachment* into fieldset blocks and collect their titles.

     Returns:
         A ``(blocks, titles)`` pair: ``blocks`` are the pieces following
         each fieldset-legend marker, and ``titles`` holds, per block, the
         text before its first ``</span``.

     Raises:
         ParserException: if *attachment* cannot be processed; the message
             names the original exception type and ``self.sil``.
     """
     legend_marker = '<legend><span class="fieldset-legend"><span>'
     try:
         blocks = attachment.split(legend_marker)[1:]
         titles = []
         for block in blocks:
             titles.append(block.split('</span')[0])
         return blocks, titles
     except Exception as e:
         message = (
             '{0} in EthnologueParser.get_attachment_blocks_titles(),sil:{1}')
         raise ParserException(message.format(type(e), self.sil))
Example #9
0
 def get_attachment_title(self, attachment):
     """Return the first ``<h3>`` text of *attachment*, or None.

     None is returned when *attachment* contains no
     ``<div class="view-header">``.

     Raises:
         ParserException: if the header is present but no ``<h3>`` can be
             extracted; the message names the original exception type and
             ``self.sil``.
     """
     try:
         if '<div class="view-header">' in attachment:
             after_h3 = attachment.split('<h3>')[1]
             return after_h3.split('</h3>')[0]
         return None
     except Exception as e:
         message = ('{0} in EthnologueParser.get_attachment_title()'
                    ' sil:{1}')
         raise ParserException(message.format(type(e), self.sil))
Example #10
0
 def process_main_table_rows(self, string):
     """Parse every ``field-label`` row of the main table.

     *string* is split on ``<div class="field-label">``; the piece before
     the first marker is dropped and the last piece is cut at the
     attachment ``<div>``.  Each remaining piece goes through
     ``self.parse_row``.

     Returns:
         A list with one ``self.parse_row`` result per row.

     Raises:
         ParserException: if splitting or row parsing fails; the message
             names the original exception type and ``self.sil``.
     """
     try:
         pieces = string.split('<div class="field-label">')
         last_row = pieces[-1].split(
             '<div class="attachment attachment-after">')[0]
         rows = pieces[1:-1] + [last_row]
         return [self.parse_row(row) for row in rows]
     except Exception as e:
         message = ('{0} in EthnologueParser.process_main_table_rows()'
                    ' sil:{1}')
         raise ParserException(message.format(type(e), self.sil))
    def get_name(self, string):
        """Extract the language name from the page ``<title>``.

        The name is the text between ``about the`` and ``</title>``.  If
        that text contains ``language``, only the part before it is kept and
        surrounding spaces are stripped; otherwise the text is returned
        unchanged (not stripped).

        Raises:
            ParserException: if ``about the`` is missing or *string* cannot
                be split.
        """
        try:
            wrapped = string.split('about the')[1].split('</title>')[0]
            before, found, _ = wrapped.partition('language')
            if found:
                return before.strip(' ')
            return wrapped
        except Exception as e:
            message = '{0} in LanguageArchivesBaseParser.get_name'
            raise ParserException(message.format(type(e)))
Example #12
0
    def generate_rows(self, tabular):
        """Yield the contents of each ``<tr><td>...</tr>`` row in *tabular*.

        Each yielded string starts after the row's first ``<td>`` and ends
        before its ``</tr>``.  The search then continues in the text from
        that ``</tr>`` onwards.

        Raises:
            ParserException: if an exception occurs while iterating.
        """
        try:
            row_pattern = re.compile(
                '<tr>[\n]{0,1}<td>(.*?)(</tr>.*)', re.DOTALL)
            remaining = tabular
            while True:
                found = row_pattern.search(remaining)
                if found is None:
                    break
                row, remaining = found.groups()
                yield row
        except Exception as e:
            message = '{0} in WikipediaListOfLanguagesParser.generate_rows'
            raise ParserException(message.format(type(e)))
    def parse_table(self, item):
        """Count the ``<li>`` entries of the first ``<ol>`` list in *item*.

        Returns:
            A ``(total, online)`` tuple, where ``online`` counts the entries
            containing ``<span class="online_indicator">``.

        Raises:
            ParserException: if *item* has no ``<ol>`` or cannot be split.
        """
        try:
            listing = item.split('<ol>')[1].split('</ol>')[0]
            entries = listing.split('<li>')[1:]
            online = sum(
                1 for entry in entries
                if '<span class="online_indicator">' in entry)
            return len(entries), online
        except Exception as e:
            message = '{0} in LanguageArchivesBaseParser.parse_table'
            raise ParserException(message.format(type(e)))
    def get_tabular_data(self, html):
        """Map every ``<h2>`` heading in *html* to its ``parse_table`` result.

        *html* is split on ``<h2>``; the text before the first heading is
        ignored.  For each section the key is the text before ``</h2>`` and
        the value is ``self.parse_table(section)``.

        Raises:
            ParserException: if splitting or ``parse_table`` fails.
        """
        try:
            counts_by_category = {}
            sections = html.split('<h2>')[1:]
            for section in sections:
                heading = section.split('</h2>')[0]
                counts_by_category[heading] = self.parse_table(section)
            return counts_by_category
        except Exception as e:
            message = '{0} in LanguageArchivesBaseParser.get_tabular_data'
            raise ParserException(message.format(type(e)))
Example #15
0
    def generate_tabulars(self, html):
        """Yield ``(header, tabular)`` for each ``<table border...>`` in *html*.

        ``header`` is the text after the table's first ``<th>`` up to the
        next ``</tr>``; ``tabular`` runs from that ``</tr>`` to just before
        ``</table>``.  The search then continues in the text after
        ``</table>``.

        Raises:
            ParserException: if an exception occurs while iterating.
        """
        try:
            table_pattern = re.compile(
                '<table border.*?<th>(.*?)'
                '(</tr>.*?)</table>(.*)', re.DOTALL)
            remaining = html
            while True:
                found = table_pattern.search(remaining)
                if found is None:
                    break
                header, tabular, remaining = found.groups()
                yield header, tabular
        except Exception as e:
            message = (
                '{0} in WikipediaListOfLanguagesParser.generate_tabulars')
            raise ParserException(message.format(type(e)))
Example #16
0
 def parse_attachment_block(self, block):
     """Collect ``<strong>`` label / ``<span>`` value pairs from *block*.

     *block* is split on opening ``<strong class=...>`` tags.  Every piece
     that begins with ``label</strong><span class=...>value</span>``
     contributes ``label -> value``; pieces that do not match are skipped.

     Raises:
         ParserException: if *block* cannot be processed; the message
             names the original exception type and ``self.sil``.
     """
     try:
         pair_pattern = re.compile(
             '(.*?)</strong><span class=.*?>(.*?)</span>')
         pairs = {}
         for piece in re.split('<strong class=.*?>', block)[1:]:
             found = pair_pattern.match(piece)
             if found is None:
                 continue
             label, text = found.groups()
             pairs[label] = text
         return pairs
     except Exception as e:
         message = ('{0} in EthnologueParser.parse_attachment_block()'
                    ' sil:{1}')
         raise ParserException(message.format(type(e), self.sil))
Example #17
0
 def parse_row(self, row):
     """Split one main-table row into a ``(key, value)`` pair.

     The key is the stripped text before the first ``</div>``.  The value
     is taken from the first ``field-item even`` div when one is present,
     otherwise from the first plain ``field-item`` div; it is cut at the
     next ``</div>``, passed through ``self.strip_nonstring`` and stripped.

     Raises:
         ParserException: if the row lacks the expected markup; the
             message contains the original exception type, the row and
             ``self.sil``.
     """
     try:
         label, _, value_wrapped = row.partition('</div>')
         key = label.strip()
         even_marker = '<div class="field-item even">'
         if even_marker in value_wrapped:
             marker = even_marker
         else:
             marker = '<div class="field-item">'
         value_extras = value_wrapped.split(marker)[1].split('</div>')[0]
         value = self.strip_nonstring(value_extras).strip()
         return key, value
     except Exception as e:
         # One literal, one format() call: with the former `a + b.format()`
         # only the second half was formatted, leaving "{0}"/"{1}" verbatim.
         raise ParserException(
             '{0} in EthnologueParser.parse_row(), at row\n{1} '
             'sil:{2}'.format(type(e), row, self.sil))
Example #18
0
    def parse_all(self, **kwargs):
        """Yield one record dict per code in ``kwargs["sil_codes"]``.

        ``self.sil`` is set to the code being processed.  Every record has
        ``sil``, ``name`` and ``country``; main-table rows whose key is in
        ``self.needed_keys`` are stored under the mapped name:

        * ``Population`` becomes ``[("ethnologue", "L1", population)]`` and
          also sets ``eth_ethnic_population``.
        * ``Language Status`` becomes ``[("ethnologue", status, None)]`` and
          is skipped when the normalized status is None.
        * ``Alternate Names`` becomes a list of comma-separated, stripped
          names.
        * Any other needed key is stored unchanged.

        Raises:
            ParserException: once all codes have been tried, if any of them
                failed with a ParserException; the message lists those codes.
        """
        failed = set()
        for sil_code in kwargs["sil_codes"]:
            try:
                self.sil = sil_code
                html = self.get_html(self.sil)
                record = {
                    'sil': sil_code,
                    'name': self.get_title(html),
                    'country': self.get_country(html),
                }
                rows = self.process_main_table_rows(html)
                if rows is None:
                    rows = []
                for key, value in rows:
                    if key not in self.needed_keys:
                        continue
                    if key == 'Population':
                        population, ethnic_population = \
                            self.normalize_population(value)
                        record[self.needed_keys[key]] = [
                            ("ethnologue", "L1", population)]
                        record['eth_ethnic_population'] = ethnic_population
                    elif key == 'Language Status':
                        status = self.normalize_lang_status(value)
                        if status is not None:
                            record[self.needed_keys[key]] = [
                                ("ethnologue", status, None)]
                    elif key == "Alternate Names":
                        record[self.needed_keys[key]] = [
                            s.strip() for s in value.split(",")]
                    else:
                        record[self.needed_keys[key]] = value
                yield record
            except ParserException:
                # Remember the failing code and carry on with the next one.
                failed.add(sil_code)

        if failed:
            raise ParserException("error with sils: {0}".format(failed))
Example #19
0
 def get_html(self, sil, encoding='utf-8'):
     """Return the decoded contents of the file named *sil* in ``self.basedir``.

     Args:
         sil: file name under ``self.basedir`` to read.
         encoding: codec used to decode the file's bytes.

     Raises:
         ParserException: if the file does not exist.
     """
     # Use the argument rather than self.sil so the method reads the file
     # the caller actually asked for.
     fn = '{0}/{1}'.format(self.basedir, sil)
     if not os.path.exists(fn):
         raise ParserException(
             'no file for sil {0} at {1}'.format(sil, fn))
     # Read bytes and decode explicitly; the context manager closes the
     # handle, and bytes.decode works on both Python 2 and 3.
     with open(fn, 'rb') as handle:
         return handle.read().decode(encoding)