def parse(self, bs_in, fp_out):
    tables = bs_in.find_all('table')
    tparser = HtmlTableParser()
    founded_t = None
    founded_t_num = 0
    for table in tables:
        if not tparser.has_child_tables(table):
            arr = tparser(table)
            # a results table is assumed to have more than 5 rows
            if len(arr) > 5:
                founded_t = arr
                founded_t_num += 1
    if founded_t is not None and founded_t_num > 1:
        self.logger.error("Found more than one results table (%s) in %s"
                          % (str(founded_t_num), self.filename_in_full))
        return []
    if founded_t is None:
        self.logger.error("Could not find any results table in " + self.filename_in_full)
        return []
    # table found: keep only 2- and 3-cell rows; blank-led and 2-cell rows
    # become '-----' separator lines in the TSV
    out_2_file = []
    for row in founded_t:
        if len(row) > 3 or len(row) < 2:
            continue
        if row[0].strip() == '' or len(row) == 2:
            out_2_file.append(['-----'])
            continue
        out_2_file.append([_.strip() for _ in row])
    array2d_2tsv(out_2_file, fp_out)
    return []
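# array2d_2tsv is an imported helper not shown in this file; the sketch below
# is only an assumption about its contract (one tab-separated line per row,
# written to an already-open file handle), added as a reading aid -- it is not
# the project's actual implementation.
def array2d_2tsv_sketch(rows, fp_out):
    """Write a 2-D list of cells as TSV lines to an open text file."""
    for row in rows:
        fp_out.write('\t'.join(str(cell) for cell in row) + '\n')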
def parse(self, bs_in, fp_out):
    tbl = bs_in.find_all('table')[7]  # the results table is the eighth <table> on the page
    row_parser = CombinedRowExtractor(row_extractor_simple, row_extractor_href)
    t_parser = HtmlTableParser(row_parser)
    # carry dates and regions down the rows:
    # table -> [[i, date, region, subregion, value, href]]
    currdate = None
    curregion = None
    cursubregion = None
    newt = []
    i = 0
    for row in t_parser(tbl):
        if len(row) == 1:
            # a single-cell row carries the date for the rows below it
            currdate = row[0]
        else:
            if len(row) == 3:
                # a three-cell row also names the region; cells indented with
                # non-breaking spaces are subregions
                if row[0].strip() != '':
                    if row[0].startswith('\xa0\xa0'):
                        cursubregion = row[0]
                    else:
                        curregion = row[0]
                        cursubregion = ''
                newt.append([_.strip() for _ in
                             [str(i), currdate, curregion, cursubregion, row[1], row[2]]])
            else:
                newt.append([_.strip() for _ in
                             [str(i), currdate, curregion, cursubregion, row[0], row[1]]])
            assert 'http' in newt[i][5]
            i += 1
    array2d_2tsv(newt, fp_out)
    # [[href, filename, parser], ...]
    return ([row[5], self.make_filename_out(str(row[0])), self.subj_parser(self.root_path)]
            for row in newt)
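# Each parse method returns [href, filename, parser] triples that a crawl loop
# is expected to follow. The sketch below is a hypothetical reading of that
# contract (the real driver is not in this file); download_page is an invented
# fetch helper, and using filename for the TSV output is an assumption.
def crawl_sketch(jobs):
    from bs4 import BeautifulSoup
    jobs = list(jobs)  # parse() may return a generator
    while jobs:
        href, filename, parser = jobs.pop(0)
        html = download_page(href)  # hypothetical fetch helper
        with open(filename, 'w', encoding='utf-8') as fp_out:
            # parse() writes TSV to fp_out and may queue further pages
            jobs.extend(parser.parse(BeautifulSoup(html, 'html.parser'), fp_out))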
def parse(self, bs_in, fp_out):
    version = self.detect_version(bs_in)
    out_return = []
    out_2_file = []
    for lnk in get_candlink(bs_in):
        out_return.append([
            lnk[1],
            self.make_filename_out('candidates' if version == 1 else 'candidates1'),
            RawParserCandidatesList(root_path=self.root_path, version=version)
        ])
        out_2_file.append(lnk)
    for lnk in get_reslink(bs_in):
        out_return.append([
            lnk[1],
            self.make_filename_out('results'),
            RawParserResultsSummary(root_path=self.root_path, version=version)
        ])
        out_2_file.append(lnk)
    array2d_2tsv(out_2_file, fp_out)
    # exactly one candidates href and one results href are expected
    if len(out_2_file) != 2:
        self.logger.error(
            'parsing %s for candidates and results hrefs found %s hrefs, expected two'
            % (self.filename_in, str(len(out_2_file))))
        return []
    return out_return
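# get_candlink and get_reslink are imported helpers; from the way lnk[1] is
# used as an href above, they appear to return [link_text, href] pairs. The
# sketch below is an assumption for illustration, including the hypothetical
# keyword filter -- it is not the real implementation.
def get_candlink_sketch(bs_in):
    return [[a.get_text(strip=True), a['href']]
            for a in bs_in.find_all('a', href=True)
            if 'кандидат' in a.get_text().lower()]  # hypothetical keyword filter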
def parse(self, bs_in, fp_out):
    arr = self.parse_candlist(bs_in, 'thead')
    if arr is None:
        self.logger.error("Candidates not found in " + self.filename_in_full)
        return []
    basename = os.path.dirname(self.filename_in_full)
    # full name is in the second column, the href is in the last one
    array2d_2tsv(arr, fp_out)
    return [[
        row[-1],
        os.path.join(basename, row[1] + '.html'),
        RawParserCandidateCard(self.root_path, version=2)
    ] for row in arr]
def parse(self, bs_in, fp_out):
    cands = get_candlink(bs_in)
    if self.version is None:
        self.version = self.detect_version(bs_in)
    out_return = []
    out_2_file = []
    if cands is None:
        self.logger.error("Cands not found in " + self.filename_in_full)
        return []
    for lnk in cands:
        out_return.append([
            lnk[1],
            self.make_filename_out('candidates' if self.version == 1 else 'candidates1'),
            RawParserCandidatesList(root_path=self.root_path, version=self.version)
        ])
        out_2_file.append(lnk)
    subregs = self.find_subregions(bs_in)
    if subregs is not None and len(subregs) > 0:
        self.logger.info("subregs found in " + self.filename_in_full)
        for subreg in subregs:
            out_return.append([
                subreg[1],
                self.make_filename_out(subreg[0].split(' ')[0]),
                RawParserMajorSubjPageSubreg(root_path=self.root_path, version=self.version)
            ])
            out_2_file.append(subreg)
        # a page with subregions delegates results to the subregion pages,
        # so stop here without looking for a results link
        array2d_2tsv(out_2_file, fp_out)
        return out_return
    reslinks = get_reslink(bs_in)
    if len(reslinks) != 1:
        self.logger.error(
            "trouble getting the results link in %s; got %s reslinks, expected one"
            % (self.filename_in_full, str(len(reslinks))))
    else:
        out_return.append([
            reslinks[0][1],
            self.make_filename_out('results'),
            RawParserResultsSummary(self.root_path, self.version)
        ])
        out_2_file.append(reslinks[0])
    array2d_2tsv(out_2_file, fp_out)
    return out_return
def parse(self, bs_in, fp_out):
    reslinks = get_reslink(bs_in)
    if len(reslinks) != 1:
        self.logger.error(
            "trouble getting the results link in %s; got %s reslinks, expected one"
            % (self.filename_in_full, str(len(reslinks))))
        return []
    # reslinks is already a list of rows, so pass it to array2d_2tsv directly
    array2d_2tsv(reslinks, fp_out)
    return [[
        reslinks[0][1],
        '%s.results%s' % os.path.splitext(self.filename_in_full),
        RawParserResultsSummary(self.root_path, self.version)
    ]]
def parse(self, bs_in, fp_out):
    tparser = HtmlTableParser()
    tables = [tparser(_) for _ in bs_in.find_all('table')]
    if self.version == 1:
        # the results table is assumed to be the only 11-row table on the page
        tables = list(filter(lambda x: len(x) == 11, tables))
        if len(tables) == 1:
            array2d_2tsv(tables[0], fp_out)
            return []
        self.logger.error("Error in parsing: found %s potential tables in %s, expected one"
                          % (str(len(tables)), self.filename_in_full))
        return []
    if self.version == 2:
        tables = list(filter(lambda x: len(x) == 11, tables))
        if len(tables) == 1:
            # version 2 tables carry a header row; skip it
            array2d_2tsv(tables[0][1:], fp_out)
            return []
        self.logger.error("Error in parsing: found %s potential tables in %s, expected one"
                          % (str(len(tables)), self.filename_in_full))
        return []
    return []
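# HtmlTableParser instances are used as callables throughout this file; the
# sketch below is an assumption about that interface (instance(table) -> list
# of rows of cell text), added only as a reading aid. The real class also
# accepts an optional row-extractor argument, omitted here.
class HtmlTableParserSketch:
    def __call__(self, table):
        # one list of cell strings per <tr>, header cells included
        return [[td.get_text(strip=True) for td in tr.find_all(['td', 'th'])]
                for tr in table.find_all('tr')]

    def has_child_tables(self, table):
        # True when another <table> is nested inside this one
        return table.find('table') is not None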