def get_sub_content(link, f:'_io.TextIOWrapper'):
    """Fetch the detail page at *link* and append its fields to *f*.

    The page is requested from ``"http://" + baseurl + link``. Inside the
    element with class ``CardView``, every second ``td`` (starting with the
    second one) of the first two tables is collected, followed by the fifth
    ``td`` of the third table. Each value is terminated by ``;``. Tabs,
    line feeds and carriage returns are removed, the text is passed through
    ``normalUtf`` and written to *f* as a single line, after which *f* is
    flushed.
    """
    # get_content() yields a pair; only the second element is parsed.
    _request, page = get_content("http://" + baseurl + link)
    card = BeautifulSoup(page, from_encoding="utf-8").find(class_='CardView')
    tables = card.find_all('table')

    pieces = [
        cell.get_text() + ';'
        for tbl in tables[0:2]
        for cell in tbl.find_all('td')[1::2]
    ]
    pieces.append(tables[2].find_all('td')[4].get_text() + ';')

    line = ''.join(pieces)
    for unwanted in ('\t', '\n', '\x0D'):
        line = line.replace(unwanted, '')

    f.write(normalUtf(line) + '\n')
    f.flush()
def inf_from_table(table:'BeautifulSoup',f:'_io.TextIOWrapper'):
    """Write one ';'-separated record per result row of *table* to *f*.

    Rows are the ``tr`` elements with class ``RowsTable_Default`` followed
    by those with class ``RowsTable_Default_``. For every row the text of
    cells 1..8 is emitted (notice number, subject, price, organizer,
    publication date/time, start date, start time, state). If the first
    cell contains an anchor with an ``href``, that URL (newlines removed)
    is appended and ``get_sub_content`` is called to add the detail-page
    fields and finish the line.

    Rows without a usable link are still written; they are terminated with
    a newline here, because ``get_sub_content`` (which normally ends the
    line) is skipped for them.

    Returns:
        The ``href`` of the last row that had a link, or ``None`` when no
        row had one (including when the table has no matching rows).

    Raises:
        IndexError: if a row has fewer than nine ``td`` cells.
    """
    rows = (
        table.find_all('tr', class_='RowsTable_Default')
        + table.find_all('tr', class_='RowsTable_Default_')
    )

    last_href = None
    for row in rows:
        cells = row.find_all('td')
        record = ''.join(cells[i].get_text() + ';' for i in range(1, 9))

        # The URL lives in an anchor inside the first cell; it may be absent.
        anchor = cells[0].find('a')
        href = None if anchor is None else anchor.get('href')
        if href is not None:
            record += href.replace('\n', '') + ';'

        f.write(normalUtf(record))
        f.flush()

        if href is None:
            # No detail page to fetch: close the record ourselves.
            f.write('\n')
            f.flush()
            continue

        # Full info about the order ("zakaz"); this also ends the line.
        get_sub_content(href, f)
        last_href = href

    return last_href
# Locate the listing table in the parsed page and collect its rows.
main_table = soup.find(
    "table", class_="tbl_org tbl_org_zakon tbl_org_regedit tbl_torgs "
)
lines = main_table.find_all("tr")
num_ln = len(lines)

# Index 0 is skipped; processing starts at the second row.
g = 1
while g < num_ln:
    cur_line = lines[g]

    # The detail URL is the text between the first pair of single quotes in
    # the row's onclick attribute; subInfo() supplies the start of the record.
    link = cur_line['onclick']
    url_with_full_inf = link.split("'")[1]
    fields = [subInfo(url_with_full_inf)]

    # Cells 0..2: customer, subject, date - each terminated by ';'.
    data_in_line = cur_line.find_all("td")
    fields.extend(data_in_line[i].get_text() + ';' for i in (0, 1, 2))

    g += 1
    _str = normalUtf(''.join(fields))
    inf_file.write(_str)
    inf_file.write("\n")
    inf_file.flush()

cur_page = cur_page + 1
inf_file.close()