def get_match_info(match):
    """Scrape one team's stats from a match page and append them to infos.json.

    match: an element exposing .get('href'); the relative link is joined with
    the module-level `head` prefix.  Spawns a fresh Chrome webdriver per call.
    """
    url = head + match.get('href')
    driver = webdriver.Chrome()
    try:
        driver.get(url)
        data = BeautifulSoup(driver.page_source, "lxml")
        name = data.find("span", attrs={"class": "team-header-name"}).string
        print(name)
        team_stat = data.find("dl", attrs={"class": "stats"})
        t = {"Team_name": name}
        for stats in team_stat.findChildren("dt"):
            print(stats.string)
            print(stats.find_next("dd").string)
            if stats.string == "Discipline":
                # Discipline is broken down into yellow/red card counts
                # rendered in their own spans rather than a <dd>.
                t[stats.string] = {
                    "yellow-card": data.find("span", attrs={"class": "yellow-card-box"}).string,
                    "red-card": data.find("span", attrs={"class": "red-card-box"}).string,
                }
            else:
                t[stats.string] = stats.find_next("dd").string
        # Read-modify-write the accumulated match info (files now closed
        # deterministically; the original leaked both handles).
        path = "U:\\Projects\\PythonApplication7\\PythonApplication7\\infos.json"
        with open(path, 'r') as fp:
            record = json.load(fp)
        record["info_list"].append(t)
        with open(path, 'w') as fp:
            json.dump(record, fp)
        print('get\n')
    finally:
        # Always release the browser, even if scraping raises.
        driver.quit()
def recent_tournaments():
    """Scrape dota2.ru for recent tournaments.

    Returns a list of dicts with 'name', 'date' and 'prize' keys;
    invite-only entries (no prize pool) are skipped.
    """
    url = 'https://dota2.ru/esport/tournaments/'
    page = urllib.request.urlopen(url)
    soup = BeautifulSoup(page, "lxml")
    # Skip past the first tournament-list container to the second one.
    soup = soup.find('div', {'class': 'esport-tournament-list'})
    soup = soup.find_next('div', {'class': 'esport-tournament-list'})
    summary_t = []
    while soup.find_next('div', {'class': 'esport-tournament-list-single'}) is not None:
        soup = soup.find_next('div', {'class': 'esport-tournament-list-single'})
        name = soup.find('div', {'class': 'title'}).find('a')
        name = re.sub(r'[\r\n ]', '', name.get_text())
        date = soup.find('div', {'class': 'date'}).find_next('div').get_text()
        prize = soup.find('div', {'class': 'prize'})
        prize = re.sub(r'[\r\n ]', '', prize.get_text())
        # "Приглашениенаосновнойэтап" == "invite to the main stage": no prize.
        if prize != "Приглашениенаосновнойэтап":
            summary_t.append({
                'name': name.strip(),
                'date': date.strip(),
                'prize': prize.strip(),
            })
    print(summary_t)
    return summary_t
def scrape(category, targetSections, targetArticles):
    """Return the first <p> following a <span> that mentions any of
    *targetSections* inside the fetched news article, or 0 on failure."""
    url = fetchUrl(category, targetArticles)
    if not url:
        return 0
    site = fetch(url)
    document = BeautifulSoup(site.text, 'html.parser')
    cursor = document.find('div', class_='component component-news-article')
    cursor = cursor.find('ul').find_next('p')

    def _wanted_section(tag):
        # A <span> whose text contains one of the target section names.
        return tag.name == 'span' and any(x in tag.text for x in targetSections)

    cursor = cursor.find_next(_wanted_section)
    if cursor:
        return cursor.find_next('p')
    return 0
def job_traverse_all_pages(browser, url, current_page=0):
    """Collect job links from the current LinkedIn results page and recurse
    through subsequent pages (LinkedIn paginates every 25 listings).

    Returns the de-duplicated links of this page plus all following pages.
    """
    browser.get(url)
    # LinkedIn won't show all 25 job listings right away due to its
    # lazy-loading JS UI — scroll the results pane to the bottom first.
    js_scroll_to_bottom = '''
    var jobPane = document.querySelector(".jobs-search-results");
    jobPane.scrollIntoView();
    jobPane.scrollTo(0,jobPane.scrollHeight);
    '''
    browser.execute_script(js_scroll_to_bottom)
    time.sleep(1.0)
    page = BeautifulSoup(browser.page_source, features="html.parser")
    links = list(set(get_job_links(page)))  # list(set(...)) removes duplicates
    # LinkedIn paginates its jobs every 25 listings.
    url_nextpage = url_job_pages + "&start=" + str(current_page * 25)
    current_page += 1
    time.sleep(random.uniform(0.2, 0.9))  # random jitter to look less bot-like
    # Fix of broken original: fewer than 25 listings, or no pagination
    # button in the DOM, means this was the last page of results.
    next_button = page.find('li', {'data-test-pagination-page-btn': True})
    if len(links) < 25 or next_button is None:
        return links
    return links + job_traverse_all_pages(browser, url_nextpage, current_page)
def search():
    """Scrape craigslist free-stuff ("zip") listings for a town and append
    each previously-unseen listing name as one CSV row."""
    # Determine the location to be searched
    # city = input("Your Town/City: ")  # later: replace the hard-coded town
    town = "greenville"
    craigslist = 'http://' + town + ".craigslist.org/search/zip"
    print(craigslist)
    r = requests.get(craigslist).text  # Scraping the site
    first_file = "C:\\Users\\Sam\\Documents\\first.csv"
    soup = BeautifulSoup(r, 'html.parser')
    seen = set()  # fix: the original compared a string against a list, so
    # every name was treated as new; track seen names in a set instead
    with open(first_file, 'a', newline='') as csv_file:
        writer = csv.writer(csv_file)
        # Iterate every <p> on the results page (the original looped over
        # the characters of the URL string, re-parsing the same page).
        for name_box in soup.find_all("p"):
            name = name_box.text
            if name in seen:
                continue
            seen.add(name)
            # One row per listing; writerow(name) would emit one column
            # per character.
            writer.writerow([name])
            print(name)
    print("done")
def parse_me(filename): file = open(os.getcwd() + filename) # Открываем файл soup = BeautifulSoup(file, 'lxml') links = soup.find_all('a') # Поиск всех ссылок в которых будут имена headers = soup.find('h3') # print(headers.find_next('h3')) female_name_base_list = ['Любовь'] # Список женских имен <ИСКЛ> male_name_base_list = ['Никита', 'Лёва', 'Илья'] # Список мужских имен <ИСКЛ> male_list = {} # Словарь с ключами - годами female_list = {} while soup.find_next('a'): name = soup.find('a') if soup.find_next() == 'h3': break
def scrape_country_data(country_data: BeautifulSoup) -> Dict[str, Any]:
    """Scrapes all data from a table as a soup and returns the dict.

    Fixed table layout assumed (as in the original): rows 1-15 hold
    population, fertility, mortality and migration figures, and columns
    4..6 hold the 2016..2018 values.  Figures like "1,234.5" are parsed
    to float after stripping thousands separators.
    """
    tbody = country_data.find_next('tbody')
    rows = tbody.find_all('tr')

    def _cell(row_idx: int, col_idx: int) -> float:
        # The td's first child node is the figure text, e.g. "1,234.5".
        td = rows[row_idx].find_all('td')[col_idx]
        return float(list(td)[0].replace(',', ''))

    # metric key -> table row index (replaces twelve copy-pasted
    # extraction expressions from the original)
    metrics = {
        # Population
        'midyear_pop': 1,
        'growth_rate': 2,
        # Fertility
        'total_fertility_rate': 4,
        'crude_birth_rate': 5,
        'births': 6,
        # Mortality
        'life_expectancy': 8,
        'infant_mortality_rate': 9,
        'under_5_mortality_rate': 10,
        'crude_death_rate': 11,
        'deaths': 12,
        # Migration
        'net_migration_rate': 14,
        'net_num_migrants': 15,
    }
    data_dict: Dict[int, Dict[str, float]] = {}
    for year_index, year in enumerate((2016, 2017, 2018), start=4):
        data_dict[year] = {
            key: _cell(row, year_index) for key, row in metrics.items()
        }
    return data_dict
def get_links(urls):
    """
    Gets all of the links based on a list of urls
    :return: List of links to individual movies, accumulated across urls.
    """
    links = []
    for url in urls:
        r = requests.get(url)
        soup = Bs(r.text)
        # Only one table on the website
        table = soup.find_next('tbody')
        # Bug fix: the original rebound `links` to this page's anchors and
        # then appended the list to itself; accumulate across pages instead.
        links.extend(table.find_all('a'))
    return links
def bash_rand(m):
    """Get a random quote from bash.org"""
    resp = get_url(m, "http://bash.org?random1")
    # Explicit parser for consistency with the other bash_rand definition
    # below (and to silence bs4's "no parser specified" warning).
    soup = BeautifulSoup(resp, features="html.parser")
    raw = soup.find(class_="qt")
    if raw:
        meta = soup.find(class_="quote")
        # Walk successive quote blocks until one is short enough to post.
        while True:
            if not raw:
                # Page exhausted without a short quote — fetch a fresh page.
                bash_rand(m)
                return
            lines = raw.get_text().splitlines()
            if len(lines) <= 5:
                break
            raw = raw.find_next(class_="qt")
            meta = soup.find_next(class_="quote")
        format_quote(m, lines, meta)
    else:
        m.bot.private_message(m.location, "Could not find bash quote.")
def bash_rand(m):
    """Get a random quote from bash.org"""
    # NOTE(review): this re-definition shadows the earlier bash_rand above;
    # only this one is reachable at runtime.
    resp = get_url(m, "http://bash.org?random1")
    soup = BeautifulSoup(resp, features="html.parser")
    raw = soup.find(class_="qt")
    if raw:
        meta = soup.find(class_="quote")
        # Walk successive quote blocks until one has <= 5 lines.
        while True:
            if not raw:
                # No short quote on this page — recurse to fetch another.
                bash_rand(m)
                return
            lines = raw.get_text().splitlines()
            if len(lines) <= 5:
                break
            raw = raw.find_next(class_="qt")
            # NOTE(review): soup.find_next always restarts from the soup
            # root, so `meta` may not track the block in `raw` — verify.
            meta = soup.find_next(class_="quote")
        format_quote(m, lines, meta)
    else:
        m.bot.private_message(m.location, "Could not find bash quote.")
def refresh_login(global_config):
    """Log in to xn--v9x.net and probe the image endpoint for the X-CSRFToken.

    Credentials come from global_config["XNV9X_EMAIL"/"XNV9X_PASSWORD"].
    """
    global xcsrf_token, cookie
    url = "http://xn--v9x.net/"
    loginUrl = url + "login/"
    imageUrl = url + "image/"
    header = {
        'User-Agent': "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36",
    }
    imageRequest = {
        "optype": "get_images",
        "category": "",
        "cached_images": []
    }
    initr = requests.get(url, headers=header)
    header['Referer'] = url
    loginGet = requests.get(loginUrl, headers=header, cookies=initr.cookies)
    loginParse = BeautifulSoup(loginGet.text, features="html.parser")
    # Bug fixes: find_next(name=...) matches a *tag name*, not the HTML
    # "name" attribute, and .string is a property, not a callable.  The
    # Django token lives in <input name="csrfmiddlewaretoken" value="...">
    # — presumably; confirm against the live login page.
    csrfMiddle = loginParse.find("input", {"name": "csrfmiddlewaretoken"})["value"]
    header['Referer'] = loginUrl
    loginForm = {
        'csrfmiddlewaretoken': csrfMiddle,
        'Email': global_config["XNV9X_EMAIL"],
        "Password": global_config["XNV9X_PASSWORD"],
    }
    loginPost = requests.post(loginUrl, data=loginForm, cookies=loginGet.cookies, headers=header)
    header["Referer"] = url
    # Bug fix: pass the cookie jar, not the Response object itself.
    imageGet = requests.get(imageUrl, cookies=loginPost.cookies, headers=header)
    print(re.findall(r"\"X-CSRFToken\": '(.*?)' },", imageGet.text))
    print(str(imageGet.cookies))
def find_processor(tag):
    """Look up a server's processor on HPE PartSurfer by serial/asset tag
    and append "<tag>;<processor>" to processadores.csv."""
    # Search link for HPE PartSurfer
    url = "https://partsurfer.hpe.com/Search.aspx?searchText={}".format(tag)
    try:
        response = requests.get(url, verify=False)  # cert check disabled deliberately
        soap = BeautifulSoup(response.text, "html.parser")
        previous_tag = soap.find_next("span")
        processor = "Nao encontrado."
        # Scan <span>s pairwise: when the current span mentions "logic cpu"
        # and the previous one mentions "proc", the previous span holds the
        # processor description.
        for item in soap.find_all("span"):
            if "logic cpu" in item.text.lower(
            ) and "proc" in previous_tag.text.lower():
                processor = previous_tag.text
            previous_tag = item
        # In some cases HPE PartSurfer does not render the Logic CPU column;
        # fall back to a direct search for the "sps-proc" part description.
        if "Nao encontrado" in processor:
            for item in soap.find_all("span"):
                if "sps-proc" in item.text.lower():
                    processor = item.text
        with open("processadores.csv", mode="a") as f:
            f.write("{};{}\n".format(tag, processor))
        print("Serial: {}, Processador: {}".format(tag, processor))
    except (Exception) as e:
        # Only connection errors propagate; anything else is logged and
        # swallowed so a batch run continues.
        if (type(e) is requests.exceptions.ConnectionError):
            raise e
        else:
            print("Serial: {}, falha ao obter processador.".format(tag))
    return
def get_author(image_id):
    """Return the author name for a Fotolia image id, scraped from the
    image's public page (first link inside div.content-preview)."""
    url = "http://en.fotolia.com/id/" + str(image_id)
    html = requests.get(url).content
    # Converted from a Python 2 print statement for consistency with the
    # rest of the file; URL built once instead of twice.
    print(url)
    html_td = BeautifulSoup(html, "html.parser").find("div", class_="content-preview")
    return html_td.find_next('a').string
# itération sur chaque page for data in URLs: cl = {} link = data['link'] print("Processing %s" % link) pageURL = link if MOCK_CLASS: content = BeautifulSoup(open(MOCK_CLASS),features="lxml").body else: content = BeautifulSoup(urllib.request.urlopen(pageURL).read(),features="lxml").body # titre name = cleanName(content.find_next('caption').string.strip()) cl[u'Nom'] = name # référence cl[u'Référence'] = link # prestige if 'prestige' in data.keys() and data['prestige']: cl[u'Prestige'] = True # source cl[u'Source'] = data['source'] # description descr = findAfter(content, "div", {"class": "presentation"},'i'); cl[u'Description'] = descr
continue print('{} {}'.format(brand, a[0])) refName = a[0] for currentLoc in loc.keys(): url = "https://www.amazon.{local}/s?k=".format( local=loc[currentLoc]) azurl = "{}{}+{}".format(url, brand, refName.replace(' ', '+')) rq = s.get(azurl) if rq.raise_for_status(): print(rq.status_code + ' ' + rq.text) soup = BeautifulSoup(rq.text, 'html5lib') articles = soup.find("span", {'class': 'a-size-medium'}) prices = soup.find('span', {'class': 'a-price-whole'}) if prices == None: articles = soup.find("span", {'class': 'a-size-medium'}) prices = soup.find_next('span', {'class': 'a-price-whole'}) if (prices == None or articles == None): print("No item found") else: if force == True or (cur.execute( 'SELECT amName FROM refprices WHERE locale = "{currentloc}" AND name = "{refname}" ' .format(currentloc=currentLoc, refname=refName)).fetchone() == str( articles.contents[0])): print("{price} ---- {item}".format( item=articles.contents[0], price=prices.contents[0])) item = (str(float(prices.contents[0].replace(",", "."))), brand, refName, str(articles.contents[0]), currentLoc, datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f")) cur.execute('INSERT INTO refprices VALUES (?,?,?,?,?,?)',
def _find_number(self, number_column: BeautifulSoup) -> str:
    """Return the text of the first div.rn_nummer following *number_column*."""
    nummer_div = number_column.find_next("div", {"class": "rn_nummer"})
    text = nummer_div.getText()
    return text
def handle_endtag(self, tag):
    # Log every closing tag except </script>.
    if tag != "script":
        print("Encountered an end tag:", tag)
    self.prev_tag.pop()

def handle_data(self, data):
    # Only report text that is not inside a <script> element.
    # NOTE(review): .get() with no argument raises TypeError for a dict;
    # prev_tag's actual type is not visible here — confirm.
    if self.prev_tag.get() != "script":
        print("Encountered data:", data)

myparser = MyHTMLParser()
myparser.feed(raw)

from bs4 import BeautifulSoup

# Exploratory / scratch calls against the parsed page (results discarded).
soup = BeautifulSoup(raw, 'html.parser')
soup.prettify()
soup.title
soup.title.name
soup.title.string
soup.p
# NOTE(review): BeautifulSoup has no `findall` attribute — attribute access
# falls through to tag lookup, yields None, and None('div') raises TypeError.
soup.findall('div')
soup.find_all('div')
soup.find_all('div', attrs='class')
soup.find_all('div', attrs='center-buttons')
soup.find_all('div', attrs='fixed-recipe-card')
soup.find_all('article', attrs='fixed-recipe-card')
soup.find('article', attrs='fixed-recipe-card')
soup.find_next('article', attrs='fixed-recipe-card')
soup.find_next('article')
soup.find_all_next('article', attrs='fixed-recipe-card')
soup.find_all('article', attrs='fixed-recipe-card')
def syntax_color(src):
    """Highlight *src* with the PlotDevice lexer and return the resulting
    <pre> element, detached from its parse tree."""
    markup = highlight(src, PlotDeviceLexer(), HtmlFormatter())
    document = BeautifulSoup(markup, "html5lib")
    pre_block = document.find_next("pre")
    return pre_block.extract()
# Extract one recipe's id, title, photo and ingredient list from recipePage
# (recipeId, recipePage, f, link and divs are defined earlier in the script).
uid = recipeId.get("data-typespecificid")
recipe["uid"] = uid
title = recipePage.find("h1", id = "itemTitle")
recipe["name"] = title.string
print("\n\n" + title.string + "[" + uid + "]")
print(link)
photo = recipePage.find("img", id = "imgPhoto")
recipe["photo"] = photo.get("src")
print(photo.get("src"))
# Create empty ingredients list
ingredients = [ ]
names = recipePage.find_all("span", "ingredient-name")
for name in names:
    if name.string != "" and name.string != " ":
        # Keep only the part before the first comma ("flour, sifted" -> "flour").
        split = name.string.split(",")
        ingredient = split[0]
        ingredients.append(ingredient)
        # tokens = nltk.word_tokenize(ingredients)
        # print(tokens)
        # tagged = nltk.pos_tag(tokens)
        # f.write(ingredients+"\n")
        print(ingredient)
# NOTE(review): this reassignment of `names` is dead — the value is never
# used afterwards.
names = recipePage.find_next("span", "ingredient-name")
recipe["ingredients"] = ingredients
jsonString = json.dumps(recipe) + ",\n"
f.write(jsonString)
print(jsonString)
# Advance the outer cursor to the next listing container.
divs = divs.find_next("div", "hub-list-view")
class domain_com(log.csv_):
    """Scraper for domain.com.au-style property listing pages.

    Fetches the page at construction and exposes per-field extraction
    helpers keyed to the site's generated CSS class names (css-1mf5g4s,
    css-1rzse3v, ...) — brittle by design; these break when the site
    regenerates its class hashes.
    """

    def __init__(self, request):
        # request: a full search-results URL (http/https required).
        self.request = request
        session = requests.Session()
        session.max_redirects = 30
        try:
            web_page = session.get(self.request, allow_redirects=True).text
        except requests.exceptions.MissingSchema:
            print('Type correct url. Did you forget http / https?')
            # NOTE(review): `raise (exit())` raises SystemExit indirectly;
            # unusual idiom, kept as-is.
            raise (exit())
        self.soup = BeautifulSoup(web_page, 'lxml')

    def scrape_property_url(self):
        """Return (href_list, count_pages, count_property) for the results
        page: unique listing hrefs, page count (20 per page), raw count."""
        href_list = []
        try:
            count_property = self.soup.find('strong')
            count_property = re.findall(r'(\d+).+', count_property.string)
        except AttributeError:
            print('Type correct url')
            raise (exit())
        count_pages = math.ceil(int(count_property[0]) / 20)
        body = self.soup.find('div', 'css-1mf5g4s')
        href2 = re.findall(r'href="(.+?)"', str(body))
        for href in href2:
            if not href in href_list:
                href_list.append(href)
        return href_list, count_pages, count_property

    def direct_to_property(self, page):
        # Hand every listing href to compile_domain; from page 1 also kick
        # off traversal of the remaining pages.
        hrefs = self.scrape_property_url()[0]
        for href in hrefs:
            compile_domain(href)
        if page == 1:
            self.next_page()

    def next_page(self):
        # Walk pages 2..N by appending &page=N to the original request URL.
        page = 2
        pages = int(self.scrape_property_url()[1])
        while pages >= page:
            main_url = self.request + '&page=' + str(page)
            url = domain_com(main_url)
            url.direct_to_property(page)
            page += 1
        print('Done!')

    def buy_rent(self):
        """Classify the listing as 'Buy' or 'Rent' from the page tags;
        returns None implicitly for anything else."""
        tags = self.soup.find_all('span', class_='css-0')
        if tags[1].string == 'Sale' or tags[1].string == 'New Homes':
            return 'Buy'
        elif tags[1].string == 'Rent':
            return 'Rent'

    def bedBathCarSquare_count(self):
        """Return (beds, baths, cars, square) strings, '-' for any missing
        field.  Square values < 15 are discarded as implausible."""
        try:
            tag = self.soup.find('span', 'css-9fxapx', string='Beds')
            bed_tag = (tag.find_parent())
            try:
                beds = list(bed_tag.strings)[0]
            except AttributeError:
                print('Beds not found')
                beds = '-'
            # Beds/baths/cars/square appear as consecutive css-1rzse3v spans.
            bath_tag = tag.find_next('span', 'css-1rzse3v')
            try:
                baths = list(bath_tag.strings)[0]
            except AttributeError:
                print('Baths not found')
                baths = '-'
            car_tag = bath_tag.find_next('span', 'css-1rzse3v')
            try:
                cars = list(car_tag.strings)[0]
            except AttributeError:
                print('Car places not found')
                cars = '-'
            square_tag = car_tag.find_next('span', 'css-1rzse3v')
            try:
                square = list(square_tag.strings)[0]
            except AttributeError:
                print('Square not found')
                square = '-'
            try:
                if int(square) < 15:
                    square = '-'
            except ValueError:
                pass
            return beds, baths, cars, square
        except AttributeError:
            # No 'Beds' label at all (e.g. land-only listing): try to pull
            # just the square footage.
            try:
                square_tag = self.soup.find_next('span', 'css-1rzse3v')
                square = list(square_tag.strings)[0]
                try:
                    if int(square) < 15:
                        square = 'None'
                except ValueError:
                    pass
            except AttributeError:
                square = 'None'
            return '-', '-', '-', square

    def agent_name(self):
        """Return the listing agent's name, or 'None' if absent."""
        try:
            agent = self.soup.find(
                'a', 'is-a-link listing-details__agent-details-agent-name')
            return agent.string
        except AttributeError:
            print('Agent not found')
            return 'None'

    def property_addr(self):
        """Return the listing address, or 'None' if absent."""
        addr = self.soup.find('h1', 'listing-details__listing-summary-address')
        try:
            return addr.string
        except AttributeError:
            print('Adress not found')
            return 'None'

    def property_type(self):
        """Return the property type, trying two page layouts in turn."""
        try:
            tag = self.soup.find(
                'span', 'listing-details__property-type-features-text').string
        except AttributeError:
            try:
                tag = self.soup.find('p', 'listing-details__property-type').string
            except AttributeError:
                print('Property Type not found')
                return 'None'
        return tag

    def price_buy(self):
        """Return the sale price, or 'Auction / No price' when missing."""
        price = self.soup.find('div', class_='listing-details__summary-title')
        try:
            return price.string
        except AttributeError:
            return 'Auction / No price'

    def price_rent(self):
        """Return the rental price, or 'None' when missing."""
        price = self.soup.find('div', 'listing-details__summary-title')
        try:
            return price.string
        except AttributeError:
            print('Property Price not found')
            return 'None'

    def property_features(self):
        """Return the list of additional-feature strings, or 'None'."""
        try:
            features = [
                feature.string for feature in self.soup.find_all(
                    'li', 'listing-details__additional-features-listing')
            ]
        except AttributeError:
            print('Features not found')
            return 'None'
        if not features:
            print('Features not found')
            features = 'None'
        return features

    def property_description(self):
        """Return the concatenated description paragraphs, or 'None'."""
        try:
            full_desc = ''
            description = self.soup.find('div', 'listing-details__description')
            description = description.find_all('p')
            for desc in description:
                full_desc += desc.string
        except AttributeError:
            full_desc = 'None'
            print('Descripton not found')
        except TypeError:
            # A <p> with nested markup has .string == None; += then raises.
            full_desc = 'None'
            print('Descripton not found')
        return full_desc
class CvParser51Job(CvTopParser): """ 对51job的简历进行解析 """ def __init__(self): CvTopParser.__init__(self) self.result = OrderedDict() self.PAY = re.compile(u"(\d+[\s\-])?\d+元") self.UPDATETIME = re.compile("(更新日期|更新时间)[::\s](\d{4}.\d{2}.\d{2})") self.ADDR = re.compile(u"居住地[::\s](\S+)") self.JOB_START = re.compile(u"(\d+.\d+?)--") self.JOB_END = re.compile(u"--(\d+.\d+)") self.JOB_DURATION = re.compile(u"\[(.+?)\]") self.INC_SCALE = re.compile(u"\d+-\d+人|\d+人[以上下]?") self.INC_NAME = re.compile(u"[::>](\S+?)[\(\[【\r\n ]") self.JOB_DEPARTMENT = re.compile(u"部门[:\s:](\S+?)",re.S) self.PROJ_NAME = re.compile(u"[::](\S+?$)") def preprocess(self,htmlContent=None,fname=None,url=None): if url!=None: self.html= urlopen(url).read().decode('utf-8') elif htmlContent: self.html = htmlContent elif fname: self.html = codecs.open(fname,'rb','gb18030').read() else: raise Exception("input error") if re.search(u"已被(求职者)?删除|无法查看",self.html): raise Exception("error: input illegal cv ") self.soup = BeautifulSoup(self.html,"lxml") # 无联系方式无匹配度的 if self.soup.find("title") and re.search(u"简历ID",self.soup.find("title").get_text()): self.HasName = 0 self.resume = self.soup.find('div',{"id":"divResume"}) self.topsoup = self.resume.find("table").find("table") self.field_list = self.resume.find_all("td","cvtitle") # 求职本公司的 elif self.soup.find("div","titleLineB") and self.soup.find(name="span",text=re.compile(u"应聘职位")): self.HasName = 3 self.job_for_soup = self.soup.find(name="span",text = re.compile(u"应聘职位")).find_previous("table") self.topsoup = self.soup.find(name="td",text=re.compile(u"居住地:")).find_previous("table") self.resume = self.topsoup.find_parent("table").find_parent("table") self.field_list = self.resume.find_all("div","titleLineB") if not self.field_list: self.field_list = self.resume.find("td","cvtitle") # 有应聘职位公司并且显示匹配度的 elif self.soup.find("div",{"id":"divHead"}): self.HasName = 1 self.job_for_soup = self.soup.find("div",{"id":"divHead"}).find("td") self.topsoup = 
self.soup.find(name="td",text=re.compile(u"居住地:")).find_parent("table") self.resume = self.topsoup.find_parent("table").find_parent("table") self.field_list = self.resume.find_all("td","cvtitle") # 有联系方式无匹配度的 elif self.soup.find_all(name="td",text=re.compile(u"E-mail:"),limit=10): self.HasName = 2 find_job_for = self.soup.find(name="span",text=re.compile(u"应聘职位")) if find_job_for: self.job_for_soup = self.soup.find(name="span",text = re.compile(u"应聘职位")).find_previous("table") else: self.job_for_soup = BeautifulSoup() self.topsoup = self.soup.find(name="td",text=re.compile(u"居住地:")).find_previous("table") self.resume = self.topsoup.find_parent("table").find_parent("table") self.field_list = self.resume.find_all("td","cvtitle") if not self.field_list: self.field_list = self.resume.find_all("div","titleLineB") self.result.clear() self.result["cvFrom"] = "51job" self.result["privateInfo"] = {} # 求职岗位和企业,自我介绍,兴趣爱好,校园实践,获得荣誉,股票,补贴等各种其他额外信息 self.result["others"] = {} def regular_basic(self): """ 解析基本信息 """ res = OrderedDict() find_update_time = self.soup.find("span",{"id":"lblResumeUpdateTime"}) if find_update_time: find_update_time = find_update_time.get_text() elif self.UPDATETIME.search(self.html): find_update_time = self.UPDATETIME.search(self.html).group() base_info = self.topsoup.get_text() find_cv_id = self.CV_ID.search(base_info) res["cvId"] = find_cv_id.group(1).strip() if find_cv_id else "None" res["updateTime"] = find_update_time.split(u":")[-1].strip() if find_update_time else "None" base_info1,base_info2 = "","" if self.HasName!=3 and self.topsoup.find_next(name="b",text=u"最近工作"): base_info1 = self.topsoup.find_next(name="b",text=re.compile(u"最近工作")).find_parent("table") base_info2 = self.topsoup.find_next(name="b",text=re.compile(u"学历")).find_parent("table") elif self.topsoup.find_next("div",text=re.compile(u"最近工作")): base_info1 = self.topsoup.find_next(name="div",text=re.compile(u"最近工作")).find_next("table") base_info2 = base_info1.find_next("table") if 
base_info1: tokens = base_info1.find_all("tr") for token in tokens: items = token.find_all("td") if len(items)==2: if re.search(u"公.?司",items[0].get_text()): res["nowInc"] = items[1].get_text().strip() elif re.search(u"行.?业",items[0].get_text()): res["nowIndustry"] = items[1].get_text().strip() elif re.search(u"职.?位",items[0].get_text()): res["nowPosition"] = items[1].get_text().strip() if base_info2: tokens = base_info2.find_all("tr") for token in tokens: items = token.find_all("td") if len(items)==2: if re.search(u"学.?历",items[0].get_text()): res["nowDiploma"] = items[1].get_text().strip() elif re.search(u"专.?业",items[0].get_text()): res["recentMajorName"] = items[1].get_text().strip() elif re.search(u"学.?校",items[0].get_text()): res["recentSchName"] = items[1].get_text().strip() find_sex = self.SEX.search(base_info) res["gender"]= find_sex.group() if find_sex else "0" find_age = self.AGE.search(base_info) res["age"] = find_age.group() if find_age else "0" find_dob = self.DOB.search(base_info) res["dob"] = find_dob.group(1) if find_dob else "None" try: res["nowWorkAge"] = self.topsoup.find("span","blue").find("b").get_text().split(u"|")[0] except: res["nowWorkAge"] = self.topsoup.find("span","blue1").find("b").get_text().split(u"|")[0] if not re.search(u"经验|在读|应届|年",res["nowWorkAge"]): res["nowWorkAge"] = "None" if "nowDiploma" not in res: find_degree = self.DEGREE.search(base_info) res["nowDiploma"] = find_degree.group() if find_degree else "None" find_marriage = self.MARRIAGE.search(base_info) res["marriage"] = find_marriage.group() if find_marriage else "None" find_politic = self.POLITIC.search(base_info) res["nowPolistatus"] = find_politic.group() if find_politic else u"群众" # 居住地和户口 items = self.topsoup.find_all("td",limit=8) for item in items: if re.search(u"居住地",item.get_text()) and item.find_next_sibling("td"): res["nowAddress"] = item.find_next_sibling("td").get_text().strip() elif re.search(u"户.{0,3}口",item.get_text()) and item.find_next_sibling("td"): 
res["nowHukou"] = item.find_next_sibling("td").get_text().strip() elif re.search(u"地.址",item.get_text()) and item.find_next_sibling("td"): res["nowAddressDetail"] = item.find_next_sibling("td").get_text().strip() find_height = self.HEIGHT.search(base_info) res["height"] = find_height.group(1) if find_height else "None" if base_info2: find_benefit = base_info2.find_next("table").find_next("td",text=re.compile(u"基本工资|目前薪资")) if find_benefit: tmpsoup = find_benefit.find_previous("table") items = tmpsoup.find_all("td") for item in items: # 基本工资等福利信息 if re.search(u"工资|薪资",item.get_text()): res["nowSalary"] = re.sub("\s+","",item.find_next_sibling("td").get_text().strip()) elif re.search(u"补.?贴",item.get_text()): self.result["others"]["subsidy"] = item.find_next_sibling("td").get_text().strip() elif re.search(u"奖.?金",item.get_text()): self.result["others"]["bonus"] = item.find_next_sibling("td").get_text().strip() elif re.search(u"股.?票",item.get_text()): self.result["others"]["stock"] = item.find_next_sibling("td").get_text().strip() find_oversea = self.OVER_SEA.search(base_info) res["overSea"] = "1" if find_oversea else "None" self.result['baseInfo'] = res # 求职意向 def regular_expect(self): res = OrderedDict() soup = "" for field in self.field_list: if re.search(u"求职意向",field.get_text()): if self.HasName==1: soup = field.find_next("table") else: soup = field.find_previous("table") break if soup: rows = soup.find_all("tr") for item in rows: if not item.find("td"): continue if re.search(u"目标地",item.find("td").get_text()): res["expLocations"] = item.find("td").find_next().get_text() elif re.search(u"月薪|薪资|工资|薪酬",item.find("td").get_text()): res["expSalary"] = self.CLEAN_TEXT.sub("",item.find("td").find_next().get_text()) elif re.search(u"目前状况|求职状态",item.find("td").get_text()): res["workStatus"] = item.find("td").find_next().get_text() elif re.search(u"工作性|目标性",item.find("td").get_text()): res["expJobTypes"] = item.find("td").find_next().get_text() elif 
re.search(u"希望行业|期望行业",item.find("td").get_text()): res["expIndustrys"] = item.find("td").find_next().get_text() elif re.search(u"岗位|职[业位]",item.find("td").get_text()): res["expPositions"] = item.find("td").find_next().get_text() elif re.search(u"到岗时间",item.find("td").get_text()): res["dutyTime"] = item.find("td").find_next().get_text().strip() elif re.search(u"勿推荐|不要推荐",item.find("td").get_text()): res["ignoreIncs"] = item.find("td").find_next().get_text().strip() elif re.search(u"职能",item.find("td").get_text()): res["expJobCates"] = item.find("td").find_next().get_text().strip() self.result['jobExp'] = res # 教育经历 def regular_educate(self): soup = "" for field in self.field_list: if re.search(u"教育经历",field.get_text()): soup = field.find_next("table") break res = [] if soup: rows = soup.find_all("tr") id = 1 for item in rows: tokens =[ token.get_text().strip() for token in item.find_all("td") if len(token.get_text())>1] tmp = {} if len(tokens)==4: tmp["itemId"] = str(id) tmp["eduStart"] = self.clean_edu_time(tokens[0].split("-")[0]) tmp["eduEnd"] = self.clean_edu_time(tokens[0].split("-")[-1]) tmp["schName"] = tokens[1] tmp["majorName"] = tokens[2] tmp["eduDiploma"] = tokens[3] id += 1 res.append(tmp) if res: # 基本信息中的最高学历学校,专业 self.result["baseInfo"]["recentSchName"] = res[0]["schName"] self.result["baseInfo"]["recentMajorName"] = res[0]["majorName"] self.result['eduList'] = res # 工作经历 def regular_workexp(self): soup = "" for field in self.field_list: if re.search(u"工作经",field.get_text()): soup = field.find_next("table") break res = [] if soup: rows = soup.find_all("tr") id = 1 tokens,tmp = [],[] for item in rows: if item.find("hr"): tokens.append(tmp) tmp = [] continue else: tmp.append(item) if tmp: tokens.append(tmp) for token in tokens: tmp = {} if len(token)>2: tmp["itemId"] = str(id) job_title = re.sub(u"[\s\r\n ]","",token[0].find("td").get_text()) tmp["jobStart"] = self.clean_edu_time(self.JOB_START.search(job_title).group(1)) if 
self.JOB_START.search(job_title) else job_title[:6] tmp["jobEnd"] = self.clean_edu_time(self.JOB_END.search(job_title).group(1)) if self.JOB_END.search(job_title) else "None" tmp["jobDuration"] = self.JOB_DURATION.search(job_title).group(1).strip() if self.JOB_DURATION.search(job_title) else "None" tmp["incEmployee"] = self.INC_SCALE.search(job_title).group().strip() if self.INC_SCALE.search(job_title) else "None" if len(token)>3: tmp["jobDesc"] = token[3].get_text().strip() if job_title: if token[0].find("td").find("b") and not re.search(u"年|月",job_title): tmp["incName"] = token[0].find("td").find("b").get_text().strip() else: tmp["incName"] = self.INC_NAME.search(job_title).group(1).strip() if self.INC_NAME.search(job_title) else "None" if re.search(u"所属行业",token[1].get_text()): tmp["incIndustrys"] = token[1].find_all("td")[-1].get_text().strip() else: tmp["jobPosition"] = token[1].find_all("td")[-1].get_text().strip() tmp["jobDesc"] = token[-1].find_all("td")[-1].get_text().strip() jobTagItem = token[2].find_all("td") if len(jobTagItem)==2: tmp["jobPosition"] = jobTagItem[1].get_text().strip() tmp["jobDepartment"] = jobTagItem[0].get_text().strip() elif len(jobTagItem)==3: tmp["jobPosition"] = jobTagItem[1].get_text().strip() tmp["jobDepartment"] = jobTagItem[0].get_text().strip() tmp["jobSalary"] = jobTagItem[2].get_text().strip() else: if token[0].find("td").find('b'): tmp["incName"] = token[0].find("td").find("b").get_text().strip() if re.search(u"职位名称",token[1].get_text()): tmp["jobPosition"] = token[1].find("td").find("b").get_text().strip() tmp["jobDepartment"] = self.JOB_DEPARTMENT.search(token[1].find('td').get_text()).group(1) if self.JOB_DEPARTMENT.search(token[1].find("td").get_text()) else "None" if re.search(u"行业",token[2].get_text()): tmp["incIndustrys"] = token[2].find("td").get_text().strip()[3:] id += 1 res.append(tmp) self.result['jobList'] = res # 语言技能 def regular_language(self): soup = "" for field in self.field_list: if 
# NOTE(review): whitespace-mangled region — the original line breaks/indentation were
# collapsed, so the lines below are not runnable as-is; restore formatting before editing.
# Content: the tail of a resume-parser class (class header is above this chunk; the
# first fragment is the cut-off body of regular_language, which splits each table row
# on ":"/"(" into languageName/languageLevel pairs).  Then:
#   regular_cert    - rows of the "证书" (certificate) table -> itemId/certTime/certName,
#                     with certLevel taken from self.CERT_LEVEL regex or a 3rd cell.
#   regular_skill   - rows of the "技能" (skill) table -> skillName/skillLevel, plus
#                     skillDuration from a 3rd cell or a "N月/N年" regex on the row text.
#   regular_project - "项目经.." table split into items at <hr> rows; first row parsed
#                     with self.JOB_START/JOB_END/PROJ_NAME, remaining rows keyed on
#                     labels (软件环境/硬件环境/开发工具/项目描述/责任描述).
#   regular_train   - "培训经.." rows -> trainStart/trainEnd/trainAgency/trainTitle.
#   regular_private - phone/email/QQ/ID-number via precompiled regexes on the top
#                     table text; userName from a <b>/<strong> two rows up; resume
#                     keywords from a "简历关键字" span.
#   regular_other   - self-intro / internships / campus / publications / honors /
#                     hobbies sections, plus applied-position/company and submit time
#                     depending on self.HasName.
#   parser()        - driver that runs every regular_* pass and returns self.result.
#   output()        - pretty-printer; uses dict.iteritems(), i.e. Python 2 only.
# NOTE(review): assumes self.field_list / self.topsoup / self.result were set by a
# preprocess step outside this chunk — confirm against the class header.
re.search(u"语言.?能.?",field.get_text()): soup = field.find_next("table") if soup and soup.find_all("table"): soup = soup.find_all("table")[-1] break res = [] id = 1 if soup: rows = soup.find_all("tr") for item in rows: tokens = [ i.get_text() for i in item.find_all("td") if i] if len(tokens)!=2: tokens = re.split(u"[::]",item.get_text(),maxsplit=1) if not len(tokens)==2: tokens = re.split(u"[(\(]",item.get_text()) if len(tokens)==2: tmp = {} tmp["itemId"] = str(id) tmp["languageName"] = re.sub(u"[\s+:: ]","",tokens[0]).split("(")[0] tmp["languageLevel"] = re.sub(u"[\s+ ]","",tokens[1]) res.append(tmp) id += 1 self.result["languageList"] = res # 证书 def regular_cert(self): soup ="" for field in self.field_list: if field and re.search(u"证书",field.get_text()): soup = field.find_next("table") break res = [] id = 1 if soup: items = soup.find_all("tr") for item in items: tokens = item.find_all("td") if len(tokens)<2:continue tmp = {} tmp["itemId"] = str(id) tmp["certTime"] = self.clean_edu_time(tokens[0].get_text()) tmp["certName"] = tokens[1].get_text().strip() cert_str = tmp["certName"] find_level = self.CERT_LEVEL.search(cert_str) if find_level: tmp["certLevel"] = find_level.group() tmp["certName"] = tmp["certName"] elif len(tokens)>2: tmp["certLevel"] = tokens[2].get_text().strip() if tmp: res.append(tmp) id += 1 self.result["certList"] = res # 技能 def regular_skill(self): """ 技能模块 """ soup = "" for field in self.field_list: if re.search(u"技能",field.get_text()): soup = field.find_next("table") if soup and soup.find_all("table"): soup = soup.find_all("table")[-1] break res = [] id = 1 if soup: # items = soup.find_all("table",limit=4)[-1].find_all("tr") if soup.find("table") else [] items = soup.find_all("tr") for item in items: tokens = [token.get_text() for token in item.find_all("td")] if len(tokens)<2 or re.search(u"名称",tokens[0]):continue tmp = {} tmp["itemId"] = str(id) tmp["skillName"] = tokens[0].strip().lower() tmp["skillLevel"] = tokens[1].strip() if 
len(tokens)>2: tmp["skillDuration"] = tokens[2].strip() else: find_duration = re.search("\d+月|[半一二三四五六七八九十\d]年",item.get_text()) tmp["skillDuration"] = find_duration.group() if find_duration else "None" if tmp: res.append(tmp) id += 1 self.result['skillList'] = res # 项目经验 def regular_project(self): soup = "" for field in self.field_list: if re.search(u"项目经.",field.get_text()): soup = field.find_next("table") break res = [] id = 1 if soup: items = soup.find_all("tr") tokens,tmpitem =[],[] for item in items: if item.find("hr"): tokens.append(tmpitem) tmpitem = [] continue elif item: tmpitem.append(item) if tmpitem: tokens.append(tmpitem) for token in tokens: # 解析第一行项目标题 title_str = re.sub(u"[\s\r\n ]","",token[0].get_text()) tmp = {} tmp["itemId"] = str(id) tmp["proStart"] = self.clean_edu_time(self.JOB_START.search(title_str).group(1)) if self.JOB_START.search(title_str) else "None" tmp["proEnd"] = self.clean_edu_time(self.JOB_END.search(title_str).group(1)) if self.JOB_END.search(title_str) else "None" tmp["proName"] = re.sub("\s+","",self.PROJ_NAME.search(title_str).group(1)) if self.PROJ_NAME.search(title_str) else title_str # 解析剩余行标签 field_list = [ item.find("td") for item in token[1:] ] for field in field_list: field_str = field.get_text().strip() if re.search(u"软件环境",field_str): tmp["softwareEnv"] = field.find_next("td").get_text() elif re.search(u"硬件环境",field_str): tmp["hardwareEnv"] = field.find_next("td").get_text() elif re.search(u"开发工具",field_str): tmp["devTool"] = field.find_next("td").get_text() elif re.search(u"项目描述",field_str): tmp["proDesc"] = field.find_next("td").get_text() elif re.search(u"责任描述",field_str): tmp["proDuty"] = field.find_next("td").get_text() if tmp: res.append(tmp) id += 1 self.result['proList'] = res def regular_train(self): soup = "" for field in self.field_list: if re.search(u"培训经.",field.get_text()): soup = field.find_next("table") break res = [] id = 1 if soup: items = soup.find_all("tr") for item in items: tokens 
=[item.get_text() for item in item.find_all("td") if len(item.get_text())>1] if len(tokens)<3:continue tmp = {} tmp["itemId"] = str(id) tmp["trainStart"] = self.clean_edu_time(tokens[0].split(u'-')[0]) tmp["trainEnd"] = self.clean_edu_time(tokens[0].split(u"-")[-1]) tmp["trainAgency"] = tokens[1].strip() tmp["trainTitle"] = tokens[-1].strip() res.append(tmp) id += 1 self.result["trainList"] = res def regular_private(self): """ 身份证号,联系电话等隐私信息 """ res = {} base_info = self.topsoup.get_text() find_phone = self.PHONE.search(base_info) find_email = self.EMAIL.search(base_info) find_qq = self.QQ.search(base_info) find_idNum = self.IDNUM.search(base_info) userName = "" if self.HasName: find_name = self.topsoup.find_previous("tr").find_previous("tr").find("b") if not find_name: find_name = self.topsoup.find_previous("tr").find_previous("tr").find("strong") if find_name and len(find_name.get_text().strip())<5: userName = find_name.get_text().strip() res["userName"] = userName if userName else "None" res["phoneNumber"] = find_phone.group(1) if find_phone else "None" res["email"] = find_email.group(1) if find_email else "None" res["qq"] = find_qq.group(1) if find_qq else "None" res["idNumber"] = find_idNum.group(1) if find_idNum else "None" find_key_word = self.soup.find("span",text=re.compile(u"简历关键字")) key_words = "" if find_key_word and find_key_word.find_next("span","rsblue"): key_words = find_key_word.find_next("span","rsblue").get_text() elif find_key_word and find_key_word.find_next("td"): key_words = find_key_word.find_next("td").get_text() if key_words and re.search(u"有|熟悉|经验|强|善于|精通|证",key_words): res["keywords"] = key_words.strip().split() self.result["privateInfo"] = res def regular_other(self): res = {} res["jobPositionFor"] = "None" res["jobIncNameFor"] = "None" for field in self.field_list: if re.search(u"自我介绍|个人简介|亮点|自我评价",field.get_text()): res["selfIntro"] = field.find_previous("table").get_text().strip() elif re.search(u"实践|实习",field.get_text()): 
res["stuPractice"] = re.sub("\s+"," ",field.find_next("table").get_text().strip()) elif re.search(u"校内|校园|社团",field.get_text()): res["schoolExp"] = re.sub("\s+"," ",field.find_next("table").get_text().strip()) elif re.search(u"论文|著作|作品",field.get_text()): res["pubWork"] = res.get("otherWork","")+"\n"+self.CLEAN_TEXT.sub(" ",field.find_next("table").get_text().strip()) elif re.search(u"奖项|荣誉",field.get_text()): res["gainHoner"] = res.get("otherWork","")+"\n"+self.CLEAN_TEXT.sub(" ",field.find_next("table").get_text().strip()) elif re.search(u"兴趣|爱好|特长",field.get_text()): res["otherHobby"] = res.get("otherHobby","")+"\n"+self.CLEAN_TEXT.sub(" ",field.find_next("table").get_text().strip()) elif re.search(u"其他",field.get_text()): res["otherInfo"] = field.find_next("table").get_text().strip() if self.HasName==1: find_jobPositionName = re.search(u"应聘职位",self.job_for_soup.get_text()) if find_jobPositionName: res["jobPositionFor"] = self.job_for_soup.find_next("span").get_text().strip() find_jobIncName = re.search(u"应聘公司",self.job_for_soup.get_text()) if find_jobIncName: res["jobIncNameFor"] = self.job_for_soup.find_next("span").find_next("span").get_text().strip() find_updateTime = re.search(u"投递时间",self.job_for_soup.get_text()) if self.result["baseInfo"]["updateTime"]=="None" and find_updateTime: self.result["baseInfo"]["updateTime"] = self.job_for_soup.find_next("span").find_next("span").get_text().strip() elif self.HasName>1: items = self.job_for_soup.find_all("td",limit=6) for item in items: if re.search(u"应聘职位",item.get_text()): res["jobPositionFor"] = item.find_next_sibling("td").get_text().strip() elif re.search(u"应聘公司",item.get_text()): res["jobIncNameFor"] = item.find_next_sibling("td").get_text().strip() elif re.search(u"投递时间",item.get_text()): self.result["baseInfo"]["updateTime"] = item.find_next_sibling("td").get_text().strip() break res.update(self.result.pop("others",{})) self.result["others"] = res def parser(self,htmlContent=None,fname=None,url=None): 
self.preprocess(htmlContent,fname,url) self.regular_basic() self.regular_private() self.regular_expect() self.regular_educate() self.regular_workexp() self.regular_skill() self.regular_cert() self.regular_language() self.regular_project() self.regular_train() self.regular_other() return self.result def output(self): res = "\n" for k in self.result: res += k+":"+"\n" if isinstance(self.result[k],dict): for kk,vv in self.result[k].iteritems(): res += '%1s: %s\n' %( kk,vv ) elif isinstance(self.result[k],list): for i,exp in enumerate(self.result[k]): res+= "%12s\n" % (str(i+1)) if isinstance(exp,dict): for kk,vv in exp.iteritems(): res += "%22s: %s\n" % (kk,vv) elif isinstance(exp,tuple): for kk in exp: res += '%22s \n'% (kk) res += " "*10+'---'*10+'\n' else: res += " "*10+"%s\n" % (self.result[k]) return res
# NOTE(review): whitespace-mangled region — original line breaks were collapsed
# (several statements, and even comments, are fused onto single physical lines),
# so this is not runnable as-is; restore formatting before editing.
# Content: class CvParser51Job(CvTopParser) — parses resumes scraped from 51job.
#   __init__    - compiles section regexes: PAY, UPDATETIME, ADDR, JOB_START/END/
#                 DURATION, INC_SCALE/INC_NAME (company name+size), JOB_DEPARTMENT,
#                 PROJ_NAME.
#   preprocess  - loads HTML from url / string / gb18030 file, raises on deleted
#                 resumes ("已被删除|无法查看"), then classifies the page layout into
#                 self.HasName in {0,1,2,3} (no-contact, matched-with-score,
#                 contact-no-score, applied-to-this-company) and anchors
#                 self.topsoup / self.resume / self.field_list accordingly.
#   regular_basic   - base info: cvId, updateTime, recent job/школа tables, sex/age/
#                     dob/diploma/marriage/politics, address/hukou/zip, salary and
#                     benefit cells, overseas flag.
#   regular_expect  - job-intent table (expected locations, categories, salary,
#                     work status, job types, industries, positions, duty time).
#   regular_educate - education rows in 3- or 4-column form.
#   regular_workexp - work history split at <hr> rows; per-item start/end/duration,
#                     company name/scale/industry, position/department/salary,
#                     jobDesc, and manageExp fields (report-to, subordinates,
#                     witness, leave reason, achievements).  Contains hard-coded
#                     special-casing for four specific cvIds — NOTE(review): data
#                     patches baked into code; consider externalizing.
#   regular_language/cert/skill/project/train - section tables, mirroring the
#                     CvTopParser versions but filling self.get_*Dict() templates
#                     and keeping the raw section text in *_html attributes.
#   regular_private - phone/email/QQ/idNumber/userName/keyWords.
#   regular_other   - self-intro, practice, campus, publications, honors, hobbies,
#                     attachments, recommendation letter, applied position/company.
#   parser()    - driver running every regular_* pass; output() - pretty-printer
#                 (uses dict.iteritems(), i.e. Python 2 only).
# NOTE(review): urlopen/codecs/OrderedDict/re and the CvTopParser base come from
# imports/definitions outside this chunk — confirm they exist at file top.
class CvParser51Job(CvTopParser): """ 对51job的简历进行解析 """ def __init__(self): CvTopParser.__init__(self) self.result = OrderedDict() self.PAY = re.compile(u"(\d+[\s\-])?\d+元") self.UPDATETIME = re.compile("(更新日期|更新时间)[::\s](\d{4}.\d{2}.\d{2})") self.ADDR = re.compile(u"居住地[::\s](\S+)") self.JOB_START = re.compile(u"(\d+.\d+?)--") self.JOB_END = re.compile(u"--(\d+.\d+|至今)") self.JOB_DURATION = re.compile(u"\[(.+?)\]") self.INC_SCALE = re.compile(u"\d+-\d+人|\d+人(以上|以下)?|少于\d+人") # self.INC_NAME = re.compile(u"[::>](\S+?)[\(\[【\r\n ]") self.INC_NAME = re.compile(u"[::>](\S+?)(\(\d|\[\d|\(少于|\[一)") self.JOB_DEPARTMENT = re.compile(u"部门[:\s:](\S+?)", re.S) self.PROJ_NAME = re.compile(u"[::](\S+?$)") def preprocess(self, htmlContent=None, fname=None, url=None): if url != None: self.html = urlopen(url).read().decode('utf-8') elif htmlContent: self.html = htmlContent elif fname: self.html = codecs.open(fname, 'rb', 'gb18030').read() else: raise Exception("input error") if re.search(u"已被(求职者)?删除|无法查看", self.html): raise Exception("error: input illegal cv ") self.soup = BeautifulSoup(self.html, "lxml") # 无联系方式无匹配度的 if self.soup.find("title") and re.search(u"简历ID", self.soup.find("title").get_text()): self.HasName = 0 self.resume = self.soup.find('div', {"id": "divResume"}) self.topsoup = self.resume.find("table").find("table").find_next("table") self.topsoup = self.resume.find("table").find("table") self.field_list = self.resume.find_all("td", "cvtitle") # 求职本公司的 elif self.soup.find("div", "titleLineB") and self.soup.find(name="span", text=re.compile(u"应聘职位")): self.HasName = 3 self.job_for_soup = self.soup.find(name="span", text=re.compile(u"应聘职位")).find_previous("table") self.topsoup = self.soup.find(name="td", text=re.compile(u"居住地:")).find_previous("table") self.resume = self.topsoup.find_parent("table").find_parent("table") self.field_list = self.resume.find_all("div", "titleLineB") if not self.field_list: self.field_list = self.resume.find("td", "cvtitle") # 
有应聘职位公司并且显示匹配度的 elif self.soup.find("div", {"id": "divHead"}): self.HasName = 1 self.job_for_soup = self.soup.find("div", {"id": "divHead"}).find("td") self.topsoup = self.soup.find(name="td", text=re.compile(u"居住地:")).find_parent("table") self.resume = self.topsoup.find_parent("table").find_parent("table") self.field_list = self.resume.find_all("td", "cvtitle") # 有联系方式无匹配度的 elif self.soup.find_all(name="td", text=re.compile(u"E-mail:"), limit=10): self.HasName = 2 find_job_for = self.soup.find(name="span", text=re.compile(u"应聘职位")) if find_job_for: self.job_for_soup = self.soup.find(name="span", text=re.compile(u"应聘职位")).find_previous("table") else: self.job_for_soup = BeautifulSoup() self.topsoup = self.soup.find(name="td", text=re.compile(u"居住地:")).find_previous("table") self.resume = self.topsoup.find_parent("table").find_parent("table") self.field_list = self.resume.find_all("td", "cvtitle") if not self.field_list: self.field_list = self.resume.find_all("div", "titleLineB") self.refresh() self.result["cvFrom"] = "51job" # 基本信息 1 def regular_basic(self): """ 解析基本信息 """ temp_nowAddressDetail = "" find_update_time = self.soup.find("span", {"id": "lblResumeUpdateTime"}) if find_update_time: find_update_time = find_update_time.get_text() elif self.UPDATETIME.search(self.html): find_update_time = self.UPDATETIME.search(self.html).group() # 20160309 zhangzq self.resumeType = 0 find_resume_type = self.soup.find("span", {"id": "lblResumeType"}) if find_resume_type: find_resume_type = find_resume_type.get_text() if re.search(u"粘贴简历", find_resume_type): self.resumeType = 1 self.base_html = self.topsoup.get_text().strip() base_info = self.topsoup.get_text() # print self.topsoup find_cv_id = self.CV_ID.search(base_info) self.result["baseInfo"]["cvId"] = find_cv_id.group(1).strip() if find_cv_id else "" self.result["baseInfo"]["updateTime"] = find_update_time.split(u":")[-1].strip() if find_update_time else "None" base_info1, base_info2 = "", "" if self.HasName != 3 and 
self.topsoup.find_next(name="b", text=u"最近工作"): base_info1 = self.topsoup.find_next(name="b", text=re.compile(u"最近工作")).find_parent("table") base_info2 = self.topsoup.find_next(name="b", text=re.compile(u"学历")).find_parent("table") elif self.topsoup.find_next("div", text=re.compile(u"最近工作")): base_info1 = self.topsoup.find_next(name="div", text=re.compile(u"最近工作")).find_next("table") base_info2 = base_info1.find_next("table") if base_info1: tokens = base_info1.find_all("tr") for token in tokens: items = token.find_all("td") if len(items) == 2: if re.search(u"公.?司", items[0].get_text()): self.result["baseInfo"]["nowInc"] = items[1].get_text().strip() elif re.search(u"行.?业", items[0].get_text()): self.result["baseInfo"]["nowIndustry"] = items[1].get_text().strip() elif re.search(u"职.?位", items[0].get_text()): self.result["baseInfo"]["nowPosition"] = items[1].get_text().strip() if base_info2: tokens = base_info2.find_all("tr") for token in tokens: items = token.find_all("td") if len(items) == 2: if re.search(u"学.?历", items[0].get_text()): self.result["baseInfo"]["nowDiploma"] = items[1].get_text().strip() elif re.search(u"专.?业", items[0].get_text()): self.result["baseInfo"]["recentMajorName"] = items[1].get_text().strip() elif re.search(u"学.?校", items[0].get_text()): self.result["baseInfo"]["recentSchName"] = items[1].get_text().strip() find_sex = self.SEX.search(base_info) self.result["baseInfo"]["gender"] = find_sex.group() if find_sex else "0" find_age = self.AGE.search(base_info) self.result["baseInfo"]["age"] = find_age.group() if find_age else "0" find_dob = self.DOB.search(base_info) self.result["baseInfo"]["dob"] = find_dob.group(1) if find_dob else "None" try: self.result["baseInfo"]["nowWorkAge"] = self.topsoup.find("span", "blue").find("b").get_text().split(u"|")[0] except: # res["nowWorkAge"] = self.topsoup.find("span","blue1").find("b").get_text().split(u"|")[0] self.result["baseInfo"]["nowWorkAge"] = "" if not re.search(u"经验|在读|应届|年", 
self.result["baseInfo"]["nowWorkAge"]): self.result["baseInfo"]["nowWorkAge"] = "" if not self.result["baseInfo"]["nowDiploma"]: find_degree = self.DEGREE.search(base_info) self.result["baseInfo"]["nowDiploma"] = find_degree.group() if find_degree else "" find_marriage = self.MARRIAGE.search(base_info) self.result["baseInfo"]["marriage"] = find_marriage.group() if find_marriage else "" find_politic = self.POLITIC.search(base_info) # res["nowPolistatus"] = find_politic.group() if find_politic else u"群众" # print find_politic.group() # print base_info self.result["baseInfo"]["nowPoliStatus"] = find_politic.group() if find_politic else "" # 居住地和户口 if self.topsoup.find("table"): items = self.topsoup.find("table").find_all("td", limit=20) else: items = self.topsoup.find_all("td", limit=20) for item in items: if re.search(u"专.业", item.get_text()) and item.find_next_sibling("td"): self.result["baseInfo"]["recentMajorName"] = item.find_next_sibling("td").get_text().strip() elif re.search(u"学.历", item.get_text()) and item.find_next_sibling("td"): self.result["baseInfo"]["nowDiploma"] = item.find_next_sibling("td").get_text().strip() elif re.search(u"职.能", item.get_text()) and item.find_next_sibling("td"): self.result["baseInfo"]["jobPosition"] = item.find_next_sibling("td").get_text().strip() elif re.search(u"行.业", item.get_text()) and item.find_next_sibling("td"): self.result["baseInfo"]["incIndustrys"] = item.find_next_sibling("td").get_text().strip() elif re.search(u"居住地", item.get_text()) and item.find_next_sibling("td"): self.result["baseInfo"]["nowAddress"] = item.find_next_sibling("td").get_text().strip() elif re.search(u"户.{0,3}口", item.get_text()) and item.find_next_sibling("td"): self.result["baseInfo"]["nowHukou"] = item.find_next_sibling("td").get_text().strip() elif re.search(u"地.址", item.get_text()) and item.find_next_sibling("td"): temp_nowAddressDetail = item.find_next_sibling("td").get_text().strip() self.result["baseInfo"]["nowAddressDetail"] = 
item.find_next_sibling("td").get_text().split(u'(')[ 0].strip() # print item.find_next_sibling("td").get_text().split('(')[0].strip() elif re.search(u"关键词", item.get_text()) and item.find_next_sibling("td"): self.result["privateInfo"]["keyWords"] = item.find_next_sibling("td").get_text(" ", strip=True) find_height = self.HEIGHT.search(base_info) self.result["baseInfo"]["height"] = find_height.group(1) if find_height else "" if base_info2: find_benefit = base_info2.find_next("table").find_next("td", text=re.compile(u"基本工资|目前薪资")) if find_benefit: tmpsoup = find_benefit.find_previous("table") items = tmpsoup.find_all("td") for item in items: # 基本工资等福利信息 if re.search(u"目前薪资|目前年薪", item.get_text()): self.result["baseInfo"]["nowSalary"] = re.sub("\s+", "", item.find_next_sibling("td").get_text().strip()) elif re.search(u"基本薪资|基本工资", item.get_text()): self.result["baseInfo"]["baseSalary"] = re.sub("\s+", "", item.find_next_sibling("td").get_text().strip()) elif re.search(u"补.?贴", item.get_text()): self.result["baseInfo"]["subsidy"] = item.find_next_sibling("td").get_text().strip() elif re.search(u"奖.?金", item.get_text()): self.result["baseInfo"]["bonus"] = item.find_next_sibling("td").get_text().strip() elif re.search(u"股.?票", item.get_text()): self.result["baseInfo"]["stock"] = item.find_next_sibling("td").get_text().strip() find_oversea = self.OVER_SEA.search(base_info) self.result["baseInfo"]["overSea"] = "1" if find_oversea else "" if re.search(u"邮编:(\d{6})", temp_nowAddressDetail): self.result["baseInfo"]["nowZipCode"] = re.search(u"邮编:(\d{6})", temp_nowAddressDetail).group(1) # 求职意向 2 def regular_expect(self): soup = "" for field in self.field_list: if re.search(u"求职意向", field.get_text()): if self.HasName == 1: soup = field.find_next("table") else: soup = field.find_previous("table") break self.expect_html = soup.get_text().strip() if soup else "" if soup: rows = soup.find_all("tr") for item in rows: if not item.find("td"): continue if re.search(u"目标地", 
item.find("td").get_text()): self.result['jobExp']["expLocations"] = item.find("td").find_next().get_text() elif re.search(u"职能", item.find("td").get_text()): self.result['jobExp']["expJobCates"] = item.find("td").find_next().get_text().strip() # print self.result['jobExp']['expJobCates'] elif re.search(u"月薪|薪资|工资|薪酬", item.find("td").get_text()): # print item.find('td').find_next().get_text() self.result['jobExp']["expSalary"] = self.CLEAN_TEXT.sub("",item.find("td").find_next().get_text()) # print self.result['jobExp']['expSalary'] elif re.search(u"目前状况|求职状态", item.find("td").get_text()): self.result['jobExp']["workStatus"] = item.find("td").find_next().get_text() elif re.search(u"工作性|目标性", item.find("td").get_text()): self.result['jobExp']["expJobTypes"] = item.find("td").find_next().get_text() elif re.search(u"希望行业|期望行业", item.find("td").get_text()): self.result['jobExp']["expIndustrys"] = item.find("td").find_next().get_text() elif re.search(u"岗位|职[业位]", item.find("td").get_text()): self.result['jobExp']["expPositions"] = item.find("td").find_next().get_text() elif re.search(u"到岗时间", item.find("td").get_text()): self.result['jobExp']["dutyTime"] = item.find("td").find_next().get_text().strip() elif re.search(u"勿推荐|不要推荐", item.find("td").get_text()): self.result['jobExp']["ignoreIncs"] = item.find("td").find_next().get_text().strip() # elif re.search(u"职能",item.find("td").get_text()): # self.result['jobExp']["expJobCates"] = item.find("td").find_next().get_text().strip() # 教育经历 3 def regular_educate(self): soup = "" for field in self.field_list: if re.search(u"教育经历", field.get_text()): soup = field.find_next("table") break self.edu_html = soup.get_text().strip() if soup else "" if soup: rows = soup.find_all("tr") id = 1 for item in rows: tokens = [token.get_text().strip() for token in item.find_all("td") if len(token.get_text()) > 1] tmp = self.get_eduDict() if len(tokens) == 4: tmp["itemId"] = str(id) tmp["eduStart"] = 
self.clean_edu_time(tokens[0].split("-")[0]) tmp["eduEnd"] = self.clean_edu_time(tokens[0].split("-")[-1]) tmp["schName"] = tokens[1] tmp["majorName"] = tokens[2] tmp["eduDiploma"] = tokens[3] id += 1 self.result["eduList"].append(tmp) elif len(tokens) == 3: tmp["itemId"] = str(id) tmp["eduStart"] = self.clean_edu_time(tokens[0].split("-")[0]) tmp["eduEnd"] = self.clean_edu_time(tokens[0].split("-")[-1]) tmp["schName"] = tokens[1] tmp["eduDiploma"] = tokens[2] id += 1 self.result["eduList"].append(tmp) # if res: # # 基本信息中的最高学历学校,专业 # if not self.result["baseInfo"]["recentSchName"]: # self.result["baseInfo"]["recentSchName"] = res[0]["schName"] # if not self.result["baseInfo"]["recentMajorName"]: # self.result["baseInfo"]["recentMajorName"] = res[0]["majorName"] # 工作经历 4 def regular_workexp(self): soup = "" for field in self.field_list: if re.search(u"工作经", field.get_text()): soup = field.find_next("table") break self.work_html = soup.get_text().strip() if soup else "" # print soup if soup: rows = soup.find_all("tr") id = 1 tokens, tmp = [], [] # print rows for item in rows: if item.find("hr"): tokens.append(tmp) tmp = [] continue else: tmp.append(item) # print tmp if tmp: tokens.append(tmp) for token in tokens: # print len(tokens) # print "token[0]: %s\ntoken[1]: %s\ntoken[2]: %s" %(token[0], token[1], token[2]) tmp = self.get_jobDict() # print len(token) if len(token) >= 2: tmp["itemId"] = str(id) job_title = re.sub(u"[\s\r\n ]", "", token[0].find("td").get_text()) tmp["jobStart"] = self.clean_edu_time( self.JOB_START.search(job_title).group(1)) if self.JOB_START.search(job_title) else job_title[ :6] tmp["jobEnd"] = self.clean_edu_time(self.JOB_END.search(job_title).group(1)) if self.JOB_END.search( job_title) else "" tmp["jobDuration"] = self.JOB_DURATION.search(job_title).group( 1).strip() if self.JOB_DURATION.search(job_title) else "" tmp["incEmployee"] = self.INC_SCALE.search(job_title).group().strip() if self.INC_SCALE.search( job_title) else "" if len(token) 
>= 3: jobDescStr = token[-1].get_text().strip() # print jobDescStr jobDesc = re.search(u'工作内容:(.*)', jobDescStr) # print jobDesc.group(1) if jobDesc: # print 'yes' tmp["jobDesc"] = re.sub(u'#.*?#', '', token[-1].get_text(separator='\n').strip()) # print tmp["jobDesc"] else: # print 'no' # print len(token) # print token[3] # job_tmp = re.search('<td(.*?)>(.*?)</td>', str(token[-1])) # job_tmp_notags = re.sub('<br/>', '\n', job_tmp.group(2)) # print token[3].get_text() # print self.result['baseInfo']['cvId'] if self.result['baseInfo']['cvId'] == '330482369' or self.result['baseInfo']['cvId'] == '333633446'\ or self.result['baseInfo']['cvId'] == '338845280' or self.result['baseInfo']['cvId'] == '65996657': if len(token) > 3: tmp["jobDesc"] = re.sub(u'#.*?#', '', token[3].get_text(separator='\n').strip()) else: tmp["jobDesc"] = re.sub(u'#.*?#', '', token[-1].get_text(separator='\n').strip()) # print tmp['jobDesc'] # print job_title if job_title: if token[0].find("td").find("b") and not re.search(u"年|月", job_title): tmp["incName"] = token[0].find("td").find("b").get_text().strip() else: if self.INC_NAME.search(job_title): # print self.INC_NAME.search(job_title).group(1) tmp["incName"] = self.INC_NAME.search(job_title).group(1).strip() else: tmp["incName"] = "" if re.search(u"所属行业", token[1].get_text()): tmp["incIndustrys"] = token[1].find_all("td")[-1].get_text().strip() else: tmp["jobPosition"] = token[1].find_all("td")[-1].get_text().strip() tmp["jobDepartment"] = token[1].find_all("td")[0].get_text().strip() if "jobDesc" not in tmp.keys(): tmp["jobDesc"] = token[-1].find_all("td")[-1].get_text(separator='\n').strip() try: jobTagItem = token[2].find_all("td") except: pass #print self.result['baseInfo']['cvId'] else: if len(jobTagItem) == 2: tmp["jobPosition"] = re.sub(u'\s+', '', jobTagItem[1].get_text().strip()) tmp["jobDepartment"] = jobTagItem[0].get_text().strip() elif len(jobTagItem) == 3: tmp["jobPosition"] = jobTagItem[1].get_text().strip() tmp["jobDepartment"] 
= jobTagItem[0].get_text().strip() tmp["jobSalary"] = jobTagItem[2].get_text().strip() else: if token[0].find("td").find('b'): tmp["incName"] = token[0].find("td").find("b").get_text().strip() if re.search(u"职位名称", token[1].get_text()): tmp["jobPosition"] = token[1].find("td").find("b").get_text().strip() tmp["jobDepartment"] = self.JOB_DEPARTMENT.search(token[1].find('td').get_text()).group( 1) if self.JOB_DEPARTMENT.search(token[1].find("td").get_text()) else "None" if re.search(u"行业", token[2].get_text()): tmp["incIndustrys"] = token[2].find("td").get_text().strip()[3:] id_ma = 1 for t in token[4:]: ttext = t.get_text() # print ttext reportTo = re.search(u'汇报对象:(.*)', ttext) underNum = re.search(u'下属人数:(\d+)', ttext) witness = re.search(u'证 明 人:(.*)', ttext) leaveReason = re.search(u'离职原因:(.*)', ttext) achieveDesc = re.search(u'工作业绩:(.*)', ttext) tmp['manageExp']['itemId'] = str(id_ma) if reportTo: tmp['manageExp']['reportTo'] = reportTo.group(1) elif underNum: tmp['manageExp']['underNum'] = underNum.group(1) elif witness: tmp['manageExp']['witness'] = witness.group(1) elif leaveReason: tmp['manageExp']['leaveReason'] = leaveReason.group(1) elif achieveDesc: tmp['manageExp']['achieveDesc'] = achieveDesc.group(1) id_ma += 1 id += 1 self.result["jobList"].append(tmp) # 语言技能 5 def regular_language(self): soup = "" for field in self.field_list: if re.search(u"语言.?能.?", field.get_text()): soup = field.find_next("table") if soup and soup.find_all("table"): soup = soup.find_all("table")[-1] break self.language_html = soup.get_text().strip() if soup else "" res = [] id = 1 if soup: rows = soup.find_all("tr") for item in rows: tokens = [i.get_text() for i in item.find_all("td") if i] if len(tokens) != 2: tokens = re.split(u"[::]", item.get_text(), maxsplit=1) if not len(tokens) == 2: tokens = re.split(u"[(\(]", item.get_text()) if len(tokens) == 2: tmp = self.get_languageDict() tmp["itemId"] = str(id) tmp["languageName"] = re.sub(u"[\s+:: ]", "", tokens[0]) # 
.split("(")[0] tmp["languageLevel"] = re.sub(u"[\s+ ]", "", tokens[1]) res.append(tmp) id += 1 self.result["languageList"] = res # 证书 6 def regular_cert(self): soup = "" for field in self.field_list: if field and re.search(u"证书", field.get_text()): soup = field.find_next("table") break self.cert_html = soup.get_text().strip() if soup else "" res = [] id = 1 if soup: items = soup.find_all("tr") for item in items: tokens = item.find_all("td") if len(tokens) < 2: continue tmp = self.get_certDict() tmp["itemId"] = str(id) tmp["certTime"] = self.clean_edu_time(tokens[0].get_text()) certName = tokens[1].get_text().strip() tmp["certName"] = re.sub(u"#.*?#", '', certName) if len(tokens) == 3: tmp["certLevel"] = tokens[2].get_text().strip() else: tmp["certLevel"] = "None" # cert_str = tmp["certName"] # find_level = self.CERT_LEVEL.search(cert_str) # if find_level: # tmp["certLevel"] = find_level.group() # tmp["certName"] = tmp["certName"] # elif len(tokens)>2: # tmp["certLevel"] = tokens[2].get_text().strip() if tmp: res.append(tmp) id += 1 self.result["certList"] = res # 技能 7 def regular_skill(self): """ 技能模块 """ soup = "" for field in self.field_list: if re.search(u"技能", field.get_text()): soup = field.find_next("table") if soup and soup.find_all("table"): soup = soup.find_all("table")[-1] break self.skill_html = soup.get_text().strip() if soup else "" res = [] id = 1 if soup: # items = soup.find_all("table",limit=4)[-1].find_all("tr") if soup.find("table") else [] items = soup.find_all("tr") for item in items: tokens = [token.get_text() for token in item.find_all("td")] if len(tokens) < 2 or re.search(u"名称", tokens[0]): continue tmp = self.get_skillDict() tmp["itemId"] = str(id) tmp["skillName"] = tokens[0].strip().lower() tmp["skillLevel"] = tokens[1].strip() if len(tokens) > 2: tmp["skillDuration"] = tokens[2].strip() else: find_duration = re.search("\d+月|[半一二三四五六七八九十\d]年", item.get_text()) tmp["skillDuration"] = find_duration.group() if find_duration else "None" if 
tmp: res.append(tmp) id += 1 self.result['skillList'] = res # 项目经验 8 def regular_project(self): soup = "" for field in self.field_list: if re.search(u"项目经.", field.get_text()): soup = field.find_next("table") break self.project_html = soup.get_text().strip() if soup else "" res = [] id = 1 if soup: items = soup.find_all("tr") tokens, tmpitem = [], [] for item in items: if item.find("hr"): tokens.append(tmpitem) tmpitem = [] continue elif item: tmpitem.append(item) if tmpitem: tokens.append(tmpitem) for token in tokens: # 解析第一行项目标题 title_str = re.sub(u"[\s\r\n ]", "", token[0].get_text()) tmp = self.get_proDict() tmp["itemId"] = str(id) tmp["proStart"] = self.clean_edu_time( self.JOB_START.search(title_str).group(1)) if self.JOB_START.search(title_str) else "None" tmp["proEnd"] = self.clean_edu_time(self.JOB_END.search(title_str).group(1)) if self.JOB_END.search( title_str) else "None" tmp["proName"] = re.sub("\s+", "", self.PROJ_NAME.search(title_str).group(1)) if self.PROJ_NAME.search( title_str) else title_str # 解析剩余行标签 field_list = [item.find("td") for item in token[1:]] for field in field_list: field_str = field.get_text().strip() if re.search(u"软件环境", field_str): tmp["softwareEnv"] = field.find_next("td").get_text() elif re.search(u"硬件环境", field_str): tmp["hardwareEnv"] = field.find_next("td").get_text() elif re.search(u"开发工具", field_str): tmp["devTool"] = field.find_next("td").get_text() elif re.search(u"项目描述", field_str): tmp["proDesc"] = field.find_next("td").get_text(separator='\n') elif re.search(u"责任描述", field_str): tmp["proDuty"] = field.find_next("td").get_text(separator='\n') if tmp: res.append(tmp) id += 1 self.result['proList'] = res # 培训经历 9 def regular_train(self): soup = "" for field in self.field_list: if re.search(u"培训经.", field.get_text()): soup = field.find_next("table") break self.train_html = soup.get_text().strip() if soup else "" res = [] id = 1 if soup: items = soup.find_all("tr") for item in items: tokens = [item.get_text() for item in 
item.find_all("td") if len(item.get_text()) > 1] if len(tokens) < 3: continue # print res tmp = self.get_trainDict() tmp["itemId"] = str(id) tmp["trainStart"] = self.clean_edu_time(tokens[0].split(u'-')[0]) tmp["trainEnd"] = self.clean_edu_time(tokens[0].split(u"-")[-1]) tmp["trainAgency"] = tokens[1].strip() tmp["trainTitle"] = tokens[2].strip() if len(tokens) > 3: tmp["trainCert"] = tokens[3].strip() tt = item.find_next('td').get_text().strip().split('\\') tmp["trainDesc"] = '/'.join(tt) # print tmp["trainDesc"] res.append(tmp) id += 1 self.result["trainList"] = res # print self.result["trainList"] def regular_private(self): """ 身份证号,联系电话等隐私信息 """ base_info = self.topsoup.get_text() find_phone = self.PHONE.search(base_info) find_email = self.EMAIL.search(base_info) find_qq = self.QQ.search(base_info) find_idNum = self.IDNUM.search(base_info) userName = "" if self.HasName: find_name = self.topsoup.find_previous("tr").find_previous("tr").find("b") if not find_name: find_name = self.topsoup.find_previous("tr").find_previous("tr").find("strong") if find_name and len(find_name.get_text().strip()) < 5: userName = find_name.get_text().strip() self.result["privateInfo"]["userName"] = userName if userName else "" self.result["privateInfo"]["phoneNumber"] = find_phone.group(1) if find_phone else "" self.result["privateInfo"]["email"] = find_email.group(1) if find_email else "" self.result["privateInfo"]["qq"] = find_qq.group(1) if find_qq else "" self.result["privateInfo"]["idNumber"] = find_idNum.group(1) if find_idNum else "" find_key_word = self.soup.find("span", text=re.compile(u"简历关键字")) key_words = "" if find_key_word and find_key_word.find_next("span", "rsblue"): key_words = find_key_word.find_next("span", "rsblue").get_text() elif find_key_word and find_key_word.find_next("td"): key_words = find_key_word.find_next("td").get_text() if key_words and re.search(u"有|熟悉|经验|强|善于|精通|证", key_words): self.result["privateInfo"]["keyWords"] = key_words.strip().split() # 其他信息 
10 def regular_other(self): self.other_html = [] for field in self.field_list: # print field.get_text() if re.search(u"自我介绍|个人简介|亮点|自我评价", field.get_text()): self.result["others"]["selfIntro"] = field.find_previous("table").get_text(separator='\n')[4:].strip() # print self.result["others"]["selfIntro"] self.other_html.append(self.result["others"]["selfIntro"]) elif re.search(u"实践|实习", field.get_text()): self.result["others"]["stuPractice"] = re.sub("\s+", " ", field.find_next("table").get_text().strip()) # print self.result["others"]['stuPractice'] self.other_html.append(self.result["others"]["stuPractice"]) elif re.search(u"校内|校园|社团", field.get_text()): self.result["others"]["schoolExp"] = re.sub("\s+", " ", field.find_next("table").get_text().strip()) self.other_html.append(self.result["others"]["schoolExp"]) elif re.search(u"论文|著作|作品", field.get_text()): self.result["others"]["pubWork"] = self.result["others"]["pubWork"] + "\n" + self.CLEAN_TEXT.sub(" ", field.find_next( "table").get_text().strip()) self.other_html.append(self.result["others"]["pubWork"]) elif re.search(u"奖项|荣誉", field.get_text()): self.result["others"]["gainHoner"] = self.result["others"]["gainHoner"] + "\n" + self.CLEAN_TEXT.sub( " ", field.find_next("table").get_text().strip()) self.other_html.append(self.result["others"]["gainHoner"]) elif re.search(u"兴趣|爱好|特长", field.get_text()): self.result["others"]["otherHobby"] = self.result["others"]["otherHobby"] + "\n" + self.CLEAN_TEXT.sub( " ", field.find_next("table").get_text().strip()) self.other_html.append(self.result["others"]["otherHobby"]) # 附件信息 elif re.search(u"附件", field.get_text().strip()): self.result["others"]["attachment"] = field.find_next("table").get_text() self.other_html.append(self.result["others"]["attachment"]) elif re.search(u"其他信息", field.get_text()): if self.resumeType == 1: self.result["others"]["otherInfo"] = field.find_parent("table").get_text().strip() else: self.result["others"]["otherInfo"] = 
field.find_next("table").get_text().strip() self.other_html.append(self.result["others"]["otherInfo"]) if self.HasName == 1: find_jobPositionName = re.search(u"应聘职位", self.job_for_soup.get_text()) if find_jobPositionName: self.result["others"]["jobPositionFor"] = self.job_for_soup.find_next("span").get_text().strip() self.other_html.append(self.result["others"]["jobPositionFor"]) find_jobIncName = re.search(u"应聘公司", self.job_for_soup.get_text()) if find_jobIncName: self.result["others"]["jobIncNameFor"] = self.job_for_soup.find_next("span").find_next( "span").get_text().strip() self.other_html.append(self.result["others"]["jobIncNameFor"]) find_updateTime = re.search(u"投递时间", self.job_for_soup.get_text()) if self.result["baseInfo"]["updateTime"] == "None" and find_updateTime: self.result["baseInfo"]["updateTime"] = self.job_for_soup.find_next("span").find_next( "span").get_text().strip() elif self.HasName > 1: items = self.job_for_soup.find_all("td", limit=6) for item in items: if re.search(u"应聘职位", item.get_text()): self.result["others"]["jobPositionFor"] = item.find_next_sibling("td").get_text().strip() self.other_html.append(self.result["others"]["jobPositionFor"]) elif re.search(u"应聘公司", item.get_text()): self.result["others"]["jobIncNameFor"] = item.find_next_sibling("td").get_text().strip() self.other_html.append(self.result["others"]["jobIncNameFor"]) elif re.search(u"投递时间", item.get_text()): self.result["baseInfo"]["updateTime"] = item.find_next_sibling("td").get_text().strip() break # 求职信 recommendLetter = self.soup.find("table", {"id": "tabCvletter"}) if recommendLetter: text = recommendLetter.get_text().strip() self.result["others"]["recommendLetter"] = text[5:].strip() self.other_html.append(self.result["others"]["recommendLetter"]) # 针对粘贴简历 find_other_info = self.soup.find("div", "titleLineB", text=re.compile(u"其它信息")) if find_other_info: self.result["others"]["otherInfo"] = find_other_info.find_parent("table").get_text().strip() 
self.other_html.append(self.result["others"]["otherInfo"]) # 工作地址 if re.search(u"((.*))", self.result["others"]["jobPositionFor"]): self.result["others"]["workLoc"] = re.search(u"((.*))", self.result["others"]["jobPositionFor"]).group(1) self.other_html.append(self.result["others"]["workLoc"]) def parser(self, htmlContent=None, fname=None, url=None): self.preprocess(htmlContent, fname, url) self.regular_basic() self.regular_private() self.regular_expect() self.regular_educate() self.regular_workexp() self.regular_skill() self.regular_cert() self.regular_language() self.regular_project() self.regular_train() self.regular_other() return self.result def output(self): res = "\n" for k in self.result: res += k + ":" + "\n" if isinstance(self.result[k], dict): for kk, vv in self.result[k].iteritems(): res += '%1s: %s\n' % (kk, vv) elif isinstance(self.result[k], list): for i, exp in enumerate(self.result[k]): res += "%12s\n" % (str(i + 1)) if isinstance(exp, dict): for kk, vv in exp.iteritems(): res += "%22s: %s\n" % (kk, vv) elif isinstance(exp, tuple): for kk in exp: res += '%22s \n' % (kk) res += " " * 10 + '---' * 10 + '\n' else: res += " " * 10 + "%s\n" % (self.result[k]) return res
""" """ import os import sys import argparse import bz2 from bs4 import BeautifulSoup # with bz2.BZ2File('biosample_set.xml.bz2', 'r') as input: # for i in range(10): # l = input.readline() # print("{}\n".format(l)) # <Id db="BioSample" is_primary="1">SAMN00000002</Id>\n' def primaryId(tag): return tag['db'] == 'BioSample' and tag['is_primary'] with bz2.BZ2File('biosample_set.xml.bz2', 'r') as input: soup = BeautifulSoup(input, 'xml') pi = soup.find_next(primaryId) print("{}".format(pi))
def saveInmateProfile(browser):
    """Scrape the inmate profile page currently open in *browser*, persist the
    inmate to the database, and navigate back to the search screen.

    :param browser: Selenium WebDriver already positioned on a profile page.
    :return: ``"<first> <last>"`` name string of the inmate just saved.
    """
    inmate = Inmate()  # inmate profile
    record = InmateRecord()  # inmate current record
    facility = Facility()
    # find inmate ID, will go in active record
    # NOTE(review): fixed sleep waits for the page to render — presumably a
    # workaround for dynamic content; an explicit WebDriverWait would be safer.
    time.sleep(2)
    element = browser.find_elements_by_class_name("section_data")
    # Second "section_data" element holds the personal-info table; the page
    # nests a tbody inside a tbody — TODO confirm against the live markup.
    backgroundPersonInfo = BeautifulSoup(element[1].get_attribute('innerHTML'), 'html.parser').find("tbody").find("tbody")
    personInfoCells = backgroundPersonInfo.find_all("tr")
    # Each row is "<label> ... <value>"; collapse whitespace, then dispatch on
    # the label text and take the trailing token(s) as the value.
    for ind in range(len(personInfoCells)):
        cell = personInfoCells[ind]
        if not isinstance(cell, NavigableString):
            txt = " ".join(cell.text.strip().split())
            if "Name" in txt:
                # assumes layout "Name <last>, <first> [<middle>]" — the
                # comma is stripped first, so tokens are positional.
                fullName = txt.replace(",", "").split(" ")
                lastName = fullName[1]
                firstName = fullName[2]
                middleName = fullName[-1] if len(fullName) == 4 else None
                inmate.name = Name(firstName, middleName, lastName)
            elif "Age" in txt:
                inmate.age = txt.split(" ")[-1]
                # DOB is approximated from age only (last flag marks it as such
                # — TODO confirm Date's signature).
                inmate.DOB = Date(inmate.age, None, None, True)
            elif "Gender" in txt:
                inmate.sex = txt.split(" ")[-1]
            elif "Ethnicity" in txt:
                inmate.race = txt.split(" ")[-1]
            elif "Hair Color" in txt:
                inmate.hairColor = txt.split(" ")[-1]
            elif "Eye Color" in txt:
                inmate.eyeColor = txt.split(" ")[-1]
            elif "Height" in txt:
                # Height spans the last two tokens (e.g. feet + inches).
                inmate.height = txt.split(" ")[-2] + txt.split(" ")[-1]
            elif "Weight" in txt:
                inmate.weight = txt.split(" ")[-1]
    # The following sibling tbody carries the incarceration details.
    backgroundPersonPrisonInfo = backgroundPersonInfo.find_next("tbody")
    personPrisonInfoCells = backgroundPersonPrisonInfo.find_all("tr")
    for ind in range(len(personPrisonInfoCells)):
        cell = personPrisonInfoCells[ind]
        if not isinstance(cell, NavigableString):
            txt = " ".join(cell.text.strip().split())
            if "DOC Number" in txt:
                # inmate's id given by Colorado Department of Corrections
                record.recordNumber = txt.split(" ")[-1]
            elif "Est. Parole Eligibility Date" in txt:
                # Dates arrive as MM/DD/YYYY; Date() is built year-first.
                dateSplit = txt.split(" ")[-1].split("/")
                if len(dateSplit) > 1:
                    record.paroleEligibilityDate = Date(dateSplit[-1], dateSplit[0], dateSplit[1])
            elif "Next Parole Hearing Date" in txt:
                # Kept as the raw text after the colon, not a Date object.
                dateSplit = txt.split(":")
                if len(dateSplit) > 1:
                    record.nextParoleHearingDate = dateSplit[-1].strip()
            elif "Est. Sentence Discharge Date" in txt:
                dateSplit = txt.split(" ")[-1].split("/")
                if len(dateSplit) > 1:
                    record.estReleaseDate = Date(dateSplit[-1], dateSplit[0], dateSplit[1])
            elif "Current Facility Assignment" in txt:
                facility.name = txt.split(":")[-1].strip()
    # saves profile to the database
    # NOTE(review): only `inmate` is persisted — `record` and `facility` are
    # populated but never written; verify whether that is intentional.
    writeToDB(inmate)
    # Return to the search page for the next profile.
    browser.find_element_by_id("btn_search_txt").click()
    return inmate.name.first + " " + inmate.name.last
import requests from lxml.html import fromstring from bs4 import BeautifulSoup r = requests.get("http://github.com/") print(r.status_code) if r.status_code == 200: tree = fromstring(r.content) a = tree.findtext('.//title') soup = BeautifulSoup(r.text) metas = soup.find_all('meta') get_text = r.text soup = BeautifulSoup(get_text, "html.parser") b = soup.find_next('h1', 'class:listing-name') print(a, metas, b) r = requests.get("https://github.com/") print(r.status_code) if r.status_code == 200: tree = fromstring(r.content) a = tree.findtext('.//title') soup = BeautifulSoup(r.text) metas = soup.find_all('meta') get_text = r.text soup = BeautifulSoup(get_text, "html.parser") b = soup.find_next('h1', 'class:listing-name') print(a, metas, b) r = requests.get("https://www.github.com/") print(r.status_code) if r.status_code == 200:
# For each race entry in URLs: download the page (or read a local mock file)
# and scrape it into the `race` dict (keys are prefixed for output ordering).
for data in URLs:
    race = {}
    link = data['link']
    print("Processing %s" % link)
    pageURL = link
    if MOCK_RACE:
        # MOCK_RACE is a path to a local HTML fixture used instead of the
        # network — handy for offline testing.
        content = BeautifulSoup(open(MOCK_RACE), features="lxml").body
    else:
        content = BeautifulSoup(urllib.request.urlopen(pageURL).read(), features="lxml").body
    # title
    name = content.find_next('h1', {'class': 'pagetitle'}).string.strip()
    if name.startswith('Les '):
        # Drop the French plural article "Les " and the trailing plural 's',
        # then title-case (e.g. "Les Elfes" -> "Elfe").
        name = name[4:-1].title()
    race[u'01Nom'] = name
    # source
    race[u'03Source'] = data['source']
    # reference (page URL)
    race[u'04Référence'] = link
    # racial traits (filled in below)
    race[u'05Traits'] = []
    # NOTE(review): `html` is not defined anywhere in this loop — presumably a
    # module-level variable set earlier; verify it should not be `content`.
    section = jumpTo(html, 'h2', {'class': 'separator'}, u"Traits raciaux standards")
def searchForArtist(title): query = "{title}".format(title=title).replace(" ", "+").replace("&", "and") search = "https://google.com/search?hl={lang}&q={query}+song".format( lang='en', query=query) res = requests.get(search, headers=HEADERS_GET).text with open(r"C:\Users\talsi\Desktop\test.html", 'w', encoding='utf-8') as f: f.write(res) # try: text = res.split('Other recordings of this song') res = text[0] soup = BeautifulSoup(res, 'html.parser') i = 0 songs = [tag.text for tag in soup.find_all(attrs=GOOGLE_SONG_TAG_ATTRS)] while i < len(songs): z = songs[i] if z.lower() != title.lower(): print("z.lower= " + z.lower()) print("title.lower= " + title.lower()) print("what we going to del: " + songs[i]) del songs[i] i -= 1 else: print("we didnt delete: " + z.lower()) print("cause we thought its: " + title.lower()) i += 1 print(songs) i = 0 print("songs[0]= " + songs[0]) print("soup.find({place0})= ".format(place0=songs[0]) + soup.find(attrs=GOOGLE_SONG_TAG_ATTRS, text=songs[0]).text) lastArt = soup.find( attrs=GOOGLE_SONG_TAG_ATTRS, text=songs[0]).find_next(attrs=GOOGLE_ARTISTS_TAG_ATTRS).text artist = lastArt print("lastArt= " + lastArt) artists = [] newSongs = [] nextIsOK = True while i < len(songs): song = songs[i] lastSong = song if nextIsOK: newSongs.append(song) artists.append(artist) print(lastArt[:-7]) print(soup.find_next(text=lastArt[:-7])) print( soup.find(attrs=GOOGLE_SONG_TAG_ATTRS, text=lastSong).find_next(text=lastArt).text) if soup.find(attrs=GOOGLE_SONG_TAG_ATTRS, text=lastSong).find_next( attrs=GOOGLE_ARTISTS_TAG_ATTRS, text=lastArt).find_next( "span").find_next("span").text == soup.find( attrs=GOOGLE_SONG_TAG_ATTRS, text=lastSong).find_next( attrs=GOOGLE_ARTISTS_TAG_ATTRS, text=lastArt).find_next("span").find_next( attrs=GOOGLE_ARTISTS_TAG_ATTRS).text: print("got in") nextIsOK = True artist = soup.find(attrs=GOOGLE_SONG_TAG_ATTRS, text=lastSong).find_next( attrs=GOOGLE_ARTISTS_TAG_ATTRS, text=lastArt).find_next("span").find_next( 
attrs=GOOGLE_ARTISTS_TAG_ATTRS).text else: nextIsOK = False artist = ""
"""Locate e-mail-address text nodes in an HTML snippet with BeautifulSoup."""
import re

from bs4 import BeautifulSoup

email_id_example = """<br/>
<div>The below HTML has the information that has email ids.</div>
[email protected]
<div>[email protected]</div>
<span>[email protected]</span>
"""

soup = BeautifulSoup(email_id_example, "lxml")

# Raw string so \w and \. reach the regex engine verbatim (a non-raw "\w"
# is a DeprecationWarning on modern Python; the compiled pattern is the same).
emailid_regexp = re.compile(
    r"\w[-\w.+]*@([A-Za-z0-9][-A-Za-z0-9]+\.)+[A-Za-z]{2,14}")

first_email_id = soup.find(text=emailid_regexp)
print(first_email_id)

# BUG FIX: the original called soup.find_next(...), which searches from the
# top of the document and therefore yields the *first* match again.  The
# "next" address must be found by chaining find_next() off the first match
# itself.  Guard against find() returning None (no address in the document).
next_email_id = first_email_id.find_next(text=emailid_regexp) if first_email_id else None
print(next_email_id)
class JasssArticle(ASSArticle):
    """A JASSS journal article scraped from the journal web site."""

    # Parsed HTML of the article page.
    bs_article: BeautifulSoup

    def __init__(self, *args, **kwargs):
        """Init article from an url.

        *args
        :param int volume:
        :param int issue:
        :param int article:

        **kwargs
        :param url url: direct article URL, used when no positional args given
        :raises HTTPError: when the direct URL cannot be fetched
        """
        if len(args) == 0:
            req = requests.get(
                kwargs.get('url', ass_scrap_util.get_latest_url()))
            if req.status_code == requests.codes.ok:
                self.url = req.url
                self.bs_article = BeautifulSoup(req.content, 'html5lib')
            else:
                raise HTTPError(req.reason)
        else:
            # Build "<base>/<volume>/<issue>/<article>.html" from the args.
            basic_url = ass_scrap_util.base_url + str(
                args[0]) + ass_scrap_util.separator + str(
                    args[1]) + ass_scrap_util.separator
            req = requests.get(basic_url + str(args[2]) + ass_scrap_util.html)
            self.url = req.url
            if req.status_code == requests.codes.ok:
                self.bs_article = BeautifulSoup(req.content, 'html5lib')
            else:
                # Fall back to the "review<n>.html" variant of the page.
                # BUG FIX: args[2] is an int, so the original
                # str("review" + args[2]) raised TypeError; it also fed the
                # Response object (not its body) to BeautifulSoup.
                self.bs_article = BeautifulSoup(
                    requests.get(basic_url + "review" + str(args[2]) +
                                 ass_scrap_util.html).content, 'html5lib')

    def __repr__(self):
        # The article URL doubles as its printable identity.
        return self.url

    def is_review(self):
        """ Tells if this article is a review or not """
        return "review" in self.__repr__()

    def keywords(self):
        """ Get the keywords from an article

        :return: a list of keyword strings (comma-separated in the meta tag)
        """
        return [
            x.strip()
            for x in self.get_meta_content_with_tag("tags").split(',')
        ]

    def title(self):
        """ Retrieve the title of the article """
        return self.get_meta_content_with_tag()

    def authors(self):
        """ Retrieve the authors of the article

        :return: a list of author strings (semicolon-separated in the meta tag)
        """
        return [
            x.strip()
            for x in self.get_meta_content_with_tag("authors").split(';')
        ]

    def abstract(self):
        """ Retrieve the abstract of the article"""
        the_abstract = self.get_meta_content_with_tag("abstract")
        # A suspiciously short meta abstract means the page stores the real
        # abstract in the body: take the <dl> following the "Abstract" label.
        if len(the_abstract.split()) < 5:
            return str(
                self.bs_article.find(string="Abstract").findNext(
                    "dl").next.contents[0]).strip()
        return the_abstract

    def issn(self):
        # JASSS's fixed ISSN.
        return '1460-7425'

    def doi(self):
        """ Give the DOI stored in meta data

        :return: a unique *string* that represent this article
        """
        if self.is_review():
            return self.__repr__()
        try:
            doi = self.get_meta_content_with_tag("doi")
        except TypeError:
            # No DOI meta tag: fall back to the one printed in the page body.
            doi = self.get_art_content_with_tag("doi")
        return doi

    def _text(self):
        """Return the HTML element that contains the article body."""
        body = self.bs_article.findAll("article")
        if len(body) == 1:
            return body[0]
        else:
            art = self.bs_article.findAll("div", {'class': 'article'})
            if len(art) > 0:
                return art[0]
            else:
                # Oldest layout: fall back to <body>, stripping boilerplate
                # paragraphs and the first two <dd> entries (title/author block).
                if len(art) == 0:
                    art = self.bs_article
                body = art.find("body")
                the_ps = body.findAll("p")
                for ppps in the_ps:
                    ppps.extract()
                dls = body.findAll("dl")
                if len(dls) > 0:
                    dds = dls[0].findAll("dd")
                    if len(dds) > 1:
                        dds[0].extract()
                        dds[1].extract()
                return body

    def text(self):
        """ Text content of the article

        :return: The plain text of the article, bibliography removed
        """
        html_text = self._text()
        # BUG FIX: the original used findAll(), whose ResultSet has no
        # .extract(); find() yields a single Tag (or None) instead.
        bibliography = html_text.find("div", {'class': 'refs'})
        log.debug("Looking for the bibilography div: " + str(bibliography))
        if not bibliography:
            # No dedicated refs div: locate the bibliography heading and drop
            # everything that follows it.
            ref_tag = html_text.findAll(
                "h3", text=ass_scrap_util.jasss_biblio_match)[-1]
            log.debug("Match html tag for bibliography " + str(ref_tag))
            # NOTE(review): extracting while iterating next_siblings mutates
            # the tree mid-iteration — verify all siblings really get removed.
            for n in ref_tag.next_siblings:
                log.debug("Extract " + str(n) + " from the text")
                n.extract()
        else:
            bibliography.extract()
        return ass_scrap_util.text_cleaner(html_text.getText())

    def get_meta_content_with_tag(self, tag="title"):
        """ Retrieve the content of a tag as define by *beautifulsoup*

        :param string tag: the tag to find in the soup
        :return: a string representation of the content of the tag
        :raises TypeError: when no matching meta tag exists (doi() relies on
            this to fall back to the body-scraping variant)
        """
        m_name = ass_scrap_util.jasss_meta_name
        m_content = ass_scrap_util.jasss_meta_content
        # Some JASSS pages upper-case the meta attribute names; detect that
        # once and use the matching case below.
        if self.bs_article.find_next(
                ass_scrap_util.jasss_meta_tag,
                {ass_scrap_util.jasss_meta_name.upper(): "title"}):
            m_name = ass_scrap_util.jasss_meta_name.upper()
            m_content = ass_scrap_util.jasss_meta_content.upper()
        # BUG FIX: initialise so an empty candidate list raises the TypeError
        # (None[...]) that callers expect, not an unhandled NameError.
        meta_context = None
        if isinstance(ass_scrap_util.meta[tag], str):
            meta_context = self.bs_article.find(
                ass_scrap_util.jasss_meta_tag,
                {m_name: ass_scrap_util.meta[tag]})
        else:
            # Several candidate attribute values: take the first that matches.
            for tg in ass_scrap_util.meta[tag]:
                meta_context = self.bs_article.find(
                    ass_scrap_util.jasss_meta_tag, {m_name: tg})
                if meta_context is not None:
                    break
        return meta_context[m_content]

    def get_art_content_with_tag(self, tag="title"):
        """ Retrieve the content of a tag define in the *art* section of JASSS
        article pages

        :param tag:
        :return: a string representation of the content of the tag
        """
        balise: str = "p"
        if tag == "doi":
            balise = "span"
        result = self.bs_article.find(balise, {'class': ass_scrap_util.art[tag]})
        if result is None:
            # Synthesize an identifier from the digits of the URL.
            return "-".join([str(s) for s in self.__repr__() if s.isdigit()])
        if tag == "doi":
            result = result.contents[0].replace('DOI:', '')
        # NOTE(review): for non-doi tags `result` is still a bs4 Tag here, and
        # Tag has no .strip() — confirm this path is only exercised with "doi".
        return result.strip()

    def get_soup(self):
        """
        :return: the soup of the source retrieve by *beautifulsoup*
        """
        return self.bs_article
def parse_ancestries(pages: List[str]) -> Optional[List[object]]:
    """Parse a list of ancestry HTML pages into Ancestry dicts.

    :param pages: raw HTML documents; empty strings are skipped but still
        consume an id slot so ids stay aligned with the input indices.
    :return: list of ``dataclasses.asdict(Ancestry)`` dictionaries.

    The parser walks the bs4 tree element-by-element via ``.next`` — the
    traversal order (headings, <br> separators, text nodes) is what delimits
    each section, so statement order here is load-bearing.
    """
    ancestries: List[Ancestry] = []
    for ind, page in enumerate(pages):
        if page == '':
            continue  # placeholder to properly enumerate "bad" array items
        anc: Ancestry = Ancestry()
        anc.id = ind + 1
        # The page's main content lives inside this fixed-id span.
        whole_text = BeautifulSoup(page, 'html5lib').find(
            'span', {'id': 'ctl00_MainContent_DetailedOutput'})
        # get name: concatenate the text children of the first <h1>.
        name_tags = [
            t for t in whole_text.find_next('h1').children if t.string
        ]
        for m in name_tags:
            if m.string:
                anc.name = ''.join((anc.name, m.string))
        # if this is just a heritage, we don't parse it
        if 'Heritage' in anc.name:
            continue
        # get rarity and traits (rarity is encoded as a trait CSS class)
        uncommon: Tag = whole_text.find_next(class_='traituncommon')
        rare: Tag = whole_text.find_next(class_='traitrare')
        if uncommon:
            anc.rarity = uncommon.string.strip()
        elif rare:
            anc.rarity = rare.string.strip()
        traits: List[Tag] = whole_text.find_all_next(class_='trait')
        anc.traits = [str(t.string) for t in traits if t]
        # get source — assumes "<Book> pg. <page>" format; TODO confirm no
        # multi-source pages exist.
        src: str = whole_text.find_next(
            name='b', text='Source').find_next(name='a').string
        anc.source.book = src.split('pg.')[0].strip()
        anc.source.page = int(src.split('pg.')[1])
        # get description and other entries: walk from after the Source line,
        # accumulating text until the next <h1>; each <h2>/<h3> starts a new
        # named entry.
        m_tag = whole_text.find_next(name='b', text='Source').find_next(name='br')
        d_str = ''
        d_header = ''
        d_entries = []
        while m_tag.next:
            if m_tag.name and m_tag.name == 'h1':
                # Flush the final entry; the header text leaks into d_str, so
                # strip its first occurrence.
                d_str = d_str.replace(d_header, '', 1)
                d_entries.append(AncestryHeader(d_header, d_str.strip()))
                break
            if type(m_tag) == NavigableString:
                d_str = ''.join((d_str, m_tag.string))
            elif m_tag.name == 'br':
                d_str = ''.join((d_str, '\n'))
            elif m_tag.name == 'h2' or m_tag.name == 'h3':
                d_str = d_str.replace(d_header, '', 1)
                d_entries.append(AncestryHeader(d_header, d_str.strip()))
                d_header = m_tag.string.strip()
                d_str = ''
            m_tag = m_tag.next
        anc.description = d_entries
        # Hit Points, Size, Speed, Ability Boosts (Flaws), Languages, Senses, Extra(s)
        # usually in that order (?)
        # then break when m_tag == m_tag.find_next(name='div', class_='clear').previous.previous.previous
        d_str = ''
        # .next.next after each <h2> jumps over the heading's text node to the
        # value that follows it — TODO confirm against the live markup.
        anc.hitPoints = str(
            m_tag.find_next(name='h2', text='Hit Points').next.next)
        anc.size = str(m_tag.find_next(name='h2', text='Size').next.next)
        anc.speed = str(m_tag.find_next(name='h2', text='Speed').next.next)
        # Ability boosts: accumulate <br>-separated lines until the next <h2>.
        boosts_tag = m_tag.find_next(name='h2', text='Ability Boosts').next.next
        while boosts_tag.next:
            if boosts_tag.name and boosts_tag.name == 'h2':
                break
            if type(boosts_tag) == NavigableString:
                d_str = ''.join((d_str, boosts_tag.string))
            elif boosts_tag.name == 'br':
                d_str = ''.join((d_str, '\n'))
            boosts_tag = boosts_tag.next
        anc.abilityBoosts = [d for d in d_str.split('\n') if d]
        # Ability flaws are optional; same accumulation scheme as boosts.
        flaws_tag = m_tag.find_next(name='h2', text='Ability Flaw(s)')
        if flaws_tag:
            d_str = ''  # reset
            flaws_tag = flaws_tag.next.next
            while flaws_tag.next:
                if flaws_tag.name and flaws_tag.name == 'h2':
                    break
                if type(flaws_tag) == NavigableString:
                    d_str = ''.join((d_str, flaws_tag.string))
                elif flaws_tag.name == 'br':
                    d_str = ''.join((d_str, '\n'))
                flaws_tag = flaws_tag.next
            anc.abilityFlaws = [d for d in d_str.split('\n') if d]
        # Languages: one entry per <br>-separated line; the final pending line
        # is flushed when the next <h2> is reached.
        lang_tag = m_tag.find_next(name='h2', text='Languages').next_sibling
        d_str = ''
        while lang_tag.next:
            if lang_tag.name and lang_tag.name == 'h2':
                anc.languages.append(d_str)
                break
            if type(lang_tag) == NavigableString:
                d_str = ''.join((d_str, lang_tag.string))
            elif lang_tag.name == 'br':
                anc.languages.append(d_str)
                d_str = ''
            lang_tag = lang_tag.next
        # Senses: the page only names the sense; the rules text is filled in
        # from these hard-coded strings.
        dvision_tag = m_tag.find_next(name='h2', text='Darkvision')
        ll_tag = m_tag.find_next(name='h2', text='Low-Light Vision')
        if dvision_tag:
            anc.senses.append(
                AncestryHeader(
                    'Darkvision',
                    'You can see in darkness and dim light just as well as you can see in bright light, though your vision in darkness is in black and white.'
                ))
        elif ll_tag:
            anc.senses.append(
                AncestryHeader(
                    'Low-Light Vision',
                    'You can see in dim light as though it were bright light, so you ignore the concealed condition due to dim light.'
                ))
        # the rest are extras: every remaining <h2> section after Languages,
        # up to the last child of its parent container.
        extras_tag: Tag = lang_tag.previous.find_next('h2')
        if extras_tag:
            end_tag = [x for x in extras_tag.parent.children][-1]
            # d_str holds plain text, or a table parsed into nested lists.
            d_str: Union[str, List[List[str]]] = ''
            d_header = ''
            while extras_tag.next:
                if extras_tag == end_tag.next:
                    if type(d_str) == str:
                        anc.extras.append(
                            AncestryHeader(d_header,
                                           d_str.replace(d_header, '', 1)))
                    elif type(d_str) == list:
                        anc.extras.append(AncestryHeader(d_header, '', d_str))
                    break
                if type(extras_tag) == NavigableString:
                    d_str = ''.join((d_str, extras_tag.string))
                elif extras_tag.name == 'br':
                    d_str = ''.join((d_str, '\n'))
                elif extras_tag.name == 'h2':
                    if d_str != '':
                        anc.extras.append(
                            AncestryHeader(d_header,
                                           d_str.replace(d_header, '', 1)))
                    d_header = str(extras_tag.string)
                    d_str = ''
                elif extras_tag.name == 'table':
                    d_str = parse_table_into_list(extras_tag)
                # Skip over a table's subtree (jump to the element just before
                # the next sibling) so its cells are not re-visited.
                extras_tag = extras_tag.next_sibling.previous if extras_tag.next_sibling else extras_tag
                extras_tag = extras_tag.next
        ancestries.append(dataclasses.asdict(anc))
    return ancestries
# hacks if value == 'Utilisation des objets magiques': value = 'Utilisation d\'objets magiques' elif value == 'Connaissances (mystère)': value = 'Connaissances (mystères)' if value == 'Connaissances (toutes)' or value == 'Connaissances (tous les domaines)' or value == 'Connaissances (au choix, chaque compétence devant être prise séparément)': for c in CONN: cl['CompétencesDeClasse'].append({'Compétence': c}) else: cl['CompétencesDeClasse'].append({'Compétence': value.strip()}) elif s.name == 'br': break # tableau (progression) rows = content.find_next('table', {"class": "tablo"}).find_all('tr') maxSpellLvl = 0 minSpellLvl = 1 if 'spellLvl' in data.keys(): maxSpellLvl = data['spellLvl'] if 'spellLvl0' in data.keys() and data['spellLvl0']: minSpellLvl = 0 cl['Progression'] = [] for r in rows: # ignorer les en-têtes if r.has_attr('class') and (r['class'][0] == 'titre' or r['class'][0] == 'soustitre'): continue idx = 0
def _find_name(self, name_column: BeautifulSoup) -> str: # posrela_tag = player_row.find_next('td', {'class': 'posrela'}) name_tag = name_column.find_next('a', {'class': 'spielprofil_tooltip'}) return name_tag.getText()