Example #1
def get_match_info(match):
    url = match.get('href')
    url = head + url
    driver = webdriver.Chrome()
    driver.get(url)
    data = BeautifulSoup(driver.page_source,"lxml")
    name = data.find("span",attrs = {"class":"team-header-name"}).string
    print(name)
    Team_stat = data.find("dl",attrs = {"class":"stats"})
    data.find_next()
    Team_stat_details = Team_stat.findChildren("dt")
    t={}
    t["Team_name"]=name
    for stats in Team_stat_details:
        print(stats.string)
        print(stats.find_next("dd").string)
        if(stats.string=="Discipline"):
            Discipline ={}
            Discipline["yellow-card"] = (data.find("span",attrs = {"class":"yellow-card-box"}).string)
            Discipline["red-card"] = (data.find("span",attrs = {"class":"red-card-box"}).string)
            t[stats.string] = Discipline
        else:
            t[stats.string] = (stats.find_next("dd").string)
    record = json.load(open(r"U:\Projects\PythonApplication7\PythonApplication7\infos.json", 'r'))
    record["info_list"].append(t)
    json.dump(record, open(r"U:\Projects\PythonApplication7\PythonApplication7\infos.json", 'w'))
    driver.close()
    print('get\n')
Example #2
def recent_tournaments():
    url = 'https://dota2.ru/esport/tournaments/'
    url = urllib.request.urlopen(url)
    soup = BeautifulSoup(url, "lxml")
    soup = soup.find('div', {'class': 'esport-tournament-list'})
    soup = soup.find_next('div', {'class': 'esport-tournament-list'})
    name = 1
    summary = {}
    summary_t = []
    while soup.find_next('div',
                         {'class': 'esport-tournament-list-single'}) is not None:
        soup = soup.find_next('div',
                              {'class': 'esport-tournament-list-single'})
        name = soup.find('div', {'class': 'title'})
        name = name.find('a')
        name = re.sub('[\r\n ]', '', name.get_text())
        date = soup.find('div', {'class': 'date'})
        date = date.find_next('div').get_text()
        prize = soup.find('div', {'class': 'prize'})
        prize = re.sub('[\r\n ]', '', prize.get_text())
        if prize != "Приглашениенаосновнойэтап":
            summary['name'] = name.strip()
            summary['date'] = date.strip()
            summary['prize'] = prize.strip()
            summary_t.append(summary)
        summary = {}
    print(summary_t)
    return summary_t
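# Hedged usage sketch (not part of the original snippet): recent_tournaments()
# already prints its result, so we only count the entries here.
tournaments = recent_tournaments()
print(len(tournaments), "tournaments scraped")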
Example #3
def scrape(category, targetSections, targetArticles):
    url = fetchUrl(category, targetArticles)
    if not url:
        return 0
    site = fetch(url)
    soup = BeautifulSoup(site.text, 'html.parser').find('div', class_='component component-news-article').find(
        'ul').find_next('p')
    soup = soup.find_next(lambda tag: tag.name == 'span' and any(x in tag.text for x in targetSections))
    return soup.find_next('p') if soup else 0
Example #4
def job_traverse_all_pages(browser, url, current_page=0):
    browser.get(url)
    js_scroll_to_bottom = '''
         var jobPane = document.querySelector(".jobs-search-results");
         jobPane.scrollIntoView();
         jobPane.scrollTo(0,jobPane.scrollHeight);
         '''
    # LinkedIn won't show you up to 25 job listings right away due to
    # disgusting JS infested UI design
    browser.execute_script(js_scroll_to_bottom)
    time.sleep(1.0)
    page = BeautifulSoup(browser.page_source, features="html.parser")

    links = list(set(get_job_links(page)))
    # list(set(foo)) removes duplicates
    url_nextpage = url_job_pages + "&start=" + str(current_page * 25)
    # LinkedIn paginates its jobs every 25 listings
    current_page += 1
    time.sleep(random.uniform(0.2, 0.9))  # random sleep

    if len(links) < 25:  # if there are fewer than 25 job listings, we
        # assume there's no next page
        return links
    else:
        return links + job_traverse_all_pages(
            browser, url_nextpage, current_page)
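# Hedged usage sketch (not from the original script): `url_job_pages` and
# `get_job_links` are assumed to be defined elsewhere, and the Chrome session
# is assumed to be logged in to LinkedIn already.
from selenium import webdriver

browser = webdriver.Chrome()
all_links = job_traverse_all_pages(browser, url_job_pages)
print(len(all_links), "unique job links collected")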
Example #5
def search():
    # Determine the location to be searched
    #city = input("Your Town/City: ")   Put this in later to replace "town" variable
    town = "greenville"
    craigslist = 'http://' + town + ".craigslist.org/search/zip"
    print(craigslist)
    r = requests.get(craigslist).text

    # Scraping the site
    first_file = "C:\\Users\\Sam\\Documents\\first.csv"

    soup = BeautifulSoup(r, 'html.parser')
    not_dup = []
    for name_box in soup.find_all("p"):  # iterate through every <p> result
        name = name_box.text
        if name not in not_dup:
            not_dup.append(name)
            with open(first_file, 'a', newline='') as csv_file:
                writer = csv.writer(csv_file)
                writer.writerow([name])
            print(name)

    print("done")
Example #6
def parse_me(filename):
    file = open(os.getcwd() + filename)  # Open the file
    soup = BeautifulSoup(file, 'lxml')

    links = soup.find_all('a')  # Find all the links that will contain names
    headers = soup.find('h3')
    # print(headers.find_next('h3'))
    female_name_base_list = ['Любовь']  # List of female names <exceptions>
    male_name_base_list = ['Никита', 'Лёва',
                           'Илья']  # List of male names <exceptions>

    male_list = {}  # Dictionaries keyed by year
    female_list = {}

    node = soup.find('a')
    while node:
        name = node.get_text()
        if node.find_next() and node.find_next().name == 'h3':
            break
        node = node.find_next('a')
Example #7
def scrape_country_data(country_data: BeautifulSoup) -> Dict[str, Any]:
    """Scrapes all data from a table as a soup and returns the dict"""
    tbody = country_data.find_next('tbody')
    rows = tbody.find_all('tr')
    data_dict = {2016: {}, 2017: {}, 2018: {}}
    # Map each output field to the table row that holds it
    fields = {
        # Population
        'midyear_pop': 1,
        'growth_rate': 2,
        # Fertility
        'total_fertility_rate': 4,
        'crude_birth_rate': 5,
        'births': 6,
        # Mortality
        'life_expectancy': 8,
        'infant_mortality_rate': 9,
        'under_5_mortality_rate': 10,
        'crude_death_rate': 11,
        'deaths': 12,
        # Migration
        'net_migration_rate': 14,
        'net_num_migrants': 15,
    }
    year_index = 4  # column holding the 2016 value; one column per year after
    for key in data_dict.keys():
        for field, row in fields.items():
            cell = rows[row].find_all('td')[year_index]
            data_dict[key][field] = float(cell.get_text().replace(',', ''))
        year_index += 1
    return data_dict
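# Hedged usage sketch: CENSUS_URL is a placeholder for a page that contains
# the demographic table this function expects.
import requests
from bs4 import BeautifulSoup

table_soup = BeautifulSoup(requests.get(CENSUS_URL).text, 'html.parser')
print(scrape_country_data(table_soup)[2017]['midyear_pop'])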
Example #8
def get_links(urls):
    """
    Gets all of the links based on a list of urls

    :return: Lists of links to individual movies.

    """
    links = []
    for url in urls:
        r = requests.get(url)
        soup = Bs(r.text, 'html.parser')

        # Only one table on the website
        table = soup.find_next('tbody')
        links.extend(table.find_all('a'))
    return links
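# Hedged usage sketch: the URL below is a placeholder, not a page from the
# original project.
movie_links = get_links(["https://example.com/movies?page=1"])
for a in movie_links:
    print(a.get("href"))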
Example #11
def bash_rand(m):
    """Get a random quote from bash.org"""
    resp = get_url(m, "http://bash.org?random1")
    soup = BeautifulSoup(resp, features="html.parser")
    raw = soup.find(class_="qt")
    if raw:
        meta = soup.find(class_="quote")
        while True:
            if not raw:
                bash_rand(m)
                return
            lines = raw.get_text().splitlines()
            if len(lines) <= 5:
                break
            raw = raw.find_next(class_="qt")
            meta = meta.find_next(class_="quote")
        format_quote(m, lines, meta)
    else:
        m.bot.private_message(m.location, "Could not find bash quote.")
Example #12
def refresh_login(global_config):
    global xcsrf_token, cookie

    url = "http://xn--v9x.net/"
    loginUrl = url + "login/"
    imageUrl = url + "image/"
    header = {
        'User-Agent':
        "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/94.0.4606.61 Safari/537.36",
    }
    imageRequest = {
        "optype": "get_images",
        "category": "",
        "cached_images": []
    }

    initr = requests.get(url, headers=header)
    header['Referer'] = url
    loginGet = requests.get(loginUrl, headers=header, cookies=initr.cookies)
    loginParse = BeautifulSoup(loginGet.text, "html.parser")
    csrfMiddle = loginParse.find("input", attrs={"name": "csrfmiddlewaretoken"})["value"]
    header['Referer'] = loginUrl

    loginForm = {
        'csrfmiddlewaretoken': csrfMiddle,
        'Email': global_config["XNV9X_EMAIL"],
        "Password": global_config["XNV9X_PASSWORD"],
    }

    loginPost = requests.post(loginUrl,
                              data=loginForm,
                              cookies=loginGet.cookies,
                              headers=header)
    header["Referer"] = url
    imageGet = requests.get(imageUrl, cookies=loginPost.cookies, headers=header)
    print(re.findall(r"\"X-CSRFToken\": '(.*?)' },", imageGet.text))

    print(str(imageGet.cookies))
Example #13
def find_processor(tag):

    # Link to HPE PartSurfer
    url = "https://partsurfer.hpe.com/Search.aspx?searchText={}".format(tag)

    try:
        response = requests.get(url, verify=False)
        soap = BeautifulSoup(response.text, "html.parser")
        previous_tag = soap.find_next("span")
        processor = "Nao encontrado."

        for item in soap.find_all("span"):
            if "logic cpu" in item.text.lower(
            ) and "proc" in previous_tag.text.lower():
                processor = previous_tag.text
            previous_tag = item

        # In some cases HP's PartSurfer doesn't generate the Logic CPU column
        # In that case we search directly for the term sps-proc
        if "Nao encontrado" in processor:
            for item in soap.find_all("span"):
                if "sps-proc" in item.text.lower():
                    processor = item.text

        with open("processadores.csv", mode="a") as f:
            f.write("{};{}\n".format(tag, processor))

        print("Serial: {}, Processador: {}".format(tag, processor))

    except Exception as e:

        if isinstance(e, requests.exceptions.ConnectionError):
            raise e

        else:
            print("Serial: {}, falha ao obter processador.".format(tag))
            return
Example #14
def get_author(image_id):
    html = requests.get("http://en.fotolia.com/id/" + str(image_id)).content
    print "http://en.fotolia.com/id/" + str(image_id)
    html_td = BeautifulSoup(html, "html.parser").find("div", class_="content-preview")
    return html_td.find_next('a').string
Example #15
# iterate over each page
for data in URLs:
    cl = {}
    
    link = data['link']
    
    print("Processing %s" % link)
    pageURL = link
    
    if MOCK_CLASS:
        content = BeautifulSoup(open(MOCK_CLASS),features="lxml").body
    else:
        content = BeautifulSoup(urllib.request.urlopen(pageURL).read(),features="lxml").body

    # title
    name = cleanName(content.find_next('caption').string.strip())
    cl[u'Nom'] = name

    # reference
    cl[u'Référence'] = link
    
    # prestige
    if 'prestige' in data.keys() and data['prestige']:
        cl[u'Prestige'] = True
    
    # source
    cl[u'Source'] = data['source']

    # description
    descr = findAfter(content, "div", {"class": "presentation"}, 'i')
    cl[u'Description'] = descr
Example #16
     continue
 print('{} {}'.format(brand, a[0]))
 refName = a[0]
 for currentLoc in loc.keys():
     url = "https://www.amazon.{local}/s?k=".format(
         local=loc[currentLoc])
     azurl = "{}{}+{}".format(url, brand, refName.replace(' ', '+'))
     rq = s.get(azurl)
     if rq.status_code != 200:
         print(str(rq.status_code) + ' ' + rq.text)
     soup = BeautifulSoup(rq.text, 'html5lib')
     articles = soup.find("span", {'class': 'a-size-medium'})
     prices = soup.find('span', {'class': 'a-price-whole'})
     if prices is None:
         articles = soup.find("span", {'class': 'a-size-medium'})
         prices = soup.find_next('span', {'class': 'a-price-whole'})
     if prices is None or articles is None:
         print("No item found")
     else:
         if force == True or (cur.execute(
                 'SELECT amName FROM refprices WHERE locale = "{currentloc}" AND name = "{refname}" '
                 .format(currentloc=currentLoc,
                         refname=refName)).fetchone() == str(
                             articles.contents[0])):
             print("{price} ---- {item}".format(
                 item=articles.contents[0], price=prices.contents[0]))
             item = (str(float(prices.contents[0].replace(",",
                                                          "."))), brand,
                     refName, str(articles.contents[0]), currentLoc,
                     datetime.now().strftime("%Y-%m-%d %H:%M:%S.%f"))
             cur.execute('INSERT INTO refprices VALUES (?,?,?,?,?,?)',
                         item)
Example #17
 def _find_number(self, number_column: BeautifulSoup) -> str:
     number_tag = number_column.find_next("div", {"class": "rn_nummer"})
     return number_tag.getText() 
Example #18
    def handle_endtag(self, tag):
        if tag != "script":
            print("Encountered an end tag:", tag)
        self.prev_tag.pop()

    def handle_data(self, data):
        if self.prev_tag and self.prev_tag[-1] != "script":
            print("Encountered data:", data)


myparser = MyHTMLParser()
myparser.feed(raw)
from bs4 import BeautifulSoup
soup = BeautifulSoup(raw, 'html.parser')
soup.prettify()
soup.title
soup.title.name
soup.title.string
soup.p
soup.findall('div')
soup.find_all('div')
soup.find_all('div', attrs='class')
soup.find_all('div', attrs='center-buttons')
soup.find_all('div', attrs='fixed-recipe-card')
soup.find_all('article', attrs='fixed-recipe-card')
soup.find('article', attrs='fixed-recipe-card')
soup.find_next('article', attrs='fixed-recipe-card')
soup.find_next('article')
soup.find_all_next('article', attrs='fixed-recipe-card')
soup.find_all('article', attrs='fixed-recipe-card')
Example #19
def syntax_color(src):
    html = highlight(src, PlotDeviceLexer(), HtmlFormatter())
    soup = BeautifulSoup(html, "html5lib")
    return soup.find_next("pre").extract()
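# Hedged usage sketch: syntax_color() returns the extracted <pre> Tag, so it
# can be embedded in another document; the source string here is arbitrary.
pre_tag = syntax_color("print('hello')")
print(pre_tag.prettify())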
Example #20
			uid = recipeId.get("data-typespecificid")
			recipe["uid"] = uid
			title = recipePage.find("h1", id = "itemTitle")
			recipe["name"] = title.string
			print("\n\n" + title.string + "[" + uid + "]")
			print(link)
			photo = recipePage.find("img", id = "imgPhoto")
			recipe["photo"] = photo.get("src")
			print(photo.get("src"))

			# Create empty ingredients list
			ingredients = [ ]
			names = recipePage.find_all("span", "ingredient-name")
			for name in names:
				if name.string != "" and name.string != " ":
					split = name.string.split(",")
					ingredient = split[0]
					ingredients.append(ingredient)
					# tokens = nltk.word_tokenize(ingredients)
					# print(tokens)
					# tagged = nltk.pos_tag(tokens)
					# f.write(ingredients+"\n")
					print(ingredient)
			recipe["ingredients"] = ingredients
			jsonString = json.dumps(recipe) + ",\n"
			f.write(jsonString)
			print(jsonString)
		divs = divs.find_next("div", "hub-list-view")

Example #21
class domain_com(log.csv_):
    def __init__(self, request):
        self.request = request
        session = requests.Session()
        session.max_redirects = 30
        try:
            web_page = session.get(self.request, allow_redirects=True).text

        except requests.exceptions.MissingSchema:
            print('Type correct url. Did you forget http / https?')
            raise SystemExit

        self.soup = BeautifulSoup(web_page, 'lxml')

    def scrape_property_url(self):
        href_list = []
        try:
            count_property = self.soup.find('strong')
            count_property = re.findall(r'(\d+).+', count_property.string)

        except AttributeError:
            print('Type correct url')
            raise SystemExit

        count_pages = math.ceil(int(count_property[0]) / 20)
        body = self.soup.find('div', 'css-1mf5g4s')
        href2 = re.findall(r'href="(.+?)"', str(body))
        for href in href2:
            if href not in href_list:
                href_list.append(href)
        return href_list, count_pages, count_property

    def direct_to_property(self, page):

        hrefs = self.scrape_property_url()[0]
        for href in hrefs:
            compile_domain(href)
        if page == 1:
            self.next_page()

    def next_page(self):
        page = 2
        pages = int(self.scrape_property_url()[1])
        while pages >= page:
            main_url = self.request + '&page=' + str(page)
            url = domain_com(main_url)
            url.direct_to_property(page)
            page += 1
        print('Done!')

    def buy_rent(self):
        tags = self.soup.find_all('span', class_='css-0')
        if tags[1].string == 'Sale' or tags[1].string == 'New Homes':
            return 'Buy'

        elif tags[1].string == 'Rent':
            return 'Rent'

    def bedBathCarSquare_count(self):
        try:
            tag = self.soup.find('span', 'css-9fxapx', string='Beds')
            bed_tag = (tag.find_parent())
            try:
                beds = list(bed_tag.strings)[0]
            except AttributeError:
                print('Beds not found')
                beds = '-'

            bath_tag = tag.find_next('span', 'css-1rzse3v')
            try:
                baths = list(bath_tag.strings)[0]
            except AttributeError:
                print('Baths not found')
                baths = '-'

            car_tag = bath_tag.find_next('span', 'css-1rzse3v')
            try:
                cars = list(car_tag.strings)[0]
            except AttributeError:
                print('Car places not found')
                cars = '-'

            square_tag = car_tag.find_next('span', 'css-1rzse3v')
            try:
                square = list(square_tag.strings)[0]
            except AttributeError:
                print('Square not found')
                square = '-'

            try:
                if int(square) < 15:
                    square = '-'
            except ValueError:
                pass

            return beds, baths, cars, square

        except AttributeError:
            try:
                square_tag = self.soup.find_next('span', 'css-1rzse3v')
                square = list(square_tag.strings)[0]
                try:
                    if int(square) < 15:
                        square = 'None'

                except ValueError:
                    pass

            except AttributeError:
                square = 'None'

            return '-', '-', '-', square

    def agent_name(self):
        try:
            agent = self.soup.find(
                'a', 'is-a-link listing-details__agent-details-agent-name')
            return agent.string
        except AttributeError:
            print('Agent not found')
            return 'None'

    def property_addr(self):
        addr = self.soup.find('h1', 'listing-details__listing-summary-address')
        try:
            return addr.string
        except AttributeError:
            print('Address not found')
            return 'None'

    def property_type(self):
        try:
            tag = self.soup.find(
                'span', 'listing-details__property-type-features-text').string
        except AttributeError:
            try:
                tag = self.soup.find('p',
                                     'listing-details__property-type').string

            except AttributeError:
                print('Property Type not found')
                return 'None'

        return tag

    def price_buy(self):
        price = self.soup.find('div', class_='listing-details__summary-title')
        try:
            return price.string

        except AttributeError:
            return 'Auction / No price'

    def price_rent(self):
        price = self.soup.find('div', 'listing-details__summary-title')
        try:
            return price.string

        except AttributeError:
            print('Property Price not found')
            return 'None'

    def property_features(self):
        try:
            features = [
                feature.string for feature in self.soup.find_all(
                    'li', 'listing-details__additional-features-listing')
            ]
        except AttributeError:
            print('Features not found')
            return 'None'

        if not features:
            print('Features not found')
            features = 'None'

        return features

    def property_description(self):
        try:
            full_desc = ''
            description = self.soup.find('div', 'listing-details__description')
            description = description.find_all('p')
            for desc in description:
                full_desc += desc.string

        except AttributeError:
            full_desc = 'None'
            print('Description not found')

        except TypeError:
            full_desc = 'None'
            print('Description not found')

        return full_desc
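# Hedged usage sketch (not from the original project): the search URL is a
# placeholder for a domain.com.au results page.
scraper = domain_com("https://www.domain.com.au/sale/?suburb=example")
hrefs, pages, count = scraper.scrape_property_url()
print(pages, "pages,", len(hrefs), "property links on the first page")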
Example #22
class CvParser51Job(CvTopParser):
    """
    对51job的简历进行解析
    """
    def __init__(self):


        CvTopParser.__init__(self)

        self.result = OrderedDict()
        self.PAY = re.compile(u"(\d+[\s\-])?\d+元")
        self.UPDATETIME = re.compile("(更新日期|更新时间)[::\s](\d{4}.\d{2}.\d{2})")
        self.ADDR = re.compile(u"居住地[::\s](\S+)")
        self.JOB_START = re.compile(u"(\d+.\d+?)--")
        self.JOB_END = re.compile(u"--(\d+.\d+)")
        self.JOB_DURATION = re.compile(u"\[(.+?)\]")
        self.INC_SCALE = re.compile(u"\d+-\d+人|\d+人[以上下]?")
        self.INC_NAME = re.compile(u"[::>](\S+?)[\(\[【\r\n ]")
        self.JOB_DEPARTMENT = re.compile(u"部门[:\s:](\S+?)",re.S)
        self.PROJ_NAME = re.compile(u"[::](\S+?$)")
        


    def preprocess(self,htmlContent=None,fname=None,url=None):
        if url is not None:
            self.html = urlopen(url).read().decode('utf-8')
        elif htmlContent:
            self.html = htmlContent
        elif fname:
            self.html = codecs.open(fname,'rb','gb18030').read()
        else:
            raise Exception("input error")

        if re.search(u"已被(求职者)?删除|无法查看",self.html):
            raise Exception("error: input illegal cv ")

        self.soup = BeautifulSoup(self.html,"lxml")

        
        # No contact info and no match score
        if self.soup.find("title") and re.search(u"简历ID",self.soup.find("title").get_text()):
            self.HasName = 0
            self.resume = self.soup.find('div',{"id":"divResume"})
            self.topsoup = self.resume.find("table").find("table")
            self.field_list = self.resume.find_all("td","cvtitle")

        # Applying to this company
        elif self.soup.find("div","titleLineB") and self.soup.find(name="span",text=re.compile(u"应聘职位")):
            self.HasName = 3
            self.job_for_soup = self.soup.find(name="span",text = re.compile(u"应聘职位")).find_previous("table")
            self.topsoup = self.soup.find(name="td",text=re.compile(u"居住地:")).find_previous("table")
            self.resume = self.topsoup.find_parent("table").find_parent("table")
            self.field_list = self.resume.find_all("div","titleLineB")
            if not self.field_list:
                self.field_list = self.resume.find("td","cvtitle")

        # Has a target position/company and shows a match score
        elif self.soup.find("div",{"id":"divHead"}):
            self.HasName = 1  
            self.job_for_soup = self.soup.find("div",{"id":"divHead"}).find("td")
            self.topsoup = self.soup.find(name="td",text=re.compile(u"居住地:")).find_parent("table")
            self.resume = self.topsoup.find_parent("table").find_parent("table")
            self.field_list = self.resume.find_all("td","cvtitle")


        # Has contact info but no match score
        elif self.soup.find_all(name="td",text=re.compile(u"E-mail:"),limit=10):
            self.HasName = 2   
            find_job_for = self.soup.find(name="span",text=re.compile(u"应聘职位"))
            if find_job_for:
                self.job_for_soup = self.soup.find(name="span",text = re.compile(u"应聘职位")).find_previous("table")
            else:
                self.job_for_soup = BeautifulSoup()

            self.topsoup = self.soup.find(name="td",text=re.compile(u"居住地:")).find_previous("table")
            self.resume = self.topsoup.find_parent("table").find_parent("table")
            self.field_list = self.resume.find_all("td","cvtitle")
            if not self.field_list:
                self.field_list = self.resume.find_all("div","titleLineB")


        self.result.clear()
        self.result["cvFrom"] =  "51job"
        self.result["privateInfo"] = {}
        
        # Target job and company, self-introduction, hobbies, campus
        # activities, honors, stock, subsidies and other extra info
        self.result["others"] = {}
        


    def regular_basic(self):
        """
        解析基本信息
        """

        res = OrderedDict()
        
        find_update_time = self.soup.find("span",{"id":"lblResumeUpdateTime"})
        if find_update_time:
           find_update_time = find_update_time.get_text() 
        elif self.UPDATETIME.search(self.html):
            find_update_time = self.UPDATETIME.search(self.html).group()
        
        base_info = self.topsoup.get_text()
        find_cv_id = self.CV_ID.search(base_info)
        res["cvId"] = find_cv_id.group(1).strip() if find_cv_id else "None"

        
        res["updateTime"] = find_update_time.split(u":")[-1].strip() if find_update_time else "None"     

        base_info1,base_info2 = "",""

        if self.HasName!=3 and self.topsoup.find_next(name="b",text=u"最近工作"):
            base_info1 = self.topsoup.find_next(name="b",text=re.compile(u"最近工作")).find_parent("table")
            base_info2 = self.topsoup.find_next(name="b",text=re.compile(u"学历")).find_parent("table")
        elif self.topsoup.find_next("div",text=re.compile(u"最近工作")):
            base_info1 = self.topsoup.find_next(name="div",text=re.compile(u"最近工作")).find_next("table")
            base_info2 = base_info1.find_next("table")
        
        if base_info1:
            tokens = base_info1.find_all("tr")
            for token in tokens:
                items = token.find_all("td")
                if len(items)==2:
                    if re.search(u"公.?司",items[0].get_text()):
                        res["nowInc"] = items[1].get_text().strip()
                    elif re.search(u"行.?业",items[0].get_text()):
                        res["nowIndustry"] = items[1].get_text().strip()
                    elif re.search(u"职.?位",items[0].get_text()):
                        res["nowPosition"] = items[1].get_text().strip()

        if base_info2:
            tokens = base_info2.find_all("tr")
            for token in tokens:
                items = token.find_all("td")
                if len(items)==2:
                    if re.search(u"学.?历",items[0].get_text()):
                        res["nowDiploma"] = items[1].get_text().strip()
                    elif re.search(u"专.?业",items[0].get_text()):
                        res["recentMajorName"] = items[1].get_text().strip()
                    elif re.search(u"学.?校",items[0].get_text()):
                        res["recentSchName"] = items[1].get_text().strip()


        find_sex = self.SEX.search(base_info)
        res["gender"]= find_sex.group() if find_sex else "0"

        find_age =  self.AGE.search(base_info)
        res["age"] = find_age.group() if find_age else "0"
       
        find_dob = self.DOB.search(base_info)
        res["dob"] = find_dob.group(1) if find_dob else "None"

       
        try:
            res["nowWorkAge"] = self.topsoup.find("span","blue").find("b").get_text().split(u"|")[0] 
        except AttributeError:
            res["nowWorkAge"] = self.topsoup.find("span","blue1").find("b").get_text().split(u"|")[0] 

        if not re.search(u"经验|在读|应届|年",res["nowWorkAge"]):
            res["nowWorkAge"] = "None"
        
        
        if "nowDiploma" not in res:
            find_degree = self.DEGREE.search(base_info)
            res["nowDiploma"] = find_degree.group() if find_degree else "None"

        find_marriage = self.MARRIAGE.search(base_info)
        res["marriage"] = find_marriage.group() if find_marriage else "None"
        

        find_politic = self.POLITIC.search(base_info)
        res["nowPolistatus"] = find_politic.group() if find_politic else u"群众"

        # Residence and household registration (hukou)
        items = self.topsoup.find_all("td",limit=8)
        for item in items:
            if re.search(u"居住地",item.get_text()) and item.find_next_sibling("td"):
                res["nowAddress"] = item.find_next_sibling("td").get_text().strip()
            elif re.search(u"户.{0,3}口",item.get_text()) and item.find_next_sibling("td"):
                res["nowHukou"] = item.find_next_sibling("td").get_text().strip()
            elif re.search(u"地.址",item.get_text()) and item.find_next_sibling("td"):
                res["nowAddressDetail"] = item.find_next_sibling("td").get_text().strip()

        find_height = self.HEIGHT.search(base_info)
        res["height"] = find_height.group(1) if find_height else "None"
        
        if base_info2:
            find_benefit = base_info2.find_next("table").find_next("td",text=re.compile(u"基本工资|目前薪资"))
            if find_benefit:
                tmpsoup = find_benefit.find_previous("table")
                items = tmpsoup.find_all("td")
                for item in items:
                    # Base salary and other benefit info
                    if re.search(u"工资|薪资",item.get_text()):
                        res["nowSalary"] = re.sub("\s+","",item.find_next_sibling("td").get_text().strip())

                    elif re.search(u"补.?贴",item.get_text()):
                        self.result["others"]["subsidy"] = item.find_next_sibling("td").get_text().strip()

                    elif re.search(u"奖.?金",item.get_text()):
                        self.result["others"]["bonus"] = item.find_next_sibling("td").get_text().strip()

                    elif re.search(u"股.?票",item.get_text()):
                        self.result["others"]["stock"] = item.find_next_sibling("td").get_text().strip()

        find_oversea = self.OVER_SEA.search(base_info)
        res["overSea"] = "1" if find_oversea else "None"
        
        self.result['baseInfo'] =  res


    # Desired job (job-seeking intentions)
    def regular_expect(self):

        res = OrderedDict()

        soup = ""
        for field in self.field_list:
            
            if re.search(u"求职意向",field.get_text()):
                if self.HasName==1:
                    soup = field.find_next("table")
                else:
                    soup = field.find_previous("table")
                break

        if soup:
            rows = soup.find_all("tr")
            for item in rows:
                if not item.find("td"):
                    continue
                if re.search(u"目标地",item.find("td").get_text()):
                    res["expLocations"] = item.find("td").find_next().get_text()

                elif re.search(u"月薪|薪资|工资|薪酬",item.find("td").get_text()):
                    res["expSalary"] = self.CLEAN_TEXT.sub("",item.find("td").find_next().get_text())

                elif re.search(u"目前状况|求职状态",item.find("td").get_text()):
                    res["workStatus"] = item.find("td").find_next().get_text()

                elif re.search(u"工作性|目标性",item.find("td").get_text()):
                    res["expJobTypes"] = item.find("td").find_next().get_text()

                elif re.search(u"希望行业|期望行业",item.find("td").get_text()):
                    res["expIndustrys"] = item.find("td").find_next().get_text()

                elif re.search(u"岗位|职[业位]",item.find("td").get_text()):
                    res["expPositions"] = item.find("td").find_next().get_text()

                elif re.search(u"到岗时间",item.find("td").get_text()):
                    res["dutyTime"] = item.find("td").find_next().get_text().strip()

                elif re.search(u"勿推荐|不要推荐",item.find("td").get_text()):
                    res["ignoreIncs"] = item.find("td").find_next().get_text().strip()

                elif re.search(u"职能",item.find("td").get_text()):
                    res["expJobCates"] = item.find("td").find_next().get_text().strip()

        self.result['jobExp'] = res




    # Education history
    def regular_educate(self):

        soup = ""
        for field in self.field_list:
            if re.search(u"教育经历",field.get_text()):
                soup = field.find_next("table")
                break
        res = []
        if soup:
            rows = soup.find_all("tr")
            id = 1
            for item in rows:
                tokens =[ token.get_text().strip() for token in  item.find_all("td") if len(token.get_text())>1]
                tmp = {}
                if len(tokens)==4:
                    tmp["itemId"] = str(id)
                    tmp["eduStart"] = self.clean_edu_time(tokens[0].split("-")[0])
                    tmp["eduEnd"] = self.clean_edu_time(tokens[0].split("-")[-1])
                    tmp["schName"] = tokens[1]
                    tmp["majorName"] = tokens[2]
                    tmp["eduDiploma"] = tokens[3]
                    id += 1
                    res.append(tmp)

        if res:
            # Highest-degree school and major for the basic info
            self.result["baseInfo"]["recentSchName"] = res[0]["schName"]
            self.result["baseInfo"]["recentMajorName"] = res[0]["majorName"]
        self.result['eduList'] = res




    # Work experience
    def regular_workexp(self):
        
        soup = ""
        for field in self.field_list:
            if re.search(u"工作经",field.get_text()):
                soup = field.find_next("table")
                break

        res = []

        if soup:
            rows = soup.find_all("tr")
            id = 1
            tokens,tmp = [],[]
            for item in rows:
                if item.find("hr"):
                    tokens.append(tmp)
                    tmp = []
                    continue
                else:
                    tmp.append(item)
            if tmp:
                tokens.append(tmp)
                
            for token in tokens:
                tmp = {}
                if len(token)>2:
                    tmp["itemId"] = str(id)
                    job_title = re.sub(u"[\s\r\n ]","",token[0].find("td").get_text())
                    tmp["jobStart"] = self.clean_edu_time(self.JOB_START.search(job_title).group(1)) if self.JOB_START.search(job_title) else job_title[:6]
                    tmp["jobEnd"] = self.clean_edu_time(self.JOB_END.search(job_title).group(1)) if self.JOB_END.search(job_title) else "None"
                    tmp["jobDuration"] = self.JOB_DURATION.search(job_title).group(1).strip() if self.JOB_DURATION.search(job_title) else "None"

                    tmp["incEmployee"] = self.INC_SCALE.search(job_title).group().strip() if self.INC_SCALE.search(job_title) else "None"           
                    
                    if len(token)>3:
                        tmp["jobDesc"] = token[3].get_text().strip()
                        
                    if job_title:
                        if token[0].find("td").find("b") and not re.search(u"年|月",job_title):
                            tmp["incName"] = token[0].find("td").find("b").get_text().strip() 
                        else:
                            tmp["incName"] = self.INC_NAME.search(job_title).group(1).strip() if self.INC_NAME.search(job_title) else "None"

                        if re.search(u"所属行业",token[1].get_text()):
                            tmp["incIndustrys"] =  token[1].find_all("td")[-1].get_text().strip()
                        else:
                            tmp["jobPosition"] = token[1].find_all("td")[-1].get_text().strip()

                        tmp["jobDesc"] = token[-1].find_all("td")[-1].get_text().strip()

                        jobTagItem = token[2].find_all("td")

                        if len(jobTagItem)==2:
                            tmp["jobPosition"] = jobTagItem[1].get_text().strip()
                            tmp["jobDepartment"] = jobTagItem[0].get_text().strip()
                        elif len(jobTagItem)==3:
                            tmp["jobPosition"] = jobTagItem[1].get_text().strip()
                            tmp["jobDepartment"] = jobTagItem[0].get_text().strip()
                            tmp["jobSalary"] = jobTagItem[2].get_text().strip()

                    else:
                        if token[0].find("td").find('b'):
                            tmp["incName"] = token[0].find("td").find("b").get_text().strip()

                        if re.search(u"职位名称",token[1].get_text()):
                            tmp["jobPosition"] = token[1].find("td").find("b").get_text().strip()
                            tmp["jobDepartment"] = self.JOB_DEPARTMENT.search(token[1].find('td').get_text()).group(1) if self.JOB_DEPARTMENT.search(token[1].find("td").get_text()) else "None"
                        
                        if re.search(u"行业",token[2].get_text()):
                            tmp["incIndustrys"] = token[2].find("td").get_text().strip()[3:]
                        


                    id += 1
                    res.append(tmp)
       
        self.result['jobList'] = res


    # Language skills
    def regular_language(self):

        
        soup = ""
        for field in self.field_list:
            if re.search(u"语言.?能.?",field.get_text()):
                soup = field.find_next("table")
                if soup and soup.find_all("table"):
                    soup = soup.find_all("table")[-1]
                break

        res = []
        id = 1
        if soup:
            rows = soup.find_all("tr") 
            for item in rows:
                tokens = [ i.get_text() for i in item.find_all("td") if i]
                if len(tokens)!=2:
                    tokens = re.split(u"[::]",item.get_text(),maxsplit=1)
                if not len(tokens)==2:
                    tokens = re.split(u"[(\(]",item.get_text())
                if len(tokens)==2:
                    tmp = {}
                    tmp["itemId"] = str(id)
                    tmp["languageName"] = re.sub(u"[\s+:: ]","",tokens[0]).split("(")[0]
                    tmp["languageLevel"] = re.sub(u"[\s+ ]","",tokens[1])
                    res.append(tmp)
                    id += 1

        self.result["languageList"] = res


    # Certificates
    def regular_cert(self):
        

        soup =""
        for field in self.field_list:
            if field and re.search(u"证书",field.get_text()):
                soup = field.find_next("table")
                break

        res = []
        id = 1
        if soup:
            items = soup.find_all("tr") 
            for item in items:
                tokens = item.find_all("td")
                if len(tokens)<2:continue
                tmp = {}
                tmp["itemId"] = str(id)
                tmp["certTime"] = self.clean_edu_time(tokens[0].get_text())
                tmp["certName"] = tokens[1].get_text().strip()
                cert_str = tmp["certName"]
                find_level = self.CERT_LEVEL.search(cert_str)
                if find_level:
                    tmp["certLevel"] = find_level.group()
                    tmp["certName"] = tmp["certName"]
                elif len(tokens)>2:
                    tmp["certLevel"] = tokens[2].get_text().strip()

                if tmp:
                    res.append(tmp)
                    id += 1

        self.result["certList"] = res
   
    
    # Skills
    def regular_skill(self):
        """
        技能模块
        """

        soup = ""
        for field in self.field_list:
            if re.search(u"技能",field.get_text()):
                soup = field.find_next("table")
                if soup and soup.find_all("table"):
                    soup = soup.find_all("table")[-1]
                break

        res = []
        id = 1
        if soup:
#            items = soup.find_all("table",limit=4)[-1].find_all("tr") if soup.find("table") else []
            items = soup.find_all("tr")
            for item in items:
                tokens = [token.get_text() for token in item.find_all("td")]
                if len(tokens)<2 or re.search(u"名称",tokens[0]):continue
                tmp = {}
                tmp["itemId"] = str(id)
                tmp["skillName"] = tokens[0].strip().lower()
                tmp["skillLevel"] = tokens[1].strip()

                if len(tokens)>2:
                    tmp["skillDuration"] = tokens[2].strip()
                else:
                    find_duration = re.search("\d+月|[半一二三四五六七八九十\d]年",item.get_text())
                    tmp["skillDuration"] = find_duration.group() if find_duration else "None"

                if tmp:
                    res.append(tmp)
                    id += 1


        self.result['skillList'] = res

     
    # Project experience
    def regular_project(self):

        soup = ""
        for field in self.field_list:
            if re.search(u"项目经.",field.get_text()):
                soup = field.find_next("table")
                break

        res = []
        id = 1
        if soup:
            items = soup.find_all("tr") 

            tokens,tmpitem =[],[]
            for item in items:
                if item.find("hr"):
                    tokens.append(tmpitem)
                    tmpitem = []
                    continue
                elif item:
                    tmpitem.append(item)
            if tmpitem:
                tokens.append(tmpitem)


            for token in tokens:

                # Parse the project title from the first row
                title_str = re.sub(u"[\s\r\n ]","",token[0].get_text())
                tmp = {}
                tmp["itemId"] = str(id)
                tmp["proStart"] = self.clean_edu_time(self.JOB_START.search(title_str).group(1)) if self.JOB_START.search(title_str) else "None" 
                tmp["proEnd"] = self.clean_edu_time(self.JOB_END.search(title_str).group(1)) if self.JOB_END.search(title_str) else "None" 
                tmp["proName"] = re.sub("\s+","",self.PROJ_NAME.search(title_str).group(1)) if self.PROJ_NAME.search(title_str) else title_str
                
                # Parse the field labels in the remaining rows
                field_list = [ item.find("td") for item in token[1:] ]
                for field in field_list:
                    field_str = field.get_text().strip()

                    if re.search(u"软件环境",field_str):
                        tmp["softwareEnv"] = field.find_next("td").get_text()
                    elif re.search(u"硬件环境",field_str):
                        tmp["hardwareEnv"] = field.find_next("td").get_text()
                    elif re.search(u"开发工具",field_str):
                        tmp["devTool"] = field.find_next("td").get_text()
                    elif re.search(u"项目描述",field_str):
                        tmp["proDesc"] = field.find_next("td").get_text()
                    elif re.search(u"责任描述",field_str):
                        tmp["proDuty"] = field.find_next("td").get_text()

                if tmp:
                    res.append(tmp)
                    id += 1

        self.result['proList'] = res




    def regular_train(self):

        soup = ""
        for field in self.field_list:
            if re.search(u"培训经.",field.get_text()):
                soup = field.find_next("table")
                break

        res = []
        id = 1
        if soup:
            items = soup.find_all("tr") 
            for item in items:
                tokens =[item.get_text() for item in item.find_all("td") if len(item.get_text())>1]
                if len(tokens)<3:continue
                tmp = {}
                tmp["itemId"] = str(id)
                tmp["trainStart"] = self.clean_edu_time(tokens[0].split(u'-')[0])
                tmp["trainEnd"] = self.clean_edu_time(tokens[0].split(u"-")[-1])
                tmp["trainAgency"] = tokens[1].strip()
                tmp["trainTitle"] = tokens[-1].strip()
                res.append(tmp)
                id += 1

        self.result["trainList"] = res

    
    def regular_private(self):
        """
        身份证号,联系电话等隐私信息
        """
        
        res = {}
        base_info = self.topsoup.get_text()
        find_phone = self.PHONE.search(base_info)
        find_email = self.EMAIL.search(base_info)
        find_qq = self.QQ.search(base_info)
        find_idNum = self.IDNUM.search(base_info)

        userName = ""

        if self.HasName:
            find_name = self.topsoup.find_previous("tr").find_previous("tr").find("b")
            if not find_name:
                find_name = self.topsoup.find_previous("tr").find_previous("tr").find("strong")
            if find_name and len(find_name.get_text().strip())<5:
                userName = find_name.get_text().strip()
        
        res["userName"] = userName if userName else "None"
        res["phoneNumber"] = find_phone.group(1) if find_phone else "None"
        res["email"] = find_email.group(1) if find_email else "None"
        res["qq"] = find_qq.group(1) if find_qq else "None"
        res["idNumber"] = find_idNum.group(1) if find_idNum else "None"
        
        find_key_word =  self.soup.find("span",text=re.compile(u"简历关键字"))
        key_words = ""
        if find_key_word and find_key_word.find_next("span","rsblue"):
            key_words =  find_key_word.find_next("span","rsblue").get_text()
        elif find_key_word and find_key_word.find_next("td"):
            key_words = find_key_word.find_next("td").get_text()

        if key_words and re.search(u"有|熟悉|经验|强|善于|精通|证",key_words):
            res["keywords"] = key_words.strip().split()

        self.result["privateInfo"] = res



    def regular_other(self):
        
        res = {}
        res["jobPositionFor"] = "None"
        res["jobIncNameFor"] = "None"

        for field in self.field_list:
            if re.search(u"自我介绍|个人简介|亮点|自我评价",field.get_text()):
               res["selfIntro"] = field.find_previous("table").get_text().strip()
            
            elif re.search(u"实践|实习",field.get_text()):
                res["stuPractice"] = re.sub("\s+"," ",field.find_next("table").get_text().strip())
            
            elif re.search(u"校内|校园|社团",field.get_text()):
                res["schoolExp"] = re.sub("\s+"," ",field.find_next("table").get_text().strip())

            elif re.search(u"论文|著作|作品",field.get_text()):
                res["pubWork"] = res.get("otherWork","")+"\n"+self.CLEAN_TEXT.sub(" ",field.find_next("table").get_text().strip())
        
            elif re.search(u"奖项|荣誉",field.get_text()):
                res["gainHoner"] = res.get("otherWork","")+"\n"+self.CLEAN_TEXT.sub(" ",field.find_next("table").get_text().strip())

            elif re.search(u"兴趣|爱好|特长",field.get_text()):
                res["otherHobby"] = res.get("otherHobby","")+"\n"+self.CLEAN_TEXT.sub(" ",field.find_next("table").get_text().strip())
            
            elif re.search(u"其他",field.get_text()):
                res["otherInfo"] = field.find_next("table").get_text().strip()

        if self.HasName==1:
            find_jobPositionName = re.search(u"应聘职位",self.job_for_soup.get_text())
            if find_jobPositionName:
                res["jobPositionFor"] = self.job_for_soup.find_next("span").get_text().strip()

            find_jobIncName = re.search(u"应聘公司",self.job_for_soup.get_text())
            if find_jobIncName:
                res["jobIncNameFor"] = self.job_for_soup.find_next("span").find_next("span").get_text().strip()
            
            find_updateTime = re.search(u"投递时间",self.job_for_soup.get_text())
            if self.result["baseInfo"]["updateTime"]=="None" and find_updateTime:
                self.result["baseInfo"]["updateTime"] = self.job_for_soup.find_next("span").find_next("span").get_text().strip()


        elif self.HasName>1:
            items = self.job_for_soup.find_all("td",limit=6)
            for item in items:
                if re.search(u"应聘职位",item.get_text()):
                    res["jobPositionFor"] = item.find_next_sibling("td").get_text().strip()
                elif re.search(u"应聘公司",item.get_text()):
                    res["jobIncNameFor"] = item.find_next_sibling("td").get_text().strip()
                elif re.search(u"投递时间",item.get_text()):
                    self.result["baseInfo"]["updateTime"] = item.find_next_sibling("td").get_text().strip()
                    break
        res.update(self.result.pop("others",{}))
        self.result["others"] = res


    def parser(self,htmlContent=None,fname=None,url=None):
        self.preprocess(htmlContent,fname,url)
        self.regular_basic()
        self.regular_private()
        self.regular_expect()
        self.regular_educate()
        self.regular_workexp()
        self.regular_skill()
        self.regular_cert()
        self.regular_language()
        self.regular_project()
        self.regular_train()
        self.regular_other()
        return self.result


    
    def output(self):
        res = "\n"
        for k in self.result:
            res += k+":"+"\n"
            if isinstance(self.result[k],dict):
                for kk,vv in self.result[k].items():
                    res += '%1s: %s\n' %( kk,vv )
            elif isinstance(self.result[k],list):
                for i,exp in enumerate(self.result[k]):
                    res+= "%12s\n" % (str(i+1))
                    if isinstance(exp,dict):
                        for kk,vv in exp.items():
                            res += "%22s: %s\n" % (kk,vv)
                    elif isinstance(exp,tuple):
                        for kk in exp:
                            res += '%22s \n'% (kk)
                    res += " "*10+'---'*10+'\n'
            else:
                res += " "*10+"%s\n" % (self.result[k])
        return res
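# Hedged usage sketch (not from the original project): the path is a
# placeholder for a locally saved, gb18030-encoded 51job resume page.
cv_parser = CvParser51Job()
cv_parser.parser(fname="/path/to/51job_resume.html")
print(cv_parser.output())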
Example #23
class CvParser51Job(CvTopParser):
    """
    Parse resumes from 51job
    """

    def __init__(self):

        CvTopParser.__init__(self)

        self.result = OrderedDict()
        self.PAY = re.compile(u"(\d+[\s\-])?\d+元")
        self.UPDATETIME = re.compile("(更新日期|更新时间)[::\s](\d{4}.\d{2}.\d{2})")
        self.ADDR = re.compile(u"居住地[::\s](\S+)")
        self.JOB_START = re.compile(u"(\d+.\d+?)--")
        self.JOB_END = re.compile(u"--(\d+.\d+|至今)")
        self.JOB_DURATION = re.compile(u"\[(.+?)\]")
        self.INC_SCALE = re.compile(u"\d+-\d+人|\d+人(以上|以下)?|少于\d+人")
        # self.INC_NAME = re.compile(u"[::>](\S+?)[\(\[【\r\n ]")
        self.INC_NAME = re.compile(u"[::>](\S+?)(\(\d|\[\d|\(少于|\[一)")
        self.JOB_DEPARTMENT = re.compile(u"部门[:\s:](\S+?)", re.S)
        self.PROJ_NAME = re.compile(u"[::](\S+?$)")

    def preprocess(self, htmlContent=None, fname=None, url=None):
        if url is not None:
            self.html = urlopen(url).read().decode('utf-8')
        elif htmlContent:
            self.html = htmlContent
        elif fname:
            self.html = codecs.open(fname, 'rb', 'gb18030').read()
        else:
            raise Exception("input error")

        if re.search(u"已被(求职者)?删除|无法查看", self.html):
            raise Exception("error: input illegal cv ")

        self.soup = BeautifulSoup(self.html, "lxml")

        # No contact info and no match score
        if self.soup.find("title") and re.search(u"简历ID", self.soup.find("title").get_text()):
            self.HasName = 0
            self.resume = self.soup.find('div', {"id": "divResume"})

            self.topsoup = self.resume.find("table").find("table")
            self.field_list = self.resume.find_all("td", "cvtitle")

        # Applying to this company
        elif self.soup.find("div", "titleLineB") and self.soup.find(name="span", text=re.compile(u"应聘职位")):
            self.HasName = 3
            self.job_for_soup = self.soup.find(name="span", text=re.compile(u"应聘职位")).find_previous("table")
            self.topsoup = self.soup.find(name="td", text=re.compile(u"居住地:")).find_previous("table")
            self.resume = self.topsoup.find_parent("table").find_parent("table")
            self.field_list = self.resume.find_all("div", "titleLineB")
            if not self.field_list:
                self.field_list = self.resume.find("td", "cvtitle")

        # Has a target position/company and shows a match score
        elif self.soup.find("div", {"id": "divHead"}):
            self.HasName = 1
            self.job_for_soup = self.soup.find("div", {"id": "divHead"}).find("td")
            self.topsoup = self.soup.find(name="td", text=re.compile(u"居住地:")).find_parent("table")
            self.resume = self.topsoup.find_parent("table").find_parent("table")
            self.field_list = self.resume.find_all("td", "cvtitle")


        # Has contact info but no match score
        elif self.soup.find_all(name="td", text=re.compile(u"E-mail:"), limit=10):
            self.HasName = 2
            find_job_for = self.soup.find(name="span", text=re.compile(u"应聘职位"))
            if find_job_for:
                self.job_for_soup = self.soup.find(name="span", text=re.compile(u"应聘职位")).find_previous("table")
            else:
                self.job_for_soup = BeautifulSoup()

            self.topsoup = self.soup.find(name="td", text=re.compile(u"居住地:")).find_previous("table")
            self.resume = self.topsoup.find_parent("table").find_parent("table")
            self.field_list = self.resume.find_all("td", "cvtitle")
            if not self.field_list:
                self.field_list = self.resume.find_all("div", "titleLineB")

        self.refresh()
        self.result["cvFrom"] = "51job"

    # Basic info 1
    def regular_basic(self):
        """
        解析基本信息
        """

        temp_nowAddressDetail = ""
        find_update_time = self.soup.find("span", {"id": "lblResumeUpdateTime"})
        if find_update_time:
            find_update_time = find_update_time.get_text()
        elif self.UPDATETIME.search(self.html):
            find_update_time = self.UPDATETIME.search(self.html).group()

        # 20160309 zhangzq
        self.resumeType = 0
        find_resume_type = self.soup.find("span", {"id": "lblResumeType"})
        if find_resume_type:
            find_resume_type = find_resume_type.get_text()
            if re.search(u"粘贴简历", find_resume_type):
                self.resumeType = 1
        self.base_html = self.topsoup.get_text().strip()
        base_info = self.topsoup.get_text()
        # print self.topsoup
        find_cv_id = self.CV_ID.search(base_info)
        self.result["baseInfo"]["cvId"] = find_cv_id.group(1).strip() if find_cv_id else ""

        self.result["baseInfo"]["updateTime"] = find_update_time.split(u":")[-1].strip() if find_update_time else "None"

        base_info1, base_info2 = "", ""

        if self.HasName != 3 and self.topsoup.find_next(name="b", text=re.compile(u"最近工作")):
            base_info1 = self.topsoup.find_next(name="b", text=re.compile(u"最近工作")).find_parent("table")
            base_info2 = self.topsoup.find_next(name="b", text=re.compile(u"学历")).find_parent("table")
        elif self.topsoup.find_next("div", text=re.compile(u"最近工作")):
            base_info1 = self.topsoup.find_next(name="div", text=re.compile(u"最近工作")).find_next("table")
            base_info2 = base_info1.find_next("table")

        if base_info1:
            tokens = base_info1.find_all("tr")
            for token in tokens:
                items = token.find_all("td")
                if len(items) == 2:
                    if re.search(u"公.?司", items[0].get_text()):
                        self.result["baseInfo"]["nowInc"] = items[1].get_text().strip()
                    elif re.search(u"行.?业", items[0].get_text()):
                        self.result["baseInfo"]["nowIndustry"] = items[1].get_text().strip()
                    elif re.search(u"职.?位", items[0].get_text()):
                        self.result["baseInfo"]["nowPosition"] = items[1].get_text().strip()

        if base_info2:
            tokens = base_info2.find_all("tr")
            for token in tokens:
                items = token.find_all("td")
                if len(items) == 2:
                    if re.search(u"学.?历", items[0].get_text()):
                        self.result["baseInfo"]["nowDiploma"] = items[1].get_text().strip()
                    elif re.search(u"专.?业", items[0].get_text()):
                        self.result["baseInfo"]["recentMajorName"] = items[1].get_text().strip()
                    elif re.search(u"学.?校", items[0].get_text()):
                        self.result["baseInfo"]["recentSchName"] = items[1].get_text().strip()

        find_sex = self.SEX.search(base_info)
        self.result["baseInfo"]["gender"] = find_sex.group() if find_sex else "0"

        find_age = self.AGE.search(base_info)
        self.result["baseInfo"]["age"] = find_age.group() if find_age else "0"

        find_dob = self.DOB.search(base_info)
        self.result["baseInfo"]["dob"] = find_dob.group(1) if find_dob else "None"

        try:
            self.result["baseInfo"]["nowWorkAge"] = self.topsoup.find("span", "blue").find("b").get_text().split(u"|")[0]
        except AttributeError:
            # some layouts use span.blue1 instead; fall back to an empty value
            self.result["baseInfo"]["nowWorkAge"] = ""

        if not re.search(u"经验|在读|应届|年", self.result["baseInfo"]["nowWorkAge"]):
            self.result["baseInfo"]["nowWorkAge"] = ""

        if not self.result["baseInfo"]["nowDiploma"]:
            find_degree = self.DEGREE.search(base_info)
            self.result["baseInfo"]["nowDiploma"] = find_degree.group() if find_degree else ""

        find_marriage = self.MARRIAGE.search(base_info)
        self.result["baseInfo"]["marriage"] = find_marriage.group() if find_marriage else ""

        find_politic = self.POLITIC.search(base_info)
        # res["nowPolistatus"] = find_politic.group() if find_politic else u"群众"
        # print find_politic.group()
        # print base_info
        self.result["baseInfo"]["nowPoliStatus"] = find_politic.group() if find_politic else ""

        # residence and hukou (household registration)
        if self.topsoup.find("table"):
            items = self.topsoup.find("table").find_all("td", limit=20)
        else:
            items = self.topsoup.find_all("td", limit=20)

        for item in items:
            if re.search(u"专.业", item.get_text()) and item.find_next_sibling("td"):
                self.result["baseInfo"]["recentMajorName"] = item.find_next_sibling("td").get_text().strip()
            elif re.search(u"学.历", item.get_text()) and item.find_next_sibling("td"):
                self.result["baseInfo"]["nowDiploma"] = item.find_next_sibling("td").get_text().strip()
            elif re.search(u"职.能", item.get_text()) and item.find_next_sibling("td"):
                self.result["baseInfo"]["jobPosition"] = item.find_next_sibling("td").get_text().strip()
            elif re.search(u"行.业", item.get_text()) and item.find_next_sibling("td"):
                self.result["baseInfo"]["incIndustrys"] = item.find_next_sibling("td").get_text().strip()
            elif re.search(u"居住地", item.get_text()) and item.find_next_sibling("td"):
                self.result["baseInfo"]["nowAddress"] = item.find_next_sibling("td").get_text().strip()
            elif re.search(u"户.{0,3}口", item.get_text()) and item.find_next_sibling("td"):
                self.result["baseInfo"]["nowHukou"] = item.find_next_sibling("td").get_text().strip()
            elif re.search(u"地.址", item.get_text()) and item.find_next_sibling("td"):
                temp_nowAddressDetail = item.find_next_sibling("td").get_text().strip()
                self.result["baseInfo"]["nowAddressDetail"] = temp_nowAddressDetail.split(u'(')[0].strip()
            elif re.search(u"关键词", item.get_text()) and item.find_next_sibling("td"):
                self.result["privateInfo"]["keyWords"] = item.find_next_sibling("td").get_text(" ", strip=True)

        find_height = self.HEIGHT.search(base_info)
        self.result["baseInfo"]["height"] = find_height.group(1) if find_height else ""

        if base_info2:
            find_benefit = base_info2.find_next("table").find_next("td", text=re.compile(u"基本工资|目前薪资"))
            if find_benefit:
                tmpsoup = find_benefit.find_previous("table")
                items = tmpsoup.find_all("td")
                for item in items:
                    # base salary and other compensation fields
                    if re.search(u"目前薪资|目前年薪", item.get_text()):
                        self.result["baseInfo"]["nowSalary"] = re.sub(r"\s+", "", item.find_next_sibling("td").get_text().strip())
                    elif re.search(u"基本薪资|基本工资", item.get_text()):
                        self.result["baseInfo"]["baseSalary"] = re.sub(r"\s+", "", item.find_next_sibling("td").get_text().strip())

                    elif re.search(u"补.?贴", item.get_text()):
                        self.result["baseInfo"]["subsidy"] = item.find_next_sibling("td").get_text().strip()

                    elif re.search(u"奖.?金", item.get_text()):
                        self.result["baseInfo"]["bonus"] = item.find_next_sibling("td").get_text().strip()

                    elif re.search(u"股.?票", item.get_text()):
                        self.result["baseInfo"]["stock"] = item.find_next_sibling("td").get_text().strip()

        find_oversea = self.OVER_SEA.search(base_info)
        self.result["baseInfo"]["overSea"] = "1" if find_oversea else ""

        if re.search(u"邮编:(\d{6})", temp_nowAddressDetail):
            self.result["baseInfo"]["nowZipCode"] = re.search(u"邮编:(\d{6})", temp_nowAddressDetail).group(1)

    # job preferences (section 2)
    def regular_expect(self):
        soup = ""
        for field in self.field_list:

            if re.search(u"求职意向", field.get_text()):
                if self.HasName == 1:
                    soup = field.find_next("table")
                else:
                    soup = field.find_previous("table")
                break
        self.expect_html = soup.get_text().strip() if soup else ""
        if soup:
            rows = soup.find_all("tr")
            for item in rows:
                if not item.find("td"):
                    continue
                if re.search(u"目标地", item.find("td").get_text()):
                    self.result['jobExp']["expLocations"] = item.find("td").find_next().get_text()

                elif re.search(u"职能", item.find("td").get_text()):
                    self.result['jobExp']["expJobCates"] = item.find("td").find_next().get_text().strip()
                    # print self.result['jobExp']['expJobCates']

                elif re.search(u"月薪|薪资|工资|薪酬", item.find("td").get_text()):
                    # print item.find('td').find_next().get_text()
                    self.result['jobExp']["expSalary"] = self.CLEAN_TEXT.sub("",item.find("td").find_next().get_text())

                    # print self.result['jobExp']['expSalary']
                elif re.search(u"目前状况|求职状态", item.find("td").get_text()):
                    self.result['jobExp']["workStatus"] = item.find("td").find_next().get_text()

                elif re.search(u"工作性|目标性", item.find("td").get_text()):
                    self.result['jobExp']["expJobTypes"] = item.find("td").find_next().get_text()

                elif re.search(u"希望行业|期望行业", item.find("td").get_text()):
                    self.result['jobExp']["expIndustrys"] = item.find("td").find_next().get_text()

                elif re.search(u"岗位|职[业位]", item.find("td").get_text()):
                    self.result['jobExp']["expPositions"] = item.find("td").find_next().get_text()

                elif re.search(u"到岗时间", item.find("td").get_text()):
                    self.result['jobExp']["dutyTime"] = item.find("td").find_next().get_text().strip()

                elif re.search(u"勿推荐|不要推荐", item.find("td").get_text()):
                    self.result['jobExp']["ignoreIncs"] = item.find("td").find_next().get_text().strip()


    # education history (section 3)
    def regular_educate(self):

        soup = ""
        for field in self.field_list:
            if re.search(u"教育经历", field.get_text()):
                soup = field.find_next("table")
                break
        self.edu_html = soup.get_text().strip() if soup else ""
        if soup:
            rows = soup.find_all("tr")
            id = 1
            for item in rows:
                tokens = [token.get_text().strip() for token in item.find_all("td") if len(token.get_text()) > 1]
                tmp = self.get_eduDict()
                if len(tokens) == 4:

                    tmp["itemId"] = str(id)
                    tmp["eduStart"] = self.clean_edu_time(tokens[0].split("-")[0])
                    tmp["eduEnd"] = self.clean_edu_time(tokens[0].split("-")[-1])
                    tmp["schName"] = tokens[1]
                    tmp["majorName"] = tokens[2]
                    tmp["eduDiploma"] = tokens[3]
                    id += 1
                    self.result["eduList"].append(tmp)
                elif len(tokens) == 3:
                    tmp["itemId"] = str(id)
                    tmp["eduStart"] = self.clean_edu_time(tokens[0].split("-")[0])
                    tmp["eduEnd"] = self.clean_edu_time(tokens[0].split("-")[-1])
                    tmp["schName"] = tokens[1]
                    tmp["eduDiploma"] = tokens[2]
                    id += 1
                    self.result["eduList"].append(tmp)

                    # if res:
                    #     # 基本信息中的最高学历学校,专业
                    #     if not self.result["baseInfo"]["recentSchName"]:
                    #         self.result["baseInfo"]["recentSchName"] = res[0]["schName"]
                    #     if not self.result["baseInfo"]["recentMajorName"]:
                    #         self.result["baseInfo"]["recentMajorName"] = res[0]["majorName"]

    # work experience (section 4)
    def regular_workexp(self):

        soup = ""
        for field in self.field_list:
            if re.search(u"工作经", field.get_text()):
                soup = field.find_next("table")
                break

        self.work_html = soup.get_text().strip() if soup else ""
        #        print soup
        if soup:
            rows = soup.find_all("tr")
            id = 1
            tokens, tmp = [], []
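            # group the <tr> rows into one chunk per job; <hr> rows mark the
            # boundary between consecutive jobs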
            #            print rows
            for item in rows:
                if item.find("hr"):
                    tokens.append(tmp)
                    tmp = []
                    continue
                else:
                    tmp.append(item)
                #            print tmp
            if tmp:
                tokens.append(tmp)

            for token in tokens:

                #                print len(tokens)
                #                print "token[0]: %s\ntoken[1]: %s\ntoken[2]: %s" %(token[0], token[1], token[2])
                tmp = self.get_jobDict()
                #                print len(token)
                if len(token) >= 2:
                    tmp["itemId"] = str(id)
                    job_title = re.sub(u"[\s\r\n ]", "", token[0].find("td").get_text())
                    tmp["jobStart"] = self.clean_edu_time(
                        self.JOB_START.search(job_title).group(1)) if self.JOB_START.search(job_title) else job_title[
                                                                                                            :6]
                    tmp["jobEnd"] = self.clean_edu_time(self.JOB_END.search(job_title).group(1)) if self.JOB_END.search(
                        job_title) else ""
                    tmp["jobDuration"] = self.JOB_DURATION.search(job_title).group(
                        1).strip() if self.JOB_DURATION.search(job_title) else ""

                    tmp["incEmployee"] = self.INC_SCALE.search(job_title).group().strip() if self.INC_SCALE.search(
                        job_title) else ""

                    if len(token) >= 3:
                        jobDescStr = token[-1].get_text().strip()
                        jobDesc = re.search(u'工作内容:(.*)', jobDescStr)
                        if jobDesc:
                            tmp["jobDesc"] = re.sub(u'#.*?#', '', token[-1].get_text(separator='\n').strip())
                        else:
                            # hard-coded workaround: for a few known CV ids the job
                            # description sits in the 4th row instead of the last one
                            if self.result['baseInfo']['cvId'] in ('330482369', '333633446', '338845280', '65996657'):
                                if len(token) > 3:
                                    tmp["jobDesc"] = re.sub(u'#.*?#', '', token[3].get_text(separator='\n').strip())
                            else:
                                tmp["jobDesc"] = re.sub(u'#.*?#', '', token[-1].get_text(separator='\n').strip())

                    if job_title:

                        if token[0].find("td").find("b") and not re.search(u"年|月", job_title):
                            tmp["incName"] = token[0].find("td").find("b").get_text().strip()
                        else:
                            find_inc = self.INC_NAME.search(job_title)
                            tmp["incName"] = find_inc.group(1).strip() if find_inc else ""

                        if re.search(u"所属行业", token[1].get_text()):
                            tmp["incIndustrys"] = token[1].find_all("td")[-1].get_text().strip()
                        else:
                            tmp["jobPosition"] = token[1].find_all("td")[-1].get_text().strip()
                            tmp["jobDepartment"] = token[1].find_all("td")[0].get_text().strip()

                        if "jobDesc" not in tmp.keys():
                            tmp["jobDesc"] = token[-1].find_all("td")[-1].get_text(separator='\n').strip()

                        try:
                            jobTagItem = token[2].find_all("td")
                        except IndexError:
                            pass
                        else:
                            if len(jobTagItem) == 2:
                                tmp["jobPosition"] = re.sub(u'\s+', '', jobTagItem[1].get_text().strip())
                                tmp["jobDepartment"] = jobTagItem[0].get_text().strip()
                            elif len(jobTagItem) == 3:
                                tmp["jobPosition"] = jobTagItem[1].get_text().strip()
                                tmp["jobDepartment"] = jobTagItem[0].get_text().strip()
                                tmp["jobSalary"] = jobTagItem[2].get_text().strip()


                    else:

                        if token[0].find("td").find('b'):
                            tmp["incName"] = token[0].find("td").find("b").get_text().strip()

                        if re.search(u"职位名称", token[1].get_text()):
                            tmp["jobPosition"] = token[1].find("td").find("b").get_text().strip()
                            tmp["jobDepartment"] = self.JOB_DEPARTMENT.search(token[1].find('td').get_text()).group(
                                1) if self.JOB_DEPARTMENT.search(token[1].find("td").get_text()) else "None"

                        if re.search(u"行业", token[2].get_text()):
                            tmp["incIndustrys"] = token[2].find("td").get_text().strip()[3:]

                    id_ma = 1
                    for t in token[4:]:

                        ttext = t.get_text()

                        # print ttext
                        reportTo = re.search(u'汇报对象:(.*)', ttext)
                        underNum = re.search(u'下属人数:(\d+)', ttext)
                        witness = re.search(u'证 明 人:(.*)', ttext)
                        leaveReason = re.search(u'离职原因:(.*)', ttext)
                        achieveDesc = re.search(u'工作业绩:(.*)', ttext)
                        tmp['manageExp']['itemId'] = str(id_ma)
                        if reportTo:
                            tmp['manageExp']['reportTo'] = reportTo.group(1)
                        elif underNum:
                            tmp['manageExp']['underNum'] = underNum.group(1)
                        elif witness:
                            tmp['manageExp']['witness'] = witness.group(1)
                        elif leaveReason:
                            tmp['manageExp']['leaveReason'] = leaveReason.group(1)
                        elif achieveDesc:
                            tmp['manageExp']['achieveDesc'] = achieveDesc.group(1)
                        id_ma += 1

                    id += 1
                    self.result["jobList"].append(tmp)

    # language skills (section 5)
    def regular_language(self):

        soup = ""
        for field in self.field_list:
            if re.search(u"语言.?能.?", field.get_text()):
                soup = field.find_next("table")
                if soup and soup.find_all("table"):
                    soup = soup.find_all("table")[-1]
                break
        self.language_html = soup.get_text().strip() if soup else ""
        res = []
        id = 1
        if soup:
            rows = soup.find_all("tr")
            for item in rows:
                tokens = [i.get_text() for i in item.find_all("td") if i]
                if len(tokens) != 2:
                    tokens = re.split(u"[::]", item.get_text(), maxsplit=1)
                if len(tokens) != 2:
                    tokens = re.split(u"[(\(]", item.get_text())
                if len(tokens) == 2:
                    tmp = self.get_languageDict()
                    tmp["itemId"] = str(id)
                    tmp["languageName"] = re.sub(u"[\s+:: ]", "", tokens[0])  # .split("(")[0]
                    tmp["languageLevel"] = re.sub(u"[\s+ ]", "", tokens[1])
                    res.append(tmp)
                    id += 1

        self.result["languageList"] = res

    # certificates (section 6)
    def regular_cert(self):

        soup = ""
        for field in self.field_list:
            if field and re.search(u"证书", field.get_text()):
                soup = field.find_next("table")
                break

        self.cert_html = soup.get_text().strip() if soup else ""
        res = []
        id = 1
        if soup:
            items = soup.find_all("tr")
            for item in items:
                tokens = item.find_all("td")
                if len(tokens) < 2: continue
                tmp = self.get_certDict()

                tmp["itemId"] = str(id)
                tmp["certTime"] = self.clean_edu_time(tokens[0].get_text())
                certName = tokens[1].get_text().strip()
                tmp["certName"] = re.sub(u"#.*?#", '', certName)

                if len(tokens) == 3:
                    tmp["certLevel"] = tokens[2].get_text().strip()
                else:
                    tmp["certLevel"] = "None"
                # cert_str = tmp["certName"]
                # find_level = self.CERT_LEVEL.search(cert_str)
                # if find_level:
                #     tmp["certLevel"] = find_level.group()
                #     tmp["certName"] = tmp["certName"]
                # elif len(tokens)>2:
                #     tmp["certLevel"] = tokens[2].get_text().strip()

                if tmp:
                    res.append(tmp)
                    id += 1

        self.result["certList"] = res

    # skills (section 7)
    def regular_skill(self):
        """
        技能模块
        """

        soup = ""
        for field in self.field_list:
            if re.search(u"技能", field.get_text()):
                soup = field.find_next("table")
                if soup and soup.find_all("table"):
                    soup = soup.find_all("table")[-1]
                break
        self.skill_html = soup.get_text().strip() if soup else ""
        res = []
        id = 1
        if soup:
            #            items = soup.find_all("table",limit=4)[-1].find_all("tr") if soup.find("table") else []
            items = soup.find_all("tr")
            for item in items:
                tokens = [token.get_text() for token in item.find_all("td")]
                if len(tokens) < 2 or re.search(u"名称", tokens[0]): continue
                tmp = self.get_skillDict()
                tmp["itemId"] = str(id)
                tmp["skillName"] = tokens[0].strip().lower()
                tmp["skillLevel"] = tokens[1].strip()

                if len(tokens) > 2:
                    tmp["skillDuration"] = tokens[2].strip()
                else:
                    find_duration = re.search(u"\d+月|[半一二三四五六七八九十\d]年", item.get_text())
                    tmp["skillDuration"] = find_duration.group() if find_duration else "None"

                if tmp:
                    res.append(tmp)
                    id += 1

        self.result['skillList'] = res

    # project experience (section 8)
    def regular_project(self):

        soup = ""
        for field in self.field_list:
            if re.search(u"项目经.", field.get_text()):
                soup = field.find_next("table")
                break

        self.project_html = soup.get_text().strip() if soup else ""
        res = []
        id = 1
        if soup:
            items = soup.find_all("tr")

            tokens, tmpitem = [], []
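            # same grouping as in regular_workexp: <hr> rows separate projects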
            for item in items:
                if item.find("hr"):
                    tokens.append(tmpitem)
                    tmpitem = []
                    continue
                elif item:
                    tmpitem.append(item)
            if tmpitem:
                tokens.append(tmpitem)

            for token in tokens:

                # parse the project title from the first row
                title_str = re.sub(u"[\s\r\n ]", "", token[0].get_text())
                tmp = self.get_proDict()
                tmp["itemId"] = str(id)
                find_start = self.JOB_START.search(title_str)
                tmp["proStart"] = self.clean_edu_time(find_start.group(1)) if find_start else "None"
                find_end = self.JOB_END.search(title_str)
                tmp["proEnd"] = self.clean_edu_time(find_end.group(1)) if find_end else "None"
                find_name = self.PROJ_NAME.search(title_str)
                tmp["proName"] = re.sub(r"\s+", "", find_name.group(1)) if find_name else title_str

                # parse the labeled fields from the remaining rows
                field_list = [item.find("td") for item in token[1:]]
                for field in field_list:
                    field_str = field.get_text().strip()

                    if re.search(u"软件环境", field_str):
                        tmp["softwareEnv"] = field.find_next("td").get_text()
                    elif re.search(u"硬件环境", field_str):
                        tmp["hardwareEnv"] = field.find_next("td").get_text()
                    elif re.search(u"开发工具", field_str):
                        tmp["devTool"] = field.find_next("td").get_text()
                    elif re.search(u"项目描述", field_str):
                        tmp["proDesc"] = field.find_next("td").get_text(separator='\n')
                    elif re.search(u"责任描述", field_str):
                        tmp["proDuty"] = field.find_next("td").get_text(separator='\n')

                if tmp:
                    res.append(tmp)
                    id += 1

        self.result['proList'] = res

    # training history (section 9)
    def regular_train(self):

        soup = ""
        for field in self.field_list:
            if re.search(u"培训经.", field.get_text()):
                soup = field.find_next("table")
                break

        self.train_html = soup.get_text().strip() if soup else ""
        res = []
        id = 1
        if soup:
            items = soup.find_all("tr")
            for item in items:

                tokens = [td.get_text() for td in item.find_all("td") if len(td.get_text()) > 1]

                if len(tokens) < 3:
                    continue
                # print res
                tmp = self.get_trainDict()
                tmp["itemId"] = str(id)
                tmp["trainStart"] = self.clean_edu_time(tokens[0].split(u'-')[0])
                tmp["trainEnd"] = self.clean_edu_time(tokens[0].split(u"-")[-1])
                tmp["trainAgency"] = tokens[1].strip()
                tmp["trainTitle"] = tokens[2].strip()
                if len(tokens) > 3:
                    tmp["trainCert"] = tokens[3].strip()

                tt = item.find_next('td').get_text().strip().split('\\')
                tmp["trainDesc"] = '/'.join(tt)

                # print tmp["trainDesc"]

                res.append(tmp)
                id += 1

        self.result["trainList"] = res
        # print self.result["trainList"]

    def regular_private(self):
        """
        身份证号,联系电话等隐私信息
        """

        base_info = self.topsoup.get_text()
        find_phone = self.PHONE.search(base_info)
        find_email = self.EMAIL.search(base_info)
        find_qq = self.QQ.search(base_info)
        find_idNum = self.IDNUM.search(base_info)

        userName = ""

        if self.HasName:
            find_name = self.topsoup.find_previous("tr").find_previous("tr").find("b")
            if not find_name:
                find_name = self.topsoup.find_previous("tr").find_previous("tr").find("strong")
            if find_name and len(find_name.get_text().strip()) < 5:
                userName = find_name.get_text().strip()

        self.result["privateInfo"]["userName"] = userName if userName else ""
        self.result["privateInfo"]["phoneNumber"] = find_phone.group(1) if find_phone else ""
        self.result["privateInfo"]["email"] = find_email.group(1) if find_email else ""
        self.result["privateInfo"]["qq"] = find_qq.group(1) if find_qq else ""
        self.result["privateInfo"]["idNumber"] = find_idNum.group(1) if find_idNum else ""

        find_key_word = self.soup.find("span", text=re.compile(u"简历关键字"))
        key_words = ""
        if find_key_word and find_key_word.find_next("span", "rsblue"):
            key_words = find_key_word.find_next("span", "rsblue").get_text()
        elif find_key_word and find_key_word.find_next("td"):
            key_words = find_key_word.find_next("td").get_text()

        if key_words and re.search(u"有|熟悉|经验|强|善于|精通|证", key_words):
            self.result["privateInfo"]["keyWords"] = key_words.strip().split()

    # other info (section 10)
    def regular_other(self):
        self.other_html = []
        for field in self.field_list:
            # print field.get_text()
            if re.search(u"自我介绍|个人简介|亮点|自我评价", field.get_text()):
                self.result["others"]["selfIntro"] = field.find_previous("table").get_text(separator='\n')[4:].strip()
                # print self.result["others"]["selfIntro"]
                self.other_html.append(self.result["others"]["selfIntro"])

            elif re.search(u"实践|实习", field.get_text()):
                self.result["others"]["stuPractice"] = re.sub("\s+", " ", field.find_next("table").get_text().strip())
                # print self.result["others"]['stuPractice']
                self.other_html.append(self.result["others"]["stuPractice"])

            elif re.search(u"校内|校园|社团", field.get_text()):
                self.result["others"]["schoolExp"] = re.sub("\s+", " ", field.find_next("table").get_text().strip())
                self.other_html.append(self.result["others"]["schoolExp"])

            elif re.search(u"论文|著作|作品", field.get_text()):
                self.result["others"]["pubWork"] = self.result["others"]["pubWork"] + "\n" + self.CLEAN_TEXT.sub(" ",
                                                                                                                 field.find_next(
                                                                                                                     "table").get_text().strip())
                self.other_html.append(self.result["others"]["pubWork"])

            elif re.search(u"奖项|荣誉", field.get_text()):
                self.result["others"]["gainHoner"] = self.result["others"]["gainHoner"] + "\n" + self.CLEAN_TEXT.sub(
                    " ", field.find_next("table").get_text().strip())
                self.other_html.append(self.result["others"]["gainHoner"])

            elif re.search(u"兴趣|爱好|特长", field.get_text()):
                self.result["others"]["otherHobby"] = self.result["others"]["otherHobby"] + "\n" + self.CLEAN_TEXT.sub(
                    " ", field.find_next("table").get_text().strip())
                self.other_html.append(self.result["others"]["otherHobby"])

            # attachment info
            elif re.search(u"附件", field.get_text().strip()):
                self.result["others"]["attachment"] = field.find_next("table").get_text()
                self.other_html.append(self.result["others"]["attachment"])

            elif re.search(u"其他信息", field.get_text()):

                if self.resumeType == 1:
                    self.result["others"]["otherInfo"] = field.find_parent("table").get_text().strip()
                else:
                    self.result["others"]["otherInfo"] = field.find_next("table").get_text().strip()
                self.other_html.append(self.result["others"]["otherInfo"])

        if self.HasName == 1:
            find_jobPositionName = re.search(u"应聘职位", self.job_for_soup.get_text())
            if find_jobPositionName:
                self.result["others"]["jobPositionFor"] = self.job_for_soup.find_next("span").get_text().strip()
                self.other_html.append(self.result["others"]["jobPositionFor"])

            find_jobIncName = re.search(u"应聘公司", self.job_for_soup.get_text())
            if find_jobIncName:
                self.result["others"]["jobIncNameFor"] = self.job_for_soup.find_next("span").find_next(
                    "span").get_text().strip()
                self.other_html.append(self.result["others"]["jobIncNameFor"])

            find_updateTime = re.search(u"投递时间", self.job_for_soup.get_text())
            if self.result["baseInfo"]["updateTime"] == "None" and find_updateTime:
                self.result["baseInfo"]["updateTime"] = self.job_for_soup.find_next("span").find_next(
                    "span").get_text().strip()


        elif self.HasName > 1:
            items = self.job_for_soup.find_all("td", limit=6)
            for item in items:
                if re.search(u"应聘职位", item.get_text()):
                    self.result["others"]["jobPositionFor"] = item.find_next_sibling("td").get_text().strip()
                    self.other_html.append(self.result["others"]["jobPositionFor"])
                elif re.search(u"应聘公司", item.get_text()):
                    self.result["others"]["jobIncNameFor"] = item.find_next_sibling("td").get_text().strip()
                    self.other_html.append(self.result["others"]["jobIncNameFor"])
                elif re.search(u"投递时间", item.get_text()):
                    self.result["baseInfo"]["updateTime"] = item.find_next_sibling("td").get_text().strip()
                    break

        # cover letter
        recommendLetter = self.soup.find("table", {"id": "tabCvletter"})
        if recommendLetter:
            text = recommendLetter.get_text().strip()
            self.result["others"]["recommendLetter"] = text[5:].strip()
            self.other_html.append(self.result["others"]["recommendLetter"])
        # for pasted-in resumes
        find_other_info = self.soup.find("div", "titleLineB", text=re.compile(u"其它信息"))
        if find_other_info:
            self.result["others"]["otherInfo"] = find_other_info.find_parent("table").get_text().strip()
            self.other_html.append(self.result["others"]["otherInfo"])

        # work location: the text inside the parentheses of the target position
        find_loc = re.search(u"[（(](.*)[）)]", self.result["others"]["jobPositionFor"])
        if find_loc:
            self.result["others"]["workLoc"] = find_loc.group(1)
            self.other_html.append(self.result["others"]["workLoc"])

    def parser(self, htmlContent=None, fname=None, url=None):
        self.preprocess(htmlContent, fname, url)
        self.regular_basic()
        self.regular_private()
        self.regular_expect()
        self.regular_educate()
        self.regular_workexp()
        self.regular_skill()
        self.regular_cert()
        self.regular_language()
        self.regular_project()
        self.regular_train()
        self.regular_other()
        return self.result

    def output(self):
        res = "\n"
        for k in self.result:
            res += k + ":" + "\n"
            if isinstance(self.result[k], dict):
                for kk, vv in self.result[k].items():
                    res += '%1s: %s\n' % (kk, vv)
            elif isinstance(self.result[k], list):
                for i, exp in enumerate(self.result[k]):
                    res += "%12s\n" % (str(i + 1))
                    if isinstance(exp, dict):
                        for kk, vv in exp.items():
                            res += "%22s: %s\n" % (kk, vv)
                    elif isinstance(exp, tuple):
                        for kk in exp:
                            res += '%22s \n' % (kk)
                    res += " " * 10 + '---' * 10 + '\n'
            else:
                res += " " * 10 + "%s\n" % (self.result[k])
        return res
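
# The parser above leans on one BeautifulSoup pattern throughout: anchor on a
# label cell, then read the adjacent sibling cell as the value. A minimal
# self-contained sketch of that pattern (synthetic markup, not real 51job HTML):
import re
from bs4 import BeautifulSoup

demo = BeautifulSoup(u"""
<table>
  <tr><td>居住地:</td><td>上海</td></tr>
  <tr><td>户口:</td><td>北京</td></tr>
</table>
""", "lxml")
label = demo.find("td", text=re.compile(u"居住地"))
print(label.find_next_sibling("td").get_text(strip=True))  # -> 上海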
Example #24
0
"""

"""

import bz2

from bs4 import BeautifulSoup

# with bz2.BZ2File('biosample_set.xml.bz2', 'r') as input:
#     for i in range(10):
#         l = input.readline()
#         print("{}\n".format(l))


# <Id db="BioSample" is_primary="1">SAMN00000002</Id>\n'

def primaryId(tag):
    # use .get() so tags missing these attributes don't raise KeyError
    return tag.get('db') == 'BioSample' and tag.get('is_primary')


with bz2.BZ2File('biosample_set.xml.bz2', 'r') as fh:
    soup = BeautifulSoup(fh, 'xml')
    pi = soup.find_next(primaryId)
    print("{}".format(pi))
Example #25
0
def saveInmateProfile(browser):
    inmate = Inmate()  # inmate profile
    record = InmateRecord()  # inmate current record
    facility = Facility()

    # find inmate ID, will go in active record
    time.sleep(2)

    element = browser.find_elements_by_class_name("section_data")

    backgroundPersonInfo = BeautifulSoup(element[1].get_attribute('innerHTML'), 'html.parser').find("tbody").find("tbody")
    personInfoCells = backgroundPersonInfo.find_all("tr")

    for cell in personInfoCells:
        if not isinstance(cell, NavigableString):
            txt = " ".join(cell.text.strip().split())

            if "Name" in txt:
                fullName = txt.replace(",", "").split(" ")
                lastName = fullName[1]
                firstName = fullName[2]
                middleName = fullName[-1] if len(fullName) == 4 else None
                inmate.name = Name(firstName, middleName, lastName)
            elif "Age" in txt:
                inmate.age = txt.split(" ")[-1]
                inmate.DOB = Date(inmate.age, None, None, True)
            elif "Gender" in txt:
                inmate.sex = txt.split(" ")[-1]
            elif "Ethnicity" in txt:
                inmate.race = txt.split(" ")[-1]
            elif "Hair Color" in txt:
                inmate.hairColor = txt.split(" ")[-1]
            elif "Eye Color" in txt:
                inmate.eyeColor = txt.split(" ")[-1]
            elif "Height" in txt:
                inmate.height = txt.split(" ")[-2] + txt.split(" ")[-1]
            elif "Weight" in txt:
                inmate.weight = txt.split(" ")[-1]

    backgroundPersonPrisonInfo = backgroundPersonInfo.find_next("tbody")
    personPrisonInfoCells = backgroundPersonPrisonInfo.find_all("tr")

    for cell in personPrisonInfoCells:
        if not isinstance(cell, NavigableString):
            txt = " ".join(cell.text.strip().split())

            if "DOC Number" in txt:
                # inmate's id given by Colorado Department of Corrections
                record.recordNumber = txt.split(" ")[-1]
            elif "Est. Parole Eligibility Date" in txt:
                dateSplit = txt.split(" ")[-1].split("/")
                if len(dateSplit) > 1:
                    record.paroleEligibilityDate = Date(dateSplit[-1], dateSplit[0], dateSplit[1])
            elif "Next Parole Hearing Date" in txt:
                dateSplit = txt.split(":")
                if len(dateSplit) > 1:
                    record.nextParoleHearingDate = dateSplit[-1].strip()
            elif "Est. Sentence Discharge Date" in txt:
                dateSplit = txt.split(" ")[-1].split("/")
                if len(dateSplit) > 1:
                    record.estReleaseDate = Date(dateSplit[-1], dateSplit[0], dateSplit[1])
            elif "Current Facility Assignment" in txt:
                facility.name = txt.split(":")[-1].strip()

    # saves profile to the database
    writeToDB(inmate)
    browser.find_element_by_id("btn_search_txt").click()
    return inmate.name.first + " " + inmate.name.last
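
# Both row loops above use the same idiom: collapse a row's text to single
# spaces, then dispatch on a label substring and take the trailing token.
# Self-contained illustration (synthetic markup, not the real site's HTML):
from bs4 import BeautifulSoup

row = BeautifulSoup('<tr><td>Gender:</td><td> Male </td></tr>', 'html.parser').find('tr')
txt = " ".join(row.text.strip().split())
print(txt.split(" ")[-1])  # -> Male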
Example #26
0
import requests
from lxml.html import fromstring
from bs4 import BeautifulSoup

# fetch the same page under three URL spellings and compare what each returns
for url in ("http://github.com/", "https://github.com/",
            "https://www.github.com/"):
    r = requests.get(url)
    print(r.status_code)
    if r.status_code == 200:
        tree = fromstring(r.content)
        a = tree.findtext('.//title')
        soup = BeautifulSoup(r.text, "html.parser")
        metas = soup.find_all('meta')
        b = soup.find_next('h1', 'class:listing-name')
        print(a, metas, b)
Example #27
0
for data in URLs:
    race = {}

    link = data['link']

    print("Processing %s" % link)
    pageURL = link

    if MOCK_RACE:
        content = BeautifulSoup(open(MOCK_RACE), features="lxml").body
    else:
        content = BeautifulSoup(urllib.request.urlopen(pageURL).read(),
                                features="lxml").body

    # title
    name = content.find_next('h1', {'class': 'pagetitle'}).string.strip()
    if name.startswith('Les '):
        name = name[4:-1].title()

    race[u'01Nom'] = name

    # source
    race[u'03Source'] = data['source']

    # reference
    race[u'04Référence'] = link

    # traits
    race[u'05Traits'] = []
    section = jumpTo(content, 'h2', {'class': 'separator'},
                     u"Traits raciaux standards")
Example #28
0
def searchForArtist(title):
    query = "{title}".format(title=title).replace(" ", "+").replace("&", "and")
    search = "https://google.com/search?hl={lang}&q={query}+song".format(
        lang='en', query=query)
    res = requests.get(search, headers=HEADERS_GET).text
    with open(r"C:\Users\talsi\Desktop\test.html", 'w', encoding='utf-8') as f:
        f.write(res)
    # try:
    text = res.split('Other recordings of this song')
    res = text[0]
    soup = BeautifulSoup(res, 'html.parser')
    i = 0
    songs = [tag.text for tag in soup.find_all(attrs=GOOGLE_SONG_TAG_ATTRS)]
    while i < len(songs):
        z = songs[i]
        if z.lower() != title.lower():
            print("z.lower= " + z.lower())
            print("title.lower= " + title.lower())
            print("what we going to del: " + songs[i])
            del songs[i]
            i -= 1
        else:
            print("we didnt delete: " + z.lower())
            print("cause we thought its: " + title.lower())
        i += 1
    print(songs)
    i = 0
    print("songs[0]= " + songs[0])
    print("soup.find({place0})= ".format(place0=songs[0]) +
          soup.find(attrs=GOOGLE_SONG_TAG_ATTRS, text=songs[0]).text)
    lastArt = soup.find(
        attrs=GOOGLE_SONG_TAG_ATTRS,
        text=songs[0]).find_next(attrs=GOOGLE_ARTISTS_TAG_ATTRS).text
    artist = lastArt
    print("lastArt= " + lastArt)
    artists = []
    newSongs = []
    nextIsOK = True
    while i < len(songs):
        song = songs[i]
        lastSong = song
        if nextIsOK:
            newSongs.append(song)
            artists.append(artist)
        print(lastArt[:-7])
        print(soup.find_next(text=lastArt[:-7]))
        print(
            soup.find(attrs=GOOGLE_SONG_TAG_ATTRS,
                      text=lastSong).find_next(text=lastArt))
        if soup.find(attrs=GOOGLE_SONG_TAG_ATTRS, text=lastSong).find_next(
                attrs=GOOGLE_ARTISTS_TAG_ATTRS, text=lastArt).find_next(
                    "span").find_next("span").text == soup.find(
                        attrs=GOOGLE_SONG_TAG_ATTRS, text=lastSong).find_next(
                            attrs=GOOGLE_ARTISTS_TAG_ATTRS,
                            text=lastArt).find_next("span").find_next(
                                attrs=GOOGLE_ARTISTS_TAG_ATTRS).text:
            print("got in")
            nextIsOK = True
            artist = soup.find(attrs=GOOGLE_SONG_TAG_ATTRS,
                               text=lastSong).find_next(
                                   attrs=GOOGLE_ARTISTS_TAG_ATTRS,
                                   text=lastArt).find_next("span").find_next(
                                       attrs=GOOGLE_ARTISTS_TAG_ATTRS).text
        else:
            nextIsOK = False
            artist = ""
Example #29
0
import re
from bs4 import BeautifulSoup

email_id_example = """<br/> 
<div>The below HTML has the information that has email ids.</div>  
[email protected] 
<div>[email protected]</div> 
<span>[email protected]</span> 
"""

soup = BeautifulSoup(email_id_example, "lxml")
emailid_regexp = re.compile(
    r"\w[-\w.+]*@([A-Za-z0-9][-A-Za-z0-9]+\.)+[A-Za-z]{2,14}")
first_email_id = soup.find(text=emailid_regexp)
print(first_email_id)
# chain from the first match so find_next really returns the following one
next_email_id = first_email_id.find_next(text=emailid_regexp)
print(next_email_id)
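
# find_all with the same regex returns every address in one go:
print(soup.find_all(text=emailid_regexp))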
Example #30
0
class JasssArticle(ASSArticle):
    bs_article: BeautifulSoup

    def __init__(self, *args, **kwargs):
        # args -- tuple of anonymous arguments
        # kwargs -- dictionary of named arguments
        """init article from an url
        *args
        :param int volume:
        :param int issue:
        :param int article:
        **kwargs
        :param url url:
        """
        if len(args) == 0:
            req = requests.get(
                kwargs.get('url', ass_scrap_util.get_latest_url()))
            if req.status_code == requests.codes.ok:
                self.url = req.url
                self.bs_article = BeautifulSoup(req.content, 'html5lib')
            else:
                raise HTTPError(req.reason)
        else:
            basic_url = ass_scrap_util.base_url + str(
                args[0]) + ass_scrap_util.separator + str(
                    args[1]) + ass_scrap_util.separator
            req = requests.get(basic_url + str(args[2]) + ass_scrap_util.html)
            self.url = req.url
            if req.status_code == requests.codes.ok:
                self.bs_article = BeautifulSoup(req.content, 'html5lib')
            else:
                self.bs_article = BeautifulSoup(
                    requests.get(basic_url + "review" + str(args[2]) +
                                 ass_scrap_util.html).content, 'html5lib')

    def __repr__(self):
        return self.url

    def is_review(self):
        """ Tells if this article is a review or not """
        return "review" in self.__repr__()

    def keywords(self):
        """
        Get the keywords from an article
        :param html bs_article:
        :return: a list of keywords
        """
        return [
            x.strip()
            for x in self.get_meta_content_with_tag("tags").split(',')
        ]

    def title(self):
        """ Retrieve the title of the article """
        return self.get_meta_content_with_tag()

    def authors(self):
        """
        Retrieve the authors of the article
        :param html bs_article:
        :return: a list of authors
        """
        return [
            x.strip()
            for x in self.get_meta_content_with_tag("authors").split(';')
        ]

    def abstract(self):
        """ Retrieve the abstract of the article"""
        the_abstract = self.get_meta_content_with_tag("abstract")

        if len(the_abstract.split()) < 5:
            return str(
                self.bs_article.find(string="Abstract").find_next(
                    "dl").next.contents[0]).strip()
        return the_abstract

    def issn(self):
        return '1460-7425'

    def doi(self):
        """
        Give the DOI stored in meta data
        :return: a unique *string* that represent this article
        """
        if self.is_review():
            return self.__repr__()
        try:
            doi = self.get_meta_content_with_tag("doi")
        except TypeError:
            doi = self.get_art_content_with_tag("doi")
        return doi

    def _text(self):
        body = self.bs_article.find_all("article")
        if len(body) == 1:
            return body[0]
        art = self.bs_article.find_all("div", {'class': 'article'})
        if len(art) > 0:
            return art[0]
        body = self.bs_article.find("body")
        the_ps = body.find_all("p")
        for ppps in the_ps:
            ppps.extract()
        dls = body.find_all("dl")
        if len(dls) > 0:
            dds = dls[0].find_all("dd")
            if len(dds) > 1:
                dds[0].extract()
                dds[1].extract()

        return body

    def text(self):
        """
        Text content of the article
        :return: The plain text of the article
        """
        html_text = self._text()
        bibliography = html_text.find_all("div", {'class': 'refs'})
        log.debug("Looking for the bibliography div: " + str(bibliography))
        if not bibliography:
            ref_tag = html_text.find_all(
                "h3", text=ass_scrap_util.jasss_biblio_match)[-1]
            log.debug("Match html tag for bibliography " + str(ref_tag))
            # snapshot the siblings first: extracting while iterating the live
            # next_siblings generator would stop after the first element
            for n in list(ref_tag.next_siblings):
                log.debug("Extract " + str(n) + " from the text")
                n.extract()
        else:
            # find_all returns a ResultSet, which has no extract(); remove each div
            for div in bibliography:
                div.extract()
        return ass_scrap_util.text_cleaner(html_text.getText())

    def get_meta_content_with_tag(self, tag="title"):
        """
        Retrieve the content of a tag as define by *beautifulsoup*
        :param string tag: the tag to find in the soup
        :return: a string representation of the content of the tag
        """
        m_name = ass_scrap_util.jasss_meta_name
        m_content = ass_scrap_util.jasss_meta_content
        if self.bs_article.find_next(
                ass_scrap_util.jasss_meta_tag,
            {ass_scrap_util.jasss_meta_name.upper(): "title"}):
            m_name = ass_scrap_util.jasss_meta_name.upper()
            m_content = ass_scrap_util.jasss_meta_content.upper()

        if isinstance(ass_scrap_util.meta[tag], str):
            meta_context = self.bs_article.find(
                ass_scrap_util.jasss_meta_tag,
                {m_name: ass_scrap_util.meta[tag]})
        else:
            for tg in ass_scrap_util.meta[tag]:
                meta_context = self.bs_article.find(
                    ass_scrap_util.jasss_meta_tag, {m_name: tg})
                if meta_context is not None:
                    break
        return meta_context[m_content]

    def get_art_content_with_tag(self, tag="title"):
        """
        Retrieve the content of a tag define in the *art* section of JASSS article pages
        :param tag:
        :return: a string representation of the content of the tag
        """
        balise: str = "p"
        if tag == "doi":
            balise = "span"
        result = self.bs_article.find(balise,
                                      {'class': ass_scrap_util.art[tag]})
        if result is None:
            return "-".join([str(s) for s in self.__repr__() if s.isdigit()])
        if tag == "doi":
            result = result.contents[0].replace('DOI:', '')
        return result.strip()

    def get_soup(self):
        """
        :return: the soup of the source retrieved by *beautifulsoup*
        """
        return self.bs_article
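
# get_meta_content_with_tag boils down to: find the <meta> whose name attribute
# matches and read its content attribute. A minimal standalone sketch (synthetic
# markup, not the real JASSS page):
from bs4 import BeautifulSoup

page = '<html><head><meta name="DC.title" content="An Example Article"/></head></html>'
meta = BeautifulSoup(page, 'html.parser').find('meta', {'name': 'DC.title'})
print(meta['content'])  # -> An Example Article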
Example #31
0
def parse_ancestries(pages: List[str]) -> List[dict]:
    ancestries: List[dict] = []
    for ind, page in enumerate(pages):
        if page == '':
            continue  # placeholder to properly enumerate "bad" array items
        anc: Ancestry = Ancestry()
        anc.id = ind + 1
        whole_text = BeautifulSoup(page, 'html5lib').find(
            'span', {'id': 'ctl00_MainContent_DetailedOutput'})

        # get name
        name_tags = [
            t for t in whole_text.find_next('h1').children if t.string
        ]
        for m in name_tags:
            if m.string:
                anc.name = ''.join((anc.name, m.string))

        # if this is just a heritage, we don't parse it
        if 'Heritage' in anc.name:
            continue

        # get rarity and traits
        uncommon: Tag = whole_text.find_next(class_='traituncommon')
        rare: Tag = whole_text.find_next(class_='traitrare')
        if uncommon:
            anc.rarity = uncommon.string.strip()
        elif rare:
            anc.rarity = rare.string.strip()
        traits: List[Tag] = whole_text.find_all_next(class_='trait')
        anc.traits = [str(t.string) for t in traits if t]

        # get source
        src: str = whole_text.find_next(
            name='b', text='Source').find_next(name='a').string
        anc.source.book = src.split('pg.')[0].strip()
        anc.source.page = int(src.split('pg.')[1])

        # get description and other entries
        m_tag = whole_text.find_next(name='b',
                                     text='Source').find_next(name='br')
        d_str = ''
        d_header = ''
        d_entries = []
        while m_tag.next:
            if m_tag.name and m_tag.name == 'h1':
                d_str = d_str.replace(d_header, '', 1)
                d_entries.append(AncestryHeader(d_header, d_str.strip()))
                break
            if type(m_tag) == NavigableString:
                d_str = ''.join((d_str, m_tag.string))
            elif m_tag.name == 'br':
                d_str = ''.join((d_str, '\n'))
            elif m_tag.name == 'h2' or m_tag.name == 'h3':
                d_str = d_str.replace(d_header, '', 1)
                d_entries.append(AncestryHeader(d_header, d_str.strip()))
                d_header = m_tag.string.strip()
                d_str = ''
            m_tag = m_tag.next
        anc.description = d_entries

        # Hit Points, Size, Speed, Ability Boosts (Flaws), Languages, Senses, Extra(s)
        # usually in that order (?)
        # then break when m_tag == m_tag.find_next(name='div', class_='clear').previous.previous.previous
        d_str = ''
        anc.hitPoints = str(
            m_tag.find_next(name='h2', text='Hit Points').next.next)
        anc.size = str(m_tag.find_next(name='h2', text='Size').next.next)
        anc.speed = str(m_tag.find_next(name='h2', text='Speed').next.next)
        boosts_tag = m_tag.find_next(name='h2',
                                     text='Ability Boosts').next.next
        while boosts_tag.next:
            if boosts_tag.name and boosts_tag.name == 'h2':
                break
            if type(boosts_tag) == NavigableString:
                d_str = ''.join((d_str, boosts_tag.string))
            elif boosts_tag.name == 'br':
                d_str = ''.join((d_str, '\n'))
            boosts_tag = boosts_tag.next
        anc.abilityBoosts = [d for d in d_str.split('\n') if d]
        flaws_tag = m_tag.find_next(name='h2', text='Ability Flaw(s)')
        if flaws_tag:
            d_str = ''  # reset
            flaws_tag = flaws_tag.next.next
            while flaws_tag.next:
                if flaws_tag.name and flaws_tag.name == 'h2':
                    break
                if type(flaws_tag) == NavigableString:
                    d_str = ''.join((d_str, flaws_tag.string))
                elif flaws_tag.name == 'br':
                    d_str = ''.join((d_str, '\n'))
                flaws_tag = flaws_tag.next
            anc.abilityFlaws = [d for d in d_str.split('\n') if d]

        lang_tag = m_tag.find_next(name='h2', text='Languages').next_sibling
        d_str = ''
        while lang_tag.next:
            if lang_tag.name and lang_tag.name == 'h2':
                anc.languages.append(d_str)
                break
            if type(lang_tag) == NavigableString:
                d_str = ''.join((d_str, lang_tag.string))
            elif lang_tag.name == 'br':
                anc.languages.append(d_str)
                d_str = ''
            lang_tag = lang_tag.next

        dvision_tag = m_tag.find_next(name='h2', text='Darkvision')
        ll_tag = m_tag.find_next(name='h2', text='Low-Light Vision')
        if dvision_tag:
            anc.senses.append(
                AncestryHeader(
                    'Darkvision',
                    'You can see in darkness and dim light just as well as you can see in bright light, though your vision in darkness is in black and white.'
                ))
        elif ll_tag:
            anc.senses.append(
                AncestryHeader(
                    'Low-Light Vision',
                    'You can see in dim light as though it were bright light, so you ignore the concealed condition due to dim light.'
                ))

        # the rest are extras
        extras_tag: Tag = lang_tag.previous.find_next('h2')
        if extras_tag:
            end_tag = [x for x in extras_tag.parent.children][-1]
            d_str: Union[str, List[List[str]]] = ''
            d_header = ''
            while extras_tag.next:
                if extras_tag == end_tag.next:
                    if type(d_str) == str:
                        anc.extras.append(
                            AncestryHeader(d_header,
                                           d_str.replace(d_header, '', 1)))
                    elif type(d_str) == list:
                        anc.extras.append(AncestryHeader(d_header, '', d_str))
                    break
                if type(extras_tag) == NavigableString:
                    d_str = ''.join((d_str, extras_tag.string))
                elif extras_tag.name == 'br':
                    d_str = ''.join((d_str, '\n'))
                elif extras_tag.name == 'h2':
                    if d_str != '':
                        anc.extras.append(
                            AncestryHeader(d_header,
                                           d_str.replace(d_header, '', 1)))
                    d_header = str(extras_tag.string)
                    d_str = ''
                elif extras_tag.name == 'table':
                    d_str = parse_table_into_list(extras_tag)
                    extras_tag = extras_tag.next_sibling.previous if extras_tag.next_sibling else extras_tag
                extras_tag = extras_tag.next

        ancestries.append(dataclasses.asdict(anc))
    return ancestries
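
# The header-delimited walks above share one traversal: start just past an
# anchor heading and concatenate strings until the next <h2>. A standalone
# sketch of that loop (synthetic markup):
from bs4 import BeautifulSoup, NavigableString

demo = BeautifulSoup('<h2>Size</h2>Medium<br/>Tall-ish<h2>Speed</h2>25 feet',
                     'html.parser')
node = demo.find('h2', text='Size').next.next
acc = ''
while node:
    if getattr(node, 'name', None) == 'h2':
        break
    if isinstance(node, NavigableString):
        acc += str(node)
    elif node.name == 'br':
        acc += '\n'
    node = node.next
print(acc.splitlines())  # -> ['Medium', 'Tall-ish']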
Example #32
0
            # hacks
            if value == 'Utilisation des objets magiques':
                value = 'Utilisation d\'objets magiques'
            elif value == 'Connaissances (mystère)':
                value = 'Connaissances (mystères)'

            if value == 'Connaissances (toutes)' or value == 'Connaissances (tous les domaines)' or value == 'Connaissances (au choix, chaque compétence devant être prise séparément)':
                for c in CONN:
                    cl['CompétencesDeClasse'].append({'Compétence': c})
            else:
                cl['CompétencesDeClasse'].append({'Compétence': value.strip()})
        elif s.name == 'br':
            break

    # table (class progression)
    rows = content.find_next('table', {"class": "tablo"}).find_all('tr')

    maxSpellLvl = 0
    minSpellLvl = 1
    if 'spellLvl' in data.keys():
        maxSpellLvl = data['spellLvl']
    if 'spellLvl0' in data.keys() and data['spellLvl0']:
        minSpellLvl = 0

    cl['Progression'] = []
    for r in rows:
        # skip the header rows
        if r.has_attr('class') and (r['class'][0] == 'titre'
                                    or r['class'][0] == 'soustitre'):
            continue
        idx = 0
Example #33
0
 def _find_name(self, name_column: BeautifulSoup) -> str:
     # posrela_tag = player_row.find_next('td', {'class': 'posrela'})
     name_tag = name_column.find_next('a', {'class': 'spielprofil_tooltip'})
     return name_tag.getText()
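
# Standalone check of the lookup above (synthetic Transfermarkt-style cell):
from bs4 import BeautifulSoup

cell = BeautifulSoup(
    '<td class="posrela"><a class="spielprofil_tooltip" href="#">L. Messi</a></td>',
    'html.parser')
print(cell.find_next('a', {'class': 'spielprofil_tooltip'}).getText())  # -> L. Messi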