Python BeautifulSoup.index Examples, bs4.BeautifulSoup.index Python Examples

Example #1

0

Show file

File: Scraper.py Project: Saiyan-God/Spell-It-Out

def scrapeMedical(url, frontTrim, backTrim):
    '''Function specific to scraping the 24 pages medical terminology pages on wikipedia'''

    request = urllib.request.Request(url)
    html = urllib.request.urlopen(request).read()
    soup = BeautifulSoup(html, 'html.parser')

    'Scrapes webpage within a specific class'
    main_table = soup.find('div', attrs={'class': 'mw-parser-output'})
    items = main_table.find_all('tr')

    items = str(items)
    items = items.replace('\n', ' ')
    items = BeautifulSoup(items, 'lxml').text
    items = items.split(' ,  ')

    #get rid of useless html at beginning and end of page
    del items[:frontTrim]
    del items[-backTrim:]

    #breaks string into list, so that the first item (the acromyn) can be seperated
    for definition in items:
        items[items.index(definition)] = definition.split(' ')

    #combines everything but the acromyn into a string
    for definition in items:
        items[items.index(definition)][1:len(definition)] = [
            ' '.join(items[items.index(definition)][1:len(definition)])
        ]

    return items

Example #2

0

Show file

def get_url(city):

    str0 = 'http://sugg.us.search.yahoo.net/gossip-gl-location/?appid=weather&output=xml&command=' + str(
        city)
    with request.urlopen(str0) as f:
        data = f.read().decode('utf-8')
    s = BeautifulSoup(data, 'lxml').find("s")["d"]
    woeid = s[s.index('woeid') + 6:s.index('woeid') + 13]
    q = 'select+*+from+weather.forecast+where+woeid%3D' + woeid + '+and+u%3D%22c%22&diagnostics=true'
    url = 'https://query.yahooapis.com/v1/public/yql?q=' + q
    return url

Example #3

0

Show file

File: spider.py Project: Yii0423/yBlog

def getProxies():
    ips = BeautifulSoup(
        requests.get('http://www.xicidaili.com/nn/1', headers=headers).text,
        'html.parser').select("td")
    _ip = ''
    for ip in ips:
        if ips.index(ip) % 10 == 1:
            _ip += str(ip).replace('<td>', '').replace('</td>', '')
        if ips.index(ip) % 10 == 2:
            _ip += ':' + str(ip).replace('<td>', '').replace('</td>', '')
        if ips.index(ip) % 10 == 5:
            proxies[str(ip).replace('<td>', '').replace('</td>', '')] = _ip
            break

Example #4

0

Show file

def parser_requester(company_name):
    try:
        url_string = BeautifulSoup(
            requests.get(url + company_name + tag_student_progs).content,
            "html.parser").find('div', {
                'class': 'kCrYT'
            }).find_all('a')[0]['href']
        return url_string[7:url_string.index('&')]
    except:
        url_string = BeautifulSoup(
            requests.get(url + company_name + tag_careers).content,
            "html.parser").find('div', {
                'class': 'kCrYT'
            }).find_all('a')[0]['href']
        return url_string[7:url_string.index('&')]

Example #5

0

Show file

File: etl.py Project: 0ptaq0/etl-reviews-public

def scrap_reviews():
    global reviews_list
    # iterate through global reviews_list_html which is output of method get_reviews_for_movie()
    for review in reviews_list_html:
        result = re.search('href=\"(.*)\">', str(review))
        url = "https://www.filmweb.pl" + result.group(1)
        review_page = get_page(url)
        review_soup = BeautifulSoup(review_page.content, 'html.parser')
        # get review content using specific HTML attributes for filmweb
        review_content_html = review_soup.find("div",
                                               attrs={
                                                   "itemprop": "reviewBody"
                                               }).text
        review_content = BeautifulSoup(review_content_html, "lxml").text
        index = review_content.index('waitingModule')
        review_content = review_content[:index]

        rev_title = review_soup.find("h2", attrs={"itemprop": "name"}).text

        author = review_soup.find("div", attrs={"itemprop": "author"}).text

        review_rating = review_soup.find("span", {
            "class": "reviewRatingPercent"
        }).text[:-1]

        result = re.search('\"(.*) ',
                           review_soup.find("ul", {
                               "class": "newsInfo"
                           }).text)
        pub_date = result.group(1)[:10]

        # Using Review class create an object review
        review = make_review(0, rev_title, review_content, author,
                             review_rating, pub_date)
        reviews_list.append(review)

Example #6

0

Show file

 def get_stock(self, code="A005930", start="19000101", end="30000101"):
     url = self.base_url.format(code, code[1:], code, start, end)
     response = requests.get(url)
     download_url = "http://file.krx.co.kr/download.jspx"
     json_data = {"code": response.content}
     headers_json = {
         "Referer":
         "http://marketdata.krx.co.kr/contents/MKD/99/MKD9900001.jspx"
     }
     data = requests.post(download_url,
                          data=json_data,
                          headers=headers_json)
     parsing = BeautifulSoup(data.text)
     parsing = parsing.text.split("\n")
     parsing = [
         line.replace(",", "").strip('"').replace('""', '"')
         for line in parsing
     ]
     parsing = [x.split('"') for x in parsing]
     self.parse = parsing
     parsing = pd.DataFrame(parsing[1:], columns=self.columns)
     parsing.index = pd.to_datetime(parsing["date"], format="%Y/%m/%d")
     parsing.drop("date", axis=1, inplace=True)
     self.data = parsing.astype("float64")
     return self.data

Example #7

0

Show file

File: 2.py Project: zw900/91pron_python

def download_img(title, link):
    nomakechar = [":", "/", "\\", "?", "*", "“", "<", ">", "|"]
    for item in nomakechar:
        if title.find(item) > -1:
            title = title.replace(item, '')
    if os.path.exists(title):
        return
    os.makedirs(title)
    re = s.get(link, headers=header)
    re.encoding = 'gbk'
    div = BeautifulSoup(re.text, "html.parser").find_all(
        'div', class_='tpc_content do_not_catch')[0]
    img = BeautifulSoup(str(div), "html.parser").find_all('img')
    pbar = tqdm(total=len(img))
    for i in img:
        file_name = title + '/' + str(img.index(i)) + '.jpg'
        if not os.path.exists(file_name):
            download_link = ''
            if i.get('data-src') == None:
                download_link = i.get('ess-data')
            else:
                download_link = i.get('data-src')
            index = 1
            while True:
                try:
                    re = s.get(download_link, headers=header)
                    with open(file_name, 'wb') as f:
                        f.write(re.content)
                except:
                    index += 1
                    continue
                else:
                    break
        pbar.update(1)
    pbar.close()

Example #8

0

Show file

def currency_get():
    fp = urllib.request.urlopen(
        'http://info.finance.naver.com/marketindex/exchangeList.nhn')
    source = fp.read()
    fp.close()
    class_list = ["tit", "sale"]
    soup = BeautifulSoup(source, 'html.parser')
    soup = soup.find_all("td", class_=class_list)
    money_data = {}
    for data in soup:
        if soup.index(data) % 2 == 0:
            data = data.get_text().replace('\n', '').replace('\t', '')
            money_key = data
        elif soup.index(data) % 2 == 1:
            money_value = data.get_text()
            money_data[money_key] = money_value
    return money_data

Example #9

0

Show file

File: main.py Project: aanchal02/CBB-Project

def getMostStableStructure(HTMLFile):
    print("Creating Most Stable Structure")
    soup = BeautifulSoup(HTMLFile, "html.parser").get_text()
    ind = soup.index("Structures sorted by energy")
    proteinNumber = int(soup[ind + 59:ind + 61])
    print(soup[ind:ind+61])
    print(proteinNumber)
    return proteinNumber

Example #10

0

Show file

File: NaverApi.py Project: NaturaAurum/NaturaAurumBot

 def get_currency(self):
     url = urllib.request.urlopen(
         "http://info.finance.naver.com/marketindex/exchangeList.nhn")
     source = url.read()
     url.close()
     class_list = ["tit", "sale"]
     soup = BeautifulSoup(source, 'lxml')
     soup = soup.find_all("td", class_=class_list)
     money_data = {}
     for data in soup:
         if soup.index(data) % 2 == 0:
             data = data.get_text().replace('\n', '').replace('\t', '')
             money_key = data
         elif soup.index(data) % 2 == 1:
             money_value = data.get_text()
             money_data[money_key] = money_value
             money_key = None
             money_value = None
     return money_data

Example #11

0

Show file

File: pod.py Project: schwifty42069/piptv_od

 def scrape_media_titles(self):
     req = requests.get(self.search_address).text
     results = Soup(req, 'html.parser').findAll("td",
                                                {"class": "result_text"})
     for result in results:
         self.titles.append("{}. {}".format(
             results.index(result) + 1,
             str(result.contents[1]).split(">")[1].split("</")[0] +
             result.contents[2]))
     return

Example #12

0

Show file

File: fetcher_csv.py Project: Breedoon/SolarBears

def fetch(view, site_id):
    try:
        filename = DIR + "/" + get_file_name(site_id, view, get_timezone_time(site_id))
    except (KeyError, TypeError):
        filename = "filename unknown"
    # print("Fetching file:", filename)
    if not os.path.exists(DIR):
        os.makedirs(DIR)
    if not os.path.exists(filename):
        global last_fetch
        while time.time() - last_fetch < wait_time:
            time.sleep(0.1)
        last_fetch = time.time()

        url = URL + '?view={view}&cond=site_id={site_id}'.format(view=view, site_id=site_id)
        try:
            with requests.get(url) as r:
                s = r.content
                r.close()
            try:
                s = BeautifulSoup(s, "html.parser").find_all('script')[-2].text
                new_url = "https://solrenview.com" + s[s.index('/downloads'):s.index('.csv') + 4]
            except:
                return ""

            filename = DIR + '/' + new_url.split('/')[-1]  # .csv file

            with requests.get(new_url) as r:
                raw = r.text
                r.close()

        except ChunkedEncodingError as e:
            print(e)
            time.sleep(wait_time)
            return fetch(view, site_id)  # might not be the best solution but idk how else to fix it

        with open(filename, 'w+') as f:
            f.write(raw)
    else:
        with open(filename, 'r') as f:
            raw = f.read()
    return raw

Example #13

0

Show file

File: exchange.py Project: Geenie-Lee/week4

    def collect(self):
        fp = urllib.request.urlopen(self.url)
        source = fp.read()
        fp.close()
        class_list = ["tit", "sale"]
        soup = BeautifulSoup(source, 'html.parser')
        soup = soup.find_all("td", class_=class_list)
        money_data = {}

        for data in soup:
            if soup.index(data) % 2 == 0:
                data = data.get_text().replace('\n', '').replace('\t', '')
                money_key = data
            elif soup.index(data) % 2 == 1:
                money_value = data.get_text()
                if money_key == '미국 USD':
                    self.price = money_value.replace(',','')    # float(aa.replace(',',''))
                    money_data[money_key] = money_value

        self.dao.upsert(self.price)

Example #14

0

Show file

    def get_url_info(self, url):
        pic_first_page = self.request(url)
        description = BeautifulSoup(pic_first_page.text,
                                    'lxml').find('div', class_='main-meta')
        # 基本信息
        pic_meta = self.deal_pic_info(description.text)
        # 页数
        pic_pagenavi = BeautifulSoup(pic_first_page.text, 'lxml').find(
            'div', class_='pagenavi').find_all('span')[-2].text

        # 图片地址
        pic_img = BeautifulSoup(pic_first_page.text, 'lxml').find(
            'div', class_='main-image').find('img')['src']

        # 拼接 图片地址组
        # 当前只看到这种类型
        pre_url = ''
        end_url = ''
        if '.jpg' in pic_img:
            lc_jpg = pic_img.index('.jpg')
            pre_url = pic_img[0:lc_jpg - 2]
            end_url = pic_img[lc_jpg:]
        elif '.jpeg' in pic_img:
            lc_jpg = pic_img.index('.jpeg')
            pre_url = pic_img[0:lc_jpg - 2]
            end_url = pic_img[lc_jpg:]
        # 扩展
        else:
            logging.warning('******未处理图片类型：%s' % pic_img)
        all_img = []
        for i in range(1, int(pic_pagenavi) + 1):
            if i < 10:
                val = '0' + str(i)
            else:
                val = str(i)
            all_img.append({'order': i, 'img_url': pre_url + val + end_url})

        # 以第一页浏览量统计
        pic_meta['views'] = int(pic_meta['views']) * int(pic_pagenavi)
        pic_meta['img_list'] = all_img
        return pic_meta

Example #15

0

Show file

def crawling(web_url):
    html = urlopen(web_url)  
    bsObject = BeautifulSoup(html, "html.parser") 
    #print(bsObject)
    #print(bsObject.find('script'))

    bsObject=str(bsObject)
    #print(type(bsObject))
    #ingredient=re.findall('[[]재료[]].*',bsObject)
    start=bsObject.index("application")
    end=bsObject.index("</script>",start)
    dic=eval(bsObject[start+len('application/ld+json">'):end])
    title=dic['name']
    author=dic['author']['name']
    food_ingredient=dic['recipeIngredient']
    for i in range(len(food_ingredient)):   
        ingredient=food_ingredient[i]
        food_ingredient[i]=re.findall('[가-힣\s]+',ingredient)[0].rstrip()
    picture=dic['image'][0]
    link=web_url
    return {'title':title, 'author':author, 'food_ingredient':food_ingredient, 'picture':picture, 'link':link}

Example #16

0

Show file

File: scraper.py Project: AlphaPL/LinkedInEmployeeScraper

    def __get_job(self, data, se, link):
        if se == 'bing':
            result = re.search('.*-(.*)-.*|.*', data.getText()).group(
                1)  # re.search('((?<=>)[A-Z].+?) - ', str(data)).group(1)

        result = re.search('.*-(.*)-.*|.*', data.getText()).group(1)
        if result:
            return result
        try:

            self.linked_in_driver.get(link)
            result = BeautifulSoup(linked_in_driver.page_source, "lxml")
            result = result.findAll(
                'h2',
                {'class': 'mt1 t-18 t-black t-normal break-words'})[0].text
            result = result[:result.index('at')]
            result = result[:result.index('@')]
            result = result.strip()
        except:
            pass
        return result

Example #17

0

Show file

def __init__(self):
    # Naver 환율 페이지 크롤링(Crawling) 작업
    fp = urllib.request.urlopen(
        'http://info.finance.naver.com/marketindex/exchangeList.nhn')
    source = fp.read()
    fp.close()

    # 크롤링한 정보
    # tit - 통화명
    # sale - 매매기준율
    class_list = ["tit", "sale"]

    # BeautifulSoup으로 html소스를 python객체로 변환
    # (html소스코드, 이용할 parser)
    # python 내장 html.parser를 이용
    soup = BeautifulSoup(source, 'html.parser')
    soup = soup.find_all("td", class_=class_list)

    # 각 국가별 환율 정보 저장
    money_data = {}
    for data in soup:
        if soup.index(data) % 2 == 0:
            data = data.get_text().replace('\n', '').replace('\t',
                                                             '')  # HTML 태그 제거
            money_key = data  # key 값에 통화명 저장
        elif soup.index(data) % 2 == 1:
            money_value = data.get_text()
            money_data[money_key] = money_value  # 통화명 key 값에 각각의 매매기준율 저장
            money_key = None
            money_value = None

    # 사전 money_data를 리스트 money_data_keys, money_data_values로 변환
    money_data_keys = []
    money_data_values = []

    for key, values in money_data.items():
        money_data_keys.append([key])
        money_data_values.append([values])

    print(money_data_keys)

Example #18

0

Show file

def get_img_url(raw_url):
    # Get text from page
    bs4_text = BeautifulSoup(get_page(raw_url), 'lxml').get_text()
    img_url_location = bs4_text.index(
        'Image URL')  # 'Image URL is what we are searching for in the text
    bs4_text = bs4_text[img_url_location:]
    bs4_text_list = bs4_text.splitlines()

    # Get line that has the image URL and return it
    img_url_line = bs4_text_list[0]
    img_url = img_url_line.split(' ')[4]

    return img_url

Example #19

0

Show file

def location_parser(company_name):
    url = ""
    final_list = []
    try:
        url = BeautifulSoup(
            requests.get(URL + company_name + locations).content,
            "html.parser").find('div', {
                'class': 'kCrYT'
            }).find_all('a')[0]['href']
        url = url[7:url.index('&')]
        page = BeautifulSoup(requests.get(url).content,
                             "html.parser").find_all(text=True)
        visible_texts = filter(tag_visible, page)
        visible_texts = u" ".join(t.strip() for t in visible_texts)
        for t in all_locations:
            if t in visible_texts:
                final_list.append(t)

    except:
        try:
            url = BeautifulSoup(
                requests.get(URL + company_name + tag_locations).content,
                "html.parser").find('div', {
                    'class': 'kCrYT'
                }).find_all('a')[0]['href']
            url = url[7:url.index('&')]
            page = BeautifulSoup(requests.get(url).content,
                                 "html.parser").find_all(text=True)
            visible_texts = filter(tag_visible, page)
            visible_texts = u" ".join(t.strip() for t in visible_texts)
            for t in all_locations:
                if t in visible_texts:
                    final_list.append(t)

        except:
            print(company_name, " NOT FOUND!")

    return url, final_list

Example #20

0

Show file

def generateSnippetNgram(queryTerms, doc, ngramSize):
    lookAhead = 40
    postTail = 50
    htmlContent = BeautifulSoup(doc, "html.parser").find('pre').get_text()

    for i in range(len(queryTerms) - (ngramSize - 1)):
        if ngramSize == 3:
            queryTerm = queryTerms[i] + " " + queryTerms[i + 1] + " " + queryTerms[i + 2]
        elif ngramSize == 2:
            queryTerm = queryTerms[i] + " " + queryTerms[i + 1]
        else:
            queryTerm = queryTerms[i]
        termLocation = htmlContent.find(queryTerm)
        if termLocation != -1:
            startIndex = termLocation - lookAhead
            if startIndex <= 0:
                startIndex = 0
            else:
                while startIndex > 0:
                    if htmlContent[startIndex - 1:startIndex] not in [" ", "\n"]:
                        startIndex -= 1
                    else:
                        break
            endIndex = htmlContent.index(queryTerm) + len(queryTerm) + postTail
            if endIndex > len(htmlContent):
                endIndex = len(htmlContent)
            while endIndex < len(htmlContent):
                if htmlContent[endIndex:startIndex - 1] not in [" ", "\n"]:
                    endIndex += 1
                else:
                    break
            first = htmlContent[startIndex: htmlContent.index(queryTerm)]
            second = htmlContent[htmlContent.index(queryTerm): htmlContent.index(queryTerm) + len(queryTerm)]
            third = htmlContent[htmlContent.index(queryTerm) + len(queryTerm): endIndex]
            return first, second, third

    return False, False, False

Example #21

0

Show file

File: Scraper.py Project: Saiyan-God/Spell-It-Out

def scrape(url, frontTrim, backTrim, splitMethod, seperator):

    request = urllib.request.Request(url)
    html = urllib.request.urlopen(request).read()
    soup = BeautifulSoup(html, 'html.parser')

    main_table = soup.find('div', attrs={'class': 'mw-parser-output'})

    items = main_table.find_all(seperator)

    items = str(items)
    items = BeautifulSoup(items, 'lxml').text
    items = items.split(',')

    del items[:frontTrim]
    del items[-backTrim:]

    for pair in items:
        items[items.index(pair)] = pair.strip()

    for pair in items:
        items[items.index(pair)] = pair.split(splitMethod)

    return (items)

Example #22

0

Show file

def parse_and_render_recipe(input, js_path_prefix="", css_path_prefix=""):
    """
    Parse a recipe from markdown

    Args:
        input: str, The recipe as markdown
    """
    parser = commonmark.Parser()
    ast = parser.parse(input)

    renderer = commonmark.HtmlRenderer()
    html = renderer.render(ast)
    soup = BeautifulSoup(html, 'html.parser')
    # print(soup.prettify())
    steps = soup.find("h2", text="Zubereitung")

    if (nextPart := steps.findNextSibling("h2")) is not None:
        steps_upto = soup.index(nextPart)

Example #23

0

Show file

    def linkWordMatching(self, links, words):
        print("Start Links Contents")

        linkContent = {}
        for link in links:
            rows = self.queries.Get_html(link)
            soup = rows[0][0]
            soup = soup.replace(
                'class="srow bigbox container mi-df-local locked-single"',
                'class="row bigbox container mi-df-local single-local"')
            soup = BeautifulSoup(soup, "html.parser")
            [s.extract() for s in soup('script')]
            [s.extract() for s in soup('style')]
            soup = soup.text
            soup = soup.lower()
            soup = " ".join(soup.split('\n'))
            soup = " ".join(soup.split('\r'))
            for word in words:
                word_unstemed = self.queries.Get_unstemed(word, link)
                if (word_unstemed in soup):
                    index = soup.index(word_unstemed)
                    if (index - 500 >= 0):
                        start = index - 500
                    else:
                        start = 0
                    if (index + 500 <= len(soup)):
                        end = index + 500
                    else:
                        end = len(soup) - 1
                    while (start != 0 and soup[start] != " "
                           and soup[start] != "\n"):
                        start = start - 1
                    if (start != 0):
                        start = start + 1
                    while (end != 0 and soup[end] != " "
                           and soup[end] != "\n"):
                        end = end + 1
                    if (end != 0):
                        end = end - 1
                    text = ''.join(soup[start:end])
                    linkContent[link] = text
                    break
        print("End Links Contents")
        return linkContent

Example #24

0

Show file

    def get_Tbils(self, number_of_results):
        global browser

        ## browser.headers['origin']=" https://www.bot.go.tz"
        ## browser.headers['Cookie']=" ASPSESSIONIDCUBSCTSB=CGIFEPJBOJIOFIPBMIHFGDEP"
        url_transaction = 'http://www.bot.go.tz/FinancialMarkets/FinancialMarkets.asp'
        raw_html = browser.get(url_transaction)
        data_raw = Parser(raw_html.text, 'html.parser').findAll(
            'select', {'name': "TreasuryBillAuctionResults"})[0]('option')
        if number_of_results is 0:
            auctions = [auc.attrs['value'] for auc in data_raw]
        else:
            auctions = list(self.stop() if data_raw.index(auc) ==
                            number_of_results else auc.attrs['value']
                            for auc in data_raw)
            print(auctions)

        result = {auc: self.get_Treasury_bill(auc) for auc in auctions}

        return result

Example #25

0

Show file

File: 1.py Project: zw900/91pron_python

def download_img(title, link):
    nomakechar =  [":","/","\\","?","*","“","<",">","|","”"]
    for item in nomakechar:
        if title.find(item)>-1:
            title = title.replace(item, '')
    if os.path.exists(title):
        return
    os.makedirs(title)

    re = s.get(link, headers=header)
    re.encoding = 'gbk'
    img = BeautifulSoup(re.text, "html.parser").find_all(
        'img', class_='zoom')
    pbar = tqdm(total=len(img))
    for i in img:
        file_name = title + '/' + str(img.index(i)) + '.jpg'
        if not os.path.exists(file_name):
            download_link = ''
            # if i.get('src') == None:
            #     download_link = i.get('file')
            # else:
            #     download_link = i.get('src')
            download_link = i.get('file')
            index = 1
            while index<=1:
                try:
                    re = s.get(download_link, headers=header, timeout=5) 
                    with open(file_name, 'wb') as f:
                        f.write(re.content)
                except:
                    index += 1
                    continue
                else:
                    break
        pbar.update(1)
    pbar.close()

Example #26

0

Show file

    print geocode
    #Google imposes query limits, this lets us pass a failure and have the loop sleep and try again after 2 seconds
    if geocode['status'] == "OVER_QUERY_LIMIT":
        return 0
    if geocode['status'] != 'ZERO_RESULTS':
        coord_lat = geocode['results'][0]['geometry']['location']['lat']
        coord_lon = geocode['results'][0]['geometry']['location']['lng']
        coord.append(coord_lat)
        coord.append(coord_lon)
    print coord
    return coord

url = "https://www.denvergov.org/Portals/707/documents/mydenverdrive/1-22-25-2013.pdf"
xml = scraperwiki.pdftoxml(urllib2.urlopen(url).read())
parsed = BeautifulSoup(xml).text.split("\n")
filtered_list = parsed[parsed.index('Location: '):]
closures = []

i = 0
current_closure = -1

while i < len(filtered_list):
    text = filtered_list[i]
    if text == "Location: ":
        closures.append({})
        current_closure = len(closures) - 1
        i += 1
        closures[current_closure]['location'] = filtered_list[i]
        #print filtered_list[i]
        coordinate = geocode(filtered_list[i])
        if(coordinate == 0):

Example #27

0

Show file

File: _Cleanner.py Project: KrishnnaOUT/web_crawling

def getDom(pageurl, charset):

    if charset is None:
        charset = 'utf-8'

    soup = BeautifulSoup(pageurl, 'html.parser', from_encoding=charset)

    #去除特定的head、script、style、img、input标签
    [
        body.extract()
        for body in soup(['head', 'img', 'script', 'style', 'input'])
    ]  #

    #去除注释
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()
        pass

    #将soup中的文本提取出来，并存储到body数组中
    soup = soup.text.strip().lstrip().rstrip().split()

    #获取当前日期和具体时间，以便提出soup数组中出现的无效数据（当前系统时间）
    currentDate = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    #处理日期格式，例：2017-04-20---->2017-4-20
    currentDate1 = currentDate[0:5] + currentDate[6:]
    #获取当前具体时间，以便提出soup数组中出现的无效数据（当前系统时间）
    currentTime = time.strftime('%H:%M', time.localtime(time.time()))
    #处理日期格式，例：23:58---->23:5,防止因为程序运行而导致的时间误差
    currentTime1 = currentTime[0:4]

    #剔除soup数组中无效字符串，减少干扰
    #剔除soup数组中'copyright'后半部分的版权内容
    #剔除soup数组中无效的年月。如"1999",剔除“2001-2007”格式的时间字符串
    #剔除soup数组中类似于”最后登录：2017-04-20 23：:55“的无效时间
    #剔除soup数组中出现的当前系统时间字符串
    re0 = re.compile(r'.*Copyright.*')
    re1 = re.compile(r'.*((19\d{2}\D)|(\d{4}-\d{4}\D)).*')
    re2 = re.compile(r'(^|.*)注册.*(\d{2,4}(-|/))?\d{1,2}(-|/)\d{1,2}$')
    re3 = re.compile(r'.*(' + currentDate1 + '|' + currentDate + ').*')
    re4 = re.compile(r'.*' + currentTime1 + '\d.*')
    re5 = re.compile(r'^最后.*(\d{2,4}(-|/))?\d{1,2}(-|/)\d{1,2}.*')
    for item in soup:

        #剔除soup数组中以":"、“：”结尾的文本字符串
        if item.endswith("："):
            soup.pop(soup.index(item))
            pass
        if item.endswith(":"):
            soup.pop(soup.index(item))
            pass
        #剔除soup数组中"|"和“»”和“›”文本字符串
        if '|' in soup:
            soup.pop(soup.index('|'))
            pass
        if '>' in soup:
            soup.pop(soup.index('>'))
            pass
        if '»' in soup:
            soup.pop(soup.index('»'))
            pass
        if '›' in soup:
            soup.pop(soup.index('›'))
            pass
        if re0.match(item):
            CopyrightIndex = soup.index(item) - 5
            while CopyrightIndex <= len(soup) - 1:
                popItem = soup.pop(CopyrightIndex)
                pass
            pass
        #剔除不规则时间
        if re1.match(item):
            #获得主体内容中re1指定格式匹配到的文本所在的下标timeIndex
            timeIndex = soup.index(item)
            #,剔除不规则不正常时间数据，防止被重复遍历
            soup.pop(timeIndex)
            pass

        if re2.match(item):
            #获得主体内容中re2指定格式匹配到的文本所在的下标timeIndex
            timeIndex = soup.index(item)
            #,剔除不规则不正常时间数据，并让该下标下的值置为'|'，防止被重复遍历
            soup.pop(timeIndex)
        pass
        #剔除当前系统时间
        if re3.match(item):
            if item in soup:
                if re4.match(soup[soup.index(item) + 1]):
                    #剔除当前系统时间
                    soup.pop(soup.index(item) + 1)
                    #去除当前系统日期
                    soup.pop(soup.index(item))
                    pass
                pass
            pass
        if re5.match(item):
            if item in soup:
                #获得主体内容中re2指定格式匹配到的文本所在的下标timeIndex
                timeIndex = soup.index(item)
                #,剔除不规则不正常时间数据，并让该下标下的值置为'|'，防止被重复遍历
                soup.pop(timeIndex)
                pass
            pass
        pass
    #返回经过数据预处理的soup数组
    return soup
    pass

Example #28

0

Show file

File: HockeyGoalsFunctions.py Project: DavidWBennett/NHLGoals

for schedule in schedules:
    date = schedule
    game_list = []
    for dates in date:
        url = f"https://www.hockey-reference.com/boxscores/{dates}.html#all_scoring"
        games_html = urlopen(url)
        #games_html = open(f"C:\\Users\\dbge\\OneDrive - Chevron\\Random\\{dates}.html") #This is for use if you have individual games saves as HTML code.
        games_soup = BeautifulSoup(games_html, 'lxml')
            
        table = games_soup.find('table')
        rows = table.findAll('tr')
        str_cells = str(rows)
        cleantext = BeautifulSoup(str_cells, 'lxml').get_text()
        
        try: #This is needed because some of the games have two types of goals. For example, SH and EN. I needed to account for these.
            commaindex = cleantext.index("\n\t\t\t,") #
            commaindex = commaindex + 4 #
            cleantext = cleantext[:(commaindex-4)] +  cleantext[(commaindex+6):(commaindex+8)] + "\n\n" + cleantext[(commaindex+8):] #
            s = cleantext.split(',') #
        except ValueError:
            s = cleantext.split(',')
            
        try: # Some games had two types of goals, twice. Therefore, I needed to repeat this section of code. I never saw a game with two types of goals, three times.
            commaindex = cleantext.index("\n\t\t\t,") 
            commaindex = commaindex + 4 #
            cleantext = cleantext[:(commaindex-4)] +  cleantext[(commaindex+6):(commaindex+8)] + "\n\n" + cleantext[(commaindex+8):] #
            s = cleantext.split(',') #
        except ValueError:
            s = s

Example #29

0

Show file

    def crawl(self, response):

        item = AlbamonItem()
        css_main = "#allcontent > div.viewContent.viewRecruitType > "

        item['aa00'] = response.meta['num']  # 게시글 고유번호

        # 등록일자
        css_regist_time = css_main + "div.registInfo.clearfix.devHidePrint > div.regDate > div > span::text"
        item['aa01'] = response.css(css_regist_time).get().replace("등록",
                                                                   "").strip()

        # 수집일자
        y = datetime.now().year
        m = datetime.now().month
        d = datetime.now().day
        h = datetime.now().hour
        n = datetime.now().minute
        date = f"{y}-{str(0)*(2-len(str(m)))}{m}-{str(0)*(2-len(str(d)))}{d}"
        time = f" {str(0)*(2-len(str(h)))}{h}:{str(0)*(2-len(str(n)))}{n}"
        item['aa02'] = date + time

        # 맨 처음 노출되는 기업명 & 게시물 제목
        css_recruitInfo = "div.viewTypeFullWidth > div.companyInfo.infoBox > div.recruitInfo > "

        css_company = css_main + css_recruitInfo + "div.company > span::text"
        company = response.css(css_company).extract()
        item['aa03'] = company[0]  # 기업이름

        css_title = css_main + css_recruitInfo + "h1::text"
        item['aa04'] = response.css(css_title).get()  # 게시물 제목

        # 근무장소 세부사항
        css_workarea = css_main + "div.viweTab > div.tabItem_workArea > div.workAddr > span::text"

        item['ab00'] = response.meta['workarea']  # 근무장소 간단히 (시군구)
        item['ab01'] = response.css(css_workarea).get()  # 근무장소 자세히 (좌표 변환 가능)

        css_near = "div.viweTab > div.tabItem_workArea > div.mapInfo > div > ul > li"
        near = response.css(css_near).extract()
        subway = ""
        college = ""
        for html in near:
            s = BeautifulSoup(html, 'lxml')
            if s.select_one('span.mapItemTitle').text == '인근지하철':
                subnums = [
                    x['href'][x['href'].index("CodSubway=") + 10:]
                    for x in s.select('div > a')
                ]
                times = [
                    x.text[x.text.index("도보") + 3:x.text.index("분")]
                    for x in s.select('span.areaSummary')
                ]
                for subnum, time in zip(subnums, times):
                    subway += subnum + "_" + time + " "
            elif s.select_one('span.mapItemTitle').text == '인근대학':
                college = s.select_one("span.areaSummary").text
        item['ab02'] = subway.strip()
        item['ab03'] = college.strip()

        # 기업정보
        css_firmid = "#section_cropInfo > a::attr(href)"
        firmid = response.css(css_firmid).get()
        item['ac00'] = firmid[firmid.index("C_No=") + 5:]
        item['ac01'] = 1 * ('근로계약서 작성약속' in company)
        item['ac02'] = 1 * ('성희롱 예방교육수료' in company)

        # 지원 방법
        css_regist_type = "div.viewTypeFullWidth > div.conditionInfo.verticalLine > " \
        "div.column.column_340.infoBox.devHidePrint > button::text"
        list_regist_type = [
            e.replace(" ", "").replace("\n", "").replace("\r", "")
            for e in response.css(css_regist_type).extract()
        ]
        registtype = " ".join(list_regist_type)
        css_regist_type2 = "div.viewTypeFullWidth > div.conditionInfo.verticalLine > " \
        "div.column.column_340.infoBox.devHidePrint > div.recruitType.one > ul > li > button::text"
        if len(response.css(css_regist_type2).extract()):
            registtype += " " + response.css(css_regist_type2).get().replace(
                "\n", "")

        item['ad00'] = 1 * ('온라인지원' in registtype)
        item['ad01'] = 1 * ('간편문자지원' in registtype)
        item['ad02'] = 1 * ('이메일지원' in registtype)
        item['ad03'] = 1 * ('홈페이지' in registtype)
        item['ad04'] = 1 * ('전화연락' in registtype)
        item['ad05'] = 1 * ('바로방문' in registtype)

        # b. 모집 조건----------------------------------------------------------------------------

        css_recruit = "div.viewTypeFullWidth > div.conditionInfo.verticalLine > " \
        "div.column.column_620.infoBox > div.recruitCondition > div > table"
        html_recruit = str(response.css(css_recruit).get())
        table_recruit = read_html(html_recruit)[0].set_index(0)

        try:
            finaldate = table_recruit.loc['마감일', 1]
            if '마감' in finaldate:
                item['ba00'] = finaldate[:finaldate.index('(')].replace(
                    '.', '')
            item['ba01'] = 1 * ('상시모집' in finaldate)
        except:
            pass

        try:
            pplnumraw = table_recruit.loc['인원', 1]
            if '인원미정' in pplnumraw:
                item['bb01'] = sum(['0' == x for x in pplnumraw])
            else:
                item['bb00'] = int(pplnumraw[:pplnumraw.index('명')])
            item['bh04'] = 1 * ('친구와 함께 근무가능' in pplnumraw)
        except:
            pass

        try:
            sex = table_recruit.loc['성별', 1]
            if sex == '무관':
                item['bc00'] = 0
            elif sex == '남자':
                item['bc00'] = 1
            elif sex == '여자':
                item['bc00'] = 2
        except:
            pass

        try:
            age = table_recruit.loc['연령', 1]
            if '무관' in age:
                item['bd00'] = 1
            elif '~' in age:
                agemin, agemax = [
                    int(s[s.index('년') - 4:s.index('년')])
                    for s in age.split('~')
                ]
                item['bd01'] = agemin
                item['bd02'] = agemax
            item['bh01'] = 1 * ('주부가능' in age)
            item['bh02'] = 1 * ('장년가능' in age)
            item['bh03'] = 1 * ('청소년가능' in age)  # 초보가능(bh00)은 뒤에 등장
        except:
            pass

        try:
            eduraw = table_recruit.loc['학력', 1]
            if eduraw == '무관':
                item['be00'] = 0
            elif '초등학교' in eduraw:
                item['be00'] = 1
            elif '중학교' in eduraw:
                item['be00'] = 2
            elif eduraw == '고등학교 졸업 이상':
                item['be00'] = 3
            elif eduraw == '대학(2,3년제) 졸업 이상':
                item['be00'] = 4
            elif eduraw == '대학(4년제) 졸업 이상':
                item['be00'] = 5
            elif '대학원' in eduraw:
                item['be00'] = 6
        except:
            pass

        # 표준화가 되어 있지 않아서 크게 쓸모가 없어 보임
        try:
            item['bf00'] = table_recruit.loc['모집분야', 1]
        except:
            pass

        try:
            prefer = table_recruit.loc['우대', 1]
            item['bg00'] = 1 * ('영어가능' in prefer)
            item['bg01'] = 1 * ('중국어가능' in prefer)
            item['bg02'] = 1 * ('일본어가능' in prefer)
            item['bg03'] = 1 * ('군필자' in prefer)
            item['bg04'] = 1 * ('업무 관련 자격증 소지' in prefer)
            item['bg05'] = 1 * ('유사업무 경험' in prefer)
            item['bg06'] = 1 * ('워드가능' in prefer)
            item['bg07'] = 1 * ('엑셀가능' in prefer)
            item['bg08'] = 1 * ('파워포인트 가능' in prefer)
            item['bg09'] = 1 * ('한글(HWP)가능' in prefer)
            item['bg10'] = 1 * ('포토샵가능' in prefer)
            item['bg11'] = 1 * ('컴퓨터활용가능' in prefer)
            item['bg12'] = 1 * ('대학재학생' in prefer)
            item['bg13'] = 1 * ('대학휴학생' in prefer)
            item['bg14'] = 1 * ('인근거주' in prefer)
            item['bg15'] = 1 * ('차량소지' in prefer)
            item['bg16'] = 1 * ('운전가능' in prefer)
            item['bg17'] = 1 * ('장애인' in prefer)
        except:
            pass

        # c.근무조건 --------------------------------------------------------

        item['ca00'] = response.meta['payamount']  # 급여액

        # 급여지급방식
        if response.meta['paytype'] == '시급':
            item['ca01'] = 0
        elif response.meta['paytype'] == '일급':
            item['ca01'] = 1
        elif response.meta['paytype'] == '주급':
            item['ca01'] = 2
        elif response.meta['paytype'] == '월급':
            item['ca01'] = 3
        elif response.meta['paytype'] == '연봉':
            item['ca01'] = 4
        elif response.meta['paytype'] == '건별':
            item['ca01'] = 5
        item['cd00'] = 1 * (response.meta['worktime'] == '시간협의')
        if '~' in response.meta['worktime']:
            wtbegin, wtend = response.meta['worktime'].split('~')
            item['cd01'] = wtbegin
            item['cd02'] = wtend

        # 근무 조건
        css_recruit = "div.viewTypeFullWidth > div.conditionInfo.verticalLine > " \
            "div.column.column_620.infoBox > div.workCondition > div.viewTable > table"
        html_recruit = str(response.css(css_recruit).get())
        table_recruit = read_html(html_recruit)[0].set_index(0)

        try:
            payraw = table_recruit.loc['급여', 1]
            paydetail = payraw[payraw.index("0원") + 2:]
            item['ca02'] = 1 * ('협의가능' in paydetail)
            paydetail = paydetail.replace('협의가능', '')
            item['ca03'] = 1 * ('당일지급' in paydetail)
            paydetail = paydetail.replace('당일지급', '')
            item['ca04'] = 1 * ('주급가능' in paydetail)
            paydetail = paydetail.replace('주급가능', '')
            item['ca05'] = 1 * ('식대별도지급' in paydetail)
            paydetail = paydetail.replace('식대별도지급', '')
            item['ca06'] = 1 * ('수습기간있음' in paydetail)
            paydetail = paydetail.replace('수습기간있음', '')
            item['ca07'] = 1 * ('시간외수당 별도' in paydetail)
            paydetail = paydetail.replace('시간외수당 별도', '')
            item['ca08'] = paydetail.strip()
        except:
            pass

        try:
            workperiod = table_recruit.loc['근무기간', 1]
            item['cb01'] = 1 * ('협의가능' in workperiod)
            if '하루(1일)' in workperiod:
                item['cb00'] = 0
            elif '1주일이하' in workperiod:
                item['cb00'] = 1
            elif '1주일~1개월' in workperiod:
                item['cb00'] = 2
            elif '1개월~3개월' in workperiod:
                item['cb00'] = 3
            elif '3개월~6개월' in workperiod:
                item['cb00'] = 4
            elif '6개월~1년' in workperiod:
                item['cb00'] = 5
            elif '1년이상' in workperiod:
                item['cb00'] = 6
        except:
            pass

        try:
            workdays = table_recruit.loc['근무요일', 1]
            item['cc00'] = 1 * ('요일협의' in workdays)
            item['cc01'] = 1 * ('월~일' in workdays)
            item['cc02'] = 1 * ('월~토' in workdays)
            item['cc03'] = 1 * ('월~금' in workdays)
            item['cc04'] = 1 * ('토,일' in workdays)
            item['cc05'] = 1 * ('주6일' in workdays)
            item['cc06'] = 1 * ('주5일' in workdays)
            item['cc07'] = 1 * ('주4일' in workdays)
            item['cc08'] = 1 * ('주3일' in workdays)
            item['cc09'] = 1 * ('주2일' in workdays)
            item['cc10'] = 1 * ('주1일' in workdays)
        except:
            pass

        try:
            wtdetail = table_recruit.loc['근무시간', 1]
            item['cd03'] = 1 * ('익일' in wtdetail)
            wtdetail = wtdetail.replace('(익일)', '')
            item['cd04'] = wtdetail[wtdetail.index('휴게시간') +
                                    5:wtdetail.index('분')]
        except:
            pass

        try:
            emptype = table_recruit.loc['고용형태', 1]
            item['ce00'] = 1 * ('알바' in emptype)
            item['ce01'] = 1 * ('정규직' in emptype)
            item['ce02'] = 1 * ('계약직' in emptype)
            item['ce03'] = 1 * ('파견직' in emptype)
            item['ce04'] = 1 * ('청년인턴직' in emptype)
            item['ce05'] = 1 * ('위촉직' in emptype)
            item['ce06'] = 1 * ('연수생/교육생' in emptype)
        except:
            pass

        try:
            welfare = table_recruit.loc['복리후생', 1]
            # 보험 (wf_isr)
            item['cf00'] = 1 * ('국민연금' in welfare)
            item['cf01'] = 1 * ('고용보험' in welfare)
            item['cf02'] = 1 * ('산재보험' in welfare)
            item['cf03'] = 1 * ('건강보험' in welfare)

            # 휴가, 휴무
            item['cf04'] = 1 * ('정기휴가' in welfare)
            item['cf05'] = 1 * ('연차' in welfare)
            item['cf06'] = 1 * ('월차' in welfare)

            # 보상제도
            item['cf07'] = 1 * ('인센티브제' in welfare)
            item['cf08'] = 1 * ('정기보너스' in welfare)
            item['cf09'] = 1 * ('퇴직금' in welfare)
            item['cf10'] = 1 * ('퇴직연금' in welfare)
            item['cf11'] = 1 * ('우수사원 표창/포상' in welfare)
            item['cf12'] = 1 * ('장기근속자 포상' in welfare)

            # 수당제도
            item['cf13'] = 1 * ('야간근로수당' in welfare)
            item['cf14'] = 1 * ('휴일근로수당' in welfare)
            item['cf15'] = 1 * ('연월차수당' in welfare)
            item['cf16'] = 1 * ('장기근속수당' in welfare)
            item['cf17'] = 1 * ('위험수당' in welfare)
            item['cf18'] = 1 * ('연장근로수당' in welfare)

            # 생활안정 지원
            item['cf19'] = 1 * ('기숙사운영' in welfare)
            item['cf20'] = 1 * ('명절 귀향비 지급' in welfare)

            # 생활편의 지원
            item['cf21'] = 1 * ('조식제공' in welfare)
            item['cf22'] = 1 * ('중식제공' in welfare)
            item['cf23'] = 1 * ('석식제공' in welfare)
            item['cf24'] = 1 * ('근무복 지급' in welfare)
            item['cf25'] = 1 * ('통근버스 운행' in welfare)
            item['cf26'] = 1 * ('야간교통비 지급' in welfare)
            item['cf27'] = 1 * ('차량유류보조금' in welfare)
            item['cf28'] = 1 * ('주차비지원' in welfare)
            item['cf29'] = 1 * ('주차가능' in welfare)

            # 경조사 지원
            item['cf30'] = 1 * ('경조휴가제' in welfare)
            item['cf31'] = 1 * ('각종 경조금' in welfare)

        except:
            pass

        # d.업직종 --------------------------------------------------------

        try:
            jobtype = table_recruit.loc['업직종', 1]
            item['bh00'] = 1 * ('초보가능' in jobtype)

            item['da00'] = 1 * ('일반음식점' in jobtype)
            item['da01'] = 1 * ('레스토랑' in jobtype) * ('패밀리' not in jobtype)
            item['da02'] = 1 * ('레스토랑' in jobtype) * ('패밀리' in jobtype)
            item['da03'] = 1 * ('패스트푸드점' in jobtype)
            item['da04'] = 1 * ('치킨·피자전문점' in jobtype)
            item['da05'] = 1 * ('커피전문점' in jobtype)
            item['da06'] = 1 * ('아이스크림·디저트' in jobtype)
            item['da07'] = 1 * ('베이커리·도넛·떡' in jobtype)
            item['da08'] = 1 * ('호프·일반주점' in jobtype)
            item['da09'] = 1 * ('급식·푸드시스템' in jobtype)
            item['da10'] = 1 * ('도시락·반찬' in jobtype)

            # 유통 & 판매 (jt_sl)
            item['db00'] = 1 * ('백화점·면세점' in jobtype)
            item['db01'] = 1 * ('복합쇼핑몰·아울렛' in jobtype)
            item['db02'] = 1 * ('쇼핑몰·소셜커머스·홈쇼핑' in jobtype)
            item['db03'] = 1 * ('유통점·마트' in jobtype)
            item['db04'] = 1 * ('편의점' in jobtype)
            item['db05'] = 1 * ('의류·잡화매장' in jobtype)
            item['db06'] = 1 * ('뷰티·헬스스토어' in jobtype)
            item['db07'] = 1 * ('휴대폰·전자기기매장' in jobtype)
            item['db08'] = 1 * ('가구·침구·생활소품' in jobtype)
            item['db09'] = 1 * ('서점·문구·팬시' in jobtype)
            item['db10'] = 1 * ('약국' in jobtype)
            item['db11'] = 1 * ('농수산·청과·축산' in jobtype)
            item['db12'] = 1 * ('화훼·꽃집' in jobtype)
            item['db13'] = 1 * ('유통·판매·기타' in jobtype)

            # 문화 & 여가 & 생활 (leisure)
            item['dc00'] = 1 * ('놀이공원·테마파크' in jobtype)
            item['dc01'] = 1 * ('호텔·리조트·숙박' in jobtype)
            item['dc02'] = 1 * ('여행·캠프·레포츠' in jobtype)
            item['dc03'] = 1 * ('영화·공연' in jobtype)
            item['dc04'] = 1 * ('전시·컨벤션·세미나' in jobtype)
            item['dc05'] = 1 * ('스터디룸·독서실·고시원' in jobtype)
            item['dc06'] = 1 * ('PC방' in jobtype)
            item['dc07'] = 1 * ('노래방' in jobtype)
            item['dc08'] = 1 * ('볼링·당구장' in jobtype)
            item['dc09'] = 1 * ('스크린 골프·야구' in jobtype)
            item['dc10'] = 1 * ('DVD·멀티방·만화카페' in jobtype)
            item['dc11'] = 1 * ('오락실·게임장' in jobtype)
            item['dc12'] = 1 * ('이색테마카페' in jobtype)
            item['dc13'] = 1 * ('키즈카페' in jobtype)
            item['dc14'] = 1 * ('찜질방·사우나·스파' in jobtype)
            item['dc15'] = 1 * ('피트니스·스포츠' in jobtype)
            item['dc16'] = 1 * ('공인중개' in jobtype)
            item['dc17'] = 1 * ('골프캐디' in jobtype)
            item['dc18'] = 1 * ('고속도로휴게소' in jobtype)
            item['dc19'] = 1 * ('문화·여가·생활 기타' in jobtype)

            # 서비스
            item['dd00'] = 1 * ('매장관리·판매' in jobtype)
            item['dd01'] = 1 * ('MD' in jobtype)
            item['dd02'] = 1 * ('캐셔·카운터' in jobtype)
            item['dd03'] = 1 * ('서빙' in jobtype)
            item['dd04'] = 1 * ('주방장·조리사' in jobtype)
            item['dd05'] = 1 * ('주방보조·설거지' in jobtype)
            item['dd06'] = 1 * ('바리스타' in jobtype)
            item['dd07'] = 1 * ('안내데스크' in jobtype)
            item['dd08'] = 1 * ('주차관리·주차도우미' in jobtype)
            item['dd09'] = 1 * ('보안·경비·경호' in jobtype)
            item['dd10'] = 1 * ('주유·세차' in jobtype)
            item['dd11'] = 1 * ('전단지배포' in jobtype)
            item['dd12'] = 1 * ('청소·미화' in jobtype)
            item['dd13'] = 1 * ('렌탈관리·A/S' in jobtype)
            item['dd14'] = 1 * ('헤어·미용·네일샵' in jobtype)
            item['dd15'] = 1 * ('피부관리·마사지' in jobtype)
            item['dd16'] = 1 * ('반려동물케어' in jobtype)
            item['dd17'] = 1 * ('베이비시터·가사도우미' in jobtype)
            item['dd18'] = 1 * ('결혼·연회·장례도우미' in jobtype)
            item['dd19'] = 1 * ('판촉도우미' in jobtype)
            item['dd20'] = 1 * ('이벤트·행사스텝' in jobtype)
            item['dd21'] = 1 * ('나레이터모델' in jobtype)
            item['dd22'] = 1 * ('피팅모델' in jobtype)
            item['dd23'] = 1 * ('서비스 기타' in jobtype)

            # 사무직
            item['de00'] = 1 * ('사무보조' in jobtype)
            item['de01'] = 1 * ('문서작성·자료조사' in jobtype)
            item['de02'] = 1 * ('비서' in jobtype)
            item['de03'] = 1 * ('경리·회계보조' in jobtype)
            item['de04'] = 1 * ('인사·총무' in jobtype)
            item['de05'] = 1 * ('마케팅·광고·홍보' in jobtype)
            item['de06'] = 1 * ('번역·통역' in jobtype)
            item['de07'] = 1 * ('복사·출력·제본' in jobtype)
            item['de08'] = 1 * ('편집·교정·교열' in jobtype)
            item['de09'] = 1 * ('공공기관·공기업·협회' in jobtype)
            item['de10'] = 1 * ('학교·도서관·교육기관' in jobtype)

            # 고객상담 & 리서치 & 영업
            item['df00'] = 1 * ('고객상담·인바운드' in jobtype)
            item['df01'] = 1 * ('레마케팅·아웃바운드' in jobtype)
            item['df02'] = 1 * ('금융·보험영업' in jobtype)
            item['df03'] = 1 * ('일반영업·판매' in jobtype)
            item['df04'] = 1 * ('설문조사·리서치' in jobtype)
            item['df05'] = 1 * ('영업관리·지원' in jobtype)

            # 생산 & 건설 & 노무
            item['dg00'] = 1 * ('제조·가공·조립' in jobtype)
            item['dg01'] = 1 * ('포장·품질검사' in jobtype)
            item['dg02'] = 1 * ('입출고·창고관리' in jobtype)
            item['dg03'] = 1 * ('상하차·소화물 분류' in jobtype)
            item['dg04'] = 1 * ('기계·전자·전기' in jobtype)
            item['dg05'] = 1 * ('정비·수리·설치·A/' in jobtype)
            item['dg06'] = 1 * ('공사·건설현장' in jobtype)
            item['dg07'] = 1 * ('PVC(닥트·배관설치)' in jobtype)
            item['dg08'] = 1 * ('조선소' in jobtype)
            item['dg09'] = 1 * ('재단·재봉' in jobtype)
            item['dg10'] = 1 * ('생산·건설·노무 기타' in jobtype)

            # IT & 컴퓨터
            item['dh00'] = 1 * ('웹·모바일기획' in jobtype)
            item['dh01'] = 1 * ('사이트·콘텐츠 운영' in jobtype)
            item['dh02'] = 1 * ('바이럴·SNS마케팅' in jobtype)
            item['dh03'] = 1 * ('프로그래머' in jobtype)
            item['dh04'] = 1 * ('HTML코딩' in jobtype)
            item['dh05'] = 1 * ('QA·테스터·검증' in jobtype)
            item['dh06'] = 1 * ('시스템·네트워크·보안' in jobtype)
            item['dh07'] = 1 * ('PC·디지털기기 설치·관리' in jobtype)

            item['di00'] = 1 * ('입시·보습학원' in jobtype)
            item['di01'] = 1 * ('외국어·어학원' in jobtype)
            item['di02'] = 1 * ('컴퓨터·정보통신' in jobtype)
            item['di03'] = 1 * ('요가·필라테스 강사' in jobtype)
            item['di04'] = 1 * ('피트니스 트레이너' in jobtype)
            item['di05'] = 1 * ('레져스포츠 강사' in jobtype)
            item['di06'] = 1 * ('예체능 강사' in jobtype)
            item['di07'] = 1 * ('유아·유치원' in jobtype)
            item['di08'] = 1 * ('방문·학습지' in jobtype)
            item['di09'] = 1 * ('보조교사' in jobtype)
            item['di10'] = 1 * ('자격증·기술학원' in jobtype)
            item['di11'] = 1 * ('국비교육기관' in jobtype)
            item['di12'] = 1 * ('교육·강사 기타' in jobtype)

            # 디자인 (design -> ds)
            item['dj00'] = 1 * ('웹·모바일디자인' in jobtype)
            item['dj01'] = 1 * ('그래픽·편집디자인' in jobtype)
            item['dj02'] = 1 * ('제품·산업디자인' in jobtype)
            item['dj03'] = 1 * ('CAD·CAM·인테리어디자인' in jobtype)
            item['dj04'] = 1 * ('캐릭터·애니메이션디자인' in jobtype)
            item['dj05'] = 1 * ('패션·잡화디자인' in jobtype)
            item['dj06'] = 1 * ('디자인 기타' in jobtype)

            # 미디어 (media -> md)
            item['dk00'] = 1 * ('보조출연·방청' in jobtype)
            item['dk01'] = 1 * ('방송스텝·촬영보조' in jobtype)
            item['dk02'] = 1 * ('동영상촬영·편집' in jobtype)
            item['dk03'] = 1 * ('사진촬영·편집' in jobtype)
            item['dk04'] = 1 * ('조명·음향' in jobtype)
            item['dk05'] = 1 * ('방송사·프로덕션' in jobtype)
            item['dk06'] = 1 * ('신문·잡지·출판' in jobtype)
            item['dk07'] = 1 * ('미디어 기타' in jobtype)

            # 운전 & 배달 (delivery -> dv)
            item['dl00'] = 1 * ('운송·이사' in jobtype)
            item['dl01'] = 1 * ('대리운전·일반운전' in jobtype)
            item['dl02'] = 1 * ('택시·버스운전' in jobtype)
            item['dl03'] = 1 * ('수행기사' in jobtype)
            item['dl04'] = 1 * ('화물·중장비·특수차' in jobtype)
            item['dl05'] = 1 * ('택배·퀵서비스' in jobtype)
            item['dl06'] = 1 * ('배달' in jobtype)

            # 병원 & 간호 & 연구 (medical research -> mr)
            item['dm00'] = 1 * ('간호조무사·간호사' in jobtype)
            item['dm01'] = 1 * ('간병·요양보호사' in jobtype)
            item['dm02'] = 1 * ('원무·코디네이터' in jobtype)
            item['dm03'] = 1 * ('수의테크니션·동물보건사' in jobtype)
            item['dm04'] = 1 * ('실험·연구보조' in jobtype)
            item['dm05'] = 1 * ('생동성·임상실험' in jobtype)

        except:
            pass

        yield item

Example #30

0

Show file

    def extract_intent(text):

        if not text or text == 'None':
            return None

        # remove links in swagger []()
        if "](" in text or "] (" in text:
            text = replace_hyperlinks(text)

        text = BeautifulSoup(text, "lxml").text

        if ':' in text:
            text = text[:text.index(':')]

        if '(' in text:
            text = re.sub(r"\(.*\)", '', text)

        expr = None
        for sent in to_sentences(text):

            if len(sent) > 120:
                continue

            sent = sent.lower()

            tagged_sent = nlp.pos_tag(sent)

            count = 0
            for w, t in tagged_sent:
                if t in {'VB', 'VBZ'}:
                    count += 1

            if count > 1 and len(sent) > 80:
                # print("More than one verb: ", sent)
                continue

            if tagged_sent[0][1] == 'VB' or tagged_sent[0][0] in common_verbs or \
                    (tagged_sent[0][1] == 'RB' and tagged_sent[1][1] == 'VB'):
                expr = sent
            elif tagged_sent[0][1] == 'VBZ' or tagged_sent[0][0] in common_sverbs or \
                    (tagged_sent[0][1] == 'RB' and tagged_sent[1][1] == 'VB'):
                if tagged_sent[0][1] == 'RB' and tagged_sent[1][1] == 'VB':
                    verb = tagged_sent[1][0]
                else:
                    verb = tagged_sent[0][0]
                old_verb = verb
                if verb not in {"was", "is", "has"}:
                    verb = lemmatizer.lemmatize(verb, pos=VERB)

                expr = sent.replace(old_verb, verb)

            if expr and 'http' in expr:
                continue

            if expr and 'see' in expr and (':' in expr or 'please' in expr or 'href' in expr):
                continue

            if expr and ('<' in expr and '>' in expr or '<p>' in expr):
                continue

            if expr:
                expr = finalize_utterance(expr)
                if " by " in expr:
                    expr = expr[:expr.index(" by ")]
                return expr

        return None