def scrapeMedical(url, frontTrim, backTrim):
    '''Function specific to scraping the 24 medical-terminology list pages on Wikipedia.'''
    request = urllib.request.Request(url)
    html = urllib.request.urlopen(request).read()
    soup = BeautifulSoup(html, 'html.parser')
    # Scrape the part of the page inside a specific class
    main_table = soup.find('div', attrs={'class': 'mw-parser-output'})
    items = main_table.find_all('tr')
    items = str(items)
    items = items.replace('\n', ' ')
    items = BeautifulSoup(items, 'lxml').text
    items = items.split(' , ')
    # Get rid of useless HTML at the beginning and end of the page
    del items[:frontTrim]
    del items[-backTrim:]
    # Break each string into a list so that the first item (the acronym) can be separated
    for i, definition in enumerate(items):
        items[i] = definition.split(' ')
    # Combine everything except the acronym back into a single string
    for i, definition in enumerate(items):
        items[i][1:] = [' '.join(items[i][1:])]
    return items
def get_url(city):
    str0 = 'http://sugg.us.search.yahoo.net/gossip-gl-location/?appid=weather&output=xml&command=' + str(city)
    with request.urlopen(str0) as f:
        data = f.read().decode('utf-8')
    s = BeautifulSoup(data, 'lxml').find("s")["d"]
    woeid = s[s.index('woeid') + 6:s.index('woeid') + 13]
    q = 'select+*+from+weather.forecast+where+woeid%3D' + woeid + '+and+u%3D%22c%22&diagnostics=true'
    url = 'https://query.yahooapis.com/v1/public/yql?q=' + q
    return url
def getProxies():
    ips = BeautifulSoup(
        requests.get('http://www.xicidaili.com/nn/1', headers=headers).text,
        'html.parser').select("td")
    _ip = ''
    for ip in ips:
        if ips.index(ip) % 10 == 1:
            _ip += str(ip).replace('<td>', '').replace('</td>', '')
        if ips.index(ip) % 10 == 2:
            _ip += ':' + str(ip).replace('<td>', '').replace('</td>', '')
        if ips.index(ip) % 10 == 5:
            proxies[str(ip).replace('<td>', '').replace('</td>', '')] = _ip
            break
def parser_requester(company_name):
    try:
        url_string = BeautifulSoup(
            requests.get(url + company_name + tag_student_progs).content,
            "html.parser").find('div', {'class': 'kCrYT'}).find_all('a')[0]['href']
        return url_string[7:url_string.index('&')]
    except:
        url_string = BeautifulSoup(
            requests.get(url + company_name + tag_careers).content,
            "html.parser").find('div', {'class': 'kCrYT'}).find_all('a')[0]['href']
        return url_string[7:url_string.index('&')]
def scrap_reviews():
    global reviews_list
    # Iterate through the global reviews_list_html, which is the output of get_reviews_for_movie()
    for review in reviews_list_html:
        result = re.search('href=\"(.*)\">', str(review))
        url = "https://www.filmweb.pl" + result.group(1)
        review_page = get_page(url)
        review_soup = BeautifulSoup(review_page.content, 'html.parser')
        # Get the review content using HTML attributes specific to Filmweb
        review_content_html = review_soup.find("div", attrs={"itemprop": "reviewBody"}).text
        review_content = BeautifulSoup(review_content_html, "lxml").text
        index = review_content.index('waitingModule')
        review_content = review_content[:index]
        rev_title = review_soup.find("h2", attrs={"itemprop": "name"}).text
        author = review_soup.find("div", attrs={"itemprop": "author"}).text
        review_rating = review_soup.find("span", {"class": "reviewRatingPercent"}).text[:-1]
        result = re.search('\"(.*) ', review_soup.find("ul", {"class": "newsInfo"}).text)
        pub_date = result.group(1)[:10]
        # Using the Review class, create a review object
        review = make_review(0, rev_title, review_content, author, review_rating, pub_date)
        reviews_list.append(review)
def get_stock(self, code="A005930", start="19000101", end="30000101"):
    url = self.base_url.format(code, code[1:], code, start, end)
    response = requests.get(url)
    download_url = "http://file.krx.co.kr/download.jspx"
    json_data = {"code": response.content}
    headers_json = {
        "Referer": "http://marketdata.krx.co.kr/contents/MKD/99/MKD9900001.jspx"
    }
    data = requests.post(download_url, data=json_data, headers=headers_json)
    parsing = BeautifulSoup(data.text)
    parsing = parsing.text.split("\n")
    parsing = [
        line.replace(",", "").strip('"').replace('""', '"') for line in parsing
    ]
    parsing = [x.split('"') for x in parsing]
    self.parse = parsing
    parsing = pd.DataFrame(parsing[1:], columns=self.columns)
    parsing.index = pd.to_datetime(parsing["date"], format="%Y/%m/%d")
    parsing.drop("date", axis=1, inplace=True)
    self.data = parsing.astype("float64")
    return self.data
def download_img(title, link):
    nomakechar = [":", "/", "\\", "?", "*", "“", "<", ">", "|"]
    for item in nomakechar:
        if title.find(item) > -1:
            title = title.replace(item, '')
    if os.path.exists(title):
        return
    os.makedirs(title)
    re = s.get(link, headers=header)
    re.encoding = 'gbk'
    div = BeautifulSoup(re.text, "html.parser").find_all(
        'div', class_='tpc_content do_not_catch')[0]
    img = BeautifulSoup(str(div), "html.parser").find_all('img')
    pbar = tqdm(total=len(img))
    for i in img:
        file_name = title + '/' + str(img.index(i)) + '.jpg'
        if not os.path.exists(file_name):
            download_link = ''
            if i.get('data-src') == None:
                download_link = i.get('ess-data')
            else:
                download_link = i.get('data-src')
            index = 1
            while True:
                try:
                    re = s.get(download_link, headers=header)
                    with open(file_name, 'wb') as f:
                        f.write(re.content)
                except:
                    index += 1
                    continue
                else:
                    break
        pbar.update(1)
    pbar.close()
def currency_get():
    fp = urllib.request.urlopen(
        'http://info.finance.naver.com/marketindex/exchangeList.nhn')
    source = fp.read()
    fp.close()
    class_list = ["tit", "sale"]
    soup = BeautifulSoup(source, 'html.parser')
    soup = soup.find_all("td", class_=class_list)
    money_data = {}
    for data in soup:
        if soup.index(data) % 2 == 0:
            data = data.get_text().replace('\n', '').replace('\t', '')
            money_key = data
        elif soup.index(data) % 2 == 1:
            money_value = data.get_text()
            money_data[money_key] = money_value
    return money_data
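A minimal usage sketch for currency_get() above, assuming urllib.request and BeautifulSoup are already imported at module level as the function expects; the printed pairs are whatever currency names and rates the Naver page returns.

# Hedged usage sketch: call currency_get() and print the scraped pairs.
rates = currency_get()
for name, rate in rates.items():
    print(name, rate)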
def getMostStableStructure(HTMLFile):
    print("Creating Most Stable Structure")
    soup = BeautifulSoup(HTMLFile, "html.parser").get_text()
    ind = soup.index("Structures sorted by energy")
    proteinNumber = int(soup[ind + 59:ind + 61])
    print(soup[ind:ind + 61])
    print(proteinNumber)
    return proteinNumber
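A minimal usage sketch for getMostStableStructure() above; 'results.html' is a hypothetical file assumed to contain the "Structures sorted by energy" section the function looks for.

# Hedged usage sketch: read a saved results page and extract the structure number.
with open('results.html', encoding='utf-8') as f:
    best_structure = getMostStableStructure(f.read())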
def get_currency(self):
    url = urllib.request.urlopen(
        "http://info.finance.naver.com/marketindex/exchangeList.nhn")
    source = url.read()
    url.close()
    class_list = ["tit", "sale"]
    soup = BeautifulSoup(source, 'lxml')
    soup = soup.find_all("td", class_=class_list)
    money_data = {}
    for data in soup:
        if soup.index(data) % 2 == 0:
            data = data.get_text().replace('\n', '').replace('\t', '')
            money_key = data
        elif soup.index(data) % 2 == 1:
            money_value = data.get_text()
            money_data[money_key] = money_value
            money_key = None
            money_value = None
    return money_data
def scrape_media_titles(self):
    req = requests.get(self.search_address).text
    results = Soup(req, 'html.parser').findAll("td", {"class": "result_text"})
    for result in results:
        self.titles.append("{}. {}".format(
            results.index(result) + 1,
            str(result.contents[1]).split(">")[1].split("</")[0] + result.contents[2]))
    return
def fetch(view, site_id):
    try:
        filename = DIR + "/" + get_file_name(site_id, view, get_timezone_time(site_id))
    except (KeyError, TypeError):
        filename = "filename unknown"
    # print("Fetching file:", filename)
    if not os.path.exists(DIR):
        os.makedirs(DIR)
    if not os.path.exists(filename):
        global last_fetch
        while time.time() - last_fetch < wait_time:
            time.sleep(0.1)
        last_fetch = time.time()
        url = URL + '?view={view}&cond=site_id={site_id}'.format(view=view, site_id=site_id)
        try:
            with requests.get(url) as r:
                s = r.content
                r.close()
            try:
                s = BeautifulSoup(s, "html.parser").find_all('script')[-2].text
                new_url = "https://solrenview.com" + s[s.index('/downloads'):s.index('.csv') + 4]
            except:
                return ""
            filename = DIR + '/' + new_url.split('/')[-1]  # .csv file
            with requests.get(new_url) as r:
                raw = r.text
                r.close()
        except ChunkedEncodingError as e:
            print(e)
            time.sleep(wait_time)
            return fetch(view, site_id)  # might not be the best solution but idk how else to fix it
        with open(filename, 'w+') as f:
            f.write(raw)
    else:
        with open(filename, 'r') as f:
            raw = f.read()
    return raw
def collect(self):
    fp = urllib.request.urlopen(self.url)
    source = fp.read()
    fp.close()
    class_list = ["tit", "sale"]
    soup = BeautifulSoup(source, 'html.parser')
    soup = soup.find_all("td", class_=class_list)
    money_data = {}
    for data in soup:
        if soup.index(data) % 2 == 0:
            data = data.get_text().replace('\n', '').replace('\t', '')
            money_key = data
        elif soup.index(data) % 2 == 1:
            money_value = data.get_text()
            if money_key == '미국 USD':
                self.price = money_value.replace(',', '')  # float(aa.replace(',',''))
            money_data[money_key] = money_value
    self.dao.upsert(self.price)
def get_url_info(self, url):
    pic_first_page = self.request(url)
    description = BeautifulSoup(pic_first_page.text, 'lxml').find('div', class_='main-meta')
    # Basic metadata
    pic_meta = self.deal_pic_info(description.text)
    # Number of pages
    pic_pagenavi = BeautifulSoup(pic_first_page.text, 'lxml').find(
        'div', class_='pagenavi').find_all('span')[-2].text
    # Image URL
    pic_img = BeautifulSoup(pic_first_page.text, 'lxml').find(
        'div', class_='main-image').find('img')['src']
    # Build the list of image URLs; only these URL patterns have been seen so far
    pre_url = ''
    end_url = ''
    if '.jpg' in pic_img:
        lc_jpg = pic_img.index('.jpg')
        pre_url = pic_img[0:lc_jpg - 2]
        end_url = pic_img[lc_jpg:]
    elif '.jpeg' in pic_img:
        lc_jpg = pic_img.index('.jpeg')
        pre_url = pic_img[0:lc_jpg - 2]
        end_url = pic_img[lc_jpg:]
    # Extend here for other image types
    else:
        logging.warning('****** unhandled image type: %s' % pic_img)
    all_img = []
    for i in range(1, int(pic_pagenavi) + 1):
        if i < 10:
            val = '0' + str(i)
        else:
            val = str(i)
        all_img.append({'order': i, 'img_url': pre_url + val + end_url})
    # Views are estimated from the first page's view count
    pic_meta['views'] = int(pic_meta['views']) * int(pic_pagenavi)
    pic_meta['img_list'] = all_img
    return pic_meta
def crawling(web_url):
    html = urlopen(web_url)
    bsObject = BeautifulSoup(html, "html.parser")
    # print(bsObject)
    # print(bsObject.find('script'))
    bsObject = str(bsObject)
    # print(type(bsObject))
    # ingredient = re.findall('[[]재료[]].*', bsObject)
    # Pull the JSON-LD recipe block out of the page source
    start = bsObject.index("application")
    end = bsObject.index("</script>", start)
    # NOTE: eval() on page content is risky; json.loads would be safer here
    dic = eval(bsObject[start + len('application/ld+json">'):end])
    title = dic['name']
    author = dic['author']['name']
    food_ingredient = dic['recipeIngredient']
    for i in range(len(food_ingredient)):
        ingredient = food_ingredient[i]
        food_ingredient[i] = re.findall('[가-힣\s]+', ingredient)[0].rstrip()
    picture = dic['image'][0]
    link = web_url
    return {'title': title, 'author': author, 'food_ingredient': food_ingredient,
            'picture': picture, 'link': link}
def __get_job(self, data, se, link):
    result = None
    if se == 'bing':
        result = re.search('.*-(.*)-.*|.*', data.getText()).group(1)
        # re.search('((?<=>)[A-Z].+?) - ', str(data)).group(1)
    if result:
        return result
    try:
        self.linked_in_driver.get(link)
        result = BeautifulSoup(self.linked_in_driver.page_source, "lxml")
        result = result.findAll(
            'h2', {'class': 'mt1 t-18 t-black t-normal break-words'})[0].text
        result = result[:result.index('at')]
        result = result[:result.index('@')]
        result = result.strip()
    except:
        pass
    return result
def __init__(self):
    # Crawl the Naver exchange-rate page
    fp = urllib.request.urlopen(
        'http://info.finance.naver.com/marketindex/exchangeList.nhn')
    source = fp.read()
    fp.close()
    # Crawled fields:
    #   tit  - currency name
    #   sale - base exchange rate
    class_list = ["tit", "sale"]
    # Parse the HTML source into Python objects with BeautifulSoup
    # (html source, parser to use) - the built-in html.parser is used here
    soup = BeautifulSoup(source, 'html.parser')
    soup = soup.find_all("td", class_=class_list)
    # Exchange-rate data per country
    money_data = {}
    for data in soup:
        if soup.index(data) % 2 == 0:
            data = data.get_text().replace('\n', '').replace('\t', '')  # strip HTML whitespace
            money_key = data  # store the currency name as the key
        elif soup.index(data) % 2 == 1:
            money_value = data.get_text()
            money_data[money_key] = money_value  # store the rate under the currency-name key
            money_key = None
            money_value = None
    # Convert the money_data dict into the lists money_data_keys and money_data_values
    money_data_keys = []
    money_data_values = []
    for key, values in money_data.items():
        money_data_keys.append([key])
        money_data_values.append([values])
    print(money_data_keys)
def get_img_url(raw_url):
    # Get text from the page
    bs4_text = BeautifulSoup(get_page(raw_url), 'lxml').get_text()
    # 'Image URL' is what we are searching for in the text
    img_url_location = bs4_text.index('Image URL')
    bs4_text = bs4_text[img_url_location:]
    bs4_text_list = bs4_text.splitlines()
    # Get the line that has the image URL and return it
    img_url_line = bs4_text_list[0]
    img_url = img_url_line.split(' ')[4]
    return img_url
def location_parser(company_name):
    url = ""
    final_list = []
    try:
        url = BeautifulSoup(
            requests.get(URL + company_name + locations).content,
            "html.parser").find('div', {'class': 'kCrYT'}).find_all('a')[0]['href']
        url = url[7:url.index('&')]
        page = BeautifulSoup(requests.get(url).content, "html.parser").find_all(text=True)
        visible_texts = filter(tag_visible, page)
        visible_texts = u" ".join(t.strip() for t in visible_texts)
        for t in all_locations:
            if t in visible_texts:
                final_list.append(t)
    except:
        try:
            url = BeautifulSoup(
                requests.get(URL + company_name + tag_locations).content,
                "html.parser").find('div', {'class': 'kCrYT'}).find_all('a')[0]['href']
            url = url[7:url.index('&')]
            page = BeautifulSoup(requests.get(url).content, "html.parser").find_all(text=True)
            visible_texts = filter(tag_visible, page)
            visible_texts = u" ".join(t.strip() for t in visible_texts)
            for t in all_locations:
                if t in visible_texts:
                    final_list.append(t)
        except:
            print(company_name, " NOT FOUND!")
    return url, final_list
def generateSnippetNgram(queryTerms, doc, ngramSize):
    lookAhead = 40
    postTail = 50
    htmlContent = BeautifulSoup(doc, "html.parser").find('pre').get_text()
    for i in range(len(queryTerms) - (ngramSize - 1)):
        if ngramSize == 3:
            queryTerm = queryTerms[i] + " " + queryTerms[i + 1] + " " + queryTerms[i + 2]
        elif ngramSize == 2:
            queryTerm = queryTerms[i] + " " + queryTerms[i + 1]
        else:
            queryTerm = queryTerms[i]
        termLocation = htmlContent.find(queryTerm)
        if termLocation != -1:
            startIndex = termLocation - lookAhead
            if startIndex <= 0:
                startIndex = 0
            else:
                # back up to the previous whitespace so the snippet starts on a word boundary
                while startIndex > 0:
                    if htmlContent[startIndex - 1:startIndex] not in [" ", "\n"]:
                        startIndex -= 1
                    else:
                        break
            endIndex = htmlContent.index(queryTerm) + len(queryTerm) + postTail
            if endIndex > len(htmlContent):
                endIndex = len(htmlContent)
            # extend to the next whitespace so the snippet ends on a word boundary
            while endIndex < len(htmlContent):
                if htmlContent[endIndex:endIndex + 1] not in [" ", "\n"]:
                    endIndex += 1
                else:
                    break
            first = htmlContent[startIndex:htmlContent.index(queryTerm)]
            second = htmlContent[htmlContent.index(queryTerm):htmlContent.index(queryTerm) + len(queryTerm)]
            third = htmlContent[htmlContent.index(queryTerm) + len(queryTerm):endIndex]
            return first, second, third
    return False, False, False
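A minimal usage sketch for generateSnippetNgram() above; the document and query terms are made up, and the HTML is assumed to keep its text inside a <pre> tag, as the function's own find('pre') call expects.

# Hedged usage sketch: build a bigram snippet from a tiny in-memory document.
doc = "<html><body><pre>the quick brown fox jumps over the lazy dog</pre></body></html>"
before, match, after = generateSnippetNgram(["quick", "brown"], doc, 2)
if match:
    print(before + match + after)  # snippet centred on "quick brown"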
def scrape(url, frontTrim, backTrim, splitMethod, seperator):
    request = urllib.request.Request(url)
    html = urllib.request.urlopen(request).read()
    soup = BeautifulSoup(html, 'html.parser')
    main_table = soup.find('div', attrs={'class': 'mw-parser-output'})
    items = main_table.find_all(seperator)
    items = str(items)
    items = BeautifulSoup(items, 'lxml').text
    items = items.split(',')
    del items[:frontTrim]
    del items[-backTrim:]
    for pair in items:
        items[items.index(pair)] = pair.strip()
    for pair in items:
        items[items.index(pair)] = pair.split(splitMethod)
    return items
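A minimal usage sketch for the generic scrape() helper above; the Wikipedia URL and the trim/split arguments are illustrative guesses, not values taken from the source.

# Hedged usage sketch: scrape a Wikipedia list page into (term, definition) pairs.
rows = scrape('https://en.wikipedia.org/wiki/List_of_medical_abbreviations:_A',
              frontTrim=2, backTrim=2, splitMethod=' ', seperator='tr')
print(rows[:5])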
def parse_and_render_recipe(input, js_path_prefix="", css_path_prefix=""):
    """
    Parse a recipe from markdown

    Args:
        input: str, The recipe as markdown
    """
    parser = commonmark.Parser()
    ast = parser.parse(input)
    renderer = commonmark.HtmlRenderer()
    html = renderer.render(ast)
    soup = BeautifulSoup(html, 'html.parser')
    # print(soup.prettify())
    steps = soup.find("h2", text="Zubereitung")
    if (nextPart := steps.findNextSibling("h2")) is not None:
        steps_upto = soup.index(nextPart)
def linkWordMatching(self, links, words):
    print("Start Links Contents")
    linkContent = {}
    for link in links:
        rows = self.queries.Get_html(link)
        soup = rows[0][0]
        soup = soup.replace(
            'class="srow bigbox container mi-df-local locked-single"',
            'class="row bigbox container mi-df-local single-local"')
        soup = BeautifulSoup(soup, "html.parser")
        [s.extract() for s in soup('script')]
        [s.extract() for s in soup('style')]
        soup = soup.text
        soup = soup.lower()
        soup = " ".join(soup.split('\n'))
        soup = " ".join(soup.split('\r'))
        for word in words:
            word_unstemed = self.queries.Get_unstemed(word, link)
            if (word_unstemed in soup):
                index = soup.index(word_unstemed)
                if (index - 500 >= 0):
                    start = index - 500
                else:
                    start = 0
                if (index + 500 <= len(soup)):
                    end = index + 500
                else:
                    end = len(soup) - 1
                while (start != 0 and soup[start] != " " and soup[start] != "\n"):
                    start = start - 1
                if (start != 0):
                    start = start + 1
                while (end != 0 and soup[end] != " " and soup[end] != "\n"):
                    end = end + 1
                if (end != 0):
                    end = end - 1
                text = ''.join(soup[start:end])
                linkContent[link] = text
                break
    print("End Links Contents")
    return linkContent
def get_Tbils(self, number_of_results):
    global browser
    ## browser.headers['origin'] = " https://www.bot.go.tz"
    ## browser.headers['Cookie'] = " ASPSESSIONIDCUBSCTSB=CGIFEPJBOJIOFIPBMIHFGDEP"
    url_transaction = 'http://www.bot.go.tz/FinancialMarkets/FinancialMarkets.asp'
    raw_html = browser.get(url_transaction)
    data_raw = Parser(raw_html.text, 'html.parser').findAll(
        'select', {'name': "TreasuryBillAuctionResults"})[0]('option')
    if number_of_results == 0:
        auctions = [auc.attrs['value'] for auc in data_raw]
    else:
        auctions = list(
            self.stop() if data_raw.index(auc) == number_of_results else auc.attrs['value']
            for auc in data_raw)
    print(auctions)
    result = {auc: self.get_Treasury_bill(auc) for auc in auctions}
    return result
def download_img(title, link):
    nomakechar = [":", "/", "\\", "?", "*", "“", "<", ">", "|", "”"]
    for item in nomakechar:
        if title.find(item) > -1:
            title = title.replace(item, '')
    if os.path.exists(title):
        return
    os.makedirs(title)
    re = s.get(link, headers=header)
    re.encoding = 'gbk'
    img = BeautifulSoup(re.text, "html.parser").find_all('img', class_='zoom')
    pbar = tqdm(total=len(img))
    for i in img:
        file_name = title + '/' + str(img.index(i)) + '.jpg'
        if not os.path.exists(file_name):
            download_link = ''
            # if i.get('src') == None:
            #     download_link = i.get('file')
            # else:
            #     download_link = i.get('src')
            download_link = i.get('file')
            index = 1
            while index <= 1:
                try:
                    re = s.get(download_link, headers=header, timeout=5)
                    with open(file_name, 'wb') as f:
                        f.write(re.content)
                except:
                    index += 1
                    continue
                else:
                    break
        pbar.update(1)
    pbar.close()
    # (fragment: tail of a geocode() helper, Python 2)
    print geocode
    # Google imposes query limits; this lets us pass a failure and have the loop
    # sleep and try again after 2 seconds
    if geocode['status'] == "OVER_QUERY_LIMIT":
        return 0
    if geocode['status'] != 'ZERO_RESULTS':
        coord_lat = geocode['results'][0]['geometry']['location']['lat']
        coord_lon = geocode['results'][0]['geometry']['location']['lng']
        coord.append(coord_lat)
        coord.append(coord_lon)
    print coord
    return coord

url = "https://www.denvergov.org/Portals/707/documents/mydenverdrive/1-22-25-2013.pdf"
xml = scraperwiki.pdftoxml(urllib2.urlopen(url).read())
parsed = BeautifulSoup(xml).text.split("\n")
filtered_list = parsed[parsed.index('Location: '):]
closures = []
i = 0
current_closure = -1
while i < len(filtered_list):
    text = filtered_list[i]
    if text == "Location: ":
        closures.append({})
        current_closure = len(closures) - 1
        i += 1
        closures[current_closure]['location'] = filtered_list[i]
        # print filtered_list[i]
        coordinate = geocode(filtered_list[i])
        if (coordinate == 0):
def getDom(pageurl, charset):
    if charset is None:
        charset = 'utf-8'
    soup = BeautifulSoup(pageurl, 'html.parser', from_encoding=charset)
    # Remove head, img, script, style and input tags
    [
        body.extract()
        for body in soup(['head', 'img', 'script', 'style', 'input'])
    ]
    # Remove HTML comments
    for element in soup(text=lambda text: isinstance(text, Comment)):
        element.extract()
    # Extract the text from the soup and store it as a list of strings
    soup = soup.text.strip().lstrip().rstrip().split()
    # Get the current date so entries that merely echo the current system time can be dropped
    currentDate = time.strftime('%Y-%m-%d', time.localtime(time.time()))
    # Normalize the date format, e.g. 2017-04-20 -> 2017-4-20
    currentDate1 = currentDate[0:5] + currentDate[6:]
    # Get the current time for the same reason
    currentTime = time.strftime('%H:%M', time.localtime(time.time()))
    # Keep only "23:5" of "23:58" to tolerate minute-level drift while the program runs
    currentTime1 = currentTime[0:4]
    # Remove noisy strings from the list:
    # - everything after a Copyright notice
    # - stray years such as "1999" and ranges like "2001-2007"
    # - "last login: 2017-04-20 23:55"-style timestamps
    # - strings equal to the current system date/time
    re0 = re.compile(r'.*Copyright.*')
    re1 = re.compile(r'.*((19\d{2}\D)|(\d{4}-\d{4}\D)).*')
    re2 = re.compile(r'(^|.*)注册.*(\d{2,4}(-|/))?\d{1,2}(-|/)\d{1,2}$')
    re3 = re.compile(r'.*(' + currentDate1 + '|' + currentDate + ').*')
    re4 = re.compile(r'.*' + currentTime1 + '\d.*')
    re5 = re.compile(r'^最后.*(\d{2,4}(-|/))?\d{1,2}(-|/)\d{1,2}.*')
    for item in soup:
        # Drop strings ending with an ASCII or full-width colon
        if item.endswith(":"):
            soup.pop(soup.index(item))
        if item.endswith("："):
            soup.pop(soup.index(item))
        # Drop '|', '>', '»' and '›' separator strings
        if '|' in soup:
            soup.pop(soup.index('|'))
        if '>' in soup:
            soup.pop(soup.index('>'))
        if '»' in soup:
            soup.pop(soup.index('»'))
        if '›' in soup:
            soup.pop(soup.index('›'))
        # Drop the Copyright notice and everything after it
        if re0.match(item):
            CopyrightIndex = soup.index(item) - 5
            while CopyrightIndex <= len(soup) - 1:
                popItem = soup.pop(CopyrightIndex)
        # Drop irregular dates
        if re1.match(item):
            # Find the index of the text matched by re1 and pop it so it is not visited again
            timeIndex = soup.index(item)
            soup.pop(timeIndex)
        if re2.match(item):
            # Same for registration-date strings matched by re2
            timeIndex = soup.index(item)
            soup.pop(timeIndex)
        # Drop the current system date/time
        if re3.match(item):
            if item in soup:
                if re4.match(soup[soup.index(item) + 1]):
                    # Drop the current system time
                    soup.pop(soup.index(item) + 1)
                    # Drop the current system date
                    soup.pop(soup.index(item))
        if re5.match(item):
            if item in soup:
                # Drop "last login"-style timestamps matched by re5
                timeIndex = soup.index(item)
                soup.pop(timeIndex)
    # Return the pre-processed list
    return soup
for schedule in schedules:
    date = schedule
    game_list = []
    for dates in date:
        url = f"https://www.hockey-reference.com/boxscores/{dates}.html#all_scoring"
        games_html = urlopen(url)
        # games_html = open(f"C:\\Users\\dbge\\OneDrive - Chevron\\Random\\{dates}.html")
        # This is for use if you have individual games saved as HTML code.
        games_soup = BeautifulSoup(games_html, 'lxml')
        table = games_soup.find('table')
        rows = table.findAll('tr')
        str_cells = str(rows)
        cleantext = BeautifulSoup(str_cells, 'lxml').get_text()
        try:
            # This is needed because some of the games have two types of goals.
            # For example, SH and EN. I needed to account for these.
            commaindex = cleantext.index("\n\t\t\t,")
            # commaindex = commaindex + 4
            # cleantext = cleantext[:(commaindex-4)] + cleantext[(commaindex+6):(commaindex+8)] + "\n\n" + cleantext[(commaindex+8):]
            # s = cleantext.split(',')
        # except ValueError:
            s = cleantext.split(',')
            try:
                # Some games had two types of goals, twice. Therefore, I needed to repeat this
                # section of code. I never saw a game with two types of goals, three times.
                commaindex = cleantext.index("\n\t\t\t,")
                commaindex = commaindex + 4
                # cleantext = cleantext[:(commaindex-4)] + cleantext[(commaindex+6):(commaindex+8)] + "\n\n" + cleantext[(commaindex+8):]
                # s = cleantext.split(',')
            # except ValueError:
                s = s
def crawl(self, response): item = AlbamonItem() css_main = "#allcontent > div.viewContent.viewRecruitType > " item['aa00'] = response.meta['num'] # 게시글 고유번호 # 등록일자 css_regist_time = css_main + "div.registInfo.clearfix.devHidePrint > div.regDate > div > span::text" item['aa01'] = response.css(css_regist_time).get().replace("등록", "").strip() # 수집일자 y = datetime.now().year m = datetime.now().month d = datetime.now().day h = datetime.now().hour n = datetime.now().minute date = f"{y}-{str(0)*(2-len(str(m)))}{m}-{str(0)*(2-len(str(d)))}{d}" time = f" {str(0)*(2-len(str(h)))}{h}:{str(0)*(2-len(str(n)))}{n}" item['aa02'] = date + time # 맨 처음 노출되는 기업명 & 게시물 제목 css_recruitInfo = "div.viewTypeFullWidth > div.companyInfo.infoBox > div.recruitInfo > " css_company = css_main + css_recruitInfo + "div.company > span::text" company = response.css(css_company).extract() item['aa03'] = company[0] # 기업이름 css_title = css_main + css_recruitInfo + "h1::text" item['aa04'] = response.css(css_title).get() # 게시물 제목 # 근무장소 세부사항 css_workarea = css_main + "div.viweTab > div.tabItem_workArea > div.workAddr > span::text" item['ab00'] = response.meta['workarea'] # 근무장소 간단히 (시군구) item['ab01'] = response.css(css_workarea).get() # 근무장소 자세히 (좌표 변환 가능) css_near = "div.viweTab > div.tabItem_workArea > div.mapInfo > div > ul > li" near = response.css(css_near).extract() subway = "" college = "" for html in near: s = BeautifulSoup(html, 'lxml') if s.select_one('span.mapItemTitle').text == '인근지하철': subnums = [ x['href'][x['href'].index("CodSubway=") + 10:] for x in s.select('div > a') ] times = [ x.text[x.text.index("도보") + 3:x.text.index("분")] for x in s.select('span.areaSummary') ] for subnum, time in zip(subnums, times): subway += subnum + "_" + time + " " elif s.select_one('span.mapItemTitle').text == '인근대학': college = s.select_one("span.areaSummary").text item['ab02'] = subway.strip() item['ab03'] = college.strip() # 기업정보 css_firmid = "#section_cropInfo > a::attr(href)" firmid = response.css(css_firmid).get() item['ac00'] = firmid[firmid.index("C_No=") + 5:] item['ac01'] = 1 * ('근로계약서 작성약속' in company) item['ac02'] = 1 * ('성희롱 예방교육수료' in company) # 지원 방법 css_regist_type = "div.viewTypeFullWidth > div.conditionInfo.verticalLine > " \ "div.column.column_340.infoBox.devHidePrint > button::text" list_regist_type = [ e.replace(" ", "").replace("\n", "").replace("\r", "") for e in response.css(css_regist_type).extract() ] registtype = " ".join(list_regist_type) css_regist_type2 = "div.viewTypeFullWidth > div.conditionInfo.verticalLine > " \ "div.column.column_340.infoBox.devHidePrint > div.recruitType.one > ul > li > button::text" if len(response.css(css_regist_type2).extract()): registtype += " " + response.css(css_regist_type2).get().replace( "\n", "") item['ad00'] = 1 * ('온라인지원' in registtype) item['ad01'] = 1 * ('간편문자지원' in registtype) item['ad02'] = 1 * ('이메일지원' in registtype) item['ad03'] = 1 * ('홈페이지' in registtype) item['ad04'] = 1 * ('전화연락' in registtype) item['ad05'] = 1 * ('바로방문' in registtype) # b. 
모집 조건---------------------------------------------------------------------------- css_recruit = "div.viewTypeFullWidth > div.conditionInfo.verticalLine > " \ "div.column.column_620.infoBox > div.recruitCondition > div > table" html_recruit = str(response.css(css_recruit).get()) table_recruit = read_html(html_recruit)[0].set_index(0) try: finaldate = table_recruit.loc['마감일', 1] if '마감' in finaldate: item['ba00'] = finaldate[:finaldate.index('(')].replace( '.', '') item['ba01'] = 1 * ('상시모집' in finaldate) except: pass try: pplnumraw = table_recruit.loc['인원', 1] if '인원미정' in pplnumraw: item['bb01'] = sum(['0' == x for x in pplnumraw]) else: item['bb00'] = int(pplnumraw[:pplnumraw.index('명')]) item['bh04'] = 1 * ('친구와 함께 근무가능' in pplnumraw) except: pass try: sex = table_recruit.loc['성별', 1] if sex == '무관': item['bc00'] = 0 elif sex == '남자': item['bc00'] = 1 elif sex == '여자': item['bc00'] = 2 except: pass try: age = table_recruit.loc['연령', 1] if '무관' in age: item['bd00'] = 1 elif '~' in age: agemin, agemax = [ int(s[s.index('년') - 4:s.index('년')]) for s in age.split('~') ] item['bd01'] = agemin item['bd02'] = agemax item['bh01'] = 1 * ('주부가능' in age) item['bh02'] = 1 * ('장년가능' in age) item['bh03'] = 1 * ('청소년가능' in age) # 초보가능(bh00)은 뒤에 등장 except: pass try: eduraw = table_recruit.loc['학력', 1] if eduraw == '무관': item['be00'] = 0 elif '초등학교' in eduraw: item['be00'] = 1 elif '중학교' in eduraw: item['be00'] = 2 elif eduraw == '고등학교 졸업 이상': item['be00'] = 3 elif eduraw == '대학(2,3년제) 졸업 이상': item['be00'] = 4 elif eduraw == '대학(4년제) 졸업 이상': item['be00'] = 5 elif '대학원' in eduraw: item['be00'] = 6 except: pass # 표준화가 되어 있지 않아서 크게 쓸모가 없어 보임 try: item['bf00'] = table_recruit.loc['모집분야', 1] except: pass try: prefer = table_recruit.loc['우대', 1] item['bg00'] = 1 * ('영어가능' in prefer) item['bg01'] = 1 * ('중국어가능' in prefer) item['bg02'] = 1 * ('일본어가능' in prefer) item['bg03'] = 1 * ('군필자' in prefer) item['bg04'] = 1 * ('업무 관련 자격증 소지' in prefer) item['bg05'] = 1 * ('유사업무 경험' in prefer) item['bg06'] = 1 * ('워드가능' in prefer) item['bg07'] = 1 * ('엑셀가능' in prefer) item['bg08'] = 1 * ('파워포인트 가능' in prefer) item['bg09'] = 1 * ('한글(HWP)가능' in prefer) item['bg10'] = 1 * ('포토샵가능' in prefer) item['bg11'] = 1 * ('컴퓨터활용가능' in prefer) item['bg12'] = 1 * ('대학재학생' in prefer) item['bg13'] = 1 * ('대학휴학생' in prefer) item['bg14'] = 1 * ('인근거주' in prefer) item['bg15'] = 1 * ('차량소지' in prefer) item['bg16'] = 1 * ('운전가능' in prefer) item['bg17'] = 1 * ('장애인' in prefer) except: pass # c.근무조건 -------------------------------------------------------- item['ca00'] = response.meta['payamount'] # 급여액 # 급여지급방식 if response.meta['paytype'] == '시급': item['ca01'] = 0 elif response.meta['paytype'] == '일급': item['ca01'] = 1 elif response.meta['paytype'] == '주급': item['ca01'] = 2 elif response.meta['paytype'] == '월급': item['ca01'] = 3 elif response.meta['paytype'] == '연봉': item['ca01'] = 4 elif response.meta['paytype'] == '건별': item['ca01'] = 5 item['cd00'] = 1 * (response.meta['worktime'] == '시간협의') if '~' in response.meta['worktime']: wtbegin, wtend = response.meta['worktime'].split('~') item['cd01'] = wtbegin item['cd02'] = wtend # 근무 조건 css_recruit = "div.viewTypeFullWidth > div.conditionInfo.verticalLine > " \ "div.column.column_620.infoBox > div.workCondition > div.viewTable > table" html_recruit = str(response.css(css_recruit).get()) table_recruit = read_html(html_recruit)[0].set_index(0) try: payraw = table_recruit.loc['급여', 1] paydetail = payraw[payraw.index("0원") + 2:] item['ca02'] = 1 * ('협의가능' in paydetail) paydetail = 
paydetail.replace('협의가능', '') item['ca03'] = 1 * ('당일지급' in paydetail) paydetail = paydetail.replace('당일지급', '') item['ca04'] = 1 * ('주급가능' in paydetail) paydetail = paydetail.replace('주급가능', '') item['ca05'] = 1 * ('식대별도지급' in paydetail) paydetail = paydetail.replace('식대별도지급', '') item['ca06'] = 1 * ('수습기간있음' in paydetail) paydetail = paydetail.replace('수습기간있음', '') item['ca07'] = 1 * ('시간외수당 별도' in paydetail) paydetail = paydetail.replace('시간외수당 별도', '') item['ca08'] = paydetail.strip() except: pass try: workperiod = table_recruit.loc['근무기간', 1] item['cb01'] = 1 * ('협의가능' in workperiod) if '하루(1일)' in workperiod: item['cb00'] = 0 elif '1주일이하' in workperiod: item['cb00'] = 1 elif '1주일~1개월' in workperiod: item['cb00'] = 2 elif '1개월~3개월' in workperiod: item['cb00'] = 3 elif '3개월~6개월' in workperiod: item['cb00'] = 4 elif '6개월~1년' in workperiod: item['cb00'] = 5 elif '1년이상' in workperiod: item['cb00'] = 6 except: pass try: workdays = table_recruit.loc['근무요일', 1] item['cc00'] = 1 * ('요일협의' in workdays) item['cc01'] = 1 * ('월~일' in workdays) item['cc02'] = 1 * ('월~토' in workdays) item['cc03'] = 1 * ('월~금' in workdays) item['cc04'] = 1 * ('토,일' in workdays) item['cc05'] = 1 * ('주6일' in workdays) item['cc06'] = 1 * ('주5일' in workdays) item['cc07'] = 1 * ('주4일' in workdays) item['cc08'] = 1 * ('주3일' in workdays) item['cc09'] = 1 * ('주2일' in workdays) item['cc10'] = 1 * ('주1일' in workdays) except: pass try: wtdetail = table_recruit.loc['근무시간', 1] item['cd03'] = 1 * ('익일' in wtdetail) wtdetail = wtdetail.replace('(익일)', '') item['cd04'] = wtdetail[wtdetail.index('휴게시간') + 5:wtdetail.index('분')] except: pass try: emptype = table_recruit.loc['고용형태', 1] item['ce00'] = 1 * ('알바' in emptype) item['ce01'] = 1 * ('정규직' in emptype) item['ce02'] = 1 * ('계약직' in emptype) item['ce03'] = 1 * ('파견직' in emptype) item['ce04'] = 1 * ('청년인턴직' in emptype) item['ce05'] = 1 * ('위촉직' in emptype) item['ce06'] = 1 * ('연수생/교육생' in emptype) except: pass try: welfare = table_recruit.loc['복리후생', 1] # 보험 (wf_isr) item['cf00'] = 1 * ('국민연금' in welfare) item['cf01'] = 1 * ('고용보험' in welfare) item['cf02'] = 1 * ('산재보험' in welfare) item['cf03'] = 1 * ('건강보험' in welfare) # 휴가, 휴무 item['cf04'] = 1 * ('정기휴가' in welfare) item['cf05'] = 1 * ('연차' in welfare) item['cf06'] = 1 * ('월차' in welfare) # 보상제도 item['cf07'] = 1 * ('인센티브제' in welfare) item['cf08'] = 1 * ('정기보너스' in welfare) item['cf09'] = 1 * ('퇴직금' in welfare) item['cf10'] = 1 * ('퇴직연금' in welfare) item['cf11'] = 1 * ('우수사원 표창/포상' in welfare) item['cf12'] = 1 * ('장기근속자 포상' in welfare) # 수당제도 item['cf13'] = 1 * ('야간근로수당' in welfare) item['cf14'] = 1 * ('휴일근로수당' in welfare) item['cf15'] = 1 * ('연월차수당' in welfare) item['cf16'] = 1 * ('장기근속수당' in welfare) item['cf17'] = 1 * ('위험수당' in welfare) item['cf18'] = 1 * ('연장근로수당' in welfare) # 생활안정 지원 item['cf19'] = 1 * ('기숙사운영' in welfare) item['cf20'] = 1 * ('명절 귀향비 지급' in welfare) # 생활편의 지원 item['cf21'] = 1 * ('조식제공' in welfare) item['cf22'] = 1 * ('중식제공' in welfare) item['cf23'] = 1 * ('석식제공' in welfare) item['cf24'] = 1 * ('근무복 지급' in welfare) item['cf25'] = 1 * ('통근버스 운행' in welfare) item['cf26'] = 1 * ('야간교통비 지급' in welfare) item['cf27'] = 1 * ('차량유류보조금' in welfare) item['cf28'] = 1 * ('주차비지원' in welfare) item['cf29'] = 1 * ('주차가능' in welfare) # 경조사 지원 item['cf30'] = 1 * ('경조휴가제' in welfare) item['cf31'] = 1 * ('각종 경조금' in welfare) except: pass # d.업직종 -------------------------------------------------------- try: jobtype = table_recruit.loc['업직종', 1] item['bh00'] = 1 * ('초보가능' in jobtype) item['da00'] = 1 * ('일반음식점' in jobtype) 
item['da01'] = 1 * ('레스토랑' in jobtype) * ('패밀리' not in jobtype) item['da02'] = 1 * ('레스토랑' in jobtype) * ('패밀리' in jobtype) item['da03'] = 1 * ('패스트푸드점' in jobtype) item['da04'] = 1 * ('치킨·피자전문점' in jobtype) item['da05'] = 1 * ('커피전문점' in jobtype) item['da06'] = 1 * ('아이스크림·디저트' in jobtype) item['da07'] = 1 * ('베이커리·도넛·떡' in jobtype) item['da08'] = 1 * ('호프·일반주점' in jobtype) item['da09'] = 1 * ('급식·푸드시스템' in jobtype) item['da10'] = 1 * ('도시락·반찬' in jobtype) # 유통 & 판매 (jt_sl) item['db00'] = 1 * ('백화점·면세점' in jobtype) item['db01'] = 1 * ('복합쇼핑몰·아울렛' in jobtype) item['db02'] = 1 * ('쇼핑몰·소셜커머스·홈쇼핑' in jobtype) item['db03'] = 1 * ('유통점·마트' in jobtype) item['db04'] = 1 * ('편의점' in jobtype) item['db05'] = 1 * ('의류·잡화매장' in jobtype) item['db06'] = 1 * ('뷰티·헬스스토어' in jobtype) item['db07'] = 1 * ('휴대폰·전자기기매장' in jobtype) item['db08'] = 1 * ('가구·침구·생활소품' in jobtype) item['db09'] = 1 * ('서점·문구·팬시' in jobtype) item['db10'] = 1 * ('약국' in jobtype) item['db11'] = 1 * ('농수산·청과·축산' in jobtype) item['db12'] = 1 * ('화훼·꽃집' in jobtype) item['db13'] = 1 * ('유통·판매·기타' in jobtype) # 문화 & 여가 & 생활 (leisure) item['dc00'] = 1 * ('놀이공원·테마파크' in jobtype) item['dc01'] = 1 * ('호텔·리조트·숙박' in jobtype) item['dc02'] = 1 * ('여행·캠프·레포츠' in jobtype) item['dc03'] = 1 * ('영화·공연' in jobtype) item['dc04'] = 1 * ('전시·컨벤션·세미나' in jobtype) item['dc05'] = 1 * ('스터디룸·독서실·고시원' in jobtype) item['dc06'] = 1 * ('PC방' in jobtype) item['dc07'] = 1 * ('노래방' in jobtype) item['dc08'] = 1 * ('볼링·당구장' in jobtype) item['dc09'] = 1 * ('스크린 골프·야구' in jobtype) item['dc10'] = 1 * ('DVD·멀티방·만화카페' in jobtype) item['dc11'] = 1 * ('오락실·게임장' in jobtype) item['dc12'] = 1 * ('이색테마카페' in jobtype) item['dc13'] = 1 * ('키즈카페' in jobtype) item['dc14'] = 1 * ('찜질방·사우나·스파' in jobtype) item['dc15'] = 1 * ('피트니스·스포츠' in jobtype) item['dc16'] = 1 * ('공인중개' in jobtype) item['dc17'] = 1 * ('골프캐디' in jobtype) item['dc18'] = 1 * ('고속도로휴게소' in jobtype) item['dc19'] = 1 * ('문화·여가·생활 기타' in jobtype) # 서비스 item['dd00'] = 1 * ('매장관리·판매' in jobtype) item['dd01'] = 1 * ('MD' in jobtype) item['dd02'] = 1 * ('캐셔·카운터' in jobtype) item['dd03'] = 1 * ('서빙' in jobtype) item['dd04'] = 1 * ('주방장·조리사' in jobtype) item['dd05'] = 1 * ('주방보조·설거지' in jobtype) item['dd06'] = 1 * ('바리스타' in jobtype) item['dd07'] = 1 * ('안내데스크' in jobtype) item['dd08'] = 1 * ('주차관리·주차도우미' in jobtype) item['dd09'] = 1 * ('보안·경비·경호' in jobtype) item['dd10'] = 1 * ('주유·세차' in jobtype) item['dd11'] = 1 * ('전단지배포' in jobtype) item['dd12'] = 1 * ('청소·미화' in jobtype) item['dd13'] = 1 * ('렌탈관리·A/S' in jobtype) item['dd14'] = 1 * ('헤어·미용·네일샵' in jobtype) item['dd15'] = 1 * ('피부관리·마사지' in jobtype) item['dd16'] = 1 * ('반려동물케어' in jobtype) item['dd17'] = 1 * ('베이비시터·가사도우미' in jobtype) item['dd18'] = 1 * ('결혼·연회·장례도우미' in jobtype) item['dd19'] = 1 * ('판촉도우미' in jobtype) item['dd20'] = 1 * ('이벤트·행사스텝' in jobtype) item['dd21'] = 1 * ('나레이터모델' in jobtype) item['dd22'] = 1 * ('피팅모델' in jobtype) item['dd23'] = 1 * ('서비스 기타' in jobtype) # 사무직 item['de00'] = 1 * ('사무보조' in jobtype) item['de01'] = 1 * ('문서작성·자료조사' in jobtype) item['de02'] = 1 * ('비서' in jobtype) item['de03'] = 1 * ('경리·회계보조' in jobtype) item['de04'] = 1 * ('인사·총무' in jobtype) item['de05'] = 1 * ('마케팅·광고·홍보' in jobtype) item['de06'] = 1 * ('번역·통역' in jobtype) item['de07'] = 1 * ('복사·출력·제본' in jobtype) item['de08'] = 1 * ('편집·교정·교열' in jobtype) item['de09'] = 1 * ('공공기관·공기업·협회' in jobtype) item['de10'] = 1 * ('학교·도서관·교육기관' in jobtype) # 고객상담 & 리서치 & 영업 item['df00'] = 1 * ('고객상담·인바운드' in jobtype) item['df01'] = 1 * ('레마케팅·아웃바운드' in jobtype) item['df02'] = 1 * 
('금융·보험영업' in jobtype) item['df03'] = 1 * ('일반영업·판매' in jobtype) item['df04'] = 1 * ('설문조사·리서치' in jobtype) item['df05'] = 1 * ('영업관리·지원' in jobtype) # 생산 & 건설 & 노무 item['dg00'] = 1 * ('제조·가공·조립' in jobtype) item['dg01'] = 1 * ('포장·품질검사' in jobtype) item['dg02'] = 1 * ('입출고·창고관리' in jobtype) item['dg03'] = 1 * ('상하차·소화물 분류' in jobtype) item['dg04'] = 1 * ('기계·전자·전기' in jobtype) item['dg05'] = 1 * ('정비·수리·설치·A/' in jobtype) item['dg06'] = 1 * ('공사·건설현장' in jobtype) item['dg07'] = 1 * ('PVC(닥트·배관설치)' in jobtype) item['dg08'] = 1 * ('조선소' in jobtype) item['dg09'] = 1 * ('재단·재봉' in jobtype) item['dg10'] = 1 * ('생산·건설·노무 기타' in jobtype) # IT & 컴퓨터 item['dh00'] = 1 * ('웹·모바일기획' in jobtype) item['dh01'] = 1 * ('사이트·콘텐츠 운영' in jobtype) item['dh02'] = 1 * ('바이럴·SNS마케팅' in jobtype) item['dh03'] = 1 * ('프로그래머' in jobtype) item['dh04'] = 1 * ('HTML코딩' in jobtype) item['dh05'] = 1 * ('QA·테스터·검증' in jobtype) item['dh06'] = 1 * ('시스템·네트워크·보안' in jobtype) item['dh07'] = 1 * ('PC·디지털기기 설치·관리' in jobtype) item['di00'] = 1 * ('입시·보습학원' in jobtype) item['di01'] = 1 * ('외국어·어학원' in jobtype) item['di02'] = 1 * ('컴퓨터·정보통신' in jobtype) item['di03'] = 1 * ('요가·필라테스 강사' in jobtype) item['di04'] = 1 * ('피트니스 트레이너' in jobtype) item['di05'] = 1 * ('레져스포츠 강사' in jobtype) item['di06'] = 1 * ('예체능 강사' in jobtype) item['di07'] = 1 * ('유아·유치원' in jobtype) item['di08'] = 1 * ('방문·학습지' in jobtype) item['di09'] = 1 * ('보조교사' in jobtype) item['di10'] = 1 * ('자격증·기술학원' in jobtype) item['di11'] = 1 * ('국비교육기관' in jobtype) item['di12'] = 1 * ('교육·강사 기타' in jobtype) # 디자인 (design -> ds) item['dj00'] = 1 * ('웹·모바일디자인' in jobtype) item['dj01'] = 1 * ('그래픽·편집디자인' in jobtype) item['dj02'] = 1 * ('제품·산업디자인' in jobtype) item['dj03'] = 1 * ('CAD·CAM·인테리어디자인' in jobtype) item['dj04'] = 1 * ('캐릭터·애니메이션디자인' in jobtype) item['dj05'] = 1 * ('패션·잡화디자인' in jobtype) item['dj06'] = 1 * ('디자인 기타' in jobtype) # 미디어 (media -> md) item['dk00'] = 1 * ('보조출연·방청' in jobtype) item['dk01'] = 1 * ('방송스텝·촬영보조' in jobtype) item['dk02'] = 1 * ('동영상촬영·편집' in jobtype) item['dk03'] = 1 * ('사진촬영·편집' in jobtype) item['dk04'] = 1 * ('조명·음향' in jobtype) item['dk05'] = 1 * ('방송사·프로덕션' in jobtype) item['dk06'] = 1 * ('신문·잡지·출판' in jobtype) item['dk07'] = 1 * ('미디어 기타' in jobtype) # 운전 & 배달 (delivery -> dv) item['dl00'] = 1 * ('운송·이사' in jobtype) item['dl01'] = 1 * ('대리운전·일반운전' in jobtype) item['dl02'] = 1 * ('택시·버스운전' in jobtype) item['dl03'] = 1 * ('수행기사' in jobtype) item['dl04'] = 1 * ('화물·중장비·특수차' in jobtype) item['dl05'] = 1 * ('택배·퀵서비스' in jobtype) item['dl06'] = 1 * ('배달' in jobtype) # 병원 & 간호 & 연구 (medical research -> mr) item['dm00'] = 1 * ('간호조무사·간호사' in jobtype) item['dm01'] = 1 * ('간병·요양보호사' in jobtype) item['dm02'] = 1 * ('원무·코디네이터' in jobtype) item['dm03'] = 1 * ('수의테크니션·동물보건사' in jobtype) item['dm04'] = 1 * ('실험·연구보조' in jobtype) item['dm05'] = 1 * ('생동성·임상실험' in jobtype) except: pass yield item
def extract_intent(text):
    if not text or text == 'None':
        return None
    # remove links in swagger []() notation
    if "](" in text or "] (" in text:
        text = replace_hyperlinks(text)
    text = BeautifulSoup(text, "lxml").text
    if ':' in text:
        text = text[:text.index(':')]
    if '(' in text:
        text = re.sub(r"\(.*\)", '', text)
    expr = None
    for sent in to_sentences(text):
        if len(sent) > 120:
            continue
        sent = sent.lower()
        tagged_sent = nlp.pos_tag(sent)
        count = 0
        for w, t in tagged_sent:
            if t in {'VB', 'VBZ'}:
                count += 1
        if count > 1 and len(sent) > 80:
            # print("More than one verb: ", sent)
            continue
        if tagged_sent[0][1] == 'VB' or tagged_sent[0][0] in common_verbs or \
                (tagged_sent[0][1] == 'RB' and tagged_sent[1][1] == 'VB'):
            expr = sent
        elif tagged_sent[0][1] == 'VBZ' or tagged_sent[0][0] in common_sverbs or \
                (tagged_sent[0][1] == 'RB' and tagged_sent[1][1] == 'VB'):
            if tagged_sent[0][1] == 'RB' and tagged_sent[1][1] == 'VB':
                verb = tagged_sent[1][0]
            else:
                verb = tagged_sent[0][0]
            old_verb = verb
            if verb not in {"was", "is", "has"}:
                verb = lemmatizer.lemmatize(verb, pos=VERB)
            expr = sent.replace(old_verb, verb)
        if expr and 'http' in expr:
            continue
        if expr and 'see' in expr and (':' in expr or 'please' in expr or 'href' in expr):
            continue
        if expr and ('<' in expr and '>' in expr or '<p>' in expr):
            continue
        if expr:
            expr = finalize_utterance(expr)
            if " by " in expr:
                expr = expr[:expr.index(" by ")]
            return expr
    return None