def refine(self):
    """Refine raw crawled rows into event records and insert them.

    Selects the raw rows for today's crawl version, keeps only titles that
    match the animal/plant keyword patterns, extracts event fields from the
    page source (row[5]) with regexes, and inserts one record per matching
    row via ``cc.evnt_insert``.

    Row layout (inferred from usage — confirm against original_select):
    row[2]=title, row[3]=event type, row[5]=page source, row[6]=content,
    row[7]=source URL, row[8]=main image id.
    """
    cc = cm.CrawlClass()
    crawl_version = self.now.strftime('%Y%m%d')
    rows = cc.original_select(self.convention_name, crawl_version)
    convention_info = cc.convention_select(self.convention_name)
    for row in rows:
        data = {}
        # Classify the title (row[2]) as animal- or plant-related.
        animal = re.search(cc.pattern_title_animal(), row[2])
        plant = re.search(cc.pattern_title_plant(), row[2])
        if animal is not None:
            pet_cat_cd = 'animal'
            match = animal
        elif plant is not None:
            pet_cat_cd = 'plant'
            match = plant
        else:
            pet_cat_cd = ''
            match = False
        # Field-extraction patterns for the page source in row[5].
        pattern_host = r'\"주최\".*\<\/dt\>\n\<dd\>(.*)\<\/dd\>'
        pattern_supvsn = r'\"주관사\".*\<\/dt\>\n\<dd\>(.*)\n*\t*\<\/dd\>'
        pattern_addt_dtl = r'장소.*\s*\<dd\>\s*(.*\s*.*)\s*'
        pattern_date = r'\"기간\"\s.*\<\/dt\>\n\<dd\>\s*\t*(.*)\s*\t*\<\/dd\>'
        pattern_time = r'\"관람시간\"\s?.*\<\/dt\>\n\<dd\>\s*\t*(.*)\s*\t*\<\/dd\>'
        pattern_cost = r'입장료\".*\s*\<dd\>\s*(.*)\s*\<\/dd\>'
        pattern_phone = r'\"전화번호\".*\s*\<dd\>\<span.*\s*([0-9]*-[0-9]*-[0-9]*).*\s*\<\/span\>'
        pattern_home = r'\"홈페이지\".*\s*\<dd\>.*\s*\<a href\=\".*\"\>(.*)\<\/a\>'
        # pattern_ctn = r''
        reg_date = self.now.strftime('%Y-%m-%d %H:%M:%S')
        source_url = row[7]
        if match:
            host = re.findall(pattern_host, row[5])
            str_host = host[0] if host else ''
            supvsn = re.findall(pattern_supvsn, row[5])
            str_supvsn = supvsn[0].strip() if supvsn else ''
            addt_dtl = re.findall(pattern_addt_dtl, row[5])
            str_addt_dtl = addt_dtl[0].strip().replace('\n', '') if addt_dtl else ''
            # Period looks like "YYYY.MM.DD ~ YYYY.MM.DD"; split on '~'.
            date = re.findall(pattern_date, row[5])
            tempdate = str(date).replace("['", "").replace("']", "").strip()
            date_index = tempdate.find('~')
            date_start = datetime.datetime.strptime(
                tempdate[0:date_index].replace('.', '-').strip(), '%Y-%m-%d')
            date_end = datetime.datetime.strptime(
                tempdate[date_index + 1:len(tempdate)].replace('.', '-').strip(),
                '%Y-%m-%d')
            time = re.findall(pattern_time, row[5])
            str_time = time[0] if time else ''
            cost = re.findall(pattern_cost, row[5])
            str_cost = cost[0] if cost else ''
            phone = re.findall(pattern_phone, row[5])
            str_phone = phone[0].strip() if phone else ''
            home = re.findall(pattern_home, row[5])
            # BUGFIX: check the list is non-empty before indexing home[0];
            # previously an empty findall result raised IndexError.
            if len(home) > 0 and len(home[0]) > 0:
                str_home_url = home[0]
                if re.search(r'http://', str_home_url):
                    # Context manager closes the HTTP response (was leaked).
                    with urllib.request.urlopen(str_home_url) as resp:
                        res = resp.read()
                else:
                    # Percent-encode the path (e.g. Korean characters) and
                    # default the scheme to http when it is missing.
                    encoding_url = parse.urlparse(str_home_url)
                    # BUGFIX: parenthesize the conditional — previously the
                    # '://' + netloc + path tail bound only to the else branch,
                    # so a URL that already had a scheme collapsed to 'http'.
                    scheme = encoding_url.scheme if len(encoding_url.scheme) != 0 else 'http'
                    temp_url = scheme + '://' + encoding_url.netloc + quote(encoding_url.path)
                    with urllib.request.urlopen(temp_url) as resp:
                        res = resp.read()
                soup = Bs(res, 'html.parser')
                home_title = soup.find_all('title', limit=1)
                # Guard: a page without <title> yields an empty list.
                str_home_title = home_title[0].text if home_title else ''
            else:
                str_home_url = ''
                str_home_title = ''
            data['TP_CD'] = 'fest'
            data['PET_CAT_CD'] = pet_cat_cd
            data['TTL'] = match.string
            data['HOST_NM'] = str_host
            data['SUPVSN'] = str_supvsn
            data['ADDR'] = convention_info[0][4]
            data['ADDR_DTL'] = str_addt_dtl
            data['LOC'] = self.convention_name
            data['ZIPNO'] = convention_info[0][6]
            data['LAT'] = convention_info[0][7]
            data['LNG'] = convention_info[0][8]
            data['FR_DATE'] = date_start
            data['TO_DATE'] = date_end
            data['EVNT_TIME'] = str_time
            data['ONLN_YN'] = 'N'
            data['OFFLN_YN'] = 'Y'
            data['ENTR_COST'] = str_cost
            data['HPG_NM'] = str_home_title
            data['HPG_URL'] = str_home_url
            data['QNA'] = str_phone
            data['CTN'] = row[6] if row[6] is not None else ''
            data['M_IMG_ID'] = row[8] if row[8] is not None else ''
            data['LIST_IMG_ID'] = ''
            data['COMP_NM'] = str_host
            data['DAY_CD'] = cc.get_day_cd(date_start, date_end)
            data['RGN_CD'] = cc.get_rgn_cd(convention_info[0][4][0:2])
            data['DEL_YN'] = 'N'
            data['REG_ID'] = 'crawler'
            data['REG_DTTM'] = reg_date
            data['UPD_ID'] = 'crawler'
            data['UPD_DTTM'] = reg_date
            data['CRAWL_VERSION'] = crawl_version
            data['SOURCE_URL'] = source_url
            data['CONVENTION_NAME'] = self.convention_name
            data['EVENT_TYPE'] = row[3] if row[3] is not None else ''
            cc.evnt_insert(data)
    # Commit once after all rows are inserted, then release the connection.
    cc.commit()
    cc.close()
def refine(self):
    """Refine raw crawled rows into event records and insert them.

    Same flow as the other ``refine`` variants: select raw rows for today's
    crawl version, filter titles by the animal/plant keyword patterns,
    extract event fields from the page source (row[5]), and insert each
    matching row via ``cc.evnt_insert``.

    Row layout (inferred from usage — confirm against original_select):
    row[2]=title, row[3]=event type, row[5]=page source, row[6]=content,
    row[7]=source URL, row[8]=main image id.
    """
    cc = cm.CrawlClass()
    crawl_version = self.now.strftime('%Y%m%d')
    rows = cc.original_select(self.convention_name, crawl_version)
    convention_info = cc.convention_select(self.convention_name)
    for row in rows:
        data = {}
        # Classify the title (row[2]) as animal- or plant-related.
        animal = re.search(cc.pattern_title_animal(), row[2])
        plant = re.search(cc.pattern_title_plant(), row[2])
        if animal is not None:
            pet_cat_cd = 'animal'
            match = animal
        elif plant is not None:
            pet_cat_cd = 'plant'
            match = plant
        else:
            pet_cat_cd = ''
            match = False
        # Field-extraction patterns for the <dt>/<dd> markup in row[5].
        pattern_host = r'\<dt\>주최\/주관\<\/dt\>\n\<dd\>(.*?)\<\/dd\>'
        pattern_addt_dtl = r'\<dt\>장소\<\/dt\>\n\<dd\>(.*?)\<\/dd\>'
        pattern_date = r'\<dt\>기간<\/dt\>\n\<dd\>(.*?)\<\/dd\>'
        pattern_time = r'\<dt\>시간\<\/dt\>\n\<dd\>(.*?)\<\/dd\>'
        pattern_cost = r'\<dt\>관람료\<\/dt\>\n\<dd\>(.*?)\<\/dd\>'
        pattern_phone = r'\<dt\>전화\<\/dt\>\n\<dd\>\n\t{4}(.*?)\n\t{4}\<\/dd\>'
        pattern_home = r'\<dt\>홈페이지\<\/dt\>\n\<dd\>\n\t*\<a.*\"\>(.*?)\<\/a\>\n\t*\<\/dd\>'
        # pattern_ctn = r''
        reg_date = self.now.strftime('%Y-%m-%d %H:%M:%S')
        source_url = row[7]
        if match:
            host = re.findall(pattern_host, row[5])
            str_host = host[0] if host else ''
            addt_dtl = re.findall(pattern_addt_dtl, row[5])
            str_addt_dtl = addt_dtl[0] if addt_dtl else ''
            # Period looks like "YYYY.MM.DD ~ YYYY.MM.DD"; split on '~'.
            date = re.findall(pattern_date, row[5])
            tempdate = str(date).replace("['", "").replace("']", "").strip()
            date_index = tempdate.find('~')
            date_start = datetime.datetime.strptime(
                tempdate[0:date_index].replace('.', '-').strip(), '%Y-%m-%d')
            date_end = datetime.datetime.strptime(
                tempdate[date_index + 1:len(tempdate)].replace('.', '-').strip(),
                '%Y-%m-%d')
            time = re.findall(pattern_time, row[5])
            str_time = time[0] if time else ''
            cost = re.findall(pattern_cost, row[5])
            str_cost = cost[0] if cost else ''
            phone = re.findall(pattern_phone, row[5])
            str_phone = phone[0] if phone else ''
            home = re.findall(pattern_home, row[5])
            if len(home) > 0:
                str_home_url = home[0]
                # Fetch the homepage to read its <title>; context manager
                # closes the HTTP response (was leaked before).
                with urllib.request.urlopen(str_home_url) as resp:
                    res = resp.read()
                soup = Bs(res, 'html.parser')
                home_title = soup.find_all('title', limit=1)
                # BUGFIX: guard against a page without <title> — previously
                # home_title[0] raised IndexError on an empty result.
                str_home_title = home_title[0].text if home_title else ''
            else:
                str_home_url = ''
                str_home_title = ''
            data['TP_CD'] = 'fest'
            data['PET_CAT_CD'] = pet_cat_cd
            data['TTL'] = match.string
            data['HOST_NM'] = str_host
            # This site lists host and supervisor together, so both fields
            # get the same extracted value.
            data['SUPVSN'] = str_host
            data['ADDR'] = convention_info[0][4]
            data['ADDR_DTL'] = str_addt_dtl
            data['LOC'] = self.convention_name
            data['ZIPNO'] = convention_info[0][6]
            data['LAT'] = convention_info[0][7]
            data['LNG'] = convention_info[0][8]
            data['FR_DATE'] = date_start
            data['TO_DATE'] = date_end
            data['EVNT_TIME'] = str_time
            data['ONLN_YN'] = 'N'
            data['OFFLN_YN'] = 'Y'
            data['ENTR_COST'] = str_cost
            data['HPG_NM'] = str_home_title
            data['HPG_URL'] = str_home_url
            data['QNA'] = str_phone
            data['CTN'] = row[6] if row[6] is not None else ''
            data['M_IMG_ID'] = row[8] if row[8] is not None else ''
            data['LIST_IMG_ID'] = ''
            data['COMP_NM'] = str_host
            data['DAY_CD'] = cc.get_day_cd(date_start, date_end)
            data['RGN_CD'] = cc.get_rgn_cd(convention_info[0][4][0:2])
            data['DEL_YN'] = 'N'
            data['REG_ID'] = 'crawler'
            data['REG_DTTM'] = reg_date
            data['UPD_ID'] = 'crawler'
            data['UPD_DTTM'] = reg_date
            data['CRAWL_VERSION'] = crawl_version
            data['SOURCE_URL'] = source_url
            data['CONVENTION_NAME'] = self.convention_name
            data['EVENT_TYPE'] = row[3] if row[3] is not None else ''
            cc.evnt_insert(data)
    # Commit once after all rows are inserted, then release the connection.
    cc.commit()
    cc.close()
def insert(self):
    """Extract event records from raw crawled rows and insert them.

    Unlike ``refine``, the address comes from the page itself (no
    convention_info lookup) and the region code defaults to '9999'.
    Rows whose title starts with '[' are skipped (insert_flag = False).

    Row layout (inferred from usage — confirm against original_select):
    row[2]=title, row[3]=event type, row[5]=page source, row[6]=content,
    row[7]=source URL, row[8]=main image id.
    """
    cc = cm.CrawlClass()
    crawl_version = self.now.strftime('%Y%m%d')
    rows = cc.original_select(self.convention_name, crawl_version)
    for row in rows:
        data = {}
        # Classify the title (row[2]) as animal- or plant-related.
        animal = re.search(cc.pattern_title_animal(), row[2])
        plant = re.search(cc.pattern_title_plant(), row[2])
        if animal is not None:
            pet_cat_cd = 'animal'
            match = animal
        elif plant is not None:
            pet_cat_cd = 'plant'
            match = plant
        else:
            pet_cat_cd = ''
            match = False
        # Field-extraction patterns for the <th>/<td> table markup in row[5].
        pattern_addr = r'위치\<\/th\>\s*\<td\>(.*)\<\/td\>'
        pattern_addr_dtl = r'행사장소\<\/th\>\s*\<td\>(.*)\<\/td\>'
        pattern_cost = r'입장료\<\/th\>\s*\<td\>(.*)\s*\<\/td\>'
        pattern_date = r'행사기간\<\/th\>\s*\<td\>(.*)\<\/td\>'
        pattern_home = r'홈페이지\<\/th\>\s*\<td\>.*\_blank\"\>(.*)\<\/a\>\<\/td\>'
        pattern_host = r'주최\<\/th\>\s*\<td\>(.*)\<\/td\>'
        pattern_phone = r'연락처\<\/th\>\s*\<td\>.*\"\>(.*)\<\/a\>\<\/td\>'
        pattern_supvsn = r'주관\<\/th\>\s*\<td\>(.*)\<\/td\>'
        pattern_time = r'\<dt\>시간\<\/dt\>\n\<dd\>(.*?)\<\/dd\>'
        # pattern_ctn = r''
        reg_date = self.now.strftime('%Y-%m-%d %H:%M:%S')
        source_url = row[7]
        insert_flag = True
        if match:
            host = re.findall(pattern_host, row[5])
            str_host = host[0] if host else ''
            supvsn = re.findall(pattern_supvsn, row[5])
            str_supvsn = supvsn[0] if supvsn else ''
            addr = re.findall(pattern_addr, row[5])
            str_addr = addr[0] if addr else ''
            addt_dtl = re.findall(pattern_addr_dtl, row[5])
            str_addt_dtl = addt_dtl[0] if addt_dtl else ''
            # Period looks like "YYYY.MM.DD ~ YYYY.MM.DD"; split on '~'.
            date = re.findall(pattern_date, row[5])
            tempdate = str(date).replace("['", "").replace("']", "").strip()
            date_index = tempdate.find('~')
            date_start = datetime.datetime.strptime(
                tempdate[0:date_index].replace('.', '-').strip(), '%Y-%m-%d')
            date_end = datetime.datetime.strptime(
                tempdate[date_index + 1:len(tempdate)].replace('.', '-').strip(),
                '%Y-%m-%d')
            temp_time = re.findall(pattern_time, row[5])
            str_time = temp_time[0] if temp_time else ''
            cost = re.findall(pattern_cost, row[5])
            str_cost = cost[0] if cost else ''
            phone = re.findall(pattern_phone, row[5])
            str_phone = phone[0] if phone else ''
            home = re.findall(pattern_home, row[5])
            if len(home) > 0 and home[0] != '':
                # Homepage URLs may embed Hangul path segments that must be
                # percent-encoded before fetching.
                pattern_hangul = r'[가-힣]'
                hangul = re.findall(pattern_hangul, home[0])
                if len(hangul) > 0:
                    # Split the URL around the Hangul run: prefix before the
                    # first Hangul char, suffix after the last one.
                    pre_url = home[0][0:home[0].index(hangul[0])]
                    sub_url = home[0][home[0].index(hangul[len(hangul) - 1]) + 1:len(home[0])]
                    encode_url1 = self.encode_url(hangul)
                    # NOTE(review): str(bytes)[2:-1] strips the b'…' repr —
                    # presumably encode_url returns bytes; confirm, a .decode()
                    # would be cleaner but could change edge-case output.
                    str_home_url = str(encode_url1)[2:len(str(encode_url1)) - 1] + sub_url
                    with urllib.request.urlopen(pre_url + str_home_url) as resp:
                        res = resp.read()
                else:
                    str_home_url = home[0]
                    with urllib.request.urlopen(str_home_url) as resp:
                        res = resp.read()
                soup = Bs(res, 'html.parser')
                home_title = soup.select('head > title')
                str_home_title = home_title[0].text if len(home_title) > 0 else ''
            else:
                str_home_url = ''
                str_home_title = ''
            if match.string[0:1] == '[':
                # Bracketed titles (e.g. notices) are not real events.
                insert_flag = False
            else:
                # Region cannot be derived from this site; use sentinel code.
                str_rgn_cd = '9999'
            if insert_flag:
                print(match.string)
                data['TP_CD'] = 'fest'
                data['PET_CAT_CD'] = pet_cat_cd
                data['TTL'] = match.string
                data['HOST_NM'] = str_host
                data['SUPVSN'] = str_supvsn
                data['ADDR'] = str_addr
                data['ADDR_DTL'] = str_addt_dtl
                data['LOC'] = str_addt_dtl
                data['ZIPNO'] = ''
                data['LAT'] = 0
                data['LNG'] = 0
                data['FR_DATE'] = date_start
                data['TO_DATE'] = date_end
                data['EVNT_TIME'] = str_time
                data['ONLN_YN'] = 'N'
                data['OFFLN_YN'] = 'Y'
                data['ENTR_COST'] = str_cost
                data['HPG_NM'] = str_home_title
                data['HPG_URL'] = str_home_url
                data['QNA'] = str_phone
                data['CTN'] = row[6] if row[6] is not None else ''
                data['M_IMG_ID'] = row[8] if row[8] is not None else ''
                data['LIST_IMG_ID'] = ''
                data['COMP_NM'] = str_host
                data['DAY_CD'] = cc.get_day_cd(date_start, date_end)
                data['RGN_CD'] = str_rgn_cd
                data['DEL_YN'] = 'N'
                data['REG_ID'] = 'crawler'
                data['REG_DTTM'] = reg_date
                data['UPD_ID'] = 'crawler'
                data['UPD_DTTM'] = reg_date
                data['CRAWL_VERSION'] = crawl_version
                data['SOURCE_URL'] = source_url
                data['CONVENTION_NAME'] = self.convention_name
                data['EVENT_TYPE'] = row[3] if row[3] is not None else ''
                cc.evnt_insert(data)
    # Commit once after all rows are inserted, then release the connection.
    cc.commit()
    cc.close()
def test_insert1(self):
    """Parse crawled pages and insert content rows (Songdo Convensia).

    Filters titles (row[2]) by pet-related keywords, extracts fields from
    the page source (row[5]), prints them for inspection, and inserts each
    matching row via ``cc.content_insert``.
    """
    cc = cm.CrawlClass()
    rows = cc.content_select(self.convention_name)
    cnt = 0
    for row in rows:
        data = {}
        cnt += 1
        title_pattern = r"(캣|도그|펫|동물|애견|애완)"
        # row[2] => title; filter it by pet-related keywords.
        match2 = re.search(title_pattern, row[2])
        pattern_host = r'주최\<\/th\>\n\<td\>[\n\t ]*(.*)[\n\t ]*\<\/td\>'
        pattern_manage = r'주관\<\/th\>\n\<td\>[\n\t ]*(.*)[\n\t ]*\<\/td\>'
        # BUGFIX: pattern_date was commented out but is used below, which
        # raised NameError at runtime. Restored the more specific of the two
        # commented candidates — verify it still matches the live markup.
        pattern_date = r'일정\<\/th\>\n\<td colspan\=\"3\"\>\<strong\>(.*)\n'
        pattern_time = r'([0-9]{2}:[0-9]{2}.?[0-9]{2}:[0-9]{2})'
        pattern_place = r'위치\<\/th\>\n.*\<strong\>(.*)\<\/strong\>'
        pattern_money = r'입장료\<\/th\>\n\<td\>(.*)\<\/td\>'
        pattern_phone = r'문의처\<\/th\>\n\<td\>(.*)\<\/td\>'
        pattern_url = row[6]  # URL of this page
        pattern_home = r'행사홈페이지\<\/th\>\n.*\<a.*\>(.*)\<\/a\>'
        now = datetime.datetime.now()
        reg_date = now.strftime('%Y-%m-%d %H:%M:%S')
        if match2:
            place = re.findall(pattern_place, row[5])
            str_place = place[0] if place else ''
            # Date text like "YYYY년 MM월 DD일 ~ YYYY년 MM월 DD일".
            date = re.findall(pattern_date, row[5])
            tempdate = str(date).replace("['", "").replace("']", "").strip()
            date_index = tempdate.find('~')
            date_start = tempdate[0:date_index].replace('년 ', '-').replace(
                '월 ', '-').replace('일', '')
            date_end = tempdate[date_index + 1:len(tempdate)].replace(
                '년 ', '-').replace('월 ', '-').replace('일', '')
            # Time text like "HH:MM~HH:MM"; split into start/end.
            time = re.findall(pattern_time, row[5])
            if len(time) == 0:
                time_start = ''
                time_end = ''
            else:
                temptime = str(time[0]).replace("['", "").replace("']", "").strip()
                time_index = temptime.find('~')
                time_start = temptime[0:time_index]
                time_end = temptime[time_index + 1:len(temptime)]
            phone = re.findall(pattern_phone, row[5])
            str_phone = phone[0].strip() if phone else ''
            home = re.findall(pattern_home, row[5])
            str_home = home[0].strip() if home else ''
            manage = re.findall(pattern_manage, row[5])
            str_manage = manage[0].strip() if manage else ''
            host = re.findall(pattern_host, row[5])
            str_host = host[0] if host else ''
            money = re.findall(pattern_money, row[5])
            str_money = money[0] if money else ''
            # Debug output for manual verification of the extraction.
            print("주최 {}".format(str_host))
            print("주관 {}".format(str_manage))
            print(date)
            print(datetime.datetime.strptime(date_start.strip(), '%Y-%m-%d'))
            d_start = datetime.datetime.strptime(date_start.strip(), '%Y-%m-%d')
            print(datetime.datetime.strptime(date_end.strip(), '%Y-%m-%d'))
            d_end = datetime.datetime.strptime(date_end.strip(), '%Y-%m-%d')
            print("장소 {}".format(str_place))
            print("돈 {}".format(str_money))
            print("폰번호 {}".format(str_phone))
            print("홈페이지 {}".format(str_home))
            data['convention_name'] = self.convention_name
            data['event_name'] = match2.string
            data['event_type'] = row[3]
            data['place'] = str_place
            data['date_start'] = d_start
            # NOTE(review): key is 'data_end' (sic) — kept as-is because
            # content_insert presumably maps on this exact key; confirm.
            data['data_end'] = d_end
            data['time_start'] = time_start
            data['time_end'] = time_end
            data['phone'] = str_phone
            data['home_page'] = str_home
            data['manage'] = str_manage
            data['host'] = str_host
            data['money'] = str_money
            data['source_url'] = pattern_url
            data['reg_date'] = reg_date
            cc.content_insert(data)
    # Commit once after all rows are inserted, then release the connection.
    cc.commit()
    cc.close()
def test_insert1(self):
    """Parse crawled pages and insert content rows (ICC Jeju).

    Filters titles (row[2]) by pet-related keywords, extracts fields from
    the page source (row[5]), prints them for inspection, and inserts each
    matching row via ``cc.content_insert``. This site exposes no time,
    manage, or cost fields, so those are stored as empty strings.
    """
    cc = cm.CrawlClass()
    rows = cc.content_select(self.convention_name)
    cnt = 0
    for row in rows:
        cnt += 1
        title_pattern = r"(캣|도그|펫|동물|애견|애완)"
        # row[2] => title; filter it by pet-related keywords.
        match2 = re.search(title_pattern, row[2])
        pattern_host = r'\<li\>주\s*최\s?\:\s?(.*)\<\/li\>'
        pattern_date = r'\<li\>기\s*간\s?\:\s?(.*)\<\/li\>'
        pattern_place = r'\<li\>장\s*소\s?\:\s?(.*)\<\/li\>'
        pattern_phone = r'[0-9]{2,3}\-[0-9]{4}\-[0-9]{4}'
        pattern_home = r'웹사이트.*\>\s*(.*)\s*\<\/a\>'
        now = datetime.datetime.now()
        reg_date = now.strftime('%Y-%m-%d %H:%M:%S')
        if match2:
            data = {}
            place = re.findall(pattern_place, row[5])
            str_place = place[0] if place else ''
            # Period looks like "YYYY.MM.DD ~ YYYY.MM.DD"; split on '~'.
            date = re.findall(pattern_date, row[5])
            tempdate = str(date).replace("['", "").replace("']", "").strip()
            date_index = tempdate.find('~')
            date_start = tempdate[0:date_index].replace('.', '-')
            date_end = tempdate[date_index + 1:len(tempdate)].replace('.', '-')
            phone = re.findall(pattern_phone, row[5])
            str_phone = phone[0].strip() if phone else ''
            home = re.findall(pattern_home, row[5])
            str_home = home[0].strip() if home else ''
            host = re.findall(pattern_host, row[5])
            str_host = host[0] if host else ''
            # Debug output for manual verification of the extraction.
            # BUGFIX: print the guarded str_host instead of host[0] — the
            # latter raised IndexError whenever no host was matched.
            print("주최 {}".format(str_host))
            print(date)
            print(datetime.datetime.strptime(date_start.strip(), '%Y-%m-%d'))
            d_start = datetime.datetime.strptime(date_start.strip(), '%Y-%m-%d')
            print(datetime.datetime.strptime(date_end.strip(), '%Y-%m-%d'))
            d_end = datetime.datetime.strptime(date_end.strip(), '%Y-%m-%d')
            print("장소 {}".format(str_place))
            print("폰번호 {}".format(str_phone))
            print("홈페이지 {}".format(str_home))
            data['convention_name'] = self.convention_name
            data['event_name'] = match2.string.strip()
            data['event_type'] = row[3]
            data['place'] = str_place
            data['date_start'] = d_start
            # NOTE(review): key is 'data_end' (sic) — kept as-is because
            # content_insert presumably maps on this exact key; confirm.
            data['data_end'] = d_end
            data['time_start'] = ''
            data['time_end'] = ''
            data['phone'] = str_phone
            data['home_page'] = str_home
            data['manage'] = ''
            data['host'] = str_host
            data['money'] = ''
            data['source_url'] = 'http://www.iccjeju.co.kr/Event/Schedule'
            data['reg_date'] = reg_date
            cc.content_insert(data)
    # Commit once after all rows are inserted, then release the connection.
    cc.commit()
    cc.close()