def parse_paper_data(target_content, paper_data_id, search_type):
    """
    Parse a full-record page and return the collected information.
    :param target_content: page HTML
    :param paper_data_id: random ID (10 digits)
    :param search_type: 'single' or duplicate ('dupl') search
    :return: paper data dict and the times-cited link
    """
    soup = BeautifulSoup(target_content, 'html.parser')

    if search_type == "single":
        # Number of search results
        pagination_btn = soup.select('a.paginationNext')

        # Stop immediately when there is no result count
        if not pagination_btn or len(pagination_btn) == 0:
            raise sju_exceptions.NoPaperDataError()

        pagination_btn_alt = soup.select('a.paginationNext')[0].attrs['alt']

        # Stop immediately when the result count is not exactly one
        # and pagination_btn_alt.find('비활성') == -1
        if pagination_btn_alt.find('Inactive') == -1:
            raise sju_exceptions.MultiplePaperDataError()

    # Paper title
    title = soup.select('div.title')[0].text.replace('\n', '')

    # ISSN
    ISSN = soup.select('p.sameLine')
    if ISSN:
        ISSN = ISSN[0].value.contents[0]
    else:
        ISSN = ''

    # Grades
    grades = []
    caped_grades = []
    box_label = soup.select('span.box-label')
    for label in box_label:
        if label.text.find('- ') != -1:
            temp = label.text.replace('- ', '')
            grades += [temp]
            caped_grades += [re.sub(r'[ a-z]+', r'', temp)]

    # Impact factor
    Impact_Factor_table = soup.select('table.Impact_Factor_table')
    impact_factor = {}
    if len(Impact_Factor_table) > 0:
        trs = Impact_Factor_table[0].find_all('tr')
        tds = trs[0].find_all('td')
        ths = trs[1].find_all('th')
        for idx, th in enumerate(ths):
            impact_factor[th.text.strip()] = tds[idx].text.strip()
    else:
        impact_factor = {}

    # JCR rank
    JCR_Category_table = soup.select('table.JCR_Category_table')
    jcr_headers = []
    jcr = []
    ranks = []
    good_rank = ''
    trs = []
    if len(JCR_Category_table) > 0:
        JCR_Category_table = JCR_Category_table[0]
        trs = JCR_Category_table.find_all('tr')
    if trs:
        jcr.append([x.text.strip() for x in trs[0].find_all('th')])
        for tr in trs[1:]:
            temp = [x.text.strip() for x in tr.find_all('td')]
            jcr.append(temp)
            jrank, jall = map(int, temp[1].split(' of '))
            temp.append(round(jrank / jall * 100, 2))
            ranks.append(temp)
        good_rank = max(ranks, key=lambda x: -x[-1])[-1]

    # Times cited and citation link
    cnt_link = soup.select('a.snowplow-citation-network-times-cited-count-link')
    if not cnt_link:
        times_cited = '0'
    else:
        cnt_link = cnt_link[0]
        times_cited = cnt_link.span.text

    # Journal name
    journal_name = soup.select('span.sourceTitle')
    journal_name = journal_name[0].text.replace('\n', '')
    # print("[1type]journal_name : ", journal_name)
    # print("[2type]journal_name : ", type(journal_name))

    # Other fields (taken from the hidden data-correction form)
    correction_form = soup.find(
        action='http://ips.clarivate.com/cgi-bin/forms/wok_datachange/wok-proc.pl')
    if not correction_form:
        correction_form = soup.find(
            action='https://support.clarivate.com/ScientificandAcademicResearch/s/datachanges')
    correction_form_inputs_by_name = {}
    for inputTag in correction_form.find_all('input'):
        inputDict = inputTag.attrs
        correction_form_inputs_by_name[inputDict['name']] = inputDict['value']

    doc_type = ''
    published_month = ''
    research_areas = ''
    publisher = ''
    language = ''
    reprint = ''
    authors = []
    fr_authors = []
    fr_addresses = []
    for fr_field in soup.select('p.FR_field'):
        if fr_field.text.find('Document Type:') != -1:
            doc_type = fr_field.text.split(':')[1]
        if fr_field.text.find('Published:') != -1:
            published_month = fr_field.text.split(':')[1]
        if fr_field.text.find('Research Areas:') != -1:
            research_areas = fr_field.text.split(':')[1]
        if fr_field.text.find('Publisher ') != -1:
            publisher = ' '.join(fr_field.text.split(' ')[1:])
            publisher = publisher.split(',')
        if fr_field.text.find('Language:') != -1:
            language = fr_field.text.split(':')[1]
        if fr_field.text.find('Reprint Address:') != -1:
            reprint = fr_field.text.split(':')[1].replace('\n', '').strip()
        if fr_field.text.find('By:') != -1:
            fr_authors = fr_field
        # if fr_field.text.find('Addresses:') != -1:
        #     if fr_field.text.find('E-mail') != -1:
        #         continue
        #     fr_addresses = fr_field.nextSibling

    addresses = {}

    # Authors and affiliations
    fconts = fr_authors.select('a')
    fr_authors_text = fr_authors.text.replace('\n', '')
    fr_authors_text = fr_authors_text.split(':')[1].split(';')

    # Full names
    full_name = {}
    for fa in fr_authors_text:
        fa_match = None
        p_count = fa.count('(')
        if p_count > 1:
            fa_match = re.search(r'(.+) \((.+)\(.+\)\)', fa)
        elif p_count == 1:
            fa_match = re.search(r'(.+) \((.+)\)', fa)
        if fa_match:
            # strip stray parentheses from the full name
            full_name[fa_match.group(1).strip()] = re.sub(r'[()]', '', fa_match.group(2)).strip()

    target_author = ''
    tauthor_address = []
    for con in fconts:
        isSub = con.get('href').find('javascript') != -1
        if not isSub:
            if target_author != '':
                addresses[target_author] = tauthor_address
                if target_author in full_name.keys():
                    addresses[full_name[target_author]] = tauthor_address
                tauthor_address = []
            target_author = con.text.strip()
            authors += [target_author]
        else:
            addressId = re.sub(r'.+\'(.+)\'.+', r'\1', con.get('href'))
            temp = soup.find('a', id=addressId)
            if temp != None:
                # tauthor_address += [temp.contents[0]]
                tauthor_address += [temp.text]

    if target_author != '':
        addresses[target_author] = tauthor_address
        if target_author in full_name.keys():
            addresses[full_name[target_author]] = tauthor_address

    if reprint == '':
        reprint = 'None'

    paperData = {
        'id': paper_data_id,
        # 'authors': correction_form_inputs_by_name['00N70000002C0wa'].split(';'),
        'authors': authors,
        'full_name': full_name,
        'fr_authors_text': fr_authors_text,
        'firstAuthor': authors[0],
        'addresses': addresses,
        'authorsCnt': str(len(correction_form_inputs_by_name['00N70000002C0wa'].split(';')) - 1),
        'doi': correction_form_inputs_by_name['00N70000002n88A'],
        'volume': correction_form_inputs_by_name['00N70000002Bdnt'],
        'issue': correction_form_inputs_by_name['00N700000029W18'],
        'pages': correction_form_inputs_by_name['00N70000002C0vh'],
        'published': correction_form_inputs_by_name['00N70000002BdnY'],
        'publishedMonth': published_month,
        'publisher': publisher,
        'journal_name': journal_name,
        # 'title': correction_form_inputs_by_name['00N70000002BdnX'],
        'title': title,
        'impact_factor': impact_factor,
        'prevYearIF': 'None',
        'goodRank': good_rank,
        'timesCited': times_cited,
        'grades': grades,
        'capedGrades': caped_grades,
        'docType': doc_type,
        'researchAreas': research_areas,
        'language': language,
        'reprint': reprint,
        'jcr': jcr,
        'citingArticles': [],
        'issn': ISSN,
    }
    paperData['ivp'] = ['%s/%s' % (paperData['issue'], paperData['volume']), paperData['pages']]

    # Previous-year impact factor
    now = datetime.datetime.now()
    prev_year = str(now.year - 2)
    # prev_year = str(int(paperData['published']) - 1)
    if prev_year in impact_factor.keys():
        try:
            paperData['prevYearIF'] = impact_factor[prev_year]
        except:
            prev_year = str(now.year - 1)
            paperData['prevYearIF'] = impact_factor[prev_year]

    return paperData, cnt_link
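
# --------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). Assuming the
# module-level imports used above (bs4.BeautifulSoup, re, datetime,
# sju_exceptions) are present, parse_paper_data() can be exercised against a
# locally saved Web of Science full-record page. The file name below is a
# placeholder.
if __name__ == '__main__':
    import random

    with open('full_record_sample.html', 'rb') as fp:
        sample_html = fp.read()

    try:
        paper, cited_link = parse_paper_data(sample_html, str(random.getrandbits(32)), 'single')
        print(paper['title'], paper['timesCited'], paper['issn'])
    except sju_exceptions.NoPaperDataError:
        print('no paper found in the saved page')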
def start(self, query, start_year, end_year, gubun):
    '''
    Provides detailed information for two or more papers.
    :param query: input values (paper title, authors)
    :param start_year: start year
    :param end_year: end year
    :param gubun: search field category
    :return:
    '''
    # Fixed to Sejong Univ
    query = (query[0], query[1], 'Sejong Univ')

    session = self.session
    base_url = self.base_url
    ui_stream = self.ui_stream
    threading_amount = self.threading_amount

    keyword = query[0]
    p_authors = query[1]
    organization = query[2]

    # [Step 1/3] Initial search
    #########################################################################
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4002])
    ui_stream.push(command='log', msg='Search term: %s' % keyword)

    if keyword.find('=') != -1:
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1300][0])
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query, 'msg': sju_CONSTANTS.STATE_MSG[1300][0]})
        return

    action_url = '/WOS_GeneralSearch.do'
    form_data = {
        'action': 'search',
        'product': 'WOS',
        'search_mode': 'GeneralSearch',
        'sa_params': 'WOS||%s|http://apps.webofknowledge.com|\'' % self.SID,
        'SID': self.SID,
        'value(input1)': keyword,
        'value(select1)': gubun,
        'startYear': start_year,
        'endYear': end_year,
    }
    if organization != '':
        form_data.update({
            'limitStatus': 'expanded',
            'value(bool_1_2)': 'AND',
            'value(input2)': organization,
            'value(select2)': 'AD',
            'fieldCount': '2',
        })

    # Search request
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4102])
    url = base_url + action_url
    self.qid += 1
    http_res = session.post(url, form_data)

    # Access denied
    if http_res.status_code == 403:
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query,
                            'msg': 'The search request was rejected by the server (access denied).'})
        return

    target_content = http_res.content
    soup = BeautifulSoup(target_content, 'html.parser')
    atag_list = soup.select('a.snowplow-full-record')
    # report_link = soup.select('a.citation-report-summary-link')

    # When page 1 has no search results
    try:
        if soup.find(id="footer_formatted_count") == None:
            raise sju_exceptions.NoPaperDataError()
        else:
            total_count = soup.find(id="footer_formatted_count").text
            total_count = int(total_count.replace(",", ""))
        if len(atag_list) == 0:
            raise sju_exceptions.NoPaperDataError()
    # No search results
    except sju_exceptions.NoPaperDataError:
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[4302][0])
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query, 'msg': sju_CONSTANTS.STATE_MSG[4302][0]})
        return
    except Exception as e:
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4303][0])
        raise Exception(e)

    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4202])

    # [Step 2/3] Fetch the detail pages
    #########################################################################
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4003])

    query_string = atag_list[0]['href']
    action_url = query_string[0:query_string.find('page')]

    # Build the URL of every page to fetch
    page_count = 1
    query_string_list = []

    # Keep at most 50 URLs
    if total_count > 50:
        doc_count_range = 51
    else:
        doc_count_range = total_count + 1

    for doc_count in range(1, doc_count_range):
        url = base_url + action_url + "page=" + str(page_count) + "&doc=" + str(doc_count)
        query_string_list.append(url)
        if doc_count % 10 == 0:
            page_count += 1

    # Start requesting the detail page of each result
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4103])
    MSC = MultiSearchContainer(ui_stream)
    with concurrent.futures.ThreadPoolExecutor(max_workers=threading_amount) as exe:
        future_run = {
            exe.submit(MSC.start, q_url, session, p_authors): q_url
            for q_url in query_string_list
        }
        for future in concurrent.futures.as_completed(future_run):
            q_url = future_run[future]
            try:
                future.result()
            except Exception as e:
                ui_stream.push(command='err', msg='[Multi search] error while fetching %s' % q_url)
                raise e

    if self.qid > 180:
        self.set_session()

    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4200][0])
    return
def start(self, query, start_year, end_year, gubun):
    '''
    Provides detailed information for a single paper.
    :param query: keyword, p_authors, organization (each a string)
    :param start_year: start year
    :param end_year: end year
    :param gubun: search field category
    :return:
    '''
    # Fixed to Sejong Univ #####################
    query = (query[0], query[1], 'Sejong Univ')

    # driver = self.driver
    session = self.session
    base_url = self.base_url
    ui_stream = self.ui_stream

    keyword = query[0]
    p_authors = query[1]
    organization = query[2]
    paper_data_id = str(random.getrandbits(32))

    # Randomize headers to improve search speed
    # orginal_headers = session.headers
    # session.headers.update({'User-Agent': str(random.getrandbits(32))})

    # [Step 1/3] Initial search
    #########################################################################
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1002])
    ui_stream.push(command='log', msg='Search term: %s' % keyword)

    if keyword.find('=') != -1:
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1300][0])
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query, 'msg': sju_CONSTANTS.STATE_MSG[1300][0]})
        return

    action_url = '/WOS_GeneralSearch.do'
    form_data = {
        'action': 'search',
        'product': 'WOS',
        'search_mode': 'GeneralSearch',
        'sa_params': 'WOS||%s|http://apps.webofknowledge.com|\'' % self.SID,
        'SID': self.SID,
        'value(input1)': keyword,
        'value(select1)': gubun,
        'startYear': start_year,
        'endYear': end_year,
    }
    if organization != '':
        form_data.update({
            'limitStatus': 'expanded',
            'value(bool_1_2)': 'AND',
            'value(input2)': organization,
            'value(select2)': 'AD',
            'fieldCount': '2',
        })

    # Search request
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1102])
    url = base_url + action_url

    # On SEJONG WIFI some characters in form variable names are not encoded
    # correctly; the offending characters still need to be identified and fixed.
    # form_data = sju_utiles.get_form_data(action_url, form_data)

    self.qid += 1
    http_res = session.post(url, form_data)

    # # Search succeeded
    # if http_res.status_code == requests.codes.ok:
    #     location = http_res.history[0].headers['Location']
    #     reffer = base_url + '/' + location
    # # Search failed
    # else:
    #     ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][2])
    #     raise sju_exceptions.RequestsError

    # Access denied
    if http_res.status_code == 403:
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query,
                            'msg': 'The search request was rejected by the server (access denied).'})
        return

    # http_res = session.get(reffer)
    # # Access denied
    # if http_res.status_code == 403:
    #     ui_stream.push(
    #         command='res', target='errQuery',
    #         res={'query': query, 'msg': 'The result list page request was rejected by the server (access denied).'})
    #     return

    target_content = http_res.content
    soup = BeautifulSoup(target_content, 'html.parser')
    atag_list = soup.select('a.snowplow-full-record')
    report_link = soup.select('a.citation-report-summary-link')

    try:
        if len(atag_list) == 0:
            raise sju_exceptions.NoPaperDataError()
        elif len(atag_list) > 1:
            raise sju_exceptions.MultiplePaperDataError()
    # No search results
    except sju_exceptions.NoPaperDataError:
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][0])
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query, 'msg': sju_CONSTANTS.STATE_MSG[1302][0]})
        return
    # Two or more search results
    except sju_exceptions.MultiplePaperDataError:
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][1])
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query, 'msg': sju_CONSTANTS.STATE_MSG[1302][1]})
        return
    except Exception as e:
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1303][0])
        raise Exception(e)

    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1202])

    # [Step 2/3] Fetch the detail page and look up citation years (threaded)
    #########################################################################
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1003])
    tc_data = {'tc_dict': []}
    tc_parsing_thread = None

    # When a citation report link is found
    if len(report_link) != 0:
        # Thread the citation-year lookup
        tc_parsing_thread = threading.Thread(target=self.get_tc_data,
                                             args=(report_link, paper_data_id, tc_data))
        tc_parsing_thread.start()

    # Go by way of the result list page
    query_string = atag_list[0]['href']

    # # Enter the detail view directly.
    # # qid can sometimes be random; that issue must be resolved before this
    # # path can be used.
    # action_url = '/full_record.do'
    # query_data = {
    #     'page': '1',
    #     'qid': str(self.qid),
    #     'SID': self.SID,
    #     'doc': '1',
    # }
    # query_string = sju_utiles.get_query_string(action_url, query_data)

    # Request the detail page
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1103])
    # session.headers['Reffer'] = reffer
    http_res = session.get(base_url + query_string)

    # Access denied
    if http_res.status_code == 403:
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query,
                            'msg': 'The detail page request was rejected by the server (access denied).'})
        return

    target_content = http_res.content

    # Parse the detail page
    try:
        paper_data, cnt_link = sju_utiles.parse_paper_data(target_content, paper_data_id, "single")
        # paper_data['subsidy'] = sju_utiles.get_subsidy01(paper_data, p_authors)
    # No search results
    except sju_exceptions.NoPaperDataError:
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][0])
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query, 'msg': sju_CONSTANTS.STATE_MSG[1302][0]})
        return
    # Two or more search results
    except sju_exceptions.MultiplePaperDataError:
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][1])
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query, 'msg': sju_CONSTANTS.STATE_MSG[1302][1]})
        return
    except Exception as e:
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][2])
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query, 'msg': sju_CONSTANTS.STATE_MSG[1302][2]})
        # raise sju_exceptions.FailedToParseError(e, query)
        return
    # Request succeeded
    else:
        ui_stream.push(command='res', target='paperData', res=paper_data)

    # Wait for the citation-year lookup to finish
    if tc_parsing_thread:
        tc_parsing_thread.join()
        ui_stream.push(command='log', msg='Citation-year lookup finished.')
        tc_dict = tc_data['tc_dict']

        # Report the citation years when the lookup succeeded
        if len(tc_dict) > 0:
            ui_stream.push(command='res', target='tc_data',
                           res={'id': paper_data_id, 'tc_data': tc_dict})

    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1203])
    # # Request failed

    # [Step 3/3] Citing-article information
    #########################################################################
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1004])

    # Branch on the number of citations
    if not cnt_link:
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1304][0])
        self.qid += 1
        return
    elif int(paper_data['timesCited']) > 4999:
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1304][1])
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query, 'msg': sju_CONSTANTS.STATE_MSG[1304][1]})
        self.qid += 1
        return

    # Request the citation report
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1104])
    url = base_url + cnt_link['href']
    http_res = session.get(url)
    target_content = http_res.content

    # Access denied
    if http_res.status_code == 403:
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query,
                            'msg': 'The citation report request was rejected by the server (access denied).'})
        return

    soup = BeautifulSoup(target_content, 'html.parser')

    # The citation link exists, but following it reports that no results were found
    if soup.text.find('Your search found no records') != -1 or soup.text.find(
            'None of the Citing Articles are in your subscription') != -1:
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1304][3])
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query, 'msg': sju_CONSTANTS.STATE_MSG[1304][3]})
        return

    qid = soup.select('input#qid')[0].attrs['value']
    rurl = soup.select('input#rurl')[0].attrs['value']
    times_cited = paper_data['timesCited']
    self.qid = int(qid)

    # Request and download Fast 5000
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1204])
    action_url = '/OutboundService.do?action=go&&'
    form_data = {
        'qid': str(self.qid),
        'SID': self.SID,
        'mark_to': times_cited,
        'markTo': times_cited,
    }
    form_data = sju_utiles.get_form_data(action_url, form_data)
    url = base_url + action_url
    http_res = session.post(url, form_data)
    self.qid += 1

    # Access denied
    if http_res.status_code == 403:
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query,
                            'msg': 'The citing-article download request was rejected by the server (access denied).'})
        return

    # Process the Fast 5000 data
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1404])
    fast_5000 = http_res.content.decode('utf-8').replace('\r', '')
    fast_5000_list = fast_5000.split('\n')
    keys = fast_5000_list[0].split('\t')
    fast_5000_list = fast_5000_list[1:]
    if fast_5000_list[-1] == '':
        fast_5000_list.pop()

    article = {}
    citing_articles = []
    for row in fast_5000_list:
        row_list = row.split('\t')
        for idx, key in enumerate(keys):
            article[key] = row_list[idx]
        citing_articles.append(article)
        article = {}

    # Convert to the UI response format
    citingArticles = {
        'id': paper_data['id'],
        'selfCitation': 0,
        'othersCitation': 0,
        'titles': [],
        'authors': [],
        'isSelf': []
    }

    # Check against the reference authors
    if p_authors != '':
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1504])
        p_authors = list(map(lambda x: x.replace(' ', '').replace(',', ''), p_authors.split(';')))

    for article in citing_articles:
        citingArticles['titles'] += [article['TI']]
        citingArticles['authors'] += [article['AU']]
        au_temp = article['AU'].replace(' ', '').replace(',', '')
        if p_authors != '':
            found = False
            for pa in p_authors:
                if re.search(pa, au_temp, flags=re.IGNORECASE):
                    found = True
                    citingArticles['selfCitation'] += 1
                    citingArticles['isSelf'] += ['Self']
                    break
            if not found:
                citingArticles['othersCitation'] += 1
                citingArticles['isSelf'] += ['Others\'']
        else:
            citingArticles['isSelf'] += ['-']

    ui_stream.push(command='res', target='citingArticles', res=citingArticles)

    # [End of step] Single detail search
    #########################################################################
    # Avoid the history limit
    if self.qid > 180:
        self.set_session()

    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1200][0])
    return
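
    # ----------------------------------------------------------------------
    # Note on the self-citation check above (illustrative sketch, not part of
    # the original code). p_authors is a ';'-separated string such as
    # 'Kim, MS; Lee, J'; after stripping spaces and commas it becomes
    # ['KimMS', 'LeeJ'], and each citing article's AU field is normalised the
    # same way before the case-insensitive re.search. The names below are
    # made up:
    #
    #   import re
    #   p_authors = [a.replace(' ', '').replace(',', '') for a in 'Kim, MS; Lee, J'.split(';')]
    #   au = 'Park, S; Kim, MS'.replace(' ', '').replace(',', '')
    #   is_self = any(re.search(pa, au, flags=re.IGNORECASE) for pa in p_authors)  # True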
def start(self, query, start_year, end_year, gubun):
    '''
    Fast paper search with detailed information.
    :param query: keyword, p_authors, organization (each a string)
    :param start_year: start year
    :param end_year: end year
    :param gubun: search field category
    :return:
    '''
    # Fixed to Sejong Univ #####################
    query = (query[0], query[1], 'Sejong Univ')

    session = self.session
    base_url = self.base_url
    ui_stream = self.ui_stream

    keyword = query[0]
    p_authors = query[1]
    organization = query[2]

    # Randomize headers to improve search speed
    # orginal_headers = session.headers
    # session.headers.update({'User-Agent': str(random.getrandbits(32))})

    # [Step 1/3] Initial search
    #########################################################################
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1002])
    ui_stream.push(command='log', msg='Search term: %s' % keyword)

    if keyword.find('=') != -1:
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1300][0])
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query, 'msg': sju_CONSTANTS.STATE_MSG[1300][0]})
        return

    action_url = '/WOS_GeneralSearch.do'
    form_data = {
        'action': 'search',
        'product': 'WOS',
        'search_mode': 'GeneralSearch',
        'sa_params': 'WOS||%s|http://apps.webofknowledge.com|\'' % self.SID,
        'SID': self.SID,
        'value(input1)': keyword,
        'value(select1)': gubun,
        'startYear': start_year,
        'endYear': end_year,
    }
    if organization != '':
        form_data.update({
            'limitStatus': 'expanded',
            'value(bool_1_2)': 'AND',
            'value(input2)': organization,
            'value(select2)': 'AD',
            'fieldCount': '2',
        })

    # Search request
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1102])
    url = base_url + action_url

    # On SEJONG WIFI some characters in form variable names are not encoded
    # correctly; the offending characters still need to be identified and fixed.
    # form_data = sju_utiles.get_form_data(action_url, form_data)

    self.qid += 1
    http_res = sju_utiles.sju_post(session, url, form_data, 5, query)
    # http_res = session.post(url, form_data, verify=False)

    # # Search succeeded
    # if http_res.status_code == requests.codes.ok:
    #     location = http_res.history[0].headers['Location']
    #     reffer = base_url + '/' + location
    # # Search failed
    # else:
    #     ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][2])
    #     raise sju_exceptions.RequestsError

    # Access denied
    if http_res.status_code == 403:
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query,
                            'msg': 'The search request was rejected by the server (access denied).'})
        return

    # http_res = session.get(reffer)
    # # Access denied
    # if http_res.status_code == 403:
    #     ui_stream.push(
    #         command='res', target='errQuery',
    #         res={'query': query, 'msg': 'The result list page request was rejected by the server (access denied).'})
    #     return

    target_content = http_res.content
    soup = BeautifulSoup(target_content, 'html.parser')
    atag = soup.select_one('a.snowplow-full-record')

    try:
        if not atag:
            raise sju_exceptions.NoPaperDataError()
    # No search results
    except sju_exceptions.NoPaperDataError:
        ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][0])
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query, 'msg': sju_CONSTANTS.STATE_MSG[1302][0]})
        return
    except Exception as e:
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1303][0])
        raise Exception(e)

    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1202])

    # [Step 3/3] Download the full Fast data
    #########################################################################
    # Request and download Fast 5000
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1204])
    qid = soup.select('input#qid')[0].attrs['value']
    rurl = soup.select('input#rurl')[0].attrs['value']
    self.qid = int(qid)

    action_url = '/OutboundService.do?action=go&&'
    form_data = {
        'qid': str(self.qid),
        'SID': self.SID,
        'mark_to': '5000',
        'markTo': '5000',
    }
    form_data = sju_utiles.get_form_data(action_url, form_data)
    url = base_url + action_url
    http_res = sju_utiles.sju_post(session, url, form_data, 5, query)
    # http_res = session.post(url, form_data, verify=False)
    self.qid += 1

    # Access denied
    if http_res.status_code == 403:
        ui_stream.push(command='res', target='errQuery',
                       res={'query': query,
                            'msg': 'The citing-article download request was rejected by the server (access denied).'})
        return

    # Process the Fast 5000 data
    ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1404])
    fast_5000 = http_res.content.decode('utf-8').replace('\r', '')
    fast_5000_list = fast_5000.split('\n')
    keys = fast_5000_list[0].split('\t')
    fast_5000_list = fast_5000_list[1:]
    if fast_5000_list[-1] == '':
        fast_5000_list.pop()

    article = {}
    articles = []
    for row in fast_5000_list:
        row_list = row.split('\t')
        for idx, key in enumerate(keys):
            article[key] = row_list[idx]
        article['id'] = str(random.getrandbits(8))
        articles.append(article)
        article = {}

    if self.qid > 180:
        self.set_session()

    ui_stream.push(command='res', target='fast_5000', res=articles)
    return
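
    # ----------------------------------------------------------------------
    # Note on the Fast 5000 export handled above (illustrative sketch, not
    # part of the original code). The response body is tab-separated text
    # whose first line holds the field codes (e.g. AU, TI). A minimal
    # standalone parse of such a payload, using made-up sample data, mirrors
    # the loop above:
    #
    #   sample = 'AU\tTI\nKim, MS\tSome title\nLee, J\tAnother title\n'
    #   lines = sample.replace('\r', '').split('\n')
    #   keys = lines[0].split('\t')
    #   rows = [dict(zip(keys, l.split('\t'))) for l in lines[1:] if l]
    #   # rows == [{'AU': 'Kim, MS', 'TI': 'Some title'},
    #   #          {'AU': 'Lee, J', 'TI': 'Another title'}]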
def parse_paper_data(target_content, paper_data_id, search_type, SID_name):
    """
    Parse a full-record page and return the collected information.
    :param target_content: page HTML
    :param paper_data_id: random ID (10 digits)
    :param search_type: 'single' or duplicate ('dupl') search
    :param SID_name: WoS session SID used to build the InCites (JCR) request
    :return: paper data dict and the times-cited link
    """
    soup = BeautifulSoup(target_content, 'html.parser')

    if search_type == "single":
        # Number of search results
        pagination_btn = soup.select('a.paginationNext')

        # Stop immediately when there is no result count
        if not pagination_btn or len(pagination_btn) == 0:
            raise sju_exceptions.NoPaperDataError()

        pagination_btn_alt = soup.select('a.paginationNext')[0].attrs['alt']

        # Stop immediately when the result count is not exactly one
        # and pagination_btn_alt.find('비활성') == -1
        if pagination_btn_alt.find('Inactive') == -1:
            raise sju_exceptions.MultiplePaperDataError()

    # Paper title
    title = soup.select('div.title')[0].text.replace('\n', '')

    # ISSN
    ISSN = soup.select('p.sameLine')
    if ISSN:
        ISSN = ISSN[0].value.contents[0]
    else:
        ISSN = ''

    # Grades
    grades = []
    caped_grades = []
    box_label = soup.select('span.box-label')
    for label in box_label:
        if label.text.find('- ') != -1:
            temp = label.text.replace('- ', '')
            grades += [temp]
            caped_grades += [re.sub(r'[ a-z]+', r'', temp)]

    # Times cited and citation link
    cnt_link = soup.select('a.snowplow-citation-network-times-cited-count-link')
    if not cnt_link:
        times_cited = '0'
    else:
        cnt_link = cnt_link[0]
        times_cited = cnt_link.span.text

    # Journal name
    journal_name = soup.select('span.sourceTitle')
    journal_name = journal_name[0].text.replace('\n', '')
    # print("[1type]journal_name : ", journal_name)
    # print("[2type]journal_name : ", type(journal_name))

    # Other fields (taken from the hidden data-correction form)
    correction_form = soup.find(
        action='http://ips.clarivate.com/cgi-bin/forms/wok_datachange/wok-proc.pl')
    if not correction_form:
        correction_form = soup.find(
            action='https://support.clarivate.com/ScientificandAcademicResearch/s/datachanges')
    correction_form_inputs_by_name = {}
    for inputTag in correction_form.find_all('input'):
        inputDict = inputTag.attrs
        correction_form_inputs_by_name[inputDict['name']] = inputDict['value']

    doc_type = ''
    published_month = ''
    research_areas = ''
    publisher = ''
    language = ''
    reprint = ''
    authors = []
    fr_authors = []
    fr_addresses = []
    for fr_field in soup.select('p.FR_field'):
        if fr_field.text.find('Document Type:') != -1:
            doc_type = fr_field.text.split(':')[1]
        if fr_field.text.find('Published:') != -1:
            published_month = fr_field.text.split(':')[1]
        if fr_field.text.find('Research Areas:') != -1:
            research_areas = fr_field.text.split(':')[1]
        if fr_field.text.find('Publisher ') != -1:
            publisher = ' '.join(fr_field.text.split(' ')[1:])
            publisher = publisher.split(',')
        if fr_field.text.find('Language:') != -1:
            language = fr_field.text.split(':')[1]
        if fr_field.text.find('Reprint Address:') != -1:
            reprint = fr_field.text.split(':')[1].replace('\n', '').strip()
        if fr_field.text.find('By:') != -1:
            fr_authors = fr_field
        # if fr_field.text.find('Addresses:') != -1:
        #     if fr_field.text.find('E-mail') != -1:
        #         continue
        #     fr_addresses = fr_field.nextSibling

    addresses = {}

    # (NEW PART) Impact factor and percentile
    # Get publication year - 1 (filtered out of the Published field)
    incite_published_month = published_month
    incite_published_month = incite_published_month.strip()
    incite_published_month = re.findall(r'2[0-9][0-9][0-9]|19[0-9][0-9]', incite_published_month)[0]
    incite_published_month = str(int(incite_published_month) - 1)

    # IF / percentile [step 1]: detail page -> InCites page URL
    publish_id = soup.find("a", {"id": "HS_JCRLink"})
    publish_id = publish_id['onclick']
    publish_id = publish_id[publish_id.find('toPID') + 6:publish_id.find('cacheurl') - 1]
    ISSN_name = str(ISSN)
    jr_name = journal_name
    jr_name = jr_name.replace(" ", "%20")
    JRC_url = get_incite_form(SID_name, publish_id, ISSN_name, jr_name)
    # print(JRC_url)
    # print(type(JRC_url))
    # print("start111")

    # IF / percentile [step 2]: parse the InCites page (pass 1)
    r = requests.Session()
    try:
        ua = UserAgent()
        new_user_agent = {'User-Agent': ua.random}
        http_res = r.get(JRC_url, headers=new_user_agent)
        # print("start222")

        # [step 2]-1 parse the impact factor (pass 2)
        # Find the InCites journal name
        incite_jr_name = "https://jcr.clarivate.com/SearchJournalsJson.action?query=" + ISSN_name
        # incite_jr_name = "https://jcr.incites.thomsonreuters.com/SearchJournalsJson.action?query=" + ISSN_name
        http_incite_jr = r.get(incite_jr_name)
        http_incite_jr_text = http_incite_jr.text
        incite_edition_name = http_incite_jr_text[http_incite_jr_text.find('edition') + 10:
                                                  http_incite_jr_text.find('jcrCoverageYears') - 3]
        incite_jr_name = http_incite_jr_text[http_incite_jr_text.find('abbrJournal') + 14:
                                             http_incite_jr_text.find('journalTitle') - 3]
        incite_jr_name = incite_jr_name.replace(' ', '%20')

        # base_json_url = "https://jcr.incites.thomsonreuters.com/JournalProfileGraphDataJson.action?abbrJournal=" + incite_jr_name
        base_json_url = "https://jcr.clarivate.com/JournalProfileGraphDataJson.action?abbrJournal=" + incite_jr_name
        base_json_url += "&edition=" + incite_edition_name + "&page=1&start=0&limit=25&sort=%5B%7B%22property%22%3A%22year%22%2C%22direction%22%3A%22DESC%22%7D%5D"
        http_incite_if = r.get(base_json_url)
        findall_if = 'year":"' + incite_published_month + '.{700,800}'
        http_incite_if_text = re.findall(findall_if, http_incite_if.text)[0]
        # print("start333")

        # Impact factor
        findall_if = 'journalImpactFactor":"[0-9]{0,10}.{0,1}[0-9]{1,10}",'
        impactFactor_one = re.findall(findall_if, http_incite_if_text)[0]
        impactFactor_one = impactFactor_one[impactFactor_one.find(':') + 2:-2]

        # 5-year impact factor
        findall_if = 'fiveYearImpactFactor":("[0-9]{0,10}.{0,1}[0-9]{1,10}"|null),"'
        impactFactor_two = re.findall(findall_if, http_incite_if_text)[0]
        if impactFactor_two == "null":
            impactFactor_two = "None"
        else:
            impactFactor_two = impactFactor_two[1:-1]
        # print("start444")

        # [step 2]-2 parse the percentile (pass 2)
        # base_json_url = "https://jcr.incites.thomsonreuters.com/JCRImpactFactorJson.action?&abbrJournal=" + incite_jr_name
        base_json_url = "https://jcr.clarivate.com/JCRImpactFactorJson.action?&abbrJournal=" + incite_jr_name
        base_json_url += "&edition=" + incite_edition_name
        http_incite_per = r.get(base_json_url)
        http_incite_per_LIST = ast.literal_eval(http_incite_per.text)
        http_incite_per_LIST = http_incite_per_LIST['data']

        # JCR rank
        ranks = []
        temp = []
        jcr = []
        good_rank = ''
        for PER_LIST in http_incite_per_LIST:
            if PER_LIST['year'] == str(incite_published_month):
                test = str(PER_LIST)
                find_per = r'.{1,3}\/.{1,3}-Q[0-9]'
                JCRS = re.findall(find_per, test)
                for JCR in JCRS:
                    JCR_P = JCR[JCR.find("'") + 1:JCR.find("-")]
                    temp = [JCR_P]
                    jrank, jall = map(int, JCR_P.split('/'))
                    temp.append(round(jrank / jall * 100, 2))
                    ranks.append(temp)
                    jcr.append('num')
                    jcr.append(temp)
                good_rank = max(ranks, key=lambda x: -x[-1])[-1]
        # print("====================================")
        # print("title : ", title)
        # print(good_rank)
        # print("====================================")

        """
        # JCR rank (previous approach: parse the JCR_Category_table on the record page)
        JCR_Category_table = soup.select('table.JCR_Category_table')
        jcr_headers = []
        jcr = []
        ranks = []
        good_rank = ''
        trs = []
        if len(JCR_Category_table) > 0:
            JCR_Category_table = JCR_Category_table[0]
            trs = JCR_Category_table.find_all('tr')
        if trs:
            jcr.append([x.text.strip() for x in trs[0].find_all('th')])
            for tr in trs[1:]:
                temp = [x.text.strip() for x in tr.find_all('td')]
                jcr.append(temp)
                jrank, jall = map(int, temp[1].split(' of '))
                temp.append(round(jrank / jall * 100, 2))
                ranks.append(temp)
            good_rank = max(ranks, key=lambda x: -x[-1])[-1]
        """

        """
        # Regexes from the previous percentile approach
        # findall_if = '("year":"' + incite_published_month + '".{10,200}},{"year|"year":"' + incite_published_month + '".{10,200}})'
        # ("year":"2002".{10,200}},{"year|"year":"2002".{10,200}}|,{.{10,200},"year":"2002",.{10,500}"},)
        # findall_if = '("year":"' + incite_published_month + '".{10,200}},{"year|"year":"' + incite_published_month + '".{10,200}}|,{.{10,200},"year":"' + incite_published_month + '",.{10,400}"},|,{".{10,200},"year":"' + incite_published_month + '",.{10,50}},)'
        # findall_if = '("year":"' + incite_published_month + '".{10,100}},{"year|"year":"' + incite_published_month + '".{10,200}},{"year|"year":"' + incite_published_month + '".{10,200}}|,{".{10,200},"year":"' + incite_published_month + '",.{10,50}},{"|,{.{10,200},"year":"' + incite_published_month + '",.{10,400}"},)'
        # findall_if = '("year":"' + incite_published_month + '"(.{10,100}|.{10,200})},{"year|"year":"' + incite_published_month + '".{10,200}}|,{.{10,200},"year":"' + incite_published_month + '",(.{10,50}|.{10,100}|.{10,200}|.{10,300}|.{10,400})"},)'
        # http_incite_per_text = re.findall(findall_if, http_incite_per.text)
        """
    except:
        # print("====================================")
        # print("error occurred!!")
        # print("title : ", title)
        # print("====================================")
        impactFactor_one = "Except"
        impactFactor_two = "Except"
        good_rank = ''
        jcr = []

    impact_factor = {}
    if impactFactor_one and impactFactor_two:
        if impactFactor_one == "Except" and impactFactor_two == "Except":
            impact_factor[incite_published_month] = "None"
            impact_factor['5 year'] = "None"
        else:
            impact_factor[incite_published_month] = impactFactor_one
            impact_factor['5 year'] = impactFactor_two
    else:
        impact_factor = {}
    # print("final impact_factor = ", impact_factor)
    # print("impactFactor one = ", impactFactor_one)
    # print("impactFactor two = ", impactFactor_two)

    # Close the InCites session
    r.close()
    # print("finish")

    # Authors and affiliations
    fconts = fr_authors.select('a')
    fr_authors_text = fr_authors.text.replace('\n', '')
    fr_authors_text = fr_authors_text.split(':')[1].split(';')

    # Full names
    full_name = {}
    for fa in fr_authors_text:
        fa_match = None
        p_count = fa.count('(')
        if p_count > 1:
            fa_match = re.search(r'(.+) \((.+)\(.+\)\)', fa)
        elif p_count == 1:
            fa_match = re.search(r'(.+) \((.+)\)', fa)
        if fa_match:
            # strip stray parentheses from the full name
            full_name[fa_match.group(1).strip()] = re.sub(r'[()]', '', fa_match.group(2)).strip()

    target_author = ''
    tauthor_address = []
    for con in fconts:
        isSub = con.get('href').find('javascript') != -1
        if not isSub:
            if target_author != '':
                addresses[target_author] = tauthor_address
                if target_author in full_name.keys():
                    addresses[full_name[target_author]] = tauthor_address
                tauthor_address = []
            target_author = con.text.strip()
            authors += [target_author]
        else:
            addressId = re.sub(r'.+\'(.+)\'.+', r'\1', con.get('href'))
            temp = soup.find('a', id=addressId)
            if temp != None:
                # tauthor_address += [temp.contents[0]]
                tauthor_address += [temp.text]

    if target_author != '':
        addresses[target_author] = tauthor_address
        if target_author in full_name.keys():
            addresses[full_name[target_author]] = tauthor_address

    if reprint == '':
        reprint = 'None'

    paperData = {
        'id': paper_data_id,
        # 'authors': correction_form_inputs_by_name['00N70000002C0wa'].split(';'),
        'authors': authors,
        'full_name': full_name,
        'fr_authors_text': fr_authors_text,
        'firstAuthor': authors[0],
        'addresses': addresses,
        'authorsCnt': str(len(correction_form_inputs_by_name['00N70000002C0wa'].split(';')) - 1),
        'doi': correction_form_inputs_by_name['00N70000002n88A'],
        'volume': correction_form_inputs_by_name['00N70000002Bdnt'],
        'issue': correction_form_inputs_by_name['00N700000029W18'],
        'pages': correction_form_inputs_by_name['00N70000002C0vh'],
        'published': correction_form_inputs_by_name['00N70000002BdnY'],
        'publishedMonth': published_month,
        'publisher': publisher,
        'journal_name': journal_name,
        # 'title': correction_form_inputs_by_name['00N70000002BdnX'],
        'title': title,
        'impact_factor': impact_factor,
        'prevYearIF': 'None',
        'goodRank': good_rank,
        'timesCited': times_cited,
        'grades': grades,
        'capedGrades': caped_grades,
        'docType': doc_type,
        'researchAreas': research_areas,
        'language': language,
        'reprint': reprint,
        'jcr': jcr,
        'citingArticles': [],
        'issn': ISSN,
    }
    paperData['ivp'] = ['%s/%s' % (paperData['issue'], paperData['volume']), paperData['pages']]
    # print("finish222222222")

    # Previous-year impact factor
    if incite_published_month in impact_factor.keys():
        paperData['prevYearIF'] = impact_factor[incite_published_month]
    # print("finish3333333333333")

    return paperData, cnt_link
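
# --------------------------------------------------------------------------
# Usage sketch (illustrative, not part of the original module). This variant
# of parse_paper_data() additionally needs a WoS session SID so it can build
# the InCites/JCR URL via get_incite_form(). The HTML file name and the SID
# value below are placeholders.
if __name__ == '__main__':
    import random

    with open('full_record_sample.html', 'rb') as fp:
        sample_html = fp.read()

    paper, cited_link = parse_paper_data(sample_html, str(random.getrandbits(32)),
                                         'single', 'EXAMPLE_SID')
    print(paper['title'], paper['impact_factor'], paper['prevYearIF'])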