Example #1
def parse_paper_data(target_content, paper_data_id, search_type):
    """
        페이지 정보를 입력받아 정리한 내용을 리스트로 반환하는 함수
        :param target_content: 페이지 내용
        :param paper_data_id: 랜덤 값 ID (10자리)
        :type: single, dupl search인지 판단
        :return: 페이지 정보, 인용 수 반환
    """
    soup = BeautifulSoup(target_content, 'html.parser')

    if search_type == "single":
        # number of search results
        pagination_btn = soup.select('a.paginationNext')

        # exit immediately if there are no results
        if not pagination_btn:
            raise sju_exceptions.NoPaperDataError()

        pagination_btn_alt = pagination_btn[0].attrs['alt']
        # exit immediately if there is more than one result
        # ('비활성' is the Korean UI label for the Inactive state)
        # and pagination_btn_alt.find('비활성') == -1
        if pagination_btn_alt.find('Inactive') == -1:
            raise sju_exceptions.MultiplePaperDataError()

    # paper title
    title = soup.select('div.title')[0].text.replace('\n', '')

    # ISSN
    ISSN = soup.select('p.sameLine')
    if ISSN:
        ISSN = ISSN[0].value.contents[0]
    else:
        ISSN = ''

    # grades
    grades = []
    caped_grades = []
    box_label = soup.select('span.box-label')
    for label in box_label:
        if label.text.find('- ') != -1:
            temp = label.text.replace('- ', '')
            grades += [temp]
            caped_grades += [re.sub(r'[ a-z]+', r'', temp)]

    # impact factor
    Impact_Factor_table = soup.select('table.Impact_Factor_table')
    impact_factor = {}
    if len(Impact_Factor_table) > 0:
        trs = Impact_Factor_table[0].find_all('tr')
        tds = trs[0].find_all('td')
        ths = trs[1].find_all('th')

        for idx, th in enumerate(ths):
            impact_factor[th.text.strip()] = tds[idx].text.strip()

    # JCR rank
    JCR_Category_table = soup.select('table.JCR_Category_table')
    jcr_headers = []
    jcr = []
    ranks = []
    good_rank = ''
    trs = []
    if len(JCR_Category_table) > 0:
        JCR_Category_table = JCR_Category_table[0]
        trs = JCR_Category_table.find_all('tr')
        if trs:
            jcr.append([x.text.strip() for x in trs[0].find_all('th')])
            for tr in trs[1:]:
                temp = [x.text.strip() for x in tr.find_all('td')]
                jcr.append(temp)
                jrank, jall = map(int, temp[1].split(' of '))
                temp.append(round(jrank / jall * 100, 2))
                ranks.append(temp)

        # the lowest percentile is the best rank; guard against an empty table
        if ranks:
            good_rank = max(ranks, key=lambda x: -x[-1])[-1]

    # citation count and link
    cnt_link = soup.select(
        'a.snowplow-citation-network-times-cited-count-link')
    if not cnt_link:
        times_cited = '0'
    else:
        cnt_link = cnt_link[0]
        times_cited = cnt_link.span.text

    # journal name
    journal_name = soup.select('span.sourceTitle')
    journal_name = journal_name[0].text.replace('\n', '')

    # other fields
    correction_form = soup.find(
        action='http://ips.clarivate.com/cgi-bin/forms/wok_datachange/wok-proc.pl')
    if not correction_form:
        correction_form = soup.find(
            action='https://support.clarivate.com/ScientificandAcademicResearch/s/datachanges')
    if not correction_form:
        # the data-change form carries most of the metadata; bail out here
        # rather than crash on .find_all below
        raise sju_exceptions.NoPaperDataError()
    correction_form_inputs_by_name = {}
    for inputTag in correction_form.find_all('input'):
        inputDict = inputTag.attrs
        correction_form_inputs_by_name[inputDict['name']] = inputDict['value']

    doc_type = ''
    published_month = ''
    research_areas = ''
    publisher = ''
    language = ''
    reprint = ''
    authors = []
    fr_authors = []
    fr_addresses = []
    for fr_field in soup.select('p.FR_field'):
        if fr_field.text.find('Document Type:') != -1:
            doc_type = fr_field.text.split(':')[1]

        if fr_field.text.find('Published:') != -1:
            published_month = fr_field.text.split(':')[1]

        if fr_field.text.find('Research Areas:') != -1:
            research_areas = fr_field.text.split(':')[1]

        if fr_field.text.find('Publisher ') != -1:
            publisher = ' '.join(fr_field.text.split(' ')[1:])
            publisher = publisher.split(',')

        if fr_field.text.find('Language:') != -1:
            language = fr_field.text.split(':')[1]

        if fr_field.text.find('Reprint Address:') != -1:
            reprint = fr_field.text.split(':')[1].replace('\n', '').strip()

        if fr_field.text.find('By:') != -1:
            fr_authors = fr_field

        # if fr_field.text.find('Addresses:') != -1:
        #     if fr_field.text.find('E-mail') != -1:
        #         continue
        #     fr_addresses = fr_field.nextSibling

    addresses = {}

    # authors and research institutions
    fconts = fr_authors.select('a')
    fr_authors_text = fr_authors.text.replace('\n', '')
    fr_authors_text = fr_authors_text.split(':')[1].split(';')

    # full names
    full_name = {}
    for fa in fr_authors_text:
        fa_match = None
        p_count = fa.count('(')
        if p_count > 1:
            fa_match = re.search(r'(.+) \((.+)\(.+\)\)', fa)
        elif p_count == 1:
            fa_match = re.search(r'(.+) \((.+)\)', fa)
        if fa_match:
            # str.replace takes literal text, not a regex, so strip the
            # parentheses with re.sub
            full_name[fa_match.group(1).strip()] = re.sub(
                r'[()]', '', fa_match.group(2)).strip()

    target_author = ''
    tauthor_address = []
    for con in fconts:
        isSub = con.get('href').find('javascript') != -1
        if not isSub:
            if target_author != '':
                addresses[target_author] = tauthor_address
                if target_author in full_name.keys():
                    addresses[full_name[target_author]] = tauthor_address
            tauthor_address = []
            target_author = con.text.strip()
            authors += [target_author]
        else:
            addressId = re.sub(r'.+\'(.+)\'.+', r'\1', con.get('href'))
            temp = soup.find('a', id=addressId)
            if temp is not None:
                # tauthor_address += [temp.contents[0]]
                tauthor_address += [temp.text]

    if target_author != '':
        addresses[target_author] = tauthor_address
        if target_author in full_name:
            addresses[full_name[target_author]] = tauthor_address
    if reprint == '':
        reprint = 'None'

    paperData = {
        'id': paper_data_id,
        # 'authors': correction_form_inputs_by_name['00N70000002C0wa'].split(';'),
        'authors': authors,
        'full_name': full_name,
        'fr_authors_text': fr_authors_text,
        'firstAuthor': authors[0],
        'addresses': addresses,
        'authorsCnt': str(
            len(correction_form_inputs_by_name['00N70000002C0wa'].split(';')) - 1),
        'doi': correction_form_inputs_by_name['00N70000002n88A'],
        'volume': correction_form_inputs_by_name['00N70000002Bdnt'],
        'issue': correction_form_inputs_by_name['00N700000029W18'],
        'pages': correction_form_inputs_by_name['00N70000002C0vh'],
        'published': correction_form_inputs_by_name['00N70000002BdnY'],
        'publishedMonth': published_month,
        'publisher': publisher,
        'journal_name': journal_name,
        # 'title': correction_form_inputs_by_name['00N70000002BdnX'],
        'title': title,
        'impact_factor': impact_factor,
        'prevYearIF': 'None',
        'goodRank': good_rank,
        'timesCited': times_cited,
        'grades': grades,
        'capedGrades': caped_grades,
        'docType': doc_type,
        'researchAreas': research_areas,
        'language': language,
        'reprint': reprint,
        'jcr': jcr,
        'citingArticles': [],
        'issn': ISSN,
    }
    paperData['ivp'] = [
        '%s/%s' % (paperData['issue'], paperData['volume']), paperData['pages']
    ]

    # previous year's impact factor: prefer (current year - 2) and fall back
    # to (current year - 1)
    now = datetime.datetime.now()
    prev_year = str(now.year - 2)
    # prev_year = str(int(paperData['published']) - 1)
    if prev_year in impact_factor:
        paperData['prevYearIF'] = impact_factor[prev_year]
    else:
        prev_year = str(now.year - 1)
        if prev_year in impact_factor:
            paperData['prevYearIF'] = impact_factor[prev_year]

    return paperData, cnt_link
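
A minimal, self-contained sketch of the BeautifulSoup selector patterns parse_paper_data relies on. The inline HTML is a simplified stand-in for a Web of Science full-record page (the real markup is far richer), so treat it purely as an illustration of the selectors, not the actual page structure:

from bs4 import BeautifulSoup

# stand-in HTML; the class names mirror those used by parse_paper_data
sample_html = '''
<div class="title">Sample Paper Title</div>
<span class="sourceTitle">Sample Journal</span>
<a class="snowplow-citation-network-times-cited-count-link"><span>12</span></a>
'''
soup = BeautifulSoup(sample_html, 'html.parser')
title = soup.select('div.title')[0].text.strip()
journal_name = soup.select('span.sourceTitle')[0].text.strip()
cnt_link = soup.select('a.snowplow-citation-network-times-cited-count-link')
times_cited = cnt_link[0].span.text if cnt_link else '0'
print(title, '|', journal_name, '|', times_cited)
# Sample Paper Title | Sample Journal | 12
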
Example #2
    def start(self, query, start_year, end_year, gubun):
        '''
            Provide detailed information about two or more papers.
            :param query: input values (paper title, authors)
            :param start_year: start year
            :param end_year: end year
            :param gubun: search category
            :return:
        '''
        # fixed to Sejong Univ
        query = (query[0], query[1], 'Sejong Univ')

        session = self.session
        base_url = self.base_url

        ui_stream = self.ui_stream
        threading_amount = self.threading_amount

        keyword = query[0]
        p_authors = query[1]
        organization = query[2]

        # [Step 1/3] initial search
        #########################################################################
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4002])
        ui_stream.push(command='log', msg='search term: %s' % keyword)

        if keyword.find('=') != -1:
            ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1300][0])
            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg': sju_CONSTANTS.STATE_MSG[1300][0]
                           })
            return

        action_url = '/WOS_GeneralSearch.do'
        form_data = {
            'action': 'search',
            'product': 'WOS',
            'search_mode': 'GeneralSearch',
            'sa_params':
            'WOS||%s|http://apps.webofknowledge.com|\'' % self.SID,
            'SID': self.SID,
            'value(input1)': keyword,
            'value(select1)': gubun,
            'startYear': start_year,
            'endYear': end_year,
        }
        if organization != '':
            form_data.update({
                'limitStatus': 'expanded',
                'value(bool_1_2)': 'AND',
                'value(input2)': organization,
                'value(select2)': 'AD',
                'fieldCount': '2',
            })

        # search request
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4102])

        url = base_url + action_url

        self.qid += 1
        http_res = session.post(url, form_data)

        # Access Denied
        if http_res.status_code == 403:
            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg': 'Requested the search, but the server returned access denied.'
                           })
            return

        target_content = http_res.content
        soup = BeautifulSoup(target_content, 'html.parser')
        atag_list = soup.select('a.snowplow-full-record')

        #report_link = soup.select('a.citation-report-summary-link')

        # when page 1 has no search results
        try:
            formatted_count = soup.find(id="footer_formatted_count")
            if formatted_count is None:
                raise sju_exceptions.NoPaperDataError()
            total_count = int(formatted_count.text.replace(",", ""))

            if len(atag_list) == 0:
                raise sju_exceptions.NoPaperDataError()

        # no search results
        except sju_exceptions.NoPaperDataError:
            ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[4302][0])

            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg': sju_CONSTANTS.STATE_MSG[4302][0]
                           })
            return

        except Exception as e:
            ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4303][0])
            raise Exception(e)

        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4202])

        # [Step 2/3] fetch the detail pages
        #########################################################################
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4003])
        query_string = atag_list[0]['href']
        action_url = query_string[0:query_string.find('page')]

        # store the URL of every page to search
        page_count = 1
        query_string_list = []

        # keep at most 50 URLs
        if total_count > 50:
            doc_count_range = 51
        else:
            doc_count_range = total_count + 1

        for doc_count in range(1, doc_count_range):
            url = base_url + action_url + "page=" + str(
                page_count) + "&doc=" + str(doc_count)
            query_string_list.append(url)
            if doc_count % 10 == 0:
                page_count += 1

        # start requesting details for each page
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4103])

        MSC = MultiSearchContainer(ui_stream)

        with concurrent.futures.ThreadPoolExecutor(
                max_workers=threading_amount) as exe:
            future_run = {
                exe.submit(MSC.start, q_url, session, p_authors): q_url
                for q_url in query_string_list
            }
            for future in concurrent.futures.as_completed(future_run):
                q_url = future_run[future]
                try:
                    future.result()
                except Exception as e:
                    ui_stream.push(
                        command='err',
                        msg='[multi search] error while fetching %s' % q_url)
                    raise e

        if self.qid > 180:
            self.set_session()

        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[4200][0])
        return
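
The fan-out in [Step 2/3] is the standard concurrent.futures pattern: submit one task per URL, then consume results as they complete. A self-contained sketch of the same structure; fetch_page is a hypothetical stand-in for MSC.start:

import concurrent.futures

def fetch_page(url):
    # hypothetical stand-in for MultiSearchContainer.start
    return '%s fetched' % url

urls = ['page=1&doc=%d' % d for d in range(1, 6)]
with concurrent.futures.ThreadPoolExecutor(max_workers=3) as exe:
    future_run = {exe.submit(fetch_page, u): u for u in urls}
    for future in concurrent.futures.as_completed(future_run):
        q_url = future_run[future]
        try:
            print(future.result())
        except Exception:
            print('error while fetching %s' % q_url)
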
Example #3
    def start(self, query, start_year, end_year, gubun):
        '''
            Provide detailed information about a single paper.
            :param query: keyword, p_authors, organization, each a string
            :param start_year: start year
            :param end_year: end year
            :param gubun: search category
            :return:
        '''
        # fixed to Sejong Univ
        #####################
        query = (query[0], query[1], 'Sejong Univ')

        # driver = self.driver
        session = self.session
        base_url = self.base_url
        ui_stream = self.ui_stream

        keyword = query[0]
        p_authors = query[1]
        organization = query[2]

        paper_data_id = str(random.getrandbits(32))

        # randomize the headers to speed up searching
        # original_headers = session.headers
        # session.headers.update({'User-Agent': str(random.getrandbits(32))})
        # [Step 1/3] initial search
        #########################################################################
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1002])
        ui_stream.push(command='log', msg='search term: %s' % keyword)

        if keyword.find('=') != -1:
            ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1300][0])
            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg': sju_CONSTANTS.STATE_MSG[1300][0]
                           })
            return

        action_url = '/WOS_GeneralSearch.do'
        form_data = {
            'action': 'search',
            'product': 'WOS',
            'search_mode': 'GeneralSearch',
            'sa_params':
            'WOS||%s|http://apps.webofknowledge.com|\'' % self.SID,
            'SID': self.SID,
            'value(input1)': keyword,
            'value(select1)': gubun,
            'startYear': start_year,
            'endYear': end_year,
        }
        if organization != '':
            form_data.update({
                'limitStatus': 'expanded',
                'value(bool_1_2)': 'AND',
                'value(input2)': organization,
                'value(select2)': 'AD',
                'fieldCount': '2',
            })

        # search request
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1102])

        url = base_url + action_url
        # When connected to SEJONG WIFI, certain characters in form variable
        # names are not encoded correctly; the offending characters still need
        # to be identified and fixed.
        # form_data = sju_utiles.get_form_data(action_url, form_data)

        self.qid += 1
        http_res = session.post(url, form_data)

        # # search succeeded
        # if http_res.status_code == requests.codes.ok:
        #     location = http_res.history[0].headers['Location']
        #     reffer = base_url + '/' + location

        # # search failed
        # else:
        #     ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][2])
        #     raise sju_exceptions.RequestsError

        # Access Denied
        if http_res.status_code == 403:
            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg': 'Requested the search, but the server returned access denied.'
                           })
            return

        # http_res = session.get(reffer)

        # # Access Denied
        # if http_res.status_code == 403:
        #     ui_stream.push(
        #         command='res', target='errQuery',
        #         res={'query': query, 'msg': 'Requested the result list page, but the server returned access denied.'}
        #     )
        #     return

        target_content = http_res.content
        soup = BeautifulSoup(target_content, 'html.parser')

        atag_list = soup.select('a.snowplow-full-record')
        report_link = soup.select('a.citation-report-summary-link')

        try:
            if len(atag_list) == 0:
                raise sju_exceptions.NoPaperDataError()
            elif len(atag_list) > 1:
                raise sju_exceptions.MultiplePaperDataError()
        # no search results
        except sju_exceptions.NoPaperDataError:
            ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][0])

            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg': sju_CONSTANTS.STATE_MSG[1302][0]
                           })
            return
        # two or more search results
        except sju_exceptions.MultiplePaperDataError:
            ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][1])
            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg': sju_CONSTANTS.STATE_MSG[1302][1]
                           })
            return
        except Exception as e:
            ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1303][0])
            raise Exception(e)

        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1202])

        # [Step 2/3] fetch the detail page and look up citation years (threaded)
        #########################################################################
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1003])

        tc_data = {'tc_dict': []}
        tc_parsing_thread = None
        # when a citation report link is present
        if len(report_link) != 0:
            # look up citation years on a separate thread
            tc_parsing_thread = threading.Thread(target=self.get_tc_data,
                                                 args=(report_link,
                                                       paper_data_id, tc_data))
            tc_parsing_thread.start()

        # the path that goes through the result list page
        query_string = atag_list[0]['href']

        # # The path that enters the full record directly.
        # # qid is sometimes random, so that issue must be resolved
        # # before this can be used.
        # action_url = '/full_record.do'
        # query_data = {
        #     'page': '1',
        #     'qid': str(self.qid),
        #     'SID': self.SID,
        #     'doc': '1',
        # }
        # query_string = sju_utiles.get_query_string(action_url, query_data)

        # request the detail page
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1103])

        # session.headers['Reffer'] = reffer
        http_res = session.get(base_url + query_string)

        # Access Denied
        if http_res.status_code == 403:
            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg':
                               'Requested the paper detail page, but the server returned access denied.'
                           })
            return

        target_content = http_res.content

        # parse the detail page
        try:
            paper_data, cnt_link = sju_utiles.parse_paper_data(
                target_content, paper_data_id, "single")
            # paper_data['subsidy'] = sju_utiles.get_subsidy01(paper_data, p_authors)

        # no search results
        except sju_exceptions.NoPaperDataError:
            ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][0])
            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg': sju_CONSTANTS.STATE_MSG[1302][0]
                           })
            return
        # two or more search results
        except sju_exceptions.MultiplePaperDataError:
            ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][1])
            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg': sju_CONSTANTS.STATE_MSG[1302][1]
                           })
            return
        except Exception as e:
            ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][2])
            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg': sju_CONSTANTS.STATE_MSG[1302][2]
                           })
            # raise sju_exceptions.FailedToParseError(e, query)
            return
        # request succeeded
        else:
            ui_stream.push(command='res', target='paperData', res=paper_data)

            # wait for the citation-year lookup to finish
            if tc_parsing_thread:
                tc_parsing_thread.join()
                ui_stream.push(command='log',
                               msg='Citation-year lookup complete.')

            tc_dict = tc_data['tc_dict']
            # report the citation years if the lookup succeeded
            if len(tc_dict) > 0:
                ui_stream.push(command='res',
                               target='tc_data',
                               res={
                                   'id': paper_data_id,
                                   'tc_data': tc_dict
                               })

            ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1203])
        # # request failed

        # [Step 3/3] citing-article information
        #########################################################################
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1004])

        # branch on the citation count
        if not cnt_link:
            ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1304][0])
            self.qid += 1
            return
        elif int(paper_data['timesCited']) > 4999:
            ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1304][1])
            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg': sju_CONSTANTS.STATE_MSG[1304][1]
                           })
            self.qid += 1
            return

        # request the citation report
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1104])

        url = base_url + cnt_link['href']

        http_res = session.get(url)
        target_content = http_res.content

        # Access Denied
        if http_res.status_code == 403:
            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg': 'Requested the citation report, but the server returned access denied.'
                           })
            return

        soup = BeautifulSoup(target_content, 'html.parser')
        # the citing-article link exists, but clicking it yields a 'no results' message
        if soup.text.find(
                'Your search found no records') != -1 or soup.text.find(
                    'None of the Citing Articles are in your subscription'
                ) != -1:
            ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1304][3])
            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg': sju_CONSTANTS.STATE_MSG[1304][3]
                           })
            return

        qid = soup.select('input#qid')[0].attrs['value']
        rurl = soup.select('input#rurl')[0].attrs['value']
        times_cited = paper_data['timesCited']
        self.qid = int(qid)

        # request and download Fast 5000
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1204])

        action_url = '/OutboundService.do?action=go&&'
        form_data = {
            'qid': str(self.qid),
            'SID': self.SID,
            'mark_to': times_cited,
            'markTo': times_cited,
        }
        form_data = sju_utiles.get_form_data(action_url, form_data)

        url = base_url + action_url
        http_res = session.post(url, form_data)
        self.qid += 1

        # Access Denied
        if http_res.status_code == 403:
            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg':
                               'Requested the citing-article download, but the server returned access denied.'
                           })
            return

        # process the Fast 5000 data
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1404])

        fast_5000 = http_res.content.decode('utf-8').replace('\r', '')
        fast_5000_list = fast_5000.split('\n')
        keys = fast_5000_list[0].split('\t')
        fast_5000_list = fast_5000_list[1:]
        if fast_5000_list and fast_5000_list[-1] == '':
            fast_5000_list.pop()

        article = {}
        citing_articles = []
        for row in fast_5000_list:
            row_list = row.split('\t')
            for idx, key in enumerate(keys):
                article[key] = row_list[idx]
            citing_articles.append(article)
            article = {}

        # convert to the UI response format
        citingArticles = {
            'id': paper_data['id'],
            'selfCitation': 0,
            'othersCitation': 0,
            'titles': [],
            'authors': [],
            'isSelf': []
        }

        # validate against the reference authors
        if p_authors != '':
            ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1504])
            p_authors = list(
                map(lambda x: x.replace(' ', '').replace(',', ''),
                    p_authors.split(';')))

        for article in citing_articles:
            citingArticles['titles'] += [article['TI']]
            citingArticles['authors'] += [article['AU']]
            au_temp = article['AU'].replace(' ', '').replace(',', '')
            if p_authors != '':
                found = False
                for pa in p_authors:
                    # escape the author string so it is matched literally
                    if re.search(re.escape(pa), au_temp, flags=re.IGNORECASE):
                        found = True
                        citingArticles['selfCitation'] += 1
                        citingArticles['isSelf'] += ['Self']
                        break

                if not found:
                    citingArticles['othersCitation'] += 1
                    citingArticles['isSelf'] += ['Others\'']
            else:
                citingArticles['isSelf'] += ['-']

        ui_stream.push(command='res',
                       target='citingArticles',
                       res=citingArticles)

        # [Final step] single detail search
        #########################################################################
        # avoid the server's history limit
        if self.qid > 180:
            self.set_session()

        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1200][0])

        return
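
The self-citation check above boils down to normalizing author strings (dropping spaces and commas) and matching each reference author against a citing article's author list. A reduced, self-contained sketch with made-up author data:

import re

# reference authors, semicolon-separated as in the WoS export
p_authors = 'Kim, J; Lee, S'
p_authors = [a.replace(' ', '').replace(',', '') for a in p_authors.split(';')]

citing_authors = 'Kim, J.; Park, H.'
au_temp = citing_authors.replace(' ', '').replace(',', '')

is_self = any(re.search(re.escape(pa), au_temp, flags=re.IGNORECASE)
              for pa in p_authors)
print('Self' if is_self else "Others'")  # Self
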
Example #4
    def start(self, query, start_year, end_year, gubun):
        '''
            Quick paper search with detailed information.
            :param query: keyword, p_authors, organization, each a string
            :param start_year: start year
            :param end_year: end year
            :param gubun: search category
            :return:
        '''
        # fixed to Sejong Univ
        #####################
        query = (query[0], query[1], 'Sejong Univ')

        session = self.session
        base_url = self.base_url
        ui_stream = self.ui_stream

        keyword = query[0]
        p_authors = query[1]
        organization = query[2]

        # randomize the headers to speed up searching
        # original_headers = session.headers
        # session.headers.update({'User-Agent': str(random.getrandbits(32))})
        # [Step 1/3] initial search
        #########################################################################
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1002])
        ui_stream.push(command='log', msg='search term: %s' % keyword)

        if keyword.find('=') != -1:
            ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1300][0])
            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg': sju_CONSTANTS.STATE_MSG[1300][0]
                           })
            return

        action_url = '/WOS_GeneralSearch.do'
        form_data = {
            'action': 'search',
            'product': 'WOS',
            'search_mode': 'GeneralSearch',
            'sa_params':
            'WOS||%s|http://apps.webofknowledge.com|\'' % self.SID,
            'SID': self.SID,
            'value(input1)': keyword,
            'value(select1)': gubun,
            'startYear': start_year,
            'endYear': end_year,
        }
        if organization != '':
            form_data.update({
                'limitStatus': 'expanded',
                'value(bool_1_2)': 'AND',
                'value(input2)': organization,
                'value(select2)': 'AD',
                'fieldCount': '2',
            })

        # search request
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1102])

        url = base_url + action_url
        # When connected to SEJONG WIFI, certain characters in form variable
        # names are not encoded correctly; the offending characters still need
        # to be identified and fixed.
        # form_data = sju_utiles.get_form_data(action_url, form_data)

        self.qid += 1

        http_res = sju_utiles.sju_post(session, url, form_data, 5, query)
        #http_res = session.post(url, form_data, verify=False)

        # # search succeeded
        # if http_res.status_code == requests.codes.ok:
        #     location = http_res.history[0].headers['Location']
        #     reffer = base_url + '/' + location

        # # search failed
        # else:
        #     ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][2])
        #     raise sju_exceptions.RequestsError

        # Access Denied
        if http_res.status_code == 403:
            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg': 'Requested the search, but the server returned access denied.'
                           })
            return

        # http_res = session.get(reffer)

        # # Access Denied
        # if http_res.status_code == 403:
        #     ui_stream.push(
        #         command='res', target='errQuery',
        #         res={'query': query, 'msg': 'Requested the result list page, but the server returned access denied.'}
        #     )
        #     return

        target_content = http_res.content
        soup = BeautifulSoup(target_content, 'html.parser')
        atag = soup.select_one('a.snowplow-full-record')
        try:
            if not atag:
                raise sju_exceptions.NoPaperDataError()
        # no search results
        except sju_exceptions.NoPaperDataError:
            ui_stream.push(command='err', msg=sju_CONSTANTS.STATE_MSG[1302][0])

            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg': sju_CONSTANTS.STATE_MSG[1302][0]
                           })
            return
        except Exception as e:
            ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1303][0])
            raise Exception(e)

        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1202])
        # [Step 3/3] download all Fast data
        #########################################################################

        # request and download Fast 5000
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1204])

        qid = soup.select('input#qid')[0].attrs['value']
        rurl = soup.select('input#rurl')[0].attrs['value']
        self.qid = int(qid)

        action_url = '/OutboundService.do?action=go&&'
        form_data = {
            'qid': str(self.qid),
            'SID': self.SID,
            'mark_to': '5000',
            'markTo': '5000',
        }
        form_data = sju_utiles.get_form_data(action_url, form_data)

        url = base_url + action_url

        http_res = sju_utiles.sju_post(session, url, form_data, 5, query)
        #http_res = session.post(url, form_data, verify=False)
        self.qid += 1

        # Access Denied
        if http_res.status_code == 403:
            ui_stream.push(command='res',
                           target='errQuery',
                           res={
                               'query': query,
                               'msg':
                               'Requested the citing-article download, but the server returned access denied.'
                           })
            return

        # process the Fast 5000 data
        ui_stream.push(command='log', msg=sju_CONSTANTS.STATE_MSG[1404])

        fast_5000 = http_res.content.decode('utf-8').replace('\r', '')
        fast_5000_list = fast_5000.split('\n')
        keys = fast_5000_list[0].split('\t')
        fast_5000_list = fast_5000_list[1:]
        if fast_5000_list and fast_5000_list[-1] == '':
            fast_5000_list.pop()

        article = {}
        articles = []
        for row in fast_5000_list:
            row_list = row.split('\t')
            for idx, key in enumerate(keys):
                article[key] = row_list[idx]
            article['id'] = str(random.getrandbits(8))
            articles.append(article)
            article = {}

        if self.qid > 180:
            self.set_session()

        ui_stream.push(command='res', target='fast_5000', res=articles)
        return
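
The "Fast 5000" export is plain tab-separated text whose first row holds the field keys. A self-contained sketch of the row-to-dict conversion used above, with an inline sample instead of a live download:

# inline stand-in for an actual Fast 5000 export
fast_5000 = 'AU\tTI\nKim, J\tPaper A\nLee, S\tPaper B\n'
fast_5000_list = fast_5000.replace('\r', '').split('\n')
keys = fast_5000_list[0].split('\t')
fast_5000_list = fast_5000_list[1:]
if fast_5000_list and fast_5000_list[-1] == '':
    fast_5000_list.pop()

articles = [dict(zip(keys, row.split('\t'))) for row in fast_5000_list]
print(articles)
# [{'AU': 'Kim, J', 'TI': 'Paper A'}, {'AU': 'Lee, S', 'TI': 'Paper B'}]
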
Example #5
def parse_paper_data(target_content, paper_data_id, search_type, SID_name):
    """
        페이지 정보를 입력받아 정리한 내용을 리스트로 반환하는 함수
        :param target_content: 페이지 내용
        :param paper_data_id: 랜덤 값 ID (10자리)
        :type: single, dupl search인지 판단
        :return: 페이지 정보, 인용 수 반환
    """
    soup = BeautifulSoup(target_content, 'html.parser')

    if search_type == "single":
        # number of search results
        pagination_btn = soup.select('a.paginationNext')

        # exit immediately if there are no results
        if not pagination_btn:
            raise sju_exceptions.NoPaperDataError()

        pagination_btn_alt = pagination_btn[0].attrs['alt']
        # exit immediately if there is more than one result
        # ('비활성' is the Korean UI label for the Inactive state)
        # and pagination_btn_alt.find('비활성') == -1
        if pagination_btn_alt.find('Inactive') == -1:
            raise sju_exceptions.MultiplePaperDataError()

    # paper title
    title = soup.select('div.title')[0].text.replace('\n', '')

    # ISSN
    ISSN = soup.select('p.sameLine')
    if ISSN:
        ISSN = ISSN[0].value.contents[0]
    else:
        ISSN = ''

    # grades
    grades = []
    caped_grades = []
    box_label = soup.select('span.box-label')
    for label in box_label:
        if label.text.find('- ') != -1:
            temp = label.text.replace('- ', '')
            grades += [temp]
            caped_grades += [re.sub(r'[ a-z]+', r'', temp)]

    # citation count and link
    cnt_link = soup.select('a.snowplow-citation-network-times-cited-count-link')
    if not cnt_link:
        times_cited = '0'
    else:
        cnt_link = cnt_link[0]
        times_cited = cnt_link.span.text

    # journal name
    journal_name = soup.select('span.sourceTitle')
    journal_name = journal_name[0].text.replace('\n','')

    # other fields
    correction_form = soup.find(
        action='http://ips.clarivate.com/cgi-bin/forms/wok_datachange/wok-proc.pl')
    if not correction_form:
        correction_form = soup.find(
            action='https://support.clarivate.com/ScientificandAcademicResearch/s/datachanges')
    if not correction_form:
        # the data-change form carries most of the metadata; bail out here
        # rather than crash on .find_all below
        raise sju_exceptions.NoPaperDataError()
    correction_form_inputs_by_name = {}
    for inputTag in correction_form.find_all('input'):
        inputDict = inputTag.attrs
        correction_form_inputs_by_name[inputDict['name']] = inputDict['value']
    
    doc_type = ''
    published_month = ''
    research_areas = ''
    publisher = ''
    language = ''
    reprint = ''
    authors = []
    fr_authors = []
    fr_addresses = []
    for fr_field in soup.select('p.FR_field'):
        if fr_field.text.find('Document Type:') != -1:
            doc_type = fr_field.text.split(':')[1]
        
        if fr_field.text.find('Published:') != -1:
            published_month = fr_field.text.split(':')[1]

        if fr_field.text.find('Research Areas:') != -1:
            research_areas = fr_field.text.split(':')[1]

        if fr_field.text.find('Publisher ') != -1:
            publisher = ' '.join(fr_field.text.split(' ')[1:])
            publisher = publisher.split(',')

        if fr_field.text.find('Language:') != -1:
            language = fr_field.text.split(':')[1]
            
        if fr_field.text.find('Reprint Address:') != -1:
            reprint = fr_field.text.split(':')[1].replace('\n', '').strip()

        if fr_field.text.find('By:') != -1:
            fr_authors = fr_field

        # if fr_field.text.find('Addresses:') != -1:
        #     if fr_field.text.find('E-mail') != -1:
        #         continue
        #     fr_addresses = fr_field.nextSibling
            
    addresses = {}


    # (NEWPART) impact factor and percentile

    # extract (publication year - 1) from the published-month string
    incite_published_month = published_month.strip()
    incite_published_month = re.findall(
        r'2[0-9][0-9][0-9]|19[0-9][0-9]', incite_published_month)[0]
    incite_published_month = str(int(incite_published_month) - 1)
    
    # IF/percentile [Step 1]: detail page -> InCites page URL
    publish_id = soup.find("a", {"id": "HS_JCRLink"})
    publish_id = publish_id['onclick']
    publish_id = publish_id[publish_id.find('toPID') + 6:publish_id.find('cacheurl') - 1]
    
    ISSN_name = str(ISSN)

    jr_name = journal_name.replace(" ", "%20")

    JRC_url = get_incite_form(SID_name, publish_id, ISSN_name, jr_name)

    # IF/percentile [Step 2]: parse the InCites page (first pass)
    try:
        ua = UserAgent()
        new_user_agent = {'User-Agent': ua.random}
        r = requests.Session()
        http_res = r.get(JRC_url, headers=new_user_agent)

        # [Step 2]-1: parse the impact factor (second pass)

        # look up the InCites journal name
        incite_jr_name = "https://jcr.clarivate.com/SearchJournalsJson.action?query=" + ISSN_name
        #incite_jr_name = "https://jcr.incites.thomsonreuters.com/SearchJournalsJson.action?query=" + ISSN_name
        http_incite_jr = r.get(incite_jr_name)
        http_incite_jr_text = http_incite_jr.text

        incite_edition_name = http_incite_jr_text[http_incite_jr_text.find('edition') + 10:http_incite_jr_text.find('jcrCoverageYears') - 3]
        incite_jr_name = http_incite_jr_text[http_incite_jr_text.find('abbrJournal') + 14:http_incite_jr_text.find('journalTitle') - 3]
        incite_jr_name = incite_jr_name.replace(' ', '%20')
        
        #base_json_url = "https://jcr.incites.thomsonreuters.com/JournalProfileGraphDataJson.action?abbrJournal=" + incite_jr_name
        base_json_url = "https://jcr.clarivate.com/JournalProfileGraphDataJson.action?abbrJournal=" + incite_jr_name
        base_json_url += "&edition=" + incite_edition_name + "&page=1&start=0&limit=25&sort=%5B%7B%22property%22%3A%22year%22%2C%22direction%22%3A%22DESC%22%7D%5D"
        http_incite_if = r.get(base_json_url)

        findall_if = 'year":"' + incite_published_month + '.{700,800}'
        http_incite_if_text = re.findall(findall_if, http_incite_if.text)[0]

        # impact factor
        findall_if = 'journalImpactFactor":"[0-9]{0,10}.{0,1}[0-9]{1,10}",'
        impactFactor_one = re.findall(findall_if, http_incite_if_text)[0]
        impactFactor_one = impactFactor_one[impactFactor_one.find(':') + 2:-2]

        # 5-year impact factor
        findall_if = 'fiveYearImpactFactor":("[0-9]{0,10}.{0,1}[0-9]{1,10}"|null),"'
        impactFactor_two = re.findall(findall_if, http_incite_if_text)[0]
        if impactFactor_two == "null":
            impactFactor_two = "None"
        else:
            impactFactor_two = impactFactor_two[1:-1]

        # [Step 2]-2: parse the percentile (second pass)
        #base_json_url = "https://jcr.incites.thomsonreuters.com/JCRImpactFactorJson.action?&abbrJournal=" + incite_jr_name
        base_json_url = "https://jcr.clarivate.com/JCRImpactFactorJson.action?&abbrJournal=" + incite_jr_name
        base_json_url += "&edition=" + incite_edition_name
        http_incite_per = r.get(base_json_url)
        # note: ast.literal_eval only works here as long as the payload
        # contains no bare JSON literals such as null or true
        http_incite_per_LIST = ast.literal_eval(http_incite_per.text)
        http_incite_per_LIST = http_incite_per_LIST['data']
        
        # JCR rank
        ranks = []
        temp = []
        jcr = []
        good_rank = ''
        for PER_LIST in http_incite_per_LIST:
            if PER_LIST['year'] == str(incite_published_month):
                test = str(PER_LIST)
                find_per = r'.{1,3}/.{1,3}-Q[0-9]'
                JCRS = re.findall(find_per, test)
                for JCR in JCRS:
                    JCR_P = JCR[JCR.find("'") + 1:JCR.find("-")]
                    temp = [JCR_P]
                    jrank, jall = map(int, JCR_P.split('/'))
                    temp.append(round(jrank / jall * 100, 2))
                    ranks.append(temp)
                    jcr.append('num')
                    jcr.append(temp)

                # the lowest percentile is the best rank
                if ranks:
                    good_rank = max(ranks, key=lambda x: -x[-1])[-1]
        
        """
         # JCR 랭크
        JCR_Category_table = soup.select('table.JCR_Category_table')
        jcr_headers = []
        jcr = []
        ranks = []
        good_rank = ''
        trs = []
        if len(JCR_Category_table) > 0: 
            JCR_Category_table = JCR_Category_table[0]
            trs = JCR_Category_table.find_all('tr')
            if trs:
                jcr.append([x.text.strip() for x in trs[0].find_all('th')])
                for tr in trs[1:]:
                    temp = [x.text.strip() for x in tr.find_all('td')]
                    jcr.append(temp)
                    jrank, jall = map(int, temp[1].split(' of '))
                    temp.append(round(jrank/jall  * 100, 2))
                    ranks.append(temp)
        
            good_rank = max(ranks, key=lambda x: -x[-1])[-1]
        """
        """
        이전방식 백분율 정규표현식
        #findall_if = '("year":"' + incite_published_month + '".{10,200}},{"year|"year":"' + incite_published_month + '".{10,200}})'
        #("year":"2002".{10,200}},{"year|"year":"2002".{10,200}}|,{.{10,200},"year":"2002",.{10,500}"},)
        #findall_if = '("year":"' + incite_published_month + '".{10,200}},{"year|"year":"' + incite_published_month + '".{10,200}}|,{.{10,200},"year":"' + incite_published_month + '",.{10,400}"},|,{".{10,200},"year":"'+ incite_published_month +'",.{10,50}},)'
        #findall_if = '("year":"'+ incite_published_month + '".{10,100}},{"year|"year":"' + incite_published_month + '".{10,200}},{"year|"year":"' + incite_published_month + '".{10,200}}|,{".{10,200},"year":"' + incite_published_month +'",.{10,50}},{"|,{.{10,200},"year":"' + incite_published_month + '",.{10,400}"},)'

        #findall_if = '("year":"' + incite_published_month + '"(.{10,100}|.{10,200})},{"year|"year":"' + incite_published_month + '".{10,200}}|,{.{10,200},"year":"' + incite_published_month + '",(.{10,50}|.{10,100}|.{10,200}|.{10,300}|.{10,400})"},)'
        #http_incite_per_text = re.findall(findall_if, http_incite_per.text)
        
        #findall_if = '("year":"' + incite_published_month + '"(.{10,100}|.{10,200})},{"year|"year":"' + incite_published_month + '".{10,200}}|,{.{10,200},"year":"' + incite_published_month + '",(.{10,50}|.{10,100}|.{10,200}|.{10,300}|.{10,400})"},)'
        #http_incite_per_text = re.findall(findall_if, http_incite_per.text)
        """
        
    except:
        # on any failure, fall back to placeholder values
        impactFactor_one = "Except"
        impactFactor_two = "Except"
        good_rank = ''
        jcr = []

    impact_factor = {}
    if impactFactor_one and impactFactor_two:
        if impactFactor_one == "Except" and impactFactor_two == "Except":
            impact_factor[incite_published_month] = "None"    
            impact_factor['5 year'] = "None"
        else:
            impact_factor[incite_published_month] = impactFactor_one
            impact_factor['5 year'] = impactFactor_two
    else:
        impact_factor = {}

    # close the InCites session
    r.close()

    # authors and research institutions
    fconts = fr_authors.select('a')
    fr_authors_text = fr_authors.text.replace('\n', '')
    fr_authors_text = fr_authors_text.split(':')[1].split(';')

    # full names
    full_name = {}
    for fa in fr_authors_text:
        fa_match = None
        p_count = fa.count('(')
        if p_count > 1:
            fa_match = re.search(r'(.+) \((.+)\(.+\)\)', fa)
        elif p_count == 1:
            fa_match = re.search(r'(.+) \((.+)\)', fa)
        if fa_match:
            # str.replace takes literal text, not a regex, so strip the
            # parentheses with re.sub
            full_name[fa_match.group(1).strip()] = re.sub(
                r'[()]', '', fa_match.group(2)).strip()
    
    target_author = ''
    tauthor_address = []
    for con in fconts:
        isSub = con.get('href').find('javascript') != -1
        if not isSub:
            if target_author != '':
                addresses[target_author] = tauthor_address
                if target_author in full_name:
                    addresses[full_name[target_author]] = tauthor_address
            tauthor_address = []
            target_author = con.text.strip()
            authors += [target_author]
        else:
            addressId = re.sub(r'.+\'(.+)\'.+', r'\1', con.get('href'))
            temp = soup.find('a', id=addressId)
            if temp is not None:
                # tauthor_address += [temp.contents[0]]
                tauthor_address += [temp.text]

    if target_author != '':
        addresses[target_author] = tauthor_address
        if target_author in full_name:
            addresses[full_name[target_author]] = tauthor_address
    if reprint == '':
        reprint = 'None'

    paperData = {
        'id' : paper_data_id,
        # 'authors' : correction_form_inputs_by_name['00N70000002C0wa'].split(';'),
        'authors' : authors,
        'full_name' : full_name,
        'fr_authors_text' : fr_authors_text,
        'firstAuthor': authors[0],
        'addresses' : addresses,
        'authorsCnt' : str(len(correction_form_inputs_by_name['00N70000002C0wa'].split(';')) - 1),
        'doi' : correction_form_inputs_by_name['00N70000002n88A'],
        'volume' : correction_form_inputs_by_name['00N70000002Bdnt'],
        'issue' : correction_form_inputs_by_name['00N700000029W18'],
        'pages' : correction_form_inputs_by_name['00N70000002C0vh'],
        'published' : correction_form_inputs_by_name['00N70000002BdnY'],
        'publishedMonth' : published_month,
        'publisher' : publisher,
        'journal_name' : journal_name,
        # 'title' : correction_form_inputs_by_name['00N70000002BdnX'],
        'title' : title,
        'impact_factor' : impact_factor,
        'prevYearIF' : 'None',
        'goodRank' : good_rank,
        'timesCited' : times_cited,
        'grades' : grades,
        'capedGrades' : caped_grades,
        'docType' : doc_type,
        'researchAreas' : research_areas,
        'language' : language,
        'reprint' : reprint,
        'jcr' : jcr,
        'citingArticles' : [],
        'issn' : ISSN,
    }
    paperData['ivp'] = ['%s/%s' % (paperData['issue'], paperData['volume']), paperData['pages']]

    # previous year's impact factor
    if incite_published_month in impact_factor.keys():
        paperData['prevYearIF'] = impact_factor[incite_published_month]

    return paperData, cnt_link
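
The (NEWPART) block scrapes the JCR JSON endpoints with regular expressions. When the payload is valid JSON, the same lookup can be done more robustly with json.loads. A sketch under the assumption that the response carries a per-year list like the one below; the field names (year, journalImpactFactor, fiveYearImpactFactor) are taken from the regexes above, but the overall payload shape is a guess:

import json

# hypothetical stand-in for the JournalProfileGraphDataJson response body
payload = ('{"data": [{"year": "2018", "journalImpactFactor": "3.21",'
           ' "fiveYearImpactFactor": null}]}')
records = json.loads(payload)['data']
incite_published_month = '2018'

for rec in records:
    if rec['year'] == incite_published_month:
        impactFactor_one = rec['journalImpactFactor']
        impactFactor_two = rec['fiveYearImpactFactor'] or 'None'
        print(impactFactor_one, impactFactor_two)  # 3.21 None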