import csv

from bs4 import BeautifulSoup

import SessionInitializer


def overcite_an(path):
    # Read (title, citation count) rows from a CSV export, then tally how
    # often each author, journal, and publisher appears on Google Scholar.
    titles = []

    with open(path, "r", encoding='latin1') as file:
        reader = csv.reader(file)
        for idx, line in enumerate(reader):
            if idx > 0:  # skip the header row
                t = [line[1].strip(), line[2]]
                if t[0] != 'Paper Title' and t[0] != 'Total' and t[1].isdigit():
                    titles.append([t[0], int(t[1])])

    # Keep only papers with at least 25 citations, then drop duplicates.
    titles = [t for t in titles if t[1] >= 25]
    print(len(titles))
    titles = dedupe(titles)
    print(len(titles))

    # Total citations across the remaining papers.
    s = sum(t[1] for t in titles)
    print(s)

    session = SessionInitializer.getSesh()
    headers = SessionInitializer.getHeaders()
    final_dict = {'authors': {}, 'journals': {}, 'publishers': {}}

    for t in titles:
        paper_name = t[0]
        query = "+".join(paper_name.split())

        url = SessionInitializer.ROOT_URL + '/scholar?q=' + query + '&btnG=&hl=en&as_sdt=0%2C5'
        print(url)
        response = session.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')
        try:
            # The 'gs_a' div reads "authors - journal, year - publisher".
            info_list = soup.find('div', attrs={'class': 'gs_a'}).text
        except AttributeError:
            print("can't find result for " + paper_name)
            continue
        info_list = [i.strip() for i in info_list.split(' - ')]

        authors = info_list[0].split(',')
        journal = info_list[1].split(',')[0]
        publisher = info_list[2]

        for a in authors:
            add_freq_dict(final_dict['authors'], a)

        add_freq_dict(final_dict['journals'], journal)
        add_freq_dict(final_dict['publishers'], publisher)
        print(publisher)
    print(final_dict)
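
# overcite_an depends on two helpers not shown in this snippet. The sketches
# below are hypothetical reconstructions inferred from the call sites above,
# not the original implementations:

def dedupe(titles):
    # Drop repeated titles, keeping the first [title, citations] pair seen.
    seen = set()
    unique = []
    for title, cites in titles:
        if title not in seen:
            seen.add(title)
            unique.append([title, cites])
    return unique


def add_freq_dict(freq, key):
    # Increment the count for key in a {name: count} frequency dict.
    key = key.strip()
    freq[key] = freq.get(key, 0) + 1
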
    def __init__(self, link, loadPdf=True):
        # Paper metadata placeholders, filled in by loadFromGoogleScholar.
        self.__url = link
        self.__pdfObj = None
        self.__pap_info = {}
        self.__pap_info['Publisher'] = ''
        self.__citedByUrl = None
        self.__citedByNum = 0
        self.__allAuthors = None

        #Internet Session Setup
        self.session = SessionInitializer.getSesh()
        self.headers = SessionInitializer.getHeaders()

        # Fetch and parse the paper's Google Scholar entry.
        self.loadFromGoogleScholar(loadPdf=loadPdf)
    def __init__(self, mainUrl, numPapers, loadPaperPDFs=True, pubFilter=False):

        # Author profile placeholders, filled in by loadPapers.
        self.first_name = None
        self.last_name = None
        self.url = None
        self.__paper_list = []

        #Internet Session Setup
        self.session = SessionInitializer.getSesh()
        self.headers = SessionInitializer.getHeaders()

        if mainUrl is not None:
            self.url = mainUrl
            self.loadPapers(numPapers, loadPaperPDFs=loadPaperPDFs, pubFilter=pubFilter)
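
# Nearly every snippet on this page calls into a small SessionInitializer
# module. The sketch below is a reconstruction from the call sites (getSesh,
# getHeaders, ROOT_URL), not the original module, which may differ in its
# headers or proxy configuration:

import requests

ROOT_URL = 'https://scholar.google.com'  # assumed Scholar base URL


def getSesh():
    # A shared requests.Session keeps cookies across Scholar requests.
    return requests.Session()


def getHeaders():
    # A browser-like User-Agent reduces the chance of being blocked.
    return {'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64)'}
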
def downloadSpringerOpen(path):
    try:
        pdfTag = ch.find_element_by_xpath("//p[@class='SideBox_action']/a[text()='Download PDF']")
        pdfLink = pdfTag.get_attribute('href')
        # Rearrange the URL: everything up to and including the ';' before
        # 'jwcn' moves to the end, then any trailing '?site=' query is cut.
        p1idx = pdfLink.find(';jwcn')
        pdfLink = pdfLink[p1idx + 1:] + pdfLink[:p1idx + 1]
        extidx = pdfLink.find('?site=')
        if extidx != -1:
            pdfLink = pdfLink[:extidx]

    except selenium.common.exceptions.NoSuchElementException:
        print('SpringerOpen link has no PDF in the v1 layout, trying v2...')
        try:
            pdfTag = ch.find_element_by_xpath("//p[@class='u-marginBtmM']/a[text()='Download PDF']")
            pdfLink = pdfTag.get_attribute('href')
        except selenium.common.exceptions.NoSuchElementException:
            print('SpringerOpen link has no PDF in the v2 layout either, returning None...')
            return None

    print(pdfLink)
    session = SessionInitializer.getSesh()
    r = session.get(pdfLink, stream=True)

    if r.status_code == 200:
        # Stream the response body straight to disk.
        with open(path, 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
        return 1

    print('ERROR: watlib pdf springeropen was not downloaded correctly')
    return None
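
# The module-level download helpers (downloadSpringerOpen above, downloadMdpi
# and downloadScholarPortal below) assume a shared Selenium WebDriver named
# ch plus a few module imports. A hypothetical setup sketch, matching the
# pre-Selenium-4 API that the find_element_by_xpath calls imply:

import shutil

import selenium.common.exceptions
from selenium import webdriver

import SessionInitializer

ch = webdriver.Chrome()  # shared browser instance driven by the helpers
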
    def downloadIEEE(self, href, path):
        href.click()
        session = SessionInitializer.getSesh()
        resp = session.get(self.ch.current_url)
        src = BeautifulSoup(resp.content, 'lxml').text

        # Pull the value of the '"pdfUrl"' key out of the page's inline JSON.
        idx = src.find('"pdfUrl"')
        src = src[idx:]
        idx = src.find(':')
        idx2 = src.find(',')
        src = src[idx + 1:idx2].strip().strip('"')
        url = 'http://ieeexplore.ieee.org.proxy.lib.uwaterloo.ca' + src
        print(url)
        resp2 = session.get(url)
        wrapperPage = BeautifulSoup(resp2.content, 'lxml')

        # The PDF is served inside a frameset; take the frame whose src is
        # an absolute URL.
        frames = wrapperPage.findAll('frame')
        srcFrame = None
        for frame in frames:
            if frame.get('src') and 'http' in frame['src']:
                srcFrame = frame['src']

        if srcFrame:
            return self.downloadPdfLink(srcFrame, path, 'IEEE')
        else:
            return None
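
# A worked example of the '"pdfUrl"' extraction above, run on a made-up
# fragment of page source (the value is illustrative only):

sample = '{"title":"...","pdfUrl":"/stamp/stamp.jsp?arnumber=123456","x":1}'
idx = sample.find('"pdfUrl"')
sample = sample[idx:]
print(sample[sample.find(':') + 1:sample.find(',')].strip().strip('"'))
# -> /stamp/stamp.jsp?arnumber=123456
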
def count_journal_frequency(author, num_papers):

    author.loadPapers(num_papers, loadPaperPDFs=False)
    print("Author fully loaded. Processing loaded papers...")
    pap_arr = []

    try:
        # Resume from paper 45; earlier papers were handled in a prior run.
        for idx, paper in enumerate(author.getPapers()[45:]):
            time.sleep(10)  # throttle requests to avoid Scholar blocking
            info_list = []
            one_pap_arr = []
            cited_by_url = paper.getCitedByUrl()
            session = SessionInitializer.getSesh()

            # Rebuild the "cited by" URL from its paper code so the first
            # three result pages (10 results each) can be fetched.
            url_part_one = SessionInitializer.ROOT_URL + '/scholar?start='
            url_part_two = '&hl=en&as_sdt=0,5&sciodt=0,5&cites='
            cited_by_url = cited_by_url[:cited_by_url.rfind('&')]
            paper_code = cited_by_url[cited_by_url.rfind('=') + 1:]

            for i in range(0, 30, 10):
                time.sleep(10)
                final_url = url_part_one + str(i) + url_part_two + paper_code
                print(final_url)
                response = session.get(final_url)
                soup = BeautifulSoup(response.content, "lxml")
                info_list += soup.findAll('div', attrs={'class': 'gs_a'})

            # Tally how often each journal appears among the citing papers.
            journal_dict = {}

            for info_str in info_list:
                # The 'gs_a' div reads "authors - journal, year - publisher".
                info_str = info_str.text
                info_str = info_str.split(' - ')[1].split(',')[0].replace('…', '').strip()
                if info_str.isdigit():
                    continue  # a bare year means no journal was listed

                info_str = info_str.lower().title()  # normalize capitalization
                if info_str in journal_dict:
                    journal_dict[info_str] += 1
                else:
                    journal_dict[info_str] = 1

            one_pap_arr.append(paper.getInfo()['Title'])
            one_pap_arr.append(journal_dict)
            print(one_pap_arr)
            pap_arr.append(one_pap_arr)
            print('Paper ' + str(idx) + ' complete.')
    except KeyboardInterrupt:
        print('User ended program, returning journal array')

    print(pap_arr)
    return pap_arr
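
# A hypothetical invocation of count_journal_frequency. The __init__ shown
# earlier (mainUrl, numPapers, ...) is assumed to belong to an Author-style
# class; the class name and URL below are placeholders, not from the
# original code:
#
#   author = Author('https://scholar.google.com/citations?user=...', 50,
#                   loadPaperPDFs=False)
#   journal_counts = count_journal_frequency(author, 50)
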
    def downloadPdfLink(self, link, path, source):
        print(link)
        session = SessionInitializer.getSesh()
        r = session.get(link, stream=True)

        if r.status_code == 200:
            # Stream the raw response body straight to disk.
            with open(path, 'wb') as f:
                r.raw.decode_content = True
                shutil.copyfileobj(r.raw, f)
            return 1

        print('Error: ' + source + ' pdf was not downloaded correctly.')
        return None
def downloadMdpi(path):
    try:
        pdfTag = ch.find_element_by_xpath("//li/a[text()='Full-Text PDF']")
        pdfLink = pdfTag.get_attribute('href')
    except selenium.common.exceptions.NoSuchElementException:
        print('MDPI link has no PDF, returning None...')
        return None

    print(pdfLink)
    session = SessionInitializer.getSesh()
    r = session.get(pdfLink, stream=True)

    if r.status_code == 200:
        with open(path, 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
        return 1

    print('ERROR: watlib pdf MDPI was not downloaded correctly')
    return None
def downloadScholarPortal(href, path):
    href.click()
    try:
        pdfxmlTag = ch.find_element_by_xpath("//div[@class='download-btn']/a[text()='PDF Download']")
        pdfxmllink = pdfxmlTag.get_attribute('href')
    except selenium.common.exceptions.NoSuchElementException:
        print('Racer or invalid link only, no Scholars Portal PDF; returning None...')
        return None

    print(pdfxmllink)

    session = SessionInitializer.getSesh()
    r = session.get(pdfxmllink, stream=True)

    if r.status_code == 200:
        with open(path, 'wb') as f:
            r.raw.decode_content = True
            shutil.copyfileobj(r.raw, f)
        return 1

    print('ERROR: watlib pdf scholarsportal was not downloaded correctly')
    return None
    def __init__(self):
        #Internet Session Setup
        self.session = SessionInitializer.getSesh()
        self.headers = SessionInitializer.getHeaders()
'''
Created on Jan 05, 2016

@author: Ankai
'''
from bs4 import BeautifulSoup
import time
from ReferenceParser import IeeeReferenceParser, SpringerReferenceParser, PaperReferenceExtractor, PdfObj
import SessionInitializer
import WatLibSeleniumParser

SESSION = SessionInitializer.getSesh()
WATPARSER = WatLibSeleniumParser.WatLibParser()


class Paper:
    def __init__(self, link, loadPaperPDFs=True):
        self.url = link
        self.pdfObj = None
        self.pap_info = {}
        #self.__pap_info['Publisher'] = ''
        self.citedByUrl = None
        self.citedByNum = 0

        # Populate paper metadata (and optionally the PDF) from Scopus.
        self.loadFromScopus(loadPaperPDFs=loadPaperPDFs)

    def loadFromScopus(self, loadPaperPDFs=True):
        response = SESSION.get(self.url)
        soup = BeautifulSoup(response.content, 'lxml')