Example #1
import time

import click
import requests
from crossref_commons.iteration import iterate_publications_as_json


def works_with_prefix(prefix):
    # List every Crossref work under a DOI prefix and resolve its doi.org redirect.
    filter_ = {'prefix': prefix}
    for p in iterate_publications_as_json(max_results=100000, filter=filter_):
        if 'URL' in p:
            click.echo(p['URL'] + "\t", nl=False)
            r = requests.head(p['URL'])
            # Print the redirect target, or just end the line if there is none.
            click.echo(r.headers['Location'] if r.is_redirect else '')
        time.sleep(0.2)  # be polite to the Crossref API
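A minimal driver for this function, assuming the imports above; the prefix value is only an illustrative placeholder, so substitute the publisher prefix you actually care about:

if __name__ == "__main__":
    works_with_prefix("10.5555")  # illustrative DOI prefix, not from the original code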
Example #2
import random
import time

from crossref_commons.iteration import iterate_publications_as_json


def getPapersInfo(papers, scholar_search_link, restrict, scholar_results):
    # Paper, similarStrings and getBibtex are assumed to come from the
    # surrounding project; only the Crossref lookup is shown here.
    papers_return = []
    num = 1
    for paper in papers:
        # Only process up to the requested number of Scholar results.
        if num > scholar_results:
            break
        title = paper['title']
        queries = {
            'query.bibliographic': title.lower(),
            'sort': 'relevance',
            'select': 'DOI,title,deposited,author,short-container-title'
        }

        print("Searching paper {} of {} on Crossref...".format(num, scholar_results))
        num += 1

        found_timestamp = 0
        paper_found = Paper(title, paper['link'], scholar_search_link,
                            paper['cites'], paper['link_pdf'],
                            paper['year'], paper['authors'])
        while True:
            try:
                for el in iterate_publications_as_json(max_results=30,
                                                       queries=queries):
                    el_date = 0
                    if "deposited" in el and "timestamp" in el["deposited"]:
                        el_date = int(el["deposited"]["timestamp"])

                    # Keep the most recently deposited record whose title is
                    # close enough to the Scholar title.
                    if ((paper_found.DOI is None or el_date > found_timestamp)
                            and "title" in el
                            and similarStrings(title.lower(),
                                               el["title"][0].lower()) > 0.75):
                        found_timestamp = el_date

                        if "DOI" in el:
                            paper_found.DOI = el["DOI"].strip().lower()
                        if "short-container-title" in el and len(
                                el["short-container-title"]) > 0:
                            paper_found.jurnal = el["short-container-title"][0]

                        if restrict is None or restrict != 1:
                            paper_found.setBibtex(getBibtex(paper_found.DOI))

                break
            except ConnectionError:
                print("Connection error: waiting 10 seconds before retrying...")
                time.sleep(10)

        papers_return.append(paper_found)

        time.sleep(random.randint(1, 10))  # random pause between Crossref queries

    return papers_return
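This example and the nearly identical Example #4 below both revolve around a query.bibliographic search with a select field list. A self-contained sketch of just that lookup, using an arbitrary example title rather than anything from the original projects:

from crossref_commons.iteration import iterate_publications_as_json

queries = {
    'query.bibliographic': 'attention is all you need',  # illustrative title
    'sort': 'relevance',
    'select': 'DOI,title,deposited,author,short-container-title',
}

for el in iterate_publications_as_json(max_results=5, queries=queries):
    deposited = el.get('deposited', {}).get('timestamp', 0)
    print(el.get('DOI'), (el.get('title') or ['?'])[0], deposited)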
Example #3
from crossref_commons.iteration import iterate_publications_as_json


def get_data(count=5):
    # Restrict the sample to journal articles that carry an abstract.
    filter_ = {"has-abstract": "true", "type": "journal-article"}
    queries = {}  # e.g. {'query': 'machine learning'}

    try:
        publications = iterate_publications_as_json(max_results=count,
                                                    filter=filter_,
                                                    queries=queries)
    except Exception:
        print("There was an error accessing the Crossref API")
    else:
        data = []
        datasource = ("Crossref API", )
        datasource_url = "https://api.crossref.org/"

        for p in publications:
            # if p['language'] != 'en':
            #     continue

            abstract = p["abstract"]
            authors = []
            for author in p.get("author", []):
                authors += [(author.get("given", "") + " "
                             + author.get("family", "")).strip()]

            # Crossref returns 'title' as a list; keep the first entry if present.
            titles = p.get("title", [])
            title = titles[0] if titles else ""

            links = list(p.get("link", []))

            # TODO: find most relevant link in list
            ref = "" if len(links) == 0 else links[0]

            # TODO: extract keywords from pdf (in link) if available
            keywords = []

            data += [{
                "title": title,
                "abstract": abstract,
                "keywords": keywords,
                "author": authors,
                "ref": ref,
                "datasource": datasource,
                "datasource_url": datasource_url,
            }]

        return data
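Assuming the function above, a quick smoke test that prints the harvested titles could look like:

if __name__ == "__main__":
    records = get_data(count=3)
    for record in records or []:
        print(record["title"], "->", record["ref"] or "no link")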
Example #4
import random
import time

from crossref_commons.iteration import iterate_publications_as_json


def getPapersInfo(papers, scholar_search_link, restrict):
    # Paper, similarStrings and getBibtex come from the surrounding project.
    papers_return = []
    num = 1
    for paper in papers:
        title = paper[0].lower()
        queries = {
            'query.bibliographic': title,
            'sort': 'relevance',
            'select': 'DOI,title,deposited,author,short-container-title'
        }

        print("Searching paper {} of {} on Crossref...".format(num, len(papers)))
        num += 1

        found_timestamp = 0
        paper_found = Paper(title, paper[1], scholar_search_link, paper[2],
                            paper[3])
        for el in iterate_publications_as_json(max_results=30, queries=queries):
            el_date = 0
            if "deposited" in el and "timestamp" in el["deposited"]:
                el_date = int(el["deposited"]["timestamp"])

            # Prefer the most recently deposited record with a close title match.
            if ((paper_found.DOI is None or el_date > found_timestamp)
                    and "title" in el
                    and similarStrings(title, el["title"][0].lower()) > 0.75):
                found_timestamp = el_date

                if "DOI" in el:
                    paper_found.DOI = el["DOI"].strip().lower()
                if "short-container-title" in el and len(
                        el["short-container-title"]) > 0:
                    paper_found.jurnal = el["short-container-title"][0]

                if restrict is None or restrict != 1:
                    paper_found.setBibtex(getBibtex(paper_found.DOI))

        papers_return.append(paper_found)

        time.sleep(random.randint(1, 10))  # random pause between Crossref queries

    return papers_return
Example #5
    def _get_bibtex_crossref(self) -> None:
        """Internal helper that fetches the BibTeX entry from Crossref and
        determines whether the publication exists and is unique.

        Note:
            Results are cached. The entry itself is stored in ``self._bibtex``
            rather than returned.
        """
        # Map the available metadata pieces onto Crossref query fields.
        queries = dict(
            zip(["query.author", "query.title", "query.bibliographic"], self.pieces)
        )
        queries["sort"] = "relevance"

        iter_pub = iterate_publications_as_json(queries=queries)
        try:
            doi = next(iter_pub)["DOI"]
            self._exists = True
        except StopIteration:
            self._exists = False
            return
        try:
            next(iter_pub)
            self._is_unique = False
        except StopIteration:
            self._is_unique = True

        # This is almost correct! We just need to change the citation key to
        # self.key
        raw_bibtex: str = get_publication_as_refstring(doi, "bibtex")

        # Here we assume the first line is always
        # @something {OLD_CITATION,
        # Replace OLD_CITATION with citation
        # Use { and , to find and replace
        a, b = raw_bibtex.split("{", 1)
        self._bibtex = a + "{" + self.key + "," + b.split(",", 1)[1]

        # Remove leading whitespace if necessary
        if self._bibtex[0] == " ":
            self._bibtex = self._bibtex[1:]
        return
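The two crossref_commons calls used here can also be exercised on their own. A minimal sketch, assuming an illustrative title query and citation key (neither comes from the original class):

from crossref_commons.iteration import iterate_publications_as_json
from crossref_commons.retrieval import get_publication_as_refstring

queries = {"query.bibliographic": "deep residual learning", "sort": "relevance"}
try:
    doi = next(iterate_publications_as_json(max_results=1, queries=queries))["DOI"]
except StopIteration:
    doi = None

if doi is not None:
    raw_bibtex = get_publication_as_refstring(doi, "bibtex")
    # Swap the citation key Crossref generated for our own key, reusing the
    # split-on-"{" and split-on-"," trick from the method above.
    head, tail = raw_bibtex.split("{", 1)
    print((head + "{" + "mykey," + tail.split(",", 1)[1]).lstrip())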
Example #6
import requests
from crossref_commons.iteration import iterate_publications_as_json


def call_from_front_end(NAME):
    # heroku, enter_name_here, process and ar_manipulation are assumed to be
    # provided by the surrounding application.
    if not heroku:
        scholar_link = str(
            'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C3&q=') + str(
                NAME)
        #for link in scholar_link:
        #    st.text(link)

        _, _, ar = enter_name_here(scholar_link, NAME)

    if heroku:
        filter_ = {'type': 'journal-article'}
        queries = {'query.author': NAME}
        ar = []
        bi = [
            p for p in iterate_publications_as_json(
                max_results=50, filter=filter_, queries=queries)
        ]
        for p in bi[0:9]:
            # Ask Unpaywall for an open-access PDF location for each DOI.
            res = str('https://api.unpaywall.org/v2/') + str(
                p['DOI']) + str('?email=YOUR_EMAIL')
            response = requests.get(res).json()
            temp = response['best_oa_location']['url_for_pdf']

            #temp=str('https://unpaywall.org/'+str(p['DOI']))
            #st.text(temp)
            urlDat = process(temp)
            if urlDat is not None:
                ar.append(urlDat)
                #st.text(urlDat)

    (ar, trainingDats) = ar_manipulation(ar)
    '''
    with open('data/traingDats.p','rb') as f:
        trainingDats_old = pickle.load(f)
    trainingDats.extend(trainingDats_old)
    with open('data/traingDats.p','wb') as f:
        pickle.dump(trainingDats,f)
    '''
    return ar
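A stripped-down sketch of the Crossref-plus-Unpaywall lookup used in the heroku branch; the author name is illustrative, the YOUR_EMAIL placeholder must be replaced with a real contact address, and best_oa_location can be null, so it is guarded here:

import requests
from crossref_commons.iteration import iterate_publications_as_json

for p in iterate_publications_as_json(max_results=3,
                                      filter={'type': 'journal-article'},
                                      queries={'query.author': 'Jane Doe'}):
    meta = requests.get('https://api.unpaywall.org/v2/' + p['DOI']
                        + '?email=YOUR_EMAIL').json()
    oa = meta.get('best_oa_location') or {}
    print(p['DOI'], oa.get('url_for_pdf') or 'no OA PDF')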