Code example #1
from crossref.restful import Works


def search_doi(journal_title, start_date, end_date, print_issn, online_issn, count):
    """
    Collect a defined number of DOIs into a list.
    If no DOIs are found using the online ISSN, fall back to the print ISSN.

    :param journal_title: The title of the journal
    :param start_date: The start date in the format yyyy-mm-dd
    :param end_date: The end date in the format yyyy-mm-dd
    :param print_issn: The International Standard Serial Number for the print journal
    :param online_issn: The International Standard Serial Number for the online journal
    :param count: How many doi the method should try to find
    :return: A list of doi, this can be empty, but should be no larger than count
    """
    works = Works()
    received_doi = []

    if print_issn == '' and online_issn == '':
        return 'Both ISSNs are empty'

    if online_issn != '':  # online ISSN exists
        for i in works.query(journal_title).filter(
                issn=online_issn,
                from_pub_date=start_date,
                until_pub_date=end_date).sample(count).select('DOI'):
            received_doi.append(i['DOI'])

    if print_issn != '':  # print ISSN exists
        if not received_doi:
            for j in works.query(journal_title).filter(
                    issn=print_issn,
                    from_pub_date=start_date,
                    until_pub_date=end_date).sample(count).select('DOI'):
                received_doi.append(j['DOI'])

    return received_doi
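
A minimal usage sketch for search_doi, using hypothetical journal and ISSN values; the returned list may be empty if nothing matches the filters.

# Hypothetical values for illustration only
dois = search_doi(
    journal_title='Journal of Example Studies',  # hypothetical title
    start_date='2020-01-01',
    end_date='2020-12-31',
    print_issn='',               # no print ISSN available
    online_issn='1234-5678',     # hypothetical online ISSN
    count=5,
)
print(dois)  # up to `count` DOIs, or an empty list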
Code example #2
    def query_crossref(self, author=None, pub=None):
        from crossref.restful import Works
        works = Works()

        if author is not None and pub is not None:
            return works.query(title=pub, author=author)
        elif author is not None:
            return works.query(author=author)
        elif pub is not None:
            return works.query(title=pub)
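
The method above returns a lazy crossrefapi query (or None when both arguments are omitted). A hedged consumption sketch, assuming obj is an instance of the surrounding class:

# Hypothetical usage of query_crossref
results = obj.query_crossref(author="Jane Doe", pub="A hypothetical title")
if results is not None:
    for item in results:
        print(item.get('DOI'), item.get('title'))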
Code example #3
from crossref.restful import Works


def cross(L):
    works = Works()
    print("no")
    for e in works.query("cancer"):
        print("yes")
        print(e["DOI"])
        L.append(e["DOI"])
Code example #4
File: views.py  Project: yellowshower/VisRef
def search(request):
    keyword = request.GET.get('search')
    if keyword is None:
        return render(request, 'search.html', {})
    else:
        find_keyword = Sea.objects.filter(keyword__iexact=keyword)
        l = len(find_keyword)
        if l != 0:
            print('Record found in the Sea database')
            return render(request, 'search.html', {
                'results': find_keyword,
                'keyword': keyword
            })
        else:
            print('No record found in the Sea database')
            works = Works()
            w1 = works.query(title=keyword)
            for index, item in enumerate(w1):
                if (index > 499):
                    break
                result = searchDo.parse_result(keyword, item)
                # save to the database
                sea_record = Sea(**result)
                sea_record.save()
                print(str(index) + ' record is ok')
            find_keyword = Sea.objects.filter(keyword__iexact=keyword)
            return render(request, 'search.html', {
                'results': find_keyword,
                'keyword': keyword
            })
Code example #5
File: extract.py  Project: wangvei/papers
def fetch_bibtex_by_fulltext_crossref(txt, **kw):
    work = Works(etiquette=my_etiquette)
    logger.debug(six.u('crossref fulltext search:\n') + six.u(txt))

    # get the most likely match of the first results
    # results = []
    # for i, r in enumerate(work.query(txt).sort('score')):
    #     results.append(r)
    #     if i > 50:
    #         break
    query = work.query(txt, **kw).sort('score')
    query_result = query.do_http_request('get',
                                         query.url,
                                         custom_header=str(
                                             query.etiquette)).text
    results = json.loads(query_result)['message']['items']

    if len(results) > 1:
        maxscore = 0
        result = results[0]
        for res in results:
            score = _crossref_score(txt, res)
            if score > maxscore:
                maxscore = score
                result = res
        logger.info('score: ' + str(maxscore))

    elif len(results) == 0:
        raise ValueError('crossref fulltext: no results')

    else:
        result = results[0]

    # convert to bibtex
    return crossref_to_bibtex(result).strip()
Code example #6
import json

import requests
from crossref.restful import Works


def get_doi(title, author):
    works = Works()
    work = works.query(bibliographic=title, author=author).url
    response = requests.get(work)
    json_response = json.loads(response.text)
    try:
        document = json_response["message"]["items"][0]
        return document.get(
            "URL"
        )  # print(document["DOI"], document["title"], document["URL"])
    except IndexError:
        return None
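
A hypothetical call to get_doi; note that despite its name the function returns the work's URL field, or None when the query yields no items.

# Hypothetical usage
url = get_doi("A hypothetical paper title", "Jane Doe")
print(url)  # e.g. a doi.org URL, or None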
Code example #7
def data(request):

    if request.method == 'POST':
        form = SimpleForm(request.POST)
        if form.is_valid():
            #query = input('Enter the query to be searched: ')
            query = form.cleaned_data.get("enterUrl")
            parameter_values_list = [1, 10, '9ipXPomYaSrHLAIuONZfzUGk3t57RcBD']
            response = requests.get(edited_search_coreAPI(query, parameter_values_list))
            # response = requests.get(edited_search_coreAPI(form.enterUrl, parameter_values_list))
            content = response.json()
            works = Works()
            w1 = works.query(container_title='zika', author='johannes', publisher_name='Wiley-Blackwell')
            for item in w1:
                print(item['title'])


            print(content)

            print(type(content))

            context = {
                'form': form,
                'content': content
            }

            messages.success(request, 'Your Url has been generated')

            return redirect("query",data=str(content))
            # return render(request,'users/query.html', {'content': [content]})
            #return render(json.dumps(content,sort_keys=True, indent=4),'users/query.html', content_type="application/json"))
            #return (HttpResponse(json.dumps(content,sort_keys=True, indent=4), content_type="application/json"))


            # print(lists[0])

        #     form.save()

        else:
            messages.error(request, 'Wrong Url')
            return render(request, 'users/query.html', {'form': form})

    else:
        form = SimpleForm()
        return render(request, 'users/data.html', {'form': form})
Code example #8
File: app.py  Project: jcheong0428/paperwiki
def search():
    """
    Uses Crossref API to search documents.
    """
    queries = {}
    for key in ['author', 'words', 'doi']:
        val = request.form[key]
        if len(val) > 0:
            queries[key] = request.form[key]
        else:
            queries[key] = None
    # Init API and query
    works = Works()
    articles_q = []
    if queries['doi']:
        articles_q = [works.doi(doi=queries['doi'])]
    else:
        articles_q = works.query(bibliographic=queries['words'],
                                 author=queries['author']).sample(20)
    # Check if article is in database already and wiki exists
    articles = []
    for article in articles_q:
        articles.append(article)
        doi = article['DOI']
        search_result = mongo.db.paperwiki.find_one({"DOI": doi})
        if search_result:
            if 'content' in search_result.keys():
                article['actionurl'] = "see_wiki?id=" + doi
                article['wiki_exists'] = True
            else:
                article['actionurl'] = "create_wiki?id=" + doi
                article['wiki_exists'] = False
        else:
            insert_id = mongo.db.paperwiki.insert_one(article)
            article['actionurl'] = "create_wiki?id=" + doi
            article['wiki_exists'] = False
    context = {"docs": articles}
    resp = render_template("home.html", docs=articles)
    return resp
Code example #9
def crossrefAPI_query(keyword: str) -> Dict:
    '''This function takes a keyword str and sends a corresponding GET request to the CrossRef API.
    A normalized version of the first (most 'relevant') result is returned.'''
    article_dict = False
    works = Works()
    # If there is a timeout, try again (5 times)
    for _ in range(5):
        try:
            result = works.query(keyword).sort("relevance")
            for entry in result:
                # Take first result
                article_dict = entry
                break
            break  # query succeeded, stop retrying
        except Exception:
            pass
    else:
        # all attempts failed
        return
    if article_dict:
        #article_dict = normalize_crossref_dict(article_dict)
        #if contains_minimal_information(article_dict):
        article_dict = add_retrieval_information(article_dict, 'Crossref',
                                                 'unstructured_ID', keyword)
        return article_dict
Code example #10
def find_meta(title, doi):
    """ find metadata with title or doi
    Keyword Arguments:
    title --
    doi   --
    """
    ylog.info(title)
    works = Works()
    w1 = works.query(title).sort('relevance').order('desc')
    i = 0
    for item in w1:
        i = i + 1
        try:
            t = item.get('title')[0]
            sub_title = item.get('subtitle')[0]
        except:
            continue
        if SequenceMatcher(a=title, b=t).ratio() > 0.9 or SequenceMatcher(
                a=title, b=sub_title).ratio() > 0.9:
            return item
        if i > 18:
            ylog.debug('[x]%s' % title)
            # ylog.debug(item['title'])
            return None
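
A hypothetical call to the function above, assuming ylog is configured as in code example #13; note that the doi argument is accepted but not used in the body shown, so only the title drives the search.

# Hypothetical usage
item = find_meta("A hypothetical article title", doi=None)
if item is not None:
    print(item['DOI'])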
Code example #11
#!/usr/bin/python3

paper_title = "The Emotional Voices Database: Towards Controlling the Emotion Dimension in Voice Generation Systems"

from crossref.restful import Works

works = Works()
w1 = works.query(author="birkan kolcu").sample(5)

for item in w1:
    print(item['title'])
Code example #12
            text = title + text.get_text()
            title = text
        os.mkdir(path + str(index) + 'figure\\')
        for fig in obj_figure:  # figures
            fig_url = 'https://onlinelibrary.wiley.com' + fig['src']
            r = requests.get(fig_url, stream=True)
            image_name = fig_url[-20:].split('/')
            with open(path + str(index) + 'figure\\' + image_name[0],
                      'wb') as ff:
                for chunk in r.iter_content(chunk_size=128):
                    ff.write(chunk)

        file = open(path + str(index) + title + '.txt',
                    mode='w+',
                    encoding='UTF-8')
        file.write(abstract + content)
        file.close()
        driver.close()


index = 0

for i in works.query(bibliographic='mof',
                     publisher_name='Wiley-Blackwell').filter(
                         from_online_pub_date='2017').sample(10):
    index += 1
    acquire_text(i['URL'], index)
    print(i['URL'])

#URL = 'http://dx.doi.org/10.1002/aoc.4820'
#acquire_text(URL,index)
Code example #13
from pdfminer.pdfdocument import PDFDocument
from crossref.restful import Works
from PyPDF2 import PdfFileReader
import re
from ylib import ylog
import logging
from difflib import SequenceMatcher
import bibtexparser
ylog.set_level(logging.DEBUG)
ylog.console_on()
ylog.filelog_on("app")

works = Works()

title = """Heterogeneous resistance to vancomycin in Staphylococcus epidermidis, Staphylococcus haemolyticus and Staphylococcus warneri clinical strains: characterisation"""
w1 = works.query(title).sort('relevance').order('desc')
i = 0
target_doi = '10.1109/icdcs.2006.48'
items_result = None
for item in w1:
    i = i + 1
    try:
        t = item.get('title')[0]
        sub_title = item.get('subtitle')[0]
        ylog.debug('crossref item title ')
        ylog.debug(t)
        ylog.debug(sub_title)
    except:
        ylog.debug(item)
        continue
    if SequenceMatcher(a=title, b=t).ratio() > 0.8:
Code example #14
    def meta_data_search(self,
                         search_terms,
                         save_path,
                         publisher_name='Wiley'):
        """
        A method to collect the metadata from crossref based on specific search terms passed in.
        The results are filtered down to a specific publisher. It then saves all the URLs as a
        single list. This function writes full_publist to a json file, which contains a list of 
        meta-data for the articles identified in the query.
        
        Parameters:
            search_terms (list of str): List of search terms used to identify articles to be added
                in the corpus
            
            save_path (str): Absolute or relative filepath leading to the directory where
                the fulltexts will be saved
                
            publisher_name (str): The name of the publisher that is to be queried for articles.
                This must correspond to a publisher within the CrossRef database.
                
        Returns:
            full_publist (pd.DataFrame): A dataframe that contains a list of articles and their
                DOI's and URL's
        """

        works = Works()
        members = Members()

        #query publisher for articles with given search terms
        chem_subset = works.query(search_terms)
        pub = next(iter(members.query(publisher_name)))
        pub_id = pub['id']
        chem_subset = chem_subset.filter(member=pub_id)
        doc_number = chem_subset.count()
        headers = {
            'CR-Clickthrough-Client-Token': self.clickthrough,
            "User-Agent": self.agent,
            "Connection": 'close'
        }

        url = chem_subset.url + "&select=DOI,link&rows=1000&mailto=" + self.mailto + "&cursor="

        #starting cursor value. It will be updated in the loop with each request we make.
        cursor = '*'
        url_list = []
        doi_list = []
        saved_docs = 0

        #make the first request before entering the while loop
        response = requests.get(url + cursor, headers=headers).json()

        while len(response['message']['items']) > 0:

            #Add the total number of papers from the response to the saved docs list.
            saved_docs += len(response['message']['items'])
            pcnt_comp = 100 * saved_docs / doc_number
            print(f"{pcnt_comp:.3f}% complete")

            #for every entry in the response, loop through each entry.
            for entries in response['message']['items']:

                #Check to see if the response item has a link in it
                keycheck = True
                try:
                    entries['link']
                except KeyError:
                    keycheck = False

                if keycheck:
                    #If the link exists, then append article meta-data
                    URL = entries['link'][0]['URL']
                    DOI = entries['DOI']

                    #Update all http to https
                    if URL[:5] != "https" and URL[:4] == 'http':
                        URL = 'https' + URL[4:]

                    #Check to see if the URL format is correct. If yes, add it,
                    #otherwise you don't add it to the list.
                    if URL[8] == 'a':
                        url_list.append(URL)
                        doi_list.append(DOI)

            #Build a dataframe of URLs and DOIs from our requests. Save checkpoint
            full_publist = pd.DataFrame()
            full_publist['URL'] = url_list
            full_publist['DOI'] = doi_list
            full_publist.to_json(save_path + 'wiley_meta_list.json')

            #update the cursor, and make a new request for a new response.
            cursor = response['message']['next-cursor']
            cursor = cursor.replace("+", "%2B")
            response_nojson = requests.get(url + cursor, headers=headers)

            #check the status code from this response
            if response_nojson.status_code == 200:

                response = response_nojson.json()

            else:
                print("Status code is bad! The response code is: " +
                      str(response_nojson.status_code))

                break

        #Build a dataframe from the full list of URLs and DOIs from our requests
        full_publist['URL'] = url_list
        full_publist['DOI'] = doi_list

        #Save the final list after dropping duplicates
        full_publist = full_publist.drop_duplicates()
        full_publist.to_json(save_path + 'wiley_meta_list.json')

        return full_publist
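
A hypothetical call to meta_data_search, assuming scraper is an instance of the surrounding class with clickthrough, agent, and mailto attributes set (they are used to build the request headers and URL above):

# Hypothetical usage
publist = scraper.meta_data_search(
    search_terms='metal organic framework',  # hypothetical search terms
    save_path='./corpus/',
    publisher_name='Wiley',
)
print(len(publist), 'articles with full-text links collected')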
Code example #15
File: Datasets.py  Project: bonej079/FAIR-Automation
        try:
            doi = driver.find_element_by_xpath("//*[contains(text(), 'doi')]")
            doi.click()
            print(driver.current_url)
            doiTitle = driver.find_element_by_tag_name("title").get_attribute("textContent")
            doi = ""
            print(doiTitle)
        except Exception:
            pass

    if not doi:
        works = Works()
        i=0
        if doiTitle:
            print(doiTitle)
            w1 = works.query(title=doiTitle).filter(type="journal-article").sort('relevance')
            for item in w1:
                i=i+1
                if item['title'][0] == doiTitle:  # Crossref returns titles as a list
                    doi = item['DOI']
                    persistent = "available"
                    break
                if i > 10:
                    break
        else:
            w1 = works.query(title=search_phrase).filter(type="journal-article").sort("relevance")
            for item in w1:
                i=i+1
                if search_phrase in item['title']:
                    doi = item['DOI']
                    persistent = "available"
Code example #16
def Getdoiplus(markupdict):
    #### Get DOI of journal article from CrossRef XML queries
    #### Uses crossref.restful Works python module
    #### Inputs are Title and First Author
    #### Output is DOI

    print 'Attempting to get DOI from CrossRef'
    works = Works()
    titlestr = markupdict['Title']
    ###Find First Author Surname
    if ' ' in markupdict['FirstAuthor']:
        dummydict = markupdict['FirstAuthor'].split(' ')
        if ',' in markupdict['FirstAuthor']:
            authorstr = dummydict[0].strip()
            authorstr = authorstr.replace(',', '', 1)
        else:
            authorstr = dummydict.pop().strip()
    else:
        authorstr = markupdict['FirstAuthor']
    if '-' in authorstr:
        authorstr = authorstr.split('-').pop()

    #print markupdict['FirstAuthor'], 'authorst=',authorstr

    ### Query does not do exact phrase matching - find word in title that returns fewest results
    titlestr = ''
    leastquerynumber = 999999999
    #print markupdict['Title'].split(' ')
    #dummy=markupdict['Title'].replace('\u\xa0','')
    for word in markupdict['Title'].split(' '):
        if len(word) > 5:
            n_results = works.query(title=word, author=authorstr).count()
            #print word, n_results
            if 0 < n_results < leastquerynumber:
                leastquerynumber = n_results  # keep the word with the fewest (but >0) results
                titlestr = word
                #print 'titlestr is', word

    if titlestr == '': titlestr = markupdict['Title']
    #print markupdict['Title'], 'titlestr=', titlestr

    DOIstr = '10.1016/s0022-3115(98)00906-4'

    print 'number of titles is:', works.query(title='Uranium dioxide',
                                              author='Bae').count()
    #print 'number of titles is:',  works.query(title=titlestr, author='Bae' ).count()

    #print 'doi no. titles:', works.query(DOI=DOIstr)
    doi = ''
    ### Perform query and find exact or partial title matches
    print 'Querying Title=%s Author=%s number of titles is: %d' % (
        titlestr, authorstr, works.query(title=titlestr,
                                         author=authorstr).count())

    for item in works.query(title=titlestr, author=authorstr):
        #for item in works.query(title='Uranium dioxide', author='Bae' ):
        #for item in works.query(DOI=DOIstr):
        #print item['title'][0]
        if markupdict['Title'].lower() == item['title'][0].lower():
            print '**** exact match ****'
            print '[1]', item['title'][0]
            print '[2]', markupdict['Title']

            doi = item['DOI']
        elif abs(
                len(item['title'][0].split(' ')) -
                len(markupdict['Title'].split(' '))
        ) == 0:  ### Word by word matching
            #dummytitle1=item['title'][0].replace('(',"",1).replace(')',"",1).replace('/',"",1).replace('.',"",1).replace(',',"",1)
            n_match = 0
            for i in range(len(item['title'][0].split(' '))):
                if item['title'][0].split(' ')[i].strip().lower(
                ) == markupdict['Title'].split(' ')[i].strip().lower():
                    n_match = n_match + 1
            #if abs(len(item['title'][0].split(' '))-n_match)<3:'
            if n_match / (len(item['title'][0].split(' ')) *
                          1.0) > 0.7 and abs(
                              len(item['title'][0].split(' ')) - n_match) < 3:
                doi = item['DOI']
                print '**** partial match ****', n_match, ' of', len(
                    item['title'][0].split(
                        ' ')), 'matches. Proportion:', n_match / (
                            len(item['title'][0].split(' ')) * 1.0)
                print '[1]', item['title'][0]
                print '[2]', markupdict['Title']

            #print 'matches=', n_match, 'total=', len(item['title'][0].split(' '))

        ##    print item['DOI']

    ##for item in works.sample(2):
    ##    print (item['title'])
    ##    print item['DOI']
    if doi == '':
        print '**** No Match found ****\n'
    else:
        pass
        #print doi

    print '*************\n'
    #print item.keys()

    return (doi)
Code example #17
def getDoiWithCrossRef(entry, my_etiquette):
    """ Get the doi of a bibtex entry thanks to crossref.

    Parameters
    ----------
    entry : BibDatabase
        The bibtex record with  missing doi.
    my_etiquette : tuple
        A record that contains all require fields to create Etiqette object.

    Returns
    -------
    doi : string
        the doi code.

    """
    # tries counter for each entry
    count = 0
    # store if a match has been found
    match = False
    # if provide create the Etiquette object
    if my_etiquette:
        etiquette = Etiquette(*my_etiquette)
        print(etiquette)
    else:
        etiquette = None

    # create crossref api instance for request
    works = Works(etiquette=etiquette)
    # convert entry to unicode for searching
    entry_unicode = bp.customization.convert_to_unicode(entry.copy())

    # Check for mandatory field
    try:
        # extract basic fields
        author1 = entry_unicode['author'].split(',')[0].strip()
        title = entry_unicode['title'].strip()
        year = entry_unicode['year'].strip()
    except Exception:
        warnings.warn("author, title and year fields are missing in entry {}\
                      ".format(entry_unicode))
        doi = None
        return doi

    w1 = works.query(author=author1,
                     bibliographic=title).filter(
                         until_pub_date=year,
                         from_pub_date=year,
                         type='journal-article').sort('score').order('desc')
    # parse the crossref record to find the "best" match
    for item in w1:
        count += 1
        # fuzzy compare
        ratio = SM(None, title, item['title'][0]).ratio()
        if ratio > TOL_MATCH:
            match = True
            break
        # limit the number of query
        if count > COUNT:
            print('  Reach maximal number of tries ({}) \
for this record  {}'.format(COUNT, entry_unicode))
            break

    if match:
        doi = item['DOI']
    else:
        print("  MISSING : {}, {}".format(entry_unicode['author'],
                                          entry_unicode['title']))
        doi = None

    return doi
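
A hypothetical call to getDoiWithCrossRef, assuming the module-level constants TOL_MATCH and COUNT and the imports used above (bibtexparser as bp, SequenceMatcher as SM, Etiquette, Works) are in place; the entry and etiquette values below are invented for illustration.

# Hypothetical bibtexparser-style entry
entry = {
    'author': 'Doe, Jane and Smith, John',
    'title': 'A hypothetical study of something',
    'year': '2020',
}
my_etiquette = ('MyTool', '0.1', 'https://example.org', 'me@example.org')
doi = getDoiWithCrossRef(entry, my_etiquette)
print(doi)  # a DOI string, or None if no sufficiently similar title was found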
Code example #18
class OutputDOIs(SqliteDataResource):
    """
    This resource is a cached set of output IDs matched to DOIs using regexes, URLs, Crossref searches, and Refindit
    searches. This resource takes several hours to update, depending on throttling from Crossref and number of threads
    available for multiprocessing.
    """
    def __init__(self, context):
        super().__init__(context, DataResource.data_dir / 'output_dois.db')
        etiquette = Etiquette(
            'SYNTH transform', '0.1',
            'https://github.com/NaturalHistoryMuseum/synth_transform',
            '*****@*****.**')
        self.works = Works(etiquette=etiquette)
        self._handled = set()
        self._added = set()
        self._errors = {}
        self._methods = {}

    @property
    def keys(self):
        return [tuple(json.loads(k)) for k in self.data.keys()]

    def mapped_items(self, new_id_map):
        """
        Transform the stored keys (tuples of (synth round, output ID)) into new IDs using a map generated during the
        rebuild process. Resource must be open.
        :param new_id_map: a dict with tuple keys and new ID values
        """
        if self.data is None:
            raise Exception('Resource is not open.')
        mapped = {}
        for k, v in self.data.items():
            try:
                new_key = new_id_map[tuple(json.loads(k))]
                mapped[new_key] = v
            except KeyError:
                continue
        return mapped

    def _search_output(self, conn, output, synth_round):
        """
        Search for a single output using title and author. Searches the Crossref API first, then ReFindIt if that
        doesn't return a suitable result. Compares the output title with each result using fuzzywuzzy and considers
        them a match if the two strings are at least 80% similar.
        :param conn: SqliteDataResource with an open SQLiteDict, e.g. 'self' within 'with self:'
        :param output: the Output instance we're attempting to find a DOI for
        :param synth_round: the round this output was recorded in
        """
        output_key = json.dumps((synth_round, output.Output_ID))
        self._handled.add(output_key)
        try:
            authors = find_names(clean_string(output.Authors) or '')
            title = output.Title.rstrip('.')
            q = self.works.query(
                author=authors,
                bibliographic=title).sort('relevance').order('desc')
            for ri, result in enumerate(q):
                result_title = result.get('title', [None])[0]
                if result_title is None:
                    continue
                similarity = fuzz.partial_ratio(result_title, title.lower())
                if similarity >= 80:
                    self._added.add(output.Output_ID)
                    conn.add(output_key, result['DOI'].upper())
                    self._methods[output_key] = 'crossref'
                    return
                if ri >= 3 - 1:
                    return
            # refindit also searches a few other databases, so try that if crossref doesn't find it
            refindit_url = 'https://refinder.org/find?search=advanced&limit=5&title=' \
                           f'{title}&author={"&author=".join(authors)}'
            refindit_response = requests.get(refindit_url)
            if refindit_response.ok:
                for ri, result in enumerate(refindit_response.json()):
                    result_title = result.get('title')
                    if result_title is None:
                        continue
                    similarity = fuzz.partial_ratio(result_title,
                                                    title.lower())
                    if similarity >= 80:
                        self._added.add(output.Output_ID)
                        conn.add(output_key, result['DOI'].upper())
                        self._methods[output_key] = 'refindit'
                        return
        except Exception as e:
            self._errors[(synth_round, output.Output_ID)] = e

    def update(self, context, target, *synth_sources):
        """
        Attempt to find a DOI for each output in the NHMOutput tables.
        """
        with self:
            super(OutputDOIs, self).update(context, target, *synth_sources)
        self._handled = set()
        self._errors = {}
        self._methods = {}

        for db_ix, synth_db in enumerate(synth_sources):
            db_ix += 1
            self._added = set()

            def _extract_doi(conn, output, col):
                output_key = json.dumps((db_ix, output.Output_ID))
                self._handled.add(output_key)
                for x in DOIExtractor.dois(getattr(output, col), fix=True):
                    doi, fn = x
                    doi_metadata = self.works.doi(doi)
                    if doi_metadata:
                        doi_title = doi_metadata.get('title', '')
                        doi_title = clean_string(doi_title[0]).lower()
                        output_title = output.Title
                        if output_title is not None:
                            output_title = clean_string(output_title.lower())
                        match = fuzz.partial_ratio(doi_title, output_title)
                        if match > 50:
                            self._added.add(output.Output_ID)
                            conn.add(output_key, doi.upper())
                            self._methods[output_key] = fn
                            break

            def _search_columns(col, *filters):
                outputs = synth_db.query(NHMOutput).filter(
                    NHMOutput.Output_ID.notin_(self._added), *filters)
                thread_workers = context.config.resource_opt(
                    'dois.threads', 20)
                with self, ThreadPoolExecutor(
                        thread_workers) as thread_executor:
                    thread_map(lambda x: _extract_doi(self, x, col),
                               outputs.all(),
                               desc=col,
                               unit=' records',
                               leave=False,
                               position=1)

            _search_columns('URL', NHMOutput.URL.isnot(None))
            _search_columns(
                'Volume',
                or_(NHMOutput.Volume.ilike('%doi%'),
                    NHMOutput.Volume.ilike('%10.%/%')))
            _search_columns(
                'Pages',
                or_(NHMOutput.Pages.ilike('%doi%'),
                    NHMOutput.Pages.ilike('%10.%/%')))

            # now for searching based on metadata
            title_and_author = synth_db.query(NHMOutput).filter(
                NHMOutput.Output_ID.notin_(self._added),
                NHMOutput.Title.isnot(None), NHMOutput.Authors.isnot(None))

            workers = context.config.resource_opt('dois.threads', 20)
            with self, ThreadPoolExecutor(workers) as executor:
                thread_map(lambda x: self._search_output(self, x, db_ix),
                           title_and_author.all(),
                           desc='Crossref',
                           unit=' records',
                           leave=False,
                           position=1)

        methods = {}
        for k, v in self._methods.items():
            methods[v] = methods.get(v, []) + [k]

        for k, v in methods.items():
            click.echo(f'{k}: {len(v)}')
Code example #19
def crossrefAPI_improved_query(parsed_ref_dict: Dict) -> Dict:
    '''
    This function takes a parsed reference dict as returned by the parsers from reference_parser.
    It uses the information given in the dict to create a cleaned up string for a Crossref keyword
    query and goes through the first 200 entries to check if the returned result overlaps with the
    parsed information and returns the result.
    '''
    article_dict = False
    works = Works()
    for _ in range(5):
        try:
            # Create clean query string
            # If everything is given
            if 'volume' not in parsed_ref_dict.keys():
                return None
            if 'authors' in parsed_ref_dict.keys():
                if 'issue' in parsed_ref_dict.keys():
                    formatted_bib_str = '{}, {}, {}, ({}), ({}), {}'.format(
                        parsed_ref_dict['authors'], parsed_ref_dict['journal'],
                        parsed_ref_dict['volume'], parsed_ref_dict['issue'],
                        parsed_ref_dict['year'], parsed_ref_dict['pages'])
                # Everything but the issue is given
                else:
                    formatted_bib_str = '{}, {}, {}, ({}), {}'.format(
                        parsed_ref_dict['authors'], parsed_ref_dict['journal'],
                        parsed_ref_dict['volume'], parsed_ref_dict['year'],
                        parsed_ref_dict['pages'])
            # Everything but author given
            elif 'issue' in parsed_ref_dict.keys():
                formatted_bib_str = '{}, {}, ({}), ({}), {}'.format(
                    parsed_ref_dict['journal'], parsed_ref_dict['volume'],
                    parsed_ref_dict['issue'], parsed_ref_dict['year'],
                    parsed_ref_dict['pages'])

            result = works.query(formatted_bib_str).sort("relevance")

            # Browse first 200 entries to check if one of the results fit
            a = 0
            try:
                for entry in result:
                    a += 1
                    if a == 200:
                        break
                    entry = add_retrieval_information(
                        entry, 'Crossref', 'Crossref_extended_query',
                        str(parsed_ref_dict))

                    normalized_dict = normalize_crossref_dict(entry)
                    #print(normalized_dict)
                    if normalized_dict:
                        if is_same_publication(parsed_ref_dict,
                                               normalized_dict):
                            article_dict = entry
                            break
            except JSONDecodeError:
                pass
            break
        except:
            pass
    if article_dict:
        return article_dict
Code example #20
class NarrativeDataset:
    LICENSE_WHITELIST = [
        'http://creativecommons.org/licenses/by/4.0/',
        'http://creativecommons.org/licenses/by/3.0/'
    ]
    download_links = dict()

    def __init__(self, reset_cache=False):
        self.journals = Journals()
        self.works = Works()
        self.filter_kwargs = dict(has_license='true', has_full_text='true')
        self.keywords = 'business financial merger entrepreneur banking insurance commerce trade economics'
        UnpywallCredentials('*****@*****.**')
        cache_path = path.join(DATA_DIR, 'unpaywall_cache')
        if reset_cache and path.exists(cache_path):
            remove(cache_path)
        self.unpywall_cache = UnpywallCache(cache_path)
        Unpywall.init_cache(self.unpywall_cache)

    def get_dois_from_journal(self, journal_issn):
        doi_list = []
        try:
            if self.journals.journal_exists(journal_issn):
                works = self.journals.works(journal_issn).filter(
                    **self.filter_kwargs).select('DOI', 'license')
                for response_dict in tqdm(works):
                    license_dict = response_dict['license']
                    if self.is_license_whitelist(license_dict[0]['URL']):
                        doi_list.append(response_dict['DOI'])
        except Exception as e:
            logger.error("Error while getting DOIs from REST service",
                         e,
                         exc_info=True)
        return doi_list

    def get_dois_from_keywords(self):
        doi_list = []
        try:
            results = self.works.query(self.keywords).filter(
                **self.filter_kwargs).select('DOI', 'license')
            for response_dict in tqdm(results):
                license_dict = response_dict['license']
                if self.is_license_whitelist(license_dict[0]['URL']):
                    doi_list.append(response_dict['DOI'])
        except Exception as e:
            logger.error("Error while getting DOIs from REST service",
                         e,
                         exc_info=True)
        return doi_list

    def get_oa_urls(self, doi_list):
        logger.info('Retrieving doc urls for DOIs now (cached/uncached)')
        oa_urls = []
        for i, doi in tqdm(enumerate(doi_list), total=len(doi_list)):
            try:
                oa_urls.append(Unpywall.get_doc_link(doi))
            except HTTPError:
                logger.warning(
                    '\nError received for DOI: {}, will retry 3 times in 20 secs'
                    .format(doi))
                sleep(20)
                for i in range(3):
                    try:
                        logger.info('Retry :{}'.format(i + 1))
                        oa_urls.append(Unpywall.get_doc_link(doi))
                        break
                    except HTTPError as e:
                        logger.error('Retry failed', e, exc_info=True)
        return oa_urls

    def is_license_whitelist(self, license):
        license = str(license).replace('https', 'http')
        return license in self.LICENSE_WHITELIST

    def retry_from_another_src(self, faulty_files_list, doi_list):
        src_dict = {'scirp': []}
        for file in faulty_files_list:
            base_name = ntpath.basename(file)
            doi_list_ind = int(base_name.replace("Sample_", "")[:-8]) - 1
            doi = doi_list[doi_list_ind]
            doc_url = Unpywall.get_pdf_link(doi)
            if doc_url is not None and 'scirp' in doc_url.lower():
                try:
                    scirp_id = doc_url[doc_url.index('paperID=') + 8:]
                except (IndexError, ValueError):
                    continue
                if scirp_id != "":
                    src_dict['scirp'].append((file, scirp_id))
        return download_frm_another_src(src_dict)

    @staticmethod
    def download_doi_pdf(works, doi_list, download_dir):
        logger.info(
            "Trying to download the required data now for {} DOIs".format(
                len(doi_list)))
        for i, doi in enumerate(doi_list):

            name_pattern = 'Sample_{}.pdf'.format(str(i + 1))
            download_link = Unpywall.get_pdf_link(doi)
            try:
                if not download_link:
                    result = works.doi(doi)['link']
                    for item in result:
                        application = item['intended-application']
                        type = item['content-type']
                        if application is not None and application == 'text-mining' and type == 'application/pdf':
                            download_link = item['URL']
                            break
                NarrativeDataset.download_links[
                    name_pattern[:-4]] = download_link
                if not path.exists(path.join(download_dir, name_pattern)):
                    if download_link and filter_url(download_link):
                        logger.debug('Downloading ' + name_pattern + " : " +
                                     doi + ' from url: ' + download_link)
                        download_pdf_file(download_link,
                                          name_pattern,
                                          download_dir,
                                          progress=True)
                        sleep(5)
            except Exception as e:
                logger.error(
                    "Error while downloading the article ({}, {})".format(
                        str(i + 1), doi),
                    e,
                    exc_info=True)
                NarrativeDataset.download_links[
                    name_pattern[:-4]] = download_link
        return True
Code example #21
import sys
import time

from crossref.restful import Works

works = Works()


def crossref_ref_by_count(doi):
    work = works.doi(doi)
    if work is None:
        return -1
    else:
        return work['is-referenced-by-count']


# print(crossref_ref_by_count("10.1108/07363760110410263"))
# print(works.query("10.1108/07363760110410263"))
# print(works.query("10.1108/07363760110410263" + "&[email protected]"))

if __name__ == "__main__":

    works.query("unable to specify mailto in works.query call&mailto=" +
                sys.argv[3])

    f = open(sys.argv[2], "w")
    f.write("doi,citedby_crossref")
    f.close()

    f = open(sys.argv[2], "a")

    for doi in sys.argv[1].split(',\n'):
        time.sleep(0.1)
        citedby = crossref_ref_by_count(doi)
        if citedby == -1:
            # Couldn't find doi, don't add it
            ()
        else:
            # Append doi to csv
Code example #22
from crossref.restful import Works
import signal, time

works = Works()


class Timeout(Exception):
    pass


def raiseTimeout(sig, frame):
    raise Timeout


signal.signal(signal.SIGALRM, raiseTimeout)
signal.alarm(2)

array = []

try:
    for e in works.query("cancer"):
        array.append(e["DOI"])  # list has no .add(); append is correct
except Timeout:
    print("took too long")

print(array)
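
The SIGALRM-based timeout above only works on Unix. A hedged alternative is to bound the amount of work instead of the wall-clock time, e.g. by taking a small random sample as the other examples here do:

from crossref.restful import Works

works = Works()
dois = []
# Take a bounded random sample instead of iterating an unbounded result set
for e in works.query("cancer").sample(5):
    dois.append(e["DOI"])
print(dois)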
Code example #23
'''


##< Input section
key_words = 'nanozyme'  # search keyword
path = 'c:\\Users\\asdqw\\Desktop\\Get_Literature\\' + key_words + '\\'  # output directory
date = '2000'  # start year
num = 100  # number of articles
## End of input >
try:
    os.mkdir(path)
except:
    pass
index = 0  # article index
metalist = works.query(
    bibliographic=key_words,
    publisher_name='Wiley-Blackwell').filter(from_online_pub_date=date)
count = works.query(bibliographic=key_words,
                    publisher_name='Wiley-Blackwell').filter(
                        from_online_pub_date=date).count()
print('Total number of articles: ' + str(count))
print('Finished Crossref metadata search')
for i in metalist:  #.sample(num):
    index += 1
    print('Current progress: ' + str(index) + '/' + str(count + 1))
    try:
        acquire_text(i, index, path)
    except:
        print('Download failed')
        file = open(path + 'Failed' + str(index) + '.txt',
                    mode='w+',
Code example #24
File: getdoi.py  Project: bougui505/getdoi
# Author: Guillaume Bouvier -- [email protected]
# https://research.pasteur.fr/en/member/guillaume-bouvier/
# 2021-01-21 09:52:56 (UTC+0100)

from crossref.restful import Works

if __name__ == '__main__':
    import argparse
    # argparse.ArgumentParser(prog=None, usage=None, description=None, epilog=None, parents=[], formatter_class=argparse.HelpFormatter, prefix_chars='-', fromfile_prefix_chars=None, argument_default=None, conflict_handler='error', add_help=True, allow_abbrev=True, exit_on_error=True)
    parser = argparse.ArgumentParser(description='')
    # parser.add_argument(name or flags...[, action][, nargs][, const][, default][, type][, choices][, required][, help][, metavar][, dest])
    parser.add_argument(
        '-s',
        '--search',
        type=str,
        required=True,
        help=
        "Query bibliographic information, useful for citation look up. Includes titles, authors, ISSNs and publication years"
    )
    args = parser.parse_args()

    works = Works()
    w = works.query(bibliographic=args.search)
    for item in w:
        title = item['title'][0]
        doi = item['DOI']
        print(f"title: {title}")
        print(f"doi: {doi}")
        print()
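
A hypothetical invocation of the script above from the command line (the search string is passed via the required -s/--search flag):

python getdoi.py -s "some title words and an author surname"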
Code example #25
    def find_meta(self, identifier):
        """ find metadata with title or DOI
        Keyword Arguments:
        identifier --
        """
        try:
            # verify=False is dangerous but sci-hub.io
            # requires intermediate certificates to verify
            # and requests doesn't know how to download them.
            # as a hacky fix, you can add them to your store
            # and verifying would work. will fix this later.
            url = self.base_url + identifier['article_link']
            self.sess.headers = {'user-agent': self.get_random_user_agent()}
            res = self.sess.get(url, verify=False, allow_redirects=False)
            re_bracket = re.compile(r"\[(.*?)\]\s")  # raw string avoids invalid-escape warnings
            title = re.sub(re_bracket, "", identifier['name'])
            ylog.debug('*' * 80)
            ylog.debug("title: %s" % title)
            ylog.debug(res.status_code)
            # self.out.ix[title]['status_code'] = res.status_code
            ylog.debug("headers: %s" % res.headers['Content-Type'])
            ylog.debug('location: %s' % res.headers.get("Location"))
            # self.out.ix[title]['location'] = res.headers.get("Location")
            search_title = True
            if not res.headers.get("Location"):
                content = res.content
                if len(content) > 2:
                    import cchardet
                    charset = cchardet.detect(content)
                    text = content.decode(charset['encoding'])
                    soup = BeautifulSoup(text, "lxml")
                    script = soup.script.get_text()
                    doi_regexp = '10[.][0-9]{4,}(?:[.][0-9]+)*/(?:(?!["&\'<>])\S)+'
                    try:
                        doi_match = re.compile(doi_regexp).findall(script)[0]
                        ylog.info("DOI: %s" % doi_match)
                        search_title = False
                        # use crossref API to get metadata
                        works = Works()
                        w1 = works.query(doi_match).sort('relevance').order(
                            'desc')
                        i = 0
                        for item in w1:
                            # TODO: verify title
                            # self.out.ix[title]['DOI'] = item['DOI']
                            return {'meta': item['DOI'], 'url': url}
                    except IndexError:
                        ylog.debug('failed to find regexp')
            elif search_title:
                works = Works()
                w1 = works.query(title).sort('relevance').order('desc')
                i = 0
                for item in w1:
                    i = i + 1
                    try:
                        # ylog.debug('crossref item title ')
                        t = item.get('title')[0]
                        # ylog.debug(t)
                        sub_title = item.get('subtitle')[0]
                        # ylog.debug(sub_title)
                        # ylog.debug("ratio: %s" %
                        #            (SequenceMatcher(a=title, b=t).ratio()))
                    except TypeError:
                        sub_title = ''
                    if SequenceMatcher(
                            a=title, b=t).ratio() > 0.9 or SequenceMatcher(
                                a=title, b=sub_title).ratio(
                                ) > 0.9 or t.startswith(title):
                        ylog.debug("DOI %s" % item['DOI'])
                        # self.out.ix[title]['DOI'] = item['DOI']
                        return {'meta': item['DOI'], 'url': url}
                    if i > 18:
                        # ylog.debug('[x]%s' % title)
                        # ylog.debug(item['title'])
                        return None

        except requests.exceptions.ConnectionError:
            logger.info('{} cannot acess,changing'.format(
                self.available_base_url_list[0]))
            self._change_base_url()

        except requests.exceptions.RequestException as e:

            return {
                'err':
                'Failed to fetch pdf with identifier %s (resolved url %s) due to request exception.'
                % (identifier, url)
            }
Code example #26
File: doi_tools.py  Project: Svdvoort/BibClean
def get_doi(entry, config):
    has_doi = bib_parser.has_doi(entry)
    my_etiquette = Etiquette(constants.PROJECT_NAME, constants.VERSION,
                             constants.URL, constants.EMAIL)
    max_levenshtein_distance = config.get_max_levenshtein_distance()
    update_URL = config.get_update_URL()

    works = Works(etiquette=my_etiquette)

    if not has_doi and bib_parser.has_url(entry):
        entry_url = bib_parser.get_url(entry)
        if "doi" in entry_url:
            doi = cleaner.clean_doi(entry_url)

            if is_crossref_work(doi):
                crossref_info = works.doi(doi)
                if crossref_is_similar(crossref_info, entry,
                                       max_levenshtein_distance):
                    entry = set_doi(entry, doi, update_URL)
                    has_doi = True

    if not has_doi:
        # we try to find the doi for the title
        entry_title = bib_parser.get_title(entry)
        entry_title = cleaner.clean_braces(entry_title)
        author = bib_parser.get_author(entry)
        first_author = splitname(author[0], strict_mode=False)
        first_author_last_name = first_author["last"][0]

        query_parameters = {
            "author": first_author_last_name,
            "bibliographic": entry_title
        }

        works_query = works.query(**query_parameters)
        works_query = works_query.sort("score").order("desc").select(
            ["title", "DOI"])
        i_i_item = 0
        max_items = min(works_query.count(), 10)
        works_results = iter(works_query)
        while i_i_item < max_items and not has_doi:
            i_item = next(works_results)
            if crossref_is_similar(i_item, entry, max_levenshtein_distance):
                doi = cr_parser.get_doi(i_item)
                entry = set_doi(entry, doi, update_URL)
                has_doi = True
            i_i_item += 1
    else:
        # We check to see if the doi is correct
        doi = bib_parser.get_doi(entry)
        doi = cleaner.clean_doi(doi)
        if is_crossref_work(doi):
            crossref_info = works.doi(doi)

            if crossref_is_similar(crossref_info, entry,
                                   max_levenshtein_distance):
                entry = set_doi(entry, doi, update_URL)
            else:
                entry.pop("doi", None)
                if "doi" in bib_parser.get_url(entry):
                    entry.pop("url", None)
                has_doi = False

        else:
            entry = set_doi(entry, doi, update_URL)

    return entry, has_doi