def index(self): """ By Default Flask will come into this when we run the file. * The can be accessed by 125.0.0.5000/api?doi= * User can either search for a particular doi or can get all the data available in the database """ if 'doi' in request.args: global doi_s doi_s = str(request.args['doi']) else: return 'Error:' global out_db if doi_s == 'all': out_db = database.read_all() return redirect(url_for('ApiView:display_all')) try: doi.validate_doi(doi_s) domain = doi.get_real_url_from_doi(doi_s) except ValueError: return 'Invalid doi' doi_temp = database.check([doi_s]) if doi_temp: scrap = scrape.Scrape() scrap.scrape([domain], current_app.config['DICT_OF_SPIDERS']) out_db = database.read([doi_s]) return redirect(url_for('ApiView:display_all'))
def main(f, failed_out, new_out, log_out):
    logging.basicConfig(filename=log_out, level=logging.INFO)
    logging.getLogger().addHandler(logging.StreamHandler())
    new_docs = []
    failed_docs = []
    try:
        with open(f) as fd:
            docs = list(yaml.load_all(fd, Loader=yaml.SafeLoader))
        N = len(docs)
        for K, d in enumerate(docs):
            affs = has_affiliation(d)
            success_rate = 100.0 - float(len(failed_docs)) / N * 100
            # Documents that already carry affiliations pass through untouched.
            if affs:
                new_docs.append(d)
                continue
            try:
                url = doi.get_real_url_from_doi(d['doi'])
                logging.info(
                    "{c.Fore.CYAN}[{s:.1f}%]: {K}/{N}. {t}>>"
                    "{c.Fore.YELLOW}Trying {0}{c.Style.RESET_ALL}"
                    .format(url, s=success_rate, K=K, N=N,
                            t=time.ctime(), c=colorama))
                downs = papis.downloaders.get_matching_downloaders(url)
                downs[0].fetch_data()
                ctx = downs[0].ctx
                affs = has_affiliation(ctx.data)
                if affs:
                    d['author_list'] = ctx.data['author_list']
                else:
                    raise Exception('No affiliations')
            except Exception as e:
                d['_error_msg'] = str(e)
                logging.info("{c.Fore.RED}\tFailed ({e}){c.Style.RESET_ALL}"
                             .format(e=e, c=colorama))
                failed_docs.append(d)
            else:
                logging.info("{c.Fore.GREEN}\tsuccess "
                             "{affs!s:.100}{c.Style.RESET_ALL}"
                             .format(affs=affs, c=colorama))
                new_docs.append(d)
    except Exception as e:
        logging.error(e)
    finally:
        # Write out both the failed and the updated documents, even if
        # the run was interrupted part-way through.
        with open(failed_out, 'w+') as fd:
            logging.info('writing ' + failed_out)
            yaml.dump_all(list(failed_docs), fd, allow_unicode=True,
                          default_flow_style=False)
        with open(new_out, 'w+') as fd:
            logging.info('writing ' + new_out)
            yaml.dump_all(list(new_docs), fd, allow_unicode=True,
                          default_flow_style=False)
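# `has_affiliation` is used above but not shown. A plausible minimal
# implementation, assuming each document carries an 'author_list' of dicts
# in which affiliated authors have a non-empty 'affiliation' entry. This is
# an assumption about the data layout, not the project's actual helper.
def has_affiliation(doc):
    """Return the author list if every author has an affiliation, else None."""
    authors = doc.get('author_list') or []
    if authors and all(a.get('affiliation') for a in authors):
        return authors
    return None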
def test_get_real_url_from_doi() -> None:
    data = [
        ('10.1016/S0009-2614(97)04014-1',
         'https://www.sciencedirect.com/science/'
         'article/abs/pii/S0009261497040141'),
    ]
    for doi, url in data:
        assert url == get_real_url_from_doi(doi)
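# For context: a DOI resolves to its article page via the redirect that
# doi.org issues for it. A minimal sketch of the idea behind
# get_real_url_from_doi, assuming network access; the real implementation
# in the doi module may differ.
import urllib.request

def resolve_doi(doi_string):
    """Follow the doi.org redirect chain and return the final article URL."""
    req = urllib.request.Request('https://doi.org/' + doi_string,
                                 headers={'User-Agent': 'Mozilla/5.0'})
    with urllib.request.urlopen(req) as resp:
        # urlopen follows HTTP redirects; the response URL is the target.
        return resp.geturl()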
def search_doi(self): """ Get Input from the user , validate and return the bibliographical details After clicking the submit button or search button , flask comes here. The values of DOI 's are obtained from the user either as string separated by comma or as a json or csv file. Uploaded files are saved in the Upload folder. The DOI 's are parsed and saved as a list , removed the duplicate ones. Validated the DOI 's by checking the correct format of DOI provided by DOI.org . The url link is obtained from doi.get_url_from_doi(doi). Check the database for the details for each doi. If DOI 's are not present in the database, the domains are saved as a list and Scrape object is called. The data corresponds to the DOI 's are obtained. :return: html page containing the bibliographical data """ from project_doi import database global out_db, doi_s list_doi = [] if request.method == 'POST': if 'doi' in request.form: list_doi = request.form['doi'].split(',') if 'file' in request.files: file = request.files['file'] if file and self.allowed_file(file.filename): filename = secure_filename(file.filename) extension = file.filename.rsplit('.', 1)[1].lower() path = os.path.join(current_app.config['UPLOAD_FOLDER'], filename) file.save(path) list_doi = self.upload_contents(extension, path) else: flash('Please upload only csv and json formats') list_doi = list(dict.fromkeys(list_doi)) doi_s = list_doi.copy() domain = {} for i in list_doi: try: doi.validate_doi(i) domain[i] = doi.get_real_url_from_doi(i) except ValueError: flash(f'{i} : is not valid , please try again') doi_s.remove(i) if doi_s is None: return redirect(url_for('DOIView:index')) doi_temp = database.check(doi_s) if doi_temp: doi_ = doi_temp domains = [domain[i] for i in doi_ if i in domain] doi_temp.clear() scrap = scrape.Scrape() scrap.scrape(domains, app.config['DICT_OF_SPIDERS']) out_db = database.read(doi_s) return render_template("search/search_doi.html", context=out_db)
def index(self):
    if 'doi' in request.args:
        global doi_s
        doi_s = str(request.args['doi'])
    else:
        return 'Error: no doi field provided'
    global out_db
    # 'all' is a special value that dumps the whole database.
    if doi_s == 'all':
        out_db = database.read_all()
        return redirect(url_for('ApiView:display_all'))
    try:
        doi.validate_doi(doi_s)
        domain = doi.get_real_url_from_doi(doi_s)
    except ValueError:
        return 'Invalid doi'
    # Scrape only if database.check reports this DOI as missing.
    doi_temp = database.check([doi_s])
    if doi_temp:
        scrap = scrape.Scrape()
        scrap.scrape([domain], current_app.config['DICT_OF_SPIDERS'])
    out_db = database.read([doi_s])
    return redirect(url_for('ApiView:display_all'))
def search_doi(self):
    global out_db, doi_s
    list_doi = []
    if request.method == 'POST':
        if 'doi' in request.form:
            list_doi = request.form['doi'].split(',')
        if 'file' in request.files:
            file = request.files['file']
            if file and self.allowed_file(file.filename):
                filename = secure_filename(file.filename)
                extension = file.filename.rsplit('.', 1)[1].lower()
                path = os.path.join(current_app.config['UPLOAD_FOLDER'],
                                    filename)
                file.save(path)
                list_doi = self.upload_contents(extension, path)
            else:
                flash('Please upload only csv and json formats')
    # Drop duplicates while preserving order.
    list_doi = list(dict.fromkeys(list_doi))
    # Copy before validating: removing invalid DOIs from the same list
    # that is being iterated over would skip elements.
    doi_s = list_doi.copy()
    domain = {}
    for i in list_doi:
        try:
            doi.validate_doi(i)
            domain[i] = doi.get_real_url_from_doi(i)
        except ValueError:
            flash(f'{i} : is not valid, please try again')
            doi_s.remove(i)
    if not doi_s:
        return redirect(url_for('DOIView:index'))
    # Scrape only the DOIs that database.check reports as missing.
    doi_temp = database.check(doi_s)
    if doi_temp:
        domains = [domain[i] for i in doi_temp if i in domain]
        doi_temp.clear()
        scrap = scrape.Scrape()
        success = scrap.scrape(domains, app.config['DICT_OF_SPIDERS'])
        if success:
            for i in success:
                print('scraped successfully:', i)
    out_db = database.read(doi_s)
    return render_template("search/search_doi.html", context=out_db)
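# `allowed_file` is used in both search_doi variants but not shown. A
# minimal sketch consistent with the flash message above, which permits
# only csv and json uploads (an assumption; the project may keep the
# allowed set in its config instead):
ALLOWED_EXTENSIONS = {'csv', 'json'}

def allowed_file(filename):
    """True if the filename has an extension accepted for upload."""
    return ('.' in filename
            and filename.rsplit('.', 1)[1].lower() in ALLOWED_EXTENSIONS)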