def search_allengines(query, headers, _pages, _gs_pages, _acm_pages, _els_pages, records,
                      _title, _keyword, _abstract, _search_yr, _from_yr, _to_yr_,
                      logging_flag, data):
    # Search all engines
    try:
        # --- Engines for Title, Keyword and Abstract ---
        search_googleScholar(query, headers, _gs_pages, records, _title, _keyword, _abstract,
                             scrpr_api, _from_yr, _to_yr_, logging_flag, data)
        search_msAcademic(query, headers, _pages, records, _title, _keyword, _abstract,
                          ms_api, _from_yr, _to_yr_, logging_flag, data)
        search_core(query, headers, _pages, records, _title, _keyword, _abstract,
                    core_api, _search_yr, logging_flag, data)
        search_pubMed(query, headers, _pages, _title, _keyword, _abstract,
                      _from_yr, _to_yr_, logging_flag, data)
        search_acmlibrary(query, headers, _acm_pages, records, _title, _keyword, _abstract,
                          _from_yr, _to_yr_, logging_flag, data)

        # --- Engines only for Keyword and Abstract ---
        search_PlosOne(query, headers, _pages, records, _title, _keyword, _abstract,
                       _from_yr, _to_yr_, logging_flag, data)
        search_academia(query, headers, _pages, records, _title, _keyword, _abstract,
                        _search_yr, logging_flag, data)
        search_scopus(query, headers, _els_pages, records, _title, _keyword, _abstract,
                      scp_api, _from_yr, _to_yr_, logging_flag, data)
        search_springer(query, headers, _pages, records, _title, _keyword, _abstract,
                        spr_api, _search_yr, logging_flag, data)
        search_sciDirect(query, headers, _pages, records, _title, _keyword, _abstract,
                         sd1_api, sd2_api, _from_yr, _to_yr_, logging_flag, data)
        return data
    except Exception as e:
        # log the failure with the file name and line number for easier debugging
        exception_type, exception_object, exception_traceback = sys.exc_info()
        filename = exception_traceback.tb_frame.f_code.co_filename
        line_number = exception_traceback.tb_lineno
        logger.writeError(e, None, "Search Exception :", logging_flag, filename, line_number)
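# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): search_allengines() appends one
# resp_obj per record from every engine, so the combined `data` list can contain duplicates
# when the same paper is indexed by several engines. A minimal de-duplication pass could look
# like the helper below; keying on (DOI, Title) is an assumption, not something the tool does.
def dedupe_results(data):
    seen = set()
    unique = []
    for resp_obj in data:
        item = resp_obj['entities']['items'][0]
        # normalise DOI and Title into a single identity key for the record
        key = (str(item.get('DOI', '')).lower(), str(item.get('Title', '')).lower())
        if key not in seen:
            seen.add(key)
            unique.append(resp_obj)
    return unique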
def search_engines(x, data=None):
    # Call Search Modules
    try:
        if len(x) != 0:
            # call the search function for all engines
            try:
                if 0 in x:
                    data = searchAllEngines.search_allengines(
                        query, headers, _pages, _gs_pages, _acm_pages, _els_pages, records,
                        _title, _keyword, _abstract, _search_yr, _from_yr, _to_yr_,
                        logging_flag, data)
                    SaveOutput.saveOutput(data, out, output_path)
            except Exception as e:
                # print('error:', e)
                pass
            # call the search function for the selected engines only
            try:
                if 0 not in x:
                    data = searchSpecificEngine.search_engines(
                        x, query, headers, _pages, _gs_pages, _acm_pages, _els_pages, records,
                        _title, _keyword, _abstract, _search_yr, _from_yr, _to_yr_,
                        logging_flag, data)
                    SaveOutput.saveOutput(data, out, output_path)
            except Exception as e:
                exception_type, exception_object, exception_traceback = sys.exc_info()
                filename = exception_traceback.tb_frame.f_code.co_filename
                line_number = exception_traceback.tb_lineno
                logger.writeError(e, None, "Google Scholar", logging_flag, filename, line_number)
        else:
            print('Select search engine!')
            exit()
    except Exception as e:
        pass
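# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): SaveOutput.saveOutput() is the repo's
# own writer and its internals are not shown in this file. Every engine appends dicts of the
# shape {"entities": {"Search Engine": ..., "Attributes found": ..., "items": [{...}]}}, so a
# bare-bones JSON dump of the collected list would look like this (file name is an assumption):
def dump_results_json(data, path='search_results.json'):
    import json
    with open(path, 'w', encoding='utf-8') as fh:
        json.dump(data, fh, ensure_ascii=False, indent=2)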
def search_sciDirect(query, headers, _pages, records, _title, _keyword, _abstract, sd1_api,
                     sd2_api, _from_yr, _to_yr_, logging_flag, data):
    if _pages > 3:
        _pages = 3

    def _append_article(doi, publish_date, count):
        # call the article API again with the DOI to get the remaining attributes
        url2 = 'https://api.elsevier.com/content/article/doi/' + doi + '?apiKey=' + sd2_api
        response1 = requests.get(url2, headers=headers, timeout=30)
        soup1 = BeautifulSoup(response1.content, 'lxml')
        # fall back to a placeholder when the article response carries no ISSN/eISSN tag
        issn = str(['No information found'])
        if soup1.find('prism:issn') is not None:
            issn = soup1.find('prism:issn').get_text()
        elif soup1.find('prism:eissn') is not None:
            issn = soup1.find('prism:eissn').get_text()
        # Find required attributes in the response object
        for item in soup1.find_all('coredata'):
            resp_obj = {"entities": {
                "Search Engine": "Science Direct Search Engine",
                "Attributes found": "DOI, Title, URLs, Authors, Publication Name, "
                                    "ISSN, Type, Published date, Abstract",
                "items": [
                    {"DOI": item.find_all('prism:doi')[0].get_text(),
                     "Title": item.find_all('dc:title')[0].get_text().strip(),
                     "URLs": item.find_all('prism:url')[0].get_text(),
                     "Authors": item.find_all('dc:creator')[0].get_text(),
                     "Publication Name": item.find_all('prism:publicationname')[0].get_text(),
                     "ISSN": issn,
                     "Cited count": str(['No information found']),
                     "Affiliation": str(['No information found ']),
                     "Type": item.find_all('document-type'),
                     "Published date": publish_date,
                     "Abstract": str(item.find_all('dc:description')[0].get_text().strip())
                         .replace('\n', '').replace(' ', '')
                     }
                ]}}
            count += 1
            # append dict object data
            data.append(resp_obj)
        return count

    if _title:
        url = 'https://www.sciencedirect.com/search/api?qs=%22' + query + '%22&apiKey=' + sd1_api
        # response object
        response = requests.get(url, headers={'User-agent': 'your bot 0.1'}, timeout=30)
        soup = BeautifulSoup(response.content, 'lxml')
        obj = json.loads(soup.text)
        print('Searching in Science Direct...')
        # set the counter for records count
        count = 0
        for i in tqdm(range(1)):
            # Find required attributes in the response object
            for item in obj['searchResults']:
                try:
                    publish_date = str(item['publicationDate'])
                    # get the document ID (DOI) from the search result first
                    doi = item['doi']
                    count = _append_article(doi, publish_date, count)
                except Exception as e:
                    exception_type, exception_object, exception_traceback = sys.exc_info()
                    filename = exception_traceback.tb_frame.f_code.co_filename
                    line_number = exception_traceback.tb_lineno
                    logger.writeError(e, None, _engine, logging_flag, filename, line_number)
            time.sleep(1)
        logger.writeRecords(query, None, _engine, count, count, logging_flag)
        print(f'Finished with total {count} records returned.')
        return data

    if not _from_yr:
        if _keyword or _abstract:
            for j in tqdm(range(1)):
                print('Searching in Science Direct...')
                # set the counter for records count
                count = 0
                for i in range(_pages):
                    url = 'https://api.elsevier.com/content/search/sciencedirect?query=' + query + \
                          '&apiKey=' + sd1_api + '&start=' + str(i) + '&count=10'
                    # response object
                    response = requests.get(url, headers={'User-agent': 'your bot 0.1'})
                    soup = BeautifulSoup(response.content, 'lxml')
                    obj = json.loads(soup.text)
                    if 'entry' in obj['search-results']:
                        # Find required attributes in the response object
                        for item in obj['search-results']['entry']:
                            try:
                                publish_date = str(item['load-date']).split('T', -1)[0]
                                # get the document ID (DOI) from the search result first
                                doi = item['prism:doi']
                                count = _append_article(doi, publish_date, count)
                            except Exception as e:
                                exception_type, exception_object, exception_traceback = sys.exc_info()
                                filename = exception_traceback.tb_frame.f_code.co_filename
                                line_number = exception_traceback.tb_lineno
                                logger.writeError(e, None, _engine, logging_flag, filename, line_number)
                time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count, logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
    else:
        if _keyword or _abstract:
            for i in tqdm(range(1)):
                print('Searching in Science Direct...')
                # set the counter for records count
                count = 0
                for i in range(_pages):
                    url = 'https://api.elsevier.com/content/search/sciencedirect?query=' + query + \
                          '&date=' + _from_yr + '-' + _to_yr_ + '&apiKey=' + sd1_api + \
                          '&start=' + str(i) + '&count=10'
                    # response object
                    response = requests.get(url, headers={'User-agent': 'your bot 0.1'})
                    soup = BeautifulSoup(response.content, 'lxml')
                    obj = json.loads(soup.text)
                    if 'entry' in obj['search-results']:
                        # Find required attributes in the response object
                        for item in obj['search-results']['entry']:
                            try:
                                publish_date = str(item['load-date']).split('T', -1)[0]
                                # get the document ID (DOI) from the search result first
                                doi = item['prism:doi']
                                count = _append_article(doi, publish_date, count)
                            except Exception as e:
                                exception_type, exception_object, exception_traceback = sys.exc_info()
                                filename = exception_traceback.tb_frame.f_code.co_filename
                                line_number = exception_traceback.tb_lineno
                                logger.writeError(e, None, _engine, logging_flag, filename, line_number)
                time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count, logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
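# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the parsers above rely on patterns
# like item.find_all('dc:title')[0].get_text(), which raise IndexError when a tag is missing
# and push the whole record into the except branch. A small helper such as this hypothetical
# one would let a record survive a single missing field:
def first_text(tag, name, default="['No information found']"):
    # return the stripped text of the first matching child tag, or a placeholder if absent
    found = tag.find(name)
    return found.get_text().strip() if found is not None else default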
def search_academia(query, headers, _pages, records, _title, _keyword, _abstract, _search_yr,
                    logging_flag, data):
    if _title:
        print('Searching in Academia...')
        q = query.title().replace(' ', '_')
        url = 'https://www.academia.edu/search?q=' + query
        # response object
        response = requests.get(url, headers=headers, timeout=30)
        count = 0
        if response.status_code == 200:  # check for ok response
            soup = BeautifulSoup(response.content, 'html.parser')
            for i in tqdm(range(1)):
                # Find required attributes in the response object
                for item in soup.find_all('div', class_='a-fadeInDown'):
                    abs = ''
                    try:
                        # a few records don't have a summary attribute, so check for it
                        if bool(item.find_all('div', class_='work-card--abstract')):
                            abs = item.find_all('div', class_='work-card--abstract')
                        else:
                            abs = ['No information found']
                        resp_obj = {
                            "entities": {
                                "Search Engine": "Academia Search Engine",
                                "Attributes found": "Title, URLs, Authors, Abstract",
                                "items": [{
                                    "DOI": ['No information found'],
                                    "Title": item.find_all('div', class_='work-card--title')[0].get_text(),
                                    "URLs": item.select('a')[0]['href'],
                                    "Authors": item.find_all('div', class_='work-card--author-name')[0].get_text(),
                                    "Publication Name": str(item.find_all(
                                        'div', class_='work-card--publish-wrapper')[0].get_text()).split(',', 1)[1],
                                    "ISSN": ['No information found'],
                                    "Cited count": ['No information found'],
                                    "Affiliation": ['No information found '],
                                    "Type": ['No information found'],
                                    "Published date": str(item.find_all(
                                        'div', class_='work-card--publish-wrapper')[0].get_text()).split(',', 1)[0],
                                    "Abstract": abs
                                }]
                            }
                        }
                        count += 1
                        data.append(resp_obj)  # append dict object data
                    except Exception as e:
                        logger.writeError("Logging Error:" + str(e), None, _engine, logging_flag)
        else:
            pass
        time.sleep(1)
        logger.writeRecords(query, None, _engine, count, count, logging_flag)
        print(f'Finished with total {count} records returned.')
        return data

    if _keyword or _abstract:
        print('Searching in Academia...')
        if _search_yr:
            print('Date parameter search either not supported or not available in this search engine!')
        else:
            count = 0
            for i in tqdm(range(1)):
                for i in range(_pages):
                    # url = 'https://www.academia.edu/search?q=' + query
                    q = query.title().replace(' ', '_')
                    url = 'https://www.academia.edu/Documents/in/' + q + '?page=' + str(i)
                    # response object
                    response = requests.get(url, headers=headers, timeout=30)
                    if response.status_code == 200:  # check for ok response
                        soup = BeautifulSoup(response.content, 'html.parser')
                        # Find required attributes in the response object
                        for item in soup.find_all('div', class_='u-borderBottom1'):
                            abs = ''
                            try:
                                # a few records don't have a summary attribute, so check for it
                                if bool(item.select('.summarized')):
                                    abs = item.select('.summarized')[0].get_text()
                                elif bool(item.select('.summary')):
                                    abs = item.select('.summary')[0].get_text()
                                else:
                                    abs = ['No information found']
                                resp_obj = {
                                    "entities": {
                                        "Search Engine": "Academia Search Engine",
                                        "Attributes found": "Title, URLs, Authors, Abstract",
                                        "items": [{
                                            "DOI": ['No information found'],
                                            "Title": item.select('a')[0].get_text(),
                                            "URLs": item.select('a')[0]['href'],
                                            "Authors": item.select('.u-fw700')[0].get_text(),
                                            "Publication Name": ['No information found'],
                                            "ISSN": ['No information found'],
                                            "Cited count": ['No information found'],
                                            "Affiliation": ['No information found '],
                                            "Type": ['No information found'],
                                            "Published date": ['No information found'],
                                            "Abstract": abs
                                        }]
                                    }
                                }
                                count += 1
                                data.append(resp_obj)  # append dict object data
                            except Exception as e:
                                logger.writeError("Logging Error:" + str(e), None, _engine, logging_flag)
                    else:
                        pass
                time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count, logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
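# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the scraped sites (Academia.edu,
# Google Scholar, ACM, PubMed) occasionally answer with 429/5xx, and the code above simply
# skips those pages. One hedged hardening option is a requests.Session with automatic
# retries; the parameter values below are assumptions, not tuned settings.
import requests
from requests.adapters import HTTPAdapter
from urllib3.util.retry import Retry

def make_retrying_session(total=3, backoff_factor=1.0):
    session = requests.Session()
    retry = Retry(total=total, backoff_factor=backoff_factor,
                  status_forcelist=[429, 500, 502, 503, 504])
    adapter = HTTPAdapter(max_retries=retry)
    # apply the retry policy to both plain and TLS requests
    session.mount('http://', adapter)
    session.mount('https://', adapter)
    return session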
def search_googleScholar(query, headers, _gs_pages, records, _title, _keyword, _abstract, scrpr_api, _from_yr, _to_yr_, logging_flag, data): rec = 0 if _title: # request url url = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=%22' + query + '%22&btnG=' # response object response = requests.get(url, headers=headers) soup = BeautifulSoup(response.content, 'lxml') print('Searching in Google Scholar...') # set the counter for records count count = 0 for i in tqdm(range(1)): # Find required attributes in the response object by checking tag [data-lid]')) for item in soup.select('[data-lid]'): try: if bool(item.select('.gs_or_ggsm')): cc = str( re.findall( r'\d+', str(item.select('.gs_fl') [1].get_text()))).split(',', 1)[0].replace( '[', '') else: cc = str( re.findall( r'\d+', str(item.select('.gs_fl') [0].get_text()))).split(',', 1)[0].replace( '[', '') if bool(item.select('.gs_ct1')): type = str(item.select('.gs_ct1')[0].get_text()) else: type = str(['Research Article']) resp_obj = { "entities": { "Search Engine": "Google Scholar", "Attributes found": "Title, URLs, Authors, Cited count, Type, Published " "date, Abstract", "items": [{ "DOI": str(['No information found']), "Title": item.select('h3')[0].get_text(), "URLs": item.select('a')[0]['href'], "Authors": re.sub( "[^A-Za-z]", " ", str(item.select('.gs_a') [0].get_text()).split('-', 1)[0]), "Publication Name": str(['No information found']), "ISSN": str(['No information found']), "Cited count": cc, "Affiliation": str(['No information found']), "Type": type, "Published date": str( re.findall( r'\d+', str( item.select('.gs_a') [0].get_text()))).strip(), "Abstract": item.select('.gs_rs')[0].get_text() }] } } # append dict object data count += 1 data.append(resp_obj) except Exception as e: # raise e pass exception_type, exception_object, exception_traceback = sys.exc_info( ) filename = exception_traceback.tb_frame.f_code.co_filename line_number = exception_traceback.tb_lineno logger.writeError(e, None, _engine, logging_flag, filename, line_number) time.sleep(1) print(f'Finished with total {count} records returned.') logger.writeRecords(query, None, _engine, "1", count, logging_flag) return data if _keyword or _abstract: if _gs_pages != 0: pages = pagination(_gs_pages) else: pages = 1 # search for dates if _from_yr: # use of scraper api to avoid IP block issue by Google scholar client = ScraperAPIClient(scrpr_api) count = 0 for i in tqdm(range(1)): print("Searching Google Scholar Engine now please wait...") url = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=' + query + '&as_ylo=' + _from_yr + '&as_yhi=' + _to_yr_ + '&btnG=' response = client.get(url, headers={'User-agent': 'your bot 0.1'}) if response.status_code != 200: print("Request failed with status", response.status_code) logger.writeError( "Logging Error:" + str(response.status_code), None, _engine, logging_flag) else: soup = BeautifulSoup(response.content, 'lxml') # count no of records returned by google scholar for item in soup.find_all('div', class_='gs_ab_st'): rec = \ str(item.find_all('div', id='gs_ab_md')[0].get_text()).split(' ', 1)[1].replace(',', "").split( ' ', 1)[0] pages = 1 if _gs_pages != 0: pages = pagination(_gs_pages) else: pages = pagination(rec) # check if records are greater than 1000 or not if int(pages) > 100: print( "NOTE:Google Scholar returns data for max 1000 records irrespective of total records. 
" "Total No of total records found :", rec, "\n Fetching records details now...") pages = 100 for i in range(pages): url = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=' + query + '&as_ylo=' + _from_yr + '&as_yhi=' + _to_yr_ + '&btnG=&start=' + str( i) + '0' # response = requests.get(url, proxies={"http": proxy, "https": proxy}, headers=headers) response = client.get( url, headers={'User-agent': 'your bot 0.1'}) soup = BeautifulSoup(response.content, 'lxml') # Find required attributes in the response object by checking tag [data-lid]')) for item in soup.select('[data-lid]'): try: try: if bool( item.select('.gs_rs') [0].get_text()): abstract = item.select( '.gs_rs')[0].get_text() else: abstract = str( ['No information found']) except: abstract = str( ['No information found']) pass try: if bool(item.select('.gs_or_ggsm')): cc = \ str(re.findall(r'\d+', str(item.select('.gs_fl')[1].get_text()))).split( ',', 1)[ 0].replace('[', '') else: cc = \ str(re.findall(r'\d+', str(item.select('.gs_fl')[0].get_text()))).split( ',', 1)[ 0].replace('[', '') except: cc = str(['No information found']) pass try: if bool(item.select('.gs_ct1')): type = str( item.select('.gs_ct1') [0].get_text()) else: type = str(['Research Article']) except: type = str(['No information found']) pass # response object resp_obj = { "entities": { "Search Engine": "Google Scholar", "Attributes found": "Title, URLs, Authors, Cited count, " "Type, Published date, Abstract", "items": [{ "DOI": str(['No information found']), "Title": item.select('h3') [0].get_text(), "URLs": item.select('a')[0]['href'], "Authors": re.sub( "[^A-Za-z]", " ", str( item.select('.gs_a') [0].get_text()).split( '-', 1)[0]), "Publication Name": str(['No information found']), "ISSN": str(['No information found']), "Cited count": cc, "Affiliation": str(['No information found']), "Type": type, "Published date": str( re.findall( r'\d+', str( item.select( '.gs_a') [0].get_text())) ).strip(), "Abstract": abstract }] } } # append dict object data count += 1 data.append(resp_obj) except Exception as e: # raise e pass exception_type, exception_object, exception_traceback = sys.exc_info( ) filename = exception_traceback.tb_frame.f_code.co_filename line_number = exception_traceback.tb_lineno logger.writeError(e, None, _engine, logging_flag, filename, line_number) else: for i in range(pages): url = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=' + query + '&as_ylo=' + _from_yr + '&as_yhi=' + _to_yr_ + '&btnG=&start=' + str( i) + '0' response = client.get( url, headers={'User-agent': 'your bot 0.1'}) if response.status_code != 200: print("Request failed with stauts", response.status_code) logger.writeError( "Logging Erorr:" + str(response.status_code), None, _engine, logging_flag) else: soup = BeautifulSoup(response.content, 'lxml') # Find required attributes in the response object by checking tag [data-lid]')) for item in soup.select('[data-lid]'): try: try: if bool( item.select('.gs_rs') [0].get_text()): abstract = item.select( '.gs_rs')[0].get_text() else: abstract = str( ['No information found']) except: abstract = str( ['No information found']) pass try: if bool(item.select( '.gs_or_ggsm')): cc = \ str(re.findall(r'\d+', str(item.select('.gs_fl')[1].get_text()))).split( ',', 1)[ 0].replace('[', '') else: cc = \ str(re.findall(r'\d+', str(item.select('.gs_fl')[0].get_text()))).split( ',', 1)[ 0].replace('[', '') except: cc = str(['No information found']) pass try: if bool(item.select('.gs_ct1')): type = str( item.select('.gs_ct1') 
[0].get_text()) else: type = str( ['Research Article']) except: type = str( ['No information found']) pass resp_obj = { "entities": { "Search Engine": "Google Scholar", "Attributes found": "Title, URLs, Authors, Cited " "count, Type, Published date, " "Abstract", "items": [{ "DOI": str([ 'No information found' ]), "Title": item.select( 'h3')[0].get_text(), "URLs": item.select('a')[0] ['href'], "Authors": re.sub( "[^A-Za-z]", " ", str( item.select( '.gs_a')[0]. get_text()).split( '-', 1)[0]), "Publication Name": str([ 'No information found' ]), "ISSN": str([ 'No information found' ]), "Cited count": cc, "Affiliation": str([ 'No information found' ]), "Type": type, "Published date": str( re.findall( r'\d+', str( item.select( '.gs_a') [0].get_text()) )).strip(), "Abstract": abstract }] } } # append dict object data count += 1 data.append(resp_obj) except Exception as e: # raise e pass exception_type, exception_object, exception_traceback = sys.exc_info( ) filename = exception_traceback.tb_frame.f_code.co_filename line_number = exception_traceback.tb_lineno logger.writeError( e, None, _engine, logging_flag, filename, line_number) time.sleep(1) print(f'Finished with total {count} records returned.') logger.writeRecords(query, None, _engine, rec, count, logging_flag) return data # search without dates else: print("Searching Google Scholar Engine now please wait...") client = ScraperAPIClient(scrpr_api) count = 0 for i in tqdm(range(1)): for i in range(pages): # request url url = 'https://scholar.google.com/scholar?hl=en&as_sdt=0%2C5&q=' + query + '&btnG=&start=' + str( i) + '0' # response object response = client.get( url, headers={'User-agent': 'your bot 0.1'}) if response.status_code != 200: print("Request failed with stauts", response.status_code) logger.writeError( "Logging Erorr:" + str(response.status_code), None, _engine, logging_flag) soup = BeautifulSoup(response.content, 'lxml') # Find required attributes in the response object by checking tag [data-lid]')) for item in soup.select('[data-lid]'): try: try: if bool(item.select('.gs_rs')[0].get_text()): abstract = item.select( '.gs_rs')[0].get_text() else: abstract = str(['No information found']) except: abstract = str(['No information found']) pass try: if bool(item.select('.gs_or_ggsm')): cc = \ str(re.findall(r'\d+', str(item.select('.gs_fl')[1].get_text()))).split(',', 1)[ 0].replace('[', '') else: cc = \ str(re.findall(r'\d+', str(item.select('.gs_fl')[0].get_text()))).split(',', 1)[ 0].replace('[', '') except: cc = str(['No information found']) pass try: if bool(item.select('.gs_ct1')): type = str( item.select('.gs_ct1')[0].get_text()) else: type = str(['Research Article']) except: type = str(['No information found']) pass resp_obj = { "entities": { "Search Engine": "Google Scholar", "Attributes found": "Title, URLs, Authors, Cited count, Type, " "Published date, Abstract", "items": [{ "DOI": str(['No information found']), "Title": item.select('h3')[0].get_text(), "URLs": item.select('a')[0]['href'], "Authors": re.sub( "[^A-Za-z]", " ", str( item.select('.gs_a') [0].get_text()).split('-', 1)[0]), "Publication Name": str(['No information found']), "ISSN": str(['No information found']), "Cited count": cc, "Affiliation": str(['No information found']), "Type": type, "Published date": str( re.findall( r'\d+', str( item.select('.gs_a') [0].get_text()))).strip(), "Abstract": abstract }] } } # append dict object data count += 1 data.append(resp_obj) except Exception as e: # raise e pass exception_type, exception_object, exception_traceback 
= sys.exc_info( ) filename = exception_traceback.tb_frame.f_code.co_filename line_number = exception_traceback.tb_lineno logger.writeError(e, None, _engine, logging_flag, filename, line_number) time.sleep(1) print(f'Finished with total {count} records returned.') logger.writeRecords(query, None, _engine, rec, count, logging_flag) return data
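# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): pagination() is imported from another
# part of the repo and is not shown in this file. Judging from how it is used above (Google
# Scholar start offsets advance by ten, i.e. str(i) + '0'), it presumably converts a record
# count into a number of 10-result pages; the arithmetic below is only that assumption.
import math

def _pagination_sketch(total_records, per_page=10):
    return max(1, math.ceil(int(total_records) / per_page))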
def search_core(query, headers, _pages, records, _title, _keyword, _abstract, core_api,
                _search_yr, logging_flag, data):
    if _title:
        print('Searching in CORE...')
        url = 'https://core.ac.uk:443/api-v2/articles/search/%22' + query + \
              '%22?page=1&pageSize=10&apiKey=' + core_api
        # response object
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.content, 'lxml')
        # convert soup object into json
        obj = json.loads(soup.text)
        # set the counter for records count
        count = 0
        for i in tqdm(range(1)):
            if obj['data'] is not None:
                # Find required attributes in the response object
                for item in obj['data']:
                    try:
                        if 'publisher' not in item:
                            publisher = ['No Information']
                        else:
                            publisher = item['publisher']
                        resp_obj = {"entities": {
                            "Search Engine": "CORE Search Engine",
                            "Attributes found": "DOI, Title, URLs, Authors, Publication Name, "
                                                "Type, Published Date",
                            "items": [{"DOI": item['oai'],
                                       "Title": item['title'],
                                       "URLs": item['downloadUrl'],
                                       "Authors": item['authors'],
                                       "Publication Name": publisher,
                                       "ISSN": ['No Information'],
                                       "Cited count": ['No Information'],
                                       "Affiliation": ['No Information'],
                                       "Type": ['Article'],
                                       # "Keywords": item['topics'],
                                       "Published Date": item['datePublished'],
                                       "Abstract": ['No Information']
                                       }]}}
                        count += 1
                        # append dict object data
                        data.append(resp_obj)
                    except Exception as e:
                        exception_type, exception_object, exception_traceback = sys.exc_info()
                        filename = exception_traceback.tb_frame.f_code.co_filename
                        line_number = exception_traceback.tb_lineno
                        logger.writeError(e, None, _engine, logging_flag, filename, line_number)
            else:
                pass
                # print('error core:', e)
            time.sleep(1)
        logger.writeRecords(query, None, _engine, count, count, logging_flag)
        print(f'Finished with total {count} records returned.')
        return data

    if not _search_yr:
        if _keyword or _abstract:
            print('Searching in CORE...')
            count = 0
            for i in tqdm(range(1)):
                for i in range(_pages):
                    i += 1
                    # no year filter in this branch (_search_yr is empty here)
                    url = 'https://core.ac.uk:443/api-v2/search/' + query + '?page=' + str(i) + \
                          '&pageSize=20&apiKey=' + core_api
                    # response object
                    response = requests.get(url, headers=headers, timeout=30)
                    soup = BeautifulSoup(response.content, 'lxml')
                    # convert soup object into json
                    obj = json.loads(soup.text)
                    if obj['data'] is not None:
                        # Find required attributes in the response object
                        for item in obj['data']:
                            try:
                                resp_obj = {"entities": {
                                    "Search Engine": "CORE Search Engine",
                                    "Attributes found": "DOI, Title, URLs, Authors, Publication "
                                                        "Name, ISSN, Cited Count, Type, "
                                                        "Published Date, Abstract",
                                    "items": [{"DOI": item['_source']['doi'],
                                               "Title": item['_source']['title'],
                                               "URLs": item['_source']['urls'],
                                               "Authors": item['_source']['authors'],
                                               "Publication Name": item['_source']['publisher'],
                                               "ISSN": item['_source']['issn'],
                                               "Cited count": item['_source']['citationCount'],
                                               "Affiliation": ['No Information'],
                                               "Type": item['_type'],
                                               # "Keywords": item['topics'],
                                               "Published Date": str(item['_source']['datePublished']).split('T', 1)[0],
                                               "Abstract": str(item['_source']['description']).replace('\n', '')
                                               }]}}
                                count += 1
                                # append dict object data
                                data.append(resp_obj)
                            except Exception as e:
                                exception_type, exception_object, exception_traceback = sys.exc_info()
                                filename = exception_traceback.tb_frame.f_code.co_filename
                                line_number = exception_traceback.tb_lineno
                                logger.writeError(e, None, _engine, logging_flag, filename, line_number)
                    else:
                        pass
                time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count, logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
    else:
        print('Searching in CORE...')
        print("Date Parameter not supported in this CORE API!")
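# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the CORE parser above reaches several
# levels into the JSON (item['_source']['doi'], ...), so one absent key sends the whole record
# to the except branch. A defensive nested lookup such as this hypothetical helper would keep
# partially filled records instead:
def deep_get(mapping, *keys, default=None):
    current = mapping
    for key in keys:
        if isinstance(current, dict) and key in current:
            current = current[key]
        else:
            return default
    return current

# e.g. deep_get(item, '_source', 'doi', default=['No Information'])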
def search_pubMed(query, headers, _pages, _title, _keyword, _abstract, _from_yr, _to_yr_, logging_flag, data): if _title: print('Searching in PubMed...') count = 0 for i in tqdm(range(1)): for i in range(_pages): url = 'https://pubmed.ncbi.nlm.nih.gov/?term=%22' + query + '%22' + '&size=10&page=1' # response object response = requests.get(url, headers=headers, timeout=30) soup = BeautifulSoup(response.content, 'lxml') for item in soup.find_all('div', class_='article-page'): try: try: # few records doesnt have summary attribute so check them if bool( item.find_all( 'div', class_='abstract')[0].get_text()): abs = str( item.find_all('div', class_='abstract') [0].get_text()).strip().replace('\n', '') except Exception as e: # raise e abs = ['No information found'] if bool(item.select('.secondary-date')): pub_date = str( item.find_all('span', class_='secondary-date') [0].get_text()).split(';', -1)[0] else: pub_date = ['No information found'] resp_obj = { "entities": { "Search Engine": "PubMed Engine", "Attributes found": "DOI,Title, URLs, Authors,Type, Published Date, Abstract", "items": [{ "DOI": str( item.find_all('span', class_='citation-doi') [0].get_text()).replace('\n', ''), "Title": str( item.find_all('h1', class_='heading-title') [0].get_text()).strip(), "URLs": 'https://pubmed.ncbi.nlm.nih.gov' + item.find_all('a', class_='id-link')[0]['href'], "Authors": str( item.find_all( 'span', class_='authors-list-item') [0].get_text()).strip().replace( '\n', ''), "Publication Name": str(['No information found']), "ISSN": str(['No information found']), "Cited count": str(['No information found']), "Affiliation": str(['No information found ']), "Type": str(['article']), "Published date": pub_date, "Abstract": abs }] } } count += 1 data.append(resp_obj) except Exception as e: # raise e pass exception_type, exception_object, exception_traceback = sys.exc_info( ) filename = exception_traceback.tb_frame.f_code.co_filename line_number = exception_traceback.tb_lineno logger.writeError(e, None, _engine, logging_flag, filename, line_number) time.sleep(1) logger.writeRecords(query, None, _engine, count, count, logging_flag) print(f'Finished with total {count} records returned.') # Enable if you want to get MESH terms of articles # print(f'Now fetching Mesh Terms for {count} records returned.') # getMeshTerms.getMeshIDs(data,_email) # print(f'MeshTerms File saved for {count} records in text format.') return data if _keyword or _abstract: print('Searching in PubMed...') count = 0 authr_list = [] if _from_yr: for i in tqdm(range(1)): for i in range(_pages): i += 1 url = 'https://pubmed.ncbi.nlm.nih.gov/?term=' + query + '&filter=years.' 
+ _from_yr + '-' + _to_yr_ + '&format=abstract&size=10&page=' + str( i) # response object response = requests.get(url, headers=headers, timeout=30) soup = BeautifulSoup(response.content, 'lxml') for item in soup.select('div', class_='search-results-chunks'): try: try: doi = str( item.find_all('span', class_='citation-doi') [0].get_text()).split('doi', 1)[1].replace( '\n', '') except Exception as e: # raise e doi = ['No information found'] try: if bool(item.find_all('span', class_='cit')): pub_date = str( item.find_all( 'span', class_='cit')[0].get_text()).split( ';', 1)[0].replace('\n', '') else: pub_date = \ str(item.find_all('span', class_='secondary-date')[0].get_text()).split('b', 1)[ 1].replace('\n', '') except Exception as e: # raise e pub_date = ['No information found'] if bool(item.select('.copyright')): pub_name = str( item.find_all('p', class_='copyright') [0].get_text()).strip() else: pub_name = ['No information found'] if bool(item.select('.authors-list')): for i in range( len( item.find('div', class_='authors-list'). find_all('a'))): authr_list.append( str( item.find('div', class_='authors-list' ).find_all('a') [i].get_text()).strip()) else: authr_list = ['No information found'] resp_obj = { "entities": { "Search Engine": "PubMed Engine", "Attributes found": "DOI,Title, URLs, Authors,Type, Published Date,Publication Name,Affiliation, Abstract", "items": [{ "DOI": doi, "Title": str( item.find_all( 'h1', class_='heading-title') [0].get_text()).strip(), "URLs": 'https://pubmed.ncbi.nlm.nih.gov' + item.find('h1', class_="heading-title" ).find_all("a")[0]['href'], "Authors": authr_list, "Publication Name": pub_name, "ISSN": str(['No information found']), "Cited count": str(['No information found']), "Affiliation": str( item.select( 'li[data-affiliation-id]') [0].get_text()), "Type": str(['article']), "Published date": pub_date, "Abstract": str( item.find_all( 'div', class_='abstract-content') [0].get_text()).strip() }] } } if (len(data) != 0): if not (checkItem( data, resp_obj['entities']['items'][0] ['Title'])): count += 1 data.append(resp_obj) else: count += 1 data.append(resp_obj) except Exception as e: # raise e pass exception_type, exception_object, exception_traceback = sys.exc_info( ) filename = exception_traceback.tb_frame.f_code.co_filename line_number = exception_traceback.tb_lineno logger.writeError(e, None, _engine, logging_flag, filename, line_number) time.sleep(1) logger.writeRecords(query, None, _engine, count, count, logging_flag) print(f'Finished with total {count} records returned.') # Enable if you want to get MESH terms of articles # print(f'Now fetching Mesh Terms for {count} records returned.') # getMeshTerms.getMeshIDs(data,_email) # print(f'MeshTerms File saved for {count} records in text format.') return data else: for i in tqdm(range(1)): for i in range(_pages): i += 1 url = 'https://pubmed.ncbi.nlm.nih.gov/?term=' + query + '&format=abstract&size=10&page=' + str( i) # response object response = requests.get(url, headers=headers, timeout=30) soup = BeautifulSoup(response.content, 'lxml') for item in soup.select('div', class_='search-results'): try: if bool(item.select('.secondary-date')): pub_date = \ str(item.find_all('span', class_='secondary-date')[0].get_text()).split('b', 1)[ 1].replace('\n', '') else: pub_date = ['No information found'] if bool(item.select('.copyright')): pub_name = str( item.find_all('p', class_='copyright') [0].get_text()).strip() else: pub_name = ['No information found'] if bool(item.select('.authors-list')): for i in range( len( 
item.find('div', class_='authors-list'). find_all('a'))): authr_list.append( str( item.find('div', class_='authors-list' ).find_all('a') [i].get_text()).strip()) else: authr_list = ['No information found'] resp_obj = { "entities": { "Search Engine": "PubMed Engine", "Attributes found": "DOI,Title, URLs, Authors,Type, Published Date,Publication Name,Affiliation, Abstract", "items": [{ "DOI": str( item.find_all( 'span', class_='citation-doi') [0].get_text()).split( 'doi', 1)[1].replace('\n', ''), "Title": str( item.find_all( 'h1', class_='heading-title') [0].get_text()).strip(), "URLs": 'https://pubmed.ncbi.nlm.nih.gov' + item.find('h1', class_="heading-title" ).find_all("a")[0]['href'], "Authors": authr_list, "Publication Name": pub_name, "ISSN": str(['No information found']), "Cited count": str(['No information found']), "Affiliation": str( item.select( 'li[data-affiliation-id]') [0].get_text()), "Type": str(['article']), "Published date": pub_date, "Abstract": str( item.find_all( 'div', class_='abstract-content') [0].get_text()).strip() }] } } count += 1 data.append(resp_obj) # print(data.items()) except Exception as e: # raise e pass exception_type, exception_object, exception_traceback = sys.exc_info( ) filename = exception_traceback.tb_frame.f_code.co_filename line_number = exception_traceback.tb_lineno logger.writeError(e, None, _engine, logging_flag, filename, line_number) time.sleep(1) logger.writeRecords(query, None, _engine, count, count, logging_flag) print(f'Finished with total {count} records returned.') # Enable if you want to get MESH terms of articles # print(f'Now fetching Mesh Terms for {count} records returned.') # getMeshTerms.getMeshIDs(data,_email) # print(f'MeshTerms File saved for {count} records in text format.') return data
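# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): checkItem() is called above before a
# PubMed record is appended, but it is defined elsewhere in the repo. From the call site it
# presumably answers "is this title already in data?"; the version below is only that guess.
def _check_item_sketch(data, title):
    for resp_obj in data:
        if resp_obj['entities']['items'][0].get('Title') == title:
            return True
    return False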
def search_acmlibrary(query, headers, _acm_pages, records, _title, _keyword, _abstract, _from_yr, _to_yr_, logging_flag, data): query = processInputQuery(query) if _title: url = 'https://dl.acm.org/action/doSearch?AllField=%22' + query + '%22' # response object response = requests.get(url, headers=headers) soup = BeautifulSoup(response.content, 'lxml') # obj = json.loads(soup.text) print('Searching in ACM Library...') # set the counter for records count count = 0 for i in tqdm(range(1)): ######## Find required attributes in the response object for item in soup.select( 'li', class_='search__item issue-item-container'): try: resp_obj = { "entities": { "Search Engine": "ACM Library Search Engine", "Attributes found": "DOI, Title, URLs, Authors, Citation count, Type, Published date, Abstract", "items": [{ "DOI": item.find("span", class_='hlFld-Title').find_all( 'a')[0]['href'], "Title": item.find_all("h5", class_='issue-item__title') [0].get_text().strip(), "URLs": item.find_all( "a", class_='issue-item__doi')[0]['href'], "Authors": item.find_all("ul", class_='truncate-list') [0].get_text().strip().replace('\n', ''), "Publication Name": str(['No information found']), "ISSN": str(['No information found']), "Cited count": item.find("span", class_='citation').find_all( 'span')[0].get_text(), "Affiliation": str(['No information found ']), "Type": item.find_all( "div", class_='issue-heading')[0].get_text(), "Published date": item.find("span", class_='dot-separator').find_all( 'span')[0].get_text(), "Abstract": str( item.find_all( "div", class_='issue-item__abstract') [0].get_text()).strip().replace( '\n', '').replace(' ', '') }] } } count += 1 # append dict object data data.append(resp_obj) except Exception as e: # raise e pass exception_type, exception_object, exception_traceback = sys.exc_info( ) filename = exception_traceback.tb_frame.f_code.co_filename line_number = exception_traceback.tb_lineno logger.writeError(e, None, _engine, logging_flag, filename, line_number) time.sleep(1) logger.writeRecords(query, None, _engine, count, count, logging_flag) print(f'Finished with total {count} records returned.') return data if _keyword or _abstract: print('Searching in ACM Library...') if (_acm_pages != 0): pages = pagination(_acm_pages) else: pages = 1 if len(_from_yr) != 0: count = 0 for i in tqdm(range(1)): url = 'https://dl.acm.org/action/doSearch?AllField=' + query + '&AfterYear=' + _from_yr + '&BeforeYear=' + _to_yr_ + '&pageSize=20&startPage=' + str( i) # response object response = requests.get(url, headers=headers) soup = BeautifulSoup(response.content, 'lxml') # count no of records returned by engine for item in soup.find_all('span', class_='hitsLength'): rec = str( soup.find_all( 'span', class_='hitsLength')[0].get_text()).replace( ',', '').replace(" ", "") pages = 1 if (_acm_pages != 0): pages = pagination(_acm_pages) else: pages = pagination(int(rec)) if int(pages) > 100: print( "NOTE:ACM Library returns data for max 2000 records irrespective of total records. 
Total No of total records found :", rec, "\n Fetching records details now...") pages = 50 for i in range(pages): url = 'https://dl.acm.org/action/doSearch?AllField=' + query + '&AfterYear=' + _from_yr + '&BeforeYear=' + _to_yr_ + '&pageSize=20&startPage=' + str( i) response = requests.get(url, headers=headers) soup = BeautifulSoup(response.content, 'lxml') # Find required attributes in the response object for item in soup.select( 'li', class_='search__item issue-item-container'): try: resp_obj = { "entities": { "Search Engine": "ACM Library Search Engine", "Attributes found": "DOI, Title, URLs, Authors, Citation count, Type, Published date, Abstract", "items": [{ "DOI": item.find( "span", class_='hlFld-Title').find_all( 'a')[0]['href'], "Title": item.find_all( "h5", class_='issue-item__title') [0].get_text().strip(), "URLs": item.find_all( "a", class_='issue-item__doi') [0]['href'], "Authors": item.find_all( "ul", class_='truncate-list') [0].get_text().strip().replace( '\n', ''), "Publication Name": str(['No information found']), "ISSN": str(['No information found']), "Cited count": item.find( "span", class_='citation').find_all( 'span')[0].get_text(), "Affiliation": str(['No information found']), "Type": item.find_all( "div", class_='issue-heading') [0].get_text(), "Published date": item.find("span", class_='dot-separator'). find_all('span')[0].get_text(), "Abstract": str( item.find_all( "div", class_= 'issue-item__abstract')[0]. get_text()).strip().replace( '\n', '').replace(' ', '') }] } } count += 1 # append dict object data data.append(resp_obj) except Exception as e: # raise e pass exception_type, exception_object, exception_traceback = sys.exc_info( ) filename = exception_traceback.tb_frame.f_code.co_filename line_number = exception_traceback.tb_lineno logger.writeError(e, None, _engine, logging_flag, filename, line_number) else: for i in range(pages): ######## Find required attributes in the response object for item in soup.select( 'li', class_='search__item issue-item-container'): try: resp_obj = { "entities": { "Search Engine": "ACM Library Search Engine", "Attributes found": "DOI, Title, URLs, Authors, Citation count, Type, Published date, Abstract", "items": [{ "DOI": item.find( "span", class_='hlFld-Title').find_all( 'a')[0]['href'], "Title": item.find_all( "h5", class_='issue-item__title') [0].get_text().strip(), "URLs": item.find_all("a", class_='issue-item__doi') [0]['href'], "Authors": item.find_all("ul", class_='truncate-list') [0].get_text().strip().replace( '\n', ''), "Publication Name": str(['No information found']), "ISSN": str(['No information found']), "Cited count": item.find("span", class_='citation').find_all( 'span')[0].get_text(), "Affiliation": str(['No information found']), "Type": item.find_all("div", class_='issue-heading') [0].get_text(), "Published date": item.find( "span", class_='dot-separator').find_all( 'span')[0].get_text(), "Abstract": str( item.find_all( "div", class_='issue-item__abstract') [0].get_text()).strip().replace( '\n', '').replace(' ', '') }] } } count += 1 # append dict object data data.append(resp_obj) except Exception as e: # raise e pass exception_type, exception_object, exception_traceback = sys.exc_info( ) filename = exception_traceback.tb_frame.f_code.co_filename line_number = exception_traceback.tb_lineno logger.writeError(e, None, _engine, logging_flag, filename, line_number) time.sleep(1) logger.writeRecords(query, None, _engine, count, count, logging_flag) print(f'Finished with total {count} records returned.') return data 
else: count = 0 for i in tqdm(range(1)): for i in range(pages): url = 'https://dl.acm.org/action/doSearch?AllField=' + query + '&pageSize=20&startPage=' + str( i) # response object response = requests.get(url, headers=headers) soup = BeautifulSoup(response.content, 'lxml') # obj = json.loads(soup.text) # set the counter for records count # Find required attributes in the response object for item in soup.select( 'li', class_='search__item issue-item-container'): try: resp_obj = { "entities": { "Search Engine": "ACM Library Search Engine", "Attributes found": "DOI, Title, URLs, Authors, Citation count, Type, Published date, Abstract", "items": [{ "DOI": item.find( "span", class_='hlFld-Title').find_all( 'a')[0]['href'], "Title": item.find_all( "h5", class_='issue-item__title') [0].get_text().strip(), "URLs": item.find_all("a", class_='issue-item__doi') [0]['href'], "Authors": item.find_all("ul", class_='truncate-list') [0].get_text().strip().replace( '\n', ''), "Publication Name": str(['No information found']), "ISSN": str(['No information found']), "Cited count": item.find("span", class_='citation').find_all( 'span')[0].get_text(), "Affiliation": str(['No information found']), "Type": item.find_all("div", class_='issue-heading') [0].get_text(), "Published date": item.find( "span", class_='dot-separator').find_all( 'span')[0].get_text(), "Abstract": str( item.find_all( "div", class_='issue-item__abstract') [0].get_text()).strip().replace( '\n', '').replace(' ', '') }] } } count += 1 # append dict object data data.append(resp_obj) except Exception as e: # raise e pass exception_type, exception_object, exception_traceback = sys.exc_info( ) filename = exception_traceback.tb_frame.f_code.co_filename line_number = exception_traceback.tb_lineno logger.writeError(e, None, _engine, logging_flag, filename, line_number) time.sleep(1) logger.writeRecords(query, None, _engine, count, count, logging_flag) print(f'Finished with total {count} records returned.') return data
def search_engines(x, query, headers, _pages, _gs_pages, _acm_pages, _els_pages, records,
                   _title, _keyword, _abstract, _search_yr, _from_yr, _to_yr_,
                   logging_flag, data):
    # Search the selected engine
    try:
        if len(x) != 0:
            # --- Engines for Title, Keyword and Abstract ---
            if 1 in x:
                search_googleScholar(query, headers, _gs_pages, records, _title, _keyword,
                                     _abstract, scrpr_api, _from_yr, _to_yr_, logging_flag, data)
            elif 2 in x:
                search_msAcademic(query, headers, _pages, records, _title, _keyword, _abstract,
                                  ms_api, _from_yr, _to_yr_, logging_flag, data)
            elif 3 in x:
                search_core(query, headers, _pages, records, _title, _keyword, _abstract,
                            core_api, _search_yr, logging_flag, data)
            elif 4 in x:
                search_pubMed(query, headers, _pages, _title, _keyword, _abstract,
                              _from_yr, _to_yr_, logging_flag, data)
            elif 5 in x:
                search_acmlibrary(query, headers, _acm_pages, records, _title, _keyword,
                                  _abstract, _from_yr, _to_yr_, logging_flag, data)
            # --- Engines only for Keyword and Abstract ---
            elif 6 in x:
                search_PlosOne(query, headers, _pages, records, _title, _keyword, _abstract,
                               _from_yr, _to_yr_, logging_flag, data)
            elif 7 in x:
                search_academia(query, headers, _pages, records, _title, _keyword, _abstract,
                                _search_yr, logging_flag, data)
            elif 8 in x:
                search_scopus(query, headers, _els_pages, records, _title, _keyword, _abstract,
                              scp_api, _from_yr, _to_yr_, logging_flag, data)
            elif 9 in x:
                search_springer(query, headers, _pages, records, _title, _keyword, _abstract,
                                spr_api, _search_yr, logging_flag, data)
            elif 10 in x:
                search_sciDirect(query, headers, _pages, records, _title, _keyword, _abstract,
                                 sd1_api, sd2_api, _from_yr, _to_yr_, logging_flag, data)
        else:
            print('Select search engine!')
            exit()
    except Exception as e:
        exception_type, exception_object, exception_traceback = sys.exc_info()
        filename = exception_traceback.tb_frame.f_code.co_filename
        line_number = exception_traceback.tb_lineno
        logger.writeError(e, None, "MS Academic", logging_flag, filename, line_number)
    return data
def search_msAcademic(query, headers, _pages, records, _title, _keyword, _abstract, ms_api,
                      _from_yr, _to_yr_, logging_flag, data):
    q = str(re.sub('["!,*)@#%(&$_?.^]', '', query.lower()))

    def _append_ms_item(item, count):
        # extract abstract keywords from the response as it doesn't have a specific abstract attribute
        if 'AW' in item:
            abs_str = str(item['AW'])
            abs_new = abs_str.replace(',', '').replace("'", '')
        else:
            abs_new = str(['No information found'])
        if 'S' in item:
            urls = item['S'][0]['U']
        else:
            urls = str(['No information found'])
        if 'BT' in item:
            if item['BT'] == 'a':
                type = 'Journal/Article'
            elif item['BT'] == 'b':
                type = 'Book'
            elif item['BT'] == 'p':
                type = 'Conference Paper'
            else:
                type = str(['No information found'])
        else:
            type = str(['No information found'])
        if 'DOI' not in item:
            doi = str(['No information found'])
        else:
            doi = item['DOI']
        if 'PB' not in item:
            pb = str(['No information found'])
        else:
            pb = item['PB']
        resp_obj = {
            "entities": {
                "Search Engine": "Microsoft Academy",
                "Attributes found": "DOI, Title, URLs, Authors, Publication Name, Cited count, "
                                    "Affiliation name, Type, Published date, Abstract",
                "items": [{
                    "DOI": doi,
                    "Title": item['Ti'],
                    "URLs": urls,
                    "Authors": item['AA'][0]['AuN'],
                    "Publication Name": pb,
                    "ISSN": str(['No Information found']),
                    "Cited count": item['CC'],
                    "Affiliation": item['AA'][0]['DAfN'],
                    "Type": type,
                    "Published date": item['D'],
                    "Abstract": abs_new
                }]
            }
        }
        count += 1
        # append dict object data
        data.append(resp_obj)
        return count

    # title search
    if _title:
        url1 = 'https://api.labs.cognitive.microsoft.com/academic/v1.0/evaluate?expr=Ti=%27' + q + \
               '%27&model=latest&count=10&offset=0&attributes=DOI,Ti,Y,BT,D,W,PB,CC,AA.AuN,' \
               'AA.AuId,AA.DAfN,AA.AfN,S,AW&subscription-key=' + ms_api
        # response object
        response = requests.get(url1, headers=headers)
        soup = BeautifulSoup(response.content, 'lxml')
        obj = json.loads(soup.text)
        print('Searching in Microsoft Academic...')
        # set the counter for records count
        count = 0
        for i in tqdm(range(1)):
            # Find required attributes in the response object
            for item in obj['entities']:
                try:
                    count = _append_ms_item(item, count)
                except Exception as e:
                    exception_type, exception_object, exception_traceback = sys.exc_info()
                    filename = exception_traceback.tb_frame.f_code.co_filename
                    line_number = exception_traceback.tb_lineno
                    logger.writeError(e, None, _engine, logging_flag, filename, line_number)
            time.sleep(1)
        logger.writeRecords(query, None, _engine, count, count, logging_flag)
        print(f'Finished with total {count} records returned.')
        return data

    if not _from_yr:
        # keyword search
        if _keyword or _abstract:
            print('Searching in Microsoft Academic...')
            count = 0
            for i in tqdm(range(1)):
                url1 = 'https://api.labs.cognitive.microsoft.com/academic/v1.0/evaluate?expr=Composite(F.FN=%27' + q + \
                       '%27)&model=latest&count=' + str(records) + \
                       '&offset=0&attributes=DOI,Ti,Y,BT,D,W,PB,CC,AA.AuN,AA.AuId,AA.DAfN,AA.AfN,S,' \
                       'AW&subscription-key=' + ms_api
                # response object
                response = requests.get(url1, headers=headers)
                soup = BeautifulSoup(response.content, 'lxml')
                obj = json.loads(soup.text)
                # Find required attributes in the response object
                for item in obj['entities']:
                    try:
                        count = _append_ms_item(item, count)
                    except Exception as e:
                        exception_type, exception_object, exception_traceback = sys.exc_info()
                        filename = exception_traceback.tb_frame.f_code.co_filename
                        line_number = exception_traceback.tb_lineno
                        logger.writeError(e, None, _engine, logging_flag, filename, line_number)
                time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count, logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
    else:
        if _keyword or _abstract:
            print('Searching in Microsoft Academic...')
            count = 0
            for i in tqdm(range(1)):
                url1 = 'https://api.labs.cognitive.microsoft.com/academic/v1.0/evaluate?expr=And(Y=' + \
                       '[' + _from_yr + ',' + _to_yr_ + ']' + ',Composite(F.FN==%27' + q + '%27))' + \
                       '&model=latest&count=' + str(records) + \
                       '&offset=0&attributes=DOI,Ti,Y,BT,D,W,PB,CC,AA.AuN,AA.AuId,AA.DAfN,AA.AfN,S,' \
                       'AW&subscription-key=' + ms_api
                # response object
                response = requests.get(url1, headers=headers)
                soup = BeautifulSoup(response.content, 'lxml')
                obj = json.loads(soup.text)
                # Find required attributes in the response object
                for item in obj['entities']:
                    try:
                        count = _append_ms_item(item, count)
                    except Exception as e:
                        exception_type, exception_object, exception_traceback = sys.exc_info()
                        filename = exception_traceback.tb_frame.f_code.co_filename
                        line_number = exception_traceback.tb_lineno
                        logger.writeError(e, None, _engine, logging_flag, filename, line_number)
                time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count, logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
def search_springer(query, headers, _pages, records, _title, _keyword, _abstract, spr_api,
                    _search_yr, logging_flag, data):
    print('Searching in Springer...')
    if not _search_yr:
        count = 0
        for i in tqdm(range(1)):
            for i in range(_pages):
                url = 'http://api.springernature.com/meta/v2/json?q=' + query + '&s=' + str(i) + \
                      '&p=10&api_Key=' + spr_api
                # response object
                response = requests.get(url, headers=headers)
                soup = BeautifulSoup(response.content, 'lxml')
                obj = json.loads(soup.text)
                # Find required attributes in the response object
                for item in obj['records']:
                    if 'issn' in item:
                        issn = item['issn']
                    elif 'isbn' in item:
                        issn = item['isbn']
                    else:
                        issn = str(['No Information found'])
                    try:
                        resp_obj = {"entities": {
                            "Search Engine": "Springer Search Engine",
                            "Attributes found": "DOI, Title, URLs, Authors, Publication "
                                                "Name, ISSN, Type, Published date, Abstract",
                            "items": [
                                {"DOI": item['identifier'],
                                 "Title": item['title'],
                                 "URLs": item['url'][0]['value'],
                                 "Authors": item['creators'][0]['creator'],
                                 "Publication Name": item['publicationName'],
                                 "ISSN": issn,
                                 "Cited count": str(['No Information found']),
                                 "Affiliation": str(['No information found']),
                                 "Type": item['contentType'],
                                 "Published date": item['onlineDate'],
                                 "Abstract": item['abstract']
                                 }
                            ]}}
                        count += 1
                        # append dict object data
                        data.append(resp_obj)
                    except Exception as e:
                        exception_type, exception_object, exception_traceback = sys.exc_info()
                        filename = exception_traceback.tb_frame.f_code.co_filename
                        line_number = exception_traceback.tb_lineno
                        logger.writeError(e, None, _engine, logging_flag, filename, line_number)
            time.sleep(1)
        logger.writeRecords(query, None, _engine, count, count, logging_flag)
        print(f'Finished with total {count} records returned.')
        return data
    else:
        print("Date parameter either not supported or not available in Springer API!")
        return
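# --------------------------------------------------------------------------------------------
# Illustrative sketch (not part of the original module): the request URLs in this file are
# built by string concatenation, which breaks when the query contains '&', '#' or spaces.
# Passing a params dict lets requests do the encoding; the parameter names below mirror the
# Springer call above and the api_key value is a placeholder supplied by the caller.
import requests

def springer_page(query, page, api_key):
    return requests.get('http://api.springernature.com/meta/v2/json',
                        params={'q': query, 's': page, 'p': 10, 'api_Key': api_key},
                        timeout=30)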
def search_scopus(query, headers, _els_pages, records, _title, _keyword, _abstract, scp_api, _from_yr, _to_yr_, logging_flag, data):
    query = processInputQuery(query)
    if _title:
        url = 'https://api.elsevier.com/content/search/scopus?query=%22' + query + '%22&apiKey=' + scp_api
        # response object
        response = requests.get(url, headers=headers, timeout=30)
        soup = BeautifulSoup(response.content, 'lxml')
        # convert response into json
        obj = json.loads(soup.text)
        print('Searching in Elsevier Scopus...')
        # set the counter for records count
        count = 0
        for _ in tqdm(range(1)):
            # Find required attributes in the response object
            for item in obj['search-results']['entry']:
                try:
                    # prefer the electronic ISSN, fall back to the print ISSN
                    if 'prism:eIssn' in item:
                        issn = item['prism:eIssn']
                    elif 'prism:issn' in item:
                        issn = item['prism:issn']
                    else:
                        issn = str(['No information found'])
                    resp_obj = {
                        "entities": {
                            "Search Engine": "Elsevier SCOPUS Search Engine",
                            "Attributes found": "DOI, Title, URLs, Authors, Publication Name, ISSN, "
                                                "Cited count, Affiliation name, Type, "
                                                "Published date, Abstract",
                            "items": [{
                                "DOI": item['prism:doi'],
                                "Title": item['dc:title'],
                                "URLs": item['prism:url'],
                                "Authors": item['dc:creator'],
                                "Publication Name": item['prism:publicationName'],
                                "ISSN": issn,
                                "Cited count": item['citedby-count'],
                                "Affiliation": item['affiliation'][0]['affilname'],
                                "Type": item['subtypeDescription'],
                                "Published date": item['prism:coverDate'],
                                "Abstract": item['prism:publicationName']
                            }]
                        }
                    }
                    count += 1
                    # append dict object data
                    data.append(resp_obj)
                except Exception as e:
                    # raise e
                    exception_type, exception_object, exception_traceback = sys.exc_info()
                    filename = exception_traceback.tb_frame.f_code.co_filename
                    line_number = exception_traceback.tb_lineno
                    logger.writeError(e, None, _engine, logging_flag, filename, line_number)
        time.sleep(1)
        logger.writeRecords(query, None, _engine, count, count, logging_flag)
        print(f'Finished with total {count} records returned.')
        return data
    if not _from_yr:
        if _keyword or _abstract:
            rec = 0
            if _els_pages != 0:
                pages = pagination(_els_pages)
            else:
                pages = 1
            print('Searching in Elsevier Scopus...')
            # set the counter for records count
            count = 0
            for _ in tqdm(range(1)):
                for i in range(pages):
                    url = ('https://api.elsevier.com/content/search/scopus?query=' + query
                           + '&apiKey=' + scp_api + '&start=' + str(i) + '&count=10')
                    # response object
                    response = requests.get(url, headers=headers, timeout=30)
                    soup = BeautifulSoup(response.content, 'lxml')
                    # convert response into json
                    obj = json.loads(soup.text)
                    # Find required attributes in the response object
                    for item in obj['search-results']['entry']:
                        try:
                            # prefer the electronic ISSN, fall back to the print ISSN
                            if 'prism:eIssn' in item:
                                issn = item['prism:eIssn']
                            elif 'prism:issn' in item:
                                issn = item['prism:issn']
                            else:
                                issn = str(['No information found'])
                            resp_obj = {
                                "entities": {
                                    "Search Engine": "Elsevier SCOPUS Search Engine",
                                    "Attributes found": "DOI, Title, URLs, Authors, Publication Name, ISSN, "
                                                        "Cited count, Affiliation name, Type, "
                                                        "Published date, Abstract",
                                    "items": [{
                                        "DOI": item['prism:doi'],
                                        "Title": item['dc:title'],
                                        "URLs": item['prism:url'],
                                        "Authors": item['dc:creator'],
                                        "Publication Name": item['prism:publicationName'],
                                        "ISSN": issn,
                                        "Cited count": item['citedby-count'],
                                        "Affiliation": item['affiliation'][0]['affilname'],
                                        "Type": item['subtypeDescription'],
                                        "Published date": item['prism:coverDate'],
                                        "Abstract": item['prism:publicationName']
                                    }]
                                }
                            }
                            count += 1
                            # append dict object data
                            data.append(resp_obj)
                        except Exception as e:
                            # raise e
                            exception_type, exception_object, exception_traceback = sys.exc_info()
                            filename = exception_traceback.tb_frame.f_code.co_filename
                            line_number = exception_traceback.tb_lineno
                            logger.writeError(e, None, _engine, logging_flag, filename, line_number)
            time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count, logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
    else:
        if _keyword or _abstract:
            print('Searching in Elsevier Scopus...')
            # set the counter for records count
            count = 0
            for _ in tqdm(range(1)):
                # first request only reads the total number of results
                url = ('https://api.elsevier.com/content/search/scopus?query=' + query
                       + '&apiKey=' + scp_api + '&date=' + _from_yr + '-' + _to_yr_
                       + '&start=0&count=10')
                # response object
                response = requests.get(url, headers=headers, timeout=30)
                soup = BeautifulSoup(response.content, 'lxml')
                # convert response into json
                obj = json.loads(soup.text)
                rec = obj['search-results']['opensearch:totalResults']
                if _els_pages != 0:
                    pages = pagination(_els_pages)
                else:
                    pages = pagination(rec)
                # cap the number of result pages fetched
                if int(pages) > 1000:
                    pages = 100
                for i in range(pages):
                    url = ('https://api.elsevier.com/content/search/scopus?query=' + query
                           + '&apiKey=' + scp_api + '&date=' + _from_yr + '-' + _to_yr_
                           + '&start=' + str(i) + '&count=10')
                    # response object
                    response = requests.get(url, headers=headers, timeout=30)
                    soup = BeautifulSoup(response.content, 'lxml')
                    # convert response into json
                    obj = json.loads(soup.text)
                    # Find required attributes in the response object
                    for item in obj['search-results']['entry']:
                        try:
                            # prefer the electronic ISSN, fall back to the print ISSN
                            if 'prism:eIssn' in item:
                                issn = item['prism:eIssn']
                            elif 'prism:issn' in item:
                                issn = item['prism:issn']
                            else:
                                issn = str(['No information found'])
                            resp_obj = {
                                "entities": {
                                    "Search Engine": "Elsevier SCOPUS Search Engine",
                                    "Attributes found": "DOI, Title, URLs, Authors, Publication Name, ISSN, "
                                                        "Cited count, Affiliation name, Type, "
                                                        "Published date, Abstract",
                                    "items": [{
                                        "DOI": item['prism:doi'],
                                        "Title": item['dc:title'],
                                        "URLs": item['prism:url'],
                                        "Authors": item['dc:creator'],
                                        "Publication Name": item['prism:publicationName'],
                                        "ISSN": issn,
                                        "Cited count": item['citedby-count'],
                                        "Affiliation": item['affiliation'][0]['affilname'],
                                        "Type": item['subtypeDescription'],
                                        "Published date": item['prism:coverDate'],
                                        "Abstract": item['prism:publicationName']
                                    }]
                                }
                            }
                            count += 1
                            # append dict object data
                            data.append(resp_obj)
                        except Exception as e:
                            # raise e
                            exception_type, exception_object, exception_traceback = sys.exc_info()
                            filename = exception_traceback.tb_frame.f_code.co_filename
                            line_number = exception_traceback.tb_lineno
                            logger.writeError(e, None, _engine, logging_flag, filename, line_number)
            time.sleep(1)
            logger.writeRecords(query, None, _engine, rec, count, logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
def search_PlosOne(query, headers, _pages, records, _title, _keyword, _abstract, _from_yr, _to_yr_, logging_flag, data):
    if _title:
        print('Searching in PLOS ONE...')
        # set the counter for records count
        count = 0
        # search_PlosOne_title(query)
        url = 'http://api.plos.org/search?q=title:' + query + '&start=1&rows=' + str(records)
        # response object
        response = requests.get(url, headers=headers)
        soup = BeautifulSoup(response.content, 'html.parser')
        obj = json.loads(soup.text)
        try:
            for _ in tqdm(range(1)):
                # Find required attributes in the response object
                for item in obj['response']['docs']:
                    try:
                        resp_obj = {"entities": {"Search Engine": "PLOS Engine",
                                                 "Attributes found": "DOI, Title, URLs, Authors, ISSN, Type, "
                                                                     "Published date, Abstract",
                                                 "items": [
                                                     {"DOI": item['id'],
                                                      "Title": item['title_display'],
                                                      "URLs": 'https://doi.org/' + item['id'],
                                                      "Authors": item['author_display'],
                                                      "Publication Name": str(['No information found']),
                                                      # "Publication Name": item['publisher'],
                                                      "ISSN": item['eissn'],
                                                      "Cited count": str(['No information found']),
                                                      "Affiliation": str(['No information found']),
                                                      "Type": item['article_type'],
                                                      "Published date": str(item['publication_date']).split('T', -1)[0],
                                                      "Abstract": str(item['abstract']).strip().replace('\n', '').replace(' ', '')
                                                      }
                                                 ]}}
                        count += 1
                        # append dict object data
                        data.append(resp_obj)
                    except Exception as e:
                        # raise e
                        exception_type, exception_object, exception_traceback = sys.exc_info()
                        filename = exception_traceback.tb_frame.f_code.co_filename
                        line_number = exception_traceback.tb_lineno
                        logger.writeError(e, None, _engine, logging_flag, filename, line_number)
            time.sleep(1)
            logger.writeRecords(query, None, _engine, count, count, logging_flag)
            print(f'Finished with total {count} records returned.')
            return data
        except Exception as e:
            # raise e
            time.sleep(1)
            print('Some error happened in PLOS Engine!')
    if not _from_yr:
        if _keyword or _abstract:
            print('Searching in PLOS ONE...')
            _rec = round(float(records))
            # set the counter for records count
            count = 0
            try:
                for _ in tqdm(range(1)):
                    url = 'http://api.plos.org/search?q=' + query + '&start=1&rows=' + str(_rec)
                    # response object
                    response = requests.get(url, headers=headers)
                    soup = BeautifulSoup(response.content, 'html.parser')
                    obj = json.loads(soup.text)
                    # Find required attributes in the response object
                    for item in obj['response']['docs']:
                        try:
                            resp_obj = {"entities": {"Search Engine": "PLOS Engine",
                                                     "Attributes found": "DOI, Title, URLs, Authors, ISSN, Type, "
                                                                         "Published date, Abstract",
                                                     "items": [
                                                         {"DOI": item['id'],
                                                          "Title": item['title_display'],
                                                          "URLs": 'https://doi.org/' + item['id'],
                                                          "Authors": item['author_display'],
                                                          "Publication Name": str(['No information found']),
                                                          # "Publication Name": item['publisher'],
                                                          "ISSN": item['eissn'],
                                                          "Cited count": str(['No information found']),
                                                          "Affiliation": str(['No information found']),
                                                          "Type": item['article_type'],
                                                          "Published date": str(item['publication_date']).split('T', -1)[0],
                                                          "Abstract": str(item['abstract']).strip().replace('\n', '').replace(' ', '')
                                                          }
                                                     ]}}
                            count += 1
                            # append dict object data
                            data.append(resp_obj)
                        except Exception as e:
                            # raise e
                            exception_type, exception_object, exception_traceback = sys.exc_info()
                            filename = exception_traceback.tb_frame.f_code.co_filename
                            line_number = exception_traceback.tb_lineno
                            logger.writeError(e, None, _engine, logging_flag, filename, line_number)
                time.sleep(1)
                logger.writeRecords(query, None, _engine, count, count, logging_flag)
                print(f'Finished with total {count} records returned.')
                return data
            except Exception as e:
                # raise e
                pass
    else:
        if _keyword or _abstract:
            print('Searching in PLOS ONE...')
            _rec = round(float(records))
            # set the counter for records count
            count = 0
            try:
                for _ in tqdm(range(1)):
                    url = ('http://api.plos.org/search?q=' + query
                           + ' AND publication_date:[' + _from_yr + '-01-01T00:00:00Z TO '
                           + _to_yr_ + '-12-31T23:59:59Z]'
                           + '&start=1&rows=' + str(_rec))
                    # response object
                    response = requests.get(url, headers=headers)
                    soup = BeautifulSoup(response.content, 'html.parser')
                    obj = json.loads(soup.text)
                    # Find required attributes in the response object
                    for item in obj['response']['docs']:
                        try:
                            resp_obj = {"entities": {"Search Engine": "PLOS Engine",
                                                     "Attributes found": "DOI, Title, URLs, Authors, ISSN, Type, "
                                                                         "Published date, Abstract",
                                                     "items": [
                                                         {"DOI": item['id'],
                                                          "Title": item['title_display'],
                                                          "URLs": 'https://doi.org/' + item['id'],
                                                          "Authors": item['author_display'],
                                                          "Publication Name": str(['No information found']),
                                                          # "Publication Name": item['publisher'],
                                                          "ISSN": item['eissn'],
                                                          "Cited count": str(['No information found']),
                                                          "Affiliation": str(['No information found']),
                                                          "Type": item['article_type'],
                                                          "Published date": str(item['publication_date']).split('T', -1)[0],
                                                          "Abstract": str(item['abstract']).strip().replace('\n', '').replace(' ', '')
                                                          }
                                                     ]}}
                            count += 1
                            # append dict object data
                            data.append(resp_obj)
                        except Exception as e:
                            # raise e
                            exception_type, exception_object, exception_traceback = sys.exc_info()
                            filename = exception_traceback.tb_frame.f_code.co_filename
                            line_number = exception_traceback.tb_lineno
                            logger.writeError(e, None, _engine, logging_flag, filename, line_number)
                time.sleep(1)
                logger.writeRecords(query, None, _engine, count, count, logging_flag)
                print(f'Finished with total {count} records returned.')
                return data
            except Exception as e:
                # raise e
                exception_type, exception_object, exception_traceback = sys.exc_info()
                filename = exception_traceback.tb_frame.f_code.co_filename
                line_number = exception_traceback.tb_lineno
                logger.writeError(e, None, _engine, logging_flag, filename, line_number)
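
# --- Illustrative sketch (not part of the original flow) ---------------------
# The PLOS requests above concatenate the Solr query and the
# 'AND publication_date:[... TO ...]' filter directly into the URL, so spaces
# and brackets are not URL-encoded. Below is a minimal sketch of the same
# request built with requests' `params`. The parameter names (q, start, rows)
# and the date-filter format are copied from the URLs above; the helper name
# `_plos_search` is hypothetical.
import requests


def _plos_search(query, rows, headers, from_yr=None, to_yr=None):
    # Fetch PLOS search results as parsed JSON, optionally restricted to a
    # publication-year range, letting requests handle the URL encoding.
    q = query
    if from_yr and to_yr:
        q += (' AND publication_date:[' + from_yr + '-01-01T00:00:00Z TO '
              + to_yr + '-12-31T23:59:59Z]')
    params = {'q': q, 'start': 1, 'rows': rows}
    response = requests.get('http://api.plos.org/search',
                            params=params, headers=headers, timeout=30)
    return response.json()
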