import json

from scholarly import scholarly

def fetch_citations(author, filesave="citations.json", proxy="", proxy_list=""):
    """Fetch citations from Google Scholar using scholarly."""
    if proxy != "":
        print("Setting up proxy ", proxy)
        scholarly.use_proxy(scholarly.SingleProxy(http=proxy, https=proxy))
    if proxy_list != "":
        with open(proxy_list, 'r') as f:
            lproxies = f.readlines()

        def proxy_gen():
            if proxy_gen.counter >= len(lproxies):
                raise IndexError("We ran out of proxies...")
            proxy = lproxies[proxy_gen.counter].strip()  # drop the trailing newline
            if not proxy.startswith("http"):
                proxy = "http://" + proxy
            proxy_gen.counter += 1
            return proxy

        proxy_gen.counter = 0
        scholarly.use_proxy(proxy_gen)

    print("Looking up " + author)
    search = scholarly.search_author(author)
    author = scholarly.fill(next(search))
    publications = []
    for i, pub in enumerate(author['publications']):
        cites = pub['num_citations']  # often this gets messed up upon .fill()
        if "pub_year" in pub['bib']:
            pubyear = pub['bib']["pub_year"]  # also this gets messed up upon .fill()
            pub = scholarly.fill(pub)
            pub['bib']["pub_year"] = pubyear
        else:
            pub = scholarly.fill(pub)
            if "pub_year" not in pub['bib']:
                # Skip publications that really don't have a year;
                # they are probably junk picked up by the search robot.
                continue
        pub['num_citations'] = cites
        print("Fetching: " + str(i) + "/" + str(len(author['publications'])) +
              ": " + pub['bib']["title"] + " (" + str(pub['bib']["pub_year"]) + ")")
        pub['bib'].pop("abstract", None)
        pub.pop("source", None)
        publications.append(pub)
    with open(filesave, "w") as f:
        f.write(json.dumps(publications))
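# Hedged usage sketch for fetch_citations above; the author name and
# output file are illustrative, not from the original source.
fetch_citations("Albert Einstein", filesave="einstein.json")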
def get_new_proxy():
    proxy_works = False
    while not proxy_works:
        proxy = FreeProxy(country_id=["US"], rand=True, timeout=1).get()
        proxy_works = scholarly.use_proxy(http=proxy, https=proxy)
    print("Found new proxy!")
    return proxy
def set_new_proxy():
    while True:
        proxy = FreeProxy(rand=True, timeout=1).get()
        proxy_works = scholarly.use_proxy(http=proxy, https=proxy)
        if proxy_works:
            break
    return proxy
def set_new_proxy(text=True):
    """
    Reset the identity using FreeProxy.

    Parameters
    ----------
    arg1 [OPTIONAL] | text : bool
        If True, print the working proxy address before returning it.

    Returns
    -------
    Address
        fp.fp.FreeProxy
    """
    while True:
        # call the FreeProxy object
        proxy = FreeProxy(rand=True, timeout=1).get()
        # allocate the proxy address to scholarly
        proxy_works = scholarly.use_proxy(http=proxy, https=proxy)
        # check if the IP address works
        if proxy_works:
            # break out of the loop
            break
    # print the IP address depending on the text argument
    if text:
        # print the working IP
        print("Working proxy:", proxy)
    # return the proxy details
    return proxy
def _set_new_proxy(self):
    while True:
        proxy = FreeProxy(rand=True, timeout=1).get()
        proxy_works = scholarly.use_proxy(http=proxy, https=proxy)
        if proxy_works:
            break
    print("Working proxy:", proxy)
    return proxy
def set_new_proxy():
    while True:
        proxy = FreeProxy().get()
        # The original hardcoded "http://123.179.163.100:53954" here and
        # ignored the freshly fetched proxy; use the fetched one instead.
        proxy_works = scholarly.use_proxy(http=proxy, https=proxy)
        if proxy_works:
            break
    print("Working proxy:", proxy)
    return proxy
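# The helpers above target an older scholarly API in which
# scholarly.use_proxy() accepted http=/https= keywords and returned a
# success flag. A minimal sketch of the same rotation loop against the
# newer ProxyGenerator-based API (assumes scholarly >= 1.0 and the
# free-proxy package; the helper name is illustrative):
from fp.fp import FreeProxy
from scholarly import scholarly, ProxyGenerator

def set_new_proxy_pg():
    """Keep drawing free proxies until scholarly accepts one."""
    while True:
        proxy = FreeProxy(rand=True, timeout=1).get()
        pg = ProxyGenerator()
        # SingleProxy reports whether the proxy could be set up
        if pg.SingleProxy(http=proxy, https=proxy):
            scholarly.use_proxy(pg)
            print("Working proxy:", proxy)
            return proxy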
def proxy(self):
    proxy_works = scholarly.use_proxy(
        http="http://29ea0d9d66134811b51ead72601a1181:@proxy.crawlera.com:8010/"
    )
    print(proxy_works)
    test_query = scholarly.search_pubs(
        'Perception of physical stability and center of mass of 3D objects'
    )
    print(test_query)
def setUp(self):
    proxy_generator = ProxyGenerator()
    if "CONNECTION_METHOD" in scholarly.env:
        self.connection_method = os.getenv("CONNECTION_METHOD")
    else:
        self.connection_method = "none"
    if self.connection_method == "tor":
        tor_sock_port = None
        tor_control_port = None
        tor_password = "******"
        # Tor uses port 9050 as the default socks port;
        # on Windows it is 9150 for socks and 9151 for control.
        if sys.platform.startswith("linux") or sys.platform.startswith("darwin"):
            tor_sock_port = 9050
            tor_control_port = 9051
        elif sys.platform.startswith("win"):
            tor_sock_port = 9150
            tor_control_port = 9151
        proxy_generator.Tor_External(tor_sock_port, tor_control_port, tor_password)
        scholarly.use_proxy(proxy_generator)
    elif self.connection_method == "tor_internal":
        if sys.platform.startswith("linux"):
            tor_cmd = 'tor'
        elif sys.platform.startswith("win"):
            tor_cmd = 'tor.exe'
        proxy_generator.Tor_Internal(tor_cmd=tor_cmd)
        scholarly.use_proxy(proxy_generator)
    elif self.connection_method == "luminati":
        scholarly.set_retries(10)
        proxy_generator.Luminati(usr=os.getenv("USERNAME"),
                                 passwd=os.getenv("PASSWORD"),
                                 proxy_port=os.getenv("PORT"))
        scholarly.use_proxy(proxy_generator)
    elif self.connection_method == "freeproxy":
        proxy_generator.FreeProxies()
        scholarly.use_proxy(proxy_generator)
    else:
        scholarly.use_proxy(None)
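# The setUp above selects its proxy backend from the CONNECTION_METHOD
# environment variable. A minimal sketch of driving that switch before
# launching the suite (the values mirror the branches above):
import os

os.environ["CONNECTION_METHOD"] = "freeproxy"  # or "tor", "tor_internal", "luminati", "none"
# then run, e.g.: python -m unittest  (the test module name depends on the project)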
def get_research_articles(self, max_num):
    # Search string for Google Scholar to look for.
    # e.g. "{self.title} {self.director.name}" equates to
    # "Concussion Peter Landesman" for the movie Concussion.
    search_str = f'{self.title} {self.director.name}'
    output = ""
    try:
        pg = ProxyGenerator()
        ip = os.environ['PROXY_IP']
        pg.SingleProxy(http=ip, https=ip)
        scholarly.use_proxy(pg)
        search_query = scholarly.search_pubs(search_str)
        for i in range(0, max_num):
            curr = next(search_query)
            # For debugging purposes, this is how you pretty-print the
            # search query's contents:
            # scholarly.pprint(curr)

            # Grab the title of the article.
            title = curr['bib']['title']
            # Begin the formatted HTML output for this article.
            output += "<li>"
            # If a publication URL (curr['pub_url']) exists, add an
            # external link to it.
            if 'pub_url' in curr:
                output += f"<a target='_blank' href=\"{curr['pub_url']}\">{title}</a>"
            else:
                output += title
            output += "<br>"
            # Write the abstract (curr['bib']['abstract']) if it exists.
            if 'bib' in curr and 'abstract' in curr['bib']:
                output += f"<p>{curr['bib']['abstract']}</p>"
            output += "</li>"
    except Exception as e:
        # Useful for seeing errors in your terminal. Replace pass with
        # the print statement below.
        # print(sys.stderr, e)
        pass
    return output
def test_tor_launch_own_process(self):
    """
    Test that we can launch a Tor process
    """
    proxy_generator = ProxyGenerator()
    if sys.platform.startswith("linux"):
        tor_cmd = 'tor'
    elif sys.platform.startswith("win"):
        tor_cmd = 'tor.exe'
    tor_sock_port = random.randrange(9000, 9500)
    tor_control_port = random.randrange(9500, 9999)
    result = proxy_generator.Tor_Internal(tor_cmd, tor_sock_port, tor_control_port)
    self.assertTrue(result["proxy_works"])
    self.assertTrue(result["refresh_works"])
    self.assertEqual(result["tor_control_port"], tor_control_port)
    self.assertEqual(result["tor_sock_port"], tor_sock_port)
    # Check that we can issue a query as well
    query = 'Ipeirotis'
    scholarly.use_proxy(proxy_generator)
    authors = [a for a in scholarly.search_author(query)]
    self.assertGreaterEqual(len(authors), 1)
async def __call__(self):
    UserCancel = KeyboardInterrupt

    # region various embed types creation
    def publication_embeds(result) -> discord.Embed:
        embed = discord.Embed(
            title=result["bib"]["title"],
            description=result["bib"]["abstract"],
            url=result["eprint_url"]
            if "eprint_url" in result.keys()
            else result["pub_url"],
        )
        embed.add_field(
            name="Authors",
            value=", ".join(result["bib"]["author"]).strip(),
            inline=True,
        )
        embed.add_field(name="Publisher", value=result["bib"]["venue"], inline=True)
        embed.add_field(
            name="Publication Year", value=result["bib"]["pub_year"], inline=True
        )
        embed.add_field(
            name="Cited By",
            value=result["num_citations"] if "num_citations" in result.keys() else "0",
            inline=True,
        )
        embed.add_field(
            name="Related Articles",
            value=f'https://scholar.google.com{result["url_related_articles"]}',
            inline=True,
        )
        embed.set_footer(text=f"Requested by {self.ctx.author}")
        return embed

    def author_embeds(result) -> discord.Embed:
        embed = discord.Embed(title=result["name"])
        embed.add_field(
            name="Cited By", value=f"{result['citedby']} articles", inline=True
        )
        embed.add_field(name="Scholar ID", value=result["scholar_id"], inline=True)
        embed.add_field(
            name="Affiliation",
            value=result["affiliation"] if "affiliation" in result.keys() else "None",
            inline=True,
        )
        embed.add_field(
            name="Interests",
            value=f"{', '.join(result['interests']) if 'interests' in result.keys() else 'None'}",
            inline=True,
        )
        embed.set_image(url=result["url_picture"])
        embed.set_footer(text=f"Requested by {self.ctx.author}")
        return embed

    def citation_embeds(result) -> discord.Embed:
        embed = discord.Embed(
            title=result["bib"]["title"],
            description=f"```{scholarly.bibtex(result)}```",
            url=result["eprint_url"]
            if "eprint_url" in result.keys()
            else result["pub_url"],
        )
        embed.set_footer(text=f"Requested by {self.ctx.author}")
        return embed
    # endregion

    try:
        # region user flags processing
        pg = ProxyGenerator()
        proxy = FreeProxy(rand=True, timeout=1, country_id=["BR"]).get()
        pg.SingleProxy(http=proxy, https=proxy)
        scholarly.use_proxy(pg)

        # self.args processing
        if self.args is None:
            # Pull the first five results from a single generator (the
            # original re-created the generator on every iteration,
            # which repeats the first result five times).
            search = scholarly.search_pubs(self.query)
            results = [next(search) for _ in range(5)]
            embeds = list(map(publication_embeds, results))
        elif "author" in self.args:
            search = scholarly.search_author(self.query)
            results = [next(search) for _ in range(5)]
            embeds = list(map(author_embeds, results))
        elif "cite" in self.args:
            # The original collected the generator object itself five
            # times; take the next five results instead.
            search = scholarly.search_pubs(self.query)
            results = [next(search) for _ in range(5)]
            embeds = list(map(citation_embeds, results))
        else:
            await self.message.edit(content="Invalid flag")
            return
        # endregion

        # sets the reactions for the search result
        if len(embeds) > 1:
            buttons = [[
                {Button(style=ButtonStyle.grey, label="◀️", custom_id="◀️"): None},
                {Button(style=ButtonStyle.red, label="🗑️", custom_id="🗑️"): None},
                {Button(style=ButtonStyle.grey, label="▶️", custom_id="▶️"): None},
            ]]
        else:
            buttons = [[
                Button(style=ButtonStyle.red, label="🗑️", custom_id="🗑️")
            ]]

        await Sudo.multi_page_system(self.bot, self.ctx, self.message,
                                     tuple(embeds), buttons)
        return

    except asyncio.TimeoutError:
        raise
    except (asyncio.CancelledError, discord.errors.NotFound):
        pass
    except scholarly_exceptions._navigator.MaxTriesExceededException:
        await self.message.edit(
            content="Google Scholar is currently blocking our requests. Please try again later"
        )
        Log.append_to_log(self.ctx, f"{self.ctx.command} error",
                          "MaxTriesExceededException")
        return
    except Exception as e:
        await error_handler(self.bot, self.ctx, e, self.query)
    finally:
        return
# Parse the author names
file_in = sys.argv[1]
authornames = []
with open(file_in, 'r') as f:
    for line in f:
        authornames.append(line.rstrip('\n'))

# Indicate what data to get
# (see the Author class in https://pypi.org/project/scholarly/)
sections = ['basics', 'indices']
max_homonyms = 5

# pip install free-proxy
from fp.fp import FreeProxy
proxy = FreeProxy(rand=True, timeout=1, country_id=['NO']).get()
scholarly.use_proxy(http=proxy, https=proxy)

# Loop through the authors
t0 = time.time()
data = []
for i, authname in enumerate(authornames):
    hindices = []
    emails, names, affiliations, citedbys = [], [], [], []
    try:
        search_query = scholarly.search_author(authname)
        for _ in range(max_homonyms):
            try:
                author = next(search_query)
                tmp_data = author.fill(sections=sections)
                hindices.append(tmp_data.hindex)
                emails.append(tmp_data.email)
import sys
import time
import pickle as pkl

import requests
from scholarly import scholarly
from stem import Signal
from stem.control import Controller
from tqdm import tqdm

proxies = {
    'http': 'socks5://127.0.0.1:9050',
    'https': 'socks5://127.0.0.1:9050'
}
scholarly.use_proxy(**proxies)

def refresh_socket():
    print(requests.get('https://ident.me', proxies=proxies).text)
    with Controller.from_port(port=9051) as c:
        c.authenticate()
        c.signal(Signal.NEWNYM)
    print(requests.get('https://ident.me', proxies=proxies).text)

# RL - reinforcement learning
# CF - catastrophic forgetting
# STS - semantic textual similarity
# NLI - natural language inference (same as recognizing textual entailment)
# MC - machine comprehension
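# Hedged usage sketch for the Tor setup above: rotate the exit node
# between queries so Scholar sees a fresh IP. Assumes a local Tor
# daemon with its control port on 9051, as configured above; the
# author names are illustrative.
for name in ['A Einstein', 'M Curie']:
    author = next(scholarly.search_author(name))
    print(author)
    refresh_socket()  # request a new Tor identity before the next query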
# FLAME GPU publications
flame_pubs = [
    'High performance cellular level agent-based simulation with FLAME for the GPU',
    'FLAME: simulating large populations of agents on parallel hardware architectures',
    'A high performance agent based modelling framework on graphics card hardware with CUDA',
    'Template-driven agent-based modeling and simulation with CUDA',
    'Simulating heterogeneous behaviours in complex systems on GPUs',
    'FLAME GPU technical report and user guide (CS-11-03)',
    'Resolving conflicts between multiple competing agents in parallel simulations'
]

# Free proxies get blocked
# proxy_generator = ProxyGenerator()
# proxy_generator.FreeProxies()
scholarly.use_proxy(None)

# Open files for dumping FLAME publication details
f_pubs = open("_data/publications.yml", "w")
f_cites = open("_data/citations.yml", "w")

all_pubs = []
all_cites = []
for paper_title in flame_pubs:
    results = scholarly.search_pubs(paper_title)
    pubs = [p for p in results]
    assert len(pubs) > 0  # Paper not found?
    print(f"Found '{paper_title}'.")
    # Fill by querying the site
    pub = scholarly.fill(pubs[0])
def set_proxy():
    if ALLOW_PROXY_ON_SCHOLAR:
        pg = ProxyGenerator()
        pg.SingleProxy(http_proxy, https_proxy)
        scholarly.use_proxy(pg)
from scholarly import scholarly, ProxyGenerator
import json
from dotenv import load_dotenv
from pathlib import Path
import os

env_path = Path('../') / '.env'
load_dotenv(dotenv_path=env_path)
SCRAPER = os.getenv("SCRAPER")

proxy_generator = ProxyGenerator()
proxy_generator.ScraperAPI(SCRAPER)
scholarly.set_timeout(60)
scholarly.use_proxy(proxy_generator)

search_query = scholarly.search_author('Maël Montévil')
author = scholarly.fill(next(search_query))
pubs = [
    scholarly.fill(pub) for pub in author['publications']
    if (pub['num_citations'] > 0)
]
pubs2 = [[pub, (list(scholarly.citedby(pub)))] for pub in pubs
         if 'citedby_url' in pub]
print(json.dumps(pubs2, indent=2, default=lambda o: '<not serializable>'))
from scholarly import scholarly, ProxyGenerator
from tqdm import tqdm
from yattag import Doc, indent
# Imports needed by the code below but missing from the original snippet:
from collections import defaultdict
import functools
import operator

# Settings
PEOPLE = [
    "James O'Shea", "Alex Saywell", "Philip Moriarty", "Peter Beton",
    "James Sharp"
]
OUTPUT_DIR = "D:/Nano Group Page/all_pubs"
MIN_YEAR = 1990

# Set up a proxy to avoid ignored requests.
# (The original passed pg.FreeProxies() straight into use_proxy; use_proxy
# expects the ProxyGenerator itself.)
pg = ProxyGenerator()
pg.FreeProxies()
scholarly.use_proxy(pg)

# Preallocate
# defaultdict creates entries if not already existing, so we can append.
pubs_by_year = defaultdict(list)
pubs = []

# Get all publications in an unordered list
for p in PEOPLE:
    search_query = scholarly.search_author(f'{p}, Nottingham')
    author = next(search_query)
    info = scholarly.fill(author, sections=['publications'])
    pubs.append(info["publications"])
pubs = functools.reduce(operator.iconcat, pubs, [])

# For every publication
from scholarly import scholarly, ProxyGenerator
import yaml

# something like: ssh -D 9050 -q -C -N [email protected]

# default values are shown below
proxies = {
    'http': 'socks5://127.0.0.1:9050',
    'https': 'socks5://127.0.0.1:9050'
}
pg = ProxyGenerator()
pg.SingleProxy(**proxies)
scholarly.use_proxy(pg)

# Retrieve the author's data, fill in, and print
# author = scholarly.search_author_id('4poYWhEAAAAJ')
search_query = scholarly.search_author('Vassil Vassilev')
while True:
    print("Iter")
    try:
        author = next(search_query).fill()
        if 'cern' in author.email:
            break
        # print(author)
    except StopIteration:
        break  # sys.exit(1)

print(author)
print("Titles")
from fp.fp import FreeProxy
from scholarly import scholarly

import numpy as np
import pandas as pd

def scrape_scholar(query, pages=0, max_proxy_tries=5, log_path='_log'):
    '''
    Name: scrape_scholar
    Description: Searches Google Scholar using query and returns data for the results.
    Input:
        @query: search term
        @pages: number of pages (10 articles per page) to request; 0 means no limit
        @max_proxy_tries: maximum number of proxy retries after failed fetches
        @log_path: suffix of the log file to create (documented in the original
                   but missing from its signature, so added here)
    Output: A pandas DataFrame with one paper per row
    '''
    generator = FreeProxy(rand=True)
    page_size = 10
    # create log file to write errors to
    log = open(f'{query}' + log_path + '.txt', 'w+')
    # list which will contain all article data and back the DataFrame
    rows = []
    # the number of the current result being pulled from Google Scholar
    index = 0
    num_tries = 0
    while num_tries < max_proxy_tries:
        # try-catch block that writes errors to the log file if they occur
        try:
            # proxy = generator.get()
            # print(proxy)
            # pg = ProxyGenerator()
            # pg.SingleProxy(http="http://157.245.203.17:3128")
            scholarly.use_proxy(None)
            # creates a generator object of results for the query
            results = scholarly.search_pubs(query)  # , start=0)
            # loop until the page limit is passed, if there is one
            while not pages or index < page_size * pages:
                result = next(results)  # retrieves the next result object
                curr_result = result.bib  # bibliographic data of the result
                # instantiate the current row container
                row = dict()
                # link to the article
                row['Link'] = curr_result['url'] if 'url' in curr_result else np.nan
                # title of the paper
                row['Title'] = curr_result['title'] if 'title' in curr_result else np.nan
                # True if pdf is available, False otherwise
                # row['Accessible'] = bool(paper['repositoryDocument']['pdfStatus'])
                # page number the paper would be on, assuming 10 papers per page
                row['Page number'] = index // page_size + 1
                # list of [initials last-name]
                row['Authors'] = curr_result['author'] if 'author' in curr_result else np.nan
                # publication year
                row['Publish year'] = int(curr_result['year']) if 'year' in curr_result else np.nan
                # number of citations
                row['Citations'] = curr_result['cites'] if 'cites' in curr_result else np.nan
                # link to related articles (the original indexed the generator
                # `results` here, which raises; use the current result, guarded
                # like the other fields)
                row['Related articles'] = (
                    'https://scholar.google.com/scholar?q=related:'
                    + curr_result['url_scholarbib'].split(':')[1]
                    + ':scholar.google.com/&scioq=' + query + '&hl=en&as_sdt=0,14'
                ) if 'url_scholarbib' in curr_result else np.nan
                # publisher, if available
                row['Publisher'] = curr_result['venue'] if 'venue' in curr_result else np.nan
                rows.append(row)
                index += 1
            # returns pandas DataFrame where each row is one paper
            return pd.DataFrame(rows)
        # write any errors to the log file
        except Exception as e:
            # log.write(str(e))
            # print(str(e))
            # traceback.print_exc(file=sys.stdout)
            # log.write('\n')
            if rows:
                return pd.DataFrame(rows)
            if str(e) == "Cannot fetch the page from Google Scholar.":
                num_tries += 1
                continue
            else:
                return pd.DataFrame(rows)
    # returns a partially filled DataFrame if all proxy tries failed
    return pd.DataFrame(rows)
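# Hedged usage sketch for scrape_scholar above; the query string and
# output path are illustrative, not from the original source.
df = scrape_scholar('agent-based simulation', pages=2)
print(df[['Title', 'Publish year', 'Citations']].head())
df.to_csv('scholar_results.csv', index=False)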