def pdf_url(self, phrase):
    """Search Google Scholar for *phrase* (usually a title) and return a PDF url.

    The top search hit is inspected first.  When it has no PDF link, or the
    link is blacklisted, and the hit carries a cluster id, the articles of
    that cluster are scanned and the first usable PDF link is taken instead.

    Returns the PDF url, which may be None.  None is also returned when the
    search yields no articles at all; ``self.on_blocked()`` is called in
    that case.

    NOTE(review): when the top hit's link is blacklisted and the cluster
    scan finds no replacement, that blacklisted link is what gets returned
    -- confirm this is intended.

    TODO: Actually check if the file can be downloaded; on error, continue
    with the next candidate.
    """
    # Initial search: exact phrase, a single result (--phrase "<phrase>" -c 1).
    search = SearchScholarQuery()
    search.set_phrase(phrase)
    search.set_num_page_results(1)
    self.querier.send_query(search)

    if not self.querier.articles:
        self.status.warning('No search results. Blocked maybe?')
        # TODO: Open result page in a browser (to answer the captcha)
        self.on_blocked()
        return None

    self.timeout = TIMEOUT

    # Candidate taken from the top hit.
    top_hit = self.querier.articles[0]
    found = strip_url(top_hit.attrs['url_pdf'][0])

    self.status.result('Title', top_hit.attrs['title'][0])
    self.status.result('Year', top_hit.attrs['year'][0])
    self.status.result('PDF', found)

    if found is None or is_blacklisted(found):
        self.status.result('URL', top_hit.attrs['url'][0])

        # The article exists but has no usable PDF: try the other versions
        # grouped under the same cluster.
        cluster_id = top_hit.attrs['cluster_id'][0]
        if cluster_id is not None:
            self.querier.send_query(ClusterScholarQuery(cluster=cluster_id))

            for version in self.querier.articles:
                candidate = strip_url(version.attrs['url_pdf'][0])
                if candidate is None or is_blacklisted(candidate):
                    continue

                # First usable link wins; report it and stop scanning.
                found = candidate
                self.status.result('Title', version.attrs['title'][0])
                self.status.result('Year', version.attrs['year'][0])
                self.status.result('PDF', found)
                break

    # `found` may still be None at this point.
    if is_book(found) or is_book(top_hit.attrs['url'][0]):
        self.status.warning('Might be a book')

    return found
def setCitationByTitle(paper_title, author="Si Chen"):
    """Build a one-result Scholar search query for *paper_title*.

    Despite the name, nothing is sent and no citation is stored: the
    function only configures a ``SearchScholarQuery`` and hands it back, so
    the caller is responsible for passing it to a querier.

    Args:
        paper_title: phrase to search for (set via ``set_phrase``).
        author: author filter for the query.  Defaults to "Si Chen", the
            value that used to be hard-coded, so existing callers are
            unaffected.

    Returns:
        The configured ``SearchScholarQuery`` instance.
    """
    query = SearchScholarQuery()
    query.set_author(author)
    query.set_phrase(paper_title)
    query.set_num_page_results(1)
    return query
def get_url(querier, phrase):
    """Return the 'url' attribute of the first Scholar hit for *phrase*.

    Args:
        querier: object exposing ``send_query(query)`` and an ``articles``
            sequence that is filled by the query.
        phrase: title / phrase to search for.

    Returns:
        The url of the first article, or None when there are no articles or
        the lookup fails.
    """
    query = SearchScholarQuery()
    query.set_phrase(phrase)
    # Restrict the search to titles only.
    query.set_scope(True)
    query.set_num_page_results(1)

    try:
        querier.send_query(query)
        for art in querier.articles:
            # Each attribute is a 3-item entry; the first item is the value.
            url, _, _ = art.attrs['url']
            return url
    except Exception:
        # Best effort: any lookup failure is reported as "no url".  Unlike a
        # bare ``except:``, this lets KeyboardInterrupt and SystemExit
        # propagate so a long-running batch can still be interrupted.
        return None

    # The query succeeded but produced no articles.
    return None
def get_results_for(title, author):
    """Fetch the BibTeX entry for a paper and convert it to a printable reference.

    A title-scoped, single-result Scholar query is sent with the citation
    format set to BibTeX.  The first article's citation is printed and then
    parsed:

    * the first line (``@<type>{<id>,``) yields the entry type and id;
    * the module-level callable named after the entry type is looked up, and
      its positional parameters after the first are taken as the field
      names the reference needs;
    * ``name = {value}`` lines supply those fields.

    The function is interactive.  If the entry type has no matching callable
    in this module, the user is asked for another type.  For every field
    that is missing the user is asked whether to enter it; the prompt cycle
    repeats until all fields are present, so answering anything other than
    "Y" only causes the question to be asked again on the next pass.

    Args:
        title: phrase to search for.
        author: author filter for the query.

    Returns:
        ``<type>(<id>, *field_values).__getprintable__(True)`` for the first
        article, or None when the query returned no articles.

    Raises:
        IndexError: if the first line of the citation contains no "{".
    """
    query = SearchScholarQuery()
    query.set_author(author)
    query.set_phrase(title)
    query.set_num_page_results(1)
    query.set_scope(True)

    settings = ScholarSettings()
    settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX)

    querier = ScholarQuerier()
    querier.apply_settings(settings)
    querier.send_query(query)

    thismodule = sys.modules[__name__]

    # Only the first article is ever processed: the loops below leave this
    # function exclusively through ``return``.
    for art in querier.articles:
        citation = art.as_citation()
        print(citation)

        bibtex_lines = citation.split("\n")
        # Header looks like "@article{key,": drop the "@" and the trailing
        # character, then split the type from the id.
        header = bibtex_lines.pop(0)[1:-1].split("{")
        reftype = header[0].lower()
        refid = header[1].lower()

        # Resolve the entry type to a callable defined in this module,
        # asking the user for a replacement until one is found.
        while True:
            try:
                type_code = getattr(thismodule, reftype).func_code
                break
            except AttributeError:
                reftype = raw_input("Type " + reftype + " not recongised, please enter a known type: ")
        # Skip the first parameter: it receives the reference id.
        features_of_type = type_code.co_varnames[1:type_code.co_argcount]

        while True:
            # One slot per required field.  Sizing the lists from the field
            # list (rather than a fixed 9) keeps types with nine or more
            # fields from overflowing or from having no None left to find.
            arranged_name = [None] * len(features_of_type)
            arranged_value = [None] * len(features_of_type)

            for line in bibtex_lines:
                if "=" not in line:
                    continue
                # Split on the first "=" only so that values which contain
                # "=" themselves (e.g. URLs) are not truncated.
                stored_name, _, stored_value = line.partition("=")
                stored_name = stored_name.strip()
                stored_value = stored_value.strip()
                # Keep the text between the first character (expected to be
                # the opening brace) and the last "}"; no "}" means no value.
                closing = stored_value.rfind("}")
                stored_value = stored_value[1:closing] if closing != -1 else ""

                if stored_name in features_of_type:
                    slot = features_of_type.index(stored_name)
                    arranged_name[slot] = stored_name
                    arranged_value[slot] = stored_value

            if None not in arranged_name:
                # Every required field is present, in parameter order.
                return getattr(thismodule, reftype)(refid, *arranged_value).__getprintable__(True)

            # Offer to fill in each missing field; accepted values are added
            # as synthetic BibTeX lines and picked up by the next pass.
            for feature in features_of_type:
                if feature not in arranged_name:
                    var = raw_input(feature + " is not provided by the retrieved bibtex entry. Would you like to enter it now? \n(Y) or (N)")
                    if var == "Y":
                        var = raw_input("Enter value for " + feature + ": ")
                        bibtex_lines.append(feature + " = {" + var + "}")

    # No articles were returned by the query.
    return None
query = SearchScholarQuery() query.set_scope(True) alldata=[] counter=1 xlsxfile=sys.argv[1] wb = load_workbook(xlsxfile, use_iterators=True) print wb.get_sheet_names() ws = wb.get_sheet_by_name('Qatar_Scopus') for row in ws.iter_rows(row_offset=1): if row[0].value is not None: temp=[] title=row[0].value.encode("utf-8") query.set_phrase(title) query.set_num_page_results(1) querier.send_query(query) x=onecsv(querier) if (x!=None): year=x.split("|")[2] numcit=x.split("|")[3] weburl=x.split("|")[1] else: year=None;numcit=None;weburl=None; temp.append(title),temp.append(numcit);temp.append(year);temp.append(weburl); alldata.append(temp) print "Title: ",title,"query number: ",counter,"No. citations: ",numcit print "sleeeping for 5 seconds" sleep(5) counter=counter+1