def main():
    """Read paper titles from PaperList.xlsx, look each one up on Google
    Scholar, and write the titles plus their BibTeX entries to
    bibtexFile.xlsx.

    Expects a sheet named 'Sheet1' with a 'PaperName' column.
    """
    # Path to the excel sheet containing the list of paper titles in the
    # second column, heading as 'PaperName'.
    pathToFile = "PaperList.xlsx"
    xl = pd.ExcelFile(pathToFile)
    df = xl.parse("Sheet1")
    bt = []
    f = df['PaperName']
    for i in range(f.size):
        a = f[i]
        # Replace non-breaking spaces, then drop remaining non-ASCII
        # characters so the title is safe to send as a query.
        x = a.replace(u'\xa0', ' ')
        args = x.encode('ascii', 'ignore')
        # args = "Detection of skin cancer by classification of Raman spectra"
        biblist = gs.query(args)
        print(biblist[0])
        k = biblist[0]
        # Collapse the BibTeX entry so it fits a single spreadsheet cell.
        k1 = k.replace(u'\n ', ' ')
        x1 = k1.encode('ascii', 'ignore')
        # BUG FIX: decode back to str before storing, otherwise the cell
        # holds a bytes repr like b'@article{...}' (extractr() already
        # does this decode).
        bt.append(x1.decode('utf-8'))
    df1 = pd.DataFrame({'bibtex': bt})
    f = pd.concat([df, df1], axis=1)
    # Create a Pandas Excel writer using XlsxWriter as the engine.
    writer = pd.ExcelWriter('bibtexFile.xlsx', engine='xlsxwriter')
    # Convert the dataframe to an XlsxWriter Excel object.
    f.to_excel(writer, sheet_name='Sheet1')
    # Get the xlsxwriter objects from the dataframe writer object.
    workbook = writer.book
    worksheet = writer.sheets['Sheet1']
    # BUG FIX: the workbook was never flushed to disk, so no output file
    # was produced (extractr() calls writer.save()).
    writer.save()
def extractr(filePath):
    """Look up every title in the 'PaperName' column of *filePath* on
    Google Scholar and save titles together with their BibTeX entries to
    bibtexFile.xlsx in the current working directory.
    """
    # Output is written next to wherever the script is run from.
    out_path = os.getcwd() + '/bibtexFile.xlsx'
    sheet = pd.ExcelFile(filePath).parse("Sheet1")
    entries = []
    titles = sheet['PaperName']
    for idx in range(titles.size):
        # Normalise non-breaking spaces and strip non-ASCII characters
        # before sending the title as a query.
        cleaned = titles[idx].replace(u'\xa0', ' ').encode('ascii', 'ignore')
        results = gs.query(cleaned)
        print(results[0])
        # Flatten the BibTeX record and force it back to a plain str so
        # the spreadsheet cell holds readable text.
        flattened = results[0].replace(u'\n ', ' ')
        entries.append(flattened.encode('ascii', 'ignore').decode('utf-8'))
    combined = pd.concat([sheet, pd.DataFrame({'bibtex': entries})], axis=1)
    # Create a Pandas Excel writer using XlsxWriter as the engine and
    # flush the merged frame to disk.
    writer = pd.ExcelWriter(out_path, engine='xlsxwriter')
    combined.to_excel(writer, sheet_name='Sheet1')
    writer.save()
def pull_info_from_gscholar(query, accepted_fields=None):
    """Look for entry in google scholar.

    Sends *query* to Google Scholar, takes the first BibTeX result and
    extracts the requested fields from it.

    Args:
        query: free-text search string sent to Google Scholar.
        accepted_fields: iterable of BibTeX field names to extract;
            defaults to a standard set (title, year, author, doi, ...).

    Returns:
        dict mapping each field found in the BibTeX string to its value.
    """
    import gscholar
    bibtex_string = gscholar.query(query)[0]
    if accepted_fields is None:
        accepted_fields = [
            'title', 'year', 'volume', 'number', 'pages', 'ISBN',
            'journal', 'publisher', 'month', 'author', 'doi'
        ]
    details = {}
    for field in accepted_fields:
        # GENERALIZED: tolerate optional whitespace around '=' (e.g.
        # "title = {...}"); the original pattern only matched "title={...}".
        # re.escape guards against regex metacharacters in field names.
        match = re.search(re.escape(field) + r'\s*=\s*{([^}]*)}',
                          bibtex_string)
        if match:
            details[field] = match.group(1)
    return details
def display_info(fpath):
    """Extract metadata for the PDF at *fpath*, enrich it with a BibTeX
    record from Google Scholar when possible, and insert a note into
    Evernote.

    Raises:
        IOError: if *fpath* does not exist.
    """
    if not os.path.exists(fpath):
        raise IOError('File Not Found')
    info = pdfinfo(fpath)
    guessed_data = ['title', 'author']
    # guess is True when title/author are missing from the PDF metadata
    # and must be inferred from the document text.
    guess = not all(guess in info for guess in guessed_data)
    info_from_text = extract_from_pdftext(fpath, guess)
    for k in info_from_text:
        # BUG FIX: "k is 'title'" compared object identity, which is not
        # guaranteed for equal strings; use equality instead.
        if k not in info or (k == 'title' and not is_title_like(info[k])):
            info[k] = info_from_text[k]
    # Prefer DOI for the Scholar query, then title, then abstract.
    query_string = info['doi'] if 'doi' in info else (
        info['title'] if 'title' in info else
        info['abstract'] if 'abstract' in info else None)
    bibtex = ''
    if query_string:
        # query google
        try:
            query_string = query_string.encode('ascii', 'replace')
            text = gscholar.query(query_string.decode('utf8', 'replace'),
                                  gscholar.FORMAT_BIBTEX, False)
            if text:
                text = text[0]  # assume the first one
                lines = text.splitlines()
                if 'doi' in info:
                    # Inject the known DOI right after the entry header.
                    lines.insert(1, ' doi={{{}}},'.format(info['doi']))
                bibtex = '\n'.join(lines)
        except urllib2.URLError:
            # Best effort: network failure simply leaves bibtex empty.
            pass
    if bibtex:
        update_info_with_bibtex(info, bibtex, guessed_data, True)
    notetitle = ''.join([info.get('title', 'Unknown Title'), ' - ',
                         info.get('author', 'Authors Unknown')])
    if guess:
        notetitle += ' METADATA NEEDS REVIEW'
    notebody = u''
    if 'abstract' in info:
        notebody += info['abstract'] + u'\n\n'
    if bibtex:
        notebody += bibtex.decode('utf8') + u'\n\n'
    insert_into_evernote(notetitle, notebody, fpath)
def ask_google_for_clean_citations():
    """Query Google Scholar for each title from
    get_list_of_titles_to_feed_google() and collect the raw results.

    Returns:
        list: one gscholar.query(...) result per successfully queried
        title.

    Notes:
        - On any exception the current title is skipped (not retried)
          after a long countdown_timer() pause -- presumably a Scholar
          rate-limit ban; TODO confirm the intent was skip vs retry.
        - After 25 queries a pause is inserted between the remaining
          queries (except after the last title) to avoid being banned.
    """
    washed_bibdata_from_google=list()
    counter = 0
    for dirty_title in get_list_of_titles_to_feed_google():
        try:
            print dirty_title
            counter += 1
            ##Note that the readme and documentation on github are wrong as
            ##of 1 Dec 2013. You have to feed 2 arguments to gscholar.py,
            ##instead of 1 argument as they suggest. See here for more
            ##details: http://stackoverflow.com/questions/13200709/extract-google-scholar-results-using-python-or-r
            washed_bibdata_from_google.append(gscholar.query(dirty_title,outformat=4))
        except Exception:
            # Wait out the (assumed) ban, then continue with the next title.
            print "\nYou have hit Google scholar with too many queries and they banned you. The program will now wait a few hours and try again."
            print "The relevant error message is \n"
            print traceback.format_exc()
            countdown_timer(9999,10000)
            pass
        # Self-imposed rate limiting once past 25 queries.
        if counter > 25:
            if not dirty_title == get_list_of_titles_to_feed_google()[-1]:
                countdown_timer(207,503)##you can turn this off if you only have a few citations (approx. 50 or fewer).
                ## Otherwise you have to rate limit yourself to avoid a ban.
    return washed_bibdata_from_google
def scholar_get(title, db):
    """Fetch BibTeX metadata for *title* from Google Scholar, caching the
    raw BibTeX string in *db* keyed by title.

    Args:
        title: paper title to search for.
        db: dict-like cache mapping title -> raw BibTeX string.

    Returns:
        dict with 'title', 'authors' (list of names), 'year' and 'bibtex';
        all fields empty when Scholar returns no result.
    """
    # print(gscholar.query("linked open data", allresults=True))
    if title not in db:
        query = gscholar.query(title)
        if len(query) < 1:
            return {"title": "", "authors": [], "year": "", "bibtex": ""}
        db[title] = query[0]
        # Randomised pause to avoid hammering Google Scholar.
        time.sleep(random.randint(0, 10))
    meta = {}
    parser = bibtexparser.bparser.BibTexParser()
    parser.customization = customizations
    raw_entry = bibtexparser.loads(db[title], parser=parser)
    entry = raw_entry.entries[0]
    meta["title"] = entry['title'].strip()
    # BUG FIX: splitting on the bare substring "and" broke author names
    # that contain it (e.g. "Alexander" -> "Alex", "er"). BibTeX separates
    # authors with the word " and ", so split on that and strip whitespace.
    meta["authors"] = [a.strip() for a in
                       entry['author'].replace(", ", " ").split(" and ")]
    meta["year"] = entry.get('year', "").strip()
    meta["bibtex"] = bibtexparser.dumps(raw_entry)
    return meta
def query_gscholar(self, query):
    """Return the raw Google Scholar results for *query*."""
    import gscholar
    return gscholar.query(query)
import gscholar
import os
import sys

# Take the current bibtex open on chrome and append it to the bibtex file

# Set up bibtex filepath
filepath = "/Users/janmeppe/Dropbox/School/Master/Master Thesis/tex/"
filename = "master-thesis.bib"
os.chdir(filepath)  # Change dir to filepath

# Get user input as query, request to gscholar and encode to UTF8
user_arg = sys.argv[1]
query_unicode = gscholar.query(user_arg)
query = [x.encode('UTF8') for x in query_unicode]

# Append bibtex output to prespecified path
with open(filename,'a') as out:
    for item in query:
        out.write("%s" % item)
        out.write("\n")  # such that the next item is well spaced
        # NOTE(review): original indentation was lost; the trailing
        # newline is assumed to be written once per item -- confirm.

# @article{nameYEARword, ... find cite_id between the first '{' and ','
cite_id = query[0].split(',', 1)[0].split('{',1)[1]
print "Citation added succefully!\n\cite{%s}\n\citeA{%s}" % (cite_id, cite_id)
# coding:utf-8 import gscholar items = gscholar.query('') gscholar.query("some author or title")
def main():
    """Command-line entry point: look up a PDF file or a free-text query
    on Google Scholar and print the matching citation(s), optionally
    renaming the PDF after the first result.
    """
    usage = 'Usage: %prog [options] {pdf | "search terms"}'
    parser = optparse.OptionParser(usage)
    parser.add_option("-a", "--all", action="store_true", dest="all",
                      default=False, help="show all bibtex results")
    parser.add_option("-d", "--debug", action="store_true", dest="debug",
                      default=False, help="show debugging output")
    parser.add_option("-r", "--rename", action="store_true", dest="rename",
                      default=False,
                      help="rename file (asks before doing it)")
    parser.add_option(
        "-f", "--outputformat", dest='output', default="bibtex",
        help=
        "Output format. Available formats are: bibtex, endnote, refman, wenxianwang [default: %default]"
    )
    parser.add_option("-s", "--startpage", dest='startpage',
                      help="Page number to start parsing PDF file at.")
    parser.add_option('-V', '--version', action='store_true',
                      help='Print version and quit.')
    (options, args) = parser.parse_args()
    if options.debug is True:
        logger.setLevel(logging.DEBUG)
    if options.version:
        print(gs.__VERSION__)
        return
    if options.output == 'bibtex':
        outformat = gs.FORMAT_BIBTEX
    elif options.output == 'endnote':
        outformat = gs.FORMAT_ENDNOTE
    elif options.output == 'refman':
        outformat = gs.FORMAT_REFMAN
    elif options.output == 'wenxianwang':
        outformat = gs.FORMAT_WENXIANWANG
    else:
        # BUG FIX: an unrecognised format previously fell through and
        # crashed later with NameError on 'outformat'; fail fast instead.
        parser.error("Unknown output format: {}".format(options.output))
    if len(args) != 1:
        parser.error("No argument given, nothing to do.")
        sys.exit(1)
    args = args[0]
    pdfmode = False
    if os.path.exists(args):
        # BUG FIX: restored the {filename} placeholder that had been lost
        # from the log message.
        logger.debug(
            "File exist, assuming you want me to lookup the pdf: {filename}.".
            format(filename=args))
        pdfmode = True
        # BUG FIX: the builtin 'all' was being passed instead of the
        # user's --all flag.
        biblist = gs.pdflookup(args, options.all, outformat,
                               options.startpage)
    else:
        logger.debug(
            "Assuming you want me to lookup the query: {query}".format(
                query=args))
        biblist = gs.query(args, outformat, options.all)
    if len(biblist) < 1:
        print("No results found, try again with a different query!")
        sys.exit(1)
    if options.all is True:
        logger.debug("All results:")
        for i in biblist:
            print(i)
    else:
        logger.debug("First result:")
        print(biblist[0])
    if options.rename is True:
        if not pdfmode:
            print(
                "You asked me to rename the pdf but didn't tell me which file to rename, aborting."
            )
            sys.exit(1)
        else:
            gs.rename_file(args, biblist[0])
######## Google Scholar import scholarly search_query_2 = scholarly.search_keyword('thermodynamics') keyword = next(search_query_2).fill() title2 = [pub.bib['title'] for pub in keyword.publications] title3 = [pub for pub in keyword.publications] ################################################################################ import gscholar gscholar.query("thermodynamics")
import bibtexparser
import gscholar
import time

# Re-query Google Scholar for every entry in ref.bib, keeping the
# original citation keys but taking Scholar's BibTeX fields.
with open('ref.bib') as bib:
    db = bibtexparser.load(bib)
ress = []
for it in (db.entries):
    title = it['title']
    id = it['ID']
    title = title.strip('{}')
    succ = False
    # NOTE(review): despite the while loop, the except branch breaks out
    # immediately, so a failed query is never actually retried -- the
    # whole run aborts below. Confirm whether retrying was intended.
    while not succ:
        try:
            res = gscholar.query(title)
            # Pause between queries to avoid a Scholar rate-limit ban.
            time.sleep(10)
            succ = True
        except Exception as e:
            print(e)
            # sleep(10)
            break
    # Stop processing entirely on the first failed title.
    if not succ:
        break
    # Parse Scholar's BibTeX and restore the original citation key.
    it_gs = bibtexparser.loads(res[0])
    it_gs = it_gs.entries[0]
    # from IPython import embed; embed()
    it_gs['ID'] = id
    ress.append(it_gs)
    print(it_gs)
    # break
# Look up a BibTeX entry for every PDF in the input directory.
for file in pdffiles:
    entry = gscholar.pdflookup(input_dir + file, allresults=False,
                               outformat=4)[0]
    bib_list.add(entry)
# Get bib entries for titles in papers.txt:
try:
    f = open(join(input_dir, 'papers.txt'), "r")
    papers = f.read().split("\n")
    # NOTE(review): missing parentheses -- this accesses the bound method
    # without calling it, so the file is never closed.
    f.close
except Exception as e:
    # papers.txt is optional: fall back to an empty list.
    papers = []
for p in papers:
    if p == "":
        continue
    try:
        entry = gscholar.query(p)[0]
        bib_list.add(entry)
    except IndexError as e:
        # No Scholar result for this line; record it for the failure log.
        print("Bad input line: {}".format(p))
        failed_list.append(p)
# Print bib file
if len(bib_list) > 0:
    with open(join(output_dir, "library.bib"), "w+") as f:
        f.write("The following entries have been generated by a script and therefore should be checked for accuracy.\nCreated by Ross Gales, https://github.com/rosscg/generate-bib\n\n")
        for l in bib_list:
            f.write(l)
        # NOTE(review): redundant -- the with-block already closes the file.
        f.close()
# Print failed lines to file
# NOTE(review): chunk is truncated here; the body of this if-statement is
# outside the visible source.
if len(failed_list) > 0:
def getGScholar(self):
    """If you are feeling lucky: look up self.doi on Google Scholar and
    store/return the first BibTeX hit (UTF-8 decoded).
    """
    first_hit = query(self.doi, 4)[0]
    self.bibtex = first_hit.decode('utf-8')
    return self.bibtex
path_reading_group = path_work + r'\Reading_group'
path_code = path_work + r'\Code\Etienne_repos\code_google_scholar'

# Collect the paper files stored in the reading-group folder.
list_papers = [f for f in os.listdir(path_reading_group)\
    if os.path.isfile(path_reading_group + r'\%s' %f)]
list_papers = list_papers[:-2] # exclude two files...

# txt_file = open(path_code + r'\list_papers_reading_group.txt', 'w')
# txt_file.write('\n'.join(list_papers))
# txt_file.close()

str_all_bibtex = ''
list_not_found = []
for paper in list_papers:
    print paper
    # Filenames appear to be 'XX_YY_<title>.pdf': query on the title part.
    # NOTE(review): rstrip('.pdf') strips any trailing '.', 'p', 'd', 'f'
    # characters, not the literal suffix -- titles ending in those letters
    # lose characters; verify.
    paper_bibtex = gscholar.query(paper.split('_')[2].rstrip('.pdf'), 4)
    if paper_bibtex:
        str_all_bibtex += '%s\n' %paper_bibtex[0]
    else:
        list_not_found.append(paper)
    # Pause between queries to avoid a Scholar rate-limit ban.
    time.sleep(0.5)

# with open(path_code + r'\file_bibtex.txt', 'w') as fichier:
#     fichier.write(str_bibtex)
#
# ############
# # DEPRECATED
# # ############
#
# # build urllib2 opener