def _get_http_response(self, url, log_msg=None, err_msg=None):
    """
    Helper method, loads a URL in the browser and returns the page source.

    The page is fetched through ``self.firefox`` (presumably a Selenium
    WebDriver -- confirm against the class constructor). When the page
    contains one of the known "prove you are not a robot" messages,
    execution stops in the debugger so that the challenge can be dealt
    with by hand; the page source is read again once execution resumes.

    Parameters:
        url: address to load.
        log_msg: debug-log line written before the payload dump
            (defaults to "HTTP response data follow").
        err_msg: prefix of the log line written when the request fails
            (defaults to "request failed").

    Returns the page source encoded as UTF-8, or None if anything raised.
    """
    if log_msg is None:
        log_msg = "HTTP response data follow"
    if err_msg is None:
        err_msg = "request failed"

    # Wording of the robot-check interstitial (English and Italian).
    robot_checks = (
        "Please show you're not a robot",
        "Per continuare, digita i caratteri nell'immagine sottostante:",
    )

    try:
        ScholarUtils.log("info", "requesting %s" % unquote(url))

        self.firefox.get(url)
        # Fixed one-second pause after navigation (presumably to let the
        # page settle before its source is read).
        time.sleep(1)
        page = self.firefox.page_source
        if any(phrase in page for phrase in robot_checks):
            # Deliberate manual step: resume from the debugger once the
            # challenge has been handled, then pick up the new page.
            pdb.set_trace()
            page = self.firefox.page_source
        html = page.encode("utf-8")

        ScholarUtils.log("debug", log_msg)
        ScholarUtils.log("debug", ">>>>" + "-" * 68)
        ScholarUtils.log("debug", "data:\n" + html.decode("utf-8"))
        ScholarUtils.log("debug", "<<<<" + "-" * 68)
        return html
    except Exception as err:
        # Report through the logger, using the caller-supplied prefix,
        # instead of printing and stopping in the debugger.
        ScholarUtils.log("info", err_msg + ": %s" % err)
        return None
def _get_http_response(self, url, log_msg=None, err_msg=None):
    """
    Helper method, loads a URL in the browser and returns the page source.

    The page is fetched through ``self.firefox`` (presumably a Selenium
    WebDriver -- confirm against the class constructor). When the page
    contains one of the known "prove you are not a robot" messages,
    execution stops in the debugger so that the challenge can be dealt
    with by hand; the page source is read again once execution resumes.

    Parameters:
        url: address to load.
        log_msg: debug-log line written before the payload dump
            (defaults to 'HTTP response data follow').
        err_msg: prefix of the log line written when the request fails
            (defaults to 'request failed').

    Returns the page source encoded as UTF-8, or None if anything raised.
    """
    if log_msg is None:
        log_msg = 'HTTP response data follow'
    if err_msg is None:
        err_msg = 'request failed'

    # Wording of the robot-check interstitial (English and Italian).
    robot_checks = (
        'Please show you\'re not a robot',
        'Per continuare, digita i caratteri nell\'immagine sottostante:',
    )

    try:
        ScholarUtils.log('info', 'requesting %s' % unquote(url))

        self.firefox.get(url)
        # Fixed one-second pause after navigation (presumably to let the
        # page settle before its source is read).
        time.sleep(1)
        page = self.firefox.page_source
        if any(phrase in page for phrase in robot_checks):
            # Deliberate manual step: resume from the debugger once the
            # challenge has been handled, then pick up the new page.
            pdb.set_trace()
            page = self.firefox.page_source
        html = page.encode('utf-8')

        ScholarUtils.log('debug', log_msg)
        ScholarUtils.log('debug', '>>>>' + '-'*68)
        ScholarUtils.log('debug', 'data:\n' + html.decode('utf-8'))
        ScholarUtils.log('debug', '<<<<' + '-'*68)
        return html
    except Exception as err:
        # Report through the logger, using the caller-supplied prefix,
        # instead of printing and stopping in the debugger.
        ScholarUtils.log('info', err_msg + ': %s' % err)
        return None
def _get_http_response(self, url, log_msg=None, err_msg=None):
    """
    Helper method, loads a URL in the browser and returns the page source.

    The page is fetched through ``self.firefox`` (presumably a Selenium
    WebDriver -- confirm against the class constructor). When the page
    contains one of the known "prove you are not a robot" messages,
    execution stops in the debugger so that the challenge can be dealt
    with by hand; the page source is read again once execution resumes.

    Parameters:
        url: address to load.
        log_msg: debug-log line written before the payload dump
            (defaults to 'HTTP response data follow').
        err_msg: prefix of the log line written when the request fails
            (defaults to 'request failed').

    Returns the page source encoded as UTF-8, or None if anything raised.
    """
    if log_msg is None:
        log_msg = 'HTTP response data follow'
    if err_msg is None:
        err_msg = 'request failed'

    # Wording of the robot-check interstitial (English and Italian).
    robot_checks = (
        'Please show you\'re not a robot',
        'Per continuare, digita i caratteri nell\'immagine sottostante:',
    )

    try:
        ScholarUtils.log('info', 'requesting %s' % unquote(url))

        self.firefox.get(url)
        # Fixed one-second pause after navigation (presumably to let the
        # page settle before its source is read).
        time.sleep(1)
        page = self.firefox.page_source
        if any(phrase in page for phrase in robot_checks):
            # Deliberate manual step: resume from the debugger once the
            # challenge has been handled, then pick up the new page.
            pdb.set_trace()
            page = self.firefox.page_source
        html = page.encode('utf-8')

        ScholarUtils.log('debug', log_msg)
        ScholarUtils.log('debug', '>>>>' + '-' * 68)
        ScholarUtils.log('debug', 'data:\n' + html.decode('utf-8'))
        ScholarUtils.log('debug', '<<<<' + '-' * 68)
        return html
    except Exception as err:
        # Report through the logger, using the caller-supplied prefix,
        # instead of printing and stopping in the debugger.
        ScholarUtils.log('info', err_msg + ': %s' % err)
        return None
def set_timeframe(self, start=None, end=None):
    """
    Restricts results to a range of publication years.

    Each bound that is truthy is passed through ScholarUtils.ensure_int();
    a falsy bound (None, 0, '') is stored unchanged. Either bound may be
    given on its own. The pair is stored as ``self.timeframe``, a
    two-element list ``[start, end]``.
    """
    first_year = ScholarUtils.ensure_int(start) if start else start
    last_year = ScholarUtils.ensure_int(end) if end else end
    self.timeframe = [first_year, last_year]
def get_citation_data(self, article):
    """
    Fetches the citation export record of an article and stores it on it.

    The export link is only present when Google Scholar was told, via the
    settings, to provide it *before* the article was retrieved.

    Returns False when the article has no citation URL or the download
    failed. Returns True when citation data was already present or has
    just been stored through article.set_citation_data().
    """
    if article['url_citation'] is None:
        return False

    if article.citation_data is None:
        ScholarUtils.log('info', 'retrieving citation export data')
        payload = self._get_http_response(
            url=article['url_citation'],
            log_msg='citation data response',
            err_msg='requesting citation data failed')
        if payload is None:
            return False
        article.set_citation_data(payload)

    return True
def get_citation_data(self, article):
    """
    Makes sure an article carries its citation export data.

    Note that Google Scholar only supplies the export link if the
    settings were adjusted to request it *prior* to retrieving the
    article.

    Returns True if the data is available on the article afterwards
    (already present, or downloaded now), False if there is no citation
    URL or the download returned nothing.
    """
    has_link = article['url_citation'] is not None
    if not has_link:
        return False

    needs_download = article.citation_data is None
    if not needs_download:
        return True

    ScholarUtils.log('info', 'retrieving citation export data')
    export = self._get_http_response(
        url=article['url_citation'],
        log_msg='citation data response',
        err_msg='requesting citation data failed')
    if export is not None:
        article.set_citation_data(export)
        return True
    return False
def apply_settings(self, settings):
    """
    Applies settings as provided by a ScholarSettings instance.

    Loads GET_SETTINGS_URL in ``self.firefox``, reads the hidden
    ``scisig`` input from the page, and then requests SET_SETTINGS_URL
    filled in with that value and the values taken from ``settings``.

    Returns True when there is nothing to apply (``settings`` is None or
    not configured) or once the update request has been issued. Returns
    False when the settings form or the ``scisig`` field cannot be found.
    """
    if settings is None or not settings.is_configured():
        return True

    self.settings = settings

    # This is a bit of work. We need to actually retrieve the
    # contents of the Settings pane HTML in order to extract
    # hidden fields before we can compose the query for updating
    # the settings.
    self.firefox.get(self.GET_SETTINGS_URL)

    # NOTE(review): if self.firefox is a Selenium driver, a missing form
    # raises instead of returning None -- confirm whether this check can
    # ever fire.
    form = self.firefox.find_element_by_id("gs_settings_form")
    if form is None:
        ScholarUtils.log("info", "parsing settings failed: no form")
        return False

    # First hidden <input name="scisig">, or None when the page has none
    # (indexing an empty list here used to raise IndexError).
    scisig = next(
        (
            field
            for field in self.firefox.find_elements_by_tag_name("input")
            if field.get_attribute("type") == "hidden"
            and field.get_attribute("name") == "scisig"
        ),
        None,
    )
    if scisig is None:
        ScholarUtils.log("info", "parsing settings failed: scisig")
        return False

    urlargs = {
        "start": settings.starting_number,
        # Read the value the same way the other attributes are read;
        # the element is not subscriptable.
        "scisig": scisig.get_attribute("value"),
        "num": settings.per_page_results,
        "scis": "no",
        "scisf": "",
    }

    if settings.citform != 0:
        urlargs["scis"] = "yes"
        urlargs["scisf"] = "&scisf=%d" % settings.citform

    self.firefox.get(self.SET_SETTINGS_URL % urlargs)

    ScholarUtils.log("info", "settings applied")
    return True
def apply_settings(self, settings):
    """
    Applies settings as provided by a ScholarSettings instance.

    Loads GET_SETTINGS_URL in ``self.firefox``, reads the hidden
    ``scisig`` input from the page, and then requests SET_SETTINGS_URL
    filled in with that value and the values taken from ``settings``.

    Returns True when there is nothing to apply (``settings`` is None or
    not configured) or once the update request has been issued. Returns
    False when the settings form or the ``scisig`` field cannot be found.
    """
    if settings is None or not settings.is_configured():
        return True

    self.settings = settings

    # This is a bit of work. We need to actually retrieve the
    # contents of the Settings pane HTML in order to extract
    # hidden fields before we can compose the query for updating
    # the settings.
    self.firefox.get(self.GET_SETTINGS_URL)

    # NOTE(review): if self.firefox is a Selenium driver, a missing form
    # raises instead of returning None -- confirm whether this check can
    # ever fire.
    form = self.firefox.find_element_by_id('gs_settings_form')
    if form is None:
        ScholarUtils.log('info', 'parsing settings failed: no form')
        return False

    # First hidden <input name="scisig">, or None when the page has none
    # (indexing an empty list here used to raise IndexError).
    scisig = next(
        (
            field
            for field in self.firefox.find_elements_by_tag_name('input')
            if field.get_attribute('type') == 'hidden'
            and field.get_attribute('name') == 'scisig'
        ),
        None)
    if scisig is None:
        ScholarUtils.log('info', 'parsing settings failed: scisig')
        return False

    urlargs = {
        'start': settings.starting_number,
        # Read the value the same way the other attributes are read;
        # the element is not subscriptable.
        'scisig': scisig.get_attribute('value'),
        'num': settings.per_page_results,
        'scis': 'no',
        'scisf': ''
    }

    if settings.citform != 0:
        urlargs['scis'] = 'yes'
        urlargs['scisf'] = '&scisf=%d' % settings.citform

    self.firefox.get(self.SET_SETTINGS_URL % urlargs)

    ScholarUtils.log('info', 'settings applied')
    return True
def apply_settings(self, settings):
    """
    Applies settings as provided by a ScholarSettings instance.

    Loads GET_SETTINGS_URL in ``self.firefox``, reads the hidden
    ``scisig`` input from the page, and then requests SET_SETTINGS_URL
    filled in with that value and the values taken from ``settings``.

    Returns True when there is nothing to apply (``settings`` is None or
    not configured) or once the update request has been issued. Returns
    False when the settings form or the ``scisig`` field cannot be found.
    """
    if settings is None or not settings.is_configured():
        return True

    self.settings = settings

    # This is a bit of work. We need to actually retrieve the
    # contents of the Settings pane HTML in order to extract
    # hidden fields before we can compose the query for updating
    # the settings.
    self.firefox.get(self.GET_SETTINGS_URL)

    # NOTE(review): if self.firefox is a Selenium driver, a missing form
    # raises instead of returning None -- confirm whether this check can
    # ever fire.
    form = self.firefox.find_element_by_id('gs_settings_form')
    if form is None:
        ScholarUtils.log('info', 'parsing settings failed: no form')
        return False

    # First hidden <input name="scisig">, or None when the page has none
    # (indexing an empty list here used to raise IndexError).
    scisig = next((field
                   for field in self.firefox.find_elements_by_tag_name('input')
                   if field.get_attribute('type') == 'hidden'
                   and field.get_attribute('name') == 'scisig'), None)
    if scisig is None:
        ScholarUtils.log('info', 'parsing settings failed: scisig')
        return False

    # The value is read the same way the other attributes are read;
    # the element is not subscriptable.
    urlargs = {'start': settings.starting_number,
               'scisig': scisig.get_attribute('value'),
               'num': settings.per_page_results,
               'scis': 'no',
               'scisf': ''}

    if settings.citform != 0:
        urlargs['scis'] = 'yes'
        urlargs['scisf'] = '&scisf=%d' % settings.citform

    self.firefox.get(self.SET_SETTINGS_URL % urlargs)

    ScholarUtils.log('info', 'settings applied')
    return True
def set_starting_number(self, starting_number):
    """
    Stores the starting result number in ``self.starting_number``.

    The value is passed through ScholarUtils.ensure_int() together with
    the message to use when it is not numeric.
    """
    self.starting_number = ScholarUtils.ensure_int(
        starting_number,
        'starting number of results on page must be numeric')
def main():
    """
    Command-line entry point: parses the options, builds a query and,
    when a JSON file of citation-list URLs is given with -U, runs the
    query once per URL.

    Returns 1 when no arguments were given (after printing the help),
    when a cluster ID is combined with search arguments, or when the
    citation format is not recognised. Returns 0 after printing the
    version for -v/--version. Otherwise returns None once the URL list,
    if any, has been processed.
    """
    usage = """scholar.py [options] <query string>
A command-line interface to Google Scholar.

Examples:

# Retrieve one article written by Einstein on quantum theory:
scholar.py -c 1 --author "albert einstein" --phrase "quantum theory"

# Retrieve a BibTeX entry for that quantum theory paper:
scholar.py -c 1 -C 17749203648027613321 --citation bt

# Retrieve five articles written by Einstein after 1970 where the title
# does not contain the words "quantum" and "theory":
scholar.py -c 5 -a "albert einstein" -t --none "quantum theory" --after 1970"""

    fmt = optparse.IndentedHelpFormatter(max_help_position=50, width=100)
    parser = optparse.OptionParser(usage=usage, formatter=fmt)

    group = optparse.OptionGroup(
        parser, "Query arguments", "These options define search query arguments and parameters."
    )
    group.add_option("-a", "--author", metavar="AUTHORS", default=None, help="Author name(s)")
    group.add_option(
        "-A", "--all", metavar="WORDS", default=None, dest="allw", help="Results must contain all of these words"
    )
    group.add_option(
        "-s",
        "--some",
        metavar="WORDS",
        default=None,
        help='Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases',
    )
    group.add_option(
        "-n",
        "--none",
        metavar="WORDS",
        default=None,
        help="Results must contain none of these words. See -s|--some re. formatting",
    )
    group.add_option("-p", "--phrase", metavar="PHRASE", default=None, help="Results must contain exact phrase")
    group.add_option("-t", "--title-only", action="store_true", default=False, help="Search title only")
    group.add_option(
        "-P", "--pub", metavar="PUBLICATIONS", default=None, help="Results must have appeared in this publication"
    )
    group.add_option("--after", metavar="YEAR", default=None, help="Results must have appeared in or after given year")
    group.add_option(
        "--before", metavar="YEAR", default=None, help="Results must have appeared in or before given year"
    )
    group.add_option("--no-patents", action="store_true", default=False, help="Do not include patents in results")
    group.add_option("--no-citations", action="store_true", default=False, help="Do not include citations in results")
    group.add_option(
        "-C",
        "--cluster-id",
        metavar="CLUSTER_ID",
        default=None,
        help="Do not search, just use articles in given cluster ID",
    )
    group.add_option("-c", "--count", type="int", default=None, help="Maximum number of results")
    group.add_option("-S", "--start", type="int", default=0, help="Starting page of results")
    group.add_option("-u", "--url", metavar="URL", default=None, help="Citation list's url")
    # The default must be None: the URL-file branch below is guarded by
    # "options.urls is not None", which a default of 0 would always pass.
    group.add_option(
        "-U",
        "--urls_file",
        metavar="URL",
        dest="urls",
        default=None,
        help="Citation list's urls json file (['http: // scholar.google.com/scholar?cites=4412725301034017472 & as_sdt=2005 & sciodt=1, 5 & hl=en', ...])",
    )
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, "Output format", "These options control the appearance of the results.")
    group.add_option("--txt", action="store_true", help="Print article data in text format (default)")
    group.add_option("--txt-globals", action="store_true", help="Like --txt, but first print global results too")
    group.add_option("--csv", action="store_true", help='Print article data in CSV form (separator is "|")')
    group.add_option("--csv-header", action="store_true", help="Like --csv, but print header with column names")
    group.add_option("--json", action="store_true", help='Save article data in JSON form (default file: "../res.json")')
    group.add_option(
        "--citation",
        metavar="FORMAT",
        default=None,
        help='Print article details in standard citation format. Argument Must be one of "bt" (BibTeX), "en" (EndNote), "rm" (RefMan), or "rw" (RefWorks).',
    )
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, "Miscellaneous")
    group.add_option(
        "--cookie-file",
        metavar="FILE",
        default=None,
        help="File to use for cookie storage. If given, will read any existing cookies if found at startup, and save resulting cookies in the end.",
    )
    group.add_option(
        "-d",
        "--debug",
        action="count",
        default=0,
        help="Enable verbose logging to stderr. Repeated options increase detail of debug output.",
    )
    group.add_option("-v", "--version", action="store_true", default=False, help="Show version information")
    parser.add_option_group(group)

    options, _ = parser.parse_args()

    # Show help if we have neither keyword search nor author name
    if len(sys.argv) == 1:
        parser.print_help()
        return 1

    if options.debug > 0:
        # Cap the repeat count of -d at the most verbose known level.
        options.debug = min(options.debug, ScholarUtils.LOG_LEVELS["debug"])
        ScholarConf.LOG_LEVEL = options.debug
        ScholarUtils.log("info", "using log level %d" % ScholarConf.LOG_LEVEL)

    if options.version:
        print("This is scholar.py %s." % ScholarConf.VERSION)
        return 0

    if options.cookie_file:
        ScholarConf.COOKIE_JAR_FILE = options.cookie_file

    # Sanity-check the options: if they include a cluster ID query, it
    # makes no sense to have search arguments:
    if options.cluster_id is not None:
        if (
            options.author
            or options.allw
            or options.some
            or options.none
            or options.phrase
            or options.title_only
            or options.pub
            or options.after
            or options.before
        ):
            print("Cluster ID queries do not allow additional search arguments.")
            return 1

    querier = ScholarQuerier()
    settings = ScholarSettings()

    if options.citation == "bt":
        settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX)
    elif options.citation == "en":
        settings.set_citation_format(ScholarSettings.CITFORM_ENDNOTE)
    elif options.citation == "rm":
        settings.set_citation_format(ScholarSettings.CITFORM_REFMAN)
    elif options.citation == "rw":
        settings.set_citation_format(ScholarSettings.CITFORM_REFWORKS)
    elif options.citation is not None:
        print('Invalid citation link format, must be one of "bt", "en", "rm", or "rw".')
        return 1

    querier.apply_settings(settings)

    if options.cluster_id:
        query = ClusterScholarQuery(cluster=options.cluster_id)
    else:
        query = SearchScholarQuery()
        if options.author:
            query.set_author(options.author)
        if options.allw:
            query.set_words(options.allw)
        if options.some:
            query.set_words_some(options.some)
        if options.none:
            query.set_words_none(options.none)
        if options.phrase:
            query.set_phrase(options.phrase)
        if options.title_only:
            query.set_scope(True)
        if options.pub:
            query.set_pub(options.pub)
        if options.after or options.before:
            query.set_timeframe(options.after, options.before)
        if options.no_patents:
            query.set_include_patents(False)
        if options.no_citations:
            query.set_include_citations(False)

    if options.url is not None:
        query.set_url(options.url)

    if options.urls is not None:
        try:
            with open(options.urls) as data_file:
                urls = json.load(data_file)
            # The file may hold plain URL strings or records carrying the
            # URL under the "url_citations" key.
            if isinstance(urls, list) and len(urls) > 0 and isinstance(urls[0], dict):
                urls = [x["url_citations"] for x in urls]
            for url in urls:
                query.set_url(url)
                reset_res()
                # The output file is named after the first run of digits
                # found in the URL.
                loop(options, query, querier, file_name="../results/" + re.match(".*?([0-9]+)", url).group(1) + ".json")
        except Exception as e:
            # Report and carry on rather than stopping in the debugger.
            print(e)
            print("error with the urls json file provided")
def set_cluster(self, cluster):
    """
    Points the search at a Google Scholar results cluster ID.

    The ID is passed through ScholarUtils.ensure_int(), together with the
    message to use when it is not numeric, and stored in ``self.cluster``.
    """
    self.cluster = ScholarUtils.ensure_int(
        cluster, 'cluster ID must be numeric')
def main():
    """
    Command-line entry point: parses the options, builds a query and,
    when a JSON file of citation-list URLs is given with -U, runs the
    query once per URL.

    Returns 1 when no arguments were given (after printing the help),
    when a cluster ID is combined with search arguments, or when the
    citation format is not recognised. Returns 0 after printing the
    version for -v/--version. Otherwise returns None once the URL list,
    if any, has been processed.
    """
    # Single-argument print(...) calls and "except ... as" are used
    # throughout so the function parses on Python 2.6+ and Python 3 alike.
    print("")
    usage = """scholar.py [options] <query string>
A command-line interface to Google Scholar.

Examples:

# Retrieve one article written by Einstein on quantum theory:
scholar.py -c 1 --author "albert einstein" --phrase "quantum theory"

# Retrieve a BibTeX entry for that quantum theory paper:
scholar.py -c 1 -C 17749203648027613321 --citation bt

# Retrieve five articles written by Einstein after 1970 where the title
# does not contain the words "quantum" and "theory":
scholar.py -c 5 -a "albert einstein" -t --none "quantum theory" --after 1970"""

    fmt = optparse.IndentedHelpFormatter(max_help_position=50, width=100)
    parser = optparse.OptionParser(usage=usage, formatter=fmt)

    group = optparse.OptionGroup(
        parser, 'Query arguments',
        'These options define search query arguments and parameters.')
    group.add_option('-a',
                     '--author',
                     metavar='AUTHORS',
                     default=None,
                     help='Author name(s)')
    group.add_option('-A',
                     '--all',
                     metavar='WORDS',
                     default=None,
                     dest='allw',
                     help='Results must contain all of these words')
    group.add_option(
        '-s',
        '--some',
        metavar='WORDS',
        default=None,
        help=
        'Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases'
    )
    group.add_option(
        '-n',
        '--none',
        metavar='WORDS',
        default=None,
        help=
        'Results must contain none of these words. See -s|--some re. formatting'
    )
    group.add_option('-p',
                     '--phrase',
                     metavar='PHRASE',
                     default=None,
                     help='Results must contain exact phrase')
    group.add_option('-t',
                     '--title-only',
                     action='store_true',
                     default=False,
                     help='Search title only')
    group.add_option('-P',
                     '--pub',
                     metavar='PUBLICATIONS',
                     default=None,
                     help='Results must have appeared in this publication')
    group.add_option('--after',
                     metavar='YEAR',
                     default=None,
                     help='Results must have appeared in or after given year')
    group.add_option('--before',
                     metavar='YEAR',
                     default=None,
                     help='Results must have appeared in or before given year')
    group.add_option('--no-patents',
                     action='store_true',
                     default=False,
                     help='Do not include patents in results')
    group.add_option('--no-citations',
                     action='store_true',
                     default=False,
                     help='Do not include citations in results')
    group.add_option(
        '-C',
        '--cluster-id',
        metavar='CLUSTER_ID',
        default=None,
        help='Do not search, just use articles in given cluster ID')
    group.add_option('-c',
                     '--count',
                     type='int',
                     default=None,
                     help='Maximum number of results')
    group.add_option('-S',
                     '--start',
                     type='int',
                     default=0,
                     help='Starting page of results')
    group.add_option('-u',
                     '--url',
                     metavar='URL',
                     default=None,
                     help='Citation list\'s url')
    group.add_option(
        '-U',
        '--urls_file',
        metavar='URL',
        dest='urls',
        default=None,
        help=
        'Citation list\'s urls json file ([\'http: // scholar.google.com/scholar?cites=4412725301034017472 & as_sdt=2005 & sciodt=1, 5 & hl=en\', ...])'
    )
    parser.add_option_group(group)

    group = optparse.OptionGroup(
        parser, 'Output format',
        'These options control the appearance of the results.')
    group.add_option('--txt',
                     action='store_true',
                     help='Print article data in text format (default)')
    group.add_option('--txt-globals',
                     action='store_true',
                     help='Like --txt, but first print global results too')
    group.add_option('--csv',
                     action='store_true',
                     help='Print article data in CSV form (separator is "|")')
    group.add_option('--csv-header',
                     action='store_true',
                     help='Like --csv, but print header with column names')
    group.add_option(
        '--json',
        action='store_true',
        help='Save article data in JSON form (default file: "../res.json")')
    group.add_option(
        '--citation',
        metavar='FORMAT',
        default=None,
        help=
        'Print article details in standard citation format. Argument Must be one of "bt" (BibTeX), "en" (EndNote), "rm" (RefMan), or "rw" (RefWorks).'
    )
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, 'Miscellaneous')
    group.add_option(
        '--cookie-file',
        metavar='FILE',
        default=None,
        help=
        'File to use for cookie storage. If given, will read any existing cookies if found at startup, and save resulting cookies in the end.'
    )
    group.add_option(
        '-d',
        '--debug',
        action='count',
        default=0,
        help=
        'Enable verbose logging to stderr. Repeated options increase detail of debug output.'
    )
    group.add_option('-v',
                     '--version',
                     action='store_true',
                     default=False,
                     help='Show version information')
    parser.add_option_group(group)

    options, _ = parser.parse_args()

    # Show help if we have neither keyword search nor author name
    if len(sys.argv) == 1:
        parser.print_help()
        return 1

    if options.debug > 0:
        # Cap the repeat count of -d at the most verbose known level.
        options.debug = min(options.debug, ScholarUtils.LOG_LEVELS['debug'])
        ScholarConf.LOG_LEVEL = options.debug
        ScholarUtils.log('info', 'using log level %d' % ScholarConf.LOG_LEVEL)

    if options.version:
        print('This is scholar.py %s.' % ScholarConf.VERSION)
        return 0

    if options.cookie_file:
        ScholarConf.COOKIE_JAR_FILE = options.cookie_file

    # Sanity-check the options: if they include a cluster ID query, it
    # makes no sense to have search arguments:
    if options.cluster_id is not None:
        if options.author or options.allw or options.some or options.none \
           or options.phrase or options.title_only or options.pub \
           or options.after or options.before:
            print(
                'Cluster ID queries do not allow additional search arguments.')
            return 1

    querier = ScholarQuerier()
    settings = ScholarSettings()

    if options.citation == 'bt':
        settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX)
    elif options.citation == 'en':
        settings.set_citation_format(ScholarSettings.CITFORM_ENDNOTE)
    elif options.citation == 'rm':
        settings.set_citation_format(ScholarSettings.CITFORM_REFMAN)
    elif options.citation == 'rw':
        settings.set_citation_format(ScholarSettings.CITFORM_REFWORKS)
    elif options.citation is not None:
        print(
            'Invalid citation link format, must be one of "bt", "en", "rm", or "rw".'
        )
        return 1

    querier.apply_settings(settings)

    if options.cluster_id:
        query = ClusterScholarQuery(cluster=options.cluster_id)
    else:
        query = SearchScholarQuery()
        if options.author:
            query.set_author(options.author)
        if options.allw:
            query.set_words(options.allw)
        if options.some:
            query.set_words_some(options.some)
        if options.none:
            query.set_words_none(options.none)
        if options.phrase:
            query.set_phrase(options.phrase)
        if options.title_only:
            query.set_scope(True)
        if options.pub:
            query.set_pub(options.pub)
        if options.after or options.before:
            query.set_timeframe(options.after, options.before)
        if options.no_patents:
            query.set_include_patents(False)
        if options.no_citations:
            query.set_include_citations(False)

    if options.url is not None:
        query.set_url(options.url)
        print(options.url)

    if options.urls is not None:
        print(options.urls)
        try:
            with open(options.urls) as data_file:
                urls = json.load(data_file)
            # The file may hold plain URL strings or records carrying the
            # URL under the 'url_citations' key.
            if isinstance(urls, list) and len(urls) > 0 and isinstance(
                    urls[0], dict):
                urls = [x['url_citations'] for x in urls]
            for url in urls:
                query.set_url(url)
                reset_res()
                # The output file is named after the first run of digits
                # found in the URL.
                loop(options,
                     query,
                     querier,
                     file_name='../results/' +
                     re.match('.*?([0-9]+)', url).group(1) + '.json')
        except Exception as e:
            print(e)
def main():
    """
    Command-line entry point: parses the options, builds a query and,
    when a JSON file of citation-list URLs is given with -U, runs the
    query once per URL.

    Returns 1 when no arguments were given (after printing the help),
    when a cluster ID is combined with search arguments, or when the
    citation format is not recognised. Returns 0 after printing the
    version for -v/--version. Otherwise returns None once the URL list,
    if any, has been processed.
    """
    # Single-argument print(...) calls and "except ... as" are used
    # throughout so the function parses on Python 2.6+ and Python 3 alike.
    print("")
    usage = """scholar.py [options] <query string>
A command-line interface to Google Scholar.

Examples:

# Retrieve one article written by Einstein on quantum theory:
scholar.py -c 1 --author "albert einstein" --phrase "quantum theory"

# Retrieve a BibTeX entry for that quantum theory paper:
scholar.py -c 1 -C 17749203648027613321 --citation bt

# Retrieve five articles written by Einstein after 1970 where the title
# does not contain the words "quantum" and "theory":
scholar.py -c 5 -a "albert einstein" -t --none "quantum theory" --after 1970"""

    fmt = optparse.IndentedHelpFormatter(max_help_position=50, width=100)
    parser = optparse.OptionParser(usage=usage, formatter=fmt)

    group = optparse.OptionGroup(parser, 'Query arguments',
                                 'These options define search query arguments and parameters.')
    group.add_option('-a', '--author', metavar='AUTHORS', default=None,
                     help='Author name(s)')
    group.add_option('-A', '--all', metavar='WORDS', default=None, dest='allw',
                     help='Results must contain all of these words')
    group.add_option('-s', '--some', metavar='WORDS', default=None,
                     help='Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases')
    group.add_option('-n', '--none', metavar='WORDS', default=None,
                     help='Results must contain none of these words. See -s|--some re. formatting')
    group.add_option('-p', '--phrase', metavar='PHRASE', default=None,
                     help='Results must contain exact phrase')
    group.add_option('-t', '--title-only', action='store_true', default=False,
                     help='Search title only')
    group.add_option('-P', '--pub', metavar='PUBLICATIONS', default=None,
                     help='Results must have appeared in this publication')
    group.add_option('--after', metavar='YEAR', default=None,
                     help='Results must have appeared in or after given year')
    group.add_option('--before', metavar='YEAR', default=None,
                     help='Results must have appeared in or before given year')
    group.add_option('--no-patents', action='store_true', default=False,
                     help='Do not include patents in results')
    group.add_option('--no-citations', action='store_true', default=False,
                     help='Do not include citations in results')
    group.add_option('-C', '--cluster-id', metavar='CLUSTER_ID', default=None,
                     help='Do not search, just use articles in given cluster ID')
    group.add_option('-c', '--count', type='int', default=None,
                     help='Maximum number of results')
    group.add_option('-S', '--start', type='int', default=0,
                     help='Starting page of results')
    group.add_option('-u', '--url', metavar='URL', default=None,
                     help='Citation list\'s url')
    group.add_option('-U', '--urls_file', metavar='URL', dest='urls', default=None,
                     help='Citation list\'s urls json file ([\'http: // scholar.google.com/scholar?cites=4412725301034017472 & as_sdt=2005 & sciodt=1, 5 & hl=en\', ...])')
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, 'Output format',
                                 'These options control the appearance of the results.')
    group.add_option('--txt', action='store_true',
                     help='Print article data in text format (default)')
    group.add_option('--txt-globals', action='store_true',
                     help='Like --txt, but first print global results too')
    group.add_option('--csv', action='store_true',
                     help='Print article data in CSV form (separator is "|")')
    group.add_option('--csv-header', action='store_true',
                     help='Like --csv, but print header with column names')
    group.add_option('--json', action='store_true',
                     help='Save article data in JSON form (default file: "../res.json")')
    group.add_option('--citation', metavar='FORMAT', default=None,
                     help='Print article details in standard citation format. Argument Must be one of "bt" (BibTeX), "en" (EndNote), "rm" (RefMan), or "rw" (RefWorks).')
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, 'Miscellaneous')
    group.add_option('--cookie-file', metavar='FILE', default=None,
                     help='File to use for cookie storage. If given, will read any existing cookies if found at startup, and save resulting cookies in the end.')
    group.add_option('-d', '--debug', action='count', default=0,
                     help='Enable verbose logging to stderr. Repeated options increase detail of debug output.')
    group.add_option('-v', '--version', action='store_true', default=False,
                     help='Show version information')
    parser.add_option_group(group)

    options, _ = parser.parse_args()

    # Show help if we have neither keyword search nor author name
    if len(sys.argv) == 1:
        parser.print_help()
        return 1

    if options.debug > 0:
        # Cap the repeat count of -d at the most verbose known level.
        options.debug = min(options.debug, ScholarUtils.LOG_LEVELS['debug'])
        ScholarConf.LOG_LEVEL = options.debug
        ScholarUtils.log('info', 'using log level %d' % ScholarConf.LOG_LEVEL)

    if options.version:
        print('This is scholar.py %s.' % ScholarConf.VERSION)
        return 0

    if options.cookie_file:
        ScholarConf.COOKIE_JAR_FILE = options.cookie_file

    # Sanity-check the options: if they include a cluster ID query, it
    # makes no sense to have search arguments:
    if options.cluster_id is not None:
        if options.author or options.allw or options.some or options.none \
           or options.phrase or options.title_only or options.pub \
           or options.after or options.before:
            print(
                'Cluster ID queries do not allow additional search arguments.')
            return 1

    querier = ScholarQuerier()
    settings = ScholarSettings()

    if options.citation == 'bt':
        settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX)
    elif options.citation == 'en':
        settings.set_citation_format(ScholarSettings.CITFORM_ENDNOTE)
    elif options.citation == 'rm':
        settings.set_citation_format(ScholarSettings.CITFORM_REFMAN)
    elif options.citation == 'rw':
        settings.set_citation_format(ScholarSettings.CITFORM_REFWORKS)
    elif options.citation is not None:
        print(
            'Invalid citation link format, must be one of "bt", "en", "rm", or "rw".')
        return 1

    querier.apply_settings(settings)

    if options.cluster_id:
        query = ClusterScholarQuery(cluster=options.cluster_id)
    else:
        query = SearchScholarQuery()
        if options.author:
            query.set_author(options.author)
        if options.allw:
            query.set_words(options.allw)
        if options.some:
            query.set_words_some(options.some)
        if options.none:
            query.set_words_none(options.none)
        if options.phrase:
            query.set_phrase(options.phrase)
        if options.title_only:
            query.set_scope(True)
        if options.pub:
            query.set_pub(options.pub)
        if options.after or options.before:
            query.set_timeframe(options.after, options.before)
        if options.no_patents:
            query.set_include_patents(False)
        if options.no_citations:
            query.set_include_citations(False)

    if options.url is not None:
        query.set_url(options.url)
        print(options.url)

    if options.urls is not None:
        print(options.urls)
        try:
            with open(options.urls) as data_file:
                urls = json.load(data_file)
            # The file may hold plain URL strings or records carrying the
            # URL under the 'url_citations' key.
            if isinstance(urls, list) and len(urls) > 0 and isinstance(urls[0], dict):
                urls = [x['url_citations'] for x in urls]
            for url in urls:
                query.set_url(url)
                reset_res()
                # The output file is named after the first run of digits
                # found in the URL.
                loop(options, query, querier, file_name='../results/' +
                     re.match('.*?([0-9]+)', url).group(1) + '.json')
        except Exception as e:
            print(e)
def set_num_page_results(self, num_page_results):
    """
    Stores the per-page result limit in ``self.num_results``.

    The value is passed through ScholarUtils.ensure_int() together with
    the message to use when it is not numeric.
    """
    self.num_results = ScholarUtils.ensure_int(
        num_page_results,
        'maximum number of results on page must be numeric')