Example #1
0
    def _get_http_response(self, url, log_msg=None, err_msg=None):
        """
        Helper method, loads a URL in the browser and returns the page payload.

        The page is fetched through ``self.firefox``. If it contains one of
        the known robot-check messages, execution drops into ``pdb`` so the
        check can be handled by hand; the page source is read again once the
        debugger is continued.

        Args:
            url: address to load.
            log_msg: debug-log line written before the payload dump; defaults
                to "HTTP response data follow".
            err_msg: prefix of the log line written when the request fails;
                defaults to "request failed".

        Returns:
            The page source encoded as UTF-8, or None if an exception was
            raised along the way.
        """
        if log_msg is None:
            log_msg = "HTTP response data follow"
        if err_msg is None:
            err_msg = "request failed"
        captcha_markers = (
            "Please show you're not a robot",
            "Per continuare, digita i caratteri nell'immagine sottostante:",
        )
        try:
            ScholarUtils.log("info", "requesting %s" % unquote(url))

            self.firefox.get(url)
            # Fixed one-second pause before reading the page; presumably this
            # gives the browser time to finish rendering -- TODO confirm.
            time.sleep(1)
            # Look for the markers in the text form of the page so the check
            # does not mix text and encoded bytes.
            page = self.firefox.page_source
            if any(marker in page for marker in captcha_markers):
                # Deliberate manual pause: continue the debugger once the
                # robot check has been dealt with in the browser window.
                pdb.set_trace()
                page = self.firefox.page_source
            html = page.encode("utf-8")

            ScholarUtils.log("debug", log_msg)
            ScholarUtils.log("debug", ">>>>" + "-" * 68)
            ScholarUtils.log("debug", "data:\n" + html.decode("utf-8"))
            ScholarUtils.log("debug", "<<<<" + "-" * 68)

            return html
        except Exception as err:
            # Report the failure through the logger with the caller-supplied
            # prefix; callers detect the failure through the None result.
            ScholarUtils.log("info", err_msg + ": %s" % err)
            return None
Example #2
0
    def _get_http_response(self, url, log_msg=None, err_msg=None):
        """
        Helper method, loads a URL in the browser and returns the page payload.

        The page is fetched through ``self.firefox``. If it contains one of
        the known robot-check messages, execution drops into ``pdb`` so the
        check can be handled by hand; the page source is read again once the
        debugger is continued.

        Args:
            url: address to load.
            log_msg: debug-log line written before the payload dump; defaults
                to 'HTTP response data follow'.
            err_msg: prefix of the log line written when the request fails;
                defaults to 'request failed'.

        Returns:
            The page source encoded as UTF-8, or None if an exception was
            raised along the way.
        """
        if log_msg is None:
            log_msg = 'HTTP response data follow'
        if err_msg is None:
            err_msg = 'request failed'
        captcha_markers = (
            'Please show you\'re not a robot',
            'Per continuare, digita i caratteri nell\'immagine sottostante:',
        )
        try:
            ScholarUtils.log('info', 'requesting %s' % unquote(url))

            self.firefox.get(url)
            # Fixed one-second pause before reading the page; presumably this
            # gives the browser time to finish rendering -- TODO confirm.
            time.sleep(1)
            # Look for the markers in the text form of the page so the check
            # does not mix text and encoded bytes.
            page = self.firefox.page_source
            if any(marker in page for marker in captcha_markers):
                # Deliberate manual pause: continue the debugger once the
                # robot check has been dealt with in the browser window.
                pdb.set_trace()
                page = self.firefox.page_source
            html = page.encode('utf-8')

            ScholarUtils.log('debug', log_msg)
            ScholarUtils.log('debug', '>>>>' + '-'*68)
            ScholarUtils.log('debug', 'data:\n' + html.decode('utf-8'))
            ScholarUtils.log('debug', '<<<<' + '-'*68)

            return html
        except Exception as err:
            # Report the failure through the logger with the caller-supplied
            # prefix; callers detect the failure through the None result.
            ScholarUtils.log('info', err_msg + ': %s' % err)
            return None
Example #3
0
    def _get_http_response(self, url, log_msg=None, err_msg=None):
        """
        Helper method, loads a URL in the browser and returns the page payload.

        The page is fetched through ``self.firefox``. If it contains one of
        the known robot-check messages, execution drops into ``pdb`` so the
        check can be handled by hand; the page source is read again once the
        debugger is continued.

        Args:
            url: address to load.
            log_msg: debug-log line written before the payload dump; defaults
                to 'HTTP response data follow'.
            err_msg: prefix of the log line written when the request fails;
                defaults to 'request failed'.

        Returns:
            The page source encoded as UTF-8, or None if an exception was
            raised along the way.
        """
        if log_msg is None:
            log_msg = 'HTTP response data follow'
        if err_msg is None:
            err_msg = 'request failed'
        captcha_markers = (
            'Please show you\'re not a robot',
            'Per continuare, digita i caratteri nell\'immagine sottostante:',
        )
        try:
            ScholarUtils.log('info', 'requesting %s' % unquote(url))

            self.firefox.get(url)
            # Fixed one-second pause before reading the page; presumably this
            # gives the browser time to finish rendering -- TODO confirm.
            time.sleep(1)
            # Look for the markers in the text form of the page so the check
            # does not mix text and encoded bytes.
            page = self.firefox.page_source
            if any(marker in page for marker in captcha_markers):
                # Deliberate manual pause: continue the debugger once the
                # robot check has been dealt with in the browser window.
                pdb.set_trace()
                page = self.firefox.page_source
            html = page.encode('utf-8')

            ScholarUtils.log('debug', log_msg)
            ScholarUtils.log('debug', '>>>>' + '-' * 68)
            ScholarUtils.log('debug', 'data:\n' + html.decode('utf-8'))
            ScholarUtils.log('debug', '<<<<' + '-' * 68)

            return html
        except Exception as err:
            # Report the failure through the logger with the caller-supplied
            # prefix; callers detect the failure through the None result.
            ScholarUtils.log('info', err_msg + ': %s' % err)
            return None
Example #4
0
 def set_timeframe(self, start=None, end=None):
     """
     Stores the year range in which results must have appeared.

     Either bound may be omitted. A truthy bound is passed through
     ScholarUtils.ensure_int; a falsy one is stored unchanged. The
     result is kept in self.timeframe as [start, end].
     """
     first_year = ScholarUtils.ensure_int(start) if start else start
     last_year = ScholarUtils.ensure_int(end) if end else end
     self.timeframe = [first_year, last_year]
Example #5
0
 def set_timeframe(self, start=None, end=None):
     """
     Records the timeframe (years) that results must fall into.

     Specifying only start, only end, or both is fine. Each truthy
     bound goes through ScholarUtils.ensure_int; falsy bounds are kept
     as given. self.timeframe ends up as the two-item list [start, end].
     """
     bounds = [start, end]
     for position, year in enumerate(bounds):
         if year:
             bounds[position] = ScholarUtils.ensure_int(year)
     self.timeframe = bounds
Example #6
0
    def get_citation_data(self, article):
        """
        Fetches the citation export data for an article and stores it on it.

        Note, this only works if the settings were adjusted to make Google
        Scholar provide the citation link *before* the article was retrieved.

        Returns True when the article holds citation data after the call
        (already present or just fetched), and False when the article has
        no citation URL or the request returned nothing.
        """
        if article['url_citation'] is None:
            return False
        if article.citation_data is not None:
            # Citation data is already present; nothing to request.
            return True

        ScholarUtils.log('info', 'retrieving citation export data')
        request = {
            'url': article['url_citation'],
            'log_msg': 'citation data response',
            'err_msg': 'requesting citation data failed',
        }
        payload = self._get_http_response(**request)
        if payload is not None:
            article.set_citation_data(payload)
            return True
        return False
Example #7
0
    def get_citation_data(self, article):
        """
        Retrieves the citation export data behind an article's citation link.

        This requires that the settings told Google Scholar to provide the
        link *prior* to retrieving the article.

        Returns False if the article carries no citation URL or the request
        yields nothing; True if citation data was already set or has just
        been stored on the article.
        """
        has_link = article['url_citation'] is not None
        if not has_link:
            return False

        already_loaded = article.citation_data is not None
        if already_loaded:
            return True

        ScholarUtils.log('info', 'retrieving citation export data')
        fetched = self._get_http_response(
            url=article['url_citation'],
            log_msg='citation data response',
            err_msg='requesting citation data failed')

        succeeded = fetched is not None
        if succeeded:
            article.set_citation_data(fetched)
        return succeeded
Example #8
0
    def apply_settings(self, settings):
        """
        Applies settings as provided by a ScholarSettings instance.

        Loads the settings page in the browser, reads the hidden ``scisig``
        form field from it, and then loads the settings-update URL built
        from that value and the requested settings.

        Returns True when there was nothing to apply or the update URL was
        requested, and False when the settings form or the ``scisig`` field
        could not be found on the page.
        """
        if settings is None or not settings.is_configured():
            return True

        self.settings = settings

        # This is a bit of work. We need to actually retrieve the
        # contents of the Settings pane HTML in order to extract
        # hidden fields before we can compose the query for updating
        # the settings.

        self.firefox.get(self.GET_SETTINGS_URL)

        # The plural finder yields an empty list when nothing matches, so a
        # missing form can be reported here.
        if not self.firefox.find_elements_by_id("gs_settings_form"):
            ScholarUtils.log("info", "parsing settings failed: no form")
            return False

        scisig_fields = [
            field
            for field in self.firefox.find_elements_by_tag_name("input")
            if field.get_attribute("type") == "hidden" and field.get_attribute("name") == "scisig"
        ]
        if not scisig_fields:
            ScholarUtils.log("info", "parsing settings failed: scisig")
            return False

        urlargs = {
            "start": settings.starting_number,
            # Browser elements expose attributes through get_attribute(),
            # the same accessor used for "type" and "name" above.
            "scisig": scisig_fields[0].get_attribute("value"),
            "num": settings.per_page_results,
            "scis": "no",
            "scisf": "",
        }

        if settings.citform != 0:
            urlargs["scis"] = "yes"
            urlargs["scisf"] = "&scisf=%d" % settings.citform

        self.firefox.get(self.SET_SETTINGS_URL % urlargs)

        ScholarUtils.log("info", "settings applied")
        return True
Example #9
0
    def apply_settings(self, settings):
        """
        Applies settings as provided by a ScholarSettings instance.

        Loads the settings page in the browser, reads the hidden ``scisig``
        form field from it, and then loads the settings-update URL built
        from that value and the requested settings.

        Returns True when there was nothing to apply or the update URL was
        requested, and False when the settings form or the ``scisig`` field
        could not be found on the page.
        """
        if settings is None or not settings.is_configured():
            return True

        self.settings = settings

        # This is a bit of work. We need to actually retrieve the
        # contents of the Settings pane HTML in order to extract
        # hidden fields before we can compose the query for updating
        # the settings.

        self.firefox.get(self.GET_SETTINGS_URL)

        # The plural finder yields an empty list when nothing matches, so a
        # missing form can be reported here.
        if not self.firefox.find_elements_by_id('gs_settings_form'):
            ScholarUtils.log('info', 'parsing settings failed: no form')
            return False

        scisig_fields = [
            field for field in self.firefox.find_elements_by_tag_name('input')
            if field.get_attribute('type') == 'hidden'
            and field.get_attribute('name') == 'scisig'
        ]
        if not scisig_fields:
            ScholarUtils.log('info', 'parsing settings failed: scisig')
            return False

        urlargs = {
            'start': settings.starting_number,
            # Browser elements expose attributes through get_attribute(),
            # the same accessor used for 'type' and 'name' above.
            'scisig': scisig_fields[0].get_attribute('value'),
            'num': settings.per_page_results,
            'scis': 'no',
            'scisf': ''
        }

        if settings.citform != 0:
            urlargs['scis'] = 'yes'
            urlargs['scisf'] = '&scisf=%d' % settings.citform

        self.firefox.get(self.SET_SETTINGS_URL % urlargs)

        ScholarUtils.log('info', 'settings applied')
        return True
Example #10
0
    def apply_settings(self, settings):
        """
        Applies settings as provided by a ScholarSettings instance.

        Loads the settings page in the browser, reads the hidden ``scisig``
        form field from it, and then loads the settings-update URL built
        from that value and the requested settings.

        Returns True when there was nothing to apply or the update URL was
        requested, and False when the settings form or the ``scisig`` field
        could not be found on the page.
        """
        if settings is None or not settings.is_configured():
            return True

        self.settings = settings

        # This is a bit of work. We need to actually retrieve the
        # contents of the Settings pane HTML in order to extract
        # hidden fields before we can compose the query for updating
        # the settings.

        self.firefox.get(self.GET_SETTINGS_URL)

        # The plural finder yields an empty list when nothing matches, so a
        # missing form can be reported here.
        if not self.firefox.find_elements_by_id('gs_settings_form'):
            ScholarUtils.log('info', 'parsing settings failed: no form')
            return False

        scisig_fields = [x for x in self.firefox.find_elements_by_tag_name('input')
                         if x.get_attribute('type') == 'hidden'
                         and x.get_attribute('name') == 'scisig']
        if not scisig_fields:
            ScholarUtils.log('info', 'parsing settings failed: scisig')
            return False

        # Browser elements expose attributes through get_attribute(), the
        # same accessor used for 'type' and 'name' above.
        urlargs = {'start': settings.starting_number,
                   'scisig': scisig_fields[0].get_attribute('value'),
                   'num': settings.per_page_results,
                   'scis': 'no',
                   'scisf': ''}

        if settings.citform != 0:
            urlargs['scis'] = 'yes'
            urlargs['scisf'] = '&scisf=%d' % settings.citform

        self.firefox.get(self.SET_SETTINGS_URL % urlargs)

        ScholarUtils.log('info', 'settings applied')
        return True
Example #11
0
 def set_starting_number(self, starting_number):
     """
     Stores the starting result number after passing it through
     ScholarUtils.ensure_int together with an explanatory message.
     """
     self.starting_number = ScholarUtils.ensure_int(
         starting_number,
         'starting number of results on page must be numeric')
Example #12
0
def main():
    """
    Command-line entry point for scholar.py.

    Parses the command line, applies the requested citation format through
    a ScholarQuerier, and builds either a cluster query or a search query
    from the options. When a URLs JSON file is given with -U/--urls_file,
    ``loop`` is run once per URL with an output file name under
    ``../results/`` derived from the first run of digits in the URL.

    Returns 1 when called without arguments or with invalid options, 0
    after printing the version, and None otherwise.
    """
    usage = """scholar.py [options] <query string>
A command-line interface to Google Scholar.

Examples:

# Retrieve one article written by Einstein on quantum theory:
scholar.py -c 1 --author "albert einstein" --phrase "quantum theory"

# Retrieve a BibTeX entry for that quantum theory paper:
scholar.py -c 1 -C 17749203648027613321 --citation bt

# Retrieve five articles written by Einstein after 1970 where the title
# does not contain the words "quantum" and "theory":
scholar.py -c 5 -a "albert einstein" -t --none "quantum theory" --after 1970"""

    fmt = optparse.IndentedHelpFormatter(max_help_position=50, width=100)
    parser = optparse.OptionParser(usage=usage, formatter=fmt)
    group = optparse.OptionGroup(
        parser, "Query arguments", "These options define search query arguments and parameters."
    )
    group.add_option("-a", "--author", metavar="AUTHORS", default=None, help="Author name(s)")
    group.add_option(
        "-A", "--all", metavar="WORDS", default=None, dest="allw", help="Results must contain all of these words"
    )
    group.add_option(
        "-s",
        "--some",
        metavar="WORDS",
        default=None,
        help='Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases',
    )
    group.add_option(
        "-n",
        "--none",
        metavar="WORDS",
        default=None,
        help="Results must contain none of these words. See -s|--some re. formatting",
    )
    group.add_option("-p", "--phrase", metavar="PHRASE", default=None, help="Results must contain exact phrase")
    group.add_option("-t", "--title-only", action="store_true", default=False, help="Search title only")
    group.add_option(
        "-P", "--pub", metavar="PUBLICATIONS", default=None, help="Results must have appeared in this publication"
    )
    group.add_option("--after", metavar="YEAR", default=None, help="Results must have appeared in or after given year")
    group.add_option(
        "--before", metavar="YEAR", default=None, help="Results must have appeared in or before given year"
    )
    group.add_option("--no-patents", action="store_true", default=False, help="Do not include patents in results")
    group.add_option("--no-citations", action="store_true", default=False, help="Do not include citations in results")
    group.add_option(
        "-C",
        "--cluster-id",
        metavar="CLUSTER_ID",
        default=None,
        help="Do not search, just use articles in given cluster ID",
    )
    group.add_option("-c", "--count", type="int", default=None, help="Maximum number of results")
    group.add_option("-S", "--start", type="int", default=0, help="Starting page of results")
    group.add_option("-u", "--url", metavar="URL", default=None, help="Citation list's url")
    # The default must be None: the code below runs the URLs-file branch
    # whenever this option "is not None".
    group.add_option(
        "-U",
        "--urls_file",
        metavar="URL",
        dest="urls",
        default=None,
        help="Citation list's urls json file (['http: // scholar.google.com/scholar?cites=4412725301034017472 & as_sdt=2005 & sciodt=1, 5 & hl=en', ...])",
    )
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, "Output format", "These options control the appearance of the results.")
    group.add_option("--txt", action="store_true", help="Print article data in text format (default)")
    group.add_option("--txt-globals", action="store_true", help="Like --txt, but first print global results too")
    group.add_option("--csv", action="store_true", help='Print article data in CSV form (separator is "|")')
    group.add_option("--csv-header", action="store_true", help="Like --csv, but print header with column names")
    group.add_option("--json", action="store_true", help='Save article data in JSON form (default file: "../res.json")')
    group.add_option(
        "--citation",
        metavar="FORMAT",
        default=None,
        help='Print article details in standard citation format. Argument Must be one of "bt" (BibTeX), "en" (EndNote), "rm" (RefMan), or "rw" (RefWorks).',
    )
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, "Miscellaneous")
    group.add_option(
        "--cookie-file",
        metavar="FILE",
        default=None,
        help="File to use for cookie storage. If given, will read any existing cookies if found at startup, and save resulting cookies in the end.",
    )
    group.add_option(
        "-d",
        "--debug",
        action="count",
        default=0,
        help="Enable verbose logging to stderr. Repeated options increase detail of debug output.",
    )
    group.add_option("-v", "--version", action="store_true", default=False, help="Show version information")
    parser.add_option_group(group)

    options, _ = parser.parse_args()

    # Show help when the program was started without any arguments
    if len(sys.argv) == 1:
        parser.print_help()
        return 1

    if options.debug > 0:
        options.debug = min(options.debug, ScholarUtils.LOG_LEVELS["debug"])
        ScholarConf.LOG_LEVEL = options.debug
        ScholarUtils.log("info", "using log level %d" % ScholarConf.LOG_LEVEL)

    if options.version:
        print("This is scholar.py %s." % ScholarConf.VERSION)
        return 0

    if options.cookie_file:
        ScholarConf.COOKIE_JAR_FILE = options.cookie_file

    # Sanity-check the options: if they include a cluster ID query, it
    # makes no sense to have search arguments:
    if options.cluster_id is not None:
        if (
            options.author
            or options.allw
            or options.some
            or options.none
            or options.phrase
            or options.title_only
            or options.pub
            or options.after
            or options.before
        ):
            print("Cluster ID queries do not allow additional search arguments.")
            return 1

    querier = ScholarQuerier()
    settings = ScholarSettings()

    if options.citation == "bt":
        settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX)
    elif options.citation == "en":
        settings.set_citation_format(ScholarSettings.CITFORM_ENDNOTE)
    elif options.citation == "rm":
        settings.set_citation_format(ScholarSettings.CITFORM_REFMAN)
    elif options.citation == "rw":
        settings.set_citation_format(ScholarSettings.CITFORM_REFWORKS)
    elif options.citation is not None:
        print('Invalid citation link format, must be one of "bt", "en", "rm", or "rw".')
        return 1

    querier.apply_settings(settings)

    if options.cluster_id:
        query = ClusterScholarQuery(cluster=options.cluster_id)
    else:
        query = SearchScholarQuery()
        if options.author:
            query.set_author(options.author)
        if options.allw:
            query.set_words(options.allw)
        if options.some:
            query.set_words_some(options.some)
        if options.none:
            query.set_words_none(options.none)
        if options.phrase:
            query.set_phrase(options.phrase)
        if options.title_only:
            query.set_scope(True)
        if options.pub:
            query.set_pub(options.pub)
        if options.after or options.before:
            query.set_timeframe(options.after, options.before)
        if options.no_patents:
            query.set_include_patents(False)
        if options.no_citations:
            query.set_include_citations(False)

    if options.url is not None:
        query.set_url(options.url)

    if options.urls is not None:
        try:
            with open(options.urls) as data_file:
                urls = json.load(data_file)
            # A list of dicts is reduced to the "url_citations" value of
            # each entry; any other content is iterated as it is.
            if isinstance(urls, list) and len(urls) > 0 and isinstance(urls[0], dict):
                urls = [x["url_citations"] for x in urls]
            for url in urls:
                query.set_url(url)
                reset_res()
                loop(options, query, querier, file_name="../results/" + re.match(".*?([0-9]+)", url).group(1) + ".json")
        except Exception as e:
            # Top-level boundary: report the problem and finish.
            print(e)
            print("error with the urls json file provided")
Example #13
0
 def set_cluster(self, cluster):
     """
     Points the search at a Google Scholar results cluster ID. The value
     is passed through ScholarUtils.ensure_int before being stored.
     """
     self.cluster = ScholarUtils.ensure_int(
         cluster, 'cluster ID must be numeric')
Example #14
0
 def set_cluster(self, cluster):
     """
     Stores the Google Scholar results cluster ID to search in, after
     running it through ScholarUtils.ensure_int.
     """
     numeric_hint = 'cluster ID must be numeric'
     cluster_id = ScholarUtils.ensure_int(cluster, numeric_hint)
     self.cluster = cluster_id
Example #15
0
def main():
    """
    Command-line entry point for scholar.py.

    Parses the command line, applies the requested citation format through
    a ScholarQuerier, and builds either a cluster query or a search query
    from the options. When a URLs JSON file is given with -U/--urls_file,
    ``loop`` is run once per URL with an output file name under
    ``../results/`` derived from the first run of digits in the URL.

    Returns 1 when called without arguments or with invalid options, 0
    after printing the version, and None otherwise.
    """
    # Every print below uses the call form with a single argument, which
    # produces the same output under Python 2 and Python 3.
    print("")
    usage = """scholar.py [options] <query string>
A command-line interface to Google Scholar.

Examples:

# Retrieve one article written by Einstein on quantum theory:
scholar.py -c 1 --author "albert einstein" --phrase "quantum theory"

# Retrieve a BibTeX entry for that quantum theory paper:
scholar.py -c 1 -C 17749203648027613321 --citation bt

# Retrieve five articles written by Einstein after 1970 where the title
# does not contain the words "quantum" and "theory":
scholar.py -c 5 -a "albert einstein" -t --none "quantum theory" --after 1970"""

    fmt = optparse.IndentedHelpFormatter(max_help_position=50, width=100)
    parser = optparse.OptionParser(usage=usage, formatter=fmt)
    group = optparse.OptionGroup(
        parser, 'Query arguments',
        'These options define search query arguments and parameters.')
    group.add_option('-a',
                     '--author',
                     metavar='AUTHORS',
                     default=None,
                     help='Author name(s)')
    group.add_option('-A',
                     '--all',
                     metavar='WORDS',
                     default=None,
                     dest='allw',
                     help='Results must contain all of these words')
    group.add_option(
        '-s',
        '--some',
        metavar='WORDS',
        default=None,
        help=
        'Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases'
    )
    group.add_option(
        '-n',
        '--none',
        metavar='WORDS',
        default=None,
        help=
        'Results must contain none of these words. See -s|--some re. formatting'
    )
    group.add_option('-p',
                     '--phrase',
                     metavar='PHRASE',
                     default=None,
                     help='Results must contain exact phrase')
    group.add_option('-t',
                     '--title-only',
                     action='store_true',
                     default=False,
                     help='Search title only')
    group.add_option('-P',
                     '--pub',
                     metavar='PUBLICATIONS',
                     default=None,
                     help='Results must have appeared in this publication')
    group.add_option('--after',
                     metavar='YEAR',
                     default=None,
                     help='Results must have appeared in or after given year')
    group.add_option('--before',
                     metavar='YEAR',
                     default=None,
                     help='Results must have appeared in or before given year')
    group.add_option('--no-patents',
                     action='store_true',
                     default=False,
                     help='Do not include patents in results')
    group.add_option('--no-citations',
                     action='store_true',
                     default=False,
                     help='Do not include citations in results')
    group.add_option(
        '-C',
        '--cluster-id',
        metavar='CLUSTER_ID',
        default=None,
        help='Do not search, just use articles in given cluster ID')
    group.add_option('-c',
                     '--count',
                     type='int',
                     default=None,
                     help='Maximum number of results')
    group.add_option('-S',
                     '--start',
                     type='int',
                     default=0,
                     help='Starting page of results')
    group.add_option('-u',
                     '--url',
                     metavar='URL',
                     default=None,
                     help='Citation list\'s url')
    group.add_option(
        '-U',
        '--urls_file',
        metavar='URL',
        dest='urls',
        default=None,
        help=
        'Citation list\'s urls json file ([\'http: // scholar.google.com/scholar?cites=4412725301034017472 & as_sdt=2005 & sciodt=1, 5 & hl=en\', ...])'
    )
    parser.add_option_group(group)

    group = optparse.OptionGroup(
        parser, 'Output format',
        'These options control the appearance of the results.')
    group.add_option('--txt',
                     action='store_true',
                     help='Print article data in text format (default)')
    group.add_option('--txt-globals',
                     action='store_true',
                     help='Like --txt, but first print global results too')
    group.add_option('--csv',
                     action='store_true',
                     help='Print article data in CSV form (separator is "|")')
    group.add_option('--csv-header',
                     action='store_true',
                     help='Like --csv, but print header with column names')
    group.add_option(
        '--json',
        action='store_true',
        help='Save article data in JSON form (default file: "../res.json")')
    group.add_option(
        '--citation',
        metavar='FORMAT',
        default=None,
        help=
        'Print article details in standard citation format. Argument Must be one of "bt" (BibTeX), "en" (EndNote), "rm" (RefMan), or "rw" (RefWorks).'
    )
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, 'Miscellaneous')
    group.add_option(
        '--cookie-file',
        metavar='FILE',
        default=None,
        help=
        'File to use for cookie storage. If given, will read any existing cookies if found at startup, and save resulting cookies in the end.'
    )
    group.add_option(
        '-d',
        '--debug',
        action='count',
        default=0,
        help=
        'Enable verbose logging to stderr. Repeated options increase detail of debug output.'
    )
    group.add_option('-v',
                     '--version',
                     action='store_true',
                     default=False,
                     help='Show version information')
    parser.add_option_group(group)

    options, _ = parser.parse_args()

    # Show help when the program was started without any arguments
    if len(sys.argv) == 1:
        parser.print_help()
        return 1

    if options.debug > 0:
        options.debug = min(options.debug, ScholarUtils.LOG_LEVELS['debug'])
        ScholarConf.LOG_LEVEL = options.debug
        ScholarUtils.log('info', 'using log level %d' % ScholarConf.LOG_LEVEL)

    if options.version:
        print('This is scholar.py %s.' % ScholarConf.VERSION)
        return 0

    if options.cookie_file:
        ScholarConf.COOKIE_JAR_FILE = options.cookie_file

    # Sanity-check the options: if they include a cluster ID query, it
    # makes no sense to have search arguments:
    if options.cluster_id is not None:
        if options.author or options.allw or options.some or options.none \
           or options.phrase or options.title_only or options.pub \
           or options.after or options.before:
            print(
                'Cluster ID queries do not allow additional search arguments.')
            return 1

    querier = ScholarQuerier()
    settings = ScholarSettings()

    if options.citation == 'bt':
        settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX)
    elif options.citation == 'en':
        settings.set_citation_format(ScholarSettings.CITFORM_ENDNOTE)
    elif options.citation == 'rm':
        settings.set_citation_format(ScholarSettings.CITFORM_REFMAN)
    elif options.citation == 'rw':
        settings.set_citation_format(ScholarSettings.CITFORM_REFWORKS)
    elif options.citation is not None:
        print(
            'Invalid citation link format, must be one of "bt", "en", "rm", or "rw".'
        )
        return 1

    querier.apply_settings(settings)

    if options.cluster_id:
        query = ClusterScholarQuery(cluster=options.cluster_id)
    else:
        query = SearchScholarQuery()
        if options.author:
            query.set_author(options.author)
        if options.allw:
            query.set_words(options.allw)
        if options.some:
            query.set_words_some(options.some)
        if options.none:
            query.set_words_none(options.none)
        if options.phrase:
            query.set_phrase(options.phrase)
        if options.title_only:
            query.set_scope(True)
        if options.pub:
            query.set_pub(options.pub)
        if options.after or options.before:
            query.set_timeframe(options.after, options.before)
        if options.no_patents:
            query.set_include_patents(False)
        if options.no_citations:
            query.set_include_citations(False)

    if options.url is not None:
        query.set_url(options.url)
        print(options.url)

    if options.urls is not None:
        print(options.urls)
        try:
            with open(options.urls) as data_file:
                urls = json.load(data_file)
            # A list of dicts is reduced to the 'url_citations' value of
            # each entry; any other content is iterated as it is.
            if isinstance(urls, list) and len(urls) > 0 and isinstance(
                    urls[0], dict):
                urls = [x['url_citations'] for x in urls]
            for url in urls:
                query.set_url(url)
                reset_res()
                loop(options,
                     query,
                     querier,
                     file_name='../results/' +
                     re.match('.*?([0-9]+)', url).group(1) + '.json')
        except Exception as e:
            # Top-level boundary: report the problem and finish.
            print(e)
Example #16
0
def main():
    """Command-line entry point for scholar.py.

    Parses the command-line options, configures a ScholarQuerier with the
    requested citation format, builds either a cluster query or a search
    query from the options and, when a URLs JSON file is given, runs
    ``loop`` once per URL, writing each result set to
    ``../results/<first number found in the URL>.json``.

    The URLs file may contain either a list of URL strings or a list of
    dicts carrying the URL under the ``url_citations`` key.

    Returns:
        1 when no arguments were given, when a cluster ID is combined with
        search arguments, or when the citation format is invalid; 0 after
        printing the version; None otherwise.
    """
    print("")
    usage = """scholar.py [options] <query string>
A command-line interface to Google Scholar.

Examples:

# Retrieve one article written by Einstein on quantum theory:
scholar.py -c 1 --author "albert einstein" --phrase "quantum theory"

# Retrieve a BibTeX entry for that quantum theory paper:
scholar.py -c 1 -C 17749203648027613321 --citation bt

# Retrieve five articles written by Einstein after 1970 where the title
# does not contain the words "quantum" and "theory":
scholar.py -c 5 -a "albert einstein" -t --none "quantum theory" --after 1970"""

    fmt = optparse.IndentedHelpFormatter(max_help_position=50, width=100)
    parser = optparse.OptionParser(usage=usage, formatter=fmt)
    group = optparse.OptionGroup(parser, 'Query arguments',
                                 'These options define search query arguments and parameters.')
    group.add_option('-a', '--author', metavar='AUTHORS', default=None,
                     help='Author name(s)')
    group.add_option('-A', '--all', metavar='WORDS', default=None, dest='allw',
                     help='Results must contain all of these words')
    group.add_option('-s', '--some', metavar='WORDS', default=None,
                     help='Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases')
    group.add_option('-n', '--none', metavar='WORDS', default=None,
                     help='Results must contain none of these words. See -s|--some re. formatting')
    group.add_option('-p', '--phrase', metavar='PHRASE', default=None,
                     help='Results must contain exact phrase')
    group.add_option('-t', '--title-only', action='store_true', default=False,
                     help='Search title only')
    group.add_option('-P', '--pub', metavar='PUBLICATIONS', default=None,
                     help='Results must have appeared in this publication')
    group.add_option('--after', metavar='YEAR', default=None,
                     help='Results must have appeared in or after given year')
    group.add_option('--before', metavar='YEAR', default=None,
                     help='Results must have appeared in or before given year')
    group.add_option('--no-patents', action='store_true', default=False,
                     help='Do not include patents in results')
    group.add_option('--no-citations', action='store_true', default=False,
                     help='Do not include citations in results')
    group.add_option('-C', '--cluster-id', metavar='CLUSTER_ID', default=None,
                     help='Do not search, just use articles in given cluster ID')
    group.add_option('-c', '--count', type='int', default=None,
                     help='Maximum number of results')
    group.add_option('-S', '--start', type='int', default=0,
                     help='Starting page of results')
    group.add_option('-u', '--url', metavar='URL', default=None,
                     help='Citation list\'s url')
    group.add_option('-U', '--urls_file', metavar='URL', dest='urls', default=None,
                     help='Citation list\'s urls json file ([\'http: // scholar.google.com/scholar?cites=4412725301034017472 & as_sdt=2005 & sciodt=1, 5 & hl=en\', ...])')
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, 'Output format',
                                 'These options control the appearance of the results.')
    group.add_option('--txt', action='store_true',
                     help='Print article data in text format (default)')
    group.add_option('--txt-globals', action='store_true',
                     help='Like --txt, but first print global results too')
    group.add_option('--csv', action='store_true',
                     help='Print article data in CSV form (separator is "|")')
    group.add_option('--csv-header', action='store_true',
                     help='Like --csv, but print header with column names')
    group.add_option('--json', action='store_true',
                     help='Save article data in JSON form (default file: "../res.json")')
    group.add_option('--citation', metavar='FORMAT', default=None,
                     help='Print article details in standard citation format. Argument Must be one of "bt" (BibTeX), "en" (EndNote), "rm" (RefMan), or "rw" (RefWorks).')
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, 'Miscellaneous')
    group.add_option('--cookie-file', metavar='FILE', default=None,
                     help='File to use for cookie storage. If given, will read any existing cookies if found at startup, and save resulting cookies in the end.')
    group.add_option('-d', '--debug', action='count', default=0,
                     help='Enable verbose logging to stderr. Repeated options increase detail of debug output.')
    group.add_option('-v', '--version', action='store_true', default=False,
                     help='Show version information')
    parser.add_option_group(group)

    options, _ = parser.parse_args()

    # Show help when the script is invoked without any arguments at all.
    if len(sys.argv) == 1:
        parser.print_help()
        return 1

    if options.debug > 0:
        # Clamp the repeat count of -d to the most verbose known level.
        options.debug = min(options.debug, ScholarUtils.LOG_LEVELS['debug'])
        ScholarConf.LOG_LEVEL = options.debug
        ScholarUtils.log('info', 'using log level %d' % ScholarConf.LOG_LEVEL)

    if options.version:
        print('This is scholar.py %s.' % ScholarConf.VERSION)
        return 0

    if options.cookie_file:
        ScholarConf.COOKIE_JAR_FILE = options.cookie_file

    # Sanity-check the options: if they include a cluster ID query, it
    # makes no sense to have search arguments:
    if options.cluster_id is not None:
        if options.author or options.allw or options.some or options.none \
           or options.phrase or options.title_only or options.pub \
           or options.after or options.before:
            print(
                'Cluster ID queries do not allow additional search arguments.')
            return 1

    querier = ScholarQuerier()
    settings = ScholarSettings()

    if options.citation == 'bt':
        settings.set_citation_format(ScholarSettings.CITFORM_BIBTEX)
    elif options.citation == 'en':
        settings.set_citation_format(ScholarSettings.CITFORM_ENDNOTE)
    elif options.citation == 'rm':
        settings.set_citation_format(ScholarSettings.CITFORM_REFMAN)
    elif options.citation == 'rw':
        settings.set_citation_format(ScholarSettings.CITFORM_REFWORKS)
    elif options.citation is not None:
        print(
            'Invalid citation link format, must be one of "bt", "en", "rm", or "rw".')
        return 1

    querier.apply_settings(settings)

    if options.cluster_id:
        query = ClusterScholarQuery(cluster=options.cluster_id)
    else:
        query = SearchScholarQuery()
        if options.author:
            query.set_author(options.author)
        if options.allw:
            query.set_words(options.allw)
        if options.some:
            query.set_words_some(options.some)
        if options.none:
            query.set_words_none(options.none)
        if options.phrase:
            query.set_phrase(options.phrase)
        if options.title_only:
            query.set_scope(True)
        if options.pub:
            query.set_pub(options.pub)
        if options.after or options.before:
            query.set_timeframe(options.after, options.before)
        if options.no_patents:
            query.set_include_patents(False)
        if options.no_citations:
            query.set_include_citations(False)

    if options.url is not None:
        query.set_url(options.url)
        print(options.url)

    if options.urls is not None:
        print(options.urls)
        # Top-level boundary: any failure while reading the file or running
        # a query is reported on stdout instead of producing a traceback.
        try:
            with open(options.urls) as data_file:
                urls = json.load(data_file)
            # Accept a list of dicts as well as a plain list of URL strings.
            if isinstance(urls, list) and urls and isinstance(urls[0], dict):
                urls = [entry['url_citations'] for entry in urls]
            for url in urls:
                # The first run of digits in the URL names the output file.
                id_match = re.match('.*?([0-9]+)', url)
                if id_match is None:
                    # Without a number there is no file name to write to;
                    # report the URL and carry on with the rest of the batch.
                    print('Skipping URL without a numeric id: %s' % url)
                    continue
                query.set_url(url)
                reset_res()
                loop(options, query, querier,
                     file_name='../results/' + id_match.group(1) + '.json')
        except Exception as e:
            print(e)
Example #17
0
 def set_starting_number(self, starting_number):
     """
     Store the starting number of results on the page.

     The value is passed through ScholarUtils.ensure_int together with
     the message to use when it is not numeric; how a non-numeric value
     is reported is decided by that helper.
     """
     self.starting_number = ScholarUtils.ensure_int(
         starting_number,
         'starting number of results on page must be numeric')
Example #18
0
 def set_num_page_results(self, num_page_results):
     """
     Store the maximum number of results per page in self.num_results.

     The value is passed through ScholarUtils.ensure_int together with
     the message to use when it is not numeric; how a non-numeric value
     is reported is decided by that helper.
     """
     self.num_results = ScholarUtils.ensure_int(
         num_page_results,
         'maximum number of results on page must be numeric')
Example #19
0
 def set_num_page_results(self, num_page_results):
     """
     Store the maximum number of results per page in self.num_results.

     The value is passed through ScholarUtils.ensure_int together with
     the message to use when it is not numeric; how a non-numeric value
     is reported is decided by that helper.
     """
     self.num_results = ScholarUtils.ensure_int(
         num_page_results,
         'maximum number of results on page must be numeric')