def SearchScholar(options):
    """Build a Google Scholar query from ``options``, send it and return the results.

    Args:
        options: Mapping read with item access (a plain ``dict`` works).
            The keys ``cluster_id``, ``author``, ``allw``, ``some``,
            ``none``, ``phrase``, ``title_only``, ``pub``, ``after``,
            ``before`` and ``no_patents`` are looked up with ``[]``, so a
            missing key raises ``KeyError``. Falsy values are ignored.

    Returns:
        Whatever ``scholar.get_results_objects(querier)`` returns for the
        querier that sent the query.
    """
    querier = scholar.ScholarQuerier()
    querier.apply_settings(scholar.ScholarSettings())

    # Every lookup uses item access: mixing ``options['x']`` with
    # ``options.x`` breaks as soon as ``options`` is a plain dict.
    if options['cluster_id']:
        query = scholar.ClusterScholarQuery(cluster=options['cluster_id'])
    else:
        query = scholar.SearchScholarQuery()
        if options['author']:
            query.set_author(options['author'])
        if options['allw']:
            query.set_words(options['allw'])
        if options['some']:
            query.set_words_some(options['some'])
        if options['none']:
            query.set_words_none(options['none'])
        if options['phrase']:
            query.set_phrase(options['phrase'])
        if options['title_only']:
            query.set_scope(True)
        if options['pub']:
            query.set_pub(options['pub'])
        if options['after'] or options['before']:
            query.set_timeframe(options['after'], options['before'])
        if options['no_patents']:
            query.set_include_patents(False)

    # Return value is unused; presumably called so an unusable query fails
    # before anything is sent -- TODO confirm against the scholar module.
    query.get_url()
    querier.send_query(query)
    return scholar.get_results_objects(querier)
def get_paper_data(querier, paper):
    """Query Google Scholar for one paper and return the top article.

    Args:
        querier: Object used to send the query; its ``send_query`` method
            is called and its ``articles`` attribute is read afterwards.
        paper: Either a ``dict`` (its ``'title'`` and ``'cluster_id'``
            entries are read with ``.get``) or a ``str`` holding the title.

    Returns:
        The first element of ``querier.articles``.

    Raises:
        TypeError: If ``paper`` is neither a ``dict`` nor a ``str``.
        IndexError: If ``querier.articles`` is an empty sequence.
    """
    # A bare title carries no cluster id; default it so the check below
    # never reads an unbound name.
    cluster_id = None
    if isinstance(paper, dict):
        title = paper.get('title')
        cluster_id = paper.get('cluster_id')
    elif isinstance(paper, str):
        title = paper
    else:
        raise TypeError(
            "Input arg paper is of an invalid format %s" % repr(paper))

    if cluster_id:
        print('Query by cluster_id')
        query = scholar.ClusterScholarQuery(cluster=cluster_id)
    else:
        print('Query by title "%s"' % title)
        query = scholar.SearchScholarQuery()
        query.set_phrase(title)

    # Important: asking for a single result per page reduces the chance of
    # being blocked by Google.
    query.set_num_page_results(1)

    querier.send_query(query)
    scholar.txt(querier, with_globals=True)

    articles = querier.articles
    time.sleep(1)  # brief pause after each query
    return articles[0]  # only return the top result
def main():
    """Scrape citing-article search results for seed cluster ids from a CSV.

    Reads ``../results/5556531000720111691.csv.bkup``, takes the first
    column of each row as a cluster id, and (for now) processes only the
    first row, the first article and the first result page. Each result on
    that page is written as ``<result cluster id> | <seed cluster id> |
    <result soup>`` to ``../results/01-<row index>.csv``. The function
    then sleeps for a random interval.

    NOTE(review): this file defines another ``main`` further down; at
    import time the later definition replaces this one.
    """
    # Build the frame in a single call: DataFrame.append inside a loop
    # copies the frame on every row and no longer exists in pandas 2.x.
    with open('../results/5556531000720111691.csv.bkup', 'r') as seed_file:
        rows = [line.split(',', 2) for line in seed_file]
    data = pd.DataFrame(rows)

    #
    # for each cluster id
    #
    # NOTE: despite its name, ``from_cluster_id`` is the row index.
    for from_cluster_id in range(data.shape[0])[:1]:  # just the first one, for now
        print(from_cluster_id)
        cluster_id = data.iloc[from_cluster_id, 0]
        try:
            cluster_id = int(cluster_id)
        except ValueError:
            # Non-numeric first column (e.g. a header row): skip it.
            continue

        querier = scholar.ScholarQuerier()
        querier.send_query(scholar.ClusterScholarQuery(cluster=cluster_id))

        #
        # for each article in search results
        #
        for _article in querier.articles[:1]:  # first article result, for now
            current_article = GoogleScholarArticleSimple()
            current_article.cluster_id = cluster_id
            (current_article.set_search_soup()
                            .set_num_search_results()
                            .set_num_search_pages())

            #
            # for each search page result of citing article
            #
            page_numbers = range(current_article.num_search_pages)[:1]
            for search_page_number in page_numbers:  # first page, for now
                # NOTE(review): the row index, not ``cluster_id``, is
                # passed to the URL template and used in the output file
                # name -- confirm this is intended.
                url = citations_url_generic.format(search_page_number * 10,
                                                   from_cluster_id)
                r = requests.get(url)
                soup = BeautifulSoup(r.text)
                gs_r = soup.find_all("div", class_="gs_r")

                output_file_path = '../results/01-{}.csv'.format(
                    from_cluster_id)
                # 'w' truncates any previous output for this row index;
                # the file is created even when the page has no results.
                with open(output_file_path, 'w') as out_file:
                    #
                    # for each search result
                    #
                    for citing_article_soup in gs_r:
                        result_article = DanGoogleScholarArticle(
                            soup=citing_article_soup)
                        result_article.parse_title()
                        result_article.parse_cluster_id()
                        out_file.write('{}\t|\t{}\t|\t{}\n'.format(
                            result_article.cluster_id,
                            cluster_id,
                            citing_article_soup))

                # Random pause before the next request.
                sleep_time = random() * randint(10, 100)
                print('cluster_id: {}, page: {}, sleeping: {}'.format(
                    from_cluster_id, search_page_number, sleep_time))
                sleep(sleep_time)
def main(): usage = """demo.py [options] <query string> A command-line interface to Google Scholar. Examples: # Retrieve one article written by Einstein on quantum theory: demo.py -c 1 --author "albert einstein" --phrase "quantum theory" # Retrieve a BibTeX entry for that quantum theory paper: demo.py -c 1 -C 17749203648027613321 --citation bt # Retrieve five articles written by Einstein after 1970 where the title # does not contain the words "quantum" and "theory": demo.py -c 5 -a "albert einstein" -t --none "quantum theory" --after 1970""" fmt = optparse.IndentedHelpFormatter(max_help_position=50, width=100) parser = optparse.OptionParser(usage=usage, formatter=fmt) group = optparse.OptionGroup(parser, 'Query arguments', 'These options define search query arguments and parameters.') group.add_option('-a', '--author', metavar='AUTHORS', default=None, help='Author name(s)') group.add_option('-A', '--all', metavar='WORDS', default=None, dest='allw', help='Results must contain all of these words') group.add_option('-s', '--some', metavar='WORDS', default=None, help='Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases') group.add_option('-n', '--none', metavar='WORDS', default=None, help='Results must contain none of these words. See -s|--some re. 
formatting') group.add_option('-p', '--phrase', metavar='PHRASE', default=None, help='Results must contain exact phrase') group.add_option('-t', '--title-only', action='store_true', default=False, help='Search title only') group.add_option('-P', '--pub', metavar='PUBLICATIONS', default=None, help='Results must have appeared in this publication') group.add_option('--after', metavar='YEAR', default=None, help='Results must have appeared in or after given year') group.add_option('--before', metavar='YEAR', default=None, help='Results must have appeared in or before given year') group.add_option('--no-patents', action='store_true', default=False, help='Do not include patents in results') group.add_option('--no-citations', action='store_true', default=False, help='Do not include citations in results') group.add_option('-C', '--cluster-id', metavar='CLUSTER_ID', default=None, help='Do not search, just use articles in given cluster ID') group.add_option('-c', '--count', type='int', default=None, help='Maximum number of results') parser.add_option_group(group) group = optparse.OptionGroup(parser, 'Output format', 'These options control the appearance of the results.') group.add_option('--txt', action='store_true', help='Print article data in text format (default)') group.add_option('--txt-globals', action='store_true', help='Like --txt, but first print global results too') group.add_option('--csv', action='store_true', help='Print article data in CSV form (separator is "|")') group.add_option('--csv-header', action='store_true', help='Like --csv, but print header with column names') group.add_option('--citation', metavar='FORMAT', default=None, help='Print article details in standard citation format. 
Argument Must be one of "bt" (BibTeX), "en" (EndNote), "rm" (RefMan), or "rw" (RefWorks).') parser.add_option_group(group) group = optparse.OptionGroup(parser, 'Miscellaneous') group.add_option('--cookie-file', metavar='FILE', default=None, help='File to use for cookie storage. If given, will read any existing cookies if found at startup, and save resulting cookies in the end.') group.add_option('-d', '--debug', action='count', default=0, help='Enable verbose logging to stderr. Repeated options increase detail of debug output.') group.add_option('-v', '--version', action='store_true', default=False, help='Show version information') parser.add_option_group(group) options, _ = parser.parse_args() # Show help if we have neither keyword search nor author name if len(sys.argv) == 1: parser.print_help() return 1 if options.debug > 0: options.debug = min(options.debug, sc.ScholarUtils.LOG_LEVELS['debug']) sc.ScholarConf.LOG_LEVEL = options.debug sc.ScholarUtils.log('info', 'using log level %d' % sc.ScholarConf.LOG_LEVEL) if options.version: print('This is demo.py %s.' 
% sc.ScholarConf.VERSION) return 0 if options.cookie_file: sc.ScholarConf.COOKIE_JAR_FILE = options.cookie_file # Sanity-check the options: if they include a cluster ID query, it # makes no sense to have search arguments: if options.cluster_id is not None: if options.author or options.allw or options.some or options.none \ or options.phrase or options.title_only or options.pub \ or options.after or options.before: print('Cluster ID queries do not allow additional search arguments.') return 1 querier = sc.ScholarQuerier() settings = sc.ScholarSettings() if options.citation == 'bt': settings.set_citation_format(sc.ScholarSettings.CITFORM_BIBTEX) elif options.citation == 'en': settings.set_citation_format(sc.ScholarSettings.CITFORM_ENDNOTE) elif options.citation == 'rm': settings.set_citation_format(sc.ScholarSettings.CITFORM_REFMAN) elif options.citation == 'rw': settings.set_citation_format(sc.ScholarSettings.CITFORM_REFWORKS) elif options.citation is not None: print('Invalid citation link format, must be one of "bt", "en", "rm", or "rw".') return 1 querier.apply_settings(settings) if options.cluster_id: query = sc.ClusterScholarQuery(cluster=options.cluster_id) else: query = sc.SearchScholarQuery() if options.author: query.set_author(options.author) if options.allw: query.set_words(options.allw) if options.some: query.set_words_some(options.some) if options.none: query.set_words_none(options.none) if options.phrase: query.set_phrase(options.phrase) if options.title_only: query.set_scope(True) if options.pub: query.set_pub(options.pub) if options.after or options.before: query.set_timeframe(options.after, options.before) if options.no_patents: query.set_include_patents(False) if options.no_citations: query.set_include_citations(False) if options.count is not None: options.count = min(options.count, sc.ScholarConf.MAX_PAGE_RESULTS) query.set_num_page_results(options.count) querier.send_query(query) if options.csv: sc.csv(querier) elif options.csv_header: 
sc.csv(querier, header=True) elif options.citation is not None: sc.citation_export(querier) else: sc.txt(querier, with_globals=options.txt_globals) if options.cookie_file: querier.save_cookies() return 0
metavar='CLUSTER_ID', default=None, dest='cluster_id', ## help='Do not search, just use articles in given cluster ID') ## ################### HAVE NOT TESTED ############################################################## args = parser.parse_args() start_idx = 0 num_results = args.num_results all_articles = [] while num_results - PAGE_RESULT >= 0: print('working on results', start_idx, 'through', start_idx + PAGE_RESULT, '...') if args.cluster_id: query = scholar.ClusterScholarQuery(cluster=args.cluster_id) else: query = scholar.SearchScholarQuery() query.set_num_page_results(PAGE_RESULT) query.set_phrase(args.phrase) query.set_timeframe(args.after, None) query.set_include_citations(not args.citations) query.set_include_patents(not args.patents) query.set_author(args.author) query.set_start(start_idx) querier = scholar.ScholarQuerier() querier.send_query(query) #either no query results or the robot checker is blocking the IP ADDR / USER_AGENT