Example #1
0
        def SearchScholar(options):
            """Send a Google Scholar query described by ``options`` and return the results.

            Args:
                options: Mapping read by key. The keys looked up are
                    ``cluster_id``, ``author``, ``allw``, ``some``, ``none``,
                    ``phrase``, ``title_only``, ``pub``, ``after``, ``before``
                    and ``no_patents``; every one of them must be present
                    (a falsy value means "not set").

            Returns:
                Whatever ``scholar.get_results_objects(querier)`` returns for
                the querier after the query has been sent.
            """
            querier = scholar.ScholarQuerier()
            settings = scholar.ScholarSettings()
            querier.apply_settings(settings)

            # ``options`` is accessed by key throughout; the original mixed in
            # attribute access (options.cluster_id / .after / .before), which
            # fails on a plain dict.
            if options['cluster_id']:
                query = scholar.ClusterScholarQuery(cluster=options['cluster_id'])
            else:
                query = scholar.SearchScholarQuery()
                if options['author']:
                    query.set_author(options['author'])
                if options['allw']:
                    query.set_words(options['allw'])
                if options['some']:
                    query.set_words_some(options['some'])
                if options['none']:
                    query.set_words_none(options['none'])
                if options['phrase']:
                    query.set_phrase(options['phrase'])
                if options['title_only']:
                    query.set_scope(True)
                if options['pub']:
                    query.set_pub(options['pub'])
                if options['after'] or options['before']:
                    query.set_timeframe(options['after'], options['before'])
                if options['no_patents']:
                    query.set_include_patents(False)

            # The URL itself is discarded here.
            # NOTE(review): presumably kept so an under-specified query fails
            # before anything is sent -- confirm against scholar's get_url().
            query.get_url()
            querier.send_query(query)
            return scholar.get_results_objects(querier)
Example #2
0
def get_paper_data(querier, paper):
    """Query Google Scholar for ``paper`` and return the top result.

    Args:
        querier: Object the query is sent through; it must provide
            ``send_query(query)`` and expose the results as ``articles``.
        paper: Either a dict, from which ``'title'`` and ``'cluster_id'`` are
            read with ``.get``, or a plain title string.

    Returns:
        The first element of ``querier.articles`` after the query was sent.

    Raises:
        TypeError: If ``paper`` is neither a dict nor a string.
        IndexError: If the query produced no articles.
    """
    if isinstance(paper, dict):
        title = paper.get('title')
        cluster_id = paper.get('cluster_id')
    elif isinstance(paper, str):
        title = paper
        # A bare title carries no cluster id; bind it so the check below
        # does not hit an unbound local.
        cluster_id = None
    else:
        raise TypeError(
            "Input arg paper is of an invalid format %s" % repr(paper))

    if cluster_id:
        print('Query by cluster_id')
        query = scholar.ClusterScholarQuery(cluster=cluster_id)
    else:
        print('Query by title "%s"' % title)
        query = scholar.SearchScholarQuery()
        query.set_phrase(title)

    # Important: requesting a single result per page reduces the chance of
    # being blocked by Google.
    query.set_num_page_results(1)
    querier.send_query(query)
    scholar.txt(querier, with_globals=True)

    articles = querier.articles
    time.sleep(1)
    return articles[0]  # Only return the top result
Example #3
0
def main():
    """Crawl citing-article results for the first cluster ID in the backup CSV.

    Reads ``../results/5556531000720111691.csv.bkup`` and takes the first
    field of the first row as a cluster ID (rows whose first field is not an
    integer are skipped). It sends a cluster query for that ID and then, for
    the first returned article and the first result page only, downloads the
    citations page. One ``result cluster id | cluster id | raw result markup``
    line is written per ``div.gs_r`` element to ``../results/01-<row>.csv``,
    followed by a random sleep after each line.
    """
    # Each line is split into at most three comma-separated fields; only the
    # first field (the cluster ID) is read below. Rows are collected first
    # and the frame built once instead of appending one frame per line.
    with open('../results/5556531000720111691.csv.bkup', 'r') as backup_file:
        rows = [line.split(',', 2) for line in backup_file]
    data = pd.DataFrame(rows)

    #
    # for each cluster id
    #
    for from_cluster_id in range(data.shape[0])[:1]:  # just the first one, for now
        print(from_cluster_id)
        cluster_id = data.iloc[from_cluster_id, 0]
        try:
            cluster_id = int(cluster_id)
        except ValueError:
            # Not a numeric cluster ID (e.g. a header row): skip it.
            continue

        querier = scholar.ScholarQuerier()
        query_cluster = scholar.ClusterScholarQuery(cluster=cluster_id)
        querier.send_query(query_cluster)

        #
        # for each article in search results
        #
        # The article object itself is not read; the crawl below is driven
        # by cluster_id. Only the first result is processed, for now.
        for _article in querier.articles[:1]:
            current_article = GoogleScholarArticleSimple()
            current_article.cluster_id = cluster_id
            current_article.set_search_soup().set_num_search_results(
            ).set_num_search_pages()

            #
            # for each search page result of citing article
            #
            page_numbers = range(current_article.num_search_pages)[:1]
            for search_page_number in page_numbers:  # first page only, for now
                # NOTE(review): the URL and the output file name are built
                # from the row index (from_cluster_id), not from cluster_id;
                # kept as in the original -- confirm that this is intended.
                url = citations_url_generic.format(search_page_number * 10,
                                                   from_cluster_id)
                r = requests.get(url)
                soup = BeautifulSoup(r.text)
                gs_r = soup.find_all("div", class_="gs_r")

                output_file_path = '../results/01-{}.csv'.format(
                    from_cluster_id)

                # Create/truncate the output file before appending results.
                with open(output_file_path, 'w'):
                    pass

                #
                # for each search result
                #
                for citing_article_soup in gs_r:
                    result_article = DanGoogleScholarArticle(
                        soup=citing_article_soup)
                    result_article.parse_title()
                    result_article.parse_cluster_id()

                    str_to_write = '{}\t|\t{}\t|\t{}\n'.format(
                        result_article.cluster_id,
                        cluster_id,
                        citing_article_soup)
                    # Re-open per result so every line is on disk before the
                    # (potentially long) sleep that follows.
                    with open(output_file_path, 'a+') as out_file:
                        out_file.write(str_to_write)

                    sleep_time = random() * randint(10, 100)
                    print('cluster_id: {}, page: {}, sleeping: {}'.format(
                        from_cluster_id, search_page_number, sleep_time))
                    sleep(sleep_time)
Example #4
0
def main():
    """Run the demo.py command-line front end for Google Scholar queries.

    Parses the command line, builds either a cluster query or a search query
    from the options, sends it through a ``ScholarQuerier`` and prints the
    results in the selected output format.

    Returns:
        int: 0 on success (including ``--version``); 1 when no command-line
        arguments were given, when a cluster ID is combined with search
        arguments, or when the citation format is not one of the known codes.
    """
    usage = """demo.py [options] <query string>
A command-line interface to Google Scholar.

Examples:

# Retrieve one article written by Einstein on quantum theory:
demo.py -c 1 --author "albert einstein" --phrase "quantum theory"

# Retrieve a BibTeX entry for that quantum theory paper:
demo.py -c 1 -C 17749203648027613321 --citation bt

# Retrieve five articles written by Einstein after 1970 where the title
# does not contain the words "quantum" and "theory":
demo.py -c 5 -a "albert einstein" -t --none "quantum theory" --after 1970"""

    formatter = optparse.IndentedHelpFormatter(max_help_position=50, width=100)
    parser = optparse.OptionParser(usage=usage, formatter=formatter)

    # --- Options describing the query itself ---
    query_opts = optparse.OptionGroup(
        parser, 'Query arguments',
        'These options define search query arguments and parameters.')
    query_opts.add_option(
        '-a', '--author', metavar='AUTHORS', default=None,
        help='Author name(s)')
    query_opts.add_option(
        '-A', '--all', metavar='WORDS', default=None, dest='allw',
        help='Results must contain all of these words')
    query_opts.add_option(
        '-s', '--some', metavar='WORDS', default=None,
        help='Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases')
    query_opts.add_option(
        '-n', '--none', metavar='WORDS', default=None,
        help='Results must contain none of these words. See -s|--some re. formatting')
    query_opts.add_option(
        '-p', '--phrase', metavar='PHRASE', default=None,
        help='Results must contain exact phrase')
    query_opts.add_option(
        '-t', '--title-only', action='store_true', default=False,
        help='Search title only')
    query_opts.add_option(
        '-P', '--pub', metavar='PUBLICATIONS', default=None,
        help='Results must have appeared in this publication')
    query_opts.add_option(
        '--after', metavar='YEAR', default=None,
        help='Results must have appeared in or after given year')
    query_opts.add_option(
        '--before', metavar='YEAR', default=None,
        help='Results must have appeared in or before given year')
    query_opts.add_option(
        '--no-patents', action='store_true', default=False,
        help='Do not include patents in results')
    query_opts.add_option(
        '--no-citations', action='store_true', default=False,
        help='Do not include citations in results')
    query_opts.add_option(
        '-C', '--cluster-id', metavar='CLUSTER_ID', default=None,
        help='Do not search, just use articles in given cluster ID')
    query_opts.add_option(
        '-c', '--count', type='int', default=None,
        help='Maximum number of results')
    parser.add_option_group(query_opts)

    # --- Options selecting how results are printed ---
    output_opts = optparse.OptionGroup(
        parser, 'Output format',
        'These options control the appearance of the results.')
    output_opts.add_option(
        '--txt', action='store_true',
        help='Print article data in text format (default)')
    output_opts.add_option(
        '--txt-globals', action='store_true',
        help='Like --txt, but first print global results too')
    output_opts.add_option(
        '--csv', action='store_true',
        help='Print article data in CSV form (separator is "|")')
    output_opts.add_option(
        '--csv-header', action='store_true',
        help='Like --csv, but print header with column names')
    output_opts.add_option(
        '--citation', metavar='FORMAT', default=None,
        help='Print article details in standard citation format. Argument Must be one of "bt" (BibTeX), "en" (EndNote), "rm" (RefMan), or "rw" (RefWorks).')
    parser.add_option_group(output_opts)

    # --- Everything else ---
    misc_opts = optparse.OptionGroup(parser, 'Miscellaneous')
    misc_opts.add_option(
        '--cookie-file', metavar='FILE', default=None,
        help='File to use for cookie storage. If given, will read any existing cookies if found at startup, and save resulting cookies in the end.')
    misc_opts.add_option(
        '-d', '--debug', action='count', default=0,
        help='Enable verbose logging to stderr. Repeated options increase detail of debug output.')
    misc_opts.add_option(
        '-v', '--version', action='store_true', default=False,
        help='Show version information')
    parser.add_option_group(misc_opts)

    options = parser.parse_args()[0]

    # Invoked without any argument at all: show the help text and bail out.
    if len(sys.argv) == 1:
        parser.print_help()
        return 1

    if options.debug > 0:
        # Clamp the repeat count of -d to the 'debug' log level.
        options.debug = min(options.debug, sc.ScholarUtils.LOG_LEVELS['debug'])
        sc.ScholarConf.LOG_LEVEL = options.debug
        sc.ScholarUtils.log('info', 'using log level %d' % sc.ScholarConf.LOG_LEVEL)

    if options.version:
        print('This is demo.py %s.' % sc.ScholarConf.VERSION)
        return 0

    if options.cookie_file:
        sc.ScholarConf.COOKIE_JAR_FILE = options.cookie_file

    # A cluster ID query addresses a fixed set of articles, so combining it
    # with search arguments makes no sense.
    search_args = (
        options.author, options.allw, options.some, options.none,
        options.phrase, options.title_only, options.pub,
        options.after, options.before,
    )
    if options.cluster_id is not None and any(search_args):
        print('Cluster ID queries do not allow additional search arguments.')
        return 1

    querier = sc.ScholarQuerier()
    settings = sc.ScholarSettings()

    # Citation code -> name of the matching ScholarSettings constant. The
    # constant is looked up only for the code actually requested.
    citation_constants = {
        'bt': 'CITFORM_BIBTEX',
        'en': 'CITFORM_ENDNOTE',
        'rm': 'CITFORM_REFMAN',
        'rw': 'CITFORM_REFWORKS',
    }
    if options.citation in citation_constants:
        constant_name = citation_constants[options.citation]
        settings.set_citation_format(getattr(sc.ScholarSettings, constant_name))
    elif options.citation is not None:
        print('Invalid citation link format, must be one of "bt", "en", "rm", or "rw".')
        return 1

    querier.apply_settings(settings)

    if options.cluster_id:
        query = sc.ClusterScholarQuery(cluster=options.cluster_id)
    else:
        query = sc.SearchScholarQuery()

        # Options whose value is handed straight to a setter, in order.
        text_setters = (
            (options.author, 'set_author'),
            (options.allw, 'set_words'),
            (options.some, 'set_words_some'),
            (options.none, 'set_words_none'),
            (options.phrase, 'set_phrase'),
        )
        for value, setter_name in text_setters:
            if value:
                getattr(query, setter_name)(value)

        if options.title_only:
            query.set_scope(True)
        if options.pub:
            query.set_pub(options.pub)
        if options.after or options.before:
            query.set_timeframe(options.after, options.before)
        if options.no_patents:
            query.set_include_patents(False)
        if options.no_citations:
            query.set_include_citations(False)

    if options.count is not None:
        # Never ask for more results than a single page may hold.
        options.count = min(options.count, sc.ScholarConf.MAX_PAGE_RESULTS)
        query.set_num_page_results(options.count)

    querier.send_query(query)

    # Output precedence: --csv, then --csv-header, then --citation, else text.
    if options.csv:
        sc.csv(querier)
    elif options.csv_header:
        sc.csv(querier, header=True)
    elif options.citation is not None:
        sc.citation_export(querier)
    else:
        sc.txt(querier, with_globals=options.txt_globals)

    if options.cookie_file:
        querier.save_cookies()

    return 0
    metavar='CLUSTER_ID',
    default=None,
    dest='cluster_id',  ##
    help='Do not search, just use articles in given cluster ID')  ##
################### HAVE NOT TESTED ##############################################################

args = parser.parse_args()
start_idx = 0
num_results = args.num_results
all_articles = []

while num_results - PAGE_RESULT >= 0:
    print('working on results', start_idx, 'through', start_idx + PAGE_RESULT,
          '...')
    if args.cluster_id:
        query = scholar.ClusterScholarQuery(cluster=args.cluster_id)
    else:
        query = scholar.SearchScholarQuery()

    query.set_num_page_results(PAGE_RESULT)
    query.set_phrase(args.phrase)
    query.set_timeframe(args.after, None)
    query.set_include_citations(not args.citations)
    query.set_include_patents(not args.patents)
    query.set_author(args.author)
    query.set_start(start_idx)

    querier = scholar.ScholarQuerier()
    querier.send_query(query)

    #either no query results or the robot checker is blocking the IP ADDR / USER_AGENT