Example #1
def SearchScholar(options):
    """Send a Google Scholar query built from an options dict."""
    querier = scholar.ScholarQuerier()
    settings = scholar.ScholarSettings()
    querier.apply_settings(settings)

    if options['cluster_id']:
        query = scholar.ClusterScholarQuery(cluster=options['cluster_id'])
    else:
        query = scholar.SearchScholarQuery()
        if options['author']:
            query.set_author(options['author'])
        if options['allw']:
            query.set_words(options['allw'])
        if options['some']:
            query.set_words_some(options['some'])
        if options['none']:
            query.set_words_none(options['none'])
        if options['phrase']:
            query.set_phrase(options['phrase'])
        if options['title_only']:
            query.set_scope(True)
        if options['pub']:
            query.set_pub(options['pub'])
        if options['after'] or options['before']:
            query.set_timeframe(options['after'], options['before'])
        if options['no_patents']:
            query.set_include_patents(False)

    querier.send_query(query)
    return scholar.get_results_objects(querier)
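A call sketch, assuming the surrounding project defines scholar.get_results_objects. The option keys mirror the if checks above and must all be present, since the function indexes the dict directly; the values here are arbitrary:

# Hypothetical usage of SearchScholar
options = {
    'cluster_id': None,
    'author': 'albert einstein',
    'allw': None,
    'some': None,
    'none': None,
    'phrase': 'quantum theory',
    'title_only': False,
    'pub': None,
    'after': None,
    'before': None,
    'no_patents': True,
}
results = SearchScholar(options)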
Example #2
def get_scholar_data(paper_list):
    querier = scholar.ScholarQuerier()
    settings = scholar.ScholarSettings()
    settings.set_citation_format(scholar.ScholarSettings.CITFORM_BIBTEX)
    querier.apply_settings(settings)
    scholar.ScholarConf.LOG_LEVEL = 3

    cache = read_cache(cache_file)
    assert cache is not None

    if cache.get('paper_list') == paper_list:
        # Use the cache to reduce the number of Google Scholar requests
        print('Using cache from file %s' % cache_file)
    else:
        # Update the cache instead of flushing a completely new one
        print('Getting data from Google Scholar')
        cache_paper_title = [p['title'] for p in cache['paper_list']]

        missing_paper = [
            p for p in paper_list if p['title'] not in cache_paper_title
        ]
        missing_scholar_data = [
            get_paper_data(querier, v) for v in missing_paper
        ]

        # Update the cache with the newly fetched entries
        cache['paper_list'] += missing_paper
        cache['scholar_data'] += missing_scholar_data

    save_cache(cache_file, cache)  # Always flush the cache to disk
    return cache['scholar_data']
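The snippet depends on read_cache, save_cache, get_paper_data, and a module-level cache_file, none of which are shown. A minimal JSON-backed sketch of the cache helpers, consistent with how they are called above (the names, path, and format are assumptions):

import json
import os

cache_file = 'scholar_cache.json'  # assumed module-level path

def read_cache(path):
    # Return the cached dict, or an empty skeleton on the first run
    if not os.path.exists(path):
        return {'paper_list': [], 'scholar_data': []}
    with open(path) as f:
        return json.load(f)

def save_cache(path, cache):
    with open(path, 'w') as f:
        json.dump(cache, f, indent=2)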
Example #3
def citation_retriever(title):
    """
    Retrieve the number of citations for each paper in our collection.
    """
    acc = 0
    # Set up the scraper
    querier = scholar.ScholarQuerier()
    settings = scholar.ScholarSettings()
    querier.apply_settings(settings)
    query = scholar.SearchScholarQuery()

    # Set query parameters once, outside the loop (note: the title
    # argument is not used here; the query is hardcoded)
    query.set_author("Alan Turing")
    query.set_words("computing")
    query.set_num_page_results(1)

    while True:
        print(acc)

        querier.send_query(query)
        # Print the URL of the first article found
        print(querier.articles[0]['url'])
        # Sleep a random interval to avoid being blocked
        BLOCK_BYPASS = random.randint(15, 60)
        time.sleep(BLOCK_BYPASS)
        acc += 1
Example #4
def main():
    filename = 'all_papers'

    title_author_list = []
    with open(filename + '.csv', 'r') as fin:
        for line in fin.readlines():
            fields = line.split(',')
            title_author_list.append([fields[0], int(fields[1])])
    print('The number of papers ------ : ', len(title_author_list))
    print('One of the paper titles --- : ', title_author_list[99][0])

    my_querier = scholar.ScholarQuerier()
    my_querier.apply_settings(scholar.ScholarSettings())

    title_author_citation_list = []
    for counter, title_author in enumerate(title_author_list):
        print('paper @ {}/{}'.format(counter, len(title_author_list)))

        my_query = scholar.SearchScholarQuery()
        my_query.set_words(title_author[0])
        my_querier.send_query(my_query)
        print(my_querier.articles)

        if len(my_querier.articles) == 0:
            print('NOT found... ', title_author[0])
        else:
            citation = my_querier.articles[0].attrs["num_citations"][0]
            # list.append() returns None, so extend first, then store
            title_author.append(citation)
            title_author_citation_list.append(title_author)
        time.sleep(20 + random.randint(0, 20))
Example #5
def buscadorAvanzado(frase, words, autor, after, before):
    # nombre_directorio= str(id_user)+ "."+ str(id_proyecto)
    querier = scholar.ScholarQuerier()
    settings = scholar.ScholarSettings()
    querier.apply_settings(settings)
    query = scholar.SearchScholarQuery()
    if frase != "":
        query.set_phrase(frase)
    if words != "":
        query.set_words(words)
    if autor != "":
        query.set_author(autor)
    if after != "" or before != "":
        query.set_timeframe(after, before)

    query.set_num_page_results(40)
    querier.send_query(query)
    scholar.getArticles(querier)
    articles = querier.articles

    articulos = getArticlesDict(articles)

    # if articulos is not None:
    #	moveFiles()
    #	indexarArchivos()
    return articulos
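Both buscadorAvanzado here and buscadorSimple in Example #16 call a getArticlesDict helper that is not shown. A plausible sketch, assuming ckreibich/scholar.py's ScholarArticle, whose attrs dict maps each field name to a [value, label, priority] triple:

def getArticlesDict(articles):
    # Hypothetical helper: flatten each ScholarArticle into a plain dict
    # (article[key] returns the value part of attrs[key])
    return [{key: article[key] for key in article.attrs}
            for article in articles]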
Example #6
    def __init__(self):
        self.querier = scholar.ScholarQuerier()
        self.settings = scholar.ScholarSettings()

        self.settings.set_citation_format(
            scholar.ScholarSettings.CITFORM_BIBTEX)
        self.querier.apply_settings(self.settings)
Example #7
    def parse(self, response):
        table = response.xpath('//*[@class="post-content"]//table//tbody')
        # Use a relative XPath; '//tr' would search the whole document
        rows = table.xpath('.//tr')
        total = len(rows)

        for i in range(total):
            if i == 0 or i == total - 1:
                print('skipped: unusable html')
            else:
                print(i)
                name = rows[i].xpath('td//div//text()')[1].extract()
                url = rows[i].xpath('td//div//a/@href').extract()
                tags = rows[i].xpath('td//ul//li//div//text()').extract()
                print(name)
                print(url)
                print(tags)
                querier = scholar.ScholarQuerier()
                settings = scholar.ScholarSettings()

                querier.apply_settings(settings)
                query = scholar.SearchScholarQuery()
                query.set_words(name)

                querier.send_query(query)
                scholar.txt(querier, with_globals=True)
Example #8
def queryGoogleScholar(andkeywords, orkeywords, pnt, filename, header):
    query = scholar.SearchScholarQuery()
    query.set_words(' '.join(andkeywords))
    query.set_words_some(' '.join(orkeywords))
    query.set_num_page_results(10)

    querier = scholar.ScholarQuerier()
    settings = scholar.ScholarSettings()
    querier.apply_settings(settings)  # apply the settings before querying
    querier.send_query(query)
    scholar.csv(querier, header=header, sep='|', filename=filename, geo=pnt)
Example #9
def google_scholar_query(querystr):
    querier = scholar.ScholarQuerier()
    query = scholar.SearchScholarQuery()
    query.set_phrase(querystr)
    querier.send_query(query)

    ret = "\n=========\n"
    for article in querier.articles:
        ret += article.as_txt() + "\n=========\n"

    return ret
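A usage sketch; the phrase is arbitrary, and the result is one text block per article:

print(google_scholar_query("a neural probabilistic language model"))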
Example #10
def scrape_citation_count(p):
    scholar.ScholarConf.COOKIE_JAR_FILE = COOKIE
    query = scholar.SearchScholarQuery()
    query.set_words(p.title)
    querier = scholar.ScholarQuerier()
    querier.send_query(query)
    try:
        print(querier.articles[0].attrs['num_citations'][0])
        return querier.articles[0].attrs['num_citations'][0]
    except Exception:
        # Practically only fails on captchas or connection timeouts
        print("Google Scholar captcha :(")
        return -1
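Because a captcha or timeout returns -1, a caller may want to retry with a growing pause before giving up; a minimal sketch (the attempt count and delays are arbitrary):

import time

def scrape_with_retry(p, attempts=3, base_delay=60):
    # Retry with a linearly growing back-off after each failure
    for i in range(attempts):
        count = scrape_citation_count(p)
        if count != -1:
            return count
        time.sleep(base_delay * (i + 1))
    return -1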
Example #11
def query():
    doi = request.query['doi']
    response.content_type = 'text/plain'

    querier = scholar.ScholarQuerier()
    settings = scholar.ScholarSettings()
    querier.apply_settings(settings)

    query = scholar.SearchScholarQuery()
    query.set_num_page_results(1)
    query.set_phrase(doi)

    querier.send_query(query)
    scholar.txt(querier, with_globals=False)
    citation = querier.articles[0]['num_citations']
    return template('{{citation}}', citation=citation)
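The handler relies on Bottle's request, response, and template, but the route registration is not shown; a plausible wiring sketch (the path and port are assumptions):

from bottle import request, response, route, run, template

# Hypothetical registration: expose the handler at /query?doi=...
route('/query')(query)
run(host='localhost', port=8080)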
Example #12
def getPaper(papertitle, querier=scholar.ScholarQuerier()):
    # Note: the default querier is created once at import time and is
    # shared across calls, reusing its cookie/session state.
    papers = scholar.papers_by_title(papertitle, querier)

    if len(papers) > 0:
        print("[DATA COLLECTOR INFO]")
        print("  Found paper:")
        print("    ", papers[0]["title"], "(", papers[0]["papernumber"], ")")
        print("    ", "with", papers[0]["num_citations"], "citations")
        print()

        return papers[0]
    else:
        print("[DATA COLLECTOR INFO]")
        print("  Didn't find any papers.")
        print()

        return None
Example #13
def search_author(get_links):
    # from https://github.com/ckreibich/scholar.py/issues/80
    se_, index, _, category, buff = get_links  # the third element is unused
    querier = scholar.ScholarQuerier()
    settings = scholar.ScholarSettings()
    querier.apply_settings(settings)
    query = scholar.SearchScholarQuery()

    query.set_words(category)
    querier.send_query(query)
    links = [
        a.attrs['url'][0] for a in querier.articles
        if a.attrs['url'][0] is not None
    ]
    #links = query.get_url()
    #print(links)
    #if len(links) > NUM_LINKS: links = links[0:NUM_LINKS]

    for index, l in enumerate(links):
        process((se_, index, l, category, buff))
Example #14
def getAllCitingPapersIncremental(papertitle,
                                  querier=scholar.ScholarQuerier()):
    paper = getPaper(papertitle, querier)
    numPapersProcessedCumulative = 1
    numDuplicatesRemoved = 0
    paper["depth"] = 0
    paper["numPapersProcessedCumulative"] = numPapersProcessedCumulative
    paper["numDuplicatesRemoved"] = numDuplicatesRemoved
    allPapers = dict()
    toCheckPapers = [paper]

    # Breadth-first traversal of the citation graph, yielding each new
    # paper as soon as it is discovered
    while toCheckPapers:
        paper = toCheckPapers.pop(0)

        if paper["title"] in allPapers:
            numDuplicatesRemoved += 1
        else:
            print("[DATA COLLECTOR INFO] Found paper: " + paper["title"])

            paper["numPapersProcessedCumulative"] = numPapersProcessedCumulative
            paper["numDuplicatesRemoved"] = numDuplicatesRemoved
            allPapers[paper["title"]] = paper

            yield paper

            if paper["papernumber"]:
                newCitations = scholar.citations_by_papernr(
                    paper["papernumber"], querier)
                numPapersProcessedCumulative += len(newCitations)

                for art in newCitations:
                    art["depth"] = paper["depth"] + 1
                    toCheckPapers.append(art)
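Since getAllCitingPapersIncremental is a generator, callers can consume and persist papers as they arrive instead of waiting for the whole crawl; a sketch with an arbitrary title and depth cutoff:

for paper in getAllCitingPapersIncremental("The psychology of attitudes"):
    print(paper["depth"], paper["title"])
    if paper["depth"] >= 2:
        break  # stop iterating once the crawl reaches depth 2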
Example #15
    def operation(keyword_x, index):
        print('index: ' + str(index))
        print('keyword: ' + str(keyword_x))
        index = int(index)

        querier = scholar.ScholarQuerier()
        settings = scholar.ScholarSettings()
        querier.apply_settings(settings)

        query = scholar.SearchScholarQuery()
        query.set_author("")
        query.set_words(str(keyword_x))
        query.set_num_page_results(10)

        querier.send_query(query)

        # Fall back to placeholders when the article or an attribute is
        # missing (ScholarArticle returns None for absent attributes)
        try:
            article = querier.articles[index]
        except IndexError:
            article = None

        url = article['url'] if article and article['url'] else "No URL"
        title = article['title'] if article and article['title'] else "No Title"
        year = str(article['year']) if article and article['year'] else "No Date"

        # Will fix this
        author = ""
        publication = ""

        line = "'" + title + "'. " + year + ", " + url + "."

        print(line)

        citations.append(line)
Example #16
def buscadorSimple(frase):
    # nombre_directorio=str(id_user)+ "."+ str(id_proyecto)
    querier = scholar.ScholarQuerier()
    settings = scholar.ScholarSettings()
    querier.apply_settings(settings)

    query = scholar.SearchScholarQuery()
    query.set_phrase(frase)
    query.set_num_page_results(40)

    querier.send_query(query)
    scholar.getArticles(querier)

    articles = querier.articles
    articulos = getArticlesDict(articles)
    # Move articles to the tmp folder
    #if articulos is not None:
    #	moveFiles()
    #	indexarArchivos()
    return articulos
Example #17
        m = re.search(r'^.*&.*&(.*)&(.*)& \\cite\{(.*)\}.*\\\\', line)
        if m and m.group(1).strip() != 'License':
            cites.add(m.group(3))

seen = set()
dois = []
with open('bibliography/biblio.bib') as bibtex_file:
    bib_database = bibtexparser.load(bibtex_file)
    for e in bib_database.entries:
        if 'ID' in e and e['ID'] in cites and 'doi' in e:
            seen.add(e['ID'])
            dois.append(e['doi'])

notseen = cites - seen

querier = scholar.ScholarQuerier()

citecnts = []

for doi in dois:
    query = scholar.SearchScholarQuery()
    query.set_words(doi)
    querier.send_query(query)

    if len(querier.articles) > 0:
        art = querier.articles[0]
        txt = art.as_txt()
        if doi in txt:
            #print art['num_citations'],doi
            citecnts.append((doi, art['num_citations']))
        else:
Example #18
def main():
    data = pd.DataFrame()

    f = open('../results/5556531000720111691.csv.bkup', 'r')
    for idx, line in enumerate(f):
        data_values = line.split(',', 2)
        to_append = pd.DataFrame([data_values])
        data = data.append(to_append)
    f.close()

    #
    # for each cluster id
    #
    for from_cluster_id in range(data.shape[0])[:1]:  # just the first one, for now
        print(from_cluster_id)
        cluster_id = data.iloc[from_cluster_id, 0]
        try:
            cluster_id = int(cluster_id)
        except ValueError:
            continue

        querier = scholar.ScholarQuerier()
        settings = scholar.ScholarSettings()
        query_cluster = scholar.ClusterScholarQuery(cluster=cluster_id)
        querier.send_query(query_cluster)

        #
        # for each article in search results
        #
        for article in querier.articles[:1]:  # first article result, for now
            article.attrs.get('url_citations')[0]
            current_article = GoogleScholarArticleSimple()
            current_article.cluster_id = cluster_id
            current_article.set_search_soup().set_num_search_results(
            ).set_num_search_pages()

            # gs_r = current_article.soup.find_all("div", class_="gs_r")

            #
            # for each search page result of citing article
            #
            for page_idx, search_page_number in enumerate(
                    range(current_article.num_search_pages)[:1]):  # first page, for now
                url = citations_url_generic.format(search_page_number * 10,
                                                   from_cluster_id)
                r = requests.get(url)
                soup = BeautifulSoup(r.text)
                gs_r = soup.find_all("div", class_="gs_r")
                # print(len(gs_r))

                output_file_path = '../results/01-{}.csv'.format(
                    from_cluster_id)

                # Truncate the output file before appending results below
                f = open(output_file_path, 'w')
                f.close()

                #
                # for each search result
                #
                for citing_article_soup in gs_r:
                    result_article = DanGoogleScholarArticle(
                        soup=citing_article_soup)
                    result_article.parse_title()
                    # print(result_article.title)
                    result_article.parse_cluster_id()
                    # seed_cluster_id = result_article.cluster_id
                    # print(seed_cluster_id)
                    f = open(output_file_path, 'a+')
                    str_to_write = '{}\t|\t{}\t|\t{}\n'.\
                                   format(result_article.cluster_id,
                                          cluster_id,
                                          citing_article_soup)
                    f.write(str_to_write)
                    f.close()
                    sleep_time = random() * randint(10, 100)
                    print('cluster_id: {}, page: {}, sleeping: {}'.format(
                        from_cluster_id, search_page_number, sleep_time))
                    sleep(sleep_time)
Example #19
def main():
    usage = """demo.py [options] <query string>
A command-line interface to Google Scholar.

Examples:

# Retrieve one article written by Einstein on quantum theory:
demo.py -c 1 --author "albert einstein" --phrase "quantum theory"

# Retrieve a BibTeX entry for that quantum theory paper:
demo.py -c 1 -C 17749203648027613321 --citation bt

# Retrieve five articles written by Einstein after 1970 where the title
# does not contain the words "quantum" and "theory":
demo.py -c 5 -a "albert einstein" -t --none "quantum theory" --after 1970"""

    fmt = optparse.IndentedHelpFormatter(max_help_position=50, width=100)
    parser = optparse.OptionParser(usage=usage, formatter=fmt)
    group = optparse.OptionGroup(parser, 'Query arguments',
                                 'These options define search query arguments and parameters.')
    group.add_option('-a', '--author', metavar='AUTHORS', default=None,
                     help='Author name(s)')
    group.add_option('-A', '--all', metavar='WORDS', default=None, dest='allw',
                     help='Results must contain all of these words')
    group.add_option('-s', '--some', metavar='WORDS', default=None,
                     help='Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases')
    group.add_option('-n', '--none', metavar='WORDS', default=None,
                     help='Results must contain none of these words. See -s|--some re. formatting')
    group.add_option('-p', '--phrase', metavar='PHRASE', default=None,
                     help='Results must contain exact phrase')
    group.add_option('-t', '--title-only', action='store_true', default=False,
                     help='Search title only')
    group.add_option('-P', '--pub', metavar='PUBLICATIONS', default=None,
                     help='Results must have appeared in this publication')
    group.add_option('--after', metavar='YEAR', default=None,
                     help='Results must have appeared in or after given year')
    group.add_option('--before', metavar='YEAR', default=None,
                     help='Results must have appeared in or before given year')
    group.add_option('--no-patents', action='store_true', default=False,
                     help='Do not include patents in results')
    group.add_option('--no-citations', action='store_true', default=False,
                     help='Do not include citations in results')
    group.add_option('-C', '--cluster-id', metavar='CLUSTER_ID', default=None,
                     help='Do not search, just use articles in given cluster ID')
    group.add_option('-c', '--count', type='int', default=None,
                     help='Maximum number of results')
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, 'Output format',
                                 'These options control the appearance of the results.')
    group.add_option('--txt', action='store_true',
                     help='Print article data in text format (default)')
    group.add_option('--txt-globals', action='store_true',
                     help='Like --txt, but first print global results too')
    group.add_option('--csv', action='store_true',
                     help='Print article data in CSV form (separator is "|")')
    group.add_option('--csv-header', action='store_true',
                     help='Like --csv, but print header with column names')
    group.add_option('--citation', metavar='FORMAT', default=None,
                     help='Print article details in standard citation format. Argument must be one of "bt" (BibTeX), "en" (EndNote), "rm" (RefMan), or "rw" (RefWorks).')
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, 'Miscellaneous')
    group.add_option('--cookie-file', metavar='FILE', default=None,
                     help='File to use for cookie storage. If given, will read any existing cookies if found at startup, and save resulting cookies at the end.')
    group.add_option('-d', '--debug', action='count', default=0,
                     help='Enable verbose logging to stderr. Repeated options increase detail of debug output.')
    group.add_option('-v', '--version', action='store_true', default=False,
                     help='Show version information')
    parser.add_option_group(group)

    options, _ = parser.parse_args()

    # Show help if we have neither keyword search nor author name
    if len(sys.argv) == 1:
        parser.print_help()
        return 1

    if options.debug > 0:
        options.debug = min(options.debug, sc.ScholarUtils.LOG_LEVELS['debug'])
        sc.ScholarConf.LOG_LEVEL = options.debug
        sc.ScholarUtils.log('info', 'using log level %d' % sc.ScholarConf.LOG_LEVEL)

    if options.version:
        print('This is demo.py %s.' % sc.ScholarConf.VERSION)
        return 0

    if options.cookie_file:
        sc.ScholarConf.COOKIE_JAR_FILE = options.cookie_file

    # Sanity-check the options: if they include a cluster ID query, it
    # makes no sense to have search arguments:
    if options.cluster_id is not None:
        if options.author or options.allw or options.some or options.none \
           or options.phrase or options.title_only or options.pub \
           or options.after or options.before:
            print('Cluster ID queries do not allow additional search arguments.')
            return 1

    querier = sc.ScholarQuerier()
    settings = sc.ScholarSettings()

    if options.citation == 'bt':
        settings.set_citation_format(sc.ScholarSettings.CITFORM_BIBTEX)
    elif options.citation == 'en':
        settings.set_citation_format(sc.ScholarSettings.CITFORM_ENDNOTE)
    elif options.citation == 'rm':
        settings.set_citation_format(sc.ScholarSettings.CITFORM_REFMAN)
    elif options.citation == 'rw':
        settings.set_citation_format(sc.ScholarSettings.CITFORM_REFWORKS)
    elif options.citation is not None:
        print('Invalid citation link format, must be one of "bt", "en", "rm", or "rw".')
        return 1

    querier.apply_settings(settings)

    if options.cluster_id:
        query = sc.ClusterScholarQuery(cluster=options.cluster_id)
    else:
        query = sc.SearchScholarQuery()
        if options.author:
            query.set_author(options.author)
        if options.allw:
            query.set_words(options.allw)
        if options.some:
            query.set_words_some(options.some)
        if options.none:
            query.set_words_none(options.none)
        if options.phrase:
            query.set_phrase(options.phrase)
        if options.title_only:
            query.set_scope(True)
        if options.pub:
            query.set_pub(options.pub)
        if options.after or options.before:
            query.set_timeframe(options.after, options.before)
        if options.no_patents:
            query.set_include_patents(False)
        if options.no_citations:
            query.set_include_citations(False)

    if options.count is not None:
        options.count = min(options.count, sc.ScholarConf.MAX_PAGE_RESULTS)
        query.set_num_page_results(options.count)

    querier.send_query(query)

    if options.csv:
        sc.csv(querier)
    elif options.csv_header:
        sc.csv(querier, header=True)
    elif options.citation is not None:
        sc.citation_export(querier)
    else:
        sc.txt(querier, with_globals=options.txt_globals)

    if options.cookie_file:
        querier.save_cookies()

    return 0
Example #20
from __future__ import print_function
import scholar as gs
import time
import pandas as pd
from random import betavariate


querier = gs.ScholarQuerier()
q = gs.SearchScholarQuery()

df = pd.read_csv('JPE.csv')
titles = df.Title

output = df.iloc[0:20].copy()  # copy the slice so new columns can be assigned
titles = titles[0:20]
# for _title in titles:


def query(_title):
    time.sleep(betavariate(2, 2)/2)  # to prevent overrequesting Google's server
    q.set_phrase(_title)
    querier.send_query(q)
    q_title = querier.articles[0].attrs['title'][0]
    q_num_cit = querier.articles[0].attrs['num_citations'][0]
    print((q_title, q_num_cit))
    return (q_title, q_num_cit)

(output['q_title'], output['q_num_cit']) = zip(*titles.map(query))
output.to_csv('JPE2.csv')
Example #21
def main():

    # variables
    regexp = list()
    standard_handler = 'biotechnology'

    # Read the regexp from file
    file = open('regexp.in', 'r')

    for line in file.readlines():
        regexp.append(line)
    file.close()

    # Scholar Parser Variables
    querier = sc.ScholarQuerier()
    settings = sc.ScholarSettings()

    # Variables for the metrics
    results_opt1 = list()
    results_opt2 = list()
    results_opt3 = list()

    # loop for each regexp in file
    for index, item in enumerate(regexp):
        query1 = sc.SearchScholarQuery()
        query2 = sc.SearchScholarQuery()
        query3 = sc.SearchScholarQuery()

        # Fixed at 1000 because of the specificity of the query
        query1.set_num_page_results(1000)
        query2.set_num_page_results(1000)
        query3.set_num_page_results(1000)

        # remove new line
        item = item.rstrip()

        # I am using three types of searches because it is not clear how the
        # search handles more than one mandatory expression.

        # Search 1:
        # words = ['ascidia curvata', 'biotechnology']
        # phrase = []

        query1.set_words(list([item, standard_handler]))
        querier.send_query(query1)
        result = statistics(querier, item, opt=1)

        results_opt1.append(result)

        # Search 2:
        # words = ['biotechnology']
        # phrase = ['ascidia curvata']

        query2.set_words(list([standard_handler]))
        query2.set_phrase(item)
        querier.send_query(query2)
        result = statistics(querier, item, opt=2)

        results_opt2.append(result)

        # Search 3:
        # words = ['ascidia', 'curvata', 'biotechnology']
        # phrase = []

        split_regexp = item.split()
        split_regexp.append(standard_handler)

        query3.set_words(split_regexp)
        querier.send_query(query3)
        result = statistics(querier, item, opt=3)

        results_opt3.append(result)

        # You may want to adjust the binning of the data.
        # If you want to see citations, check results_opt#[index][2]

        plt.hist(results_opt1[index][1],
                 alpha=0.5,
                 label='Option 1 (Number of results: %s)' % results_opt1[index][0])
        plt.hist(results_opt2[index][1],
                 alpha=0.5,
                 label='Option 2 (Number of results: %s)' % results_opt2[index][0])
        plt.hist(results_opt3[index][1],
                 alpha=0.5,
                 label='Option 3 (Number of results: %s)' % results_opt3[index][0])
        plt.legend()
        plt.ylabel('Number of Articles')
        plt.xlabel('Year')
        plt.title(item.upper())
        plt.tight_layout()
        plt.savefig(item.replace(' ', '-') + 'histogram.png')
        plt.clf()

        del query1
        del query2
        del query3

    # each array is N dimensional, for N regexp specified in the input file
    return results_opt1, results_opt2, results_opt3
Example #22
def main():
    # scholar_article = scholar.ScholarArticle()
    # scholar_article_parser = scholar.ScholarArticleParser()

    querier = scholar.ScholarQuerier()
    # settings = scholar.ScholarSettings()
    query = scholar.SearchScholarQuery()

    query.set_author('eagly')

    query.set_words('psychology of attitudes')

    querier.send_query(query)

    querier.articles[0].as_txt()

    querier.articles[0].attrs

    SEED_ARTICLE = querier.articles[0]
    assert (
        SEED_ARTICLE.attrs.get('title')[0] == 'The psychology of attitudes.')

    type(SEED_ARTICLE)

    SEED_ARTICLE.attrs['cluster_id'][0]

    citations_url = SEED_ARTICLE.attrs.get('url_citations')[0]

    citations_url

    citations_url_generic = 'https://scholar.google.com/scholar?start={}&hl=en&as_sdt=2005&sciodt=0,5&cites=5556531000720111691&scipsc='
    citations_url_generic

    citations_url_generic.format('0')

    r = requests.get(citations_url_generic.format('0'))

    soup = BeautifulSoup(r.text)

    citation_results = CitationResults(soup=soup)
    citation_results.set_num_search_results().set_num_search_pages()
    citation_results.num_results

    num_search_pages = citation_results.num_search_pages
    num_search_pages

    gs_r = soup.find_all("div", class_="gs_r")

    len(gs_r)

    citing_article_soup = gs_r[2]

    result_article = DanGoogleScholarArticle(soup=citing_article_soup)

    result_article.parse_title()
    result_article.title

    result_article.parse_cluster_id()

    SEED_CLUSTER_ID = result_article.cluster_id
    SEED_CLUSTER_ID

    output_file_path = '../results/{}.csv'.\
                       format(SEED_ARTICLE.attrs['cluster_id'][0])

    f = open(output_file_path, 'w')
    f.close()

    for page_number in range(num_search_pages):
        r = requests.get(citations_url_generic.format(page_number * 10))
        soup = BeautifulSoup(r.text)
        citations_url_generic.format('0')
        gs_r = soup.find_all("div", class_="gs_r")
        # print(len(gs_r))
        for citing_article_soup in gs_r:
            result_article = DanGoogleScholarArticle(soup=citing_article_soup)
            result_article.parse_title()
            # print(result_article.title)
            result_article.parse_cluster_id()
            # seed_cluster_id = result_article.cluster_id
            # print(seed_cluster_id)
            f = open(output_file_path, 'a+')
            str_to_write = '{}\t|\t{}\t|\t{}\n'.\
                           format(result_article.cluster_id,
                                  SEED_CLUSTER_ID,
                                  citing_article_soup)
            f.write(str_to_write)
            f.close()
            sleep_time = random() * randint(10, 100)
            print('page: {}, sleeping: {}'.format(page_number, sleep_time))
            sleep(sleep_time)
Example #23
    def getAuthorsORG(self):
        def to_ascii(text):
            # Normalize unicode text down to plain ASCII
            return unicodedata.normalize('NFKD', text).encode(
                'ascii', 'ignore').decode('ascii')

        query = scholar.SearchScholarQuery()
        query.set_author(self.author)
        query.set_phrase(self.keywords)
        query.set_timeframe(self.ystart, self.yend)
        query.set_num_page_results(self.count)
        query.set_include_patents(False)
        query.set_include_citations(False)
        query.set_scope(True)
        querier = scholar.ScholarQuerier()
        settings = scholar.ScholarSettings()
        settings.set_citation_format(scholar.ScholarSettings.CITFORM_BIBTEX)
        querier.apply_settings(settings)
        querier.send_query(query)
        print("Query Sent")
        for i in range(min(len(querier.articles), self.count)):
            tempList = []
            print(str(i) + " processed")
            pubName = to_ascii(querier.articles[i].attrs['title'][0])
            pubSearchUrl = ("https://www.researchgate.net/publicliterature."
                            "PublicLiterature.search.html?type=keyword"
                            "&search-keyword=" + pubName.replace(" ", "+") +
                            "&search-abstract=&search=Search")
            # assumes `import urllib.request` at module level
            searchPage = urllib.request.urlopen(pubSearchUrl)
            soupPub = BeautifulSoup(searchPage)
            pubUrl = ("https://www.researchgate.net/" +
                      soupPub.select(".ga-publication-item")[0]['href'])
            pubPage = urllib.request.urlopen(pubUrl)
            soupPub = BeautifulSoup(pubPage)
            authorInArticle = False
            for coauthor in soupPub.select(".ga-top-coauthor-name"):
                authorUrl = ("https://www.researchgate.net/" +
                             coauthor.a['href'])
                pageAuthor = urllib.request.urlopen(authorUrl)
                soupAuthor = BeautifulSoup(pageAuthor)
                coauthorName = to_ascii(coauthor.text.strip())
                tempList.append([
                    coauthorName,
                    to_ascii(soupAuthor.select(
                        ".header-institution-name")[0].text.strip())
                ])
                if stringMatching(self.author, coauthorName):
                    authorInArticle = True
                else:
                    print(coauthorName)
            if authorInArticle:
                self.authorList.append(tempList)
                print("In" + str(len(tempList)))
        return self.authorList