Code example #1
# Imports needed by this snippet; `decode` is a project-specific helper
# (not part of scholar.py) that loads the parsed citation strings for a source.
import codecs
import random
import time

import scholar

# sample = decode(parse_file)
Todo_list = [3]

# print(len(citation))
# dict_data = writeDictToCSV(current_path, csv_columns, citation)
# print(dict_data)

# Ask Google Scholar for citations in BibTeX format.
querier = scholar.ScholarQuerier()
settings = scholar.ScholarSettings()
query = scholar.SearchScholarQuery()
settings.set_citation_format(scholar.ScholarSettings.CITFORM_BIBTEX)
querier.apply_settings(settings)

for citation in Todo_list:
    cur_citation = decode(str(citation) + '.txt')
    citation_file_name = str(citation) + '.bib'
    # os.chdir("C:\\Users\\JC\\Desktop\\CEGA-txt\\Individual Source Text Filesa\\allparsed")
    citationFile = codecs.open(citation_file_name, 'w', 'utf-8')
    for c in cur_citation:
        # Pause briefly between requests to avoid hammering Google Scholar.
        time.sleep(random.random())
        # print(c)
        query.set_words(c)
        querier.send_query(query)
        # Export the citation once and reuse the result for stdout and the file.
        exported = scholar.citation_export(querier)
        print(exported)
        citationFile.write(exported)
        citationFile.write('\n')
        # writer.writerow(scholar.csv(querier, header=False, sep=','))
    citationFile.close()
    # os.chdir("C:\\Users\\JC\\Desktop\\CEGA-txt\\Individual Source Text Filesa")

print('Finished writing to BibTeX file!')
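The snippet above depends on a project-specific `decode` helper that is not shown. A minimal sketch of what it might look like, assuming each `<number>.txt` file stores one parsed citation string per line in UTF-8:

import codecs

def decode(file_name):
    # Hypothetical helper (assumption): return one citation string per non-empty line.
    with codecs.open(file_name, 'r', 'utf-8') as f:
        return [line.strip() for line in f if line.strip()]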

Code example #2
File: test.py  Project: gfhuertac/scholar.py
# Imports assumed from the surrounding test.py module; `sc` refers to the scholar module.
import optparse
import sys

import scholar as sc


def main():
    usage = """demo.py [options] <query string>
A command-line interface to Google Scholar.

Examples:

# Retrieve one article written by Einstein on quantum theory:
demo.py -c 1 --author "albert einstein" --phrase "quantum theory"

# Retrieve a BibTeX entry for that quantum theory paper:
demo.py -c 1 -C 17749203648027613321 --citation bt

# Retrieve five articles written by Einstein after 1970 where the title
# does not contain the words "quantum" and "theory":
demo.py -c 5 -a "albert einstein" -t --none "quantum theory" --after 1970"""

    fmt = optparse.IndentedHelpFormatter(max_help_position=50, width=100)
    parser = optparse.OptionParser(usage=usage, formatter=fmt)
    group = optparse.OptionGroup(parser, 'Query arguments',
                                 'These options define search query arguments and parameters.')
    group.add_option('-a', '--author', metavar='AUTHORS', default=None,
                     help='Author name(s)')
    group.add_option('-A', '--all', metavar='WORDS', default=None, dest='allw',
                     help='Results must contain all of these words')
    group.add_option('-s', '--some', metavar='WORDS', default=None,
                     help='Results must contain at least one of these words. Pass arguments in form -s "foo bar baz" for simple words, and -s "a phrase, another phrase" for phrases')
    group.add_option('-n', '--none', metavar='WORDS', default=None,
                     help='Results must contain none of these words. See -s|--some re. formatting')
    group.add_option('-p', '--phrase', metavar='PHRASE', default=None,
                     help='Results must contain exact phrase')
    group.add_option('-t', '--title-only', action='store_true', default=False,
                     help='Search title only')
    group.add_option('-P', '--pub', metavar='PUBLICATIONS', default=None,
                     help='Results must have appeared in this publication')
    group.add_option('--after', metavar='YEAR', default=None,
                     help='Results must have appeared in or after given year')
    group.add_option('--before', metavar='YEAR', default=None,
                     help='Results must have appeared in or before given year')
    group.add_option('--no-patents', action='store_true', default=False,
                     help='Do not include patents in results')
    group.add_option('--no-citations', action='store_true', default=False,
                     help='Do not include citations in results')
    group.add_option('-C', '--cluster-id', metavar='CLUSTER_ID', default=None,
                     help='Do not search, just use articles in given cluster ID')
    group.add_option('-c', '--count', type='int', default=None,
                     help='Maximum number of results')
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, 'Output format',
                                 'These options control the appearance of the results.')
    group.add_option('--txt', action='store_true',
                     help='Print article data in text format (default)')
    group.add_option('--txt-globals', action='store_true',
                     help='Like --txt, but first print global results too')
    group.add_option('--csv', action='store_true',
                     help='Print article data in CSV form (separator is "|")')
    group.add_option('--csv-header', action='store_true',
                     help='Like --csv, but print header with column names')
    group.add_option('--citation', metavar='FORMAT', default=None,
                     help='Print article details in standard citation format. Argument must be one of "bt" (BibTeX), "en" (EndNote), "rm" (RefMan), or "rw" (RefWorks).')
    parser.add_option_group(group)

    group = optparse.OptionGroup(parser, 'Miscellaneous')
    group.add_option('--cookie-file', metavar='FILE', default=None,
                     help='File to use for cookie storage. If given, any existing cookies are read at startup and the resulting cookies are saved at the end.')
    group.add_option('-d', '--debug', action='count', default=0,
                     help='Enable verbose logging to stderr. Repeated options increase detail of debug output.')
    group.add_option('-v', '--version', action='store_true', default=False,
                     help='Show version information')
    parser.add_option_group(group)

    options, _ = parser.parse_args()

    # Show help if no command-line arguments were given
    if len(sys.argv) == 1:
        parser.print_help()
        return 1

    if options.debug > 0:
        options.debug = min(options.debug, sc.ScholarUtils.LOG_LEVELS['debug'])
        sc.ScholarConf.LOG_LEVEL = options.debug
        sc.ScholarUtils.log('info', 'using log level %d' % sc.ScholarConf.LOG_LEVEL)

    if options.version:
        print('This is demo.py %s.' % sc.ScholarConf.VERSION)
        return 0

    if options.cookie_file:
        sc.ScholarConf.COOKIE_JAR_FILE = options.cookie_file

    # Sanity-check the options: if they include a cluster ID query, it
    # makes no sense to have search arguments:
    if options.cluster_id is not None:
        if options.author or options.allw or options.some or options.none \
           or options.phrase or options.title_only or options.pub \
           or options.after or options.before:
            print('Cluster ID queries do not allow additional search arguments.')
            return 1

    querier = sc.ScholarQuerier()
    settings = sc.ScholarSettings()

    if options.citation == 'bt':
        settings.set_citation_format(sc.ScholarSettings.CITFORM_BIBTEX)
    elif options.citation == 'en':
        settings.set_citation_format(sc.ScholarSettings.CITFORM_ENDNOTE)
    elif options.citation == 'rm':
        settings.set_citation_format(sc.ScholarSettings.CITFORM_REFMAN)
    elif options.citation == 'rw':
        settings.set_citation_format(sc.ScholarSettings.CITFORM_REFWORKS)
    elif options.citation is not None:
        print('Invalid citation link format, must be one of "bt", "en", "rm", or "rw".')
        return 1

    querier.apply_settings(settings)

    if options.cluster_id:
        query = sc.ClusterScholarQuery(cluster=options.cluster_id)
    else:
        query = sc.SearchScholarQuery()
        if options.author:
            query.set_author(options.author)
        if options.allw:
            query.set_words(options.allw)
        if options.some:
            query.set_words_some(options.some)
        if options.none:
            query.set_words_none(options.none)
        if options.phrase:
            query.set_phrase(options.phrase)
        if options.title_only:
            query.set_scope(True)
        if options.pub:
            query.set_pub(options.pub)
        if options.after or options.before:
            query.set_timeframe(options.after, options.before)
        if options.no_patents:
            query.set_include_patents(False)
        if options.no_citations:
            query.set_include_citations(False)

    if options.count is not None:
        options.count = min(options.count, sc.ScholarConf.MAX_PAGE_RESULTS)
        query.set_num_page_results(options.count)

    querier.send_query(query)

    if options.csv:
        sc.csv(querier)
    elif options.csv_header:
        sc.csv(querier, header=True)
    elif options.citation is not None:
        sc.citation_export(querier)
    else:
        sc.txt(querier, with_globals=options.txt_globals)

    if options.cookie_file:
        querier.save_cookies()

    return 0
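
Only main() is shown in this excerpt; in the project's test.py the script would conventionally be run through a standard entry-point guard along these lines:

if __name__ == '__main__':
    sys.exit(main())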
Code example #3
# Imports assumed from the surrounding project module; `timestamp` and `fuzzymatch`
# are project-specific helpers and are not part of scholar.py.
import csv

import scholar


def add_title_using_gscholar(
    input_data_file_name,
    output_data_file_name,
    input_field_delimiter=',',
    output_field_delimiter=',',
    ):

    found_match_count = 0
    nofound_match_count = 0
    
    # create csv reader for input records
    timestamp.timestamp("Reading input records from '{0}'.".format(input_data_file_name))
    fr = open(input_data_file_name,'rb')
    input_data = csv.DictReader(fr, delimiter = input_field_delimiter)
    
    # fieldnames/keys of original input data (dictionary)
    original_data_fieldnames = input_data.fieldnames
    
    # find corresponding column position for specified header
    author_pos = fuzzymatch(original_data_fieldnames,'author')
    year_pos = fuzzymatch(original_data_fieldnames,'year')
    title_pos = fuzzymatch(original_data_fieldnames,'title')
    journal_pos = fuzzymatch(original_data_fieldnames,'journal')
    publisher_pos = fuzzymatch(original_data_fieldnames,'publisher')
    volume_pos = fuzzymatch(original_data_fieldnames,'volume')
    issue_pos = fuzzymatch(original_data_fieldnames,'issue')
    page_pos = fuzzymatch(original_data_fieldnames,'page')
    
    #count total data records
    record_num = 0
    
    for original_record in input_data:
        record_num += 1
        print
        timestamp.timestamp("Reading input record '{0:05d}'.".format(record_num))
        # extract the values of the fields to be validated
        original_record_title = original_record[title_pos.values()[0]]
        original_record_author = original_record[author_pos.values()[0]]
        original_record_year = original_record[year_pos.values()[0]]
        original_record_journal = original_record[journal_pos.values()[0]]
    
        if original_record_author == "[no agent data]":
            original_record_author = None
        if original_record_year == "0" or original_record_year == "":
            original_record_year = None
        if original_record_title == "no article title available":
            original_record_title = None
        if original_record_journal == "":
            original_record_journal = None
        
        output_record = original_record
        gscholar_match_result = None
        
        # Try a match search on the combination of author, publication year, title (if present), and journal name.
        timestamp.timestamp("Trying Google Scholar match search for original record: '{0}'.".format(original_record)) 
        querier1 = scholar.ScholarQuerier()
        query1 = scholar.SearchScholarQuery()
        query1.set_num_page_results(1)
        query1.set_author(original_record_author)
        query1.set_pub(original_record_journal)
        query1.set_scope(original_record_title)
        if original_record_year is not None:          
            # widen the publication year to the interval [original_record_year-10, original_record_year+10]
            query1.set_timeframe(str(int(original_record_year)-10),str(int(original_record_year)+10))
        else:
            query1.set_timeframe(None,None)
        settings1 = scholar.ScholarSettings()
        settings1.set_citation_format(scholar.ScholarSettings.CITFORM_BIBTEX)
        querier1.apply_settings(settings1)
        querier1.send_query(query1)        
        gscholar_record = scholar.citation_export(querier1) 
        if len(gscholar_record) < 1:
            timestamp.timestamp('Google Scholar match FAILED!')
            nofound_match_count += 1
        else:
            timestamp.timestamp('Google Scholar match was SUCCESSFUL!')
            gscholar_match_result = True
            found_match_count += 1
            gscholar_record_str = gscholar_record[0]
            # Parse the BibTeX entry type (the token between '@' and '{').
            if gscholar_record_str.find('@') > -1:
                at_type = gscholar_record_str.find('@')
                sp_type = gscholar_record_str.find('{', at_type)
                output_record['type'] = gscholar_record_str[at_type + 1:sp_type]
            # Copy each BibTeX field of interest into the output record.
            for field in ('title', 'author', 'journal', 'volume', 'pages',
                          'year', 'publisher'):
                marker = field + '={'
                at_field = gscholar_record_str.find(marker)
                if at_field > -1:
                    sp_field = gscholar_record_str.find('}', at_field)
                    output_record[field] = gscholar_record_str[at_field + len(marker):sp_field]
    
        # open file for storing output data if not already open
        if 'output_data' not in locals():
            extra_fieldnames = ['type','title','author','journal','volume','pages','year','publisher']
            output_data_fieldnames = input_data.fieldnames + extra_fieldnames
            fw = open(output_data_file_name,'w')
            output_data = csv.DictWriter(fw,output_data_fieldnames,
                                        delimiter = output_field_delimiter)
            output_data.writeheader()
        output_data.writerow(output_record)
        
    print
    timestamp.timestamp("Summary: {0} matches found and {1} matches not found to '{2}'.".format(found_match_count, nofound_match_count, output_data_file_name))

    fr.close()
    fw.close()
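
A minimal usage sketch for the function above; the file names are hypothetical and only illustrate the expected CSV-in/CSV-out call:

# Hypothetical file names, for illustration only.
add_title_using_gscholar(
    'records_in.csv',
    'records_out.csv',
    input_field_delimiter=',',
    output_field_delimiter=',',
)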