Exemple #1
0
def bibfile2dictlist(fname,
                     do_postprocess=True,
                     scopus=False,
                     printupdates=False):
    """
    Read a *.bib file and return a list with one dictionary per BibTeX entry.

    The file is scanned line by line.  A line starting with '@' begins a new
    entry; the text accumulated for the previous entry is handed to
    BibtexParser.parseEntry.  Parse results that are not dictionaries are
    reported on stdout ('Not a bibtex entry: ...') and dropped.

    Parameters:
        fname: path of the *.bib file to read.
        do_postprocess: when true, the collected list is passed through
            postprocess() and that function's result is returned.
        scopus: when true, lines whose stripped text starts with 'author='
            are rewritten with a fixed sequence of string replacements
            before being added to the entry (Scopus messes up the author
            format).
        printupdates: when true, print the number of entries collected so
            far each time a new '@' line is encountered.

    Returns:
        The list of entry dictionaries, or the result of postprocess() on
        that list when do_postprocess is true.

    Known issues:
        - This should really be rewritten as a proper parser.
        - Chokes on blank lines in the middle of bibtex entries.
        - NOTE(review): the first line of the file is always read and
          discarded, so an entry header on line 1 is lost -- confirm
          whether callers rely on a header line being present.
    """
    from bibliograph.parsing.parsers.bibtex import BibtexParser

    # Applied in this exact order to Scopus 'author=' lines.
    scopus_author_fixes = (
        ('a ', ' '),
        ('b ', ' '),
        ('c ', ' '),
        ('d ', ' '),
        ('e ', ' '),
        (' , ', ' and '),
        ('., ', '. and '),
    )

    bp = BibtexParser()
    biblist = []

    def store(entry_text):
        # Parse one accumulated entry; keep it only if the parser
        # returned a dictionary.
        bibrec = bp.parseEntry(entry_text)
        if isinstance(bibrec, dict):
            biblist.append(bibrec)
        else:
            print('Not a bibtex entry: ' + entry_text)

    # The context manager guarantees the file is closed on every exit path.
    with open(fname) as f:
        f.readline()  # first line is skipped (see Known issues)

        entry = ''
        while True:
            try:
                line = f.readline()
            except EnvironmentError:
                # Best effort: treat a read failure like end of file and
                # keep whatever has been collected so far.
                line = ''

            if line.startswith('@'):
                # A new entry begins: flush the previous one.
                store(entry)
                entry = line
                if printupdates:
                    print(len(biblist))
                continue

            if not line:
                # readline() returns '' at end of file: flush the last entry.
                store(entry)
                break

            if scopus and line.strip().startswith('author='):
                for old, new in scopus_author_fixes:
                    line = line.replace(old, new)
            entry = entry + line

    if do_postprocess:
        biblist = postprocess(biblist)
    return biblist
Exemple #2
0
def extract_all_bibtex(html, start, limit, delay=30):
    """
    Look up Bibtex links to obtain the publication information.

    Each 'div' with class 'gs_ri' in the page is treated as one record.
    For every record that has a BibTeX import link, the linked entry is
    downloaded, parsed with BibtexParser.parseEntry and kept if it has a
    non-empty 'publication_year'.

    Parameters:
        html: page source as a byte string; non-ASCII bytes are dropped.
        start: offset added to the record index in progress messages.
        limit: value the final record count is compared against.
        delay: seconds to sleep after each downloaded record (default 30,
            so that we go slowly and aren't flagged as a bot).

    Returns:
        A tuple (count, results, flag):
        - if a download fails: (index of the failing record, results, False)
        - if the parsed entry has no 'pid': (index of that record,
          results, True)
        - otherwise: (nrec, results, nrec == limit), where nrec is the
          number of 'p' elements in the page minus 2.
        NOTE(review): the meaning callers attach to the flag is not visible
        here -- confirm against the caller.
    """
    from BeautifulSoup import BeautifulSoup
    from bibliograph.parsing.parsers.bibtex import BibtexParser

    bp = BibtexParser()
    html = html.decode('ascii', 'ignore')
    soup = BeautifulSoup(html)
    results = []
    for irec, record in enumerate(soup('div', attrs={'class': 'gs_ri'})):
        print(start + irec)
        # Skip records that are just citations, as they are often erroneous
        if 'CITATION' in str(record.contents[0]):
            continue
        # If there's no BibTeX link, we're at the end:
        if 'Import' not in str(record.contents[-1]):
            break

        # Bibtex links are tagged gs_fl
        links = record.find('div', {'class': 'gs_fl'})
        biblink = [link for link in links('a') if 'bib?' in str(link)][0]

        url = SEARCH_HOST + str(biblink.attrs[0][1])
        print(url)
        req = Request(url, None, headers)
        try:
            handle = urlopen(req)
        except Exception:
            # Any download failure is treated as being blocked; unlike a
            # bare except this still lets Ctrl-C through.
            print('Search did not finish -- GScholar blocked you!')
            print('restart at  %s' % (start + irec,))
            return irec, results, False

        # Close the connection even if reading the response fails.
        try:
            bibtex_entry = handle.read()
        finally:
            handle.close()

        bibrec = bp.parseEntry(bibtex_entry)
        try:
            print(bibrec['pid'])
        except Exception:
            # The parse result is not a dictionary or has no 'pid'.
            print('something weird happened!!!!')
            return irec, results, True

        # Try to ignore junk entries
        if 'publication_year' in bibrec:
            # Compare by value: an identity test against '' only works by
            # accident and never matches an empty unicode string.
            if bibrec['publication_year'] != '':
                results.append(bibrec)
                print('accepted')
            else:
                print('rejected')

        sleep(delay)  # Go slowly so we aren't flagged as a bot

    nrec = len(soup('p')) - 2
    return nrec, results, nrec == limit
Exemple #3
0
def parsefile(filename):
    """
    Parse a BibTeX file into a list of dictionaries.

    Takes a file name (string, including path), splits its contents into
    entries with BibtexParser.splitSource and parses each one with
    BibtexParser.parseEntry.

    Uses the bibliograph.parsing package.

    Parameters:
        filename: path of the file to read.

    Returns:
        A list with one dictionary per successfully parsed entry.
    """
    from bibliograph.parsing.parsers.bibtex import BibtexParser
    bp = BibtexParser()

    # The context manager closes the file even if reading fails.
    with open(filename) as f:
        source = f.read()

    parsed = [bp.parseEntry(chunk) for chunk in bp.splitSource(source)]

    # Parsing errors give strings, so keep only dicts:
    return [rec for rec in parsed if isinstance(rec, dict)]
Exemple #4
0
 def setUp(self):
     """Construct a BibtexParser and bind it to self.parser."""
     parser = BibtexParser()
     self.parser = parser