class TestBibtexParsing2(unittest.TestCase):
    """ more tests """

    def setUp(self):
        self.parser = BibtexParser()

    def testBibtexWithCustomFieldnames(self):
        source = open(setup.BIBTEX_TEST_BIB2, 'r').read()
        results = self.parser.getEntries(source)
        r1 = results[0]
        self.assertEqual(r1['month'], 'Mar')
        self.assertEqual(r1['doi'], '10.1002/(ISSN)1097-0231')
        self.assertEqual(r1['date-added'], '2008-08-06 17:48:48 +0200')
        self.assertEqual(r1['rating'], '0')
        self.assertEqual(r1['keywords'], ['biology', 'chemistry'])
        r2 = results[1]
        self.assertEqual(r2['keywords'], ['something strange'])

    def testBibtexEncodedChars(self):
        source = open(setup.BIBTEX_TEST_BIB3, 'r').read()
        results = self.parser.getEntries(source)
        self.assertEqual(len(results), 2)
        r = results[0]
        self.assertEqual(r['title'], unicode('Der Fürst', 'iso-8859-15').encode('utf-8'))
        self.assertEqual(r['publisher'], unicode('Alfred Körner Verlag', 'iso-8859-15').encode('utf-8'))
        r = results[1]
        self.assertEqual(r['address'], unicode('Göttingen', 'iso-8859-15').encode('utf-8'))
Ejemplo n.º 2
0
 def __init__(self, id='ris', title="RIS format parser"):
     """
     Set up the parser by forwarding its ``id`` and ``title`` to
     BaseParser.__init__.
     """
     BaseParser.__init__(self, title=title, id=id)
Ejemplo n.º 3
0
 def __init__(self, id='endnote', title="EndNote's text format parser"):
     """
     Set up the parser by forwarding its ``id`` and ``title`` to
     BaseParser.__init__.
     """
     BaseParser.__init__(self, title=title, id=id)
Ejemplo n.º 4
0
 def __init__(self, id='xml_mods', title="XML(MODS) parser"):
     """
     Set up the parser by forwarding its ``id`` and ``title`` to
     BaseParser.__init__.
     """
     BaseParser.__init__(self, title=title, id=id)
Ejemplo n.º 5
0
def extract_all_bibtex(html,start,limit,delay=30):
    """
    Look up Bibtex links to obtain the publication information

    Walks every ``div`` of class ``gs_ri`` in ``html``, fetches the BibTeX
    link found in it, parses the response with BibtexParser.parseEntry and
    collects the parsed records that carry a non-empty 'publication_year'.

    Parameters:
        html  -- page source as a byte string (non-ASCII bytes are dropped)
        start -- offset added to the record index in progress messages
        limit -- value the final record count is compared against
        delay -- seconds to sleep after each fetched record (default 30)

    Returns a tuple ``(count, results, flag)``:
        - if a request fails: (index of that record, results so far, False)
        - if a parsed record has no usable 'pid': (index, results so far, True)
        - otherwise: (nrec, results, nrec == limit) where nrec is the number
          of ``p`` tags in the page minus 2
    """
    from BeautifulSoup import BeautifulSoup
    from bibliograph.parsing.parsers.bibtex import BibtexParser

    bp = BibtexParser()
    html = html.decode('ascii', 'ignore')
    soup = BeautifulSoup(html)
    results = []
    for irec, record in enumerate(soup('div', attrs={'class': 'gs_ri'})):
        print(start+irec)
        #Skip records that are just citations, as they are often erroneous
        if 'CITATION' in str(record.contents[0]):
            continue
        #If there's not BibTeX link, we're at the end:
        if 'Import' not in str(record.contents[-1]):
            break

        #Bibtex links are tagged gs_fl
        links = record.find('div', {'class': 'gs_fl'})
        biblink = [link for link in links('a') if 'bib?' in str(link)][0]

        url = SEARCH_HOST + str(biblink.attrs[0][1])
        print(url)
        req = Request(url, None, headers)
        try:
            handle = urlopen(req)
        except Exception:
            # Exception (not a bare except) so that Ctrl-C during a long
            # scrape is not mistaken for a blocked request.
            print('Search did not finish -- GScholar blocked you!')
            print('restart at  %s' % (start+irec))
            return irec, results, False

        # Always release the connection, even if reading fails.
        try:
            bibtex_entry = handle.read()
        finally:
            handle.close()

        bibrec = bp.parseEntry(bibtex_entry)
        try:
            pid = bibrec['pid']
        except (KeyError, TypeError):
            # KeyError: parsed dict without 'pid'; TypeError: parseEntry
            # returned something that cannot be indexed by a string key.
            print('something weird happened!!!!')
            return irec, results, True
        print(pid)

        #Try to ignore junk entries
        if 'publication_year' in bibrec:
            # Compare by value; identity comparison against '' is unreliable.
            if bibrec['publication_year'] != '':
                results.append(bibrec)
                print('accepted')
            else:
                print('rejected')

        sleep(delay) #Go slowly so we aren't flagged as a bot

    nrec = len(soup('p')) - 2
    return nrec, results, nrec == limit
Ejemplo n.º 6
0
def bibfile2dictlist(fname,do_postprocess=True,scopus=False,printupdates=False):
    """
    Takes a *.bib file name as input, and returns a list, with each
    element a dictionary corresponding to one of the BibTeX entries
    in the file.

    The file is scanned line by line; a line starting with '@' begins a
    new entry, and the text accumulated so far is handed to
    BibtexParser.parseEntry. Results that are not dicts are reported with
    a 'Not a bibtex entry' message and dropped.

    Parameters:
        fname          -- path of the *.bib file
        do_postprocess -- if true, the list is passed through postprocess()
                          before being returned
        scopus         -- if true, 'author=' lines are rewritten with a fixed
                          series of string replacements before parsing
        printupdates   -- if true, print the running entry count each time
                          a new '@' line is reached

    This should really be rewritten as a proper parser.
    Issues (noted by the original author):
        - Chokes on blank lines in the middle of bibtex entries
    """

    from bibliograph.parsing.parsers.bibtex import BibtexParser

    bp = BibtexParser()
    biblist = []

    def store(entry):
        # Parse one accumulated chunk; keep it only if it parsed to a dict.
        bibrec = bp.parseEntry(entry)
        if isinstance(bibrec, dict):
            biblist.append(bibrec)
        else:
            print('Not a bibtex entry: '+entry)

    # Replacements applied, in this order, to Scopus 'author=' lines.
    scopus_fixes = (('a ', ' '), ('b ', ' '), ('c ', ' '), ('d ', ' '),
                    ('e ', ' '), (' , ', ' and '), ('., ', '. and '))

    f = open(fname)
    try:
        # NOTE(review): the first line of the file is read and discarded,
        # exactly as before -- presumably a header line; confirm with callers.
        f.readline()

        entry = ''
        while True:
            line = f.readline()
            if not line:
                # readline() returns '' at end of file: flush the last entry.
                store(entry)
                break
            if line.startswith('@'):
                store(entry)
                entry = line
                if printupdates:
                    print(len(biblist))
                continue
            if scopus and line.strip().startswith('author='):
                #Scopus messes up the author format
                for old, new in scopus_fixes:
                    line = line.replace(old, new)
            entry = entry + line
    finally:
        f.close()

    if do_postprocess:
        biblist = postprocess(biblist)
    return biblist
Ejemplo n.º 7
0
def parsefile(filename):
    """
    Takes a file name (string, including path) and returns a list of dictionaries,
    one dictionary for each bibtex entry in the file.

    Uses the bibliograph.parsing package: the file content is split with
    BibtexParser.splitSource and each piece is parsed with
    BibtexParser.parseEntry. Pieces that do not parse to a dict are dropped.
    """
    from bibliograph.parsing.parsers.bibtex import BibtexParser
    bp = BibtexParser()

    # Read everything up front and close the file before parsing, so a
    # parser error cannot leave the file open.
    f = open(filename)
    try:
        source = f.read()
    finally:
        f.close()

    ents = [bp.parseEntry(x) for x in bp.splitSource(source)]

    #Parsing errors give strings, so keep only dicts:
    return [x for x in ents if isinstance(x, dict)]
Ejemplo n.º 8
0
 def parseEntry(self, entry):
     """Parse ``entry`` via BibtexParser.parseEntry and run the result through fixupresult."""
     parsed = BibtexParser.parseEntry(self, entry)
     return fixupresult(parsed)
 def setUp(self):
     """Create a new BibtexParser and keep it on ``self.parser``."""
     fresh_parser = BibtexParser()
     self.parser = fresh_parser
class TestBibtexParsing(unittest.TestCase):
    """
    """

    def setUp(self):
        self.parser = BibtexParser()

    def testFormatDetection(self):
        source_files = (setup.MEDLINE_TEST_BIB, setup.BIBTEX_TEST_BIB, 
                        setup.IDCOOKING_TEST_BIB, setup.PDFFOLDER_TEST_BIB, 
                        setup.BIBTEX_TEST_BIB_DUP, setup.BIBTEX_TEST_MULTI_AUTHORS,
                        setup.BIBTEX_TEST_INBOOKREFERENCES, setup.BIBTEX_TEST_LASTFIELDKOMMA,
                        setup.BIBTEX_TEST_TYPEFIELD, setup.BIBTEX_TEST_CITE_KEY)

        for source_file in source_files:
            source = open(source_file, 'r').read()
            self.failUnless(self.parser.checkFormat(source), 'BibTeX parser failed to detect BibTeX format in file %s' % source_file)

        # check negative detection (check properly rejects non-bibtex format files)
        source = open(setup.MEDLINE_TEST_MED, 'r').read()
        self.failIf(self.parser.checkFormat(source), 'BibTeX parser incorrectly detected BibTeX format in file %s' % setup.MEDLINE_TEST_MED)

    def testBibtexAuthorParsing(self):
        source = open(setup.BIBTEX_TEST_MULTI_AUTHORS, 'r').read()
        source = self.parser.preprocess(source)
        result = self.parser.parseEntry(source)
        heckman =  {'middlename': 'J.',
                    'firstname' : 'James',
                    'lastname'  : 'Heckman'}
        carneiro = {'middlename': '',
                    'firstname' : 'Pedro',
                    'lastname'  : 'Carneiro'}
        self.failUnless( len( result['authors'] ) == 2 )
        author1 = result['authors'][0]
        self.failUnless(author1['middlename'] == carneiro['middlename'])
        self.failUnless(author1['firstname'] == carneiro['firstname'])
        self.failUnless(author1['lastname'] == carneiro['lastname'])
        author2 = result['authors'][1]
        self.failUnless(author2['middlename'] == heckman['middlename'])
        self.failUnless(author2['firstname'] == heckman['firstname'])
        self.failUnless(author2['lastname'] == heckman['lastname'])

    def testBibtexInbookReferenceParsing(self):
        source = open(setup.BIBTEX_TEST_INBOOKREFERENCES, 'r').read()
        ref = {
            'booktitle': 'In einem fiktiven Buch vor unserer Zeit',
            'title': 'Die Tage der Ankunft',
            'chapter': 'Die Tage der Ankunft',
            'publication_url': 'http://www.sunweavers.net/',
        }

        source = self.parser.preprocess(source)
        result = self.parser.parseEntry(source)

        for key in ref.keys():
            self.failUnless( result.has_key(key) and (ref[key] == result[key]),key )

    def testAnnoteParsing(self):
        source = open(setup.BIBTEX_TEST_BIB, 'r').read()
        results = self.parser.getEntries(source)
        self.failUnless(results[-1]['annote'] == 'I really like it.')

    def testIdentifierParsing(self):
        source = open(setup.BIBTEX_TEST_BIB, 'r').read()
        results = self.parser.getEntries(source)
        result = results[2]
        self.assertEqual(result['identifiers'], [{'label' : 'ISBN', 'value' : '3874402436'},
                                                 {'label' : 'DOI', 'value' : '1-23-345'}])

    def testBibtexTypeFieldParsing(self):
        source = open(setup.BIBTEX_TEST_TYPEFIELD, 'r').read()
        ref = {
            'publication_type': 'Doktorarbeit',
            'title': 'Mein Herr Doktor',
            'school': 'CAU Kiel',
            'institution': 'Ökologie-Zentrum',
        }

        source = self.parser.checkEncoding(source)
        source = self.parser.preprocess(source)
        result = self.parser.parseEntry(source)

        for key in ref.keys():
            self.failUnless( result.has_key(key) and (ref[key] == result[key]) )

    def testBibtexTypeLastFieldTrailingKomma(self):
        source = open(setup.BIBTEX_TEST_LASTFIELDKOMMA, 'r').read()
        results = self.parser.getEntries(source)

        # the last field in a bibtex entry always had a trailing ","
        self.failUnless( len(results) == 2  )
        self.failUnless( results[0]['institution'] == results[1]['institution']  )
        self.failUnless( results[0]['publication_type'] == results[1]['publication_type']  )
        self.failUnless( results[0]['publication_type'] == 'Doktorarbeit,,,' )