Exemple #1
0
class TestBibtexParsing2(unittest.TestCase):
    """Parser tests for custom BibTeX field names and encoded characters."""

    def setUp(self):
        self.parser = BibtexParser()

    def _read(self, path):
        # Read a fixture completely and close the handle right away; the
        # previous open(...).read() idiom left closing to the garbage
        # collector.
        with open(path, 'r') as handle:
            return handle.read()

    def testBibtexWithCustomFieldnames(self):
        # Non-standard fields must be available under their own names and
        # the 'keywords' field is compared against a list.
        source = self._read(setup.BIBTEX_TEST_BIB2)
        results = self.parser.getEntries(source)
        r1 = results[0]
        self.assertEqual(r1['month'], 'Mar')
        self.assertEqual(r1['doi'], '10.1002/(ISSN)1097-0231')
        self.assertEqual(r1['date-added'], '2008-08-06 17:48:48 +0200')
        self.assertEqual(r1['rating'], '0')
        self.assertEqual(r1['keywords'], ['biology', 'chemistry'])
        r2 = results[1]
        self.assertEqual(r2['keywords'], ['something strange'])

    def testBibtexEncodedChars(self):
        # Expected values are the literals below decoded as ISO-8859-15 and
        # re-encoded as UTF-8.
        source = self._read(setup.BIBTEX_TEST_BIB3)
        results = self.parser.getEntries(source)
        self.assertEqual(len(results), 2)
        r = results[0]
        self.assertEqual(r['title'],
                         unicode('Der Fürst', 'iso-8859-15').encode('utf-8'))
        self.assertEqual(
            r['publisher'],
            unicode('Alfred Körner Verlag', 'iso-8859-15').encode('utf-8'))
        r = results[1]
        self.assertEqual(r['address'],
                         unicode('Göttingen', 'iso-8859-15').encode('utf-8'))
class TestBibtexParsing2(unittest.TestCase):
    """Parser tests for custom BibTeX field names and encoded characters."""

    def setUp(self):
        self.parser = BibtexParser()

    def _read(self, path):
        # Read a fixture completely and close the handle right away; the
        # previous open(...).read() idiom left closing to the garbage
        # collector.
        with open(path, 'r') as handle:
            return handle.read()

    def testBibtexWithCustomFieldnames(self):
        # Non-standard fields must be available under their own names and
        # the 'keywords' field is compared against a list.
        source = self._read(setup.BIBTEX_TEST_BIB2)
        results = self.parser.getEntries(source)
        r1 = results[0]
        self.assertEqual(r1['month'], 'Mar')
        self.assertEqual(r1['doi'], '10.1002/(ISSN)1097-0231')
        self.assertEqual(r1['date-added'], '2008-08-06 17:48:48 +0200')
        self.assertEqual(r1['rating'], '0')
        self.assertEqual(r1['keywords'], ['biology', 'chemistry'])
        r2 = results[1]
        self.assertEqual(r2['keywords'], ['something strange'])

    def testBibtexEncodedChars(self):
        # Expected values are the literals below decoded as ISO-8859-15 and
        # re-encoded as UTF-8.
        source = self._read(setup.BIBTEX_TEST_BIB3)
        results = self.parser.getEntries(source)
        self.assertEqual(len(results), 2)
        r = results[0]
        self.assertEqual(r['title'], unicode('Der Fürst', 'iso-8859-15').encode('utf-8'))
        self.assertEqual(r['publisher'], unicode('Alfred Körner Verlag', 'iso-8859-15').encode('utf-8'))
        r = results[1]
        self.assertEqual(r['address'], unicode('Göttingen', 'iso-8859-15').encode('utf-8'))
Exemple #3
0
def extract_all_bibtex(html, start, limit):
    """
    Look up Bibtex links to obtain the publication information

    Every ``div`` of class ``gs_ri`` in ``html`` is inspected; its BibTeX
    link is downloaded from ``SEARCH_HOST`` and the response is parsed with
    ``BibtexParser.parseEntry``.

    Parameters:
        html  -- byte string of a result page; it is decoded as ASCII and
                 undecodable bytes are dropped.
        start -- offset added to the record index in progress output.
        limit -- value the final record count is compared with to compute
                 the returned flag.

    Returns a tuple ``(count, results, flag)``:
        * ``(irec, results, False)`` if downloading a BibTeX entry fails,
        * ``(irec, results, True)`` if a parsed entry has no usable ``'pid'``,
        * otherwise ``(nrec, results, nrec == limit)`` where ``nrec`` is the
          number of ``<p>`` tags in the page minus two.
    """
    from BeautifulSoup import BeautifulSoup
    from bibliograph.parsing.parsers.bibtex import BibtexParser

    bp = BibtexParser()
    html = html.decode('ascii', 'ignore')
    soup = BeautifulSoup(html)
    results = []
    for irec, record in enumerate(soup('div', attrs={'class': 'gs_ri'})):
        print(start + irec)
        # Skip records that are just citations, as they are often erroneous
        if str(record.contents[0]).find('CITATION') > -1:
            continue
        # If there's no BibTeX link, we're at the end:
        if str(record.contents[-1]).find('Import') == -1:
            break

        # Bibtex links are tagged gs_fl
        links = record.find('div', {'class': 'gs_fl'})
        if links is None:
            biblinks = []
        else:
            biblinks = [link for link in links('a') if 'bib?' in str(link)]
        if not biblinks:
            # A record without a link used to raise here, discarding every
            # result collected so far; skip the record instead.
            print('no BibTeX link found -- skipping record')
            continue
        biblink = biblinks[0]

        url_end = str(biblink.attrs[0][1])
        url = SEARCH_HOST + url_end
        print(url)
        req = Request(url, None, headers)
        try:
            handle = urlopen(req)
        except Exception:
            # Exception rather than a bare except, so Ctrl-C still aborts.
            print('Search did not finish -- GScholar blocked you!')
            print('restart at  %s' % (start + irec,))
            return irec, results, False

        try:
            bibtex_entry = handle.read()
        finally:
            # Close the response even when reading it fails.
            handle.close()

        bibrec = bp.parseEntry(bibtex_entry)
        try:
            print(bibrec['pid'])
        except Exception:
            print('something weird happened!!!!')
            return irec, results, True
        # Try to ignore junk entries
        if 'publication_year' in bibrec:
            # Compare by value: "is not ''" tested object identity and so
            # let empty unicode strings through.
            if bibrec['publication_year'] != '':
                results.append(bibrec)
                print('accepted')
            else:
                print('rejected')

        sleep(30)  # Go slowly so we aren't flagged as a bot

    nrec = len(soup('p')) - 2
    return nrec, results, nrec == limit
Exemple #4
0
def bibfile2dictlist(fname,
                     do_postprocess=True,
                     scopus=False,
                     printupdates=False):
    """
    Takes a *.bib file name as input, and returns a list, with each
    element a dictionary corresponding to one of the BibTeX entries
    in the file.

    Parameters:
        fname          -- path of the *.bib file to read.
        do_postprocess -- if true, the list is passed through
                          ``postprocess`` before it is returned.
        scopus         -- if true, lines starting with ``author=`` are
                          rewritten (see the inline comment).
        printupdates   -- if true, print the number of entries collected
                          so far each time a new ``@`` line is reached.

    Text collected up to an ``@`` line (or the end of the file) is handed
    to ``BibtexParser.parseEntry``; results that are not dicts are reported
    on stdout and dropped.

    This should really be rewritten as a proper parser.
    Issues:
        - Chokes on blank lines in the middle of bibtex entries
        - The first line of the file is always discarded
    """
    from bibliograph.parsing.parsers.bibtex import BibtexParser

    bp = BibtexParser()
    biblist = []

    def store(text):
        # Parse one accumulated chunk and keep it only if parsing gave a dict.
        bibrec = bp.parseEntry(text)
        if isinstance(bibrec, dict):
            biblist.append(bibrec)
        else:
            print('Not a bibtex entry: ' + text)

    entry = ''
    # The context manager closes the file on every exit path.
    with open(fname) as f:
        # NOTE(review): the first line is read and thrown away, exactly as
        # before; if a file starts directly with an '@' line that entry
        # loses its header -- confirm this is intended.
        f.readline()
        while True:
            try:
                line = f.readline()
            except (IOError, ValueError):
                # A read failure ends the input like end-of-file does, so
                # the pending text still goes through the dict check below.
                line = ''
            if not line:
                break
            if line.startswith('@'):
                store(entry)
                entry = line
                if printupdates:
                    print(len(biblist))
                continue
            if scopus and line.strip().startswith('author='):
                # Scopus messes up the author format
                for marker in ('a ', 'b ', 'c ', 'd ', 'e '):
                    line = line.replace(marker, ' ')
                line = line.replace(' , ', ' and ')
                line = line.replace('., ', '. and ')
            entry = entry + line

    store(entry)
    if do_postprocess:
        biblist = postprocess(biblist)
    return biblist
 def __init__(self, id='endnote', title="EndNote's text format parser"):
     """
     Hand this parser's ``id`` and ``title`` over to ``BaseParser.__init__``.
     """
     BaseParser.__init__(self, title=title, id=id)
Exemple #6
0
 def __init__(self, id='xml_mods', title="XML(MODS) parser"):
     """
     Hand this parser's ``id`` and ``title`` over to ``BaseParser.__init__``.
     """
     BaseParser.__init__(self, title=title, id=id)
Exemple #7
0
 def __init__(self, id='ris', title="RIS format parser"):
     """
     Hand this parser's ``id`` and ``title`` over to ``BaseParser.__init__``.
     """
     BaseParser.__init__(self, title=title, id=id)
Exemple #8
0
def extract_all_bibtex(html, start, limit):
    """
    Look up Bibtex links to obtain the publication information

    Every ``div`` of class ``gs_ri`` in ``html`` is inspected; its BibTeX
    link is downloaded from ``SEARCH_HOST`` and the response is parsed with
    ``BibtexParser.parseEntry``.

    Parameters:
        html  -- byte string of a result page; it is decoded as ASCII and
                 undecodable bytes are dropped.
        start -- offset added to the record index in progress output.
        limit -- value the final record count is compared with to compute
                 the returned flag.

    Returns a tuple ``(count, results, flag)``:
        * ``(irec, results, False)`` if downloading a BibTeX entry fails,
        * ``(irec, results, True)`` if a parsed entry has no usable ``'pid'``,
        * otherwise ``(nrec, results, nrec == limit)`` where ``nrec`` is the
          number of ``<p>`` tags in the page minus two.
    """
    from BeautifulSoup import BeautifulSoup
    from bibliograph.parsing.parsers.bibtex import BibtexParser

    bp = BibtexParser()
    html = html.decode('ascii', 'ignore')
    soup = BeautifulSoup(html)
    results = []
    for irec, record in enumerate(soup('div', attrs={'class': 'gs_ri'})):
        print(start + irec)
        # Skip records that are just citations, as they are often erroneous
        if str(record.contents[0]).find('CITATION') > -1:
            continue
        # If there's no BibTeX link, we're at the end:
        if str(record.contents[-1]).find('Import') == -1:
            break

        # Bibtex links are tagged gs_fl
        links = record.find('div', {'class': 'gs_fl'})
        if links is None:
            biblinks = []
        else:
            biblinks = [link for link in links('a') if 'bib?' in str(link)]
        if not biblinks:
            # A record without a link used to raise here, discarding every
            # result collected so far; skip the record instead.
            print('no BibTeX link found -- skipping record')
            continue
        biblink = biblinks[0]

        url_end = str(biblink.attrs[0][1])
        url = SEARCH_HOST + url_end
        print(url)
        req = Request(url, None, headers)
        try:
            handle = urlopen(req)
        except Exception:
            # Exception rather than a bare except, so Ctrl-C still aborts.
            print('Search did not finish -- GScholar blocked you!')
            print('restart at  %s' % (start + irec,))
            return irec, results, False

        try:
            bibtex_entry = handle.read()
        finally:
            # Close the response even when reading it fails.
            handle.close()

        bibrec = bp.parseEntry(bibtex_entry)
        try:
            print(bibrec['pid'])
        except Exception:
            print('something weird happened!!!!')
            return irec, results, True
        # Try to ignore junk entries
        if 'publication_year' in bibrec:
            # Compare by value: "is not ''" tested object identity and so
            # let empty unicode strings through.
            if bibrec['publication_year'] != '':
                results.append(bibrec)
                print('accepted')
            else:
                print('rejected')

        sleep(30)  # Go slowly so we aren't flagged as a bot

    nrec = len(soup('p')) - 2
    return nrec, results, nrec == limit
Exemple #9
0
def bibfile2dictlist(fname, do_postprocess=True, scopus=False, printupdates=False):
    """
    Takes a *.bib file name as input, and returns a list, with each
    element a dictionary corresponding to one of the BibTeX entries
    in the file.

    Parameters:
        fname          -- path of the *.bib file to read.
        do_postprocess -- if true, the list is passed through
                          ``postprocess`` before it is returned.
        scopus         -- if true, lines starting with ``author=`` are
                          rewritten (see the inline comment).
        printupdates   -- if true, print the number of entries collected
                          so far each time a new ``@`` line is reached.

    Text collected up to an ``@`` line (or the end of the file) is handed
    to ``BibtexParser.parseEntry``; results that are not dicts are reported
    on stdout and dropped.

    This should really be rewritten as a proper parser.
    Issues:
        - Chokes on blank lines in the middle of bibtex entries
        - The first line of the file is always discarded
    """
    from bibliograph.parsing.parsers.bibtex import BibtexParser

    bp = BibtexParser()
    biblist = []

    def store(text):
        # Parse one accumulated chunk and keep it only if parsing gave a dict.
        bibrec = bp.parseEntry(text)
        if isinstance(bibrec, dict):
            biblist.append(bibrec)
        else:
            print('Not a bibtex entry: ' + text)

    entry = ''
    # The context manager closes the file on every exit path.
    with open(fname) as f:
        # NOTE(review): the first line is read and thrown away, exactly as
        # before; if a file starts directly with an '@' line that entry
        # loses its header -- confirm this is intended.
        f.readline()
        while True:
            try:
                line = f.readline()
            except (IOError, ValueError):
                # A read failure ends the input like end-of-file does, so
                # the pending text still goes through the dict check below.
                line = ''
            if not line:
                break
            if line.startswith('@'):
                store(entry)
                entry = line
                if printupdates:
                    print(len(biblist))
                continue
            if scopus and line.strip().startswith('author='):
                # Scopus messes up the author format
                for marker in ('a ', 'b ', 'c ', 'd ', 'e '):
                    line = line.replace(marker, ' ')
                line = line.replace(' , ', ' and ')
                line = line.replace('., ', '. and ')
            entry = entry + line

    store(entry)
    if do_postprocess:
        biblist = postprocess(biblist)
    return biblist
Exemple #10
0
def parsefile(filename):
    """
    Takes a file name (string, including path) and returns a list of dictionaries,
    one dictionary for each bibtex entry in the file.

    Uses the bibliograph.parsing package: the file content is split with
    ``BibtexParser.splitSource`` and each piece is parsed with
    ``BibtexParser.parseEntry``.
    """
    from bibliograph.parsing.parsers.bibtex import BibtexParser
    bp = BibtexParser()

    # open() instead of the Python-2-only file() builtin; the with-block
    # closes the handle even if reading fails.
    with open(filename) as f:
        source = f.read()

    parsed = (bp.parseEntry(chunk) for chunk in bp.splitSource(source))
    # Parsing errors give strings, so keep only dicts:
    return [rec for rec in parsed if isinstance(rec, dict)]
Exemple #11
0
def parsefile(filename):
    """
    Takes a file name (string, including path) and returns a list of dictionaries,
    one dictionary for each bibtex entry in the file.

    Uses the bibliograph.parsing package: the file content is split with
    ``BibtexParser.splitSource`` and each piece is parsed with
    ``BibtexParser.parseEntry``.
    """
    from bibliograph.parsing.parsers.bibtex import BibtexParser
    bp = BibtexParser()

    # open() instead of the Python-2-only file() builtin; the with-block
    # closes the handle even if reading fails.
    with open(filename) as f:
        source = f.read()

    parsed = (bp.parseEntry(chunk) for chunk in bp.splitSource(source))
    # Parsing errors give strings, so keep only dicts:
    return [rec for rec in parsed if isinstance(rec, dict)]
Exemple #12
0
 def __init__(self,
              id='ris',
              title="RIS format parser"):
     """
     Hand this parser's ``id`` and ``title`` over to ``BaseParser.__init__``.
     """
     BaseParser.__init__(self, title=title, id=id)
Exemple #13
0
 def setUp(self):
     """Create a new ``BibtexParser`` and store it as ``self.parser``."""
     parser = BibtexParser()
     self.parser = parser
Exemple #14
0
class TestBibtexParsing(unittest.TestCase):
    """Tests for ``BibtexParser``: format detection and field parsing."""

    def setUp(self):
        self.parser = BibtexParser()

    def _read(self, path):
        # Read a fixture completely and close the handle right away; the
        # previous open(...).read() idiom left closing to the garbage
        # collector.
        with open(path, 'r') as handle:
            return handle.read()

    def testFormatDetection(self):
        source_files = (setup.MEDLINE_TEST_BIB, setup.BIBTEX_TEST_BIB,
                        setup.IDCOOKING_TEST_BIB, setup.PDFFOLDER_TEST_BIB,
                        setup.BIBTEX_TEST_BIB_DUP,
                        setup.BIBTEX_TEST_MULTI_AUTHORS,
                        setup.BIBTEX_TEST_INBOOKREFERENCES,
                        setup.BIBTEX_TEST_LASTFIELDKOMMA,
                        setup.BIBTEX_TEST_TYPEFIELD,
                        setup.BIBTEX_TEST_CITE_KEY)

        for source_file in source_files:
            source = self._read(source_file)
            self.assertTrue(
                self.parser.checkFormat(source),
                'BibTeX parser failed to detect BibTeX format in file %s' %
                source_file)

        # check negative detection (check properly rejects non-bibtex format files)
        source = self._read(setup.MEDLINE_TEST_MED)
        self.assertFalse(
            self.parser.checkFormat(source),
            'BibTeX parser incorrectly detected BibTeX format in file %s' %
            setup.MEDLINE_TEST_MED)

    def testBibtexAuthorParsing(self):
        source = self._read(setup.BIBTEX_TEST_MULTI_AUTHORS)
        source = self.parser.preprocess(source)
        result = self.parser.parseEntry(source)
        heckman = {
            'middlename': 'J.',
            'firstname': 'James',
            'lastname': 'Heckman'
        }
        carneiro = {
            'middlename': '',
            'firstname': 'Pedro',
            'lastname': 'Carneiro'
        }
        self.assertEqual(len(result['authors']), 2)
        # Carneiro is expected first, Heckman second.
        for author, expected in zip(result['authors'], (carneiro, heckman)):
            for part in ('middlename', 'firstname', 'lastname'):
                self.assertEqual(author[part], expected[part])

    def testBibtexInbookReferenceParsing(self):
        source = self._read(setup.BIBTEX_TEST_INBOOKREFERENCES)
        ref = {
            'booktitle': 'In einem fiktiven Buch vor unserer Zeit',
            'title': 'Die Tage der Ankunft',
            'chapter': 'Die Tage der Ankunft',
            'publication_url': 'http://www.sunweavers.net/',
        }

        source = self.parser.preprocess(source)
        result = self.parser.parseEntry(source)

        for key in ref:
            # Presence and value are checked separately so a failure shows
            # which of the two went wrong.
            self.assertTrue(key in result, key)
            self.assertEqual(ref[key], result[key], key)

    def testAnnoteParsing(self):
        source = self._read(setup.BIBTEX_TEST_BIB)
        results = self.parser.getEntries(source)
        self.assertEqual(results[-1]['annote'], 'I really like it.')

    def testIdentifierParsing(self):
        source = self._read(setup.BIBTEX_TEST_BIB)
        results = self.parser.getEntries(source)
        result = results[2]
        self.assertEqual(result['identifiers'], [{
            'label': 'ISBN',
            'value': '3874402436'
        }, {
            'label': 'DOI',
            'value': '1-23-345'
        }])

    def testBibtexTypeFieldParsing(self):
        source = self._read(setup.BIBTEX_TEST_TYPEFIELD)
        ref = {
            'publication_type': 'Doktorarbeit',
            'title': 'Mein Herr Doktor',
            'school': 'CAU Kiel',
            'institution': 'Ökologie-Zentrum',
        }

        source = self.parser.checkEncoding(source)
        source = self.parser.preprocess(source)
        result = self.parser.parseEntry(source)

        for key in ref:
            self.assertTrue(key in result, key)
            self.assertEqual(ref[key], result[key], key)

    def testBibtexTypeLastFieldTrailingKomma(self):
        source = self._read(setup.BIBTEX_TEST_LASTFIELDKOMMA)
        results = self.parser.getEntries(source)

        # the last field in a bibtex entry always had a trailing ","
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['institution'], results[1]['institution'])
        self.assertEqual(results[0]['publication_type'],
                         results[1]['publication_type'])
        self.assertEqual(results[0]['publication_type'], 'Doktorarbeit,,,')
 def parseEntry(self, entry):
     """Parse ``entry`` via ``BibtexParser.parseEntry`` and return the result passed through ``fixupresult``."""
     parsed = BibtexParser.parseEntry(self, entry)
     return fixupresult(parsed)
 def setUp(self):
     """Create a new ``BibtexParser`` and store it as ``self.parser``."""
     parser = BibtexParser()
     self.parser = parser
class TestBibtexParsing(unittest.TestCase):
    """Tests for ``BibtexParser``: format detection and field parsing."""

    def setUp(self):
        self.parser = BibtexParser()

    def _read(self, path):
        # Read a fixture completely and close the handle right away; the
        # previous open(...).read() idiom left closing to the garbage
        # collector.
        with open(path, 'r') as handle:
            return handle.read()

    def testFormatDetection(self):
        source_files = (setup.MEDLINE_TEST_BIB, setup.BIBTEX_TEST_BIB,
                        setup.IDCOOKING_TEST_BIB, setup.PDFFOLDER_TEST_BIB,
                        setup.BIBTEX_TEST_BIB_DUP, setup.BIBTEX_TEST_MULTI_AUTHORS,
                        setup.BIBTEX_TEST_INBOOKREFERENCES, setup.BIBTEX_TEST_LASTFIELDKOMMA,
                        setup.BIBTEX_TEST_TYPEFIELD, setup.BIBTEX_TEST_CITE_KEY)

        for source_file in source_files:
            source = self._read(source_file)
            self.assertTrue(self.parser.checkFormat(source), 'BibTeX parser failed to detect BibTeX format in file %s' % source_file)

        # check negative detection (check properly rejects non-bibtex format files)
        source = self._read(setup.MEDLINE_TEST_MED)
        self.assertFalse(self.parser.checkFormat(source), 'BibTeX parser incorrectly detected BibTeX format in file %s' % setup.MEDLINE_TEST_MED)

    def testBibtexAuthorParsing(self):
        source = self._read(setup.BIBTEX_TEST_MULTI_AUTHORS)
        source = self.parser.preprocess(source)
        result = self.parser.parseEntry(source)
        heckman = {'middlename': 'J.',
                   'firstname': 'James',
                   'lastname': 'Heckman'}
        carneiro = {'middlename': '',
                    'firstname': 'Pedro',
                    'lastname': 'Carneiro'}
        self.assertEqual(len(result['authors']), 2)
        # Carneiro is expected first, Heckman second.
        for author, expected in zip(result['authors'], (carneiro, heckman)):
            for part in ('middlename', 'firstname', 'lastname'):
                self.assertEqual(author[part], expected[part])

    def testBibtexInbookReferenceParsing(self):
        source = self._read(setup.BIBTEX_TEST_INBOOKREFERENCES)
        ref = {
            'booktitle': 'In einem fiktiven Buch vor unserer Zeit',
            'title': 'Die Tage der Ankunft',
            'chapter': 'Die Tage der Ankunft',
            'publication_url': 'http://www.sunweavers.net/',
        }

        source = self.parser.preprocess(source)
        result = self.parser.parseEntry(source)

        for key in ref:
            # Presence and value are checked separately so a failure shows
            # which of the two went wrong.
            self.assertTrue(key in result, key)
            self.assertEqual(ref[key], result[key], key)

    def testAnnoteParsing(self):
        source = self._read(setup.BIBTEX_TEST_BIB)
        results = self.parser.getEntries(source)
        self.assertEqual(results[-1]['annote'], 'I really like it.')

    def testIdentifierParsing(self):
        source = self._read(setup.BIBTEX_TEST_BIB)
        results = self.parser.getEntries(source)
        result = results[2]
        self.assertEqual(result['identifiers'], [{'label': 'ISBN', 'value': '3874402436'},
                                                 {'label': 'DOI', 'value': '1-23-345'}])

    def testBibtexTypeFieldParsing(self):
        source = self._read(setup.BIBTEX_TEST_TYPEFIELD)
        ref = {
            'publication_type': 'Doktorarbeit',
            'title': 'Mein Herr Doktor',
            'school': 'CAU Kiel',
            'institution': 'Ökologie-Zentrum',
        }

        source = self.parser.checkEncoding(source)
        source = self.parser.preprocess(source)
        result = self.parser.parseEntry(source)

        for key in ref:
            self.assertTrue(key in result, key)
            self.assertEqual(ref[key], result[key], key)

    def testBibtexTypeLastFieldTrailingKomma(self):
        source = self._read(setup.BIBTEX_TEST_LASTFIELDKOMMA)
        results = self.parser.getEntries(source)

        # the last field in a bibtex entry always had a trailing ","
        self.assertEqual(len(results), 2)
        self.assertEqual(results[0]['institution'], results[1]['institution'])
        self.assertEqual(results[0]['publication_type'], results[1]['publication_type'])
        self.assertEqual(results[0]['publication_type'], 'Doktorarbeit,,,')