Example No. 1
def test_parsers_generic():

    file_name = os.path.join(PATH_TEST_DATA, 'post-capitale-culturale.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    r = resources[0]
    assert_equals(r.title, u'Il capitale culturale')
    assert_equals(
        r.title_extended,
        u'Il capitale culturale. Studies on the Value of Cultural Heritage')
    assert_equals(r.url,
                  'http://www.unimc.it/riviste/index.php/cap-cult/index')
    assert_equals(
        r.description,
        u'Il capitale culturale. Studies on the Value of Cultural Heritage. ISSN: 2039-2362. Il capitale culturale (ISSN: 2039-2362) \xe8 la rivista del Dipartimento di Beni Culturali dell\u2019Universit\xe0 di Macerata con sede a Fermo, che si avvale di molteplici competenze disciplinari (archeologia, archivistica, diritto, economia aziendale, informatica, museologia, restauro, storia, storia dell\u2019arte) unite dal comune obiettivo della implementazione di attivit\xe0 di studio, ricerca e progettazione per la valorizzazione del patrimonio culturale.'
    )
    assert_equals(r.languages, ['it'])
    assert_equals(r.domain, 'www.unimc.it')
    assert_equals(
        sorted(r.keywords),
        sorted([
            u'culture', u'journal', u'cultural heritage', u'open access',
            u'heritage'
        ]))
    assert_equals(r.identifiers, {'issn': {'generic': [u'2039-2362']}})
    del resources
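
These test excerpts all rely on the same module-level setup, which the snippets omit. Below is a minimal sketch of the likely imports and constants, assuming nose-style assertion helpers and a test-data directory next to the test module; the exact package paths for AwolArticle and AwolParsers and the location of PATH_TEST_DATA are assumptions, not taken from the source.

# Sketch only: module-level setup assumed by the test excerpts below.
import os

from nose.tools import (assert_equal, assert_equals, assert_in,
                        assert_is_none, assert_true)

# Package paths are assumptions based on the identifiers used in the examples.
from isaw.awol.awol_article import AwolArticle
from isaw.awol.parse.awol_parsers import AwolParsers

# Assumed location of the 'post-*.xml' fixture files.
PATH_TEST_DATA = os.path.join(os.path.dirname(__file__), 'data')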
Example No. 2
def test_parsers_authors():
    """Can we really extract authors?"""

    file_name = os.path.join(PATH_TEST_DATA, 'post-theban-necropolis.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    assert_equal(resources[0].authors, [
        u'Jiro Kondo',
    ])
Example No. 3
def test_parsers_elephantine():
    """Flakey description extraction?"""

    file_name = os.path.join(PATH_TEST_DATA, 'post-elephantine-reports.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    assert_equal(
        resources[0].description,
        u'Elephantine Grenzstadt und Handelsposten an der S\xfcdgrenze \xc4gyptens - Southern border town and trading post of Ancient Egypt. DAI - Deutsches Arch\xe4ologisches Institut. The aim of the excavations at Elephantine is to provide a coherent picture of the different parts of an ancient settlement and the interrelations between its temples, houses and cemeteries. Detailing the cultural development of the site, and using it as a source to extrapolate settlement patterns in other, less archaeologically accessible settlements is part of the objective of the mission. It is a rare moment when mud-brick settlement remains can be viewed by the public. This was formally made available as an open-air onsite museum in 1998. The research program at Elephantine intends to not only excavate large portions of the site and to study and restore it, but to try to understand Elephantine\u2019s role in the larger economical, political, ethnical and social contexts, both on the regional and the supra-regional level. The work aims to follow, diachronically, the developments across the different \xe9poques and disciplines. For such an approach, the preservation of the site and its layers with its moderate extension offers ideal conditions. Currently, the mission is supporting the efforts of the Supreme Council of Antiquities to restore and refurbish the old museum on Elephantine Island.'
    )
Example No. 4
def test_parsers_oi():

    file_name = os.path.join(PATH_TEST_DATA, 'post-egyptian-antiquity.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    r = resources[0]
    assert_equals(
        r.description,
        u"A Call to Protect Egyptian Antiquities, Cultural Heritage and Tourism Economy. We, the undersigned, strongly urge immediate action to protect Egyptian antiquities, important sites, and cultural heritage. In so doing, significant archaeological artifacts and irreplaceable historic objects will be preserved. Importantly, such protection will help the Egyptian economy in the wake of political revolution. Such an initiative will also help stem illicit international crime organizations that have links to money laundering, human trafficking and the drug trade. Whereas, Egyptian antiquities and sites are among the most historically significant and important in the world, Whereas, Egypt has numerous museums and historical sites, some of which are victims of ongoing looting, including recent reports that artifacts originally from Tutankhamen\u2019s tomb have been stolen, Whereas, more than 50 ancient Egyptian artifacts have been reported stolen from the Cairo Museum alone, Whereas, UNESCO has called for international mobilization to block cultural artifacts stolen from Egypt, Whereas, the tourism industry in Egypt is closely tied to cultural expeditions, employs one in eight Egyptians, accounts for some $11 billion in revenue for the Egyptian economy, and is the one of the largest sectors of the Egyptian economy. Read the rest here."
    )
Example No. 5
def test_parsers_description_table_stop():
    """Add 'table' as a tag on which to stop when parsing descriptions"""

    file_name = os.path.join(PATH_TEST_DATA, 'post-freiburger-hefte.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    assert_equal(
        resources[0].description,
        u"Cahiers d'arch\xe9ologie fribourgeoise = Freiburger Hefte f\xfcr Arch\xe4ologie. ISSN: 1423-8756. As successor to the Chroniques Arch\xe9ologiques, edited between 1984 and 1997, the Cahiers d'Arch\xe9ologie Fribourgeoise present the results of excavations that took place in the Canton of Fribourg as well as the various activities of the Archaeology Department of the State of Fribourg. Since 1999, this yearly publication contains a series of richly illustrated articles and thematic reports in French or in German."
    )
Example No. 6
def test_parsers_issue56():
    """Make sure we're not favoring article titles over anchor titles."""
    logger = logging.getLogger(sys._getframe().f_code.co_name)
    file_name = os.path.join(PATH_TEST_DATA, 'post-quick-list.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    r = resources[0]
    assert_equals(
        r.title,
        u'OIP 139. Early Megiddo on the East Slope (the "Megiddo Stages"): A Report on the Early Occupation of the East Slope of Megiddo (Results of the Oriental Institute’s Excavations, 1925-1933)'
    )
Example No. 7
def test_parsers_issue56():
    """Make sure we're not getting raw HTML in descriptions."""

    logger = logging.getLogger(sys._getframe().f_code.co_name)
    file_name = os.path.join(PATH_TEST_DATA, 'post-oxford-archaeology.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    r = resources[0]
    if u'div' in r.description:
        logger.debug(r.description)
        raise Exception('raw HTML found in description')
Example No. 8
def test_parsers_get_domains():

    file_name = os.path.join(PATH_TEST_DATA, 'post-capitale-culturale.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    # verify generic parser can get domains
    domains = parsers.parsers['generic'].get_domains(a.soup)
    assert_equals(len(domains), 1)
    assert_equals(domains[0], 'www.unimc.it')
    # verify parser collection can do the same (using generic underneath)
    domains = parsers.get_domains(a.soup)
    assert_equals(len(domains), 1)
    assert_equals(domains[0], 'www.unimc.it')
Example No. 9
def test_parsers_generic_external_biblio():
    file_name = os.path.join(PATH_TEST_DATA,
                             'post-numismatico-dello-stato.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    r = resources[0]
    assert_equals(
        r.identifiers,
        {'uri': ['http://www.numismaticadellostato.it/web/pns/notiziario']})
    assert_equals(r.title, u'Notiziario del Portale Numismatico dello Stato')
    assert_equals(
        sorted(r.keywords),
        sorted([u'journal', u'Italy', u'open access', u'numismatics']))
    assert_equals(r.provenance[0]['resource_date'], '2015-02-03T17:54:05.0')
Example No. 10
def test_parsers_omit_by_title():
    """Make sure colon-omit articles and overall omit articles are ignored."""
    logger = logging.getLogger(sys._getframe().f_code.co_name)

    #file_name = os.path.join(PATH_TEST_DATA, 'post-admin-colon.xml')
    #a = AwolArticle(atom_file_name=file_name)
    #parsers = AwolParsers()
    #resources = parsers.parse(a)
    #assert_is_none(resources)
    #del resources
    #del a
    file_name = os.path.join(PATH_TEST_DATA, 'post-admin.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    assert_is_none(resources)
Example No. 11
def test_parsers_umcj():
    file_name = os.path.join(PATH_TEST_DATA, 'post-umcj.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    assert_equals(len(resources), 11)
    rtop = resources[0]
    assert_equals(rtop.title, u'University Museums and Collections Journal')
    assert_equals(rtop.url,
                  'http://edoc.hu-berlin.de/browsing/umacj/index.php')
    assert_equals(len(rtop.subordinate_resources), 10)
    assert_equals(sorted(list(set([r.year for r in resources[1:]]))), [
        u'2001', u'2002', u'2003', u'2004', u'2005', u'2006', u'2008', u'2009',
        u'2010', u'2011'
    ])
    assert_equals(sorted(list(set([r.volume for r in resources[1:]]))),
                  [None, u'1', u'2', u'3', u'4'])
Example No. 12
def test_parsers_ascsa():

    file_name = os.path.join(PATH_TEST_DATA, 'post-akoue.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    r = resources[0]
    assert_equals(r.title, u'ákoue News')
    assert_equals(
        r.url, 'http://www.ascsa.edu.gr/index.php/publications/newsletter/')
    assert_equals(
        r.description,
        u"\xe1koue News. The School's newsletter, \xe1koue, has become a new, shorter print publication as we transition an increasing number of news articles and stories to the School website. Often there will be links to additional photos or news in the web edition that we haven't room to place in the print edition. Also supplemental articles that did not make it into print will be placed on the newsletter's home page here. The last issue of \xe1koue had asked for subscribers to notify us of their delivery preference--print or web edition. If you have do wish to have a print edition mailed to you, please contact us. See."
    )
    assert_equals(r.domain, 'www.ascsa.edu.gr')
    assert_equals(
        r.keywords,
        [u'American School of Classical Studies at Athens', u'ASCSA'])
Example No. 13
def test_parsers_when_rome_attacks():
    file_name = os.path.join(PATH_TEST_DATA, 'post-mitdai-roem.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    assert_equals(len(resources), 14)
    rtop = resources[0]
    assert_equals(rtop.url,
                  'http://www.digizeitschriften.de/dms/toc/?PPN=PPN783873484')
    assert_equals(len(rtop.subordinate_resources), 13)
    assert_equals(sorted([r.year for r in resources[1:]]), [
        None, u'1888', u'1889', u'1890', u'1891', u'1892', u'1893', u'1895',
        u'1896', u'1897', u'1898', u'1899', u'1900'
    ])
    assert_equals(sorted(list(set([r.volume for r in resources[1:]]))), [
        None, u'10', u'11', u'12', u'13', u'14', u'15', u'3', u'4', u'5', u'6',
        u'7', u'8'
    ])
Example No. 14
def test_parsers_oi():

    file_name = os.path.join(PATH_TEST_DATA, 'post-grammatical-case.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    r = resources[0]
    assert_equals(
        r.title,
        u'Grammatical Case in the Languages of the Middle East and Europe')
    assert_equals(r.url, 'http://oi.uchicago.edu/pdf/saoc64.pdf')
    assert_equals(
        r.description,
        u"Announced today: SAOC 64. Grammatical Case in the Languages of the Middle East and EuropeActs of the International Colloquium Variations, concurrence et evolution des cas dans divers domaines linguistiques, Paris, 2-4 April 2007 Edited by Michèle Fruyt, Michel Mazoyer, and Dennis Pardee Purchase Book Download PDF Terms of Use Studies in Ancient Oriental Civilization (SAOC) volume 64 contains twenty-eight studies of various aspects of the case systems of Sumerian, Hurrian, Elamite, Eblaite, Ugaritic, Old Aramaic, Biblical Hebrew, Indo-European, the languages of the Bisitun inscription, Hittite, Armenian, Sabellic, Gothic, Latin, Icelandic, Slavic, Russian, Ouralien, Tokharian, and Etruscan. The volume concludes with a paper on future directions. Studies in Ancient Oriental Civilization 64 Chicago: The Oriental Institute, 2011 ISBN-13: 978-1-885923-84-4 ISBN-10: 1-885923-84-8 Pp. viii+ 420; 25 figures, 3 tables $45.00 Table of Contents Cas et analyse en morphèmes? Christian Touratier The Conjugation Prefixes, the Dative Case, and the Empathy Hierarchy in Sumerian. Christopher Woods Agent, Subject, Patient, and Beneficiary: Grammatical Roles in Hurrian. Dennis R. M. Campbell Des cas en élamite? Florence Malbran-Labat Évolution des cas dans le sémitique archaïque: la contribution de l’éblaïte. Pelio Fronzaroli Some Case Problems in Ugaritic. Robert Hawley Early Canaanite and Old Aramaic Case in the Light of Language Typology. Rebecca Hasselbach Vestiges du système casuel entre le nom et le pronom suffixe en hébreu biblique. Dennis Pardee Genèse et évolution du système casuel indo-européen: questions et hypothèses. Jean Haudry Allative in Indo-European. Folke Josephson Anomalies grammaticales à Bisotun. É. Pirart The Problem of the Ergative Case in Hittite. Craig Melchert A propos de l’opposition entre le statique et le dynamique en hittite. Michel Mazoyer Sur l’évolution du locatif en arménien. Matthias Fritz Énigmes autour du datif et de l’instrumental. Françoise Bader Les marques casuelles dans les documents paléo‑sabelliques et la morphologie du génitif pluriel sud-picénien. Vincent Martzloff Formation et variations dans les systèmes flexionnels des langues sabelliques: entre synchronie et diachronie. Paolo Poccetti Cas et évolution linguistique en latin. Michèle Fruyt La casualité latine en variation diastratique: du parler populaire à la diction poétique. Carole Fry Le flottement entre les cas en latin tardif. Gerd V. M. Haverling Case Marking of Core Arguments and Alignment in Late Latin. Michela Cennamo Cas grammaticaux et cas locaux en gotique: les modèles casuels en gotique. André Rousseau Remarques sur le datif en islandais moderne. Patrick Guelpa Mécanismes de réaffectation désinentielle et hiérarchie des oppositions casuelles en slave. Claire Le Feuvre Pourquoi deux génitifs et deux locatifs en russe pour certains substantifs? Etat actuel des paradigmes et aspects diachroniques. Sergueï Sakhno Regards sur les cas dans les langues ouraliennes. Jean Perrot† Sur l’histoire des cas en tokharien. Georges-Jean Pinault Accord sur le désaccord: quelques réflexions sur les rapports entre morphèmes casuels et adpositions en étrusque. G. van Heems Synthèse: The Dynamics of Case — Recapitulation and Future Directions. Gene Gragg"
    )
    assert_equals(r.domain, 'oi.uchicago.edu')
    assert_equals(r.keywords, [
        u'Europe', u'book', u'Middle East', u'language', u'Oriental Institute'
    ])
Example No. 15
def test_parsers_issn_variation():
    """Capture ISSN-Print: and ISSN-Internet:"""

    file_name = os.path.join(PATH_TEST_DATA, 'post-jahrbuck-mainz.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    assert_equals(len(resources), 25)
    rtop = resources[0]
    assert_equals(
        rtop.identifiers,
        {'issn': {
            'electronic': [u'2198-9400'],
            'generic': [u'0076-2741']
        }})
    assert_equals(len(rtop.subordinate_resources), 24)
    assert_equals(sorted([r.title for r in resources[1:]]), [
        u'Bd. 52, Nr. 1 (2005)', u'Bd. 52, Nr. 2 (2005)',
        u'Bd. 53, Nr. 1 (2006)', u'Bd. 53, Nr. 2 (2006)',
        u'Bd. 53, Nr. 3 (2006)', u'Bd. 54, Nr. 1 (2007)',
        u'Bd. 54, Nr. 2 (2007)', u'Bd. 54, Nr. 3 (2007)',
        u'Bd. 55, Nr. 1 (2008)', u'Bd. 55, Nr. 2 (2008)',
        u'Bd. 55, Nr. 3 (2008)', u'Bd. 56, Nr. 1 (2009)',
        u'Bd. 56, Nr. 2 (2009)', u'Bd. 56, Nr. 3 (2009)',
        u'Bd. 57, Nr. 1 (2010)', u'Bd. 57, Nr. 2 (2010)',
        u'Bd. 58, Nr. 1 (2011)', u'Bd. 58, Nr. 2 (2011)',
        u'Bd. 58, Nr. 3 (2011)', u'Bd. 59, Nr. 1 (2012)',
        u'Bd. 59, Nr. 2 (2012)', u'Bd. 59, Nr. 3 (2012)',
        u'Bd. 60, Nr. 1 (2013)', u'Bd. 60, Nr. 2 (2013)'
    ])
    assert_equals(sorted(list(set([r.year for r in resources[1:]]))), [
        u'2005', u'2006', u'2007', u'2008', u'2009', u'2010', u'2011', u'2012',
        u'2013'
    ])
    assert_equals(
        sorted(list(set([r.volume for r in resources[1:]]))),
        [u'52', u'53', u'54', u'55', u'56', u'57', u'58', u'59', u'60'])
    assert_equals(sorted(list(set([r.issue for r in resources[1:]]))),
                  [u'1', u'2', u'3'])
    kids = [kid['url'] for kid in rtop.subordinate_resources]
    for url in [r.url for r in resources[1:]]:
        assert_in(url, kids)
Example No. 16
def test_parsers_init():

    parsers = AwolParsers()
    plist = parsers.parsers
    # trap for untested addition of a parser
    assert_equals(len(plist.keys()), 5)
    # test for known parsers
    assert_true('generic' in plist.keys())
    assert_true('generic-single' in plist.keys())
    assert_true('www.ascsa.edu.gr' in plist.keys())
    assert_true('oi.uchicago.edu' in plist.keys())
    assert_true('othes.univie.ac.at' in plist.keys())
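
The assertions above show that AwolParsers exposes its individual parsers as a dict keyed by 'generic', 'generic-single', and domain names. The following is a hedged sketch of how a caller could pick a domain-specific parser and fall back to the generic one; the dispatch is illustrative only, since parsers.parse() selects parsers internally and its real logic may differ.

# Illustrative only: choose a domain parser if one exists, else fall back to 'generic'.
# (Uses the imports sketched after Example No. 1.)
def pick_parser(parsers, domain):
    try:
        return parsers.parsers[domain]
    except KeyError:
        return parsers.parsers['generic']

# Usage with parser keys verified by test_parsers_init above.
parsers = AwolParsers()
oi_parser = pick_parser(parsers, 'oi.uchicago.edu')
fallback_parser = pick_parser(parsers, 'www.persee.fr')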
Example No. 17
def test_parsers_persee():

    file_name = os.path.join(PATH_TEST_DATA, 'post-archeonautica.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    r = resources[0]
    assert_equals(r.title, u'Archaeonautica')
    assert_equals(
        r.url, 'http://www.persee.fr/web/revues/home/prescript/revue/nauti')
    assert_equals(
        r.description,
        u'Archaeonautica. eISSN - 2117-6973. Archaeonautica est une collection créée en 1977 par le CNRS et le Ministère de la Culture à l’initiative de Bernard Liou. Publiée par CNRS Edition, le secrétariat de rédaction de la collection est assuré par le Centre Camille Jullian. Le but de la collection est la publication des recherches d’archéologie sous-marines ou, plus généralement, subaquatique, de la Préhistoire à l’époque moderne. Elle est aussi destinée à accueillir des études d’archéologie maritime et d’archéologie navale, d’histoire maritime et d’histoire économique.'
    )
    assert_equals(r.languages, ['fr'])
    assert_equals(r.domain, 'www.persee.fr')
    assert_equals(
        sorted(r.keywords),
        sorted([
            u'France', u'journal', u'open access', u'archaeology',
            u'nautical archaeology'
        ]))
    assert_equals(r.identifiers, {'issn': {'electronic': [u'2117-6973']}})
    assert_is_none(r.is_part_of)
    assert_equals(len(r.provenance), 2)
    assert_equals(r.provenance[0]['term'],
                  'http://purl.org/spar/cito/citesAsDataSource')
    assert_equals(r.provenance[1]['term'],
                  'http://purl.org/spar/cito/citesAsMetadataDocument')
    #assert_equals(len(r.related_resources), 0)
    #assert_equals(len(r.subordinate_resources), 14)
    #assert_equals(len(r.subordinate_resources[0].provenance), 2)
    #assert_equals(r.subordinate_resources[0].provenance[0]['term'], 'http://purl.org/spar/cito/citesAsDataSource')
    del resources

    file_name = os.path.join(PATH_TEST_DATA, 'post-gallia-prehistoire.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    r = resources[0]
    assert_equals(r.title, u'Gallia Préhistoire')
    assert_equals(
        r.url, 'http://www.persee.fr/web/revues/home/prescript/revue/galip')
    assert_equals(
        r.description,
        u'Gallia Préhistoire. Créée par le CNRS, la revue Gallia Préhistoire est, depuis plus d’un demi-siècle, la grande revue de l’archéologie nationale, réputée pour la rigueur de ses textes et la qualité de ses illustrations. Gallia Préhistoire publie des articles de synthèse sur les découvertes et les recherches les plus signifiantes dans le domaine de la Préhistoire en France. Son champ chronologique couvre toute la Préhistoire depuis le Paléolithique inférieur jusqu’à la fin de l’âge du Bronze. Son champ géographique est celui de la France; cependant, Gallia Préhistoire publie aussi des études traitant des cultures limitrophes.'
    )
    assert_equals(r.languages, ['fr'])
    assert_equals(r.domain, 'www.persee.fr')
    assert_equals(r.keywords, [u'journal', u'open access'])
Example No. 18
def test_parsers_goofy_initial_links():
    """Some files have initial links with no text content, just a br or something"""
    file_name = os.path.join(PATH_TEST_DATA,
                             'post-gouden-hoorn.xml')  # <a><br></a>
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    r = resources[0]
    assert_equals(
        r.title,
        u'Gouden hoorn: tijdschrift over Byzantium = Golden horn: journal of Byzantium'
    )
    del r
    del parsers
    del a
    file_name = os.path.join(
        PATH_TEST_DATA,
        'post-filologia-neotestamentaria.xml')  # <a><span></span></a>
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)
    r = resources[0]
    assert_equals(r.title, u'Filología Neotestamentaria')
Example No. 19
def main(args):
    """
    main functions
    """
    logger = logging.getLogger(sys._getframe().f_code.co_name)
    root_dir = args.whence[0]
    dest_dir = args.thence[0]
    walk_count = 0
    resources = None
    index = {}
    parsers = AwolParsers()
    logger.info(list(os.walk(root_dir)))
    for dir_name, sub_dir_list, file_list in os.walk(root_dir):  #ask Tom
        logger.info("Blah3")
        if resources is not None:
            logger.info("Blah2")
            del resources

        for file_name in file_list:
            logger.info("Blah4")
            if 'post-' in file_name and file_name[-4:] == '.xml':
                walk_count = walk_count + 1
                if args.progress and walk_count % 50 == 1:
                    print(
                        '\n*****************************\nPERCENT COMPLETE: {0:.0f}\n'
                        .format(float(walk_count) / 4261.0 * 100.0))
                logger.info(
                    '\n=========================================================================================\nARTICLE:\n'
                )
                target = os.path.join(dir_name, file_name)
                try:
                    a = awol_article.AwolArticle(atom_file_name=target)
                except (ValueError, RuntimeError) as e:
                    logger.warning(e)
                else:
                    logger.info(u'article title: {0}'.format(a.title))
                    logger.info(u'url: {0}'.format(a.url))
                    awol_id = '-'.join(('awol', a.id.split('.')[-1]))
                    logger.info('awol_id: {0}'.format(awol_id))
                    resources = None
                    try:
                        resources = parsers.parse(a)
                    except NotImplementedError as e:
                        logger.warning(e)
                    else:
                        try:
                            length = len(resources)
                        except TypeError:
                            length = 0
                        if length > 0:
                            for i, r in enumerate(resources):
                                logger.info(
                                    u'\n-----------------------------------------------------------------------------------------\nRESOURCE\n'
                                )
                                logger.info(u'url: {0}'.format(r.url))
                                logger.info(u'title: {0}'.format(r.title))
                                domain = r.domain
                                this_dir = os.path.join(dest_dir, domain)
                                try:
                                    os.makedirs(this_dir)
                                except OSError as exc:
                                    if exc.errno == errno.EEXIST and os.path.isdir(
                                            this_dir):
                                        pass
                                    else:
                                        raise
                                try:
                                    domain_index = index[domain]
                                except KeyError:
                                    domain_index = index[domain] = {}
                                stub = r.url.split(domain)[-1][1:].encode(
                                    'utf-8')
                                if stub == '' or stub == '/':
                                    stub = domain.encode('utf-8').replace(
                                        '.', '-')
                                if stub[-1] == '/':
                                    stub = stub[:-1]
                                if (len(stub) > 80 or '?' in stub
                                        or '&' in stub or '%' in stub
                                        or ' ' in stub):
                                    m = hashlib.sha1()
                                    m.update(stub)
                                    resource_key = m.hexdigest()
                                else:
                                    resource_key = RX_DEDUPEH.sub(
                                        '-', RX_URLFLAT.sub('-', stub))
                                filename = '.'.join((resource_key, 'json'))
                                this_path = os.path.join(this_dir, filename)
                                try:
                                    domain_resources = domain_index[
                                        resource_key]
                                except KeyError:
                                    pass
                                else:
                                    # collision! load earlier version from disk and merge
                                    logger.warning(
                                        'collision in {0}: {1}/{2}'.format(
                                            a.url, r.domain, resource_key))
                                    r_earlier = resource.Resource()
                                    r_earlier.json_load(this_path)
                                    try:
                                        r_merged = resource.merge(r_earlier, r)
                                    except ValueError as e:
                                        logger.error(
                                            unicode(e) +
                                            u' while trying to merge; saving separately'
                                        )
                                        m = hashlib.sha1()
                                        m.update(r.url)
                                        resource_key = m.hexdigest()
                                        filename = '.'.join(
                                            (resource_key, 'json'))
                                        this_path = os.path.join(
                                            this_dir, filename)
                                    else:
                                        r = r_merged
                                    del r_earlier
                                r.resource_key = resource_key
                                r.json_dump(this_path, formatted=True)
                                logger.info(u'filename: {0}'.format(this_path))
                                try:
                                    resource_title = r.title_extended
                                except AttributeError:
                                    resource_title = r.title
                                resource_package = {
                                    'title_full': resource_title,
                                    'url': r.url,
                                    'key': resource_key,
                                }
                                if resource_title != r.title:
                                    resource_package['title'] = r.title
                                try:
                                    resource_list = domain_index[resource_key]
                                except KeyError:
                                    resource_list = domain_index[
                                        resource_key] = []
                                resource_list.append(resource_package)
            else:
                logger.debug('skipping {0}'.format(file_name))
        for ignore_dir in ['.git', '.svn', '.hg']:
            if ignore_dir in sub_dir_list:
                sub_dir_list.remove(ignore_dir)

    logger.info('sorting domain list')
    domain_list = sorted(index.keys())
    domain_count = len(domain_list)
    resource_count = 0
    record_count = 0
    max_collisions = 0
    total_collisions = 0
    redundant_resources = 0
    logger.info("FULL INDEX OF RESOURCES")
    logger.info("=======================")
    for domain in domain_list:
        logger.info(domain)
        logger.info('-' * len(domain))
        logger.info(u'sorting resource list for domain {0}'.format(domain))
        resource_list = sorted(index[domain].keys())
        logger.info('{0} unique resources in this domain'.format(
            len(resource_list)))
        resource_count = resource_count + len(resource_list)
        for resource_key in resource_list:
            resources = index[domain][resource_key]
            logger.info(u'    {0}'.format(resources[0]['title_full']))
            record_count = record_count + len(resources)
            if len(resources) > 1:
                logger.info('        multiple records: {0}'.format(
                    len(resources)))
                total_collisions = total_collisions + len(resources)
                redundant_resources = redundant_resources + 1
                if len(resources) > max_collisions:
                    max_collisions = len(resources)
    logger.info("=======================")
    logger.info("Total {0} domains".format(domain_count))
    logger.info("Total {0} unique resources recorded".format(resource_count))
    logger.info("Total number of records: {0}".format(record_count))
    logger.info("Highest number of redundancies (collisions): {0}".format(
        max_collisions))
    logger.info(
        "Total number of redundant records: {0}".format(total_collisions))
    try:
        logger.info(
            "Percentage of redundantly recorded resources:  {0:.2f}".format(
                round(
                    float(redundant_resources) / float(resource_count) *
                    100.0, 2)))
    except ZeroDivisionError:
        print("No records!")
Example No. 20
def test_parsers_umcj():
    file_name = os.path.join(PATH_TEST_DATA, 'post-waseda.xml')
    a = AwolArticle(atom_file_name=file_name)
    parsers = AwolParsers()
    resources = parsers.parse(a)