def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'honour'
    data['_template'] = 'honour.html'

    # title and headline for this honour
    data['title'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLE']))
    data['headline'] = htmlparser.parse(datasheet['HEADLINE'],
                                        datasheet['FILENAME'],
                                        paragraphs=False,
                                        url_context=url_context)

    # parse content
    data['content'] = htmlparser.parse(datasheet['CONTENT'],
                                       datasheet['FILENAME'],
                                       paragraphs=True,
                                       url_context=url_context)

    data['tags'] = ''

    # alphabetical display entries
    parsed_entries = []
    if data['title'].strip() != '':
        s = data['title'].strip()
        parsed_entries.append(s)
    elif data['headline'].strip() != '':
        s = data['headline'].strip()
        parsed_entries.append(s)
    data['alphabetical'] = '\n'.join(parsed_entries)

    return data
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'quotation'
    data['_template'] = 'quotation.html'

    # name
    data['name'] = symbolreplace.tags_to_unicode(datasheet['NAME'])

    content = datasheet['CONTENT']
    numquotes = datasheet['NUMQUOTES']

    # special case cleaning rules
    if datasheet['FILENAME'] == 'Carmichael':
        content = content.replace('<p>', '')
    if datasheet['FILENAME'] in NUMBER_CORRECTIONS:
        numquotes = NUMBER_CORRECTIONS[datasheet['FILENAME']]

    # now parse the individual quotes
    content = content.split('<p>')
    quotes = []
    for quote in content:
        if quote.strip() != '':
            quotes.append(quote.strip())

    # holds a 'more quotes' link or a 'translations by' paragraph, if present
    data['more'] = ''

    if len(quotes) != 0 and 'More ' in quotes[-1] and '<a href' in quotes[-1]:
        #print('I *think* this is a *more quotes* paragraph:', quotes[-1])
        data['more'] = quotes.pop()

    if len(quotes) != 0 and data['more'] == '' and 'Translations ' in quotes[-1]:
        #print('I *think* this is a *translations by* paragraph:', quotes[-1])
        data['more'] = quotes.pop()

    if len(quotes) != int(numquotes):
        print('ERROR', len(quotes), 'expecting', int(numquotes))
        print(quotes)
        assert False

    # now parse the quotes and convert to html
    for idx, quote in enumerate(quotes):
        q = parse_quote(quote)
        q['quote'] = htmlparser.parse(q['quote'], 'Quotations/%s' % datasheet['FILENAME'], paragraphs=True, url_context=url_context)
        q['source'] = htmlparser.parse(q['source'], 'Quotations/%s' % datasheet['FILENAME'], paragraphs=False, url_context=url_context)
        quotes[idx] = q

    quotations = flow.to_flow_block('quotation', quotes)
    data['quotations'] = quotations

    return data
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'page'
    data['_template'] = 'page.html'

    # sidebar
    data['sidebar'] = ''

    # easily translatable info
    data['authors'] = htmlparser.parse(datasheet['WHODIDIT'], datasheet['FILENAME'], paragraphs=False, url_context=url_context)
    data['title'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['TITLE']))

    # check that this is a standard page
    #assert datasheet['USEHTMLFORMAT'] == 'Y'

    # need to convert it to a standard page
    content = datasheet['CONTENT']

    regex = re.compile(r'<html>(?P<content>.*?)</html>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, strip, content)

    regex = re.compile(r'<head>(?P<content>.*?)</head>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, strip_all, content)

    regex = re.compile(r'<title>(.*?)</title>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, r'', content)
    regex = re.compile(r'<meta (.*?)/>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, r'', content)
    regex = re.compile(r'<style>(.*?)</style>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, r'', content)
    regex = re.compile(r'<body(.*?)>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, r'', content)
    content = content.replace('</body>', '')
    content = content.strip()

    # also get rid of the 'show larger image' button
    regex = re.compile(r'<form>(.*?)</form>', re.MULTILINE | re.DOTALL)
    content = re.sub(regex, r'', content)

    # parse the page content
    data['content'] = htmlparser.parse(content,
                                datasheet['FILENAME'],
                                paragraphs=True,
                                url_context=url_context)

    return data
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'curve'
    data['_template'] = 'curve.html'

    # easily translatable info
    data['name'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['FULLNAME']))

    # need to parse the individual equations out, and convert to flow
    equations = parse_equations(datasheet['EQUATIONS'], datasheet['FILENAME'])
    data['equations'] = flow.to_flow_block('curveequation', equations)

    # parse java applet options
    options = '{\n'
    pattern = re.compile(
        r'\<PARAM NAME="(?P<name>.+?)" VALUE="(?P<value>.+?)">')
    for match in re.finditer(pattern, datasheet['JAVA']):
        name = match.group('name')
        value = match.group('value')
        line = '%s: "%s",\n' % (name, value)
        options += line
    options += '}'
    data['appletoptions'] = options

    # parse content
    data['content'] = htmlparser.parse(datasheet['CONTENTS'],
                                       datasheet['FILENAME'],
                                       paragraphs=True,
                                       url_context=url_context)

    return data
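# A minimal, self-contained sketch (not part of the converter above) showing what
# the PARAM-parsing loop produces; the <PARAM> tags below are invented sample data.
import re

def _demo_applet_options():
    java = '<PARAM NAME="xmin" VALUE="-5"><PARAM NAME="xmax" VALUE="5">'
    pattern = re.compile(r'<PARAM NAME="(?P<name>.+?)" VALUE="(?P<value>.+?)">')
    options = '{\n'
    for match in re.finditer(pattern, java):
        options += '%s: "%s",\n' % (match.group('name'), match.group('value'))
    options += '}'
    return options  # '{\nxmin: "-5",\nxmax: "5",\n}'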
def parse_cross_references(references, name, url_context):
    parsed = []

    for line in references.splitlines():
        line = line.strip()

        # match against reference line
        #bio_regex = re.compile(r'^(?P<number>\d+)\s*,\s*(?P<link>.+?)\s*,\s*(?P<text>.+?)(?:,\s*(?P<extratext>.+?))?$')
        bio_regex = re.compile(
            r'^(?P<number>\d+)\s*,\s*(?P<link>.+?)\s*(?:,\s*(?P<text>.+?))?(?:,\s*(?P<extratext>.+?))?$'
        )
        match = re.match(bio_regex, line)
        if match:
            # this is a reference line
            number = match.group('number')
            link = match.group('link')
            text = match.group('text')
            if not text:
                text = 'THIS LINK'
            link = urls.convert(link, url_context)
            if match.group('extratext'):
                text += ' ' + match.group('extratext')
                text = text.strip()
            if not text:
                text = link
            else:
                text = htmlparser.parse(text, name)
            reference = {'link': link, 'text': text, 'number': number}
            parsed.append(reference)
            continue

    return_str = {'data': parsed}
    return_str = json.dumps(return_str)
    return return_str
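# A small illustration (invented sample line) of the groups the cross-reference
# regex above extracts; the link target is hypothetical.
import re

_xref_regex = re.compile(
    r'^(?P<number>\d+)\s*,\s*(?P<link>.+?)\s*(?:,\s*(?P<text>.+?))?(?:,\s*(?P<extratext>.+?))?$'
)
_m = _xref_regex.match('1, Biographies/Newton, Isaac Newton')
# _m.group('number') == '1'
# _m.group('link') == 'Biographies/Newton'
# _m.group('text') == 'Isaac Newton'
# _m.group('extratext') is None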
def parse_references(references, name, url_context):
    parsed_references = []
    in_reference = False
    reference = None

    for line in references.splitlines():
        line = line.strip()

        # match against reference line
        bio_regex = re.compile(r'^(?P<number>\d+)\s*,\s*(?P<reference>.+)$')
        match = re.match(bio_regex, line)
        if match:
            # this is a reference line
            reference = match.group('reference')
            number = match.group('number')

            ref = {'number': number, 'reference': reference.strip()}
            parsed_references.append(ref)
            in_reference = True
            continue

        # match against url
        if (line.startswith('http://')
                or line.startswith('https://')) and in_reference:
            # check there's not an issue with the line
            assert '<' not in line and '>' not in line
            # make the entire reference a link
            href = line
            href = urls.convert(href, url_context)
            text = parsed_references[-1]['reference']
            text = text.replace('<br>', '')
            '''link = '<a href="%s">%s</a>' % (href, text)
            # only do this if there isn't already a link in the reference
            if '<a' not in text:
                parsed_references[-1]['reference'] = link'''
            parsed_references[-1]['reference'] = '%s <a href="%s">%s</a>' % (
                text, href, href)
            in_reference = False

        # match against empty line
        if line == '' or '<p>' in line:
            in_reference = False
            continue

        # any other line
        if in_reference:
            parsed_references[-1]['reference'] += (' ' + line.strip())

    for reference in parsed_references:
        reference['reference'] = htmlparser.parse(reference['reference'], name)

    return_str = {'data': parsed_references}
    return_str = json.dumps(return_str)
    return return_str
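# A quick illustrative check (made-up sample lines): a numbered line matches the
# reference regex above, while a bare URL line does not, so it falls through to
# the branch that appends an <a href> to the previous reference.
import re

_ref_regex = re.compile(r'^(?P<number>\d+)\s*,\s*(?P<reference>.+)$')
assert _ref_regex.match("3, O'Connor and Robertson, A sample reference").group('number') == '3'
assert _ref_regex.match('https://example.org/paper') is None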
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'extra'
    data['_template'] = 'extra.html'

    # title, headline and update date for this page
    data['title'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['TITLE']))
    data['headline'] = htmlparser.parse(datasheet['HEADLINE'], datasheet['FILENAME'], paragraphs=False, url_context=url_context)
    data['update'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['UPDATE']))

    # parse references
    references = referenceparser.parse_references(datasheet['REFERENCES'], datasheet['FILENAME'], url_context)
    data['references'] = flow.to_flow_block('reference', json.loads(references)['data'])

    # parse the main content
    data['content'] = htmlparser.parse(datasheet['EXTRA'],
                                datasheet['FILENAME'],
                                paragraphs=True,
                                url_context=url_context)

    return data
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'obituary'
    data['_template'] = 'obituary.html'

    # easily translatable info
    data['name'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['HEADING1']))
    data['summary'] = htmlparser.parse(datasheet['HEADING2'],
                                       datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)
    data['wherefrom'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLE']))

    # parse the obituary content
    data['content'] = htmlparser.parse(datasheet['CONTENT'],
                                       datasheet['FILENAME'],
                                       paragraphs=True,
                                       url_context=url_context)

    return data
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'glossary'
    data['_template'] = 'glossary.html'

    # easily translatable info
    data['term'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['WORD']))

    # parse the glossary entry content
    data['content'] = htmlparser.parse(datasheet['CONTENTS'],
                                datasheet['FILENAME'],
                                paragraphs=True,
                                url_context=url_context)

    return data
def chronology_convert(input_dir, output_dir, url_context):
    # get all the files that need to be processed
    path = os.path.join(input_dir, '*')
    files = glob.glob(path)

    dates = {}

    # process all the files
    for file in files:
        # parse sections from datasheet
        datasheet = datasheetparser.parse_file(file)
        date = datasheet['DATE']

        content = htmlparser.parse(datasheet['BIG'],
                                   os.path.basename(file),
                                   paragraphs=False,
                                   url_context=url_context)
        data = {
            'about': 'yes' if datasheet['ABOUT'] != '' else 'no',
            'content': content
        }

        if date not in dates:
            dates[date] = []
        dates[date].append(data)

    # convert to nested flow
    chronology = []
    for date, events in dates.items():
        data = {
            '_model': 'chronologyyear',
            '_hidden': 'yes',
            'year': date,
            'events': flow.to_flow_block('chronology-event', events)
        }
        filename = os.path.join(LEKTOR_CONTENT_PATH, output_dir, date)
        save(data, filename)
def parse_equations(text, filename):
    equations = []
    eqtype = None

    typeregex = re.compile(
        r'^<b><font color=green>(?P<type>.+?)</font></b>.*$')
    equationregex = re.compile(r'^(?P<equation>\\.+?\\\\)$')

    text = text.split('\n')
    for line in text:
        line = line.strip()
        typematch = typeregex.search(line)
        equationmatch = equationregex.search(line)
        if typematch:
            # it's a type!
            assert eqtype is None
            eqtype = typematch.group('type')
        elif equationmatch:
            # it's an equation!
            assert eqtype
            eqtype = symbolreplace.strip_tags(
                symbolreplace.tags_to_unicode(eqtype))
            equation = {
                'type': eqtype,
                'equation': htmlparser.parse(equationmatch.group('equation'),
                                             filename,
                                             paragraphs=False)
            }
            eqtype = None
            equations.append(equation)
        else:
            assert False

    return equations
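# Invented sample lines showing which regex above each kind of line matches: a
# green <font> heading is picked up as the equation type, and a line starting
# with a backslash and ending in '\\' is picked up as the equation itself.
import re

_typeregex = re.compile(r'^<b><font color=green>(?P<type>.+?)</font></b>.*$')
_equationregex = re.compile(r'^(?P<equation>\\.+?\\\\)$')

assert _typeregex.search('<b><font color=green>Cartesian equation:</font></b>')
assert _equationregex.search(r'\frac{x^2}{a^2} + \frac{y^2}{b^2} = 1\\')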
def read_logs():
    msg = servicehandler.getlogs()
    return htmlparser.parse(msg)
def project_convert(input_dir, output_dir, url_context, name):
    # get all the files that need to be processed
    path = os.path.join(input_dir, '*')
    files = glob.glob(path)

    titles = {
        'Ayel': 'The French Grandes Ecoles',
        'Brunk': 'The development of Galois theory',
        'Burslem': 'Sofia Kovalevskaya',
        'Daxenberger':
        'Johan de Witt - The first calculation on the valuation of life annuities',
        'Ellison': 'Sofia Kovalevskaya',
        'Johnson': 'James Clerk Maxwell - The Great Unknown',
        'MacQuarrie': 'Mathematics and Chess',
        'Pearce': 'Indian Mathematics - Redressing the balance',
        'Watson': 'Some topics in the history of mathematical education',
        'Ledermann': 'Walter Ledermann - Encounters of a Mathematician',
        'DickinsonCernokova':
        "An investigation of some of D'Arcy Thompson's correspondence",
        'GowenlockTuminauskaite': "D'Arcy Thompson and Mathematics"
    }
    authors = {
        'Ayel': 'Mathieu Ayel',
        'Brunk': 'Fiona Brunk',
        'Burslem': 'Tom Burslem',
        'Daxenberger': 'Livia Daxenberger',
        'Ellison': 'Leigh Ellison',
        'Johnson': 'Kevin Johnson',
        'MacQuarrie': 'John MacQuarrie',
        'Pearce': 'Ian G Pearce',
        'Watson': 'Helen Watson',
        'Ledermann': "J J O'Connor and E F Robertson",
        'DickinsonCernokova': 'Heather Dickinson and Barbora Cernokova',
        'GowenlockTuminauskaite': 'Alice Gowenlock and Indre Tuminauskaite'
    }

    pages = []
    references = ''

    # process all the files
    for file in files:
        # parse sections from datasheet
        datasheet = datasheetparser.parse_file(file)
        if datasheet['NUMBER'] == 'refs' and 'REFERENCES' in datasheet:
            # this is the references, not a page
            references = referenceparser.parse_references(
                datasheet['REFERENCES'], file, url_context)
            references = flow.to_flow_block('reference',
                                            json.loads(references)['data'])
            continue

        pagenum = int(datasheet['NUMBER'])
        assert pagenum == len(pages)

        content = cleaning.project_cleaning(datasheet['CONTENT'])
        data = {
            '_model': 'projectpage',
            '_template': 'projectpage.html',
            'title': datasheet['TITLE'],
            'content': htmlparser.parse(content,
                                        file,
                                        paragraphs=True,
                                        url_context=url_context),
            'chapter': str(len(pages) + 1)
        }
        pages.append(data)

    # main project page
    data = {
        '_model': 'project',
        '_template': 'project.html',
        'title': titles[name],
        'author': authors[name],
        'references': '' if references is None else references
    }
    filename = os.path.join(LEKTOR_CONTENT_PATH, output_dir)
    save(data, filename)

    # the chapters
    for page in pages:
        filename = os.path.join(LEKTOR_CONTENT_PATH, output_dir,
                                'chapter-%s' % page['chapter'])
        save(page, filename)
    print('processed', name)
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'historytopic'
    data['_template'] = 'historytopic.html'

    # short and full name, authors, update
    data['shortname'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['SHORTNAME']))
    data['fullname'] = htmlparser.parse(datasheet['FULLNAME'],
                                        datasheet['FILENAME'],
                                        paragraphs=False,
                                        url_context=url_context)
    data['authors'] = htmlparser.parse(datasheet['AUTHORS'],
                                       datasheet['FILENAME'],
                                       paragraphs=False,
                                       url_context=url_context)
    data['update'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['UPDATE']))

    # something about indexes, not sure how this is used yet
    data['indexref'] = datasheet['INDEXREF']
    data['indexreffile'] = datasheet['INDEXREFFILE']

    # parse references
    references = referenceparser.parse_references(datasheet['REFERENCES'],
                                                  datasheet['FILENAME'],
                                                  url_context)
    data['references'] = flow.to_flow_block('reference',
                                            json.loads(references)['data'])

    # parse additional links (they use the same format as cross references)
    additional = referenceparser.parse_cross_references(
        datasheet['ADDITIONAL'], datasheet['FILENAME'], url_context)
    data['additional'] = flow.to_flow_block('otherweb',
                                            json.loads(additional)['data'])

    # parse translations (use the same format as references)
    # don't add them to data, as we're combining them with bio
    translations = referenceparser.parse_references(datasheet['TRANSLATION'],
                                                    datasheet['FILENAME'],
                                                    url_context)
    translation_data = json.loads(translations)['data']
    translation_data = [{
        'number': d['number'],
        'translation': d['reference']
    } for d in translation_data]
    data['translations'] = flow.to_flow_block('translation', translation_data)

    # parse otherweb links (they use the same format as cross references)
    otherweb = referenceparser.parse_cross_references(datasheet['OTHERWEB'],
                                                      datasheet['FILENAME'],
                                                      url_context)
    data['otherweb'] = flow.to_flow_block('otherweb',
                                          json.loads(otherweb)['data'])

    # parse history topic
    data['content'] = htmlparser.parse(
        datasheet['HISTTOPIC'],
        datasheet['FILENAME'],
        translations=json.loads(translations)['data'],
        extras=json.loads(additional)['data'],
        paragraphs=True,
        url_context=url_context)

    # discover categories for this history topic
    path = '/HistTopics/%s' % datasheet['FILENAME']
    tags = []
    #with open('../datasheets/Indexes/data.json') as f:
    #    category_data = json.load(f)
    category_data = categories.categories()
    for category in category_data:
        if path in category['entries']:
            tags.append(category['name'])
    data['tags'] = ', '.join(tags)

    # discover alphabetical index names for this history topic
    parsed_entries = []
    if 'INDEXNAMES' not in datasheet:
        if data['fullname'].strip() != '':
            parsed_entries.append(data['fullname'].strip())
        elif data['shortname'].strip() != '':
            parsed_entries.append(data['shortname'].strip())
        else:
            print('no names for this topic')
            assert False
    else:
        entries = datasheet['INDEXNAMES'].strip().split('\n')

        for entry in entries:
            entry = entry.strip()
            entry = symbolreplace.strip_tags(
                symbolreplace.tags_to_unicode(entry))
            parsed_entries.append(entry)
    data['alphabetical'] = '\n'.join(parsed_entries)

    return data
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'biography'
    data['_template'] = 'biography.html'

    # name and shortname
    data['shortname'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['SHORTNAME']))
    data['fullname'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['FULLNAME']))

    # authors
    data['authors'] = htmlparser.parse(datasheet['AUTHORS'], datasheet['FILENAME'], paragraphs=False, url_context=url_context)

    # last update
    data['update'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['UPDATE']))

    data['summary'] = htmlparser.parse(datasheet['SUMMARY'], datasheet['FILENAME'], paragraphs=False, url_context=url_context)

    # dates are tricky. for now leave them as they are
    data['birthdate'] = datasheet['BIRTHDATE']
    data['deathdate'] = datasheet['DEATHDATE']

    # birth and death year - remove the ,? if necessary
    date_pattern = re.compile(r'(\d+)(?:,\??)?')
    data['birthyear'] = re.sub(date_pattern, r'\1', datasheet['BIRTHYEAR'])
    data['deathyear'] = re.sub(date_pattern, r'\1', datasheet['DEATHYEAR'])

    # birthplace, deathplace
    data['birthplace'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['BIRTHPLACE']))
    data['deathplace'] = symbolreplace.strip_tags(symbolreplace.tags_to_unicode(datasheet['DEATHPLACE']))

    # mapinfo - just take the name, ignore mapnum and lat/long
    mapinfo = re.compile(r'\d,(?P<name>.+?),(?:(?P<lat>-?[\d.]+),(?P<long>-?[\d.]+))?')
    match = mapinfo.search(datasheet['MAPINFO'])
    data['maplocation'] = ''
    if match:
        data['maplocation'] = match.group('name')

    # country
    data['country'] = '--Unknown--'
    if datasheet['COUNTRY'].strip() != '':
        data['country'] = datasheet['COUNTRY']

        if data['country'] == 'Czech_Republic':
            data['country'] = 'Czech Republic'
        elif data['country'] == 'Sicily':
            data['country'] = 'Italy'
        elif data['country'].endswith(')'):
            data['country'] = data['country'][:-1]
        elif data['country'] == '':
            data['country'] = '--Unknown--'

        # also add countries to global array
        if not data['country'] in countries:
            countries.append(data['country'])

    # parse references
    references = referenceparser.parse_references(datasheet['REFERENCES'], datasheet['FILENAME'], url_context)
    data['references'] = flow.to_flow_block('reference', json.loads(references)['data'])

    # parse translations (use the same format as references)
    # don't add them to data, as we're combining them with bio
    translations = referenceparser.parse_references(datasheet['TRANSLATION'], datasheet['FILENAME'], url_context)
    translation_data = json.loads(translations)['data']
    translation_data = [{'number':d['number'],'translation':d['reference']} for d in translation_data]
    data['translations'] = flow.to_flow_block('translation', translation_data)

    # parse cross references
    #xrefs = referenceparser.parse_cross_references(datasheet['XREFS'], datasheet['FILENAME'])
    #data['xrefs'] = xrefs

    # parse additional links (they use the same format as cross references)
    # don't add them to data, as we're combining them with bio
    additional = referenceparser.parse_cross_references(datasheet['ADDITIONAL'], datasheet['FILENAME'], url_context)
    data['additional'] = flow.to_flow_block('otherweb', json.loads(additional)['data'])

    # parse otherweb links (they use the same format as cross references)
    otherweb = referenceparser.parse_cross_references(datasheet['OTHERWEB'], datasheet['FILENAME'], url_context)
    data['otherweb'] = flow.to_flow_block('otherweb', json.loads(otherweb)['data'])

    # parse honours links (they use the same format as cross references)
    honours = referenceparser.parse_cross_references(datasheet['HONOURS'], datasheet['FILENAME'], url_context)
    data['honours'] = flow.to_flow_block('otherweb', json.loads(honours)['data'])

    # parse biography, and add in extras and translations
    data['content'] = htmlparser.parse(datasheet['BIOGRAPHY'],
                                datasheet['FILENAME'],
                                translations=json.loads(translations)['data'],
                                extras=json.loads(additional)['data'],
                                paragraphs=True,
                                url_context=url_context)

    # discover categories for this mathematician
    path = '/Biographies/%s' % datasheet['FILENAME']
    tags = []
    #with open('../datasheets/Indexes/data.json') as f:
    #    category_data = json.load(f)
    category_data = categories.categories()
    for category in category_data:
        if path in category['entries']:
            tags.append(category['name'])
    data['tags'] = ', '.join(tags)

    # discover alphabetical tags for this mathematician
    displays = alphaindexparser.get_displays_2(datasheet['FILENAME'])
    if not displays:
        assert False
    displays = '\n'.join(displays)
    data['alphabetical'] = displays

    return data
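# Two tiny illustrations (made-up values) of the regexes used in the biography
# converter above: the year pattern strips a trailing ',?' from the datasheet
# value, and the MAPINFO pattern pulls the place name out of a
# 'mapnum,name,lat,long' field.
import re

_date_pattern = re.compile(r'(\d+)(?:,\??)?')
assert re.sub(_date_pattern, r'\1', '1642,?') == '1642'

_mapinfo = re.compile(r'\d,(?P<name>.+?),(?:(?P<lat>-?[\d.]+),(?P<long>-?[\d.]+))?')
assert _mapinfo.search('1,Woolsthorpe,52.8,-0.6').group('name') == 'Woolsthorpe'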
	def get_dom(self):
		return htmlparser.parse(self.file.read())
                continue
            pattern = re.compile(r'^(?P<position>[SCLR]),(?P<path>.+?),(?P<height>.+?)(?:,(?P<description>.*))?$')
            match = pattern.search(line)
            if not match:
                print('not a match! (%s), (%s)' % (name, line))
                assert False
            position = match.group('position')
            path = match.group('path')
            height = match.group('height')
            description = match.group('description') or ''

            description = strip_br(description)

            # parse the description
            description = htmlparser.parse(description,
                                        'PictDisplay/%s' % name,
                                        paragraphs=False,
                                        url_context='PictDisplay/%s' % name)

            description = strip_br(description)

            # check this person exists
            biography_dir = os.path.join(CONTENT_DIR, 'Biographies/', name)
            if not os.path.isdir(biography_dir):
                with open('not-exists.txt', 'a') as f:
                    f.write('%s\n' % name)
                    continue

            # copy that image in
            img_dst = os.path.join(biography_dir, os.path.basename(path))
            img_src = os.path.join(SERVER_FILES, path)
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'society'
    data['_template'] = 'society.html'

    # easily translatable info
    data['name'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLENAME']))
    data['headline'] = htmlparser.parse(datasheet['HEADLINE'],
                                        datasheet['FILENAME'],
                                        paragraphs=False,
                                        url_context=url_context)
    data['update'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['UPDATE']))
    data['foundation'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['FOUNDATION']))

    # external site parsing
    link = re.compile(
        r'<a\s+href ?= ?[\'"]?(?P<href>.+?)[\'"]?\s*>(?P<text>.*?)<\/a>')
    if datasheet['OTHERWEB'].strip() == '':
        data['website'] = ''
    else:
        match = link.search(datasheet['OTHERWEB'].strip())
        if not match:
            print('not link "%s"' % datasheet['OTHERWEB'].strip())
            assert match
        data['website'] = match.group('href')

    # parse references
    references = referenceparser.parse_references(datasheet['REFERENCES'],
                                                  datasheet['FILENAME'],
                                                  url_context)
    data['references'] = flow.to_flow_block('reference',
                                            json.loads(references)['data'])

    # parse additional links (they use the same format as cross references)
    # don't add them to data, as we're combining them with bio
    additional = referenceparser.parse_cross_references(
        datasheet['EXTRAS'], datasheet['FILENAME'], url_context)
    data['additional'] = flow.to_flow_block('otherweb',
                                            json.loads(additional)['data'])

    # parse content, and add in extras
    data['content'] = htmlparser.parse(datasheet['CONTENT'],
                                       datasheet['FILENAME'],
                                       extras=json.loads(additional)['data'],
                                       paragraphs=True,
                                       url_context=url_context)

    data['tags'] = ''

    # alphabetical display entries
    parsed_entries = []
    if data['name'].strip() != '':
        s = data['name'].strip()
        parsed_entries.append(s)
    elif data['headline'].strip() != '':
        s = data['headline'].strip()
        parsed_entries.append(s)
    data['alphabetical'] = '\n'.join(parsed_entries)

    return data
def convert(datasheet, url_context):
    data = {}

    # metadata, the template and model
    data['_model'] = 'gazplace'
    data['_template'] = 'gazplace.html'

    data['place'] = symbolreplace.strip_tags(
        symbolreplace.tags_to_unicode(datasheet['TITLE']))

    pattern = re.compile(r'(?P<lat>-?[\d.]+),(?P<long>-?[\d.]+)')
    match = pattern.search(datasheet['COORDS'])
    data['latitude'] = ''
    data['longitude'] = ''
    if match:
        data['latitude'] = match.group('lat')
        data['longitude'] = match.group('long')

    # I made a mistake when generating the datasheets for GazData:
    # the correct CONTENTS is in GazData3,
    # so we have to read that instead
    path = os.path.join('../datasheets/GazData3/', datasheet['FILENAME'])
    datasheet2 = datasheetparser.parse_file(path)

    # convert the references to the new style of references
    refcount = 1
    parsed_references = []
    references = datasheet['REFERENCES'].strip().split('\n')
    for reference in references:
        reference = reference.strip()
        if reference == '':
            continue
        parts = reference.split('@')
        if len(parts) != 3:
            print(reference)
            assert len(parts) == 3
        replacement = parts[0].strip()
        text = parts[2].strip()

        if replacement not in datasheet2['CONTENTS']:
            print(reference)
        assert replacement in datasheet2['CONTENTS']
        datasheet2['CONTENTS'] = datasheet2['CONTENTS'].replace(
            replacement, '[%s]' % refcount)

        parsed_references.append({
            'number': str(refcount),
            'reference': htmlparser.parse(text, datasheet['FILENAME'])
        })

        refcount = refcount + 1

    data['references'] = flow.to_flow_block('reference', parsed_references)

    # parse the gazetteer entry content
    data['content'] = htmlparser.parse(datasheet2['CONTENTS'],
                                       datasheet['FILENAME'],
                                       paragraphs=True,
                                       url_context=url_context)

    if data['place'] == 'Whitburn, Tyne & Wear':
        # add in the missing lat and long
        data['latitude'] = '54.9550395'
        data['longitude'] = '-1.3867149'

    if data['latitude'] == '' and data['longitude'] == '':
        # this is not a place, it should just be a page
        newdata = {}
        newdata['_model'] = 'page'
        newdata['_template'] = 'gazplace.html'
        newdata['title'] = data['place']
        newdata['authors'] = ''
        newdata['content'] = data['content']
        return newdata

    return data
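# An invented example of the '@'-separated reference format handled in the
# gazetteer converter above: the first field is the marker that appears in the
# GazData3 CONTENTS, the last field is the reference text (the middle field is
# not used), and the marker is replaced in the text by a bracketed number.
reference = 'REF1 @ unused @ A history of the town, 1901'
parts = [p.strip() for p in reference.split('@')]
contents = 'The town REF1 grew around its harbour.'
contents = contents.replace(parts[0], '[1]')
# contents == 'The town [1] grew around its harbour.'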