def perform_candidate_record_search(requestType, data):
    """Handle candidate-record and revision search requests."""
    max_results = 999
    too_many = False
    search_results = []  # guard: stays empty for unknown request types
    result = {'resultCode': 0, 'resultText': ''}
    if requestType == "searchCandidates":
        recids = perform_request_search(p=data['query'])
        if len(recids) > max_results:
            too_many = True
        else:
            captions = [search_result_info(x) for x in recids]
            alternative_titles = [
                remove_html_markup(print_record(x, "hs")) for x in recids
            ]
            search_results = [recids, captions, alternative_titles]
    elif requestType == "searchRevisions":
        revisions = get_record_revision_ids(data['recID1'])
        captions = [split_revid(x, 'datetext')[1] for x in revisions]
        search_results = [revisions, captions]

    if too_many:
        result['resultCode'] = 1
        result['resultText'] = 'Too many results'
    else:
        result['results'] = search_results
        result['resultText'] = '%s results' % (
            len(search_results[0]) if search_results else 0)

    return result
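A quick sketch of how the handler might be driven; it assumes the function and its Invenio dependencies are importable, and the query string is illustrative:

result = perform_candidate_record_search("searchCandidates", {"query": "ellis"})
if result['resultCode'] == 1:
    print result['resultText']   # 'Too many results'
else:
    recids, captions, alternative_titles = result['results']
    print result['resultText']   # e.g. '12 results'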
Example #3
def get_as_text(record_id=0, xml_record=None, ln=CFG_SITE_LANG):
    """Return the record in a textual format"""
    _ = gettext_set_language(ln)
    out = ""
    if record_id != 0:
        rec_in_hb = format_record(record_id, of="hb")
    elif xml_record:
        rec_in_hb = format_record(0, of="hb", xml_record=xml_record)
    rec_in_hb = rec_in_hb.replace("\n", " ")
    htparser = RecordHTMLParser()
    try:
        htparser.feed(rec_in_hb)
        htparser.close()
        out = htparser.result
    except:
        out = remove_html_markup(rec_in_hb)

    # Remove trailing whitespace and linefeeds
    out = out.strip("\n").strip()
    # Merge consecutive whitespaces. Must be done here, once all HTML
    # tags have been removed
    out = whitespaces_pattern.sub(" ", out)
    # Now replace non-breakable spaces (U+00A0) with normal spaces
    out = out.replace('\xa0', ' ')
    out = re.sub(r"[\-:]?\s*%s\s*[\-:]?" % _("Detailed record"), "", out)
    out = re.sub(r"[\-:]?\s*%s\s*[\-:]?" % _("Similar records"), "", out)
    out = re.sub(r"[\-:]?\s*%s\s*[\-:]?" % _("Cited by"), "", out)
    return out.strip()
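A minimal usage sketch, assuming an Invenio environment where format_record and the HTML helpers above are available; the record ID and the MARCXML variable are placeholders:

text = get_as_text(record_id=42)
# or, formatting raw MARCXML instead of a stored record:
text = get_as_text(xml_record=some_marcxml_string)  # placeholder variable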
Example #6
    def tokenize_for_words(self, phrase):
        """Return list of words found in PHRASE.  Note that the phrase is
           split into groups depending on the alphanumeric characters and
           punctuation characters definition present in the config file.
        """

        words = {}
        formulas = []
        if self.remove_html_markup and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if self.remove_latex_markup:
            formulas = latex_formula_re.findall(phrase)
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                stemmed_block = remove_stopwords(block, self.remove_stopwords)
                stemmed_block = length_check(stemmed_block)
                stemmed_block = apply_stemming(stemmed_block,
                                               self.stemming_language)
                if stemmed_block:
                    words[stemmed_block] = 1
                if re_arxiv.match(block):
                    # special case for blocks like `arXiv:1007.5048' where
                    # we would like to index the part after the colon
                    # regardless of dot or other punctuation characters:
                    words[block.split(':', 1)[1]] = 1
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    stemmed_subblock = remove_stopwords(
                        subblock, self.remove_stopwords)
                    stemmed_subblock = length_check(stemmed_subblock)
                    stemmed_subblock = apply_stemming(stemmed_subblock,
                                                      self.stemming_language)
                    if stemmed_subblock:
                        words[stemmed_subblock] = 1
                    # 4th break each subblock into alphanumeric groups and add groups:
                    for alphanumeric_group in re_separators.split(subblock):
                        stemmed_alphanumeric_group = remove_stopwords(
                            alphanumeric_group, self.remove_stopwords)
                        stemmed_alphanumeric_group = length_check(
                            stemmed_alphanumeric_group)
                        stemmed_alphanumeric_group = apply_stemming(
                            stemmed_alphanumeric_group, self.stemming_language)
                        if stemmed_alphanumeric_group:
                            words[stemmed_alphanumeric_group] = 1
        for block in formulas:
            words[block] = 1
        return words.keys()
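An indicative call, given an instance of this tokenizer class (named tokenizer here); the exact token set depends on the configured stopword list, stemmer and punctuation/separator regexps, so the expected output is approximate:

tokens = tokenizer.tokenize_for_words("Pions: arXiv:1007.5048")
# roughly: ['pions', 'arxiv', '1007.5048', '1007', '5048'], in no guaranteed
# order (tokens are collected as dict keys); note the arXiv special case
# keeps '1007.5048' intact alongside its punctuation-split parts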
    def tokenize_for_words(self, phrase, recid):
        """Return list of words found in PHRASE.  Note that the phrase is
           split into groups depending on the alphanumeric characters and
           punctuation characters definition present in the config file.
        """
        if not self.isAuthority(recid):
            return []
        words = {}
        formulas = []
        if self.remove_html_markup and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if self.remove_latex_markup:
            formulas = latex_formula_re.findall(phrase)
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(" ", phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                stemmed_block = remove_stopwords(block, self.remove_stopwords)
                stemmed_block = length_check(stemmed_block)
                stemmed_block = apply_stemming(stemmed_block, self.stemming_language)
                if stemmed_block:
                    words[stemmed_block] = 1
                if re_arxiv.match(block):
                    # special case for blocks like `arXiv:1007.5048' where
                    # we would like to index the part after the colon
                    # regardless of dot or other punctuation characters:
                    words[block.split(":", 1)[1]] = 1
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    stemmed_subblock = remove_stopwords(subblock, self.remove_stopwords)
                    stemmed_subblock = length_check(stemmed_subblock)
                    stemmed_subblock = apply_stemming(stemmed_subblock, self.stemming_language)
                    if stemmed_subblock:
                        words[stemmed_subblock] = 1
                    # 4th break each subblock into alphanumeric groups and add groups:
                    for alphanumeric_group in re_separators.split(subblock):
                        stemmed_alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                        stemmed_alphanumeric_group = length_check(stemmed_alphanumeric_group)
                        stemmed_alphanumeric_group = apply_stemming(stemmed_alphanumeric_group, self.stemming_language)
                        if stemmed_alphanumeric_group:
                            words[stemmed_alphanumeric_group] = 1
        for block in formulas:
            words[block] = 1
        return words.keys()
def main():
    for journal in CFG_JOURNALS:
        name = get_coll_i18nname(journal)
        reclist = get_collection_reclist(journal)
        print "<h2>%s</h2>" % escape(name)
        if not reclist:
            print "<p>None yet.</p>"
            continue
        print "<p><ul>"
        for recid in reclist:
            record = get_record(recid)
            title = remove_html_markup(
                record_get_field_value(record, '245', code='a'),
                remove_escaped_chars_p=False).strip()
            doi = record_get_field_value(record, '024', '7', code='a')
            print '<li><a href="http://dx.doi.org/%s" target="_blank">%s</a>: %s</li>' % (
                escape(doi, True), escape(doi), title)
        print "</ul></p>"
Example #9
    def tokenize_for_pairs(self, phrase):
        """Return list of words found in PHRASE.  Note that the phrase is
           split into groups depending on the alphanumeric characters and
           punctuation characters definition present in the config file.
        """

        words = {}
        if self.remove_html_markup and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if self.remove_latex_markup:
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        last_word = ''
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                block = remove_stopwords(block, self.remove_stopwords)
                block = length_check(block)
                block = apply_stemming(block, self.stemming_language)
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    subblock = remove_stopwords(subblock,
                                                self.remove_stopwords)
                    subblock = length_check(subblock)
                    subblock = apply_stemming(subblock, self.stemming_language)
                    if subblock:
                        # 4th break each subblock into alphanumeric groups and add groups:
                        for alphanumeric_group in re_separators.split(
                                subblock):
                            alphanumeric_group = remove_stopwords(
                                alphanumeric_group, self.remove_stopwords)
                            alphanumeric_group = length_check(
                                alphanumeric_group)
                            alphanumeric_group = apply_stemming(
                                alphanumeric_group, self.stemming_language)
                            if alphanumeric_group:
                                if last_word:
                                    words['%s %s' %
                                          (last_word, alphanumeric_group)] = 1
                                last_word = alphanumeric_group
        return words.keys()
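Indicative behaviour, with the same caveats as the word tokenizer (stopword, stemming and separator configuration all affect the result); pairs are consecutive alphanumeric groups joined by a space:

pairs = tokenizer.tokenize_for_pairs("higgs boson decay width")
# roughly: ['higgs boson', 'boson decay', 'decay width'], in no guaranteed order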
    def tokenize_for_pairs(self, phrase, recid):
        """Return list of word pairs found in PHRASE.  Note that the phrase is
           split into groups depending on the alphanumeric characters and
           punctuation characters definition present in the config file.
        """
        if not self.isAuthority(recid):
            return []
        words = {}
        if self.remove_html_markup and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if self.remove_latex_markup:
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(" ", phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        last_word = ""
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                block = remove_stopwords(block, self.remove_stopwords)
                block = length_check(block)
                block = apply_stemming(block, self.stemming_language)
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    subblock = remove_stopwords(subblock, self.remove_stopwords)
                    subblock = length_check(subblock)
                    subblock = apply_stemming(subblock, self.stemming_language)
                    if subblock:
                        # 4th break each subblock into alphanumeric groups and add groups:
                        for alphanumeric_group in re_separators.split(subblock):
                            alphanumeric_group = remove_stopwords(alphanumeric_group, self.remove_stopwords)
                            alphanumeric_group = length_check(alphanumeric_group)
                            alphanumeric_group = apply_stemming(alphanumeric_group, self.stemming_language)
                            if alphanumeric_group:
                                if last_word:
                                    words["%s %s" % (last_word, alphanumeric_group)] = 1
                                last_word = alphanumeric_group
        return words.keys()
    def test_remove_html_markup_replacement(self):
        """htmlutils - remove HTML markup, some replacement"""
        test_input = 'This is <a href="test">test</a>.'
        test_expected = 'This is XtestX.'
        self.assertEqual(remove_html_markup(test_input, 'X'),
                         test_expected)

    def test_remove_html_markup_empty(self):
        """htmlutils - remove HTML markup, empty replacement"""
        test_input = 'This is <a href="test">test</a>.'
        test_expected = 'This is test.'
        self.assertEqual(remove_html_markup(test_input, ''),
                         test_expected)
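Taken together, the two tests pin down the second argument of remove_html_markup: it is the replacement string substituted for each stripped tag. An illustrative extrapolation:

remove_html_markup('a <b>bold</b> word', '#')   # -> 'a #bold# word'
remove_html_markup('a <b>bold</b> word', '')    # -> 'a bold word'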
def _get_feature_text(record, language):
    """
    Looks for a text (header) that can be featured on the article overview
    page.
    """
    washer = HTMLWasher()
    header_text = ""
    # Check if there is a header
    if language == "fr":
        header = record.field('590__a')
        if header.strip() in \
               ['', '<br/>', '<!--HTML--><br />', '<!--HTML-->']:
            header = record.field('520__a')
    else:
        header = record.field('520__a')
        if header.strip() in \
               ['', '<br/>', '<!--HTML--><br />', '<!--HTML-->']:
            header = record.field('590__a')
    header = washer.wash(html_buffer=header,
                         allowed_tag_whitelist=[],
                         allowed_attribute_whitelist=[])
    if header != "":
        header_text = header
    else:
        if language == "fr":
            article = record.fields('590__b')
            if not article or \
                   (len(article) == 1 and \
                    article[0].strip() in \
                    ['', '<br />', '<!--HTML--><br />', '<!--HTML-->']):
                article = record.fields('520__b')
        else:
            article = record.fields('520__b')
            if not article or \
                   (len(article) == 1 and \
                    article[0].strip() in \
                    ['', '<br />', '<!--HTML--><br />', '<!--HTML-->']):
                article = record.fields('590__b')
        try:
            article = article[0]
        except:
            return ''

        match_obj = re.search(header_pattern, article)
        if not match_obj:
            match_obj = re.search(header_pattern2, article)
        try:
            header_text = match_obj.group("header")
            header_text = washer.wash(html_buffer=header_text,
                                      allowed_tag_whitelist=['a'],
                                      allowed_attribute_whitelist=['href',
                                                                   'target',
                                                                   'class'])
            if header_text == "":
                raise Exception
        except:
            article = article.replace(header_text, '')
            article = article.replace('<p/>', '')
            article = article.replace('<p>&nbsp;</p>', '')
            match_obj = re.search(para_pattern, article)
            try:
                # get the first paragraph
                header_text = match_obj.group("paragraph")
                try:
                    header_text = washer.wash(html_buffer=header_text,
                                              allowed_tag_whitelist=[],
                                              allowed_attribute_whitelist=[])
                except:
                    # was not able to parse correctly the HTML. Use
                    # this safer function, but producing less good
                    # results
                    header_text = remove_html_markup(header_text)

                if header_text.strip() == "":
                    raise Exception
                else:
                    if len(header_text) > 250:
                        header_text = _get_first_sentence_or_part(header_text)
            except:
                # in a last instance get the first sentence
                try:
                    article = washer.wash(article,
                                          allowed_tag_whitelist=[],
                                          allowed_attribute_whitelist=[])
                except:
                    # was not able to parse correctly the HTML. Use
                    # this safer function, but producing less good
                    # results
                    article = remove_html_markup(article)

                header_text = _get_first_sentence_or_part(article)

    return header_text
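A hedged sketch of the fallback order for callers; the record object is assumed to expose the .field()/.fields() accessors used above:

# For 'fr' prefer 590__a then 520__a; for other languages, the reverse.
# When no usable header subfield exists, fall back to the first paragraph
# (or first sentence) of the article body in 590__b/520__b, washed of HTML.
header = _get_feature_text(record, 'en')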
def add_basic_fields(rec, form, email):
    """
    Adds the basic fields from the form. Note that these fields are mapped
    to specific MARC fields. For information on the fields see the www.loc.gov
    website. For example http://www.loc.gov/marc/bibliographic/bd260.html
    contains information on field 260 for publication data.
    """
    # why aren't subfields a dictionary?!
    try:
        if form['title']:
            record_add_field(rec, '245', subfields=[('a', remove_html_markup(form['title']))])

        if form['creator']:
            record_add_field(rec, '100', subfields=[('a', remove_html_markup(form['creator']))])

        if form['domain']:
            record_add_field(rec, '980', subfields=[('a', remove_html_markup(form['domain']))])
        pubfields = []
        if form['publisher']:
            pubfields.append(('b', remove_html_markup(form['publisher'])))
        if form.get('publication_date'):
            pubfields.append(('c', remove_html_markup(form['publication_date'])))
        if pubfields:
            record_add_field(rec, '260', subfields=pubfields)
        record_add_field(rec, '856', ind1='0', subfields=[('f', email)])

        if 'open_access' in form:
            record_add_field(rec, '542', subfields=[('l', 'open')])
        else:
            record_add_field(rec, '542', subfields=[('l', 'restricted')])

        if form['licence']:
            record_add_field(rec, '540', subfields=[('a', remove_html_markup(form['licence']))])
        record_add_field(rec, '520', subfields=[('a', remove_html_markup(form['description']))])

        if form['tags']:
            for kw in form['tags'].split(','):
                record_add_field(rec, '653',
                                 ind1='1',
                                 subfields=[('a', remove_html_markup(kw.strip()))])

        if form['contributors']:
            for kw in form['contributors'].split(';'):
                record_add_field(rec, '700', subfields=[('a', remove_html_markup(kw.strip()))])

        if form['language']:
            record_add_field(rec, '546', subfields=[('a', remove_html_markup(form['language']))])

        # copying zenodo here, but I don't think 980 is the right MARC field
        if form['resource_type']:
            record_add_field(rec, '980', subfields=[('a', remove_html_markup(form['resource_type']))])

        if form['alternate_identifier']:
            record_add_field(rec, '024',
                             subfields=[('a', remove_html_markup(form['alternate_identifier']))])

        if form['version']:
            record_add_field(rec, '250', subfields=[('a', remove_html_markup(form['version']))])
        record_add_field(rec, '264',
                         subfields=[('b', CFG_SITE_NAME),
                                    ('c', str(datetime.utcnow()) + " UTC")])
    except Exception as e:
        current_app.logger.error(e)
        raise
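A hedged usage sketch; the form keys are taken from the function body, rec is a fresh bibrecord-style dict, and the e-mail address is illustrative:

rec = {}
form = {'title': 'Sea level dataset', 'creator': 'Doe, J.',
        'domain': 'environment', 'publisher': 'Sample Press',
        'publication_date': '2014', 'licence': 'CC0',
        'description': 'Monthly tide-gauge readings.',
        'tags': 'sea level, tides', 'contributors': 'Roe, R.',
        'language': 'eng', 'resource_type': 'dataset',
        'alternate_identifier': '', 'version': '1.0'}
add_basic_fields(rec, form, 'uploader@example.org')
# 'open_access' and 'publication_date' are the optional keys: the function
# reads them via a membership test / .get(), plain indexing for the rest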