Example No. 1
def create_html_link(urlbase, urlargd, link_label, linkattrd=None,
                     escape_urlargd=True, escape_linkattrd=True,
                     urlhash=None):
    """Creates a W3C compliant link.
    @param urlbase: base url (e.g. invenio.config.CFG_SITE_URL/search)
    @param urlargd: dictionary of parameters. (e.g. p={'recid':3, 'of'='hb'})
    @param link_label: text displayed in a browser (has to be already escaped)
    @param linkattrd: dictionary of attributes (e.g. a={'class': 'img'})
    @param escape_urlargd: boolean indicating if the function should escape
                           arguments (e.g. < becomes &lt; or " becomes &quot;)
    @param escape_linkattrd: boolean indicating if the function should escape
                           attributes (e.g. < becomes &lt; or " becomes &quot;)
    @param urlhash: hash string to add at the end of the link
    """
    attributes_separator = ' '
    output = '<a href="' + \
             create_url(urlbase, urlargd, escape_urlargd, urlhash) + '"'
    if linkattrd:
        output += ' '
        if escape_linkattrd:
            attributes = [escape(str(key), quote=True) + '="' + \
                          escape(str(linkattrd[key]), quote=True) + '"'
                                for key in linkattrd.keys()]
        else:
            attributes = [str(key) + '="' + str(linkattrd[key]) + '"'
                                for key in linkattrd.keys()]
        output += attributes_separator.join(attributes)
    output = wash_for_utf8(output)
    output += '>' + wash_for_utf8(link_label) + '</a>'
    return output
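A minimal usage sketch, assuming create_url, escape and wash_for_utf8 are importable from the surrounding Invenio modules (the URL below is made up):

link = create_html_link('http://example.org/search',
                        {'recid': 3, 'of': 'hb'},
                        'View record',
                        linkattrd={'class': 'moreinfo'})
# roughly: <a href="http://example.org/search?recid=3&of=hb" class="moreinfo">View record</a>
# (query-argument order and escaping details depend on create_url)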
Example No. 2
def wash_pattern(p):
    """Wash pattern passed in URL.

    Check for sanity of the wildcard by removing wildcards if they are appended
    to extremely short words (1-3 letters).
    """
    # strip accents:
    # p = strip_accents(p) # FIXME: when available, strip accents all the time
    # add leading/trailing whitespace for the two following wildcard-sanity
    # checking regexps:
    p = " " + p + " "
    # replace spaces within quotes by __SPACE__ temporarily:
    p = re_pattern_single_quotes.sub(
        lambda x: "'" + x.group(1).replace(' ', '__SPACE__') + "'", p)
    p = re_pattern_double_quotes.sub(
        lambda x: "\""+x.group(1).replace(' ', '__SPACE__') + "\"", p)
    p = re_pattern_regexp_quotes.sub(
        lambda x: "/" + x.group(1).replace(' ', '__SPACE__') + "/", p)
    # get rid of unquoted wildcards after spaces:
    p = re_pattern_wildcards_after_spaces.sub("\\1", p)
    # get rid of extremely short words (1-3 letters with wildcards):
    # p = re_pattern_short_words.sub("\\1", p)
    # replace back __SPACE__ by spaces:
    p = re_pattern_space.sub(" ", p)
    # replace special terms:
    p = re_pattern_today.sub(time.strftime("%Y-%m-%d", time.localtime()), p)
    # remove unnecessary whitespace:
    p = p.strip()
    # remove potentially wrong UTF-8 characters:
    p = wash_for_utf8(p)
    return p
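The function leans on module-level regular expressions defined elsewhere in the search engine. A minimal sketch with hypothetical stand-in patterns (the real Invenio definitions may differ), assuming wash_for_utf8 is also importable:

import re
import time

# hypothetical stand-ins for the module-level patterns wash_pattern assumes:
re_pattern_single_quotes = re.compile(r"'(.*?)'")
re_pattern_double_quotes = re.compile(r'"(.*?)"')
re_pattern_regexp_quotes = re.compile(r"/(.*?)/")
re_pattern_wildcards_after_spaces = re.compile(r"(\s)[*%]+")
re_pattern_space = re.compile("__SPACE__")
re_pattern_today = re.compile(r"\$TODAY\$")

print wash_pattern('"dark matter" %')  # -> "dark matter" (stray unquoted wildcard dropped)
print wash_pattern('year:$TODAY$')     # -> year:<current date in YYYY-MM-DD>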
Example No. 3
    def tokenize_for_phrases(self, phrase):
        """Return list of phrases found in PHRASE.  Note that the phrase is
           split into groups depending on the alphanumeric characters and
           punctuation characters definition present in the config file.
        """
        phrase = wash_for_utf8(phrase)
        return [phrase]
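Despite the generic docstring, this tokenizer keeps the whole input as a single phrase; a sketch, assuming a tokenizer instance t and wash_for_utf8 in scope:

>>> t.tokenize_for_phrases("quantum field theory")
['quantum field theory']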
Example No. 4
    def tokenize_for_words(self, phrase):
        """Return list of words found in PHRASE.  Note that the phrase is
           split into groups depending on the alphanumeric characters and
           punctuation characters definition present in the config file.
        """

        words = {}
        formulas = []
        if self.remove_html_markup and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if self.remove_latex_markup:
            formulas = latex_formula_re.findall(phrase)
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                stemmed_block = remove_stopwords(block, self.remove_stopwords)
                stemmed_block = length_check(stemmed_block)
                stemmed_block = apply_stemming(stemmed_block,
                                               self.stemming_language)
                if stemmed_block:
                    words[stemmed_block] = 1
                if re_arxiv.match(block):
                    # special case for blocks like `arXiv:1007.5048' where
                    # we would like to index the part after the colon
                    # regardless of dot or other punctuation characters:
                    words[block.split(':', 1)[1]] = 1
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    stemmed_subblock = remove_stopwords(
                        subblock, self.remove_stopwords)
                    stemmed_subblock = length_check(stemmed_subblock)
                    stemmed_subblock = apply_stemming(stemmed_subblock,
                                                      self.stemming_language)
                    if stemmed_subblock:
                        words[stemmed_subblock] = 1
                    # 4th break each subblock into alphanumeric groups and add groups:
                    for alphanumeric_group in re_separators.split(subblock):
                        stemmed_alphanumeric_group = remove_stopwords(
                            alphanumeric_group, self.remove_stopwords)
                        stemmed_alphanumeric_group = length_check(
                            stemmed_alphanumeric_group)
                        stemmed_alphanumeric_group = apply_stemming(
                            stemmed_alphanumeric_group, self.stemming_language)
                        if stemmed_alphanumeric_group:
                            words[stemmed_alphanumeric_group] = 1
        for block in formulas:
            words[block] = 1
        return words.keys()
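To illustrate the block/subblock/group nesting, assume a hypothetical tokenizer instance t with stopword removal, length checks and stemming disabled, and an index configuration where '-' counts as punctuation; the exact output depends on that configuration (sorted here because the terms are collected in a dict, so their order is arbitrary):

>>> sorted(t.tokenize_for_words("Ultra-High energy"))
['energy', 'high', 'ultra', 'ultra-high']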
Example No. 5
def create_contextfiles(extracted_image_data):
    """
    Saves the context for each image to a file in the current sub-directory,
    returning a list of tuples per file saved in this form: [(image, filename), ..]

    @param extracted_image_data ([(string, string, list, list), ...]):
        a list of tuples of images matched to labels, captions and contexts from
        this document.
    """
    for image, dummy2, dummy3, contexts in extracted_image_data:
        if len(contexts) > 0 and image != "":
            context_filepath = image + '.context'
            fd = open(context_filepath, 'w')
            for context_line in contexts:
                fd.write(wash_for_utf8(context_line) + '\n\n')
            fd.close()
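A sketch of the expected input shape (paths and text made up; the two middle tuple fields are ignored here):

extracted_image_data = [
    ('figures/fig1', 'FIG. 1: Example caption', ['fig:one'],
     ['First context sentence.', 'Second context sentence.']),
]
create_contextfiles(extracted_image_data)
# writes 'figures/fig1.context' with each context line followed by a blank line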
Example No. 6
    def tokenize_for_pairs(self, phrase):
        """Return list of words found in PHRASE.  Note that the phrase is
           split into groups depending on the alphanumeric characters and
           punctuation characters definition present in the config file.
        """

        words = {}
        if self.remove_html_markup and phrase.find("</") > -1:
            phrase = remove_html_markup(phrase)
        if self.remove_latex_markup:
            phrase = remove_latex_markup(phrase)
            phrase = latex_formula_re.sub(' ', phrase)
        phrase = wash_for_utf8(phrase)
        phrase = lower_index_term(phrase)
        # 1st split phrase into blocks according to whitespace
        last_word = ''
        for block in strip_accents(phrase).split():
            # 2nd remove leading/trailing punctuation and add block:
            block = re_block_punctuation_begin.sub("", block)
            block = re_block_punctuation_end.sub("", block)
            if block:
                block = remove_stopwords(block, self.remove_stopwords)
                block = length_check(block)
                block = apply_stemming(block, self.stemming_language)
                # 3rd break each block into subblocks according to punctuation and add subblocks:
                for subblock in re_punctuation.split(block):
                    subblock = remove_stopwords(subblock,
                                                self.remove_stopwords)
                    subblock = length_check(subblock)
                    subblock = apply_stemming(subblock, self.stemming_language)
                    if subblock:
                        # 4th break each subblock into alphanumeric groups and add groups:
                        for alphanumeric_group in re_separators.split(
                                subblock):
                            alphanumeric_group = remove_stopwords(
                                alphanumeric_group, self.remove_stopwords)
                            alphanumeric_group = length_check(
                                alphanumeric_group)
                            alphanumeric_group = apply_stemming(
                                alphanumeric_group, self.stemming_language)
                            if alphanumeric_group:
                                if last_word:
                                    words['%s %s' %
                                          (last_word, alphanumeric_group)] = 1
                                last_word = alphanumeric_group
        return words.keys()
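Unlike tokenize_for_words, this variant indexes pairs of adjacent terms, and last_word carries over from one block to the next, so pairs are formed across whitespace. With a hypothetical instance t (no stopword removal, no stemming) one could expect roughly:

>>> sorted(t.tokenize_for_pairs("quantum field theory"))
['field theory', 'quantum field']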
Example No. 7
def unicode_gettext_wrapper(text, **kwargs):
    from invenio.base.helpers import unicodifier
    from invenio.utils.text import wash_for_utf8
    return wash_for_utf8(
        gettext(unicodifier(text), **unicodifier(kwargs)))
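A sketch, assuming gettext here is Flask-Babel's (so an application context is needed) and no translation catalog is active, in which case the message passes through with its variables interpolated:

>>> print unicode_gettext_wrapper("Search %(name)s", name="records")
Search records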
Example No. 8
def assemble_caption(begin_line, begin_index, end_line, end_index, lines):
    """
    Take the information about the caption of a picture and put it all together
    in a nice way.  If it spans multiple lines, put it on one line.  If it
    contains controlled characters, strip them out.  If it has tags we don't
    want to worry about, get rid of them, etc.

    @param: begin_line (int): the index of the line where the caption begins
    @param: begin_index (int): the index within the line where the caption
        begins
    @param: end_line (int): the index of the line where the caption ends
    @param: end_index (int): the index within the line where the caption ends
    @param: lines ([string, string, ...]): the line strings of the text

    @return: caption (string): the caption, nicely formatted and pieced together
    """

    # stuff we don't like
    label_head = '\\label{'

    # reassemble that sucker
    if end_line > begin_line:
        # our caption spanned multiple lines
        caption = lines[begin_line][begin_index:]

        for included_line_index in range(begin_line + 1, end_line):
            caption = caption + ' ' + lines[included_line_index]

        caption = caption + ' ' + lines[end_line][:end_index]
        caption = caption.replace('\n', ' ')
        caption = caption.replace('  ', ' ')
    else:
        # it fit on one line
        caption = lines[begin_line][begin_index:end_index]

    # clean out a label tag, if there is one
    label_begin = caption.find(label_head)
    if label_begin > -1:
        # we know that our caption is only one line, so if there's a label
        # tag in it, it will be all on one line.  so we make up some args
        dummy_start, dummy_start_line, label_end, dummy_end = \
                find_open_and_close_braces(0, label_begin, '{', [caption])
        caption = caption[:label_begin] + caption[label_end + 1:]

    # clean out characters not allowed in MARCXML
    # not allowed: & < >
    try:
        caption = wash_for_utf8(caption)
        caption = encode_for_xml(caption.encode('utf-8', 'xmlcharrefreplace'), wash=True)
    except Exception: # that damn encode thing threw an error on astro-ph/0601014
        sys.stderr.write(caption)
        sys.stderr.write(' cannot be processed\n')
        caption = caption.replace('&', '&amp;').replace('<', '&lt;')
        caption = caption.replace('>', '&gt;')

    caption = caption.strip()

    if len(caption) > 1 and caption[0] == '{' and caption[-1] == '}':
        caption = caption[1:-1]

    return caption
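An illustrative call with made-up lines; find_open_and_close_braces comes from the same module, and the indices are 0-based offsets into lines and into each line string:

lines = ['\\caption{A two-line', 'caption with a \\label{fig:x} tag}']
print assemble_caption(0, 9, 1, 32, lines)
# -> A two-line caption with a  tag
# (note the leftover double space where the label was removed)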
Example No. 9
def create_tag(tag,
               escaper=EscapedHTMLString,
               opening_only=False,
               body=None,
               escape_body=False,
               escape_attr=True,
               indent=0,
               attrs=None,
               **other_attrs):
    """
    Create an XML/HTML tag.

    This function creates a full XML/HTML tag, putting together an
    optional inner body and a dictionary of attributes.

        >>> print create_tag("select", body=create_tag("h1",
        ... body="hello", attrs={'class': "foo"}))
        <select><h1 class="foo">hello</h1></select>

    @param tag: the tag (e.g. "select", "body", "h1"...).
    @type tag: string
    @param body: some text/HTML to put in the body of the tag.
    @type body: string
    @param escape_body: whether the body (if any) must be escaped.
    @type escape_body: boolean
    @param escape_attr: whether the attribute values (if any) must be
        escaped.
    @type escape_attr: boolean
    @param indent: number of levels of indentation for the tag.
    @type indent: integer
    @param attrs: map of attributes to add to the tag.
    @type attrs: dict
    @return: the HTML tag.
    @rtype: string
    """

    if attrs is None:
        attrs = {}
    for key, value in iteritems(other_attrs):
        if value is not None:
            if key.endswith('_'):
                attrs[key[:-1]] = value
            else:
                attrs[key] = value
    out = "<%s" % tag
    for key, value in iteritems(attrs):
        if escape_attr:
            value = escaper(value, escape_quotes=True)
        out += ' %s="%s"' % (key, value)
    if body is not None:
        if callable(body) and body.__name__ == 'handle_body':
            body = body()
        out += ">"
        if escape_body and not isinstance(body, EscapedString):
            body = escaper(body)
        out += body
        if not opening_only:
            out += "</%s>" % tag
    elif not opening_only:
        out += " />"
    if indent:
        out = indent_text(out, indent)[:-1]
    from invenio.utils.text import wash_for_utf8
    return EscapedString(wash_for_utf8(out))
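A sketch of the trailing-underscore convention for reserved words like class, assuming EscapedHTMLString, EscapedString, iteritems and indent_text are in scope:

>>> print create_tag("div", class_="note", body="hi & bye", escape_body=True)
<div class="note">hi &amp; bye</div>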