Python TextBetweenMarkers Examples, logotheras.text.textbetweenmarkers.TextBetweenMarkers Python Examples

Example #1

0

Show file

File: link.py Project: suizokukan/logotheras

 def __init__(self):
     """
             Link.__init__

             Initialize the object through TextBetweenMarkers.__init__, using
             LINK_START as the start marker and LINK_END as the end marker.
     """
     markers = {"marker_start": LINK_START,
                "marker_end": LINK_END}
     TextBetweenMarkers.__init__(self, **markers)

Example #2

0

Show file

File: link.py Project: suizokukan/logotheras

    def getExtracts(self, context, source):
        """
                Link.getExtracts

                context         : (str) stored unchanged in every LinkInfo
                source          : (str) the text in which the links are searched

                Return a list of LinkInfo objects, one for each extract
                returned by TextBetweenMarkers.getExtracts(self, source).

                For each extract, the text found between the markers is split
                on the first LINK_SEPARATOR :
                  o  no separator : the whole text is the artiname and
                     entryname is None.
                  o  separator    : the text before it is the entryname, the
                     text after it is the artiname (separator excluded).
        """
        res = []

        for textbetweenmarkers in TextBetweenMarkers.getExtracts(self, source):

            name = textbetweenmarkers.substring

            # The separator must be looked for in the link's own text : looking
            # for it in the whole <source> would give the same (wrong) names to
            # every link as soon as one separator appears anywhere in <source>.
            #
            # NOTE(review): as in the previous implementation, the part before
            # the separator goes to <entryname> and the part after it to
            # <artiname>; confirm this order against the link syntax.
            before, separator, after = name.partition(LINK_SEPARATOR)

            if separator:
                # separator between artiname and entryname :
                entryname = before
                artiname = after
            else:
                # no separator, artiname only :
                entryname = None
                artiname = name

            res.append(LinkInfo(context=context,
                                pos0=textbetweenmarkers.pos0,
                                artiname=artiname,
                                entryname=entryname,
                                source=source))

        return res

Example #3

0

Show file

File: entry.py Project: suizokukan/logotheras

    def initFromStr(self,
                    informationsdata,
                    title_hlevel,
                    title_pending_text,
                    str_content,
                    reading_position,
                    srclanguage):
        """
                Entry.initFromStr()

                Reset the object and fill self.entrydata from a title and from
                the lines of the entry's body.

                informationsdata        : InformationsData object
                title_hlevel            : HierarchicalLevel object
                title_pending_text      : None or a string.
                str_content             : a list of strings.
                reading_position        : ReadingPosition object
                srclanguage             : passed unchanged to
                                          Extract.initFromStr()

                Problems found in the title are reported through self.errors;
                they do not stop the initialization.

                Return self.
        """
        self.reset()
        self.reading_position = reading_position

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # hierarchical level :
        self.entrydata.hlevel = title_hlevel.hleveldata

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # title :
        if title_pending_text is None:
            self.entrydata.title = None

        else:

            # links are forbidden in the entries' title :
            if LINK_START in title_pending_text:
                msg = "(ERR059) Link found in an entry's title; title={0} -> {1}"
                self.errors.error(msg.format(title_pending_text,
                                             self.reading_position))

            # to-be-duplicated entry ? The 'normal' symbol is checked first and
            # wins if both symbols are present in the title.
            if BODY_ARTICLE_TO_DUPLICATED_NOTIMPORTANT in title_pending_text:
                dup_symbol = BODY_ARTICLE_TO_DUPLICATED_NOTIMPORTANT
            elif BODY_ARTICLE_TO_DUPLICATED_IMPORTANT in title_pending_text:
                dup_symbol = BODY_ARTICLE_TO_DUPLICATED_IMPORTANT
            else:
                dup_symbol = None

            if dup_symbol is not None:

                # the same symbol opens and closes the name of the entry to
                # be duplicated; only one result is expected :
                tbm = TextBetweenMarkers(marker_start=dup_symbol,
                                         marker_end=dup_symbol)

                results = tbm.getExtracts(title_pending_text)

                if len(results) != 1:
                    msg = "(ERR058) Wrong format : " \
                          "the to-be-duplicated entry can't be read; -> {0}"
                    self.errors.error(msg.format(self.reading_position))
                else:
                    self.entrydata.entry_to_be_duplicated = results[0].substring

                    self.entrydata.important_entry_to_be_dup = \
                        (dup_symbol == BODY_ARTICLE_TO_DUPLICATED_IMPORTANT)

                # we erase the markers linked to the 'to-be-duplicated' entry :
                title_pending_text = title_pending_text.replace(dup_symbol, "")

            self.entrydata.title = title_pending_text.strip()

            # links in <self.entrydata.title> ?
            self.entrydata.links = Link().getExtracts(
                source=self.entrydata.title,
                context="entry.title+" + self.reading_position.getShortDescription())

        #~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
        # text and extract(s) :
        #
        # <pending_extract> holds the lines of the extract being read; it is
        # non-empty exactly while an extract is being read, since a line is
        # appended as soon as an extract begins.
        pending_extract = []
        prefix_length = len(BODY_PREFIX_BEFORE_EXTRACT)

        for line in str_content:

            if line.strip() == "":
                # an empty line ends the current extract (if any); empty lines
                # are never added to the entry's text.
                if pending_extract:
                    self._storeExtractFromLines(informationsdata,
                                                pending_extract,
                                                reading_position,
                                                srclanguage)
                    # a new list : the previous one has been handed over to
                    # the Extract object.
                    pending_extract = []

            elif not line.startswith(BODY_PREFIX_BEFORE_EXTRACT):
                self.entrydata.text.append(line)

            else:
                # first or next line of an extract, stored without its prefix :
                pending_extract.append(line[prefix_length:])

        if pending_extract:
            # we add the last extract :
            self._storeExtractFromLines(informationsdata,
                                        pending_extract,
                                        reading_position,
                                        srclanguage)

        return self

    def _storeExtractFromLines(self,
                               informationsdata,
                               lines,
                               reading_position,
                               srclanguage):
        """
                Entry._storeExtractFromLines()

                Build an Extract object from <lines> and append its data to
                self.entrydata if its freeness is high enough; otherwise the
                extract is skipped and an info message is emitted.

                informationsdata        : InformationsData object
                lines                   : a list of strings (the lines of the
                                          extract, without their prefix)
                reading_position        : ReadingPosition object
                srclanguage             : passed unchanged to
                                          Extract.initFromStr()
        """
        new_extract = Extract(errors=self.errors,
                              logotherasdata=self.logotherasdata)
        new_extract.initFromStr(informationsdata=informationsdata,
                                src=lines,
                                reading_position=reading_position,
                                srclanguage=srclanguage)

        extract_freeness = \
            informationsdata.getTheMinFreenessOfAnExtract(new_extract.extractdata)

        if extract_freeness >= logotheras.options.OPTIONS["minimal freeness"]:
            self.entrydata.append(new_extract.extractdata)
        else:
            msg = "skipping an extract due to its freeness value; extract={0}; -> {1}"
            self.errors.info(msg.format(new_extract.extractdata, reading_position))