Example no. 1
    def __init__(self, verbose=False, debug=False):
        self.__dbg__ = debug
        self.__verbos = verbose
        self._crawler = Crawler()
        self._crawler.set_headers((
                   ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'), \
                   ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
                 ))
        self._wraper = HTMLSequenceWrapper(childcoef=5.0,
                                           headercoef=3.0,
                                           mintextlen=20)

        self._unwanted_titles = ['Download here', 'PDF format']
        self._records = []

        ################################
        #manual processing
        self.agent = GetHTMLAndParse()
        # to get region where to search for records
        self.regionHandler = GetDeliverableRegion()
        # init text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()

        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance
        self.tagtol = 1
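
A minimal usage sketch for this class, assuming the imports of the surrounding module; the entry points (process_pages, get_deliverables_XML) are the ones shown in Example no. 8, and the URL is a hypothetical placeholder:

    records = GetDelivRecords(verbose=True)
    records.process_pages(["http://example.org/project/deliverables.html"])
    print(records.get_deliverables_XML())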
Example no. 2
    def __init__(self, url, verbose=False, debug=False, addkeyw=None):
        # keywords used for document page search
        self._sigwords = [
            "d((eliverables?)|[0-9])",
            "documents?",
            "reports?",
            "public(ation)?s?",
            "results?",
            "presentations?",
            "library",
            #"projects?",
            "outocomes?",
            "downloads?",
            "outputs?"
        ]

        if addkeyw is not None:
            self._sigwords.append(addkeyw)
        """ Associative array containing links with their flags
        { url : [Index/NoIndex/Frame, Visit/Visited, Rank] }
        index = 0, noindex = 1, frame = 2, unvisited = 0, visited = 1 """
        self._link_stack = {url: [0, 0, 0]}
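        # illustrative contents after visiting the index page (values made up):
        #   { "http://example.org/":      [0, 1, 5],   # index, visited, rank 5
        #     "http://example.org/docs/": [1, 0, 0] }  # noindex, unvisited, no rank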

        self.base_url = url  # save base (input) url

        # Open a parsing agent to get the needed data from the page
        self.agent = GetHTMLAndParse()

        self._current_url = url

        # a constant used to set rank in order of importance of the expression
        # being tested (self._sigwords)
        self.rank_const = len(self._sigwords)

        # a few constants for the dictionary - just for readable source code
        self.IND_FR = 0  # index/noindex/frame/special
        self.VISIT = 1  # unvisited/visited
        self.RANK = 2  # value of rank

        # set verbose flag
        self.__verbose__ = verbose

        #set debug flag
        self.__dbg__ = debug

        # checking data types
        if not isinstance(self.__verbose__, bool):
            raise ValueError("Verbose flag has to be boolean.")
Example no. 3
    def __init__(self, verbose=False, debug=False):
        self.__dbg__ = debug
        self.__verbos = verbose
        self._crawler = Crawler()
        self._crawler.set_headers(
            (
                (
                    "User-Agent",
                    "Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19",
                ),
                ("Accept", "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8"),
            )
        )
        self._wraper = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0, mintextlen=20)

        self._unwanted_titles = ["Download here", "PDF format"]
        self._records = []

        ################################
        # manual processing
        self.agent = GetHTMLAndParse()
        # to get region where to search for records
        self.regionHandler = GetDeliverableRegion()
        # init text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()

        self._omitted_tags = ("br", "img", "html", "body")
        # tag tolerance
        self.tagtol = 1
Example no. 4
 def __init__(self, verbose=False, debug=False):
     # init agent for parsing html
     self.htmlHandler = GetHTMLAndParse()
     # to get region where to search for records
     self.regionHandler = GetDeliverableRegion()
     # init text formatter (encoding, erasing white chars etc.)
     self.formatter = TextFormatUtils()
     # list of acceptable words in title (header) of table
     self.table_sem_words = [
         'deliverable', 'description', 'name', 'date', 'dissemination',
         'no.', 'wp', 'delivery', 'particip', 'title', 'nature'
     ]
     self._omitted_tags = ('br', 'img', 'html', 'body')
     # tag tolerance
     self.tagtol = 1
     # verbose and debug flags
     self.debugger = DeliverableDebugger(verbose=verbose, debug=debug)
     self.__verbose = self.debugger.verbose
     self.__debug = self.debugger.debug
Example no. 5
    def __init__(self, url, verbose=False, debug=False, addkeyw=None):
        # keywords used for document page search
        self._sigwords = [
            "d((eliverables?)|[0-9])",
            "documents?",
            "reports?",
            "public(ation)?s?",
            "results?",
            "presentations?",
            "library",
            # "projects?",
            "outocomes?",
            "downloads?",
            "outputs?",
        ]

        if addkeyw is not None:
            self._sigwords.append(addkeyw)

        """ Associative array containing links with their flags
        { url : [Index/NoIndex/Frame, Visit/Visited, Rank] }
        index = 0, noindex = 1, frame = 2, unvisited = 0, visited = 1 """
        self._link_stack = {url: [0, 0, 0]}

        self.base_url = url  # save base (input) url

        # Open a parsing agent to get the needed data from the page
        self.agent = GetHTMLAndParse()

        self._current_url = url

        # a constant used to set rank in order of importance of the expression
        # being tested (self._sigwords)
        self.rank_const = len(self._sigwords)

        # a few constants for the dictionary - just for readable source code
        self.IND_FR = 0  # index/noindex/frame/special
        self.VISIT = 1  # unvisited/visited
        self.RANK = 2  # value of rank

        # set verbose flag
        self.__verbose__ = verbose

        # set debug flag
        self.__dbg__ = debug

        # checking data types
        if not isinstance(self.__verbose__, bool):
            raise ValueError("Verbose flag has to be boolean.")
Example no. 6
 def __init__(self, verbose=False, debug=False):
     # init agent for parsing html
     self.htmlHandler = GetHTMLAndParse()        
     # to get region where to search for records
     self.regionHandler = GetDeliverableRegion()        
     # init text formatter (encoding, erasing white chars etc.)
     self.formatter = TextFormatUtils()               
     # list of acceptable words in title (header) of table
     self.table_sem_words = ['deliverable', 'description', 'name', 'date',
                             'dissemination', 'no.', 'wp', 'delivery',
                             'particip', 'title', 'nature']
     self._omitted_tags = ('br', 'img', 'html', 'body')
     # tag tolerance
     self.tagtol = 1
     # verbose and debug flags
     self.debugger = DeliverableDebugger(verbose=verbose, debug=debug)
     self.__verbose = self.debugger.verbose
     self.__debug = self.debugger.debug
Example no. 7
    def __init__(self, options=opt, url=None):
        # get options
        self.opt = options
        if url is not None:
            self.opt_url = url
        else:
            self.opt_url = self.opt.url

        # initialize main html handler and parser
        self.htmlhandler = GetHTMLAndParse()

        # searching deliverable page
        self.pagesearch = GetDelivPage(self.opt_url,
                                       verbose=self.opt.verbose,
                                       debug=self.opt.debug,
                                       addkeyw=self.opt.regexp)

        # extracting information from the page
        self.recordhandler = GetDelivRecords(debug=self.opt.debug)
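
The wiring above can be exercised end to end. A minimal sketch using only the entry points shown in the other examples (GetDelivPage.get_deliverable_page, GetDelivRecords.process_pages and get_deliverables_XML); the URL is a placeholder and error handling is elided:

    pagesearch = GetDelivPage("http://example.org/", verbose=True)
    pages = pagesearch.get_deliverable_page()   # list with the best-ranked page URL
    recordhandler = GetDelivRecords(debug=True)
    recordhandler.process_pages(pages)
    print(recordhandler.get_deliverables_XML())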
Example no. 8
class GetDelivRecords:
    def __init__(self, verbose=False, debug=False):
        self.__dbg__ = debug
        self.__verbos = verbose
        self._crawler = Crawler()
        self._crawler.set_headers((
                   ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'), \
                   ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
                 ))
        self._wraper = HTMLSequenceWrapper(childcoef=5.0,
                                           headercoef=3.0,
                                           mintextlen=20)

        self._unwanted_titles = ['Download here', 'PDF format']
        self._records = []

        ################################
        #manual processing
        self.agent = GetHTMLAndParse()
        # to get region where to search for records
        self.regionHandler = GetDeliverableRegion()
        # init text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()

        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance
        self.tagtol = 1

    def __debug(self, msg):
        _err = "cannot decode debug info."
        if self.__dbg__ == True:
            try:
                print("Debug message:    " + str(msg))
            except UnicodeError:
                print(_err)

    def __verbose(self, msg):
        _err = "cannot decode verbose message."
        if self.__verbos == True:
            try:
                print("Verbose:    " + str(msg))
            except UnicodeError:
                print(_err)

######################## Processing sequencewrapper output ######################

    """function gets an entry from output of sequence wrapper
       it tries to create deliv record and retruns true if succed. """
    def _make_deliv_record(self, entry):

        text = []
        links = []

        # harvest links and text from the entry
        for e in entry.iter():
            if e.text is not None:
                text.append(e.text)
            link = e.attrib.get("link")
            if link is not None:
                if self.agent.is_wanted_mime(link) and link not in links:
                    links.append(link)

        res = self._deliv_in_text(text, links)
        if type(res) == RRSPublication:
            self._entriesFoundInText.append(res)
            self.__debug("Record found cause of text")
            return True

        elif type(res) == list:
            res = self._more_entry_in_record(entry)
            if res:
                self.__debug("Entry split into more records")
                return True
            else:
                return False

        res = self._deliv_in_link(text, links, entry)
        if type(res) == RRSPublication:
            self._entriesFoundInLinks.append(res)
            self.__debug("Record found cause of link")
            return True

        return False

    """look for keyword in text"""

    def _deliv_in_text(self, text, links):

        #print text
        #print links
        #print "*"*40
        _title = False
        _description = ""
        pattern = re.compile("(DELIVERABLES?)|(D[0-9][0-9]*(\.[0-9][0-9]*)?)",
                             re.I)
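        # the pattern matches e.g. "Deliverable", "DELIVERABLES", "D1" or "D3.2"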

        # loop through the text in the entry looking for title and description
        for t in text:
            if _title is False:
                if pattern.search(t):
                    _title = t

            # set the longest string as the description of the deliverable
            if len(_description) < len(t):
                _description = t

        if _title == _description:
            _description = ""

        _link = False

        if type(links) == str:
            if self.agent.is_wanted_mime(links):
                _link = links
        elif type(links) == list:
            for l in links:
                if self.agent.is_wanted_mime(l):
                    if _link == False:
                        _link = l
                    else:
                        # a link was already found
                        if _link[:s.rfind(_link, '.')] == l[:s.rfind(l, '.')]:
                            break
                        else:
                            return ['-3', 'Probably more records in one entry']

        #create object
        if _title:
            #print "TITLE:"+_title
            pub = RRSPublication(title=_title, abstract=_description)
            _typ = RRSPublication_type(type='techreport')
            pub['type'] = _typ
            self.__debug("*" * 40)
            self.__debug("Title: " + _title)
            self.__debug("Description: " + _description)

            if _link:
                #print "LINK:"+_link
                self.__debug("Link: " + _link)
                l = RRSUrl(link=_link)
                pl_rel = RRSRelationshipPublicationUrl()
                pl_rel.set_entity(l)
                pub['url'] = pl_rel

            return pub
        else:
            # this entry is probably not a deliverable
            return False

    """look for a key word in link"""

    def _deliv_in_link(self, text, links, entry=False):

        ##print text
        ##print links
        #print "*"*40

        _title = False
        _description = ""
        pattern = re.compile("(DELIVERABLES?)|(D[0-9][0-9]*(\.[0-9][0-9]*)?)",
                             re.I)

        _link = False

        for l in links:
            if pattern.search(l):
                if _link == False:
                    _link = l
                else:
                    return ['-3', 'Probably more records in one entry']

        # loop through the text in the entry looking for title and description
        for t in text:
            if _title is False:
                if len(t) > 10:
                    _title = t
            # set the longest string as the description of the deliverable
            if len(_description) < len(t):
                _description = t

        if _title == _description:
            _description = ""

        # if the chosen title is not valid, try to find a better one in the parent entry
        if _title and not self._check_title(_title) and entry is not False:
            _title = self._repair_title(entry)

        #create object
        if _link:
            pub = RRSPublication(title=_title, abstract=_description)
            typ = RRSPublication_type(type='techreport')
            pub['type'] = typ

            self.__debug("*" * 40)
            self.__debug("Title: " + _title)
            self.__debug("Description: " + _description)

            self.__debug("Link: " + _link)
            l = RRSUrl(link=_link)
            pl_rel = RRSRelationshipPublicationUrl()
            pl_rel.set_entity(l)
            pub['url'] = pl_rel

            return pub
        else:
            # this entry is probably not a deliverable
            return False

    """Check if title contents only unwanted string with some tolerance
    return true if title is ok
    """

    def _check_title(self, title, tolerance=10):

        for t in self._unwanted_titles:
            if (s.find(s.lower(title), s.lower(t))) != -1:
                if (len(t) + tolerance) > len(title):
                    return False
        return True

    "looks for an element with highest visibility rank in parent elemet"

    def _repair_title(self, entry):
        parent = entry.getparent()
        visibility = 0
        title = ""
        for i in parent.iter():
            try:
                # visibility is stored as a string attribute; compare numerically
                if int(i.attrib.get('visibility')) > visibility:
                    visibility = int(i.attrib.get('visibility'))
                    title = i.text
            except (AttributeError, TypeError, ValueError):
                pass

        if title != "":
            return title
        else:
            return False

    "Function try to create array of deliverables from one entry in xml tree"

    def _more_entry_in_record(self, entry):
        found = False
        for ch in entry.iter('chunk'):
            if ch.text is not None and ch.attrib.get("link") is not None:
                if self.agent.is_wanted_mime(ch.attrib.get("link")):
                    _pub = RRSPublication(title=ch.text)
                    typ = RRSPublication_type(type='techreport')
                    _pub['type'] = typ
                    _l = RRSUrl(link=ch.attrib.get("link"))
                    _rel = RRSRelationshipPublicationUrl()
                    _rel.set_entity(_l)
                    _pub['url'] = _rel
                    self._entriesFoundInLinks.append(_pub)
                    found = True
        return found

    "Process pages definied by urls"

    def process_pages(self, pages):
        self._entriesFoundInText = []
        self._entriesFoundInLinks = []
        self._urls = pages
        self._pages = self._crawler.start(pages)

        #creates RRSPublication objects with information about deliverables
        for u in self._urls:
            self._wraper.wrap(self._pages[u], u)
            self._tree = self._wraper.get_etree()
            #print self._wraper.get_xml()
            for entry in self._tree.iter("entry"):
                self._make_deliv_record(entry)

        if len(self._entriesFoundInText) > 3:
            self.__debug("Deliverables' descriptions contain keywords")
            self.__debug("Found {0} deliv records".format(
                len(self._entriesFoundInText)))

            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 3:
            self.__debug("Deliverables' links contain keywords")
            self.__debug("Found {0} deliv records".format(
                len(self._entriesFoundInLinks)))

            self._records = self._entriesFoundInLinks
        else:
            self._manual_processing()

    "This method is called when ther was no records found in output of sequencewrapper"

    def _manual_processing(self):
        self._entriesFoundInLinks = []
        self._entriesFoundInText = []
        self._manual_process_page(self._urls, urlsplit(self._urls[0])[1])
        if len(self._entriesFoundInText) > 0:
            self.__debug("Deliverables' descriptions contain keywords")
            self.__debug("Found {0} deliv records".format(
                len(self._entriesFoundInText)))

            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 0:
            self.__debug("Deliverables' links contain keywords")
            self.__debug("Found {0} deliv records".format(
                len(self._entriesFoundInLinks)))

            self._records = self._entriesFoundInLinks

    ########################### TABLE HANDLING METHODS ############################
    """ Get texts from element and his descendants.
    If string isset, returns texts as one string with spaces.
    # elem: lxml element """

    def _get_descendants_texts(self, elem, string=True):
        texts = []
        for child in elem.iter():
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    """ Get link from row of table - go through columns and the only href
    leading to deliverable is returned. """

    def _get_row_link(self, row):
        # find all anchors where parent is row
        linkanch = row.findall('.//a[@href]')
        if len(linkanch) == 0:
            return None
        for link in linkanch:
            anchor_link = link.get('href')
            if self.agent.is_wanted_mime(anchor_link):  # check if it is a file we want
                return anchor_link
        return None

    """ Handle region as a table.
    Work with region as it's a table. Try to get table semantic (table order)
    and get all records out of it. """

    def _handle_table(self):
        for row in self.parentetree:
            if not row.tag == 'tr':
                continue
            row_list = []
            _thislink = self._get_row_link(row)
            if _thislink is None:
                continue

            for column in row:
                text = self._get_descendants_texts(column)
                if not text:
                    continue
                row_list.append(text)

            res = self._deliv_in_text(row_list, [_thislink])
            if type(res) == RRSPublication:
                self._entriesFoundInText.append(res)
                self.__debug("Record found cause of text")
            else:
                res = self._deliv_in_link(row_list, [_thislink])
                if type(res) == RRSPublication:
                    self._entriesFoundInLinks.append(res)
                    self.__debug("Record found cause of link")
            del row_list
        return

########################  TAG SEQUENCE RECOGNIZING METHODS ####################

    """ Tag check.
    If it is anchor with href leading to deliverable, returns True """
    def _is_deliv_anch(self, tag):
        if tag.tag == 'a':
            href = tag.get('href')
            if self.agent.is_wanted_mime(href):
                return True
        return False

    """ Filters useless and messy tags.
    Return false if useless, true if normal tag """

    def _tagfilter(self, tag):
        if tag.tag in self._omitted_tags:
            return False
        #if tag.text:
        #    if not re.search("[a-z0-9\[\]]", tag.text, re.I):
        #        return False
        return True

    """ Gets difference between first two anchors. """

    def _getdiff(self, reg, tol):
        # etree reg = element tree region
        # int tol: accepted tolerance of tags
        d = {}
        index = 0
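        # illustrative case: if deliverable anchors appear every 4 filtered
        # tags (h3 a p span  h3 a p span ...), d ends up dominated by {4: n}
        # and 4 is returned as the most frequent interval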
        # fill the dictionary with differences and their occurences
        for tag in reg.iter():
            if not self._tagfilter(tag):
                continue
            if self._is_deliv_anch(tag) and not index == 0:
                try:
                    d[index] += 1
                except KeyError:
                    d[index] = 1
                index = 0
            index += 1
        # check the differences - their variety must not exceed the tolerance $tol
        difflist = d.keys()
        self.__debug("difflist: " + str(difflist))
        if len(difflist) == 0:
            return -1
        _max = max(difflist)
        _min = min(difflist)
        dlen = len(d.keys())
        if dlen == 1:
            return d.keys()[0]
        if dlen > ((2 * tol) + 1):  # tolerance to both sides
            return -1
        if (_max - _min) > 2 * tol:  # some acceptable tolerance
            return -1
        # get the most frequent difference
        most_freq = max(d.values())
        for key in d:
            if d[key] == most_freq:
                return key
        return -1

    """ Only anchors found. No optional information. """

    def _get_anch_only(self):
        anchlist = self.agent.find_anchor_elem(self.baseUrl, self.parentetree)
        # We have to make list of list because XMLOutput
        return [[anch] for anch in anchlist]

    """ Main method handling tag sequences and recognizing records.
    Returns list of records. """

    def _get_tag_sequences(self, tag_tol=1):
        records = []
        self._rec = []
        if len(self.parentetree) == 0:
            return [[self.parentetree]]
        # get interval between anchors, use tolerance tag_tol
        self.difference = self._getdiff(self.parentetree, self.tagtol)
        while self.difference == -1:
            if self.tagtol > 5:
                self.__verbose("Variety of intervals between anchors is too huge. "+\
                               "Getting data out of anchors only")
                return self._get_anch_only()
            self.tagtol += 1
            self.difference = self._getdiff(self.parentetree, self.tagtol)

        # get sequence of first n tags, where n is average interval between anchors
        # this could be tag-sequence describing all records in region.
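        # illustrative case: with difference == 4 and a region starting
        # "h3 a p span h3 ...", record_seq becomes ['h3', 'a', 'p', 'span']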
        self.record_seq = []
        i = 0
        for tag in self.parentetree.iter():
            if not self._tagfilter(tag):
                continue
            if i >= self.difference:
                if 'a' not in self.record_seq:
                    del self.record_seq[0]
                else:
                    break
            self.record_seq.append(tag.tag)
            i += 1

        # counter indicates on which position in tag sequence we actually are
        counter = 0
        # make sequence of tags as they go
        regionlist = filter(self._tagfilter,
                            [tag for tag in self.parentetree.iter()])
        recseqlen = len(self.record_seq)
        reglistlen = len(regionlist)

        # flag indicating the beginning of records - the start of the region can contain garbage
        self.begin = False
        # indicating unpredictable separator between deliverable records
        self.separator = 0
        for i, tag in enumerate(regionlist):
            # skip and save the sequence at the end
            if counter > self.difference - 1:
                records.append(self._rec)  # save
                self._rec = []  # erase the list
                counter = 0  # reset counter
            if not self.begin:
                if tag.tag != self.record_seq[0]:
                    continue
                else:
                    try:
                        if regionlist[i + 1].tag != self.record_seq[1]:
                            continue
                    except IndexError:
                        pass
                    self.begin = True
            # handle tolerances, try to compare sibling tags
            self.match = False  # match flag

            # tolerance algorithm. Goes through the html and tries to skip irregular tags in the sequence.
            for tol in range(self.tagtol + 1):
                if tag.tag == self.record_seq[(counter + tol) % recseqlen] or \
                   regionlist[(i + tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter += tol + 1
                    break
                elif tag.tag == self.record_seq[(counter - tol) % recseqlen] or \
                   regionlist[(i - tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter -= tol
                    counter += 1
                    break
            # if nothing matched, it's probably out of tolerance
            if not self.match:
                self.separator += 1
                # tolerate 10 separators (tags between boxes or tables of deliverables)
                if self.separator > 10:
                    self.__verbose("Tag sequence doesn't match, probably out of "+\
                                "tolerance, getting data out of anchors only")
                    # maybe here could be tolerance++
                    # we didn't catch the sequence with tolerance...
                    return self._get_anch_only()
        records.append(self._rec)
        return filter(self._validseq, records)

    """ Helper method - check if sequence of tags rec contains deliv anchor
    """

    def _validseq(self, rec):
        for _atr in rec:
            # if we have anchor containing link to document, return true
            if self._is_deliv_anch(_atr):
                return True
        return False

    """ Get element texts only, dont look for descendants texts """

    def _get_tag_content(self, tag):
        links = []
        texts = []
        if tag.tag == 'a':
            href = tag.get('href')
            # if link leading to document found, add string to list
            if href is not None and self.agent.is_wanted_mime(href):
                links.append(self.formatter.format(href))
            title = tag.get('title')
            # if title found in tag, add string to list
            if title:
                texts.append(self.formatter.format(title))
        # if not anchor, search text in tag.text
        if tag.text:
            if re.search("[a-z0-9]", tag.text, re.I):
                texts.append(self.formatter.format(tag.text))
        return [links, texts]

    """ Harvest texts out of tags and return list of lists (record) """

    def _harvest_text(self, record_tag_list):
        self._records = []
        self._rec = []
        _links = []
        _texts = []
        # loop over records and search all possible useful texts
        for rec_list in record_tag_list:
            for tag in rec_list:
                harvested = self._get_tag_content(tag)
                _links.extend(harvested[0])
                _texts.extend(harvested[1])
            #self._records.append(self._rec)
            res = self._deliv_in_text(_texts, _links)
            if type(res) == RRSPublication:
                self._entriesFoundInText.append(res)
                self.__debug("Record found cause of text")
            else:
                res = self._deliv_in_link(_texts, _links)
                if type(res) == RRSPublication:
                    self._entriesFoundInLinks.append(res)
                    self.__debug("Record found cause of link")
            _links = []
            _texts = []
            self._rec = []
        return self._records

    """ Text harvesting for sequences. """

    def _handle_sequence(self):
        seq = self._get_tag_sequences()
        return self._harvest_text(seq)

    """ Get records from region according document links
        this method is used when there was no records found
        in output of sequencewrapper"""

    def _manual_process_page(self, links, baseurl):
        _err = None
        self.baseUrl = baseurl

        for link in links:
            # find region with tolerance
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if type(self.parentetree) == tuple:
                # error
                _err = self.parentetree
                self.__debug(_err)
                continue
            # make all links absolute in the parent tree
            hrefs = self.parentetree.findall('.//a[@href]')
            for href in hrefs:
                href.make_links_absolute('http://' + urlsplit(link)[1] + '/')

            # get the charset. We don't have an etree in htmlHandler,
            # so we have to use the one from regionHandler
            self.formatter.set_charset(
                self.regionHandler.formatter.get_charset())

            self.__debug("*" * 100 + '\n' + "*" * 40 + " DATA REGION " +
                         "*" * 40)
            self.__debug(
                lxml.etree.tostring(self.parentetree, pretty_print=True))
            # get root tag
            try:
                self.parentetree = self.parentetree.getroot()
            except AttributeError:
                pass

            # Parent tag is table
            # call _handle_table
            if self.parentetree.tag in ('table', 'tbody'):
                self.__verbose("Handling table")
                self._handle_table()
            else:
                self.__verbose("Handling sequences")
                self._handle_sequence()

############# PUBLIC METHODS TO GET RESULTS #############

    def get_deliverables_XML(self):
        """return infromations about deliverables stored in objects as xml"""
        if len(self.get_deliverables()) == 0:
            return derrno.__err__(derrno.ENOREC)
        output = StringIO.StringIO()
        converter = Model2XMLConverter(stream=output)
        converter.convert(self.get_deliverables())
        result = output.getvalue()
        output.close()
        return result

    def get_deliverables(self):
        """return objects containing infromations"""
        if len(self._records) == 0:
            return derrno.__err__(derrno.ENOREC)
        else:
            return self._records
Example no. 9
    def __init__(self):
        # init agent for parsing html
        self.agent = GetHTMLAndParse()

        # format text
        self.formatter = TextFormatUtils()
Example no. 10
class GetDeliverableRegion:
    def __init__(self):
        # init agent for parsing html
        self.agent = GetHTMLAndParse()

        # format text
        self.formatter = TextFormatUtils()

    """ Get data region.
    Returns element tree with region where are deliverables stored """

    def get_region(self, url, base, tolerance):
        (_res, _err) = self.agent.ghap(url)
        if _res == -1:
            return derrno.__err__(_err)
        else:
            self._page = self.agent.get_etree()

        deliv_elements = self.agent.find_anchor_elem(base=base)
        if len(deliv_elements) == 0:
            return derrno.__err__(derrno.ENODOC, url)
        if len(deliv_elements) == 1:
            return lxml.etree.ElementTree(deliv_elements[0])

        # get parent tag of all deliverable anchors
        parent_element = self._get_common_parent(deliv_elements, tolerance)
        if parent_element is None:
            return derrno.__err__(derrno.ENOREG, "Parent element not found.")

        # get the region out of the parent element
        region = self._get_deliverable_region(parent_element)
        # if the parent tag itself is the region
        if region == 0:
            # return an element tree made from the parent tag
            return lxml.etree.ElementTree(parent_element)
        #lxml.etree.ElementTree(region).write(sys.stdout,pretty_print=True)
        return region  # else return the region

    """ Stabile searching parent of all elements in elem_list
    using method of making element parent vectors and comparing them.
    Tolerance of n tags makes the region smaller if there are
    >>not deliverable<< pdfs in more regions on the page."""

    def _get_common_parent(self, elem_list, tolerance):

        # supporting method - kind of a bigger lambda. Gets the minimal
        # length of the inner lists.
        def _minlength(seq_list):
            return min([len(seq) for seq in seq_list])

        # next supporting method: check the elements in the list.
        # if the elements are the same, it's a common parent tag - return True.
        def _iscommon(elem_seq, tol):
            tol_list = []
            for elem in elem_seq:
                if elem not in tol_list:
                    tol_list.append(elem)
            if len(tol_list) > tol + 1:
                return False
            # if only two anchors were found then we have only two tags
            # and it's pretty hard to use tolerance, so we omit it.
            if len(elem_seq) < 3 and len(tol_list) > 1:
                return False
            return True

        # get the most frequent tag in the list
        def _most_frequent(seq):
            suplist = []
            suplist_freq = []
            for el in seq:
                if el not in suplist:
                    suplist.append(el)
                    suplist_freq.append(1)
                else:
                    suplist_freq[suplist.index(el)] += 1
            ind = suplist_freq.index(max(suplist_freq))
            return suplist[ind]

        #
        # now continue with method _get_common_parent()
        #
        vectors = []  # here will be vectors stored - list of lists
        for self.elem in elem_list:
            _vector = []
            while True:
                parent = self.elem.getparent()  # exception possible here
                if parent is None:
                    break
                _vector.append(parent)
                self.elem = parent
            vectors.append(_vector)
        # We have parent vectors of all elements from elem_list stored in list
        # $vectors. Then zip the vector list and get sequences of parent tags (and the
        # other tags) sorted from the highest to the lowest parent element.
        zipped = [[row[-i] for row in vectors]
                  for i in range(1, _minlength(vectors) + 1)]
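        # illustrative case: for two anchors whose parent chains are both
        # [td, tr, table, body, html], zipped is [[html, html], [body, body],
        # [table, table], ...] from the root down; the last all-equal level
        # holds the common parent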
        # now check all the lists in zipped. If a list is filled with the same
        # elements, it's a common parent. The last list before a difference
        # contains the main parent tag.
        self.last_seq = []
        for zipvect in zipped:
            if not _iscommon(zipvect, tolerance):
                # return most frequented element in last vector
                return _most_frequent(self.last_seq)
            self.last_seq = zipvect
        return _most_frequent(self.last_seq)

    """ Get texts from element and his descendants.
    If string is True, returns texts as one string with spaces.
    elem: lxml element """

    def _get_element_texts(self, elem, string=True):
        texts = []
        for child in elem.iter():
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    """ Get deliverable region - returns etree with region.
     If 0 returned parent_tag is region,
     if -1 returned some error occured searching,
     if html string returned its a region. """

    def _get_deliverable_region(self, parent_tag):
        def _convert_tag_to_html(tag):
            tag_html = lxml.etree.ElementTree(tag)
            return lxml.etree.tostring(tag_html)

        # list[0] = type, list[1] = attribute, list[2] = lxml tag element
        # in the case of headers list[0] = element.tag, then [2] is the element
        _reg_atr = ['', None, None]
        self._result_html_region = ''
        reg_flag = False  # flag indicating that we are looping over region
        # get headers first
        headers = []
        #lxml.etree.ElementTree(parent_tag).write(sys.stdout,pretty_print=True)
        for i in range(1, 7):
            headers.extend(parent_tag.findall('.//h' + str(i)))
        children = parent_tag.getchildren()
        if len(headers) > 0:
            for head in headers:
                text = self._get_element_texts(head)
                if text:
                    if re.search("deliverables", text, re.I):
                        _reg_atr[0] = head.tag
                        _reg_atr[2] = head
                        break
            if _reg_atr[2] is None:
                return 0
            # visit all tags in parent_tag
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'img': continue
                text = self._get_element_texts(tag)
                if tag.tag == 'a' and not tag.text:
                    if tag.find('img') is not None:
                        text = tag.find('img').tail
                    else:
                        text = ' '
                if text:
                    if re.search("deliverables", text, re.I) and \
                        tag.tag == _reg_atr[0]:
                        # "deliverable" title, BEGIN of region
                        reg_flag = True
                    elif not re.search("deliverables", text, re.I) and \
                        tag.tag == _reg_atr[0]:
                        # next similar title, END of region
                        if reg_flag:
                            break
                # region content
                if tag in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag)
                elif tag.getparent() in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(
                        tag.getparent())
                    children.remove(tag.getparent())
        # if we don't have headers, try to find another kind of header (title)
        # "Deliverables" and compare it with other elements with the same class or id.
        else:
            for tag in parent_tag.iter():
                if tag.text:
                    if re.search("deliverables", tag.text, re.I):
                        if tag.get("class"):
                            _reg_atr[0] = 'class'
                            _reg_atr[1] = tag.get("class")
                            _reg_atr[2] = tag
                            break
                        elif tag.get("id"):
                            _reg_atr[0] = 'id'
                            _reg_atr[1] = tag.get("id")
                            _reg_atr[2] = tag
                            break
                        elif tag.get("style"):
                            _reg_atr[0] = 'style'
                            _reg_atr[1] = tag.get("style")
                            _reg_atr[2] = tag
                            break

            # test _reg_atr. If there is no deliverable region, then all
            # documents make the region
            if _reg_atr[2] is None:
                return 0
            reg_flag = False
            # visit all tags in parent_tag
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'a' and not tag.text:
                    if tag.find('img') is not None:
                        tag.text = tag.find('img').tail
                    else:
                        tag.text = ' '
                if tag.text:
                    if re.search("deliverables", tag.text, re.I) and \
                        tag.get(_reg_atr[0]) == _reg_atr[1]:
                        # "deliverable" title, BEGIN of region
                        reg_flag = True
                    elif not re.search("deliverables", tag.text, re.I) and \
                        tag.get(_reg_atr[0]) == _reg_atr[1]:
                        # next similar title, END of region
                        if reg_flag:
                            break
                # region content
                if tag in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag)
                    children.remove(tag)
                elif tag.getparent() in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(
                        tag.getparent())
                    children.remove(tag.getparent())
        if not self._result_html_region:
            return 0
        # create ElementTree from region
        try:
            return lxml.etree.fromstring(self._result_html_region)
        except (lxml.etree.XMLSyntaxError, ValueError):
            try:
                parser = lxml.etree.HTMLParser()
                return lxml.etree.fromstring(self._result_html_region, parser)
            except lxml.etree.XMLSyntaxError:
                return 0
class GetDelivPage:

    def __init__(self, url, verbose=False, debug=False, addkeyw=None):
        # keywords used for document page search
        self._sigwords = ["d((eliverables?)|[0-9])",
                          "documents?",
                          "reports?",
                          "public(ation)?s?",
                          "results?",             
                          "presentations?",
                          "library",
                           #"projects?",
                          "outocomes?", "downloads?",
                          "outputs?"]
        
        if addkeyw is not None:
            self._sigwords.append(addkeyw)

        """ Associative array containing links with their flags
        { url : [Index/NoIndex/Frame, Visit/Visited, Rank] }
        index = 0, noindex = 1, frame = 2, unvisited = 0, visited = 1 """
        self._link_stack = { url : [0,0,0] }

        self.base_url = url # save base (input) url

        # Open a parsing agent to get the needed data from the page
        self.agent = GetHTMLAndParse()

        self._current_url = url

        # a constant used to set rank in order of importance of the expression 
        # being tested (self._sigwords)
        self.rank_const = len(self._sigwords)

        # a few constants for the dictionary - just for readable source code
        self.IND_FR = 0 # index/noindex/frame/special
        self.VISIT = 1 # unvisited/visited
        self.RANK = 2 # value of rank

        # set verbose flag
        self.__verbose__ = verbose

        #set debug flag
        self.__dbg__ = debug
        
        # checking data types
        if not isinstance(self.__verbose__, bool):
            raise ValueError("Verbose flag has to be boolean.")


    def __verbose(self, msg):
        _err = "cannot decode verbose message."
        if self.__verbose__ == True:
            try:
                print(str(msg))
            except UnicodeError:
                print(_err) 

        
    def __debug(self, msg):
        _err = "cannot decode debug info."
        if self.__dbg__ == True:
            try:
                print("Debug message:    "+str(msg))
            except UnicodeError:
                print(_err) 

################################################################################

    """ Initialize item in dictionary to noindex/unvisited/rank=0 """
    def _link_item_init__(self, link, index=1, visit=0, rank=0):
        # default setting: noindex,unvisited,norank
        if not self._link_stack.has_key(link):
            self._link_stack[link] = [index, visit, rank]
        return


    """ Edits item in dictionary self._link_stack """
    def _link_item_edit(self, link, index=None, visit=None, rank=None):
        if index is not None:
            self._link_stack[link][self.IND_FR] = index
        if visit is not None:
            self._link_stack[link][self.VISIT] = visit
        if rank is not None:
            # null rank if zero is argument
            if rank == 0:
                self._link_stack[link][self.RANK] = 0
            # add rank
            else:
                self._link_stack[link][self.RANK] += rank
        return

    
    """ Method representing one level of cascade. Do almost any job to search 
    one word in dictionary """
    def _level_job(self, index=None):
        # get list of links from anchors containing one of expression
        # from self_sigwords
        result = 0
        if index is not None: # searching with one keyword
            link_list = self.agent.get_all_links(
                regul=re.compile(self._sigwords[index], re.I),
                base=self._current_url)
        else:
            link_list = self.agent.get_all_links(base=self._current_url)
            index = self.rank_const
        if link_list:
            #
            #   RANK giving & filter
            #       
            if index is None:
                rank = 0
            elif index == 0:
                rank = self.rank_const * 2
            else:
                rank = self.rank_const - index
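            # illustrative scores with the ten default keywords (rank_const == 10):
            # a "deliverables" match (index 0) gives rank 20, "documents"
            # (index 1) gives 9, "reports" (index 2) gives 8, and so on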
            for link in link_list:
                # GTFO javascript
                if not link or "javascript:" in link or "mailto:" in link: 
                    continue
                if "#" in link: # if pointer delete it
                    link = re.sub('#.*$', '', link)
                if len(link) > 200:  
                    continue                
                if self._link_stack.get(link):
                    # RANK if you see those links for first
                    if self._link_stack[link][self.VISIT] == 0:
                        self._link_item_edit(self._current_url, rank=rank)
                    continue
                if not self.agent.compare_domains(self.base_url, link):
                    continue

                split_link = re.sub("https?://.+?/", "", link)
                # check whether it is file or not
                 
                if self.agent.is_wanted_mime(link):
                    #
                    #   Some PDF or DOC found
                    #
                    # RANK
                    self._link_item_edit(self._current_url, rank=10)
                    self.__debug("Added rank 10 to "+self._current_url)
                    # 
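                    # matches e.g. "deliverable3", "del4_2" or "d12" in the link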
                    if re.search("de?l?(iverable)?[0-9]+([\._-][0-9])?", 
                                  split_link, re.I):
                        self.__debug("Type D on "+self._current_url) # debug print
                        # RANK
                        self._link_item_edit(self._current_url, rank=100)
                    continue
                elif not self.agent.is_page(link):
                    self.__debug("UNWANTED")
                    continue
                #
                # Add link
                #
                # RANK
                # initialization of link item in dict
                self._link_item_init__(link)
                self._link_item_edit(self._current_url, rank=rank)
                result += 1
                # debug print
                self.__debug("ADD "+link[7:60])
                self.__debug("Rank "+str(rank)+" "+self._current_url)    
        return result


    """ Cascade search. May improve the speed of script """
    def _cascade_search(self):
        result = 0
        # first cascade - look for links containing "deliverables"
        result += self._level_job(0)
        if not result == 0:
            return
        # second cascade - look for links containing "documents" and "publications"
        result += self._level_job(1) 
        result += self._level_job(2)
        if not result == 0:
            return
        # last cascade - all the rest
        for i in range(3, self.rank_const):
            result += self._level_job(i)
        # check Intro page (all links) only on index
        if result == 0 and self._link_stack[self._current_url][0] == 0:
            result += self._level_job() 
        """if result == 0:
            # RANK DOWN
            self._link_item_edit(self._current_url, rank=0)
            print "No anchors on the page"""
        return


    """ TRY TO repair link. But for now only append / in base """
    def _repair_links(self, base=None):
        if base is None:
            base = self.base_url
        if re.match(".*[^/]$", base):
            base += "/"
        if self.agent.get_etree() == -1:
            return -1
        links = self.agent.get_all_links(base=base)
        # compare link with base url
        for link in links:
            if not self.agent.compare_domains(self.base_url, link):
                continue
            link = re.sub("https?://.+?/", base, link)
            # if match, save it as special case
            self._link_item_init__(link, index=3)


    """ Checking intro page. It is page without content, only with Enter label """
    def _check_intro(self):
        links = self.agent.get_all_links(base=self._current_url)
        self.__debug("We've found intro links: "+str(links))
        for link in links:
            if not self.agent.compare_domains(self.base_url, link):
                continue
            # save new link as normal page
            self._link_item_init__(link, index=1)
   

    """ Looks for frames on the page """
    def _check_frames(self):
        frames = self.agent.look_for_frame(base=self._current_url)
        if not frames:
            return None
        fcount = len(frames)
        # debug print
        self.__debug("We've found frames ("+str(fcount)+") on "+self._current_url) 
        # save new link as frame page
        for link in frames:
            if self.agent.compare_domains(self._current_url, link):
                self._link_item_init__(link, index=2)
        return fcount

    
    """ Checks for titles and gives rank according the result """
    def _check_titles(self):
        for i in range(self.rank_const):
            hcount = self.agent.count_all_headers(
                re.compile(self._sigwords[i], re.I))
            if not hcount == 0:
                if i == 0: 
                    #
                    # "deliverable" match, the highest rank
                    #
                    # RANK constant is multiplied by 4
                    self.__debug("deliverable match"+str(self.rank_const *
                    4)+" "+self._current_url)
                    self._link_item_edit(self._current_url,
                                         rank=self.rank_const * 4)
                else:
                    #
                    # other word match
                    #
                    # do not multiplied rank constant
                    self.__debug("Rank "+str(self.rank_const - i)+" "+self._current_url) 
                    self._link_item_edit(self._current_url,
                                         rank=self.rank_const - i)


    """ Get information about current link """
    def _check_anchor(self):
        # tt is Text and Title
        tt = self.agent.get_anchor_from_link(self._current_url)
        # return 0 if no anchor match
        if tt == 0: return tt
        # match for deliverables
        if re.search(self._sigwords[0], tt, re.I):
            self.__debug("Anchor matched "+self._current_url) # debug print
            return 1
        

    """ Returns list of unvisited links. Useful in cycle. """
    def _check_unvisited_links(self):
        unvisitedLinks = []
        for link in self._link_stack:
            if self._link_stack[link][self.VISIT] == 0: # if unvisited
                unvisitedLinks.append(link)
        return unvisitedLinks # list of unvisited page links

    
    """ Aplying all methods to unvisited links - next level of searching. 
    It is main private method. Only this method can decide end of searching """
    def _handle_unvis_links(self):
        unvisLinks = self._check_unvisited_links()
        if not unvisLinks:
            return None # end of searching
        for link in unvisLinks: # cycle in unvisited links
            # visit and parse page
            self._link_item_edit(link, visit=1)

            (res, err) = self.agent.ghap(link)
            if res == -1:
                self.__debug(str(err)+" "+str(link)) # debug print
                # if link is broken (IND_FR == 3)
                if self._link_stack[link][self.IND_FR] != 3:
                    self._repair_links()
                continue
            # little hack with the error value: there is no error, but the new base URL!
            if res == 2:
                self.base_url = err # URL of the new base
            self.__debug("Getting url in ghap(): "+str(link)) # debug print
            self.__verbose("Searching... URL: "+str(link)) # verbose print
            self._current_url = link
            if self._link_stack[link][self.IND_FR] == 2:
                dname = self.agent.get_domain_name(link)
                if dname is not None:
                    self.base_url = dname

            ###############
            # frame check #
            self._check_frames()

            ################
            # titles check #
            self._check_titles() # rank giving here

            ################
            # anchor check #
            if self._check_anchor():
                self._link_item_edit(link, rank=10) # rank giving here too

            self._cascade_search() # search for next links on this page
        # when no unvisited links in list, return
        return 1


    """ Returns link of the highest value of rank in self._link_stack. 
    It is called in the end of process."""
    def _get_highest_ranks_link(self):
        hRank = 0
        hLink = ""
        # check all links and choose link with the highest rank
        for link in self._link_stack:
            if self._link_stack[link][self.RANK] > hRank:
                hLink = link
                hRank = self._link_stack[link][self.RANK]
        return hLink # WINNER


    """ Returns list of all links leading to deliverables. 
    Try to find more sites with deliverables.. i.e. like www.awissenet.com has.
    Maybe test for name of link - anchor: i.e. next, prev, [0-9]+ and so one...
    Page usualy looks like:       next pages: 1 2 3 4 ... """
    def _get_deliv_link_list(self, first_link):
        # agent gets first_link
        final_list = []
        nonvisited = [first_link]
        current = nonvisited.pop()
        while current:
            if not current or "javascript:" in current or "mailto:" in current:
                try:
                    current = nonvisited.pop()
                except IndexError:
                    break
                continue
            if self.agent.ghap(current)[0] == -1: # CACHE ??? maybe
                try:
                    current = nonvisited.pop()
                except IndexError:
                    break
                continue

            nonvisited.extend(self.agent.get_pager_links(base=current))
            final_list.append(current) # append only one link
            try:
                current = nonvisited.pop()
            except IndexError:
                break
        return final_list # returning all pages with deliverables
    

    """ Returns a list of links to pages with deliverable documents.
    On success returns the list, otherwise a derrno error is returned.
    The only public method of this class. """
    def get_deliverable_page(self):
        # the main searching loop 
        # while we have some unvisited links, search
        while self._handle_unvis_links():
            # safety limit - do not stack more than 10 links
            if len(self._link_stack) > 10:
                break
            self.__debug("Stack content: "+str(self._link_stack))
        if len(self._link_stack) == 1:
            return derrno.__err__(derrno.ELNOTFOUND)

        final_link = self._get_highest_ranks_link()
        if not final_link or self._link_stack[final_link][2] == 0:
            return derrno.__err__(derrno.ELNOTFOUND)
        self.__debug('#'*79)
        self.__debug("DELIVERABLE PAGE: "+final_link)
        return [final_link]
        
        ####### not in use #############
        #result = self._get_deliv_link_list(final_link)
        #if len(result) == 0:
        #    return derrno.__err__(derrno.ELNOTFOUND)
        #else:
        #    return result
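
# Illustrative sketch (not part of the original module): the link-stack
# bookkeeping used by the class above. Each URL maps to a list
# [index_flag, visited, rank]; keyword hits raise the rank, and the search
# winner is simply the entry with the highest rank, exactly as
# _get_highest_ranks_link() does.
if __name__ == "__main__":
    link_stack = {
        "http://example.org/":             [0, 1, 0],
        "http://example.org/deliverables": [0, 1, 10],
        "http://example.org/contact":      [0, 1, 1],
    }
    best = max(link_stack, key=lambda url: link_stack[url][2])
    print(best)  # -> http://example.org/deliverables
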
class GetDelivRecords:


    def __init__(self,verbose=False,debug=False):
        self.__dbg__  = debug
        self.__verbos = verbose
        self._crawler = Crawler()
        self._crawler.set_headers((
                   ('User-Agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.19) Gecko/2010040116 Ubuntu/9.04 (jaunty) Firefox/3.0.19'), \
                   ('Accept', 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8')
                 ))
        self._wraper = HTMLSequenceWrapper(childcoef=5.0, headercoef=3.0, mintextlen=20)
             
        self._unwanted_titles = ['Download here','PDF format']
        self._records = []

        ################################
        #manual processing
        self.agent = GetHTMLAndParse()
        # to get region where to search for records
        self.regionHandler = GetDeliverableRegion()
        # init text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()
        
        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance
        self.tagtol = 1
        

    def __debug(self, msg):
        _err = "cannot decode debug info."
        if self.__dbg__ == True:
            try:
                print("Debug message:    "+str(msg))
            except UnicodeError:
                print(_err)

    def __verbose(self, msg):
        _err = "cannot decode verbose info."
        if self.__verbos == True:
            try:
                print("Verbose:    "+str(msg))
            except UnicodeError:
                print(_err)

    
    
######################## Processing sequencewrapper output ######################
    """ Takes one entry from the sequence wrapper output, tries to create
    a deliverable record from it and returns True on success. """
    def _make_deliv_record(self, entry):
        
        text  = []
        links = []

        # harvest links and text from the entry
        for e in entry.iter():
            if e.text != None:
                text.append(e.text)
            link = e.attrib.get("link")
            if link != None:
                if self.agent.is_wanted_mime(link) and link not in links:
                    links.append(link)


        res = self._deliv_in_text(text,links)
        if type(res) == RRSPublication:
            self._entriesFoundInText.append(res)
            self.__debug("Record found cause of text")
            return True

        elif type(res) == list:
            # probably more than one deliverable packed in a single entry
            if self._more_entry_in_record(entry):
                self.__debug("Entry contained more than one record")
                return True
            else:
                return False

        res = self._deliv_in_link(text,links,entry)
        if type(res) == RRSPublication:
            self._entriesFoundInLinks.append(res)
            self.__debug("Record found cause of link")
            return True

        return False

    """ Look for a deliverable keyword in the entry texts. """
    def _deliv_in_text(self, text, links):

        _title = False
        _description = ""
        pattern = re.compile("(DELIVERABLES?)|(D[0-9]+(\.[0-9]+)?)", re.I)
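        # e.g. the pattern accepts "Deliverable 4", "D2.1 Dissemination plan"
        # or "D10", but not a bare section number such as "2.1"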

        # loop through the entry texts looking for a title and a description
        for t in text:
            if _title == False and pattern.search(t):
                _title = t
            # the longest string becomes the description of the deliverable
            if len(_description) < len(t):
                _description = t

        if _title == _description:
            _description = ""

        _link = False

        if type(links) == str:
            if self.agent.is_wanted_mime(links):
                _link = links
        elif type(links) == list:
            for l in links:
                if self.agent.is_wanted_mime(l):
                    if _link == False:
                        _link = l
                    else:
                        # a link was already found; if the two links differ only
                        # in the file extension they point to the same record
                        if _link[:s.rfind(_link, '.')] == l[:s.rfind(l, '.')]:
                            break
                        else:
                            return ['-3', 'Probably more records in one entry']

        
        # create the publication object
        if _title:
            pub = RRSPublication(title=_title, abstract=_description)
            _typ = RRSPublication_type(type='techreport')
            pub['type'] = _typ
            self.__debug("*"*40)
            self.__debug("Title: "+_title)
            self.__debug("Description: "+_description)



            if _link:
                self.__debug("Link: "+_link)
                l = RRSUrl(link=_link)
                pl_rel = RRSRelationshipPublicationUrl()
                pl_rel.set_entity(l)
                pub['url'] = pl_rel

            return pub
        else:
            # this entry is probably not a deliverable
            return False

    """ Look for a deliverable keyword in the harvested links. """
    def _deliv_in_link(self, text, links, entry=False):

        _title = False
        _description = ""
        pattern = re.compile("(DELIVERABLES?)|(D[0-9]+(\.[0-9]+)?)", re.I)

        _link = False

        for l in links:
            if pattern.search(l):
                if _link == False:
                    _link = l
                else:
                    return ['-3', 'Probably more records in one entry']



        # loop through the entry texts looking for a title and a description
        for t in text:
            if _title == False and len(t) > 10:
                _title = t
            # the longest string becomes the description of the deliverable
            if len(_description) < len(t):
                _description = t

        if _title == _description:
            _description = ""

        # if the chosen title is not valid, try to find a better one in the parent entry
        if _title and not self._check_title(_title) and entry != False:
            _title = self._repair_title(entry)
       
        
        # create the publication object
        if _link:
            pub = RRSPublication(title=_title, abstract=_description)
            typ = RRSPublication_type(type='techreport')
            pub['type'] = typ

            self.__debug("*"*40)
            self.__debug("Title: "+str(_title))
            self.__debug("Description: "+_description)
            
            self.__debug("Link: "+_link)
            l = RRSUrl(link=_link)
            pl_rel = RRSRelationshipPublicationUrl()
            pl_rel.set_entity(l)
            pub['url'] = pl_rel
            
            return pub
        else:
            # this entry is probably not a deliverable
            return False

    """ Check whether the title consists only of an unwanted string,
    with some length tolerance. Returns True if the title is OK. """
    def _check_title(self, title, tolerance=10):
        
        for t in self._unwanted_titles:
            if s.find(s.lower(title), s.lower(t)) != -1:
                if (len(t)+tolerance) > len(title):
                    return False
        return True

    """ Looks for the element with the highest visibility rank
    in the parent element. """
    def _repair_title(self, entry):
        parent = entry.getparent()
        visibility = 0
        title = ""
        for i in parent.iter():
            try:
                # visibility is stored as a string attribute - compare numerically
                vis = float(i.attrib.get('visibility'))
                if vis > visibility:
                    visibility = vis
                    title = i.text
            except (AttributeError, TypeError, ValueError):
                pass

        if title != "":
            return title
        else:
            return False

    """ Tries to create an array of deliverables from a single entry of the
    XML tree. Returns True if at least one record was created. """
    def _more_entry_in_record(self, entry):
        found = False
        for ch in entry.iter('chunk'):
            if ch.text != None and ch.attrib.get("link") != None:
                if self.agent.is_wanted_mime(ch.attrib.get("link")):
                    _pub = RRSPublication(title=ch.text)
                    typ = RRSPublication_type(type='techreport')
                    _pub['type'] = typ
                    _l = RRSUrl(link=ch.attrib.get("link"))
                    _rel = RRSRelationshipPublicationUrl()
                    _rel.set_entity(_l)
                    _pub['url'] = _rel
                    self._entriesFoundInLinks.append(_pub)
                    found = True
        return found

    """ Process the pages given by their URLs. """
    def process_pages(self, pages):
        self._entriesFoundInText = []
        self._entriesFoundInLinks = []
        self._urls = pages
        self._pages = self._crawler.start(pages)

        # create RRSPublication objects with information about deliverables
        for u in self._urls:
            self._wraper.wrap(self._pages[u], u)
            self._tree = self._wraper.get_etree()
            for entry in self._tree.iter("entry"):
                self._make_deliv_record(entry)

        if len(self._entriesFoundInText) > 3:
            self.__debug("Deliverables' descriptions contain keywords")
            self.__debug("Found {0} deliv records".format(len(self._entriesFoundInText)))
            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 3:
            self.__debug("Deliverables' links contain keywords")
            self.__debug("Found {0} deliv records".format(len(self._entriesFoundInLinks)))
            self._records = self._entriesFoundInLinks
        else:
            self._manual_processing()
            
    """ Called when no records were found in the sequence wrapper output. """
    def _manual_processing(self):
        self._entriesFoundInLinks = []
        self._entriesFoundInText = []
        self._manual_process_page(self._urls, urlsplit(self._urls[0])[1])
        if len(self._entriesFoundInText) > 0:
            self.__debug("Deliverables' descriptions contain keywords")
            self.__debug("Found {0} deliv records".format(len(self._entriesFoundInText)))
            self._records = self._entriesFoundInText
        elif len(self._entriesFoundInLinks) > 0:
            self.__debug("Deliverables' links contain keywords")
            self.__debug("Found {0} deliv records".format(len(self._entriesFoundInLinks)))
            self._records = self._entriesFoundInLinks

    ########################### TABLE HANDLING METHODS ############################

    """ Get texts from the element and its descendants.
    If string is True, returns the texts as one space-joined string.
    elem: lxml element """
    def _get_descendats_texts(self, elem, string=True):
        texts = []
        for child in elem.iter():
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts



    """ Get the link from a table row - go through the columns and return
    the only href leading to a deliverable. """
    def _get_row_link(self, row):
        # find all anchors where parent is row
        linkanch = row.findall('.//a[@href]')
        if len(linkanch) == 0:
            return None
        for link in linkanch:
            anchor_link = link.get('href')
            if self.agent.is_wanted_mime(anchor_link): # check if it is file we want
                return anchor_link
        return None


    """ Handle region as a table.
    Work with the region as if it were a table: try to get the table
    semantic (table order) and get all records out of it. """
    def _handle_table(self):
        for row in self.parentetree:
            if not row.tag == 'tr':
                continue
            row_list = []
            _thislink = self._get_row_link(row)
            if _thislink == None:
                continue

            for column in row:
                text = self._get_descendats_texts(column)
                if not text:
                    continue
                row_list.append(text)

            res = self._deliv_in_text(row_list, [_thislink])
            if type(res) == RRSPublication:
                self._entriesFoundInText.append(res)
                self.__debug("Record found cause of text")
            else:
                res = self._deliv_in_link(row_list, [_thislink])
                if type(res) == RRSPublication:
                    self._entriesFoundInLinks.append(res)
                    self.__debug("Record found cause of link")
            del(row_list)
        return 
       

########################  TAG SEQUENCE RECOGNIZING METHODS ####################

    """ Tag check.
    If it is anchor with href leading to deliverable, returns True """
    def _is_deliv_anch(self, tag):
        if tag.tag == 'a':
            href = tag.get('href')
            if self.agent.is_wanted_mime(href):
                return True
        return False


    """ Filters useless and messy tags.
    Return false if useless, true if normal tag """
    def _tagfilter(self, tag):
        if tag.tag in self._omitted_tags:
            return False
        #if tag.text:
        #    if not re.search("[a-z0-9\[\]]", tag.text, re.I):
        #        return False
        return True


    """ Gets the typical tag interval between deliverable anchors. """
    def _getdiff(self, reg, tol):
        # etree reg = element tree region
        # int tol: accepted tolerance of tags
        d = {}
        index = 0
        # fill the dictionary with differences and their occurences
        for tag in reg.iter():
            if not self._tagfilter(tag):
                continue
            if self._is_deliv_anch(tag) and not index == 0:
                try:
                    d[index] += 1
                except KeyError:
                    d[index] = 1
                index = 0
            index += 1
        # check the differences - the variety must not exceed the tolerance $tol
        difflist = d.keys()
        self.__debug("difflist: "+str(difflist))
        if len(difflist) == 0:
            return -1
        _max = max(difflist)
        _min = min(difflist)
        dlen = len(d.keys())
        if dlen == 1:
            return d.keys()[0]
        if dlen > ((2*tol)+1): # tolerance to both sides
            return -1
        if (_max - _min) > 2*tol: # some acceptable tolerance
            return -1
        # get the most frequent difference
        most_freq = max(d.values())
        for key in d:
            if d[key] == most_freq:
                return key
        return -1


    """ Only anchors found. No optional information. """
    def _get_anch_only(self):
        anchlist = self.agent.find_anchor_elem(self.baseUrl, self.parentetree)
        # We have to make list of list because XMLOutput
        return [[anch] for anch in anchlist]

    
    """ Main method handling tag sequences and recognizing records.
    Returns list of records. """
    def _get_tag_sequences(self, tag_tol=1):
        records = []
        self._rec = []
        if len(self.parentetree) == 0:
            return [[self.parentetree]]
        # get interval between anchors, use tolerance tag_tol
        self.difference = self._getdiff(self.parentetree, self.tagtol)
        while self.difference == -1:
            if self.tagtol > 5:
                self.__verbose("Variety of intervals between anchors is too huge. "+\
                               "Getting data out of anchors only")
                return self._get_anch_only()
            self.tagtol += 1
            self.difference = self._getdiff(self.parentetree, self.tagtol)

        # get sequence of first n tags, where n is average interval between anchors
        # this could be tag-sequence describing all records in region.
        self.record_seq = []
        i = 0
        for tag in self.parentetree.iter():
            if not self._tagfilter(tag):
                continue
            if i >= self.difference:
                if not 'a' in self.record_seq:
                    del self.record_seq[0]
                else:
                    break
            self.record_seq.append(tag.tag)
            i += 1

        # counter indicates on which position in tag sequence we actually are
        counter = 0
        # make sequence of tags as they go
        regionlist = filter(self._tagfilter, [tag for tag in self.parentetree.iter()])
        recseqlen = len(self.record_seq)
        reglistlen = len(regionlist)

        # flag indicating the beginning of records - the region may start with some garbage
        self.begin = False
        # indicating unpredictable separator between deliverable records
        self.separator = 0
        for i, tag in enumerate(regionlist):
            # skip and save the sequence at the end
            if counter > self.difference-1:
                records.append(self._rec) # save
                self._rec = [] # erase the list
                counter = 0 # reset counter
            if not self.begin:
                if tag.tag != self.record_seq[0]:
                    continue
                else:
                    try:
                        if regionlist[i+1].tag != self.record_seq[1]:
                            continue
                    except IndexError:
                        pass
                    self.begin = True
            # handle tolerances, try to compare sibling tags
            self.match = False # match flag

            # tolerance algorithm. Goes through html and tries to pass irregular tags in sequence.
            for tol in range(self.tagtol+1):
                if tag.tag == self.record_seq[(counter + tol) % recseqlen] or \
                   regionlist[(i + tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter += tol+1
                    break
                elif tag.tag == self.record_seq[(counter - tol) % recseqlen] or \
                   regionlist[(i - tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter -= tol
                    counter += 1
                    break
            # if nothing matched, it's probably out of tolerance
            if not self.match:
                self.separator += 1
                # tolerate up to 10 separators (tags between boxes or tables of deliverables)
                if self.separator > 10:
                    self.__verbose("Tag sequence doesn't match, probably out of "+\
                                "tolerance, getting data out of anchors only")
                    # maybe the tolerance could be increased here
                    # we didn't catch the sequence within the tolerance...
                    return self._get_anch_only()
        records.append(self._rec)
        return filter(self._validseq, records)


    """ Helper method - check if sequence of tags rec contains deliv anchor
    """
    def _validseq(self, rec):
        for _atr in rec:
            # if we have anchor containing link to document, return true
            if self._is_deliv_anch(_atr):
                return True
        return False


    """ Get the element's own texts only, don't look for descendants' texts """
    def _get_tag_content(self, tag):
        links = []
        texts = []
        if tag.tag == 'a':
            href = tag.get('href')
            # if link leading to document found, add string to list
            if href is not None and self.agent.is_wanted_mime(href):
                links.append(self.formatter.format(href))
            title = tag.get('title')
            # if title found in tag, add string to list
            if title:
                texts.append(self.formatter.format(title))
        # if not anchor, search text in tag.text
        if tag.text:
            if re.search("[a-z0-9]", tag.text, re.I):
                texts.append(self.formatter.format(tag.text))
        return [links,texts]


    """ Harvest texts out of tags and return list of lists (record) """
    def _harvest_text(self, record_tag_list):
        self._records = []
        self._rec = []
        _links = []
        _texts = []
        # loop over records and search all possible useful texts
        for rec_list in record_tag_list:
            for tag in rec_list:
                harvested = (self._get_tag_content(tag))
                _links.extend(harvested[0])
                _texts.extend(harvested[1])
            #self._records.append(self._rec)
            res = self._deliv_in_text(_texts, _links)
            if type(res) == RRSPublication:
                 self._entriesFoundInText.append(res)
                 self.__debug("Record found cause of text")
            else:
                 res = self._deliv_in_link(_texts, _links)
                 if type(res) == RRSPublication:
                      self._entriesFoundInLinks.append(res)
                      self.__debug("Record found cause of link")
            _links = []
            _texts = []
            self._rec = []
        return self._records


    """ Text harvesting for sequences. """
    def _handle_sequence(self):
        seq = self._get_tag_sequences()
        return self._harvest_text(seq)



    """ Get records from the region according to document links.
    This method is used when no records were found in the output
    of the sequence wrapper. """
    def _manual_process_page(self, links, baseurl):
        _err = None
        self.baseUrl = baseurl

        for link in links:
            # find region with tolerance
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if type(self.parentetree) == tuple:
                # error
                _err = self.parentetree
                self.__debug(_err)
                continue
            #make all links absolute in parent tree
            hrefs = self.parentetree.findall('.//a[@href]')
            for href in hrefs:
                href.make_links_absolute('http://'+urlsplit(link)[1]+'/')
            
            # get the charset. We don't have an etree in htmlHandler,
            # so we have to use the one from regionHandler
            self.formatter.set_charset(self.regionHandler.formatter.get_charset())

            self.__debug("*"*100+'\n'+"*"*40+" DATA REGION "+"*"*40)
            self.__debug(lxml.etree.tostring(self.parentetree, pretty_print=True))
            # get root tag
            try:
                self.parentetree = self.parentetree.getroot()
            except AttributeError:
                # already a root element
                pass
            
            # Parent tag is table
            # call _handle_table
            if self.parentetree.tag in ('table','tbody'):
                self.__verbose("Handling table")
                self._handle_table()
            else:
                self.__verbose("Handling sequences")
                self._handle_sequence()
    

############# PUBLIC METHODS TO GET RESULTS #############
    def get_deliverables_XML(self):
        """ Return the information about deliverables stored in the objects as XML. """
        if len(self._records) == 0:
            return derrno.__err__(derrno.ENOREC)
        output = StringIO.StringIO()
        converter = Model2XMLConverter(stream=output)
        converter.convert(self.get_deliverables())
        result = output.getvalue()
        output.close()
        return result
        

    def get_deliverables(self):
        """ Return the objects containing the information. """
        if len(self._records) == 0:
            return derrno.__err__(derrno.ENOREC)
        else:
            return self._records
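
# Illustrative sketch (not part of the original module): the extension-trimming
# comparison used in _deliv_in_text to decide whether two document links point
# to the same deliverable (e.g. a PDF and a DOC version of one record). It
# mirrors the module's Python 2 string-module usage; _s is a local alias.
import string as _s

def _same_record(link_a, link_b):
    # compare the links with their file extensions stripped
    return link_a[:_s.rfind(link_a, '.')] == link_b[:_s.rfind(link_b, '.')]

if __name__ == "__main__":
    print(_same_record("http://x.org/d1.pdf", "http://x.org/d1.doc"))  # -> True
    print(_same_record("http://x.org/d1.pdf", "http://x.org/d2.pdf"))  # -> False
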
class GetDeliverableRegion:

    def __init__(self):
        # init agent for parsing html
        self.agent = GetHTMLAndParse()

        # format text
        self.formatter = TextFormatUtils()
        
    

    """ Get the data region.
    Returns an element tree with the region where the deliverables are stored. """
    def get_region(self, url, base, tolerance):
        _res = self.agent.ghap(url)
        if _res[0] == -1:
            # download/parse failed - no documents can be found on this URL
            return derrno.__err__(derrno.ENODOC, url)
        else:
            self._page = self.agent.get_etree()
        deliv_elements = self.agent.find_anchor_elem(base=base)
        if len(deliv_elements) == 0:
            return derrno.__err__(derrno.ENODOC, url)
        if len(deliv_elements) == 1:
            return lxml.etree.ElementTree(deliv_elements[0])

        # get parent tag of all deliverable anchors
        parent_element = self._get_common_parent(deliv_elements, tolerance)
        if parent_element == None:
            return derrno.__err__(derrno.ENOREG, "Parent element not found.")

        # get the region out of the parent element
        region = self._get_deliverable_region(parent_element)
        # if the parent tag itself is the region
        if region == 0:
            # return an element tree made from the parent tag
            return lxml.etree.ElementTree(parent_element)
        return region # else return the region


    """ Stable search for the common parent of all elements in elem_list,
    done by building element-parent vectors and comparing them.
    A tolerance of n tags makes the region smaller when there are
    >>non-deliverable<< PDFs in several regions on the page. """
    def _get_common_parent(self, elem_list, tolerance):

        # supporting method - kind of bigger lambda. Get minimal length of
        # inside lists.
        def _minlength(seq_list):
            return min([len(seq) for seq in seq_list])

        # next supporting method: check the elements in list.
        # if elements are the same, its common parent tag - return True.
        def _iscommon(elem_seq, tol):
            tol_list = []
            for elem in elem_seq:
                if not elem in tol_list:
                    tol_list.append(elem)
            if len(tol_list) > tol+1:
                return False
            # if only two anchors found then we have only two tags
            # and its pretty hard to use tolerance, so we omit it.
            if len(elem_seq) < 3 and len(tol_list) > 1:
                return False
            return True

        # get the most frequent tag in the list
        def _most_frequent(seq):
            suplist = []
            suplist_freq = []
            for el in seq:
                if not el in suplist:
                    suplist.append(el)
                    suplist_freq.append(int(1))
                else:
                    suplist_freq[suplist.index(el)] += 1
            ind = suplist_freq.index(max(suplist_freq))
            return suplist[ind]

        #
        # now continue with method _get_common_parent()
        #
        vectors = [] # here will be vectors stored - list of lists
        for self.elem in elem_list:
            _vector = []
            while 1:
                parent = self.elem.getparent() # exception possible here
                if parent == None:
                    break
                _vector.append(parent)
                self.elem = parent
            vectors.append(_vector)
        # We have parent vectors of all elements from elem_list stored in list
        # $vectors. Then zip the vector list and get sequences of parent tags (and the
        # other tags) sorted from the highest to the lowest parent element.
        zipped = [[row[-i] for row in vectors] for i in range(1, _minlength(vectors)+1)]
        # now check all lists in list zipped. If these are filled with the same
        # elements, its a common parent. The last list before difference contains
        # the main parent tag.
        self.last_seq = []
        for zipvect in zipped:
            if not _iscommon(zipvect, tolerance):
                # return most frequented element in last vector
                return _most_frequent(self.last_seq)
            self.last_seq = zipvect
        return _most_frequent(self.last_seq)


    """ Get texts from the element and its descendants.
    If string is True, returns the texts as one space-joined string.
    elem: lxml element """
    def _get_element_texts(self, elem, string=True):
        texts = []
        for child in elem.iter():
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts


    """ Get the deliverable region - returns an etree with the region.
    Returns 0 when parent_tag itself is the region (or when no region could
    be built), otherwise the region parsed from the harvested HTML. """
    def _get_deliverable_region(self, parent_tag):
        def _convert_tag_to_html(tag):
            tag_html = lxml.etree.ElementTree(tag)
            return lxml.etree.tostring(tag_html)

        # list[0] = type, list[1] = atribute, list[2] = lxml tag element
        # in case of headers list[0] = element.tag, then [2] is element
        _reg_atr = ['',None,None]
        self._result_html_region = ''
        reg_flag = False # flag indicating that we are looping over region
        # get headers first
        headers = []
        #lxml.etree.ElementTree(parent_tag).write(sys.stdout,pretty_print=True)
        for i in range(1,7):
            headers.extend(parent_tag.findall('.//h'+str(i)))
        children = parent_tag.getchildren()
        if len(headers) > 0:
            for head in headers:
                text = self._get_element_texts(head)
                if text:
                    if re.search("deliverables", text, re.I):
                        _reg_atr[0] = head.tag
                        _reg_atr[2] = head
                        break
            if _reg_atr[2] == None:
                return 0
            # visit all tags in parent_tag
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'img':
                    continue
                text = self._get_element_texts(tag)
                if tag.tag == 'a' and not tag.text:
                    if tag.find('img') is not None:
                        text = tag.find('img').tail
                    else:
                        text = ' '
                if text:
                    if re.search("deliverables", text, re.I) and \
                        tag.tag == _reg_atr[0]:
                        # "deliverable" title, BEGIN of region
                        reg_flag = True
                    elif not re.search("deliverables", text, re.I) and \
                        tag.tag == _reg_atr[0]:
                        # next similar title, END of region
                        if reg_flag:
                           break
                # region content
                if tag in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag)
                elif tag.getparent() in children and reg_flag:
                    self._result_html_region+=_convert_tag_to_html(tag.getparent())
                    children.remove(tag.getparent())
        # if we don't have headers, try to find another kind of header (title)
        # "Deliverables" and compare with other elements with the same class or id.
        else:
            for tag in parent_tag.iter():
                if tag.text:
                    if re.search("deliverables", tag.text, re.I):
                        if tag.get("class"):
                            _reg_atr[0] = 'class'
                            _reg_atr[1] = tag.get("class")
                            _reg_atr[2] = tag
                            break
                        elif tag.get("id"):
                            _reg_atr[0] = 'id'
                            _reg_atr[1] = tag.get("id")
                            _reg_atr[2] = tag
                            break
                        elif tag.get("style"):
                            _reg_atr[0] = 'style'
                            _reg_atr[1] = tag.get("style")
                            _reg_atr[2] = tag
                            break

            # test _reg_atr. If there is no deliverable region, then all
            # documents make the region
            if _reg_atr[2] == None:
                return 0
            reg_flag = False
            # visit all tag in parent_tag
            for tag in parent_tag.iterdescendants():
                if tag.tag == 'a' and not tag.text:
                    if tag.find('img') is not None:
                        tag.text = tag.find('img').tail
                    else:
                        tag.text = ' '
                if tag.text:
                    if re.search("deliverables", tag.text, re.I) and \
                        tag.get(_reg_atr[0]) == _reg_atr[1]:
                        # "deliverable" title, BEGIN of region
                        reg_flag = True
                    elif not re.search("deliverables", tag.text, re.I) and \
                        tag.get(_reg_atr[0]) == _reg_atr[1]:
                        # next similar title, END of region
                        if reg_flag:
                            break
                # region content
                if tag in children and reg_flag:
                    self._result_html_region += _convert_tag_to_html(tag)
                    children.remove(tag)
                elif tag.getparent() in children and reg_flag:
                    self._result_html_region+=_convert_tag_to_html(tag.getparent())
                    children.remove(tag.getparent())
        if not self._result_html_region:
            return 0
        # create an ElementTree from the region
        try:
            return lxml.etree.fromstring(self._result_html_region)
        except lxml.etree.XMLSyntaxError:
            # the harvested fragment need not be well-formed XML - fall back to HTML
            try:
                parser = lxml.etree.HTMLParser()
                return lxml.etree.fromstring(self._result_html_region, parser)
            except lxml.etree.XMLSyntaxError:
                return 0
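
# Illustrative sketch (not part of the original module): the parent-vector
# technique behind _get_common_parent above, reduced to its core. For each
# anchor we collect its chain of ancestors, reverse it so it starts at the
# root, and walk the chains in parallel; the deepest level on which all
# chains still agree is the common parent of all the anchors. Relies on the
# module's existing lxml import.
def _common_parent_sketch(elements):
    vectors = []
    for el in elements:
        chain = list(el.iterancestors())  # nearest ancestor first
        chain.reverse()                   # root first
        vectors.append(chain)
    parent = None
    for level in zip(*vectors):
        # stop at the first level where the chains diverge
        if any(e is not level[0] for e in level):
            break
        parent = level[0]
    return parent

if __name__ == "__main__":
    _demo = lxml.etree.fromstring(
        "<div><ul><li><a href='d1.pdf'>D1</a></li>"
        "<li><a href='d2.pdf'>D2</a></li></ul></div>")
    print(_common_parent_sketch(_demo.findall(".//a")).tag)  # -> ul
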
class GetDeliverableRecords:
    def __init__(self, verbose=False, debug=False):
        # init agent for parsing html
        self.htmlHandler = GetHTMLAndParse()
        # to get region where to search for records
        self.regionHandler = GetDeliverableRegion()
        # init text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()
        # list of acceptable words in title (header) of table
        self.table_sem_words = [
            'deliverable', 'description', 'name', 'date', 'dissemination',
            'no.', 'wp', 'delivery', 'particip', 'title', 'nature'
        ]
        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance
        self.tagtol = 1
        # verbose and debug flags
        self.debugger = DeliverableDebugger(verbose=verbose, debug=debug)
        self.__verbose = self.debugger.verbose
        self.__debug = self.debugger.debug

########################### TABLE HANDLING METHODS ############################

    """ Get texts from the element and its descendants.
    If string is True, returns the texts as one space-joined string.
    elem: lxml element """
    def _get_descendats_texts(self, elem, string=True):
        texts = []
        for child in elem.iter():
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    """ Get table order (table semantic) """

    def _get_table_order(self):
        sem_list = []
        for desc in self.parentetree.iterdescendants():
            if desc.tag == 'tr':  # first <tr> match
                for col in desc:  # its <th> or <td>
                    for child in col.iterdescendants():
                        if child.tag == 'a':
                            if self.htmlHandler.check_file(child.get('href')):
                                return None
                    value = self._get_descendats_texts(col)
                    if value != None:
                        # if it is not title, but some text.
                        if len(value) > 30:
                            return None
                        sem_list.append(value)
                break
        str_sem_list = " ".join(sem_list)
        for expr in self.table_sem_words:
            # two matches ???
            if re.search(expr, str_sem_list, re.I):
                return sem_list
        return None

    """ Get the link from a table row - go through the columns and return
    the only href leading to a deliverable. """

    def _get_row_link(self, row):
        # find all anchors where parent is row
        linkanch = row.findall('.//a[@href]')
        if len(linkanch) == 0:
            return None
        for link in linkanch:
            anchor_link = link.get('href')
            if self.htmlHandler.check_file(
                    anchor_link):  # check if it is file we want
                return anchor_link
        return None

    """ Handle region as a table.
    Work with the region as if it were a table: try to get the table
    semantic (table order) and get all records out of it. """

    def _handle_table(self):
        # get table semantic
        tbl_order = self._get_table_order()
        # if we didn't recognize the table order, get the records and return a list
        if not tbl_order:
            self.__verbose("Table order not recognized, getting data...")
            records = []
            # tr tag is a record
            for row in self.parentetree:
                if not row.tag == 'tr':
                    continue
                row_list = []
                _thislink = self._get_row_link(row)
                if _thislink == None:
                    continue
                row_list.append(_thislink)
                for column in row:
                    text = self._get_descendats_texts(column)
                    if not text:
                        continue
                    row_list.append(text)
                records.append(row_list)
                del (row_list)
            return records
        # else we have recognized table order, make dict of dicts out of it
        else:
            self.__verbose(
                "Table order recognized, filling dictionary in this order.")
            # every column of the row (every attribute of the record) has its
            # own semantic given by the table order
            semantic_data = dict()
            for row in self.parentetree:
                self._thislink = self._get_row_link(row)
                # if it's a header or a non-deliverable row, omit it.
                if self._thislink == None:
                    continue
                semantic_data[self._thislink] = {}
                for index, column in enumerate(row):
                    # get column text
                    text = self._get_descendats_texts(column)
                    if not text:
                        continue
                    try:
                        # store the text under the column's semantic name
                        semantic_data[self._thislink][tbl_order[index]] = text
                    except IndexError:
                        continue
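            # resulting shape (illustrative example):
            # {"http://host/d1.pdf": {"Deliverable": "D1.1", "Date": "31/01/2010"},
            #  "http://host/d2.pdf": {...}}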
            return semantic_data

########################  TAG SEQUENCE RECOGNIZING METHODS ####################

    """ Tag check. 
    If it is anchor with href leading to deliverable, returns True """
    def _is_deliv_anch(self, tag):
        if tag.tag == 'a':
            href = tag.get('href')
            if self.htmlHandler.check_file(href):
                return True
        return False

    """ Filters useless and messy tags.
    Return false if useless, true if normal tag """

    def _tagfilter(self, tag):
        if tag.tag in self._omitted_tags:
            return False
        #if tag.text:
        #    if not re.search("[a-z0-9\[\]]", tag.text, re.I):
        #        return False
        return True

    """ Gets the typical tag interval between deliverable anchors. """

    def _getdiff(self, reg, tol):
        # etree reg = element tree region
        # int tol: accepted tolerance of tags
        d = {}
        index = 0
        # fill the dictionary with differences and their occurences
        for tag in reg.iter():
            if not self._tagfilter(tag):
                continue
            if self._is_deliv_anch(tag) and not index == 0:
                try:
                    d[index] += 1
                except KeyError:
                    d[index] = 1
                index = 0
            index += 1
        # check the differences - the variety must not exceed the tolerance $tol
        difflist = d.keys()
        self.__debug("difflist: " + str(difflist))
        if len(difflist) == 0:
            return -1
        _max = max(difflist)
        _min = min(difflist)
        dlen = len(d.keys())
        if dlen == 1:
            return d.keys()[0]
        if dlen > ((2 * tol) + 1):  # tolerance to both sides
            return -1
        if (_max - _min) > 2 * tol:  # some acceptable tolerance
            return -1
        # get the most frequent difference
        most_freq = max(d.values())
        for key in d:
            if d[key] == most_freq:
                return key
        return -1

    """ Only anchors found. No optional information. """

    def _get_anch_only(self):
        anchlist = self.htmlHandler.find_anchor_elem(self.baseUrl,
                                                     self.parentetree)
        # We have to make list of list because XMLOutput
        return [[anch] for anch in anchlist]

    """ Main method handling tag sequences and recognizing records.
    Returns list of records. """

    def _get_tag_sequences(self, tag_tol=1):
        records = []
        self._rec = []
        if len(self.parentetree) == 0:
            return [[self.parentetree]]
        # get interval between anchors, use tolerance tag_tol
        self.difference = self._getdiff(self.parentetree, self.tagtol)
        while self.difference == -1:
            if self.tagtol > 5:
                self.__verbose("Variety of intervals between anchors is too huge. "+\
                               "Getting data out of anchors only")
                return self._get_anch_only()
            self.tagtol += 1
            self.difference = self._getdiff(self.parentetree, self.tagtol)

        # get sequence of first n tags, where n is average interval between anchors
        # this could be tag-sequence describing all records in region.
        self.record_seq = []
        i = 0
        for tag in self.parentetree.iter():
            if not self._tagfilter(tag):
                continue
            if i >= self.difference:
                if not 'a' in self.record_seq:
                    del self.record_seq[0]
                else:
                    break
            self.record_seq.append(tag.tag)
            i += 1

        # counter indicates on which position in tag sequence we actually are
        counter = 0
        # make sequence of tags as they go
        regionlist = filter(self._tagfilter,
                            [tag for tag in self.parentetree.iter()])
        recseqlen = len(self.record_seq)
        reglistlen = len(regionlist)

        # flag indicating the beginning of records - the region may start with some garbage
        self.begin = False
        # indicating unpredictable separator between deliverable records
        self.separator = 0
        for i, tag in enumerate(regionlist):
            # skip and save the sequence at the end
            if counter > self.difference - 1:
                records.append(self._rec)  # save
                self._rec = []  # erase the list
                counter = 0  # reset counter
            if not self.begin:
                if tag.tag != self.record_seq[0]:
                    continue
                else:
                    try:
                        if regionlist[i + 1].tag != self.record_seq[1]:
                            continue
                    except IndexError:
                        pass
                    self.begin = True
            # handle tolerances, try to compare sibling tags
            self.match = False  # match flag

            # tolerance algorithm. Goes through html and tries to pass irregular tags in sequence.
            for tol in range(self.tagtol + 1):
                if tag.tag == self.record_seq[(counter + tol) % recseqlen] or \
                   regionlist[(i + tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter += tol + 1
                    break
                elif tag.tag == self.record_seq[(counter - tol) % recseqlen] or \
                   regionlist[(i - tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter -= tol
                    counter += 1
                    break
            # if nothing matched, it's probably out of tolerance
            if not self.match:
                self.separator += 1
                # tolerate up to 10 separators (tags between boxes or tables of deliverables)
                if self.separator > 10:
                    self.__verbose("Tag sequence doesn't match, probably out of "+\
                                "tolerance, getting data out of anchors only")
                    # maybe the tolerance could be increased here
                    # we didn't catch the sequence within the tolerance...
                    return self._get_anch_only()
        records.append(self._rec)
        return filter(self._validseq, records)

    """ Helper method - check if sequence of tags rec contains deliv anchor
    """

    def _validseq(self, rec):
        for _atr in rec:
            # if we have anchor containing link to document, return true
            if self._is_deliv_anch(_atr):
                return True
        return False


    """ Get the element's own texts only, don't look for descendants' texts """
    def _get_tag_content(self, tag):
        l = []
        if tag.tag == 'a':
            href = tag.get('href')
            # if link leading to document found, add string to list
            if href is not None and self.htmlHandler.check_file(href):
                l.append(self.formatter.format(href))
            title = tag.get('title')
            # if title found in tag, add string to list
            if title:
                l.append(self.formatter.format(title))
        # if not anchor, search text in tag.text
        if tag.text:
            if re.search("[a-z0-9]", tag.text, re.I):
                l.append(self.formatter.format(tag.text))
        return l

    """ Harvest texts out of tags and return list of lists (record) """

    def _harvest_text(self, record_tag_list):
        self._records = []
        self._rec = []
        # loop over records and search all possible useful texts
        for rec_list in record_tag_list:
            for tag in rec_list:
                self._rec.extend(self._get_tag_content(tag))
            self._records.append(self._rec)
            self._rec = []
        return self._records

    """ Text harvesting for sequences. """

    def _handle_sequence(self):
        seq = self._get_tag_sequences()
        return self._harvest_text(seq)

############################  OVERALL METHODS  ################################

    """ Get records from region according document links """
    def _manual_process_page(self, links, baseurl):
        _err = None
        recordlist = []
        self.baseUrl = baseurl

        for link in links:
            # find region with tolerance
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if type(self.parentetree) == tuple:
                # error
                _err = self.parentetree
                continue

            # get the charset. We don't have an etree in htmlHandler,
            # so we have to use the one from regionHandler
            self.formatter.set_charset(
                self.regionHandler.formatter.get_charset())

            self.__debug("*" * 100 + '\n' + "*" * 40 + " DATA REGION " +
                         "*" * 40)
            self.__debug(
                lxml.etree.tostring(self.parentetree, pretty_print=True))
            # get root tag
            try:
                self.parentetree = self.parentetree.getroot()
            except AttributeError:
                # already a root element
                pass

            # Parent tag is table
            # call _handle_table
            if self.parentetree.tag in ('table', 'tbody'):
                self.__verbose("Handling table")

                _result = self._handle_table()
                # if we already have a dictionary, continue filling it
                if isinstance(recordlist, dict) and len(recordlist) > 0:
                    for key in _result:
                        recordlist[key] = _result[key]
                else:
                    recordlist = _result
            # Parent tag is not table
            # call _handle_sequence
            else:
                self.__verbose("Handling sequences")

                _result = self._handle_sequence()
                recordlist.extend(_result)
        # no records found
        if len(recordlist) == 0:
            if _err is not None:
                return _err
            return derrno.__err__(derrno.ENOREC)
        self.__debug("DATA RECORDS: ")
        self.__debug(recordlist)
        return recordlist  # returns list of records
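
# Illustrative sketch (not part of the original module): the interval
# histogram behind _getdiff. We count how many tags separate consecutive
# deliverable anchors in a flat tag stream; when the intervals are (nearly)
# constant, the most frequent interval is the length of one record's tag
# sequence. The first interval is measured from the start of the stream,
# which is why the histogram's mode is used instead of the first value.
def _anchor_interval_sketch(tags, is_anchor):
    d = {}
    index = 0
    for t in tags:
        if is_anchor(t) and index != 0:
            d[index] = d.get(index, 0) + 1
            index = 0
        index += 1
    if not d:
        return -1
    return max(d, key=d.get)  # the most frequent interval wins

if __name__ == "__main__":
    stream = ['li', 'a', 'span', 'li', 'a', 'span', 'li', 'a', 'span']
    print(_anchor_interval_sketch(stream, lambda t: t == 'a'))  # -> 3
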
class GetDeliverableRecords:
    """ Get records and return a dict or a list of records with attributes """

    def __init__(self, verbose=False,debug=False):
        # init agent for parsing html
        self.htmlHandler = GetHTMLAndParse()        
        # to get region where to search for records
        self.regionHandler = GetDeliverableRegion()        
        # init text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()               
        # list of acceptable words in title (header) of table
        self.table_sem_words = ['deliverable', 'description', 'name', 'date',
                                'dissemination', 'no.', 'wp', 'delivery',
                                'particip', 'title', 'nature']
        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance
        self.tagtol = 1
        # verbose and debug flags
        self.debugger = DeliverableDebugger(verbose = verbose,debug = debug)
        self.__verbose = self.debugger.verbose
        self.__debug = self.debugger.debug


########################### TABLE HANDLING METHODS ############################

    
    def _get_descendats_texts(self, elem, string=True):
        """ Get texts from the element and its descendants.
        If string is True, returns the texts as one space-joined string.
        elem: lxml element """
        texts = []
        for child in elem.iter():
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts


    
    def _get_table_order(self):
        """ Get table order (table semantic) """
        sem_list = []
        for desc in self.parentetree.iterdescendants():
            if desc.tag == 'tr': # first <tr> match
                for col in desc: # its <th> or <td> children
                    for child in col.iterdescendants():
                        if child.tag == 'a':
                            if self.htmlHandler.check_file(child.get('href')):
                                return None
                    value = self._get_descendats_texts(col)
                    if value != None:
                        # if it is not title, but some text.
                        if len(value) > 30: 
                            return None
                        sem_list.append(value)
                break
        str_sem_list = " ".join(sem_list)
        for expr in self.table_sem_words:
            # two matches ???
            if re.search(expr, str_sem_list, re.I): 
                return sem_list
        return None


    
    def _get_row_link(self, row):
        """ Get the link from a table row - go through the columns and return
        the only href leading to a deliverable. """
        # find all anchors where parent is row
        linkanch = row.findall('.//a[@href]')
        if len(linkanch) == 0:
            return None
        for link in linkanch:
            anchor_link = link.get('href')
            if self.htmlHandler.check_file(anchor_link): # check if it is file we want
                return anchor_link
        return None


    

    def _handle_table(self):
        """ Handle region as a table.
        Work with the region as if it were a table: try to get the table
        semantic (table order) and get all records out of it. """
        # get table semantic
        tbl_order = self._get_table_order()
        # if we didn't recognize the table order, get the records and return a list
        if not tbl_order:
            self.__verbose("Table order not recognized, getting data...")
            records = []
            # tr tag is a record
            for row in self.parentetree:
                if not row.tag == 'tr':
                    continue
                row_list = []
                _thislink = self._get_row_link(row)
                if _thislink == None:
                    continue
                row_list.append(_thislink)
                for column in row:
                    text = self._get_descendats_texts(column)
                    if not text:
                        continue
                    row_list.append(text)
                records.append(row_list)
                del(row_list)
            return records     
        # else we have recognized table order, make dict of dicts out of it
        else:
            self.__verbose("Table order recognized, filling dictionary in this order.")
            # every column of the row (every attribute of the record) has its
            # own semantic given by the table order
            semantic_data = dict()
            for row in self.parentetree:
                self._thislink = self._get_row_link(row)
                # if it's a header or a non-deliverable row, omit it.
                if self._thislink == None:
                    continue
                semantic_data[self._thislink] = {}
                for index, column in enumerate(row):
                    # get column text                    
                    text = self._get_descendats_texts(column)
                    if not text:
                        continue
                    try:
                        # store the text under the column's semantic name
                        semantic_data[self._thislink][tbl_order[index]] = text
                    except IndexError:
                        continue
            return semantic_data

########################  TAG SEQUENCE RECOGNIZING METHODS ####################

    
    def _is_deliv_anch(self, tag):
        """ Tag check. 
        If it is anchor with href leading to deliverable, returns True """
        if tag.tag == 'a':
            href = tag.get('href')
            if self.htmlHandler.check_file(href):
                return True
        return False


    
    def _tagfilter(self, tag):
        """ Filter out useless and messy tags.
        Return False if useless, True if a normal tag. """
        if tag.tag in self._omitted_tags:
            return False
        #if tag.text:
        #    if not re.search("[a-z0-9\[\]]", tag.text, re.I):
        #        return False
        return True


    
    def _getdiff(self, reg, tol):
        """ Get the dominant interval (in tags) between deliverable anchors.

        reg -- element tree region
        tol -- accepted tolerance (in tags) to both sides """
        d = {}
        index = 0
        # fill the dictionary with intervals and their occurrences
        for tag in reg.iter():
            if not self._tagfilter(tag):
                continue
            if self._is_deliv_anch(tag) and index != 0:
                d[index] = d.get(index, 0) + 1
                index = 0
            index += 1
        # check the intervals: reject if their variety exceeds the tolerance
        difflist = list(d.keys())
        self.__debug("difflist: " + str(difflist))
        if len(difflist) == 0:
            return -1
        _max = max(difflist)
        _min = min(difflist)
        dlen = len(difflist)
        if dlen == 1:
            return difflist[0]
        if dlen > ((2 * tol) + 1):  # tolerance to both sides
            return -1
        if (_max - _min) > 2 * tol:  # some acceptable tolerance
            return -1
        # return the most frequent interval
        return max(d, key=d.get)
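
    # A sketch of the interval idea behind _getdiff() (hypothetical filtered
    # tag stream): for "p a p a p a" the gaps between deliverable anchors come
    # out as {1: 1, 2: 2}, so the dominant interval 2 is returned; if the gap
    # histogram is wider than the tolerance allows, -1 signals failure.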
    
    def _get_anch_only(self):
        """ Fallback: only anchors were found, no optional information. """
        anchlist = self.htmlHandler.find_anchor_elem(self.baseUrl, self.parentetree)
        # wrap each anchor in a list because XMLOutput expects record lists
        return [[anch] for anch in anchlist]


    
    def _get_tag_sequences(self, tag_tol=1):
        """ Main method handling tag sequences and recognizing records.
        Returns a list of records. """
        records = []
        self._rec = []
        if len(self.parentetree) == 0:
            return [[self.parentetree]]
        # get the interval between anchors, using tolerance tag_tol
        self.difference = self._getdiff(self.parentetree, self.tagtol)
        while self.difference == -1:
            if self.tagtol > 5:
                self.__verbose("Intervals between anchors vary too much. "
                               "Getting data out of anchors only")
                return self._get_anch_only()
            self.tagtol += 1
            self.difference = self._getdiff(self.parentetree, self.tagtol)
        
        # get the sequence of the first n tags, where n is the average interval
        # between anchors; this should be the tag sequence describing all
        # records in the region
        self.record_seq = []
        i = 0
        for tag in self.parentetree.iter():
            if not self._tagfilter(tag):
                continue
            if i >= self.difference:
                if 'a' not in self.record_seq:
                    # slide the window until it contains an anchor
                    del self.record_seq[0]
                else:
                    break
            self.record_seq.append(tag.tag)
            i += 1

        # counter indicates our current position in the tag sequence
        counter = 0
        # make the sequence of tags as they appear (list() needed on Python 3)
        regionlist = list(filter(self._tagfilter, self.parentetree.iter()))
        recseqlen = len(self.record_seq)
        reglistlen = len(regionlist)

        # flag indicating the start of records - the region may begin with garbage
        self.begin = False
        # counts unpredictable separators between deliverable records
        self.separator = 0
        for i, tag in enumerate(regionlist):
            # when the sequence is complete, save it and start a new record
            if counter > self.difference - 1:
                records.append(self._rec)  # save
                self._rec = []  # erase the list
                counter = 0  # reset counter
            if not self.begin:
                if tag.tag != self.record_seq[0]:
                    continue
                else:
                    try:
                        if regionlist[i+1].tag != self.record_seq[1]:
                            continue
                    except IndexError:
                        pass
                    self.begin = True
            # handle tolerances, try to compare sibling tags
            self.match = False  # match flag

            # tolerance algorithm: walk the html and try to pass over irregular
            # tags in the sequence
            for tol in range(self.tagtol + 1):
                if tag.tag == self.record_seq[(counter + tol) % recseqlen] or \
                   regionlist[(i + tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter += tol + 1
                    break
                elif tag.tag == self.record_seq[(counter - tol) % recseqlen] or \
                   regionlist[(i - tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    counter -= tol
                    counter += 1
                    break
            # if nothing matched, it's probably out of tolerance
            if not self.match:
                self.separator += 1
                # tolerate up to 10 separators (tags between boxes or tables of deliverables)
                if self.separator > 10:
                    self.__verbose("Tag sequence doesn't match, probably out of "
                                   "tolerance, getting data out of anchors only")
                    # maybe the tolerance could be increased here instead;
                    # we didn't catch the sequence within tolerance
                    return self._get_anch_only()
        records.append(self._rec)
        return [rec for rec in records if self._validseq(rec)]
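
    # A sketch of the record-sequence idea (hypothetical filtered tag stream):
    # with difference 3, the leading window might be ['tr', 'td', 'a'], and a
    # stream like  tr td a tr td a ...  is then chopped into one record per
    # window, tolerating up to tagtol out-of-place tags around the cursor.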


    
    def _validseq(self, rec):
        """ Helper method - check whether the tag sequence rec contains a
        deliverable anchor. """
        for _atr in rec:
            # if we have an anchor containing a link to a document, return True
            if self._is_deliv_anch(_atr):
                return True
        return False
        
    def _get_tag_content(self, tag):
        """ Get the element's own texts only, without descendants' texts. """
        l = []
        if tag.tag == 'a':
            href = tag.get('href')
            # if a link leading to a document is found, add it to the list
            if href is not None and self.htmlHandler.check_file(href):
                l.append(self.formatter.format(href))
            title = tag.get('title')
            # if the tag has a title, add it to the list
            if title:
                l.append(self.formatter.format(title))
        # anchor or not, also search the text in tag.text
        if tag.text:
            if re.search("[a-z0-9]", tag.text, re.I):
                l.append(self.formatter.format(tag.text))
        return l
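
    # Content sketch for _get_tag_content() (illustrative, assuming check_file()
    # accepts the href): for
    #   <a href="D2.pdf" title="Final report">Deliverable 2</a>
    # the returned list is ["D2.pdf", "Final report", "Deliverable 2"],
    # each value passed through self.formatter.format().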

    
    
    def _harvest_text(self, record_tag_list):
        """ Harvest texts out of the tags and return a list of lists (records). """
        self._records = []
        self._rec = []
        # loop over the records and collect all potentially useful texts
        for rec_list in record_tag_list:
            for tag in rec_list:
                self._rec.extend(self._get_tag_content(tag))
            self._records.append(self._rec)
            self._rec = []
        return self._records


    
    def _handle_sequence(self):
        """ Text harvesting for sequences. """
        seq = self._get_tag_sequences()        
        return self._harvest_text(seq)
        
############################  OVERALL METHODS  ################################

    
    def _manual_process_page(self, links, baseurl):
        """ Get records from the region according to the document links """
        _err = None
        recordlist = []
        self.baseUrl = baseurl

        for link in links:
            # find the region with tolerance
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if type(self.parentetree) == tuple:
                # error
                _err = self.parentetree
                continue

            # get the charset; we don't have an etree in htmlHandler,
            # so we have to use the one from regionHandler
            self.formatter.set_charset(self.regionHandler.formatter.get_charset())

            self.__debug("*"*100 + '\n' + "*"*40 + " DATA REGION " + "*"*40)
            self.__debug(lxml.etree.tostring(self.parentetree, pretty_print=True))
            # get the root tag (an ElementTree has getroot(), a plain Element does not)
            try:
                self.parentetree = self.parentetree.getroot()
            except AttributeError:
                pass
            
            # The parent tag is a table:
            # call _handle_table
            if self.parentetree.tag in ('table', 'tbody'):
                self.__verbose("Handling table")

                _result = self._handle_table()
                # if we already have a dictionary, keep filling it;
                # otherwise take the result as is (dict or list)
                if isinstance(recordlist, dict) and isinstance(_result, dict):
                    recordlist.update(_result)
                elif len(recordlist) > 0 and isinstance(_result, list):
                    recordlist.extend(_result)
                else:
                    recordlist = _result
            # The parent tag is not a table:
            # call _handle_sequence
            else:
                self.__verbose("Handling sequences")

                _result = self._handle_sequence()
                recordlist.extend(_result)
        # no records found
        if len(recordlist) == 0:
            if _err is not None:
                return _err
            return derrno.__err__(derrno.ENOREC)
        self.__debug("DATA RECORDS: ")
        self.__debug(recordlist)
        return recordlist  # the list (or dict) of records
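

# Pipeline sketch (illustrative, assuming the project's GetHTMLAndParse wiring):
# GetDelivPage below crawls a project site to find the page listing the
# deliverables, and the extractor above then pulls records out of that page:
#
#     pager = GetDelivPage("http://example.org/project/")
#     pages = pager.get_deliverable_page()   # e.g. ["http://example.org/deliverables"]

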
class GetDelivPage:

    def __init__(self, url, verbose=False, debug=False, addkeyw=None):
        # keywords used for document page search
        self._sigwords = ["d((eliverables?)|[0-9])",
                          "documents?",
                          "reports?",
                          "public(ation)?s?",
                          "results?",             
                          "presentations?",
                          "library",
                           #"projects?",
                          "outocomes?", "downloads?",
                          "outputs?"]
        
        if addkeyw != None:
            self._sigwords.append(addkeyw)

        """ Associative array containing links with their flags
        { url : [Index/NoIndex/Frame, Visit/Visited, Rank] }
        index = 0, noindex = 1, frame = 2, unvisited = 0, visited = 1 """
        self._link_stack = { url : [0,0,0] }

        self.base_url = url # save base (input) url

        # Open a parsing agent to get the needed data from the page
        self.agent = GetHTMLAndParse()

        self._current_url = url

        # a constant used to set rank in order of importance of the expression 
        # being tested (self._sigwords)
        self.rank_const = len(self._sigwords)

        # a few constants for the dictionary - just for good-looking source code
        self.IND_FR = 0 # index/noindex/frame/special
        self.VISIT = 1 # unvisited/visited
        self.RANK = 2 # value of rank

        # set verbose flag
        self.__verbose__ = verbose

        # set debug flag
        self.__dbg__ = debug
        
        # checking data types
        if not type(self.__verbose__) == bool:
            raise ValueError("Verbose flag has to be boolean.")


    def __verbose(self, msg):
        _err = "cannot decode verbose message."
        if self.__verbose__:
            try:
                print(str(msg))
            except UnicodeError:
                print(_err)


    def __debug(self, msg):
        _err = "cannot decode debug info."
        if self.__dbg__:
            try:
                print("Debug message:    " + str(msg))
            except UnicodeError:
                print(_err)

################################################################################

    """ Initialize item in dictionary to noindex/unvisited/rank=0 """
    def _link_item_init__(self, link, index=1, visit=0, rank=0):
        # default setting: noindex,unvisited,norank
        if not self._link_stack.has_key(link):
           self._link_stack[link] = [index,visit,rank]
        return


    """ Edits item in dictionary self._link_stack """
    def _link_item_edit(self, link, index=None, visit=None, rank=None):
        if index is not None:
            self._link_stack[link][self.IND_FR] = index
        if visit is not None:
            self._link_stack[link][self.VISIT] = visit
        if rank  is not None:
            # null rank if zero is argument
            if rank == 0:
                self._link_stack[link][self.RANK] = 0
            # add rank
            else:
                self._link_stack[link][self.RANK] += rank
        return
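
    # Bookkeeping sketch (illustrative values): after
    #     self._link_item_init__("http://example.org/docs")
    #     self._link_item_edit("http://example.org/docs", visit=1, rank=10)
    # the stack holds {"http://example.org/docs": [1, 1, 10]},
    # i.e. [noindex, visited, rank 10].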

    
    """ Method representing one level of cascade. Do almost any job to search 
    one word in dictionary """
    def _level_job(self, index=None):
        # get list of links from anchors containing one of expression
        # from self_sigwords
        result = 0
        if index is not None: # searching with one 
            link_list = self.agent.get_all_links(
                regul = re.compile(self._sigwords[index], re.I), 
                base  = self._current_url)        
        else:
            link_list = self.agent.get_all_links(base = self._current_url)
            index = self.rank_const
        if link_list:
            #
            #   RANK giving & filter
            #       
            if index is None:
                rank = 0
            elif index == 0:
                rank = self.rank_const * 2
            else:
                rank = self.rank_const - index
            for link in link_list:
                # skip javascript: and mailto: pseudo-links
                if not link or "javascript:" in link or "mailto:" in link:
                    continue
                if "#" in link:  # strip any fragment identifier
                    link = re.sub('#.*$', '', link)
                if len(link) > 200:
                    continue
                if self._link_stack.get(link):
                    # RANK the current page when it links to known unvisited pages
                    if self._link_stack[link][self.VISIT] == 0:
                        self._link_item_edit(self._current_url, rank=rank)
                    continue
                if not self.agent.compare_domains(self.base_url, link):
                    continue

                split_link = re.sub("https?://.+?/", "", link)
                # check whether the link is a file or a page
                if self.agent.is_wanted_mime(link):
                    #
                    #   Some PDF or DOC found
                    #
                    # RANK
                    self._link_item_edit(self._current_url, rank=10)
                    self.__debug("Added rank 10 to " + self._current_url)
                    # deliverable-like file names get the highest rank
                    if re.search("de?l?(iverable)?[0-9]+([\._-][0-9])?",
                                  split_link, re.I):
                        self.__debug("Type D on " + self._current_url)  # debug print
                        # RANK
                        self._link_item_edit(self._current_url, rank=100)
                    continue
                elif not self.agent.is_page(link):
                    self.__debug("UNWANTED " + link)
                    continue
                #
                # Add link
                #
                # RANK
                # initialization of the link item in the dict
                self._link_item_init__(link)
                self._link_item_edit(self._current_url, rank=rank)
                result += 1
                # debug print
                self.__debug("ADD " + link[7:60])
                self.__debug("Rank " + str(rank) + " " + self._current_url)
        return result
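
    # Rank summary for _level_job() (values as used above): a "deliverable"
    # keyword match gives rank_const * 2, weaker keywords give rank_const - index,
    # a wanted document link adds 10, and a deliverable-like file name adds 100.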


    """ Cascade search. May improve the speed of script """
    def _cascade_search(self):
        result = 0
        # first cascade - look for links cont. deliverables
        result += self._level_job(0)
        if not result == 0:
            return
        # second cascade - look for links cont. documents and publications
        result += self._level_job(1) 
        result += self._level_job(2)
        if not result == 0:
            return
        # last cascade - all the rest
        for i in range(3,self.rank_const):
            result += self._level_job(i)
        # check Intro page (all links) only on index
        if result == 0 and self._link_stack[self._current_url][0] == 0:
            result += self._level_job() 
        """if result == 0:
            # RANK DOWN
            self._link_item_edit(self._current_url, rank=0)
            print "No anchors on the page"""
        return


    """ TRY TO repair link. But for now only append / in base """
    def _repair_links(self, base=None):
        if base is None:
            base = self.base_url
        if re.match(".*[^/]$", base):
            base += "/"
        if self.agent.get_etree() == -1:
            return -1
        links = self.agent.get_all_links(base = base)
        # compare link with base url
        for link in links:
            if not self.agent.compare_domains(self.base_url, link):
                continue
            link = re.sub("https?://.+?/", base, link)
            # if match, save it as special case
            self._link_item_init__(link, index=3)


    """ Checking intro page. It is page without content, only with Enter label """
    def _check_intro(self):
        links = self.agent.get_all_links(base = self._current_url)
        self.__debug("We've found intro links: "+str(links))
        for link in links:
            if not self.agent.compare_domains(self.base_url, link):
                continue
            # save new link as normal page
            self._link_item_init__(link, index=1)
   

    """ Looks for frames on the page """
    def _check_frames(self):
        frames = self.agent.look_for_frame(base = self._current_url)
        if not frames:
            return None
        fcount = len(frames)
        # debug print
        self.__debug("We've found frames ("+str(fcount)+") on "+self._current_url) 
        # save new link as frame page
        for link in frames:
            if self.agent.compare_domains(self._current_url, link):
              self._link_item_init__(link, index=2)
        return fcount

    
    """ Checks for titles and gives rank according the result """
    def _check_titles(self):
        for i in range(self.rank_const):
            hcount = self.agent.count_all_headers(
                re.compile( self._sigwords[i], re.I ))
            if not hcount == 0:
                if i == 0: 
                    #
                    # "deliverable" match, the highest rank
                    #
                    # RANK constant is multiplied by 4
                    self.__debug("deliverable match"+str(self.rank_const *
                    4)+" "+self._current_url)
                    self._link_item_edit(self._current_url, 
                                         rank = self.rank_const * 4)
                else:
                    #
                    # other word match
                    #
                    # do not multiplied rank constant
                    self.__debug("Rank "+str(self.rank_const - i)+" "+self._current_url) 
                    self._link_item_edit(self._current_url, 
                                         rank = self.rank_const - i)


    """ Get information about current link """
    def _check_anchor(self):
        # tt is Text and Title
        tt = self.agent.get_anchor_from_link(self._current_url)
        # return 0 if no anchor match
        if tt == 0: return tt;
        # match for deliverables
        if re.search(self._sigwords[0], tt, re.I):
            self.__debug("Anchor matched "+self._current_url) # debug print
            return 1
        

    """ Returns list of unvisited links. Useful in cycle. """
    def _check_unvisited_links(self):
        unvisitedLinks = []
        for link in self._link_stack:
            if self._link_stack[link][self.VISIT] == 0: # if unvisited
                unvisitedLinks.append(link)
        return unvisitedLinks # list of unvisited page links

    
    """ Aplying all methods to unvisited links - next level of searching. 
    It is main private method. Only this method can decide end of searching """
    def _handle_unvis_links(self):
        unvisLinks = self._check_unvisited_links()
        if not unvisLinks:
            return None # end of searching
        for link in unvisLinks: # cycle in unvisited links
            # visit and parse page
            self._link_item_edit(link, visit = 1)

            (res, err) = self.agent.ghap(link)
            if res == -1:
                self.__debug(str(err)+" "+str(link)) # debug print
                # if link is broken (IND_FR == 3)
                if self._link_stack[link][self.IND_FR] != 3:
                    self._repair_links()
                continue
            # little hack with error message, there is no error but URL!
            if res == 2:
                self.base_url = err # URL of the new base
            self.__debug("Getting url in ghap(): "+str(link)) # debug print
            self.__verbose("Searching... URL: "+str(link)) # verbose print
            self._current_url = link
            if self._link_stack[link][self.IND_FR] == 2:
                dname = self.agent.get_domain_name(link)
                if dname is not None:
                    self.base_url = dname

            ###############
            # frame check #
            self._check_frames()

            ################
            # titles check #
            self._check_titles() # rank giving here

            ################
            # anchor check #
            if self._check_anchor():
                self._link_item_edit(link, rank = 10) # rank giving here too

            self._cascade_search() # search for next links on this page
        # when no unvisited links in list, return
        return 1
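
    # Loop sketch: get_deliverable_page() keeps calling _handle_unvis_links()
    # until it returns None; each pass visits every still-unvisited link,
    # ranks it, and may push new candidate links onto the stack.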


    """ Returns link of the highest value of rank in self._link_stack. 
    It is called in the end of process."""
    def _get_highest_ranks_link(self):
        hRank = 0
        hLink = ""
        # check all links and choose link with the highest rank
        for link in self._link_stack:
            if self._link_stack[link][self.RANK] > hRank:
                hLink = link
                hRank = self._link_stack[link][self.RANK]
        return hLink # WINNER
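
    # A possible one-line equivalent (behavior preserved whenever at least one
    # link has a positive rank; the loop above returns "" otherwise):
    #     max(self._link_stack, key=lambda l: self._link_stack[l][self.RANK])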


    """ Returns list of all links leading to deliverables. 
    Try to find more sites with deliverables.. i.e. like www.awissenet.com has.
    Maybe test for name of link - anchor: i.e. next, prev, [0-9]+ and so one...
    Page usualy looks like:       next pages: 1 2 3 4 ... """
    def _get_deliv_link_list(self,first_link):
        # agent gets first_link
        final_list = []
        nonvisited = [first_link]
        current = nonvisited.pop()
        while current:
            if not current or "javascript:" in current or "mailto:" in current:
                try:
                    current = nonvisited.pop()
                except: 
                    break
                continue
            if self.agent.ghap(current)[0] == -1: # CACHE ??? maybe 
                try:
                    current = nonvisited.pop()
                except: 
                    break
                continue

            nonvisited.extend(self.agent.get_pager_links(base=current))
            final_list.append(current) # append only one link
            try:
                current = nonvisited.pop()
            except: 
                break
        return final_list # returning all pages with deliverables
    
        
    """ Returns list of links on pages with deliverable-documents.
    If found returns list, if not found, return -1. 
    Only public method in module. """
    def get_deliverable_page(self):
        # the main searching loop 
        # while we have some unvisited links, search
        while self._handle_unvis_links(): 
            # security case
            if len(self._link_stack) > 10:
                break
            self.__debug("Stack content: "+str(self._link_stack))
        if len(self._link_stack) == 1 :
            return derrno.__err__(derrno.ELNOTFOUND)

        final_link = self._get_highest_ranks_link()
        if not final_link or self._link_stack[final_link][2] == 0:
            return derrno.__err__(derrno.ELNOTFOUND)
        self.__debug('#'*79)
        self.__debug("DELIVERABLE PAGE: "+final_link)
        return [final_link]
        
        ####### not in use #############
        result = self._get_deliv_link_list(final_link)
        if len(result) == 0:
            return derrno.__err__(derrno.ELNOTFOUND)
        else:
            return result
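

# A minimal usage sketch (GetHTMLAndParse, derrno and the rest of the project
# wiring are assumed from the surrounding modules; the URL is a placeholder):
#
#     pager = GetDelivPage("http://example.org/project/", verbose=True)
#     result = pager.get_deliverable_page()
#     # result is either ["<deliverable page url>"] or a derrno error value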