class GetDeliverableRecords:
    """ get records and return dict or list of records with atributes """

    def __init__(self, verbose=False, debug=False):
        """Create the helper objects and tuning values used for extraction.

        verbose, debug: flags handed over to DeliverableDebugger, whose
        ``verbose`` and ``debug`` callables are used for diagnostic output.
        """
        # agent for parsing html (also asked which hrefs are wanted files)
        self.htmlHandler = GetHTMLAndParse()
        # finds the region in which records are searched
        self.regionHandler = GetDeliverableRegion()
        # text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()
        # words accepted in the title (header) row of a table; they are
        # matched case-insensitively against the header text
        self.table_sem_words = [
            'deliverable', 'description', 'name', 'date',
            'dissemination', 'no.', 'wp', 'delivery',
            'particip', 'title', 'nature',
        ]
        # tags ignored when tag sequences are built and compared
        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance used by the sequence recognition
        self.tagtol = 1
        # verbose and debug output
        debugger = DeliverableDebugger(verbose=verbose, debug=debug)
        self.debugger = debugger
        self.__verbose = debugger.verbose
        self.__debug = debugger.debug


########################### TABLE HANDLING METHODS ############################

    
    def _get_descendats_texts(self, elem, string=True):
        """ Get texts from element and his descendants.
        If string isset, returns texts as one string with spaces.
        # elem: lxml element """
        texts = []
        for child in elem.iter():
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts


    
    def _get_table_order(self):
        """ Get table order (table semantic) """
        sem_list = []
        for desc in self.parentetree.iterdescendants():
            if desc.tag == 'tr': # first <tr> match
                for col in desc: # its <th> or <td>
                    for child in col.iterdescendants():
                        if child.tag == 'a':
                             if self.htmlHandler.check_file(child.get('href')):
                                 return None
                    value = self._get_descendats_texts(col)
                    if value != None:
                        # if it is not title, but some text.
                        if len(value) > 30: 
                            return None
                        sem_list.append(value)
                break
        str_sem_list = " ".join(sem_list)
        for expr in self.table_sem_words:
            # two matches ???
            if re.search(expr, str_sem_list, re.I): 
                return sem_list
        return None


    
    def _get_row_link(self, row):
        """ Get link from row of table - go through columns and the only href
        leading to deliverable is returned. """
        # find all anchors where parent is row
        linkanch = row.findall('.//a[@href]')
        if len(linkanch) == 0:
            return None
        for link in linkanch:
            anchor_link = link.get('href')
            if self.htmlHandler.check_file(anchor_link): # check if it is file we want
                return anchor_link
        return None


    

    def _handle_table(self):
        """ Handle region as a table.
        Work with region as it's a table. Try to get table semantic (table order)
        and get all records out of it. """
        # get table semantic
        tbl_order = self._get_table_order()
        # if we didnt recognize table order, get records and return list
        if not tbl_order:
            self.__verbose("Table order not recognized, getting data...")
            records = []
            # tr tag is a record
            for row in self.parentetree:
                if not row.tag == 'tr':
                    continue
                row_list = []
                _thislink = self._get_row_link(row)
                if _thislink == None:
                    continue
                row_list.append(_thislink)
                for column in row:
                    text = self._get_descendats_texts(column)
                    if not text:
                        continue
                    row_list.append(text)
                records.append(row_list)
                del(row_list)
            return records     
        # else we have recognized table order, make dict of dicts out of it
        else:
            self.__verbose("Table order recognized, filling dictionary in this order.")
            # every column of the row (every atribute of the record) has it's own
            # semantic in order of table semantic
            semantic_data = dict()
            for row in self.parentetree:
                self._thislink = self._get_row_link(row)
                # if its header or non-deliverable row, omit it.
                if self._thislink == None:
                    continue
                semantic_data[self._thislink] = {}
                for index, column in enumerate(row):
                    # get column text                    
                    text = self._get_descendats_texts(column)
                    if not text:
                        continue
                    try:
                        # store it
                        semantic_data[self._thislink][tbl_order[index]] = text
                    except:
                        continue
            return semantic_data

########################  TAG SEQUENCE RECOGNIZING METHODS ####################

    
    def _is_deliv_anch(self, tag):
        """ Tag check. 
        If it is anchor with href leading to deliverable, returns True """
        if tag.tag == 'a':
            href = tag.get('href')
            if self.htmlHandler.check_file(href):
                return True
        return False


    
    def _tagfilter(self, tag):
        """ Filters useless and messy tags.
        Return false if useless, true if normal tag """
        if tag.tag in self._omitted_tags:
            return False
        #if tag.text:
        #    if not re.search("[a-z0-9\[\]]", tag.text, re.I):
        #        return False
        return True


    
    def _getdiff(self, reg, tol):
        """ Gets difference between first two anchors. """
        # etree reg = element tree region
        # int tol: accepted tolerance of tags 
        d = {}
        index = 0
        # fill the dictionary with differences and their occurences
        for tag in reg.iter():
            if not self._tagfilter(tag):
                continue
            if self._is_deliv_anch(tag) and not index == 0:
                try:
                    d[index] += 1
                except:
                    d[index] = 1
                index = 0
            index += 1        
        # check differencies if the variety isn't higher then $tol tolerance
        difflist = d.keys()
        self.__debug("difflist: "+str(difflist))
        if len(difflist) == 0:
            return -1
        _max = max(difflist)
        _min = min(difflist)
        dlen = len(d.keys())
        if dlen == 1:
            return d.keys()[0]
        if dlen > ((2*tol)+1): # tolerance to both sides
            return -1
        if (_max - _min) > 2*tol: # some acceptable tolerance
            return -1    
        # get the most frequent difference
        most_freq = max(d.values())
        for key in d:
            if d[key] == most_freq:
                return key
        return -1
    
    def _get_anch_only(self):
        """ Only anchors found. No optional information. """
        anchlist = self.htmlHandler.find_anchor_elem(self.baseUrl, self.parentetree)
        # We have to make list of list because XMLOutput
        return [[anch] for anch in anchlist]


    
    def _get_tag_sequences(self, tag_tol=1):
        """ Main method handling tag sequences and recognizing records.

        Works on ``self.parentetree``. The typical interval between
        deliverable anchors (``_getdiff``) gives the record length; a pattern
        of that many tag names (``self.record_seq``) is taken from the start
        of the region and the region tags are then cut into records by
        comparing them against the pattern with a tolerance of
        ``self.tagtol`` positions.

        tag_tol: not read by the body; ``self.tagtol`` is used instead (and
            may be raised here, it is not reset afterwards).

        Returns list of records (lists of elements):
          * ``[[self.parentetree]]`` when the region has no child elements,
          * the result of ``_get_anch_only()`` when no usable interval is
            found or more than 10 tags fail to match the pattern,
          * otherwise the recognized records that pass ``_validseq``.

        NOTE(review): ``len(regionlist)`` and the returned ``filter(...)``
        rely on ``filter`` returning a list, as it does on Python 2.
        """
        records = []
        self._rec = []
        # region without children: the region element itself is the record
        if len(self.parentetree) == 0:
            return [[self.parentetree]]
        # get interval between anchors, use tolerance self.tagtol
        self.difference = self._getdiff(self.parentetree, self.tagtol)
        # -1 means no usable interval: retry with a growing tolerance and
        # fall back to anchors only once the tolerance exceeds 5
        while self.difference == -1:
            if self.tagtol > 5:
                self.__verbose("Variety of intervals between anchors is too huge. "+\
                               "Getting data out of anchors only")
                return self._get_anch_only()
            self.tagtol += 1
            self.difference = self._getdiff(self.parentetree, self.tagtol)

        # get sequence of first n tags, where n is the interval between anchors
        # this could be tag-sequence describing all records in region.
        self.record_seq = []
        i = 0
        for tag in self.parentetree.iter():
            if not self._tagfilter(tag):
                continue
            # once n tag names are collected: stop if an 'a' is among them,
            # otherwise slide the window one tag forward
            if i >= self.difference:
                if not 'a' in self.record_seq:
                    del self.record_seq[0]
                else:
                    break
            self.record_seq.append(tag.tag)
            i += 1

        # counter indicates on which position in tag sequence we actually are
        counter = 0
        # make sequence of tags as they go (omitted tags filtered out)
        regionlist = filter(self._tagfilter, [tag for tag in self.parentetree.iter()])
        recseqlen = len(self.record_seq)
        reglistlen = len(regionlist)

        # flag indicating begin of records - in region on the beginning can be some garbage
        self.begin = False
        # counts tags that matched nothing (unpredictable separators between
        # deliverable records)
        self.separator = 0
        for i, tag in enumerate(regionlist):
            # a whole sequence has been consumed: save the record, start a new one
            if counter > self.difference-1:
                records.append(self._rec) # save
                self._rec = [] # erase the list
                counter = 0 # reset counter
            # skip leading tags until one equals the first pattern entry and
            # the tag after it equals the second one
            if not self.begin:
                if tag.tag != self.record_seq[0]:
                    continue
                else:
                    try:
                        if regionlist[i+1].tag != self.record_seq[1]:
                            continue
                    except:
                        # no following tag or no second pattern entry: accept
                        # (the bare except hides any other error as well)
                        pass
                    self.begin = True
            # handle tolerances, try to compare sibling tags
            self.match = False # match flag

            # tolerance algorithm. Goes through html and tries to pass irregular tags in sequence.
            # For tol = 0..self.tagtol the tag is accepted when it equals the
            # pattern entry tol positions ahead of / behind the expected one,
            # or when the region tag tol positions ahead / behind equals the
            # expected entry; all indexes wrap around (modulo).
            for tol in range(self.tagtol+1):
                if tag.tag == self.record_seq[(counter + tol) % recseqlen] or \
                   regionlist[(i + tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    # forward match: move past the skipped pattern entries
                    counter += tol+1
                    break
                elif tag.tag == self.record_seq[(counter - tol) % recseqlen] or \
                   regionlist[(i - tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    # backward match: step back by tol, then on to the next entry
                    counter -= tol
                    counter += 1
                    break
            # if nothing matched, its probably out of tolerance
            if not self.match:
                self.separator += 1
                # tolerance 10 separators (tags between boxes or tables of deliverables)
                if self.separator > 10:
                    self.__verbose("Tag sequence doesnt match, probably out of "+\
                                "tolerance, getting data out of anchors only")
                    # maybe here could be tolerance++
                    # we didnt catch the sequence with tolerance...
                    return self._get_anch_only()
        # save the record that was being filled when the region ended
        records.append(self._rec)
        # keep only records that contain a deliverable anchor
        return filter(self._validseq, records)


    
    def _validseq(self, rec):
        """ Helper method - check if sequence of tags rec contains deliv anchor """
        for _atr in rec:
            # if we have anchor containing link to document, return true
            if self._is_deliv_anch(_atr): 
                return True
        return False
        
    def _get_tag_content(self, tag):
        """ Get element texts only, dont look for descendants texts """
        l = []
        if tag.tag == 'a':
            href = tag.get('href')
            # if link leading to document found, add string to list
            if href is not None and self.htmlHandler.check_file(href):
                l.append(self.formatter.format(href))
            title = tag.get('title')
            # if title found in tag, add string to list
            if title:
                l.append(self.formatter.format(title))
        # if not anchor, search text in tag.text
        if tag.text:
            if re.search("[a-z0-9]", tag.text, re.I):
                l.append(self.formatter.format(tag.text))
        return l

    
    
    def _harvest_text(self, record_tag_list):
        """ Harvest texts out of tags and return list of lists (record) """
        self._records = []
        self._rec = []
        # loop over records and search all possible useful texts
        for rec_list in record_tag_list:
            for tag in rec_list:
                self._rec.extend(self._get_tag_content(tag))
            self._records.append(self._rec)
            self._rec = []
        return self._records


    
    def _handle_sequence(self):
        """ Text harvesting for sequences. """
        seq = self._get_tag_sequences()        
        return self._harvest_text(seq)
        
############################  OVERALL METHODS  ################################

    
    def _manual_process_page(self, links, baseurl):
        """ Get records from region according document links """
        _err = None
        recordlist = []
        self.baseUrl = baseurl
        
        for link in links:
            # find region with tolerance
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if type(self.parentetree) == tuple:
                # error
                _err = self.parentetree
                continue
            
            # get the charset. We dont have etree in htmlHandler, 
            # so we have to use the one from regionHandler
            self.formatter.set_charset(self.regionHandler.formatter.get_charset())
            
            self.__debug("*"*100+'\n'+"*"*40+" DATA REGION "+"*"*40)
            self.__debug(lxml.etree.tostring(self.parentetree, pretty_print=True))
            # get root tag
            try:
                self.parentetree = self.parentetree.getroot()
            except:
                pass
            
            # Parent tag is table
            # call _handle_table
            if self.parentetree.tag in ('table','tbody'):
                self.__verbose("Handling table")
                
                _result = self._handle_table()
                # if we had a dictionary, continue filling it
                if len(recordlist) > 0:
                    for key in _result:
                        recordlist[key] = _result[key]
                else:
                    recordlist = _result
            # Parent tag is not table
            # call _handle_sequence
            else:
                self.__verbose("Handling sequences")
                
                _result = self._handle_sequence()
                recordlist.extend(_result)
        # no records found            
        if len(recordlist) == 0:
            if not _err == None:
                return _err
            return derrno.__err__(derrno.ENOREC)
        self.__debug("DATA RECORDS: ")
        self.__debug(recordlist)
        return recordlist # returns list of records
class GetDeliverableRecords:
    def __init__(self, verbose=False, debug=False):
        """Create the helper objects and tuning values used for extraction.

        verbose, debug: flags handed over to DeliverableDebugger, whose
        ``verbose`` and ``debug`` callables are used for diagnostic output.
        """
        # agent for parsing html (also asked which hrefs are wanted files)
        self.htmlHandler = GetHTMLAndParse()
        # finds the region in which records are searched
        self.regionHandler = GetDeliverableRegion()
        # text formatter (encoding, erasing white chars etc.)
        self.formatter = TextFormatUtils()
        # words accepted in the title (header) row of a table; they are
        # matched case-insensitively against the header text
        self.table_sem_words = [
            'deliverable', 'description', 'name', 'date',
            'dissemination', 'no.', 'wp', 'delivery',
            'particip', 'title', 'nature',
        ]
        # tags ignored when tag sequences are built and compared
        self._omitted_tags = ('br', 'img', 'html', 'body')
        # tag tolerance used by the sequence recognition
        self.tagtol = 1
        # verbose and debug output
        debugger = DeliverableDebugger(verbose=verbose, debug=debug)
        self.debugger = debugger
        self.__verbose = debugger.verbose
        self.__debug = debugger.debug

########################### TABLE HANDLING METHODS ############################

    """ Get texts from element and his descendants.
    If string isset, returns texts as one string with spaces.
    # elem: lxml element """
    def _get_descendats_texts(self, elem, string=True):
        texts = []
        for child in elem.iter():
            if child.text and isinstance(child.tag, basestring):
                if re.search("[a-z0-9]", child.text, re.I):
                    texts.append(self.formatter.format(child.text))
        if string:
            return " ".join(texts)
        return texts

    """ Get table order (table semantic) """

    def _get_table_order(self):
        sem_list = []
        for desc in self.parentetree.iterdescendants():
            if desc.tag == 'tr':  # first <tr> match
                for col in desc:  # its <th> or <td>
                    for child in col.iterdescendants():
                        if child.tag == 'a':
                            if self.htmlHandler.check_file(child.get('href')):
                                return None
                    value = self._get_descendats_texts(col)
                    if value != None:
                        # if it is not title, but some text.
                        if len(value) > 30:
                            return None
                        sem_list.append(value)
                break
        str_sem_list = " ".join(sem_list)
        for expr in self.table_sem_words:
            # two matches ???
            if re.search(expr, str_sem_list, re.I):
                return sem_list
        return None

    """ Get link from row of table - go through columns and the only href
    leading to deliverable is returned. """

    def _get_row_link(self, row):
        # find all anchors where parent is row
        linkanch = row.findall('.//a[@href]')
        if len(linkanch) == 0:
            return None
        for link in linkanch:
            anchor_link = link.get('href')
            if self.htmlHandler.check_file(
                    anchor_link):  # check if it is file we want
                return anchor_link
        return None

    """ Handle region as a table.
    Work with region as it's a table. Try to get table semantic (table order)
    and get all records out of it. """

    #
    #
    #
    #
    #
    #
    #
    #
    #
    #
    #

    def _handle_table(self):
        # get table semantic
        tbl_order = self._get_table_order()
        # if we didnt recognize table order, get records and return list
        if not tbl_order:
            self.__verbose("Table order not recognized, getting data...")
            records = []
            # tr tag is a record
            for row in self.parentetree:
                if not row.tag == 'tr':
                    continue
                row_list = []
                _thislink = self._get_row_link(row)
                if _thislink == None:
                    continue
                row_list.append(_thislink)
                for column in row:
                    text = self._get_descendats_texts(column)
                    if not text:
                        continue
                    row_list.append(text)
                records.append(row_list)
                del (row_list)
            return records
        # else we have recognized table order, make dict of dicts out of it
        else:
            self.__verbose(
                "Table order recognized, filling dictionary in this order.")
            # every column of the row (every atribute of the record) has it's own
            # semantic in order of table semantic
            semantic_data = dict()
            for row in self.parentetree:
                self._thislink = self._get_row_link(row)
                # if its header or non-deliverable row, omit it.
                if self._thislink == None:
                    continue
                semantic_data[self._thislink] = {}
                for index, column in enumerate(row):
                    # get column text
                    text = self._get_descendats_texts(column)
                    if not text:
                        continue
                    try:
                        # store it
                        semantic_data[self._thislink][tbl_order[index]] = text
                    except:
                        continue
            return semantic_data

########################  TAG SEQUENCE RECOGNIZING METHODS ####################

    """ Tag check. 
    If it is anchor with href leading to deliverable, returns True """
    def _is_deliv_anch(self, tag):
        if tag.tag == 'a':
            href = tag.get('href')
            if self.htmlHandler.check_file(href):
                return True
        return False

    """ Filters useless and messy tags.
    Return false if useless, true if normal tag """

    def _tagfilter(self, tag):
        if tag.tag in self._omitted_tags:
            return False
        #if tag.text:
        #    if not re.search("[a-z0-9\[\]]", tag.text, re.I):
        #        return False
        return True

    """ Gets difference between first two anchors. """

    def _getdiff(self, reg, tol):
        # etree reg = element tree region
        # int tol: accepted tolerance of tags
        d = {}
        index = 0
        # fill the dictionary with differences and their occurences
        for tag in reg.iter():
            if not self._tagfilter(tag):
                continue
            if self._is_deliv_anch(tag) and not index == 0:
                try:
                    d[index] += 1
                except:
                    d[index] = 1
                index = 0
            index += 1
        # check differencies if the variety isn't higher then $tol tolerance
        difflist = d.keys()
        self.__debug("difflist: " + str(difflist))
        if len(difflist) == 0:
            return -1
        _max = max(difflist)
        _min = min(difflist)
        dlen = len(d.keys())
        if dlen == 1:
            return d.keys()[0]
        if dlen > ((2 * tol) + 1):  # tolerance to both sides
            return -1
        if (_max - _min) > 2 * tol:  # some acceptable tolerance
            return -1
        # get the most frequent difference
        most_freq = max(d.values())
        for key in d:
            if d[key] == most_freq:
                return key
        return -1

    """ Only anchors found. No optional information. """

    def _get_anch_only(self):
        anchlist = self.htmlHandler.find_anchor_elem(self.baseUrl,
                                                     self.parentetree)
        # We have to make list of list because XMLOutput
        return [[anch] for anch in anchlist]

    """ Main method handling tag sequences and recognizing records.
    Returns list of records. """

    def _get_tag_sequences(self, tag_tol=1):
        """ Main method handling tag sequences and recognizing records.

        Works on ``self.parentetree``. The typical interval between
        deliverable anchors (``_getdiff``) gives the record length; a pattern
        of that many tag names (``self.record_seq``) is taken from the start
        of the region and the region tags are then cut into records by
        comparing them against the pattern with a tolerance of
        ``self.tagtol`` positions.

        tag_tol: not read by the body; ``self.tagtol`` is used instead (and
            may be raised here, it is not reset afterwards).

        Returns list of records (lists of elements):
          * ``[[self.parentetree]]`` when the region has no child elements,
          * the result of ``_get_anch_only()`` when no usable interval is
            found or more than 10 tags fail to match the pattern,
          * otherwise the recognized records that pass ``_validseq``.

        NOTE(review): ``len(regionlist)`` and the returned ``filter(...)``
        rely on ``filter`` returning a list, as it does on Python 2.
        """
        records = []
        self._rec = []
        # region without children: the region element itself is the record
        if len(self.parentetree) == 0:
            return [[self.parentetree]]
        # get interval between anchors, use tolerance self.tagtol
        self.difference = self._getdiff(self.parentetree, self.tagtol)
        # -1 means no usable interval: retry with a growing tolerance and
        # fall back to anchors only once the tolerance exceeds 5
        while self.difference == -1:
            if self.tagtol > 5:
                self.__verbose("Variety of intervals between anchors is too huge. "+\
                               "Getting data out of anchors only")
                return self._get_anch_only()
            self.tagtol += 1
            self.difference = self._getdiff(self.parentetree, self.tagtol)

        # get sequence of first n tags, where n is the interval between anchors
        # this could be tag-sequence describing all records in region.
        self.record_seq = []
        i = 0
        for tag in self.parentetree.iter():
            if not self._tagfilter(tag):
                continue
            # once n tag names are collected: stop if an 'a' is among them,
            # otherwise slide the window one tag forward
            if i >= self.difference:
                if not 'a' in self.record_seq:
                    del self.record_seq[0]
                else:
                    break
            self.record_seq.append(tag.tag)
            i += 1

        # counter indicates on which position in tag sequence we actually are
        counter = 0
        # make sequence of tags as they go (omitted tags filtered out)
        regionlist = filter(self._tagfilter,
                            [tag for tag in self.parentetree.iter()])
        recseqlen = len(self.record_seq)
        reglistlen = len(regionlist)

        # flag indicating begin of records - in region on the beginning can be some garbage
        self.begin = False
        # counts tags that matched nothing (unpredictable separators between
        # deliverable records)
        self.separator = 0
        for i, tag in enumerate(regionlist):
            # a whole sequence has been consumed: save the record, start a new one
            if counter > self.difference - 1:
                records.append(self._rec)  # save
                self._rec = []  # erase the list
                counter = 0  # reset counter
            # skip leading tags until one equals the first pattern entry and
            # the tag after it equals the second one
            if not self.begin:
                if tag.tag != self.record_seq[0]:
                    continue
                else:
                    try:
                        if regionlist[i + 1].tag != self.record_seq[1]:
                            continue
                    except:
                        # no following tag or no second pattern entry: accept
                        # (the bare except hides any other error as well)
                        pass
                    self.begin = True
            # handle tolerances, try to compare sibling tags
            self.match = False  # match flag

            # tolerance algorithm. Goes through html and tries to pass irregular tags in sequence.
            # For tol = 0..self.tagtol the tag is accepted when it equals the
            # pattern entry tol positions ahead of / behind the expected one,
            # or when the region tag tol positions ahead / behind equals the
            # expected entry; all indexes wrap around (modulo).
            for tol in range(self.tagtol + 1):
                if tag.tag == self.record_seq[(counter + tol) % recseqlen] or \
                   regionlist[(i + tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    # forward match: move past the skipped pattern entries
                    counter += tol + 1
                    break
                elif tag.tag == self.record_seq[(counter - tol) % recseqlen] or \
                   regionlist[(i - tol) % reglistlen].tag == self.record_seq[counter % recseqlen]:
                    self.match = True
                    self._rec.append(tag)
                    # backward match: step back by tol, then on to the next entry
                    counter -= tol
                    counter += 1
                    break
            # if nothing matched, its probably out of tolerance
            if not self.match:
                self.separator += 1
                # tolerance 10 separators (tags between boxes or tables of deliverables)
                if self.separator > 10:
                    self.__verbose("Tag sequence doesnt match, probably out of "+\
                                "tolerance, getting data out of anchors only")
                    # maybe here could be tolerance++
                    # we didnt catch the sequence with tolerance...
                    return self._get_anch_only()
        # save the record that was being filled when the region ended
        records.append(self._rec)
        # keep only records that contain a deliverable anchor
        return filter(self._validseq, records)

    """ Helper method - check if sequence of tags rec contains deliv anchor
    """

    def _validseq(self, rec):
        for _atr in rec:
            # if we have anchor containing link to document, return true
            if self._is_deliv_anch(_atr):
                return True
        return False

###
#
#
#
#
#
#
#
#
#
#
#

    """ Get element texts only, dont look for descendants texts """
    def _get_tag_content(self, tag):
        l = []
        if tag.tag == 'a':
            href = tag.get('href')
            # if link leading to document found, add string to list
            if href is not None and self.htmlHandler.check_file(href):
                l.append(self.formatter.format(href))
            title = tag.get('title')
            # if title found in tag, add string to list
            if title:
                l.append(self.formatter.format(title))
        # if not anchor, search text in tag.text
        if tag.text:
            if re.search("[a-z0-9]", tag.text, re.I):
                l.append(self.formatter.format(tag.text))
        return l

    """ Harvest texts out of tags and return list of lists (record) """

    def _harvest_text(self, record_tag_list):
        self._records = []
        self._rec = []
        # loop over records and search all possible useful texts
        for rec_list in record_tag_list:
            for tag in rec_list:
                self._rec.extend(self._get_tag_content(tag))
            self._records.append(self._rec)
            self._rec = []
        return self._records

    """ Text harvesting for sequences. """

    def _handle_sequence(self):
        seq = self._get_tag_sequences()
        return self._harvest_text(seq)

############################  OVERALL METHODS  ################################

    """ Get records from region according document links """
    def _manual_process_page(self, links, baseurl):
        _err = None
        recordlist = []
        self.baseUrl = baseurl

        for link in links:
            # find region with tolerance
            self.parentetree = self.regionHandler.get_region(link, baseurl, 1)
            if type(self.parentetree) == tuple:
                # error
                _err = self.parentetree
                continue

            # get the charset. We dont have etree in htmlHandler,
            # so we have to use the one from regionHandler
            self.formatter.set_charset(
                self.regionHandler.formatter.get_charset())

            self.__debug("*" * 100 + '\n' + "*" * 40 + " DATA REGION " +
                         "*" * 40)
            self.__debug(
                lxml.etree.tostring(self.parentetree, pretty_print=True))
            # get root tag
            try:
                self.parentetree = self.parentetree.getroot()
            except:
                pass

            # Parent tag is table
            # call _handle_table
            if self.parentetree.tag in ('table', 'tbody'):
                self.__verbose("Handling table")

                _result = self._handle_table()
                # if we had a dictionary, continue filling it
                if len(recordlist) > 0:
                    for key in _result:
                        recordlist[key] = _result[key]
                else:
                    recordlist = _result
            # Parent tag is not table
            # call _handle_sequence
            else:
                self.__verbose("Handling sequences")

                _result = self._handle_sequence()
                recordlist.extend(_result)
        # no records found
        if len(recordlist) == 0:
            if not _err == None:
                return _err
            return derrno.__err__(derrno.ENOREC)
        self.__debug("DATA RECORDS: ")
        self.__debug(recordlist)
        return recordlist  # returns list of records