Example #1
def parse_template_main(self):
    proceedings_list = []
    tr = self.grab.tree.xpath(XPATH_SUMMARY)
    # Rows come in pairs: a title row followed by a details row.
    for i in range(0, len(tr), 2):
        href = tr[i].find(self.XPATH_SUMMARY_TITLE)
        try:
            if href.get('href') in config.input_urls or len(config.input_urls) == 1:
                proceedings = dict()
                proceedings['volume_number'] = ProceedingsSummaryParser.extract_volume_number(href.get('href'))
                proceedings['url'] = href.get('href')
                summary_match = rex.rex(
                    tr[i + 1].find('.//td[last()]').text_content(),
                    r'(.*)(\nEdited\s*by\s*:\s*)(.*)(\nSubmitted\s*by\s*:\s*)(.*)'
                    r'(\nPublished\s*on\s*CEUR-WS:\s*)(.*)(\nONLINE)(.*)',
                    re.I | re.M | re.S)

                # Group 1 is the title, group 3 the editor list and
                # group 7 the "Published on CEUR-WS" date.
                proceedings['label'] = re.sub(r'\n', '', text.normalize_space(summary_match.group(1), ' \n'))
                proceedings['editors'] = re.split(r",+\s*", text.normalize_space(summary_match.group(3)))
                proceedings['submission_date'] = datetime.strptime(
                    text.normalize_space(summary_match.group(7), ' \n'),
                    '%d-%b-%Y')

                proceedings_list.append(proceedings)
        except (AttributeError, ValueError, DataNotFound):
            # A row without the expected summary layout is reported and skipped.
            print("[WORKSHOP %s: ProceedingsSummaryParser] Summary information not found!"
                  % href.get('href'))

    self.data['proceedings_list'] = proceedings_list

    if len(proceedings_list) == 0:
        raise DataNotFound("There is no summary information to parse!")
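To see what the rex.rex call extracts, here is a dependency-free sketch of the same regular expression run over a made-up summary block with the standard re module; the sample text and every name in it are invented:

import re
from datetime import datetime

sample = (
    "Sample Workshop Proceedings 2014\n"
    "Edited by: Alice Example, Bob Example\n"
    "Submitted by: Alice Example\n"
    "Published on CEUR-WS: 01-Jan-2014\n"
    "ONLINE"
)

match = re.search(
    r'(.*)(\nEdited\s*by\s*:\s*)(.*)(\nSubmitted\s*by\s*:\s*)(.*)'
    r'(\nPublished\s*on\s*CEUR-WS:\s*)(.*)(\nONLINE)(.*)',
    sample, re.I | re.M | re.S)

print(match.group(1))                                 # label
print(re.split(r",+\s*", match.group(3)))             # editors
print(datetime.strptime(match.group(7), '%d-%b-%Y'))  # publication date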
Example #2
def text(self, default=NULL):
    # First match's first group, entity-decoded and space-normalized;
    # without an explicit default, the underlying error propagates.
    try:
        return normalize_space(decode_entities(self.one().group(1)))
    except (AttributeError, IndexError):
        if default is NULL:
            raise
        else:
            return default
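The same pattern in standalone form, with stdlib stand-ins for the weblib helpers; the function name first_group_text and the NULL sentinel here are just for this sketch. The sentinel lets a caller pass default=None and still be distinguishable from "no default given":

import re
from html import unescape

NULL = object()  # sentinel standing in for weblib's NULL

def first_group_text(body, pattern, default=NULL):
    # re.search plays the role of self.one(); unescape + split/join
    # play the roles of decode_entities and normalize_space.
    match = re.search(pattern, body)
    try:
        return ' '.join(unescape(match.group(1)).split())
    except (AttributeError, IndexError):
        if default is NULL:
            raise
        return default

print(first_group_text('<i>&amp;  co </i>', r'<i>(.*?)</i>'))     # '& co'
print(first_group_text('no match', r'<i>(.*?)</i>', default=''))  # ''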
Example #3
def rex_text_list(body, rex, flags=0):
    """
    Return the first group of every match of `rex` in `body`,
    with HTML entities decoded and whitespace normalized.
    """
    items = []
    for match in rex_list(body, rex, flags=flags):
        items.append(normalize_space(decode_entities(match.group(1))))
    return items
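A quick usage sketch with stdlib stand-ins for rex_list, decode_entities and normalize_space; the helper name rex_text_list_demo is made up:

import re
from html import unescape

def rex_text_list_demo(body, pattern, flags=0):
    # finditer plays the role of rex_list; unescape + split/join
    # stand in for decode_entities and normalize_space.
    return [' '.join(unescape(m.group(1)).split())
            for m in re.finditer(pattern, body, flags)]

markup = '<li> First &amp; foremost </li><li>Second  item</li>'
print(rex_text_list_demo(markup, r'<li>(.*?)</li>'))
# ['First & foremost', 'Second item']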
Example #4
def find_content_blocks(tree, min_length=None):
    """
    Return plain-text content blocks found in the DOM tree
    (tuned for Russian-language pages).
    """
    import re
    from copy import deepcopy
    from lxml.html import tostring
    from lxml.etree import strip_tags, strip_elements, Comment

    # First, copy the DOM tree so the caller's tree is not modified
    tree = deepcopy(tree)

    # Completely remove the content of non-data tags
    nondata_tags = ['head', 'style', 'script']
    strip_elements(tree, *nondata_tags)

    # Remove comment nodes (keeping their tail text)
    strip_tags(tree, Comment)

    # Remove links
    strip_tags(tree, 'a')

    # Drop inline tags
    inline_tags = ('br', 'hr', 'p', 'b', 'i', 'strong', 'em', 'a',
                   'span', 'font')
    strip_tags(tree, *inline_tags)

    # Drop media tags
    media_tags = ('img',)
    strip_tags(tree, *media_tags)

    body = tostring(tree, encoding='utf-8').decode('utf-8')

    # Normalize spaces
    body = normalize_space(body)

    # Collapse every remaining tag into the marker '<>', so that text
    # blocks are exactly the runs of characters between markers
    re_tag = re.compile(r'<[^>]+>')
    body = re_tag.sub(r'<>', body)

    # Find text blocks: keep a block if it is long enough, contains
    # little punctuation "trash" and no absurdly long words
    block_rex = re.compile(r'[^<>]+')

    blocks = []
    for match in block_rex.finditer(body):
        block = match.group(0)
        if min_length is None or len(block) >= min_length:
            ratio = _trash_ratio(block)
            if ratio < 0.05:
                words = block.split()
                if not any(len(x) > 50 for x in words):
                    blocks.append(block)
    return blocks
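For context, a small usage sketch meant to run together with the function above; the stand-ins for the module-level helpers normalize_space and _trash_ratio are assumptions about the surrounding module:

from lxml.html import fromstring

# Assumed stand-ins for the module-level helpers the excerpt relies on.
def normalize_space(value):
    return ' '.join(value.split())

def _trash_ratio(block):
    # Share of non-alphanumeric, non-space characters in the block.
    trash = sum(1 for ch in block if not (ch.isalnum() or ch.isspace()))
    return trash / max(len(block), 1)

page = fromstring(
    '<html><head><title>x</title></head><body>'
    '<div>First meaningful block of readable text on the page</div>'
    '<div>Second meaningful block, also long enough to keep</div>'
    '</body></html>')

print(find_content_blocks(page, min_length=20))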
Example #5
def test_normalize_space(self):
    self.assertEqual('', normalize_space(' '))
    self.assertEqual('f', normalize_space(' f '))
    self.assertEqual('f b', normalize_space(' f b '))
    self.assertEqual(u'тр и гла за',
                     normalize_space(u' тр и гла' + '\t' + '\n' + u' за '))
    self.assertEqual(
        u'тр_и_гла_за',
        normalize_space(u' тр и гла' + '\t' + '\n' + u' за ', replace='_'))
    self.assertEqual(
        u'трABCиABCглаABCза',
        normalize_space(u' тр и гла' + '\t' + '\n' + u' за ',
                        replace='ABC'))
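These tests pin the behavior down precisely. A minimal stand-in that satisfies them (the real implementation in the library may take extra options):

def normalize_space(value, replace=' '):
    # Collapse every run of whitespace (spaces, tabs, newlines) into
    # `replace` and drop leading/trailing whitespace, exactly as the
    # assertions above require.
    return replace.join(value.split())

assert normalize_space(' ') == ''
assert normalize_space(' f b ') == 'f b'
assert normalize_space(' тр и гла\t\n за ', replace='_') == 'тр_и_гла_за'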
Example #6
def rex_text(body, regexp, flags=0, default=NULL):
    """
    Search `body` for `regexp` and return the first group of the match,
    with HTML entities decoded and whitespace normalized.
    """
    match = rex(body, regexp, flags=flags, default=default)
    try:
        return normalize_space(decode_entities(match.group(1)))
    except AttributeError:
        if default is NULL:
            raise DataNotFound('Regexp not found')
        else:
            return default
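The difference from Example #2's method is the failure mode: with no default, a miss surfaces as DataNotFound rather than a bare AttributeError. A simplified, self-contained sketch of that control flow; the class and function names here are stand-ins:

import re

class DataNotFound(Exception):
    """Stand-in for the DataNotFound used above."""

def rex_text_demo(body, regexp, flags=0, default=None):
    match = re.search(regexp, body, flags)
    try:
        return ' '.join(match.group(1).split())
    except AttributeError:  # match is None
        if default is None:
            raise DataNotFound('Regexp not found')
        return default

print(rex_text_demo('<b>Price: 10 USD</b>', r'Price:\s*(\d+)'))  # '10'
try:
    rex_text_demo('<b>no price</b>', r'Price:\s*(\d+)')
except DataNotFound as exc:
    print(exc)  # Regexp not found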
Example #7
def rex_text(self, regexp, flags=0, byte=False, default=NULL):
    """
    Search a regular expression in the response body and return the
    content of the first matching group.

    :param byte: if False then the search is performed in
        `response.unicode_body()`, else the rex is searched in
        `response.body`.
    """
    try:
        match = self.rex_search(regexp, flags=flags, byte=byte)
    except DataNotFound:
        if default is NULL:
            raise DataNotFound('Regexp not found')
        else:
            return default
    else:
        return normalize_space(decode_entities(match.group(1)))
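Here the same logic is a method on the response object, so a scraping session can call it directly. A hedged usage sketch, assuming this is the rex_text method exposed on grab's document object:

from grab import Grab

g = Grab()
g.go('https://example.com/')
# First group of the match, entity-decoded and space-normalized;
# passing a default avoids DataNotFound when the page has no <title>.
title = g.doc.rex_text(r'<title>([^<]*)</title>', default='')
print(title)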