コード例 #1
0
    def parse_template_5(self, element):
        """
        A template for a workshop with the conference acronym and year in the name.

        Examples:
            - http://ceur-ws.org/Vol-958/

        Args:
            element: pair of lxml elements; element[0] is the volume link,
                element[1] is the summary text containing "Edited by".

        Side effects:
            Registers the parsed workshop via self.add_workshop().
        """
        workshop = {}
        # Everything before "Edited by" is the volume title.
        title = rex.rex(element[1], r'(.*)Edited\s*by.*', re.I | re.S).group(1)

        workshop['volume_number'] = WorkshopSummaryParser.extract_volume_number(element[0].get('href'))
        # "<label> at <ACRONYM> <year>" -> label / conference acronym / year
        label_part = rex.rex(element[0].text, r'(.*)\sat\s(\w{2,})\s(\d{4})[\s\.]*', re.I | re.S)
        workshop['label'] = label_part.group(1)
        workshop['conf_acronym'] = label_part.group(2)
        workshop['conf_year'] = label_part.group(3)
        workshop['url'] = element[0].get('href')
        workshop['time'] = utils.parse_date(title)
        try:
            workshop['edition'] = tonumber(
                rex.rex(title,
                        r'.*Proceedings(\s*of)?(\s*the)?\s*(\d{1,}|first|second|third|forth|fourth|fifth)[thrd]*'
                        r'.*Workshop.*',
                        re.I, default=None).group(3))
        except Exception:
            # 'edition' property is optional; catch Exception (not a bare
            # except) so SystemExit/KeyboardInterrupt still propagate.
            pass

        self.add_workshop(workshop)
コード例 #2
0
    def parse_template_5(self, element):
        """
        A template for a workshop with the conference acronym and year in the name.

        Examples:
            - http://ceur-ws.org/Vol-958/

        Args:
            element: pair of lxml elements; element[0] is the volume link,
                element[1] is the summary text containing "Edited by".

        Side effects:
            Registers the parsed workshop via self.add_workshop().
        """
        workshop = {}
        # Everything before "Edited by" is the volume title.
        title = rex.rex(element[1], r'(.*)Edited\s*by.*', re.I | re.S).group(1)

        workshop['volume_number'] = WorkshopSummaryParser.extract_volume_number(element[0].get('href'))
        # "<label> at <ACRONYM> <year>" -> label / conference acronym / year
        label_part = rex.rex(element[0].text, r'(.*)\sat\s(\w{2,})\s(\d{4})[\s\.]*', re.I | re.S)
        workshop['label'] = label_part.group(1)
        workshop['conf_acronym'] = label_part.group(2)
        workshop['conf_year'] = label_part.group(3)
        workshop['url'] = element[0].get('href')
        workshop['time'] = utils.parse_date(title)
        try:
            workshop['edition'] = tonumber(
                rex.rex(title,
                        r'.*Proceedings(\s*of)?(\s*the)?\s*(\d{1,}|first|second|third|forth|fourth|fifth)[thrd]*'
                        r'.*Workshop.*',
                        re.I, default=None).group(3))
        except Exception:
            # 'edition' property is optional; catch Exception (not a bare
            # except) so SystemExit/KeyboardInterrupt still propagate.
            pass

        self.add_workshop(workshop)
コード例 #3
0
    def parse_template_1(self, element):
        """Extract volume number and short label from a single-workshop summary."""
        # Title is everything before "Edited by", with newlines stripped out.
        title_match = rex.rex(element[1], r'(.*)Edited\s*by.*', re.I | re.S)
        volume_title = title_match.group(1).replace('\n', '')

        # Joint / multi-workshop proceedings are handled by other templates.
        if re.match(r'^proceedings of the[joint ]*.*workshops.*|^joint proceedings.*', volume_title, re.I | re.S):
            raise DataNotFound()

        # Pull "<ACRONYM>'<yy>" / "<ACRONYM> <yyyy>" out of the parenthesized part.
        label_match = rex.rex(volume_title, r".*\((([\da-zA-Z*@\-&:]+?)['\s-]*(\d{2}|\d{4})|"
                                            r"([\da-zA-Z*@\-&:]+?)['\s-]*(\d{2}|\d{4})\s+at.*)\).*",
                              re.I | re.S)

        href = element[0].get('href')
        self.data['volume_number'] = WorkshopSummaryParser.extract_volume_number(href)
        self.data['short_label'] = label_match.group(2)
コード例 #4
0
    def parse_template_1(self, element):
        """Extract volume number and short label from a single-workshop summary."""
        # Title is everything before "Edited by", with newlines stripped out.
        title_match = rex.rex(element[1], r'(.*)Edited\s*by.*', re.I | re.S)
        volume_title = title_match.group(1).replace('\n', '')

        # Joint / multi-workshop proceedings are handled by other templates.
        if re.match(r'^proceedings of the[joint ]*.*workshops.*|^joint proceedings.*', volume_title, re.I | re.S):
            raise DataNotFound()

        # Pull "<ACRONYM>'<yy>" / "<ACRONYM> <yyyy>" out of the parenthesized part.
        label_match = rex.rex(volume_title, r".*\((([\da-zA-Z*@\-&:]+?)['\s-]*(\d{2}|\d{4})|"
                                            r"([\da-zA-Z*@\-&:]+?)['\s-]*(\d{2}|\d{4})\s+at.*)\).*",
                              re.I | re.S)

        href = element[0].get('href')
        self.data['volume_number'] = WorkshopSummaryParser.extract_volume_number(href)
        self.data['short_label'] = label_match.group(2)
コード例 #5
0
    def parse_template_2(self, element):
        """
        A template for joint proceedings of two workshops.

        Examples:
            - http://ceur-ws.org/Vol-776/
        """
        first = {'id': 1}
        second = {'id': 2}
        summary = rex.rex(element[1], r'^\s*(proceedings\s+of\s+joint.*on.*\((\w+)\-(\w+)\s+\d+\).*)Edited by.*',
                          re.I | re.S)

        if len(summary.groups()) != 3:
            raise DataNotFound()

        title = summary.group(1)
        href = element[0].get('href')
        volume_number = WorkshopSummaryParser.extract_volume_number(href)
        time = utils.parse_date(title)

        # Both workshops share the volume metadata; groups 2/3 are their acronyms.
        for workshop, acronym_group in ((first, 2), (second, 3)):
            workshop['volume_number'] = volume_number
            workshop['url'] = href
            workshop['time'] = time
            workshop['short_label'] = summary.group(acronym_group)

        self.add_workshop(first)
        self.add_workshop(second)
コード例 #6
0
    def parse_template_2(self, element):
        """
        A template for joint proceedings of two workshops.

        Examples:
            - http://ceur-ws.org/Vol-776/
        """
        first = {'id': 1}
        second = {'id': 2}
        summary = rex.rex(element[1], r'^\s*(proceedings\s+of\s+joint.*on.*\((\w+)\-(\w+)\s+\d+\).*)Edited by.*',
                          re.I | re.S)

        if len(summary.groups()) != 3:
            raise DataNotFound()

        title = summary.group(1)
        href = element[0].get('href')
        volume_number = WorkshopSummaryParser.extract_volume_number(href)
        time = utils.parse_date(title)

        # Both workshops share the volume metadata; groups 2/3 are their acronyms.
        for workshop, acronym_group in ((first, 2), (second, 3)):
            workshop['volume_number'] = volume_number
            workshop['url'] = href
            workshop['time'] = time
            workshop['short_label'] = summary.group(acronym_group)

        self.add_workshop(first)
        self.add_workshop(second)
コード例 #7
0
    def parse_template_main(self):
        proceedings_list = []
        tr = self.grab.tree.xpath(XPATH_SUMMARY)
        for i in range(0, len(tr), 2):
            href = tr[i].find(self.XPATH_SUMMARY_TITLE)
            try:
                if href.get('href') in config.input_urls or len(config.input_urls) == 1:
                    proceedings = dict()
                    proceedings['volume_number'] = ProceedingsSummaryParser.extract_volume_number(href.get('href'))
                    proceedings['url'] = href.get('href')
                    summary_match = rex.rex(
                        tr[i + 1].find('.//td[last()]').text_content(),
                        r'(.*)(\nEdited\s*by\s*:\s*)(.*)(\nSubmitted\s*by\s*:\s*)(.*)(\nPublished\s*on\s*CEUR-WS:\s*)(.*)(\nONLINE)(.*)',
                        re.I | re.M | re.S)

                    proceedings['label'] = re.sub(r'\n', '', text.normalize_space(summary_match.group(1), ' \n'))
                    proceedings['editors'] = re.split(r",+\s*", text.normalize_space(summary_match.group(3)))
                    proceedings['submission_date'] = datetime.strptime(
                        text.normalize_space(summary_match.group(7), ' \n'),
                        '%d-%b-%Y')

                    proceedings_list.append(proceedings)
            except:
                print "[WORKSHOP %s: ProceedingsSummaryParser] Summary information not found!" % href.get('href')
                #traceback.print_exc()

        self.data['proceedings_list'] = proceedings_list

        if len(proceedings_list) == 0:
            raise DataNotFound("There is no summary information to parse!")
コード例 #8
0
    def parse_template_main(self):
        proceedings_list = []
        tr = self.grab.tree.xpath(XPATH_SUMMARY)
        for i in range(0, len(tr), 2):
            href = tr[i].find(self.XPATH_SUMMARY_TITLE)
            try:
                if href.get('href') in config.input_urls or len(config.input_urls) == 1:
                    proceedings = dict()
                    proceedings['volume_number'] = ProceedingsSummaryParser.extract_volume_number(href.get('href'))
                    proceedings['url'] = href.get('href')
                    summary_match = rex.rex(
                        tr[i + 1].find('.//td[last()]').text_content(),
                        r'(.*)(\nEdited\s*by\s*:\s*)(.*)(\nSubmitted\s*by\s*:\s*)(.*)(\nPublished\s*on\s*CEUR-WS:\s*)(.*)(\nONLINE)(.*)',
                        re.I | re.M | re.S)

                    proceedings['label'] = re.sub(r'\n', '', text.normalize_space(summary_match.group(1), ' \n'))
                    proceedings['editors'] = re.split(r",+\s*", text.normalize_space(summary_match.group(3)))
                    proceedings['submission_date'] = datetime.strptime(
                        text.normalize_space(summary_match.group(7), ' \n'),
                        '%d-%b-%Y')

                    proceedings_list.append(proceedings)
            except:
                print "[WORKSHOP %s: ProceedingsSummaryParser] Summary information not found!" % href.get('href')
                #traceback.print_exc()

        self.data['proceedings_list'] = proceedings_list

        if len(proceedings_list) == 0:
            raise DataNotFound("There is no summary information to parse!")
コード例 #9
0
 def is_invited(publication):
     """Return True when the publication link marks a keynote/invited paper."""
     # rex.rex returns a match object (truthy) or the default None on no
     # match; compare against None instead of the redundant if/else that
     # returned literal True/False.
     return rex.rex(publication['link'],
                    r'.*(keynote|invite).*',
                    re.I,
                    default=None) is not None
コード例 #10
0
 def check_for_workshop_paper(publication):
     """Return True when the publication looks like a regular workshop paper."""
     # Front-matter entries (preface, overview, ...) are not papers.
     front_matter = rex.rex(publication['name'].strip(),
                            r'^(preface|overview|introduction|einleitung|foreword)$',
                            re.I,
                            default=None)
     if front_matter:
         return False
     # Regular papers are expected to be PDF files.
     return publication['link'].endswith('.pdf')
コード例 #11
0
    def parse_template_6(self, element):
        """
        Parse a workshop whose label is taken verbatim from the link text.

        Args:
            element: pair of lxml elements; element[0] is the volume link,
                element[1] is the summary text containing "Edited by".

        Side effects:
            Registers the parsed workshop via self.add_workshop().
        """
        workshop = {}
        # Everything before "Edited by" is the volume title.
        title = rex.rex(element[1], r'(.*)Edited\s*by.*', re.I | re.S).group(1)

        workshop['volume_number'] = WorkshopSummaryParser.extract_volume_number(element[0].get('href'))
        workshop['label'] = element[0].text.replace('.', '')
        workshop['url'] = element[0].get('href')
        workshop['time'] = utils.parse_date(title)
        try:
            workshop['edition'] = tonumber(
                rex.rex(title,
                        r'.*Proceedings(\s*of)?(\s*the)?\s*(\d{1,}|first|second|third|forth|fourth|fifth)[thrd]*'
                        r'.*Workshop.*',
                        re.I, default=None).group(3))
        except Exception:
            # 'edition' property is optional; catch Exception (not a bare
            # except) so SystemExit/KeyboardInterrupt still propagate.
            pass

        self.add_workshop(workshop)
コード例 #12
0
    def parse_template_6(self, element):
        """
        Parse a workshop whose label is taken verbatim from the link text.

        Args:
            element: pair of lxml elements; element[0] is the volume link,
                element[1] is the summary text containing "Edited by".

        Side effects:
            Registers the parsed workshop via self.add_workshop().
        """
        workshop = {}
        # Everything before "Edited by" is the volume title.
        title = rex.rex(element[1], r'(.*)Edited\s*by.*', re.I | re.S).group(1)

        workshop['volume_number'] = WorkshopSummaryParser.extract_volume_number(element[0].get('href'))
        workshop['label'] = element[0].text.replace('.', '')
        workshop['url'] = element[0].get('href')
        workshop['time'] = utils.parse_date(title)
        try:
            workshop['edition'] = tonumber(
                rex.rex(title,
                        r'.*Proceedings(\s*of)?(\s*the)?\s*(\d{1,}|first|second|third|forth|fourth|fifth)[thrd]*'
                        r'.*Workshop.*',
                        re.I, default=None).group(3))
        except Exception:
            # 'edition' property is optional; catch Exception (not a bare
            # except) so SystemExit/KeyboardInterrupt still propagate.
            pass

        self.add_workshop(workshop)
コード例 #13
0
    def parse_template_3(self):
        """
        Parse publications from list items containing a link plus an
        italic/emphasis (or <br>-tailed) block with the author names.

        Side effects:
            Stores the parsed publications in self.data['publications'].

        Raises:
            DataNotFound: when a publication has an empty author list or any
                other parsing error occurs for an element.
        """
        self.begin_template()
        publications = []

        # Primary layout: <li> with a link and <i>/<em>/<br> author markup.
        elements = self.grab.tree.xpath('//li[a[@href] and (i or em or br)]')
        if elements is None or len(elements) == 0:
            # Fallback layout: <p> elements instead of <li>.
            elements = self.grab.tree.xpath('//p[a[@href] and (i or em)]')

        for publication in elements:
            try:
                name = clean_string(publication.find('a').text_content())
                if rex.rex(name, r'.*(preface|first\s+pages|author\s+list|foreword).*', re.I, default=None):
                    # Skip front matter.  Examples: volumes 180, 186.
                    continue
                link = publication.find('a').get('href')
                editors = []
                editors_tag = None
                # Authors live in the last <i>, else in <em>, else in the
                # tail text following <br>.
                if publication.find('i') is not None:
                    editors_tag = publication.findall('i')[-1]
                elif publication.find('em') is not None:
                    editors_tag = publication.find('em')

                if editors_tag is None:
                    editors_tag_content = publication.find('br').tail
                else:
                    editors_tag_content = editors_tag.text_content()

                # Normalize "A, B and C" into a plain comma-separated list.
                editors_tag_content = re.sub(r'\s*[,\s]*and\s+', ',', editors_tag_content, flags=re.I | re.S).strip()

                if not editors_tag_content:
                    # A publication should have a non-empty list of authors.
                    raise DataNotFound(link)

                for publication_editor_name in editors_tag_content.split(","):
                    pen = clean_string(publication_editor_name.strip())
                    if pen:
                        editors.append(pen)

                # File name is the path component before ".pdf".
                file_name = link.rsplit('.pdf')[0].rsplit('/')[-1]
                publication_object = {
                    'name': name,
                    'file_name': file_name,
                    'link': self.task.url + link,
                    'editors': editors
                }
                publication_object['is_invited'] = self.is_invited(publication_object)
                if self.check_for_workshop_paper(publication_object):
                    publications.append(publication_object)
            except Exception as ex:
                # Any per-element failure aborts the whole template.
                raise DataNotFound(ex)

        self.data['publications'] = publications
        self.end_template()
コード例 #14
0
 def rex(body, patterns, flags=0, default=rex.NULL):
     """
     Apply several regex patterns in sequence via rex.rex.

     NOTE(review): this function shadows the module-level name ``rex`` while
     calling ``rex.rex`` internally — presumably it lives in a scope where
     ``rex`` still resolves to the rex module; confirm at the call site.

     Args:
         body: text to search.
         patterns: iterable of regex pattern strings.
         flags: re flags forwarded to rex.rex.
         default: value rex.rex returns when a pattern does not match.

     Returns:
         The result of the last rex.rex call attempted; iteration stops early
         as soon as a call returns a falsy result (e.g. the default).
         NOTE(review): continuing past a truthy match looks inverted —
         ``if result: break`` may have been intended; confirm before changing.

     Raises:
         DataNotFound: when every pattern raised DataNotFound.
     """
     result = None
     lastexception = DataNotFound()
     found = False
     for pattern in patterns:
         try:
             result = rex.rex(body, pattern, flags, default)
             found = True
             if not result:
                 break
         except DataNotFound as dnf:
             # Remember the failure; a later pattern may still succeed.
             lastexception = dnf
     if found:
         return result
     else:
         raise lastexception
コード例 #15
0
 def rex(body, patterns, flags=0, default=rex.NULL):
     """
     Apply several regex patterns in sequence via rex.rex.

     NOTE(review): this function shadows the module-level name ``rex`` while
     calling ``rex.rex`` internally — presumably it lives in a scope where
     ``rex`` still resolves to the rex module; confirm at the call site.

     Args:
         body: text to search.
         patterns: iterable of regex pattern strings.
         flags: re flags forwarded to rex.rex.
         default: value rex.rex returns when a pattern does not match.

     Returns:
         The result of the last rex.rex call attempted; iteration stops early
         as soon as a call returns a falsy result (e.g. the default).
         NOTE(review): continuing past a truthy match looks inverted —
         ``if result: break`` may have been intended; confirm before changing.

     Raises:
         DataNotFound: when every pattern raised DataNotFound.
     """
     result = None
     lastexception = DataNotFound()
     found = False
     for pattern in patterns:
         try:
             result = rex.rex(body, pattern, flags, default)
             found = True
             if not result:
                 break
         except DataNotFound as dnf:
             # Remember the failure; a later pattern may still succeed.
             lastexception = dnf
     if found:
         return result
     else:
         raise lastexception
コード例 #16
0
    def parse_template_1(self):
        """
        Extract the co-located conference acronym and year.

        Examples:
            - http://ceur-ws.org/Vol-1008/
            - http://ceur-ws.org/Vol-1081/
            - http://ceur-ws.org/Vol-1085/
        """
        self.begin_template()
        spans = self.grab.tree.xpath('//span[@class="CEURCOLOCATED"]/text()')
        try:
            colocated = rex.rex(spans[0],
                                r'([a-zA-Z\s*]+)[\s\']*(\d{4}|\d{2})', re.I)
        except IndexError as ex:
            # No CEURCOLOCATED span on the page.
            raise DataNotFound(ex)

        acronym = colocated.group(1)
        year = colocated.group(2)
        self.data['acronym'] = acronym.strip()
        self.data['year'] = extract_year(year)

        self.end_template()
コード例 #17
0
    def parse_template_1(self):
        """
        Extract the co-located conference acronym and year.

        Examples:
            - http://ceur-ws.org/Vol-1008/
            - http://ceur-ws.org/Vol-1081/
            - http://ceur-ws.org/Vol-1085/
        """
        self.begin_template()
        spans = self.grab.tree.xpath('//span[@class="CEURCOLOCATED"]/text()')
        try:
            colocated = rex.rex(spans[0],
                                r'([a-zA-Z\s*]+)[\s\']*(\d{4}|\d{2})', re.I)
        except IndexError as ex:
            # No CEURCOLOCATED span on the page.
            raise DataNotFound(ex)

        acronym = colocated.group(1)
        year = colocated.group(2)
        self.data['acronym'] = acronym.strip()
        self.data['year'] = extract_year(year)

        self.end_template()
コード例 #18
0
 def is_invited(publication):
     """Return True when the publication link marks a keynote/invited paper."""
     # rex.rex returns a match object (truthy) or the default None; compare
     # against None instead of the redundant if/else returning True/False.
     return rex.rex(publication['link'], r'.*(keynote|invite).*', re.I, default=None) is not None
コード例 #19
0
    def parse_template_3(self):
        """
        Parse publications from list items containing a link plus an
        italic/emphasis (or <br>-tailed) block with the author names.

        Side effects:
            Stores the parsed publications in self.data['publications'].

        Raises:
            DataNotFound: when a publication has an empty author list or any
                other parsing error occurs for an element.
        """
        self.begin_template()
        publications = []

        # Primary layout: <li> with a link and <i>/<em>/<br> author markup.
        elements = self.grab.tree.xpath('//li[a[@href] and (i or em or br)]')
        if elements is None or len(elements) == 0:
            # Fallback layout: <p> elements instead of <li>.
            elements = self.grab.tree.xpath('//p[a[@href] and (i or em)]')

        for publication in elements:
            try:
                name = clean_string(publication.find('a').text_content())
                if rex.rex(
                        name,
                        r'.*(preface|first\s+pages|author\s+list|foreword).*',
                        re.I,
                        default=None):
                    # Skip front matter.  Examples: volumes 180, 186.
                    continue
                link = publication.find('a').get('href')
                editors = []
                editors_tag = None
                # Authors live in the last <i>, else in <em>, else in the
                # tail text following <br>.
                if publication.find('i') is not None:
                    editors_tag = publication.findall('i')[-1]
                elif publication.find('em') is not None:
                    editors_tag = publication.find('em')

                if editors_tag is None:
                    editors_tag_content = publication.find('br').tail
                else:
                    editors_tag_content = editors_tag.text_content()

                # Normalize "A, B and C" into a plain comma-separated list.
                editors_tag_content = re.sub(r'\s*[,\s]*and\s+',
                                             ',',
                                             editors_tag_content,
                                             flags=re.I | re.S).strip()

                if not editors_tag_content:
                    # A publication should have a non-empty list of authors.
                    raise DataNotFound(link)

                for publication_editor_name in editors_tag_content.split(","):
                    pen = clean_string(publication_editor_name.strip())
                    if pen:
                        editors.append(pen)

                # File name is the path component before ".pdf".
                file_name = link.rsplit('.pdf')[0].rsplit('/')[-1]
                publication_object = {
                    'name': name,
                    'file_name': file_name,
                    'link': self.task.url + link,
                    'editors': editors
                }
                publication_object['is_invited'] = self.is_invited(
                    publication_object)
                if self.check_for_workshop_paper(publication_object):
                    publications.append(publication_object)
            except Exception as ex:
                # Any per-element failure aborts the whole template.
                raise DataNotFound(ex)

        self.data['publications'] = publications
        self.end_template()
コード例 #20
0
    def parse_template_5(self):
        """
        Parse publications from an RDFa-annotated CEURTOC table of contents.

        Examples: VOL 1513

        Side effects:
            Stores the parsed publications in self.data['publications'].

        Raises:
            DataNotFound: when any element fails to parse.
        """
        self.begin_template()
        publications = []

        i = 0
        for publication in self.grab.tree.xpath('//div[@class="CEURTOC"]/*[@rel="dcterms:hasPart"]/li'):
            try:
                if i == 0:
                    # The first entry is recorded as-is (usually front
                    # matter); when it is front matter, the rest of the item
                    # is skipped via `continue`.
                    i += 1
                    name = clean_string(publication.find('a').text_content())
                    href = publication.find('a').get('href')
                    link = href if href.startswith('http://') else self.task.url + href
                    num_of_pages, start, end = -1, -1, -1
                    publication_object = {
                        'name': name,
                        'file_name': href,
                        'link': link,
                        'editors': '',
                        'num_of_pages': num_of_pages,
                        'start_page': start,
                        'end_page': end
                    }
                    publication_object['is_invited'] = self.is_invited(publication_object)
                    publications.append(publication_object)
                    if rex.rex(name, r'.*(preface|first\s+pages|author\s+list|foreword).*', re.I, default=None):
                        continue
                name = clean_string(publication.find('span[@rel="dcterms:relation"]').text_content())

                href = publication.find('span[@rel="dcterms:relation"]//a/span[@property="bibo:uri"]').get('content')
                link = href if href.startswith('http://') else self.task.url + href
                num_of_pages, start, end = -1, -1, -1
                pages_span = publication.find('span[@class="CEURPAGES"]')
                # BUG FIX: lxml elements are falsy when they have no child
                # elements, so `if publication.find(...)` skipped text-only
                # CEURPAGES spans; test for presence explicitly instead.
                if pages_span is not None:
                    pages = pages_span.text_content().strip().split('-')
                    start, end, num_of_pages = pages[0], pages[1], int(pages[1]) - int(pages[0]) + 1
                # Get start and end page numbers from the PDF file when the
                # page numbers are not present on the web page.
                if link.endswith('.pdf') and start == -1:
                    num_of_pages, start, end = get_online_page_number(link)

                editors = []
                for publication_editor in publication.findall('span[@rel="dcterms:creator"]'):
                    editors.append(clean_string(publication_editor.text_content()).strip())

                publication_object = {
                    'name': name,
                    'file_name': href,
                    'link': link,
                    'editors': editors,
                    'num_of_pages': num_of_pages,
                    'start_page': start,
                    'end_page': end
                }
                publication_object['is_invited'] = self.is_invited(publication_object)
                if self.check_for_workshop_paper(publication_object):
                    publications.append(publication_object)
            except Exception as ex:
                # Any per-element failure aborts the whole template.
                raise DataNotFound(ex)

        self.data['publications'] = publications
        self.end_template()
コード例 #21
0
 def check_for_workshop_paper(publication):
     """Return True when the publication looks like a regular workshop paper."""
     title = publication['name'].strip()
     # Front-matter entries (preface, overview, ...) are not papers.
     is_front_matter = rex.rex(title, r'^(preface|overview|introduction|einleitung|foreword)$', re.I, default=None)
     if is_front_matter:
         return False
     # Regular papers are expected to be PDF files.
     return publication['link'].endswith('.pdf')
コード例 #22
0
 def extract_volume_number(url):
     """Return the numeric volume id from a ceur-ws.org Vol-<n> URL."""
     match = rex.rex(url, r'.*http://ceur-ws.org/Vol-(\d+).*')
     return match.group(1)
コード例 #23
0
 def extract_volume_number(url):
     """Return the numeric volume id from a ceur-ws.org Vol-<n> URL."""
     match = rex.rex(url, r'.*http://ceur-ws.org/Vol-(\d+).*')
     return match.group(1)