def parse_template_4(self, element):
        """
        A template for joint proceedings of three workshops.

        Extracts the shared volume metadata plus a (label, short_label)
        pair for each of the three workshops named in the summary.

        Examples:
            - http://ceur-ws.org/Vol-981/
            - http://ceur-ws.org/Vol-862/
            - http://ceur-ws.org/Vol-853/
        """
        summary = self.rex(element[1], [
            r'(joint\s+proceedings\s+of\s+[the]*.*workshops:\s*([\s\w]+)\(([a-zA-Z]+)\d+\)'
            r'[and,\s]+([\s\w]+)\(([a-zA-Z]+)\d+\)[and,\s]+([\s\w]+)\(([a-zA-Z]+)\d+\)[,\s]+.*)Edited by.*',
            r"(joint\s+proceedings\s+of\s+([\s\w,]+)\(([a-zA-Z]+)['\s]?\d+\)[and,\s]+([\s\w-]+)\(([a-zA-Z]+)['\s]?\d+\)"
            r"[and,\s]+([\s\w]+)\(([a-zA-Z]+)['\s]?\d+\)[,\s]+.*)Edited by.*"
        ], re.I | re.S)

        # Both patterns define exactly 7 groups; anything else means the
        # summary did not match the three-workshop layout.
        if len(summary.groups()) != 7:
            raise DataNotFound()

        title = summary.group(1)
        shared_url = element[0].get('href')
        volume_no = WorkshopSummaryParser.extract_volume_number(shared_url)
        parsed_time = utils.parse_date(title)

        # Groups 2/3, 4/5 and 6/7 are the (label, acronym) pairs for
        # workshops 1, 2 and 3 respectively.
        for idx in (1, 2, 3):
            workshop = {
                'id': idx,
                'volume_number': volume_no,
                'url': shared_url,
                'time': parsed_time,
                'label': summary.group(2 * idx),
                'short_label': summary.group(2 * idx + 1),
            }
            self.add_workshop(workshop)
 def parse_template_1(self):
     """
     Map each proceedings editor (chair) to a workshop acronym.

     The text between "Edited by" and the table of contents lists each
     editor followed by an acronym in parentheses, e.g. "Jane Doe (ABC2013)".
     Fills self.data['chairs'] with {editor_resource: acronym}.

     Raises:
         DataNotFound: if no editor could be matched to an acronym.

     Examples:
         - http://ceur-ws.org/Vol-981/
     """
     self.begin_template()
     # All text nodes located after "Edited by" and before the ToC.
     editors_block = u' '.join(
         self.grab.tree.xpath(
             '/html/body//text()[preceding::*[contains(., "Edited by")] and '
             'following::*[contains(.,"Table of Contents") or @class="CEURTOC"]]'
         ))
     editors = self.graph.objects(self.data['proceedings'], SWRC.editor)
     self.data['chairs'] = dict()
     for editor in editors:
         # next() builtin instead of Py2-only .next(): works on both
         # Python 2.6+ and Python 3 iterators.
         name = next(self.graph.objects(editor, FOAF.name))
         # re.escape the name: editor names may contain regex
         # metacharacters (dots, parentheses) that previously corrupted
         # the pattern. The separator class allows junk characters
         # between the name and the "(ACRONYMyear)" suffix.
         regexp = u'.*' + re.escape(name) + u'[\s~\xc2\xb0@#$%\^&*+-\xc2\xac]*\((\w+?)\d+\).*'
         match = re.match(regexp, editors_block, re.I | re.S)
         if match:
             self.data['chairs'][editor] = match.group(1)
     if not self.data['chairs']:
         raise DataNotFound()
# Example #3
# 0
    def parse_template_main(self):
        """
        Parse the CEUR-WS index page for proceedings summaries.

        Each summary spans two consecutive table rows: the first holds
        the volume link, the second the textual summary ("Edited by ...
        Submitted by ... Published on CEUR-WS ..."). Populates
        self.data['proceedings_list'] with dicts carrying volume_number,
        url, label, editors and submission_date.

        Raises:
            DataNotFound: if no summary at all could be parsed.
        """
        proceedings_list = []
        tr = self.grab.tree.xpath(XPATH_SUMMARY)
        for i in range(0, len(tr), 2):
            href = tr[i].find(self.XPATH_SUMMARY_TITLE)
            # Resolve the URL up front and tolerate a missing title link;
            # previously a None `href` raised inside the except handler
            # itself (href.get on None) and aborted the whole loop.
            url = href.get('href') if href is not None else None
            try:
                # A single configured input URL means "process everything".
                if url in config.input_urls or len(config.input_urls) == 1:
                    proceedings = dict()
                    proceedings['volume_number'] = \
                        ProceedingsSummaryParser.extract_volume_number(url)
                    proceedings['url'] = url
                    summary_match = rex.rex(
                        tr[i + 1].find('.//td[last()]').text_content(),
                        r'(.*)(\nEdited\s*by\s*:\s*)(.*)(\nSubmitted\s*by\s*:\s*)(.*)(\nPublished\s*on\s*CEUR-WS:\s*)(.*)(\nONLINE)(.*)',
                        re.I | re.M | re.S)

                    proceedings['label'] = re.sub(
                        r'\n', '',
                        text.normalize_space(summary_match.group(1), ' \n'))
                    # Editor names are comma separated in group 3.
                    proceedings['editors'] = re.split(
                        r",+\s*", text.normalize_space(summary_match.group(3)))
                    proceedings['submission_date'] = datetime.strptime(
                        text.normalize_space(summary_match.group(7), ' \n'),
                        '%d-%b-%Y')

                    proceedings_list.append(proceedings)
            except Exception:
                # Narrowed from a bare except so KeyboardInterrupt and
                # SystemExit are no longer swallowed; parsing stays
                # best-effort per volume.
                print("[WORKSHOP %s: ProceedingsSummaryParser] Summary information not found!" % url)
                #traceback.print_exc()

        self.data['proceedings_list'] = proceedings_list

        if len(proceedings_list) == 0:
            raise DataNotFound("There is no summary information to parse!")
# Example #4
# 0
 def check_for_completeness(self):
     """
     Verify that parsing produced at least one publication.

     On success this is a no-op; otherwise all parsed data is discarded
     and DataNotFound is raised.
     """
     if len(self.data['publications']) > 0:
         return
     # Nothing usable was extracted — reset state before failing.
     self.data = {}
     raise DataNotFound()
    def parse_template_3(self):
        """
        Parse publications from volumes whose table of contents uses
        plain <li>/<p> markup rather than CEUR microformat classes.

        Populates self.data['publications'] with one dict per paper:
        name, file_name, link, editors and is_invited.

        Raises:
            DataNotFound: when a list item cannot be parsed (wrapping
                the original exception) or has no author/editor names.
        """
        self.begin_template()
        publications = []

        # Prefer <li> items holding a link plus an italic/em/br element;
        # fall back to <p> items when no such <li> exists.
        elements = self.grab.tree.xpath('//li[a[@href] and (i or em or br)]')
        if elements is None or len(elements) == 0:
            elements = self.grab.tree.xpath('//p[a[@href] and (i or em)]')

        for publication in elements:
            try:
                name = clean_string(publication.find('a').text_content())
                # Skip front-matter entries (preface, author list, ...).
                if rex.rex(
                        name,
                        r'.*(preface|first\s+pages|author\s+list|foreword).*',
                        re.I,
                        default=None):
                    #Examples: 180, 186
                    continue
                link = publication.find('a').get('href')
                editors = []
                editors_tag = None
                # Names presumably sit in the LAST <i> tag, else the
                # first <em> — TODO confirm why <i> uses last and <em>
                # uses first against sample volumes.
                if publication.find('i') is not None:
                    editors_tag = publication.findall('i')[-1]
                elif publication.find('em') is not None:
                    editors_tag = publication.find('em')

                if editors_tag is None:
                    # No <i>/<em>: names follow a <br> as its tail text.
                    editors_tag_content = publication.find('br').tail
                else:
                    editors_tag_content = editors_tag.text_content()

                # Normalize "A, B and C" into a purely comma-separated list.
                editors_tag_content = re.sub(r'\s*[,\s]*and\s+',
                                             ',',
                                             editors_tag_content,
                                             flags=re.I | re.S).strip()

                if not editors_tag_content:
                    #a publication should have non-empty list of authors
                    raise DataNotFound(link)

                for publication_editor_name in editors_tag_content.split(","):
                    pen = clean_string(publication_editor_name.strip())
                    if pen:
                        editors.append(pen)

                # File name: drop the ".pdf" suffix and any path prefix.
                file_name = link.rsplit('.pdf')[0].rsplit('/')[-1]
                publication_object = {
                    'name': name,
                    'file_name': file_name,
                    'link': self.task.url + link,
                    'editors': editors
                }
                publication_object['is_invited'] = self.is_invited(
                    publication_object)
                if self.check_for_workshop_paper(publication_object):
                    publications.append(publication_object)
            except Exception as ex:
                #traceback.print_exc()
                # Any failure on a single item aborts the whole template.
                raise DataNotFound(ex)

        self.data['publications'] = publications
        self.end_template()
    def parse_template_2(self):
        """
        Parse publications from volumes using the CEUR microformat
        (CEURTOC/CEURTITLE/CEURAUTHORS/CEURSESSION classes).

        Populates self.data['publications']; for multi-workshop volumes
        each publication additionally gets a 'presentedAt' list of the
        workshops whose session heading precedes its link.

        Raises:
            DataNotFound: when an entry cannot be parsed (wrapping the
                original exception) or has an empty author list.

        Examples:
            - http://ceur-ws.org/Vol-1008/
            - http://ceur-ws.org/Vol-1043/
        """

        self.begin_template()
        publications = []

        # ToC entries: any element under CEURTOC that carries a link plus
        # both a title and an authors element.
        for element in self.grab.tree.xpath(
                '/html/body//*[@class="CEURTOC"]//*[a and '
                'descendant-or-self::*[@class="CEURAUTHORS"] and '
                'descendant-or-self::*[@class="CEURTITLE"]]'):
            try:
                name_el = element.find_class('CEURTITLE')[0]
                name = clean_string(name_el.text_content()).strip()
                if name is None or not name:
                    # In case of unclosed span element with the author list
                    # Example: http://ceur-ws.org/Vol-640
                    name = clean_string(name_el.tail)
                href = element.find('a').get('href')
                # Relative hrefs are resolved against the volume URL.
                link = href if href.startswith(
                    'http://') else self.task.url + href
                editors = []
                editors_list_el = element.find_class('CEURAUTHORS')[0]
                editors_list = clean_string(editors_list_el.text_content())
                if not editors_list:
                    # In case of unclosed span element with the author list
                    # Example: http://ceur-ws.org/Vol-1043
                    editors_list = clean_string(editors_list_el.tail)

                for editor_name in editors_list.split(","):
                    editor_name = clean_string(editor_name.strip())
                    if editor_name:
                        editors.append(editor_name)

                if not editors:
                    #a publication should have non-empty list of authors
                    raise DataNotFound(link)

                # File name: drop the ".pdf" suffix and any path prefix.
                file_name = link.rsplit('.pdf')[0].rsplit('/')[-1]
                publication_object = {
                    'name': name,
                    'file_name': file_name,
                    'link': link,
                    'editors': editors
                }
                publication_object['is_invited'] = self.is_invited(
                    publication_object)

                # For joint volumes, attribute the paper to workshops by
                # the nearest preceding CEURSESSION heading. Workshops
                # are (id, short_label)-like pairs — w[1] is matched
                # against the session text; TODO confirm tuple layout.
                if len(self.data['workshops']) > 1:
                    try:
                        session = self.grab.tree.xpath(
                            '//a[@href="%s"]/preceding::*[@class="CEURSESSION"][1]'
                            % href)[0]
                        publication_object['presentedAt'] = []
                        for w in self.data['workshops']:
                            if w[1] is not None and w[1] in session.text:
                                publication_object['presentedAt'].append(w[0])
                    except:
                        # traceback.print_exc()
                        # Best-effort: missing session info is tolerated.
                        pass

                if self.check_for_workshop_paper(publication_object):
                    publications.append(publication_object)
            except Exception as ex:
                raise DataNotFound(ex)

        self.data['publications'] = publications
        self.end_template()