def process_event(event):
    """Scrape one Eventseer event page and save its CFP text and RDF graph.

    The event page is fetched from ``http://eventseer.net``.  When the page
    has a ``cfp-content`` element, its markup is flattened to plain text and
    written to ``data/<event_id>_cfp.txt``, and an RDF description of the
    event (dates, location, call for papers, deadlines, topics and persons)
    is written to ``data/<event_id>.rdf``.  Pages without that element are
    skipped and nothing is written.

    @param event: mapping with an 'Event' entry holding HTML that contains an
        ``href="..."`` attribute, plus 'StartDate', 'EndDate', 'City' and
        'Country' entries that may be None; dates are parsed with the
        "%d %b %Y" format.
    @return: None
    @raise AttributeError: if event['Event'] contains no href attribute
    @raise ValueError: if a date string does not match "%d %b %Y"
    """
    # Get the event page and compute its id
    url = re.search('href="([^"]*)"', event['Event']).group(1)
    # NOTE(review): replace('e', ...) rewrites every 'e' left once the slashes
    # are removed, not only the leading one -- presumably paths look like
    # /e/<digits>/; confirm before tightening.
    event_id = url.replace('/', '').replace('e', 'event_')

    # Download the page; always release the connection, even if parsing fails
    response = urllib2.urlopen("http://eventseer.net" + url)
    try:
        document = BeautifulSoup(response.read())
    finally:
        response.close()

    # Without a CFP section there is nothing to export
    cfp_node = document.find(id='cfp-content')
    if cfp_node is None:
        return

    # Convert the CFP to text and save it
    cfp = cfp_node.renderContents()
    for markup, replacement in (('<p>', '\n'), ('</p>', ''), ('<br />', ''),
                                ('<ul>', '\n'), ('</ul>', ''),
                                ('<li>', ''), ('</li>', '')):
        cfp = cfp.replace(markup, replacement)
    # Keep only the text of the links
    cfp = re.sub(r'<a [^>]*>([^<]*)</a>', r'\g<1>', cfp, flags=re.IGNORECASE)
    # Drop the newlines introduced at the very start of the text
    cfp = cfp.lstrip('\n')
    with open('data/' + event_id + '_cfp.txt', 'w') as cfp_file:
        cfp_file.write(cfp)

    # Create, populate and save the RDF graph
    graph = ConjunctiveGraph()
    graph.bind('swc', SWC)
    graph.bind('cfp', CFP)
    graph.bind('ical', ICAL)
    graph.bind('foaf', FOAF)
    graph.bind('dct', DCT)
    graph.bind('lode', LODE)

    ### Event
    resource_event = LDES[event_id]
    graph.add((resource_event, RDF.type, SWC['AcademicEvent']))
    if event['StartDate'] is not None:
        start = datetime.strptime(event['StartDate'], "%d %b %Y")
        graph.add((resource_event, ICAL['dtstart'], Literal(start)))
    if event['EndDate'] is not None:
        end = datetime.strptime(event['EndDate'], "%d %b %Y")
        graph.add((resource_event, ICAL['dtend'], Literal(end)))
    if event['City'] is not None and event['Country'] is not None:
        city = get_location(event['City'], event['Country'])
        if city is not None:
            city = URIRef(city)
        else:
            # No location found: fall back to a plain "City, Country" literal
            city = Literal(event['City'] + ", " + event['Country'])
        graph.add((resource_event, FOAF['based_near'], city))

    ### CFP
    resource_cfp = LDES[event_id + '_cfp']
    graph.add((resource_cfp, RDF.type, CFP['CallForPapers']))
    graph.add((resource_cfp, CFP['for'], resource_event))
    graph.add((resource_cfp, CFP['details'],
               URIRef(BASE + 'data/' + event_id + '_cfp.txt')))

    ### Deadlines
    # The deadlines are embedded in the page as a JavaScript array literal;
    # it is rewritten into valid JSON before being parsed.  If several scripts
    # declare the variable, the last one wins.
    deadlines = []
    for script in document.findAll('script'):
        res = re.search('var deadlineList = ([^;]*);', script.renderContents())
        if res is not None:
            txt = res.group(1).replace('\n', '').replace('\t', '')
            txt = txt.replace("'", '"')
            txt = re.sub(r'<span [^>]*>([^<]*)</span>', r'\g<1>', txt,
                         flags=re.IGNORECASE)
            # Quote the bare object keys
            txt = txt.replace('Date:', '"Date":').replace('Title:', '"Title":')
            deadlines = json.loads(txt)
    for index, deadline in enumerate(deadlines):
        resource_deadline = LDES[event_id + '_deadline-' + str(index)]
        # A deadline is a single point in time: start and end are identical
        when = Literal(datetime.strptime(deadline['Date'], "%d %b %Y"))
        graph.add((resource_deadline, RDF.type, ICAL['Vevent']))
        graph.add((resource_deadline, ICAL['dtstart'], when))
        graph.add((resource_deadline, ICAL['dtend'], when))
        graph.add((resource_deadline, ICAL['summary'],
                   Literal(deadline['Title'])))
        graph.add((resource_deadline, ICAL['relatedTo'], resource_event))

    ### Topics and persons
    for anchor in cfp_node.findAll('a'):
        link = anchor.get('href')
        if link is None:
            continue
        if link.startswith('/t/'):
            topic_id = link.replace('/t/', 'topic_').replace('/', '')
            graph.add((resource_event, DCT['subject'], LDES[topic_id]))
        elif link.startswith('/p/'):
            person_id = link.replace('/p/', 'person_').replace('/', '')
            graph.add((resource_event, LODE['involvedAgent'],
                       LDES[person_id]))

    ### Save
    with open('data/' + event_id + '.rdf', 'w') as rdf_file:
        rdf_file.write(graph.serialize())
Example #2
0
def _cfp_markup_to_text(markup):
    """Flatten CFP markup to text: drop p/br/ul/li tags and unwrap links."""
    text = markup.replace('<p>', '\n').replace('</p>', '')
    text = text.replace('<br />', '')
    text = text.replace('<ul>', '\n').replace('</ul>', '')
    text = text.replace('<li>', '').replace('</li>', '')
    # Keep only the text of the links
    text = re.sub(r'<a [^>]*>([^<]*)</a>', r'\g<1>', text, flags=re.IGNORECASE)
    # Drop the newlines introduced at the very start of the text
    return text.lstrip('\n')


def _extract_deadlines(document):
    """Return the parsed JavaScript ``deadlineList`` of the page, or []."""
    deadlines = []
    for script in document.findAll('script'):
        found = re.search('var deadlineList = ([^;]*);',
                          script.renderContents())
        if found is None:
            continue
        # Rewrite the JavaScript array literal into valid JSON: one line,
        # double quotes, no <span> wrappers, quoted object keys.
        txt = found.group(1).replace('\n', '').replace('\t', '')
        txt = txt.replace("'", '"')
        txt = re.sub(r'<span [^>]*>([^<]*)</span>', r'\g<1>', txt,
                     flags=re.IGNORECASE)
        txt = txt.replace('Date:', '"Date":').replace('Title:', '"Title":')
        # No early exit: if several scripts declare the list, the last wins
        deadlines = json.loads(txt)
    return deadlines


def process_event(event):
    """Export the call for papers and the RDF description of one event.

    Downloads the event page from ``http://eventseer.net``.  If the page has
    no ``cfp-content`` element the function returns without writing anything.
    Otherwise it writes the CFP as text to ``data/<event_id>_cfp.txt`` and
    the serialized RDF graph (event, location, CFP, deadlines, topics and
    persons) to ``data/<event_id>.rdf``.

    @param event: mapping with an 'Event' entry holding HTML that contains an
        ``href="..."`` attribute, plus 'StartDate', 'EndDate', 'City' and
        'Country' entries that may be None; dates use the "%d %b %Y" format.
    @return: None
    @raise AttributeError: if event['Event'] contains no href attribute
    @raise ValueError: if a date string does not match "%d %b %Y"
    """
    # Get the event page and compute its id
    url = re.search('href="([^"]*)"', event['Event']).group(1)
    # NOTE(review): every 'e' remaining after the slashes are removed is
    # replaced, not just the first -- presumably paths are /e/<digits>/;
    # confirm before changing.
    event_id = url.replace('/', '').replace('e', 'event_')

    # Fetch the page and make sure the connection is closed afterwards
    response = urllib2.urlopen("http://eventseer.net" + url)
    try:
        document = BeautifulSoup(response.read())
    finally:
        response.close()

    cfp_node = document.find(id='cfp-content')
    if cfp_node is None:
        # No call for papers on this page: nothing to export
        return

    # Convert the CFP to text and save it
    with open('data/' + event_id + '_cfp.txt', 'w') as cfp_file:
        cfp_file.write(_cfp_markup_to_text(cfp_node.renderContents()))

    # Create, populate and save the RDF graph
    graph = ConjunctiveGraph()
    for prefix, namespace in (('swc', SWC), ('cfp', CFP), ('ical', ICAL),
                              ('foaf', FOAF), ('dct', DCT), ('lode', LODE)):
        graph.bind(prefix, namespace)

    ### Event
    resource_event = LDES[event_id]
    graph.add((resource_event, RDF.type, SWC['AcademicEvent']))
    for key, predicate in (('StartDate', ICAL['dtstart']),
                           ('EndDate', ICAL['dtend'])):
        if event[key] is not None:
            moment = datetime.strptime(event[key], "%d %b %Y")
            graph.add((resource_event, predicate, Literal(moment)))
    if event['City'] is not None and event['Country'] is not None:
        location = get_location(event['City'], event['Country'])
        if location is None:
            # No location found: fall back to a plain "City, Country" literal
            place = Literal(event['City'] + ", " + event['Country'])
        else:
            place = URIRef(location)
        graph.add((resource_event, FOAF['based_near'], place))

    ### CFP
    resource_cfp = LDES[event_id + '_cfp']
    graph.add((resource_cfp, RDF.type, CFP['CallForPapers']))
    graph.add((resource_cfp, CFP['for'], resource_event))
    graph.add((resource_cfp, CFP['details'],
               URIRef(BASE + 'data/' + event_id + '_cfp.txt')))

    ### Deadlines
    for index, deadline in enumerate(_extract_deadlines(document)):
        resource_deadline = LDES[event_id + '_deadline-' + str(index)]
        # A deadline is a single point in time: start and end are identical
        when = Literal(datetime.strptime(deadline['Date'], "%d %b %Y"))
        graph.add((resource_deadline, RDF.type, ICAL['Vevent']))
        graph.add((resource_deadline, ICAL['dtstart'], when))
        graph.add((resource_deadline, ICAL['dtend'], when))
        graph.add(
            (resource_deadline, ICAL['summary'], Literal(deadline['Title'])))
        graph.add((resource_deadline, ICAL['relatedTo'], resource_event))

    ### Topics and persons
    for anchor in cfp_node.findAll('a'):
        link = anchor.get('href')
        if link is None:
            continue
        if link.startswith('/t/'):
            topic_id = link.replace('/t/', 'topic_').replace('/', '')
            graph.add((resource_event, DCT['subject'], LDES[topic_id]))
        elif link.startswith('/p/'):
            person_id = link.replace('/p/', 'person_').replace('/', '')
            graph.add(
                (resource_event, LODE['involvedAgent'], LDES[person_id]))

    ### Save
    with open('data/' + event_id + '.rdf', 'w') as rdf_file:
        rdf_file.write(graph.serialize())
Example #3
0
    def _process_data(self, document):
        '''
        Creates the RDF graph describing the event and stores its
        serialization in self.rdf_data.

        The title, location, period, deadlines and topic/person links are
        each read from the page only when the corresponding element is
        present; a period that cannot be split into "<month> <day>" parts is
        skipped instead of aborting the whole extraction.

        @param document: the DOM document of the event
        '''
        # Create the graph
        graph = ConjunctiveGraph()
        for prefix, namespace in (('swc', SWC), ('cfp', CFP), ('ical', ICAL),
                                  ('foaf', FOAF), ('dct', DCT),
                                  ('lode', LODE)):
            graph.bind(prefix, namespace)

        # Init the event
        resource_name = self.get_resource_name()
        resource_event = LDES[resource_name]
        graph.add((resource_event, RDF.type, SWC['AcademicEvent']))

        # Get the title
        title_box = document.find(id='inner_left')
        if title_box is not None:
            title = title_box.find('h1').text
            graph.add((resource_event, RDFS.label, Literal(title)))

        # Get the location: each value sits in the element following the
        # parent of its "City:" / "Country:" label
        city_label = document.find(text='City:')
        country_label = document.find(text='Country:')
        if city_label is not None and country_label is not None:
            city = city_label.findParent().findNextSibling().renderContents()
            country = country_label.findParent().findNextSibling(
            ).renderContents()
            location = get_location(city, country)
            if location is None:
                # No location found: fall back to a "City, Country" literal
                location = Literal("%s, %s" % (city, country))
            graph.add((resource_event, FOAF['based_near'], location))

        # Get the starting and ending dates
        period_label = document.find(text='Period:')
        if period_label is not None:
            text = period_label.findParent().findNextSibling().renderContents()
            # Matches "<month> <day>[-[<month> ]<day>], <year>"
            found = re.search(
                r'(?P<begin>[^-,]*)(-(?P<end>[^,]*))?, (?P<year>\d{4})', text)
            # split() tolerates stray whitespace around the parts
            begin_parts = found.group('begin').split() if found else []
            if len(begin_parts) == 2:
                month, start_day = begin_parts
                year = found.group('year')
                begin_date = datetime.strptime(
                    "%s %s %s" % (start_day, month, year), "%d %B %Y")
                graph.add(
                    (resource_event, ICAL['dtstart'], Literal(begin_date)))
                end = found.group('end')
                end_parts = end.split() if end is not None else []
                end_date = None
                if len(end_parts) == 2:
                    # The end names its own month: "<month> <day>"
                    end_date = datetime.strptime(
                        "%s %s %s" % (end_parts[1], end_parts[0], year),
                        "%d %B %Y")
                elif len(end_parts) == 1:
                    # Only a day is given: same month as the start
                    end_date = datetime.strptime(
                        "%s %s %s" % (end_parts[0], month, year), "%d %B %Y")
                if end_date is not None:
                    graph.add(
                        (resource_event, ICAL['dtend'], Literal(end_date)))

        # Get the data for the CFP
        resource_cfp = LDES[resource_name + "_cfp"]
        graph.add((resource_cfp, RDF.type, CFP['CallForPapers']))
        # NOTE(review): this targets LDES[self.entity_id] while the event
        # itself is LDES[self.get_resource_name()] -- confirm both denote the
        # same resource.
        graph.add((resource_cfp, CFP['for'], LDES[self.entity_id]))
        graph.add((resource_cfp, CFP['details'],
                   URIRef(BASE + 'data/' + resource_name + '_cfp.txt')))

        # Get the deadlines: they are embedded in the page as a JavaScript
        # array literal that is rewritten into valid JSON before parsing.
        # If several scripts declare the variable, the last one wins.
        deadlines = []
        for script in document.findAll('script'):
            res = re.search('var deadlineList = ([^;]*);',
                            script.renderContents())
            if res is not None:
                txt = res.group(1).replace('\n', '').replace('\t', '')
                txt = txt.replace("'", '"')
                txt = re.sub(r'<span [^>]*>([^<]*)</span>', r'\g<1>', txt,
                             flags=re.IGNORECASE)
                # Quote the bare object keys
                txt = txt.replace('Date:', '"Date":')
                txt = txt.replace('Title:', '"Title":')
                deadlines = json.loads(txt)
        for index, deadline in enumerate(deadlines):
            resource_deadline = LDES[resource_name + '_deadline_' +
                                     str(index)]
            # A deadline is a single point in time: start and end coincide
            when = Literal(datetime.strptime(deadline['Date'], "%d %b %Y"))
            graph.add((resource_deadline, RDF.type, ICAL['Vevent']))
            graph.add((resource_deadline, ICAL['dtstart'], when))
            graph.add((resource_deadline, ICAL['dtend'], when))
            graph.add((resource_deadline, ICAL['summary'],
                       Literal(deadline['Title'])))
            graph.add((resource_deadline, ICAL['relatedTo'], resource_event))

        # Add the topics and persons
        # NOTE(review): the membership tests below use the raw href while the
        # sets store link[1:-1], so they look like they can never match --
        # confirm the intended key before changing them.
        cfp_content = document.find(id='cfp-content')
        if cfp_content is not None:
            for anchor in cfp_content.findAll('a'):
                link = anchor.get('href')
                if link is None:
                    continue
                if link[:3] == '/t/' and link not in self.topics_set:
                    try:
                        graph.add(
                            (resource_event, DCT['subject'],
                             LDES[Topic(link[1:-1]).get_resource_name()]))
                        self.topics_set.add(link[1:-1])
                    except Exception:
                        # Ignore bad topic links (but let KeyboardInterrupt
                        # and SystemExit propagate)
                        pass
                if link[:3] == '/p/' and link not in self.persons_set:
                    try:
                        graph.add(
                            (resource_event, LODE['involvedAgent'],
                             LDES[Person(link[1:-1]).get_resource_name()]))
                        self.persons_set.add(link[1:-1])
                    except Exception:
                        # Ignore bad person links (but let KeyboardInterrupt
                        # and SystemExit propagate)
                        pass

        # Set the last modification date
        graph.add(
            (self.get_named_graph(), DCT['modified'], Literal(datetime.now())))

        # Save the data
        self.rdf_data = graph.serialize()
Example #4
0
 def _process_data(self, document):
      '''
      Creates the RDF graph describing the event and stores its
      serialization in self.rdf_data.

      The title, location, period, deadlines and topic/person links are each
      read from the page only when the corresponding element is present; a
      period that cannot be split into "<month> <day>" parts is skipped
      instead of aborting the whole extraction.

      @param document: the DOM document of the event
      '''
      # Create the graph
      graph = ConjunctiveGraph()
      graph.bind('swc', SWC)
      graph.bind('cfp', CFP)
      graph.bind('ical', ICAL)
      graph.bind('foaf', FOAF)
      graph.bind('dct', DCT)
      graph.bind('lode', LODE)

      # Init the event
      resource_name = self.get_resource_name()
      resource_event = LDES[resource_name]
      graph.add((resource_event, RDF.type, SWC['AcademicEvent']))

      # Get the title
      title_box = document.find(id='inner_left')
      if title_box is not None:
          graph.add((resource_event, RDFS.label, Literal(title_box.find('h1').text)))

      # Get the location: each value sits in the element following the
      # parent of its "City:" / "Country:" label
      city_label = document.find(text='City:')
      country_label = document.find(text='Country:')
      if city_label is not None and country_label is not None:
          city = city_label.findParent().findNextSibling().renderContents()
          country = country_label.findParent().findNextSibling().renderContents()
          location = get_location(city, country)
          if location is None:
              # No location found: fall back to a "City, Country" literal
              location = Literal("%s, %s" % (city, country))
          graph.add((resource_event, FOAF['based_near'], location))

      # Get the starting and ending dates
      period_label = document.find(text='Period:')
      if period_label is not None:
          text = period_label.findParent().findNextSibling().renderContents()
          # Matches "<month> <day>[-[<month> ]<day>], <year>"
          found = re.search(r'(?P<begin>[^-,]*)(-(?P<end>[^,]*))?, (?P<year>\d{4})', text)
          begin_parts = []
          if found is not None:
              # split() tolerates stray whitespace around the parts
              begin_parts = found.group('begin').split()
          if len(begin_parts) == 2:
              (month, start_day) = begin_parts
              year = found.group('year')
              begin_date = datetime.strptime("%s %s %s" % (start_day, month, year), "%d %B %Y")
              graph.add((resource_event, ICAL['dtstart'], Literal(begin_date)))
              if found.group('end') is not None:
                  end_parts = found.group('end').split()
                  end_date = None
                  if len(end_parts) == 2:
                      # The end names its own month: "<month> <day>"
                      end_date = datetime.strptime("%s %s %s" % (end_parts[1], end_parts[0], year), "%d %B %Y")
                  elif len(end_parts) == 1:
                      # Only a day is given: same month as the start
                      end_date = datetime.strptime("%s %s %s" % (end_parts[0], month, year), "%d %B %Y")
                  if end_date is not None:
                      graph.add((resource_event, ICAL['dtend'], Literal(end_date)))

      # Get the data for the CFP
      resource_cfp = LDES[resource_name + "_cfp"]
      graph.add((resource_cfp, RDF.type, CFP['CallForPapers']))
      # NOTE(review): this targets LDES[self.entity_id] while the event itself
      # is LDES[self.get_resource_name()] -- confirm both denote the same
      # resource.
      graph.add((resource_cfp, CFP['for'], LDES[self.entity_id]))
      graph.add((resource_cfp, CFP['details'], URIRef(BASE + 'data/' + resource_name + '_cfp.txt')))

      # Get the deadlines: they are embedded in the page as a JavaScript
      # array literal that is rewritten into valid JSON before parsing.
      # If several scripts declare the variable, the last one wins.
      deadlines = []
      for script in document.findAll('script'):
          res = re.search('var deadlineList = ([^;]*);', script.renderContents())
          if res is None:
              continue
          txt = res.group(1).replace('\n', '').replace('\t', '').replace("'", '"')
          txt = re.sub(r'<span [^>]*>([^<]*)</span>', r'\g<1>', txt, flags=re.IGNORECASE)
          # Quote the bare object keys
          txt = txt.replace('Date:', '"Date":').replace('Title:', '"Title":')
          deadlines = json.loads(txt)
      for index, deadline in enumerate(deadlines):
          resource_deadline = LDES[resource_name + '_deadline_' + str(index)]
          # A deadline is a single point in time: start and end coincide
          deadline_date = datetime.strptime(deadline['Date'], "%d %b %Y")
          graph.add((resource_deadline, RDF.type, ICAL['Vevent']))
          graph.add((resource_deadline, ICAL['dtstart'], Literal(deadline_date)))
          graph.add((resource_deadline, ICAL['dtend'], Literal(deadline_date)))
          graph.add((resource_deadline, ICAL['summary'], Literal(deadline['Title'])))
          graph.add((resource_deadline, ICAL['relatedTo'], resource_event))

      # Add the topics and persons
      # NOTE(review): the membership tests below use the raw href while the
      # sets store link[1:-1], so they look like they can never match --
      # confirm the intended key before changing them.
      cfp_content = document.find(id='cfp-content')
      if cfp_content is not None:
          for anchor in cfp_content.findAll('a'):
              link = anchor.get('href')
              if link is not None:
                  if link[:3] == '/t/' and link not in self.topics_set:
                      try:
                          graph.add((resource_event, DCT['subject'], LDES[Topic(link[1:-1]).get_resource_name()]))
                          self.topics_set.add(link[1:-1])
                      except Exception:
                          # Ignore bad topic links (but let KeyboardInterrupt
                          # and SystemExit propagate)
                          pass
                  if link[:3] == '/p/' and link not in self.persons_set:
                      try:
                          graph.add((resource_event, LODE['involvedAgent'], LDES[Person(link[1:-1]).get_resource_name()]))
                          self.persons_set.add(link[1:-1])
                      except Exception:
                          # Ignore bad person links (but let KeyboardInterrupt
                          # and SystemExit propagate)
                          pass

      # Set the last modification date
      graph.add((self.get_named_graph(), DCT['modified'], Literal(datetime.now())))

      # Save the data
      self.rdf_data = graph.serialize()