def main(argv):
    #TODO: This an RSS, there should be programatical ways to have an order to limit
    # duplication
    args = args_process(argv)

    soup = BeautifulSoup(open(args.inputfile),'lxml')

    currentbase = Graph()
    currentbase.load(args.current, format='turtle')

    g = Graph()
    c = 0
    for item in soup.find_all('item'):
        c += 1
        subject = URIRef("http://www.edsa-project.eu/jobs/StackOverflow/"+str(c))
        #URL
        url = Literal(item.guid.string)
        if args.verbose:
            print "Processing post " + url
        if is_duplicate(currentbase,url):
            if args.verbose:
                print url +" identified as duplicate, skipping..."
            continue
        g.add((subject,ns.schema.url,url))
        #Title
        g.add((subject,ns.schema.jobTitle,Literal(item.title.string)))
        #Description
        g.add((subject,ns.schema.description,Literal(item.description.string)))
        #PubDate
        date = dtp.parse(item.pubdate.string)
        g.add((subject,ns.schema.datePosted,
            Literal(date.isoformat(),datatype=XSD.Date)))

        for org in item.find_all('a10:name'):
            #hiringOrganization
            #TODO: Service to OpenCorporates to entity matching
            # Low priority, maybe can be done with SILK later
            g.add((subject,ns.schema.hiringOrganization,Literal(org.string)))
        for cat in item.find_all('category'):
            #skills
            skill = URIRef("http://www.edsa-project.eu/skill/"+cat.string)
            g.add((subject,ns.edsa.requiresSkill,skill))
            g.add((skill,ns.edsa.lexicalValue,Literal(cat.string)))
        if item.location is not None:
            #location
            g.add((subject,ns.schema.jobLocation,Literal(item.location.string)))
            try:
                tup = gn.find_location(item.location.string)
                g.add((subject,ns.edsa.Location,URIRef(tup[0])))
                g += tup[1]
            except gn.NotFoundException as e:
                #TODO: Redirect to an error file
                print("%s in subject %s" % (e,subject))
                print("problematic location %s" % item.location.string)


    currentbase += g
    g.serialize(destination=args.outputfile, format='turtle')
    currentbase.serialize(destination=args.current, format='turtle')
# Example #2
 def test_find_location_simple(self):
     """A plain "City, Country" string resolves to a GeoNames IRI plus a graph."""
     result = gn.find_location("Southampton, UK")
     iri, graph = result[0], result[1]
     self.assertTrue(isinstance(iri, rdflib.URIRef))
     self.assertTrue(isinstance(graph, rdflib.Graph))
     # The returned graph should at least carry the matched place name.
     name_query = prepareQuery(
         """ ASK {
     ?iri gn:name ?name 
     }
     """,
         initNs={"gn": ns.geonames},
     )
     self.assertTrue(graph.query(name_query, initBindings={"name": rdflib.Literal("Southampton")}))
# Example #3
    def test_find_location_utf8(self):
        """A non-ASCII (Greek) location string resolves to its GeoNames entry."""
        location = "Αθήνα, GR"
        tup = gn.find_location(location)
        self.assertTrue(isinstance(tup[0], rdflib.URIRef))
        self.assertTrue(isinstance(tup[1], rdflib.Graph))
        # At least we brought the correct name and country
        askplace = prepareQuery(
            """ ASK {
        ?iri gn:name ?name 
        }
        """,
            initNs={"gn": ns.geonames},
        )
        self.assertTrue(tup[1].query(askplace, initBindings={"name": rdflib.Literal("Athens")}))

    def test_find_location_unknown(self):
        """An unresolvable location name raises gn.NotFoundException."""
        # BUGFIX: this was nested inside test_find_location_utf8, so the test
        # runner never collected it; it also used a bare assertRaises, which is
        # undefined at module scope (it is a unittest.TestCase method).
        location = "Unknownlandidfgdfg"
        with self.assertRaises(gn.NotFoundException):
            gn.find_location(location)
# Example #4
 def test_find_location_unknown(self):
     """An unresolvable location name raises gn.NotFoundException."""
     # BUGFIX: bare assertRaises is a NameError here — it is a method of
     # unittest.TestCase and must be called through self.
     location = "Unknownlandidfgdfg"
     with self.assertRaises(gn.NotFoundException):
         gn.find_location(location)
# Example #5
def main(argv):
    args = args_process(argv)

    soup = BeautifulSoup(open(args.inputfile),'lxml')

    currentbase = Graph()
    currentbase.load(args.current, format='turtle')

    g = Graph()
    for item in soup.find_all('result'):
        url = shrink_url(item.url.string.strip())
        subject = URIRef(url)
        #TODO: Check that with URL as subject, deduplication is not
        # needed
        if args.verbose:
            print "Processing post " + url
        
        if is_duplicate(currentbase,subject) or is_duplicate(g,subject):
            if args.verbose:
                print url +" identified as duplicate, skipping..."
            continue
        #URL
        g.add((subject,ns.schema.url,URIRef(url)))
        #Source
        g.add((subject,ns.schema.source,Literal("Indeed")))
        #Title
        g.add((subject,ns.schema.jobTitle,Literal(item.jobtitle.string.strip())))
        #Description
        g.add((subject,ns.schema.description,Literal(get_description(url))))
        #PubDate
        date = dtp.parse(item.date.string)
        g.add((subject,ns.schema.datePosted,
            Literal(date.isoformat(),datatype=XSD.Date)))

        #hiringOrganization
        try:
            g.add((subject,ns.schema.hiringOrganization,
                Literal(item.company.string.strip())))
        except AttributeError:
            if args.verbose:
                print ("%s has no company in the source data" % subject)
        #location
        location = item.formattedlocation.string.strip()
        g.add((subject,ns.schema.jobLocation,Literal(location)))
        try:
            #TODO changing point when separating geonames subset from current base
            if gn.is_inside(location,currentbase):
                if args.verbose:
                    print ("%s already linked to geonames, reusing..." % location)
                lociri = gn.get_iri(location,currentbase)
                g.add((subject,ns.edsa.Location,lociri))
            elif gn.is_inside(location,g):
                if args.verbose:
                    print ("%s already linked to geonames, reusing..." % location)
                lociri = gn.get_iri(location,g)
                g.add((subject,ns.edsa.Location,lociri))
            else:
                tup = gn.find_location(item.formattedlocation.string.strip())
                g.add((subject,ns.edsa.Location,URIRef(tup[0])))
                g += tup[1]
        except gn.NotFoundException as e:
            #TODO: Redirect to an error file
            print("%s in subject %s" % (e,subject))
            print("problematic location %s" % item.formattedlocation.string)


    currentbase += g
    g.serialize(destination=args.outputfile, format='turtle')
    currentbase.serialize(destination=args.current, format='turtle')