from bs4 import BeautifulSoup
from dateutil import parser as dtp
from rdflib import Graph, Literal, URIRef
from rdflib.namespace import XSD

# args_process, is_duplicate, ns (namespace definitions) and gn (geonames
# helpers) are project-local and assumed to be imported from elsewhere.


def main(argv):
    # TODO: This is an RSS feed; there should be a programmatic way to
    # impose an ordering and limit duplication.
    args = args_process(argv)
    soup = BeautifulSoup(open(args.inputfile), 'lxml')
    currentbase = Graph()
    currentbase.load(args.current, format='turtle')
    g = Graph()
    c = 0
    for item in soup.find_all('item'):
        c += 1
        subject = URIRef("http://www.edsa-project.eu/jobs/StackOverflow/" + str(c))
        # URL
        url = Literal(item.guid.string)
        if args.verbose:
            print "Processing post " + url
        if is_duplicate(currentbase, url):
            if args.verbose:
                print url + " identified as duplicate, skipping..."
            continue
        g.add((subject, ns.schema.url, url))
        # Title
        g.add((subject, ns.schema.jobTitle, Literal(item.title.string)))
        # Description
        g.add((subject, ns.schema.description, Literal(item.description.string)))
        # PubDate (XSD.Date is not an XSD datatype; xsd:date takes a date-only value)
        date = dtp.parse(item.pubdate.string)
        g.add((subject, ns.schema.datePosted,
               Literal(date.date().isoformat(), datatype=XSD.date)))
        # hiringOrganization
        # TODO: Service to OpenCorporates for entity matching.
        # Low priority; maybe it can be done with SILK later.
        for org in item.find_all('a10:name'):
            g.add((subject, ns.schema.hiringOrganization, Literal(org.string)))
        # skills
        for cat in item.find_all('category'):
            skill = URIRef("http://www.edsa-project.eu/skill/" + cat.string)
            g.add((subject, ns.edsa.requiresSkill, skill))
            g.add((skill, ns.edsa.lexicalValue, Literal(cat.string)))
        # location
        if item.location is not None:
            g.add((subject, ns.schema.jobLocation, Literal(item.location.string)))
            try:
                tup = gn.find_location(item.location.string)
                g.add((subject, ns.edsa.Location, URIRef(tup[0])))
                g += tup[1]
            except gn.NotFoundException as e:
                # TODO: Redirect to an error file.
                print("%s in subject %s" % (e, subject))
                print("problematic location %s" % item.location.string)
    currentbase += g
    g.serialize(destination=args.outputfile, format='turtle')
    currentbase.serialize(destination=args.current, format='turtle')
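# A minimal sketch of the is_duplicate helper used above; the project's
# actual implementation is not shown here, so this is an assumption guided
# by the tests below: a post counts as a duplicate when its URL already
# occurs anywhere in the graph, whether the caller passes a URIRef, a
# Literal, or a raw string.
def is_duplicate(graph, url):
    value = unicode(url)  # normalise URIRef/Literal/str to plain text
    for s, p, o in graph:
        if unicode(s) == value or unicode(o) == value:
            return True
    return False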
def test_is_duplicate_literal(self):
    self.assertTrue(is_duplicate(g, URIRef(URLTRUE)))
    self.assertFalse(is_duplicate(g, URIRef(URLFALSE)))
def test_is_duplicate_raw(self):
    self.assertTrue(is_duplicate(g, URLTRUE))
    self.assertFalse(is_duplicate(g, URLFALSE))
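# A hypothetical module-level fixture the two tests above assume: URLTRUE
# occurs in the graph g (both as subject IRI and as a schema:url literal,
# mirroring the two scrapers), while URLFALSE does not. The concrete URL
# values are placeholders, not taken from the project.
from rdflib import Graph, Literal, Namespace, URIRef

schema = Namespace("http://schema.org/")
URLTRUE = "http://example.org/jobs/1"
URLFALSE = "http://example.org/jobs/2"

g = Graph()
g.add((URIRef(URLTRUE), schema.url, Literal(URLTRUE)))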
def main(argv):
    args = args_process(argv)
    soup = BeautifulSoup(open(args.inputfile), 'lxml')
    currentbase = Graph()
    currentbase.load(args.current, format='turtle')
    g = Graph()
    for item in soup.find_all('result'):
        url = shrink_url(item.url.string.strip())
        subject = URIRef(url)
        # TODO: Check that with the URL as subject, deduplication is not needed.
        if args.verbose:
            print "Processing post " + url
        if is_duplicate(currentbase, subject) or is_duplicate(g, subject):
            if args.verbose:
                print url + " identified as duplicate, skipping..."
            continue
        # URL
        g.add((subject, ns.schema.url, URIRef(url)))
        # Source
        g.add((subject, ns.schema.source, Literal("Indeed")))
        # Title
        g.add((subject, ns.schema.jobTitle, Literal(item.jobtitle.string.strip())))
        # Description
        g.add((subject, ns.schema.description, Literal(get_description(url))))
        # PubDate (XSD.Date is not an XSD datatype; xsd:date takes a date-only value)
        date = dtp.parse(item.date.string)
        g.add((subject, ns.schema.datePosted,
               Literal(date.date().isoformat(), datatype=XSD.date)))
        # hiringOrganization
        try:
            g.add((subject, ns.schema.hiringOrganization,
                   Literal(item.company.string.strip())))
        except AttributeError:
            if args.verbose:
                print("%s has no company in the source data" % subject)
        # location
        location = item.formattedlocation.string.strip()
        g.add((subject, ns.schema.jobLocation, Literal(location)))
        try:
            # TODO: changing point when separating the geonames subset from
            # the current base.
            if gn.is_inside(location, currentbase):
                if args.verbose:
                    print("%s already linked to geonames, reusing..." % location)
                lociri = gn.get_iri(location, currentbase)
                g.add((subject, ns.edsa.Location, lociri))
            elif gn.is_inside(location, g):
                if args.verbose:
                    print("%s already linked to geonames, reusing..." % location)
                lociri = gn.get_iri(location, g)
                g.add((subject, ns.edsa.Location, lociri))
            else:
                tup = gn.find_location(location)
                g.add((subject, ns.edsa.Location, URIRef(tup[0])))
                g += tup[1]
        except gn.NotFoundException as e:
            # TODO: Redirect to an error file.
            print("%s in subject %s" % (e, subject))
            print("problematic location %s" % item.formattedlocation.string)
    currentbase += g
    g.serialize(destination=args.outputfile, format='turtle')
    currentbase.serialize(destination=args.current, format='turtle')
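# A minimal sketch of what shrink_url is assumed to do (the real helper
# lives elsewhere in the project): normalise an Indeed posting URL by
# dropping its query string and fragment, so the same posting always maps
# to the same subject IRI.
from urlparse import urlsplit, urlunsplit  # Python 2 stdlib

def shrink_url(url):
    parts = urlsplit(url)
    return urlunsplit((parts.scheme, parts.netloc, parts.path, '', ''))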