from bs4 import BeautifulSoup
from dateutil import parser as dtp
from rdflib import Graph, Literal, URIRef
from rdflib.namespace import XSD

# args_process, is_duplicate, ns (namespace definitions) and gn (geonames
# helpers) are project-local and assumed to be imported from elsewhere.


def main(argv):
    # TODO: This is an RSS feed; there should be a programmatic way to
    # impose an ordering and limit duplication.
    args = args_process(argv)
    soup = BeautifulSoup(open(args.inputfile), 'lxml')
    currentbase = Graph()
    currentbase.load(args.current, format='turtle')
    g = Graph()
    c = 0
    for item in soup.find_all('item'):
        c += 1
        subject = URIRef("http://www.edsa-project.eu/jobs/StackOverflow/" + str(c))
        # URL
        url = Literal(item.guid.string)
        if args.verbose:
            print "Processing post " + url
        if is_duplicate(currentbase, url):
            if args.verbose:
                print url + " identified as duplicate, skipping..."
            continue
        g.add((subject, ns.schema.url, url))
        # Title
        g.add((subject, ns.schema.jobTitle, Literal(item.title.string)))
        # Description
        g.add((subject, ns.schema.description, Literal(item.description.string)))
        # PubDate (XSD.Date is not an XSD datatype; xsd:date takes a date-only value)
        date = dtp.parse(item.pubdate.string)
        g.add((subject, ns.schema.datePosted,
               Literal(date.date().isoformat(), datatype=XSD.date)))
        # hiringOrganization
        # TODO: Service to OpenCorporates for entity matching.
        # Low priority; maybe it can be done with SILK later.
        for org in item.find_all('a10:name'):
            g.add((subject, ns.schema.hiringOrganization, Literal(org.string)))
        # skills
        for cat in item.find_all('category'):
            skill = URIRef("http://www.edsa-project.eu/skill/" + cat.string)
            g.add((subject, ns.edsa.requiresSkill, skill))
            g.add((skill, ns.edsa.lexicalValue, Literal(cat.string)))
        # location
        if item.location is not None:
            g.add((subject, ns.schema.jobLocation, Literal(item.location.string)))
            try:
                tup = gn.find_location(item.location.string)
                g.add((subject, ns.edsa.Location, URIRef(tup[0])))
                g += tup[1]
            except gn.NotFoundException as e:
                # TODO: Redirect to an error file.
                print("%s in subject %s" % (e, subject))
                print("problematic location %s" % item.location.string)
    currentbase += g
    g.serialize(destination=args.outputfile, format='turtle')
    currentbase.serialize(destination=args.current, format='turtle')
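# A minimal sketch of the is_duplicate helper used above; the project's
# actual implementation is not shown here, so this is an assumption guided
# by the tests below: a post counts as a duplicate when its URL already
# occurs anywhere in the graph, whether the caller passes a URIRef, a
# Literal, or a raw string.
def is_duplicate(graph, url):
    value = unicode(url)  # normalise URIRef/Literal/str to plain text
    for s, p, o in graph:
        if unicode(s) == value or unicode(o) == value:
            return True
    return False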
def test_is_duplicate_literal(self):
    self.assertTrue(is_duplicate(g, URIRef(URLTRUE)))
    self.assertFalse(is_duplicate(g, URIRef(URLFALSE)))
def test_is_duplicate_raw(self):
    self.assertTrue(is_duplicate(g, URLTRUE))
    self.assertFalse(is_duplicate(g, URLFALSE))
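# A hypothetical module-level fixture the two tests above assume: URLTRUE
# occurs in the graph g (both as subject IRI and as a schema:url literal,
# mirroring the two scrapers), while URLFALSE does not. The concrete URL
# values are placeholders, not taken from the project.
from rdflib import Graph, Literal, Namespace, URIRef

schema = Namespace("http://schema.org/")
URLTRUE = "http://example.org/jobs/1"
URLFALSE = "http://example.org/jobs/2"

g = Graph()
g.add((URIRef(URLTRUE), schema.url, Literal(URLTRUE)))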
def main(argv):
    args = args_process(argv)
    soup = BeautifulSoup(open(args.inputfile), 'lxml')
    currentbase = Graph()
    currentbase.load(args.current, format='turtle')
    g = Graph()
    for item in soup.find_all('result'):
        url = shrink_url(item.url.string.strip())
        subject = URIRef(url)
        # TODO: Check that with the URL as subject, deduplication is not needed.
        if args.verbose:
            print "Processing post " + url
        if is_duplicate(currentbase, subject) or is_duplicate(g, subject):
            if args.verbose:
                print url + " identified as duplicate, skipping..."
            continue
        # URL
        g.add((subject, ns.schema.url, URIRef(url)))
        # Source
        g.add((subject, ns.schema.source, Literal("Indeed")))
        # Title
        g.add((subject, ns.schema.jobTitle, Literal(item.jobtitle.string.strip())))
        # Description
        g.add((subject, ns.schema.description, Literal(get_description(url))))
        # PubDate (XSD.Date is not an XSD datatype; xsd:date takes a date-only value)
        date = dtp.parse(item.date.string)
        g.add((subject, ns.schema.datePosted,
               Literal(date.date().isoformat(), datatype=XSD.date)))
        # hiringOrganization
        try:
            g.add((subject, ns.schema.hiringOrganization,
                   Literal(item.company.string.strip())))
        except AttributeError:
            if args.verbose:
                print("%s has no company in the source data" % subject)
        # location
        location = item.formattedlocation.string.strip()
        g.add((subject, ns.schema.jobLocation, Literal(location)))
        try:
            # TODO: changing point when separating the geonames subset from
            # the current base.
            if gn.is_inside(location, currentbase):
                if args.verbose:
                    print("%s already linked to geonames, reusing..." % location)
                lociri = gn.get_iri(location, currentbase)
                g.add((subject, ns.edsa.Location, lociri))
            elif gn.is_inside(location, g):
                if args.verbose:
                    print("%s already linked to geonames, reusing..." % location)
                lociri = gn.get_iri(location, g)
                g.add((subject, ns.edsa.Location, lociri))
            else:
                tup = gn.find_location(location)
                g.add((subject, ns.edsa.Location, URIRef(tup[0])))
                g += tup[1]
        except gn.NotFoundException as e:
            # TODO: Redirect to an error file.
            print("%s in subject %s" % (e, subject))
            print("problematic location %s" % item.formattedlocation.string)
    currentbase += g
    g.serialize(destination=args.outputfile, format='turtle')
    currentbase.serialize(destination=args.current, format='turtle')
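# A minimal sketch of what shrink_url is assumed to do (the real helper
# lives elsewhere in the project): normalise an Indeed posting URL by
# dropping its query string and fragment, so the same posting always maps
# to the same subject IRI.
from urlparse import urlsplit, urlunsplit  # Python 2 stdlib

def shrink_url(url):
    parts = urlsplit(url)
    return urlunsplit((parts.scheme, parts.netloc, parts.path, '', ''))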