Example #1
0
	def parse(self, response):
		"""Seed the crawl: yield one request per entity in readytoCrawl1.json.

		Loads the relation-id -> Chinese-name mapping and the set of
		already-crawled (entity1 + relatedEntityId) keys, then emits a
		scrapy.Request per entity with the entity name, link, and the
		dedup set attached via request meta for parseEntity.
		"""
		# Relation id -> Chinese label (read for completeness; parseEntity
		# re-derives labels from the page, so a missing file fails loudly here).
		relationName = dict()
		filePath = os.path.abspath(os.path.join(os.getcwd(), ".."))

		# Keys of pairs already crawled, so duplicates can be skipped.
		# A set gives O(1) membership tests; the original list made every
		# downstream check O(n). `in` semantics are unchanged for callers.
		alreadyGet = set()
		resultPath = os.path.join(filePath, "entity1_entity2.json")
		if os.path.exists(resultPath):
			# encoding added: the file holds UTF-8 JSON with Chinese text;
			# relying on the platform default encoding was a latent bug.
			with open(resultPath, 'r', encoding='UTF-8') as fr:
				for line in fr:
					entityIds = json.loads(line)
					alreadyGet.add(entityIds['entity1'] + entityIds['relatedEntityId'])

		with open(os.path.join(filePath, "wikidataRelation", "relationResult.json"), "r", encoding='UTF-8') as fr:
			# Iterate the file lazily instead of materializing readlines().
			for line in fr:
				relationJson = json.loads(line)
				relationName[relationJson['rmention']] = relationJson['chrmention']

		count = 0
		with open(os.path.join(filePath, "wikidataRelation", "readytoCrawl1.json"), "r", encoding='UTF-8') as fr:
			for line in fr:
				count += 1
				print(1.0 * count / 33355)  # crude progress fraction (hard-coded total)
				entityJson = json.loads(line)
				link = "https:" + entityJson['entity']['url']
				entityName = entityJson['entityOriginName']
				entity = scrapy.Request(link, callback=self.parseEntity)
				entity.meta['entityName'] = entityName
				entity.meta['link'] = link
				entity.meta['alreadyGet'] = alreadyGet
				yield entity
Example #2
0
    def parse(self, response):
        """Seed the crawl: yield one request per entity in readytoCrawl.json.

        Also loads the relation-id -> Chinese-name mapping. The mapping is
        not used in this method (parseEntity re-derives labels from the
        page), but the read is kept so a missing relationResult.json still
        fails loudly at startup.
        """
        # Relation id -> Chinese label. encoding added: both files contain
        # UTF-8 Chinese text, so the platform default encoding is unsafe.
        relationName = dict()
        with open(
                "/home/kuangjun/WikidataSpider/wikidataRelation/relationResult.json",
                "r",
                encoding="utf-8") as fr:
            # Iterate lazily instead of materializing readlines().
            for line in fr:
                relationJson = json.loads(line)
                relationName[relationJson['rmention']] = relationJson['chrmention']

        count = 0
        with open(
                "/home/kuangjun/WikidataSpider/wikidataRelation/readytoCrawl.json",
                "r",
                encoding="utf-8") as fr:
            for line in fr:
                count += 1
                print(1.0 * count / 33355)  # crude progress fraction (hard-coded total)
                entityJson = json.loads(line)
                link = "https:" + entityJson['entity']['url']
                entityName = entityJson['entityOriginName']
                entity = scrapy.Request(link, callback=self.parseEntity)
                entity.meta['entityName'] = entityName
                entity.meta['link'] = link
                yield entity
    def parse(self, response):
        """Seed the crawl: yield one request per entity in readytoCrawl.json.

        Resolves data paths relative to the parent of the working
        directory. The relation-id -> Chinese-name mapping is loaded but
        unused here (parseEntity re-derives labels from the page); the
        read is kept so a missing relationResult.json fails loudly.
        """
        # Relation id -> Chinese label.
        relationName = dict()
        filePath = os.path.abspath(os.path.join(os.getcwd(), ".."))
        with open(os.path.join(filePath, "wikidataRelation", "relationResult.json"),
                  "r",
                  encoding="utf-8") as fr:
            # Iterate lazily instead of materializing readlines().
            for line in fr:
                relationJson = json.loads(line)
                relationName[relationJson['rmention']] = relationJson['chrmention']

        count = 0
        with open(os.path.join(filePath, "wikidataRelation", "readytoCrawl.json"),
                  "r",
                  encoding="utf-8") as fr:
            for line in fr:
                count += 1
                print(1.0 * count / 33355)  # crude progress fraction (hard-coded total)
                entityJson = json.loads(line)
                link = "https:" + entityJson['entity']['url']
                entityName = entityJson['entityOriginName']
                entity = scrapy.Request(link, callback=self.parseEntity)
                entity.meta['entityName'] = entityName
                entity.meta['link'] = link
                yield entity
Example #4
0
    def parseEntity(self, response):
        print("=======================")

        entity1 = response.meta['entityName']
        entityRelation = WikidatarelationItem()
        headers = {
            "user-agent":
            "Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.108 Safari/537.36",
            "accept-language": "zh-CN,zh;q=0.9,en;q=0.8",
            "keep_alive": "False"
        }
        for section in response.xpath(
                '//h2[contains(@class,"wb-section-heading")]//span/text()'):
            title = section.extract()
            flag = 0
            if (title == "Statements"):
                flag = 1
                for statement in response.xpath(
                        './/div[@class="wikibase-statementgroupview"]'):
                    relationItem = statement.xpath(
                        './/div[@class="wikibase-statementlistview"]')
                    relationName = statement.xpath(
                        './/div[contains(@class,"wikibase-statementgroupview-property-label")]//a[contains(@title,"P")]/text()'
                    ).extract()
                    if (len(relationName) > 0):
                        relationName = relationName[0]
                    else:
                        continue
                    for relatedEntity in relationItem.xpath(
                            './/div[contains(@class,"wikibase-statementview-mainsnak")]//div[contains(@class,"wikibase-statementview-mainsnak")]\
						//div[contains(@class,"wikibase-snakview-value-container")]//div[contains(@class,"wikibase-snakview-body")]\
						//div[contains(@class,"wikibase-snakview-value")]//a[contains(@title,"Q")]'
                    ):
                        entityId = relatedEntity.xpath('./@title').extract()
                        if (len(entityId) == 0):
                            continue
                        else:
                            relatedEntityId = entityId[0]
                            httpRequest = requests.session()
                            httpRequest.mount('https://',
                                              HTTPAdapter(max_retries=30))
                            httpRequest.mount('http://',
                                              HTTPAdapter(max_retries=30))
                            url = "https://www.wikidata.org/w/api.php?action=wbgetentities&ids=" + relatedEntityId + "&format=json"
                            relatedEntityJson = httpRequest.get(
                                url, headers=headers).json()
                            httpRequest.close()
                            entity2 = str()
                            if 'zh' in relatedEntityJson['entities'][
                                    relatedEntityId]['labels']:
                                entity2 = relatedEntityJson['entities'][
                                    relatedEntityId]['labels']['zh']['value']
                            elif 'en' in relatedEntityJson['entities'][
                                    relatedEntityId]['labels']:
                                entity2 = relatedEntityJson['entities'][
                                    relatedEntityId]['labels']['en']['value']
                            else:
                                continue
                            entityRelation['entity1'] = entity1
                            entityRelation['relation'] = relationName
                            entityRelation['entity2'] = entity2
                            yield entityRelation

            if (flag):
                break
        print()
        print("========================")