def project(self, response):
    """Scrape a project ("thing") page into items and follow-up requests.

    Yields, in this order:

    1. a ``ProjectItem`` carrying the author, title and tags;
    2. a ``fileObjectItem`` named ``README.md`` built from the element with
       id ``description`` (flagged with ``isReadme``);
    3. a ``fileObjectItem`` named ``Instructions.md`` when the page has an
       element with id ``instructions``;
    4. a ``fileObjectItem`` named ``License.md`` holding a markdown link to
       the license, when the license links can be located;
    5. one ``scrapy.http.Request`` per downloadable file and per raw
       gallery image, each handled by ``self.item``.

    NOTE(review): ``projectObject['SID']`` is never assigned in this method.
    It is presumably filled in by an item pipeline while the project item
    is being processed, i.e. before the generator resumes after the first
    ``yield`` -- confirm against the pipeline code.

    :param response: the response for the project page.
    :raises IndexError: if the title or the description element is missing;
        those are treated as mandatory.
    """
    import logging
    import unicodedata

    import html2text

    log = logging.getLogger(__name__)
    selector = response.selector

    def to_ascii(text):
        # NFKD splits accented characters into base letter + combining
        # mark, so dropping the non-ASCII code points keeps the base letter
        # instead of losing the character entirely.
        folded = unicodedata.normalize('NFKD', text)
        return folded.encode('ascii', 'ignore').decode('ascii')

    def text_file(name, content):
        # Small text documents are stored as file items whose "filename"
        # field carries the content itself.
        entry = fileObjectItem()
        entry["name"] = name
        entry["parent"] = projectObject['SID']
        entry["filename"] = content
        return entry

    ## Get the project info proper.
    projectObject = ProjectItem()
    projectObject['author'] = User.objects.get(pk=self.user_id)
    projectObject['title'] = selector.xpath(
        "//*[contains(@class,'thing-header-data')]/h1/text()"
    ).extract()[0].strip()
    projectObject['tags'] = selector.xpath(
        "//*[contains(@class,'thing-info-content thing-detail-tags-container')]/div/a/text()"
    ).extract()
    yield projectObject

    ## Get the special text files (readme, instructions, license).
    h2t = html2text.HTML2Text()

    # Readme: the description block converted from HTML to markdown.
    readme = h2t.handle(
        selector.xpath("//*[@id = 'description']").extract()[0].strip()
    )
    readmeItem = text_file("README.md", to_ascii(readme))
    readmeItem['isReadme'] = True
    yield readmeItem

    # Instructions are optional; a page without them is skipped silently.
    # Only the lookup is guarded so unrelated IndexErrors are not hidden.
    try:
        instructions_html = selector.xpath(
            "//*[@id = 'instructions']"
        ).extract()[0].strip()
    except IndexError:
        instructions_html = None
    if instructions_html is not None:
        yield text_file("Instructions.md", to_ascii(h2t.handle(instructions_html)))

    ## The license markup has a fixed layout on the site, so the link and
    ## its label are picked by position. That is fragile: if the layout
    ## differs, skip the license instead of aborting the whole page, so the
    ## file and image requests below are still issued.
    license_hrefs = selector.xpath("//*[contains(@class,'license-text')]/a/@href")
    license_labels = selector.xpath("//*[contains(@class,'license-text')]/a/text()")
    try:
        licenseurl = license_hrefs[2].extract().strip()
        licensetext = license_labels[1].extract().strip()
    except IndexError:
        log.warning("No license link found on %s; skipping License.md", response.url)
    else:
        yield text_file("License.md", "[" + licensetext + "](" + licenseurl + ")")

    ## Get all the project's file objects.
    for href in selector.xpath("//*[contains(@class,'thing-file')]/a/@href"):
        yield scrapy.http.Request(
            url=urlparse.urljoin(response.url, href.extract()),
            callback=self.item,
            meta={'parent': projectObject['SID']},
        )

    # Grab only raw images: thumbnails with an empty data-thingiview-url.
    imagelist = selector.xpath(
        '//*[contains(@class,\'thing-gallery-thumbs\')]/div[@data-track-action="viewThumb"][@data-thingiview-url=""]/@data-large-url'
    )
    for src in imagelist:
        # dont_filter=True lets the request through Scrapy's duplicate
        # filter even if the same URL was requested before.
        yield scrapy.http.Request(
            dont_filter=True,
            url=urlparse.urljoin(response.url, src.extract()),
            callback=self.item,
            meta={'parent': projectObject['SID']},
        )
def item(self, response):
    """Wrap a downloaded file or image response in a ``fileObjectItem``.

    The item's name is the last path segment of the response URL with any
    ``_display_large`` marker removed, its parent is taken from
    ``response.meta['parent']``, and its ``filename`` field receives the
    raw response body.

    :param response: the response for a single file or image download.
    """
    # NOTE: taking the last URL path segment as the file name is a naive
    # heuristic and may not be the best choice for every URL.
    url_path = urlparse.urlparse(response.url).path
    basename = url_path.rsplit("/", 1)[-1]

    file_item = fileObjectItem()
    file_item['name'] = basename.replace("_display_large", "")
    file_item['parent'] = response.meta['parent']
    file_item['filename'] = response.body
    yield file_item