def calculate_result(self, url, userPK, **kwargs):
    """Import the project page at ``url`` as a new draft ``Project``.

    The page is fetched and parsed, a draft project is created from the
    scraped title and tags, and README / Instructions / License text files
    are attached to it.  Every image and downloadable file found on the page
    is then queued as a ``ThingiFileTask``; once all of them finish,
    ``ResaveProjectTask`` runs for the project.

    Args:
        url: Address of the page to import (presumably a Thingiverse "thing"
            page, since relative links are resolved against thingiverse.com).
        userPK: Primary key of the user that becomes the project's author.
        **kwargs: Accepted for call compatibility; not used here.

    Returns:
        The title of the created project.  A timestamp is appended to it when
        a project with the scraped title already exists.

    Raises:
        IndexError: If the page has no title heading or no element with
            ``id="description"``.  Missing instructions or license
            information is tolerated and the matching file is simply skipped.
    """
    # Function-scope imports are kept (the module's import block is not
    # visible from here) but are hoisted so a missing dependency fails
    # before anything is written to the database.
    import datetime
    import unicodedata

    import html2text

    def to_ascii(text):
        """Return ``text`` with every non-ASCII character dropped."""
        return text.encode('ascii', 'ignore').decode('ascii')

    def store_text_file(parent, text, name, is_readme=False):
        """Create, fill and save a text ``fileobject`` attached to ``parent``."""
        item = fileobject()
        item.parent = parent
        if is_readme:
            item.isReadme = True
        item.fromText(text, name)
        item.save()
        return item

    print("importing project : {}".format(url))
    response = get_response(url)
    dom = html.fromstring(response.read())

    # --- project metadata ---------------------------------------------------
    project = Project()
    project.author_id = userPK
    project.title = dom.xpath(
        "//*[contains(@class,'thing-header-data')]/h1/text()")[0].strip()
    tags = dom.xpath(
        "//*[contains(@class,'thing-info-content thing-detail-tags-container')]"
        "/div/a/text()")
    project.draft = True
    # exists() asks the database for a yes/no answer instead of loading every
    # matching row just to test truthiness.
    if Project.objects.filter(title=project.title).exists():
        project.title += " -- " + str(datetime.datetime.today())
    project.save()
    for tag in tags:
        project.tags.add(tag)

    # --- special text files (readme, instructions, license) -----------------
    h2t = html2text.HTML2Text()

    # The serializer emits ASCII bytes by default (non-ASCII characters become
    # character references), so decoding as ASCII is explicit about what the
    # old implicit str/unicode coercion was doing.
    readme_html = etree.tostring(
        dom.xpath("//*[@id = 'description']")[0]).decode('ascii')
    readme = h2t.handle(readme_html)
    readmename = "README.md"
    # NFKD splits accented characters into base letter + combining mark, so
    # the ASCII filter keeps the base letter instead of dropping it entirely.
    readmefile = to_ascii(unicodedata.normalize('NFKD', readme))
    print(readmename)
    print(readmefile)
    readmeItem = store_text_file(project, readmefile, readmename,
                                 is_readme=True)
    project.bodyFile = readmeItem
    project.save()
    print("bodyFile:")
    print(project.bodyFile)

    # Instructions are optional.  Only the lookup is guarded, so a genuine
    # failure while saving the file is no longer silently swallowed.
    instruction_nodes = dom.xpath("//*[@id = 'instructions']")
    if instruction_nodes:
        instructions_html = etree.tostring(
            instruction_nodes[0]).decode('ascii')
        instructions = to_ascii(h2t.handle(instructions_html))
        store_text_file(project, instructions, "Instructions.md")

    # The license markup is assumed to always have the same layout; the fixed
    # indexes below are fragile.  If the layout differs, skip the license file
    # rather than abort an import whose project and README are already saved.
    try:
        licenseurl = dom.xpath(
            "//*[contains(@class,'license-text')]/a/@href")[2].strip()
        licensetext = dom.xpath(
            "//*[contains(@class,'license-text')]/a/text()")[1].strip()
    except IndexError:
        print("no license found for project : {}".format(url))
    else:
        store_text_file(project,
                        "[" + licensetext + "](" + licenseurl + ")",
                        "License.md")

    # --- images and downloadable files --------------------------------------
    filelist = dom.xpath("//*[contains(@class,'thing-file')]/a/@href")
    # Only thumbnails with an empty data-thingiview-url are taken, i.e. raw
    # images rather than rendered model previews.
    imagelist = dom.xpath(
        "//*[contains(@class,'thing-gallery-thumbs')]"
        '/div[@data-track-action="viewThumb"][@data-thingiview-url=""]'
        "/@data-large-url")
    fileurls = [urlparse.urljoin('http://www.thingiverse.com/', fl)
                for fl in imagelist + filelist]
    print("fileurls:")
    print(fileurls)

    # One immutable signature per file; the chord body re-saves the project
    # after every file task has completed.
    bundle_o_tasks = [ThingiFileTask().si(url=fileurl, projectPK=project.pk)
                      for fileurl in fileurls]
    filetask = chord(bundle_o_tasks)
    filetask(ResaveProjectTask().si(projectPK=project.pk))
    return project.title