Beispiel #1
0
 def get(self):
  """Download this complaint's decision document and derive its text and HTML.

  Does nothing unless self.complaint has a non-empty "id". Otherwise it:

  * sets self.complaint["year"] from the first two characters of the id;
  * downloads the decision document (and, for "doc" documents, the
    display page, which is passed to self.__getPage);
  * resets "docdate", "docsize", "docwords", "complainants", "companies"
    and "meetingdate" to None, then fills "docdate" and "docsize" from
    download.getFileDetails;
  * if the document exists on disk, stores its plain text in
    self.complaint["doc"] and the contents of its HTML <body> in
    self.complaint["html"], converting with the external tools when the
    cached files are missing or stale.

  Converter failures are reported on stdout and leave the corresponding
  key unset; they do not raise. Returns None.
  """
  if "id" not in self.complaint or not self.complaint["id"]:
   return

  complaintId = self.complaint["id"]
  year = int("20" + complaintId[0:2])
  self.complaint["year"] = year
  docType = self.docType(year)
  docfilename = download.getFilename(year, "docs", complaintId, "." + docType)
  textfilename = download.getFilename(year, "text", complaintId, ".txt")
  htmlfilename = download.getFilename(year, "html", complaintId, ".html")

  # Documents were previously fetched from
  # http://203.152.114.11/decisions/<yy>/<id>.<docType>.
  download.getFile("http://old.asa.co.nz/decision_file.php?ascbnumber=" + complaintId, docfilename, self.refresh, returnfile = False)
  if docType == "doc":
   pagefilename = download.getFilename(year, 'pages', complaintId)
   self.__getPage(download.getFile("http://old.asa.co.nz/display.php?ascb_number=" + complaintId, pagefilename, self.refresh))

  for field in ["docdate", "docsize", "docwords", "complainants", "companies", "meetingdate"]:
   self.complaint[field] = None
  docdate, docsize = download.getFileDetails(docfilename)
  self.complaint["docdate"] = docdate
  self.complaint["docsize"] = docsize

  if not os.path.exists(docfilename):
   return
  docpath = os.path.join(os.getcwd(), docfilename)

  # --- Plain text ---
  # The cached text is reused when its mtime equals docdate, or always in quick mode.
  if os.path.exists(textfilename) and (docdate == os.path.getmtime(textfilename) or self.quick):
   self.complaint["doc"] = download.loadResource(textfilename)
  else:
   if docType == "doc":
    command = [antiword, "-w", "0", "-m", "8859-1.txt"]
   else:
    command = [unrtf, "--text"]
   command.append(docpath)
   try:
    # latin-1 maps bytes to the same characters as 'unicode_escape' but
    # does not interpret backslashes in the document as escape sequences.
    text = subprocess.check_output(command).decode("latin-1")
   except Exception as e:
    print("Failed to convert doc " + docpath + ": " + str(e))
   else:
    if docType == "rtf":
     # Keep only what follows the first dashed separator; if the
     # separator is missing, keep the whole output instead of raising.
     parts = text.split("-----------------", 1)
     if len(parts) == 2:
      text = parts[1]
    self.complaint["doc"] = text
    # presumably the tuple is (atime, mtime), so the freshness check above
    # matches on the next run -- confirm against download.saveResourcePost
    download.saveResourcePost(textfilename, text, (time.time(), docdate))

  # --- HTML ---
  # NOTE(review): nothing in this method stamps the HTML file's mtime with
  # docdate, so this check only passes in quick mode unless another code
  # path sets it -- confirm.
  htmlIsFresh = os.path.exists(htmlfilename) and (docdate == os.path.getmtime(htmlfilename) or self.quick)
  if not htmlIsFresh:
   command2 = [unoconv, "-f", "html", "-o", os.path.join(os.getcwd(), htmlfilename), docpath]
   print("Saving: " + htmlfilename)
   try:
    subprocess.check_output(command2)
   except (OSError, subprocess.CalledProcessError) as e:
    # Same best-effort policy as the text conversion: report and leave "html" unset.
    print("Failed to convert doc " + docpath + " to html: " + str(e))
    return

  html = download.loadResource(htmlfilename)
  body = re.search('<body(?:.*?)>(.*?)</body>', html, flags=re.DOTALL)
  # Fall back to the whole file when no <body> element is found.
  self.complaint["html"] = body.group(1) if body else html
Beispiel #2
0
 def getList(self, folder, filename):
  """Return the lines of the resource stored at folder/filename.

  The resource is read with download.loadResource, decoded with the
  'unicode_escape' codec and split on line boundaries.
  """
  resourcePath = os.path.join(folder, filename)
  rawContent = download.loadResource(resourcePath)
  decodedContent = rawContent.decode('unicode_escape')
  return decodedContent.splitlines()