def hash(self):
    """Refresh ``self.size`` and ``self.hash`` from ``self.path_pdf``.

    When ``path_pdf`` is the empty string there is no file to inspect, so
    ``hash`` is set to ``""`` and ``size`` to ``0``. Otherwise ``size`` is
    the on-disk size in bytes and ``hash`` is whatever
    ``md5sum_file(self.path_pdf)`` returns.

    Raises:
        OSError: if ``path_pdf`` is non-empty but the file cannot be
            stat'ed (propagated from ``os.path.getsize``).

    NOTE(review): assigning ``self.hash`` stores an instance attribute
    under the same name as this method, which hides the method on that
    instance afterwards -- consider renaming the method.
    """
    if self.path_pdf != '':
        # Only touch the filesystem once we know there is a path; calling
        # os.path.getsize('') raises instead of reaching the empty branch.
        self.size = os.path.getsize(self.path_pdf)
        self.hash = md5sum_file(self.path_pdf)
    else:
        self.size = 0
        self.hash = ""
def __init__(self, url="", path_pdf="", path_txt=None):
    """Initialise a document record.

    Args:
        url: Source URL of the document (stored as given).
        path_pdf: Path to the PDF file; ``""`` means no file yet.
        path_txt: Path of the companion text file. When omitted (``None``)
            it defaults to ``path_pdf + ".txt"``.

    Raises:
        OSError: if ``path_pdf`` is non-empty but the file cannot be
            stat'ed (propagated from ``os.path.getsize``).
    """
    # A default cannot refer to another parameter: defaults are evaluated
    # once, at definition time, where ``path_pdf`` is not in scope. Derive
    # the default here instead.
    if path_txt is None:
        path_txt = path_pdf + ".txt"

    self.url = url
    self.path_pdf = path_pdf
    self.path_txt = path_txt

    if self.path_pdf == '':
        # No file on disk yet: nothing to hash or measure.
        self.hash = ""
        self.size = 0
    else:
        self.hash = md5sum_file(path_pdf)
        self.size = os.path.getsize(self.path_pdf)

    # Processing status:
    #    0 - not yet processed
    #    1 - successfully processed
    #   -1 - cannot be processed
    self.processed = 0
    self.words = []
    self.shingles = []
def makehash(self):
    """Set ``self.hash`` from ``self.path``.

    If ``self.path`` is ``None`` the hash is set to the empty string;
    otherwise it is set to whatever ``md5sum_file(self.path)`` returns.
    """
    # Identity test: ``None`` is a singleton, and ``is not`` cannot be
    # affected by a custom ``__ne__`` on the path object.
    if self.path is not None:
        self.hash = md5sum_file(self.path)
    else:
        self.hash = ""