def run(self):
    """Download the candidate file and score it against the reference file.

    Steps, in order: download ``self._file.url`` into ``DOWNLOAD_DIR``,
    compare its hash with ``self._compare_file.hash``, convert the PDF to
    text with ``parse_pdf``, build shingles from its words and compare them
    with ``self._compare_file.shingles``.

    The outcome is stored in ``self._return``:

    * ``self._file`` when a similarity was obtained; ``similarity`` is then
      either the string ``"HASH_COINSIDENCE"`` or the value returned by
      ``shingles.compare``;
    * ``None`` when the download failed, the file is flagged as not
      appropriate after parsing, or no shingles could be built.

    Progress messages are printed to stdout while holding ``threadLock`` so
    that output from concurrently running threads does not interleave.
    """

    def report(*lines):
        # ``with`` releases the lock even if print() raises; the manual
        # acquire()/release() pairs used before would have left it held.
        # Every message block is terminated by one blank line.
        with threadLock:
            print(*lines, sep="\n", end="\n\n")

    banner = '////////////////////////////////////////////'

    # --- Downloading the file ---
    report("{0}) Downloading \"{1}\"".format(self._id, self._file.url))
    try:
        self._file.path = download.download(
            self._file.url, directory=DOWNLOAD_DIR, callback=None
        )
    except Exception:
        # Previously a bare ``except`` followed by exit(0), which ended the
        # thread without ever assigning ``self._return``.  Record the
        # failure the same way the empty-path case below does.
        report("Download \"{0}\" failed".format(self._file.url))
        self._file.appropriate = False
        self._return = None
        return

    if self._file.path == "":
        # An empty path is treated as a failed download.
        report(
            "Error, during downloading (server is not corresponding)",
            "\"{0}\"".format(self._file.url),
        )
        self._file.appropriate = False
        self._return = None
        return

    report(
        "File \"{0}\"".format(self._file.url),
        "Saved in '{0}'".format(self._file.path),
    )

    # --- Checking the hash sum for coincidence ---
    self._file.makehash()
    if self._file.hash == self._compare_file.hash:
        # Equal hashes: skip parsing and shingling altogether.
        self._file.similarity = "HASH_COINSIDENCE"
        self._return = self._file
        report(
            banner,
            "Similarity with file",
            "'{0}'".format(self._file.url),
            "is",
            "HASH_COINSIDENCE",
            banner,
        )
        return

    # --- Parsing the PDF into TXT ---
    self._file.path_txt = self._file.path + ".txt"
    report("Started parsing \"{0}\"".format(self._file.path))
    # Start the timer after the message so that waiting for the print lock
    # is not counted as parsing time.
    start_time = time()
    parse_pdf(self._file)
    end_time = time()
    report(
        "Parsing \"{0}\" took {1:.3f}".format(
            self._file.path, end_time - start_time
        )
    )

    # NOTE(review): presumably parse_pdf clears ``appropriate`` when the
    # conversion fails - confirm.  The explicit ``== False`` is kept so
    # that other falsy values (e.g. None) behave as before.
    if self._file.appropriate == False:
        self._return = None
        return

    # --- Building shingles ---
    report(
        "Started getting shingles for file \"{0}\"".format(self._file.path_txt)
    )
    start_time = time()
    self._file.text = get_text(self._file)
    self._file.words = get_words(self._file, self._stopwords)
    self._file.shingles = shingles.gen_shingles(self._file.words)
    end_time = time()

    if len(self._file.shingles) == 0:
        report(
            "No shingles were built in file \"{0}\"".format(self._file.path_txt)
        )
        self._return = None
        return

    report(
        "Shingles were built successfully for an file \"{0}\"".format(
            self._file.path_txt
        ),
        "Took about {0:.3f}s".format(end_time - start_time),
    )

    # --- Comparing the files using shingles ---
    report(
        "Started getting similarity based on shingles in file \"{0}\"".format(
            self._file.path_txt
        )
    )
    start_time = time()
    self._file.similarity = shingles.compare(
        self._compare_file.shingles, self._file.shingles
    )
    end_time = time()

    report(
        "Ended comparing shingles for file \"{0}\"".format(self._file.path_txt),
        "Took {0:.3f}s".format(end_time - start_time),
    )
    report(
        banner,
        "Similarity with file",
        "'{0}'".format(self._file.url),
        "is",
        "{0:.3f}".format(self._file.similarity),
        banner,
    )
    self._return = self._file
# Build shingles for the reference document and print its similarity to every
# already-processed entry of `files`.
input_file_path = "./Task/Выделение набора ключевых слов/0470749822.pdf"
input_file = file(path=input_file_path)

# Convert the reference PDF to text with the external pdf_import.py script.
# NOTE(review): the original passed `f.path_pdf` / `f.path_txt` here, although
# `f` is not the document created above and the text is read from
# `input_file.path_txt` below.  The call now uses the reference document's own
# paths - confirm that pdf_import.py takes (pdf_path, txt_path).
try:
    retcode = subprocess.call(
        ["python", "./pdf_import.py", input_file_path, input_file.path_txt]
    )
except OSError:
    # The converter could not be launched; treat it like a failed conversion.
    retcode = None

# The exit is raised outside the `try`: the former bare `except:` caught the
# SystemExit raised by exit(-1) and printed the error message a second time.
if retcode != 0:
    print("Error while parsing {0}".format(input_file_path))
    raise SystemExit(-1)

input_file.processed = 1

# Read the extracted text; `with` closes the file even if read() fails.
with open(input_file.path_txt, "r") as f_tmp:
    input_text = f_tmp.read()

# generateCandidateKeywords yields groups of words; flatten them into a single
# word list, which is what the shingle generator is fed with.
candidate_keywords = generateCandidateKeywords(input_text, stopwords, lemmatizer)
words = []
for sublist in candidate_keywords:
    words.extend(sublist)

input_file.words = words
input_file.shingles = shingles.gen_shingles(words)
print(len(input_file.shingles))

# Report the similarity against every file that was processed successfully;
# the printed index is the 1-based position of the file in `files`.
number = 0  # stays 0 when `files` is empty, as before
for number, f in enumerate(files, start=1):
    if f.processed == 1:
        print("{0}) {1}".format(number, shingles.compare(input_file.shingles, f.shingles)))