Ejemplo n.º 1
0
    def run(self):
        """
        Download one candidate file and score it against the reference file.

        Steps, in order:
          1. Download ``self._file.url`` into ``DOWNLOAD_DIR``.
          2. Hash the download; when the hash equals ``self._compare_file.hash``
             the similarity is set to ``"HASH_COINSIDENCE"`` and the remaining
             steps are skipped.
          3. Convert the PDF to text with ``parse_pdf``.
          4. Extract text, words and shingles from the converted file.
          5. Compare the shingles with ``self._compare_file.shingles``.

        The outcome is stored in ``self._return``: ``self._file`` on success,
        ``None`` when the download returned an empty path, when
        ``self._file.appropriate`` is false after parsing, or when no shingles
        were built.

        Raises:
            SystemExit: if the download call raises. The failure is recorded
                on the instance first (``_return = None``,
                ``_file.appropriate = False``).
                NOTE(review): the class header is outside this view; if this
                overrides ``threading.Thread.run``, SystemExit only ends this
                worker thread -- confirm.
        """
        def report(*lines):
            # Print one multi-line message atomically. ``with`` releases the
            # lock even if ``print`` raises, so a failed write can no longer
            # leave the lock held and block every other user of it.
            with threadLock:
                print("\n".join(lines), end = "\n\n")

        banner = '////////////////////////////////////////////'

        # --- Downloading the file ---
        report("{0}) Downloading \"{1}\"".format(self._id, self._file.url))
        try:
            # Only the download itself is guarded, so unrelated errors are
            # not misreported as download failures.
            self._file.path = download.download(self._file.url, directory = DOWNLOAD_DIR, callback = None)
        except Exception:
            report("Download \"{0}\" failed".format(self._file.url))
            # Record the failure before leaving, mirroring the empty-path
            # branch below, so the instance is never left without a result.
            self._file.appropriate = False
            self._return = None
            # Same control flow as the former ``exit(0)``, without relying on
            # the ``site``-provided ``exit`` helper.
            raise SystemExit(0)

        if self._file.path == "":
            report(
                "Error, during downloading (server is not corresponding)",
                "\"{0}\"".format(self._file.url),
            )
            self._file.appropriate = False
            self._return = None
            return

        report(
            "File \"{0}\"".format(self._file.url),
            "Saved in \'{0}\'".format(self._file.path),
        )

        # --- Checking the hash sum for coincidence ---
        self._file.makehash()
        if self._file.hash == self._compare_file.hash:
            # The literal below is a runtime value and is kept byte-for-byte.
            self._file.similarity = "HASH_COINSIDENCE"
            self._return = self._file
            report(
                banner,
                "Similarity with file",
                "\'{0}\'".format(self._file.url),
                "is",
                "HASH_COINSIDENCE",
                banner,
            )
            return

        # --- Parsing the PDF into TXT ---
        self._file.path_txt = self._file.path + ".txt"
        report("Started parsing \"{0}\"".format(self._file.path))
        # Start the timer outside the lock so the figure excludes time spent
        # waiting for or holding the print lock.
        start_time = time()
        parse_pdf(self._file)
        end_time = time()
        report("Parsing \"{0}\" took {1:.3f}".format(self._file.path, end_time - start_time))
        # Presumably parse_pdf clears ``appropriate`` on failure -- TODO confirm.
        # ``== False`` is kept on purpose: a value of None must not count as
        # a failure here, exactly as before.
        if self._file.appropriate == False:  # noqa: E712
            self._return = None
            return

        # --- Building shingles ---
        report("Started getting shingles for file \"{0}\"".format(self._file.path_txt))
        start_time = time()
        self._file.text = get_text(self._file)
        self._file.words = get_words(self._file, self._stopwords)
        self._file.shingles = shingles.gen_shingles(self._file.words)
        end_time = time()
        if len(self._file.shingles) == 0:
            report("No shingles were built in file \"{0}\"".format(self._file.path_txt))
            self._return = None
            return

        report(
            "Shingles were built successfully for an file \"{0}\"".format(self._file.path_txt),
            "Took about {0:.3f}s".format(end_time - start_time),
        )

        # --- Comparing files using shingles ---
        report("Started getting similarity based on shingles in file \"{0}\"".format(self._file.path_txt))
        start_time = time()
        self._file.similarity = shingles.compare(self._compare_file.shingles, self._file.shingles)
        end_time = time()
        report(
            "Ended comparing shingles for file \"{0}\"".format(self._file.path_txt),
            "Took {0:.3f}s".format(end_time - start_time),
        )

        report(
            banner,
            "Similarity with file",
            "\'{0}\'".format(self._file.url),
            "is",
            "{0:.3f}".format(self._file.similarity),
            banner,
        )

        self._return = self._file
Ejemplo n.º 2
0
# Build the reference document, convert it to text, derive its shingles, and
# print its similarity to every already-processed entry in ``files``.
input_file_path = "./Task/Выделение набора ключевых слов/0470749822.pdf"
input_file = file(path = input_file_path)

# Convert the PDF to text in a child process. The paths come from
# ``input_file``: the previous code read them from ``f``, which is not bound
# until the loop at the bottom of this script.
try:
    retcode = subprocess.call(["python", "./pdf_import.py", input_file.path_pdf, input_file.path_txt])
except (OSError, subprocess.SubprocessError):
    # The converter could not be launched at all; treat it like a failed run.
    retcode = None
# The exit-code check sits outside the ``try`` so the SystemExit raised here
# is not caught and reported a second time.
if retcode != 0:
    print("Error while parsing {0}".format(input_file.path_pdf))
    raise SystemExit(-1)
input_file.processed = 1

# ``with`` closes the file even if the read fails.
with open(input_file.path_txt, "r") as f_tmp:
    input_text = f_tmp.read()

# Flatten the per-candidate word lists into a single list of words.
candidate_keywords = generateCandidateKeywords(input_text, stopwords, lemmatizer)
words = []
for sublist in candidate_keywords:
    words.extend(sublist)
input_file.words = words
input_file.shingles = shingles.gen_shingles(words)
print(len(input_file.shingles))

# ``number`` is the 1-based position in ``files``; unprocessed entries are
# skipped but still counted, so the printed index matches the list order.
for number, f in enumerate(files, start = 1):
    if f.processed == 1:
        print("{0}) {1}".format(number, shingles.compare(input_file.shingles, f.shingles)))