Beispiel #1
0
    print("Started getting text from \"{0}\"".format(input_file.path_txt))
    start_time = time()
    input_file.text = file_process.get_text(input_file)
    end_time = time()
    print("Getting text from \"{0}\" took {1:.3f}s".format(input_file.path_txt, end_time - start_time), end = "\n\n")

    """
    Started getting keywords from input_file
    """
    stopwords = set()
    if (stopwords_file.path_txt == None):
        pass
    else:
        print("Started parsing TXT with stopwords, wait for a while...")
        start_time = time()
        stopwords = split.get_list(stopwords_file.path_txt, enableComments = True)
        stopwords = set(stopwords)
        end_time = time()
        print("Parsing TXT with stopwords took {0:.3f}s".format(end_time - start_time), end = "\n\n")

    """
    Started getting keyword phrases from input_file
    """
    print("Started getting keyword phrases")
    start_time = time()
    keywords = keywords.getKeyPhrases(input_file.text, stopwords, lemmatizer = lemmatize)
    end_time = time()
    input_file.keywords = keywords
    if len(keywords) == 0:
        print("No keywords were found for an input file \"{0}\"".format(input_file.path_txt))
        exit(0)
		retcode = subprocess.call(["python", "./pdf_import.py", pdf, pdf + ".txt"])
		if (retcode != 0):
			print("Error while parsing PDF file!")
			exit(1)

		end_time = time.time()
		print("Parsing PDF took {0:.3f}".format(end_time - start_time), "seconds")

	except OSError:
		print("Error while trying to parse pdf file!")
		exit(1)

# Stage 1: load the input TXT via split.get_list and report the elapsed
# wall-clock time. The result is kept in the module-level name `text`.
print("\nStarted parsing TXT, wait for a while...")
start_time = time.time()
text = split.get_list(input_file, enableComments=False)
end_time = time.time()
print("Parsing TXT took {0:.3f}".format(end_time - start_time), "seconds")

# Stage 2: load the words that should be removed (stopwords).
# An empty-string stopwords_file is treated as "nothing to load" and
# leaves `stopwords` as an empty set; any other value is handed to
# split.get_list and the resulting list is converted to a set.
if stopwords_file != '':
    start_time = time.time()
    stopwords = split.get_list(stopwords_file, enableComments=True)
    end_time = time.time()
    # The timing is reported before the list -> set conversion, so the
    # conversion itself is not part of the measured interval.
    print("\nParsing TXT with stopwords took {0:.3f}".format(end_time - start_time), "seconds")
    stopwords = set(stopwords)
else:
    stopwords = set()
# Stage 1: load the input TXT via split.get_text and report the elapsed
# wall-clock time. The result is kept in the module-level name `text`.
print("\nStarted parsing TXT, wait for a while...")
start_time = time()
text = split.get_text(input_file)
end_time = time()
print("Parsing TXT took {0:.3f}".format(end_time - start_time), "seconds")

# Stage 2: load the words that should be removed (stopwords).
# An empty-string stopwords_file is treated as "nothing to load" and
# leaves `stopwords` as an empty set; any other value is handed to
# split.get_list and the resulting list is converted to a set.
if stopwords_file != '':
    start_time = time()
    stopwords = split.get_list(stopwords_file, enableComments=True)
    end_time = time()
    # The timing is reported before the list -> set conversion, so the
    # conversion itself is not part of the measured interval.
    print("\nParsing TXT with stopwords took {0:.3f}".format(end_time - start_time), "seconds")
    stopwords = set(stopwords)
else:
    stopwords = set()

# Stage 3: extract key phrases from the text, passing the stopwords and
# the configured lemmatizer through to getKeyPhrases.
print("\nStarted getting keyword phrases")
start_time = time()
# NOTE(review): this rebinds `keywords` -- after this line the name
# refers to the value returned by getKeyPhrases, not to the object the
# call was made on, so getKeyPhrases cannot be reached through it again.
keywords = keywords.getKeyPhrases(text, stopwords, lemmatizer=lemmatizer)
end_time = time()
print("Getting keyword phrases took {0:.3f}".format(end_time - start_time), "seconds")

if (output_file == ''):
	print("\nKeywords (generated by RAKE):\n")
	for key in keywords:
		print(key[0])
else: