def Main(url, similarity_mode="TfIdfCosine", similarity_limit=0.75):
    '''
    Entry Point.

    Args:
        url:    PDF url.
    '''
    if similarity_mode == "TfIdfCosine":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the so-called cosine similarity of Tf-Idf vectors.
        similarity_filter = TfIdfCosine()

    elif similarity_mode == "Dice":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Dice coefficient.
        similarity_filter = Dice()

    elif similarity_mode == "Jaccard":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Jaccard coefficient.
        similarity_filter = Jaccard()

    elif similarity_mode == "Simpson":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Simpson coefficient.
        similarity_filter = Simpson()

    else:
        raise ValueError("similarity_mode must be 'TfIdfCosine', 'Dice', 'Jaccard', or 'Simpson'.")

    # The object of the NLP.
    nlp_base = NlpBase()
    # Set tokenizer. This is a Japanese tokenizer with MeCab.
    nlp_base.tokenizable_doc = MeCabTokenizer()

    # Set the object of NLP.
    similarity_filter.nlp_base = nlp_base
    # If the similarity exceeds this value, the sentence will be cut off.
    similarity_filter.similarity_limit = similarity_limit

    # The object of Web-scraping.
    web_scrape = WebScraping()
    # Set the object of reading PDF files.
    web_scrape.readable_web_pdf = WebPDFReading()
    # Execute Web-scraping.
    document = web_scrape.scrape(url)

    # The object of automatic summarization.
    auto_abstractor = AutoAbstractor()
    # Set tokenizer. This is a Japanese tokenizer with MeCab.
    auto_abstractor.tokenizable_doc = MeCabTokenizer()
    # Object of abstracting and filtering document.
    abstractable_doc = TopNRankAbstractor()
    # Execute summarization.
    result_dict = auto_abstractor.summarize(document, abstractable_doc, similarity_filter)

    # Output the first 3 summarized sentences.
    for sentence in result_dict["summarize_result"][:3]:
        print(sentence)
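# A minimal invocation sketch for the entry point above. The command-line handling
# (positional url, optional similarity_mode and similarity_limit) is illustrative
# and not part of the library; it only mirrors the `sys.argv` convention used by
# the other demo scripts in this collection.
if __name__ == "__main__":
    import sys

    url = sys.argv[1]
    similarity_mode = sys.argv[2] if len(sys.argv) > 2 else "TfIdfCosine"
    similarity_limit = float(sys.argv[3]) if len(sys.argv) > 3 else 0.75
    Main(url, similarity_mode=similarity_mode, similarity_limit=similarity_limit)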
def Main(url):
    '''
    Entry Point.

    Args:
        url:    target url.
    '''
    # The object of Web-Scraping.
    web_scrape = WebScraping()
    # Execute Web-Scraping.
    document = web_scrape.scrape(url)

    # The object of automatic summarization with N-gram.
    auto_abstractor = NgramAutoAbstractor()
    # The n-gram object.
    auto_abstractor.n_gram = Ngram()
    # n of n-gram.
    auto_abstractor.n = 3
    # Set tokenizer. This is a Japanese tokenizer with MeCab.
    auto_abstractor.tokenizable_doc = MeCabTokenizer()
    # Object of abstracting and filtering document.
    abstractable_doc = TopNRankAbstractor()
    # Execute summarization.
    result_dict = auto_abstractor.summarize(document, abstractable_doc)

    # Output 3 summarized sentences.
    limit = 3
    i = 1
    for sentence in result_dict["summarize_result"]:
        print(sentence)
        if i >= limit:
            break
        i += 1
def Main(url):
    '''
    Entry point.

    Args:
        url:    target url.
    '''
    # Object of web scraping.
    web_scrape = WebScraping()
    # Web-scraping.
    document = web_scrape.scrape(url)

    # Object of automatic summarization.
    auto_abstractor = AutoAbstractor()
    # Set tokenizer.
    auto_abstractor.tokenizable_doc = SimpleTokenizer()
    # Set delimiter.
    auto_abstractor.delimiter_list = [".", "\n"]
    # Object of abstracting and filtering document.
    abstractable_doc = TopNRankAbstractor()
    # Summarize document.
    result_dict = auto_abstractor.summarize(document, abstractable_doc)

    # Output 3 summarized sentences.
    limit = 3
    i = 1
    for sentence in result_dict["summarize_result"]:
        print(sentence)
        if i >= limit:
            break
        i += 1
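# A minimal usage sketch for the English-language demo above, assuming it is run
# as a stand-alone script. The Wikipedia URL is only an example target; any page
# whose sentences are delimited by "." should work with SimpleTokenizer.
if __name__ == "__main__":
    Main("https://en.wikipedia.org/wiki/Internet_of_things")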
def Main(url):
    '''
    Entry Point.

    Args:
        url:    target url.
    '''
    # The object of Web-Scraping.
    web_scrape = WebScraping()
    # Execute Web-Scraping.
    document = web_scrape.scrape(url)

    # The object of NLP.
    nlp_base = NlpBase()
    # Set tokenizer. This is a Japanese tokenizer with MeCab.
    nlp_base.tokenizable_doc = MeCabTokenizer()

    # Extract sentences from the document.
    sentence_list = nlp_base.listup_sentence(document)

    batch_size = 10
    if len(sentence_list) < batch_size:
        raise ValueError("The number of extracted sentences is insufficient.")

    # Tokenize each sentence and build the token master list.
    all_token_list = []
    for i in range(len(sentence_list)):
        nlp_base.tokenize(sentence_list[i])
        all_token_list.extend(nlp_base.token)
        sentence_list[i] = nlp_base.token

    # The object which vectorizes sentences with an LSTM-RTRBM.
    vectorizable_sentence = LSTMRTRBM()
    # Learn the token sequences.
    vectorizable_sentence.learn(
        sentence_list=sentence_list,
        token_master_list=list(set(all_token_list)),
        hidden_neuron_count=1000,
        batch_size=batch_size,
        learning_rate=1e-03,
        seq_len=5
    )
    # Vectorize the first `batch_size` sentences.
    test_list = sentence_list[:batch_size]
    feature_points_arr = vectorizable_sentence.vectorize(test_list)

    print("Feature points (first " + str(batch_size) + " sentences):")
    print(feature_points_arr)
def Main(url):
    '''
    Entry Point.

    Args:
        url:    target url.
    '''
    # The object of Web-Scraping.
    web_scrape = WebScraping()
    # Execute Web-Scraping.
    document = web_scrape.scrape(url)

    # The object of NLP.
    nlp_base = NlpBase()
    # Set tokenizer. This is a Japanese tokenizer with MeCab.
    nlp_base.tokenizable_doc = MeCabTokenizer()

    # Extract sentences from the document.
    sentence_list = nlp_base.listup_sentence(document)

    # Tokenize each sentence and build the token master list.
    all_token_list = []
    for i in range(len(sentence_list)):
        nlp_base.tokenize(sentence_list[i])
        all_token_list.extend(nlp_base.token)
        sentence_list[i] = nlp_base.token

    # The object which vectorizes sentences with an LSTM-based Encoder/Decoder.
    vectorizable_sentence = EncoderDecoder()
    # Learn the token sequences.
    vectorizable_sentence.learn(
        sentence_list=sentence_list,
        token_master_list=list(set(all_token_list)),
        epochs=60
    )
    # Vectorize the first 5 sentences.
    test_list = sentence_list[:5]
    feature_points_arr = vectorizable_sentence.vectorize(test_list)
    # Mean reconstruction error of the Encoder/Decoder.
    reconstruction_error_arr = vectorizable_sentence.controller.get_reconstruction_error().mean()

    print("Feature points (first 5 sentences):")
    print(feature_points_arr)
    print("Reconstruction error (MSE):")
    print(reconstruction_error_arr)
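# A hedged follow-up sketch: comparing two of the extracted feature points with
# cosine similarity. It assumes `feature_points_arr` is a 2D NumPy array with one
# row per vectorized sentence; that layout is an assumption about the return value
# of `vectorize`, not something guaranteed by the snippet above.
import numpy as np

def cosine_similarity(v1, v2):
    # Cosine of the angle between two feature vectors.
    return float(np.dot(v1, v2) / (np.linalg.norm(v1) * np.linalg.norm(v2)))

# Example: similarity between the feature points of the first two sentences.
# print(cosine_similarity(feature_points_arr[0], feature_points_arr[1]))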
def Main(url):
    '''
    Entry Point.

    Args:
        url:    PDF url.
    '''
    # The object of Web-scraping.
    web_scrape = WebScraping()
    # Set the object of reading PDF files.
    web_scrape.readable_web_pdf = WebPDFReading()
    # Execute Web-scraping.
    document = web_scrape.scrape(url)

    # The object of automatic summarization.
    auto_abstractor = AutoAbstractor()
    # Set tokenizer. This is a Japanese tokenizer with MeCab.
    auto_abstractor.tokenizable_doc = MeCabTokenizer()
    # Object of abstracting and filtering document.
    abstractable_doc = TopNRankAbstractor()
    # Execute summarization.
    result_dict = auto_abstractor.summarize(document, abstractable_doc)

    # Output summarized sentences.
    for sentence in result_dict["summarize_result"]:
        print(sentence)
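# The demos in this collection assume imports along the following lines. These
# module paths follow the standard pysummarization package layout but are stated
# here as an assumption; verify them against the installed version. MeCabTokenizer
# additionally requires a working MeCab installation.
from pysummarization.web_scraping import WebScraping
from pysummarization.readablewebpdf.web_pdf_reading import WebPDFReading
from pysummarization.nlpbase.auto_abstractor import AutoAbstractor
from pysummarization.tokenizabledoc.mecab_tokenizer import MeCabTokenizer
from pysummarization.abstractabledoc.top_n_rank_abstractor import TopNRankAbstractor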
def Main(document):
    '''
    Entry Point.

    Args:
        document:   target document string.

    Returns:
        dict of summarization results.
    '''
    # Object of automatic summarization.
    auto_abstractor = AutoAbstractor()
    # Set tokenizer.
    auto_abstractor.tokenizable_doc = SimpleTokenizer()
    # Set delimiter.
    auto_abstractor.delimiter_list = [".", ","]
    # Object of abstracting and filtering document.
    abstractable_doc = TopNRankAbstractor()
    # Summarize document.
    result_dict = auto_abstractor.summarize(document, abstractable_doc)
    return result_dict


if __name__ == "__main__":
    import sys
    # Web site url, e.g. "https://en.wikipedia.org/wiki/Internet_of_things".
    url = sys.argv[1]
    # Object of web scraping.
    web_scrape = WebScraping()
    # Web-scraping.
    document = web_scrape.scrape(url)
    result_dict = Main(document)

    # Output 3 summarized sentences.
    limit = 3
    i = 1
    for sentence in result_dict["summarize_result"]:
        print(sentence)
        if i >= limit:
            break
        i += 1
    # Reward function of the Q-learning demo class. The method name and signature
    # below are assumed (hypothetical); only the reward logic in the body is given.
    def observe_reward_value(self, state_key, action_key):
        '''
        Observe the reward value.

        Args:
            state_key:     The state.
            action_key:    The action.

        Returns:
            `1.0` if the action is registered for the state, otherwise `0.0`.
        '''
        reward_value = 0.0
        if state_key in self.__state_action_list_dict:
            if action_key in self.__state_action_list_dict[state_key]:
                reward_value = 1.0
        return reward_value


if __name__ == "__main__":
    import sys

    url = sys.argv[1]
    # Object of web scraping.
    web_scrape = WebScraping()
    # Web-scraping.
    document = web_scrape.scrape(url)

    # Optional limit read from the command line.
    limit = 1000
    if len(sys.argv) > 2:
        limit = int(sys.argv[2])

    # Learning rate (alpha) and discount factor (gamma) of Q-learning.
    alpha_value = 0.9
    gamma_value = 0.9

    boltzmann_q_learning = AutocompletionBoltzmannQLearning()
    boltzmann_q_learning.alpha_value = alpha_value
    boltzmann_q_learning.gamma_value = gamma_value
    boltzmann_q_learning.initialize(n=2)
    boltzmann_q_learning.pre_training(document=document)
import numpy as np


def Main(url, similarity_mode="EncoderDecoderClustering", cluster_num=10):
    '''
    Entry Point.

    Args:
        url:    PDF url.
    '''
    # The object of Web-scraping.
    web_scrape = WebScraping()
    # Set the object of reading PDF files.
    web_scrape.readable_web_pdf = WebPDFReading()
    # Execute Web-scraping.
    document = web_scrape.scrape(url)

    if similarity_mode == "EncoderDecoderClustering":
        # The object of `Similarity Filter`.
        # The similarity is observed by checking whether each pair of sentences belongs to the same cluster.
        # If so, the similarity is `1.0`; if not, it is `0.0`.
        # The data clustering algorithm is based on the K-Means method,
        # learning the data embedded in the hidden layer of the LSTM.
        similarity_filter = EncoderDecoderClustering(
            document,
            hidden_neuron_count=200,
            epochs=100,
            batch_size=100,
            learning_rate=1e-05,
            learning_attenuate_rate=0.1,
            attenuate_epoch=50,
            bptt_tau=8,
            weight_limit=0.5,
            dropout_rate=0.5,
            test_size_rate=0.3,
            cluster_num=cluster_num,
            max_iter=100,
            debug_mode=True
        )

    elif similarity_mode == "LSTMRTRBMClustering":
        # The object of `Similarity Filter`.
        # The similarity is observed by checking whether each pair of sentences belongs to the same cluster.
        # If so, the similarity is `1.0`; if not, it is `0.0`.
        # The data clustering algorithm is based on the K-Means method,
        # learning the data embedded in the hidden layer of the LSTM-RTRBM.
        similarity_filter = LSTMRTRBMClustering(
            document,
            tokenizable_doc=None,
            hidden_neuron_count=1000,
            batch_size=100,
            learning_rate=1e-03,
            seq_len=5,
            cluster_num=cluster_num,
            max_iter=100,
            debug_mode=True
        )

    else:
        raise ValueError("similarity_mode must be 'EncoderDecoderClustering' or 'LSTMRTRBMClustering'.")

    # Print the sentences belonging to each cluster.
    print("#" * 100)
    for i in range(cluster_num):
        print("Label: " + str(i))
        key_arr = np.where(similarity_filter.labeled_arr == i)[0]
        sentence_list = np.array(similarity_filter.sentence_list)[key_arr].tolist()
        for j in range(len(sentence_list)):
            print("".join(sentence_list[j]))
        print()
def Main(url, similarity_mode="TfIdfCosine", similarity_limit=0.75):
    '''
    Entry Point.

    Args:
        url:    PDF url.
    '''
    # The object of Web-scraping.
    web_scrape = WebScraping()
    # Set the object of reading PDF files.
    web_scrape.readable_web_pdf = WebPDFReading()
    # Execute Web-scraping.
    document = web_scrape.scrape(url)

    if similarity_mode == "EncoderDecoderCosine":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the so-called cosine similarity of manifolds,
        # which are embedded in the hidden layer of an LSTM-based Encoder/Decoder.
        similarity_filter = EncoderDecoderCosine(
            document,
            hidden_neuron_count=200,
            epochs=100,
            batch_size=100,
            learning_rate=1e-05,
            learning_attenuate_rate=0.1,
            attenuate_epoch=50,
            bptt_tau=8,
            weight_limit=0.5,
            dropout_rate=0.5,
            test_size_rate=0.3,
            debug_mode=True
        )

    elif similarity_mode == "EncoderDecoderClustering":
        # The object of `Similarity Filter`.
        # The similarity is observed by checking whether each pair of sentences belongs to the same cluster.
        # If so, the similarity is `1.0`; if not, it is `0.0`.
        # The data clustering algorithm is based on the K-Means method,
        # learning the data embedded in the hidden layer of the LSTM.
        similarity_filter = EncoderDecoderClustering(
            document,
            hidden_neuron_count=200,
            epochs=100,
            batch_size=100,
            learning_rate=1e-05,
            learning_attenuate_rate=0.1,
            attenuate_epoch=50,
            bptt_tau=8,
            weight_limit=0.5,
            dropout_rate=0.5,
            test_size_rate=0.3,
            cluster_num=10,
            max_iter=100,
            debug_mode=True
        )

    elif similarity_mode == "LSTMRTRBMCosine":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the so-called cosine similarity of manifolds,
        # which are embedded in the hidden layer of the LSTM-RTRBM.
        similarity_filter = LSTMRTRBMCosine(
            document,
            training_count=1,
            hidden_neuron_count=100,
            batch_size=100,
            learning_rate=1e-03,
            seq_len=5,
            debug_mode=True
        )

    elif similarity_mode == "LSTMRTRBMClustering":
        # The object of `Similarity Filter`.
        # The similarity is observed by checking whether each pair of sentences belongs to the same cluster.
        # If so, the similarity is `1.0`; if not, it is `0.0`.
        # The data clustering algorithm is based on the K-Means method,
        # learning the data embedded in the hidden layer of the LSTM-RTRBM.
        similarity_filter = LSTMRTRBMClustering(
            document,
            tokenizable_doc=None,
            hidden_neuron_count=1000,
            batch_size=100,
            learning_rate=1e-03,
            seq_len=5,
            cluster_num=10,
            max_iter=100,
            debug_mode=True
        )

    elif similarity_mode == "TfIdfCosine":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the so-called cosine similarity of Tf-Idf vectors.
        similarity_filter = TfIdfCosine()

    elif similarity_mode == "Dice":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Dice coefficient.
        similarity_filter = Dice()

    elif similarity_mode == "Jaccard":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Jaccard coefficient.
        similarity_filter = Jaccard()

    elif similarity_mode == "Simpson":
        # The object of `Similarity Filter`.
        # The similarity observed by this object is the Simpson coefficient.
        similarity_filter = Simpson()

    else:
        raise ValueError("Unknown similarity_mode: " + str(similarity_mode))

    # The object of the NLP.
    nlp_base = NlpBase()
    # Set tokenizer. This is a Japanese tokenizer with MeCab.
    nlp_base.tokenizable_doc = MeCabTokenizer()

    # Set the object of NLP.
    similarity_filter.nlp_base = nlp_base
    # If the similarity exceeds this value, the sentence will be cut off.
    similarity_filter.similarity_limit = similarity_limit

    # The object of automatic summarization.
    auto_abstractor = AutoAbstractor()
    # Set tokenizer. This is a Japanese tokenizer with MeCab.
    auto_abstractor.tokenizable_doc = MeCabTokenizer()
    # Object of abstracting and filtering document.
    abstractable_doc = TopNRankAbstractor()
    # Execute summarization.
    result_dict = auto_abstractor.summarize(document, abstractable_doc, similarity_filter)

    # Output the first 3 summarized sentences.
    for sentence in result_dict["summarize_result"][:3]:
        print(sentence)