def run_interface(self,
                  job_id: str,
                  input_dict: Dict[str, Any],
                  method: str,
                  use_cython: bool = True,
                  is_use_cache: bool = True,
                  is_use_memmap: bool = True):
    """Background task that runs a long feature-selection job with progress reports.

    Reports PROGRESS state (with method name and start time) via the task
    backend, runs the feature selection, and saves the scored result into
    the background result database keyed by ``job_id``.

    :param job_id: identifier under which the result is persisted.
    :param input_dict: labeled documents passed straight to ``interface.run_feature_selection``.
    :param method: feature-selection method name (e.g. 'pmi', 'tf_idf').
    :param use_cython: use the cython implementation when available.
    :param is_use_cache: keep huge intermediate objects in a cache during computation.
    :param is_use_memmap: keep matrices in a memmap during computation.
    :return: dict with job id, completion status, method, and start timestamp.
    """
    started_at = datetime.now()
    # Format once; the identical timestamp string is used both in the
    # PROGRESS meta and in the final return payload (was computed twice).
    started_at_str = started_at.strftime('%Y-%m-%d %H:%M:%S')
    self.update_state(state='PROGRESS',
                      meta={'method': method, 'started_at': started_at_str})
    scored_result_obj = interface.run_feature_selection(
        input_dict=input_dict,
        method=method,
        use_cython=use_cython,
        is_use_cache=is_use_cache,
        is_use_memmap=is_use_memmap,
        path_working_dir=flask_app.config['PATH_WORKING_DIR'])
    backend_database_handler.insert_record(
        job_id=job_id,
        result_obj=scored_result_obj.ScoreMatrix2ScoreDictionary())
    return {'job_id': job_id,
            'status': 'completed',
            'method': method,
            'started_at': started_at_str}
def pmi_with_cython(input_corpus):
    """Benchmark PMI feature selection with the cython implementation enabled."""
    logging.debug(msg='With cython is True')
    start = time.time()
    # Result object is intentionally discarded; only the wall-clock time matters here.
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus,
        method='pmi',
        n_jobs=-1,
        use_cython=True)
    elapsed_time = time.time() - start
    # Fixed: removed redundant double parentheses (a 2to3 artifact) around the print argument.
    print("elapsed_time with cython:{} [sec]".format(elapsed_time))
def pmi_with_threading(input_corpus):
    """Benchmark PMI feature selection using the joblib 'threading' backend."""
    start = time.time()
    logging.debug(msg='With threading backend')
    # Result object is intentionally discarded; only the wall-clock time matters here.
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus,
        method='pmi',
        n_jobs=-1,
        joblib_backend='threading')
    elapsed_time = time.time() - start
    # Fixed: removed redundant double parentheses (a 2to3 artifact) around the print argument.
    print("elapsed_time with multiprocess:{} [sec]".format(elapsed_time))
def pmi_with_parallel(input_corpus):
    """Benchmark PMI feature selection on the default (multiprocessing) backend."""
    logging.debug(msg='With multiprocessing backend')
    time_begin = time.time()
    # Scored matrix is kept only so the call is fully evaluated; the value is unused.
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus,
        method='pmi',
        n_jobs=-1,
    )
    duration_sec = time.time() - time_begin
    logger.info("elapsed_time with multiprocess:{} [sec]".format(duration_sec))
def test_interface_shelve(self):
    """Run the test over every combination of parameter conditions:

    - with / without cython mode
    - with / without cache mode
    - with / without memmap mode
    """
    # Copy the input into a json-backed PersistentDict ('c' = create mode).
    shelve_obj = PersistentDict(self.path_shelve_file, 'c', 'json')
    for key, value in self.input_dict.items():
        shelve_obj[key] = value
    # Copy the same input into a sqlite3-backed persistent dict as well.
    sqlite3_dict_obj = SqliteDict(filename=self.path_sqlite3_persistent, autocommit=True)
    for key, value in self.input_dict.items():
        sqlite3_dict_obj[key] = value
    for method_name in self.method:
        for cython_flag in self.bool_cython:
            for cache_flag in self.is_use_cache:
                for memmap_flag in self.is_use_memmap:
                    # Data source #1: shelve-backed persistent dict.
                    scored_result_persisted = interface.run_feature_selection(
                        input_dict=shelve_obj,
                        method=method_name,
                        use_cython=cython_flag,
                        is_use_cache=cache_flag,
                        is_use_memmap=memmap_flag
                    )  # type: ScoredResultObject
                    self.assertIsInstance(scored_result_persisted, ScoredResultObject)
                    self.assertIsInstance(scored_result_persisted.ScoreMatrix2ScoreDictionary(), list)
                    # Data source #2: sqlite3-backed dict.
                    # NOTE(review): memmap_flag is not passed here — confirm whether intentional.
                    scored_result_sqlite3_persisted = interface.run_feature_selection(
                        input_dict=sqlite3_dict_obj,
                        method=method_name,
                        use_cython=cython_flag,
                        is_use_cache=cache_flag)  # type: ScoredResultObject
                    self.assertIsInstance(scored_result_sqlite3_persisted, ScoredResultObject)
                    self.assertIsInstance(scored_result_sqlite3_persisted.ScoreMatrix2ScoreDictionary(), list)
                    # You check if result is same between data-source = shelve_obj and data-source = dict-object
                    scored_result_dict = interface.run_feature_selection(
                        input_dict=self.input_dict,
                        method=method_name,
                        use_cython=cython_flag,
                        is_use_cache=cache_flag)  # type: ScoredResultObject
                    self.assertIsInstance(scored_result_dict, ScoredResultObject)
                    self.assertIsInstance(scored_result_dict.ScoreMatrix2ScoreDictionary(), list)
                    # All three data sources must produce identical score matrices.
                    numpy.testing.assert_array_equal(scored_result_persisted.scored_matrix.toarray(),
                                                     scored_result_dict.scored_matrix.toarray())
                    numpy.testing.assert_array_equal(scored_result_sqlite3_persisted.scored_matrix.toarray(),
                                                     scored_result_dict.scored_matrix.toarray())
def pmi_with_cython(input_corpus):
    """Benchmark PMI feature selection with the cython implementation enabled."""
    logging.debug(msg='With cython is True')
    start = time.time()
    # Result object is intentionally discarded; only the wall-clock time matters here.
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus,
        method='pmi',
        n_jobs=-1,
        use_cython=True
    )
    elapsed_time = time.time() - start
    # Fixed: normalized `print (...)` (space before call, a py2 holdover) to `print(...)`.
    print("elapsed_time with cython:{} [sec]".format(elapsed_time))
def pmi_with_threading(input_corpus):
    """Benchmark PMI feature selection using the joblib 'threading' backend."""
    start = time.time()
    logging.debug(msg='With threading backend')
    # Result object is intentionally discarded; only the wall-clock time matters here.
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus,
        method='pmi',
        n_jobs=-1,
        joblib_backend='threading'
    )
    elapsed_time = time.time() - start
    # Fixed: normalized `print (...)` (space before call, a py2 holdover) to `print(...)`.
    print("elapsed_time with multiprocess:{} [sec]".format(elapsed_time))
def get_bns(text):
    """Compute BNS scores over *text* and return them as a score dictionary.

    :param text: iterable of tokenized sentences (list of lists of tokens).
    :return: list of score records from ``ScoreMatrix2ScoreDictionary``.
    """
    # Flatten the nested sentence structure into a single token stream.
    tokens = [w for x in text for w in x]
    # bns needs exactly two labels: '0' = token NOT followed by '.',
    # '1' = token followed by '.'. The last token has no successor and is skipped.
    input_dict = {}
    input_dict['0'] = [[w] for i, w in enumerate(tokens[:-1]) if tokens[i + 1] != '.']
    input_dict['1'] = [[w] for i, w in enumerate(tokens[:-1]) if tokens[i + 1] == '.']
    # BUG FIX: `method=method` referenced a name undefined in this function;
    # this function computes BNS, so the method is pinned to 'bns'.
    bns_scored_object = interface.run_feature_selection(
        input_dict=input_dict,
        method='bns',
        n_jobs=4)
    return bns_scored_object.ScoreMatrix2ScoreDictionary()
def test_interface_shelve(self):
    """Run the test over every combination of parameter conditions:

    - with / without cython mode
    - with / without cache mode
    - with / without memmap mode
    """
    # Copy the input into a json-backed PersistentDict ('c' = create mode).
    shelve_obj = PersistentDict(self.path_shelve_file, 'c', 'json')
    for key, value in self.input_dict.items():
        shelve_obj[key] = value
    # Copy the same input into a sqlite3-backed persistent dict as well.
    sqlite3_dict_obj = SqliteDict(filename=self.path_sqlite3_persistent, autocommit=True)
    for key, value in self.input_dict.items():
        sqlite3_dict_obj[key] = value
    for method_name in self.method:
        for cython_flag in self.bool_cython:
            for cache_flag in self.is_use_cache:
                for memmap_flag in self.is_use_memmap:
                    # Data source #1: shelve-backed persistent dict.
                    scored_result_persisted = interface.run_feature_selection(
                        input_dict=shelve_obj,
                        method=method_name,
                        use_cython=cython_flag,
                        is_use_cache=cache_flag,
                        is_use_memmap=memmap_flag
                    )  # type: ScoredResultObject
                    self.assertIsInstance(scored_result_persisted, ScoredResultObject)
                    self.assertIsInstance(
                        scored_result_persisted.
                        ScoreMatrix2ScoreDictionary(), list)
                    # Data source #2: sqlite3-backed dict.
                    # NOTE(review): memmap_flag is not passed here — confirm whether intentional.
                    scored_result_sqlite3_persisted = interface.run_feature_selection(
                        input_dict=sqlite3_dict_obj,
                        method=method_name,
                        use_cython=cython_flag,
                        is_use_cache=cache_flag
                    )  # type: ScoredResultObject
                    self.assertIsInstance(scored_result_sqlite3_persisted, ScoredResultObject)
                    self.assertIsInstance(
                        scored_result_sqlite3_persisted.
                        ScoreMatrix2ScoreDictionary(), list)
                    # You check if result is same between data-source = shelve_obj and data-source = dict-object
                    scored_result_dict = interface.run_feature_selection(
                        input_dict=self.input_dict,
                        method=method_name,
                        use_cython=cython_flag,
                        is_use_cache=cache_flag
                    )  # type: ScoredResultObject
                    self.assertIsInstance(scored_result_dict, ScoredResultObject)
                    self.assertIsInstance(
                        scored_result_dict.ScoreMatrix2ScoreDictionary(),
                        list)
                    # All three data sources must produce identical score matrices.
                    numpy.testing.assert_array_equal(
                        scored_result_persisted.scored_matrix.toarray(),
                        scored_result_dict.scored_matrix.toarray())
                    numpy.testing.assert_array_equal(
                        scored_result_sqlite3_persisted.scored_matrix.
                        toarray(),
                        scored_result_dict.scored_matrix.toarray())
[ (("he", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ], [ (("you", "N"), ("are", "V")), (("very", "ADV"), ("awesome", "ADJ")), (("guy", "N"),) ], [ (("i", "N"), ("am", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ] ], "label_b": [ [ (("she", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("girl", "N"),) ], [ (("you", "N"), ("are", "V")), (("very", "ADV"), ("awesome", "ADJ")), (("girl", "N"),) ], [ (("she", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ] ] } # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # tf idf tf_idf_scored_object = interface.run_feature_selection( input_dict=input_dict_tuple_feature, method='tf_idf', n_jobs=5 ) pprint.pprint(tf_idf_scored_object.ScoreMatrix2ScoreDictionary()) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # pmi pmi_scored_object = interface.run_feature_selection( input_dict=input_dict_tuple_feature, method='pmi', n_jobs=5 ) pprint.pprint(pmi_scored_object.ScoreMatrix2ScoreDictionary())
# Demo: run PMI feature selection over NLTK corpora stored in persistent dicts.
# NOTE(review): abc_corpus and genesis_corpus are defined earlier, outside this chunk.
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()

# Case of PersistentDict
persistent_dict_obj = PersistentDict('demo.json', 'c', format='json')
persistent_dict_obj['abc'] = list(abc_corpus)
persistent_dict_obj['genesis'] = list(genesis_corpus)
persistent_dict_obj['web'] = list(web_corpus)
persistent_dict_obj['gutenberg'] = list(gutenberg_corpus)
start = time.time()
# If you put is_use_cache=True, it uses cache object for keeping huge objects during computation
# If you put is_use_memmap=True, it uses memmap for keeping matrix during computation
scored_matrix_obj = interface.run_feature_selection(
    input_dict=persistent_dict_obj,
    method='pmi',
    use_cython=True,
    is_use_cache=True,
    is_use_memmap=True)
elapsed_time = time.time() - start
print("elapsed_time with cython:{} [sec]".format(elapsed_time))

# Case of SqliteDict: same corpora, but persisted through sqlite3.
persisten_sqlite3_dict_obj = SqliteDict('./my_db.sqlite', autocommit=True)
persisten_sqlite3_dict_obj['abc'] = list(abc_corpus)
persisten_sqlite3_dict_obj['genesis'] = list(genesis_corpus)
persisten_sqlite3_dict_obj['web'] = list(web_corpus)
persisten_sqlite3_dict_obj['gutenberg'] = list(gutenberg_corpus)
start = time.time()
scored_matrix_obj_ = interface.run_feature_selection(
    input_dict=persisten_sqlite3_dict_obj,
    method='pmi',
    use_cython=True)
["hero", "cc", "bb"], ], "label_c": [ ["cc", "cc", "cc"], ["cc", "cc", "bb"], ["xx", "xx", "cc"], ["aa", "xx", "cc"], ] } # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # tf idf tf_idf_scored_object = interface.run_feature_selection( input_dict=input_dict, method='tf_idf', ngram=1, n_jobs=5 ) # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ # pmi pmi_scored_object = interface.run_feature_selection( input_dict=input_dict, method='pmi', ngram=1, n_jobs=1, use_cython=False ) pprint.pprint(pmi_scored_object.ScoreMatrix2ScoreDictionary()) # you can use cython version pmi also
def calculate_PMI_from_labeled_data(bengali_neg, bengali_pos, output_file):
    """Score words by PMI over a two-label (neg/pos) corpus and write out the
    high-contrast words that appear in neither known lexicon.

    :param bengali_neg: negative-labeled raw data, fed to get_top_words_from_labeled_set.
    :param bengali_pos: positive-labeled raw data, fed to get_top_words_from_labeled_set.
    :param output_file: path passed to write_reviews for the resulting lexicon.
    """
    sentence_tokens_neq = get_top_words_from_labeled_set(bengali_neg)
    sentence_tokens_pos = get_top_words_from_labeled_set(bengali_pos)
    #return
    #print(freq_neq[:100])
    #print(freq_pos[:100])
    # PMI input: two labels, 'neg' and 'pos'.
    input_dict = {}
    input_dict['neg'] = sentence_tokens_neq
    input_dict['pos'] = sentence_tokens_pos
    '''
    input_dict = {
        "label_a": [
            ["I", "aa", "aa", "aa", "aa", "aa"],
            ["bb", "aa", "aa", "aa", "aa", "aa"],
            ["I", "aa", "hero", "some", "ok", "aa"]
        ],
        "label_b": [
            ["bb", "bb", "bb"],
            ["bb", "bb", "bb"],
            ["hero", "ok", "bb"],
            ["hero", "cc", "bb"],
        ],
        "label_c": [
            ["cc", "cc", "cc"],
            ["cc", "cc", "bb"],
            ["xx", "xx", "cc"],
            ["aa", "xx", "cc"],
        ]
    }
    '''
    from DocumentFeatureSelection import interface
    # x is a list of score records: each has 'feature', 'label', 'frequency', 'score'.
    x = interface.run_feature_selection(input_dict, method='pmi', use_cython=True).convert_score_matrix2score_record()
    # y = interface.run_feature_selection(input_dict, method='bns', use_cython=True).convert_score_matrix2score_record()
    '''
    from DocumentFeatureSelection.common.data_converter import DataCsrMatrix
    from DocumentFeatureSelection.bns import bns_python3
    from DocumentFeatureSelection.common import data_converter
    data_csr_matrix = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix(
        labeled_documents=input_dict,
        n_jobs=5
    )
    assert isinstance(data_csr_matrix, DataCsrMatrix)
    csr_matrix_ = data_csr_matrix.csr_matrix_
    n_docs_distribution = data_csr_matrix.n_docs_distribution
    result_bns = bns_python3.BNS().fit_transform(X=csr_matrix_, y=None,
                                                 unit_distribution=n_docs_distribution,
                                                 use_cython=True)
    print(x[:10])
    print(result_bns[:10])
    '''
    #return
    #list_of_words = []
    # Collect the top-5000 scored records; word_dic maps "<feature>_<label>" to
    # "<frequency>_<rounded score>" so both values can be recovered later by split("_").
    # NOTE(review): assumes x has at least 5000 records — an IndexError otherwise; confirm.
    list_of_words = set()
    word_dic = {}
    for i in range(5000):  #1300 used for labeled
        word = x[i]
        #print(word)
        #list_of_words.append(word['feature'])
        list_of_words.add(word['feature'])
        #print(word['feature'], word['label'], " ",word['frequency'], round(word['score'],6))
        key = word['feature'] + "_" + word['label']
        #print(key)
        word_dic[key] = str(word['frequency']) + "_" + str(round(word['score'], 6))
    print("~~~~~~~~~~~~~~")
    # Keep only words seen under BOTH labels whose frequency differs strongly
    # between pos and neg (relative difference > 0.50).
    top_words = []
    negative = []
    for word in list_of_words:
        key_1 = word + "_neg"
        key_2 = word + "_pos"
        if key_1 not in word_dic or key_2 not in word_dic:
            continue
        freq_neg = float(word_dic[key_1].split("_")[0])
        freq_pos = float(word_dic[key_2].split("_")[0])
        diff = float(freq_pos) - float(freq_neg)
        diff_ratio = abs(diff / float(freq_neg + freq_pos))
        if diff_ratio > 0.50:
            #print(word)  #, diff)
            top_words.append(word)  #freq_neg, freq_pos,
    print("Number of word: ", len(top_words))
    #return
    # Load the two known lexica from disk.
    # NOTE(review): hard-coded absolute paths — these only resolve on the author's machine.
    lexicon_from_english = []
    filename = "/Users/russell/Documents/NLP/Paper-4/resources/unique_list.txt"
    with open(filename) as text:
        for line in text:
            lexicon_from_english.append(line.strip())
    lexicon_from_labled = []
    filename = "/Users/russell/Documents/NLP/Paper-4/resources/lexicon_from_training_set_11807.txt"
    with open(filename) as text:
        for line in text:
            lexicon_from_labled.append(line.strip())
    print("\n\n--- ")
    # Words absent from both lexica form the new (unlabeled) lexicon.
    count_found = 0
    count_not_found = 0
    unlabeled_lexicon = []
    for word in top_words:
        if word not in lexicon_from_english and word not in lexicon_from_labled:
            #print(word)
            count_not_found += 1
            unlabeled_lexicon.append(word)
        else:
            #print(word)
            count_found += 1
    print("Not found, found", count_not_found, count_found)
    write_reviews(output_file, unlabeled_lexicon)
], "label_b": [ ["bb", "bb", "bb"], ["bb", "bb", "bb"], ["hero", "ok", "bb"], ["hero", "cc", "bb"], ], "label_c": [ ["cc", "cc", "cc"], ["cc", "cc", "bb"], ["xx", "xx", "cc"], ["aa", "xx", "cc"], ] } # 前処理済みデータから{file_name: [単語]}のdictを得る filename2morphs = load_preprocessed_record(path_preprocessed_jsonl) # テーブル情報から[(クラスタ番号, 元ファイル名)]を得る arg_information = [(r[2], json.loads(r[4])) for r in cluster_leaf_table] # [(クラスタ番号, [単語])]のリストを作る cluster_word = [(t[0], [word_pos[0] for word_pos in filename2morphs[t[1]['file_name']]]) for t in arg_information] # 入力形式を整える input_dict = {c_id: [t[1] for t in g_obj] for c_id, g_obj in itertools.groupby(sorted(cluster_word, key=lambda t: t[0]), key=lambda t: t[0])} feature_selection_result = interface.run_feature_selection(input_dict, method='tf_idf', use_cython=True).convert_score_matrix2score_record() # 重み付け結果をファイル出力 import pandas df_feature_selection = pandas.DataFrame(feature_selection_result) df_feature_selection.to_csv('./analysis_data/feature_selection.csv', index_label=False, index=False)
gutenberg_corpus = gutenberg.sents() # Case of PersistentDict persistent_dict_obj = PersistentDict('demo.json', 'c', format='json') persistent_dict_obj['abc'] = list(abc_corpus) persistent_dict_obj['genesis'] = list(genesis_corpus) persistent_dict_obj['web'] = list(web_corpus) persistent_dict_obj['gutenberg'] = list(gutenberg_corpus) start = time.time() # If you put is_use_cache=True, it uses cache object for keeping huge objects during computation # If you put is_use_memmap=True, it uses memmap for keeping matrix during computation scored_matrix_obj = interface.run_feature_selection( input_dict=persistent_dict_obj, method='pmi', use_cython=True, is_use_cache=True, is_use_memmap=True ) elapsed_time = time.time() - start print ("elapsed_time with cython:{} [sec]".format(elapsed_time)) # Case of SqliteDict persisten_sqlite3_dict_obj = SqliteDict('./my_db.sqlite', autocommit=True) persisten_sqlite3_dict_obj['abc'] = list(abc_corpus) persisten_sqlite3_dict_obj['genesis'] = list(genesis_corpus) persisten_sqlite3_dict_obj['web'] = list(web_corpus) persisten_sqlite3_dict_obj['gutenberg'] = list(gutenberg_corpus) start = time.time() scored_matrix_obj_ = interface.run_feature_selection(