Code Example #1
def run_interface(self,
                  job_id: str,
                  input_dict: Dict[str, Any],
                  method: str,
                  use_cython: bool = True,
                  is_use_cache: bool = True,
                  is_use_memmap: bool = True):
    """* What you can do
    - Background task that runs a long function with progress reports.
    - It saves result into background DB

    """
    ###
    started_at = datetime.now()
    self.update_state(state='PROGRESS', meta={'method': method,
                                              'started_at': started_at.strftime('%Y-%m-%d %H:%M:%S')})

    scored_result_obj = interface.run_feature_selection(
        input_dict=input_dict,
        method=method,
        use_cython=use_cython,
        is_use_cache=is_use_cache,
        is_use_memmap=is_use_memmap,
        path_working_dir=flask_app.config['PATH_WORKING_DIR']
    )

    backend_database_handler.insert_record(job_id=job_id,
                                           result_obj=scored_result_obj.ScoreMatrix2ScoreDictionary())

    return {'job_id': job_id,
            'status': 'completed',
            'method': method,
            'started_at': started_at.strftime('%Y-%m-%d %H:%M:%S')}
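The task above follows Celery's bound-task pattern (note the self.update_state call). Below is a minimal caller-side sketch, assuming the function is registered as a Celery task named run_interface; the caller-side names are illustrative, not taken from the original snippet:

async_result = run_interface.delay(
    job_id='job-001',
    input_dict=my_input_dict,
    method='pmi'
)
print(async_result.state)  # 'PROGRESS' while running, 'SUCCESS' when finished
print(async_result.info)   # the meta dict, e.g. {'method': ..., 'started_at': ...}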
Code Example #2
def pmi_with_cython(input_corpus):
    logging.debug(msg='With use_cython=True')
    start = time.time()
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus, method='pmi', n_jobs=-1, use_cython=True)
    elapsed_time = time.time() - start
    print(("elapsed_time with cython:{} [sec]".format(elapsed_time)))
Code Example #3
def pmi_with_threading(input_corpus):
    start = time.time()
    logging.debug(msg='With threading backend')
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus,
        method='pmi',
        n_jobs=-1,
        joblib_backend='threading')
    elapsed_time = time.time() - start
    print(("elapsed_time with multiprocess:{} [sec]".format(elapsed_time)))
Code Example #4
def pmi_with_parallel(input_corpus):
    logging.debug(msg='With multiprocessing backend')
    start = time.time()
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus,
        method='pmi',
        n_jobs=-1,
    )
    elapsed_time = time.time() - start
    logger.info("elapsed_time with multiprocess:{} [sec]".format(elapsed_time))
Code Example #5
    def test_interface_shelve(self):
        """パラメタ条件を組み合わせてテストを実行する 
        - cythonモード使う or not
        - cacheモード使う or not
        - memmapモード使う or not
        """
        shelve_obj = PersistentDict(self.path_shelve_file, 'c', 'json')
        for key, value in self.input_dict.items():
            shelve_obj[key] = value

        sqlite3_dict_obj = SqliteDict(filename=self.path_sqlite3_persistent, autocommit=True)
        for key, value in self.input_dict.items():
            sqlite3_dict_obj[key] = value

        for method_name in self.method:
            for cython_flag in self.bool_cython:
                for cache_flag in self.is_use_cache:
                    for memmap_flag in self.is_use_memmap:
                        scored_result_persisted = interface.run_feature_selection(
                            input_dict=shelve_obj,
                            method=method_name,
                            use_cython=cython_flag,
                            is_use_cache=cache_flag,
                            is_use_memmap=memmap_flag
                        )  # type: ScoredResultObject
                        self.assertIsInstance(scored_result_persisted, ScoredResultObject)
                        self.assertIsInstance(scored_result_persisted.ScoreMatrix2ScoreDictionary(), list)

                        scored_result_sqlite3_persisted = interface.run_feature_selection(
                            input_dict=sqlite3_dict_obj,
                            method=method_name, use_cython=cython_flag, is_use_cache=cache_flag)  # type: ScoredResultObject
                        self.assertIsInstance(scored_result_sqlite3_persisted, ScoredResultObject)
                        self.assertIsInstance(scored_result_sqlite3_persisted.ScoreMatrix2ScoreDictionary(), list)

                        # Check that the result is identical across data sources: shelve, sqlite3, and a plain dict
                        scored_result_dict = interface.run_feature_selection(
                            input_dict=self.input_dict,
                            method=method_name, use_cython=cython_flag, is_use_cache=cache_flag)  # type: ScoredResultObject
                        self.assertIsInstance(scored_result_dict, ScoredResultObject)
                        self.assertIsInstance(scored_result_dict.ScoreMatrix2ScoreDictionary(), list)

                        numpy.testing.assert_array_equal(scored_result_persisted.scored_matrix.toarray(),
                                                         scored_result_dict.scored_matrix.toarray())
                        numpy.testing.assert_array_equal(scored_result_sqlite3_persisted.scored_matrix.toarray(),
                                                         scored_result_dict.scored_matrix.toarray())
Code Example #6
def pmi_with_cython(input_corpus):
    logging.debug(msg='With use_cython=True')
    start = time.time()
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus,
        method='pmi',
        n_jobs=-1,
        use_cython=True
    )
    elapsed_time = time.time() - start
    print ("elapsed_time with cython:{} [sec]".format(elapsed_time))
Code Example #7
def pmi_with_threading(input_corpus):
    start = time.time()
    logging.debug(msg='With threading backend')
    scored_matrix_obj = interface.run_feature_selection(
        input_dict=input_corpus,
        method='pmi',
        n_jobs=-1,
        joblib_backend='threading'
    )
    elapsed_time = time.time() - start
    print ("elapsed_time with multiprocess:{} [sec]".format(elapsed_time))
Code Example #8
        def get_bns(text):
            tokens = [w for x in text for w in x]
            # Build a two-class input for BNS: label '1' = tokens followed by '.', label '0' = all others
            input_dict = {}
            input_dict['0'] = [[w] for i, w in enumerate(tokens[:-1])
                               if tokens[i + 1] != '.']
            input_dict['1'] = [[w] for i, w in enumerate(tokens[:-1])
                               if tokens[i + 1] == '.']

            bns_scored_object = interface.run_feature_selection(
                input_dict=input_dict, method=method, n_jobs=4)
            return bns_scored_object.ScoreMatrix2ScoreDictionary()
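To make the label split above concrete, here is a small hypothetical trace (input values are illustrative only):

# text   = [["good", "movie", "."], ["bad", "plot"]]
# tokens = ['good', 'movie', '.', 'bad', 'plot']
# label '0' (next token is not '.'): [['good'], ['.'], ['bad']]
# label '1' (next token is '.'):     [['movie']]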
Code Example #9
    def test_interface_shelve(self):
        """パラメタ条件を組み合わせてテストを実行する 
        - cythonモード使う or not
        - cacheモード使う or not
        - memmapモード使う or not
        """
        shelve_obj = PersistentDict(self.path_shelve_file, 'c', 'json')
        for key, value in self.input_dict.items():
            shelve_obj[key] = value

        sqlite3_dict_obj = SqliteDict(filename=self.path_sqlite3_persistent,
                                      autocommit=True)
        for key, value in self.input_dict.items():
            sqlite3_dict_obj[key] = value

        for method_name in self.method:
            for cython_flag in self.bool_cython:
                for cache_flag in self.is_use_cache:
                    for memmap_flag in self.is_use_memmap:
                        scored_result_persisted = interface.run_feature_selection(
                            input_dict=shelve_obj,
                            method=method_name,
                            use_cython=cython_flag,
                            is_use_cache=cache_flag,
                            is_use_memmap=memmap_flag
                        )  # type: ScoredResultObject
                        self.assertIsInstance(scored_result_persisted,
                                              ScoredResultObject)
                        self.assertIsInstance(
                            scored_result_persisted.
                            ScoreMatrix2ScoreDictionary(), list)

                        scored_result_sqlite3_persisted = interface.run_feature_selection(
                            input_dict=sqlite3_dict_obj,
                            method=method_name,
                            use_cython=cython_flag,
                            is_use_cache=cache_flag
                        )  # type: ScoredResultObject
                        self.assertIsInstance(scored_result_sqlite3_persisted,
                                              ScoredResultObject)
                        self.assertIsInstance(
                            scored_result_sqlite3_persisted.
                            ScoreMatrix2ScoreDictionary(), list)

                        # Check that the result is identical across data sources: shelve, sqlite3, and a plain dict
                        scored_result_dict = interface.run_feature_selection(
                            input_dict=self.input_dict,
                            method=method_name,
                            use_cython=cython_flag,
                            is_use_cache=cache_flag
                        )  # type: ScoredResultObject
                        self.assertIsInstance(scored_result_dict,
                                              ScoredResultObject)
                        self.assertIsInstance(
                            scored_result_dict.ScoreMatrix2ScoreDictionary(),
                            list)

                        numpy.testing.assert_array_equal(
                            scored_result_persisted.scored_matrix.toarray(),
                            scored_result_dict.scored_matrix.toarray())
                        numpy.testing.assert_array_equal(
                            scored_result_sqlite3_persisted.scored_matrix.
                            toarray(),
                            scored_result_dict.scored_matrix.toarray())
Code Example #10
input_dict_tuple_feature = {
    "label_a": [
        [ (("he", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ],
        [ (("you", "N"), ("are", "V")), (("very", "ADV"), ("awesome", "ADJ")), (("guy", "N"),) ],
        [ (("i", "N"), ("am", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ]
    ],
    "label_b": [
        [ (("she", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("girl", "N"),) ],
        [ (("you", "N"), ("are", "V")), (("very", "ADV"), ("awesome", "ADJ")), (("girl", "N"),) ],
        [ (("she", "N"), ("is", "V")), (("very", "ADV"), ("good", "ADJ")), (("guy", "N"),) ]
    ]
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# tf idf
tf_idf_scored_object = interface.run_feature_selection(
    input_dict=input_dict_tuple_feature,
    method='tf_idf',
    n_jobs=5
)
pprint.pprint(tf_idf_scored_object.ScoreMatrix2ScoreDictionary())


# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# pmi
pmi_scored_object = interface.run_feature_selection(
    input_dict=input_dict_tuple_feature,
    method='pmi',
    n_jobs=5
)
pprint.pprint(pmi_scored_object.ScoreMatrix2ScoreDictionary())
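# Note: each record in the returned list appears to carry the fields accessed
# in Code Example #13 below; a hypothetical shape, not verified against the
# library's API:
#   {'feature': <feature>, 'label': <label>, 'frequency': <int>, 'score': <float>}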

Code Example #11
abc_corpus = abc.sents()
genesis_corpus = genesis.sents()
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()

# Case of PersistentDict
persistent_dict_obj = PersistentDict('demo.json', 'c', format='json')
persistent_dict_obj['abc'] = list(abc_corpus)
persistent_dict_obj['genesis'] = list(genesis_corpus)
persistent_dict_obj['web'] = list(web_corpus)
persistent_dict_obj['gutenberg'] = list(gutenberg_corpus)

start = time.time()
# If is_use_cache=True, a cache file is used to hold large intermediate objects during computation
# If is_use_memmap=True, a memmap is used to hold the matrix during computation
scored_matrix_obj = interface.run_feature_selection(
    input_dict=persistent_dict_obj,
    method='pmi',
    use_cython=True,
    is_use_cache=True,
    is_use_memmap=True)
elapsed_time = time.time() - start
print("elapsed_time with cython:{} [sec]".format(elapsed_time))

# Case of SqliteDict
persistent_sqlite3_dict_obj = SqliteDict('./my_db.sqlite', autocommit=True)
persistent_sqlite3_dict_obj['abc'] = list(abc_corpus)
persistent_sqlite3_dict_obj['genesis'] = list(genesis_corpus)
persistent_sqlite3_dict_obj['web'] = list(web_corpus)
persistent_sqlite3_dict_obj['gutenberg'] = list(gutenberg_corpus)

start = time.time()
scored_matrix_obj_ = interface.run_feature_selection(
    input_dict=persistent_sqlite3_dict_obj, method='pmi', use_cython=True)
Code Example #12
input_dict = {
    "label_a": [
        ["I", "aa", "aa", "aa", "aa", "aa"],
        ["bb", "aa", "aa", "aa", "aa", "aa"],
        ["I", "aa", "hero", "some", "ok", "aa"]
    ],
    "label_b": [
        ["bb", "bb", "bb"],
        ["bb", "bb", "bb"],
        ["hero", "ok", "bb"],
        ["hero", "cc", "bb"],
    ],
    "label_c": [
        ["cc", "cc", "cc"],
        ["cc", "cc", "bb"],
        ["xx", "xx", "cc"],
        ["aa", "xx", "cc"],
    ]
}

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# tf idf

tf_idf_scored_object = interface.run_feature_selection(
    input_dict=input_dict,
    method='tf_idf',
    ngram=1,
    n_jobs=5
)

# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
# pmi
pmi_scored_object = interface.run_feature_selection(
    input_dict=input_dict,
    method='pmi',
    ngram=1,
    n_jobs=1,
    use_cython=False
)
pprint.pprint(pmi_scored_object.ScoreMatrix2ScoreDictionary())

# You can also use the cython version of PMI:
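# A sketch of the same call with the Cython implementation enabled,
# mirroring the use_cython=True calls in the other examples in this section:
pmi_scored_object_cython = interface.run_feature_selection(
    input_dict=input_dict,
    method='pmi',
    ngram=1,
    n_jobs=1,
    use_cython=True
)
pprint.pprint(pmi_scored_object_cython.ScoreMatrix2ScoreDictionary())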
Code Example #13
def calculate_PMI_from_labeled_data(bengali_neg, bengali_pos, output_file):

    sentence_tokens_neg = get_top_words_from_labeled_set(bengali_neg)
    sentence_tokens_pos = get_top_words_from_labeled_set(bengali_pos)

    input_dict = {}
    input_dict['neg'] = sentence_tokens_neg
    input_dict['pos'] = sentence_tokens_pos
    '''
    input_dict = {
    "label_a": [
        ["I", "aa", "aa", "aa", "aa", "aa"],
        ["bb", "aa", "aa", "aa", "aa", "aa"],
        ["I", "aa", "hero", "some", "ok", "aa"]
    ],
    "label_b": [
        ["bb", "bb", "bb"],
        ["bb", "bb", "bb"],
        ["hero", "ok", "bb"],
        ["hero", "cc", "bb"],
    ],
    "label_c": [
        ["cc", "cc", "cc"],
        ["cc", "cc", "bb"],
        ["xx", "xx", "cc"],
        ["aa", "xx", "cc"],
    ]
    }
    '''
    from DocumentFeatureSelection import interface
    x = interface.run_feature_selection(
        input_dict, method='pmi',
        use_cython=True).convert_score_matrix2score_record()

    # y = interface.run_feature_selection(input_dict, method='bns', use_cython=True).convert_score_matrix2score_record()
    '''
    from DocumentFeatureSelection.common.data_converter import DataCsrMatrix
    from DocumentFeatureSelection.bns import bns_python3
    from DocumentFeatureSelection.common import data_converter
    
    data_csr_matrix = data_converter.DataConverter().convert_multi_docs2document_frequency_matrix(
            labeled_documents=input_dict,
            n_jobs=5
        )
    assert isinstance(data_csr_matrix, DataCsrMatrix)
    csr_matrix_ = data_csr_matrix.csr_matrix_
    n_docs_distribution = data_csr_matrix.n_docs_distribution

    result_bns = bns_python3.BNS().fit_transform(X=csr_matrix_,
                                    y=None,
                                    unit_distribution=n_docs_distribution,
                                    use_cython=True)

    print(x[:10])
    print(result_bns[:10])
    
    '''
    # Collect the top-5000 scored records into a lookup table keyed by feature_label
    list_of_words = set()
    word_dic = {}
    for i in range(5000):  # 1300 used for labeled
        word = x[i]
        list_of_words.add(word['feature'])
        key = word['feature'] + "_" + word['label']
        word_dic[key] = str(word['frequency']) + "_" + str(round(word['score'], 6))

    print("~~~~~~~~~~~~~~")
    top_words = []
    negative = []
    for word in list_of_words:
        key_1 = word + "_neg"
        key_2 = word + "_pos"
        if key_1 not in word_dic or key_2 not in word_dic:
            continue
        freq_neg = float(word_dic[key_1].split("_")[0])
        freq_pos = float(word_dic[key_2].split("_")[0])

        # Relative difference in frequency between the two classes
        diff = freq_pos - freq_neg
        diff_ratio = abs(diff / (freq_neg + freq_pos))

        if diff_ratio > 0.50:
            top_words.append(word)

    print("Number of word: ", len(top_words))

    #return

    lexicon_from_english = []

    filename = "/Users/russell/Documents/NLP/Paper-4/resources/unique_list.txt"
    with open(filename) as text:
        for line in text:
            lexicon_from_english.append(line.strip())

    lexicon_from_labeled = []
    filename = "/Users/russell/Documents/NLP/Paper-4/resources/lexicon_from_training_set_11807.txt"
    with open(filename) as text:
        for line in text:
            lexicon_from_labeled.append(line.strip())

    print("\n\n--- ")
    count_found = 0
    count_not_found = 0

    unlabeled_lexicon = []
    for word in top_words:
        if word not in lexicon_from_english and word not in lexicon_from_labeled:
            count_not_found += 1
            unlabeled_lexicon.append(word)
        else:
            count_found += 1
    print("Not found, found", count_not_found, count_found)
    write_reviews(output_file, unlabeled_lexicon)
Code Example #14
input_dict = {
    "label_a": [
        ["I", "aa", "aa", "aa", "aa", "aa"],
        ["bb", "aa", "aa", "aa", "aa", "aa"],
        ["I", "aa", "hero", "some", "ok", "aa"]
    ],
    "label_b": [
        ["bb", "bb", "bb"],
        ["bb", "bb", "bb"],
        ["hero", "ok", "bb"],
        ["hero", "cc", "bb"],
    ],
    "label_c": [
        ["cc", "cc", "cc"],
        ["cc", "cc", "bb"],
        ["xx", "xx", "cc"],
        ["aa", "xx", "cc"],
    ]
}

# Build a {file_name: [words]} dict from the preprocessed data
filename2morphs = load_preprocessed_record(path_preprocessed_jsonl)
# Get [(cluster id, source file name)] pairs from the table records
arg_information = [(r[2], json.loads(r[4])) for r in cluster_leaf_table]
# Build a list of (cluster id, [words]) pairs
cluster_word = [(t[0], [word_pos[0] for word_pos in filename2morphs[t[1]['file_name']]]) for t in arg_information]
# Shape the data into the expected input format
input_dict = {c_id: [t[1] for t in g_obj]
              for c_id, g_obj
              in itertools.groupby(sorted(cluster_word, key=lambda t: t[0]), key=lambda t: t[0])}
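# Hypothetical trace of the shaping step above (values are illustrative only):
#   cluster_word = [(1, ['a', 'b']), (0, ['c']), (1, ['d'])]
#   input_dict   = {0: [['c']], 1: [['a', 'b'], ['d']]}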
feature_selection_result = interface.run_feature_selection(input_dict, method='tf_idf', use_cython=True).convert_score_matrix2score_record()
# Write the weighting results to a file
import pandas
df_feature_selection = pandas.DataFrame(feature_selection_result)
df_feature_selection.to_csv('./analysis_data/feature_selection.csv', index_label=False, index=False)
Code Example #15
abc_corpus = abc.sents()
genesis_corpus = genesis.sents()
web_corpus = webtext.sents()
gutenberg_corpus = gutenberg.sents()

# Case of PersistentDict
persistent_dict_obj = PersistentDict('demo.json', 'c', format='json')
persistent_dict_obj['abc'] = list(abc_corpus)
persistent_dict_obj['genesis'] = list(genesis_corpus)
persistent_dict_obj['web'] = list(web_corpus)
persistent_dict_obj['gutenberg'] = list(gutenberg_corpus)

start = time.time()
# If is_use_cache=True, a cache file is used to hold large intermediate objects during computation
# If is_use_memmap=True, a memmap is used to hold the matrix during computation
scored_matrix_obj = interface.run_feature_selection(
    input_dict=persistent_dict_obj,
    method='pmi',
    use_cython=True,
    is_use_cache=True,
    is_use_memmap=True
)
elapsed_time = time.time() - start
print ("elapsed_time with cython:{} [sec]".format(elapsed_time))

# Case of SqliteDict
persistent_sqlite3_dict_obj = SqliteDict('./my_db.sqlite', autocommit=True)
persistent_sqlite3_dict_obj['abc'] = list(abc_corpus)
persistent_sqlite3_dict_obj['genesis'] = list(genesis_corpus)
persistent_sqlite3_dict_obj['web'] = list(web_corpus)
persistent_sqlite3_dict_obj['gutenberg'] = list(gutenberg_corpus)

start = time.time()
scored_matrix_obj_ = interface.run_feature_selection(
    input_dict=persistent_sqlite3_dict_obj, method='pmi', use_cython=True)