def exec_pre_test(test_data_path):
    subfiles = fi.listchildren(test_data_path, children_type='file')
    # file_list = fu.split_multi_format(
    #     [(test_data_path + file) for file in subfiles if file.endswith('.json')], process_num=6)
    # twarr_blocks = fu.multi_process(fi.summary_unzipped_tweets_multi,
    #                                 [(file_list_slice,) for file_list_slice in file_list])
    twarr_blocks = filter_twarr(
        [fu.load_array(file) for file in subfiles if file.endswith('.json')])
    twarr = au.merge_array(twarr_blocks)

    tu.start_ner_service(pool_size=16)
    tu.twarr_ner(twarr)
    tu.end_ner_service()

    all_ids = set(fu.load_array(test_data_path + 'test_ids_all.csv'))
    pos_ids = set(fu.load_array(test_data_path + 'test_ids_pos.csv'))
    non_pos_ids = all_ids.difference(pos_ids)

    pos_twarr = list()
    non_pos_twarr = list()
    for tw in twarr:
        twid = tw[tk.key_id]
        if twid in pos_ids:
            pos_twarr.append(tw)
        elif twid in non_pos_ids:
            non_pos_twarr.append(tw)

    fu.dump_array(getcfg().pos_data_file, pos_twarr)
    fu.dump_array(getcfg().non_pos_data_file, non_pos_twarr)
Example #2
def get_semantic_tokens_multi(file_path):
    pos_type_info = {
        ark.prop_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_prop_dict_file},
        ark.comm_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_comm_dict_file},
        ark.verb_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_verb_dict_file},
        ark.hstg_label: {K_IFD: IdFreqDict(), K_FILE: getcfg().pre_hstg_dict_file},
    }
    total_doc_num = 0
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 40)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_semantic_tokens, [(file_list,) for file_list in file_list_block])
    for res_type_info, doc_num in res_list:
        total_doc_num += doc_num
        for label in res_type_info.keys():
            pos_type_info[label][K_IFD].merge_freq_from(res_type_info[label][K_IFD])
    print('total_doc_num', total_doc_num)
    for label in pos_type_info.keys():
        ifd, file_name = pos_type_info[label][K_IFD], pos_type_info[label][K_FILE]
        ifd.drop_words_by_condition(3)
        if label != ark.hstg_label:
            ifd.drop_words_by_condition(lambda word, _: word.startswith('#'))
        ifd.dump_dict(file_name)
        print('{}; vocab size:{}'.format(file_name, ifd.vocabulary_size()))
Example #3
def main():
    # fi.iterate_file_tree(getconfig().data_path, summary_files_in_path,
    #                      summary_path='/home/nfs/cdong/tw/testdata/cdong/non')
    
    # summarization.get_semantic_tokens_multi(getcfg().origin_path)
    
    summarization.get_tokens_multi(getcfg().origin_path)
Example #4
def main(args):
    input_base = getcfg().origin_path
    output_base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/positive/'
    import utils.timer_utils as tmu
    tmu.check_time()
    parse_query_list(input_base, output_base, seed_queries, n_process=15)
    tmu.check_time()
    return
Example #5
def parse_args():
    parser = argparse.ArgumentParser(description="Seeding information")
    parser.add_argument(
        '--summary_path',
        default=getcfg().origin_path,
        help='Filtered tweets organized by day as files XX_XX_XX_XX.sum under this path.'
    )
    parser.add_argument(
        '--seed_path',
        default=getcfg().seed_path,
        help='Path for seed instances extracted according to a particular query.'
    )

    parser.add_argument('--unlb',
                        action='store_true',
                        default=False,
                        help='Whether the query is performed for unlabeled tweets.')
    parser.add_argument('--cntr',
                        action='store_true',
                        default=False,
                        help='Whether the query is performed for counter tweets.')

    # parser.add_argument('--query', action='store_true', default=False,
    #                     help='If query tweets from summarized tw files.')
    # parser.add_argument('--ner', action='store_true', default=False,
    #                     help='If perform ner on queried file.')
    # parser.add_argument('--train', action='store_true', default=False,
    #                     help='If train the model according to the queried tweets, with internal logic.')
    # parser.add_argument('--temp', action='store_true', default=False,
    #                     help='Just a temp function.')
    # parser.add_argument('--matrix', action='store_true', default=False,
    #                     help='To obtain the matrix for both train and test twarr.')
    #
    # parser.add_argument('--test_data_path', default=getcfg().test_data_path,
    #                     help='Path for test data from dzs.')
    # parser.add_argument('--pre_test', action='store_true', default=False,
    #                     help='Just a temp function to preprocess data from dzs.')
    return parser.parse_args()
Example #6
def get_tokens_multi(file_path):
    file_path = fi.add_sep_if_needed(file_path)
    # subfiles = au.random_array_items(fi.listchildren(file_path, children_type=fi.TYPE_FILE), 20)
    subfiles = fi.listchildren(file_path, children_type=fi.TYPE_FILE)
    file_list_block = mu.split_multi_format([(file_path + subfile) for subfile in subfiles], process_num=20)
    res_list = mu.multi_process(get_tokens, [(file_list,) for file_list in file_list_block])
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for ifd, doc_num in res_list:
        total_doc_num += doc_num
        id_freq_dict.merge_freq_from(ifd)
    print('total_doc_num', total_doc_num, 'total vocabulary_size', id_freq_dict.vocabulary_size())
    id_freq_dict.drop_words_by_condition(3)
    id_freq_dict.dump_dict(getcfg().post_dict_file)
import math
import subprocess
import multiprocessing as mp
from config.configure import getcfg

service_command = getcfg().ner_service_command


class ServiceException(Exception):
    def __init__(self, message):
        Exception.__init__(self)
        self.message = message


class NerServiceProxy:
    def __init__(self):
        self.service = None
    
    def clear_service(self):
        if self.service is not None:
            self.service.terminate()
            del self.service
            self.service = None
    
    def is_service_open(self):
        return self.service is not None and self.service.poll() is None
    
    def open_ner_service(self, classify, pos):
        if self.is_service_open():
            return
        self.service = subprocess.Popen(service_command, shell=True, close_fds=True,
Example #8
    def input_twarr_with_label(twarr, label):
        """cluster using different hyperparams in multiprocess way"""
        alpha_range = beta_range = [i / 100 for i in range(1, 10, 2)] + [i / 10 for i in range(1, 10, 2)]
        iter_num = 100
        process_num = 20
        hyperparams = [(a, b) for a in alpha_range for b in beta_range]
        params = [(None, twarr, *param, iter_num, label)
                  for param in hyperparams]
        res_list = ClusterService.clustering_multi(GSDPMM.GSDPMM_twarr, params,
                                                   process_num)
        param_num = len(hyperparams)
        """group the data by alpha"""
        frame = pd.DataFrame(index=np.arange(0, param_num),
                             columns=['alpha', 'beta'])
        for i in range(param_num):
            frame.loc[i] = hyperparams[i]
        """start plotting figures"""
        for alpha, indices in frame.groupby('alpha').groups.items():
            fig = plt.figure()
            fig.set_figheight(8)
            fig.set_figwidth(8)
            ax1 = fig.add_subplot(211)
            ax2 = fig.add_subplot(212)
            for i in indices:
                beta = frame.loc[i]['beta']
                topic_word_dstrb, tw_cluster_pred, iter_x, nmi_y, k_y = res_list[i]
                ax1.plot(iter_x,
                         nmi_y,
                         '-',
                         lw=1.5,
                         label='beta=' + str(round(beta, 2)))
                ax2.plot(iter_x,
                         k_y,
                         '^',
                         lw=1.5,
                         label='beta=' + str(round(beta, 2)))
            title = 'alpha=' + str(round(alpha, 2))
            ax1.set_title(title)
            ax1.set_ylabel('NMI')
            ax1.set_ylim(0.25, 0.75)
            ax1.legend(loc='lower left')
            ax1.text(iter_num * 0.6,
                     0.70,
                     'final nmi: ' +
                     str(round(max([res_list[i][3][-1] for i in indices]), 4)),
                     fontsize=15)
            ax2.set_xlabel('iteration')
            ax2.set_ylabel('K num')
            ax2.legend(loc='lower left')
            plt.grid(True, linestyle='-', color='#333333', lw=0.8)
            plt.savefig(getcfg().dc_test + 'GSDPMM/GSDPMM_alpha=' + title +
                        '.png')

        # top_K = 20
        # alpha_idx = 0
        # beta_idx = 1
        # tw_cluster_pred_idx = 3
        # nmi_idx = 5
        # table_idx = 7
        # recall_idx = 8
        #
        # event_cluster_label = [i for i in range(12)]
        # summary_list = [hyperparams[i] + res_list[i] +
        #                 ClusterService.event_table_recall(label, res_list[i][1], event_cluster_label)
        #                 for i in range(param_num)]
        # top_recall_summary_list = [summary_list[i] for i in
        #                            np.argsort([summary[recall_idx] for summary in summary_list])[::-1][
        #                            :top_K]]
        # top_nmi_summary_list = [summary_list[i] for i in
        #                         np.argsort([summary[nmi_idx][-1] for summary in summary_list])[::-1][:top_K]]
        #
        # top_nmi_path = getcfg().dc_test + 'GSDPMM/max_nmis/'
        # top_recall_path = getcfg().dc_test + 'GSDPMM/max_recalls/'
        # fi.rmtree(top_nmi_path)
        # fi.rmtree(top_recall_path)
        #
        # def dump_cluster_info(summary_list, base_path):
        #     for rank, summary in enumerate(summary_list):
        #         res_dir = base_path + '{}_recall_{}_nmi_{}_alpha_{}_beta_{}/'. \
        #             format(rank, round(summary[recall_idx], 6), round(summary[nmi_idx][-1], 6),
        #                    summary[alpha_idx], summary[beta_idx])
        #         fi.makedirs(res_dir)
        #         tw_topic_arr = ClusterService.create_clusters_with_labels(twarr, summary[tw_cluster_pred_idx])
        #         for i, _twarr in enumerate(tw_topic_arr):
        #             if not len(_twarr) == 0:
        #                 fu.dump_array(res_dir + str(i) + '.txt', [tw[tk.key_text] for tw in _twarr])
        #         table = summary[table_idx]
        #         table.to_csv(res_dir + 'table.csv')
        #
        # dump_cluster_info(top_recall_summary_list, top_recall_path)
        # dump_cluster_info(top_nmi_summary_list, top_nmi_path)
        return None, None
import os
import shutil
from subprocess import Popen, PIPE

from config.configure import getcfg
import utils.function_utils as fu
import utils.array_utils as au
import utils.file_iterator as fi
import utils.multiprocess_utils as mu
import utils.tweet_keys as tk
import utils.timer_utils as tmu

autophrase_base = getcfg().autophrase_path
autophrase_output_base = fi.join(
    autophrase_base, "OUTPUTS/")  # make sure any AutoPhrase output stays somewhere under output_base
command = fi.join(autophrase_base, "auto_phrase.sh")
fi.mkdir(autophrase_output_base)


def autophrase(input_text_file, output_path, commander, process_base, min_sup):
    p = Popen(commander,
              shell=True,
              bufsize=1,
              stdin=PIPE,
              stdout=PIPE,
              stderr=PIPE,
              cwd=process_base)
    p.stdin.flush()
    p.stdin.write((input_text_file + '\n').encode("utf8"))
    p.stdin.write((output_path + '\n').encode("utf8"))
    p.stdin.write((min_sup + '\n').encode("utf8"))
Example #10
 def input_twarr_with_label(self, twarr, label):
     # def GSDMM_twarr(self, alpha, etap, etac, etav, etah, K, iter_num, ref_labels=None)
     # self.GSDMM_twarr(0.01, 0.01, 0.01, 0.01, 0.01, 5, 30)
     base_path = getcfg().dc_test + 'SEMANTIC/'
     a_range = etap_range = etac_range = etav_range = etah_range = [
         0.01, 0.05, 0.1
     ]
     K_range = [30, 40]
     iter_num = 50
     """cluster using different hyperparams in multiprocess way"""
     process_num = 19
     hyperparams = [(a, ep, ec, ev, eh, k) for a in a_range
                    for ep in etap_range for ec in etac_range
                    for ev in etav_range for eh in etah_range
                    for k in K_range]
     param_num = len(hyperparams)
     res_list = cs.clustering_multi(SemanticClusterer.GSDMM_twarr,
                                    [(self, *param, iter_num, label)
                                     for param in hyperparams], process_num)
     column_name = ['alpha', 'etap', 'etac', 'etav', 'etah', 'K']
     # """start plotting figures"""
     # frame = pd.DataFrame(index=np.arange(0, param_num), columns=column_name, data=hyperparams)
     # for (alpha, K), indices in frame.groupby(['alpha', 'K']).groups.items():
     #     fig = plt.figure()
     #     fig.set_figheight(8)
     #     fig.set_figwidth(8)
     #     for i in indices:
     #         clu_word_distrb, tw_cluster_pred, iter_x, nmi_y = res_list[i]
     #         legend_params = ('etap', 'etac', 'etav', 'etah')
     #         plt_label = ','.join([p_name + str(frame.loc[i][p_name]) for p_name in legend_params])
     #         plt.plot(iter_x, nmi_y, '-', lw=1.5, label=plt_label)
     #     title = 'alpha=' + str(alpha) + ',K=' + str(K)
     #     plt.title(title)
     #     plt.ylabel('NMI')
     #     plt.ylim(0.25, 0.75)
     #     plt.legend(loc='lower left')
     #     plt.text(iter_num * 0.6, 0.70,
     #              'final nmi: ' + str(round(max([res_list[i][3][-1] for i in indices]), 4)), fontsize=15)
     #     plt.grid(True, '-', color='#333333', lw=0.8)
     #     plt.savefig(base_path + 'SEMANTIC' + title + '.png')
     """start dumping cluster information"""
     # def concat_param_name_values(param_names, param_values):
     #     if not len(param_names) == len(param_values):
     #         raise ValueError('inconsistent param number')
     #     return '_'.join(['{}_{:<3}'.format(param_names[i], param_values[i]) for i in range(len(param_names))])
     #
     # top_rank = 30
     # true_cluster = [i for i in range(12)]
     # tbl_recall_list = [ClusterService.event_table_recall(label, res_list[i][1], true_cluster) for i in range(param_num)]
     # top_recall_idx = pd.DataFrame(data=[(i, tbl_recall_list[i][1], res_list[i][3][-1]) for i in range(param_num)])\
     #     .sort_values(by=[1, 2], ascending=False).loc[:, 0][:top_rank]
     # top_nmi_idx = np.argsort([res_list[i][3][-1] for i in range(param_num)])[-1:-top_rank-1:-1]
     #
     # def dump_cluster_info(top_idx_list_, base_path_):
     #     for rank, idx in enumerate(top_idx_list_):
     #         res_dir = '{}{}_recall_{:0<6}_nmi_{:0<6}_{}/'.\
     #             format(base_path_, rank, round(tbl_recall_list[idx][1], 4), round(res_list[idx][3][-1], 4),
     #                    concat_param_name_values(column_name, hyperparams[idx]))
     #         fi.makedirs(res_dir)
     #         tw_topic_arr = ClusterService.create_clusters_with_labels(twarr, res_list[idx][1])
     #         for i, _twarr in enumerate(tw_topic_arr):
     #             if not len(_twarr) == 0:
     #                 fu.dump_array(res_dir + str(i) + '.txt', [tw[tk.key_text] for tw in _twarr])
     #         cluster_table = tbl_recall_list[idx][0]
     #         cluster_table.to_csv(res_dir + 'table.csv')
     #
     # top_recall_path = base_path + 'max_recalls/'
     # fi.rmtree(top_recall_path)
     # dump_cluster_info(top_recall_idx, top_recall_path)
     # top_nmi_path = base_path + 'max_nmis/'
     # fi.rmtree(top_nmi_path)
     # dump_cluster_info(top_nmi_idx, top_nmi_path)
     return 0, 0
Example #11
 def input_twarr_with_label(twarr, label):
     # alpha_range = beta_range = [i/100 for i in range(1, 10, 3)] + [i/10 for i in range(1, 10, 3)] + \
     #                            [i for i in range(1, 10, 3)]
     # K_range = [30, 40, 50]
     alpha_range = beta_range = [i / 100 for i in range(1, 10, 4)] + [i / 10 for i in range(1, 10, 4)]
     K_range = [30, 40, 50]
     """cluster using different hyperparams in multiprocess way"""
     iter_num = 100
     process_num = 20
     hyperparams = [(a, b, K) for a in alpha_range for b in beta_range
                    for K in K_range]
     res_list = list()
     for i in range(int(math.ceil(len(hyperparams) / process_num))):
         param_list = [
             (twarr, *param, iter_num)
             for param in hyperparams[i * process_num:(i + 1) * process_num]
         ]
         res_list += utils.multiprocess_utils.multi_process(
             GSDMM.GSDMM_twarr, param_list)
         print('{:<4} /'.format((i + 1) * process_num), len(hyperparams),
               'params processed')
     """group the data by K"""
     frame = pd.DataFrame(index=np.arange(0, len(hyperparams)),
                          columns=['alpha', 'beta', 'K'])
     for i in range(len(hyperparams)):
         frame.loc[i] = hyperparams[i]
     print('\n', frame, '\n')
     """start plotting figures"""
     for (alpha, K), indices in frame.groupby(['alpha',
                                               'K']).groups.items():
         fig = plt.figure()
         fig.set_figheight(8)
         fig.set_figwidth(8)
         all_nmi = list()
         for i in indices:
             beta = frame.loc[i]['beta']
             tw_cluster_pred_iter = res_list[i]
             iter_x = range(len(tw_cluster_pred_iter))
             nmi_y = [
                 au.score(label, pred, 'nmi')
                 for pred in tw_cluster_pred_iter
             ]
             all_nmi.append(nmi_y)
             plt.plot(iter_x,
                      nmi_y,
                      '-',
                      lw=1.5,
                      label='beta={}'.format(round(beta, 2)))
         plt.xlabel('iteration')
         plt.ylabel('NMI')
         plt.ylim(0.0, 0.75)
         plt.title('K=' + str(K))
         plt.legend(loc='lower right')
         plt.grid(True, linestyle='-', color='#333333', lw=0.8)
         plt.text(iter_num - 40,
                  0.70,
                  'final nmi: ' +
                  str(round(max([nmi[-1] for nmi in all_nmi]), 6)),
                  fontsize=14,
                  verticalalignment='bottom',
                  horizontalalignment='left')
         plt.savefig(getcfg().dc_test + 'GSDMM/' +
                     'alpha={},K={}.png'.format(round(alpha, 2), K))
Example #12
class ExtractSubProcess(DaemonProcess):
    def start(self, func):
        self.process = mp.Process(target=func, args=(self.inq, self.outq))
        self.process.daemon = False
        self.process.start()
        # print('ExtractSubProcess', self.process.pid)


extract_sub_process = ExtractSubProcess()

END_PROCESS = -1
SET_PARAMS = 0
INPUT_LIST = 1
OUTPUT_LIST = 3
OUT_BASE = getcfg().output_path
fi.mkdir(OUT_BASE, remove_previous=True)


def extract_sub_main(inq, outq):
    """
    聚类信息提取模块子进程的主函数,该进程还有下属的若干子进程用于执行实际操作,
    负责读取主进程输入,调用子进程组进行一次地点提取,根据返回结果,对多个聚类依地点进行合并操作,
    并再次调用子进程组对合并结果进行完整处理,以该结果为最终结果输出到文件
    :param inq: mp.Queue,主进程向子进程的输入队列
    :param outq: mp.Queue,子进程向主进程的输出队列
    :return:
    """
    pool_size, event_type = inq.get()
    extract_pool = CustomDaemonPool()
    extract_pool.start(extract_pool_main, pool_size)
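
The docstring above describes a two-queue handshake: the subprocess first reads a (pool_size, event_type) tuple from its input queue and later reports results on the output queue. As a self-contained illustration of that pattern only (not the project's DaemonProcess/CustomDaemonPool classes, and with invented values), a minimal sketch:

import multiprocessing as mp


def child_main(inq, outq):
    # mirror the first step of extract_sub_main: the opening message carries the configuration
    pool_size, event_type = inq.get()
    outq.put('ready: pool_size={}, event_type={}'.format(pool_size, event_type))


if __name__ == '__main__':
    inq, outq = mp.Queue(), mp.Queue()
    proc = mp.Process(target=child_main, args=(inq, outq))
    proc.start()
    inq.put((16, 'example_event'))  # hypothetical pool size and event type
    print(outq.get())
    proc.join()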
Example #13
        if IfdGetter.K_IFD_FILE in kwargs:
            self.ifd_file = kwargs.get(IfdGetter.K_IFD_FILE)
        if self.ifd_file is None:
            raise ValueError('An id freq dict should be specified.')
        if self.ifd is None:
            self.ifd = IdFreqDict()
            self.ifd.load_dict(self.ifd_file)
        return self.ifd

    def reload(self, ifd_file):
        if self.ifd is not None:
            self.ifd.load_dict(ifd_file)


# pre_dict_file = getcfg().pre_dict_file
post_dict_file = getcfg().post_dict_file
token_dict = IfdGetter(post_dict_file)

# pre_list = [getcfg().pre_prop_file, getcfg().pre_comm_file, getcfg().pre_verb_file, getcfg().pre_hstg_file]
# post_list = [getcfg().post_prop_file, getcfg().post_comm_file, getcfg().post_verb_file, getcfg().post_hstg_file]
# prop_dict, comm_dict, verb_dict, hstg_dict = [IfdGetter(post_file) for post_file in post_list]

if __name__ == '__main__':
    import utils.pattern_utils as pu

    def word_remove(word, freq):
        if pu.search_pattern(r'!?<>.,&\'`\^*', word) is not None or freq < 10:
            return True
        return False

    pre2post = dict(zip(pre_list, post_list))
Example #14
from config.configure import getcfg


# TODO for now file names are identified by absolute paths; I have written them as absolute paths in the config file
afinn_file = getcfg().afinn_file


ark_marks = {'N': 0, 'O': 1, '^': 2, 'S': 3, 'Z': 4, 'V': 5, 'A': 6, 'R': 7, '!': 8, 'D': 9, 'P': 10,
             '&': 11, 'T': 12, 'X': 13, '$': 14, ',': 15, 'G': 16, 'L': 17, 'M': 18, 'Y': 19}
affinn_dict = {}


with open(afinn_file, 'r') as fp:
    for line in fp.readlines():
        l1 = line.split('\t')
        affinn_dict[l1[0].strip()] = int(l1[1].strip())


def count_ark_mark(ark_list):
    mark_list = [0] * 20
    for pos_mark in ark_list:
        if pos_mark[1] in ark_marks:
            mark_list[ark_marks[pos_mark[1]]] += 1
    return mark_list


def count_sentiment(text):
    words = text.split()
    sentiment = 0
    for word in words:
        if word in affinn_dict:
            sentiment += affinn_dict[word]  # reconstructed: the original snippet is cut here; accumulate the AFINN score
    return sentiment
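
A hedged usage sketch for the two helpers above; the tagger triples and the text are invented, and count_sentiment is assumed to sum AFINN scores as reconstructed above.

if __name__ == '__main__':
    # hypothetical ARK tagger output: (token, tag, confidence) triples
    sample_ark_list = [('police', 'N', 0.99), ('attacked', 'V', 0.97), ('!', ',', 0.90)]
    print(count_ark_mark(sample_ark_list))            # 20-slot vector with counts at the 'N', 'V' and ',' indices
    print(count_sentiment('what a terrible attack'))  # sums the AFINN scores of the words found in affinn_dict
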
import os
import sys
import pickle
import traceback

from config.configure import getcfg
from preprocess.filter.filter_utils import get_all_substrings
from preprocess.filter.pos_tag_process import count_sentiment

import utils.array_utils as au
import utils.function_utils as fu
import utils.file_iterator as fi
import utils.pattern_utils as pu
import utils.tweet_keys as tk
import utils.timer_utils as tmu


sys.path.append(os.path.abspath(os.path.dirname(__file__)))

chat_filter_file = getcfg().chat_filter_file
is_noise_dict_file = getcfg().is_noise_dict_file
clf_model_file = getcfg().clf_model_file
black_list_file = getcfg().black_list_file


class UseGSDMM:
    def __init__(self):
        try:
            with open(chat_filter_file, 'rb') as f:
                self.c = pickle.load(f)
            with open(is_noise_dict_file, 'rb') as f:
                self.is_noise_dict = set(pickle.load(f))
        except:
            print('load error')
            traceback.print_exc()
Example #16
"""
results will contain a list of lists (one per tweet) of triples; each triple represents (term, type, confidence)
"""
import shlex
import subprocess

import utils.pattern_utils as pu
import utils.tweet_keys as tk
from config.configure import getcfg

# The only relevant source I've found is here:
# http://m1ked.com/post/12304626776/pos-tagger-for-twitter-successfully-implemented-in
# which is a very simple implementation; my implementation is a bit more useful (but not much).

# NOTE this command is directly lifted from runTagger.sh
# RUN_TAGGER_CMD = "java -XX:ParallelGCThreads=2 -Xmx500m -jar {}".format(getconfig().ark_service_command)
RUN_TAGGER_CMD = getcfg().ark_service_command


def _split_results(wordtags):
    """ Parse the tab-delimited returned lines, modified from:
        https://github.com/brendano/ark-tweet-nlp/blob/master/scripts/show.py
        :param wordtags: a set of words and their tags (strings) in a tweet. """
    word_tag_arr = list()
    for wordtag in wordtags:
        wordtag = wordtag.strip()  # remove '\n'
        if len(wordtag) > 0:
            parts = wordtag.split('\t')
            tokens, tags, confidence = parts[0], parts[1], float(parts[2])
            word_tag_arr.append((tokens, tags, confidence))
    return word_tag_arr  # return reconstructed; the snippet is cut before the original end of the function
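
A hedged sketch of the input _split_results consumes, using invented tagger lines in the tab-delimited shape the docstring mentions (token, tag, confidence per line); the return value relies on the reconstruction above.

if __name__ == '__main__':
    # invented tagger output, one token per tab-delimited line
    sample_lines = ['Explosion\tN\t0.9983\n', 'reported\tV\t0.9871\n', '\n']
    print(_split_results(sample_lines))
    # expected: [('Explosion', 'N', 0.9983), ('reported', 'V', 0.9871)]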
Example #17
import numpy as np
import utils.array_utils as au
import utils.file_iterator as fi
import utils.function_utils as fu
import classifying.fast_text_utils as ftu
import utils.multiprocess_utils as mu
import utils.pattern_utils as pu
import utils.tweet_keys as tk
import utils.timer_utils as tmu
from config.configure import getcfg

from classifying.terror.classifier_terror import \
    ClassifierTerror, file2label_text_array, text2label_text_array

value_t, value_f = ftu.value_t, ftu.value_f
nd_ft_model_file = getcfg().nd_ft_model_file
nd_clf_model_file = getcfg().nd_lr_model_file


class ClassifierNaturalDisaster(ClassifierTerror):
    def __init__(self,
                 ft_model_file=nd_ft_model_file,
                 clf_model_file=nd_clf_model_file):
        ClassifierTerror.__init__(self, ft_model_file, clf_model_file)
        self.ft_model = self.clf_model = None
        if ft_model_file:
            self.load_ft_model(ft_model_file)
        if clf_model_file:
            self.load_clf_model(clf_model_file)

    def textarr2featurearr(self, textarr):
# from sklearn.neural_network import MLPClassifier
# from sklearn.ensemble import GradientBoostingClassifier

from config.configure import getcfg
import classifying.fast_text_utils as ftu
import utils.array_utils as au
import utils.file_iterator as fi
import utils.function_utils as fu
import utils.multiprocess_utils as mu
import utils.pattern_utils as pu
import utils.spacy_utils as su
import utils.tweet_keys as tk
import utils.timer_utils as tmu

value_t, value_f = ftu.value_t, ftu.value_f
terror_ft_model_file = getcfg().terror_ft_model_file
terror_clf_model_file = getcfg().terror_lr_model_file


class ClassifierTerror:
    nlp = None
    sensitive_words = {
        'shooting', 'wounded', 'shots', 'attack', 'shooter', 'wounds', 'dead',
        'terrorist', 'hurt', 'terror', 'police', 'killed', 'gunman', 'weapon',
        'injured', 'attacked', 'bomb', 'bombed', 'attacker'
    }

    # @staticmethod
    # def get_nlp():
    #     if ClassifierTerror.nlp is None:
    #         ClassifierTerror.nlp = su.get_nlp_disable_for_ner()
Example #19
from os import listdir
from os.path import isfile, join

import pickle
import traceback
import pandas as pd

from config.configure import getcfg
from preprocess.filter.ChatFilter import ChatFilter
from preprocess.filter.filter_utils import readFilesAsJsonList
import utils.pattern_utils as pu
import utils.tweet_keys as tk

# TODO for now file names are identified by absolute paths; I have written them as absolute paths in the config file
class_dist_file = getcfg().class_dist_file
chat_filter_file = getcfg().chat_filter_file
is_noise_dict_file = getcfg().is_noise_dict_file
orgn_predict_label_file = getcfg().orgn_predict_label_file


class UseGSDMM:
    def __init__(self, trainning=None):
        self.c = ChatFilter()
        self.orgn_predict_label = None
        self.class_dist = None
        self.is_noise_dict = None

        if trainning is None:
            try:
                with open(chat_filter_file, 'rb') as f:
                    self.c = pickle.load(f)
Example #20
import numpy as np
import utils.file_iterator as fi
import classifying.fast_text_utils as ftu
import utils.timer_utils as tmu
from config.configure import getcfg

from classifying.natural_disaster.classifier_nd import \
    ClassifierNaturalDisaster, text2label_text_array, recover_train_matrix, generate_train_matrices

value_t, value_f = ftu.value_t, ftu.value_f
k_ft_model_file = getcfg().k_ft_model_file
k_clf_model_file = getcfg().k_lr_model_file


class ClassifierK(ClassifierNaturalDisaster):
    def __init__(self,
                 ft_model_file=k_ft_model_file,
                 clf_model_file=k_clf_model_file):
        ClassifierNaturalDisaster.__init__(self, ft_model_file, clf_model_file)


def _generate_matrices(ft_model_file, lbl_txt_arr, mtx_file, lbl_file):
    print(len(lbl_txt_arr), mtx_file, lbl_file)
    textarr, labelarr = text2label_text_array(lbl_txt_arr)
    clf = ClassifierK(ft_model_file, None)
    featurearr = clf.textarr2featurearr(textarr)
    np.save(mtx_file, featurearr)
    np.save(lbl_file, labelarr)


if __name__ == "__main__":
Example #21
from config.configure import getcfg
import classifying.fast_text_utils as ftu
from collections import Counter
import utils.array_utils as au


label2value = ftu.binary_label2value
model_file = getcfg().ft_add_model_file


def predict(target, threshold=0.5):
    """ returns value/value array given input text/text array; value(s) are dependent on the threshold """
    model = ftu.get_model(model_file)
    pred_value_arr, score_arr = ftu.binary_predict(target, model, threshold)
    return pred_value_arr, score_arr
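

# Per the docstring, predict maps a text or text array to prediction value(s) and scores gated by the
# threshold. A hedged usage sketch with invented tweets follows; the exact shapes of the returned arrays
# depend on ftu.binary_predict, which is not shown here.
if __name__ == '__main__':
    texts = ['explosion reported near the station', 'having a great brunch today']  # invented examples
    values, scores = predict(texts, threshold=0.6)
    for text, value, score in zip(texts, values, scores):
        print(value, score, text)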


def train(train_file, model_file):
    model = ftu.FastText()
    model.train_supervised(input=train_file, epoch=50, lr=2, wordNgrams=2, verbose=2, minCount=10)
    ftu.save_model(model_file, model)
    return model


def test(test_file, model_file):
    textarr, labelarr = list(), list()
    with open(test_file) as testfp:
        lines = testfp.readlines()
    for line in lines:
        label, text = line.strip().split(' ', 1)
        textarr.append(text)
from sklearn import metrics
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split, GridSearchCV

from preprocess.filter.filter_utils import readFilesAsJsonList, get_all_substrings
from preprocess.filter.pos_tag_process import count_ark_mark, count_sentiment
from preprocess.filter.pre_process import filterArray
from preprocess.filter.use_GSDMM import UseGSDMM
from utils.ark_service_proxy import twarr_ark
import utils.tweet_keys as tk
import utils.pattern_utils as pu
from config.configure import getcfg


# TODO for now file names are identified by absolute paths; I have written them as absolute paths in the config file
clf_model_file = getcfg().clf_model_file
black_list_file = getcfg().black_list_file


class EffectCheck:
    def __init__(self, T_dir=None, F_dir=None):
        if T_dir is not None or F_dir is not None:
            self.T_corpus = readFilesAsJsonList(T_dir)
            self.F_corpus = readFilesAsJsonList(F_dir)
        self.gsdmm = None
        with open(black_list_file, 'r') as fp:
            self.spam_words = set([line.strip() for line in fp.readlines()])

    def run_a_function_list(self, function_list, print_pos_matchcase=False):
        for function in function_list:
            T_filtered = 0