Code Example #1
def make_text_files():
    for idx, file in enumerate(neg_2012_full_files):
        twarr = fu.load_array(file)
        txtarr = list()
        for tw in twarr:
            text = pu.text_normalization(tw[tk.key_text])
            if pu.is_empty_string(text) or len(text) < 20:
                continue
            txtarr.append(text)
        print('len delta', len(twarr) - len(txtarr))
        path = Path(file)
        out_file_name = '_'.join([path.parent.name,
                                  path.name]).replace('json', 'txt')
        out_file = ft_data_pattern.format(out_file_name)
        print(out_file)
        fu.write_lines(out_file, txtarr)
    return
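    # NOTE: early return; the code below this point is unreachable as written.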
    p_twarr_blocks = map(fu.load_array, pos_files)
    p_txtarr_blocks = map(twarr2textarr, p_twarr_blocks)
    p_txtarr = au.merge_array(list(p_txtarr_blocks))
    p_out_file = ft_data_pattern.format('pos_2016.txt')
    fu.write_lines(p_out_file, p_txtarr)

    for f in neg_files:
        in_file = neg_event_pattern.format(f)
        out_file = ft_data_pattern.format(f.replace("json", "txt"))
        twarr = fu.load_array(in_file)
        txtarr = twarr2textarr(twarr)
        print(len(twarr), '->', len(txtarr), len(twarr) - len(txtarr))
        fu.write_lines(out_file, txtarr)
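
A note on context: every snippet on this page relies on small helpers from fu (function_utils) whose implementations are not shown here. Purely for orientation, the following is a minimal sketch of what the recurring I/O helpers load_array, dump_array and write_lines might look like, assuming they wrap line-oriented JSON and plain-text files; this is inferred from the call sites, not taken from the actual projects.

import json

def load_array(file):
    # Assumed behaviour: one JSON value per line, collected into a list.
    with open(file, 'r', encoding='utf-8') as fp:
        return [json.loads(line) for line in fp if line.strip()]

def dump_array(file, array):
    # Assumed counterpart: write each element as one JSON line.
    with open(file, 'w', encoding='utf-8') as fp:
        for item in array:
            fp.write(json.dumps(item) + '\n')

def write_lines(file, lines):
    # Assumed plain-text variant: one string per line.
    with open(file, 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(lines) + '\n')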
Code Example #2
def exec_pre_test(test_data_path):
    subfiles = fi.listchildren(test_data_path, children_type='file')
    # file_list = fu.split_multi_format(
    #     [(test_data_path + file) for file in subfiles if file.endswith('.json')], process_num=6)
    # twarr_blocks = fu.multi_process(fi.summary_unzipped_tweets_multi,
    #                                 [(file_list_slice,) for file_list_slice in file_list])
    twarr_blocks = filter_twarr(
        [fu.load_array(file) for file in subfiles if file.endswith('.json')])
    twarr = au.merge_array(twarr_blocks)

    tu.start_ner_service(pool_size=16)
    tu.twarr_ner(twarr)
    tu.end_ner_service()

    all_ids = set(fu.load_array(test_data_path + 'test_ids_all.csv'))
    pos_ids = set(fu.load_array(test_data_path + 'test_ids_pos.csv'))
    non_pos_ids = all_ids.difference(pos_ids)

    pos_twarr = list()
    non_pos_twarr = list()
    for tw in twarr:
        twid = tw[tk.key_id]
        if twid in pos_ids:
            pos_twarr.append(tw)
        elif twid in non_pos_ids:
            non_pos_twarr.append(tw)

    fu.dump_array(getcfg().pos_data_file, pos_twarr)
    fu.dump_array(getcfg().non_pos_data_file, non_pos_twarr)
Code Example #3
File: data_utils.py Project: leeyanghaha/authorship
 def __init__(self, domain, product_num):
     self.domain = domain
     self.seed_product = 'B003EYVXV4'
     self.datahelper = DataHelper()
     self.userhelper = UserHelper()
     self.product2user = fu.load_array(
         os.path.join(ku.index_root, domain, 'product2user.json'))[0]
     self.user2product = fu.load_array(
         os.path.join(ku.index_root, domain, 'user2product.json'))[0]
     self.all_products = fu.listchildren(os.path.join(
         ku.product_root, domain),
                                         concat=False)
     self.product_num = product_num
Code Example #4
    def order_twarr_through_time(self):
        print("data source : normal")
        event_blocks = fu.load_array("./data/events2016.txt")
        false_event_twarr = fu.load_array("./data/false_pos_events.txt")
        event_blocks.append(false_event_twarr)
        for block_idx, block in enumerate(event_blocks):
            for tw in block:
                tw[tk.key_event_label] = block_idx
        twarr = au.merge_array(event_blocks)
        tflt.filter_twarr_dup_id(twarr)

        def random_idx_for_item(item_arr, dest_item):
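            # Interleaves the indices of items belonging to dest_item (here the
            # false-event label) at random positions among the remaining indices,
            # preserving the relative order within each of the two groups.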
            from numpy import random

            def sample(prob):
                return random.rand() < prob

            non_dest_item_idx = [
                idx for idx in range(len(item_arr))
                if item_arr[idx] not in dest_item
            ]
            dest_item_idx = [
                idx for idx in range(len(item_arr))
                if item_arr[idx] in dest_item
            ]
            non_dest_cnt = dest_cnt = 0
            res = list()
            while len(non_dest_item_idx) > non_dest_cnt and len(
                    dest_item_idx) > dest_cnt:
                if sample((len(dest_item_idx) - dest_cnt) /
                          (len(dest_item_idx) - dest_cnt +
                           len(non_dest_item_idx) - non_dest_cnt)):
                    res.append(dest_item_idx[dest_cnt])
                    dest_cnt += 1
                else:
                    res.append(non_dest_item_idx[non_dest_cnt])
                    non_dest_cnt += 1
            while len(non_dest_item_idx) > non_dest_cnt:
                res.append(non_dest_item_idx[non_dest_cnt])
                non_dest_cnt += 1
            while len(dest_item_idx) > dest_cnt:
                res.append(dest_item_idx[dest_cnt])
                dest_cnt += 1
            return res

        idx_time_order = tu.rearrange_idx_by_time(twarr)
        twarr = [twarr[idx] for idx in idx_time_order]
        lbarr = self.lbarr_of_twarr(twarr)
        idx_random_item = random_idx_for_item(lbarr, {max(lbarr)})
        twarr = [twarr[idx] for idx in idx_random_item]
        return twarr
Code Example #5
 def load_tw_batches(self, load_cluid_arr):
     tw_batches = fu.load_array(self.labelled_batch_file)
     tu.twarr_nlp(au.merge_array(tw_batches))
     print("twarr nlp over")
     if load_cluid_arr:
         cluid_batches = fu.load_array(self.cluid_batch_file)
         assert len(tw_batches) == len(cluid_batches)
         for b_idx in range(len(tw_batches)):
             tw_batch, cluid_batch = tw_batches[b_idx], cluid_batches[b_idx]
             assert len(tw_batch) == len(cluid_batch)
             for idx in range(len(tw_batch)):
                 tw, cluid = tw_batch[idx], cluid_batch[idx]
                 tw[tk.key_event_cluid] = cluid
     return tw_batches
Code Example #6
def test1():
    import utils.tweet_keys as tk
    import utils.array_utils as au
    import utils.pattern_utils as pu
    import utils.timer_utils as tmu
    import calling.back_extractor as bext
    import utils.file_iterator as fi
    import utils.function_utils as fu
    from extracting.cluster_infomation import merge_cic_list2cluid_twarr_list

    # C_NAME = BackCluster.G_CLASS.__name__
    # _base = '/home/nfs/cdong/tw/src/calling/tmp_{}'.format(C_NAME)
    # fi.mkdir(_base, remove_previous=True)
    # _cluid_cluster_list_file = '_cluid_cluster_list_{}'.format(C_NAME)

    _twarr = fu.load_array("./filtered_twarr.json")[:5000]
    _batches = au.array_partition(_twarr, [1] * 43, random=False)

    _max_window_size, _full_interval = 4, 2
    _alpha, _beta = 30, 0.01
    start_pool(_max_window_size, _full_interval, _alpha, _beta)
    for idx, twarr in enumerate(_batches):
        input_twarr(twarr)
        if idx > 0 and idx % 2 == 0:
            execute_cluster(10)
            ctl = wait_get_cluid_twarr_list()
            print('len(ctl)={}'.format(len(ctl)) if ctl else 'ctl is None')

    end_pool()
Code Example #7
def get_semantic_tokens(file_list):
    pos_type_info = {
        ark.prop_label: {
            K_IFD: IdFreqDict()
        },
        ark.comm_label: {
            K_IFD: IdFreqDict()
        },
        ark.verb_label: {
            K_IFD: IdFreqDict()
        },
        ark.hstg_label: {
            K_IFD: IdFreqDict()
        },
    }
    total_doc_num = 0
    for file in file_list:
        twarr = ark.twarr_ark(fu.load_array(file))
        total_doc_num += len(twarr)
        pos_tokens = au.merge_array([tw[tk.key_ark] for tw in twarr])
        for pos_token in pos_tokens:
            word = pos_token[0].strip().lower()
            if len(word) <= 2 or not pu.is_valid_keyword(word):
                continue
            real_label = ark.pos_token2semantic_label(pos_token)
            if real_label:
                pos_type_info[real_label][K_IFD].count_word(word)
    return pos_type_info, total_doc_num
Code Example #8
File: main.py Project: locta66/TweetEventDetection
def main():
    bflt.start_pool(flt_pool_size, ne_threshold, clf_threshold)
    bclu.start_pool(hold_batch_num, batch_size, alpha, beta)
    # bext.start_pool(ext_pool_size)

    sub_files = fi.listchildren("/home/nfs/cdong/tw/origin/",
                                fi.TYPE_FILE,
                                concat=True)[-4000:]
    for _idx, _file in enumerate(sub_files):
        _twarr = fu.load_array(_file)
        print("1-- {} th twarr to filter, len: {}".format(_idx, len(_twarr)))
        twarr2filter(_twarr)
        # if _idx > 0 and (_idx + 1) % 1000 == 0:
        #     dt = tmu.check_time('if_idx>0and(_idx+1)%1000==0:', print_func=None)
        #     emu.send_email('notification', '{}/{} file, {}s from last 1000 file'.format(_idx+1, len(sub_files), dt))
        # if _idx % 50 == 0:
        #     tmu.check_time('_idx, _file', print_func=lambda dt: print("{} s from last 50".format(dt)))
        if _idx > 0 and _idx % 10 != 0:
            continue
        try_filter2cluster()

        # cluid_twarr_list = bclu.get_cluid_twarr_list()
        # print(len(cluid_twarr_list) if cluid_twarr_list else '--not ready')
        # if cluid_twarr_list:
        #     print(len(cluid_twarr_list))

    ensure_filter_workload()
Code Example #9
File: summarization.py Project: leeyanghaha/my_merge
def sum_files(file_list, filter_level):
    res_twarr = list()
    for file in file_list:
        twarr = fu.load_twarr_from_bz2(file) if file.endswith('.bz2') else fu.load_array(file)
        twarr = tflt.filter_twarr(twarr, filter_level)
        res_twarr.extend(twarr)
    return res_twarr
Code Example #10
File: id_freq_dict.py Project: leeyanghaha/my_merge
 def load_dict(self, file_name):
     self.clear()
     word_id_freq_arr = fu.load_array(file_name)
     for word, wid, freq in word_id_freq_arr:
         self._word2id[word] = {K_FREQ: int(freq), K_ID: int(wid)}
         self._freq_sum += freq
     self.calc_freq_sum()
     return self
Code Example #11
 def load_tw_batches(self, load_cluid_arr):
     temp_len = 60000
     twarr = fu.load_array(self.filtered_twarr_file)[:temp_len]
     print("load_tw_batches, len(twarr)=", len(twarr))
     if load_cluid_arr:
         cluidarr = fu.load_array(self.filtered_cluidarr_file)[:temp_len]
         assert len(twarr) == len(cluidarr)
         for idx in range(len(twarr)):
             tw, twid = twarr[idx], twarr[idx][tk.key_id]
             origin_id, cluid = cluidarr[idx]
             assert twid == origin_id
             tw[tk.key_event_cluid] = cluid
     twarr = tu.twarr_nlp(twarr)
     tw_batches = split_array_into_batches(twarr, self.batch_size)
     print("batch distrb {}, {} batches, total {} tweets".format(
         [len(b) for b in tw_batches], len(tw_batches), len(twarr)))
     return tw_batches
Code Example #12
def identify_korea():
    file = '/home/nfs/cdong/tw/seeding/NorthKorea/korea.json'
    twarr_blocks = fu.load_array(file)
    twarr = au.merge_array(twarr_blocks)
    for tw in twarr:
        text = tw[tk.key_text]
        if not re.search('korea', text, flags=re.I):
            print(text)
Code Example #13
def query_from_files(file_list, query):
    res_twarr = []
    for file in file_list:
        twarr = fu.load_array(file)
        for tw in twarr:
            if tk.key_text in tw and query.is_text_desired(tw.get(
                    tk.key_text)):
                res_twarr.append(tw)
    return res_twarr
Code Example #14
def multi(file):
    # ent_tags = {'FAC', 'GPE', 'LOC', 'ORG', 'NORP'}
    word_type = list()
    twarr = fu.load_array(file)
    twarr = tu.twarr_nlp(twarr)
    for tw in twarr:
        doc = tw[tk.key_spacy]
        for token in doc:
            word_type.append([token.text, token.ent_type_, token.tag_])
    return word_type
Code Example #15
def merge_events_2016():
    base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/'
    subs = fi.listchildren(base, fi.TYPE_FILE)
    twarr_list = []
    for sub in subs:
        twarr = fu.load_array(base + sub)
        # twarr = tu.twarr_ner(twarr)
        # twarr = ark.twarr_ark(twarr)
        twarr_list.append(twarr)
    fu.dump_array('/home/nfs/cdong/tw/seeding/Terrorist/queried/event2016.txt',
                  twarr_list)
Code Example #16
def organise_by_user(source_root, domain):
    source_path = os.path.join(source_root, '{}.json'.format(domain))
    source_reviews = fu.load_array(source_path)
    user_dict = {}
    for review in source_reviews:
        user_ID = review[ku.reviewer_ID]
        if user_ID in user_dict:
            user_dict[user_ID].append(review)
        else:
            user_dict[user_ID] = [review]
    return user_dict
Code Example #17
def extract_bad_tweets_into(files, output_file):
    total_tw_num = 0
    neg_twarr = list()
    for file in files:
        twarr = fu.load_array(file)
        total_tw_num += len(twarr)
        for tw in twarr:
            text = tw[tk.key_text]
            if len(text) < 20 or not pu.has_enough_alpha(text, 0.6):
                neg_twarr.append(tw)
    fu.dump_array(output_file, neg_twarr)
    return len(neg_twarr), total_tw_num
Code Example #18
 def read_tweet_from_json_file(self, file):
     if not self.is_file_of_query_date(file):
         return
     for tw in fu.load_array(file):
         tw_added = False
         for seed_query in self.seed_query_list:
             tw_added = seed_query.append_desired_tweet(tw, usingtwtime=False) or tw_added
         if tw_added:
             if tw['id'] in self.added_ids:
                 continue
             else:
                 self.added_ids.add(tw['id'])
             self.added_twarr.append(tw)
Code Example #19
def main():
    """
    Start each worker process (pool), then iterate over the file names in _sub_files, reading each file in turn.
    For every file read, feed its tweets into the filtering & clustering modules, and try to read results back from the classifier and pass them on to the clustering module.
    At a fixed time interval, send a clustering command to the clustering module.
    Then try to read the clustering results back and feed them into the cluster-information extraction module.
    :return:
    """
    tmu.check_time('qwertyui')
    tmu.check_time('main line 116', print_func=None)

    bflt.start_pool(flt_pool_size, ne_threshold, clf_threshold, event_type)
    bclu.start_pool(max_window_size, full_interval, alpha, beta)
    bext.start_pool(ext_pool_size, event_type)
    alarm = tmu.Alarm()
    # _sub_files = fi.listchildren("/home/nfs/cdong/tw/origin/", fi.TYPE_FILE, concat=True)[-4000:]
    _sub_files = fi.listchildren(
        "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive",
        fi.TYPE_FILE,
        concat=True)
    # _twarr = fu.load_array(_sub_files[0])
    # _twarr = fu.change_from_lxp_format(_twarr)
    for _idx, _file in enumerate(_sub_files):
        _twarr = fu.load_array(_file)
        if config.using_api_format == 'False':
            _twarr = fu.change_from_lxp_format(_twarr)
        if (_idx + 1) % 1000 == 0:
            dt = tmu.check_time('main line 116', print_func=None)
            emu.send_email('file {}/{}'.format(_idx + 1, len(_sub_files)),
                           '{}s from last 1000 file'.format(dt))
        if _idx > 0 and _idx % 10 == 0:
            print("main: {} th twarr to filter, len: {}".format(
                _idx, len(_twarr)))
        print("{} th twarr to filter, len: {}".format(_idx, len(_twarr)))
        twarr2filter(_twarr)
        filter2cluster()
        if alarm.is_time_elapse_bigger_than(check_every_sec):
            alarm.initialize_timestamp()
            filter2cluster(5)
            bclu.execute_cluster()
            time.sleep(60)
        cluster2extractor()
    # time.sleep(300)
    end_it()
    tmu.check_time('qwertyui')
Code Example #20
def refilter_twarr(in_file, out_file):
    twarr = fu.load_array(in_file)[:200000]
    origin_len = len(twarr)
    print(origin_len)
    clf_filter = ClassifierTerror()

    # for idx in range(len(twarr) - 1, -1, -1):
    #     text = twarr[idx][tk.key_text]
    #     if not pu.has_enough_alpha(text, 0.6):
    #         print(text)
    #         twarr.pop(idx)
    # text_filter_len = len(twarr)
    # print("delta by text =", origin_len - text_filter_len)

    tmu.check_time("refilter_twarr")
    twarr = clf_filter.filter(twarr, 0.2)
    tmu.check_time("refilter_twarr")
    print(len(twarr))
    fu.dump_array(out_file, twarr[:100000])
Code Example #21
File: summarization.py Project: leeyanghaha/my_merge
def get_tokens(file_list):
    id_freq_dict, total_doc_num = IdFreqDict(), 0
    for file in file_list:
        twarr = fu.load_array(file)
        total_doc_num += len(twarr)
        for tw in twarr:
            tokens = re.findall(r'[a-zA-Z_#\-]{3,}', tw[tk.key_text].lower())
            real_tokens = list()
            for token in tokens:
                if len(token) >= 16:
                    real_tokens.extend(pu.segment(token))
                else:
                    real_tokens.append(token)
            for token in real_tokens:
                if (not pu.is_stop_word(token)) and pu.has_azAZ(token) and 3 <= len(token):
                    id_freq_dict.count_word(token)
    id_freq_dict.drop_words_by_condition(2)
    print(id_freq_dict.vocabulary_size())
    return id_freq_dict, total_doc_num
Code Example #22
def func3():
    str_arr = fu.load_array('sim_info.txt')
    feature = list()
    labels = list()
    for string in str_arr:
        num_arr = [float(s) for s in re.findall(r'\d\.\d+|\d+', string)]
        # if num_arr[4] < 0.5:
        #     continue
        feature.append([num_arr[1], num_arr[3], num_arr[4]])
        labels.append(1 if num_arr[0] == num_arr[2] else 0)
        print(num_arr, feature[-1], labels[-1])

    split_idx = int(len(feature) * 0.3)
    trainX, testX = feature[split_idx:], feature[:split_idx]
    trainY, testY = labels[split_idx:], labels[:split_idx]

    clf = svm.SVC()
    # clf.fit(feature, labels)
    # predY = clf.predict(feature)
    # auc = sklearn.metrics.roc_auc_score(labels, predY)

    clf.fit(trainX, trainY)
    predY = clf.predict(testX)
    auc = sklearn.metrics.roc_auc_score(testY, predY)
    print(auc)
    for idx in range(len(predY)):
        print(predY[idx], testY[idx])

    precision, recall, thresholds = metrics.precision_recall_curve(
        testY, predY)

    last_idx = 0
    for ref in [i / 10 for i in range(3, 8)]:
        for idx in range(last_idx, len(thresholds)):
            if thresholds[idx] >= ref:
                print('threshold', round(thresholds[idx], 2), '\tprecision',
                      round(precision[idx], 5), '\trecall',
                      round(recall[idx], 5))
                last_idx = idx
                break
Code Example #23
def parse_cluster_to_ordereddict(cluster, twarr_info):
    # cluid = cluster_info
    # readable_info_list, text_times, earliest_time_str, latest_time_str, hot, level, sorted_twarr = twarr_info
    od = OrderedDict()

    clu_info = OrderedDict()
    clu_info["id"] = 124386342
    clu_info["level"] = "{}({})".format(2, 'just soso')
    clu_info["hot"] = 1234

    geo_list = [
        ["japan", "12", "32"],
        ["bangkok", "523", "435"],
        ["italy", "1234", "431"],
    ]
    geo_infer = array2ordereddict(geo_list, ['name', 'lat', 'lng'], "geo_")

    time_list = [
        ["20160234 12:23:73", "nimabi"],
        ["20179212 32:56:89", "tomorrow"],
    ]
    time_infer = OrderedDict()
    time_infer["earliest_time"] = "20180902 12:23:21"
    time_infer["latest_time"] = "20180902 12:23:21"
    time_text = array2ordereddict(time_list, ["inferred", "text"], "time_")
    time_infer.update(time_text)

    tw_file = "/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/2016-01-29_attack_Dalori.json"
    twarr = fu.load_array(tw_file)[:10]
    jsonarr = [[json.dumps(tw)[50:70]] for tw in twarr]
    tweet_list = array2ordereddict(jsonarr, row_prefix="tweet_")

    od['cluster_info'] = clu_info
    od['inferred_geo'] = geo_infer
    od['inferred_time'] = time_infer
    od['sorted_twarr'] = tweet_list
    od = {'cluster': od}
    return od
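
The helper array2ordereddict is not part of this excerpt. As a reading aid only, here is one hedged guess at its behaviour, inferred from the three call sites above (a list of rows, optional column names, and a row-key prefix); the project's real implementation may differ.

from collections import OrderedDict

def array2ordereddict(rows, col_names=None, row_prefix=''):
    # Hypothetical sketch: map "<row_prefix><index>" to an OrderedDict of
    # column name -> cell, or to the raw row when no column names are given.
    od = OrderedDict()
    for idx, row in enumerate(rows):
        key = '{}{}'.format(row_prefix, idx)
        od[key] = OrderedDict(zip(col_names, row)) if col_names else row
    return od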
Code Example #24
    def _load_user_reviews(self, user):
        # Load the reviews of a given user within a given domain.
        '''
        :param file:
        :param min_threshold: minimum number of reviews posted by the user
        :param max_threshold: maximum number of reviews posted by the user
        :return:
        '''
        res = []
        file = os.path.join(self.root, self.domain, user)
        with open(file) as f:
            reviews = f.readlines()
        if self.min_threshold is not None and self.max_threshold is not None:
            for review in reviews:
                review = json.loads(review)
                if self.min_threshold <= review[
                        ku.reviewer_count] <= self.max_threshold:
                    res.append(review)
        elif self.min_threshold is not None:
            for review in reviews:
                review = json.loads(review)
                if review[ku.reviewer_count] >= self.min_threshold:
                    res.append(review)
        elif self.max_threshold is not None:
            for review in reviews:
                review = json.loads(review)
                if review[ku.reviewer_count] <= self.max_threshold:
                    res.append(review)
        else:
            res = fu.load_array(file)
        if self.num_reviews_per_user is not None and len(
                res) > self.num_reviews_per_user:
            res = res[:self.num_reviews_per_user]
        if self.shuffle:
            res = sku.shuffle(res)
        return res
Code Example #25
File: my_main.py Project: leeyanghaha/authorship
def get_reviews():
    file = r'/home/leeyang/research/data/Movie.json'
    reviews = fu.load_array(file)
    return reviews
Code Example #26
def func2():
    file = '/home/nfs/cdong/tw/src/clustering/data/events.txt'
    twarr_blocks = fu.load_array(file)
    for tw in twarr_blocks[19]:
        print(tw[tk.key_text])
Code Example #27
    import utils.tweet_keys as tk
    import utils.array_utils as au
    import utils.pattern_utils as pu
    import utils.timer_utils as tmu
    import calling.back_extractor as bext
    import utils.file_iterator as fi
    import utils.function_utils as fu
    fi.mkdir('/home/nfs/cdong/tw/src/calling/tmp', remove_previous=True)

    tmu.check_time()
    _hold_batch_num = 100
    _batch_size = 100
    _alpha, _beta = 30, 0.01
    # _alpha, _beta = 50, 0.005
    _file = "./filtered_twarr.json"
    _twarr = fu.load_array(_file)[:10200]
    start_pool(_hold_batch_num, _batch_size, _alpha, _beta)
    input_twarr_batch(_twarr)

    print('---> waiting for _cluid_cluster_list')
    while True:
        _cluid_cluster_list = cluster_daemon.outq2.get()
        print('     - some thing returned, type :{}'.format(
            type(_cluid_cluster_list)))
        if _cluid_cluster_list is not None:
            break
    print('---> get _cluid_cluster_list, len:{}'.format(
        len(_cluid_cluster_list)))

    _ext_pool_size = 10
    bext.start_pool(_ext_pool_size)
Code Example #28
File: draft.py Project: locta66/TweetEventDetection
    print(i, j)

# import re
# def preprocess(doc):
#     # pattern = re.compile(r'(\d\s\.\s\d)')
#     return re.sub(r'(\d\s\.\s\d)', '.', doc)
#
# for text in textarr[100:300]:
#     print(preprocess(text))

import sys

sys.path.append('../utils')
import utils.function_utils as fu

twarr = fu.load_array(
    '/home/nfs/cdong/tw/seeding/NaturalDisaster/queried/NaturalDisaster.sum')
arr1 = twarr[:2000]
arr2 = twarr[2000:]
# cv = CV(analyzer='word', token_pattern=r'([a-zA-Z_-]+|\d+\.\d+|\d+)',
#         stop_words=stop_words, max_df=0.8, min_df=1e-5)

import re

# print(re.findall(r'([a-zA-Z_-]+|\d+\.\d+|\d+)', ))  # incomplete call in the original draft: the string argument is missing

from nltk.corpus import stopwords

stop_words = set(stopwords.words('english'))

import re
# s = 'RT @bugwannostra: @Louuu_ thx		#FFFFs People power -_-      works	❤signing…		https://t.co/pl2bquE5Az'
Code Example #29
    ]


def get_quality_autophrase(process_code, textarr, conf_thres, len_thres):
    conf_word_list = autophrase_wrapper(process_code, textarr)
    return filter_keywords(conf_word_list, conf_thres, len_thres)


if __name__ == '__main__':
    import utils.tweet_utils as tu

    # text_file = "/home/nfs/cdong/tw/src/extracting/3796_r.txt"
    # textarr = fu.read_lines(text_file)
    twarr_file = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive/2016-01-11_blast_Istanbul.json"
    twarr_file = "/home/nfs/cdong/tw/seeding/Terrorist/queried/positive/2016-03-26_suicide-bomb_Lahore.json"
    twarr = fu.load_array(twarr_file)
    textarr = [tw[tk.key_text] for tw in twarr]
    _conf_word_list = autophrase_wrapper(0, textarr)
    # _keywords = [item[1] for item in _conf_word_list]
    print(filter_keywords(_conf_word_list, 50))
    print('\n')
    print(textarr)
    # idx_groups = tu.group_textarr_similar_index(keywords, 0.2)
    # for g in idx_groups:
    #     print([keywords[i] for i in g], '\n')
    # print(_conf_word_list[:30])
    # print()
    # print(textarr)
    exit()
    """ 文本数量小于30时关键词的质量已经相当低,应尽量使进入的文本数量大于一定阈值 """
    """ __main__里面的内容保持不变,是最终的接口形式 """
Code Example #30

def merge_events_2016():
    base = '/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/'
    subs = fi.listchildren(base, fi.TYPE_FILE)
    twarr_list = []
    for sub in subs:
        twarr = fu.load_array(base + sub)
        # twarr = tu.twarr_ner(twarr)
        # twarr = ark.twarr_ark(twarr)
        twarr_list.append(twarr)
    fu.dump_array('/home/nfs/cdong/tw/seeding/Terrorist/queried/event2016.txt',
                  twarr_list)


if __name__ == '__main__':
    # merge_events_2016()
    import utils.pattern_utils as pu
    base = "/home/nfs/cdong/tw/seeding/Terrorist/queried/event_corpus/"
    files = fi.listchildren(base, fi.TYPE_FILE, concat=True)
    for file in files:
        twarr = fu.load_array(file)
        len_pre = len(twarr)
        for idx in range(len(twarr) - 1, -1, -1):
            text = twarr[idx][tk.key_text]
            if not pu.has_enough_alpha(text, 0.6):
                print(text)
                twarr.pop(idx)
        print(len_pre, '->', len(twarr), '\n\n')
        # fu.dump_array(file, twarr)