def __init__(self, filepath=os.path.join(pwd(__file__), './data/data.json'),
             num_category=3, num_per_category=4):
    self.file = filepath
    self.num_category = num_category
    self.num_per_category = num_per_category
import os

import jieba


def get_stop_words(file_dir=os.path.join(pwd(__file__), "./stopwords")) -> set:
    """Collect stop words from every .txt file in file_dir, re-segmented with jieba."""
    stop_words = set()
    for file in os.listdir(file_dir):
        if file.endswith(".txt"):
            with open(os.path.join(file_dir, file), 'r', encoding='utf-8') as fp:
                words = set(w.strip() for w in fp)
            for word in words:
                # Segment each entry so multi-character stop phrases also cover
                # the tokens jieba would produce for them.
                stop_words.update(jieba.cut(word))
    return stop_words
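
if __name__ == '__main__':
    # Usage sketch (assumption, not in the original excerpt: a ./stopwords
    # directory with .txt files exists next to this module; the sample sentence
    # is hypothetical). Filters a jieba-segmented sentence against the set.
    stop_words = get_stop_words()
    sample = "这是一个简单的例子"
    tokens = [w for w in jieba.cut(sample) if w not in stop_words]
    print(tokens)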
import os
import random
import argparse
from multiprocessing import Pool, TimeoutError

import numpy as np
import scipy.spatial
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from utils2 import pwd, now, load_json, load_pickle, dump_json, dump_pickle
from preprocess import DataLoader, cutwords

output_dir = os.path.join(pwd(__file__), './output')

if __name__ == '__main__':
    N = 100  # 100 documents from each category take part in the comparison
    types = [
        'fraudsters', 'intentkill', 'thieves', 'rape', 'traffic',
        'rob', 'position', 'drug', 'damage'
    ]
    num = [N for _ in range(len(types))]
    data_pkl = os.path.join(pwd(__file__), './data/data.pkl')
    datas = load_pickle(data_pkl)
    documents = []  # the documents to be compared
    for i, atype in enumerate(types):
        for apeople in datas:
            if apeople["type"] == atype and num[i] > 0:
                documents.append(apeople["note"])
                num[i] -= 1
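
    # Sketch (assumption; the excerpt ends here): the imports above suggest the
    # collected documents are embedded with a CountVectorizer + TfidfTransformer
    # pipeline and reduced with TruncatedSVD (LSA). `cutwords` is assumed to map
    # a raw note string to a token list, so it can serve as the analyzer.
    counts = CountVectorizer(analyzer=cutwords).fit_transform(documents)
    tfidf = TfidfTransformer().fit_transform(counts)
    lsa_repr = TruncatedSVD(n_components=100).fit_transform(tfidf)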
import os
import logging
from argparse import ArgumentParser

import numpy as np

from utils2 import load_json, load_pickle, dump_json, dump_pickle, pwd, now, cosine_distance
from clusterer import DPClusterer, KMeansClusterer
from reprs import docs_repr

output_dir = os.path.join(pwd(__file__), './output')
# representer_file = os.path.join(output_dir, "representer.pkl")
districts_file = os.path.join(output_dir, './districts.json')
log_dir = os.path.join(pwd(__file__), './logs')
logfile = os.path.join(log_dir, 'clustering_{}.log'.format(now()))
logging.basicConfig(filename=logfile, level=logging.INFO)
logger = logging.getLogger("clustering")

docs_file = os.path.join(pwd(__file__), './data/data.pkl')


def find_districtidx(districts, idx):
    for district_idx, district in enumerate(districts):
        if idx in district['voters']:
            return district_idx
    raise Exception("Cannot find district!")


def find_label(labels, globalidx):
    for idx, label in labels:
        if idx == globalidx:
            return label
    raise Exception('Cannot find label')
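
if __name__ == '__main__':
    # Minimal self-check (hypothetical data, not from the repo): districts is a
    # list of dicts keyed by 'voters', and labels is an iterable of
    # (global index, label) pairs, matching the lookups above.
    districts = [{'voters': [0, 3]}, {'voters': [1, 2]}]
    assert find_districtidx(districts, 2) == 1
    labels = [(0, 'fraudsters'), (1, 'thieves')]
    assert find_label(labels, 1) == 'thieves'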
import os
import logging
import random
import argparse
from multiprocessing import Pool, TimeoutError

import numpy as np
import scipy.spatial
import torch
import torch.nn as nn
import torch.nn.functional as F
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

from utils2 import pwd, now, load_json, load_pickle, dump_json, dump_pickle
from preprocess import DataLoader, cutwords

output_dir = os.path.join(pwd(__file__), './output')
log_dir = os.path.join(pwd(__file__), './logs')
logging.basicConfig(filename=os.path.join(log_dir, '{}.log'.format(now())),
                    level=logging.INFO)
defaultlogger = logging.getLogger("GLSA Repr")


# The input size must be large enough for the longest document; shorter inputs
# should be padded up to it.
class AutoEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(AutoEncoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size, hidden_size)  # encoder: input -> hidden
        self.h2o = nn.Linear(hidden_size, input_size)  # decoder: hidden -> input
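
    # Sketch (assumption; the excerpt stops before a forward pass is defined):
    # a typical forward for this two-layer autoencoder encodes with i2h and
    # reconstructs with h2o, so the hidden activation can serve as the
    # document representation.
    def forward(self, x):
        hidden = torch.tanh(self.i2h(x))  # encode to the hidden representation
        return self.h2o(hidden)           # decode back to the input space


# Hypothetical training step (not from the original file): minimize the MSE
# reconstruction loss with Adam.
#   model = AutoEncoder(input_size=5000, hidden_size=128)
#   optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
#   optimizer.zero_grad()
#   loss = F.mse_loss(model(batch), batch)
#   loss.backward()
#   optimizer.step()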
def __call__(self) -> list:
    # Load the records from the path configured in __init__ rather than a
    # hard-coded copy of the default data.json path.
    datas = load_json(self.file)
    return datas
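
if __name__ == '__main__':
    # Usage sketch (assumption: this __call__ and the __init__ above belong to
    # the DataLoader class that the other scripts import from preprocess).
    loader = DataLoader()
    records = loader()  # list of dicts, e.g. {"type": ..., "note": ...}
    print(len(records))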