Example #1
def __init__(self,
             filepath=os.path.join(pwd(__file__), './data/data.json'),
             num_category=3,
             num_per_category=4):
    # Path to the JSON data file and sampling limits per category.
    self.file = filepath
    self.num_category = num_category
    self.num_per_category = num_per_category
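This constructor appears to belong to the DataLoader class imported in the later examples; a minimal usage sketch under that assumption:

# Assumption: Example #1's __init__ is DataLoader's constructor.
loader = DataLoader(num_category=3, num_per_category=4)
print(loader.file, loader.num_per_category)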
Example #2
import os

import jieba

from utils2 import pwd

def get_stop_words(file_dir=os.path.join(pwd(__file__), "./stopwords")) -> set:
    """Collect stop words from every .txt file in file_dir,
    segmenting each entry with jieba."""
    stop_words = set()
    for file in os.listdir(file_dir):
        if file.endswith(".txt"):
            with open(os.path.join(file_dir, file), 'r',
                      encoding='utf-8') as fp:
                words = {w.strip() for w in fp}
                for word in words:
                    stop_words.update(jieba.cut(word))
    return stop_words
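A quick usage sketch; the ./stopwords directory layout is taken from the default argument above:

stop_words = get_stop_words()
print(len(stop_words))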
Example #3
import scipy.spatial
from utils2 import pwd, now, load_json, load_pickle, dump_json, dump_pickle
import os
import numpy as np
from sklearn.decomposition import TruncatedSVD
from preprocess import DataLoader, cutwords
import random
from multiprocessing import Pool, TimeoutError
import argparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import torch
import torch.nn as nn
import torch.nn.functional as F

output_dir = os.path.join(pwd(__file__), './output')

if __name__ == '__main__':
    N = 100  # each category contributes 100 documents to the comparison
    crime_types = [
        'fraudsters', 'intentkill', 'thieves', 'rape', 'traffic', 'rob',
        'position', 'drug', 'damage'
    ]
    num = [N for _ in range(len(crime_types))]
    data_pkl = os.path.join(pwd(__file__), './data/data.pkl')
    datas = load_pickle(data_pkl)
    documents = []  # the documents to be compared
    # Take up to N "note" texts per crime type.
    for i, atype in enumerate(crime_types):
        for apeople in datas:
            if apeople["type"] == atype and num[i] > 0:
                documents.append(apeople["note"])
                num[i] -= 1
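The imports (CountVectorizer, TfidfTransformer, TruncatedSVD, scipy.spatial) suggest the script goes on to build TF-IDF vectors and reduce them with LSA; a minimal sketch of that step, assuming cutwords returns a token list per document:

# Sketch only: TF-IDF + LSA over the collected documents.
corpus = [' '.join(cutwords(doc)) for doc in documents]  # cutwords signature assumed
counts = CountVectorizer().fit_transform(corpus)
tfidf = TfidfTransformer().fit_transform(counts)
lsa = TruncatedSVD(n_components=100).fit_transform(tfidf)
# Cosine distance between the first two documents.
print(scipy.spatial.distance.cosine(lsa[0], lsa[1]))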
Example #4
from utils2 import load_json, load_pickle, dump_json, dump_pickle, pwd, now, cosine_distance
import os
import logging
from clusterer import DPClusterer, KMeansClusterer
import numpy as np
from argparse import ArgumentParser
from reprs import docs_repr

output_dir = os.path.join(pwd(__file__), './output')
# representer_file = os.path.join(output_dir, "representer.pkl")
districts_file = os.path.join(output_dir, 'districts.json')
log_dir = os.path.join(pwd(__file__), './logs')
os.makedirs(log_dir, exist_ok=True)  # logging fails if the directory is missing

logfile = os.path.join(log_dir, 'clustering_{}.log'.format(now()))
logging.basicConfig(filename=logfile, level=logging.INFO)
logger = logging.getLogger("clustering")

docs_file = os.path.join(pwd(__file__), './data/data.pkl')

def find_districtidx(districts, idx):
    """Return the index of the district whose 'voters' list contains idx."""
    for district_idx, district in enumerate(districts):
        if idx in district['voters']:
            return district_idx
    raise Exception("Cannot find district!")

def find_label(labels, globalidx):
    """Return the label paired with globalidx in a list of (index, label) tuples."""
    for idx, label in labels:
        if idx == globalidx:
            return label
    raise Exception('Cannot find label')
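A small usage sketch; the shapes of districts (dicts with a 'voters' list) and labels (a list of (index, label) pairs) are inferred from the lookups above:

districts = [{'voters': [0, 1, 2]}, {'voters': [3, 4]}]
labels = [(0, 'fraudsters'), (3, 'drug')]
assert find_districtidx(districts, 4) == 1
assert find_label(labels, 3) == 'drug'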
Example #5
import scipy.spatial
from utils2 import pwd, now, load_json, load_pickle, dump_json, dump_pickle
import os
import logging
import numpy as np
from sklearn.decomposition import TruncatedSVD
from preprocess import DataLoader, cutwords
import random
from multiprocessing import Pool, TimeoutError
import argparse
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
import torch
import torch.nn as nn
import torch.nn.functional as F

output_dir = os.path.join(pwd(__file__), './output')
log_dir = os.path.join(pwd(__file__), './logs')
os.makedirs(log_dir, exist_ok=True)  # logging fails if the directory is missing
logging.basicConfig(filename=os.path.join(log_dir, '{}.log'.format(now())),
                    level=logging.INFO)
defaultlogger = logging.getLogger("GLSA Repr")


# input size must be large enough, use padding
class AutoEncoder(nn.Module):
    def __init__(self, input_size, hidden_size):
        super(AutoEncoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.i2h = nn.Linear(input_size, hidden_size)  # encoder
        self.h2o = nn.Linear(hidden_size, input_size)  # decoder

    def forward(self, x):
        # The original snippet omits forward(); a ReLU bottleneck is an assumption.
        h = F.relu(self.i2h(x))
        return self.h2o(h)
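A minimal training sketch for the autoencoder; the sizes, optimizer, and reconstruction loss are illustrative, not from the source:

# Illustrative only: train to reconstruct dummy input vectors.
model = AutoEncoder(input_size=2000, hidden_size=128)
optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)
criterion = nn.MSELoss()
x = torch.rand(32, 2000)  # stand-in for padded document vectors
for _ in range(10):
    optimizer.zero_grad()
    loss = criterion(model(x), x)
    loss.backward()
    optimizer.step()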
Example #6
def __call__(self) -> list:
    # Load and return the raw records from data.json.
    data = load_json(os.path.join(pwd(__file__), './data/data.json'))
    return data