class AdModelTrainer:
    """Train an ``AdPredictor`` from the labelled text files under ``./data``.

    The pipeline is ``load_data`` -> ``generate_matrix`` -> ``train_model``;
    ``train`` runs the three steps in that order. Class indices are the
    positions in ``AdPredictor.Types``.
    """

    def __init__(self):
        # Raw samples, keyed by class index.
        self.dataset = {}
        # Tokenised samples (list of token lists), keyed by class index.
        self.splitWords = {}
        # Space-joined tokens, one string per sample, keyed by class index.
        self.document = {}
        # Documents of every class concatenated in class-index order.
        self.combined = []
        # Result of predictor.transformTextToSparseMatrix(self.combined).
        self.textMatrix = None
        # AdPredictor instance created by generate_matrix().
        self.predictor = None

    @staticmethod
    def _read_text(path):
        """Return the full content of the UTF-8 text file at *path*."""
        # The context manager closes the handle even if read() raises.
        with open(path, 'r', encoding='utf-8') as handle:
            return handle.read()

    def load_data(self):
        """Read the raw samples of every class into ``self.dataset``.

        Class 0 comes from ``data-normal.txt`` with one sample per line.
        Every other class ``i`` comes from ``data-<Types[i]>.txt`` with
        samples separated by an empty line.
        """
        normal_text = self._read_text(r"./data/data-normal.txt")
        self.dataset[0] = normal_text.split('\n')
        for i in range(1, len(AdPredictor.Types)):
            name = AdPredictor.Types[i]
            ad_text = self._read_text(r"./data/data-%s.txt" % name)
            self.dataset[i] = ad_text.replace('\r\n', '\n').split('\n\n')

    def generate_matrix(self):
        """Tokenise every loaded sample and build ``self.textMatrix``.

        Returns:
            ``self.textMatrix.head()``.
        """
        self.predictor = AdPredictor()
        # Start from an empty list on every call: appending to the previous
        # contents would duplicate documents and leave the matrix with more
        # rows than train_model() produces labels for.
        combined = []
        for i in range(len(AdPredictor.Types)):
            self.splitWords[i] = [
                list(self.predictor.splitWords(ad)) for ad in self.dataset[i]
            ]
            self.document[i] = [" ".join(words) for words in self.splitWords[i]]
            combined.extend(self.document[i])
        self.combined = combined
        self.textMatrix = self.predictor.transformTextToSparseMatrix(
            self.combined)
        return self.textMatrix.head()

    def train_model(self):
        """Fit the predictor on ``self.textMatrix`` with one label per document.

        Labels are the class indices, emitted in the same class order that
        generate_matrix() used to concatenate the documents.
        """
        y = []
        for i in range(len(AdPredictor.Types)):
            name = AdPredictor.Types[i]
            document = self.document[i]
            print("Document %s Len: %d" % (name, len(document)))
            y.extend(numpy.full(len(document), i))
        self.predictor.train(self.textMatrix, y)

    def train(self):
        """Run the full pipeline: load data, build the matrix, train."""
        self.load_data()
        self.generate_matrix()
        self.train_model()
def generate_matrix(self):
    """Tokenise every loaded sample and build ``self.textMatrix``.

    For each class index ``i`` in ``AdPredictor.Types`` this stores the token
    lists in ``self.splitWords[i]`` and the space-joined documents in
    ``self.document[i]``, then transforms the concatenation of all documents
    into ``self.textMatrix``.

    Returns:
        ``self.textMatrix.head()``.
    """
    self.predictor = AdPredictor()
    # Start from an empty list on every call: extending the previous
    # contents would duplicate every document when the method runs twice.
    combined = []
    for i in range(len(AdPredictor.Types)):
        self.splitWords[i] = [
            list(self.predictor.splitWords(ad)) for ad in self.dataset[i]
        ]
        self.document[i] = [" ".join(words) for words in self.splitWords[i]]
        combined.extend(self.document[i])
    self.combined = combined
    self.textMatrix = self.predictor.transformTextToSparseMatrix(
        self.combined)
    return self.textMatrix.head()
def main(host='127.0.0.1', port=48519):
    """Load the ad-filter and captcha models, then start the API server.

    Args:
        host: Host passed to ``APIServer``.
        port: Port passed to ``APIServer``.
    """
    logging.info("Root path %s", os.getcwd())

    logging.info("Loading model AdFilter ...")
    adPredictor = AdPredictor.from_saved_model("./adfilter/data")

    logging.info("Loading model CaptchaBreaker ...")
    model_path = "captcha-breaker-v%d.pth" % CaptchaNN.version()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = CaptchaNN().to(device)
    # map_location remaps the stored tensors onto the selected device, so a
    # checkpoint saved from CUDA tensors still loads on a CPU-only host.
    net.load_state_dict(torch.load(model_path, map_location=device))
    net.eval()
    captchaBreaker = CaptchaBreaker(net)

    logging.info("Loading API Server ...")
    server = APIServer(captchaBreaker, adPredictor, host=host, port=port)
    server.start()
    # Logged once server.start() has returned.
    logging.info("Shutdown system ...")
# -*- coding: UTF-8 -*-
# Manual check: load the saved ad predictor and print its verdict for two
# sample posts.
from adfilter.model import AdPredictor

# Sample inputs handed to predict_ad(); kept verbatim.
text1 = "这个是咱们学校的学习墙开学准备(基本包含各学科)技能提升(计算机二级 word ppt等)各种考证资料(考研 四六级 二级 教资 单招 会计 专升本等)基本都有的,抗疫时期闲着也是闲着 需要的加墙墙就好啦 "
text2 = "java垃圾!C垃圾!计算机二级垃圾!PS学姐biss!资料墙biss!👴精通CAD!!"

predictor = AdPredictor.from_saved_model()

# Evaluate the samples in order, then print both results on a single line.
verdicts = [predictor.predict_ad(sample) for sample in (text1, text2)]
print(*verdicts)