Ejemplo n.º 1
0
class AdModelTrainer:
    """Build the training corpus from ``./data`` and fit an ``AdPredictor``.

    Every per-class container is keyed by the index of the class in
    ``AdPredictor.Types``; index 0 is the "normal" (non-ad) class.
    """

    def __init__(self):
        # Raw text samples per class index, filled by load_data().
        self.dataset = {}
        # Tokenised samples per class index, filled by generate_matrix().
        self.splitWords = {}
        # Space-joined tokens per class index (one string per sample).
        self.document = {}
        # All documents concatenated in class-index order.
        self.combined = []
        # Result of predictor.transformTextToSparseMatrix(self.combined).
        self.textMatrix = None
        # The AdPredictor instance created by generate_matrix().
        self.predictor = None

    def load_data(self):
        """Read the raw samples of every class into ``self.dataset``.

        Class 0 is read from ``./data/data-normal.txt`` with one sample per
        line. Every other class ``i`` is read from
        ``./data/data-<AdPredictor.Types[i]>.txt`` with samples separated by
        a blank line.

        Raises:
            OSError: if one of the data files cannot be opened.
        """
        # Context managers make sure the file handles are closed promptly.
        with open(r"./data/data-normal.txt", 'r', encoding='utf-8') as fh:
            self.dataset[0] = fh.read().split('\n')

        # Types is indexed rather than iterated because its container type
        # is not visible from here.
        for i in range(1, len(AdPredictor.Types)):
            name = AdPredictor.Types[i]
            with open(r"./data/data-%s.txt" % name, 'r',
                      encoding='utf-8') as fh:
                all_ad_text = fh.read()
            self.dataset[i] = all_ad_text.replace('\r\n', '\n').split('\n\n')

    def generate_matrix(self):
        """Tokenise the loaded samples and build the feature matrix.

        Creates a fresh ``AdPredictor``, fills ``self.splitWords``,
        ``self.document`` and ``self.combined``, and stores the output of
        ``transformTextToSparseMatrix`` in ``self.textMatrix``.

        Returns:
            ``self.textMatrix.head()``, a preview of the matrix.
        """
        self.predictor = AdPredictor()
        # Start from an empty corpus so that calling this method again does
        # not append the same documents twice, which would leave the matrix
        # with more rows than train_model() produces labels for.
        self.combined = []

        for i in range(0, len(AdPredictor.Types)):
            self.splitWords[i] = [
                list(self.predictor.splitWords(ad)) for ad in self.dataset[i]
            ]
            self.document[i] = [
                " ".join(tokens) for tokens in self.splitWords[i]
            ]
            self.combined.extend(self.document[i])

        self.textMatrix = self.predictor.transformTextToSparseMatrix(
            self.combined)
        return self.textMatrix.head()

    def train_model(self):
        """Build the label vector and train the predictor.

        The label of each document is its class index; labels are emitted in
        the same class-index order in which ``generate_matrix`` concatenated
        the documents.
        """
        y = []

        for i in range(0, len(AdPredictor.Types)):
            name = AdPredictor.Types[i]
            document = self.document[i]
            print("Document %s Len: %d" % (name, len(document)))
            # One label (the class index) per document of this class.
            y.extend(numpy.full(len(document), i))

        self.predictor.train(self.textMatrix, y)

    def train(self):
        """Run the whole pipeline: load data, build the matrix, train."""
        self.load_data()
        self.generate_matrix()
        self.train_model()
Ejemplo n.º 2
0
    def generate_matrix(self):
        """Tokenise the loaded samples and build the feature matrix.

        For every class index ``i`` in ``AdPredictor.Types`` the samples in
        ``self.dataset[i]`` are tokenised with ``predictor.splitWords`` and
        joined back into space-separated documents. All documents are then
        passed to ``predictor.transformTextToSparseMatrix``.

        Returns:
            ``self.textMatrix.head()``, a preview of the matrix.
        """
        self.predictor = AdPredictor()
        # Start from an empty corpus so a repeated call does not append the
        # same documents a second time and duplicate rows in the matrix.
        self.combined = []

        for i in range(0, len(AdPredictor.Types)):
            self.splitWords[i] = [
                list(self.predictor.splitWords(ad)) for ad in self.dataset[i]
            ]
            self.document[i] = [
                " ".join(tokens) for tokens in self.splitWords[i]
            ]
            self.combined.extend(self.document[i])

        self.textMatrix = self.predictor.transformTextToSparseMatrix(
            self.combined)
        return self.textMatrix.head()
Ejemplo n.º 3
0
def main(host='127.0.0.1', port=48519):
    """Load both models and run the API server.

    Args:
        host: Address passed to ``APIServer``.
        port: Port passed to ``APIServer``.
    """
    logging.info("Root path %s", os.getcwd())
    logging.info("Loading model AdFilter ...")
    adPredictor = AdPredictor.from_saved_model("./adfilter/data")

    logging.info("Loading model CaptchaBreaker ...")
    model_path = "captcha-breaker-v%d.pth" % CaptchaNN.version()
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    net = CaptchaNN()
    net = net.to(device)
    # map_location lets a checkpoint that was saved on a GPU be loaded on a
    # CPU-only host, matching the device fallback chosen above.
    net.load_state_dict(torch.load(model_path, map_location=device))
    net.eval()  # inference mode
    captchaBreaker = CaptchaBreaker(net)

    logging.info("Loading API Server ...")
    server = APIServer(captchaBreaker, adPredictor, host=host, port=port)
    # NOTE(review): presumably start() blocks until the server stops, which
    # is why the shutdown message follows it; confirm against APIServer.
    server.start()

    logging.info("Shutdown system ...")
Ejemplo n.º 4
0
# -*- coding: UTF-8 -*-

from adfilter.model import AdPredictor

# Two sample posts used to exercise the saved model.
text1 = "这个是咱们学校的学习墙开学准备(基本包含各学科)技能提升(计算机二级  word  ppt等)各种考证资料(考研 四六级  二级  教资 单招 会计 专升本等)基本都有的,抗疫时期闲着也是闲着 需要的加墙墙就好啦 "
text2 = "java垃圾!C垃圾!计算机二级垃圾!PS学姐biss!资料墙biss!👴精通CAD!!"

# Load the persisted predictor with from_saved_model()'s default arguments.
predictor = AdPredictor.from_saved_model()

# Classify the samples in order and print both results on a single line.
print(*[predictor.predict_ad(sample) for sample in (text1, text2)])