Ejemplo n.º 1
0
 def investigate(self, case_number):
     """Continuously scrape ads and save the suspicious ones to a case.

     Each pass scrapes ``self.base_urls``, rebuilds the labelled training
     set from the records returned by ``BackpageLogger.query`` and saves an
     ad when its phone number matches a record labelled as trafficking, or
     when a trained classifier labels its text ``"trafficking"``. The
     method then sleeps and starts the next pass; it never returns.

     Args:
         case_number: Passed through to ``self.save`` for every saved ad.
     """
     poll_interval_seconds = 700  # wait ~ 12 minutes (consider changing this)

     # A plain loop rather than self-recursion: recursing once per pass
     # eventually exceeds the interpreter's recursion limit, which would
     # kill a monitor that is meant to run indefinitely.
     while True:
         data = self.scrape(links=self.base_urls, scraping_ads=True)

         # Query each label once; the positive records feed both the
         # training set and the known-number lookup.
         flagged_records = BackpageLogger.query.filter_by(
             is_trafficking=True).all()
         cleared_records = BackpageLogger.query.filter_by(
             is_trafficking=False).all()
         training_data = [(elem, "trafficking") for elem in flagged_records]
         training_data += [
             (elem, "not trafficking") for elem in cleared_records
         ]
         trafficking_numbers = [elem.phone_number for elem in flagged_records]

         # totally a hack, consider getting advice / changing this??
         using_naive_bayes = len(training_data) > 50

         # Only train the models this pass will actually consult.
         if using_naive_bayes:
             nb = algorithms.naive_bayes(training_data)
         else:
             classifiers = [
                 algorithms.svm(training_data),
                 algorithms.decision_tree(training_data),
             ]

         for datum in data:
             suspicious = datum["phone_number"] in trafficking_numbers
             if not suspicious:
                 if using_naive_bayes:
                     label = nb.classify(datum["text_body"])
                     suspicious = label == "trafficking"
                 else:
                     features = algorithms.preprocess(datum["text_body"])
                     suspicious = any(
                         cl.classify(features) == "trafficking"
                         for cl in classifiers
                     )
             if suspicious:
                 # Save each ad at most once per pass, even when several
                 # signals (phone number, multiple classifiers) agree.
                 self.save([datum], case_number)

         time.sleep(poll_interval_seconds)
 def investigate(self, case_number):
     """Monitor the configured listings forever, saving suspicious ads.

     Runs one scrape-and-classify pass, sleeps, and repeats. The method
     never returns.

     Args:
         case_number: Passed through to ``self.save`` for every saved ad.
     """
     # Iterate instead of recursing: one stack frame per pass would
     # eventually hit the interpreter's recursion limit and abort the
     # monitor with RecursionError.
     while True:
         self._investigate_once(case_number)
         time.sleep(700)  # wait ~ 12 minutes (consider changing this)

 def _investigate_once(self, case_number):
     """Run a single scrape-and-classify pass and save the flagged ads."""
     data = self.scrape(links=self.base_urls, scraping_ads=True)

     # The positive records are reused for the training set and for the
     # list of known phone numbers, so they are fetched only once.
     positives = BackpageLogger.query.filter_by(is_trafficking=True).all()
     negatives = BackpageLogger.query.filter_by(is_trafficking=False).all()
     training_data = [(elem, "trafficking") for elem in positives]
     training_data += [(elem, "not trafficking") for elem in negatives]
     trafficking_numbers = [elem.phone_number for elem in positives]

     # totally a hack, consider getting advice / changing this??
     using_naive_bayes = len(training_data) > 50

     # Train only the model(s) consulted below.
     if using_naive_bayes:
         nb = algorithms.naive_bayes(training_data)
     else:
         classifiers = [
             algorithms.svm(training_data),
             algorithms.decision_tree(training_data),
         ]

     for datum in data:
         if datum["phone_number"] in trafficking_numbers:
             flagged = True
         elif using_naive_bayes:
             flagged = nb.classify(datum["text_body"]) == "trafficking"
         else:
             features = algorithms.preprocess(datum["text_body"])
             flagged = any(
                 cl.classify(features) == "trafficking" for cl in classifiers
             )
         if flagged:
             # One save per ad, however many signals flagged it.
             self.save([datum], case_number)
Ejemplo n.º 3
0
 def investigate(self):
     """Continuously scrape ads and save the ones classified as trafficking.

     Each pass scrapes ``self.base_urls``, rebuilds the labelled training
     set from the records returned by ``BackpageLogger.query``, classifies
     every scraped ad's ``"text_body"`` and passes the ads labelled
     ``"trafficking"`` to ``self.save_ads``. The method then sleeps and
     starts the next pass; it never returns.
     """
     # Loop instead of recursing so a long-running monitor cannot exhaust
     # the interpreter's recursion limit.
     while True:
         data = self.scrape(self.base_urls)

         # Both labels must end up in the same list: extend, do not
         # overwrite, and use this one name for every classifier.
         training_data = [
             (elem, "trafficking")
             for elem in BackpageLogger.query.filter_by(
                 is_trafficking=True).all()
         ]
         training_data += [
             (elem, "not trafficking")
             for elem in BackpageLogger.query.filter_by(
                 is_trafficking=False).all()
         ]

         # totally a hack/rule of thumb
         # NOTE(review): confirm which side of this threshold is meant to
         # use naive Bayes; the direction is kept exactly as written.
         use_ensemble = len(training_data) > 50

         # Train only the model(s) this pass will consult.
         if use_ensemble:
             classifiers = [
                 algorithms.svm(training_data),
                 algorithms.decision_tree(training_data),
             ]
         else:
             nb = algorithms.naive_bayes(training_data)

         for datum in data:
             if use_ensemble:
                 features = algorithms.preprocess(datum["text_body"])
                 flagged = any(
                     cl.classify(features) == "trafficking"
                     for cl in classifiers
                 )
             else:
                 flagged = nb.classify(datum["text_body"]) == "trafficking"
             if flagged:
                 # Save each ad once, even if several classifiers agree.
                 self.save_ads([datum])

         time.sleep(700)  # wait ~ 12 minutes (consider changing this)
Ejemplo n.º 4
0
def test_decision_tree():
    """Check the decision tree labels a greeting-like phrase "greeting".

    The classifier is built from two labelled phrases and then asked to
    classify the preprocessed form of a phrase that extends the greeting
    sample.
    """
    labelled_samples = [("hello there", "greeting"), ("later", "goodbye")]
    classifier = algorithms.decision_tree(labelled_samples)
    features = algorithms.preprocess("hello there friends")
    predicted = classifier.classify(features)
    assert predicted == "greeting"
def test_decision_tree():
    """Build a two-sample decision tree and verify one prediction."""
    # One sample per label: a greeting and a goodbye.
    corpus = [
        ("hello there", "greeting"),
        ("later", "goodbye"),
    ]
    tree = algorithms.decision_tree(corpus)
    # The query repeats the greeting sample's words plus one extra word.
    query = algorithms.preprocess("hello there friends")
    assert tree.classify(query) == "greeting"