Example 1
0
 def investigate(self, case_number):
     """Continuously scrape ads and save likely trafficking posts for a case.

     Each pass re-scrapes ``self.base_urls``, rebuilds the training set from
     ``BackpageLogger``, classifies every scraped ad, saves hits via
     ``self.save(..., case_number)``, then sleeps before the next pass.
     Runs forever by design.
     """
     # Loop instead of tail-recursing: the original self-call would exhaust
     # Python's recursion limit (~1000 frames) after enough cycles.
     while True:
         data = self.scrape(links=self.base_urls, scraping_ads=True)
         training_data = [
             (elem, "trafficking")
             for elem in BackpageLogger.query.filter_by(is_trafficking=True).all()
         ]
         training_data += [
             (elem, "not trafficking")
             for elem in BackpageLogger.query.filter_by(is_trafficking=False).all()
         ]
         # Set gives O(1) membership tests inside the per-ad loop.
         trafficking_numbers = {
             elem.phone_number
             for elem in BackpageLogger.query.filter_by(is_trafficking=True).all()
         }
         # HACK: arbitrary size threshold for the classifier choice --
         # consider getting advice / changing this??
         using_naive_bayes = len(training_data) > 50
         # Train only the classifiers that will actually be used this pass
         # (the original trained svm + decision_tree even on the NB path).
         if using_naive_bayes:
             nb = algorithms.naive_bayes(training_data)
         else:
             cls = [
                 algorithms.svm(training_data),
                 algorithms.decision_tree(training_data),
             ]
         for datum in data:
             # A known trafficking phone number is a direct hit.
             if datum["phone_number"] in trafficking_numbers:
                 self.save([datum], case_number)
             if using_naive_bayes:
                 # NB classifies the raw body (no preprocess step), as before.
                 if nb.classify(datum["text_body"]) == "trafficking":
                     self.save([datum], case_number)
             else:
                 for cl in cls:
                     if cl.classify(algorithms.preprocess(
                             datum["text_body"])) == "trafficking":
                         self.save([datum], case_number)
         time.sleep(700)  # wait ~ 12 minutes between passes
 def investigate(self, case_number):
     """Endlessly scrape ads and flag probable trafficking posts for a case.

     Every cycle: scrape ``self.base_urls``, rebuild the labeled training
     set from ``BackpageLogger``, classify each ad, persist matches with
     ``self.save``, sleep, repeat. Intended to run forever.
     """
     # A while-loop replaces the original recursive self-call, which would
     # eventually raise RecursionError once the call stack filled up.
     while True:
         data = self.scrape(links=self.base_urls, scraping_ads=True)
         positives = BackpageLogger.query.filter_by(is_trafficking=True).all()
         negatives = BackpageLogger.query.filter_by(is_trafficking=False).all()
         training_data = [(elem, "trafficking") for elem in positives]
         training_data += [(elem, "not trafficking") for elem in negatives]
         # Set for O(1) phone-number lookups in the loop below.
         trafficking_numbers = {elem.phone_number for elem in positives}
         # HACK: size-based classifier switch; totally a rule of thumb,
         # consider getting advice / changing this??
         using_naive_bayes = len(training_data) > 50
         if using_naive_bayes:
             nb = algorithms.naive_bayes(training_data)
         else:
             # Only train the ensemble when it will actually be consulted.
             cls = [
                 algorithms.svm(training_data),
                 algorithms.decision_tree(training_data),
             ]
         for datum in data:
             # Direct match against a known trafficking phone number.
             if datum["phone_number"] in trafficking_numbers:
                 self.save([datum], case_number)
             if using_naive_bayes:
                 # NB takes the raw text body, mirroring the original code.
                 if nb.classify(datum["text_body"]) == "trafficking":
                     self.save([datum], case_number)
             else:
                 for cl in cls:
                     text = algorithms.preprocess(datum["text_body"])
                     if cl.classify(text) == "trafficking":
                         self.save([datum], case_number)
         time.sleep(700)  # wait ~ 12 minutes between passes
Example 3
0
 def investigate(self):
     """Continuously scrape ads, classify them, and save trafficking hits.

     Each cycle re-scrapes ``self.base_urls``, rebuilds the training set
     from ``BackpageLogger``, classifies every ad, and saves matches via
     ``self.save_ads``. Runs forever by design.
     """
     # Loop instead of the original recursive self-call, which would hit
     # Python's recursion limit after ~1000 cycles.
     while True:
         data = self.scrape(self.base_urls)
         # BUG FIX: the second list previously *overwrote* the first
         # (`=` instead of `+=`), dropping all positive examples, and the
         # classifier calls referenced an undefined name `train`.
         training_data = [
             (elem, "trafficking")
             for elem in BackpageLogger.query.filter_by(is_trafficking=True).all()
         ]
         training_data += [
             (elem, "not trafficking")
             for elem in BackpageLogger.query.filter_by(is_trafficking=False).all()
         ]
         # Hoisted out of the per-ad loop: the threshold is loop-invariant.
         # HACK: totally a rule of thumb -- revisit this cutoff.
         use_ensemble = len(training_data) > 50
         if use_ensemble:
             cls = [
                 algorithms.svm(training_data),
                 algorithms.decision_tree(training_data),
             ]
         else:
             nb = algorithms.naive_bayes(training_data)
         for datum in data:
             if use_ensemble:
                 for cl in cls:
                     if cl.classify(algorithms.preprocess(
                             datum["text_body"])) == "trafficking":
                         self.save_ads([datum])
             else:
                 # NB classifies the raw body (no preprocess), as before.
                 if nb.classify(datum["text_body"]) == 'trafficking':
                     self.save_ads([datum])
         time.sleep(700)  # wait ~ 12 minutes between passes
from text_classify import algorithms
# Small labeled corpus for smoke-testing the text classifiers:
# each entry is a (text, label) pair.
testing = [
    ("Hello", "greeting"),
    ("Hi", "greeting"),
    ("Hello there", "greeting"),
    ("How are you?", "greeting"),
    ("Wazzup?", "greeting"),  # BUG FIX: was ("Wazzup?"),("greeting"), -- two stray strings, not a pair
    ("Hey!", "greeting"),
    ("hey.", "greeting"),
    ("hi.", "greeting"),
    ("Hi there", "greeting"),
    ("Heyy", "greeting"),
    ("Hello, how are you?", "greeting"),
    ("bye", "goodbye"),
    ("goodbye", "goodbye"),
    ("byee", "goodbye"),
    ("later", "goodbye"),
    ("bye bye", "goodbye"),
    ("adios", "goodbye"),
    ("ciao", "goodbye"),
    ("see ya", "goodbye"),
]
cl = algorithms.svm(testing)
# print() call form works on Python 3 (the original Python 2 `print`
# statement is a SyntaxError there).
print(cl.classify(algorithms.preprocess("byee")))
algorithms.cross_validate(testing, model="svm")
def test_svm():
    """An SVM trained on one greeting and one goodbye labels a greeting."""
    samples = [
        ("hello there", "greeting"),
        ("later", "goodbye"),
    ]
    classifier = algorithms.svm(samples)
    prepared = algorithms.preprocess("hello there friends")
    assert classifier.classify(prepared) == "greeting"
Example 6
0
def classify_query(query):
    """Classify *query* with an SVM trained on the pickled training data.

    Loads ``training_data.pickle`` from the working directory, trains an
    SVM on it, and returns the classifier's label for the preprocessed
    query string.
    """
    # "rb": pickle files are binary -- text mode breaks on Python 3.
    # ``with`` guarantees the file handle is closed.
    # NOTE(review): unpickling is unsafe on untrusted files; confirm this
    # pickle is produced locally.
    with open("training_data.pickle", "rb") as f:
        training_data = pickle.load(f)
    cl = algorithms.svm(training_data)
    classification = cl.classify(algorithms.preprocess(query))
    # TODO: add classification_sanity_check
    return classification
Example 7
0
def classify_query(query):
    """Return the SVM label for *query* using the pickled training set.

    Reads ``training_data.pickle`` from the current directory, fits an SVM,
    preprocesses the query, and returns the resulting classification.
    """
    # Binary mode ("rb") is required for pickle data; the original text-mode
    # open also leaked the file handle -- ``with`` closes it deterministically.
    # NOTE(review): pickle.load on untrusted input can execute arbitrary
    # code; verify the file's provenance.
    with open("training_data.pickle", "rb") as fh:
        training_data = pickle.load(fh)
    cl = algorithms.svm(training_data)
    classification = cl.classify(algorithms.preprocess(query))
    # TODO: add classification_sanity_check
    return classification
def test_svm():
    """Smoke test: a two-example SVM should recognize a greeting phrase."""
    labeled = [("hello there", "greeting"), ("later", "goodbye")]
    model = algorithms.svm(labeled)
    query = algorithms.preprocess("hello there friends")
    result = model.classify(query)
    assert result == "greeting"