def investigate(self, case_number):
    """Continuously scrape ads, classify them, and save suspected trafficking ads.

    Each pass scrapes self.base_urls, rebuilds the training set from labelled
    BackpageLogger rows, flags ads whose phone number matches a known
    trafficking number or whose text classifies as trafficking, then sleeps
    and repeats forever.

    :param case_number: case identifier passed through to self.save().
    """
    # Loop instead of tail-recursing: Python has no tail-call optimization,
    # so the original `self.investigate(case_number)` self-call would grow
    # the stack on every pass and eventually raise RecursionError.
    while True:
        data = self.scrape(links=self.base_urls, scraping_ads=True)
        training_data = [
            (elem, "trafficking")
            for elem in BackpageLogger.query.filter_by(is_trafficking=True).all()
        ]
        training_data += [
            (elem, "not trafficking")
            for elem in BackpageLogger.query.filter_by(is_trafficking=False).all()
        ]
        trafficking_numbers = [
            elem.phone_number
            for elem in BackpageLogger.query.filter_by(is_trafficking=True).all()
        ]
        cls = [
            algorithms.svm(training_data),
            algorithms.decision_tree(training_data),
        ]
        # HACK: only switch to naive bayes with enough labelled data; the
        # threshold of 50 is a rule of thumb -- consider getting advice /
        # changing this.
        using_naive_bayes = len(training_data) > 50
        if using_naive_bayes:
            nb = algorithms.naive_bayes(training_data)
        for datum in data:
            # A known trafficking phone number is saved regardless of what
            # the text classifiers say.
            if datum["phone_number"] in trafficking_numbers:
                self.save([datum], case_number)
            if not using_naive_bayes:
                for cl in cls:
                    if cl.classify(algorithms.preprocess(datum["text_body"])) == "trafficking":
                        self.save([datum], case_number)
            else:
                if nb.classify(datum["text_body"]) == "trafficking":
                    self.save([datum], case_number)
        time.sleep(700)  # wait ~12 minutes between passes (consider changing this)
def investigate(self, case_number):
    """Continuously scrape ads, classify them, and save suspected trafficking ads.

    Each pass scrapes self.base_urls, rebuilds the training set from labelled
    BackpageLogger rows, flags ads whose phone number matches a known
    trafficking number or whose text classifies as trafficking, then sleeps
    and repeats forever.

    :param case_number: case identifier passed through to self.save().
    """
    # Loop instead of tail-recursing: Python has no tail-call optimization,
    # so the original `self.investigate(case_number)` self-call would grow
    # the stack on every pass and eventually raise RecursionError.
    while True:
        data = self.scrape(links=self.base_urls, scraping_ads=True)
        training_data = [
            (elem, "trafficking")
            for elem in BackpageLogger.query.filter_by(is_trafficking=True).all()
        ]
        training_data += [
            (elem, "not trafficking")
            for elem in BackpageLogger.query.filter_by(is_trafficking=False).all()
        ]
        trafficking_numbers = [
            elem.phone_number
            for elem in BackpageLogger.query.filter_by(is_trafficking=True).all()
        ]
        cls = [
            algorithms.svm(training_data),
            algorithms.decision_tree(training_data),
        ]
        # HACK: only switch to naive bayes with enough labelled data; the
        # threshold of 50 is a rule of thumb -- consider getting advice /
        # changing this.
        using_naive_bayes = len(training_data) > 50
        if using_naive_bayes:
            nb = algorithms.naive_bayes(training_data)
        for datum in data:
            # A known trafficking phone number is saved regardless of what
            # the text classifiers say.
            if datum["phone_number"] in trafficking_numbers:
                self.save([datum], case_number)
            if not using_naive_bayes:
                for cl in cls:
                    if cl.classify(algorithms.preprocess(datum["text_body"])) == "trafficking":
                        self.save([datum], case_number)
            else:
                if nb.classify(datum["text_body"]) == "trafficking":
                    self.save([datum], case_number)
        time.sleep(700)  # wait ~12 minutes between passes (consider changing this)
def investigate(self):
    """Continuously scrape ads, classify them, and save suspected trafficking ads.

    Each pass scrapes self.base_urls, rebuilds the training set from labelled
    BackpageLogger rows, classifies every scraped ad, saves the suspicious
    ones via self.save_ads(), then sleeps and repeats forever.
    """
    # Loop instead of tail-recursing: Python has no tail-call optimization,
    # so the original `self.investigate()` self-call would grow the stack on
    # every pass and eventually raise RecursionError.
    while True:
        data = self.scrape(self.base_urls)
        # Bug fix: the second assignment previously used `=` instead of `+=`,
        # overwriting the positive examples so the classifiers only ever saw
        # "not trafficking" samples.
        training_data = [
            (elem, "trafficking")
            for elem in BackpageLogger.query.filter_by(is_trafficking=True).all()
        ]
        training_data += [
            (elem, "not trafficking")
            for elem in BackpageLogger.query.filter_by(is_trafficking=False).all()
        ]
        # Bug fix: the original referenced an undefined name `train`; the
        # training set built above is `training_data`.
        cls = [
            algorithms.svm(training_data),
            algorithms.decision_tree(training_data),
        ]
        nb = algorithms.naive_bayes(training_data)
        for datum in data:
            if len(training_data) > 50:  # totally a hack / rule of thumb
                for cl in cls:
                    if cl.classify(algorithms.preprocess(datum["text_body"])) == "trafficking":
                        self.save_ads([datum])
            else:
                if nb.classify(datum["text_body"]) == "trafficking":
                    self.save_ads([datum])
        time.sleep(700)  # wait ~12 minutes between passes (consider changing this)
from text_classify import algorithms

# Labelled (text, label) training pairs for a toy greeting/goodbye classifier.
testing = [
    ("Hello", "greeting"),
    ("Hi", "greeting"),
    ("Hello there", "greeting"),
    ("How are you?", "greeting"),
    # Bug fix: this entry was previously `("Wazzup?"),("greeting")` -- two
    # bare strings instead of one (text, label) tuple, which would break any
    # code unpacking the pairs.
    ("Wazzup?", "greeting"),
    ("Hey!", "greeting"),
    ("hey.", "greeting"),
    ("hi.", "greeting"),
    ("Hi there", "greeting"),
    ("Heyy", "greeting"),
    ("Hello, how are you?", "greeting"),
    ("bye", "goodbye"),
    ("goodbye", "goodbye"),
    ("byee", "goodbye"),
    ("later", "goodbye"),
    ("bye bye", "goodbye"),
    ("adios", "goodbye"),
    ("ciao", "goodbye"),
    ("see ya", "goodbye"),
]

cl = algorithms.svm(testing)
# Parenthesized so the script runs under both Python 2 and Python 3.
print(cl.classify(algorithms.preprocess("byee")))
algorithms.cross_validate(testing, model="svm")
def test_svm():
    """An SVM trained on a tiny corpus should label a greeting as 'greeting'."""
    corpus = [
        ("hello there", "greeting"),
        ("later", "goodbye"),
    ]
    classifier = algorithms.svm(corpus)
    sample = algorithms.preprocess("hello there friends")
    assert classifier.classify(sample) == "greeting"
from text_classify import algorithms

# Smoke test: train an SVM on two labelled samples and check that a
# greeting-like query is classified as "Phil".
testing = [("hello there", "Phil"), ("later", "Gena")]
cl = algorithms.svm(testing)
test = algorithms.preprocess("hello there friends")
# Bug fix: parenthesized the print so the script runs under both Python 2
# (Python 2-only `print` statement before) and Python 3.
print(cl.classify(test) == "Phil")
def classify_query(query):
    """Classify *query* with an SVM trained on pickled training data.

    :param query: raw query text to classify.
    :returns: the label predicted by the SVM.
    """
    # Bug fixes: open the pickle in binary mode ("rb" -- required by pickle
    # on Python 3, harmless on Python 2) and use a context manager so the
    # file handle is always closed instead of leaked.
    with open("training_data.pickle", "rb") as f:
        training_data = pickle.load(f)
    cl = algorithms.svm(training_data)
    classification = cl.classify(algorithms.preprocess(query))
    # TODO: add classification_sanity_check
    return classification
def classify_query(query):
    """Classify *query* with an SVM trained on pickled training data.

    :param query: raw query text to classify.
    :returns: the label predicted by the SVM.
    """
    # Bug fixes: open the pickle in binary mode ("rb" -- required by pickle
    # on Python 3, harmless on Python 2) and use a context manager so the
    # file handle is always closed instead of leaked.
    with open("training_data.pickle", "rb") as f:
        training_data = pickle.load(f)
    cl = algorithms.svm(training_data)
    classification = cl.classify(algorithms.preprocess(query))
    # TODO: add classification_sanity_check
    return classification
from text_classify import algorithms

# Smoke test: train an SVM on two labelled samples and check that a
# greeting-like query is classified as "Phil".
testing = [("hello there", "Phil"), ("later", "Gena")]
cl = algorithms.svm(testing)
test = algorithms.preprocess("hello there friends")
# Bug fix: parenthesized the print so the script runs under both Python 2
# (Python 2-only `print` statement before) and Python 3.
print(cl.classify(test) == "Phil")
def test_svm():
    """An SVM trained on a tiny corpus should label a greeting as 'greeting'."""
    corpus = [
        ("hello there", "greeting"),
        ("later", "goodbye"),
    ]
    classifier = algorithms.svm(corpus)
    sample = algorithms.preprocess("hello there friends")
    assert classifier.classify(sample) == "greeting"