Example #1
    def investigate(self):
        # Scrape, classify, save, sleep, repeat. Written as a while-loop
        # rather than the original tail call, which would eventually hit
        # Python's recursion limit; the intent is to run forever.
        # (Module level, assumed: import time; CRUD, Ads, TrainData, NBC,
        # DTC, and doc_comparison come from the project.)
        while True:
            data = self.scrape(self.base_urls)
            train_crud = CRUD("sqlite:///database.db", Ads, "ads")
            # dummy (negative) data generated with http://www.dummytextgenerator.com/#jump
            dummy_crud = CRUD("sqlite:///database.db", TrainData, "training_data")
            train = train_crud.get_all()
            dummy = dummy_crud.get_all()
            # all documents known to involve trafficking
            t_docs = [elem.text for elem in train]
            train = ([(elem.text, "trafficking") for elem in train] +
                     [(elem.text, "not trafficking") for elem in dummy])
            cls = []
            # TODO: make use of tf-idf here; see
            # http://scikit-learn.org/0.11/auto_examples/document_classification_20newsgroups.html
            cls.append(NBC(train))
            cls.append(DTC(train))
            for datum in data:
                for cl in cls:
                    if cl.classify(datum["text_body"]) == "trafficking":
                        self.save_ads([datum])
                # similarity check against the known documents, so the
                # results don't have to be eyeballed by hand
                if self.doc_comparison(datum["text_body"], t_docs) == "trafficking":
                    self.save_ads([datum])
            time.sleep(700)  # wait ~12 minutes between scraping passes
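The tf-idf TODO above could be filled in with scikit-learn, roughly along the lines of the linked 20newsgroups example. A minimal sketch, assuming (text, label) pairs shaped like the train list that investigate() builds; TfidfVectorizer and MultinomialNB are standard scikit-learn classes, but the sample documents and the classify() helper are illustrative only:

# Sketch of the tf-idf TODO, assuming scikit-learn is installed.
# The (text, label) pairs mirror the `train` list in investigate();
# the two sample documents are placeholders, not real data.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB

train = [("sample trafficking ad text", "trafficking"),
         ("sample dummy filler text", "not trafficking")]
texts, labels = zip(*train)

vectorizer = TfidfVectorizer(sublinear_tf=True, stop_words="english")
X = vectorizer.fit_transform(texts)  # documents -> tf-idf weighted vectors
clf = MultinomialNB().fit(X, list(labels))

def classify(text):
    # Vectorize new text with the *fitted* vectorizer, then predict a label.
    return clf.predict(vectorizer.transform([text]))[0]

print(classify("sample trafficking ad text"))  # expected: "trafficking"

doc_comparison() is never defined on this page; one plausible reading, sketched here purely as a hypothetical stand-in, is a similarity gate that flags a document when it is close enough to any known trafficking document:

# Hypothetical stand-in for the undefined doc_comparison() helper.
# The 0.8 cosine-similarity threshold is a guess, not a project value.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

def doc_comparison(text, t_docs, threshold=0.8):
    vectors = TfidfVectorizer().fit_transform(t_docs + [text])
    # similarity of the new document (last row) to each known document
    scores = cosine_similarity(vectors[-1], vectors[:-1])
    return "trafficking" if scores.max() >= threshold else "not trafficking"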
Example #2
    def save_ads(self, data, site):
        # (Module level, assumed: import json; CRUD and Ads come from the project.)
        crud = CRUD("sqlite:///database.db", table=site)

        for datum in data:
            ad = Ads()
            ad.title = datum["title"]
            ad.phone_numbers = json.dumps(datum["phone_numbers"])
            ad.text_body = datum["text_body"]
            # TODO: change this so actual pictures are saved to the database,
            # not just their URLs
            ad.photos = json.dumps(datum["images"])
            ad.link = datum["link"]
            ad.posted_at = datum["posted_at"]
            ad.scraped_at = datum["scraped_at"]
            ad.language = datum["language"]
            ad.polarity = datum["polarity"]
            ad.translated_body = datum["translated_body"]
            ad.translated_title = datum["translated_title"]
            ad.subjectivity = datum["subjectivity"]
            crud.insert(ad)
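The CRUD helper itself does not appear on this page; only its constructor, get_all(), and insert() are visible at the call sites. A minimal sketch of what such a wrapper might look like on top of SQLAlchemy, where the class name, connection string, and method names come from the examples and everything else is an assumption:

# Hypothetical reconstruction of the CRUD wrapper used above,
# inferred purely from how the examples call it.
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker

class CRUD:
    def __init__(self, conn_string, model=None, table=None):
        self.engine = create_engine(conn_string)
        self.model = model    # mapped class, e.g. Ads or TrainData
        self.table = table    # table name, kept for parity with the call sites
        self.Session = sessionmaker(bind=self.engine)

    def get_all(self):
        # Return every row of the mapped model.
        session = self.Session()
        try:
            return session.query(self.model).all()
        finally:
            session.close()

    def insert(self, obj):
        # Persist one mapped object, e.g. an Ads instance.
        session = self.Session()
        try:
            session.add(obj)
            session.commit()
        finally:
            session.close()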