Example 1
    def init_training(self, shill_filepath, notshill_filepath):

        with open(shill_filepath, 'r') as fs:
            s_content = fs.readlines()
        shill_targets = [x.strip() for x in s_content]

        notshill_targets = []
        if notshill_filepath:
            with open(notshill_filepath, 'r') as fns:
                ns_content = fns.readlines()
            notshill_targets = [x.strip() for x in ns_content]

        corpus = []
        for shill in shill_targets:
            worker = BasicUserParseWorker(shill)
            result, root = worker.run(training_label=LABEL_SHILL, local=True)
            corpus += result

        for notshill in notshill_targets:
            worker = BasicUserParseWorker(notshill)
            result, root = worker.run(training_label=LABEL_NOTSHILL,
                                      local=True)
            corpus += result

        self.train_classifier({'data': corpus})
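For context, here is a minimal usage sketch of the method above. The file names and the `classifier` instance are hypothetical stand-ins; each file is assumed to hold one target per line (a Reddit profile URL), matching the readlines()/strip() handling in the code.

    # Hypothetical input files, one target per line, e.g.:
    #   https://www.reddit.com/user/Chrikelnel
    # 'classifier' stands in for whatever object defines init_training().
    classifier.init_training('shill_targets.txt', 'notshill_targets.txt')

    # notshill_filepath may be None or empty; then only shill targets are used.
    classifier.init_training('shill_targets.txt', None)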
Example 2
    def send_mother(self):

        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")

        # As scraped, this snippet assumes the worker exposes recv() and that a
        # send_to_mother() helper is in scope; neither is shown in these examples.
        data = worker.recv(self.buff_size)
        original_target = None
        send_to_mother(self, data, original_target)
Example 3
    def test_worker_max_links(self):
        """
        Purpose: Test that the current links count is properly set to 0 before running.
        :return:
        """
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")
        self.assertEqual(0, worker.cur_links)
Example 4
    def test_worker_add_links_in_crawled(self):
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")
        worker.crawled = []

        len_to_crawl_before = len(worker.to_crawl)
        # The seed link is already queued in to_crawl, so re-adding it should
        # leave the list unchanged.
        worker.add_links(["https://www.reddit.com/user/Chrikelnel"])
        len_to_crawl_after = len(worker.to_crawl)

        self.assertEqual(len_to_crawl_after, len_to_crawl_before)

    def test_worker_contact(self):

        # handle_worker_contact, worker, and address are assumed to come from
        # the surrounding test fixture; the stray trailing colon in the
        # original snippet was a syntax error.
        contact = handle_worker_contact(self, worker, address)

        self.assertRaises(ConnectionRefusedError, worker.run)

Example 5
    def test_worker_invalid_links(self):
        """
        Purpose: Test running of Worker when it is given an invalid link to crawl (a link that returns 404).
        Expectation: WorkerException is raised.
        """
        # The following link, http://gdalskjfakl.com/, was invalid at the time this test was written.
        worker = BasicUserParseWorker("http://gdalskjfakl.com/")
        self.assertRaises(WorkerException, worker.run)
Example 6
    def test_worker_url(self):
        """
        Purpose: Test the URL of the worker before it is run.
        :return:
        """
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")
        self.assertEqual("https://www.reddit.com/user/Chrikelnel",
                         worker.to_crawl[0])
Example 7
    def test_worker_add_links_empty_list(self):
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")

        before_links = len(worker.to_crawl)
        worker.add_links([])
        after_links = len(worker.to_crawl)

        self.assertEqual(before_links, after_links)
Example 9
    def test_worker_improper_link(self):
        """
        Purpose: Test that improper links raise an exception.
        Expectation: Start up the system, fail to hit the Reddit user, raise an exception.

        :return:
        """
        # The embedded space makes this link improper on purpose.
        worker = BasicUserParseWorker("https://www.reddit.com /user/Chrikelnel")
        self.assertRaises(WorkerException, worker.run)
    def test_worker_add_links(self):
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")

        len_to_crawl_before = len(worker.to_crawl)
        worker.add_links("test.com")
        len_to_crawl_after = len(worker.to_crawl)

        self.assertGreater(len_to_crawl_after, len_to_crawl_before)
Example 11
    def test_worker_clears_variables(self):
        worker = BasicUserParseWorker(None)

        # Verify that to_crawl can be reset to an empty list.
        worker.to_crawl = []

        self.assertEqual(0, len(worker.to_crawl))
Example 12
    def test_worker_link_delay(self):
        """
        Purpose: Test that the link_delay default value is set correctly.

        :return:
        """
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")

        self.assertEqual(0.25, worker.link_delay)
Example 13
    def test_worker_fails_on_nonexistent_page(self):
        """
        Purpose: Test failure handling of worker.
        Expectation: worker raises exception when given a url that returns a non-200 response / no response.

        :precondition: not_a_proper_url does not resolve
        :return:
        """
        worker = BasicUserParseWorker("not_a_proper_url")
        self.assertRaises(WorkerException, worker.run)
Example 14
    def test_worker_duplicate_links(self):
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")
        worker.crawled = []
        len_initial = len(worker.to_crawl)

        worker.crawled.append("https://www.reddit.com/user/Chrikelnel")
        worker.add_links(["https://www.reddit.com/user/Chrikelnel"])
        len_after_adding_duplicate = len(worker.to_crawl)

        self.assertEqual(len_after_adding_duplicate, len_initial)
Example 15
    def test_URL_reachability(self):
        """
        Purpose: Test that an error is raised if the URL cannot be reached.
        Expectation: Start up the system, fail to find the user (exception).
        :precondition: URL not reachable
        :return:
        """
        worker = BasicUserParseWorker("https://www.reddit.com/user/badLinkBadUser")

        # Can't reach the URL. In Python 3, IOError is an alias of OSError, so
        # this also passes if run() raises a ConnectionRefusedError.
        self.assertRaises(IOError, worker.run)
Example 16
    def test_worker_adding_new_links(self):
        """
        Purpose: Test adding new links to the to_crawl list.
        Expectation: The new link is added to the to_crawl list and the length of the list increases.

        :return:
        """
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")

        len_before = len(worker.to_crawl)
        worker.add_links("https://www.reddit.com/user/Groggen2")
        self.assertGreater(len(worker.to_crawl), len_before)
Example 17
    def test_worker_cannot_add_already_crawled_links(self):
        """
        Adding a link that has already been crawled does not change the to_crawl length.
        """
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")
        worker.crawled = ["https://www.reddit.com/user/Chrikelnel"]

        len_to_crawl_before = len(worker.to_crawl)
        worker.add_links(["https://www.reddit.com/user/Chrikelnel"])
        len_to_crawl_after = len(worker.to_crawl)

        self.assertEqual(len_to_crawl_after, len_to_crawl_before)
Example 18
    def test_basic_worker_connection(self):
        """
        Purpose: Test regular running of worker
        Expectation: startup system, hit the reddit user and parse the data, fail to send to mothership (exception)

        :precondition: Mothership server not running
        :return:
        """
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")

        # Can't connect to mother, so should raise ConnectionRefusedError, but should run everything else
        self.assertRaises(ConnectionRefusedError, worker.run)
    def test_worker_add_links_max_limit(self):
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")

        worker.max_links = 0  # With the limit at 0, no new links may be added
        before = worker.to_crawl[:]
        worker.add_links("test.com")
        after = worker.to_crawl[:]

        self.assertEqual(before, after)
Example 20
    def test_worker_add_links_under_max_limit(self):
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")

        worker.max_links = 7  # Raise the limit so the new link fits
        len_to_crawl_before = len(worker.to_crawl)
        worker.add_links("test.com")
        len_to_crawl_after = len(worker.to_crawl)

        self.assertNotEqual(
            len_to_crawl_after,
            len_to_crawl_before)  # Check that add_links adds links successfully
Example 21
    def test_worker_parsing_next_page(self):
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")
        file_path = '%s/%s' % (os.path.dirname(os.path.realpath(__file__)),
                               'test_resources/sample_GET_response.html')

        with codecs.open(file_path, encoding='utf-8') as f:
            text = f.read()

        results, next_page = worker.parse_text(
            str(text).strip().replace('\r\n', ''))

        self.assertIsNotNone(next_page)
        self.assertGreater(len(next_page), 0)
    def test_zelan_test_two(self):
        """
        Test that an added link appears in the to_crawl list.
        """
        worker = BasicUserParseWorker("https://www.reddit.com")
        worker.crawled = []

        worker.add_links(["https://www.reddit.com/user/Chrikelnel"])

        self.assertIn("https://www.reddit.com/user/Chrikelnel",
                      worker.to_crawl)
Example 23
    def test_worker_add_results_components(self):
        # Test that all three result components are properly added to results.
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")
        file_path = '%s/%s' % (os.path.dirname(os.path.realpath(__file__)), 'test_resources/sample_GET_response.html')

        with codecs.open(file_path, encoding='utf-8') as f:
            text = f.read()

        results, next_page = worker.parse_text(str(text).strip().replace('\r\n', ''))

        self.assertGreater(len(results[0]), 0)
        self.assertGreater(len(results[1]), 0)
        self.assertGreater(len(results[2]), 0)
Example 24
    def test_add_multiple_links(self):
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")
        worker.crawled = []

        len_to_crawl_before = len(worker.to_crawl)

        worker.add_links([
            "https://www.reddit.com/user/Chrikelnel", "https://www.google.ca",
            "https://hotmail.com"
        ])
        len_to_crawl_after = len(worker.to_crawl)

        # Only two links are new; the Chrikelnel link is already queued in to_crawl.
        self.assertEqual(len_to_crawl_after, len_to_crawl_before + 2)
Example 25
    def test_worker_adding_duplicate_links(self):
        """
        Purpose: Test adding duplicate links to the to_crawl list. (Fixed version of the code above, provided by Caleb Shortt)
        Expectation: The link is not added to the to_crawl list and the length of the list remains the same.

        :return:
        """
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")

        # The worker is already seeded with this link, so adding it again
        # should leave to_crawl unchanged.
        len_before = len(worker.to_crawl)
        worker.add_links("https://www.reddit.com/user/Chrikelnel")
        self.assertEqual(len(worker.to_crawl), len_before)

    def test_zelan_test_four(self):
        """
        Test that adding two distinct links increases the to_crawl length by 2.
        """
        worker = BasicUserParseWorker("https://www.reddit.com")
        worker.crawled = []

        len_to_crawl_before = len(worker.to_crawl)
        worker.add_links([
            "https://www.reddit.com/user/Chrikelnel", "https://www.google.ca"
        ])
        len_to_crawl_after = len(worker.to_crawl)
        self.assertEqual(len_to_crawl_after, len_to_crawl_before + 2)
    def test_zelan_test_one(self):
        """
        Test that adding a single link increases the to_crawl length by 1.
        """
        worker = BasicUserParseWorker("https://www.reddit.com")
        worker.crawled = []
        worker.max_links = 5

        len_to_crawl_before = len(worker.to_crawl)
        worker.add_links(["https://www.reddit.com/user/Chrikelnel"])
        len_to_crawl_after = len(worker.to_crawl)

        self.assertEqual(len_to_crawl_after, len_to_crawl_before + 1)
Example 28
    def test_worker_add_links_in_crawled(self):
        """
        calling add_links() with one link on a worker increases the to_crawl length by 1

        This unit test was partially implemented in class but was broken
        It is now fixed
        """
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")
        worker.crawled = []

        len_to_crawl_before = len(worker.to_crawl)
        worker.add_links(["https://www.reddit.com/user/GallowBoob"])
        len_to_crawl_after = len(worker.to_crawl)

        self.assertEqual(len_to_crawl_after, len_to_crawl_before + 1)
Example 29
    def test_worker_cannot_add_duplicate_links(self):
        """
        calling add_links() with two identical links only adds 1 link
        """
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")
        worker.crawled = []

        len_to_crawl_before = len(worker.to_crawl)
        worker.add_links([
            "https://www.reddit.com/user/GallowBoob",
            "https://www.reddit.com/user/GallowBoob"
        ])
        len_to_crawl_after = len(worker.to_crawl)

        self.assertEqual(len_to_crawl_after, len_to_crawl_before + 1)
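Taken together, the tests above pin down an implicit contract for add_links(): it accepts either a single URL string or a list of URLs, skips links already queued in to_crawl or already present in crawled, and adds nothing once max_links is reached. The actual BasicUserParseWorker implementation is not shown in these examples; the following is a minimal sketch consistent with that contract, using only attribute names that appear in the tests.

    # Hypothetical sketch, not the real implementation.
    def add_links(self, links):
        # The tests pass both bare strings and lists; normalize to a list.
        if isinstance(links, str):
            links = [links]

        for link in links:
            # Honour the crawl budget exercised by the max_links tests.
            if len(self.to_crawl) >= self.max_links:
                break
            # Skip duplicates: anything already queued or already crawled.
            if link in self.to_crawl or link in self.crawled:
                continue
            self.to_crawl.append(link)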
Example 30
    def test_worker_parsing(self):
        """
        Purpose: Test regular parsing mechanisms of worker
        Expectation: Load html file, send it to worker to parse, should return list of results

        :return:
        """
        worker = BasicUserParseWorker("https://www.reddit.com/user/Chrikelnel")
        file_path = '%s/%s' % (os.path.dirname(os.path.realpath(__file__)), 'test_resources/sample_GET_response.html')

        with codecs.open(file_path, encoding='utf-8') as f:
            text = f.read()

        results, next_page = worker.parse_text(str(text).strip().replace('\r\n', ''))

        self.assertGreater(len(results), 0)     # Check that results are returned
        self.assertEqual(len(results[0]), 3)    # Check that results are in triplets (check formatting)
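The two parsing tests imply a small contract for parse_text(): it takes the raw page HTML and returns a (results, next_page) pair, where results is a list of 3-tuples and next_page links to the following page of the user's history. A hedged consumption sketch follows; what each triplet field holds is an assumption, since the examples never name the fields.

    # Hypothetical usage of the contract implied by the tests above.
    results, next_page = worker.parse_text(text)
    for first, second, third in results:  # triplets, per test_worker_parsing
        print(first, second, third)
    if next_page:
        worker.add_links(next_page)       # follow pagination, as a worker might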