Example no. 1
	def __init__(self, test=False):
		"Inits the script"

		Trace.info("Starting" + (" ", " test")[test] +" script...")
		# change paths and indexes in case of test
		if test:
			# path of the files
			self.hotels_file = os.path.join(self.filedir,"./data/hotels_test.csv")
			self.comments_file = os.path.join(self.filedir,"./data/comments_test.csv")
			self.bitext_file = os.path.join(self.filedir,"./data/bitext_tuipilot_test.csv")
			# indexes
			self.hotels_index = "test_hotels"
			self.comments_index = "test_comments"
			self.bitext_index = "test_bitext"
			self.bitext_unique_index = "test_bitext_unique"
			self.bitext_unique_posneg_index = "test_bitext_unique_posneg"
		
		# hotels first
		self.build_hotels_index()
		# then comments
		self.build_comments_index()
		# then the rest
		self.build_bitext_indexes()

		Trace.info(("S", "Test s")[test] + "cript finished.")
Example no. 2
    def process_package(self, package_name):
        "Processes a single npm package and upserts its document into Elasticsearch"
        _id = package_name.replace("/", "_")
        # grab npmjs registry information
        npm_registry_info = json.loads(requests.get("http://registry.npmjs.org/" + package_name).text)
        Trace.info("npm_registry_info processed ok")
        # grab npm-stat info for the last 30 days
        today = date.today()
        month_ago = today - timedelta(30)
        npm_stat_url = ("http://npm-stat.com/downloads/range/"
                        + date.strftime(month_ago, "%Y-%m-%d") + ":"
                        + date.strftime(today, "%Y-%m-%d") + "/" + package_name)
        npm_stat_info = json.loads(requests.get(npm_stat_url).text)
        Trace.info("npm_stat_info processed ok")
        # build the doc and feed elasticsearch
        # _type first: _type will be the repository type of the package, "no_repo" if there is none
        _type = "no_repo"
        if "repository" in npm_registry_info and "type" in npm_registry_info["repository"]:
            _type = npm_registry_info["repository"]["type"].replace("/", "_")
        # init document with versions
        document = {
            "versions": 0
        }
        if "versions" in npm_registry_info:
            document["versions"] = len(npm_registry_info["versions"].keys())
        # calculate average downloads over the period
        downloads = [0]
        if "downloads" in npm_stat_info and len(npm_stat_info["downloads"]) > 0:
            downloads = [item["downloads"] for item in npm_stat_info["downloads"]]
        document["average_downloads"] = sum(downloads) / len(downloads)
        # insert document
        Trace.info("about to upsert")
        Trace.info(json.dumps(self.elasticsearch.upsert_document(self._index, _type, _id, document)))
        Trace.info("upserted")
Example no. 3
    def build_npm_packages_index(self):

        global test_packages
        package_names = []
        # check if testing
        if test_packages is not None and len(test_packages) > 0:
            package_names = test_packages
            Trace.info("Testing. Packages reduced to: " + str(len(package_names)))
        else:  # not testing
            # get all the docs
            Trace.info("grabbing all packages from npm registry...")
            packages = json.loads(requests.get("https://skimdb.npmjs.com/registry/_all_docs").text)["rows"]
            package_names = [item["id"] for item in packages]
            Trace.info(str(len(package_names)) + " total packages grabbed")
        # apply offset
        package_names = package_names[self._offset:]
        Trace.info("Offset. Packages reduced to: " + str(len(package_names)))
        # go through them and feed elasticsearch
        for package_name in package_names:
            Trace.info("processing package: " + package_name)
            try:
                self.process_package(package_name)
            except:
                # log and keep going; one bad package must not stop the whole run
                print("Error processing package: " + package_name + ": " + str(sys.exc_info()[0]))
                continue
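Since the `_all_docs` fetch from skimdb returns one row per package in the entire registry, the `self._offset` slice lets an interrupted run be resumed where it stopped. Assuming the script is saved as, say, npm_script.py (the filename is hypothetical; the integer argument is parsed by the `__main__` block shown later), a resumed run could look like:

    python npm_script.py 120000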
Example no. 4
	def build_comments_index(self):
		Trace.info("Building comments index...")
		# typemap and replace rules: averageWebScore is an int; strip "." from the id fields
		comments_typemap = {"averageWebScore": int}
		comments_replace = [{"key": "commentId", "find": ".", "replace": ""}, {"key": "hotelSequence", "find": ".", "replace": ""}]
		# get the bulk of documents
		comments = CsvManager.read(self.comments_file, typemap=comments_typemap, replace=comments_replace)
		Trace.info(str(len(comments)) + " comments read")
		# bulk_upsert
		comments_upserted = self.elasticsearch.upsert_bulk(self.comments_index, "hotelSequence", "commentId", comments)
		Trace.info(str(comments_upserted) + " comments upserted in " + self.comments_index)
Example no. 5
	def build_hotels_index(self):
		Trace.info("Building hotels index...")
		# build the typemap: every column after the first three is treated as an int
		hotels_keys = CsvManager.read_keys(self.hotels_file)
		hotels_typemap = dict(zip(hotels_keys[3:], [int] * len(hotels_keys[3:])))
		hotels_replace = [{"key": "hotelSequence", "find": ".", "replace": ""}, {"key": "mailsEnviados", "find": ".", "replace": ""}]
		# get the bulk of documents
		hotels = CsvManager.read(self.hotels_file, typemap=hotels_typemap, replace=hotels_replace)
		Trace.info(str(len(hotels)) + " hotels read")
		# bulk_upsert
		hotels_upserted = self.elasticsearch.upsert_bulk(self.hotels_index, "destinationCode", "hotelSequence", hotels)
		Trace.info(str(hotels_upserted) + " hotels upserted in " + self.hotels_index)
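`CsvManager` is another project helper these examples rely on but do not define. A minimal sketch of the `read` interface as it is used here (the class body, the CSV dialect, and the semantics of `typemap` and `replace` are inferred from the calls above, not confirmed by the source):

import csv

class CsvManager(object):
    "Hypothetical sketch of the CsvManager helper used in these examples"

    @staticmethod
    def read_keys(path):
        # return the CSV header row as a list of column names
        with open(path) as f:
            return next(csv.reader(f))

    @staticmethod
    def read(path, typemap=None, replace=None):
        # read every row as a dict; apply the replace rules first
        # (e.g. turning "2,0" into "2.0"), then cast with the typemap
        rows = []
        with open(path) as f:
            for row in csv.DictReader(f):
                for rule in (replace or []):
                    row[rule["key"]] = row[rule["key"]].replace(rule["find"], rule["replace"])
                for key, cast in (typemap or {}).items():
                    row[key] = cast(row[key])
                rows.append(row)
        return rows

Note the ordering: replacements run before type casting, which matches the bitext example below, where "," is turned into "." so the score can be cast to float.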
Example no. 6
    def __init__(self, test=False, offset=0):
        "Inits the script"

        global test_packages

        Trace.info("Starting" + (" ", " test")[test] + " script...")
        # change paths and indexes in case of test
        if test:
            test_packages_file = os.path.join(self.filedir,"./data/test_npm_package_names")
            test_packages = [item["test_package_name"] for item in CsvManager.read(test_packages_file)]
            self._index = "test_npm_packages"
            Trace.info("test_packages: " + json.dumps(test_packages))

        # set offset
        self._offset = offset
        
        # build npm_packages_index
        self.build_npm_packages_index()

        Trace.info(("S", "Test s")[test] + "cript finished.")
Example no. 7
	def build_bitext_indexes(self):
		"Builds bitext, bitext_unique and bitext_unique_posneg indexes"
		Trace.info("Building bitext, bitext_unique and bitext_unique_posneg indexes...")
		# typemap and replace rules: scores use "," as the decimal separator,
		# so turn it into "." before casting to float
		bitext_replace = [{"key": "score", "find": ",", "replace": "."}]
		bitext_typemap = {"score": float}
		# get the bulk of bitexts
		bitexts = CsvManager.read(self.bitext_file, typemap=bitext_typemap, replace=bitext_replace)
		# iterate the bulk of bitexts and insert the element in each of the indexes
		for _id, bitext_item in enumerate(bitexts):
			# add info from hotels
			hotel = self.elasticsearch.read_document(self.hotels_index, "_all", bitext_item["hotelSequence"])
			if "found" in hotel and hotel["found"]:
				# add found hotel fields to bitext item
				bitext_item = dict(bitext_item.items() + hotel["_source"].items())
			# upsert element
			bitext_type = bitext_item["section"]
			del bitext_item["section"]
            		Trace.info("upserting bitext " + str(_id))
            		self.elasticsearch.upsert_document(self.bitext_index, bitext_type, str(_id), bitext_item)
			# update bitext_unique_posneg index
			previous_average_score = 0
			previous_count = 0
			previous_categories = ""
			separator = ""
			bitext_unique_posneg_id = bitext_item["commentId"] + bitext_type
			bitext_unique_posneg_item = self.elasticsearch.read_document(self.bitext_unique_posneg_index, "_all", bitext_unique_posneg_id)
			if "found" in bitext_unique_posneg_item and bitext_unique_posneg_item["found"]:
				previous_count = bitext_unique_posneg_item["_source"]["count"]
				previous_average_score = bitext_unique_posneg_item["_source"]["averageScore"]
				previous_categories = bitext_unique_posneg_item["_source"]["category"]
				separator = ", "
			bitext_unique_posneg_upsert_doc = {
				"section": bitext_type,
				"averageScore": 1.0*(previous_average_score*previous_count + bitext_item["score"])/(previous_count + 1),
				"count": previous_count + 1,
				"category": previous_categories + separator + bitext_item["category"]
			}
			# upsert
			self.elasticsearch.upsert_document(self.bitext_unique_posneg_index, bitext_item["hotelSequence"], bitext_unique_posneg_id, bitext_unique_posneg_upsert_doc)
			# update bitext_unique index
			previous_average_score = 0
			previous_count = 0
			previous_categories = ""
			separator = ""
			bitext_unique_id = bitext_item["commentId"]
			bitext_unique_item = self.elasticsearch.read_document(self.bitext_unique_index, "_all", bitext_unique_id)
			if "found" in bitext_unique_item and bitext_unique_item["found"]:
				previous_count = bitext_unique_item["_source"]["count"]
				previous_average_score = bitext_unique_item["_source"]["averageScore"]
				previous_categories = bitext_unique_item["_source"]["category"]
				separator = ", "
			bitext_unique_upsert_doc = {
				"averageScore": 1.0*(previous_average_score*previous_count + bitext_item["score"])/(previous_count + 1),
				"count": previous_count + 1,
				"category": previous_categories + separator + bitext_item["category"]
			}
			# look for the comment in the comment index
			comment = self.elasticsearch.read_document(self.comments_index, "_all", bitext_unique_id)
			if "found" in comment and comment["found"]:
				# add found comment averageWebScore to bitext unique item
				bitext_unique_upsert_doc["averageWebScore"] = comment["_source"]["averageWebScore"]
                		bitext_unique_upsert_doc["scoresDiff"] = bitext_unique_upsert_doc["averageScore"] - bitext_unique_upsert_doc["averageWebScore"]
                		bitext_unique_upsert_doc["scoresAbsDiff"] = math.fabs(bitext_unique_upsert_doc["scoresDiff"])

			# upsert
			self.elasticsearch.upsert_document(self.bitext_unique_index, bitext_item["hotelSequence"], bitext_unique_id, bitext_unique_upsert_doc)
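The `averageScore` update used in both unique indexes is an incremental mean: given the stored average over n items and one new score x, the new average is (avg*n + x) / (n + 1), so no per-comment score history has to be kept in Elasticsearch. A standalone sketch of the same update:

def updated_average(previous_average, previous_count, new_score):
    # incremental mean: equivalent to recomputing the mean over all scores,
    # but only the running average and the count need to be stored
    return 1.0 * (previous_average * previous_count + new_score) / (previous_count + 1)

# example: scores 2.0 and 4.0 arrive one at a time
avg = updated_average(0, 0, 2.0)    # 2.0
avg = updated_average(avg, 1, 4.0)  # 3.0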
Example no. 8
		self.assertEqual(last_bitext["_source"]["score"], 2.0)
		self.assertEqual(last_bitext["_source"]["mailsEnviados"], 37)
		# test bitext_unique_posneg index
		bitext330956POS = self.elasticsearch.read_document("test_bitext_unique_posneg", "69559", "330956POS")
		self.assertTrue(bitext330956POS["found"])
		self.assertEqual(bitext330956POS["_source"]["averageScore"], 2.0)
		# test bitext_unique index
		bitext330956 = self.elasticsearch.read_document("test_bitext_unique", "69559", "330956")
		self.assertTrue(bitext330956["found"])
		self.assertEqual(bitext330956["_source"]["averageScore"], 2.0)
		self.assertEqual(bitext330956["_source"]["averageWebScore"], 5)
		self.assertEqual(bitext330956["_source"]["scoresDiff"], -3.0)
		self.assertEqual(bitext330956["_source"]["scoresAbsDiff"], 3.0)

	def tearDown(self):
		# delete indexes
		self.elasticsearch.remove_index("test_hotels")
		self.elasticsearch.remove_index("test_comments")
		self.elasticsearch.remove_index("test_bitext")
		self.elasticsearch.remove_index("test_bitext_unique_posneg")
		self.elasticsearch.remove_index("test_bitext_unique")

if __name__ == '__main__':
	# unittest.main()
	if len(sys.argv) > 1 and sys.argv[1] == "test":
		Trace.info("test")
		unittest.main(argv=sys.argv[:1], exit=True)
	else:
		Trace.info("main")
		_Main()
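Both scripts share the same entry-point pattern: each file doubles as its own test suite. Passing `argv=sys.argv[:1]` to `unittest.main` deliberately strips the extra "test" argument so unittest does not try to parse it. Assuming the file is saved as, say, tui_script.py (the filename is hypothetical):

    python tui_script.py test   # run the unit tests
    python tui_script.py        # run the real script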
Example no. 9
    @unittest.skipIf(not elasticsearch.is_up(), "irrelevant test if there is no elasticsearch instance")
    def test_script(self):
        global test_packages
        _Main(test=True)
        # count documents
        self.assertTrue(self.elasticsearch.count_documents("test_npm_packages") > 0)
        # assert express
        express_package = self.elasticsearch.read_document("test_npm_packages", "_all", "express")
        self.assertTrue(express_package["found"])

    def tearDown(self):
        # delete indexes
        self.elasticsearch.remove_index("test_npm_packages")

if __name__ == '__main__':
    #unittest.main()
    if len(sys.argv) > 1 and sys.argv[1] == "test":
        Trace.info("test")
        unittest.main(argv=sys.argv[:1], exit=True)
    else:
        if len(sys.argv) > 1:
            try:
                offset = int(sys.argv[1])
            except ValueError:
                # the argument is not a number; report it instead of failing silently
                Trace.info("invalid offset argument: " + sys.argv[1])
            else:
                Trace.info("main with offset: " + str(offset))
                _Main(offset=offset)
        else:
            Trace.info("main")
            _Main()