def build_npm_packages_index(self):

        global test_packages
        package_names = []
        # check if testing
        if test_packages:
            package_names = test_packages
            Trace.info("Testing. Packages reduced to: " + str(len(package_names)))
        else: #not testing
            # get all the docs
            Trace.info("grabbing all packages from npm registry...")
            packages = json.loads(requests.get("https://skimdb.npmjs.com/registry/_all_docs").text)["rows"]
            package_names = [item["id"] for item in packages]
            Trace.info(str(len(package_names)) + " total packages grabbed")
        # apply offset
        package_names = package_names[self._offset:]
        Trace.info("Offset. Packages reduced to: " + str(len(package_names)))
        # go through them and feed elasticsearch
        for package_name in package_names:
            Trace.info("processing package: " + package_name)
            try:
                self.process_package(package_name)
            except Exception:
                Trace.error("Error processing package: " + package_name + ": " + str(sys.exc_info()[0]))
                continue
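skimdb.npmjs.com serves a CouchDB replica of the npm registry, so _all_docs is expected to return rows of {id, key, value} objects; a minimal sketch of the id extraction above, under that assumed response shape:

import json

# Hypothetical excerpt of a CouchDB _all_docs response (shape assumed, not verified):
response_text = '{"total_rows": 2, "offset": 0, "rows": [{"id": "express", "key": "express", "value": {"rev": "1-a"}}, {"id": "lodash", "key": "lodash", "value": {"rev": "1-b"}}]}'
rows = json.loads(response_text)["rows"]
package_names = [item["id"] for item in rows]
print(package_names)  # ['express', 'lodash']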
 def process_package(self, package_name):
     _id = package_name.replace("/","_")
     # grab npmjs registry information
     npm_registry_info = json.loads(requests.get("http://registry.npmjs.org/" + package_name).text) 
     Trace.info("npm_registry_info processed ok")
     # grab npm-stat_info
     today = date.today()
     month_ago = today - timedelta(days=30)
     npm_stat_url = ("http://npm-stat.com/downloads/range/"
                     + month_ago.strftime("%Y-%m-%d") + ":" + today.strftime("%Y-%m-%d")
                     + "/" + package_name)
     npm_stat_info = json.loads(requests.get(npm_stat_url).text)
     Trace.info("npm_stat_info processed ok")
     # build the doc and feed elasticsearch
     # _type first. _type will be the repo of the package. "no_repo" in case there is no repo.
     _type = "no_repo"
     if ("repository" in npm_registry_info and "type" in npm_registry_info["repository"]):
         _type = npm_registry_info["repository"]["type"].replace("/","_")
     # init document with versions
     document = {
         "versions": 0
     }
     if "versions" in npm_registry_info:
         document["versions"] = len(npm_registry_info["versions"].keys())
     # calculate downloads
     downloads = [0]
     if "downloads" in npm_stat_info and len(npm_stat_info["downloads"]) > 0:
         downloads = [item["downloads"] for item in npm_stat_info["downloads"]]
     document["average_downloads"] = 1.0 * sum(downloads) / len(downloads)
     # insert document
     Trace.info("about to upsert")
     Trace.info(json.dumps(self.elasticsearch.upsert_document(self._index, _type, _id, document)))
     Trace.info("upserted")
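To make the arithmetic above concrete: the npm-stat URL covers a 30-day window ending today, and average_downloads is the mean of the per-day counts. A sketch with fabricated numbers:

from datetime import date, timedelta

today = date.today()
month_ago = today - timedelta(days=30)
# Range segment of the URL, e.g. "2015-05-01:2015-05-31"
print(month_ago.strftime("%Y-%m-%d") + ":" + today.strftime("%Y-%m-%d"))

# Made-up per-day counts standing in for npm_stat_info["downloads"]:
downloads = [120, 80, 100]
average = 1.0 * sum(downloads) / len(downloads)
print(average)  # 100.0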
Example #3
	def __init__(self, test=False):
		"Inits the script"

		Trace.info("Starting" + (" ", " test")[test] + " script...")
		# change paths and indexes in case of test
		if test:
			# path of the files
			self.hotels_file = os.path.join(self.filedir,"./data/hotels_test.csv")
			self.comments_file = os.path.join(self.filedir,"./data/comments_test.csv")
			self.bitext_file = os.path.join(self.filedir,"./data/bitext_tuipilot_test.csv")
			# indexes
			self.hotels_index = "test_hotels"
			self.comments_index = "test_comments"
			self.bitext_index = "test_bitext"
			self.bitext_unique_index = "test_bitext_unique"
			self.bitext_unique_posneg_index = "test_bitext_unique_posneg"
		
		# hotels first
		self.build_hotels_index()
		# then comments
		self.build_comments_index()
		# then the rest
		self.build_bitext_indexes()

		Trace.info(("S", "Test s")[test] + "cript finished.")
Example #4
 def write(self, strings):
   "Write a list of strings"
   for string in strings:
     if not isinstance(string, basestring):
       Trace.error('Not a string: ' + unicode(string) + ' in ' + unicode(strings))
       return
     self.writestring(string)
Example #5
 def readall(self):
     "Read the whole file"
     for encoding in FileConfig.parsing['encodings']:
         try:
             return self.readcodec(encoding)
         except UnicodeDecodeError:
             pass
     Trace.error('No suitable encoding for ' + self.filename)
     return []
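This fallback loop is a reusable pattern; a standalone sketch (the encodings tuple here is an assumed stand-in for FileConfig.parsing['encodings']):

import codecs

def read_with_fallback(filename, encodings=('utf-8', 'latin-1')):
    "Try each encoding in turn; return the lines, or [] if none fits."
    for encoding in encodings:
        try:
            with codecs.open(filename, 'r', encoding) as f:
                return f.readlines()
        except UnicodeDecodeError:
            pass
    return []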
Example #6
 def readall(self):
     "Read the whole file"
     for encoding in FileConfig.parsing["encodings"]:
         try:
             return self.readcodec(encoding)
         except UnicodeDecodeError:
             pass
     Trace.error("No suitable encoding for " + self.filename)
     return []
Example #7
 def dotseparated(self, number):
   "Get the number separated by dots: 1.1.3"
   dotsep = ''
   if len(number) == 0:
     Trace.error('Empty number')
     return '.'
   for piece in number:
     dotsep += '.' + unicode(piece)
   return dotsep[1:]
Example #8
 def removebackdirs(self):
   "Remove any occurrences of ../ (or ..\ on Windows)"
   self.path = os.path.normpath(self.path)
   backdir = '..' + os.path.sep
   while self.path.startswith(backdir):
     Trace.debug('Backdir in: ' + self.path)
     self.path = self.path[len(backdir):]
   while self.url.startswith('../'):
     Trace.debug('Backdir in: ' + self.url)
     self.url = self.url[len('../'):]
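A quick trace of the stripping loop, assuming POSIX path separators:

import os

path = os.path.normpath('../../images/logo.png')  # unchanged on POSIX
backdir = '..' + os.path.sep
while path.startswith(backdir):
    path = path[len(backdir):]
print(path)  # 'images/logo.png'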
Example #9
 def gethtml(self, container):
   "Return the HTML code"
   html = []
   if container.contents is None:
     return html
   for element in container.contents:
     if not hasattr(element, 'gethtml'):
       Trace.error('No html in ' + element.__class__.__name__ + ': ' + unicode(element))
       return html
     html += element.gethtml()
   return html
Example #10
 def convert(self, filename, directory=''):
   "Convert the filename adding the appropriate directories."
   if os.path.exists(filename):
     return filename
   newname = os.path.join(self.directory, filename)
   if os.path.exists(newname):
     return newname
   newname = os.path.join(directory, filename)
   if os.path.exists(newname):
     return newname
   Trace.error('Missing file ' + filename)
   return None
Example #11
 def convert(self, filename, directory=''):
     "Convert the filename adding the appropriate directories."
     if os.path.exists(filename):
         return filename
     newname = os.path.join(self.directory, filename)
     if os.path.exists(newname):
         return newname
     newname = os.path.join(directory, filename)
     if os.path.exists(newname):
         return newname
     Trace.error('Missing file ' + filename)
     return None
Example #12
 def gethtml(self, container):
     "Return the HTML code"
     html = []
     if container.contents is None:
         return html
     for element in container.contents:
         if not hasattr(element, 'gethtml'):
             Trace.error('No html in ' + element.__class__.__name__ + ': ' +
                         unicode(element))
             return html
         html += element.gethtml()
     return html
Example #13
 def increase(self, number):
   "Increase the number (or letter)"
   if not isinstance(number, str):
     return number + 1
   if number == '-':
     index = 0
   elif number not in NumberGenerator.letters:
     Trace.error('Unknown letter numeration ' + number)
     return 0
   else:
     index = NumberGenerator.letters.index(number) + 1
   return self.letter(index)
Example #14
 def readline(self):
   "Read a line from file"
   self.current = self.file.readline()
   if not isinstance(self.file, codecs.StreamReaderWriter):
     self.current = self.current.decode('utf-8')
   if len(self.current) == 0:
     self.depleted = True
   self.current = self.current.rstrip('\n\r')
   self.linenumber += 1
   self.mustread = False
   Trace.prefix = 'Line ' + unicode(self.linenumber) + ': '
   if self.linenumber % 1000 == 0:
     Trace.message('Parsing')
Example #15
 def generateordered(self, type):
   "Generate ordered numbering: a number to use and possibly concatenate with others. Example: Chapter 1, Section 1.5."
   level = self.getlevel(type)
   if level == 0:
     Trace.error('Impossible level 0 for ' + type)
     return '.'
   if len(self.number) >= level:
     self.number = self.number[:level]
   else:
     while len(self.number) < level:
       self.number.append(0)
   self.number[level - 1] = self.increase(self.number[level - 1])
   return self.dotseparated(self.number)
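Hand-tracing the number list for "Chapter 1, Section 1.5"-style output (a sketch with plain ints, ignoring the letter numeration handled by increase()):

number = [1]        # state after 'Chapter 1'
level = 2           # now numbering a Section
while len(number) < level:
    number.append(0)
number[level - 1] += 1
print('.'.join(str(piece) for piece in number))  # 1.1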
Example #16
    def __init__(self, *argv):
        "Analyze the command line args and launch the Twitter location stream"

        southwest = None
        northeast = None
        output = sys.stdout
        # set KeyboardInterrupt signal handler
        signal.signal(signal.SIGINT, self.keyboard_interrupt_handler)
        #turn tuple into list
        argv = list(argv)
        # remove the first argument
        argv.pop(0)
        # iterate the list
        for argument in argv[:]:
            # look for southwest
            if argument == "-sw":
                try:
                    southwest = argv[argv.index("-sw") + 1]
                except IndexError:
                    self.usage()
                    return
                argv.remove("-sw")
                argv.remove(southwest)
            # look for northeast
            if argument == "-ne":
                try:
                    northeast = argv[argv.index("-ne") + 1]
                except IndexError:
                    self.usage()
                    return
                argv.remove("-ne")
                argv.remove(northeast)
        # check if argv was correct
        if southwest is None or northeast is None or len(argv) > 1:
            self.usage()
            return
        # check for output
        if len(argv) == 1:
            output = open(argv[0], "w")
        # launch the LocationStream
        self.twitterstream = LocationStream(southwest + "," + northeast)
        try:
            stream = self.twitterstream.start()
            Trace.message("Twitter stream started!!")
            Trace.message("Press ctrl+c to stop.")
        except Exception:
            Trace.error("Raised exception: " + str(sys.exc_info()[0]))
            Trace.error("Stopping twitterstream")
            self.twitterstream.stop()
            return
        for line in stream:
            print >> output, line.strip()
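Hand-tracing the parser with the command line from usage() below (this trace is not part of the original code):

argv = ["run_location_stream.py", "-sw", "-11.733398,35.763229",
        "-ne", "5.009766,42.970492", "out.txt"]
argv.pop(0)
southwest = argv[argv.index("-sw") + 1]   # '-11.733398,35.763229'
northeast = argv[argv.index("-ne") + 1]   # '5.009766,42.970492'
for flag, value in (("-sw", southwest), ("-ne", northeast)):
    argv.remove(flag)
    argv.remove(value)
print(argv)  # ['out.txt'] -> opened as the output file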
Example #17
 def number(self, layout):
   "Set all attributes: number, entry, level..."
   if self.generator.isunique(layout):
     number = self.generator.generateunique(layout.type)
     self.setcommonattrs(layout, number)
     layout.anchortext = ''
     if layout.number != '':
       layout.anchortext = layout.entry + '.'
     return
   if not self.generator.isinordered(layout):
     Trace.error('Trying to number wrong ' + unicode(layout))
     return
   # ordered or unordered
   if self.generator.isnumbered(layout):
     number = self.generator.generateordered(layout.type)
   else:
     number = self.generator.generateunique(layout.type)
   self.setcommonattrs(layout, number)
   layout.anchortext = layout.number
   layout.output.tag = layout.output.tag.replace('?', unicode(layout.level))
Example #18
	def build_comments_index(self):
		Trace.info("Building comments index...")
		# build the typemap
		comments_typemap = {"averageWebScore": int}
		comments_replace = [{"key":"commentId", "find":".", "replace":""}, {"key":"hotelSequence", "find":".", "replace":""}]
		# get the bulk of documents
		comments = CsvManager.read(self.comments_file, typemap=comments_typemap, replace=comments_replace)
		Trace.info(str(len(comments)) + " comments read")
		# bulk_upsert
		comments_upserted = self.elasticsearch.upsert_bulk(self.comments_index, "hotelSequence", "commentId", comments)
		Trace.info(str(comments_upserted) + " comments upserted in " + self.comments_index)
Example #19
	def build_hotels_index(self):
		Trace.info("Building hotels index...")
		# build the typemap
		hotels_keys = CsvManager.read_keys(self.hotels_file)
		hotels_typemap = dict(zip(hotels_keys[3:], [int]*len(hotels_keys[3:])))
		hotels_replace = [{"key":"hotelSequence", "find":".", "replace":""}, {"key":"mailsEnviados", "find":".", "replace":""}]
		# get the bulk of documents
		hotels = CsvManager.read(self.hotels_file, typemap=hotels_typemap, replace=hotels_replace)
		Trace.info(str(len(hotels)) + " hotels read")
		# bulk_upsert
		hotels_upserted = self.elasticsearch.upsert_bulk(self.hotels_index, "destinationCode", "hotelSequence", hotels)
		Trace.info(str(hotels_upserted) + " hotels upserted in " + self.hotels_index)
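CsvManager is project-specific and its source is not shown; a plausible sketch of what the typemap and replace rules might do to a single row (helper name and semantics assumed, not the library's actual API):

def apply_row_rules(row, typemap, replace):
    # Hypothetical helper mirroring the assumed CsvManager semantics.
    for rule in replace:
        key = rule["key"]
        row[key] = row[key].replace(rule["find"], rule["replace"])
    for key, cast in typemap.items():
        row[key] = cast(row[key])
    return row

row = {"hotelSequence": "12.345", "mailsEnviados": "1.200"}
typemap = {"mailsEnviados": int}
replace = [{"key": "hotelSequence", "find": ".", "replace": ""},
           {"key": "mailsEnviados", "find": ".", "replace": ""}]
print(apply_row_rules(row, typemap, replace))
# -> {'hotelSequence': '12345', 'mailsEnviados': 1200}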
Example #20
 def findtranslation(self):
   "Find the translation for the document language."
   self.langcodes = None
   if not self.language:
     Trace.error('No language in document')
     return
   if self.language not in TranslationConfig.languages:
     Trace.error('Unknown language ' + self.language)
     return
   if TranslationConfig.languages[self.language] == 'en':
     return
   langcodes = [TranslationConfig.languages[self.language]]
   try:
     self.translation = gettext.translation('elyxer', None, langcodes)
   except IOError:
     Trace.error('No translation for ' + unicode(langcodes))
Example #21
 def findtranslation(self):
     "Find the translation for the document language."
     self.langcodes = None
     if not self.language:
         Trace.error('No language in document')
         return
      if self.language not in TranslationConfig.languages:
         Trace.error('Unknown language ' + self.language)
         return
     if TranslationConfig.languages[self.language] == 'en':
         return
     langcodes = [TranslationConfig.languages[self.language]]
     try:
         self.translation = gettext.translation('elyxer', None, langcodes)
     except IOError:
         Trace.error('No translation for ' + unicode(langcodes))
Example #22
    def __init__(self, test=False, offset=0):
        "Inits the script"

        global test_packages

        Trace.info("Starting" + (" ", " test")[test] + " script...")
        # change paths and indexes in case of test
        if test:
            test_packages_file = os.path.join(self.filedir,"./data/test_npm_package_names")
            test_packages = [item["test_package_name"] for item in CsvManager.read(test_packages_file)]
            self._index = "test_npm_packages"
            Trace.info("test_packages: " + json.dumps(test_packages))

        # set offset
        self._offset = offset
        
        # build npm_packages_index
        self.build_npm_packages_index()

        Trace.info(("S", "Test s")[test] + "cript finished.")
Example #23
 def nextline(self):
   "Go to next line"
   if self.depleted:
     Trace.fatal('Read beyond file end')
   self.mustread = True
Example #24
	def build_bitext_indexes(self):
		"Builds bitext, bitext_unique and bitext_unique_posneg indexes"
		Trace.info("Building bitext, bitext_unique and bitext_unique_posneg indexes...")
		# typemap and replace
		bitext_replace = [{"key":"score", "find":",", "replace":"."}]
		bitext_typemap = {"score": float}
		# get the bulk of bitexts
		bitexts = CsvManager.read(self.bitext_file, typemap=bitext_typemap, replace=bitext_replace)
		# iterate the bulk of bitexts and insert the element in each of the indexes
		for _id,bitext_item in enumerate(bitexts):
			# add info from hotels
			hotel = self.elasticsearch.read_document(self.hotels_index, "_all", bitext_item["hotelSequence"])
			if "found" in hotel and hotel["found"]:
				# add found hotel fields to bitext item
				bitext_item = dict(bitext_item.items() + hotel["_source"].items())
			# upsert element
			bitext_type = bitext_item["section"]
			del bitext_item["section"]
			Trace.info("upserting bitext " + str(_id))
			self.elasticsearch.upsert_document(self.bitext_index, bitext_type, str(_id), bitext_item)
			# update bitext_unique_posneg index
			previous_average_score = 0
			previous_count = 0
			previous_categories = ""
			separator = ""
			bitext_unique_posneg_id = bitext_item["commentId"] + bitext_type
			bitext_unique_posneg_item = self.elasticsearch.read_document(self.bitext_unique_posneg_index, "_all", bitext_unique_posneg_id)
			if "found" in bitext_unique_posneg_item and bitext_unique_posneg_item["found"]:
				previous_count = bitext_unique_posneg_item["_source"]["count"]
				previous_average_score = bitext_unique_posneg_item["_source"]["averageScore"]
				previous_categories = bitext_unique_posneg_item["_source"]["category"]
				separator = ", "
			bitext_unique_posneg_upsert_doc = {
				"section": bitext_type,
				"averageScore": 1.0*(previous_average_score*previous_count + bitext_item["score"])/(previous_count + 1),
				"count": previous_count + 1,
				"category": previous_categories + separator + bitext_item["category"]
			}
			# upsert
			self.elasticsearch.upsert_document(self.bitext_unique_posneg_index, bitext_item["hotelSequence"], bitext_unique_posneg_id, bitext_unique_posneg_upsert_doc)
			# update bitext_unique index
			previous_average_score = 0
			previous_count = 0
			previous_categories = ""
			separator = ""
			bitext_unique_id = bitext_item["commentId"]
			bitext_unique_item = self.elasticsearch.read_document(self.bitext_unique_index, "_all", bitext_unique_id)
			if "found" in bitext_unique_item and bitext_unique_item["found"]:
				previous_count = bitext_unique_item["_source"]["count"]
				previous_average_score = bitext_unique_item["_source"]["averageScore"]
				previous_categories = bitext_unique_item["_source"]["category"]
				separator = ", "
			bitext_unique_upsert_doc = {
				"averageScore": 1.0*(previous_average_score*previous_count + bitext_item["score"])/(previous_count + 1),
				"count": previous_count + 1,
				"category": previous_categories + separator + bitext_item["category"]
			}
			# look for the comment in the comment index
			comment = self.elasticsearch.read_document(self.comments_index, "_all", bitext_unique_id)
			if "found" in comment and comment["found"]:
				# add found comment averageWebScore to bitext unique item
				bitext_unique_upsert_doc["averageWebScore"] = comment["_source"]["averageWebScore"]
				bitext_unique_upsert_doc["scoresDiff"] = bitext_unique_upsert_doc["averageScore"] - bitext_unique_upsert_doc["averageWebScore"]
				bitext_unique_upsert_doc["scoresAbsDiff"] = math.fabs(bitext_unique_upsert_doc["scoresDiff"])

			# upsert
			self.elasticsearch.upsert_document(self.bitext_unique_index, bitext_item["hotelSequence"], bitext_unique_id, bitext_unique_upsert_doc)
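Both averageScore updates use the standard incremental mean, new_avg = (old_avg * old_count + x) / (old_count + 1), so documents can be folded in one at a time without re-reading all scores. A quick check with made-up scores:

scores = [2.0, 4.0, 3.0]
average = 0.0
count = 0
for score in scores:
    average = 1.0 * (average * count + score) / (count + 1)
    count += 1
print(average)  # 3.0, same as sum(scores) / len(scores)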
Example #25
 def usage(self):
     Trace.error('Usage: coalesce.py filein [fileout]')
     return
Example #26
        self.assertEqual(last_bitext["_source"]["score"], 2.0)
        self.assertEqual(last_bitext["_source"]["mailsEnviados"], 37)
        # test bitext_unique_posneg index
        bitext330956POS = self.elasticsearch.read_document("test_bitext_unique_posneg", "69559", "330956POS")
        self.assertTrue(bitext330956POS["found"])
        self.assertEqual(bitext330956POS["_source"]["averageScore"], 2.0)
        # test bitext_unique index
        bitext330956 = self.elasticsearch.read_document("test_bitext_unique", "69559", "330956")
        self.assertTrue(bitext330956["found"])
        self.assertEqual(bitext330956["_source"]["averageScore"], 2.0)
        self.assertEqual(bitext330956["_source"]["averageWebScore"], 5)
        self.assertEqual(bitext330956["_source"]["scoresDiff"], -3.0)
        self.assertEqual(bitext330956["_source"]["scoresAbsDiff"], 3.0)

    def tearDown(self):
        # delete indexes
        self.elasticsearch.remove_index("test_hotels")
        self.elasticsearch.remove_index("test_comments")
        self.elasticsearch.remove_index("test_bitext")
        self.elasticsearch.remove_index("test_bitext_unique_posneg")
        self.elasticsearch.remove_index("test_bitext_unique")

if __name__ == '__main__':
    #unittest.main()
    if len(sys.argv) > 1 and sys.argv[1] == "test":
        Trace.info("test")
        unittest.main(argv=sys.argv[:1], exit=True)
    else:
        Trace.info("main")
        _Main()
Example #27
 def usage(self):
   Trace.error('Usage: coalesce.py filein [fileout]')
   return
Example #28
 def keyboard_interrupt_handler(self, signum, frame):
     "Handles the KeyboardInterrupt signal"
     Trace.message("\nProcess interrupted by user. Exiting...")
     self.twitterstream.stop()
     sys.exit(0)
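The handler only fires once it is registered, as Example #16 does with signal.signal in its __init__; a minimal standalone sketch:

import signal
import sys

def keyboard_interrupt_handler(signum, frame):
    print("\nProcess interrupted by user. Exiting...")
    sys.exit(0)

# Register the handler for ctrl+c (SIGINT).
signal.signal(signal.SIGINT, keyboard_interrupt_handler)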
Example #29
    @unittest.skipIf(not elasticsearch.is_up(), "irrelevant test if there is no elasticsearch instance")
    def test_script(self):
        global test_packages
        _Main(test=True)
        # count documents
        self.assertTrue(self.elasticsearch.count_documents("test_npm_packages") > 0)
        # assert express
        express_package = self.elasticsearch.read_document("test_npm_packages", "_all", "express")
        self.assertTrue(express_package["found"])

    def tearDown(self):
        # delete indexes
        self.elasticsearch.remove_index("test_npm_packages")

if __name__ == '__main__':
    #unittest.main()
    if len(sys.argv) > 1 and sys.argv[1] == "test":
        Trace.info("test")
        unittest.main(argv=sys.argv[:1], exit=True)
    else:
        if len(sys.argv) > 1:
            try:
                offset = int(sys.argv[1])
            except ValueError:
                Trace.error("Invalid offset: " + sys.argv[1])
            else:
                Trace.info("main with offset: " + str(offset))
                _Main(offset=offset)
        else:
            Trace.info("main")
            _Main()
Example #30
    def usage(self):
        "Show command line help."

        Trace.error('Usage: twitterstream.py -sw 2.012,45.3232 -ne 3.119,48.8777 [fileout]')
        Trace.error('Launch a twitter stream client and send the result to an output')
        Trace.error('[fileout]: the file to dump the output. Stdout if omitted')
        Trace.error('  Parameters:')
        Trace.error('    -sw: longitude,latitude coordinates of the South West corner of the bounding box. Compulsory.')
        Trace.error('    -ne: longitude,latitude coordinates of the North East corner of the bounding box. Compulsory.')
        Trace.error('Example: python run_location_stream.py -sw -11.733398,35.763229 -ne 5.009766,42.970492')