def classify(self, test_features_file, output_file, metric="LC", num_sim=18, threshold=.2, ranked=False):
    '''
    Weighted-vote k-nn classifier
    The test_features_file should be in the format:
    trackid[TAB]track_features_path
    '''
    print "Autotagging...(ranked=%s)" % ranked
    with open(output_file, "w") as outfile:
        with open(test_features_file) as infile:
            count = 0
            for line in infile:
                trackid, features_filepath = line.strip().split("\t")
                if not os.path.exists(features_filepath):
                    print "ERROR: features file '%s' not found" % features_filepath
                    continue
                query_point = gaia2.Point()
                query_point.load(features_filepath)
                query_point.setName("%s" % (trackid))
                proposed_tags = self.__propagation(query_point, metric, num_sim, threshold, ranked)
                for tag, freq in proposed_tags:
                    outfile.write("%s\t%s\t%s\t%s\n" % (trackid, features_filepath, tag, freq))
                count += 1
                if count % 100 == 0:
                    print "%d songs processed" % count
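
# Usage sketch (assumption): the class that owns classify() is not shown in this
# excerpt, so `tagger` below stands for whatever object exposes it, and all file
# paths are hypothetical. The helper only illustrates the expected input format:
# one "trackid<TAB>track_features_path" line per track.
def write_test_features_file(track_features, output_path):
    '''track_features: dict mapping trackid -> path to its Gaia features file.'''
    with open(output_path, "w") as out:
        for trackid, features_path in track_features.items():
            out.write("%s\t%s\n" % (trackid, features_path))

# write_test_features_file({"track_001": "/data/features/track_001.sig"},
#                          "test_features.tsv")
# tagger.classify("test_features.tsv", "predicted_tags.tsv",
#                 metric="LC", num_sim=18, threshold=.2, ranked=False)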
def __call__(self, *args, **kwargs):
    # pre-check for errors that might happen very often and where 1 good error message would
    # be real nice to have
    if (self.methodName.startswith('nnSearch') and
            self.methodName not in ('nnSearchById', 'nnSearchByIdWithFilter',
                                    'nnSearchByExample', 'nnSearchByExampleWithFilter')):
        raise AttributeError('You need to use either nnSearchById{WithFilter} or nnSearchByExample{WithFilter}')

    # pre-processing for certain specific methods
    if self.methodName.startswith('nnSearchByExample'):
        args = (args[0].toBase64(),) + args[1:]

    # in the case of an nnSearch request, we shouldn't do the query immediately but rather
    # return a proxy object that allows to chain queries using the search_space argument.
    # the actual query should only be resolved when the user calls the get() method on this
    # proxy object
    if self.methodName.startswith('nnSearch'):
        return ResultSet(self.endPoint, self.methodName, args, kwargs)

    # actual processing by the server
    result = YamlRPCMethod.__call__(self, *args, **kwargs)

    # post-processing for certain specific methods
    if self.methodName == 'layout':
        result = yaml.load(result)
    elif self.methodName == 'getPoint':
        try:
            import gaia2
        except ImportError:
            raise ImportError('You need to have the gaia2 python module installed '
                              'in order to be able to retrieve single points')
        p = gaia2.Point()
        p.fromBase64(result)
        result = p
    elif self.methodName == 'getPoints':
        try:
            import gaia2
        except ImportError:
            raise ImportError('You need to have the gaia2 python module installed '
                              'in order to be able to retrieve points')
        ds = gaia2.DataSet()
        ds.fromBase64(result)
        result = ds

    return result
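
# Minimal sketch (assumption): this is not the real ResultSet class, only an
# illustration of the deferred-query pattern used above, where nnSearch* methods
# return a proxy object and the remote call is performed only when get() is
# called. `server.call` in the commented-out usage is a hypothetical RPC callable.
class LazyQuery(object):
    def __init__(self, perform, method_name, args, kwargs):
        self._perform = perform      # callable that does the actual remote call
        self._method_name = method_name
        self._args = args
        self._kwargs = kwargs
        self._result = None

    def get(self):
        # resolve the query on first access, then cache the result
        if self._result is None:
            self._result = self._perform(self._method_name, *self._args, **self._kwargs)
        return self._result

# q = LazyQuery(server.call, 'nnSearchById', ('track_001', 10), {})
# ...no request has been sent yet...
# results = q.get()   # the remote call happens here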
def mergeAll(pointList, outputFilename, chunkSize, transfoFile, select=None, exclude=None):
    # TODO: validation of the yaml file format? (ie: pre-2.3 yaml files should be rejected)
    totalPoints = len(fastyaml.load(open(pointList).read()))
    begin, end = 0, chunkSize
    partfiles = []
    partfileTemplate = outputFilename + '_%d_%d.partdb'

    # keep this information for future reference as it won't be accessible anymore
    # once the dataset is merged
    excluded = []
    if exclude:
        try:
            p = gaia2.Point()
            p.load(list(gaia2.fastyaml.loadfile(pointList).items())[0][1])
            excluded = p.layout().descriptorNames(exclude)
        except:
            raise

    # merge each chunk separately
    # this includes removevl and fixlength, which should yield smaller files than just after
    # merging, so it should then be possible to load all of them together to merge them
    while begin < totalPoints:
        end = min(end, totalPoints)
        partfile = partfileTemplate % (begin, end)
        partfiles += [partfile]
        mergeChunk(pointList, partfile, transfoFile, begin, end, select, exclude)
        begin, end = end, end + chunkSize
        horizontalLine()

    # make sure all histories are the same, if not do whatever it takes to reach that point
    # also "simplify" the histories so that they are the minimum history representation required
    # to get to the layout of the final dataset
    print('Harmonizing chunks so that they all have the same layout & history...')
    vldescs, nandescs, rdescs = harmonizeChunks(partfiles)
    rdescs = rdescs | set(excluded)
    horizontalLine()

    # merge all those partfiles together
    print('Assembling full dataset together...')
    dstotal = DataSet()
    for pfile in partfiles:
        print('Merging partfile', pfile)
        ds = DataSet()
        ds.load(pfile)
        dstotal.appendDataSet(ds)

    dstotal.save(outputFilename)

    # print a nice informative summary of what has been done to the dataset
    horizontalLine()
    msg = '''
Final dataset information
-------------------------

Number of points: %s

Descriptors removed:
  - because they were of variable length: %s
  - because they were either constant, contained NaN or contained Inf: %s
  - because they were removed explicitly: %s

Your dataset has been saved at %s'''

    # remove leading dot
    vldescs = sorted(d[1:] for d in vldescs)
    nandescs = sorted(d[1:] for d in nandescs)
    rdescs = sorted(d[1:] for d in rdescs)

    print(msg % (str(dstotal.size()), ', '.join(vldescs), ', '.join(nandescs),
                 ', '.join(rdescs), outputFilename))

    # clean up temporary files
    for pfile in partfiles:
        os.remove(pfile)
        os.remove(pfile + '.raw')
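
# Sketch (assumption): a side-effect-free illustration of how mergeAll() splits
# the point list into chunks and names the intermediate .partdb files; the
# function name and arguments below are made up for this example only.
def chunk_partfiles(total_points, chunk_size, output_filename):
    partfile_template = output_filename + '_%d_%d.partdb'
    begin, end = 0, chunk_size
    parts = []
    while begin < total_points:
        end = min(end, total_points)
        parts.append(partfile_template % (begin, end))
        begin, end = end, end + chunk_size
    return parts

# chunk_partfiles(2500, 1000, 'mydataset.db')
# -> ['mydataset.db_0_1000.partdb',
#     'mydataset.db_1000_2000.partdb',
#     'mydataset.db_2000_2500.partdb']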