import os
import gaia2

def classify(self,
             test_features_file,
             output_file,
             metric="LC",
             num_sim=18,
             threshold=.2,
             ranked=False):
    '''
    Weighted-vote k-NN classifier.
    The test_features_file should be in the format:
    trackid[TAB]track_features_path
    '''
    print("Autotagging... (ranked=%s)" % ranked)
    with open(output_file, "w") as outfile:
        with open(test_features_file) as infile:
            count = 0
            for line in infile:
                trackid, features_filepath = line.strip().split("\t")
                if not os.path.exists(features_filepath):
                    print("ERROR: features file '%s' not found" % features_filepath)
                    continue
                # load the query point's features and propagate tags from its neighbours
                query_point = gaia2.Point()
                query_point.load(features_filepath)
                query_point.setName(trackid)
                proposed_tags = self.__propagation(query_point, metric,
                                                   num_sim, threshold,
                                                   ranked)
                for tag, freq in proposed_tags:
                    outfile.write("%s\t%s\t%s\t%s\n" %
                                  (trackid, features_filepath, tag, freq))
                count += 1
                if count % 100 == 0:
                    print("%d songs processed" % count)
Example #2
    def __call__(self, *args, **kwargs):
        # pre-check for errors that might happen very often and where one good
        # error message would be really nice to have
        if (self.methodName.startswith('nnSearch') and self.methodName
                not in ('nnSearchById', 'nnSearchByIdWithFilter',
                        'nnSearchByExample', 'nnSearchByExampleWithFilter')):
            raise AttributeError(
                'You need to use either nnSearchById{WithFilter} or nnSearchByExample{WithFilter}'
            )

        # pre-processing for certain specific methods
        if self.methodName.startswith('nnSearchByExample'):
            args = (args[0].toBase64(), ) + args[1:]

        # in the case of an nnSearch request, we shouldn't do the query immediately
        # but rather return a proxy object that allows chaining queries via the
        # search_space argument. The actual query is only resolved when the user
        # calls the get() method on this proxy object
        if self.methodName.startswith('nnSearch'):
            return ResultSet(self.endPoint, self.methodName, args, kwargs)

        # actual processing by the server
        result = YamlRPCMethod.__call__(self, *args, **kwargs)

        # post-processing for certain specific methods
        if self.methodName == 'layout':
            # use an explicit Loader; a bare yaml.load() is deprecated in PyYAML >= 5
            result = yaml.load(result, Loader=yaml.SafeLoader)

        elif self.methodName == 'getPoint':
            try:
                import gaia2
            except ImportError:
                raise ImportError(
                    'You need to have the gaia2 python module installed in order to be able to retrieve single points'
                )
            p = gaia2.Point()
            p.fromBase64(result)
            result = p

        elif self.methodName == 'getPoints':
            try:
                import gaia2
            except ImportError:
                raise ImportError(
                    'You need to have the gaia2 python module installed in order to be able to retrieve points'
                )
            ds = gaia2.DataSet()
            ds.fromBase64(result)
            result = ds

        return result
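
A sketch of the lazy behaviour described in the comments above. The `client` endpoint, the point id, and the argument passed to get() are assumptions; only the proxy-then-get() contract and the search_space chaining are stated in the code.

# Hypothetical illustration of the nnSearch proxy: nothing is sent to the
# server until get() is called on the returned ResultSet.
candidates = client.nnSearchById('track_42')            # lazy ResultSet proxy, no query yet
chained = client.nnSearchById('track_42',
                              search_space=candidates)  # chains onto the first query, still lazy
print(chained.get(10))  # only now is the chain resolved; get()'s exact signature is assumed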
Example #3
import os
import gaia2
from gaia2 import DataSet

# mergeChunk, harmonizeChunks and horizontalLine are helper functions assumed
# to be defined in the same module as mergeAll.
def mergeAll(pointList,
             outputFilename,
             chunkSize,
             transfoFile,
             select=None,
             exclude=None):
    # TODO: validation of the yaml file format? (i.e. pre-2.3 yaml files should be rejected)
    with open(pointList) as f:
        totalPoints = len(gaia2.fastyaml.load(f.read()))

    begin, end = 0, chunkSize
    partfiles = []
    partfileTemplate = outputFilename + '_%d_%d.partdb'

    # keep this information for future reference as it won't be accessible anymore
    # once the dataset is merged
    excluded = []
    if exclude:
        # load a single point just to resolve which descriptor names the
        # exclude pattern expands to in this layout
        p = gaia2.Point()
        p.load(list(gaia2.fastyaml.loadfile(pointList).items())[0][1])
        excluded = p.layout().descriptorNames(exclude)

    # merge each chunk separately; each chunk also goes through removevl and
    # fixlength, which should yield smaller files than a plain merge, so that
    # it is then possible to load all of the chunks together and merge them
    while begin < totalPoints:
        end = min(end, totalPoints)
        partfile = partfileTemplate % (begin, end)
        partfiles += [partfile]

        mergeChunk(pointList, partfile, transfoFile, begin, end, select,
                   exclude)
        begin, end = end, end + chunkSize

        horizontalLine()

    # make sure all histories are the same; if not, do whatever it takes to get
    # them there. Also "simplify" the histories so that they are the minimum
    # history representation required to reach the layout of the final dataset
    print('Harmonizing chunks so that they all have the same layout & history...')
    vldescs, nandescs, rdescs = harmonizeChunks(partfiles)
    rdescs = rdescs | set(excluded)
    horizontalLine()

    # merge all those partfiles together
    print('Assembling full dataset together...')
    dstotal = DataSet()

    for pfile in partfiles:
        print('Merging partfile', pfile)
        ds = DataSet()
        ds.load(pfile)
        dstotal.appendDataSet(ds)

    dstotal.save(outputFilename)

    # print a nice informative summary of what has been done to the dataset
    horizontalLine()

    msg = '''
Final dataset information
-------------------------

Number of points: %s

Descriptors removed:
  - because they were of variable length: %s
  - because they were either constant, contained NaN or contained Inf: %s
  - because they were removed explicitly: %s

Your dataset has been saved at %s'''

    # remove leading dot
    vldescs = sorted(d[1:] for d in vldescs)
    nandescs = sorted(d[1:] for d in nandescs)
    rdescs = sorted(d[1:] for d in rdescs)

    print(msg % (str(dstotal.size()), ', '.join(vldescs), ', '.join(nandescs),
                 ', '.join(rdescs), outputFilename))

    # clean up temporary files
    for pfile in partfiles:
        os.remove(pfile)
        os.remove(pfile + '.raw')
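
A sketch of how mergeAll might be driven. All file names, the chunk size, and the exclude pattern here are hypothetical.

# Hypothetical invocation: pointList is a YAML map from point names to
# feature-file paths, as assumed by the loading code above.
mergeAll(pointList='points.yaml',
         outputFilename='merged.db',
         chunkSize=20000,               # points merged per temporary .partdb chunk
         transfoFile='transfo.yaml',    # transformations applied to each chunk
         select=None,                   # keep all descriptors...
         exclude='*.cov')               # ...except those matching this pattern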