def analyze_missing_similar():
    """Find segments that lack a full set of similar matches and analyze them.

    Collects up to 1,000,000 segments whose 'similar' list is shorter than
    MATCHES entries, builds their feature vectors, and hands them to
    analyze_segments(). If no such segments exist, sleeps for ten minutes so
    the calling loop does not busy-poll the database.
    """
    s = SongSegment()

    segment_data = []

    # A segment is "missing similar" when its similar array has no element
    # at index MATCHES - 1, i.e. fewer than MATCHES matches stored.
    for segment in s._db._db[s._dbcol].find({
            'similar.' + str(MATCHES - 1): {
            '$exists': False
        }
    }).limit(1000000):
        # Stop at the first segment with missing audio features.
        # NOTE(review): this assumes all later results are also unprocessed;
        # if that ordering does not hold, `continue` may be intended instead.
        if segment['mfcc'] is None or segment['chroma'] is None or segment[
                'tempogram'] is None:
            break

        feature = _create_feature(np.frombuffer(segment['mfcc']),
                                  np.frombuffer(segment['chroma']),
                                  np.frombuffer(segment['tempogram']))

        segment_data.append((segment['_id'], segment['song_id'],
                             segment['time_from'], feature))

    s.close()

    print("Updating similar for " + str(len(segment_data)) + " segments")

    if segment_data:
        analyze_segments(segment_data)
    else:
        # This seems like the wrong place for this, but good enough for now
        time.sleep(60 * 10)
class Loader(pykka.ThreadingActor):
    """Actor that loads songs through one shared SongSegment connection."""

    def __init__(self):
        super().__init__()
        # One database handle per actor, reused across load() calls.
        self.seg_db = SongSegment()

    def load(self, song):
        """Load the song described by the two-element tuple *song*."""
        first, second = song[0], song[1]
        return _load_song(first, second, self.seg_db)

    def on_stop(self):
        """Release the database handle when the actor shuts down."""
        self.seg_db.close()
def query_similar(song_id, from_time, to_time):
    """Queries the database for segments
    similar to the segment provided

    Parameters
    ----------
    song_id : string
        Id of the given song
    from_time : int
        The start time of the segment
    to_time : int
        The end time of the segment

    Returns
    -------
    list of Dict[song_id : string, from_time: time_from, to_time: time_to]
        A list of all the segments which are similar, or None when the song
        has no segments or the closest segment has no similarity data yet.
    """

    seg_db = SongSegment()
    try:
        segments = seg_db.get_all_by_song_id(song_id)

        # Find the stored segment whose start time is closest to from_time.
        # Bug fix: the original guarded with `best is None`, but best was
        # initialized to a tuple, so `localdist < best[0]` compared
        # int < None and raised TypeError on the first iteration.
        best = (None, None)
        for segment in segments:
            localdist = abs(from_time - segment['time_from'])
            if best[0] is None or localdist < best[0]:
                best = (localdist, segment)

        # No segments at all, or the closest one lacks similarity data.
        if best[1] is None or 'similar' not in best[1]:
            return None

        similar = best[1]['similar']

        # Fetch the full records for every similar-segment id in one query.
        similar_ids = [sim['id'] for sim in similar]
        similar_full = seg_db.get_by_ids(similar_ids)

        similar_segments = []
        for sim in similar:
            sim_seg = next(seg for seg in similar_full
                           if seg['_id'] == sim['id'])
            similar_segments.append({
                'song_id': sim_seg['song_id'],
                'from_time': sim_seg['time_from'],
                'to_time': sim_seg['time_to'],
                'distance': sim['distance'],
            })

        return similar_segments
    finally:
        # Always release the connection -- the original leaked it on the
        # early `return None` paths.
        seg_db.close()
def analyze_segments(segs):
    """Compute and store the most similar segments for each entry of *segs*.

    Scans the whole segment database in buckets of BUCKET_SIZE, builds an
    index over each bucket, and queries it for every input segment in
    parallel (one Matcher actor per CPU). The best matches across all
    buckets are then written back both ways: onto each input segment and
    onto each matched segment's own 'similar' list (deduplicated, sorted by
    distance, capped at 10 entries).

    Parameters
    ----------
    segs : list of tuples
        Input segments; element 0 is the segment id and element 3 the
        feature vector (same shape produced by analyze_missing_similar).
    """
    ss = SongSegment()

    count = ss.count()

    # Per input segment: one list of candidate matches per bucket scanned.
    all_matches = [[] for _ in segs]

    matchers = [Matcher.start().proxy() for _ in range(cpu_count())]
    print("Searching through " + str(count // BUCKET_SIZE + 1) +
          " buckets, with " + str(BUCKET_SIZE) + " segments in each")
    for bucket_idx in range(0, count // BUCKET_SIZE + 1):
        print("Bucket: " + str(bucket_idx + 1))
        # Only segments with all three features present can be indexed.
        established_segments = list(
            filter(
                lambda x: x['mfcc'] is not None and x['chroma'] is not None and
                x['tempogram'] is not None,
                ss.get_all_in_range(bucket_idx * BUCKET_SIZE,
                                    (bucket_idx + 1) * BUCKET_SIZE)))
        established_segments = list(
            map(_process_db_segment, established_segments))

        data = np.array(list(map(lambda x: x[3], established_segments)))

        bucket = _create_bucket(data)

        query_object = bucket[1].construct_query_pool()
        query_object.set_num_probes(25)

        # Fan the queries out round-robin over the matcher actors.
        # (The original reused `i` here, shadowing the bucket loop index.)
        matched = []
        for seg_idx, seg in enumerate(segs):
            matched.append(matchers[seg_idx % len(matchers)].match(
                seg, query_object))

        matches = pykka.get_all(matched)

        for j in range(0, len(matches)):
            all_matches[j].append(
                list(map(lambda x: established_segments[x], matches[j])))

        # Drop the per-bucket structures before loading the next bucket to
        # keep peak memory bounded.
        del data
        del bucket
        del established_segments
        del query_object
        del matches

    for matcher in matchers:
        matcher.stop()

    for i in range(0, len(segs)):
        best = _find_best_matches(_flatten(all_matches[i]), segs[i])

        # Fetch the matched segments' current 'similar' lists so the reverse
        # link (matched segment -> this segment) can be appended.
        # NOTE(review): this pairs matches[j] with best[j] by position, which
        # assumes get_by_ids preserves the requested id order -- confirm.
        matches = ss.get_by_ids(list(map(lambda match: match[0][0], best)))
        matches = list(
            map(lambda match: (match['_id'], match['similar']), matches))

        for j in range(0, len(matches)):
            matches[j][1].append({
                'id': segs[i][0],
                'distance': best[j][1],
            })

            # Deduplicate by id, then keep only the 10 closest matches.
            match_ids = list(set(map(lambda x: x['id'], matches[j][1])))
            inner_matches = list(
                map(
                    lambda match_id: next(x for x in matches[j][1]
                                          if x['id'] == match_id), match_ids))
            inner_matches.sort(key=lambda m: m['distance'])
            ss.update_similar(matches[j][0], inner_matches[:10])

        # Store the forward links for the input segment itself.
        formatted = []
        for match in best:
            formatted.append({'id': match[0][0], 'distance': match[1]})

        ss.update_similar(segs[i][0], formatted)

    ss.close()