def addTestLyrics(urls):
    """Scrape each Genius song URL, store its metadata, and index its lyrics.

    Args:
        urls: iterable of Genius song-page URLs.

    Side effects: for every URL, inserts one song-metadata record and one
    positional index into the shared database via dBDelegate.
    """
    db = dBDelegate.getDBConnection()
    for url in urls:
        song = GeniusScraper.scrapeSongByURL(url)
        song_lyrics = positionalIndex.tokenizeText(song.lyrics)
        # NOTE: a second tokenization (positionalIndex.tokenizeLyrics) was
        # computed here but its result was never used — removed as dead work.
        song_id = dBDelegate.addSongMetadata(db, song.url, song.artist, song.title)
        song_positional = positionalIndex.createPositionalIndex(song_lyrics)
        dBDelegate.addPositionalIndex(db, song_positional, song_id)
def test_main(base, cap):
    """Scrape lyrics for artists starting from *base* (up to *cap*) and index them.

    Args:
        base: seed passed through to GeniusScraper.scrapeLyricsByArtist.
        cap: limit passed through to the scraper.

    Side effects: writes song metadata and positional indexes to the database.
    Returns None; does nothing when the scraper yields no artists.
    """
    artist_list = GeniusScraper.scrapeLyricsByArtist(base, cap)
    # Guard clause replaces the original deeply nested `if artist_list != None`.
    if artist_list is None:
        return
    # One connection for the whole run — the original reconnected per song.
    db = dBDelegate.getDBConnection()
    for song_list in artist_list:
        for song in song_list:
            song_lyrics = positionalIndex.tokenizeText(song.lyrics)
            song_id = dBDelegate.addSongMetadata(db, song.url, song.artist, song.title)
            song_positional = positionalIndex.createPositionalIndex(song_lyrics)
            dBDelegate.addPositionalIndex(db, song_positional, song_id)
def detectASample(query):
    """Find and rank songs whose lyrics contain the query phrase.

    Args:
        query: tokenized query (a sequence of words) to match against the
               positional index — TODO confirm expected type against callers.

    Side effects: prints a banner, queries the database, and delegates final
    ranking/printing to tfidf.sortTfidfValues.
    """
    db = dBDelegate.getDBConnection()
    # Removed the original no-op `query = query`.
    # Parenthesized prints behave identically under Python 2 (single argument)
    # and are valid Python 3.
    print("****************************************************************")
    print("Songs which contain your query, in perfect order:")
    relevant_positional_index = createPositionalIndex(db, query)
    songs_that_contain_all_query_words = getIntersectingPositionalIndex(db, query)
    sampled_songs = compareLists(query, relevant_positional_index,
                                 songs_that_contain_all_query_words, db)
    # The collection-wide average length is stored as a special document in
    # the songs collection — presumably written by the indexing pass; verify.
    average_song_length = db.songs.find_one(
        {'average_length' : {'$exists' : True}})['average_length']
    tfidf_values = calculateWeightedTfidf(sampled_songs, query,
                                          relevant_positional_index,
                                          songs_that_contain_all_query_words,
                                          average_song_length, db)
    tfidf.sortTfidfValues(tfidf_values)
# Make `/` return floats even for int operands (Python 2 compatibility).
from __future__ import division
import re, string, math, random
import dBDelegate
from bson.objectid import ObjectId
from collections import Counter
from itertools import islice
import itertools
import collections
import crawler

# Punctuation stripped during tokenization. Apostrophe and hyphen are absent,
# presumably so contractions and hyphenated words survive — TODO confirm intent.
appropriate_punctuation = '!"#$%&()*+,./:;<=>?@[\\]^_`{|}~'
# Splits text on runs of whitespace and/or the punctuation above.
punct = re.compile(r'[\s{}]+'.format(re.escape(appropriate_punctuation)))
# Tuning constant for the tf-idf math below — looks like a term-frequency
# saturation parameter; verify against the rest of calculateTfidf.
k = 2
# NOTE(review): module-level side effect — a DB connection is opened at import
# time, in addition to per-function connections elsewhere in the project.
db = dBDelegate.getDBConnection()

def remove_stopwords():
    # Load the newline-separated stopword list from disk.
    # Returns: list[str]; may include a trailing '' entry if the file
    # ends with a newline.
    stopwords_file = open('stopwords.txt', 'r')
    stopwords = stopwords_file.read()
    stopwords = stopwords.split('\n')
    stopwords_file.close()
    return stopwords

def calculateTfidf(query, positional_index, song, avg_songlength, collection_length, weight, tfidf_values):
    # NOTE(review): this function is truncated in this view — its body
    # continues past the visible source, so documentation here covers only
    # what is shown.
    # Append-mode log of computed tf-idf samples.
    f = open('tfidf_samples.txt', 'a')
    # for song in list_of_matching_documents:
    similarity = 0
    for word in set(query):
        # Number of times the word appears in the query itself.
        querytf = query.count(word)
        # Raw term frequency: occurrence positions of `word` in this song.
        raw_tf = len(positional_index[word]['document_dict'][song])
        # Document frequency: how many songs contain `word` at all.
        songtf = len(positional_index[word]['document_dict'])