Example #1
0
def addTestLyrics(urls):
	"""Scrape each song at the given Genius URLs and index it in the DB.

	For every URL: scrape the song page, tokenize its lyrics, store the
	song metadata (url, artist, title), then build and store a positional
	index of the lyrics keyed by the new song id.

	urls -- iterable of Genius song-page URLs
	"""
	db = dBDelegate.getDBConnection()

	for url in urls:
		song = GeniusScraper.scrapeSongByURL(url)

		# Tokenize once via tokenizeText; the original also called
		# tokenizeLyrics into an unused local, which was dead code.
		song_lyrics = positionalIndex.tokenizeText(song.lyrics)
		song_id = dBDelegate.addSongMetadata(db, song.url, song.artist, song.title)

		song_positional = positionalIndex.createPositionalIndex(song_lyrics)
		dBDelegate.addPositionalIndex(db, song_positional, song_id)
Example #2
0
def test_main(base, cap):
	"""Scrape an artist's songs and index each one in the database.

	base -- artist identifier/URL passed through to the scraper
	cap  -- limit on how many songs to scrape (scraper-defined semantics)
	"""
	# Open a single DB connection for the whole run; the original opened
	# a fresh connection inside the inner loop for every song.
	db = dBDelegate.getDBConnection()

	artist_list = GeniusScraper.scrapeLyricsByArtist(base, cap)
	# Identity comparison is the idiomatic None check (was `!= None`).
	if artist_list is not None:
		for song_list in artist_list:
			for song in song_list:
				song_lyrics = positionalIndex.tokenizeText(song.lyrics)
				song_id = dBDelegate.addSongMetadata(db, song.url, song.artist, song.title)

				song_positional = positionalIndex.createPositionalIndex(song_lyrics)
				dBDelegate.addPositionalIndex(db, song_positional, song_id)
def detectASample(query):
	db = dBDelegate.getDBConnection()
	query = query
	print "****************************************************************"
	print "Songs which contain your query, in perfect order:"

	relevant_positional_index = createPositionalIndex(db, query)
	songs_that_contain_all_query_words = getIntersectingPositionalIndex(db, query)
	sampled_songs = compareLists(query, relevant_positional_index, songs_that_contain_all_query_words, db)
	average_song_length = db.songs.find_one({'average_length' : {'$exists' : True}})['average_length']

	tfidf_values = calculateWeightedTfidf(sampled_songs, query, relevant_positional_index, songs_that_contain_all_query_words, average_song_length, db)
	tfidf.sortTfidfValues(tfidf_values)
Example #4
0
from __future__ import division
import re, string, math, random
import dBDelegate
from bson.objectid import ObjectId
from collections import Counter
from itertools import islice
import itertools
import collections

import crawler

# Punctuation stripped during tokenization.
# NOTE(review): apostrophe and hyphen are deliberately absent -- presumably so
# contractions ("don't") and hyphenated words survive as single tokens; confirm.
appropriate_punctuation = '!"#$%&()*+,./:;<=>?@[\\]^_`{|}~'
# Splitter pattern: one or more whitespace or punctuation characters.
punct = re.compile(r'[\s{}]+'.format(re.escape(appropriate_punctuation)))
# Free scoring parameter (presumably a BM25-style tf saturation constant k) --
# TODO confirm how it is used in the tf-idf functions below.
k = 2
# Module-level database handle shared by the functions in this module.
db = dBDelegate.getDBConnection()

def remove_stopwords():
	"""Load the stopword list from 'stopwords.txt'.

	Returns a list of stopword strings, one per line of the file (a
	trailing newline yields a trailing empty string, matching the
	original behavior).

	NOTE(review): despite the name, this function only *loads* the
	stopwords; it does not remove them from anything.
	"""
	# `with` guarantees the file handle is closed even if read() raises;
	# the original closed it manually and would leak on error.
	with open('stopwords.txt', 'r') as stopwords_file:
		contents = stopwords_file.read()
	return contents.split('\n')

def calculateTfidf(query, positional_index, song, avg_songlength, collection_length, weight, tfidf_values):
	f = open('tfidf_samples.txt', 'a')
	# for song in list_of_matching_documents:
	similarity = 0
	for word in set(query):
		querytf = query.count(word)
		raw_tf = len(positional_index[word]['document_dict'][song])
		songtf = len(positional_index[word]['document_dict'])