def __init__(self):
    """Parse the command-line arguments and prepare the working directory.

    Creates the ``downloaded_mp3s`` folder when it is missing, reads the
    arguments via ``get_arguments()``, opens the database, makes sure at
    most one of ID / channel URL / restore file was supplied (prompting for
    a channel URL when none was), validates the URL and finally removes any
    mp3 left over from a previous run.

    Raises:
        AssertionError: If more than one of the ID, channel URL and restore
            file arguments is supplied.
    """
    ### Verifying needed folders
    # makedirs(exist_ok=True) avoids the race between isdir() and mkdir().
    os.makedirs("downloaded_mp3s", exist_ok=True)

    ### setting variables
    self.arguments = get_arguments()
    self.sql = SqliteDatabase()

    sources = (
        self.arguments.id,
        self.arguments.restore_file,
        self.arguments.channel_url,
    )
    # Raised explicitly instead of via ``assert`` so the validation still
    # runs under ``python -O``; the exception type stays the same.
    if sum(source is not None for source in sources) > 1:
        raise AssertionError(
            "Can't have any of the ID, channel, or restore file as combined arguments."
        )

    # if there is no url, id or restore file, ask for a url
    if all(source is None for source in sources):
        # Example input: www.youtube.com/c/GlitchxCity/featured
        self.arguments.channel_url = input("URL: ")

    # if there is a url, verify if it's a correct URL
    if self.arguments.channel_url is not None:
        self.verify_url(self.arguments.channel_url)

    self.ignore_checked = self.arguments.ignore
    self.verbose = self.arguments.verbose
    self.speedmode = self.arguments.speedmode
    self.vprint(str(self.arguments), "yellow")

    ### Make sure that there are no leftovers from previous runs
    self.delete_mp3s()
def run_recognition(filename, print_output=False):
    """Fingerprint an audio file and collect its database matches.

    Args:
        filename: Path of the audio file; it is made absolute before being
            handed to ``FileReader``.
        print_output: Forwarded unchanged to ``find_matches``.

    Returns:
        A list with the matches of every channel, in channel order.
    """
    db = SqliteDatabase()
    abs_filename = os.path.abspath(filename)
    # Only the bare file name is passed along to find_matches().
    filename = os.path.basename(abs_filename)

    data = FileReader(abs_filename).parse_audio()
    Fs = data["Fs"]

    matches = []
    for channel in data["channels"]:
        matches.extend(find_matches(db, channel, Fs, filename, print_output))
    return matches
def run_recognition(filename, logger):
    """Fingerprint every channel of an audio file and report the matches.

    Progress is written to ``logger`` before and after each channel; once
    all channels are processed the accumulated matches are handed to
    ``print_match_results``. Nothing is returned.

    Args:
        filename: Path of the audio file to recognise.
        logger: Logger that receives progress messages and is forwarded to
            ``find_matches`` and ``print_match_results``.
    """
    db = SqliteDatabase()
    abs_filename = os.path.abspath(filename)
    # From here on ``filename`` is only the last path component.
    filename = abs_filename.rsplit(os.sep)[-1]

    data = FileReader(abs_filename).parse_audio()
    Fs = data["Fs"]
    channels = data["channels"]
    channel_amount = len(channels)

    matches = []
    for position, channel in enumerate(channels, start=1):
        logger.info(
            logmsg(" fingerprinting channel %d/%d", attrs=["dark"], prefix=filename),
            position,
            channel_amount,
        )
        matches.extend(find_matches(db, channel, logger, Fs, filename))
        logger.info(
            logmsg(
                " finished channel %d/%d, got %d hashes",
                attrs=["dark"],
                prefix=filename,
            ),
            position,
            channel_amount,
            len(matches),
        )

    print_match_results(db, matches, logger, filename)
#!/usr/bin/python import os import sys import libs import libs.fingerprint as fingerprint from termcolor import colored from libs.reader_file import FileReader from libs.db_sqlite import SqliteDatabase from libs.config import get_config if __name__ == '__main__': config = get_config() db = SqliteDatabase() path = "mp3/" # fingerprint all files in a directory for filename in os.listdir(path): if filename.endswith(".mp3"): reader = FileReader(path + filename) audio = reader.parse_audio() song = db.get_song_by_filehash(audio['file_hash']) song_id = db.add_song(filename, audio['file_hash']) msg = ' * %s %s: %s' % ( colored('id=%s', 'white', attrs=['dark']), # id colored('channels=%d', 'white', attrs=['dark']), # channels colored('%s', 'white', attrs=['bold']) # filename
import argparse
from argparse import RawTextHelpFormatter
# NOTE(review): izip_longest only exists in Python 2's itertools
# (Python 3 renamed it to zip_longest) - confirm the target interpreter.
from itertools import izip_longest
from termcolor import colored
from libs.config import get_config
from libs.reader_microphone import MicrophoneReader
from libs.visualiser_console import VisualiserConsole as visual_peak
from libs.visualiser_plot import VisualiserPlot as visual_plot
from libs.db_sqlite import SqliteDatabase
# from libs.db_mongo import MongoDatabase

# Script entry point: read the configuration, open the database and parse
# the recording length from the command line.
if __name__ == '__main__':
    config = get_config()
    db = SqliteDatabase()

    parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter)
    parser.add_argument('-s', '--seconds', nargs='?')
    args = parser.parse_args()

    # Without a usable -s/--seconds value there is nothing to record:
    # show the usage text and stop.
    if not args.seconds:
        parser.print_help()
        # NOTE(review): ``sys`` is not imported in the visible part of this
        # file - confirm it is imported elsewhere.
        sys.exit(0)

    seconds = int(args.seconds)

    chunksize = 2**12  # 4096
    # Hard-coded instead of int(config['channels']); 1=mono, 2=stereo
    channels = 2

    record_forever = False
#!/usr/bin/python3 from libs.db_sqlite import SqliteDatabase if __name__ == "__main__": db = SqliteDatabase() # # songs table db.query("DROP TABLE IF EXISTS songs;") print("removed db.songs") db.query( """ CREATE TABLE songs ( id INTEGER PRIMARY KEY AUTOINCREMENT, name TEXT, filehash TEXT ); """ ) print("created db.songs") # # fingerprints table db.query("DROP TABLE IF EXISTS fingerprints;") print("removed db.fingerprints") db.query( """
from libs.visualiser_plot import VisualiserPlot as visual_plot from libs.db_sqlite import SqliteDatabase song = '/home/blacksec21/hello.mp3' #seconds = 5 r = FileReader(song) # only get filename # get data,fs,file_hash,extension,songname,num_channels data = r.parse_audio() Fs = data['Fs'] num_channel = len(data['channels']) result = set() matches = [] db = SqliteDatabase() #channel_amount = len(data) def grouper(iterable, n, fillvalue=None): args = [iter(iterable)] * n return (filter(None, values) for values in izip_longest(fillvalue=fillvalue, *args)) def find_matches(samples, Fs=fingerprint.DEFAULT_FS): hashes = fingerprint.fingerprint(samples, Fs=Fs) return return_matches(hashes) def return_matches(hashes):
#!/usr/bin/python
"""Smoke test: run a trivial query through SqliteDatabase and check the result."""
import os
import sys

# Make the parent directory importable so that ``libs`` can be found when
# this script is started from its own folder.
sys.path.append(os.path.join(sys.path[0], '..'))

from libs.db_sqlite import SqliteDatabase
from termcolor import colored

if __name__ == '__main__':
    db = SqliteDatabase()

    row = db.executeOne("SELECT 2+3 as x;")

    assert row[0] == 5, "failed simple sql execution"
    # print() with a single parenthesised argument behaves the same on
    # Python 2 and Python 3; the bare print statement is a SyntaxError on 3.
    print(' * %s' % colored('ok', 'green'))
def run_recognition():
    """Record from the microphone and match the recording against the database.

    Reads the configuration via ``get_config()``, sets up logging, parses the
    ``-s/--seconds`` command-line option, records that many seconds through
    ``MicrophoneReader``, optionally plots and saves the recording, then
    fingerprints every recorded channel and passes all matches to
    ``print_match_results``.

    Exits with status 0 after printing the usage text when no usable
    ``--seconds`` value is given.
    """
    config = get_config()

    # Set up logging
    # NOTE(review): bool() of any non-empty string is True - confirm that
    # get_config() yields real booleans for these switches.
    handlers = []
    if bool(config["log.console_out"]):
        handlers.append(logging.StreamHandler())
    if bool(config["log.file_out"]):
        handlers.append(logging.FileHandler("microphone_rec.log"))
    # basicConfig() configures the root logger and returns None, so the
    # logger object has to be obtained separately.
    logging.basicConfig(
        handlers=handlers,
        format=config["log.format"],
        level=config["log.level"],
    )
    logger = logging.getLogger(__name__)

    db = SqliteDatabase()

    parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter)
    parser.add_argument("-s", "--seconds", nargs="?")
    args = parser.parse_args()

    if not args.seconds:
        parser.print_help()
        sys.exit(0)

    seconds = int(args.seconds)

    chunksize = 2**12  # 4096
    channels = int(config["channels"])  # 1=mono, 2=stereo

    record_forever = False
    save_recorded = bool(config["mic.save_recorded"])
    visualise_console = bool(config["mic.visualise_console"])
    visualise_plot = bool(config["mic.visualise_plot"])

    reader = MicrophoneReader()
    reader.start_recording(seconds=seconds, chunksize=chunksize, channels=channels)
    logger.info(" * started recording..")

    while True:
        # Number of chunks needed to cover the requested duration.
        buffer_size = int(reader.rate / reader.chunksize * seconds)

        for i in range(buffer_size):
            nums = reader.process_recording()

            if visualise_console:
                msg = colored(" %05d", attrs=["dark"]) + colored(" %s", "green")
                # The format has two placeholders, so the values returned by
                # calc() must be unpacked into separate logging arguments.
                logger.info(msg, *visual_peak.calc(nums))
            else:
                logger.info(" processing %d of %d..", i, buffer_size)

        if not record_forever:
            break

    if visualise_plot:
        visual_plot.show(reader.get_recorded_data()[0])

    reader.stop_recording()
    logger.info(" * recording has been stopped")

    data = reader.get_recorded_data()
    logger.info(" * recorded %d samples", len(data[0]))

    if save_recorded:
        reader.save_recorded("test.wav")

    Fs = fingerprint.DEFAULT_FS
    channel_amount = len(data)

    matches = []
    for channeln, channel in enumerate(data):
        logger.info(" fingerprinting channel %d/%d", channeln + 1, channel_amount)
        matches.extend(find_matches(db, channel, logger, Fs))
        logger.info(
            " finished channel %d/%d, got %d hashes",
            channeln + 1,
            channel_amount,
            len(matches),
        )

    print_match_results(db, matches, logger)
#!/usr/bin/python
import argparse
import sys
from libs.db_sqlite import SqliteDatabase
from termcolor import colored
from argparse import RawTextHelpFormatter

# Command-line helper: run the SQL statement given with -q/--query and
# print the single row returned by SqliteDatabase.executeOne().
if __name__ == '__main__':
    parser = argparse.ArgumentParser(formatter_class=RawTextHelpFormatter)
    parser.add_argument('-q', '--query', nargs='?')
    args = parser.parse_args()

    if args.query:
        db = SqliteDatabase()
        row = db.executeOne(args.query)
        print(row)
    else:
        # No (or an empty) query: show the usage text and stop.
        parser.print_help()
        sys.exit(0)
from libs.config import get_config
from libs.db_sqlite import SqliteDatabase
import pandas as pd

# Export the id, name and file hash of every row in the songs table to a
# pickled DataFrame.
if __name__ == '__main__':
    config = get_config()
    db = SqliteDatabase()

    distinct_names_in_db = db.executeAll(" SELECT id,name,filehash FROM songs")

    # Pair each column label with the value at the same position in the row.
    columns = ['id', 'name', 'hash']
    records = [dict(zip(columns, row)) for row in distinct_names_in_db]

    df = pd.DataFrame.from_dict(records)
    df.to_pickle('./db/fingerprinted_songs.pkl')
from termcolor import colored from libs.reader_file import FileReader from libs.db_sqlite import SqliteDatabase from libs.config import get_config import argparse from argparse import RawTextHelpFormatter from itertools import izip_longest from libs.reader_microphone import MicrophoneReader from libs.visualiser_console import VisualiserConsole as visual_peak from libs.visualiser_plot import VisualiserPlot as visual_plot from libs.db_sqlite import SqliteDatabase if __name__ == '__main__': config = get_config() db = SqliteDatabase() path = "mp3/" # fingerprint all files in a directory for filename in os.listdir(path): if filename.endswith(".mp3"): reader = FileReader(path + filename) audio = reader.parse_audio() song = db.get_song_by_filehash(audio['file_hash']) song_id = db.add_song(filename, audio['file_hash']) msg = ' * %s %s: %s' % ( colored('id=%s', 'white', attrs=['dark']), # id colored('channels=%d', 'white', attrs=['dark']), # channels
class Finder:
    """Scan YouTube videos for audio that matches the fingerprint database.

    A run checks either a single video (``arguments.id``) or every video of
    a channel (``arguments.channel_url`` or a saved ``arguments.restore_file``).
    Audio is downloaded as mp3 into ``downloaded_mp3s``, fingerprinted with
    ``run_recognition`` and matches are appended to ``MATCHES.txt``.
    """

    def __init__(self):
        """Parse the command-line arguments and prepare the working directory.

        Raises:
            AssertionError: If more than one of the ID, channel URL and
                restore file arguments is supplied.
        """
        ### Verifying needed folders
        # makedirs(exist_ok=True) avoids the race between isdir() and mkdir().
        os.makedirs("downloaded_mp3s", exist_ok=True)

        ### setting variables
        self.arguments = get_arguments()
        self.sql = SqliteDatabase()

        sources = (
            self.arguments.id,
            self.arguments.restore_file,
            self.arguments.channel_url,
        )
        # Raised explicitly instead of via ``assert`` so the validation still
        # runs under ``python -O``; the exception type stays the same.
        if sum(source is not None for source in sources) > 1:
            raise AssertionError(
                "Can't have any of the ID, channel, or restore file as combined arguments."
            )

        # if there is no url, id or restore file, ask for a url
        if all(source is None for source in sources):
            # Example input: www.youtube.com/c/GlitchxCity/featured
            self.arguments.channel_url = input("URL: ")

        # if there is a url, verify if it's a correct URL
        if self.arguments.channel_url is not None:
            self.verify_url(self.arguments.channel_url)

        self.ignore_checked = self.arguments.ignore
        self.verbose = self.arguments.verbose
        self.speedmode = self.arguments.speedmode
        self.vprint(str(self.arguments), "yellow")

        ### Make sure that there are no leftovers from previous runs
        self.delete_mp3s()

    def verify_url(self, url):
        """Validate a channel URL and store its normalised ``/videos`` URL.

        The URL must contain a ``/c/``, ``/channel/``, ``/u/`` or ``/user/``
        path. While it does not, the user is asked for another URL (which is
        also stored in ``self.arguments.channel_url``) and that one is
        validated in turn, so ``self.channel_url`` is always set on return.

        Args:
            url: The channel URL to check.

        Returns:
            Always ``True``.
        """
        expr_channel = r"^.*(/c(hannel)?/[a-zA-Z0-9-_]+)"
        expr_user = r"^.*(/u(ser)?/[a-zA-Z0-9-_]+)"

        while True:
            # A channel path takes precedence over a user path.
            path_match = re.match(expr_channel, url) or re.match(expr_user, url)
            if path_match is not None:
                break
            url = input(
                "The URL you entered is invalid. Please enter a valid URL: ")
            self.arguments.channel_url = url

        self.channel_url = ("https://www.youtube.com" + path_match.group(1) +
                            "/videos")
        return True

    def vprint(self, text: str, colour: str = "white"):
        """
        Helpful function for printing when verbose is turned on
        """
        if self.verbose:
            cprint(text, colour)

    def get_song_mp3_do_not_use2(self, id: str) -> str:
        """
        Downloads the first 15 seconds of audio from a youtube video in mp3
        format, given a video id, through the youtube_dl Python API.

        Returns the absolute path of the first entry found in the
        ``downloaded_mp3s`` directory.
        """
        dir_here = os.path.abspath(os.getcwd())
        dir_mp3s = os.path.join(dir_here, "downloaded_mp3s")
        url = "https://youtube.com/watch?v=" + id
        interm_out = os.path.join(dir_mp3s, f"{id}.%(ext)s")
        # Restrict the extracted audio to the first 15 seconds.
        postprocessor_args = ["-ss", "00:00:00.00", "-t", "00:00:15.00"]
        ydl_opts = {
            "format": "bestaudio/best",
            "outtmpl": interm_out,
            'postprocessors': [{
                'key': 'FFmpegExtractAudio',
                'preferredcodec': 'mp3',
                'preferredquality': '192'
            }],
            "postprocessor_args": postprocessor_args,
            "quiet": True,
            "no_warnings": True
        }
        with youtube_dl.YoutubeDL(ydl_opts) as ydl:
            ydl.download([url])
        self.vprint("Audio downloaded! Performing fingerprint match scan...")
        return os.path.abspath(
            os.path.join("downloaded_mp3s",
                         os.listdir("downloaded_mp3s")[0]))

    def get_song_mp3(self, id: str) -> str:
        """
        Downloads the audio from a youtube video in mp3 format given a video id.

        The bundled ``youtube-dl`` executable (``youtube-dl/youtube-dl`` or
        ``youtube-dl.exe`` on Windows, relative to the working directory) is
        run as a subprocess. In speed mode only the first 15 seconds are kept.

        Returns the absolute path of the downloaded mp3, or ``None`` when the
        download failed (the failure is appended to ``missed.txt``).
        """
        url = "https://youtube.com/watch?v=" + id
        dir_here = os.path.abspath(os.getcwd())
        dir_youtube_dl_dir = os.path.join(dir_here, "youtube-dl")

        ### Set youtube-dl executable for windows and linux users
        if sys.platform == "win32":
            youtube_dl_exec = "youtube-dl.exe"
        else:
            youtube_dl_exec = "youtube-dl"
        path_youtube_dl_exec = os.path.join(dir_youtube_dl_dir,
                                            youtube_dl_exec)

        ### The output file is named after the video id; youtube-dl fills in
        ### the extension
        dir_downloaded_mp3s = os.path.join(dir_here, "downloaded_mp3s")
        destination_arg = os.path.join(dir_downloaded_mp3s, f"{id}.%(ext)s")

        ### Make the mp3 folder which will contain a downloaded mp3
        os.makedirs(dir_downloaded_mp3s, exist_ok=True)

        ### Setting up the command for the different modes
        if not self.speedmode:
            cmd = [
                path_youtube_dl_exec, "-x", "--audio-format", "mp3",
                "--no-warnings", "-o", destination_arg, url
            ]
        else:
            cmd = [
                path_youtube_dl_exec, "-x", "--postprocessor-args",
                "-ss 00:00:00.00 -t 00:00:15.00", url, "--audio-format",
                "mp3", "-o", destination_arg
            ]

        try:
            # The command is passed as an argument list: a single joined
            # string without a shell is treated as the program name on POSIX
            # and can never be executed there.
            subprocess.check_output(cmd)
            sleep(0.1)
            self.vprint(
                "Audio downloaded! Performing fingerprint match scan...")
        except KeyboardInterrupt:
            ### completely exit program if this is what user wants
            self.delete_mp3s()
            sys.exit()
        except Exception:
            ### always show error even when verbose is off
            cprint(
                "Video audio couldn't be downloaded. Skipping for now. Please check missed.txt for more info.",
                "red")
            with open("missed.txt", "a") as f:
                f.write(
                    f"{currentDate} {currentTime}: Could not check video with ID {id}. Please copy and paste this URL in your browser to check: 'youtube.com/watch?v={id}'\n"
                )
            ### when return value is None, the caller moves on to the next song
            return None

        # Prefer the file that belongs to this id: with several downloads
        # running at once the first directory entry may be another video.
        expected_path = os.path.join(dir_downloaded_mp3s, f"{id}.mp3")
        if os.path.isfile(expected_path):
            return expected_path
        leftovers = os.listdir("downloaded_mp3s")
        if not leftovers:
            return None
        return os.path.abspath(os.path.join("downloaded_mp3s", leftovers[0]))

    def delete_mp3s(self):
        """
        Deletes every file in the downloaded_mp3s folder.
        """
        directory = os.path.join(os.getcwd(), "downloaded_mp3s")
        for file in os.listdir("downloaded_mp3s"):
            os.remove(os.path.join(directory, file))

    def get_channel_source(self):
        """
        Returns the HTML source of the channel's video page.

        When a restore file was supplied its content is returned as-is.
        Otherwise the page is opened in Chrome through selenium, scrolled
        until the page source stops changing, saved (ASCII only) to
        ``restore_file.html`` and returned.
        """
        ### if a restore file is supplied, use that instead
        if self.arguments.restore_file is not None:
            with open(self.arguments.restore_file) as f:
                return f.read()

        ### Open a browser and catch chromedriver not found error
        try:
            driver = webdriver.Chrome()
        except common.exceptions.WebDriverException:
            try:
                driver = webdriver.Chrome(
                    executable_path=
                    r"C:\ProgramData\chocolatey\bin\chromedriver.exe")
            except Exception:
                print(
                    "If you see this message, that means selenium can't find 'chromedriver.exe.'"
                )
                print(
                    "To fix this, search for 'chromedriver.exe' on your file system."
                )
                print(
                    r"Example of 'chromedriver.exe' path: 'C:\ProgramData\chocolatey\bin\chromedriver.exe'"
                )
                location = input(
                    "Once you've found 'chromedriver.exe', paste the location to it here: "
                )
                driver = webdriver.Chrome(executable_path=location)
                print(
                    "Alternatively, you can put it in the code yourself so you don't have to constantly fill this in."
                )
                print(
                    "To do that, in the file 'find_stable.py', search for the line \"driver = webdriver.Chrome()\" and in between the brackets put:"
                )
                print("executable_path = (your chromedriver location)")

        # try/finally so the browser is closed even when loading or
        # scrolling fails.
        try:
            driver.get(self.channel_url)
            sleep(5)
            source = driver.page_source

            ### Keep scrolling until we hit the end of the page
            scroll_by = 5000
            driver.execute_script(f"window.scrollBy(0, {scroll_by});")
            while driver.page_source != source:
                source = driver.page_source
                driver.execute_script(f"window.scrollBy(0, {scroll_by});")
                sleep(0.1)
        finally:
            driver.quit()

        # Non-ASCII characters are dropped from the saved copy only; the
        # returned source is left untouched.
        with open("restore_file.html", "w") as f:
            f.write(source.encode('utf-8').decode('ascii', 'ignore'))
        return source

    def check_file(self, fpath, thresh=20):
        """
        Fingerprint and try to match a song against database

        The video id is taken from the file name (without extension). A
        confidence of 400 or more is logged as an exact match, a confidence
        of at least ``thresh`` as a possible match; both are appended to
        ``MATCHES.txt``.

        Returns True when the confidence is at least ``thresh``.
        """
        ### Getting ID from filepath
        id_ = os.path.splitext(os.path.basename(fpath))[0]

        matches = run_recognition(fpath)
        song = align_matches(self.sql, matches)
        confidence = song['CONFIDENCE']
        self.vprint(f"Confidence of a match: {confidence}.", "yellow")

        ### If there's an exact match, give feedback to user, otherwise if
        ### there's a possible match notify the user as well
        if confidence >= 400:
            self.vprint(f"EXACT MATCH FOUND FOR ID: {id_}, CHECK MATCHES.TXT",
                        "green")
            with open("MATCHES.txt", "a") as f:
                f.write(
                    f"{currentDate} {currentTime}: You've found an identical match with the database. Video with ID {id_} is an EXACT match, with a confidence of {confidence}!! Check it out at youtube.com/watch?v={id_}!\n"
                )
        elif confidence >= thresh:
            self.vprint(
                f"POSSIBLE MATCH FOUND FOR ID: {id_}, CHECK MATCHES.TXT",
                "green")
            with open("MATCHES.txt", "a") as f:
                f.write(
                    f"{currentDate} {currentTime}: Video with YT ID {id_} has a possible match with the database, with a confidence of {confidence}! Check it out at youtube.com/watch?v={id_}!\n"
                )
        return confidence >= thresh

    @staticmethod
    def _parse_duration(raw_duration):
        """Convert ``[[H:]M:]S`` text to seconds; None if it is not a time."""
        if not raw_duration or raw_duration[0] not in "0123456789":
            return None
        time_units = raw_duration.split(":")
        seconds = int(time_units[-1])
        minutes = int(time_units[-2]) if len(time_units) > 1 else 0
        hours = int(time_units[-3]) if len(time_units) > 2 else 0
        return seconds + (minutes * 60) + (hours * 3600)

    @staticmethod
    def _batches(items, size):
        """Split ``items`` into consecutive lists of at most ``size`` items."""
        size = max(1, size)
        return [
            items[start:start + size] for start in range(0, len(items), size)
        ]

    def get_videos(self, source):
        """
        Extract video ids and durations from channel video page source

        Returns a list of dicts with the keys "id" and "duration" (seconds).
        Ids and durations are paired by their position in the page.
        """
        ### get video ids from page source.
        watch_expr = r'href="/watch\?v=([a-zA-Z0-9_-]+)"'
        ### For each video, the id is put twice in the page source,
        ### so we have to use [::2] to grab only half of the ids
        video_ids = [
            match.groups()[0] for match in re.finditer(watch_expr, source)
        ][::2]

        ### Get duration of video corresponding to each video id.
        soup = bs.BeautifulSoup(source, "html.parser")
        ### all time durations are contained within a tag with class
        ### "style-scope ytd-thumbnail-overlay-time-status-renderer"
        time_spans = soup.findAll("span", {
            "class":
            "style-scope ytd-thumbnail-overlay-time-status-renderer"
        })

        ### Making video durations list; overlays that are empty or do not
        ### start with a digit are skipped
        durations = []
        for time_span in time_spans:
            duration = self._parse_duration(time_span.text.strip())
            if duration is not None:
                durations.append(duration)

        # We may need to add also the video titles if we want to include a
        # speedmode but for now this will do.
        return [{
            "id": video_id,
            "duration": duration
        } for video_id, duration in zip(video_ids, durations)]

    def check_one_video(self, id_):
        """
        Download and fingerprint a single video; a possible match is
        appended to MATCHES.txt.
        """
        song_fpath = self.get_song_mp3(id_)
        if song_fpath is None:
            return
        # NOTE(review): this uses check_file's default threshold, while
        # check_channel passes self.arguments.threshold - confirm intended.
        possible_match = self.check_file(song_fpath)
        if possible_match:
            song_fname = os.path.split(song_fpath)[1]
            with open("MATCHES.txt", "a") as f:
                f.write(
                    f"{currentDate} {currentTime}: {song_fname} with YT ID {id_} has a possible match with the database! Check it out!\n"
                )
        else:
            self.vprint("Probably not a match.")

    def check_channel(self, min_duration, max_duration):
        """
        Download and fingerprint every eligible video of the channel.

        A video is eligible when its duration lies within
        [min_duration, max_duration] and, if ``ignore_checked`` is set, its
        id is not yet recorded as checked. Videos are handled in batches of
        ``arguments.threads``: each batch is downloaded in parallel threads,
        fingerprinted in parallel threads and then deleted again.
        """
        # Get the HTML source of the channel's video section
        source = self.get_channel_source()
        videos = self.get_videos(source)

        target_videos = []
        for video in videos:
            in_range = min_duration <= video["duration"] <= max_duration
            # Already-checked videos are only skipped when asked to.
            if in_range and not (self.ignore_checked
                                 and self.sql.in_checked_ids(video["id"])):
                target_videos.append(video)

        ### Get total number of videos to display progress percentage
        total_videos = len(target_videos)
        if total_videos == 0:
            self.vprint(
                "All videos have been checked or are longer than than the maximum duration.",
                "green")
            sys.exit()

        processed = 0
        # _batches() also yields the final, shorter batch; rounding
        # len / threads would drop the trailing videos.
        for section in self._batches(target_videos, self.arguments.threads):
            ### Downloading mp3 with multithreading
            jobs = []
            for video in section:
                id_ = video["id"]
                jobs.append(
                    threading.Thread(target=self.get_song_mp3, args=(id_, )))
                self.vprint(f"Downloading audio from video with ID {id_}...")
            try:
                for job in jobs:
                    processed += 1
                    job.start()
                for job in jobs:
                    job.join()
            except KeyboardInterrupt:
                self.delete_mp3s()
                sys.exit()

            ### Fingerprinting with multithreading
            jobs = []
            for file in os.listdir("downloaded_mp3s"):
                jobs.append(
                    threading.Thread(target=self.check_file,
                                     args=(
                                         os.path.join("downloaded_mp3s", file),
                                         self.arguments.threshold,
                                     )))
                # The file name without extension is the video id.
                filename, _extension = os.path.splitext(file)
                self.sql.add_checked_id(filename)
            for job in jobs:
                job.start()
            for job in jobs:
                job.join()

            self.vprint(f"{100*(processed)/total_videos:.2f}% done")
            self.delete_mp3s()
        print("")
        self.delete_mp3s()

    def main(self):
        """
        Check a single video when an id was given, otherwise scan the channel
        and report how long the scan took.
        """
        if self.arguments.id is not None:
            self.check_one_video(self.arguments.id)
        else:
            self.check_channel(min_duration=self.arguments.min_duration,
                               max_duration=self.arguments.max_duration)
            self.vprint(
                f"Duration of channel scan in seconds: {tm.time() - start_time}"
            )