def iterate(current_node):
    print(current_node)
    if current_node is None:
        return
    if isinstance(current_node, spacy.tokens.Token):
        word_matching_cols = db_helper.get_matching_columns(
            current_node, helpers.to_unicode(str(current_node)))
        phrase_matching_cols = db_helper.get_matching_columns(current_node, u'')
        return wrapper_node(current_node, word_matching_cols, phrase_matching_cols)
    if isinstance(current_node, FirstOrderExpression):
        if current_node.are_all_expressions_tokens():
            # try every ordering of the tokens; the first permutation that
            # appears verbatim in `line` (the input sentence, taken from the
            # enclosing scope) is used as the phrase
            phrase_permutations = list(permutations(current_node.expressions))
            phrase_matching_cols = []
            phrase = u''
            for p in phrase_permutations:
                phrase = u' '.join(str(tok) for tok in p)
                if phrase in line:
                    print(phrase)
                    phrase_matching_cols = db_helper.get_matching_columns(phrase, u'')
                    break
            output = []
            for expression in current_node.expressions:
                word_matching_cols = db_helper.get_matching_columns(
                    phrase, helpers.to_unicode(str(expression)))
                output.append(wrapper_node(current_node, word_matching_cols,
                                           phrase_matching_cols))
            return FirstOrderExpression(current_node.operator, output)
        else:
            # not all leaves are tokens yet: recurse into sub-expressions
            output = [iterate(smaller_exp) for smaller_exp in current_node.expressions]
            return FirstOrderExpression(current_node.operator, output)
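# A minimal, self-contained sketch of the permutation step above: every
# ordering of the tokens is joined into a candidate phrase, and the first
# candidate that occurs in the input line wins (token strings are made up here).
from itertools import permutations

def first_phrase_in_line(tokens, line):
    for p in permutations(tokens):
        phrase = u' '.join(str(tok) for tok in p)
        if phrase in line:
            return phrase
    return None

# first_phrase_in_line([u"York", u"New"], u"flights from New York") -> u"New York"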
def get_representative_columns(self, table_name):
    """
    Return columns that can represent a row to a human,
    for example columns like ID, Name, Description.
    :param table_name:
    :return: list of representative column names
    """
    to_return = []
    if table_name not in self._columns:
        print("Table not found: " + table_name)
        return to_return
    for column in self._columns[table_name]:
        # add if it's a primary key
        if column[2]:
            to_return.append(column[0])
        elif helpers.similarity_score(helpers.to_unicode(column[0]),
                                      helpers.to_unicode("name")) > 0.5:
            to_return.append(column[0])
        elif helpers.similarity_score(helpers.to_unicode(column[0]),
                                      helpers.to_unicode("description")) > 0.5:
            to_return.append(column[0])
    return to_return
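# A hypothetical layout of self._columns assumed by the method above:
# each entry is a (column_name, sql_type, is_primary_key) tuple.
sample_columns = {
    "customers": [("id", "INTEGER", True),
                  ("full_name", "TEXT", False),
                  ("notes", "TEXT", False)],
}
# get_representative_columns("customers") keeps "id" (primary key) and any
# column whose name scores above 0.5 against "name" or "description".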
def should_store(self, column_name, distinct_count, row_count):
    # if the number of distinct values is less than the number of rows,
    # the column probably holds repeating, matchable values
    if 0 < distinct_count < row_count:
        return True
    # also keep columns whose name resembles "name"
    if helpers.similarity_score(helpers.to_unicode(column_name),
                                helpers.to_unicode("name")) > 0.2:
        return True
    return False
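# The same heuristic in isolation, with the NLP similarity check replaced by
# a plain substring test purely for illustration:
def should_store_sketch(column_name, distinct_count, row_count):
    if 0 < distinct_count < row_count:
        return True                        # repeating values: probably matchable
    return "name" in column_name.lower()   # stand-in for the similarity check

assert should_store_sketch("city", 40, 1000)
assert should_store_sketch("customer_name", 1000, 1000)
assert not should_store_sketch("uuid", 1000, 1000)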
def download_youtube_by_itag(yt, itag, target):
    target = target or os.getcwd()
    filepath = None
    url = yt.watch_url
    try:
        stream = yt.streams.get_by_itag(itag)
        # encode the stream's properties into the filename
        filename = '{title}_{video}_{video_codec}_{audio}_{audio_codec}_{fps}_{bitrate}_{filesize}'.format(
            title=stream.title,
            video=stream.resolution,
            video_codec=stream.video_codec,
            audio=stream.abr,
            audio_codec=stream.audio_codec,
            fps=stream.fps,
            bitrate=stream.bitrate,
            filesize=stream.filesize)
        filename = to_unicode(safe_filename(filename))
        logger.debug("Filename = {filename}".format(filename=filename))
        yt.register_on_progress_callback(on_progress)
        filepath = stream.download(output_path=target, filename=filename)
    except Exception:
        logger.exception(
            "Unable to download YT, url = [{url}], itag = [{itag}]".format(
                url=url, itag=itag))
    return filepath
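# A hedged usage sketch (assumes pytube is installed; itag 18 is commonly the
# 360p progressive MP4 stream, but the available itags vary per video):
#
#   from pytube import YouTube
#   yt = YouTube('https://www.youtube.com/watch?v=xwsYvBYZcx4')
#   path = download_youtube_by_itag(yt, itag=18, target='/tmp')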
def match_with_column_name(self, phrase, value, column_name):
    """
    Uses NLP similarity to estimate how well a column name matches a phrase/value.
    :param phrase:
    :param value:
    :param column_name:
    :return: a real value indicating the match index of phrase/value and column
    """
    bing_corrector = spell_corrector.bing_spell_corrector()
    corrected_name = bing_corrector.spell_correct(column_name)
    # use the corrected name to compute similarity
    word_similarity = helpers.similarity_score(
        helpers.to_unicode(corrected_name),
        helpers.to_unicode(value))
    phrase_similarity = helpers.similarity_score(
        helpers.to_unicode(corrected_name),
        helpers.to_unicode(phrase))
    return max(word_similarity, phrase_similarity)
def match_with_column_name(self, phrase, value, column_name, tags=None):
    """
    Uses NLP similarity to estimate how well a column name matches a phrase/value,
    optionally boosted by user-supplied tags.
    :param phrase:
    :param value:
    :param column_name:
    :param tags: optional list of tag strings to match against the column name
    :return: a real value indicating the match index of phrase/value and column
    """
    bing_corrector = spell_corrector.bing_spell_corrector()
    corrected_name = bing_corrector.spell_correct(column_name)
    # use the corrected name to compute similarity
    word_similarity = helpers.similarity_score(
        helpers.to_unicode(corrected_name),
        helpers.to_unicode(value))
    phrase_similarity = helpers.similarity_score(
        helpers.to_unicode(corrected_name),
        helpers.to_unicode(phrase))
    tag_similarity = 0.0
    for tag in (tags or []):
        cur_sim = helpers.similarity_score(
            helpers.to_unicode(tag),
            helpers.to_unicode(corrected_name))
        tag_similarity = max(tag_similarity, cur_sim)
    return max(word_similarity, phrase_similarity, tag_similarity)
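# A self-contained sketch of the max-of-similarities idea behind both variants
# above, with the Bing spell corrector and helpers.similarity_score replaced
# by difflib stand-ins:
from difflib import SequenceMatcher

def similarity_sketch(a, b):
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

def match_with_column_name_sketch(phrase, value, column_name, tags=None):
    scores = [similarity_sketch(column_name, value),
              similarity_sketch(column_name, phrase)]
    scores.extend(similarity_sketch(tag, column_name) for tag in (tags or []))
    return max(scores)

# match_with_column_name_sketch(u"customer city", u"London", u"city",
#                               tags=[u"location"]) is dominated by the phrase
# term, since "city" appears verbatim in the phrase.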
def get_matching_table(self, phrase):
    """
    Takes a phrase or word and returns the table that most likely
    contains the necessary info, together with its similarity score.
    :param phrase:
    :return: (table_name, score)
    """
    max_score = 0.0
    max_score_table = ""
    corrector = spell_corrector.bing_spell_corrector()
    for table in self._columns.keys():
        corrected_table_name = corrector.spell_correct(table)
        cur_score = helpers.similarity_score(
            helpers.to_unicode(corrected_table_name),
            helpers.to_unicode(phrase))
        if cur_score > max_score:
            max_score = cur_score
            max_score_table = table
    return (max_score_table, max_score)
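# The same argmax-over-tables pattern in isolation (spell correction omitted;
# the scorer is a stand-in for helpers.similarity_score):
from difflib import SequenceMatcher

def best_matching_table(tables, phrase):
    def score(t):
        return SequenceMatcher(None, t.lower(), phrase.lower()).ratio()
    if not tables:
        return ("", 0.0)
    best = max(tables, key=score)
    return (best, score(best))

# best_matching_table(["customers", "orders"], "customer") -> ("customers", ...)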
def get_captions(yt, lang):
    if lang:
        filename = to_unicode(safe_filename(yt.title))
        codes = query_captions_codes(yt)
        for code in codes:
            # lang=True means "download every available language"
            if lang is True or code.lower() == lang.lower():
                try:
                    filepath = yt.captions[code].download(
                        title=filename, srt=True, output_path=args.target)
                    logger.info(
                        'captions language code = [{code}] downloaded [{filepath}]'
                        .format(code=code, filepath=filepath))
                except Exception:
                    logger.error(
                        'unable to download caption code = [{code}]'.format(
                            code=code))
    return True
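# A hedged usage sketch (caption codes look like 'en' or 'zh-TW';
# passing lang=True downloads every available track):
#
#   get_captions(yt, 'zh-TW')   # one language
#   get_captions(yt, True)      # all languages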
def get_correct_yt(url, retry):
    yt = None
    # Get YouTube object with a correct filename
    if args.proxy:
        logger.info('via proxy = [%s]' % args.proxy)
        proxy_params = {urlparse.urlparse(args.url).scheme: args.proxy}
    else:
        proxy_params = None
    for i in range(1, retry + 1):
        logger.debug(f"{i} retry in get_correct_yt()")
        try:
            yt = YouTube(url,
                         on_progress_callback=on_progress,
                         proxies=proxy_params)
            filename = to_unicode(safe_filename(yt.title))
            logger.debug("URL = {url}".format(url=url))
            logger.debug("Filename = {filename}".format(filename=filename))
            # a title of 'YouTube' means the metadata has not loaded yet
            if filename != 'YouTube':
                break
        except Exception as ex:
            logger.error('Unable to get FileName from = [%s]' % url)
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            logger.error('Due to the reason = [%s]' % message)
    else:
        # for/else: the loop never hit `break`, so every attempt failed
        logger.error(
            "Unable to get correct YouTube object for {retry} times, skip download this time"
            .format(retry=retry))
        yt = None
    return yt
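# The retry/for-else pattern above, in isolation: the else branch runs only
# when the loop finishes without a break, i.e. when no attempt produced a
# usable result.
def retry_until(make, is_good, attempts):
    result = None
    for _ in range(1, attempts + 1):
        try:
            result = make()
            if is_good(result):
                break
        except Exception:
            pass
    else:
        result = None  # all attempts failed
    return result

# e.g. retry_until(lambda: YouTube(url), lambda yt: yt.title != 'YouTube', 10)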
def get_matching_columns(self, phrase, value, tags=None, table_name=""):
    """
    Goes through all the tables, matches against all columns
    and returns potential matches.
    :param phrase: phrase for which we are figuring out the query
    :param value: value to be looked up in cached data
    :param tags: optional list of tag strings used for column-name matching
    :param table_name: if non-empty, restrict the search to this table
    :return: list of (table_name, column_name, score)
    """
    to_return = []
    # it's possible that we get numbers as strings
    if helpers.is_number(value):
        value = float(value)
    # go through all the tables
    for table in self._columns.keys():
        # if a table name is provided, search only in that table
        if table_name.strip() != "" and table != table_name:
            continue
        # go through each column in the table
        found_some_column = False
        for column in self._columns[table]:
            # check if the Python type matches the SQL column type
            if isinstance(value, str) and column[1] == "TEXT":
                # check if it's a perfect match
                if column[0] in self._distinct_values[table]:
                    match_result = self.match_with_values(
                        self._distinct_values[table][column[0]], value)
                    score = match_result[0]
                    # sometimes the value is a substring of stored names;
                    # in such cases, raise the similarity score to 0.5
                    if helpers.similarity_score(helpers.to_unicode(column[0]),
                                                helpers.to_unicode("name")) > 0.5 \
                            and any(value in string for string in
                                    self._distinct_values[table][column[0]]):
                        score = max(score, 0.5)
                    if score > 0:
                        to_return.append((table, column[0], score))
                        found_some_column = True
            # TODO::Improve matching logic by considering a small error range
            elif isinstance(value, (int, float)) and column[1] in ("INTEGER", "REAL"):
                if column[0] in self._distinct_values[table]:
                    match_result = self.match_for_numbers(
                        self._distinct_values[table][column[0]], value)
                    if match_result[0] > 0:
                        to_return.append((table, column[0], match_result[0]))
                        found_some_column = True
        if not found_some_column:
            # fall back to the column whose name matches the phrase/value best
            match_score = 0.0
            match_col = ""
            for column in self._columns[table]:
                col_score = self.match_with_column_name(
                    phrase, value, column[0], tags)
                # TODO::Refine the filtering logic
                if col_score > 0 and col_score > match_score:
                    match_score = col_score
                    match_col = column[0]
            to_return.append((table, match_col, match_score))
    return to_return
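# A hypothetical call illustrating the (table, column, score) triples returned;
# tables without a value match fall back to their best name-similarity column:
#
#   db_helper.get_matching_columns(u"customers in London", u"London")
#   # -> [("customers", "city", 1.0), ("orders", "ship_city", 0.4), ...]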
def _download(yt, itag=18, out=None, replace=True, skip=True,
              proxies=None, retry=10):
    """Start downloading a YouTube video.

    :param YouTube yt: A constructed YouTube object.
    :param int itag: YouTube format identifier code.
    :param str out: Output file name or directory.
    :param bool replace: Replace an existing file of a different size.
    :param bool skip: Skip the download when the file already exists.
    """
    thumbnail_url = yt.thumbnail_url
    stream = yt.streams.get_by_itag(itag)
    filesize = stream.filesize
    filename = to_unicode(stream.default_filename)
    logger.info('Youtube filename = [%s]' % filename)
    logger.info('Youtube filesize = [%s]' % filesize)
    summary = (
        '\n{title} |\n{description} |\n\n{views} views | {rating} rating | {length} secs'
        .format(title=yt.title,
                description=yt.description,
                views=yt.views,
                rating=yt.rating,
                length=yt.length))
    logger.info(summary)
    print(summary)
    print('\n{fn} | {fs} bytes'.format(fn=filename, fs=filesize))

    # detect if out is a directory
    outdir = None
    if out:
        if os.path.isdir(out):
            outdir = out
            out = None
        else:
            filename = out
    if outdir:
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        filename = os.path.join(outdir, filename)
    filename = to_unicode(filename)

    # check file existence and decide whether to skip;
    # add a numeric ' (x)' suffix if the filename already exists
    if os.path.exists(filename):
        fsize = os.path.getsize(filename)
        logger.info('filename = [%s] filesize = [%s] already exists in system'
                    % (filename, fsize))
        if fsize == filesize:
            if skip:
                logger.info(
                    'filename = [%s] filesize = [%s] already exists in system and skip download again'
                    % (filename, fsize))
                return filename, thumbnail_url
            elif not replace:
                filename = filename_fix_existing(filename)
        else:
            name, ext = os.path.splitext(filename)
            filename = filename_fix_existing(filename)
            # TODO: workaround, remove later
            # give a second chance: an earlier run may have saved '<name>_(1)<ext>'
            if skip:
                try:
                    oldfilename = u'{}{}{}'.format(name, '_(1)', ext)
                    fsize = os.path.getsize(oldfilename)
                    logger.debug(
                        'Trying to check filename = [%s] and filesize = [%s] if exists and match'
                        % (oldfilename, fsize))
                    if fsize == filesize:
                        logger.info(
                            'filename = [%s] filesize = [%s] already exists in system and skip download again'
                            % (oldfilename, fsize))
                        return oldfilename, thumbnail_url
                except OSError:
                    pass

    name, ext = os.path.splitext(filename)
    logger.info('target local filename = [%s]' % filename)
    logger.info('target local filesize = [%s]' % filesize)

    # download into a temporary file, then move it into place
    (fd, tmpfile) = tempfile.mkstemp(suffix=ext, prefix="", dir=outdir, text=False)
    tmpfile = to_unicode(tmpfile)
    os.close(fd)
    os.unlink(tmpfile)
    logger.info('target local tmpfile = [%s]' % tmpfile)
    tmppath, tmpbase = ntpath.split(tmpfile)
    tmpname, tmpext = os.path.splitext(tmpbase)
    logger.debug(f'target local tmpfile name = [{tmpname}], ext = [{tmpext}]')
    try:
        stream.download(output_path=tmppath,
                        filename=tmpname,
                        filename_prefix=None,
                        skip_existing=skip)
        sys.stdout.write('\n')
        shutil.move(tmpfile, filename)
        logger.info("File = [{0}] Saved".format(filename))
    except KeyboardInterrupt:
        sys.exit(1)
    sys.stdout.write('\n')
    return (filename, thumbnail_url)
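# The download-to-tmpfile-then-move pattern used above, in isolation: writing
# into a temporary file first means a partially downloaded file never lands
# on the final filename.
import os
import shutil
import tempfile

def save_atomically(data, filename):
    # create the temp file next to the destination so the move is a rename
    fd, tmpfile = tempfile.mkstemp(dir=os.path.dirname(filename) or None)
    with os.fdopen(fd, 'wb') as fh:
        fh.write(data)
    shutil.move(tmpfile, filename)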
def unitest():
    base = os.path.basename(__file__)
    filename, _ = os.path.splitext(base)
    test_file = '{name}.{ext}_unittest'.format(name=filename, ext='ini')
    # url = 'https://www.youtube.com/watch?v=F1fqet9V494'
    url = 'https://www.youtube.com/watch?v=xwsYvBYZcx4'
    playlist = 'https://www.youtube.com/playlist?list=PLteWjpkbvj7rUU5SFt2BlNVCQqkjulPZR'

    def test1():
        logger.info("Testing with 'display_streams()' for url = {0}".format(url))
        args.list = True
        main()

    def test2():
        logger.info("Testing with 'build_playback_report()' for url = {0}".format(url))
        args.build_playback_report = True
        main()

    def test3():
        logger.info("Testing with 'get_captions(lang=zh-TW)' for url = {0}".format(url))
        args.caption = 'zh-TW'
        main()
        logger.info("Testing with 'get_captions(lang=True)' for url = {0}".format(url))
        args.caption = True
        main()

    def test4():
        logger.info("Testing with download file from ini file")
        main()

    def test5():
        logger.info("Testing with download all files from ini file")
        args.replace = True
        args.quality = 'All'
        args.mode = 'ALL'
        main()

    def test6():
        logger.info("Testing with downloading playlist from input")
        args.replace = False
        args.skip = True
        args.playlist = playlist
        main()

    args.url = url
    test3()
    test2()
    test1()

    args.url = None
    args.file = test_file
    fp = to_unicode(args.file)
    with open(fp, mode='w+') as fh:
        fh.write(url)
    test4()
    test5()
    test6()
    with open(fp, mode='w+') as fh:
        fh.write(playlist)
    test4()
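# A hedged entry point for the suite above (hypothetical; assumes the module
# is run directly and `args` has been populated by argparse beforehand):
#
#   if __name__ == '__main__':
#       unitest()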