Example #1
# Depends on project-level objects defined elsewhere: db_helper, helpers,
# wrapper_node, FirstOrderExpression, and the input text `line`.
from itertools import permutations

import spacy


def iterate(current_node):
    print(current_node)

    if current_node is None:
        return

    if isinstance(current_node, spacy.tokens.Token):
        word_matching_cols = db_helper.get_matching_columns(current_node, helpers.to_unicode(str(current_node)))
        phrase_matching_cols = db_helper.get_matching_columns(current_node, u'')
        return wrapper_node(current_node, word_matching_cols, phrase_matching_cols)

    if isinstance(current_node, FirstOrderExpression):
        if current_node.are_all_expressions_tokens():
            phrase_permutations = list(permutations(current_node.expressions))
            phrase_matching_cols = []
            for p in phrase_permutations:
                # join the permuted tokens into a candidate phrase
                phrase = u' '.join(str(tok) for tok in p)
                # `line` (defined elsewhere) is the text being parsed
                if phrase in line:
                    print(phrase)
                    phrase_matching_cols = db_helper.get_matching_columns(phrase, u'')
                    break

            output = []
            for expression in current_node.expressions:
                word_matching_cols = db_helper.get_matching_columns(phrase, helpers.to_unicode(str(expression)))
                output.append(wrapper_node(current_node, word_matching_cols, phrase_matching_cols))
            return FirstOrderExpression(current_node.operator, output)
        else:
            output = []
            for smaller_exp in current_node.expressions:
                output.append(iterate(smaller_exp))
            return FirstOrderExpression(current_node.operator, output)
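
The recursion mirrors the expression tree: spaCy tokens become wrapper_node leaves annotated with matching columns, and each FirstOrderExpression is rebuilt from its annotated children. Below is a minimal, self-contained sketch of that tree-mapping pattern; Token, Expr and annotate() are stand-ins for spacy.tokens.Token, FirstOrderExpression and the db_helper lookups, not the project's real classes.

# Stand-in classes for illustration only
class Token:
    def __init__(self, text):
        self.text = text

class Expr:
    def __init__(self, operator, expressions):
        self.operator = operator
        self.expressions = expressions

def annotate(node):
    if node is None:
        return None
    if isinstance(node, Token):
        # leaf: pair the token with whatever a column lookup would return
        return (node.text, [])
    # interior node: rebuild the expression from its annotated children
    return Expr(node.operator, [annotate(child) for child in node.expressions])

tree = Expr("AND", [Token("cheap"), Expr("OR", [Token("red"), Token("blue")])])
print(annotate(tree).expressions)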
Example #2
    def get_representative_columns(self, table_name):
        """
        This function should return potential columns that can represent a row
        to a human
        For example, they should return columns like ID, Name, Description
        :param table_name:
        :return:
        """
        to_return = []
        if table_name not in self._columns:
            print("Table not found: " + table_name)
            return to_return

        for column in self._columns[table_name]:

            # add if it's a primary key
            if column[2]:
                to_return.append(column[0])
            elif helpers.similarity_score(helpers.to_unicode(column[0]),
                                          helpers.to_unicode("name")) > 0.5:
                to_return.append(column[0])
            elif helpers.similarity_score(helpers.to_unicode(
                    column[0]), helpers.to_unicode("description")) > 0.5:
                to_return.append(column[0])

        return to_return
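
A self-contained sketch of the same column-picking heuristic, using difflib.SequenceMatcher as a stand-in for helpers.similarity_score and column tuples shaped (name, sql_type, is_primary_key), as the loop above implies.

from difflib import SequenceMatcher

def similarity(a, b):
    # stand-in for helpers.similarity_score
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

columns = [("id", "INTEGER", True),
           ("full_name", "TEXT", False),
           ("price", "REAL", False)]

representative = [name for name, _, is_pk in columns
                  if is_pk
                  or similarity(name, "name") > 0.5
                  or similarity(name, "description") > 0.5]
print(representative)  # ['id', 'full_name']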
Example #3
    def should_store(self, column_name, distinct_count, row_count):
        # if the number of distinct values is less than the number of rows,
        # it's probably something matchable
        if 0 < distinct_count < row_count:
            return True
        # let's match names
        if helpers.similarity_score(helpers.to_unicode(column_name),
                                    helpers.to_unicode("name")) > 0.2:
            return True
        return False
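
A quick illustration of the idea behind should_store(): a column whose distinct count sits below its row count repeats values, so its contents look categorical and are worth caching for matching later.

# toy column: three distinct values across six rows
rows = ["red", "blue", "red", "green", "blue", "red"]
distinct_count, row_count = len(set(rows)), len(rows)
print(0 < distinct_count < row_count)  # True -> worth storing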
Example #4
def download_youtube_by_itag(yt, itag, target):
    target = target or os.getcwd()
    filepath = None
    # grab the URL up front so the error log below can always reference it
    url = yt.watch_url
    try:
        stream = yt.streams.get_by_itag(itag)
        title = stream.title
        resolution = stream.resolution
        video_codec = stream.video_codec
        abr = stream.abr
        audio_codec = stream.audio_codec
        fps = stream.fps
        bitrate = stream.bitrate
        filesize = stream.filesize
        filename = '{title}_{video}_{video_codec}_{audio}_{audio_codec}_{fps}_{bitrate}_{filesize}'.format(
            title=title,
            video=resolution,
            video_codec=video_codec,
            audio=abr,
            audio_codec=audio_codec,
            fps=fps,
            bitrate=bitrate,
            filesize=filesize)
        filename = to_unicode(safe_filename(filename))
        logger.debug("Filename = {filename}".format(filename=filename))

        yt.register_on_progress_callback(on_progress)
        filepath = stream.download(output_path=target, filename=filename)
    except Exception:
        logger.error(
            "Unable to download YT, url = [{url}], itag = [{itag}]".format(
                url=url, itag=itag))

    return filepath
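
A hedged usage sketch for download_youtube_by_itag(); it assumes pytube is installed and that the module-level logger and on_progress callback referenced above exist. itag 18 is commonly the 360p progressive MP4 stream.

from pytube import YouTube

yt = YouTube("https://www.youtube.com/watch?v=xwsYvBYZcx4")
path = download_youtube_by_itag(yt, itag=18, target="downloads")
print(path)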
Example #5
    def match_with_column_name(self, phrase, value, column_name):
        """
        Uses NLP similarity to estimate matching index of column and phrase/value
        :param phrase:
        :param value:
        :param column_name:
        :return: a real value indicating match index of phrase/value and column
        """
        bing_corrector = spell_corrector.bing_spell_corrector()
        corrected_name = bing_corrector.spell_correct(column_name)
        # use corrected name to find similarity
        word_similarity = helpers.similarity_score(
            helpers.to_unicode(corrected_name),
            helpers.to_unicode(value))

        phrase_similarity = helpers.similarity_score(
            helpers.to_unicode(corrected_name),
            helpers.to_unicode(phrase))

        return max(word_similarity, phrase_similarity)
Example #6
    def match_with_column_name(self, phrase, value, column_name, tags=()):
        """
        Uses NLP similarity to estimate matching index of column and phrase/value
        :param phrase:
        :param value:
        :param column_name:
        :param tags: optional tags, each also compared against the column name
        :return: a real value indicating match index of phrase/value and column
        """
        bing_corrector = spell_corrector.bing_spell_corrector()
        corrected_name = bing_corrector.spell_correct(column_name)
        # use corrected name to find similarity
        word_similarity = helpers.similarity_score(
            helpers.to_unicode(corrected_name),
            helpers.to_unicode(value))

        phrase_similarity = helpers.similarity_score(
            helpers.to_unicode(corrected_name),
            helpers.to_unicode(phrase))

        tag_similarity = 0.0
        for tag in tags:
            cur_sim = helpers.similarity_score(
                helpers.to_unicode(tag),
                helpers.to_unicode(corrected_name))
            tag_similarity = max(tag_similarity, cur_sim)

        return max(word_similarity, phrase_similarity, tag_similarity)
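
The returned score is simply the best of the word, phrase and tag similarities against the spell-corrected column name. A self-contained sketch of that max-of-similarities idea, with difflib standing in for helpers.similarity_score and the Bing spell-correction step left out.

from difflib import SequenceMatcher

def similarity(a, b):
    # stand-in for helpers.similarity_score
    return SequenceMatcher(None, a.lower(), b.lower()).ratio()

def match_score(phrase, value, column_name, tags=()):
    scores = [similarity(column_name, value), similarity(column_name, phrase)]
    scores.extend(similarity(tag, column_name) for tag in tags)
    return max(scores)

print(match_score("movies directed by Nolan", "Nolan", "director", tags=("person",)))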
Example #7
    def get_matching_table(self, phrase):
        """
        Takes a phrase or word and returns the table that
        potentially contains necessary info
        :param phrase:
        :return: (table_name, similarity_score) of the best-matching table
        """
        max_score = 0.0
        max_score_table = ""
        corrector = spell_corrector.bing_spell_corrector()
        for table in self._columns.keys():
            corrected_table_name = corrector.spell_correct(table)
            cur_score = helpers.similarity_score(
                helpers.to_unicode(corrected_table_name),
                helpers.to_unicode(phrase))
            if cur_score > max_score:
                max_score = cur_score
                max_score_table = table

        return (max_score_table, max_score)
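
get_matching_table() is an argmax over the spell-corrected table names. A self-contained sketch of that lookup, again with difflib in place of helpers.similarity_score and without the spell corrector.

from difflib import SequenceMatcher

def best_table(phrase, tables):
    scored = [(SequenceMatcher(None, t.lower(), phrase.lower()).ratio(), t)
              for t in tables]
    score, table = max(scored)
    return table, score

print(best_table("employes", ["employees", "departments", "salaries"]))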
Example #8
def get_captions(yt, lang):
    if lang:
        filename = to_unicode(safe_filename(yt.title))
        codes = query_captions_codes(yt)
        for code in codes:
            if lang is True or code.lower() == lang.lower():
                # logger.info('downloading captions for language code = [%s]' % code)
                try:
                    filepath = yt.captions[code].download(
                        title=filename, srt=True, output_path=args.target)
                    logger.info(
                        'captions language code = [{code}] downloaded [{filepath}]'
                        .format(code=code, filepath=filepath))
                except Exception:
                    logger.error(
                        'unable to download caption code = [{code}]'.format(
                            code=code))

    return True
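
A hedged usage sketch for get_captions(); it relies on the module-level args.target and the query_captions_codes() helper referenced above, and mirrors the calls made in the unitest() smoke tests further down.

from pytube import YouTube

yt = YouTube("https://www.youtube.com/watch?v=xwsYvBYZcx4")
get_captions(yt, "zh-TW")  # one specific caption track, saved as SRT
get_captions(yt, True)     # or every available caption track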
Example #9
def get_correct_yt(url, retry):
    yt = None
    # Get Youtube Object with correct filename

    if args.proxy:
        logger.info('via proxy = [%s]' % args.proxy)
        proxy_params = {urlparse.urlparse(args.url).scheme: args.proxy}
    else:
        proxy_params = None

    for i in range(1, retry + 1):  # attempt up to `retry` times
        logger.debug(f"{i} retry in get_correct_yt()")
        try:
            filename = None
            # while filename in [None, "YouTube"]:
            yt = YouTube(url,
                         on_progress_callback=on_progress,
                         proxies=proxy_params)
            filename = to_unicode(safe_filename(yt.title))
            logger.debug("URL      = {url}".format(url=url))
            logger.debug("Filename = {filename}".format(filename=filename))
            if filename != 'YouTube':
                break

        except Exception as ex:
            logger.error('Unable to get FileName from = [%s]' % url)
            template = "An exception of type {0} occurred. Arguments:\n{1!r}"
            message = template.format(type(ex).__name__, ex.args)
            logger.error('Due to the reason = [%s]' % message)
    else:
        logger.error(
            "Unable to get a correct YouTube object after {retry} attempts, skipping this download"
            .format(retry=retry))
        yt = None

    return yt
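
The loop uses Python's for/else: the else block runs only when the loop finishes without hitting break, i.e. no attempt produced a usable title. A minimal, self-contained sketch of that retry pattern.

def fetch_with_retry(fetch, retries=3):
    result = None
    for attempt in range(1, retries + 1):
        try:
            result = fetch()
            break                       # success: the else block is skipped
        except Exception as ex:
            print("attempt %d failed: %s" % (attempt, ex))
    else:
        print("all retries exhausted")  # reached only if we never hit break
    return result

print(fetch_with_retry(lambda: 42))     # 42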
Example #10
    def get_matching_columns(self, phrase, value, tags=(), table_name=""):
        """
        Goes through all the tables,
        matches the value against all columns,
        and returns the potential matches.
        :param phrase: phrase for which we are figuring out the query
        :param value: value to be looked up in the cached data
        :param tags: optional tags passed on to match_with_column_name()
        :param table_name: if non-empty, restrict the search to this table
        :return: list of (table_name, column_name, match_score) tuples
        """
        to_return = []

        # it's possible that we get numbers as strings
        if helpers.is_number(value):
            value = float(value)

        # go through all the tables
        for table in self._columns.keys():

            # if a table name is provided, search only in that table
            if table_name.strip() != "" and table != table_name:
                continue

            # go through each column in the table
            found_some_column = False
            for column in self._columns[table]:
                # check if types match
                if isinstance(value, str) and column[1] == "TEXT":
                    # check if it's a perfect match
                    if column[0] in self._distinct_values[table]:
                        match_result = self.match_with_values(
                            self._distinct_values[table][column[0]], value)

                        # sometimes the value is only a substring of a stored name;
                        # in such cases, bump the match score up to 0.5
                        if helpers.similarity_score(helpers.to_unicode(column[0]), helpers.to_unicode("name")) > 0.5 \
                                and any(value in string for string in self._distinct_values[table][column[0]]):
                            if match_result[0] < 0.5:
                                # keep the tuple shape returned by match_with_values()
                                match_result = (0.5,) + tuple(match_result[1:])

                        if match_result[0] > 0:
                            to_return.append((table, column[0], match_result[0]))
                            found_some_column = True

                # TODO::Improve matching logic by considering a small error range
                elif isinstance(value, (int, float)) and column[1] in ("INTEGER", "REAL"):
                    if column[0] in self._distinct_values[table]:
                        match_result = self.match_for_numbers(
                            self._distinct_values[table][column[0]], value)
                        if match_result[0] > 0:
                            to_return.append(
                                (table, column[0], match_result[0]))
                            found_some_column = True

            if not found_some_column:
                # find the column which matches the most
                match_score = 0.0
                match_col = ""
                for column in self._columns[table]:
                    col_score = self.match_with_column_name(
                        phrase, value, column[0], tags)
                    # TODO::Refine the filtering logic
                    if col_score > 0 and col_score > match_score:
                        match_score = col_score
                        match_col = column[0]

                to_return.append((table, match_col, match_score))

        return to_return
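
A hedged usage sketch for get_matching_columns(); db_helper is the helper object Example #1 already calls into, and the phrase/value below are made up for illustration.

matches = db_helper.get_matching_columns(u"employees named smith", u"smith",
                                         tags=("person",), table_name="")
for table, column, score in matches:
    print("{0}.{1} scored {2:.2f}".format(table, column, score))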
Example #11
def _download(yt,
              itag=18,
              out=None,
              replace=True,
              skip=True,
              proxies=None,
              retry=10):
    """Start downloading a YouTube video.
    :param str url:
        A valid YouTube watch URL.
    :param str itag:
        YouTube format identifier code.
    """

    thumbnail_url = yt.thumbnail_url
    stream = yt.streams.get_by_itag(itag)
    filesize = stream.filesize
    filename = to_unicode(stream.default_filename)
    logger.info('Youtube filename = [%s]' % filename)
    logger.info('Youtube filesize = [%s]' % filesize)
    logger.info(
        '\n{title} |\n{description} |\n\n{views} views | {rating} rating | {length} secs'
        .format(title=yt.title,
                description=yt.description,
                views=yt.views,
                rating=yt.rating,
                length=yt.length,
                thumbnail_url=yt.thumbnail_url))
    print(
        '\n{title} |\n{description} |\n\n{views} views | {rating} rating | {length} secs'
        .format(title=yt.title,
                description=yt.description,
                views=yt.views,
                rating=yt.rating,
                length=yt.length,
                thumbnail_url=yt.thumbnail_url))
    print('\n{fn} | {fs} bytes'.format(fn=filename, fs=filesize))

    # detect if out is a directory
    outdir = None
    if out:
        if os.path.isdir(out):
            outdir = out
            out = None
        else:
            filename = out
    if outdir:
        if not os.path.exists(outdir):
            os.makedirs(outdir)
        filename = os.path.join(outdir, filename)
    filename = to_unicode(filename)

    # check file existence and decide whether to skip the download
    # add a numeric ' (x)' suffix if the filename already exists
    if os.path.exists(filename):
        fsize = os.path.getsize(filename)
        logger.info(
            'filename = [%s] filesize = [%s] already exists in system' %
            (filename, fsize))
        if fsize == filesize:
            if skip:
                logger.info(
                    'filename = [%s] filesize = [%s] already exists in system and skip download again'
                    % (filename, fsize))
                return filename, thumbnail_url
            elif not replace:
                filename = filename_fix_existing(filename)
        else:
            name, ext = os.path.splitext(filename)
            filename = filename_fix_existing(filename)
            # TODO: this is a workaround, remove it later
            # give the existing '_(1)' copy a second chance to match
            if skip:
                try:
                    oldfilename = u'{}{}{}'.format(name, '_(1)', ext)
                    fsize = os.path.getsize(oldfilename)
                    logger.debug(
                        'Trying to check filename = [%s] and filesize = [%s] if exists and match'
                        % (oldfilename, fsize))
                    if fsize == filesize:
                        logger.info(
                            'filename = [%s] filesize = [%s] already exists in system and skip download again'
                            % (oldfilename, fsize))
                        return oldfilename, thumbnail_url
                except OSError:
                    pass

    name, ext = os.path.splitext(filename)
    logger.info('target local filename = [%s]' % filename)
    logger.info('target local filesize = [%s]' % filesize)

    # create tmp file
    (fd, tmpfile) = tempfile.mkstemp(suffix=ext,
                                     prefix="",
                                     dir=outdir,
                                     text=False)
    tmpfile = to_unicode(tmpfile)
    os.close(fd)
    os.unlink(tmpfile)
    logger.info('target local tmpfile  = [%s]' % tmpfile)
    tmppath, tmpbase = ntpath.split(tmpfile)
    tmpname, tmpext = os.path.splitext(tmpbase)
    logger.debug(f'target local tmpfile name = [{tmpname}], ext = [{tmpext}]')

    try:
        stream.download(output_path=tmppath,
                        filename=tmpname,
                        filename_prefix=None,
                        skip_existing=skip)
        sys.stdout.write('\n')
        shutil.move(tmpfile, filename)
        logger.info("File = [{0}] Saved".format(filename))
    except KeyboardInterrupt:
        sys.exit(1)

    sys.stdout.write('\n')
    return (filename, thumbnail_url)
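
_download() writes into a temporary name from tempfile.mkstemp() inside the output directory and only shutil.move()s the result onto the final filename once the transfer finishes, so an interrupted download never sits under the real name. A self-contained sketch of that tmp-file-then-rename pattern.

import os
import shutil
import tempfile

def write_atomically(data, filename):
    outdir = os.path.dirname(filename) or "."
    fd, tmpfile = tempfile.mkstemp(suffix=".part", dir=outdir)
    try:
        with os.fdopen(fd, "wb") as fh:
            fh.write(data)              # stand-in for the streamed download
        shutil.move(tmpfile, filename)  # effectively a rename on the same filesystem
    except BaseException:
        if os.path.exists(tmpfile):
            os.unlink(tmpfile)
        raise

write_atomically(b"hello", "example.bin")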
Example #12
def unitest():
    base = os.path.basename(__file__)
    filename, _ = os.path.splitext(base)
    test_file = '{name}.{ext}_unittest'.format(name=filename, ext='ini')

    # url = 'https://www.youtube.com/watch?v=F1fqet9V494'
    url = 'https://www.youtube.com/watch?v=xwsYvBYZcx4'
    playlist = 'https://www.youtube.com/playlist?list=PLteWjpkbvj7rUU5SFt2BlNVCQqkjulPZR'

    def test1():
        logger.info(
            "Testing with 'display_streams()' for url =  {0}".format(url))
        args.list = True
        main()

    def test2():
        logger.info(
            "Testing with 'build_playback_report()' for url = {0}".format(url))
        args.build_playback_report = True
        main()

    def test3():
        logger.info(
            "Testing with 'get_captions(lang=zh-TW)' for url = {0}".format(
                url))
        args.caption = 'zh-TW'
        main()
        logger.info(
            "Testing with 'get_captions(lang=True)' for url = {0}".format(url))
        args.caption = True
        main()

    def test4():
        logger.info("Testing with download file from ini file")
        main()

    def test5():
        logger.info("Testing with download all files from ini file")
        args.replace = True
        args.quality = 'All'
        args.mode = 'ALL'
        main()

    def test6():
        logger.info("Testing with downloading playlist from input")
        args.replace = False
        args.skip = True
        args.playlist = playlist
        main()

    args.url = url
    test3()
    test2()
    test1()
    args.url = None
    args.file = test_file
    fp = to_unicode(args.file)
    with open(fp, mode='w+') as fh:
        fh.write(url)
    test4()
    test5()

    test6()
    with open(fp, mode='w+') as fh:
        fh.write(playlist)
    test4()