Example #1
def load_stats_tsv(statspath):
    """
    :param statspath: Path to the tsv file that is generated by etl_transcribe_stats.py
    :return: List of keys, ordered as they appear in the tsv file, and a dictionary containing the tsv data.
    """
    key_list = []
    stats_dict = {}
    header = None
    with open(statspath, 'r') as tsv_fp:
        tsv = csv.reader(tsv_fp, delimiter='\t')
        for row in tsv:
            if not header:
                header = row
                log_kv("header", header)
            else:
                key = row[0]
                key_list.append(key)
                if key in stats_dict:
                    logging.error("Key already encountered previously: %s",
                                  key)
                    logging.error("Previous entry: %s   New entry: %s",
                                  stats_dict[key], row[1:])
                else:
                    stats_dict[key] = {}
                    for ii in range(1, len(header)):
                        stats_dict[key][header[ii]] = row[ii]

    return key_list, stats_dict
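A minimal usage sketch, assuming a stats.tsv produced by etl_transcribe_stats.py whose first column is the key; the file name and column names here are hypothetical:

# Hypothetical stats.tsv contents (tab-separated):
#   key         word_count  char_count
#   show1/ep01  120         640
keys, stats = load_stats_tsv("stats.tsv")
print keys[0]                              # "show1/ep01"
print stats["show1/ep01"]["word_count"]    # "120" (values stay strings)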
Example #2
def load_txt(path):
    """Returns the file contents as a string, or "" if the path does not exist."""
    result = ""
    log_kv("Loading", path)
    if os.path.exists(path):
        with open(path) as fp:
            result = fp.read()
    else:
        logging.error("Not exist: %s", path)
    return result
def fetch_rows_for_column(con, cur, tablename, column, value):
    value = '"%s"' % value if type(value) is str else value
    query = 'SELECT * FROM {tn} WHERE {cc} = {val}'.format(tn=tablename,
                                                           cc=column,
                                                           val=value)
    log_kv("query", query)
    cur.execute(query)
    con.commit()
    return cur.fetchall()
def insert_from_tsv(con, tablename, path):
    with open(path, 'r') as fp:
        log_kv("Loading %s from" % tablename, path)
        reader = csv.reader(fp, delimiter='\t')
        header = next(reader)
        # Build a parameterized INSERT; interpolating str(tuple(...)) would
        # quote the column names as string literals and break the statement.
        columns = ",".join(header)
        placeholders = ",".join(["?"] * len(header))
        query = "INSERT INTO %s (%s) VALUES (%s)" % (tablename, columns, placeholders)
        for rec in reader:
            con.execute(query, rec)
    con.commit()
def fetch_rows_columns_for_column(con, cur, columns, tablename, column, value):
    value = '"%s"' % value if type(value) is str else value
    columns_string = ",".join(columns)
    query = 'SELECT {xx} FROM {tn} WHERE {cc} = {val}'.format(
        xx=columns_string, tn=tablename, cc=column, val=value)
    log_kv("query", query)
    cur.execute(query)
    con.commit()
    return cur.fetchall()
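A minimal sketch exercising the helpers above against an in-memory SQLite database; the table and column names are hypothetical, and log_kv is assumed available from the surrounding module:

import sqlite3

con = sqlite3.connect(":memory:")
cur = con.cursor()
cur.execute("CREATE TABLE transcripts (id TEXT, word_count INTEGER)")
cur.execute("INSERT INTO transcripts VALUES ('show1/ep01', 120)")
con.commit()

rows = fetch_rows_for_column(con, cur, "transcripts", "id", "show1/ep01")
counts = fetch_rows_columns_for_column(con, cur, ["word_count"],
                                       "transcripts", "id", "show1/ep01")
# insert_from_tsv(con, "transcripts", "stats.tsv") would bulk-load a
# tab-separated file whose header row names the columns.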
Example #6
def load_json(path):
    result = {}
    log_kv("Loading", path)
    if os.path.exists(path):
        with open(path) as file1:
            result = json.load(file1)
    else:
        logging.error("Not exist: %s", path)
    return result
Example #7
def calc_transcript_counts(ibm_stats_path, google_stats_path):
    """
    :param ibm_stats_path: path to file containing the index of IBM transcripts
    :param google_stats_path: path to file containing the index of Google transcripts
    :return: None.  Prints:
      the number of IBM transcripts,
      the number of Google transcripts,
      the portion of IBM transcripts within processed folders that have a Google transcript,
      the portion of Google transcripts within processed folders that have an IBM transcript.
    """
    ibm_stats = load_json(ibm_stats_path)
    google_stats = load_json(google_stats_path)
    log_kv("Number of IBM Transcripts", len(ibm_stats))
    log_kv("Numberof Google Transcripts", len(google_stats))
    i_set = set([os.path.dirname(x).replace(IBM_PATH,'').replace(BASE_PATH,'') for x in ibm_stats])
    g_set = set([os.path.dirname(x).replace(GOOGLE_PATH,'').replace(BASE_PATH,'') for x in google_stats])

    print
    i_top_level_folders = sorted(set([xx.split("/")[0] for xx in i_set]))
    print "IBM folders in %s :\n%s " % (ibm_stats_path, i_top_level_folders)
    print
    g_top_level_folders = sorted(set([xx.split("/")[0] for xx in g_set]))
    print "Google folders in %s :\n%s" % (google_stats_path, g_top_level_folders)

    i_count = 0
    i_in_g = 0
    for xx in i_set:
        if xx.split("/")[0] not in g_top_level_folders:
            continue
        i_count += 1
        if xx in g_set:
            i_in_g += 1
    i_portion = float(i_in_g) / i_count if i_count else 0.0

    g_count = 0
    g_in_i = 0
    for xx in g_set:
        if xx.split("/")[0] not in i_top_level_folders:
            continue
        g_count += 1
        if xx in i_set:
            g_in_i += 1
    g_portion = float(g_in_i) / g_count if g_count else 0.0

    print
    print
    print "IBM also in Google: %d/%d  (%.2f) " % (i_in_g, i_count, i_portion)
    print "Google also in IBM: %d/%d  (%.2f) " % (g_in_i, g_count, g_portion)
Example #8
def etl_transcripts(log_stats_path, word_counts_path, api, ext=".out"):
    """
    Merges the two datasets using a canonicalized key.
    If the two share a field, the value from <word_counts_path> overwrites the value from <log_stats_path>.
    :param log_stats_path: gives processing time
    :param word_counts_path: gives word counts
    :param api: "ibm" or "google"
    :param ext: extension stripped from basename
    :return: dict
    """
    result = {}
    loaded = {}
    if os.path.isfile(log_stats_path):
        with open(log_stats_path) as fp:
            log_kv("Loading(%s log stats)" % api, log_stats_path)
            loaded = json.load(fp)
    if loaded and type(loaded) is dict:
        log_kv("Count  (%s log stats)" % api, len(loaded))
    else:
        logging.error("Expected log stats data.")

    for key, row in loaded.items():
        id = key
        if key.startswith(API_META[api]["base"] + "/"):
            id = key.replace(API_META[api]["base"] + "/", "")
        result[id] = row

    counts = {}
    if os.path.isfile(word_counts_path):
        with open(word_counts_path) as fp:
            log_kv("Loading(%s word counts)" % api, word_counts_path)
            counts = json.load(fp)
    if counts and type(counts) is dict:
        log_kv("Count  (%s word counts)" % api, len(counts))
    else:
        logging.error("Expected word counts data.")

    for key, row in counts.items():
        id = key
        if key.startswith(API_META[api]["base"] + "/"):
            id = key.replace(API_META[api]["base"] + "/", "")
        for suffix in API_META[api]["transcripts"]:
            if id.endswith("/" + suffix):
                id = re.sub(re.escape("/" + suffix) + r"$", "", id)
        if id.endswith(ext):
            id = re.sub(re.escape(ext) + r"$", "", id)
        if id in result:
            for x, y in row.items():
                result[id][x] = y
        else:
            result[id] = row

    return result
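A worked illustration of the key canonicalization in the second loop; the API_META shape shown is an assumption inferred from how it is used here:

import re

API_META = {"ibm": {"base": "/data/ibm_stt",          # assumed shape
                    "transcripts": ["hypotheses.txt"]}}
key = "/data/ibm_stt/show1/ep01.out/hypotheses.txt"
id = key.replace(API_META["ibm"]["base"] + "/", "")       # "show1/ep01.out/hypotheses.txt"
id = re.sub(re.escape("/hypotheses.txt") + r"$", "", id)  # "show1/ep01.out"
id = re.sub(re.escape(".out") + r"$", "", id)             # "show1/ep01"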
Example #9
def calc_bleu_scores(google_results, ibm_results, verbose=False):
    """
    :param google_results: basic stats about google transcripts
    :param ibm_results:  basic stats about ibm transcripts
    :param verbose: prints warnings when bleu is averaged with jaccard,
        which happens when the hypothesis word count falls below a threshold.
    :return: first two arguments, supplemented with bleu and ratcliff
    """
    logging.info("===   Processing Google transcripts   ===")
    time2 = time.time()
    google_results = do_comparisons(google_results, verbose)
    logging.info("(%.2f min)" % ((time.time() - time2) / 60.0))

    logging.info("===   Processing IBM transcripts   ===")
    time3 = time.time()
    ibm_results = do_comparisons(ibm_results, verbose)
    logging.info("(%.2f min)" % ((time.time() - time3) / 60.0))

    ibm_bleu_count = len([1 for x in ibm_results if "bleu" in ibm_results[x]])
    ibm_avg_bleu = sum([ibm_results[x]["bleu"] for x in ibm_results if "bleu" in ibm_results[x]]) \
                   / float(ibm_bleu_count)
    google_bleu_count = len(
        [1 for x in google_results if "bleu" in google_results[x]])
    google_avg_bleu = sum([google_results[x]["bleu"] for x in google_results if "bleu" in google_results[x]]) \
                      / float(google_bleu_count)
    print
    log_kv("ibm bleu count", ibm_bleu_count)
    log_kv("google bleu count", google_bleu_count)
    log_kv("ibm avg bleu", "%.5f" % ibm_avg_bleu)
    log_kv("google avg bleu", "%.5f" % google_avg_bleu)

    ibm_ratcliff_count = len(
        [1 for x in ibm_results if "ratcliff" in ibm_results[x]])
    ibm_avg_ratcliff = sum([ibm_results[x]["ratcliff"] for x in ibm_results if "ratcliff" in ibm_results[x]]) \
                       / float(ibm_ratcliff_count)
    google_ratcliff_count = len(
        [1 for x in google_results if "ratcliff" in google_results[x]])
    google_avg_ratcliff = sum([google_results[x]["ratcliff"] for x in google_results if "ratcliff" in google_results[x]]) \
                          / float(google_ratcliff_count)
    print
    log_kv("ibm ratcliff count", ibm_ratcliff_count)
    log_kv("google ratcliff count", google_ratcliff_count)
    log_kv("ibm avg ratcliff", "%.5f" % ibm_avg_ratcliff)
    log_kv("google avg ratcliff", "%.5f" % google_avg_ratcliff)

    return google_results, ibm_results
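The four averages above divide by a count that can be zero when no entry carries the metric; a small guarded helper, as a sketch, would avoid the ZeroDivisionError:

def avg_metric(results, metric):
    # Average of results[x][metric] over entries carrying the metric;
    # returns NaN instead of raising when none do.
    vals = [results[x][metric] for x in results if metric in results[x]]
    return sum(vals) / float(len(vals)) if vals else float("nan")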
Example #10
def do_comparisons(stats, verbose=False):
    """
    Calculates bleu and ratcliff similarity between reference and hypothesis transcripts.
    :param stats: dict containing pointers to reference and hypothesis transcripts.
    :param verbose: prints warnings for short references and hypotheses.
    :return: first argument, supplemented with bleu and ratcliff stats.
    """

    count = 0
    for key in stats:
        if "reference_path" in stats[key] and "transcript_path" in stats[key]:
            if not os.path.exists(stats[key]["reference_path"]):
                raise ValueError("Expected path to exist: %s",
                                 stats[key]["reference_path"])
            if not os.path.exists(stats[key]["transcript_path"]):
                raise ValueError("Expected path to exist: %s",
                                 stats[key]["transcript_path"])
            with open(stats[key]["reference_path"], "r") as fp1:
                reference_string = fp1.read()
            with open(stats[key]["transcript_path"], "r") as fp2:
                hypothesis_string = fp2.read()
            stats[key]["ratcliff"] = ratcliff_obershelp_similarity(
                reference_string, hypothesis_string)

            reference_tokens = tokenize(reference_string)
            hypothesis_tokens = tokenize(hypothesis_string)
            if len(reference_tokens) < 7:
                bleu_score = nltk.translate.bleu_score.sentence_bleu(
                    [reference_tokens], hypothesis_tokens, weights=(0.5, 0.5))
                if verbose:
                    logging.warn(
                        "Short reference: %2d words. Hypothesis:%5d words.  Bleu: %.5f",
                        len(reference_tokens), len(hypothesis_tokens),
                        bleu_score)
                    if len(hypothesis_tokens) > 2 * len(reference_tokens):
                        logging.warn("Reference path : %s",
                                     stats[key]["reference_path"])
                        logging.warn("Hypothesis path: %s",
                                     stats[key]["transcript_path"])
            elif len(hypothesis_tokens) < 15:
                bleu_score = nltk.translate.bleu_score.sentence_bleu(
                    [reference_tokens], hypothesis_tokens, weights=(0.5, 0.5))
                jaccard = jaccard_score(reference_tokens, hypothesis_tokens)
                size_h = len(set(hypothesis_tokens))
                size_r = len(set(reference_tokens))
                size_b = size_h + size_r
                avg_bleu_score = (bleu_score * size_h /
                                  size_b) + (jaccard * size_r / size_b)
                if verbose:
                    logging.warn(
                        "Short hypothesis. Using avg(bleu,jaccard).  "
                        "Reference:%5d words/%5d set. Hypothesis:%5d words/%5d set. "
                        "Bleu: %.5f  Jaccard: %.5f  Avg: %.5f",
                        len(reference_tokens), size_r, len(hypothesis_tokens),
                        size_h, bleu_score, jaccard, avg_bleu_score)
                if avg_bleu_score > max(bleu_score, jaccard):
                    print
                    logging.error("Avg bleu (%.5f) > max(bleu, jaccard).",
                                  avg_bleu_score)
                    logging.warn(
                        "avg_bleu_score = (bleu_score * size_h/size_b + (jaccard * size_r/size_b))"
                    )
                    logging.warn(
                        "       %.5f = ( %.5f * %d/%d ) + ( %.5f * %d/%d )",
                        avg_bleu_score, bleu_score, size_h, size_b, jaccard,
                        size_r, size_b)
                    print
                bleu_score = avg_bleu_score
            else:
                bleu_score = nltk.translate.bleu_score.sentence_bleu(
                    [reference_tokens], hypothesis_tokens)
            stats[key]['bleu'] = bleu_score
            stats[key]['word_count'] = len(reference_tokens)

            count += 1
            if count % 50 == 0:
                log_kv("completed", count)

    log_kv("done", count)
    return stats
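A worked example of the bleu/jaccard blend for a short hypothesis, with hypothetical numbers. Because the blend is a convex combination, it can never exceed max(bleu, jaccard), so the error branch above is a pure sanity check:

bleu, jaccard = 0.40, 0.10
size_h, size_r = 5, 20              # distinct tokens in hypothesis / reference
size_b = size_h + size_r            # 25
avg = bleu * size_h / float(size_b) + jaccard * size_r / float(size_b)
# avg == 0.40 * 0.2 + 0.10 * 0.8 == 0.16, between jaccard and bleu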
Example #11
                        '-s',
                        action='store',
                        default=STATS_FILEPATH,
                        help='TSV file containing transcription stats ')
    parser.add_argument('--api',
                        '-a',
                        action='store',
                        default="ibm",
                        help='API. Default=ibm')
    parser.add_argument('--load',
                        '-L',
                        action='store_true',
                        help='Load previously stored results.')
    args = parser.parse_args()

    log_kv("Running", __file__)
    log_kv("From", os.path.dirname(os.path.realpath(__file__)))

    references_path = os.path.realpath(os.path.expanduser(args.reference))
    log_kv("references folder", references_path)

    google_path = os.path.realpath(os.path.expanduser(args.google))
    log_kv("google path", google_path)

    ibm_path = os.path.realpath(os.path.expanduser(args.ibm))
    log_kv("ibm path", ibm_path)

    outpath = os.path.realpath(os.path.expanduser(args.outfolder))
    log_kv("outpath", outpath)

    #   Loads transcript statistics file
    parser = argparse.ArgumentParser(description='Sqlite Helper')
    parser.add_argument('--infolder',
                        '-i',
                        action='store',
                        default='.',
                        help='folder containing previous ETL files')
    parser.add_argument('--outfolder',
                        '-o',
                        action='store',
                        default='./output',
                        help='output directory')

    args = parser.parse_args()

    log_kv("Running", __file__)
    log_kv("From", os.path.dirname(os.path.realpath(__file__)))
    print

    inpath = os.path.realpath(args.infolder if args.infolder else os.getcwd())
    log_kv("inpath", inpath)
    outpath = os.path.realpath(
        args.outfolder if args.outfolder else u'./output')
    log_kv("outpath", outpath)
    make_dir(outpath)

    log_kv("")
    log_kv("Audio stats file", AUDIO_STATS_TSV)
    log_kv("Transcript stats", TRANSCRIBED_STATS_TSV)
    log_kv("")
def process_transcript_stats(inpath, basepath, outpath, args):

    file_stats_path = os.path.join(outpath, IBM_TRANSCRIPT_STATS_FILENAME)
    if args.google:
        file_stats_path = os.path.join(outpath,
                                       GOOGLE_TRANSCRIPT_STATS_FILENAME)

    log_kv("file stats", file_stats_path)
    previous_results = {}
    result_dict = {}
    if os.path.exists(file_stats_path):
        log_kv("Loading file", file_stats_path)
        with open(file_stats_path) as file1:
            previous_results = json.load(file1)
            log_kv("Count(previous)", len(previous_results))
            for xx in previous_results:
                result_dict[xx] = previous_results[xx]

    print
    print 105 * "="
    print
    num_processed = 0
    num_skipped = 0
    num_done = 0

    # Gets list of transcript filepaths
    file_list = walk_files(folder=inpath + "/", basepath=basepath)
    uniques = set()
    for x, y in file_list:
        uniques = get_uniques(x, uniques)
    log_kv("Number Transcriptions", len(file_list))
    log_kv("Unique Transcriptions", len(uniques))
    print
    print 105 * "="
    print
    num_dictated = 0
    sum_word_count = 0
    sum_char_count = 0
    for uu in uniques:
        id = uu.replace(basepath, '').lstrip("/")
        if args.max and num_processed >= args.max:
            log_kv("Max met", args.max)
            break
        num_dictated += 1 if uu.endswith(".dictated") else 0
        num_processed += 1
        if id in previous_results:
            num_skipped += 1
            logging.debug("Skipping %s", uu)
            if result_dict[id] != previous_results[id]:
                logging.error("Mismatch")
                raise Exception("Expected %s , but encountered %s" %
                                (result_dict[id], previous_results[id]))
            sum_word_count += result_dict[id]["word_count"]
            sum_char_count += result_dict[id]["char_count"]
        else:
            num_done += 1
            logging.debug("Doing %s", uu)
            word_count, char_count = calc_transcription_counts(
                uu, basepath, args)
            sum_word_count += word_count
            sum_char_count += char_count
            result_dict[id] = {
                "word_count": word_count,
                "char_count": char_count
            }

    log_kv("Previous", len(previous_results))
    log_kv("Processed", num_processed)
    log_kv("Dictated", num_dictated)
    log_kv("Done", num_done)
    log_kv("Skipped", num_skipped)
    log_kv("Result count", len(result_dict))

    print
    running_avg_word_count = (float(sum_word_count) / len(result_dict))
    running_avg_char_count = (float(sum_char_count) / len(result_dict))
    log_kv("Avg Word count", "%.1f" % running_avg_word_count)
    log_kv("Avg Char count", "%.1f" % running_avg_char_count)

    log_kv("Writing", file_stats_path)
    with open(file_stats_path, 'w') as outfile:
        json.dump(result_dict, outfile, indent=2)
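The stats file written above is a JSON object keyed by the base-relative id; a hypothetical two-entry example of its shape:

{
  "show1/ep01": {"word_count": 120, "char_count": 640},
  "show1/ep02": {"word_count": 95, "char_count": 512}
}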
Example #14

if __name__ == '__main__':

    start_time = time.time()
    parser = argparse.ArgumentParser(description='Compare Google STT vs IBM STT')
    parser.add_argument('--folder','-f', action='store', default='/tmp/transcription/text2stats', help='text2stats.py output directory')
    parser.add_argument('--verbose','-v', action='store_true', help='Spew logs profusely.')
    args = parser.parse_args()

    if args.verbose:
        print "Relies on the following intermediate result files under %s :" % args.folder
        print ", ".join([IBM_TRANSCRIPT_STATS_FILENAME, GOOGLE_TRANSCRIPT_STATS_FILENAME, IBM_PROCESSED_STATS_FILENAME,
                         GOOGLE_PROCESSED_STATS_FILENAME, RESULT_FILENAME])

    log_kv("Running", __file__)
    log_kv("From", os.path.dirname(os.path.realpath(__file__)))
    folder = args.folder
    log_kv("--folder", folder)
    path = os.path.realpath(folder)

    if not os.path.isdir(path):
        raise IOError("Path not found: %s" % path)


    ibm_stats_path = os.path.join(path, IBM_TRANSCRIPT_STATS_FILENAME)
    google_stats_path = os.path.join(path, GOOGLE_TRANSCRIPT_STATS_FILENAME)
    ibm_pstats_path = os.path.join(path, IBM_PROCESSED_STATS_FILENAME)
    google_pstats_path = os.path.join(path, GOOGLE_PROCESSED_STATS_FILENAME)
    audio_stats_path = os.path.join(path, RESULT_FILENAME)
Example #15
    parser = argparse.ArgumentParser(description='Analyze transcribe rate')
    parser.add_argument('--infolder',
                        '-i',
                        action='store',
                        default='.',
                        help='folder containing previous ETL files')
    parser.add_argument('--outfolder',
                        '-o',
                        action='store',
                        default='./output',
                        help='output directory')

    args = parser.parse_args()

    log_kv("Running", __file__)
    log_kv("From", os.path.dirname(os.path.realpath(__file__)))
    print

    inpath = os.path.realpath(args.infolder if args.infolder else os.getcwd())
    log_kv("inpath", inpath)
    outpath = os.path.realpath(
        args.outfolder if args.outfolder else u'./output')
    log_kv("outpath", outpath)
    make_dir(outpath)

    log_kv("")
    log_kv("IBM log stats", IBM_LOG_STATS_FILEPATH)
    log_kv("IBM transcript stats", IBM_WORD_COUNT_FILEPATH)
    log_kv("Google log stats", GOOGLE_LOG_STATS_FILEPATH)
    log_kv("Google transcript stats", GOOGLE_WORD_COUNT_FILEPATH)
def run_query(con, cur, query):
    log_kv("query", query)
    cur.execute(query)
    con.commit()
    return cur.fetchall()
        '-k',
        action='store_true',
        help=
        'Do not overwrite previously converted audio files, or results folder already containing hypotheses.txt.'
    )
    parser.add_argument(
        '--google',
        '-g',
        action='store_true',
        help='Analyze Google transcripts instead of IBM Watson.')
    args = parser.parse_args()

    if not args.basefolder:
        args.basefolder = args.infolder

    log_kv("Running", __file__)
    log_kv("From", os.path.dirname(os.path.realpath(__file__)))
    print

    inpath = os.path.realpath(args.infolder if args.infolder else os.getcwd())
    log_kv("inpath", inpath)

    basepath = os.path.realpath(args.basefolder if args.basefolder else u'/')
    log_kv("basepath", basepath)

    if inpath.startswith(basepath):
        if inpath == basepath:
            logging.warn("inpath == basepath.  Are you sure?  [Y/n]")
            choice = raw_input().lower()
            if choice not in set(['yes', 'y']):
                logging.info("Quitting")
Example #18
def calc_transcript_words_per_minute(ibm_stats_path, google_stats_path, ibm_pstats_path, google_pstats_path, audio_stats_path):

    ibm_stats = load_json(ibm_stats_path)
    google_stats = load_json(google_stats_path)
    ibm_pstats = load_json(ibm_pstats_path)
    google_pstats = load_json(google_pstats_path)
    audio_stats = load_json(audio_stats_path)

    count_processed_ibm = len(ibm_pstats)
    count_processed_google = len(google_pstats)
    count_transcribed_ibm = len(ibm_stats)
    count_transcribed_google = len(google_stats)
    print
    if count_processed_ibm < count_transcribed_ibm:
        logging.error("count_processed_ibm < count_transcribed_ibm")
    log_kv("IBM Transcribed/Processed", "%d/%d" % (count_transcribed_ibm, count_processed_ibm))
    if count_processed_google < count_transcribed_google:
        logging.error("count_processed_google < count_transcribed_google")
    log_kv("Google Transcribed/Processed", "%d/%d" % (count_transcribed_google, count_processed_google))
    print
    log_kv("Num audio files", len(audio_stats))
    print
    print "==============================================================="
    print "Calculating number of IBM transcript words per minute of audio"
    print "==============================================================="
    suffix = ".out/hypotheses.txt.dictated"
    suffix2 = ".out/hypotheses.txt"
    prefix = "ibm_stt/"
    i_words_per_min = calc_stat_per_minute(ibm_stats, audio_stats, prefix, suffix, suffix2, "word_count")
    print "==============================================================="
    print "IBM wpm tallied: %d" % len(i_words_per_min)
    print "==============================================================="
    print
    print "==============================================================="
    print "Calculating IBM processing time per minute of audio"
    print "==============================================================="
    i_proc_per_min = calc_stat_per_minute(ibm_pstats, audio_stats, prefix, suffix, suffix2, "transcribe_seconds")
    print "==============================================================="
    print "IBM ppm tallied: %d" % len(i_proc_per_min)
    print "==============================================================="
    print
    print "==============================================================="
    print "Calculating number of Google transcript words per minute of audio"
    print "==============================================================="
    suffix = ".out/transcript.txt.dictated"
    suffix2 = ".out/transcript.txt"
    prefix = "google_stt/"
    g_words_per_min = calc_stat_per_minute(google_stats, audio_stats, prefix, suffix, suffix2, "word_count")
    print "==============================================================="
    print "Google wpm tallied: %d" % len(g_words_per_min)
    print "==============================================================="
    print
    print "==============================================================="
    print "Calculating Google processing time per minute of audio"
    print "==============================================================="
    g_proc_per_min = calc_stat_per_minute(google_pstats, audio_stats, prefix, suffix, suffix2, "transcribe_seconds")
    print "==============================================================="
    print "Google ppm tallied: %d" % len(g_proc_per_min)
    print "==============================================================="
    print

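calc_stat_per_minute is not shown in these examples; judging from the prefix/suffix arguments above, it presumably maps a transcript key onto its audio key roughly like this sketch (an assumption, not confirmed by the source):

prefix = "ibm_stt/"
suffix2 = ".out/hypotheses.txt"
key = "ibm_stt/show1/ep01.out/hypotheses.txt"
audio_key = key[len(prefix):]              # "show1/ep01.out/hypotheses.txt"
if audio_key.endswith(suffix2):
    audio_key = audio_key[:-len(suffix2)]  # "show1/ep01"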
def analyze_transcribe_time(inpath,
                            basepath,
                            outpath,
                            ext=".out",
                            logname="sttclient.log"):
    result = {}
    if args.google:
        processed_filepath = os.path.join(outpath,
                                          GOOGLE_PROCESSED_STATS_FILENAME)
    else:
        processed_filepath = os.path.join(outpath,
                                          IBM_PROCESSED_STATS_FILENAME)
    if os.path.isfile(processed_filepath):
        with open(processed_filepath) as file1:
            loaded = json.load(file1)
            log_kv("Loaded", processed_filepath)
            if loaded and type(loaded) is dict:
                result = loaded
            log_kv("Count(previous)", len(result))
    logs = walk_logs(inpath, basepath, logname)
    cumulative_time = 0.0
    count = 0
    prev = 0
    skipped = 0
    total = 0
    for xx, yy in logs:
        total += 1
        # The actual key used to store the result.
        # If basepath==inpath, then keys in google result match keys in ibm result for easier cross-reference.
        # If basepath<inpath, then keys in google and ibm results retain their distinction for easier merge and safety
        id = yy.replace((ext + "/" + logname), '')
        if args.keep and id in result:
            prev += 1
            count += 1
            try:
                cumulative_time += result[id]["transcribe_seconds"]
            except Exception:
                logging.error("Missing field transcribe_seconds : %s", yy)
            continue
        unixmtime = os.path.getmtime(xx)
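        # st_birthtime (file creation time) is only available on macOS/BSD;
        # on Linux, os.stat() has no st_birthtime attribute.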
        birthtime = os.stat(xx).st_birthtime
        diff = unixmtime - birthtime
        if unixmtime:
            if id not in result:
                result[id] = {"unixmtime": unixmtime}
            else:
                result[id]["unixmtime"] = unixmtime

        if diff > 3600 or diff < 10:
            logging.warn("Skipped transcription time: %s", id)
            skipped += 1
            continue
        else:
            cumulative_time += diff
            count += 1
            if id not in result:
                result[id] = {"transcribe_seconds": diff}
            else:
                result[id]["transcribe_seconds"] = diff

    log_kv("Skipped", skipped)
    log_kv("Result size", len(result))
    if result:
        if DRYRUN:
            log_kv("Warning", "Dry run only")
        else:
            log_kv("Writing", processed_filepath)
            with open(processed_filepath, 'w') as outfile:
                json.dump(result, outfile, indent=2)
    print "\n\n"
    print "Transcription Processing Time (estimated) \n"
    if skipped:
        print "Previous: %d  Skipped: %d   Total: %d" % (prev, skipped, total)
    print "Count:    %s  Avg transcribe time: %.2f minutes" % (
        count, float(cumulative_time) / 60.0 / count if count else 0)
    print
Example #20
    logging.info("")


if __name__ == '__main__':

    start_time = time.time()

    parser = argparse.ArgumentParser(description='Tally audio file specs')
    parser.add_argument('--infolder','-i', action='store', default='.', help='folder containing audio files')
    parser.add_argument('--basefolder','-b', action='store', default=DEFAULT_BASE_PATH, help='base directory')
    parser.add_argument('--outfolder','-o', action='store', default='/tmp/transcription/text2stats_dev', help='output directory')
    parser.add_argument('--verbose','-v', action='store_true', help='Spew logs profusely.')
    parser.add_argument('--keep','-k', action='store_true', help='Do not reprocess files already in previous result.')
    args = parser.parse_args()

    log_kv("Running", __file__)
    log_kv("From", os.path.dirname(os.path.realpath(__file__)))
    log_kv("--infolder", args.infolder)
    inpath = os.path.realpath(os.path.expanduser(args.infolder))
    inpath = inpath if inpath.endswith("/") else inpath+"/"
    log_kv("inpath", inpath)
    log_kv("--outfolder", args.outfolder)
    outpath = os.path.realpath(os.path.expanduser(args.outfolder))
    log_kv("outpath", outpath)
    log_kv("--basefolder", args.basefolder)
    basepath = os.path.realpath(os.path.expanduser(args.basefolder))
    log_kv("basepath", basepath)

    result_filepath = os.path.join(outpath, RESULT_FILENAME)
    log_kv('result_filepath', result_filepath)
    make_dir(outpath)