Example No. 1
def upload_folder(root, folder='', bucket_name='illiad-audio', make_public=False):
    """
    Uploads a folder to google cloud storage
    :param path: path to the folder 
    :param bucket_name: name of the bucket to upload to
    :return: list of paths to files
    """
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    blob_names = []
    urls = []
    files = os.listdir(os.path.join(root,folder))
    for i, file in enumerate(files):
        progress_bar(i, len(files), text="uploading file:")
        blob_name = "%s/%s"%(folder, file)
        blob = bucket.blob(blob_name)
        with open(os.path.join(root, folder, file), 'rb') as f:
            blob.upload_from_file(f)
        blob_names.append("gs://%s/%s"%(bucket_name, blob_name))
        if make_public:
            print("making %s public" % blob_name)
            blob.make_public()
            urls.append(unquote(blob.public_url))
    return blob_names, urls
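A minimal usage sketch (not from the source): it assumes the google-cloud-storage library is installed, credentials are configured, and the bucket and paths below are hypothetical.

from google.cloud import storage  # assumed import for the snippet above

blob_names, urls = upload_folder("/data", folder="session1_split",
                                 bucket_name="my-bucket", make_public=True)
print(blob_names[0])  # e.g. gs://my-bucket/session1_split/clip0.flac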
Example No. 2
def upload_files(file_list, bucket_name='illiad-audio', make_public=False):
    """
        Uploads a list of files to google cloud storage
        :param file list: list of paths to the files
        :param bucket_name: name of the bucket to upload to
        :return: list of paths to files
        """
    client = storage.Client()
    bucket = client.get_bucket(bucket_name)
    blob_names = []
    urls = []
    for i, f in enumerate(file_list):
        progress_bar(i, len(file_list), text="uploading file:")
        f = os.path.abspath(f)
        blob_name = "/".join(f.split("/")[-2:])
        blob = bucket.blob(blob_name)
        with open(f, 'rb') as fh:  # avoid rebinding the path variable f
            blob.upload_from_file(fh)
        blob_names.append("gs://%s/%s" % (bucket_name, blob_name))
        if make_public:
            print("making %s public" % blob_name)
            blob.make_public()
            urls.append(unquote(blob.public_url))
    return blob_names, urls
Example No. 3
def slice_file(file):
    data, rate = sf.read(file)  # soundfile.read returns (data, samplerate)
    if len(data.shape) == 1:
        # duplicate a mono signal into two identical channels
        data = np.vstack((data, data)).T

    all_midpoints, all_startpoints = find_divisions(data, SILENCE, rate)
    print("Number of found midpoints is \n"
          "channel1: %d \n"
          "channel2: %d \n" % (len(all_midpoints[0]), len(all_midpoints[1])))
    directory, file = os.path.split(file)
    name = file.split(".")[0]
    flac_dir = "%s/%s_split" % (directory, name)
    if not os.path.isdir(flac_dir):
        os.mkdir(flac_dir)

    for i, channel in enumerate(all_midpoints):

        for j, pair in enumerate(zip([0] + channel, channel + [data.shape[0]])):
            segment = data[pair[0]:pair[1], i]  # avoid shadowing the built-in slice
            startpoint = all_startpoints[i][j]
            time_in_sec = float(startpoint) / float(rate)
            outfile = encode_filename(name, channel=i, timestamp=time_in_sec, extension="flac")
            progress_bar(j, len(channel)+1, "writing flac file: ")
            sf.write(os.path.join(flac_dir, outfile), segment, rate)
    return flac_dir
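A minimal usage sketch (not from the source), assuming the module-level SILENCE threshold and the find_divisions, encode_filename, and progress_bar helpers are available; the path is hypothetical.

flac_dir = slice_file("interviews/session1.wav")
print("slices written to", flac_dir)  # e.g. interviews/session1_split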
Example No. 4
def test(model):
    model.eval()  # disable dropout (and switch batch norm to eval mode)
    test_loss = 0.0
    correct = 0
    total = 0
    with torch.no_grad():
        for batch_idx, (data, labels) in enumerate(valloader):

            if use_gpu:
                inputs = Variable(data.cuda())
                labels = Variable(labels.cuda())
            else:
                inputs, labels = Variable(data), Variable(labels)

            # forward
            outputs = model(inputs)
            _, preds = torch.max(outputs.data, 1)
            loss = criterion(outputs, labels)

            # statistics
            test_loss += loss.item()
            total += labels.size(0)  # running sample count, so the progress bar shows a true running accuracy
            correct += preds.eq(labels).sum().item()

            progress_bar(batch_idx, len(valloader),
                            'Test Loss: %.3f | Test Acc: %.3f%% (c:%d/t:%d)'
                            % (test_loss/(batch_idx+1), 100.*correct/total, correct, total))
    # return the accuracy so the caller can save the model when it improves
    acc = 100.*correct/total
    return acc
Example No. 5
def train(model, criterion, optimizer, scheduler):
    model.train()  # enable dropout
    train_loss = 0
    correct = 0
    total = 0
    for batch_idx, (data, labels) in enumerate(trainloader):

        if use_gpu:  # move data to the GPU when available
            inputs = Variable(data.cuda())
            labels = Variable(labels.cuda())
        else:
            inputs, labels = Variable(data), Variable(labels)

        optimizer.zero_grad()

        # forward
        outputs = model(inputs)
        _, preds = torch.max(outputs.data, 1)
        loss = criterion(outputs, labels)

        loss.backward()  # backpropagation
        optimizer.step()  # apply the gradients; without this step the model never updates

        train_loss += loss.item()
        total += labels.size(0)  # running sample count for a true running accuracy
        correct += preds.eq(labels).sum().item()

        progress_bar(batch_idx, len(trainloader),
                        'Train Loss: %.3f | Train Acc: %.3f%% (c:%d/t:%d)'
                        % (train_loss/(batch_idx+1), 100.*correct/total, correct, total))

    # loss and accuracy at the end of the epoch
    print('Train Loss: {:.4f}, Train Acc: {:.4f} %'.format(train_loss/(batch_idx+1), 100.*correct/total))
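A hedged sketch (not from the source) of how test() and train() might be wired together; model, trainloader, valloader, trainset, valset, and use_gpu are assumed to be defined elsewhere in the script, and the epoch count and save path are hypothetical.

import torch
import torch.nn as nn
import torch.optim as optim

criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
scheduler = optim.lr_scheduler.StepLR(optimizer, step_size=30, gamma=0.1)

best_acc = 0.0
num_epochs = 50  # hypothetical
for epoch in range(num_epochs):
    train(model, criterion, optimizer, scheduler)
    scheduler.step()  # scale the learning rate by gamma every step_size epochs
    acc = test(model)
    if acc > best_acc:  # save the model when accuracy improves
        best_acc = acc
        torch.save(model.state_dict(), 'best_model.pth')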
Example No. 6
def transcribe_slices(uri_list, name="", save_intermediate=True):
    """
    Transcribe a list of uri's that are all part of the same recording
    :param uri_list:
    :param name:
    :param save_intermediate:
    :return:
    """
    transcript = []
    word_num = 0
    complete_operations = np.zeros((len(uri_list),))
    for i, uri in enumerate(uri_list):
        progress_bar(i, len(uri_list), text="transcribing uri: ")
        alternatives = get_google_transcription(uri)
        file = uri.split("//")[-1]
        metadata = decode_filename(file)
        for j, alternative in enumerate(alternatives):
            if not alternative:
                continue
            word = {"text":alternative.transcript,
                    "confidence": alternative.confidence}
            if "channel" in metadata:
                word["speaker"] = metadata["channel"]
            if "timestamp" in metadata:
                word["starttime"] = metadata["timestamp"]
            if "name" in metadata:
                name = metadata["name"]
            word["id"] = "%s_%d" % (name, word_num),
            transcript.append(word)
            word_num += 1
            complete_operations[i] = 1
        if i % INTERMEDIATE_SAVE_TIMESTEPS == 0 and save_intermediate:
            intermediate = {"uri_list": uri_list, "complete_uris": list(complete_operations), "transcript": transcript}
            with open(TEMP_FILE, 'w') as f:
                f.write(json.dumps(intermediate))
    if save_intermediate:
        os.remove(TEMP_FILE)
    return transcript
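decode_filename is not shown on this page; a plausible sketch (an assumption, inferred from the encode pattern in Example No. 9, "name_channel_<i>_timestamp_<100*seconds>.flac") might look like:

import os

def decode_filename(file):
    # parse "name_channel_<i>_timestamp_<t>.flac" back into a metadata dict
    stem = os.path.basename(file).rsplit(".", 1)[0]
    metadata = {}
    if "_channel_" in stem and "_timestamp_" in stem:
        name, rest = stem.split("_channel_", 1)
        channel, timestamp = rest.split("_timestamp_", 1)
        metadata = {"name": name,
                    "channel": int(channel),
                    "timestamp": int(timestamp) / 100.0}  # encoded as int(100 * seconds)
    return metadata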
Example No. 7
def transcribe_in_parallel(uri_list, name=None, save_intermediate=True):

    transcript = []
    word_num = 0
    uri_index = 0
    clients = [speech.Client() for _ in range(MAX_CLIENTS)]

    # Process MAX_CLIENTS URIs at a time
    for k in range(0, len(uri_list), MAX_CLIENTS):
        operations = []
        uris = uri_list[k:k + MAX_CLIENTS]  # slicing clips safely at the end of the list
        for i, uri in enumerate(uris):
            audio_sample = clients[i].sample(
                content=None,
                source_uri=uri,
                encoding='FLAC')
            operations.append(audio_sample.long_running_recognize('en-US'))

        complete_operations = np.zeros((len(operations),))

        while np.sum(complete_operations) != len(operations):
            time.sleep(2)
            incomplete_operations = np.where(complete_operations == 0)[0]
            for index in incomplete_operations:
                operation = operations[index]
                try:
                    operation.poll()
                except ValueError:
                    print("poll raised ValueError; will retry")

                if operation.complete:
                    complete_operations[index] = 1
                    results = operation.results
                    uri_index += 1
                    if results:
                        file = uri_list[k + index].split("//")[-1]
                        metadata = decode_filename(file)
                        for alternative in results:
                            if not alternative:
                                continue
                            word = {"text": alternative.transcript,
                                    "confidence": alternative.confidence}
                            if "channel" in metadata:
                                word["speaker"] = metadata["channel"]
                            if "timestamp" in metadata:
                                word["starttime"] = metadata["timestamp"]
                            if "name" in metadata:
                                name = metadata["name"]
                            word["id"] = "%s_%d" % (name, word_num)
                            transcript.append(word)
                            word_num += 1
        progress_bar(uri_index, len(uri_list), text="transcribed uris: ")
        if save_intermediate:
            intermediate = {"uri_list": uri_list, "complete_uris":list(complete_operations), "transcript":transcript}
            with open(TEMP_FILE, 'w') as f:
                f.write(json.dumps(intermediate))
    if save_intermediate:
        os.remove(TEMP_FILE)
    return transcript
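A hedged end-to-end sketch (not from the source) combining the helpers on this page; it assumes the legacy google-cloud-speech Client API used above and the upload_folder from Example No. 1, and the paths are hypothetical.

blob_names, _ = upload_folder("/data", folder="session1_split")
transcript = transcribe_in_parallel(blob_names, name="session1")
print("transcribed %d words" % len(transcript))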
Example No. 8
            headers['Accept-Language'] = 'en-US,en;q=0.9'

            res = utility.request_url(url, headers)

            length = res.getheader('content-length')
            if length:
                length = int(length)

                if os.path.isfile(file):
                    if os.path.getsize(file) == length:
                        logger.debug('file {} with the same size already exists; not downloading again'.format(file))
                        print('file {} already exists, cancelling download.'.format(file))
                        exit(0)

                block_size = 1024 * 16  # read 16 KiB at a time
                utility.progress_bar(0, length, prefix='Progress:', suffix='Complete', length=50)
                with open(dn_file, "wb") as f:
                    size = 0
                    while True:
                        data = res.read(block_size)
                        if not data:
                            if size == length:
                                break
                            else:
                                raise Exception('{} downloaded size is not same as expected length.'.format(dn_file))
                        f.write(data)
                        size += len(data)
                        if length:
                            utility.progress_bar(size, length, prefix='Progress:', suffix='Complete', length=50)

                if os.path.isfile(file):
Example No. 9
                ((long_regions[:, 1] - long_regions[:, 0]) / 2 +
                 long_regions[:, 0]).astype("int32")))
        all_midpoints.append(midpoints)
    return all_midpoints


if __name__ == "__main__":
    rate, data = wavfile.read(path)
    all_midpoints = find_divisions(data, SILENCE, rate)
    print("Number of found midpoints is \n"
          "channel1: %d \n"
          "channel2: %d \n" % (len(all_midpoints[0]), len(all_midpoints[1])))
    directory, file = os.path.split(path)
    name = file.split(".")[0]
    flac_dir = "%s/%s_split" % (directory, name)
    if not os.path.isdir(flac_dir):
        os.mkdir(flac_dir)
    for i, channel in enumerate(all_midpoints):
        for j, midpoint in enumerate(channel):
            if j == 0:
                segment = data[0:midpoint, i]
            elif j == len(channel) - 1:
                segment = data[midpoint:, i]  # keep everything after the last midpoint, including the final sample
            else:
                segment = data[channel[j - 1]:midpoint, i]
            time_in_sec = float(midpoint) / float(rate)
            outfile = "%s/%s_channel_%d_timestamp_%d.flac" % (
                flac_dir, name, i, int(100 * time_in_sec))
            progress_bar(j, len(channel))
            sf.write(outfile, segment, rate)
Example No. 10
def download_tasks(size):
    global all_topics
    cnt_waiting, cnt_doing, cnt_completed, cnt_terminated, cnt_killed = update_download_tasks_status()
    cnt_topic_waiting, cnt_topic_completed, cnt_topic_downloading, cnt_topic_failed = count_topic_download_status()

    free_space = size - cnt_doing - cnt_waiting  # remaining download slots
    cnt_topic_remains = 0
    cnt = 0
    for topic in all_topics:
        if cnt_topic_completed + cnt_topic_failed < len(all_topics):  # avoid drawing the progress bar twice
            utility.progress_bar(cnt_topic_completed + cnt_topic_failed,
                                 len(all_topics),
                                 prefix='Scanning:',
                                 suffix='Complete',
                                 length=50)
        if 'download' not in topic:
            if free_space > 0:
                cnt += 1
                utility.progress_bar(cnt_topic_completed + cnt_topic_failed,
                                     len(all_topics),
                                     prefix='Loading :',
                                     suffix='{}/{}     '.format(
                                         cnt, free_space),
                                     length=50)
                mid = topic['mid']
                aid = topic['aid']
                cid = topic['cid']
                title = topic['title']
                videos = get_videos(mid, aid, cid, title)
                if videos is None:
                    topic['url'] = ''
                    save_failed_download(topic)
                else:
                    for video in videos:
                        video['status'] = 0  # waiting to start
                        video['process'] = None
                    all_downloads.extend(videos)
                    topic['download'] = videos
                    free_space -= len(videos)

            else:
                cnt_topic_remains += 1

    trigger_downloads(size)

    cnt_waiting, cnt_doing, cnt_completed, cnt_terminated, cnt_killed = update_download_tasks_status()
    cnt_topic_waiting, cnt_topic_completed, cnt_topic_downloading, cnt_topic_failed = count_topic_download_status()

    utility.progress_bar(cnt_topic_completed + cnt_topic_failed,
                         len(all_topics),
                         prefix='Progress:',
                         suffix='Complete',
                         length=50)

    if cnt_topic_remains > 0 or cnt_waiting > 0 or cnt_doing > 0:
        timer = threading.Timer(2.0, download_tasks, [size])  # poll again in 2 seconds while work remains
        timer.start()
Example No. 11
def load_topics(urls, words=None):
    if not check_urls(urls):
        exit(-1)
    else:
        results = []
        for idx, url in enumerate(urls, start=1):
            print('processing url {}/{} : {}'.format(idx, len(urls), url))
            mid = url.split('/')[3]  # the user id (mid) is the fourth slash-separated segment
            if len(mid) == 0:
                logger.warning('error : failed to locate mid in {}'.format(url))
            else:
                print('loading topics primary data ...')
                page = 1
                topics = get_topics(mid, page)
                if topics is not None:
                    pages = topics[0]['pages']
                    count = topics[0]['count']

                    print('{} topics in {} pages to be loaded'.format(
                        count, pages))
                    # reload and apply keywords
                    topics = []
                    utility.progress_bar(0,
                                         pages,
                                         prefix='Progress:',
                                         suffix='Complete',
                                         length=50)
                    for page in range(1, pages + 1):
                        new_page_topics = get_topics(mid, page, words)
                        if new_page_topics is not None:
                            topics.extend(new_page_topics)
                        utility.progress_bar(page,
                                             pages,
                                             prefix='Progress:',
                                             suffix='Complete',
                                             length=50)
                    if len(topics) > 0:
                        print('loading cid for {} topic(s)'.format(
                            len(topics)))
                        utility.progress_bar(0,
                                             len(topics),
                                             prefix='Progress:',
                                             suffix='Complete',
                                             length=50)
                        for step, topic in enumerate(topics, start=1):
                            ref = topic['ref']
                            title = topic['title']
                            aid = topic['aid']
                            cid = get_cid(aid, ref)
                            if cid is not None:
                                results.append(
                                    dict(mid=mid,
                                         aid=aid,
                                         cid=cid,
                                         title=title,
                                         url=url))
                            else:
                                # failed to get cid, save to error file
                                save_failed_download(
                                    dict(mid=mid,
                                         aid=aid,
                                         cid='',
                                         title=title,
                                         url=url))
                            utility.progress_bar(step,
                                                 len(topics),
                                                 prefix='Progress:',
                                                 suffix='Complete',
                                                 length=50)
        print('{} topics loaded.'.format(len(results)))
        return results
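A minimal usage sketch (not from the source); the URL and keyword are hypothetical, and only the fourth slash-separated segment (the mid) matters to the parser above.

topics = load_topics(['https://space.example.com/123456/video'], words=['lecture'])
print('{} topics ready for download'.format(len(topics)))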