Example #1
def main(argv):
    # Args parser
    args = parseArgs(argv)

    print("=============================================================")
    print(f"Quantizing data from {args.pathDB}")
    print("=============================================================")

    # Get splits
    if args.split:
        assert len(args.split.split("-"))==2 and int(args.split.split("-")[1]) >= int(args.split.split("-")[0]) >= 1, \
            "SPLIT must be of the form idxSplit-numSplits (numSplits >= idxSplit >= 1), e.g. --split 1-20"
        idx_split, num_splits = args.split.split("-")
        idx_split = int(idx_split)
        num_splits = int(num_splits)

    # Find all sequences
    print("")
    print(f"Looking for all {args.file_extension} files in {args.pathDB}")
    seqNames, _ = findAllSeqs(args.pathDB,
                              speaker_level=1,
                              extension=args.file_extension,
                              loadCache=True)
    if len(seqNames) == 0 or not os.path.splitext(seqNames[0][1])[1].endswith(
            args.file_extension):
        print(
            "Seems like _seq_cache.txt does not contain the correct extension, reloading the file list"
        )
        seqNames, _ = findAllSeqs(args.pathDB,
                                  speaker_level=1,
                                  extension=args.file_extension,
                                  loadCache=False)
    print(f"Done! Found {len(seqNames)} files!")

    # Filter specific sequences
    if args.pathSeq:
        print("")
        print(f"Filtering seqs in {args.pathSeq}")
        with open(args.pathSeq, 'r') as f:
            seqs = set([x.strip() for x in f])
        filtered = []
        for s in seqNames:
            if os.path.splitext(s[1].split('/')[-1])[0] in seqs:
                filtered.append(s)
        seqNames = filtered
        print(f"Done! {len(seqNames)} files filtered!")

    # Check if directory exists
    if not os.path.exists(args.pathOutputDir):
        print("")
        print(f"Creating the output directory at {args.pathOutputDir}")
        Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True)
    writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args)

    # Check if output file exists
    if not args.split:
        nameOutput = "quantized_outputs.txt"
    else:
        nameOutput = f"quantized_outputs_split_{idx_split}-{num_splits}.txt"
    outputFile = os.path.join(args.pathOutputDir, nameOutput)

    # Get splits
    if args.split:
        startIdx = len(seqNames) // num_splits * (idx_split - 1)
        if idx_split == num_splits:
            endIdx = len(seqNames)
        else:
            endIdx = min(
                len(seqNames) // num_splits * idx_split, len(seqNames))
        seqNames = seqNames[startIdx:endIdx]
        print("")
        print(
            f"Quantizing split {idx_split} out of {num_splits} splits, with {len(seqNames)} files (idx in range({startIdx}, {endIdx}))."
        )
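
        # A worked example of the arithmetic above (illustrative numbers):
        # with 103 files and --split 3-4, each chunk holds 103 // 4 = 25 files,
        # so split 3 covers indices 50..74, while the last split (4) takes
        # endIdx = len(seqNames) and absorbs the remainder (75..102).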

    # Debug mode
    if args.debug:
        nsamples = 20
        print("")
        print(f"Debug mode activated, only load {nsamples} samples!")
        # shuffle(seqNames)
        seqNames = seqNames[:nsamples]

    # Continue
    addEndLine = False  # whether to prepend a newline (\n) before the next line written
    if args.resume:
        if os.path.exists(outputFile):
            with open(outputFile, 'r') as f:
                lines = [line for line in f]
            existing_files = set([x.split()[0] for x in lines if x.split()])
            seqNames = [
                s for s in seqNames if os.path.splitext(s[1].split('/')[-1])[0]
                not in existing_files
            ]
            print(
                f"Found existing output file, continue to quantize {len(seqNames)} audio files left!"
            )
            if len(lines) > 0 and not lines[-1].endswith("\n"):
                addEndLine = True
    else:
        assert not os.path.exists(outputFile), \
            f"Output file {outputFile} already exists! If you want to continue quantizing audio files, use the --resume option."

    assert len(seqNames) > 0, \
        "No file to be quantized!"

    # Load Clustering args
    assert args.pathClusteringCheckpoint[-3:] == ".pt", \
        "pathClusteringCheckpoint must point to a .pt file"
    if os.path.exists(args.pathClusteringCheckpoint[:-3] + "_args.json"):
        pathConfig = args.pathClusteringCheckpoint[:-3] + "_args.json"
    elif os.path.exists(
            os.path.join(os.path.dirname(args.pathClusteringCheckpoint),
                         "checkpoint_args.json")):
        pathConfig = os.path.join(
            os.path.dirname(args.pathClusteringCheckpoint),
            "checkpoint_args.json")
    else:
        assert False, \
            f"Args file not found in the directory {os.path.dirname(args.pathClusteringCheckpoint)}"
    clustering_args = readArgs(pathConfig)
    print("")
    print(
        f"Clustering args:\n{json.dumps(vars(clustering_args), indent=4, sort_keys=True)}"
    )
    print('-' * 50)

    # Load ClusterModule
    print("")
    print(f"Loading ClusterModule at {args.pathClusteringCheckpoint}")
    clusterModule = loadClusterModule(args.pathClusteringCheckpoint)
    if not args.cpu:
        clusterModule.cuda()
    print("ClusterModule loaded!")

    # Get the CPC checkpoint path from clustering args
    if not os.path.isabs(
            clustering_args.pathCheckpoint):  # it may be a relative path
        clustering_args.pathCheckpoint = os.path.join(
            os.path.dirname(os.path.abspath(args.pathClusteringCheckpoint)),
            clustering_args.pathCheckpoint)
    assert os.path.exists(clustering_args.pathCheckpoint), \
        f"CPC path at {clustering_args.pathCheckpoint} does not exist!!"

    # Load FeatureMaker
    print("")
    print(f"Loading CPC FeatureMaker from {clustering_args.pathCheckpoint}")
    ## Without the batch implementation, we can let the LSTM model keep its hidden
    ## units between sequences, improving the quality of the quantized units
    ## (hence keep_hidden=args.nobatch)
    featureMaker = loadCPCFeatureMaker(
        clustering_args.pathCheckpoint,
        gru_level=vars(clustering_args).get('level_gru', None),
        get_encoded=clustering_args.encoder_layer,
        keep_hidden=args.nobatch)
    if clustering_args.dimReduction is not None:
        dimRed = loadDimReduction(clustering_args.dimReduction,
                                  clustering_args.centroidLimits)
        featureMaker = torch.nn.Sequential(featureMaker, dimRed)
    if not clustering_args.train_mode:
        featureMaker.eval()
    if not args.cpu:
        featureMaker.cuda()

    def cpc_feature_function(x):
        if not args.nobatch:
            return buildFeature_batch(featureMaker,
                                      x,
                                      seqNorm=False,
                                      strict=args.strict,
                                      maxSizeSeq=args.max_size_seq,
                                      batch_size=args.batch_size)
        else:
            return buildFeature(featureMaker,
                                x,
                                seqNorm=False,
                                strict=args.strict)

    print("CPC FeatureMaker loaded!")

    # Quantization of files
    print("")
    print(f"Quantizing audio files and saving outputs to {outputFile}...")
    f = open(outputFile, "a")
    bar = progressbar.ProgressBar(maxval=len(seqNames))
    bar.start()
    start_time = time()
    for index, vals in enumerate(seqNames):
        bar.update(index)

        file_path = vals[1]
        file_path = os.path.join(args.pathDB, file_path)

        # Quantizing
        quantLine = quantize_file(file_path, cpc_feature_function,
                                  clusterModule)

        # Save the outputs
        file_name = os.path.splitext(os.path.basename(file_path))[0]
        outLine = "\t".join([file_name, quantLine])
        if addEndLine:
            f.write("\n" + outLine)
        else:
            f.write(outLine)
            addEndLine = True
    bar.finish()
    print(f"...done {len(seqNames)} files in {time()-start_time} seconds.")
    f.close()
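
The output file written above is tab-separated: one line per audio file, holding
the file name and the comma-separated unit indices produced by quantize_file.
A two-line illustration (names and indices are made up):

    6930-75918-0000	31,13,12,12,6,6,21
    6930-75918-0001	44,44,16,10,10,57
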
Example #2
def main(argv):
    # Args parser
    args = parseArgs(argv)

    print("=============================================================")
    print(f"Building 1-hot features from {args.pathQuantizedUnits}")
    print("=============================================================")

    # Load input file
    print("")
    print(f"Reading input file from {args.pathQuantizedUnits}")
    seqNames = []
    seqInputs = []
    with open(args.pathQuantizedUnits, 'r') as f:
        for line in f:
            file_name, file_seq = line.strip().split("\t")
            # Convert sequence to the desired input form
            file_seq = file_seq.replace(",", " ")
            # Add to lists
            seqNames.append(file_name)
            seqInputs.append(file_seq)
    print(f"Found {len(seqNames)} sequences!")

    # Verify the output directory
    if os.path.exists(args.pathOutputDir):
        existing_files = set([
            os.path.splitext(os.path.basename(x))[0]
            for x in os.listdir(args.pathOutputDir) if x[-4:] == ".txt"
        ])
        kept = [(name, seq) for name, seq in zip(seqNames, seqInputs)
                if os.path.splitext(os.path.basename(name))[0] not in existing_files]
        seqNames = [name for name, _ in kept]
        seqInputs = [seq for _, seq in kept]
        print(
            f"Found existing output directory at {args.pathOutputDir}, continue to build features of {len(seqNames)} audio files left!"
        )
    else:
        print("")
        print(f"Creating the output directory at {args.pathOutputDir}")
        Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True)
    writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args)

    # Debug mode
    if args.debug:
        nsamples = 20
        print("")
        print(f"Debug mode activated, only load {nsamples} samples!")
        # shuffle(seqNames)
        seqNames = seqNames[:nsamples]
        seqInputs = seqInputs[:nsamples]

    # Load the 1hot dictionary in case we use it
    if seqInputs and not seqInputs[0].split()[0].isdigit():  # multi-group, i.e. 65-241
        assert args.dict is not None, \
            "A dictionary must be given when the quantized outputs are not digits (multi-group case)!"
    if args.dict:
        print("")
        print(f"Loading onehot dictionary from {args.dict}...")
        with open(args.dict, "r") as f:
            lines = f.read().split("\n")
        pair2idx = {
            word.split()[0]: i
            for i, word in enumerate(lines)
            if word and not word.startswith("madeupword")
        }
        args.n_units = len(pair2idx)
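
        # Illustrative dict.txt lines (fairseq format, "<token> <count>"):
        #   65-241 9934
        #   65-90 8712
        #   madeupword0000 0   <- fairseq padding entry, skipped above
        # With the two real tokens on the first two lines, pair2idx maps
        # "65-241" -> 0 and "65-90" -> 1.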

    # Define onehot_feature_function
    def onehot_feature_function(input_sequence):
        if args.dict:
            indexes_sequence = np.array(
                [pair2idx[item] for item in input_sequence.split()])
        else:
            indexes_sequence = np.array(
                [int(item) for item in input_sequence.split()])

        onehotFeatures = np.eye(args.n_units)[indexes_sequence]

        return onehotFeatures
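
    # Minimal sketch of the function above (illustrative values): calling
    # onehot_feature_function("1 5 2") with args.n_units = 50 returns
    # np.eye(50)[[1, 5, 2]], a (3, 50) array with a single 1.0 per row.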

    # Building features
    print("")
    print(
        f"Building 1-hot features and saving outputs to {args.pathOutputDir}..."
    )
    bar = progressbar.ProgressBar(maxval=len(seqNames))
    bar.start()
    start_time = time()
    for index, (name_seq, input_seq) in enumerate(zip(seqNames, seqInputs)):
        bar.update(index)

        # Computing features
        onehot_features = onehot_feature_function(input_seq)

        # Save the outputs
        file_name = os.path.splitext(name_seq)[0] + ".txt"
        file_out = os.path.join(args.pathOutputDir, file_name)
        np.savetxt(file_out, onehot_features)
    bar.finish()
    print(f"...done {len(seqNames)} files in {time()-start_time} seconds.")
Example #3
def main(argv):
    # Args parser
    args = parseArgs(argv)

    print("=============================================================")
    print(f"Building BERT features from {args.pathQuantizedUnits}")
    print("=============================================================")

    # Load input file
    print("")
    print(f"Reading input file from {args.pathQuantizedUnits}")
    seqNames = []
    seqInputs = []
    with open(args.pathQuantizedUnits, 'r') as f:
        for line in f:
            file_name, file_seq = line.strip().split("\t")
            # Convert sequence to the desired input form
            file_seq = file_seq.replace(",", " ")
            # Add to lists
            seqNames.append(file_name)
            seqInputs.append(file_seq)
    print(f"Found {len(seqNames)} sequences!")

    # Verify the output directory
    if os.path.exists(args.pathOutputDir):
        existing_files = set([
            os.path.splitext(os.path.basename(x))[0]
            for x in os.listdir(args.pathOutputDir) if x[-4:] == ".txt"
        ])
        kept = [(name, seq) for name, seq in zip(seqNames, seqInputs)
                if os.path.splitext(os.path.basename(name))[0] not in existing_files]
        seqNames = [name for name, _ in kept]
        seqInputs = [seq for _, seq in kept]
        print(
            f"Found existing output directory at {args.pathOutputDir}, continue to build features of {len(seqNames)} audio files left!"
        )
    else:
        print("")
        print(f"Creating the output directory at {args.pathOutputDir}")
        Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True)
    writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args)

    # Debug mode
    if args.debug:
        nsamples = 20
        print("")
        print(f"Debug mode activated, only load {nsamples} samples!")
        # shuffle(seqNames)
        seqNames = seqNames[:nsamples]
        seqInputs = seqInputs[:nsamples]

    # Load LSTM model
    if args.dict is None:
        pathData = os.path.dirname(args.pathLSTMCheckpoint)
    else:
        pathData = os.path.dirname(args.dict)
    assert os.path.exists(os.path.join(pathData, "dict.txt")), \
        f"Dictionary file (dict.txt) not found in {pathData}"
    print("")
    print(f"Loading LSTM model from {args.pathLSTMCheckpoint}...")
    print(f"Path data {pathData}")
    model, task = loadLSTMLMCheckpoint(args.pathLSTMCheckpoint, pathData)
    model.eval()  # disable dropout (or leave in train mode to finetune)
    if not args.cpu:
        model.cuda()
    print("Model loaded !")

    # Define LSTM_feature_function
    def LSTM_feature_function(input_sequence, n_hidden=-1):
        # Get the number of layers
        num_layers = len(model.decoder.layers)
        assert abs(n_hidden) <= num_layers, \
            "absolute value of n_hidden must be less than or equal to the number of hidden layers = {}".format(num_layers)

        if n_hidden < 0:
            n_hidden = num_layers + 1 + n_hidden

        # Get input tensor
        input_tensor = task.source_dictionary.encode_line(
            "<s> " + input_sequence, append_eos=True,
            add_if_not_exist=False).type(torch.LongTensor).unsqueeze(0)
        if not args.cpu:
            input_tensor = input_tensor.cuda()

        # Get the output
        if n_hidden == 0:  # Take the embedding layer
            with torch.no_grad():
                output_tensor = model.decoder.embed_tokens(input_tensor)

        else:
            decoder_clone = deepcopy(model.decoder)

            # We don't take the final fc features
            decoder_clone.fc_out = torch.nn.Identity()
            decoder_clone.additional_fc = torch.nn.Identity()

            # Restrict the number of hidden layers to n_hidden
            decoder_clone.layers = decoder_clone.layers[:n_hidden]

            with torch.no_grad():
                output_tensor = decoder_clone(input_tensor)[0]

        return output_tensor[0].data.cpu().numpy()
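
    # Illustrative n_hidden resolution (assuming num_layers = 3):
    #   n_hidden = -1  ->  3 + 1 - 1 = 3  (keep all three layers)
    #   n_hidden = -3  ->  3 + 1 - 3 = 1  (keep only the first layer)
    #   n_hidden =  0  ->  use the embedding layer only (handled above)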

    # Building features
    print("")
    print(
        f"Building LSTM features and saving outputs to {args.pathOutputDir}..."
    )
    bar = progressbar.ProgressBar(maxval=len(seqNames))
    bar.start()
    start_time = time()
    for index, (name_seq, input_seq) in enumerate(zip(seqNames, seqInputs)):
        bar.update(index)

        # Computing features
        LSTM_features = LSTM_feature_function(input_seq,
                                              n_hidden=args.hidden_level)

        # Save the outputs
        file_name = os.path.splitext(name_seq)[0] + ".txt"
        file_out = os.path.join(args.pathOutputDir, file_name)
        np.savetxt(file_out, LSTM_features)
    bar.finish()
    print(f"...done {len(seqNames)} files in {time()-start_time} seconds.")
Example #4
def main(argv):
    # Args parser
    args = parseArgs(argv)

    print("=============================================================")
    print(f"Building BERT features from {args.pathQuantizedUnits}")
    print("=============================================================")

    # Load input file
    print("")
    print(f"Reading input file from {args.pathQuantizedUnits}")
    seqNames = []
    seqInputs = []
    with open(args.pathQuantizedUnits, 'r') as f:
        for line in f:
            file_name, file_seq = line.strip().split("\t")
            # Convert sequence to the desired input form
            file_seq = file_seq.replace(",", " ")
            # Add to lists
            seqNames.append(file_name)
            seqInputs.append(file_seq)
    print(f"Found {len(seqNames)} sequences!")

    # Verify the output directory
    if os.path.exists(args.pathOutputDir):
        existing_files = set([
            os.path.splitext(os.path.basename(x))[0]
            for x in os.listdir(args.pathOutputDir) if x[-4:] == ".txt"
        ])
        kept = [(name, seq) for name, seq in zip(seqNames, seqInputs)
                if os.path.splitext(os.path.basename(name))[0] not in existing_files]
        seqNames = [name for name, _ in kept]
        seqInputs = [seq for _, seq in kept]
        print(
            f"Found existing output directory at {args.pathOutputDir}, continue to build features of {len(seqNames)} audio files left!"
        )
    else:
        print("")
        print(f"Creating the output directory at {args.pathOutputDir}")
        Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True)
    writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args)

    # Debug mode
    if args.debug:
        nsamples = 20
        print("")
        print(f"Debug mode activated, only load {nsamples} samples!")
        # shuffle(seqNames)
        seqNames = seqNames[:nsamples]
        seqInputs = seqInputs[:nsamples]

    # Load BERT model
    if args.dict is None:
        pathData = os.path.dirname(args.pathBERTCheckpoint)
    else:
        pathData = os.path.dirname(args.dict)
    assert os.path.exists(os.path.join(pathData, "dict.txt")), \
        f"Dictionary file (dict.txt) not found in {pathData}"
    print("")
    print(f"Loading RoBERTa model from {args.pathBERTCheckpoint}...")
    print(f"Path data {pathData}")
    roberta = loadRobertaCheckpoint(args.pathBERTCheckpoint,
                                    pathData,
                                    from_pretrained=False)
    roberta.eval()  # disable dropout (or leave in train mode to finetune)
    if not args.cpu:
        roberta.cuda()
    print("Model loaded !")

    # Define BERT_feature_function
    def BERT_feature_function(input_sequence, n_hidden=-1):
        sentence_tokens = roberta.task.source_dictionary.encode_line(
            "<s> " + input_sequence, append_eos=True,
            add_if_not_exist=False).type(torch.LongTensor)
        if not args.cpu:
            sentence_tokens = sentence_tokens.cuda()

        with torch.no_grad():
            outputs = roberta.extract_features(sentence_tokens,
                                               return_all_hiddens=True)

        return outputs[n_hidden].squeeze(0).float().cpu().numpy()
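
    # Note: with return_all_hiddens=True, extract_features returns one tensor
    # per layer (index 0 = embedding output, -1 = last transformer layer), so
    # n_hidden=-1 above selects the final hidden states.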

    # Building features
    print("")
    print(
        f"Building BERT features and saving outputs to {args.pathOutputDir}..."
    )
    bar = progressbar.ProgressBar(maxval=len(seqNames))
    bar.start()
    start_time = time()
    for index, (name_seq, input_seq) in enumerate(zip(seqNames, seqInputs)):
        bar.update(index)

        # Computing features
        BERT_features = BERT_feature_function(input_seq,
                                              n_hidden=args.hidden_level)

        # Save the outputs
        file_name = os.path.splitext(name_seq)[0] + ".txt"
        file_out = os.path.join(args.pathOutputDir, file_name)
        np.savetxt(file_out, BERT_features)
    bar.finish()
    print(f"...done {len(seqNames)} files in {time()-start_time} seconds.")
Example #5
def main(argv):
    # Args parser
    args = parseArgs(argv)

    print("=============================================================")
    print(f"Building CPC features from {args.pathDB}")
    print("=============================================================")

    # Find all sequences
    print("")
    print(f"Looking for all {args.file_extension} files in {args.pathDB}")
    seqNames, _ = findAllSeqs(args.pathDB,
                              speaker_level=1,
                              extension=args.file_extension,
                              loadCache=True)
    if len(seqNames) == 0 or not os.path.splitext(seqNames[0][1])[1].endswith(
            args.file_extension):
        print(
            "Seems like _seq_cache.txt does not contain the correct extension, reloading the file list"
        )
        seqNames, _ = findAllSeqs(args.pathDB,
                                  speaker_level=1,
                                  extension=args.file_extension,
                                  loadCache=False)
    print(f"Done! Found {len(seqNames)} files!")

    # Verify the output directory
    if os.path.exists(args.pathOutputDir):
        existing_files = set([
            x.split(".")[0]
            for x in os.listdir(args.pathOutputDir) if x.endswith(".ark.gz")
        ])
        seqNames = [
            s for s in seqNames if os.path.splitext(os.path.basename(s[1]))[0]
            not in existing_files
        ]
        print(
            f"Found existing output directory at {args.pathOutputDir}, continue to build features of {len(seqNames)} audio files left!"
        )
    else:
        print("")
        print(f"Creating the output directory at {args.pathOutputDir}")
        Path(args.pathOutputDir).mkdir(parents=True, exist_ok=True)
    writeArgs(os.path.join(args.pathOutputDir, "_info_args.json"), args)

    # Debug mode
    if args.debug:
        nsamples = 20
        print("")
        print(f"Debug mode activated, only load {nsamples} samples!")
        # shuffle(seqNames)
        seqNames = seqNames[:nsamples]

    # Load CPC feature maker
    print("")
    print(f"Loading CPC featureMaker from {args.pathCPCCheckpoint}")
    featureMaker = loadCPCFeatureMaker(args.pathCPCCheckpoint,
                                       gru_level=args.gru_level,
                                       get_encoded=args.get_encoded,
                                       keep_hidden=True)
    featureMaker.eval()
    if not args.cpu:
        featureMaker.cuda()
    print("CPC FeatureMaker loaded!")

    # Define CPC_feature_function
    def CPC_feature_function(x):
        CPC_features = buildFeature(featureMaker,
                                    x,
                                    seqNorm=args.seq_norm,
                                    strict=args.strict,
                                    maxSizeSeq=args.max_size_seq)
        return CPC_features.squeeze(0).float().cpu().numpy()
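
    # buildFeature returns a (1, n_frames, dim) tensor; the squeeze(0) above
    # yields the (n_frames, dim) matrix that is written to the archive below.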

    # Building features
    print("")
    print(
        f"Building CPC features and saving outputs to {args.pathOutputDir}...")
    bar = progressbar.ProgressBar(maxval=len(seqNames))
    bar.start()
    start_time = time()

    for index, vals in enumerate(seqNames):
        bar.update(index)

        file_path = vals[1]
        file_path = os.path.join(args.pathDB, file_path)

        # Computing features
        CPC_features = CPC_feature_function(file_path)

        # Save the outputs
        file_name = os.path.splitext(
            os.path.basename(file_path))[0] + ".ark.gz"
        file_out = os.path.join(args.pathOutputDir, file_name)
        with WriteHelper(f"ark:| gzip -c > {file_out}") as writer:
            writer('arr_0', CPC_features)
    bar.finish()
    print(f"...done {len(seqNames)} files in {time()-start_time} seconds.")