Beispiel #1
0
def create_cache(args):
    """Pre-compute ELMo embeddings for a dataset and store them in a cache file.

    For each of ``train.txt``, ``dev.txt`` and ``test.txt`` under
    ``data/<args.datasetName>/`` the sentences are read with ``readCoNLL``,
    their token lists are passed to ``ELMoWordEmbeddings.addToCache`` and,
    once all splits are processed, the cache is written with ``storeCache``.

    Args:
        args: Object (e.g. an ``argparse.Namespace``) providing
            ``datasetName`` (sub-directory of ``data/``),
            ``tokenColumnId`` (column id mapped to ``'tokens'`` for
            ``readCoNLL``), ``cuda_device`` (forwarded as
            ``elmo_cuda_device``), ``elmo_options`` and ``elmo_weights``
            (forwarded to ``ELMoWordEmbeddings``) and ``pkl_path`` (where
            the cache is stored).
    """
    datasetName = args.datasetName
    tokenColId = args.tokenColumnId
    cudaDevice = args.cuda_device
    elmo_options_file = args.elmo_options
    elmo_weight_file = args.elmo_weights
    picklePath = args.pkl_path

    # :: Logging level ::
    loggingLevel = logging.INFO
    logger = logging.getLogger()
    logger.setLevel(loggingLevel)

    # Install the stdout handler only once: adding a fresh handler on every
    # call would make each log message appear once per previous invocation.
    handlerName = 'create_cache_stdout'
    if not any(h.get_name() == handlerName for h in logger.handlers):
        ch = logging.StreamHandler(sys.stdout)
        ch.set_name(handlerName)
        ch.setLevel(loggingLevel)
        ch.setFormatter(logging.Formatter('%(message)s'))
        logger.addHandler(ch)

    commentSymbol = None
    columns = {tokenColId: 'tokens'}

    # NOTE(review): the meaning of the leading ``None`` argument is not
    # visible from this function -- confirm against ELMoWordEmbeddings.
    embLookup = ELMoWordEmbeddings(None,
                                   elmo_options_file,
                                   elmo_weight_file,
                                   elmo_cuda_device=cudaDevice)

    print("ELMo Cache Generation")
    print("Output file:", picklePath)
    print("CUDA Device:", cudaDevice)

    splitFiles = ['train.txt', 'dev.txt', 'test.txt']

    for splitFile in splitFiles:
        inputPath = os.path.join('data', datasetName, splitFile)

        print("Adding file to cache: " + inputPath)
        sentences = readCoNLL(inputPath, columns, commentSymbol)
        tokens = [sentence['tokens'] for sentence in sentences]

        # Time only the embedding computation, not the file parsing.
        start_time = time.time()
        embLookup.addToCache(tokens)
        end_time = time.time()
        print("%s processed in %.1f seconds" %
              (splitFile, end_time - start_time))
        print("\n---\n")

    print("Store file at:", picklePath)
    embLookup.storeCache(picklePath)
Beispiel #2
0
# Script fragment: builds an ELMo embedding cache for one dataset.
# NOTE(review): tokenColId, datasetName, commentSymbol, cudaDevice,
# elmo_options_file and elmo_weight_file are not defined in this fragment;
# they are expected to be set earlier in the script.

# Tell readCoNLL to expose the column with id tokenColId under the key 'tokens'.
columns = {tokenColId: 'tokens'}

# Destination handed to storeCache at the end; the file name is derived from
# the dataset name.
picklePath = "embeddings/elmo_cache_" + datasetName + ".pkl"

# NOTE(review): the meaning of the leading None argument is not visible
# here -- confirm against the ELMoWordEmbeddings constructor.
embLookup = ELMoWordEmbeddings(None,
                               elmo_options_file,
                               elmo_weight_file,
                               elmo_cuda_device=cudaDevice)

print("ELMo Cache Generation")
print("Output file:", picklePath)
print("CUDA Device:", cudaDevice)

# Every split is added to the same embLookup before it is stored once.
splitFiles = ['train.txt', 'dev.txt', 'test.txt']

for splitFile in splitFiles:
    # Input files are looked up under data/<datasetName>/<splitFile>.
    inputPath = os.path.join('data', datasetName, splitFile)

    print("Adding file to cache: " + inputPath)
    sentences = readCoNLL(inputPath, columns, commentSymbol)
    # Keep only the 'tokens' entry of each sentence.
    tokens = [sentence['tokens'] for sentence in sentences]

    # Time only the addToCache call, not the file reading above.
    start_time = time.time()
    embLookup.addToCache(tokens)
    end_time = time.time()
    print("%s processed in %.1f seconds" % (splitFile, end_time - start_time))
    print("\n---\n")

# Write the cache to picklePath after all splits have been processed.
print("Store file at:", picklePath)
embLookup.storeCache(picklePath)