Example #1
import logging
import os
import sys
import time

# Project-local helpers; the module paths below are assumed from the usual
# ELMo BiLSTM-CNN-CRF project layout and may differ in your checkout.
from neuralnets.ELMoWordEmbeddings import ELMoWordEmbeddings
from util.preprocessing import readCoNLL


def create_cache(args):
    datasetName = args.datasetName
    tokenColId = args.tokenColumnId
    cudaDevice = args.cuda_device
    elmo_options_file = args.elmo_options
    elmo_weight_file = args.elmo_weights

    #elmo_options_file= 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_options.json'
    #elmo_weight_file = 'pretrained/elmo_2x4096_512_2048cnn_2xhighway_5.5B_weights.hdf5'
    #elmo_options_file= 'pretrained/velmo_options.json'
    #elmo_weight_file = 'pretrained/velmo_weights.hdf5'

    # :: Logging level ::
    loggingLevel = logging.INFO
    logger = logging.getLogger()
    logger.setLevel(loggingLevel)

    ch = logging.StreamHandler(sys.stdout)
    ch.setLevel(loggingLevel)
    formatter = logging.Formatter('%(message)s')
    ch.setFormatter(formatter)
    logger.addHandler(ch)

    commentSymbol = None
    columns = {tokenColId: 'tokens'}

    #picklePath = "embeddings/elmo_cache_" + datasetName + ".pkl"
    #picklePath = "embeddings/velmo_cache_conll2000_data_perturbed_03.pkl"
    #picklePath = "embeddings/velmo_cache_conll2000_data_clean.pkl"
    picklePath = args.pkl_path
    # Embedding lookup that will compute and cache the ELMo vectors
    embLookup = ELMoWordEmbeddings(None,
                                   elmo_options_file,
                                   elmo_weight_file,
                                   elmo_cuda_device=cudaDevice)

    print("ELMo Cache Generation")
    print("Output file:", picklePath)
    print("CUDA Device:", cudaDevice)

    # Pre-compute embeddings for every data split and add them to the cache
    splitFiles = ['train.txt', 'dev.txt', 'test.txt']

    for splitFile in splitFiles:
        inputPath = os.path.join('data', datasetName, splitFile)

        print("Adding file to cache: " + inputPath)
        sentences = readCoNLL(inputPath, columns, commentSymbol)
        # Keep only the token sequence of each sentence
        tokens = [sentence['tokens'] for sentence in sentences]

        start_time = time.time()
        embLookup.addToCache(tokens)
        end_time = time.time()
        print("%s processed in %.1f seconds" %
              (splitFile, end_time - start_time))
        print("\n---\n")

    print("Store file at:", picklePath)
    embLookup.storeCache(picklePath)
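
create_cache only assumes an args object whose attributes match the names read at the top of the function (datasetName, tokenColumnId, cuda_device, elmo_options, elmo_weights, pkl_path). A minimal sketch of command-line wiring with argparse follows; the flag names and defaults are illustrative, not the original script's interface:

import argparse

if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='Pre-compute an ELMo embedding cache')
    parser.add_argument('--datasetName', required=True)          # sub-directory under data/
    parser.add_argument('--tokenColumnId', type=int, default=0)  # CoNLL column holding the tokens
    parser.add_argument('--cuda_device', type=int, default=-1)   # -1 runs ELMo on the CPU
    parser.add_argument('--elmo_options', required=True)         # ELMo options .json file
    parser.add_argument('--elmo_weights', required=True)         # ELMo weights .hdf5 file
    parser.add_argument('--pkl_path', required=True)             # output .pkl cache file
    create_cache(parser.parse_args())
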
Example #2
import os
import sys
import time

# Project-local helpers; the module paths below are assumed from the usual
# ELMo BiLSTM-CNN-CRF project layout and may differ in your checkout.
from neuralnets.ELMoWordEmbeddings import ELMoWordEmbeddings
from util.preprocessing import readCoNLL

# datasetName, tokenColId, cudaDevice, commentSymbol, elmo_options_file and
# elmo_weight_file are expected to be defined earlier, e.g. parsed from the
# command line as in Example #1.
columns = {tokenColId: 'tokens'}

picklePath = "embeddings/elmo_cache_" + datasetName + ".pkl"

embLookup = ELMoWordEmbeddings(None,
                               elmo_options_file,
                               elmo_weight_file,
                               elmo_cuda_device=cudaDevice)

print("ELMo Cache Generation")
print("Output file:", picklePath)
print("CUDA Device:", cudaDevice)

splitFiles = ['train.txt', 'dev.txt', 'test.txt']

for splitFile in splitFiles:
    inputPath = os.path.join('data', datasetName, splitFile)

    print("Adding file to cache: " + inputPath)
    sentences = readCoNLL(inputPath, columns, commentSymbol)
    tokens = [sentence['tokens'] for sentence in sentences]

    start_time = time.time()
    embLookup.addToCache(tokens)
    end_time = time.time()
    print("%s processed in %.1f seconds" % (splitFile, end_time - start_time))
    print("\n---\n")

print("Store file at:", picklePath)
embLookup.storeCache(picklePath)
# Load the existing cache file, if any, before adding more sentences
if os.path.isfile(picklePath):
    embLookup.loadCache(picklePath)

print("ELMo Cache Generation")
print("Output file:", picklePath)
print("CUDA Device:", cudaDevice)

splitFiles = ['train.txt', 'dev.txt', 'test.txt']
for splitFile in splitFiles:
    inputPath = os.path.join('data', datasetName, splitFile)

    print("Adding file to cache: " + inputPath)
    sentences = readCoNLL(inputPath, columns, commentSymbol)

    totalSentences = len(sentences)
    sentCnt = 0
    # Cache one sentence at a time while drawing a simple progress bar
    for sentence in sentences:
        embLookup.addToCache(sentence['tokens'])
        sentCnt += 1
        current = sentCnt
        percent = 100.0 * current / totalSentences
        line = '[{0}{1}]'.format('=' * int(percent / 2),
                                 ' ' * (50 - int(percent / 2)))
        status = '\r{0:3.0f}%{1} {2:3d}/{3:3d} Sentences'
        sys.stdout.write(status.format(percent, line, current, totalSentences))
        sys.stdout.flush()  # force the carriage-return progress bar to refresh immediately

    print("\n\n---\n")

print("Store file at:", picklePath)
embLookup.storeCache(picklePath)
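
The inline progress display above can be factored into a small reusable helper. A minimal sketch; the name print_progress is ours and not part of the original script:

import sys

def print_progress(current, total, width=50):
    # Same carriage-return progress bar as in the loop above
    percent = 100.0 * current / total
    filled = int(percent * width / 100)
    bar = '[{0}{1}]'.format('=' * filled, ' ' * (width - filled))
    sys.stdout.write('\r{0:3.0f}%{1} {2:3d}/{3:3d} Sentences'.format(
        percent, bar, current, total))
    sys.stdout.flush()

With this helper, the body of the sentence loop reduces to embLookup.addToCache(sentence['tokens']) followed by print_progress(sentCnt, totalSentences).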