Example #1
def predict(files, model_path, output_dir, format):

    # Must specify output format
    if format not in ['i2b2']:
        sys.stderr.write('\n\tError: Must specify output format\n')
        sys.stderr.write('\tAvailable formats: i2b2\n')
        sys.stderr.write('\n')
        exit(1)

    # Load model
    with open(model_path, 'rb') as f:
        model = pickle.load(f)

    # Tell user if not predicting
    if not files:
        print >> sys.stderr, "\n\tNote: You did not supply any input files\n"
        exit()

    n = len(files)
    for i, txt in enumerate(sorted(files)):

        note = Document(txt)

        # Output file

        fname = os.path.splitext(os.path.basename(txt))[0] + '.con'
        out_path = os.path.join(output_dir, fname)

        # Skip notes that already have a prediction file
        if os.path.exists(out_path):
            continue

        print('-' * 30)
        print('\n\t%d of %d' % (i + 1, n))
        print('\t%s\n' % txt)

        # Predict concept labels
        labels = model.predict_classes_from_document(note)

        # Get predictions in proper format
        output = note.write(labels)
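        # i2b2 concept lines produced here look roughly like (illustrative values):
        #   c="chest pain" 5:2 5:3||t="problem"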

        # Output the concept predictions
        print('\n\nwriting to: %s' % out_path)
        with open(out_path, 'w') as f:
            f.write('%s\n' % output)
        print()
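
A minimal driver for the predict() above, as a sketch; the directory layout, model file name, and use of glob are assumptions, not part of the original example:

import glob

# Hypothetical invocation: batch-annotate every .txt note in a folder.
input_files = glob.glob('data/notes/*.txt')      # assumed layout
predict(files=input_files,
        model_path='models/cliner_crf.model',    # assumed model file
        output_dir='data/predictions',
        format='i2b2')                           # only supported format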
Example #2
def train(training_list, model_path, format, use_lstm, logfile=None):

    # Read the data into a Document object
    docs = []
    for txt, con in training_list:
        doc_tmp = Document(txt, con)
        docs.append(doc_tmp)

    # Must have training data
    if not docs:
        print('Error: Cannot train on 0 files. Terminating train.')
        return 1

    # Create a Machine Learning model
    model = ClinerModel(use_lstm)

    # Train the model using the Documents' data
    model.train(docs)

    # Pickle dump
    print('\nserializing model to %s\n' % model_path)
    with open(model_path, "wb") as m_file:
        pickle.dump(model, m_file)
    model.log(logfile, model_file=model_path)
    model.log(sys.stdout, model_file=model_path)
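
A sketch of how this train() might be called; the (text, concept) path pairs and the log file location are hypothetical:

# Hypothetical invocation: each entry pairs a note with its .con annotations.
training_list = [
    ('data/train/note_01.txt', 'data/train/note_01.con'),
    ('data/train/note_02.txt', 'data/train/note_02.con'),
]
train(training_list,
      model_path='models/cliner_crf.model',  # where the pickled model is written
      format='i2b2',
      use_lstm=False,                        # non-LSTM code path
      logfile='models/train.log')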
Example #3
def train(training_list,
          model_path,
          format,
          use_lstm,
          logfile=None,
          val=[],
          test=[]):
    # Read the data into a Document object
    train_docs = []
    # training_list entries look like:
    #   ('data/examples/ex_doc.txt', 'data/examples/ex_doc.con')

    for txt, con in training_list:
        doc_tmp = Document(txt, con)
        train_docs.append(doc_tmp)

    val_docs = []
    for txt, con in val:
        doc_tmp = Document(txt, con)
        val_docs.append(doc_tmp)

    test_docs = []
    for txt, con in test:
        doc_tmp = Document(txt, con)
        test_docs.append(doc_tmp)

    # Must have training data
    if not train_docs:
        print('Error: Cannot train on 0 files. Terminating train.')
        return 1

    # Create a Machine Learning model
    model = ClinerModel(use_lstm)

    # Train the model using the Documents' data
    model.train(train_docs, val=val_docs, test=test_docs)

    # Pickle dump
    print('\nserializing model to %s\n' % model_path)
    with open(model_path, "wb") as m_file:
        pickle.dump(model, m_file)

    model.log(logfile, model_file=model_path)
    model.log(sys.stdout, model_file=model_path)
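
The val and test splits are optional; below is a hedged sketch of a call that exercises them, with every path invented for illustration:

# Hypothetical invocation with held-out validation and test pairs.
train(training_list=[('data/train/n1.txt', 'data/train/n1.con')],
      model_path='models/cliner_lstm.model',
      format='i2b2',
      use_lstm=True,
      logfile='models/train.log',
      val=[('data/val/n2.txt', 'data/val/n2.con')],
      test=[('data/test/n3.txt', 'data/test/n3.con')])

Example #4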
def predict(files, model_path, output_dir, format, use_lstm=True):

    # Must specify output format
    if format not in ['i2b2']:
        sys.stderr.write('\n\tError: Must specify output format\n')
        sys.stderr.write('\tAvailable formats: i2b2\n')
        sys.stderr.write('\n')
        exit(1)

    # Load model (encoding='latin1' lets Python 3 read a pickle written by Python 2)
    with open(model_path, 'rb') as f:
        model = pickle.load(f, encoding='latin1')

    if model._use_lstm:
        import helper_dataset as hd
        import DatasetCliner_experimental as Exp
        import entity_lstm as entity_model

        parameters = hd.load_parameters_from_file("LSTM_parameters.txt")
        parameters['use_pretrained_model'] = True

        # Restore the dataset and token embeddings the LSTM was trained with;
        # the model folder is expected to contain a 'dataset.pickle'
        pretrained_dataset_path = os.path.join(parameters['model_folder'], 'dataset.pickle')
        with open(pretrained_dataset_path, 'rb') as dataset_file:
            model._pretrained_dataset = pickle.load(dataset_file)
        model._pretrained_wordvector = hd.load_pretrained_token_embeddings(parameters)
        model._current_model = None
        '''
        Disabled experimental code: rebuild a deploy-time dataset from the
        input files and attach a fresh EntityLSTM to the loaded model.

        updating_notes = []
        for i, txt in enumerate(sorted(files)):
            note = Document(txt)
            tokenized_sents = note.getTokenizedSentences()
            updating_notes += tokenized_sents
        print(updating_notes)

        fictional_labels = copy.deepcopy(tokenized_sents)
        for idx, x in enumerate(fictional_labels):
            for val_id, value in enumerate(x):
                fictional_labels[idx][val_id] = 'O'

        Datasets_tokens = {'deploy': tokenized_sents}
        Datasets_labels = {'deploy': fictional_labels}

        dataset = Exp.Dataset()
        token_to_vector = dataset.load_dataset(
            Datasets_tokens, Datasets_labels, "", parameters,
            token_to_vector=model._pretrained_wordvector,
            pretrained_dataset=model._pretrained_dataset)
        parameters['Feature_vector_length'] = dataset.feature_vector_size
        parameters['use_features_before_final_lstm'] = False
        dataset.update_dataset("", ['deploy'], Datasets_tokens, Datasets_labels)
        model._pretrained_dataset = dataset

        model_LSTM = entity_model.EntityLSTM(dataset, parameters)
        model._current_model = model_LSTM
        '''
        print("END TEST")
        #exit()
        #model.parameters=None

    # Tell user if not predicting
    if not files:
        sys.stderr.write("\n\tNote: You did not supply any input files\n\n")
        exit()

    n = len(files)

    for i, txt in enumerate(sorted(files)):
        note = Document(txt)

        # Output file
        fname = os.path.splitext(os.path.basename(txt))[0] + '.con'
        out_path = os.path.join(output_dir, fname)

        # Warn, but do not skip, when a prediction file already exists
        if os.path.exists(out_path):
            print('\tWARNING: prediction file already exists (%s)' % out_path)

        sys.stdout.write('%s\n' % ('-' * 30))
        sys.stdout.write('\n\t%d of %d\n' % (i + 1, n))
        sys.stdout.write('\t%s\n\n' % txt)

        # Predict concept labels
        labels = model.predict_classes_from_document(note)

        # Get predictions in proper format
        output = note.write(labels)

        # Output the concept predictions
        sys.stdout.write('\n\nwriting to: %s\n' % out_path)
        with open(out_path, 'w') as f:
            f.write('%s\n' % output)
        sys.stdout.write('\n')
Example #5
def predict(files, model_path, output_dir, format, use_lstm=True):

    # Must specify output format
    if format not in ['i2b2']:
        sys.stderr.write('\n\tError: Must specify output format\n')
        sys.stderr.write('\tAvailable formats: i2b2\n')
        sys.stderr.write('\n')
        exit(1)

    # Load model (encoding='latin1' lets Python 3 read a pickle written by Python 2)
    with open(model_path, 'rb') as f:
        model = pickle.load(f, encoding='latin1')

    if model._use_lstm:
        import helper_dataset as hd
        import DatasetCliner_experimental as Exp
        import entity_lstm as entity_model

        parameters = hd.load_parameters_from_file("LSTM_parameters.txt")
        parameters['use_pretrained_model'] = True

        # Restore the dataset and token embeddings the LSTM was trained with;
        # the model folder is expected to contain a 'dataset.pickle'
        pretrained_dataset_path = os.path.join(parameters['model_folder'], 'dataset.pickle')
        with open(pretrained_dataset_path, 'rb') as dataset_file:
            model._pretrained_dataset = pickle.load(dataset_file)
        model._pretrained_wordvector = hd.load_pretrained_token_embeddings(parameters)
        model._current_model = None

        print("END TEST")
        #exit()
        #model.parameters=None

    # Tell user if not predicting
    if not files:
        sys.stderr.write("\n\tNote: You did not supply any input files\n\n")
        exit()

    n = len(files)

    for i, txt in enumerate(sorted(files)):
        note = Document(txt)
        # Output file
        fname = os.path.splitext(os.path.basename(txt))[0] + '.con'
        out_path = os.path.join(output_dir, fname)

        # Warning and skip are disabled in this variant; existing output is overwritten
        if os.path.exists(out_path):
            print()

        # Progress reporting disabled:
        #   sys.stdout.write('%s\n' % ('-' * 30))
        #   sys.stdout.write('\n\t%d of %d\n' % (i + 1, n))
        #   sys.stdout.write('\t%s\n\n' % txt)
        # Predict concept labels
        labels = model.predict_classes_from_document(note)

        # Get predictions in proper format
        output = note.write(labels)

        print("-----------OUTPUT----------\n")
        print(output)

        # Output the concept predictions
        sys.stdout.write('\n\nwriting to: %s\n' % out_path)
        with open(out_path, 'w') as f:
            f.write('%s\n' % output)
        sys.stdout.write('\n')
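
Taken together, train() and predict() form a simple pipeline. A hedged end-to-end sketch using the signatures from Examples #3 and #5, with all paths hypothetical; note that train() returns 1 on failure and None on success:

# Hypothetical end-to-end run: train a model, then annotate an unseen note.
pairs = [('data/train/note_01.txt', 'data/train/note_01.con')]
if train(pairs, 'models/cliner.model', 'i2b2', use_lstm=False) != 1:
    predict(files=['data/test/note_99.txt'],
            model_path='models/cliner.model',
            output_dir='data/predictions',
            format='i2b2',
            use_lstm=False)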