def load_train_data(args, train_dir, valid_prop=0.10):
    """load training data and write to IO formatted training and validation files"""
    vocab = set()
    tfile = codecs.open(join(args.work_dir, TRAIN_FILE_NAME), 'w', 'utf-8')
    vfile = codecs.open(join(args.work_dir, VALID_FILE_NAME), 'w', 'utf-8')
    txt_files = [f for f in listdir(train_dir) if f.endswith(".txt")]
    random.shuffle(txt_files)
    num_val_files = int(len(txt_files) * valid_prop)
    for findex, txt_file in enumerate(txt_files):
        print("Reading", txt_file)
        # the first num_val_files shuffled files go to the validation split,
        # the rest to the training split
        rfile = vfile if findex < num_val_files else tfile
        doc_tokens, file_vocab = tokenize_document(join(train_dir, txt_file))
        vocab = vocab.union(file_vocab)
        annotations = read_annotations(join(train_dir, txt_file[:-3] + "ann"))
        for token in doc_tokens:
            ignore_token = False
            for ann in annotations:
                if token.start >= ann.start and token.end <= ann.end:
                    # Change this for IOB annotations
                    if ann.atype == LOC_ANN_TAG:
                        token.encoding = "I-LOC"
                    if ann.atype == PRO_ANN_TAG:
                        # tokens inside PRO annotations are skipped entirely
                        ignore_token = True
                    break
            if not ignore_token:
                print(token.text + "\t" + token.encoding, file=rfile)
    tfile.close()
    vfile.close()
    return vocab
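A minimal sketch of how load_train_data might be driven, assuming an argparse-style args object exposing work_dir and a directory of paired .txt/.ann files; the flag names and default paths below are hypothetical:

import argparse

# Hypothetical driver; the --work_dir and --train_dir defaults are placeholders.
parser = argparse.ArgumentParser()
parser.add_argument("--work_dir", default="work")
parser.add_argument("--train_dir", default="data/train")
args = parser.parse_args()

vocab = load_train_data(args, args.train_dir, valid_prop=0.10)
print("Vocabulary size:", len(vocab))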
Example #2
def get_input_pmc(word_emb_model, input_file):
    '''load a document and build a context-window embedding instance for each token'''
    window_size = 5  # By default
    n_neighbors = int(window_size / 2)
    doc_tokens, _ = tokenize_document(input_file)
    # print("processing file: {} and neighbors = {}".format(input_file, n_neighbors))
    padding = "<s>"
    words = []
    for _ in range(n_neighbors):
        words.append(padding)
    for token in doc_tokens:
        words.append(token.text)
    for _ in range(n_neighbors):
        words.append(padding)
    instances = []
    for i in range(n_neighbors, len(words) - n_neighbors):
        # concatenate the embedding of the centre token with those of its
        # n_neighbors neighbours on each side
        context = []
        for j in range(-n_neighbors, n_neighbors + 1):
            context = np.append(context, word_emb_model[words[i + j]])
        instances.append(context)
    assert len(doc_tokens) == len(instances)
    return doc_tokens, instances
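To see the window construction in isolation, the sketch below exercises the same padding-and-concatenation logic with a toy dict-based embedding lookup; toy_model, its 4-dimensional vectors, and the sample words are assumptions standing in for the real word_emb_model:

import numpy as np

# Toy stand-in for word_emb_model: any mapping from token to a fixed-size vector
# works, as long as the "<s>" padding token has an entry.
toy_model = {w: np.random.rand(4) for w in ["<s>", "rain", "in", "spain"]}

n_neighbors = 2
words = ["<s>"] * n_neighbors + ["rain", "in", "spain"] + ["<s>"] * n_neighbors
for i in range(n_neighbors, len(words) - n_neighbors):
    context = np.concatenate([toy_model[words[i + j]]
                              for j in range(-n_neighbors, n_neighbors + 1)])
    # each instance concatenates 2 * n_neighbors + 1 embeddings
    assert context.shape == ((2 * n_neighbors + 1) * 4,)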
def load_test_data(args, test_dir):
    """load test data and write to IO formatted file"""
    vocab = set()
    tfile = codecs.open(join(args.work_dir, "test-io.txt"), 'w', 'utf-8')
    txt_files = [f for f in listdir(test_dir) if f.endswith(".txt")]
    for txt_file in txt_files:
        print("Reading", txt_file)
        doc_tokens, file_vocab = tokenize_document(join(test_dir, txt_file))
        vocab = vocab.union(file_vocab)
        annotations = read_annotations(join(test_dir, txt_file[:-3] + "ann"))
        for token in doc_tokens:
            ignore_token = False
            for ann in annotations:
                if token.start >= ann.start and token.end <= ann.end:
                    # Change this for IOB annotations
                    if ann.atype == LOC_ANN_TAG:
                        token.encoding = "I-LOC"
                    if ann.atype == PRO_ANN_TAG:
                        ignore_token = True
                    break
            if not ignore_token:
                print(token.text + "\t" + token.encoding, file=tfile)
    tfile.close()
    return vocab
Example #4
import nltk
import pickle
from utils import tokenize_document

with open('../assets/resume.txt', 'r') as resume_file:
    resume = resume_file.read()

tokenizer = nltk.RegexpTokenizer(r'\w+')
resume_tokenized = tokenize_document(resume, tokenizer)
print(resume_tokenized)
with open('../assets/resume_tokens.p', 'wb') as token_file:
    pickle.dump(resume_tokenized, token_file)
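The utils.tokenize_document imported in this example is not shown (and differs from the file-based variant used in the earlier examples). A minimal sketch of a compatible implementation, assuming it takes (text, tokenizer, remove_stopwords=...) and returns lower-cased tokens with English stopwords dropped by default; the body below is an assumption, not the project's code:

# Assumed shape of utils.tokenize_document for this example only; requires
# nltk.download('stopwords') to have been run.
from nltk.corpus import stopwords

def tokenize_document(text, tokenizer, remove_stopwords=True):
    tokens = [t.lower() for t in tokenizer.tokenize(text)]
    if remove_stopwords:
        stop_set = set(stopwords.words('english'))
        tokens = [t for t in tokens if t not in stop_set]
    return tokens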

# Text-generation driver (excerpt): `config`, `u` (utilities), `cm` (model factory),
# `np`, `token_to_int`, `data_path`, `sequence_length`, and `token_count` are
# assumed to be defined earlier in the full script.
tokens_to_generate = config['predict']['tokens_to_generate']
model_path = str(u.get_model_path())

# Load the model weights ---------------------------------------------------------
model = cm.create_model()
model.load_weights(model_path)

# Setup `dict` to un-vectorize ---------------------------------------------------
int_to_token = {value: key for key, value in token_to_int.items()}

# Seed the prediction process ----------------------------------------------------
# pin our seed for reproducibility
np.random.seed(0)
for file in data_path.iterdir():
    if file.suffix == '.txt':
        tokens = u.tokenize_document(file)
        vector = u.vectorize(tokens, token_to_int)
        indx = np.random.randint(len(tokens) - sequence_length)
        seed = [int(x) for x in vector[indx:(indx + sequence_length)]]
        break
print(f'Seed: {u.vec_to_str(seed, int_to_token)}')

# Generation  --------------------------------------------------------------------
for diversity in [1, 1.33, 1.66, 2]:
    np.random.seed(0)
    curr = seed
    generated_tokens = []
    for _ in range(tokens_to_generate):
        one_hot_x = u.one_hot_single(curr, token_count)
        one_hot_y = model.predict(one_hot_x)
        y = u.un_one_hot(one_hot_y, diversity)
        # (presumably) record the sampled token and slide the context window
        # forward by one position
        generated_tokens.append(y)
        curr = curr[1:] + [y]
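u.un_one_hot is not shown; the diversity argument usually plays the role of a sampling temperature. A minimal sketch of such a sampler under that assumption (the function name and the 1e-8 floor are mine, not the project's):

import numpy as np

def sample_with_temperature(probs, temperature=1.0):
    # Re-weight a probability distribution and draw one index from it; higher
    # temperature flattens the distribution and produces more varied output.
    probs = np.asarray(probs, dtype=np.float64).ravel()
    logits = np.log(probs + 1e-8) / temperature
    weights = np.exp(logits - np.max(logits))
    weights /= weights.sum()
    return int(np.random.choice(len(weights), p=weights))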

# Token statistics for the tokenized resume, with and without stopwords.
import pickle
from utils import tokenize_document, tokenizer
from collections import Counter

with open('../assets/resume.txt', 'r') as resume_file:
    resume_text = resume_file.read()
tokens_with_stopwords = tokenize_document(resume_text, tokenizer,
                                          remove_stopwords=False)
print('Number of Tokens: (with stopwords)', len(tokens_with_stopwords))
print('Number of Unique tokens: (with stopwords)',
      len(set(tokens_with_stopwords)), '\n')

with open('../assets/resume_tokens.p', 'rb') as token_file:
    tokens = pickle.load(token_file)
print('Number of Tokens: (without stopwords)', len(tokens))
tokens_set = set(tokens)
print('Number of Unique tokens: (without stopwords)', len(tokens_set))

print('\nPercentage reduction in unique tokens after removing stopwords:',
      (len(set(tokens_with_stopwords)) - len(tokens_set)) /
      len(set(tokens_with_stopwords)) * 100)

frequencies = Counter(tokens)
print('\nThe 10 most common tokens and their frequencies (stopwords removed):\n',
      frequencies.most_common(10))