Example #1
0
    def fit(self, text_data_model, text_label_pairs, model_dir_path, batch_size=None, epochs=None,
            test_size=None, random_state=None):
        """Train the network on (text, label) pairs and persist all artifacts.

        Saves the text-data config, the model architecture (JSON), the trained
        weights and the training history under model_dir_path.

        Args:
            text_data_model: dict with 'idx2word', 'word2idx', 'max_len',
                'vocab_size' and 'labels' (as produced by fit_text).
            text_label_pairs: iterable of (text, label) tuples.
            model_dir_path: directory for config/architecture/weight files.
            batch_size: training batch size (default 64).
            epochs: number of training epochs (default 20).
            test_size: validation fraction for the train/test split (default 0.3).
            random_state: seed for the split (default 42).

        Returns:
            The Keras History object from model.fit().
        """
        if batch_size is None:
            batch_size = 64
        if epochs is None:
            epochs = 20
        if test_size is None:
            test_size = 0.3
        if random_state is None:
            random_state = 42

        self.config = text_data_model
        self.idx2word = self.config['idx2word']
        self.word2idx = self.config['word2idx']
        self.max_len = self.config['max_len']
        self.vocab_size = self.config['vocab_size']
        self.labels = self.config['labels']

        np.save(self.get_config_file_path(model_dir_path), self.config)

        self.create_model()
        # 'with' guarantees the handle is closed; the original
        # open(...).write(...) leaked it. Local renamed from 'json' to avoid
        # shadowing the stdlib module.
        architecture_json = self.model.to_json()
        with open(self.get_architecture_file_path(model_dir_path), 'w') as arch_file:
            arch_file.write(architecture_json)

        xs = []
        ys = []
        for text, label in text_label_pairs:
            tokens = [x.lower() for x in word_tokenize(text)]
            # Unknown words map to index 0 (reserved for OOV/padding).
            xs.append([self.word2idx.get(w, 0) for w in tokens])
            ys.append(self.labels[label])

        X = pad_sequences(xs, maxlen=self.max_len)
        Y = np_utils.to_categorical(ys, len(self.labels))

        x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=random_state)
        print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

        weight_file_path = self.get_weight_file_path(model_dir_path)

        checkpoint = ModelCheckpoint(weight_file_path)

        # validation_data must be a tuple, not a list, on recent Keras.
        history = self.model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs,
                                 validation_data=(x_test, y_test), callbacks=[checkpoint],
                                 verbose=1)

        self.model.save_weights(weight_file_path)

        np.save(model_dir_path + '/' + WordVecCnnLstm.model_name + '-history.npy', history.history)

        score = self.model.evaluate(x=x_test, y=y_test, batch_size=batch_size, verbose=1)
        print('score: ', score[0])
        print('accuracy: ', score[1])

        return history
Example #2
0
    def fit(self, text_data_model, text_label_pairs, model_dir_path, batch_size=None, epochs=None,
            test_size=None, random_state=None):
        """Train the feed-forward net on summed GloVe embeddings and persist artifacts.

        Each text is encoded as the element-wise sum of the GloVe vectors of
        its (lower-cased) tokens; the config, architecture (JSON), weights and
        training history are saved under model_dir_path.

        Args:
            text_data_model: dict with 'idx2word', 'word2idx', 'max_len',
                'vocab_size' and 'labels'.
            text_label_pairs: sequence of (text, label) tuples (len() is used).
            model_dir_path: directory for config/architecture/weight files.
            batch_size: training batch size (default 64).
            epochs: number of training epochs (default 20).
            test_size: validation fraction for the train/test split (default 0.3).
            random_state: seed for the split (default 42).

        Returns:
            The Keras History object from model.fit().
        """
        if batch_size is None:
            batch_size = 64
        if epochs is None:
            epochs = 20
        if test_size is None:
            test_size = 0.3
        if random_state is None:
            random_state = 42

        self.config = text_data_model
        self.idx2word = self.config['idx2word']
        self.word2idx = self.config['word2idx']
        self.max_len = self.config['max_len']
        self.vocab_size = self.config['vocab_size']
        self.labels = self.config['labels']

        np.save(self.get_config_file_path(model_dir_path), self.config)

        self.create_model()
        # 'with' closes the handle; the original open(...).write(...) leaked
        # it. Local renamed from 'json' to avoid shadowing the stdlib module.
        architecture_json = self.model.to_json()
        with open(self.get_architecture_file_path(model_dir_path), 'w') as arch_file:
            arch_file.write(architecture_json)

        ys = []
        X = np.zeros(shape=(len(text_label_pairs), self.glove_model.embedding_dim))
        for i, (text, label) in enumerate(text_label_pairs):
            words = [w.lower() for w in word_tokenize(text)]
            E = np.zeros(shape=(self.glove_model.embedding_dim, self.max_len))
            # Only the first max_len tokens fit into E. The original iterated
            # over every token, so sentences longer than max_len raised
            # IndexError (not caught by the KeyError handler below).
            for j in range(min(len(words), self.max_len)):
                try:
                    E[:, j] = self.glove_model.encode_word(words[j])
                except KeyError:
                    pass  # out-of-vocabulary word contributes zeros
            X[i, :] = np.sum(E, axis=1)
            ys.append(self.labels[label])
        Y = np_utils.to_categorical(ys, len(self.labels))

        x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=test_size, random_state=random_state)
        print(x_train.shape, x_test.shape, y_train.shape, y_test.shape)

        weight_file_path = self.get_weight_file_path(model_dir_path)

        checkpoint = ModelCheckpoint(weight_file_path)

        # validation_data must be a tuple, not a list, on recent Keras.
        history = self.model.fit(x=x_train, y=y_train, batch_size=batch_size, epochs=epochs,
                                 validation_data=(x_test, y_test), callbacks=[checkpoint],
                                 verbose=1)

        self.model.save_weights(weight_file_path)

        np.save(model_dir_path + '/' + WordVecGloveFFN.model_name + '-history.npy', history.history)

        score = self.model.evaluate(x=x_test, y=y_test, batch_size=batch_size, verbose=1)
        print('score: ', score[0])
        print('accuracy: ', score[1])

        return history
Example #3
0
 def predict(self, sentence):
     """Return the class-probability vector for a single sentence.

     Tokens are lower-cased and mapped through word2idx; tokens not in the
     vocabulary get the sentinel index len(word2idx).
     """
     tokens = [t.lower() for t in word_tokenize(sentence)]
     unknown_id = len(self.word2idx)
     word_ids = [self.word2idx.get(token, unknown_id) for token in tokens]
     padded = pad_sequences([word_ids], self.max_len)
     return self.model.predict(padded)[0]
Example #4
0
    def fit(self, text_data_model, text_label_pairs, model_dir_path,
            test_size=None, random_state=None,
            epochs=None, batch_size=None):
        """Train the multi-channel CNN and persist config/architecture/weights.

        The same padded word-id matrix is fed to all three input channels.

        Args:
            text_data_model: dict with 'idx2word', 'word2idx', 'max_len',
                'vocab_size' and 'labels'.
            text_label_pairs: iterable of (text, label) tuples.
            model_dir_path: directory for config/architecture/weight files.
            test_size: validation split fraction (default 0.3).
            random_state: kept for interface symmetry with the other models;
                validation_split does not use it (default 42).
            epochs: number of training epochs (default 10).
            batch_size: training batch size (default 16).

        Returns:
            The Keras History object from model.fit().
        """
        if epochs is None:
            epochs = 10
        if batch_size is None:
            batch_size = 16
        if test_size is None:
            test_size = 0.3
        if random_state is None:
            random_state = 42

        self.config = text_data_model
        self.idx2word = self.config['idx2word']
        self.word2idx = self.config['word2idx']
        self.max_len = self.config['max_len']
        self.vocab_size = self.config['vocab_size']
        self.labels = self.config['labels']

        verbose = 1

        config_file_path = WordVecMultiChannelCnn.get_config_file_path(model_dir_path)
        np.save(config_file_path, text_data_model)

        max_input_tokens = len(self.word2idx)
        self.model = self.define_model(self.max_len, max_input_tokens)
        # 'with' closes the handle; the original open(...).write(...) leaked it.
        with open(self.get_architecture_file_path(model_dir_path), 'wt') as arch_file:
            arch_file.write(self.model.to_json())

        xs = []
        ys = []
        for text, label in text_label_pairs:
            tokens = [x.lower() for x in word_tokenize(text)]
            # Unknown words map to index 0 (reserved for OOV/padding).
            xs.append([self.word2idx.get(w, 0) for w in tokens])
            ys.append(self.labels[label])

        X = pad_sequences(xs, maxlen=self.max_len)
        Y = np_utils.to_categorical(ys, len(self.labels))

        weight_file_path = WordVecMultiChannelCnn.get_weight_file_path(model_dir_path)
        checkpoint = ModelCheckpoint(weight_file_path)

        history = self.model.fit([X, X, X], Y, epochs=epochs, batch_size=batch_size,
                                 validation_split=test_size,
                                 verbose=verbose, callbacks=[checkpoint])
        # save the model
        self.model.save(weight_file_path)

        np.save(model_dir_path + '/' + WordVecMultiChannelCnn.model_name + '-history.npy', history.history)

        return history
    def encode_docs(self, docs, max_allowed_doc_length=None):
        """Encode documents as summed word embeddings.

        Each document becomes the element-wise sum of the embeddings of its
        (lower-cased) tokens; unknown words contribute zeros.

        Args:
            docs: list of raw document strings.
            max_allowed_doc_length: cap on tokens considered per document
                (default 500).

        Returns:
            numpy array of shape (len(docs), embedding_dim).
        """
        if max_allowed_doc_length is None:
            max_allowed_doc_length = 500
        doc_count = len(docs)
        X = np.zeros(shape=(doc_count, self.embedding_dim))
        # Longest tokenized doc, capped. The original wrapped the token list
        # in another list, so len(...) was always 1 and max_len was wrong.
        max_len = 0
        for doc in docs:
            max_len = max(max_len, len(word_tokenize(doc)))
        max_len = min(max_len, max_allowed_doc_length)
        for i in range(doc_count):
            words = [w.lower() for w in word_tokenize(docs[i])]
            E = np.zeros(shape=(self.embedding_dim, max_len))
            # Clamp to this doc's length: the original indexed words[j] for
            # j up to max_len, raising IndexError on shorter documents
            # (the KeyError handler does not catch IndexError).
            for j in range(min(len(words), max_len)):
                try:
                    E[:, j] = self.word2em[words[j]]
                except KeyError:
                    pass  # out-of-vocabulary word contributes zeros
            X[i, :] = np.sum(E, axis=1)

        return X
Example #6
0
    def predict(self, sentence):
        """Return the class-probability vector for one sentence.

        The sentence is encoded as the sum of the GloVe vectors of its first
        max_len (lower-cased) tokens; unknown words contribute zeros.
        """
        tokens = [w.lower() for w in word_tokenize(sentence)]

        X = np.zeros(shape=(1, self.glove_model.embedding_dim))
        E = np.zeros(shape=(self.glove_model.embedding_dim, self.max_len))
        # E has only max_len columns: the original iterated over every token,
        # so sentences longer than max_len raised IndexError (not caught by
        # the KeyError handler below).
        for j in range(min(len(tokens), self.max_len)):
            try:
                E[:, j] = self.glove_model.encode_word(tokens[j])
            except KeyError:
                pass  # out-of-vocabulary word contributes zeros
        X[0, :] = np.sum(E, axis=1)
        output = self.model.predict(X)
        return output[0]
Example #7
0
    def parse(self, texts, print_line=False):
        """Extract resume fields (name, email, education, ...) from each
        sufficiently long line of text and store them on this instance.
        """
        self.raw = texts
        for line in texts:
            if len(line) <= 10:
                continue
            tokens = word_tokenize(line.lower())
            found_any = False

            # (attribute, extracted value) in the original extraction order.
            # 'expertise' accumulates into a list; the rest overwrite.
            extracted = [
                ('name', extract_name(tokens, line)),
                ('email', extract_email(tokens, line)),
                ('sex', extract_sex(tokens, line)),
                ('ethnicity', extract_ethnicity(tokens, line)),
                ('education', extract_education(tokens, line)),
                ('experience', extract_experience(tokens, line)),
                ('objective', extract_objective(tokens, line)),
                ('expertise', extract_expertise(tokens, line)),
                ('mobile', extract_mobile(tokens, line)),
            ]
            for attr, value in extracted:
                if value is None:
                    continue
                if attr == 'expertise':
                    self.expertise.append(value)
                else:
                    setattr(self, attr, value)
                found_any = True

            # Only flip the unknown flag when something was recognized.
            if found_any:
                self.unknown = False

            if print_line:
                print('parsed: ', line)
    def encode_doc(self, doc, max_allowed_doc_length=None):
        """Return the sum of word embeddings for a single document.

        At most max_allowed_doc_length (default 500) lower-cased tokens are
        considered; words missing from word2em contribute zeros.
        """
        limit = 500 if max_allowed_doc_length is None else max_allowed_doc_length
        tokens = [t.lower() for t in word_tokenize(doc)]
        usable = min(len(tokens), limit)
        E = np.zeros(shape=(self.embedding_dim, usable))
        X = np.zeros(shape=(self.embedding_dim, ))
        for col, token in enumerate(tokens[:usable]):
            try:
                E[:, col] = self.word2em[token]
            except KeyError:
                pass  # unknown word: leave its column as zeros
        X[:] = np.sum(E, axis=1)
        return X
Example #9
0
def fit_text(data_dir_path, max_vocab_size=None, label_type=None):
    """Build a text-data model (vocabulary, label map, max length) from .txt files.

    Every line of each *.txt file in data_dir_path is expected to be
    'line_type<TAB>line_label<TAB>sentence'; malformed lines are skipped.

    Args:
        data_dir_path: directory containing tab-separated .txt files.
        max_vocab_size: keep at most this many most-common words (default 5000).
        label_type: 'line_label' to label by the second column; any other
            value (default 'line_type') labels by the first column.

    Returns:
        dict with keys 'word2idx', 'idx2word', 'vocab_size', 'max_len',
        'labels'.
    """
    if label_type is None:
        label_type = 'line_type'
    if max_vocab_size is None:
        max_vocab_size = 5000
    counter = collections.Counter()
    max_len = 0
    labels = dict()
    for f in os.listdir(data_dir_path):
        data_file_path = os.path.join(data_dir_path, f)
        if os.path.isfile(data_file_path) and f.lower().endswith('.txt'):
            # 'with' guarantees the file is closed even if parsing raises.
            with open(data_file_path, mode='rt', encoding='utf8') as file:
                for line in file:
                    res = line.strip().split('\t')
                    if len(res) == 3:
                        line_type, line_label, sentence = res
                        tokens = [x.lower() for x in word_tokenize(sentence)]
                        for token in tokens:
                            counter[token] += 1
                        max_len = max(max_len, len(tokens))
                        label = line_label if label_type == 'line_label' else line_type
                        if label not in labels:
                            labels[label] = len(labels)

    # Index 0 is reserved for unknown/padding words (consumers map OOV words
    # to 0), so known words start at 1. The original assigned index 0 to the
    # most common word, colliding with the unknown id; vocab_size already
    # accounts for the reserved slot via the +1 below.
    word2idx = collections.defaultdict(int)
    for idx, (word, _count) in enumerate(counter.most_common(max_vocab_size)):
        word2idx[word] = idx + 1
    idx2word = {v: k for k, v in word2idx.items()}
    vocab_size = len(word2idx) + 1

    model = dict()

    model['word2idx'] = word2idx
    model['idx2word'] = idx2word
    model['vocab_size'] = vocab_size
    model['max_len'] = max_len
    model['labels'] = labels

    return model
Example #10
0
    def parse(self, texts, print_line=False):
        """Classify each line of text and extract resume fields onto this
        instance, appending lines predicted as 'meta' or 'header'.
        """
        self.raw = texts
        for line in texts:
            if len(line) <= 10:
                continue
            tokens = word_tokenize(line.lower())
            line_label = self.line_label_classifier.predict_class(
                sentence=line)
            line_type = self.line_type_classifier.predict_class(sentence=line)
            found_any = False

            # Extraction in the original order; scalar fields overwrite the
            # attribute, list fields accumulate via append.
            scalar_fields = []
            list_fields = []
            scalar_fields.append(('name', extract_name(tokens, line)))
            scalar_fields.append(('email', extract_email(tokens, line)))
            scalar_fields.append(('sex', extract_sex(tokens, line)))
            scalar_fields.append(('ethnicity', extract_ethnicity(tokens, line)))
            list_fields.append(('education', self.extract_education(line_label, line)))
            list_fields.append(('project', self.extract_project(line_label, line)))
            list_fields.append(('experience', self.extract_experience(line_label, line)))
            scalar_fields.append(('objective', extract_objective(tokens, line)))
            list_fields.append(('knowledge', self.extract_knowledge(line_label, line)))
            scalar_fields.append(('mobile', extract_mobile(tokens, line)))

            for attr, value in scalar_fields:
                if value is not None:
                    setattr(self, attr, value)
                    found_any = True
            for attr, value in list_fields:
                if value is not None:
                    getattr(self, attr).append(value)
                    found_any = True

            # 'meta' lines count as recognized; 'header' lines are collected
            # but do not flip the unknown flag (matches original behavior).
            if line_type == 'meta':
                self.meta.append(line)
                found_any = True
            if line_type == 'header':
                self.header.append(line)

            if found_any:
                self.unknown = False

            if print_line:
                print('parsed: ', line)
    def parse(self, texts, print_line=False):
        """Preprocess, classify, and extract resume fields from each line.

        Stores extracted fields on this instance and returns a dict of
        parallel lists 'line', 'type', 'label' recording the preprocessed
        line and its predicted (or header-overridden) type and label.
        """
        self.raw = texts
        # NOTE(review): n_jobs=-0 evaluates to 0; if "use all cores" was
        # intended this should likely be -1 -- confirm TextPreprocessor's API.
        proc = TextPreprocessor(n_jobs=-0)
        predictions = {'line': [], 'type': [], 'label':[]}
        for p in texts:
            if len(p) > 10:
                # Tokenize the raw line first; keep a lower-cased copy of the
                # original because p is overwritten with preprocessed text.
                s = word_tokenize(p)
                original_line = deepcopy(p).lower()
                p = proc._preprocess_text(p)
                # Classify the *preprocessed* line.
                line_label = self.line_label_classifier.predict_class(sentence=p)
                line_type = self.line_type_classifier.predict_class(sentence=p)
                predictions['line'].append(p)
                unknown = True
                # Find if the line belongs to header
                # Personal-detail extractors run on the original lower-cased
                # line; label-driven extractors use the preprocessed text.
                name = extract_name(s, original_line)
                email = extract_email(s, original_line)
                sex = extract_sex(s, original_line)
                race = extract_ethnicity(s, original_line)
                education = self.extract_education(line_label, p)
                project = self.extract_project(line_label, p)
                experience = self.extract_experience(line_label, p)
                objective = extract_objective(s, p)
                knowledge = self.extract_knowledge(line_label, original_line)
                mobile = extract_mobile(s, original_line)
                # Any personal detail overrides the classifier's prediction.
                if mobile or name or email or sex or race:
                    predictions['type'].append('header')
                    predictions['label'].append('personal')
                else:
                    predictions['type'].append(line_type)
                    predictions['label'].append(line_label)
                # Scalar fields overwrite; list fields accumulate via append.
                if name is not None:
                    self.name = name
                    unknown = False
                if email is not None:
                    self.email = email
                    unknown = False
                if sex is not None:
                    self.sex = sex
                    unknown = False
                if race is not None:
                    self.ethnicity = race
                    unknown = False
                if education is not None:
                    self.education.append(education)
                    unknown = False
                if knowledge is not None:
                    self.knowledge.append(knowledge)
                    unknown = False
                if project is not None:
                    self.project.append(project)
                    unknown = False
                if objective is not None:
                    self.objective = objective
                    unknown = False
                if experience is not None:
                    self.experience.append(experience)
                    unknown = False
                if mobile is not None:
                    self.mobile = mobile
                    unknown = False

                # 'meta' lines count as recognized; 'header' lines are
                # collected but do not flip the unknown flag.
                if line_type == 'meta':
                    self.meta.append(p)
                    unknown = False
                if line_type == 'header':
                    self.header.append(p)

                if unknown is False:
                    self.unknown = unknown
        return predictions