def __iter__(self):
    files = {filename[:-4] for filename in os.listdir(self.dirname)}
    for doc_id, fname in enumerate(files):
        d = Document(doc_id, os.path.join(self.dirname, fname + '.txt'))
        for sentence in d.read_sentences():
            yield sentence
def __iter__(self):
    files = {filename[:-4] for filename in os.listdir(self.dirname)}
    for doc_id, fname in enumerate(files):
        # I added this line to watch file-reading progress; otherwise the wait is agonising
        print(str(doc_id) + '||||' + fname)
        d = Document(doc_id, os.path.join(self.dirname, fname + '.txt'))
        for sentence in d.read_sentences():
            yield sentence
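# Usage sketch (my addition, not part of the original module): an object with an
# __iter__ like the ones above can be passed straight to gensim's Word2Vec, which
# re-iterates the corpus once per epoch instead of holding it in memory. The
# TxtCorpus class below is a minimal stand-in for the Document-based iterator.
import os
from gensim.models import Word2Vec

class TxtCorpus:
    def __init__(self, dirname):
        self.dirname = dirname

    def __iter__(self):
        for fname in os.listdir(self.dirname):
            with open(os.path.join(self.dirname, fname)) as f:
                for line in f:
                    yield line.lower().split()  # one tokenised sentence per line

model = Word2Vec(sentences=TxtCorpus('/path/to/txt/dir'),
                 vector_size=100, window=5, min_count=5, workers=4)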
def build_x_and_y(data: DataList, **kwargs):
    """
    Given a list of labelled examples, build (X, y) data matrices
    :param data: list of dicts, each carrying a 'text' and a 'label' key
    :param kwargs: additional necessary data for matrix building e.g. scaler
    :return: a tuple (X, y)
    """
    label_indices = kwargs['label_indices']
    word2vec_model = kwargs['word2vec_model']
    scaler = kwargs['scaler']
    nn_model = kwargs['nn_model']
    regression = kwargs.get('regression', False)

    x_matrix = np.zeros(
        (len(data), SAMPLE_LENGTH, word2vec_model.vector_size))
    if regression:
        y_matrix = np.zeros((len(data), 1), dtype=np.float_)
    else:
        y_matrix = np.zeros((len(data), len(label_indices)), dtype=np.bool_)

    for doc_id, example in enumerate(data):
        doc = Document(example['text'])
        words = doc.get_all_words()[:SAMPLE_LENGTH]

        # embed the first SAMPLE_LENGTH words; out-of-vocabulary words stay zero
        for i, w in enumerate(words):
            if w in word2vec_model.wv:
                word_vector = word2vec_model.wv[w].reshape(1, -1)
                x_matrix[doc_id][i] = scaler.transform(word_vector, copy=True)[0]

        labels = [example['label']]
        for lab in labels:
            if regression:
                y_matrix[doc_id] = float(lab)
            else:
                index = label_indices[lab]
                y_matrix[doc_id][index] = True

    if nn_model and isinstance(nn_model.input, list):
        return [x_matrix] * len(nn_model.input), y_matrix
    else:
        return [x_matrix], y_matrix
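# A self-contained sketch of the label-encoding step above (all names and values
# here are illustrative): label_indices maps each label to a column, and every
# row of y gets a True in its label's column.
import numpy as np

data = [{'text': 'first doc', 'label': 'sports'},
        {'text': 'second doc', 'label': 'politics'}]
label_indices = {lab: i for i, lab in enumerate(sorted({d['label'] for d in data}))}

y = np.zeros((len(data), len(label_indices)), dtype=np.bool_)
for row, example in enumerate(data):
    y[row][label_indices[example['label']]] = True
# y is [[False, True], [True, False]] with columns ('politics', 'sports')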
def get_documents_from_mongo(ids, mongo_collection, as_generator=True, shuffle=False):
    """
    Extract documents from a MongoDB collection
    :param ids: list of _id values of the documents to fetch
    :param mongo_collection: collection whose documents carry a 'full_text' field
    :param as_generator: flag whether to return a document generator or a list
    :param shuffle: flag whether to return the documents in a shuffled vs sorted order
    :return: generator or a list of Document objects
    """
    print("get documents from mongo!")
    if shuffle:
        random.shuffle(ids)

    # query the ids in chunks so that no single $in clause grows too large
    docs_step = 500000
    steps_times = len(ids) // docs_step
    steps = [docs_step * i for i in range(steps_times + 1)] + [len(ids)]
    cursors = [
        mongo_collection.find({"_id": {"$in": ids[steps[i - 1]:steps[i]]}})
        for i in range(1, len(steps))
    ]
    all_docs = (x for c in cursors for x in c)
    generator = (Document(doc_id, None, text=d["full_text"])
                 for doc_id, d in enumerate(all_docs))
    return generator if as_generator else list(generator)
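# Self-contained sketch of the chunked-$in pattern above (the connection string
# and collection names are assumptions): fetching ids in fixed-size slices keeps
# every individual $in clause, and hence each query document, a manageable size.
from pymongo import MongoClient

def find_in_chunks(collection, ids, chunk_size=500000):
    for start in range(0, len(ids), chunk_size):
        chunk = ids[start:start + chunk_size]
        for doc in collection.find({"_id": {"$in": chunk}}):
            yield doc

client = MongoClient('mongodb://localhost:27017')
docs = find_in_chunks(client['papers']['full_texts'], ids=list(range(10)))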
def predict_from_text(self, text):
    """
    Predict labels for a given string of text
    :param text: string or unicode with the text
    :return: list of labels with corresponding confidence intervals
    """
    doc = Document(0, None, text=text)
    return self._predict(doc)
def build_x_and_y(filenames, file_directory, **kwargs):
    """
    Given file names and their directory, build (X, y) data matrices
    :param filenames: iterable of strings showing file ids (no extension)
    :param file_directory: path to a directory where those files lie
    :param kwargs: additional necessary data for matrix building e.g. scaler
    :return: a tuple (X, y)
    """
    label_indices = kwargs['label_indices']
    word2vec_model = kwargs['word2vec_model']
    scaler = kwargs['scaler']
    nn_model = kwargs['nn_model']

    x_matrix = np.zeros((len(filenames), SAMPLE_LENGTH, EMBEDDING_SIZE))
    y_matrix = np.zeros((len(filenames), len(label_indices)), dtype=np.bool_)

    for doc_id, fname in enumerate(filenames):
        doc = Document(doc_id, os.path.join(file_directory, fname + '.txt'))
        words = doc.get_all_words()[:SAMPLE_LENGTH]

        for i, w in enumerate(words):
            if w in word2vec_model:
                word_vector = word2vec_model[w].reshape(1, -1)
                x_matrix[doc_id][i] = scaler.transform(word_vector, copy=True)[0]

        labels = get_answers_for_doc(
            fname + '.txt',
            file_directory,
            filtered_by=set(label_indices.keys()),
        )
        for lab in labels:
            index = label_indices[lab]
            y_matrix[doc_id][index] = True

    if nn_model and isinstance(nn_model.input, list):
        return_data = [x_matrix] * len(nn_model.input), y_matrix
    else:
        return_data = [x_matrix], y_matrix

    if isinstance(nn_model, Graph):
        return {'input': return_data[0], 'output': return_data[1]}
    else:
        return return_data
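# Self-contained sketch of the embedding step shared by both build_x_and_y
# variants (model, scaler and sizes below are toy stand-ins): each of the first
# SAMPLE_LENGTH words is looked up in word2vec, scaled, and written into its row
# of the matrix; out-of-vocabulary words leave their row at zero.
import numpy as np
from gensim.models import Word2Vec
from sklearn.preprocessing import StandardScaler

w2v = Word2Vec([['the', 'cat', 'sat'], ['the', 'dog', 'ran']],
               vector_size=8, min_count=1)
scaler = StandardScaler().fit(w2v.wv.vectors)  # fit on all embeddings at once

SAMPLE_LENGTH = 5
words = ['the', 'cat', 'flew']  # 'flew' is out of vocabulary, so row 2 stays zero
x = np.zeros((SAMPLE_LENGTH, w2v.vector_size))
for i, w in enumerate(words[:SAMPLE_LENGTH]):
    if w in w2v.wv:
        x[i] = scaler.transform(w2v.wv[w].reshape(1, -1))[0]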
def predict_from_file(self, filepath):
    """
    Predict labels for a txt file
    :param filepath: path to the file
    :return: list of labels with corresponding confidence intervals
    """
    doc = Document(0, filepath)
    return self._predict(doc)
def _predict(self, doc: Document, return_float=False):
    """
    Predict labels for a given Document object
    :param doc: Document object
    :return: the most probable label, or a raw float for regression models
    """
    set_tf_growth()
    if isinstance(self.keras_model.input, list):
        _, sample_length, embedding_size = self.keras_model.input_shape[0]
    else:
        _, sample_length, embedding_size = self.keras_model.input_shape

    words = doc.get_all_words()[:sample_length]
    x_matrix = np.zeros((1, sample_length, embedding_size))

    for i, w in enumerate(words):
        if w in self.word2vec_model.wv:
            word_vector = self.word2vec_model.wv[w].reshape(1, -1)
            scaled_vector = self.scaler.transform(word_vector, copy=True)[0]
            x_matrix[0][i] = scaled_vector

    if isinstance(self.keras_model.input, list):
        x = [x_matrix] * len(self.keras_model.input)
    else:
        x = [x_matrix]

    with tf.device('/cpu:0'):
        y_predicted = self.keras_model.predict(x)

    # TODO: make returning a weighted average vs the max probability a parameter.
    # For now return the max-probability label, matching standard Keras methodology.
    if self.keras_model.output_shape[1] == 1:
        # single-unit output: a regression model, so return the raw prediction
        float_y_pred = float(y_predicted[0][0])
        assert isinstance(float_y_pred, float)
        return float_y_pred

    zipped = zip(self.labels, y_predicted[0])
    best_label = sorted(zipped, key=lambda elem: elem[1], reverse=True)[0][0]
    return float(best_label) if return_float else best_label
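# The label selection at the end of _predict is just an argmax over the output
# probabilities; a minimal stand-alone illustration with made-up values:
import numpy as np

labels = ['politics', 'sports', 'tech']
y_predicted = np.array([[0.2, 0.7, 0.1]])  # one sample, softmax over three labels

best = sorted(zip(labels, y_predicted[0]), key=lambda e: e[1], reverse=True)[0][0]
assert best == labels[int(np.argmax(y_predicted[0]))] == 'sports'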
def predict_from_text(self, text, test=False, return_float=False):
    """
    Predict labels for a given string of text
    :param text: string or unicode with the text
    :return: list of labels with corresponding confidence intervals
    """
    # warn when the model is asked to predict on a sample it was trained on
    if hasattr(self, 'training_set') and not test:
        if text in self.training_set:
            print(f'found text in training set: {text}')
    doc = Document(text)
    return self._predict(doc, return_float=return_float)
def get_documents(data_dir, as_generator=True, shuffle=False):
    """
    Extract documents from *.txt files in a given directory
    :param data_dir: path to the directory with .txt files
    :param as_generator: flag whether to return a document generator or a list
    :param shuffle: flag whether to return the documents in a shuffled vs sorted order
    :return: generator or a list of Document objects
    """
    files = list({filename[:-4] for filename in os.listdir(data_dir)})
    files.sort()
    if shuffle:
        random.shuffle(files)

    generator = (Document(doc_id, os.path.join(data_dir, f + '.txt'))
                 for doc_id, f in enumerate(files))
    return generator if as_generator else list(generator)
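# Note (my addition): the filename[:-4] slice above assumes every file has a
# four-character '.txt' suffix; os.path.splitext is the general way to strip an
# extension. A self-contained equivalent of the dedup-and-sort step:
import os

def txt_basenames(data_dir):
    return sorted({os.path.splitext(f)[0]
                   for f in os.listdir(data_dir) if f.endswith('.txt')})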
def fit_scaler(data: DataList, word2vec_model, batch_size=1024, persist_to_path=None):
    """
    Get all the word2vec vectors in a 2D matrix and fit the scaler on it.
    This scaler can be used afterwards for normalizing feature matrices.
    """
    if isinstance(word2vec_model, str):
        word2vec_model = Word2Vec.load(word2vec_model)

    # TODO add other non-text features here
    doc_generator = iter([Document(example['text']) for example in data])
    scaler = StandardScaler(copy=False)

    no_more_samples = False
    while not no_more_samples:
        batch = []
        for i in range(batch_size):
            try:
                batch.append(next(doc_generator))
            except StopIteration:
                no_more_samples = True
                break

        vectors = []
        for doc in batch:
            for word in doc.get_all_words():
                if word in word2vec_model.wv:
                    vectors.append(word2vec_model.wv[word])

        if vectors:  # guard against an empty final batch
            matrix = np.array(vectors)
            print("Fitted to {} vectors".format(matrix.shape[0]))
            scaler.partial_fit(matrix)

    if persist_to_path:
        save_to_disk(persist_to_path, scaler)

    return scaler
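# Self-contained sketch of the incremental pattern fit_scaler relies on:
# StandardScaler.partial_fit updates the running mean and variance one batch at
# a time, so an arbitrarily large stream of vectors never has to fit in memory.
# The batch contents here are random stand-ins for word vectors.
import numpy as np
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler(copy=False)
for _ in range(10):                    # ten batches of 1024 vectors each
    batch = np.random.rand(1024, 100)
    scaler.partial_fit(batch)

scaled = scaler.transform(np.random.rand(1, 100))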