from csv import reader as csv_reader
from json import dump as json_dump, load as json_load
from os.path import abspath, isfile

# jp_encode / jp_decode are assumed to be jsonpickle's encode / decode;
# TextProcessor comes from the project's own text-processing module (not shown)
from jsonpickle import encode as jp_encode, decode as jp_decode
from sklearn.feature_extraction.text import CountVectorizer
from tqdm import tqdm


class InvertedIndex:
    def __init__(self, fpath, dump_fpath):
        self.text_processor = TextProcessor()
        self.queries = []
        self.documents = []
        self.original_documents = []
        self.is_duplicates = []
        self.vectorizer = CountVectorizer()
        self.build_index(fpath)
        self.dump(dump_fpath)

    def build_index(self, fpath):
        with open(abspath(fpath), 'r', encoding='utf-8') as file:
            table = csv_reader(file)
            for row in tqdm(list(table)):
                # skip rows without an id (e.g. the header row)
                if row[0] == '':
                    continue
                # TODO: build the full index
                if row[0] == '10000':
                    break
                self.queries.append(self.text_processor.process(row[1]))
                self.documents.append(self.text_processor.process(row[2]))
                self.original_documents.append(row[2])
                self.is_duplicates.append(row[3])
        # fit the vectorizer vocabulary on the processed queries
        self.vectorizer.fit_transform(self.queries)

    def dump(self, dump_fpath):
        json_encoded = jp_encode(self)
        with open(dump_fpath, 'w', encoding='utf-8') as file:
            json_dump(json_encoded, file, ensure_ascii=False, indent=4)

    @staticmethod
    def restore(dump_fpath):
        with open(dump_fpath, 'r', encoding='utf-8') as file:
            idx_dump = json_load(file)
        return jp_decode(idx_dump)

    @staticmethod
    def from_dump_or_build(dump_fpath, corpora_fpath):
        # prefer a previously saved dump; fall back to rebuilding from the corpus
        if isfile(dump_fpath):
            try:
                return InvertedIndex.restore(dump_fpath)
            except Exception:
                return InvertedIndex(corpora_fpath, dump_fpath)
        return InvertedIndex(corpora_fpath, dump_fpath)
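
# Example usage -- a hedged sketch: both file paths below are hypothetical
# placeholders, not names taken from the original code.
if __name__ == '__main__':
    index = InvertedIndex.from_dump_or_build('index_dump.json', 'corpus.csv')
    print(f'Indexed {len(index.documents)} documents')
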
from json import JSONDecoder, JSONEncoder
from os import listdir


def process_files(self):
    files = listdir(self.path)
    files_dic = {}
    for file in files:
        # process the file based on its extension
        file_ext = file.split('.')[-1]
        if file_ext == 'txt':
            files_dic[file] = self.process_txt(self.path + file)
        elif file_ext == 'pdf':
            files_dic[file] = self.process_pdf(self.path + file)
        elif file_ext == 'html':
            files_dic[file] = self.process_html(self.path + file)
    tp = TextProcessor()
    for file, text in files_dic.items():
        # call the text_processor module via its JSON request/response protocol
        text_proc_result = tp.process(
            JSONEncoder().encode({'action': 'process', 'data': text}))
        text_proc_result = JSONDecoder().decode(text_proc_result)['terms']
        files_dic[file] = text_proc_result
    return files_dic

def form_valid(self, form):
    # run the submitted text through TextProcessor before saving the instance
    text_processor = TextProcessor()
    form.instance.processed_text = text_processor.process(
        form.cleaned_data['origin_text'])
    return super().form_valid(form)
import glob
import os
import pickle

import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from torchtext.data.utils import get_tokenizer
from tqdm import tqdm

# TextProcessor, TextDataset, Sequencer, TextCNN and the EMBEDDINGS_DIR,
# CORPUS_DIR, DATA_SPLIT, SEQUENCE_LEN constants come from the surrounding
# project (not shown here)


def main():
    device = torch.device('cuda')
    embedding_vectors = torch.load(f'{EMBEDDINGS_DIR}/vectors.pkl')
    text_processor = TextProcessor(
        wti=pickle.load(open(f'{EMBEDDINGS_DIR}/wti.pkl', 'rb')),
        tokenizer=get_tokenizer('basic_english'),
        standardize=True,
        min_len=3,
    )
    dataset = TextDataset(CORPUS_DIR, text_processor)

    # split into training and test sets; compute the test size as the
    # remainder so the two sizes always sum to len(dataset), which keeps the
    # split from failing when the corpus size changes
    train_len = int(len(dataset) * DATA_SPLIT)
    train_set, test_set = torch.utils.data.random_split(
        dataset, [train_len, len(dataset) - train_len])

    # count the number of samples in each class
    class_count = [0, 0]
    for data, label in dataset:
        class_count[int(label.item())] += 1
    # convert the counts to relative frequencies
    _sum = sum(class_count)
    class_count[0] /= _sum
    class_count[1] /= _sum
    # reverse the weights so the sampler oversamples the minority class
    class_count = list(reversed(class_count))
    # assign a weight to every training sample
    weights = [class_count[int(x[1].item())] for x in train_set]
    # weighted sampler to balance the classes within each batch
    sampler = torch.utils.data.WeightedRandomSampler(
        weights=weights, num_samples=len(train_set), replacement=True)

    train_loader = DataLoader(dataset=train_set,
                              batch_size=32,
                              collate_fn=Sequencer(SEQUENCE_LEN),
                              sampler=sampler)
    test_loader = DataLoader(dataset=test_set,
                             batch_size=32,
                             collate_fn=Sequencer(SEQUENCE_LEN))

    # number of filters in each convolutional layer
    N_FILTERS = 64
    # sizes of the convolutional filters (one parallel conv layer per size)
    FILTER_SIZES = [2, 3]
    # dropout between the conv and dense layers
    DROPOUT = 0.5
    model = TextCNN(
        embeddings=embedding_vectors,
        n_filters=N_FILTERS,
        filter_sizes=FILTER_SIZES,
        dropout=DROPOUT,
    ).to(device)
    print(model)
    print('Trainable params:',
          sum(p.numel() for p in model.parameters() if p.requires_grad))

    criterion = nn.BCELoss()
    optimizer = torch.optim.Adam(model.parameters(), lr=1e-3)

    EPOCHS = 12
    best_acc = 0.0

    # training loop
    for epoch in range(EPOCHS):
        print('Epoch', epoch + 1)
        for i, data in tqdm(enumerate(train_loader), total=len(train_loader)):
            # get word-index vectors and the corresponding labels
            x, labels = data
            # send to device
            x = x.to(device)
            labels = labels.to(device)
            # make predictions
            predictions = model(x).squeeze()
            # calculate loss
            loss = criterion(predictions, labels)
            # backpropagate and update the weights
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

        # evaluate on the test set
        with torch.no_grad():
            model.eval()
            correct = 0
            wrong = 0
            m = [[0, 0], [0, 0]]  # confusion matrix, rows = truth
            for data in test_loader:
                x, label = data
                x = x.to(device)
                predictions = model(x).squeeze()
                for truth, prediction in zip(label, predictions):
                    y = int(truth.item())
                    y_pred = 1 if prediction.item() > 0.5 else 0
                    m[y][y_pred] += 1
                    if y == y_pred:
                        correct += 1
                    else:
                        wrong += 1
            model.train()

        acc = correct / (correct + wrong)
        if acc > best_acc:
            best_acc = acc
            # keep only the best checkpoint; the glob pattern must match the
            # saved file names, so both use the state_ prefix
            for file in glob.glob('models/state_*.pth'):
                os.remove(file)
            torch.save(model.state_dict(), f'models/state_{epoch}.pth')
        print()
        print('Correct:', f'{correct}/{correct + wrong}', 'Accuracy:', acc)
        print('[[TN, FP], [FN, TP]]')
        print(m)
        print()

    # put the model into evaluation mode for interactive prompts
    model.eval()
    text_processor.do_standardize = True
    with torch.no_grad():
        while True:
            text = input('Prompt: ')
            x = text_processor.process(text)
            x = torch.tensor(x).unsqueeze(dim=0)
            print(model(x.to(device)).squeeze())
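
# Entry point -- a minimal sketch; the original defines main() but does not
# show how the module is invoked, so this guard is an assumption.
if __name__ == '__main__':
    main()
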
# sleep is assumed to be time.sleep; File, Document, get_input_type,
# get_output_field, image_to_text and text_params come from the surrounding
# project (not shown here)
from time import sleep


def process_file(file_id):
    file = File.objects.get(pk=file_id)
    print(file)
    try:
        origin_path = file.origin_file.path
    except ValueError:
        # no file was uploaded (e.g. the text was pasted into the textbox)
        origin_path = None

    file.input_type = get_input_type(file)
    print(file.input_type)
    file.progress += 10
    file.save()
    sleep(.5)
    file.progress += 10
    file.save()

    # extract the raw text, dispatching on the input type
    document = None
    if file.input_type == File.InputTypes.IMAGE:
        document = Document()
        text = image_to_text(origin_path)
    elif file.input_type == File.InputTypes.TEXTBOX:
        text = file.origin_text
    else:
        document = Document(origin_path, text_params)
        text = document.parse()

    # file.progress += 50
    file.progress += 20
    file.save()
    sleep(.5)
    file.progress += 10
    file.save()
    sleep(.5)
    file.progress += 10
    file.save()

    text_processor = TextProcessor()
    processed_text = text_processor.process(text)

    # file.progress += 30
    file.progress += 20
    file.save()
    sleep(.5)
    file.progress += 10
    file.save()

    # write the result back: plain text for textbox input, a rebuilt document
    # file for everything else
    if file.input_type == File.InputTypes.TEXTBOX:
        file.processed_text = processed_text
    else:
        if document is None:
            raise ValueError('Error with document')
        output_name = get_output_field(file)
        document.change_text(processed_text)
        document.save(file.processed_file.storage.path(output_name))
        file.processed_file = output_name

    sleep(.5)
    file.progress = 100
    file.save()
    print(file)
from flask import jsonify, request


def process_words():
    # run the text from the ?text= query parameter through TextProcessor
    text_processor = TextProcessor()
    processed_text = text_processor.process(request.args.get('text'))
    return jsonify(response=processed_text)
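
# Minimal wiring sketch for the view above -- the Flask app object, the
# /process_words route, and running the development server are assumptions;
# the original shows only the handler body.
from flask import Flask

app = Flask(__name__)
app.add_url_rule('/process_words', view_func=process_words)

if __name__ == '__main__':
    app.run(debug=True)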