def main():
    print('Load raw data')
    data = utils.load_dumped('../data/raw/dump.txt')

    print('Filter text')
    content = [utils.filter_text(_[1]) for _ in tqdm(data)]

    idx = np.arange(len(content))
    np.random.seed(19)
    np.random.shuffle(idx)
    test_len = int(0.1 * len(idx))

    print('Split into train/test')
    test = "".join(content[_] for _ in tqdm(idx[:test_len]))
    train = "".join(content[_] for _ in tqdm(idx[test_len:]))

    vocab = utils.generate_vocab()
    with open('../data/processed/vocab.json', 'w') as fout:
        json.dump(vocab, fout)

    print('Encoding test')
    test = utils.encode_text(test, vocab)
    np.save('../data/processed/test', test)

    print('Encoding train')
    train = utils.encode_text(train, vocab)
    np.save('../data/processed/train', train)
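
# For orientation, a minimal sketch of reading the artifacts written by
# main(). It assumes vocab.json maps tokens to integer ids (the exact
# format of utils.generate_vocab is not shown here); note that np.save
# appends the .npy extension to the paths above.
import json
import numpy as np

train = np.load('../data/processed/train.npy')
with open('../data/processed/vocab.json') as fin:
    vocab = json.load(fin)

# Invert the assumed token -> id mapping to decode a short sample
id_to_token = {i: t for t, i in vocab.items()}
print(''.join(id_to_token[int(i)] for i in train[:80]))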
def test():
    model = keras.models.load_model("tf/model.h5")
    word_index = modify_word_index()

    with open('tf/test_review.txt', encoding='utf-8') as file:
        review = filter_text(file.read())
    print(review)

    encode = add_padding([encode_review(review, word_index)],
                         value=word_index['<PAD>'], maxlen=250)
    print(decode_review(encode_review(review, word_index), word_index))

    predict = model.predict(encode)
    print("\n\n")
    print("Original Review link: "
          "https://timesofindia.indiatimes.com/"
          "entertainment/marathi/movie-reviews/"
          "basta/movie-review/80670252.cms")
    print("Rating on Site: 2.5/5")
    print("Prediction: ", predict[0])
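
# The script prints the raw model output. A hedged follow-up, assuming the
# model ends in a single sigmoid unit (so predict has shape (1, 1)); the
# 0.5 cut-off is an illustrative choice, not part of the original script.
score = float(predict[0][0])
label = 'positive' if score >= 0.5 else 'negative'
print(f"Sentiment: {label} (score={score:.3f})")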
def get_queries(self):
    """Get the list of queries built from the metadata of the score,
    for querying audio sources such as Spotify or Echo Nest.

    The queries are created from the title, composer and subtitle of
    the score. From the lowest index of the list to the highest, the
    strictness (length and coverage) of the query decreases and the
    probability of finding results for the query increases:

    | strictness:query
    | 0:title + composer + subtitle
    | 1:title + composer + subtitle with stopwords removed
    | 2:title + composer
    | 3:title + composer with stopwords removed
    | 4:title
    | 5:title with stopwords removed

    Queries identical to one with lower strictness are removed from
    the list.

    Returns:
        list of tuples (query, strictness)
    """
    # Collapse repeated whitespace in each metadata field (the original
    # loop rebound the loop variable and had no effect)
    tokens = [' '.join(token.split())
              for token in (self.title, self.composer, self.subtitle)]

    queries = []
    query_temp = []
    j = 0
    for i in range(len(tokens), 0, -1):
        query = ' '.join(tokens[:i])
        query = utils.remove_multi_spaces_from_text(query)
        query = utils.normalize(query)
        if query not in query_temp:
            queries.append((query, 2 * j))
            query_temp.append(query)
        query_filtered = utils.filter_text(query)
        if query_filtered != query:
            query_filtered = ' '.join(query_filtered.split())
            if query_filtered not in query_temp:
                queries.append((query_filtered, 2 * j + 1))
        j = j + 1
    return queries
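
# A quick illustration of the strictness ladder, using hypothetical score
# metadata; the exact strings depend on utils.normalize and
# utils.filter_text, so the output below is only indicative.
#
#   title    = 'The Four Seasons'
#   composer = 'Antonio Vivaldi'
#   subtitle = 'Op. 8'
#
# get_queries() would then return something like:
#   [('the four seasons antonio vivaldi op. 8', 0),
#    ('four seasons antonio vivaldi op 8', 1),
#    ('the four seasons antonio vivaldi', 2),
#    ('four seasons antonio vivaldi', 3),
#    ('the four seasons', 4),
#    ('four seasons', 5)]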
def load_dataset(self, source_file='train.json'):
    with open(source_file) as dataset_file:
        self.qa_dataset = json.load(dataset_file)
    print(self.qa_dataset)
    dialogues = [(dialogue_line['question'], dialogue_line['answer'])
                 for dialogue_line in self.qa_dataset]

    questions = set()
    qa_dataset = []
    for replicas in dialogues:
        if len(replicas) < 2:
            continue
        # remove \n, ? and .
        question = filter_text(replicas[0][2:])
        answer = replicas[1][2:]
        if question and question not in questions:
            questions.add(question)
            qa_dataset.append([question, answer])

    # Index question/answer pairs by each word of the question:
    # {'word': [[q, a], ...]}
    qa_by_word_dataset = {}
    for question, answer in qa_dataset:
        words = question.split(' ')
        for word in words:
            if word not in qa_by_word_dataset:
                qa_by_word_dataset[word] = []
            qa_by_word_dataset[word].append((question, answer))

    # Drop overly common words that would index too many pairs
    qa_by_word_dataset_filtered = {
        word: qa_list
        for word, qa_list in qa_by_word_dataset.items()
        if len(qa_list) < 1000
    }
    return qa_by_word_dataset_filtered
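
# A sketch of the shape of the returned index for a toy train.json; the
# exact strings depend on what filter_text strips, so this is only
# indicative.
#
#   [{"question": "- What is your name?", "answer": "- I am a bot."}]
#
# load_dataset() would then return an index roughly like:
#   {'what': [('what is your name', 'I am a bot.')],
#    'is':   [('what is your name', 'I am a bot.')],
#    ...}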
def generate_answer_by_text(self, text, consider_last_n=1000,
                            min_result_threshold=0.2):
    text = filter_text(text)
    words = text.split(' ')
    qa = []
    qa_by_word_dataset_filtered = self.load_dataset()
    for word in words:
        if word in qa_by_word_dataset_filtered:
            qa += qa_by_word_dataset_filtered[word]
    qa = list(set(qa))[:consider_last_n]

    results = []
    for question, answer in qa:
        dist = nltk.edit_distance(question, text)
        dist_percentage = dist / len(question)
        results.append([dist_percentage, question, answer])

    if results:
        dist_percentage, question, answer = min(results,
                                                key=lambda pairs: pairs[0])
        if dist_percentage < min_result_threshold:
            return answer
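
# A hedged usage sketch; the enclosing class is not shown above, so `Bot`
# is a hypothetical name for it. generate_answer_by_text implicitly
# returns None when no candidate question is within min_result_threshold
# (normalized edit distance) of the input.
bot = Bot()
answer = bot.generate_answer_by_text('what is your name')
print(answer if answer is not None else 'No close match found')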
def process_file(input, mask):
    # build a word-frequency dictionary from the text
    lines = utils.remove_punctuation(input)
    cleaned_words = utils.filter_text(lines)
    frequencies = utils.calculate_frequencies(cleaned_words)

    # mask for the word cloud
    cloud_mask = None
    if mask:
        try:
            # resize the mask image to a 1920px width, keeping the
            # aspect ratio; Image.LANCZOS replaces the deprecated
            # Image.ANTIALIAS (removed in Pillow 10)
            basewidth = 1920
            img = Image.open(mask)
            wpercent = basewidth / float(img.size[0])
            hsize = int(float(img.size[1]) * wpercent)
            img = img.resize((basewidth, hsize), Image.LANCZOS)
            # generate array of mask image
            cloud_mask = np.array(img)
        except UnidentifiedImageError:
            raise SystemExit(f"Error: cannot identify image file {mask}")

    # generate the word cloud image
    cloud = WordCloud(width=1920, height=1080,
                      background_color="black",
                      mask=cloud_mask,
                      contour_width=1,
                      contour_color='black',
                      colormap='Set2',
                      collocations=False)
    cloud.generate_from_frequencies(frequencies)
    return cloud
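
# process_file returns a WordCloud object; a minimal sketch of rendering
# it to disk with WordCloud.to_file. The input path is hypothetical.
with open('speech.txt', encoding='utf-8') as f:
    text = f.read()
cloud = process_file(text, mask=None)
cloud.to_file('wordcloud.png')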
def save(self, *args, **kwargs):
    # set the creation timestamp only on first save
    if not self.id:
        self.created_at = datetime.now()
    self.body_filter = 'Markdown'
    self.body_html = filter_text(self.body, self.body_filter)
    super(Notice, self).save(*args, **kwargs)
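
# The created-at handling above can also be expressed declaratively in
# Django; a sketch of the equivalent field, assuming Notice is a Django
# model (which the save() override suggests). auto_now_add sets the
# timestamp once, on creation, replacing the manual `if not self.id` check.
from django.db import models

class Notice(models.Model):
    created_at = models.DateTimeField(auto_now_add=True)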