def processed_rude_words():
    if CommentStaticData.processed_rude_word_list is not None:
        return CommentStaticData.processed_rude_word_list
    CommentStaticData.processed_rude_word_list = [
        word for word in process_text(u' '.join(CommentStaticData.rude_word_list)).split(' ')
        if len(word.strip()) > 0
    ]
    CommentStaticData.processed_rude_word_list.extend([
        word for word in process_text(u' '.join(CommentStaticData.additional_stop_words)).split(' ')
        if len(word.strip()) > 0
    ])
    return CommentStaticData.processed_rude_word_list
def store_qas(dataset, qas, vocab, max_length=20):
    total = len(qas)
    questions = dataset.create_dataset(
        'questions', (total, max_length), dtype='i')
    answers = dataset.create_dataset(
        'answers', (total, max_length), dtype='i')
    categories = dataset.create_dataset(
        'categories', (total,), dtype='i')
    image_indices = dataset.create_dataset(
        'image_indices', (total,), dtype='i')
    image_ids = []
    bar = progressbar.ProgressBar(maxval=len(qas))
    for idx, entry in enumerate(qas):
        i_image = len(image_ids)
        if entry['image_id'] in image_ids:
            i_image = image_ids.index(entry['image_id'])
        else:
            image_ids.append(entry['image_id'])
        image_indices[idx] = i_image
        categories[idx] = entry['category']
        q, length = process_text(entry['question'].encode('utf-8'),
                                 vocab, max_length=max_length)
        questions[idx, :length] = q
        a, length = process_text(entry['answer'].encode('utf-8'),
                                 vocab, max_length=max_length)
        answers[idx, :length] = a
        bar.update(idx)
    return image_ids
def calculate_similarity(self, minhash=True, simhash=True, sequence_match=True,
                         shingle_settings=shingle_settings):
    """
    Check all common resources for changes.

    Image checking is broken for now and requires separate handling.

    :param minhash: True or False, default True
    :param simhash: True or False, default True
    :param sequence_match: True or False, default True
    :param shingle_settings: see `shingle_settings` in toggles.py
    :return: {
        resource_url_path: {
            "hash_change": True or False (sha1 change),
            "minhash": minhash_coefficient,
            "simhash": simhash_distance,
        }
    }
    """
    compared = dict()
    for content_type in self.resources['modified'].keys():
        for url in self.resources['modified'][content_type]:
            compared[url] = {}
            p1 = utils.get_payload(url, self.warc1)
            p2 = utils.get_payload(url, self.warc2)
            dp1 = utils.decompress_payload(p1)
            dp2 = utils.decompress_payload(p2)
            cleaned_dp1 = utils.process_text(dp1)
            cleaned_dp2 = utils.process_text(dp2)

            # shingle cleaned text
            shingles1 = utils.shingle(cleaned_dp1, shingle_settings=shingle_settings)
            shingles2 = utils.shingle(cleaned_dp2, shingle_settings=shingle_settings)

            if minhash:
                compared[url]['minhash'] = utils.get_minhash(shingles1, shingles2)
            if simhash:
                compared[url]['simhash'] = utils.get_simhash(shingles1, shingles2)
            if sequence_match:
                compared[url]['sequence_matched'] = utils.sequence_match(
                    cleaned_dp1, cleaned_dp2)
    return compared
def make_site_comment_params(parsed_args, verified, verified_user_id, is_rude):
    (comment_id, body, post_id, post_title, score, parent_post_id, creation_date,
     author_id, author_username, post_author_id, diff_with_post) = parsed_args
    question_id = -1
    answer_id = -1
    if parent_post_id > 0:
        question_id = parent_post_id
        answer_id = post_id
    else:
        question_id = post_id
    return {
        'comment_id': comment_id,
        'question_id': question_id,
        'answer_id': answer_id,
        'post_author_id': post_author_id,
        'post_score': score,
        'body': body,
        'title': post_title,
        'processed_body': process_text(body),
        'creation_date': creation_date,
        'author_id': author_id,
        'author_name': author_username,
        'verified': verified,
        'verified_user_id': verified_user_id,
        'is_rude': is_rude,
        'diff_with_post': diff_with_post
    }
def run(args):
    doc = read_txt(args.path_to_doc)
    doc_tokens = [
        process_text(entry,
                     lower=not args.cased,
                     remove_stopwords=args.remove_stopwords,
                     remove_punctuation=args.remove_punctuation)
        for entry in doc
    ]
    all_tokens = []
    for entry_tokens in doc_tokens:
        all_tokens += entry_tokens

    rare_tokens, selected_tokens = get_rare_tokens(all_tokens,
                                                   args.min_freq,
                                                   args.max_tokens,
                                                   return_non_rare=True)
    if args.remove_rare:
        doc_tokens = [
            filter_tokens(entry_tokens, set(rare_tokens))
            for entry_tokens in doc_tokens
        ]

    gu = GloVeUtility(args.path_to_glove)
    vectorizer = CountVectorizer(ngram_range=(args.ngram_lower, args.ngram_upper),
                                 vocabulary=selected_tokens)
    count_vector = vectorizer.fit_transform(
        [" ".join(entry_tokens) for entry_tokens in doc_tokens])
    csr_mat = count_vector.T * count_vector
    csr_mat.setdiag(0)
    cooccur_ar = csr_mat.toarray()

    mittens_model = Mittens(n=gu.d, max_iter=args.iter)
    embeddings = mittens_model.fit(cooccur_ar,
                                   vocab=selected_tokens,
                                   initial_embedding_dict=gu.vector_dict)

    filename = args.path_to_glove.split(os.path.sep)[-1]
    os.makedirs(args.output, exist_ok=True)
    embeddings_dict = dict(zip(selected_tokens, embeddings))
    progress_bar.std_print("\nTrained on {} tokens.".format(len(embeddings_dict)))

    if args.save_new_only:
        savepath = os.path.join(args.output, "new_" + filename)
        embeddings_list = [
            " ".join([key] + [str(val) for val in embeddings_dict[key]])
            for key in embeddings_dict
        ]
        write_txt(savepath, embeddings_list)
    else:
        savepath = os.path.join(args.output, filename)
        gu.add_replace_vectors(embeddings_dict)
        gu.save_vectors(savepath)
def get_data_to_buffer():
    buffer = list()
    text = process_text(os.path.join("data", "train.txt"))

    cnt = 0
    start = time.perf_counter()
    for i in range(len(text)):
        cnt += 1
        mel_gt_name = os.path.join(hparams.mel_ground_truth,
                                   "ljspeech-mel-%05d.npy" % (i + 1))
        mel_gt_target = np.load(mel_gt_name)
        duration = np.load(
            os.path.join(hparams.alignment_path, str(i) + ".npy"))

        character = text[i][0:len(text[i]) - 1]
        character = np.array(text_to_sequence(character, hparams.text_cleaners))

        character = torch.from_numpy(character)
        duration = torch.from_numpy(duration)
        mel_gt_target = torch.from_numpy(mel_gt_target)

        buffer.append({
            "text": character,
            "duration": duration,
            "mel_target": mel_gt_target
        })
        if cnt % 1000 == 0:
            print("{:d} records have been processed.".format(cnt))

    end = time.perf_counter()
    print("cost {:.2f}s to load all data into buffer.".format(end - start))
    return buffer
def main1():
    path = os.path.join("data", "LJSpeech-1.1")
    # preprocess_ljspeech(path)

    text_path = os.path.join("data", "train.txt")
    texts = process_text(text_path)

    if not os.path.exists(hp.alignment_path):
        os.mkdir(hp.alignment_path)

    tacotron2 = get_Tacotron2()
    num = 0
    for ind, text in enumerate(texts[num:]):
        if ind > 10:
            exit(0)
        character = text[0:len(text) - 1]
        mel_gt_name = os.path.join(hp.mel_ground_truth,
                                   "ljspeech-mel-%05d.npy" % (ind + num + 1))
        mel_gt_target = np.load(mel_gt_name)
        _, _, D = load_data(character, mel_gt_target, tacotron2)

        np.save(os.path.join(hp.alignment_path, str(ind + num) + ".npy"),
                D, allow_pickle=False)
def parse_question_file(filename="questions.csv"):
    full_text = ""
    with open(filename, 'rt', encoding="utf8") as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=',')
        for row in csv_reader:
            _, _, _, title, body, tags = row
            full_text += " " + process_text(body)
    return full_text
def parse_answer_file(filename="answers.csv"):
    full_text = ""
    with open(filename, 'rt', encoding="utf8") as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=',')
        for row in csv_reader:
            _, _, body, _ = row
            full_text += " " + process_text(body)
    return full_text
def parse_comment_file(filename="comments.csv"):
    full_text = ""
    with open(filename, 'rt', encoding="utf8") as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=',')
        for row in csv_reader:
            body, post_id = row
            full_text += " " + process_text(body)
    return full_text
def decrypt(text, key):
    '''
    Function -- decrypt
        decrypts cipher text by replacing letters in the text with letters
        in the alphabet based on key index
    parameters:
        text -- cipher text string
        key -- plain text string of len 26
    returns decrypted plain text version of the cipher text
    '''
    # check validity of the inputs
    if key == '' or text == '':
        raise ValueError('both text and key values must be given')
    if not isinstance(key, str):
        raise TypeError('key must be a string')
    if not isinstance(text, str):
        raise TypeError('text must be a string')
    try:
        text = utils.strip(text)
        text = utils.process_text(text)
        text = utils.latin_caps(text)
        key = utils.strip(key)
        key = utils.process_text(key)
        key = utils.latin_caps(key)
    except ValueError:
        raise ValueError('text and key must only contain valid letters')
    if not utils.check_full(key):
        raise ValueError('key must contain each letter of the alphabet exactly once')

    # alphabet and cipher
    alphabet_str = string.ascii_uppercase
    alphabet = list(alphabet_str)
    plain = ''

    # iterate through text and replace each letter by its key index
    for letter in text:
        if letter in alphabet:
            plain = plain + alphabet[key.index(letter)]
        else:
            raise ValueError('text and key must only contain valid letters')
    return plain
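# A minimal, self-contained sketch of the index-lookup idea used by the
# substitution-cipher decrypt() above, written without the project's utils
# helpers (which are assumed to only strip whitespace and uppercase the input).
# The key and ciphertext below are hypothetical values chosen for illustration.
import string

def _demo_substitution_decrypt(ciphertext, key):
    alphabet = string.ascii_uppercase
    # For each cipher letter, find its position in the key and emit the
    # plain-alphabet letter at that same position.
    return ''.join(alphabet[key.index(ch)] for ch in ciphertext)

# _demo_substitution_decrypt('ITSSG', 'QWERTYUIOPASDFGHJKLZXCVBNM')  # -> 'HELLO'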
def main(args):
    # Load the arguments.
    model_dir = os.path.dirname(args.model_path)
    params = Dict2Obj(
        json.load(open(os.path.join(model_dir, "args.json"), "r")))

    # Config logging
    log_format = '%(levelname)-8s %(message)s'
    logfile = os.path.join(model_dir, 'eval.log')
    logging.basicConfig(filename=logfile, level=logging.INFO,
                        format=log_format)
    logging.getLogger().addHandler(logging.StreamHandler())
    logging.info(json.dumps(args.__dict__))

    # Load vocabulary wrapper.
    vocab = load_vocab(params.vocab_path)

    # Build data loader
    logging.info("Building data loader...")

    # Load GloVe embedding.
    if params.use_glove:
        embedding = get_glove_embedding(params.embedding_name, 300, vocab)
    else:
        embedding = None

    # Processing input text
    logging.info("Processing input text...")
    text, length = process_text(args.text, vocab, max_length=20)
    d_text = text
    logging.info("Done")

    # Build the models
    logging.info('Creating IQ model...')
    model = Classifier(len(vocab),
                       embedding_dim=params.embedding_dim,
                       embedding=embedding,
                       hidden_dim=params.num_hidden_nodes,
                       output_dim=params.num_output_nodes,
                       num_layers=params.num_layers,
                       bidirectional=params.bidirectional,
                       dropout=params.dropout,
                       rnn_cell=params.rnn_cell)
    logging.info("Done")

    logging.info("Loading model.")
    model.load_state_dict(
        torch.load(args.model_path + "model-tf-" + args.state + ".pkl"))

    # Setup GPUs.
    if torch.cuda.is_available():
        logging.info("Using available GPU...")
        model.cuda()

    predict(model, d_text)
def read_dumped_comments(self, filename='comments.csv'):
    data = list()

    def to_bool(field):
        return str(field).lower() == 'true'

    def to_int(field, default_value=-1):
        try:
            return int(field)
        except ValueError:
            return default_value

    def to_date(field, default_value=None):
        date_format = "%Y-%m-%d %H:%M:%S"
        try:
            return datetime.strptime(field, date_format)
        except ValueError:
            return default_value

    with open(self.prefix + filename, 'rt', encoding="utf8") as csvfile:
        csv_reader = csv.reader(csvfile, delimiter=',')
        for row in csv_reader:
            (comment_id, question_id, answer_id, post_author_id, post_score,
             title, body, creation_date, author_id, author_name, diff_with_post,
             verified, is_rude, verified_user_id, added, analysed, looks_rude,
             skipped) = row
            data_item = {
                "comment_id": int(comment_id),
                "question_id": int(question_id),
                "answer_id": int(answer_id),
                "post_author_id": int(post_author_id),
                "post_score": int(post_score),
                "title": title,
                "body": body,
                "processed_body": process_text(body),
                "creation_date": to_date(creation_date),
                "author_id": int(author_id),
                "author_name": author_name,
                "diff_with_post": int(diff_with_post),
                "verified": to_date(verified),
                "is_rude": to_bool(is_rude),
                "verified_user_id": to_int(verified_user_id),
                "added": to_date(added),
                "analysed": to_date(analysed),
                "looks_rude": to_bool(looks_rude),
                "skipped": to_date(skipped)
            }
            data.append(data_item)
    return data
def count(ngram, hash_size, doc_id):
    """Fetch the text of a document and compute hashed ngram counts."""
    global DOC2IDX
    row, col, data = [], [], []

    # Get ngrams after tokenizing and processing
    # (i.e. stopword/punctuation filtering).
    ngrams = utils.process_text(utils.normalize(fetch_text(doc_id)),
                                stopwords=True, stem=True, ngram=ngram)

    # Hash ngrams and count occurrences.
    counts = Counter([utils.hash(gram, hash_size) for gram in ngrams])

    # Return in sparse matrix data format.
    row.extend(counts.keys())
    col.extend([DOC2IDX[doc_id]] * len(counts))
    data.extend(counts.values())
    return row, col, data
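# A hedged sketch of how the (row, col, data) triplets returned by count()
# could be assembled into a term-document matrix. The triplets list, hash_size,
# and num_docs arguments are hypothetical here; the original pipeline may
# accumulate these triplets differently (e.g. across parallel workers).
import scipy.sparse as sp

def build_count_matrix(triplets, hash_size, num_docs):
    rows, cols, data = [], [], []
    for r, c, d in triplets:
        rows.extend(r)
        cols.extend(c)
        data.extend(d)
    # Rows index hashed ngrams, columns index documents (via DOC2IDX).
    return sp.csr_matrix((data, (rows, cols)), shape=(hash_size, num_docs))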
def get_figure_mentions_by_lines(grobid_article_dir, figure_num, length, mentions_dict):
    """Get the text of figure mentions in the article using sentence windows.

    Args:
      grobid_article_dir: (string) the directory of the processed Grobid file of the article.
      figure_num: (string) the figure number.
      length: (int) the number of lines that should surround a figure mention.
      mentions_dict: (dictionary) the locations of sentences in the text where the mentions are.

    Returns:
      (string). The merged mention text.
    """
    mentions_text = ''
    doc_txt = ''
    with open(grobid_article_dir, 'r') as input_file:
        for line in input_file:
            line = line.rstrip('\n')
            doc_txt += line + ' '
    doc_txt = doc_txt.rstrip(' ')
    doc_txt = utils.process_text(doc_txt)
    sentences = sent_tokenize(doc_txt)
    if str(figure_num) in mentions_dict.keys():
        all_summary_ids = []
        for sentence_id in mentions_dict[str(figure_num)]:
            summary_ids = []
            for i in range(length, 0, -1):
                summary_ids += [sentence_id - i]
            summary_ids += [sentence_id]
            for i in range(1, length + 1):
                summary_ids += [sentence_id + i]
            all_summary_ids += [summary_ids]
        all_summary_ids = merge_texts(all_summary_ids)
        for summary_ids in all_summary_ids:
            for i in summary_ids:
                if i < 0 or i >= len(sentences):
                    continue
                mentions_text += ' ' + sentences[i]
            mentions_text += ' ... '
        mentions_text = mentions_text.rstrip(' ...') + '\"'
    return str(mentions_text)
def generator(model):
    os.makedirs("gta", exist_ok=True)
    with torch.no_grad():
        text = process_text(os.path.join("data", "train.txt"))
        start = time.perf_counter()
        for i in tqdm(range(len(text))):
            mel_gt_name = os.path.join(hparams.mel_ground_truth,
                                       "ljspeech-mel-%05d.npy" % (i + 1))
            mel_gt_target = np.load(mel_gt_name)

            character = text[i][0:len(text[i]) - 1]
            character = np.array(text_to_sequence(character, hparams.text_cleaners))
            character = torch.stack([torch.from_numpy(character)]).long().to(device)

            length = torch.Tensor([character.size(1)]).long().to(device)
            mel_gt_target = torch.stack(
                [torch.from_numpy(mel_gt_target.T)]).float().to(device)

            mel_gta = model.gta(character, mel_gt_target, length)
            np.save(os.path.join("gta", "ljspeech-mel-%05d.npy" % (i + 1)),
                    mel_gta.cpu()[0].numpy())
        end = time.perf_counter()
        print("cost {:.2f}s to generate gta data.".format(end - start))
def main():
    path = os.path.join("data", "LJSpeech-1.1")
    preprocess_ljspeech(path)

    text_path = os.path.join(path, "metadata.csv")
    texts = process_text(text_path)

    if not os.path.exists(hp.alignment_path):
        os.mkdir(hp.alignment_path)

    num = 0
    for ind, line in enumerate(texts[num:]):
        parts = line.strip().split('|')
        phones = parts[4]
        # sumLen = parts[5]
        mel_gt_name = os.path.join(hp.mel_ground_truth,
                                   "ljspeech-mel-%05d.npy" % (ind + num + 1))
        mel_gt_target = np.load(mel_gt_name)
        D = np.array(phones.split(' ')).astype(int)

        if ind % 100 == 0:
            print("calc number:", ind, D.sum(), parts[4],
                  mel_gt_target.shape[0], line)
        if D.sum() > mel_gt_target.shape[0]:
            print("phonelen error:", D.sum(), mel_gt_target.shape[0], line)
            exit(0)
        if abs(mel_gt_target.shape[0] - D.sum()) > 3:
            print("phonelen error:", D.sum(), mel_gt_target.shape[0], line)
            exit(0)
        if D.sum() < mel_gt_target.shape[0]:
            gap = mel_gt_target.shape[0] - D.sum()
            fron = int(gap / 2)
            end = gap - fron
            D[0] = D[0] + fron
            D[len(D) - 1] = D[len(D) - 1] + end

        np.save(os.path.join(hp.alignment_path, str(ind + num) + ".npy"),
                D, allow_pickle=False)
def word_overlap_phi(claim, evidence):
    """Basis for features for the words in both the premise and hypothesis.
    This tends to produce very sparse representations.

    Parameters
    ----------
    claim : a string
    evidence : a list of sentences

    Returns
    -------
    defaultdict
        Maps each word in both claim and evidence to 1.
    """
    sents = []
    for sent in evidence:
        sents.extend(utils.process_sent(sent))
    overlap = set([w1 for w1 in utils.process_text(claim) if w1 in sents])
    return Counter(overlap)
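# Hedged illustration of the feature shape produced by word_overlap_phi, under
# the assumption that utils.process_text / utils.process_sent simply tokenise
# the input into lowercased words; the claim and evidence below are made-up.
#
#   word_overlap_phi("the cat sat", ["a cat sat down", "no dogs here"])
#   # -> Counter({'cat': 1, 'sat': 1})   (each shared word mapped to 1)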
def fit(self, n_iter, num_proc=2):
    '''
    Trains the Word2vec model.

    :param n_iter: (int) number of training iterations
    :param num_proc: (int) number of parallel threads
    '''
    self.n_iter = n_iter
    self.num_proc = num_proc
    self.X, self.y, self.word_to_index, \
        self.index_to_word, self.occurence = process_text(text=self.text,
                                                          vocab_size=self.vocab_size,
                                                          window_size=self.window_size)
    # Training
    self.M_in, self.M_out, self.loss, self.process_time = Hogwild(
        self.X, self.y, self.n_iter, self.vocab_size, self.embedding_size,
        self.learning_rate, self.window_size, self.n_negative, self.occurence,
        num_proc=self.num_proc)
def make_site_comment_params(comment, info):
    comment_id, post_id, body, creation_date, author_id, author_name = comment
    (question_id, answer_id, post_author_id, post_author_name, score, title,
     post_creation_date) = info
    return {
        "comment_id": comment_id,
        "question_id": question_id,
        "answer_id": answer_id,
        "post_author_id": post_author_id,
        "post_score": score,
        "title": title,
        "body": body,
        "processed_body": process_text(body),
        "creation_date": creation_date,
        "author_id": author_id,
        "author_name": author_name,
        "verified": None,
        "is_rude": False,
        "diff_with_post": (creation_date - post_creation_date).total_seconds()
    }
def main():
    # path = os.path.join("data", "LJSpeech-1.1")
    # preprocess_ljspeech(path)

    text_path = os.path.join("data", "train.txt")
    texts = process_text(text_path)

    if not os.path.exists(hp.cemb_path):
        os.mkdir(hp.cemb_path)
    if not os.path.exists(hp.alignment_path):
        os.mkdir(hp.alignment_path)
    if not os.path.exists(hp.mel_tacotron2):
        os.mkdir(hp.mel_tacotron2)

    tacotron2 = get_Tacotron2()
    # wave_glow = get_WaveGlow()
    num = 0
    for ind, text in enumerate(texts[num:]):
        print(ind)
        # mel_name = os.path.join(hp.mel_ground_truth,
        #                         "ljspeech-mel-%05d.npy" % (ind+1))
        # mel_target = np.load(mel_name)
        character = text[0:len(text) - 1]
        mel_tacotron2, cemb, D = load_data_from_tacotron2(character, tacotron2)

        np.save(os.path.join(hp.mel_tacotron2, str(ind + num) + ".npy"),
                mel_tacotron2, allow_pickle=False)
        np.save(os.path.join(hp.cemb_path, str(ind + num) + ".npy"),
                cemb, allow_pickle=False)
        np.save(os.path.join(hp.alignment_path, str(ind + num) + ".npy"),
                D, allow_pickle=False)
def get_figure_ids_line_pos(grobid_article_dir):
    """Get the line numbers of the lines in the text that contain figure mentions.

    Args:
      grobid_article_dir: (string) the directory of the processed Grobid file of the article.

    Returns:
      (dictionary of lists). Mapping from figure identifiers to the list of
      indexes in the text where the mentions are.
    """
    figures_dict = {}
    doc_txt = ''
    with open(grobid_article_dir, 'r') as input_file:
        for line in input_file:
            line = line.rstrip('\n')
            doc_txt += line + ' '
    doc_txt = doc_txt.rstrip(' ')
    doc_txt = utils.process_text(doc_txt)
    sentences = sent_tokenize(doc_txt)
    for sentence_id, sentence in enumerate(sentences):
        if '<figcaptions>' in sentence:
            break
        words = sentence.split()
        for word_id, word in enumerate(words):
            if word.split('.')[0].lower().lstrip('(').rstrip(')') in figure_formats:
                word = word.replace('III', '3').replace('II', '2').replace('I', '1')
                if len(word.split('.')) > 1:
                    word_length = len(word.split('.'))
                    fig_num = utils.extract_number('.'.join(
                        word.split('.')[1:word_length]))
                else:
                    fig_num = utils.extract_number(words[word_id + 1].replace(
                        'III', '3').replace('II', '2').replace('I', '1'))
                figures_dict[fig_num] = figures_dict.get(fig_num, []) + [sentence_id]
    return figures_dict
def word_cross_product_phi(claim, evidence):
    """Basis for cross-product features. This tends to produce pretty
    dense representations.

    Parameters
    ----------
    claim : a string
    evidence : a list of sentences

    Returns
    -------
    defaultdict
        Maps each (w1, w2) in the cross-product of words in claim and
        evidence to its count. This is a multi-set cross-product
        (repetitions matter).
    """
    sents = []
    for sent in evidence:
        sents.extend(utils.process_sent(sent))
    return Counter([(w1, w2)
                    for w1, w2 in product(utils.process_text(claim), sents)])
def get_data_to_buffer():
    buffer = list()
    text = process_text(os.path.join("data", "train.txt"))

    start = time.perf_counter()
    for i in tqdm(range(len(text))):
        mel_gt_name = os.path.join(hparams.mel_ground_truth,
                                   "ljspeech-mel-%05d.npy" % (i + 1))
        mel_gt_target = np.load(mel_gt_name)

        character = text[i][0:len(text[i]) - 1]
        character = np.array(text_to_sequence(character, hparams.text_cleaners))

        character = torch.from_numpy(character)
        mel_gt_target = torch.from_numpy(mel_gt_target)

        buffer.append({"text": character, "mel_target": mel_gt_target})

    end = time.perf_counter()
    print("cost {:.2f}s to load all data into buffer.".format(end - start))
    return buffer
def clusteringtask():
    print('Classification endpoint hit')
    start = time()
    try:
        data = request.json['text']
        if isinstance(data, str):
            data = [data]
        data = process_text(data)
        pred = model.predict_batch(data)
        return jsonify({
            'message': 'Classification successful',
            'classification': [label_map(int(x[0][0])) for x in pred],
            'enum': [int(x[0][0]) for x in pred],
            'confidence': [x[0][1] for x in pred],
            'time': time() - start
        }), 200
    except Exception as e:
        tb = traceback.format_exc()
        print(f"TRACEBACK:\n\n{tb}\n")
        return jsonify({'message': str(e), 'stacktrace': str(tb)}), 500
def decrypt(text, key):
    '''
    Function -- decrypt
        decrypts cipher text by shifting letter indices to the left in the
        alphabet per the key (reversed if negative)
    parameters:
        text -- cipher text string
        key -- integer indicating magnitude and direction of index shift
    returns decrypted plain text version of the cipher text
    '''
    # check validity of inputs
    if key == '' or text == '':
        raise ValueError('both text and key values must be given')
    if not isinstance(key, int):
        raise TypeError('key must be an integer')
    if not isinstance(text, str):
        raise TypeError('text must be a string')
    try:
        text = utils.strip(text)
        text = utils.process_text(text)
    except ValueError:
        raise ValueError('text must only contain latin letters')

    key = key % 26

    # decrypt the string
    plain = ''
    a_index = list(range(0, 26))
    for x in text:
        if ord(x) >= 65 and ord(x) <= 90:
            plain += chr(a_index[((ord(x) - 65) - key) % 26] + 65)
        else:
            raise ValueError('text must only contain latin letters')
    return plain
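# Hedged usage sketch for the Caesar-shift decrypt above. The ciphertexts and
# keys are hypothetical, and utils.strip / utils.process_text are assumed to
# only normalise the input to uppercase Latin letters.
#
#   decrypt('KHOOR', 3)    # shifts every letter back by 3  -> 'HELLO'
#   decrypt('EBIIL', -3)   # negative keys are reduced modulo 26, so -3 acts as
#                          # a shift of 23, also recovering 'HELLO'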
def word_cross_product_phi(claim, evidence):
    """Basis for cross-product features. This tends to produce pretty
    dense representations.

    Parameters
    ----------
    claim : a string
    evidence : a list of sentences

    Returns
    -------
    defaultdict
        Maps each (w1, w2) in the cross-product of words in claim and
        evidence to its count. This is a multi-set cross-product
        (repetitions matter).
    """
    local_sents = []
    for sent in evidence:
        local_sents.extend(utils.process_sent(sent))
    print("Sent: " + str(local_sents))
    return Counter([
        (w1, w2)
        for w1, w2 in product(utils.process_text(claim), local_sents)
    ])
def word_list_maker(words):
    return [
        word for word in process_text(u' '.join(words)).split(u' ')
        if len(word.strip()) > 0
    ]
parser.add_argument('--model', default="Seq2Seq", type=str,
                    help='choose a model: Seq2Seq')
args = parser.parse_args()

if __name__ == '__main__':
    dataset = 'Couplets'  # dataset name
    model_name = args.model  # Seq2Seq
    # Dynamically import the module that matches the chosen model so the run
    # picks up the corresponding configuration.
    x = import_module('models.' + model_name)
    # Parameter initialization happens in the model's Config __init__.
    config = x.Config(dataset)
    start_time = time.time()
    print("Loading data...")
    input_texts, target_texts, input_characters, target_characters = process_text(
        config)
    (num_encoder_tokens, num_decoder_tokens, max_encoder_seq_length,
     max_decoder_seq_length, input_token_index,
     target_token_index) = bulid_token_index(input_texts, target_texts,
                                             input_characters, target_characters)
    encoder_input_data, decoder_input_data, decoder_target_data = build_dataset(
        input_texts, target_texts, num_encoder_tokens, num_decoder_tokens,
        max_encoder_seq_length, max_decoder_seq_length, input_token_index,
        target_token_index)
    config.num_encoder_tokens = num_encoder_tokens
    config.num_decoder_tokens = num_decoder_tokens
    config.max_encoder_seq_length = max_encoder_seq_length
    config.max_decoder_seq_length = max_decoder_seq_length
    time_dif = get_time_dif(start_time)
    print("Time usage:", time_dif)
def __init__(self):
    self.text = process_text(os.path.join("data", "train.txt"))