def add_content(self, text):
    """Append ``text`` to ``self.content``.

    The instance flags decide how the text is prepared before appending:

    * ``self.stem`` set: append a space followed by
      ``cleaner.clean_text_and_stem(text)``.
    * otherwise ``self.clean`` set: append a space followed by
      ``cleaner.clean_text(text)``.
    * neither set: append ``text`` unchanged, with no separating space.
    """
    if self.stem:
        fragment = ' ' + cleaner.clean_text_and_stem(text)
    elif self.clean:
        fragment = ' ' + cleaner.clean_text(text)
    else:
        # Raw mode: no cleaning and no separator is inserted.
        fragment = text
    self.content += fragment
def main():
    """Call ``modify`` on the test CSV with a dictionary-backed text cleaner.

    A ``dictionary.Dictionary`` is created once and captured by the callback,
    which forwards each text to ``cleaner.clean_text(text, mydict)``. The
    callback is handed to ``modify`` together with the source path
    ``csvs/A_test_csv.csv`` and the destination path
    ``csvs/testing_with_fun.csv``.
    """
    mydict = dictionary.Dictionary()

    def modifyfunct(text):
        # Single-argument wrapper binding the shared dictionary instance.
        return cleaner.clean_text(text, mydict)

    modify('csvs/A_test_csv.csv', 'csvs/testing_with_fun.csv', modifyfunct)
def __init__(self, post_id, post_time_str, forum_id, thread_id, author_id,
             author_name, first_post_marker, title, content,
             clean=True, stem=False):
    """Populate the post's fields from the supplied values.

    ``post_id``, ``forum_id``, ``thread_id``, ``author_id`` and
    ``first_post_marker`` are converted with ``int()`` (the marker is stored
    as ``is_first_post``). ``post_time_str``, ``author_name`` and ``title``
    are stored unchanged.

    ``content`` is stored after optional processing:

    * ``stem`` truthy: ``cleaner.clean_text_and_stem(content)``
    * otherwise ``clean`` truthy: ``cleaner.clean_text(content)``
    * neither: ``content`` as given

    The ``clean`` and ``stem`` flags are kept on the instance.
    """
    # Fields coerced to int.
    self.post_id, self.forum_id, self.thread_id, self.author_id = (
        int(post_id), int(forum_id), int(thread_id), int(author_id))
    self.is_first_post = int(first_post_marker)

    # Fields stored verbatim.
    self.post_time_str, self.author_name, self.title = (
        post_time_str, author_name, title)

    # Processing flags are kept on the instance as well.
    self.clean, self.stem = clean, stem

    if stem:
        prepared = cleaner.clean_text_and_stem(content)
    elif clean:
        prepared = cleaner.clean_text(content)
    else:
        prepared = content
    self.content = prepared
def main():
    """Run the MODEL interactively.

    Loads the hyperparameters from ``hparams.json`` under
    ``MODELS_DIR/MODEL_NAME``, restores ``CHECKPOINT`` into a fresh TensorFlow
    session and then loops forever: a non-empty question is read from stdin,
    text is generated in chunks until every batch entry is marked as
    truncated, and the decoded output of the last ``sess.run`` call is
    printed.

    Raises:
        AssertionError: if ``NSAMPLES`` is not a multiple of ``BATCH_SIZE``.
        ValueError: if ``LENGTH`` is larger than ``hparams.n_ctx``.
    """
    print("\nWelcome to COVID-19 chatbot!")
    print("The input prompt will appear shortly\n\n")

    models_dir = os.path.expanduser(os.path.expandvars(MODELS_DIR))
    assert NSAMPLES % BATCH_SIZE == 0

    enc = encoder.get_encoder(MODEL_NAME)
    hparams = model.default_hparams()
    with open(os.path.join(models_dir, MODEL_NAME, "hparams.json")) as file:
        hparams.override_from_dict(json.load(file))

    if LENGTH is None:
        length = hparams.n_ctx // 2
    elif LENGTH > hparams.n_ctx:
        raise ValueError(
            "Can't get samples longer than window size: {}".format(
                hparams.n_ctx))
    else:
        # Previously `length` stayed unbound on this path and the first use
        # below raised UnboundLocalError.
        length = LENGTH

    with tf.Session(graph=tf.Graph()) as sess:
        context = tf.placeholder(tf.int32, [BATCH_SIZE, None])
        np.random.seed(SEED)
        tf.set_random_seed(SEED)
        output = sample.sample_sequence(
            hparams=hparams,
            length=length,
            context=context,
            batch_size=BATCH_SIZE,
            temperature=TEMPERATURE,
            top_k=TOP_K,
        )

        saver = tflex.Saver()
        saver.restore(sess, CHECKPOINT)

        while True:
            question = input("COVID-19 CHATBOT> ")
            while not question:
                print("Prompt should not be empty!")
                question = input("COVID-19 CHATBOT> ")
            context_tokens = [enc.encode(question)] * BATCH_SIZE

            # custom for full length text
            total_tokens = len(context_tokens[0])
            generated_once = False
            answers = ""
            # NOTE(review): 1023 is presumably the model window minus one
            # (n_ctx - 1 for a 1024-token model) -- confirm.
            split_length = int(1023 * SPLIT_CONTEXT)
            split_output_length = min(length, 1023 - split_length)
            # Built lazily on the first follow-up chunk; see below.
            split_output = None

            for _ in range(NSAMPLES // BATCH_SIZE):
                gen_text = [np.array([])] * BATCH_SIZE
                truncated = [False] * BATCH_SIZE
                while False in truncated:
                    num_tokens = 1023 - (len(context_tokens[0]))

                    if generated_once:
                        new_split_output_length = min(
                            length - total_tokens, 1023 - split_length)
                        # Also build the op when none exists yet: before,
                        # equal lengths on the first follow-up chunk left
                        # `split_output` unbound.
                        if (split_output is None
                                or new_split_output_length
                                != split_output_length):
                            split_output = sample.sample_sequence(
                                hparams=hparams,
                                length=new_split_output_length,
                                start_token=enc.encoder['<|endoftext|>']
                                if not question else None,
                                context=context if question else None,
                                batch_size=BATCH_SIZE,
                                temperature=TEMPERATURE,
                                top_k=TOP_K,
                                top_p=TOP_P)[:, 1:]
                        out = sess.run(split_output,
                                       feed_dict={context: context_tokens})
                    else:
                        out = sess.run(output,
                                       feed_dict={context: context_tokens})

                    total_tokens += num_tokens
                    for i in range(BATCH_SIZE):
                        text = out[i]
                        trunc_text = ""
                        if question:
                            text = np.append(context_tokens[i][:1], text)
                        # NOTE(review): gen_text holds numpy arrays; all() on
                        # multi-element arrays may raise when TRUNCATE is
                        # falsy -- verify.
                        if TRUNCATE or all(gen_text):
                            # Keep the tail of this chunk as the next context.
                            context_tokens[i] = out[i][
                                (1023 - split_length - 1):]
                            if generated_once:
                                text = out[i][split_length:]

                            if TRUNCATE:
                                to_trunc = enc.decode(text)
                                truncate_esc = re.escape(TRUNCATE)
                                # NOTE(review): include_prefix is not defined
                                # in this function; presumably a module-level
                                # setting -- verify.
                                if question and not include_prefix:
                                    prefix_esc = re.escape(question)
                                    pattern = '(?:{})(.*?)(?:{})'.format(
                                        prefix_esc, truncate_esc)
                                else:
                                    pattern = '(.*?)(?:{})'.format(
                                        truncate_esc)

                                trunc_text = re.search(
                                    pattern, to_trunc, re.S)
                                if trunc_text:
                                    # better to re-encode here then decode
                                    # every generation cycle, I think
                                    text = enc.encode(trunc_text.group(1))

                        if not truncated[i]:
                            gen_text[i] = np.concatenate(
                                (gen_text[i], text), axis=None)
                            if trunc_text or (length is not None
                                              and total_tokens >= length - 1):
                                truncated[i] = True
                                gen = enc.decode(gen_text[i]).lstrip('\n')
                                answers += gen

                    generated_once = True

                # NOTE(review): the text accumulated in `answers` above is
                # discarded here and replaced by the decoded raw output of
                # the last sess.run call -- confirm this is intended.
                answers = "".join(
                    enc.decode(out[idx]) for idx in range(BATCH_SIZE))

                # Process the string (cleanup)
                # NOTE(review): the cleaned sentences are computed but the
                # raw `answers` string is what gets printed below.
                clean_answers = cleaner.clean_additional(" ".join(
                    cleaner.clean_text(answers)))
                final_answers = cleaner.chunk_into_sentences(clean_answers)

                try:
                    print(answers)
                except Exception:
                    print(" ".join(answers))
                    print("WARNING: Model cannot generate an answer using USE")

            print()
            print("=" * 79)
            print()
def main():
    """Run the MODEL interactively.

    Loads the hyperparameters from ``hparams.json`` under
    ``MODELS_DIR/MODEL_NAME``, restores ``CHECKPOINT`` into a fresh TensorFlow
    session and then loops forever: a non-empty question is read from stdin,
    ``NSAMPLES`` continuations are sampled in batches of ``BATCH_SIZE``, the
    decoded text is cleaned and split into sentences, and the result of
    ``similarity.use_filter(question, sentences, 5)`` is printed. If that
    call raises, all sentences are printed instead, followed by a warning.

    Raises:
        AssertionError: if ``NSAMPLES`` is not a multiple of ``BATCH_SIZE``.
        ValueError: if ``LENGTH`` is larger than ``hparams.n_ctx``.
    """
    print("\nWelcome to COVID-19 chatbot!")
    print("The input prompt will appear shortly\n\n")

    models_dir = os.path.expanduser(os.path.expandvars(MODELS_DIR))
    assert NSAMPLES % BATCH_SIZE == 0

    enc = encoder.get_encoder(MODEL_NAME)
    hparams = model.default_hparams()
    with open(os.path.join(models_dir, MODEL_NAME, "hparams.json")) as file:
        hparams.override_from_dict(json.load(file))

    if LENGTH is None:
        length = hparams.n_ctx // 2
    elif LENGTH > hparams.n_ctx:
        raise ValueError(
            "Can't get samples longer than window size: {}".format(
                hparams.n_ctx))
    else:
        # Previously `length` stayed unbound on this path and the
        # sample_sequence call below raised UnboundLocalError.
        length = LENGTH

    with tf.Session(graph=tf.Graph()) as sess:
        context = tf.placeholder(tf.int32, [BATCH_SIZE, None])
        np.random.seed(SEED)
        tf.set_random_seed(SEED)
        output = sample.sample_sequence(
            hparams=hparams,
            length=length,
            context=context,
            batch_size=BATCH_SIZE,
            temperature=TEMPERATURE,
            top_k=TOP_K,
        )

        saver = tflex.Saver()
        saver.restore(sess, CHECKPOINT)

        while True:
            question = input("COVID-19 CHATBOT> ")
            while not question:
                print("Prompt should not be empty!")
                question = input("COVID-19 CHATBOT> ")
            context_tokens = enc.encode(question)

            for _ in range(NSAMPLES // BATCH_SIZE):
                # Feed the same prompt to every batch slot, then drop the
                # prompt tokens from the front of each sampled row.
                out = sess.run(
                    output,
                    feed_dict={
                        context: [context_tokens for _ in range(BATCH_SIZE)]
                    },
                )[:, len(context_tokens):]

                # Build the answers string
                answers = "".join(
                    enc.decode(out[idx]) for idx in range(BATCH_SIZE))

                # Process the string (cleanup)
                clean_answers = cleaner.clean_additional(" ".join(
                    cleaner.clean_text(answers)))
                final_answers = cleaner.chunk_into_sentences(clean_answers)

                try:
                    print(similarity.use_filter(question, final_answers, 5))
                except Exception:
                    # Best-effort fallback: show every sentence unfiltered.
                    print(" ".join(final_answers))
                    print("WARNING: Model cannot generate an answer using USE")

            print()
            print("=" * 79)
            print()
import cleaner import nltk nltk.download('punkt') nltk.download('stopwords') nltk.download('wordnet') #Loading Training Data train = pd.read_csv('datasets/train.csv') test = pd.read_csv('datasets/test.csv') sample_sub = pd.read_csv('datasets/sample_submission.csv') #Concation train and test data total_data = train.append(test, ignore_index=True) total_data['cleaned_tweets'] = total_data['tweet'].apply( lambda x: cleaner.clean_text(x)) train_data = total_data[total_data['label'].isnull() != True] test_data = total_data[total_data['label'].isnull() == True] # Linear_svm Model params = { 'tfidf__max_df': [0.9, 0.95], 'tfidf__ngram_range': [(1, 1), (1, 2)], "svc__C": [0.001, .01, .1, 1, 10, 100] } pipeline = Pipeline([ ("tfidf", TfidfVectorizer(sublinear_tf=True, stop_words='english')), ("svc", LinearSVC(penalty='l2',
def chatbot_response(question: str) -> str:
    """Respond to a question.

    Loads the hyperparameters from ``hparams.json`` under
    ``MODELS_DIR/MODEL_NAME``, restores ``CHECKPOINT`` into a fresh TensorFlow
    session and samples ``NSAMPLES`` continuations of ``question`` in batches
    of ``BATCH_SIZE``. For each batch the decoded text is cleaned, split into
    sentences and passed to ``similarity.use_filter(question, sentences, 5)``;
    if that call raises, the sentences joined by spaces are used instead.

    Args:
        question: The prompt text to encode and feed to the model.

    Returns:
        The concatenation of the per-batch results.

    Raises:
        AssertionError: if ``NSAMPLES`` is not a multiple of ``BATCH_SIZE``.
        ValueError: if ``LENGTH`` is larger than ``hparams.n_ctx``.
    """
    models_dir = os.path.expanduser(os.path.expandvars(MODELS_DIR))
    assert NSAMPLES % BATCH_SIZE == 0

    enc = encoder.get_encoder(MODEL_NAME, dirback=True)
    hparams = model.default_hparams()
    with open(os.path.join(models_dir, MODEL_NAME, "hparams.json")) as file:
        hparams.override_from_dict(json.load(file))

    if LENGTH is None:
        length = hparams.n_ctx // 2
    elif LENGTH > hparams.n_ctx:
        raise ValueError(
            "Can't get samples longer than window size: {}".format(
                hparams.n_ctx))
    else:
        # Previously `length` stayed unbound on this path and the
        # sample_sequence call below raised UnboundLocalError.
        length = LENGTH

    with tf.Session(graph=tf.Graph()) as sess:
        context = tf.placeholder(tf.int32, [BATCH_SIZE, None])
        np.random.seed(SEED)
        tf.set_random_seed(SEED)
        output = sample.sample_sequence(
            hparams=hparams,
            length=length,
            context=context,
            batch_size=BATCH_SIZE,
            temperature=TEMPERATURE,
            top_k=TOP_K,
        )

        saver = tflex.Saver()
        saver.restore(sess, CHECKPOINT)

        context_tokens = enc.encode(question)
        response: str = ""

        for _ in range(NSAMPLES // BATCH_SIZE):
            # Feed the same prompt to every batch slot, then drop the prompt
            # tokens from the front of each sampled row.
            out = sess.run(
                output,
                feed_dict={
                    context: [context_tokens for _ in range(BATCH_SIZE)]
                },
            )[:, len(context_tokens):]

            # Build the answers string
            answers = "".join(
                enc.decode(out[idx]) for idx in range(BATCH_SIZE))

            # Process the string (cleanup)
            clean_answers = cleaner.clean_additional(" ".join(
                cleaner.clean_text(answers)))
            final_answers = cleaner.chunk_into_sentences(clean_answers)

            try:
                response += similarity.use_filter(question, final_answers, 5)
            except Exception:
                # Best-effort fallback: use every sentence unfiltered.
                response += " ".join(final_answers)

    return response
# Necessary libraries
import pandas as pd
import cleaner
import nltk

# `sklearn.externals.joblib` was deprecated in scikit-learn 0.21 and removed
# in 0.23. Prefer the standalone package and fall back to the old location
# only for legacy environments.
try:
    import joblib
except ImportError:
    from sklearn.externals import joblib

nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Loading Data
total_data = pd.read_csv('datasets/test2.csv')

# Clean
total_data['cleaned_tweets'] = total_data['tweet'].apply(cleaner.clean_text)

# Load model
# NOTE(review): joblib.load unpickles the file, which can execute arbitrary
# code -- only load model files from a trusted source.
gs = joblib.load('filename.pkl')
predicted = gs.predict(total_data['cleaned_tweets'])

# Sum of the predictions as a percentage of the number of rows.
# NOTE(review): presumably labels are 0/1 so this is the share of rows
# predicted as 1 -- confirm against the training labels.
length = len(predicted)
sum_predicted = sum(predicted)
hate_speech_percent = round((sum_predicted*100)/length, 2)
print(hate_speech_percent)

# Write id, predicted label and original tweet to the submission CSV.
sub_df = pd.DataFrame(columns=['id', 'label', 'tweet'])
sub_df['id'] = total_data['id']
sub_df['label'] = predicted
sub_df['tweet'] = total_data['tweet']
sub_df.to_csv('test_prediction_svm.csv', index=False)