Python clean_text Examples, cleaner.clean_text Python Examples

Example #1

0

Show file

File: cancer_reader.py Project: shriphani/cancer_data

	def add_content(self, text):
		"""Append ``text`` to ``self.content``.

		If ``self.stem`` is truthy the text first goes through
		``cleaner.clean_text_and_stem``; otherwise, if ``self.clean`` is
		truthy, through ``cleaner.clean_text``.  In both of those cases a
		single space is inserted before the processed text.  When neither
		flag is set the raw text is appended with no separator.
		"""
		if self.stem:
			piece = ' ' + cleaner.clean_text_and_stem(text)
		elif self.clean:
			piece = ' ' + cleaner.clean_text(text)
		else:
			piece = text
		self.content += piece

Example #2

0

Show file

File: modifycsv.py Project: beliefs22/DeIdentification-Tool

def main():
    """Call ``modify`` on the test CSV with a dictionary-backed cleaner.

    A ``dictionary.Dictionary`` is created once and captured by a
    one-argument callable that forwards each text value, together with that
    dictionary, to ``cleaner.clean_text``.  The callable is handed to
    ``modify`` along with the source and destination CSV paths.
    """
    source_path = 'csvs/A_test_csv.csv'
    dest_path = 'csvs/testing_with_fun.csv'
    word_dict = dictionary.Dictionary()

    def clean_with_dict(text):
        return cleaner.clean_text(text, word_dict)

    modify(source_path, dest_path, clean_with_dict)

Example #3

0

Show file

File: cancer_reader.py Project: shriphani/cancer_data

	def __init__(self, post_id, post_time_str, forum_id, thread_id, author_id, author_name, first_post_marker, title, content, clean = True, stem = False):
		"""Store the post's fields, optionally cleaning or stemming its content.

		``post_id``, ``forum_id``, ``thread_id``, ``author_id`` and
		``first_post_marker`` are converted with ``int()``; the remaining
		fields are stored as given.  ``stem`` takes precedence over
		``clean``: with ``stem`` truthy the content goes through
		``cleaner.clean_text_and_stem``, otherwise with ``clean`` truthy
		through ``cleaner.clean_text``, otherwise it is stored unchanged.
		"""
		# Integer-valued fields, converted in the order they are listed.
		(self.post_id,
		 self.forum_id,
		 self.thread_id,
		 self.author_id,
		 self.is_first_post) = (int(post_id),
		                        int(forum_id),
		                        int(thread_id),
		                        int(author_id),
		                        int(first_post_marker))

		# Fields stored as given.
		self.post_time_str = post_time_str
		self.author_name = author_name
		self.title = title

		# The flags are kept on the instance as well as applied here.
		self.clean = clean
		self.stem = stem

		if stem:
			normalise = cleaner.clean_text_and_stem
		elif clean:
			normalise = cleaner.clean_text
		else:
			normalise = None
		self.content = content if normalise is None else normalise(content)

Example #4

0

Show file

File: interact.py Project: mylesdc/gpt2-hdf5

def main():
    """Run the MODEL interactively.

    Loads the encoder and hyper-parameters for ``MODEL_NAME``, restores
    ``CHECKPOINT`` into a fresh TensorFlow session and then loops forever:
    read a non-empty question from stdin, sample from the model, decode the
    sampled tokens and print the decoded text followed by a separator line.

    Raises:
        AssertionError: if ``NSAMPLES`` is not a multiple of ``BATCH_SIZE``.
        ValueError: if ``LENGTH`` is larger than ``hparams.n_ctx``.
    """

    print("\nWelcome to COVID-19 chatbot!")
    print("The input prompt will appear shortly\n\n")

    models_dir = os.path.expanduser(os.path.expandvars(MODELS_DIR))

    assert NSAMPLES % BATCH_SIZE == 0

    enc = encoder.get_encoder(MODEL_NAME)
    hparams = model.default_hparams()

    with open(os.path.join(models_dir, MODEL_NAME, "hparams.json")) as file:
        hparams.override_from_dict(json.load(file))

    # Start from the configured LENGTH so that `length` is bound on every
    # path; previously a valid, non-None LENGTH left it undefined.
    length = LENGTH
    if length is None:
        length = hparams.n_ctx // 2

    elif length > hparams.n_ctx:
        raise ValueError(
            "Can't get samples longer than window size: {}".format(
                hparams.n_ctx))

    with tf.Session(graph=tf.Graph()) as sess:
        context = tf.placeholder(tf.int32, [BATCH_SIZE, None])
        np.random.seed(SEED)
        tf.set_random_seed(SEED)
        output = sample.sample_sequence(
            hparams=hparams,
            length=length,
            context=context,
            batch_size=BATCH_SIZE,
            temperature=TEMPERATURE,
            top_k=TOP_K,
        )

        saver = tflex.Saver()
        saver.restore(sess, CHECKPOINT)

        # Sampler used for follow-up passes; built on first need so it is
        # never read before it has been assigned.
        split_output = None

        while True:
            question = input("COVID-19 CHATBOT> ")

            while not question:
                print("Prompt should not be empty!")
                question = input("COVID-19 CHATBOT> ")

            context_tokens = [enc.encode(question)] * BATCH_SIZE

            # custom for full length text
            total_tokens = len(context_tokens[0])
            generated_once = False
            answers = ""
            split_length = int(1023 * SPLIT_CONTEXT)
            split_output_length = min(length, 1023 - split_length)

            for _ in range(NSAMPLES // BATCH_SIZE):
                gen_text = [np.array([])] * BATCH_SIZE
                truncated = [False] * BATCH_SIZE
                while False in truncated:
                    num_tokens = 1023 - (len(context_tokens[0]))

                    if generated_once:
                        new_split_output_length = min(length - total_tokens,
                                                      1023 - split_length)
                        if (split_output is None
                                or new_split_output_length
                                != split_output_length):
                            split_output = sample.sample_sequence(
                                hparams=hparams,
                                length=new_split_output_length,
                                start_token=enc.encoder['<|endoftext|>']
                                if not question else None,
                                context=context if question else None,
                                batch_size=BATCH_SIZE,
                                temperature=TEMPERATURE,
                                top_k=TOP_K,
                                top_p=TOP_P)[:, 1:]
                        out = sess.run(split_output,
                                       feed_dict={context: context_tokens})

                    else:
                        out = sess.run(output,
                                       feed_dict={context: context_tokens})

                    total_tokens += num_tokens
                    for i in range(BATCH_SIZE):
                        text = out[i]
                        trunc_text = ""
                        if question:
                            text = np.append(context_tokens[i][:1], text)
                        if TRUNCATE or all(gen_text):
                            context_tokens[i] = out[i][(1023 - split_length -
                                                        1):]
                            if generated_once:
                                text = out[i][split_length:]

                            if TRUNCATE:
                                to_trunc = enc.decode(text)
                                truncate_esc = re.escape(TRUNCATE)
                                # NOTE(review): `include_prefix` is not
                                # defined in this function; presumably a
                                # module-level setting - confirm it exists.
                                if question and not include_prefix:
                                    prefix_esc = re.escape(question)
                                    pattern = '(?:{})(.*?)(?:{})'.format(
                                        prefix_esc, truncate_esc)
                                else:
                                    pattern = '(.*?)(?:{})'.format(
                                        truncate_esc)

                                trunc_text = re.search(pattern, to_trunc, re.S)
                                if trunc_text:
                                    text = enc.encode(trunc_text.group(1))
                                    # better to re-encode here then decode every generation cycle, I think

                        if not truncated[i]:
                            gen_text[i] = np.concatenate((gen_text[i], text),
                                                         axis=None)
                            if trunc_text or (length is not None
                                              and total_tokens >= length - 1):
                                truncated[i] = True
                                gen = enc.decode(gen_text[i]).lstrip('\n')
                                answers += gen
                    generated_once = True

                # NOTE(review): the text accumulated in `answers` above is
                # discarded here and rebuilt from the last `out` only.
                answers = ""
                for idx in range(BATCH_SIZE):
                    answers += enc.decode(out[idx])

                # Process the string (cleanup)
                # NOTE(review): the cleaned results below are computed but
                # not used by the print that follows.
                clean_answers = cleaner.clean_additional(" ".join(
                    cleaner.clean_text(answers)))

                final_answers = cleaner.chunk_into_sentences(clean_answers)

                try:
                    #print(similarity.use_filter(question, answers, 5))
                    print(answers)

                except Exception:
                    print(" ".join(answers))
                    print("WARNING: Model cannot generate an answer using USE")

            print()
            print("=" * 79)
            print()

Example #5

0

Show file

def main():
    """Run the MODEL interactively.

    Loads the encoder and hyper-parameters for ``MODEL_NAME``, restores
    ``CHECKPOINT`` into a fresh TensorFlow session and then loops forever:
    read a non-empty question from stdin, sample continuations, clean them
    with the ``cleaner`` helpers and print the result of
    ``similarity.use_filter``.  If that filter raises, the cleaned sentences
    are printed instead, followed by a warning.

    Raises:
        AssertionError: if ``NSAMPLES`` is not a multiple of ``BATCH_SIZE``.
        ValueError: if ``LENGTH`` is larger than ``hparams.n_ctx``.
    """

    print("\nWelcome to COVID-19 chatbot!")
    print("The input prompt will appear shortly\n\n")

    models_dir = os.path.expanduser(os.path.expandvars(MODELS_DIR))

    assert NSAMPLES % BATCH_SIZE == 0

    enc = encoder.get_encoder(MODEL_NAME)
    hparams = model.default_hparams()

    with open(os.path.join(models_dir, MODEL_NAME, "hparams.json")) as file:
        hparams.override_from_dict(json.load(file))

    # Start from the configured LENGTH so that `length` is bound on every
    # path; previously a valid, non-None LENGTH left it undefined.
    length = LENGTH
    if length is None:
        length = hparams.n_ctx // 2

    elif length > hparams.n_ctx:
        raise ValueError(
            "Can't get samples longer than window size: {}".format(
                hparams.n_ctx))

    with tf.Session(graph=tf.Graph()) as sess:
        context = tf.placeholder(tf.int32, [BATCH_SIZE, None])
        np.random.seed(SEED)
        tf.set_random_seed(SEED)
        output = sample.sample_sequence(
            hparams=hparams,
            length=length,
            context=context,
            batch_size=BATCH_SIZE,
            temperature=TEMPERATURE,
            top_k=TOP_K,
        )

        saver = tflex.Saver()
        saver.restore(sess, CHECKPOINT)

        while True:
            question = input("COVID-19 CHATBOT> ")

            while not question:
                print("Prompt should not be empty!")
                question = input("COVID-19 CHATBOT> ")

            context_tokens = enc.encode(question)

            for _ in range(NSAMPLES // BATCH_SIZE):
                # Feed the same prompt in every batch row, then slice the
                # prompt's token positions off the front of each sample.
                out = sess.run(
                    output,
                    feed_dict={
                        context: [context_tokens for _ in range(BATCH_SIZE)]
                    },
                )[:, len(context_tokens):]

                # Build the answers string
                answers = "".join(
                    enc.decode(out[idx]) for idx in range(BATCH_SIZE))

                # Process the string (cleanup)
                clean_answers = cleaner.clean_additional(" ".join(
                    cleaner.clean_text(answers)))

                final_answers = cleaner.chunk_into_sentences(clean_answers)

                try:
                    print(similarity.use_filter(question, final_answers, 5))

                except Exception:
                    # Best-effort fallback: show the unfiltered sentences.
                    print(" ".join(final_answers))
                    print("WARNING: Model cannot generate an answer using USE")

            print()
            print("=" * 79)
            print()

Example #6

0

Show file

File: hatespeech.py Project: jjanczur/hackaton-berlin-legal-tech-2020

import cleaner
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Load the training data, the test data and the sample submission.
train = pd.read_csv('datasets/train.csv')
test = pd.read_csv('datasets/test.csv')
sample_sub = pd.read_csv('datasets/sample_submission.csv')

# Concatenate train and test data so both are cleaned in a single pass.
# pd.concat is used because DataFrame.append was deprecated in pandas 1.4
# and removed in pandas 2.0.
total_data = pd.concat([train, test], ignore_index=True)

total_data['cleaned_tweets'] = total_data['tweet'].apply(cleaner.clean_text)

# Split the combined frame again: rows that carry a label versus rows
# whose label is missing.
has_label = total_data['label'].notnull()
train_data = total_data[has_label]
test_data = total_data[~has_label]

# Linear_svm Model
# Parameter grid keyed by "<pipeline step>__<parameter>".
params = {
    'tfidf__max_df': [0.9, 0.95],
    'tfidf__ngram_range': [(1, 1), (1, 2)],
    "svc__C": [0.001, .01, .1, 1, 10, 100]
}

pipeline = Pipeline([
    ("tfidf", TfidfVectorizer(sublinear_tf=True, stop_words='english')),
    ("svc",
     LinearSVC(penalty='l2',

Example #7

0

Show file

File: generate_response.py Project: mylesdc/gpt2-hdf5

def chatbot_response(question: str) -> str:
    """Respond to a question.

    Loads the encoder and hyper-parameters for ``MODEL_NAME``, restores
    ``CHECKPOINT`` into a fresh TensorFlow session, samples
    ``NSAMPLES // BATCH_SIZE`` batches of continuations for ``question`` and
    concatenates, per batch, the result of ``similarity.use_filter`` on the
    cleaned sentences.  If the filter raises, the cleaned sentences joined
    by spaces are used for that batch instead.

    Args:
        question: The prompt text to encode and feed to the model.

    Returns:
        The accumulated response text.

    Raises:
        AssertionError: if ``NSAMPLES`` is not a multiple of ``BATCH_SIZE``.
        ValueError: if ``LENGTH`` is larger than ``hparams.n_ctx``.
    """

    models_dir = os.path.expanduser(os.path.expandvars(MODELS_DIR))

    assert NSAMPLES % BATCH_SIZE == 0

    enc = encoder.get_encoder(MODEL_NAME, dirback=True)
    hparams = model.default_hparams()

    with open(os.path.join(models_dir, MODEL_NAME, "hparams.json")) as file:
        hparams.override_from_dict(json.load(file))

    # Start from the configured LENGTH so that `length` is bound on every
    # path; previously a valid, non-None LENGTH left it undefined.
    length = LENGTH
    if length is None:
        length = hparams.n_ctx // 2

    elif length > hparams.n_ctx:
        raise ValueError(
            "Can't get samples longer than window size: {}".format(
                hparams.n_ctx))

    with tf.Session(graph=tf.Graph()) as sess:
        context = tf.placeholder(tf.int32, [BATCH_SIZE, None])
        np.random.seed(SEED)
        tf.set_random_seed(SEED)
        output = sample.sample_sequence(
            hparams=hparams,
            length=length,
            context=context,
            batch_size=BATCH_SIZE,
            temperature=TEMPERATURE,
            top_k=TOP_K,
        )

        saver = tflex.Saver()
        saver.restore(sess, CHECKPOINT)

        context_tokens = enc.encode(question)

        response: str = ""
        for _ in range(NSAMPLES // BATCH_SIZE):
            # Feed the same prompt in every batch row, then slice the
            # prompt's token positions off the front of each sample.
            out = sess.run(
                output,
                feed_dict={
                    context: [context_tokens for _ in range(BATCH_SIZE)]
                },
            )[:, len(context_tokens):]

            # Build the answers string
            answers = "".join(
                enc.decode(out[idx]) for idx in range(BATCH_SIZE))

            # Process the string (cleanup)
            clean_answers = cleaner.clean_additional(" ".join(
                cleaner.clean_text(answers)))

            final_answers = cleaner.chunk_into_sentences(clean_answers)

            try:
                response += similarity.use_filter(question, final_answers, 5)

            except Exception:
                # Best-effort fallback: use the unfiltered sentences.
                response += " ".join(final_answers)

        return response

Example #8

0

Show file

File: detector.py Project: jjanczur/hackaton-berlin-legal-tech-2020

# Necessary libraries
import pandas as pd
import cleaner
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
from sklearn.externals import joblib


# Read the tweets to classify.
total_data = pd.read_csv('datasets/test2.csv')

# Add a cleaned copy of every tweet.
total_data['cleaned_tweets'] = total_data['tweet'].apply(cleaner.clean_text)

# Restore the previously saved estimator.
# NOTE(review): joblib.load unpickles the file; only load trusted files.
gs = joblib.load('filename.pkl')

predicted = gs.predict(total_data['cleaned_tweets'])
length = len(predicted)
sum_predicted = sum(predicted)
# Sum of the predictions as a percentage of their count (a hate-speech
# rate if the labels are 0/1 - TODO confirm).
hate_speech_percent = round(100 * sum_predicted / length, 2)
print(hate_speech_percent)

# Write id, predicted label and original tweet text to the output CSV.
sub_df = pd.DataFrame({
    'id': total_data['id'],
    'label': predicted,
    'tweet': total_data['tweet'],
})
sub_df.to_csv('test_prediction_svm.csv', index=False)