Example #1
def debate_text_process(text):
    from nltk.tokenize import word_tokenize
    tokens = word_tokenize(str(text))
    # convert to lower case
    tokens = [w.lower() for w in tokens]
    # remove punctuation from each word
    import string
    table = str.maketrans('', '', string.punctuation)
    stripped = [w.translate(table) for w in tokens]
    # remove remaining tokens that are not alphabetic
    words = [word for word in stripped if word.isalpha()]
    # filter out stop words
    from nltk.corpus import stopwords
    from spacy.lang.en.stop_words import STOP_WORDS
    stop_words = set(stopwords.words('english'))

    STOP_WORDS.update(stop_words)
    STOP_WORDS.update({
        'nt', 'okay', 'ha', 'thank', 'wa', 'got', 'oh', 'said', 'going',
        'want', 'let', 'know'
    })
    words = [w for w in words if w not in STOP_WORDS]
    #print(len(STOP_WORDS))

    from nltk.stem import WordNetLemmatizer
    wordnet_lemmatizer = WordNetLemmatizer()
    words = [wordnet_lemmatizer.lemmatize(w) for w in words]
    return words
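# Usage sketch (assumes the NLTK 'punkt', 'stopwords' and 'wordnet' data have
# already been downloaded via nltk.download):
example_tokens = debate_text_process(
    "We aren't going to let them know what the speakers said, okay?")
# -> roughly ['speaker'] after lowercasing, punctuation stripping,
#    stop-word removal and lemmatization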
import hashlib
from pytorch_pretrained_bert import BertTokenizer
import string
import spacy
from spacy.lang.en.stop_words import STOP_WORDS

nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner', 'tagger'])
STOP_WORDS.update(string.punctuation)
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')


def bert_tokenization_length(context, question, reference, candidate):
    context_len = len(tokenizer.tokenize(context))
    question_len = len(tokenizer.tokenize(question))
    candidate_len = len(tokenizer.tokenize(candidate))
    reference_len = len(tokenizer.tokenize(reference))

    return max(context_len + question_len + candidate_len,
               context_len + question_len + reference_len)
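# Usage sketch (illustrative strings, not from the original data): the result
# can be checked against BERT's 512-token limit before packing the inputs;
# the "+ 4" is a rough allowance for [CLS]/[SEP] special tokens and depends
# on how the sequences are actually combined.
example_len = bert_tokenization_length(
    context="The first debate was held at Hofstra University in 2016.",
    question="Where was the first debate held?",
    reference="Hofstra University",
    candidate="at Hofstra University")
assert example_len + 4 <= 512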


def check_data_and_return_hash(context, question, reference, candidate):
    assert all(type(x) is str
               for x in (context, question, reference, candidate))

    if context == '' or question == '' or reference == '' or candidate == '':
        return None

    sample = context + question + reference + candidate
    hash_object = hashlib.md5(sample.encode())
    # the digest serves as a stable per-sample id (the original snippet is
    # truncated here; returning the hex digest is an assumption)
    return hash_object.hexdigest()
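# Usage sketch: the MD5 digest makes a convenient key for de-duplicating
# (context, question, reference, candidate) tuples across a dataset.
seen_hashes = set()


def is_new_sample(context, question, reference, candidate):
    key = check_data_and_return_hash(context, question, reference, candidate)
    if key is None or key in seen_hashes:
        return False
    seen_hashes.add(key)
    return True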
Example #3
import os

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from torchtext import data
from torchtext.vocab import Vectors

# build preprocess tokenizer
remove_strs = ['<br />', '(', ')', '"']
nlp = spacy.load('en')


# def tokenizer(text):
#     text = utils.remove_str_from_sentence(text, remove_strs)
#     return [token.text for token in nlp.tokenizer(text)]
def tokenizer(text):
    # text = utils.remove_str_from_sentence(text, remove_strs)
    return text.split()


user_stop_words = {'.', ','}
STOP_WORDS.update(user_stop_words)
stop_words = STOP_WORDS
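# Quick sanity check of how the whitespace tokenizer and the stop-word set
# combine (the same objects are handed to the torchtext Field below):
demo = [t for t in tokenizer("this movie was not bad , it was great .")
        if t not in stop_words]
# -> ['movie', 'bad', 'great']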

# Pretrain Model
PRE_TRAIN_MODEL_BASE_PATH = '/home/ubuntu/likun/nlp_vectors'
PRE_TRAIN_MODEL_DIR = 'glove'
PRE_TRAIN_MODEL_NAME = 'glove.6B.200d.txt'
USE_PRE_TRAIN_MODEL = True
cache = '.vector_cache'
vector_path = os.path.join(PRE_TRAIN_MODEL_BASE_PATH, PRE_TRAIN_MODEL_DIR,
                           PRE_TRAIN_MODEL_NAME)
vectors = Vectors(name=vector_path,
                  cache=cache) if USE_PRE_TRAIN_MODEL else None
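# Sketch of how the loaded vectors are typically attached once the dataset is
# built (the Field definition below is cut off in this snippet; `train_data`
# is a hypothetical torchtext dataset, not from the original code):
# TEXT.build_vocab(train_data, vectors=vectors)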

# Build Dataset
TEXT = data.Field(unk_token=UNK_TOKEN,
Example #4
        LEMMA: "not",
        NORM: "not",
        TAG: "RB"
    }]
}
TOKENIZER_EXCEPTIONS = update_exc(TOKENIZER_EXCEPTIONS)

# updating the stopset
calfresh_stopwords = {
    "Calfresh", "CalFresh", "calfresh", "CALFRESH", "foodstamps", "sar7",
    "sar", "sr7", "sr", "SAR7", "SR7", "SAR", "SR", "Sar", "Sar7", "ebt"
}
calfresh_placeholders = {
    "PERSON", "ORG", "GPE", "LOC", "DATE", "MONEY", "CARDINAL"
}
# set.update() mutates in place and returns None, so update first and then
# take the set itself as the stopset
STOP_WORDS.update(calfresh_stopwords, calfresh_placeholders)
stopset = STOP_WORDS

regex = re.compile(r'\W|\d', flags=re.UNICODE)


def clean_words(text):
    try:
        text = regex.sub('', text)
    except TypeError:
        # non-string input (e.g. NaN) is returned unchanged
        pass
    return text
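# Usage sketch: the regex strips non-word characters and digits, so variants
# like "sar7!" collapse to "sar" before stop-word checks, and non-string
# input passes through unchanged.
assert clean_words("sar7!") == "sar"
assert clean_words("CalFresh.") == "CalFresh"
assert clean_words(None) is None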


def enchant_spellchecker(doc):
    for token in doc:
        word = token.text
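        # (snippet cut off here) a minimal sketch of how such a loop might
        # continue, assuming pyenchant's Dict API -- not the original code:
        # d = enchant.Dict("en_US")          # usually created once, outside the loop
        # if word and not d.check(word):
        #     suggestions = d.suggest(word)  # e.g. take suggestions[0] if any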
Example #5
    non_empty_data = [
        article for article in dataset['data'][:100]
        if article and not article.isspace()
    ]

    # Process the articles with spaCy (only tokens and lemmas are needed, so
    # the parser, textcat and NER components are disabled)
    nlp = spacy.load('en_core_web_sm', disable=['parser', 'textcat', 'ner'])
    print('Running spaCy processing')
    id_to_tokens = {
        i: nlp(article)
        for i, article in tqdm(enumerate(non_empty_data))
    }
    print('Done processing')

    # Remove the stop words and lemmatize
    STOP_WORDS.update(
        ['think', 'know', 'people', 'like', 'thing', 'good', 'use', 'come'])
    id_to_tokens = {
        i: preprocess_spacy_doc(article, STOP_WORDS)
        for i, article in id_to_tokens.items()
    }
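    # `preprocess_spacy_doc` is not shown in this snippet; a plausible
    # minimal version (an assumption, not the original) would lemmatize,
    # lowercase and drop stop words / non-alphabetic tokens, e.g.:
    #   [t.lemma_.lower() for t in doc
    #    if t.is_alpha and t.lemma_.lower() not in stop_words]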

    unique_words = set().union(*id_to_tokens.values())
    vocabulary = list(unique_words)

    # Remove rare and overly common words from corpus
    filtered_tokens = filter_extremes(id_to_tokens.values(),
                                      vocabulary,
                                      more_than=10)
    id_to_filtered = {i: tokens for i, tokens in enumerate(filtered_tokens)}
    unique_words = set().union(*id_to_filtered.values())
    vocabulary = list(unique_words)
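    # `filter_extremes` is also not shown here; a rough sketch of the idea
    # (an assumption, not the original implementation) would keep only words
    # whose document frequency exceeds `more_than`:
    #   def filter_extremes(docs, vocabulary, more_than=10):
    #       from collections import Counter
    #       df = Counter(w for doc in docs for w in set(doc))
    #       keep = {w for w in vocabulary if df[w] > more_than}
    #       return [[w for w in doc if w in keep] for doc in docs]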