def __init__(self,
                 corpus_path=None,
                 corpus_frac=None,
                 max_df=0.65,
                 min_word_len=3,
                 max_tfidf_features=10000,
                 n_svd_components=100,
                 label_col=None,
                 id_col='id',
                 append_pos_tags=False):
        self.max_df = max_df
        self.min_word_len = min_word_len
        self.max_tfidf_features = max_tfidf_features
        self.n_svd_components = n_svd_components
        self.append_pos_tags = append_pos_tags

        self.stemmer = PorterStemmer()
        self.lemmatizer = WordNetLemmatizer()

        self.stopwords = []
        with (DATA_DIR / 'misc' / 'stopwords.txt').open('r') as fd:
            self.stopwords = [line.strip() for line in fd]

        self.regex = {
            'number':
            re.compile(r'[0-9]+'),
            'web_email':
            re.compile(
                r'((www.+?\s)|(http.+?\s)|([a-z]+?\@.+?\s)|(\.[a-z]{2,3}))'),
            'spacer':
            re.compile(r'[\_\-]'),
            'punct':
            re.compile(
                r'[\[\]\'\.,\/\#\!\?\$\%\^\&\*;\:{}=\_`~\(\)\n\r\<\>\@\\]+')
        }

        self.tokenizer = str.split

        self.corpus = self.load_corpus(corpus_path,
                                       label_col=label_col,
                                       corpus_frac=corpus_frac)
        self.training_corpus, self.testing_corpus = self.split_corpus()

        self.vectorizer_params = {
            # 'lowercase': True,  # Covered by preprocessor
            # 'stop_words': self.stopwords,  # Covered by preprocessor
            'analyzer': 'word',
            'preprocessor': self.preprocess,
            'tokenizer': self.tokenizer,
            'max_df': self.max_df,
            'max_features': self.max_tfidf_features,
        }

        self.svd_params = {'n_components': self.n_svd_components, 'n_iter': 5}

        self.count_vectorizer = None
        self.tfidf_transformer = TfidfTransformer()

        self.tfidf_vectorizer = None
        self.svd = None
        self.lsa = None

        self.set_vectorizers()
        self.set_svd()
import nltk
import pandas as pd
import numpy as np
import pickle
import re
from nltk.corpus import stopwords
from nltk import PorterStemmer, WordNetLemmatizer

data = pd.read_csv('spam.csv', sep=',', encoding='latin-1')
data.drop(['Unnamed: 2', 'Unnamed: 3', 'Unnamed: 4'], axis=1, inplace=True)

data['Type'] = data['Type'].map({'ham': 0, 'spam': 1})
X = data['Message']
y = data['Type']

stem = PorterStemmer()
corpus = []

english_stopwords = set(stopwords.words('english'))
for i in range(len(data)):
    words = re.sub('[^a-zA-Z]', ' ', data['Message'][i])
    words = words.lower()
    words = words.split()
    words = [stem.stem(word) for word in words if word not in english_stopwords]
    words = ' '.join(words)
    corpus.append(words)

#creating BagOfWords
from sklearn.feature_extraction.text import CountVectorizer
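# A minimal continuation sketch (not from the original snippet): vectorize the
# cleaned corpus into a bag-of-words matrix; max_features=2500 is an assumed
# value, and cv / X_bow are hypothetical names.
cv = CountVectorizer(max_features=2500)
X_bow = cv.fit_transform(corpus).toarray()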
best_seller_group = shampoo.groupby('best_selling')
best_seller_group.agg(['mean', 'std', 'median'])

rating_mask = shampoo['rating'].notnull()
rating_group = shampoo.loc[rating_mask, :].copy()
rating_group['rating'] = rating_group['rating'].astype('float')
rating_grouped = rating_group.groupby('best_selling')

rating_grouped.agg(['mean', 'std', 'median'])

# Natural Language Processing
from nltk.corpus import stopwords
stop = stopwords.words('english')
from textblob import TextBlob
from nltk import PorterStemmer
stemmer = PorterStemmer()
import nltk

df = pd.read_csv('description_df')
df['nlp_description'] = df['nlp_description'].astype('string')

# add product specific stop words
stop.extend([
    'shampoo', 'conditioner', 'soap', 'cleanse', 'hair', 'head', 'shoulders',
    'loréal', 'pari', 'product', 'help', 'use', 'free', 'make', 'type'
])

#Pre Processing
#remove stop words
df['nlp_description'] = df['nlp_description'].apply(
    lambda text: " ".join(word for word in text.split() if word not in stop))
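#stem words (sketch, not in the original snippet): applies the PorterStemmer
#instantiated above to the stop-word-filtered descriptions
df['nlp_description'] = df['nlp_description'].apply(
    lambda text: " ".join(stemmer.stem(word) for word in text.split()))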
def stem_words(f):
    stemmer = PorterStemmer()
    # assumes a `tokenize` helper is defined elsewhere in the original module
    processed = tokenize(f)
    return [stemmer.stem(token) for token in processed]
Example #5
 def stem_it(self):
     stemmer = PorterStemmer()
     self.word = stemmer.stem(self.word)
Example #6
def tfidf_classifier(fname):
    """Read ``fname + ".txt"``, count noun/adjective and verb/adverb phrases
    in the text, and dump the counts to ``fname + ".json"``."""
    with open(fname + ".txt", "r") as file:
        paragraph = file.read()

    #clean the extracted content
    paragraph = " ".join(re.findall(r"\b[a-z0-9]+\b", paragraph,
                                    flags=re.I)).lower()

    #get the part of speech for every word in the content
    pos_tag_words = pos_tag(paragraph.split())
    porter_stemmer_obj = PorterStemmer()
    stem = porter_stemmer_obj.stem
    pos_tag_words = [(str(stem(tag[0])),
                      tag[-1]) if tag[-1].startswith("VB") else tag
                     for tag in pos_tag_words]
    paragraph = " ".join([w[0] for w in pos_tag_words])

    #extract all the nouns, adjectives, adverbs and verbs from the paragraph
    temp_noun_adj_list = []
    temp_verb_adv_list = []
    all_words = []
    all_words_count_dict = {}
    for pos_words in pos_tag_words:
        if (pos_words[-1].startswith("NN") or pos_words[-1].startswith("JJ")):
            temp_noun_adj_list.append(pos_words[0])
            if len(temp_verb_adv_list) > 1:
                adv_verb_str = " ".join(temp_verb_adv_list)
                if adv_verb_str not in all_words_count_dict:
                    all_words_count_dict[adv_verb_str] = paragraph.count(
                        adv_verb_str)
                temp_verb_adv_list = []
            elif temp_verb_adv_list:
                if temp_verb_adv_list[0] not in all_words_count_dict:
                    all_words_count_dict[
                        temp_verb_adv_list[0]] = paragraph.count(
                            temp_verb_adv_list[0])
                temp_verb_adv_list = []
        elif pos_words[-1].startswith("VB"):
            temp_verb_adv_list.append(pos_words[0])
            if len(temp_noun_adj_list) > 1:
                adj_noun_str = " ".join(temp_noun_adj_list)
                if adj_noun_str not in all_words_count_dict:
                    all_words_count_dict[adj_noun_str] = paragraph.count(
                        adj_noun_str)
                temp_noun_adj_list = []
            elif temp_noun_adj_list:
                if temp_noun_adj_list[0] not in all_words_count_dict:
                    all_words_count_dict[
                        temp_noun_adj_list[0]] = paragraph.count(
                            temp_noun_adj_list[0])
                temp_noun_adj_list = []
        elif pos_words[-1].startswith("RB"):
            temp_verb_adv_list.append(pos_words[0])
            if len(temp_noun_adj_list) > 1:
                adj_noun_str = " ".join(temp_noun_adj_list)
                if adj_noun_str not in all_words_count_dict:
                    all_words_count_dict[adj_noun_str] = paragraph.count(
                        adj_noun_str)
                temp_noun_adj_list = []
            elif temp_noun_adj_list:
                if temp_noun_adj_list[0] not in all_words_count_dict:
                    all_words_count_dict[
                        temp_noun_adj_list[0]] = paragraph.count(
                            temp_noun_adj_list[0])
                temp_noun_adj_list = []
        else:
            if temp_noun_adj_list:
                adj_noun_str = " ".join(temp_noun_adj_list)
                if adj_noun_str not in all_words_count_dict:
                    all_words_count_dict[adj_noun_str] = paragraph.count(
                        adj_noun_str)
                temp_noun_adj_list = []
            if temp_verb_adv_list:
                adv_str = " ".join(temp_verb_adv_list)
                if adv_str not in all_words_count_dict:
                    all_words_count_dict[adv_str] = paragraph.count(adv_str)
                temp_verb_adv_list = []

    if len(temp_noun_adj_list) > 0:
        adj_noun_str = " ".join(temp_noun_adj_list)
        if adj_noun_str not in all_words_count_dict:
            all_words_count_dict[adj_noun_str] = paragraph.count(adj_noun_str)
    if len(temp_verb_adv_list) > 0:
        adv_str = " ".join(temp_verb_adv_list)
        if adv_str not in all_words_count_dict:
            all_words_count_dict[adv_str] = paragraph.count(adv_str)

    with open(fname + ".json", "w") as file:
        json.dump(all_words_count_dict, file)
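# Hypothetical usage sketch (the file name is an assumption, not from the
# source): reads "sample_document.txt" and writes the phrase counts to
# "sample_document.json"; requires re, json, pos_tag and PorterStemmer to be
# imported at module level.
if __name__ == "__main__":
    tfidf_classifier("sample_document")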
Example #7
def feature_maker(embed_file, dataframe, embed_signal='n'):
    '''Take a path to an embeddings file and a dataframe as input; the keyword
    argument embed_signal defaults to 'n', meaning embeddings are not encoded.
    Return an expanded dataframe with:
    a column of lemmatised words; a column of stemmed words; a column indicating
    capitalisation status; a column indicating the capitalisation status of the
    previous token; and columns indicating shape, previous shape, short shape,
    previous short shape, and the following token's short shape.
    If the kwarg embed_signal is 'y', a list of embeddings is also returned.
    '''

    wnl = WordNetLemmatizer()
    prtr = PorterStemmer()
    stringed_list = [str(x) for x in dataframe['token']]
    wn_lemma_list = [wnl.lemmatize(t) for t in stringed_list]
    dataframe['lemma'] = wn_lemma_list
    prtr_stemmer_list = [prtr.stem(t) for t in stringed_list]
    dataframe['stem'] = prtr_stemmer_list

    dataframe['caps'] = 'no caps'
    dataframe.loc[dataframe['token'].str.contains('^[A-Z][a-z]'),
                  ['caps']] = 'begin_cap'
    dataframe.loc[dataframe['token'].str.contains('[A-Z][A-Z]'),
                  ['caps']] = 'all_caps'
    dataframe.loc[dataframe['token'].str.contains('[a-z][A-Z]'),
                  ['caps']] = 'caps_inside'

    temp_list = dataframe['caps'].to_list()
    temp_list.insert(0, 'no_cap')
    temp_list.pop()
    dataframe['prev_caps'] = temp_list

    dataframe['short_shape'] = 'x'
    dataframe.loc[dataframe['token'].str.contains('^[A-Z][a-z]'),
                  ['short_shape']] = 'Xx'
    dataframe.loc[dataframe['token'].str.contains('[A-Z][A-Z]'),
                  ['short_shape']] = 'XX'
    dataframe.loc[dataframe['token'].str.contains('[a-z][A-Z]'),
                  ['short_shape']] = 'xXx'
    dataframe.loc[dataframe['token'].str.contains(r'\W'), ['short_shape']] = '-'

    prev_short_shape_list = []
    prev_short_shape_list = dataframe['short_shape'].to_list()
    prev_short_shape_list.insert(0, '-')
    prev_short_shape_list.pop()
    dataframe['prev_short_shape'] = prev_short_shape_list

    next_short_shape_list = []
    next_short_shape_list = dataframe['short_shape'].to_list()
    next_short_shape_list.pop(0)
    next_short_shape_list.append('-')
    dataframe['next_short_shape'] = next_short_shape_list

    shape_list = []
    pre_list = []
    suf_list = []
    for text in dataframe['token']:

        prefix = text[:3]
        suffix = text[-3:]
        pre_list.append(prefix)
        suf_list.append(suffix)
        replace_caps = re.sub('[A-Z]', 'X', text)
        replace_lowers = re.sub('[a-z]', 'x', replace_caps)
        replace_digits = re.sub(r'\d', 'd', replace_lowers)

        shape_list.append(replace_digits)

    dataframe['shape'] = shape_list

    prev_shape_list = []
    prev_shape_list = dataframe['shape'].to_list()
    prev_shape_list.insert(0, '-')
    prev_shape_list.pop()
    dataframe['prev_shape'] = prev_shape_list

    dataframe['prefix'] = pre_list
    dataframe['suffix'] = suf_list

    if embed_signal == 'y':
        word_embedding_model = gensim.models.KeyedVectors.load_word2vec_format(
            embed_file, binary=True)
        embeddings = []
        for token in dataframe['token']:
            if token in word_embedding_model:
                vector = word_embedding_model[token]
            else:
                vector = [0] * 300
            embeddings.append(vector)
        return dataframe, embeddings
    else:
        return dataframe
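# Hypothetical usage sketch: the 'token' column name follows the function above;
# the sample tokens and the word2vec file path are made-up examples.
import pandas as pd
sample_df = pd.DataFrame({'token': ['The', 'cats', 'RAN', 'quickly', '.']})
sample_df = feature_maker('GoogleNews-vectors-negative300.bin.gz', sample_df)
print(sample_df[['token', 'lemma', 'stem', 'caps', 'shape']])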
Example #8
def stem(word):
    word = PorterStemmer().stem(word)
    return word
def stemming_Porter(tokens):
    Stemmer = PorterStemmer()
    return [
        Token(Stemmer.stem(word.token), word.pos, forceToken=True)
        for word in tokens
    ]
 def stem(word_list):
     return map(lambda x: PorterStemmer().stem(x), word_list)
Example #11
def stemming(word):
    # PorterStemmer.stem_word was removed from NLTK; stem() is the current API
    word = PorterStemmer().stem(word.lower())
    return word

		listKata = []

		for genre,kata in lyricsData:
			if(genre == listGenre[choice-1]):
				for word in word_tokenize(kata):
					valid = True
					for w in word:
						if(w in string.punctuation):
							valid = False

					word = word.lower()

					if(word not in stopwords.words("english") and valid):
						word = WordNetLemmatizer().lemmatize(PorterStemmer().stem(word), pos='a')
						listKata.append(word)

		hasilFreqDist = FreqDist(listKata)

		print("20 most common words")
		print("===================")

		for kata,freq in hasilFreqDist.most_common(20):
			print(kata," -> ",freq)

		input("press enter to go back")

	elif(index == 3):
		saveFile = open('genre.pickle',"wb")
		pickle.dump(genreClassifier,saveFile)
Example #13
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""Python for AHDA.

Part 5, Example 7.
"""

# Stemming words - test your tools

from nltk import LancasterStemmer
from nltk import PorterStemmer

print('LancasterStemmer')
print(LancasterStemmer().stem('nation'))
print(LancasterStemmer().stem('nationality'))
print(LancasterStemmer().stem('nationally'))
print(LancasterStemmer().stem('natural'))
print(LancasterStemmer().stem('naturally'))
print(LancasterStemmer().stem('nature'))
print()
print('PorterStemmer')
print(PorterStemmer().stem('nation'))
print(PorterStemmer().stem('nationality'))
print(PorterStemmer().stem('nationally'))
print(PorterStemmer().stem('natural'))
print(PorterStemmer().stem('naturally'))
print(PorterStemmer().stem('nature'))
Example #14
detectClasses = u.detectClasses
extract_classes = u.extract_classes

classifier = MLPClassifier(verbose=True,
                           early_stopping=True,
                           max_iter=10,
                           hidden_layer_sizes=(300, 300),
                           tol=0.000001)  # F1=0.50
# classifier = RandomForestClassifier(max_depth=3000, n_jobs=4, n_estimators=20)  # F1=0.30
# classifier = ExtraTreeClassifier(max_depth=1000)  # F1=0.32
# classifier = GaussianNB() #not working with simultaneous multiclass
vectorizer = TFIDFVectorizer(mx_features=None,
                             ngram_range=(1, 2),
                             minDf=10,
                             maxDF=0.98,
                             token_transformer=PorterStemmer().stem)
# vectorizer = BagOfWordsVectorizer(mx_features=None, n_gram_range=(1, 2), minDf=10, maxDF=0.98, token_transformer=PorterStemmer().stem)

# ------------------- Configuration Section ---------------

print("Loading dataset")
dataset = pd.read_csv("C:\\tmp\\dabble\\movies_metadata.csv")

print("Preprocessing")
dataset = pre_process(dataset)  # lower case, cleanse, etc.

print("Detecting classes")
dataset, class_count = detectClasses(
    dataset, column=CLASS_COLUMN,
    prefix=CLASS_PREFIX)  # generates new columns, one per class
Example #15
quotes_token = nltk.word_tokenize(qt)

quotes_bigrams = list(nltk.bigrams(quotes_token))
print(quotes_bigrams)

quotes_trigrams = list(nltk.trigrams(quotes_token))
print(quotes_trigrams)

quotes_quadgrams = list(nltk.ngrams(quotes_token, 4))
print(quotes_quadgrams)

# stemming
from nltk import PorterStemmer

pst = PorterStemmer()
print(pst.stem("having"))
print(pst.stem("sudeep"))

words_stem = ["give", "giving", "given", "gave"]
for words in words_stem:
    print(words + " :" + pst.stem(words))

from nltk import LancasterStemmer

lnst = LancasterStemmer()
for words in words_stem:
    print(words + " :" + lnst.stem(words))

from nltk import SnowballStemmer
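# Continuation sketch (the snippet ends right after this import): mirror the
# Porter/Lancaster loops above with an English Snowball stemmer.
sbst = SnowballStemmer("english")
for words in words_stem:
    print(words + " :" + sbst.stem(words))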
def clean_text(text):
    tc = TextCleaner(text, PorterStemmer())
    return tc.remove_stop_words().remove_punctuation().stem().tokenize()
Example #17
def try_basic_query_tokenizer():
    stemmer = PorterStemmer()
    x = "answer(cityid('new york', _))"
    y = basic_query_tokenizer(
        x, strtok=lambda x: [stemmer.stem(xe) for xe in x.split()])
Example #18
def process_email(email_contents: str) -> List[int]:
    """Pre-process the body of an email and return a list of indices of the
    words contained in the email.

    :param email_contents: the body of an email
    :return: a list of indices of the words contained in the email
    """

    # Load the vocabulary.
    vocabulary_dict = get_vocabulary_dict()

    # Initialize the return value.
    word_indices = []

    # ========================== Preprocess Email ===========================

    # Find the Headers ( \n\n and remove )
    # Uncomment the following lines if you are working with raw emails with the
    # full headers

    # header_token = '\n\n'
    # header_start = email_contents.find(header_token)
    # email_contents = email_contents[header_start+len(header_token):]

    # Convert email content to lower case.
    email_contents = email_contents.lower()

    # Strip all HTML.
    # Look for any expression that starts with < and ends with >, contains no
    # < or > inside the tag, and replace it with a space.
    email_contents = re.sub('<[^<>]+>', ' ', email_contents)

    # Handle numbers.
    # Convert all sequences of digits (0-9) to a 'number' token.
    email_contents = re.sub('[0-9]+', 'number', email_contents)

    # Handle URLs.
    # Convert all strings starting with http:// or https:// to a 'httpaddr' token.
    email_contents = re.sub(r'(http|https)://\S*', 'httpaddr',
                            email_contents)

    # Handle email addresses.
    # Convert all strings with @ in the middle to an 'emailaddr' token.
    email_contents = re.sub(r'[^\s]+@[^\s]+', 'emailaddr', email_contents)

    # Handle $ sign.
    # Convert all sequences of $ signs to a 'dollar' token.
    email_contents = re.sub('[$]+', 'dollar', email_contents)

    # ========================== Tokenize Email ===========================

    # Output the email to screen as well
    print('\n==== Processed Email ====\n\n')
    # Process file
    col = 0

    # Tokenize and also get rid of any punctuation
    tokens = re.split(r'[ @$/#.\-:&*+=\[\]?!(){},">_<;#\n\r]',
                      email_contents)

    for token in tokens:

        # Remove any non alphanumeric characters
        token = re.sub('[^a-zA-Z0-9]', '', token)

        # Stem the word
        token = PorterStemmer().stem(token.strip())

        # Skip the word if it is too short
        if len(token) < 1:
            continue

        # Look up the word in the dictionary and add to word_indices if
        # found
        for i, word in vocabulary_dict.items():
            if token == word:
                word_indices.append(i)

        # Print to screen, ensuring that the output lines are not too long
        if (col + len(token) + 1) > 78:
            print('')
            col = 0
        print('{} '.format(token), end='', flush=True)
        col = col + len(token) + 1

    # Print footer
    print('\n\n=========================\n')

    return word_indices
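# Hypothetical usage sketch: the email body is made up, and get_vocabulary_dict()
# must be available as assumed by the function above.
sample_body = 'Click http://example.com now and win $100 - reply to winner@spam.com'
indices = process_email(sample_body)
print(indices)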
import pandas as pd
from nltk.corpus import stopwords
from nltk import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer
import string
import gensim
from gensim import corpora
# from nltk.tokenize import word_tokenize

df = pd.read_json('related_data_rm_duplicacy.json')
QATags = df.content
# print(QATags)
QATags = list(QATags)
# print(QATags[:10])

stop = set(stopwords.words('english'))
exclude = set(string.punctuation)
lemma = WordNetLemmatizer()
port = PorterStemmer()


def clean(doc):
    stop_free = " ".join([i for i in doc.lower().split() if i not in stop])
    # print(stop_free)
    punc_free = ''.join(ch for ch in stop_free if ch not in exclude)
    # print(punc_free)
    normalized = " ".join(lemma.lemmatize(word) for word in punc_free.split())
    stem = " ".join(port.stem(word) for word in normalized.split())
    remove_non_english = stem.encode("ascii", errors="ignore").decode()
    return remove_non_english


Text_clean = [clean(doc).split() for doc in QATags]
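# Continuation sketch: the gensim/corpora imports above are otherwise unused
# here, so a plausible next step is building a dictionary and bag-of-words
# corpus from the cleaned documents (variable names are assumptions).
dictionary = corpora.Dictionary(Text_clean)
doc_term_matrix = [dictionary.doc2bow(doc) for doc in Text_clean]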
Example #20
 def stem(self):
     """
     Description: stem tokens with Porter Stemmer.
     """
     self.tokens = np.array([PorterStemmer().stem(t) for t in self.tokens])
def stem(array):
    stemmer = PorterStemmer()
    return [stemmer.stem(w) for w in array]
Example #22
 def __init__(self):
     self.ps = PorterStemmer()
Example #23
 def __init__(self):
     self.stemmer = PorterStemmer()
 def __init__(self):
     self.speechProcessor = SpeechProcessor()
     self.stemmer = PorterStemmer()
     self.propositions = []
     self.synsetsList = []
Example #25
from collections import defaultdict
import re
import json
from nltk import PorterStemmer
from nltk.corpus import words
import math
import string

INDEX_DICT = {}
#DOC_ID_DICT = {}
directory = "C:\\Users\\tajun\\PycharmProjects\\ICS-121\\DevlopZip\\DEV"
doc_counter = 0
partial_counter = 0
NumOfDocs = 0
ps = PorterStemmer()
token_count = 0
output_dict = {}  # maps file number -> (word, [list of postings])
skip_count = 0


class Postings:  #each doc id is a posting?
    def __init__(self, docid, positions):
        self.docid = docid
        self.positions = positions
        self.tfidf = 0  # use freq counts for now

    #   self.fields = fields


# Takes in a file name to tokenize and returns a list of tokens; it should
# perhaps return a list of lists where the first element is the token, the
# second its count, and so on.
Example #26
def run(
    lr=0.001,
    batsize=20,
    epochs=100,
    embdim=64,
    encdim=128,
    numlayers=1,
    dropout=.25,
    wreg=1e-10,
    cuda=False,
    gpu=0,
    minfreq=2,
    gradnorm=3.,
    beamsize=1,
    cosine_restarts=1.,
    seed=456789,
):
    # DONE: Porter stemmer
    # DONE: linear attention
    # DONE: grad norm
    # DONE: beam search
    # DONE: lr scheduler
    print(locals())
    torch.manual_seed(seed)
    np.random.seed(seed)
    tt = q.ticktock("script")
    device = torch.device("cpu") if not cuda else torch.device("cuda", gpu)
    tt.tick("loading data")
    stemmer = PorterStemmer()
    tokenizer = lambda x: [stemmer.stem(xe) for xe in x.split()]
    ds = GeoQueryDatasetFunQL(
        sentence_encoder=SequenceEncoder(tokenizer=tokenizer),
        min_freq=minfreq)

    train_dl = ds.dataloader("train", batsize=batsize)
    test_dl = ds.dataloader("test", batsize=batsize)
    tt.tock("data loaded")

    do_rare_stats(ds)

    # batch = next(iter(train_dl))
    # print(batch)
    # print("input graph")
    # print(batch.batched_states)

    model = create_model(embdim=embdim,
                         hdim=encdim,
                         dropout=dropout,
                         numlayers=numlayers,
                         sentence_encoder=ds.sentence_encoder,
                         query_encoder=ds.query_encoder,
                         feedatt=True)

    # model.apply(initializer)

    tfdecoder = SeqDecoder(
        model,
        tf_ratio=1.,
        eval=[
            CELoss(ignore_index=0, mode="logprobs"),
            SeqAccuracies(),
            TreeAccuracy(
                tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab))
        ])

    losses = make_array_of_metrics("loss", "elem_acc", "seq_acc", "tree_acc")
    # beamdecoder = BeamActionSeqDecoder(tfdecoder.model, beamsize=beamsize, maxsteps=50)
    if beamsize == 1:
        freedecoder = SeqDecoder(
            model,
            maxtime=100,
            tf_ratio=0.,
            eval=[
                SeqAccuracies(),
                TreeAccuracy(
                    tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab))
            ])

        vlosses = make_array_of_metrics("seq_acc", "tree_acc")
    else:
        print("Doing beam search!")
        freedecoder = BeamDecoder(
            model,
            beamsize=beamsize,
            maxtime=60,
            eval=[
                SeqAccuracies(),
                TreeAccuracy(
                    tensor2tree=partial(tensor2tree, D=ds.query_encoder.vocab))
            ])

        vlosses = make_array_of_metrics("seq_acc", "tree_acc")
    # # test
    # tt.tick("doing one epoch")
    # for batch in iter(train_dl):
    #     batch = batch.to(device)
    #     ttt.tick("start batch")
    #     # with torch.no_grad():
    #     out = tfdecoder(batch)
    #     ttt.tock("end batch")
    # tt.tock("done one epoch")
    # print(out)
    # sys.exit()

    # beamdecoder(next(iter(train_dl)))

    # print(dict(tfdecoder.named_parameters()).keys())

    # 4. define optim
    optim = torch.optim.Adam(tfdecoder.parameters(), lr=lr, weight_decay=wreg)
    # optim = torch.optim.SGD(tfdecoder.parameters(), lr=lr, weight_decay=wreg)

    # lr schedule
    if cosine_restarts >= 0:
        # t_max = epochs * len(train_dl)
        t_max = epochs
        print(f"Total number of updates: {t_max} ({epochs} * {len(train_dl)})")
        lr_schedule = q.WarmupCosineWithHardRestartsSchedule(
            optim, 0, t_max, cycles=cosine_restarts)
        reduce_lr = [lambda: lr_schedule.step()]
    else:
        reduce_lr = []

    # 6. define training function (using partial)
    clipgradnorm = lambda: torch.nn.utils.clip_grad_norm_(
        tfdecoder.parameters(), gradnorm)
    trainbatch = partial(q.train_batch, on_before_optim_step=[clipgradnorm])
    trainepoch = partial(q.train_epoch,
                         model=tfdecoder,
                         dataloader=train_dl,
                         optim=optim,
                         losses=losses,
                         _train_batch=trainbatch,
                         device=device,
                         on_end=reduce_lr)

    # 7. define validation function (using partial)
    validepoch = partial(q.test_epoch,
                         model=freedecoder,
                         dataloader=test_dl,
                         losses=vlosses,
                         device=device)
    # validepoch = partial(q.test_epoch, model=tfdecoder, dataloader=test_dl, losses=vlosses, device=device)

    # 7. run training
    tt.tick("training")
    q.run_training(run_train_epoch=trainepoch,
                   run_valid_epoch=validepoch,
                   max_epochs=epochs)
    tt.tock("done training")
Example #27
 def s(tokens):
     return [PorterStemmer().stem(t) for t in tokens]
 def __init__(self):
     super().__init__()
     self._stemmer = PorterStemmer()
Example #29
def queryResults(queryString, vocabDict, documents, numberOfRowsForResults):
    stop_words = set(stopwords.words('english'))
    scores = {}
    N = len(documents)
    queryString = queryString.lower()
    #queryStringExpansion = queryExpansionMethod(model_glove_twitter,queryString)
    queryStringExpansion = queryString
    # create our tokenizer that will also remove punctuation
    tokenizer = RegexpTokenizer(r'\w+')
    # strip apostrophes so contractions like I'm and can't become Im and cant
    queryString = queryString.replace("'", "")
    # tokenize here
    queryString = tokenizer.tokenize(queryString)
    # remove stop words and stem the remaining tokens
    porterStemmer = PorterStemmer()
    queryString = [
        porterStemmer.stem(w) for w in queryString if w not in stop_words
    ]

    # collect the weights for the query string and its length
    weightsForQuery = {}
    lengthOfQuery = 0
    for stemword in queryString:
        if stemword.isnumeric():
            continue
        #adding check here so see if the stem word is actually in our vocab. If it's not then we can simply skip it
        if stemword not in vocabDict:
            continue
        # docsFoundForStemWord = vocabDict[stemword]
        # calculate weight for query word i
        df_i = vocabDict[stemword][0]
        tf_iq = queryString.count(stemword) / len(queryString)
        idf = math.log((N / df_i), 2)
        w_iq = (0.5 + 0.5 * tf_iq) * idf
        if stemword not in weightsForQuery:
            weightsForQuery[stemword] = w_iq
            lengthOfQuery += w_iq**2

    # we now have the length of the query vector and a dict of weights w_iq
    lengthOfQuery = math.sqrt(lengthOfQuery)

    # print(weightsForQuery)

    for word in weightsForQuery:
        docsFoundForStemWord = vocabDict[word][1]
        for doc in docsFoundForStemWord:
            scores[doc] = cosineCalculator(doc, documents, lengthOfQuery,
                                           weightsForQuery)

    arrayOfSortedScoresTuples = sorted(scores.items(),
                                       key=lambda x: x[1],
                                       reverse=True)
    # dictionary that stores the documents and their new scores after query expansion
    arrayOfSortedScoresTuplesExpanded = {}
    for i in range(len(arrayOfSortedScoresTuples)):
        docId = arrayOfSortedScoresTuples[i][0]
        originalScore = arrayOfSortedScoresTuples[i][1]
        docSentence = documents[docId][0]  #get sentence
        #get the tokens in our twitter embedding model
        tokens_1 = [t for t in docSentence.split() if t in model_glove_twitter]
        tokens_2 = [
            t for t in queryStringExpansion.split() if t in model_glove_twitter
        ]
        cosine = 0
        if (len(tokens_1) > 0 and len(tokens_2) > 0):
            cosine = model_glove_twitter.n_similarity(tokens_1, tokens_2)
            #take the average of both scores!
            newScoreAvg = (originalScore + cosine) / 2
            #store the score with the document
            arrayOfSortedScoresTuplesExpanded[docId] = newScoreAvg
    #sort by highest value!
    arrayOfSortedScoresTuplesExpanded = sorted(
        arrayOfSortedScoresTuplesExpanded.items(),
        key=lambda x: x[1],
        reverse=True)
    return arrayOfSortedScoresTuplesExpanded[:numberOfRowsForResults]
def stemming_by_portter_1(term):
    return PorterStemmer().stem(term)