Example no. 1
import collections
import math
import pickle
import string

import nltk
import preprocess
from names_dataset import NameDataset
from nltk.corpus import wordnet

# remove_stopwords, stem_words and create_inverted_index are helpers defined elsewhere in this project.

def clean_text(s):
    s = preprocess.to_lower_case(s)
    s = preprocess.remove_special_chars(s)
    s = preprocess.remove_accents(s)
    s = preprocess.remove_stopwords(s)
    s = preprocess.remove_punctuation(s)
    s = preprocess.remove_extra_space(s)
    return s
def index_document(document, inverted_index, movieID):
    tokens = nltk.word_tokenize(document)                               # Tokenize the script/synopsis
    tokens = [x for x in tokens if x not in string.punctuation]
    tokens = remove_stopwords(tokens)                                   # Remove the stopwords
    tokens = stem_words(tokens)                                         # Stem words

    for i in range(0, len(tokens)):
        tokens[i] = tokens[i].lower()                                   # Makes all words lowercase

    create_inverted_index(inverted_index, movieID, tokens)              # Create the inverted index
def calculate_query_TFIDF(query_string, inverted_index, num_files, profile):
    # List of words to remove words from profile text that appear often but have no bearing on user's likes/dislikes
    words_to_remove = ["birthday", "bday", "facebook", "lol", "thank", "christmas", "hanukkah", "happy"]

    # First we must preprocess the query (social media profile)
    m = NameDataset()
    tokens = nltk.word_tokenize(query_string)                           # Tokenizes the string using NLTK
    tokens = [x for x in tokens if x not in string.punctuation]         # Don't include punctuation
    query_tokens = remove_stopwords(tokens)                             # Remove the stopwords

    # Only include words that are: 1) in English, 2) not in words_to_remove, 3) not a first or last name
    query_tokens = [x for x in query_tokens if (wordnet.synsets(x) and x not in words_to_remove and
                                                not m.search_first_name(x) and not m.search_last_name(x))]

    query_tokens = stem_words(query_tokens)                             # Stem words for preprocessing

    for i in range(0, len(query_tokens)):                               # Converts all tokens to lowercase
        query_tokens[i] = query_tokens[i].lower()

    query_tokens = [x for x in query_tokens if x != 'birthdai']         # Makes sure this common word doesn't appear
    query_appearances = collections.Counter()
    query_weights = [0] * len(inverted_index)                           # Initialize vector to hold query weights
    query_length = 0.0
    l = list(inverted_index.keys())                                     # Ordered list of index terms; position = vector index

    for query_token in query_tokens:                                    # Counter that keeps track of word appearances
        query_appearances[query_token] += 1

    # Iterate through each term in the query vector and assign nonzero weight if the term appears in inverted index
    for query_term in query_appearances:
        if query_term in inverted_index:
            index_of_word = l.index(query_term)                         # Since ordered dict, calculate index of term
            num_postings = inverted_index[query_term].length + 0.0      # Document frequency
            idf = math.log10(num_files / num_postings)                  # Inverse document frequency
            tf = query_appearances[query_term]                          # Term frequency
            query_weights[index_of_word] = tf * idf                     # Query weight
            query_length += (tf * idf) * (tf * idf)                     # Update running total for query length

    query_length = math.sqrt(query_length)                              # Calculate final query length

    # Persist the query data with pickle
    with open("data/" + profile + "/query_appearances.pickle", "wb") as pickle_out:
        pickle.dump(query_appearances, pickle_out)

    with open("data/" + profile + "/query_weights.pickle", "wb") as pickle_out:
        pickle.dump(query_weights, pickle_out)

    return (query_weights, query_length, query_appearances)             # Returns the tuple of necessary data
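# A minimal, hedged usage sketch: rank documents against the query vector returned above
# by cosine similarity. `document_weights` (movieID -> TF-IDF vector over the same term
# ordering) and `document_lengths` (movieID -> vector length) are illustrative assumptions,
# not values produced by the code above.
def rank_documents(query_weights, query_length, document_weights, document_lengths):
    scores = {}
    for movie_id, doc_vector in document_weights.items():
        dot_product = sum(qw * dw for qw, dw in zip(query_weights, doc_vector))
        denominator = query_length * document_lengths[movie_id]
        scores[movie_id] = dot_product / denominator if denominator else 0.0
    # Highest cosine similarity first
    return sorted(scores.items(), key=lambda item: item[1], reverse=True)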
Example no. 4
import ssl

import networkx as nx
import numpy as np
import pandas as pd
import spacy
from sklearn.metrics.pairwise import cosine_similarity

import preprocess

nlp = spacy.load("en_core_web_sm")    # assumed: the spaCy pipeline used for entity recognition below

def summarize(article):
    try:
        _create_unverified_https_context = ssl._create_unverified_context
    except AttributeError:
        pass
    else:
        ssl._create_default_https_context = _create_unverified_https_context

    #nltk.download('averaged_perceptron_tagger')
    sentences = preprocess.tokenize_sentences(article)
    clean_sentences = pd.Series(sentences).str.replace("[^a-zA-Z]", " ", regex=True)
    clean_sentences = [s.lower() for s in clean_sentences]
    clean_sentences = [
        preprocess.remove_stopwords(r.split()) for r in clean_sentences
    ]

    word_embeddings = {}
    f = open('/Users/apple/Downloads/glove.6B/glove.6B.100d.txt',
             encoding='utf-8')
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        word_embeddings[word] = coefs
    f.close()

    sentence_vectors = []
    for i in clean_sentences:
        if len(i) != 0:
            v = sum(
                [word_embeddings.get(w, np.zeros((100, )))
                 for w in i.split()]) / (len(i.split()) + 0.001)
        else:
            v = np.zeros((100, ))
        sentence_vectors.append(v)

    sim_mat = np.zeros([len(sentences), len(sentences)])
    for i in range(len(sentences)):
        for j in range(len(sentences)):
            if i != j:
                sim_mat[i][j] = cosine_similarity(
                    sentence_vectors[i].reshape(1, 100),
                    sentence_vectors[j].reshape(1, 100))[0, 0]

    nx_graph = nx.from_numpy_array(sim_mat)
    scores = nx.pagerank(nx_graph)

    ranked_sentences = sorted(
        ((scores[i], s) for i, s in enumerate(sentences)), reverse=True)

    # r = Rake()

    ques = []

    for i in range(len(ranked_sentences)):
        tokens = []
        print(ranked_sentences[i][1])
        article = ranked_sentences[i][1]
        print("Article:", article)
        # r.extract_keywords_from_text(ranked_sentences[i][1])
        # print("*********************")
        # print(r.get_ranked_phrases()) # To get keyword phrases ranked highest to lowest.
        # tokens.extend(r.get_ranked_phrases())
        # lis = []
        # for i in range(len(tokens)):
        #     if len(tokens[i].split()) > 1:
        #         lis.extend(nltk.word_tokenize(tokens[i]))
        #
        #     else:
        #         lis.append(tokens[i])
        # print("Parts of speech tagging: ", pos_tag(lis))
        # for i in range(len(ranked_sentences)):
        doc = nlp(article)
        print("DOC", doc.ents)
        print([(X.text, X.label_) for X in doc.ents])
        for X in doc.ents:
            if X.label_:
                print("Inside for")
                article = article.replace(X.text, "__________")
                ques.append(article)
                break
                #print(ques)
                #print(type(ques))
        print(i + 1, ":", article)

    print(ques)
    return ques
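# A minimal, hedged usage sketch: run summarize() on a short article and print the
# fill-in-the-blank questions it produces. The sample text is illustrative; the GloVe
# file path and spaCy model referenced above must be available locally.
if __name__ == "__main__":
    sample_article = (
        "Marie Curie was a physicist and chemist who conducted pioneering research on "
        "radioactivity. She was the first woman to win a Nobel Prize."
    )
    for question in summarize(sample_article):
        print(question)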
Example no. 5
import gensim
import spacy
from gensim import corpora

# get_file_lines, clean_text, remove_stopwords, apply_stemming, apply_lemmatization
# and lemmatization are helpers defined elsewhere in this project.

# 1. a
lines = get_file_lines(test_file_name)
print("***********Test-1***********")
print(lines)
print("****************************")

# 1. b
for i in range(len(lines)):
    lines[i] = clean_text(lines[i])
print("***********Test-2***********")
print(lines)
print("****************************")

# 1.c
for i in range(len(lines)):
    lines[i] = remove_stopwords(stopwords_file, lines[i], do_clean=True)
print("***********Test-3***********")
print(lines)
print("****************************")

# 1.d
for i in range(len(lines)):
    lines[i] = apply_stemming(lines[i])
    lines[i] = apply_lemmatization(lines[i])
print("***********Test-4***********")
print(lines)
print("****************************")

# 1.e
print("***********Test-5***********")
# Assumption: data_words (the gensim input for Test-5) is built from the preprocessed lines
data_words = [line.split() for line in lines]

bigram = gensim.models.Phrases(data_words, min_count=5, threshold=100)
trigram = gensim.models.Phrases(bigram[data_words], threshold=100)

bigram_mod = gensim.models.phrases.Phraser(bigram)
trigram_mod = gensim.models.phrases.Phraser(trigram)


def make_bigrams(texts):
    return [bigram_mod[doc] for doc in texts]


def make_trigrams(texts):
    return [trigram_mod[bigram_mod[doc]] for doc in texts]


data_words_nostops = remove_stopwords(data_words)
data_words_bigram = make_bigrams(data_words_nostops)
nlp = spacy.load('en_core_web_sm', disable=['parser', 'ner'])

data_lemmatized = lemmatization(data_words_bigram,
                                allowed_postags=['NOUN', 'ADJ', 'VERB', 'ADV'])

id2word = corpora.dictionary.Dictionary(data_lemmatized)
texts = data_lemmatized
corpus = [id2word.doc2bow(text) for text in texts]

lda_model = gensim.models.ldamodel.LdaModel(corpus=corpus,
                                            id2word=id2word,
                                            num_topics=3,
                                            random_state=100,
                                            update_every=1)
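# A minimal, hedged follow-up sketch: inspect the learned topics and estimate topic
# coherence for the LdaModel trained above. The num_words value and the 'c_v' coherence
# measure are illustrative choices, not taken from the original script.
from gensim.models import CoherenceModel

for topic_id, topic in lda_model.print_topics(num_words=8):
    print(topic_id, topic)                                   # top words for each topic

coherence_model = CoherenceModel(model=lda_model, texts=data_lemmatized,
                                 dictionary=id2word, coherence='c_v')
print("Coherence score:", coherence_model.get_coherence())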
Example no. 7
    def test_remove_stopwords(self):
        t1 = "There is not way in hell I'm gonna wait till 1am for transfer news. :)"
        t2 = "I had a very vivid dream that I was pregnant last night, (like, scary real) and today I've felt off. And something still doesn't feel right."

        self.assertEqual(p.remove_stopwords("not", ["not"]), "not")
Example no. 9
import os

import nltk
import xlwt

import preprocess
def intersect(a, b):
	return list(set(a) & set(b))
	
files = [x for x in os.listdir(os.getcwd() + "/Output") if "_output" in x]
glo = [[0, 0, 0] for _ in range(9)]
for File in files:
	targetFile = File.split("_")[0]          # part of the file name before the first underscore
	
	target=open(os.getcwd()+"/clusters _summarized(2)/"+targetFile)
	summary1=nltk.word_tokenize(preprocess.remove_stopwords((target.readline()[:-1]).lower()))
	summary2=nltk.word_tokenize(preprocess.remove_stopwords((target.readline()[:-1]).lower()))
	
	print(File)
	length_summary1=len(summary1)
	print("K: "+str(summary1))
	length_summary2=len(summary2)
	print("P: "+str(summary2))
	target.close()
	
	book = xlwt.Workbook(encoding="utf-8")
	sheet1 = book.add_sheet("Sheet 1",cell_overwrite_ok=True)
	
	counter=0
	sheet1.write(counter,0,"Base")
	sheet1.write(counter,1,"Precision")
Example no. 10
import preprocess
import pandas as pd

filename = 'dataset_related.csv'
output = 'data_related_extracted/data_related_extracted_remstop.txt'

df = pd.read_csv(filename)
df_extract = df.loc[:, ['tweet', 'class']]
df_extract = df_extract.dropna()
df_extract = df_extract.drop_duplicates()

tweets = df_extract.tweet.values.tolist()
classes = df_extract['class'].values.tolist()

# Preprocess
for i in range(len(tweets)):
    tweets[i] = tweets[i].replace('\n', ' ')
    tweets[i] = preprocess.preprocess(tweets[i])
    tweets[i] = preprocess.remove_punc(tweets[i])
    tweets[i] = preprocess.lemmatize(tweets[i])
    tweets[i] = preprocess.remove_stopwords(tweets[i])

s = []
for tweet, cl in zip(tweets, classes):
    s.append(tweet + '\t' + str(int(cl)) + '\n')

with open(output, 'wb') as f:
    for x in s:
        f.write(x.encode('utf-8'))
Example no. 11
from underthesea import pos_tag, word_tokenize
import preprocess
from nltk.tokenize import word_tokenize as word_tokenize1
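# A minimal, hedged sketch of the relevance idea noted in the comments inside the loop
# below: represent the query and each candidate as bag-of-words vectors, score every
# query-candidate pair by cosine similarity, and keep the top k. `candidates` and `k`
# are illustrative assumptions.
import math
from collections import Counter

def cosine(u, v):
    dot = sum(u[w] * v[w] for w in u if w in v)
    norm = math.sqrt(sum(x * x for x in u.values())) * math.sqrt(sum(x * x for x in v.values()))
    return dot / norm if norm else 0.0

def top_k_candidates(query, candidates, k=5):
    q_vec = Counter(word_tokenize1(preprocess.remove_stopwords(query.lower())))
    scored = [(cosine(q_vec, Counter(word_tokenize1(preprocess.remove_stopwords(c.lower())))), c)
              for c in candidates]
    return sorted(scored, reverse=True)[:k]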

while True:
    question = input('Query: ')
    print(pos_tag(question))
    print(word_tokenize(question))
    print(preprocess.remove_stopwords(question.lower()))
    print(word_tokenize1(question.lower()))

    # One query Q against n candidates C: score every (Q, C) pair for relevance, then keep the top k candidates
    # (a bag-of-words version of this is sketched in top_k_candidates above)
    # Q and each C are represented as vectors, with one vector element per word
    
Example no. 12
def third_problem(a, b, c):
    # a: query string, b: list of tale texts, c: list of tale titles (assumed parallel to b)
    tree = stree.Suffix_tree()

    for i in range(0, len(b)):
        tree.add(b[i])

    rank = [0] * len(c)
    count = 1
    out = tree.search(a)

    # Pass 1: rank tales that contain the full query string (exact matches found in the suffix tree)
    if out != -1:
        out.sort()
        k = 0
        out2 = []

        while k < len(out):
            out3 = [0, out[k][0]]
            j = 1

            while k + j < len(out) and out[k][0] == out[k + j][0]:
                j += 1

            out3[0] += j
            out2.append(out3)
            k += j

        out2.sort()
        out2.reverse()
        for j in range(0, len(out2)):
            rank[out2[j][1]] = count
            count += 1

    if 0 in rank:
        # Pass 2: for tales still unranked, score by how many individual (stopword-filtered) query words they contain
        d = (r.remove_stopwords([a]))[0].split(" ")
        outing = [[0, 0]] * len(rank)
        for j in range(0, len(outing)):
            outing[j] = [0, j]

        for i in d:
            out4 = tree.search(i)
            if out4 != -1:
                for j in range(0, len(out4)):
                    if out4[j][0] < len(outing):
                        outing[out4[j][0]][0] += 1

        outing.sort()
        outing.reverse()

        k = 0
        while k < len(outing) and outing[k][0] != 0:
            if rank[outing[k][1]] == 0:
                rank[outing[k][1]] = count
                count += 1
            k += 1

    if 0 in rank:
        # Pass 3: for tales still unranked, score by the total length of the query substrings they contain
        subs2 = get_all_substrings(a)
        i = len(subs2) - 2

        outing = [[0, 0]] * len(rank)

        for j in range(0, len(outing)):
            outing[j] = [0, j]

        while i >= 0:
            output = tree.search(subs2[i])
            if output != -1:
                for j in range(0, len(output)):
                    if output[j][0] < len(outing):
                        outing[output[j][0]][0] += len(subs2[i])
            i -= 1

        outing.sort()
        outing.reverse()

        k = 0
        while k < len(outing) and outing[k][0] != 0:
            if rank[outing[k][1]] == 0:
                rank[outing[k][1]] = count
                count += 1
            k += 1

    # Any tale still unranked shares the final rank
    if 0 in rank:
        for q in range(0, len(rank)):
            if rank[q] == 0:
                rank[q] = count

    last_out = [[0, 0]] * len(c)

    for j in range(0, len(rank)):
        last_out[j] = [rank[j], j]

    last_out.sort()
    print(
        "\t\tTITLES OF THE TALES IN ORDER OF RELEVANCE (FROM HIGHEST TO LOWEST) FOR THE QUERY STRING '",
        a, "' ARE =>\n\n")
    for i in last_out:
        print("\t\t\t\t'", c[i[1]], "'\n")
Example no. 13
#!/usr/bin/env python3.6

import preprocess
A = open('data/FS_FSociety.txt').read()
B = preprocess.normalize(A).lower()
C = preprocess.remove_stopwords(B)
from preprocess import CollocationList
coll2 = CollocationList(C)
coll2.find_collocations()
collocations = coll2.head(40)
D = preprocess.utils.hypenation(C,collocations)
coll3 = CollocationList(D)
coll3.find_collocations()
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r"\s+", gaps=True)
tokens = tokenizer.tokenize(D)
print ("Total inicial de palabras: ", len(B.split()))
print ("Total sin stopwords: ", len(C.split()))
print ("Total after collocations hypen: ", len(D.split()))
tokens_unique = set(tokens)
print ("Palabras únicas:", len(tokens_unique))
# Initialize a dictionary to hold the number of occurrences of each word.
dict = {}
for word in tokens_unique:
    dict[word] = 0
# Dictionary mapping word -> number of occurrences.
for token in tokens:
    dict[token] += 1
# Working with tuples may be easier later: a list of [count, word] pairs.
tupla = []