Example no. 1
def text_summarization_main(ORIGINAL_TEXT):
    STOPWORDS = set(stopwords.words('english'))
    STOPWORDS.add("-")
    ORIGINAL_TEXT = str(ORIGINAL_TEXT)
    # mark sentence boundaries with a sentinel token before stripping punctuation
    intermedia_text = ORIGINAL_TEXT.lower().replace(". ", " qwertyuiop")
    intermedia_text = re.sub('[^a-zA-Z]', ' ', intermedia_text)
    intermedia_text = re.sub(r'\s+', ' ', intermedia_text)
    intermedia_text = intermedia_text.split(" qwertyuiop")

    average_sentence_word_count = len(intermedia_text)
    sum_word_count = 0
    for c, text in enumerate(intermedia_text):
        intermedia_text[c] = ' '.join(
            [word for word in text.split() if word not in STOPWORDS])
        sum_word_count += len(intermedia_text[c].split(" "))

    average_sentence_word_count = sum_word_count / average_sentence_word_count

    sentence_scores = get_text_weighted_score(intermedia_text,
                                              average_sentence_word_count)
    original_dict = {}
    ORIGINAL_TEXT = ORIGINAL_TEXT.split(". ")
    for i, sentences in enumerate(sentence_scores.items()):
        original_dict[ORIGINAL_TEXT[i]] = sentences[1]
    sorted_sentences = sorted(original_dict.items(),
                              key=lambda x: x[1],
                              reverse=True)
    final_list = []
    for i, s in enumerate(sorted_sentences):
        if i < 10:
            final_list.append(s[0])
    return final_list
Example no. 2
def pre_process_text(ORIGINAL_TEXT):
    """Polishes text"""
    STOPWORDS = set(stopwords.words('english'))
    STOPWORDS.add("-")
    frp = []
    for c in ORIGINAL_TEXT:
        reg = c.lower()
        reg = ' '.join([word for word in reg.split() if word not in STOPWORDS])
        reg = re.sub('[^a-zA-Z]', ' ', reg)
        reg = re.sub(r'\s+', ' ', reg)
        frp.append(reg)
    return frp
Example no. 3
    def preprocess(self, text, min_len=2, max_len=240, remove_common=False):
        '''
        Function to remove stop words and perform lemmatization.

        INPUT:
            - text (str): tweet text.
            - min_len (int): words with fewer characters than min_len will be removed.
            - max_len (int): words with more characters than max_len will be removed.
            - remove_common (bool): add words that are common in the corpus to the stopwords list.
        OUTPUT: list of cleaned, lemmatized and stemmed tokens
        '''

        result = []
        stopwords = STOPWORDS.copy()
        stopwords = set(stopwords)
        spanish = self._get_spanish_stopwords()
        stopwords.update(spanish)
        stopwords.update(['http', 'f**k', 'rt'])
        if remove_common:
            stopwords.update(['google', 'apple', 'twitter', 'microsoft'])

        for token in gensim.utils.simple_preprocess(text,
                                                    min_len=min_len,
                                                    max_len=max_len):
            if token not in stopwords:
                result.append(self._lemmatize_stemming(token))
        return result
Example no. 4
def make_stop_words():
    global stop_words
    letters = list('abcdefghijklmnopqrstuvwxyz')
    numbers = list('0123456789')
    words = ['oz', 'ml', 'pour', 'poured', 'bottle', 'can', 'ounce',\
         'bomber', 'botttle', 'stubby', 'ouncer', 'pouring', 'growler', 'snifter',\
         'tulip', 'bottled', 'brewery', 'pint', 'glass', 'cap', 'cork']
    stopwords = stop_words.union(set(letters)).union(set(numbers)).union(set(words))
    
    my_stop_words = text.ENGLISH_STOP_WORDS.union(stopwords)
    return my_stop_words
Example no. 5
def make_wordcloud(recipe_list):
    fig = plt.figure()
    stop_words = stopwords.words('english')
    stop_words.extend(['i','ive',"i've",'didnt','them', 'little','use','added','good','great', 'think', 'taste',\
                       'recipe', 'used','made','make','still','also','baked','bake','thank','thanks','cup'])
    stop_words = STOPWORDS.union(set(stop_words))
    review_list = interactions[interactions['recipe_id'].isin(recipe_list.id)]['review']
    text = " ".join(str(review) for review in review_list)
    wordcloud = WordCloud(stopwords=stop_words, background_color="white").generate(text)
    plt.imshow(wordcloud, interpolation='bilinear')
    plt.axis("off")
    fig.savefig('/images/wordcloud.png')
Example no. 6
def lemm_tokenize_doc_spacy_pos(doc):
    '''
    INPUT: string that corresponds to a document in a raw corpus.
    OUTPUT: a list of tokens for the corpus document. Strings are byte decoded; punctuation, digits, and newlines are removed; words are lowered and lemmatized (brought back to their 'base' form); only nouns and verbs are kept; non-words and stop words are removed.
    PACKAGE USED: spaCy
    '''
    # decode bytes to utf-8 from doc
    ascii_doc = unidecode(doc.decode('utf-8'))

    # remove punctuation, digits, newlines, and lower the text
    clean_doc = ascii_doc.translate(None, punctuation).translate(
        None, digits).replace('\n', '').lower()

    # spaCy expects a unicode object
    spacy_doc = nlp(clean_doc.decode('utf-8'))

    # lemmatize, only keep nouns and verbs, transform to ascii as will no longer use spaCy
    noun_tokens = [
        unidecode(token.lemma_) for token in spacy_doc
        if token.pos_ == 'NOUN' or token.pos_ == 'VERB'
    ]

    # keep tokens longer than 2 characters
    long_tokens = [
        token for token in noun_tokens if len(token) >= 3 and len(token) < 15
    ]

    # remove tokens that have 3 equal consecutive characters
    triples = [
        ''.join(triple)
        for triple in zip(ascii_lowercase, ascii_lowercase, ascii_lowercase)
    ]
    good_tokens = [
        token for token in long_tokens
        if not [triple for triple in triples if triple in token]
    ]

    # remove tokens that are present in stoplist
    stop_specific = [
        'date', 'state', 'surface', 'location', 'oil', 'operator',
        'commission', 'colorado', 'conservation', 'denver', 'ogcc', 'cogcc'
    ]

    # remove tokens that are present in stoplist
    # stop_specific = ['wattenberg', 'yes', 'acre', 'number', 'mum', 'nwse', 'swne', 'lease', 'rule', 'drilling', 'permit', 'application', 'form', 'felfwl', 'fnlfsl', 'fnl', 'fsl', 'page', 'file', 'date', 'state', 'surface', 'location', 'oil', 'operator', 'commission', 'colorado', 'conservation', 'prod', 'formation', 'denver', 'ogcc', 'cogcc']

    NLTKstopwords = sw.words('english')

    stoplist = STOPWORDS.union(NLTKstopwords).union(stop_specific)

    final_tokens = [token for token in good_tokens if token not in stoplist]

    return final_tokens
Example no. 7
    def _preprocess(self,
                    text,
                    min_len=2,
                    max_len=240,
                    custom_stopwords=False):
        result = []
        if custom_stopwords:
            stopwords = STOPWORDS.copy()
            stopwords = set(stopwords)
            spanish = self._get_spanish_stopwords()
            custom = self._get_custom_stopwords()
            stopwords.update(spanish)
            # stopwords.update(['http', 'f**k', 'rt'])
            stopwords.update(custom)
        else:
            stopwords = STOPWORDS.copy()

        for token in gensim.utils.simple_preprocess(text,
                                                    min_len=min_len,
                                                    max_len=max_len):
            if token not in stopwords:
                result.append(self._lemmatize_stemming(token))
        return result
Example no. 8
def pd(test):
    tests = []
    tests.append([
        word for word in re.sub("_", " ", test.lower()).split()
        if word not in STOPWORDS.union(stoplist)
    ])
    tests = tests[0]
    w2v = [word2vec.wv[word] for word in tests if word in word2vec.wv]

    # flatten the word vectors and zero-pad up to a fixed length of 8000 features
    w2v = np.reshape(w2v, (-1))
    zero = np.zeros(8000 - int(w2v.size))
    X = np.concatenate((w2v, zero))
    ans = int(sgd.predict(X.reshape(1, -1))[0])

    return ans
Example no. 9
    def tokenize_text(self, text):
        tokens = []
        # Adding to stopwords
        stopwords = STOPWORDS.copy()
        stopwords = set(stopwords)
        spanish = self._get_spanish_stopwords()
        stopwords.update(spanish)
        stopwords.update(['http', 'f**k', 'rt'])

        for sent in nltk.sent_tokenize(text):
            for word in nltk.word_tokenize(sent):
                # if word not in stopwords:
                if len(word) < 2:
                    continue
                # tokens.append(self._lemmatize_stemming(word.lower()))
                tokens.append(word.lower())
        return tokens
Example no. 10
def extract_text_n_corpus(docs, remove_uniq=True):
    stoplist = set('bitcoin bitcoins s m d t u ll ur ve'.split())
    texts = [[word for word in re.split(r"\W+", re.sub(r"[,.]", "", doc.lower()))
              if word not in STOPWORDS.union(stoplist) and word != ""] for doc in docs]
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1

    if remove_uniq:
        # Remove all empty strings
        frequency[''] = 0
        # Extract only duplicate words
        texts = [[token for token in text if frequency[token] > 1] for text in texts]

    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    corpora.MmCorpus.serialize('/tmp/deerwester.mm', corpus)
    return texts, corpus
Example no. 11
def processing(body_text):
    p = PorterStemmer()
    stopset = set([
        'doi', 'preprint', 'copyright', 'org', 'https', 'et', 'al', 'author',
        'figure', 'table', 'rights', 'reserved', 'permission', 'use', 'used',
        'using', 'biorxiv', 'medrxiv', 'license', 'fig', 'fig.', 'al.',
        'Elsevier', 'PMC', 'CZI', '-PRON-', 'usually', r'\usepackage{amsbsy',
        r'\usepackage{amsfonts', r'\usepackage{mathrsfs',
        r'\usepackage{amssymb', r'\usepackage{wasysym',
        r'\setlength{\oddsidemargin}{-69pt', r'\usepackage{upgreek',
        r'\documentclass[12pt]{minimal'
    ])
    cStopwords = STOPWORDS.union(stopset)
    resultlist = []
    for text in body_text:
        tokens = []
        for item in gensim.parsing.preprocess_string(text):
            if item not in cStopwords:
                tokens.append(p.stem(item))
        yield model.infer_vector(tokens)
Example no. 12
def nettoyer_texte(text):
    # Replacing specials chars and specific strings like "http"
    result = text.lower()
    result = result.replace('\n', ' ')
    result = re.sub("www", " ", result)
    result = re.sub("http", " ", result)
    result = re.sub(".com", " ", result)
    result = re.sub(".gg", " ", result)
    result = re.sub(r"[0-9,.;@\-\*\(\)/#?%!:|&$]+\ *", " ", result)
    result = re.sub("\[.*?\]", " " , result)
    result = re.sub(" +", " ", result)
    
    # Removing stopwords
    # list found on https://gist.github.com/sebleier/554280
    stopwords = ["i", "me", "my", "myself", "we", "our", "ours", "ourselves", "you", "your", "yours", "yourself", "yourselves", "he", "him", "his", "himself", "she", "her", "hers", "herself", "it", "its", "itself", "they", "them", "their", "theirs", "themselves", "what", "which", "who", "whom", "this", "that", "these", "those", "am", "is", "are", "was", "were", "be", "been", "being", "have", "has", "had", "having", "do", "does", "did", "doing", "a", "an", "the", "and", "but", "if", "or", "because", "as", "until", "while", "of", "at", "by", "for", "with", "about", "against", "between", "into", "through", "during", "before", "after", "above", "below", "to", "from", "up", "down", "in", "out", "on", "off", "over", "under", "again", "further", "then", "once", "here", "there", "when", "where", "why", "how", "all", "any", "both", "each", "few", "more", "most", "other", "some", "such", "no", "nor", "not", "only", "own", "same", "so", "than", "too", "very", "s", "t", "can", "will", "just", "don", "should", "now"]
    all_stopwords = STOPWORDS.union(stopwords)
    words = result.split()
    result = [word for word in words if word not in all_stopwords]
    
    result = ' '.join( [word for word in result if len(word)>1] )
    return result
Example no. 13
def lemm_tokenize_doc(doc, stop):
    '''
    INPUT: string that corresponds to a document in a raw corpus and a list of stop words.
    OUTPUT: a list of tokens for the corpus document. Strings are byte decoded; punctuation, digits, and newlines are removed; words are lowered and lemmatized (brought back to their 'base' form); non-words and stop words are removed.
    PACKAGE USED: spaCy
    '''
    # decode bytes to utf-8 from doc
    ascii_doc = unidecode(doc.decode('utf-8'))

    # remove punctuation, digits, newlines, and lower the text
    clean_doc = ascii_doc.translate(None, punctuation).translate(
        None, digits).replace('\n', '').lower()

    # spaCy expects a unicode object
    spacy_doc = nlp(clean_doc.decode('utf-8'))

    # lemmatize, only keep nouns, transform to ascii as will no longer use spaCy
    # noun_tokens = [unidecode(token.lemma_) for token in spacy_doc if token.pos_ == 'NOUN']
    noun_tokens = [unidecode(token.lemma_) for token in spacy_doc]

    # keep tokens longer than 2 characters
    long_tokens = [token for token in noun_tokens if len(token) >= 3]

    # remove tokens that have 3 equal consecutive characters
    triples = [
        ''.join(triple)
        for triple in zip(ascii_lowercase, ascii_lowercase, ascii_lowercase)
    ]
    good_tokens = [
        token for token in long_tokens
        if not [triple for triple in triples if triple in token]
    ]

    NLTKstopwords = sw.words('english')

    stoplist = STOPWORDS.union(NLTKstopwords).union(stop)

    final_tokens = [token for token in good_tokens if token not in stoplist]

    return final_tokens
Example no. 14
import sqlite3
from gensim.parsing.preprocessing import remove_stopwords
from gensim.parsing.preprocessing import STOPWORDS
import pandas as pd
import csv

my_stop_words = STOPWORDS.union(
    set(['I', 'The', 'If', 'But', 'This', 'like', 'going']))
word_list = []
flair_list = ['Loss', 'Gain']
d = {}


def freq_for_all():
    conn = sqlite3.connect('stonks.db')
    c = conn.cursor()
    c.execute("select text from posts where  text <> '[removed]'")
    total = c.fetchall()
    for i in total:
        for x in i:
            filtered = remove_stopwords(x)
            split = filtered.split()
            for z in split:
                if z in my_stop_words:
                    pass
                else:
                    word_list.append(z)

    conn.commit()
    conn.close()

# parse docs into individual words, ignoring words that are less than 3 letters long
# and stopwords: him, her, them, for, there, etc., since "their" is not a topic.
# then append the tokens into a list

with open('../data/more_stop_words.txt', 'r') as f:
    customize_stop_words = f.read().split()

with open('../data/add_num_stops.txt', 'r') as f:
    customize_stop_nums = f.read().split()

# combine both custom lists into a single list of stop words
combined_stops = customize_stop_words + customize_stop_nums

from gensim.parsing.preprocessing import STOPWORDS
expanded_stop_words = STOPWORDS.union(set(combined_stops))
# print(expanded_stop_words, type(expanded_stop_words))


def preprocess(text):
    result = []
    for token in gensim.utils.simple_preprocess(text):
        if token not in expanded_stop_words and len(token) > 3:
            nltk.bigrams(token)
            result.append(lemmatize_stemming(token))
    return result


# look at a random row 4310 and see if things worked out
# note that the document created was already preprocessed
Example no. 16
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
from scipy.stats import entropy
from tempfile import TemporaryFile

from scipy.special import (entr, rel_entr)
from numpy import (arange, putmask, ravel, ones, shape, ndarray, zeros, floor,
                   logical_and, log, sqrt, place, argmax, vectorize, asarray,
                   nan, inf, isinf, NINF, empty)

from libs.my_paths import base_model_lda, base_model_ngram, base_model

MY_STOP_WORDS = STOPWORDS.union(
    set([
        'use', 'be', 'work', 'user', 'try', 'cell', 'row', 'want', 'item',
        'go', 'get', 'add', 'went', 'tried', 'return', 'sort', 'test', 'run',
        'check', 'click', 'hour', 'minute', 'second', 'version', 'app',
        'paragraph', 'error', 'log', 'press', 'need', 'feed', 'thank', 'way',
        'like', 'kill', 'help'
    ]))


def clear_text(text):
    text = re.sub(r'<code>(.|\n)*?</code>', '', text)
    text = re.sub(r'(<(/?[^>]+)>)', '', text)
    text = re.sub(r"['\"\\/@%()~`{}]", '', text)
    text = re.sub(r'\s+', ' ', text)

    return text


def lemmatize_stemming(text, stemmer):
Example no. 17
def preprocess(texts_list):
    my_stopwords = STOPWORDS.union({'\n'})
    texts_tokens = [tokenize(text, lower=True) for text in texts_list]
    texts_no_stop = [[word for word in text if word not in my_stopwords] for text in texts_tokens]
    return texts_no_stop
Example no. 18
def clean_df(text):
    print(text)
    #remove url
    text0 = remove_URL(text)

    #remove the phrase 'item documentation' (in any capitalisation)
    if text0[:22].lower() == '{{item documentation}}':
        text1 = text0[22:]
    else:
        text1 = text0

    # split into words
    #text_tokenization = word_tokenize(text0)
    #text_regular_expresion=regexp_tokenize(text0,pattern='\w+|\$[\d\.]+|\S+')
    text_wordpunct = wordpunct_tokenize(text1)
    #text_whitespace=WhitespaceTokenizer().tokenize(text0)
    #text_stanford=StanfordTokenizer().tokenize(text0)
    # convert to lower case
    text_lowercase = [w.lower() for w in text_wordpunct]

    # remove punctuation from each word
    table = str.maketrans('', '', string.punctuation)
    #text_punctuation1= [w.translate(table) for w in text_lowercase]
    text_punctuation = [w for w in text_lowercase if w.translate(table)]

    # filter out stop words
    #stop_words = set(stopwords.words('english'))
    #text_stopwords = [w for w in text_lowercase if not w in stop_words]
    all_stopwords_gensim = STOPWORDS.union(set(['likes', 'play']))
    text_stopwords = [
        w for w in text_punctuation if not w in all_stopwords_gensim
    ]

    #remove extra stop words
    text_extra_stopwords = extend_stopwords(text_stopwords)

    #remove markup
    text_markup_words = markup_words_WikidataSymbols(text_extra_stopwords)

    #remove months
    months = {m.lower() for m in month_name[1:]}  # create a set of month names
    text_no_months = [word for word in text_markup_words if not word in months]

    #remove non English words
    #words_engl = set(nltk.corpus.words.words())
    #text_non_english_words=[w for w in text_no_months if w in words_engl or not w.isalpha()]

    #replace q with item

    text_item1 = [
        'item' if (val[:1] == 'q') and any(chr.isdigit()
                                           for chr in val) else val
        for k, val in enumerate(text_no_months)
    ]
    text_item2 = [
        'item' if val == 'q' else val for k, val in enumerate(text_item1)
    ]
    text_item3 = [
        val for k, val in enumerate(text_item2)
        if not ((val == 'item') and (text_item2[(k - 1)] == 'item'))
    ]

    #replace p with property
    text_property1 = [
        'property' if (val[:1] == 'p') and any(chr.isdigit()
                                               for chr in val) else val
        for k, val in enumerate(text_item3)
    ]
    text_property2 = [
        'property' if val == 'p' else val
        for k, val in enumerate(text_property1)
    ]
    text_property3 = [
        val for k, val in enumerate(text_property2) if not (
            (val == 'property') and (text_property2[(k - 1)] == 'property'))
    ]

    # remove remaining tokens that are not alphabetic
    text_not_alphabetic = [word for word in text_property3 if word.isalpha()]

    #spelling check
    #spells = [spell(w) for w in (nltk.word_tokenize(text))]

    #remove words with fewer than 3 characters
    text_single_letters = [w for w in text_not_alphabetic if len(w) > 2]

    #remove non-English words (lowercase words like 'april' are considered non-English)
    text_non_english = remove_non_english_words(text_single_letters)
    #remove non English words
    #words_engl = set(nltk.corpus.words.words())
    #text_non_english_words=[w for w in text_no_months if w in words_engl or not w.isalpha()]

    #stemming of words
    porter = PorterStemmer()
    text_stemmed = [porter.stem(word) for word in text_non_english]

    #lemmatization
    lemmatizer = WordNetLemmatizer()
    text_lemma = [lemmatizer.lemmatize(t, pos="v") for t in text_stemmed]

    return text_lemma
import pandas as pd
import os
import operator
import gensim
import numpy as np
import matplotlib.pyplot as plt
from gensim.parsing.preprocessing import STOPWORDS
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from nltk.stem.porter import *
np.random.seed(2018)
import pickle
stemmer = PorterStemmer()
from spellchecker import SpellChecker
spell = SpellChecker()
STOPWORDS = list(STOPWORDS)
STOPWORDS.append('covid')
STOPWORDS.append('coronavirus')
STOPWORDS.append('corona')
STOPWORDS.append('uganda')
from utils import preprocess, BreakIt, produce_mapping
from apiclient import discovery
from google.oauth2 import service_account
from datetime import datetime
from numpy.random import multinomial
from numpy import log, exp
from numpy import argmax


class MovieGroupProcess:
    def __init__(self, K=8, alpha=0.1, beta=0.1, n_iters=30):
        '''
Example no. 20
    from spacy.lang.en import English
    parser = English()
    unfiltered_tokens = parser(text)
    tokens = [
        preprocess_token(token) for token in unfiltered_tokens
        if is_token_allowed(token)
    ]
    return tokens


######  Tokenization with Gensim #####

from gensim import utils
import gensim.parsing.preprocessing as gsp
from gensim.parsing.preprocessing import STOPWORDS
my_stop_words = STOPWORDS.union(set(['http', 'com', 'www']))


def preprocess(text):
    result = []
    for token in gsp.utils.simple_preprocess(text):
        if token not in my_stop_words:
            result.append(token)
    return ' '.join(result)


filters = [
    gsp.strip_tags, gsp.strip_punctuation, gsp.strip_multiple_whitespaces,
    gsp.strip_numeric, gsp.remove_stopwords, gsp.strip_short, gsp.stem_text
]
Example no. 21
def suggest_next_video(original_id, input_chunks, search_term):
    if(search_term == ''):
        global last_search
        search_term = last_search
    
    # This video_id is just a test case
    #if (original_id == 'R9npBuS9AsE'):
    #    output_id_list = get_canned_search_results()
    #else:

    output_video_list = query_video_ids(search_term)
    
    output_name_map = dict(output_video_list)
    output_id_list = [video[0] for video in output_video_list]
    
    #Truncate possible video list to 40 for performance reasons
    try:
        output_id_list.remove(original_id)
    except ValueError:
        pass
    output_id_list = output_id_list[:40]
        
    chunk_lookup_dict = {}
    
    start = time.time()
    
    chunk_counter = 0
    output_chunks = []
    myq = queue.Queue()
    threads = list()
    for video_id in output_id_list:
        thread = threading.Thread(target=queueTranscript,args=(video_id,myq))
        threads.append(thread)
        thread.start()
    
    for thread in threads:
        thread.join()
    
    for transcript in list(myq.queue):
        transcript_counter = 0
        #try:
        #    output_video_list = yttapi.get_transcript(str(video_id))
        #except yttapi.CouldNotRetrieveTranscript:
        #    continue
        output_video_list = transcript[1]
        video_length = len(transcript[1])
        video_id = transcript[0]
    
        for i in range(video_length//10):
            chunk_text_list = []
            for j in range(10):
                try:
                    chunk_text_list.append(output_video_list[transcript_counter]['text'])
                except Exception:
                    break
                transcript_counter += 1
            # join the chunk once the (up to) 10 caption lines have been collected
            chunk_text = ' '.join(chunk_text_list)

            output_chunks.append(chunk_text)
            chunk_lookup_dict[chunk_counter] = video_id
            chunk_counter += 1
            
    print ("After chunking output: " + str(time.time() - start))
    
    start = time.time()
    # Exclude common stop words and those used frequently in YouTube transcripts
    my_stop_words = STOPWORDS.union(set(['[Music]', '[music]', '[Applause]', 'subscribe', 'channel', 'youtube']))
    #stoplist = set('for a of the and to in [music]'.split())
    texts = [
        [word for word in document.lower().split() if word not in my_stop_words]
        for document in output_chunks
    ]
    
    dictionary = corpora.Dictionary(texts)
    
    corpus = [dictionary.doc2bow(text) for text in texts]
    lsi = models.LsiModel(corpus, id2word=dictionary, num_topics=10)
    
    # generates an index of the corpus, need only do this once 
    index = similarities.MatrixSimilarity(lsi[corpus])
    
    print ("After building index: " + str(time.time() - start))
    
    video_average_score = {}
    for video_id in output_id_list:
        video_average_score[video_id] = []
    
    start = time.time()
    
    # Go through each input chunk and get an average score for each video
    for i in range(len(input_chunks)):
        
        # Skip over chunks the user didn't watch
        watched_score = input_chunks[i][1]
        if (watched_score == 0):
            continue
        
        doc=input_chunks[i][0]
        #doc=input_chunks[0][0]
        vec_bow = dictionary.doc2bow(doc.lower().split())
        vec_lsi = lsi[vec_bow]
        similarity_score = index[vec_lsi]
    
    
        # sorts based on descending relevance (earlier sort order = more useful)
        similarity_scores = sorted(enumerate(similarity_score), key=lambda item: -item[1])
        
        #chunk_ranking = [(documents[x],y) for (x,y) in similarity_scores]
        
        video_total_score = {}
        video_chunk_counts = {}
    
        
        for video_id in output_id_list:
            video_total_score[video_id] = 0
            video_chunk_counts[video_id] = 0  
            
            
        for chunk_id, score in similarity_scores:
            video_total_score[chunk_lookup_dict[chunk_id]] += score
            video_chunk_counts[chunk_lookup_dict[chunk_id]] += 1
            
        # Multiply the similarity ranking by the 'score' given to us that represents how slowly they 
        # watched the video chunk and how many times they repeated it
        # We append this to a list of average scores for the video
        for video_id in output_id_list:
            if (video_chunk_counts[video_id] == 0):
                video_average_score[video_id].append(0)
            else:
                avg_score = video_total_score[video_id]/video_chunk_counts[video_id]
                video_average_score[video_id].append(avg_score)
    
    print ("After looping through input chunks: " + str(time.time() - start))
        
    video_sum = {}
    for idx, video_id in enumerate(video_average_score.keys()):
        total_score = sum(x for x in video_average_score[video_id])
        #video_sum[video_id] = (total_score * (1 + RL_WEIGHT_FACTOR * rl_network.weights['param_' + str(idx)]), output_name_map[video_id])
        video_sum[video_id] = (total_score, output_name_map[video_id])

    sorted_videos = list(sorted(video_sum.items(), key=lambda kv: -kv[1][0]))

    # now apply geva
    return_videos = []
    for i in range(0,10):
        return_videos.append((sorted_videos[i][0], (sorted_videos[i][1][0]* (1.0+RL_WEIGHT_FACTOR * rl_network.weights['param_'+str(i)]),sorted_videos[i][1][1])))

    return return_videos
Example no. 22
    return perplexity, coherence_lda


if __name__ == '__main__':

    # Get Singletracks trail summary data
    X = get_st_descriptions()

    # Create initial stopwords to remove before creating n-grams
    not_stops_firstpass = [
        'not', 'bottom', 'few', 'many', 'more', 'less', 'most', 'least',
        'never', 'off', 'out', 'very', 'too', 'overly', 'so'
    ]
    new_stops_firstpass = ['br']
    first_stopwords = (
        STOPWORDS.difference(not_stops_firstpass)).union(new_stops_firstpass)

    # Create second set of stopwords to use after creating n-grams
    my_stopwords = set([
        'climb', 'mountain', 'road', 'singletrack', 'loop', 'trail', 'trails',
        'ride', 'area', 'route', 'way', 'feature', 'section', 'sections',
        'riding', 'loop', 'br', 'mile', 'miles', 'right', 'left', 'www',
        'http', 'https', 'bike', 'bikes', 'bicycle', 'bicycles', 'continue',
        'rider', 'riders', 'parking', 'lot', 'turn', 'start', 'starts',
        'description', 'cross', 'north', 'south', 'east', 'west', '-PRON-',
        'pron', 'nee', 'regard', 'shall', 'use', 'win', 'park', 'point',
        'biking', 'follow', 'single', 'track', 'intersection', 'trailhead',
        'head', 'good', 'great', 'nice', 'time', 'include', 'place', 'come',
        'downhill', 'look', 'near'
    ])
    bitri_stops = set([
Example no. 23

noContLines = []
for line in lines:
    noContLines += [decontracted(line)]

noPuncLines = []
for line in noContLines:
    noPuncLines += [re.sub(r'[^\w\s]', ' ', line)]

noNumbLines = []
for line in noPuncLines:
    noNumbLines += [re.sub(r'[0-9]+', ' ', line)]

from gensim.parsing.preprocessing import STOPWORDS
stopwords_gensim = STOPWORDS.union(set(['like', 'also', 'let', 'lot', 'hi']))
# stopwords_gensim.add("like");
# stopwords_gensim.add("also");
# stopwords_gensim.add("let");
noStopLines = []
for line in noNumbLines:
    noStopLines += [
        ' '.join([
            word for word in line.split()
            if word not in stopwords_gensim and len(word) > 1
        ])
    ]

from nltk.stem.wordnet import WordNetLemmatizer

noLemmaLines = []
Example no. 24
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
from gensim.parsing.preprocessing import remove_stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem.wordnet import WordNetLemmatizer
from gensim.corpora import Dictionary
from gensim.models import LdaModel
import json
from pprint import pprint
from gensim.parsing.preprocessing import STOPWORDS
from matplotlib.ticker import MaxNLocator
from entity_properties.wikiapi import get_text
import timeit

STOPWORDS = list(STOPWORDS)
STOPWORDS.extend('add pp new ed isbn year time'.split())

with open('entity_properties/property_blacklist.txt') as f:
	prop_blacklist = f.readlines()
	prop_blacklist = [p.rstrip() for p in prop_blacklist]

with open('entity_properties/property_frequencies.json') as f:
	data = json.load(f)	

'''
TFIDF = tf(t,d)*log(N/(df+1)) => http://www.tfidf.com/
tf(t,d) = count t in d / number of words in d
df(t) = occ of t in docs

1. build a matrix with tfidf of every word-property pair
2. add up the row for each word
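
A minimal sketch of the TF-IDF computation described in the docstring above, assuming each document is a plain list of word tokens (the docs argument and all names below are illustrative, not taken from the original project):

import math
from collections import Counter, defaultdict

def tfidf_word_totals(docs):
    """Step 1: tfidf(t, d) = (count of t in d / number of words in d) * log(N / (df(t) + 1)).
    Step 2: add up each word's scores across all documents."""
    N = len(docs)
    df = Counter()                      # df(t): number of documents containing t
    for doc in docs:
        df.update(set(doc))
    totals = defaultdict(float)
    for doc in docs:
        counts = Counter(doc)
        for t, c in counts.items():
            tf = c / len(doc)
            totals[t] += tf * math.log(N / (df[t] + 1))
    return totals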
Example no. 25
import preprocessor as p
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.decomposition import NMF, LatentDirichletAllocation
from nltk.stem import WordNetLemmatizer, SnowballStemmer
from sklearn.model_selection import GridSearchCV
import gensim
from gensim.parsing.preprocessing import STOPWORDS
from joblib import dump, load
from datetime import datetime

n_samples = 2000
n_features = 1000
n_components = 30
n_top_words = 5

stop_words = STOPWORDS.union(
    set(['', 'ive', 'im', 'amp', 'like', 'f**k', 'shit']))

p.set_options(p.OPT.URL, p.OPT.MENTION, p.OPT.HASHTAG, p.OPT.EMOJI)
stemmer = SnowballStemmer('english')
punct_str = '''!"$%&'()*+,-./:;<=>?[\]^_`{|}~’'''


def lemmatize_stemming(text):
    '''Lemmatize the token as a verb, then apply Snowball stemming.'''
    return stemmer.stem(WordNetLemmatizer().lemmatize(text, pos='v'))


def split_count(text):
    '''
    '''
Example no. 26
"""
    This file contains all the stopword removal steps.
"""
from gensim.parsing.preprocessing import STOPWORDS
from gensim.parsing.preprocessing import remove_stopwords

STOPWORDS = STOPWORDS.difference(set(["no", "not", "never"]))

class StopwordRemoval:
    """
        This class contains all the methods which handle stopword removal.
    """
    @classmethod
    def consider_negative_stopwords(cls, text):
        """
            This method will remove the stopwords from text but not remove negative words.
        """
        temp_str = text.split()
        new_string = [word for word in temp_str if word not in STOPWORDS]
        return ' '.join(new_string)

    @classmethod
    def donot_consider_negative_stopwords(cls, text):
        """
            This method will remove all stopwords.
        """
        return remove_stopwords(text)

    @classmethod
    def run_stopwords(cls, text, stopwords):
        """
Example no. 27
    'social',
    'governance',
    'corporate',
    'responsibility',
    'million',
    'billion',
]

# add company names as stop words
organisations = esg_corpus.select("company").distinct().toPandas().company
for organisation in organisations:
    for t in organisation.split(' '):
        org_stop_words.append(t)

# our list contains all english stop words + companies names + specific keywords
stop_words = STOPWORDS.union(org_stop_words)

# COMMAND ----------

# DBTITLE 1,Lemmatize content
import nltk
from nltk.stem import WordNetLemmatizer, PorterStemmer
from pyspark.sql.functions import pandas_udf, PandasUDFType
from gensim.utils import simple_preprocess


def lemmatize(text):

    results = []
    lemmatizer = WordNetLemmatizer()
    stemmer = PorterStemmer()
Example no. 28
    def _lemmatize(self, text):
        pos = self._get_wordnet_pos(text)
        return WordNetLemmatizer().lemmatize(text, pos=pos)

    def _preprocess(self, text):
        result = []
        for token in text:
            if (token not in self.second_stopwords) and (len(token) > 3):
                lem = self._lemmatize(token)
                if lem not in self.second_stopwords:
                    result.append(lem)
        return result

    def print_params(self):
        print(f'Bigrams={self.bigrams}')
        print(f'Trigrams={self.trigrams}\n')
        print(f'First set of stopwords: {self.first_stopwords}\n')
        print(f'Second set of stopwords: {self.second_stopwords}.')


if __name__ == '__main__':

    added_stopwords = set(['bike', 'trail', 'mountain'])
    my_featurizer = Featurizer(STOPWORDS,
                               STOPWORDS.union(added_stopwords),
                               bigrams=False,
                               trigrams=False)
    my_featurizer.print_params()
    my_featurizer.update_stopwords(['ride', 'road'])
    my_featurizer.update_ngrams(grams='bigrams', set_to=True)
    my_featurizer.print_params()
Example no. 29

#### LDA Preprocessing
'''
Augmenting stopwords with words used to filter the tweets originally.
Since they show up in almost every tweet, they aren't useful for differentiating
between topics.

stopwords are all lowercase.
'''
COVID_STOPWORDS = set([
    'coronavirus', '2019ncov', 'coronaviruspandemic', 'coronaoutbreak',
    'wuhanvirus', 'covid19', 'covid-19', 'ncov', 'ncov2019', 'corona', 'virus',
    'covid', 'covidー', 'cov', 'sarscov', 'sarscov2', 'amp'
])
FILTER_WORDS = STOPWORDS.union(COVID_STOPWORDS)


def decontract(tweet):
    '''
    helper function for splitting contractions: the apostrophe is simply dropped,
    because we can't disambiguate between possession (Julia's)
    and "is" (Julia is ...)
    '''
    tweet = re.sub(r"\b([A-Za-z]+)'([A-Za-z]+)\b", r"\1\2", tweet)
    return tweet
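
# Illustrative usage (hypothetical tweet text, added only to demonstrate the regex above):
# decontract("Julia's dog don't bite")  ->  "Julias dog dont bite"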


# Source: https://medium.com/@gaurav5430/using-nltk-for-lemmatizing-sentences-c1bfff963258

Example no. 30
from nltk.tokenize import word_tokenize

text = "Nick likes to play football, however he is not too fond of tennis."
# Documentation: https://radimrehurek.com/gensim/
filtered_sentence = remove_stopwords(text)

print(filtered_sentence)

# Adding and removing stop words from Gensim's default stop word list.
all_stopwords = gensim.parsing.preprocessing.STOPWORDS
print(all_stopwords)

# Adding stop words to Gensim's default stop word list.
# To add elements, apply the union function on the set and pass it the set of new words.
# The union method returns a new set containing the newly added words.
all_stopwords_gensim = STOPWORDS.union(set(['likes', 'play']))

text = "Nick likes to play football, however he is not too fond of tennis."
text_tokens = word_tokenize(text)
tokens_without_sw = [
    word for word in text_tokens if not word in all_stopwords_gensim
]

print(tokens_without_sw)

# Removing stop words from Gensim's default stop word list
all_stopwords_gensim = STOPWORDS
sw_list = {"not"}
# To remove stop words from the Gensim stop word list, call difference().
all_stopwords_gensim = STOPWORDS.difference(sw_list)
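
A short continuation (a sketch reusing the text and word_tokenize from earlier in this example) to show the effect of difference():

text_tokens = word_tokenize(text)
tokens_without_sw = [
    word for word in text_tokens if not word in all_stopwords_gensim
]

# "not" now appears in the filtered tokens, since it was removed from the stop word set
print(tokens_without_sw)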