Example No. 1
    def combine_stopwords(dataframe_in, stopword_dict):
        """
        Please use the stopwords() function and input that into the stopword_dict parameter.
        returns filtered tokens.
        :param dataframe_in:
        :param stopword_dict:
        :return:
        """
        nlp = spacy.load("en_core_web_lg")

        # Tokenizer
        tokenizer = Tokenizer(nlp.vocab)

        tokens = []

        for doc in tokenizer.pipe(dataframe_in, batch_size=500):

            doc_tokens = []

            for token in doc:
                if token.text.lower() not in stopword_dict:
                    doc_tokens.append(token.text.lower())

            tokens.append(doc_tokens)

        return tokens
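
A minimal usage sketch for combine_stopwords, assuming the function is in scope and spaCy's default stop-word set stands in for the stopwords() helper mentioned in the docstring:

import spacy

stop_words = spacy.load("en_core_web_lg").Defaults.stop_words
docs = ["The quick brown fox", "jumps over the lazy dog"]
print(combine_stopwords(docs, stop_words))
# roughly: [['quick', 'brown', 'fox'], ['jumps', 'lazy', 'dog']]
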
    def predict():  # define a prediction function
        tokenizer = Tokenizer(nlp.vocab)

        # Stop words: spaCy defaults plus a few corpus-specific additions
        STOP_WORDS = nlp.Defaults.stop_words.union(
            ['', ' ', '-', 'reddit', 'post'])

        # Make them tokens
        tokens = []
        for doc in tokenizer.pipe(df['combo'], batch_size=500):
            doc_tokens = []
            for token in doc:
                if (token.text.lower() not in STOP_WORDS
                        and not token.is_stop
                        and not token.is_punct
                        and token.pos_ != 'PRON'):
                    doc_tokens.append(token.lemma_.lower())
            tokens.append(' '.join(doc_tokens))

        df['tokens'] = tokens

        # Fit the vectorizer on the cleaned corpus, then vectorize the user's input
        tfidf = TfidfVectorizer(min_df=0.025, max_df=.98, ngram_range=(1, 2))
        tfidf.fit(df['tokens'])
        vec_text = tfidf.transform(user_input)
        output = model.predict(vec_text.todense())

        # give output to sender.
        return jsonify({"response": output})
Example No. 3
def tokenize_data(input_data):
    nlp = spacy.load("en")
    tokenizer = Tokenizer(nlp.vocab)
    string_data = [str(data) for data in input_data]
    tokenized_data = [[str(w) for w in doc]
                      for doc in tokenizer.pipe(string_data, batch_size=50)]
    return tokenized_data
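
A quick usage sketch, assuming tokenize_data is importable and the "en" model loaded above is installed; every item is cast to str before tokenizing:

docs = ["First document here", 12345, "Another short document"]
print(tokenize_data(docs))
# roughly: [['First', 'document', 'here'], ['12345'], ['Another', 'short', 'document']]
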
Example No. 4
    def _tokenizer(df):
        nlp = English()
        tokenizer = Tokenizer(nlp.vocab)

        for doc in tokenizer.pipe(df.values.tolist(), batch_size=50):
            for token in doc:
                yield token
Example No. 5
 def transform(self, data):
     tokenizer = Tokenizer(nlp.vocab)
     return np.array([
         np.mean([
             self.model[w.text.lower()] * self.word2weight[w.text.lower()]
             for w in words if w.text.lower() in self.model
         ] or [np.zeros(self.dim)],
                 axis=0) for words in tokenizer.pipe(data)
     ])
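
This transform averages pre-trained word vectors weighted by per-word TF-IDF scores. It assumes self.model (a word-to-vector mapping), self.word2weight, and self.dim are set up elsewhere; a sketch of how a companion fit method might build them (names inferred from the snippet, not taken from the source):

from collections import defaultdict
from sklearn.feature_extraction.text import TfidfVectorizer

def fit(self, data):
    # data: an iterable of raw text documents
    tfidf = TfidfVectorizer()
    tfidf.fit(data)
    # Unseen words get the maximum idf, i.e. they are treated as maximally rare.
    max_idf = max(tfidf.idf_)
    self.word2weight = defaultdict(
        lambda: max_idf,
        zip(tfidf.get_feature_names_out(), tfidf.idf_))
    # Assumes self.model maps word -> vector (e.g. a dict of word vectors).
    self.dim = len(next(iter(self.model.values())))
    return self
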
Example No. 6
def get_lemmas(text):

    # nlp = spacy.load("en_core_web_sm-2.2.5", path="airbnb_api/")
    nlp = spacy.load("en_core_web_sm-2.2.5", path="./")
    # nlp = spacy.load("en_core_web_sm")
    # nlp = en_core_web_sm.load()

    tokenizer = Tokenizer(nlp.vocab)

    STOP_WORDS = nlp.Defaults.stop_words.union([
        '  ', 'und', '-', 'die', 'der', 'berlin', 'ein', 'das', 'mit', 'ist',
        'im', 'zu', 'eine', 'es', 'für',
        'berlin.', 'zum', 'sind', 'für', 'Berlin.', '-pron-', 's', 'u', '',
        "'", ' ', '-PRON-'
    ])

    lemmas = []

    doc = nlp(text)

    for token in doc:
        lemmas.append(token.lemma_)

    lemma_summary = []

    working_set = ""
    for lemma in lemmas:
        working_set += lemma + ' '
    lemma_summary.append(working_set)

    description = [lemma_summary[0]]

    tokens = []

    for doc in tokenizer.pipe(description, batch_size=500):

        doc_tokens = []

        for token in doc:
            if (not token.is_stop and not token.is_punct
                    and token.pos_ != 'PRON'
                    and token.text.lower() not in STOP_WORDS):
                doc_tokens.append(token.text.lower())

        tokens.append(doc_tokens)

    token_summary = []

    for set_of_tokens in tokens:
        working_set = ""
        for variable in set_of_tokens:
            working_set += variable + ' '
        token_summary.append(working_set)

    return token_summary[0]
Example No. 7
class SpacyTokenize(Transformer):
    def __init__(self):
        nlp = spacy.load('en')
        self.tok = Tokenizer(nlp.vocab)

    def transform(self, xx):
        rrr = []
        for doc in self.tok.pipe(xx):
            rr = []
            for token in doc:
                rr.append(token.text.lower())
            rrr.append(rr)
        return rrr
Example No. 8
def test_spacy_tokenizer_pipe(nlp):
    tokenizer = Tokenizer(nlp.vocab)

    token_sets = []
    for doc in tokenizer.pipe(DOCUMENTS, batch_size=2):
        doc_tokens = [token.text for token in doc]
        token_sets.append(doc_tokens)

    assert token_sets == [['all', 'the', 'kings', 'men'],
                          ['ate', 'all', 'the', 'kings', 'hens'],
                          [
                              'until', 'they', 'all', 'got', 'tired', 'and',
                              'went', 'to', 'sleep', 'zzz'
                          ]]
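
The test above relies on a module-level DOCUMENTS constant and an nlp pytest fixture that are not shown; DOCUMENTS can be inferred from the assertion, while the fixture body below is an assumption:

import pytest
import spacy

DOCUMENTS = [
    "all the kings men",
    "ate all the kings hens",
    "until they all got tired and went to sleep zzz",
]

@pytest.fixture
def nlp():
    # Any English pipeline works here; only its shared vocab is used.
    return spacy.load("en_core_web_sm")
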
Example No. 9
    def tokenize(df_in):
        """
        Tokenize by inputting a dataframe. Outputs a tokenized list.
        :param df_in:
        :return:
        """
        nlp = spacy.load("en_core_web_lg")

        # Tokenizer
        tokenizer = Tokenizer(nlp.vocab)
        tokens = []
        for doc in tokenizer.pipe(df_in, batch_size=500):
            doc_tokens = [token.text for token in doc]
            tokens.append(doc_tokens)
        return tokens
Example No. 10
    def __init__(self):
        """ coppied from notebook at app/ml/Build_week_IsaacGrove.ipynb
        """
        self.PICKLE_PATH = path.join(path.dirname(__file__), '..', 'pickles',
                                     '')

        # For now, load data from a static link; pulling live data is left
        # for future iterations.
        leafly = pd.read_csv(
            'https://raw.githubusercontent.com/Build-Week-Med-Cabinet-6/DS/mark-dev/data/cannabis.csv'
        )

        # Set up spacy tokenizer
        nlp = English()
        tokenizer = Tokenizer(nlp.vocab)

        # work around for pickle
        self.nlp = nlp

        # clean some missing info
        leafly.replace('None', np.NaN, inplace=True)
        leafly = leafly.dropna()

        # Make tokens out of descriptions
        tokens = []
        for desc in tokenizer.pipe(leafly['Description'], batch_size=500):
            desc_tokens = [token.text for token in desc]
            tokens.append(desc_tokens)
        leafly['tokens'] = tokens
        leafly['tokens'].head()

        # Instantiate vectorizer object
        tfidf = TfidfVectorizer(ngram_range=(1, 2),
                                max_df=.7,
                                min_df=.001,
                                tokenizer=self.tokenize)

        # Create a vocabulary and get word counts per listing
        dtm = tfidf.fit_transform(leafly['Description'])

        # Get feature names to use a dataframe column headers
        dtm = pd.DataFrame(dtm.todense(), columns=tfidf.get_feature_names())
        # Fit on dtm
        nn = NearestNeighbors(n_neighbors=20, algorithm='kd_tree')
        nn.fit(dtm)
        self.model = nn
        self.transform = tfidf
        return
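
With the fitted vectorizer and nearest-neighbors model stored on self, a query method might look like the following hypothetical sketch (the class's real prediction method is not shown here):

    def recommend(self, user_text, n=5):
        """Return indices of the n strains whose descriptions are closest to user_text."""
        # Vectorize the raw text with the already-fitted TF-IDF vectorizer.
        vec = self.transform.transform([user_text]).toarray()
        distances, indices = self.model.kneighbors(vec, n_neighbors=n)
        return indices[0].tolist()
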
Example No. 11
 def transform(self, data):
     tokenizer = Tokenizer(nlp.vocab)
     return np.array(
         [
             np.mean(
                 [
                     self.model[w.text.lower()]
                     for w in words
                     if w.text.lower() in self.model
                 ]
                 or [np.zeros(self.dim)],
                 axis=0,
             )
             for words in tokenizer.pipe(data)
         ]
     )
def tokenizing_text(text):
    tokenizer = Tokenizer(nlp.vocab)
    custom_stopwords = ['hi', '\n', '\n\n', '&', ' ', '.', '-', 'got', "it's", 'it’s', "i'm", 'i’m', 'im', 'want',
                        'like', '$', '@']
    STOP_WORDS = nlp.Defaults.stop_words.union(custom_stopwords)
    ALL_STOP_WORDS = STOP_WORDS.union(stopwords)

    tokens = []

    for doc in tokenizer.pipe(text):
        doc_tokens = []
        for token in doc:
            if token.text.lower() not in ALL_STOP_WORDS:
                doc_tokens.append(token.text.lower())
        tokens.append(doc_tokens)

    # Makes tokens column
    return tokens
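
tokenizing_text assumes a module-level nlp object and an extra stopwords collection defined elsewhere; a minimal setup sketch under those assumptions (the extra stop-word set here is only a placeholder):

import spacy

nlp = spacy.load("en_core_web_sm")
stopwords = {'hello', 'thanks'}  # placeholder for the additional stop-word set the snippet expects

reviews = ["hi I want to like this place", "it was great"]
print(tokenizing_text(reviews))  # -> list of token lists with stop words removed
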
Example No. 13
def tokenize_v5(my_docs, my_nlp=NLP, batch_size=200):
    """
    Uses a tokenizer pipeline for performance gains (JK still very slow).
    Params:
        my_docs (list of str, or dataframe column of str) the documents to tokenize
        my_nlp (spacy.lang.en.English) one of spacy's natural language models
    Returns: a token set (list of token lists)
    """
    #print("TOKENIZING (v5)...")
    tokenizer = Tokenizer(my_nlp.vocab)
    token_sets = []
    for doc in tokenizer.pipe(my_docs, batch_size=batch_size):
        # tokens = [token.lemma_.lower() for token in doc if token.is_stop == False and token.is_punct == False and token.is_space == False]
        # ... for some reason there are special characters, so maybe...
        clean_text = re.sub(ALPHANUMERIC_PATTERN, "", doc.text)
        clean_doc = my_nlp(clean_text)
        tokens = [token.lemma_.lower() for token in clean_doc
                  if not token.is_stop and not token.is_punct and not token.is_space]
        # ... hmm stopwords are still making their way through if the lemma is a stopword
        token_sets.append(tokens)
    return token_sets
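
tokenize_v5 references module-level NLP and ALPHANUMERIC_PATTERN constants that are not shown; they presumably look something like the following (a sketch, not the original definitions):

import re
import spacy

NLP = spacy.load("en_core_web_sm")
# Drop anything that is not a letter, digit, or whitespace before re-parsing.
ALPHANUMERIC_PATTERN = re.compile(r"[^a-zA-Z0-9\s]")
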
Example No. 14
def ValuePredictor(yelp_url, from_isbn=False):
    '''Takes a Yelp business URL, scrapes the site for reviews,
    calculates term frequencies by rating, sorts them, and returns
    the top terms as a dict containing term, highratingscore,
    and poorratingscore.'''

    base_url = "https://www.yelp.com/biz/"  # add business id
    api_url = "/review_feed?sort_by=date_desc&start="
    bid = yelp_url.replace('https://www.yelp.com/biz/', '')
    if '?' in yelp_url:  # drop any query string from the business id
        bid = bid.split('?')[0]

    class Scraper():
        def __init__(self):
            self.data = pd.DataFrame()

        def get_data(self, n, bid=bid):
            with Session() as s:
                # HTTP GET request for the next page of reviews (20 per page)
                with s.get(base_url + bid + api_url + str(n * 20)) as resp:
                    r = dict(resp.json())  # parse the JSON response into a dict
                    _html = html.fromstring(r['review_list'])  # parse the embedded review HTML

                    dates = _html.xpath(
                        "//div[@class='review-content']/descendant::span[@class='rating-qualifier']/text()"
                    )
                    reviews = [
                        el.text for el in _html.xpath(
                            "//div[@class='review-content']/p")
                    ]
                    ratings = _html.xpath(
                        "//div[@class='review-content']/descendant::div[@class='biz-rating__stars']/div/@title"
                    )

                    df = pd.DataFrame([dates, reviews, ratings]).T

                    self.data = pd.concat([self.data, df])

        def scrape(self):
            # Fetch the first 10 review pages concurrently
            with Executor(max_workers=40) as e:
                list(e.map(self.get_data, range(10)))

    s = Scraper()
    s.scrape()
    df = s.data  # collect the scraped data into a dataframe
    df.columns = ['date', 'review', 'rating']

    df = df.set_index(df.columns.drop('review', 1).tolist()).review.str.split(
        '.', expand=True).stack().reset_index().rename(columns={
            0: 'review'
        }).loc[:, df.columns]

    # Strip punctuation characters from the review text
    df['review'] = df['review'].str.replace(r'[,!#.]', '', regex=True)
    tokenizer = Tokenizer(nlp.vocab)
    STOP_WORDS = nlp.Defaults.stop_words.union([
        'gets', 'incredible', 'disappoint', 'from', 'perfection', 'loved',
        'definitely', 'happy', 'find', 'found', 'simply', 'fantastic',
        'recommend', 'feel', 'little', 'i', 'wow', 'absolute', 'favorite',
        'excellent', 'delicious', 'great', 'maybe', 'very', 'enjoy', 'list',
        'gave', 'date', 'went', 'disappointed', 'nyc', 'got', '#', 'crazy',
        'other', 'fairness', 'fair', 'mid', 'from', 'highly', 'perfect',
        'perfectly', 'come', 'lovely', 'visit', 'ny', 'nyc', 'best', 'amazing',
        'love', 'absolutely', 'like', 'good', 'other', 'from', 'ny',
        'restaurant', 'we', 'will', 'because', 'not', 'friends', 'amazing',
        'awesome', 'first', 'he', 'check-in', '=', '= =', 'male', 'u', 'want',
        'u want', 'cuz', 'him', "i've", 'deaf', 'on', 'her', 'told',
        'told him', 'ins', 'check-ins', 'check-in', 'check', 'I', 'i"m', 'i',
        'it', "it's", 'it.', 'they', 'coffee', 'place', 'they', 'the', 'this',
        'its', 'l', '-', 'they', 'this', 'don"t', 'the ', ' the', 'it', 'i"ve',
        'i"m', '!', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '(', ')',
        '/', '.', ',', '!'
    ])
    # STOP_WORDS
    df = df[df['review'].notna()]  # drop empty reviews
    tokens = []

    for doc in tokenizer.pipe(df['review'], batch_size=500):

        doc_tokens = []

        for token in doc:
            if token.text not in STOP_WORDS and not token.is_punct:
                doc_tokens.append(token.text.lower())
        tokens.append(doc_tokens)

    df['review'] = tokens
    df['review'] = df['review'].apply(
        lambda tokens: ' '.join(str(tok) for tok in tokens))
    df['review'].replace(' ', np.nan, inplace=True)
    df = df.dropna()

    corpus = (st.CorpusFromPandas(df,
                                  category_col='rating',
                                  text_col='review',
                                  nlp=nlp).build().remove_terms(
                                      STOP_WORDS, ignore_absences=True))

    term_freq_df = corpus.get_term_freq_df()
    term_freq_df['highratingscore'] = corpus.get_scaled_f_scores(
        '5.0 star rating')

    term_freq_df['poorratingscore'] = corpus.get_scaled_f_scores(
        '1.0 star rating')
    # term_freq_df = term_freq_df[term_freq_df['1.0 star rating freq'] > 3]
    dp = term_freq_df.sort_values(by='poorratingscore', ascending=False)
    dp = dp[~dp.index.str.contains('-')]
    dp = dp[~dp.index.str.contains("'")]
    dp = dp[~dp.index.str.contains('/')]
    dh = term_freq_df.sort_values(by='highratingscore', ascending=False)
    dh = dh[~dh.index.str.contains('-')]
    dh = dh[~dh.index.str.contains("'")]
    dh = dh[~dh.index.str.contains('/')]
    dhi = dh.head(75)
    dpo = dh.tail(75)
    dfinal = pd.concat([dhi, dpo])
    # dh = dh.reset_index(drop=False)

    # return dh.to_dict('index')
    return dfinal.to_dict('index')
# Read data from URL
url = "https://raw.githubusercontent.com/LambdaSchool/DS-Unit-4-Sprint-1-NLP/master/module1-text-data/data/yelp_coffeeshop_review_data.csv"
shops = pd.read_csv(url)

# Clean up data: separate the leading date from the review text
shops['date'] = shops['full_review_text'].apply(lambda x: x.split()[0])
shops['review'] = shops['full_review_text'].apply(
    lambda x: " ".join(x.split()[1:]))

# Tokenizer
STOP_WORDS = nlp.Defaults.stop_words.union(
    ["it's", '1', "i'm", "i've", 'place', "-"])
tokenizer = Tokenizer(nlp.vocab)
tokens = []
""" tokens w/o stopwords"""
for doc in tokenizer.pipe(shops['full_review_text'], batch_size=500):
    doc_tokens = []
    for token in doc:
        if token.text.lower() not in STOP_WORDS and not token.is_punct:
            doc_tokens.append(token.text.lower())
    tokens.append(doc_tokens)
shops['tokens'] = tokens

# Label each review good or bad by star rating
shops.loc[(shops.star_rating == ' 5.0 star rating ') |
          (shops.star_rating == ' 4.0 star rating '), 'rating'] = 'good'
shops.loc[(shops.star_rating == ' 3.0 star rating ') |
          (shops.star_rating == ' 2.0 star rating ') |
          (shops.star_rating == ' 1.0 star rating '), 'rating'] = 'bad'
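
With the good/bad labels in place, counting the most frequent tokens per rating group could be done along these lines (a sketch built on the shops frame above):

from collections import Counter

def top_tokens(token_lists, n=10):
    # Flatten the per-review token lists and count word frequencies.
    counts = Counter(tok for tokens in token_lists for tok in tokens)
    return counts.most_common(n)

print(top_tokens(shops.loc[shops['rating'] == 'good', 'tokens']))
print(top_tokens(shops.loc[shops['rating'] == 'bad', 'tokens']))
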
Example No. 16
from sklearn.neighbors import NearestNeighbors
from sklearn.feature_extraction.text import TfidfVectorizer

# Instantiate nlp with a pretrained English model (install it first with "python -m spacy download en_core_web_lg")

nlp = spacy.load("en_core_web_lg")
df.head()

# The Tokenizer

tokenizer = Tokenizer(nlp.vocab)

# Make the tokens for description

description_tokens = []
for txt in tokenizer.pipe(df['Description'], batch_size=500):
    txt_tokens = [token.text for token in txt]
    description_tokens.append(txt_tokens)
df['description_tokens'] = description_tokens
#print(df['description_tokens'].head())

# Make the tokens for flavor

flavor_tokens = []
for txt in tokenizer.pipe(df['Flavor'], batch_size=500):
    txt_tokens = [token.text for token in txt]
    flavor_tokens.append(txt_tokens)
df['flavor_tokens'] = flavor_tokens
#print(df['flavor_tokens'].head())

#  Make the tokens for effects