Example no. 1
    def load_custom_stoplist(self, stoplist_file):
        """Load custom stoplist (one stop word per line)."""
        with open(stoplist_file, 'r') as f:
            stoplist = f.read().splitlines()
        for item in stoplist:
            if not item:  # skip blank lines
                continue
            STOP_WORDS.add(item)
            self.nlp.vocab[item].is_stop = True
Example no. 2
    def _collect_words(self):
        """Collects all the unique word and pos_tag pairs from the text."""
        nlp = spacy.load("en_core_web_lg")
        # coref = NeuralCoref(nlp.vocab)
        # nlp.add_pipe(coref, name='neuralcoref')

        print("Preparing Spacy object")
        nlp.max_length = len(self.text)
        text_obj = nlp(str(self.text.lower()), disable=['ner'])  # pipe name is lower-case 'ner'

        print("Preparing Spacy object")
        # Resolve co-reference using neuralcoref
        # self.text = text_obj._.coref_resolved
        # nlp.remove_pipe("neuralcoref")
        # text_obj = nlp(str(self.text.lower()), disable=['NER'])

        prev_sent = Sentence(nlp(''), None)
        words = {}
        STOP_WORDS.add('_')
        logging.info("Collecting words")
        for sent in tqdm(text_obj.sents):
            # sent = nlp(Sentence.clean_sentence(sent.text))
            curr_sent = Sentence(sent, prev_sent)
            for token in sent:
                if token.text in STOP_WORDS or\
                        token.pos_ in ['PART', 'PUNCT', 'SPACE', 'NUM', 'SYM']:
                    continue
                key = token.text.strip() + ' ; ' + token.tag_
                if key not in words:
                    words[key] = Word(token)
                words[key].include_sentence(curr_sent)

        return words
Example no. 3
    def spacy_adder(self, model, verbose=False):

        for stopword in self.vocab_list:
            STOP_WORDS.add(stopword)

        model.vocab.add_flag(
            lambda s: s.lower() in spacy.lang.en.stop_words.STOP_WORDS,
            spacy.attrs.IS_STOP)
        if verbose:
            print(
                f"Complete. There are {len(self.vocab_list)} stop words in the list."
            )
Example no. 4
def words_stop():
    words_stop._log.debug("\nThe outcomes of words stop are:")
    from spacy.lang.en.stop_words import STOP_WORDS
    # print (STOP_WORDS)
    STOP_WORDS.add("your_additional_stop_word_here")
    for word in STOP_WORDS:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True

    nlp.Defaults.stop_words |= {"了", "啊", "吧", "嗯"}  # single words can also be added directly with .add()
    nlp.Defaults.stop_words -= {"嗯"}  # single words can also be removed directly with .remove()
    for word in nlp.Defaults.stop_words:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
    words_stop._log.debug(nlp.Defaults.stop_words)
Example no. 5
def construct_stop_words():
    """
    Update the spacy stopwords list
    :return:
    """
    stop_words_list = [
        "uk", "ceo", "apple", "wal", "st", "q1", "q2", "q3", "q4", "bp",
        "wednesday", "tuesday", "monday", "thursday", "friday", "sept",
        "johnson", "inc", "david", "amazon.com"
    ]

    for words in stop_words_list:
        STOP_WORDS.add(words)

    return STOP_WORDS
def summarization():

  with open("./stories/d3370f0d60746aebcc5f61a068805b8545357e6f.story", "r", encoding="utf-8") as f:
    text = " ".join(f.readlines())
    core = en_core_web_sm.load()

  doc = core(text)
  # clean sentences
  corpus = [sent.text.lower() for sent in doc.sents]
  STOP_WORDS.add("@highlight")
  cv = CountVectorizer(stop_words=list(STOP_WORDS))
  cv_fit = cv.fit_transform(corpus)
  word_list = cv.get_feature_names()
  count_list = cv_fit.toarray().sum(axis=0)
  
  # zip the word list and counts together into a word -> frequency dict
  word_frequency = dict(zip(word_list, count_list))
  words_freqs = sorted(word_frequency.values())
  higher_word_frequencies = [word for word, freq in word_frequency.items()
                             if freq in words_freqs[-3:]]
  print("higher frequency words : ", higher_word_frequencies)

  higher_frequency = words_freqs[-1]
  # normalise the frequencies values
  for word in word_frequency.keys():
    word_frequency[word] = (word_frequency[word]/higher_frequency)

  sentence_rank = {}
  for sent in doc.sents:
    for word in sent:
      if word.text.lower() in word_frequency.keys():
        if sent in sentence_rank.keys():
          sentence_rank[sent] += word_frequency[word.text.lower()]
        else:
          sentence_rank[sent] = word_frequency[word.text.lower()]
      else:
        continue

  # fetch top sentences which have the higher top-freq words
  top_sentences = (sorted(sentence_rank.values())[::-1])
  top_sent = top_sentences[:3]

  summary = []
  for sent, strength in sentence_rank.items():
    if strength in top_sent:
      summary.append(sent)

  return text, summary
Example no. 7
def Stop():
    print("\nThe outcomes of Stop Words are:")
    from spacy.lang.en.stop_words import STOP_WORDS
    # print (STOP_WORDS)
    STOP_WORDS.add("your_additional_stop_word_here")
    for word in STOP_WORDS:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
        # print (lexeme.text)

    nlp.Defaults.stop_words |= {"了", "啊", "吧", "嗯"}  # single words can also be added directly with .add()
    nlp.Defaults.stop_words -= {"嗯"}  # single words can also be removed directly with .remove()
    for word in nlp.Defaults.stop_words:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True
        # print (lexeme.text)
    print(nlp.Defaults.stop_words)
def preprocess_text(author_df):
    nlp = spacy.load('en')
    STOP_WORDS.add("'s")
    STOP_WORDS.add('the')
    STOP_WORDS.add('a')
    for word in STOP_WORDS:
        nlp.vocab[word].is_stop = True
    doc = author_df.text.apply(nlp)

    # remove stop words and punctuations
    clean_and_lemmatize = lambda x: ' '.join([t.lemma_ for t in x if not t.is_punct and not t.is_stop])
    author_df['text_cleaned'] = doc.apply(clean_and_lemmatize)

    # entities
    author_df['text_with_entities'] = doc.apply(replace_ents)

    # pos-tag pairs
    author_df['text_pos_tag_pairs'] = author_df['text'].apply(lambda row: pos_tag_pairs_sentence(row))

    # additional nlp meta features
    author_df['polarity_of_text'] = author_df['text'].apply(lambda row: get_polarity(row))
    author_df['punct_cnt'] = doc.apply(lambda x: len([t for t in x if t.is_punct]))
    author_df['words_cnt'] = doc.apply(lambda x: len([t for t in x if not t.is_punct]))
    author_df['ents_cnt'] = doc.apply(lambda x: len(x.ents))
    author_df['noun_chunks_cnt'] = doc.apply(lambda x: len(list(x.noun_chunks)))
    author_df['fraction_noun'] = author_df['text'].apply(lambda row: fraction_noun(row))
    author_df['fraction_adj'] = author_df['text'].apply(lambda row: fraction_adj(row))
    author_df['fraction_verbs'] = author_df['text'].apply(lambda row: fraction_verbs(row))

    return author_df
    def detectTextIn(self, Text):
        classFromText = []
        classFromText.append(Text)
        # Text=Text.lower()
        nlp = spacy.load('en_core_web_sm')
        # Adding Custom stop words
        STOP_WORDS.add("picture")
        STOP_WORDS.add("image")
        STOP_WORDS.add("images")
        STOP_WORDS.add("pics")
        STOP_WORDS.add("portrait")
        for word in STOP_WORDS:
            lexeme = nlp.vocab[word]
            lexeme.is_stop = True

        uni_string = str(Text)
        doc = nlp(uni_string)
        for ent in doc.ents:
            classFromText.append(ent.label_)

        Text = Text.lower()
        uni_string = str(Text)
        doc = nlp(uni_string)

        for token in doc:
            # """token.text, token.lemma_, token.pos_, token.tag_, token.dep_,
            #       token.shape_, token.is_alpha, token.is_stop"""
            if not token.is_stop:
                classFromText.append(token.lemma_)
                classFromText.append(token.text)

        classFromText = [a.lower() for a in classFromText]
        # drop empty strings; removing items while iterating over the same list is unreliable
        classFromText = set(text for text in classFromText if text != "")
        return classFromText
nlp.vocab['better'].is_stop  #will return False


#filtering the stopwords
ex1 = nlp("How do I keep looping through until the len(new_list) = len(data_list) (i.e. all the numbers are in the new list) with everything sorted without using the built in max, min, sort functions? I'm not sure if it's necessary to create a new list either.")
for word in ex1:
    if word.is_stop:
        print(word)
        
        
#another way
mylist = [word for word in ex1 if word.is_stop]

#adding/removing stopwords
print(nlp.vocab['lamao'].is_stop)
STOP_WORDS.add('lol')
print(nlp.vocab['lol'].is_stop)
STOP_WORDS.remove('lol')
print(nlp.vocab['lol'].is_stop)
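
# Note: in spaCy 2.x, mutating STOP_WORDS after the pipeline is loaded may not be reflected
# in is_stop for lexemes already cached in the vocab, which is why the prints above can look
# inconsistent and why several examples in this collection also set the flag explicitly.
# A minimal sketch of that sync step (assuming the same nlp object as above):
STOP_WORDS.add('lol')
nlp.vocab['lol'].is_stop = True    # nlp.vocab['lol'].is_stop now reports True
STOP_WORDS.remove('lol')
nlp.vocab['lol'].is_stop = False   # and back to False after removal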




########################################################
docs = nlp('Aditya went to the Tajmahal in the Agra and ate icecream there')
for chunk in docs.noun_chunks:
    print(chunk.text)  # prints the full chunk text, determiner included (e.g. 'the Tajmahal')


for chunk in docs.noun_chunks:
    print(chunk.root.text)  # prints only the head word of each chunk (e.g. 'Tajmahal')
Example no. 11
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en_core_web_md')
domain_stop_words = ['chapter', '<', '>', ';', 'vinegar', 'of', '%']
for word in domain_stop_words:
    STOP_WORDS.add(word)
STOP_WORDS1 = STOP_WORDS.copy()
STOP_WORDS1.discard('other')


def nlp0(sentence):
    sentence = sentence.lower()

    word_list = [
        token.lemma_ for token in nlp(sentence)
        if not token.is_stop and not token.is_punct
    ]

    return word_list


def nlp1(sentence):
    sentence = sentence.lower()
    word_list = [
        str(token.lemma_) for token in nlp(sentence)
        if str(token) not in STOP_WORDS1 and not token.is_punct
    ]
    word_list1 = []
    flag = 0
    for i in word_list:
        if i == 'other':
Example no. 12
def updateStopWords():
    '''this function is used to update the stop words corpus'''

    # adding couple of stop words
    STOP_WORDS.add("i'm")
    STOP_WORDS.add("isn't")
    STOP_WORDS.add("let's")
    STOP_WORDS.add("ha")
    STOP_WORDS.add("according")
    STOP_WORDS.add("want")
    STOP_WORDS.add("like")
Example no. 13
        doc = nlp(" ".join(sent)) 
        texts_out.append([token.lemma_ for token in doc if token.pos_ in allowed_postags])
    return pd.Series(texts_out)
"""Remove Stop Words"""
data_words_nostops = remove_stopwords(data_words)# Remove Stop Words
""" Form Bigrams"""
data_words_bigrams = make_bigrams(data_words_nostops)

""" Initialize spacy 'en' model, keeping only tagger component (for efficiency)
# python3 -m spacy download en"""
nlp = spacy.load('en', disable=['parser', 'ner'])
"""Do lemmatization keeping only noun, adj, vb, adv"""
data_lemmas = lemmatization(data_words_bigrams, allowed_postags=['NOUN' ]) #, 'VERB', 'ADV', 'ADJ',
         
"""Remove more Stop Words, find ways to remove a long list of SWS, not one, this only adds one sw"""
STOP_WORDS.add("artificial_intelligence")
data_lemmatized = remove_stopwords(data_lemmas)# Rem
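
# Since the note above asks for a way to handle a long list of stop words rather than one at a
# time: STOP_WORDS is a plain Python set, so update() (or |=) adds many entries in one call.
# Minimal sketch; the extra terms below are placeholders, not from the original code.
extra_stop_words = {"artificial_intelligence", "machine_learning", "neural_network"}
STOP_WORDS.update(extra_stop_words)   # equivalent to: STOP_WORDS |= extra_stop_words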


"""4. Create the Dictionary and Corpus and BOW
# 4.1 Create Dictionary"""
id2word = corpora.Dictionary(data_lemmatized) 
"""size of dictionary"""
print("Found {} words.".format(len(id2word.values())))
""" corpus"""  
corpus = [id2word.doc2bow(text) for text in data_lemmatized]
"""check sparsity, for instance, sparse = .99  tokens which are missing from more than 99 of the documents in the corpus. """
data_dense = gensim.matutils.corpus2dense(corpus, num_docs=len(corpus),
                                  num_terms= len(id2word.values()))
print("Sparsicity: ", 1.0 - (data_dense > 0).sum() / data_dense.size)
"""
Example no. 14
if __name__ == '__main__':
    if len(sys.argv) != 3:
        print('Usage: ')
        print('\tpython nmf.py SUBREDDIT_NAME NUM_TOPICS')
        sys.exit(1)

    # Disable tagger, parser and named-entity recognition
    nlp = spacy.load('en', disable=['tagger', 'parser', 'ner'])

    # Load custom stoplist
    with open('../../data/stoplist.txt', 'r') as f:
        stops = f.read().split()

    for stop in stops:
        STOP_WORDS.add(stop)

    for word in STOP_WORDS:
        lexeme = nlp.vocab[word]
        lexeme.is_stop = True

    # Read data.
    DATA_FILE = '../pmf/NeutralPolitics.csv'  #'../data/bigquery/2017/11-12/' + sys.argv[1] + '.csv'
    data = pd.read_csv(DATA_FILE)
    data = data.iloc[:, 0].fillna('').astype(str).squeeze()  # FIXME change 0 to 1
    print('Loaded Reddit comments.')

    # Disregard the bottom 70% of all comments, by simple count of split tokens.
    counts = data.apply(lambda s: len(s.split()))
    threshold = counts.quantile(0.7)
    data = data[counts > threshold]
Example no. 15
            if ent.text not in STOP_WORDS and ent.label_ in tracked_entities:
                anon_comment = anon_comment.replace(ent.text, ent.label_)
        return name_placeholder(anon_comment)
    except:
        print(text)
        pass


#let's hold off on whitespacing:
#def paddingFunc(text):
#    text = re.sub('([.,!?()])', r' \1 ', text)
#    text = re.sub('\s{2,}', ' ', text)
#    return text

#original entity replacement code for reference
'''# -*- coding: utf-8 -*-
import pandas as pd
import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from spacy.vocab import Vocab
import pandas as pd
import numpy as np
import os
import xx_ent_wiki_sm

nlp = spacy.load('en_core_web_sm')

#python -m spacy download xx for multilingual processing
nlp_multilingual = xx_ent_wiki_sm.load()

from spacy.lang.en.stop_words import STOP_WORDS
Example no. 16
def queryTokens(cadena, languages):
    cadena = __preprocessString(cadena)
    # remove non ascii-characters
    cadena = ''.join(i for i in cadena if ord(i) < 128)
    cadena = cadena.strip()  # remove initial and end spaces
    word_tokens = word_tokenize(cadena)
    # Detect in which language the text is written
    lang = detect_language(word_tokens, languages)
    stop_words = set(stopwords.words(lang))  # Filtering stop words
    inverters = set([
        'dont', 'doesnt', 'havent', 'arent', 'didnt', 'wasnt', 'werent', 'not',
        'never', 'hardly', 'seldom'
    ])
    incrementers = set(['too', 'many', 'much', 'very', 'lots'])
    # extra informal tokens and contraction fragments treated as stop words
    STOP_WORDS.update({
        'im', 'pm', 'ai', 'ie', 'still', 'cant', 'isnt', 'couldnt', 'youre',
        'seen', 'say', 'says', 'tell', 'lot', 'lol', 'hes', 's', 'be'
    })
    filtered_sentence = [
        w for w in word_tokens if not w in stop_words and not w in inverters
        and not w in incrementers and not w in STOP_WORDS
    ]  # Checking not in stop_words
    return filtered_sentence
Example no. 17
import pickle
import spacy
import re
from spacy.lang.en.stop_words import STOP_WORDS
nlp = spacy.load('en', disable=['parser'])


CLASSIFIER_ROOT = 'classifiers/'
TRANSFORMERS = ['transform_bag_of_words_0.sav',
                'transform_bag_of_words_1.sav']
MODELS = ['nb.sav']


STOP_WORDS.add("'s")
for word in STOP_WORDS:
    lexeme = nlp.vocab[word]
    lexeme.is_stop = True

def load_model(model_name):
    with open('{0}{1}'.format(CLASSIFIER_ROOT, model_name), 'rb') as f:
        model = pickle.load(f)
    return model

CLF_NB = load_model(MODELS[0])
TRANSFORMERS_MODELS = [load_model(TRANSFORMERS[0]), load_model(TRANSFORMERS[1])]

def clean_html(raw_html):
    cleanr = re.compile('<.*?>')
    cleantext = re.sub(cleanr, '', raw_html)
    return cleantext.lower()
Example no. 18
# code based on https://t.co/69FA0rkKUU

import dns
import spacy
import argparse
import json

import numpy as np
import pandas as pd

from urllib.parse import quote
from pymongo import MongoClient
from collections import OrderedDict
from spacy.lang.en.stop_words import STOP_WORDS

STOP_WORDS.add('rt')
STOP_WORDS.add('#')
STOP_WORDS.add('%')
STOP_WORDS.add('|')

parser = argparse.ArgumentParser()
parser.add_argument("user", help="Username for server access", type=str)
parser.add_argument("password", help="Password for server access", type=str)
parser.add_argument("server", help="Mongo DB server address", type=str)
parser.add_argument("day",
                    help='Day in twitter format. Example: "Wed May 06"',
                    type=str)
args = parser.parse_args()

nlp = spacy.load('en_core_web_sm')
Example no. 19
def LDA_Analysis():
    #http://nbviewer.jupyter.org/github/skipgram/modern-nlp-in-python/blob/master/executable/Modern_NLP_in_Python.ipynb

    if 0 == 1:
        with open('data/review_text_all.txt','w') as myfile:
            myfile.write("")
        
        '''
        loop through db and write jobs descriptions
        '''
        
        with open('data/review_text_all.txt','a') as myfile:
            with Job() as db:
                a=0
                max_ = int(db.getNoJobs()[0][0])
                while (a < max_):
                    #print(a)
                    sample_review = db.readJobDetailClean(a)[0][1]
                    if (sample_review != 'Json Error'):
                        myfile.write(str(sample_review)+'\n')
                    a += 1
    
    #unigram_sentences_filepath = os.path.join(intermediate_directory, 'unigram_sentences_all.txt')
    
    if 0 == 1:
    
        with codecs.open('data/unigram_sentences_all.txt', 'w', encoding='utf_8') as f:
            for sentence in lemmatized_sentence_corpus('data/review_text_all.txt'):
                f.write(sentence + '\n')
    
    unigram_sentences = LineSentence('data/unigram_sentences_all.txt')
   
    '''
    for unigram_sentence in it.islice(unigram_sentences, 230, 240):
        print(u' '.join(unigram_sentence))
        print(u'')
    '''
        
    #bigram_model_filepath = os.path.join(intermediate_directory, 'bigram_model_all')
    
    if 0 == 1:

        bigram_model = Phrases('data/unigram_sentences_all.txt')
    
        bigram_model.save('data/bigram_model_all')
    
    # load the finished model from disk
    bigram_model = Phrases.load('data/bigram_model_all')
    
    #bigram_sentences_filepath = os.path.join(intermediate_directory, 'bigram_sentences_all.txt')
   
    if 0 == 1:
    
        with codecs.open('data/bigram_sentences_all.txt', 'w', encoding='utf_8') as f:
            
            for unigram_sentence in unigram_sentences:
                
                bigram_sentence = u' '.join(bigram_model[unigram_sentence])
                
                f.write(bigram_sentence + '\n')
            
    bigram_sentences = LineSentence('data/bigram_sentences_all.txt')
            
    '''                    
    for bigram_sentence in it.islice(bigram_sentences, 230, 240):
        print(u' '.join(bigram_sentence))
        print(u'')  
    '''

    #trigram_model_filepath = os.path.join(intermediate_directory, 'trigram_model_all')

    if 0 == 1:
    
        trigram_model = Phrases(bigram_sentences)
    
        trigram_model.save('data/trigram_model_all')
        
    # load the finished model from disk
    trigram_model = Phrases.load('data/trigram_model_all')

    #trigram_sentences_filepath = os.path.join(intermediate_directory, 'trigram_sentences_all.txt')                     

    if 0 == 1:
    
        with codecs.open('data/trigram_sentences_all.txt', 'w', encoding='utf_8') as f:
            
            for bigram_sentence in bigram_sentences:
                
                trigram_sentence = u' '.join(trigram_model[bigram_sentence])
                
                f.write(trigram_sentence + '\n')
                
    trigram_sentences = LineSentence('data/trigram_sentences_all.txt')

    '''
    for trigram_sentence in it.islice(trigram_sentences, 230, 240):
        print(u' '.join(trigram_sentence))
        print(u'')
    '''

    #trigram_reviews_filepath = os.path.join(intermediate_directory, 'trigram_transformed_reviews_all.txt')
    
    if  0 == 1:
      
        import csv
        
        '''
        Variant A: Use Stopwords
        1) download StopWords.csv from MySQL table: KeyWords.
        2) Remove all relevant words by hand ;)
        '''
        with open('data/StopWords.csv', newline='') as csvfile:
          
          stopwords_ = csv.reader(csvfile, delimiter=' ', quotechar='|')
          for words_ in stopwords_:
            #print(words_[0])
            STOP_WORDS.add(words_[0])
    
        #print(STOP_WORDS)
        
        '''
        Variant B: Use Dictionary
        '''
        with open('data/Dictionary.csv', 'r', newline='') as csvfile:
          
          file_ = csv.reader(csvfile, delimiter=',', quotechar='"')
          
          dictionary_ = []

          for row in file_:
              dictionary_.append(row[0])
          
          #with open('file.csv', 'r') as f:
          #    reader = csv.reader(f)
          #    your_list = list(reader)
    
    
        with codecs.open('data/trigram_transformed_reviews_all.txt', 'w', encoding='utf_8') as f:
            
            for parsed_review in nlp.pipe(line_review('data/review_text_all.txt'), batch_size=10000, n_threads=4):
                
                # lemmatize the text, removing punctuation and whitespace
                unigram_review = [token.lemma_ for token in parsed_review
                                  if not punct_space(token)]
                
                # apply the first-order and second-order phrase models
                bigram_review = bigram_model[unigram_review]
                trigram_review = trigram_model[bigram_review]
                
                # remove any remaining stopwords
                '''
                Variant A:
                '''
                #trigram_review = [term for term in trigram_review
                #                  if term not in STOP_WORDS]#spacy.en.STOPWORDS] !!!!! CHECK THIS !!!!! module 'spacy' has no attribute 'en'
                
                '''
                Variant B:
                '''
                trigram_review = [term for term in trigram_review
                                  if term in dictionary_]#
                
                # write the transformed review as a line in the new file
                trigram_review = u' '.join(trigram_review)
                f.write(trigram_review + '\n')
                
    '''
    print(u'Original:' + u'\n')
    
    for review in it.islice(line_review('review_text_all.txt'), 11, 12):
        print(review)
    
    print(u'----' + u'\n')
    print(u'Transformed:' + u'\n')
    
    with codecs.open('trigram_transformed_reviews_all.txt', encoding='utf_8') as f:
        for review in it.islice(f, 11, 12):
            print(review)
    '''

    #trigram_dictionary_filepath = os.path.join(intermediate_directory, 'trigram_dict_all.dict')

    if 0 == 1:
    
        trigram_reviews = LineSentence('data/trigram_transformed_reviews_all.txt')
    
        # learn the dictionary by iterating over all of the reviews
        trigram_dictionary = Dictionary(trigram_reviews)
        
        # filter tokens that are very rare or too common from
        # the dictionary (filter_extremes) and reassign integer ids (compactify)
        trigram_dictionary.filter_extremes(no_below=10, no_above=0.4)#,keep_n=100000)#,)
        trigram_dictionary.compactify()
    
        trigram_dictionary.save('data/trigram_dict_all.dict')
        
    # load the finished dictionary from disk
    trigram_dictionary = Dictionary.load('data/trigram_dict_all.dict')
    
    #trigram_bow_filepath = os.path.join(intermediate_directory, 'trigram_bow_corpus_all.mm')
    
    if 0 == 1:
    
        # generate bag-of-words representations for
        # all reviews and save them as a matrix
        MmCorpus.serialize('data/trigram_bow_corpus_all.mm', trigram_bow_generator(trigram_dictionary,'data/trigram_transformed_reviews_all.txt'))
        
    # load the finished bag-of-words corpus from disk
    trigram_bow_corpus = MmCorpus('data/trigram_bow_corpus_all.mm')
    
    #lda_model_filepath = os.path.join(intermediate_directory, 'lda_model_all')
    
    if 0 == 1:
    
        with warnings.catch_warnings():
            warnings.simplefilter('ignore')
            
            # workers => sets the parallelism, and should be
            # set to your number of physical cores minus one
            lda = LdaMulticore(trigram_bow_corpus,
                               num_topics=15,
                               id2word=trigram_dictionary,
                               workers=1)
        
        lda.save('data/lda_model_all')
        
    # load the finished LDA model from disk
    lda = LdaMulticore.load('data/lda_model_all')

    #explore_topic(lda, topic_number=1)

    topic_names = {0:u'Risk Management Bank', 
                   1:u'Big Data Report', 
                   2:u'Automotive SAP', 
                   3:u'Microsoft Java Scrum', 
                   4:u'Medical Consultant', 
                   5:u'Java Engineer', 
                   6:u'Computer Vision Developer', 
                   7:u'Data Analyst', 
                   8:u'BI SAP BW', 
                   9:u'IOT Reporting R', 
                   10:u'Global Project Presentation',
                   11:u'Cloud Engineer IOT', 
                   12:u'Industry 4.0', 
                   13:u'Risk Consulting', 
                   14:u'Machine Learning Data Science'}
    
    #topic_names_filepath = os.path.join(intermediate_directory, 'topic_names.pkl')
    
    with open('data/topic_names.pkl', 'wb') as f:
        pickle.dump(topic_names, f)
    
    #load sameple_review from database
    #sample_review = get_sample_review(10)
    
    #lda_description(bigram_model, trigram_model, trigram_dictionary, lda, topic_names, sample_review)

    #LDAvis_data_filepath = os.path.join(intermediate_directory, 'ldavis_prepared')
    
    if 0 == 1:
        
        #term_ix = np.sort(topic_info.index.unique().values)
    
        LDAvis_prepared = pyLDAvis.gensim_.prepare(lda, trigram_bow_corpus, trigram_dictionary)
    
        with open('data/ldavis_prepared', 'wb') as f:
            pickle.dump(LDAvis_prepared, f)
            
    '''
    export LDA file
    '''
    
    # load the pre-prepared pyLDAvis data from disk
    with open('data/ldavis_prepared', 'rb') as f:
        LDAvis_prepared = pickle.load(f)

    with open('data/DSJobs_LDA.html', 'w') as f:
        pyLDAvis.save_html(LDAvis_prepared, f)            
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

pd.set_option('display.expand_frame_repr', False)
np.random.seed(42)

nlp = spacy.load('en_core_web_sm')

# Combine spacy and linguistic utils stopwords
stop_words = pd.read_csv(
    'http://ir.dcs.gla.ac.uk/resources/linguistic_utils/stop_words',
    header=None,
    squeeze=True)

for stop_word in stop_words:
    STOP_WORDS.add(stop_word)

# where your text files are
text_path = Path('data/text')
text_files = text_path.glob('*.txt')

# where the clean version should go
clean_path = Path('data/clean')
if not clean_path.exists():
    clean_path.mkdir(exist_ok=True, parents=True)
for i, text_file in enumerate(text_files):
    if i % 100 == 0:
        print(i, end=' ', flush=True)

    doc = text_file.read_text()
    clean_doc = ' '.join([
import string
import nltk

try:
    stemmer = SnowballStemmer("english")
except:
    nltk.download("wordnet")
    stemmer = SnowballStemmer("english")

try:
    nlp = en_core_web_sm.load()
except:
    os.system("python -m spacy download en_core_web_sm")
    nlp = en_core_web_sm.load()

stop.add("phone")

punc = set(string.punctuation)


def clean(doc):
    # lower text and remove punctuations
    s = ""
    for char in doc.lower():
        s += char if char not in punc else " "
    # remove stopwords, adjectives, and adverbs
    normalized = []
    for token in nlp(s):
        if not (
            token.is_space
            or token.is_stop
Windows: python -m spacy download en   (run as Administrator)

Linux: sudo python -m spacy download en
"""

nlp = spacy.load('en')

"""#Exploring spaCy"""

from spacy.lang.en.stop_words import STOP_WORDS
STOP_WORDS

f'There are {len(STOP_WORDS)} stopwords in spaCy'

# You can add your own corpora specific STOPWORDS using the .add syntax
STOP_WORDS.add("your_additional_stop_word_here")
f'After adding your own stop words, spaCy will use {len(STOP_WORDS)} stopwords'
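
# Because STOP_WORDS is an ordinary Python set, re-adding a word that is already present is a
# no-op and the count stays the same; a small sanity-check sketch:
before = len(STOP_WORDS)
STOP_WORDS.add("your_additional_stop_word_here")  # already present, so nothing changes
assert len(STOP_WORDS) == before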

doc = nlp("I am learning the most important ideas Natural Language Processing ideas using Python")
print(doc)  # doc is a spaCy object which stores the entire document string

"""**About spaCy objects**"""

for token in doc:
    print(token)

simplified_doc = [token for token in doc if not token.is_punct | token.is_stop]
simplified_doc
# please note that .orth_ attribute returns the unicode string representation of the token

"""We can also check what other things we know about these tags in the simplified_doc:"""
Example no. 23
# glue
job_df = pd.concat([jd_df, req_df], axis=1, ignore_index=True)
job_df.columns = ["Req ID","Req Title",
               "Job Requisition Status", "Candidate ID",
              "Division", "Function", "Job Description"]
job_df.head()



### Clean text ###

# tokenize every text
tokenizer = RegexpTokenizer(r'\w+')


# remove numbers
resume_df["Resume Text"].replace(r'[\d]','',regex=True, inplace=True)
job_df["Job Description"].replace(r'[\d]','',regex=True, inplace=True)

# lower case all words
resume_df["Resume Text"] = resume_df["Resume Text"].str.lower()
job_df["Job Description"] = job_df["Job Description"].str.lower()

# remove stopwords
STOP_WORDS.add('')
# try_df["Resume Text"] = resume_df["Resume Text"].apply(lambda x: [str(word) for word in x if word not in STOP_WORDS])
# try_df["Job Description"] = job_df["Job Description"].apply(lambda x: [word for word in x if word not in STOP_WORDS])


resume_df.to_csv('data/cleaned_resume.csv', index=False)
job_df.to_csv('data/cleaned_job.csv', index=False)
Example no. 24
import os
import re
from unidecode import unidecode
import numpy as np
import json
import sys
import logging
from numpy.linalg import norm
from gensim.test.utils import datapath
from gensim.models.fasttext import load_facebook_model
from spacy.lang.en.stop_words import STOP_WORDS

STOP_WORDS.add('de_l_la_le_di')
logging.basicConfig(level=logging.INFO,
                    format="%(asctime)s [%(levelname)-5.5s]  %(message)s",
                    handlers=[logging.StreamHandler(sys.stdout)])
logger = logging.getLogger()


class MemoryGenerator():
    def __init__(self, dataset, conv2kg, kgs, fasttext_emb_path):
        logger.info("Initializing Memory Generator ....")
        self.conv2kg = conv2kg
        self.kgs = kgs
        self.mapping = json.load(open("data/" + dataset + "/ERmapping.json"))
        self.maxEntity, self.maxRel = self.read_dataset(dataset)
        logger.info("MaxENT: " + str(self.maxEntity) + " maxREL: " +
                    str(self.maxRel))
        self.matrix_dim = self.maxEntity + self.maxRel
        self.word_emb = load_facebook_model(
            datapath(os.getcwd() + "/" + fasttext_emb_path))
Example no. 25
#Add & Remove a new Stop Word
import nltk
STOP_WORDS = nltk.corpus.stopwords.words('english')
STOP_WORDS.append('Test')

print(len(STOP_WORDS))
print(STOP_WORDS)

import nltk

STOP_WORDS.remove('Test')

print(len(STOP_WORDS))
print(STOP_WORDS)

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

STOP_WORDS.add("Test")

print(len(STOP_WORDS))
print(STOP_WORDS)

import spacy
from spacy.lang.en.stop_words import STOP_WORDS

STOP_WORDS.remove("Test")

print(len(STOP_WORDS))
print(STOP_WORDS)
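
# Note that set.remove() raises a KeyError when the word is not in the set (for instance if the
# add step above was skipped), while set.discard() is the safe variant. Minimal sketch:
STOP_WORDS.discard("Test")   # no error even if "Test" was already removed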
Example no. 26
import os
import re

import spacy
from spacy.lang.en.stop_words import STOP_WORDS
from typing import Text, List

from utils import deprecated


PathType = str
# Load the spacy english model
nlp = spacy.load("en")
STOP_WORDS.add("-PRON-")
STOP_WORDS.add("~sil")


@deprecated
def remove_stopwords(text: Text
                     ) -> Text:
    # This function removes stopwords from a list of strings
    # Parameter: 'list_of_tokens': a list of strings
    # return: 'list_of_tokens' without the stopwords
    list_of_tokens = re.split(r"\s", text)
    assert type(list_of_tokens) is list, "list_of_tokens must be of type list"
    doc = " ".join([t for t in list_of_tokens if t not in STOP_WORDS])
    return re.sub(r' +', r' ', doc)


@deprecated
def lemmatize(text: Text
Example no. 27
def add_sw(new_sw):
    STOP_WORDS.add(new_sw)
    return
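
# A brief, hypothetical usage sketch for the helper above (the word is made up). As in the
# earlier examples, adding to the set alone may not update is_stop on an already-loaded
# pipeline, so also set nlp.vocab[word].is_stop = True if you rely on that flag.
add_sw("btw")                 # hypothetical call, not part of the original snippet
print("btw" in STOP_WORDS)    # True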