def load_stoplist(topic_words=False, lang="en"):
    try:
        if lang == "en":
            if topic_words:
                return set(get_stop_words("en") + STOP_LIST + get_topic_stoplist())
            else:
                return set(get_stop_words("en") + STOP_LIST + stopwords.words('english'))
        elif lang == "nl":
            return set(get_stop_words("nl") + stopwords.words('dutch') + STOP_LIST_NL)
    except:
        print "warning: no stopwords were downloaded. check nltk corpora"
        print format_exc()
        return set()
def test_filters(self):
    language = 'en'
    before = get_stop_words(language, False)
    letter = random.choice(random.choice(before))

    def remove_letter(stopwords, language):
        return [word for word in stopwords if letter not in word]

    stop_words.add_filter(remove_letter)
    after = get_stop_words(language, False)
    for stopword in after:
        self.assertFalse(letter in stopword)
    self.assertTrue(stop_words.remove_filter(remove_letter))
def get_most_freq(all_comments):
    APP_ROOT = os.path.dirname(os.path.abspath(__file__))
    APP_STATIC = os.path.join(APP_ROOT, 'static')
    file_name = os.path.join(APP_STATIC, 'freq_portugues.p')
    dict_freq = pickle.load(open(file_name, "rb"))

    web_stopWords = ["q", "vc", "vcs", "tipo", "ta", "pra", "pq", "ne", "sobre", "ser", "cara", "la"]

    all_comments = remove_accents(all_comments)
    tokens = all_comments.split()

    # build token dictionary
    dict_tokens = {}
    for token in tokens:
        if token in dict_tokens:
            dict_tokens[token] += 1
        else:
            dict_tokens[token] = 1

    # remove stop words
    stopWords = get_stop_words('portuguese', cache=True)
    stopWords += get_stop_words('english', cache=True)
    stopWords += web_stopWords

    # remove stop words
    for word in stopWords:
        dict_tokens.pop(remove_accents(word), None)

    # for word in dict_tokens:
    #     print(dict_tokens[token])
    #     dict_tokens[token] = 1 + math.log(dict_tokens[token])

    # sorted by frequency
    sorted_tokens = sorted(dict_tokens.items(), key=operator.itemgetter(1), reverse=True)
    num_tokens = int(min(len(sorted_tokens) / 2, 1000))
    sorted_tokens = sorted_tokens[0:num_tokens]

    # normalize by frequency
    standart_frequency = dict_freq["acelga"]
    for i in range(len(sorted_tokens)):
        (token, value) = sorted_tokens[i]
        if token in dict_freq:
            sorted_tokens[i] = (token, math.log(value / dict_freq[token]))
        else:
            sorted_tokens[i] = (token, math.log(value / standart_frequency))

    sorted_tokens_after = sorted(sorted_tokens, key=operator.itemgetter(1), reverse=True)
    max_num_words = 100
    sorted_tokens_after = sorted_tokens_after[0:max_num_words]
    return sorted_tokens_after
def test_get_stop_words_cache(self):
    self.assertFalse('french' in stop_words.STOP_WORDS_CACHE)
    sw = get_stop_words('fr')
    self.assertTrue('french' in stop_words.STOP_WORDS_CACHE)
    original_stop_words_dir = stop_words.STOP_WORDS_DIR
    stop_words.STOP_WORDS_DIR = 'not-existing-directory'
    self.assertEqual(sw, get_stop_words('french'))
    stop_words.STOP_WORDS_DIR = original_stop_words_dir
    try:
        get_stop_words('klingon')
    except:
        pass
    self.assertFalse('klingon' in stop_words.STOP_WORDS_CACHE)
def word_list(text):
    list = {}
    words = text.split()
    stop_words = get_stop_words('en')  # stop words is a list of common words used in English
    stop_words = get_stop_words('english')
    words = [word for word in words if word not in stop_words]  # removing stop words
    for i in words:
        if all(j.isdigit() for j in i):  # classifying token as number feature
            if list.has_key("NUMBER"):
                list["NUMBER"] += 1
            else:
                list["NUMBER"] = 1
        elif (len(i) >= 4 and i[0] == 'h' and i[1] == 't' and i[2] == 't' and i[3] == 'p'):
            if list.has_key("LINKS"):  # classifying token as link feature
                list["LINKS"] += 1
            else:
                list["LINKS"] = 1
        elif all(j in string.punctuation for j in i):
            if list.has_key("PUNCTUATION"):  # classifying token as punctuation feature
                list["PUNCTUATION"] += 1
            else:
                list["PUNCTUATION"] = 1
        elif len(i.translate(None, string.punctuation)) < 3:
            continue
        elif i.upper() == i:
            if list.has_key("CAPSLOCK"):  # classifying token as capital word feature
                list["CAPSLOCK"] += 1
            else:
                list["CAPSLOCK"] = 1
        else:
            j = i.translate(None, string.punctuation).lower()
            if list.has_key(j):
                list[j] += 1
            else:
                list[j] = 1
    return list
def textToWordList(txt):
    p_stemmer = RussianStemmer()
    tokenizer = RegexpTokenizer(r'\w+')
    stop_w = [p_stemmer.stem(i) for i in get_stop_words('ru')]
    r = re.compile('^[а-я]+$')
    badword = [
        'дом', 'город', "дорог", "час", "ноч", "слов", "утр", "стран",
        "пут", "путешеств", "мест", 'нов', "друз", "добр"
    ]
    txt = txt.lower().replace("<br>", "\n")
    tokens = [p_stemmer.stem(i) for i in tokenizer.tokenize(txt)]
    tokens = [i for i in tokens if not i in stop_w and r.match(i) and not i in badword]
    return tokens
def keywords_search(reviews):
    key_map = {}
    # for k in open(os.getcwd() + "/KeyWord/keyword_map_general.txt", 'r'):
    for k in open(keyword_general_path, 'r'):
        a = k.strip().split(", ")
        key_map[a[0]] = a[1]

    special_map = {}
    # for k in open(os.getcwd() + "/KeyWord/keyword_map_special.txt", 'r'):
    for k in open(keyword_special_path, 'r'):
        a = k.strip().split(", ")
        special_map[a[0]] = a[1]

    raw = reviews.lower()
    tokenizer = TweetTokenizer()
    tokens = tokenizer.tokenize(raw)

    # remove punctuations
    no_punc_tokens = [i for i in tokens if (not i in string.punctuation + string.digits) and (not "." in i)]

    # remove stop words from tokens
    en_stop = get_stop_words('en')
    stopped_tokens = [i for i in no_punc_tokens if not i in en_stop]

    # stem tokens
    # wordnet_lemmatizer = WordNetLemmatizer()
    # stemmed_tokens = [wordnet_lemmatizer.lemmatize(i) for i in stopped_tokens]

    chosen_key_words = []

    # Search in general key word
    key_words_dict = dict.fromkeys(key_map.values(), 0)
    # Select keyword use only key word to select
    # s = set(stemmed_tokens)
    s = set(stopped_tokens)
    for t in key_map.keys():
        if t in s:
            key_words_dict[key_map[t]] += 1
    for d in sorted(zip(key_words_dict.values(), key_words_dict.keys()))[:-4:-1]:
        if d[0] > 0:
            chosen_key_words.append(d[1])

    # Search in special keyword
    special_words_dict = dict.fromkeys(special_map.values(), 0)
    # Select keyword using wordnet
    # Select keyword use only key word to select
    # s = set(stemmed_tokens)
    s = set(stopped_tokens)
    for t in special_map.keys():
        if t in s:
            special_words_dict[special_map[t]] += 1
    for d in sorted(zip(special_words_dict.values(), special_words_dict.keys()))[:-3:-1]:
        if d[0] > 0:
            chosen_key_words.append(d[1])

    return ' '.join(chosen_key_words)
def load_dataset(dataset_file):
    """
    It is more efficient (O(1) vs. O(n)) to search a dictionary or a set
    compared to a list, as they are implemented with a hash.
    Therefore, the dataset is kept in 2 dictionaries whose values are sets.
    """
    items_original_form = defaultdict(set)
    items_by_keyword_start = defaultdict(set)
    items_by_id = defaultdict(set)
    stop_words = get_stop_words('english')

    with open(dataset_file) as f:
        lines = csv.reader(f, delimiter=',')
        for line in lines:
            item_id, *descriptors = line
            # save original form (3 separate fields:
            # id, description, company name) for output
            items_original_form[item_id] = descriptors
            # create 2 dictionaries for searching:
            # 1. Key: 3 lower-case first letters of each
            #    word of item descriptors. Value: item ids.
            # 2. Key: item id. Value: item descriptors in lower-case.
            descriptors_set = set(" ".join(descriptors).lower().split())
            for d in descriptors_set:
                if d not in stop_words:
                    items_by_keyword_start[d[:3]].add(item_id)
            items_by_id[item_id] = descriptors_set

    return (items_by_keyword_start, items_by_id, items_original_form)
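# Hypothetical usage sketch of load_dataset (the CSV path and the "acm" prefix are
# made-up examples, not part of the original code): membership tests against the
# returned dicts/sets are average O(1), which is the point made in the docstring above.
items_by_keyword_start, items_by_id, items_original_form = load_dataset("items.csv")
for item_id in items_by_keyword_start.get("acm", set()):
    # print the original id/description/company-name fields for each candidate match
    print(item_id, items_original_form[item_id])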
def lda(data):
    data = get_only_text(data)
    only_tweet = data
    length = len(only_tweet)
    length = min(20, length)
    for i in xrange(0, length):
        print i
        print only_tweet[i]
    return

    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = PorterStemmer()

    length = len(only_tweet)
    length = min(20, length)
    total_texts = []
    for i in xrange(0, length):
        print only_tweet[i]
        print
        to_lower = only_tweet[i].lower()
        tokens = tokenizer.tokenize(to_lower)
        stopped_tokens = [k for k in tokens if not k in en_stop]
        texts = [p_stemmer.stem(k) for k in stopped_tokens]
        total_texts.append(texts)

    dictionary = corpora.Dictionary(total_texts)
    corpus = [dictionary.doc2bow(text) for text in total_texts]
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=2, id2word=dictionary, passes=20)
    result = ldamodel.print_topics(num_topics=2, num_words=1)
    for i in result:
        print i
def lemmatization_intern(lang, rss, result, doc):
    # Build and configure the TreeTagger wrapper
    tagger = treetaggerwrapper.TreeTagger(TAGLANG=lang, TAGDIR=treetagger_path,
                                          TAGINENC='utf-8', TAGOUTENC='utf-8')
    # Run it
    tags = tagger.TagText(rss)
    data = formatTTG(tags, tagger, stop_words.get_stop_words(language=lang))

    for k in [1, 2, 3]:
        i = 0
        liste = []
        while i <= len(data) - k:
            lemma = getLemma(data[i])
            for j in range(k - 1):
                lemma += " " + getLemma(data[i + j + 1])
            if lemma not in result[k-1]:
                result[k-1][lemma] = 0
                doc[k-1][lemma] = 1
                liste += [lemma]
            elif lemma not in liste:
                doc[k-1][lemma] += 1
                liste += [lemma]
            result[k-1][lemma] += 1
            i += 1
    return result, doc
def bag_of_words_vectorizer(datafile, k_features):
    """
    Computes sparse term-document matrix of datafile documents, selects k best features by chi2 test.
    Yields batches of BATCH_SIZE dense tdm vectors and a vector of labels, transformed for the keras nn.
    """
    data = []
    labels = []
    for jsoned_entity in open(datafile, errors="ignore").readlines():
        entity = json.loads(jsoned_entity)
        if entity["lang"] == "en":
            data.append(entity["text"])
            labels.append(entity["label"])

    vectorizer = TfidfVectorizer(stop_words=get_stop_words("english"))
    data = vectorizer.fit_transform(data)
    data = SelectKBest(chi2, k=k_features).fit_transform(data, labels)

    for vector_label_batch in batch(zip(data, labels), config.BATCH_SIZE):
        vectors = []
        labels = []
        for vec_label in vector_label_batch:
            vectors.append(vec_label[0].toarray())
            labels.append(vec_label[1])
        X = np.vstack(vectors)
        Y = np_utils.to_categorical(labels, 2)
        yield X, Y
def process_line_mymodel(line):
    """
    @params line: list of all tokens contained in a line
    format: id_img nb_pairs(word, points) w1 p1 w2 p2 .... wn pn

    return: key, value for the dictionary
    key: id_img
    value: list of pairs w-p

    remove stop words?
    """
    en_stop = get_stop_words('en')
    # print en_stop
    key = line[0]
    nb_pairs = int(line[1])
    i = 0
    value = []
    weights = {}
    while i < nb_pairs * 2:
        # print line[2+i]
        # if line[2+i] not in en_stop:
        value.append(re.sub(r'[^\x00-\x7f]', r'', line[2 + i]))
        weights[re.sub(r'[^\x00-\x7f]', r'', line[2 + i])] = int(line[3 + i])
        i += 2

    # assert nb_pairs == len(value), "length of data diferent (nb_pairs =/= len(pairs))"
    return key, value, weights
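# Hypothetical input matching the format described in the docstring above
# (id_img nb_pairs w1 p1 w2 p2 ...); the token values are invented for illustration.
line = ["img_42", "2", "cat", "7", "tree", "3"]
key, value, weights = process_line_mymodel(line)
# key == "img_42", value == ["cat", "tree"], weights == {"cat": 7, "tree": 3}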
def issue_analysis(df):
    df_sub = df[['Issue']]
    df_sub.insert(0, 'count', 1)

    Issue_List = []
    for i in range(0, 50):
        Issue_List.append(df_sub.groupby(['Issue']).sum().sort_index(by='count', ascending=False).ix[i].name)

    tokenizer = RegexpTokenizer(r'[A-Za-z0-9\']+')  # set tokenize Reg
    en_stop = get_stop_words('en')                  # create English stop words list
    p_stemmer = PorterStemmer()                     # Create p_stemmer of class PorterStemmer
    texts = []                                      # list for tokenized documents in loop
    text_view = ''

    # loop through document list
    for i in Issue_List:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]
        # stem tokens and add them to list
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
        # print ' '.join(stemmed_tokens)
        text_view += ' '.join(stemmed_tokens)
        text_view += ' '

    wordcloud = WordCloud().generate(text_view)
    fig = plt.figure(figsize=(8, 6))
    fig1 = fig.add_subplot(1, 1, 1)
    fig1.set_title("Top issued words", fontdict={'fontsize': 25})
    fig1.imshow(wordcloud)
    fig1.axis("off")
    # plt.savefig('ComplainCount_WC.png')
    plt.savefig('ComplainCount_WC_2016.png')

    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts]

    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=25, id2word=dictionary)
    LDAText = ldamodel.print_topics(num_topics=5, num_words=3)
    # print "\n Topic analysis result for top 25 issues with LDA"
    # print(LDAText)

    vis_data = gensimvis.prepare(ldamodel, corpus, dictionary)
    # pyLDAvis.show(vis_data)
    # pyLDAvis.save_html(vis_data, "issue_lda.html")
    # pyLDAvis.save_json(vis_data, "issue_lda.json")
    pyLDAvis.save_html(vis_data, "issue_lda_2016.html")
    pyLDAvis.save_json(vis_data, "issue_lda_2016.json")

    return 0
def preprocess_wikidata(raw):
    # Initialize Tokenizer
    tokenizer = RegexpTokenizer(r'\w+')

    # Initialize Lemmatizer
    lemma = WordNetLemmatizer()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Decode Wiki Markup entities and remove markup
    text = filter_wiki(raw)
    text = re.sub(filter_more, '', text)

    # clean and tokenize document string
    text = text.lower().split('../img/')[0]
    tokens = tokenizer.tokenize(text)

    # remove stop words from tokens
    tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    tokens = [lemma.lemmatize(i) for i in tokens]

    # remove non alphabetic characters
    tokens = [re.sub(r'[^a-z]', '', i) for i in tokens]

    # remove unigrams and bigrams
    tokens = [i for i in tokens if len(i) > 2]

    return (tokens, text)
def sentence_to_tokens(self, sentence, option='lemmatize'):
    """
    Given an English sentence, return its list of tokens: tokenized,
    stop-word filtered, and optionally lemmatized or stemmed.
    :param sentence: English sentence
    :param option: lemmatize, stem or none
    :return: list of non-stop-word English tokens
    """
    log.debug("Tokenizing sentence")
    tokens = self.tokenizer.tokenize(sentence.lower())
    log.debug(tokens)

    # filter stop words
    log.debug("Filtering stop words")
    tokens = filter(lambda word: word not in get_stop_words('en'), tokens)
    log.debug(tokens)

    if option == 'lemmatize':
        # lemmatize
        log.debug("Lemmatizing")
        tokens = [self.lemmatizer.lemmatize(w) for w in tokens]
        log.debug(tokens)
    elif option == 'stem':
        # stemming
        log.debug("Stemming")
        tokens = [self.stemmer.stem(w) for w in tokens]
        log.debug(tokens)
    else:
        pass

    return tokens
def Tokenize(TextData):
    tokenizer = RegexpTokenizer(r'\w+')
    tokens = list()

    # create English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    # clean and tokenize document string
    raw = TextData.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # stem tokens
    stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
    tokens = stemmed_tokens

    TOKENIZEDTEXT_FILE = path.join(os.pardir, "Resources/TokenizedTextFiles/Personal-Narration/Unbroken - Motivational Video.txt")
    fp = open(TOKENIZEDTEXT_FILE, "w")
    print(TOKENIZEDTEXT_FILE)
    # pickle.dump(tokens, fp)
    fp.write(str(tokens))
    fp.close()
def modeling_tags(liked, numTopics):
    print '- my likes tag modeling :'

    # get documents
    documents = []
    for like in liked:
        s = unicode(' ')
        documents.append(s.join(liked[like][0]))

    # remove common and repeated words, and tokenize
    # stoplist = set('for a of the and to in'.split())
    stoplist = get_stop_words('en')
    texts = [[word for word in document.lower().split() if word not in stoplist]
             for document in documents]
    frequency = defaultdict(int)
    for text in texts:
        for token in text:
            frequency[token] += 1
    texts = [[token for token in text if frequency[token] > 0]
             for text in texts]
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]

    # transformations
    tfidf = models.TfidfModel(corpus)
    corpus_tfidf = tfidf[corpus]
    lsi = models.LsiModel(corpus_tfidf, id2word=dictionary, num_topics=numTopics)
    corpus_lsi = lsi[corpus_tfidf]
    index = similarities.MatrixSimilarity(corpus_lsi)

    # save transformation
    dictionary.save('/tmp/tags.dict')
    corpora.MmCorpus.serialize('/tmp/tags.mm', corpus)
    index.save('/tmp/tags.index')
    lsi.save('/tmp/tags.lsi')

    print ' ok'
    print ''
def get_frequency(self):
    # Selecting all the text in the database
    cursor = self.select_content('Content')

    # Initialising variables
    words = []
    count_handle = Counter()

    # Generating common word list to be removed from the keyword list to be generated
    sw = stop_words.get_stop_words("english")

    # Extracting all words from the given database
    for row in cursor:
        words += re.compile('\w+').findall(row[1])

    # Remove stop words from 'words' list
    words = [w.lower() for w in words if w.lower() not in sw]

    # Calculating the frequency of all words in the given database
    for w in words:
        count_handle[w] += 1

    # Writing the keywords returned into the file = category + "_keyword.txt"
    with open(self.out, 'w') as file_name:
        for word in count_handle.most_common(self.limit):
            file_name.write(word[0] + "\t" + str(word[1]) + "\n")
def get_stopset():
    """
    Gets a set of stopwords
    """
    stopset = set(get_stop_words('en'))
    # get those contractions
    add_stops = nltk.word_tokenize(' '.join(stopset))
    stopset.update(add_stops)
    # make sure to get contractions without punctuation, so that
    # order of operations doesn't matter later
    add_stops = [stopword.strip(string.punctuation) for stopword in stopset]
    stopset.update(add_stops)
    # custom stop words
    add_stops = [u'lp', u'ep', u'record', u'records', u'recorded', u'label', u'labels',
                 u'release', u'releases', u'released',
                 u'listen', u'listens', u'listened', u'listener',
                 u'version', u'versions', u'album', u'albums',
                 u'song', u'songs', u'track', u'tracks',
                 u'sound', u'sounds', u'thing', u'things', u'something',
                 u'music']
    stopset.update(add_stops)
    return stopset
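# Worked illustration of the two extra passes above (an assumption about typical
# NLTK and str behaviour, not part of the original code): for a list entry such as
# "you've", nltk.word_tokenize("you've") gives ['you', "'ve"], so the first pass
# adds the clitic "'ve" to the set; the second pass then adds
# "'ve".strip(string.punctuation) == 've', a punctuation-free variant, so later
# text cleaning can strip punctuation before or after stop-word removal.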
def createLDAModel(texts, n_topics, n_passes):
    """Generates a LDA model from an array of texts
    """
    tokenizer = RegexpTokenizer(r'\w+')
    # Create EN stop words list
    en_stop = get_stop_words('en')
    # Create p_stemmer of class PorterStemmer
    p_stemmer = PorterStemmer()

    texts_ = []

    # loop through document list
    for i in texts:
        # clean and tokenize document string
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)

        # remove stop words from tokens
        stopped_tokens = [i for i in tokens if not i in en_stop]

        # stem tokens
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]

        # add tokens to list
        texts_.append(stemmed_tokens)

    # turn our tokenized documents into a id <-> term dictionary
    dictionary = corpora.Dictionary(texts_)

    # convert tokenized documents into a document-term matrix
    corpus = [dictionary.doc2bow(text) for text in texts_]

    # generate LDA model
    ldamodel = gensim.models.ldamodel.LdaModel(corpus, num_topics=n_topics, id2word=dictionary, passes=n_passes)

    return(ldamodel)
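# Hypothetical usage sketch of createLDAModel; the toy documents, topic count and
# pass count below are made up for illustration, not part of the original code.
docs = [
    "Broccoli is good to eat and very healthy.",
    "Driving to baseball practice takes a lot of time.",
    "Health experts say driving may increase blood pressure.",
]
model = createLDAModel(docs, n_topics=2, n_passes=10)
print(model.print_topics(num_topics=2, num_words=3))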
def get_corpus():
    db_conn = MySQLdb.connect(host="localhost", port=8889, db="linked_reverb",
                              user="******", passwd="root")
    cursor = db_conn.cursor()
    cursor.execute("select argument1, argument2 from linked_entity80_a")

    ls_result = []
    ls_corpus = []
    row_count = int(cursor.rowcount)
    for i in range(0, row_count):
        row = cursor.fetchone()
        ls_result.append(row)

    stop_words = get_stop_words('en')

    for i in range(len(ls_result)):
        for item in ls_result[i][0].split(" "):
            if item in stop_words:
                pass
            else:
                ls_corpus.append(item)
        for item in ls_result[i][1].split(" "):
            if item in stop_words:
                pass
            else:
                ls_corpus.append(item)

    #
    # ls_corpus.append(ls_result[i][0].split(" "))
    # ls_corpus.append(ls_result[i][1].split(" "))

    db_conn.close()
    return ls_corpus
def convert_amazon_to_dict(dict_field, is_text, in_fname, out_fname):
    id = 0
    num_entries = 0
    field_dict = {'': 0}
    stop_words = get_stop_words('en')

    for entry in parse_amazon(in_fname):
        if entry.has_key(dict_field):
            num_entries += 1
            # if text field, parse and populate.
            if is_text:
                words = entry[dict_field].split()
                for word in words:
                    stemmed_word = stem(word)
                    if stemmed_word not in stop_words and stemmed_word not in field_dict:
                        id += 1
                        field_dict[stemmed_word] = id
            else:
                if entry[dict_field] not in field_dict:
                    id += 1
                    field_dict[entry[dict_field]] = id
                    #printf('%s -> %d\n', entry[dict_field], id)
        #if id > 100:
        #    break

    print "num_entries:", num_entries
    print "length of field_dict:", len(field_dict)
    with open(out_fname, 'wb') as outf:
        pickle.dump(field_dict, outf)
def lda_approach_one():
    tokenizer = RegexpTokenizer(r'\w+')
    en_stop = get_stop_words('en')
    p_stemmer = PorterStemmer()
    # doc_a = "Brocolli is good to eat. My brother likes to eat good brocolli, but not my mother."
    # doc_b = "My mother spends a lot of time driving my brother around to baseball practice."
    # doc_c = "Some health experts suggest that driving may cause increased tension and blood pressure."
    # doc_e = "Health professionals say that brocolli is good for your health."
    # doc_set = [doc_a, doc_b, doc_c, doc_e]
    print db.find().count()
    doc_set = [i['abstract'] for i in db.find()]
    texts = []
    for i in doc_set:
        raw = i.lower()
        tokens = tokenizer.tokenize(raw)
        stopped_tokens = [i for i in tokens if not i in en_stop]
        stemmed_tokens = [p_stemmer.stem(i) for i in stopped_tokens]
        texts.append(stemmed_tokens)
    dictionary = corpora.Dictionary(texts)
    corpus = [dictionary.doc2bow(text) for text in texts]
    ldamodel = gensim.models.ldamodel.LdaModel(
        corpus, num_topics=4, id2word=dictionary, passes=20
    )
    print ldamodel.print_topics(10)
def getWordVector(inputString):
    tokenizer = RegexpTokenizer(r'\w+\'?\w+')
    # default English stop words list
    en_stop = get_stop_words('en')

    # Create p_stemmer of class PorterStemmer
    # It is considered to be the best for finding word roots
    p_stemmer = PorterStemmer()

    raw = inputString.lower()
    tokens = tokenizer.tokenize(raw)

    # remove stop words from tokens
    stopped_tokens = [i for i in tokens if not i in en_stop]

    # now POS words which are nouns, adjectives, adverbs and verbs
    pos_tagged = nltk.pos_tag(stopped_tokens)

    # stem tokens
    # p_stemmer.stem(i[0]) and other additions in if condition - or i[1][0] == 'R' or i[1][0] == 'V'
    stemmed_tokens = [i[0] for i in pos_tagged if i[1][0] == 'N']  # or i[1][0] == 'J']

    return stemmed_tokens
def cal_idf_overlap():
    list_subj = utils.list_subject
    ls_distance_final = []
    ls_distance_row = []
    # print len(list_att)
    stop_words = get_stop_words('en')

    tmp_corpus = []
    for i in range(len(list_subj)):
        item = str(list_subj[i]).split(" ")
        for token in item:
            if token in stop_words:
                pass
            else:
                tmp_corpus.append(token)
    # print "corpus", corpus

    length = len(list_subj)
    for i in range(0, length):
        if i == 500 or i == 1000 or i == 1500:
            print i
        for j in range(0, length):
            print i, j
            idf_instance = IDF.IDF(str(list_subj[i]), str(list_subj[j]), tmp_corpus)
            distance = idf_instance.cal_overlap()
            ls_distance_row.append(distance)
        ls_distance_final.append(ls_distance_row)
        ls_distance_row = []

    myarray = np.asarray(ls_distance_final)
    print myarray

    Z = linkage(myarray, "ward")

    thefile = open('/Users/Aaron/test.txt', 'w')
    for item in Z:
        thefile.write("%s\n" % item)

    plt.figure(figsize=(25, 10))
    plt.title('Hierarchical Clustering Dendrogram')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
        Z,
        leaf_rotation=90.,  # rotates the x axis labels
        leaf_font_size=8.,  # font size for the x axis labels
    )
    plt.show()

    plt.title('Hierarchical Clustering Dendrogram (truncated)')
    plt.xlabel('sample index')
    plt.ylabel('distance')
    dendrogram(
        Z,
        truncate_mode='lastp',  # show only the last p merged clusters
        p=30,                   # show only the last p merged clusters
        show_leaf_counts=True,  # otherwise numbers in brackets are counts
        leaf_rotation=90.,
        leaf_font_size=12.,
        show_contracted=True,   # to get a distribution impression in truncated branches
    )
    plt.show()
def remove_stopwords_from_individual_text(tokens):
    '''
    Given a list of tokens, returns a list of strings without any stopwords.
    '''
    en_stop_words = stop_words.get_stop_words('en')
    return filter(lambda w: w not in en_stop_words, tokens)
def __init__(self):
    self.__stemmer = stem.PorterStemmer()
    s = stop_words.get_stop_words("en")
    self.__stopwords = []
    for word in s:
        if word.isalpha():
            self.__stopwords.append(self.__stemmer.stem(word.lower(), 0, len(word) - 1))
    self.__stopwords = set(self.__stopwords)
def __init__(self, dictionary, is_dev=False, model=''):
    assert model != '', "model can't be empty"
    self.en_stop = get_stop_words('en')
    self.dictionary_words = dictionary
    self.is_dev = is_dev

    filename_output = 'output_classify_' + model + '.txt'
    print 'opening file for output', filename_output
    self.file_output = open(filename_output, 'w')
    self.filename_output = filename_output
def stopWord():
    '''
    These words do not indicate any sentiment and can be removed:
    - Repeating letters, e.g. hungrryyy for hungry
    - Punctuation
    '''
    stopWords = get_stop_words('en')
    stopWords.append('at_user')
    stopWords.append('url')
    return stopWords
def remove_stop_words(frequency_list):
    stop_words = get_stop_words('en')
    temp_list = []
    for key, value in frequency_list:
        if key not in stop_words:
            temp_list.append([key, value])
    return temp_list
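# Hypothetical usage sketch (the word/count pairs are made up): only pairs whose
# word is not an English stop word survive.
freqs = [("the", 120), ("data", 42), ("and", 80), ("model", 17)]
print(remove_stop_words(freqs))  # expected: [['data', 42], ['model', 17]]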
def load_stoplist():
    try:
        return set(get_stop_words("en") + STOP_LIST)
    except:
        print((format_exc()))
        return set()
def runFitting(params, objects):
    TASK = 'binary'
    # TASK = 'multi'

    '''
    Preparing data
    '''

    featureList = []
    if params["sentenceComplexityCheck"]:
        featureList.append("posTag")
    if params["embeddingsTermFreqFiltering"]:
        objects["freqFilter"].embeddingsEnabled = True
    if params["oneHotTermFreqFiltering"]:
        objects["freqFilter"].oneHotEnabled = True
    objects["liguisticFeatureExtractor"].setFeatureList(featureList)

    # print('Reading in offenseval training data...')
    if TASK == 'binary':
        IDsTrain, Xtrain, Ytrain = helperFunctions.read_corpus(offenseval_train)
    else:
        IDsTrain, Xtrain, Ytrain = helperFunctions.read_corpus(offenseval_train, binary=False)

    Xtrain = helperFunctions.clean_samples(Xtrain)
    print("train data read")

    '''
    Preparing vectorizer and classifier
    '''

    # Vectorizing data / Extracting features
    # print('Preparing tools (vectorizer, classifier) ...')
    if params["tweetTokenization"]:
        count_word = transformers.CountVectorizer(
            ngram_range=(1, 2),
            stop_words=stop_words.get_stop_words('en'),
            tokenizer=TweetTokenizer().tokenize)
    else:
        count_word = transformers.CountVectorizer(
            ngram_range=(1, 2),
            stop_words=stop_words.get_stop_words('en'))
    count_char = transformers.CountVectorizer(analyzer='char', ngram_range=(3, 7))

    embedder = features.Embeddings(objects["embeddings"], pool='max')

    vectorizer = FeatureUnion([('word', count_word),
                               ('char', count_char),
                               ('word_embeds', embedder)])

    if len(featureList) > 0:
        vectorizer.transformer_list.append(
            ('lingFeats', objects["liguisticFeatureExtractor"]))

    if params["oneHotTermFreqFiltering"] or params["embeddingsTermFreqFiltering"]:
        vectorizer.transformer_list.append(
            ('freqFilter', objects["freqFilter"]))

    if params["charNgramFreqFiltering"]:
        objects["charFreqFilter"].oneHotEnabled = True
        objects["charFreqFilter"].embeddingsEnabled = False
        vectorizer.transformer_list.append(
            ('charfreqFilter', objects["charFreqFilter"]))

    if params["POStagCheck"]:
        vectorizer.transformer_list.append(
            ('posTagger', transformers.posTagExtractor(Xtrain, Ytrain)))

    # Set up SVM classifier with unbalanced class weights
    """
    if TASK == 'binary':
        # cl_weights_binary = None
        cl_weights_binary = {'OTHER': 1, 'OFFENSE': 10}
        clf = LinearSVC(class_weight=cl_weights_binary)
    else:
        # cl_weights_multi = None
        cl_weights_multi = {'OTHER': 0.5, 'ABUSE': 3, 'INSULT': 3, 'PROFANITY': 4}
        clf = LinearSVC(class_weight=cl_weights_multi)
    """
    clf = LinearSVC()

    # scaler = StandardScaler(with_mean=False)
    classifier = Pipeline([
        ('vectorize', vectorizer),
        # ('scale', scaler),
        ('classify', clf)
    ])

    '''
    Actual training and predicting:
    '''

    ### predicting on set aside training data
    # print('Predicting on set aside data...')
    # Yguess = classifier.predict(XcustomTest)
    # result = cross_validate(classifier, Xtrain, Ytrain, cv=3)
    # print(result)
    ########

    print('Fitting on training data...')
    classifier.fit(Xtrain, Ytrain)
    # print('accuracy on set aside')
    # print(classifier.score(Xtest_raw, Y_test))
    # exit()

    # print('Predicting...')
    # Yguess = classifier.predict(Xtest)

    """
    '''
    Outputting in format required
    '''

    print('Outputting predictions...')

    outdir = '/Users/balinthompot/RUG/Honours/HateSpeech/offenseval-rug-master/Submission'
    fname = 'rug_fine_2.txt'

    with open(outdir + '/' + fname, 'w', encoding='utf-8') as fo:
        assert len(Yguess) == len(Xtest_raw), 'Unequal length between samples and predictions!'
        for idx in range(len(Yguess)):
            # print(Xtest_raw[idx] + '\t' + Yguess[idx] + '\t' + 'XXX', file=fo)  # binary task (coarse)
            print(Xtest_raw[idx] + '\t' + 'XXX' + '\t' + Yguess[idx], file=fo)  # multi task (fine)

    print('Done.')
    """

    return classifier
def tokenize(self, text: Text, attribute: Text = TEXT_ATTRIBUTE) -> List[Token]:
    stop_words = get_stop_words('en')

    '''if not self.case_sensitive:
        text = text.lower()

    if attribute != INTENT_ATTRIBUTE:
        # remove 'not a word character' if
        words = re.sub(
            # there is a space or an end of a string after it
            r"[^\w#@&]+(?=\s|$)|"
            # there is a space or beginning of a string before it
            # not followed by a number
            r"(\s|^)[^\w#@&]+(?=[^0-9\s])|"
            # not in between numbers and not . or @ or & or - or #
            # e.g. 10'000.00 or [email protected]
            # and not url characters
            r"(?<=[^0-9\s])[^\w._~:/?#\[\]()@!$&*+,;=-]+(?=[^0-9\s])",
            " ",
            text,
        ).split()

        # if we removed everything like smiles `:)`, use the whole text as 1 token
        if not words:
            words = [text]
    else:
        words = (
            text.split(self.intent_split_symbol)
            if self.intent_tokenization_flag
            else [text]
        )

    running_offset = 0
    tokens = []

    for word in words:
        word_offset = text.index(word, running_offset)
        word_len = len(word)
        running_offset = word_offset + word_len
        tokens.append(Token(word, word_offset))

    self.add_cls_token(tokens, attribute)

    return tokens
    '''

    # text = message.get(attribute)
    if not self.case_sensitive:
        text = text.lower()

    s = re.sub(r'[\W]', ' ', text)  # remove punctuations
    words = s.split()               # split into tokens
    for x in list(words):
        if x in stop_words:
            words.remove(x)         # remove stop words

    # if we removed everything like smiles `:)`, use the whole text as 1 token
    if not words:
        words = [text]
    else:
        words = (text.split(self.intent_split_symbol)
                 if self.intent_tokenization_flag
                 else [text])

    running_offset = 0
    tokens = []

    for word in words:
        word_offset = text.index(word, running_offset)
        word_len = len(word)
        running_offset = word_offset + word_len
        tokens.append(Token(word, word_offset))

    self.add_cls_token(tokens, attribute)

    return tokens
from sktools.matrix_denser import MatrixDenser
import random
from sklearn.preprocessing import MinMaxScaler
from utils.rank import GaussRankScaler
import torch
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks.early_stopping import EarlyStopping
from torch import nn

random.seed(42)
np.random.seed(42)

pd.set_option('display.max_rows', 500)

stop_words = get_stop_words('catalan')

submit = False

# %%
full_train = pd.read_csv("data/train.csv").rename(columns={
    "TÍTOL": "title",
    "Codi QdC": "code"
})

full_test = pd.read_csv("data/test.csv").rename(columns={
    "TÍTOL": "title",
    "Codi QdC": "code"
})

categories = pd.read_csv("data/categories.csv").drop(
    columns="Unnamed: 0").rename(columns={
        "Codi QdC": "code",
        "Títol de entrada del catàleg": "target_title"
def load_stop_words():
    # x = stopwords.words("english")
    x = get_stop_words("en")
    return [s.encode('ascii') for s in x] + list(string.printable)
#from sqlalchemy import create_engine
#import psycopg2
#import pandas as pd
#from scipy import spatial
import numpy as np
import re
import nltk
from nltk.tokenize import RegexpTokenizer
tokenizer = RegexpTokenizer(r'\w+')
from stop_words import get_stop_words
english_stop_words = get_stop_words('en')
from nltk.stem.porter import PorterStemmer
p_stemmer = PorterStemmer()
import gensim
from gensim import corpora, models


def clean(comment):
    comment = comment.lower()
    # Strip all HTML
    comment = re.sub(r'<[^<>]+>', ' ', comment)
    # Handle Numbers
    comment = re.sub(r'[0-9]+', '', comment)
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from stop_words import get_stop_words
from nltk.corpus import stopwords
import time

# Twitter API keys
consumer_key = 'wAiOgsZu8811j2Ac3mnyquwiT'
consumer_secret = 'kOT4i7K8OoQnNZoDuYEHrsg5DmAW0TnpVyRWPVWWgr4NiYQmm0'
access_token = '26922451-bGQYatrkw4zQZgl5qwIwO8nQXtIln0ZbScSmp1Rqv'
access_secret = '1ZvfJFdBNSOBqmDmriOXqURGsO5Yudj4s8597LCqe9Wo5'

# Collect the stop words in Spanish and English
# (stop words include prepositions and articles)
stop_words = list(get_stop_words('es'))        # Have around 900 stopwords
nltk_words = list(stopwords.words('english'))  # Have around 150 stopwords
stop_words.extend(nltk_words)


# Method to clean tweets by removing special characters
# such as accents, ñ and punctuation
def clean(tweet):
    output = []
    tw = tweet.split(' ')
    for palabra in tw:
        if not palabra in stop_words:
            if "http" not in palabra:
                output.append(palabra)
    pal = ' '.join(output)
    mapping = {'á': 'a', 'é': 'e', 'í': 'i', 'ó': 'o', 'ú': 'u',
               'ñ': 'n', 'ñ': 'n', '#': '', 'Á': 'A', 'É': 'E', 'Í': 'I',
from porter_stemmer import PorterStemmer
import re
import string
from stop_words import get_stop_words

stop = get_stop_words('english')


class Tokenizer(object):

    def __init__(self):
        self.stemmer = PorterStemmer()

    # only admit non-number with length>2
    def qualify(self, word):
        return len(word) > 2 and not word.isdigit()

    def process_desc(self, desc):
        ndesc = []
        for word in desc.split():
            # lowercase all characters
            word = word.lower()
            # replace words with hashtags with just the words
            if word[0] == "#":
                word = word[1:]
            # replace words with @ with "AT_USER"
            elif word[0] == "@":
                word = "AT_USER"
            # replace words with url beginnings with "URL"
from nltk.corpus import stopwords
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
import os
import datetime
import time
import enchant

tokenizer = RegexpTokenizer(r'\w+')  # r'[a-zA-Z]+'
tknzr = TweetTokenizer(strip_handles=True, reduce_len=True)

# create English stop words list
en_stop = get_stop_words('en')
stop_word = stopwords.words('english')

# Create p_stemmer of class PorterStemmer
p_stemmer = PorterStemmer()
d = enchant.Dict("en_US")

# set1 = ['sexi', 'ass', 'bitch', 'oral', 'anal', 'gay', 'hot', 'fantasi', 'stalk', 'nasti', 'f**k', 'boob', 'dick',
#         'nake', 'suck', 'lip', 'size', 'lick', 'tongu', 'turn', 'booti', 'thigh', 'bra', 'bed', 'horni', 'seduct',
#         'ball', 'hoe', 'virgin', 'lesbian', 'bite', 'butt', 'straight', 'leg', 'beast', 'fluid', 'chocolati',
#         'syrup', 'v****a', 'threesom', 'belli', 'homosexu']
# set2 = ['crush', 'date', 'dreams', 'friend', 'miss', 'babe', 'sweeti', 'candi', 'look', 'pie', 'appeal',
#         'crave', 'propos', 'hit', 'cheek', 'feel', 'romanc', 'poetri', 'hang', 'desir', 'pleasur', 'bomb',
#         'cute', 'eye', 'hug', 'chick', 'marri', 'love', 'babi', 'exchang', 'coffe', 'video']
import argparse

import numpy as np
import pandas as pd
from tqdm import tqdm

# Custom library
from lib.textometry import *
from lib.helpers import *
from lib.constant import *
from lib.cooc import *
from lib.utils import partofspeech

from stop_words import get_stop_words

fr_stop = get_stop_words("french")

# Pandas shoots warnings non-stop ... so hush!
import warnings
warnings.simplefilter(action='ignore', category=pd.errors.DtypeWarning)

code_questions = {
    1: "transition_eco",
    2: "democratie_et_citoy",
    3: "fiscalite_et_depense_publique",
    4: "organisation_de_etat_et_service_pub"
}

parser = argparse.ArgumentParser()
parser.add_argument("data_fn")
def set_stopwords(self, stopwords):
    self.stopwords = get_stop_words(self.DEFAULT_LANGUAGE)
    if self.language:
        self.stopwords.extend(get_stop_words(self.language))
    if stopwords:
        self.stopwords.extend([word.lower() for word in stopwords])
def without_stop_words(raw):
    if not raw:
        return ''
    stop_words = get_stop_words('en')
    # filter whole words rather than characters, and rejoin with spaces
    return ' '.join([w for w in raw.split() if w not in stop_words])
time : 2018-7-20
"""
import os
import sys
import jieba

reload(sys)
sys.setdefaultencoding('utf8')

train_data_url = './data/FudanTrainData/'
segment_data_url = './data/WordSegment/'

from stop_words import get_stop_words

STOP_WORDS_SET = get_stop_words()


def get_all_file_by_path(path=train_data_url):
    """Get all training files under a directory."""
    file_path = []
    dir_list = os.listdir(train_data_url)
    for d in dir_list:
        file_path.extend(
            map(lambda x: train_data_url + d + '/' + x,
                os.listdir(train_data_url + d)))
    return file_path


def read_file_sentence(
        file_path='./data/FudanTrainData/C3-Art/C3-Art0002.txt'):
import requests
import textwrap
import gensim
import config
import emoji
from pandas.io.json import json_normalize
from string import Template
from gensim import corpora
from pprint import pprint
from tqdm import tqdm

extra_chars = ['-', ',', '.', '!', '?', '(', ')', '[', ']', '\n']
morph = pymorphy2.MorphAnalyzer()
stop_words = get_stop_words('ru')


def get_wall(owner_id: str = '',
             domain: str = '',
             offset: int = 0,
             count: int = 10,
             filter: str = 'owner',
             extended: int = 0,
             fields: str = '',
             v: str = '5.103') -> pd.DataFrame:
    """
    Returns the list of posts from a user's or community's wall.
    @see: https://vk.com/dev/wall.get

    :param owner_id: ID of the user or community whose wall posts should be retrieved.
    :param domain: Short address of the user or community.
import pandas as pd
import numpy as np
from stop_words import get_stop_words
import matplotlib.pyplot as plt
import matplotlib as mpl
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
from PIL import Image

stop_words = get_stop_words('en')
STOPWORDS = [
    're', 've', 'people', 'said', 'president', 'thing', 'united states', 'way'
] + stop_words

df_pre = pd.read_csv('data/debate.csv', encoding="cp1252")
df = df_pre[['speaker', 'speech']]

text = df[df['speaker'] == 'Joe Biden']['speech'].tolist()
#print(text.head())
text = ' '.join(text).lower()
#print(text)

wordcloud = WordCloud(stopwords=STOPWORDS, collocations=True).generate(text)
plt.imshow(wordcloud, interpolation='bilinear')
plt.axis('off')
def load_stopwords(language):
    return [t for w in get_stop_words(language) for t in slugify(w).split("-")]
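# Hypothetical usage sketch (the language choice is an assumption): slugify()
# ASCII-folds each stop word and split("-") breaks apostrophe- or space-joined
# entries into separate tokens, e.g. slugify("c'est") == "c-est" -> ["c", "est"].
tokens = load_stopwords("fr")
print(tokens[:10])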
# coding=utf-8
import config
import gensim
import pymorphy2
import pyLDAvis
import pyLDAvis.gensim
import requests
import re

from stop_words import get_stop_words
from string import punctuation

stop_words = get_stop_words('russian')


def get_wall(
        owner_id: str = '',
        domain: str = '',
        offset: int = 0,
        count: int = 10,
        filter: str = 'owner',
        extended: int = 0,
        fields: str = '',
        v: str = '5.103'
):
    """
    Returns the list of posts from a user's or community's wall.
    @see: https://vk.com/dev/wall.get
for i in mach_files:
    with open(i) as f:
        mach.append(f.readlines())


def process_string(s, process_stopwords, stopwords_list=[]):
    s = re.sub(r'[^\w\s]', '', s)
    s = s.lower()
    s = word_tokenize(s)
    s = [pt.stem(i) for i in s]
    if not process_stopwords:
        s = [i for i in s if i not in stopwords_list]
    return s


## processing stopwords
stopwords = " ".join(get_stop_words('en'))
stopwords = process_string(stopwords, True)

## a list of lists (each element a processed section of the text)
processed_mach = [process_string("".join(i), False, stopwords) for i in mach]

## Making corpus level list: unlisting the lists of lists
corpus_words = sum(processed_mach, [])

## Making list of 500 most used terms
common_words = dict(Counter(corpus_words).most_common(500))

with open("doc_term_mat.csv", 'ab') as f:
    ## extracting only words (not counts)
    words = common_words.keys()
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime
from stop_words import get_stop_words
from scipy.stats import zscore
import math

## Apostrophes index
ap_idx = [
    264, 418, 635, 667, 690, 851, 947, 963, 968, 980, 1053,
    1118, 1304, 1358, 1406, 1546, 1558, 1596, 1667, 1776, 1784, 1813
]
ap_idx = list(np.asarray(ap_idx) - 1)

# STOPWORDS
itstops = get_stop_words('it')
itstops.append("c'è")
itstops.append("c'era")
itstops.append("c'erano")
itstops.append("l'")
itstops.append("'")
itstops.append("dell'")
itstops.append("nell'")
itstops.append("un'")
itstops.append("quell'")
itstops.append("po'")

print('STOPWORDS LOADED!')

## INPUT DIRECTORY
input_dir = 'output'
def parse_group(group):
    group_id = '-' + group
    offset = 0
    all_posts = []
    r = requests.get(
        'https://api.vk.com/method/wall.get',
        params={
            'owner_id': group_id,
            'offset': offset,
            'count': 10,
            'access_token': 'd933e827d933e827d933e82762d95bd7acdd933d933e827857a5be3f0d490a5fdc7bfbe',
            'v': '5.95'
        })
    posts = r.json()['response']['items']
    all_posts.extend(posts)

    data_posts = []
    likes_response = []
    all_likes = []
    for p in all_posts:
        data_posts.append(get_data(p))
        r = requests.get(
            'https://api.vk.com/method/likes.getList',
            params={
                'owner_id': group_id,
                'offset': offset,
                'type': 'post',
                'item_id': p['id'],
                'filter': 'likes',
                'friends_only': 0,
                'extended': 1,
                'count': p['likes']['count'],
                'access_token': 'd933e827d933e827d933e82762d95bd7acdd933d933e827857a5be3f0d490a5fdc7bfbe',
                'v': '5.95'
            })
        likes_response.extend(r.json()['response']['items'])

    for like_response in likes_response:
        like = Like(group_id, like_response['id'], like_response['type'],
                    like_response['first_name'], like_response['last_name'])
        all_likes.append(like)

    write_likes_json(all_likes, group_id)
    write_posts_json(data_posts, group_id)

    my_stop_words = get_stop_words('ru')
    vectorizer = TfidfVectorizer(ngram_range=(1, 1), stop_words=my_stop_words)
    X = vectorizer.fit_transform([data_post.text for data_post in data_posts])
    idf = vectorizer.idf_

    #***************
    cv = CountVectorizer(max_df=0.85, stop_words=my_stop_words, max_features=10000)
    word_count_vector = cv.fit_transform(
        [data_post.text for data_post in data_posts])

    tfidf_transformer = TfidfTransformer(smooth_idf=True, use_idf=True)
    tfidf_transformer.fit(word_count_vector)

    feature_names = cv.get_feature_names()

    # all keywords
    keywords = []
    morph = pymorphy2.MorphAnalyzer()

    # generate tf-idf for the given document
    for data_post in data_posts:
        tf_idf_vector = tfidf_transformer.transform(
            cv.transform([data_post.text]))

        # sort the tf-idf vectors by descending order of scores
        sorted_items = sort_coo(tf_idf_vector.tocoo())

        # extract only the top n; n here is 1
        results = extract_topn_from_vector(feature_names, sorted_items, 1)

        result = ''
        if results:
            result = next(iter(results))
        if result != '' and not result.isdigit():
            result = morph.parse(result)[0].normal_form
            if len(result) > 2:
                keyword = KeyWord(data_post.id, result, 1)
                keywords.append(keyword)

    return data_posts, keywords
from stop_words import get_stop_words
import nltk
import pprint

client = MongoClient()
client = MongoClient('localhost', 27017)
db = client['test']
customer = db.customer
qndict = customer.find_one()
qndictOld = qndict.copy()


### Parser Function ###
stop_words = set(get_stop_words('english'))


def myKeyWordParser(myString):
    myStringLowered = myString.lower()
    word_tokens = nltk.tokenize.RegexpTokenizer(
        '\\b\\w*[a-zA-Z]\\w+\\b').tokenize(myStringLowered)
    filtered_sentence = []
    for w in word_tokens:
        if w not in stop_words:
            filtered_sentence.append(w)
    return filtered_sentence
### End of Parser Function ####
def __init__(self, czech=None):
    """
    Constructor initialises the range of parameters of each tested model for the pipeline.
    :param czech: czech stop words
    """
    if not czech:
        czech = nltk.word_tokenize(' '.join(get_stop_words('cz')))

    self._classifiers = [SVC(),
                         NuSVC(),
                         RandomForestClassifier(),
                         LogisticRegression(),
                         # MLPClassifier(),
                         MultinomialNB(),
                         ]

    self.parameters = [
        # SVC
        {
            'vect__max_df': (0.5, 0.75, 1.0),
            'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
            'vect__norm': ('l1', 'l2', None),
            'vect__stop_words': (czech, None),
            'cls__C': (0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000),
            'cls__gamma': (0.5, 0.1, 0.05, 0.01, 0.005, 0.001, 0.0005, 0.0001),
            'cls__kernel': ('linear', 'rbf', 'poly', 'sigmoid')
        },
        # NuSVC
        {
            'vect__max_df': (0.5, 0.75, 1.0),
            'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
            'vect__norm': ('l1', 'l2', None),
            'vect__stop_words': (czech, None),
            'cls__nu': (0.35, 0.4, 0.45, 0.5, 0.55, 0.6, 0.65),
            'cls__kernel': ('linear', 'rbf', 'poly', 'sigmoid')
        },
        # Random Forest
        {
            'vect__max_df': (0.5, 0.75, 1.0),
            'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
            'vect__norm': ('l1', 'l2', None),
            'vect__stop_words': (czech, None),
            'cls__max_depth': (None, 10, 20, 30, 40, 50, 60, 70, 80, 90),
            'cls__max_feat': (10, 20, 30, 40, 50, 'sqrt', None),
        },
        # Logistic regression
        {
            'vect__max_df': (0.5, 0.75, 1.0),
            'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
            'vect__norm': ('l1', 'l2', None),
            'vect__stop_words': (czech, None),
            'cls__C': (0.0001, 0.001, 0.01, 0.1, 1, 10, 100, 1000),
            'cls__class_weight': ('balanced', None),
            'cls__penalty': ('l1', 'l2')
        },
        # Naive Bayes
        {
            'vect__max_df': (0.5, 0.75, 1.0),
            'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
            'vect__norm': ('l1', 'l2', None),
            'vect__stop_words': (czech, None),
            'cls__alpha': (0.0001, 0.0005, 0.001, 0.005, 0.01, 0.05, 0.1, 0.5),
            'cls__fit_prior': (True, False)
        },
        # Maximum Entropy
        {
            'vect__max_df': (0.5, 0.75, 1.0),
            'vect__ngram_range': ((1, 1), (1, 2), (1, 3)),
            'vect__norm': ('l1', 'l2', None),
            'vect__stop_words': (czech, None),
            'cls__method': ('gis', 'iis', 'megam', 'tadm')
        }
    ]

    self.pipeline_data = zip(self._classifiers, self.parameters)
def stop_words():
    """Retrieve the stop words for vectorization
    - Feel free to modify this function
    """
    return get_stop_words('es') + get_stop_words('ca') + get_stop_words('en')
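# Hypothetical usage sketch: passing the combined Spanish/Catalan/English list to a
# scikit-learn vectorizer (CountVectorizer and the sample sentences are assumptions,
# not part of the original code).
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer(stop_words=stop_words())
X = vectorizer.fit_transform(["la casa es gran", "the house is big"])
print(vectorizer.get_feature_names_out())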
from PIL import Image
from wordcloud import WordCloud
# import wordcloud
# from nltk.corpus import stopwords
from stop_words import get_stop_words
import random
import numpy as np
import matplotlib.pyplot as plt

with open("Goethe_Sammler.txt", "r") as f:
    text = f.read()

goethe_mask = np.array(Image.open('Goethe_Schattenriss.jpg'))

blacklist = get_stop_words('german')
blacklist = set(blacklist)
blacklist = blacklist.union(
    {'wäre', 'konnte', 'lassen', 'sagte', 'muß', 'Oheim', 'Julie', 'sei'})


def grey_color(word, font_size, position, orientation, random_state=None, **kwargs):
    return ("hsl(0, 0%%, %d%%)" % np.random.randint(10, 20))


wc = WordCloud(background_color='white',
off, over, under, again, further, then, once, here, there, when, where, why, how,
all, any, both, each, few, more, most, other, some, such, no, nor, not, only, own,
same, so, than, too, very, s, t, can, will, just, don, don't, should, should've,
now, d, ll, m, o, re, ve, y, ain, aren, aren't, couldn, couldn't, didn, didn't,
doesn, doesn't, hadn, hadn't, hasn, hasn't, haven, haven't, isn, isn't, ma, mightn,
mightn't, mustn, mustn't, needn, needn't, shan, shan't, shouldn, shouldn't, wasn,
wasn't, weren, weren't, won, won't, wouldn, wouldn't,
"""

# http://pypi.python.org/pypi/stop-words
# pip install stop-words
from stop_words import get_stop_words
pypi_stopwords = get_stop_words('en')
print(len(pypi_stopwords))  # 174

# self defined stopwords
stop_words = [
    'i', 'me', 'my', 'myself', 'we', 'our', 'ours', 'ourselves', 'you', 'your',
    'yours', 'yourself', 'yourselves', 'he', 'him', 'his', 'himself', 'she',
    'her', 'hers', 'herself', 'it', 'its', 'itself', 'they', 'them', 'their',
    'theirs', 'themselves', 'what', 'which', 'who', 'whom', 'this', 'that',
    'these', 'those', 'am', 'is', 'are', 'was', 'were', 'be', 'been', 'being',
    'have', 'has', 'had', 'having', 'do', 'does', 'did', 'doing', 'a', 'an',
    'the', 'and', 'but', 'if', 'or', 'because', 'as', 'until', 'while', 'of',
    'at', 'by', 'for', 'with', 'about', 'against', 'between', 'into', 'through',
    'during', 'before', 'after', 'above', 'below', 'to', 'from',
import string
from datetime import datetime

from nltk.stem.lancaster import LancasterStemmer
# from nltk.corpus import stopwords
from stop_words import get_stop_words

punctuation = list(string.punctuation) + ['rt', 'via']

arabic_stopwords = get_stop_words('arabic')
english_stopwords = get_stop_words('english')
german_stopwords = get_stop_words('german')


# Gets the tweet time.
def get_time(tweet):
    return datetime.strptime(tweet['created_at'], "%a %b %d %H:%M:%S +0000 %Y")


# Gets all hashtags.
def get_hashtags(tweet):
    return [tag['text'] for tag in tweet['entities']['hashtags']]


# Gets the screen names of any user mentions.
def get_user_mentions(tweet):
    return [m['screen_name'] for m in tweet['entities']['user_mentions']]


# Gets the text, sans links, hashtags, mentions, media, and symbols.
def get_text_cleaned(tweet):
    text = tweet['text']
    print()
    open('rmsle_%s.log' % name, 'a').write(str(lst[-1]) + '\n')


def filter_word(d, w, c='item_description'):
    return d[d[c].map(lambda x: w in x)]


# %% read data
df: pd.DataFrame = pd.read_csv('data/cleaned_train.csv')
df_test: pd.DataFrame = pd.read_csv('data/cleaned_test.csv')

# %% pre-processing
tokenizer = RegexpTokenizer(r'\w+')
slash_tokenizer = RegexpTokenizer(r'[^/]+')
en_stop: List[str] = get_stop_words('en')
stemmer = PorterStemmer()


def clean_text(t: str, tk=tokenizer) -> str:
    tokens = tk.tokenize(t.lower())
    tokens = [stemmer.stem(token) for token in tokens
              if token not in en_stop or len(token) >= 2 and not token.isnumeric()]
    return ' '.join(tokens)


def preprocess(d: pd.DataFrame, clean=False) -> pd.DataFrame:
    d: pd.DataFrame = d.copy()
    d.fillna('__nan__', inplace=True)
    if clean:
        print("Cleaning 'name'")
from stop_words import get_stop_words

with open('stop_words.txt', 'w') as f:
    stop_words = get_stop_words('uk')
    for word in range(len(stop_words)):
        stop_words[word] += '\n'
    f.writelines(stop_words)
def __init__(self, max_workers):
    logger.info("Number of workers: %s", max_workers)
    self.max_workers = max_workers
    self.tokenizer = RegexpTokenizer(r'\w+')
    self.en_stopwords = set(get_stop_words('en'))
    self.p_stemmer = PorterStemmer()
# input_folder = data_folder+"data_input/"
# output_folder = data_folder+"data_output/"
results_folder = data_folder + dataset + "_results/"

lang = 'en'  # 'de'; #'fr'
translator = Translator()
num_context_word = 5

senti_folder = "../senti_lexicon/"
senti_trans_folder = "../senti_lexicon_trans/"
senti_file_path = "senti_dict.json"

# nlp = spacy.load(lang+'_core_news_md')
stop_words = set(get_stop_words(lang))
xml_bilexicon_path = "../XML_translation/"
puncts = "—"

#************* List of functions


def convert_file_dicts(file_name):
    dict_toks = {}
    dict_NEs = {}
    dict_global_local_index = {}
    dict_lines_words = {}
    if file_name not in dict_toks:
        dict_toks[file_name] = []
        dict_NEs[file_name] = []
        dict_lines_words[file_name] = []
        dict_global_local_index[file_name] = {}