def get_stopwords(filename):
    stopwords = []
    f = open(filename, 'r')
    for line in f:
        line = line.strip(" \n\t")
        stopwords.append(line)
    f.close()
    return stopwords
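# Minimal usage sketch for the loader above; the file name and its
# contents are hypothetical, one stopword per line.
with open("stopwords.txt", "w") as f:
    f.write("the\nis\nat\n")

print(get_stopwords("stopwords.txt"))  # -> ['the', 'is', 'at']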
def getTermsAndStopList(self, str):  # note: parameter name shadows the built-in str
    stopwords = []
    words = []
    tokens = self.tokenizer.tokenize(str)
    for token in tokens:
        if not token.isdigit():
            if token not in self.stop:
                words.append(token)
            else:
                stopwords.append(token)
    terms = [self.stemmer.stem(word) for word in words]
    return terms, stopwords
def read(self, list, path):  # note: parameter name shadows the built-in list
    # read the file (forcing utf8) and output a list
    lemmer = WordNetLemmatizer()
    # count the words and save those with a frequency of 5 or less (v < 6),
    # so that the algorithm doesn't get distracted by irrelevant non-recurrent words
    data_file = open(path, 'rt', encoding='utf8', errors='replace')
    cn = Counter(word for l in data_file for word in l.split())
    words = dict((word, v) for word, v in cn.items() if v < 6)
    words_list = words.keys()
    data_file.close()
    # read the stopwords file and create a list with them
    stopwords_f = open('stopwords.txt', 'rt', encoding='utf8', errors='replace')
    stopwords = []
    for line in stopwords_f:
        stopwords.append(str(line.strip()))
    stopwords_f.close()
    data_file = open(path, 'rt', encoding='utf8', errors='replace')
    for line in data_file:
        new_line = ''
        # manually eliminate punctuation and grammar tokens, according to statistics
        # of the data: the least repeated and most repeated (irrelevant) words
        line = str(line).replace(",", " ")
        line = str(line).replace(' " ', ' ')
        line = str(line).replace(".", "")
        line = str(line).replace(" ?", "")
        line = str(line).replace(" : ", " ")
        line = str(line).replace(" ; ", " ")
        line = str(line).replace(" ( ", " ")
        line = str(line).replace(" ) ", " ")
        line = str(line).replace(". . .", " ")
        line = str(line).replace(" -- ", " ")
        # remove the least used words (frequency of 5 or less) from the string
        line = self.replaceMultiple(str(line), words_list, ' ')
        # remove the stopwords
        line = self.replaceMultiple(str(line), stopwords, ' ')
        # lemmatize the final words in the line
        for word in line.split(' '):
            new_line = new_line + ' ' + str(lemmer.lemmatize(word))
        # append the lemmatized line; the original appended the un-lemmatized
        # `line`, which silently discarded the lemmatization above
        list.append(new_line.strip())
    data_file.close()
    # output the list
    return list
def remove_stop_word(tweet):
    stopwords = []
    with open("english") as files:
        for line in files:
            values = line.split()
            word = values[0]
            stopwords.append(word)
    pattern = re.compile(r'\b(' + r'|'.join(stopwords) + r')\b\s*')
    tweet = pattern.sub('', tweet)
    return tweet
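# Minimal usage sketch; assumes the function above is in scope (with `re`
# imported) and creates the "english" stopword file it expects, one word per line.
with open("english", "w") as f:
    f.write("the\nis\nat\n")

print(remove_stop_word("the cat is at home"))  # -> 'cat home'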
def remove_stopwords(df):
    """
    Removes stopwords based on a known set of stopwords
    available in the nltk package. In addition, we include our
    made up word in here.
    """
    # Luckily nltk already has a set of stopwords that we can remove from the texts.
    stopwords = nltk.corpus.stopwords.words('english')
    # we'll add our own special word in here 'qwerty'
    stopwords.append(our_special_word)
    df['stopwords_removed'] = list(
        map(lambda doc: [word for word in doc if word not in stopwords],
            df['tokenized_text']))
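# Minimal usage sketch; assumes the function above is in scope, the NLTK
# stopword corpus is downloaded, and our_special_word is the 'qwerty'
# token mentioned in the docstring.
import nltk
import pandas as pd

nltk.download('stopwords', quiet=True)
our_special_word = 'qwerty'

df = pd.DataFrame({'tokenized_text': [['this', 'is', 'qwerty', 'news']]})
remove_stopwords(df)
print(df['stopwords_removed'][0])  # -> ['news']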
def get_stopwords():
    initial_stopwords = [
        "a", "about", "above", "across", "after", "afterwards", "again", "against",
        "all", "almost", "alone", "along", "already", "also", "although", "always",
        "am", "among", "amongst", "amoungst", "amount", "an", "and", "another",
        "any", "anyhow", "anyone", "anything", "anyway", "anywhere", "are", "around",
        "as", "at", "back", "be", "became", "because", "become", "becomes",
        "becoming", "been", "before", "beforehand", "behind", "being", "below",
        "beside", "besides", "between", "beyond", "bill", "both", "bottom", "but",
        "by", "call", "can", "cannot", "cant", "co", "con", "could", "couldnt",
        "cry", "de", "describe", "detail", "did", "do", "does", "done", "down",
        "due", "during", "each", "eg", "eight", "either", "eleven", "else",
        "elsewhere", "empty", "enough", "etc", "even", "ever", "every", "everyone",
        "everything", "everywhere", "except", "few", "fifteen", "fifty", "fill",
        "find", "fire", "first", "five", "for", "former", "formerly", "forty",
        "found", "four", "from", "front", "full", "further", "get", "give", "go",
        "had", "has", "hasnt", "have", "he", "hence", "her", "here", "hereafter",
        "hereby", "herein", "hereupon", "hers", "herself", "him", "himself", "his",
        "how", "however", "hundred", "i", "ie", "if", "in", "inc", "indeed",
        "interest", "into", "is", "it", "its", "itself", "keep", "last", "latter",
        "latterly", "least", "less", "ltd", "made", "many", "may", "me",
        "meanwhile", "might", "mill", "mine", "more", "moreover", "most", "mostly",
        "move", "much", "must", "my", "myself", "name", "namely", "neither",
        "never", "nevertheless", "next", "nine", "no", "nobody", "none", "noone",
        "nor", "not", "nothing", "now", "nowhere", "of", "off", "often", "on",
        "once", "one", "only", "onto", "or", "other", "others", "otherwise", "our",
        "ours", "ourselves", "out", "over", "own", "part", "per", "perhaps",
        "please", "put", "rather", "re", "same", "see", "seem", "seemed",
        "seeming", "seems", "serious", "several", "she", "should", "show", "side",
        "since", "sincere", "six", "sixty", "so", "some", "somehow", "someone",
        "something", "sometime", "sometimes", "somewhere", "still", "such",
        "system", "take", "ten", "than", "that", "the", "their", "them",
        "themselves", "then", "thence", "there", "thereafter", "thereby",
        "therefore", "therein", "thereupon", "these", "they", "thick", "thin",
        "third", "this", "those", "though", "three", "through", "throughout",
        "thru", "thus", "to", "together", "too", "top", "toward", "towards",
        "twelve", "twenty", "two", "un", "under", "until", "up", "upon", "us",
        "very", "via", "was", "we", "well", "were", "what", "whatever", "when",
        "whence", "whenever", "where", "whereafter", "whereas", "whereby",
        "wherein", "whereupon", "wherever", "whether", "which", "while", "whither",
        "who", "whoever", "whole", "whom", "whose", "why", "will", "with",
        "within", "without", "would", "yet", "you", "your", "yours", "yourself",
        "yourselves"
    ]
    stopwords = list()
    for s in initial_stopwords:
        s = normalize_string(s)
        stopwords.append(s)
    # return the normalized list; the original returned initial_stopwords,
    # which discarded the normalization above
    return stopwords
def load_en_stopwords(filename):
    '''Loads English stop-words from a given file

    Return: a list of stop words
    Arguments: the stop-words file name
    '''
    stopwords = []
    with codecs.open(filename, mode='r', encoding='utf-8') as fSW:
        for line in fSW:
            stopwords.append(line.strip().lower())
    return stopwords
def ques23(entiresentence, string, np, x, y):
    stopwords = []
    querywords = string.split()
    #print("querywords", querywords)
    grammar = "chunk:{<VB.?|MD|RP|RB.?>+<DT>?<RB.?>*<JJ.?>*<NN.?|PRP|PRP$|POS|VBG|DT|CD|VBN>+}"
    res, sentence = findChunk(string, grammar, "chunk")
    #print("res", res, "sentence", sentence)
    # for subtree in sentence.subtrees():
    #     if subtree.label() == 'chunk':
    #         print(subtree)
    if len(sentence) > 0:
        for words in sentence:
            stopwords.append(words[0])
        #print("sp", stopwords)
        stopwords.append(np)
        stopwords.append(x)
        stopwords.append(y)
        stopwords = set(stopwords)
        #print(stopwords)
        resultwords = [word for word in querywords if word not in stopwords]
        resultwords = ' '.join(str(e) for e in resultwords)
        #print("rs", resultwords)
        WhatQues = "What " + x + " " + np + " " + y + " " + resultwords + "?"
        #print(WhatQues)
        AnsQues[entiresentence].append(WhatQues)
def get_hindi_stopwords(filename="stop-words.txt"):
    """Get stopwords in Hindi.

    Args:
        filename: stop-words file name
    Returns:
        set of Hindi stop words
    """
    stopwords = []
    # open as UTF-8 text so each line is already a str; the original mixed
    # Python 2 bytes.decode("utf8") with Python 3 file handling
    with open(filename, "r", encoding="utf-8") as stop_words:
        for word in stop_words.readlines():
            stopwords.append(unidecode(word.strip()))
    return set(stopwords)
def preprocessing(tagged_by_Sentence):
    """
    1. remove special characters, lowercase
    2. not / "n't" -> "not_" + stem(next word)
    3. remove special characters and digits
    4. remove stopwords
    5. stemming
    """
    from nltk.corpus import stopwords
    stopwords = stopwords.words('english')
    stopwords.remove("not")
    stopwords.remove('very')
    stopwords.append("'m")
    stopwords.append("'s")
    re_special = re.compile('[^A-Za-z0-9]+')  # anything that is not a letter or digit
    re_num = re.compile('[0-9]+')  # digits
    st = PorterStemmer()
    new_sent = []
    not_indice = []
    for sent in tagged_by_Sentence:
        # 1. remove special characters, lowercase
        text = [(tup[0].lower(), tup[1]) for tup in sent if not bool(re_special.match(tup[0]))]
        # 2. merge not / n't with the following word: when not / n't appears,
        # join it with the next word and remember that word's index so that
        # del_element_by_indice can remove the duplicate later
        new_text = []
        for index, tup in enumerate(text):
            if tup[0] == "n't" or tup[0] == "not":
                if index + 1 < len(text):
                    if not bool(re_special.match(text[index + 1][0])) or text[index + 1][1] != 'CD':
                        new_text.append("not_" + st.stem(text[index + 1][0]))
                        not_indice.append(index)
                else:
                    new_text.append("not")
            else:
                if not bool(re_num.match(tup[0])) or tup[1] != 'CD':  # 3. remove special characters and digits
                    new_text.append(tup[0])
        new_text = del_element_by_indice(new_text, not_indice)
        # 4, 5. remove stopwords and stem
        new_words = [st.stem(word) for word in new_text if word not in stopwords]
        new_sent.append(new_words)
    return new_sent
def tokenize_headlines_with_sentiment(df):
    headlines = df.title.tolist()
    headlines_string = (' '.join(filter(None, headlines))).lower()
    tokens = word_tokenize(headlines_string)
    # Remove single letter tokens
    tokens_sans_singles = [i for i in tokens if len(i) > 1]
    # Remove stop words
    stopwords = nltk.corpus.stopwords.words('english')
    new_words = ("s'", "'s", "election", "2020", "n't", "wo", "...", "'")
    for i in new_words:
        stopwords.append(i)
    tokens_sans_stop = [t for t in tokens_sans_singles if t not in stopwords]
    tokens_sans_stop = [t.replace('wins', 'win') for t in tokens_sans_stop]
    # Get bigrams and frequencies
    bi_grams = list(ngrams(tokens_sans_stop, 2))
    counter = Counter(bi_grams)
    # Convert counter dictionary to dataframe
    counter_df = pd.DataFrame.from_dict(counter, orient='index').reset_index().rename(
        columns={"index": "bigram", 0: "freq"})
    counter_df_sort = counter_df.sort_values(by=['freq'], ignore_index=True, ascending=False)
    # Create concatenated bigram string for sentiment scoring
    # (unpacking a tuple column via `.str` relies on behaviour deprecated in newer pandas)
    counter_df_sort['word1'], counter_df_sort['word2'] = counter_df_sort.bigram.str
    counter_df_sort['bigram_joined'] = counter_df_sort.word1 + " " + counter_df_sort.word2
    counter_df_sort = counter_df_sort.drop(['word1', 'word2'], axis=1)
    # Get sentiment for bigrams
    analyzer = SentimentIntensityAnalyzer()
    bigrams_scores = counter_df_sort['bigram_joined'].apply(analyzer.polarity_scores).tolist()
    df_bigrams_scores = pd.DataFrame(bigrams_scores).drop(['neg', 'neu', 'pos'], axis=1).rename(
        columns={"compound": "sentiment_compound"})
    bigrams_freq_and_scores = counter_df_sort.join(df_bigrams_scores, rsuffix='_right')
    print(f"There are {len(bigrams_freq_and_scores)} extracted bigrams across all headlines")
    return bigrams_freq_and_scores
def stem_tokenize(str_use):
    """
    Takes a string and tokenizes it, stripping it of punctuation and stopwords.
    Returns a list of strings.
    """
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)
    stopwords.append('')
    addstopwords = ["in", "on", "of", "''"]
    # extend, not append: append would add the whole list as a single element
    stopwords.extend(addstopwords)
    stemmer = wordnet.WordNetLemmatizer()
    # PunktWordTokenizer was removed from NLTK; WordPunctTokenizer is the
    # closest drop-in replacement
    tokenizer = nltk.tokenize.WordPunctTokenizer()
    # removes stopwords and punctuation, then splits the string into a list of words
    token = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(str_use)
             if token.lower().strip(string.punctuation) not in stopwords]
    text = [word for word in token if re.search(r'[a-zA-Z]', word) is not None]
    stem = [stemmer.lemmatize(word) for word in text]
    # Returns a list of strings
    return stem
def count_words(text):
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.append('https')
    stopwords.append('http')
    stopwords.append('im')
    stopwords.append('# ')
    # RegEx for stopwords
    RE_stopwords = r'\b(?:{})\b'.format('|'.join(stopwords))
    # replace '|' --> ' ', drop all stopwords, then split into words
    return (text.str.lower()
                .replace([r'\|', RE_stopwords], [' ', ''], regex=True)
                .str.cat(sep=' ')
                .split())
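# Minimal usage sketch on a hypothetical pandas Series; assumes the function
# above is in scope and the NLTK stopword corpus is downloaded.
import nltk
import pandas as pd

nltk.download('stopwords', quiet=True)

tweets = pd.Series(["Check http example|link", "We are home now"])
print(count_words(tweets))  # word list with stopwords and '|' removed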
def displaywordcloud(data=None, backgroundcolor='#fff', width=1000, height=1000):
    stopwords.append(product_name)
    wordcloud = WordCloud(
        # original path read 'C:Windows/...', missing the slash after the drive letter
        font_path='C:/Windows/Fonts/NanumGothicCoding-Bold.ttf',
        mask=mask_png,
        stopwords=stopwords,
        collocations=False,
        max_font_size=160,
        colormap='tab10',
        background_color=backgroundcolor,
        width=width,
        height=height).generate(data)
    fig = plt.figure(figsize=(10, 10))
    plt.imshow(wordcloud, interpolation="bilinear", aspect='auto')
    plt.axis("off")
    # plt.show()
    fig.savefig('webservice/static/wordcloud.png')
def Cleaning(liste):
    import re
    import nltk
    nltk.download('stopwords')
    from nltk.corpus import stopwords
    from nltk.stem.porter import PorterStemmer
    ps = PorterStemmer()
    stopwords = list(set(stopwords.words('english')))
    # treat single letters as stopwords too
    Liste = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm',
             'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
    stopwords.extend(Liste)
    corpus = []
    for i in range(len(liste)):
        review = re.sub('[^a-zA-Z]', ' ', liste[i])
        review = review.lower()
        review = review.split()
        review = [ps.stem(w) for w in review if w not in stopwords]
        review = ' '.join(review)
        corpus.append(review)
    return corpus
def is_ci_stem_stopword_set_match(self, a, b, threshold=0.5):
    """Check if a and b match after case-folding, stopword removal, and stemming."""
    # Get default English stopwords and extend with punctuation
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.extend(string.punctuation)
    stopwords.append('')
    # Create tokenizer and stemmer
    # (PunktWordTokenizer was removed from NLTK; WordPunctTokenizer is the
    # closest drop-in replacement)
    tokenizer = nltk.tokenize.WordPunctTokenizer()
    stemmer = nltk.stem.snowball.SnowballStemmer('english')
    tokens_a = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(a)
                if token.lower().strip(string.punctuation) not in stopwords]
    tokens_b = [token.lower().strip(string.punctuation) for token in tokenizer.tokenize(b)
                if token.lower().strip(string.punctuation) not in stopwords]
    stems_a = [stemmer.stem(token) for token in tokens_a]
    stems_b = [stemmer.stem(token) for token in tokens_b]
    # Calculate Jaccard similarity
    ratio = len(set(stems_a).intersection(stems_b)) / float(
        len(set(stems_a).union(stems_b)))
    return ratio >= threshold
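# Standalone sketch of the same Jaccard test, outside the class; assumes the
# NLTK stopword corpus is available. Both sentences below reduce to the stem
# set {cat, run}, so the ratio is 1.0.
import string
import nltk

nltk.download('stopwords', quiet=True)
stop = set(nltk.corpus.stopwords.words('english')) | set(string.punctuation) | {''}
stemmer = nltk.stem.snowball.SnowballStemmer('english')
tok = nltk.tokenize.WordPunctTokenizer()

def stem_set(s):
    # lowercase, strip punctuation, drop stopwords, then stem
    return {stemmer.stem(t.lower().strip(string.punctuation))
            for t in tok.tokenize(s)
            if t.lower().strip(string.punctuation) not in stop}

a, b = stem_set("The cats are running"), stem_set("A cat runs")
print(len(a & b) / len(a | b))  # Jaccard ratio -> 1.0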
def nounphrase(tree):
    stopwords = []
    querywords = []
    grammar = "verb:{<VBG|VBN|VB.?|MD|RP>+}"
    res, verbtree = findChunkwithPOSTags(cstr, grammar, "verb")
    #print("res", res, "sentence", verbtree)
    if len(verbtree) > 0:
        for subtree in verbtree.subtrees():
            for words in subtree:
                stopwords.append(words[0])
        for subtree in tree.subtrees():
            for words in subtree:
                querywords.append(words[0])
        #print("querywords", querywords)
        resultwords = [word for word in querywords if word not in stopwords]
        resultwords = ' '.join(str(e) for e in resultwords)
        #print("rs", str(resultwords))
        return 1, resultwords
    else:
        return 0, ""
def update_stopwords(self, add_words=[], remove_words=[], update_corpus=True):
    # plain loops instead of list comprehensions used only for side effects
    stopwords = self.stopwords
    for x in add_words:
        stopwords.append(x)
    for x in remove_words:
        if x in stopwords:
            stopwords.remove(x)
    self._stopwords_ = stopwords
    if update_corpus:
        self.prepare_corpus()
def ques2_2(entiresentence, string, np, x, y):
    np = ""
    stopwords = []
    querywords = string.split()
    grammar = "chunk:{<IN>+<DT>?<RB.?>*<JJ.?>*<NN.?|PRP|PRP$|POS|VBG|DT|CD|VBN>+}"
    res, sentence = findChunk(string, grammar, "chunk")
    if res != 0:
        for words in sentence:
            stopwords.append(words[0])
        #print("sp", stopwords)
        stopwords.append(np)
        stopwords.append(x)
        # stopwords.append(y)
        #print(stopwords)
        # find the preposition
        preposition = ""
        for words in sentence:
            #print("words in sentence", words)
            if words[1] == 'IN':
                #print("prep found", words[0])
                preposition = words[0]
        #print("preposition is", preposition)
        resultwords = [word for word in querywords if word not in stopwords]
        resultwords = ' '.join(str(e) for e in resultwords)
        #print(resultwords)
        prepques = preposition + " what " + x + np + " " + resultwords + "?"
        #print(prepques)
        AnsQues[entiresentence].append(prepques)
def getStopWordList(stopWordFile):
    stopwords = []
    stopwords.append("AT_USER")
    stopwords.append("URL")
    with open(stopWordFile, 'r') as f:
        reader = csv.reader(f)
        for w in reader:
            stopwords.append(w[0])
    return stopwords
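# Hypothetical demo: a one-word-per-line file works because csv.reader
# yields each line as a list whose first field is the word.
import csv

with open("stop.csv", "w") as f:
    f.write("the\nis\n")

print(getStopWordList("stop.csv"))  # -> ['AT_USER', 'URL', 'the', 'is']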
print("Dataset Loaded...\n") # Tokenization and Stemming stemmer = SnowballStemmer('english') tokenizer = RegexpTokenizer(r'[a-zA-Z\']+') def tokenize(text): return [stemmer.stem(word) for word in tokenizer.tokenize(text.lower())] #Adding new stopwords stopwords = nltk.corpus.stopwords.words('english') with open('StopWords.txt', 'r') as file: for i in file: stopwords.append(i.strip()) # Feature extraction using TF-IDF tfidf = TfidfVectorizer(stop_words=stopwords, tokenizer=tokenize) X = tfidf.fit_transform(data) words = tfidf.get_feature_names() print(words) print("Number of features: " + str(len(words))) #check number of features print("Vectorization Completed...\n") ''' # Clustering using K-Means print("k-Means with 5 cluster:\n") kmeans_1 = KMeans(init='k-means++', n_clusters = 5, random_state = 42) kmeans_1.fit(X) common_words = kmeans_1.cluster_centers_.argsort()[:,-1:-21:-1] for num, centroid in enumerate(common_words):
import requests
import string
from nltk.tokenize import RegexpTokenizer  # needed for RegexpTokenizer below
from nltk.corpus import stopwords  # needed for stopwords.words below

# reads from the file defined in project specs on ecampus
url = "http://www.gutenberg.org/cache/epub/9845/pg9845.txt"
r = requests.get(url, allow_redirects=True)
# alternate:
# raw = "your string here" or read in a txt file

# tokenize to keep only words
tokenizer = RegexpTokenizer(r'\w+')
tokens = tokenizer.tokenize(r.text)

# get stopwords + extra for the era of the book at the url
stopwords = stopwords.words('english')
extrastop = ["thee", "thy", "ye", "thine", "thou"]
# extend, not append: append would add the whole list as a single element,
# so none of the extra words would ever be filtered
stopwords.extend(extrastop)
nostop = [w for w in tokens if not w in stopwords]

# remove repeats
norepeat = list(set(nostop))

# add all of the words + their initial location to a dictionary
dictionary = {}
for w in norepeat:
    dictionary[w.upper().encode('utf8')] = r.text.find(w)

# write to file
with open('tokens.txt', 'w') as file:
    file.write(str(dictionary))
def FeaturizeFile(df):
    # df = pd.read_csv(CSVfile, encoding='latin1')
    stats = []
    attribute_name = []
    sample = []
    id_value = []
    i = 0
    castability = []
    number_extraction = []
    avg_tokens = []
    ratio_dist_val = []
    ratio_nans = []
    keys = list(df.keys())
    attribute_name.extend(keys)
    summary_stat_result = summary_stats(df, keys)
    stats.extend(summary_stat_result)
    samples = get_sample(df, keys)
    sample.extend(samples)
    # castability.extend(castability_feature(df, keys))
    # number_extraction.extend(numeric_extraction(df, keys))
    # avg_tokens.extend(get_avg_tokens(samples))
    ratio_dist_val.extend(get_ratio_dist_val(summary_stat_result))
    ratio_nans.extend(get_ratio_nans(summary_stat_result))
    csv_names = [
        'Attribute_name', 'total_vals', 'num_nans', 'num_of_dist_val', 'mean',
        'std_dev', 'min_val', 'max_val', '%_dist_val', '%_nans', 'sample_1',
        'sample_2', 'sample_3', 'sample_4', 'sample_5'
    ]
    golden_data = pd.DataFrame(columns=csv_names)
    for i in range(len(attribute_name)):
        val_append = []
        val_append.append(attribute_name[i])
        val_append.extend(stats[i])
        val_append.append(ratio_dist_val[i])
        val_append.append(ratio_nans[i])
        val_append.extend(sample[i])
        # val_append.append(castability[i])
        # val_append.append(number_extraction[i])
        # val_append.append(avg_tokens[i])
        golden_data.loc[i] = val_append

    curdf = golden_data
    for row in curdf.itertuples():
        is_list = False
        curlst = [row[11], row[12], row[13], row[14], row[15]]  # the five sample values
        delim_cnt, url_cnt, email_cnt, date_cnt = 0, 0, 0, 0
        chars_totals, word_totals, stopwords, whitespaces, delims_count = [], [], [], [], []
        for value in curlst:
            word_totals.append(len(str(value).split(' ')))
            chars_totals.append(len(str(value)))
            whitespaces.append(str(value).count(' '))
            if del_reg.match(str(value)):
                delim_cnt += 1
            if url_reg.match(str(value)):
                url_cnt += 1
            if email_reg.match(str(value)):
                email_cnt += 1
            delims_count.append(len(delimeters.findall(str(value))))
            tokenized = word_tokenize(str(value))
            stopwords.append(len([w for w in tokenized if w in stop_words]))
            try:
                _ = pd.Timestamp(value)
                date_cnt += 1
            except ValueError:
                date_cnt += 0
        curdf.at[row.Index, 'has_delimiters'] = delim_cnt > 2
        curdf.at[row.Index, 'has_url'] = url_cnt > 2
        curdf.at[row.Index, 'has_email'] = email_cnt > 2
        curdf.at[row.Index, 'has_date'] = date_cnt > 2
        curdf.at[row.Index, 'mean_word_count'] = np.mean(word_totals)
        curdf.at[row.Index, 'std_dev_word_count'] = np.std(word_totals)
        curdf.at[row.Index, 'mean_stopword_total'] = np.mean(stopwords)
        curdf.at[row.Index, 'stdev_stopword_total'] = np.std(stopwords)
        curdf.at[row.Index, 'mean_char_count'] = np.mean(chars_totals)
        curdf.at[row.Index, 'stdev_char_count'] = np.std(chars_totals)
        curdf.at[row.Index, 'mean_whitespace_count'] = np.mean(whitespaces)
        curdf.at[row.Index, 'stdev_whitespace_count'] = np.std(whitespaces)
        # the original computed these from `whitespaces`; `delims_count`,
        # which was collected but never used, is almost certainly what was meant
        curdf.at[row.Index, 'mean_delim_count'] = np.mean(delims_count)
        curdf.at[row.Index, 'stdev_delim_count'] = np.std(delims_count)
        if curdf.at[row.Index, 'has_delimiters'] and curdf.at[row.Index, 'mean_char_count'] < 100:
            curdf.at[row.Index, 'is_list'] = True
        else:
            curdf.at[row.Index, 'is_list'] = False
        if curdf.at[row.Index, 'mean_word_count'] > 10:
            curdf.at[row.Index, 'is_long_sentence'] = True
        else:
            curdf.at[row.Index, 'is_long_sentence'] = False
    golden_data = curdf
    return golden_data
for word in text:
    if word in stopwords_set:
        continue
    if words.get(word) is None:
        words[word] = 1
    else:
        words[word] = words[word] + 1

word_lis = []
for word, no in words.items():
    word_lis.append([word, no])
word_lis = pd.DataFrame(word_lis, columns=["word", "freq"])
word_lis = word_lis.sort_values(by="freq", ascending=False)
# treat the 20 most frequent words as additional stopwords
for word in word_lis['word'][:20]:
    stopwords.append(word)
# print(stopwords)

# In[5]:

## Remove stop words
def s_wor_rm(text):
    words = []
    text = text.split()
    for word in text:
        if word in stopwords:
            continue
        words.append(word)
def http(text):
    # filter out URL tokens; the original test `'https' and 'http' in w`
    # reduces to `'http' in w`, which already covers both http and https
    stopwords = [w for w in text if 'http' in w]
    mynewtext = [w for w in text if w not in stopwords]
    return mynewtext
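# Quick check of the URL filter above on a pre-tokenized tweet.
tokens = ["check", "out", "https://example.com", "now"]
print(http(tokens))  # -> ['check', 'out', 'now']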
def remove_hashtag(sentence):
    base = tokinizer(sentence)
    # collect hashtag tokens, then drop them
    stopwords = [w for w in base if w.startswith('#')]
    mynewtext = [w for w in base if w not in stopwords]
    return WordList_to_sentence(mynewtext)
    dict[subject] = count
    return dict


if __name__ == "__main__":
    tokenizer = RegexpTokenizer(r'\w+')
    stemmer = SnowballStemmer("english")
    ## get stop words and normalize
    initial_stopwords = stopwords()
    stopwords = list()
    for s in initial_stopwords:
        s = normalize_string(s)
        stopwords.append(s)
    print("Loading index")
    mention_dict = load_index("../data/surface_forms_new.txt")
    subject_predicates_dict = []  # load_subject_predicates("data/SimpleQuestions_v2/freebase-FB2M.txt")
    subject_triple_counts = []  # load_subject_triple_counts("data/subject_triple_counts.txt")
    dataset_names = ["test"]
    max_ngram_size = 10
    exclude_small_ngrams = True
    exclude_stop_words = True
    for d in dataset_names:
        correct_count = 0
from ML_models import train_svm, train_knn, knn_accuracies, train_log_regression, \
    log_regression_stats, svm_stats, train_random_forest
from dataset_builder import add_sub_pol_to_dataset
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
from wordcloud import WordCloud
from nltk.corpus import stopwords

###################### Settings ######################
root_path = os.path.dirname(os.path.realpath(__file__))
ds_path = os.path.join(root_path, '..', 'common', 'dataset.csv')
clean_ds_path = os.path.join(root_path, '..', 'common', 'clean_dataset.csv')
stopwords = stopwords.words('french')
stopwords.append('a')
stopwords.append('e')
stopwords.append('Tre')
stopwords.append('cest')
amazon_img_path = 'https://i1.wp.com/www.joptimisemonsite.fr/wp-content/uploads/2015/02/logo-amazon.jpg?fit=810%2C295&ssl=1&is-pending-load=1'
#######################################################

###################### Utils ######################
@st.cache
def load_data():
    ds = pd.read_csv(ds_path)
    clean_ds = pd.read_csv(clean_ds_path)
    clean_ds = add_sub_pol_to_dataset(clean_ds)
    clean_ds = clean_ds.drop(clean_ds[clean_ds.Subjectivity > 1.0].index)
    return ds, clean_ds
def cluster_topics(json_data):
    import json
    import time
    import math
    import re
    import nltk
    import numpy as np
    import scipy as sp
    import scipy.sparse  # ensure the sparse submodule is loaded for sp.sparse.hstack
    from nltk.corpus import stopwords
    from nltk.stem.snowball import SnowballStemmer
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.metrics.pairwise import cosine_similarity
    from sklearn.cluster import KMeans
    from sklearn.preprocessing import normalize

    def tokenize_and_stem(text):
        # first tokenize by sentence, then by word, so punctuation is caught as its own token
        tokens = [word for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens:
            if re.search('[a-zA-Z]', token) and token not in stopwords:
                filtered_tokens.append(token)
        stems = [stemmer.stem(t) for t in filtered_tokens]
        return stems

    def tokenize_only(text):
        # first tokenize by sentence, then by word, so punctuation is caught as its own token
        tokens = [word.lower() for sent in nltk.sent_tokenize(text) for word in nltk.word_tokenize(sent)]
        filtered_tokens = []
        # filter out any tokens not containing letters (e.g., numeric tokens, raw punctuation)
        for token in tokens:
            if re.search("[a-zA-Z]", token) and token not in stopwords:
                filtered_tokens.append(token)
        return filtered_tokens

    # data = json.loads(open("room_messages.json").read())
    data = json.loads(open(json_data).read())
    stopwords = stopwords.words("english")
    stopwords.append('blah')
    stemmer = SnowballStemmer("english")

    # Extract messages
    messages = []
    time_stamps = []
    for i in range(len(data["items"])):
        if "text" in data["items"][i]:
            message = data["items"][i]["text"]
            time_stamp = data["items"][i]["created"]
            t = float(re.search("[0-9]*:[0-9]*", time_stamp).group(0).replace(":", "."))
            messages.append(message)
            time_stamps.append(t)

    # Extract stemmed and tokenized vocab
    totalvocab_stemmed = []
    totalvocab_tokenized = []
    for i in messages:
        allwords_stemmed = tokenize_and_stem(i)  # for each item in 'synopses', tokenize/stem
        totalvocab_stemmed.extend(allwords_stemmed)  # extend the 'totalvocab_stemmed' list
        allwords_tokenized = tokenize_only(i)
        totalvocab_tokenized.extend(allwords_tokenized)
    # vocab_frame = pd.DataFrame({'words': totalvocab_tokenized}, index=totalvocab_stemmed)
    # print('there are ' + str(vocab_frame.shape[0]) + ' items in vocab_frame')

    # Cluster messages according to topic using K-means
    # define vectorizer parameters
    if len(allwords_stemmed) > 200:
        tfidf_vectorizer = TfidfVectorizer(max_df=0.99, max_features=200000,
                                           min_df=0.01, stop_words='english',
                                           use_idf=True, tokenizer=tokenize_and_stem,
                                           ngram_range=(1, 3))
    else:
        tfidf_vectorizer = TfidfVectorizer(max_df=1.0, max_features=200000,
                                           min_df=0.0, stop_words='english',
                                           use_idf=True, tokenizer=tokenize_and_stem,
                                           ngram_range=(1, 3))

    # fit the vectorizer to messages
    # t1 = time.clock()
    tfidf_matrix = tfidf_vectorizer.fit_transform(messages)
    # t2 = time.clock()
    # print("Tf-Idf fit time: ", t2 - t1)
    # print(tfidf_matrix.shape)
    terms = tfidf_vectorizer.get_feature_names()

    # insert an additional time feature
    new_column = np.asarray(time_stamps).reshape(-1, 1)
    time_norm = normalize(new_column)
    # print(new_column.shape)
    final = sp.sparse.hstack((tfidf_matrix, new_column))
    # print(final.shape)
    terms.append('time')

    # calculate distance between messages using cosine similarity of tf-idf
    dist = 1 - cosine_similarity(final)

    # K-means clustering
    # num_clusters = 5
    if len(messages) > 10:
        num_clusters = math.floor(math.sqrt(len(messages)) / 2)
    else:
        num_clusters = 1
    km = KMeans(n_clusters=num_clusters)
    # time.clock() was removed in Python 3.8; perf_counter is the replacement
    t3 = time.perf_counter()
    km.fit(final)
    t4 = time.perf_counter()
    # print("K-means fit time: ", t4 - t3)
    clusters = km.labels_.tolist()

    topics = {}
    for t in range(num_clusters):
        t_name = "topic" + str(t)
        topics[t_name] = {}
        t_messages = []
        for i in range(len(messages)):
            if clusters[i] == t:
                t_messages.append(messages[i])
        topics[t_name]['messages'] = t_messages

    # Export as json
    # with open('topics.json', 'w') as outfile:
    #     json.dump(topics, outfile)

    # Inspect clusters
    # sorted_messages = {'message': messages, 'cluster': clusters}
    # frame = pd.DataFrame(sorted_messages, index=[clusters], columns=['message', 'cluster'])
    # print(frame['cluster'].value_counts())
    #
    # # top words per cluster
    # print("Top terms per cluster:")
    # print()
    # # sort cluster centers by proximity to centroid
    # order_centroids = km.cluster_centers_.argsort()[:, ::-1]
    # for i in range(num_clusters):
    #     print("Cluster %d words:" % i, end='')
    #     for ind in order_centroids[i, :6]:  # replace 6 with n words per cluster
    #         print(' %s' % vocab_frame.ix[terms[ind].split(' ')].values.tolist()[0][0].encode('utf-8', 'ignore'), end=',')
    #     print()  # add whitespace
    #     print()  # add whitespace
    #     print("Cluster %d messages:" % i, end='')
    #     for message in frame.ix[i]['message'].values.tolist():
    #         print(' %s,' % message, end='')
    #     print()  # add whitespace
    #     print()  # add whitespace

    return topics
from nltk.corpus import stopwords
from nltk.tokenize import wordpunct_tokenize
from nltk.stem import WordNetLemmatizer  # needed for WordNetLemmatizer below
from sklearn.feature_extraction.text import CountVectorizer
from scipy.sparse.csr import csr_matrix
import re, pdb, os, json
import numpy as np
from collections import Counter
import nltk

np.random.seed(1234)
lemmatizer = WordNetLemmatizer()
pattern_train = r"(\d*),\"(.*)\",(\d*)"
pattern_test = r"(\d*),\"(.*)\""
stopwords = list(stopwords.words("english"))
stopwords.append("__EOS__")
sw = [
    "right", "love", "people", "feel", "yeah", "one", "see", "something",
    "want", "year", "yes", "still", "kind",
from nltk import bigrams, trigrams
from nltk.tokenize import RegexpTokenizer  # needed for RegexpTokenizer below
from nltk.stem import WordNetLemmatizer
import nltk
import re
import math
import json
import urllib2  # Python 2; use urllib.request on Python 3

stopwords = nltk.corpus.stopwords.words('english')
tokenizer = RegexpTokenizer("[\w’]+", flags=re.UNICODE)
# st = LancasterStemmer()
wnl = WordNetLemmatizer()

keywords = []
with open('keywords.txt', 'r') as f:
    for i in f:
        keywords.append(i.strip())
with open('stopwords.txt', 'r') as f:
    for i in f:
        stopwords.append(i.strip())


def freq(word, doc):
    return doc.count(word)


def word_count(doc):
    return len(doc)


def tf(word, doc):
    return freq(word, doc) / float(word_count(doc))


def calcu_tf(keyword):
    url = "http://en.wikipedia.com/wiki/" + keyword
    content = urllib2.urlopen(url).read()
def get_ngram(filename=None, _type=None, is_stopword=None):
    file_content = open(filename).read()
    # Get the tokens; word_tokenize splits on punctuation as well as spaces
    tokens = nltk.word_tokenize(file_content)
    text = nltk.Text(tokens)
    word_filter = lambda *w: word_to_find not in w  # unused

    ## Bigrams
    # ENABLE STOP WORDS
    # stopwords = stopwords.words('english')
    stopwords = []
    stopwords.append('.')
    # stopwords.append('The')
    stopwords.append(':')
    stopwords.append(',')
    stopwords.append(';')
    stopwords.append('`')
    stopwords.append('``')
    stopwords.append('\"')
    if is_stopword == 1:
        filtered_tokens = []
        for ftoken in tokens:
            if ftoken not in stopwords:
                filtered_tokens.append(ftoken)
    else:
        filtered_tokens = tokens
    if _type == 2:
        finder = BigramCollocationFinder.from_words(filtered_tokens, window_size=3)
    else:
        finder = TrigramCollocationFinder.from_words(filtered_tokens, window_size=3)
    # keep only n-grams that appear at least once
    finder.apply_freq_filter(1)
    # .items() works on both Python 2 and 3; viewitems() was Python 2 only
    lst = list(finder.ngram_fd.items())
    ll = sorted(lst, key=lambda x: x[1])
    res = []
    for i in ll:
        k = list(i)
        k1 = []
        k1.append(' '.join(k[0]))
        k1.append(k[1])
        res.append(k1)
    return res
        rows = cursor.fetchall()
    except:
        print "Pid : ", pid, " not found"
        return
    return rows

# print nltk.pos_tag(['flipkart', 'samsung'])

# lemmatiser and stopwords initialization
lemmatizer = nltk.WordNetLemmatizer()
from nltk.corpus import stopwords

# building the stopwords list
stopwords = stopwords.words('english')
stoplist = ['>', '<', '%', '.', 'br/', '(', ')', '=', '!']
for i in stoplist:
    stopwords.append(i)

# normalise each qualified word
def normalise(word):
    """Normalises a word to lowercase and lemmatizes it."""
    word = word.lower()
    word = lemmatizer.lemmatize(word)
    return word

def armAssoc(dust, strength):
    print

def writetofile(l):
    feat = []
    remove = ["flipkart", "problem", "time", "product", "awesome", "thing", "port",
              "delivery", "buying", "perfect", "mode", "reason", "anything", "point",
              "excellent", "hand", "till", "fact", "market", "weather", "brand",
              "life", "option", "guide", "money"]
    for i in range(0, len(l)):
import bs4 as bs
import json
import re
import nltk
import heapq
import pickle
import sys
from pprint import pprint
import os
import codecs
import string
from sklearn_crfsuite import metrics
from DataExtraction import convertCONLLFormJustExtractionSemEvalPerfile
from FeatureExtraction import sent2labels, sent2features
from PhraseEval import phrasesFromTestSenJustExtractionWithIndex
from nltk import word_tokenize, pos_tag, ne_chunk
from nltk import conlltags2tree, tree2conlltags

# Swapped hardcoded link with system argument
file_inLoc = sys.argv[1]
file_outLoc = sys.argv[1].split(".")[0] + "-DKE.txt"
file_outLoc = file_outLoc[15:]
with open(file_inLoc, 'r', encoding='utf-8-sig') as f:
    article_text = json.load(f)
pprint(article_text)

'''
scraped_data = urllib.request.Request(file_inLoc, headers={'User-Agent': "Magic Browser"})
scraped_data = urllib.request.urlopen(scraped_data)
parsed_article = bs.BeautifulSoup(article, 'lxml')
paragraphs = parsed_article.find_all('p')
article_text = ""
    words = tknzr.tokenize(words)
    exclude = set(string.punctuation)
    words2 = [word for word in words if not word in exclude]
    words_tag = dict(pos_tag(words))
    words = [word.lower() for word in words2
             if not word in nltk.corpus.stopwords.words('english') and not word.isdigit()]
    # print(words)
    words = [lima(word, words) for word in words]
    # print(words)
    words = ' '.join(words)
    # print(words)
    return words


stopwords = stopwords.words('english')
stopwords.append('.')
# stopwords.union('sally')
# operators = set(('sally'))
# stop = set(nltk.corpus.stopwords.words('english')) + operators
# print(stopwords)

sentence_tokenizer = nltk.data.load('tokenizers/punkt/english.pickle')
word_tokenizer = nltk.tokenize.RegexpTokenizer(r'\w+')

chapters = []
with open('user_posts_1641812829207516.csv') as File:
    tfidfReader = csv.reader(File)
    for row in tfidfReader:
        chapters.append(clean(row[0]).encode('utf-8'))

num_chapters = len(chapters)
fvs_lexical = np.zeros((len(chapters), 3), np.float64)
fvs_punct = np.zeros((len(chapters), 3), np.float64)
i = 1
def analyze(soup_object, csv_name):
    # select only the headlines in each google search result
    base = soup_object.select("div.g.card h3")
    # collect the headlines, keeping only the text of each node
    headlines = []
    for row in base:
        headlines.append(row.text)
    # print(headlines)  # verify headlines are clean

    # tokenize headlines using the clean_tokens helper from earlier
    tokens = []
    for each in headlines:
        tokens.append(clean_tokens(each))
    # print('tokens =', tokens)

    # create stopwords list from nltk and add fallout 76 tokens as stopwords
    stopwords = nltk.corpus.stopwords.words('english')
    stopwords.append("fallout")
    stopwords.append('76')
    stopwords.append("'fallout")

    # remove stopwords from tokens
    filtered = []
    for token_list in tokens:  # renamed from `list`, which shadowed the built-in
        x = [word for word in token_list if word not in stopwords]
        filtered.append(x)
    # print("filtered = ", filtered)

    # join the tokens back into headlines without stopwords
    combined = []
    for token_list in filtered:
        combined.append(" ".join(token_list))

    # import the sentiment analyzer, create the sia object, and collect results
    from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
    sia = SIA()
    results = []
    # analyze sentiment of each headline
    for line in combined:
        pol_score = sia.polarity_scores(line)
        results.append(pol_score)
    # for i in results:
    #     print(i)

    # write sentiment analysis to csv file (note: csv_name is unused; the path is hardcoded)
    with open('resultsfinal.csv', 'a') as csv_file:
        writer = csv.writer(csv_file)
        for d in results:
            writer.writerow(['compound', d['compound']])
import nltk
from nltk.corpus import stopwords  # needed for stopwords.words below

tweets = pd.read_csv('../data/primary_debates_cleaned.csv')
tweets = tweets.drop(['URL', 'Location', 'Date', 'Line'], axis=1)
tweets = tweets.loc[tweets['Speaker'].isin(
    ['Bush', 'Carson', 'Chafee', 'Christie', 'Clinton', 'Cruz', 'Fiorina',
     'Gilmore', 'Graham', 'Huckabee', 'Jindal', 'Kasich', "O'Malley", 'Pataki',
     'Paul', 'Perry', 'Rubio', 'Sanders', 'Santorum', 'Trump', 'Walker', 'Webb'])]
tweets = tweets.loc[~tweets['Text'].isin(
    ['(APPLAUSE)', '(ANTHEM)', '(BELL)', '(BOOING)', '(COMMERCIAL)', '(CROSSTALK)',
     '(LAUGHTER)', '(MOMENT.OF.SILENCE)', '(SPANISH)', '(VIDEO.END)',
     '(VIDEO.START)', '(inaudible)'])]
# print(tweets)

democrat = tweets[tweets.Party == 'Democratic']
republican = tweets[tweets.Party == 'Republican']

stopwords = stopwords.words('english')
# add some unnecessary words to the stopwords list
stopwords.append("rt")
stopwords.append("u")
stopwords.append("amp")
stopwords.append("w")
stopwords.append("th")

clean_democrat = []
for d in democrat.Text:
    d = re.sub(r'https\S+', '', d)
    d = re.sub("[^a-zA-Z]", " ", d)
    d = d.lower()
    d = nltk.word_tokenize(d)
    d = [word for word in d if not word in set(stopwords)]
    lemma = nltk.WordNetLemmatizer()
    d = [lemma.lemmatize(word) for word in d]
    d = " ".join(d)
def pullAllTheStops(self):
    # note: despite the name, this collects the words that are NOT in self.stops
    stopwords = []
    for word in self.words:
        if word not in self.stops:
            stopwords.append(word)
    return stopwords
import csv
import random
import nltk
from nltk import word_tokenize
from nltk.corpus import stopwords

stopwords = stopwords.words("english")
stopwords.append('.')
stopwords.append('?')
stopwords.append('\'')
stopwords.append(',')
stopwords.append('’')
stopwords.append(')')
stopwords.append('(')
stopwords.append('/')


class DataLoader:
    """ DataLoader Utility """

    def __init__(self, file):
        self.file = file
        self.documents = []
        self.all_words = []
        self.questions = []
        self.answers = []
        self.questions_and_answer = []

    # doc - document words; dictionary - corpus words
    def get_feature(self, doc, dictionary):
        vector = {}
        words = set(doc)
        for w in dictionary:
def perturb(self, stringseq, method=0):
    assert len(stringseq) > 0, "sentence must not be empty!"
    assert any(
        c.lower() in string.ascii_lowercase for c in stringseq
    ), "the sentence has to contain at least one alphabet letter!"
    tokens = nltk.wordpunct_tokenize(stringseq)
    # if method is not specified, we randomly select one
    if method == 0:
        method = randint(1, 4)
    elif method < 0:
        method = randint(1, 4 + method)  # eliminate certain methods
    # qwerty
    if method == 1:
        while True:
            randidx = randint(0, len(tokens) - 1)
            token = tokens[randidx]
            # has to contain at least one alphabetic character
            if not any(c.lower() in string.ascii_lowercase for c in token):
                continue
            res = self.pert_qwerty.perturb(token)
            if res[0] == res[1].lower():
                continue
            fr, to = self.spans(tokens, stringseq, randidx)
            return (stringseq[:fr] + res[0] + stringseq[to:], res[1],
                    (fr + res[2][0], res[2][1]), 0)
    # drop
    elif method == 2:
        stopwords = []  # indices of stopword tokens
        for i, token in enumerate(tokens):
            if token in self.stopwords:
                stopwords.append(i)
        if len(stopwords) > 0:
            randidx = choice(stopwords)
            token = tokens[randidx]
        else:
            randidx = randint(0, len(tokens) - 2)
            token = tokens[randidx]
        fr, to = self.spans(tokens, stringseq, randidx)
        # remove a whitespace if we drop the word
        if to < len(stringseq) and stringseq[to] == ' ':
            # trailing whitespace
            return (stringseq[:fr] + stringseq[to + 1:], tokens[randidx], (fr, 0), 2)
        elif stringseq[fr - 1] == ' ':
            # leading whitespace
            return (stringseq[:fr - 1] + stringseq[to + 1:], tokens[randidx], (fr - 1, 0), 1)
        else:
            return (stringseq[:fr] + stringseq[to + 1:], tokens[randidx], (fr, 0), 0)
    # delete chars
    elif method == 3:
        pos_tags = nltk.pos_tag(tokens)
        cnt = 0
        while True:
            cnt += 1
            if cnt > 5:
                break
            randidx = randint(0, len(tokens) - 2)
            token = tokens[randidx]
            if not any(c.lower() in string.ascii_lowercase for c in token):
                continue
            # pos_tag returns (word, tag) pairs, so compare the tag; the
            # original compared the whole tuple, which never matched
            if len(token) < 2 or pos_tags[randidx][1] in ["TO", "SYM"]:
                continue
            res = self.pert_delete.perturb(token, pos_tags[randidx])
            fr, to = self.spans(tokens, stringseq, randidx)
            return (stringseq[:fr] + res[0] + stringseq[to:], res[1],
                    (fr + res[2][0], res[2][1]), 0)
        return self.perturb(stringseq, -2)
    # synonym
    elif method == 4:
        cnt = 0
        while True:
            cnt += 1
            if cnt > 5:  # in case of a dead loop
                break
            randidx = randint(0, len(tokens) - 2)
            token = tokens[randidx]
            # has to be a word
            if not all(c.lower() in string.ascii_lowercase for c in token):
                continue
            res = self.pert_synonym.perturb(token)
            if res[0] == res[1]:
                continue
            fr, to = self.spans(tokens, stringseq, randidx)
            return (stringseq[:fr] + res[0] + stringseq[to:], res[1],
                    (fr, res[2][1]), 0)
        return self.perturb(stringseq, -1)
# column names (kept verbatim, since they are data keys):
# 구분 = category, 순위 = rank, 단어 = word, 횟수 = count
dic.append({
    '구분': 'trigram',
    '순위': '1~20',
    '단어': unicode(trigram[0]) + unicode(trigram[1]) + unicode(trigram[2]),
    '횟수': str(dic[19]['횟수']) + u"회 이상"  # "N times or more"
})
csv_columns = ['구분', '순위', '단어', '횟수']
csv_file = filename.split('.')[0] + "freq.csv"
WriteDictToCSV(csv_file, csv_columns, dic)

stopwords = []
stoptext = open("stopwords.txt", "r")
for sw in stoptext.readlines():
    sw = sw.decode('cp949')  # Python 2: decode the CP949-encoded stopword file
    sw = re.sub('\\n', '', sw)
    stopwords.append(sw)

measures = collocations.BigramAssocMeasures()
measures2 = collocations.TrigramAssocMeasures()


def remove_values_from_list(the_list, val):
    return [value for value in the_list if value != val]


pwd = os.getcwd() + '\\document'
for path, dirs, files in os.walk(pwd):
    for file in files:
# coding=utf-8
import math
from textblob import TextBlob as tb
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

# STOPWORDS
stopwords = stopwords.words('portuguese')
stopwords.append('pra')


def remove_stopwords(sentences):
    phrases = []
    for sentence in sentences:
        # generate tokens, keeping only non-stopwords
        tokens = []
        words = word_tokenize(sentence)
        for word in words:
            if word.lower() not in stopwords:
                tokens.append(word)
        phrases.append(' '.join(tokens))
    return phrases


def get_cosine(vec1, vec2):
    size = len(vec1) - 1
    numerator = sum([vec1[x] * vec2[x] for x in range(size)])
    sum1 = sum([vec1[x] ** 2 for x in range(size)])
    sum2 = sum([vec2[x] ** 2 for x in range(size)])
from tweepy import OAuthHandler
from tweepy.streaming import StreamListener
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import json  # needed for json.loads below
import re  # needed for re.sub below
import nltk  # needed for nltk.download below
from textblob import TextBlob  # needed for TextBlob below

nltk.download('punkt')

consumerKey = "xxxx"
consumerSecret = "xxxx"
accessToken = "xxxx-xxxx"
accessSecret = "xxxxx"

stopwords = []
for i in open("stopwords.txt"):
    sword = i.rstrip('\n')
    stopwords.append(sword)

global check
check = []


class listener(StreamListener):
    x = 0
    y = 0

    def on_data(self, data):
        all_data = json.loads(data)
        tweet = all_data["text"]
        tweet = re.sub(r"http\S+", "", tweet)
        analysis = TextBlob(tweet)
        stop_words = set(stopwords)
        # remove stop words from tweet
        filtered_words = set(analysis.words.lower()) - stop_words
        for i in set(filtered_words):
            # remove @ tags