import os

from nltk.probability import FreqDist

# ProcessText is the project's own preprocessing helper, defined elsewhere.


class LoadTrainingData:
    def __init__(self, pos_dir, neg_dir):
        self.pos_dir = pos_dir
        self.neg_dir = neg_dir
        text_processor = ProcessText()
        self.data = []
        # Load preprocessed training data if it exists
        if os.path.exists("training_data.data"):
            with open("training_data.data") as f:
                self.data = eval(f.read())
        # Preprocess again if it does not exist
        else:
            # Read each example in the negative training directory.
            print("Loading negatives")
            for txt_name in os.listdir(neg_dir):
                with open(os.path.join(neg_dir, txt_name)) as f:
                    text = f.read()
                self.data.append(
                    text_processor.process_text(txt_name, text, 0))

            # Read each example in the positive training directory.
            print("Loading positives")
            for txt_name in os.listdir(pos_dir):
                with open(os.path.join(pos_dir, txt_name)) as f:
                    text = f.read()
                self.data.append(
                    text_processor.process_text(txt_name, text, 1))

            with open("training_data.data", "w") as f:
                f.write(str(self.data))

        # Load precomputed word frequencies if they exist
        if os.path.exists("word_freqs.data"):
            with open("word_freqs.data") as freqs:
                self.words_freq = eval(freqs.read())
        else:
            print("Computing word frequencies")
            # Compute word frequencies
            all_words = []
            for review in self.data:
                all_words = all_words + review["review"]
            self.words_freq = FreqDist(all_words)
            # This is slow to compute, so save it to a file.
            with open("word_freqs.data", "w") as f:
                self.words_freq.pprint(100000000, f)
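# The str()/eval() round trip above works, but it is fragile and slow for large
# lists. A minimal sketch of the same load-or-build caching with pickle instead
# (the cache_path and build_fn names are illustrative, not part of the class):
import os
import pickle


def load_or_build(cache_path, build_fn):
    """Return the cached object if it exists, otherwise build it and cache it."""
    if os.path.exists(cache_path):
        with open(cache_path, "rb") as f:
            return pickle.load(f)
    obj = build_fn()
    with open(cache_path, "wb") as f:
        pickle.dump(obj, f)
    return obj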
import os

import numpy as np
from nltk.probability import FreqDist

# semantic_clean, util_dump, DIR and FREQ_DIST_PICK are defined elsewhere in the project.


def build_vocabulary(doc_set, nlp, stop_words, threshold=0.8, vocab_size=10000):
    # The pickled data is of the form [(sentence, 0 | 2 | 4)].
    fdist = FreqDist()
    count = 0
    log_times = 10000
    total_size = len(doc_set)
    for pair in doc_set:
        # Progress logging every log_times documents
        count += 1
        if np.mod(count, log_times) == 0:
            print('%f finished' % (count / total_size))
        for word in semantic_clean(pair[0], nlp, stop_words):
            fdist[word] += 1
    fdist.pprint(maxlen=50)
    print('Begin flushing the frequency distribution to disk')
    util_dump(fdist, os.path.join(DIR, FREQ_DIST_PICK))
    print('End flushing the frequency distribution to disk')
    return _build_vocabulary(fdist, threshold, vocab_size)
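# build_vocabulary() delegates to _build_vocabulary(fdist, threshold, vocab_size),
# which is not shown here. A hypothetical sketch of what such a helper might do,
# assuming threshold is the fraction of total token mass the vocabulary should
# cover and vocab_size is a hard cap on its length:
def _build_vocabulary(fdist, threshold=0.8, vocab_size=10000):
    total = fdist.N()  # total number of tokens counted
    vocab, covered = [], 0
    for word, count in fdist.most_common(vocab_size):
        vocab.append(word)
        covered += count
        if covered / total >= threshold:
            break
    return vocab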
# Turn all tokens into lowercase
lowerToken = [i.lower() for i in token]

# Filter out non-alphanumeric tokens
filToken = [i for i in lowerToken if i.isalnum()]

# Lemmatization
from nltk.stem import WordNetLemmatizer
lemmatizer = WordNetLemmatizer()
lemToken = [lemmatizer.lemmatize(i) for i in filToken]

# Filter English stop words out of the text
from nltk.corpus import stopwords
a = set(stopwords.words("english"))
finalToken = [x for x in lemToken if x not in a]

# Save the list externally
with open('listfile.csv', 'a') as file:
    for i in finalToken:
        file.write('%s\n' % i)

# Record the frequency of words (pprint prints directly, so no extra print() is needed)
from nltk.probability import FreqDist
fdist = FreqDist(finalToken)
fdist.pprint(500)
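# Note that WordNetLemmatizer treats every token as a noun unless told otherwise,
# so verb forms pass through unchanged. A small illustration:
lemmatizer.lemmatize("running")            # -> 'running' (default POS is noun)
lemmatizer.lemmatize("running", pos="v")   # -> 'run'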
""" #Creating the frequency distribution of words import matplotlib.pyplot as plt import nltk from nltk.probability import FreqDist from nltk.tokenize import word_tokenize from nltk.tokenize import sent_tokenize from nltk.corpus import stopwords import pandas as pd import csv file = open("/home/verareyes/twitch_clips/fortnite/fort_01_time.txt", "r") p = file.read() fdist = FreqDist() for sentence in nltk.tokenize.sent_tokenize(p): for word in nltk.tokenize.word_tokenize(sentence): noise={"[", "]", "<", ">", "the", "to", "``", "a", "you", "?", "it", "!", "me", "and", "TO", "THIS", ":", "SPAM", "is", "HELP", "for", "i", "all", "in", "this", "on", "can", "of", "so", "please", "get", "of", "if", "do", "that", "be", "an", "my", "but", "no", "they", "will", "THE", "are", "at", "I", "'s", "'re", "'ll", "'", ",", ".", "have", "got", "with", "YOU", "your", "(", ")", "we", "’", "was", "A", "ME", "na", "did", "IT", "im", "IS", "IF", "gon", "WE", "'s", "''", "n't", "'m"} if word not in noise: fdist[word]+=1 import matplotlib.pyplot as plt fdist.plot(30,cumulative=False) plt.show() fdist.pprint(800) dataFrame = pd.DataFrame(list(fdist.items()), columns = ["Time", "Frequency" ]) dataFrame.to_csv('/home/verareyes/twitch_clips/fortnite/fort_01_freq.csv', index=False, header=True)
# Needed to install some nltk data first
import nltk
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.corpus import stopwords
from string import punctuation

nltk.download('punkt')
nltk.download('stopwords')

sents = sent_tokenize(concatstring)
words = word_tokenize(concatstring.lower())
_stopwords = set(stopwords.words('english') + list(punctuation))
words = [word for word in words if word not in _stopwords]
print("\n".join(words))

# let's add stemming (done in the stemming/POS cell further below)
from nltk.probability import FreqDist
freq = FreqDist(words)
freq.pprint(200)
type(freq)

freqdict = dict(freq)
# Could iterate this way, but the output is unsorted
for word in freqdict:
    print(word, freqdict[word])

# Another way: iterate over the whole distribution sorted by frequency
s = [(k, freqdict[k]) for k in sorted(freqdict, key=freqdict.get, reverse=True)]
with open("/Users/jayers/Temp/freqdistwords.csv", "a") as myfile:
    for k, v in s[:1000]:
        linestr = "{}, , , {}\n".format(k, v)
        myfile.write(linestr)
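# The hand-built format string above can also be written with the csv module;
# a sketch (same path as above; the two empty columns mirror the original layout,
# and FreqDist.most_common replaces the manual sort):
import csv

with open("/Users/jayers/Temp/freqdistwords.csv", "w", newline="") as myfile:
    writer = csv.writer(myfile)
    for word, count in freq.most_common(1000):
        writer.writerow([word, "", "", count])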
import re
from contextlib import redirect_stdout

import nltk
from nltk.probability import FreqDist, ConditionalFreqDist

# unzip_corpus is the assignment's own helper, defined elsewhere.


def process_corpus(corpus_name):
    print(f'1. Corpus name: {corpus_name}')
    input_file = corpus_name + ".zip"
    corpus_contents = unzip_corpus(input_file)

    # Split each document into sentences, then tokenize into words.
    corpus_sentences = []
    for content in corpus_contents:
        corpus_sentences.append(re.split(r'(?<=\.) ', content))
    corpus_words = []
    allwords = []
    for sent in corpus_sentences:
        words = []
        for word in sent:
            x = nltk.word_tokenize(word)
            words.append(x)
            for w in x:
                allwords.append(w.lower())
        corpus_words.append(words)

    # Write the POS-tagged corpus to <corpus_name>-pos.txt.
    f = open(corpus_name + "-pos.txt", "w")
    allpos = []
    for story in corpus_words:
        for sentence in story:
            sent = nltk.pos_tag(sentence)
            for word in sent:
                f.write(word[0] + "/" + word[1] + " ")
                allpos.append(word)
            f.write("\n\n")
    f.close()

    print(f'\n2. Total words in the corpus: {len(allwords)}')
    numunique = len(set(allwords))
    print(f'\n3. Vocabulary size of the corpus: {numunique}')

    # Count how often each POS tag occurs.
    posfreq = {}
    for i in allpos:
        if i[1] in posfreq:
            posfreq[i[1]] += 1
        else:
            posfreq[i[1]] = 1
    # Invert the tag -> count mapping and sort by count (note: tags sharing a count collapse here).
    inv = {v: k for k, v in posfreq.items()}
    sorted_posfreq = {k: inv[k] for k in sorted(inv)}
    l = list(sorted_posfreq.keys())
    print(f'\n4. The most frequent part-of-speech tag is {sorted_posfreq.get(l[-1])} '
          f'with frequency {l[-1]}')

    # Write the word frequencies to <corpus_name>-word-freq.txt.
    f = open(corpus_name + "-word-freq.txt", "w")
    fdist = FreqDist(word for word in allwords)
    fdist.pprint(maxlen=numunique, stream=f)
    f.close()

    cfdist = ConditionalFreqDist((word[1], word[0].lower()) for word in allpos)
    print('\n5. Frequencies and relative frequencies of all part-of-speech tags '
          'in the corpus in decreasing order of frequency are:')
    for i in range(1, len(sorted_posfreq) + 1):
        print(f'{sorted_posfreq.get(l[-i])} tag has frequency {l[-i]} '
              f'and relative frequency {round(l[-i] / len(allpos), 3)}.')

    f = open(corpus_name + "-pos-word-freq.txt", "w")
    with redirect_stdout(f):
        cfdist.tabulate()
    f.close()

    text = nltk.Text(allwords)
    pos_list = ["NN", "VBD", "JJ", "RB"]
    print("\n6.")
    for pos in pos_list:
        m = cfdist[pos].max()
        print(f'The most frequent word in the POS {pos} is {m} '
              f'and its most similar words are:')
        text.similar(m)

    print('7. Collocations:')
    text.collocations()
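# Inverting posfreq (count -> tag) silently drops tags that happen to share a
# count. A sketch of the same "most frequent tag" step with collections.Counter,
# which keeps ties and already sorts by frequency:
from collections import Counter

tag_counts = Counter(tag for _, tag in allpos)
top_tag, top_freq = tag_counts.most_common(1)[0]
print(f'Most frequent tag: {top_tag} ({top_freq} occurrences, '
      f'relative frequency {top_freq / len(allpos):.3f})')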
# Try the Porter stemmer
from nltk.stem import PorterStemmer
st = PorterStemmer()
stemmedWords = [st.stem(word) for word in words]
words = stemmedWords

# There is also a thing called lemmatization
from nltk.stem.wordnet import WordNetLemmatizer
lmtzr = WordNetLemmatizer()
lmtzr.lemmatize('cars')

# Add parts of speech
# Tag list: https://pythonprogramming.net/natural-language-toolkit-nltk-part-speech-tagging/
poswords = nltk.pos_tag(words)
posfreq = FreqDist(poswords)
freqdict = dict(freq)  # freq is the distribution from the earlier cell
posfreq.pprint(200)  # this looks good

# Add some word sense disambiguation.
# This will give all the definitions of a word:
from nltk.corpus import wordnet as wn
for ss in wn.synsets('bass'):
    print(ss, ss.definition())

# This will look at the use in a sentence and return the word sense:
from nltk.wsd import lesk
sense1 = lesk(word_tokenize("Sing in a lower tone, along with the bass"), 'bass')
print(sense1, sense1.definition())

# Recalculate the frequencies over the stemmed words
freq = FreqDist(words)
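# The lemmatizer call above uses the default noun POS. Since the pos_tag output
# is already available in poswords, a sketch that maps Penn Treebank tags to the
# WordNet POS the lemmatizer expects (the penn_to_wordnet helper is an
# illustration, not part of the original snippet):
from nltk.corpus import wordnet as wn


def penn_to_wordnet(tag):
    """Map a Penn Treebank tag to the WordNet POS constant the lemmatizer uses."""
    if tag.startswith("J"):
        return wn.ADJ
    if tag.startswith("V"):
        return wn.VERB
    if tag.startswith("R"):
        return wn.ADV
    return wn.NOUN


lemmas = [lmtzr.lemmatize(word, penn_to_wordnet(tag)) for word, tag in poswords]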