def convert_to_text(_input):
    if type(_input) is list:
        text_obj = Text(_input)
        return text_obj
    else:
        list1 = word_tokenize(_input)
        text_obj = Text(list1)
        return text_obj
def read_text(path):
    """Takes a file path, which is assumed to point to a file or a directory,
    and returns a Text instance."""
    if os.path.isfile(path):
        with open(path) as fh:
            return Text(nltk.word_tokenize(fh.read()))
    elif os.path.isdir(path):
        # Restrict to files with the .mrg extension, avoiding hidden files
        # like .DS_Store that can cause trouble.
        corpus = PlaintextCorpusReader(path, r'.*\.mrg')
        return Text(nltk.word_tokenize(corpus.raw()))
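# A minimal usage sketch for read_text above; the file and directory paths are
# hypothetical examples, not from the original source.
essay = read_text("notes.txt")          # single plain-text file
parsed = read_text("corpus/parsed/")    # directory containing .mrg files
essay.concordance("language")
print(parsed.vocab().most_common(10))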
def tokenize_and_stem(text):
    # tokenize
    tokens = nltk.word_tokenize(text)
    # remove stopwords
    english_stopwords = stopwords.words("english")
    self_stopwords = [
        'comprises', 'comprising', 'first', 'second', 'includes', 'use',
        'plurality', 'device', 'structure', 'arranged', 'connected',
        'invention', 'provided', 'receive', 'provide', 'extend', 'relates',
        'configured', 'method'
    ]
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '!', '@', '#', '%', '$', '*'
    ]
    # stem
    st = PorterStemmer()
    words_clear = []
    for i in tokens:
        if (i.lower() not in english_stopwords
                and i not in self_stopwords
                and i not in english_punctuations
                and re.search('[a-z]', i)):
            words_clear.append(st.stem(i))
    words_text = Text(words_clear)
    return words_text
def bag_of_words(path):
    import string
    f = "foodhotel.txt"
    fo = open(path + f, "r")  # "rw+" is not a valid mode; read-only is enough here
    review = fo.read()
    sentence = word_tokenize(review)
    tokens = pos_tag(Text(sentence))
    customStopWords = set(stopwords.words('english') + list(string.punctuation))
    # tokens are (word, pos) pairs, so filter on the word itself
    useful_words = [(word, pos) for word, pos in tokens
                    if word not in customStopWords]
    nouns = [word for word, pos in useful_words
             if pos in ('JJ', 'VBG', 'NN', 'RB', 'VB')]
    freq_nouns = nltk.FreqDist(nouns)
    adj_tags = str(freq_nouns.most_common(100000))
    check = adj_tags.split('),')
    string_list = ''
    for word in check:
        string_list = string_list + '\n' + str(word)
    print(string_list)
    output = open("/home/amisha/Desktop/sourcedata/output_food.txt", "w")
    output.write(string_list)
    output.close()
def search_word_freq(a_list, word):
    word_list = []
    for tweet in a_list:
        word_list += preprocess(tweet)
    text_list = Text(word_list)
    fdist = FreqDist(text_list)
    print("Frequency of '%s': %d" % (word, fdist[word]))
def read_file(file_path):
    with open(file_path, 'r') as f:
        contents = f.read()
    clean_html = clean_website(contents)
    t = nltk.tokenize.WhitespaceTokenizer()
    html_text = Text(t.tokenize(clean_html))
    return html_text
def context_search(a_list, search_word):
    search_word = search_word.lower()
    for tweet in a_list:
        word_list = preprocess(tweet)
        text_list = Text(word_list)
        if search_word in word_list:
            text_list.concordance(search_word)
def visualize(self, number):
    f = open(self.myfile, encoding="utf8")
    raw = f.read()
    t = nltk.tokenize.WhitespaceTokenizer()
    c = Text(t.tokenize(raw))
    fdist = nltk.FreqDist(c)
    fdist.plot(number)
def bookWriter(lis):
    actualBook = "Blank space, baby!"  # Please be a T-swift fan.
    bookLis = []
    for i in lis:
        # https://stackoverflow.com/questions/29110950/python-concordance-command-in-nltk
        actualBook = Text(nltk.corpus.gutenberg.words(i))
        bookLis.append(actualBook)  # It's best to just add them to the end, rather than replacing.
    return bookLis
def get_category_vocabs(categories):
    """Returns a dictionary of vocabularies indexed on category names."""
    vocabs = {}
    for cat in categories:
        words = brown.words(categories=[cat])
        vocabs[cat] = Vocabulary(Text(words))
    return vocabs
def getStockNames(self, question):
    # Example queries this method is meant to handle:
    a = "What is the market colour for google?"
    b = "What is market color for GOOGL?"
    c = "market color GOOGL?"
    d = "How are apple and tesla doing?"
    e = "How are APPL and TESL doing?"
    stocks = ['APPL', 'apple', 'GOOG', 'google']
    sentence = question  # enter command here
    tokens = word_tokenize(sentence)
    textList = Text(tokens)
    print(textList)
    results = []
    for word in tokens:
        if word in stocks:
            results.append(word)
    print(results)
    return [results]
def analyze(text):
    text = text.decode('utf-8')
    tokens = nltk.word_tokenize(text)
    vocab = Text(tokens).vocab()
    tokens_list = lemma(vocab)
    glossary = glossay_wrapper(tokens_list)
    return glossary
def filter_words_from_search(search_results):
    t0 = time.time()
    if len(search_results) > 0:
        # Tokenize title + description + content of each result, then drop
        # punctuation, digits, stop words, and single characters.
        filtered_sites = [
            [lemmatize(t.lower()) for t in page_tokens
             if t not in string.punctuation
             and not t.isdigit()
             and lemmatize(t.lower()) not in stop_words
             and len(t) > 1]
            for page_tokens in [
                tokenizer.tokenize(a['title'] + " " + a['description'] + " " + a['content'])
                for a in search_results
            ]
        ]
        # Map each page link to the vocabulary (word -> count) of its filtered tokens
        site_dict = {
            page['link']: dict(Text(filtered_sites[page_num]).vocab())
            for page_num, page in enumerate(search_results)
        }
        context_list = [nltk.text.ContextIndex(tex) for tex in filtered_sites]
        t1 = time.time()
        print("Filter Time: " + str(t1 - t0))
        stuff_dict = defaultdict(lambda: defaultdict())
        for i in range(len(search_results)):
            stuff_dict["title"][search_results[i]["link"]] = search_results[i]["title"]
            stuff_dict["importance"][search_results[i]["link"]] = i + 1
            stuff_dict["description"][search_results[i]["link"]] = search_results[i]["description"]
        return site_dict, stuff_dict, context_list
    else:
        return {'error': 404}
def getMsgFeatures(self, msgTokens):
    gText = Text(msgTokens)
    msgContent = ' '.join(msgTokens)
    features = {}
    features['length'] = len(msgContent)
    features['lexdiv'] = self.getLexicalDiversity(gText)
    features['longestword'] = self.getLongestWord(msgContent)
    for token in msgTokens:
        wordCount = msgContent.lower().count(token)
        if wordCount > 1:
            features["repeats({})".format(token)] = wordCount
    for unigram in msgTokens:
        features["unigram({})".format(unigram)] = True
    for bigram in self.getBigrams(msgTokens):
        features["bigram({})".format(bigram)] = True
    for trigram in self.getTrigrams(msgTokens):
        features["trigram({})".format(trigram)] = True
    return features
def get_concordance(target_word, tar_passage, left_margin=10, right_margin=10):
    """
    Function to get all the phrases that contain the target word in a
    text/passage tar_passage. Workaround to save the output given by the
    nltk concordance function.

    str target_word, str tar_passage, int left_margin, int right_margin --> list of str

    left_margin and right_margin allocate the number of words/punctuation
    before and after the target word; left_margin will take note of the
    beginning of the text.
    """
    # Make sure the punkt tokenizer is available
    if not download('punkt'):
        download('punkt')

    # Create a list of tokens using the nltk tokenizer
    tokens = word_tokenize(tar_passage)

    # Create the Text of tokens
    text = Text(tokens)

    # Collect all the index/offset positions of the target word
    c = ConcordanceIndex(text.tokens, key=lambda s: s.lower())

    # Collect the range of words around the target word using text.tokens[start:end].
    # The start index is clamped to zero when offset - left_margin would be negative.
    concordance_txt = [
        text.tokens[max(offset - left_margin, 0):offset + right_margin]
        for offset in c.offsets(target_word)
    ]

    # Join the tokens of each target phrase and return the list
    return [' '.join(con_sub) for con_sub in concordance_txt]
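# A minimal usage sketch for get_concordance above; the sample passage is made up
# for illustration and is not from the original source.
sample = ("The old whale surfaced near the boat. The crew watched the whale dive, "
          "and hours later the whale rose again in the distance.")
for phrase in get_concordance("whale", sample, left_margin=4, right_margin=4):
    print(phrase)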
def show_best_tf_idf(reader):
    vocabulary = set(stemmer.stem(w.lower()) for w in reader.words())
    vocab = {}
    for category in reader.categories():
        vocab_cat = {}
        for fileid in reader.fileids(categories=category):
            for word in vocabulary:
                # document frequency: number of files containing the word
                count = 0
                for other_fileid in reader.fileids():
                    if word in [stemmer.stem(w.lower())
                                for w in reader.words(fileids=other_fileid)]:
                        count += 1
                text = Text(stemmer.stem(w.lower())
                            for w in reader.words(fileids=fileid))
                tf = len([w for w in text if w == word])  # simple term frequency
                # tf = 1 if word in document, 0 otherwise  # binary weight
                idf = log(len(reader.fileids()) / count)
                # here we have the tf-idf for each word of the vocabulary in that specific doc
                tfidf = tf * idf
                if word not in vocab_cat:
                    vocab_cat[word] = tfidf
                else:
                    vocab_cat[word] += tfidf
        # do something to add the category dict to the global dict
        print(vocab_cat)
def get(self, request):
    # Open and read the txt file
    corpusFile = open(CORPUS_ROOT + "a1.txt", 'r', encoding="utf8")
    corpusFileRead = corpusFile.read()
    # ftext1 = corpusFileRead.split()
    abst = Text(corpusFileRead.split())
    result = abst.concordance_list(str(request.GET.get('param')))
    return Response(result)
class TestTextTransliteration(unittest.TestCase):
    txt = Text(["São", "Tomé", "and", "Príncipe"])

    def test_repr(self):
        self.assertEqual(repr(self.txt), br"<Text: S\xe3o Tom\xe9 and Pr\xedncipe...>")

    def test_str(self):
        self.assertEqual(str(self.txt), b"<Text: Sao Tome and Principe...>")
def freqDist(self, word):
    f = open(self.myfile, encoding="utf8")
    raw = f.read()
    t = nltk.tokenize.WhitespaceTokenizer()  # Same functionality as countTheWord()
    c = Text(t.tokenize(raw))
    fdist = nltk.FreqDist(c)
    print(fdist[word])
def tokenize(raw_text):
    result = re.sub(pattern="/[a-z+-]*[ .,$*]", repl=" ", string=raw_text)
    result = word_tokenize(result)
    result = [
        w.lower() for w in result
        if w not in stopwords.words('english') and len(w) > 3
    ]
    return Text(result)
def word_count(self, word=None, mode='cltk'):
    if mode == 'nltk':
        counts = dict(Text(self.tokenize()).vocab())
    else:
        counts = Frequency().counter_from_str(self.data)
    # If a single word was specified, only return that frequency
    if word:
        return counts[word]
    return counts
def freqDistribution(self):
    f = open(self.myfile, encoding="utf8")
    raw = f.read()
    t = nltk.tokenize.WhitespaceTokenizer()
    c = Text(t.tokenize(raw))
    fdist = nltk.FreqDist(c)
    voc2 = fdist.items()
    print('All words with their frequencies', voc2)
    print('The word with most frequency', fdist.max())
def get(self, request):
    # Open and read the txt file
    corpusFile = open(CORPUS_ROOT + "a1.txt", 'r', encoding="utf8")
    corpusFileRead = corpusFile.read()
    abst = Text(corpusFileRead.split())
    fdist1 = FreqDist(abst)
    result = fdist1.most_common(50)
    return Response(result)
def read_text(path):
    if os.path.isdir(path):
        raw = PlaintextCorpusReader(path, '.*').raw()
    else:
        f = open(path)
        raw = f.read()
    tokens = nltk.tokenize.word_tokenize(raw)
    return Text(tokens)
def read_url(url):
    # 1. download the raw HTML for the page
    html_text = requests.get(url).text
    # 2. strip the markup and set up a whitespace tokenizer
    t = nltk.tokenize.WhitespaceTokenizer()
    clean_html = clean_website(html_text)
    # 3. wrap the tokens in an nltk Text object
    html_text = Text(t.tokenize(clean_html))
    return html_text
def get_five_question(num=100, window_size=2):
    # Question 5
    data = pd.read_csv("./datasets/tweets_train.csv")
    totals = []
    for one in data.values:
        tweets = one[0]
        tweetscut = tweets.lower().split()  # keep tokens as str; no need to encode to bytes
        for x in tweetscut:
            totals.append(x)
    text = Text(totals)
    text.collocations(num=num, window_size=window_size)
def _freq_func(self):
    if self.input_text is not None:
        input_txt = [w for w in self.input_text.split()]
    else:
        raise Exception('The attr input_text object is None!')
    corpus = Text(input_txt)
    f_dist = FreqDist(corpus)
    w, v = zip(*f_dist.items())
    self.freq_df = pd.DataFrame({'word': w, 'freq': v})
    self.freq_df.sort_values('freq', ascending=False, inplace=True)
    self.freq_df['idx'] = np.arange(len(v))
def filter_words_from_search(search_results):
    filteredwords = []
    punctuation = [",", ".", ":", ";", "'", "-", "!", '"', "-", "|"]
    for page in search_results:
        tokens = tokenizer.tokenize(page['title'])
        filteredtokens = [w for w in tokens
                          if w not in nltk.corpus.stopwords.words('english')]
        filteredwords += [x for x in filteredtokens if x not in punctuation]
    filteredtext = Text(w.lower() for w in filteredwords)
    return filteredtext
def __init__(self, data_root):
    self.data_root = data_root
    self.data = PlaintextCorpusReader(data_root, '.*')
    self.words = [i for i in self.data.words() if i.isalpha()]
    self.text = Text(self.words)
    self.stop = set(stopwords.words('english')).union({
        'cid', 'et', 'al', 'also', 'and', 'editingboston', 'arxiv', 'pages',
        'trackboston', 'preprint', 'page', 'vol', 'volume', 'march', 'boston',
        'table'
    })
    with open('bib.json') as fi:
        self.bib = json.load(fi)
def find_similar(filename, maxlines, keyword, tweets_str):
    # NLTK has a unique tokenizer for tweets
    t = nltk.tokenize.TweetTokenizer()
    tweet_txt_obj = Text(t.tokenize(tweets_str))
    # by default similar() prints to the console;
    # this reroutes the output to a file
    similar_file = open(filename, 'w', encoding='utf8')
    tmpout = sys.stdout
    sys.stdout = similar_file
    tweet_txt_obj.similar(keyword, maxlines)
    similar_file.close()
    sys.stdout = tmpout
def __init__(self, source, name=None):
    if hasattr(source, "words"):
        source = [source.words(f) for f in source.fileids()]
    self._texts = source
    Text.__init__(self, LazyConcatenation(source), name)
    self._idf_cache = {}