Example #1
def convert_to_text(_input):
    if type(_input) is list:
        text_obj = Text(_input)
        return text_obj
    else:
        list1 = word_tokenize(_input)
        text_obj = Text(list1)
        return text_obj
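A minimal usage sketch for convert_to_text(), assuming the function sits in a module where nltk's word_tokenize and Text are imported and the 'punkt' tokenizer data has been downloaded; the sample sentence is invented:

from nltk import word_tokenize
from nltk.text import Text

# Either a raw string or an already tokenized list yields a Text object.
t1 = convert_to_text("The quick brown fox jumps over the lazy dog.")
t2 = convert_to_text(["the", "quick", "brown", "fox"])
print(t1.vocab().most_common(3))  # the three most frequent tokens with their counts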
Example #2
def read_text(path):
    """Takes a file path, which is assumed to point to a file or a directory,
    and returns a Text instance."""
    if os.path.isfile(path):
        with open(path) as fh:
            return Text(nltk.word_tokenize(fh.read()))
    elif os.path.isdir(path):
        # restrict to files with the mrg extension, avoiding hidden files like .DS_Store
        # that can cause trouble
        corpus = PlaintextCorpusReader(path, r'.*\.mrg')
        return Text(nltk.word_tokenize(corpus.raw()))
Example #3
def tokenize_and_stem(text):
    #tokenize
    tokens = nltk.word_tokenize(text)
    #remove stopwords
    english_stopwords = stopwords.words("english")
    self_stopwords = [
        'comprises', 'comprising', 'first', 'second', 'includes', 'use',
        'plurality', 'device', 'structure', 'arranged', 'connected',
        'invention', 'provided', 'receive', 'provide', 'extend', 'relates',
        'configured', 'method'
    ]
    english_punctuations = [
        ',', '.', ':', ';', '?', '(', ')', '[', ']', '!', '@', '#', '%', '$',
        '*'
    ]
    #stem
    st = PorterStemmer()
    words_clear = []

    for i in tokens:
        if i.lower() not in english_stopwords:
            if i not in self_stopwords:
                if i not in english_punctuations:
                    if re.search('[a-z]', i):
                        t = st.stem(i)
                        words_clear.append(t)
    words_text = Text(words_clear)
    return words_text
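A hedged usage sketch for tokenize_and_stem(); the imports below are what the snippet appears to rely on, the sample sentence is invented, and the nltk 'punkt' and 'stopwords' data are assumed to be downloaded:

import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.text import Text

sample = "The first device comprises a plurality of connected structures."
cleaned = tokenize_and_stem(sample)
print(cleaned.tokens)                  # stemmed tokens with stopwords and punctuation removed
print(cleaned.vocab().most_common(5))  # frequencies of the surviving stems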
Example #4
def bag_of_words(path):
    import string
    f = "foodhotel.txt"

    fo = open(path + f, "r")
    review = fo.read()

    sentence = word_tokenize(review)
    tokens = pos_tag(Text(sentence))
    customStopWords = set(
        stopwords.words('english') + list(string.punctuation))
    # tokens are (word, pos) pairs, so filter on the word itself
    useful_words = [(word, pos) for word, pos in tokens if word.lower() not in customStopWords]
    nouns = [
        word for word, pos in useful_words if pos == 'JJ' or pos == 'VBG'
        or pos == 'NN' or pos == 'RB' or pos == 'VB'
    ]
    freq_nouns = nltk.FreqDist(nouns)
    adj_tags = str(freq_nouns.most_common(100000))
    check = adj_tags.split('),')
    string_list = ''
    for word in check:
        string_list = string_list + '\n' + str(word)
    print(string_list)
    output = open("/home/amisha/Desktop/sourcedata/output_food.txt", "w")
    output.write(string_list)
    output.close()
def search_word_freq(a_list, word):
    word_list = []
    for tweet in a_list:
        word_list += preprocess(tweet)
    text_list = Text(word_list)
    fdist = FreqDist(text_list)
    print("Frequent of '%s' Word Search: ") % word + str(fdist[word])
Example #6
def read_file(file_path):
    with open(file_path,'r') as f:
        contents = f.read()
        clean_html = clean_website(contents)
    t = nltk.tokenize.WhitespaceTokenizer()
    html_text = Text(t.tokenize(clean_html))
    return html_text
def context_search(a_list, search_word):
    for tweet in a_list:
        word_list = preprocess(tweet)
        text_list = Text(word_list)
        search_word = search_word.lower()
        if search_word in word_list:
            text_list.concordance(search_word)
    def visualize(self, number):
        f = open(self.myfile, encoding="utf8")
        raw = f.read()
        t = nltk.tokenize.WhitespaceTokenizer()
        c = Text(t.tokenize(raw))
        fdist = nltk.FreqDist(c)
        fdist.plot(number)
def bookWriter(lis):
    actualBook = "Blank space, baby!" #Please be a T-swift fan. 
    bookLis = []
    for i in lis:
        actualBook = Text(nltk.corpus.gutenberg.words(i)) #https://stackoverflow.com/questions/29110950/python-concordance-command-in-nltk
        bookLis.append(actualBook) #It's best to just add them to the end, rather than replacing.
    return bookLis
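A short usage sketch for bookWriter(), assuming the nltk 'gutenberg' corpus data is available; the two file ids are standard Gutenberg sample texts:

import nltk
from nltk.text import Text

books = bookWriter(['austen-emma.txt', 'melville-moby_dick.txt'])
books[1].concordance('monstrous')  # prints concordance lines for Moby Dick to stdout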
Example #10
def get_category_vocabs(categories):
    """Returns a dictionary of vocabularies indexed on category names."""
    vocabs = {}
    for cat in categories:
        words = brown.words(categories=[cat])
        vocabs[cat] = Vocabulary(Text(words))
    return vocabs
	def getStockNames(self, question):
		a = "What is the market colour for google?"
		b = "What is market color for GOOGL?"
		c = "market color GOOGL?"
		d = "How are apple and tesla doing?"
		e = "How are APPL and TESL doing?"

		stocks = [
		    'APPL',
		    'apple',
		    'GOOG',
		    'google'
		]

		sentence = question  # enter command here
		tokens = word_tokenize(sentence)
		textList = Text(tokens)
		print(textList)

		results = []
		for word in tokens:
			if word in stocks:
				results.append(word)
		print(results)

		return [results]
Example #12
def analyze(text):
    text = text.decode('utf-8')
    tokens = nltk.word_tokenize(text)
    vocab = Text(tokens).vocab()
    tokens_list = lemma(vocab)
    glossary = glossay_wrapper(tokens_list)
    return glossary
Example #13
def filter_words_from_search(search_results):
    t0 = time.time()
    if (len(search_results) > 0):
        filtered_sites = [[
            lemmatize(t.lower()) for t in x
            if t not in string.punctuation and not t.isdigit()
            and lemmatize(t.lower()) not in stop_words and len(t) > 1
        ] for x in [
            tokenizer.tokenize(a['title'] + " " + a['description'] + " " +
                               a['content']) for a in search_results
        ]]
        site_dict = dict([[
            page['link'],
            [
                dict(site.vocab())
                for site in [Text(tex) for tex in filtered_sites]
            ][page_num]
        ] for page_num, page in enumerate(search_results)])
        context_list = [nltk.text.ContextIndex(tex) for tex in filtered_sites]
        t1 = time.time()
        print("Filter Time: " + str(t1 - t0))

        stuff_dict = defaultdict(lambda: defaultdict())
        for i in range(len(search_results)):
            stuff_dict["title"][search_results[i]
                                ["link"]] = search_results[i]["title"]
            stuff_dict["importance"][search_results[i]["link"]] = i + 1
            stuff_dict["description"][
                search_results[i]["link"]] = search_results[i]["description"]

        return site_dict, stuff_dict, context_list
    else:
        return {'error': 404}
Example #14
    def getMsgFeatures(self, msgTokens):

        gText = Text(msgTokens)
        msgContent = ' '.join(msgTokens)

        features = {}
        features['length'] = len(msgContent)
        features['lexdiv'] = self.getLexicalDiversity(gText)
        features['longestword'] = self.getLongestWord(msgContent)

        for token in msgTokens:
            wordCount = msgContent.lower().count(token)
            if (wordCount > 1):
                features["repeats({})".format(token)] = wordCount

        for unigram in msgTokens:
            features["unigram({})".format(unigram)] = True

        for bigram in self.getBigrams(msgTokens):
            features["bigram({})".format(bigram)] = True

        for trigram in self.getTrigrams(msgTokens):
            features["trigram({})".format(trigram)] = True

        return features
Example #15
def get_concordance(target_word, tar_passage, left_margin = 10, right_margin = 10):
	"""
	Function to get all the phases that contain the target word in a text/passage tar_passage.
	Workaround to save the output given by nltk Concordance function
	 
	str target_word, str tar_passage int left_margin int right_margin --> list of str
	left_margin and right_margin allocate the number of words/pununciation before and after target word
	Left margin will take note of the beginning of the text
	"""
	# check for punkt tokenizer
	if download('punkt') == False:
		download('punkt')

	## Create list of tokens using nltk function
	tokens = word_tokenize(tar_passage)

	## Create the text of tokens
	text = Text(tokens)

	## Collect all the index or offset position of the target word
	c = ConcordanceIndex(text.tokens, key = lambda s: s.lower())

	## Collect the range of words around the target word by slicing text.tokens[start:end].
	## The map function is used so that when offset - left_margin < 0, the start defaults to zero.
	concordance_txt = ([text.tokens[list(map(lambda x: x - left_margin if (x - left_margin) > 0 else 0, [offset]))[0]: offset + right_margin]
	                for offset in c.offsets(target_word)])

	## join the sentences for each of the target phrase and return it
	return [' '.join([x + ' ' for x in con_sub]) for con_sub in concordance_txt]
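A minimal usage sketch for get_concordance(), assuming the nltk imports the function body relies on (download, word_tokenize, Text, ConcordanceIndex); the passage is invented:

from nltk import download, word_tokenize
from nltk.text import Text, ConcordanceIndex

passage = ("The cat sat on the mat. The cat chased the mouse. "
           "Later the cat slept on the mat again.")
for phrase in get_concordance('cat', passage, left_margin=3, right_margin=3):
    print(phrase)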
Example #16
def show_best_tf_idf(reader):
    vocabulary = set(stemmer.stem(word.lower()) for word in reader.words())
    vocab = {}
    for category in reader.categories():
        vocab_cat = {}
        for fileid in reader.fileids(categories=category):
            for word in vocabulary:
                count = 0
                # count how many documents in the corpus contain this stem;
                # use a different loop variable so the outer fileid is not clobbered
                for other_fileid in reader.fileids():
                    if word in [
                            stemmer.stem(w.lower())
                            for w in reader.words(fileids=other_fileid)
                    ]:
                        count += 1
                text = Text(
                    stemmer.stem(w.lower())
                    for w in reader.words(fileids=fileid))
                tf = len([w for w in text
                          if w == word])  # simple term frequency
                #tf = 1 if word in document, 0 otherwise # binary weight
                idf = log(len(reader.fileids()) / count)
                # here we have the tf-idf for each word of the vocabulary in that specific doc
                tfidf = tf * idf
                if word not in vocab_cat:
                    vocab_cat[word] = tfidf
                else:
                    vocab_cat[word] += tfidf
        # do something to add the category dict to the global dict
        print(vocab_cat)
Example #17
    def get(self, request):
        # Open and read the txt file
        corpusFile = open(CORPUS_ROOT + "a1.txt", 'r', encoding="utf8")
        corpusFileRead = corpusFile.read()
        # ftext1 = corpusFileRead.split()
        abst = Text(corpusFileRead.split())
        result = abst.concordance_list(str(request.GET.get('param')))
        return Response(result)
class TestTextTransliteration(unittest.TestCase):
    txt = Text(["São", "Tomé", "and", "Príncipe"])

    def test_repr(self):
        self.assertEqual(repr(self.txt), br"<Text: S\xe3o Tom\xe9 and Pr\xedncipe...>")

    def test_str(self):
        self.assertEqual(str(self.txt), b"<Text: Sao Tome and Principe...>")
    def freqDist(self, word):
        f = open(self.myfile, encoding="utf8")
        raw = f.read()
        t = nltk.tokenize.WhitespaceTokenizer()  # Same functionality as countTheWord()
        c = Text(t.tokenize(raw))
        fdist = nltk.FreqDist(c)
        print(fdist[word])
Example #20
def tokenize(raw_text):
    result = re.sub(pattern="/[a-z+-]*[ .,$*]", repl=" ", string=raw_text)
    result = word_tokenize(result)
    result = [
        w.lower() for w in result
        if w not in stopwords.words('english') and len(w) > 3
    ]
    return Text(result)
Example #21
    def word_count(self, word=None, mode='cltk'):
        if mode == 'nltk':
            counts = dict(Text(self.tokenize()).vocab())
        else:
            counts = Frequency().counter_from_str(self.data)
        # If a single word was specified, only return that frequency
        if word:
            return counts[word]
        return counts
    def freqDistribution(self):
        f = open(self.myfile, encoding="utf8")
        raw = f.read()
        t = nltk.tokenize.WhitespaceTokenizer()
        c = Text(t.tokenize(raw))
        fdist = nltk.FreqDist(c)
        voc2 = fdist.items()
        print('All words with their frequencies', voc2)
        print('The word with most frequency', fdist.max())
Example #23
    def get(self, request):
        # Open and Read the txt files
        corpusFile = open(CORPUS_ROOT + "a1.txt", 'r', encoding="utf8")
        corpusFileRead = corpusFile.read()
        abst = Text(corpusFileRead.split())

        fdist1 = FreqDist(abst)
        result = fdist1.most_common(50)
        return Response(result)
Example #24
def read_text(path):
    if os.path.isdir(path):
        raw = PlaintextCorpusReader(path, '.*').raw()
        tokens = nltk.tokenize.word_tokenize(raw)
    else:
        with open(path) as f:
            raw = f.read()
        tokens = nltk.tokenize.word_tokenize(raw)

    return Text(tokens)
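A self-contained usage sketch for read_text(); it writes a throwaway file with tempfile, and assumes only the imports the function itself needs plus the nltk 'punkt' data:

import os
import tempfile
import nltk
from nltk.text import Text
from nltk.corpus import PlaintextCorpusReader

with tempfile.NamedTemporaryFile('w', suffix='.txt', delete=False) as tmp:
    tmp.write("Rose is a rose is a rose is a rose.")
text = read_text(tmp.name)
print(text.vocab().most_common(3))  # the three most frequent tokens with their counts
os.remove(tmp.name)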
Example #25
def read_url(url):

    # download the raw HTML
    html_text = requests.get(url).text
    # set up a whitespace tokenizer and strip the page markup
    t = nltk.tokenize.WhitespaceTokenizer()
    clean_html = clean_website(html_text)
    # tokenize the cleaned text and wrap it in an nltk Text object
    html_text = Text(t.tokenize(clean_html))
    return html_text
Example #26
def get_five_question(num=100, window_size=2):
    # Question 5
    data = pd.read_csv("./datasets/tweets_train.csv")
    totals = []
    for one in data.values:
        tweets = one[0]
        tweetscut = tweets.lower().split()
        for x in tweetscut:
            totals.append(x)
    text = Text(totals)
    text.collocations(num=num, window_size=window_size)
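The call that matters above is Text.collocations(num, window_size). A sketch of the same call on a built-in corpus, independent of the tweets CSV (assumes the nltk 'gutenberg' and 'stopwords' data are downloaded):

import nltk
from nltk.text import Text

emma = Text(nltk.corpus.gutenberg.words('austen-emma.txt'))
emma.collocations(num=10, window_size=2)  # prints the ten strongest bigram collocations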
Example #27
    def _freq_func(self):
        if self.input_text is not None:
            input_txt = [w for w in self.input_text.split()]
        else:
            raise Exception('The attr input_text object is None!')
        corpus = Text(input_txt)
        f_dist = FreqDist(corpus)
        w, v = zip(*f_dist.items())
        self.freq_df = pd.DataFrame({'word': w, 'freq': v})
        self.freq_df.sort_values('freq', ascending=False, inplace=True)
        self.freq_df['idx'] = np.arange(len(v))
Example #28
def filter_words_from_search(search_results):
    filteredwords = []
    punctuation = [",",".",":",";","'","-","!",'"',"-","|"]
    for page in search_results:
        tokens = tokenizer.tokenize(page['title'])
        filteredtokens = [w for w in tokens if w not in nltk.corpus.stopwords.words('english')]
        filteredwords += [x for x in filteredtokens if x not in punctuation]

    filteredtext = Text(w.lower() for w in filteredwords)
    return filteredtext
Example #29
    def __init__(self, data_root):
        self.data_root = data_root
        self.data = PlaintextCorpusReader(data_root, '.*')
        self.words = [i for i in self.data.words() if i.isalpha()]
        self.text = Text(self.words)
        self.stop = set(stopwords.words('english')).union({
            'cid', 'et', 'al', 'also', 'and', 'editingboston', 'arxiv',
            'pages', 'trackboston', 'preprint', 'page', 'vol', 'volume',
            'march', 'boston', 'table'
        })
        with open('bib.json') as fi:
            self.bib = json.load(fi)
Example #30
def find_similar(filename, maxlines, keyword, tweets_str):
    #NLTK has a unique tokenizer for tweets
    t = nltk.tokenize.TweetTokenizer()
    tweet_txt_obj = Text(t.tokenize(tweets_str))

    #by default similar() prints to console
    #this reroutes the output to a file
    similar_file = open(filename, 'w', encoding='utf8')
    tmpout = sys.stdout
    sys.stdout = similar_file
    tweet_txt_obj.similar(keyword, maxlines)
    similar_file.close()
    sys.stdout = tmpout
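A usage sketch for find_similar(), assuming sys, nltk, and Text are imported in the module that defines it; the tweet string and the output filename are made up:

tweets = ("I love the new phone. I love the new camera. "
          "The new battery is great. The new screen is great.")
find_similar("similar_words.txt", 5, "phone", tweets)  # writes the similar() output to the file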
    def __init__(self, source, name=None):
        if hasattr(source, "words"):
            source = [source.words(f) for f in source.fileids()]
        self._texts = source
        Text.__init__(self, LazyConcatenation(source), name)
        self._idf_cache = {}
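The fragment above closely resembles the initializer of nltk.text.TextCollection, which layers tf-idf on top of Text. A minimal sketch of how such a collection is typically used (assumes the nltk 'gutenberg' data is downloaded):

from nltk.corpus import gutenberg
from nltk.text import Text, TextCollection

emma = Text(gutenberg.words('austen-emma.txt'))
persuasion = Text(gutenberg.words('austen-persuasion.txt'))
collection = TextCollection([emma, persuasion])
print(collection.tf_idf('Emma', emma))  # tf-idf score of the term within one member text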